1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2010-2011 Juniper Networks, Inc.
5 * All rights reserved.
6 *
7 * This software was developed by Robert N. M. Watson under contract
8 * to Juniper Networks, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33
34__FBSDID("$FreeBSD$");
35
36#include "opt_inet6.h"
37#include "opt_rss.h"
38
39#include <sys/param.h>
40#include <sys/lock.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/mutex.h>
44#include <sys/smp.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47
48#include <net/rss_config.h>
49
50#include <netinet/in.h>
51
52#include <netinet/in_pcb.h>
53#include <netinet/in_rss.h>
54#ifdef INET6
55#include <netinet6/in6_pcb.h>
56#endif /* INET6 */
57
58/*
59 * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
60 * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
61 * Strategies in Modern Operating Systems".  This implementation differs
62 * significantly from that described in the paper, in that it attempts to
63 * introduce not just notions of affinity for connections and distribute work
64 * so as to reduce lock contention, but also align those notions with
65 * hardware work distribution strategies such as RSS.  In this construction,
66 * connection groups supplement, rather than replace, existing reservation
67 * tables for protocol 4-tuples, offering CPU-affine lookup tables with
68 * minimal cache line migration and lock contention during steady state
69 * operation.
70 *
 * Hardware-offloaded hashes are often inefficient in software -- for
 * example, Toeplitz, specified by RSS, introduces a significant overhead if
 * performed during per-packet processing.  It is therefore desirable to fall
 * back on traditional reservation table lookups without affinity where
 * hardware-offloaded hashes aren't available, such as for traffic over
 * non-RSS interfaces.
77 *
78 * Internet protocols, such as UDP and TCP, register to use connection groups
79 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
80 * indicates to the connection group code whether a 2-tuple or 4-tuple is
81 * used as an argument to hashes that assign a connection to a particular
82 * group.  This must be aligned with any hardware offloaded distribution
83 * model, such as RSS or similar approaches taken in embedded network boards.
84 * Wildcard sockets require special handling, as in Willman 2006, and are
85 * shared between connection groups -- while being protected by group-local
86 * locks.  This means that connection establishment and teardown can be
 * significantly more expensive than without connection groups, but that
88 * steady-state processing can be significantly faster.
89 *
90 * When RSS is used, certain connection group parameters, such as the number
91 * of groups, are provided by the RSS implementation, found in in_rss.c.
92 * Otherwise, in_pcbgroup.c selects possible sensible parameters
93 * corresponding to the degree of parallelism exposed by netisr.
94 *
95 * Most of the implementation of connection groups is in this file; however,
96 * connection group lookup is implemented in in_pcb.c alongside reservation
97 * table lookups -- see in_pcblookup_group().
98 *
99 * TODO:
100 *
101 * Implement dynamic rebalancing of buckets with connection groups; when
102 * load is unevenly distributed, search for more optimal balancing on
103 * demand.  This might require scaling up the number of connection groups
104 * by <<1.
105 *
106 * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
107 * groups for ip_input and ip6_input, allowing non-offloaded work
108 * distribution.
109 *
110 * Expose effective CPU affinity of connections to userspace using socket
111 * options.
112 *
113 * Investigate per-connection affinity overrides based on socket options; an
114 * option could be set, certainly resulting in work being distributed
115 * differently in software, and possibly propagated to supporting hardware
116 * with TCAMs or hardware hash tables.  This might require connections to
117 * exist in more than one connection group at a time.
118 *
119 * Hook netisr thread reconfiguration events, and propagate those to RSS so
120 * that rebalancing can occur when the thread pool grows or shrinks.
121 *
122 * Expose per-pcbgroup statistics to userspace monitoring tools such as
123 * netstat, in order to allow better debugging and profiling.
124 */
125
126void
127in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
128    int hash_nelements)
129{
130	struct inpcbgroup *pcbgroup;
131	u_int numpcbgroups, pgn;
132
133	/*
134	 * Only enable connection groups for a protocol if it has been
135	 * specifically requested.
136	 */
137	if (hashfields == IPI_HASHFIELDS_NONE)
138		return;
139
140	/*
141	 * Connection groups are about multi-processor load distribution,
142	 * lock contention, and connection CPU affinity.  As such, no point
143	 * in turning them on for a uniprocessor machine, it only wastes
144	 * memory.
145	 */
146	if (mp_ncpus == 1)
147		return;
148
149#ifdef RSS
150	/*
151	 * If we're using RSS, then RSS determines the number of connection
152	 * groups to use: one connection group per RSS bucket.  If for some
153	 * reason RSS isn't able to provide a number of buckets, disable
154	 * connection groups entirely.
155	 *
156	 * XXXRW: Can this ever happen?
157	 */
158	numpcbgroups = rss_getnumbuckets();
159	if (numpcbgroups == 0)
160		return;
161#else
162	/*
163	 * Otherwise, we'll just use one per CPU for now.  If we decide to
164	 * do dynamic rebalancing a la RSS, we'll need similar logic here.
165	 */
166	numpcbgroups = mp_ncpus;
167#endif
168
169	pcbinfo->ipi_hashfields = hashfields;
170	pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
171	    sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
172	pcbinfo->ipi_npcbgroups = numpcbgroups;
173	pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
174	    &pcbinfo->ipi_wildmask);
175	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
176		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
177		pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
178		    &pcbgroup->ipg_hashmask);
179		INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
180
181		/*
182		 * Initialise notional affinity of the pcbgroup -- for RSS,
183		 * we want the same notion of affinity as NICs to be used.  In
184		 * the non-RSS case, just round robin for the time being.
185		 *
186		 * XXXRW: The notion of a bucket to CPU mapping is common at
187		 * both pcbgroup and RSS layers -- does that mean that we
188		 * should migrate it all from RSS to here, and just leave RSS
189		 * responsible only for providing hashing and mapping funtions?
190		 */
191#ifdef RSS
192		pcbgroup->ipg_cpu = rss_getcpu(pgn);
193#else
194		pcbgroup->ipg_cpu = (pgn % mp_ncpus);
195#endif
196	}
197}
198
199void
200in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
201{
202	struct inpcbgroup *pcbgroup;
203	u_int pgn;
204
205	if (pcbinfo->ipi_npcbgroups == 0)
206		return;
207
208	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
209		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
210		KASSERT(CK_LIST_EMPTY(pcbinfo->ipi_listhead),
211		    ("in_pcbinfo_destroy: listhead not empty"));
212		INP_GROUP_LOCK_DESTROY(pcbgroup);
213		hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
214		    pcbgroup->ipg_hashmask);
215	}
216	hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
217	free(pcbinfo->ipi_pcbgroups, M_PCB);
218	pcbinfo->ipi_pcbgroups = NULL;
219	pcbinfo->ipi_npcbgroups = 0;
220	pcbinfo->ipi_hashfields = 0;
221}
222
223/*
224 * Given a hash of whatever the covered tuple might be, return a pcbgroup
225 * index.  Where RSS is supported, try to align bucket selection with RSS CPU
226 * affinity strategy.
227 */
228static __inline u_int
229in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
230{
231
232#ifdef RSS
233	return (rss_getbucket(hash));
234#else
235	return (hash % pcbinfo->ipi_npcbgroups);
236#endif
237}
238
239/*
240 * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
241 * information is insufficient to identify the pcbgroup.  This might occur if
242 * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but
243 * RSS is not compiled into the kernel.
244 */
245struct inpcbgroup *
246in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
247{
248
249#ifdef RSS
250	if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
251	    hashtype == M_HASHTYPE_RSS_TCP_IPV4) ||
252	    (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
253	    hashtype == M_HASHTYPE_RSS_UDP_IPV4) ||
254	    (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE &&
255	    hashtype == M_HASHTYPE_RSS_IPV4))
256		return (&pcbinfo->ipi_pcbgroups[
257		    in_pcbgroup_getbucket(pcbinfo, hash)]);
258#endif
259	return (NULL);
260}
261
262static struct inpcbgroup *
263in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
264{
265
266	return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
267	    m->m_pkthdr.flowid));
268}
269
270struct inpcbgroup *
271in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
272    u_short lport, struct in_addr faddr, u_short fport)
273{
274	uint32_t hash;
275
276	/*
277	 * RSS note: we pass foreign addr/port as source, and local addr/port
278	 * as destination, as we want to align with what the hardware is
279	 * doing.
280	 */
281	switch (pcbinfo->ipi_hashfields) {
282	case IPI_HASHFIELDS_4TUPLE:
283#ifdef RSS
284		hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport);
285#else
286		hash = faddr.s_addr ^ fport;
287#endif
288		break;
289
290	case IPI_HASHFIELDS_2TUPLE:
291#ifdef RSS
292		hash = rss_hash_ip4_2tuple(faddr, laddr);
293#else
294		hash = faddr.s_addr ^ laddr.s_addr;
295#endif
296		break;
297
298	default:
299		hash = 0;
300	}
301	return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
302	    hash)]);
303}
304
305struct inpcbgroup *
306in_pcbgroup_byinpcb(struct inpcb *inp)
307{
308#ifdef	RSS
309	/*
310	 * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
311	 * RSS bucket and thus we should use this pcbgroup, rather than
312	 * using a tuple or hash.
313	 *
314	 * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
315	 * fits in that!
316	 */
317	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
318		return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
319#endif
320
321	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
322	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
323}
324
325static void
326in_pcbwild_add(struct inpcb *inp)
327{
328	struct inpcbinfo *pcbinfo;
329	struct inpcbhead *head;
330	u_int pgn;
331
332	INP_WLOCK_ASSERT(inp);
333	KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
334	    ("%s: is wild",__func__));
335
336	pcbinfo = inp->inp_pcbinfo;
337	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
338		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
339	head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
340	    0, pcbinfo->ipi_wildmask)];
341	CK_LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
342	inp->inp_flags2 |= INP_PCBGROUPWILD;
343	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
344		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
345}
346
347static void
348in_pcbwild_remove(struct inpcb *inp)
349{
350	struct inpcbinfo *pcbinfo;
351	u_int pgn;
352
353	INP_WLOCK_ASSERT(inp);
354	KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
355	    ("%s: not wild", __func__));
356
357	pcbinfo = inp->inp_pcbinfo;
358	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
359		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
360	CK_LIST_REMOVE(inp, inp_pcbgroup_wild);
361	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
362		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
363	inp->inp_flags2 &= ~INP_PCBGROUPWILD;
364}
365
366static __inline int
367in_pcbwild_needed(struct inpcb *inp)
368{
369#ifdef	RSS
370	/*
371	 * If it's a listen socket and INP_RSS_BUCKET_SET is set,
372	 * it's a wildcard socket _but_ it's in a specific pcbgroup.
373	 * Thus we don't treat it as a pcbwild inp.
374	 */
375	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
376		return (0);
377#endif
378
379#ifdef INET6
380	if (inp->inp_vflag & INP_IPV6)
381		return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
382	else
383#endif
384		return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
385}
386
387static void
388in_pcbwild_update_internal(struct inpcb *inp)
389{
390	int wildcard_needed;
391
392	wildcard_needed = in_pcbwild_needed(inp);
393	if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
394		in_pcbwild_add(inp);
395	else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
396		in_pcbwild_remove(inp);
397}
398
399/*
400 * Update the pcbgroup of an inpcb, which might include removing an old
401 * pcbgroup reference and/or adding a new one.  Wildcard processing is not
402 * performed here, although ideally we'll never install a pcbgroup for a
403 * wildcard inpcb (asserted below).
404 */
405static void
406in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
407    struct inpcbgroup *newpcbgroup, struct inpcb *inp)
408{
409	struct inpcbgroup *oldpcbgroup;
410	struct inpcbhead *pcbhash;
411	uint32_t hashkey_faddr;
412
413	INP_WLOCK_ASSERT(inp);
414
415	oldpcbgroup = inp->inp_pcbgroup;
416	if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
417		INP_GROUP_LOCK(oldpcbgroup);
418		CK_LIST_REMOVE(inp, inp_pcbgrouphash);
419		inp->inp_pcbgroup = NULL;
420		INP_GROUP_UNLOCK(oldpcbgroup);
421	}
422	if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
423#ifdef INET6
424		if (inp->inp_vflag & INP_IPV6)
425			hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
426		else
427#endif
428			hashkey_faddr = inp->inp_faddr.s_addr;
429		INP_GROUP_LOCK(newpcbgroup);
430		/*
431		 * If the inp is an RSS bucket wildcard entry, ensure
432		 * that the PCB hash is calculated correctly.
433		 *
434		 * The wildcard hash calculation differs from the
435		 * non-wildcard definition.  The source address is
436		 * INADDR_ANY and the far port is 0.
437		 */
438		if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
439			pcbhash = &newpcbgroup->ipg_hashbase[
440			    INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
441			    newpcbgroup->ipg_hashmask)];
442		} else {
443			pcbhash = &newpcbgroup->ipg_hashbase[
444			    INP_PCBHASH(hashkey_faddr, inp->inp_lport,
445			    inp->inp_fport,
446			    newpcbgroup->ipg_hashmask)];
447		}
448		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
449		inp->inp_pcbgroup = newpcbgroup;
450		INP_GROUP_UNLOCK(newpcbgroup);
451	}
452
453	KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
454	    ("%s: pcbgroup and wildcard!", __func__));
455}
456
457/*
458 * Two update paths: one in which the 4-tuple on an inpcb has been updated
459 * and therefore connection groups may need to change (or a wildcard entry
 * may need to be installed), and another in which the 4-tuple has been
461 * set as a result of a packet received, in which case we may be able to use
462 * the hash on the mbuf to avoid doing a software hash calculation for RSS.
463 *
464 * In each case: first, let the wildcard code have a go at placing it as a
465 * wildcard socket.  If it was a wildcard, or if the connection has been
466 * dropped, then no pcbgroup is required (so potentially clear it);
467 * otherwise, calculate and update the pcbgroup for the inpcb.
468 */
469void
470in_pcbgroup_update(struct inpcb *inp)
471{
472	struct inpcbinfo *pcbinfo;
473	struct inpcbgroup *newpcbgroup;
474
475	INP_WLOCK_ASSERT(inp);
476
477	pcbinfo = inp->inp_pcbinfo;
478	if (!in_pcbgroup_enabled(pcbinfo))
479		return;
480
481	in_pcbwild_update_internal(inp);
482	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
483	    !(inp->inp_flags & INP_DROPPED)) {
484#ifdef INET6
485		if (inp->inp_vflag & INP_IPV6)
486			newpcbgroup = in6_pcbgroup_byinpcb(inp);
487		else
488#endif
489			newpcbgroup = in_pcbgroup_byinpcb(inp);
490	} else
491		newpcbgroup = NULL;
492	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
493}
494
495void
496in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
497{
498	struct inpcbinfo *pcbinfo;
499	struct inpcbgroup *newpcbgroup;
500
501	INP_WLOCK_ASSERT(inp);
502
503	pcbinfo = inp->inp_pcbinfo;
504	if (!in_pcbgroup_enabled(pcbinfo))
505		return;
506
507	/*
508	 * Possibly should assert !INP_PCBGROUPWILD rather than testing for
509	 * it; presumably this function should never be called for anything
510	 * other than non-wildcard socket?
511	 */
512	in_pcbwild_update_internal(inp);
513	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
514	    !(inp->inp_flags & INP_DROPPED)) {
515		newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
516#ifdef INET6
517		if (inp->inp_vflag & INP_IPV6) {
518			if (newpcbgroup == NULL)
519				newpcbgroup = in6_pcbgroup_byinpcb(inp);
520		} else {
521#endif
522			if (newpcbgroup == NULL)
523				newpcbgroup = in_pcbgroup_byinpcb(inp);
524#ifdef INET6
525		}
526#endif
527	} else
528		newpcbgroup = NULL;
529	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
530}
531
532/*
533 * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
534 */
535void
536in_pcbgroup_remove(struct inpcb *inp)
537{
538	struct inpcbgroup *pcbgroup;
539
540	INP_WLOCK_ASSERT(inp);
541
542	if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
543		return;
544
545	if (inp->inp_flags2 & INP_PCBGROUPWILD)
546		in_pcbwild_remove(inp);
547
548	pcbgroup = inp->inp_pcbgroup;
549	if (pcbgroup != NULL) {
550		INP_GROUP_LOCK(pcbgroup);
551		CK_LIST_REMOVE(inp, inp_pcbgrouphash);
552		inp->inp_pcbgroup = NULL;
553		INP_GROUP_UNLOCK(pcbgroup);
554	}
555}
556
557/*
558 * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
559 * for a protocol.
560 */
561int
562in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
563{
564
565	return (pcbinfo->ipi_npcbgroups > 0);
566}
567