1/*-
2 * Copyright (c) 2010-2011 Juniper Networks, Inc.
3 * All rights reserved.
4 *
5 * This software was developed by Robert N. M. Watson under contract
6 * to Juniper Networks, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31
32__FBSDID("$FreeBSD$");
33
34#include "opt_inet6.h"
35
36#include <sys/param.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mbuf.h>
40#include <sys/mutex.h>
41#include <sys/smp.h>
42#include <sys/socketvar.h>
43
44#include <netinet/in.h>
45#include <netinet/in_pcb.h>
46#ifdef INET6
47#include <netinet6/in6_pcb.h>
48#endif /* INET6 */
49
50/*
51 * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
52 * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
53 * Strategies in Modern Operating Systems".  This implementation differs
54 * significantly from that described in the paper, in that it attempts to
55 * introduce not just notions of affinity for connections and distribute work
56 * so as to reduce lock contention, but also align those notions with
57 * hardware work distribution strategies such as RSS.  In this construction,
58 * connection groups supplement, rather than replace, existing reservation
59 * tables for protocol 4-tuples, offering CPU-affine lookup tables with
60 * minimal cache line migration and lock contention during steady state
61 * operation.
62 *
63 * Internet protocols, such as UDP and TCP, register to use connection groups
64 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
65 * indicates to the connection group code whether a 2-tuple or 4-tuple is
66 * used as an argument to hashes that assign a connection to a particular
67 * group.  This must be aligned with any hardware offloaded distribution
68 * model, such as RSS or similar approaches taken in embedded network boards.
69 * Wildcard sockets require special handling, as in Willman 2006, and are
70 * shared between connection groups -- while being protected by group-local
71 * locks.  This means that connection establishment and teardown can be
 * significantly more expensive than without connection groups, but that
73 * steady-state processing can be significantly faster.
74 *
75 * Most of the implementation of connection groups is in this file; however,
76 * connection group lookup is implemented in in_pcb.c alongside reservation
77 * table lookups -- see in_pcblookup_group().
78 *
79 * TODO:
80 *
81 * Implement dynamic rebalancing of buckets with connection groups; when
82 * load is unevenly distributed, search for more optimal balancing on
83 * demand.  This might require scaling up the number of connection groups
84 * by <<1.
85 *
86 * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
87 * groups for ip_input and ip6_input, allowing non-offloaded work
88 * distribution.
89 *
90 * Expose effective CPU affinity of connections to userspace using socket
91 * options.
92 *
93 * Investigate per-connection affinity overrides based on socket options; an
94 * option could be set, certainly resulting in work being distributed
95 * differently in software, and possibly propagated to supporting hardware
96 * with TCAMs or hardware hash tables.  This might require connections to
97 * exist in more than one connection group at a time.
98 *
99 * Hook netisr thread reconfiguration events, and propagate those to RSS so
100 * that rebalancing can occur when the thread pool grows or shrinks.
101 *
102 * Expose per-pcbgroup statistics to userspace monitoring tools such as
103 * netstat, in order to allow better debugging and profiling.
104 */
105
106void
107in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
108    int hash_nelements)
109{
110	struct inpcbgroup *pcbgroup;
111	u_int numpcbgroups, pgn;
112
113	/*
114	 * Only enable connection groups for a protocol if it has been
115	 * specifically requested.
116	 */
117	if (hashfields == IPI_HASHFIELDS_NONE)
118		return;
119
120	/*
121	 * Connection groups are about multi-processor load distribution,
122	 * lock contention, and connection CPU affinity.  As such, no point
123	 * in turning them on for a uniprocessor machine, it only wastes
124	 * memory.
125	 */
126	if (mp_ncpus == 1)
127		return;
128
129	/*
130	 * Use one group per CPU for now.  If we decide to do dynamic
131	 * rebalancing a la RSS, we'll need to shift left by at least 1.
132	 */
133	numpcbgroups = mp_ncpus;
134
135	pcbinfo->ipi_hashfields = hashfields;
136	pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
137	    sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
138	pcbinfo->ipi_npcbgroups = numpcbgroups;
139	pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
140	    &pcbinfo->ipi_wildmask);
141	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
142		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
143		pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
144		    &pcbgroup->ipg_hashmask);
145		INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
146
147		/*
148		 * Initialise notional affinity of the pcbgroup -- for RSS,
149		 * we want the same notion of affinity as NICs to be used.
150		 * Just round robin for the time being.
151		 */
152		pcbgroup->ipg_cpu = (pgn % mp_ncpus);
153	}
154}
155
156void
157in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
158{
159	struct inpcbgroup *pcbgroup;
160	u_int pgn;
161
162	if (pcbinfo->ipi_npcbgroups == 0)
163		return;
164
165	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
166		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
167		KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead),
168		    ("in_pcbinfo_destroy: listhead not empty"));
169		INP_GROUP_LOCK_DESTROY(pcbgroup);
170		hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
171		    pcbgroup->ipg_hashmask);
172	}
173	hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
174	free(pcbinfo->ipi_pcbgroups, M_PCB);
175	pcbinfo->ipi_pcbgroups = NULL;
176	pcbinfo->ipi_npcbgroups = 0;
177	pcbinfo->ipi_hashfields = 0;
178}
179
180/*
181 * Given a hash of whatever the covered tuple might be, return a pcbgroup
182 * index.
183 */
184static __inline u_int
185in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
186{
187
188	return (hash % pcbinfo->ipi_npcbgroups);
189}
190
191/*
192 * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
193 * information is insufficient to identify the pcbgroup.
194 */
195struct inpcbgroup *
196in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
197{
198
199	return (NULL);
200}
201
202static struct inpcbgroup *
203in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
204{
205
206	return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
207	    m->m_pkthdr.flowid));
208}
209
210struct inpcbgroup *
211in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
212    u_short lport, struct in_addr faddr, u_short fport)
213{
214	uint32_t hash;
215
216	switch (pcbinfo->ipi_hashfields) {
217	case IPI_HASHFIELDS_4TUPLE:
218		hash = faddr.s_addr ^ fport;
219		break;
220
221	case IPI_HASHFIELDS_2TUPLE:
222		hash = faddr.s_addr ^ laddr.s_addr;
223		break;
224
225	default:
226		hash = 0;
227	}
228	return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
229	    hash)]);
230}
231
232struct inpcbgroup *
233in_pcbgroup_byinpcb(struct inpcb *inp)
234{
235
236	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
237	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
238}
239
240static void
241in_pcbwild_add(struct inpcb *inp)
242{
243	struct inpcbinfo *pcbinfo;
244	struct inpcbhead *head;
245	u_int pgn;
246
247	INP_WLOCK_ASSERT(inp);
248	KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
249	    ("%s: is wild",__func__));
250
251	pcbinfo = inp->inp_pcbinfo;
252	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
253		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
254	head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
255	    0, pcbinfo->ipi_wildmask)];
256	LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
257	inp->inp_flags2 |= INP_PCBGROUPWILD;
258	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
259		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
260}
261
262static void
263in_pcbwild_remove(struct inpcb *inp)
264{
265	struct inpcbinfo *pcbinfo;
266	u_int pgn;
267
268	INP_WLOCK_ASSERT(inp);
269	KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
270	    ("%s: not wild", __func__));
271
272	pcbinfo = inp->inp_pcbinfo;
273	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
274		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
275	LIST_REMOVE(inp, inp_pcbgroup_wild);
276	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
277		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
278	inp->inp_flags2 &= ~INP_PCBGROUPWILD;
279}
280
281static __inline int
282in_pcbwild_needed(struct inpcb *inp)
283{
284
285#ifdef INET6
286	if (inp->inp_vflag & INP_IPV6)
287		return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
288	else
289#endif
290		return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
291}
292
293static void
294in_pcbwild_update_internal(struct inpcb *inp)
295{
296	int wildcard_needed;
297
298	wildcard_needed = in_pcbwild_needed(inp);
299	if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
300		in_pcbwild_add(inp);
301	else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
302		in_pcbwild_remove(inp);
303}
304
305/*
306 * Update the pcbgroup of an inpcb, which might include removing an old
307 * pcbgroup reference and/or adding a new one.  Wildcard processing is not
308 * performed here, although ideally we'll never install a pcbgroup for a
309 * wildcard inpcb (asserted below).
310 */
311static void
312in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
313    struct inpcbgroup *newpcbgroup, struct inpcb *inp)
314{
315	struct inpcbgroup *oldpcbgroup;
316	struct inpcbhead *pcbhash;
317	uint32_t hashkey_faddr;
318
319	INP_WLOCK_ASSERT(inp);
320
321	oldpcbgroup = inp->inp_pcbgroup;
322	if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
323		INP_GROUP_LOCK(oldpcbgroup);
324		LIST_REMOVE(inp, inp_pcbgrouphash);
325		inp->inp_pcbgroup = NULL;
326		INP_GROUP_UNLOCK(oldpcbgroup);
327	}
328	if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
329#ifdef INET6
330		if (inp->inp_vflag & INP_IPV6)
331			hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */
332		else
333#endif
334			hashkey_faddr = inp->inp_faddr.s_addr;
335		INP_GROUP_LOCK(newpcbgroup);
336		pcbhash = &newpcbgroup->ipg_hashbase[
337		    INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport,
338		    newpcbgroup->ipg_hashmask)];
339		LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
340		inp->inp_pcbgroup = newpcbgroup;
341		INP_GROUP_UNLOCK(newpcbgroup);
342	}
343
344	KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
345	    ("%s: pcbgroup and wildcard!", __func__));
346}
347
348/*
349 * Two update paths: one in which the 4-tuple on an inpcb has been updated
350 * and therefore connection groups may need to change (or a wildcard entry
351 * may needed to be installed), and another in which the 4-tuple has been
352 * set as a result of a packet received, in which case we may be able to use
353 * the hash on the mbuf to avoid doing a software hash calculation for RSS.
354 *
355 * In each case: first, let the wildcard code have a go at placing it as a
356 * wildcard socket.  If it was a wildcard, or if the connection has been
357 * dropped, then no pcbgroup is required (so potentially clear it);
358 * otherwise, calculate and update the pcbgroup for the inpcb.
359 */
360void
361in_pcbgroup_update(struct inpcb *inp)
362{
363	struct inpcbinfo *pcbinfo;
364	struct inpcbgroup *newpcbgroup;
365
366	INP_WLOCK_ASSERT(inp);
367
368	pcbinfo = inp->inp_pcbinfo;
369	if (!in_pcbgroup_enabled(pcbinfo))
370		return;
371
372	in_pcbwild_update_internal(inp);
373	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
374	    !(inp->inp_flags & INP_DROPPED)) {
375#ifdef INET6
376		if (inp->inp_vflag & INP_IPV6)
377			newpcbgroup = in6_pcbgroup_byinpcb(inp);
378		else
379#endif
380			newpcbgroup = in_pcbgroup_byinpcb(inp);
381	} else
382		newpcbgroup = NULL;
383	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
384}
385
386void
387in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
388{
389	struct inpcbinfo *pcbinfo;
390	struct inpcbgroup *newpcbgroup;
391
392	INP_WLOCK_ASSERT(inp);
393
394	pcbinfo = inp->inp_pcbinfo;
395	if (!in_pcbgroup_enabled(pcbinfo))
396		return;
397
398	/*
399	 * Possibly should assert !INP_PCBGROUPWILD rather than testing for
400	 * it; presumably this function should never be called for anything
401	 * other than non-wildcard socket?
402	 */
403	in_pcbwild_update_internal(inp);
404	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
405	    !(inp->inp_flags & INP_DROPPED)) {
406		newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
407#ifdef INET6
408		if (inp->inp_vflag & INP_IPV6) {
409			if (newpcbgroup == NULL)
410				newpcbgroup = in6_pcbgroup_byinpcb(inp);
411		} else {
412#endif
413			if (newpcbgroup == NULL)
414				newpcbgroup = in_pcbgroup_byinpcb(inp);
415#ifdef INET6
416		}
417#endif
418	} else
419		newpcbgroup = NULL;
420	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
421}
422
423/*
424 * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
425 */
426void
427in_pcbgroup_remove(struct inpcb *inp)
428{
429	struct inpcbgroup *pcbgroup;
430
431	INP_WLOCK_ASSERT(inp);
432
433	if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
434		return;
435
436	if (inp->inp_flags2 & INP_PCBGROUPWILD)
437		in_pcbwild_remove(inp);
438
439	pcbgroup = inp->inp_pcbgroup;
440	if (pcbgroup != NULL) {
441		INP_GROUP_LOCK(pcbgroup);
442		LIST_REMOVE(inp, inp_pcbgrouphash);
443		inp->inp_pcbgroup = NULL;
444		INP_GROUP_UNLOCK(pcbgroup);
445	}
446}
447
448/*
449 * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
450 * for a protocol.
451 */
452int
453in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
454{
455
456	return (pcbinfo->ipi_npcbgroups > 0);
457}
458