1222748Srwatson/*-
2222748Srwatson * Copyright (c) 2010-2011 Juniper Networks, Inc.
3222748Srwatson * All rights reserved.
4222748Srwatson *
5222748Srwatson * This software was developed by Robert N. M. Watson under contract
6222748Srwatson * to Juniper Networks, Inc.
7222748Srwatson *
8222748Srwatson * Redistribution and use in source and binary forms, with or without
9222748Srwatson * modification, are permitted provided that the following conditions
10222748Srwatson * are met:
11222748Srwatson * 1. Redistributions of source code must retain the above copyright
12222748Srwatson *    notice, this list of conditions and the following disclaimer.
13222748Srwatson * 2. Redistributions in binary form must reproduce the above copyright
14222748Srwatson *    notice, this list of conditions and the following disclaimer in the
15222748Srwatson *    documentation and/or other materials provided with the distribution.
16222748Srwatson *
17222748Srwatson * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18222748Srwatson * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19222748Srwatson * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20222748Srwatson * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21222748Srwatson * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22222748Srwatson * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23222748Srwatson * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24222748Srwatson * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25222748Srwatson * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26222748Srwatson * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27222748Srwatson * SUCH DAMAGE.
28222748Srwatson */
29222748Srwatson
30222748Srwatson#include <sys/cdefs.h>
31222748Srwatson
32222748Srwatson__FBSDID("$FreeBSD$");
33222748Srwatson
34222748Srwatson#include "opt_inet6.h"
35263198Srwatson#include "opt_rss.h"
36222748Srwatson
37222748Srwatson#include <sys/param.h>
38222748Srwatson#include <sys/lock.h>
39222748Srwatson#include <sys/malloc.h>
40222748Srwatson#include <sys/mbuf.h>
41222748Srwatson#include <sys/mutex.h>
42222748Srwatson#include <sys/smp.h>
43297439Sgnn#include <sys/socket.h>
44222748Srwatson#include <sys/socketvar.h>
45222748Srwatson
46277331Sadrian#include <net/rss_config.h>
47277331Sadrian
48222748Srwatson#include <netinet/in.h>
49277331Sadrian
50222748Srwatson#include <netinet/in_pcb.h>
51263198Srwatson#include <netinet/in_rss.h>
52222748Srwatson#ifdef INET6
53222748Srwatson#include <netinet6/in6_pcb.h>
54222748Srwatson#endif /* INET6 */
55222748Srwatson
56222748Srwatson/*
57222748Srwatson * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
58222748Srwatson * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
59222748Srwatson * Strategies in Modern Operating Systems".  This implementation differs
60222748Srwatson * significantly from that described in the paper, in that it attempts to
61222748Srwatson * introduce not just notions of affinity for connections and distribute work
62222748Srwatson * so as to reduce lock contention, but also align those notions with
63222748Srwatson * hardware work distribution strategies such as RSS.  In this construction,
64222748Srwatson * connection groups supplement, rather than replace, existing reservation
65222748Srwatson * tables for protocol 4-tuples, offering CPU-affine lookup tables with
66222748Srwatson * minimal cache line migration and lock contention during steady state
67222748Srwatson * operation.
68222748Srwatson *
 * Hardware-offloaded checksums are often inefficient in software -- for
 * example, Toeplitz, specified by RSS, introduces a significant overhead if
 * performed during per-packet processing.  It is therefore desirable to fall
 * back on traditional reservation table lookups without affinity where
 * hardware-offloaded checksums aren't available, such as for traffic over
 * non-RSS interfaces.
75263198Srwatson *
76222748Srwatson * Internet protocols, such as UDP and TCP, register to use connection groups
77222748Srwatson * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
78222748Srwatson * indicates to the connection group code whether a 2-tuple or 4-tuple is
79222748Srwatson * used as an argument to hashes that assign a connection to a particular
80222748Srwatson * group.  This must be aligned with any hardware offloaded distribution
81222748Srwatson * model, such as RSS or similar approaches taken in embedded network boards.
 * Wildcard sockets require special handling, as in Willman 2006, and are
 * shared between connection groups -- while being protected by group-local
 * locks.  This means that connection establishment and teardown can be
 * significantly more expensive than without connection groups, but that
 * steady-state processing can be significantly faster.
87222748Srwatson *
88263198Srwatson * When RSS is used, certain connection group parameters, such as the number
89263198Srwatson * of groups, are provided by the RSS implementation, found in in_rss.c.
90263198Srwatson * Otherwise, in_pcbgroup.c selects possible sensible parameters
91263198Srwatson * corresponding to the degree of parallelism exposed by netisr.
92263198Srwatson *
93222748Srwatson * Most of the implementation of connection groups is in this file; however,
94222748Srwatson * connection group lookup is implemented in in_pcb.c alongside reservation
95222748Srwatson * table lookups -- see in_pcblookup_group().
96222748Srwatson *
97222748Srwatson * TODO:
98222748Srwatson *
99222748Srwatson * Implement dynamic rebalancing of buckets with connection groups; when
100222748Srwatson * load is unevenly distributed, search for more optimal balancing on
101222748Srwatson * demand.  This might require scaling up the number of connection groups
102222748Srwatson * by <<1.
103222748Srwatson *
104222748Srwatson * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
105222748Srwatson * groups for ip_input and ip6_input, allowing non-offloaded work
106222748Srwatson * distribution.
107222748Srwatson *
108222748Srwatson * Expose effective CPU affinity of connections to userspace using socket
109222748Srwatson * options.
110222748Srwatson *
111222748Srwatson * Investigate per-connection affinity overrides based on socket options; an
112222748Srwatson * option could be set, certainly resulting in work being distributed
113222748Srwatson * differently in software, and possibly propagated to supporting hardware
114222748Srwatson * with TCAMs or hardware hash tables.  This might require connections to
115222748Srwatson * exist in more than one connection group at a time.
116222748Srwatson *
117222748Srwatson * Hook netisr thread reconfiguration events, and propagate those to RSS so
118222748Srwatson * that rebalancing can occur when the thread pool grows or shrinks.
119222748Srwatson *
120222748Srwatson * Expose per-pcbgroup statistics to userspace monitoring tools such as
121222748Srwatson * netstat, in order to allow better debugging and profiling.
122222748Srwatson */
123222748Srwatson
124222748Srwatsonvoid
125222748Srwatsonin_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
126222748Srwatson    int hash_nelements)
127222748Srwatson{
128222748Srwatson	struct inpcbgroup *pcbgroup;
129222748Srwatson	u_int numpcbgroups, pgn;
130222748Srwatson
131222748Srwatson	/*
132222748Srwatson	 * Only enable connection groups for a protocol if it has been
133222748Srwatson	 * specifically requested.
134222748Srwatson	 */
135222748Srwatson	if (hashfields == IPI_HASHFIELDS_NONE)
136222748Srwatson		return;
137222748Srwatson
138222748Srwatson	/*
139222748Srwatson	 * Connection groups are about multi-processor load distribution,
140222748Srwatson	 * lock contention, and connection CPU affinity.  As such, no point
141222748Srwatson	 * in turning them on for a uniprocessor machine, it only wastes
142222748Srwatson	 * memory.
143222748Srwatson	 */
144222748Srwatson	if (mp_ncpus == 1)
145222748Srwatson		return;
146222748Srwatson
147263198Srwatson#ifdef RSS
148222748Srwatson	/*
149263198Srwatson	 * If we're using RSS, then RSS determines the number of connection
150263198Srwatson	 * groups to use: one connection group per RSS bucket.  If for some
151263198Srwatson	 * reason RSS isn't able to provide a number of buckets, disable
152263198Srwatson	 * connection groups entirely.
153263198Srwatson	 *
154263198Srwatson	 * XXXRW: Can this ever happen?
155222748Srwatson	 */
156263198Srwatson	numpcbgroups = rss_getnumbuckets();
157263198Srwatson	if (numpcbgroups == 0)
158263198Srwatson		return;
159263198Srwatson#else
160263198Srwatson	/*
161263198Srwatson	 * Otherwise, we'll just use one per CPU for now.  If we decide to
162263198Srwatson	 * do dynamic rebalancing a la RSS, we'll need similar logic here.
163263198Srwatson	 */
164222748Srwatson	numpcbgroups = mp_ncpus;
165263198Srwatson#endif
166222748Srwatson
167222748Srwatson	pcbinfo->ipi_hashfields = hashfields;
168222748Srwatson	pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
169222748Srwatson	    sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
170222748Srwatson	pcbinfo->ipi_npcbgroups = numpcbgroups;
171222748Srwatson	pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
172222748Srwatson	    &pcbinfo->ipi_wildmask);
173222748Srwatson	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
174222748Srwatson		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
175222748Srwatson		pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
176222748Srwatson		    &pcbgroup->ipg_hashmask);
177222748Srwatson		INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
178222748Srwatson
179222748Srwatson		/*
180222748Srwatson		 * Initialise notional affinity of the pcbgroup -- for RSS,
181263198Srwatson		 * we want the same notion of affinity as NICs to be used.  In
182263198Srwatson		 * the non-RSS case, just round robin for the time being.
183263198Srwatson		 *
184263198Srwatson		 * XXXRW: The notion of a bucket to CPU mapping is common at
185263198Srwatson		 * both pcbgroup and RSS layers -- does that mean that we
186263198Srwatson		 * should migrate it all from RSS to here, and just leave RSS
187263198Srwatson		 * responsible only for providing hashing and mapping funtions?
188222748Srwatson		 */
189263198Srwatson#ifdef RSS
190263198Srwatson		pcbgroup->ipg_cpu = rss_getcpu(pgn);
191263198Srwatson#else
192222748Srwatson		pcbgroup->ipg_cpu = (pgn % mp_ncpus);
193263198Srwatson#endif
194222748Srwatson	}
195222748Srwatson}
196222748Srwatson
197222748Srwatsonvoid
198222748Srwatsonin_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
199222748Srwatson{
200222748Srwatson	struct inpcbgroup *pcbgroup;
201222748Srwatson	u_int pgn;
202222748Srwatson
203222748Srwatson	if (pcbinfo->ipi_npcbgroups == 0)
204222748Srwatson		return;
205222748Srwatson
206222748Srwatson	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
207222748Srwatson		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
208222748Srwatson		KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead),
209222748Srwatson		    ("in_pcbinfo_destroy: listhead not empty"));
210222748Srwatson		INP_GROUP_LOCK_DESTROY(pcbgroup);
211222748Srwatson		hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
212222748Srwatson		    pcbgroup->ipg_hashmask);
213222748Srwatson	}
214222748Srwatson	hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
215222748Srwatson	free(pcbinfo->ipi_pcbgroups, M_PCB);
216222748Srwatson	pcbinfo->ipi_pcbgroups = NULL;
217222748Srwatson	pcbinfo->ipi_npcbgroups = 0;
218222748Srwatson	pcbinfo->ipi_hashfields = 0;
219222748Srwatson}
220222748Srwatson
221222748Srwatson/*
222222748Srwatson * Given a hash of whatever the covered tuple might be, return a pcbgroup
223263198Srwatson * index.  Where RSS is supported, try to align bucket selection with RSS CPU
224263198Srwatson * affinity strategy.
225222748Srwatson */
226222748Srwatsonstatic __inline u_int
227222748Srwatsonin_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
228222748Srwatson{
229222748Srwatson
230263198Srwatson#ifdef RSS
231263198Srwatson	return (rss_getbucket(hash));
232263198Srwatson#else
233222748Srwatson	return (hash % pcbinfo->ipi_npcbgroups);
234263198Srwatson#endif
235222748Srwatson}
236222748Srwatson
237222748Srwatson/*
238222748Srwatson * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
239263198Srwatson * information is insufficient to identify the pcbgroup.  This might occur if
240263198Srwatson * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but
241263198Srwatson * RSS is not compiled into the kernel.
242222748Srwatson */
243222748Srwatsonstruct inpcbgroup *
244222748Srwatsonin_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
245222748Srwatson{
246222748Srwatson
247263198Srwatson#ifdef RSS
248263198Srwatson	if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
249263198Srwatson	    hashtype == M_HASHTYPE_RSS_TCP_IPV4) ||
250268913Sadrian	    (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
251268913Sadrian	    hashtype == M_HASHTYPE_RSS_UDP_IPV4) ||
252263198Srwatson	    (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE &&
253263198Srwatson	    hashtype == M_HASHTYPE_RSS_IPV4))
254263198Srwatson		return (&pcbinfo->ipi_pcbgroups[
255263198Srwatson		    in_pcbgroup_getbucket(pcbinfo, hash)]);
256263198Srwatson#endif
257222748Srwatson	return (NULL);
258222748Srwatson}
259222748Srwatson
260222748Srwatsonstatic struct inpcbgroup *
261222748Srwatsonin_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
262222748Srwatson{
263222748Srwatson
264222748Srwatson	return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
265222748Srwatson	    m->m_pkthdr.flowid));
266222748Srwatson}
267222748Srwatson
268222748Srwatsonstruct inpcbgroup *
269222748Srwatsonin_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
270222748Srwatson    u_short lport, struct in_addr faddr, u_short fport)
271222748Srwatson{
272222748Srwatson	uint32_t hash;
273222748Srwatson
274263198Srwatson	/*
275263198Srwatson	 * RSS note: we pass foreign addr/port as source, and local addr/port
276263198Srwatson	 * as destination, as we want to align with what the hardware is
277263198Srwatson	 * doing.
278263198Srwatson	 */
279222748Srwatson	switch (pcbinfo->ipi_hashfields) {
280222748Srwatson	case IPI_HASHFIELDS_4TUPLE:
281263198Srwatson#ifdef RSS
282263198Srwatson		hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport);
283263198Srwatson#else
284222748Srwatson		hash = faddr.s_addr ^ fport;
285263198Srwatson#endif
286222748Srwatson		break;
287222748Srwatson
288222748Srwatson	case IPI_HASHFIELDS_2TUPLE:
289263198Srwatson#ifdef RSS
290263198Srwatson		hash = rss_hash_ip4_2tuple(faddr, laddr);
291263198Srwatson#else
292222748Srwatson		hash = faddr.s_addr ^ laddr.s_addr;
293263198Srwatson#endif
294222748Srwatson		break;
295222748Srwatson
296222748Srwatson	default:
297222748Srwatson		hash = 0;
298222748Srwatson	}
299222748Srwatson	return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
300222748Srwatson	    hash)]);
301222748Srwatson}
302222748Srwatson
303222748Srwatsonstruct inpcbgroup *
304222748Srwatsonin_pcbgroup_byinpcb(struct inpcb *inp)
305222748Srwatson{
306268479Sadrian#ifdef	RSS
307268479Sadrian	/*
308268479Sadrian	 * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
309268479Sadrian	 * RSS bucket and thus we should use this pcbgroup, rather than
310268479Sadrian	 * using a tuple or hash.
311268479Sadrian	 *
312268479Sadrian	 * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
313268479Sadrian	 * fits in that!
314268479Sadrian	 */
315268479Sadrian	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
316268479Sadrian		return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
317268479Sadrian#endif
318222748Srwatson
319222748Srwatson	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
320222748Srwatson	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
321222748Srwatson}
322222748Srwatson
323222748Srwatsonstatic void
324222748Srwatsonin_pcbwild_add(struct inpcb *inp)
325222748Srwatson{
326222748Srwatson	struct inpcbinfo *pcbinfo;
327222748Srwatson	struct inpcbhead *head;
328222748Srwatson	u_int pgn;
329222748Srwatson
330222748Srwatson	INP_WLOCK_ASSERT(inp);
331222748Srwatson	KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
332222748Srwatson	    ("%s: is wild",__func__));
333222748Srwatson
334222748Srwatson	pcbinfo = inp->inp_pcbinfo;
335222748Srwatson	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
336222748Srwatson		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
337222748Srwatson	head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
338222748Srwatson	    0, pcbinfo->ipi_wildmask)];
339222748Srwatson	LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
340222748Srwatson	inp->inp_flags2 |= INP_PCBGROUPWILD;
341222748Srwatson	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
342222748Srwatson		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
343222748Srwatson}
344222748Srwatson
345222748Srwatsonstatic void
346222748Srwatsonin_pcbwild_remove(struct inpcb *inp)
347222748Srwatson{
348222748Srwatson	struct inpcbinfo *pcbinfo;
349222748Srwatson	u_int pgn;
350222748Srwatson
351222748Srwatson	INP_WLOCK_ASSERT(inp);
352222748Srwatson	KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
353222748Srwatson	    ("%s: not wild", __func__));
354222748Srwatson
355222748Srwatson	pcbinfo = inp->inp_pcbinfo;
356222748Srwatson	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
357222748Srwatson		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
358222748Srwatson	LIST_REMOVE(inp, inp_pcbgroup_wild);
359222748Srwatson	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
360222748Srwatson		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
361222748Srwatson	inp->inp_flags2 &= ~INP_PCBGROUPWILD;
362222748Srwatson}
363222748Srwatson
364222748Srwatsonstatic __inline int
365222748Srwatsonin_pcbwild_needed(struct inpcb *inp)
366222748Srwatson{
367268479Sadrian#ifdef	RSS
368268479Sadrian	/*
369268479Sadrian	 * If it's a listen socket and INP_RSS_BUCKET_SET is set,
370268479Sadrian	 * it's a wildcard socket _but_ it's in a specific pcbgroup.
371268479Sadrian	 * Thus we don't treat it as a pcbwild inp.
372268479Sadrian	 */
373268479Sadrian	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
374268479Sadrian		return (0);
375268479Sadrian#endif
376222748Srwatson
377222748Srwatson#ifdef INET6
378222748Srwatson	if (inp->inp_vflag & INP_IPV6)
379222748Srwatson		return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
380222748Srwatson	else
381222748Srwatson#endif
382222748Srwatson		return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
383222748Srwatson}
384222748Srwatson
385222748Srwatsonstatic void
386222748Srwatsonin_pcbwild_update_internal(struct inpcb *inp)
387222748Srwatson{
388222748Srwatson	int wildcard_needed;
389222748Srwatson
390222748Srwatson	wildcard_needed = in_pcbwild_needed(inp);
391222748Srwatson	if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
392222748Srwatson		in_pcbwild_add(inp);
393222748Srwatson	else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
394222748Srwatson		in_pcbwild_remove(inp);
395222748Srwatson}
396222748Srwatson
397222748Srwatson/*
398222748Srwatson * Update the pcbgroup of an inpcb, which might include removing an old
399222748Srwatson * pcbgroup reference and/or adding a new one.  Wildcard processing is not
400222748Srwatson * performed here, although ideally we'll never install a pcbgroup for a
401222748Srwatson * wildcard inpcb (asserted below).
402222748Srwatson */
403222748Srwatsonstatic void
404222748Srwatsonin_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
405222748Srwatson    struct inpcbgroup *newpcbgroup, struct inpcb *inp)
406222748Srwatson{
407222748Srwatson	struct inpcbgroup *oldpcbgroup;
408222748Srwatson	struct inpcbhead *pcbhash;
409222748Srwatson	uint32_t hashkey_faddr;
410222748Srwatson
411222748Srwatson	INP_WLOCK_ASSERT(inp);
412222748Srwatson
413222748Srwatson	oldpcbgroup = inp->inp_pcbgroup;
414222748Srwatson	if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
415222748Srwatson		INP_GROUP_LOCK(oldpcbgroup);
416222748Srwatson		LIST_REMOVE(inp, inp_pcbgrouphash);
417222748Srwatson		inp->inp_pcbgroup = NULL;
418222748Srwatson		INP_GROUP_UNLOCK(oldpcbgroup);
419222748Srwatson	}
420222748Srwatson	if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
421222748Srwatson#ifdef INET6
422222748Srwatson		if (inp->inp_vflag & INP_IPV6)
423271386Sae			hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
424222748Srwatson		else
425222748Srwatson#endif
426222748Srwatson			hashkey_faddr = inp->inp_faddr.s_addr;
427222748Srwatson		INP_GROUP_LOCK(newpcbgroup);
428268479Sadrian		/*
429268479Sadrian		 * If the inp is an RSS bucket wildcard entry, ensure
430268479Sadrian		 * that the PCB hash is calculated correctly.
431268479Sadrian		 *
432268479Sadrian		 * The wildcard hash calculation differs from the
433268479Sadrian		 * non-wildcard definition.  The source address is
434268479Sadrian		 * INADDR_ANY and the far port is 0.
435268479Sadrian		 */
436268479Sadrian		if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
437268479Sadrian			pcbhash = &newpcbgroup->ipg_hashbase[
438268479Sadrian			    INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
439268479Sadrian			    newpcbgroup->ipg_hashmask)];
440268479Sadrian		} else {
441268479Sadrian			pcbhash = &newpcbgroup->ipg_hashbase[
442268479Sadrian			    INP_PCBHASH(hashkey_faddr, inp->inp_lport,
443268479Sadrian			    inp->inp_fport,
444268479Sadrian			    newpcbgroup->ipg_hashmask)];
445268479Sadrian		}
446222748Srwatson		LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
447222748Srwatson		inp->inp_pcbgroup = newpcbgroup;
448222748Srwatson		INP_GROUP_UNLOCK(newpcbgroup);
449222748Srwatson	}
450222748Srwatson
451222748Srwatson	KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
452222748Srwatson	    ("%s: pcbgroup and wildcard!", __func__));
453222748Srwatson}
454222748Srwatson
455222748Srwatson/*
 * Two update paths: one in which the 4-tuple on an inpcb has been updated
 * and therefore connection groups may need to change (or a wildcard entry
 * may need to be installed), and another in which the 4-tuple has been
 * set as a result of a packet received, in which case we may be able to use
 * the hash on the mbuf to avoid doing a software hash calculation for RSS.
461222748Srwatson *
462222748Srwatson * In each case: first, let the wildcard code have a go at placing it as a
463222748Srwatson * wildcard socket.  If it was a wildcard, or if the connection has been
464222748Srwatson * dropped, then no pcbgroup is required (so potentially clear it);
465222748Srwatson * otherwise, calculate and update the pcbgroup for the inpcb.
466222748Srwatson */
467222748Srwatsonvoid
468222748Srwatsonin_pcbgroup_update(struct inpcb *inp)
469222748Srwatson{
470222748Srwatson	struct inpcbinfo *pcbinfo;
471222748Srwatson	struct inpcbgroup *newpcbgroup;
472222748Srwatson
473222748Srwatson	INP_WLOCK_ASSERT(inp);
474222748Srwatson
475222748Srwatson	pcbinfo = inp->inp_pcbinfo;
476222748Srwatson	if (!in_pcbgroup_enabled(pcbinfo))
477222748Srwatson		return;
478222748Srwatson
479222748Srwatson	in_pcbwild_update_internal(inp);
480222748Srwatson	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
481222748Srwatson	    !(inp->inp_flags & INP_DROPPED)) {
482222748Srwatson#ifdef INET6
483222748Srwatson		if (inp->inp_vflag & INP_IPV6)
484222748Srwatson			newpcbgroup = in6_pcbgroup_byinpcb(inp);
485222748Srwatson		else
486222748Srwatson#endif
487222748Srwatson			newpcbgroup = in_pcbgroup_byinpcb(inp);
488222748Srwatson	} else
489222748Srwatson		newpcbgroup = NULL;
490222748Srwatson	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
491222748Srwatson}
492222748Srwatson
493222748Srwatsonvoid
494222748Srwatsonin_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
495222748Srwatson{
496222748Srwatson	struct inpcbinfo *pcbinfo;
497222748Srwatson	struct inpcbgroup *newpcbgroup;
498222748Srwatson
499222748Srwatson	INP_WLOCK_ASSERT(inp);
500222748Srwatson
501222748Srwatson	pcbinfo = inp->inp_pcbinfo;
502222748Srwatson	if (!in_pcbgroup_enabled(pcbinfo))
503222748Srwatson		return;
504222748Srwatson
505222748Srwatson	/*
506222748Srwatson	 * Possibly should assert !INP_PCBGROUPWILD rather than testing for
507222748Srwatson	 * it; presumably this function should never be called for anything
508222748Srwatson	 * other than non-wildcard socket?
509222748Srwatson	 */
510222748Srwatson	in_pcbwild_update_internal(inp);
511222748Srwatson	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
512222748Srwatson	    !(inp->inp_flags & INP_DROPPED)) {
513222748Srwatson		newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
514222748Srwatson#ifdef INET6
515222748Srwatson		if (inp->inp_vflag & INP_IPV6) {
516222748Srwatson			if (newpcbgroup == NULL)
517222748Srwatson				newpcbgroup = in6_pcbgroup_byinpcb(inp);
518222748Srwatson		} else {
519222748Srwatson#endif
520222748Srwatson			if (newpcbgroup == NULL)
521222748Srwatson				newpcbgroup = in_pcbgroup_byinpcb(inp);
522222748Srwatson#ifdef INET6
523222748Srwatson		}
524222748Srwatson#endif
525222748Srwatson	} else
526222748Srwatson		newpcbgroup = NULL;
527222748Srwatson	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
528222748Srwatson}
529222748Srwatson
530222748Srwatson/*
531222748Srwatson * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
532222748Srwatson */
533222748Srwatsonvoid
534222748Srwatsonin_pcbgroup_remove(struct inpcb *inp)
535222748Srwatson{
536222748Srwatson	struct inpcbgroup *pcbgroup;
537222748Srwatson
538222748Srwatson	INP_WLOCK_ASSERT(inp);
539222748Srwatson
540222748Srwatson	if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
541222748Srwatson		return;
542222748Srwatson
543222748Srwatson	if (inp->inp_flags2 & INP_PCBGROUPWILD)
544222748Srwatson		in_pcbwild_remove(inp);
545222748Srwatson
546222748Srwatson	pcbgroup = inp->inp_pcbgroup;
547222748Srwatson	if (pcbgroup != NULL) {
548222748Srwatson		INP_GROUP_LOCK(pcbgroup);
549222748Srwatson		LIST_REMOVE(inp, inp_pcbgrouphash);
550222748Srwatson		inp->inp_pcbgroup = NULL;
551222748Srwatson		INP_GROUP_UNLOCK(pcbgroup);
552222748Srwatson	}
553222748Srwatson}
554222748Srwatson
555222748Srwatson/*
556222748Srwatson * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
557222748Srwatson * for a protocol.
558222748Srwatson */
559222748Srwatsonint
560222748Srwatsonin_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
561222748Srwatson{
562222748Srwatson
563222748Srwatson	return (pcbinfo->ipi_npcbgroups > 0);
564222748Srwatson}
565