/*-
 * Copyright (c) 2007-2009 Robert N. M. Watson
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * This software was developed by Robert N. M. Watson under contract
 * to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/net/netisr.c 227309 2011-11-07 15:43:11Z ed $");

/*
 * netisr is a packet dispatch service, allowing synchronous (directly
 * dispatched) and asynchronous (deferred dispatch) processing of packets by
 * registered protocol handlers.  Callers pass a protocol identifier and
 * packet to netisr, along with a direct dispatch hint, and work will either
 * be immediately processed by the registered handler, or passed to a
 * software interrupt (SWI) thread for deferred dispatch.  Callers will
 * generally select one or the other based on:
 *
 * - Whether directly dispatching a netisr handler leads to code reentrance
 *   or lock recursion, such as entering the socket code from the socket
 *   code.
 * - Whether directly dispatching a netisr handler leads to recursive
 *   processing, such as when decapsulating several wrapped layers of tunnel
 *   information (IPSEC within IPSEC within ...).
 *
 * Maintaining ordering for protocol streams is a critical design concern.
 * Enforcing ordering limits the opportunity for concurrency, but maintains
 * the strong ordering requirements found in some protocols, such as TCP.  Of
 * related concern is CPU affinity--it is desirable to process all data
 * associated with a particular stream on the same CPU over time in order to
 * avoid acquiring locks associated with the connection on different CPUs,
 * keep connection data in one cache, and to generally encourage associated
 * user threads to live on the same CPU as the stream.  It's also desirable
 * to avoid lock migration and contention where locks are associated with
 * more than one flow.
 *
 * netisr supports several policy variations, represented by the
 * NETISR_POLICY_* constants, allowing protocols to play various roles in
 * identifying flows, assigning work to CPUs, etc.  These are described in
 * netisr.h.
 */
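
/*
 * Example (illustrative sketch, not compiled): a protocol registers a
 * handler once at initialization time and then hands each packet to
 * netisr.  NETISR_FOO, foo_input(), and foo_nh are hypothetical names.
 */
#if 0
static const struct netisr_handler foo_nh = {
	.nh_name = "foo",
	.nh_handler = foo_input,	/* Hypothetical handler function. */
	.nh_proto = NETISR_FOO,		/* Hypothetical protocol constant. */
	.nh_policy = NETISR_POLICY_SOURCE,
};

	netisr_register(&foo_nh);	/* Once, at protocol initialization. */
	...
	netisr_dispatch(NETISR_FOO, m);	/* Per packet; may direct dispatch. */
	netisr_queue(NETISR_FOO, m);	/* Per packet; always deferred. */
#endif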

#include "opt_ddb.h"
#include "opt_device_polling.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/interrupt.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/rmlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#define	_WANT_NETISR_INTERNAL	/* Enable definitions from netisr_internal.h */
#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
#include <net/netisr_internal.h>
#include <net/vnet.h>

/*-
 * Synchronize use and modification of the registered netisr data structures;
 * acquire a write lock while modifying the set of registered protocols so
 * that partially registered or unregistered protocols are never run.
 *
 * The following data structures and fields are protected by this lock:
 *
 * - The netisr_proto array, including all fields of struct netisr_proto.
 * - The nws array, including all fields of struct netisr_workstream.
 * - The nws_array array.
 *
 * Note: the NETISR_LOCKING define controls whether read locks are acquired
 * in packet processing paths requiring netisr registration stability.  This
 * is disabled by default as it can lead to measurable performance
 * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
 * because netisr registration and unregistration is extremely rare at
 * runtime.  If it becomes more common, this decision should be revisited.
 *
 * XXXRW: rmlocks don't support assertions.
 */
static struct rmlock	netisr_rmlock;
#define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
				    RM_NOWITNESS)
#define	NETISR_LOCK_ASSERT()
#define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
#define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
#define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
#define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
/* #define	NETISR_LOCKING */

static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");

/*-
 * Three global direct dispatch policies are supported:
 *
 * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of
 * context (may be overridden by protocols).
 *
 * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch,
 * and we're running on the CPU the work would be performed on, then direct
 * dispatch it if it wouldn't violate ordering constraints on the workstream.
 *
 * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch,
 * always direct dispatch.  (The default.)
 *
 * Notice that changing the global policy could lead to short periods of
 * misordered processing, but this is considered acceptable as compared to
 * the complexity of enforcing ordering during policy changes.  Protocols can
 * override the global policy (when they don't, they select
 * NETISR_DISPATCH_DEFAULT).
 */
#define	NETISR_DISPATCH_POLICY_DEFAULT	NETISR_DISPATCH_DIRECT
#define	NETISR_DISPATCH_POLICY_MAXSTR	20 /* Used for temporary buffers. */
static u_int	netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT;
static int	sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RW |
    CTLFLAG_TUN, 0, 0, sysctl_netisr_dispatch_policy, "A",
    "netisr dispatch policy");
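
/*
 * Example: the global policy can be selected at boot or at runtime
 * (values illustrative):
 *
 *	loader.conf:	net.isr.dispatch="deferred"
 *	runtime:	sysctl net.isr.dispatch=hybrid
 */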

/*
 * These sysctls were used in previous versions to control and export
 * dispatch policy state.  Now, we provide read-only export via them so that
 * older netstat binaries work.  At some point they can be garbage collected.
 */
static int	netisr_direct_force;
SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RD,
    &netisr_direct_force, 0, "compat: force direct dispatch");

static int	netisr_direct;
SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RD, &netisr_direct, 0,
    "compat: enable direct dispatch");

/*
 * Allow the administrator to limit the number of threads (CPUs) to use for
 * netisr.  We don't check netisr_maxthreads before creating the thread for
 * CPU 0, so in practice we ignore values <= 1.  This must be set at boot.
 * We will create at most one thread per CPU.
 */
static int	netisr_maxthreads = -1;		/* Max number of threads. */
TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads);
SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN,
    &netisr_maxthreads, 0,
    "Use at most this many CPUs for netisr processing");

static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads);
SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN,
    &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");

/*
 * Limit per-workstream mbuf queue limits to at most net.isr.maxqlimit,
 * both for initial configuration and later modification using
 * netisr_setqlimit().
 */
#define	NETISR_DEFAULT_MAXQLIMIT	10240
static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit);
SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN,
    &netisr_maxqlimit, 0,
    "Maximum netisr per-protocol, per-CPU queue depth.");

/*
 * The default per-workstream mbuf queue limit for protocols that don't
 * initialize the nh_qlimit field of their struct netisr_handler.  If this is
 * set above netisr_maxqlimit, we truncate it to the maximum during boot.
 */
#define	NETISR_DEFAULT_DEFAULTQLIMIT	256
static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit);
SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN,
    &netisr_defaultqlimit, 0,
    "Default netisr per-protocol, per-CPU queue limit if not set by protocol");

/*
 * Store and export the compile-time constant NETISR_MAXPROT limit on the
 * number of protocols that can register with netisr at a time.  This is
 * required for crashdump analysis, as it sizes netisr_proto[].
 */
static u_int	netisr_maxprot = NETISR_MAXPROT;
SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD,
    &netisr_maxprot, 0,
    "Compile-time limit on the number of protocols supported by netisr.");

/*
 * The netisr_proto array describes all registered protocols, indexed by
 * protocol number.  See netisr_internal.h for more details.
 */
static struct netisr_proto	netisr_proto[NETISR_MAXPROT];

/*
 * Per-CPU workstream data.  See netisr_internal.h for more details.
 */
DPCPU_DEFINE(struct netisr_workstream, nws);

/*
 * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
 * accessing workstreams.  This allows constructions of the form
 * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
 */
static u_int				 nws_array[MAXCPU];

/*
 * Number of registered workstreams.  Will be at most the number of running
 * CPUs once fully started.
 */
static u_int				 nws_count;
SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
    &nws_count, 0, "Number of extant netisr threads.");

/*
 * Synchronization for each workstream: a mutex protects all mutable fields
 * in each stream, including per-protocol state (mbuf queues).  The SWI is
 * woken up if asynchronous dispatch is required.
 */
#define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
#define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
#define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
#define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)

/*
 * Utility routines for protocols that implement their own mapping of flows
 * to CPUs.
 */
u_int
netisr_get_cpucount(void)
{

	return (nws_count);
}

u_int
netisr_get_cpuid(u_int cpunumber)
{

	KASSERT(cpunumber < nws_count, ("%s: %u >= %u", __func__, cpunumber,
	    nws_count));

	return (nws_array[cpunumber]);
}

/*
 * The default implementation of flow -> CPU ID mapping.
 *
 * Non-static so that protocols can use it to map their own work to specific
 * CPUs in a manner consistent with netisr for affinity purposes.
 */
u_int
netisr_default_flow2cpu(u_int flowid)
{

	return (nws_array[flowid % nws_count]);
}
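
/*
 * Sketch (hypothetical, not compiled): a protocol with NETISR_POLICY_CPU
 * might implement its nh_m2cpuid callback by hashing protocol-private
 * header fields and reusing the default flow -> CPU mapping.  foo_hash()
 * is an assumed protocol-private function.
 */
#if 0
static struct mbuf *
foo_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuidp)
{

	*cpuidp = netisr_default_flow2cpu(foo_hash(m));
	return (m);
}
#endif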

/*
 * Dispatch tunable and sysctl configuration.
 */
struct netisr_dispatch_table_entry {
	u_int		 ndte_policy;
	const char	*ndte_policy_str;
};
static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = {
	{ NETISR_DISPATCH_DEFAULT, "default" },
	{ NETISR_DISPATCH_DEFERRED, "deferred" },
	{ NETISR_DISPATCH_HYBRID, "hybrid" },
	{ NETISR_DISPATCH_DIRECT, "direct" },
};
static const u_int netisr_dispatch_table_len =
    (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0]));

static void
netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer,
    u_int buflen)
{
	const struct netisr_dispatch_table_entry *ndtep;
	const char *str;
	u_int i;

	str = "unknown";
	for (i = 0; i < netisr_dispatch_table_len; i++) {
		ndtep = &netisr_dispatch_table[i];
		if (ndtep->ndte_policy == dispatch_policy) {
			str = ndtep->ndte_policy_str;
			break;
		}
	}
	snprintf(buffer, buflen, "%s", str);
}

static int
netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp)
{
	const struct netisr_dispatch_table_entry *ndtep;
	u_int i;

	for (i = 0; i < netisr_dispatch_table_len; i++) {
		ndtep = &netisr_dispatch_table[i];
		if (strcmp(ndtep->ndte_policy_str, str) == 0) {
			*dispatch_policyp = ndtep->ndte_policy;
			return (0);
		}
	}
	return (EINVAL);
}

static void
netisr_dispatch_policy_compat(void)
{

	switch (netisr_dispatch_policy) {
	case NETISR_DISPATCH_DEFERRED:
		netisr_direct_force = 0;
		netisr_direct = 0;
		break;

	case NETISR_DISPATCH_HYBRID:
		netisr_direct_force = 0;
		netisr_direct = 1;
		break;

	case NETISR_DISPATCH_DIRECT:
		netisr_direct_force = 1;
		netisr_direct = 1;
		break;

	default:
		panic("%s: unknown policy %u", __func__,
		    netisr_dispatch_policy);
	}
}

static int
sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS)
{
	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
	u_int dispatch_policy;
	int error;

	netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp,
	    sizeof(tmp));
	error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req);
	if (error == 0 && req->newptr != NULL) {
		error = netisr_dispatch_policy_from_str(tmp,
		    &dispatch_policy);
		if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT)
			error = EINVAL;
		if (error == 0) {
			netisr_dispatch_policy = dispatch_policy;
			netisr_dispatch_policy_compat();
		}
	}
	return (error);
}

/*
 * Register a new netisr handler, which requires initializing per-protocol
 * fields for each workstream.  All netisr work is briefly suspended while
 * the protocol is installed.
 */
void
netisr_register(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
	const char *name;
	u_int i, proto;

	proto = nhp->nh_proto;
	name = nhp->nh_name;

	/*
	 * Test that the requested registration is valid.
	 */
	KASSERT(nhp->nh_name != NULL,
	    ("%s: nh_name NULL for %u", __func__, proto));
	KASSERT(nhp->nh_handler != NULL,
	    ("%s: nh_handler NULL for %s", __func__, name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
	    nhp->nh_policy == NETISR_POLICY_FLOW ||
	    nhp->nh_policy == NETISR_POLICY_CPU,
	    ("%s: unsupported nh_policy %u for %s", __func__,
	    nhp->nh_policy, name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
	    nhp->nh_m2flow == NULL,
	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
	    name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
	    name));
	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
	    name));
	KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT ||
	    nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED ||
	    nhp->nh_dispatch == NETISR_DISPATCH_HYBRID ||
	    nhp->nh_dispatch == NETISR_DISPATCH_DIRECT,
	    ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch));

	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u, %s): protocol too big", __func__, proto, name));

	/*
	 * Test that no existing registration exists for this protocol.
	 */
	NETISR_WLOCK();
	KASSERT(netisr_proto[proto].np_name == NULL,
	    ("%s(%u, %s): name present", __func__, proto, name));
	KASSERT(netisr_proto[proto].np_handler == NULL,
	    ("%s(%u, %s): handler present", __func__, proto, name));

	netisr_proto[proto].np_name = name;
	netisr_proto[proto].np_handler = nhp->nh_handler;
	netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
	netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
	netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
	if (nhp->nh_qlimit == 0)
		netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
	else if (nhp->nh_qlimit > netisr_maxqlimit) {
		printf("%s: %s requested queue limit %u capped to "
		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
		    netisr_maxqlimit);
		netisr_proto[proto].np_qlimit = netisr_maxqlimit;
	} else
		netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
	netisr_proto[proto].np_policy = nhp->nh_policy;
	netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
	CPU_FOREACH(i) {
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		bzero(npwp, sizeof(*npwp));
		npwp->nw_qlimit = netisr_proto[proto].np_qlimit;
	}
	NETISR_WUNLOCK();
}

/*
 * Clear drop counters across all workstreams for a protocol.
 */
void
netisr_clearqdrops(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(netisr_proto[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	CPU_FOREACH(i) {
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		npwp->nw_qdrops = 0;
	}
	NETISR_WUNLOCK();
}

/*
 * Query current drop counters across all workstreams for a protocol.
 */
void
netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
{
	struct netisr_work *npwp;
	struct rm_priotracker tracker;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	*qdropp = 0;
	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_RLOCK(&tracker);
	KASSERT(netisr_proto[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	CPU_FOREACH(i) {
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		*qdropp += npwp->nw_qdrops;
	}
	NETISR_RUNLOCK(&tracker);
}

/*
 * Query current per-workstream queue limit for a protocol.
 */
void
netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
{
	struct rm_priotracker tracker;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_RLOCK(&tracker);
	KASSERT(netisr_proto[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));
	*qlimitp = netisr_proto[proto].np_qlimit;
	NETISR_RUNLOCK(&tracker);
}

/*
 * Update the queue limit across per-workstream queues for a protocol.  We
 * simply change the limits, and don't drain overflowed packets as they will
 * (hopefully) take care of themselves shortly.
 */
int
netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	if (qlimit > netisr_maxqlimit)
		return (EINVAL);

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(netisr_proto[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	netisr_proto[proto].np_qlimit = qlimit;
	CPU_FOREACH(i) {
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		npwp->nw_qlimit = qlimit;
	}
	NETISR_WUNLOCK();
	return (0);
}
59762587Sitojun
59854263Sshin/*
599105339Sume * Drain all packets currently held in a particular protocol work queue.
60054263Sshin */
601105339Sumestatic void
60254263Sshinnetisr_drain_proto(struct netisr_work *npwp)
60354263Sshin{
60454263Sshin	struct mbuf *m;
60578064Sume
60662587Sitojun	/*
60778064Sume	 * We would assert the lock on the workstream but it's not passed in.
60862587Sitojun	 */
60954263Sshin	while ((m = npwp->nw_head) != NULL) {
61054263Sshin		npwp->nw_head = m->m_nextpkt;
61154263Sshin		m->m_nextpkt = NULL;
61254263Sshin		if (npwp->nw_head == NULL)
61362587Sitojun			npwp->nw_tail = NULL;
61478064Sume		npwp->nw_len--;
61562587Sitojun		m_freem(m);
61662587Sitojun	}
61762587Sitojun	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
61862587Sitojun	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
61962587Sitojun}
62062587Sitojun
62162587Sitojun/*
62262587Sitojun * Remove the registration of a network protocol, which requires clearing
62378064Sume * per-protocol fields across all workstreams, including freeing all mbufs in
62478064Sume * the queues at time of unregister.  All work in netisr is briefly suspended
62578064Sume * while this takes place.
62678064Sume */
62778064Sumevoid
628105293Sumenetisr_unregister(const struct netisr_handler *nhp)
62991327Sbrooks{
630105293Sume	struct netisr_work *npwp;
63162587Sitojun#ifdef INVARIANTS
63254263Sshin	const char *name;
63378064Sume#endif
63478064Sume	u_int i, proto;
63578064Sume
63678064Sume	proto = nhp->nh_proto;
63778064Sume#ifdef INVARIANTS
63878064Sume	name = nhp->nh_name;
63978064Sume#endif
64078064Sume	KASSERT(proto < NETISR_MAXPROT,
64178064Sume	    ("%s(%u): protocol too big for %s", __func__, proto, name));
64278064Sume
64378064Sume	NETISR_WLOCK();
64478064Sume	KASSERT(netisr_proto[proto].np_handler != NULL,
64578064Sume	    ("%s(%u): protocol not registered for %s", __func__, proto,
64678064Sume	    name));
64778064Sume
64878064Sume	netisr_proto[proto].np_name = NULL;
64978064Sume	netisr_proto[proto].np_handler = NULL;
65078064Sume	netisr_proto[proto].np_m2flow = NULL;
65178064Sume	netisr_proto[proto].np_m2cpuid = NULL;
65278064Sume	netisr_proto[proto].np_qlimit = 0;
65378064Sume	netisr_proto[proto].np_policy = 0;
65478064Sume	CPU_FOREACH(i) {
65578064Sume		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
65678064Sume		netisr_drain_proto(npwp);
65778064Sume		bzero(npwp, sizeof(*npwp));
65878064Sume	}
65978064Sume	NETISR_WUNLOCK();
66078064Sume}
66178064Sume
66278064Sume/*
66378064Sume * Compose the global and per-protocol policies on dispatch, and return the
66478064Sume * dispatch policy to use.
66578064Sume */
66678064Sumestatic u_int
66778064Sumenetisr_get_dispatch(struct netisr_proto *npp)
66878064Sume{
66978064Sume
67078064Sume	/*
67178064Sume	 * Protocol-specific configuration overrides the global default.
67278064Sume	 */
67378064Sume	if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT)
67478064Sume		return (npp->np_dispatch);
67578064Sume	return (netisr_dispatch_policy);
67678064Sume}
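
/*
 * For example, a handler registered with nh_dispatch set to
 * NETISR_DISPATCH_DEFERRED is always queued, however net.isr.dispatch is
 * configured; only handlers registered with NETISR_DISPATCH_DEFAULT defer
 * to the global policy.
 */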

/*
 * Look up the workstream given a packet and source identifier.  Do this by
 * checking the protocol's policy, and optionally call out to the protocol
 * for assistance if required.
 */
static struct mbuf *
netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy,
    uintptr_t source, struct mbuf *m, u_int *cpuidp)
{
	struct ifnet *ifp;
	u_int policy;

	NETISR_LOCK_ASSERT();

	/*
	 * In the event we have only one worker, shortcut and deliver to it
	 * without further ado.
	 */
	if (nws_count == 1) {
		*cpuidp = nws_array[0];
		return (m);
	}

	/*
	 * What happens next depends on the policy selected by the protocol.
	 * If we want to support per-interface policies, we should do that
	 * here first.
	 */
	policy = npp->np_policy;
	if (policy == NETISR_POLICY_CPU) {
		m = npp->np_m2cpuid(m, source, cpuidp);
		if (m == NULL)
			return (NULL);

		/*
		 * It's possible for a protocol not to have a good idea about
		 * where to process a packet, in which case we fall back on
		 * the netisr code to decide.  In the hybrid case, return the
		 * current CPU ID, which will force an immediate direct
		 * dispatch.  In the queued case, fall back on the SOURCE
		 * policy.
		 */
		if (*cpuidp != NETISR_CPUID_NONE)
			return (m);
		if (dispatch_policy == NETISR_DISPATCH_HYBRID) {
			*cpuidp = curcpu;
			return (m);
		}
		policy = NETISR_POLICY_SOURCE;
	}

	if (policy == NETISR_POLICY_FLOW) {
		if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) {
			m = npp->np_m2flow(m, source);
			if (m == NULL)
				return (NULL);
		}
		if (m->m_flags & M_FLOWID) {
			*cpuidp =
			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
			return (m);
		}
		policy = NETISR_POLICY_SOURCE;
	}

	KASSERT(policy == NETISR_POLICY_SOURCE,
	    ("%s: invalid policy %u for %s", __func__, npp->np_policy,
	    npp->np_name));

	ifp = m->m_pkthdr.rcvif;
	if (ifp != NULL)
		*cpuidp = nws_array[(ifp->if_index + source) % nws_count];
	else
		*cpuidp = nws_array[source % nws_count];
	return (m);
}

/*
 * Process packets associated with a workstream and protocol.  For reasons of
 * fairness, we process up to one complete netisr queue at a time, moving the
 * queue to a stack-local queue for processing, but do not loop refreshing
 * from the global queue.  The caller is responsible for deciding whether to
 * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
 * locked on entry and relocked before return, but will be released while
 * processing.  The number of packets processed is returned.
 */
static u_int
netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
{
	struct netisr_work local_npw, *npwp;
	u_int handled;
	struct mbuf *m;

	NETISR_LOCK_ASSERT();
	NWS_LOCK_ASSERT(nwsp);

	KASSERT(nwsp->nws_flags & NWS_RUNNING,
	    ("%s(%u): not running", __func__, proto));
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): invalid proto", __func__, proto));

	npwp = &nwsp->nws_work[proto];
	if (npwp->nw_len == 0)
		return (0);

	/*
	 * Move the global work queue to a thread-local work queue.
	 *
	 * Notice that this means the effective maximum length of the queue
	 * is actually twice that of the maximum queue length specified in
	 * the protocol registration call.
	 */
	handled = npwp->nw_len;
	local_npw = *npwp;
	npwp->nw_head = NULL;
	npwp->nw_tail = NULL;
	npwp->nw_len = 0;
	nwsp->nws_pendingbits &= ~(1 << proto);
	NWS_UNLOCK(nwsp);
	while ((m = local_npw.nw_head) != NULL) {
		local_npw.nw_head = m->m_nextpkt;
		m->m_nextpkt = NULL;
		if (local_npw.nw_head == NULL)
			local_npw.nw_tail = NULL;
		local_npw.nw_len--;
		VNET_ASSERT(m->m_pkthdr.rcvif != NULL,
		    ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m));
		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
		netisr_proto[proto].np_handler(m);
		CURVNET_RESTORE();
	}
	KASSERT(local_npw.nw_len == 0,
	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
	if (netisr_proto[proto].np_drainedcpu)
		netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
	NWS_LOCK(nwsp);
	npwp->nw_handled += handled;
	return (handled);
}

/*
 * SWI handler for netisr -- processes packets in a set of workstreams that
 * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
 * being direct dispatched, go back to sleep and wait for the dispatching
 * thread to wake us up again.
 */
static void
swi_net(void *arg)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	struct netisr_workstream *nwsp;
	u_int bits, prot;

	nwsp = arg;

#ifdef DEVICE_POLLING
	KASSERT(nws_count == 1,
	    ("%s: device_polling but nws_count != 1", __func__));
	netisr_poll();
#endif
#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	NWS_LOCK(nwsp);
	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
	if (nwsp->nws_flags & NWS_DISPATCHING)
		goto out;
	nwsp->nws_flags |= NWS_RUNNING;
	nwsp->nws_flags &= ~NWS_SCHEDULED;
	while ((bits = nwsp->nws_pendingbits) != 0) {
		while ((prot = ffs(bits)) != 0) {
			prot--;
			bits &= ~(1 << prot);
			(void)netisr_process_workstream_proto(nwsp, prot);
		}
	}
	nwsp->nws_flags &= ~NWS_RUNNING;
out:
	NWS_UNLOCK(nwsp);
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
#ifdef DEVICE_POLLING
	netisr_pollmore();
#endif
}

static int
netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
    struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
{

	NWS_LOCK_ASSERT(nwsp);

	*dosignalp = 0;
	if (npwp->nw_len < npwp->nw_qlimit) {
		m->m_nextpkt = NULL;
		if (npwp->nw_head == NULL) {
			npwp->nw_head = m;
			npwp->nw_tail = m;
		} else {
			npwp->nw_tail->m_nextpkt = m;
			npwp->nw_tail = m;
		}
		npwp->nw_len++;
		if (npwp->nw_len > npwp->nw_watermark)
			npwp->nw_watermark = npwp->nw_len;

		/*
		 * We must set the bit regardless of NWS_RUNNING, so that
		 * swi_net() keeps calling netisr_process_workstream_proto().
		 */
		nwsp->nws_pendingbits |= (1 << proto);
		if (!(nwsp->nws_flags &
		    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) {
			nwsp->nws_flags |= NWS_SCHEDULED;
			*dosignalp = 1;	/* Defer until unlocked. */
		}
		npwp->nw_queued++;
		return (0);
	} else {
		m_freem(m);
		npwp->nw_qdrops++;
		return (ENOBUFS);
	}
}

static int
netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
{
	struct netisr_workstream *nwsp;
	struct netisr_work *npwp;
	int dosignal, error;

#ifdef NETISR_LOCKING
	NETISR_LOCK_ASSERT();
#endif
	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
	    cpuid, mp_maxid));
	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));

	dosignal = 0;
	error = 0;
	nwsp = DPCPU_ID_PTR(cpuid, nws);
	npwp = &nwsp->nws_work[proto];
	NWS_LOCK(nwsp);
	error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal);
	NWS_UNLOCK(nwsp);
	if (dosignal)
		NWS_SIGNAL(nwsp);
	return (error);
}

int
netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	u_int cpuid;
	int error;

	KASSERT(proto < NETISR_MAXPROT,
	    ("%s: invalid proto %u", __func__, proto));

#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	KASSERT(netisr_proto[proto].np_handler != NULL,
	    ("%s: invalid proto %u", __func__, proto));

	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
	    source, m, &cpuid);
	if (m != NULL) {
		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
		    cpuid));
		error = netisr_queue_internal(proto, m, cpuid);
	} else
		error = ENOBUFS;
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
	return (error);
}

int
netisr_queue(u_int proto, struct mbuf *m)
{

	return (netisr_queue_src(proto, 0, m));
}

/*
 * Dispatch a packet for netisr processing; direct dispatch is permitted by
 * calling context.
 */
int
netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	struct netisr_workstream *nwsp;
	struct netisr_proto *npp;
	struct netisr_work *npwp;
	int dosignal, error;
	u_int cpuid, dispatch_policy;

	KASSERT(proto < NETISR_MAXPROT,
	    ("%s: invalid proto %u", __func__, proto));
#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	npp = &netisr_proto[proto];
	KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__,
	    proto));

	dispatch_policy = netisr_get_dispatch(npp);
	if (dispatch_policy == NETISR_DISPATCH_DEFERRED)
		return (netisr_queue_src(proto, source, m));

	/*
	 * If direct dispatch is forced, then unconditionally dispatch
	 * without a formal CPU selection.  Borrow the current CPU's stats,
	 * even if there's no worker on it.  In this case we don't update
	 * nws_flags because all netisr processing will be source ordered due
	 * to always being forced to directly dispatch.
	 */
	if (dispatch_policy == NETISR_DISPATCH_DIRECT) {
		nwsp = DPCPU_PTR(nws);
		npwp = &nwsp->nws_work[proto];
		npwp->nw_dispatched++;
		npwp->nw_handled++;
		netisr_proto[proto].np_handler(m);
		error = 0;
		goto out_unlock;
	}

	KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID,
	    ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy));

	/*
	 * Otherwise, we execute in a hybrid mode where we will try to direct
	 * dispatch if we're on the right CPU and the netisr worker isn't
	 * already running.
	 */
	sched_pin();
	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID,
	    source, m, &cpuid);
	if (m == NULL) {
		error = ENOBUFS;
		goto out_unpin;
	}
	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
	if (cpuid != curcpu)
		goto queue_fallback;
	nwsp = DPCPU_PTR(nws);
	npwp = &nwsp->nws_work[proto];

	/*-
	 * We are willing to direct dispatch only if three conditions hold:
	 *
	 * (1) The netisr worker isn't already running,
	 * (2) Another thread isn't already directly dispatching, and
	 * (3) The netisr hasn't already been woken up.
	 */
	NWS_LOCK(nwsp);
	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
		error = netisr_queue_workstream(nwsp, proto, npwp, m,
		    &dosignal);
		NWS_UNLOCK(nwsp);
		if (dosignal)
			NWS_SIGNAL(nwsp);
		goto out_unpin;
	}

	/*
	 * The current thread is now effectively the netisr worker, so set
	 * the dispatching flag to prevent concurrent processing of the
	 * stream from another thread (even the netisr worker), which could
	 * otherwise lead to effective misordering of the stream.
	 */
	nwsp->nws_flags |= NWS_DISPATCHING;
	NWS_UNLOCK(nwsp);
	netisr_proto[proto].np_handler(m);
	NWS_LOCK(nwsp);
	nwsp->nws_flags &= ~NWS_DISPATCHING;
	npwp->nw_handled++;
	npwp->nw_hybrid_dispatched++;

	/*
	 * If other work was enqueued by another thread while we were direct
	 * dispatching, we need to signal the netisr worker to do that work.
	 * In the future, we might want to do some of that work in the
	 * current thread, rather than trigger further context switches.  If
	 * so, we'll want to establish a reasonable bound on the work done in
	 * the "borrowed" context.
	 */
	if (nwsp->nws_pendingbits != 0) {
		nwsp->nws_flags |= NWS_SCHEDULED;
		dosignal = 1;
	} else
		dosignal = 0;
	NWS_UNLOCK(nwsp);
	if (dosignal)
		NWS_SIGNAL(nwsp);
	error = 0;
	goto out_unpin;

queue_fallback:
	error = netisr_queue_internal(proto, m, cpuid);
out_unpin:
	sched_unpin();
out_unlock:
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
	return (error);
}

int
netisr_dispatch(u_int proto, struct mbuf *m)
{

	return (netisr_dispatch_src(proto, 0, m));
}

#ifdef DEVICE_POLLING
/*
 * Kernel polling borrows a netisr thread to run interface polling in; this
 * function allows kernel polling to request that the netisr thread be
 * scheduled even if no packets are pending for protocols.
 */
void
netisr_sched_poll(void)
{
	struct netisr_workstream *nwsp;

	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
	NWS_SIGNAL(nwsp);
}
#endif

static void
netisr_start_swi(u_int cpuid, struct pcpu *pc)
{
	char swiname[12];
	struct netisr_workstream *nwsp;
	int error;

	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));

	nwsp = DPCPU_ID_PTR(cpuid, nws);
	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
	nwsp->nws_cpu = cpuid;
	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
	    SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie);
	if (error)
		panic("%s: swi_add %d", __func__, error);
	pc->pc_netisr = nwsp->nws_intr_event;
	if (netisr_bindthreads) {
		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
		if (error != 0)
			printf("%s: cpu %u: intr_event_bind: %d\n", __func__,
			    cpuid, error);
	}
	NETISR_WLOCK();
	nws_array[nws_count] = nwsp->nws_cpu;
	nws_count++;
	NETISR_WUNLOCK();
}

/*
 * Initialize the netisr subsystem.  We rely on BSS and static initialization
 * of most fields in global data structures.
 *
 * Start a worker thread for the boot CPU so that we can support network
 * traffic immediately in case the network stack is used before additional
 * CPUs are started (for example, diskless boot).
 */
static void
netisr_init(void *arg)
{
	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
	u_int dispatch_policy;
	int error;

	KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__));

	NETISR_LOCK_INIT();
	if (netisr_maxthreads < 1)
		netisr_maxthreads = 1;
	if (netisr_maxthreads > mp_ncpus) {
		printf("netisr_init: forcing maxthreads from %d to %d\n",
		    netisr_maxthreads, mp_ncpus);
		netisr_maxthreads = mp_ncpus;
	}
	if (netisr_defaultqlimit > netisr_maxqlimit) {
		printf("netisr_init: forcing defaultqlimit from %u to %u\n",
		    netisr_defaultqlimit, netisr_maxqlimit);
		netisr_defaultqlimit = netisr_maxqlimit;
	}
#ifdef DEVICE_POLLING
	/*
	 * The device polling code is not yet aware of how to deal with
	 * multiple netisr threads, so for the time being compiling in device
	 * polling disables parallel netisr workers.
	 */
	if (netisr_maxthreads != 1 || netisr_bindthreads != 0) {
		printf("netisr_init: forcing maxthreads to 1 and "
		    "bindthreads to 0 for device polling\n");
		netisr_maxthreads = 1;
		netisr_bindthreads = 0;
	}
#endif

	if (TUNABLE_STR_FETCH("net.isr.dispatch", tmp, sizeof(tmp))) {
		error = netisr_dispatch_policy_from_str(tmp,
		    &dispatch_policy);
		if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT)
			error = EINVAL;
		if (error == 0) {
			netisr_dispatch_policy = dispatch_policy;
			netisr_dispatch_policy_compat();
		} else
			printf(
			    "%s: invalid dispatch policy %s, using default\n",
			    __func__, tmp);
	}

	netisr_start_swi(curcpu, pcpu_find(curcpu));
}
SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);
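
/*
 * Example: boot-time tuning of the thread setup from loader.conf (values
 * illustrative):
 *
 *	net.isr.maxthreads=4
 *	net.isr.bindthreads=1
 *	net.isr.defaultqlimit=512
 */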

/*
 * Start worker threads for additional CPUs.  No attempt is made to
 * gracefully handle work reassignment; we don't yet support dynamic
 * reconfiguration.
 */
static void
netisr_start(void *arg)
{
	struct pcpu *pc;

	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
		if (nws_count >= netisr_maxthreads)
			break;
		/* XXXRW: Is skipping absent CPUs still required here? */
		if (CPU_ABSENT(pc->pc_cpuid))
			continue;
		/* Worker will already be present for boot CPU. */
		if (pc->pc_netisr != NULL)
			continue;
		netisr_start_swi(pc->pc_cpuid, pc);
	}
}
SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);

/*
 * Sysctl monitoring for netisr: query a list of registered protocols.
 */
static int
sysctl_netisr_proto(SYSCTL_HANDLER_ARGS)
{
	struct rm_priotracker tracker;
	struct sysctl_netisr_proto *snpp, *snp_array;
	struct netisr_proto *npp;
	u_int counter, proto;
	int error;

	if (req->newptr != NULL)
		return (EINVAL);
	snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP,
	    M_ZERO | M_WAITOK);
	counter = 0;
	NETISR_RLOCK(&tracker);
	for (proto = 0; proto < NETISR_MAXPROT; proto++) {
		npp = &netisr_proto[proto];
		if (npp->np_name == NULL)
			continue;
		snpp = &snp_array[counter];
		snpp->snp_version = sizeof(*snpp);
		strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN);
		snpp->snp_proto = proto;
		snpp->snp_qlimit = npp->np_qlimit;
		snpp->snp_policy = npp->np_policy;
		snpp->snp_dispatch = npp->np_dispatch;
		if (npp->np_m2flow != NULL)
			snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW;
		if (npp->np_m2cpuid != NULL)
			snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID;
		if (npp->np_drainedcpu != NULL)
			snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU;
		counter++;
	}
	NETISR_RUNLOCK(&tracker);
	KASSERT(counter <= NETISR_MAXPROT,
	    ("sysctl_netisr_proto: counter too big (%d)", counter));
	error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter);
	free(snp_array, M_TEMP);
	return (error);
}

SYSCTL_PROC(_net_isr, OID_AUTO, proto,
    CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto,
    "S,sysctl_netisr_proto",
    "Return list of protocols registered with netisr");

/*
 * Sysctl monitoring for netisr: query a list of workstreams.
 */
static int
sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS)
{
	struct rm_priotracker tracker;
	struct sysctl_netisr_workstream *snwsp, *snws_array;
	struct netisr_workstream *nwsp;
	u_int counter, cpuid;
	int error;

	if (req->newptr != NULL)
		return (EINVAL);
	snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP,
	    M_ZERO | M_WAITOK);
	counter = 0;
	NETISR_RLOCK(&tracker);
	CPU_FOREACH(cpuid) {
		nwsp = DPCPU_ID_PTR(cpuid, nws);
		if (nwsp->nws_intr_event == NULL)
			continue;
		NWS_LOCK(nwsp);
		snwsp = &snws_array[counter];
		snwsp->snws_version = sizeof(*snwsp);

		/*
		 * For now, we equate workstream IDs and CPU IDs in the
		 * kernel, but expose them independently to userspace in case
		 * that assumption changes in the future.
		 */
		snwsp->snws_wsid = cpuid;
		snwsp->snws_cpu = cpuid;
		if (nwsp->nws_intr_event != NULL)
			snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR;
		NWS_UNLOCK(nwsp);
		counter++;
	}
	NETISR_RUNLOCK(&tracker);
	KASSERT(counter <= MAXCPU,
	    ("sysctl_netisr_workstream: counter too big (%d)", counter));
	error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter);
	free(snws_array, M_TEMP);
	return (error);
}

SYSCTL_PROC(_net_isr, OID_AUTO, workstream,
    CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream,
    "S,sysctl_netisr_workstream",
    "Return list of workstreams implemented by netisr");

/*
 * Sysctl monitoring for netisr: query per-protocol data across all
 * workstreams.
 */
static int
sysctl_netisr_work(SYSCTL_HANDLER_ARGS)
{
	struct rm_priotracker tracker;
	struct sysctl_netisr_work *snwp, *snw_array;
	struct netisr_workstream *nwsp;
	struct netisr_proto *npp;
	struct netisr_work *nwp;
	u_int counter, cpuid, proto;
	int error;

	if (req->newptr != NULL)
		return (EINVAL);
	snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT,
	    M_TEMP, M_ZERO | M_WAITOK);
	counter = 0;
	NETISR_RLOCK(&tracker);
	CPU_FOREACH(cpuid) {
		nwsp = DPCPU_ID_PTR(cpuid, nws);
		if (nwsp->nws_intr_event == NULL)
			continue;
		NWS_LOCK(nwsp);
		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
			npp = &netisr_proto[proto];
			if (npp->np_name == NULL)
				continue;
			nwp = &nwsp->nws_work[proto];
			snwp = &snw_array[counter];
			snwp->snw_version = sizeof(*snwp);
			snwp->snw_wsid = cpuid;		/* See comment above. */
			snwp->snw_proto = proto;
			snwp->snw_len = nwp->nw_len;
			snwp->snw_watermark = nwp->nw_watermark;
			snwp->snw_dispatched = nwp->nw_dispatched;
			snwp->snw_hybrid_dispatched =
			    nwp->nw_hybrid_dispatched;
			snwp->snw_qdrops = nwp->nw_qdrops;
			snwp->snw_queued = nwp->nw_queued;
			snwp->snw_handled = nwp->nw_handled;
			counter++;
		}
		NWS_UNLOCK(nwsp);
	}
	KASSERT(counter <= MAXCPU * NETISR_MAXPROT,
	    ("sysctl_netisr_work: counter too big (%d)", counter));
	NETISR_RUNLOCK(&tracker);
	error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter);
	free(snw_array, M_TEMP);
	return (error);
}

SYSCTL_PROC(_net_isr, OID_AUTO, work,
    CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work,
    "S,sysctl_netisr_work",
    "Return list of per-workstream, per-protocol work in netisr");
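
/*
 * The three read-only sysctls above (net.isr.proto, net.isr.workstream,
 * and net.isr.work) form the monitoring interface consumed by userspace
 * tools such as "netstat -Q".
 */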

#ifdef DDB
DB_SHOW_COMMAND(netisr, db_show_netisr)
{
	struct netisr_workstream *nwsp;
	struct netisr_work *nwp;
	int first, proto;
	u_int cpuid;

	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
	CPU_FOREACH(cpuid) {
		nwsp = DPCPU_ID_PTR(cpuid, nws);
		if (nwsp->nws_intr_event == NULL)
			continue;
		first = 1;
		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
			if (netisr_proto[proto].np_handler == NULL)
				continue;
			nwp = &nwsp->nws_work[proto];
			if (first) {
				db_printf("%3d ", cpuid);
				first = 0;
			} else
				db_printf("%3s ", "");
			db_printf(
			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
			    netisr_proto[proto].np_name, nwp->nw_len,
			    nwp->nw_watermark, nwp->nw_qlimit,
			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
			    nwp->nw_qdrops, nwp->nw_queued);
		}
	}
}
#endif