1/*
2 * Copyright (c) 1999-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections.  This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34#include <stddef.h>
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/malloc.h>
40#include <sys/mbuf.h>
41#include <sys/socket.h>
42#include <sys/domain.h>
43#include <sys/user.h>
44#include <sys/random.h>
45#include <sys/socketvar.h>
46#include <net/if_dl.h>
47#include <net/if.h>
48#include <net/route.h>
49#include <net/if_var.h>
50#include <net/dlil.h>
51#include <net/if_arp.h>
52#include <net/iptap.h>
53#include <net/pktap.h>
54#include <sys/kern_event.h>
55#include <sys/kdebug.h>
56#include <sys/mcache.h>
57#include <sys/syslog.h>
58#include <sys/protosw.h>
59#include <sys/priv.h>
60
61#include <kern/assert.h>
62#include <kern/task.h>
63#include <kern/thread.h>
64#include <kern/sched_prim.h>
65#include <kern/locks.h>
66#include <kern/zalloc.h>
67
68#include <net/kpi_protocol.h>
69#include <net/if_types.h>
70#include <net/if_llreach.h>
71#include <net/kpi_interfacefilter.h>
72#include <net/classq/classq.h>
73#include <net/classq/classq_sfb.h>
74#include <net/flowhash.h>
75#include <net/ntstat.h>
76
77#if INET
78#include <netinet/in_var.h>
79#include <netinet/igmp_var.h>
80#include <netinet/ip_var.h>
81#include <netinet/tcp.h>
82#include <netinet/tcp_var.h>
83#include <netinet/udp.h>
84#include <netinet/udp_var.h>
85#include <netinet/if_ether.h>
86#include <netinet/in_pcb.h>
87#endif /* INET */
88
89#if INET6
90#include <netinet6/in6_var.h>
91#include <netinet6/nd6.h>
92#include <netinet6/mld6_var.h>
93#include <netinet6/scope6_var.h>
94#endif /* INET6 */
95
96#include <libkern/OSAtomic.h>
97#include <libkern/tree.h>
98
99#include <dev/random/randomdev.h>
100#include <machine/machine_routines.h>
101
102#include <mach/thread_act.h>
103#include <mach/sdt.h>
104
105#if CONFIG_MACF
106#include <sys/kauth.h>
107#include <security/mac_framework.h>
108#include <net/ethernet.h>
109#include <net/firewire.h>
110#endif
111
112#if PF
113#include <net/pfvar.h>
114#endif /* PF */
115#if PF_ALTQ
116#include <net/altq/altq.h>
117#endif /* PF_ALTQ */
118#include <net/pktsched/pktsched.h>
119
120#define DBG_LAYER_BEG		DLILDBG_CODE(DBG_DLIL_STATIC, 0)
121#define DBG_LAYER_END		DLILDBG_CODE(DBG_DLIL_STATIC, 2)
122#define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
123#define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
124#define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
125
126#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
127#define MAX_LINKADDR	    4 /* LONGWORDS */
128#define M_NKE M_IFADDR
129
130#if 1
131#define DLIL_PRINTF	printf
132#else
133#define DLIL_PRINTF	kprintf
134#endif
135
136#define	IF_DATA_REQUIRE_ALIGNED_64(f)	\
137	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
138
139#define	IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)	\
140	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
141
142enum {
143	kProtoKPI_v1	= 1,
144	kProtoKPI_v2	= 2
145};
146
147/*
148 * List of if_proto structures in if_proto_hash[] is protected by
149 * the ifnet lock.  The rest of the fields are initialized at protocol
150 * attach time and never change, thus no lock required as long as
151 * a reference to it is valid, via if_proto_ref().
152 */
153struct if_proto {
154    SLIST_ENTRY(if_proto)	next_hash;
155    u_int32_t			refcount;
156    u_int32_t			detached;
157    struct ifnet		*ifp;
158    protocol_family_t		protocol_family;
159    int				proto_kpi;
160    union {
161		struct {
162			proto_media_input		input;
163			proto_media_preout		pre_output;
164			proto_media_event		event;
165			proto_media_ioctl		ioctl;
166			proto_media_detached		detached;
167			proto_media_resolve_multi	resolve_multi;
168			proto_media_send_arp		send_arp;
169		} v1;
170		struct {
171			proto_media_input_v2		input;
172			proto_media_preout		pre_output;
173			proto_media_event		event;
174			proto_media_ioctl		ioctl;
175			proto_media_detached		detached;
176			proto_media_resolve_multi	resolve_multi;
177			proto_media_send_arp		send_arp;
178		} v2;
179	} kpi;
180};
181
182SLIST_HEAD(proto_hash_entry, if_proto);
183
184#define	DLIL_SDLMAXLEN	64
185#define	DLIL_SDLDATALEN	\
186	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
187
188struct dlil_ifnet {
189	struct ifnet	dl_if;			/* public ifnet */
190	/*
191	 * DLIL private fields, protected by dl_if_lock
192	 */
193	decl_lck_mtx_data(, dl_if_lock);
194	TAILQ_ENTRY(dlil_ifnet) dl_if_link;	/* dlil_ifnet link */
195	u_int32_t dl_if_flags;			/* flags (below) */
196	u_int32_t dl_if_refcnt;			/* refcnt */
197	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
198	void	*dl_if_uniqueid;		/* unique interface id */
199	size_t	dl_if_uniqueid_len;		/* length of the unique id */
200	char	dl_if_namestorage[IFNAMSIZ];	/* interface name storage */
201	char	dl_if_xnamestorage[IFXNAMSIZ];	/* external name storage */
202	struct {
203		struct ifaddr	ifa;		/* lladdr ifa */
204		u_int8_t	asdl[DLIL_SDLMAXLEN]; /* addr storage */
205		u_int8_t	msdl[DLIL_SDLMAXLEN]; /* mask storage */
206	} dl_if_lladdr;
207	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
208	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
209	ctrace_t	dl_if_attach;		/* attach PC stacktrace */
210	ctrace_t	dl_if_detach;		/* detach PC stacktrace */
211};
212
213/* Values for dl_if_flags (private to DLIL) */
214#define	DLIF_INUSE	0x1	/* DLIL ifnet recycler, ifnet in use */
215#define	DLIF_REUSE	0x2	/* DLIL ifnet recycles, ifnet is not new */
216#define	DLIF_DEBUG	0x4	/* has debugging info */
217
218#define	IF_REF_TRACE_HIST_SIZE	8	/* size of ref trace history */
219
220/* For gdb */
221__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
222
223struct dlil_ifnet_dbg {
224	struct dlil_ifnet	dldbg_dlif;		/* dlil_ifnet */
225	u_int16_t		dldbg_if_refhold_cnt;	/* # ifnet references */
226	u_int16_t		dldbg_if_refrele_cnt;	/* # ifnet releases */
227	/*
228	 * Circular lists of ifnet_{reference,release} callers.
229	 */
230	ctrace_t		dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
231	ctrace_t		dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
232};
233
234#define	DLIL_TO_IFP(s)	(&s->dl_if)
235#define	IFP_TO_DLIL(s)	((struct dlil_ifnet *)s)
236
237struct ifnet_filter {
238	TAILQ_ENTRY(ifnet_filter)	filt_next;
239	u_int32_t			filt_skip;
240	u_int32_t			filt_flags;
241	ifnet_t				filt_ifp;
242	const char			*filt_name;
243	void				*filt_cookie;
244	protocol_family_t		filt_protocol;
245	iff_input_func			filt_input;
246	iff_output_func			filt_output;
247	iff_event_func			filt_event;
248	iff_ioctl_func			filt_ioctl;
249	iff_detached_func		filt_detached;
250};
251
252struct proto_input_entry;
253
254static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
255static lck_grp_t *dlil_lock_group;
256lck_grp_t *ifnet_lock_group;
257static lck_grp_t *ifnet_head_lock_group;
258static lck_grp_t *ifnet_snd_lock_group;
259static lck_grp_t *ifnet_rcv_lock_group;
260lck_attr_t *ifnet_lock_attr;
261decl_lck_rw_data(static, ifnet_head_lock);
262decl_lck_mtx_data(static, dlil_ifnet_lock);
263u_int32_t dlil_filter_disable_tso_count = 0;
264
265#if DEBUG
266static unsigned int ifnet_debug = 1;	/* debugging (enabled) */
267#else
268static unsigned int ifnet_debug;	/* debugging (disabled) */
269#endif /* !DEBUG */
270static unsigned int dlif_size;		/* size of dlil_ifnet to allocate */
271static unsigned int dlif_bufsize;	/* size of dlif_size + headroom */
272static struct zone *dlif_zone;		/* zone for dlil_ifnet */
273
274#define	DLIF_ZONE_MAX		64		/* maximum elements in zone */
275#define	DLIF_ZONE_NAME		"ifnet"		/* zone name */
276
277static unsigned int dlif_filt_size;	/* size of ifnet_filter */
278static struct zone *dlif_filt_zone;	/* zone for ifnet_filter */
279
280#define	DLIF_FILT_ZONE_MAX	8		/* maximum elements in zone */
281#define	DLIF_FILT_ZONE_NAME	"ifnet_filter"	/* zone name */
282
283static unsigned int dlif_phash_size;	/* size of ifnet proto hash table */
284static struct zone *dlif_phash_zone;	/* zone for ifnet proto hash table */
285
286#define	DLIF_PHASH_ZONE_MAX	DLIF_ZONE_MAX	/* maximum elements in zone */
287#define	DLIF_PHASH_ZONE_NAME	"ifnet_proto_hash" /* zone name */
288
289static unsigned int dlif_proto_size;	/* size of if_proto */
290static struct zone *dlif_proto_zone;	/* zone for if_proto */
291
292#define	DLIF_PROTO_ZONE_MAX	(DLIF_ZONE_MAX*2) /* maximum elements in zone */
293#define	DLIF_PROTO_ZONE_NAME	"ifnet_proto"	/* zone name */
294
295static unsigned int dlif_tcpstat_size;		/* size of tcpstat_local to allocate */
296static unsigned int dlif_tcpstat_bufsize;	/* size of dlif_tcpstat_size + headroom */
297static struct zone *dlif_tcpstat_zone;		/* zone for tcpstat_local */
298
299#define	DLIF_TCPSTAT_ZONE_MAX	1		/* maximum elements in zone */
300#define	DLIF_TCPSTAT_ZONE_NAME	"ifnet_tcpstat"	/* zone name */
301
302static unsigned int dlif_udpstat_size;		/* size of udpstat_local to allocate */
303static unsigned int dlif_udpstat_bufsize;	/* size of dlif_udpstat_size + headroom */
304static struct zone *dlif_udpstat_zone;		/* zone for udpstat_local */
305
306#define	DLIF_UDPSTAT_ZONE_MAX	1		/* maximum elements in zone */
307#define	DLIF_UDPSTAT_ZONE_NAME	"ifnet_udpstat"	/* zone name */
308
309/*
310 * Updating this variable should be done by first acquiring the global
311 * radix node head (rnh_lock), in tandem with settting/clearing the
312 * PR_AGGDRAIN for routedomain.
313 */
314u_int32_t ifnet_aggressive_drainers;
315static u_int32_t net_rtref;
316
317static struct dlil_main_threading_info dlil_main_input_thread_info;
318__private_extern__ struct dlil_threading_info *dlil_main_input_thread =
319    (struct dlil_threading_info *)&dlil_main_input_thread_info;
320
321static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg);
322static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
323static void dlil_if_trace(struct dlil_ifnet *, int);
324static void if_proto_ref(struct if_proto *);
325static void if_proto_free(struct if_proto *);
326static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
327static int dlil_ifp_proto_count(struct ifnet *);
328static void if_flt_monitor_busy(struct ifnet *);
329static void if_flt_monitor_unbusy(struct ifnet *);
330static void if_flt_monitor_enter(struct ifnet *);
331static void if_flt_monitor_leave(struct ifnet *);
332static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
333    char **, protocol_family_t);
334static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
335    protocol_family_t);
336static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
337    const struct sockaddr_dl *);
338static int ifnet_lookup(struct ifnet *);
339static void if_purgeaddrs(struct ifnet *);
340
341static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
342    struct mbuf *, char *);
343static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
344    struct mbuf *);
345static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
346    mbuf_t *, const struct sockaddr *, void *, char *, char *);
347static void ifproto_media_event(struct ifnet *, protocol_family_t,
348    const struct kev_msg *);
349static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
350    unsigned long, void *);
351static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
352    struct sockaddr_dl *, size_t);
353static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
354    const struct sockaddr_dl *, const struct sockaddr *,
355    const struct sockaddr_dl *, const struct sockaddr *);
356
357static errno_t ifp_if_output(struct ifnet *, struct mbuf *);
358static void ifp_if_start(struct ifnet *);
359static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
360    struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
361static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
362static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
363    protocol_family_t *);
364static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
365    const struct ifnet_demux_desc *, u_int32_t);
366static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
367static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
368static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
369    const struct sockaddr *, const char *, const char *);
370static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
371    const struct sockaddr *, const char *, const char *,
372    u_int32_t *, u_int32_t *);
373static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
374static void ifp_if_free(struct ifnet *);
375static void ifp_if_event(struct ifnet *, const struct kev_msg *);
376static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
377static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
378
379static void dlil_main_input_thread_func(void *, wait_result_t);
380static void dlil_input_thread_func(void *, wait_result_t);
381static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
382static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *);
383static void dlil_terminate_input_thread(struct dlil_threading_info *);
384static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
385    struct dlil_threading_info *, boolean_t);
386static void dlil_input_stats_sync(struct ifnet *, struct dlil_threading_info *);
387static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
388    u_int32_t, ifnet_model_t, boolean_t);
389static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
390    const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
391
392#if DEBUG
393static void dlil_verify_sum16(void);
394#endif /* DEBUG */
395static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
396    protocol_family_t);
397static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
398    protocol_family_t);
399
400static void ifnet_detacher_thread_func(void *, wait_result_t);
401static int ifnet_detacher_thread_cont(int);
402static void ifnet_detach_final(struct ifnet *);
403static void ifnet_detaching_enqueue(struct ifnet *);
404static struct ifnet *ifnet_detaching_dequeue(void);
405
406static void ifnet_start_thread_fn(void *, wait_result_t);
407static void ifnet_poll_thread_fn(void *, wait_result_t);
408static void ifnet_poll(struct ifnet *);
409
410static void ifp_src_route_copyout(struct ifnet *, struct route *);
411static void ifp_src_route_copyin(struct ifnet *, struct route *);
412#if INET6
413static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
414static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
415#endif /* INET6 */
416
417static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
418static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
419static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
420static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
421static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
422static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
423static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
424static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
425static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
426static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
427static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
428
429/* The following are protected by dlil_ifnet_lock */
430static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
431static u_int32_t ifnet_detaching_cnt;
432static void *ifnet_delayed_run;	/* wait channel for detaching thread */
433
434decl_lck_mtx_data(static, ifnet_fc_lock);
435
436static uint32_t ifnet_flowhash_seed;
437
438struct ifnet_flowhash_key {
439	char		ifk_name[IFNAMSIZ];
440	uint32_t	ifk_unit;
441	uint32_t	ifk_flags;
442	uint32_t	ifk_eflags;
443	uint32_t	ifk_capabilities;
444	uint32_t	ifk_capenable;
445	uint32_t	ifk_output_sched_model;
446	uint32_t	ifk_rand1;
447	uint32_t	ifk_rand2;
448};
449
450/* Flow control entry per interface */
451struct ifnet_fc_entry {
452	RB_ENTRY(ifnet_fc_entry) ifce_entry;
453	u_int32_t	ifce_flowhash;
454	struct ifnet	*ifce_ifp;
455};
456
457static uint32_t ifnet_calc_flowhash(struct ifnet *);
458static int ifce_cmp(const struct ifnet_fc_entry *,
459    const struct ifnet_fc_entry *);
460static int ifnet_fc_add(struct ifnet *);
461static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
462static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
463
464/* protected by ifnet_fc_lock */
465RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
466RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
467RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
468
469static unsigned int ifnet_fc_zone_size;		/* sizeof ifnet_fc_entry */
470static struct zone *ifnet_fc_zone;		/* ifnet_fc_entry zone */
471
472#define	IFNET_FC_ZONE_NAME	"ifnet_fc_zone"
473#define	IFNET_FC_ZONE_MAX	 32
474
475extern void bpfdetach(struct ifnet*);
476extern void proto_input_run(void);
477
478extern uint32_t udp_count_opportunistic(unsigned int ifindex,
479	u_int32_t flags);
480extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
481	u_int32_t flags);
482
483__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
484
485#if CONFIG_MACF
486int dlil_lladdr_ckreq = 0;
487#endif
488
489#if DEBUG
490int dlil_verbose = 1;
491#else
492int dlil_verbose = 0;
493#endif /* DEBUG */
494#if IFNET_INPUT_SANITY_CHK
495/* sanity checking of input packet lists received */
496static u_int32_t dlil_input_sanity_check = 0;
497#endif /* IFNET_INPUT_SANITY_CHK */
498/* rate limit debug messages */
499struct timespec dlil_dbgrate = { 1, 0 };
500
501SYSCTL_DECL(_net_link_generic_system);
502
503#if CONFIG_MACF
504SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_lladdr_ckreq,
505	CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_lladdr_ckreq, 0,
506	"Require MACF system info check to expose link-layer address");
507#endif
508
509SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
510    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
511
512#define	IF_SNDQ_MINLEN	32
513u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
514SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
515    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
516    sysctl_sndq_maxlen, "I", "Default transmit queue max length");
517
518#define	IF_RCVQ_MINLEN	32
519#define IF_RCVQ_MAXLEN	256
520u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
521SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
522    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
523    sysctl_rcvq_maxlen, "I", "Default receive queue max length");
524
525#define	IF_RXPOLL_DECAY		2	/* ilog2 of EWMA decay rate (4) */
526static u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
527SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
528    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
529    "ilog2 of EWMA decay rate of avg inbound packets");
530
531#define	IF_RXPOLL_MODE_HOLDTIME_MIN	(10ULL * 1000 * 1000)   /* 10 ms */
532#define	IF_RXPOLL_MODE_HOLDTIME		(1000ULL * 1000 * 1000)	/* 1 sec */
533static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
534SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
535    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
536    IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
537    "Q", "input poll mode freeze time");
538
539#define	IF_RXPOLL_SAMPLETIME_MIN	(1ULL * 1000 * 1000)	/* 1 ms */
540#define	IF_RXPOLL_SAMPLETIME		(10ULL * 1000 * 1000)	/* 10 ms */
541static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
542SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
543    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
544    IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
545    "Q", "input poll sampling time");
546
547#define	IF_RXPOLL_INTERVALTIME_MIN	(1ULL * 1000)		/* 1 us */
548#define	IF_RXPOLL_INTERVALTIME		(1ULL * 1000 * 1000)	/* 1 ms */
549static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
550SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
551    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
552    IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
553    "Q", "input poll interval (time)");
554
555#define	IF_RXPOLL_INTERVAL_PKTS	0	/* 0 (disabled) */
556static u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
557SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
558    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
559    IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
560
561#define	IF_RXPOLL_WLOWAT	10
562static u_int32_t if_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
563SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
564    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_wlowat,
565    IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
566    "I", "input poll wakeup low watermark");
567
568#define	IF_RXPOLL_WHIWAT	100
569static u_int32_t if_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
570SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
571    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_whiwat,
572    IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
573    "I", "input poll wakeup high watermark");
574
575static u_int32_t if_rxpoll_max = 0;			/* 0 (automatic) */
576SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
577    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
578    "max packets per poll call");
579
580static u_int32_t if_rxpoll = 1;
581SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
582    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
583    sysctl_rxpoll, "I", "enable opportunistic input polling");
584
585u_int32_t if_bw_smoothing_val = 3;
586SYSCTL_UINT(_net_link_generic_system, OID_AUTO, if_bw_smoothing_val,
587    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_smoothing_val, 0, "");
588
589u_int32_t if_bw_measure_size = 10;
590SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_bw_measure_size,
591    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_measure_size, 0, "");
592
593static u_int32_t cur_dlil_input_threads = 0;
594SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
595    CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads , 0,
596    "Current number of DLIL input threads");
597
598#if IFNET_INPUT_SANITY_CHK
599SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
600    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check , 0,
601    "Turn on sanity checking in DLIL input");
602#endif /* IFNET_INPUT_SANITY_CHK */
603
604static u_int32_t if_flowadv = 1;
605SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
606    CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
607    "enable flow-advisory mechanism");
608
609static uint64_t hwcksum_in_invalidated = 0;
610SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
611    hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
612    &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
613
614uint32_t hwcksum_dbg = 0;
615SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
616    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
617    "enable hardware cksum debugging");
618
619#define	HWCKSUM_DBG_PARTIAL_FORCED	0x1	/* forced partial checksum */
620#define	HWCKSUM_DBG_PARTIAL_RXOFF_ADJ	0x2	/* adjust start offset */
621#define	HWCKSUM_DBG_FINALIZE_FORCED	0x10	/* forced finalize */
622#define	HWCKSUM_DBG_MASK \
623	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |	\
624	HWCKSUM_DBG_FINALIZE_FORCED)
625
626static uint32_t hwcksum_dbg_mode = 0;
627SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
628    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
629    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
630
631static uint64_t hwcksum_dbg_partial_forced = 0;
632SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
633    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
634    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");
635
636static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
637SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
638    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
639    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");
640
641static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
642SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
643    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
644    &hwcksum_dbg_partial_rxoff_forced, 0,
645    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
646    "forced partial cksum rx offset");
647
648static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
649SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
650    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
651    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
652    "adjusted partial cksum rx offset");
653
654static uint64_t hwcksum_dbg_verified = 0;
655SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
656    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
657    &hwcksum_dbg_verified, "packets verified for having good checksum");
658
659static uint64_t hwcksum_dbg_bad_cksum = 0;
660SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
661    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
662    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");
663
664static uint64_t hwcksum_dbg_bad_rxoff = 0;
665SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
666    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
667    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");
668
669static uint64_t hwcksum_dbg_adjusted = 0;
670SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
671    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
672    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");
673
674static uint64_t hwcksum_dbg_finalized_hdr = 0;
675SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
676    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
677    &hwcksum_dbg_finalized_hdr, "finalized headers");
678
679static uint64_t hwcksum_dbg_finalized_data = 0;
680SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
681    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
682    &hwcksum_dbg_finalized_data, "finalized payloads");
683
684uint32_t hwcksum_tx = 1;
685SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
686    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
687    "enable transmit hardware checksum offload");
688
689uint32_t hwcksum_rx = 1;
690SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
691    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
692    "enable receive hardware checksum offload");
693
694unsigned int net_rxpoll = 1;
695unsigned int net_affinity = 1;
696static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
697
698extern u_int32_t	inject_buckets;
699
700static	lck_grp_attr_t	*dlil_grp_attributes = NULL;
701static	lck_attr_t	*dlil_lck_attributes = NULL;
702
703
704#define	DLIL_INPUT_CHECK(m, ifp) {					\
705	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);			\
706	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||	\
707	    !(mbuf_flags(m) & MBUF_PKTHDR)) {				\
708		panic_plain("%s: invalid mbuf %p\n", __func__, m);	\
709		/* NOTREACHED */					\
710	}								\
711}
712
713#define	DLIL_EWMA(old, new, decay) do {					\
714	u_int32_t _avg;							\
715	if ((_avg = (old)) > 0)						\
716		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay);	\
717	else								\
718		_avg = (new);						\
719	(old) = _avg;							\
720} while (0)
721
722#define	MBPS	(1ULL * 1000 * 1000)
723#define	GBPS	(MBPS * 1000)
724
725struct rxpoll_time_tbl {
726	u_int64_t	speed;		/* downlink speed */
727	u_int32_t	plowat;		/* packets low watermark */
728	u_int32_t	phiwat;		/* packets high watermark */
729	u_int32_t	blowat;		/* bytes low watermark */
730	u_int32_t	bhiwat;		/* bytes high watermark */
731};
732
733static struct rxpoll_time_tbl rxpoll_tbl[] = {
734	{  10 * MBPS,	2,	8,	(1 * 1024),	(6 * 1024)	},
735	{ 100 * MBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
736	{   1 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
737	{  10 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
738	{ 100 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
739	{ 0, 0, 0, 0, 0 }
740};
741
742int
743proto_hash_value(u_int32_t protocol_family)
744{
745	/*
746	 * dlil_proto_unplumb_all() depends on the mapping between
747	 * the hash bucket index and the protocol family defined
748	 * here; future changes must be applied there as well.
749	 */
750	switch(protocol_family) {
751		case PF_INET:
752			return (0);
753		case PF_INET6:
754			return (1);
755		case PF_VLAN:
756			return (2);
757		case PF_UNSPEC:
758		default:
759			return (3);
760	}
761}
762
763/*
764 * Caller must already be holding ifnet lock.
765 */
766static struct if_proto *
767find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
768{
769	struct if_proto *proto = NULL;
770	u_int32_t i = proto_hash_value(protocol_family);
771
772	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
773
774	if (ifp->if_proto_hash != NULL)
775		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
776
777	while (proto != NULL && proto->protocol_family != protocol_family)
778		proto = SLIST_NEXT(proto, next_hash);
779
780	if (proto != NULL)
781		if_proto_ref(proto);
782
783	return (proto);
784}
785
786static void
787if_proto_ref(struct if_proto *proto)
788{
789	atomic_add_32(&proto->refcount, 1);
790}
791
792extern void if_rtproto_del(struct ifnet *ifp, int protocol);
793
794static void
795if_proto_free(struct if_proto *proto)
796{
797	u_int32_t oldval;
798	struct ifnet *ifp = proto->ifp;
799	u_int32_t proto_family = proto->protocol_family;
800	struct kev_dl_proto_data ev_pr_data;
801
802	oldval = atomic_add_32_ov(&proto->refcount, -1);
803	if (oldval > 1)
804		return;
805
806	/* No more reference on this, protocol must have been detached */
807	VERIFY(proto->detached);
808
809	if (proto->proto_kpi == kProtoKPI_v1) {
810		if (proto->kpi.v1.detached)
811			proto->kpi.v1.detached(ifp, proto->protocol_family);
812	}
813	if (proto->proto_kpi == kProtoKPI_v2) {
814		if (proto->kpi.v2.detached)
815			proto->kpi.v2.detached(ifp, proto->protocol_family);
816	}
817
818	/*
819	 * Cleanup routes that may still be in the routing table for that
820	 * interface/protocol pair.
821	 */
822	if_rtproto_del(ifp, proto_family);
823
824	/*
825	 * The reserved field carries the number of protocol still attached
826	 * (subject to change)
827	 */
828	ifnet_lock_shared(ifp);
829	ev_pr_data.proto_family = proto_family;
830	ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp);
831	ifnet_lock_done(ifp);
832
833	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
834	    (struct net_event_data *)&ev_pr_data,
835	    sizeof(struct kev_dl_proto_data));
836
837	zfree(dlif_proto_zone, proto);
838}
839
840__private_extern__ void
841ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
842{
843	unsigned int type = 0;
844	int ass = 1;
845
846	switch (what) {
847	case IFNET_LCK_ASSERT_EXCLUSIVE:
848		type = LCK_RW_ASSERT_EXCLUSIVE;
849		break;
850
851	case IFNET_LCK_ASSERT_SHARED:
852		type = LCK_RW_ASSERT_SHARED;
853		break;
854
855	case IFNET_LCK_ASSERT_OWNED:
856		type = LCK_RW_ASSERT_HELD;
857		break;
858
859	case IFNET_LCK_ASSERT_NOTOWNED:
860		/* nothing to do here for RW lock; bypass assert */
861		ass = 0;
862		break;
863
864	default:
865		panic("bad ifnet assert type: %d", what);
866		/* NOTREACHED */
867	}
868	if (ass)
869		lck_rw_assert(&ifp->if_lock, type);
870}
871
872__private_extern__ void
873ifnet_lock_shared(struct ifnet *ifp)
874{
875	lck_rw_lock_shared(&ifp->if_lock);
876}
877
878__private_extern__ void
879ifnet_lock_exclusive(struct ifnet *ifp)
880{
881	lck_rw_lock_exclusive(&ifp->if_lock);
882}
883
884__private_extern__ void
885ifnet_lock_done(struct ifnet *ifp)
886{
887	lck_rw_done(&ifp->if_lock);
888}
889
890#if INET6
891__private_extern__ void
892if_inet6data_lock_shared(struct ifnet *ifp)
893{
894	lck_rw_lock_shared(&ifp->if_inet6data_lock);
895}
896
897__private_extern__ void
898if_inet6data_lock_exclusive(struct ifnet *ifp)
899{
900	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
901}
902
903__private_extern__ void
904if_inet6data_lock_done(struct ifnet *ifp)
905{
906	lck_rw_done(&ifp->if_inet6data_lock);
907}
908#endif
909
910__private_extern__ void
911ifnet_head_lock_shared(void)
912{
913	lck_rw_lock_shared(&ifnet_head_lock);
914}
915
916__private_extern__ void
917ifnet_head_lock_exclusive(void)
918{
919	lck_rw_lock_exclusive(&ifnet_head_lock);
920}
921
922__private_extern__ void
923ifnet_head_done(void)
924{
925	lck_rw_done(&ifnet_head_lock);
926}
927
928/*
929 * Caller must already be holding ifnet lock.
930 */
931static int
932dlil_ifp_proto_count(struct ifnet * ifp)
933{
934	int i, count = 0;
935
936	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
937
938	if (ifp->if_proto_hash == NULL)
939		goto done;
940
941	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
942		struct if_proto *proto;
943		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
944			count++;
945		}
946	}
947done:
948	return (count);
949}
950
951__private_extern__ void
952dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
953    u_int32_t event_code, struct net_event_data *event_data,
954    u_int32_t event_data_len)
955{
956	struct net_event_data ev_data;
957	struct kev_msg ev_msg;
958
959	bzero(&ev_msg, sizeof (ev_msg));
960	bzero(&ev_data, sizeof (ev_data));
961	/*
962	 * a net event always starts with a net_event_data structure
963	 * but the caller can generate a simple net event or
964	 * provide a longer event structure to post
965	 */
966	ev_msg.vendor_code	= KEV_VENDOR_APPLE;
967	ev_msg.kev_class	= KEV_NETWORK_CLASS;
968	ev_msg.kev_subclass	= event_subclass;
969	ev_msg.event_code	= event_code;
970
971	if (event_data == NULL) {
972		event_data = &ev_data;
973		event_data_len = sizeof(struct net_event_data);
974	}
975
976	strncpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
977	event_data->if_family = ifp->if_family;
978	event_data->if_unit   = (u_int32_t) ifp->if_unit;
979
980	ev_msg.dv[0].data_length = event_data_len;
981	ev_msg.dv[0].data_ptr    = event_data;
982	ev_msg.dv[1].data_length = 0;
983
984	dlil_event_internal(ifp, &ev_msg);
985}
986
987__private_extern__ int
988dlil_alloc_local_stats(struct ifnet *ifp)
989{
990	int ret = EINVAL;
991	void *buf, *base, **pbuf;
992
993	if (ifp == NULL)
994		goto end;
995
996	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
997		/* allocate tcpstat_local structure */
998		buf = zalloc(dlif_tcpstat_zone);
999		if (buf == NULL) {
1000			ret = ENOMEM;
1001			goto end;
1002		}
1003		bzero(buf, dlif_tcpstat_bufsize);
1004
1005		/* Get the 64-bit aligned base address for this object */
1006		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
1007		    sizeof (u_int64_t));
1008		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
1009		    ((intptr_t)buf + dlif_tcpstat_bufsize));
1010
1011		/*
1012		 * Wind back a pointer size from the aligned base and
1013		 * save the original address so we can free it later.
1014		 */
1015		pbuf = (void **)((intptr_t)base - sizeof (void *));
1016		*pbuf = buf;
1017		ifp->if_tcp_stat = base;
1018
1019		/* allocate udpstat_local structure */
1020		buf = zalloc(dlif_udpstat_zone);
1021		if (buf == NULL) {
1022			ret = ENOMEM;
1023			goto end;
1024		}
1025		bzero(buf, dlif_udpstat_bufsize);
1026
1027		/* Get the 64-bit aligned base address for this object */
1028		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
1029		    sizeof (u_int64_t));
1030		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
1031		    ((intptr_t)buf + dlif_udpstat_bufsize));
1032
1033		/*
1034		 * Wind back a pointer size from the aligned base and
1035		 * save the original address so we can free it later.
1036		 */
1037		pbuf = (void **)((intptr_t)base - sizeof (void *));
1038		*pbuf = buf;
1039		ifp->if_udp_stat = base;
1040
1041		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof (u_int64_t)) &&
1042		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof (u_int64_t)));
1043
1044		ret = 0;
1045	}
1046
1047end:
1048	if (ret != 0) {
1049		if (ifp->if_tcp_stat != NULL) {
1050			pbuf = (void **)
1051			    ((intptr_t)ifp->if_tcp_stat - sizeof (void *));
1052			zfree(dlif_tcpstat_zone, *pbuf);
1053			ifp->if_tcp_stat = NULL;
1054		}
1055		if (ifp->if_udp_stat != NULL) {
1056			pbuf = (void **)
1057			    ((intptr_t)ifp->if_udp_stat - sizeof (void *));
1058			zfree(dlif_udpstat_zone, *pbuf);
1059			ifp->if_udp_stat = NULL;
1060		}
1061	}
1062
1063	return (ret);
1064}
1065
1066static int
1067dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp)
1068{
1069	thread_continue_t func;
1070	u_int32_t limit;
1071	int error;
1072
1073	/* NULL ifp indicates the main input thread, called at dlil_init time */
1074	if (ifp == NULL) {
1075		func = dlil_main_input_thread_func;
1076		VERIFY(inp == dlil_main_input_thread);
1077		(void) strlcat(inp->input_name,
1078		    "main_input", DLIL_THREADNAME_LEN);
1079	} else if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
1080		func = dlil_rxpoll_input_thread_func;
1081		VERIFY(inp != dlil_main_input_thread);
1082		(void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
1083		    "%s_input_poll", if_name(ifp));
1084	} else {
1085		func = dlil_input_thread_func;
1086		VERIFY(inp != dlil_main_input_thread);
1087		(void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
1088		    "%s_input", if_name(ifp));
1089	}
1090	VERIFY(inp->input_thr == THREAD_NULL);
1091
1092	inp->lck_grp = lck_grp_alloc_init(inp->input_name, dlil_grp_attributes);
1093	lck_mtx_init(&inp->input_lck, inp->lck_grp, dlil_lck_attributes);
1094
1095	inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
1096	inp->ifp = ifp;		/* NULL for main input thread */
1097
1098	net_timerclear(&inp->mode_holdtime);
1099	net_timerclear(&inp->mode_lasttime);
1100	net_timerclear(&inp->sample_holdtime);
1101	net_timerclear(&inp->sample_lasttime);
1102	net_timerclear(&inp->dbg_lasttime);
1103
1104	/*
1105	 * For interfaces that support opportunistic polling, set the
1106	 * low and high watermarks for outstanding inbound packets/bytes.
1107	 * Also define freeze times for transitioning between modes
1108	 * and updating the average.
1109	 */
1110	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
1111		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
1112		(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
1113	} else {
1114		limit = (u_int32_t)-1;
1115	}
1116
1117	_qinit(&inp->rcvq_pkts, Q_DROPTAIL, limit);
1118	if (inp == dlil_main_input_thread) {
1119		struct dlil_main_threading_info *inpm =
1120		    (struct dlil_main_threading_info *)inp;
1121		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit);
1122	}
1123
1124	error = kernel_thread_start(func, inp, &inp->input_thr);
1125	if (error == KERN_SUCCESS) {
1126		ml_thread_policy(inp->input_thr, MACHINE_GROUP,
1127		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR));
1128		/*
1129		 * We create an affinity set so that the matching workloop
1130		 * thread or the starter thread (for loopback) can be
1131		 * scheduled on the same processor set as the input thread.
1132		 */
1133		if (net_affinity) {
1134			struct thread *tp = inp->input_thr;
1135			u_int32_t tag;
1136			/*
1137			 * Randomize to reduce the probability
1138			 * of affinity tag namespace collision.
1139			 */
1140			read_random(&tag, sizeof (tag));
1141			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
1142				thread_reference(tp);
1143				inp->tag = tag;
1144				inp->net_affinity = TRUE;
1145			}
1146		}
1147	} else if (inp == dlil_main_input_thread) {
1148		panic_plain("%s: couldn't create main input thread", __func__);
1149		/* NOTREACHED */
1150	} else {
1151		panic_plain("%s: couldn't create %s input thread", __func__,
1152		    if_name(ifp));
1153		/* NOTREACHED */
1154	}
1155	OSAddAtomic(1, &cur_dlil_input_threads);
1156
1157	return (error);
1158}
1159
1160static void
1161dlil_terminate_input_thread(struct dlil_threading_info *inp)
1162{
1163	struct ifnet *ifp;
1164
1165	VERIFY(current_thread() == inp->input_thr);
1166	VERIFY(inp != dlil_main_input_thread);
1167
1168	OSAddAtomic(-1, &cur_dlil_input_threads);
1169
1170	lck_mtx_destroy(&inp->input_lck, inp->lck_grp);
1171	lck_grp_free(inp->lck_grp);
1172
1173	inp->input_waiting = 0;
1174	inp->wtot = 0;
1175	bzero(inp->input_name, sizeof (inp->input_name));
1176	ifp = inp->ifp;
1177	inp->ifp = NULL;
1178	VERIFY(qhead(&inp->rcvq_pkts) == NULL && qempty(&inp->rcvq_pkts));
1179	qlimit(&inp->rcvq_pkts) = 0;
1180	bzero(&inp->stats, sizeof (inp->stats));
1181
1182	VERIFY(!inp->net_affinity);
1183	inp->input_thr = THREAD_NULL;
1184	VERIFY(inp->wloop_thr == THREAD_NULL);
1185	VERIFY(inp->poll_thr == THREAD_NULL);
1186	VERIFY(inp->tag == 0);
1187
1188	inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
1189	bzero(&inp->tstats, sizeof (inp->tstats));
1190	bzero(&inp->pstats, sizeof (inp->pstats));
1191	bzero(&inp->sstats, sizeof (inp->sstats));
1192
1193	net_timerclear(&inp->mode_holdtime);
1194	net_timerclear(&inp->mode_lasttime);
1195	net_timerclear(&inp->sample_holdtime);
1196	net_timerclear(&inp->sample_lasttime);
1197	net_timerclear(&inp->dbg_lasttime);
1198
1199#if IFNET_INPUT_SANITY_CHK
1200	inp->input_mbuf_cnt = 0;
1201#endif /* IFNET_INPUT_SANITY_CHK */
1202
1203	if (dlil_verbose) {
1204		printf("%s: input thread terminated\n",
1205		    if_name(ifp));
1206	}
1207
1208	/* for the extra refcnt from kernel_thread_start() */
1209	thread_deallocate(current_thread());
1210
1211	/* this is the end */
1212	thread_terminate(current_thread());
1213	/* NOTREACHED */
1214}
1215
1216static kern_return_t
1217dlil_affinity_set(struct thread *tp, u_int32_t tag)
1218{
1219	thread_affinity_policy_data_t policy;
1220
1221	bzero(&policy, sizeof (policy));
1222	policy.affinity_tag = tag;
1223	return (thread_policy_set(tp, THREAD_AFFINITY_POLICY,
1224	    (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT));
1225}
1226
1227void
1228dlil_init(void)
1229{
1230	thread_t thread = THREAD_NULL;
1231
1232	/*
1233	 * The following fields must be 64-bit aligned for atomic operations.
1234	 */
1235	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
1236	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors)
1237	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
1238	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
1239	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
1240	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
1241	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
1242	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
1243	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
1244	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
1245	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
1246	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
1247	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
1248	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
1249	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
1250
1251	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
1252	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors)
1253	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
1254	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
1255	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
1256	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
1257	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
1258	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
1259	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
1260	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
1261	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
1262	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
1263	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
1264	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
1265	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
1266
1267	/*
1268	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
1269	 */
1270	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
1271	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
1272	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
1273	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
1274	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
1275	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
1276	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
1277	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
1278	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
1279	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
1280	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
1281	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
1282	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
1283
1284	/*
1285	 * ... as well as the mbuf checksum flags counterparts.
1286	 */
1287	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
1288	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
1289	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
1290	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
1291	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
1292	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
1293	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
1294	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
1295	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
1296	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
1297
1298	/*
1299	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
1300	 */
1301	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
1302	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
1303
1304	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
1305	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
1306	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
1307	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
1308
1309	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
1310	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
1311	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
1312
1313	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
1314	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
1315	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
1316	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
1317	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
1318	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
1319	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
1320	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
1321	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
1322	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
1323	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
1324	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
1325	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
1326	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
1327	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
1328	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
1329
1330	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
1331	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
1332	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
1333	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
1334	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
1335
1336	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
1337	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
1338
1339	PE_parse_boot_argn("net_affinity", &net_affinity,
1340	    sizeof (net_affinity));
1341
1342	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof (net_rxpoll));
1343
1344	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof (net_rtref));
1345
1346	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof (ifnet_debug));
1347
1348	dlif_size = (ifnet_debug == 0) ? sizeof (struct dlil_ifnet) :
1349	    sizeof (struct dlil_ifnet_dbg);
1350	/* Enforce 64-bit alignment for dlil_ifnet structure */
1351	dlif_bufsize = dlif_size + sizeof (void *) + sizeof (u_int64_t);
1352	dlif_bufsize = P2ROUNDUP(dlif_bufsize, sizeof (u_int64_t));
1353	dlif_zone = zinit(dlif_bufsize, DLIF_ZONE_MAX * dlif_bufsize,
1354	    0, DLIF_ZONE_NAME);
1355	if (dlif_zone == NULL) {
1356		panic_plain("%s: failed allocating %s", __func__,
1357		    DLIF_ZONE_NAME);
1358		/* NOTREACHED */
1359	}
1360	zone_change(dlif_zone, Z_EXPAND, TRUE);
1361	zone_change(dlif_zone, Z_CALLERACCT, FALSE);
1362
1363	dlif_filt_size = sizeof (struct ifnet_filter);
1364	dlif_filt_zone = zinit(dlif_filt_size,
1365	    DLIF_FILT_ZONE_MAX * dlif_filt_size, 0, DLIF_FILT_ZONE_NAME);
1366	if (dlif_filt_zone == NULL) {
1367		panic_plain("%s: failed allocating %s", __func__,
1368		    DLIF_FILT_ZONE_NAME);
1369		/* NOTREACHED */
1370	}
1371	zone_change(dlif_filt_zone, Z_EXPAND, TRUE);
1372	zone_change(dlif_filt_zone, Z_CALLERACCT, FALSE);
1373
1374	dlif_phash_size = sizeof (struct proto_hash_entry) * PROTO_HASH_SLOTS;
1375	dlif_phash_zone = zinit(dlif_phash_size,
1376	    DLIF_PHASH_ZONE_MAX * dlif_phash_size, 0, DLIF_PHASH_ZONE_NAME);
1377	if (dlif_phash_zone == NULL) {
1378		panic_plain("%s: failed allocating %s", __func__,
1379		    DLIF_PHASH_ZONE_NAME);
1380		/* NOTREACHED */
1381	}
1382	zone_change(dlif_phash_zone, Z_EXPAND, TRUE);
1383	zone_change(dlif_phash_zone, Z_CALLERACCT, FALSE);
1384
1385	dlif_proto_size = sizeof (struct if_proto);
1386	dlif_proto_zone = zinit(dlif_proto_size,
1387	    DLIF_PROTO_ZONE_MAX * dlif_proto_size, 0, DLIF_PROTO_ZONE_NAME);
1388	if (dlif_proto_zone == NULL) {
1389		panic_plain("%s: failed allocating %s", __func__,
1390		    DLIF_PROTO_ZONE_NAME);
1391		/* NOTREACHED */
1392	}
1393	zone_change(dlif_proto_zone, Z_EXPAND, TRUE);
1394	zone_change(dlif_proto_zone, Z_CALLERACCT, FALSE);
1395
1396	dlif_tcpstat_size = sizeof (struct tcpstat_local);
1397	/* Enforce 64-bit alignment for tcpstat_local structure */
1398	dlif_tcpstat_bufsize =
1399	    dlif_tcpstat_size + sizeof (void *) + sizeof (u_int64_t);
1400	dlif_tcpstat_bufsize =
1401	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof (u_int64_t));
1402	dlif_tcpstat_zone = zinit(dlif_tcpstat_bufsize,
1403	    DLIF_TCPSTAT_ZONE_MAX * dlif_tcpstat_bufsize, 0,
1404	    DLIF_TCPSTAT_ZONE_NAME);
1405	if (dlif_tcpstat_zone == NULL) {
1406		panic_plain("%s: failed allocating %s", __func__,
1407		    DLIF_TCPSTAT_ZONE_NAME);
1408		/* NOTREACHED */
1409	}
1410	zone_change(dlif_tcpstat_zone, Z_EXPAND, TRUE);
1411	zone_change(dlif_tcpstat_zone, Z_CALLERACCT, FALSE);
1412
1413	dlif_udpstat_size = sizeof (struct udpstat_local);
1414	/* Enforce 64-bit alignment for udpstat_local structure */
1415	dlif_udpstat_bufsize =
1416	    dlif_udpstat_size + sizeof (void *) + sizeof (u_int64_t);
1417	dlif_udpstat_bufsize =
1418	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof (u_int64_t));
1419	dlif_udpstat_zone = zinit(dlif_udpstat_bufsize,
1420	    DLIF_TCPSTAT_ZONE_MAX * dlif_udpstat_bufsize, 0,
1421	    DLIF_UDPSTAT_ZONE_NAME);
1422	if (dlif_udpstat_zone == NULL) {
1423		panic_plain("%s: failed allocating %s", __func__,
1424		    DLIF_UDPSTAT_ZONE_NAME);
1425		/* NOTREACHED */
1426	}
1427	zone_change(dlif_udpstat_zone, Z_EXPAND, TRUE);
1428	zone_change(dlif_udpstat_zone, Z_CALLERACCT, FALSE);
1429
1430	ifnet_llreach_init();
1431
1432	TAILQ_INIT(&dlil_ifnet_head);
1433	TAILQ_INIT(&ifnet_head);
1434	TAILQ_INIT(&ifnet_detaching_head);
1435
1436	/* Setup the lock groups we will use */
1437	dlil_grp_attributes = lck_grp_attr_alloc_init();
1438
1439	dlil_lock_group = lck_grp_alloc_init("DLIL internal locks",
1440	    dlil_grp_attributes);
1441	ifnet_lock_group = lck_grp_alloc_init("ifnet locks",
1442	    dlil_grp_attributes);
1443	ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock",
1444	    dlil_grp_attributes);
1445	ifnet_rcv_lock_group = lck_grp_alloc_init("ifnet rcv locks",
1446	    dlil_grp_attributes);
1447	ifnet_snd_lock_group = lck_grp_alloc_init("ifnet snd locks",
1448	    dlil_grp_attributes);
1449
1450	/* Setup the lock attributes we will use */
1451	dlil_lck_attributes = lck_attr_alloc_init();
1452
1453	ifnet_lock_attr = lck_attr_alloc_init();
1454
1455	lck_rw_init(&ifnet_head_lock, ifnet_head_lock_group,
1456	    dlil_lck_attributes);
1457	lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes);
1458
1459	/* Setup interface flow control related items */
1460	lck_mtx_init(&ifnet_fc_lock, dlil_lock_group, dlil_lck_attributes);
1461
1462	ifnet_fc_zone_size = sizeof (struct ifnet_fc_entry);
1463	ifnet_fc_zone = zinit(ifnet_fc_zone_size,
1464	    IFNET_FC_ZONE_MAX * ifnet_fc_zone_size, 0, IFNET_FC_ZONE_NAME);
1465	if (ifnet_fc_zone == NULL) {
1466		panic_plain("%s: failed allocating %s", __func__,
1467		    IFNET_FC_ZONE_NAME);
1468		/* NOTREACHED */
1469	}
1470	zone_change(ifnet_fc_zone, Z_EXPAND, TRUE);
1471	zone_change(ifnet_fc_zone, Z_CALLERACCT, FALSE);
1472
1473	/* Initialize interface address subsystem */
1474	ifa_init();
1475
1476#if PF
1477	/* Initialize the packet filter */
1478	pfinit();
1479#endif /* PF */
1480
1481	/* Initialize queue algorithms */
1482	classq_init();
1483
1484	/* Initialize packet schedulers */
1485	pktsched_init();
1486
1487	/* Initialize flow advisory subsystem */
1488	flowadv_init();
1489
1490	/* Initialize the pktap virtual interface */
1491	pktap_init();
1492
1493#if DEBUG
1494	/* Run self-tests */
1495	dlil_verify_sum16();
1496#endif /* DEBUG */
1497
1498	/*
1499	 * Create and start up the main DLIL input thread and the interface
1500	 * detacher threads once everything is initialized.
1501	 */
1502	dlil_create_input_thread(NULL, dlil_main_input_thread);
1503
1504	if (kernel_thread_start(ifnet_detacher_thread_func,
1505	    NULL, &thread) != KERN_SUCCESS) {
1506		panic_plain("%s: couldn't create detacher thread", __func__);
1507		/* NOTREACHED */
1508	}
1509	thread_deallocate(thread);
1510}
1511
1512static void
1513if_flt_monitor_busy(struct ifnet *ifp)
1514{
1515	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1516
1517	++ifp->if_flt_busy;
1518	VERIFY(ifp->if_flt_busy != 0);
1519}
1520
1521static void
1522if_flt_monitor_unbusy(struct ifnet *ifp)
1523{
1524	if_flt_monitor_leave(ifp);
1525}
1526
1527static void
1528if_flt_monitor_enter(struct ifnet *ifp)
1529{
1530	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1531
1532	while (ifp->if_flt_busy) {
1533		++ifp->if_flt_waiters;
1534		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
1535		    (PZERO - 1), "if_flt_monitor", NULL);
1536	}
1537	if_flt_monitor_busy(ifp);
1538}
1539
1540static void
1541if_flt_monitor_leave(struct ifnet *ifp)
1542{
1543	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1544
1545	VERIFY(ifp->if_flt_busy != 0);
1546	--ifp->if_flt_busy;
1547
1548	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
1549		ifp->if_flt_waiters = 0;
1550		wakeup(&ifp->if_flt_head);
1551	}
1552}
1553
1554__private_extern__ int
1555dlil_attach_filter(struct ifnet	*ifp, const struct iff_filter *if_filter,
1556    interface_filter_t *filter_ref, u_int32_t flags)
1557{
1558	int retval = 0;
1559	struct ifnet_filter *filter = NULL;
1560
1561	ifnet_head_lock_shared();
1562	/* Check that the interface is in the global list */
1563	if (!ifnet_lookup(ifp)) {
1564		retval = ENXIO;
1565		goto done;
1566	}
1567
1568	filter = zalloc(dlif_filt_zone);
1569	if (filter == NULL) {
1570		retval = ENOMEM;
1571		goto done;
1572	}
1573	bzero(filter, dlif_filt_size);
1574
1575	/* refcnt held above during lookup */
1576	filter->filt_flags = flags;
1577	filter->filt_ifp = ifp;
1578	filter->filt_cookie = if_filter->iff_cookie;
1579	filter->filt_name = if_filter->iff_name;
1580	filter->filt_protocol = if_filter->iff_protocol;
1581	filter->filt_input = if_filter->iff_input;
1582	filter->filt_output = if_filter->iff_output;
1583	filter->filt_event = if_filter->iff_event;
1584	filter->filt_ioctl = if_filter->iff_ioctl;
1585	filter->filt_detached = if_filter->iff_detached;
1586
1587	lck_mtx_lock(&ifp->if_flt_lock);
1588	if_flt_monitor_enter(ifp);
1589
1590	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1591	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);
1592
1593	if_flt_monitor_leave(ifp);
1594	lck_mtx_unlock(&ifp->if_flt_lock);
1595
1596	*filter_ref = filter;
1597
1598	/*
1599	 * Bump filter count and route_generation ID to let TCP
1600	 * know it shouldn't do TSO on this connection
1601	 */
1602	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
1603		OSAddAtomic(1, &dlil_filter_disable_tso_count);
1604		routegenid_update();
1605	}
1606	if (dlil_verbose) {
1607		printf("%s: %s filter attached\n", if_name(ifp),
1608		    if_filter->iff_name);
1609	}
1610done:
1611	ifnet_head_done();
1612	if (retval != 0 && ifp != NULL) {
1613		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
1614		    if_name(ifp), if_filter->iff_name, retval);
1615	}
1616	if (retval != 0 && filter != NULL)
1617		zfree(dlif_filt_zone, filter);
1618
1619	return (retval);
1620}
1621
1622static int
1623dlil_detach_filter_internal(interface_filter_t	filter, int detached)
1624{
1625	int retval = 0;
1626
1627	if (detached == 0) {
1628		ifnet_t ifp = NULL;
1629
1630		ifnet_head_lock_shared();
1631		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
1632			interface_filter_t entry = NULL;
1633
1634			lck_mtx_lock(&ifp->if_flt_lock);
1635			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
1636				if (entry != filter || entry->filt_skip)
1637					continue;
1638				/*
1639				 * We've found a match; since it's possible
1640				 * that the thread gets blocked in the monitor,
1641				 * we do the lock dance.  Interface should
1642				 * not be detached since we still have a use
1643				 * count held during filter attach.
1644				 */
1645				entry->filt_skip = 1;	/* skip input/output */
1646				lck_mtx_unlock(&ifp->if_flt_lock);
1647				ifnet_head_done();
1648
1649				lck_mtx_lock(&ifp->if_flt_lock);
1650				if_flt_monitor_enter(ifp);
1651				lck_mtx_assert(&ifp->if_flt_lock,
1652				    LCK_MTX_ASSERT_OWNED);
1653
1654				/* Remove the filter from the list */
1655				TAILQ_REMOVE(&ifp->if_flt_head, filter,
1656				    filt_next);
1657
1658				if_flt_monitor_leave(ifp);
1659				lck_mtx_unlock(&ifp->if_flt_lock);
1660				if (dlil_verbose) {
1661					printf("%s: %s filter detached\n",
1662					    if_name(ifp), filter->filt_name);
1663				}
1664				goto destroy;
1665			}
1666			lck_mtx_unlock(&ifp->if_flt_lock);
1667		}
1668		ifnet_head_done();
1669
1670		/* filter parameter is not a valid filter ref */
1671		retval = EINVAL;
1672		goto done;
1673	}
1674
1675	if (dlil_verbose)
1676		printf("%s filter detached\n", filter->filt_name);
1677
1678destroy:
1679
1680	/* Call the detached function if there is one */
1681	if (filter->filt_detached)
1682		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
1683
1684	/* Free the filter */
1685	zfree(dlif_filt_zone, filter);
1686
1687	/*
1688	 * Decrease filter count and route_generation ID to let TCP
1689	 * know it should reevalute doing TSO or not
1690	 */
1691	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
1692		OSAddAtomic(-1, &dlil_filter_disable_tso_count);
1693		routegenid_update();
1694	}
1695done:
1696	if (retval != 0) {
1697		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
1698		    filter->filt_name, retval);
1699	}
1700	return (retval);
1701}
1702
1703__private_extern__ void
1704dlil_detach_filter(interface_filter_t filter)
1705{
1706	if (filter == NULL)
1707		return;
1708	dlil_detach_filter_internal(filter, 0);
1709}
1710
1711/*
1712 * Main input thread:
1713 *
1714 *   a) handles all inbound packets for lo0
1715 *   b) handles all inbound packets for interfaces with no dedicated
1716 *	input thread (e.g. anything but Ethernet/PDP or those that support
1717 *	opportunistic polling.)
1718 *   c) protocol registrations
1719 *   d) packet injections
1720 */
1721static void
1722dlil_main_input_thread_func(void *v, wait_result_t w)
1723{
1724#pragma unused(w)
1725	struct dlil_main_threading_info *inpm = v;
1726	struct dlil_threading_info *inp = v;
1727
1728	VERIFY(inp == dlil_main_input_thread);
1729	VERIFY(inp->ifp == NULL);
1730	VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
1731
1732	while (1) {
1733		struct mbuf *m = NULL, *m_loop = NULL;
1734		u_int32_t m_cnt, m_cnt_loop;
1735		boolean_t proto_req;
1736
1737		lck_mtx_lock_spin(&inp->input_lck);
1738
1739		/* Wait until there is work to be done */
1740		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1741			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1742			(void) msleep(&inp->input_waiting, &inp->input_lck,
1743			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1744		}
1745
1746		inp->input_waiting |= DLIL_INPUT_RUNNING;
1747		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1748
1749		/* Main input thread cannot be terminated */
1750		VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE));
1751
1752		proto_req = (inp->input_waiting &
1753		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
1754
1755		/* Packets for non-dedicated interfaces other than lo0 */
1756		m_cnt = qlen(&inp->rcvq_pkts);
1757		m = _getq_all(&inp->rcvq_pkts);
1758
1759		/* Packets exclusive to lo0 */
1760		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
1761		m_loop = _getq_all(&inpm->lo_rcvq_pkts);
1762
1763		inp->wtot = 0;
1764
1765		lck_mtx_unlock(&inp->input_lck);
1766
1767		/*
1768		* NOTE warning %%% attention !!!!
1769		* We should think about putting some thread starvation
1770		* safeguards if we deal with long chains of packets.
1771		*/
1772		if (m_loop != NULL)
1773			dlil_input_packet_list_extended(lo_ifp, m_loop,
1774			    m_cnt_loop, inp->mode);
1775
1776		if (m != NULL)
1777			dlil_input_packet_list_extended(NULL, m,
1778			    m_cnt, inp->mode);
1779
1780		if (proto_req)
1781			proto_input_run();
1782	}
1783
1784	/* NOTREACHED */
1785	VERIFY(0);	/* we should never get here */
1786}
1787
1788/*
1789 * Input thread for interfaces with legacy input model.
1790 */
1791static void
1792dlil_input_thread_func(void *v, wait_result_t w)
1793{
1794#pragma unused(w)
1795	struct dlil_threading_info *inp = v;
1796	struct ifnet *ifp = inp->ifp;
1797
1798	VERIFY(inp != dlil_main_input_thread);
1799	VERIFY(ifp != NULL);
1800	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll);
1801	VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
1802
1803	while (1) {
1804		struct mbuf *m = NULL;
1805		u_int32_t m_cnt;
1806
1807		lck_mtx_lock_spin(&inp->input_lck);
1808
1809		/* Wait until there is work to be done */
1810		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1811			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1812			(void) msleep(&inp->input_waiting, &inp->input_lck,
1813			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1814		}
1815
1816		inp->input_waiting |= DLIL_INPUT_RUNNING;
1817		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1818
1819		/*
1820		 * Protocol registration and injection must always use
1821		 * the main input thread; in theory the latter can utilize
1822		 * the corresponding input thread where the packet arrived
1823		 * on, but that requires our knowing the interface in advance
1824		 * (and the benefits might not worth the trouble.)
1825		 */
1826		VERIFY(!(inp->input_waiting &
1827		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
1828
1829		/* Packets for this interface */
1830		m_cnt = qlen(&inp->rcvq_pkts);
1831		m = _getq_all(&inp->rcvq_pkts);
1832
1833		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
1834			lck_mtx_unlock(&inp->input_lck);
1835
1836			/* Free up pending packets */
1837			if (m != NULL)
1838				mbuf_freem_list(m);
1839
1840			dlil_terminate_input_thread(inp);
1841			/* NOTREACHED */
1842			return;
1843		}
1844
1845		inp->wtot = 0;
1846
1847		dlil_input_stats_sync(ifp, inp);
1848
1849		lck_mtx_unlock(&inp->input_lck);
1850
1851		/*
1852		* NOTE warning %%% attention !!!!
1853		* We should think about putting some thread starvation
1854		* safeguards if we deal with long chains of packets.
1855		*/
1856		if (m != NULL)
1857			dlil_input_packet_list_extended(NULL, m,
1858			    m_cnt, inp->mode);
1859	}
1860
1861	/* NOTREACHED */
1862	VERIFY(0);	/* we should never get here */
1863}
1864
1865/*
1866 * Input thread for interfaces with opportunistic polling input model.
1867 */
1868static void
1869dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
1870{
1871#pragma unused(w)
1872	struct dlil_threading_info *inp = v;
1873	struct ifnet *ifp = inp->ifp;
1874	struct timespec ts;
1875
1876	VERIFY(inp != dlil_main_input_thread);
1877	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL));
1878
1879	while (1) {
1880		struct mbuf *m = NULL;
1881		u_int32_t m_cnt, m_size, poll_req = 0;
1882		ifnet_model_t mode;
1883		struct timespec now, delta;
1884		u_int64_t ival;
1885
1886		lck_mtx_lock_spin(&inp->input_lck);
1887
1888		if ((ival = inp->rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN)
1889			ival = IF_RXPOLL_INTERVALTIME_MIN;
1890
1891		/* Link parameters changed? */
1892		if (ifp->if_poll_update != 0) {
1893			ifp->if_poll_update = 0;
1894			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
1895		}
1896
1897		/* Current operating mode */
1898		mode = inp->mode;
1899
1900		/* Wait until there is work to be done */
1901		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1902			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1903			(void) msleep(&inp->input_waiting, &inp->input_lck,
1904			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1905		}
1906
1907		inp->input_waiting |= DLIL_INPUT_RUNNING;
1908		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1909
1910		/*
1911		 * Protocol registration and injection must always use
1912		 * the main input thread; in theory the latter can utilize
1913		 * the corresponding input thread where the packet arrived
1914		 * on, but that requires our knowing the interface in advance
1915		 * (and the benefits might not worth the trouble.)
1916		 */
1917		VERIFY(!(inp->input_waiting &
1918		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
1919
1920		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
1921			/* Free up pending packets */
1922			_flushq(&inp->rcvq_pkts);
1923			lck_mtx_unlock(&inp->input_lck);
1924
1925			dlil_terminate_input_thread(inp);
1926			/* NOTREACHED */
1927			return;
1928		}
1929
1930		/* Total count of all packets */
1931		m_cnt = qlen(&inp->rcvq_pkts);
1932
1933		/* Total bytes of all packets */
1934		m_size = qsize(&inp->rcvq_pkts);
1935
1936		/* Packets for this interface */
1937		m = _getq_all(&inp->rcvq_pkts);
1938		VERIFY(m != NULL || m_cnt == 0);
1939
1940		nanouptime(&now);
1941		if (!net_timerisset(&inp->sample_lasttime))
1942			*(&inp->sample_lasttime) = *(&now);
1943
1944		net_timersub(&now, &inp->sample_lasttime, &delta);
1945		if (if_rxpoll && net_timerisset(&inp->sample_holdtime)) {
1946			u_int32_t ptot, btot;
1947
1948			/* Accumulate statistics for current sampling */
1949			PKTCNTR_ADD(&inp->sstats, m_cnt, m_size);
1950
1951			if (net_timercmp(&delta, &inp->sample_holdtime, <))
1952				goto skip;
1953
1954			*(&inp->sample_lasttime) = *(&now);
1955
1956			/* Calculate min/max of inbound bytes */
1957			btot = (u_int32_t)inp->sstats.bytes;
1958			if (inp->rxpoll_bmin == 0 || inp->rxpoll_bmin > btot)
1959				inp->rxpoll_bmin = btot;
1960			if (btot > inp->rxpoll_bmax)
1961				inp->rxpoll_bmax = btot;
1962
1963			/* Calculate EWMA of inbound bytes */
1964			DLIL_EWMA(inp->rxpoll_bavg, btot, if_rxpoll_decay);
1965
1966			/* Calculate min/max of inbound packets */
1967			ptot = (u_int32_t)inp->sstats.packets;
1968			if (inp->rxpoll_pmin == 0 || inp->rxpoll_pmin > ptot)
1969				inp->rxpoll_pmin = ptot;
1970			if (ptot > inp->rxpoll_pmax)
1971				inp->rxpoll_pmax = ptot;
1972
1973			/* Calculate EWMA of inbound packets */
1974			DLIL_EWMA(inp->rxpoll_pavg, ptot, if_rxpoll_decay);
1975
1976			/* Reset sampling statistics */
1977			PKTCNTR_CLEAR(&inp->sstats);
1978
1979			/* Calculate EWMA of wakeup requests */
1980			DLIL_EWMA(inp->rxpoll_wavg, inp->wtot, if_rxpoll_decay);
1981			inp->wtot = 0;
1982
1983			if (dlil_verbose) {
1984				if (!net_timerisset(&inp->dbg_lasttime))
1985					*(&inp->dbg_lasttime) = *(&now);
1986				net_timersub(&now, &inp->dbg_lasttime, &delta);
1987				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
1988					*(&inp->dbg_lasttime) = *(&now);
1989					printf("%s: [%s] pkts avg %d max %d "
1990					    "limits [%d/%d], wreq avg %d "
1991					    "limits [%d/%d], bytes avg %d "
1992					    "limits [%d/%d]\n", if_name(ifp),
1993					    (inp->mode ==
1994					    IFNET_MODEL_INPUT_POLL_ON) ?
1995					    "ON" : "OFF", inp->rxpoll_pavg,
1996					    inp->rxpoll_pmax,
1997					    inp->rxpoll_plowat,
1998					    inp->rxpoll_phiwat,
1999					    inp->rxpoll_wavg,
2000					    inp->rxpoll_wlowat,
2001					    inp->rxpoll_whiwat,
2002					    inp->rxpoll_bavg,
2003					    inp->rxpoll_blowat,
2004					    inp->rxpoll_bhiwat);
2005				}
2006			}
2007
2008			/* Perform mode transition, if necessary */
2009			if (!net_timerisset(&inp->mode_lasttime))
2010				*(&inp->mode_lasttime) = *(&now);
2011
2012			net_timersub(&now, &inp->mode_lasttime, &delta);
2013			if (net_timercmp(&delta, &inp->mode_holdtime, <))
2014				goto skip;
2015
2016			if (inp->rxpoll_pavg <= inp->rxpoll_plowat &&
2017			    inp->rxpoll_bavg <= inp->rxpoll_blowat &&
2018			    inp->mode != IFNET_MODEL_INPUT_POLL_OFF) {
2019				mode = IFNET_MODEL_INPUT_POLL_OFF;
2020			} else if (inp->rxpoll_pavg >= inp->rxpoll_phiwat &&
2021			    (inp->rxpoll_bavg >= inp->rxpoll_bhiwat ||
2022			    inp->rxpoll_wavg >= inp->rxpoll_whiwat) &&
2023			    inp->mode != IFNET_MODEL_INPUT_POLL_ON) {
2024				mode = IFNET_MODEL_INPUT_POLL_ON;
2025			}
2026
2027			if (mode != inp->mode) {
2028				inp->mode = mode;
2029				*(&inp->mode_lasttime) = *(&now);
2030				poll_req++;
2031			}
2032		}
2033skip:
2034		dlil_input_stats_sync(ifp, inp);
2035
2036		lck_mtx_unlock(&inp->input_lck);
2037
2038		/*
2039		 * If there's a mode change and interface is still attached,
2040		 * perform a downcall to the driver for the new mode.  Also
2041		 * hold an IO refcnt on the interface to prevent it from
2042		 * being detached (will be release below.)
2043		 */
2044		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
2045			struct ifnet_model_params p = { mode, { 0 } };
2046			errno_t err;
2047
2048			if (dlil_verbose) {
2049				printf("%s: polling is now %s, "
2050				    "pkts avg %d max %d limits [%d/%d], "
2051				    "wreq avg %d limits [%d/%d], "
2052				    "bytes avg %d limits [%d/%d]\n",
2053				    if_name(ifp),
2054				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
2055				    "ON" : "OFF", inp->rxpoll_pavg,
2056				    inp->rxpoll_pmax, inp->rxpoll_plowat,
2057				    inp->rxpoll_phiwat, inp->rxpoll_wavg,
2058				    inp->rxpoll_wlowat, inp->rxpoll_whiwat,
2059				    inp->rxpoll_bavg, inp->rxpoll_blowat,
2060				    inp->rxpoll_bhiwat);
2061			}
2062
2063			if ((err = ((*ifp->if_input_ctl)(ifp,
2064			    IFNET_CTL_SET_INPUT_MODEL, sizeof (p), &p))) != 0) {
2065				printf("%s: error setting polling mode "
2066				    "to %s (%d)\n", if_name(ifp),
2067				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
2068				    "ON" : "OFF", err);
2069			}
2070
2071			switch (mode) {
2072			case IFNET_MODEL_INPUT_POLL_OFF:
2073				ifnet_set_poll_cycle(ifp, NULL);
2074				inp->rxpoll_offreq++;
2075				if (err != 0)
2076					inp->rxpoll_offerr++;
2077				break;
2078
2079			case IFNET_MODEL_INPUT_POLL_ON:
2080				net_nsectimer(&ival, &ts);
2081				ifnet_set_poll_cycle(ifp, &ts);
2082				ifnet_poll(ifp);
2083				inp->rxpoll_onreq++;
2084				if (err != 0)
2085					inp->rxpoll_onerr++;
2086				break;
2087
2088			default:
2089				VERIFY(0);
2090				/* NOTREACHED */
2091			}
2092
2093			/* Release the IO refcnt */
2094			ifnet_decr_iorefcnt(ifp);
2095		}
2096
2097		/*
2098		* NOTE warning %%% attention !!!!
2099		* We should think about putting some thread starvation
2100		* safeguards if we deal with long chains of packets.
2101		*/
2102		if (m != NULL)
2103			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
2104	}
2105
2106	/* NOTREACHED */
2107	VERIFY(0);	/* we should never get here */
2108}
2109
2110/*
2111 * Must be called on an attached ifnet (caller is expected to check.)
2112 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
2113 */
2114errno_t
2115dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
2116    boolean_t locked)
2117{
2118	struct dlil_threading_info *inp;
2119	u_int64_t sample_holdtime, inbw;
2120
2121	VERIFY(ifp != NULL);
2122	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL)
2123		return (ENXIO);
2124
2125	if (p != NULL) {
2126		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
2127		    (p->packets_lowat != 0 && p->packets_hiwat == 0))
2128			return (EINVAL);
2129		if (p->packets_lowat != 0 &&	/* hiwat must be non-zero */
2130		    p->packets_lowat >= p->packets_hiwat)
2131			return (EINVAL);
2132		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
2133		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0))
2134			return (EINVAL);
2135		if (p->bytes_lowat != 0 &&	/* hiwat must be non-zero */
2136		    p->bytes_lowat >= p->bytes_hiwat)
2137			return (EINVAL);
2138		if (p->interval_time != 0 &&
2139		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN)
2140			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
2141	}
2142
2143	if (!locked)
2144		lck_mtx_lock(&inp->input_lck);
2145
2146	lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
2147
2148	/*
2149	 * Normally, we'd reset the parameters to the auto-tuned values
2150	 * if the the input thread detects a change in link rate.  If the
2151	 * driver provides its own parameters right after a link rate
2152	 * changes, but before the input thread gets to run, we want to
2153	 * make sure to keep the driver's values.  Clearing if_poll_update
2154	 * will achieve that.
2155	 */
2156	if (p != NULL && !locked && ifp->if_poll_update != 0)
2157		ifp->if_poll_update = 0;
2158
2159	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
2160		sample_holdtime = 0;	/* polling is disabled */
2161		inp->rxpoll_wlowat = inp->rxpoll_plowat =
2162		    inp->rxpoll_blowat = 0;
2163		inp->rxpoll_whiwat = inp->rxpoll_phiwat =
2164		    inp->rxpoll_bhiwat = (u_int32_t)-1;
2165		inp->rxpoll_plim = 0;
2166		inp->rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
2167	} else {
2168		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
2169		u_int64_t ival;
2170		unsigned int n, i;
2171
2172		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
2173			if (inbw < rxpoll_tbl[i].speed)
2174				break;
2175			n = i;
2176		}
2177		/* auto-tune if caller didn't specify a value */
2178		plowat = ((p == NULL || p->packets_lowat == 0) ?
2179		    rxpoll_tbl[n].plowat : p->packets_lowat);
2180		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
2181		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
2182		blowat = ((p == NULL || p->bytes_lowat == 0) ?
2183		    rxpoll_tbl[n].blowat : p->bytes_lowat);
2184		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
2185		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
2186		plim = ((p == NULL || p->packets_limit == 0) ?
2187		    if_rxpoll_max : p->packets_limit);
2188		ival = ((p == NULL || p->interval_time == 0) ?
2189		    if_rxpoll_interval_time : p->interval_time);
2190
2191		VERIFY(plowat != 0 && phiwat != 0);
2192		VERIFY(blowat != 0 && bhiwat != 0);
2193		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);
2194
2195		sample_holdtime = if_rxpoll_sample_holdtime;
2196		inp->rxpoll_wlowat = if_rxpoll_wlowat;
2197		inp->rxpoll_whiwat = if_rxpoll_whiwat;
2198		inp->rxpoll_plowat = plowat;
2199		inp->rxpoll_phiwat = phiwat;
2200		inp->rxpoll_blowat = blowat;
2201		inp->rxpoll_bhiwat = bhiwat;
2202		inp->rxpoll_plim = plim;
2203		inp->rxpoll_ival = ival;
2204	}
2205
2206	net_nsectimer(&if_rxpoll_mode_holdtime, &inp->mode_holdtime);
2207	net_nsectimer(&sample_holdtime, &inp->sample_holdtime);
2208
2209	if (dlil_verbose) {
2210		printf("%s: speed %llu bps, sample per %llu nsec, "
2211		    "poll interval %llu nsec, pkts per poll %u, "
2212		    "pkt limits [%u/%u], wreq limits [%u/%u], "
2213		    "bytes limits [%u/%u]\n", if_name(ifp),
2214		    inbw, sample_holdtime, inp->rxpoll_ival, inp->rxpoll_plim,
2215		    inp->rxpoll_plowat, inp->rxpoll_phiwat, inp->rxpoll_wlowat,
2216		    inp->rxpoll_whiwat, inp->rxpoll_blowat, inp->rxpoll_bhiwat);
2217	}
2218
2219	if (!locked)
2220		lck_mtx_unlock(&inp->input_lck);
2221
2222	return (0);
2223}
2224
2225/*
2226 * Must be called on an attached ifnet (caller is expected to check.)
2227 */
2228errno_t
2229dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
2230{
2231	struct dlil_threading_info *inp;
2232
2233	VERIFY(ifp != NULL && p != NULL);
2234	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL)
2235		return (ENXIO);
2236
2237	bzero(p, sizeof (*p));
2238
2239	lck_mtx_lock(&inp->input_lck);
2240	p->packets_limit = inp->rxpoll_plim;
2241	p->packets_lowat = inp->rxpoll_plowat;
2242	p->packets_hiwat = inp->rxpoll_phiwat;
2243	p->bytes_lowat = inp->rxpoll_blowat;
2244	p->bytes_hiwat = inp->rxpoll_bhiwat;
2245	p->interval_time = inp->rxpoll_ival;
2246	lck_mtx_unlock(&inp->input_lck);
2247
2248	return (0);
2249}
2250
2251errno_t
2252ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
2253    const struct ifnet_stat_increment_param *s)
2254{
2255	return (ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE));
2256}
2257
2258errno_t
2259ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
2260    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
2261{
2262	return (ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE));
2263}
2264
2265static errno_t
2266ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
2267    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
2268{
2269	struct thread *tp = current_thread();
2270	struct mbuf *last;
2271	struct dlil_threading_info *inp;
2272	u_int32_t m_cnt = 0, m_size = 0;
2273
2274	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
2275		if (m_head != NULL)
2276			mbuf_freem_list(m_head);
2277		return (EINVAL);
2278	}
2279
2280	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
2281	VERIFY(m_tail == NULL || ext);
2282	VERIFY(s != NULL || !ext);
2283
2284	/*
2285	 * Drop the packet(s) if the parameters are invalid, or if the
2286	 * interface is no longer attached; else hold an IO refcnt to
2287	 * prevent it from being detached (will be released below.)
2288	 */
2289	if (ifp == NULL || (ifp != lo_ifp && !ifnet_is_attached(ifp, 1))) {
2290		if (m_head != NULL)
2291			mbuf_freem_list(m_head);
2292		return (EINVAL);
2293	}
2294
2295	if (m_tail == NULL) {
2296		last = m_head;
2297		while (m_head != NULL) {
2298#if IFNET_INPUT_SANITY_CHK
2299			if (dlil_input_sanity_check != 0)
2300				DLIL_INPUT_CHECK(last, ifp);
2301#endif /* IFNET_INPUT_SANITY_CHK */
2302			m_cnt++;
2303			m_size += m_length(last);
2304			if (mbuf_nextpkt(last) == NULL)
2305				break;
2306			last = mbuf_nextpkt(last);
2307		}
2308		m_tail = last;
2309	} else {
2310#if IFNET_INPUT_SANITY_CHK
2311		if (dlil_input_sanity_check != 0) {
2312			last = m_head;
2313			while (1) {
2314				DLIL_INPUT_CHECK(last, ifp);
2315				m_cnt++;
2316				m_size += m_length(last);
2317				if (mbuf_nextpkt(last) == NULL)
2318					break;
2319				last = mbuf_nextpkt(last);
2320			}
2321		} else {
2322			m_cnt = s->packets_in;
2323			m_size = s->bytes_in;
2324			last = m_tail;
2325		}
2326#else
2327		m_cnt = s->packets_in;
2328		m_size = s->bytes_in;
2329		last = m_tail;
2330#endif /* IFNET_INPUT_SANITY_CHK */
2331	}
2332
2333	if (last != m_tail) {
2334		panic_plain("%s: invalid input packet chain for %s, "
2335		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
2336		    m_tail, last);
2337	}
2338
2339	/*
2340	 * Assert packet count only for the extended variant, for backwards
2341	 * compatibility, since this came directly from the device driver.
2342	 * Relax this assertion for input bytes, as the driver may have
2343	 * included the link-layer headers in the computation; hence
2344	 * m_size is just an approximation.
2345	 */
2346	if (ext && s->packets_in != m_cnt) {
2347		panic_plain("%s: input packet count mismatch for %s, "
2348		    "%d instead of %d\n", __func__, if_name(ifp),
2349		    s->packets_in, m_cnt);
2350	}
2351
2352	if ((inp = ifp->if_inp) == NULL)
2353		inp = dlil_main_input_thread;
2354
2355	/*
2356	 * If there is a matching DLIL input thread associated with an
2357	 * affinity set, associate this thread with the same set.  We
2358	 * will only do this once.
2359	 */
2360	lck_mtx_lock_spin(&inp->input_lck);
2361	if (inp != dlil_main_input_thread && inp->net_affinity &&
2362	    ((!poll && inp->wloop_thr == THREAD_NULL) ||
2363	    (poll && inp->poll_thr == THREAD_NULL))) {
2364		u_int32_t tag = inp->tag;
2365
2366		if (poll) {
2367			VERIFY(inp->poll_thr == THREAD_NULL);
2368			inp->poll_thr = tp;
2369		} else {
2370			VERIFY(inp->wloop_thr == THREAD_NULL);
2371			inp->wloop_thr = tp;
2372		}
2373		lck_mtx_unlock(&inp->input_lck);
2374
2375		/* Associate the current thread with the new affinity tag */
2376		(void) dlil_affinity_set(tp, tag);
2377
2378		/*
2379		 * Take a reference on the current thread; during detach,
2380		 * we will need to refer to it in order ot tear down its
2381		 * affinity.
2382		 */
2383		thread_reference(tp);
2384		lck_mtx_lock_spin(&inp->input_lck);
2385	}
2386
2387	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));
2388
2389        /*
2390	 * Because of loopbacked multicast we cannot stuff the ifp in
2391	 * the rcvif of the packet header: loopback (lo0) packets use a
2392	 * dedicated list so that we can later associate them with lo_ifp
2393	 * on their way up the stack.  Packets for other interfaces without
2394	 * dedicated input threads go to the regular list.
2395	 */
2396	if (m_head != NULL) {
2397		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
2398			struct dlil_main_threading_info *inpm =
2399			    (struct dlil_main_threading_info *)inp;
2400			_addq_multi(&inpm->lo_rcvq_pkts, m_head, m_tail,
2401			    m_cnt, m_size);
2402		} else {
2403			_addq_multi(&inp->rcvq_pkts, m_head, m_tail,
2404			    m_cnt, m_size);
2405		}
2406	}
2407
2408#if IFNET_INPUT_SANITY_CHK
2409	if (dlil_input_sanity_check != 0) {
2410		u_int32_t count;
2411		struct mbuf *m0;
2412
2413		for (m0 = m_head, count = 0; m0; m0 = mbuf_nextpkt(m0))
2414			count++;
2415
2416		if (count != m_cnt) {
2417			panic_plain("%s: invalid packet count %d "
2418			    "(expected %d)\n", if_name(ifp),
2419			    count, m_cnt);
2420			/* NOTREACHED */
2421		}
2422
2423		inp->input_mbuf_cnt += m_cnt;
2424	}
2425#endif /* IFNET_INPUT_SANITY_CHK */
2426
2427	if (s != NULL) {
2428		dlil_input_stats_add(s, inp, poll);
2429		/*
2430		 * If we're using the main input thread, synchronize the
2431		 * stats now since we have the interface context.  All
2432		 * other cases involving dedicated input threads will
2433		 * have their stats synchronized there.
2434		 */
2435		if (inp == dlil_main_input_thread)
2436			dlil_input_stats_sync(ifp, inp);
2437	}
2438
2439	inp->input_waiting |= DLIL_INPUT_WAITING;
2440	if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
2441		inp->wtot++;
2442		wakeup_one((caddr_t)&inp->input_waiting);
2443	}
2444	lck_mtx_unlock(&inp->input_lck);
2445
2446	if (ifp != lo_ifp) {
2447		/* Release the IO refcnt */
2448		ifnet_decr_iorefcnt(ifp);
2449	}
2450
2451	return (0);
2452}
2453
2454static void
2455ifnet_start_common(struct ifnet *ifp, int resetfc)
2456{
2457	if (!(ifp->if_eflags & IFEF_TXSTART))
2458		return;
2459	/*
2460	 * If the starter thread is inactive, signal it to do work,
2461	 * unless the interface is being flow controlled from below,
2462	 * e.g. a virtual interface being flow controlled by a real
2463	 * network interface beneath it.
2464	 */
2465	lck_mtx_lock_spin(&ifp->if_start_lock);
2466	if (resetfc) {
2467		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
2468	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
2469		lck_mtx_unlock(&ifp->if_start_lock);
2470		return;
2471	}
2472	ifp->if_start_req++;
2473	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL) {
2474		wakeup_one((caddr_t)&ifp->if_start_thread);
2475	}
2476	lck_mtx_unlock(&ifp->if_start_lock);
2477}
2478
2479void
2480ifnet_start(struct ifnet *ifp)
2481{
2482	ifnet_start_common(ifp, 0);
2483}
2484
2485static void
2486ifnet_start_thread_fn(void *v, wait_result_t w)
2487{
2488#pragma unused(w)
2489	struct ifnet *ifp = v;
2490	char ifname[IFNAMSIZ + 1];
2491	struct timespec *ts = NULL;
2492	struct ifclassq *ifq = &ifp->if_snd;
2493
2494	/*
2495	 * Treat the dedicated starter thread for lo0 as equivalent to
2496	 * the driver workloop thread; if net_affinity is enabled for
2497	 * the main input thread, associate this starter thread to it
2498	 * by binding them with the same affinity tag.  This is done
2499	 * only once (as we only have one lo_ifp which never goes away.)
2500	 */
2501	if (ifp == lo_ifp) {
2502		struct dlil_threading_info *inp = dlil_main_input_thread;
2503		struct thread *tp = current_thread();
2504
2505		lck_mtx_lock(&inp->input_lck);
2506		if (inp->net_affinity) {
2507			u_int32_t tag = inp->tag;
2508
2509			VERIFY(inp->wloop_thr == THREAD_NULL);
2510			VERIFY(inp->poll_thr == THREAD_NULL);
2511			inp->wloop_thr = tp;
2512			lck_mtx_unlock(&inp->input_lck);
2513
2514			/* Associate this thread with the affinity tag */
2515			(void) dlil_affinity_set(tp, tag);
2516		} else {
2517			lck_mtx_unlock(&inp->input_lck);
2518		}
2519	}
2520
2521	snprintf(ifname, sizeof (ifname), "%s_starter",
2522	    if_name(ifp));
2523
2524	lck_mtx_lock_spin(&ifp->if_start_lock);
2525
2526	for (;;) {
2527		(void) msleep(&ifp->if_start_thread, &ifp->if_start_lock,
2528		    (PZERO - 1) | PSPIN, ifname, ts);
2529
2530		/* interface is detached? */
2531		if (ifp->if_start_thread == THREAD_NULL) {
2532			ifnet_set_start_cycle(ifp, NULL);
2533			lck_mtx_unlock(&ifp->if_start_lock);
2534			ifnet_purge(ifp);
2535
2536			if (dlil_verbose) {
2537				printf("%s: starter thread terminated\n",
2538				    if_name(ifp));
2539			}
2540
2541			/* for the extra refcnt from kernel_thread_start() */
2542			thread_deallocate(current_thread());
2543			/* this is the end */
2544			thread_terminate(current_thread());
2545			/* NOTREACHED */
2546			return;
2547		}
2548
2549		ifp->if_start_active = 1;
2550		for (;;) {
2551			u_int32_t req = ifp->if_start_req;
2552
2553			lck_mtx_unlock(&ifp->if_start_lock);
2554			/* invoke the driver's start routine */
2555			((*ifp->if_start)(ifp));
2556			lck_mtx_lock_spin(&ifp->if_start_lock);
2557
2558			/* if there's no pending request, we're done */
2559			if (req == ifp->if_start_req)
2560				break;
2561		}
2562		ifp->if_start_req = 0;
2563		ifp->if_start_active = 0;
2564		/*
2565		 * Wakeup N ns from now if rate-controlled by TBR, and if
2566		 * there are still packets in the send queue which haven't
2567		 * been dequeued so far; else sleep indefinitely (ts = NULL)
2568		 * until ifnet_start() is called again.
2569		 */
2570		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
2571		    &ifp->if_start_cycle : NULL);
2572
2573		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0)
2574			ts = NULL;
2575	}
2576
2577	/* NOTREACHED */
2578	lck_mtx_unlock(&ifp->if_start_lock);
2579	VERIFY(0);	/* we should never get here */
2580}
2581
2582void
2583ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
2584{
2585	if (ts == NULL)
2586		bzero(&ifp->if_start_cycle, sizeof (ifp->if_start_cycle));
2587	else
2588		*(&ifp->if_start_cycle) = *ts;
2589
2590	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
2591		printf("%s: restart interval set to %lu nsec\n",
2592		    if_name(ifp), ts->tv_nsec);
2593}
2594
2595static void
2596ifnet_poll(struct ifnet *ifp)
2597{
2598	/*
2599	 * If the poller thread is inactive, signal it to do work.
2600	 */
2601	lck_mtx_lock_spin(&ifp->if_poll_lock);
2602	ifp->if_poll_req++;
2603	if (!ifp->if_poll_active && ifp->if_poll_thread != THREAD_NULL) {
2604		wakeup_one((caddr_t)&ifp->if_poll_thread);
2605	}
2606	lck_mtx_unlock(&ifp->if_poll_lock);
2607}
2608
2609static void
2610ifnet_poll_thread_fn(void *v, wait_result_t w)
2611{
2612#pragma unused(w)
2613	struct dlil_threading_info *inp;
2614	struct ifnet *ifp = v;
2615	char ifname[IFNAMSIZ + 1];
2616	struct timespec *ts = NULL;
2617	struct ifnet_stat_increment_param s;
2618
2619	snprintf(ifname, sizeof (ifname), "%s_poller",
2620	    if_name(ifp));
2621	bzero(&s, sizeof (s));
2622
2623	lck_mtx_lock_spin(&ifp->if_poll_lock);
2624
2625	inp = ifp->if_inp;
2626	VERIFY(inp != NULL);
2627
2628	for (;;) {
2629		if (ifp->if_poll_thread != THREAD_NULL) {
2630			(void) msleep(&ifp->if_poll_thread, &ifp->if_poll_lock,
2631			    (PZERO - 1) | PSPIN, ifname, ts);
2632		}
2633
2634		/* interface is detached (maybe while asleep)? */
2635		if (ifp->if_poll_thread == THREAD_NULL) {
2636			ifnet_set_poll_cycle(ifp, NULL);
2637			lck_mtx_unlock(&ifp->if_poll_lock);
2638
2639			if (dlil_verbose) {
2640				printf("%s: poller thread terminated\n",
2641				    if_name(ifp));
2642			}
2643
2644			/* for the extra refcnt from kernel_thread_start() */
2645			thread_deallocate(current_thread());
2646			/* this is the end */
2647			thread_terminate(current_thread());
2648			/* NOTREACHED */
2649			return;
2650		}
2651
2652		ifp->if_poll_active = 1;
2653		for (;;) {
2654			struct mbuf *m_head, *m_tail;
2655			u_int32_t m_lim, m_cnt, m_totlen;
2656			u_int16_t req = ifp->if_poll_req;
2657
2658			lck_mtx_unlock(&ifp->if_poll_lock);
2659
2660			/*
2661			 * If no longer attached, there's nothing to do;
2662			 * else hold an IO refcnt to prevent the interface
2663			 * from being detached (will be released below.)
2664			 */
2665			if (!ifnet_is_attached(ifp, 1)) {
2666				lck_mtx_lock_spin(&ifp->if_poll_lock);
2667				break;
2668			}
2669
2670			m_lim = (inp->rxpoll_plim != 0) ? inp->rxpoll_plim :
2671			    MAX((qlimit(&inp->rcvq_pkts)),
2672			    (inp->rxpoll_phiwat << 2));
2673
2674			if (dlil_verbose > 1) {
2675				printf("%s: polling up to %d pkts, "
2676				    "pkts avg %d max %d, wreq avg %d, "
2677				    "bytes avg %d\n",
2678				    if_name(ifp), m_lim,
2679				    inp->rxpoll_pavg, inp->rxpoll_pmax,
2680				    inp->rxpoll_wavg, inp->rxpoll_bavg);
2681			}
2682
2683			/* invoke the driver's input poll routine */
2684			((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
2685			    &m_cnt, &m_totlen));
2686
2687			if (m_head != NULL) {
2688				VERIFY(m_tail != NULL && m_cnt > 0);
2689
2690				if (dlil_verbose > 1) {
2691					printf("%s: polled %d pkts, "
2692					    "pkts avg %d max %d, wreq avg %d, "
2693					    "bytes avg %d\n",
2694					    if_name(ifp), m_cnt,
2695					    inp->rxpoll_pavg, inp->rxpoll_pmax,
2696					    inp->rxpoll_wavg, inp->rxpoll_bavg);
2697				}
2698
2699				/* stats are required for extended variant */
2700				s.packets_in = m_cnt;
2701				s.bytes_in = m_totlen;
2702
2703				(void) ifnet_input_common(ifp, m_head, m_tail,
2704				    &s, TRUE, TRUE);
2705			} else {
2706				if (dlil_verbose > 1) {
2707					printf("%s: no packets, "
2708					    "pkts avg %d max %d, wreq avg %d, "
2709					    "bytes avg %d\n",
2710					    if_name(ifp), inp->rxpoll_pavg,
2711					    inp->rxpoll_pmax, inp->rxpoll_wavg,
2712					    inp->rxpoll_bavg);
2713				}
2714
2715				(void) ifnet_input_common(ifp, NULL, NULL,
2716				    NULL, FALSE, TRUE);
2717			}
2718
2719			/* Release the io ref count */
2720			ifnet_decr_iorefcnt(ifp);
2721
2722			lck_mtx_lock_spin(&ifp->if_poll_lock);
2723
2724			/* if there's no pending request, we're done */
2725			if (req == ifp->if_poll_req)
2726				break;
2727		}
2728		ifp->if_poll_req = 0;
2729		ifp->if_poll_active = 0;
2730
2731		/*
2732		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
2733		 * until ifnet_poll() is called again.
2734		 */
2735		ts = &ifp->if_poll_cycle;
2736		if (ts->tv_sec == 0 && ts->tv_nsec == 0)
2737			ts = NULL;
2738	}
2739
2740	/* NOTREACHED */
2741	lck_mtx_unlock(&ifp->if_poll_lock);
2742	VERIFY(0);	/* we should never get here */
2743}
2744
2745void
2746ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
2747{
2748	if (ts == NULL)
2749		bzero(&ifp->if_poll_cycle, sizeof (ifp->if_poll_cycle));
2750	else
2751		*(&ifp->if_poll_cycle) = *ts;
2752
2753	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
2754		printf("%s: poll interval set to %lu nsec\n",
2755		    if_name(ifp), ts->tv_nsec);
2756}
2757
2758void
2759ifnet_purge(struct ifnet *ifp)
2760{
2761	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART))
2762		if_qflush(ifp, 0);
2763}
2764
2765void
2766ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
2767{
2768	IFCQ_LOCK_ASSERT_HELD(ifq);
2769
2770	if (!(IFCQ_IS_READY(ifq)))
2771		return;
2772
2773	if (IFCQ_TBR_IS_ENABLED(ifq)) {
2774		struct tb_profile tb = { ifq->ifcq_tbr.tbr_rate_raw,
2775		    ifq->ifcq_tbr.tbr_percent, 0 };
2776		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
2777	}
2778
2779	ifclassq_update(ifq, ev);
2780}
2781
2782void
2783ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
2784{
2785	switch (ev) {
2786	case CLASSQ_EV_LINK_BANDWIDTH:
2787		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL))
2788			ifp->if_poll_update++;
2789		break;
2790
2791	default:
2792		break;
2793	}
2794}
2795
2796errno_t
2797ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
2798{
2799	struct ifclassq *ifq;
2800	u_int32_t omodel;
2801	errno_t err;
2802
2803	if (ifp == NULL || (model != IFNET_SCHED_MODEL_DRIVER_MANAGED &&
2804	    model != IFNET_SCHED_MODEL_NORMAL))
2805		return (EINVAL);
2806	else if (!(ifp->if_eflags & IFEF_TXSTART))
2807		return (ENXIO);
2808
2809	ifq = &ifp->if_snd;
2810	IFCQ_LOCK(ifq);
2811	omodel = ifp->if_output_sched_model;
2812	ifp->if_output_sched_model = model;
2813	if ((err = ifclassq_pktsched_setup(ifq)) != 0)
2814		ifp->if_output_sched_model = omodel;
2815	IFCQ_UNLOCK(ifq);
2816
2817	return (err);
2818}
2819
2820errno_t
2821ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
2822{
2823	if (ifp == NULL)
2824		return (EINVAL);
2825	else if (!(ifp->if_eflags & IFEF_TXSTART))
2826		return (ENXIO);
2827
2828	ifclassq_set_maxlen(&ifp->if_snd, maxqlen);
2829
2830	return (0);
2831}
2832
2833errno_t
2834ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
2835{
2836	if (ifp == NULL || maxqlen == NULL)
2837		return (EINVAL);
2838	else if (!(ifp->if_eflags & IFEF_TXSTART))
2839		return (ENXIO);
2840
2841	*maxqlen = ifclassq_get_maxlen(&ifp->if_snd);
2842
2843	return (0);
2844}
2845
2846errno_t
2847ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
2848{
2849	errno_t err;
2850
2851	if (ifp == NULL || pkts == NULL)
2852		err = EINVAL;
2853	else if (!(ifp->if_eflags & IFEF_TXSTART))
2854		err = ENXIO;
2855	else
2856		err = ifclassq_get_len(&ifp->if_snd, MBUF_SC_UNSPEC,
2857		    pkts, NULL);
2858
2859	return (err);
2860}
2861
2862errno_t
2863ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
2864    u_int32_t *pkts, u_int32_t *bytes)
2865{
2866	errno_t err;
2867
2868	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
2869	    (pkts == NULL && bytes == NULL))
2870		err = EINVAL;
2871	else if (!(ifp->if_eflags & IFEF_TXSTART))
2872		err = ENXIO;
2873	else
2874		err = ifclassq_get_len(&ifp->if_snd, sc, pkts, bytes);
2875
2876	return (err);
2877}
2878
2879errno_t
2880ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
2881{
2882	struct dlil_threading_info *inp;
2883
2884	if (ifp == NULL)
2885		return (EINVAL);
2886	else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
2887		return (ENXIO);
2888
2889	if (maxqlen == 0)
2890		maxqlen = if_rcvq_maxlen;
2891	else if (maxqlen < IF_RCVQ_MINLEN)
2892		maxqlen = IF_RCVQ_MINLEN;
2893
2894	inp = ifp->if_inp;
2895	lck_mtx_lock(&inp->input_lck);
2896	qlimit(&inp->rcvq_pkts) = maxqlen;
2897	lck_mtx_unlock(&inp->input_lck);
2898
2899	return (0);
2900}
2901
2902errno_t
2903ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
2904{
2905	struct dlil_threading_info *inp;
2906
2907	if (ifp == NULL || maxqlen == NULL)
2908		return (EINVAL);
2909	else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
2910		return (ENXIO);
2911
2912	inp = ifp->if_inp;
2913	lck_mtx_lock(&inp->input_lck);
2914	*maxqlen = qlimit(&inp->rcvq_pkts);
2915	lck_mtx_unlock(&inp->input_lck);
2916	return (0);
2917}
2918
2919errno_t
2920ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
2921{
2922	int error;
2923
2924	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
2925	    m->m_nextpkt != NULL) {
2926		if (m != NULL)
2927			m_freem_list(m);
2928		return (EINVAL);
2929	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2930	    !(ifp->if_refflags & IFRF_ATTACHED)) {
2931		/* flag tested without lock for performance */
2932		m_freem(m);
2933		return (ENXIO);
2934	} else if (!(ifp->if_flags & IFF_UP)) {
2935		m_freem(m);
2936		return (ENETDOWN);
2937	}
2938
2939	/* enqueue the packet */
2940	error = ifclassq_enqueue(&ifp->if_snd, m);
2941
2942	/*
2943	 * Tell the driver to start dequeueing; do this even when the queue
2944	 * for the packet is suspended (EQSUSPENDED), as the driver could still
2945	 * be dequeueing from other unsuspended queues.
2946	 */
2947	if (error == 0 || error == EQFULL || error == EQSUSPENDED)
2948		ifnet_start(ifp);
2949
2950	return (error);
2951}
2952
2953errno_t
2954ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
2955{
2956	if (ifp == NULL || mp == NULL)
2957		return (EINVAL);
2958	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2959	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
2960		return (ENXIO);
2961
2962	return (ifclassq_dequeue(&ifp->if_snd, 1, mp, NULL, NULL, NULL));
2963}
2964
2965errno_t
2966ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
2967    struct mbuf **mp)
2968{
2969	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc))
2970		return (EINVAL);
2971	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2972	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
2973		return (ENXIO);
2974
2975	return (ifclassq_dequeue_sc(&ifp->if_snd, sc, 1, mp, NULL, NULL, NULL));
2976}
2977
2978errno_t
2979ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t limit, struct mbuf **head,
2980    struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
2981{
2982	if (ifp == NULL || head == NULL || limit < 1)
2983		return (EINVAL);
2984	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2985	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
2986		return (ENXIO);
2987
2988	return (ifclassq_dequeue(&ifp->if_snd, limit, head, tail, cnt, len));
2989}
2990
2991errno_t
2992ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
2993    u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
2994    u_int32_t *len)
2995{
2996
2997	if (ifp == NULL || head == NULL || limit < 1 || !MBUF_VALID_SC(sc))
2998		return (EINVAL);
2999	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
3000	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
3001		return (ENXIO);
3002
3003	return (ifclassq_dequeue_sc(&ifp->if_snd, sc, limit, head,
3004	    tail, cnt, len));
3005}
3006
3007errno_t
3008ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
3009    const struct sockaddr *dest, const char *dest_linkaddr,
3010    const char *frame_type, u_int32_t *pre, u_int32_t *post)
3011{
3012	if (pre != NULL)
3013		*pre = 0;
3014	if (post != NULL)
3015		*post = 0;
3016
3017	return (ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type));
3018}
3019
3020static int
3021dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
3022    char **frame_header_p, protocol_family_t protocol_family)
3023{
3024	struct ifnet_filter *filter;
3025
3026	/*
3027	 * Pass the inbound packet to the interface filters
3028	 */
3029	lck_mtx_lock_spin(&ifp->if_flt_lock);
3030	/* prevent filter list from changing in case we drop the lock */
3031	if_flt_monitor_busy(ifp);
3032	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3033		int result;
3034
3035		if (!filter->filt_skip && filter->filt_input != NULL &&
3036		    (filter->filt_protocol == 0 ||
3037		    filter->filt_protocol == protocol_family)) {
3038			lck_mtx_unlock(&ifp->if_flt_lock);
3039
3040			result = (*filter->filt_input)(filter->filt_cookie,
3041			    ifp, protocol_family, m_p, frame_header_p);
3042
3043			lck_mtx_lock_spin(&ifp->if_flt_lock);
3044			if (result != 0) {
3045				/* we're done with the filter list */
3046				if_flt_monitor_unbusy(ifp);
3047				lck_mtx_unlock(&ifp->if_flt_lock);
3048				return (result);
3049			}
3050		}
3051	}
3052	/* we're done with the filter list */
3053	if_flt_monitor_unbusy(ifp);
3054	lck_mtx_unlock(&ifp->if_flt_lock);
3055
3056	/*
3057	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
3058	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
3059	 */
3060	if (*m_p != NULL)
3061		(*m_p)->m_flags &= ~M_PROTO1;
3062
3063	return (0);
3064}
3065
3066static int
3067dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
3068    protocol_family_t protocol_family)
3069{
3070	struct ifnet_filter *filter;
3071
3072	/*
3073	 * Pass the outbound packet to the interface filters
3074	 */
3075	lck_mtx_lock_spin(&ifp->if_flt_lock);
3076	/* prevent filter list from changing in case we drop the lock */
3077	if_flt_monitor_busy(ifp);
3078	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3079		int result;
3080
3081		if (!filter->filt_skip && filter->filt_output != NULL &&
3082		    (filter->filt_protocol == 0 ||
3083		    filter->filt_protocol == protocol_family)) {
3084			lck_mtx_unlock(&ifp->if_flt_lock);
3085
3086			result = filter->filt_output(filter->filt_cookie, ifp,
3087			    protocol_family, m_p);
3088
3089			lck_mtx_lock_spin(&ifp->if_flt_lock);
3090			if (result != 0) {
3091				/* we're done with the filter list */
3092				if_flt_monitor_unbusy(ifp);
3093				lck_mtx_unlock(&ifp->if_flt_lock);
3094				return (result);
3095			}
3096		}
3097	}
3098	/* we're done with the filter list */
3099	if_flt_monitor_unbusy(ifp);
3100	lck_mtx_unlock(&ifp->if_flt_lock);
3101
3102	return (0);
3103}
3104
3105static void
3106dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
3107{
3108	int error;
3109
3110	if (ifproto->proto_kpi == kProtoKPI_v1) {
3111		/* Version 1 protocols get one packet at a time */
3112		while (m != NULL) {
3113			char *	frame_header;
3114			mbuf_t	next_packet;
3115
3116			next_packet = m->m_nextpkt;
3117			m->m_nextpkt = NULL;
3118			frame_header = m->m_pkthdr.pkt_hdr;
3119			m->m_pkthdr.pkt_hdr = NULL;
3120			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
3121			    ifproto->protocol_family, m, frame_header);
3122			if (error != 0 && error != EJUSTRETURN)
3123				m_freem(m);
3124			m = next_packet;
3125		}
3126	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
3127		/* Version 2 protocols support packet lists */
3128		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
3129		    ifproto->protocol_family, m);
3130		if (error != 0 && error != EJUSTRETURN)
3131			m_freem_list(m);
3132	}
3133	return;
3134}
3135
3136static void
3137dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
3138    struct dlil_threading_info *inp, boolean_t poll)
3139{
3140	struct ifnet_stat_increment_param *d = &inp->stats;
3141
3142	if (s->packets_in != 0)
3143		d->packets_in += s->packets_in;
3144	if (s->bytes_in != 0)
3145		d->bytes_in += s->bytes_in;
3146	if (s->errors_in != 0)
3147		d->errors_in += s->errors_in;
3148
3149	if (s->packets_out != 0)
3150		d->packets_out += s->packets_out;
3151	if (s->bytes_out != 0)
3152		d->bytes_out += s->bytes_out;
3153	if (s->errors_out != 0)
3154		d->errors_out += s->errors_out;
3155
3156	if (s->collisions != 0)
3157		d->collisions += s->collisions;
3158	if (s->dropped != 0)
3159		d->dropped += s->dropped;
3160
3161	if (poll)
3162		PKTCNTR_ADD(&inp->tstats, s->packets_in, s->bytes_in);
3163}
3164
3165static void
3166dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
3167{
3168	struct ifnet_stat_increment_param *s = &inp->stats;
3169
3170	/*
3171	 * Use of atomic operations is unavoidable here because
3172	 * these stats may also be incremented elsewhere via KPIs.
3173	 */
3174	if (s->packets_in != 0) {
3175		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
3176		s->packets_in = 0;
3177	}
3178	if (s->bytes_in != 0) {
3179		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
3180		s->bytes_in = 0;
3181	}
3182	if (s->errors_in != 0) {
3183		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
3184		s->errors_in = 0;
3185	}
3186
3187	if (s->packets_out != 0) {
3188		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
3189		s->packets_out = 0;
3190	}
3191	if (s->bytes_out != 0) {
3192		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
3193		s->bytes_out = 0;
3194	}
3195	if (s->errors_out != 0) {
3196		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
3197		s->errors_out = 0;
3198	}
3199
3200	if (s->collisions != 0) {
3201		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
3202		s->collisions = 0;
3203	}
3204	if (s->dropped != 0) {
3205		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
3206		s->dropped = 0;
3207	}
3208	/*
3209	 * If we went over the threshold, notify NetworkStatistics.
3210	 */
3211	if (ifp->if_data_threshold &&
3212	    (ifp->if_ibytes + ifp->if_obytes) - ifp->if_dt_bytes >
3213	    ifp->if_data_threshold) {
3214		ifp->if_dt_bytes = ifp->if_ibytes + ifp->if_obytes;
3215		nstat_ifnet_threshold_reached(ifp->if_index);
3216	}
3217	/*
3218	 * No need for atomic operations as they are modified here
3219	 * only from within the DLIL input thread context.
3220	 */
3221	if (inp->tstats.packets != 0) {
3222		inp->pstats.ifi_poll_packets += inp->tstats.packets;
3223		inp->tstats.packets = 0;
3224	}
3225	if (inp->tstats.bytes != 0) {
3226		inp->pstats.ifi_poll_bytes += inp->tstats.bytes;
3227		inp->tstats.bytes = 0;
3228	}
3229}
3230
3231__private_extern__ void
3232dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
3233{
3234	return (dlil_input_packet_list_common(ifp, m, 0,
3235	    IFNET_MODEL_INPUT_POLL_OFF, FALSE));
3236}
3237
3238__private_extern__ void
3239dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
3240    u_int32_t cnt, ifnet_model_t mode)
3241{
3242	return (dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE));
3243}
3244
3245static void
3246dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
3247    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
3248{
3249	int				error = 0;
3250	protocol_family_t		protocol_family;
3251	mbuf_t				next_packet;
3252	ifnet_t				ifp = ifp_param;
3253	char *				frame_header;
3254	struct if_proto	*		last_ifproto = NULL;
3255	mbuf_t				pkt_first = NULL;
3256	mbuf_t *			pkt_next = NULL;
3257	u_int32_t			poll_thresh = 0, poll_ival = 0;
3258
3259	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0);
3260
3261	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
3262	    (poll_ival = if_rxpoll_interval_pkts) > 0)
3263		poll_thresh = cnt;
3264
3265	while (m != NULL) {
3266		struct if_proto *ifproto = NULL;
3267		int iorefcnt = 0;
3268		uint32_t pktf_mask;	/* pkt flags to preserve */
3269
3270		if (ifp_param == NULL)
3271			ifp = m->m_pkthdr.rcvif;
3272
3273		if ((ifp->if_eflags & IFEF_RXPOLL) && poll_thresh != 0 &&
3274		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0)
3275			ifnet_poll(ifp);
3276
3277		/* Check if this mbuf looks valid */
3278		MBUF_INPUT_CHECK(m, ifp);
3279
3280		next_packet = m->m_nextpkt;
3281		m->m_nextpkt = NULL;
3282		frame_header = m->m_pkthdr.pkt_hdr;
3283		m->m_pkthdr.pkt_hdr = NULL;
3284
3285		/*
3286		 * Get an IO reference count if the interface is not
3287		 * loopback (lo0) and it is attached; lo0 never goes
3288		 * away, so optimize for that.
3289		 */
3290		if (ifp != lo_ifp) {
3291			if (!ifnet_is_attached(ifp, 1)) {
3292				m_freem(m);
3293				goto next;
3294			}
3295			iorefcnt = 1;
3296			pktf_mask = 0;
3297		} else {
3298			/*
3299			 * If this arrived on lo0, preserve interface addr
3300			 * info to allow for connectivity between loopback
3301			 * and local interface addresses.
3302			 */
3303			pktf_mask = (PKTF_LOOP|PKTF_IFAINFO);
3304		}
3305
3306		/* make sure packet comes in clean */
3307		m_classifier_init(m, pktf_mask);
3308
3309		ifp_inc_traffic_class_in(ifp, m);
3310
3311		/* find which protocol family this packet is for */
3312		ifnet_lock_shared(ifp);
3313		error = (*ifp->if_demux)(ifp, m, frame_header,
3314		    &protocol_family);
3315		ifnet_lock_done(ifp);
3316		if (error != 0) {
3317			if (error == EJUSTRETURN)
3318				goto next;
3319			protocol_family = 0;
3320		}
3321
3322		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
3323		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3324			dlil_input_cksum_dbg(ifp, m, frame_header,
3325			    protocol_family);
3326
3327		/*
3328		 * For partial checksum offload, we expect the driver to
3329		 * set the start offset indicating the start of the span
3330		 * that is covered by the hardware-computed checksum;
3331		 * adjust this start offset accordingly because the data
3332		 * pointer has been advanced beyond the link-layer header.
3333		 *
3334		 * Don't adjust if the interface is a bridge member, as
3335		 * the adjustment will occur from the context of the
3336		 * bridge interface during input.
3337		 */
3338		if (ifp->if_bridge == NULL && (m->m_pkthdr.csum_flags &
3339		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
3340		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
3341			int adj;
3342
3343			if (frame_header == NULL ||
3344			    frame_header < (char *)mbuf_datastart(m) ||
3345			    frame_header > (char *)m->m_data ||
3346			    (adj = (m->m_data - frame_header)) >
3347			    m->m_pkthdr.csum_rx_start) {
3348				m->m_pkthdr.csum_data = 0;
3349				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
3350				hwcksum_in_invalidated++;
3351			} else {
3352				m->m_pkthdr.csum_rx_start -= adj;
3353			}
3354		}
3355
3356		pktap_input(ifp, protocol_family, m, frame_header);
3357
3358		if (m->m_flags & (M_BCAST|M_MCAST))
3359			atomic_add_64(&ifp->if_imcasts, 1);
3360
3361		/* run interface filters, exclude VLAN packets PR-3586856 */
3362		if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
3363			error = dlil_interface_filters_input(ifp, &m,
3364			    &frame_header, protocol_family);
3365			if (error != 0) {
3366				if (error != EJUSTRETURN)
3367					m_freem(m);
3368				goto next;
3369			}
3370		}
3371		if (error != 0 || ((m->m_flags & M_PROMISC) != 0) ) {
3372			m_freem(m);
3373			goto next;
3374		}
3375
3376		/* Lookup the protocol attachment to this interface */
3377		if (protocol_family == 0) {
3378			ifproto = NULL;
3379		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
3380		    (last_ifproto->protocol_family == protocol_family)) {
3381			VERIFY(ifproto == NULL);
3382			ifproto = last_ifproto;
3383			if_proto_ref(last_ifproto);
3384		} else {
3385			VERIFY(ifproto == NULL);
3386			ifnet_lock_shared(ifp);
3387			/* callee holds a proto refcnt upon success */
3388			ifproto	= find_attached_proto(ifp, protocol_family);
3389			ifnet_lock_done(ifp);
3390		}
3391		if (ifproto == NULL) {
3392			/* no protocol for this packet, discard */
3393			m_freem(m);
3394			goto next;
3395		}
3396		if (ifproto != last_ifproto) {
3397			if (last_ifproto != NULL) {
3398				/* pass up the list for the previous protocol */
3399				dlil_ifproto_input(last_ifproto, pkt_first);
3400				pkt_first = NULL;
3401				if_proto_free(last_ifproto);
3402			}
3403			last_ifproto = ifproto;
3404			if_proto_ref(ifproto);
3405		}
3406		/* extend the list */
3407		m->m_pkthdr.pkt_hdr = frame_header;
3408		if (pkt_first == NULL) {
3409			pkt_first = m;
3410		} else {
3411			*pkt_next = m;
3412		}
3413		pkt_next = &m->m_nextpkt;
3414
3415next:
3416		if (next_packet == NULL && last_ifproto != NULL) {
3417			/* pass up the last list of packets */
3418			dlil_ifproto_input(last_ifproto, pkt_first);
3419			if_proto_free(last_ifproto);
3420			last_ifproto = NULL;
3421		}
3422		if (ifproto != NULL) {
3423			if_proto_free(ifproto);
3424			ifproto = NULL;
3425		}
3426
3427		m = next_packet;
3428
3429		/* update the driver's multicast filter, if needed */
3430		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0)
3431			ifp->if_updatemcasts = 0;
3432		if (iorefcnt == 1)
3433			ifnet_decr_iorefcnt(ifp);
3434	}
3435
3436	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0);
3437}
3438
3439errno_t
3440if_mcasts_update(struct ifnet *ifp)
3441{
3442	errno_t err;
3443
3444	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
3445	if (err == EAFNOSUPPORT)
3446		err = 0;
3447	printf("%s: %s %d suspended link-layer multicast membership(s) "
3448	    "(err=%d)\n", if_name(ifp),
3449	    (err == 0 ? "successfully restored" : "failed to restore"),
3450	    ifp->if_updatemcasts, err);
3451
3452	/* just return success */
3453	return (0);
3454}
3455
3456static int
3457dlil_event_internal(struct ifnet *ifp, struct kev_msg *event)
3458{
3459	struct ifnet_filter *filter;
3460
3461	/* Get an io ref count if the interface is attached */
3462	if (!ifnet_is_attached(ifp, 1))
3463		goto done;
3464
3465	/*
3466	 * Pass the event to the interface filters
3467	 */
3468	lck_mtx_lock_spin(&ifp->if_flt_lock);
3469	/* prevent filter list from changing in case we drop the lock */
3470	if_flt_monitor_busy(ifp);
3471	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3472		if (filter->filt_event != NULL) {
3473			lck_mtx_unlock(&ifp->if_flt_lock);
3474
3475			filter->filt_event(filter->filt_cookie, ifp,
3476			    filter->filt_protocol, event);
3477
3478			lck_mtx_lock_spin(&ifp->if_flt_lock);
3479		}
3480	}
3481	/* we're done with the filter list */
3482	if_flt_monitor_unbusy(ifp);
3483	lck_mtx_unlock(&ifp->if_flt_lock);
3484
3485	ifnet_lock_shared(ifp);
3486	if (ifp->if_proto_hash != NULL) {
3487		int i;
3488
3489		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
3490			struct if_proto *proto;
3491
3492			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
3493			    next_hash) {
3494				proto_media_event eventp =
3495				    (proto->proto_kpi == kProtoKPI_v1 ?
3496				    proto->kpi.v1.event :
3497				    proto->kpi.v2.event);
3498
3499				if (eventp != NULL) {
3500					if_proto_ref(proto);
3501					ifnet_lock_done(ifp);
3502
3503					eventp(ifp, proto->protocol_family,
3504					    event);
3505
3506					ifnet_lock_shared(ifp);
3507					if_proto_free(proto);
3508				}
3509			}
3510		}
3511	}
3512	ifnet_lock_done(ifp);
3513
3514	/* Pass the event to the interface */
3515	if (ifp->if_event != NULL)
3516		ifp->if_event(ifp, event);
3517
3518	/* Release the io ref count */
3519	ifnet_decr_iorefcnt(ifp);
3520
3521done:
3522	return (kev_post_msg(event));
3523}
3524
3525errno_t
3526ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
3527{
3528	struct kev_msg               kev_msg;
3529	int result = 0;
3530
3531	if (ifp == NULL || event == NULL)
3532		return (EINVAL);
3533
3534	bzero(&kev_msg, sizeof (kev_msg));
3535	kev_msg.vendor_code    = event->vendor_code;
3536	kev_msg.kev_class      = event->kev_class;
3537	kev_msg.kev_subclass   = event->kev_subclass;
3538	kev_msg.event_code     = event->event_code;
3539	kev_msg.dv[0].data_ptr = &event->event_data[0];
3540	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
3541	kev_msg.dv[1].data_length = 0;
3542
3543	result = dlil_event_internal(ifp, &kev_msg);
3544
3545	return (result);
3546}
3547
3548#if CONFIG_MACF_NET
3549#include <netinet/ip6.h>
3550#include <netinet/ip.h>
3551static int
3552dlil_get_socket_type(struct mbuf **mp, int family, int raw)
3553{
3554	struct mbuf *m;
3555	struct ip *ip;
3556	struct ip6_hdr *ip6;
3557	int type = SOCK_RAW;
3558
3559	if (!raw) {
3560		switch (family) {
3561		case PF_INET:
3562			m = m_pullup(*mp, sizeof(struct ip));
3563			if (m == NULL)
3564				break;
3565			*mp = m;
3566			ip = mtod(m, struct ip *);
3567			if (ip->ip_p == IPPROTO_TCP)
3568				type = SOCK_STREAM;
3569			else if (ip->ip_p == IPPROTO_UDP)
3570				type = SOCK_DGRAM;
3571			break;
3572		case PF_INET6:
3573			m = m_pullup(*mp, sizeof(struct ip6_hdr));
3574			if (m == NULL)
3575				break;
3576			*mp = m;
3577			ip6 = mtod(m, struct ip6_hdr *);
3578			if (ip6->ip6_nxt == IPPROTO_TCP)
3579				type = SOCK_STREAM;
3580			else if (ip6->ip6_nxt == IPPROTO_UDP)
3581				type = SOCK_DGRAM;
3582			break;
3583		}
3584	}
3585
3586	return (type);
3587}
3588#endif
3589
3590/*
3591 * This is mostly called from the context of the DLIL input thread;
3592 * because of that there is no need for atomic operations.
3593 */
3594static __inline void
3595ifp_inc_traffic_class_in(struct ifnet *ifp, struct mbuf *m)
3596{
3597	if (!(m->m_flags & M_PKTHDR))
3598		return;
3599
3600	switch (m_get_traffic_class(m)) {
3601	case MBUF_TC_BE:
3602		ifp->if_tc.ifi_ibepackets++;
3603		ifp->if_tc.ifi_ibebytes += m->m_pkthdr.len;
3604		break;
3605	case MBUF_TC_BK:
3606		ifp->if_tc.ifi_ibkpackets++;
3607		ifp->if_tc.ifi_ibkbytes += m->m_pkthdr.len;
3608		break;
3609	case MBUF_TC_VI:
3610		ifp->if_tc.ifi_ivipackets++;
3611		ifp->if_tc.ifi_ivibytes += m->m_pkthdr.len;
3612		break;
3613	case MBUF_TC_VO:
3614		ifp->if_tc.ifi_ivopackets++;
3615		ifp->if_tc.ifi_ivobytes += m->m_pkthdr.len;
3616		break;
3617	default:
3618		break;
3619	}
3620
3621	if (mbuf_is_traffic_class_privileged(m)) {
3622		ifp->if_tc.ifi_ipvpackets++;
3623		ifp->if_tc.ifi_ipvbytes += m->m_pkthdr.len;
3624	}
3625}
3626
3627/*
3628 * This is called from DLIL output, hence multiple threads could end
3629 * up modifying the statistics.  We trade off acccuracy for performance
3630 * by not using atomic operations here.
3631 */
3632static __inline void
3633ifp_inc_traffic_class_out(struct ifnet *ifp, struct mbuf *m)
3634{
3635	if (!(m->m_flags & M_PKTHDR))
3636		return;
3637
3638	switch (m_get_traffic_class(m)) {
3639	case MBUF_TC_BE:
3640		ifp->if_tc.ifi_obepackets++;
3641		ifp->if_tc.ifi_obebytes += m->m_pkthdr.len;
3642		break;
3643	case MBUF_TC_BK:
3644		ifp->if_tc.ifi_obkpackets++;
3645		ifp->if_tc.ifi_obkbytes += m->m_pkthdr.len;
3646		break;
3647	case MBUF_TC_VI:
3648		ifp->if_tc.ifi_ovipackets++;
3649		ifp->if_tc.ifi_ovibytes += m->m_pkthdr.len;
3650		break;
3651	case MBUF_TC_VO:
3652		ifp->if_tc.ifi_ovopackets++;
3653		ifp->if_tc.ifi_ovobytes += m->m_pkthdr.len;
3654		break;
3655	default:
3656		break;
3657	}
3658
3659	if (mbuf_is_traffic_class_privileged(m)) {
3660		ifp->if_tc.ifi_opvpackets++;
3661		ifp->if_tc.ifi_opvbytes += m->m_pkthdr.len;
3662	}
3663}
3664
3665/*
3666 * dlil_output
3667 *
3668 * Caller should have a lock on the protocol domain if the protocol
3669 * doesn't support finer grained locking. In most cases, the lock
3670 * will be held from the socket layer and won't be released until
3671 * we return back to the socket layer.
3672 *
3673 * This does mean that we must take a protocol lock before we take
3674 * an interface lock if we're going to take both. This makes sense
3675 * because a protocol is likely to interact with an ifp while it
3676 * is under the protocol lock.
3677 *
3678 * An advisory code will be returned if adv is not null. This
3679 * can be used to provide feedback about interface queues to the
3680 * application.
3681 */
3682errno_t
3683dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
3684    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
3685{
3686	char *frame_type = NULL;
3687	char *dst_linkaddr = NULL;
3688	int retval = 0;
3689	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
3690	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
3691	struct if_proto	*proto = NULL;
3692	mbuf_t	m;
3693	mbuf_t	send_head = NULL;
3694	mbuf_t	*send_tail = &send_head;
3695	int iorefcnt = 0;
3696	u_int32_t pre = 0, post = 0;
3697	u_int32_t fpkts = 0, fbytes = 0;
3698	int32_t flen = 0;
3699
3700	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
3701
3702	/* Get an io refcnt if the interface is attached to prevent ifnet_detach
3703	 * from happening while this operation is in progress */
3704	if (!ifnet_is_attached(ifp, 1)) {
3705		retval = ENXIO;
3706		goto cleanup;
3707	}
3708	iorefcnt = 1;
3709
3710	/* update the driver's multicast filter, if needed */
3711	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0)
3712		ifp->if_updatemcasts = 0;
3713
3714	frame_type = frame_type_buffer;
3715	dst_linkaddr = dst_linkaddr_buffer;
3716
3717	if (raw == 0) {
3718		ifnet_lock_shared(ifp);
3719		/* callee holds a proto refcnt upon success */
3720		proto = find_attached_proto(ifp, proto_family);
3721		if (proto == NULL) {
3722			ifnet_lock_done(ifp);
3723			retval = ENXIO;
3724			goto cleanup;
3725		}
3726		ifnet_lock_done(ifp);
3727	}
3728
3729preout_again:
3730	if (packetlist == NULL)
3731		goto cleanup;
3732
3733	m = packetlist;
3734	packetlist = packetlist->m_nextpkt;
3735	m->m_nextpkt = NULL;
3736
3737	if (raw == 0) {
3738		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
3739		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
3740		retval = 0;
3741		if (preoutp != NULL) {
3742			retval = preoutp(ifp, proto_family, &m, dest, route,
3743			    frame_type, dst_linkaddr);
3744
3745			if (retval != 0) {
3746				if (retval == EJUSTRETURN)
3747					goto preout_again;
3748				m_freem(m);
3749				goto cleanup;
3750			}
3751		}
3752	}
3753
3754#if CONFIG_MACF_NET
3755	retval = mac_ifnet_check_transmit(ifp, m, proto_family,
3756	    dlil_get_socket_type(&m, proto_family, raw));
3757	if (retval != 0) {
3758		m_freem(m);
3759		goto cleanup;
3760	}
3761#endif
3762
3763	do {
3764#if CONFIG_DTRACE
3765		if (!raw && proto_family == PF_INET) {
3766			struct ip *ip = mtod(m, struct ip*);
3767	                DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
3768				struct ip *, ip, struct ifnet *, ifp,
3769				struct ip *, ip, struct ip6_hdr *, NULL);
3770
3771		} else if (!raw && proto_family == PF_INET6) {
3772			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*);
3773			DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL,
3774				struct ip6_hdr *, ip6, struct ifnet*, ifp,
3775				struct ip*, NULL, struct ip6_hdr *, ip6);
3776		}
3777#endif /* CONFIG_DTRACE */
3778
3779		if (raw == 0 && ifp->if_framer != NULL) {
3780			int rcvif_set = 0;
3781
3782			/*
3783			 * If this is a broadcast packet that needs to be
3784			 * looped back into the system, set the inbound ifp
3785			 * to that of the outbound ifp.  This will allow
3786			 * us to determine that it is a legitimate packet
3787			 * for the system.  Only set the ifp if it's not
3788			 * already set, just to be safe.
3789			 */
3790			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
3791			    m->m_pkthdr.rcvif == NULL) {
3792				m->m_pkthdr.rcvif = ifp;
3793				rcvif_set = 1;
3794			}
3795
3796			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
3797			    frame_type, &pre, &post);
3798			if (retval != 0) {
3799				if (retval != EJUSTRETURN)
3800					m_freem(m);
3801				goto next;
3802			}
3803
3804			/*
3805			 * For partial checksum offload, adjust the start
3806			 * and stuff offsets based on the prepended header.
3807			 */
3808			if ((m->m_pkthdr.csum_flags &
3809			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
3810			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
3811				m->m_pkthdr.csum_tx_stuff += pre;
3812				m->m_pkthdr.csum_tx_start += pre;
3813			}
3814
3815			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK))
3816				dlil_output_cksum_dbg(ifp, m, pre,
3817				    proto_family);
3818
3819			/*
3820			 * Clear the ifp if it was set above, and to be
3821			 * safe, only if it is still the same as the
3822			 * outbound ifp we have in context.  If it was
3823			 * looped back, then a copy of it was sent to the
3824			 * loopback interface with the rcvif set, and we
3825			 * are clearing the one that will go down to the
3826			 * layer below.
3827			 */
3828			if (rcvif_set && m->m_pkthdr.rcvif == ifp)
3829				m->m_pkthdr.rcvif = NULL;
3830		}
3831
3832		/*
3833		 * Let interface filters (if any) do their thing ...
3834		 */
3835		/* Do not pass VLAN tagged packets to filters PR-3586856 */
3836		if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
3837			retval = dlil_interface_filters_output(ifp,
3838			    &m, proto_family);
3839			if (retval != 0) {
3840				if (retval != EJUSTRETURN)
3841					m_freem(m);
3842				goto next;
3843			}
3844		}
3845		/*
3846		 * Strip away M_PROTO1 bit prior to sending packet
3847		 * to the driver as this field may be used by the driver
3848		 */
3849		m->m_flags &= ~M_PROTO1;
3850
3851		/*
3852		 * If the underlying interface is not capable of handling a
3853		 * packet whose data portion spans across physically disjoint
3854		 * pages, we need to "normalize" the packet so that we pass
3855		 * down a chain of mbufs where each mbuf points to a span that
3856		 * resides in the system page boundary.  If the packet does
3857		 * not cross page(s), the following is a no-op.
3858		 */
3859		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
3860			if ((m = m_normalize(m)) == NULL)
3861				goto next;
3862		}
3863
3864		/*
3865		 * If this is a TSO packet, make sure the interface still
3866		 * advertise TSO capability.
3867		 */
3868		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
3869			retval = EMSGSIZE;
3870			m_freem(m);
3871			goto cleanup;
3872		}
3873
3874		/*
3875		 * If the packet service class is not background,
3876		 * update the timestamp to indicate recent activity
3877		 * on a foreground socket.
3878		 */
3879		if (!(m->m_pkthdr.pkt_flags & PKTF_SO_BACKGROUND) &&
3880		    (m->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
3881		    m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB)
3882			ifp->if_fg_sendts = net_uptime();
3883
3884		ifp_inc_traffic_class_out(ifp, m);
3885		pktap_output(ifp, proto_family, m, pre, post);
3886
3887		/*
3888		 * Finally, call the driver.
3889		 */
3890		if (ifp->if_eflags & IFEF_SENDLIST) {
3891			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
3892				flen += (m_pktlen(m) - (pre + post));
3893				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
3894			}
3895			*send_tail = m;
3896			send_tail = &m->m_nextpkt;
3897		} else {
3898			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
3899				flen = (m_pktlen(m) - (pre + post));
3900				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
3901			} else {
3902				flen = 0;
3903			}
3904			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
3905			    0, 0, 0, 0, 0);
3906			retval = (*ifp->if_output)(ifp, m);
3907			if (retval == EQFULL || retval == EQSUSPENDED) {
3908				if (adv != NULL && adv->code == FADV_SUCCESS) {
3909					adv->code = (retval == EQFULL ?
3910					    FADV_FLOW_CONTROLLED :
3911					    FADV_SUSPENDED);
3912				}
3913				retval = 0;
3914			}
3915			if (retval == 0 && flen > 0) {
3916				fbytes += flen;
3917				fpkts++;
3918			}
3919			if (retval != 0 && dlil_verbose) {
3920				printf("%s: output error on %s retval = %d\n",
3921				    __func__, if_name(ifp),
3922				    retval);
3923			}
3924			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
3925			    0, 0, 0, 0, 0);
3926		}
3927		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3928
3929next:
3930		m = packetlist;
3931		if (m != NULL) {
3932			packetlist = packetlist->m_nextpkt;
3933			m->m_nextpkt = NULL;
3934		}
3935	} while (m != NULL);
3936
3937	if (send_head != NULL) {
3938		VERIFY(ifp->if_eflags & IFEF_SENDLIST);
3939		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
3940		    0, 0, 0, 0, 0);
3941		retval = (*ifp->if_output)(ifp, send_head);
3942		if (retval == EQFULL || retval == EQSUSPENDED) {
3943			if (adv != NULL) {
3944				adv->code = (retval == EQFULL ?
3945				    FADV_FLOW_CONTROLLED : FADV_SUSPENDED);
3946			}
3947			retval = 0;
3948		}
3949		if (retval == 0 && flen > 0) {
3950			fbytes += flen;
3951			fpkts++;
3952		}
3953		if (retval != 0 && dlil_verbose) {
3954			printf("%s: output error on %s retval = %d\n",
3955			    __func__, if_name(ifp), retval);
3956		}
3957		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3958	}
3959
3960	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3961
3962cleanup:
3963	if (fbytes > 0)
3964		ifp->if_fbytes += fbytes;
3965	if (fpkts > 0)
3966		ifp->if_fpackets += fpkts;
3967	if (proto != NULL)
3968		if_proto_free(proto);
3969	if (packetlist) /* if any packets are left, clean up */
3970		mbuf_freem_list(packetlist);
3971	if (retval == EJUSTRETURN)
3972		retval = 0;
3973	if (iorefcnt == 1)
3974		ifnet_decr_iorefcnt(ifp);
3975
3976	return (retval);
3977}
3978
3979errno_t
3980ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
3981    void *ioctl_arg)
3982{
3983	struct ifnet_filter *filter;
3984	int retval = EOPNOTSUPP;
3985	int result = 0;
3986
3987	if (ifp == NULL || ioctl_code == 0)
3988		return (EINVAL);
3989
3990	/* Get an io ref count if the interface is attached */
3991	if (!ifnet_is_attached(ifp, 1))
3992		return (EOPNOTSUPP);
3993
3994	/* Run the interface filters first.
3995	 * We want to run all filters before calling the protocol,
3996	 * interface family, or interface.
3997	 */
3998	lck_mtx_lock_spin(&ifp->if_flt_lock);
3999	/* prevent filter list from changing in case we drop the lock */
4000	if_flt_monitor_busy(ifp);
4001	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
4002		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
4003		    filter->filt_protocol == proto_fam)) {
4004			lck_mtx_unlock(&ifp->if_flt_lock);
4005
4006			result = filter->filt_ioctl(filter->filt_cookie, ifp,
4007			    proto_fam, ioctl_code, ioctl_arg);
4008
4009			lck_mtx_lock_spin(&ifp->if_flt_lock);
4010
4011			/* Only update retval if no one has handled the ioctl */
4012			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
4013				if (result == ENOTSUP)
4014					result = EOPNOTSUPP;
4015				retval = result;
4016				if (retval != 0 && retval != EOPNOTSUPP) {
4017					/* we're done with the filter list */
4018					if_flt_monitor_unbusy(ifp);
4019					lck_mtx_unlock(&ifp->if_flt_lock);
4020					goto cleanup;
4021				}
4022			}
4023		}
4024	}
4025	/* we're done with the filter list */
4026	if_flt_monitor_unbusy(ifp);
4027	lck_mtx_unlock(&ifp->if_flt_lock);
4028
4029	/* Allow the protocol to handle the ioctl */
4030	if (proto_fam != 0) {
4031		struct if_proto	*proto;
4032
4033		/* callee holds a proto refcnt upon success */
4034		ifnet_lock_shared(ifp);
4035		proto = find_attached_proto(ifp, proto_fam);
4036		ifnet_lock_done(ifp);
4037		if (proto != NULL) {
4038			proto_media_ioctl ioctlp =
4039			    (proto->proto_kpi == kProtoKPI_v1 ?
4040			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
4041			result = EOPNOTSUPP;
4042			if (ioctlp != NULL)
4043				result = ioctlp(ifp, proto_fam, ioctl_code,
4044				    ioctl_arg);
4045			if_proto_free(proto);
4046
4047			/* Only update retval if no one has handled the ioctl */
4048			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
4049				if (result == ENOTSUP)
4050					result = EOPNOTSUPP;
4051				retval = result;
4052				if (retval && retval != EOPNOTSUPP)
4053					goto cleanup;
4054			}
4055		}
4056	}
4057
4058	/* retval is either 0 or EOPNOTSUPP */
4059
4060	/*
4061	 * Let the interface handle this ioctl.
4062	 * If it returns EOPNOTSUPP, ignore that, we may have
4063	 * already handled this in the protocol or family.
4064	 */
4065	if (ifp->if_ioctl)
4066		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
4067
4068	/* Only update retval if no one has handled the ioctl */
4069	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
4070		if (result == ENOTSUP)
4071			result = EOPNOTSUPP;
4072		retval = result;
4073		if (retval && retval != EOPNOTSUPP) {
4074			goto cleanup;
4075		}
4076	}
4077
4078cleanup:
4079	if (retval == EJUSTRETURN)
4080		retval = 0;
4081
4082	ifnet_decr_iorefcnt(ifp);
4083
4084	return (retval);
4085}
4086
4087__private_extern__ errno_t
4088dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
4089{
4090	errno_t	error = 0;
4091
4092
4093	if (ifp->if_set_bpf_tap) {
4094		/* Get an io reference on the interface if it is attached */
4095		if (!ifnet_is_attached(ifp, 1))
4096			return ENXIO;
4097		error = ifp->if_set_bpf_tap(ifp, mode, callback);
4098		ifnet_decr_iorefcnt(ifp);
4099	}
4100	return (error);
4101}
4102
4103errno_t
4104dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
4105    struct sockaddr *ll_addr, size_t ll_len)
4106{
4107	errno_t	result = EOPNOTSUPP;
4108	struct if_proto *proto;
4109	const struct sockaddr *verify;
4110	proto_media_resolve_multi resolvep;
4111
4112	if (!ifnet_is_attached(ifp, 1))
4113		return result;
4114
4115	bzero(ll_addr, ll_len);
4116
4117	/* Call the protocol first; callee holds a proto refcnt upon success */
4118	ifnet_lock_shared(ifp);
4119	proto = find_attached_proto(ifp, proto_addr->sa_family);
4120	ifnet_lock_done(ifp);
4121	if (proto != NULL) {
4122		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
4123		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
4124		if (resolvep != NULL)
4125			result = resolvep(ifp, proto_addr,
4126			    (struct sockaddr_dl*)(void *)ll_addr, ll_len);
4127		if_proto_free(proto);
4128	}
4129
4130	/* Let the interface verify the multicast address */
4131	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
4132		if (result == 0)
4133			verify = ll_addr;
4134		else
4135			verify = proto_addr;
4136		result = ifp->if_check_multi(ifp, verify);
4137	}
4138
4139	ifnet_decr_iorefcnt(ifp);
4140	return (result);
4141}
4142
4143__private_extern__ errno_t
4144dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
4145    const struct sockaddr_dl* sender_hw, const struct sockaddr* sender_proto,
4146    const struct sockaddr_dl* target_hw, const struct sockaddr* target_proto)
4147{
4148	struct if_proto *proto;
4149	errno_t	result = 0;
4150
4151	/* callee holds a proto refcnt upon success */
4152	ifnet_lock_shared(ifp);
4153	proto = find_attached_proto(ifp, target_proto->sa_family);
4154	ifnet_lock_done(ifp);
4155	if (proto == NULL) {
4156		result = ENOTSUP;
4157	} else {
4158		proto_media_send_arp	arpp;
4159		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
4160		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
4161		if (arpp == NULL) {
4162			result = ENOTSUP;
4163		} else {
4164			switch (arpop) {
4165			case ARPOP_REQUEST:
4166				arpstat.txrequests++;
4167				if (target_hw != NULL)
4168					arpstat.txurequests++;
4169				break;
4170			case ARPOP_REPLY:
4171				arpstat.txreplies++;
4172				break;
4173			}
4174			result = arpp(ifp, arpop, sender_hw, sender_proto,
4175			    target_hw, target_proto);
4176		}
4177		if_proto_free(proto);
4178	}
4179
4180	return (result);
4181}
4182
4183struct net_thread_marks { };
4184static const struct net_thread_marks net_thread_marks_base = { };
4185
4186__private_extern__ const net_thread_marks_t net_thread_marks_none =
4187    &net_thread_marks_base;
4188
4189__private_extern__ net_thread_marks_t
4190net_thread_marks_push(u_int32_t push)
4191{
4192	static const char *const base = (const void*)&net_thread_marks_base;
4193	u_int32_t pop = 0;
4194
4195	if (push != 0) {
4196		struct uthread *uth = get_bsdthread_info(current_thread());
4197
4198		pop = push & ~uth->uu_network_marks;
4199		if (pop != 0)
4200			uth->uu_network_marks |= pop;
4201	}
4202
4203	return ((net_thread_marks_t)&base[pop]);
4204}
4205
4206__private_extern__ net_thread_marks_t
4207net_thread_unmarks_push(u_int32_t unpush)
4208{
4209	static const char *const base = (const void*)&net_thread_marks_base;
4210	u_int32_t unpop = 0;
4211
4212	if (unpush != 0) {
4213		struct uthread *uth = get_bsdthread_info(current_thread());
4214
4215		unpop = unpush & uth->uu_network_marks;
4216		if (unpop != 0)
4217			uth->uu_network_marks &= ~unpop;
4218	}
4219
4220	return ((net_thread_marks_t)&base[unpop]);
4221}
4222
4223__private_extern__ void
4224net_thread_marks_pop(net_thread_marks_t popx)
4225{
4226	static const char *const base = (const void*)&net_thread_marks_base;
4227	ptrdiff_t pop = (caddr_t)popx - (caddr_t)base;
4228
4229	if (pop != 0) {
4230		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
4231		struct uthread *uth = get_bsdthread_info(current_thread());
4232
4233		VERIFY((pop & ones) == pop);
4234		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
4235		uth->uu_network_marks &= ~pop;
4236	}
4237}
4238
4239__private_extern__ void
4240net_thread_unmarks_pop(net_thread_marks_t unpopx)
4241{
4242	static const char *const base = (const void*)&net_thread_marks_base;
4243	ptrdiff_t unpop = (caddr_t)unpopx - (caddr_t)base;
4244
4245	if (unpop != 0) {
4246		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
4247		struct uthread *uth = get_bsdthread_info(current_thread());
4248
4249		VERIFY((unpop & ones) == unpop);
4250		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
4251		uth->uu_network_marks |= unpop;
4252	}
4253}
4254
4255__private_extern__ u_int32_t
4256net_thread_is_marked(u_int32_t check)
4257{
4258	if (check != 0) {
4259		struct uthread *uth = get_bsdthread_info(current_thread());
4260		return (uth->uu_network_marks & check);
4261	}
4262	else
4263		return (0);
4264}
4265
4266__private_extern__ u_int32_t
4267net_thread_is_unmarked(u_int32_t check)
4268{
4269	if (check != 0) {
4270		struct uthread *uth = get_bsdthread_info(current_thread());
4271		return (~uth->uu_network_marks & check);
4272	}
4273	else
4274		return (0);
4275}
4276
4277static __inline__ int
4278_is_announcement(const struct sockaddr_in * sender_sin,
4279    const struct sockaddr_in * target_sin)
4280{
4281	if (sender_sin == NULL) {
4282		return (FALSE);
4283	}
4284	return (sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr);
4285}
4286
4287__private_extern__ errno_t
4288dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw,
4289    const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw,
4290    const struct sockaddr* target_proto0, u_int32_t rtflags)
4291{
4292	errno_t	result = 0;
4293	const struct sockaddr_in * sender_sin;
4294	const struct sockaddr_in * target_sin;
4295	struct sockaddr_inarp target_proto_sinarp;
4296	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;
4297
4298	if (target_proto == NULL || (sender_proto != NULL &&
4299	    sender_proto->sa_family != target_proto->sa_family))
4300		return (EINVAL);
4301
4302	/*
4303	 * If the target is a (default) router, provide that
4304	 * information to the send_arp callback routine.
4305	 */
4306	if (rtflags & RTF_ROUTER) {
4307		bcopy(target_proto, &target_proto_sinarp,
4308		    sizeof (struct sockaddr_in));
4309		target_proto_sinarp.sin_other |= SIN_ROUTER;
4310		target_proto = (struct sockaddr *)&target_proto_sinarp;
4311	}
4312
4313	/*
4314	 * If this is an ARP request and the target IP is IPv4LL,
4315	 * send the request on all interfaces.  The exception is
4316	 * an announcement, which must only appear on the specific
4317	 * interface.
4318	 */
4319	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
4320	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
4321	if (target_proto->sa_family == AF_INET &&
4322	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
4323	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
4324	    !_is_announcement(target_sin, sender_sin)) {
4325		ifnet_t		*ifp_list;
4326		u_int32_t	count;
4327		u_int32_t	ifp_on;
4328
4329		result = ENOTSUP;
4330
4331		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
4332			for (ifp_on = 0; ifp_on < count; ifp_on++) {
4333				errno_t new_result;
4334				ifaddr_t source_hw = NULL;
4335				ifaddr_t source_ip = NULL;
4336				struct sockaddr_in source_ip_copy;
4337				struct ifnet *cur_ifp = ifp_list[ifp_on];
4338
4339				/*
4340				 * Only arp on interfaces marked for IPv4LL
4341				 * ARPing.  This may mean that we don't ARP on
4342				 * the interface the subnet route points to.
4343				 */
4344				if (!(cur_ifp->if_eflags & IFEF_ARPLL))
4345					continue;
4346
4347				/* Find the source IP address */
4348				ifnet_lock_shared(cur_ifp);
4349				source_hw = cur_ifp->if_lladdr;
4350				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
4351				    ifa_link) {
4352					IFA_LOCK(source_ip);
4353					if (source_ip->ifa_addr != NULL &&
4354					    source_ip->ifa_addr->sa_family ==
4355					    AF_INET) {
4356						/* Copy the source IP address */
4357						source_ip_copy =
4358						    *(struct sockaddr_in *)
4359						    (void *)source_ip->ifa_addr;
4360						IFA_UNLOCK(source_ip);
4361						break;
4362					}
4363					IFA_UNLOCK(source_ip);
4364				}
4365
4366				/* No IP Source, don't arp */
4367				if (source_ip == NULL) {
4368					ifnet_lock_done(cur_ifp);
4369					continue;
4370				}
4371
4372				IFA_ADDREF(source_hw);
4373				ifnet_lock_done(cur_ifp);
4374
4375				/* Send the ARP */
4376				new_result = dlil_send_arp_internal(cur_ifp,
4377				    arpop, (struct sockaddr_dl *)(void *)
4378				    source_hw->ifa_addr,
4379				    (struct sockaddr *)&source_ip_copy, NULL,
4380				    target_proto);
4381
4382				IFA_REMREF(source_hw);
4383				if (result == ENOTSUP) {
4384					result = new_result;
4385				}
4386			}
4387			ifnet_list_free(ifp_list);
4388		}
4389	} else {
4390		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
4391		    sender_proto, target_hw, target_proto);
4392	}
4393
4394	return (result);
4395}
4396
4397/*
4398 * Caller must hold ifnet head lock.
4399 */
4400static int
4401ifnet_lookup(struct ifnet *ifp)
4402{
4403	struct ifnet *_ifp;
4404
4405	lck_rw_assert(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
4406	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
4407		if (_ifp == ifp)
4408			break;
4409	}
4410	return (_ifp != NULL);
4411}
4412/*
4413 * Caller has to pass a non-zero refio argument to get a
4414 * IO reference count. This will prevent ifnet_detach from
4415 * being called when there are outstanding io reference counts.
4416 */
4417int
4418ifnet_is_attached(struct ifnet *ifp, int refio)
4419{
4420	int ret;
4421
4422	lck_mtx_lock_spin(&ifp->if_ref_lock);
4423	if ((ret = ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) ==
4424	    IFRF_ATTACHED))) {
4425		if (refio > 0)
4426			ifp->if_refio++;
4427	}
4428	lck_mtx_unlock(&ifp->if_ref_lock);
4429
4430	return (ret);
4431}
4432
4433void
4434ifnet_decr_iorefcnt(struct ifnet *ifp)
4435{
4436	lck_mtx_lock_spin(&ifp->if_ref_lock);
4437	VERIFY(ifp->if_refio > 0);
4438	VERIFY((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != 0);
4439	ifp->if_refio--;
4440
4441	/* if there are no more outstanding io references, wakeup the
4442	 * ifnet_detach thread if detaching flag is set.
4443	 */
4444	if (ifp->if_refio == 0 &&
4445		(ifp->if_refflags & IFRF_DETACHING) != 0) {
4446		wakeup(&(ifp->if_refio));
4447	}
4448	lck_mtx_unlock(&ifp->if_ref_lock);
4449}
4450
4451static void
4452dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
4453{
4454	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
4455	ctrace_t *tr;
4456	u_int32_t idx;
4457	u_int16_t *cnt;
4458
4459	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
4460		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
4461		/* NOTREACHED */
4462	}
4463
4464	if (refhold) {
4465		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
4466		tr = dl_if_dbg->dldbg_if_refhold;
4467	} else {
4468		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
4469		tr = dl_if_dbg->dldbg_if_refrele;
4470	}
4471
4472	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
4473	ctrace_record(&tr[idx]);
4474}
4475
4476errno_t
4477dlil_if_ref(struct ifnet *ifp)
4478{
4479	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4480
4481	if (dl_if == NULL)
4482		return (EINVAL);
4483
4484	lck_mtx_lock_spin(&dl_if->dl_if_lock);
4485	++dl_if->dl_if_refcnt;
4486	if (dl_if->dl_if_refcnt == 0) {
4487		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
4488		/* NOTREACHED */
4489	}
4490	if (dl_if->dl_if_trace != NULL)
4491		(*dl_if->dl_if_trace)(dl_if, TRUE);
4492	lck_mtx_unlock(&dl_if->dl_if_lock);
4493
4494	return (0);
4495}
4496
4497errno_t
4498dlil_if_free(struct ifnet *ifp)
4499{
4500	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4501
4502	if (dl_if == NULL)
4503		return (EINVAL);
4504
4505	lck_mtx_lock_spin(&dl_if->dl_if_lock);
4506	if (dl_if->dl_if_refcnt == 0) {
4507		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
4508		/* NOTREACHED */
4509	}
4510	--dl_if->dl_if_refcnt;
4511	if (dl_if->dl_if_trace != NULL)
4512		(*dl_if->dl_if_trace)(dl_if, FALSE);
4513	lck_mtx_unlock(&dl_if->dl_if_lock);
4514
4515	return (0);
4516}
4517
4518static errno_t
4519dlil_attach_protocol_internal(struct if_proto *proto,
4520    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count)
4521{
4522	struct kev_dl_proto_data ev_pr_data;
4523	struct ifnet *ifp = proto->ifp;
4524	int retval = 0;
4525	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
4526	struct if_proto *prev_proto;
4527	struct if_proto *_proto;
4528
4529	/* callee holds a proto refcnt upon success */
4530	ifnet_lock_exclusive(ifp);
4531	_proto = find_attached_proto(ifp, proto->protocol_family);
4532	if (_proto != NULL) {
4533		ifnet_lock_done(ifp);
4534		if_proto_free(_proto);
4535		return (EEXIST);
4536	}
4537
4538	/*
4539	 * Call family module add_proto routine so it can refine the
4540	 * demux descriptors as it wishes.
4541	 */
4542	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
4543	    demux_count);
4544	if (retval) {
4545		ifnet_lock_done(ifp);
4546		return (retval);
4547	}
4548
4549	/*
4550	 * Insert the protocol in the hash
4551	 */
4552	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
4553	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL)
4554		prev_proto = SLIST_NEXT(prev_proto, next_hash);
4555	if (prev_proto)
4556		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
4557	else
4558		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
4559		    proto, next_hash);
4560
4561	/* hold a proto refcnt for attach */
4562	if_proto_ref(proto);
4563
4564	/*
4565	 * The reserved field carries the number of protocol still attached
4566	 * (subject to change)
4567	 */
4568	ev_pr_data.proto_family = proto->protocol_family;
4569	ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp);
4570	ifnet_lock_done(ifp);
4571
4572	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
4573	    (struct net_event_data *)&ev_pr_data,
4574	    sizeof (struct kev_dl_proto_data));
4575	return (retval);
4576}
4577
4578errno_t
4579ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
4580    const struct ifnet_attach_proto_param *proto_details)
4581{
4582	int retval = 0;
4583	struct if_proto  *ifproto = NULL;
4584
4585	ifnet_head_lock_shared();
4586	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
4587		retval = EINVAL;
4588		goto end;
4589	}
4590	/* Check that the interface is in the global list */
4591	if (!ifnet_lookup(ifp)) {
4592		retval = ENXIO;
4593		goto end;
4594	}
4595
4596	ifproto = zalloc(dlif_proto_zone);
4597	if (ifproto == NULL) {
4598		retval = ENOMEM;
4599		goto end;
4600	}
4601	bzero(ifproto, dlif_proto_size);
4602
4603	/* refcnt held above during lookup */
4604	ifproto->ifp = ifp;
4605	ifproto->protocol_family = protocol;
4606	ifproto->proto_kpi = kProtoKPI_v1;
4607	ifproto->kpi.v1.input = proto_details->input;
4608	ifproto->kpi.v1.pre_output = proto_details->pre_output;
4609	ifproto->kpi.v1.event = proto_details->event;
4610	ifproto->kpi.v1.ioctl = proto_details->ioctl;
4611	ifproto->kpi.v1.detached = proto_details->detached;
4612	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
4613	ifproto->kpi.v1.send_arp = proto_details->send_arp;
4614
4615	retval = dlil_attach_protocol_internal(ifproto,
4616	    proto_details->demux_list, proto_details->demux_count);
4617
4618	if (dlil_verbose) {
4619		printf("%s: attached v1 protocol %d\n", if_name(ifp),
4620		    protocol);
4621	}
4622
4623end:
4624	if (retval != 0 && retval != EEXIST && ifp != NULL) {
4625		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
4626		    if_name(ifp), protocol, retval);
4627	}
4628	ifnet_head_done();
4629	if (retval != 0  && ifproto != NULL)
4630		zfree(dlif_proto_zone, ifproto);
4631	return (retval);
4632}
4633
4634errno_t
4635ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
4636    const struct ifnet_attach_proto_param_v2 *proto_details)
4637{
4638	int retval = 0;
4639	struct if_proto  *ifproto = NULL;
4640
4641	ifnet_head_lock_shared();
4642	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
4643		retval = EINVAL;
4644		goto end;
4645	}
4646	/* Check that the interface is in the global list */
4647	if (!ifnet_lookup(ifp)) {
4648		retval = ENXIO;
4649		goto end;
4650	}
4651
4652	ifproto = zalloc(dlif_proto_zone);
4653	if (ifproto == NULL) {
4654		retval = ENOMEM;
4655		goto end;
4656	}
4657	bzero(ifproto, sizeof(*ifproto));
4658
4659	/* refcnt held above during lookup */
4660	ifproto->ifp = ifp;
4661	ifproto->protocol_family = protocol;
4662	ifproto->proto_kpi = kProtoKPI_v2;
4663	ifproto->kpi.v2.input = proto_details->input;
4664	ifproto->kpi.v2.pre_output = proto_details->pre_output;
4665	ifproto->kpi.v2.event = proto_details->event;
4666	ifproto->kpi.v2.ioctl = proto_details->ioctl;
4667	ifproto->kpi.v2.detached = proto_details->detached;
4668	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
4669	ifproto->kpi.v2.send_arp = proto_details->send_arp;
4670
4671	retval = dlil_attach_protocol_internal(ifproto,
4672	    proto_details->demux_list, proto_details->demux_count);
4673
4674	if (dlil_verbose) {
4675		printf("%s: attached v2 protocol %d\n", if_name(ifp),
4676		    protocol);
4677	}
4678
4679end:
4680	if (retval != 0 && retval != EEXIST && ifp != NULL) {
4681		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
4682		    if_name(ifp), protocol, retval);
4683	}
4684	ifnet_head_done();
4685	if (retval != 0 && ifproto != NULL)
4686		zfree(dlif_proto_zone, ifproto);
4687	return (retval);
4688}
4689
4690errno_t
4691ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
4692{
4693	struct if_proto *proto = NULL;
4694	int	retval = 0;
4695
4696	if (ifp == NULL || proto_family == 0) {
4697		retval = EINVAL;
4698		goto end;
4699	}
4700
4701	ifnet_lock_exclusive(ifp);
4702	/* callee holds a proto refcnt upon success */
4703	proto = find_attached_proto(ifp, proto_family);
4704	if (proto == NULL) {
4705		retval = ENXIO;
4706		ifnet_lock_done(ifp);
4707		goto end;
4708	}
4709
4710	/* call family module del_proto */
4711	if (ifp->if_del_proto)
4712		ifp->if_del_proto(ifp, proto->protocol_family);
4713
4714	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
4715	    proto, if_proto, next_hash);
4716
4717	if (proto->proto_kpi == kProtoKPI_v1) {
4718		proto->kpi.v1.input = ifproto_media_input_v1;
4719		proto->kpi.v1.pre_output= ifproto_media_preout;
4720		proto->kpi.v1.event = ifproto_media_event;
4721		proto->kpi.v1.ioctl = ifproto_media_ioctl;
4722		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
4723		proto->kpi.v1.send_arp = ifproto_media_send_arp;
4724	} else {
4725		proto->kpi.v2.input = ifproto_media_input_v2;
4726		proto->kpi.v2.pre_output = ifproto_media_preout;
4727		proto->kpi.v2.event = ifproto_media_event;
4728		proto->kpi.v2.ioctl = ifproto_media_ioctl;
4729		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
4730		proto->kpi.v2.send_arp = ifproto_media_send_arp;
4731	}
4732	proto->detached = 1;
4733	ifnet_lock_done(ifp);
4734
4735	if (dlil_verbose) {
4736		printf("%s: detached %s protocol %d\n", if_name(ifp),
4737		    (proto->proto_kpi == kProtoKPI_v1) ?
4738		    "v1" : "v2", proto_family);
4739	}
4740
4741	/* release proto refcnt held during protocol attach */
4742	if_proto_free(proto);
4743
4744	/*
4745	 * Release proto refcnt held during lookup; the rest of
4746	 * protocol detach steps will happen when the last proto
4747	 * reference is released.
4748	 */
4749	if_proto_free(proto);
4750
4751end:
4752	return (retval);
4753}
4754
4755
4756static errno_t
4757ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
4758    struct mbuf *packet, char *header)
4759{
4760#pragma unused(ifp, protocol, packet, header)
4761	return (ENXIO);
4762}
4763
4764static errno_t
4765ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
4766    struct mbuf *packet)
4767{
4768#pragma unused(ifp, protocol, packet)
4769	return (ENXIO);
4770
4771}
4772
4773static errno_t
4774ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
4775    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
4776    char *link_layer_dest)
4777{
4778#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
4779	return (ENXIO);
4780
4781}
4782
4783static void
4784ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
4785    const struct kev_msg *event)
4786{
4787#pragma unused(ifp, protocol, event)
4788}
4789
4790static errno_t
4791ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
4792    unsigned long command, void *argument)
4793{
4794#pragma unused(ifp, protocol, command, argument)
4795	return (ENXIO);
4796}
4797
4798static errno_t
4799ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
4800    struct sockaddr_dl *out_ll, size_t ll_len)
4801{
4802#pragma unused(ifp, proto_addr, out_ll, ll_len)
4803	return (ENXIO);
4804}
4805
4806static errno_t
4807ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
4808    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
4809    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
4810{
4811#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
4812	return (ENXIO);
4813}
4814
4815extern int if_next_index(void);
4816
4817errno_t
4818ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
4819{
4820	struct ifnet *tmp_if;
4821	struct ifaddr *ifa;
4822	struct if_data_internal if_data_saved;
4823	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4824	struct dlil_threading_info *dl_inp;
4825	u_int32_t sflags = 0;
4826	int err;
4827
4828	if (ifp == NULL)
4829		return (EINVAL);
4830
4831	/*
4832	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
4833	 * prevent the interface from being configured while it is
4834	 * embryonic, as ifnet_head_lock is dropped and reacquired
4835	 * below prior to marking the ifnet with IFRF_ATTACHED.
4836	 */
4837	dlil_if_lock();
4838	ifnet_head_lock_exclusive();
4839	/* Verify we aren't already on the list */
4840	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
4841		if (tmp_if == ifp) {
4842			ifnet_head_done();
4843			dlil_if_unlock();
4844			return (EEXIST);
4845		}
4846	}
4847
4848	lck_mtx_lock_spin(&ifp->if_ref_lock);
4849	if (ifp->if_refflags & IFRF_ATTACHED) {
4850		panic_plain("%s: flags mismatch (attached set) ifp=%p",
4851		    __func__, ifp);
4852		/* NOTREACHED */
4853	}
4854	lck_mtx_unlock(&ifp->if_ref_lock);
4855
4856	ifnet_lock_exclusive(ifp);
4857
4858	/* Sanity check */
4859	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
4860	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
4861
4862	if (ll_addr != NULL) {
4863		if (ifp->if_addrlen == 0) {
4864			ifp->if_addrlen = ll_addr->sdl_alen;
4865		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
4866			ifnet_lock_done(ifp);
4867			ifnet_head_done();
4868			dlil_if_unlock();
4869			return (EINVAL);
4870		}
4871	}
4872
4873	/*
4874	 * Allow interfaces without protocol families to attach
4875	 * only if they have the necessary fields filled out.
4876	 */
4877	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
4878		DLIL_PRINTF("%s: Attempt to attach interface without "
4879		    "family module - %d\n", __func__, ifp->if_family);
4880		ifnet_lock_done(ifp);
4881		ifnet_head_done();
4882		dlil_if_unlock();
4883		return (ENODEV);
4884	}
4885
4886	/* Allocate protocol hash table */
4887	VERIFY(ifp->if_proto_hash == NULL);
4888	ifp->if_proto_hash = zalloc(dlif_phash_zone);
4889	if (ifp->if_proto_hash == NULL) {
4890		ifnet_lock_done(ifp);
4891		ifnet_head_done();
4892		dlil_if_unlock();
4893		return (ENOBUFS);
4894	}
4895	bzero(ifp->if_proto_hash, dlif_phash_size);
4896
4897	lck_mtx_lock_spin(&ifp->if_flt_lock);
4898	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
4899	TAILQ_INIT(&ifp->if_flt_head);
4900	VERIFY(ifp->if_flt_busy == 0);
4901	VERIFY(ifp->if_flt_waiters == 0);
4902	lck_mtx_unlock(&ifp->if_flt_lock);
4903
4904	VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead));
4905	TAILQ_INIT(&ifp->if_prefixhead);
4906
4907	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
4908		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
4909		LIST_INIT(&ifp->if_multiaddrs);
4910	}
4911
4912	VERIFY(ifp->if_allhostsinm == NULL);
4913	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
4914	TAILQ_INIT(&ifp->if_addrhead);
4915
4916	if (ifp->if_index == 0) {
4917		int idx = if_next_index();
4918
4919		if (idx == -1) {
4920			ifp->if_index = 0;
4921			ifnet_lock_done(ifp);
4922			ifnet_head_done();
4923			dlil_if_unlock();
4924			return (ENOBUFS);
4925		}
4926		ifp->if_index = idx;
4927	}
4928	/* There should not be anything occupying this slot */
4929	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
4930
4931	/* allocate (if needed) and initialize a link address */
4932	VERIFY(!(dl_if->dl_if_flags & DLIF_REUSE) || ifp->if_lladdr != NULL);
4933	ifa = dlil_alloc_lladdr(ifp, ll_addr);
4934	if (ifa == NULL) {
4935		ifnet_lock_done(ifp);
4936		ifnet_head_done();
4937		dlil_if_unlock();
4938		return (ENOBUFS);
4939	}
4940
4941	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
4942	ifnet_addrs[ifp->if_index - 1] = ifa;
4943
4944	/* make this address the first on the list */
4945	IFA_LOCK(ifa);
4946	/* hold a reference for ifnet_addrs[] */
4947	IFA_ADDREF_LOCKED(ifa);
4948	/* if_attach_link_ifa() holds a reference for ifa_link */
4949	if_attach_link_ifa(ifp, ifa);
4950	IFA_UNLOCK(ifa);
4951
4952#if CONFIG_MACF_NET
4953	mac_ifnet_label_associate(ifp);
4954#endif
4955
4956	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
4957	ifindex2ifnet[ifp->if_index] = ifp;
4958
4959	/* Hold a reference to the underlying dlil_ifnet */
4960	ifnet_reference(ifp);
4961
4962	/* Clear stats (save and restore other fields that we care) */
4963	if_data_saved = ifp->if_data;
4964	bzero(&ifp->if_data, sizeof (ifp->if_data));
4965	ifp->if_data.ifi_type = if_data_saved.ifi_type;
4966	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
4967	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
4968	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
4969	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
4970	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
4971	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
4972	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
4973	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
4974	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
4975	ifnet_touch_lastchange(ifp);
4976
4977	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
4978	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED);
4979
4980	/* By default, use SFB and enable flow advisory */
4981	sflags = PKTSCHEDF_QALG_SFB;
4982	if (if_flowadv)
4983		sflags |= PKTSCHEDF_QALG_FLOWCTL;
4984
4985	/* Initialize transmit queue(s) */
4986	err = ifclassq_setup(ifp, sflags, (dl_if->dl_if_flags & DLIF_REUSE));
4987	if (err != 0) {
4988		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
4989		    "err=%d", __func__, ifp, err);
4990		/* NOTREACHED */
4991	}
4992
4993	/* Sanity checks on the input thread storage */
4994	dl_inp = &dl_if->dl_if_inpstorage;
4995	bzero(&dl_inp->stats, sizeof (dl_inp->stats));
4996	VERIFY(dl_inp->input_waiting == 0);
4997	VERIFY(dl_inp->wtot == 0);
4998	VERIFY(dl_inp->ifp == NULL);
4999	VERIFY(qhead(&dl_inp->rcvq_pkts) == NULL && qempty(&dl_inp->rcvq_pkts));
5000	VERIFY(qlimit(&dl_inp->rcvq_pkts) == 0);
5001	VERIFY(!dl_inp->net_affinity);
5002	VERIFY(ifp->if_inp == NULL);
5003	VERIFY(dl_inp->input_thr == THREAD_NULL);
5004	VERIFY(dl_inp->wloop_thr == THREAD_NULL);
5005	VERIFY(dl_inp->poll_thr == THREAD_NULL);
5006	VERIFY(dl_inp->tag == 0);
5007	VERIFY(dl_inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
5008	bzero(&dl_inp->tstats, sizeof (dl_inp->tstats));
5009	bzero(&dl_inp->pstats, sizeof (dl_inp->pstats));
5010	bzero(&dl_inp->sstats, sizeof (dl_inp->sstats));
5011#if IFNET_INPUT_SANITY_CHK
5012	VERIFY(dl_inp->input_mbuf_cnt == 0);
5013#endif /* IFNET_INPUT_SANITY_CHK */
5014
5015	/*
5016	 * A specific DLIL input thread is created per Ethernet/cellular
5017	 * interface or for an interface which supports opportunistic
5018	 * input polling.  Pseudo interfaces or other types of interfaces
5019	 * use the main input thread instead.
5020	 */
5021	if ((net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) ||
5022	    ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR) {
5023		ifp->if_inp = dl_inp;
5024		err = dlil_create_input_thread(ifp, ifp->if_inp);
5025		if (err != 0) {
5026			panic_plain("%s: ifp=%p couldn't get an input thread; "
5027			    "err=%d", __func__, ifp, err);
5028			/* NOTREACHED */
5029		}
5030	}
5031
5032	/*
5033	 * If the driver supports the new transmit model, calculate flow hash
5034	 * and create a workloop starter thread to invoke the if_start callback
5035	 * where the packets may be dequeued and transmitted.
5036	 */
5037	if (ifp->if_eflags & IFEF_TXSTART) {
5038		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
5039		VERIFY(ifp->if_flowhash != 0);
5040
5041		VERIFY(ifp->if_start != NULL);
5042		VERIFY(ifp->if_start_thread == THREAD_NULL);
5043
5044		ifnet_set_start_cycle(ifp, NULL);
5045		ifp->if_start_active = 0;
5046		ifp->if_start_req = 0;
5047		ifp->if_start_flags = 0;
5048		if ((err = kernel_thread_start(ifnet_start_thread_fn, ifp,
5049		    &ifp->if_start_thread)) != KERN_SUCCESS) {
5050			panic_plain("%s: ifp=%p couldn't get a start thread; "
5051			    "err=%d", __func__, ifp, err);
5052			/* NOTREACHED */
5053		}
5054		ml_thread_policy(ifp->if_start_thread, MACHINE_GROUP,
5055		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
5056	} else {
5057		ifp->if_flowhash = 0;
5058	}
5059
5060	/*
5061	 * If the driver supports the new receive model, create a poller
5062	 * thread to invoke if_input_poll callback where the packets may
5063	 * be dequeued from the driver and processed for reception.
5064	 */
5065	if (ifp->if_eflags & IFEF_RXPOLL) {
5066		VERIFY(ifp->if_input_poll != NULL);
5067		VERIFY(ifp->if_input_ctl != NULL);
5068		VERIFY(ifp->if_poll_thread == THREAD_NULL);
5069
5070		ifnet_set_poll_cycle(ifp, NULL);
5071		ifp->if_poll_update = 0;
5072		ifp->if_poll_active = 0;
5073		ifp->if_poll_req = 0;
5074		if ((err = kernel_thread_start(ifnet_poll_thread_fn, ifp,
5075		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
5076			panic_plain("%s: ifp=%p couldn't get a poll thread; "
5077			    "err=%d", __func__, ifp, err);
5078			/* NOTREACHED */
5079		}
5080		ml_thread_policy(ifp->if_poll_thread, MACHINE_GROUP,
5081		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
5082	}
5083
5084	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
5085	VERIFY(ifp->if_desc.ifd_len == 0);
5086	VERIFY(ifp->if_desc.ifd_desc != NULL);
5087
5088	/* Record attach PC stacktrace */
5089	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
5090
5091	ifp->if_updatemcasts = 0;
5092	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
5093		struct ifmultiaddr *ifma;
5094		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
5095			IFMA_LOCK(ifma);
5096			if (ifma->ifma_addr->sa_family == AF_LINK ||
5097			    ifma->ifma_addr->sa_family == AF_UNSPEC)
5098				ifp->if_updatemcasts++;
5099			IFMA_UNLOCK(ifma);
5100		}
5101
5102		printf("%s: attached with %d suspended link-layer multicast "
5103		    "membership(s)\n", if_name(ifp),
5104		    ifp->if_updatemcasts);
5105	}
5106
5107	/* Clear logging parameters */
5108	bzero(&ifp->if_log, sizeof (ifp->if_log));
5109	ifp->if_fg_sendts = 0;
5110
5111	VERIFY(ifp->if_delegated.ifp == NULL);
5112	VERIFY(ifp->if_delegated.type == 0);
5113	VERIFY(ifp->if_delegated.family == 0);
5114	VERIFY(ifp->if_delegated.subfamily == 0);
5115
5116	ifnet_lock_done(ifp);
5117	ifnet_head_done();
5118
5119	lck_mtx_lock(&ifp->if_cached_route_lock);
5120	/* Enable forwarding cached route */
5121	ifp->if_fwd_cacheok = 1;
5122	/* Clean up any existing cached routes */
5123	ROUTE_RELEASE(&ifp->if_fwd_route);
5124	bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route));
5125	ROUTE_RELEASE(&ifp->if_src_route);
5126	bzero(&ifp->if_src_route, sizeof (ifp->if_src_route));
5127	ROUTE_RELEASE(&ifp->if_src_route6);
5128	bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6));
5129	lck_mtx_unlock(&ifp->if_cached_route_lock);
5130
5131	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
5132
5133	/*
5134	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
5135	 * and trees; do this before the ifnet is marked as attached.
5136	 * The ifnet keeps the reference to the info structures even after
5137	 * the ifnet is detached, since the network-layer records still
5138	 * refer to the info structures even after that.  This also
5139	 * makes it possible for them to still function after the ifnet
5140	 * is recycled or reattached.
5141	 */
5142#if INET
5143	if (IGMP_IFINFO(ifp) == NULL) {
5144		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, M_WAITOK);
5145		VERIFY(IGMP_IFINFO(ifp) != NULL);
5146	} else {
5147		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
5148		igmp_domifreattach(IGMP_IFINFO(ifp));
5149	}
5150#endif /* INET */
5151#if INET6
5152	if (MLD_IFINFO(ifp) == NULL) {
5153		MLD_IFINFO(ifp) = mld_domifattach(ifp, M_WAITOK);
5154		VERIFY(MLD_IFINFO(ifp) != NULL);
5155	} else {
5156		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
5157		mld_domifreattach(MLD_IFINFO(ifp));
5158	}
5159#endif /* INET6 */
5160
5161	VERIFY(ifp->if_data_threshold == 0);
5162
5163	/*
5164	 * Finally, mark this ifnet as attached.
5165	 */
5166	lck_mtx_lock(rnh_lock);
5167	ifnet_lock_exclusive(ifp);
5168	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
5169	ifp->if_lqm = (ifp == lo_ifp) ? IFNET_LQM_THRESH_GOOD :
5170	    IFNET_LQM_THRESH_UNKNOWN;
5171	lck_mtx_lock_spin(&ifp->if_ref_lock);
5172	ifp->if_refflags = IFRF_ATTACHED;
5173	lck_mtx_unlock(&ifp->if_ref_lock);
5174	if (net_rtref) {
5175		/* boot-args override; enable idle notification */
5176		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
5177		    IFRF_IDLE_NOTIFY);
5178	} else {
5179		/* apply previous request(s) to set the idle flags, if any */
5180		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
5181		    ifp->if_idle_new_flags_mask);
5182
5183	}
5184	ifnet_lock_done(ifp);
5185	lck_mtx_unlock(rnh_lock);
5186	dlil_if_unlock();
5187
5188#if PF
5189	/*
5190	 * Attach packet filter to this interface, if enabled.
5191	 */
5192	pf_ifnet_hook(ifp, 1);
5193#endif /* PF */
5194
5195	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0);
5196
5197	if (dlil_verbose) {
5198		printf("%s: attached%s\n", if_name(ifp),
5199		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
5200	}
5201
5202	return (0);
5203}
5204
5205/*
5206 * Prepare the storage for the first/permanent link address, which must
5207 * must have the same lifetime as the ifnet itself.  Although the link
5208 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
5209 * its location in memory must never change as it may still be referred
5210 * to by some parts of the system afterwards (unfortunate implementation
5211 * artifacts inherited from BSD.)
5212 *
5213 * Caller must hold ifnet lock as writer.
5214 */
5215static struct ifaddr *
5216dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
5217{
5218	struct ifaddr *ifa, *oifa;
5219	struct sockaddr_dl *asdl, *msdl;
5220	char workbuf[IFNAMSIZ*2];
5221	int namelen, masklen, socksize;
5222	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
5223
5224	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
5225	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);
5226
5227	namelen = snprintf(workbuf, sizeof (workbuf), "%s",
5228	    if_name(ifp));
5229	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
5230	socksize = masklen + ifp->if_addrlen;
5231#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
5232	if ((u_int32_t)socksize < sizeof (struct sockaddr_dl))
5233		socksize = sizeof(struct sockaddr_dl);
5234	socksize = ROUNDUP(socksize);
5235#undef ROUNDUP
5236
5237	ifa = ifp->if_lladdr;
5238	if (socksize > DLIL_SDLMAXLEN ||
5239	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
5240		/*
5241		 * Rare, but in the event that the link address requires
5242		 * more storage space than DLIL_SDLMAXLEN, allocate the
5243		 * largest possible storages for address and mask, such
5244		 * that we can reuse the same space when if_addrlen grows.
5245		 * This same space will be used when if_addrlen shrinks.
5246		 */
5247		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
5248			int ifasize = sizeof (*ifa) + 2 * SOCK_MAXADDRLEN;
5249			ifa = _MALLOC(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
5250			if (ifa == NULL)
5251				return (NULL);
5252			ifa_lock_init(ifa);
5253			/* Don't set IFD_ALLOC, as this is permanent */
5254			ifa->ifa_debug = IFD_LINK;
5255		}
5256		IFA_LOCK(ifa);
5257		/* address and mask sockaddr_dl locations */
5258		asdl = (struct sockaddr_dl *)(ifa + 1);
5259		bzero(asdl, SOCK_MAXADDRLEN);
5260		msdl = (struct sockaddr_dl *)(void *)
5261		    ((char *)asdl + SOCK_MAXADDRLEN);
5262		bzero(msdl, SOCK_MAXADDRLEN);
5263	} else {
5264		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
5265		/*
5266		 * Use the storage areas for address and mask within the
5267		 * dlil_ifnet structure.  This is the most common case.
5268		 */
5269		if (ifa == NULL) {
5270			ifa = &dl_if->dl_if_lladdr.ifa;
5271			ifa_lock_init(ifa);
5272			/* Don't set IFD_ALLOC, as this is permanent */
5273			ifa->ifa_debug = IFD_LINK;
5274		}
5275		IFA_LOCK(ifa);
5276		/* address and mask sockaddr_dl locations */
5277		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
5278		bzero(asdl, sizeof (dl_if->dl_if_lladdr.asdl));
5279		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
5280		bzero(msdl, sizeof (dl_if->dl_if_lladdr.msdl));
5281	}
5282
5283	/* hold a permanent reference for the ifnet itself */
5284	IFA_ADDREF_LOCKED(ifa);
5285	oifa = ifp->if_lladdr;
5286	ifp->if_lladdr = ifa;
5287
5288	VERIFY(ifa->ifa_debug == IFD_LINK);
5289	ifa->ifa_ifp = ifp;
5290	ifa->ifa_rtrequest = link_rtrequest;
5291	ifa->ifa_addr = (struct sockaddr *)asdl;
5292	asdl->sdl_len = socksize;
5293	asdl->sdl_family = AF_LINK;
5294	bcopy(workbuf, asdl->sdl_data, namelen);
5295	asdl->sdl_nlen = namelen;
5296	asdl->sdl_index = ifp->if_index;
5297	asdl->sdl_type = ifp->if_type;
5298	if (ll_addr != NULL) {
5299		asdl->sdl_alen = ll_addr->sdl_alen;
5300		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
5301	} else {
5302		asdl->sdl_alen = 0;
5303	}
5304	ifa->ifa_netmask = (struct sockaddr*)msdl;
5305	msdl->sdl_len = masklen;
5306	while (namelen != 0)
5307		msdl->sdl_data[--namelen] = 0xff;
5308	IFA_UNLOCK(ifa);
5309
5310	if (oifa != NULL)
5311		IFA_REMREF(oifa);
5312
5313	return (ifa);
5314}
5315
5316static void
5317if_purgeaddrs(struct ifnet *ifp)
5318{
5319#if INET
5320	in_purgeaddrs(ifp);
5321#endif /* INET */
5322#if INET6
5323	in6_purgeaddrs(ifp);
5324#endif /* INET6 */
5325}
5326
5327errno_t
5328ifnet_detach(ifnet_t ifp)
5329{
5330	struct ifnet *delegated_ifp;
5331
5332	if (ifp == NULL)
5333		return (EINVAL);
5334
5335	lck_mtx_lock(rnh_lock);
5336	ifnet_head_lock_exclusive();
5337	ifnet_lock_exclusive(ifp);
5338
5339	/*
5340	 * Check to see if this interface has previously triggered
5341	 * aggressive protocol draining; if so, decrement the global
5342	 * refcnt and clear PR_AGGDRAIN on the route domain if
5343	 * there are no more of such an interface around.
5344	 */
5345	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
5346
5347	lck_mtx_lock_spin(&ifp->if_ref_lock);
5348	 if (!(ifp->if_refflags & IFRF_ATTACHED)) {
5349		lck_mtx_unlock(&ifp->if_ref_lock);
5350		ifnet_lock_done(ifp);
5351		ifnet_head_done();
5352		lck_mtx_unlock(rnh_lock);
5353		return (EINVAL);
5354	} else if (ifp->if_refflags & IFRF_DETACHING) {
5355		/* Interface has already been detached */
5356		lck_mtx_unlock(&ifp->if_ref_lock);
5357		ifnet_lock_done(ifp);
5358		ifnet_head_done();
5359		lck_mtx_unlock(rnh_lock);
5360		return (ENXIO);
5361	}
5362	/* Indicate this interface is being detached */
5363	ifp->if_refflags &= ~IFRF_ATTACHED;
5364	ifp->if_refflags |= IFRF_DETACHING;
5365	lck_mtx_unlock(&ifp->if_ref_lock);
5366
5367	if (dlil_verbose)
5368		printf("%s: detaching\n", if_name(ifp));
5369
5370	/*
5371	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
5372	 * no longer be visible during lookups from this point.
5373	 */
5374	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
5375	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
5376	ifp->if_link.tqe_next = NULL;
5377	ifp->if_link.tqe_prev = NULL;
5378	ifindex2ifnet[ifp->if_index] = NULL;
5379
5380	/* Record detach PC stacktrace */
5381	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
5382
5383	/* Clear logging parameters */
5384	bzero(&ifp->if_log, sizeof (ifp->if_log));
5385
5386	/* Clear delegated interface info (reference released below) */
5387	delegated_ifp = ifp->if_delegated.ifp;
5388	bzero(&ifp->if_delegated, sizeof (ifp->if_delegated));
5389
5390	ifnet_lock_done(ifp);
5391	ifnet_head_done();
5392	lck_mtx_unlock(rnh_lock);
5393
5394	/* Release reference held on the delegated interface */
5395	if (delegated_ifp != NULL)
5396		ifnet_release(delegated_ifp);
5397
5398	/* Reset Link Quality Metric (unless loopback [lo0]) */
5399	if (ifp != lo_ifp)
5400		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF);
5401
5402	/* Reset TCP local statistics */
5403	if (ifp->if_tcp_stat != NULL)
5404		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
5405
5406	/* Reset UDP local statistics */
5407	if (ifp->if_udp_stat != NULL)
5408		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
5409
5410	/* Let BPF know we're detaching */
5411	bpfdetach(ifp);
5412
5413	/* Mark the interface as DOWN */
5414	if_down(ifp);
5415
5416	/* Drain send queue */
5417	ifclassq_teardown(ifp);
5418
5419	/* Disable forwarding cached route */
5420	lck_mtx_lock(&ifp->if_cached_route_lock);
5421	ifp->if_fwd_cacheok = 0;
5422	lck_mtx_unlock(&ifp->if_cached_route_lock);
5423
5424	ifp->if_data_threshold = 0;
5425	/*
5426	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
5427	 * references to the info structures and leave them attached to
5428	 * this ifnet.
5429	 */
5430#if INET
5431	igmp_domifdetach(ifp);
5432#endif /* INET */
5433#if INET6
5434	mld_domifdetach(ifp);
5435#endif /* INET6 */
5436
5437	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0);
5438
5439	/* Let worker thread take care of the rest, to avoid reentrancy */
5440	dlil_if_lock();
5441	ifnet_detaching_enqueue(ifp);
5442	dlil_if_unlock();
5443
5444	return (0);
5445}
5446
5447static void
5448ifnet_detaching_enqueue(struct ifnet *ifp)
5449{
5450	dlil_if_lock_assert();
5451
5452	++ifnet_detaching_cnt;
5453	VERIFY(ifnet_detaching_cnt != 0);
5454	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
5455	wakeup((caddr_t)&ifnet_delayed_run);
5456}
5457
5458static struct ifnet *
5459ifnet_detaching_dequeue(void)
5460{
5461	struct ifnet *ifp;
5462
5463	dlil_if_lock_assert();
5464
5465	ifp = TAILQ_FIRST(&ifnet_detaching_head);
5466	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
5467	if (ifp != NULL) {
5468		VERIFY(ifnet_detaching_cnt != 0);
5469		--ifnet_detaching_cnt;
5470		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
5471		ifp->if_detaching_link.tqe_next = NULL;
5472		ifp->if_detaching_link.tqe_prev = NULL;
5473	}
5474	return (ifp);
5475}
5476
5477static int
5478ifnet_detacher_thread_cont(int err)
5479{
5480#pragma unused(err)
5481	struct ifnet *ifp;
5482
5483	for (;;) {
5484		dlil_if_lock_assert();
5485		while (ifnet_detaching_cnt == 0) {
5486			(void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
5487			    (PZERO - 1), "ifnet_detacher_cont", 0,
5488			    ifnet_detacher_thread_cont);
5489			/* NOTREACHED */
5490		}
5491
5492		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
5493
5494		/* Take care of detaching ifnet */
5495		ifp = ifnet_detaching_dequeue();
5496		if (ifp != NULL) {
5497			dlil_if_unlock();
5498			ifnet_detach_final(ifp);
5499			dlil_if_lock();
5500		}
5501	}
5502	/* NOTREACHED */
5503	return (0);
5504}
5505
5506static void
5507ifnet_detacher_thread_func(void *v, wait_result_t w)
5508{
5509#pragma unused(v, w)
5510	dlil_if_lock();
5511	(void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
5512	    (PZERO - 1), "ifnet_detacher", 0, ifnet_detacher_thread_cont);
5513	/*
5514	 * msleep0() shouldn't have returned as PCATCH was not set;
5515	 * therefore assert in this case.
5516	 */
5517	dlil_if_unlock();
5518	VERIFY(0);
5519}
5520
5521static void
5522ifnet_detach_final(struct ifnet *ifp)
5523{
5524	struct ifnet_filter *filter, *filter_next;
5525	struct ifnet_filter_head fhead;
5526	struct dlil_threading_info *inp;
5527	struct ifaddr *ifa;
5528	ifnet_detached_func if_free;
5529	int i;
5530
5531	lck_mtx_lock(&ifp->if_ref_lock);
5532	if (!(ifp->if_refflags & IFRF_DETACHING)) {
5533		panic("%s: flags mismatch (detaching not set) ifp=%p",
5534		    __func__, ifp);
5535		/* NOTREACHED */
5536	}
5537
5538	/*
5539	 * Wait until the existing IO references get released
5540	 * before we proceed with ifnet_detach.  This is not a
5541	 * common case, so block without using a continuation.
5542	 */
5543	while (ifp->if_refio > 0) {
5544		printf("%s: Waiting for IO references on %s interface "
5545		    "to be released\n", __func__, if_name(ifp));
5546		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
5547			(PZERO - 1), "ifnet_ioref_wait", NULL);
5548	}
5549	lck_mtx_unlock(&ifp->if_ref_lock);
5550
5551	/* Detach interface filters */
5552	lck_mtx_lock(&ifp->if_flt_lock);
5553	if_flt_monitor_enter(ifp);
5554
5555	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
5556	fhead = ifp->if_flt_head;
5557	TAILQ_INIT(&ifp->if_flt_head);
5558
5559	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
5560		filter_next = TAILQ_NEXT(filter, filt_next);
5561		lck_mtx_unlock(&ifp->if_flt_lock);
5562
5563		dlil_detach_filter_internal(filter, 1);
5564		lck_mtx_lock(&ifp->if_flt_lock);
5565	}
5566	if_flt_monitor_leave(ifp);
5567	lck_mtx_unlock(&ifp->if_flt_lock);
5568
5569	/* Tell upper layers to drop their network addresses */
5570	if_purgeaddrs(ifp);
5571
5572	ifnet_lock_exclusive(ifp);
5573
5574	/* Uplumb all protocols */
5575	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
5576		struct if_proto *proto;
5577
5578		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
5579		while (proto != NULL) {
5580			protocol_family_t family = proto->protocol_family;
5581			ifnet_lock_done(ifp);
5582			proto_unplumb(family, ifp);
5583			ifnet_lock_exclusive(ifp);
5584			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
5585		}
5586		/* There should not be any protocols left */
5587		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
5588	}
5589	zfree(dlif_phash_zone, ifp->if_proto_hash);
5590	ifp->if_proto_hash = NULL;
5591
5592	/* Detach (permanent) link address from if_addrhead */
5593	ifa = TAILQ_FIRST(&ifp->if_addrhead);
5594	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
5595	IFA_LOCK(ifa);
5596	if_detach_link_ifa(ifp, ifa);
5597	IFA_UNLOCK(ifa);
5598
5599	/* Remove (permanent) link address from ifnet_addrs[] */
5600	IFA_REMREF(ifa);
5601	ifnet_addrs[ifp->if_index - 1] = NULL;
5602
5603	/* This interface should not be on {ifnet_head,detaching} */
5604	VERIFY(ifp->if_link.tqe_next == NULL);
5605	VERIFY(ifp->if_link.tqe_prev == NULL);
5606	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
5607	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
5608
5609	/* Prefix list should be empty by now */
5610	VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead));
5611
5612	/* The slot should have been emptied */
5613	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
5614
5615	/* There should not be any addresses left */
5616	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
5617
5618	/*
5619	 * Signal the starter thread to terminate itself.
5620	 */
5621	if (ifp->if_start_thread != THREAD_NULL) {
5622		lck_mtx_lock_spin(&ifp->if_start_lock);
5623		ifp->if_start_flags = 0;
5624		ifp->if_start_thread = THREAD_NULL;
5625		wakeup_one((caddr_t)&ifp->if_start_thread);
5626		lck_mtx_unlock(&ifp->if_start_lock);
5627	}
5628
5629	/*
5630	 * Signal the poller thread to terminate itself.
5631	 */
5632	if (ifp->if_poll_thread != THREAD_NULL) {
5633		lck_mtx_lock_spin(&ifp->if_poll_lock);
5634		ifp->if_poll_thread = THREAD_NULL;
5635		wakeup_one((caddr_t)&ifp->if_poll_thread);
5636		lck_mtx_unlock(&ifp->if_poll_lock);
5637	}
5638
5639	/*
5640	 * If thread affinity was set for the workloop thread, we will need
5641	 * to tear down the affinity and release the extra reference count
5642	 * taken at attach time.  Does not apply to lo0 or other interfaces
5643	 * without dedicated input threads.
5644	 */
5645	if ((inp = ifp->if_inp) != NULL) {
5646		VERIFY(inp != dlil_main_input_thread);
5647
5648		if (inp->net_affinity) {
5649			struct thread *tp, *wtp, *ptp;
5650
5651			lck_mtx_lock_spin(&inp->input_lck);
5652			wtp = inp->wloop_thr;
5653			inp->wloop_thr = THREAD_NULL;
5654			ptp = inp->poll_thr;
5655			inp->poll_thr = THREAD_NULL;
5656			tp = inp->input_thr;	/* don't nullify now */
5657			inp->tag = 0;
5658			inp->net_affinity = FALSE;
5659			lck_mtx_unlock(&inp->input_lck);
5660
5661			/* Tear down poll thread affinity */
5662			if (ptp != NULL) {
5663				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
5664				(void) dlil_affinity_set(ptp,
5665				    THREAD_AFFINITY_TAG_NULL);
5666				thread_deallocate(ptp);
5667			}
5668
5669			/* Tear down workloop thread affinity */
5670			if (wtp != NULL) {
5671				(void) dlil_affinity_set(wtp,
5672				    THREAD_AFFINITY_TAG_NULL);
5673				thread_deallocate(wtp);
5674			}
5675
5676			/* Tear down DLIL input thread affinity */
5677			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
5678			thread_deallocate(tp);
5679		}
5680
5681		/* disassociate ifp DLIL input thread */
5682		ifp->if_inp = NULL;
5683
5684		lck_mtx_lock_spin(&inp->input_lck);
5685		inp->input_waiting |= DLIL_INPUT_TERMINATE;
5686		if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
5687			wakeup_one((caddr_t)&inp->input_waiting);
5688		}
5689		lck_mtx_unlock(&inp->input_lck);
5690	}
5691
5692	/* The driver might unload, so point these to ourselves */
5693	if_free = ifp->if_free;
5694	ifp->if_output = ifp_if_output;
5695	ifp->if_pre_enqueue = ifp_if_output;
5696	ifp->if_start = ifp_if_start;
5697	ifp->if_output_ctl = ifp_if_ctl;
5698	ifp->if_input_poll = ifp_if_input_poll;
5699	ifp->if_input_ctl = ifp_if_ctl;
5700	ifp->if_ioctl = ifp_if_ioctl;
5701	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
5702	ifp->if_free = ifp_if_free;
5703	ifp->if_demux = ifp_if_demux;
5704	ifp->if_event = ifp_if_event;
5705	ifp->if_framer_legacy = ifp_if_framer;
5706	ifp->if_framer = ifp_if_framer_extended;
5707	ifp->if_add_proto = ifp_if_add_proto;
5708	ifp->if_del_proto = ifp_if_del_proto;
5709	ifp->if_check_multi = ifp_if_check_multi;
5710
5711	/* wipe out interface description */
5712	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
5713	ifp->if_desc.ifd_len = 0;
5714	VERIFY(ifp->if_desc.ifd_desc != NULL);
5715	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
5716
5717	/* there shouldn't be any delegation by now */
5718	VERIFY(ifp->if_delegated.ifp == NULL);
5719	VERIFY(ifp->if_delegated.type == 0);
5720	VERIFY(ifp->if_delegated.family == 0);
5721	VERIFY(ifp->if_delegated.subfamily == 0);
5722
5723	ifnet_lock_done(ifp);
5724
5725#if PF
5726	/*
5727	 * Detach this interface from packet filter, if enabled.
5728	 */
5729	pf_ifnet_hook(ifp, 0);
5730#endif /* PF */
5731
5732	/* Filter list should be empty */
5733	lck_mtx_lock_spin(&ifp->if_flt_lock);
5734	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
5735	VERIFY(ifp->if_flt_busy == 0);
5736	VERIFY(ifp->if_flt_waiters == 0);
5737	lck_mtx_unlock(&ifp->if_flt_lock);
5738
5739	/* Last chance to drain send queue */
5740	if_qflush(ifp, 0);
5741
5742	/* Last chance to cleanup any cached route */
5743	lck_mtx_lock(&ifp->if_cached_route_lock);
5744	VERIFY(!ifp->if_fwd_cacheok);
5745	ROUTE_RELEASE(&ifp->if_fwd_route);
5746	bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route));
5747	ROUTE_RELEASE(&ifp->if_src_route);
5748	bzero(&ifp->if_src_route, sizeof (ifp->if_src_route));
5749	ROUTE_RELEASE(&ifp->if_src_route6);
5750	bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6));
5751	lck_mtx_unlock(&ifp->if_cached_route_lock);
5752
5753	VERIFY(ifp->if_data_threshold == 0);
5754
5755	ifnet_llreach_ifdetach(ifp);
5756
5757	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0);
5758
5759	if (if_free != NULL)
5760		if_free(ifp);
5761
5762	/*
5763	 * Finally, mark this ifnet as detached.
5764	 */
5765	lck_mtx_lock_spin(&ifp->if_ref_lock);
5766	if (!(ifp->if_refflags & IFRF_DETACHING)) {
5767		panic("%s: flags mismatch (detaching not set) ifp=%p",
5768		    __func__, ifp);
5769		/* NOTREACHED */
5770	}
5771	ifp->if_refflags &= ~IFRF_DETACHING;
5772	lck_mtx_unlock(&ifp->if_ref_lock);
5773
5774	if (dlil_verbose)
5775		printf("%s: detached\n", if_name(ifp));
5776
5777	/* Release reference held during ifnet attach */
5778	ifnet_release(ifp);
5779}
5780
5781static errno_t
5782ifp_if_output(struct ifnet *ifp, struct mbuf *m)
5783{
5784#pragma unused(ifp)
5785	m_freem(m);
5786	return (0);
5787}
5788
5789static void
5790ifp_if_start(struct ifnet *ifp)
5791{
5792	ifnet_purge(ifp);
5793}
5794
5795static void
5796ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
5797    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
5798{
5799#pragma unused(ifp, flags, max_cnt)
5800	if (m_head != NULL)
5801		*m_head = NULL;
5802	if (m_tail != NULL)
5803		*m_tail = NULL;
5804	if (cnt != NULL)
5805		*cnt = 0;
5806	if (len != NULL)
5807		*len = 0;
5808}
5809
5810static errno_t
5811ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
5812{
5813#pragma unused(ifp, cmd, arglen, arg)
5814	return (EOPNOTSUPP);
5815}
5816
5817static errno_t
5818ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
5819{
5820#pragma unused(ifp, fh, pf)
5821	m_freem(m);
5822	return (EJUSTRETURN);
5823}
5824
5825static errno_t
5826ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
5827    const struct ifnet_demux_desc *da, u_int32_t dc)
5828{
5829#pragma unused(ifp, pf, da, dc)
5830	return (EINVAL);
5831}
5832
5833static errno_t
5834ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
5835{
5836#pragma unused(ifp, pf)
5837	return (EINVAL);
5838}
5839
5840static errno_t
5841ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
5842{
5843#pragma unused(ifp, sa)
5844	return (EOPNOTSUPP);
5845}
5846
5847static errno_t
5848ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
5849    const struct sockaddr *sa, const char *ll, const char *t)
5850{
5851#pragma unused(ifp, m, sa, ll, t)
5852	return (ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL));
5853}
5854
5855static errno_t
5856ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
5857    const struct sockaddr *sa, const char *ll, const char *t,
5858    u_int32_t *pre, u_int32_t *post)
5859{
5860#pragma unused(ifp, sa, ll, t)
5861	m_freem(*m);
5862	*m = NULL;
5863
5864	if (pre != NULL)
5865		*pre = 0;
5866	if (post != NULL)
5867		*post = 0;
5868
5869	return (EJUSTRETURN);
5870}
5871
5872errno_t
5873ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
5874{
5875#pragma unused(ifp, cmd, arg)
5876	return (EOPNOTSUPP);
5877}
5878
5879static errno_t
5880ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
5881{
5882#pragma unused(ifp, tm, f)
5883	/* XXX not sure what to do here */
5884	return (0);
5885}
5886
5887static void
5888ifp_if_free(struct ifnet *ifp)
5889{
5890#pragma unused(ifp)
5891}
5892
5893static void
5894ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
5895{
5896#pragma unused(ifp, e)
5897}
5898
5899__private_extern__
5900int dlil_if_acquire(u_int32_t family, const void *uniqueid,
5901    size_t uniqueid_len, struct ifnet **ifp)
5902{
5903	struct ifnet *ifp1 = NULL;
5904	struct dlil_ifnet *dlifp1 = NULL;
5905	void *buf, *base, **pbuf;
5906	int ret = 0;
5907
5908	dlil_if_lock();
5909	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
5910		ifp1 = (struct ifnet *)dlifp1;
5911
5912		if (ifp1->if_family != family)
5913			continue;
5914
5915		lck_mtx_lock(&dlifp1->dl_if_lock);
5916		/* same uniqueid and same len or no unique id specified */
5917		if ((uniqueid_len == dlifp1->dl_if_uniqueid_len) &&
5918		    !bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len)) {
5919			/* check for matching interface in use */
5920			if (dlifp1->dl_if_flags & DLIF_INUSE) {
5921				if (uniqueid_len) {
5922					ret = EBUSY;
5923					lck_mtx_unlock(&dlifp1->dl_if_lock);
5924					goto end;
5925				}
5926			} else {
5927				dlifp1->dl_if_flags |= (DLIF_INUSE|DLIF_REUSE);
5928				lck_mtx_unlock(&dlifp1->dl_if_lock);
5929				*ifp = ifp1;
5930				goto end;
5931			}
5932		}
5933		lck_mtx_unlock(&dlifp1->dl_if_lock);
5934	}
5935
5936	/* no interface found, allocate a new one */
5937	buf = zalloc(dlif_zone);
5938	if (buf == NULL) {
5939		ret = ENOMEM;
5940		goto end;
5941	}
5942	bzero(buf, dlif_bufsize);
5943
5944	/* Get the 64-bit aligned base address for this object */
5945	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
5946	    sizeof (u_int64_t));
5947	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));
5948
5949	/*
5950	 * Wind back a pointer size from the aligned base and
5951	 * save the original address so we can free it later.
5952	 */
5953	pbuf = (void **)((intptr_t)base - sizeof (void *));
5954	*pbuf = buf;
5955	dlifp1 = base;
5956
5957	if (uniqueid_len) {
5958		MALLOC(dlifp1->dl_if_uniqueid, void *, uniqueid_len,
5959		    M_NKE, M_WAITOK);
5960		if (dlifp1->dl_if_uniqueid == NULL) {
5961			zfree(dlif_zone, dlifp1);
5962			ret = ENOMEM;
5963			goto end;
5964		}
5965		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
5966		dlifp1->dl_if_uniqueid_len = uniqueid_len;
5967	}
5968
5969	ifp1 = (struct ifnet *)dlifp1;
5970	dlifp1->dl_if_flags = DLIF_INUSE;
5971	if (ifnet_debug) {
5972		dlifp1->dl_if_flags |= DLIF_DEBUG;
5973		dlifp1->dl_if_trace = dlil_if_trace;
5974	}
5975	ifp1->if_name = dlifp1->dl_if_namestorage;
5976	ifp1->if_xname = dlifp1->dl_if_xnamestorage;
5977
5978	/* initialize interface description */
5979	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
5980	ifp1->if_desc.ifd_len = 0;
5981	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;
5982
5983#if CONFIG_MACF_NET
5984	mac_ifnet_label_init(ifp1);
5985#endif
5986
5987	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
5988		DLIL_PRINTF("%s: failed to allocate if local stats, "
5989		    "error: %d\n", __func__, ret);
5990		/* This probably shouldn't be fatal */
5991		ret = 0;
5992	}
5993
5994	lck_mtx_init(&dlifp1->dl_if_lock, ifnet_lock_group, ifnet_lock_attr);
5995	lck_rw_init(&ifp1->if_lock, ifnet_lock_group, ifnet_lock_attr);
5996	lck_mtx_init(&ifp1->if_ref_lock, ifnet_lock_group, ifnet_lock_attr);
5997	lck_mtx_init(&ifp1->if_flt_lock, ifnet_lock_group, ifnet_lock_attr);
5998	lck_mtx_init(&ifp1->if_addrconfig_lock, ifnet_lock_group,
5999	    ifnet_lock_attr);
6000	lck_rw_init(&ifp1->if_llreach_lock, ifnet_lock_group, ifnet_lock_attr);
6001#if INET6
6002	lck_rw_init(&ifp1->if_inet6data_lock, ifnet_lock_group, ifnet_lock_attr);
6003	ifp1->if_inet6data = NULL;
6004#endif
6005
6006	/* for send data paths */
6007	lck_mtx_init(&ifp1->if_start_lock, ifnet_snd_lock_group,
6008	    ifnet_lock_attr);
6009	lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_snd_lock_group,
6010	    ifnet_lock_attr);
6011	lck_mtx_init(&ifp1->if_snd.ifcq_lock, ifnet_snd_lock_group,
6012	    ifnet_lock_attr);
6013
6014	/* for receive data paths */
6015	lck_mtx_init(&ifp1->if_poll_lock, ifnet_rcv_lock_group,
6016	    ifnet_lock_attr);
6017
6018	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);
6019
6020	*ifp = ifp1;
6021
6022end:
6023	dlil_if_unlock();
6024
6025	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof (u_int64_t)) &&
6026	    IS_P2ALIGNED(&ifp1->if_data, sizeof (u_int64_t))));
6027
6028	return (ret);
6029}
6030
6031__private_extern__ void
6032dlil_if_release(ifnet_t	ifp)
6033{
6034	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;
6035
6036	ifnet_lock_exclusive(ifp);
6037	lck_mtx_lock(&dlifp->dl_if_lock);
6038	dlifp->dl_if_flags &= ~DLIF_INUSE;
6039	strncpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
6040	ifp->if_name = dlifp->dl_if_namestorage;
6041	/* Reset external name (name + unit) */
6042	ifp->if_xname = dlifp->dl_if_xnamestorage;
6043	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
6044	    "%s?", ifp->if_name);
6045	lck_mtx_unlock(&dlifp->dl_if_lock);
6046#if CONFIG_MACF_NET
6047	/*
6048	* We can either recycle the MAC label here or in dlil_if_acquire().
6049	* It seems logical to do it here but this means that anything that
6050	* still has a handle on ifp will now see it as unlabeled.
6051	* Since the interface is "dead" that may be OK.  Revisit later.
6052	*/
6053	mac_ifnet_label_recycle(ifp);
6054#endif
6055	ifnet_lock_done(ifp);
6056}
6057
6058__private_extern__ void
6059dlil_if_lock(void)
6060{
6061	lck_mtx_lock(&dlil_ifnet_lock);
6062}
6063
6064__private_extern__ void
6065dlil_if_unlock(void)
6066{
6067	lck_mtx_unlock(&dlil_ifnet_lock);
6068}
6069
6070__private_extern__ void
6071dlil_if_lock_assert(void)
6072{
6073	lck_mtx_assert(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
6074}
6075
6076__private_extern__ void
6077dlil_proto_unplumb_all(struct ifnet *ifp)
6078{
6079	/*
6080	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
6081	 * each bucket contains exactly one entry; PF_VLAN does not need an
6082	 * explicit unplumb.
6083	 *
6084	 * if_proto_hash[3] is for other protocols; we expect anything
6085	 * in this bucket to respond to the DETACHING event (which would
6086	 * have happened by now) and do the unplumb then.
6087	 */
6088	(void) proto_unplumb(PF_INET, ifp);
6089#if INET6
6090	(void) proto_unplumb(PF_INET6, ifp);
6091#endif /* INET6 */
6092}
6093
6094static void
6095ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
6096{
6097	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6098	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6099
6100	route_copyout(dst, &ifp->if_src_route, sizeof (*dst));
6101
6102	lck_mtx_unlock(&ifp->if_cached_route_lock);
6103}
6104
6105static void
6106ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
6107{
6108	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6109	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6110
6111	if (ifp->if_fwd_cacheok) {
6112		route_copyin(src, &ifp->if_src_route, sizeof (*src));
6113	} else {
6114		ROUTE_RELEASE(src);
6115	}
6116	lck_mtx_unlock(&ifp->if_cached_route_lock);
6117}
6118
6119#if INET6
6120static void
6121ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
6122{
6123	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6124	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6125
6126	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
6127	    sizeof (*dst));
6128
6129	lck_mtx_unlock(&ifp->if_cached_route_lock);
6130}
6131
6132static void
6133ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
6134{
6135	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6136	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6137
6138	if (ifp->if_fwd_cacheok) {
6139		route_copyin((struct route *)src,
6140		    (struct route *)&ifp->if_src_route6, sizeof (*src));
6141	} else {
6142		ROUTE_RELEASE(src);
6143	}
6144	lck_mtx_unlock(&ifp->if_cached_route_lock);
6145}
6146#endif /* INET6 */
6147
6148struct rtentry *
6149ifnet_cached_rtlookup_inet(struct ifnet	*ifp, struct in_addr src_ip)
6150{
6151	struct route		src_rt;
6152	struct sockaddr_in	*dst;
6153
6154	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);
6155
6156	ifp_src_route_copyout(ifp, &src_rt);
6157
6158	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
6159		ROUTE_RELEASE(&src_rt);
6160		if (dst->sin_family != AF_INET) {
6161			bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst));
6162			dst->sin_len = sizeof (src_rt.ro_dst);
6163			dst->sin_family = AF_INET;
6164		}
6165		dst->sin_addr = src_ip;
6166
6167		if (src_rt.ro_rt == NULL) {
6168			src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
6169			    0, 0, ifp->if_index);
6170
6171			if (src_rt.ro_rt != NULL) {
6172				/* retain a ref, copyin consumes one */
6173				struct rtentry	*rte = src_rt.ro_rt;
6174				RT_ADDREF(rte);
6175				ifp_src_route_copyin(ifp, &src_rt);
6176				src_rt.ro_rt = rte;
6177			}
6178		}
6179	}
6180
6181	return (src_rt.ro_rt);
6182}
6183
6184#if INET6
6185struct rtentry*
6186ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
6187{
6188	struct route_in6 src_rt;
6189
6190	ifp_src_route6_copyout(ifp, &src_rt);
6191
6192	if (ROUTE_UNUSABLE(&src_rt) ||
6193	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
6194		ROUTE_RELEASE(&src_rt);
6195		if (src_rt.ro_dst.sin6_family != AF_INET6) {
6196			bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst));
6197			src_rt.ro_dst.sin6_len = sizeof (src_rt.ro_dst);
6198			src_rt.ro_dst.sin6_family = AF_INET6;
6199		}
6200		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
6201		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
6202		    sizeof (src_rt.ro_dst.sin6_addr));
6203
6204		if (src_rt.ro_rt == NULL) {
6205			src_rt.ro_rt = rtalloc1_scoped(
6206			    (struct sockaddr *)&src_rt.ro_dst, 0, 0,
6207			    ifp->if_index);
6208
6209			if (src_rt.ro_rt != NULL) {
6210				/* retain a ref, copyin consumes one */
6211				struct rtentry	*rte = src_rt.ro_rt;
6212				RT_ADDREF(rte);
6213				ifp_src_route6_copyin(ifp, &src_rt);
6214				src_rt.ro_rt = rte;
6215			}
6216		}
6217	}
6218
6219	return (src_rt.ro_rt);
6220}
6221#endif /* INET6 */
6222
6223void
6224if_lqm_update(struct ifnet *ifp, int lqm)
6225{
6226	struct kev_dl_link_quality_metric_data ev_lqm_data;
6227
6228	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);
6229
6230	/* Normalize to edge */
6231	if (lqm > IFNET_LQM_THRESH_UNKNOWN && lqm <= IFNET_LQM_THRESH_POOR)
6232		lqm = IFNET_LQM_THRESH_POOR;
6233	else if (lqm > IFNET_LQM_THRESH_POOR && lqm <= IFNET_LQM_THRESH_GOOD)
6234		lqm = IFNET_LQM_THRESH_GOOD;
6235
6236	ifnet_lock_exclusive(ifp);
6237	if (lqm == ifp->if_lqm) {
6238		ifnet_lock_done(ifp);
6239		return;		/* nothing to update */
6240	}
6241	ifp->if_lqm = lqm;
6242	ifnet_lock_done(ifp);
6243
6244	bzero(&ev_lqm_data, sizeof (ev_lqm_data));
6245	ev_lqm_data.link_quality_metric = lqm;
6246
6247	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
6248	    (struct net_event_data *)&ev_lqm_data, sizeof (ev_lqm_data));
6249}
6250
6251/* for uuid.c */
6252int
6253uuid_get_ethernet(u_int8_t *node)
6254{
6255	struct ifnet *ifp;
6256	struct sockaddr_dl *sdl;
6257
6258	ifnet_head_lock_shared();
6259	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
6260		ifnet_lock_shared(ifp);
6261		IFA_LOCK_SPIN(ifp->if_lladdr);
6262		sdl = (struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr;
6263		if (sdl->sdl_type == IFT_ETHER) {
6264			memcpy(node, LLADDR(sdl), ETHER_ADDR_LEN);
6265			IFA_UNLOCK(ifp->if_lladdr);
6266			ifnet_lock_done(ifp);
6267			ifnet_head_done();
6268			return (0);
6269		}
6270		IFA_UNLOCK(ifp->if_lladdr);
6271		ifnet_lock_done(ifp);
6272	}
6273	ifnet_head_done();
6274
6275	return (-1);
6276}
6277
6278static int
6279sysctl_rxpoll SYSCTL_HANDLER_ARGS
6280{
6281#pragma unused(arg1, arg2)
6282	uint32_t i;
6283	int err;
6284
6285	i = if_rxpoll;
6286
6287	err = sysctl_handle_int(oidp, &i, 0, req);
6288	if (err != 0 || req->newptr == USER_ADDR_NULL)
6289		return (err);
6290
6291	if (net_rxpoll == 0)
6292		return (ENXIO);
6293
6294	if_rxpoll = i;
6295	return (err);
6296}
6297
6298static int
6299sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
6300{
6301#pragma unused(arg1, arg2)
6302	uint64_t q;
6303	int err;
6304
6305	q = if_rxpoll_mode_holdtime;
6306
6307	err = sysctl_handle_quad(oidp, &q, 0, req);
6308	if (err != 0 || req->newptr == USER_ADDR_NULL)
6309		return (err);
6310
6311	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN)
6312		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
6313
6314	if_rxpoll_mode_holdtime = q;
6315
6316	return (err);
6317}
6318
6319static int
6320sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
6321{
6322#pragma unused(arg1, arg2)
6323	uint64_t q;
6324	int err;
6325
6326	q = if_rxpoll_sample_holdtime;
6327
6328	err = sysctl_handle_quad(oidp, &q, 0, req);
6329	if (err != 0 || req->newptr == USER_ADDR_NULL)
6330		return (err);
6331
6332	if (q < IF_RXPOLL_SAMPLETIME_MIN)
6333		q = IF_RXPOLL_SAMPLETIME_MIN;
6334
6335	if_rxpoll_sample_holdtime = q;
6336
6337	return (err);
6338}
6339
6340static int
6341sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
6342{
6343#pragma unused(arg1, arg2)
6344	uint64_t q;
6345	int err;
6346
6347	q = if_rxpoll_interval_time;
6348
6349	err = sysctl_handle_quad(oidp, &q, 0, req);
6350	if (err != 0 || req->newptr == USER_ADDR_NULL)
6351		return (err);
6352
6353	if (q < IF_RXPOLL_INTERVALTIME_MIN)
6354		q = IF_RXPOLL_INTERVALTIME_MIN;
6355
6356	if_rxpoll_interval_time = q;
6357
6358	return (err);
6359}
6360
6361static int
6362sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
6363{
6364#pragma unused(arg1, arg2)
6365	uint32_t i;
6366	int err;
6367
6368	i = if_rxpoll_wlowat;
6369
6370	err = sysctl_handle_int(oidp, &i, 0, req);
6371	if (err != 0 || req->newptr == USER_ADDR_NULL)
6372		return (err);
6373
6374	if (i == 0 || i >= if_rxpoll_whiwat)
6375		return (EINVAL);
6376
6377	if_rxpoll_wlowat = i;
6378	return (err);
6379}
6380
6381static int
6382sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
6383{
6384#pragma unused(arg1, arg2)
6385	uint32_t i;
6386	int err;
6387
6388	i = if_rxpoll_whiwat;
6389
6390	err = sysctl_handle_int(oidp, &i, 0, req);
6391	if (err != 0 || req->newptr == USER_ADDR_NULL)
6392		return (err);
6393
6394	if (i <= if_rxpoll_wlowat)
6395		return (EINVAL);
6396
6397	if_rxpoll_whiwat = i;
6398	return (err);
6399}
6400
6401static int
6402sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
6403{
6404#pragma unused(arg1, arg2)
6405	int i, err;
6406
6407	i = if_sndq_maxlen;
6408
6409	err = sysctl_handle_int(oidp, &i, 0, req);
6410	if (err != 0 || req->newptr == USER_ADDR_NULL)
6411		return (err);
6412
6413	if (i < IF_SNDQ_MINLEN)
6414		i = IF_SNDQ_MINLEN;
6415
6416	if_sndq_maxlen = i;
6417	return (err);
6418}
6419
6420static int
6421sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
6422{
6423#pragma unused(arg1, arg2)
6424	int i, err;
6425
6426	i = if_rcvq_maxlen;
6427
6428	err = sysctl_handle_int(oidp, &i, 0, req);
6429	if (err != 0 || req->newptr == USER_ADDR_NULL)
6430		return (err);
6431
6432	if (i < IF_RCVQ_MINLEN)
6433		i = IF_RCVQ_MINLEN;
6434
6435	if_rcvq_maxlen = i;
6436	return (err);
6437}
6438
6439void
6440dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
6441    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
6442{
6443	struct kev_dl_node_presence kev;
6444	struct sockaddr_dl *sdl;
6445	struct sockaddr_in6 *sin6;
6446
6447	VERIFY(ifp);
6448	VERIFY(sa);
6449	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
6450
6451	bzero(&kev, sizeof (kev));
6452	sin6 = &kev.sin6_node_address;
6453	sdl = &kev.sdl_node_address;
6454	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
6455	kev.rssi = rssi;
6456	kev.link_quality_metric = lqm;
6457	kev.node_proximity_metric = npm;
6458	bcopy(srvinfo, kev.node_service_info, sizeof (kev.node_service_info));
6459
6460	nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
6461	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
6462	    &kev.link_data, sizeof (kev));
6463}
6464
6465void
6466dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
6467{
6468	struct kev_dl_node_absence kev;
6469	struct sockaddr_in6 *sin6;
6470	struct sockaddr_dl *sdl;
6471
6472	VERIFY(ifp);
6473	VERIFY(sa);
6474	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
6475
6476	bzero(&kev, sizeof (kev));
6477	sin6 = &kev.sin6_node_address;
6478	sdl = &kev.sdl_node_address;
6479	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
6480
6481	nd6_alt_node_absent(ifp, sin6);
6482	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
6483	    &kev.link_data, sizeof (kev));
6484}
6485
6486const void *
6487dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
6488	kauth_cred_t *credp)
6489{
6490	const u_int8_t *bytes;
6491	size_t size;
6492
6493	bytes = CONST_LLADDR(sdl);
6494	size = sdl->sdl_alen;
6495
6496#if CONFIG_MACF
6497	if (dlil_lladdr_ckreq) {
6498		switch (sdl->sdl_type) {
6499		case IFT_ETHER:
6500		case IFT_BRIDGE:
6501		case IFT_IEEE1394:
6502		case IFT_IEEE8023ADLAG:
6503		case IFT_L2VLAN:
6504			break;
6505		default:
6506			credp = NULL;
6507			break;
6508		};
6509
6510		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
6511			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
6512			    [0] = 2
6513			};
6514
6515			switch (sdl->sdl_type) {
6516			case IFT_ETHER:
6517			case IFT_BRIDGE:
6518			case IFT_IEEE8023ADLAG:
6519			case IFT_L2VLAN:
6520				VERIFY(size == ETHER_ADDR_LEN);
6521				bytes = unspec;
6522				break;
6523			case IFT_IEEE1394:
6524				VERIFY(size == FIREWIRE_EUI64_LEN);
6525				bytes = unspec;
6526				break;
6527			default:
6528				VERIFY(FALSE);
6529				break;
6530			};
6531		}
6532	}
6533#else
6534#pragma unused(credp)
6535#endif
6536
6537	if (sizep != NULL) *sizep = size;
6538	return (bytes);
6539}
6540
6541void
6542dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
6543    u_int8_t info[DLIL_MODARGLEN])
6544{
6545	struct kev_dl_issues kev;
6546	struct timeval tv;
6547
6548	VERIFY(ifp != NULL);
6549	VERIFY(modid != NULL);
6550	_CASSERT(sizeof (kev.modid) == DLIL_MODIDLEN);
6551	_CASSERT(sizeof (kev.info) == DLIL_MODARGLEN);
6552
6553	bzero(&kev, sizeof (&kev));
6554
6555	microtime(&tv);
6556	kev.timestamp = tv.tv_sec;
6557	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
6558	if (info != NULL)
6559		bcopy(info, &kev.info, DLIL_MODARGLEN);
6560
6561	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
6562	    &kev.link_data, sizeof (kev));
6563}
6564
6565errno_t
6566ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
6567    struct proc *p)
6568{
6569	u_int32_t level = IFNET_THROTTLE_OFF;
6570	errno_t result = 0;
6571
6572	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
6573
6574	if (cmd == SIOCSIFOPPORTUNISTIC) {
6575		/*
6576		 * XXX: Use priv_check_cred() instead of root check?
6577		 */
6578		if ((result = proc_suser(p)) != 0)
6579			return (result);
6580
6581		if (ifr->ifr_opportunistic.ifo_flags ==
6582		    IFRIFOF_BLOCK_OPPORTUNISTIC)
6583			level = IFNET_THROTTLE_OPPORTUNISTIC;
6584		else if (ifr->ifr_opportunistic.ifo_flags == 0)
6585			level = IFNET_THROTTLE_OFF;
6586		else
6587			result = EINVAL;
6588
6589		if (result == 0)
6590			result = ifnet_set_throttle(ifp, level);
6591	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
6592		ifr->ifr_opportunistic.ifo_flags = 0;
6593		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
6594			ifr->ifr_opportunistic.ifo_flags |=
6595			    IFRIFOF_BLOCK_OPPORTUNISTIC;
6596		}
6597	}
6598
6599	/*
6600	 * Return the count of current opportunistic connections
6601	 * over the interface.
6602	 */
6603	if (result == 0) {
6604		uint32_t flags = 0;
6605		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
6606			INPCB_OPPORTUNISTIC_SETCMD : 0;
6607		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
6608			INPCB_OPPORTUNISTIC_THROTTLEON : 0;
6609		ifr->ifr_opportunistic.ifo_inuse =
6610		    udp_count_opportunistic(ifp->if_index, flags) +
6611		    tcp_count_opportunistic(ifp->if_index, flags);
6612	}
6613
6614	if (result == EALREADY)
6615		result = 0;
6616
6617	return (result);
6618}
6619
6620int
6621ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
6622{
6623	struct ifclassq *ifq;
6624	int err = 0;
6625
6626	if (!(ifp->if_eflags & IFEF_TXSTART))
6627		return (ENXIO);
6628
6629	*level = IFNET_THROTTLE_OFF;
6630
6631	ifq = &ifp->if_snd;
6632	IFCQ_LOCK(ifq);
6633	/* Throttling works only for IFCQ, not ALTQ instances */
6634	if (IFCQ_IS_ENABLED(ifq))
6635		IFCQ_GET_THROTTLE(ifq, *level, err);
6636	IFCQ_UNLOCK(ifq);
6637
6638	return (err);
6639}
6640
6641int
6642ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
6643{
6644	struct ifclassq *ifq;
6645	int err = 0;
6646
6647	if (!(ifp->if_eflags & IFEF_TXSTART))
6648		return (ENXIO);
6649
6650	ifq = &ifp->if_snd;
6651
6652	switch (level) {
6653	case IFNET_THROTTLE_OFF:
6654	case IFNET_THROTTLE_OPPORTUNISTIC:
6655#if PF_ALTQ
6656		/* Throttling works only for IFCQ, not ALTQ instances */
6657		if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq)))
6658			return (ENXIO);
6659#endif /* PF_ALTQ */
6660		break;
6661	default:
6662		return (EINVAL);
6663	}
6664
6665	IFCQ_LOCK(ifq);
6666	if (IFCQ_IS_ENABLED(ifq))
6667		IFCQ_SET_THROTTLE(ifq, level, err);
6668	IFCQ_UNLOCK(ifq);
6669
6670	if (err == 0) {
6671		printf("%s: throttling level set to %d\n", if_name(ifp),
6672		    level);
6673		if (level == IFNET_THROTTLE_OFF)
6674			ifnet_start(ifp);
6675	}
6676
6677	return (err);
6678}
6679
6680errno_t
6681ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
6682    struct proc *p)
6683{
6684#pragma unused(p)
6685	errno_t result = 0;
6686	uint32_t flags;
6687	int level, category, subcategory;
6688
6689	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
6690
6691	if (cmd == SIOCSIFLOG) {
6692		if ((result = priv_check_cred(kauth_cred_get(),
6693		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0)
6694			return (result);
6695
6696		level = ifr->ifr_log.ifl_level;
6697		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX)
6698			result = EINVAL;
6699
6700		flags = ifr->ifr_log.ifl_flags;
6701		if ((flags &= IFNET_LOGF_MASK) == 0)
6702			result = EINVAL;
6703
6704		category = ifr->ifr_log.ifl_category;
6705		subcategory = ifr->ifr_log.ifl_subcategory;
6706
6707		if (result == 0)
6708			result = ifnet_set_log(ifp, level, flags,
6709			    category, subcategory);
6710	} else {
6711		result = ifnet_get_log(ifp, &level, &flags, &category,
6712		    &subcategory);
6713		if (result == 0) {
6714			ifr->ifr_log.ifl_level = level;
6715			ifr->ifr_log.ifl_flags = flags;
6716			ifr->ifr_log.ifl_category = category;
6717			ifr->ifr_log.ifl_subcategory = subcategory;
6718		}
6719	}
6720
6721	return (result);
6722}
6723
6724int
6725ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
6726    int32_t category, int32_t subcategory)
6727{
6728	int err = 0;
6729
6730	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
6731	VERIFY(flags & IFNET_LOGF_MASK);
6732
6733	/*
6734	 * The logging level applies to all facilities; make sure to
6735	 * update them all with the most current level.
6736	 */
6737	flags |= ifp->if_log.flags;
6738
6739	if (ifp->if_output_ctl != NULL) {
6740		struct ifnet_log_params l;
6741
6742		bzero(&l, sizeof (l));
6743		l.level = level;
6744		l.flags = flags;
6745		l.flags &= ~IFNET_LOGF_DLIL;
6746		l.category = category;
6747		l.subcategory = subcategory;
6748
6749		/* Send this request to lower layers */
6750		if (l.flags != 0) {
6751			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
6752			    sizeof (l), &l);
6753		}
6754	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
6755		/*
6756		 * If targeted to the lower layers without an output
6757		 * control callback registered on the interface, just
6758		 * silently ignore facilities other than ours.
6759		 */
6760		flags &= IFNET_LOGF_DLIL;
6761		if (flags == 0 && (!ifp->if_log.flags & IFNET_LOGF_DLIL))
6762			level = 0;
6763	}
6764
6765	if (err == 0) {
6766		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT)
6767			ifp->if_log.flags = 0;
6768		else
6769			ifp->if_log.flags |= flags;
6770
6771		log(LOG_INFO, "%s: logging level set to %d flags=%b "
6772		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
6773		    ifp->if_log.level, ifp->if_log.flags,
6774		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
6775		    category, subcategory);
6776	}
6777
6778	return (err);
6779}
6780
6781int
6782ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
6783    int32_t *category, int32_t *subcategory)
6784{
6785	if (level != NULL)
6786		*level = ifp->if_log.level;
6787	if (flags != NULL)
6788		*flags = ifp->if_log.flags;
6789	if (category != NULL)
6790		*category = ifp->if_log.category;
6791	if (subcategory != NULL)
6792		*subcategory = ifp->if_log.subcategory;
6793
6794	return (0);
6795}
6796
6797int
6798ifnet_notify_address(struct ifnet *ifp, int af)
6799{
6800	struct ifnet_notify_address_params na;
6801
6802#if PF
6803	(void) pf_ifaddr_hook(ifp);
6804#endif /* PF */
6805
6806	if (ifp->if_output_ctl == NULL)
6807		return (EOPNOTSUPP);
6808
6809	bzero(&na, sizeof (na));
6810	na.address_family = af;
6811
6812	return (ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
6813	    sizeof (na), &na));
6814}
6815
6816errno_t
6817ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
6818{
6819	if (ifp == NULL || flowid == NULL) {
6820		return (EINVAL);
6821	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6822	    !(ifp->if_refflags & IFRF_ATTACHED)) {
6823		return (ENXIO);
6824	}
6825
6826	*flowid = ifp->if_flowhash;
6827
6828	return (0);
6829}
6830
6831errno_t
6832ifnet_disable_output(struct ifnet *ifp)
6833{
6834	int err;
6835
6836	if (ifp == NULL) {
6837		return (EINVAL);
6838	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6839	    !(ifp->if_refflags & IFRF_ATTACHED)) {
6840		return (ENXIO);
6841	}
6842
6843	if ((err = ifnet_fc_add(ifp)) == 0) {
6844		lck_mtx_lock_spin(&ifp->if_start_lock);
6845		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
6846		lck_mtx_unlock(&ifp->if_start_lock);
6847	}
6848	return (err);
6849}
6850
6851errno_t
6852ifnet_enable_output(struct ifnet *ifp)
6853{
6854	if (ifp == NULL) {
6855		return (EINVAL);
6856	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6857	    !(ifp->if_refflags & IFRF_ATTACHED)) {
6858		return (ENXIO);
6859	}
6860
6861	ifnet_start_common(ifp, 1);
6862	return (0);
6863}
6864
6865void
6866ifnet_flowadv(uint32_t flowhash)
6867{
6868	struct ifnet_fc_entry *ifce;
6869	struct ifnet *ifp;
6870
6871	ifce = ifnet_fc_get(flowhash);
6872	if (ifce == NULL)
6873		return;
6874
6875	VERIFY(ifce->ifce_ifp != NULL);
6876	ifp = ifce->ifce_ifp;
6877
6878	/* flow hash gets recalculated per attach, so check */
6879	if (ifnet_is_attached(ifp, 1)) {
6880		if (ifp->if_flowhash == flowhash)
6881			(void) ifnet_enable_output(ifp);
6882		ifnet_decr_iorefcnt(ifp);
6883	}
6884	ifnet_fc_entry_free(ifce);
6885}
6886
6887/*
6888 * Function to compare ifnet_fc_entries in ifnet flow control tree
6889 */
6890static inline int
6891ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
6892{
6893	return (fc1->ifce_flowhash - fc2->ifce_flowhash);
6894}
6895
6896static int
6897ifnet_fc_add(struct ifnet *ifp)
6898{
6899	struct ifnet_fc_entry keyfc, *ifce;
6900	uint32_t flowhash;
6901
6902	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
6903	VERIFY(ifp->if_flowhash != 0);
6904	flowhash = ifp->if_flowhash;
6905
6906	bzero(&keyfc, sizeof (keyfc));
6907	keyfc.ifce_flowhash = flowhash;
6908
6909	lck_mtx_lock_spin(&ifnet_fc_lock);
6910	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
6911	if (ifce != NULL && ifce->ifce_ifp == ifp) {
6912		/* Entry is already in ifnet_fc_tree, return */
6913		lck_mtx_unlock(&ifnet_fc_lock);
6914		return (0);
6915	}
6916
6917	if (ifce != NULL) {
6918		/*
6919		 * There is a different fc entry with the same flow hash
6920		 * but different ifp pointer.  There can be a collision
6921		 * on flow hash but the probability is low.  Let's just
6922		 * avoid adding a second one when there is a collision.
6923		 */
6924		lck_mtx_unlock(&ifnet_fc_lock);
6925		return (EAGAIN);
6926	}
6927
6928	/* become regular mutex */
6929	lck_mtx_convert_spin(&ifnet_fc_lock);
6930
6931	ifce = zalloc_noblock(ifnet_fc_zone);
6932	if (ifce == NULL) {
6933		/* memory allocation failed */
6934		lck_mtx_unlock(&ifnet_fc_lock);
6935		return (ENOMEM);
6936	}
6937	bzero(ifce, ifnet_fc_zone_size);
6938
6939	ifce->ifce_flowhash = flowhash;
6940	ifce->ifce_ifp = ifp;
6941
6942	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
6943	lck_mtx_unlock(&ifnet_fc_lock);
6944	return (0);
6945}
6946
6947static struct ifnet_fc_entry *
6948ifnet_fc_get(uint32_t flowhash)
6949{
6950	struct ifnet_fc_entry keyfc, *ifce;
6951	struct ifnet *ifp;
6952
6953	bzero(&keyfc, sizeof (keyfc));
6954	keyfc.ifce_flowhash = flowhash;
6955
6956	lck_mtx_lock_spin(&ifnet_fc_lock);
6957	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
6958	if (ifce == NULL) {
6959		/* Entry is not present in ifnet_fc_tree, return */
6960		lck_mtx_unlock(&ifnet_fc_lock);
6961		return (NULL);
6962	}
6963
6964	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);
6965
6966	VERIFY(ifce->ifce_ifp != NULL);
6967	ifp = ifce->ifce_ifp;
6968
6969	/* become regular mutex */
6970	lck_mtx_convert_spin(&ifnet_fc_lock);
6971
6972	if (!ifnet_is_attached(ifp, 0)) {
6973		/*
6974		 * This ifp is not attached or in the process of being
6975		 * detached; just don't process it.
6976		 */
6977		ifnet_fc_entry_free(ifce);
6978		ifce = NULL;
6979	}
6980	lck_mtx_unlock(&ifnet_fc_lock);
6981
6982	return (ifce);
6983}
6984
6985static void
6986ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
6987{
6988	zfree(ifnet_fc_zone, ifce);
6989}
6990
6991static uint32_t
6992ifnet_calc_flowhash(struct ifnet *ifp)
6993{
6994	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
6995	uint32_t flowhash = 0;
6996
6997	if (ifnet_flowhash_seed == 0)
6998		ifnet_flowhash_seed = RandomULong();
6999
7000	bzero(&fh, sizeof (fh));
7001
7002	(void) snprintf(fh.ifk_name, sizeof (fh.ifk_name), "%s", ifp->if_name);
7003	fh.ifk_unit = ifp->if_unit;
7004	fh.ifk_flags = ifp->if_flags;
7005	fh.ifk_eflags = ifp->if_eflags;
7006	fh.ifk_capabilities = ifp->if_capabilities;
7007	fh.ifk_capenable = ifp->if_capenable;
7008	fh.ifk_output_sched_model = ifp->if_output_sched_model;
7009	fh.ifk_rand1 = RandomULong();
7010	fh.ifk_rand2 = RandomULong();
7011
7012try_again:
7013	flowhash = net_flowhash(&fh, sizeof (fh), ifnet_flowhash_seed);
7014	if (flowhash == 0) {
7015		/* try to get a non-zero flowhash */
7016		ifnet_flowhash_seed = RandomULong();
7017		goto try_again;
7018	}
7019
7020	return (flowhash);
7021}
7022
7023static void
7024dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
7025    protocol_family_t pf)
7026{
7027#pragma unused(ifp)
7028	uint32_t did_sw;
7029
7030	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
7031	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4|CSUM_TSO_IPV6)))
7032		return;
7033
7034	switch (pf) {
7035	case PF_INET:
7036		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
7037		if (did_sw & CSUM_DELAY_IP)
7038			hwcksum_dbg_finalized_hdr++;
7039		if (did_sw & CSUM_DELAY_DATA)
7040			hwcksum_dbg_finalized_data++;
7041		break;
7042#if INET6
7043	case PF_INET6:
7044		/*
7045		 * Checksum offload should not have been enabled when
7046		 * extension headers exist; that also means that we
7047		 * cannot force-finalize packets with extension headers.
7048		 * Indicate to the callee should it skip such case by
7049		 * setting optlen to -1.
7050		 */
7051		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
7052		    m->m_pkthdr.csum_flags);
7053		if (did_sw & CSUM_DELAY_IPV6_DATA)
7054			hwcksum_dbg_finalized_data++;
7055		break;
7056#endif /* INET6 */
7057	default:
7058		return;
7059	}
7060}
7061
7062static void
7063dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
7064    protocol_family_t pf)
7065{
7066	uint16_t sum;
7067	uint32_t hlen;
7068
7069	if (frame_header == NULL ||
7070	    frame_header < (char *)mbuf_datastart(m) ||
7071	    frame_header > (char *)m->m_data) {
7072		printf("%s: frame header pointer 0x%llx out of range "
7073		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
7074		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
7075		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
7076		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7077		    (uint64_t)VM_KERNEL_ADDRPERM(m));
7078		return;
7079	}
7080	hlen = (m->m_data - frame_header);
7081
7082	switch (pf) {
7083	case PF_INET:
7084#if INET6
7085	case PF_INET6:
7086#endif /* INET6 */
7087		break;
7088	default:
7089		return;
7090	}
7091
7092	/*
7093	 * Force partial checksum offload; useful to simulate cases
7094	 * where the hardware does not support partial checksum offload,
7095	 * in order to validate correctness throughout the layers above.
7096	 */
7097	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
7098		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;
7099
7100		if (foff > (uint32_t)m->m_pkthdr.len)
7101			return;
7102
7103		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
7104
7105		/* Compute 16-bit 1's complement sum from forced offset */
7106		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));
7107
7108		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
7109		m->m_pkthdr.csum_rx_val = sum;
7110		m->m_pkthdr.csum_rx_start = (foff + hlen);
7111
7112		hwcksum_dbg_partial_forced++;
7113		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
7114	}
7115
7116	/*
7117	 * Partial checksum offload verification (and adjustment);
7118	 * useful to validate and test cases where the hardware
7119	 * supports partial checksum offload.
7120	 */
7121	if ((m->m_pkthdr.csum_flags &
7122	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
7123	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7124		uint32_t rxoff;
7125
7126		/* Start offset must begin after frame header */
7127		rxoff = m->m_pkthdr.csum_rx_start;
7128		if (hlen > rxoff) {
7129			hwcksum_dbg_bad_rxoff++;
7130			if (dlil_verbose) {
7131				printf("%s: partial cksum start offset %d "
7132				    "is less than frame header length %d for "
7133				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
7134				    (uint64_t)VM_KERNEL_ADDRPERM(m));
7135			}
7136			return;
7137		}
7138		rxoff -=hlen;
7139
7140		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
7141			/*
7142			 * Compute the expected 16-bit 1's complement sum;
7143			 * skip this if we've already computed it above
7144			 * when partial checksum offload is forced.
7145			 */
7146			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));
7147
7148			/* Hardware or driver is buggy */
7149			if (sum != m->m_pkthdr.csum_rx_val) {
7150				hwcksum_dbg_bad_cksum++;
7151				if (dlil_verbose) {
7152					printf("%s: bad partial cksum value "
7153					    "0x%x (expected 0x%x) for mbuf "
7154					    "0x%llx [rx_start %d]\n",
7155					    if_name(ifp),
7156					    m->m_pkthdr.csum_rx_val, sum,
7157					    (uint64_t)VM_KERNEL_ADDRPERM(m),
7158					    m->m_pkthdr.csum_rx_start);
7159				}
7160				return;
7161			}
7162		}
7163		hwcksum_dbg_verified++;
7164
7165		/*
7166		 * This code allows us to emulate various hardwares that
7167		 * perform 16-bit 1's complement sum beginning at various
7168		 * start offset values.
7169		 */
7170		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
7171			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;
7172
7173			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len)
7174				return;
7175
7176			sum = m_adj_sum16(m, rxoff, aoff, sum);
7177
7178			m->m_pkthdr.csum_rx_val = sum;
7179			m->m_pkthdr.csum_rx_start = (aoff + hlen);
7180
7181			hwcksum_dbg_adjusted++;
7182		}
7183	}
7184}
7185
7186static int
7187sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
7188{
7189#pragma unused(arg1, arg2)
7190	u_int32_t i;
7191	int err;
7192
7193	i = hwcksum_dbg_mode;
7194
7195	err = sysctl_handle_int(oidp, &i, 0, req);
7196	if (err != 0 || req->newptr == USER_ADDR_NULL)
7197		return (err);
7198
7199	if (hwcksum_dbg == 0)
7200		return (ENODEV);
7201
7202	if ((i & ~HWCKSUM_DBG_MASK) != 0)
7203		return (EINVAL);
7204
7205	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
7206
7207	return (err);
7208}
7209
7210static int
7211sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
7212{
7213#pragma unused(arg1, arg2)
7214	u_int32_t i;
7215	int err;
7216
7217	i = hwcksum_dbg_partial_rxoff_forced;
7218
7219	err = sysctl_handle_int(oidp, &i, 0, req);
7220	if (err != 0 || req->newptr == USER_ADDR_NULL)
7221		return (err);
7222
7223	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED))
7224		return (ENODEV);
7225
7226	hwcksum_dbg_partial_rxoff_forced = i;
7227
7228	return (err);
7229}
7230
7231static int
7232sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
7233{
7234#pragma unused(arg1, arg2)
7235	u_int32_t i;
7236	int err;
7237
7238	i = hwcksum_dbg_partial_rxoff_adj;
7239
7240	err = sysctl_handle_int(oidp, &i, 0, req);
7241	if (err != 0 || req->newptr == USER_ADDR_NULL)
7242		return (err);
7243
7244	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ))
7245		return (ENODEV);
7246
7247	hwcksum_dbg_partial_rxoff_adj = i;
7248
7249	return (err);
7250}
7251
7252#if DEBUG
7253/* Blob for sum16 verification */
7254static uint8_t sumdata[] = {
7255	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
7256	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
7257	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
7258	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
7259	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
7260	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
7261	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
7262	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
7263	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
7264	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
7265	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
7266	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
7267	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
7268	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
7269	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
7270	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
7271	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
7272	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
7273	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
7274	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
7275	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
7276	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
7277	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
7278	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
7279	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
7280	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
7281	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
7282	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
7283	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
7284	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
7285	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
7286	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
7287	0xc8, 0x28, 0x02, 0x00, 0x00
7288};
7289
7290/* Precomputed 16-bit 1's complement sums for various spans of the above data */
7291static struct {
7292	int		len;
7293	uint16_t	sum;
7294} sumtbl[] = {
7295	{	11,	0xcb6d	},
7296	{	20,	0x20dd	},
7297	{	27,	0xbabd	},
7298	{	32,	0xf3e8	},
7299	{	37,	0x197d	},
7300	{	43,	0x9eae	},
7301	{	64,	0x4678	},
7302	{	127,	0x9399	},
7303	{	256,	0xd147	},
7304	{	325,	0x0358	}
7305};
7306#define	SUMTBL_MAX	((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
7307
7308static void
7309dlil_verify_sum16(void)
7310{
7311	struct mbuf *m;
7312	uint8_t *buf;
7313	int n;
7314
7315	/* Make sure test data plus extra room for alignment fits in cluster */
7316	_CASSERT((sizeof (sumdata) + (sizeof (uint64_t) * 2)) <= MCLBYTES);
7317
7318	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
7319	MH_ALIGN(m, sizeof (uint32_t));		/* 32-bit starting alignment */
7320	buf = mtod(m, uint8_t *);		/* base address */
7321
7322	for (n = 0; n < SUMTBL_MAX; n++) {
7323		uint16_t len = sumtbl[n].len;
7324		int i;
7325
7326		/* Verify for all possible alignments */
7327		for (i = 0; i < (int)sizeof (uint64_t); i++) {
7328			uint16_t sum;
7329			uint8_t *c;
7330
7331			/* Copy over test data to mbuf */
7332			VERIFY(len <= sizeof (sumdata));
7333			c = buf + i;
7334			bcopy(sumdata, c, len);
7335
7336			/* Zero-offset test (align by data pointer) */
7337			m->m_data = (caddr_t)c;
7338			m->m_len = len;
7339			sum = m_sum16(m, 0, len);
7340
7341			/* Something is horribly broken; stop now */
7342			if (sum != sumtbl[n].sum) {
7343				panic("%s: broken m_sum16 for len=%d align=%d "
7344				    "sum=0x%04x [expected=0x%04x]\n", __func__,
7345				    len, i, sum, sumtbl[n].sum);
7346				/* NOTREACHED */
7347			}
7348
7349			/* Alignment test by offset (fixed data pointer) */
7350			m->m_data = (caddr_t)buf;
7351			m->m_len = i + len;
7352			sum = m_sum16(m, i, len);
7353
7354			/* Something is horribly broken; stop now */
7355			if (sum != sumtbl[n].sum) {
7356				panic("%s: broken m_sum16 for len=%d offset=%d "
7357				    "sum=0x%04x [expected=0x%04x]\n", __func__,
7358				    len, i, sum, sumtbl[n].sum);
7359				/* NOTREACHED */
7360			}
7361#if INET
7362			/* Simple sum16 contiguous buffer test by aligment */
7363			sum = b_sum16(c, len);
7364
7365			/* Something is horribly broken; stop now */
7366			if (sum != sumtbl[n].sum) {
7367				panic("%s: broken b_sum16 for len=%d align=%d "
7368				    "sum=0x%04x [expected=0x%04x]\n", __func__,
7369				    len, i, sum, sumtbl[n].sum);
7370				/* NOTREACHED */
7371			}
7372#endif /* INET */
7373		}
7374	}
7375	m_freem(m);
7376
7377	printf("DLIL: SUM16 self-tests PASSED\n");
7378}
7379#endif /* DEBUG */
7380
7381#define	CASE_STRINGIFY(x) case x: return #x
7382
7383__private_extern__ const char *
7384dlil_kev_dl_code_str(u_int32_t event_code)
7385{
7386	switch (event_code) {
7387	CASE_STRINGIFY(KEV_DL_SIFFLAGS);
7388	CASE_STRINGIFY(KEV_DL_SIFMETRICS);
7389	CASE_STRINGIFY(KEV_DL_SIFMTU);
7390	CASE_STRINGIFY(KEV_DL_SIFPHYS);
7391	CASE_STRINGIFY(KEV_DL_SIFMEDIA);
7392	CASE_STRINGIFY(KEV_DL_SIFGENERIC);
7393	CASE_STRINGIFY(KEV_DL_ADDMULTI);
7394	CASE_STRINGIFY(KEV_DL_DELMULTI);
7395	CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
7396	CASE_STRINGIFY(KEV_DL_IF_DETACHING);
7397	CASE_STRINGIFY(KEV_DL_IF_DETACHED);
7398	CASE_STRINGIFY(KEV_DL_LINK_OFF);
7399	CASE_STRINGIFY(KEV_DL_LINK_ON);
7400	CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
7401	CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
7402	CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
7403	CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
7404	CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
7405	CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
7406	CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
7407	CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
7408	CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
7409	CASE_STRINGIFY(KEV_DL_MASTER_ELECTED);
7410	CASE_STRINGIFY(KEV_DL_ISSUES);
7411	CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
7412	default:
7413		break;
7414	}
7415	return ("");
7416}
7417