1/*
2 * Copyright (c) 1999-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections.  This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34#include <stddef.h>
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/malloc.h>
40#include <sys/mbuf.h>
41#include <sys/socket.h>
42#include <sys/domain.h>
43#include <sys/user.h>
44#include <sys/random.h>
45#include <sys/socketvar.h>
46#include <net/if_dl.h>
47#include <net/if.h>
48#include <net/route.h>
49#include <net/if_var.h>
50#include <net/dlil.h>
51#include <net/if_arp.h>
52#include <net/iptap.h>
53#include <net/pktap.h>
54#include <sys/kern_event.h>
55#include <sys/kdebug.h>
56#include <sys/mcache.h>
57#include <sys/syslog.h>
58#include <sys/protosw.h>
59#include <sys/priv.h>
60
61#include <kern/assert.h>
62#include <kern/task.h>
63#include <kern/thread.h>
64#include <kern/sched_prim.h>
65#include <kern/locks.h>
66#include <kern/zalloc.h>
67
68#include <net/kpi_protocol.h>
69#include <net/if_types.h>
70#include <net/if_llreach.h>
71#include <net/kpi_interfacefilter.h>
72#include <net/classq/classq.h>
73#include <net/classq/classq_sfb.h>
74#include <net/flowhash.h>
75#include <net/ntstat.h>
76
77#if INET
78#include <netinet/in_var.h>
79#include <netinet/igmp_var.h>
80#include <netinet/ip_var.h>
81#include <netinet/tcp.h>
82#include <netinet/tcp_var.h>
83#include <netinet/udp.h>
84#include <netinet/udp_var.h>
85#include <netinet/if_ether.h>
86#include <netinet/in_pcb.h>
87#endif /* INET */
88
89#if INET6
90#include <netinet6/in6_var.h>
91#include <netinet6/nd6.h>
92#include <netinet6/mld6_var.h>
93#include <netinet6/scope6_var.h>
94#endif /* INET6 */
95
96#include <libkern/OSAtomic.h>
97#include <libkern/tree.h>
98
99#include <dev/random/randomdev.h>
100#include <machine/machine_routines.h>
101
102#include <mach/thread_act.h>
103#include <mach/sdt.h>
104
105#if CONFIG_MACF
106#include <sys/kauth.h>
107#include <security/mac_framework.h>
108#include <net/ethernet.h>
109#include <net/firewire.h>
110#endif
111
112#if PF
113#include <net/pfvar.h>
114#endif /* PF */
115#if PF_ALTQ
116#include <net/altq/altq.h>
117#endif /* PF_ALTQ */
118#include <net/pktsched/pktsched.h>
119
120#define DBG_LAYER_BEG		DLILDBG_CODE(DBG_DLIL_STATIC, 0)
121#define DBG_LAYER_END		DLILDBG_CODE(DBG_DLIL_STATIC, 2)
122#define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
123#define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
124#define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
125
126#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
127#define MAX_LINKADDR	    4 /* LONGWORDS */
128#define M_NKE M_IFADDR
129
130#if 1
131#define DLIL_PRINTF	printf
132#else
133#define DLIL_PRINTF	kprintf
134#endif
135
136#define	IF_DATA_REQUIRE_ALIGNED_64(f)	\
137	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
138
139#define	IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)	\
140	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
141
142enum {
143	kProtoKPI_v1	= 1,
144	kProtoKPI_v2	= 2
145};
146
147/*
148 * List of if_proto structures in if_proto_hash[] is protected by
149 * the ifnet lock.  The rest of the fields are initialized at protocol
150 * attach time and never change, thus no lock required as long as
151 * a reference to it is valid, via if_proto_ref().
152 */
153struct if_proto {
154    SLIST_ENTRY(if_proto)	next_hash;
155    u_int32_t			refcount;
156    u_int32_t			detached;
157    struct ifnet		*ifp;
158    protocol_family_t		protocol_family;
159    int				proto_kpi;
160    union {
161		struct {
162			proto_media_input		input;
163			proto_media_preout		pre_output;
164			proto_media_event		event;
165			proto_media_ioctl		ioctl;
166			proto_media_detached		detached;
167			proto_media_resolve_multi	resolve_multi;
168			proto_media_send_arp		send_arp;
169		} v1;
170		struct {
171			proto_media_input_v2		input;
172			proto_media_preout		pre_output;
173			proto_media_event		event;
174			proto_media_ioctl		ioctl;
175			proto_media_detached		detached;
176			proto_media_resolve_multi	resolve_multi;
177			proto_media_send_arp		send_arp;
178		} v2;
179	} kpi;
180};
181
182SLIST_HEAD(proto_hash_entry, if_proto);
183
184#define	DLIL_SDLMAXLEN	64
185#define	DLIL_SDLDATALEN	\
186	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
187
188struct dlil_ifnet {
189	struct ifnet	dl_if;			/* public ifnet */
190	/*
191	 * DLIL private fields, protected by dl_if_lock
192	 */
193	decl_lck_mtx_data(, dl_if_lock);
194	TAILQ_ENTRY(dlil_ifnet) dl_if_link;	/* dlil_ifnet link */
195	u_int32_t dl_if_flags;			/* flags (below) */
196	u_int32_t dl_if_refcnt;			/* refcnt */
197	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
198	void	*dl_if_uniqueid;		/* unique interface id */
199	size_t	dl_if_uniqueid_len;		/* length of the unique id */
200	char	dl_if_namestorage[IFNAMSIZ];	/* interface name storage */
201	char	dl_if_xnamestorage[IFXNAMSIZ];	/* external name storage */
202	struct {
203		struct ifaddr	ifa;		/* lladdr ifa */
204		u_int8_t	asdl[DLIL_SDLMAXLEN]; /* addr storage */
205		u_int8_t	msdl[DLIL_SDLMAXLEN]; /* mask storage */
206	} dl_if_lladdr;
207	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
208	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
209	ctrace_t	dl_if_attach;		/* attach PC stacktrace */
210	ctrace_t	dl_if_detach;		/* detach PC stacktrace */
211};
212
213/* Values for dl_if_flags (private to DLIL) */
214#define	DLIF_INUSE	0x1	/* DLIL ifnet recycler, ifnet in use */
215#define	DLIF_REUSE	0x2	/* DLIL ifnet recycles, ifnet is not new */
216#define	DLIF_DEBUG	0x4	/* has debugging info */
217
218#define	IF_REF_TRACE_HIST_SIZE	8	/* size of ref trace history */
219
220/* For gdb */
221__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
222
223struct dlil_ifnet_dbg {
224	struct dlil_ifnet	dldbg_dlif;		/* dlil_ifnet */
225	u_int16_t		dldbg_if_refhold_cnt;	/* # ifnet references */
226	u_int16_t		dldbg_if_refrele_cnt;	/* # ifnet releases */
227	/*
228	 * Circular lists of ifnet_{reference,release} callers.
229	 */
230	ctrace_t		dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
231	ctrace_t		dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
232};
233
234#define	DLIL_TO_IFP(s)	(&s->dl_if)
235#define	IFP_TO_DLIL(s)	((struct dlil_ifnet *)s)
236
237struct ifnet_filter {
238	TAILQ_ENTRY(ifnet_filter)	filt_next;
239	u_int32_t			filt_skip;
240	u_int32_t			filt_flags;
241	ifnet_t				filt_ifp;
242	const char			*filt_name;
243	void				*filt_cookie;
244	protocol_family_t		filt_protocol;
245	iff_input_func			filt_input;
246	iff_output_func			filt_output;
247	iff_event_func			filt_event;
248	iff_ioctl_func			filt_ioctl;
249	iff_detached_func		filt_detached;
250};
251
252struct proto_input_entry;
253
254static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
255static lck_grp_t *dlil_lock_group;
256lck_grp_t *ifnet_lock_group;
257static lck_grp_t *ifnet_head_lock_group;
258static lck_grp_t *ifnet_snd_lock_group;
259static lck_grp_t *ifnet_rcv_lock_group;
260lck_attr_t *ifnet_lock_attr;
261decl_lck_rw_data(static, ifnet_head_lock);
262decl_lck_mtx_data(static, dlil_ifnet_lock);
263u_int32_t dlil_filter_disable_tso_count = 0;
264
265#if DEBUG
266static unsigned int ifnet_debug = 1;	/* debugging (enabled) */
267#else
268static unsigned int ifnet_debug;	/* debugging (disabled) */
269#endif /* !DEBUG */
270static unsigned int dlif_size;		/* size of dlil_ifnet to allocate */
271static unsigned int dlif_bufsize;	/* size of dlif_size + headroom */
272static struct zone *dlif_zone;		/* zone for dlil_ifnet */
273
274#define	DLIF_ZONE_MAX		64		/* maximum elements in zone */
275#define	DLIF_ZONE_NAME		"ifnet"		/* zone name */
276
277static unsigned int dlif_filt_size;	/* size of ifnet_filter */
278static struct zone *dlif_filt_zone;	/* zone for ifnet_filter */
279
280#define	DLIF_FILT_ZONE_MAX	8		/* maximum elements in zone */
281#define	DLIF_FILT_ZONE_NAME	"ifnet_filter"	/* zone name */
282
283static unsigned int dlif_phash_size;	/* size of ifnet proto hash table */
284static struct zone *dlif_phash_zone;	/* zone for ifnet proto hash table */
285
286#define	DLIF_PHASH_ZONE_MAX	DLIF_ZONE_MAX	/* maximum elements in zone */
287#define	DLIF_PHASH_ZONE_NAME	"ifnet_proto_hash" /* zone name */
288
289static unsigned int dlif_proto_size;	/* size of if_proto */
290static struct zone *dlif_proto_zone;	/* zone for if_proto */
291
292#define	DLIF_PROTO_ZONE_MAX	(DLIF_ZONE_MAX*2) /* maximum elements in zone */
293#define	DLIF_PROTO_ZONE_NAME	"ifnet_proto"	/* zone name */
294
295static unsigned int dlif_tcpstat_size;		/* size of tcpstat_local to allocate */
296static unsigned int dlif_tcpstat_bufsize;	/* size of dlif_tcpstat_size + headroom */
297static struct zone *dlif_tcpstat_zone;		/* zone for tcpstat_local */
298
299#define	DLIF_TCPSTAT_ZONE_MAX	1		/* maximum elements in zone */
300#define	DLIF_TCPSTAT_ZONE_NAME	"ifnet_tcpstat"	/* zone name */
301
302static unsigned int dlif_udpstat_size;		/* size of udpstat_local to allocate */
303static unsigned int dlif_udpstat_bufsize;	/* size of dlif_udpstat_size + headroom */
304static struct zone *dlif_udpstat_zone;		/* zone for udpstat_local */
305
306#define	DLIF_UDPSTAT_ZONE_MAX	1		/* maximum elements in zone */
307#define	DLIF_UDPSTAT_ZONE_NAME	"ifnet_udpstat"	/* zone name */
308
309/*
310 * Updating this variable should be done by first acquiring the global
311 * radix node head (rnh_lock), in tandem with settting/clearing the
312 * PR_AGGDRAIN for routedomain.
313 */
314u_int32_t ifnet_aggressive_drainers;
315static u_int32_t net_rtref;
316
317static struct dlil_main_threading_info dlil_main_input_thread_info;
318__private_extern__ struct dlil_threading_info *dlil_main_input_thread =
319    (struct dlil_threading_info *)&dlil_main_input_thread_info;
320
321static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg);
322static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
323static void dlil_if_trace(struct dlil_ifnet *, int);
324static void if_proto_ref(struct if_proto *);
325static void if_proto_free(struct if_proto *);
326static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
327static int dlil_ifp_proto_count(struct ifnet *);
328static void if_flt_monitor_busy(struct ifnet *);
329static void if_flt_monitor_unbusy(struct ifnet *);
330static void if_flt_monitor_enter(struct ifnet *);
331static void if_flt_monitor_leave(struct ifnet *);
332static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
333    char **, protocol_family_t);
334static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
335    protocol_family_t);
336static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
337    const struct sockaddr_dl *);
338static int ifnet_lookup(struct ifnet *);
339static void if_purgeaddrs(struct ifnet *);
340
341static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
342    struct mbuf *, char *);
343static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
344    struct mbuf *);
345static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
346    mbuf_t *, const struct sockaddr *, void *, char *, char *);
347static void ifproto_media_event(struct ifnet *, protocol_family_t,
348    const struct kev_msg *);
349static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
350    unsigned long, void *);
351static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
352    struct sockaddr_dl *, size_t);
353static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
354    const struct sockaddr_dl *, const struct sockaddr *,
355    const struct sockaddr_dl *, const struct sockaddr *);
356
357static errno_t ifp_if_output(struct ifnet *, struct mbuf *);
358static void ifp_if_start(struct ifnet *);
359static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
360    struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
361static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
362static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
363    protocol_family_t *);
364static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
365    const struct ifnet_demux_desc *, u_int32_t);
366static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
367static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
368static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
369    const struct sockaddr *, const char *, const char *);
370static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
371    const struct sockaddr *, const char *, const char *,
372    u_int32_t *, u_int32_t *);
373static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
374static void ifp_if_free(struct ifnet *);
375static void ifp_if_event(struct ifnet *, const struct kev_msg *);
376static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
377static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
378
379static void dlil_main_input_thread_func(void *, wait_result_t);
380static void dlil_input_thread_func(void *, wait_result_t);
381static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
382static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *);
383static void dlil_terminate_input_thread(struct dlil_threading_info *);
384static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
385    struct dlil_threading_info *, boolean_t);
386static void dlil_input_stats_sync(struct ifnet *, struct dlil_threading_info *);
387static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
388    u_int32_t, ifnet_model_t, boolean_t);
389static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
390    const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
391
392#if DEBUG
393static void dlil_verify_sum16(void);
394#endif /* DEBUG */
395static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
396    protocol_family_t);
397static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
398    protocol_family_t);
399
400static void ifnet_detacher_thread_func(void *, wait_result_t);
401static int ifnet_detacher_thread_cont(int);
402static void ifnet_detach_final(struct ifnet *);
403static void ifnet_detaching_enqueue(struct ifnet *);
404static struct ifnet *ifnet_detaching_dequeue(void);
405
406static void ifnet_start_thread_fn(void *, wait_result_t);
407static void ifnet_poll_thread_fn(void *, wait_result_t);
408static void ifnet_poll(struct ifnet *);
409
410static void ifp_src_route_copyout(struct ifnet *, struct route *);
411static void ifp_src_route_copyin(struct ifnet *, struct route *);
412#if INET6
413static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
414static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
415#endif /* INET6 */
416
417static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
418static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
419static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
420static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
421static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
422static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
423static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
424static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
425static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
426static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
427static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
428
429/* The following are protected by dlil_ifnet_lock */
430static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
431static u_int32_t ifnet_detaching_cnt;
432static void *ifnet_delayed_run;	/* wait channel for detaching thread */
433
434decl_lck_mtx_data(static, ifnet_fc_lock);
435
436static uint32_t ifnet_flowhash_seed;
437
438struct ifnet_flowhash_key {
439	char		ifk_name[IFNAMSIZ];
440	uint32_t	ifk_unit;
441	uint32_t	ifk_flags;
442	uint32_t	ifk_eflags;
443	uint32_t	ifk_capabilities;
444	uint32_t	ifk_capenable;
445	uint32_t	ifk_output_sched_model;
446	uint32_t	ifk_rand1;
447	uint32_t	ifk_rand2;
448};
449
450/* Flow control entry per interface */
451struct ifnet_fc_entry {
452	RB_ENTRY(ifnet_fc_entry) ifce_entry;
453	u_int32_t	ifce_flowhash;
454	struct ifnet	*ifce_ifp;
455};
456
457static uint32_t ifnet_calc_flowhash(struct ifnet *);
458static int ifce_cmp(const struct ifnet_fc_entry *,
459    const struct ifnet_fc_entry *);
460static int ifnet_fc_add(struct ifnet *);
461static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
462static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
463
464/* protected by ifnet_fc_lock */
465RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
466RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
467RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
468
469static unsigned int ifnet_fc_zone_size;		/* sizeof ifnet_fc_entry */
470static struct zone *ifnet_fc_zone;		/* ifnet_fc_entry zone */
471
472#define	IFNET_FC_ZONE_NAME	"ifnet_fc_zone"
473#define	IFNET_FC_ZONE_MAX	 32
474
475extern void bpfdetach(struct ifnet*);
476extern void proto_input_run(void);
477
478extern uint32_t udp_count_opportunistic(unsigned int ifindex,
479	u_int32_t flags);
480extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
481	u_int32_t flags);
482
483__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
484
485#if CONFIG_MACF
486int dlil_lladdr_ckreq = 0;
487#endif
488
489#if DEBUG
490int dlil_verbose = 1;
491#else
492int dlil_verbose = 0;
493#endif /* DEBUG */
494#if IFNET_INPUT_SANITY_CHK
495/* sanity checking of input packet lists received */
496static u_int32_t dlil_input_sanity_check = 0;
497#endif /* IFNET_INPUT_SANITY_CHK */
498/* rate limit debug messages */
499struct timespec dlil_dbgrate = { 1, 0 };
500
501SYSCTL_DECL(_net_link_generic_system);
502
503#if CONFIG_MACF
504SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_lladdr_ckreq,
505	CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_lladdr_ckreq, 0,
506	"Require MACF system info check to expose link-layer address");
507#endif
508
509SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
510    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
511
512#define	IF_SNDQ_MINLEN	32
513u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
514SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
515    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
516    sysctl_sndq_maxlen, "I", "Default transmit queue max length");
517
518#define	IF_RCVQ_MINLEN	32
519#define IF_RCVQ_MAXLEN	256
520u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
521SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
522    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
523    sysctl_rcvq_maxlen, "I", "Default receive queue max length");
524
525#define	IF_RXPOLL_DECAY		2	/* ilog2 of EWMA decay rate (4) */
526static u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
527SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
528    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
529    "ilog2 of EWMA decay rate of avg inbound packets");
530
531#define	IF_RXPOLL_MODE_HOLDTIME_MIN	(10ULL * 1000 * 1000)   /* 10 ms */
532#define	IF_RXPOLL_MODE_HOLDTIME		(1000ULL * 1000 * 1000)	/* 1 sec */
533static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
534SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
535    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
536    IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
537    "Q", "input poll mode freeze time");
538
539#define	IF_RXPOLL_SAMPLETIME_MIN	(1ULL * 1000 * 1000)	/* 1 ms */
540#define	IF_RXPOLL_SAMPLETIME		(10ULL * 1000 * 1000)	/* 10 ms */
541static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
542SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
543    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
544    IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
545    "Q", "input poll sampling time");
546
547#define	IF_RXPOLL_INTERVALTIME_MIN	(1ULL * 1000)		/* 1 us */
548#define	IF_RXPOLL_INTERVALTIME		(1ULL * 1000 * 1000)	/* 1 ms */
549static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
550SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
551    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
552    IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
553    "Q", "input poll interval (time)");
554
555#define	IF_RXPOLL_INTERVAL_PKTS	0	/* 0 (disabled) */
556static u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
557SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
558    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
559    IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
560
561#define	IF_RXPOLL_WLOWAT	10
562static u_int32_t if_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
563SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
564    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_wlowat,
565    IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
566    "I", "input poll wakeup low watermark");
567
568#define	IF_RXPOLL_WHIWAT	100
569static u_int32_t if_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
570SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
571    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_whiwat,
572    IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
573    "I", "input poll wakeup high watermark");
574
575static u_int32_t if_rxpoll_max = 0;			/* 0 (automatic) */
576SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
577    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
578    "max packets per poll call");
579
580static u_int32_t if_rxpoll = 1;
581SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
582    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
583    sysctl_rxpoll, "I", "enable opportunistic input polling");
584
585u_int32_t if_bw_smoothing_val = 3;
586SYSCTL_UINT(_net_link_generic_system, OID_AUTO, if_bw_smoothing_val,
587    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_smoothing_val, 0, "");
588
589u_int32_t if_bw_measure_size = 10;
590SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_bw_measure_size,
591    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_measure_size, 0, "");
592
593static u_int32_t cur_dlil_input_threads = 0;
594SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
595    CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads , 0,
596    "Current number of DLIL input threads");
597
598#if IFNET_INPUT_SANITY_CHK
599SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
600    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check , 0,
601    "Turn on sanity checking in DLIL input");
602#endif /* IFNET_INPUT_SANITY_CHK */
603
604static u_int32_t if_flowadv = 1;
605SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
606    CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
607    "enable flow-advisory mechanism");
608
609static u_int32_t if_delaybased_queue = 1;
610SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
611    CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
612    "enable delay based dynamic queue sizing");
613
614static uint64_t hwcksum_in_invalidated = 0;
615SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
616    hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
617    &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
618
619uint32_t hwcksum_dbg = 0;
620SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
621    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
622    "enable hardware cksum debugging");
623
624#define	HWCKSUM_DBG_PARTIAL_FORCED	0x1	/* forced partial checksum */
625#define	HWCKSUM_DBG_PARTIAL_RXOFF_ADJ	0x2	/* adjust start offset */
626#define	HWCKSUM_DBG_FINALIZE_FORCED	0x10	/* forced finalize */
627#define	HWCKSUM_DBG_MASK \
628	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |	\
629	HWCKSUM_DBG_FINALIZE_FORCED)
630
631static uint32_t hwcksum_dbg_mode = 0;
632SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
633    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
634    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
635
636static uint64_t hwcksum_dbg_partial_forced = 0;
637SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
638    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
639    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");
640
641static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
642SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
643    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
644    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");
645
646static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
647SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
648    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
649    &hwcksum_dbg_partial_rxoff_forced, 0,
650    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
651    "forced partial cksum rx offset");
652
653static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
654SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
655    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
656    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
657    "adjusted partial cksum rx offset");
658
659static uint64_t hwcksum_dbg_verified = 0;
660SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
661    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
662    &hwcksum_dbg_verified, "packets verified for having good checksum");
663
664static uint64_t hwcksum_dbg_bad_cksum = 0;
665SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
666    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
667    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");
668
669static uint64_t hwcksum_dbg_bad_rxoff = 0;
670SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
671    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
672    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");
673
674static uint64_t hwcksum_dbg_adjusted = 0;
675SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
676    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
677    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");
678
679static uint64_t hwcksum_dbg_finalized_hdr = 0;
680SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
681    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
682    &hwcksum_dbg_finalized_hdr, "finalized headers");
683
684static uint64_t hwcksum_dbg_finalized_data = 0;
685SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
686    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
687    &hwcksum_dbg_finalized_data, "finalized payloads");
688
689uint32_t hwcksum_tx = 1;
690SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
691    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
692    "enable transmit hardware checksum offload");
693
694uint32_t hwcksum_rx = 1;
695SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
696    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
697    "enable receive hardware checksum offload");
698
699unsigned int net_rxpoll = 1;
700unsigned int net_affinity = 1;
701static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
702
703extern u_int32_t	inject_buckets;
704
705static	lck_grp_attr_t	*dlil_grp_attributes = NULL;
706static	lck_attr_t	*dlil_lck_attributes = NULL;
707
708
709#define	DLIL_INPUT_CHECK(m, ifp) {					\
710	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);			\
711	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||	\
712	    !(mbuf_flags(m) & MBUF_PKTHDR)) {				\
713		panic_plain("%s: invalid mbuf %p\n", __func__, m);	\
714		/* NOTREACHED */					\
715	}								\
716}
717
718#define	DLIL_EWMA(old, new, decay) do {					\
719	u_int32_t _avg;							\
720	if ((_avg = (old)) > 0)						\
721		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay);	\
722	else								\
723		_avg = (new);						\
724	(old) = _avg;							\
725} while (0)
726
727#define	MBPS	(1ULL * 1000 * 1000)
728#define	GBPS	(MBPS * 1000)
729
730struct rxpoll_time_tbl {
731	u_int64_t	speed;		/* downlink speed */
732	u_int32_t	plowat;		/* packets low watermark */
733	u_int32_t	phiwat;		/* packets high watermark */
734	u_int32_t	blowat;		/* bytes low watermark */
735	u_int32_t	bhiwat;		/* bytes high watermark */
736};
737
738static struct rxpoll_time_tbl rxpoll_tbl[] = {
739	{  10 * MBPS,	2,	8,	(1 * 1024),	(6 * 1024)	},
740	{ 100 * MBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
741	{   1 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
742	{  10 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
743	{ 100 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
744	{ 0, 0, 0, 0, 0 }
745};
746
747int
748proto_hash_value(u_int32_t protocol_family)
749{
750	/*
751	 * dlil_proto_unplumb_all() depends on the mapping between
752	 * the hash bucket index and the protocol family defined
753	 * here; future changes must be applied there as well.
754	 */
755	switch(protocol_family) {
756		case PF_INET:
757			return (0);
758		case PF_INET6:
759			return (1);
760		case PF_VLAN:
761			return (2);
762		case PF_UNSPEC:
763		default:
764			return (3);
765	}
766}
767
768/*
769 * Caller must already be holding ifnet lock.
770 */
771static struct if_proto *
772find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
773{
774	struct if_proto *proto = NULL;
775	u_int32_t i = proto_hash_value(protocol_family);
776
777	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
778
779	if (ifp->if_proto_hash != NULL)
780		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
781
782	while (proto != NULL && proto->protocol_family != protocol_family)
783		proto = SLIST_NEXT(proto, next_hash);
784
785	if (proto != NULL)
786		if_proto_ref(proto);
787
788	return (proto);
789}
790
791static void
792if_proto_ref(struct if_proto *proto)
793{
794	atomic_add_32(&proto->refcount, 1);
795}
796
797extern void if_rtproto_del(struct ifnet *ifp, int protocol);
798
799static void
800if_proto_free(struct if_proto *proto)
801{
802	u_int32_t oldval;
803	struct ifnet *ifp = proto->ifp;
804	u_int32_t proto_family = proto->protocol_family;
805	struct kev_dl_proto_data ev_pr_data;
806
807	oldval = atomic_add_32_ov(&proto->refcount, -1);
808	if (oldval > 1)
809		return;
810
811	/* No more reference on this, protocol must have been detached */
812	VERIFY(proto->detached);
813
814	if (proto->proto_kpi == kProtoKPI_v1) {
815		if (proto->kpi.v1.detached)
816			proto->kpi.v1.detached(ifp, proto->protocol_family);
817	}
818	if (proto->proto_kpi == kProtoKPI_v2) {
819		if (proto->kpi.v2.detached)
820			proto->kpi.v2.detached(ifp, proto->protocol_family);
821	}
822
823	/*
824	 * Cleanup routes that may still be in the routing table for that
825	 * interface/protocol pair.
826	 */
827	if_rtproto_del(ifp, proto_family);
828
829	/*
830	 * The reserved field carries the number of protocol still attached
831	 * (subject to change)
832	 */
833	ifnet_lock_shared(ifp);
834	ev_pr_data.proto_family = proto_family;
835	ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp);
836	ifnet_lock_done(ifp);
837
838	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
839	    (struct net_event_data *)&ev_pr_data,
840	    sizeof(struct kev_dl_proto_data));
841
842	zfree(dlif_proto_zone, proto);
843}
844
845__private_extern__ void
846ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
847{
848	unsigned int type = 0;
849	int ass = 1;
850
851	switch (what) {
852	case IFNET_LCK_ASSERT_EXCLUSIVE:
853		type = LCK_RW_ASSERT_EXCLUSIVE;
854		break;
855
856	case IFNET_LCK_ASSERT_SHARED:
857		type = LCK_RW_ASSERT_SHARED;
858		break;
859
860	case IFNET_LCK_ASSERT_OWNED:
861		type = LCK_RW_ASSERT_HELD;
862		break;
863
864	case IFNET_LCK_ASSERT_NOTOWNED:
865		/* nothing to do here for RW lock; bypass assert */
866		ass = 0;
867		break;
868
869	default:
870		panic("bad ifnet assert type: %d", what);
871		/* NOTREACHED */
872	}
873	if (ass)
874		lck_rw_assert(&ifp->if_lock, type);
875}
876
877__private_extern__ void
878ifnet_lock_shared(struct ifnet *ifp)
879{
880	lck_rw_lock_shared(&ifp->if_lock);
881}
882
883__private_extern__ void
884ifnet_lock_exclusive(struct ifnet *ifp)
885{
886	lck_rw_lock_exclusive(&ifp->if_lock);
887}
888
889__private_extern__ void
890ifnet_lock_done(struct ifnet *ifp)
891{
892	lck_rw_done(&ifp->if_lock);
893}
894
895#if INET6
896__private_extern__ void
897if_inet6data_lock_shared(struct ifnet *ifp)
898{
899	lck_rw_lock_shared(&ifp->if_inet6data_lock);
900}
901
902__private_extern__ void
903if_inet6data_lock_exclusive(struct ifnet *ifp)
904{
905	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
906}
907
908__private_extern__ void
909if_inet6data_lock_done(struct ifnet *ifp)
910{
911	lck_rw_done(&ifp->if_inet6data_lock);
912}
913#endif
914
915__private_extern__ void
916ifnet_head_lock_shared(void)
917{
918	lck_rw_lock_shared(&ifnet_head_lock);
919}
920
921__private_extern__ void
922ifnet_head_lock_exclusive(void)
923{
924	lck_rw_lock_exclusive(&ifnet_head_lock);
925}
926
927__private_extern__ void
928ifnet_head_done(void)
929{
930	lck_rw_done(&ifnet_head_lock);
931}
932
933/*
934 * Caller must already be holding ifnet lock.
935 */
936static int
937dlil_ifp_proto_count(struct ifnet * ifp)
938{
939	int i, count = 0;
940
941	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
942
943	if (ifp->if_proto_hash == NULL)
944		goto done;
945
946	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
947		struct if_proto *proto;
948		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
949			count++;
950		}
951	}
952done:
953	return (count);
954}
955
956__private_extern__ void
957dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
958    u_int32_t event_code, struct net_event_data *event_data,
959    u_int32_t event_data_len)
960{
961	struct net_event_data ev_data;
962	struct kev_msg ev_msg;
963
964	bzero(&ev_msg, sizeof (ev_msg));
965	bzero(&ev_data, sizeof (ev_data));
966	/*
967	 * a net event always starts with a net_event_data structure
968	 * but the caller can generate a simple net event or
969	 * provide a longer event structure to post
970	 */
971	ev_msg.vendor_code	= KEV_VENDOR_APPLE;
972	ev_msg.kev_class	= KEV_NETWORK_CLASS;
973	ev_msg.kev_subclass	= event_subclass;
974	ev_msg.event_code	= event_code;
975
976	if (event_data == NULL) {
977		event_data = &ev_data;
978		event_data_len = sizeof(struct net_event_data);
979	}
980
981	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
982	event_data->if_family = ifp->if_family;
983	event_data->if_unit   = (u_int32_t) ifp->if_unit;
984
985	ev_msg.dv[0].data_length = event_data_len;
986	ev_msg.dv[0].data_ptr    = event_data;
987	ev_msg.dv[1].data_length = 0;
988
989	dlil_event_internal(ifp, &ev_msg);
990}
991
992__private_extern__ int
993dlil_alloc_local_stats(struct ifnet *ifp)
994{
995	int ret = EINVAL;
996	void *buf, *base, **pbuf;
997
998	if (ifp == NULL)
999		goto end;
1000
1001	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
1002		/* allocate tcpstat_local structure */
1003		buf = zalloc(dlif_tcpstat_zone);
1004		if (buf == NULL) {
1005			ret = ENOMEM;
1006			goto end;
1007		}
1008		bzero(buf, dlif_tcpstat_bufsize);
1009
1010		/* Get the 64-bit aligned base address for this object */
1011		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
1012		    sizeof (u_int64_t));
1013		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
1014		    ((intptr_t)buf + dlif_tcpstat_bufsize));
1015
1016		/*
1017		 * Wind back a pointer size from the aligned base and
1018		 * save the original address so we can free it later.
1019		 */
1020		pbuf = (void **)((intptr_t)base - sizeof (void *));
1021		*pbuf = buf;
1022		ifp->if_tcp_stat = base;
1023
1024		/* allocate udpstat_local structure */
1025		buf = zalloc(dlif_udpstat_zone);
1026		if (buf == NULL) {
1027			ret = ENOMEM;
1028			goto end;
1029		}
1030		bzero(buf, dlif_udpstat_bufsize);
1031
1032		/* Get the 64-bit aligned base address for this object */
1033		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
1034		    sizeof (u_int64_t));
1035		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
1036		    ((intptr_t)buf + dlif_udpstat_bufsize));
1037
1038		/*
1039		 * Wind back a pointer size from the aligned base and
1040		 * save the original address so we can free it later.
1041		 */
1042		pbuf = (void **)((intptr_t)base - sizeof (void *));
1043		*pbuf = buf;
1044		ifp->if_udp_stat = base;
1045
1046		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof (u_int64_t)) &&
1047		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof (u_int64_t)));
1048
1049		ret = 0;
1050	}
1051
1052end:
1053	if (ret != 0) {
1054		if (ifp->if_tcp_stat != NULL) {
1055			pbuf = (void **)
1056			    ((intptr_t)ifp->if_tcp_stat - sizeof (void *));
1057			zfree(dlif_tcpstat_zone, *pbuf);
1058			ifp->if_tcp_stat = NULL;
1059		}
1060		if (ifp->if_udp_stat != NULL) {
1061			pbuf = (void **)
1062			    ((intptr_t)ifp->if_udp_stat - sizeof (void *));
1063			zfree(dlif_udpstat_zone, *pbuf);
1064			ifp->if_udp_stat = NULL;
1065		}
1066	}
1067
1068	return (ret);
1069}
1070
1071static int
1072dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp)
1073{
1074	thread_continue_t func;
1075	u_int32_t limit;
1076	int error;
1077
1078	/* NULL ifp indicates the main input thread, called at dlil_init time */
1079	if (ifp == NULL) {
1080		func = dlil_main_input_thread_func;
1081		VERIFY(inp == dlil_main_input_thread);
1082		(void) strlcat(inp->input_name,
1083		    "main_input", DLIL_THREADNAME_LEN);
1084	} else if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
1085		func = dlil_rxpoll_input_thread_func;
1086		VERIFY(inp != dlil_main_input_thread);
1087		(void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
1088		    "%s_input_poll", if_name(ifp));
1089	} else {
1090		func = dlil_input_thread_func;
1091		VERIFY(inp != dlil_main_input_thread);
1092		(void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
1093		    "%s_input", if_name(ifp));
1094	}
1095	VERIFY(inp->input_thr == THREAD_NULL);
1096
1097	inp->lck_grp = lck_grp_alloc_init(inp->input_name, dlil_grp_attributes);
1098	lck_mtx_init(&inp->input_lck, inp->lck_grp, dlil_lck_attributes);
1099
1100	inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
1101	inp->ifp = ifp;		/* NULL for main input thread */
1102
1103	net_timerclear(&inp->mode_holdtime);
1104	net_timerclear(&inp->mode_lasttime);
1105	net_timerclear(&inp->sample_holdtime);
1106	net_timerclear(&inp->sample_lasttime);
1107	net_timerclear(&inp->dbg_lasttime);
1108
1109	/*
1110	 * For interfaces that support opportunistic polling, set the
1111	 * low and high watermarks for outstanding inbound packets/bytes.
1112	 * Also define freeze times for transitioning between modes
1113	 * and updating the average.
1114	 */
1115	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
1116		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
1117		(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
1118	} else {
1119		limit = (u_int32_t)-1;
1120	}
1121
1122	_qinit(&inp->rcvq_pkts, Q_DROPTAIL, limit);
1123	if (inp == dlil_main_input_thread) {
1124		struct dlil_main_threading_info *inpm =
1125		    (struct dlil_main_threading_info *)inp;
1126		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit);
1127	}
1128
1129	error = kernel_thread_start(func, inp, &inp->input_thr);
1130	if (error == KERN_SUCCESS) {
1131		ml_thread_policy(inp->input_thr, MACHINE_GROUP,
1132		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR));
1133		/*
1134		 * We create an affinity set so that the matching workloop
1135		 * thread or the starter thread (for loopback) can be
1136		 * scheduled on the same processor set as the input thread.
1137		 */
1138		if (net_affinity) {
1139			struct thread *tp = inp->input_thr;
1140			u_int32_t tag;
1141			/*
1142			 * Randomize to reduce the probability
1143			 * of affinity tag namespace collision.
1144			 */
1145			read_random(&tag, sizeof (tag));
1146			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
1147				thread_reference(tp);
1148				inp->tag = tag;
1149				inp->net_affinity = TRUE;
1150			}
1151		}
1152	} else if (inp == dlil_main_input_thread) {
1153		panic_plain("%s: couldn't create main input thread", __func__);
1154		/* NOTREACHED */
1155	} else {
1156		panic_plain("%s: couldn't create %s input thread", __func__,
1157		    if_name(ifp));
1158		/* NOTREACHED */
1159	}
1160	OSAddAtomic(1, &cur_dlil_input_threads);
1161
1162	return (error);
1163}
1164
1165static void
1166dlil_terminate_input_thread(struct dlil_threading_info *inp)
1167{
1168	struct ifnet *ifp;
1169
1170	VERIFY(current_thread() == inp->input_thr);
1171	VERIFY(inp != dlil_main_input_thread);
1172
1173	OSAddAtomic(-1, &cur_dlil_input_threads);
1174
1175	lck_mtx_destroy(&inp->input_lck, inp->lck_grp);
1176	lck_grp_free(inp->lck_grp);
1177
1178	inp->input_waiting = 0;
1179	inp->wtot = 0;
1180	bzero(inp->input_name, sizeof (inp->input_name));
1181	ifp = inp->ifp;
1182	inp->ifp = NULL;
1183	VERIFY(qhead(&inp->rcvq_pkts) == NULL && qempty(&inp->rcvq_pkts));
1184	qlimit(&inp->rcvq_pkts) = 0;
1185	bzero(&inp->stats, sizeof (inp->stats));
1186
1187	VERIFY(!inp->net_affinity);
1188	inp->input_thr = THREAD_NULL;
1189	VERIFY(inp->wloop_thr == THREAD_NULL);
1190	VERIFY(inp->poll_thr == THREAD_NULL);
1191	VERIFY(inp->tag == 0);
1192
1193	inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
1194	bzero(&inp->tstats, sizeof (inp->tstats));
1195	bzero(&inp->pstats, sizeof (inp->pstats));
1196	bzero(&inp->sstats, sizeof (inp->sstats));
1197
1198	net_timerclear(&inp->mode_holdtime);
1199	net_timerclear(&inp->mode_lasttime);
1200	net_timerclear(&inp->sample_holdtime);
1201	net_timerclear(&inp->sample_lasttime);
1202	net_timerclear(&inp->dbg_lasttime);
1203
1204#if IFNET_INPUT_SANITY_CHK
1205	inp->input_mbuf_cnt = 0;
1206#endif /* IFNET_INPUT_SANITY_CHK */
1207
1208	if (dlil_verbose) {
1209		printf("%s: input thread terminated\n",
1210		    if_name(ifp));
1211	}
1212
1213	/* for the extra refcnt from kernel_thread_start() */
1214	thread_deallocate(current_thread());
1215
1216	/* this is the end */
1217	thread_terminate(current_thread());
1218	/* NOTREACHED */
1219}
1220
1221static kern_return_t
1222dlil_affinity_set(struct thread *tp, u_int32_t tag)
1223{
1224	thread_affinity_policy_data_t policy;
1225
1226	bzero(&policy, sizeof (policy));
1227	policy.affinity_tag = tag;
1228	return (thread_policy_set(tp, THREAD_AFFINITY_POLICY,
1229	    (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT));
1230}
1231
1232void
1233dlil_init(void)
1234{
1235	thread_t thread = THREAD_NULL;
1236
1237	/*
1238	 * The following fields must be 64-bit aligned for atomic operations.
1239	 */
1240	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
1241	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors)
1242	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
1243	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
1244	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
1245	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
1246	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
1247	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
1248	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
1249	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
1250	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
1251	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
1252	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
1253	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
1254	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
1255
1256	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
1257	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors)
1258	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
1259	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
1260	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
1261	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
1262	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
1263	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
1264	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
1265	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
1266	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
1267	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
1268	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
1269	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
1270	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
1271
1272	/*
1273	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
1274	 */
1275	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
1276	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
1277	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
1278	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
1279	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
1280	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
1281	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
1282	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
1283	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
1284	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
1285	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
1286	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
1287	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
1288
1289	/*
1290	 * ... as well as the mbuf checksum flags counterparts.
1291	 */
1292	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
1293	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
1294	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
1295	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
1296	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
1297	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
1298	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
1299	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
1300	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
1301	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
1302
1303	/*
1304	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
1305	 */
1306	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
1307	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
1308
1309	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
1310	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
1311	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
1312	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
1313
1314	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
1315	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
1316	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
1317
1318	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
1319	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
1320	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
1321	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
1322	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
1323	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
1324	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
1325	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
1326	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
1327	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
1328	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
1329	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
1330	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
1331	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
1332	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
1333	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
1334
1335	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
1336	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
1337	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
1338	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
1339	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
1340	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
1341
1342	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
1343	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
1344
1345	PE_parse_boot_argn("net_affinity", &net_affinity,
1346	    sizeof (net_affinity));
1347
1348	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof (net_rxpoll));
1349
1350	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof (net_rtref));
1351
1352	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof (ifnet_debug));
1353
1354	dlif_size = (ifnet_debug == 0) ? sizeof (struct dlil_ifnet) :
1355	    sizeof (struct dlil_ifnet_dbg);
1356	/* Enforce 64-bit alignment for dlil_ifnet structure */
1357	dlif_bufsize = dlif_size + sizeof (void *) + sizeof (u_int64_t);
1358	dlif_bufsize = P2ROUNDUP(dlif_bufsize, sizeof (u_int64_t));
1359	dlif_zone = zinit(dlif_bufsize, DLIF_ZONE_MAX * dlif_bufsize,
1360	    0, DLIF_ZONE_NAME);
1361	if (dlif_zone == NULL) {
1362		panic_plain("%s: failed allocating %s", __func__,
1363		    DLIF_ZONE_NAME);
1364		/* NOTREACHED */
1365	}
1366	zone_change(dlif_zone, Z_EXPAND, TRUE);
1367	zone_change(dlif_zone, Z_CALLERACCT, FALSE);
1368
1369	dlif_filt_size = sizeof (struct ifnet_filter);
1370	dlif_filt_zone = zinit(dlif_filt_size,
1371	    DLIF_FILT_ZONE_MAX * dlif_filt_size, 0, DLIF_FILT_ZONE_NAME);
1372	if (dlif_filt_zone == NULL) {
1373		panic_plain("%s: failed allocating %s", __func__,
1374		    DLIF_FILT_ZONE_NAME);
1375		/* NOTREACHED */
1376	}
1377	zone_change(dlif_filt_zone, Z_EXPAND, TRUE);
1378	zone_change(dlif_filt_zone, Z_CALLERACCT, FALSE);
1379
1380	dlif_phash_size = sizeof (struct proto_hash_entry) * PROTO_HASH_SLOTS;
1381	dlif_phash_zone = zinit(dlif_phash_size,
1382	    DLIF_PHASH_ZONE_MAX * dlif_phash_size, 0, DLIF_PHASH_ZONE_NAME);
1383	if (dlif_phash_zone == NULL) {
1384		panic_plain("%s: failed allocating %s", __func__,
1385		    DLIF_PHASH_ZONE_NAME);
1386		/* NOTREACHED */
1387	}
1388	zone_change(dlif_phash_zone, Z_EXPAND, TRUE);
1389	zone_change(dlif_phash_zone, Z_CALLERACCT, FALSE);
1390
1391	dlif_proto_size = sizeof (struct if_proto);
1392	dlif_proto_zone = zinit(dlif_proto_size,
1393	    DLIF_PROTO_ZONE_MAX * dlif_proto_size, 0, DLIF_PROTO_ZONE_NAME);
1394	if (dlif_proto_zone == NULL) {
1395		panic_plain("%s: failed allocating %s", __func__,
1396		    DLIF_PROTO_ZONE_NAME);
1397		/* NOTREACHED */
1398	}
1399	zone_change(dlif_proto_zone, Z_EXPAND, TRUE);
1400	zone_change(dlif_proto_zone, Z_CALLERACCT, FALSE);
1401
1402	dlif_tcpstat_size = sizeof (struct tcpstat_local);
1403	/* Enforce 64-bit alignment for tcpstat_local structure */
1404	dlif_tcpstat_bufsize =
1405	    dlif_tcpstat_size + sizeof (void *) + sizeof (u_int64_t);
1406	dlif_tcpstat_bufsize =
1407	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof (u_int64_t));
1408	dlif_tcpstat_zone = zinit(dlif_tcpstat_bufsize,
1409	    DLIF_TCPSTAT_ZONE_MAX * dlif_tcpstat_bufsize, 0,
1410	    DLIF_TCPSTAT_ZONE_NAME);
1411	if (dlif_tcpstat_zone == NULL) {
1412		panic_plain("%s: failed allocating %s", __func__,
1413		    DLIF_TCPSTAT_ZONE_NAME);
1414		/* NOTREACHED */
1415	}
1416	zone_change(dlif_tcpstat_zone, Z_EXPAND, TRUE);
1417	zone_change(dlif_tcpstat_zone, Z_CALLERACCT, FALSE);
1418
1419	dlif_udpstat_size = sizeof (struct udpstat_local);
1420	/* Enforce 64-bit alignment for udpstat_local structure */
1421	dlif_udpstat_bufsize =
1422	    dlif_udpstat_size + sizeof (void *) + sizeof (u_int64_t);
1423	dlif_udpstat_bufsize =
1424	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof (u_int64_t));
1425	dlif_udpstat_zone = zinit(dlif_udpstat_bufsize,
1426	    DLIF_TCPSTAT_ZONE_MAX * dlif_udpstat_bufsize, 0,
1427	    DLIF_UDPSTAT_ZONE_NAME);
1428	if (dlif_udpstat_zone == NULL) {
1429		panic_plain("%s: failed allocating %s", __func__,
1430		    DLIF_UDPSTAT_ZONE_NAME);
1431		/* NOTREACHED */
1432	}
1433	zone_change(dlif_udpstat_zone, Z_EXPAND, TRUE);
1434	zone_change(dlif_udpstat_zone, Z_CALLERACCT, FALSE);
1435
1436	ifnet_llreach_init();
1437
1438	TAILQ_INIT(&dlil_ifnet_head);
1439	TAILQ_INIT(&ifnet_head);
1440	TAILQ_INIT(&ifnet_detaching_head);
1441
1442	/* Setup the lock groups we will use */
1443	dlil_grp_attributes = lck_grp_attr_alloc_init();
1444
1445	dlil_lock_group = lck_grp_alloc_init("DLIL internal locks",
1446	    dlil_grp_attributes);
1447	ifnet_lock_group = lck_grp_alloc_init("ifnet locks",
1448	    dlil_grp_attributes);
1449	ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock",
1450	    dlil_grp_attributes);
1451	ifnet_rcv_lock_group = lck_grp_alloc_init("ifnet rcv locks",
1452	    dlil_grp_attributes);
1453	ifnet_snd_lock_group = lck_grp_alloc_init("ifnet snd locks",
1454	    dlil_grp_attributes);
1455
1456	/* Setup the lock attributes we will use */
1457	dlil_lck_attributes = lck_attr_alloc_init();
1458
1459	ifnet_lock_attr = lck_attr_alloc_init();
1460
1461	lck_rw_init(&ifnet_head_lock, ifnet_head_lock_group,
1462	    dlil_lck_attributes);
1463	lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes);
1464
1465	/* Setup interface flow control related items */
1466	lck_mtx_init(&ifnet_fc_lock, dlil_lock_group, dlil_lck_attributes);
1467
1468	ifnet_fc_zone_size = sizeof (struct ifnet_fc_entry);
1469	ifnet_fc_zone = zinit(ifnet_fc_zone_size,
1470	    IFNET_FC_ZONE_MAX * ifnet_fc_zone_size, 0, IFNET_FC_ZONE_NAME);
1471	if (ifnet_fc_zone == NULL) {
1472		panic_plain("%s: failed allocating %s", __func__,
1473		    IFNET_FC_ZONE_NAME);
1474		/* NOTREACHED */
1475	}
1476	zone_change(ifnet_fc_zone, Z_EXPAND, TRUE);
1477	zone_change(ifnet_fc_zone, Z_CALLERACCT, FALSE);
1478
1479	/* Initialize interface address subsystem */
1480	ifa_init();
1481
1482#if PF
1483	/* Initialize the packet filter */
1484	pfinit();
1485#endif /* PF */
1486
1487	/* Initialize queue algorithms */
1488	classq_init();
1489
1490	/* Initialize packet schedulers */
1491	pktsched_init();
1492
1493	/* Initialize flow advisory subsystem */
1494	flowadv_init();
1495
1496	/* Initialize the pktap virtual interface */
1497	pktap_init();
1498
1499#if DEBUG
1500	/* Run self-tests */
1501	dlil_verify_sum16();
1502#endif /* DEBUG */
1503
1504	/*
1505	 * Create and start up the main DLIL input thread and the interface
1506	 * detacher threads once everything is initialized.
1507	 */
1508	dlil_create_input_thread(NULL, dlil_main_input_thread);
1509
1510	if (kernel_thread_start(ifnet_detacher_thread_func,
1511	    NULL, &thread) != KERN_SUCCESS) {
1512		panic_plain("%s: couldn't create detacher thread", __func__);
1513		/* NOTREACHED */
1514	}
1515	thread_deallocate(thread);
1516}
1517
1518static void
1519if_flt_monitor_busy(struct ifnet *ifp)
1520{
1521	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1522
1523	++ifp->if_flt_busy;
1524	VERIFY(ifp->if_flt_busy != 0);
1525}
1526
1527static void
1528if_flt_monitor_unbusy(struct ifnet *ifp)
1529{
1530	if_flt_monitor_leave(ifp);
1531}
1532
1533static void
1534if_flt_monitor_enter(struct ifnet *ifp)
1535{
1536	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1537
1538	while (ifp->if_flt_busy) {
1539		++ifp->if_flt_waiters;
1540		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
1541		    (PZERO - 1), "if_flt_monitor", NULL);
1542	}
1543	if_flt_monitor_busy(ifp);
1544}
1545
1546static void
1547if_flt_monitor_leave(struct ifnet *ifp)
1548{
1549	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1550
1551	VERIFY(ifp->if_flt_busy != 0);
1552	--ifp->if_flt_busy;
1553
1554	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
1555		ifp->if_flt_waiters = 0;
1556		wakeup(&ifp->if_flt_head);
1557	}
1558}
1559
1560__private_extern__ int
1561dlil_attach_filter(struct ifnet	*ifp, const struct iff_filter *if_filter,
1562    interface_filter_t *filter_ref, u_int32_t flags)
1563{
1564	int retval = 0;
1565	struct ifnet_filter *filter = NULL;
1566
1567	ifnet_head_lock_shared();
1568	/* Check that the interface is in the global list */
1569	if (!ifnet_lookup(ifp)) {
1570		retval = ENXIO;
1571		goto done;
1572	}
1573
1574	filter = zalloc(dlif_filt_zone);
1575	if (filter == NULL) {
1576		retval = ENOMEM;
1577		goto done;
1578	}
1579	bzero(filter, dlif_filt_size);
1580
1581	/* refcnt held above during lookup */
1582	filter->filt_flags = flags;
1583	filter->filt_ifp = ifp;
1584	filter->filt_cookie = if_filter->iff_cookie;
1585	filter->filt_name = if_filter->iff_name;
1586	filter->filt_protocol = if_filter->iff_protocol;
1587	filter->filt_input = if_filter->iff_input;
1588	filter->filt_output = if_filter->iff_output;
1589	filter->filt_event = if_filter->iff_event;
1590	filter->filt_ioctl = if_filter->iff_ioctl;
1591	filter->filt_detached = if_filter->iff_detached;
1592
1593	lck_mtx_lock(&ifp->if_flt_lock);
1594	if_flt_monitor_enter(ifp);
1595
1596	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1597	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);
1598
1599	if_flt_monitor_leave(ifp);
1600	lck_mtx_unlock(&ifp->if_flt_lock);
1601
1602	*filter_ref = filter;
1603
1604	/*
1605	 * Bump filter count and route_generation ID to let TCP
1606	 * know it shouldn't do TSO on this connection
1607	 */
1608	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
1609		OSAddAtomic(1, &dlil_filter_disable_tso_count);
1610		routegenid_update();
1611	}
1612	if (dlil_verbose) {
1613		printf("%s: %s filter attached\n", if_name(ifp),
1614		    if_filter->iff_name);
1615	}
1616done:
1617	ifnet_head_done();
1618	if (retval != 0 && ifp != NULL) {
1619		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
1620		    if_name(ifp), if_filter->iff_name, retval);
1621	}
1622	if (retval != 0 && filter != NULL)
1623		zfree(dlif_filt_zone, filter);
1624
1625	return (retval);
1626}
1627
1628static int
1629dlil_detach_filter_internal(interface_filter_t	filter, int detached)
1630{
1631	int retval = 0;
1632
1633	if (detached == 0) {
1634		ifnet_t ifp = NULL;
1635
1636		ifnet_head_lock_shared();
1637		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
1638			interface_filter_t entry = NULL;
1639
1640			lck_mtx_lock(&ifp->if_flt_lock);
1641			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
1642				if (entry != filter || entry->filt_skip)
1643					continue;
1644				/*
1645				 * We've found a match; since it's possible
1646				 * that the thread gets blocked in the monitor,
1647				 * we do the lock dance.  Interface should
1648				 * not be detached since we still have a use
1649				 * count held during filter attach.
1650				 */
1651				entry->filt_skip = 1;	/* skip input/output */
1652				lck_mtx_unlock(&ifp->if_flt_lock);
1653				ifnet_head_done();
1654
1655				lck_mtx_lock(&ifp->if_flt_lock);
1656				if_flt_monitor_enter(ifp);
1657				lck_mtx_assert(&ifp->if_flt_lock,
1658				    LCK_MTX_ASSERT_OWNED);
1659
1660				/* Remove the filter from the list */
1661				TAILQ_REMOVE(&ifp->if_flt_head, filter,
1662				    filt_next);
1663
1664				if_flt_monitor_leave(ifp);
1665				lck_mtx_unlock(&ifp->if_flt_lock);
1666				if (dlil_verbose) {
1667					printf("%s: %s filter detached\n",
1668					    if_name(ifp), filter->filt_name);
1669				}
1670				goto destroy;
1671			}
1672			lck_mtx_unlock(&ifp->if_flt_lock);
1673		}
1674		ifnet_head_done();
1675
1676		/* filter parameter is not a valid filter ref */
1677		retval = EINVAL;
1678		goto done;
1679	}
1680
1681	if (dlil_verbose)
1682		printf("%s filter detached\n", filter->filt_name);
1683
1684destroy:
1685
1686	/* Call the detached function if there is one */
1687	if (filter->filt_detached)
1688		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
1689
1690	/* Free the filter */
1691	zfree(dlif_filt_zone, filter);
1692
1693	/*
1694	 * Decrease filter count and route_generation ID to let TCP
1695	 * know it should reevalute doing TSO or not
1696	 */
1697	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
1698		OSAddAtomic(-1, &dlil_filter_disable_tso_count);
1699		routegenid_update();
1700	}
1701done:
1702	if (retval != 0) {
1703		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
1704		    filter->filt_name, retval);
1705	}
1706	return (retval);
1707}
1708
1709__private_extern__ void
1710dlil_detach_filter(interface_filter_t filter)
1711{
1712	if (filter == NULL)
1713		return;
1714	dlil_detach_filter_internal(filter, 0);
1715}
1716
1717/*
1718 * Main input thread:
1719 *
1720 *   a) handles all inbound packets for lo0
1721 *   b) handles all inbound packets for interfaces with no dedicated
1722 *	input thread (e.g. anything but Ethernet/PDP or those that support
1723 *	opportunistic polling.)
1724 *   c) protocol registrations
1725 *   d) packet injections
1726 */
1727static void
1728dlil_main_input_thread_func(void *v, wait_result_t w)
1729{
1730#pragma unused(w)
1731	struct dlil_main_threading_info *inpm = v;
1732	struct dlil_threading_info *inp = v;
1733
1734	VERIFY(inp == dlil_main_input_thread);
1735	VERIFY(inp->ifp == NULL);
1736	VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
1737
1738	while (1) {
1739		struct mbuf *m = NULL, *m_loop = NULL;
1740		u_int32_t m_cnt, m_cnt_loop;
1741		boolean_t proto_req;
1742
1743		lck_mtx_lock_spin(&inp->input_lck);
1744
1745		/* Wait until there is work to be done */
1746		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1747			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1748			(void) msleep(&inp->input_waiting, &inp->input_lck,
1749			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1750		}
1751
1752		inp->input_waiting |= DLIL_INPUT_RUNNING;
1753		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1754
1755		/* Main input thread cannot be terminated */
1756		VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE));
1757
1758		proto_req = (inp->input_waiting &
1759		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
1760
1761		/* Packets for non-dedicated interfaces other than lo0 */
1762		m_cnt = qlen(&inp->rcvq_pkts);
1763		m = _getq_all(&inp->rcvq_pkts);
1764
1765		/* Packets exclusive to lo0 */
1766		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
1767		m_loop = _getq_all(&inpm->lo_rcvq_pkts);
1768
1769		inp->wtot = 0;
1770
1771		lck_mtx_unlock(&inp->input_lck);
1772
1773		/*
1774		* NOTE warning %%% attention !!!!
1775		* We should think about putting some thread starvation
1776		* safeguards if we deal with long chains of packets.
1777		*/
1778		if (m_loop != NULL)
1779			dlil_input_packet_list_extended(lo_ifp, m_loop,
1780			    m_cnt_loop, inp->mode);
1781
1782		if (m != NULL)
1783			dlil_input_packet_list_extended(NULL, m,
1784			    m_cnt, inp->mode);
1785
1786		if (proto_req)
1787			proto_input_run();
1788	}
1789
1790	/* NOTREACHED */
1791	VERIFY(0);	/* we should never get here */
1792}
1793
1794/*
1795 * Input thread for interfaces with legacy input model.
1796 */
1797static void
1798dlil_input_thread_func(void *v, wait_result_t w)
1799{
1800#pragma unused(w)
1801	struct dlil_threading_info *inp = v;
1802	struct ifnet *ifp = inp->ifp;
1803
1804	VERIFY(inp != dlil_main_input_thread);
1805	VERIFY(ifp != NULL);
1806	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll);
1807	VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
1808
1809	while (1) {
1810		struct mbuf *m = NULL;
1811		u_int32_t m_cnt;
1812
1813		lck_mtx_lock_spin(&inp->input_lck);
1814
1815		/* Wait until there is work to be done */
1816		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1817			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1818			(void) msleep(&inp->input_waiting, &inp->input_lck,
1819			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1820		}
1821
1822		inp->input_waiting |= DLIL_INPUT_RUNNING;
1823		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1824
1825		/*
1826		 * Protocol registration and injection must always use
1827		 * the main input thread; in theory the latter can utilize
1828		 * the corresponding input thread where the packet arrived
1829		 * on, but that requires our knowing the interface in advance
1830		 * (and the benefits might not worth the trouble.)
1831		 */
1832		VERIFY(!(inp->input_waiting &
1833		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
1834
1835		/* Packets for this interface */
1836		m_cnt = qlen(&inp->rcvq_pkts);
1837		m = _getq_all(&inp->rcvq_pkts);
1838
1839		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
1840			lck_mtx_unlock(&inp->input_lck);
1841
1842			/* Free up pending packets */
1843			if (m != NULL)
1844				mbuf_freem_list(m);
1845
1846			dlil_terminate_input_thread(inp);
1847			/* NOTREACHED */
1848			return;
1849		}
1850
1851		inp->wtot = 0;
1852
1853		dlil_input_stats_sync(ifp, inp);
1854
1855		lck_mtx_unlock(&inp->input_lck);
1856
1857		/*
1858		* NOTE warning %%% attention !!!!
1859		* We should think about putting some thread starvation
1860		* safeguards if we deal with long chains of packets.
1861		*/
1862		if (m != NULL)
1863			dlil_input_packet_list_extended(NULL, m,
1864			    m_cnt, inp->mode);
1865	}
1866
1867	/* NOTREACHED */
1868	VERIFY(0);	/* we should never get here */
1869}
1870
1871/*
1872 * Input thread for interfaces with opportunistic polling input model.
1873 */
1874static void
1875dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
1876{
1877#pragma unused(w)
1878	struct dlil_threading_info *inp = v;
1879	struct ifnet *ifp = inp->ifp;
1880	struct timespec ts;
1881
1882	VERIFY(inp != dlil_main_input_thread);
1883	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL));
1884
1885	while (1) {
1886		struct mbuf *m = NULL;
1887		u_int32_t m_cnt, m_size, poll_req = 0;
1888		ifnet_model_t mode;
1889		struct timespec now, delta;
1890		u_int64_t ival;
1891
1892		lck_mtx_lock_spin(&inp->input_lck);
1893
1894		if ((ival = inp->rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN)
1895			ival = IF_RXPOLL_INTERVALTIME_MIN;
1896
1897		/* Link parameters changed? */
1898		if (ifp->if_poll_update != 0) {
1899			ifp->if_poll_update = 0;
1900			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
1901		}
1902
1903		/* Current operating mode */
1904		mode = inp->mode;
1905
1906		/* Wait until there is work to be done */
1907		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1908			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1909			(void) msleep(&inp->input_waiting, &inp->input_lck,
1910			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1911		}
1912
1913		inp->input_waiting |= DLIL_INPUT_RUNNING;
1914		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1915
1916		/*
1917		 * Protocol registration and injection must always use
1918		 * the main input thread; in theory the latter can utilize
1919		 * the corresponding input thread where the packet arrived
1920		 * on, but that requires our knowing the interface in advance
1921		 * (and the benefits might not worth the trouble.)
1922		 */
1923		VERIFY(!(inp->input_waiting &
1924		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
1925
1926		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
1927			/* Free up pending packets */
1928			_flushq(&inp->rcvq_pkts);
1929			lck_mtx_unlock(&inp->input_lck);
1930
1931			dlil_terminate_input_thread(inp);
1932			/* NOTREACHED */
1933			return;
1934		}
1935
1936		/* Total count of all packets */
1937		m_cnt = qlen(&inp->rcvq_pkts);
1938
1939		/* Total bytes of all packets */
1940		m_size = qsize(&inp->rcvq_pkts);
1941
1942		/* Packets for this interface */
1943		m = _getq_all(&inp->rcvq_pkts);
1944		VERIFY(m != NULL || m_cnt == 0);
1945
1946		nanouptime(&now);
1947		if (!net_timerisset(&inp->sample_lasttime))
1948			*(&inp->sample_lasttime) = *(&now);
1949
1950		net_timersub(&now, &inp->sample_lasttime, &delta);
1951		if (if_rxpoll && net_timerisset(&inp->sample_holdtime)) {
1952			u_int32_t ptot, btot;
1953
1954			/* Accumulate statistics for current sampling */
1955			PKTCNTR_ADD(&inp->sstats, m_cnt, m_size);
1956
1957			if (net_timercmp(&delta, &inp->sample_holdtime, <))
1958				goto skip;
1959
1960			*(&inp->sample_lasttime) = *(&now);
1961
1962			/* Calculate min/max of inbound bytes */
1963			btot = (u_int32_t)inp->sstats.bytes;
1964			if (inp->rxpoll_bmin == 0 || inp->rxpoll_bmin > btot)
1965				inp->rxpoll_bmin = btot;
1966			if (btot > inp->rxpoll_bmax)
1967				inp->rxpoll_bmax = btot;
1968
1969			/* Calculate EWMA of inbound bytes */
1970			DLIL_EWMA(inp->rxpoll_bavg, btot, if_rxpoll_decay);
1971
1972			/* Calculate min/max of inbound packets */
1973			ptot = (u_int32_t)inp->sstats.packets;
1974			if (inp->rxpoll_pmin == 0 || inp->rxpoll_pmin > ptot)
1975				inp->rxpoll_pmin = ptot;
1976			if (ptot > inp->rxpoll_pmax)
1977				inp->rxpoll_pmax = ptot;
1978
1979			/* Calculate EWMA of inbound packets */
1980			DLIL_EWMA(inp->rxpoll_pavg, ptot, if_rxpoll_decay);
1981
1982			/* Reset sampling statistics */
1983			PKTCNTR_CLEAR(&inp->sstats);
1984
1985			/* Calculate EWMA of wakeup requests */
1986			DLIL_EWMA(inp->rxpoll_wavg, inp->wtot, if_rxpoll_decay);
1987			inp->wtot = 0;
1988
1989			if (dlil_verbose) {
1990				if (!net_timerisset(&inp->dbg_lasttime))
1991					*(&inp->dbg_lasttime) = *(&now);
1992				net_timersub(&now, &inp->dbg_lasttime, &delta);
1993				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
1994					*(&inp->dbg_lasttime) = *(&now);
1995					printf("%s: [%s] pkts avg %d max %d "
1996					    "limits [%d/%d], wreq avg %d "
1997					    "limits [%d/%d], bytes avg %d "
1998					    "limits [%d/%d]\n", if_name(ifp),
1999					    (inp->mode ==
2000					    IFNET_MODEL_INPUT_POLL_ON) ?
2001					    "ON" : "OFF", inp->rxpoll_pavg,
2002					    inp->rxpoll_pmax,
2003					    inp->rxpoll_plowat,
2004					    inp->rxpoll_phiwat,
2005					    inp->rxpoll_wavg,
2006					    inp->rxpoll_wlowat,
2007					    inp->rxpoll_whiwat,
2008					    inp->rxpoll_bavg,
2009					    inp->rxpoll_blowat,
2010					    inp->rxpoll_bhiwat);
2011				}
2012			}
2013
2014			/* Perform mode transition, if necessary */
2015			if (!net_timerisset(&inp->mode_lasttime))
2016				*(&inp->mode_lasttime) = *(&now);
2017
2018			net_timersub(&now, &inp->mode_lasttime, &delta);
2019			if (net_timercmp(&delta, &inp->mode_holdtime, <))
2020				goto skip;
2021
2022			if (inp->rxpoll_pavg <= inp->rxpoll_plowat &&
2023			    inp->rxpoll_bavg <= inp->rxpoll_blowat &&
2024			    inp->mode != IFNET_MODEL_INPUT_POLL_OFF) {
2025				mode = IFNET_MODEL_INPUT_POLL_OFF;
2026			} else if (inp->rxpoll_pavg >= inp->rxpoll_phiwat &&
2027			    (inp->rxpoll_bavg >= inp->rxpoll_bhiwat ||
2028			    inp->rxpoll_wavg >= inp->rxpoll_whiwat) &&
2029			    inp->mode != IFNET_MODEL_INPUT_POLL_ON) {
2030				mode = IFNET_MODEL_INPUT_POLL_ON;
2031			}
2032
2033			if (mode != inp->mode) {
2034				inp->mode = mode;
2035				*(&inp->mode_lasttime) = *(&now);
2036				poll_req++;
2037			}
2038		}
2039skip:
2040		dlil_input_stats_sync(ifp, inp);
2041
2042		lck_mtx_unlock(&inp->input_lck);
2043
2044		/*
2045		 * If there's a mode change and interface is still attached,
2046		 * perform a downcall to the driver for the new mode.  Also
2047		 * hold an IO refcnt on the interface to prevent it from
2048		 * being detached (will be release below.)
2049		 */
2050		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
2051			struct ifnet_model_params p = { mode, { 0 } };
2052			errno_t err;
2053
2054			if (dlil_verbose) {
2055				printf("%s: polling is now %s, "
2056				    "pkts avg %d max %d limits [%d/%d], "
2057				    "wreq avg %d limits [%d/%d], "
2058				    "bytes avg %d limits [%d/%d]\n",
2059				    if_name(ifp),
2060				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
2061				    "ON" : "OFF", inp->rxpoll_pavg,
2062				    inp->rxpoll_pmax, inp->rxpoll_plowat,
2063				    inp->rxpoll_phiwat, inp->rxpoll_wavg,
2064				    inp->rxpoll_wlowat, inp->rxpoll_whiwat,
2065				    inp->rxpoll_bavg, inp->rxpoll_blowat,
2066				    inp->rxpoll_bhiwat);
2067			}
2068
2069			if ((err = ((*ifp->if_input_ctl)(ifp,
2070			    IFNET_CTL_SET_INPUT_MODEL, sizeof (p), &p))) != 0) {
2071				printf("%s: error setting polling mode "
2072				    "to %s (%d)\n", if_name(ifp),
2073				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
2074				    "ON" : "OFF", err);
2075			}
2076
2077			switch (mode) {
2078			case IFNET_MODEL_INPUT_POLL_OFF:
2079				ifnet_set_poll_cycle(ifp, NULL);
2080				inp->rxpoll_offreq++;
2081				if (err != 0)
2082					inp->rxpoll_offerr++;
2083				break;
2084
2085			case IFNET_MODEL_INPUT_POLL_ON:
2086				net_nsectimer(&ival, &ts);
2087				ifnet_set_poll_cycle(ifp, &ts);
2088				ifnet_poll(ifp);
2089				inp->rxpoll_onreq++;
2090				if (err != 0)
2091					inp->rxpoll_onerr++;
2092				break;
2093
2094			default:
2095				VERIFY(0);
2096				/* NOTREACHED */
2097			}
2098
2099			/* Release the IO refcnt */
2100			ifnet_decr_iorefcnt(ifp);
2101		}
2102
2103		/*
2104		* NOTE warning %%% attention !!!!
2105		* We should think about putting some thread starvation
2106		* safeguards if we deal with long chains of packets.
2107		*/
2108		if (m != NULL)
2109			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
2110	}
2111
2112	/* NOTREACHED */
2113	VERIFY(0);	/* we should never get here */
2114}
2115
2116/*
2117 * Must be called on an attached ifnet (caller is expected to check.)
2118 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
2119 */
2120errno_t
2121dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
2122    boolean_t locked)
2123{
2124	struct dlil_threading_info *inp;
2125	u_int64_t sample_holdtime, inbw;
2126
2127	VERIFY(ifp != NULL);
2128	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL)
2129		return (ENXIO);
2130
2131	if (p != NULL) {
2132		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
2133		    (p->packets_lowat != 0 && p->packets_hiwat == 0))
2134			return (EINVAL);
2135		if (p->packets_lowat != 0 &&	/* hiwat must be non-zero */
2136		    p->packets_lowat >= p->packets_hiwat)
2137			return (EINVAL);
2138		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
2139		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0))
2140			return (EINVAL);
2141		if (p->bytes_lowat != 0 &&	/* hiwat must be non-zero */
2142		    p->bytes_lowat >= p->bytes_hiwat)
2143			return (EINVAL);
2144		if (p->interval_time != 0 &&
2145		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN)
2146			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
2147	}
2148
2149	if (!locked)
2150		lck_mtx_lock(&inp->input_lck);
2151
2152	lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
2153
2154	/*
2155	 * Normally, we'd reset the parameters to the auto-tuned values
2156	 * if the the input thread detects a change in link rate.  If the
2157	 * driver provides its own parameters right after a link rate
2158	 * changes, but before the input thread gets to run, we want to
2159	 * make sure to keep the driver's values.  Clearing if_poll_update
2160	 * will achieve that.
2161	 */
2162	if (p != NULL && !locked && ifp->if_poll_update != 0)
2163		ifp->if_poll_update = 0;
2164
2165	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
2166		sample_holdtime = 0;	/* polling is disabled */
2167		inp->rxpoll_wlowat = inp->rxpoll_plowat =
2168		    inp->rxpoll_blowat = 0;
2169		inp->rxpoll_whiwat = inp->rxpoll_phiwat =
2170		    inp->rxpoll_bhiwat = (u_int32_t)-1;
2171		inp->rxpoll_plim = 0;
2172		inp->rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
2173	} else {
2174		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
2175		u_int64_t ival;
2176		unsigned int n, i;
2177
2178		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
2179			if (inbw < rxpoll_tbl[i].speed)
2180				break;
2181			n = i;
2182		}
2183		/* auto-tune if caller didn't specify a value */
2184		plowat = ((p == NULL || p->packets_lowat == 0) ?
2185		    rxpoll_tbl[n].plowat : p->packets_lowat);
2186		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
2187		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
2188		blowat = ((p == NULL || p->bytes_lowat == 0) ?
2189		    rxpoll_tbl[n].blowat : p->bytes_lowat);
2190		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
2191		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
2192		plim = ((p == NULL || p->packets_limit == 0) ?
2193		    if_rxpoll_max : p->packets_limit);
2194		ival = ((p == NULL || p->interval_time == 0) ?
2195		    if_rxpoll_interval_time : p->interval_time);
2196
2197		VERIFY(plowat != 0 && phiwat != 0);
2198		VERIFY(blowat != 0 && bhiwat != 0);
2199		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);
2200
2201		sample_holdtime = if_rxpoll_sample_holdtime;
2202		inp->rxpoll_wlowat = if_rxpoll_wlowat;
2203		inp->rxpoll_whiwat = if_rxpoll_whiwat;
2204		inp->rxpoll_plowat = plowat;
2205		inp->rxpoll_phiwat = phiwat;
2206		inp->rxpoll_blowat = blowat;
2207		inp->rxpoll_bhiwat = bhiwat;
2208		inp->rxpoll_plim = plim;
2209		inp->rxpoll_ival = ival;
2210	}
2211
2212	net_nsectimer(&if_rxpoll_mode_holdtime, &inp->mode_holdtime);
2213	net_nsectimer(&sample_holdtime, &inp->sample_holdtime);
2214
2215	if (dlil_verbose) {
2216		printf("%s: speed %llu bps, sample per %llu nsec, "
2217		    "poll interval %llu nsec, pkts per poll %u, "
2218		    "pkt limits [%u/%u], wreq limits [%u/%u], "
2219		    "bytes limits [%u/%u]\n", if_name(ifp),
2220		    inbw, sample_holdtime, inp->rxpoll_ival, inp->rxpoll_plim,
2221		    inp->rxpoll_plowat, inp->rxpoll_phiwat, inp->rxpoll_wlowat,
2222		    inp->rxpoll_whiwat, inp->rxpoll_blowat, inp->rxpoll_bhiwat);
2223	}
2224
2225	if (!locked)
2226		lck_mtx_unlock(&inp->input_lck);
2227
2228	return (0);
2229}
2230
2231/*
2232 * Must be called on an attached ifnet (caller is expected to check.)
2233 */
2234errno_t
2235dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
2236{
2237	struct dlil_threading_info *inp;
2238
2239	VERIFY(ifp != NULL && p != NULL);
2240	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL)
2241		return (ENXIO);
2242
2243	bzero(p, sizeof (*p));
2244
2245	lck_mtx_lock(&inp->input_lck);
2246	p->packets_limit = inp->rxpoll_plim;
2247	p->packets_lowat = inp->rxpoll_plowat;
2248	p->packets_hiwat = inp->rxpoll_phiwat;
2249	p->bytes_lowat = inp->rxpoll_blowat;
2250	p->bytes_hiwat = inp->rxpoll_bhiwat;
2251	p->interval_time = inp->rxpoll_ival;
2252	lck_mtx_unlock(&inp->input_lck);
2253
2254	return (0);
2255}
2256
2257errno_t
2258ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
2259    const struct ifnet_stat_increment_param *s)
2260{
2261	return (ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE));
2262}
2263
2264errno_t
2265ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
2266    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
2267{
2268	return (ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE));
2269}
2270
2271static errno_t
2272ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
2273    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
2274{
2275	struct thread *tp = current_thread();
2276	struct mbuf *last;
2277	struct dlil_threading_info *inp;
2278	u_int32_t m_cnt = 0, m_size = 0;
2279
2280	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
2281		if (m_head != NULL)
2282			mbuf_freem_list(m_head);
2283		return (EINVAL);
2284	}
2285
2286	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
2287	VERIFY(m_tail == NULL || ext);
2288	VERIFY(s != NULL || !ext);
2289
2290	/*
2291	 * Drop the packet(s) if the parameters are invalid, or if the
2292	 * interface is no longer attached; else hold an IO refcnt to
2293	 * prevent it from being detached (will be released below.)
2294	 */
2295	if (ifp == NULL || (ifp != lo_ifp && !ifnet_is_attached(ifp, 1))) {
2296		if (m_head != NULL)
2297			mbuf_freem_list(m_head);
2298		return (EINVAL);
2299	}
2300
2301	if (m_tail == NULL) {
2302		last = m_head;
2303		while (m_head != NULL) {
2304#if IFNET_INPUT_SANITY_CHK
2305			if (dlil_input_sanity_check != 0)
2306				DLIL_INPUT_CHECK(last, ifp);
2307#endif /* IFNET_INPUT_SANITY_CHK */
2308			m_cnt++;
2309			m_size += m_length(last);
2310			if (mbuf_nextpkt(last) == NULL)
2311				break;
2312			last = mbuf_nextpkt(last);
2313		}
2314		m_tail = last;
2315	} else {
2316#if IFNET_INPUT_SANITY_CHK
2317		if (dlil_input_sanity_check != 0) {
2318			last = m_head;
2319			while (1) {
2320				DLIL_INPUT_CHECK(last, ifp);
2321				m_cnt++;
2322				m_size += m_length(last);
2323				if (mbuf_nextpkt(last) == NULL)
2324					break;
2325				last = mbuf_nextpkt(last);
2326			}
2327		} else {
2328			m_cnt = s->packets_in;
2329			m_size = s->bytes_in;
2330			last = m_tail;
2331		}
2332#else
2333		m_cnt = s->packets_in;
2334		m_size = s->bytes_in;
2335		last = m_tail;
2336#endif /* IFNET_INPUT_SANITY_CHK */
2337	}
2338
2339	if (last != m_tail) {
2340		panic_plain("%s: invalid input packet chain for %s, "
2341		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
2342		    m_tail, last);
2343	}
2344
2345	/*
2346	 * Assert packet count only for the extended variant, for backwards
2347	 * compatibility, since this came directly from the device driver.
2348	 * Relax this assertion for input bytes, as the driver may have
2349	 * included the link-layer headers in the computation; hence
2350	 * m_size is just an approximation.
2351	 */
2352	if (ext && s->packets_in != m_cnt) {
2353		panic_plain("%s: input packet count mismatch for %s, "
2354		    "%d instead of %d\n", __func__, if_name(ifp),
2355		    s->packets_in, m_cnt);
2356	}
2357
2358	if ((inp = ifp->if_inp) == NULL)
2359		inp = dlil_main_input_thread;
2360
2361	/*
2362	 * If there is a matching DLIL input thread associated with an
2363	 * affinity set, associate this thread with the same set.  We
2364	 * will only do this once.
2365	 */
2366	lck_mtx_lock_spin(&inp->input_lck);
2367	if (inp != dlil_main_input_thread && inp->net_affinity &&
2368	    ((!poll && inp->wloop_thr == THREAD_NULL) ||
2369	    (poll && inp->poll_thr == THREAD_NULL))) {
2370		u_int32_t tag = inp->tag;
2371
2372		if (poll) {
2373			VERIFY(inp->poll_thr == THREAD_NULL);
2374			inp->poll_thr = tp;
2375		} else {
2376			VERIFY(inp->wloop_thr == THREAD_NULL);
2377			inp->wloop_thr = tp;
2378		}
2379		lck_mtx_unlock(&inp->input_lck);
2380
2381		/* Associate the current thread with the new affinity tag */
2382		(void) dlil_affinity_set(tp, tag);
2383
2384		/*
2385		 * Take a reference on the current thread; during detach,
2386		 * we will need to refer to it in order ot tear down its
2387		 * affinity.
2388		 */
2389		thread_reference(tp);
2390		lck_mtx_lock_spin(&inp->input_lck);
2391	}
2392
2393	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));
2394
2395        /*
2396	 * Because of loopbacked multicast we cannot stuff the ifp in
2397	 * the rcvif of the packet header: loopback (lo0) packets use a
2398	 * dedicated list so that we can later associate them with lo_ifp
2399	 * on their way up the stack.  Packets for other interfaces without
2400	 * dedicated input threads go to the regular list.
2401	 */
2402	if (m_head != NULL) {
2403		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
2404			struct dlil_main_threading_info *inpm =
2405			    (struct dlil_main_threading_info *)inp;
2406			_addq_multi(&inpm->lo_rcvq_pkts, m_head, m_tail,
2407			    m_cnt, m_size);
2408		} else {
2409			_addq_multi(&inp->rcvq_pkts, m_head, m_tail,
2410			    m_cnt, m_size);
2411		}
2412	}
2413
2414#if IFNET_INPUT_SANITY_CHK
2415	if (dlil_input_sanity_check != 0) {
2416		u_int32_t count;
2417		struct mbuf *m0;
2418
2419		for (m0 = m_head, count = 0; m0; m0 = mbuf_nextpkt(m0))
2420			count++;
2421
2422		if (count != m_cnt) {
2423			panic_plain("%s: invalid packet count %d "
2424			    "(expected %d)\n", if_name(ifp),
2425			    count, m_cnt);
2426			/* NOTREACHED */
2427		}
2428
2429		inp->input_mbuf_cnt += m_cnt;
2430	}
2431#endif /* IFNET_INPUT_SANITY_CHK */
2432
2433	if (s != NULL) {
2434		dlil_input_stats_add(s, inp, poll);
2435		/*
2436		 * If we're using the main input thread, synchronize the
2437		 * stats now since we have the interface context.  All
2438		 * other cases involving dedicated input threads will
2439		 * have their stats synchronized there.
2440		 */
2441		if (inp == dlil_main_input_thread)
2442			dlil_input_stats_sync(ifp, inp);
2443	}
2444
2445	inp->input_waiting |= DLIL_INPUT_WAITING;
2446	if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
2447		inp->wtot++;
2448		wakeup_one((caddr_t)&inp->input_waiting);
2449	}
2450	lck_mtx_unlock(&inp->input_lck);
2451
2452	if (ifp != lo_ifp) {
2453		/* Release the IO refcnt */
2454		ifnet_decr_iorefcnt(ifp);
2455	}
2456
2457	return (0);
2458}
2459
2460static void
2461ifnet_start_common(struct ifnet *ifp, int resetfc)
2462{
2463	if (!(ifp->if_eflags & IFEF_TXSTART))
2464		return;
2465	/*
2466	 * If the starter thread is inactive, signal it to do work,
2467	 * unless the interface is being flow controlled from below,
2468	 * e.g. a virtual interface being flow controlled by a real
2469	 * network interface beneath it.
2470	 */
2471	lck_mtx_lock_spin(&ifp->if_start_lock);
2472	if (resetfc) {
2473		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
2474	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
2475		lck_mtx_unlock(&ifp->if_start_lock);
2476		return;
2477	}
2478	ifp->if_start_req++;
2479	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL) {
2480		wakeup_one((caddr_t)&ifp->if_start_thread);
2481	}
2482	lck_mtx_unlock(&ifp->if_start_lock);
2483}
2484
2485void
2486ifnet_start(struct ifnet *ifp)
2487{
2488	ifnet_start_common(ifp, 0);
2489}
2490
2491static void
2492ifnet_start_thread_fn(void *v, wait_result_t w)
2493{
2494#pragma unused(w)
2495	struct ifnet *ifp = v;
2496	char ifname[IFNAMSIZ + 1];
2497	struct timespec *ts = NULL;
2498	struct ifclassq *ifq = &ifp->if_snd;
2499
2500	/*
2501	 * Treat the dedicated starter thread for lo0 as equivalent to
2502	 * the driver workloop thread; if net_affinity is enabled for
2503	 * the main input thread, associate this starter thread to it
2504	 * by binding them with the same affinity tag.  This is done
2505	 * only once (as we only have one lo_ifp which never goes away.)
2506	 */
2507	if (ifp == lo_ifp) {
2508		struct dlil_threading_info *inp = dlil_main_input_thread;
2509		struct thread *tp = current_thread();
2510
2511		lck_mtx_lock(&inp->input_lck);
2512		if (inp->net_affinity) {
2513			u_int32_t tag = inp->tag;
2514
2515			VERIFY(inp->wloop_thr == THREAD_NULL);
2516			VERIFY(inp->poll_thr == THREAD_NULL);
2517			inp->wloop_thr = tp;
2518			lck_mtx_unlock(&inp->input_lck);
2519
2520			/* Associate this thread with the affinity tag */
2521			(void) dlil_affinity_set(tp, tag);
2522		} else {
2523			lck_mtx_unlock(&inp->input_lck);
2524		}
2525	}
2526
2527	snprintf(ifname, sizeof (ifname), "%s_starter",
2528	    if_name(ifp));
2529
2530	lck_mtx_lock_spin(&ifp->if_start_lock);
2531
2532	for (;;) {
2533		(void) msleep(&ifp->if_start_thread, &ifp->if_start_lock,
2534		    (PZERO - 1) | PSPIN, ifname, ts);
2535
2536		/* interface is detached? */
2537		if (ifp->if_start_thread == THREAD_NULL) {
2538			ifnet_set_start_cycle(ifp, NULL);
2539			lck_mtx_unlock(&ifp->if_start_lock);
2540			ifnet_purge(ifp);
2541
2542			if (dlil_verbose) {
2543				printf("%s: starter thread terminated\n",
2544				    if_name(ifp));
2545			}
2546
2547			/* for the extra refcnt from kernel_thread_start() */
2548			thread_deallocate(current_thread());
2549			/* this is the end */
2550			thread_terminate(current_thread());
2551			/* NOTREACHED */
2552			return;
2553		}
2554
2555		ifp->if_start_active = 1;
2556		for (;;) {
2557			u_int32_t req = ifp->if_start_req;
2558
2559			lck_mtx_unlock(&ifp->if_start_lock);
2560			/* invoke the driver's start routine */
2561			((*ifp->if_start)(ifp));
2562			lck_mtx_lock_spin(&ifp->if_start_lock);
2563
2564			/* if there's no pending request, we're done */
2565			if (req == ifp->if_start_req)
2566				break;
2567		}
2568		ifp->if_start_req = 0;
2569		ifp->if_start_active = 0;
2570		/*
2571		 * Wakeup N ns from now if rate-controlled by TBR, and if
2572		 * there are still packets in the send queue which haven't
2573		 * been dequeued so far; else sleep indefinitely (ts = NULL)
2574		 * until ifnet_start() is called again.
2575		 */
2576		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
2577		    &ifp->if_start_cycle : NULL);
2578
2579		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0)
2580			ts = NULL;
2581	}
2582
2583	/* NOTREACHED */
2584}
2585
2586void
2587ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
2588{
2589	if (ts == NULL)
2590		bzero(&ifp->if_start_cycle, sizeof (ifp->if_start_cycle));
2591	else
2592		*(&ifp->if_start_cycle) = *ts;
2593
2594	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
2595		printf("%s: restart interval set to %lu nsec\n",
2596		    if_name(ifp), ts->tv_nsec);
2597}
2598
2599static void
2600ifnet_poll(struct ifnet *ifp)
2601{
2602	/*
2603	 * If the poller thread is inactive, signal it to do work.
2604	 */
2605	lck_mtx_lock_spin(&ifp->if_poll_lock);
2606	ifp->if_poll_req++;
2607	if (!ifp->if_poll_active && ifp->if_poll_thread != THREAD_NULL) {
2608		wakeup_one((caddr_t)&ifp->if_poll_thread);
2609	}
2610	lck_mtx_unlock(&ifp->if_poll_lock);
2611}
2612
2613static void
2614ifnet_poll_thread_fn(void *v, wait_result_t w)
2615{
2616#pragma unused(w)
2617	struct dlil_threading_info *inp;
2618	struct ifnet *ifp = v;
2619	char ifname[IFNAMSIZ + 1];
2620	struct timespec *ts = NULL;
2621	struct ifnet_stat_increment_param s;
2622
2623	snprintf(ifname, sizeof (ifname), "%s_poller",
2624	    if_name(ifp));
2625	bzero(&s, sizeof (s));
2626
2627	lck_mtx_lock_spin(&ifp->if_poll_lock);
2628
2629	inp = ifp->if_inp;
2630	VERIFY(inp != NULL);
2631
2632	for (;;) {
2633		if (ifp->if_poll_thread != THREAD_NULL) {
2634			(void) msleep(&ifp->if_poll_thread, &ifp->if_poll_lock,
2635			    (PZERO - 1) | PSPIN, ifname, ts);
2636		}
2637
2638		/* interface is detached (maybe while asleep)? */
2639		if (ifp->if_poll_thread == THREAD_NULL) {
2640			ifnet_set_poll_cycle(ifp, NULL);
2641			lck_mtx_unlock(&ifp->if_poll_lock);
2642
2643			if (dlil_verbose) {
2644				printf("%s: poller thread terminated\n",
2645				    if_name(ifp));
2646			}
2647
2648			/* for the extra refcnt from kernel_thread_start() */
2649			thread_deallocate(current_thread());
2650			/* this is the end */
2651			thread_terminate(current_thread());
2652			/* NOTREACHED */
2653			return;
2654		}
2655
2656		ifp->if_poll_active = 1;
2657		for (;;) {
2658			struct mbuf *m_head, *m_tail;
2659			u_int32_t m_lim, m_cnt, m_totlen;
2660			u_int16_t req = ifp->if_poll_req;
2661
2662			lck_mtx_unlock(&ifp->if_poll_lock);
2663
2664			/*
2665			 * If no longer attached, there's nothing to do;
2666			 * else hold an IO refcnt to prevent the interface
2667			 * from being detached (will be released below.)
2668			 */
2669			if (!ifnet_is_attached(ifp, 1)) {
2670				lck_mtx_lock_spin(&ifp->if_poll_lock);
2671				break;
2672			}
2673
2674			m_lim = (inp->rxpoll_plim != 0) ? inp->rxpoll_plim :
2675			    MAX((qlimit(&inp->rcvq_pkts)),
2676			    (inp->rxpoll_phiwat << 2));
2677
2678			if (dlil_verbose > 1) {
2679				printf("%s: polling up to %d pkts, "
2680				    "pkts avg %d max %d, wreq avg %d, "
2681				    "bytes avg %d\n",
2682				    if_name(ifp), m_lim,
2683				    inp->rxpoll_pavg, inp->rxpoll_pmax,
2684				    inp->rxpoll_wavg, inp->rxpoll_bavg);
2685			}
2686
2687			/* invoke the driver's input poll routine */
2688			((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
2689			    &m_cnt, &m_totlen));
2690
2691			if (m_head != NULL) {
2692				VERIFY(m_tail != NULL && m_cnt > 0);
2693
2694				if (dlil_verbose > 1) {
2695					printf("%s: polled %d pkts, "
2696					    "pkts avg %d max %d, wreq avg %d, "
2697					    "bytes avg %d\n",
2698					    if_name(ifp), m_cnt,
2699					    inp->rxpoll_pavg, inp->rxpoll_pmax,
2700					    inp->rxpoll_wavg, inp->rxpoll_bavg);
2701				}
2702
2703				/* stats are required for extended variant */
2704				s.packets_in = m_cnt;
2705				s.bytes_in = m_totlen;
2706
2707				(void) ifnet_input_common(ifp, m_head, m_tail,
2708				    &s, TRUE, TRUE);
2709			} else {
2710				if (dlil_verbose > 1) {
2711					printf("%s: no packets, "
2712					    "pkts avg %d max %d, wreq avg %d, "
2713					    "bytes avg %d\n",
2714					    if_name(ifp), inp->rxpoll_pavg,
2715					    inp->rxpoll_pmax, inp->rxpoll_wavg,
2716					    inp->rxpoll_bavg);
2717				}
2718
2719				(void) ifnet_input_common(ifp, NULL, NULL,
2720				    NULL, FALSE, TRUE);
2721			}
2722
2723			/* Release the io ref count */
2724			ifnet_decr_iorefcnt(ifp);
2725
2726			lck_mtx_lock_spin(&ifp->if_poll_lock);
2727
2728			/* if there's no pending request, we're done */
2729			if (req == ifp->if_poll_req)
2730				break;
2731		}
2732		ifp->if_poll_req = 0;
2733		ifp->if_poll_active = 0;
2734
2735		/*
2736		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
2737		 * until ifnet_poll() is called again.
2738		 */
2739		ts = &ifp->if_poll_cycle;
2740		if (ts->tv_sec == 0 && ts->tv_nsec == 0)
2741			ts = NULL;
2742	}
2743
2744	/* NOTREACHED */
2745}
2746
2747void
2748ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
2749{
2750	if (ts == NULL)
2751		bzero(&ifp->if_poll_cycle, sizeof (ifp->if_poll_cycle));
2752	else
2753		*(&ifp->if_poll_cycle) = *ts;
2754
2755	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
2756		printf("%s: poll interval set to %lu nsec\n",
2757		    if_name(ifp), ts->tv_nsec);
2758}
2759
2760void
2761ifnet_purge(struct ifnet *ifp)
2762{
2763	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART))
2764		if_qflush(ifp, 0);
2765}
2766
2767void
2768ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
2769{
2770	IFCQ_LOCK_ASSERT_HELD(ifq);
2771
2772	if (!(IFCQ_IS_READY(ifq)))
2773		return;
2774
2775	if (IFCQ_TBR_IS_ENABLED(ifq)) {
2776		struct tb_profile tb = { ifq->ifcq_tbr.tbr_rate_raw,
2777		    ifq->ifcq_tbr.tbr_percent, 0 };
2778		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
2779	}
2780
2781	ifclassq_update(ifq, ev);
2782}
2783
2784void
2785ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
2786{
2787	switch (ev) {
2788	case CLASSQ_EV_LINK_BANDWIDTH:
2789		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL))
2790			ifp->if_poll_update++;
2791		break;
2792
2793	default:
2794		break;
2795	}
2796}
2797
2798errno_t
2799ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
2800{
2801	struct ifclassq *ifq;
2802	u_int32_t omodel;
2803	errno_t err;
2804
2805	if (ifp == NULL || (model != IFNET_SCHED_MODEL_DRIVER_MANAGED &&
2806	    model != IFNET_SCHED_MODEL_NORMAL))
2807		return (EINVAL);
2808	else if (!(ifp->if_eflags & IFEF_TXSTART))
2809		return (ENXIO);
2810
2811	ifq = &ifp->if_snd;
2812	IFCQ_LOCK(ifq);
2813	omodel = ifp->if_output_sched_model;
2814	ifp->if_output_sched_model = model;
2815	if ((err = ifclassq_pktsched_setup(ifq)) != 0)
2816		ifp->if_output_sched_model = omodel;
2817	IFCQ_UNLOCK(ifq);
2818
2819	return (err);
2820}
2821
2822errno_t
2823ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
2824{
2825	if (ifp == NULL)
2826		return (EINVAL);
2827	else if (!(ifp->if_eflags & IFEF_TXSTART))
2828		return (ENXIO);
2829
2830	ifclassq_set_maxlen(&ifp->if_snd, maxqlen);
2831
2832	return (0);
2833}
2834
2835errno_t
2836ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
2837{
2838	if (ifp == NULL || maxqlen == NULL)
2839		return (EINVAL);
2840	else if (!(ifp->if_eflags & IFEF_TXSTART))
2841		return (ENXIO);
2842
2843	*maxqlen = ifclassq_get_maxlen(&ifp->if_snd);
2844
2845	return (0);
2846}
2847
2848errno_t
2849ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
2850{
2851	errno_t err;
2852
2853	if (ifp == NULL || pkts == NULL)
2854		err = EINVAL;
2855	else if (!(ifp->if_eflags & IFEF_TXSTART))
2856		err = ENXIO;
2857	else
2858		err = ifclassq_get_len(&ifp->if_snd, MBUF_SC_UNSPEC,
2859		    pkts, NULL);
2860
2861	return (err);
2862}
2863
2864errno_t
2865ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
2866    u_int32_t *pkts, u_int32_t *bytes)
2867{
2868	errno_t err;
2869
2870	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
2871	    (pkts == NULL && bytes == NULL))
2872		err = EINVAL;
2873	else if (!(ifp->if_eflags & IFEF_TXSTART))
2874		err = ENXIO;
2875	else
2876		err = ifclassq_get_len(&ifp->if_snd, sc, pkts, bytes);
2877
2878	return (err);
2879}
2880
2881errno_t
2882ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
2883{
2884	struct dlil_threading_info *inp;
2885
2886	if (ifp == NULL)
2887		return (EINVAL);
2888	else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
2889		return (ENXIO);
2890
2891	if (maxqlen == 0)
2892		maxqlen = if_rcvq_maxlen;
2893	else if (maxqlen < IF_RCVQ_MINLEN)
2894		maxqlen = IF_RCVQ_MINLEN;
2895
2896	inp = ifp->if_inp;
2897	lck_mtx_lock(&inp->input_lck);
2898	qlimit(&inp->rcvq_pkts) = maxqlen;
2899	lck_mtx_unlock(&inp->input_lck);
2900
2901	return (0);
2902}
2903
2904errno_t
2905ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
2906{
2907	struct dlil_threading_info *inp;
2908
2909	if (ifp == NULL || maxqlen == NULL)
2910		return (EINVAL);
2911	else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
2912		return (ENXIO);
2913
2914	inp = ifp->if_inp;
2915	lck_mtx_lock(&inp->input_lck);
2916	*maxqlen = qlimit(&inp->rcvq_pkts);
2917	lck_mtx_unlock(&inp->input_lck);
2918	return (0);
2919}
2920
2921errno_t
2922ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
2923{
2924	int error;
2925
2926	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
2927	    m->m_nextpkt != NULL) {
2928		if (m != NULL)
2929			m_freem_list(m);
2930		return (EINVAL);
2931	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2932	    !(ifp->if_refflags & IFRF_ATTACHED)) {
2933		/* flag tested without lock for performance */
2934		m_freem(m);
2935		return (ENXIO);
2936	} else if (!(ifp->if_flags & IFF_UP)) {
2937		m_freem(m);
2938		return (ENETDOWN);
2939	}
2940
2941	/* enqueue the packet */
2942	error = ifclassq_enqueue(&ifp->if_snd, m);
2943
2944	/*
2945	 * Tell the driver to start dequeueing; do this even when the queue
2946	 * for the packet is suspended (EQSUSPENDED), as the driver could still
2947	 * be dequeueing from other unsuspended queues.
2948	 */
2949	if (error == 0 || error == EQFULL || error == EQSUSPENDED)
2950		ifnet_start(ifp);
2951
2952	return (error);
2953}
2954
2955errno_t
2956ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
2957{
2958	errno_t rc;
2959	if (ifp == NULL || mp == NULL)
2960		return (EINVAL);
2961	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2962	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
2963		return (ENXIO);
2964	if (!ifnet_is_attached(ifp, 1))
2965		return (ENXIO);
2966	rc = ifclassq_dequeue(&ifp->if_snd, 1, mp, NULL, NULL, NULL);
2967	ifnet_decr_iorefcnt(ifp);
2968
2969	return (rc);
2970}
2971
2972errno_t
2973ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
2974    struct mbuf **mp)
2975{
2976	errno_t rc;
2977	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc))
2978		return (EINVAL);
2979	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2980	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
2981		return (ENXIO);
2982	if (!ifnet_is_attached(ifp, 1))
2983		return (ENXIO);
2984
2985	rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, 1, mp, NULL, NULL, NULL);
2986	ifnet_decr_iorefcnt(ifp);
2987	return (rc);
2988}
2989
2990errno_t
2991ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t limit, struct mbuf **head,
2992    struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
2993{
2994	errno_t rc;
2995	if (ifp == NULL || head == NULL || limit < 1)
2996		return (EINVAL);
2997	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2998	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
2999		return (ENXIO);
3000	if (!ifnet_is_attached(ifp, 1))
3001		return (ENXIO);
3002
3003	rc = ifclassq_dequeue(&ifp->if_snd, limit, head, tail, cnt, len);
3004	ifnet_decr_iorefcnt(ifp);
3005	return (rc);
3006}
3007
3008errno_t
3009ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
3010    u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
3011    u_int32_t *len)
3012{
3013	errno_t rc;
3014	if (ifp == NULL || head == NULL || limit < 1 || !MBUF_VALID_SC(sc))
3015		return (EINVAL);
3016	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
3017	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
3018		return (ENXIO);
3019	if (!ifnet_is_attached(ifp, 1))
3020		return (ENXIO);
3021	rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, limit, head,
3022	    tail, cnt, len);
3023	ifnet_decr_iorefcnt(ifp);
3024	return (rc);
3025}
3026
3027errno_t
3028ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
3029    const struct sockaddr *dest, const char *dest_linkaddr,
3030    const char *frame_type, u_int32_t *pre, u_int32_t *post)
3031{
3032	if (pre != NULL)
3033		*pre = 0;
3034	if (post != NULL)
3035		*post = 0;
3036
3037	return (ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type));
3038}
3039
3040static int
3041dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
3042    char **frame_header_p, protocol_family_t protocol_family)
3043{
3044	struct ifnet_filter *filter;
3045
3046	/*
3047	 * Pass the inbound packet to the interface filters
3048	 */
3049	lck_mtx_lock_spin(&ifp->if_flt_lock);
3050	/* prevent filter list from changing in case we drop the lock */
3051	if_flt_monitor_busy(ifp);
3052	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3053		int result;
3054
3055		if (!filter->filt_skip && filter->filt_input != NULL &&
3056		    (filter->filt_protocol == 0 ||
3057		    filter->filt_protocol == protocol_family)) {
3058			lck_mtx_unlock(&ifp->if_flt_lock);
3059
3060			result = (*filter->filt_input)(filter->filt_cookie,
3061			    ifp, protocol_family, m_p, frame_header_p);
3062
3063			lck_mtx_lock_spin(&ifp->if_flt_lock);
3064			if (result != 0) {
3065				/* we're done with the filter list */
3066				if_flt_monitor_unbusy(ifp);
3067				lck_mtx_unlock(&ifp->if_flt_lock);
3068				return (result);
3069			}
3070		}
3071	}
3072	/* we're done with the filter list */
3073	if_flt_monitor_unbusy(ifp);
3074	lck_mtx_unlock(&ifp->if_flt_lock);
3075
3076	/*
3077	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
3078	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
3079	 */
3080	if (*m_p != NULL)
3081		(*m_p)->m_flags &= ~M_PROTO1;
3082
3083	return (0);
3084}
3085
3086static int
3087dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
3088    protocol_family_t protocol_family)
3089{
3090	struct ifnet_filter *filter;
3091
3092	/*
3093	 * Pass the outbound packet to the interface filters
3094	 */
3095	lck_mtx_lock_spin(&ifp->if_flt_lock);
3096	/* prevent filter list from changing in case we drop the lock */
3097	if_flt_monitor_busy(ifp);
3098	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3099		int result;
3100
3101		if (!filter->filt_skip && filter->filt_output != NULL &&
3102		    (filter->filt_protocol == 0 ||
3103		    filter->filt_protocol == protocol_family)) {
3104			lck_mtx_unlock(&ifp->if_flt_lock);
3105
3106			result = filter->filt_output(filter->filt_cookie, ifp,
3107			    protocol_family, m_p);
3108
3109			lck_mtx_lock_spin(&ifp->if_flt_lock);
3110			if (result != 0) {
3111				/* we're done with the filter list */
3112				if_flt_monitor_unbusy(ifp);
3113				lck_mtx_unlock(&ifp->if_flt_lock);
3114				return (result);
3115			}
3116		}
3117	}
3118	/* we're done with the filter list */
3119	if_flt_monitor_unbusy(ifp);
3120	lck_mtx_unlock(&ifp->if_flt_lock);
3121
3122	return (0);
3123}
3124
3125static void
3126dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
3127{
3128	int error;
3129
3130	if (ifproto->proto_kpi == kProtoKPI_v1) {
3131		/* Version 1 protocols get one packet at a time */
3132		while (m != NULL) {
3133			char *	frame_header;
3134			mbuf_t	next_packet;
3135
3136			next_packet = m->m_nextpkt;
3137			m->m_nextpkt = NULL;
3138			frame_header = m->m_pkthdr.pkt_hdr;
3139			m->m_pkthdr.pkt_hdr = NULL;
3140			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
3141			    ifproto->protocol_family, m, frame_header);
3142			if (error != 0 && error != EJUSTRETURN)
3143				m_freem(m);
3144			m = next_packet;
3145		}
3146	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
3147		/* Version 2 protocols support packet lists */
3148		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
3149		    ifproto->protocol_family, m);
3150		if (error != 0 && error != EJUSTRETURN)
3151			m_freem_list(m);
3152	}
3153	return;
3154}
3155
3156static void
3157dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
3158    struct dlil_threading_info *inp, boolean_t poll)
3159{
3160	struct ifnet_stat_increment_param *d = &inp->stats;
3161
3162	if (s->packets_in != 0)
3163		d->packets_in += s->packets_in;
3164	if (s->bytes_in != 0)
3165		d->bytes_in += s->bytes_in;
3166	if (s->errors_in != 0)
3167		d->errors_in += s->errors_in;
3168
3169	if (s->packets_out != 0)
3170		d->packets_out += s->packets_out;
3171	if (s->bytes_out != 0)
3172		d->bytes_out += s->bytes_out;
3173	if (s->errors_out != 0)
3174		d->errors_out += s->errors_out;
3175
3176	if (s->collisions != 0)
3177		d->collisions += s->collisions;
3178	if (s->dropped != 0)
3179		d->dropped += s->dropped;
3180
3181	if (poll)
3182		PKTCNTR_ADD(&inp->tstats, s->packets_in, s->bytes_in);
3183}
3184
3185static void
3186dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
3187{
3188	struct ifnet_stat_increment_param *s = &inp->stats;
3189
3190	/*
3191	 * Use of atomic operations is unavoidable here because
3192	 * these stats may also be incremented elsewhere via KPIs.
3193	 */
3194	if (s->packets_in != 0) {
3195		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
3196		s->packets_in = 0;
3197	}
3198	if (s->bytes_in != 0) {
3199		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
3200		s->bytes_in = 0;
3201	}
3202	if (s->errors_in != 0) {
3203		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
3204		s->errors_in = 0;
3205	}
3206
3207	if (s->packets_out != 0) {
3208		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
3209		s->packets_out = 0;
3210	}
3211	if (s->bytes_out != 0) {
3212		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
3213		s->bytes_out = 0;
3214	}
3215	if (s->errors_out != 0) {
3216		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
3217		s->errors_out = 0;
3218	}
3219
3220	if (s->collisions != 0) {
3221		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
3222		s->collisions = 0;
3223	}
3224	if (s->dropped != 0) {
3225		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
3226		s->dropped = 0;
3227	}
3228	/*
3229	 * If we went over the threshold, notify NetworkStatistics.
3230	 */
3231	if (ifp->if_data_threshold &&
3232	    (ifp->if_ibytes + ifp->if_obytes) - ifp->if_dt_bytes >
3233	    ifp->if_data_threshold) {
3234		ifp->if_dt_bytes = ifp->if_ibytes + ifp->if_obytes;
3235		nstat_ifnet_threshold_reached(ifp->if_index);
3236	}
3237	/*
3238	 * No need for atomic operations as they are modified here
3239	 * only from within the DLIL input thread context.
3240	 */
3241	if (inp->tstats.packets != 0) {
3242		inp->pstats.ifi_poll_packets += inp->tstats.packets;
3243		inp->tstats.packets = 0;
3244	}
3245	if (inp->tstats.bytes != 0) {
3246		inp->pstats.ifi_poll_bytes += inp->tstats.bytes;
3247		inp->tstats.bytes = 0;
3248	}
3249}
3250
3251__private_extern__ void
3252dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
3253{
3254	return (dlil_input_packet_list_common(ifp, m, 0,
3255	    IFNET_MODEL_INPUT_POLL_OFF, FALSE));
3256}
3257
3258__private_extern__ void
3259dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
3260    u_int32_t cnt, ifnet_model_t mode)
3261{
3262	return (dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE));
3263}
3264
3265static void
3266dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
3267    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
3268{
3269	int				error = 0;
3270	protocol_family_t		protocol_family;
3271	mbuf_t				next_packet;
3272	ifnet_t				ifp = ifp_param;
3273	char *				frame_header;
3274	struct if_proto	*		last_ifproto = NULL;
3275	mbuf_t				pkt_first = NULL;
3276	mbuf_t *			pkt_next = NULL;
3277	u_int32_t			poll_thresh = 0, poll_ival = 0;
3278
3279	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0);
3280
3281	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
3282	    (poll_ival = if_rxpoll_interval_pkts) > 0)
3283		poll_thresh = cnt;
3284
3285	while (m != NULL) {
3286		struct if_proto *ifproto = NULL;
3287		int iorefcnt = 0;
3288		uint32_t pktf_mask;	/* pkt flags to preserve */
3289
3290		if (ifp_param == NULL)
3291			ifp = m->m_pkthdr.rcvif;
3292
3293		if ((ifp->if_eflags & IFEF_RXPOLL) && poll_thresh != 0 &&
3294		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0)
3295			ifnet_poll(ifp);
3296
3297		/* Check if this mbuf looks valid */
3298		MBUF_INPUT_CHECK(m, ifp);
3299
3300		next_packet = m->m_nextpkt;
3301		m->m_nextpkt = NULL;
3302		frame_header = m->m_pkthdr.pkt_hdr;
3303		m->m_pkthdr.pkt_hdr = NULL;
3304
3305		/*
3306		 * Get an IO reference count if the interface is not
3307		 * loopback (lo0) and it is attached; lo0 never goes
3308		 * away, so optimize for that.
3309		 */
3310		if (ifp != lo_ifp) {
3311			if (!ifnet_is_attached(ifp, 1)) {
3312				m_freem(m);
3313				goto next;
3314			}
3315			iorefcnt = 1;
3316			pktf_mask = 0;
3317		} else {
3318			/*
3319			 * If this arrived on lo0, preserve interface addr
3320			 * info to allow for connectivity between loopback
3321			 * and local interface addresses.
3322			 */
3323			pktf_mask = (PKTF_LOOP|PKTF_IFAINFO);
3324		}
3325
3326		/* make sure packet comes in clean */
3327		m_classifier_init(m, pktf_mask);
3328
3329		ifp_inc_traffic_class_in(ifp, m);
3330
3331		/* find which protocol family this packet is for */
3332		ifnet_lock_shared(ifp);
3333		error = (*ifp->if_demux)(ifp, m, frame_header,
3334		    &protocol_family);
3335		ifnet_lock_done(ifp);
3336		if (error != 0) {
3337			if (error == EJUSTRETURN)
3338				goto next;
3339			protocol_family = 0;
3340		}
3341
3342		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
3343		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3344			dlil_input_cksum_dbg(ifp, m, frame_header,
3345			    protocol_family);
3346
3347		/*
3348		 * For partial checksum offload, we expect the driver to
3349		 * set the start offset indicating the start of the span
3350		 * that is covered by the hardware-computed checksum;
3351		 * adjust this start offset accordingly because the data
3352		 * pointer has been advanced beyond the link-layer header.
3353		 *
3354		 * Don't adjust if the interface is a bridge member, as
3355		 * the adjustment will occur from the context of the
3356		 * bridge interface during input.
3357		 */
3358		if (ifp->if_bridge == NULL && (m->m_pkthdr.csum_flags &
3359		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
3360		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
3361			int adj;
3362
3363			if (frame_header == NULL ||
3364			    frame_header < (char *)mbuf_datastart(m) ||
3365			    frame_header > (char *)m->m_data ||
3366			    (adj = (m->m_data - frame_header)) >
3367			    m->m_pkthdr.csum_rx_start) {
3368				m->m_pkthdr.csum_data = 0;
3369				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
3370				hwcksum_in_invalidated++;
3371			} else {
3372				m->m_pkthdr.csum_rx_start -= adj;
3373			}
3374		}
3375
3376		pktap_input(ifp, protocol_family, m, frame_header);
3377
3378		if (m->m_flags & (M_BCAST|M_MCAST))
3379			atomic_add_64(&ifp->if_imcasts, 1);
3380
3381		/* run interface filters, exclude VLAN packets PR-3586856 */
3382		if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
3383			error = dlil_interface_filters_input(ifp, &m,
3384			    &frame_header, protocol_family);
3385			if (error != 0) {
3386				if (error != EJUSTRETURN)
3387					m_freem(m);
3388				goto next;
3389			}
3390		}
3391		if (error != 0 || ((m->m_flags & M_PROMISC) != 0) ) {
3392			m_freem(m);
3393			goto next;
3394		}
3395
3396		/* Lookup the protocol attachment to this interface */
3397		if (protocol_family == 0) {
3398			ifproto = NULL;
3399		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
3400		    (last_ifproto->protocol_family == protocol_family)) {
3401			VERIFY(ifproto == NULL);
3402			ifproto = last_ifproto;
3403			if_proto_ref(last_ifproto);
3404		} else {
3405			VERIFY(ifproto == NULL);
3406			ifnet_lock_shared(ifp);
3407			/* callee holds a proto refcnt upon success */
3408			ifproto	= find_attached_proto(ifp, protocol_family);
3409			ifnet_lock_done(ifp);
3410		}
3411		if (ifproto == NULL) {
3412			/* no protocol for this packet, discard */
3413			m_freem(m);
3414			goto next;
3415		}
3416		if (ifproto != last_ifproto) {
3417			if (last_ifproto != NULL) {
3418				/* pass up the list for the previous protocol */
3419				dlil_ifproto_input(last_ifproto, pkt_first);
3420				pkt_first = NULL;
3421				if_proto_free(last_ifproto);
3422			}
3423			last_ifproto = ifproto;
3424			if_proto_ref(ifproto);
3425		}
3426		/* extend the list */
3427		m->m_pkthdr.pkt_hdr = frame_header;
3428		if (pkt_first == NULL) {
3429			pkt_first = m;
3430		} else {
3431			*pkt_next = m;
3432		}
3433		pkt_next = &m->m_nextpkt;
3434
3435next:
3436		if (next_packet == NULL && last_ifproto != NULL) {
3437			/* pass up the last list of packets */
3438			dlil_ifproto_input(last_ifproto, pkt_first);
3439			if_proto_free(last_ifproto);
3440			last_ifproto = NULL;
3441		}
3442		if (ifproto != NULL) {
3443			if_proto_free(ifproto);
3444			ifproto = NULL;
3445		}
3446
3447		m = next_packet;
3448
3449		/* update the driver's multicast filter, if needed */
3450		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0)
3451			ifp->if_updatemcasts = 0;
3452		if (iorefcnt == 1)
3453			ifnet_decr_iorefcnt(ifp);
3454	}
3455
3456	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0);
3457}
3458
3459errno_t
3460if_mcasts_update(struct ifnet *ifp)
3461{
3462	errno_t err;
3463
3464	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
3465	if (err == EAFNOSUPPORT)
3466		err = 0;
3467	printf("%s: %s %d suspended link-layer multicast membership(s) "
3468	    "(err=%d)\n", if_name(ifp),
3469	    (err == 0 ? "successfully restored" : "failed to restore"),
3470	    ifp->if_updatemcasts, err);
3471
3472	/* just return success */
3473	return (0);
3474}
3475
3476static int
3477dlil_event_internal(struct ifnet *ifp, struct kev_msg *event)
3478{
3479	struct ifnet_filter *filter;
3480
3481	/* Get an io ref count if the interface is attached */
3482	if (!ifnet_is_attached(ifp, 1))
3483		goto done;
3484
3485	/*
3486	 * Pass the event to the interface filters
3487	 */
3488	lck_mtx_lock_spin(&ifp->if_flt_lock);
3489	/* prevent filter list from changing in case we drop the lock */
3490	if_flt_monitor_busy(ifp);
3491	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3492		if (filter->filt_event != NULL) {
3493			lck_mtx_unlock(&ifp->if_flt_lock);
3494
3495			filter->filt_event(filter->filt_cookie, ifp,
3496			    filter->filt_protocol, event);
3497
3498			lck_mtx_lock_spin(&ifp->if_flt_lock);
3499		}
3500	}
3501	/* we're done with the filter list */
3502	if_flt_monitor_unbusy(ifp);
3503	lck_mtx_unlock(&ifp->if_flt_lock);
3504
3505	ifnet_lock_shared(ifp);
3506	if (ifp->if_proto_hash != NULL) {
3507		int i;
3508
3509		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
3510			struct if_proto *proto;
3511
3512			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
3513			    next_hash) {
3514				proto_media_event eventp =
3515				    (proto->proto_kpi == kProtoKPI_v1 ?
3516				    proto->kpi.v1.event :
3517				    proto->kpi.v2.event);
3518
3519				if (eventp != NULL) {
3520					if_proto_ref(proto);
3521					ifnet_lock_done(ifp);
3522
3523					eventp(ifp, proto->protocol_family,
3524					    event);
3525
3526					ifnet_lock_shared(ifp);
3527					if_proto_free(proto);
3528				}
3529			}
3530		}
3531	}
3532	ifnet_lock_done(ifp);
3533
3534	/* Pass the event to the interface */
3535	if (ifp->if_event != NULL)
3536		ifp->if_event(ifp, event);
3537
3538	/* Release the io ref count */
3539	ifnet_decr_iorefcnt(ifp);
3540
3541done:
3542	return (kev_post_msg(event));
3543}
3544
3545errno_t
3546ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
3547{
3548	struct kev_msg               kev_msg;
3549	int result = 0;
3550
3551	if (ifp == NULL || event == NULL)
3552		return (EINVAL);
3553
3554	bzero(&kev_msg, sizeof (kev_msg));
3555	kev_msg.vendor_code    = event->vendor_code;
3556	kev_msg.kev_class      = event->kev_class;
3557	kev_msg.kev_subclass   = event->kev_subclass;
3558	kev_msg.event_code     = event->event_code;
3559	kev_msg.dv[0].data_ptr = &event->event_data[0];
3560	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
3561	kev_msg.dv[1].data_length = 0;
3562
3563	result = dlil_event_internal(ifp, &kev_msg);
3564
3565	return (result);
3566}
3567
3568#if CONFIG_MACF_NET
3569#include <netinet/ip6.h>
3570#include <netinet/ip.h>
3571static int
3572dlil_get_socket_type(struct mbuf **mp, int family, int raw)
3573{
3574	struct mbuf *m;
3575	struct ip *ip;
3576	struct ip6_hdr *ip6;
3577	int type = SOCK_RAW;
3578
3579	if (!raw) {
3580		switch (family) {
3581		case PF_INET:
3582			m = m_pullup(*mp, sizeof(struct ip));
3583			if (m == NULL)
3584				break;
3585			*mp = m;
3586			ip = mtod(m, struct ip *);
3587			if (ip->ip_p == IPPROTO_TCP)
3588				type = SOCK_STREAM;
3589			else if (ip->ip_p == IPPROTO_UDP)
3590				type = SOCK_DGRAM;
3591			break;
3592		case PF_INET6:
3593			m = m_pullup(*mp, sizeof(struct ip6_hdr));
3594			if (m == NULL)
3595				break;
3596			*mp = m;
3597			ip6 = mtod(m, struct ip6_hdr *);
3598			if (ip6->ip6_nxt == IPPROTO_TCP)
3599				type = SOCK_STREAM;
3600			else if (ip6->ip6_nxt == IPPROTO_UDP)
3601				type = SOCK_DGRAM;
3602			break;
3603		}
3604	}
3605
3606	return (type);
3607}
3608#endif
3609
3610/*
3611 * This is mostly called from the context of the DLIL input thread;
3612 * because of that there is no need for atomic operations.
3613 */
3614static __inline void
3615ifp_inc_traffic_class_in(struct ifnet *ifp, struct mbuf *m)
3616{
3617	if (!(m->m_flags & M_PKTHDR))
3618		return;
3619
3620	switch (m_get_traffic_class(m)) {
3621	case MBUF_TC_BE:
3622		ifp->if_tc.ifi_ibepackets++;
3623		ifp->if_tc.ifi_ibebytes += m->m_pkthdr.len;
3624		break;
3625	case MBUF_TC_BK:
3626		ifp->if_tc.ifi_ibkpackets++;
3627		ifp->if_tc.ifi_ibkbytes += m->m_pkthdr.len;
3628		break;
3629	case MBUF_TC_VI:
3630		ifp->if_tc.ifi_ivipackets++;
3631		ifp->if_tc.ifi_ivibytes += m->m_pkthdr.len;
3632		break;
3633	case MBUF_TC_VO:
3634		ifp->if_tc.ifi_ivopackets++;
3635		ifp->if_tc.ifi_ivobytes += m->m_pkthdr.len;
3636		break;
3637	default:
3638		break;
3639	}
3640
3641	if (mbuf_is_traffic_class_privileged(m)) {
3642		ifp->if_tc.ifi_ipvpackets++;
3643		ifp->if_tc.ifi_ipvbytes += m->m_pkthdr.len;
3644	}
3645}
3646
3647/*
3648 * This is called from DLIL output, hence multiple threads could end
3649 * up modifying the statistics.  We trade off acccuracy for performance
3650 * by not using atomic operations here.
3651 */
3652static __inline void
3653ifp_inc_traffic_class_out(struct ifnet *ifp, struct mbuf *m)
3654{
3655	if (!(m->m_flags & M_PKTHDR))
3656		return;
3657
3658	switch (m_get_traffic_class(m)) {
3659	case MBUF_TC_BE:
3660		ifp->if_tc.ifi_obepackets++;
3661		ifp->if_tc.ifi_obebytes += m->m_pkthdr.len;
3662		break;
3663	case MBUF_TC_BK:
3664		ifp->if_tc.ifi_obkpackets++;
3665		ifp->if_tc.ifi_obkbytes += m->m_pkthdr.len;
3666		break;
3667	case MBUF_TC_VI:
3668		ifp->if_tc.ifi_ovipackets++;
3669		ifp->if_tc.ifi_ovibytes += m->m_pkthdr.len;
3670		break;
3671	case MBUF_TC_VO:
3672		ifp->if_tc.ifi_ovopackets++;
3673		ifp->if_tc.ifi_ovobytes += m->m_pkthdr.len;
3674		break;
3675	default:
3676		break;
3677	}
3678
3679	if (mbuf_is_traffic_class_privileged(m)) {
3680		ifp->if_tc.ifi_opvpackets++;
3681		ifp->if_tc.ifi_opvbytes += m->m_pkthdr.len;
3682	}
3683}
3684
3685/*
3686 * dlil_output
3687 *
3688 * Caller should have a lock on the protocol domain if the protocol
3689 * doesn't support finer grained locking. In most cases, the lock
3690 * will be held from the socket layer and won't be released until
3691 * we return back to the socket layer.
3692 *
3693 * This does mean that we must take a protocol lock before we take
3694 * an interface lock if we're going to take both. This makes sense
3695 * because a protocol is likely to interact with an ifp while it
3696 * is under the protocol lock.
3697 *
3698 * An advisory code will be returned if adv is not null. This
3699 * can be used to provide feedback about interface queues to the
3700 * application.
3701 */
3702errno_t
3703dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
3704    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
3705{
3706	char *frame_type = NULL;
3707	char *dst_linkaddr = NULL;
3708	int retval = 0;
3709	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
3710	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
3711	struct if_proto	*proto = NULL;
3712	mbuf_t	m;
3713	mbuf_t	send_head = NULL;
3714	mbuf_t	*send_tail = &send_head;
3715	int iorefcnt = 0;
3716	u_int32_t pre = 0, post = 0;
3717	u_int32_t fpkts = 0, fbytes = 0;
3718	int32_t flen = 0;
3719
3720	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
3721
3722	/* Get an io refcnt if the interface is attached to prevent ifnet_detach
3723	 * from happening while this operation is in progress */
3724	if (!ifnet_is_attached(ifp, 1)) {
3725		retval = ENXIO;
3726		goto cleanup;
3727	}
3728	iorefcnt = 1;
3729
3730	/* update the driver's multicast filter, if needed */
3731	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0)
3732		ifp->if_updatemcasts = 0;
3733
3734	frame_type = frame_type_buffer;
3735	dst_linkaddr = dst_linkaddr_buffer;
3736
3737	if (raw == 0) {
3738		ifnet_lock_shared(ifp);
3739		/* callee holds a proto refcnt upon success */
3740		proto = find_attached_proto(ifp, proto_family);
3741		if (proto == NULL) {
3742			ifnet_lock_done(ifp);
3743			retval = ENXIO;
3744			goto cleanup;
3745		}
3746		ifnet_lock_done(ifp);
3747	}
3748
3749preout_again:
3750	if (packetlist == NULL)
3751		goto cleanup;
3752
3753	m = packetlist;
3754	packetlist = packetlist->m_nextpkt;
3755	m->m_nextpkt = NULL;
3756
3757	if (raw == 0) {
3758		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
3759		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
3760		retval = 0;
3761		if (preoutp != NULL) {
3762			retval = preoutp(ifp, proto_family, &m, dest, route,
3763			    frame_type, dst_linkaddr);
3764
3765			if (retval != 0) {
3766				if (retval == EJUSTRETURN)
3767					goto preout_again;
3768				m_freem(m);
3769				goto cleanup;
3770			}
3771		}
3772	}
3773
3774#if CONFIG_MACF_NET
3775	retval = mac_ifnet_check_transmit(ifp, m, proto_family,
3776	    dlil_get_socket_type(&m, proto_family, raw));
3777	if (retval != 0) {
3778		m_freem(m);
3779		goto cleanup;
3780	}
3781#endif
3782
3783	do {
3784#if CONFIG_DTRACE
3785		if (!raw && proto_family == PF_INET) {
3786			struct ip *ip = mtod(m, struct ip*);
3787	                DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
3788				struct ip *, ip, struct ifnet *, ifp,
3789				struct ip *, ip, struct ip6_hdr *, NULL);
3790
3791		} else if (!raw && proto_family == PF_INET6) {
3792			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*);
3793			DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL,
3794				struct ip6_hdr *, ip6, struct ifnet*, ifp,
3795				struct ip*, NULL, struct ip6_hdr *, ip6);
3796		}
3797#endif /* CONFIG_DTRACE */
3798
3799		if (raw == 0 && ifp->if_framer != NULL) {
3800			int rcvif_set = 0;
3801
3802			/*
3803			 * If this is a broadcast packet that needs to be
3804			 * looped back into the system, set the inbound ifp
3805			 * to that of the outbound ifp.  This will allow
3806			 * us to determine that it is a legitimate packet
3807			 * for the system.  Only set the ifp if it's not
3808			 * already set, just to be safe.
3809			 */
3810			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
3811			    m->m_pkthdr.rcvif == NULL) {
3812				m->m_pkthdr.rcvif = ifp;
3813				rcvif_set = 1;
3814			}
3815
3816			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
3817			    frame_type, &pre, &post);
3818			if (retval != 0) {
3819				if (retval != EJUSTRETURN)
3820					m_freem(m);
3821				goto next;
3822			}
3823
3824			/*
3825			 * For partial checksum offload, adjust the start
3826			 * and stuff offsets based on the prepended header.
3827			 */
3828			if ((m->m_pkthdr.csum_flags &
3829			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
3830			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
3831				m->m_pkthdr.csum_tx_stuff += pre;
3832				m->m_pkthdr.csum_tx_start += pre;
3833			}
3834
3835			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK))
3836				dlil_output_cksum_dbg(ifp, m, pre,
3837				    proto_family);
3838
3839			/*
3840			 * Clear the ifp if it was set above, and to be
3841			 * safe, only if it is still the same as the
3842			 * outbound ifp we have in context.  If it was
3843			 * looped back, then a copy of it was sent to the
3844			 * loopback interface with the rcvif set, and we
3845			 * are clearing the one that will go down to the
3846			 * layer below.
3847			 */
3848			if (rcvif_set && m->m_pkthdr.rcvif == ifp)
3849				m->m_pkthdr.rcvif = NULL;
3850		}
3851
3852		/*
3853		 * Let interface filters (if any) do their thing ...
3854		 */
3855		/* Do not pass VLAN tagged packets to filters PR-3586856 */
3856		if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
3857			retval = dlil_interface_filters_output(ifp,
3858			    &m, proto_family);
3859			if (retval != 0) {
3860				if (retval != EJUSTRETURN)
3861					m_freem(m);
3862				goto next;
3863			}
3864		}
3865		/*
3866		 * Strip away M_PROTO1 bit prior to sending packet
3867		 * to the driver as this field may be used by the driver
3868		 */
3869		m->m_flags &= ~M_PROTO1;
3870
3871		/*
3872		 * If the underlying interface is not capable of handling a
3873		 * packet whose data portion spans across physically disjoint
3874		 * pages, we need to "normalize" the packet so that we pass
3875		 * down a chain of mbufs where each mbuf points to a span that
3876		 * resides in the system page boundary.  If the packet does
3877		 * not cross page(s), the following is a no-op.
3878		 */
3879		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
3880			if ((m = m_normalize(m)) == NULL)
3881				goto next;
3882		}
3883
3884		/*
3885		 * If this is a TSO packet, make sure the interface still
3886		 * advertise TSO capability.
3887		 */
3888		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
3889			retval = EMSGSIZE;
3890			m_freem(m);
3891			goto cleanup;
3892		}
3893
3894		/*
3895		 * If the packet service class is not background,
3896		 * update the timestamp to indicate recent activity
3897		 * on a foreground socket.
3898		 */
3899		if (!(m->m_pkthdr.pkt_flags & PKTF_SO_BACKGROUND) &&
3900		    (m->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
3901		    m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB)
3902			ifp->if_fg_sendts = net_uptime();
3903
3904		ifp_inc_traffic_class_out(ifp, m);
3905		pktap_output(ifp, proto_family, m, pre, post);
3906
3907		/*
3908		 * Finally, call the driver.
3909		 */
3910		if (ifp->if_eflags & IFEF_SENDLIST) {
3911			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
3912				flen += (m_pktlen(m) - (pre + post));
3913				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
3914			}
3915			*send_tail = m;
3916			send_tail = &m->m_nextpkt;
3917		} else {
3918			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
3919				flen = (m_pktlen(m) - (pre + post));
3920				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
3921			} else {
3922				flen = 0;
3923			}
3924			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
3925			    0, 0, 0, 0, 0);
3926			retval = (*ifp->if_output)(ifp, m);
3927			if (retval == EQFULL || retval == EQSUSPENDED) {
3928				if (adv != NULL && adv->code == FADV_SUCCESS) {
3929					adv->code = (retval == EQFULL ?
3930					    FADV_FLOW_CONTROLLED :
3931					    FADV_SUSPENDED);
3932				}
3933				retval = 0;
3934			}
3935			if (retval == 0 && flen > 0) {
3936				fbytes += flen;
3937				fpkts++;
3938			}
3939			if (retval != 0 && dlil_verbose) {
3940				printf("%s: output error on %s retval = %d\n",
3941				    __func__, if_name(ifp),
3942				    retval);
3943			}
3944			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
3945			    0, 0, 0, 0, 0);
3946		}
3947		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3948
3949next:
3950		m = packetlist;
3951		if (m != NULL) {
3952			packetlist = packetlist->m_nextpkt;
3953			m->m_nextpkt = NULL;
3954		}
3955	} while (m != NULL);
3956
3957	if (send_head != NULL) {
3958		VERIFY(ifp->if_eflags & IFEF_SENDLIST);
3959		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
3960		    0, 0, 0, 0, 0);
3961		retval = (*ifp->if_output)(ifp, send_head);
3962		if (retval == EQFULL || retval == EQSUSPENDED) {
3963			if (adv != NULL) {
3964				adv->code = (retval == EQFULL ?
3965				    FADV_FLOW_CONTROLLED : FADV_SUSPENDED);
3966			}
3967			retval = 0;
3968		}
3969		if (retval == 0 && flen > 0) {
3970			fbytes += flen;
3971			fpkts++;
3972		}
3973		if (retval != 0 && dlil_verbose) {
3974			printf("%s: output error on %s retval = %d\n",
3975			    __func__, if_name(ifp), retval);
3976		}
3977		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3978	}
3979
3980	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3981
3982cleanup:
3983	if (fbytes > 0)
3984		ifp->if_fbytes += fbytes;
3985	if (fpkts > 0)
3986		ifp->if_fpackets += fpkts;
3987	if (proto != NULL)
3988		if_proto_free(proto);
3989	if (packetlist) /* if any packets are left, clean up */
3990		mbuf_freem_list(packetlist);
3991	if (retval == EJUSTRETURN)
3992		retval = 0;
3993	if (iorefcnt == 1)
3994		ifnet_decr_iorefcnt(ifp);
3995
3996	return (retval);
3997}
3998
3999errno_t
4000ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
4001    void *ioctl_arg)
4002{
4003	struct ifnet_filter *filter;
4004	int retval = EOPNOTSUPP;
4005	int result = 0;
4006
4007	if (ifp == NULL || ioctl_code == 0)
4008		return (EINVAL);
4009
4010	/* Get an io ref count if the interface is attached */
4011	if (!ifnet_is_attached(ifp, 1))
4012		return (EOPNOTSUPP);
4013
4014	/* Run the interface filters first.
4015	 * We want to run all filters before calling the protocol,
4016	 * interface family, or interface.
4017	 */
4018	lck_mtx_lock_spin(&ifp->if_flt_lock);
4019	/* prevent filter list from changing in case we drop the lock */
4020	if_flt_monitor_busy(ifp);
4021	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
4022		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
4023		    filter->filt_protocol == proto_fam)) {
4024			lck_mtx_unlock(&ifp->if_flt_lock);
4025
4026			result = filter->filt_ioctl(filter->filt_cookie, ifp,
4027			    proto_fam, ioctl_code, ioctl_arg);
4028
4029			lck_mtx_lock_spin(&ifp->if_flt_lock);
4030
4031			/* Only update retval if no one has handled the ioctl */
4032			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
4033				if (result == ENOTSUP)
4034					result = EOPNOTSUPP;
4035				retval = result;
4036				if (retval != 0 && retval != EOPNOTSUPP) {
4037					/* we're done with the filter list */
4038					if_flt_monitor_unbusy(ifp);
4039					lck_mtx_unlock(&ifp->if_flt_lock);
4040					goto cleanup;
4041				}
4042			}
4043		}
4044	}
4045	/* we're done with the filter list */
4046	if_flt_monitor_unbusy(ifp);
4047	lck_mtx_unlock(&ifp->if_flt_lock);
4048
4049	/* Allow the protocol to handle the ioctl */
4050	if (proto_fam != 0) {
4051		struct if_proto	*proto;
4052
4053		/* callee holds a proto refcnt upon success */
4054		ifnet_lock_shared(ifp);
4055		proto = find_attached_proto(ifp, proto_fam);
4056		ifnet_lock_done(ifp);
4057		if (proto != NULL) {
4058			proto_media_ioctl ioctlp =
4059			    (proto->proto_kpi == kProtoKPI_v1 ?
4060			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
4061			result = EOPNOTSUPP;
4062			if (ioctlp != NULL)
4063				result = ioctlp(ifp, proto_fam, ioctl_code,
4064				    ioctl_arg);
4065			if_proto_free(proto);
4066
4067			/* Only update retval if no one has handled the ioctl */
4068			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
4069				if (result == ENOTSUP)
4070					result = EOPNOTSUPP;
4071				retval = result;
4072				if (retval && retval != EOPNOTSUPP)
4073					goto cleanup;
4074			}
4075		}
4076	}
4077
4078	/* retval is either 0 or EOPNOTSUPP */
4079
4080	/*
4081	 * Let the interface handle this ioctl.
4082	 * If it returns EOPNOTSUPP, ignore that, we may have
4083	 * already handled this in the protocol or family.
4084	 */
4085	if (ifp->if_ioctl)
4086		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
4087
4088	/* Only update retval if no one has handled the ioctl */
4089	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
4090		if (result == ENOTSUP)
4091			result = EOPNOTSUPP;
4092		retval = result;
4093		if (retval && retval != EOPNOTSUPP) {
4094			goto cleanup;
4095		}
4096	}
4097
4098cleanup:
4099	if (retval == EJUSTRETURN)
4100		retval = 0;
4101
4102	ifnet_decr_iorefcnt(ifp);
4103
4104	return (retval);
4105}
4106
4107__private_extern__ errno_t
4108dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
4109{
4110	errno_t	error = 0;
4111
4112
4113	if (ifp->if_set_bpf_tap) {
4114		/* Get an io reference on the interface if it is attached */
4115		if (!ifnet_is_attached(ifp, 1))
4116			return ENXIO;
4117		error = ifp->if_set_bpf_tap(ifp, mode, callback);
4118		ifnet_decr_iorefcnt(ifp);
4119	}
4120	return (error);
4121}
4122
4123errno_t
4124dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
4125    struct sockaddr *ll_addr, size_t ll_len)
4126{
4127	errno_t	result = EOPNOTSUPP;
4128	struct if_proto *proto;
4129	const struct sockaddr *verify;
4130	proto_media_resolve_multi resolvep;
4131
4132	if (!ifnet_is_attached(ifp, 1))
4133		return result;
4134
4135	bzero(ll_addr, ll_len);
4136
4137	/* Call the protocol first; callee holds a proto refcnt upon success */
4138	ifnet_lock_shared(ifp);
4139	proto = find_attached_proto(ifp, proto_addr->sa_family);
4140	ifnet_lock_done(ifp);
4141	if (proto != NULL) {
4142		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
4143		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
4144		if (resolvep != NULL)
4145			result = resolvep(ifp, proto_addr,
4146			    (struct sockaddr_dl*)(void *)ll_addr, ll_len);
4147		if_proto_free(proto);
4148	}
4149
4150	/* Let the interface verify the multicast address */
4151	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
4152		if (result == 0)
4153			verify = ll_addr;
4154		else
4155			verify = proto_addr;
4156		result = ifp->if_check_multi(ifp, verify);
4157	}
4158
4159	ifnet_decr_iorefcnt(ifp);
4160	return (result);
4161}
4162
4163__private_extern__ errno_t
4164dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
4165    const struct sockaddr_dl* sender_hw, const struct sockaddr* sender_proto,
4166    const struct sockaddr_dl* target_hw, const struct sockaddr* target_proto)
4167{
4168	struct if_proto *proto;
4169	errno_t	result = 0;
4170
4171	/* callee holds a proto refcnt upon success */
4172	ifnet_lock_shared(ifp);
4173	proto = find_attached_proto(ifp, target_proto->sa_family);
4174	ifnet_lock_done(ifp);
4175	if (proto == NULL) {
4176		result = ENOTSUP;
4177	} else {
4178		proto_media_send_arp	arpp;
4179		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
4180		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
4181		if (arpp == NULL) {
4182			result = ENOTSUP;
4183		} else {
4184			switch (arpop) {
4185			case ARPOP_REQUEST:
4186				arpstat.txrequests++;
4187				if (target_hw != NULL)
4188					arpstat.txurequests++;
4189				break;
4190			case ARPOP_REPLY:
4191				arpstat.txreplies++;
4192				break;
4193			}
4194			result = arpp(ifp, arpop, sender_hw, sender_proto,
4195			    target_hw, target_proto);
4196		}
4197		if_proto_free(proto);
4198	}
4199
4200	return (result);
4201}
4202
4203struct net_thread_marks { };
4204static const struct net_thread_marks net_thread_marks_base = { };
4205
4206__private_extern__ const net_thread_marks_t net_thread_marks_none =
4207    &net_thread_marks_base;
4208
4209__private_extern__ net_thread_marks_t
4210net_thread_marks_push(u_int32_t push)
4211{
4212	static const char *const base = (const void*)&net_thread_marks_base;
4213	u_int32_t pop = 0;
4214
4215	if (push != 0) {
4216		struct uthread *uth = get_bsdthread_info(current_thread());
4217
4218		pop = push & ~uth->uu_network_marks;
4219		if (pop != 0)
4220			uth->uu_network_marks |= pop;
4221	}
4222
4223	return ((net_thread_marks_t)&base[pop]);
4224}
4225
4226__private_extern__ net_thread_marks_t
4227net_thread_unmarks_push(u_int32_t unpush)
4228{
4229	static const char *const base = (const void*)&net_thread_marks_base;
4230	u_int32_t unpop = 0;
4231
4232	if (unpush != 0) {
4233		struct uthread *uth = get_bsdthread_info(current_thread());
4234
4235		unpop = unpush & uth->uu_network_marks;
4236		if (unpop != 0)
4237			uth->uu_network_marks &= ~unpop;
4238	}
4239
4240	return ((net_thread_marks_t)&base[unpop]);
4241}
4242
4243__private_extern__ void
4244net_thread_marks_pop(net_thread_marks_t popx)
4245{
4246	static const char *const base = (const void*)&net_thread_marks_base;
4247	ptrdiff_t pop = (caddr_t)popx - (caddr_t)base;
4248
4249	if (pop != 0) {
4250		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
4251		struct uthread *uth = get_bsdthread_info(current_thread());
4252
4253		VERIFY((pop & ones) == pop);
4254		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
4255		uth->uu_network_marks &= ~pop;
4256	}
4257}
4258
4259__private_extern__ void
4260net_thread_unmarks_pop(net_thread_marks_t unpopx)
4261{
4262	static const char *const base = (const void*)&net_thread_marks_base;
4263	ptrdiff_t unpop = (caddr_t)unpopx - (caddr_t)base;
4264
4265	if (unpop != 0) {
4266		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
4267		struct uthread *uth = get_bsdthread_info(current_thread());
4268
4269		VERIFY((unpop & ones) == unpop);
4270		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
4271		uth->uu_network_marks |= unpop;
4272	}
4273}
4274
4275__private_extern__ u_int32_t
4276net_thread_is_marked(u_int32_t check)
4277{
4278	if (check != 0) {
4279		struct uthread *uth = get_bsdthread_info(current_thread());
4280		return (uth->uu_network_marks & check);
4281	}
4282	else
4283		return (0);
4284}
4285
4286__private_extern__ u_int32_t
4287net_thread_is_unmarked(u_int32_t check)
4288{
4289	if (check != 0) {
4290		struct uthread *uth = get_bsdthread_info(current_thread());
4291		return (~uth->uu_network_marks & check);
4292	}
4293	else
4294		return (0);
4295}
4296
4297static __inline__ int
4298_is_announcement(const struct sockaddr_in * sender_sin,
4299    const struct sockaddr_in * target_sin)
4300{
4301	if (sender_sin == NULL) {
4302		return (FALSE);
4303	}
4304	return (sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr);
4305}
4306
4307__private_extern__ errno_t
4308dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw,
4309    const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw,
4310    const struct sockaddr* target_proto0, u_int32_t rtflags)
4311{
4312	errno_t	result = 0;
4313	const struct sockaddr_in * sender_sin;
4314	const struct sockaddr_in * target_sin;
4315	struct sockaddr_inarp target_proto_sinarp;
4316	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;
4317
4318	if (target_proto == NULL || (sender_proto != NULL &&
4319	    sender_proto->sa_family != target_proto->sa_family))
4320		return (EINVAL);
4321
4322	/*
4323	 * If the target is a (default) router, provide that
4324	 * information to the send_arp callback routine.
4325	 */
4326	if (rtflags & RTF_ROUTER) {
4327		bcopy(target_proto, &target_proto_sinarp,
4328		    sizeof (struct sockaddr_in));
4329		target_proto_sinarp.sin_other |= SIN_ROUTER;
4330		target_proto = (struct sockaddr *)&target_proto_sinarp;
4331	}
4332
4333	/*
4334	 * If this is an ARP request and the target IP is IPv4LL,
4335	 * send the request on all interfaces.  The exception is
4336	 * an announcement, which must only appear on the specific
4337	 * interface.
4338	 */
4339	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
4340	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
4341	if (target_proto->sa_family == AF_INET &&
4342	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
4343	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
4344	    !_is_announcement(target_sin, sender_sin)) {
4345		ifnet_t		*ifp_list;
4346		u_int32_t	count;
4347		u_int32_t	ifp_on;
4348
4349		result = ENOTSUP;
4350
4351		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
4352			for (ifp_on = 0; ifp_on < count; ifp_on++) {
4353				errno_t new_result;
4354				ifaddr_t source_hw = NULL;
4355				ifaddr_t source_ip = NULL;
4356				struct sockaddr_in source_ip_copy;
4357				struct ifnet *cur_ifp = ifp_list[ifp_on];
4358
4359				/*
4360				 * Only arp on interfaces marked for IPv4LL
4361				 * ARPing.  This may mean that we don't ARP on
4362				 * the interface the subnet route points to.
4363				 */
4364				if (!(cur_ifp->if_eflags & IFEF_ARPLL))
4365					continue;
4366
4367				/* Find the source IP address */
4368				ifnet_lock_shared(cur_ifp);
4369				source_hw = cur_ifp->if_lladdr;
4370				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
4371				    ifa_link) {
4372					IFA_LOCK(source_ip);
4373					if (source_ip->ifa_addr != NULL &&
4374					    source_ip->ifa_addr->sa_family ==
4375					    AF_INET) {
4376						/* Copy the source IP address */
4377						source_ip_copy =
4378						    *(struct sockaddr_in *)
4379						    (void *)source_ip->ifa_addr;
4380						IFA_UNLOCK(source_ip);
4381						break;
4382					}
4383					IFA_UNLOCK(source_ip);
4384				}
4385
4386				/* No IP Source, don't arp */
4387				if (source_ip == NULL) {
4388					ifnet_lock_done(cur_ifp);
4389					continue;
4390				}
4391
4392				IFA_ADDREF(source_hw);
4393				ifnet_lock_done(cur_ifp);
4394
4395				/* Send the ARP */
4396				new_result = dlil_send_arp_internal(cur_ifp,
4397				    arpop, (struct sockaddr_dl *)(void *)
4398				    source_hw->ifa_addr,
4399				    (struct sockaddr *)&source_ip_copy, NULL,
4400				    target_proto);
4401
4402				IFA_REMREF(source_hw);
4403				if (result == ENOTSUP) {
4404					result = new_result;
4405				}
4406			}
4407			ifnet_list_free(ifp_list);
4408		}
4409	} else {
4410		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
4411		    sender_proto, target_hw, target_proto);
4412	}
4413
4414	return (result);
4415}
4416
4417/*
4418 * Caller must hold ifnet head lock.
4419 */
4420static int
4421ifnet_lookup(struct ifnet *ifp)
4422{
4423	struct ifnet *_ifp;
4424
4425	lck_rw_assert(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
4426	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
4427		if (_ifp == ifp)
4428			break;
4429	}
4430	return (_ifp != NULL);
4431}
4432/*
4433 * Caller has to pass a non-zero refio argument to get a
4434 * IO reference count. This will prevent ifnet_detach from
4435 * being called when there are outstanding io reference counts.
4436 */
4437int
4438ifnet_is_attached(struct ifnet *ifp, int refio)
4439{
4440	int ret;
4441
4442	lck_mtx_lock_spin(&ifp->if_ref_lock);
4443	if ((ret = ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) ==
4444	    IFRF_ATTACHED))) {
4445		if (refio > 0)
4446			ifp->if_refio++;
4447	}
4448	lck_mtx_unlock(&ifp->if_ref_lock);
4449
4450	return (ret);
4451}
4452
4453void
4454ifnet_decr_iorefcnt(struct ifnet *ifp)
4455{
4456	lck_mtx_lock_spin(&ifp->if_ref_lock);
4457	VERIFY(ifp->if_refio > 0);
4458	VERIFY((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != 0);
4459	ifp->if_refio--;
4460
4461	/* if there are no more outstanding io references, wakeup the
4462	 * ifnet_detach thread if detaching flag is set.
4463	 */
4464	if (ifp->if_refio == 0 &&
4465		(ifp->if_refflags & IFRF_DETACHING) != 0) {
4466		wakeup(&(ifp->if_refio));
4467	}
4468	lck_mtx_unlock(&ifp->if_ref_lock);
4469}
4470
4471static void
4472dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
4473{
4474	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
4475	ctrace_t *tr;
4476	u_int32_t idx;
4477	u_int16_t *cnt;
4478
4479	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
4480		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
4481		/* NOTREACHED */
4482	}
4483
4484	if (refhold) {
4485		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
4486		tr = dl_if_dbg->dldbg_if_refhold;
4487	} else {
4488		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
4489		tr = dl_if_dbg->dldbg_if_refrele;
4490	}
4491
4492	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
4493	ctrace_record(&tr[idx]);
4494}
4495
4496errno_t
4497dlil_if_ref(struct ifnet *ifp)
4498{
4499	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4500
4501	if (dl_if == NULL)
4502		return (EINVAL);
4503
4504	lck_mtx_lock_spin(&dl_if->dl_if_lock);
4505	++dl_if->dl_if_refcnt;
4506	if (dl_if->dl_if_refcnt == 0) {
4507		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
4508		/* NOTREACHED */
4509	}
4510	if (dl_if->dl_if_trace != NULL)
4511		(*dl_if->dl_if_trace)(dl_if, TRUE);
4512	lck_mtx_unlock(&dl_if->dl_if_lock);
4513
4514	return (0);
4515}
4516
4517errno_t
4518dlil_if_free(struct ifnet *ifp)
4519{
4520	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4521
4522	if (dl_if == NULL)
4523		return (EINVAL);
4524
4525	lck_mtx_lock_spin(&dl_if->dl_if_lock);
4526	if (dl_if->dl_if_refcnt == 0) {
4527		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
4528		/* NOTREACHED */
4529	}
4530	--dl_if->dl_if_refcnt;
4531	if (dl_if->dl_if_trace != NULL)
4532		(*dl_if->dl_if_trace)(dl_if, FALSE);
4533	lck_mtx_unlock(&dl_if->dl_if_lock);
4534
4535	return (0);
4536}
4537
4538static errno_t
4539dlil_attach_protocol_internal(struct if_proto *proto,
4540    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count)
4541{
4542	struct kev_dl_proto_data ev_pr_data;
4543	struct ifnet *ifp = proto->ifp;
4544	int retval = 0;
4545	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
4546	struct if_proto *prev_proto;
4547	struct if_proto *_proto;
4548
4549	/* callee holds a proto refcnt upon success */
4550	ifnet_lock_exclusive(ifp);
4551	_proto = find_attached_proto(ifp, proto->protocol_family);
4552	if (_proto != NULL) {
4553		ifnet_lock_done(ifp);
4554		if_proto_free(_proto);
4555		return (EEXIST);
4556	}
4557
4558	/*
4559	 * Call family module add_proto routine so it can refine the
4560	 * demux descriptors as it wishes.
4561	 */
4562	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
4563	    demux_count);
4564	if (retval) {
4565		ifnet_lock_done(ifp);
4566		return (retval);
4567	}
4568
4569	/*
4570	 * Insert the protocol in the hash
4571	 */
4572	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
4573	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL)
4574		prev_proto = SLIST_NEXT(prev_proto, next_hash);
4575	if (prev_proto)
4576		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
4577	else
4578		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
4579		    proto, next_hash);
4580
4581	/* hold a proto refcnt for attach */
4582	if_proto_ref(proto);
4583
4584	/*
4585	 * The reserved field carries the number of protocol still attached
4586	 * (subject to change)
4587	 */
4588	ev_pr_data.proto_family = proto->protocol_family;
4589	ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp);
4590	ifnet_lock_done(ifp);
4591
4592	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
4593	    (struct net_event_data *)&ev_pr_data,
4594	    sizeof (struct kev_dl_proto_data));
4595	return (retval);
4596}
4597
4598errno_t
4599ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
4600    const struct ifnet_attach_proto_param *proto_details)
4601{
4602	int retval = 0;
4603	struct if_proto  *ifproto = NULL;
4604
4605	ifnet_head_lock_shared();
4606	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
4607		retval = EINVAL;
4608		goto end;
4609	}
4610	/* Check that the interface is in the global list */
4611	if (!ifnet_lookup(ifp)) {
4612		retval = ENXIO;
4613		goto end;
4614	}
4615
4616	ifproto = zalloc(dlif_proto_zone);
4617	if (ifproto == NULL) {
4618		retval = ENOMEM;
4619		goto end;
4620	}
4621	bzero(ifproto, dlif_proto_size);
4622
4623	/* refcnt held above during lookup */
4624	ifproto->ifp = ifp;
4625	ifproto->protocol_family = protocol;
4626	ifproto->proto_kpi = kProtoKPI_v1;
4627	ifproto->kpi.v1.input = proto_details->input;
4628	ifproto->kpi.v1.pre_output = proto_details->pre_output;
4629	ifproto->kpi.v1.event = proto_details->event;
4630	ifproto->kpi.v1.ioctl = proto_details->ioctl;
4631	ifproto->kpi.v1.detached = proto_details->detached;
4632	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
4633	ifproto->kpi.v1.send_arp = proto_details->send_arp;
4634
4635	retval = dlil_attach_protocol_internal(ifproto,
4636	    proto_details->demux_list, proto_details->demux_count);
4637
4638	if (dlil_verbose) {
4639		printf("%s: attached v1 protocol %d\n", if_name(ifp),
4640		    protocol);
4641	}
4642
4643end:
4644	if (retval != 0 && retval != EEXIST && ifp != NULL) {
4645		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
4646		    if_name(ifp), protocol, retval);
4647	}
4648	ifnet_head_done();
4649	if (retval != 0  && ifproto != NULL)
4650		zfree(dlif_proto_zone, ifproto);
4651	return (retval);
4652}
4653
4654errno_t
4655ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
4656    const struct ifnet_attach_proto_param_v2 *proto_details)
4657{
4658	int retval = 0;
4659	struct if_proto  *ifproto = NULL;
4660
4661	ifnet_head_lock_shared();
4662	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
4663		retval = EINVAL;
4664		goto end;
4665	}
4666	/* Check that the interface is in the global list */
4667	if (!ifnet_lookup(ifp)) {
4668		retval = ENXIO;
4669		goto end;
4670	}
4671
4672	ifproto = zalloc(dlif_proto_zone);
4673	if (ifproto == NULL) {
4674		retval = ENOMEM;
4675		goto end;
4676	}
4677	bzero(ifproto, sizeof(*ifproto));
4678
4679	/* refcnt held above during lookup */
4680	ifproto->ifp = ifp;
4681	ifproto->protocol_family = protocol;
4682	ifproto->proto_kpi = kProtoKPI_v2;
4683	ifproto->kpi.v2.input = proto_details->input;
4684	ifproto->kpi.v2.pre_output = proto_details->pre_output;
4685	ifproto->kpi.v2.event = proto_details->event;
4686	ifproto->kpi.v2.ioctl = proto_details->ioctl;
4687	ifproto->kpi.v2.detached = proto_details->detached;
4688	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
4689	ifproto->kpi.v2.send_arp = proto_details->send_arp;
4690
4691	retval = dlil_attach_protocol_internal(ifproto,
4692	    proto_details->demux_list, proto_details->demux_count);
4693
4694	if (dlil_verbose) {
4695		printf("%s: attached v2 protocol %d\n", if_name(ifp),
4696		    protocol);
4697	}
4698
4699end:
4700	if (retval != 0 && retval != EEXIST && ifp != NULL) {
4701		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
4702		    if_name(ifp), protocol, retval);
4703	}
4704	ifnet_head_done();
4705	if (retval != 0 && ifproto != NULL)
4706		zfree(dlif_proto_zone, ifproto);
4707	return (retval);
4708}
4709
4710errno_t
4711ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
4712{
4713	struct if_proto *proto = NULL;
4714	int	retval = 0;
4715
4716	if (ifp == NULL || proto_family == 0) {
4717		retval = EINVAL;
4718		goto end;
4719	}
4720
4721	ifnet_lock_exclusive(ifp);
4722	/* callee holds a proto refcnt upon success */
4723	proto = find_attached_proto(ifp, proto_family);
4724	if (proto == NULL) {
4725		retval = ENXIO;
4726		ifnet_lock_done(ifp);
4727		goto end;
4728	}
4729
4730	/* call family module del_proto */
4731	if (ifp->if_del_proto)
4732		ifp->if_del_proto(ifp, proto->protocol_family);
4733
4734	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
4735	    proto, if_proto, next_hash);
4736
4737	if (proto->proto_kpi == kProtoKPI_v1) {
4738		proto->kpi.v1.input = ifproto_media_input_v1;
4739		proto->kpi.v1.pre_output= ifproto_media_preout;
4740		proto->kpi.v1.event = ifproto_media_event;
4741		proto->kpi.v1.ioctl = ifproto_media_ioctl;
4742		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
4743		proto->kpi.v1.send_arp = ifproto_media_send_arp;
4744	} else {
4745		proto->kpi.v2.input = ifproto_media_input_v2;
4746		proto->kpi.v2.pre_output = ifproto_media_preout;
4747		proto->kpi.v2.event = ifproto_media_event;
4748		proto->kpi.v2.ioctl = ifproto_media_ioctl;
4749		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
4750		proto->kpi.v2.send_arp = ifproto_media_send_arp;
4751	}
4752	proto->detached = 1;
4753	ifnet_lock_done(ifp);
4754
4755	if (dlil_verbose) {
4756		printf("%s: detached %s protocol %d\n", if_name(ifp),
4757		    (proto->proto_kpi == kProtoKPI_v1) ?
4758		    "v1" : "v2", proto_family);
4759	}
4760
4761	/* release proto refcnt held during protocol attach */
4762	if_proto_free(proto);
4763
4764	/*
4765	 * Release proto refcnt held during lookup; the rest of
4766	 * protocol detach steps will happen when the last proto
4767	 * reference is released.
4768	 */
4769	if_proto_free(proto);
4770
4771end:
4772	return (retval);
4773}
4774
4775
4776static errno_t
4777ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
4778    struct mbuf *packet, char *header)
4779{
4780#pragma unused(ifp, protocol, packet, header)
4781	return (ENXIO);
4782}
4783
4784static errno_t
4785ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
4786    struct mbuf *packet)
4787{
4788#pragma unused(ifp, protocol, packet)
4789	return (ENXIO);
4790
4791}
4792
4793static errno_t
4794ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
4795    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
4796    char *link_layer_dest)
4797{
4798#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
4799	return (ENXIO);
4800
4801}
4802
4803static void
4804ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
4805    const struct kev_msg *event)
4806{
4807#pragma unused(ifp, protocol, event)
4808}
4809
4810static errno_t
4811ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
4812    unsigned long command, void *argument)
4813{
4814#pragma unused(ifp, protocol, command, argument)
4815	return (ENXIO);
4816}
4817
4818static errno_t
4819ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
4820    struct sockaddr_dl *out_ll, size_t ll_len)
4821{
4822#pragma unused(ifp, proto_addr, out_ll, ll_len)
4823	return (ENXIO);
4824}
4825
4826static errno_t
4827ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
4828    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
4829    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
4830{
4831#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
4832	return (ENXIO);
4833}
4834
4835extern int if_next_index(void);
4836
4837errno_t
4838ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
4839{
4840	struct ifnet *tmp_if;
4841	struct ifaddr *ifa;
4842	struct if_data_internal if_data_saved;
4843	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4844	struct dlil_threading_info *dl_inp;
4845	u_int32_t sflags = 0;
4846	int err;
4847
4848	if (ifp == NULL)
4849		return (EINVAL);
4850
4851	/*
4852	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
4853	 * prevent the interface from being configured while it is
4854	 * embryonic, as ifnet_head_lock is dropped and reacquired
4855	 * below prior to marking the ifnet with IFRF_ATTACHED.
4856	 */
4857	dlil_if_lock();
4858	ifnet_head_lock_exclusive();
4859	/* Verify we aren't already on the list */
4860	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
4861		if (tmp_if == ifp) {
4862			ifnet_head_done();
4863			dlil_if_unlock();
4864			return (EEXIST);
4865		}
4866	}
4867
4868	lck_mtx_lock_spin(&ifp->if_ref_lock);
4869	if (ifp->if_refflags & IFRF_ATTACHED) {
4870		panic_plain("%s: flags mismatch (attached set) ifp=%p",
4871		    __func__, ifp);
4872		/* NOTREACHED */
4873	}
4874	lck_mtx_unlock(&ifp->if_ref_lock);
4875
4876	ifnet_lock_exclusive(ifp);
4877
4878	/* Sanity check */
4879	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
4880	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
4881
4882	if (ll_addr != NULL) {
4883		if (ifp->if_addrlen == 0) {
4884			ifp->if_addrlen = ll_addr->sdl_alen;
4885		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
4886			ifnet_lock_done(ifp);
4887			ifnet_head_done();
4888			dlil_if_unlock();
4889			return (EINVAL);
4890		}
4891	}
4892
4893	/*
4894	 * Allow interfaces without protocol families to attach
4895	 * only if they have the necessary fields filled out.
4896	 */
4897	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
4898		DLIL_PRINTF("%s: Attempt to attach interface without "
4899		    "family module - %d\n", __func__, ifp->if_family);
4900		ifnet_lock_done(ifp);
4901		ifnet_head_done();
4902		dlil_if_unlock();
4903		return (ENODEV);
4904	}
4905
4906	/* Allocate protocol hash table */
4907	VERIFY(ifp->if_proto_hash == NULL);
4908	ifp->if_proto_hash = zalloc(dlif_phash_zone);
4909	if (ifp->if_proto_hash == NULL) {
4910		ifnet_lock_done(ifp);
4911		ifnet_head_done();
4912		dlil_if_unlock();
4913		return (ENOBUFS);
4914	}
4915	bzero(ifp->if_proto_hash, dlif_phash_size);
4916
4917	lck_mtx_lock_spin(&ifp->if_flt_lock);
4918	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
4919	TAILQ_INIT(&ifp->if_flt_head);
4920	VERIFY(ifp->if_flt_busy == 0);
4921	VERIFY(ifp->if_flt_waiters == 0);
4922	lck_mtx_unlock(&ifp->if_flt_lock);
4923
4924	VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead));
4925	TAILQ_INIT(&ifp->if_prefixhead);
4926
4927	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
4928		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
4929		LIST_INIT(&ifp->if_multiaddrs);
4930	}
4931
4932	VERIFY(ifp->if_allhostsinm == NULL);
4933	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
4934	TAILQ_INIT(&ifp->if_addrhead);
4935
4936	if (ifp->if_index == 0) {
4937		int idx = if_next_index();
4938
4939		if (idx == -1) {
4940			ifp->if_index = 0;
4941			ifnet_lock_done(ifp);
4942			ifnet_head_done();
4943			dlil_if_unlock();
4944			return (ENOBUFS);
4945		}
4946		ifp->if_index = idx;
4947	}
4948	/* There should not be anything occupying this slot */
4949	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
4950
4951	/* allocate (if needed) and initialize a link address */
4952	VERIFY(!(dl_if->dl_if_flags & DLIF_REUSE) || ifp->if_lladdr != NULL);
4953	ifa = dlil_alloc_lladdr(ifp, ll_addr);
4954	if (ifa == NULL) {
4955		ifnet_lock_done(ifp);
4956		ifnet_head_done();
4957		dlil_if_unlock();
4958		return (ENOBUFS);
4959	}
4960
4961	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
4962	ifnet_addrs[ifp->if_index - 1] = ifa;
4963
4964	/* make this address the first on the list */
4965	IFA_LOCK(ifa);
4966	/* hold a reference for ifnet_addrs[] */
4967	IFA_ADDREF_LOCKED(ifa);
4968	/* if_attach_link_ifa() holds a reference for ifa_link */
4969	if_attach_link_ifa(ifp, ifa);
4970	IFA_UNLOCK(ifa);
4971
4972#if CONFIG_MACF_NET
4973	mac_ifnet_label_associate(ifp);
4974#endif
4975
4976	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
4977	ifindex2ifnet[ifp->if_index] = ifp;
4978
4979	/* Hold a reference to the underlying dlil_ifnet */
4980	ifnet_reference(ifp);
4981
4982	/* Clear stats (save and restore other fields that we care) */
4983	if_data_saved = ifp->if_data;
4984	bzero(&ifp->if_data, sizeof (ifp->if_data));
4985	ifp->if_data.ifi_type = if_data_saved.ifi_type;
4986	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
4987	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
4988	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
4989	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
4990	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
4991	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
4992	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
4993	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
4994	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
4995	ifnet_touch_lastchange(ifp);
4996
4997	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
4998	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED);
4999
5000	/* By default, use SFB and enable flow advisory */
5001	sflags = PKTSCHEDF_QALG_SFB;
5002	if (if_flowadv)
5003		sflags |= PKTSCHEDF_QALG_FLOWCTL;
5004
5005	if (if_delaybased_queue)
5006		sflags |= PKTSCHEDF_QALG_DELAYBASED;
5007
5008	/* Initialize transmit queue(s) */
5009	err = ifclassq_setup(ifp, sflags, (dl_if->dl_if_flags & DLIF_REUSE));
5010	if (err != 0) {
5011		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
5012		    "err=%d", __func__, ifp, err);
5013		/* NOTREACHED */
5014	}
5015
5016	/* Sanity checks on the input thread storage */
5017	dl_inp = &dl_if->dl_if_inpstorage;
5018	bzero(&dl_inp->stats, sizeof (dl_inp->stats));
5019	VERIFY(dl_inp->input_waiting == 0);
5020	VERIFY(dl_inp->wtot == 0);
5021	VERIFY(dl_inp->ifp == NULL);
5022	VERIFY(qhead(&dl_inp->rcvq_pkts) == NULL && qempty(&dl_inp->rcvq_pkts));
5023	VERIFY(qlimit(&dl_inp->rcvq_pkts) == 0);
5024	VERIFY(!dl_inp->net_affinity);
5025	VERIFY(ifp->if_inp == NULL);
5026	VERIFY(dl_inp->input_thr == THREAD_NULL);
5027	VERIFY(dl_inp->wloop_thr == THREAD_NULL);
5028	VERIFY(dl_inp->poll_thr == THREAD_NULL);
5029	VERIFY(dl_inp->tag == 0);
5030	VERIFY(dl_inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
5031	bzero(&dl_inp->tstats, sizeof (dl_inp->tstats));
5032	bzero(&dl_inp->pstats, sizeof (dl_inp->pstats));
5033	bzero(&dl_inp->sstats, sizeof (dl_inp->sstats));
5034#if IFNET_INPUT_SANITY_CHK
5035	VERIFY(dl_inp->input_mbuf_cnt == 0);
5036#endif /* IFNET_INPUT_SANITY_CHK */
5037
5038	/*
5039	 * A specific DLIL input thread is created per Ethernet/cellular
5040	 * interface or for an interface which supports opportunistic
5041	 * input polling.  Pseudo interfaces or other types of interfaces
5042	 * use the main input thread instead.
5043	 */
5044	if ((net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) ||
5045	    ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR) {
5046		ifp->if_inp = dl_inp;
5047		err = dlil_create_input_thread(ifp, ifp->if_inp);
5048		if (err != 0) {
5049			panic_plain("%s: ifp=%p couldn't get an input thread; "
5050			    "err=%d", __func__, ifp, err);
5051			/* NOTREACHED */
5052		}
5053	}
5054
5055	/*
5056	 * If the driver supports the new transmit model, calculate flow hash
5057	 * and create a workloop starter thread to invoke the if_start callback
5058	 * where the packets may be dequeued and transmitted.
5059	 */
5060	if (ifp->if_eflags & IFEF_TXSTART) {
5061		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
5062		VERIFY(ifp->if_flowhash != 0);
5063
5064		VERIFY(ifp->if_start != NULL);
5065		VERIFY(ifp->if_start_thread == THREAD_NULL);
5066
5067		ifnet_set_start_cycle(ifp, NULL);
5068		ifp->if_start_active = 0;
5069		ifp->if_start_req = 0;
5070		ifp->if_start_flags = 0;
5071		if ((err = kernel_thread_start(ifnet_start_thread_fn, ifp,
5072		    &ifp->if_start_thread)) != KERN_SUCCESS) {
5073			panic_plain("%s: ifp=%p couldn't get a start thread; "
5074			    "err=%d", __func__, ifp, err);
5075			/* NOTREACHED */
5076		}
5077		ml_thread_policy(ifp->if_start_thread, MACHINE_GROUP,
5078		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
5079	} else {
5080		ifp->if_flowhash = 0;
5081	}
5082
5083	/*
5084	 * If the driver supports the new receive model, create a poller
5085	 * thread to invoke if_input_poll callback where the packets may
5086	 * be dequeued from the driver and processed for reception.
5087	 */
5088	if (ifp->if_eflags & IFEF_RXPOLL) {
5089		VERIFY(ifp->if_input_poll != NULL);
5090		VERIFY(ifp->if_input_ctl != NULL);
5091		VERIFY(ifp->if_poll_thread == THREAD_NULL);
5092
5093		ifnet_set_poll_cycle(ifp, NULL);
5094		ifp->if_poll_update = 0;
5095		ifp->if_poll_active = 0;
5096		ifp->if_poll_req = 0;
5097		if ((err = kernel_thread_start(ifnet_poll_thread_fn, ifp,
5098		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
5099			panic_plain("%s: ifp=%p couldn't get a poll thread; "
5100			    "err=%d", __func__, ifp, err);
5101			/* NOTREACHED */
5102		}
5103		ml_thread_policy(ifp->if_poll_thread, MACHINE_GROUP,
5104		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
5105	}
5106
5107	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
5108	VERIFY(ifp->if_desc.ifd_len == 0);
5109	VERIFY(ifp->if_desc.ifd_desc != NULL);
5110
5111	/* Record attach PC stacktrace */
5112	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
5113
5114	ifp->if_updatemcasts = 0;
5115	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
5116		struct ifmultiaddr *ifma;
5117		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
5118			IFMA_LOCK(ifma);
5119			if (ifma->ifma_addr->sa_family == AF_LINK ||
5120			    ifma->ifma_addr->sa_family == AF_UNSPEC)
5121				ifp->if_updatemcasts++;
5122			IFMA_UNLOCK(ifma);
5123		}
5124
5125		printf("%s: attached with %d suspended link-layer multicast "
5126		    "membership(s)\n", if_name(ifp),
5127		    ifp->if_updatemcasts);
5128	}
5129
5130	/* Clear logging parameters */
5131	bzero(&ifp->if_log, sizeof (ifp->if_log));
5132	ifp->if_fg_sendts = 0;
5133
5134	VERIFY(ifp->if_delegated.ifp == NULL);
5135	VERIFY(ifp->if_delegated.type == 0);
5136	VERIFY(ifp->if_delegated.family == 0);
5137	VERIFY(ifp->if_delegated.subfamily == 0);
5138	VERIFY(ifp->if_delegated.expensive == 0);
5139
5140	ifnet_lock_done(ifp);
5141	ifnet_head_done();
5142
5143	lck_mtx_lock(&ifp->if_cached_route_lock);
5144	/* Enable forwarding cached route */
5145	ifp->if_fwd_cacheok = 1;
5146	/* Clean up any existing cached routes */
5147	ROUTE_RELEASE(&ifp->if_fwd_route);
5148	bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route));
5149	ROUTE_RELEASE(&ifp->if_src_route);
5150	bzero(&ifp->if_src_route, sizeof (ifp->if_src_route));
5151	ROUTE_RELEASE(&ifp->if_src_route6);
5152	bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6));
5153	lck_mtx_unlock(&ifp->if_cached_route_lock);
5154
5155	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
5156
5157	/*
5158	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
5159	 * and trees; do this before the ifnet is marked as attached.
5160	 * The ifnet keeps the reference to the info structures even after
5161	 * the ifnet is detached, since the network-layer records still
5162	 * refer to the info structures even after that.  This also
5163	 * makes it possible for them to still function after the ifnet
5164	 * is recycled or reattached.
5165	 */
5166#if INET
5167	if (IGMP_IFINFO(ifp) == NULL) {
5168		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, M_WAITOK);
5169		VERIFY(IGMP_IFINFO(ifp) != NULL);
5170	} else {
5171		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
5172		igmp_domifreattach(IGMP_IFINFO(ifp));
5173	}
5174#endif /* INET */
5175#if INET6
5176	if (MLD_IFINFO(ifp) == NULL) {
5177		MLD_IFINFO(ifp) = mld_domifattach(ifp, M_WAITOK);
5178		VERIFY(MLD_IFINFO(ifp) != NULL);
5179	} else {
5180		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
5181		mld_domifreattach(MLD_IFINFO(ifp));
5182	}
5183#endif /* INET6 */
5184
5185	VERIFY(ifp->if_data_threshold == 0);
5186
5187	/*
5188	 * Finally, mark this ifnet as attached.
5189	 */
5190	lck_mtx_lock(rnh_lock);
5191	ifnet_lock_exclusive(ifp);
5192	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
5193	ifp->if_lqm = (ifp == lo_ifp) ? IFNET_LQM_THRESH_GOOD :
5194	    IFNET_LQM_THRESH_UNKNOWN;
5195	lck_mtx_lock_spin(&ifp->if_ref_lock);
5196	ifp->if_refflags = IFRF_ATTACHED;
5197	lck_mtx_unlock(&ifp->if_ref_lock);
5198	if (net_rtref) {
5199		/* boot-args override; enable idle notification */
5200		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
5201		    IFRF_IDLE_NOTIFY);
5202	} else {
5203		/* apply previous request(s) to set the idle flags, if any */
5204		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
5205		    ifp->if_idle_new_flags_mask);
5206
5207	}
5208	ifnet_lock_done(ifp);
5209	lck_mtx_unlock(rnh_lock);
5210	dlil_if_unlock();
5211
5212#if PF
5213	/*
5214	 * Attach packet filter to this interface, if enabled.
5215	 */
5216	pf_ifnet_hook(ifp, 1);
5217#endif /* PF */
5218
5219	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0);
5220
5221	if (dlil_verbose) {
5222		printf("%s: attached%s\n", if_name(ifp),
5223		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
5224	}
5225
5226	return (0);
5227}
5228
5229/*
5230 * Prepare the storage for the first/permanent link address, which must
5231 * must have the same lifetime as the ifnet itself.  Although the link
5232 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
5233 * its location in memory must never change as it may still be referred
5234 * to by some parts of the system afterwards (unfortunate implementation
5235 * artifacts inherited from BSD.)
5236 *
5237 * Caller must hold ifnet lock as writer.
5238 */
5239static struct ifaddr *
5240dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
5241{
5242	struct ifaddr *ifa, *oifa;
5243	struct sockaddr_dl *asdl, *msdl;
5244	char workbuf[IFNAMSIZ*2];
5245	int namelen, masklen, socksize;
5246	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
5247
5248	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
5249	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);
5250
5251	namelen = snprintf(workbuf, sizeof (workbuf), "%s",
5252	    if_name(ifp));
5253	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
5254	socksize = masklen + ifp->if_addrlen;
5255#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
5256	if ((u_int32_t)socksize < sizeof (struct sockaddr_dl))
5257		socksize = sizeof(struct sockaddr_dl);
5258	socksize = ROUNDUP(socksize);
5259#undef ROUNDUP
5260
5261	ifa = ifp->if_lladdr;
5262	if (socksize > DLIL_SDLMAXLEN ||
5263	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
5264		/*
5265		 * Rare, but in the event that the link address requires
5266		 * more storage space than DLIL_SDLMAXLEN, allocate the
5267		 * largest possible storages for address and mask, such
5268		 * that we can reuse the same space when if_addrlen grows.
5269		 * This same space will be used when if_addrlen shrinks.
5270		 */
5271		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
5272			int ifasize = sizeof (*ifa) + 2 * SOCK_MAXADDRLEN;
5273			ifa = _MALLOC(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
5274			if (ifa == NULL)
5275				return (NULL);
5276			ifa_lock_init(ifa);
5277			/* Don't set IFD_ALLOC, as this is permanent */
5278			ifa->ifa_debug = IFD_LINK;
5279		}
5280		IFA_LOCK(ifa);
5281		/* address and mask sockaddr_dl locations */
5282		asdl = (struct sockaddr_dl *)(ifa + 1);
5283		bzero(asdl, SOCK_MAXADDRLEN);
5284		msdl = (struct sockaddr_dl *)(void *)
5285		    ((char *)asdl + SOCK_MAXADDRLEN);
5286		bzero(msdl, SOCK_MAXADDRLEN);
5287	} else {
5288		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
5289		/*
5290		 * Use the storage areas for address and mask within the
5291		 * dlil_ifnet structure.  This is the most common case.
5292		 */
5293		if (ifa == NULL) {
5294			ifa = &dl_if->dl_if_lladdr.ifa;
5295			ifa_lock_init(ifa);
5296			/* Don't set IFD_ALLOC, as this is permanent */
5297			ifa->ifa_debug = IFD_LINK;
5298		}
5299		IFA_LOCK(ifa);
5300		/* address and mask sockaddr_dl locations */
5301		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
5302		bzero(asdl, sizeof (dl_if->dl_if_lladdr.asdl));
5303		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
5304		bzero(msdl, sizeof (dl_if->dl_if_lladdr.msdl));
5305	}
5306
5307	/* hold a permanent reference for the ifnet itself */
5308	IFA_ADDREF_LOCKED(ifa);
5309	oifa = ifp->if_lladdr;
5310	ifp->if_lladdr = ifa;
5311
5312	VERIFY(ifa->ifa_debug == IFD_LINK);
5313	ifa->ifa_ifp = ifp;
5314	ifa->ifa_rtrequest = link_rtrequest;
5315	ifa->ifa_addr = (struct sockaddr *)asdl;
5316	asdl->sdl_len = socksize;
5317	asdl->sdl_family = AF_LINK;
5318	bcopy(workbuf, asdl->sdl_data, namelen);
5319	asdl->sdl_nlen = namelen;
5320	asdl->sdl_index = ifp->if_index;
5321	asdl->sdl_type = ifp->if_type;
5322	if (ll_addr != NULL) {
5323		asdl->sdl_alen = ll_addr->sdl_alen;
5324		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
5325	} else {
5326		asdl->sdl_alen = 0;
5327	}
5328	ifa->ifa_netmask = (struct sockaddr*)msdl;
5329	msdl->sdl_len = masklen;
5330	while (namelen != 0)
5331		msdl->sdl_data[--namelen] = 0xff;
5332	IFA_UNLOCK(ifa);
5333
5334	if (oifa != NULL)
5335		IFA_REMREF(oifa);
5336
5337	return (ifa);
5338}
5339
5340static void
5341if_purgeaddrs(struct ifnet *ifp)
5342{
5343#if INET
5344	in_purgeaddrs(ifp);
5345#endif /* INET */
5346#if INET6
5347	in6_purgeaddrs(ifp);
5348#endif /* INET6 */
5349}
5350
5351errno_t
5352ifnet_detach(ifnet_t ifp)
5353{
5354	struct ifnet *delegated_ifp;
5355
5356	if (ifp == NULL)
5357		return (EINVAL);
5358
5359	lck_mtx_lock(rnh_lock);
5360	ifnet_head_lock_exclusive();
5361	ifnet_lock_exclusive(ifp);
5362
5363	/*
5364	 * Check to see if this interface has previously triggered
5365	 * aggressive protocol draining; if so, decrement the global
5366	 * refcnt and clear PR_AGGDRAIN on the route domain if
5367	 * there are no more of such an interface around.
5368	 */
5369	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
5370
5371	lck_mtx_lock_spin(&ifp->if_ref_lock);
5372	 if (!(ifp->if_refflags & IFRF_ATTACHED)) {
5373		lck_mtx_unlock(&ifp->if_ref_lock);
5374		ifnet_lock_done(ifp);
5375		ifnet_head_done();
5376		lck_mtx_unlock(rnh_lock);
5377		return (EINVAL);
5378	} else if (ifp->if_refflags & IFRF_DETACHING) {
5379		/* Interface has already been detached */
5380		lck_mtx_unlock(&ifp->if_ref_lock);
5381		ifnet_lock_done(ifp);
5382		ifnet_head_done();
5383		lck_mtx_unlock(rnh_lock);
5384		return (ENXIO);
5385	}
5386	/* Indicate this interface is being detached */
5387	ifp->if_refflags &= ~IFRF_ATTACHED;
5388	ifp->if_refflags |= IFRF_DETACHING;
5389	lck_mtx_unlock(&ifp->if_ref_lock);
5390
5391	if (dlil_verbose)
5392		printf("%s: detaching\n", if_name(ifp));
5393
5394	/*
5395	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
5396	 * no longer be visible during lookups from this point.
5397	 */
5398	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
5399	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
5400	ifp->if_link.tqe_next = NULL;
5401	ifp->if_link.tqe_prev = NULL;
5402	ifindex2ifnet[ifp->if_index] = NULL;
5403
5404	/* Record detach PC stacktrace */
5405	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
5406
5407	/* Clear logging parameters */
5408	bzero(&ifp->if_log, sizeof (ifp->if_log));
5409
5410	/* Clear delegated interface info (reference released below) */
5411	delegated_ifp = ifp->if_delegated.ifp;
5412	bzero(&ifp->if_delegated, sizeof (ifp->if_delegated));
5413
5414	ifnet_lock_done(ifp);
5415	ifnet_head_done();
5416	lck_mtx_unlock(rnh_lock);
5417
5418	/* Release reference held on the delegated interface */
5419	if (delegated_ifp != NULL)
5420		ifnet_release(delegated_ifp);
5421
5422	/* Reset Link Quality Metric (unless loopback [lo0]) */
5423	if (ifp != lo_ifp)
5424		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF);
5425
5426	/* Reset TCP local statistics */
5427	if (ifp->if_tcp_stat != NULL)
5428		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
5429
5430	/* Reset UDP local statistics */
5431	if (ifp->if_udp_stat != NULL)
5432		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
5433
5434	/* Let BPF know we're detaching */
5435	bpfdetach(ifp);
5436
5437	/* Mark the interface as DOWN */
5438	if_down(ifp);
5439
5440	/* Disable forwarding cached route */
5441	lck_mtx_lock(&ifp->if_cached_route_lock);
5442	ifp->if_fwd_cacheok = 0;
5443	lck_mtx_unlock(&ifp->if_cached_route_lock);
5444
5445	ifp->if_data_threshold = 0;
5446	/*
5447	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
5448	 * references to the info structures and leave them attached to
5449	 * this ifnet.
5450	 */
5451#if INET
5452	igmp_domifdetach(ifp);
5453#endif /* INET */
5454#if INET6
5455	mld_domifdetach(ifp);
5456#endif /* INET6 */
5457
5458	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0);
5459
5460	/* Let worker thread take care of the rest, to avoid reentrancy */
5461	dlil_if_lock();
5462	ifnet_detaching_enqueue(ifp);
5463	dlil_if_unlock();
5464
5465	return (0);
5466}
5467
5468static void
5469ifnet_detaching_enqueue(struct ifnet *ifp)
5470{
5471	dlil_if_lock_assert();
5472
5473	++ifnet_detaching_cnt;
5474	VERIFY(ifnet_detaching_cnt != 0);
5475	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
5476	wakeup((caddr_t)&ifnet_delayed_run);
5477}
5478
5479static struct ifnet *
5480ifnet_detaching_dequeue(void)
5481{
5482	struct ifnet *ifp;
5483
5484	dlil_if_lock_assert();
5485
5486	ifp = TAILQ_FIRST(&ifnet_detaching_head);
5487	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
5488	if (ifp != NULL) {
5489		VERIFY(ifnet_detaching_cnt != 0);
5490		--ifnet_detaching_cnt;
5491		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
5492		ifp->if_detaching_link.tqe_next = NULL;
5493		ifp->if_detaching_link.tqe_prev = NULL;
5494	}
5495	return (ifp);
5496}
5497
5498static int
5499ifnet_detacher_thread_cont(int err)
5500{
5501#pragma unused(err)
5502	struct ifnet *ifp;
5503
5504	for (;;) {
5505		dlil_if_lock_assert();
5506		while (ifnet_detaching_cnt == 0) {
5507			(void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
5508			    (PZERO - 1), "ifnet_detacher_cont", 0,
5509			    ifnet_detacher_thread_cont);
5510			/* NOTREACHED */
5511		}
5512
5513		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
5514
5515		/* Take care of detaching ifnet */
5516		ifp = ifnet_detaching_dequeue();
5517		if (ifp != NULL) {
5518			dlil_if_unlock();
5519			ifnet_detach_final(ifp);
5520			dlil_if_lock();
5521		}
5522	}
5523	/* NOTREACHED */
5524	return (0);
5525}
5526
5527static void
5528ifnet_detacher_thread_func(void *v, wait_result_t w)
5529{
5530#pragma unused(v, w)
5531	dlil_if_lock();
5532	(void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
5533	    (PZERO - 1), "ifnet_detacher", 0, ifnet_detacher_thread_cont);
5534	/*
5535	 * msleep0() shouldn't have returned as PCATCH was not set;
5536	 * therefore assert in this case.
5537	 */
5538	dlil_if_unlock();
5539	VERIFY(0);
5540}
5541
5542static void
5543ifnet_detach_final(struct ifnet *ifp)
5544{
5545	struct ifnet_filter *filter, *filter_next;
5546	struct ifnet_filter_head fhead;
5547	struct dlil_threading_info *inp;
5548	struct ifaddr *ifa;
5549	ifnet_detached_func if_free;
5550	int i;
5551
5552	lck_mtx_lock(&ifp->if_ref_lock);
5553	if (!(ifp->if_refflags & IFRF_DETACHING)) {
5554		panic("%s: flags mismatch (detaching not set) ifp=%p",
5555		    __func__, ifp);
5556		/* NOTREACHED */
5557	}
5558
5559	/*
5560	 * Wait until the existing IO references get released
5561	 * before we proceed with ifnet_detach.  This is not a
5562	 * common case, so block without using a continuation.
5563	 */
5564	while (ifp->if_refio > 0) {
5565		printf("%s: Waiting for IO references on %s interface "
5566		    "to be released\n", __func__, if_name(ifp));
5567		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
5568			(PZERO - 1), "ifnet_ioref_wait", NULL);
5569	}
5570	lck_mtx_unlock(&ifp->if_ref_lock);
5571
5572	/* Drain and destroy send queue */
5573	ifclassq_teardown(ifp);
5574
5575	/* Detach interface filters */
5576	lck_mtx_lock(&ifp->if_flt_lock);
5577	if_flt_monitor_enter(ifp);
5578
5579	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
5580	fhead = ifp->if_flt_head;
5581	TAILQ_INIT(&ifp->if_flt_head);
5582
5583	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
5584		filter_next = TAILQ_NEXT(filter, filt_next);
5585		lck_mtx_unlock(&ifp->if_flt_lock);
5586
5587		dlil_detach_filter_internal(filter, 1);
5588		lck_mtx_lock(&ifp->if_flt_lock);
5589	}
5590	if_flt_monitor_leave(ifp);
5591	lck_mtx_unlock(&ifp->if_flt_lock);
5592
5593	/* Tell upper layers to drop their network addresses */
5594	if_purgeaddrs(ifp);
5595
5596	ifnet_lock_exclusive(ifp);
5597
5598	/* Uplumb all protocols */
5599	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
5600		struct if_proto *proto;
5601
5602		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
5603		while (proto != NULL) {
5604			protocol_family_t family = proto->protocol_family;
5605			ifnet_lock_done(ifp);
5606			proto_unplumb(family, ifp);
5607			ifnet_lock_exclusive(ifp);
5608			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
5609		}
5610		/* There should not be any protocols left */
5611		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
5612	}
5613	zfree(dlif_phash_zone, ifp->if_proto_hash);
5614	ifp->if_proto_hash = NULL;
5615
5616	/* Detach (permanent) link address from if_addrhead */
5617	ifa = TAILQ_FIRST(&ifp->if_addrhead);
5618	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
5619	IFA_LOCK(ifa);
5620	if_detach_link_ifa(ifp, ifa);
5621	IFA_UNLOCK(ifa);
5622
5623	/* Remove (permanent) link address from ifnet_addrs[] */
5624	IFA_REMREF(ifa);
5625	ifnet_addrs[ifp->if_index - 1] = NULL;
5626
5627	/* This interface should not be on {ifnet_head,detaching} */
5628	VERIFY(ifp->if_link.tqe_next == NULL);
5629	VERIFY(ifp->if_link.tqe_prev == NULL);
5630	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
5631	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
5632
5633	/* Prefix list should be empty by now */
5634	VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead));
5635
5636	/* The slot should have been emptied */
5637	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
5638
5639	/* There should not be any addresses left */
5640	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
5641
5642	/*
5643	 * Signal the starter thread to terminate itself.
5644	 */
5645	if (ifp->if_start_thread != THREAD_NULL) {
5646		lck_mtx_lock_spin(&ifp->if_start_lock);
5647		ifp->if_start_flags = 0;
5648		ifp->if_start_thread = THREAD_NULL;
5649		wakeup_one((caddr_t)&ifp->if_start_thread);
5650		lck_mtx_unlock(&ifp->if_start_lock);
5651	}
5652
5653	/*
5654	 * Signal the poller thread to terminate itself.
5655	 */
5656	if (ifp->if_poll_thread != THREAD_NULL) {
5657		lck_mtx_lock_spin(&ifp->if_poll_lock);
5658		ifp->if_poll_thread = THREAD_NULL;
5659		wakeup_one((caddr_t)&ifp->if_poll_thread);
5660		lck_mtx_unlock(&ifp->if_poll_lock);
5661	}
5662
5663	/*
5664	 * If thread affinity was set for the workloop thread, we will need
5665	 * to tear down the affinity and release the extra reference count
5666	 * taken at attach time.  Does not apply to lo0 or other interfaces
5667	 * without dedicated input threads.
5668	 */
5669	if ((inp = ifp->if_inp) != NULL) {
5670		VERIFY(inp != dlil_main_input_thread);
5671
5672		if (inp->net_affinity) {
5673			struct thread *tp, *wtp, *ptp;
5674
5675			lck_mtx_lock_spin(&inp->input_lck);
5676			wtp = inp->wloop_thr;
5677			inp->wloop_thr = THREAD_NULL;
5678			ptp = inp->poll_thr;
5679			inp->poll_thr = THREAD_NULL;
5680			tp = inp->input_thr;	/* don't nullify now */
5681			inp->tag = 0;
5682			inp->net_affinity = FALSE;
5683			lck_mtx_unlock(&inp->input_lck);
5684
5685			/* Tear down poll thread affinity */
5686			if (ptp != NULL) {
5687				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
5688				(void) dlil_affinity_set(ptp,
5689				    THREAD_AFFINITY_TAG_NULL);
5690				thread_deallocate(ptp);
5691			}
5692
5693			/* Tear down workloop thread affinity */
5694			if (wtp != NULL) {
5695				(void) dlil_affinity_set(wtp,
5696				    THREAD_AFFINITY_TAG_NULL);
5697				thread_deallocate(wtp);
5698			}
5699
5700			/* Tear down DLIL input thread affinity */
5701			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
5702			thread_deallocate(tp);
5703		}
5704
5705		/* disassociate ifp DLIL input thread */
5706		ifp->if_inp = NULL;
5707
5708		lck_mtx_lock_spin(&inp->input_lck);
5709		inp->input_waiting |= DLIL_INPUT_TERMINATE;
5710		if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
5711			wakeup_one((caddr_t)&inp->input_waiting);
5712		}
5713		lck_mtx_unlock(&inp->input_lck);
5714	}
5715
5716	/* The driver might unload, so point these to ourselves */
5717	if_free = ifp->if_free;
5718	ifp->if_output = ifp_if_output;
5719	ifp->if_pre_enqueue = ifp_if_output;
5720	ifp->if_start = ifp_if_start;
5721	ifp->if_output_ctl = ifp_if_ctl;
5722	ifp->if_input_poll = ifp_if_input_poll;
5723	ifp->if_input_ctl = ifp_if_ctl;
5724	ifp->if_ioctl = ifp_if_ioctl;
5725	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
5726	ifp->if_free = ifp_if_free;
5727	ifp->if_demux = ifp_if_demux;
5728	ifp->if_event = ifp_if_event;
5729	ifp->if_framer_legacy = ifp_if_framer;
5730	ifp->if_framer = ifp_if_framer_extended;
5731	ifp->if_add_proto = ifp_if_add_proto;
5732	ifp->if_del_proto = ifp_if_del_proto;
5733	ifp->if_check_multi = ifp_if_check_multi;
5734
5735	/* wipe out interface description */
5736	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
5737	ifp->if_desc.ifd_len = 0;
5738	VERIFY(ifp->if_desc.ifd_desc != NULL);
5739	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
5740
5741	/* there shouldn't be any delegation by now */
5742	VERIFY(ifp->if_delegated.ifp == NULL);
5743	VERIFY(ifp->if_delegated.type == 0);
5744	VERIFY(ifp->if_delegated.family == 0);
5745	VERIFY(ifp->if_delegated.subfamily == 0);
5746	VERIFY(ifp->if_delegated.expensive == 0);
5747
5748	ifnet_lock_done(ifp);
5749
5750#if PF
5751	/*
5752	 * Detach this interface from packet filter, if enabled.
5753	 */
5754	pf_ifnet_hook(ifp, 0);
5755#endif /* PF */
5756
5757	/* Filter list should be empty */
5758	lck_mtx_lock_spin(&ifp->if_flt_lock);
5759	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
5760	VERIFY(ifp->if_flt_busy == 0);
5761	VERIFY(ifp->if_flt_waiters == 0);
5762	lck_mtx_unlock(&ifp->if_flt_lock);
5763
5764	/* Last chance to drain send queue */
5765	if_qflush(ifp, 0);
5766
5767	/* Last chance to cleanup any cached route */
5768	lck_mtx_lock(&ifp->if_cached_route_lock);
5769	VERIFY(!ifp->if_fwd_cacheok);
5770	ROUTE_RELEASE(&ifp->if_fwd_route);
5771	bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route));
5772	ROUTE_RELEASE(&ifp->if_src_route);
5773	bzero(&ifp->if_src_route, sizeof (ifp->if_src_route));
5774	ROUTE_RELEASE(&ifp->if_src_route6);
5775	bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6));
5776	lck_mtx_unlock(&ifp->if_cached_route_lock);
5777
5778	VERIFY(ifp->if_data_threshold == 0);
5779
5780	ifnet_llreach_ifdetach(ifp);
5781
5782	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0);
5783
5784	if (if_free != NULL)
5785		if_free(ifp);
5786
5787	/*
5788	 * Finally, mark this ifnet as detached.
5789	 */
5790	lck_mtx_lock_spin(&ifp->if_ref_lock);
5791	if (!(ifp->if_refflags & IFRF_DETACHING)) {
5792		panic("%s: flags mismatch (detaching not set) ifp=%p",
5793		    __func__, ifp);
5794		/* NOTREACHED */
5795	}
5796	ifp->if_refflags &= ~IFRF_DETACHING;
5797	lck_mtx_unlock(&ifp->if_ref_lock);
5798
5799	if (dlil_verbose)
5800		printf("%s: detached\n", if_name(ifp));
5801
5802	/* Release reference held during ifnet attach */
5803	ifnet_release(ifp);
5804}
5805
5806static errno_t
5807ifp_if_output(struct ifnet *ifp, struct mbuf *m)
5808{
5809#pragma unused(ifp)
5810	m_freem(m);
5811	return (0);
5812}
5813
5814static void
5815ifp_if_start(struct ifnet *ifp)
5816{
5817	ifnet_purge(ifp);
5818}
5819
5820static void
5821ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
5822    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
5823{
5824#pragma unused(ifp, flags, max_cnt)
5825	if (m_head != NULL)
5826		*m_head = NULL;
5827	if (m_tail != NULL)
5828		*m_tail = NULL;
5829	if (cnt != NULL)
5830		*cnt = 0;
5831	if (len != NULL)
5832		*len = 0;
5833}
5834
5835static errno_t
5836ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
5837{
5838#pragma unused(ifp, cmd, arglen, arg)
5839	return (EOPNOTSUPP);
5840}
5841
5842static errno_t
5843ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
5844{
5845#pragma unused(ifp, fh, pf)
5846	m_freem(m);
5847	return (EJUSTRETURN);
5848}
5849
5850static errno_t
5851ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
5852    const struct ifnet_demux_desc *da, u_int32_t dc)
5853{
5854#pragma unused(ifp, pf, da, dc)
5855	return (EINVAL);
5856}
5857
5858static errno_t
5859ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
5860{
5861#pragma unused(ifp, pf)
5862	return (EINVAL);
5863}
5864
5865static errno_t
5866ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
5867{
5868#pragma unused(ifp, sa)
5869	return (EOPNOTSUPP);
5870}
5871
5872static errno_t
5873ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
5874    const struct sockaddr *sa, const char *ll, const char *t)
5875{
5876#pragma unused(ifp, m, sa, ll, t)
5877	return (ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL));
5878}
5879
5880static errno_t
5881ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
5882    const struct sockaddr *sa, const char *ll, const char *t,
5883    u_int32_t *pre, u_int32_t *post)
5884{
5885#pragma unused(ifp, sa, ll, t)
5886	m_freem(*m);
5887	*m = NULL;
5888
5889	if (pre != NULL)
5890		*pre = 0;
5891	if (post != NULL)
5892		*post = 0;
5893
5894	return (EJUSTRETURN);
5895}
5896
5897errno_t
5898ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
5899{
5900#pragma unused(ifp, cmd, arg)
5901	return (EOPNOTSUPP);
5902}
5903
5904static errno_t
5905ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
5906{
5907#pragma unused(ifp, tm, f)
5908	/* XXX not sure what to do here */
5909	return (0);
5910}
5911
5912static void
5913ifp_if_free(struct ifnet *ifp)
5914{
5915#pragma unused(ifp)
5916}
5917
5918static void
5919ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
5920{
5921#pragma unused(ifp, e)
5922}
5923
5924__private_extern__
5925int dlil_if_acquire(u_int32_t family, const void *uniqueid,
5926    size_t uniqueid_len, struct ifnet **ifp)
5927{
5928	struct ifnet *ifp1 = NULL;
5929	struct dlil_ifnet *dlifp1 = NULL;
5930	void *buf, *base, **pbuf;
5931	int ret = 0;
5932
5933	dlil_if_lock();
5934	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
5935		ifp1 = (struct ifnet *)dlifp1;
5936
5937		if (ifp1->if_family != family)
5938			continue;
5939
5940		lck_mtx_lock(&dlifp1->dl_if_lock);
5941		/* same uniqueid and same len or no unique id specified */
5942		if ((uniqueid_len == dlifp1->dl_if_uniqueid_len) &&
5943		    !bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len)) {
5944			/* check for matching interface in use */
5945			if (dlifp1->dl_if_flags & DLIF_INUSE) {
5946				if (uniqueid_len) {
5947					ret = EBUSY;
5948					lck_mtx_unlock(&dlifp1->dl_if_lock);
5949					goto end;
5950				}
5951			} else {
5952				dlifp1->dl_if_flags |= (DLIF_INUSE|DLIF_REUSE);
5953				lck_mtx_unlock(&dlifp1->dl_if_lock);
5954				*ifp = ifp1;
5955				goto end;
5956			}
5957		}
5958		lck_mtx_unlock(&dlifp1->dl_if_lock);
5959	}
5960
5961	/* no interface found, allocate a new one */
5962	buf = zalloc(dlif_zone);
5963	if (buf == NULL) {
5964		ret = ENOMEM;
5965		goto end;
5966	}
5967	bzero(buf, dlif_bufsize);
5968
5969	/* Get the 64-bit aligned base address for this object */
5970	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
5971	    sizeof (u_int64_t));
5972	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));
5973
5974	/*
5975	 * Wind back a pointer size from the aligned base and
5976	 * save the original address so we can free it later.
5977	 */
5978	pbuf = (void **)((intptr_t)base - sizeof (void *));
5979	*pbuf = buf;
5980	dlifp1 = base;
5981
5982	if (uniqueid_len) {
5983		MALLOC(dlifp1->dl_if_uniqueid, void *, uniqueid_len,
5984		    M_NKE, M_WAITOK);
5985		if (dlifp1->dl_if_uniqueid == NULL) {
5986			zfree(dlif_zone, dlifp1);
5987			ret = ENOMEM;
5988			goto end;
5989		}
5990		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
5991		dlifp1->dl_if_uniqueid_len = uniqueid_len;
5992	}
5993
5994	ifp1 = (struct ifnet *)dlifp1;
5995	dlifp1->dl_if_flags = DLIF_INUSE;
5996	if (ifnet_debug) {
5997		dlifp1->dl_if_flags |= DLIF_DEBUG;
5998		dlifp1->dl_if_trace = dlil_if_trace;
5999	}
6000	ifp1->if_name = dlifp1->dl_if_namestorage;
6001	ifp1->if_xname = dlifp1->dl_if_xnamestorage;
6002
6003	/* initialize interface description */
6004	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
6005	ifp1->if_desc.ifd_len = 0;
6006	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;
6007
6008#if CONFIG_MACF_NET
6009	mac_ifnet_label_init(ifp1);
6010#endif
6011
6012	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
6013		DLIL_PRINTF("%s: failed to allocate if local stats, "
6014		    "error: %d\n", __func__, ret);
6015		/* This probably shouldn't be fatal */
6016		ret = 0;
6017	}
6018
6019	lck_mtx_init(&dlifp1->dl_if_lock, ifnet_lock_group, ifnet_lock_attr);
6020	lck_rw_init(&ifp1->if_lock, ifnet_lock_group, ifnet_lock_attr);
6021	lck_mtx_init(&ifp1->if_ref_lock, ifnet_lock_group, ifnet_lock_attr);
6022	lck_mtx_init(&ifp1->if_flt_lock, ifnet_lock_group, ifnet_lock_attr);
6023	lck_mtx_init(&ifp1->if_addrconfig_lock, ifnet_lock_group,
6024	    ifnet_lock_attr);
6025	lck_rw_init(&ifp1->if_llreach_lock, ifnet_lock_group, ifnet_lock_attr);
6026#if INET6
6027	lck_rw_init(&ifp1->if_inet6data_lock, ifnet_lock_group, ifnet_lock_attr);
6028	ifp1->if_inet6data = NULL;
6029#endif
6030
6031	/* for send data paths */
6032	lck_mtx_init(&ifp1->if_start_lock, ifnet_snd_lock_group,
6033	    ifnet_lock_attr);
6034	lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_snd_lock_group,
6035	    ifnet_lock_attr);
6036	lck_mtx_init(&ifp1->if_snd.ifcq_lock, ifnet_snd_lock_group,
6037	    ifnet_lock_attr);
6038
6039	/* for receive data paths */
6040	lck_mtx_init(&ifp1->if_poll_lock, ifnet_rcv_lock_group,
6041	    ifnet_lock_attr);
6042
6043	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);
6044
6045	*ifp = ifp1;
6046
6047end:
6048	dlil_if_unlock();
6049
6050	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof (u_int64_t)) &&
6051	    IS_P2ALIGNED(&ifp1->if_data, sizeof (u_int64_t))));
6052
6053	return (ret);
6054}
6055
6056__private_extern__ void
6057dlil_if_release(ifnet_t	ifp)
6058{
6059	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;
6060
6061	ifnet_lock_exclusive(ifp);
6062	lck_mtx_lock(&dlifp->dl_if_lock);
6063	dlifp->dl_if_flags &= ~DLIF_INUSE;
6064	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
6065	ifp->if_name = dlifp->dl_if_namestorage;
6066	/* Reset external name (name + unit) */
6067	ifp->if_xname = dlifp->dl_if_xnamestorage;
6068	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
6069	    "%s?", ifp->if_name);
6070	lck_mtx_unlock(&dlifp->dl_if_lock);
6071#if CONFIG_MACF_NET
6072	/*
6073	* We can either recycle the MAC label here or in dlil_if_acquire().
6074	* It seems logical to do it here but this means that anything that
6075	* still has a handle on ifp will now see it as unlabeled.
6076	* Since the interface is "dead" that may be OK.  Revisit later.
6077	*/
6078	mac_ifnet_label_recycle(ifp);
6079#endif
6080	ifnet_lock_done(ifp);
6081}
6082
6083__private_extern__ void
6084dlil_if_lock(void)
6085{
6086	lck_mtx_lock(&dlil_ifnet_lock);
6087}
6088
6089__private_extern__ void
6090dlil_if_unlock(void)
6091{
6092	lck_mtx_unlock(&dlil_ifnet_lock);
6093}
6094
6095__private_extern__ void
6096dlil_if_lock_assert(void)
6097{
6098	lck_mtx_assert(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
6099}
6100
6101__private_extern__ void
6102dlil_proto_unplumb_all(struct ifnet *ifp)
6103{
6104	/*
6105	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
6106	 * each bucket contains exactly one entry; PF_VLAN does not need an
6107	 * explicit unplumb.
6108	 *
6109	 * if_proto_hash[3] is for other protocols; we expect anything
6110	 * in this bucket to respond to the DETACHING event (which would
6111	 * have happened by now) and do the unplumb then.
6112	 */
6113	(void) proto_unplumb(PF_INET, ifp);
6114#if INET6
6115	(void) proto_unplumb(PF_INET6, ifp);
6116#endif /* INET6 */
6117}
6118
6119static void
6120ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
6121{
6122	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6123	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6124
6125	route_copyout(dst, &ifp->if_src_route, sizeof (*dst));
6126
6127	lck_mtx_unlock(&ifp->if_cached_route_lock);
6128}
6129
6130static void
6131ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
6132{
6133	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6134	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6135
6136	if (ifp->if_fwd_cacheok) {
6137		route_copyin(src, &ifp->if_src_route, sizeof (*src));
6138	} else {
6139		ROUTE_RELEASE(src);
6140	}
6141	lck_mtx_unlock(&ifp->if_cached_route_lock);
6142}
6143
6144#if INET6
6145static void
6146ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
6147{
6148	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6149	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6150
6151	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
6152	    sizeof (*dst));
6153
6154	lck_mtx_unlock(&ifp->if_cached_route_lock);
6155}
6156
6157static void
6158ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
6159{
6160	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
6161	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
6162
6163	if (ifp->if_fwd_cacheok) {
6164		route_copyin((struct route *)src,
6165		    (struct route *)&ifp->if_src_route6, sizeof (*src));
6166	} else {
6167		ROUTE_RELEASE(src);
6168	}
6169	lck_mtx_unlock(&ifp->if_cached_route_lock);
6170}
6171#endif /* INET6 */
6172
6173struct rtentry *
6174ifnet_cached_rtlookup_inet(struct ifnet	*ifp, struct in_addr src_ip)
6175{
6176	struct route		src_rt;
6177	struct sockaddr_in	*dst;
6178
6179	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);
6180
6181	ifp_src_route_copyout(ifp, &src_rt);
6182
6183	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
6184		ROUTE_RELEASE(&src_rt);
6185		if (dst->sin_family != AF_INET) {
6186			bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst));
6187			dst->sin_len = sizeof (src_rt.ro_dst);
6188			dst->sin_family = AF_INET;
6189		}
6190		dst->sin_addr = src_ip;
6191
6192		if (src_rt.ro_rt == NULL) {
6193			src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
6194			    0, 0, ifp->if_index);
6195
6196			if (src_rt.ro_rt != NULL) {
6197				/* retain a ref, copyin consumes one */
6198				struct rtentry	*rte = src_rt.ro_rt;
6199				RT_ADDREF(rte);
6200				ifp_src_route_copyin(ifp, &src_rt);
6201				src_rt.ro_rt = rte;
6202			}
6203		}
6204	}
6205
6206	return (src_rt.ro_rt);
6207}
6208
6209#if INET6
6210struct rtentry*
6211ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
6212{
6213	struct route_in6 src_rt;
6214
6215	ifp_src_route6_copyout(ifp, &src_rt);
6216
6217	if (ROUTE_UNUSABLE(&src_rt) ||
6218	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
6219		ROUTE_RELEASE(&src_rt);
6220		if (src_rt.ro_dst.sin6_family != AF_INET6) {
6221			bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst));
6222			src_rt.ro_dst.sin6_len = sizeof (src_rt.ro_dst);
6223			src_rt.ro_dst.sin6_family = AF_INET6;
6224		}
6225		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
6226		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
6227		    sizeof (src_rt.ro_dst.sin6_addr));
6228
6229		if (src_rt.ro_rt == NULL) {
6230			src_rt.ro_rt = rtalloc1_scoped(
6231			    (struct sockaddr *)&src_rt.ro_dst, 0, 0,
6232			    ifp->if_index);
6233
6234			if (src_rt.ro_rt != NULL) {
6235				/* retain a ref, copyin consumes one */
6236				struct rtentry	*rte = src_rt.ro_rt;
6237				RT_ADDREF(rte);
6238				ifp_src_route6_copyin(ifp, &src_rt);
6239				src_rt.ro_rt = rte;
6240			}
6241		}
6242	}
6243
6244	return (src_rt.ro_rt);
6245}
6246#endif /* INET6 */
6247
6248void
6249if_lqm_update(struct ifnet *ifp, int lqm)
6250{
6251	struct kev_dl_link_quality_metric_data ev_lqm_data;
6252
6253	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);
6254
6255	/* Normalize to edge */
6256	if (lqm > IFNET_LQM_THRESH_UNKNOWN && lqm <= IFNET_LQM_THRESH_BAD)
6257		lqm = IFNET_LQM_THRESH_BAD;
6258	else if (lqm > IFNET_LQM_THRESH_BAD && lqm <= IFNET_LQM_THRESH_POOR)
6259		lqm = IFNET_LQM_THRESH_POOR;
6260	else if (lqm > IFNET_LQM_THRESH_POOR && lqm <= IFNET_LQM_THRESH_GOOD)
6261		lqm = IFNET_LQM_THRESH_GOOD;
6262
6263	ifnet_lock_exclusive(ifp);
6264	if (lqm == ifp->if_lqm) {
6265		ifnet_lock_done(ifp);
6266		return;		/* nothing to update */
6267	}
6268	ifp->if_lqm = lqm;
6269	ifnet_lock_done(ifp);
6270
6271	bzero(&ev_lqm_data, sizeof (ev_lqm_data));
6272	ev_lqm_data.link_quality_metric = lqm;
6273
6274	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
6275	    (struct net_event_data *)&ev_lqm_data, sizeof (ev_lqm_data));
6276}
6277
6278/* for uuid.c */
6279int
6280uuid_get_ethernet(u_int8_t *node)
6281{
6282	struct ifnet *ifp;
6283	struct sockaddr_dl *sdl;
6284
6285	ifnet_head_lock_shared();
6286	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
6287		ifnet_lock_shared(ifp);
6288		IFA_LOCK_SPIN(ifp->if_lladdr);
6289		sdl = (struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr;
6290		if (sdl->sdl_type == IFT_ETHER) {
6291			memcpy(node, LLADDR(sdl), ETHER_ADDR_LEN);
6292			IFA_UNLOCK(ifp->if_lladdr);
6293			ifnet_lock_done(ifp);
6294			ifnet_head_done();
6295			return (0);
6296		}
6297		IFA_UNLOCK(ifp->if_lladdr);
6298		ifnet_lock_done(ifp);
6299	}
6300	ifnet_head_done();
6301
6302	return (-1);
6303}
6304
6305static int
6306sysctl_rxpoll SYSCTL_HANDLER_ARGS
6307{
6308#pragma unused(arg1, arg2)
6309	uint32_t i;
6310	int err;
6311
6312	i = if_rxpoll;
6313
6314	err = sysctl_handle_int(oidp, &i, 0, req);
6315	if (err != 0 || req->newptr == USER_ADDR_NULL)
6316		return (err);
6317
6318	if (net_rxpoll == 0)
6319		return (ENXIO);
6320
6321	if_rxpoll = i;
6322	return (err);
6323}
6324
6325static int
6326sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
6327{
6328#pragma unused(arg1, arg2)
6329	uint64_t q;
6330	int err;
6331
6332	q = if_rxpoll_mode_holdtime;
6333
6334	err = sysctl_handle_quad(oidp, &q, 0, req);
6335	if (err != 0 || req->newptr == USER_ADDR_NULL)
6336		return (err);
6337
6338	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN)
6339		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
6340
6341	if_rxpoll_mode_holdtime = q;
6342
6343	return (err);
6344}
6345
6346static int
6347sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
6348{
6349#pragma unused(arg1, arg2)
6350	uint64_t q;
6351	int err;
6352
6353	q = if_rxpoll_sample_holdtime;
6354
6355	err = sysctl_handle_quad(oidp, &q, 0, req);
6356	if (err != 0 || req->newptr == USER_ADDR_NULL)
6357		return (err);
6358
6359	if (q < IF_RXPOLL_SAMPLETIME_MIN)
6360		q = IF_RXPOLL_SAMPLETIME_MIN;
6361
6362	if_rxpoll_sample_holdtime = q;
6363
6364	return (err);
6365}
6366
6367static int
6368sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
6369{
6370#pragma unused(arg1, arg2)
6371	uint64_t q;
6372	int err;
6373
6374	q = if_rxpoll_interval_time;
6375
6376	err = sysctl_handle_quad(oidp, &q, 0, req);
6377	if (err != 0 || req->newptr == USER_ADDR_NULL)
6378		return (err);
6379
6380	if (q < IF_RXPOLL_INTERVALTIME_MIN)
6381		q = IF_RXPOLL_INTERVALTIME_MIN;
6382
6383	if_rxpoll_interval_time = q;
6384
6385	return (err);
6386}
6387
6388static int
6389sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
6390{
6391#pragma unused(arg1, arg2)
6392	uint32_t i;
6393	int err;
6394
6395	i = if_rxpoll_wlowat;
6396
6397	err = sysctl_handle_int(oidp, &i, 0, req);
6398	if (err != 0 || req->newptr == USER_ADDR_NULL)
6399		return (err);
6400
6401	if (i == 0 || i >= if_rxpoll_whiwat)
6402		return (EINVAL);
6403
6404	if_rxpoll_wlowat = i;
6405	return (err);
6406}
6407
6408static int
6409sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
6410{
6411#pragma unused(arg1, arg2)
6412	uint32_t i;
6413	int err;
6414
6415	i = if_rxpoll_whiwat;
6416
6417	err = sysctl_handle_int(oidp, &i, 0, req);
6418	if (err != 0 || req->newptr == USER_ADDR_NULL)
6419		return (err);
6420
6421	if (i <= if_rxpoll_wlowat)
6422		return (EINVAL);
6423
6424	if_rxpoll_whiwat = i;
6425	return (err);
6426}
6427
6428static int
6429sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
6430{
6431#pragma unused(arg1, arg2)
6432	int i, err;
6433
6434	i = if_sndq_maxlen;
6435
6436	err = sysctl_handle_int(oidp, &i, 0, req);
6437	if (err != 0 || req->newptr == USER_ADDR_NULL)
6438		return (err);
6439
6440	if (i < IF_SNDQ_MINLEN)
6441		i = IF_SNDQ_MINLEN;
6442
6443	if_sndq_maxlen = i;
6444	return (err);
6445}
6446
6447static int
6448sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
6449{
6450#pragma unused(arg1, arg2)
6451	int i, err;
6452
6453	i = if_rcvq_maxlen;
6454
6455	err = sysctl_handle_int(oidp, &i, 0, req);
6456	if (err != 0 || req->newptr == USER_ADDR_NULL)
6457		return (err);
6458
6459	if (i < IF_RCVQ_MINLEN)
6460		i = IF_RCVQ_MINLEN;
6461
6462	if_rcvq_maxlen = i;
6463	return (err);
6464}
6465
6466void
6467dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
6468    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
6469{
6470	struct kev_dl_node_presence kev;
6471	struct sockaddr_dl *sdl;
6472	struct sockaddr_in6 *sin6;
6473
6474	VERIFY(ifp);
6475	VERIFY(sa);
6476	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
6477
6478	bzero(&kev, sizeof (kev));
6479	sin6 = &kev.sin6_node_address;
6480	sdl = &kev.sdl_node_address;
6481	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
6482	kev.rssi = rssi;
6483	kev.link_quality_metric = lqm;
6484	kev.node_proximity_metric = npm;
6485	bcopy(srvinfo, kev.node_service_info, sizeof (kev.node_service_info));
6486
6487	nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
6488	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
6489	    &kev.link_data, sizeof (kev));
6490}
6491
6492void
6493dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
6494{
6495	struct kev_dl_node_absence kev;
6496	struct sockaddr_in6 *sin6;
6497	struct sockaddr_dl *sdl;
6498
6499	VERIFY(ifp);
6500	VERIFY(sa);
6501	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
6502
6503	bzero(&kev, sizeof (kev));
6504	sin6 = &kev.sin6_node_address;
6505	sdl = &kev.sdl_node_address;
6506	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
6507
6508	nd6_alt_node_absent(ifp, sin6);
6509	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
6510	    &kev.link_data, sizeof (kev));
6511}
6512
6513const void *
6514dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
6515	kauth_cred_t *credp)
6516{
6517	const u_int8_t *bytes;
6518	size_t size;
6519
6520	bytes = CONST_LLADDR(sdl);
6521	size = sdl->sdl_alen;
6522
6523#if CONFIG_MACF
6524	if (dlil_lladdr_ckreq) {
6525		switch (sdl->sdl_type) {
6526		case IFT_ETHER:
6527		case IFT_IEEE1394:
6528			break;
6529		default:
6530			credp = NULL;
6531			break;
6532		};
6533
6534		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
6535			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
6536			    [0] = 2
6537			};
6538
6539			switch (sdl->sdl_type) {
6540			case IFT_ETHER:
6541				VERIFY(size == ETHER_ADDR_LEN);
6542				bytes = unspec;
6543				break;
6544			case IFT_IEEE1394:
6545				VERIFY(size == FIREWIRE_EUI64_LEN);
6546				bytes = unspec;
6547				break;
6548			default:
6549				VERIFY(FALSE);
6550				break;
6551			};
6552		}
6553	}
6554#else
6555#pragma unused(credp)
6556#endif
6557
6558	if (sizep != NULL) *sizep = size;
6559	return (bytes);
6560}
6561
6562void
6563dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
6564    u_int8_t info[DLIL_MODARGLEN])
6565{
6566	struct kev_dl_issues kev;
6567	struct timeval tv;
6568
6569	VERIFY(ifp != NULL);
6570	VERIFY(modid != NULL);
6571	_CASSERT(sizeof (kev.modid) == DLIL_MODIDLEN);
6572	_CASSERT(sizeof (kev.info) == DLIL_MODARGLEN);
6573
6574	bzero(&kev, sizeof (&kev));
6575
6576	microtime(&tv);
6577	kev.timestamp = tv.tv_sec;
6578	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
6579	if (info != NULL)
6580		bcopy(info, &kev.info, DLIL_MODARGLEN);
6581
6582	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
6583	    &kev.link_data, sizeof (kev));
6584}
6585
6586errno_t
6587ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
6588    struct proc *p)
6589{
6590	u_int32_t level = IFNET_THROTTLE_OFF;
6591	errno_t result = 0;
6592
6593	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
6594
6595	if (cmd == SIOCSIFOPPORTUNISTIC) {
6596		/*
6597		 * XXX: Use priv_check_cred() instead of root check?
6598		 */
6599		if ((result = proc_suser(p)) != 0)
6600			return (result);
6601
6602		if (ifr->ifr_opportunistic.ifo_flags ==
6603		    IFRIFOF_BLOCK_OPPORTUNISTIC)
6604			level = IFNET_THROTTLE_OPPORTUNISTIC;
6605		else if (ifr->ifr_opportunistic.ifo_flags == 0)
6606			level = IFNET_THROTTLE_OFF;
6607		else
6608			result = EINVAL;
6609
6610		if (result == 0)
6611			result = ifnet_set_throttle(ifp, level);
6612	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
6613		ifr->ifr_opportunistic.ifo_flags = 0;
6614		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
6615			ifr->ifr_opportunistic.ifo_flags |=
6616			    IFRIFOF_BLOCK_OPPORTUNISTIC;
6617		}
6618	}
6619
6620	/*
6621	 * Return the count of current opportunistic connections
6622	 * over the interface.
6623	 */
6624	if (result == 0) {
6625		uint32_t flags = 0;
6626		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
6627			INPCB_OPPORTUNISTIC_SETCMD : 0;
6628		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
6629			INPCB_OPPORTUNISTIC_THROTTLEON : 0;
6630		ifr->ifr_opportunistic.ifo_inuse =
6631		    udp_count_opportunistic(ifp->if_index, flags) +
6632		    tcp_count_opportunistic(ifp->if_index, flags);
6633	}
6634
6635	if (result == EALREADY)
6636		result = 0;
6637
6638	return (result);
6639}
6640
6641int
6642ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
6643{
6644	struct ifclassq *ifq;
6645	int err = 0;
6646
6647	if (!(ifp->if_eflags & IFEF_TXSTART))
6648		return (ENXIO);
6649
6650	*level = IFNET_THROTTLE_OFF;
6651
6652	ifq = &ifp->if_snd;
6653	IFCQ_LOCK(ifq);
6654	/* Throttling works only for IFCQ, not ALTQ instances */
6655	if (IFCQ_IS_ENABLED(ifq))
6656		IFCQ_GET_THROTTLE(ifq, *level, err);
6657	IFCQ_UNLOCK(ifq);
6658
6659	return (err);
6660}
6661
6662int
6663ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
6664{
6665	struct ifclassq *ifq;
6666	int err = 0;
6667
6668	if (!(ifp->if_eflags & IFEF_TXSTART))
6669		return (ENXIO);
6670
6671	ifq = &ifp->if_snd;
6672
6673	switch (level) {
6674	case IFNET_THROTTLE_OFF:
6675	case IFNET_THROTTLE_OPPORTUNISTIC:
6676#if PF_ALTQ
6677		/* Throttling works only for IFCQ, not ALTQ instances */
6678		if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq)))
6679			return (ENXIO);
6680#endif /* PF_ALTQ */
6681		break;
6682	default:
6683		return (EINVAL);
6684	}
6685
6686	IFCQ_LOCK(ifq);
6687	if (IFCQ_IS_ENABLED(ifq))
6688		IFCQ_SET_THROTTLE(ifq, level, err);
6689	IFCQ_UNLOCK(ifq);
6690
6691	if (err == 0) {
6692		printf("%s: throttling level set to %d\n", if_name(ifp),
6693		    level);
6694		if (level == IFNET_THROTTLE_OFF)
6695			ifnet_start(ifp);
6696	}
6697
6698	return (err);
6699}
6700
6701errno_t
6702ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
6703    struct proc *p)
6704{
6705#pragma unused(p)
6706	errno_t result = 0;
6707	uint32_t flags;
6708	int level, category, subcategory;
6709
6710	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
6711
6712	if (cmd == SIOCSIFLOG) {
6713		if ((result = priv_check_cred(kauth_cred_get(),
6714		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0)
6715			return (result);
6716
6717		level = ifr->ifr_log.ifl_level;
6718		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX)
6719			result = EINVAL;
6720
6721		flags = ifr->ifr_log.ifl_flags;
6722		if ((flags &= IFNET_LOGF_MASK) == 0)
6723			result = EINVAL;
6724
6725		category = ifr->ifr_log.ifl_category;
6726		subcategory = ifr->ifr_log.ifl_subcategory;
6727
6728		if (result == 0)
6729			result = ifnet_set_log(ifp, level, flags,
6730			    category, subcategory);
6731	} else {
6732		result = ifnet_get_log(ifp, &level, &flags, &category,
6733		    &subcategory);
6734		if (result == 0) {
6735			ifr->ifr_log.ifl_level = level;
6736			ifr->ifr_log.ifl_flags = flags;
6737			ifr->ifr_log.ifl_category = category;
6738			ifr->ifr_log.ifl_subcategory = subcategory;
6739		}
6740	}
6741
6742	return (result);
6743}
6744
6745int
6746ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
6747    int32_t category, int32_t subcategory)
6748{
6749	int err = 0;
6750
6751	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
6752	VERIFY(flags & IFNET_LOGF_MASK);
6753
6754	/*
6755	 * The logging level applies to all facilities; make sure to
6756	 * update them all with the most current level.
6757	 */
6758	flags |= ifp->if_log.flags;
6759
6760	if (ifp->if_output_ctl != NULL) {
6761		struct ifnet_log_params l;
6762
6763		bzero(&l, sizeof (l));
6764		l.level = level;
6765		l.flags = flags;
6766		l.flags &= ~IFNET_LOGF_DLIL;
6767		l.category = category;
6768		l.subcategory = subcategory;
6769
6770		/* Send this request to lower layers */
6771		if (l.flags != 0) {
6772			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
6773			    sizeof (l), &l);
6774		}
6775	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
6776		/*
6777		 * If targeted to the lower layers without an output
6778		 * control callback registered on the interface, just
6779		 * silently ignore facilities other than ours.
6780		 */
6781		flags &= IFNET_LOGF_DLIL;
6782		if (flags == 0 && (!ifp->if_log.flags & IFNET_LOGF_DLIL))
6783			level = 0;
6784	}
6785
6786	if (err == 0) {
6787		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT)
6788			ifp->if_log.flags = 0;
6789		else
6790			ifp->if_log.flags |= flags;
6791
6792		log(LOG_INFO, "%s: logging level set to %d flags=%b "
6793		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
6794		    ifp->if_log.level, ifp->if_log.flags,
6795		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
6796		    category, subcategory);
6797	}
6798
6799	return (err);
6800}
6801
6802int
6803ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
6804    int32_t *category, int32_t *subcategory)
6805{
6806	if (level != NULL)
6807		*level = ifp->if_log.level;
6808	if (flags != NULL)
6809		*flags = ifp->if_log.flags;
6810	if (category != NULL)
6811		*category = ifp->if_log.category;
6812	if (subcategory != NULL)
6813		*subcategory = ifp->if_log.subcategory;
6814
6815	return (0);
6816}
6817
6818int
6819ifnet_notify_address(struct ifnet *ifp, int af)
6820{
6821	struct ifnet_notify_address_params na;
6822
6823#if PF
6824	(void) pf_ifaddr_hook(ifp);
6825#endif /* PF */
6826
6827	if (ifp->if_output_ctl == NULL)
6828		return (EOPNOTSUPP);
6829
6830	bzero(&na, sizeof (na));
6831	na.address_family = af;
6832
6833	return (ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
6834	    sizeof (na), &na));
6835}
6836
6837errno_t
6838ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
6839{
6840	if (ifp == NULL || flowid == NULL) {
6841		return (EINVAL);
6842	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6843	    !(ifp->if_refflags & IFRF_ATTACHED)) {
6844		return (ENXIO);
6845	}
6846
6847	*flowid = ifp->if_flowhash;
6848
6849	return (0);
6850}
6851
6852errno_t
6853ifnet_disable_output(struct ifnet *ifp)
6854{
6855	int err;
6856
6857	if (ifp == NULL) {
6858		return (EINVAL);
6859	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6860	    !(ifp->if_refflags & IFRF_ATTACHED)) {
6861		return (ENXIO);
6862	}
6863
6864	if ((err = ifnet_fc_add(ifp)) == 0) {
6865		lck_mtx_lock_spin(&ifp->if_start_lock);
6866		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
6867		lck_mtx_unlock(&ifp->if_start_lock);
6868	}
6869	return (err);
6870}
6871
6872errno_t
6873ifnet_enable_output(struct ifnet *ifp)
6874{
6875	if (ifp == NULL) {
6876		return (EINVAL);
6877	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6878	    !(ifp->if_refflags & IFRF_ATTACHED)) {
6879		return (ENXIO);
6880	}
6881
6882	ifnet_start_common(ifp, 1);
6883	return (0);
6884}
6885
6886void
6887ifnet_flowadv(uint32_t flowhash)
6888{
6889	struct ifnet_fc_entry *ifce;
6890	struct ifnet *ifp;
6891
6892	ifce = ifnet_fc_get(flowhash);
6893	if (ifce == NULL)
6894		return;
6895
6896	VERIFY(ifce->ifce_ifp != NULL);
6897	ifp = ifce->ifce_ifp;
6898
6899	/* flow hash gets recalculated per attach, so check */
6900	if (ifnet_is_attached(ifp, 1)) {
6901		if (ifp->if_flowhash == flowhash)
6902			(void) ifnet_enable_output(ifp);
6903		ifnet_decr_iorefcnt(ifp);
6904	}
6905	ifnet_fc_entry_free(ifce);
6906}
6907
6908/*
6909 * Function to compare ifnet_fc_entries in ifnet flow control tree
6910 */
6911static inline int
6912ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
6913{
6914	return (fc1->ifce_flowhash - fc2->ifce_flowhash);
6915}
6916
6917static int
6918ifnet_fc_add(struct ifnet *ifp)
6919{
6920	struct ifnet_fc_entry keyfc, *ifce;
6921	uint32_t flowhash;
6922
6923	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
6924	VERIFY(ifp->if_flowhash != 0);
6925	flowhash = ifp->if_flowhash;
6926
6927	bzero(&keyfc, sizeof (keyfc));
6928	keyfc.ifce_flowhash = flowhash;
6929
6930	lck_mtx_lock_spin(&ifnet_fc_lock);
6931	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
6932	if (ifce != NULL && ifce->ifce_ifp == ifp) {
6933		/* Entry is already in ifnet_fc_tree, return */
6934		lck_mtx_unlock(&ifnet_fc_lock);
6935		return (0);
6936	}
6937
6938	if (ifce != NULL) {
6939		/*
6940		 * There is a different fc entry with the same flow hash
6941		 * but different ifp pointer.  There can be a collision
6942		 * on flow hash but the probability is low.  Let's just
6943		 * avoid adding a second one when there is a collision.
6944		 */
6945		lck_mtx_unlock(&ifnet_fc_lock);
6946		return (EAGAIN);
6947	}
6948
6949	/* become regular mutex */
6950	lck_mtx_convert_spin(&ifnet_fc_lock);
6951
6952	ifce = zalloc_noblock(ifnet_fc_zone);
6953	if (ifce == NULL) {
6954		/* memory allocation failed */
6955		lck_mtx_unlock(&ifnet_fc_lock);
6956		return (ENOMEM);
6957	}
6958	bzero(ifce, ifnet_fc_zone_size);
6959
6960	ifce->ifce_flowhash = flowhash;
6961	ifce->ifce_ifp = ifp;
6962
6963	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
6964	lck_mtx_unlock(&ifnet_fc_lock);
6965	return (0);
6966}
6967
6968static struct ifnet_fc_entry *
6969ifnet_fc_get(uint32_t flowhash)
6970{
6971	struct ifnet_fc_entry keyfc, *ifce;
6972	struct ifnet *ifp;
6973
6974	bzero(&keyfc, sizeof (keyfc));
6975	keyfc.ifce_flowhash = flowhash;
6976
6977	lck_mtx_lock_spin(&ifnet_fc_lock);
6978	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
6979	if (ifce == NULL) {
6980		/* Entry is not present in ifnet_fc_tree, return */
6981		lck_mtx_unlock(&ifnet_fc_lock);
6982		return (NULL);
6983	}
6984
6985	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);
6986
6987	VERIFY(ifce->ifce_ifp != NULL);
6988	ifp = ifce->ifce_ifp;
6989
6990	/* become regular mutex */
6991	lck_mtx_convert_spin(&ifnet_fc_lock);
6992
6993	if (!ifnet_is_attached(ifp, 0)) {
6994		/*
6995		 * This ifp is not attached or in the process of being
6996		 * detached; just don't process it.
6997		 */
6998		ifnet_fc_entry_free(ifce);
6999		ifce = NULL;
7000	}
7001	lck_mtx_unlock(&ifnet_fc_lock);
7002
7003	return (ifce);
7004}
7005
7006static void
7007ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
7008{
7009	zfree(ifnet_fc_zone, ifce);
7010}
7011
7012static uint32_t
7013ifnet_calc_flowhash(struct ifnet *ifp)
7014{
7015	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
7016	uint32_t flowhash = 0;
7017
7018	if (ifnet_flowhash_seed == 0)
7019		ifnet_flowhash_seed = RandomULong();
7020
7021	bzero(&fh, sizeof (fh));
7022
7023	(void) snprintf(fh.ifk_name, sizeof (fh.ifk_name), "%s", ifp->if_name);
7024	fh.ifk_unit = ifp->if_unit;
7025	fh.ifk_flags = ifp->if_flags;
7026	fh.ifk_eflags = ifp->if_eflags;
7027	fh.ifk_capabilities = ifp->if_capabilities;
7028	fh.ifk_capenable = ifp->if_capenable;
7029	fh.ifk_output_sched_model = ifp->if_output_sched_model;
7030	fh.ifk_rand1 = RandomULong();
7031	fh.ifk_rand2 = RandomULong();
7032
7033try_again:
7034	flowhash = net_flowhash(&fh, sizeof (fh), ifnet_flowhash_seed);
7035	if (flowhash == 0) {
7036		/* try to get a non-zero flowhash */
7037		ifnet_flowhash_seed = RandomULong();
7038		goto try_again;
7039	}
7040
7041	return (flowhash);
7042}
7043
7044static void
7045dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
7046    protocol_family_t pf)
7047{
7048#pragma unused(ifp)
7049	uint32_t did_sw;
7050
7051	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
7052	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4|CSUM_TSO_IPV6)))
7053		return;
7054
7055	switch (pf) {
7056	case PF_INET:
7057		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
7058		if (did_sw & CSUM_DELAY_IP)
7059			hwcksum_dbg_finalized_hdr++;
7060		if (did_sw & CSUM_DELAY_DATA)
7061			hwcksum_dbg_finalized_data++;
7062		break;
7063#if INET6
7064	case PF_INET6:
7065		/*
7066		 * Checksum offload should not have been enabled when
7067		 * extension headers exist; that also means that we
7068		 * cannot force-finalize packets with extension headers.
7069		 * Indicate to the callee should it skip such case by
7070		 * setting optlen to -1.
7071		 */
7072		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
7073		    m->m_pkthdr.csum_flags);
7074		if (did_sw & CSUM_DELAY_IPV6_DATA)
7075			hwcksum_dbg_finalized_data++;
7076		break;
7077#endif /* INET6 */
7078	default:
7079		return;
7080	}
7081}
7082
7083static void
7084dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
7085    protocol_family_t pf)
7086{
7087	uint16_t sum;
7088	uint32_t hlen;
7089
7090	if (frame_header == NULL ||
7091	    frame_header < (char *)mbuf_datastart(m) ||
7092	    frame_header > (char *)m->m_data) {
7093		printf("%s: frame header pointer 0x%llx out of range "
7094		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
7095		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
7096		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
7097		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7098		    (uint64_t)VM_KERNEL_ADDRPERM(m));
7099		return;
7100	}
7101	hlen = (m->m_data - frame_header);
7102
7103	switch (pf) {
7104	case PF_INET:
7105#if INET6
7106	case PF_INET6:
7107#endif /* INET6 */
7108		break;
7109	default:
7110		return;
7111	}
7112
7113	/*
7114	 * Force partial checksum offload; useful to simulate cases
7115	 * where the hardware does not support partial checksum offload,
7116	 * in order to validate correctness throughout the layers above.
7117	 */
7118	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
7119		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;
7120
7121		if (foff > (uint32_t)m->m_pkthdr.len)
7122			return;
7123
7124		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
7125
7126		/* Compute 16-bit 1's complement sum from forced offset */
7127		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));
7128
7129		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
7130		m->m_pkthdr.csum_rx_val = sum;
7131		m->m_pkthdr.csum_rx_start = (foff + hlen);
7132
7133		hwcksum_dbg_partial_forced++;
7134		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
7135	}
7136
7137	/*
7138	 * Partial checksum offload verification (and adjustment);
7139	 * useful to validate and test cases where the hardware
7140	 * supports partial checksum offload.
7141	 */
7142	if ((m->m_pkthdr.csum_flags &
7143	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
7144	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7145		uint32_t rxoff;
7146
7147		/* Start offset must begin after frame header */
7148		rxoff = m->m_pkthdr.csum_rx_start;
7149		if (hlen > rxoff) {
7150			hwcksum_dbg_bad_rxoff++;
7151			if (dlil_verbose) {
7152				printf("%s: partial cksum start offset %d "
7153				    "is less than frame header length %d for "
7154				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
7155				    (uint64_t)VM_KERNEL_ADDRPERM(m));
7156			}
7157			return;
7158		}
7159		rxoff -=hlen;
7160
7161		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
7162			/*
7163			 * Compute the expected 16-bit 1's complement sum;
7164			 * skip this if we've already computed it above
7165			 * when partial checksum offload is forced.
7166			 */
7167			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));
7168
7169			/* Hardware or driver is buggy */
7170			if (sum != m->m_pkthdr.csum_rx_val) {
7171				hwcksum_dbg_bad_cksum++;
7172				if (dlil_verbose) {
7173					printf("%s: bad partial cksum value "
7174					    "0x%x (expected 0x%x) for mbuf "
7175					    "0x%llx [rx_start %d]\n",
7176					    if_name(ifp),
7177					    m->m_pkthdr.csum_rx_val, sum,
7178					    (uint64_t)VM_KERNEL_ADDRPERM(m),
7179					    m->m_pkthdr.csum_rx_start);
7180				}
7181				return;
7182			}
7183		}
7184		hwcksum_dbg_verified++;
7185
7186		/*
7187		 * This code allows us to emulate various hardwares that
7188		 * perform 16-bit 1's complement sum beginning at various
7189		 * start offset values.
7190		 */
7191		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
7192			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;
7193
7194			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len)
7195				return;
7196
7197			sum = m_adj_sum16(m, rxoff, aoff, sum);
7198
7199			m->m_pkthdr.csum_rx_val = sum;
7200			m->m_pkthdr.csum_rx_start = (aoff + hlen);
7201
7202			hwcksum_dbg_adjusted++;
7203		}
7204	}
7205}
7206
7207static int
7208sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
7209{
7210#pragma unused(arg1, arg2)
7211	u_int32_t i;
7212	int err;
7213
7214	i = hwcksum_dbg_mode;
7215
7216	err = sysctl_handle_int(oidp, &i, 0, req);
7217	if (err != 0 || req->newptr == USER_ADDR_NULL)
7218		return (err);
7219
7220	if (hwcksum_dbg == 0)
7221		return (ENODEV);
7222
7223	if ((i & ~HWCKSUM_DBG_MASK) != 0)
7224		return (EINVAL);
7225
7226	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
7227
7228	return (err);
7229}
7230
7231static int
7232sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
7233{
7234#pragma unused(arg1, arg2)
7235	u_int32_t i;
7236	int err;
7237
7238	i = hwcksum_dbg_partial_rxoff_forced;
7239
7240	err = sysctl_handle_int(oidp, &i, 0, req);
7241	if (err != 0 || req->newptr == USER_ADDR_NULL)
7242		return (err);
7243
7244	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED))
7245		return (ENODEV);
7246
7247	hwcksum_dbg_partial_rxoff_forced = i;
7248
7249	return (err);
7250}
7251
7252static int
7253sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
7254{
7255#pragma unused(arg1, arg2)
7256	u_int32_t i;
7257	int err;
7258
7259	i = hwcksum_dbg_partial_rxoff_adj;
7260
7261	err = sysctl_handle_int(oidp, &i, 0, req);
7262	if (err != 0 || req->newptr == USER_ADDR_NULL)
7263		return (err);
7264
7265	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ))
7266		return (ENODEV);
7267
7268	hwcksum_dbg_partial_rxoff_adj = i;
7269
7270	return (err);
7271}
7272
7273#if DEBUG
7274/* Blob for sum16 verification */
7275static uint8_t sumdata[] = {
7276	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
7277	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
7278	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
7279	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
7280	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
7281	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
7282	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
7283	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
7284	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
7285	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
7286	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
7287	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
7288	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
7289	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
7290	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
7291	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
7292	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
7293	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
7294	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
7295	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
7296	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
7297	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
7298	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
7299	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
7300	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
7301	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
7302	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
7303	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
7304	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
7305	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
7306	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
7307	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
7308	0xc8, 0x28, 0x02, 0x00, 0x00
7309};
7310
7311/* Precomputed 16-bit 1's complement sums for various spans of the above data */
7312static struct {
7313	int		len;
7314	uint16_t	sum;
7315} sumtbl[] = {
7316	{	11,	0xcb6d	},
7317	{	20,	0x20dd	},
7318	{	27,	0xbabd	},
7319	{	32,	0xf3e8	},
7320	{	37,	0x197d	},
7321	{	43,	0x9eae	},
7322	{	64,	0x4678	},
7323	{	127,	0x9399	},
7324	{	256,	0xd147	},
7325	{	325,	0x0358	}
7326};
7327#define	SUMTBL_MAX	((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
7328
7329static void
7330dlil_verify_sum16(void)
7331{
7332	struct mbuf *m;
7333	uint8_t *buf;
7334	int n;
7335
7336	/* Make sure test data plus extra room for alignment fits in cluster */
7337	_CASSERT((sizeof (sumdata) + (sizeof (uint64_t) * 2)) <= MCLBYTES);
7338
7339	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
7340	MH_ALIGN(m, sizeof (uint32_t));		/* 32-bit starting alignment */
7341	buf = mtod(m, uint8_t *);		/* base address */
7342
7343	for (n = 0; n < SUMTBL_MAX; n++) {
7344		uint16_t len = sumtbl[n].len;
7345		int i;
7346
7347		/* Verify for all possible alignments */
7348		for (i = 0; i < (int)sizeof (uint64_t); i++) {
7349			uint16_t sum;
7350			uint8_t *c;
7351
7352			/* Copy over test data to mbuf */
7353			VERIFY(len <= sizeof (sumdata));
7354			c = buf + i;
7355			bcopy(sumdata, c, len);
7356
7357			/* Zero-offset test (align by data pointer) */
7358			m->m_data = (caddr_t)c;
7359			m->m_len = len;
7360			sum = m_sum16(m, 0, len);
7361
7362			/* Something is horribly broken; stop now */
7363			if (sum != sumtbl[n].sum) {
7364				panic("%s: broken m_sum16 for len=%d align=%d "
7365				    "sum=0x%04x [expected=0x%04x]\n", __func__,
7366				    len, i, sum, sumtbl[n].sum);
7367				/* NOTREACHED */
7368			}
7369
7370			/* Alignment test by offset (fixed data pointer) */
7371			m->m_data = (caddr_t)buf;
7372			m->m_len = i + len;
7373			sum = m_sum16(m, i, len);
7374
7375			/* Something is horribly broken; stop now */
7376			if (sum != sumtbl[n].sum) {
7377				panic("%s: broken m_sum16 for len=%d offset=%d "
7378				    "sum=0x%04x [expected=0x%04x]\n", __func__,
7379				    len, i, sum, sumtbl[n].sum);
7380				/* NOTREACHED */
7381			}
7382#if INET
7383			/* Simple sum16 contiguous buffer test by aligment */
7384			sum = b_sum16(c, len);
7385
7386			/* Something is horribly broken; stop now */
7387			if (sum != sumtbl[n].sum) {
7388				panic("%s: broken b_sum16 for len=%d align=%d "
7389				    "sum=0x%04x [expected=0x%04x]\n", __func__,
7390				    len, i, sum, sumtbl[n].sum);
7391				/* NOTREACHED */
7392			}
7393#endif /* INET */
7394		}
7395	}
7396	m_freem(m);
7397
7398	printf("DLIL: SUM16 self-tests PASSED\n");
7399}
7400#endif /* DEBUG */
7401
7402#define	CASE_STRINGIFY(x) case x: return #x
7403
7404__private_extern__ const char *
7405dlil_kev_dl_code_str(u_int32_t event_code)
7406{
7407	switch (event_code) {
7408	CASE_STRINGIFY(KEV_DL_SIFFLAGS);
7409	CASE_STRINGIFY(KEV_DL_SIFMETRICS);
7410	CASE_STRINGIFY(KEV_DL_SIFMTU);
7411	CASE_STRINGIFY(KEV_DL_SIFPHYS);
7412	CASE_STRINGIFY(KEV_DL_SIFMEDIA);
7413	CASE_STRINGIFY(KEV_DL_SIFGENERIC);
7414	CASE_STRINGIFY(KEV_DL_ADDMULTI);
7415	CASE_STRINGIFY(KEV_DL_DELMULTI);
7416	CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
7417	CASE_STRINGIFY(KEV_DL_IF_DETACHING);
7418	CASE_STRINGIFY(KEV_DL_IF_DETACHED);
7419	CASE_STRINGIFY(KEV_DL_LINK_OFF);
7420	CASE_STRINGIFY(KEV_DL_LINK_ON);
7421	CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
7422	CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
7423	CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
7424	CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
7425	CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
7426	CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
7427	CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
7428	CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
7429	CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
7430	CASE_STRINGIFY(KEV_DL_MASTER_ELECTED);
7431	CASE_STRINGIFY(KEV_DL_ISSUES);
7432	CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
7433	default:
7434		break;
7435	}
7436	return ("");
7437}
7438