bpf.c revision 178208
1/*-
2 * Copyright (c) 1990, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from the Stanford/CMU enet packet filter,
6 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
7 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
8 * Berkeley Laboratory.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/net/bpf.c 178208 2008-04-15 00:50:01Z jkim $");
39
40#include "opt_bpf.h"
41#include "opt_mac.h"
42#include "opt_netgraph.h"
43
44#include <sys/types.h>
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/conf.h>
48#include <sys/fcntl.h>
49#include <sys/malloc.h>
50#include <sys/mbuf.h>
51#include <sys/time.h>
52#include <sys/priv.h>
53#include <sys/proc.h>
54#include <sys/signalvar.h>
55#include <sys/filio.h>
56#include <sys/sockio.h>
57#include <sys/ttycom.h>
58#include <sys/uio.h>
59
60#include <sys/event.h>
61#include <sys/file.h>
62#include <sys/poll.h>
63#include <sys/proc.h>
64
65#include <sys/socket.h>
66
67#include <net/if.h>
68#include <net/bpf.h>
69#include <net/bpf_buffer.h>
70#ifdef BPF_JITTER
71#include <net/bpf_jitter.h>
72#endif
73#include <net/bpf_zerocopy.h>
74#include <net/bpfdesc.h>
75
76#include <netinet/in.h>
77#include <netinet/if_ether.h>
78#include <sys/kernel.h>
79#include <sys/sysctl.h>
80
81#include <net80211/ieee80211_freebsd.h>
82
83#include <security/mac/mac_framework.h>
84
85MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
86
87#if defined(DEV_BPF) || defined(NETGRAPH_BPF)
88
89#define PRINET  26			/* interruptible */
90
91/*
92 * bpf_iflist is a list of BPF interface structures, each corresponding to a
93 * specific DLT.  The same network interface might have several BPF interface
94 * structures registered by different layers in the stack (i.e., 802.11
95 * frames, ethernet frames, etc).
96 */
97static LIST_HEAD(, bpf_if)	bpf_iflist;
98static struct mtx	bpf_mtx;		/* bpf global lock */
99static int		bpf_bpfd_cnt;
100
101static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
102static void	bpf_detachd(struct bpf_d *);
103static void	bpf_freed(struct bpf_d *);
104static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
105		    struct sockaddr *, int *, struct bpf_insn *);
106static int	bpf_setif(struct bpf_d *, struct ifreq *);
107static void	bpf_timed_out(void *);
108static __inline void
109		bpf_wakeup(struct bpf_d *);
110static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
111		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
112		    struct timeval *);
113static void	reset_d(struct bpf_d *);
114static int	 bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
115static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
116static int	bpf_setdlt(struct bpf_d *, u_int);
117static void	filt_bpfdetach(struct knote *);
118static int	filt_bpfread(struct knote *, long);
119static void	bpf_drvinit(void *);
120static void	bpf_clone(void *, struct ucred *, char *, int, struct cdev **);
121static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
122
123SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
124static int bpf_maxinsns = BPF_MAXINSNS;
125SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
126    &bpf_maxinsns, 0, "Maximum bpf program instructions");
127static int bpf_zerocopy_enable = 0;
128SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
129    &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
130SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW,
131    bpf_stats_sysctl, "bpf statistics portal");
132
133static	d_open_t	bpfopen;
134static	d_close_t	bpfclose;
135static	d_read_t	bpfread;
136static	d_write_t	bpfwrite;
137static	d_ioctl_t	bpfioctl;
138static	d_poll_t	bpfpoll;
139static	d_kqfilter_t	bpfkqfilter;
140
141static struct cdevsw bpf_cdevsw = {
142	.d_version =	D_VERSION,
143	.d_open =	bpfopen,
144	.d_close =	bpfclose,
145	.d_read =	bpfread,
146	.d_write =	bpfwrite,
147	.d_ioctl =	bpfioctl,
148	.d_poll =	bpfpoll,
149	.d_name =	"bpf",
150	.d_kqfilter =	bpfkqfilter,
151};
152
153static struct filterops bpfread_filtops =
154	{ 1, NULL, filt_bpfdetach, filt_bpfread };
155
/*
 * Wrapper functions for various buffering methods.  If the set of buffer
 * modes expands, we will probably want to introduce a switch data structure
 * similar to protosw, etc.
 */
161static void
162bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
163    u_int len)
164{
165
166	BPFD_LOCK_ASSERT(d);
167
168	switch (d->bd_bufmode) {
169	case BPF_BUFMODE_BUFFER:
170		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
171
172	case BPF_BUFMODE_ZBUF:
173		d->bd_zcopy++;
174		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
175
176	default:
177		panic("bpf_buf_append_bytes");
178	}
179}
180
181static void
182bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
183    u_int len)
184{
185
186	BPFD_LOCK_ASSERT(d);
187
188	switch (d->bd_bufmode) {
189	case BPF_BUFMODE_BUFFER:
190		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
191
192	case BPF_BUFMODE_ZBUF:
193		d->bd_zcopy++;
194		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
195
196	default:
197		panic("bpf_buf_append_mbuf");
198	}
199}
200
201/*
202 * If the buffer mechanism has a way to decide that a held buffer can be made
203 * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
204 * returned if the buffer can be discarded, (0) is returned if it cannot.
205 */
206static int
207bpf_canfreebuf(struct bpf_d *d)
208{
209
210	BPFD_LOCK_ASSERT(d);
211
212	switch (d->bd_bufmode) {
213	case BPF_BUFMODE_ZBUF:
214		return (bpf_zerocopy_canfreebuf(d));
215	}
216	return (0);
217}
218
219/*
220 * Allow the buffer model to indicate that the current store buffer is
221 * immutable, regardless of the appearance of space.  Return (1) if the
222 * buffer is writable, and (0) if not.
223 */
224static int
225bpf_canwritebuf(struct bpf_d *d)
226{
227
228	BPFD_LOCK_ASSERT(d);
229
230	switch (d->bd_bufmode) {
231	case BPF_BUFMODE_ZBUF:
232		return (bpf_zerocopy_canwritebuf(d));
233	}
234	return (1);
235}
236
237/*
238 * Notify buffer model that an attempt to write to the store buffer has
239 * resulted in a dropped packet, in which case the buffer may be considered
240 * full.
241 */
242static void
243bpf_buffull(struct bpf_d *d)
244{
245
246	BPFD_LOCK_ASSERT(d);
247
248	switch (d->bd_bufmode) {
249	case BPF_BUFMODE_ZBUF:
250		bpf_zerocopy_buffull(d);
251		break;
252	}
253}
254
255/*
256 * Notify the buffer model that a buffer has moved into the hold position.
257 */
258void
259bpf_bufheld(struct bpf_d *d)
260{
261
262	BPFD_LOCK_ASSERT(d);
263
264	switch (d->bd_bufmode) {
265	case BPF_BUFMODE_ZBUF:
266		bpf_zerocopy_bufheld(d);
267		break;
268	}
269}
270
271static void
272bpf_free(struct bpf_d *d)
273{
274
275	switch (d->bd_bufmode) {
276	case BPF_BUFMODE_BUFFER:
277		return (bpf_buffer_free(d));
278
279	case BPF_BUFMODE_ZBUF:
280		return (bpf_zerocopy_free(d));
281
282	default:
283		panic("bpf_buf_free");
284	}
285}
286
287static int
288bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
289{
290
291	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
292		return (EOPNOTSUPP);
293	return (bpf_buffer_uiomove(d, buf, len, uio));
294}
295
296static int
297bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
298{
299
300	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
301		return (EOPNOTSUPP);
302	return (bpf_buffer_ioctl_sblen(d, i));
303}
304
305static int
306bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
307{
308
309	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
310		return (EOPNOTSUPP);
311	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
312}
313
314static int
315bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
316{
317
318	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
319		return (EOPNOTSUPP);
320	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
321}
322
323static int
324bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
325{
326
327	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
328		return (EOPNOTSUPP);
329	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
330}
331
332/*
333 * General BPF functions.
334 */
335static int
336bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
337    struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
338{
339	const struct ieee80211_bpf_params *p;
340	struct ether_header *eh;
341	struct mbuf *m;
342	int error;
343	int len;
344	int hlen;
345	int slen;
346
347	/*
348	 * Build a sockaddr based on the data link layer type.
349	 * We do this at this level because the ethernet header
350	 * is copied directly into the data field of the sockaddr.
351	 * In the case of SLIP, there is no header and the packet
352	 * is forwarded as is.
353	 * Also, we are careful to leave room at the front of the mbuf
354	 * for the link level header.
355	 */
356	switch (linktype) {
357
358	case DLT_SLIP:
359		sockp->sa_family = AF_INET;
360		hlen = 0;
361		break;
362
363	case DLT_EN10MB:
364		sockp->sa_family = AF_UNSPEC;
365		/* XXX Would MAXLINKHDR be better? */
366		hlen = ETHER_HDR_LEN;
367		break;
368
369	case DLT_FDDI:
370		sockp->sa_family = AF_IMPLINK;
371		hlen = 0;
372		break;
373
374	case DLT_RAW:
375		sockp->sa_family = AF_UNSPEC;
376		hlen = 0;
377		break;
378
379	case DLT_NULL:
380		/*
381		 * null interface types require a 4 byte pseudo header which
382		 * corresponds to the address family of the packet.
383		 */
384		sockp->sa_family = AF_UNSPEC;
385		hlen = 4;
386		break;
387
388	case DLT_ATM_RFC1483:
389		/*
390		 * en atm driver requires 4-byte atm pseudo header.
391		 * though it isn't standard, vpi:vci needs to be
392		 * specified anyway.
393		 */
394		sockp->sa_family = AF_UNSPEC;
395		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
396		break;
397
398	case DLT_PPP:
399		sockp->sa_family = AF_UNSPEC;
400		hlen = 4;	/* This should match PPP_HDRLEN */
401		break;
402
403	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
404		sockp->sa_family = AF_IEEE80211;
405		hlen = 0;
406		break;
407
408	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
409		sockp->sa_family = AF_IEEE80211;
410		sockp->sa_len = 12;	/* XXX != 0 */
411		hlen = sizeof(struct ieee80211_bpf_params);
412		break;
413
414	default:
415		return (EIO);
416	}
417
418	len = uio->uio_resid;
419
420	if (len - hlen > ifp->if_mtu)
421		return (EMSGSIZE);
422
423	if ((unsigned)len > MCLBYTES)
424		return (EIO);
425
426	if (len > MHLEN)
427		m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
428	else
429		MGETHDR(m, M_WAIT, MT_DATA);
430	m->m_pkthdr.len = m->m_len = len;
431	m->m_pkthdr.rcvif = NULL;
432	*mp = m;
433
434	if (m->m_len < hlen) {
435		error = EPERM;
436		goto bad;
437	}
438
439	error = uiomove(mtod(m, u_char *), len, uio);
440	if (error)
441		goto bad;
442
443	slen = bpf_filter(wfilter, mtod(m, u_char *), len, len);
444	if (slen == 0) {
445		error = EPERM;
446		goto bad;
447	}
448
449	/* Check for multicast destination */
450	switch (linktype) {
451	case DLT_EN10MB:
452		eh = mtod(m, struct ether_header *);
453		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
454			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
455			    ETHER_ADDR_LEN) == 0)
456				m->m_flags |= M_BCAST;
457			else
458				m->m_flags |= M_MCAST;
459		}
460		break;
461	}
462
463	/*
464	 * Make room for link header, and copy it to sockaddr
465	 */
466	if (hlen != 0) {
467		if (sockp->sa_family == AF_IEEE80211) {
468			/*
469			 * Collect true length from the parameter header
470			 * NB: sockp is known to be zero'd so if we do a
471			 *     short copy unspecified parameters will be
472			 *     zero.
473			 * NB: packet may not be aligned after stripping
474			 *     bpf params
475			 * XXX check ibp_vers
476			 */
477			p = mtod(m, const struct ieee80211_bpf_params *);
478			hlen = p->ibp_len;
479			if (hlen > sizeof(sockp->sa_data)) {
480				error = EINVAL;
481				goto bad;
482			}
483		}
484		bcopy(m->m_data, sockp->sa_data, hlen);
485	}
486	*hdrlen = hlen;
487
488	return (0);
489bad:
490	m_freem(m);
491	return (error);
492}
493
494/*
495 * Attach file to the bpf interface, i.e. make d listen on bp.
496 */
497static void
498bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
499{
500	/*
501	 * Point d at bp, and add d to the interface's list of listeners.
502	 * Finally, point the driver's bpf cookie at the interface so
503	 * it will divert packets to bpf.
504	 */
505	BPFIF_LOCK(bp);
506	d->bd_bif = bp;
507	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
508
509	bpf_bpfd_cnt++;
510	BPFIF_UNLOCK(bp);
511}
512
513/*
514 * Detach a file from its interface.
515 */
516static void
517bpf_detachd(struct bpf_d *d)
518{
519	int error;
520	struct bpf_if *bp;
521	struct ifnet *ifp;
522
523	bp = d->bd_bif;
524	BPFIF_LOCK(bp);
525	BPFD_LOCK(d);
526	ifp = d->bd_bif->bif_ifp;
527
528	/*
529	 * Remove d from the interface's descriptor list.
530	 */
531	LIST_REMOVE(d, bd_next);
532
533	bpf_bpfd_cnt--;
534	d->bd_bif = NULL;
535	BPFD_UNLOCK(d);
536	BPFIF_UNLOCK(bp);
537
538	/*
539	 * Check if this descriptor had requested promiscuous mode.
540	 * If so, turn it off.
541	 */
542	if (d->bd_promisc) {
543		d->bd_promisc = 0;
544		error = ifpromisc(ifp, 0);
545		if (error != 0 && error != ENXIO) {
546			/*
547			 * ENXIO can happen if a pccard is unplugged
548			 * Something is really wrong if we were able to put
549			 * the driver into promiscuous mode, but can't
550			 * take it out.
551			 */
552			if_printf(bp->bif_ifp,
553				"bpf_detach: ifpromisc failed (%d)\n", error);
554		}
555	}
556}
557
558/*
559 * Open ethernet device.  Returns ENXIO for illegal minor device number,
560 * EBUSY if file is open by another process.
561 */
562/* ARGSUSED */
563static	int
564bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
565{
566	struct bpf_d *d;
567
568	mtx_lock(&bpf_mtx);
569	d = dev->si_drv1;
570	/*
571	 * Each minor can be opened by only one process.  If the requested
572	 * minor is in use, return EBUSY.
573	 */
574	if (d != NULL) {
575		mtx_unlock(&bpf_mtx);
576		return (EBUSY);
577	}
578	dev->si_drv1 = (struct bpf_d *)~0;	/* mark device in use */
579	mtx_unlock(&bpf_mtx);
580
581	if ((dev->si_flags & SI_NAMED) == 0)
582		make_dev(&bpf_cdevsw, minor(dev), UID_ROOT, GID_WHEEL, 0600,
583		    "bpf%d", dev2unit(dev));
584	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
585	dev->si_drv1 = d;
586
587	/*
588	 * For historical reasons, perform a one-time initialization call to
589	 * the buffer routines, even though we're not yet committed to a
590	 * particular buffer method.
591	 */
592	bpf_buffer_init(d);
593	d->bd_bufmode = BPF_BUFMODE_BUFFER;
594	d->bd_sig = SIGIO;
595	d->bd_direction = BPF_D_INOUT;
596	d->bd_pid = td->td_proc->p_pid;
597#ifdef MAC
598	mac_bpfdesc_init(d);
599	mac_bpfdesc_create(td->td_ucred, d);
600#endif
601	mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF);
602	callout_init(&d->bd_callout, CALLOUT_MPSAFE);
603	knlist_init(&d->bd_sel.si_note, &d->bd_mtx, NULL, NULL, NULL);
604
605	return (0);
606}
607
608/*
609 * Close the descriptor by detaching it from its interface,
610 * deallocating its buffers, and marking it free.
611 */
612/* ARGSUSED */
613static	int
614bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td)
615{
616	struct bpf_d *d = dev->si_drv1;
617
618	BPFD_LOCK(d);
619	if (d->bd_state == BPF_WAITING)
620		callout_stop(&d->bd_callout);
621	d->bd_state = BPF_IDLE;
622	BPFD_UNLOCK(d);
623	funsetown(&d->bd_sigio);
624	mtx_lock(&bpf_mtx);
625	if (d->bd_bif)
626		bpf_detachd(d);
627	mtx_unlock(&bpf_mtx);
628	selwakeuppri(&d->bd_sel, PRINET);
629#ifdef MAC
630	mac_bpfdesc_destroy(d);
631#endif /* MAC */
632	knlist_destroy(&d->bd_sel.si_note);
633	bpf_freed(d);
634	dev->si_drv1 = NULL;
635	free(d, M_BPF);
636
637	return (0);
638}
639
640/*
641 *  bpfread - read next chunk of packets from buffers
642 */
643static	int
644bpfread(struct cdev *dev, struct uio *uio, int ioflag)
645{
646	struct bpf_d *d = dev->si_drv1;
647	int timed_out;
648	int error;
649
650	/*
651	 * Restrict application to use a buffer the same size as
652	 * as kernel buffers.
653	 */
654	if (uio->uio_resid != d->bd_bufsize)
655		return (EINVAL);
656
657	BPFD_LOCK(d);
658	d->bd_pid = curthread->td_proc->p_pid;
659	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
660		BPFD_UNLOCK(d);
661		return (EOPNOTSUPP);
662	}
663	if (d->bd_state == BPF_WAITING)
664		callout_stop(&d->bd_callout);
665	timed_out = (d->bd_state == BPF_TIMED_OUT);
666	d->bd_state = BPF_IDLE;
667	/*
668	 * If the hold buffer is empty, then do a timed sleep, which
669	 * ends when the timeout expires or when enough packets
670	 * have arrived to fill the store buffer.
671	 */
672	while (d->bd_hbuf == NULL) {
673		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
674			/*
675			 * A packet(s) either arrived since the previous
676			 * read or arrived while we were asleep.
677			 * Rotate the buffers and return what's here.
678			 */
679			ROTATE_BUFFERS(d);
680			break;
681		}
682
683		/*
684		 * No data is available, check to see if the bpf device
685		 * is still pointed at a real interface.  If not, return
686		 * ENXIO so that the userland process knows to rebind
687		 * it before using it again.
688		 */
689		if (d->bd_bif == NULL) {
690			BPFD_UNLOCK(d);
691			return (ENXIO);
692		}
693
694		if (ioflag & O_NONBLOCK) {
695			BPFD_UNLOCK(d);
696			return (EWOULDBLOCK);
697		}
698		error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
699		     "bpf", d->bd_rtout);
700		if (error == EINTR || error == ERESTART) {
701			BPFD_UNLOCK(d);
702			return (error);
703		}
704		if (error == EWOULDBLOCK) {
705			/*
706			 * On a timeout, return what's in the buffer,
707			 * which may be nothing.  If there is something
708			 * in the store buffer, we can rotate the buffers.
709			 */
710			if (d->bd_hbuf)
711				/*
712				 * We filled up the buffer in between
713				 * getting the timeout and arriving
714				 * here, so we don't need to rotate.
715				 */
716				break;
717
718			if (d->bd_slen == 0) {
719				BPFD_UNLOCK(d);
720				return (0);
721			}
722			ROTATE_BUFFERS(d);
723			break;
724		}
725	}
726	/*
727	 * At this point, we know we have something in the hold slot.
728	 */
729	BPFD_UNLOCK(d);
730
731	/*
732	 * Move data from hold buffer into user space.
733	 * We know the entire buffer is transferred since
734	 * we checked above that the read buffer is bpf_bufsize bytes.
735	 *
736	 * XXXRW: More synchronization needed here: what if a second thread
737	 * issues a read on the same fd at the same time?  Don't want this
738	 * getting invalidated.
739	 */
740	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
741
742	BPFD_LOCK(d);
743	d->bd_fbuf = d->bd_hbuf;
744	d->bd_hbuf = NULL;
745	d->bd_hlen = 0;
746	BPFD_UNLOCK(d);
747
748	return (error);
749}
750
751/*
752 * If there are processes sleeping on this descriptor, wake them up.
753 */
754static __inline void
755bpf_wakeup(struct bpf_d *d)
756{
757
758	BPFD_LOCK_ASSERT(d);
759	if (d->bd_state == BPF_WAITING) {
760		callout_stop(&d->bd_callout);
761		d->bd_state = BPF_IDLE;
762	}
763	wakeup(d);
764	if (d->bd_async && d->bd_sig && d->bd_sigio)
765		pgsigio(&d->bd_sigio, d->bd_sig, 0);
766
767	selwakeuppri(&d->bd_sel, PRINET);
768	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
769}
770
771static void
772bpf_timed_out(void *arg)
773{
774	struct bpf_d *d = (struct bpf_d *)arg;
775
776	BPFD_LOCK(d);
777	if (d->bd_state == BPF_WAITING) {
778		d->bd_state = BPF_TIMED_OUT;
779		if (d->bd_slen != 0)
780			bpf_wakeup(d);
781	}
782	BPFD_UNLOCK(d);
783}
784
785static int
786bpf_ready(struct bpf_d *d)
787{
788
789	BPFD_LOCK_ASSERT(d);
790
791	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
792		return (1);
793	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
794	    d->bd_slen != 0)
795		return (1);
796	return (0);
797}
798
799static int
800bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
801{
802	struct bpf_d *d = dev->si_drv1;
803	struct ifnet *ifp;
804	struct mbuf *m, *mc;
805	struct sockaddr dst;
806	int error, hlen;
807
808	d->bd_pid = curthread->td_proc->p_pid;
809	d->bd_wcount++;
810	if (d->bd_bif == NULL) {
811		d->bd_wdcount++;
812		return (ENXIO);
813	}
814
815	ifp = d->bd_bif->bif_ifp;
816
817	if ((ifp->if_flags & IFF_UP) == 0) {
818		d->bd_wdcount++;
819		return (ENETDOWN);
820	}
821
822	if (uio->uio_resid == 0) {
823		d->bd_wdcount++;
824		return (0);
825	}
826
827	bzero(&dst, sizeof(dst));
828	m = NULL;
829	hlen = 0;
830	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
831	    &m, &dst, &hlen, d->bd_wfilter);
832	if (error) {
833		d->bd_wdcount++;
834		return (error);
835	}
836	d->bd_wfcount++;
837	if (d->bd_hdrcmplt)
838		dst.sa_family = pseudo_AF_HDRCMPLT;
839
840	if (d->bd_feedback) {
841		mc = m_dup(m, M_DONTWAIT);
842		if (mc != NULL)
843			mc->m_pkthdr.rcvif = ifp;
844	} else
845		mc = NULL;
846
847	m->m_pkthdr.len -= hlen;
848	m->m_len -= hlen;
849	m->m_data += hlen;	/* XXX */
850
851#ifdef MAC
852	BPFD_LOCK(d);
853	mac_bpfdesc_create_mbuf(d, m);
854	if (mc != NULL)
855		mac_bpfdesc_create_mbuf(d, mc);
856	BPFD_UNLOCK(d);
857#endif
858
859	error = (*ifp->if_output)(ifp, m, &dst, NULL);
860	if (error)
861		d->bd_wdcount++;
862
863	if (mc != NULL) {
864		if (error == 0)
865			(*ifp->if_input)(ifp, mc);
866		else
867			m_freem(mc);
868	}
869
870	return (error);
871}
872
873/*
874 * Reset a descriptor by flushing its packet buffer and clearing the
875 * receive and drop counts.
876 */
877static void
878reset_d(struct bpf_d *d)
879{
880
881	mtx_assert(&d->bd_mtx, MA_OWNED);
882	if (d->bd_hbuf) {
883		/* Free the hold buffer. */
884		d->bd_fbuf = d->bd_hbuf;
885		d->bd_hbuf = NULL;
886	}
887	d->bd_slen = 0;
888	d->bd_hlen = 0;
889	d->bd_rcount = 0;
890	d->bd_dcount = 0;
891	d->bd_fcount = 0;
892	d->bd_wcount = 0;
893	d->bd_wfcount = 0;
894	d->bd_wdcount = 0;
895	d->bd_zcopy = 0;
896}
897
898/*
899 *  FIONREAD		Check for read packet available.
900 *  SIOCGIFADDR		Get interface address - convenient hook to driver.
901 *  BIOCGBLEN		Get buffer len [for read()].
902 *  BIOCSETF		Set ethernet read filter.
903 *  BIOCSETWF		Set ethernet write filter.
904 *  BIOCFLUSH		Flush read packet buffer.
905 *  BIOCPROMISC		Put interface into promiscuous mode.
906 *  BIOCGDLT		Get link layer type.
907 *  BIOCGETIF		Get interface name.
908 *  BIOCSETIF		Set interface.
909 *  BIOCSRTIMEOUT	Set read timeout.
910 *  BIOCGRTIMEOUT	Get read timeout.
911 *  BIOCGSTATS		Get packet stats.
912 *  BIOCIMMEDIATE	Set immediate mode.
913 *  BIOCVERSION		Get filter language version.
914 *  BIOCGHDRCMPLT	Get "header already complete" flag
915 *  BIOCSHDRCMPLT	Set "header already complete" flag
916 *  BIOCGDIRECTION	Get packet direction flag
917 *  BIOCSDIRECTION	Set packet direction flag
918 *  BIOCLOCK		Set "locked" flag
919 *  BIOCFEEDBACK	Set packet feedback mode.
920 *  BIOCSETZBUF		Set current zero-copy buffer locations.
921 *  BIOCGETZMAX		Get maximum zero-copy buffer size.
922 *  BIOCROTZBUF		Force rotation of zero-copy buffer
923 *  BIOCSETBUFMODE	Set buffer mode.
924 *  BIOCGETBUFMODE	Get current buffer mode.
925 */
926/* ARGSUSED */
927static	int
928bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
929    struct thread *td)
930{
931	struct bpf_d *d = dev->si_drv1;
932	int error = 0;
933
934	/*
935	 * Refresh PID associated with this descriptor.
936	 */
937	BPFD_LOCK(d);
938	d->bd_pid = td->td_proc->p_pid;
939	if (d->bd_state == BPF_WAITING)
940		callout_stop(&d->bd_callout);
941	d->bd_state = BPF_IDLE;
942	BPFD_UNLOCK(d);
943
944	if (d->bd_locked == 1) {
945		switch (cmd) {
946		case BIOCGBLEN:
947		case BIOCFLUSH:
948		case BIOCGDLT:
949		case BIOCGDLTLIST:
950		case BIOCGETIF:
951		case BIOCGRTIMEOUT:
952		case BIOCGSTATS:
953		case BIOCVERSION:
954		case BIOCGRSIG:
955		case BIOCGHDRCMPLT:
956		case BIOCFEEDBACK:
957		case FIONREAD:
958		case BIOCLOCK:
959		case BIOCSRTIMEOUT:
960		case BIOCIMMEDIATE:
961		case TIOCGPGRP:
962		case BIOCROTZBUF:
963			break;
964		default:
965			return (EPERM);
966		}
967	}
968	switch (cmd) {
969
970	default:
971		error = EINVAL;
972		break;
973
974	/*
975	 * Check for read packet available.
976	 */
977	case FIONREAD:
978		{
979			int n;
980
981			BPFD_LOCK(d);
982			n = d->bd_slen;
983			if (d->bd_hbuf)
984				n += d->bd_hlen;
985			BPFD_UNLOCK(d);
986
987			*(int *)addr = n;
988			break;
989		}
990
991	case SIOCGIFADDR:
992		{
993			struct ifnet *ifp;
994
995			if (d->bd_bif == NULL)
996				error = EINVAL;
997			else {
998				ifp = d->bd_bif->bif_ifp;
999				error = (*ifp->if_ioctl)(ifp, cmd, addr);
1000			}
1001			break;
1002		}
1003
1004	/*
1005	 * Get buffer len [for read()].
1006	 */
1007	case BIOCGBLEN:
1008		*(u_int *)addr = d->bd_bufsize;
1009		break;
1010
1011	/*
1012	 * Set buffer length.
1013	 */
1014	case BIOCSBLEN:
1015		error = bpf_ioctl_sblen(d, (u_int *)addr);
1016		break;
1017
1018	/*
1019	 * Set link layer read filter.
1020	 */
1021	case BIOCSETF:
1022	case BIOCSETWF:
1023		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1024		break;
1025
1026	/*
1027	 * Flush read packet buffer.
1028	 */
1029	case BIOCFLUSH:
1030		BPFD_LOCK(d);
1031		reset_d(d);
1032		BPFD_UNLOCK(d);
1033		break;
1034
1035	/*
1036	 * Put interface into promiscuous mode.
1037	 */
1038	case BIOCPROMISC:
1039		if (d->bd_bif == NULL) {
1040			/*
1041			 * No interface attached yet.
1042			 */
1043			error = EINVAL;
1044			break;
1045		}
1046		if (d->bd_promisc == 0) {
1047			error = ifpromisc(d->bd_bif->bif_ifp, 1);
1048			if (error == 0)
1049				d->bd_promisc = 1;
1050		}
1051		break;
1052
1053	/*
1054	 * Get current data link type.
1055	 */
1056	case BIOCGDLT:
1057		if (d->bd_bif == NULL)
1058			error = EINVAL;
1059		else
1060			*(u_int *)addr = d->bd_bif->bif_dlt;
1061		break;
1062
1063	/*
1064	 * Get a list of supported data link types.
1065	 */
1066	case BIOCGDLTLIST:
1067		if (d->bd_bif == NULL)
1068			error = EINVAL;
1069		else
1070			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1071		break;
1072
1073	/*
1074	 * Set data link type.
1075	 */
1076	case BIOCSDLT:
1077		if (d->bd_bif == NULL)
1078			error = EINVAL;
1079		else
1080			error = bpf_setdlt(d, *(u_int *)addr);
1081		break;
1082
1083	/*
1084	 * Get interface name.
1085	 */
1086	case BIOCGETIF:
1087		if (d->bd_bif == NULL)
1088			error = EINVAL;
1089		else {
1090			struct ifnet *const ifp = d->bd_bif->bif_ifp;
1091			struct ifreq *const ifr = (struct ifreq *)addr;
1092
1093			strlcpy(ifr->ifr_name, ifp->if_xname,
1094			    sizeof(ifr->ifr_name));
1095		}
1096		break;
1097
1098	/*
1099	 * Set interface.
1100	 */
1101	case BIOCSETIF:
1102		error = bpf_setif(d, (struct ifreq *)addr);
1103		break;
1104
1105	/*
1106	 * Set read timeout.
1107	 */
1108	case BIOCSRTIMEOUT:
1109		{
1110			struct timeval *tv = (struct timeval *)addr;
1111
1112			/*
1113			 * Subtract 1 tick from tvtohz() since this isn't
1114			 * a one-shot timer.
1115			 */
1116			if ((error = itimerfix(tv)) == 0)
1117				d->bd_rtout = tvtohz(tv) - 1;
1118			break;
1119		}
1120
1121	/*
1122	 * Get read timeout.
1123	 */
1124	case BIOCGRTIMEOUT:
1125		{
1126			struct timeval *tv = (struct timeval *)addr;
1127
1128			tv->tv_sec = d->bd_rtout / hz;
1129			tv->tv_usec = (d->bd_rtout % hz) * tick;
1130			break;
1131		}
1132
1133	/*
1134	 * Get packet stats.
1135	 */
1136	case BIOCGSTATS:
1137		{
1138			struct bpf_stat *bs = (struct bpf_stat *)addr;
1139
1140			/* XXXCSJP overflow */
1141			bs->bs_recv = d->bd_rcount;
1142			bs->bs_drop = d->bd_dcount;
1143			break;
1144		}
1145
1146	/*
1147	 * Set immediate mode.
1148	 */
1149	case BIOCIMMEDIATE:
1150		d->bd_immediate = *(u_int *)addr;
1151		break;
1152
1153	case BIOCVERSION:
1154		{
1155			struct bpf_version *bv = (struct bpf_version *)addr;
1156
1157			bv->bv_major = BPF_MAJOR_VERSION;
1158			bv->bv_minor = BPF_MINOR_VERSION;
1159			break;
1160		}
1161
1162	/*
1163	 * Get "header already complete" flag
1164	 */
1165	case BIOCGHDRCMPLT:
1166		*(u_int *)addr = d->bd_hdrcmplt;
1167		break;
1168
1169	/*
1170	 * Set "header already complete" flag
1171	 */
1172	case BIOCSHDRCMPLT:
1173		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1174		break;
1175
1176	/*
1177	 * Get packet direction flag
1178	 */
1179	case BIOCGDIRECTION:
1180		*(u_int *)addr = d->bd_direction;
1181		break;
1182
1183	/*
1184	 * Set packet direction flag
1185	 */
1186	case BIOCSDIRECTION:
1187		{
1188			u_int	direction;
1189
1190			direction = *(u_int *)addr;
1191			switch (direction) {
1192			case BPF_D_IN:
1193			case BPF_D_INOUT:
1194			case BPF_D_OUT:
1195				d->bd_direction = direction;
1196				break;
1197			default:
1198				error = EINVAL;
1199			}
1200		}
1201		break;
1202
1203	case BIOCFEEDBACK:
1204		d->bd_feedback = *(u_int *)addr;
1205		break;
1206
1207	case BIOCLOCK:
1208		d->bd_locked = 1;
1209		break;
1210
1211	case FIONBIO:		/* Non-blocking I/O */
1212		break;
1213
1214	case FIOASYNC:		/* Send signal on receive packets */
1215		d->bd_async = *(int *)addr;
1216		break;
1217
1218	case FIOSETOWN:
1219		error = fsetown(*(int *)addr, &d->bd_sigio);
1220		break;
1221
1222	case FIOGETOWN:
1223		*(int *)addr = fgetown(&d->bd_sigio);
1224		break;
1225
1226	/* This is deprecated, FIOSETOWN should be used instead. */
1227	case TIOCSPGRP:
1228		error = fsetown(-(*(int *)addr), &d->bd_sigio);
1229		break;
1230
1231	/* This is deprecated, FIOGETOWN should be used instead. */
1232	case TIOCGPGRP:
1233		*(int *)addr = -fgetown(&d->bd_sigio);
1234		break;
1235
1236	case BIOCSRSIG:		/* Set receive signal */
1237		{
1238			u_int sig;
1239
1240			sig = *(u_int *)addr;
1241
1242			if (sig >= NSIG)
1243				error = EINVAL;
1244			else
1245				d->bd_sig = sig;
1246			break;
1247		}
1248	case BIOCGRSIG:
1249		*(u_int *)addr = d->bd_sig;
1250		break;
1251
1252	case BIOCGETBUFMODE:
1253		*(u_int *)addr = d->bd_bufmode;
1254		break;
1255
1256	case BIOCSETBUFMODE:
1257		/*
1258		 * Allow the buffering mode to be changed as long as we
1259		 * haven't yet committed to a particular mode.  Our
1260		 * definition of commitment, for now, is whether or not a
1261		 * buffer has been allocated or an interface attached, since
1262		 * that's the point where things get tricky.
1263		 */
1264		switch (*(u_int *)addr) {
1265		case BPF_BUFMODE_BUFFER:
1266			break;
1267
1268		case BPF_BUFMODE_ZBUF:
1269			if (bpf_zerocopy_enable)
1270				break;
1271			/* FALLSTHROUGH */
1272
1273		default:
1274			return (EINVAL);
1275		}
1276
1277		BPFD_LOCK(d);
1278		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1279		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
1280			BPFD_UNLOCK(d);
1281			return (EBUSY);
1282		}
1283		d->bd_bufmode = *(u_int *)addr;
1284		BPFD_UNLOCK(d);
1285		break;
1286
1287	case BIOCGETZMAX:
1288		return (bpf_ioctl_getzmax(td, d, (size_t *)addr));
1289
1290	case BIOCSETZBUF:
1291		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
1292
1293	case BIOCROTZBUF:
1294		return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
1295	}
1296	return (error);
1297}
1298
1299/*
1300 * Set d's packet filter program to fp.  If this file already has a filter,
1301 * free it and replace it.  Returns EINVAL for bogus requests.
1302 */
1303static int
1304bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1305{
1306	struct bpf_insn *fcode, *old;
1307	u_int wfilter, flen, size;
1308#ifdef BPF_JITTER
1309	bpf_jit_filter *ofunc;
1310#endif
1311
1312	if (cmd == BIOCSETWF) {
1313		old = d->bd_wfilter;
1314		wfilter = 1;
1315#ifdef BPF_JITTER
1316		ofunc = NULL;
1317#endif
1318	} else {
1319		wfilter = 0;
1320		old = d->bd_rfilter;
1321#ifdef BPF_JITTER
1322		ofunc = d->bd_bfilter;
1323#endif
1324	}
1325	if (fp->bf_insns == NULL) {
1326		if (fp->bf_len != 0)
1327			return (EINVAL);
1328		BPFD_LOCK(d);
1329		if (wfilter)
1330			d->bd_wfilter = NULL;
1331		else {
1332			d->bd_rfilter = NULL;
1333#ifdef BPF_JITTER
1334			d->bd_bfilter = NULL;
1335#endif
1336		}
1337		reset_d(d);
1338		BPFD_UNLOCK(d);
1339		if (old != NULL)
1340			free((caddr_t)old, M_BPF);
1341#ifdef BPF_JITTER
1342		if (ofunc != NULL)
1343			bpf_destroy_jit_filter(ofunc);
1344#endif
1345		return (0);
1346	}
1347	flen = fp->bf_len;
1348	if (flen > bpf_maxinsns)
1349		return (EINVAL);
1350
1351	size = flen * sizeof(*fp->bf_insns);
1352	fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK);
1353	if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 &&
1354	    bpf_validate(fcode, (int)flen)) {
1355		BPFD_LOCK(d);
1356		if (wfilter)
1357			d->bd_wfilter = fcode;
1358		else {
1359			d->bd_rfilter = fcode;
1360#ifdef BPF_JITTER
1361			d->bd_bfilter = bpf_jitter(fcode, flen);
1362#endif
1363		}
1364		reset_d(d);
1365		BPFD_UNLOCK(d);
1366		if (old != NULL)
1367			free((caddr_t)old, M_BPF);
1368#ifdef BPF_JITTER
1369		if (ofunc != NULL)
1370			bpf_destroy_jit_filter(ofunc);
1371#endif
1372
1373		return (0);
1374	}
1375	free((caddr_t)fcode, M_BPF);
1376	return (EINVAL);
1377}
1378
1379/*
1380 * Detach a file from its current interface (if attached at all) and attach
1381 * to the interface indicated by the name stored in ifr.
1382 * Return an errno or 0.
1383 */
1384static int
1385bpf_setif(struct bpf_d *d, struct ifreq *ifr)
1386{
1387	struct bpf_if *bp;
1388	struct ifnet *theywant;
1389
1390	theywant = ifunit(ifr->ifr_name);
1391	if (theywant == NULL || theywant->if_bpf == NULL)
1392		return (ENXIO);
1393
1394	bp = theywant->if_bpf;
1395
1396	/*
1397	 * Behavior here depends on the buffering model.  If we're using
1398	 * kernel memory buffers, then we can allocate them here.  If we're
1399	 * using zero-copy, then the user process must have registered
1400	 * buffers by the time we get here.  If not, return an error.
1401	 *
1402	 * XXXRW: There are locking issues here with multi-threaded use: what
1403	 * if two threads try to set the interface at once?
1404	 */
1405	switch (d->bd_bufmode) {
1406	case BPF_BUFMODE_BUFFER:
1407		if (d->bd_sbuf == NULL)
1408			bpf_buffer_alloc(d);
1409		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
1410		break;
1411
1412	case BPF_BUFMODE_ZBUF:
1413		if (d->bd_sbuf == NULL)
1414			return (EINVAL);
1415		break;
1416
1417	default:
1418		panic("bpf_setif: bufmode %d", d->bd_bufmode);
1419	}
1420	if (bp != d->bd_bif) {
1421		if (d->bd_bif)
1422			/*
1423			 * Detach if attached to something else.
1424			 */
1425			bpf_detachd(d);
1426
1427		bpf_attachd(d, bp);
1428	}
1429	BPFD_LOCK(d);
1430	reset_d(d);
1431	BPFD_UNLOCK(d);
1432	return (0);
1433}
1434
1435/*
1436 * Support for select() and poll() system calls
1437 *
1438 * Return true iff the specific operation will not block indefinitely.
1439 * Otherwise, return false but make a note that a selwakeup() must be done.
1440 */
1441static int
1442bpfpoll(struct cdev *dev, int events, struct thread *td)
1443{
1444	struct bpf_d *d;
1445	int revents;
1446
1447	d = dev->si_drv1;
1448	if (d->bd_bif == NULL)
1449		return (ENXIO);
1450
1451	/*
1452	 * Refresh PID associated with this descriptor.
1453	 */
1454	revents = events & (POLLOUT | POLLWRNORM);
1455	BPFD_LOCK(d);
1456	d->bd_pid = td->td_proc->p_pid;
1457	if (events & (POLLIN | POLLRDNORM)) {
1458		if (bpf_ready(d))
1459			revents |= events & (POLLIN | POLLRDNORM);
1460		else {
1461			selrecord(td, &d->bd_sel);
1462			/* Start the read timeout if necessary. */
1463			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1464				callout_reset(&d->bd_callout, d->bd_rtout,
1465				    bpf_timed_out, d);
1466				d->bd_state = BPF_WAITING;
1467			}
1468		}
1469	}
1470	BPFD_UNLOCK(d);
1471	return (revents);
1472}
1473
1474/*
1475 * Support for kevent() system call.  Register EVFILT_READ filters and
1476 * reject all others.
1477 */
1478int
1479bpfkqfilter(struct cdev *dev, struct knote *kn)
1480{
1481	struct bpf_d *d = (struct bpf_d *)dev->si_drv1;
1482
1483	if (kn->kn_filter != EVFILT_READ)
1484		return (1);
1485
1486	/*
1487	 * Refresh PID associated with this descriptor.
1488	 */
1489	BPFD_LOCK(d);
1490	d->bd_pid = curthread->td_proc->p_pid;
1491	kn->kn_fop = &bpfread_filtops;
1492	kn->kn_hook = d;
1493	knlist_add(&d->bd_sel.si_note, kn, 1);
1494	BPFD_UNLOCK(d);
1495
1496	return (0);
1497}
1498
1499static void
1500filt_bpfdetach(struct knote *kn)
1501{
1502	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
1503
1504	knlist_remove(&d->bd_sel.si_note, kn, 0);
1505}
1506
1507static int
1508filt_bpfread(struct knote *kn, long hint)
1509{
1510	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
1511	int ready;
1512
1513	BPFD_LOCK_ASSERT(d);
1514	ready = bpf_ready(d);
1515	if (ready) {
1516		kn->kn_data = d->bd_slen;
1517		if (d->bd_hbuf)
1518			kn->kn_data += d->bd_hlen;
1519	}
1520	else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1521		callout_reset(&d->bd_callout, d->bd_rtout,
1522		    bpf_timed_out, d);
1523		d->bd_state = BPF_WAITING;
1524	}
1525
1526	return (ready);
1527}
1528
1529/*
1530 * Incoming linkage from device drivers.  Process the packet pkt, of length
1531 * pktlen, which is stored in a contiguous buffer.  The packet is parsed
1532 * by each process' filter, and if accepted, stashed into the corresponding
1533 * buffer.
1534 */
1535void
1536bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
1537{
1538	struct bpf_d *d;
1539	u_int slen;
1540	int gottime;
1541	struct timeval tv;
1542
1543	gottime = 0;
1544	BPFIF_LOCK(bp);
1545	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1546		BPFD_LOCK(d);
1547		++d->bd_rcount;
1548#ifdef BPF_JITTER
1549		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL)
1550			slen = (*(d->bd_bfilter->func))(pkt, pktlen, pktlen);
1551		else
1552#endif
1553		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
1554		if (slen != 0) {
1555			d->bd_fcount++;
1556			if (!gottime) {
1557				microtime(&tv);
1558				gottime = 1;
1559			}
1560#ifdef MAC
1561			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
1562#endif
1563				catchpacket(d, pkt, pktlen, slen,
1564				    bpf_append_bytes, &tv);
1565		}
1566		BPFD_UNLOCK(d);
1567	}
1568	BPFIF_UNLOCK(bp);
1569}
1570
/*
 * Helpers used by the mbuf tap routines to decide whether a descriptor
 * should skip a packet.  "bd" is the descriptor and "rif" the receive
 * interface pointer taken from the packet header.
 *
 * BPF_CHECK_DIRECTION is true when the descriptor is set to BPF_D_IN and
 * rif is NULL, or set to BPF_D_OUT and rif is not NULL.
 * BPF_CHECK_DUPLICATE is true when feedback is enabled, the descriptor is
 * set to BPF_D_INOUT and rif is NULL.
 *
 * NOTE(review): a NULL rif presumably identifies an outgoing packet --
 * confirm against the drivers that set m_pkthdr.rcvif.
 */
#define	BPF_CHECK_DIRECTION(bd, rif)				\
	    (((bd)->bd_direction == BPF_D_IN && (rif) == NULL) ||	\
	    ((bd)->bd_direction == BPF_D_OUT && (rif) != NULL))
#define	BPF_CHECK_DUPLICATE(bd, rif)				\
	    ((bd)->bd_feedback &&				\
	    (bd)->bd_direction == BPF_D_INOUT &&	(rif) == NULL)
1577
1578/*
1579 * Incoming linkage from device drivers, when packet is in an mbuf chain.
1580 */
1581void
1582bpf_mtap(struct bpf_if *bp, struct mbuf *m)
1583{
1584	struct bpf_d *d;
1585	u_int pktlen, slen;
1586	int gottime;
1587	struct timeval tv;
1588
1589	gottime = 0;
1590
1591	pktlen = m_length(m, NULL);
1592
1593	BPFIF_LOCK(bp);
1594	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1595		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif) ||
1596		    BPF_CHECK_DUPLICATE(d, m->m_pkthdr.rcvif))
1597			continue;
1598		BPFD_LOCK(d);
1599		++d->bd_rcount;
1600#ifdef BPF_JITTER
1601		/* XXX We cannot handle multiple mbufs. */
1602		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL &&
1603		    m->m_next == NULL)
1604			slen = (*(d->bd_bfilter->func))(mtod(m, u_char *),
1605			    pktlen, pktlen);
1606		else
1607#endif
1608		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
1609		if (slen != 0) {
1610			d->bd_fcount++;
1611			if (!gottime) {
1612				microtime(&tv);
1613				gottime = 1;
1614			}
1615#ifdef MAC
1616			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
1617#endif
1618				catchpacket(d, (u_char *)m, pktlen, slen,
1619				    bpf_append_mbuf, &tv);
1620		}
1621		BPFD_UNLOCK(d);
1622	}
1623	BPFIF_UNLOCK(bp);
1624}
1625
1626/*
1627 * Incoming linkage from device drivers, when packet is in
1628 * an mbuf chain and to be prepended by a contiguous header.
1629 */
1630void
1631bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
1632{
1633	struct mbuf mb;
1634	struct bpf_d *d;
1635	u_int pktlen, slen;
1636	int gottime;
1637	struct timeval tv;
1638
1639	gottime = 0;
1640
1641	pktlen = m_length(m, NULL);
1642	/*
1643	 * Craft on-stack mbuf suitable for passing to bpf_filter.
1644	 * Note that we cut corners here; we only setup what's
1645	 * absolutely needed--this mbuf should never go anywhere else.
1646	 */
1647	mb.m_next = m;
1648	mb.m_data = data;
1649	mb.m_len = dlen;
1650	pktlen += dlen;
1651
1652	BPFIF_LOCK(bp);
1653	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1654		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif) ||
1655		    BPF_CHECK_DUPLICATE(d, m->m_pkthdr.rcvif))
1656			continue;
1657		BPFD_LOCK(d);
1658		++d->bd_rcount;
1659		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
1660		if (slen != 0) {
1661			d->bd_fcount++;
1662			if (!gottime) {
1663				microtime(&tv);
1664				gottime = 1;
1665			}
1666#ifdef MAC
1667			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
1668#endif
1669				catchpacket(d, (u_char *)&mb, pktlen, slen,
1670				    bpf_append_mbuf, &tv);
1671		}
1672		BPFD_UNLOCK(d);
1673	}
1674	BPFIF_UNLOCK(bp);
1675}
1676
/* The direction helpers are not needed past the mbuf tap routines. */
#undef	BPF_CHECK_DUPLICATE
#undef	BPF_CHECK_DIRECTION
1679
1680/*
1681 * Move the packet data from interface memory (pkt) into the
1682 * store buffer.  "cpfn" is the routine called to do the actual data
1683 * transfer.  bcopy is passed in to copy contiguous chunks, while
1684 * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
1685 * pkt is really an mbuf.
1686 */
1687static void
1688catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
1689    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
1690    struct timeval *tv)
1691{
1692	struct bpf_hdr hdr;
1693	int totlen, curlen;
1694	int hdrlen = d->bd_bif->bif_hdrlen;
1695	int do_wakeup = 0;
1696
1697	BPFD_LOCK_ASSERT(d);
1698
1699	/*
1700	 * Detect whether user space has released a buffer back to us, and if
1701	 * so, move it from being a hold buffer to a free buffer.  This may
1702	 * not be the best place to do it (for example, we might only want to
1703	 * run this check if we need the space), but for now it's a reliable
1704	 * spot to do it.
1705	 */
1706	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
1707		d->bd_fbuf = d->bd_hbuf;
1708		d->bd_hbuf = NULL;
1709		d->bd_hlen = 0;
1710	}
1711
1712	/*
1713	 * Figure out how many bytes to move.  If the packet is
1714	 * greater or equal to the snapshot length, transfer that
1715	 * much.  Otherwise, transfer the whole packet (unless
1716	 * we hit the buffer size limit).
1717	 */
1718	totlen = hdrlen + min(snaplen, pktlen);
1719	if (totlen > d->bd_bufsize)
1720		totlen = d->bd_bufsize;
1721
1722	/*
1723	 * Round up the end of the previous packet to the next longword.
1724	 *
1725	 * Drop the packet if there's no room and no hope of room
1726	 * If the packet would overflow the storage buffer or the storage
1727	 * buffer is considered immutable by the buffer model, try to rotate
1728	 * the buffer and wakeup pending processes.
1729	 */
1730	curlen = BPF_WORDALIGN(d->bd_slen);
1731	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
1732		if (d->bd_fbuf == NULL) {
1733			/*
1734			 * There's no room in the store buffer, and no
1735			 * prospect of room, so drop the packet.  Notify the
1736			 * buffer model.
1737			 */
1738			bpf_buffull(d);
1739			++d->bd_dcount;
1740			return;
1741		}
1742		ROTATE_BUFFERS(d);
1743		do_wakeup = 1;
1744		curlen = 0;
1745	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
1746		/*
1747		 * Immediate mode is set, or the read timeout has already
1748		 * expired during a select call.  A packet arrived, so the
1749		 * reader should be woken up.
1750		 */
1751		do_wakeup = 1;
1752
1753	/*
1754	 * Append the bpf header.  Note we append the actual header size, but
1755	 * move forward the length of the header plus padding.
1756	 */
1757	bzero(&hdr, sizeof(hdr));
1758	hdr.bh_tstamp = *tv;
1759	hdr.bh_datalen = pktlen;
1760	hdr.bh_hdrlen = hdrlen;
1761	hdr.bh_caplen = totlen - hdrlen;
1762	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
1763
1764	/*
1765	 * Copy the packet data into the store buffer and update its length.
1766	 */
1767	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
1768	d->bd_slen = curlen + totlen;
1769
1770	if (do_wakeup)
1771		bpf_wakeup(d);
1772}
1773
1774/*
1775 * Free buffers currently in use by a descriptor.
1776 * Called on close.
1777 */
1778static void
1779bpf_freed(struct bpf_d *d)
1780{
1781
1782	/*
1783	 * We don't need to lock out interrupts since this descriptor has
1784	 * been detached from its interface and it yet hasn't been marked
1785	 * free.
1786	 */
1787	bpf_free(d);
1788	if (d->bd_rfilter) {
1789		free((caddr_t)d->bd_rfilter, M_BPF);
1790#ifdef BPF_JITTER
1791		bpf_destroy_jit_filter(d->bd_bfilter);
1792#endif
1793	}
1794	if (d->bd_wfilter)
1795		free((caddr_t)d->bd_wfilter, M_BPF);
1796	mtx_destroy(&d->bd_mtx);
1797}
1798
1799/*
1800 * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
1801 * fixed size of the link header (variable length headers not yet supported).
1802 */
1803void
1804bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
1805{
1806
1807	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
1808}
1809
1810/*
1811 * Attach an interface to bpf.  ifp is a pointer to the structure
1812 * defining the interface to be attached, dlt is the link layer type,
1813 * and hdrlen is the fixed size of the link header (variable length
1814 * headers are not yet supporrted).
1815 */
1816void
1817bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
1818{
1819	struct bpf_if *bp;
1820
1821	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
1822	if (bp == NULL)
1823		panic("bpfattach");
1824
1825	LIST_INIT(&bp->bif_dlist);
1826	bp->bif_ifp = ifp;
1827	bp->bif_dlt = dlt;
1828	mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF);
1829	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
1830	*driverp = bp;
1831
1832	mtx_lock(&bpf_mtx);
1833	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
1834	mtx_unlock(&bpf_mtx);
1835
1836	/*
1837	 * Compute the length of the bpf header.  This is not necessarily
1838	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
1839	 * that the network layer header begins on a longword boundary (for
1840	 * performance reasons and to alleviate alignment restrictions).
1841	 */
1842	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
1843
1844	if (bootverbose)
1845		if_printf(ifp, "bpf attached\n");
1846}
1847
1848/*
1849 * Detach bpf from an interface.  This involves detaching each descriptor
1850 * associated with the interface, and leaving bd_bif NULL.  Notify each
1851 * descriptor as it's detached so that any sleepers wake up and get
1852 * ENXIO.
1853 */
1854void
1855bpfdetach(struct ifnet *ifp)
1856{
1857	struct bpf_if	*bp;
1858	struct bpf_d	*d;
1859
1860	/* Locate BPF interface information */
1861	mtx_lock(&bpf_mtx);
1862	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
1863		if (ifp == bp->bif_ifp)
1864			break;
1865	}
1866
1867	/* Interface wasn't attached */
1868	if ((bp == NULL) || (bp->bif_ifp == NULL)) {
1869		mtx_unlock(&bpf_mtx);
1870		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
1871		return;
1872	}
1873
1874	LIST_REMOVE(bp, bif_next);
1875	mtx_unlock(&bpf_mtx);
1876
1877	while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
1878		bpf_detachd(d);
1879		BPFD_LOCK(d);
1880		bpf_wakeup(d);
1881		BPFD_UNLOCK(d);
1882	}
1883
1884	mtx_destroy(&bp->bif_mtx);
1885	free(bp, M_BPF);
1886}
1887
1888/*
1889 * Get a list of available data link type of the interface.
1890 */
1891static int
1892bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
1893{
1894	int n, error;
1895	struct ifnet *ifp;
1896	struct bpf_if *bp;
1897
1898	ifp = d->bd_bif->bif_ifp;
1899	n = 0;
1900	error = 0;
1901	mtx_lock(&bpf_mtx);
1902	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
1903		if (bp->bif_ifp != ifp)
1904			continue;
1905		if (bfl->bfl_list != NULL) {
1906			if (n >= bfl->bfl_len) {
1907				mtx_unlock(&bpf_mtx);
1908				return (ENOMEM);
1909			}
1910			error = copyout(&bp->bif_dlt,
1911			    bfl->bfl_list + n, sizeof(u_int));
1912		}
1913		n++;
1914	}
1915	mtx_unlock(&bpf_mtx);
1916	bfl->bfl_len = n;
1917	return (error);
1918}
1919
1920/*
1921 * Set the data link type of a BPF instance.
1922 */
1923static int
1924bpf_setdlt(struct bpf_d *d, u_int dlt)
1925{
1926	int error, opromisc;
1927	struct ifnet *ifp;
1928	struct bpf_if *bp;
1929
1930	if (d->bd_bif->bif_dlt == dlt)
1931		return (0);
1932	ifp = d->bd_bif->bif_ifp;
1933	mtx_lock(&bpf_mtx);
1934	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
1935		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
1936			break;
1937	}
1938	mtx_unlock(&bpf_mtx);
1939	if (bp != NULL) {
1940		opromisc = d->bd_promisc;
1941		bpf_detachd(d);
1942		bpf_attachd(d, bp);
1943		BPFD_LOCK(d);
1944		reset_d(d);
1945		BPFD_UNLOCK(d);
1946		if (opromisc) {
1947			error = ifpromisc(bp->bif_ifp, 1);
1948			if (error)
1949				if_printf(bp->bif_ifp,
1950					"bpf_setdlt: ifpromisc failed (%d)\n",
1951					error);
1952			else
1953				d->bd_promisc = 1;
1954		}
1955	}
1956	return (bp == NULL ? EINVAL : 0);
1957}
1958
1959static void
1960bpf_clone(void *arg, struct ucred *cred, char *name, int namelen,
1961    struct cdev **dev)
1962{
1963	int u;
1964
1965	if (*dev != NULL)
1966		return;
1967	if (dev_stdclone(name, NULL, "bpf", &u) != 1)
1968		return;
1969	*dev = make_dev(&bpf_cdevsw, unit2minor(u), UID_ROOT, GID_WHEEL, 0600,
1970	    "bpf%d", u);
1971	dev_ref(*dev);
1972	(*dev)->si_flags |= SI_CHEAPCLONE;
1973	return;
1974}
1975
1976static void
1977bpf_drvinit(void *unused)
1978{
1979
1980	mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF);
1981	LIST_INIT(&bpf_iflist);
1982	EVENTHANDLER_REGISTER(dev_clone, bpf_clone, 0, 1000);
1983}
1984
1985static void
1986bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
1987{
1988
1989	bzero(d, sizeof(*d));
1990	BPFD_LOCK_ASSERT(bd);
1991	d->bd_structsize = sizeof(*d);
1992	d->bd_immediate = bd->bd_immediate;
1993	d->bd_promisc = bd->bd_promisc;
1994	d->bd_hdrcmplt = bd->bd_hdrcmplt;
1995	d->bd_direction = bd->bd_direction;
1996	d->bd_feedback = bd->bd_feedback;
1997	d->bd_async = bd->bd_async;
1998	d->bd_rcount = bd->bd_rcount;
1999	d->bd_dcount = bd->bd_dcount;
2000	d->bd_fcount = bd->bd_fcount;
2001	d->bd_sig = bd->bd_sig;
2002	d->bd_slen = bd->bd_slen;
2003	d->bd_hlen = bd->bd_hlen;
2004	d->bd_bufsize = bd->bd_bufsize;
2005	d->bd_pid = bd->bd_pid;
2006	strlcpy(d->bd_ifname,
2007	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
2008	d->bd_locked = bd->bd_locked;
2009	d->bd_wcount = bd->bd_wcount;
2010	d->bd_wdcount = bd->bd_wdcount;
2011	d->bd_wfcount = bd->bd_wfcount;
2012	d->bd_zcopy = bd->bd_zcopy;
2013	d->bd_bufmode = bd->bd_bufmode;
2014}
2015
2016static int
2017bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
2018{
2019	struct xbpf_d *xbdbuf, *xbd;
2020	int index, error;
2021	struct bpf_if *bp;
2022	struct bpf_d *bd;
2023
2024	/*
2025	 * XXX This is not technically correct. It is possible for non
2026	 * privileged users to open bpf devices. It would make sense
2027	 * if the users who opened the devices were able to retrieve
2028	 * the statistics for them, too.
2029	 */
2030	error = priv_check(req->td, PRIV_NET_BPF);
2031	if (error)
2032		return (error);
2033	if (req->oldptr == NULL)
2034		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
2035	if (bpf_bpfd_cnt == 0)
2036		return (SYSCTL_OUT(req, 0, 0));
2037	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
2038	mtx_lock(&bpf_mtx);
2039	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
2040		mtx_unlock(&bpf_mtx);
2041		free(xbdbuf, M_BPF);
2042		return (ENOMEM);
2043	}
2044	index = 0;
2045	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2046		BPFIF_LOCK(bp);
2047		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2048			xbd = &xbdbuf[index++];
2049			BPFD_LOCK(bd);
2050			bpfstats_fill_xbpf(xbd, bd);
2051			BPFD_UNLOCK(bd);
2052		}
2053		BPFIF_UNLOCK(bp);
2054	}
2055	mtx_unlock(&bpf_mtx);
2056	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
2057	free(xbdbuf, M_BPF);
2058	return (error);
2059}
2060
/* Register bpf_drvinit() as a SYSINIT (SI_SUB_DRIVERS, SI_ORDER_MIDDLE). */
SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);
2062
2063#else /* !DEV_BPF && !NETGRAPH_BPF */
2064/*
2065 * NOP stubs to allow bpf-using drivers to load and function.
2066 *
2067 * A 'better' implementation would allow the core bpf functionality
2068 * to be loaded at runtime.
2069 */
2070static struct bpf_if bp_null;
2071
2072void
2073bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2074{
2075}
2076
/* Stub: mbuf taps are ignored when bpf is not compiled in. */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{

	/* Nothing to do. */
}
2081
2082void
2083bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
2084{
2085}
2086
2087void
2088bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2089{
2090
2091	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2092}
2093
2094void
2095bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
2096{
2097
2098	*driverp = &bp_null;
2099}
2100
/* Stub: there is nothing to tear down when bpf is not compiled in. */
void
bpfdetach(struct ifnet *ifp)
{

	/* Nothing to do. */
}
2105
2106u_int
2107bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
2108{
2109	return -1;	/* "no filter" behaviour */
2110}
2111
/*
 * Stub validator: no program is ever considered valid.
 */
int
bpf_validate(const struct bpf_insn *f, int len)
{

	return (0);		/* false */
}
2117
2118#endif /* !DEV_BPF && !NETGRAPH_BPF */
2119