bpf.c revision 178223
1/*-
2 * Copyright (c) 1990, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from the Stanford/CMU enet packet filter,
6 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
7 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
8 * Berkeley Laboratory.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/net/bpf.c 178223 2008-04-15 17:08:24Z jkim $");
39
40#include "opt_bpf.h"
41#include "opt_mac.h"
42#include "opt_netgraph.h"
43
44#include <sys/types.h>
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/conf.h>
48#include <sys/fcntl.h>
49#include <sys/malloc.h>
50#include <sys/mbuf.h>
51#include <sys/time.h>
52#include <sys/priv.h>
53#include <sys/proc.h>
54#include <sys/signalvar.h>
55#include <sys/filio.h>
56#include <sys/sockio.h>
57#include <sys/ttycom.h>
58#include <sys/uio.h>
59
60#include <sys/event.h>
61#include <sys/file.h>
62#include <sys/poll.h>
63#include <sys/proc.h>
64
65#include <sys/socket.h>
66
67#include <net/if.h>
68#include <net/bpf.h>
69#include <net/bpf_buffer.h>
70#ifdef BPF_JITTER
71#include <net/bpf_jitter.h>
72#endif
73#include <net/bpf_zerocopy.h>
74#include <net/bpfdesc.h>
75
76#include <netinet/in.h>
77#include <netinet/if_ether.h>
78#include <sys/kernel.h>
79#include <sys/sysctl.h>
80
81#include <net80211/ieee80211_freebsd.h>
82
83#include <security/mac/mac_framework.h>
84
85MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
86
87#if defined(DEV_BPF) || defined(NETGRAPH_BPF)
88
89#define PRINET  26			/* interruptible */
90
91/*
92 * bpf_iflist is a list of BPF interface structures, each corresponding to a
93 * specific DLT.  The same network interface might have several BPF interface
94 * structures registered by different layers in the stack (i.e., 802.11
95 * frames, ethernet frames, etc).
96 */
97static LIST_HEAD(, bpf_if)	bpf_iflist;
98static struct mtx	bpf_mtx;		/* bpf global lock */
99static int		bpf_bpfd_cnt;
100
101static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
102static void	bpf_detachd(struct bpf_d *);
103static void	bpf_freed(struct bpf_d *);
104static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
105		    struct sockaddr *, int *, struct bpf_insn *);
106static int	bpf_setif(struct bpf_d *, struct ifreq *);
107static void	bpf_timed_out(void *);
108static __inline void
109		bpf_wakeup(struct bpf_d *);
110static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
111		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
112		    struct timeval *);
113static void	reset_d(struct bpf_d *);
114static int	 bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
115static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
116static int	bpf_setdlt(struct bpf_d *, u_int);
117static void	filt_bpfdetach(struct knote *);
118static int	filt_bpfread(struct knote *, long);
119static void	bpf_drvinit(void *);
120static void	bpf_clone(void *, struct ucred *, char *, int, struct cdev **);
121static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
122
123SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
124static int bpf_maxinsns = BPF_MAXINSNS;
125SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
126    &bpf_maxinsns, 0, "Maximum bpf program instructions");
127static int bpf_zerocopy_enable = 0;
128SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
129    &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
130SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW,
131    bpf_stats_sysctl, "bpf statistics portal");
132
133static	d_open_t	bpfopen;
134static	d_close_t	bpfclose;
135static	d_read_t	bpfread;
136static	d_write_t	bpfwrite;
137static	d_ioctl_t	bpfioctl;
138static	d_poll_t	bpfpoll;
139static	d_kqfilter_t	bpfkqfilter;
140
141static struct cdevsw bpf_cdevsw = {
142	.d_version =	D_VERSION,
143	.d_open =	bpfopen,
144	.d_close =	bpfclose,
145	.d_read =	bpfread,
146	.d_write =	bpfwrite,
147	.d_ioctl =	bpfioctl,
148	.d_poll =	bpfpoll,
149	.d_name =	"bpf",
150	.d_kqfilter =	bpfkqfilter,
151};
152
153static struct filterops bpfread_filtops =
154	{ 1, NULL, filt_bpfdetach, filt_bpfread };
155
156/*
157 * Wrapper functions for various buffering methods.  If the set of buffer
158 * modes expands, we will probably want to introduce a switch data structure
159 * similar to protosw, etc.
160 */
161static void
162bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
163    u_int len)
164{
165
166	BPFD_LOCK_ASSERT(d);
167
168	switch (d->bd_bufmode) {
169	case BPF_BUFMODE_BUFFER:
170		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
171
172	case BPF_BUFMODE_ZBUF:
173		d->bd_zcopy++;
174		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
175
176	default:
177		panic("bpf_buf_append_bytes");
178	}
179}
180
181static void
182bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
183    u_int len)
184{
185
186	BPFD_LOCK_ASSERT(d);
187
188	switch (d->bd_bufmode) {
189	case BPF_BUFMODE_BUFFER:
190		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
191
192	case BPF_BUFMODE_ZBUF:
193		d->bd_zcopy++;
194		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
195
196	default:
197		panic("bpf_buf_append_mbuf");
198	}
199}
200
201/*
202 * If the buffer mechanism has a way to decide that a held buffer can be made
203 * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
204 * returned if the buffer can be discarded, (0) is returned if it cannot.
205 */
206static int
207bpf_canfreebuf(struct bpf_d *d)
208{
209
210	BPFD_LOCK_ASSERT(d);
211
212	switch (d->bd_bufmode) {
213	case BPF_BUFMODE_ZBUF:
214		return (bpf_zerocopy_canfreebuf(d));
215	}
216	return (0);
217}
218
219/*
220 * Allow the buffer model to indicate that the current store buffer is
221 * immutable, regardless of the appearance of space.  Return (1) if the
222 * buffer is writable, and (0) if not.
223 */
224static int
225bpf_canwritebuf(struct bpf_d *d)
226{
227
228	BPFD_LOCK_ASSERT(d);
229
230	switch (d->bd_bufmode) {
231	case BPF_BUFMODE_ZBUF:
232		return (bpf_zerocopy_canwritebuf(d));
233	}
234	return (1);
235}
236
237/*
238 * Notify buffer model that an attempt to write to the store buffer has
239 * resulted in a dropped packet, in which case the buffer may be considered
240 * full.
241 */
242static void
243bpf_buffull(struct bpf_d *d)
244{
245
246	BPFD_LOCK_ASSERT(d);
247
248	switch (d->bd_bufmode) {
249	case BPF_BUFMODE_ZBUF:
250		bpf_zerocopy_buffull(d);
251		break;
252	}
253}
254
255/*
256 * Notify the buffer model that a buffer has moved into the hold position.
257 */
258void
259bpf_bufheld(struct bpf_d *d)
260{
261
262	BPFD_LOCK_ASSERT(d);
263
264	switch (d->bd_bufmode) {
265	case BPF_BUFMODE_ZBUF:
266		bpf_zerocopy_bufheld(d);
267		break;
268	}
269}
270
271static void
272bpf_free(struct bpf_d *d)
273{
274
275	switch (d->bd_bufmode) {
276	case BPF_BUFMODE_BUFFER:
277		return (bpf_buffer_free(d));
278
279	case BPF_BUFMODE_ZBUF:
280		return (bpf_zerocopy_free(d));
281
282	default:
283		panic("bpf_buf_free");
284	}
285}
286
287static int
288bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
289{
290
291	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
292		return (EOPNOTSUPP);
293	return (bpf_buffer_uiomove(d, buf, len, uio));
294}
295
296static int
297bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
298{
299
300	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
301		return (EOPNOTSUPP);
302	return (bpf_buffer_ioctl_sblen(d, i));
303}
304
305static int
306bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
307{
308
309	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
310		return (EOPNOTSUPP);
311	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
312}
313
314static int
315bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
316{
317
318	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
319		return (EOPNOTSUPP);
320	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
321}
322
323static int
324bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
325{
326
327	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
328		return (EOPNOTSUPP);
329	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
330}
331
332/*
333 * General BPF functions.
334 */
335static int
336bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
337    struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
338{
339	const struct ieee80211_bpf_params *p;
340	struct ether_header *eh;
341	struct mbuf *m;
342	int error;
343	int len;
344	int hlen;
345	int slen;
346
347	/*
348	 * Build a sockaddr based on the data link layer type.
349	 * We do this at this level because the ethernet header
350	 * is copied directly into the data field of the sockaddr.
351	 * In the case of SLIP, there is no header and the packet
352	 * is forwarded as is.
353	 * Also, we are careful to leave room at the front of the mbuf
354	 * for the link level header.
355	 */
356	switch (linktype) {
357
358	case DLT_SLIP:
359		sockp->sa_family = AF_INET;
360		hlen = 0;
361		break;
362
363	case DLT_EN10MB:
364		sockp->sa_family = AF_UNSPEC;
365		/* XXX Would MAXLINKHDR be better? */
366		hlen = ETHER_HDR_LEN;
367		break;
368
369	case DLT_FDDI:
370		sockp->sa_family = AF_IMPLINK;
371		hlen = 0;
372		break;
373
374	case DLT_RAW:
375		sockp->sa_family = AF_UNSPEC;
376		hlen = 0;
377		break;
378
379	case DLT_NULL:
380		/*
381		 * null interface types require a 4 byte pseudo header which
382		 * corresponds to the address family of the packet.
383		 */
384		sockp->sa_family = AF_UNSPEC;
385		hlen = 4;
386		break;
387
388	case DLT_ATM_RFC1483:
389		/*
390		 * en atm driver requires 4-byte atm pseudo header.
391		 * though it isn't standard, vpi:vci needs to be
392		 * specified anyway.
393		 */
394		sockp->sa_family = AF_UNSPEC;
395		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
396		break;
397
398	case DLT_PPP:
399		sockp->sa_family = AF_UNSPEC;
400		hlen = 4;	/* This should match PPP_HDRLEN */
401		break;
402
403	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
404		sockp->sa_family = AF_IEEE80211;
405		hlen = 0;
406		break;
407
408	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
409		sockp->sa_family = AF_IEEE80211;
410		sockp->sa_len = 12;	/* XXX != 0 */
411		hlen = sizeof(struct ieee80211_bpf_params);
412		break;
413
414	default:
415		return (EIO);
416	}
417
418	len = uio->uio_resid;
419
420	if (len - hlen > ifp->if_mtu)
421		return (EMSGSIZE);
422
423	if ((unsigned)len > MCLBYTES)
424		return (EIO);
425
426	if (len > MHLEN)
427		m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
428	else
429		MGETHDR(m, M_WAIT, MT_DATA);
430	m->m_pkthdr.len = m->m_len = len;
431	m->m_pkthdr.rcvif = NULL;
432	*mp = m;
433
434	if (m->m_len < hlen) {
435		error = EPERM;
436		goto bad;
437	}
438
439	error = uiomove(mtod(m, u_char *), len, uio);
440	if (error)
441		goto bad;
442
443	slen = bpf_filter(wfilter, mtod(m, u_char *), len, len);
444	if (slen == 0) {
445		error = EPERM;
446		goto bad;
447	}
448
449	/* Check for multicast destination */
450	switch (linktype) {
451	case DLT_EN10MB:
452		eh = mtod(m, struct ether_header *);
453		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
454			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
455			    ETHER_ADDR_LEN) == 0)
456				m->m_flags |= M_BCAST;
457			else
458				m->m_flags |= M_MCAST;
459		}
460		break;
461	}
462
463	/*
464	 * Make room for link header, and copy it to sockaddr
465	 */
466	if (hlen != 0) {
467		if (sockp->sa_family == AF_IEEE80211) {
468			/*
469			 * Collect true length from the parameter header
470			 * NB: sockp is known to be zero'd so if we do a
471			 *     short copy unspecified parameters will be
472			 *     zero.
473			 * NB: packet may not be aligned after stripping
474			 *     bpf params
475			 * XXX check ibp_vers
476			 */
477			p = mtod(m, const struct ieee80211_bpf_params *);
478			hlen = p->ibp_len;
479			if (hlen > sizeof(sockp->sa_data)) {
480				error = EINVAL;
481				goto bad;
482			}
483		}
484		bcopy(m->m_data, sockp->sa_data, hlen);
485	}
486	*hdrlen = hlen;
487
488	return (0);
489bad:
490	m_freem(m);
491	return (error);
492}
493
494/*
495 * Attach file to the bpf interface, i.e. make d listen on bp.
496 */
497static void
498bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
499{
500	/*
501	 * Point d at bp, and add d to the interface's list of listeners.
502	 * Finally, point the driver's bpf cookie at the interface so
503	 * it will divert packets to bpf.
504	 */
505	BPFIF_LOCK(bp);
506	d->bd_bif = bp;
507	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
508
509	bpf_bpfd_cnt++;
510	BPFIF_UNLOCK(bp);
511}
512
513/*
 * Detach a file from its interface.
 *
 * With both the interface lock and the descriptor lock held, d is unlinked
 * from the interface's descriptor list, bpf_bpfd_cnt is decremented and
 * d->bd_bif is cleared.  After the locks are dropped, a promiscuous-mode
 * request previously made through this descriptor is withdrawn.
 *
 * d->bd_bif is dereferenced unconditionally, so d must be attached on
 * entry; bpfclose() checks bd_bif before calling.
 */
516static void
517bpf_detachd(struct bpf_d *d)
518{
519	int error;
520	struct bpf_if *bp;
521	struct ifnet *ifp;
522
523	bp = d->bd_bif;
524	BPFIF_LOCK(bp);
525	BPFD_LOCK(d);
	/* Remember the ifnet now; d->bd_bif is cleared below. */
526	ifp = d->bd_bif->bif_ifp;
527
528	/*
529	 * Remove d from the interface's descriptor list.
530	 */
531	LIST_REMOVE(d, bd_next);
532
533	bpf_bpfd_cnt--;
534	d->bd_bif = NULL;
535	BPFD_UNLOCK(d);
536	BPFIF_UNLOCK(bp);
537
538	/*
539	 * Check if this descriptor had requested promiscuous mode.
540	 * If so, turn it off.
541	 */
542	if (d->bd_promisc) {
543		d->bd_promisc = 0;
544		error = ifpromisc(ifp, 0);
545		if (error != 0 && error != ENXIO) {
546			/*
547			 * ENXIO can happen if a pccard is unplugged
548			 * Something is really wrong if we were able to put
549			 * the driver into promiscuous mode, but can't
550			 * take it out.
551			 */
552			if_printf(bp->bif_ifp,
553				"bpf_detach: ifpromisc failed (%d)\n", error);
554		}
555	}
556}
557
558/*
 * Open a bpf device.  Each minor can be open only once: if dev->si_drv1 is
 * already set, EBUSY is returned.  Otherwise a zeroed descriptor is
 * allocated, stored in dev->si_drv1, and initialized with buffer mode
 * BPF_BUFMODE_BUFFER, signal SIGIO, direction BPF_D_INOUT and the opening
 * process's pid.  Returns 0 on success.
 */
562/* ARGSUSED */
563static	int
564bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
565{
566	struct bpf_d *d;
567
568	mtx_lock(&bpf_mtx);
569	d = dev->si_drv1;
570	/*
571	 * Each minor can be opened by only one process.  If the requested
572	 * minor is in use, return EBUSY.
573	 */
574	if (d != NULL) {
575		mtx_unlock(&bpf_mtx);
576		return (EBUSY);
577	}
	/*
	 * The non-NULL placeholder makes a concurrent open see the minor as
	 * busy while the real descriptor is allocated after bpf_mtx is
	 * dropped.
	 */
578	dev->si_drv1 = (struct bpf_d *)~0;	/* mark device in use */
579	mtx_unlock(&bpf_mtx);
580
581	if ((dev->si_flags & SI_NAMED) == 0)
582		make_dev(&bpf_cdevsw, minor(dev), UID_ROOT, GID_WHEEL, 0600,
583		    "bpf%d", dev2unit(dev));
584	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
585	dev->si_drv1 = d;
586
587	/*
588	 * For historical reasons, perform a one-time initialization call to
589	 * the buffer routines, even though we're not yet committed to a
590	 * particular buffer method.
591	 */
592	bpf_buffer_init(d);
593	d->bd_bufmode = BPF_BUFMODE_BUFFER;
594	d->bd_sig = SIGIO;
595	d->bd_direction = BPF_D_INOUT;
596	d->bd_pid = td->td_proc->p_pid;
597#ifdef MAC
598	mac_bpfdesc_init(d);
599	mac_bpfdesc_create(td->td_ucred, d);
600#endif
601	mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF);
602	callout_init(&d->bd_callout, CALLOUT_MPSAFE);
	/* The knote list is protected by the descriptor mutex. */
603	knlist_init(&d->bd_sel.si_note, &d->bd_mtx, NULL, NULL, NULL);
604
605	return (0);
606}
607
608/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 *
 * A pending read timeout is cancelled, signal ownership is dropped, and the
 * descriptor is detached (under bpf_mtx) if it is attached.  dev->si_drv1
 * is reset to NULL so the minor can be opened again.  Always returns 0.
 */
612/* ARGSUSED */
613static	int
614bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td)
615{
616	struct bpf_d *d = dev->si_drv1;
617
618	BPFD_LOCK(d);
619	if (d->bd_state == BPF_WAITING)
620		callout_stop(&d->bd_callout);
621	d->bd_state = BPF_IDLE;
622	BPFD_UNLOCK(d);
623	funsetown(&d->bd_sigio);
624	mtx_lock(&bpf_mtx);
625	if (d->bd_bif)
626		bpf_detachd(d);
627	mtx_unlock(&bpf_mtx);
628	selwakeuppri(&d->bd_sel, PRINET);
629#ifdef MAC
630	mac_bpfdesc_destroy(d);
631#endif /* MAC */
632	knlist_destroy(&d->bd_sel.si_note);
633	bpf_freed(d);
634	dev->si_drv1 = NULL;
635	free(d, M_BPF);
636
637	return (0);
638}
639
640/*
 *  bpfread - read next chunk of packets from buffers
 *
 * Returns:
 *   EINVAL       the caller's buffer is not exactly d->bd_bufsize bytes;
 *   EOPNOTSUPP   the descriptor is not in BPF_BUFMODE_BUFFER mode;
 *   ENXIO        no data is available and no interface is attached;
 *   EWOULDBLOCK  no data is available and O_NONBLOCK is set;
 *   EINTR/ERESTART  the sleep was interrupted;
 *   0            the sleep timed out with nothing in the store buffer;
 *   otherwise the result of bpf_uiomove() on the hold buffer.
 */
643static	int
644bpfread(struct cdev *dev, struct uio *uio, int ioflag)
645{
646	struct bpf_d *d = dev->si_drv1;
647	int timed_out;
648	int error;
649
650	/*
	 * Restrict application to use a buffer the same size as the
	 * kernel buffers.
	 */
654	if (uio->uio_resid != d->bd_bufsize)
655		return (EINVAL);
656
657	BPFD_LOCK(d);
658	d->bd_pid = curthread->td_proc->p_pid;
659	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
660		BPFD_UNLOCK(d);
661		return (EOPNOTSUPP);
662	}
	/*
	 * Cancel any pending timeout, but remember whether one had already
	 * fired before resetting the state to idle.
	 */
663	if (d->bd_state == BPF_WAITING)
664		callout_stop(&d->bd_callout);
665	timed_out = (d->bd_state == BPF_TIMED_OUT);
666	d->bd_state = BPF_IDLE;
667	/*
668	 * If the hold buffer is empty, then do a timed sleep, which
669	 * ends when the timeout expires or when enough packets
670	 * have arrived to fill the store buffer.
671	 */
672	while (d->bd_hbuf == NULL) {
673		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
674			/*
675			 * A packet(s) either arrived since the previous
676			 * read or arrived while we were asleep.
677			 * Rotate the buffers and return what's here.
678			 */
679			ROTATE_BUFFERS(d);
680			break;
681		}
682
683		/*
684		 * No data is available, check to see if the bpf device
685		 * is still pointed at a real interface.  If not, return
686		 * ENXIO so that the userland process knows to rebind
687		 * it before using it again.
688		 */
689		if (d->bd_bif == NULL) {
690			BPFD_UNLOCK(d);
691			return (ENXIO);
692		}
693
694		if (ioflag & O_NONBLOCK) {
695			BPFD_UNLOCK(d);
696			return (EWOULDBLOCK);
697		}
		/* Sleep on d; bpf_wakeup() issues the matching wakeup(d). */
698		error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
699		     "bpf", d->bd_rtout);
700		if (error == EINTR || error == ERESTART) {
701			BPFD_UNLOCK(d);
702			return (error);
703		}
704		if (error == EWOULDBLOCK) {
705			/*
706			 * On a timeout, return what's in the buffer,
707			 * which may be nothing.  If there is something
708			 * in the store buffer, we can rotate the buffers.
709			 */
710			if (d->bd_hbuf)
711				/*
712				 * We filled up the buffer in between
713				 * getting the timeout and arriving
714				 * here, so we don't need to rotate.
715				 */
716				break;
717
718			if (d->bd_slen == 0) {
719				BPFD_UNLOCK(d);
720				return (0);
721			}
722			ROTATE_BUFFERS(d);
723			break;
724		}
725	}
726	/*
727	 * At this point, we know we have something in the hold slot.
728	 */
729	BPFD_UNLOCK(d);
730
731	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bd_bufsize bytes.
	 *
	 * XXXRW: More synchronization needed here: what if a second thread
	 * issues a read on the same fd at the same time?  Don't want this
	 * getting invalidated.
	 */
740	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
741
	/* The hold buffer has been consumed; make it the free buffer. */
742	BPFD_LOCK(d);
743	d->bd_fbuf = d->bd_hbuf;
744	d->bd_hbuf = NULL;
745	d->bd_hlen = 0;
746	BPFD_UNLOCK(d);
747
748	return (error);
749}
750
751/*
752 * If there are processes sleeping on this descriptor, wake them up.
753 */
754static __inline void
755bpf_wakeup(struct bpf_d *d)
756{
757
758	BPFD_LOCK_ASSERT(d);
759	if (d->bd_state == BPF_WAITING) {
760		callout_stop(&d->bd_callout);
761		d->bd_state = BPF_IDLE;
762	}
763	wakeup(d);
764	if (d->bd_async && d->bd_sig && d->bd_sigio)
765		pgsigio(&d->bd_sigio, d->bd_sig, 0);
766
767	selwakeuppri(&d->bd_sel, PRINET);
768	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
769}
770
771static void
772bpf_timed_out(void *arg)
773{
774	struct bpf_d *d = (struct bpf_d *)arg;
775
776	BPFD_LOCK(d);
777	if (d->bd_state == BPF_WAITING) {
778		d->bd_state = BPF_TIMED_OUT;
779		if (d->bd_slen != 0)
780			bpf_wakeup(d);
781	}
782	BPFD_UNLOCK(d);
783}
784
785static int
786bpf_ready(struct bpf_d *d)
787{
788
789	BPFD_LOCK_ASSERT(d);
790
791	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
792		return (1);
793	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
794	    d->bd_slen != 0)
795		return (1);
796	return (0);
797}
798
/*
 * bpfwrite - transmit one packet, supplied by userland, on the attached
 * interface.
 *
 * Returns:
 *   ENXIO     no interface is attached;
 *   ENETDOWN  the interface is not IFF_UP;
 *   0         for a zero-length write (counted as a write drop), or when
 *             the interface's output routine succeeds;
 *   otherwise the error from bpf_movein() or from if_output.
 * bd_wcount counts every call, bd_wfcount every packet accepted by
 * bpf_movein(), and bd_wdcount every early return and every failure.
 */
799static int
800bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
801{
802	struct bpf_d *d = dev->si_drv1;
803	struct ifnet *ifp;
804	struct mbuf *m, *mc;
805	struct sockaddr dst;
806	int error, hlen;
807
808	d->bd_pid = curthread->td_proc->p_pid;
809	d->bd_wcount++;
810	if (d->bd_bif == NULL) {
811		d->bd_wdcount++;
812		return (ENXIO);
813	}
814
815	ifp = d->bd_bif->bif_ifp;
816
817	if ((ifp->if_flags & IFF_UP) == 0) {
818		d->bd_wdcount++;
819		return (ENETDOWN);
820	}
821
822	if (uio->uio_resid == 0) {
823		d->bd_wdcount++;
824		return (0);
825	}
826
	/* bpf_movein() relies on dst having been zeroed. */
827	bzero(&dst, sizeof(dst));
828	m = NULL;
829	hlen = 0;
830	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
831	    &m, &dst, &hlen, d->bd_wfilter);
832	if (error) {
833		d->bd_wdcount++;
834		return (error);
835	}
836	d->bd_wfcount++;
837	if (d->bd_hdrcmplt)
838		dst.sa_family = pseudo_AF_HDRCMPLT;
839
	/*
	 * In feedback mode, duplicate the packet (link header still in
	 * place) so the copy can be handed to if_input after a successful
	 * transmit.  A failed m_dup() leaves mc NULL and skips the feedback.
	 */
840	if (d->bd_feedback) {
841		mc = m_dup(m, M_DONTWAIT);
842		if (mc != NULL)
843			mc->m_pkthdr.rcvif = ifp;
844		/* Set M_PROMISC for outgoing packets to be discarded. */
845		if (d->bd_direction == BPF_D_INOUT)
846			m->m_flags |= M_PROMISC;
847	} else
848		mc = NULL;
849
	/* Strip the link header that bpf_movein() copied into dst. */
850	m->m_pkthdr.len -= hlen;
851	m->m_len -= hlen;
852	m->m_data += hlen;	/* XXX */
853
854#ifdef MAC
855	BPFD_LOCK(d);
856	mac_bpfdesc_create_mbuf(d, m);
857	if (mc != NULL)
858		mac_bpfdesc_create_mbuf(d, mc);
859	BPFD_UNLOCK(d);
860#endif
861
862	error = (*ifp->if_output)(ifp, m, &dst, NULL);
863	if (error)
864		d->bd_wdcount++;
865
866	if (mc != NULL) {
867		if (error == 0)
868			(*ifp->if_input)(ifp, mc);
869		else
870			m_freem(mc);
871	}
872
873	return (error);
874}
875
876/*
877 * Reset a descriptor by flushing its packet buffer and clearing the
878 * receive and drop counts.
879 */
880static void
881reset_d(struct bpf_d *d)
882{
883
884	mtx_assert(&d->bd_mtx, MA_OWNED);
885	if (d->bd_hbuf) {
886		/* Free the hold buffer. */
887		d->bd_fbuf = d->bd_hbuf;
888		d->bd_hbuf = NULL;
889	}
890	d->bd_slen = 0;
891	d->bd_hlen = 0;
892	d->bd_rcount = 0;
893	d->bd_dcount = 0;
894	d->bd_fcount = 0;
895	d->bd_wcount = 0;
896	d->bd_wfcount = 0;
897	d->bd_wdcount = 0;
898	d->bd_zcopy = 0;
899}
900
901/*
 * bpfioctl - handle ioctl requests on a bpf descriptor.
 *
 *  FIONREAD		Check for read packet available.
 *  SIOCGIFADDR		Get interface address - convenient hook to driver.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSBLEN		Set buffer len.
 *  BIOCSETF		Set ethernet read filter.
 *  BIOCSETWF		Set ethernet write filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLT		Get link layer type.
 *  BIOCGDLTLIST	Get list of supported link layer types.
 *  BIOCSDLT		Set link layer type.
 *  BIOCGETIF		Get interface name.
 *  BIOCSETIF		Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag
 *  BIOCSHDRCMPLT	Set "header already complete" flag
 *  BIOCGDIRECTION	Get packet direction flag
 *  BIOCSDIRECTION	Set packet direction flag
 *  BIOCLOCK		Set "locked" flag
 *  BIOCFEEDBACK	Set packet feedback mode.
 *  BIOCSRSIG		Set receive signal.
 *  BIOCGRSIG		Get receive signal.
 *  BIOCSETZBUF		Set current zero-copy buffer locations.
 *  BIOCGETZMAX		Get maximum zero-copy buffer size.
 *  BIOCROTZBUF		Force rotation of zero-copy buffer
 *  BIOCSETBUFMODE	Set buffer mode.
 *  BIOCGETBUFMODE	Get current buffer mode.
 *  FIONBIO		Accepted, no action taken here.
 *  FIOASYNC		Set async (signal on receive) mode.
 *  FIOSETOWN/FIOGETOWN	Set/get the signal owner.
 *  TIOCSPGRP/TIOCGPGRP	Deprecated forms of FIOSETOWN/FIOGETOWN.
 *
 * Every call refreshes the descriptor's pid and cancels a pending read
 * timeout.  Unknown commands return EINVAL; on a descriptor locked with
 * BIOCLOCK, commands outside the permitted list return EPERM.
 */
929/* ARGSUSED */
930static	int
931bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
932    struct thread *td)
933{
934	struct bpf_d *d = dev->si_drv1;
935	int error = 0;
936
937	/*
938	 * Refresh PID associated with this descriptor.
939	 */
940	BPFD_LOCK(d);
941	d->bd_pid = td->td_proc->p_pid;
942	if (d->bd_state == BPF_WAITING)
943		callout_stop(&d->bd_callout);
944	d->bd_state = BPF_IDLE;
945	BPFD_UNLOCK(d);
946
	/*
	 * Once the descriptor has been locked with BIOCLOCK, only the
	 * commands listed here are let through; everything else is EPERM.
	 */
947	if (d->bd_locked == 1) {
948		switch (cmd) {
949		case BIOCGBLEN:
950		case BIOCFLUSH:
951		case BIOCGDLT:
952		case BIOCGDLTLIST:
953		case BIOCGETIF:
954		case BIOCGRTIMEOUT:
955		case BIOCGSTATS:
956		case BIOCVERSION:
957		case BIOCGRSIG:
958		case BIOCGHDRCMPLT:
959		case BIOCFEEDBACK:
960		case FIONREAD:
961		case BIOCLOCK:
962		case BIOCSRTIMEOUT:
963		case BIOCIMMEDIATE:
964		case TIOCGPGRP:
965		case BIOCROTZBUF:
966			break;
967		default:
968			return (EPERM);
969		}
970	}
971	switch (cmd) {
972
973	default:
974		error = EINVAL;
975		break;
976
977	/*
978	 * Check for read packet available.
979	 */
980	case FIONREAD:
981		{
982			int n;
983
			/* Bytes in the store buffer plus any hold buffer. */
984			BPFD_LOCK(d);
985			n = d->bd_slen;
986			if (d->bd_hbuf)
987				n += d->bd_hlen;
988			BPFD_UNLOCK(d);
989
990			*(int *)addr = n;
991			break;
992		}
993
	/*
	 * Pass the request through to the attached interface's ioctl
	 * routine; EINVAL if no interface is attached.
	 */
994	case SIOCGIFADDR:
995		{
996			struct ifnet *ifp;
997
998			if (d->bd_bif == NULL)
999				error = EINVAL;
1000			else {
1001				ifp = d->bd_bif->bif_ifp;
1002				error = (*ifp->if_ioctl)(ifp, cmd, addr);
1003			}
1004			break;
1005		}
1006
1007	/*
1008	 * Get buffer len [for read()].
1009	 */
1010	case BIOCGBLEN:
1011		*(u_int *)addr = d->bd_bufsize;
1012		break;
1013
1014	/*
1015	 * Set buffer length.
1016	 */
1017	case BIOCSBLEN:
1018		error = bpf_ioctl_sblen(d, (u_int *)addr);
1019		break;
1020
1021	/*
	 * Set link layer read (BIOCSETF) or write (BIOCSETWF) filter.
	 */
1024	case BIOCSETF:
1025	case BIOCSETWF:
1026		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1027		break;
1028
1029	/*
1030	 * Flush read packet buffer.
1031	 */
1032	case BIOCFLUSH:
1033		BPFD_LOCK(d);
1034		reset_d(d);
1035		BPFD_UNLOCK(d);
1036		break;
1037
1038	/*
1039	 * Put interface into promiscuous mode.
1040	 */
1041	case BIOCPROMISC:
1042		if (d->bd_bif == NULL) {
1043			/*
1044			 * No interface attached yet.
1045			 */
1046			error = EINVAL;
1047			break;
1048		}
		/* Only request promiscuous mode once per descriptor. */
1049		if (d->bd_promisc == 0) {
1050			error = ifpromisc(d->bd_bif->bif_ifp, 1);
1051			if (error == 0)
1052				d->bd_promisc = 1;
1053		}
1054		break;
1055
1056	/*
1057	 * Get current data link type.
1058	 */
1059	case BIOCGDLT:
1060		if (d->bd_bif == NULL)
1061			error = EINVAL;
1062		else
1063			*(u_int *)addr = d->bd_bif->bif_dlt;
1064		break;
1065
1066	/*
1067	 * Get a list of supported data link types.
1068	 */
1069	case BIOCGDLTLIST:
1070		if (d->bd_bif == NULL)
1071			error = EINVAL;
1072		else
1073			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1074		break;
1075
1076	/*
1077	 * Set data link type.
1078	 */
1079	case BIOCSDLT:
1080		if (d->bd_bif == NULL)
1081			error = EINVAL;
1082		else
1083			error = bpf_setdlt(d, *(u_int *)addr);
1084		break;
1085
1086	/*
1087	 * Get interface name.
1088	 */
1089	case BIOCGETIF:
1090		if (d->bd_bif == NULL)
1091			error = EINVAL;
1092		else {
1093			struct ifnet *const ifp = d->bd_bif->bif_ifp;
1094			struct ifreq *const ifr = (struct ifreq *)addr;
1095
1096			strlcpy(ifr->ifr_name, ifp->if_xname,
1097			    sizeof(ifr->ifr_name));
1098		}
1099		break;
1100
1101	/*
1102	 * Set interface.
1103	 */
1104	case BIOCSETIF:
1105		error = bpf_setif(d, (struct ifreq *)addr);
1106		break;
1107
1108	/*
1109	 * Set read timeout.
1110	 */
1111	case BIOCSRTIMEOUT:
1112		{
1113			struct timeval *tv = (struct timeval *)addr;
1114
1115			/*
1116			 * Subtract 1 tick from tvtohz() since this isn't
1117			 * a one-shot timer.
1118			 */
1119			if ((error = itimerfix(tv)) == 0)
1120				d->bd_rtout = tvtohz(tv) - 1;
1121			break;
1122		}
1123
1124	/*
1125	 * Get read timeout.
1126	 */
1127	case BIOCGRTIMEOUT:
1128		{
1129			struct timeval *tv = (struct timeval *)addr;
1130
			/* Convert the stored tick count back to a timeval. */
1131			tv->tv_sec = d->bd_rtout / hz;
1132			tv->tv_usec = (d->bd_rtout % hz) * tick;
1133			break;
1134		}
1135
1136	/*
1137	 * Get packet stats.
1138	 */
1139	case BIOCGSTATS:
1140		{
1141			struct bpf_stat *bs = (struct bpf_stat *)addr;
1142
1143			/* XXXCSJP overflow */
1144			bs->bs_recv = d->bd_rcount;
1145			bs->bs_drop = d->bd_dcount;
1146			break;
1147		}
1148
1149	/*
1150	 * Set immediate mode.
1151	 */
1152	case BIOCIMMEDIATE:
1153		d->bd_immediate = *(u_int *)addr;
1154		break;
1155
	/*
	 * Get filter language version.
	 */
1156	case BIOCVERSION:
1157		{
1158			struct bpf_version *bv = (struct bpf_version *)addr;
1159
1160			bv->bv_major = BPF_MAJOR_VERSION;
1161			bv->bv_minor = BPF_MINOR_VERSION;
1162			break;
1163		}
1164
1165	/*
1166	 * Get "header already complete" flag
1167	 */
1168	case BIOCGHDRCMPLT:
1169		*(u_int *)addr = d->bd_hdrcmplt;
1170		break;
1171
1172	/*
1173	 * Set "header already complete" flag
1174	 */
1175	case BIOCSHDRCMPLT:
1176		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1177		break;
1178
1179	/*
1180	 * Get packet direction flag
1181	 */
1182	case BIOCGDIRECTION:
1183		*(u_int *)addr = d->bd_direction;
1184		break;
1185
1186	/*
1187	 * Set packet direction flag
1188	 */
1189	case BIOCSDIRECTION:
1190		{
1191			u_int	direction;
1192
1193			direction = *(u_int *)addr;
1194			switch (direction) {
1195			case BPF_D_IN:
1196			case BPF_D_INOUT:
1197			case BPF_D_OUT:
1198				d->bd_direction = direction;
1199				break;
1200			default:
1201				error = EINVAL;
1202			}
1203		}
1204		break;
1205
	/*
	 * Set packet feedback mode.
	 */
1206	case BIOCFEEDBACK:
1207		d->bd_feedback = *(u_int *)addr;
1208		break;
1209
	/*
	 * Set "locked" flag; the check at the top of this function then
	 * restricts which commands are accepted.
	 */
1210	case BIOCLOCK:
1211		d->bd_locked = 1;
1212		break;
1213
1214	case FIONBIO:		/* Non-blocking I/O */
1215		break;
1216
1217	case FIOASYNC:		/* Send signal on receive packets */
1218		d->bd_async = *(int *)addr;
1219		break;
1220
1221	case FIOSETOWN:
1222		error = fsetown(*(int *)addr, &d->bd_sigio);
1223		break;
1224
1225	case FIOGETOWN:
1226		*(int *)addr = fgetown(&d->bd_sigio);
1227		break;
1228
1229	/* This is deprecated, FIOSETOWN should be used instead. */
1230	case TIOCSPGRP:
1231		error = fsetown(-(*(int *)addr), &d->bd_sigio);
1232		break;
1233
1234	/* This is deprecated, FIOGETOWN should be used instead. */
1235	case TIOCGPGRP:
1236		*(int *)addr = -fgetown(&d->bd_sigio);
1237		break;
1238
1239	case BIOCSRSIG:		/* Set receive signal */
1240		{
1241			u_int sig;
1242
1243			sig = *(u_int *)addr;
1244
1245			if (sig >= NSIG)
1246				error = EINVAL;
1247			else
1248				d->bd_sig = sig;
1249			break;
1250		}
1251	case BIOCGRSIG:
1252		*(u_int *)addr = d->bd_sig;
1253		break;
1254
1255	case BIOCGETBUFMODE:
1256		*(u_int *)addr = d->bd_bufmode;
1257		break;
1258
1259	case BIOCSETBUFMODE:
1260		/*
1261		 * Allow the buffering mode to be changed as long as we
1262		 * haven't yet committed to a particular mode.  Our
1263		 * definition of commitment, for now, is whether or not a
1264		 * buffer has been allocated or an interface attached, since
1265		 * that's the point where things get tricky.
1266		 */
1267		switch (*(u_int *)addr) {
1268		case BPF_BUFMODE_BUFFER:
1269			break;
1270
1271		case BPF_BUFMODE_ZBUF:
1272			if (bpf_zerocopy_enable)
1273				break;
1274			/* FALLTHROUGH */
1275
1276		default:
1277			return (EINVAL);
1278		}
1279
1280		BPFD_LOCK(d);
1281		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1282		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
1283			BPFD_UNLOCK(d);
1284			return (EBUSY);
1285		}
1286		d->bd_bufmode = *(u_int *)addr;
1287		BPFD_UNLOCK(d);
1288		break;
1289
	/*
	 * The zero-copy requests return the helper's result directly; each
	 * helper yields EOPNOTSUPP unless the descriptor is in
	 * BPF_BUFMODE_ZBUF mode.
	 */
1290	case BIOCGETZMAX:
1291		return (bpf_ioctl_getzmax(td, d, (size_t *)addr));
1292
1293	case BIOCSETZBUF:
1294		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
1295
1296	case BIOCROTZBUF:
1297		return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
1298	}
1299	return (error);
1300}
1301
1302/*
1303 * Set d's packet filter program to fp.  If this file already has a filter,
1304 * free it and replace it.  Returns EINVAL for bogus requests.
1305 */
1306static int
1307bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1308{
1309	struct bpf_insn *fcode, *old;
1310	u_int wfilter, flen, size;
1311#ifdef BPF_JITTER
1312	bpf_jit_filter *ofunc;
1313#endif
1314
1315	if (cmd == BIOCSETWF) {
1316		old = d->bd_wfilter;
1317		wfilter = 1;
1318#ifdef BPF_JITTER
1319		ofunc = NULL;
1320#endif
1321	} else {
1322		wfilter = 0;
1323		old = d->bd_rfilter;
1324#ifdef BPF_JITTER
1325		ofunc = d->bd_bfilter;
1326#endif
1327	}
1328	if (fp->bf_insns == NULL) {
1329		if (fp->bf_len != 0)
1330			return (EINVAL);
1331		BPFD_LOCK(d);
1332		if (wfilter)
1333			d->bd_wfilter = NULL;
1334		else {
1335			d->bd_rfilter = NULL;
1336#ifdef BPF_JITTER
1337			d->bd_bfilter = NULL;
1338#endif
1339		}
1340		reset_d(d);
1341		BPFD_UNLOCK(d);
1342		if (old != NULL)
1343			free((caddr_t)old, M_BPF);
1344#ifdef BPF_JITTER
1345		if (ofunc != NULL)
1346			bpf_destroy_jit_filter(ofunc);
1347#endif
1348		return (0);
1349	}
1350	flen = fp->bf_len;
1351	if (flen > bpf_maxinsns)
1352		return (EINVAL);
1353
1354	size = flen * sizeof(*fp->bf_insns);
1355	fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK);
1356	if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 &&
1357	    bpf_validate(fcode, (int)flen)) {
1358		BPFD_LOCK(d);
1359		if (wfilter)
1360			d->bd_wfilter = fcode;
1361		else {
1362			d->bd_rfilter = fcode;
1363#ifdef BPF_JITTER
1364			d->bd_bfilter = bpf_jitter(fcode, flen);
1365#endif
1366		}
1367		reset_d(d);
1368		BPFD_UNLOCK(d);
1369		if (old != NULL)
1370			free((caddr_t)old, M_BPF);
1371#ifdef BPF_JITTER
1372		if (ofunc != NULL)
1373			bpf_destroy_jit_filter(ofunc);
1374#endif
1375
1376		return (0);
1377	}
1378	free((caddr_t)fcode, M_BPF);
1379	return (EINVAL);
1380}
1381
1382/*
1383 * Detach a file from its current interface (if attached at all) and attach
1384 * to the interface indicated by the name stored in ifr.
1385 * Return an errno or 0.
1386 */
1387static int
1388bpf_setif(struct bpf_d *d, struct ifreq *ifr)
1389{
1390	struct bpf_if *bp;
1391	struct ifnet *theywant;
1392
1393	theywant = ifunit(ifr->ifr_name);
1394	if (theywant == NULL || theywant->if_bpf == NULL)
1395		return (ENXIO);
1396
1397	bp = theywant->if_bpf;
1398
1399	/*
1400	 * Behavior here depends on the buffering model.  If we're using
1401	 * kernel memory buffers, then we can allocate them here.  If we're
1402	 * using zero-copy, then the user process must have registered
1403	 * buffers by the time we get here.  If not, return an error.
1404	 *
1405	 * XXXRW: There are locking issues here with multi-threaded use: what
1406	 * if two threads try to set the interface at once?
1407	 */
1408	switch (d->bd_bufmode) {
1409	case BPF_BUFMODE_BUFFER:
1410		if (d->bd_sbuf == NULL)
1411			bpf_buffer_alloc(d);
1412		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
1413		break;
1414
1415	case BPF_BUFMODE_ZBUF:
1416		if (d->bd_sbuf == NULL)
1417			return (EINVAL);
1418		break;
1419
1420	default:
1421		panic("bpf_setif: bufmode %d", d->bd_bufmode);
1422	}
1423	if (bp != d->bd_bif) {
1424		if (d->bd_bif)
1425			/*
1426			 * Detach if attached to something else.
1427			 */
1428			bpf_detachd(d);
1429
1430		bpf_attachd(d, bp);
1431	}
1432	BPFD_LOCK(d);
1433	reset_d(d);
1434	BPFD_UNLOCK(d);
1435	return (0);
1436}
1437
1438/*
1439 * Support for select() and poll() system calls
1440 *
1441 * Return true iff the specific operation will not block indefinitely.
1442 * Otherwise, return false but make a note that a selwakeup() must be done.
1443 */
1444static int
1445bpfpoll(struct cdev *dev, int events, struct thread *td)
1446{
1447	struct bpf_d *d;
1448	int revents;
1449
1450	d = dev->si_drv1;
1451	if (d->bd_bif == NULL)
1452		return (ENXIO);
1453
1454	/*
1455	 * Refresh PID associated with this descriptor.
1456	 */
1457	revents = events & (POLLOUT | POLLWRNORM);
1458	BPFD_LOCK(d);
1459	d->bd_pid = td->td_proc->p_pid;
1460	if (events & (POLLIN | POLLRDNORM)) {
1461		if (bpf_ready(d))
1462			revents |= events & (POLLIN | POLLRDNORM);
1463		else {
1464			selrecord(td, &d->bd_sel);
1465			/* Start the read timeout if necessary. */
1466			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1467				callout_reset(&d->bd_callout, d->bd_rtout,
1468				    bpf_timed_out, d);
1469				d->bd_state = BPF_WAITING;
1470			}
1471		}
1472	}
1473	BPFD_UNLOCK(d);
1474	return (revents);
1475}
1476
1477/*
1478 * Support for kevent() system call.  Register EVFILT_READ filters and
1479 * reject all others.
1480 */
1481int
1482bpfkqfilter(struct cdev *dev, struct knote *kn)
1483{
1484	struct bpf_d *d = (struct bpf_d *)dev->si_drv1;
1485
1486	if (kn->kn_filter != EVFILT_READ)
1487		return (1);
1488
1489	/*
1490	 * Refresh PID associated with this descriptor.
1491	 */
1492	BPFD_LOCK(d);
1493	d->bd_pid = curthread->td_proc->p_pid;
1494	kn->kn_fop = &bpfread_filtops;
1495	kn->kn_hook = d;
1496	knlist_add(&d->bd_sel.si_note, kn, 1);
1497	BPFD_UNLOCK(d);
1498
1499	return (0);
1500}
1501
1502static void
1503filt_bpfdetach(struct knote *kn)
1504{
1505	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
1506
1507	knlist_remove(&d->bd_sel.si_note, kn, 0);
1508}
1509
1510static int
1511filt_bpfread(struct knote *kn, long hint)
1512{
1513	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
1514	int ready;
1515
1516	BPFD_LOCK_ASSERT(d);
1517	ready = bpf_ready(d);
1518	if (ready) {
1519		kn->kn_data = d->bd_slen;
1520		if (d->bd_hbuf)
1521			kn->kn_data += d->bd_hlen;
1522	}
1523	else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1524		callout_reset(&d->bd_callout, d->bd_rtout,
1525		    bpf_timed_out, d);
1526		d->bd_state = BPF_WAITING;
1527	}
1528
1529	return (ready);
1530}
1531
1532/*
1533 * Incoming linkage from device drivers.  Process the packet pkt, of length
1534 * pktlen, which is stored in a contiguous buffer.  The packet is parsed
1535 * by each process' filter, and if accepted, stashed into the corresponding
1536 * buffer.
1537 */
1538void
1539bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
1540{
1541	struct bpf_d *d;
1542	u_int slen;
1543	int gottime;
1544	struct timeval tv;
1545
1546	gottime = 0;
1547	BPFIF_LOCK(bp);
1548	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1549		BPFD_LOCK(d);
1550		++d->bd_rcount;
1551#ifdef BPF_JITTER
1552		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL)
1553			slen = (*(d->bd_bfilter->func))(pkt, pktlen, pktlen);
1554		else
1555#endif
1556		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
1557		if (slen != 0) {
1558			d->bd_fcount++;
1559			if (!gottime) {
1560				microtime(&tv);
1561				gottime = 1;
1562			}
1563#ifdef MAC
1564			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
1565#endif
1566				catchpacket(d, pkt, pktlen, slen,
1567				    bpf_append_bytes, &tv);
1568		}
1569		BPFD_UNLOCK(d);
1570	}
1571	BPFIF_UNLOCK(bp);
1572}
1573
/*
 * Non-zero when descriptor d must skip a packet whose receive interface
 * pointer is i: input-only descriptors skip packets with a NULL receive
 * interface, output-only descriptors skip packets that have one.
 */
#define	BPF_CHECK_DIRECTION(d, i)					\
	((i) == NULL ? (d)->bd_direction == BPF_D_IN :			\
	    (d)->bd_direction == BPF_D_OUT)
1577
1578/*
1579 * Incoming linkage from device drivers, when packet is in an mbuf chain.
1580 */
1581void
1582bpf_mtap(struct bpf_if *bp, struct mbuf *m)
1583{
1584	struct bpf_d *d;
1585	u_int pktlen, slen;
1586	int gottime;
1587	struct timeval tv;
1588
1589	/* Skip outgoing duplicate packets. */
1590	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
1591		m->m_flags &= ~M_PROMISC;
1592		return;
1593	}
1594
1595	gottime = 0;
1596
1597	pktlen = m_length(m, NULL);
1598
1599	BPFIF_LOCK(bp);
1600	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1601		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif))
1602			continue;
1603		BPFD_LOCK(d);
1604		++d->bd_rcount;
1605#ifdef BPF_JITTER
1606		/* XXX We cannot handle multiple mbufs. */
1607		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL &&
1608		    m->m_next == NULL)
1609			slen = (*(d->bd_bfilter->func))(mtod(m, u_char *),
1610			    pktlen, pktlen);
1611		else
1612#endif
1613		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
1614		if (slen != 0) {
1615			d->bd_fcount++;
1616			if (!gottime) {
1617				microtime(&tv);
1618				gottime = 1;
1619			}
1620#ifdef MAC
1621			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
1622#endif
1623				catchpacket(d, (u_char *)m, pktlen, slen,
1624				    bpf_append_mbuf, &tv);
1625		}
1626		BPFD_UNLOCK(d);
1627	}
1628	BPFIF_UNLOCK(bp);
1629}
1630
1631/*
1632 * Incoming linkage from device drivers, when packet is in
1633 * an mbuf chain and to be prepended by a contiguous header.
1634 */
1635void
1636bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
1637{
1638	struct mbuf mb;
1639	struct bpf_d *d;
1640	u_int pktlen, slen;
1641	int gottime;
1642	struct timeval tv;
1643
1644	/* Skip outgoing duplicate packets. */
1645	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
1646		m->m_flags &= ~M_PROMISC;
1647		return;
1648	}
1649
1650	gottime = 0;
1651
1652	pktlen = m_length(m, NULL);
1653	/*
1654	 * Craft on-stack mbuf suitable for passing to bpf_filter.
1655	 * Note that we cut corners here; we only setup what's
1656	 * absolutely needed--this mbuf should never go anywhere else.
1657	 */
1658	mb.m_next = m;
1659	mb.m_data = data;
1660	mb.m_len = dlen;
1661	pktlen += dlen;
1662
1663	BPFIF_LOCK(bp);
1664	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1665		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif))
1666			continue;
1667		BPFD_LOCK(d);
1668		++d->bd_rcount;
1669		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
1670		if (slen != 0) {
1671			d->bd_fcount++;
1672			if (!gottime) {
1673				microtime(&tv);
1674				gottime = 1;
1675			}
1676#ifdef MAC
1677			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
1678#endif
1679				catchpacket(d, (u_char *)&mb, pktlen, slen,
1680				    bpf_append_mbuf, &tv);
1681		}
1682		BPFD_UNLOCK(d);
1683	}
1684	BPFIF_UNLOCK(bp);
1685}
1686
1687#undef	BPF_CHECK_DIRECTION
1688
1689/*
1690 * Move the packet data from interface memory (pkt) into the
1691 * store buffer.  "cpfn" is the routine called to do the actual data
1692 * transfer.  bcopy is passed in to copy contiguous chunks, while
1693 * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
1694 * pkt is really an mbuf.
1695 */
1696static void
1697catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
1698    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
1699    struct timeval *tv)
1700{
1701	struct bpf_hdr hdr;
1702	int totlen, curlen;
1703	int hdrlen = d->bd_bif->bif_hdrlen;
1704	int do_wakeup = 0;
1705
1706	BPFD_LOCK_ASSERT(d);
1707
1708	/*
1709	 * Detect whether user space has released a buffer back to us, and if
1710	 * so, move it from being a hold buffer to a free buffer.  This may
1711	 * not be the best place to do it (for example, we might only want to
1712	 * run this check if we need the space), but for now it's a reliable
1713	 * spot to do it.
1714	 */
1715	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
1716		d->bd_fbuf = d->bd_hbuf;
1717		d->bd_hbuf = NULL;
1718		d->bd_hlen = 0;
1719	}
1720
1721	/*
1722	 * Figure out how many bytes to move.  If the packet is
1723	 * greater or equal to the snapshot length, transfer that
1724	 * much.  Otherwise, transfer the whole packet (unless
1725	 * we hit the buffer size limit).
1726	 */
1727	totlen = hdrlen + min(snaplen, pktlen);
1728	if (totlen > d->bd_bufsize)
1729		totlen = d->bd_bufsize;
1730
1731	/*
1732	 * Round up the end of the previous packet to the next longword.
1733	 *
1734	 * Drop the packet if there's no room and no hope of room
1735	 * If the packet would overflow the storage buffer or the storage
1736	 * buffer is considered immutable by the buffer model, try to rotate
1737	 * the buffer and wakeup pending processes.
1738	 */
1739	curlen = BPF_WORDALIGN(d->bd_slen);
1740	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
1741		if (d->bd_fbuf == NULL) {
1742			/*
1743			 * There's no room in the store buffer, and no
1744			 * prospect of room, so drop the packet.  Notify the
1745			 * buffer model.
1746			 */
1747			bpf_buffull(d);
1748			++d->bd_dcount;
1749			return;
1750		}
1751		ROTATE_BUFFERS(d);
1752		do_wakeup = 1;
1753		curlen = 0;
1754	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
1755		/*
1756		 * Immediate mode is set, or the read timeout has already
1757		 * expired during a select call.  A packet arrived, so the
1758		 * reader should be woken up.
1759		 */
1760		do_wakeup = 1;
1761
1762	/*
1763	 * Append the bpf header.  Note we append the actual header size, but
1764	 * move forward the length of the header plus padding.
1765	 */
1766	bzero(&hdr, sizeof(hdr));
1767	hdr.bh_tstamp = *tv;
1768	hdr.bh_datalen = pktlen;
1769	hdr.bh_hdrlen = hdrlen;
1770	hdr.bh_caplen = totlen - hdrlen;
1771	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
1772
1773	/*
1774	 * Copy the packet data into the store buffer and update its length.
1775	 */
1776	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
1777	d->bd_slen = curlen + totlen;
1778
1779	if (do_wakeup)
1780		bpf_wakeup(d);
1781}
1782
1783/*
1784 * Free buffers currently in use by a descriptor.
1785 * Called on close.
1786 */
1787static void
1788bpf_freed(struct bpf_d *d)
1789{
1790
1791	/*
1792	 * We don't need to lock out interrupts since this descriptor has
1793	 * been detached from its interface and it yet hasn't been marked
1794	 * free.
1795	 */
1796	bpf_free(d);
1797	if (d->bd_rfilter) {
1798		free((caddr_t)d->bd_rfilter, M_BPF);
1799#ifdef BPF_JITTER
1800		bpf_destroy_jit_filter(d->bd_bfilter);
1801#endif
1802	}
1803	if (d->bd_wfilter)
1804		free((caddr_t)d->bd_wfilter, M_BPF);
1805	mtx_destroy(&d->bd_mtx);
1806}
1807
1808/*
1809 * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
1810 * fixed size of the link header (variable length headers not yet supported).
1811 */
1812void
1813bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
1814{
1815
1816	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
1817}
1818
1819/*
1820 * Attach an interface to bpf.  ifp is a pointer to the structure
1821 * defining the interface to be attached, dlt is the link layer type,
1822 * and hdrlen is the fixed size of the link header (variable length
1823 * headers are not yet supporrted).
1824 */
1825void
1826bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
1827{
1828	struct bpf_if *bp;
1829
1830	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
1831	if (bp == NULL)
1832		panic("bpfattach");
1833
1834	LIST_INIT(&bp->bif_dlist);
1835	bp->bif_ifp = ifp;
1836	bp->bif_dlt = dlt;
1837	mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF);
1838	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
1839	*driverp = bp;
1840
1841	mtx_lock(&bpf_mtx);
1842	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
1843	mtx_unlock(&bpf_mtx);
1844
1845	/*
1846	 * Compute the length of the bpf header.  This is not necessarily
1847	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
1848	 * that the network layer header begins on a longword boundary (for
1849	 * performance reasons and to alleviate alignment restrictions).
1850	 */
1851	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
1852
1853	if (bootverbose)
1854		if_printf(ifp, "bpf attached\n");
1855}
1856
1857/*
1858 * Detach bpf from an interface.  This involves detaching each descriptor
1859 * associated with the interface, and leaving bd_bif NULL.  Notify each
1860 * descriptor as it's detached so that any sleepers wake up and get
1861 * ENXIO.
1862 */
1863void
1864bpfdetach(struct ifnet *ifp)
1865{
1866	struct bpf_if	*bp;
1867	struct bpf_d	*d;
1868
1869	/* Locate BPF interface information */
1870	mtx_lock(&bpf_mtx);
1871	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
1872		if (ifp == bp->bif_ifp)
1873			break;
1874	}
1875
1876	/* Interface wasn't attached */
1877	if ((bp == NULL) || (bp->bif_ifp == NULL)) {
1878		mtx_unlock(&bpf_mtx);
1879		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
1880		return;
1881	}
1882
1883	LIST_REMOVE(bp, bif_next);
1884	mtx_unlock(&bpf_mtx);
1885
1886	while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
1887		bpf_detachd(d);
1888		BPFD_LOCK(d);
1889		bpf_wakeup(d);
1890		BPFD_UNLOCK(d);
1891	}
1892
1893	mtx_destroy(&bp->bif_mtx);
1894	free(bp, M_BPF);
1895}
1896
1897/*
1898 * Get a list of available data link type of the interface.
1899 */
1900static int
1901bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
1902{
1903	int n, error;
1904	struct ifnet *ifp;
1905	struct bpf_if *bp;
1906
1907	ifp = d->bd_bif->bif_ifp;
1908	n = 0;
1909	error = 0;
1910	mtx_lock(&bpf_mtx);
1911	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
1912		if (bp->bif_ifp != ifp)
1913			continue;
1914		if (bfl->bfl_list != NULL) {
1915			if (n >= bfl->bfl_len) {
1916				mtx_unlock(&bpf_mtx);
1917				return (ENOMEM);
1918			}
1919			error = copyout(&bp->bif_dlt,
1920			    bfl->bfl_list + n, sizeof(u_int));
1921		}
1922		n++;
1923	}
1924	mtx_unlock(&bpf_mtx);
1925	bfl->bfl_len = n;
1926	return (error);
1927}
1928
1929/*
1930 * Set the data link type of a BPF instance.
1931 */
1932static int
1933bpf_setdlt(struct bpf_d *d, u_int dlt)
1934{
1935	int error, opromisc;
1936	struct ifnet *ifp;
1937	struct bpf_if *bp;
1938
1939	if (d->bd_bif->bif_dlt == dlt)
1940		return (0);
1941	ifp = d->bd_bif->bif_ifp;
1942	mtx_lock(&bpf_mtx);
1943	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
1944		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
1945			break;
1946	}
1947	mtx_unlock(&bpf_mtx);
1948	if (bp != NULL) {
1949		opromisc = d->bd_promisc;
1950		bpf_detachd(d);
1951		bpf_attachd(d, bp);
1952		BPFD_LOCK(d);
1953		reset_d(d);
1954		BPFD_UNLOCK(d);
1955		if (opromisc) {
1956			error = ifpromisc(bp->bif_ifp, 1);
1957			if (error)
1958				if_printf(bp->bif_ifp,
1959					"bpf_setdlt: ifpromisc failed (%d)\n",
1960					error);
1961			else
1962				d->bd_promisc = 1;
1963		}
1964	}
1965	return (bp == NULL ? EINVAL : 0);
1966}
1967
1968static void
1969bpf_clone(void *arg, struct ucred *cred, char *name, int namelen,
1970    struct cdev **dev)
1971{
1972	int u;
1973
1974	if (*dev != NULL)
1975		return;
1976	if (dev_stdclone(name, NULL, "bpf", &u) != 1)
1977		return;
1978	*dev = make_dev(&bpf_cdevsw, unit2minor(u), UID_ROOT, GID_WHEEL, 0600,
1979	    "bpf%d", u);
1980	dev_ref(*dev);
1981	(*dev)->si_flags |= SI_CHEAPCLONE;
1982	return;
1983}
1984
1985static void
1986bpf_drvinit(void *unused)
1987{
1988
1989	mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF);
1990	LIST_INIT(&bpf_iflist);
1991	EVENTHANDLER_REGISTER(dev_clone, bpf_clone, 0, 1000);
1992}
1993
1994static void
1995bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
1996{
1997
1998	bzero(d, sizeof(*d));
1999	BPFD_LOCK_ASSERT(bd);
2000	d->bd_structsize = sizeof(*d);
2001	d->bd_immediate = bd->bd_immediate;
2002	d->bd_promisc = bd->bd_promisc;
2003	d->bd_hdrcmplt = bd->bd_hdrcmplt;
2004	d->bd_direction = bd->bd_direction;
2005	d->bd_feedback = bd->bd_feedback;
2006	d->bd_async = bd->bd_async;
2007	d->bd_rcount = bd->bd_rcount;
2008	d->bd_dcount = bd->bd_dcount;
2009	d->bd_fcount = bd->bd_fcount;
2010	d->bd_sig = bd->bd_sig;
2011	d->bd_slen = bd->bd_slen;
2012	d->bd_hlen = bd->bd_hlen;
2013	d->bd_bufsize = bd->bd_bufsize;
2014	d->bd_pid = bd->bd_pid;
2015	strlcpy(d->bd_ifname,
2016	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
2017	d->bd_locked = bd->bd_locked;
2018	d->bd_wcount = bd->bd_wcount;
2019	d->bd_wdcount = bd->bd_wdcount;
2020	d->bd_wfcount = bd->bd_wfcount;
2021	d->bd_zcopy = bd->bd_zcopy;
2022	d->bd_bufmode = bd->bd_bufmode;
2023}
2024
2025static int
2026bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
2027{
2028	struct xbpf_d *xbdbuf, *xbd;
2029	int index, error;
2030	struct bpf_if *bp;
2031	struct bpf_d *bd;
2032
2033	/*
2034	 * XXX This is not technically correct. It is possible for non
2035	 * privileged users to open bpf devices. It would make sense
2036	 * if the users who opened the devices were able to retrieve
2037	 * the statistics for them, too.
2038	 */
2039	error = priv_check(req->td, PRIV_NET_BPF);
2040	if (error)
2041		return (error);
2042	if (req->oldptr == NULL)
2043		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
2044	if (bpf_bpfd_cnt == 0)
2045		return (SYSCTL_OUT(req, 0, 0));
2046	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
2047	mtx_lock(&bpf_mtx);
2048	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
2049		mtx_unlock(&bpf_mtx);
2050		free(xbdbuf, M_BPF);
2051		return (ENOMEM);
2052	}
2053	index = 0;
2054	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2055		BPFIF_LOCK(bp);
2056		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2057			xbd = &xbdbuf[index++];
2058			BPFD_LOCK(bd);
2059			bpfstats_fill_xbpf(xbd, bd);
2060			BPFD_UNLOCK(bd);
2061		}
2062		BPFIF_UNLOCK(bp);
2063	}
2064	mtx_unlock(&bpf_mtx);
2065	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
2066	free(xbdbuf, M_BPF);
2067	return (error);
2068}
2069
/* Run bpf_drvinit() during driver initialization. */
SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);
2071
2072#else /* !DEV_BPF && !NETGRAPH_BPF */
2073/*
2074 * NOP stubs to allow bpf-using drivers to load and function.
2075 *
2076 * A 'better' implementation would allow the core bpf functionality
2077 * to be loaded at runtime.
2078 */
2079static struct bpf_if bp_null;
2080
2081void
2082bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2083{
2084}
2085
/* Stub: mbuf-chain taps are ignored. */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{

	/* Nothing to do. */
}
2090
2091void
2092bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
2093{
2094}
2095
2096void
2097bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2098{
2099
2100	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2101}
2102
2103void
2104bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
2105{
2106
2107	*driverp = &bp_null;
2108}
2109
/* Stub: there is nothing to detach. */
void
bpfdetach(struct ifnet *ifp)
{

	/* Nothing to do. */
}
2114
2115u_int
2116bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
2117{
2118	return -1;	/* "no filter" behaviour */
2119}
2120
/* Stub validator: every program is reported as invalid (false). */
int
bpf_validate(const struct bpf_insn *f, int len)
{

	return (0);
}
2126
2127#endif /* !DEV_BPF && !NETGRAPH_BPF */
2128