/*-
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/net/bpf.c 178882 2008-05-09 19:29:08Z jhb $");

#include "opt_bpf.h"
#include "opt_mac.h"
#include "opt_netgraph.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/time.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/filio.h>
#include <sys/sockio.h>
#include <sys/ttycom.h>
#include <sys/uio.h>

#include <sys/event.h>
#include <sys/file.h>
#include <sys/poll.h>
#include <sys/proc.h>

#include <sys/socket.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpf_buffer.h>
#ifdef BPF_JITTER
#include <net/bpf_jitter.h>
#endif
#include <net/bpf_zerocopy.h>
#include <net/bpfdesc.h>

#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <net80211/ieee80211_freebsd.h>

#include <security/mac/mac_framework.h>

MALLOC_DEFINE(M_BPF, "BPF", "BPF data");

#if defined(DEV_BPF) || defined(NETGRAPH_BPF)

#define PRINET  26			/* interruptible */

/*
 * bpf_iflist is a list of BPF interface structures, each corresponding to a
 * specific DLT.  The same network interface might have several BPF interface
 * structures registered by different layers in the stack (i.e., 802.11
 * frames, ethernet frames, etc).
 */
static LIST_HEAD(, bpf_if)	bpf_iflist;
static struct mtx	bpf_mtx;		/* bpf global lock */
static int		bpf_bpfd_cnt;

static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
static void	bpf_detachd(struct bpf_d *);
static void	bpf_freed(struct bpf_d *);
static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
		    struct sockaddr *, int *, struct bpf_insn *);
static int	bpf_setif(struct bpf_d *, struct ifreq *);
static void	bpf_timed_out(void *);
static __inline void
		bpf_wakeup(struct bpf_d *);
static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
		    struct timeval *);
static void	reset_d(struct bpf_d *);
static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, u_int);
static void	filt_bpfdetach(struct knote *);
static int	filt_bpfread(struct knote *, long);
static void	bpf_drvinit(void *);
static void	bpf_clone(void *, struct ucred *, char *, int, struct cdev **);
static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
static int bpf_maxinsns = BPF_MAXINSNS;
SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
    &bpf_maxinsns, 0, "Maximum bpf program instructions");
static int bpf_zerocopy_enable = 0;
SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
    &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW,
    bpf_stats_sysctl, "bpf statistics portal");

static	d_open_t	bpfopen;
static	d_close_t	bpfclose;
static	d_read_t	bpfread;
static	d_write_t	bpfwrite;
static	d_ioctl_t	bpfioctl;
static	d_poll_t	bpfpoll;
static	d_kqfilter_t	bpfkqfilter;

static struct cdevsw bpf_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	D_TRACKCLOSE,
	.d_open =	bpfopen,
	.d_close =	bpfclose,
	.d_read =	bpfread,
	.d_write =	bpfwrite,
	.d_ioctl =	bpfioctl,
	.d_poll =	bpfpoll,
	.d_name =	"bpf",
	.d_kqfilter =	bpfkqfilter,
};

static struct filterops bpfread_filtops =
	{ 1, NULL, filt_bpfdetach, filt_bpfread };

/*
 * Wrapper functions for various buffering methods.  If the set of buffer
 * modes expands, we will probably want to introduce a switch data structure
 * similar to protosw, etc.
 */
static void
bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
    u_int len)
{

	BPFD_LOCK_ASSERT(d);

	switch (d->bd_bufmode) {
	case BPF_BUFMODE_BUFFER:
		return (bpf_buffer_append_bytes(d, buf, offset, src, len));

	case BPF_BUFMODE_ZBUF:
		d->bd_zcopy++;
		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));

	default:
		panic("bpf_buf_append_bytes");
	}
}

static void
bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
    u_int len)
{

	BPFD_LOCK_ASSERT(d);

	switch (d->bd_bufmode) {
	case BPF_BUFMODE_BUFFER:
		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));

	case BPF_BUFMODE_ZBUF:
		d->bd_zcopy++;
		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));

	default:
		panic("bpf_buf_append_mbuf");
	}
}

/*
 * If the buffer mechanism has a way to decide that a held buffer can be made
 * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
 * returned if the buffer can be discarded, (0) is returned if it cannot.
 */
static int
bpf_canfreebuf(struct bpf_d *d)
{

	BPFD_LOCK_ASSERT(d);

	switch (d->bd_bufmode) {
	case BPF_BUFMODE_ZBUF:
		return (bpf_zerocopy_canfreebuf(d));
	}
	return (0);
}

/*
 * Allow the buffer model to indicate that the current store buffer is
 * immutable, regardless of the appearance of space.  Return (1) if the
 * buffer is writable, and (0) if not.
 */
static int
bpf_canwritebuf(struct bpf_d *d)
{

	BPFD_LOCK_ASSERT(d);

	switch (d->bd_bufmode) {
	case BPF_BUFMODE_ZBUF:
		return (bpf_zerocopy_canwritebuf(d));
	}
	return (1);
}

/*
 * Notify buffer model that an attempt to write to the store buffer has
 * resulted in a dropped packet, in which case the buffer may be considered
 * full.
 */
static void
bpf_buffull(struct bpf_d *d)
{

	BPFD_LOCK_ASSERT(d);

	switch (d->bd_bufmode) {
	case BPF_BUFMODE_ZBUF:
		bpf_zerocopy_buffull(d);
		break;
	}
}

/*
 * Notify the buffer model that a buffer has moved into the hold position.
 */
void
bpf_bufheld(struct bpf_d *d)
{

	BPFD_LOCK_ASSERT(d);

	switch (d->bd_bufmode) {
	case BPF_BUFMODE_ZBUF:
		bpf_zerocopy_bufheld(d);
		break;
	}
}

static void
bpf_free(struct bpf_d *d)
{

	switch (d->bd_bufmode) {
	case BPF_BUFMODE_BUFFER:
		return (bpf_buffer_free(d));

	case BPF_BUFMODE_ZBUF:
		return (bpf_zerocopy_free(d));

	default:
		panic("bpf_buf_free");
	}
}

static int
bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
{

	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
		return (EOPNOTSUPP);
	return (bpf_buffer_uiomove(d, buf, len, uio));
}

static int
bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
{

	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
		return (EOPNOTSUPP);
	return (bpf_buffer_ioctl_sblen(d, i));
}

static int
bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
{

	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
		return (EOPNOTSUPP);
	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
}

static int
bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
{

	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
		return (EOPNOTSUPP);
	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
}

static int
bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
{

	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
		return (EOPNOTSUPP);
	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
}

/*
 * General BPF functions.
 */
static int
bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
    struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
{
	const struct ieee80211_bpf_params *p;
	struct ether_header *eh;
	struct mbuf *m;
	int error;
	int len;
	int hlen;
	int slen;

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	switch (linktype) {

	case DLT_SLIP:
		sockp->sa_family = AF_INET;
		hlen = 0;
		break;

	case DLT_EN10MB:
		sockp->sa_family = AF_UNSPEC;
		/* XXX Would MAXLINKHDR be better? */
		hlen = ETHER_HDR_LEN;
		break;

	case DLT_FDDI:
		sockp->sa_family = AF_IMPLINK;
		hlen = 0;
		break;

	case DLT_RAW:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_NULL:
		/*
		 * null interface types require a 4 byte pseudo header which
		 * corresponds to the address family of the packet.
		 */
		sockp->sa_family = AF_UNSPEC;
		hlen = 4;
		break;

	case DLT_ATM_RFC1483:
		/*
		 * en atm driver requires 4-byte atm pseudo header.
		 * though it isn't standard, vpi:vci needs to be
		 * specified anyway.
		 */
		sockp->sa_family = AF_UNSPEC;
		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
		break;

	case DLT_PPP:
		sockp->sa_family = AF_UNSPEC;
		hlen = 4;	/* This should match PPP_HDRLEN */
		break;

	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
		sockp->sa_family = AF_IEEE80211;
		hlen = 0;
		break;

	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
		sockp->sa_family = AF_IEEE80211;
		sockp->sa_len = 12;	/* XXX != 0 */
		hlen = sizeof(struct ieee80211_bpf_params);
		break;

	default:
		return (EIO);
	}

	len = uio->uio_resid;

	if (len - hlen > ifp->if_mtu)
		return (EMSGSIZE);

	if ((unsigned)len > MCLBYTES)
		return (EIO);

	if (len > MHLEN)
		m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
	else
		MGETHDR(m, M_WAIT, MT_DATA);
	m->m_pkthdr.len = m->m_len = len;
	m->m_pkthdr.rcvif = NULL;
	*mp = m;

	if (m->m_len < hlen) {
		error = EPERM;
		goto bad;
	}

	error = uiomove(mtod(m, u_char *), len, uio);
	if (error)
		goto bad;

	slen = bpf_filter(wfilter, mtod(m, u_char *), len, len);
	if (slen == 0) {
		error = EPERM;
		goto bad;
	}

	/* Check for multicast destination */
	switch (linktype) {
	case DLT_EN10MB:
		eh = mtod(m, struct ether_header *);
		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
			    ETHER_ADDR_LEN) == 0)
				m->m_flags |= M_BCAST;
			else
				m->m_flags |= M_MCAST;
		}
		break;
	}

	/*
	 * Make room for link header, and copy it to sockaddr
	 */
	if (hlen != 0) {
		if (sockp->sa_family == AF_IEEE80211) {
			/*
			 * Collect true length from the parameter header
			 * NB: sockp is known to be zero'd so if we do a
			 *     short copy unspecified parameters will be
			 *     zero.
			 * NB: packet may not be aligned after stripping
			 *     bpf params
			 * XXX check ibp_vers
			 */
			p = mtod(m, const struct ieee80211_bpf_params *);
			hlen = p->ibp_len;
			if (hlen > sizeof(sockp->sa_data)) {
				error = EINVAL;
				goto bad;
			}
		}
		bcopy(m->m_data, sockp->sa_data, hlen);
	}
	*hdrlen = hlen;

	return (0);
bad:
	m_freem(m);
	return (error);
}
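
/*
 * Illustrative sketch (not part of this file): a minimal userland writer
 * that injects one Ethernet frame through the write(2) path handled by
 * bpf_movein() above.  The device unit, interface name, and frame contents
 * are assumptions for the example; error handling is omitted.
 *
 *	struct ifreq ifr;
 *	int fd = open("/dev/bpf0", O_RDWR);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);	// bind descriptor to interface
 *	write(fd, frame, framelen);	// frame starts with link header
 *	close(fd);
 *
 * For DLT_EN10MB the buffer must begin with the 14-byte Ethernet header,
 * which bpf_movein() copies into the sockaddr before the mbuf is handed
 * to if_output().
 */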

/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */
	BPFIF_LOCK(bp);
	d->bd_bif = bp;
	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);

	bpf_bpfd_cnt++;
	BPFIF_UNLOCK(bp);
}

/*
 * Detach a file from its interface.
 */
static void
bpf_detachd(struct bpf_d *d)
{
	int error;
	struct bpf_if *bp;
	struct ifnet *ifp;

	bp = d->bd_bif;
	BPFIF_LOCK(bp);
	BPFD_LOCK(d);
	ifp = d->bd_bif->bif_ifp;

	/*
	 * Remove d from the interface's descriptor list.
	 */
	LIST_REMOVE(d, bd_next);

	bpf_bpfd_cnt--;
	d->bd_bif = NULL;
	BPFD_UNLOCK(d);
	BPFIF_UNLOCK(bp);

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		d->bd_promisc = 0;
		error = ifpromisc(ifp, 0);
		if (error != 0 && error != ENXIO) {
			/*
			 * ENXIO can happen if a pccard is unplugged.
			 * Something is really wrong if we were able to put
			 * the driver into promiscuous mode, but can't
			 * take it out.
			 */
			if_printf(bp->bif_ifp,
				"bpf_detach: ifpromisc failed (%d)\n", error);
		}
	}
}

/*
 * Open ethernet device.  Returns ENXIO for illegal minor device number,
 * EBUSY if file is open by another process.
 */
/* ARGSUSED */
static	int
bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	struct bpf_d *d;

	mtx_lock(&bpf_mtx);
	d = dev->si_drv1;
	/*
	 * Each minor can be opened by only one process.  If the requested
	 * minor is in use, return EBUSY.
	 */
	if (d != NULL) {
		mtx_unlock(&bpf_mtx);
		return (EBUSY);
	}
	dev->si_drv1 = (struct bpf_d *)~0;	/* mark device in use */
	mtx_unlock(&bpf_mtx);

	if ((dev->si_flags & SI_NAMED) == 0)
		make_dev(&bpf_cdevsw, minor(dev), UID_ROOT, GID_WHEEL, 0600,
		    "bpf%d", dev2unit(dev));
	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
	dev->si_drv1 = d;

	/*
	 * For historical reasons, perform a one-time initialization call to
	 * the buffer routines, even though we're not yet committed to a
	 * particular buffer method.
	 */
	bpf_buffer_init(d);
	d->bd_bufmode = BPF_BUFMODE_BUFFER;
	d->bd_sig = SIGIO;
	d->bd_direction = BPF_D_INOUT;
	d->bd_pid = td->td_proc->p_pid;
#ifdef MAC
	mac_bpfdesc_init(d);
	mac_bpfdesc_create(td->td_ucred, d);
#endif
	mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF);
	callout_init(&d->bd_callout, CALLOUT_MPSAFE);
	knlist_init(&d->bd_sel.si_note, &d->bd_mtx, NULL, NULL, NULL);

	return (0);
}

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 */
/* ARGSUSED */
static	int
bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	struct bpf_d *d = dev->si_drv1;

	BPFD_LOCK(d);
	if (d->bd_state == BPF_WAITING)
		callout_stop(&d->bd_callout);
	d->bd_state = BPF_IDLE;
	BPFD_UNLOCK(d);
	funsetown(&d->bd_sigio);
	mtx_lock(&bpf_mtx);
	if (d->bd_bif)
		bpf_detachd(d);
	mtx_unlock(&bpf_mtx);
	selwakeuppri(&d->bd_sel, PRINET);
#ifdef MAC
	mac_bpfdesc_destroy(d);
#endif /* MAC */
	knlist_destroy(&d->bd_sel.si_note);
	bpf_freed(d);
	dev->si_drv1 = NULL;
	free(d, M_BPF);

	return (0);
}

/*
 *  bpfread - read next chunk of packets from buffers
 */
static	int
bpfread(struct cdev *dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d = dev->si_drv1;
	int timed_out;
	int error;

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize)
		return (EINVAL);

	BPFD_LOCK(d);
	d->bd_pid = curthread->td_proc->p_pid;
	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
		BPFD_UNLOCK(d);
		return (EOPNOTSUPP);
	}
	if (d->bd_state == BPF_WAITING)
		callout_stop(&d->bd_callout);
	timed_out = (d->bd_state == BPF_TIMED_OUT);
	d->bd_state = BPF_IDLE;
	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == NULL) {
		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
			/*
			 * One or more packets arrived, either since the
			 * previous read or while we were asleep.  Rotate
			 * the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}

		/*
		 * No data is available, check to see if the bpf device
		 * is still pointed at a real interface.  If not, return
		 * ENXIO so that the userland process knows to rebind
		 * it before using it again.
		 */
		if (d->bd_bif == NULL) {
			BPFD_UNLOCK(d);
			return (ENXIO);
		}

		if (ioflag & O_NONBLOCK) {
			BPFD_UNLOCK(d);
			return (EWOULDBLOCK);
		}
		error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
		     "bpf", d->bd_rtout);
		if (error == EINTR || error == ERESTART) {
			BPFD_UNLOCK(d);
			return (error);
		}
		if (error == EWOULDBLOCK) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				BPFD_UNLOCK(d);
				return (0);
			}
			ROTATE_BUFFERS(d);
			break;
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	BPFD_UNLOCK(d);

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 *
	 * XXXRW: More synchronization needed here: what if a second thread
	 * issues a read on the same fd at the same time?  Don't want this
	 * getting invalidated.
	 */
	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);

	BPFD_LOCK(d);
	d->bd_fbuf = d->bd_hbuf;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	BPFD_UNLOCK(d);

	return (error);
}
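
/*
 * Illustrative sketch (not part of this file): the canonical userland read
 * loop.  Because bpfread() rejects reads whose size differs from the kernel
 * buffer size, the caller queries BIOCGBLEN first.  Error handling is
 * omitted.
 *
 *	u_int blen;
 *	char *buf;
 *
 *	ioctl(fd, BIOCGBLEN, &blen);	// reads must be exactly this size
 *	buf = malloc(blen);
 *	for (;;) {
 *		ssize_t n = read(fd, buf, blen);
 *		// Walk the buffer: each record is a struct bpf_hdr
 *		// followed by bh_caplen capture bytes; advance by
 *		// BPF_WORDALIGN(bh_hdrlen + bh_caplen).
 *	}
 */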

/*
 * If there are processes sleeping on this descriptor, wake them up.
 */
static __inline void
bpf_wakeup(struct bpf_d *d)
{

	BPFD_LOCK_ASSERT(d);
	if (d->bd_state == BPF_WAITING) {
		callout_stop(&d->bd_callout);
		d->bd_state = BPF_IDLE;
	}
	wakeup(d);
	if (d->bd_async && d->bd_sig && d->bd_sigio)
		pgsigio(&d->bd_sigio, d->bd_sig, 0);

	selwakeuppri(&d->bd_sel, PRINET);
	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
}

static void
bpf_timed_out(void *arg)
{
	struct bpf_d *d = (struct bpf_d *)arg;

	BPFD_LOCK(d);
	if (d->bd_state == BPF_WAITING) {
		d->bd_state = BPF_TIMED_OUT;
		if (d->bd_slen != 0)
			bpf_wakeup(d);
	}
	BPFD_UNLOCK(d);
}

static int
bpf_ready(struct bpf_d *d)
{

	BPFD_LOCK_ASSERT(d);

	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
		return (1);
	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
	    d->bd_slen != 0)
		return (1);
	return (0);
}

static int
bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d = dev->si_drv1;
	struct ifnet *ifp;
	struct mbuf *m, *mc;
	struct sockaddr dst;
	int error, hlen;

	d->bd_pid = curthread->td_proc->p_pid;
	d->bd_wcount++;
	if (d->bd_bif == NULL) {
		d->bd_wdcount++;
		return (ENXIO);
	}

	ifp = d->bd_bif->bif_ifp;

	if ((ifp->if_flags & IFF_UP) == 0) {
		d->bd_wdcount++;
		return (ENETDOWN);
	}

	if (uio->uio_resid == 0) {
		d->bd_wdcount++;
		return (0);
	}

	bzero(&dst, sizeof(dst));
	m = NULL;
	hlen = 0;
	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
	    &m, &dst, &hlen, d->bd_wfilter);
	if (error) {
		d->bd_wdcount++;
		return (error);
	}
	d->bd_wfcount++;
	if (d->bd_hdrcmplt)
		dst.sa_family = pseudo_AF_HDRCMPLT;

	if (d->bd_feedback) {
		mc = m_dup(m, M_DONTWAIT);
		if (mc != NULL)
			mc->m_pkthdr.rcvif = ifp;
		/* Set M_PROMISC for outgoing packets to be discarded. */
		if (d->bd_direction == BPF_D_INOUT)
			m->m_flags |= M_PROMISC;
	} else
		mc = NULL;

	m->m_pkthdr.len -= hlen;
	m->m_len -= hlen;
	m->m_data += hlen;	/* XXX */

#ifdef MAC
	BPFD_LOCK(d);
	mac_bpfdesc_create_mbuf(d, m);
	if (mc != NULL)
		mac_bpfdesc_create_mbuf(d, mc);
	BPFD_UNLOCK(d);
#endif

	error = (*ifp->if_output)(ifp, m, &dst, NULL);
	if (error)
		d->bd_wdcount++;

	if (mc != NULL) {
		if (error == 0)
			(*ifp->if_input)(ifp, mc);
		else
			m_freem(mc);
	}

	return (error);
}

/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.
 */
static void
reset_d(struct bpf_d *d)
{

	mtx_assert(&d->bd_mtx, MA_OWNED);
	if (d->bd_hbuf) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
	d->bd_fcount = 0;
	d->bd_wcount = 0;
	d->bd_wfcount = 0;
	d->bd_wdcount = 0;
	d->bd_zcopy = 0;
}

/*
 *  FIONREAD		Check for read packet available.
 *  SIOCGIFADDR		Get interface address - convenient hook to driver.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSETF		Set ethernet read filter.
 *  BIOCSETWF		Set ethernet write filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLT		Get link layer type.
 *  BIOCGETIF		Get interface name.
 *  BIOCSETIF		Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag
 *  BIOCSHDRCMPLT	Set "header already complete" flag
 *  BIOCGDIRECTION	Get packet direction flag
 *  BIOCSDIRECTION	Set packet direction flag
 *  BIOCLOCK		Set "locked" flag
 *  BIOCFEEDBACK	Set packet feedback mode.
 *  BIOCSETZBUF		Set current zero-copy buffer locations.
 *  BIOCGETZMAX		Get maximum zero-copy buffer size.
 *  BIOCROTZBUF		Force rotation of zero-copy buffer
 *  BIOCSETBUFMODE	Set buffer mode.
 *  BIOCGETBUFMODE	Get current buffer mode.
 */
/* ARGSUSED */
static	int
bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
    struct thread *td)
{
	struct bpf_d *d = dev->si_drv1;
	int error = 0;

	/*
	 * Refresh PID associated with this descriptor.
	 */
	BPFD_LOCK(d);
	d->bd_pid = td->td_proc->p_pid;
	if (d->bd_state == BPF_WAITING)
		callout_stop(&d->bd_callout);
	d->bd_state = BPF_IDLE;
	BPFD_UNLOCK(d);

	if (d->bd_locked == 1) {
		switch (cmd) {
		case BIOCGBLEN:
		case BIOCFLUSH:
		case BIOCGDLT:
		case BIOCGDLTLIST:
		case BIOCGETIF:
		case BIOCGRTIMEOUT:
		case BIOCGSTATS:
		case BIOCVERSION:
		case BIOCGRSIG:
		case BIOCGHDRCMPLT:
		case BIOCFEEDBACK:
		case FIONREAD:
		case BIOCLOCK:
		case BIOCSRTIMEOUT:
		case BIOCIMMEDIATE:
		case TIOCGPGRP:
		case BIOCROTZBUF:
			break;
		default:
			return (EPERM);
		}
	}
	switch (cmd) {

	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
		{
			int n;

			BPFD_LOCK(d);
			n = d->bd_slen;
			if (d->bd_hbuf)
				n += d->bd_hlen;
			BPFD_UNLOCK(d);

			*(int *)addr = n;
			break;
		}

	case SIOCGIFADDR:
		{
			struct ifnet *ifp;

			if (d->bd_bif == NULL)
				error = EINVAL;
			else {
				ifp = d->bd_bif->bif_ifp;
				error = (*ifp->if_ioctl)(ifp, cmd, addr);
			}
			break;
		}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		*(u_int *)addr = d->bd_bufsize;
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		error = bpf_ioctl_sblen(d, (u_int *)addr);
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
	case BIOCSETWF:
		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		BPFD_LOCK(d);
		reset_d(d);
		BPFD_UNLOCK(d);
		break;

	/*
	 * Put interface into promiscuous mode.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == NULL) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
			break;
		}
		if (d->bd_promisc == 0) {
			error = ifpromisc(d->bd_bif->bif_ifp, 1);
			if (error == 0)
				d->bd_promisc = 1;
		}
		break;

	/*
	 * Get current data link type.
	 */
	case BIOCGDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			*(u_int *)addr = d->bd_bif->bif_dlt;
		break;

	/*
	 * Get a list of supported data link types.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
		break;

	/*
	 * Set data link type.
	 */
	case BIOCSDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_setdlt(d, *(u_int *)addr);
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else {
			struct ifnet *const ifp = d->bd_bif->bif_ifp;
			struct ifreq *const ifr = (struct ifreq *)addr;

			strlcpy(ifr->ifr_name, ifp->if_xname,
			    sizeof(ifr->ifr_name));
		}
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		error = bpf_setif(d, (struct ifreq *)addr);
		break;

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			/*
			 * Subtract 1 tick from tvtohz() since this isn't
			 * a one-shot timer.
			 */
			if ((error = itimerfix(tv)) == 0)
				d->bd_rtout = tvtohz(tv) - 1;
			break;
		}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			tv->tv_sec = d->bd_rtout / hz;
			tv->tv_usec = (d->bd_rtout % hz) * tick;
			break;
		}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
		{
			struct bpf_stat *bs = (struct bpf_stat *)addr;

			/* XXXCSJP overflow */
			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			break;
		}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		d->bd_immediate = *(u_int *)addr;
		break;

	case BIOCVERSION:
		{
			struct bpf_version *bv = (struct bpf_version *)addr;

			bv->bv_major = BPF_MAJOR_VERSION;
			bv->bv_minor = BPF_MINOR_VERSION;
			break;
		}

	/*
	 * Get "header already complete" flag
	 */
	case BIOCGHDRCMPLT:
		*(u_int *)addr = d->bd_hdrcmplt;
		break;

	/*
	 * Set "header already complete" flag
	 */
	case BIOCSHDRCMPLT:
		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
		break;

	/*
	 * Get packet direction flag
	 */
	case BIOCGDIRECTION:
		*(u_int *)addr = d->bd_direction;
		break;

	/*
	 * Set packet direction flag
	 */
	case BIOCSDIRECTION:
		{
			u_int	direction;

			direction = *(u_int *)addr;
			switch (direction) {
			case BPF_D_IN:
			case BPF_D_INOUT:
			case BPF_D_OUT:
				d->bd_direction = direction;
				break;
			default:
				error = EINVAL;
			}
		}
		break;

	case BIOCFEEDBACK:
		d->bd_feedback = *(u_int *)addr;
		break;

	case BIOCLOCK:
		d->bd_locked = 1;
		break;

	case FIONBIO:		/* Non-blocking I/O */
		break;

	case FIOASYNC:		/* Send signal on receive packets */
		d->bd_async = *(int *)addr;
		break;

	case FIOSETOWN:
		error = fsetown(*(int *)addr, &d->bd_sigio);
		break;

	case FIOGETOWN:
		*(int *)addr = fgetown(&d->bd_sigio);
		break;

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		error = fsetown(-(*(int *)addr), &d->bd_sigio);
		break;

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)addr = -fgetown(&d->bd_sigio);
		break;

	case BIOCSRSIG:		/* Set receive signal */
		{
			u_int sig;

			sig = *(u_int *)addr;

			if (sig >= NSIG)
				error = EINVAL;
			else
				d->bd_sig = sig;
			break;
		}
	case BIOCGRSIG:
		*(u_int *)addr = d->bd_sig;
		break;

	case BIOCGETBUFMODE:
		*(u_int *)addr = d->bd_bufmode;
		break;

	case BIOCSETBUFMODE:
		/*
		 * Allow the buffering mode to be changed as long as we
		 * haven't yet committed to a particular mode.  Our
		 * definition of commitment, for now, is whether or not a
		 * buffer has been allocated or an interface attached, since
		 * that's the point where things get tricky.
		 */
		switch (*(u_int *)addr) {
		case BPF_BUFMODE_BUFFER:
			break;

		case BPF_BUFMODE_ZBUF:
			if (bpf_zerocopy_enable)
				break;
			/* FALLTHROUGH */

		default:
			return (EINVAL);
		}

		BPFD_LOCK(d);
		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
			BPFD_UNLOCK(d);
			return (EBUSY);
		}
		d->bd_bufmode = *(u_int *)addr;
		BPFD_UNLOCK(d);
		break;

	case BIOCGETZMAX:
		return (bpf_ioctl_getzmax(td, d, (size_t *)addr));

	case BIOCSETZBUF:
		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));

	case BIOCROTZBUF:
		return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
	}
	return (error);
}
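
/*
 * Illustrative sketch (not part of this file): typical descriptor setup
 * through bpfioctl() from userland -- immediate mode plus a bounded read
 * timeout.  The values are assumptions for the example; error handling is
 * omitted.
 *
 *	u_int one = 1;
 *	struct timeval to = { 1, 0 };	// one second
 *	int n;
 *
 *	ioctl(fd, BIOCIMMEDIATE, &one);	// return packets as they arrive
 *	ioctl(fd, BIOCSRTIMEOUT, &to);	// bound blocking reads
 *	ioctl(fd, FIONREAD, &n);	// bytes currently buffered
 */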

/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 */
static int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
{
	struct bpf_insn *fcode, *old;
	u_int wfilter, flen, size;
#ifdef BPF_JITTER
	bpf_jit_filter *ofunc;
#endif

	if (cmd == BIOCSETWF) {
		old = d->bd_wfilter;
		wfilter = 1;
#ifdef BPF_JITTER
		ofunc = NULL;
#endif
	} else {
		wfilter = 0;
		old = d->bd_rfilter;
#ifdef BPF_JITTER
		ofunc = d->bd_bfilter;
#endif
	}
	if (fp->bf_insns == NULL) {
		if (fp->bf_len != 0)
			return (EINVAL);
		BPFD_LOCK(d);
		if (wfilter)
			d->bd_wfilter = NULL;
		else {
			d->bd_rfilter = NULL;
#ifdef BPF_JITTER
			d->bd_bfilter = NULL;
#endif
		}
		reset_d(d);
		BPFD_UNLOCK(d);
		if (old != NULL)
			free((caddr_t)old, M_BPF);
#ifdef BPF_JITTER
		if (ofunc != NULL)
			bpf_destroy_jit_filter(ofunc);
#endif
		return (0);
	}
	flen = fp->bf_len;
	if (flen > bpf_maxinsns)
		return (EINVAL);

	size = flen * sizeof(*fp->bf_insns);
	fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK);
	if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 &&
	    bpf_validate(fcode, (int)flen)) {
		BPFD_LOCK(d);
		if (wfilter)
			d->bd_wfilter = fcode;
		else {
			d->bd_rfilter = fcode;
#ifdef BPF_JITTER
			d->bd_bfilter = bpf_jitter(fcode, flen);
#endif
		}
		reset_d(d);
		BPFD_UNLOCK(d);
		if (old != NULL)
			free((caddr_t)old, M_BPF);
#ifdef BPF_JITTER
		if (ofunc != NULL)
			bpf_destroy_jit_filter(ofunc);
#endif

		return (0);
	}
	free((caddr_t)fcode, M_BPF);
	return (EINVAL);
}
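
/*
 * Illustrative sketch (not part of this file): installing a trivial
 * accept-all read filter via BIOCSETF, which lands in bpf_setf() above.
 * The snap length of 65535 is an assumption for the example.
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 65535),  // accept whole packet
 *	};
 *	struct bpf_program prog = {
 *		.bf_len = sizeof(insns) / sizeof(insns[0]),
 *		.bf_insns = insns,
 *	};
 *
 *	ioctl(fd, BIOCSETF, &prog);
 */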

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp;
	struct ifnet *theywant;

	theywant = ifunit(ifr->ifr_name);
	if (theywant == NULL || theywant->if_bpf == NULL)
		return (ENXIO);

	bp = theywant->if_bpf;

	/*
	 * Behavior here depends on the buffering model.  If we're using
	 * kernel memory buffers, then we can allocate them here.  If we're
	 * using zero-copy, then the user process must have registered
	 * buffers by the time we get here.  If not, return an error.
	 *
	 * XXXRW: There are locking issues here with multi-threaded use: what
	 * if two threads try to set the interface at once?
	 */
	switch (d->bd_bufmode) {
	case BPF_BUFMODE_BUFFER:
		if (d->bd_sbuf == NULL)
			bpf_buffer_alloc(d);
		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
		break;

	case BPF_BUFMODE_ZBUF:
		if (d->bd_sbuf == NULL)
			return (EINVAL);
		break;

	default:
		panic("bpf_setif: bufmode %d", d->bd_bufmode);
	}
	if (bp != d->bd_bif) {
		if (d->bd_bif)
			/*
			 * Detach if attached to something else.
			 */
			bpf_detachd(d);

		bpf_attachd(d, bp);
	}
	BPFD_LOCK(d);
	reset_d(d);
	BPFD_UNLOCK(d);
	return (0);
}

/*
 * Support for select() and poll() system calls
 *
 * Return true iff the specific operation will not block indefinitely.
 * Otherwise, return false but make a note that a selwakeup() must be done.
 */
static int
bpfpoll(struct cdev *dev, int events, struct thread *td)
{
	struct bpf_d *d;
	int revents;

	d = dev->si_drv1;
	if (d->bd_bif == NULL)
		return (ENXIO);

	/*
	 * Refresh PID associated with this descriptor.
	 */
	revents = events & (POLLOUT | POLLWRNORM);
	BPFD_LOCK(d);
	d->bd_pid = td->td_proc->p_pid;
	if (events & (POLLIN | POLLRDNORM)) {
		if (bpf_ready(d))
			revents |= events & (POLLIN | POLLRDNORM);
		else {
			selrecord(td, &d->bd_sel);
			/* Start the read timeout if necessary. */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				callout_reset(&d->bd_callout, d->bd_rtout,
				    bpf_timed_out, d);
				d->bd_state = BPF_WAITING;
			}
		}
	}
	BPFD_UNLOCK(d);
	return (revents);
}

/*
 * Support for kevent() system call.  Register EVFILT_READ filters and
 * reject all others.
 */
int
bpfkqfilter(struct cdev *dev, struct knote *kn)
{
	struct bpf_d *d = (struct bpf_d *)dev->si_drv1;

	if (kn->kn_filter != EVFILT_READ)
		return (1);

	/*
	 * Refresh PID associated with this descriptor.
	 */
	BPFD_LOCK(d);
	d->bd_pid = curthread->td_proc->p_pid;
	kn->kn_fop = &bpfread_filtops;
	kn->kn_hook = d;
	knlist_add(&d->bd_sel.si_note, kn, 1);
	BPFD_UNLOCK(d);

	return (0);
}
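
/*
 * Illustrative sketch (not part of this file): registering for read
 * readiness with kevent(2), which is serviced by bpfkqfilter() and
 * filt_bpfread() below.  Error handling is omitted.
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// register
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	// wait; kev.data holds the
 *						// byte count bpf reported
 */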

static void
filt_bpfdetach(struct knote *kn)
{
	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;

	knlist_remove(&d->bd_sel.si_note, kn, 0);
}

static int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
	int ready;

	BPFD_LOCK_ASSERT(d);
	ready = bpf_ready(d);
	if (ready) {
		kn->kn_data = d->bd_slen;
		if (d->bd_hbuf)
			kn->kn_data += d->bd_hlen;
	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
		callout_reset(&d->bd_callout, d->bd_rtout,
		    bpf_timed_out, d);
		d->bd_state = BPF_WAITING;
	}

	return (ready);
}

/*
 * Incoming linkage from device drivers.  Process the packet pkt, of length
 * pktlen, which is stored in a contiguous buffer.  The packet is parsed
 * by each process' filter, and if accepted, stashed into the corresponding
 * buffer.
 */
void
bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
{
	struct bpf_d *d;
	u_int slen;
	int gottime;
	struct timeval tv;

	gottime = 0;
	BPFIF_LOCK(bp);
	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		BPFD_LOCK(d);
		++d->bd_rcount;
#ifdef BPF_JITTER
		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL)
			slen = (*(d->bd_bfilter->func))(pkt, pktlen, pktlen);
		else
#endif
		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
		if (slen != 0) {
			d->bd_fcount++;
			if (!gottime) {
				microtime(&tv);
				gottime = 1;
			}
#ifdef MAC
			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
				catchpacket(d, pkt, pktlen, slen,
				    bpf_append_bytes, &tv);
		}
		BPFD_UNLOCK(d);
	}
	BPFIF_UNLOCK(bp);
}
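
/*
 * Illustrative sketch (not part of this file): how a driver or link layer
 * hands frames to the taps.  In this era of the tree the call is normally
 * wrapped by the BPF_TAP()/BPF_MTAP() macros from <net/bpf.h>, which test
 * for listeners before paying for the call:
 *
 *	if (bpf_peers_present(ifp->if_bpf))
 *		bpf_mtap(ifp->if_bpf, m);
 */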

#define	BPF_CHECK_DIRECTION(d, r, i)				\
	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
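
/*
 * Added note: BPF_CHECK_DIRECTION() evaluates true when the packet should
 * be *skipped*.  "r" is the receiving interface recorded in the mbuf (NULL
 * for locally generated packets) and "i" is the tapped interface, so:
 *
 *	BPF_D_IN:	skip when r != i  (drop outgoing packets)
 *	BPF_D_OUT:	skip when r == i  (drop incoming packets)
 *	BPF_D_INOUT:	never skip
 */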

/*
 * Incoming linkage from device drivers, when packet is in an mbuf chain.
 */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{
	struct bpf_d *d;
	u_int pktlen, slen;
	int gottime;
	struct timeval tv;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	gottime = 0;

	pktlen = m_length(m, NULL);

	BPFIF_LOCK(bp);
	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
			continue;
		BPFD_LOCK(d);
		++d->bd_rcount;
#ifdef BPF_JITTER
		/* XXX We cannot handle multiple mbufs. */
		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL &&
		    m->m_next == NULL)
			slen = (*(d->bd_bfilter->func))(mtod(m, u_char *),
			    pktlen, pktlen);
		else
#endif
		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
		if (slen != 0) {
			d->bd_fcount++;
			if (!gottime) {
				microtime(&tv);
				gottime = 1;
			}
#ifdef MAC
			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
				catchpacket(d, (u_char *)m, pktlen, slen,
				    bpf_append_mbuf, &tv);
		}
		BPFD_UNLOCK(d);
	}
	BPFIF_UNLOCK(bp);
}

/*
 * Incoming linkage from device drivers, when packet is in
 * an mbuf chain and to be prepended by a contiguous header.
 */
void
bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
{
	struct mbuf mb;
	struct bpf_d *d;
	u_int pktlen, slen;
	int gottime;
	struct timeval tv;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	gottime = 0;

	pktlen = m_length(m, NULL);
	/*
	 * Craft on-stack mbuf suitable for passing to bpf_filter.
	 * Note that we cut corners here; we only setup what's
	 * absolutely needed--this mbuf should never go anywhere else.
	 */
	mb.m_next = m;
	mb.m_data = data;
	mb.m_len = dlen;
	pktlen += dlen;

	BPFIF_LOCK(bp);
	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
			continue;
		BPFD_LOCK(d);
		++d->bd_rcount;
		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
		if (slen != 0) {
			d->bd_fcount++;
			if (!gottime) {
				microtime(&tv);
				gottime = 1;
			}
#ifdef MAC
			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
				catchpacket(d, (u_char *)&mb, pktlen, slen,
				    bpf_append_mbuf, &tv);
		}
		BPFD_UNLOCK(d);
	}
	BPFIF_UNLOCK(bp);
}

#undef	BPF_CHECK_DIRECTION

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer.  "cpfn" is the routine called to do the actual data
 * transfer.  bcopy is passed in to copy contiguous chunks, while
 * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
 * pkt is really an mbuf.
 */
static void
catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
    struct timeval *tv)
{
	struct bpf_hdr hdr;
	int totlen, curlen;
	int hdrlen = d->bd_bif->bif_hdrlen;
	int do_wakeup = 0;

	BPFD_LOCK_ASSERT(d);

	/*
	 * Detect whether user space has released a buffer back to us, and if
	 * so, move it from being a hold buffer to a free buffer.  This may
	 * not be the best place to do it (for example, we might only want to
	 * run this check if we need the space), but for now it's a reliable
	 * spot to do it.
	 */
	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
		d->bd_hlen = 0;
	}

	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 *
	 * Drop the packet if there's no room and no hope of room.
	 * If the packet would overflow the storage buffer or the storage
	 * buffer is considered immutable by the buffer model, try to rotate
	 * the buffer and wakeup pending processes.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
		if (d->bd_fbuf == NULL) {
			/*
			 * There's no room in the store buffer, and no
			 * prospect of room, so drop the packet.  Notify the
			 * buffer model.
			 */
			bpf_buffull(d);
			++d->bd_dcount;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
		/*
		 * Immediate mode is set, or the read timeout has already
		 * expired during a select call.  A packet arrived, so the
		 * reader should be woken up.
		 */
		do_wakeup = 1;

	/*
	 * Append the bpf header.  Note we append the actual header size, but
	 * move forward the length of the header plus padding.
	 */
	bzero(&hdr, sizeof(hdr));
	hdr.bh_tstamp = *tv;
	hdr.bh_datalen = pktlen;
	hdr.bh_hdrlen = hdrlen;
	hdr.bh_caplen = totlen - hdrlen;
	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));

	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
	d->bd_slen = curlen + totlen;

	if (do_wakeup)
		bpf_wakeup(d);
}
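
/*
 * Worked example of the layout arithmetic above (an assumption: a platform
 * where BPF_WORDALIGN() rounds to 4 bytes): with bd_slen = 50, curlen =
 * BPF_WORDALIGN(50) = 52, so the struct bpf_hdr is written at offset 52 and
 * the capture bytes at offset 52 + hdrlen, where hdrlen already includes
 * the alignment padding computed at attach time.  The two bytes between 50
 * and 52 keep every record longword aligned for the reader.
 */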

/*
 * Free buffers currently in use by a descriptor.
 * Called on close.
 */
static void
bpf_freed(struct bpf_d *d)
{

	/*
	 * We don't need to lock out interrupts since this descriptor has
	 * been detached from its interface and it hasn't yet been marked
	 * free.
	 */
	bpf_free(d);
	if (d->bd_rfilter) {
		free((caddr_t)d->bd_rfilter, M_BPF);
#ifdef BPF_JITTER
		bpf_destroy_jit_filter(d->bd_bfilter);
#endif
	}
	if (d->bd_wfilter)
		free((caddr_t)d->bd_wfilter, M_BPF);
	mtx_destroy(&d->bd_mtx);
}

/*
 * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
 * fixed size of the link header (variable length headers not yet supported).
 */
void
bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
{

	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
}

/*
 * Attach an interface to bpf.  ifp is a pointer to the structure
 * defining the interface to be attached, dlt is the link layer type,
 * and hdrlen is the fixed size of the link header (variable length
 * headers are not yet supported).
 */
void
bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
{
	struct bpf_if *bp;

	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
	if (bp == NULL)
		panic("bpfattach");

	LIST_INIT(&bp->bif_dlist);
	bp->bif_ifp = ifp;
	bp->bif_dlt = dlt;
	mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF);
	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
	*driverp = bp;

	mtx_lock(&bpf_mtx);
	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
	mtx_unlock(&bpf_mtx);

	/*
	 * Compute the length of the bpf header.  This is not necessarily
	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
	 * that the network layer header begins on a longword boundary (for
	 * performance reasons and to alleviate alignment restrictions).
	 */
	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	if (bootverbose)
		if_printf(ifp, "bpf attached\n");
}
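
/*
 * Worked example of the header-length computation above, for Ethernet on a
 * platform where SIZEOF_BPF_HDR is 18 and BPF_WORDALIGN() rounds to 4 bytes
 * (an assumption): hdrlen = 14, BPF_WORDALIGN(14 + 18) = 32, so bif_hdrlen
 * = 32 - 14 = 18.  Each capture then carries 18 bytes of bpf header plus
 * the 14-byte Ethernet header, leaving the network layer header at offset
 * 32 -- longword aligned.
 */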

/*
 * Detach bpf from an interface.  This involves detaching each descriptor
 * associated with the interface, and leaving bd_bif NULL.  Notify each
 * descriptor as it's detached so that any sleepers wake up and get
 * ENXIO.
 */
void
bpfdetach(struct ifnet *ifp)
{
	struct bpf_if	*bp;
	struct bpf_d	*d;

	/* Locate BPF interface information */
	mtx_lock(&bpf_mtx);
	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
		if (ifp == bp->bif_ifp)
			break;
	}

	/* Interface wasn't attached */
	if ((bp == NULL) || (bp->bif_ifp == NULL)) {
		mtx_unlock(&bpf_mtx);
		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
		return;
	}

	LIST_REMOVE(bp, bif_next);
	mtx_unlock(&bpf_mtx);

	while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
		bpf_detachd(d);
		BPFD_LOCK(d);
		bpf_wakeup(d);
		BPFD_UNLOCK(d);
	}

	mtx_destroy(&bp->bif_mtx);
	free(bp, M_BPF);
}

/*
 * Get a list of the available data link types for the interface.
 */
static int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
	int n, error;
	struct ifnet *ifp;
	struct bpf_if *bp;

	ifp = d->bd_bif->bif_ifp;
	n = 0;
	error = 0;
	mtx_lock(&bpf_mtx);
	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
		if (bp->bif_ifp != ifp)
			continue;
		if (bfl->bfl_list != NULL) {
			if (n >= bfl->bfl_len) {
				mtx_unlock(&bpf_mtx);
				return (ENOMEM);
			}
			error = copyout(&bp->bif_dlt,
			    bfl->bfl_list + n, sizeof(u_int));
		}
		n++;
	}
	mtx_unlock(&bpf_mtx);
	bfl->bfl_len = n;
	return (error);
}

/*
 * Set the data link type of a BPF instance.
 */
static int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
	int error, opromisc;
	struct ifnet *ifp;
	struct bpf_if *bp;

	if (d->bd_bif->bif_dlt == dlt)
		return (0);
	ifp = d->bd_bif->bif_ifp;
	mtx_lock(&bpf_mtx);
	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
			break;
	}
	mtx_unlock(&bpf_mtx);
	if (bp != NULL) {
		opromisc = d->bd_promisc;
		bpf_detachd(d);
		bpf_attachd(d, bp);
		BPFD_LOCK(d);
		reset_d(d);
		BPFD_UNLOCK(d);
		if (opromisc) {
			error = ifpromisc(bp->bif_ifp, 1);
			if (error)
				if_printf(bp->bif_ifp,
					"bpf_setdlt: ifpromisc failed (%d)\n",
					error);
			else
				d->bd_promisc = 1;
		}
	}
	return (bp == NULL ? EINVAL : 0);
}

static void
bpf_clone(void *arg, struct ucred *cred, char *name, int namelen,
    struct cdev **dev)
{
	int u;

	if (*dev != NULL)
		return;
	if (dev_stdclone(name, NULL, "bpf", &u) != 1)
		return;
	*dev = make_dev(&bpf_cdevsw, unit2minor(u), UID_ROOT, GID_WHEEL, 0600,
	    "bpf%d", u);
	dev_ref(*dev);
	(*dev)->si_flags |= SI_CHEAPCLONE;
	return;
}

static void
bpf_drvinit(void *unused)
{

	mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF);
	LIST_INIT(&bpf_iflist);
	EVENTHANDLER_REGISTER(dev_clone, bpf_clone, 0, 1000);
}

static void
bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
{

	bzero(d, sizeof(*d));
	BPFD_LOCK_ASSERT(bd);
	d->bd_structsize = sizeof(*d);
	d->bd_immediate = bd->bd_immediate;
	d->bd_promisc = bd->bd_promisc;
	d->bd_hdrcmplt = bd->bd_hdrcmplt;
	d->bd_direction = bd->bd_direction;
	d->bd_feedback = bd->bd_feedback;
	d->bd_async = bd->bd_async;
	d->bd_rcount = bd->bd_rcount;
	d->bd_dcount = bd->bd_dcount;
	d->bd_fcount = bd->bd_fcount;
	d->bd_sig = bd->bd_sig;
	d->bd_slen = bd->bd_slen;
	d->bd_hlen = bd->bd_hlen;
	d->bd_bufsize = bd->bd_bufsize;
	d->bd_pid = bd->bd_pid;
	strlcpy(d->bd_ifname,
	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
	d->bd_locked = bd->bd_locked;
	d->bd_wcount = bd->bd_wcount;
	d->bd_wdcount = bd->bd_wdcount;
	d->bd_wfcount = bd->bd_wfcount;
	d->bd_zcopy = bd->bd_zcopy;
	d->bd_bufmode = bd->bd_bufmode;
}

static int
bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct xbpf_d *xbdbuf, *xbd;
	int index, error;
	struct bpf_if *bp;
	struct bpf_d *bd;

	/*
	 * XXX This is not technically correct.  It is possible for
	 * non-privileged users to open bpf devices.  It would make sense
	 * if the users who opened the devices were able to retrieve
	 * the statistics for them, too.
	 */
	error = priv_check(req->td, PRIV_NET_BPF);
	if (error)
		return (error);
	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
	if (bpf_bpfd_cnt == 0)
		return (SYSCTL_OUT(req, 0, 0));
	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
	mtx_lock(&bpf_mtx);
	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
		mtx_unlock(&bpf_mtx);
		free(xbdbuf, M_BPF);
		return (ENOMEM);
	}
	index = 0;
	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
		BPFIF_LOCK(bp);
		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
			xbd = &xbdbuf[index++];
			BPFD_LOCK(bd);
			bpfstats_fill_xbpf(xbd, bd);
			BPFD_UNLOCK(bd);
		}
		BPFIF_UNLOCK(bp);
	}
	mtx_unlock(&bpf_mtx);
	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
	free(xbdbuf, M_BPF);
	return (error);
}

SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);

#else /* !DEV_BPF && !NETGRAPH_BPF */
/*
 * NOP stubs to allow bpf-using drivers to load and function.
 *
 * A 'better' implementation would allow the core bpf functionality
 * to be loaded at runtime.
 */
static struct bpf_if bp_null;

void
bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
{
}

void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{
}

void
bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
{
}

void
bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
{

	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
}

void
bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
{

	*driverp = &bp_null;
}

void
bpfdetach(struct ifnet *ifp)
{
}

u_int
bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
{
	return -1;	/* "no filter" behaviour */
}

int
bpf_validate(const struct bpf_insn *f, int len)
{
	return 0;		/* false */
}

#endif /* !DEV_BPF && !NETGRAPH_BPF */