kern_event.c revision 142217
/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_event.c 142217 2005-02-22 13:11:33Z rwatson $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>

#include <vm/uma.h>

static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

/*
 * This lock is used if multiple kq locks are required.  This possibly
 * should be made into a per proc lock.
 */
static struct mtx	kq_global;
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
	if (!haslck)				\
		mtx_lock(lck);			\
	haslck = 1;				\
} while (0)
#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
	if (haslck)				\
		mtx_unlock(lck);		\
	haslck = 0;				\
} while (0)
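
/*
 * Usage sketch (added for illustration): callers track whether they already
 * hold the global order lock with a local flag, so the lock/unlock pair can
 * be invoked unconditionally along any code path:
 *
 *	int haskqglobal = 0;
 *
 *	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 *	...
 *	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 */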

TASKQUEUE_DEFINE_THREAD(kqueue);

static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
static void	kqueue_release(struct kqueue *kq, int locked);
static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
		    uintptr_t ident, int waitok);
static void	kqueue_task(void *arg, int pending);
static int	kqueue_scan(struct kqueue *kq, int maxevents,
		    struct kevent *ulistp, const struct timespec *timeout,
		    struct kevent *keva, struct thread *td);
static void 	kqueue_wakeup(struct kqueue *kq);
static struct filterops *kqueue_fo_find(int filt);
static void	kqueue_fo_release(int filt);

static fo_rdwr_t	kqueue_read;
static fo_rdwr_t	kqueue_write;
static fo_ioctl_t	kqueue_ioctl;
static fo_poll_t	kqueue_poll;
static fo_kqfilter_t	kqueue_kqfilter;
static fo_stat_t	kqueue_stat;
static fo_close_t	kqueue_close;

static struct fileops kqueueops = {
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_ioctl = kqueue_ioctl,
	.fo_poll = kqueue_poll,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
};

static int 	knote_attach(struct knote *kn, struct kqueue *kq);
static void 	knote_drop(struct knote *kn, struct thread *td);
static void 	knote_enqueue(struct knote *kn);
static void 	knote_dequeue(struct knote *kn);
static void 	knote_init(void);
static struct 	knote *knote_alloc(int waitok);
static void 	knote_free(struct knote *kn);

static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);
static void	filt_timerexpire(void *knx);
static int	filt_timerattach(struct knote *kn);
static void	filt_timerdetach(struct knote *kn);
static int	filt_timer(struct knote *kn, long hint);

static struct filterops file_filtops =
	{ 1, filt_fileattach, NULL, NULL };
static struct filterops kqread_filtops =
	{ 1, NULL, filt_kqdetach, filt_kqueue };
/* XXX - move to kern_proc.c?  */
static struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static struct filterops timer_filtops =
	{ 0, filt_timerattach, filt_timerdetach, filt_timer };

static uma_zone_t	knote_zone;
static int 		kq_ncallouts = 0;
static int 		kq_calloutmax = (4 * 1024);
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");

/* XXX - ensure not KN_INFLUX?? */
#define KNOTE_ACTIVATE(kn, islock) do { 				\
	if ((islock))							\
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
	else								\
		KQ_LOCK((kn)->kn_kq);					\
	(kn)->kn_status |= KN_ACTIVE;					\
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)	\
		knote_enqueue((kn));					\
	if (!(islock))							\
		KQ_UNLOCK((kn)->kn_kq);					\
} while (0)
#define KQ_LOCK(kq) do {						\
	mtx_lock(&(kq)->kq_lock);					\
} while (0)
#define KQ_FLUX_WAKEUP(kq) do {						\
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
		wakeup((kq));						\
	}								\
} while (0)
#define KQ_UNLOCK_FLUX(kq) do {						\
	KQ_FLUX_WAKEUP(kq);						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
#define KQ_UNLOCK(kq) do {						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
#define KQ_OWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
} while (0)
#define KQ_NOTOWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
} while (0)
#define KN_LIST_LOCK(kn) do {						\
	if (kn->kn_knlist != NULL)					\
		mtx_lock(kn->kn_knlist->kl_lock);			\
} while (0)
#define KN_LIST_UNLOCK(kn) do {						\
	if (kn->kn_knlist != NULL)					\
		mtx_unlock(kn->kn_knlist->kl_lock);			\
} while (0)

#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
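
/*
 * Worked example (added for illustration): with KN_HASHSIZE 64 the mask is
 * 63 (0x3f), so an ident of 0x1234 hashes to (0x1234 ^ 0x12) & 0x3f =
 * 0x26, i.e. bucket 38; folding the second byte into the low bits spreads
 * idents that differ only above the low 6 bits across buckets.
 */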

static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
}

struct filterops null_filtops =
	{ 0, filt_nullattach, NULL, NULL };

/* XXX - make SYSINIT to add these, and move into respective modules. */
extern struct filterops sig_filtops;
extern struct filterops fs_filtops;

/*
 * Table for all system-defined filters.
 */
static struct mtx	filterops_lock;
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
	MTX_DEF);
static struct {
	struct filterops *for_fop;
	int for_refcnt;
} sysfilt_ops[EVFILT_SYSCOUNT] = {
	{ &file_filtops },			/* EVFILT_READ */
	{ &file_filtops },			/* EVFILT_WRITE */
	{ &null_filtops },			/* EVFILT_AIO */
	{ &file_filtops },			/* EVFILT_VNODE */
	{ &proc_filtops },			/* EVFILT_PROC */
	{ &sig_filtops },			/* EVFILT_SIGNAL */
	{ &timer_filtops },			/* EVFILT_TIMER */
	{ &file_filtops },			/* EVFILT_NETDEV */
	{ &fs_filtops },			/* EVFILT_FS */
};
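
/*
 * Note (added for illustration): the EVFILT_* constants are small negative
 * integers, so ~filt maps them onto array indices; e.g. EVFILT_READ (-1)
 * selects slot ~(-1) == 0.  The "filt > 0 || filt + EVFILT_SYSCOUNT < 0"
 * range checks in the add/del/find routines below reject anything that
 * falls outside these slots.
 */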

/*
 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
 * method.
 */
static int
filt_fileattach(struct knote *kn)
{

	return (fo_kqfilter(kn->kn_fp, kn));
}

/*ARGSUSED*/
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);

	kn->kn_status |= KN_KQUEUE;
	kn->kn_fop = &kqread_filtops;
	knlist_add(&kq->kq_sel.si_note, kn, 0);

	return (0);
}

static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	knlist_remove(&kq->kq_sel.si_note, kn, 0);
}

/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

/* XXX - move to kern_proc.c?  */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	int immediate;
	int error;

	immediate = 0;
	p = pfind(kn->kn_id);
	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
		p = zpfind(kn->kn_id);
		immediate = 1;
	} else if (p != NULL && (p->p_flag & P_WEXIT)) {
		immediate = 1;
	}

	if (p == NULL)
		return (ESRCH);
	if ((error = p_cansee(curthread, p)))
		return (error);

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;		/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	if (immediate == 0)
		knlist_add(&p->p_klist, kn, 1);

	/*
	 * Immediately activate any exit notes if the target process is a
	 * zombie.  This is necessary to handle the case where the target
	 * process, e.g. a child, dies before the kevent is registered.
	 */
	if (immediate && filt_proc(kn, NOTE_EXIT))
		KNOTE_ACTIVATE(kn, 0);

	PROC_UNLOCK(p);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
/* XXX - move to kern_proc.c?  */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	p = kn->kn_ptr.p_proc;
	knlist_remove(&p->p_klist, kn, 0);
	kn->kn_ptr.p_proc = NULL;
}

/* XXX - move to kern_proc.c?  */
static int
filt_proc(struct knote *kn, long hint)
{
	struct proc *p = kn->kn_ptr.p_proc;
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished.
	 */
	if (event == NOTE_EXIT) {
		if (!(kn->kn_status & KN_DETACHED))
			knlist_remove_inevent(&p->p_klist, kn);
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_ptr.p_proc = NULL;
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev, NULL, 0);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

static int
timertoticks(intptr_t data)
{
	struct timeval tv;
	int tticks;

	tv.tv_sec = data / 1000;
	tv.tv_usec = (data % 1000) * 1000;
	tticks = tvtohz(&tv);

	return tticks;
}
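
/*
 * Worked example (added for illustration): a kevent data value of 1500
 * (milliseconds) becomes tv_sec = 1, tv_usec = 500000; on a kernel running
 * with hz = 1000, tvtohz() would turn that into roughly 1500 ticks.
 */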

/* XXX - move to kern_timeout.c? */
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct callout *calloutp;

	kn->kn_data++;
	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */

	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
		calloutp = (struct callout *)kn->kn_hook;
		callout_reset(calloutp, timertoticks(kn->kn_sdata),
		    filt_timerexpire, kn);
	}
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
/* XXX - move to kern_timeout.c? */
static int
filt_timerattach(struct knote *kn)
{
	struct callout *calloutp;

	atomic_add_int(&kq_ncallouts, 1);

	if (kq_ncallouts >= kq_calloutmax) {
		atomic_add_int(&kq_ncallouts, -1);
		return (ENOMEM);
	}

	kn->kn_flags |= EV_CLEAR;		/* automatically set */
	kn->kn_status &= ~KN_DETACHED;		/* knlist_add usually sets it */
	MALLOC(calloutp, struct callout *, sizeof(*calloutp),
	    M_KQUEUE, M_WAITOK);
	callout_init(calloutp, CALLOUT_MPSAFE);
	kn->kn_hook = calloutp;
	callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire,
	    kn);

	return (0);
}

/* XXX - move to kern_timeout.c? */
static void
filt_timerdetach(struct knote *kn)
{
	struct callout *calloutp;

	calloutp = (struct callout *)kn->kn_hook;
	callout_drain(calloutp);
	FREE(calloutp, M_KQUEUE);
	atomic_add_int(&kq_ncallouts, -1);
	kn->kn_status |= KN_DETACHED;	/* knlist_remove usually clears it */
}

/* XXX - move to kern_timeout.c? */
static int
filt_timer(struct knote *kn, long hint)
{

	return (kn->kn_data != 0);
}
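
/*
 * Usage sketch (added for illustration, userland): arming a periodic
 * 5-second timer through this filter could look like
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 5000, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * with data in milliseconds, per the comment on filt_timerattach() above.
 */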

/*
 * MPSAFE
 */
int
kqueue(struct thread *td, struct kqueue_args *uap)
{
	struct filedesc *fdp;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd);
	if (error)
		goto done2;

	/* An extra reference on `fp' has been held for us by falloc(). */
	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
	TAILQ_INIT(&kq->kq_head);
	kq->kq_fdp = fdp;
	knlist_init(&kq->kq_sel.si_note, &kq->kq_lock);
	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);

	FILEDESC_LOCK_FAST(fdp);
	SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
	FILEDESC_UNLOCK_FAST(fdp);

	FILE_LOCK(fp);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	fp->f_data = kq;
	FILE_UNLOCK(fp);
	fdrop(fp, td);

	td->td_retval[0] = fd;
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct kevent_args {
	int	fd;
	const struct kevent *changelist;
	int	nchanges;
	struct	kevent *eventlist;
	int	nevents;
	const struct timespec *timeout;
};
#endif
/*
 * MPSAFE
 */
int
kevent(struct thread *td, struct kevent_args *uap)
{
	struct kevent keva[KQ_NEVENTS];
	struct kevent *kevp;
	struct kqueue *kq;
	struct file *fp;
	struct timespec ts;
	int i, n, nerrors, error;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto done_norel;

	if (uap->timeout != NULL) {
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			goto done;
		uap->timeout = &ts;
	}

	nerrors = 0;

	while (uap->nchanges > 0) {
		n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
		error = copyin(uap->changelist, keva,
		    n * sizeof *keva);
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &keva[i];
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp, td, 1);
			if (error) {
				if (uap->nevents != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					(void) copyout(kevp,
					    uap->eventlist,
					    sizeof(*kevp));
					uap->eventlist++;
					uap->nevents--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		uap->nchanges -= n;
		uap->changelist += n;
	}
	if (nerrors) {
		td->td_retval[0] = nerrors;
		error = 0;
		goto done;
	}

	error = kqueue_scan(kq, uap->nevents, uap->eventlist, uap->timeout,
	    keva, td);
done:
	kqueue_release(kq, 0);
done_norel:
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
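
/*
 * Usage sketch (added for illustration, userland): the canonical calling
 * sequence for these two syscalls, e.g. watching a descriptor for
 * readability:
 *
 *	struct kevent change, event;
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&change, sockfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	n = kevent(kq, &change, 1, &event, 1, NULL);
 *
 * A NULL timeout blocks until an event fires; sockfd here is a hypothetical
 * descriptor supplied by the caller.
 */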

int
kqueue_add_filteropts(int filt, struct filterops *filtops)
{
	int error;

	error = 0;
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
		printf(
"trying to add a filterop that is out of range: %d is beyond %d\n",
		    ~filt, EVFILT_SYSCOUNT);
		return EINVAL;
	}
	mtx_lock(&filterops_lock);
	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
	    sysfilt_ops[~filt].for_fop != NULL)
		error = EEXIST;
	else {
		sysfilt_ops[~filt].for_fop = filtops;
		sysfilt_ops[~filt].for_refcnt = 0;
	}
	mtx_unlock(&filterops_lock);

	return (error);
}

int
kqueue_del_filteropts(int filt)
{
	int error;

	error = 0;
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
		return EINVAL;

	mtx_lock(&filterops_lock);
	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
	    sysfilt_ops[~filt].for_fop == NULL)
		error = EINVAL;
	else if (sysfilt_ops[~filt].for_refcnt != 0)
		error = EBUSY;
	else {
		sysfilt_ops[~filt].for_fop = &null_filtops;
		sysfilt_ops[~filt].for_refcnt = 0;
	}
	mtx_unlock(&filterops_lock);

	return error;
}

static struct filterops *
kqueue_fo_find(int filt)
{

	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
		return NULL;

	mtx_lock(&filterops_lock);
	sysfilt_ops[~filt].for_refcnt++;
	if (sysfilt_ops[~filt].for_fop == NULL)
		sysfilt_ops[~filt].for_fop = &null_filtops;
	mtx_unlock(&filterops_lock);

	return sysfilt_ops[~filt].for_fop;
}

static void
kqueue_fo_release(int filt)
{

	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
		return;

	mtx_lock(&filterops_lock);
	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
	    ("filter object refcount not valid on release"));
	sysfilt_ops[~filt].for_refcnt--;
	mtx_unlock(&filterops_lock);
}

/*
 * A reference to kq (obtained via kqueue_acquire()) must be held.  waitok
 * controls whether memory allocation may sleep; make sure it is 0 if you
 * hold any mutexes.
 */
int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
{
	struct filedesc *fdp;
	struct filterops *fops;
	struct file *fp;
	struct knote *kn, *tkn;
	int error, filt, event;
	int haskqglobal;
	int fd;

	fdp = NULL;
	fp = NULL;
	kn = NULL;
	error = 0;
	haskqglobal = 0;

	filt = kev->filter;
	fops = kqueue_fo_find(filt);
	if (fops == NULL)
		return EINVAL;

	tkn = knote_alloc(waitok);		/* prevent waiting with locks */

findkn:
	if (fops->f_isfd) {
		KASSERT(td != NULL, ("td is NULL"));
		fdp = td->td_proc->p_fd;
		FILEDESC_LOCK(fdp);
		/* validate descriptor */
		fd = kev->ident;
		if (fd < 0 || fd >= fdp->fd_nfiles ||
		    (fp = fdp->fd_ofiles[fd]) == NULL) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			goto done;
		}
		fhold(fp);

		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
		    kev->ident, 0) != 0) {
			/* unlock and try again */
			FILEDESC_UNLOCK(fdp);
			fdrop(fp, td);
			fp = NULL;
			error = kqueue_expand(kq, fops, kev->ident, waitok);
			if (error)
				goto done;
			goto findkn;
		}

		if (fp->f_type == DTYPE_KQUEUE) {
			/*
			 * if we add some intelligence about what we are doing,
			 * we should be able to support events on ourselves.
			 * We need to know when we are doing this to prevent
			 * getting both the knlist lock and the kq lock since
			 * they are the same thing.
			 */
			if (fp->f_data == kq) {
				FILEDESC_UNLOCK(fdp);
				error = EINVAL;
				goto done_noglobal;
			}

			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
		}

		FILEDESC_UNLOCK(fdp);
		KQ_LOCK(kq);
		if (kev->ident < kq->kq_knlistsize) {
			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
				if (kev->filter == kn->kn_filter)
					break;
		}
	} else {
		if ((kev->flags & EV_ADD) == EV_ADD)
			kqueue_expand(kq, fops, kev->ident, waitok);

		KQ_LOCK(kq);
		if (kq->kq_knhashmask != 0) {
			struct klist *list;

			list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	/* knote is in the process of changing, wait for it to stabilize. */
	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
		if (fp != NULL) {
			fdrop(fp, td);
			fp = NULL;
		}
		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
		goto findkn;
	}

	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		KQ_UNLOCK(kq);
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			kn = tkn;
			tkn = NULL;
			if (kn == NULL) {
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;
			/*
			 * apply reference counts to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fops = NULL;
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;
			kn->kn_status = KN_INFLUX|KN_DETACHED;

			error = knote_attach(kn, kq);
			KQ_UNLOCK(kq);
			if (error != 0) {
				tkn = kn;
				goto done;
			}

			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
				knote_drop(kn, td);
				goto done;
			}
			KN_LIST_LOCK(kn);
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filter which has already been triggered.
			 */
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			KN_LIST_LOCK(kn);
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		/*
		 * We can get here with kn->kn_knlist == NULL.
		 * This can happen when the initial attach event decides that
		 * the event is "completed" already, i.e. filt_procattach
		 * is called on a zombie process.  It will call filt_proc
		 * which will remove it from the list, and NULL kn_knlist.
		 */
		event = kn->kn_fop->f_event(kn, 0);
		KN_LIST_UNLOCK(kn);
		KQ_LOCK(kq);
		if (event)
			KNOTE_ACTIVATE(kn, 1);
		kn->kn_status &= ~KN_INFLUX;
	} else if (kev->flags & EV_DELETE) {
		kn->kn_status |= KN_INFLUX;
		KQ_UNLOCK(kq);
		if (!(kn->kn_status & KN_DETACHED))
			kn->kn_fop->f_detach(kn);
		knote_drop(kn, td);
		goto done;
	}

	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		kn->kn_status |= KN_DISABLED;
	}

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
	}
	KQ_UNLOCK_FLUX(kq);

done:
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
done_noglobal:
	if (fp != NULL)
		fdrop(fp, td);
	if (tkn != NULL)
		knote_free(tkn);
	if (fops != NULL)
		kqueue_fo_release(filt);
	return (error);
}

static int
kqueue_acquire(struct file *fp, struct kqueue **kqp)
{
	int error;
	struct kqueue *kq;

	error = 0;

	FILE_LOCK(fp);
	do {
		kq = fp->f_data;
		if (fp->f_type != DTYPE_KQUEUE || kq == NULL) {
			error = EBADF;
			break;
		}
		*kqp = kq;
		KQ_LOCK(kq);
		if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
			KQ_UNLOCK(kq);
			error = EBADF;
			break;
		}
		kq->kq_refcnt++;
		KQ_UNLOCK(kq);
	} while (0);
	FILE_UNLOCK(fp);

	return error;
}

static void
kqueue_release(struct kqueue *kq, int locked)
{
	if (locked)
		KQ_OWNED(kq);
	else
		KQ_LOCK(kq);
	kq->kq_refcnt--;
	if (kq->kq_refcnt == 1)
		wakeup(&kq->kq_refcnt);
	if (!locked)
		KQ_UNLOCK(kq);
}

static void
kqueue_schedtask(struct kqueue *kq)
{

	KQ_OWNED(kq);
	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
	    ("scheduling kqueue task while draining"));

	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
		taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
		kq->kq_state |= KQ_TASKSCHED;
	}
}

/*
 * Expand the kqueue to make sure we have storage for the fops/ident pair.
 *
 * Return 0 on success (or no work necessary), return errno on failure.
 *
 * Not calling hashinit() with our waitok flag should be safe here: when
 * kqueue_register() is called from a non-fd context, there usually should
 * be no locks held.
 */
static int
kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
	int waitok)
{
	struct klist *list, *tmp_knhash;
	u_long tmp_knhashmask;
	int size;
	int fd;
	int mflag = waitok ? M_WAITOK : M_NOWAIT;

	KQ_NOTOWNED(kq);

	if (fops->f_isfd) {
		fd = ident;
		if (kq->kq_knlistsize <= fd) {
			size = kq->kq_knlistsize;
			while (size <= fd)
				size += KQEXTENT;
			MALLOC(list, struct klist *,
			    size * sizeof list, M_KQUEUE, mflag);
			if (list == NULL)
				return ENOMEM;
			KQ_LOCK(kq);
			if (kq->kq_knlistsize > fd) {
				FREE(list, M_KQUEUE);
				list = NULL;
			} else {
				if (kq->kq_knlist != NULL) {
					bcopy(kq->kq_knlist, list,
					    kq->kq_knlistsize * sizeof list);
					FREE(kq->kq_knlist, M_KQUEUE);
					kq->kq_knlist = NULL;
				}
				bzero((caddr_t)list +
				    kq->kq_knlistsize * sizeof list,
				    (size - kq->kq_knlistsize) * sizeof list);
				kq->kq_knlistsize = size;
				kq->kq_knlist = list;
			}
			KQ_UNLOCK(kq);
		}
	} else {
		if (kq->kq_knhashmask == 0) {
			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
			    &tmp_knhashmask);
			if (tmp_knhash == NULL)
				return ENOMEM;
			KQ_LOCK(kq);
			if (kq->kq_knhashmask == 0) {
				kq->kq_knhash = tmp_knhash;
				kq->kq_knhashmask = tmp_knhashmask;
			} else {
				free(tmp_knhash, M_KQUEUE);
			}
			KQ_UNLOCK(kq);
		}
	}

	KQ_NOTOWNED(kq);
	return 0;
}
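
/*
 * Worked example (added for illustration): assuming KQEXTENT is 256 (its
 * value in sys/eventvar.h at this time), registering fd 300 against an
 * empty kqueue grows kq_knlist from 0 to 512 entries (256 is still <= 300,
 * so a second KQEXTENT is added), and the tail of the new array is zeroed
 * before being published under KQ_LOCK.
 */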

static void
kqueue_task(void *arg, int pending)
{
	struct kqueue *kq;
	int haskqglobal;

	haskqglobal = 0;
	kq = arg;

	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
	KQ_LOCK(kq);

	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);

	kq->kq_state &= ~KQ_TASKSCHED;
	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
		wakeup(&kq->kq_state);
	}
	KQ_UNLOCK(kq);
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
}

/*
 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
 * We treat KN_MARKER knotes as if they are INFLUX.
 */
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
	const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
	struct kevent *kevp;
	struct timeval atv, rtv, ttv;
	struct knote *kn, *marker;
	int count, timeout, nkev, error;
	int haskqglobal;

	count = maxevents;
	nkev = 0;
	error = 0;
	haskqglobal = 0;

	if (maxevents == 0)
		goto done_nl;

	if (tsp != NULL) {
		TIMESPEC_TO_TIMEVAL(&atv, tsp);
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nl;
		}
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			timeout = -1;
		else
			timeout = atv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&atv);
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
		timeout = 0;
	}
	marker = knote_alloc(1);
	if (marker == NULL) {
		error = ENOMEM;
		goto done_nl;
	}
	marker->kn_status = KN_MARKER;
	KQ_LOCK(kq);
	goto start;

retry:
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timeout = ttv.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tvtohz(&ttv);
	}

start:
	kevp = keva;
	if (kq->kq_count == 0) {
		if (timeout < 0) {
			error = EWOULDBLOCK;
		} else {
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
			kq->kq_state |= KQ_SLEEP;
			error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
			    "kqread", timeout);
		}
		if (error == 0)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	while (count) {
		KQ_OWNED(kq);
		kn = TAILQ_FIRST(&kq->kq_head);

		if ((kn->kn_status == KN_MARKER && kn != marker) ||
		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
			kq->kq_state |= KQ_FLUXWAIT;
			error = msleep(kq, &kq->kq_lock, PSOCK,
			    "kqflxwt", 0);
			continue;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if (kn == marker) {
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		KASSERT((kn->kn_status & KN_INFLUX) == 0,
		    ("KN_INFLUX set when not supposed to be"));

		if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			*kevp = kn->kn_kevent;
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			kn = NULL;
		} else {
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
			KN_LIST_LOCK(kn);
			if (kn->kn_fop->f_event(kn, 0) == 0) {
				KN_LIST_UNLOCK(kn);
				KQ_LOCK(kq);
				kn->kn_status &=
				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
				kq->kq_count--;
				continue;
			}
			*kevp = kn->kn_kevent;
			KQ_LOCK(kq);
			if (kn->kn_flags & EV_CLEAR) {
				kn->kn_data = 0;
				kn->kn_fflags = 0;
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
				kq->kq_count--;
			} else
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
			KN_LIST_UNLOCK(kn);
			kn->kn_status &= ~(KN_INFLUX);
		}

		/* we are returning a copy to the user */
		kevp++;
		nkev++;
		count--;

		if (nkev == KQ_NEVENTS) {
			KQ_UNLOCK_FLUX(kq);
			error = copyout(keva, ulistp, sizeof *keva * nkev);
			ulistp += nkev;
			nkev = 0;
			kevp = keva;
			KQ_LOCK(kq);
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
	KQ_OWNED(kq);
	KQ_UNLOCK_FLUX(kq);
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
	knote_free(marker);
done_nl:
	KQ_NOTOWNED(kq);
	if (nkev != 0)
		error = copyout(keva, ulistp, sizeof *keva * nkev);
	td->td_retval[0] = maxevents - count;
	return (error);
}

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
	int flags, struct thread *td)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
	 int flags, struct thread *td)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
	struct ucred *active_cred, struct thread *td)
{
	/*
	 * Enabling sigio causes two major problems:
	 * 1) infinite recursion:
	 * Synopsis: kevent is being used to track signals and has FIOASYNC
	 * set.  On receipt of a signal this will cause a kqueue to recurse
	 * into itself over and over.  Sending the sigio causes the kqueue
	 * to become ready, which in turn posts sigio again, forever.
	 * Solution: this can be solved by setting a flag in the kqueue that
	 * we have a SIGIO in progress.
	 * 2) locking problems:
	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
	 * us above the proc and pgrp locks.
	 * Solution: Post a signal using an async mechanism, being sure to
	 * record a generation count in the delivery so that we do not deliver
	 * a signal to the wrong process.
	 *
	 * Note, these two mechanisms are somewhat mutually exclusive!
	 */
#if 0
	struct kqueue *kq;

	kq = fp->f_data;
	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			kq->kq_state |= KQ_ASYNC;
		} else {
			kq->kq_state &= ~KQ_ASYNC;
		}
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &kq->kq_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(&kq->kq_sigio);
		return (0);
	}
#endif

	return (ENOTTY);
}

/*ARGSUSED*/
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
	struct thread *td)
{
	struct kqueue *kq;
	int revents = 0;
	int error;

	if ((error = kqueue_acquire(fp, &kq)))
		return POLLERR;

	KQ_LOCK(kq);
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(td, &kq->kq_sel);
			kq->kq_state |= KQ_SEL;
		}
	}
	kqueue_release(kq, 1);
	KQ_UNLOCK(kq);
	return (revents);
}

/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
	struct thread *td)
{

	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_close(struct file *fp, struct thread *td)
{
	struct kqueue *kq = fp->f_data;
	struct filedesc *fdp;
	struct knote *kn;
	int i;
	int error;

	if ((error = kqueue_acquire(fp, &kq)))
		return error;

	KQ_LOCK(kq);

	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
	    ("kqueue already closing"));
	kq->kq_state |= KQ_CLOSING;
	if (kq->kq_refcnt > 1)
		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);

	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
	fdp = kq->kq_fdp;

	KASSERT(knlist_empty(&kq->kq_sel.si_note),
	    ("kqueue's knlist not empty"));

	for (i = 0; i < kq->kq_knlistsize; i++) {
		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
			KASSERT((kn->kn_status & KN_INFLUX) == 0,
			    ("KN_INFLUX set when not supposed to be"));
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
	}
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i <= kq->kq_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
				KASSERT((kn->kn_status & KN_INFLUX) == 0,
				    ("KN_INFLUX set when not supposed to be"));
				kn->kn_status |= KN_INFLUX;
				KQ_UNLOCK(kq);
				if (!(kn->kn_status & KN_DETACHED))
					kn->kn_fop->f_detach(kn);
				knote_drop(kn, td);
				KQ_LOCK(kq);
			}
		}
	}

	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
		kq->kq_state |= KQ_TASKDRAIN;
		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
	}

	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeuppri(&kq->kq_sel, PSOCK);
	}

	KQ_UNLOCK(kq);

	FILEDESC_LOCK_FAST(fdp);
	SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
	FILEDESC_UNLOCK_FAST(fdp);

	knlist_destroy(&kq->kq_sel.si_note);
	mtx_destroy(&kq->kq_lock);
	kq->kq_fdp = NULL;

	if (kq->kq_knhash != NULL)
		free(kq->kq_knhash, M_KQUEUE);
	if (kq->kq_knlist != NULL)
		free(kq->kq_knlist, M_KQUEUE);

	funsetown(&kq->kq_sigio);
	free(kq, M_KQUEUE);
	fp->f_data = NULL;

	return (0);
}
141459290Sjlemon
141559290Sjlemonstatic void
141659290Sjlemonkqueue_wakeup(struct kqueue *kq)
141759290Sjlemon{
1418133741Sjmg	KQ_OWNED(kq);
141959290Sjlemon
1420133741Sjmg	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
142159290Sjlemon		kq->kq_state &= ~KQ_SLEEP;
142259290Sjlemon		wakeup(kq);
142359290Sjlemon	}
1424133741Sjmg	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
142559290Sjlemon		kq->kq_state &= ~KQ_SEL;
1426122352Stanimura		selwakeuppri(&kq->kq_sel, PSOCK);
142759290Sjlemon	}
1428133741Sjmg	if (!knlist_empty(&kq->kq_sel.si_note))
1429133741Sjmg		kqueue_schedtask(kq);
1430133741Sjmg	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1431132138Salfred		pgsigio(&kq->kq_sigio, SIGIO, 0);
1432132138Salfred	}
143359290Sjlemon}
143459290Sjlemon
143559290Sjlemon/*
1436133741Sjmg * Walk down a list of knotes, activating them if their event has triggered.
1437133741Sjmg *
1438133741Sjmg * There is a possibility to optimize in the case of one kq watching another.
1439133741Sjmg * Instead of scheduling a task to wake it up, you could pass enough state
1440133741Sjmg * down the chain to make up the parent kqueue.  Make this code functional
1441133741Sjmg * first.
144259290Sjlemon */
144359290Sjlemonvoid
1444133741Sjmgknote(struct knlist *list, long hint, int islocked)
144559290Sjlemon{
1446133741Sjmg	struct kqueue *kq;
144759290Sjlemon	struct knote *kn;
144859290Sjlemon
1449133741Sjmg	if (list == NULL)
1450133741Sjmg		return;
1451133741Sjmg
1452133741Sjmg	mtx_assert(list->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
1453133741Sjmg	if (!islocked)
1454133741Sjmg		mtx_lock(list->kl_lock);
1455133741Sjmg	/*
1456133741Sjmg	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
1457133741Sjmg	 * the kqueue scheduling, but this will introduce four
1458133741Sjmg	 * lock/unlock's for each knote to test.  If we do, continue to use
1459133741Sjmg	 * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
1460133741Sjmg	 * only safe if you want to remove the current item, which we are
1461133741Sjmg	 * not doing.
1462133741Sjmg	 */
1463133741Sjmg	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
1464133741Sjmg		kq = kn->kn_kq;
1465133741Sjmg		if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
1466133741Sjmg			KQ_LOCK(kq);
1467133741Sjmg			if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
1468133741Sjmg				kn->kn_status |= KN_HASKQLOCK;
1469133741Sjmg				if (kn->kn_fop->f_event(kn, hint))
1470133741Sjmg					KNOTE_ACTIVATE(kn, 1);
1471133741Sjmg				kn->kn_status &= ~KN_HASKQLOCK;
1472133741Sjmg			}
1473133741Sjmg			KQ_UNLOCK(kq);
1474133741Sjmg		}
1475133741Sjmg		kq = NULL;
1476133741Sjmg	}
1477133741Sjmg	if (!islocked)
1478133741Sjmg		mtx_unlock(list->kl_lock);
147959290Sjlemon}
148059290Sjlemon
148159290Sjlemon/*
1482133741Sjmg * add a knote to a knlist
1483133741Sjmg */
1484133741Sjmgvoid
1485133741Sjmgknlist_add(struct knlist *knl, struct knote *kn, int islocked)
1486133741Sjmg{
1487133741Sjmg	mtx_assert(knl->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
1488133741Sjmg	KQ_NOTOWNED(kn->kn_kq);
1489133741Sjmg	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
1490133741Sjmg	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
1491133741Sjmg	if (!islocked)
1492133741Sjmg		mtx_lock(knl->kl_lock);
1493133741Sjmg	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
1494133741Sjmg	if (!islocked)
1495133741Sjmg		mtx_unlock(knl->kl_lock);
1496133741Sjmg	KQ_LOCK(kn->kn_kq);
1497133741Sjmg	kn->kn_knlist = knl;
1498133741Sjmg	kn->kn_status &= ~KN_DETACHED;
1499133741Sjmg	KQ_UNLOCK(kn->kn_kq);
1500133741Sjmg}
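
/*
 * Attach-side sketch for knlist_add() above (hypothetical cdev filter;
 * all "xdev" names are illustrative).  A driver's kqfilter routine hooks
 * a new knote into its knlist so that knote() can later find it; the
 * kqueue code marks a new knote KN_INFLUX | KN_DETACHED before calling
 * f_attach, which is what the assertion above checks:
 *
 *	static int
 *	xdev_kqfilter(struct cdev *dev, struct knote *kn)
 *	{
 *		struct xdev_softc *sc = dev->si_drv1;
 *
 *		kn->kn_fop = &xdev_filtops;
 *		kn->kn_hook = (caddr_t)sc;
 *		knlist_add(&sc->xdev_note, kn, 0);
 *		return (0);
 *	}
 */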
1501133741Sjmg
1502133741Sjmgstatic void
1503133741Sjmgknlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
1504133741Sjmg{
1505133741Sjmg	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
1506133741Sjmg	mtx_assert(knl->kl_lock, knlislocked ? MA_OWNED : MA_NOTOWNED);
1507133741Sjmg	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
1508133741Sjmg	if (!kqislocked)
1509133741Sjmg		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
1510133741Sjmg    ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
1511133741Sjmg	if (!knlislocked)
1512133741Sjmg		mtx_lock(knl->kl_lock);
1513133741Sjmg	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
1514133741Sjmg	kn->kn_knlist = NULL;
1515133741Sjmg	if (!knlislocked)
1516133741Sjmg		mtx_unlock(knl->kl_lock);
1517133741Sjmg	if (!kqislocked)
1518133741Sjmg		KQ_LOCK(kn->kn_kq);
1519133741Sjmg	kn->kn_status |= KN_DETACHED;
1520133741Sjmg	if (!kqislocked)
1521133741Sjmg		KQ_UNLOCK(kn->kn_kq);
1522133741Sjmg}
1523133741Sjmg
1524133741Sjmg/*
152559290Sjlemon * remove a knote from a specified knlist
152659290Sjlemon */
152759290Sjlemonvoid
1528133741Sjmgknlist_remove(struct knlist *knl, struct knote *kn, int islocked)
152959290Sjlemon{
1530133741Sjmg
1531133741Sjmg	knlist_remove_kq(knl, kn, islocked, 0);
1532133741Sjmg}
1533133741Sjmg
1534133741Sjmg/*
1535133741Sjmg * remove a knote from a specified knlist while in an f_event handler.
1536133741Sjmg */
1537133741Sjmgvoid
1538133741Sjmgknlist_remove_inevent(struct knlist *knl, struct knote *kn)
1539133741Sjmg{
1540133741Sjmg
1541133741Sjmg	knlist_remove_kq(knl, kn, 1,
1542133741Sjmg	    (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
1543133741Sjmg}
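
/*
 * Sketch of an f_event handler using knlist_remove_inevent() above,
 * modeled loosely on the process filter: when the watched object goes
 * away, the filter may unhook its knote during event delivery, since
 * knlist_remove_inevent() honors KN_HASKQLOCK:
 *
 *	if (hint == NOTE_EXIT) {
 *		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 *		knlist_remove_inevent(knl, kn);
 *		return (1);
 *	}
 */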
1544133741Sjmg
1545133741Sjmgint
1546133741Sjmgknlist_empty(struct knlist *knl)
1547133741Sjmg{
1548133741Sjmg
1549133741Sjmg	mtx_assert(knl->kl_lock, MA_OWNED);
1550133741Sjmg	return SLIST_EMPTY(&knl->kl_list);
1551133741Sjmg}
1552133741Sjmg
1553133741Sjmgstatic struct mtx	knlist_lock;
1554133741SjmgMTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
1555133741Sjmg	MTX_DEF);
1556133741Sjmg
1557133741Sjmgvoid
1558133741Sjmgknlist_init(struct knlist *knl, struct mtx *mtx)
1559133741Sjmg{
1560133741Sjmg
1561133741Sjmg	if (mtx == NULL)
1562133741Sjmg		knl->kl_lock = &knlist_lock;
1563133741Sjmg	else
1564133741Sjmg		knl->kl_lock = mtx;
1565133741Sjmg
1566133741Sjmg	SLIST_INIT(&knl->kl_list);
1567133741Sjmg}
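
/*
 * Typical use of knlist_init() above (sc is a hypothetical softc): an
 * event source usually passes the mutex that already protects its own
 * state, so that f_event handlers run under a lock the driver controls;
 * passing a NULL mutex falls back to the shared knlist_lock:
 *
 *	knlist_init(&sc->xdev_note, &sc->xdev_mtx);
 *
 * The mutex must remain valid for the lifetime of the knlist.
 */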
1568133741Sjmg
1569133741Sjmgvoid
1570133741Sjmgknlist_destroy(struct knlist *knl)
1571133741Sjmg{
1572133741Sjmg
1573133741Sjmg#ifdef INVARIANTS
1574133741Sjmg	/*
1575133741Sjmg	 * if we run across this error, we need to find the offending
1576133741Sjmg	 * driver and have it call knlist_clear.
1577133741Sjmg	 */
1578133741Sjmg	if (!SLIST_EMPTY(&knl->kl_list))
1579133741Sjmg		printf("WARNING: destroying knlist w/ knotes on it!\n");
1580133741Sjmg#endif
1581133741Sjmg
1582133741Sjmg	knl->kl_lock = NULL;
1583133741Sjmg	SLIST_INIT(&knl->kl_list);
1584133741Sjmg}
1585133741Sjmg
1586133741Sjmg/*
1587133741Sjmg * Even if we are locked, we may need to drop the lock to allow any influx
1588133741Sjmg * knotes time to "settle".
1589133741Sjmg */
1590133741Sjmgvoid
1591133741Sjmgknlist_clear(struct knlist *knl, int islocked)
1592133741Sjmg{
159359290Sjlemon	struct knote *kn;
1594133741Sjmg	struct kqueue *kq;
159559290Sjlemon
1596133741Sjmg	if (islocked)
1597133741Sjmg		mtx_assert(knl->kl_lock, MA_OWNED);
1598133741Sjmg	else {
1599133741Sjmg		mtx_assert(knl->kl_lock, MA_NOTOWNED);
1600133741Sjmgagain:		/* need to reacquire lock since we have dropped it */
1601133741Sjmg		mtx_lock(knl->kl_lock);
160259290Sjlemon	}
1603133741Sjmg
1604133741Sjmg	SLIST_FOREACH(kn, &knl->kl_list, kn_selnext) {
1605133741Sjmg		kq = kn->kn_kq;
1606133741Sjmg		KQ_LOCK(kq);
1607133741Sjmg		if ((kn->kn_status & KN_INFLUX) &&
1608133741Sjmg		    (kn->kn_status & KN_DETACHED) != KN_DETACHED) {
1609133741Sjmg			KQ_UNLOCK(kq);
1610133741Sjmg			continue;
1611133741Sjmg		}
1612133741Sjmg		/* Make sure cleared knotes disappear soon */
1613133741Sjmg		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1614133741Sjmg		knlist_remove_kq(knl, kn, 1, 1);
1615133741Sjmg		KQ_UNLOCK(kq);
1616133741Sjmg		kq = NULL;
1617133741Sjmg	}
1618133741Sjmg
1619133741Sjmg	if (!SLIST_EMPTY(&knl->kl_list)) {
1620133741Sjmg		/* there are still knotes marked KN_INFLUX remaining */
1621133741Sjmg		kn = SLIST_FIRST(&knl->kl_list);
1622133741Sjmg		kq = kn->kn_kq;
1623133741Sjmg		KQ_LOCK(kq);
1624133741Sjmg		KASSERT(kn->kn_status & KN_INFLUX,
1625133741Sjmg		    ("knote removed w/o list lock"));
1626133741Sjmg		mtx_unlock(knl->kl_lock);
1627133741Sjmg		kq->kq_state |= KQ_FLUXWAIT;
1628133741Sjmg		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
1629133741Sjmg		kq = NULL;
1630133741Sjmg		goto again;
1631133741Sjmg	}
1632133741Sjmg
1633133741Sjmg	SLIST_INIT(&knl->kl_list);
1634133741Sjmg
1635133741Sjmg	if (islocked)
1636133741Sjmg		mtx_assert(knl->kl_lock, MA_OWNED);
1637133741Sjmg	else {
1638133741Sjmg		mtx_unlock(knl->kl_lock);
1639133741Sjmg		mtx_assert(knl->kl_lock, MA_NOTOWNED);
1640133741Sjmg	}
164159290Sjlemon}
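
/*
 * Teardown sketch (hypothetical detach path): clear the knlist before
 * destroying it, or the INVARIANTS warning in knlist_destroy() above
 * fires:
 *
 *	knlist_clear(&sc->xdev_note, 0);
 *	knlist_destroy(&sc->xdev_note);
 */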
164259290Sjlemon
164359290Sjlemon/*
164459290Sjlemon * remove all knotes referencing a specified fd.
1645133741Sjmg * must be called with the FILEDESC lock held.  This prevents a race where
1646133741Sjmg * a new fd comes along, occupies the entry, and we attach a knote to it.
164759290Sjlemon */
164859290Sjlemonvoid
164983366Sjulianknote_fdclose(struct thread *td, int fd)
165059290Sjlemon{
165183366Sjulian	struct filedesc *fdp = td->td_proc->p_fd;
1652133741Sjmg	struct kqueue *kq;
1653133741Sjmg	struct knote *kn;
1654133741Sjmg	int influx;
165559290Sjlemon
1656133741Sjmg	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1657133741Sjmg
1658133741Sjmg	/*
1659133741Sjmg	 * We shouldn't have to worry about new kevents appearing on this
1660133741Sjmg	 * fd since the filedesc is locked.
1661133741Sjmg	 */
1662133741Sjmg	SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
1663133741Sjmg		KQ_LOCK(kq);
1664133741Sjmg
1665133741Sjmgagain:
1666133741Sjmg		influx = 0;
1667133741Sjmg		while (kq->kq_knlistsize > fd &&
1668133741Sjmg		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
1669133741Sjmg			if (kn->kn_status & KN_INFLUX) {
1670133741Sjmg				/* someone else might be waiting on our knote */
1671133741Sjmg				if (influx)
1672133741Sjmg					wakeup(kq);
1673133741Sjmg				kq->kq_state |= KQ_FLUXWAIT;
1674133741Sjmg				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
1675133741Sjmg				goto again;
1676133741Sjmg			}
1677133741Sjmg			kn->kn_status |= KN_INFLUX;
1678133741Sjmg			KQ_UNLOCK(kq);
1679134859Sjmg			if (!(kn->kn_status & KN_DETACHED))
1680134859Sjmg				kn->kn_fop->f_detach(kn);
1681133741Sjmg			knote_drop(kn, td);
1682133741Sjmg			influx = 1;
1683133741Sjmg			KQ_LOCK(kq);
1684133741Sjmg		}
1685133741Sjmg		KQ_UNLOCK_FLUX(kq);
1686133741Sjmg	}
168759290Sjlemon}
168859290Sjlemon
1689133741Sjmgstatic int
1690133741Sjmgknote_attach(struct knote *kn, struct kqueue *kq)
169159290Sjlemon{
1692133741Sjmg	struct klist *list;
169359290Sjlemon
1694133741Sjmg	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
1695133741Sjmg	KQ_OWNED(kq);
169689306Salfred
1697133741Sjmg	if (kn->kn_fop->f_isfd) {
1698133741Sjmg		if (kn->kn_id >= kq->kq_knlistsize)
1699133741Sjmg			return (ENOMEM);
1700133741Sjmg		list = &kq->kq_knlist[kn->kn_id];
1701133741Sjmg	} else {
1702133741Sjmg		if (kq->kq_knhash == NULL)
1703133741Sjmg			return (ENOMEM);
1704133741Sjmg		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
170559290Sjlemon	}
170659290Sjlemon
170759290Sjlemon	SLIST_INSERT_HEAD(list, kn, kn_link);
1708133741Sjmg
1709133741Sjmg	return (0);
171059290Sjlemon}
171159290Sjlemon
171259290Sjlemon/*
1713133741Sjmg * knote must already have been detached using the f_detach method.
1714133741Sjmg * No lock needs to be held; it is assumed that the KN_INFLUX flag is
1715133741Sjmg * set to prevent other removal.
171659290Sjlemon */
171759290Sjlemonstatic void
171883366Sjulianknote_drop(struct knote *kn, struct thread *td)
171959290Sjlemon{
1720133741Sjmg	struct kqueue *kq;
172159290Sjlemon	struct klist *list;
172259290Sjlemon
1723133741Sjmg	kq = kn->kn_kq;
1724133741Sjmg
1725133741Sjmg	KQ_NOTOWNED(kq);
1726133741Sjmg	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
1727133741Sjmg	    ("knote_drop called without KN_INFLUX set in kn_status"));
1728133741Sjmg
1729133741Sjmg	KQ_LOCK(kq);
173059290Sjlemon	if (kn->kn_fop->f_isfd)
1731133741Sjmg		list = &kq->kq_knlist[kn->kn_id];
173259290Sjlemon	else
1733133741Sjmg		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
173459290Sjlemon
173560938Sjake	SLIST_REMOVE(list, kn, knote, kn_link);
173659290Sjlemon	if (kn->kn_status & KN_QUEUED)
173759290Sjlemon		knote_dequeue(kn);
1738133741Sjmg	KQ_UNLOCK_FLUX(kq);
1739133741Sjmg
1740133741Sjmg	if (kn->kn_fop->f_isfd) {
1741133741Sjmg		fdrop(kn->kn_fp, td);
1742133741Sjmg		kn->kn_fp = NULL;
1743133741Sjmg	}
1744133741Sjmg	kqueue_fo_release(kn->kn_kevent.filter);
1745133741Sjmg	kn->kn_fop = NULL;
174659290Sjlemon	knote_free(kn);
174759290Sjlemon}
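
/*
 * The caller's sequence for knote_drop() above, as seen in
 * knote_fdclose(): mark the knote KN_INFLUX under the kq lock, drop the
 * lock, detach if needed, then drop the knote:
 *
 *	kn->kn_status |= KN_INFLUX;
 *	KQ_UNLOCK(kq);
 *	if (!(kn->kn_status & KN_DETACHED))
 *		kn->kn_fop->f_detach(kn);
 *	knote_drop(kn, td);
 */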
174859290Sjlemon
174959290Sjlemonstatic void
175059290Sjlemonknote_enqueue(struct knote *kn)
175159290Sjlemon{
175259290Sjlemon	struct kqueue *kq = kn->kn_kq;
175359290Sjlemon
1754133741Sjmg	KQ_OWNED(kn->kn_kq);
175559997Sjlemon	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
175659997Sjlemon
1757133590Srwatson	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
175859290Sjlemon	kn->kn_status |= KN_QUEUED;
175959290Sjlemon	kq->kq_count++;
176059290Sjlemon	kqueue_wakeup(kq);
176159290Sjlemon}
176259290Sjlemon
176359290Sjlemonstatic void
176459290Sjlemonknote_dequeue(struct knote *kn)
176559290Sjlemon{
176659290Sjlemon	struct kqueue *kq = kn->kn_kq;
176759290Sjlemon
1768133741Sjmg	KQ_OWNED(kn->kn_kq);
176959997Sjlemon	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
177059997Sjlemon
1771133590Srwatson	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
177259290Sjlemon	kn->kn_status &= ~KN_QUEUED;
177359290Sjlemon	kq->kq_count--;
177459290Sjlemon}
177559290Sjlemon
177659290Sjlemonstatic void
177759290Sjlemonknote_init(void)
177859290Sjlemon{
1779133741Sjmg
178092751Sjeff	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
178192751Sjeff	    NULL, NULL, UMA_ALIGN_PTR, 0);
178259290Sjlemon}
178359290SjlemonSYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
178459290Sjlemon
178559290Sjlemonstatic struct knote *
1786133741Sjmgknote_alloc(int waitok)
178759290Sjlemon{
1788133741Sjmg	return ((struct knote *)uma_zalloc(knote_zone,
1789133741Sjmg	    (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
179059290Sjlemon}
179159290Sjlemon
179259290Sjlemonstatic void
179359290Sjlemonknote_free(struct knote *kn)
179459290Sjlemon{
1795133741Sjmg	if (kn != NULL)
1796133741Sjmg		uma_zfree(knote_zone, kn);
179759290Sjlemon}
1798