159290Sjlemon/*-
272969Sjlemon * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3133741Sjmg * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4197240Ssson * Copyright (c) 2009 Apple, Inc.
559290Sjlemon * All rights reserved.
659290Sjlemon *
759290Sjlemon * Redistribution and use in source and binary forms, with or without
859290Sjlemon * modification, are permitted provided that the following conditions
959290Sjlemon * are met:
1059290Sjlemon * 1. Redistributions of source code must retain the above copyright
1159290Sjlemon *    notice, this list of conditions and the following disclaimer.
1259290Sjlemon * 2. Redistributions in binary form must reproduce the above copyright
1359290Sjlemon *    notice, this list of conditions and the following disclaimer in the
1459290Sjlemon *    documentation and/or other materials provided with the distribution.
1559290Sjlemon *
1659290Sjlemon * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1759290Sjlemon * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1859290Sjlemon * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1959290Sjlemon * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2059290Sjlemon * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2159290Sjlemon * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2259290Sjlemon * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2359290Sjlemon * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2459290Sjlemon * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2559290Sjlemon * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2659290Sjlemon * SUCH DAMAGE.
2759290Sjlemon */
2859290Sjlemon
29116182Sobrien#include <sys/cdefs.h>
30116182Sobrien__FBSDID("$FreeBSD$");
31116182Sobrien
32162592Sjmg#include "opt_ktrace.h"
33162592Sjmg
3459290Sjlemon#include <sys/param.h>
3559290Sjlemon#include <sys/systm.h>
36224778Srwatson#include <sys/capability.h>
3759290Sjlemon#include <sys/kernel.h>
3876166Smarkm#include <sys/lock.h>
3976166Smarkm#include <sys/mutex.h>
4059290Sjlemon#include <sys/proc.h>
41132138Salfred#include <sys/malloc.h>
4259290Sjlemon#include <sys/unistd.h>
4359290Sjlemon#include <sys/file.h>
44108524Salfred#include <sys/filedesc.h>
45132138Salfred#include <sys/filio.h>
4659290Sjlemon#include <sys/fcntl.h>
47133741Sjmg#include <sys/kthread.h>
4870834Swollman#include <sys/selinfo.h>
4959290Sjlemon#include <sys/queue.h>
5059290Sjlemon#include <sys/event.h>
5159290Sjlemon#include <sys/eventvar.h>
5259290Sjlemon#include <sys/poll.h>
5359290Sjlemon#include <sys/protosw.h>
54132138Salfred#include <sys/sigio.h>
55132138Salfred#include <sys/signalvar.h>
5659290Sjlemon#include <sys/socket.h>
5759290Sjlemon#include <sys/socketvar.h>
5859290Sjlemon#include <sys/stat.h>
5984138Sjlemon#include <sys/sysctl.h>
6059290Sjlemon#include <sys/sysproto.h>
61142934Sps#include <sys/syscallsubr.h>
62133741Sjmg#include <sys/taskqueue.h>
6359290Sjlemon#include <sys/uio.h>
64162592Sjmg#ifdef KTRACE
65162592Sjmg#include <sys/ktrace.h>
66162592Sjmg#endif
6759290Sjlemon
6892751Sjeff#include <vm/uma.h>
6959290Sjlemon
70141616Sphkstatic MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
71141616Sphk
72133741Sjmg/*
73133741Sjmg * This lock is used if multiple kq locks are required.  This possibly
74133741Sjmg * should be made into a per proc lock.
75133741Sjmg */
76133741Sjmgstatic struct mtx	kq_global;
77133741SjmgMTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
78133741Sjmg#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
79133741Sjmg	if (!haslck)				\
80133741Sjmg		mtx_lock(lck);			\
81133741Sjmg	haslck = 1;				\
82133741Sjmg} while (0)
83133741Sjmg#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
84133741Sjmg	if (haslck)				\
85133741Sjmg		mtx_unlock(lck);			\
86133741Sjmg	haslck = 0;				\
87133741Sjmg} while (0)
8884138Sjlemon
89133741SjmgTASKQUEUE_DEFINE_THREAD(kqueue);
90133741Sjmg
91146950Spsstatic int	kevent_copyout(void *arg, struct kevent *kevp, int count);
92146950Spsstatic int	kevent_copyin(void *arg, struct kevent *kevp, int count);
93162594Sjmgstatic int	kqueue_register(struct kqueue *kq, struct kevent *kev,
94162594Sjmg		    struct thread *td, int waitok);
95170029Srwatsonstatic int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
96133741Sjmgstatic void	kqueue_release(struct kqueue *kq, int locked);
97133741Sjmgstatic int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
98133741Sjmg		    uintptr_t ident, int waitok);
99133741Sjmgstatic void	kqueue_task(void *arg, int pending);
100133741Sjmgstatic int	kqueue_scan(struct kqueue *kq, int maxevents,
101146950Sps		    struct kevent_copyops *k_ops,
102146950Sps		    const struct timespec *timeout,
103146950Sps		    struct kevent *keva, struct thread *td);
10459290Sjlemonstatic void 	kqueue_wakeup(struct kqueue *kq);
105133741Sjmgstatic struct filterops *kqueue_fo_find(int filt);
106133741Sjmgstatic void	kqueue_fo_release(int filt);
10759290Sjlemon
108108255Sphkstatic fo_rdwr_t	kqueue_read;
109108255Sphkstatic fo_rdwr_t	kqueue_write;
110175140Sjhbstatic fo_truncate_t	kqueue_truncate;
111108255Sphkstatic fo_ioctl_t	kqueue_ioctl;
112108255Sphkstatic fo_poll_t	kqueue_poll;
113108255Sphkstatic fo_kqfilter_t	kqueue_kqfilter;
114108255Sphkstatic fo_stat_t	kqueue_stat;
115108255Sphkstatic fo_close_t	kqueue_close;
116108238Sphk
11772521Sjlemonstatic struct fileops kqueueops = {
118116546Sphk	.fo_read = kqueue_read,
119116546Sphk	.fo_write = kqueue_write,
120175140Sjhb	.fo_truncate = kqueue_truncate,
121116546Sphk	.fo_ioctl = kqueue_ioctl,
122116546Sphk	.fo_poll = kqueue_poll,
123116546Sphk	.fo_kqfilter = kqueue_kqfilter,
124116546Sphk	.fo_stat = kqueue_stat,
125116546Sphk	.fo_close = kqueue_close,
126224914Skib	.fo_chmod = invfo_chmod,
127224914Skib	.fo_chown = invfo_chown,
12872521Sjlemon};
12972521Sjlemon
130133741Sjmgstatic int 	knote_attach(struct knote *kn, struct kqueue *kq);
13183366Sjulianstatic void 	knote_drop(struct knote *kn, struct thread *td);
13259290Sjlemonstatic void 	knote_enqueue(struct knote *kn);
13359290Sjlemonstatic void 	knote_dequeue(struct knote *kn);
13459290Sjlemonstatic void 	knote_init(void);
135133741Sjmgstatic struct 	knote *knote_alloc(int waitok);
13659290Sjlemonstatic void 	knote_free(struct knote *kn);
13759290Sjlemon
13872521Sjlemonstatic void	filt_kqdetach(struct knote *kn);
13972521Sjlemonstatic int	filt_kqueue(struct knote *kn, long hint);
14072521Sjlemonstatic int	filt_procattach(struct knote *kn);
14172521Sjlemonstatic void	filt_procdetach(struct knote *kn);
14272521Sjlemonstatic int	filt_proc(struct knote *kn, long hint);
14372521Sjlemonstatic int	filt_fileattach(struct knote *kn);
14479989Sjlemonstatic void	filt_timerexpire(void *knx);
14579989Sjlemonstatic int	filt_timerattach(struct knote *kn);
14679989Sjlemonstatic void	filt_timerdetach(struct knote *kn);
14779989Sjlemonstatic int	filt_timer(struct knote *kn, long hint);
148197241Sssonstatic int	filt_userattach(struct knote *kn);
149197241Sssonstatic void	filt_userdetach(struct knote *kn);
150197241Sssonstatic int	filt_user(struct knote *kn, long hint);
151197294Srdivackystatic void	filt_usertouch(struct knote *kn, struct kevent *kev,
152197407Srdivacky		    u_long type);
15372521Sjlemon
154197134Srwatsonstatic struct filterops file_filtops = {
155197134Srwatson	.f_isfd = 1,
156197134Srwatson	.f_attach = filt_fileattach,
157197134Srwatson};
158197134Srwatsonstatic struct filterops kqread_filtops = {
159197134Srwatson	.f_isfd = 1,
160197134Srwatson	.f_detach = filt_kqdetach,
161197134Srwatson	.f_event = filt_kqueue,
162197134Srwatson};
163133741Sjmg/* XXX - move to kern_proc.c?  */
164197134Srwatsonstatic struct filterops proc_filtops = {
165197134Srwatson	.f_isfd = 0,
166197134Srwatson	.f_attach = filt_procattach,
167197134Srwatson	.f_detach = filt_procdetach,
168197134Srwatson	.f_event = filt_proc,
169197134Srwatson};
170197134Srwatsonstatic struct filterops timer_filtops = {
171197134Srwatson	.f_isfd = 0,
172197134Srwatson	.f_attach = filt_timerattach,
173197134Srwatson	.f_detach = filt_timerdetach,
174197134Srwatson	.f_event = filt_timer,
175197134Srwatson};
176197241Sssonstatic struct filterops user_filtops = {
177197241Ssson	.f_attach = filt_userattach,
178197241Ssson	.f_detach = filt_userdetach,
179197241Ssson	.f_event = filt_user,
180197241Ssson	.f_touch = filt_usertouch,
181197241Ssson};
18272521Sjlemon
18392751Sjeffstatic uma_zone_t	knote_zone;
18484138Sjlemonstatic int 		kq_ncallouts = 0;
18584138Sjlemonstatic int 		kq_calloutmax = (4 * 1024);
18684138SjlemonSYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
18784138Sjlemon    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
18859290Sjlemon
189133741Sjmg/* XXX - ensure not KN_INFLUX?? */
190133741Sjmg#define KNOTE_ACTIVATE(kn, islock) do { 				\
191133741Sjmg	if ((islock))							\
192133741Sjmg		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
193133741Sjmg	else								\
194133741Sjmg		KQ_LOCK((kn)->kn_kq);					\
195133741Sjmg	(kn)->kn_status |= KN_ACTIVE;					\
196133741Sjmg	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
197133741Sjmg		knote_enqueue((kn));					\
198133741Sjmg	if (!(islock))							\
199133741Sjmg		KQ_UNLOCK((kn)->kn_kq);					\
20059290Sjlemon} while(0)
201133741Sjmg#define KQ_LOCK(kq) do {						\
202133741Sjmg	mtx_lock(&(kq)->kq_lock);					\
203133741Sjmg} while (0)
204133741Sjmg#define KQ_FLUX_WAKEUP(kq) do {						\
205133741Sjmg	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
206133741Sjmg		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
207133741Sjmg		wakeup((kq));						\
208133741Sjmg	}								\
209133741Sjmg} while (0)
210133741Sjmg#define KQ_UNLOCK_FLUX(kq) do {						\
211133741Sjmg	KQ_FLUX_WAKEUP(kq);						\
212133741Sjmg	mtx_unlock(&(kq)->kq_lock);					\
213133741Sjmg} while (0)
214133741Sjmg#define KQ_UNLOCK(kq) do {						\
215133741Sjmg	mtx_unlock(&(kq)->kq_lock);					\
216133741Sjmg} while (0)
217133741Sjmg#define KQ_OWNED(kq) do {						\
218133741Sjmg	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
219133741Sjmg} while (0)
220133741Sjmg#define KQ_NOTOWNED(kq) do {						\
221133741Sjmg	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
222133741Sjmg} while (0)
223133741Sjmg#define KN_LIST_LOCK(kn) do {						\
224133741Sjmg	if (kn->kn_knlist != NULL)					\
225147730Sssouhlal		kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg);	\
226133741Sjmg} while (0)
227133741Sjmg#define KN_LIST_UNLOCK(kn) do {						\
228147730Sssouhlal	if (kn->kn_knlist != NULL) 					\
229147730Sssouhlal		kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg);	\
230133741Sjmg} while (0)
231147730Sssouhlal#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
232147730Sssouhlal	if (islocked)							\
233147730Sssouhlal		KNL_ASSERT_LOCKED(knl);				\
234147730Sssouhlal	else								\
235147730Sssouhlal		KNL_ASSERT_UNLOCKED(knl);				\
236147730Sssouhlal} while (0)
237147730Sssouhlal#ifdef INVARIANTS
238147730Sssouhlal#define	KNL_ASSERT_LOCKED(knl) do {					\
239193951Skib	knl->kl_assert_locked((knl)->kl_lockarg);			\
240147730Sssouhlal} while (0)
241193951Skib#define	KNL_ASSERT_UNLOCKED(knl) do {					\
242193951Skib	knl->kl_assert_unlocked((knl)->kl_lockarg);			\
243147730Sssouhlal} while (0)
244147730Sssouhlal#else /* !INVARIANTS */
245147730Sssouhlal#define	KNL_ASSERT_LOCKED(knl) do {} while(0)
246147730Sssouhlal#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
247147730Sssouhlal#endif /* INVARIANTS */
24859290Sjlemon
24959290Sjlemon#define	KN_HASHSIZE		64		/* XXX should be tunable */
25059290Sjlemon#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
25159290Sjlemon
/*
 * Attach routine for filter slots that have no backing implementation
 * (e.g. EVFILT_AIO on this kernel); always fails so that registering
 * such a filter returns an error to the caller.
 */
static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
}
25888633Salfred
259197134Srwatsonstruct filterops null_filtops = {
260197134Srwatson	.f_isfd = 0,
261197134Srwatson	.f_attach = filt_nullattach,
262197134Srwatson};
26388633Salfred
264133741Sjmg/* XXX - make SYSINIT to add these, and move into respective modules. */
26559290Sjlemonextern struct filterops sig_filtops;
266131562Salfredextern struct filterops fs_filtops;
26759290Sjlemon
26859290Sjlemon/*
26972521Sjlemon * Table for for all system-defined filters.
27059290Sjlemon */
/*
 * Lock protecting lookups of, and reference counting on, entries in
 * the sysfilt_ops table below.
 */
static struct mtx	filterops_lock;
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
	MTX_DEF);
/*
 * Table of system-defined filters.  for_refcnt tracks active users of
 * an entry (presumably for dynamically registered filterops -- see
 * kqueue_fo_find()/kqueue_fo_release(); bodies not in view, confirm).
 */
static struct {
	struct filterops *for_fop;
	int for_refcnt;
} sysfilt_ops[EVFILT_SYSCOUNT] = {
	{ &file_filtops },			/* EVFILT_READ */
	{ &file_filtops },			/* EVFILT_WRITE */
	{ &null_filtops },			/* EVFILT_AIO */
	{ &file_filtops },			/* EVFILT_VNODE */
	{ &proc_filtops },			/* EVFILT_PROC */
	{ &sig_filtops },			/* EVFILT_SIGNAL */
	{ &timer_filtops },			/* EVFILT_TIMER */
	{ &null_filtops },			/* former EVFILT_NETDEV */
	{ &fs_filtops },			/* EVFILT_FS */
	{ &null_filtops },			/* EVFILT_LIO */
	{ &user_filtops },			/* EVFILT_USER */
};
29059290Sjlemon
291133741Sjmg/*
292133741Sjmg * Simple redirection for all cdevsw style objects to call their fo_kqfilter
293133741Sjmg * method.
294133741Sjmg */
static int
filt_fileattach(struct knote *kn)
{

	/* Delegate the attach to the target file's own fo_kqfilter method. */
	return (fo_kqfilter(kn->kn_fp, kn));
}
30159290Sjlemon
/*ARGSUSED*/
/*
 * fo_kqfilter method for kqueue descriptors themselves: lets one kqueue
 * be monitored by another.  Only EVFILT_READ (events pending) is
 * supported; see filt_kqueue() for the event predicate.
 */
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);

	/* Hook onto the kqueue's own selinfo knote list. */
	kn->kn_status |= KN_KQUEUE;
	kn->kn_fop = &kqread_filtops;
	knlist_add(&kq->kq_sel.si_note, kn, 0);

	return (0);
}
31759290Sjlemon
/* Detach an EVFILT_READ knote from the monitored kqueue's knote list. */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	knlist_remove(&kq->kq_sel.si_note, kn, 0);
}
32559290Sjlemon
32659290Sjlemon/*ARGSUSED*/
32759290Sjlemonstatic int
32859290Sjlemonfilt_kqueue(struct knote *kn, long hint)
32959290Sjlemon{
330109153Sdillon	struct kqueue *kq = kn->kn_fp->f_data;
33159290Sjlemon
33259290Sjlemon	kn->kn_data = kq->kq_count;
33359290Sjlemon	return (kn->kn_data > 0);
33459290Sjlemon}
33559290Sjlemon
336133741Sjmg/* XXX - move to kern_proc.c?  */
/*
 * Attach an EVFILT_PROC knote to the process identified by kn_id.
 * pfind()/zpfind() return the process locked; the process lock is
 * dropped on every exit path.  Exit notes whose target is already a
 * zombie (or exiting) are activated immediately instead of attached,
 * so the caller still sees the NOTE_EXIT event.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	int immediate;
	int error;

	immediate = 0;
	p = pfind(kn->kn_id);
	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
		/* Target may already be a zombie; look it up there too. */
		p = zpfind(kn->kn_id);
		immediate = 1;
	} else if (p != NULL && (p->p_flag & P_WEXIT)) {
		/* Target is on its way out; don't attach to its klist. */
		immediate = 1;
	}

	if (p == NULL)
		return (ESRCH);
	if ((error = p_cansee(curthread, p))) {
		PROC_UNLOCK(p);
		return (error);
	}

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 * (NOTE_TRACK child registration from knote_fork()).
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;		/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	if (immediate == 0)
		knlist_add(&p->p_klist, kn, 1);

	/*
	 * Immediately activate any exit notes if the target process is a
	 * zombie.  This is necessary to handle the case where the target
	 * process, e.g. a child, dies before the kevent is registered.
	 */
	if (immediate && filt_proc(kn, NOTE_EXIT))
		KNOTE_ACTIVATE(kn, 0);

	PROC_UNLOCK(p);

	return (0);
}
38759290Sjlemon
38859290Sjlemon/*
38959290Sjlemon * The knote may be attached to a different process, which may exit,
39059290Sjlemon * leaving nothing for the knote to be attached to.  So when the process
39159290Sjlemon * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
39259290Sjlemon * it will be deleted when read out.  However, as part of the knote deletion,
39359290Sjlemon * this routine is called, so a check is needed to avoid actually performing
39459290Sjlemon * a detach, because the original process does not exist any more.
39559290Sjlemon */
396133741Sjmg/* XXX - move to kern_proc.c?  */
/*
 * Detach from the target process's klist and drop our pointer to it
 * (filt_proc() may already have cleared the klist linkage on exit --
 * see the comment above).
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	p = kn->kn_ptr.p_proc;
	knlist_remove(&p->p_klist, kn, 0);
	kn->kn_ptr.p_proc = NULL;
}
40659290Sjlemon
407133741Sjmg/* XXX - move to kern_proc.c?  */
/*
 * Event predicate for EVFILT_PROC.  'hint' carries the NOTE_* event plus
 * extra data in the upper bits; record events the user asked for and,
 * on NOTE_EXIT, tear the knote away from the (disappearing) process.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	struct proc *p = kn->kn_ptr.p_proc;
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished.
	 */
	if (event == NOTE_EXIT) {
		if (!(kn->kn_status & KN_DETACHED))
			knlist_remove_inevent(&p->p_klist, kn);
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_ptr.p_proc = NULL;
		if (kn->kn_fflags & NOTE_EXIT)
			kn->kn_data = p->p_xstat;
		/* No recorded events of interest: drop the knote entirely. */
		if (kn->kn_fflags == 0)
			kn->kn_flags |= EV_DROP;
		return (1);
	}

	return (kn->kn_fflags != 0);
}
44259290Sjlemon
443180340Skib/*
444180340Skib * Called when the process forked. It mostly does the same as the
445180340Skib * knote(), activating all knotes registered to be activated when the
446180340Skib * process forked. Additionally, for each knote attached to the
447180340Skib * parent, check whether user wants to track the new process. If so
448180340Skib * attach a new knote to it, and immediately report an event with the
449180340Skib * child's pid.
450180340Skib */
void
knote_fork(struct knlist *list, int pid)
{
	struct kqueue *kq;
	struct knote *kn;
	struct kevent kev;
	int error;

	if (list == NULL)
		return;
	list->kl_lock(list->kl_lockarg);

	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
		/* Skip knotes another thread is already working on. */
		if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
			continue;
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		/* Re-check under the kq lock; KN_SCAN holders are OK. */
		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
			KQ_UNLOCK(kq);
			continue;
		}

		/*
		 * The same as knote(), activate the event.
		 */
		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
			kn->kn_status |= KN_HASKQLOCK;
			if (kn->kn_fop->f_event(kn, NOTE_FORK))
				KNOTE_ACTIVATE(kn, 1);
			kn->kn_status &= ~KN_HASKQLOCK;
			KQ_UNLOCK(kq);
			continue;
		}

		/*
		 * The NOTE_TRACK case. In addition to the activation
		 * of the event, we need to register new event to
		 * track the child. Drop the locks in preparation for
		 * the call to kqueue_register().
		 */
		kn->kn_status |= KN_INFLUX;
		KQ_UNLOCK(kq);
		list->kl_unlock(list->kl_lockarg);

		/*
		 * Activate existing knote and register a knote with
		 * new process.
		 */
		kev.ident = pid;
		kev.filter = kn->kn_filter;
		/* EV_FLAG1 marks this as a kernel-internal registration. */
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;		/* parent */
		kev.udata = kn->kn_kevent.udata;/* preserve udata */
		error = kqueue_register(kq, &kev, NULL, 0);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
		if (kn->kn_fop->f_event(kn, NOTE_FORK))
			KNOTE_ACTIVATE(kn, 0);
		KQ_LOCK(kq);
		kn->kn_status &= ~KN_INFLUX;
		KQ_UNLOCK_FLUX(kq);
		/* Reacquire before continuing the SLIST_FOREACH walk. */
		list->kl_lock(list->kl_lockarg);
	}
	list->kl_unlock(list->kl_lockarg);
}
51759290Sjlemon
518239915Sjhb/*
519239915Sjhb * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
520239915Sjhb * interval timer support code.
521239915Sjhb */
522133741Sjmgstatic int
523133741Sjmgtimertoticks(intptr_t data)
524133741Sjmg{
525133741Sjmg	struct timeval tv;
526133741Sjmg	int tticks;
527133741Sjmg
528133741Sjmg	tv.tv_sec = data / 1000;
529133741Sjmg	tv.tv_usec = (data % 1000) * 1000;
530133741Sjmg	tticks = tvtohz(&tv);
531133741Sjmg
532133741Sjmg	return tticks;
533133741Sjmg}
534133741Sjmg
/*
 * Callout handler for EVFILT_TIMER: count the expiration, activate the
 * knote, and (for periodic timers) re-arm ourselves.
 */
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct callout *calloutp;

	kn->kn_data++;
	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */

	/*
	 * timertoticks() uses tvtohz() which always adds 1 to allow
	 * for the time until the next clock interrupt being strictly
	 * less than 1 clock tick.  We don't want that here since we
	 * want to appear to be in sync with the clock interrupt even
	 * when we're delayed.
	 */
	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
		calloutp = (struct callout *)kn->kn_hook;
		callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1,
		    filt_timerexpire, kn);
	}
}
55779989Sjlemon
/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	struct callout *calloutp;

	/* Optimistically take a slot, back out if over the sysctl limit. */
	atomic_add_int(&kq_ncallouts, 1);

	if (kq_ncallouts >= kq_calloutmax) {
		atomic_add_int(&kq_ncallouts, -1);
		return (ENOMEM);
	}

	kn->kn_flags |= EV_CLEAR;		/* automatically set */
	kn->kn_status &= ~KN_DETACHED;		/* knlist_add usually sets it */
	calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
	callout_init(calloutp, CALLOUT_MPSAFE);
	kn->kn_hook = calloutp;
	callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
	    filt_timerexpire, kn);

	return (0);
}
58379989Sjlemon
/*
 * Tear down an EVFILT_TIMER knote: callout_drain() waits for a running
 * filt_timerexpire() to finish before the callout is freed.
 */
static void
filt_timerdetach(struct knote *kn)
{
	struct callout *calloutp;

	calloutp = (struct callout *)kn->kn_hook;
	callout_drain(calloutp);
	free(calloutp, M_KQUEUE);
	atomic_add_int(&kq_ncallouts, -1);
	kn->kn_status |= KN_DETACHED;	/* knlist_remove usually clears it */
}
59579989Sjlemon
/* A timer event is pending while it has unreported expirations. */
static int
filt_timer(struct knote *kn, long hint)
{

	return (kn->kn_data != 0);
}
60279989Sjlemon
603197241Sssonstatic int
604197241Sssonfilt_userattach(struct knote *kn)
605197241Ssson{
606197241Ssson
607197241Ssson	/*
608197241Ssson	 * EVFILT_USER knotes are not attached to anything in the kernel.
609197241Ssson	 */
610197241Ssson	kn->kn_hook = NULL;
611197241Ssson	if (kn->kn_fflags & NOTE_TRIGGER)
612197241Ssson		kn->kn_hookid = 1;
613197241Ssson	else
614197241Ssson		kn->kn_hookid = 0;
615197241Ssson	return (0);
616197241Ssson}
617197241Ssson
static void
filt_userdetach(__unused struct knote *kn)
{

	/*
	 * EVFILT_USER knotes are not attached to anything in the kernel,
	 * so there is nothing to undo here.
	 */
}
626197241Ssson
/* Pending iff NOTE_TRIGGER has been posted (latched in kn_hookid). */
static int
filt_user(struct knote *kn, __unused long hint)
{

	return (kn->kn_hookid);
}
633197241Ssson
634197241Sssonstatic void
635197407Srdivackyfilt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
636197241Ssson{
637197407Srdivacky	u_int ffctrl;
638197241Ssson
639197241Ssson	switch (type) {
640197241Ssson	case EVENT_REGISTER:
641197241Ssson		if (kev->fflags & NOTE_TRIGGER)
642197241Ssson			kn->kn_hookid = 1;
643197241Ssson
644197241Ssson		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
645197241Ssson		kev->fflags &= NOTE_FFLAGSMASK;
646197241Ssson		switch (ffctrl) {
647197241Ssson		case NOTE_FFNOP:
648197241Ssson			break;
649197241Ssson
650197241Ssson		case NOTE_FFAND:
651197241Ssson			kn->kn_sfflags &= kev->fflags;
652197241Ssson			break;
653197241Ssson
654197241Ssson		case NOTE_FFOR:
655197241Ssson			kn->kn_sfflags |= kev->fflags;
656197241Ssson			break;
657197241Ssson
658197241Ssson		case NOTE_FFCOPY:
659197241Ssson			kn->kn_sfflags = kev->fflags;
660197241Ssson			break;
661197241Ssson
662197241Ssson		default:
663197241Ssson			/* XXX Return error? */
664197241Ssson			break;
665197241Ssson		}
666197241Ssson		kn->kn_sdata = kev->data;
667197241Ssson		if (kev->flags & EV_CLEAR) {
668197241Ssson			kn->kn_hookid = 0;
669197241Ssson			kn->kn_data = 0;
670197241Ssson			kn->kn_fflags = 0;
671197241Ssson		}
672197241Ssson		break;
673197241Ssson
674197241Ssson        case EVENT_PROCESS:
675197241Ssson		*kev = kn->kn_kevent;
676197241Ssson		kev->fflags = kn->kn_sfflags;
677197241Ssson		kev->data = kn->kn_sdata;
678197241Ssson		if (kn->kn_flags & EV_CLEAR) {
679197241Ssson			kn->kn_hookid = 0;
680197241Ssson			kn->kn_data = 0;
681197241Ssson			kn->kn_fflags = 0;
682197241Ssson		}
683197241Ssson		break;
684197241Ssson
685197241Ssson	default:
686197241Ssson		panic("filt_usertouch() - invalid type (%ld)", type);
687197241Ssson		break;
688197241Ssson	}
689197241Ssson}
690197241Ssson
/*
 * kqueue(2) system call: allocate a new kqueue, link it onto the
 * filedesc's kqueue list, wrap it in a file of type DTYPE_KQUEUE and
 * return the new descriptor in td_retval[0].
 */
int
sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
	struct filedesc *fdp;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd, 0);
	if (error)
		goto done2;

	/* An extra reference on `nfp' has been held for us by falloc(). */
	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
	/* MTX_DUPOK: two kq locks may be held at once (see kq_global). */
	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
	TAILQ_INIT(&kq->kq_head);
	kq->kq_fdp = fdp;
	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);

	FILEDESC_XLOCK(fdp);
	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
	FILEDESC_XUNLOCK(fdp);

	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
	fdrop(fp, td);

	td->td_retval[0] = fd;
done2:
	return (error);
}
72359290Sjlemon
72459290Sjlemon#ifndef _SYS_SYSPROTO_H_
72559290Sjlemonstruct kevent_args {
72659290Sjlemon	int	fd;
72763977Speter	const struct kevent *changelist;
72859290Sjlemon	int	nchanges;
72963452Sjlemon	struct	kevent *eventlist;
73059290Sjlemon	int	nevents;
73163977Speter	const struct timespec *timeout;
73259290Sjlemon};
73359290Sjlemon#endif
/*
 * kevent(2) system call: copy in the optional timeout, set up the
 * user-space copyin/copyout operations, and hand off to kern_kevent().
 * Under KTRACE, the changelist and eventlist transfers are logged as
 * GENIO records.
 */
int
sys_kevent(struct thread *td, struct kevent_args *uap)
{
	struct timespec ts, *tsp;
	struct kevent_copyops k_ops = { uap,
					kevent_copyout,
					kevent_copyin};
	int error;
#ifdef KTRACE
	struct uio ktruio;
	struct iovec ktriov;
	struct uio *ktruioin = NULL;
	struct uio *ktruioout = NULL;
#endif

	if (uap->timeout != NULL) {
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO)) {
		/* Snapshot uio descriptions before kern_kevent() runs. */
		ktriov.iov_base = uap->changelist;
		ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
		    .uio_td = td };
		ktruioin = cloneuio(&ktruio);
		ktriov.iov_base = uap->eventlist;
		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
		ktruioout = cloneuio(&ktruio);
	}
#endif

	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
	    &k_ops, tsp);

#ifdef KTRACE
	if (ktruioin != NULL) {
		ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
		/* td_retval[0] holds the number of events returned. */
		ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
	}
#endif

	return (error);
}
785142934Sps
/*
 * Copy 'count' items into the destination list pointed to by uap->eventlist.
 * On success the eventlist cursor is advanced so successive calls append.
 */
static int
kevent_copyout(void *arg, struct kevent *kevp, int count)
{
	struct kevent_args *uap;
	int error;

	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
	uap = (struct kevent_args *)arg;

	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
	if (error == 0)
		uap->eventlist += count;
	return (error);
}
803142934Sps
/*
 * Copy 'count' items from the list pointed to by uap->changelist.
 * On success the changelist cursor is advanced so successive calls
 * consume the next batch.
 */
static int
kevent_copyin(void *arg, struct kevent *kevp, int count)
{
	struct kevent_args *uap;
	int error;

	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
	uap = (struct kevent_args *)arg;

	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
	if (error == 0)
		uap->changelist += count;
	return (error);
}
821146950Sps
/*
 * Common kevent(2) implementation: apply 'nchanges' filter changes and
 * then collect up to 'nevents' triggered events from the kqueue behind
 * descriptor 'fd'.  The change and event lists are accessed through the
 * copyin/copyout callbacks in 'k_ops', so callers may supply either
 * user-space or kernel-space buffers.  On success td->td_retval[0]
 * holds the number of events (or in-band errors) delivered.
 */
int
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
    struct kevent_copyops *k_ops, const struct timespec *timeout)
{
	struct kevent keva[KQ_NEVENTS];	/* on-stack staging buffer */
	struct kevent *kevp, *changes;
	struct kqueue *kq;
	struct file *fp;
	int i, n, nerrors, error;

	/* Hold the file and a kqueue reference for the whole call. */
	if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto done_norel;

	nerrors = 0;

	/* Process the changelist in KQ_NEVENTS-sized batches. */
	while (nchanges > 0) {
		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
		error = k_ops->k_copyin(k_ops->arg, keva, n);
		if (error)
			goto done;
		changes = keva;
		for (i = 0; i < n; i++) {
			kevp = &changes[i];
			/* Entries with no filter are silently skipped. */
			if (!kevp->filter)
				continue;
			/* Strip flags reserved for kernel-internal use. */
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp, td, 1);
			if (error || (kevp->flags & EV_RECEIPT)) {
				if (nevents != 0) {
					/*
					 * Report the failure (or EV_RECEIPT
					 * acknowledgement) in-band via the
					 * event list instead of aborting
					 * the whole syscall.
					 */
					kevp->flags = EV_ERROR;
					kevp->data = error;
					(void) k_ops->k_copyout(k_ops->arg,
					    kevp, 1);
					nevents--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		nchanges -= n;
	}
	if (nerrors) {
		/* Errors were delivered as events; report their count. */
		td->td_retval[0] = nerrors;
		error = 0;
		goto done;
	}

	/* No in-band errors: wait for and collect triggered events. */
	error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
done:
	kqueue_release(kq, 0);
done_norel:
	fdrop(fp, td);
	return (error);
}
87959290Sjlemon
88059290Sjlemonint
88188633Salfredkqueue_add_filteropts(int filt, struct filterops *filtops)
88288633Salfred{
883133741Sjmg	int error;
88488633Salfred
885201352Sbrooks	error = 0;
886133741Sjmg	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
887133741Sjmg		printf(
888133741Sjmg"trying to add a filterop that is out of range: %d is beyond %d\n",
889133741Sjmg		    ~filt, EVFILT_SYSCOUNT);
890133741Sjmg		return EINVAL;
891133741Sjmg	}
892133741Sjmg	mtx_lock(&filterops_lock);
893133741Sjmg	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
894133741Sjmg	    sysfilt_ops[~filt].for_fop != NULL)
895133741Sjmg		error = EEXIST;
896133741Sjmg	else {
897133741Sjmg		sysfilt_ops[~filt].for_fop = filtops;
898133741Sjmg		sysfilt_ops[~filt].for_refcnt = 0;
899133741Sjmg	}
900133741Sjmg	mtx_unlock(&filterops_lock);
901133741Sjmg
902201352Sbrooks	return (error);
90388633Salfred}
90488633Salfred
90588633Salfredint
90688633Salfredkqueue_del_filteropts(int filt)
90788633Salfred{
908133741Sjmg	int error;
90988633Salfred
910133741Sjmg	error = 0;
911133741Sjmg	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
912133741Sjmg		return EINVAL;
913133741Sjmg
914133741Sjmg	mtx_lock(&filterops_lock);
915133741Sjmg	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
916133741Sjmg	    sysfilt_ops[~filt].for_fop == NULL)
917133741Sjmg		error = EINVAL;
918133741Sjmg	else if (sysfilt_ops[~filt].for_refcnt != 0)
919133741Sjmg		error = EBUSY;
920133741Sjmg	else {
921133741Sjmg		sysfilt_ops[~filt].for_fop = &null_filtops;
922133741Sjmg		sysfilt_ops[~filt].for_refcnt = 0;
923133741Sjmg	}
924133741Sjmg	mtx_unlock(&filterops_lock);
925133741Sjmg
926133741Sjmg	return error;
92788633Salfred}
92888633Salfred
929133741Sjmgstatic struct filterops *
930133741Sjmgkqueue_fo_find(int filt)
931133741Sjmg{
932133741Sjmg
933133741Sjmg	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
934133741Sjmg		return NULL;
935133741Sjmg
936133741Sjmg	mtx_lock(&filterops_lock);
937133741Sjmg	sysfilt_ops[~filt].for_refcnt++;
938133741Sjmg	if (sysfilt_ops[~filt].for_fop == NULL)
939133741Sjmg		sysfilt_ops[~filt].for_fop = &null_filtops;
940133741Sjmg	mtx_unlock(&filterops_lock);
941133741Sjmg
942133741Sjmg	return sysfilt_ops[~filt].for_fop;
943133741Sjmg}
944133741Sjmg
/*
 * Drop the use reference on the filterops slot for system filter
 * 'filt' that was taken by kqueue_fo_find().  Out-of-range filters
 * are ignored, mirroring kqueue_fo_find() returning NULL for them.
 */
static void
kqueue_fo_release(int filt)
{

	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
		return;

	mtx_lock(&filterops_lock);
	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
	    ("filter object refcount not valid on release"));
	sysfilt_ops[~filt].for_refcnt--;
	mtx_unlock(&filterops_lock);
}
958133741Sjmg
/*
 * Apply one kevent change record 'kev' to 'kq': create, modify,
 * enable/disable, or delete the matching knote.
 *
 * A ref to kq (obtained via kqueue_acquire) must be held.  waitok will
 * influence if memory allocation should wait.  Make sure it is 0 if you
 * hold any mutexes.
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
{
	struct filterops *fops;
	struct file *fp;
	struct knote *kn, *tkn;
	int error, filt, event;
	int haskqglobal, filedesc_unlock;

	fp = NULL;
	kn = NULL;
	error = 0;
	haskqglobal = 0;
	filedesc_unlock = 0;

	/* Take a use reference on the filter implementation. */
	filt = kev->filter;
	fops = kqueue_fo_find(filt);
	if (fops == NULL)
		return EINVAL;

	tkn = knote_alloc(waitok);		/* prevent waiting with locks */

findkn:
	if (fops->f_isfd) {
		/* fd-based filters: the ident is a file descriptor. */
		KASSERT(td != NULL, ("td is NULL"));
		error = fget(td, kev->ident, CAP_POLL_EVENT, &fp);
		if (error)
			goto done;

		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
		    kev->ident, 0) != 0) {
			/*
			 * Non-blocking expand failed; drop the file,
			 * retry the expansion with the caller's waitok,
			 * and restart the lookup from scratch.
			 */
			/* try again */
			fdrop(fp, td);
			fp = NULL;
			error = kqueue_expand(kq, fops, kev->ident, waitok);
			if (error)
				goto done;
			goto findkn;
		}

		if (fp->f_type == DTYPE_KQUEUE) {
			/*
			 * if we add some inteligence about what we are doing,
			 * we should be able to support events on ourselves.
			 * We need to know when we are doing this to prevent
			 * getting both the knlist lock and the kq lock since
			 * they are the same thing.
			 */
			if (fp->f_data == kq) {
				error = EINVAL;
				goto done;
			}

			/*
			 * Pre-lock the filedesc before the global
			 * lock mutex, see the comment in
			 * kqueue_close().
			 */
			FILEDESC_XLOCK(td->td_proc->p_fd);
			filedesc_unlock = 1;
			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
		}

		/* Search the per-fd knote list for a matching filter. */
		KQ_LOCK(kq);
		if (kev->ident < kq->kq_knlistsize) {
			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
				if (kev->filter == kn->kn_filter)
					break;
		}
	} else {
		/* Non-fd filters hash the ident into kq_knhash. */
		if ((kev->flags & EV_ADD) == EV_ADD)
			kqueue_expand(kq, fops, kev->ident, waitok);

		KQ_LOCK(kq);
		if (kq->kq_knhashmask != 0) {
			struct klist *list;

			list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	/* knote is in the process of changing, wait for it to stablize. */
	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
		if (filedesc_unlock) {
			FILEDESC_XUNLOCK(td->td_proc->p_fd);
			filedesc_unlock = 0;
		}
		kq->kq_state |= KQ_FLUXWAIT;
		/* PDROP: msleep releases kq_lock and does not reacquire it. */
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
		if (fp != NULL) {
			fdrop(fp, td);
			fp = NULL;
		}
		goto findkn;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kn == NULL) {
		if (kev->flags & EV_ADD) {
			/* Consume the pre-allocated knote, if any. */
			kn = tkn;
			tkn = NULL;
			if (kn == NULL) {
				KQ_UNLOCK(kq);
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;
			/*
			 * apply reference counts to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fops = NULL;
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;
			/* Action flags are transient; don't store them. */
			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
			    EV_ENABLE | EV_DISABLE);
			kn->kn_status = KN_INFLUX|KN_DETACHED;

			error = knote_attach(kn, kq);
			KQ_UNLOCK(kq);
			if (error != 0) {
				/* Let the done path free the knote. */
				tkn = kn;
				goto done;
			}

			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
				knote_drop(kn, td);
				goto done;
			}
			KN_LIST_LOCK(kn);
			goto done_ev_add;
		} else {
			/* No matching knote and the EV_ADD flag is not set. */
			KQ_UNLOCK(kq);
			error = ENOENT;
			goto done;
		}
	}

	if (kev->flags & EV_DELETE) {
		/* Mark in-flux so others wait while we tear it down. */
		kn->kn_status |= KN_INFLUX;
		KQ_UNLOCK(kq);
		if (!(kn->kn_status & KN_DETACHED))
			kn->kn_fop->f_detach(kn);
		knote_drop(kn, td);
		goto done;
	}

	/*
	 * The user may change some filter values after the initial EV_ADD,
	 * but doing so will not reset any filter which has already been
	 * triggered.
	 */
	kn->kn_status |= KN_INFLUX | KN_SCAN;
	KQ_UNLOCK(kq);
	KN_LIST_LOCK(kn);
	kn->kn_kevent.udata = kev->udata;
	if (!fops->f_isfd && fops->f_touch != NULL) {
		/* Filter supplies its own state-merge hook. */
		fops->f_touch(kn, kev, EVENT_REGISTER);
	} else {
		kn->kn_sfflags = kev->fflags;
		kn->kn_sdata = kev->data;
	}

	/*
	 * We can get here with kn->kn_knlist == NULL.  This can happen when
	 * the initial attach event decides that the event is "completed"
	 * already.  i.e. filt_procattach is called on a zombie process.  It
	 * will call filt_proc which will remove it from the list, and NULL
	 * kn_knlist.
	 */
done_ev_add:
	/* Poll the filter once and activate the knote if it fired. */
	event = kn->kn_fop->f_event(kn, 0);
	KQ_LOCK(kq);
	if (event)
		KNOTE_ACTIVATE(kn, 1);
	kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
	KN_LIST_UNLOCK(kn);

	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		kn->kn_status |= KN_DISABLED;
	}

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
	}
	KQ_UNLOCK_FLUX(kq);

done:
	/* Release everything still held, in lock-order-safe sequence. */
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
	if (filedesc_unlock)
		FILEDESC_XUNLOCK(td->td_proc->p_fd);
	if (fp != NULL)
		fdrop(fp, td);
	if (tkn != NULL)
		knote_free(tkn);
	if (fops != NULL)
		kqueue_fo_release(filt);
	return (error);
}
118359290Sjlemon
118459290Sjlemonstatic int
1185170029Srwatsonkqueue_acquire(struct file *fp, struct kqueue **kqp)
118659290Sjlemon{
1187133741Sjmg	int error;
118889306Salfred	struct kqueue *kq;
1189133741Sjmg
1190133741Sjmg	error = 0;
1191133741Sjmg
1192174988Sjeff	kq = fp->f_data;
1193174988Sjeff	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1194174988Sjeff		return (EBADF);
1195174988Sjeff	*kqp = kq;
1196174988Sjeff	KQ_LOCK(kq);
1197174988Sjeff	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1198133741Sjmg		KQ_UNLOCK(kq);
1199174988Sjeff		return (EBADF);
1200174988Sjeff	}
1201174988Sjeff	kq->kq_refcnt++;
1202174988Sjeff	KQ_UNLOCK(kq);
1203133741Sjmg
1204133741Sjmg	return error;
1205133741Sjmg}
1206133741Sjmg
1207133741Sjmgstatic void
1208133741Sjmgkqueue_release(struct kqueue *kq, int locked)
1209133741Sjmg{
1210133741Sjmg	if (locked)
1211133741Sjmg		KQ_OWNED(kq);
1212133741Sjmg	else
1213133741Sjmg		KQ_LOCK(kq);
1214133741Sjmg	kq->kq_refcnt--;
1215133741Sjmg	if (kq->kq_refcnt == 1)
1216133741Sjmg		wakeup(&kq->kq_refcnt);
1217133741Sjmg	if (!locked)
1218133741Sjmg		KQ_UNLOCK(kq);
1219133741Sjmg}
1220133741Sjmg
1221133741Sjmgstatic void
1222133741Sjmgkqueue_schedtask(struct kqueue *kq)
1223133741Sjmg{
1224133741Sjmg
1225133741Sjmg	KQ_OWNED(kq);
1226133741Sjmg	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1227133741Sjmg	    ("scheduling kqueue task while draining"));
1228133741Sjmg
1229133741Sjmg	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1230133741Sjmg		taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
1231133741Sjmg		kq->kq_state |= KQ_TASKSCHED;
1232133741Sjmg	}
1233133741Sjmg}
1234133741Sjmg
/*
 * Expand the kq to make sure we have storage for fops/ident pair.
 *
 * Return 0 on success (or no work necessary), return errno on failure.
 *
 * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
 * If kqueue_register is called from a non-fd context, there usually/should
 * be no locks held.
 */
static int
kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
	int waitok)
{
	struct klist *list, *tmp_knhash, *to_free;
	u_long tmp_knhashmask;
	int size;
	int fd;
	int mflag = waitok ? M_WAITOK : M_NOWAIT;

	KQ_NOTOWNED(kq);

	to_free = NULL;
	if (fops->f_isfd) {
		/* fd-indexed storage: grow kq_knlist to cover 'fd'. */
		fd = ident;
		if (kq->kq_knlistsize <= fd) {
			/* Round the new size up in KQEXTENT increments. */
			size = kq->kq_knlistsize;
			while (size <= fd)
				size += KQEXTENT;
			/* Allocate unlocked; revalidate under the lock. */
			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
			if (list == NULL)
				return ENOMEM;
			KQ_LOCK(kq);
			if (kq->kq_knlistsize > fd) {
				/* Lost the race: someone else expanded. */
				to_free = list;
				list = NULL;
			} else {
				if (kq->kq_knlist != NULL) {
					/* Preserve the old entries. */
					bcopy(kq->kq_knlist, list,
					    kq->kq_knlistsize * sizeof(*list));
					to_free = kq->kq_knlist;
					kq->kq_knlist = NULL;
				}
				/* Zero the newly added tail. */
				bzero((caddr_t)list +
				    kq->kq_knlistsize * sizeof(*list),
				    (size - kq->kq_knlistsize) * sizeof(*list));
				kq->kq_knlistsize = size;
				kq->kq_knlist = list;
			}
			KQ_UNLOCK(kq);
		}
	} else {
		/* Hash-indexed storage: create kq_knhash on first use. */
		if (kq->kq_knhashmask == 0) {
			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
			    &tmp_knhashmask);
			if (tmp_knhash == NULL)
				return ENOMEM;
			KQ_LOCK(kq);
			if (kq->kq_knhashmask == 0) {
				kq->kq_knhash = tmp_knhash;
				kq->kq_knhashmask = tmp_knhashmask;
			} else {
				/* Lost the race: discard our table. */
				to_free = tmp_knhash;
			}
			KQ_UNLOCK(kq);
		}
	}
	/* Free the loser's allocation (if any) with no locks held. */
	free(to_free, M_KQUEUE);

	KQ_NOTOWNED(kq);
	return 0;
}
1306133741Sjmg
1307133741Sjmgstatic void
1308133741Sjmgkqueue_task(void *arg, int pending)
1309133741Sjmg{
1310133741Sjmg	struct kqueue *kq;
1311133741Sjmg	int haskqglobal;
1312133741Sjmg
1313133741Sjmg	haskqglobal = 0;
1314133741Sjmg	kq = arg;
1315133741Sjmg
1316133741Sjmg	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1317133741Sjmg	KQ_LOCK(kq);
1318133741Sjmg
1319133741Sjmg	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1320133741Sjmg
1321133741Sjmg	kq->kq_state &= ~KQ_TASKSCHED;
1322133741Sjmg	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1323133741Sjmg		wakeup(&kq->kq_state);
1324133741Sjmg	}
1325133741Sjmg	KQ_UNLOCK(kq);
1326133741Sjmg	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1327133741Sjmg}
1328133741Sjmg
/*
 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
 * We treat KN_MARKER knotes as if they are INFLUX.
 *
 * Harvests up to 'maxevents' triggered events into 'keva' (a caller
 * buffer of at least KQ_NEVENTS entries) and delivers them through
 * k_ops->k_copyout.  'tsp' selects blocking behavior: NULL blocks
 * forever, zero polls, otherwise it is a relative timeout.  The count
 * of delivered events is returned in td->td_retval[0].
 */
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
    const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
	struct kevent *kevp;
	struct timeval atv, rtv, ttv;
	struct knote *kn, *marker;
	int count, timeout, nkev, error, influx;
	int haskqglobal, touch;

	count = maxevents;
	nkev = 0;
	error = 0;
	haskqglobal = 0;

	if (maxevents == 0)
		goto done_nl;

	if (tsp != NULL) {
		TIMESPEC_TO_TIMEVAL(&atv, tsp);
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nl;
		}
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			timeout = -1;	/* zero timeout == poll */
		else
			timeout = atv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&atv);
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* No timespec: block indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
		timeout = 0;
	}
	/* The marker bounds this scan pass through the queue. */
	marker = knote_alloc(1);
	if (marker == NULL) {
		error = ENOMEM;
		goto done_nl;
	}
	marker->kn_status = KN_MARKER;
	KQ_LOCK(kq);
	goto start;

retry:
	/* Recompute the remaining ticks against the deadline. */
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timeout = ttv.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tvtohz(&ttv);
	}

start:
	kevp = keva;
	if (kq->kq_count == 0) {
		/* Nothing queued: poll fails, otherwise sleep. */
		if (timeout < 0) {
			error = EWOULDBLOCK;
		} else {
			kq->kq_state |= KQ_SLEEP;
			error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
			    "kqread", timeout);
		}
		if (error == 0)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	influx = 0;
	while (count) {
		KQ_OWNED(kq);
		kn = TAILQ_FIRST(&kq->kq_head);

		/* Wait out foreign markers and in-flux knotes. */
		if ((kn->kn_status == KN_MARKER && kn != marker) ||
		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
			if (influx) {
				influx = 0;
				KQ_FLUX_WAKEUP(kq);
			}
			kq->kq_state |= KQ_FLUXWAIT;
			error = msleep(kq, &kq->kq_lock, PSOCK,
			    "kqflxwt", 0);
			continue;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
			/* Disabled knotes are dequeued, not reported. */
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if (kn == marker) {
			/* Reached our marker: this pass is complete. */
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		KASSERT((kn->kn_status & KN_INFLUX) == 0,
		    ("KN_INFLUX set when not suppose to be"));

		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
			/* EV_DROP: discard the knote without reporting it. */
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			continue;
		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
			/* EV_ONESHOT: report once, then destroy the knote. */
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			*kevp = kn->kn_kevent;
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			kn = NULL;
		} else {
			/* Normal knote: re-poll the filter, then report. */
			kn->kn_status |= KN_INFLUX | KN_SCAN;
			KQ_UNLOCK(kq);
			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
			KN_LIST_LOCK(kn);
			if (kn->kn_fop->f_event(kn, 0) == 0) {
				/* Filter no longer fires: deactivate. */
				KQ_LOCK(kq);
				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
				kn->kn_status &=
				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
				    KN_SCAN);
				kq->kq_count--;
				KN_LIST_UNLOCK(kn);
				influx = 1;
				continue;
			}
			touch = (!kn->kn_fop->f_isfd &&
			    kn->kn_fop->f_touch != NULL);
			if (touch)
				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
			else
				*kevp = kn->kn_kevent;
			KQ_LOCK(kq);
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
			if (kn->kn_flags & (EV_CLEAR |  EV_DISPATCH)) {
				/*
				 * Manually clear knotes who weren't
				 * 'touch'ed.
				 */
				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
					kn->kn_data = 0;
					kn->kn_fflags = 0;
				}
				if (kn->kn_flags & EV_DISPATCH)
					kn->kn_status |= KN_DISABLED;
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
				kq->kq_count--;
			} else
				/* Level-triggered: requeue behind marker. */
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);

			kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
			KN_LIST_UNLOCK(kn);
			influx = 1;
		}

		/* we are returning a copy to the user */
		kevp++;
		nkev++;
		count--;

		if (nkev == KQ_NEVENTS) {
			/* Staging buffer full: flush it to the caller. */
			influx = 0;
			KQ_UNLOCK_FLUX(kq);
			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
			nkev = 0;
			kevp = keva;
			KQ_LOCK(kq);
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
	KQ_OWNED(kq);
	KQ_UNLOCK_FLUX(kq);
	knote_free(marker);
done_nl:
	KQ_NOTOWNED(kq);
	/* Flush any events still buffered in keva. */
	if (nkev != 0)
		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
	td->td_retval[0] = maxevents - count;
	return (error);
}
154559290Sjlemon
154659290Sjlemon/*
154759290Sjlemon * XXX
154859290Sjlemon * This could be expanded to call kqueue_scan, if desired.
154959290Sjlemon */
155059290Sjlemon/*ARGSUSED*/
155159290Sjlemonstatic int
1552101941Srwatsonkqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
155383366Sjulian	int flags, struct thread *td)
155459290Sjlemon{
155559290Sjlemon	return (ENXIO);
155659290Sjlemon}
155759290Sjlemon
155859290Sjlemon/*ARGSUSED*/
155959290Sjlemonstatic int
1560101941Srwatsonkqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
156183366Sjulian	 int flags, struct thread *td)
156259290Sjlemon{
156359290Sjlemon	return (ENXIO);
156459290Sjlemon}
156559290Sjlemon
156659290Sjlemon/*ARGSUSED*/
156759290Sjlemonstatic int
1568175140Sjhbkqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1569175140Sjhb	struct thread *td)
1570175140Sjhb{
1571175140Sjhb
1572175140Sjhb	return (EINVAL);
1573175140Sjhb}
1574175140Sjhb
1575175140Sjhb/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
	struct ucred *active_cred, struct thread *td)
{
	/*
	 * Enabling sigio causes two major problems:
	 * 1) infinite recursion:
	 * Synopsys: kevent is being used to track signals and have FIOASYNC
	 * set.  On receipt of a signal this will cause a kqueue to recurse
	 * into itself over and over.  Sending the sigio causes the kqueue
	 * to become ready, which in turn posts sigio again, forever.
	 * Solution: this can be solved by setting a flag in the kqueue that
	 * we have a SIGIO in progress.
	 * 2) locking problems:
	 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
	 * us above the proc and pgrp locks.
	 * Solution: Post a signal using an async mechanism, being sure to
	 * record a generation count in the delivery so that we do not deliver
	 * a signal to the wrong process.
	 *
	 * Note, these two mechanisms are somewhat mutually exclusive!
	 */
#if 0
	struct kqueue *kq;

	kq = fp->f_data;
	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			kq->kq_state |= KQ_ASYNC;
		} else {
			kq->kq_state &= ~KQ_ASYNC;
		}
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &kq->kq_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(&kq->kq_sigio);
		return (0);
	}
#endif

	/* With the async code above disabled, no ioctls are supported. */
	return (ENOTTY);
}
162259290Sjlemon
162359290Sjlemon/*ARGSUSED*/
/*
 * poll(2) handler for a kqueue descriptor: readable when there are
 * queued events; otherwise record the selector for later wakeup.
 */
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
	struct thread *td)
{
	struct kqueue *kq;
	int revents = 0;
	int error;

	if ((error = kqueue_acquire(fp, &kq)))
		return POLLERR;

	KQ_LOCK(kq);
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			/* Events are pending: report readable now. */
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			/* Nothing pending: register for select/poll wakeup. */
			selrecord(td, &kq->kq_sel);
			if (SEL_WAITING(&kq->kq_sel))
				kq->kq_state |= KQ_SEL;
		}
	}
	/* Drop our reference while still holding the lock (locked == 1). */
	kqueue_release(kq, 1);
	KQ_UNLOCK(kq);
	return (revents);
}
164959290Sjlemon
165059290Sjlemon/*ARGSUSED*/
165159290Sjlemonstatic int
1652101983Srwatsonkqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1653101987Srwatson	struct thread *td)
165459290Sjlemon{
165559290Sjlemon
1656146603Sjmg	bzero((void *)st, sizeof *st);
1657146603Sjmg	/*
1658146603Sjmg	 * We no longer return kq_count because the unlocked value is useless.
1659146603Sjmg	 * If you spent all this time getting the count, why not spend your
1660146603Sjmg	 * syscall better by calling kevent?
1661146603Sjmg	 *
1662146603Sjmg	 * XXX - This is needed for libc_r.
1663146603Sjmg	 */
1664146603Sjmg	st->st_mode = S_IFIFO;
1665146603Sjmg	return (0);
166659290Sjlemon}
166759290Sjlemon
/*ARGSUSED*/
/*
 * fo_close method for a kqueue descriptor: drain all attached knotes and
 * tear the kqueue down.  Returns 0 on success, or the error from
 * kqueue_acquire() if the kqueue is already gone.
 */
static int
kqueue_close(struct file *fp, struct thread *td)
{
	struct kqueue *kq = fp->f_data;
	struct filedesc *fdp;
	struct knote *kn;
	int i;
	int error;
	int filedesc_unlock;

	if ((error = kqueue_acquire(fp, &kq)))
		return error;

	filedesc_unlock = 0;
	KQ_LOCK(kq);

	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
	    ("kqueue already closing"));
	kq->kq_state |= KQ_CLOSING;
	/* Wait for every other reference taken via kqueue_acquire() to drop. */
	if (kq->kq_refcnt > 1)
		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);

	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
	fdp = kq->kq_fdp;

	KASSERT(knlist_empty(&kq->kq_sel.si_note),
	    ("kqueue's knlist not empty"));

	/*
	 * Detach and drop every fd-based knote.  A knote marked KN_INFLUX
	 * is owned by some other thread; set KQ_FLUXWAIT and sleep until
	 * that thread wakes us, then re-examine the list head.
	 */
	for (i = 0; i < kq->kq_knlistsize; i++) {
		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
				continue;
			}
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			/* f_detach must run unless already detached. */
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
	}
	/* Same drain for the hash of non-fd (ident-keyed) knotes. */
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i <= kq->kq_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
				if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
					kq->kq_state |= KQ_FLUXWAIT;
					msleep(kq, &kq->kq_lock, PSOCK,
					       "kqclo2", 0);
					continue;
				}
				kn->kn_status |= KN_INFLUX;
				KQ_UNLOCK(kq);
				if (!(kn->kn_status & KN_DETACHED))
					kn->kn_fop->f_detach(kn);
				knote_drop(kn, td);
				KQ_LOCK(kq);
			}
		}
	}

	/* Wait for a scheduled kqueue task (see kqueue_schedtask) to finish. */
	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
		kq->kq_state |= KQ_TASKDRAIN;
		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
	}

	/* Kick any select()/poll() waiters off the dying kqueue. */
	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		selwakeuppri(&kq->kq_sel, PSOCK);
		if (!SEL_WAITING(&kq->kq_sel))
			kq->kq_state &= ~KQ_SEL;
	}

	KQ_UNLOCK(kq);

	/*
	 * We could be called due to the knote_drop() doing fdrop(),
	 * called from kqueue_register().  In this case the global
	 * lock is owned, and filedesc sx is locked before, to not
	 * take the sleepable lock after non-sleepable.
	 */
	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
		FILEDESC_XLOCK(fdp);
		filedesc_unlock = 1;
	} else
		filedesc_unlock = 0;
	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
	if (filedesc_unlock)
		FILEDESC_XUNLOCK(fdp);

	seldrain(&kq->kq_sel);
	knlist_destroy(&kq->kq_sel.si_note);
	mtx_destroy(&kq->kq_lock);
	kq->kq_fdp = NULL;

	if (kq->kq_knhash != NULL)
		free(kq->kq_knhash, M_KQUEUE);
	if (kq->kq_knlist != NULL)
		free(kq->kq_knlist, M_KQUEUE);

	funsetown(&kq->kq_sigio);
	free(kq, M_KQUEUE);
	fp->f_data = NULL;

	return (0);
}
177559290Sjlemon
177659290Sjlemonstatic void
177759290Sjlemonkqueue_wakeup(struct kqueue *kq)
177859290Sjlemon{
1779133741Sjmg	KQ_OWNED(kq);
178059290Sjlemon
1781133741Sjmg	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
178259290Sjlemon		kq->kq_state &= ~KQ_SLEEP;
178359290Sjlemon		wakeup(kq);
178459290Sjlemon	}
1785133741Sjmg	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1786122352Stanimura		selwakeuppri(&kq->kq_sel, PSOCK);
1787174647Sjeff		if (!SEL_WAITING(&kq->kq_sel))
1788174647Sjeff			kq->kq_state &= ~KQ_SEL;
178959290Sjlemon	}
1790133741Sjmg	if (!knlist_empty(&kq->kq_sel.si_note))
1791133741Sjmg		kqueue_schedtask(kq);
1792133741Sjmg	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1793132138Salfred		pgsigio(&kq->kq_sigio, SIGIO, 0);
1794132138Salfred	}
179559290Sjlemon}
179659290Sjlemon
/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * There is a possibility to optimize in the case of one kq watching another.
 * Instead of scheduling a task to wake it up, you could pass enough state
 * down the chain to make up the parent kqueue.  Make this code functional
 * first.
 */
/*
 * lockflags is a combination of:
 *   KNF_LISTLOCKED - caller already holds the knlist lock;
 *   KNF_NOKQLOCK   - f_event must not be called with the kq lock held
 *                    (the filter may sleep), so mark the knote KN_INFLUX
 *                    and drop the kq lock around the call instead.
 */
void
knote(struct knlist *list, long hint, int lockflags)
{
	struct kqueue *kq;
	struct knote *kn;
	int error;

	if (list == NULL)
		return;

	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);

	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_lock(list->kl_lockarg);

	/*
	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
	 * the kqueue scheduling, but this will introduce four
	 * lock/unlock's for each knote to test.  If we do, continue to use
	 * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
	 * only safe if you want to remove the current item, which we are
	 * not doing.
	 */
	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
			/*
			 * Do not process the influx notes, except for
			 * the influx coming from the kq unlock in the
			 * kqueue_scan().  In the later case, we do
			 * not interfere with the scan, since the code
			 * fragment in kqueue_scan() locks the knlist,
			 * and cannot proceed until we finished.
			 */
			KQ_UNLOCK(kq);
		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
			/* Filter may sleep: call it without the kq lock. */
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			error = kn->kn_fop->f_event(kn, hint);
			KQ_LOCK(kq);
			kn->kn_status &= ~KN_INFLUX;
			if (error)
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK_FLUX(kq);
		} else {
			/* Fast path: run f_event under the kq lock. */
			kn->kn_status |= KN_HASKQLOCK;
			if (kn->kn_fop->f_event(kn, hint))
				KNOTE_ACTIVATE(kn, 1);
			kn->kn_status &= ~KN_HASKQLOCK;
			KQ_UNLOCK(kq);
		}
	}
	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_unlock(list->kl_lockarg);
}
186159290Sjlemon
/*
 * add a knote to a knlist
 *
 * The knote must be KN_INFLUX and KN_DETACHED (caller owns it and it is
 * not currently on any list).  Clears KN_DETACHED once linked.
 */
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
	KNL_ASSERT_LOCK(knl, islocked);
	KQ_NOTOWNED(kn->kn_kq);
	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
	if (!islocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
	if (!islocked)
		knl->kl_unlock(knl->kl_lockarg);
	/* Record the backpointer and mark attached under the kq lock. */
	KQ_LOCK(kn->kn_kq);
	kn->kn_knlist = knl;
	kn->kn_status &= ~KN_DETACHED;
	KQ_UNLOCK(kn->kn_kq);
}
1882133741Sjmg
1883133741Sjmgstatic void
1884133741Sjmgknlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
1885133741Sjmg{
1886133741Sjmg	KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
1887147730Sssouhlal	KNL_ASSERT_LOCK(knl, knlislocked);
1888133741Sjmg	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
1889133741Sjmg	if (!kqislocked)
1890133741Sjmg		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
1891133741Sjmg    ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
1892133741Sjmg	if (!knlislocked)
1893147730Sssouhlal		knl->kl_lock(knl->kl_lockarg);
1894133741Sjmg	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
1895133741Sjmg	kn->kn_knlist = NULL;
1896133741Sjmg	if (!knlislocked)
1897147730Sssouhlal		knl->kl_unlock(knl->kl_lockarg);
1898133741Sjmg	if (!kqislocked)
1899133741Sjmg		KQ_LOCK(kn->kn_kq);
1900133741Sjmg	kn->kn_status |= KN_DETACHED;
1901133741Sjmg	if (!kqislocked)
1902133741Sjmg		KQ_UNLOCK(kn->kn_kq);
1903133741Sjmg}
1904133741Sjmg
/*
 * remove a knote from the specified knlist
 * (note: despite the older comment, this removes a single knote,
 * not all knotes on the list)
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{

	knlist_remove_kq(knl, kn, islocked, 0);
}
1914133741Sjmg
/*
 * remove knote from a specified klist while in f_event handler.
 *
 * The knlist lock is always held here; KN_HASKQLOCK tells us whether
 * knote() is calling f_event with the kq lock held as well.
 */
void
knlist_remove_inevent(struct knlist *knl, struct knote *kn)
{

	knlist_remove_kq(knl, kn, 1,
	    (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
}
1925133741Sjmg
/* Return non-zero if the (locked) knlist has no knotes attached. */
int
knlist_empty(struct knlist *knl)
{
	KNL_ASSERT_LOCKED(knl);
	return SLIST_EMPTY(&knl->kl_list);
}
1932133741Sjmg
/*
 * Fallback mutex (and its lock/unlock helpers below) used by knlists
 * whose owner does not supply a lock of its own.
 */
static struct mtx	knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
	MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);
1938133741Sjmg
/* Default kl_lock callback: arg is a plain mutex. */
static void
knlist_mtx_lock(void *arg)
{
	struct mtx *m = arg;

	mtx_lock(m);
}
1944147730Sssouhlal
/* Default kl_unlock callback: arg is a plain mutex. */
static void
knlist_mtx_unlock(void *arg)
{
	struct mtx *m = arg;

	mtx_unlock(m);
}
1950147730Sssouhlal
1951193951Skibstatic void
1952193951Skibknlist_mtx_assert_locked(void *arg)
1953147730Sssouhlal{
1954193951Skib	mtx_assert((struct mtx *)arg, MA_OWNED);
1955147730Sssouhlal}
1956147730Sssouhlal
1957193951Skibstatic void
1958193951Skibknlist_mtx_assert_unlocked(void *arg)
1959193951Skib{
1960193951Skib	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
1961193951Skib}
1962193951Skib
1963133741Sjmgvoid
1964147730Sssouhlalknlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
1965193951Skib    void (*kl_unlock)(void *),
1966193951Skib    void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
1967133741Sjmg{
1968133741Sjmg
1969147730Sssouhlal	if (lock == NULL)
1970147730Sssouhlal		knl->kl_lockarg = &knlist_lock;
1971133741Sjmg	else
1972147730Sssouhlal		knl->kl_lockarg = lock;
1973133741Sjmg
1974147730Sssouhlal	if (kl_lock == NULL)
1975147730Sssouhlal		knl->kl_lock = knlist_mtx_lock;
1976147730Sssouhlal	else
1977147730Sssouhlal		knl->kl_lock = kl_lock;
1978157582Sjmg	if (kl_unlock == NULL)
1979147730Sssouhlal		knl->kl_unlock = knlist_mtx_unlock;
1980147730Sssouhlal	else
1981147730Sssouhlal		knl->kl_unlock = kl_unlock;
1982193951Skib	if (kl_assert_locked == NULL)
1983193951Skib		knl->kl_assert_locked = knlist_mtx_assert_locked;
1984147730Sssouhlal	else
1985193951Skib		knl->kl_assert_locked = kl_assert_locked;
1986193951Skib	if (kl_assert_unlocked == NULL)
1987193951Skib		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
1988193951Skib	else
1989193951Skib		knl->kl_assert_unlocked = kl_assert_unlocked;
1990147730Sssouhlal
1991133741Sjmg	SLIST_INIT(&knl->kl_list);
1992133741Sjmg}
1993133741Sjmg
1994133741Sjmgvoid
1995193951Skibknlist_init_mtx(struct knlist *knl, struct mtx *lock)
1996193951Skib{
1997193951Skib
1998193951Skib	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
1999193951Skib}
2000193951Skib
2001193951Skibvoid
2002133741Sjmgknlist_destroy(struct knlist *knl)
2003133741Sjmg{
2004133741Sjmg
2005133741Sjmg#ifdef INVARIANTS
2006133741Sjmg	/*
2007133741Sjmg	 * if we run across this error, we need to find the offending
2008133741Sjmg	 * driver and have it call knlist_clear.
2009133741Sjmg	 */
2010133741Sjmg	if (!SLIST_EMPTY(&knl->kl_list))
2011133741Sjmg		printf("WARNING: destroying knlist w/ knotes on it!\n");
2012133741Sjmg#endif
2013133741Sjmg
2014147730Sssouhlal	knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
2015133741Sjmg	SLIST_INIT(&knl->kl_list);
2016133741Sjmg}
2017133741Sjmg
/*
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".
 */
/*
 * Detach every knote from the knlist.  If killkn is set the knotes are
 * dropped outright; otherwise they are flagged EV_EOF|EV_ONESHOT so the
 * next scan delivers and discards them.
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		/* Skip in-flux knotes; they are handled after the loop. */
		if ((kn->kn_status & KN_INFLUX)) {
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn->kn_status |= KN_INFLUX | KN_DETACHED;
			KQ_UNLOCK(kq);
			knote_drop(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still KN_INFLUX remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn->kn_status & KN_INFLUX,
		    ("knote removed w/o list lock"));
		/*
		 * Drop the list lock and sleep until the in-flux owner
		 * wakes us, then start over from the top.
		 */
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}
207759290Sjlemon
207859290Sjlemon/*
2079168355Srwatson * Remove all knotes referencing a specified fd must be called with FILEDESC
2080168355Srwatson * lock.  This prevents a race where a new fd comes along and occupies the
2081168355Srwatson * entry and we attach a knote to the fd.
208259290Sjlemon */
208359290Sjlemonvoid
208483366Sjulianknote_fdclose(struct thread *td, int fd)
208559290Sjlemon{
208683366Sjulian	struct filedesc *fdp = td->td_proc->p_fd;
2087133741Sjmg	struct kqueue *kq;
2088133741Sjmg	struct knote *kn;
2089133741Sjmg	int influx;
209059290Sjlemon
2091168355Srwatson	FILEDESC_XLOCK_ASSERT(fdp);
2092133741Sjmg
2093133741Sjmg	/*
2094133741Sjmg	 * We shouldn't have to worry about new kevents appearing on fd
2095133741Sjmg	 * since filedesc is locked.
2096133741Sjmg	 */
2097255729Skib	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2098133741Sjmg		KQ_LOCK(kq);
2099133741Sjmg
2100133741Sjmgagain:
2101133741Sjmg		influx = 0;
2102133741Sjmg		while (kq->kq_knlistsize > fd &&
2103133741Sjmg		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2104133741Sjmg			if (kn->kn_status & KN_INFLUX) {
2105133741Sjmg				/* someone else might be waiting on our knote */
2106133741Sjmg				if (influx)
2107133741Sjmg					wakeup(kq);
2108133741Sjmg				kq->kq_state |= KQ_FLUXWAIT;
2109133741Sjmg				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2110133741Sjmg				goto again;
2111133741Sjmg			}
2112133741Sjmg			kn->kn_status |= KN_INFLUX;
2113133741Sjmg			KQ_UNLOCK(kq);
2114134859Sjmg			if (!(kn->kn_status & KN_DETACHED))
2115134859Sjmg				kn->kn_fop->f_detach(kn);
2116133741Sjmg			knote_drop(kn, td);
2117133741Sjmg			influx = 1;
2118133741Sjmg			KQ_LOCK(kq);
2119133741Sjmg		}
2120133741Sjmg		KQ_UNLOCK_FLUX(kq);
2121133741Sjmg	}
212259290Sjlemon}
212359290Sjlemon
/*
 * Link a knote into its kqueue's lookup structure: the per-fd list for
 * fd-backed filters, or the ident hash otherwise.  Returns ENOMEM when
 * the backing array/hash has not been grown to cover this ident.
 * Called with the kq lock held and the knote marked KN_INFLUX.
 */
static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
	struct klist *list;

	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
	KQ_OWNED(kq);

	if (kn->kn_fop->f_isfd) {
		if (kn->kn_id >= kq->kq_knlistsize)
			return ENOMEM;
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		if (kq->kq_knhash == NULL)
			return ENOMEM;
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}

	SLIST_INSERT_HEAD(list, kn, kn_link);

	return 0;
}
214659290Sjlemon
214759290Sjlemon/*
2148151260Sambrisko * knote must already have been detached using the f_detach method.
2149133741Sjmg * no lock need to be held, it is assumed that the KN_INFLUX flag is set
2150133741Sjmg * to prevent other removal.
215159290Sjlemon */
215259290Sjlemonstatic void
215383366Sjulianknote_drop(struct knote *kn, struct thread *td)
215459290Sjlemon{
2155133741Sjmg	struct kqueue *kq;
215659290Sjlemon	struct klist *list;
215759290Sjlemon
2158133741Sjmg	kq = kn->kn_kq;
2159133741Sjmg
2160133741Sjmg	KQ_NOTOWNED(kq);
2161133741Sjmg	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
2162133741Sjmg	    ("knote_drop called without KN_INFLUX set in kn_status"));
2163133741Sjmg
2164133741Sjmg	KQ_LOCK(kq);
216559290Sjlemon	if (kn->kn_fop->f_isfd)
2166133741Sjmg		list = &kq->kq_knlist[kn->kn_id];
216759290Sjlemon	else
2168133741Sjmg		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
216959290Sjlemon
2170151260Sambrisko	if (!SLIST_EMPTY(list))
2171151260Sambrisko		SLIST_REMOVE(list, kn, knote, kn_link);
217259290Sjlemon	if (kn->kn_status & KN_QUEUED)
217359290Sjlemon		knote_dequeue(kn);
2174133741Sjmg	KQ_UNLOCK_FLUX(kq);
2175133741Sjmg
2176133741Sjmg	if (kn->kn_fop->f_isfd) {
2177133741Sjmg		fdrop(kn->kn_fp, td);
2178133741Sjmg		kn->kn_fp = NULL;
2179133741Sjmg	}
2180133741Sjmg	kqueue_fo_release(kn->kn_kevent.filter);
2181133741Sjmg	kn->kn_fop = NULL;
218259290Sjlemon	knote_free(kn);
218359290Sjlemon}
218459290Sjlemon
218559290Sjlemonstatic void
218659290Sjlemonknote_enqueue(struct knote *kn)
218759290Sjlemon{
218859290Sjlemon	struct kqueue *kq = kn->kn_kq;
218959290Sjlemon
2190133741Sjmg	KQ_OWNED(kn->kn_kq);
219159997Sjlemon	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
219259997Sjlemon
2193133590Srwatson	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
219459290Sjlemon	kn->kn_status |= KN_QUEUED;
219559290Sjlemon	kq->kq_count++;
219659290Sjlemon	kqueue_wakeup(kq);
219759290Sjlemon}
219859290Sjlemon
219959290Sjlemonstatic void
220059290Sjlemonknote_dequeue(struct knote *kn)
220159290Sjlemon{
220259290Sjlemon	struct kqueue *kq = kn->kn_kq;
220359290Sjlemon
2204133741Sjmg	KQ_OWNED(kn->kn_kq);
220559997Sjlemon	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
220659997Sjlemon
2207133590Srwatson	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
220859290Sjlemon	kn->kn_status &= ~KN_QUEUED;
220959290Sjlemon	kq->kq_count--;
221059290Sjlemon}
221159290Sjlemon
/* Create the UMA zone backing all knote allocations. */
static void
knote_init(void)
{

	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
222059290Sjlemon
222159290Sjlemonstatic struct knote *
2222133741Sjmgknote_alloc(int waitok)
222359290Sjlemon{
2224133741Sjmg	return ((struct knote *)uma_zalloc(knote_zone,
2225133741Sjmg	    (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
222659290Sjlemon}
222759290Sjlemon
222859290Sjlemonstatic void
222959290Sjlemonknote_free(struct knote *kn)
223059290Sjlemon{
2231133741Sjmg	if (kn != NULL)
2232133741Sjmg		uma_zfree(knote_zone, kn);
223359290Sjlemon}
2234162594Sjmg
/*
 * Register the kev w/ the kq specified by fd.
 *
 * Resolves fd to a kqueue (requiring CAP_POST_EVENT), registers the
 * event, and releases both references.  Returns 0 or an errno value.
 */
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
	struct kqueue *kq;
	struct file *fp;
	int error;

	if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto noacquire;

	error = kqueue_register(kq, kev, td, waitok);

	kqueue_release(kq, 0);

noacquire:
	fdrop(fp, td);

	return error;
}
2259