kern_event.c revision 197242
1/*-
2 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4 * Copyright (c) 2009 Apple, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/kern/kern_event.c 197242 2009-09-16 03:37:39Z sson $");
31
32#include "opt_ktrace.h"
33
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/kernel.h>
37#include <sys/lock.h>
38#include <sys/mutex.h>
39#include <sys/proc.h>
40#include <sys/malloc.h>
41#include <sys/unistd.h>
42#include <sys/file.h>
43#include <sys/filedesc.h>
44#include <sys/filio.h>
45#include <sys/fcntl.h>
46#include <sys/kthread.h>
47#include <sys/selinfo.h>
48#include <sys/queue.h>
49#include <sys/event.h>
50#include <sys/eventvar.h>
51#include <sys/poll.h>
52#include <sys/protosw.h>
53#include <sys/sigio.h>
54#include <sys/signalvar.h>
55#include <sys/socket.h>
56#include <sys/socketvar.h>
57#include <sys/stat.h>
58#include <sys/sysctl.h>
59#include <sys/sysproto.h>
60#include <sys/syscallsubr.h>
61#include <sys/taskqueue.h>
62#include <sys/uio.h>
63#ifdef KTRACE
64#include <sys/ktrace.h>
65#endif
66
67#include <vm/uma.h>
68
69static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
70
71/*
72 * This lock is used if multiple kq locks are required.  This possibly
73 * should be made into a per proc lock.
74 */
75static struct mtx	kq_global;
76MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
77#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
78	if (!haslck)				\
79		mtx_lock(lck);			\
80	haslck = 1;				\
81} while (0)
82#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
83	if (haslck)				\
84		mtx_unlock(lck);			\
85	haslck = 0;				\
86} while (0)
87
88TASKQUEUE_DEFINE_THREAD(kqueue);
89
90static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
91static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
92static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
93		    struct thread *td, int waitok);
94static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
95static void	kqueue_release(struct kqueue *kq, int locked);
96static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
97		    uintptr_t ident, int waitok);
98static void	kqueue_task(void *arg, int pending);
99static int	kqueue_scan(struct kqueue *kq, int maxevents,
100		    struct kevent_copyops *k_ops,
101		    const struct timespec *timeout,
102		    struct kevent *keva, struct thread *td);
103static void 	kqueue_wakeup(struct kqueue *kq);
104static struct filterops *kqueue_fo_find(int filt);
105static void	kqueue_fo_release(int filt);
106
107static fo_rdwr_t	kqueue_read;
108static fo_rdwr_t	kqueue_write;
109static fo_truncate_t	kqueue_truncate;
110static fo_ioctl_t	kqueue_ioctl;
111static fo_poll_t	kqueue_poll;
112static fo_kqfilter_t	kqueue_kqfilter;
113static fo_stat_t	kqueue_stat;
114static fo_close_t	kqueue_close;
115
116static struct fileops kqueueops = {
117	.fo_read = kqueue_read,
118	.fo_write = kqueue_write,
119	.fo_truncate = kqueue_truncate,
120	.fo_ioctl = kqueue_ioctl,
121	.fo_poll = kqueue_poll,
122	.fo_kqfilter = kqueue_kqfilter,
123	.fo_stat = kqueue_stat,
124	.fo_close = kqueue_close,
125};
126
127static int 	knote_attach(struct knote *kn, struct kqueue *kq);
128static void 	knote_drop(struct knote *kn, struct thread *td);
129static void 	knote_enqueue(struct knote *kn);
130static void 	knote_dequeue(struct knote *kn);
131static void 	knote_init(void);
132static struct 	knote *knote_alloc(int waitok);
133static void 	knote_free(struct knote *kn);
134
135static void	filt_kqdetach(struct knote *kn);
136static int	filt_kqueue(struct knote *kn, long hint);
137static int	filt_procattach(struct knote *kn);
138static void	filt_procdetach(struct knote *kn);
139static int	filt_proc(struct knote *kn, long hint);
140static int	filt_fileattach(struct knote *kn);
141static void	filt_timerexpire(void *knx);
142static int	filt_timerattach(struct knote *kn);
143static void	filt_timerdetach(struct knote *kn);
144static int	filt_timer(struct knote *kn, long hint);
145static int	filt_userattach(struct knote *kn);
146static void	filt_userdetach(struct knote *kn);
147static int	filt_user(struct knote *kn, long hint);
148static void	filt_usertouch(struct knote *kn, struct kevent *kev, long type);
149
150static struct filterops file_filtops = {
151	.f_isfd = 1,
152	.f_attach = filt_fileattach,
153};
154static struct filterops kqread_filtops = {
155	.f_isfd = 1,
156	.f_detach = filt_kqdetach,
157	.f_event = filt_kqueue,
158};
159/* XXX - move to kern_proc.c?  */
160static struct filterops proc_filtops = {
161	.f_isfd = 0,
162	.f_attach = filt_procattach,
163	.f_detach = filt_procdetach,
164	.f_event = filt_proc,
165};
166static struct filterops timer_filtops = {
167	.f_isfd = 0,
168	.f_attach = filt_timerattach,
169	.f_detach = filt_timerdetach,
170	.f_event = filt_timer,
171};
172static struct filterops user_filtops = {
173	.f_attach = filt_userattach,
174	.f_detach = filt_userdetach,
175	.f_event = filt_user,
176	.f_touch = filt_usertouch,
177};
178
179static uma_zone_t	knote_zone;
180static int 		kq_ncallouts = 0;
181static int 		kq_calloutmax = (4 * 1024);
182SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
183    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
184
185/* XXX - ensure not KN_INFLUX?? */
186#define KNOTE_ACTIVATE(kn, islock) do { 				\
187	if ((islock))							\
188		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
189	else								\
190		KQ_LOCK((kn)->kn_kq);					\
191	(kn)->kn_status |= KN_ACTIVE;					\
192	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
193		knote_enqueue((kn));					\
194	if (!(islock))							\
195		KQ_UNLOCK((kn)->kn_kq);					\
196} while (0)
197#define KQ_LOCK(kq) do {						\
198	mtx_lock(&(kq)->kq_lock);					\
199} while (0)
200#define KQ_FLUX_WAKEUP(kq) do {						\
201	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
202		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
203		wakeup((kq));						\
204	}								\
205} while (0)
206#define KQ_UNLOCK_FLUX(kq) do {						\
207	KQ_FLUX_WAKEUP(kq);						\
208	mtx_unlock(&(kq)->kq_lock);					\
209} while (0)
210#define KQ_UNLOCK(kq) do {						\
211	mtx_unlock(&(kq)->kq_lock);					\
212} while (0)
213#define KQ_OWNED(kq) do {						\
214	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
215} while (0)
216#define KQ_NOTOWNED(kq) do {						\
217	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
218} while (0)
219#define KN_LIST_LOCK(kn) do {						\
220	if (kn->kn_knlist != NULL)					\
221		kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg);	\
222} while (0)
223#define KN_LIST_UNLOCK(kn) do {						\
224	if (kn->kn_knlist != NULL) 					\
225		kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg);	\
226} while (0)
227#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
228	if (islocked)							\
229		KNL_ASSERT_LOCKED(knl);				\
230	else								\
231		KNL_ASSERT_UNLOCKED(knl);				\
232} while (0)
233#ifdef INVARIANTS
234#define	KNL_ASSERT_LOCKED(knl) do {					\
235	knl->kl_assert_locked((knl)->kl_lockarg);			\
236} while (0)
237#define	KNL_ASSERT_UNLOCKED(knl) do {					\
238	knl->kl_assert_unlocked((knl)->kl_lockarg);			\
239} while (0)
240#else /* !INVARIANTS */
241#define	KNL_ASSERT_LOCKED(knl) do {} while (0)
242#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
243#endif /* INVARIANTS */
244
245#define	KN_HASHSIZE		64		/* XXX should be tunable */
246#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
247
248static int
249filt_nullattach(struct knote *kn)
250{
251
252	return (ENXIO);
253}
254
255struct filterops null_filtops = {
256	.f_isfd = 0,
257	.f_attach = filt_nullattach,
258};
259
260/* XXX - make SYSINIT to add these, and move into respective modules. */
261extern struct filterops sig_filtops;
262extern struct filterops fs_filtops;
263
264/*
265 * Table for all system-defined filters.
266 */
267static struct mtx	filterops_lock;
268MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
269	MTX_DEF);
270static struct {
271	struct filterops *for_fop;
272	int for_refcnt;
273} sysfilt_ops[EVFILT_SYSCOUNT] = {
274	{ &file_filtops },			/* EVFILT_READ */
275	{ &file_filtops },			/* EVFILT_WRITE */
276	{ &null_filtops },			/* EVFILT_AIO */
277	{ &file_filtops },			/* EVFILT_VNODE */
278	{ &proc_filtops },			/* EVFILT_PROC */
279	{ &sig_filtops },			/* EVFILT_SIGNAL */
280	{ &timer_filtops },			/* EVFILT_TIMER */
281	{ &file_filtops },			/* EVFILT_NETDEV */
282	{ &fs_filtops },			/* EVFILT_FS */
283	{ &null_filtops },			/* EVFILT_LIO */
284	{ &user_filtops },			/* EVFILT_USER */
285};
286
287/*
288 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
289 * method.
290 */
291static int
292filt_fileattach(struct knote *kn)
293{
294
295	return (fo_kqfilter(kn->kn_fp, kn));
296}
297
298/*ARGSUSED*/
299static int
300kqueue_kqfilter(struct file *fp, struct knote *kn)
301{
302	struct kqueue *kq = kn->kn_fp->f_data;
303
304	if (kn->kn_filter != EVFILT_READ)
305		return (EINVAL);
306
307	kn->kn_status |= KN_KQUEUE;
308	kn->kn_fop = &kqread_filtops;
309	knlist_add(&kq->kq_sel.si_note, kn, 0);
310
311	return (0);
312}
313
314static void
315filt_kqdetach(struct knote *kn)
316{
317	struct kqueue *kq = kn->kn_fp->f_data;
318
319	knlist_remove(&kq->kq_sel.si_note, kn, 0);
320}
321
322/*ARGSUSED*/
323static int
324filt_kqueue(struct knote *kn, long hint)
325{
326	struct kqueue *kq = kn->kn_fp->f_data;
327
328	kn->kn_data = kq->kq_count;
329	return (kn->kn_data > 0);
330}
331
332/* XXX - move to kern_proc.c?  */
333static int
334filt_procattach(struct knote *kn)
335{
336	struct proc *p;
337	int immediate;
338	int error;
339
340	immediate = 0;
341	p = pfind(kn->kn_id);
342	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
343		p = zpfind(kn->kn_id);
344		immediate = 1;
345	} else if (p != NULL && (p->p_flag & P_WEXIT)) {
346		immediate = 1;
347	}
348
349	if (p == NULL)
350		return (ESRCH);
351	if ((error = p_cansee(curthread, p)))
352		return (error);
353
354	kn->kn_ptr.p_proc = p;
355	kn->kn_flags |= EV_CLEAR;		/* automatically set */
356
357	/*
358	 * internal flag indicating registration done by kernel
359	 */
360	if (kn->kn_flags & EV_FLAG1) {
361		kn->kn_data = kn->kn_sdata;		/* ppid */
362		kn->kn_fflags = NOTE_CHILD;
363		kn->kn_flags &= ~EV_FLAG1;
364	}
365
366	if (immediate == 0)
367		knlist_add(&p->p_klist, kn, 1);
368
369	/*
370	 * Immediately activate any exit notes if the target process is a
371	 * zombie.  This is necessary to handle the case where the target
372	 * process, e.g. a child, dies before the kevent is registered.
373	 */
374	if (immediate && filt_proc(kn, NOTE_EXIT))
375		KNOTE_ACTIVATE(kn, 0);
376
377	PROC_UNLOCK(p);
378
379	return (0);
380}
381
382/*
383 * The knote may be attached to a different process, which may exit,
384 * leaving nothing for the knote to be attached to.  So when the process
385 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
386 * it will be deleted when read out.  However, as part of the knote deletion,
387 * this routine is called, so a check is needed to avoid actually performing
388 * a detach, because the original process does not exist any more.
389 */
390/* XXX - move to kern_proc.c?  */
391static void
392filt_procdetach(struct knote *kn)
393{
394	struct proc *p;
395
396	p = kn->kn_ptr.p_proc;
397	knlist_remove(&p->p_klist, kn, 0);
398	kn->kn_ptr.p_proc = NULL;
399}
400
401/* XXX - move to kern_proc.c?  */
402static int
403filt_proc(struct knote *kn, long hint)
404{
405	struct proc *p = kn->kn_ptr.p_proc;
406	u_int event;
407
408	/*
409	 * mask off extra data
410	 */
411	event = (u_int)hint & NOTE_PCTRLMASK;
412
413	/*
414	 * if the user is interested in this event, record it.
415	 */
416	if (kn->kn_sfflags & event)
417		kn->kn_fflags |= event;
418
419	/*
420	 * process is gone, so flag the event as finished.
421	 */
422	if (event == NOTE_EXIT) {
423		if (!(kn->kn_status & KN_DETACHED))
424			knlist_remove_inevent(&p->p_klist, kn);
425		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
426		kn->kn_data = p->p_xstat;
427		kn->kn_ptr.p_proc = NULL;
428		return (1);
429	}
430
431	return (kn->kn_fflags != 0);
432}
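
/*
 * Illustrative sketch (not compiled, hence the #if 0): a minimal userland
 * program, assuming a FreeBSD environment with <sys/event.h>, that
 * exercises the EVFILT_PROC path above.  It registers a NOTE_EXIT knote
 * for a forked child; filt_proc() records the exit status in kn_data,
 * which comes back to userland in kev.data.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	pid_t pid;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	if ((pid = fork()) == -1)
		err(1, "fork");
	if (pid == 0) {
		sleep(1);
		_exit(7);
	}
	/* Watch the child for exit; EV_CLEAR is implied for EVFILT_PROC. */
	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent wait");
	/* kev.data carries p_xstat, i.e. a wait(2)-style status. */
	printf("pid %d exited, status %d\n", (int)kev.ident,
	    WEXITSTATUS((int)kev.data));
	waitpid(pid, NULL, 0);
	return (0);
}
#endif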
433
434/*
435 * Called when a process forks.  It mostly does the same as knote(),
436 * activating all knotes registered to fire when the process forks.
437 * Additionally, for each knote attached to the parent, check whether
438 * the user wants to track the new process.  If so, attach a new knote
439 * to the child and immediately report an event with the
440 * child's pid.
441 */
442void
443knote_fork(struct knlist *list, int pid)
444{
445	struct kqueue *kq;
446	struct knote *kn;
447	struct kevent kev;
448	int error;
449
450	if (list == NULL)
451		return;
452	list->kl_lock(list->kl_lockarg);
453
454	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
455		if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
456			continue;
457		kq = kn->kn_kq;
458		KQ_LOCK(kq);
459		if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
460			KQ_UNLOCK(kq);
461			continue;
462		}
463
464		/*
465		 * The same as knote(), activate the event.
466		 */
467		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
468			kn->kn_status |= KN_HASKQLOCK;
469			if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
470				KNOTE_ACTIVATE(kn, 1);
471			kn->kn_status &= ~KN_HASKQLOCK;
472			KQ_UNLOCK(kq);
473			continue;
474		}
475
476		/*
477		 * The NOTE_TRACK case. In addition to the activation
478		 * of the event, we need to register a new event to
479		 * track the child. Drop the locks in preparation for
480		 * the call to kqueue_register().
481		 */
482		kn->kn_status |= KN_INFLUX;
483		KQ_UNLOCK(kq);
484		list->kl_unlock(list->kl_lockarg);
485
486		/*
487		 * Activate existing knote and register a knote with
488		 * new process.
489		 */
490		kev.ident = pid;
491		kev.filter = kn->kn_filter;
492		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
493		kev.fflags = kn->kn_sfflags;
494		kev.data = kn->kn_id;		/* parent */
495		kev.udata = kn->kn_kevent.udata;/* preserve udata */
496		error = kqueue_register(kq, &kev, NULL, 0);
497		if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
498			KNOTE_ACTIVATE(kn, 0);
499		if (error)
500			kn->kn_fflags |= NOTE_TRACKERR;
501		KQ_LOCK(kq);
502		kn->kn_status &= ~KN_INFLUX;
503		KQ_UNLOCK_FLUX(kq);
504		list->kl_lock(list->kl_lockarg);
505	}
506	list->kl_unlock(list->kl_lockarg);
507}
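
/*
 * Illustrative sketch (not compiled): userland use of NOTE_TRACK as
 * handled by knote_fork() above, assuming a FreeBSD userland.  The parent
 * pid is watched with NOTE_TRACK; when it forks, the child is reported
 * through an automatically registered knote carrying NOTE_CHILD and the
 * parent pid in data, or NOTE_TRACKERR if that registration failed.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

/* Watch 'pid' and any children it forks; returns the kqueue fd. */
int
track_proc(pid_t pid)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
	    NOTE_TRACK | NOTE_FORK | NOTE_EXIT, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
	return (kq);
}

void
report(const struct kevent *kev)
{
	if (kev->fflags & NOTE_CHILD)
		printf("new child %d of %d\n", (int)kev->ident,
		    (int)kev->data);
	if (kev->fflags & NOTE_TRACKERR)
		printf("could not track a child of %d\n", (int)kev->ident);
	if (kev->fflags & NOTE_EXIT)
		printf("pid %d exited\n", (int)kev->ident);
}
#endif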
508
509static int
510timertoticks(intptr_t data)
511{
512	struct timeval tv;
513	int tticks;
514
515	tv.tv_sec = data / 1000;
516	tv.tv_usec = (data % 1000) * 1000;
517	tticks = tvtohz(&tv);
518
519	return tticks;
520}
521
522/* XXX - move to kern_timeout.c? */
523static void
524filt_timerexpire(void *knx)
525{
526	struct knote *kn = knx;
527	struct callout *calloutp;
528
529	kn->kn_data++;
530	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
531
532	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
533		calloutp = (struct callout *)kn->kn_hook;
534		callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
535		    filt_timerexpire, kn);
536	}
537}
538
539/*
540 * data contains amount of time to sleep, in milliseconds
541 */
542/* XXX - move to kern_timeout.c? */
543static int
544filt_timerattach(struct knote *kn)
545{
546	struct callout *calloutp;
547
548	atomic_add_int(&kq_ncallouts, 1);
549
550	if (kq_ncallouts >= kq_calloutmax) {
551		atomic_add_int(&kq_ncallouts, -1);
552		return (ENOMEM);
553	}
554
555	kn->kn_flags |= EV_CLEAR;		/* automatically set */
556	kn->kn_status &= ~KN_DETACHED;		/* knlist_add usually sets it */
557	calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
558	callout_init(calloutp, CALLOUT_MPSAFE);
559	kn->kn_hook = calloutp;
560	callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
561	    filt_timerexpire, kn);
562
563	return (0);
564}
565
566/* XXX - move to kern_timeout.c? */
567static void
568filt_timerdetach(struct knote *kn)
569{
570	struct callout *calloutp;
571
572	calloutp = (struct callout *)kn->kn_hook;
573	callout_drain(calloutp);
574	free(calloutp, M_KQUEUE);
575	atomic_add_int(&kq_ncallouts, -1);
576	kn->kn_status |= KN_DETACHED;	/* knlist_remove usually clears it */
577}
578
579/* XXX - move to kern_timeout.c? */
580static int
581filt_timer(struct knote *kn, long hint)
582{
583
584	return (kn->kn_data != 0);
585}
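
/*
 * Illustrative sketch (not compiled): userland view of the timer filter
 * above, assuming a FreeBSD userland.  kev.data is the period in
 * milliseconds (see timertoticks()); the filter is implicitly EV_CLEAR,
 * so on return kev.data counts the expirations since the last kevent()
 * call.  EV_ONESHOT stops the callout after the first expiration.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev;
	int i, kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* Periodic timer, ident 1, firing every 500 ms. */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
	for (i = 0; i < 4; i++) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent");
		printf("timer %d fired %d time(s)\n", (int)kev.ident,
		    (int)kev.data);
	}
	/* Tear the timer down; filt_timerdetach() drains the callout. */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_DELETE, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
	return (0);
}
#endif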
586
587static int
588filt_userattach(struct knote *kn)
589{
590
591	/*
592	 * EVFILT_USER knotes are not attached to anything in the kernel.
593	 */
594	kn->kn_hook = NULL;
595	if (kn->kn_fflags & NOTE_TRIGGER)
596		kn->kn_hookid = 1;
597	else
598		kn->kn_hookid = 0;
599	return (0);
600}
601
602static void
603filt_userdetach(__unused struct knote *kn)
604{
605
606	/*
607	 * EVFILT_USER knotes are not attached to anything in the kernel.
608	 */
609}
610
611static int
612filt_user(struct knote *kn, __unused long hint)
613{
614
615	return (kn->kn_hookid);
616}
617
618static void
619filt_usertouch(struct knote *kn, struct kevent *kev, long type)
620{
621	int ffctrl;
622
623	switch (type) {
624	case EVENT_REGISTER:
625		if (kev->fflags & NOTE_TRIGGER)
626			kn->kn_hookid = 1;
627
628		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
629		kev->fflags &= NOTE_FFLAGSMASK;
630		switch (ffctrl) {
631		case NOTE_FFNOP:
632			break;
633
634		case NOTE_FFAND:
635			kn->kn_sfflags &= kev->fflags;
636			break;
637
638		case NOTE_FFOR:
639			kn->kn_sfflags |= kev->fflags;
640			break;
641
642		case NOTE_FFCOPY:
643			kn->kn_sfflags = kev->fflags;
644			break;
645
646		default:
647			/* XXX Return error? */
648			break;
649		}
650		kn->kn_sdata = kev->data;
651		if (kev->flags & EV_CLEAR) {
652			kn->kn_hookid = 0;
653			kn->kn_data = 0;
654			kn->kn_fflags = 0;
655		}
656		break;
657
658	case EVENT_PROCESS:
659		*kev = kn->kn_kevent;
660		kev->fflags = kn->kn_sfflags;
661		kev->data = kn->kn_sdata;
662		if (kn->kn_flags & EV_CLEAR) {
663			kn->kn_hookid = 0;
664			kn->kn_data = 0;
665			kn->kn_fflags = 0;
666		}
667		break;
668
669	default:
670		panic("filt_usertouch() - invalid type (%ld)", type);
671		break;
672	}
673}
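
/*
 * Illustrative sketch (not compiled): userland use of EVFILT_USER and the
 * f_touch path above, assuming a FreeBSD userland where this filter is
 * available.  One thread waits in kevent(); another posts NOTE_TRIGGER,
 * which filt_usertouch() records in kn_hookid, making filt_user() return
 * true.  NOTE_FFCOPY demonstrates the fflags control operations.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int kq;

static void *
waiter(void *arg)
{
	struct kevent kev;

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent wait");
	printf("woken, fflags 0x%x, data %d\n",
	    kev.fflags & NOTE_FFLAGSMASK, (int)kev.data);
	return (NULL);
}

int
main(void)
{
	struct kevent kev;
	pthread_t t;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* Register the user event; EV_CLEAR rearms it after delivery. */
	EV_SET(&kev, 42, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	if (pthread_create(&t, NULL, waiter, NULL) != 0)
		err(1, "pthread_create");
	sleep(1);
	/* Trigger it, replacing the saved fflags and passing some data. */
	EV_SET(&kev, 42, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFCOPY | 0x1,
	    123, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent trigger");
	pthread_join(t, NULL);
	return (0);
}
#endif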
674
675int
676kqueue(struct thread *td, struct kqueue_args *uap)
677{
678	struct filedesc *fdp;
679	struct kqueue *kq;
680	struct file *fp;
681	int fd, error;
682
683	fdp = td->td_proc->p_fd;
684	error = falloc(td, &fp, &fd);
685	if (error)
686		goto done2;
687
688	/* An extra reference on `fp' has been held for us by falloc(). */
689	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
690	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
691	TAILQ_INIT(&kq->kq_head);
692	kq->kq_fdp = fdp;
693	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
694	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
695
696	FILEDESC_XLOCK(fdp);
697	SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
698	FILEDESC_XUNLOCK(fdp);
699
700	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
701	fdrop(fp, td);
702
703	td->td_retval[0] = fd;
704done2:
705	return (error);
706}
707
708#ifndef _SYS_SYSPROTO_H_
709struct kevent_args {
710	int	fd;
711	const struct kevent *changelist;
712	int	nchanges;
713	struct	kevent *eventlist;
714	int	nevents;
715	const struct timespec *timeout;
716};
717#endif
718int
719kevent(struct thread *td, struct kevent_args *uap)
720{
721	struct timespec ts, *tsp;
722	struct kevent_copyops k_ops = { uap,
723					kevent_copyout,
724					kevent_copyin};
725	int error;
726#ifdef KTRACE
727	struct uio ktruio;
728	struct iovec ktriov;
729	struct uio *ktruioin = NULL;
730	struct uio *ktruioout = NULL;
731#endif
732
733	if (uap->timeout != NULL) {
734		error = copyin(uap->timeout, &ts, sizeof(ts));
735		if (error)
736			return (error);
737		tsp = &ts;
738	} else
739		tsp = NULL;
740
741#ifdef KTRACE
742	if (KTRPOINT(td, KTR_GENIO)) {
743		ktriov.iov_base = uap->changelist;
744		ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
745		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
746		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
747		    .uio_td = td };
748		ktruioin = cloneuio(&ktruio);
749		ktriov.iov_base = uap->eventlist;
750		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
751		ktruioout = cloneuio(&ktruio);
752	}
753#endif
754
755	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
756	    &k_ops, tsp);
757
758#ifdef KTRACE
759	if (ktruioin != NULL) {
760		ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
761		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
762		ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
763		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
764	}
765#endif
766
767	return (error);
768}
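
/*
 * Illustrative sketch (not compiled): the userland contract implemented
 * by kevent() above, assuming a FreeBSD userland.  A NULL timeout blocks
 * indefinitely, an all-zero timespec polls (kqueue_scan() maps it to an
 * immediate EWOULDBLOCK that is reported as zero events), and a non-zero
 * timespec bounds the sleep.  Failed changes come back with EV_ERROR set
 * and the errno in data when room was left in the eventlist.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent chg, ev;
	struct timespec zero = { 0, 0 };
	int fds[2], kq, n;

	if ((kq = kqueue()) == -1 || pipe(fds) == -1)
		err(1, "setup");
	EV_SET(&chg, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	/* Register and poll in one call; nothing readable yet, so n == 0. */
	n = kevent(kq, &chg, 1, &ev, 1, &zero);
	if (n == -1)
		err(1, "kevent");
	else if (n == 1 && (ev.flags & EV_ERROR) != 0)
		errx(1, "registration failed: %d", (int)ev.data);
	printf("poll returned %d event(s)\n", n);
	(void)write(fds[1], "x", 1);
	/* NULL timeout: block until the pipe becomes readable. */
	n = kevent(kq, NULL, 0, &ev, 1, NULL);
	printf("got %d event(s), %d byte(s) ready\n", n, (int)ev.data);
	return (0);
}
#endif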
769
770/*
771 * Copy 'count' items into the destination list pointed to by uap->eventlist.
772 */
773static int
774kevent_copyout(void *arg, struct kevent *kevp, int count)
775{
776	struct kevent_args *uap;
777	int error;
778
779	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
780	uap = (struct kevent_args *)arg;
781
782	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
783	if (error == 0)
784		uap->eventlist += count;
785	return (error);
786}
787
788/*
789 * Copy 'count' items from the list pointed to by uap->changelist.
790 */
791static int
792kevent_copyin(void *arg, struct kevent *kevp, int count)
793{
794	struct kevent_args *uap;
795	int error;
796
797	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
798	uap = (struct kevent_args *)arg;
799
800	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
801	if (error == 0)
802		uap->changelist += count;
803	return (error);
804}
805
806int
807kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
808    struct kevent_copyops *k_ops, const struct timespec *timeout)
809{
810	struct kevent keva[KQ_NEVENTS];
811	struct kevent *kevp, *changes;
812	struct kqueue *kq;
813	struct file *fp;
814	int i, n, nerrors, error;
815
816	if ((error = fget(td, fd, &fp)) != 0)
817		return (error);
818	if ((error = kqueue_acquire(fp, &kq)) != 0)
819		goto done_norel;
820
821	nerrors = 0;
822
823	while (nchanges > 0) {
824		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
825		error = k_ops->k_copyin(k_ops->arg, keva, n);
826		if (error)
827			goto done;
828		changes = keva;
829		for (i = 0; i < n; i++) {
830			kevp = &changes[i];
831			if (!kevp->filter)
832				continue;
833			kevp->flags &= ~EV_SYSFLAGS;
834			error = kqueue_register(kq, kevp, td, 1);
835			if (error) {
836				if (nevents != 0) {
837					kevp->flags = EV_ERROR;
838					kevp->data = error;
839					(void) k_ops->k_copyout(k_ops->arg,
840					    kevp, 1);
841					nevents--;
842					nerrors++;
843				} else {
844					goto done;
845				}
846			}
847		}
848		nchanges -= n;
849	}
850	if (nerrors) {
851		td->td_retval[0] = nerrors;
852		error = 0;
853		goto done;
854	}
855
856	error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
857done:
858	kqueue_release(kq, 0);
859done_norel:
860	fdrop(fp, td);
861	return (error);
862}
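
/*
 * Illustrative sketch (not compiled): kern_kevent() is reached through a
 * struct kevent_copyops so that callers other than the native kevent()
 * syscall can supply their own transfer routines (an ABI emulation layer
 * converting foreign kevent layouts, for example).  The routines below
 * are hypothetical; only the k_ops plumbing mirrors the code above.
 */
#if 0
struct my_kevent_args {
	struct kevent	*changelist;	/* in-kernel source buffer */
	struct kevent	*eventlist;	/* in-kernel result buffer */
};

static int
my_kevent_copyin(void *arg, struct kevent *kevp, int count)
{
	struct my_kevent_args *ka = arg;

	bcopy(ka->changelist, kevp, count * sizeof(*kevp));
	ka->changelist += count;
	return (0);
}

static int
my_kevent_copyout(void *arg, struct kevent *kevp, int count)
{
	struct my_kevent_args *ka = arg;

	bcopy(kevp, ka->eventlist, count * sizeof(*kevp));
	ka->eventlist += count;
	return (0);
}

static int
my_kevent(struct thread *td, int fd, struct kevent *chg, int nchg,
    struct kevent *ev, int nev, const struct timespec *ts)
{
	struct my_kevent_args ka = { chg, ev };
	struct kevent_copyops k_ops = {
		.arg = &ka,
		.k_copyout = my_kevent_copyout,
		.k_copyin = my_kevent_copyin,
	};

	return (kern_kevent(td, fd, nchg, nev, &k_ops, ts));
}
#endif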
863
864int
865kqueue_add_filteropts(int filt, struct filterops *filtops)
866{
867	int error;
868	int error = 0;
869	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
870		printf(
871"trying to add a filterop that is out of range: %d is beyond %d\n",
872		    ~filt, EVFILT_SYSCOUNT);
873		return EINVAL;
874	}
875	mtx_lock(&filterops_lock);
876	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
877	    sysfilt_ops[~filt].for_fop != NULL)
878		error = EEXIST;
879	else {
880		sysfilt_ops[~filt].for_fop = filtops;
881		sysfilt_ops[~filt].for_refcnt = 0;
882	}
883	mtx_unlock(&filterops_lock);
884
885	return (error);
886}
887
888int
889kqueue_del_filteropts(int filt)
890{
891	int error;
892
893	error = 0;
894	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
895		return EINVAL;
896
897	mtx_lock(&filterops_lock);
898	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
899	    sysfilt_ops[~filt].for_fop == NULL)
900		error = EINVAL;
901	else if (sysfilt_ops[~filt].for_refcnt != 0)
902		error = EBUSY;
903	else {
904		sysfilt_ops[~filt].for_fop = &null_filtops;
905		sysfilt_ops[~filt].for_refcnt = 0;
906	}
907	mtx_unlock(&filterops_lock);
908
909	return error;
910}
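
/*
 * Illustrative sketch (not compiled): how a subsystem could claim one of
 * the placeholder slots in sysfilt_ops through the interface above, much
 * as the XXX note before the table suggests.  The filter name and
 * callbacks are invented; only kqueue_add_filteropts(),
 * kqueue_del_filteropts() and the struct filterops layout come from this
 * file.
 */
#if 0
static int	filt_fooattach(struct knote *kn);
static void	filt_foodetach(struct knote *kn);
static int	filt_foo(struct knote *kn, long hint);

static struct filterops foo_filtops = {
	.f_isfd = 0,
	.f_attach = filt_fooattach,
	.f_detach = filt_foodetach,
	.f_event = filt_foo,
};

static void
foo_filter_init(void *arg __unused)
{

	/* Claim the EVFILT_LIO slot, currently &null_filtops above. */
	if (kqueue_add_filteropts(EVFILT_LIO, &foo_filtops) != 0)
		printf("foo: filter slot already in use\n");
}

static void
foo_filter_uninit(void *arg __unused)
{

	/* Fails with EBUSY while any knote still references the filter. */
	(void)kqueue_del_filteropts(EVFILT_LIO);
}
#endif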
911
912static struct filterops *
913kqueue_fo_find(int filt)
914{
915
916	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
917		return NULL;
918
919	mtx_lock(&filterops_lock);
920	sysfilt_ops[~filt].for_refcnt++;
921	if (sysfilt_ops[~filt].for_fop == NULL)
922		sysfilt_ops[~filt].for_fop = &null_filtops;
923	mtx_unlock(&filterops_lock);
924
925	return sysfilt_ops[~filt].for_fop;
926}
927
928static void
929kqueue_fo_release(int filt)
930{
931
932	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
933		return;
934
935	mtx_lock(&filterops_lock);
936	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
937	    ("filter object refcount not valid on release"));
938	sysfilt_ops[~filt].for_refcnt--;
939	mtx_unlock(&filterops_lock);
940}
941
942/*
943 * A ref to kq (obtained via kqueue_acquire) must be held.  waitok
944 * determines whether memory allocations may sleep; make sure it is 0 if
945 * you hold any mutexes.
946 */
947static int
948kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
949{
950	struct filterops *fops;
951	struct file *fp;
952	struct knote *kn, *tkn;
953	int error, filt, event;
954	int haskqglobal;
955
956	fp = NULL;
957	kn = NULL;
958	error = 0;
959	haskqglobal = 0;
960
961	filt = kev->filter;
962	fops = kqueue_fo_find(filt);
963	if (fops == NULL)
964		return EINVAL;
965
966	tkn = knote_alloc(waitok);		/* prevent waiting with locks */
967
968findkn:
969	if (fops->f_isfd) {
970		KASSERT(td != NULL, ("td is NULL"));
971		error = fget(td, kev->ident, &fp);
972		if (error)
973			goto done;
974
975		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
976		    kev->ident, 0) != 0) {
977			/* try again */
978			fdrop(fp, td);
979			fp = NULL;
980			error = kqueue_expand(kq, fops, kev->ident, waitok);
981			if (error)
982				goto done;
983			goto findkn;
984		}
985
986		if (fp->f_type == DTYPE_KQUEUE) {
987			/*
988			 * If we add some intelligence about what we are doing,
989			 * we should be able to support events on ourselves.
990			 * We need to know when we are doing this to prevent
991			 * getting both the knlist lock and the kq lock since
992			 * they are the same thing.
993			 */
994			if (fp->f_data == kq) {
995				error = EINVAL;
996				goto done;
997			}
998
999			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1000		}
1001
1002		KQ_LOCK(kq);
1003		if (kev->ident < kq->kq_knlistsize) {
1004			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1005				if (kev->filter == kn->kn_filter)
1006					break;
1007		}
1008	} else {
1009		if ((kev->flags & EV_ADD) == EV_ADD)
1010			kqueue_expand(kq, fops, kev->ident, waitok);
1011
1012		KQ_LOCK(kq);
1013		if (kq->kq_knhashmask != 0) {
1014			struct klist *list;
1015
1016			list = &kq->kq_knhash[
1017			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1018			SLIST_FOREACH(kn, list, kn_link)
1019				if (kev->ident == kn->kn_id &&
1020				    kev->filter == kn->kn_filter)
1021					break;
1022		}
1023	}
1024
1025	/* knote is in the process of changing, wait for it to stabilize. */
1026	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1027		if (fp != NULL) {
1028			fdrop(fp, td);
1029			fp = NULL;
1030		}
1031		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1032		kq->kq_state |= KQ_FLUXWAIT;
1033		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1034		goto findkn;
1035	}
1036
1037	/*
1038	 * kn now contains the matching knote, or NULL if no match
1039	 */
1040	if (kn == NULL) {
1041		if (kev->flags & EV_ADD) {
1042			kn = tkn;
1043			tkn = NULL;
1044			if (kn == NULL) {
1045				KQ_UNLOCK(kq);
1046				error = ENOMEM;
1047				goto done;
1048			}
1049			kn->kn_fp = fp;
1050			kn->kn_kq = kq;
1051			kn->kn_fop = fops;
1052			/*
1053			 * apply reference counts to knote structure, and
1054			 * do not release it at the end of this routine.
1055			 */
1056			fops = NULL;
1057			fp = NULL;
1058
1059			kn->kn_sfflags = kev->fflags;
1060			kn->kn_sdata = kev->data;
1061			kev->fflags = 0;
1062			kev->data = 0;
1063			kn->kn_kevent = *kev;
1064			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1065			    EV_ENABLE | EV_DISABLE);
1066			kn->kn_status = KN_INFLUX|KN_DETACHED;
1067
1068			error = knote_attach(kn, kq);
1069			KQ_UNLOCK(kq);
1070			if (error != 0) {
1071				tkn = kn;
1072				goto done;
1073			}
1074
1075			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1076				knote_drop(kn, td);
1077				goto done;
1078			}
1079			KN_LIST_LOCK(kn);
1080			goto done_ev_add;
1081		} else {
1082			/* No matching knote and the EV_ADD flag is not set. */
1083			KQ_UNLOCK(kq);
1084			error = ENOENT;
1085			goto done;
1086		}
1087	}
1088
1089	if (kev->flags & EV_DELETE) {
1090		kn->kn_status |= KN_INFLUX;
1091		KQ_UNLOCK(kq);
1092		if (!(kn->kn_status & KN_DETACHED))
1093			kn->kn_fop->f_detach(kn);
1094		knote_drop(kn, td);
1095		goto done;
1096	}
1097
1098	/*
1099	 * The user may change some filter values after the initial EV_ADD,
1100	 * but doing so will not reset any filter which has already been
1101	 * triggered.
1102	 */
1103	kn->kn_status |= KN_INFLUX;
1104	KQ_UNLOCK(kq);
1105	KN_LIST_LOCK(kn);
1106	kn->kn_kevent.udata = kev->udata;
1107	if (!fops->f_isfd && fops->f_touch != NULL) {
1108		fops->f_touch(kn, kev, EVENT_REGISTER);
1109	} else {
1110		kn->kn_sfflags = kev->fflags;
1111		kn->kn_sdata = kev->data;
1112	}
1113
1114	/*
1115	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1116	 * the initial attach event decides that the event is "completed"
1117	 * already, e.g. when filt_procattach is called on a zombie process.
1118	 * It will call filt_proc, which removes the knote from the list and
1119	 * NULLs kn_knlist.
1120	 */
1121done_ev_add:
1122	event = kn->kn_fop->f_event(kn, 0);
1123	KQ_LOCK(kq);
1124	if (event)
1125		KNOTE_ACTIVATE(kn, 1);
1126	kn->kn_status &= ~KN_INFLUX;
1127	KN_LIST_UNLOCK(kn);
1128
1129	if ((kev->flags & EV_DISABLE) &&
1130	    ((kn->kn_status & KN_DISABLED) == 0)) {
1131		kn->kn_status |= KN_DISABLED;
1132	}
1133
1134	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
1135		kn->kn_status &= ~KN_DISABLED;
1136		if ((kn->kn_status & KN_ACTIVE) &&
1137		    ((kn->kn_status & KN_QUEUED) == 0))
1138			knote_enqueue(kn);
1139	}
1140	KQ_UNLOCK_FLUX(kq);
1141
1142done:
1143	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1144	if (fp != NULL)
1145		fdrop(fp, td);
1146	if (tkn != NULL)
1147		knote_free(tkn);
1148	if (fops != NULL)
1149		kqueue_fo_release(filt);
1150	return (error);
1151}
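
/*
 * Illustrative sketch (not compiled): the re-registration behaviour of
 * kqueue_register() as seen from a FreeBSD userland program.  Re-adding
 * an existing <ident, filter> pair does not create a second knote; it
 * updates udata and the saved fflags/data (or calls f_touch), and
 * EV_DISABLE/EV_ENABLE/EV_DELETE act on the knote that is already there.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>

/* Pause or resume delivery of read events for 'fd' on kqueue 'kq'. */
static void
set_read_event(int kq, int fd, int enable, void *udata)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ,
	    EV_ADD | (enable ? EV_ENABLE : EV_DISABLE), 0, 0, udata);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
}
#endif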
1152
1153static int
1154kqueue_acquire(struct file *fp, struct kqueue **kqp)
1155{
1156	int error;
1157	struct kqueue *kq;
1158
1159	error = 0;
1160
1161	kq = fp->f_data;
1162	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1163		return (EBADF);
1164	*kqp = kq;
1165	KQ_LOCK(kq);
1166	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1167		KQ_UNLOCK(kq);
1168		return (EBADF);
1169	}
1170	kq->kq_refcnt++;
1171	KQ_UNLOCK(kq);
1172
1173	return error;
1174}
1175
1176static void
1177kqueue_release(struct kqueue *kq, int locked)
1178{
1179	if (locked)
1180		KQ_OWNED(kq);
1181	else
1182		KQ_LOCK(kq);
1183	kq->kq_refcnt--;
1184	if (kq->kq_refcnt == 1)
1185		wakeup(&kq->kq_refcnt);
1186	if (!locked)
1187		KQ_UNLOCK(kq);
1188}
1189
1190static void
1191kqueue_schedtask(struct kqueue *kq)
1192{
1193
1194	KQ_OWNED(kq);
1195	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1196	    ("scheduling kqueue task while draining"));
1197
1198	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1199		taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
1200		kq->kq_state |= KQ_TASKSCHED;
1201	}
1202}
1203
1204/*
1205 * Expand the kq to make sure we have storage for fops/ident pair.
1206 *
1207 * Return 0 on success (or no work necessary), return errno on failure.
1208 *
1209 * Not calling hashinit with the proper malloc flag for waitok should be
1210 * safe: if kqueue_register is called from a non-fd context, there usually
1211 * should be no locks held.
1212 */
1213static int
1214kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1215	int waitok)
1216{
1217	struct klist *list, *tmp_knhash;
1218	u_long tmp_knhashmask;
1219	int size;
1220	int fd;
1221	int mflag = waitok ? M_WAITOK : M_NOWAIT;
1222
1223	KQ_NOTOWNED(kq);
1224
1225	if (fops->f_isfd) {
1226		fd = ident;
1227		if (kq->kq_knlistsize <= fd) {
1228			size = kq->kq_knlistsize;
1229			while (size <= fd)
1230				size += KQEXTENT;
1231			list = malloc(size * sizeof list, M_KQUEUE, mflag);
1232			if (list == NULL)
1233				return ENOMEM;
1234			KQ_LOCK(kq);
1235			if (kq->kq_knlistsize > fd) {
1236				free(list, M_KQUEUE);
1237				list = NULL;
1238			} else {
1239				if (kq->kq_knlist != NULL) {
1240					bcopy(kq->kq_knlist, list,
1241					    kq->kq_knlistsize * sizeof list);
1242					free(kq->kq_knlist, M_KQUEUE);
1243					kq->kq_knlist = NULL;
1244				}
1245				bzero((caddr_t)list +
1246				    kq->kq_knlistsize * sizeof list,
1247				    (size - kq->kq_knlistsize) * sizeof list);
1248				kq->kq_knlistsize = size;
1249				kq->kq_knlist = list;
1250			}
1251			KQ_UNLOCK(kq);
1252		}
1253	} else {
1254		if (kq->kq_knhashmask == 0) {
1255			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1256			    &tmp_knhashmask);
1257			if (tmp_knhash == NULL)
1258				return ENOMEM;
1259			KQ_LOCK(kq);
1260			if (kq->kq_knhashmask == 0) {
1261				kq->kq_knhash = tmp_knhash;
1262				kq->kq_knhashmask = tmp_knhashmask;
1263			} else {
1264				free(tmp_knhash, M_KQUEUE);
1265			}
1266			KQ_UNLOCK(kq);
1267		}
1268	}
1269
1270	KQ_NOTOWNED(kq);
1271	return 0;
1272}
1273
1274static void
1275kqueue_task(void *arg, int pending)
1276{
1277	struct kqueue *kq;
1278	int haskqglobal;
1279
1280	haskqglobal = 0;
1281	kq = arg;
1282
1283	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1284	KQ_LOCK(kq);
1285
1286	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1287
1288	kq->kq_state &= ~KQ_TASKSCHED;
1289	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1290		wakeup(&kq->kq_state);
1291	}
1292	KQ_UNLOCK(kq);
1293	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1294}
1295
1296/*
1297 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1298 * We treat KN_MARKER knotes as if they are INFLUX.
1299 */
1300static int
1301kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1302    const struct timespec *tsp, struct kevent *keva, struct thread *td)
1303{
1304	struct kevent *kevp;
1305	struct timeval atv, rtv, ttv;
1306	struct knote *kn, *marker;
1307	int count, timeout, nkev, error, influx;
1308	int haskqglobal, touch;
1309
1310	count = maxevents;
1311	nkev = 0;
1312	error = 0;
1313	haskqglobal = 0;
1314
1315	if (maxevents == 0)
1316		goto done_nl;
1317
1318	if (tsp != NULL) {
1319		TIMESPEC_TO_TIMEVAL(&atv, tsp);
1320		if (itimerfix(&atv)) {
1321			error = EINVAL;
1322			goto done_nl;
1323		}
1324		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
1325			timeout = -1;
1326		else
1327			timeout = atv.tv_sec > 24 * 60 * 60 ?
1328			    24 * 60 * 60 * hz : tvtohz(&atv);
1329		getmicrouptime(&rtv);
1330		timevaladd(&atv, &rtv);
1331	} else {
1332		atv.tv_sec = 0;
1333		atv.tv_usec = 0;
1334		timeout = 0;
1335	}
1336	marker = knote_alloc(1);
1337	if (marker == NULL) {
1338		error = ENOMEM;
1339		goto done_nl;
1340	}
1341	marker->kn_status = KN_MARKER;
1342	KQ_LOCK(kq);
1343	goto start;
1344
1345retry:
1346	if (atv.tv_sec || atv.tv_usec) {
1347		getmicrouptime(&rtv);
1348		if (timevalcmp(&rtv, &atv, >=))
1349			goto done;
1350		ttv = atv;
1351		timevalsub(&ttv, &rtv);
1352		timeout = ttv.tv_sec > 24 * 60 * 60 ?
1353			24 * 60 * 60 * hz : tvtohz(&ttv);
1354	}
1355
1356start:
1357	kevp = keva;
1358	if (kq->kq_count == 0) {
1359		if (timeout < 0) {
1360			error = EWOULDBLOCK;
1361		} else {
1362			kq->kq_state |= KQ_SLEEP;
1363			error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
1364			    "kqread", timeout);
1365		}
1366		if (error == 0)
1367			goto retry;
1368		/* don't restart after signals... */
1369		if (error == ERESTART)
1370			error = EINTR;
1371		else if (error == EWOULDBLOCK)
1372			error = 0;
1373		goto done;
1374	}
1375
1376	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1377	influx = 0;
1378	while (count) {
1379		KQ_OWNED(kq);
1380		kn = TAILQ_FIRST(&kq->kq_head);
1381
1382		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1383		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1384			if (influx) {
1385				influx = 0;
1386				KQ_FLUX_WAKEUP(kq);
1387			}
1388			kq->kq_state |= KQ_FLUXWAIT;
1389			error = msleep(kq, &kq->kq_lock, PSOCK,
1390			    "kqflxwt", 0);
1391			continue;
1392		}
1393
1394		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1395		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1396			kn->kn_status &= ~KN_QUEUED;
1397			kq->kq_count--;
1398			continue;
1399		}
1400		if (kn == marker) {
1401			KQ_FLUX_WAKEUP(kq);
1402			if (count == maxevents)
1403				goto retry;
1404			goto done;
1405		}
1406		KASSERT((kn->kn_status & KN_INFLUX) == 0,
1407		    ("KN_INFLUX set when not supposed to be"));
1408
1409		if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1410			kn->kn_status &= ~KN_QUEUED;
1411			kn->kn_status |= KN_INFLUX;
1412			kq->kq_count--;
1413			KQ_UNLOCK(kq);
1414			/*
1415			 * We don't need to lock the list since we've marked
1416			 * it _INFLUX.
1417			 */
1418			*kevp = kn->kn_kevent;
1419			if (!(kn->kn_status & KN_DETACHED))
1420				kn->kn_fop->f_detach(kn);
1421			knote_drop(kn, td);
1422			KQ_LOCK(kq);
1423			kn = NULL;
1424		} else {
1425			kn->kn_status |= KN_INFLUX;
1426			KQ_UNLOCK(kq);
1427			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1428				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1429			KN_LIST_LOCK(kn);
1430			if (kn->kn_fop->f_event(kn, 0) == 0) {
1431				KQ_LOCK(kq);
1432				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1433				kn->kn_status &=
1434				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
1435				kq->kq_count--;
1436				KN_LIST_UNLOCK(kn);
1437				influx = 1;
1438				continue;
1439			}
1440			touch = (!kn->kn_fop->f_isfd &&
1441			    kn->kn_fop->f_touch != NULL);
1442			if (touch)
1443				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
1444			else
1445				*kevp = kn->kn_kevent;
1446			KQ_LOCK(kq);
1447			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1448			if (kn->kn_flags & (EV_CLEAR |  EV_DISPATCH)) {
1449				/*
1450				 * Manually clear knotes that were not
1451				 * 'touched'.
1452				 */
1453				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
1454					kn->kn_data = 0;
1455					kn->kn_fflags = 0;
1456				}
1457				if (kn->kn_flags & EV_DISPATCH)
1458					kn->kn_status |= KN_DISABLED;
1459				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1460				kq->kq_count--;
1461			} else
1462				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1463
1464			kn->kn_status &= ~(KN_INFLUX);
1465			KN_LIST_UNLOCK(kn);
1466			influx = 1;
1467		}
1468
1469		/* we are returning a copy to the user */
1470		kevp++;
1471		nkev++;
1472		count--;
1473
1474		if (nkev == KQ_NEVENTS) {
1475			influx = 0;
1476			KQ_UNLOCK_FLUX(kq);
1477			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1478			nkev = 0;
1479			kevp = keva;
1480			KQ_LOCK(kq);
1481			if (error)
1482				break;
1483		}
1484	}
1485	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1486done:
1487	KQ_OWNED(kq);
1488	KQ_UNLOCK_FLUX(kq);
1489	knote_free(marker);
1490done_nl:
1491	KQ_NOTOWNED(kq);
1492	if (nkev != 0)
1493		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1494	td->td_retval[0] = maxevents - count;
1495	return (error);
1496}
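
/*
 * Illustrative sketch (not compiled): the EV_CLEAR/EV_DISPATCH handling
 * at the end of kqueue_scan() from a userland point of view, assuming a
 * FreeBSD userland.  EV_CLEAR resets the reported state after copyout;
 * EV_DISPATCH additionally disables the knote until the caller re-enables
 * it, the usual pattern for handing events to worker threads.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>

/* Register 'fd' so each readable event is delivered exactly once... */
static void
add_dispatch_read(int kq, int fd)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
}

/* ...and call this once the worker has consumed the data. */
static void
rearm_read(int kq, int fd)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
}
#endif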
1497
1498/*
1499 * XXX
1500 * This could be expanded to call kqueue_scan, if desired.
1501 */
1502/*ARGSUSED*/
1503static int
1504kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
1505	int flags, struct thread *td)
1506{
1507	return (ENXIO);
1508}
1509
1510/*ARGSUSED*/
1511static int
1512kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
1513	 int flags, struct thread *td)
1514{
1515	return (ENXIO);
1516}
1517
1518/*ARGSUSED*/
1519static int
1520kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1521	struct thread *td)
1522{
1523
1524	return (EINVAL);
1525}
1526
1527/*ARGSUSED*/
1528static int
1529kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1530	struct ucred *active_cred, struct thread *td)
1531{
1532	/*
1533	 * Enabling sigio causes two major problems:
1534	 * 1) infinite recursion:
1535	 * Synopsis: kevent is being used to track signals and has FIOASYNC
1536	 * set.  On receipt of a signal this will cause a kqueue to recurse
1537	 * into itself over and over.  Sending the sigio causes the kqueue
1538	 * to become ready, which in turn posts sigio again, forever.
1539	 * Solution: this can be solved by setting a flag in the kqueue that
1540	 * we have a SIGIO in progress.
1541	 * 2) locking problems:
1542	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
1543	 * us above the proc and pgrp locks.
1544	 * Solution: Post a signal using an async mechanism, being sure to
1545	 * record a generation count in the delivery so that we do not deliver
1546	 * a signal to the wrong process.
1547	 *
1548	 * Note, these two mechanisms are somewhat mutually exclusive!
1549	 */
1550#if 0
1551	struct kqueue *kq;
1552
1553	kq = fp->f_data;
1554	switch (cmd) {
1555	case FIOASYNC:
1556		if (*(int *)data) {
1557			kq->kq_state |= KQ_ASYNC;
1558		} else {
1559			kq->kq_state &= ~KQ_ASYNC;
1560		}
1561		return (0);
1562
1563	case FIOSETOWN:
1564		return (fsetown(*(int *)data, &kq->kq_sigio));
1565
1566	case FIOGETOWN:
1567		*(int *)data = fgetown(&kq->kq_sigio);
1568		return (0);
1569	}
1570#endif
1571
1572	return (ENOTTY);
1573}
1574
1575/*ARGSUSED*/
1576static int
1577kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1578	struct thread *td)
1579{
1580	struct kqueue *kq;
1581	int revents = 0;
1582	int error;
1583
1584	if ((error = kqueue_acquire(fp, &kq)))
1585		return POLLERR;
1586
1587	KQ_LOCK(kq);
1588	if (events & (POLLIN | POLLRDNORM)) {
1589		if (kq->kq_count) {
1590			revents |= events & (POLLIN | POLLRDNORM);
1591		} else {
1592			selrecord(td, &kq->kq_sel);
1593			if (SEL_WAITING(&kq->kq_sel))
1594				kq->kq_state |= KQ_SEL;
1595		}
1596	}
1597	kqueue_release(kq, 1);
1598	KQ_UNLOCK(kq);
1599	return (revents);
1600}
1601
1602/*ARGSUSED*/
1603static int
1604kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1605	struct thread *td)
1606{
1607
1608	bzero((void *)st, sizeof *st);
1609	/*
1610	 * We no longer return kq_count because the unlocked value is useless.
1611	 * If you spent all this time getting the count, why not spend your
1612	 * syscall better by calling kevent?
1613	 *
1614	 * XXX - This is needed for libc_r.
1615	 */
1616	st->st_mode = S_IFIFO;
1617	return (0);
1618}
1619
1620/*ARGSUSED*/
1621static int
1622kqueue_close(struct file *fp, struct thread *td)
1623{
1624	struct kqueue *kq = fp->f_data;
1625	struct filedesc *fdp;
1626	struct knote *kn;
1627	int i;
1628	int error;
1629
1630	if ((error = kqueue_acquire(fp, &kq)))
1631		return error;
1632
1633	KQ_LOCK(kq);
1634
1635	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
1636	    ("kqueue already closing"));
1637	kq->kq_state |= KQ_CLOSING;
1638	if (kq->kq_refcnt > 1)
1639		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
1640
1641	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
1642	fdp = kq->kq_fdp;
1643
1644	KASSERT(knlist_empty(&kq->kq_sel.si_note),
1645	    ("kqueue's knlist not empty"));
1646
1647	for (i = 0; i < kq->kq_knlistsize; i++) {
1648		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
1649			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1650				kq->kq_state |= KQ_FLUXWAIT;
1651				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
1652				continue;
1653			}
1654			kn->kn_status |= KN_INFLUX;
1655			KQ_UNLOCK(kq);
1656			if (!(kn->kn_status & KN_DETACHED))
1657				kn->kn_fop->f_detach(kn);
1658			knote_drop(kn, td);
1659			KQ_LOCK(kq);
1660		}
1661	}
1662	if (kq->kq_knhashmask != 0) {
1663		for (i = 0; i <= kq->kq_knhashmask; i++) {
1664			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
1665				if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1666					kq->kq_state |= KQ_FLUXWAIT;
1667					msleep(kq, &kq->kq_lock, PSOCK,
1668					       "kqclo2", 0);
1669					continue;
1670				}
1671				kn->kn_status |= KN_INFLUX;
1672				KQ_UNLOCK(kq);
1673				if (!(kn->kn_status & KN_DETACHED))
1674					kn->kn_fop->f_detach(kn);
1675				knote_drop(kn, td);
1676				KQ_LOCK(kq);
1677			}
1678		}
1679	}
1680
1681	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
1682		kq->kq_state |= KQ_TASKDRAIN;
1683		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
1684	}
1685
1686	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1687		selwakeuppri(&kq->kq_sel, PSOCK);
1688		if (!SEL_WAITING(&kq->kq_sel))
1689			kq->kq_state &= ~KQ_SEL;
1690	}
1691
1692	KQ_UNLOCK(kq);
1693
1694	FILEDESC_XLOCK(fdp);
1695	SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
1696	FILEDESC_XUNLOCK(fdp);
1697
1698	knlist_destroy(&kq->kq_sel.si_note);
1699	mtx_destroy(&kq->kq_lock);
1700	kq->kq_fdp = NULL;
1701
1702	if (kq->kq_knhash != NULL)
1703		free(kq->kq_knhash, M_KQUEUE);
1704	if (kq->kq_knlist != NULL)
1705		free(kq->kq_knlist, M_KQUEUE);
1706
1707	funsetown(&kq->kq_sigio);
1708	free(kq, M_KQUEUE);
1709	fp->f_data = NULL;
1710
1711	return (0);
1712}
1713
1714static void
1715kqueue_wakeup(struct kqueue *kq)
1716{
1717	KQ_OWNED(kq);
1718
1719	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
1720		kq->kq_state &= ~KQ_SLEEP;
1721		wakeup(kq);
1722	}
1723	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1724		selwakeuppri(&kq->kq_sel, PSOCK);
1725		if (!SEL_WAITING(&kq->kq_sel))
1726			kq->kq_state &= ~KQ_SEL;
1727	}
1728	if (!knlist_empty(&kq->kq_sel.si_note))
1729		kqueue_schedtask(kq);
1730	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1731		pgsigio(&kq->kq_sigio, SIGIO, 0);
1732	}
1733}
1734
1735/*
1736 * Walk down a list of knotes, activating them if their event has triggered.
1737 *
1738 * There is a possibility to optimize in the case of one kq watching another.
1739 * Instead of scheduling a task to wake it up, you could pass enough state
1740 * down the chain to make up the parent kqueue.  Make this code functional
1741 * first.
1742 */
1743void
1744knote(struct knlist *list, long hint, int lockflags)
1745{
1746	struct kqueue *kq;
1747	struct knote *kn;
1748	int error;
1749
1750	if (list == NULL)
1751		return;
1752
1753	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
1754
1755	if ((lockflags & KNF_LISTLOCKED) == 0)
1756		list->kl_lock(list->kl_lockarg);
1757
1758	/*
1759	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
1760	 * the kqueue scheduling, but this will introduce four
1761	 * lock/unlock pairs for each knote to test.  If we do, continue to
1762	 * use SLIST_FOREACH; SLIST_FOREACH_SAFE does not help in our case,
1763	 * since it only matters when the current item is being removed,
1764	 * which we are not doing.
1765	 */
1766	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
1767		kq = kn->kn_kq;
1768		if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
1769			KQ_LOCK(kq);
1770			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1771				KQ_UNLOCK(kq);
1772			} else if ((lockflags & KNF_NOKQLOCK) != 0) {
1773				kn->kn_status |= KN_INFLUX;
1774				KQ_UNLOCK(kq);
1775				error = kn->kn_fop->f_event(kn, hint);
1776				KQ_LOCK(kq);
1777				kn->kn_status &= ~KN_INFLUX;
1778				if (error)
1779					KNOTE_ACTIVATE(kn, 1);
1780				KQ_UNLOCK_FLUX(kq);
1781			} else {
1782				kn->kn_status |= KN_HASKQLOCK;
1783				if (kn->kn_fop->f_event(kn, hint))
1784					KNOTE_ACTIVATE(kn, 1);
1785				kn->kn_status &= ~KN_HASKQLOCK;
1786				KQ_UNLOCK(kq);
1787			}
1788		}
1789		kq = NULL;
1790	}
1791	if ((lockflags & KNF_LISTLOCKED) == 0)
1792		list->kl_unlock(list->kl_lockarg);
1793}
1794
1795/*
1796 * add a knote to a knlist
1797 */
1798void
1799knlist_add(struct knlist *knl, struct knote *kn, int islocked)
1800{
1801	KNL_ASSERT_LOCK(knl, islocked);
1802	KQ_NOTOWNED(kn->kn_kq);
1803	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
1804	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
1805	if (!islocked)
1806		knl->kl_lock(knl->kl_lockarg);
1807	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
1808	if (!islocked)
1809		knl->kl_unlock(knl->kl_lockarg);
1810	KQ_LOCK(kn->kn_kq);
1811	kn->kn_knlist = knl;
1812	kn->kn_status &= ~KN_DETACHED;
1813	KQ_UNLOCK(kn->kn_kq);
1814}
1815
1816static void
1817knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
1818{
1819	KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
1820	KNL_ASSERT_LOCK(knl, knlislocked);
1821	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
1822	if (!kqislocked)
1823		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
1824    ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
1825	if (!knlislocked)
1826		knl->kl_lock(knl->kl_lockarg);
1827	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
1828	kn->kn_knlist = NULL;
1829	if (!knlislocked)
1830		knl->kl_unlock(knl->kl_lockarg);
1831	if (!kqislocked)
1832		KQ_LOCK(kn->kn_kq);
1833	kn->kn_status |= KN_DETACHED;
1834	if (!kqislocked)
1835		KQ_UNLOCK(kn->kn_kq);
1836}
1837
1838/*
1839 * remove a knote from a specified knlist
1840 */
1841void
1842knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
1843{
1844
1845	knlist_remove_kq(knl, kn, islocked, 0);
1846}
1847
1848/*
1849 * remove knote from a specified klist while in f_event handler.
1850 */
1851void
1852knlist_remove_inevent(struct knlist *knl, struct knote *kn)
1853{
1854
1855	knlist_remove_kq(knl, kn, 1,
1856	    (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
1857}
1858
1859int
1860knlist_empty(struct knlist *knl)
1861{
1862	KNL_ASSERT_LOCKED(knl);
1863	return SLIST_EMPTY(&knl->kl_list);
1864}
1865
1866static struct mtx	knlist_lock;
1867MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
1868	MTX_DEF);
1869static void knlist_mtx_lock(void *arg);
1870static void knlist_mtx_unlock(void *arg);
1871
1872static void
1873knlist_mtx_lock(void *arg)
1874{
1875	mtx_lock((struct mtx *)arg);
1876}
1877
1878static void
1879knlist_mtx_unlock(void *arg)
1880{
1881	mtx_unlock((struct mtx *)arg);
1882}
1883
1884static void
1885knlist_mtx_assert_locked(void *arg)
1886{
1887	mtx_assert((struct mtx *)arg, MA_OWNED);
1888}
1889
1890static void
1891knlist_mtx_assert_unlocked(void *arg)
1892{
1893	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
1894}
1895
1896void
1897knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
1898    void (*kl_unlock)(void *),
1899    void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
1900{
1901
1902	if (lock == NULL)
1903		knl->kl_lockarg = &knlist_lock;
1904	else
1905		knl->kl_lockarg = lock;
1906
1907	if (kl_lock == NULL)
1908		knl->kl_lock = knlist_mtx_lock;
1909	else
1910		knl->kl_lock = kl_lock;
1911	if (kl_unlock == NULL)
1912		knl->kl_unlock = knlist_mtx_unlock;
1913	else
1914		knl->kl_unlock = kl_unlock;
1915	if (kl_assert_locked == NULL)
1916		knl->kl_assert_locked = knlist_mtx_assert_locked;
1917	else
1918		knl->kl_assert_locked = kl_assert_locked;
1919	if (kl_assert_unlocked == NULL)
1920		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
1921	else
1922		knl->kl_assert_unlocked = kl_assert_unlocked;
1923
1924	SLIST_INIT(&knl->kl_list);
1925}
1926
1927void
1928knlist_init_mtx(struct knlist *knl, struct mtx *lock)
1929{
1930
1931	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
1932}
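
/*
 * Illustrative sketch (not compiled): how a hypothetical driver could use
 * the knlist interface initialized above.  The softc, names and locking
 * are invented; the knlist/knote calls follow this file: knlist_add() is
 * reached from the driver's d_kqfilter routine, knote() is called when
 * data arrives, and knlist_clear()/knlist_destroy() run at detach time.
 */
#if 0
static void	filt_foordetach(struct knote *kn);
static int	filt_fooread(struct knote *kn, long hint);

static struct filterops foo_read_filtops = {
	.f_isfd = 1,
	.f_detach = filt_foordetach,
	.f_event = filt_fooread,
};

struct foo_softc {
	struct mtx	sc_mtx;
	struct selinfo	sc_rsel;	/* sc_rsel.si_note is the knlist */
};

static void
foo_attach_kq(struct foo_softc *sc)
{

	mtx_init(&sc->sc_mtx, "foo", NULL, MTX_DEF);
	knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx);
}

static int
foo_kqfilter(struct cdev *dev, struct knote *kn)
{
	struct foo_softc *sc = dev->si_drv1;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	kn->kn_hook = sc;
	kn->kn_fop = &foo_read_filtops;
	knlist_add(&sc->sc_rsel.si_note, kn, 0);
	return (0);
}

static void
foo_input(struct foo_softc *sc)
{

	/* Data arrived: activate any knotes whose f_event returns true. */
	mtx_lock(&sc->sc_mtx);
	knote(&sc->sc_rsel.si_note, 0, KNF_LISTLOCKED);
	mtx_unlock(&sc->sc_mtx);
}

static void
foo_detach_kq(struct foo_softc *sc)
{

	knlist_clear(&sc->sc_rsel.si_note, 0);
	knlist_destroy(&sc->sc_rsel.si_note);
	mtx_destroy(&sc->sc_mtx);
}
#endif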
1933
1934void
1935knlist_destroy(struct knlist *knl)
1936{
1937
1938#ifdef INVARIANTS
1939	/*
1940	 * if we run across this error, we need to find the offending
1941	 * driver and have it call knlist_clear.
1942	 */
1943	if (!SLIST_EMPTY(&knl->kl_list))
1944		printf("WARNING: destroying knlist w/ knotes on it!\n");
1945#endif
1946
1947	knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
1948	SLIST_INIT(&knl->kl_list);
1949}
1950
1951/*
1952 * Even if we are locked, we may need to drop the lock to allow any influx
1953 * knotes time to "settle".
1954 */
1955void
1956knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
1957{
1958	struct knote *kn, *kn2;
1959	struct kqueue *kq;
1960
1961	if (islocked)
1962		KNL_ASSERT_LOCKED(knl);
1963	else {
1964		KNL_ASSERT_UNLOCKED(knl);
1965again:		/* need to reacquire lock since we have dropped it */
1966		knl->kl_lock(knl->kl_lockarg);
1967	}
1968
1969	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
1970		kq = kn->kn_kq;
1971		KQ_LOCK(kq);
1972		if ((kn->kn_status & KN_INFLUX)) {
1973			KQ_UNLOCK(kq);
1974			continue;
1975		}
1976		knlist_remove_kq(knl, kn, 1, 1);
1977		if (killkn) {
1978			kn->kn_status |= KN_INFLUX | KN_DETACHED;
1979			KQ_UNLOCK(kq);
1980			knote_drop(kn, td);
1981		} else {
1982			/* Make sure cleared knotes disappear soon */
1983			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1984			KQ_UNLOCK(kq);
1985		}
1986		kq = NULL;
1987	}
1988
1989	if (!SLIST_EMPTY(&knl->kl_list)) {
1990		/* there are still KN_INFLUX remaining */
1991		kn = SLIST_FIRST(&knl->kl_list);
1992		kq = kn->kn_kq;
1993		KQ_LOCK(kq);
1994		KASSERT(kn->kn_status & KN_INFLUX,
1995		    ("knote removed w/o list lock"));
1996		knl->kl_unlock(knl->kl_lockarg);
1997		kq->kq_state |= KQ_FLUXWAIT;
1998		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
1999		kq = NULL;
2000		goto again;
2001	}
2002
2003	if (islocked)
2004		KNL_ASSERT_LOCKED(knl);
2005	else {
2006		knl->kl_unlock(knl->kl_lockarg);
2007		KNL_ASSERT_UNLOCKED(knl);
2008	}
2009}
2010
2011/*
2012 * Remove all knotes referencing a specified fd; must be called with the
2013 * FILEDESC lock held.  This prevents a race where a new fd comes along
2014 * and occupies the entry and we attach a knote to it.
2015 */
2016void
2017knote_fdclose(struct thread *td, int fd)
2018{
2019	struct filedesc *fdp = td->td_proc->p_fd;
2020	struct kqueue *kq;
2021	struct knote *kn;
2022	int influx;
2023
2024	FILEDESC_XLOCK_ASSERT(fdp);
2025
2026	/*
2027	 * We shouldn't have to worry about new kevents appearing on fd
2028	 * since filedesc is locked.
2029	 */
2030	SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2031		KQ_LOCK(kq);
2032
2033again:
2034		influx = 0;
2035		while (kq->kq_knlistsize > fd &&
2036		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2037			if (kn->kn_status & KN_INFLUX) {
2038				/* someone else might be waiting on our knote */
2039				if (influx)
2040					wakeup(kq);
2041				kq->kq_state |= KQ_FLUXWAIT;
2042				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2043				goto again;
2044			}
2045			kn->kn_status |= KN_INFLUX;
2046			KQ_UNLOCK(kq);
2047			if (!(kn->kn_status & KN_DETACHED))
2048				kn->kn_fop->f_detach(kn);
2049			knote_drop(kn, td);
2050			influx = 1;
2051			KQ_LOCK(kq);
2052		}
2053		KQ_UNLOCK_FLUX(kq);
2054	}
2055}
2056
2057static int
2058knote_attach(struct knote *kn, struct kqueue *kq)
2059{
2060	struct klist *list;
2061
2062	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
2063	KQ_OWNED(kq);
2064
2065	if (kn->kn_fop->f_isfd) {
2066		if (kn->kn_id >= kq->kq_knlistsize)
2067			return ENOMEM;
2068		list = &kq->kq_knlist[kn->kn_id];
2069	} else {
2070		if (kq->kq_knhash == NULL)
2071			return ENOMEM;
2072		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2073	}
2074
2075	SLIST_INSERT_HEAD(list, kn, kn_link);
2076
2077	return 0;
2078}
2079
2080/*
2081 * knote must already have been detached using the f_detach method.
2082 * No lock needs to be held; it is assumed that the KN_INFLUX flag is set
2083 * to prevent other removal.
2084 */
2085static void
2086knote_drop(struct knote *kn, struct thread *td)
2087{
2088	struct kqueue *kq;
2089	struct klist *list;
2090
2091	kq = kn->kn_kq;
2092
2093	KQ_NOTOWNED(kq);
2094	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
2095	    ("knote_drop called without KN_INFLUX set in kn_status"));
2096
2097	KQ_LOCK(kq);
2098	if (kn->kn_fop->f_isfd)
2099		list = &kq->kq_knlist[kn->kn_id];
2100	else
2101		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2102
2103	if (!SLIST_EMPTY(list))
2104		SLIST_REMOVE(list, kn, knote, kn_link);
2105	if (kn->kn_status & KN_QUEUED)
2106		knote_dequeue(kn);
2107	KQ_UNLOCK_FLUX(kq);
2108
2109	if (kn->kn_fop->f_isfd) {
2110		fdrop(kn->kn_fp, td);
2111		kn->kn_fp = NULL;
2112	}
2113	kqueue_fo_release(kn->kn_kevent.filter);
2114	kn->kn_fop = NULL;
2115	knote_free(kn);
2116}
2117
2118static void
2119knote_enqueue(struct knote *kn)
2120{
2121	struct kqueue *kq = kn->kn_kq;
2122
2123	KQ_OWNED(kn->kn_kq);
2124	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2125
2126	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2127	kn->kn_status |= KN_QUEUED;
2128	kq->kq_count++;
2129	kqueue_wakeup(kq);
2130}
2131
2132static void
2133knote_dequeue(struct knote *kn)
2134{
2135	struct kqueue *kq = kn->kn_kq;
2136
2137	KQ_OWNED(kn->kn_kq);
2138	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2139
2140	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2141	kn->kn_status &= ~KN_QUEUED;
2142	kq->kq_count--;
2143}
2144
2145static void
2146knote_init(void)
2147{
2148
2149	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2150	    NULL, NULL, UMA_ALIGN_PTR, 0);
2151}
2152SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2153
2154static struct knote *
2155knote_alloc(int waitok)
2156{
2157	return ((struct knote *)uma_zalloc(knote_zone,
2158	    (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
2159}
2160
2161static void
2162knote_free(struct knote *kn)
2163{
2164	if (kn != NULL)
2165		uma_zfree(knote_zone, kn);
2166}
2167
2168/*
2169 * Register the kev w/ the kq specified by fd.
2170 */
2171int
2172kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
2173{
2174	struct kqueue *kq;
2175	struct file *fp;
2176	int error;
2177
2178	if ((error = fget(td, fd, &fp)) != 0)
2179		return (error);
2180	if ((error = kqueue_acquire(fp, &kq)) != 0)
2181		goto noacquire;
2182
2183	error = kqueue_register(kq, kev, td, waitok);
2184
2185	kqueue_release(kq, 0);
2186
2187noacquire:
2188	fdrop(fp, td);
2189
2190	return error;
2191}
2192