1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
5 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
6 * Copyright (c) 2009 Apple, Inc.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include "opt_ktrace.h"
35#include "opt_kqueue.h"
36
37#ifdef COMPAT_FREEBSD11
38#define	_WANT_FREEBSD11_KEVENT
39#endif
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/capsicum.h>
44#include <sys/kernel.h>
45#include <sys/limits.h>
46#include <sys/lock.h>
47#include <sys/mutex.h>
48#include <sys/rwlock.h>
49#include <sys/proc.h>
50#include <sys/malloc.h>
51#include <sys/unistd.h>
52#include <sys/file.h>
53#include <sys/filedesc.h>
54#include <sys/filio.h>
55#include <sys/fcntl.h>
56#include <sys/kthread.h>
57#include <sys/selinfo.h>
58#include <sys/queue.h>
59#include <sys/event.h>
60#include <sys/eventvar.h>
61#include <sys/poll.h>
62#include <sys/protosw.h>
63#include <sys/resourcevar.h>
64#include <sys/sigio.h>
65#include <sys/signalvar.h>
66#include <sys/socket.h>
67#include <sys/socketvar.h>
68#include <sys/stat.h>
69#include <sys/sysctl.h>
70#include <sys/sysproto.h>
71#include <sys/syscallsubr.h>
72#include <sys/taskqueue.h>
73#include <sys/uio.h>
74#include <sys/user.h>
75#ifdef KTRACE
76#include <sys/ktrace.h>
77#endif
78#include <machine/atomic.h>
79
80#include <vm/uma.h>
81
82static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
83
/*
 * This lock is used if multiple kq locks are required.  This should
 * possibly be made into a per-process lock.
 */
88static struct mtx	kq_global;
89MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
90#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
91	if (!haslck)				\
92		mtx_lock(lck);			\
93	haslck = 1;				\
94} while (0)
95#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
96	if (haslck)				\
97		mtx_unlock(lck);			\
98	haslck = 0;				\
99} while (0)
100
101TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
102
103static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
104static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
105static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
106		    struct thread *td, int mflag);
107static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
108static void	kqueue_release(struct kqueue *kq, int locked);
109static void	kqueue_destroy(struct kqueue *kq);
110static void	kqueue_drain(struct kqueue *kq, struct thread *td);
111static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
112		    uintptr_t ident, int mflag);
113static void	kqueue_task(void *arg, int pending);
114static int	kqueue_scan(struct kqueue *kq, int maxevents,
115		    struct kevent_copyops *k_ops,
116		    const struct timespec *timeout,
117		    struct kevent *keva, struct thread *td);
118static void 	kqueue_wakeup(struct kqueue *kq);
119static struct filterops *kqueue_fo_find(int filt);
120static void	kqueue_fo_release(int filt);
121struct g_kevent_args;
122static int	kern_kevent_generic(struct thread *td,
123		    struct g_kevent_args *uap,
124		    struct kevent_copyops *k_ops, const char *struct_name);
125
126static fo_ioctl_t	kqueue_ioctl;
127static fo_poll_t	kqueue_poll;
128static fo_kqfilter_t	kqueue_kqfilter;
129static fo_stat_t	kqueue_stat;
130static fo_close_t	kqueue_close;
131static fo_fill_kinfo_t	kqueue_fill_kinfo;
132
133static struct fileops kqueueops = {
134	.fo_read = invfo_rdwr,
135	.fo_write = invfo_rdwr,
136	.fo_truncate = invfo_truncate,
137	.fo_ioctl = kqueue_ioctl,
138	.fo_poll = kqueue_poll,
139	.fo_kqfilter = kqueue_kqfilter,
140	.fo_stat = kqueue_stat,
141	.fo_close = kqueue_close,
142	.fo_chmod = invfo_chmod,
143	.fo_chown = invfo_chown,
144	.fo_sendfile = invfo_sendfile,
145	.fo_fill_kinfo = kqueue_fill_kinfo,
146};
147
148static int 	knote_attach(struct knote *kn, struct kqueue *kq);
149static void 	knote_drop(struct knote *kn, struct thread *td);
150static void 	knote_drop_detached(struct knote *kn, struct thread *td);
151static void 	knote_enqueue(struct knote *kn);
152static void 	knote_dequeue(struct knote *kn);
153static void 	knote_init(void);
154static struct 	knote *knote_alloc(int mflag);
155static void 	knote_free(struct knote *kn);
156
157static void	filt_kqdetach(struct knote *kn);
158static int	filt_kqueue(struct knote *kn, long hint);
159static int	filt_procattach(struct knote *kn);
160static void	filt_procdetach(struct knote *kn);
161static int	filt_proc(struct knote *kn, long hint);
162static int	filt_fileattach(struct knote *kn);
163static void	filt_timerexpire(void *knx);
164static void	filt_timerexpire_l(struct knote *kn, bool proc_locked);
165static int	filt_timerattach(struct knote *kn);
166static void	filt_timerdetach(struct knote *kn);
167static void	filt_timerstart(struct knote *kn, sbintime_t to);
168static void	filt_timertouch(struct knote *kn, struct kevent *kev,
169		    u_long type);
170static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
171static int	filt_timer(struct knote *kn, long hint);
172static int	filt_userattach(struct knote *kn);
173static void	filt_userdetach(struct knote *kn);
174static int	filt_user(struct knote *kn, long hint);
175static void	filt_usertouch(struct knote *kn, struct kevent *kev,
176		    u_long type);
177
178static struct filterops file_filtops = {
179	.f_isfd = 1,
180	.f_attach = filt_fileattach,
181};
182static struct filterops kqread_filtops = {
183	.f_isfd = 1,
184	.f_detach = filt_kqdetach,
185	.f_event = filt_kqueue,
186};
187/* XXX - move to kern_proc.c?  */
188static struct filterops proc_filtops = {
189	.f_isfd = 0,
190	.f_attach = filt_procattach,
191	.f_detach = filt_procdetach,
192	.f_event = filt_proc,
193};
194static struct filterops timer_filtops = {
195	.f_isfd = 0,
196	.f_attach = filt_timerattach,
197	.f_detach = filt_timerdetach,
198	.f_event = filt_timer,
199	.f_touch = filt_timertouch,
200};
201static struct filterops user_filtops = {
202	.f_attach = filt_userattach,
203	.f_detach = filt_userdetach,
204	.f_event = filt_user,
205	.f_touch = filt_usertouch,
206};
207
208static uma_zone_t	knote_zone;
209static unsigned int __exclusive_cache_line	kq_ncallouts;
210static unsigned int 	kq_calloutmax = 4 * 1024;
211SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
212    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
213
214/* XXX - ensure not influx ? */
215#define KNOTE_ACTIVATE(kn, islock) do { 				\
216	if ((islock))							\
217		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
218	else								\
219		KQ_LOCK((kn)->kn_kq);					\
220	(kn)->kn_status |= KN_ACTIVE;					\
221	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
222		knote_enqueue((kn));					\
223	if (!(islock))							\
224		KQ_UNLOCK((kn)->kn_kq);					\
225} while(0)
226#define KQ_LOCK(kq) do {						\
227	mtx_lock(&(kq)->kq_lock);					\
228} while (0)
229#define KQ_FLUX_WAKEUP(kq) do {						\
230	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
231		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
232		wakeup((kq));						\
233	}								\
234} while (0)
235#define KQ_UNLOCK_FLUX(kq) do {						\
236	KQ_FLUX_WAKEUP(kq);						\
237	mtx_unlock(&(kq)->kq_lock);					\
238} while (0)
239#define KQ_UNLOCK(kq) do {						\
240	mtx_unlock(&(kq)->kq_lock);					\
241} while (0)
242#define KQ_OWNED(kq) do {						\
243	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
244} while (0)
245#define KQ_NOTOWNED(kq) do {						\
246	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
247} while (0)
248
249static struct knlist *
250kn_list_lock(struct knote *kn)
251{
252	struct knlist *knl;
253
254	knl = kn->kn_knlist;
255	if (knl != NULL)
256		knl->kl_lock(knl->kl_lockarg);
257	return (knl);
258}
259
260static void
261kn_list_unlock(struct knlist *knl)
262{
263	bool do_free;
264
265	if (knl == NULL)
266		return;
267	do_free = knl->kl_autodestroy && knlist_empty(knl);
268	knl->kl_unlock(knl->kl_lockarg);
269	if (do_free) {
270		knlist_destroy(knl);
271		free(knl, M_KQUEUE);
272	}
273}
274
275static bool
276kn_in_flux(struct knote *kn)
277{
278
279	return (kn->kn_influx > 0);
280}
281
282static void
283kn_enter_flux(struct knote *kn)
284{
285
286	KQ_OWNED(kn->kn_kq);
287	MPASS(kn->kn_influx < INT_MAX);
288	kn->kn_influx++;
289}
290
291static bool
292kn_leave_flux(struct knote *kn)
293{
294
295	KQ_OWNED(kn->kn_kq);
296	MPASS(kn->kn_influx > 0);
297	kn->kn_influx--;
298	return (kn->kn_influx == 0);
299}
300
301#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
302	if (islocked)							\
303		KNL_ASSERT_LOCKED(knl);				\
304	else								\
305		KNL_ASSERT_UNLOCKED(knl);				\
306} while (0)
307#ifdef INVARIANTS
308#define	KNL_ASSERT_LOCKED(knl) do {					\
309	knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED);		\
310} while (0)
311#define	KNL_ASSERT_UNLOCKED(knl) do {					\
312	knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED);		\
313} while (0)
314#else /* !INVARIANTS */
315#define	KNL_ASSERT_LOCKED(knl) do {} while(0)
316#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
317#endif /* INVARIANTS */
318
319#ifndef	KN_HASHSIZE
320#define	KN_HASHSIZE		64		/* XXX should be tunable */
321#endif
322
323#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
324
325static int
326filt_nullattach(struct knote *kn)
327{
328
329	return (ENXIO);
}
331
332struct filterops null_filtops = {
333	.f_isfd = 0,
334	.f_attach = filt_nullattach,
335};
336
337/* XXX - make SYSINIT to add these, and move into respective modules. */
338extern struct filterops sig_filtops;
339extern struct filterops fs_filtops;
340
/*
 * Table for all system-defined filters.
 */
344static struct mtx	filterops_lock;
345MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
346	MTX_DEF);
347static struct {
348	struct filterops *for_fop;
349	int for_nolock;
350	int for_refcnt;
351} sysfilt_ops[EVFILT_SYSCOUNT] = {
352	{ &file_filtops, 1 },			/* EVFILT_READ */
353	{ &file_filtops, 1 },			/* EVFILT_WRITE */
354	{ &null_filtops },			/* EVFILT_AIO */
355	{ &file_filtops, 1 },			/* EVFILT_VNODE */
356	{ &proc_filtops, 1 },			/* EVFILT_PROC */
357	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
358	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
359	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
360	{ &fs_filtops, 1 },			/* EVFILT_FS */
361	{ &null_filtops },			/* EVFILT_LIO */
362	{ &user_filtops, 1 },			/* EVFILT_USER */
363	{ &null_filtops },			/* EVFILT_SENDFILE */
364	{ &file_filtops, 1 },                   /* EVFILT_EMPTY */
365};
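
/*
 * Filters are small negative numbers, so the table above is indexed by
 * the bitwise complement of the filter value: for example, EVFILT_READ
 * (-1) maps to sysfilt_ops[~(-1)] == sysfilt_ops[0].  The lookup
 * routines below (kqueue_fo_find() and friends) all use ~filt this way.
 */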
366
367/*
368 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
369 * method.
370 */
371static int
372filt_fileattach(struct knote *kn)
373{
374
375	return (fo_kqfilter(kn->kn_fp, kn));
376}
377
378/*ARGSUSED*/
379static int
380kqueue_kqfilter(struct file *fp, struct knote *kn)
381{
382	struct kqueue *kq = kn->kn_fp->f_data;
383
384	if (kn->kn_filter != EVFILT_READ)
385		return (EINVAL);
386
387	kn->kn_status |= KN_KQUEUE;
388	kn->kn_fop = &kqread_filtops;
389	knlist_add(&kq->kq_sel.si_note, kn, 0);
390
391	return (0);
392}
393
394static void
395filt_kqdetach(struct knote *kn)
396{
397	struct kqueue *kq = kn->kn_fp->f_data;
398
399	knlist_remove(&kq->kq_sel.si_note, kn, 0);
400}
401
402/*ARGSUSED*/
403static int
404filt_kqueue(struct knote *kn, long hint)
405{
406	struct kqueue *kq = kn->kn_fp->f_data;
407
408	kn->kn_data = kq->kq_count;
409	return (kn->kn_data > 0);
410}
411
412/* XXX - move to kern_proc.c?  */
413static int
414filt_procattach(struct knote *kn)
415{
416	struct proc *p;
417	int error;
418	bool exiting, immediate;
419
420	exiting = immediate = false;
421	if (kn->kn_sfflags & NOTE_EXIT)
422		p = pfind_any(kn->kn_id);
423	else
424		p = pfind(kn->kn_id);
425	if (p == NULL)
426		return (ESRCH);
427	if (p->p_flag & P_WEXIT)
428		exiting = true;
429
430	if ((error = p_cansee(curthread, p))) {
431		PROC_UNLOCK(p);
432		return (error);
433	}
434
435	kn->kn_ptr.p_proc = p;
436	kn->kn_flags |= EV_CLEAR;		/* automatically set */
437
438	/*
439	 * Internal flag indicating registration done by kernel for the
440	 * purposes of getting a NOTE_CHILD notification.
441	 */
442	if (kn->kn_flags & EV_FLAG2) {
443		kn->kn_flags &= ~EV_FLAG2;
444		kn->kn_data = kn->kn_sdata;		/* ppid */
445		kn->kn_fflags = NOTE_CHILD;
446		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
447		immediate = true; /* Force immediate activation of child note. */
448	}
449	/*
450	 * Internal flag indicating registration done by kernel (for other than
451	 * NOTE_CHILD).
452	 */
453	if (kn->kn_flags & EV_FLAG1) {
454		kn->kn_flags &= ~EV_FLAG1;
455	}
456
457	knlist_add(p->p_klist, kn, 1);
458
459	/*
460	 * Immediately activate any child notes or, in the case of a zombie
461	 * target process, exit notes.  The latter is necessary to handle the
462	 * case where the target process, e.g. a child, dies before the kevent
463	 * is registered.
464	 */
465	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
466		KNOTE_ACTIVATE(kn, 0);
467
468	PROC_UNLOCK(p);
469
470	return (0);
471}
472
473/*
474 * The knote may be attached to a different process, which may exit,
475 * leaving nothing for the knote to be attached to.  So when the process
476 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
477 * it will be deleted when read out.  However, as part of the knote deletion,
478 * this routine is called, so a check is needed to avoid actually performing
479 * a detach, because the original process does not exist any more.
480 */
481/* XXX - move to kern_proc.c?  */
482static void
483filt_procdetach(struct knote *kn)
484{
485
486	knlist_remove(kn->kn_knlist, kn, 0);
487	kn->kn_ptr.p_proc = NULL;
488}
489
490/* XXX - move to kern_proc.c?  */
491static int
492filt_proc(struct knote *kn, long hint)
493{
494	struct proc *p;
495	u_int event;
496
497	p = kn->kn_ptr.p_proc;
498	if (p == NULL) /* already activated, from attach filter */
499		return (0);
500
501	/* Mask off extra data. */
502	event = (u_int)hint & NOTE_PCTRLMASK;
503
504	/* If the user is interested in this event, record it. */
505	if (kn->kn_sfflags & event)
506		kn->kn_fflags |= event;
507
508	/* Process is gone, so flag the event as finished. */
509	if (event == NOTE_EXIT) {
510		kn->kn_flags |= EV_EOF | EV_ONESHOT;
511		kn->kn_ptr.p_proc = NULL;
512		if (kn->kn_fflags & NOTE_EXIT)
513			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
514		if (kn->kn_fflags == 0)
515			kn->kn_flags |= EV_DROP;
516		return (1);
517	}
518
519	return (kn->kn_fflags != 0);
520}
521
/*
 * Called when a process forks.  It mostly does the same as knote(),
 * activating all knotes registered to be activated when the process
 * forks.  Additionally, for each knote attached to the parent, check
 * whether the user wants to track the new process.  If so, attach a
 * new knote to it, and immediately report an event with the child's
 * pid.
 */
530void
531knote_fork(struct knlist *list, int pid)
532{
533	struct kqueue *kq;
534	struct knote *kn;
535	struct kevent kev;
536	int error;
537
538	MPASS(list != NULL);
539	KNL_ASSERT_LOCKED(list);
540	if (SLIST_EMPTY(&list->kl_list))
541		return;
542
543	memset(&kev, 0, sizeof(kev));
544	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
545		kq = kn->kn_kq;
546		KQ_LOCK(kq);
547		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
548			KQ_UNLOCK(kq);
549			continue;
550		}
551
552		/*
553		 * The same as knote(), activate the event.
554		 */
555		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
556			if (kn->kn_fop->f_event(kn, NOTE_FORK))
557				KNOTE_ACTIVATE(kn, 1);
558			KQ_UNLOCK(kq);
559			continue;
560		}
561
562		/*
563		 * The NOTE_TRACK case. In addition to the activation
564		 * of the event, we need to register new events to
565		 * track the child. Drop the locks in preparation for
566		 * the call to kqueue_register().
567		 */
568		kn_enter_flux(kn);
569		KQ_UNLOCK(kq);
570		list->kl_unlock(list->kl_lockarg);
571
572		/*
573		 * Activate existing knote and register tracking knotes with
574		 * new process.
575		 *
576		 * First register a knote to get just the child notice. This
577		 * must be a separate note from a potential NOTE_EXIT
578		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
579		 * to use the data field (in conflicting ways).
580		 */
581		kev.ident = pid;
582		kev.filter = kn->kn_filter;
583		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
584		    EV_FLAG2;
585		kev.fflags = kn->kn_sfflags;
586		kev.data = kn->kn_id;		/* parent */
587		kev.udata = kn->kn_kevent.udata;/* preserve udata */
588		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
589		if (error)
590			kn->kn_fflags |= NOTE_TRACKERR;
591
592		/*
593		 * Then register another knote to track other potential events
594		 * from the new process.
595		 */
596		kev.ident = pid;
597		kev.filter = kn->kn_filter;
598		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
599		kev.fflags = kn->kn_sfflags;
600		kev.data = kn->kn_id;		/* parent */
601		kev.udata = kn->kn_kevent.udata;/* preserve udata */
602		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
603		if (error)
604			kn->kn_fflags |= NOTE_TRACKERR;
605		if (kn->kn_fop->f_event(kn, NOTE_FORK))
606			KNOTE_ACTIVATE(kn, 0);
607		list->kl_lock(list->kl_lockarg);
608		KQ_LOCK(kq);
609		kn_leave_flux(kn);
610		KQ_UNLOCK_FLUX(kq);
611	}
612}
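
/*
 * Illustrative sketch (not part of the kernel): this is roughly how a
 * userspace consumer exercises the NOTE_TRACK path handled above.  The
 * "pid" value and the error handling are placeholders.
 *
 *	#include <sys/event.h>
 *	#include <err.h>
 *
 *	struct kevent ch, ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&ch, pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	if (kevent(kq, &ch, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 *	(void)kevent(kq, NULL, 0, &ev, 1, NULL);
 *	// Each new child is reported as a NOTE_CHILD event whose data
 *	// field holds the parent pid; NOTE_TRACKERR is set in fflags if
 *	// the tracking knotes could not be registered.
 */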
613
614/*
615 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
616 * interval timer support code.
617 */
618
619#define NOTE_TIMER_PRECMASK						\
620    (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
621
622static sbintime_t
623timer2sbintime(int64_t data, int flags)
624{
625	int64_t secs;
626
	/*
	 * Macros for converting to the fractional second portion of an
	 * sbintime_t using 64-bit multiplication to improve precision.
	 */
631#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
632#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
633#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
634	switch (flags & NOTE_TIMER_PRECMASK) {
635	case NOTE_SECONDS:
636#ifdef __LP64__
637		if (data > (SBT_MAX / SBT_1S))
638			return (SBT_MAX);
639#endif
640		return ((sbintime_t)data << 32);
641	case NOTE_MSECONDS: /* FALLTHROUGH */
642	case 0:
643		if (data >= 1000) {
644			secs = data / 1000;
645#ifdef __LP64__
646			if (secs > (SBT_MAX / SBT_1S))
647				return (SBT_MAX);
648#endif
649			return (secs << 32 | MS_TO_SBT(data % 1000));
650		}
651		return (MS_TO_SBT(data));
652	case NOTE_USECONDS:
653		if (data >= 1000000) {
654			secs = data / 1000000;
655#ifdef __LP64__
656			if (secs > (SBT_MAX / SBT_1S))
657				return (SBT_MAX);
658#endif
659			return (secs << 32 | US_TO_SBT(data % 1000000));
660		}
661		return (US_TO_SBT(data));
662	case NOTE_NSECONDS:
663		if (data >= 1000000000) {
664			secs = data / 1000000000;
665#ifdef __LP64__
666			if (secs > (SBT_MAX / SBT_1S))
667				return (SBT_MAX);
668#endif
669			return (secs << 32 | NS_TO_SBT(data % 1000000000));
670		}
671		return (NS_TO_SBT(data));
672	default:
673		break;
674	}
675	return (-1);
676}
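
/*
 * Illustrative sketch (not part of the kernel): the precision flags
 * decoded above correspond directly to userspace timer registrations.
 * With no NOTE_*SECONDS flag set, the data field defaults to
 * milliseconds; "deadline" below is a placeholder epoch time.
 *
 *	#include <sys/event.h>
 *
 *	struct kevent kev;
 *
 *	// Periodic timer firing every 500 microseconds.
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_USECONDS, 500, NULL);
 *
 *	// One-shot timer at an absolute time (seconds since the epoch).
 *	EV_SET(&kev, 2, EVFILT_TIMER, EV_ADD,
 *	    NOTE_ABSTIME | NOTE_SECONDS, deadline, NULL);
 */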
677
678struct kq_timer_cb_data {
679	struct callout c;
680	struct proc *p;
681	struct knote *kn;
682	int cpuid;
683	int flags;
684	TAILQ_ENTRY(kq_timer_cb_data) link;
685	sbintime_t next;	/* next timer event fires at */
686	sbintime_t to;		/* precalculated timer period, 0 for abs */
687};
688
689#define	KQ_TIMER_CB_ENQUEUED	0x01
690
691static void
692kqtimer_sched_callout(struct kq_timer_cb_data *kc)
693{
694	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn,
695	    kc->cpuid, C_ABSOLUTE);
696}
697
698void
699kqtimer_proc_continue(struct proc *p)
700{
701	struct kq_timer_cb_data *kc, *kc1;
702	struct bintime bt;
703	sbintime_t now;
704
705	PROC_LOCK_ASSERT(p, MA_OWNED);
706
707	getboottimebin(&bt);
708	now = bttosbt(bt);
709
710	TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) {
711		TAILQ_REMOVE(&p->p_kqtim_stop, kc, link);
712		kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
713		if (kc->next <= now)
714			filt_timerexpire_l(kc->kn, true);
715		else
716			kqtimer_sched_callout(kc);
717	}
718}
719
720static void
721filt_timerexpire_l(struct knote *kn, bool proc_locked)
722{
723	struct kq_timer_cb_data *kc;
724	struct proc *p;
725	uint64_t delta;
726	sbintime_t now;
727
728	kc = kn->kn_ptr.p_v;
729
730	if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) {
731		kn->kn_data++;
732		KNOTE_ACTIVATE(kn, 0);
733		return;
734	}
735
736	now = sbinuptime();
737	if (now >= kc->next) {
738		delta = (now - kc->next) / kc->to;
739		if (delta == 0)
740			delta = 1;
741		kn->kn_data += delta;
742		kc->next += (delta + 1) * kc->to;
743		if (now >= kc->next)	/* overflow */
744			kc->next = now + kc->to;
745		KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
746	}
747
	/*
	 * The initial check for a stopped kc->p is racy.  It is fine to
	 * miss the setting of the stop flags; at worst we would schedule
	 * one more callout.  On the other hand, it is not fine to fail
	 * to schedule when we missed the clearing of the flags, so we
	 * recheck them under the lock and observe a consistent state.
	 */
755	p = kc->p;
756	if (P_SHOULDSTOP(p) || P_KILLED(p)) {
757		if (!proc_locked)
758			PROC_LOCK(p);
759		if (P_SHOULDSTOP(p) || P_KILLED(p)) {
760			if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) {
761				kc->flags |= KQ_TIMER_CB_ENQUEUED;
762				TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link);
763			}
764			if (!proc_locked)
765				PROC_UNLOCK(p);
766			return;
767		}
768		if (!proc_locked)
769			PROC_UNLOCK(p);
770	}
771	kqtimer_sched_callout(kc);
772}
773
774static void
775filt_timerexpire(void *knx)
776{
777	filt_timerexpire_l(knx, false);
778}
779
/*
 * data contains the amount of time to sleep.
 */
783static int
784filt_timervalidate(struct knote *kn, sbintime_t *to)
785{
786	struct bintime bt;
787	sbintime_t sbt;
788
789	if (kn->kn_sdata < 0)
790		return (EINVAL);
791	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
792		kn->kn_sdata = 1;
793	/*
794	 * The only fflags values supported are the timer unit
795	 * (precision) and the absolute time indicator.
796	 */
797	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
798		return (EINVAL);
799
800	*to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
801	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
802		getboottimebin(&bt);
803		sbt = bttosbt(bt);
804		*to -= sbt;
805	}
806	if (*to < 0)
807		return (EINVAL);
808	return (0);
809}
810
811static int
812filt_timerattach(struct knote *kn)
813{
814	struct kq_timer_cb_data *kc;
815	sbintime_t to;
816	int error;
817
818	error = filt_timervalidate(kn, &to);
819	if (error != 0)
820		return (error);
821
822	if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) {
823		atomic_subtract_int(&kq_ncallouts, 1);
824		return (ENOMEM);
825	}
826
827	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
828		kn->kn_flags |= EV_CLEAR;	/* automatically set */
829	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
830	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
831	kc->kn = kn;
832	kc->p = curproc;
833	kc->cpuid = PCPU_GET(cpuid);
834	kc->flags = 0;
835	callout_init(&kc->c, 1);
836	filt_timerstart(kn, to);
837
838	return (0);
839}
840
841static void
842filt_timerstart(struct knote *kn, sbintime_t to)
843{
844	struct kq_timer_cb_data *kc;
845
846	kc = kn->kn_ptr.p_v;
847	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
848		kc->next = to;
849		kc->to = 0;
850	} else {
851		kc->next = to + sbinuptime();
852		kc->to = to;
853	}
854	kqtimer_sched_callout(kc);
855}
856
857static void
858filt_timerdetach(struct knote *kn)
859{
860	struct kq_timer_cb_data *kc;
861	unsigned int old __unused;
862
863	kc = kn->kn_ptr.p_v;
864	callout_drain(&kc->c);
865	if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) {
866		PROC_LOCK(kc->p);
867		TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link);
868		PROC_UNLOCK(kc->p);
869	}
870	free(kc, M_KQUEUE);
871	old = atomic_fetchadd_int(&kq_ncallouts, -1);
872	KASSERT(old > 0, ("Number of callouts cannot become negative"));
873	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
874}
875
876static void
877filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
878{
879	struct kq_timer_cb_data *kc;
880	struct kqueue *kq;
881	sbintime_t to;
882	int error;
883
884	switch (type) {
885	case EVENT_REGISTER:
886		/* Handle re-added timers that update data/fflags */
887		if (kev->flags & EV_ADD) {
888			kc = kn->kn_ptr.p_v;
889
890			/* Drain any existing callout. */
891			callout_drain(&kc->c);
892
			/*
			 * Throw away any existing undelivered record
			 * of the timer expiration.  This is done under
			 * the presumption that if a process is
			 * re-adding this timer with new parameters,
			 * it is no longer interested in what may have
			 * happened under the old parameters.  If it is
			 * interested, it can wait for the expiration,
			 * delete the old timer definition, and then
			 * add the new one.
			 *
			 * This has to be done while the kq is locked:
			 *   - if enqueued, dequeue
			 *   - make it no longer active
			 *   - clear the count of expiration events
			 */
908			kq = kn->kn_kq;
909			KQ_LOCK(kq);
910			if (kn->kn_status & KN_QUEUED)
911				knote_dequeue(kn);
912
913			kn->kn_status &= ~KN_ACTIVE;
914			kn->kn_data = 0;
915			KQ_UNLOCK(kq);
916
917			/* Reschedule timer based on new data/fflags */
918			kn->kn_sfflags = kev->fflags;
919			kn->kn_sdata = kev->data;
920			error = filt_timervalidate(kn, &to);
			if (error != 0) {
				kn->kn_flags |= EV_ERROR;
				kn->kn_data = error;
			} else
				filt_timerstart(kn, to);
926		}
927		break;
928
	case EVENT_PROCESS:
930		*kev = kn->kn_kevent;
931		if (kn->kn_flags & EV_CLEAR) {
932			kn->kn_data = 0;
933			kn->kn_fflags = 0;
934		}
935		break;
936
937	default:
		panic("filt_timertouch() - invalid type (%lu)", type);
939		break;
940	}
941}
942
943static int
944filt_timer(struct knote *kn, long hint)
945{
946
947	return (kn->kn_data != 0);
948}
949
950static int
951filt_userattach(struct knote *kn)
952{
953
954	/*
955	 * EVFILT_USER knotes are not attached to anything in the kernel.
956	 */
957	kn->kn_hook = NULL;
958	if (kn->kn_fflags & NOTE_TRIGGER)
959		kn->kn_hookid = 1;
960	else
961		kn->kn_hookid = 0;
962	return (0);
963}
964
965static void
966filt_userdetach(__unused struct knote *kn)
967{
968
969	/*
970	 * EVFILT_USER knotes are not attached to anything in the kernel.
971	 */
972}
973
974static int
975filt_user(struct knote *kn, __unused long hint)
976{
977
978	return (kn->kn_hookid);
979}
980
981static void
982filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
983{
984	u_int ffctrl;
985
986	switch (type) {
987	case EVENT_REGISTER:
988		if (kev->fflags & NOTE_TRIGGER)
989			kn->kn_hookid = 1;
990
991		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
992		kev->fflags &= NOTE_FFLAGSMASK;
993		switch (ffctrl) {
994		case NOTE_FFNOP:
995			break;
996
997		case NOTE_FFAND:
998			kn->kn_sfflags &= kev->fflags;
999			break;
1000
1001		case NOTE_FFOR:
1002			kn->kn_sfflags |= kev->fflags;
1003			break;
1004
1005		case NOTE_FFCOPY:
1006			kn->kn_sfflags = kev->fflags;
1007			break;
1008
1009		default:
1010			/* XXX Return error? */
1011			break;
1012		}
1013		kn->kn_sdata = kev->data;
1014		if (kev->flags & EV_CLEAR) {
1015			kn->kn_hookid = 0;
1016			kn->kn_data = 0;
1017			kn->kn_fflags = 0;
1018		}
1019		break;
1020
	case EVENT_PROCESS:
1022		*kev = kn->kn_kevent;
1023		kev->fflags = kn->kn_sfflags;
1024		kev->data = kn->kn_sdata;
1025		if (kn->kn_flags & EV_CLEAR) {
1026			kn->kn_hookid = 0;
1027			kn->kn_data = 0;
1028			kn->kn_fflags = 0;
1029		}
1030		break;
1031
1032	default:
		panic("filt_usertouch() - invalid type (%lu)", type);
1034		break;
1035	}
1036}
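
/*
 * Illustrative sketch (not part of the kernel): typical userspace use of
 * EVFILT_USER as implemented by the filter above.  "kq" is a placeholder
 * kqueue descriptor.
 *
 *	#include <sys/event.h>
 *
 *	struct kevent kev;
 *
 *	// Register the user event; EV_CLEAR rearms it after delivery.
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// Later, possibly from another thread: trigger the event, copying
 *	// a small value into the low fflags bits via NOTE_FFCOPY.
 *	EV_SET(&kev, 1, EVFILT_USER, 0,
 *	    NOTE_TRIGGER | NOTE_FFCOPY | 0x7, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 */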
1037
1038int
1039sys_kqueue(struct thread *td, struct kqueue_args *uap)
1040{
1041
1042	return (kern_kqueue(td, 0, NULL));
1043}
1044
1045static void
1046kqueue_init(struct kqueue *kq)
1047{
1048
1049	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
1050	TAILQ_INIT(&kq->kq_head);
1051	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
1052	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
1053}
1054
1055int
1056kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
1057{
1058	struct filedesc *fdp;
1059	struct kqueue *kq;
1060	struct file *fp;
1061	struct ucred *cred;
1062	int fd, error;
1063
1064	fdp = td->td_proc->p_fd;
1065	cred = td->td_ucred;
1066	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
1067		return (ENOMEM);
1068
1069	error = falloc_caps(td, &fp, &fd, flags, fcaps);
1070	if (error != 0) {
1071		chgkqcnt(cred->cr_ruidinfo, -1, 0);
1072		return (error);
1073	}
1074
1075	/* An extra reference on `fp' has been held for us by falloc(). */
1076	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
1077	kqueue_init(kq);
1078	kq->kq_fdp = fdp;
1079	kq->kq_cred = crhold(cred);
1080
1081	FILEDESC_XLOCK(fdp);
1082	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
1083	FILEDESC_XUNLOCK(fdp);
1084
1085	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
1086	fdrop(fp, td);
1087
1088	td->td_retval[0] = fd;
1089	return (0);
1090}
1091
1092struct g_kevent_args {
1093	int	fd;
1094	void	*changelist;
1095	int	nchanges;
1096	void	*eventlist;
1097	int	nevents;
1098	const struct timespec *timeout;
1099};
1100
1101int
1102sys_kevent(struct thread *td, struct kevent_args *uap)
1103{
1104	struct kevent_copyops k_ops = {
1105		.arg = uap,
1106		.k_copyout = kevent_copyout,
1107		.k_copyin = kevent_copyin,
1108		.kevent_size = sizeof(struct kevent),
1109	};
1110	struct g_kevent_args gk_args = {
1111		.fd = uap->fd,
1112		.changelist = uap->changelist,
1113		.nchanges = uap->nchanges,
1114		.eventlist = uap->eventlist,
1115		.nevents = uap->nevents,
1116		.timeout = uap->timeout,
1117	};
1118
1119	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
1120}
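
/*
 * Illustrative sketch (not part of the kernel): sys_kevent() above is the
 * native entry point behind the kevent(2) libc wrapper.  A minimal caller
 * looks like this; "fd" is a placeholder descriptor.
 *
 *	#include <sys/event.h>
 *
 *	struct kevent ch, ev;
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&ch, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	n = kevent(kq, &ch, 1, &ev, 1, &ts);
 *	// n == 0 means the one second timeout expired; n > 0 means ev
 *	// describes the ready descriptor, with ev.data holding the number
 *	// of bytes available to read.
 */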
1121
1122static int
1123kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
1124    struct kevent_copyops *k_ops, const char *struct_name)
1125{
1126	struct timespec ts, *tsp;
1127#ifdef KTRACE
1128	struct kevent *eventlist = uap->eventlist;
1129#endif
1130	int error;
1131
1132	if (uap->timeout != NULL) {
1133		error = copyin(uap->timeout, &ts, sizeof(ts));
1134		if (error)
1135			return (error);
1136		tsp = &ts;
1137	} else
1138		tsp = NULL;
1139
1140#ifdef KTRACE
1141	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
1142		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
1143		    uap->nchanges, k_ops->kevent_size);
1144#endif
1145
1146	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
1147	    k_ops, tsp);
1148
1149#ifdef KTRACE
1150	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
1151		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
1152		    td->td_retval[0], k_ops->kevent_size);
1153#endif
1154
1155	return (error);
1156}
1157
1158/*
1159 * Copy 'count' items into the destination list pointed to by uap->eventlist.
1160 */
1161static int
1162kevent_copyout(void *arg, struct kevent *kevp, int count)
1163{
1164	struct kevent_args *uap;
1165	int error;
1166
1167	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1168	uap = (struct kevent_args *)arg;
1169
1170	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
1171	if (error == 0)
1172		uap->eventlist += count;
1173	return (error);
1174}
1175
1176/*
1177 * Copy 'count' items from the list pointed to by uap->changelist.
1178 */
1179static int
1180kevent_copyin(void *arg, struct kevent *kevp, int count)
1181{
1182	struct kevent_args *uap;
1183	int error;
1184
1185	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1186	uap = (struct kevent_args *)arg;
1187
1188	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
1189	if (error == 0)
1190		uap->changelist += count;
1191	return (error);
1192}
1193
1194#ifdef COMPAT_FREEBSD11
1195static int
1196kevent11_copyout(void *arg, struct kevent *kevp, int count)
1197{
1198	struct freebsd11_kevent_args *uap;
1199	struct kevent_freebsd11 kev11;
1200	int error, i;
1201
1202	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1203	uap = (struct freebsd11_kevent_args *)arg;
1204
1205	for (i = 0; i < count; i++) {
1206		kev11.ident = kevp->ident;
1207		kev11.filter = kevp->filter;
1208		kev11.flags = kevp->flags;
1209		kev11.fflags = kevp->fflags;
1210		kev11.data = kevp->data;
1211		kev11.udata = kevp->udata;
1212		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
1213		if (error != 0)
1214			break;
1215		uap->eventlist++;
1216		kevp++;
1217	}
1218	return (error);
1219}
1220
1221/*
1222 * Copy 'count' items from the list pointed to by uap->changelist.
1223 */
1224static int
1225kevent11_copyin(void *arg, struct kevent *kevp, int count)
1226{
1227	struct freebsd11_kevent_args *uap;
1228	struct kevent_freebsd11 kev11;
1229	int error, i;
1230
1231	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1232	uap = (struct freebsd11_kevent_args *)arg;
1233
1234	for (i = 0; i < count; i++) {
1235		error = copyin(uap->changelist, &kev11, sizeof(kev11));
1236		if (error != 0)
1237			break;
1238		kevp->ident = kev11.ident;
1239		kevp->filter = kev11.filter;
1240		kevp->flags = kev11.flags;
1241		kevp->fflags = kev11.fflags;
1242		kevp->data = (uintptr_t)kev11.data;
1243		kevp->udata = kev11.udata;
1244		bzero(&kevp->ext, sizeof(kevp->ext));
1245		uap->changelist++;
1246		kevp++;
1247	}
1248	return (error);
1249}
1250
1251int
1252freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
1253{
1254	struct kevent_copyops k_ops = {
1255		.arg = uap,
1256		.k_copyout = kevent11_copyout,
1257		.k_copyin = kevent11_copyin,
1258		.kevent_size = sizeof(struct kevent_freebsd11),
1259	};
1260	struct g_kevent_args gk_args = {
1261		.fd = uap->fd,
1262		.changelist = uap->changelist,
1263		.nchanges = uap->nchanges,
1264		.eventlist = uap->eventlist,
1265		.nevents = uap->nevents,
1266		.timeout = uap->timeout,
1267	};
1268
1269	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11"));
1270}
1271#endif
1272
1273int
1274kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
1275    struct kevent_copyops *k_ops, const struct timespec *timeout)
1276{
1277	cap_rights_t rights;
1278	struct file *fp;
1279	int error;
1280
1281	cap_rights_init_zero(&rights);
1282	if (nchanges > 0)
1283		cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE);
1284	if (nevents > 0)
1285		cap_rights_set_one(&rights, CAP_KQUEUE_EVENT);
1286	error = fget(td, fd, &rights, &fp);
1287	if (error != 0)
1288		return (error);
1289
1290	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
1291	fdrop(fp, td);
1292
1293	return (error);
1294}
1295
1296static int
1297kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
1298    struct kevent_copyops *k_ops, const struct timespec *timeout)
1299{
1300	struct kevent keva[KQ_NEVENTS];
1301	struct kevent *kevp, *changes;
1302	int i, n, nerrors, error;
1303
1304	nerrors = 0;
1305	while (nchanges > 0) {
1306		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1307		error = k_ops->k_copyin(k_ops->arg, keva, n);
1308		if (error)
1309			return (error);
1310		changes = keva;
1311		for (i = 0; i < n; i++) {
1312			kevp = &changes[i];
1313			if (!kevp->filter)
1314				continue;
1315			kevp->flags &= ~EV_SYSFLAGS;
1316			error = kqueue_register(kq, kevp, td, M_WAITOK);
1317			if (error || (kevp->flags & EV_RECEIPT)) {
1318				if (nevents == 0)
1319					return (error);
1320				kevp->flags = EV_ERROR;
1321				kevp->data = error;
1322				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1323				nevents--;
1324				nerrors++;
1325			}
1326		}
1327		nchanges -= n;
1328	}
1329	if (nerrors) {
1330		td->td_retval[0] = nerrors;
1331		return (0);
1332	}
1333
1334	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1335}
1336
1337int
1338kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
1339    struct kevent_copyops *k_ops, const struct timespec *timeout)
1340{
1341	struct kqueue *kq;
1342	int error;
1343
1344	error = kqueue_acquire(fp, &kq);
1345	if (error != 0)
1346		return (error);
1347	error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
1348	kqueue_release(kq, 0);
1349	return (error);
1350}
1351
1352/*
1353 * Performs a kevent() call on a temporarily created kqueue. This can be
1354 * used to perform one-shot polling, similar to poll() and select().
1355 */
1356int
1357kern_kevent_anonymous(struct thread *td, int nevents,
1358    struct kevent_copyops *k_ops)
1359{
1360	struct kqueue kq = {};
1361	int error;
1362
1363	kqueue_init(&kq);
1364	kq.kq_refcnt = 1;
1365	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1366	kqueue_drain(&kq, td);
1367	kqueue_destroy(&kq);
1368	return (error);
1369}
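
/*
 * Illustrative sketch (not part of this file): an in-kernel consumer of
 * kern_kevent_anonymous() supplies copy ops that move kevents to and from
 * kernel memory rather than userspace.  The structure and helper names
 * below are hypothetical; note that the anonymous path uses the same
 * count for both nchanges and nevents.
 *
 *	struct kev_kbuf {
 *		struct kevent	*changes;
 *		struct kevent	*events;
 *	};
 *
 *	static int
 *	kev_kcopyin(void *arg, struct kevent *kevp, int count)
 *	{
 *		struct kev_kbuf *b = arg;
 *
 *		memcpy(kevp, b->changes, count * sizeof(*kevp));
 *		b->changes += count;
 *		return (0);
 *	}
 *
 *	// kev_kcopyout() is symmetric, writing into b->events.
 *
 *	struct kevent_copyops k_ops = {
 *		.arg = &kbuf,
 *		.k_copyout = kev_kcopyout,
 *		.k_copyin = kev_kcopyin,
 *		.kevent_size = sizeof(struct kevent),
 *	};
 *	error = kern_kevent_anonymous(td, nevents, &k_ops);
 */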
1370
1371int
1372kqueue_add_filteropts(int filt, struct filterops *filtops)
1373{
1374	int error;
1375
1376	error = 0;
1377	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1378		printf(
1379"trying to add a filterop that is out of range: %d is beyond %d\n",
1380		    ~filt, EVFILT_SYSCOUNT);
1381		return EINVAL;
1382	}
1383	mtx_lock(&filterops_lock);
1384	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1385	    sysfilt_ops[~filt].for_fop != NULL)
1386		error = EEXIST;
1387	else {
1388		sysfilt_ops[~filt].for_fop = filtops;
1389		sysfilt_ops[~filt].for_refcnt = 0;
1390	}
1391	mtx_unlock(&filterops_lock);
1392
1393	return (error);
1394}
1395
1396int
1397kqueue_del_filteropts(int filt)
1398{
1399	int error;
1400
1401	error = 0;
1402	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1403		return EINVAL;
1404
1405	mtx_lock(&filterops_lock);
1406	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1407	    sysfilt_ops[~filt].for_fop == NULL)
1408		error = EINVAL;
1409	else if (sysfilt_ops[~filt].for_refcnt != 0)
1410		error = EBUSY;
1411	else {
1412		sysfilt_ops[~filt].for_fop = &null_filtops;
1413		sysfilt_ops[~filt].for_refcnt = 0;
1414	}
1415	mtx_unlock(&filterops_lock);
1416
1417	return error;
1418}
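
/*
 * Illustrative sketch (not part of this file): a subsystem that owns one
 * of the EVFILT_* slots initialized to null_filtops above installs its
 * real filterops at load time and removes them at unload.  The slot and
 * the filterops/handler names are examples only.
 *
 *	static struct filterops lio_filtops_impl = {
 *		.f_isfd = 0,
 *		.f_attach = filt_lioattach,
 *		.f_detach = filt_liodetach,
 *		.f_event = filt_lio,
 *	};
 *
 *	// Module load:
 *	error = kqueue_add_filteropts(EVFILT_LIO, &lio_filtops_impl);
 *
 *	// Module unload; returns EBUSY while knotes still hold a
 *	// reference on the filter.
 *	error = kqueue_del_filteropts(EVFILT_LIO);
 */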
1419
1420static struct filterops *
1421kqueue_fo_find(int filt)
1422{
1423
1424	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1425		return NULL;
1426
1427	if (sysfilt_ops[~filt].for_nolock)
1428		return sysfilt_ops[~filt].for_fop;
1429
1430	mtx_lock(&filterops_lock);
1431	sysfilt_ops[~filt].for_refcnt++;
1432	if (sysfilt_ops[~filt].for_fop == NULL)
1433		sysfilt_ops[~filt].for_fop = &null_filtops;
1434	mtx_unlock(&filterops_lock);
1435
1436	return sysfilt_ops[~filt].for_fop;
1437}
1438
1439static void
1440kqueue_fo_release(int filt)
1441{
1442
1443	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1444		return;
1445
1446	if (sysfilt_ops[~filt].for_nolock)
1447		return;
1448
1449	mtx_lock(&filterops_lock);
1450	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1451	    ("filter object refcount not valid on release"));
1452	sysfilt_ops[~filt].for_refcnt--;
1453	mtx_unlock(&filterops_lock);
1454}
1455
1456/*
1457 * A ref to kq (obtained via kqueue_acquire) must be held.
1458 */
1459static int
1460kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
1461    int mflag)
1462{
1463	struct filterops *fops;
1464	struct file *fp;
1465	struct knote *kn, *tkn;
1466	struct knlist *knl;
1467	int error, filt, event;
1468	int haskqglobal, filedesc_unlock;
1469
1470	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1471		return (EINVAL);
1472
1473	fp = NULL;
1474	kn = NULL;
1475	knl = NULL;
1476	error = 0;
1477	haskqglobal = 0;
1478	filedesc_unlock = 0;
1479
1480	filt = kev->filter;
1481	fops = kqueue_fo_find(filt);
1482	if (fops == NULL)
1483		return EINVAL;
1484
1485	if (kev->flags & EV_ADD) {
1486		/*
1487		 * Prevent waiting with locks.  Non-sleepable
1488		 * allocation failures are handled in the loop, only
1489		 * if the spare knote appears to be actually required.
1490		 */
1491		tkn = knote_alloc(mflag);
1492	} else {
1493		tkn = NULL;
1494	}
1495
1496findkn:
1497	if (fops->f_isfd) {
1498		KASSERT(td != NULL, ("td is NULL"));
1499		if (kev->ident > INT_MAX)
1500			error = EBADF;
1501		else
1502			error = fget(td, kev->ident, &cap_event_rights, &fp);
1503		if (error)
1504			goto done;
1505
1506		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1507		    kev->ident, M_NOWAIT) != 0) {
1508			/* try again */
1509			fdrop(fp, td);
1510			fp = NULL;
1511			error = kqueue_expand(kq, fops, kev->ident, mflag);
1512			if (error)
1513				goto done;
1514			goto findkn;
1515		}
1516
1517		if (fp->f_type == DTYPE_KQUEUE) {
1518			/*
1519			 * If we add some intelligence about what we are doing,
1520			 * we should be able to support events on ourselves.
1521			 * We need to know when we are doing this to prevent
1522			 * getting both the knlist lock and the kq lock since
1523			 * they are the same thing.
1524			 */
1525			if (fp->f_data == kq) {
1526				error = EINVAL;
1527				goto done;
1528			}
1529
1530			/*
1531			 * Pre-lock the filedesc before the global
1532			 * lock mutex, see the comment in
1533			 * kqueue_close().
1534			 */
1535			FILEDESC_XLOCK(td->td_proc->p_fd);
1536			filedesc_unlock = 1;
1537			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1538		}
1539
1540		KQ_LOCK(kq);
1541		if (kev->ident < kq->kq_knlistsize) {
1542			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1543				if (kev->filter == kn->kn_filter)
1544					break;
1545		}
1546	} else {
1547		if ((kev->flags & EV_ADD) == EV_ADD) {
1548			error = kqueue_expand(kq, fops, kev->ident, mflag);
1549			if (error != 0)
1550				goto done;
1551		}
1552
1553		KQ_LOCK(kq);
1554
1555		/*
1556		 * If possible, find an existing knote to use for this kevent.
1557		 */
1558		if (kev->filter == EVFILT_PROC &&
1559		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
			/*
			 * This is an internal creation of a process tracking
			 * note.  Don't attempt to coalesce this with an
			 * existing note.
			 */
1564			;
1565		} else if (kq->kq_knhashmask != 0) {
1566			struct klist *list;
1567
1568			list = &kq->kq_knhash[
1569			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1570			SLIST_FOREACH(kn, list, kn_link)
1571				if (kev->ident == kn->kn_id &&
1572				    kev->filter == kn->kn_filter)
1573					break;
1574		}
1575	}
1576
1577	/* knote is in the process of changing, wait for it to stabilize. */
1578	if (kn != NULL && kn_in_flux(kn)) {
1579		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1580		if (filedesc_unlock) {
1581			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1582			filedesc_unlock = 0;
1583		}
1584		kq->kq_state |= KQ_FLUXWAIT;
1585		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1586		if (fp != NULL) {
1587			fdrop(fp, td);
1588			fp = NULL;
1589		}
1590		goto findkn;
1591	}
1592
1593	/*
1594	 * kn now contains the matching knote, or NULL if no match
1595	 */
1596	if (kn == NULL) {
1597		if (kev->flags & EV_ADD) {
1598			kn = tkn;
1599			tkn = NULL;
1600			if (kn == NULL) {
1601				KQ_UNLOCK(kq);
1602				error = ENOMEM;
1603				goto done;
1604			}
1605			kn->kn_fp = fp;
1606			kn->kn_kq = kq;
1607			kn->kn_fop = fops;
1608			/*
1609			 * apply reference counts to knote structure, and
1610			 * do not release it at the end of this routine.
1611			 */
1612			fops = NULL;
1613			fp = NULL;
1614
1615			kn->kn_sfflags = kev->fflags;
1616			kn->kn_sdata = kev->data;
1617			kev->fflags = 0;
1618			kev->data = 0;
1619			kn->kn_kevent = *kev;
1620			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1621			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1622			kn->kn_status = KN_DETACHED;
1623			if ((kev->flags & EV_DISABLE) != 0)
1624				kn->kn_status |= KN_DISABLED;
1625			kn_enter_flux(kn);
1626
1627			error = knote_attach(kn, kq);
1628			KQ_UNLOCK(kq);
1629			if (error != 0) {
1630				tkn = kn;
1631				goto done;
1632			}
1633
1634			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1635				knote_drop_detached(kn, td);
1636				goto done;
1637			}
1638			knl = kn_list_lock(kn);
1639			goto done_ev_add;
1640		} else {
1641			/* No matching knote and the EV_ADD flag is not set. */
1642			KQ_UNLOCK(kq);
1643			error = ENOENT;
1644			goto done;
1645		}
1646	}
1647
1648	if (kev->flags & EV_DELETE) {
1649		kn_enter_flux(kn);
1650		KQ_UNLOCK(kq);
1651		knote_drop(kn, td);
1652		goto done;
1653	}
1654
1655	if (kev->flags & EV_FORCEONESHOT) {
1656		kn->kn_flags |= EV_ONESHOT;
1657		KNOTE_ACTIVATE(kn, 1);
1658	}
1659
1660	if ((kev->flags & EV_ENABLE) != 0)
1661		kn->kn_status &= ~KN_DISABLED;
1662	else if ((kev->flags & EV_DISABLE) != 0)
1663		kn->kn_status |= KN_DISABLED;
1664
1665	/*
1666	 * The user may change some filter values after the initial EV_ADD,
1667	 * but doing so will not reset any filter which has already been
1668	 * triggered.
1669	 */
1670	kn->kn_status |= KN_SCAN;
1671	kn_enter_flux(kn);
1672	KQ_UNLOCK(kq);
1673	knl = kn_list_lock(kn);
1674	kn->kn_kevent.udata = kev->udata;
1675	if (!fops->f_isfd && fops->f_touch != NULL) {
1676		fops->f_touch(kn, kev, EVENT_REGISTER);
1677	} else {
1678		kn->kn_sfflags = kev->fflags;
1679		kn->kn_sdata = kev->data;
1680	}
1681
1682done_ev_add:
1683	/*
1684	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1685	 * the initial attach event decides that the event is "completed"
	 * already, e.g., filt_procattach() is called on a zombie process.  It
	 * will call filt_proc(), which will remove the knote from the list
	 * and set kn_knlist to NULL.
1689	 *
1690	 * KN_DISABLED will be stable while the knote is in flux, so the
1691	 * unlocked read will not race with an update.
1692	 */
1693	if ((kn->kn_status & KN_DISABLED) == 0)
1694		event = kn->kn_fop->f_event(kn, 0);
1695	else
1696		event = 0;
1697
1698	KQ_LOCK(kq);
1699	if (event)
1700		kn->kn_status |= KN_ACTIVE;
1701	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1702	    KN_ACTIVE)
1703		knote_enqueue(kn);
1704	kn->kn_status &= ~KN_SCAN;
1705	kn_leave_flux(kn);
1706	kn_list_unlock(knl);
1707	KQ_UNLOCK_FLUX(kq);
1708
1709done:
1710	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1711	if (filedesc_unlock)
1712		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1713	if (fp != NULL)
1714		fdrop(fp, td);
1715	knote_free(tkn);
1716	if (fops != NULL)
1717		kqueue_fo_release(filt);
1718	return (error);
1719}
1720
1721static int
1722kqueue_acquire(struct file *fp, struct kqueue **kqp)
1723{
1724	int error;
1725	struct kqueue *kq;
1726
1727	error = 0;
1728
1729	kq = fp->f_data;
1730	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1731		return (EBADF);
1732	*kqp = kq;
1733	KQ_LOCK(kq);
1734	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1735		KQ_UNLOCK(kq);
1736		return (EBADF);
1737	}
1738	kq->kq_refcnt++;
1739	KQ_UNLOCK(kq);
1740
1741	return error;
1742}
1743
1744static void
1745kqueue_release(struct kqueue *kq, int locked)
1746{
1747	if (locked)
1748		KQ_OWNED(kq);
1749	else
1750		KQ_LOCK(kq);
1751	kq->kq_refcnt--;
1752	if (kq->kq_refcnt == 1)
1753		wakeup(&kq->kq_refcnt);
1754	if (!locked)
1755		KQ_UNLOCK(kq);
1756}
1757
1758static void
1759kqueue_schedtask(struct kqueue *kq)
1760{
1761
1762	KQ_OWNED(kq);
1763	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1764	    ("scheduling kqueue task while draining"));
1765
1766	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1767		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1768		kq->kq_state |= KQ_TASKSCHED;
1769	}
1770}
1771
1772/*
1773 * Expand the kq to make sure we have storage for fops/ident pair.
1774 *
1775 * Return 0 on success (or no work necessary), return errno on failure.
1776 */
1777static int
1778kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1779    int mflag)
1780{
1781	struct klist *list, *tmp_knhash, *to_free;
1782	u_long tmp_knhashmask;
1783	int error, fd, size;
1784
1785	KQ_NOTOWNED(kq);
1786
1787	error = 0;
1788	to_free = NULL;
1789	if (fops->f_isfd) {
1790		fd = ident;
1791		if (kq->kq_knlistsize <= fd) {
1792			size = kq->kq_knlistsize;
1793			while (size <= fd)
1794				size += KQEXTENT;
1795			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1796			if (list == NULL)
1797				return ENOMEM;
1798			KQ_LOCK(kq);
1799			if ((kq->kq_state & KQ_CLOSING) != 0) {
1800				to_free = list;
1801				error = EBADF;
1802			} else if (kq->kq_knlistsize > fd) {
1803				to_free = list;
1804			} else {
1805				if (kq->kq_knlist != NULL) {
1806					bcopy(kq->kq_knlist, list,
1807					    kq->kq_knlistsize * sizeof(*list));
1808					to_free = kq->kq_knlist;
1809					kq->kq_knlist = NULL;
1810				}
1811				bzero((caddr_t)list +
1812				    kq->kq_knlistsize * sizeof(*list),
1813				    (size - kq->kq_knlistsize) * sizeof(*list));
1814				kq->kq_knlistsize = size;
1815				kq->kq_knlist = list;
1816			}
1817			KQ_UNLOCK(kq);
1818		}
1819	} else {
1820		if (kq->kq_knhashmask == 0) {
1821			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
1822			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
1823			    HASH_WAITOK : HASH_NOWAIT);
1824			if (tmp_knhash == NULL)
1825				return (ENOMEM);
1826			KQ_LOCK(kq);
1827			if ((kq->kq_state & KQ_CLOSING) != 0) {
1828				to_free = tmp_knhash;
1829				error = EBADF;
1830			} else if (kq->kq_knhashmask == 0) {
1831				kq->kq_knhash = tmp_knhash;
1832				kq->kq_knhashmask = tmp_knhashmask;
1833			} else {
1834				to_free = tmp_knhash;
1835			}
1836			KQ_UNLOCK(kq);
1837		}
1838	}
1839	free(to_free, M_KQUEUE);
1840
1841	KQ_NOTOWNED(kq);
1842	return (error);
1843}
1844
1845static void
1846kqueue_task(void *arg, int pending)
1847{
1848	struct kqueue *kq;
1849	int haskqglobal;
1850
1851	haskqglobal = 0;
1852	kq = arg;
1853
1854	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1855	KQ_LOCK(kq);
1856
1857	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1858
1859	kq->kq_state &= ~KQ_TASKSCHED;
1860	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1861		wakeup(&kq->kq_state);
1862	}
1863	KQ_UNLOCK(kq);
1864	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1865}
1866
1867/*
1868 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1869 * We treat KN_MARKER knotes as if they are in flux.
1870 */
1871static int
1872kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1873    const struct timespec *tsp, struct kevent *keva, struct thread *td)
1874{
1875	struct kevent *kevp;
1876	struct knote *kn, *marker;
1877	struct knlist *knl;
1878	sbintime_t asbt, rsbt;
1879	int count, error, haskqglobal, influx, nkev, touch;
1880
1881	count = maxevents;
1882	nkev = 0;
1883	error = 0;
1884	haskqglobal = 0;
1885
1886	if (maxevents == 0)
1887		goto done_nl;
1888
1889	rsbt = 0;
1890	if (tsp != NULL) {
1891		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
1892		    tsp->tv_nsec >= 1000000000) {
1893			error = EINVAL;
1894			goto done_nl;
1895		}
1896		if (timespecisset(tsp)) {
1897			if (tsp->tv_sec <= INT32_MAX) {
1898				rsbt = tstosbt(*tsp);
1899				if (TIMESEL(&asbt, rsbt))
1900					asbt += tc_tick_sbt;
1901				if (asbt <= SBT_MAX - rsbt)
1902					asbt += rsbt;
1903				else
1904					asbt = 0;
1905				rsbt >>= tc_precexp;
1906			} else
1907				asbt = 0;
1908		} else
1909			asbt = -1;
1910	} else
1911		asbt = 0;
1912	marker = knote_alloc(M_WAITOK);
1913	marker->kn_status = KN_MARKER;
1914	KQ_LOCK(kq);
1915
1916retry:
1917	kevp = keva;
1918	if (kq->kq_count == 0) {
1919		if (asbt == -1) {
1920			error = EWOULDBLOCK;
1921		} else {
1922			kq->kq_state |= KQ_SLEEP;
1923			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1924			    "kqread", asbt, rsbt, C_ABSOLUTE);
1925		}
1926		if (error == 0)
1927			goto retry;
1928		/* don't restart after signals... */
1929		if (error == ERESTART)
1930			error = EINTR;
1931		else if (error == EWOULDBLOCK)
1932			error = 0;
1933		goto done;
1934	}
1935
1936	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1937	influx = 0;
1938	while (count) {
1939		KQ_OWNED(kq);
1940		kn = TAILQ_FIRST(&kq->kq_head);
1941
1942		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1943		    kn_in_flux(kn)) {
1944			if (influx) {
1945				influx = 0;
1946				KQ_FLUX_WAKEUP(kq);
1947			}
1948			kq->kq_state |= KQ_FLUXWAIT;
1949			error = msleep(kq, &kq->kq_lock, PSOCK,
1950			    "kqflxwt", 0);
1951			continue;
1952		}
1953
1954		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1955		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1956			kn->kn_status &= ~KN_QUEUED;
1957			kq->kq_count--;
1958			continue;
1959		}
1960		if (kn == marker) {
1961			KQ_FLUX_WAKEUP(kq);
1962			if (count == maxevents)
1963				goto retry;
1964			goto done;
1965		}
1966		KASSERT(!kn_in_flux(kn),
1967		    ("knote %p is unexpectedly in flux", kn));
1968
1969		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
1970			kn->kn_status &= ~KN_QUEUED;
1971			kn_enter_flux(kn);
1972			kq->kq_count--;
1973			KQ_UNLOCK(kq);
1974			/*
1975			 * We don't need to lock the list since we've
1976			 * marked it as in flux.
1977			 */
1978			knote_drop(kn, td);
1979			KQ_LOCK(kq);
1980			continue;
1981		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1982			kn->kn_status &= ~KN_QUEUED;
1983			kn_enter_flux(kn);
1984			kq->kq_count--;
1985			KQ_UNLOCK(kq);
1986			/*
1987			 * We don't need to lock the list since we've
1988			 * marked the knote as being in flux.
1989			 */
1990			*kevp = kn->kn_kevent;
1991			knote_drop(kn, td);
1992			KQ_LOCK(kq);
1993			kn = NULL;
1994		} else {
1995			kn->kn_status |= KN_SCAN;
1996			kn_enter_flux(kn);
1997			KQ_UNLOCK(kq);
1998			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1999				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
2000			knl = kn_list_lock(kn);
2001			if (kn->kn_fop->f_event(kn, 0) == 0) {
2002				KQ_LOCK(kq);
2003				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2004				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
2005				    KN_SCAN);
2006				kn_leave_flux(kn);
2007				kq->kq_count--;
2008				kn_list_unlock(knl);
2009				influx = 1;
2010				continue;
2011			}
2012			touch = (!kn->kn_fop->f_isfd &&
2013			    kn->kn_fop->f_touch != NULL);
2014			if (touch)
2015				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
2016			else
2017				*kevp = kn->kn_kevent;
2018			KQ_LOCK(kq);
2019			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2020			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
				/*
				 * Manually clear knotes that weren't
				 * 'touch'ed.
				 */
2025				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
2026					kn->kn_data = 0;
2027					kn->kn_fflags = 0;
2028				}
2029				if (kn->kn_flags & EV_DISPATCH)
2030					kn->kn_status |= KN_DISABLED;
2031				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
2032				kq->kq_count--;
2033			} else
2034				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2035
2036			kn->kn_status &= ~KN_SCAN;
2037			kn_leave_flux(kn);
2038			kn_list_unlock(knl);
2039			influx = 1;
2040		}
2041
2042		/* we are returning a copy to the user */
2043		kevp++;
2044		nkev++;
2045		count--;
2046
2047		if (nkev == KQ_NEVENTS) {
2048			influx = 0;
2049			KQ_UNLOCK_FLUX(kq);
2050			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2051			nkev = 0;
2052			kevp = keva;
2053			KQ_LOCK(kq);
2054			if (error)
2055				break;
2056		}
2057	}
2058	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
2059done:
2060	KQ_OWNED(kq);
2061	KQ_UNLOCK_FLUX(kq);
2062	knote_free(marker);
2063done_nl:
2064	KQ_NOTOWNED(kq);
2065	if (nkev != 0)
2066		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2067	td->td_retval[0] = maxevents - count;
2068	return (error);
2069}
2070
2071/*ARGSUSED*/
2072static int
2073kqueue_ioctl(struct file *fp, u_long cmd, void *data,
2074	struct ucred *active_cred, struct thread *td)
2075{
2076	/*
2077	 * Enabling sigio causes two major problems:
2078	 * 1) infinite recursion:
	 * Synopsis: kevent is being used to track signals and has FIOASYNC
2080	 * set.  On receipt of a signal this will cause a kqueue to recurse
2081	 * into itself over and over.  Sending the sigio causes the kqueue
2082	 * to become ready, which in turn posts sigio again, forever.
2083	 * Solution: this can be solved by setting a flag in the kqueue that
2084	 * we have a SIGIO in progress.
2085	 * 2) locking problems:
2086	 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
2087	 * us above the proc and pgrp locks.
2088	 * Solution: Post a signal using an async mechanism, being sure to
2089	 * record a generation count in the delivery so that we do not deliver
2090	 * a signal to the wrong process.
2091	 *
2092	 * Note, these two mechanisms are somewhat mutually exclusive!
2093	 */
#if 0
	struct kqueue *kq;

	kq = fp->f_data;
	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			kq->kq_state |= KQ_ASYNC;
		} else {
			kq->kq_state &= ~KQ_ASYNC;
		}
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &kq->kq_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(&kq->kq_sigio);
		return (0);
	}
#endif

	return (ENOTTY);
}

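/*
 * poll(2)/select(2) support: report the kqueue as readable when it has
 * events pending; otherwise record the selector so that a later
 * kqueue_wakeup() can notify the polling thread.
 */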
/*ARGSUSED*/
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
	struct thread *td)
{
	struct kqueue *kq;
	int revents = 0;
	int error;

	if ((error = kqueue_acquire(fp, &kq)))
		return (POLLERR);

	KQ_LOCK(kq);
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(td, &kq->kq_sel);
			if (SEL_WAITING(&kq->kq_sel))
				kq->kq_state |= KQ_SEL;
		}
	}
	kqueue_release(kq, 1);
	KQ_UNLOCK(kq);
	return (revents);
}

/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
	struct thread *td)
{

	bzero((void *)st, sizeof *st);
	/*
	 * We no longer return kq_count because the unlocked value is useless.
	 * If you spent all this time getting the count, why not spend your
	 * syscall better by calling kevent?
	 *
	 * XXX - This is needed for libc_r.
	 */
	st->st_mode = S_IFIFO;
	return (0);
}

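/*
 * Tear down all knotes on a kqueue prior to destroying it.  Marks the
 * kqueue as closing, waits for other references and for in-flux knotes
 * to settle, drops every knote on the fd list and the hash table, then
 * waits out the scan task and wakes any select/poll waiters.
 */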
static void
kqueue_drain(struct kqueue *kq, struct thread *td)
{
	struct knote *kn;
	int i;

	KQ_LOCK(kq);

	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
	    ("kqueue already closing"));
	kq->kq_state |= KQ_CLOSING;
	if (kq->kq_refcnt > 1)
		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);

	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));

	KASSERT(knlist_empty(&kq->kq_sel.si_note),
	    ("kqueue's knlist not empty"));

	for (i = 0; i < kq->kq_knlistsize; i++) {
		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
			if (kn_in_flux(kn)) {
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
				continue;
			}
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
	}
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i <= kq->kq_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
				if (kn_in_flux(kn)) {
					kq->kq_state |= KQ_FLUXWAIT;
					msleep(kq, &kq->kq_lock, PSOCK,
					       "kqclo2", 0);
					continue;
				}
				kn_enter_flux(kn);
				KQ_UNLOCK(kq);
				knote_drop(kn, td);
				KQ_LOCK(kq);
			}
		}
	}

	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
		kq->kq_state |= KQ_TASKDRAIN;
		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
	}

	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		selwakeuppri(&kq->kq_sel, PSOCK);
		if (!SEL_WAITING(&kq->kq_sel))
			kq->kq_state &= ~KQ_SEL;
	}

	KQ_UNLOCK(kq);
}

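/*
 * Release the resources of a drained kqueue: select state, the embedded
 * knlist, the lock, the knote tables, and any sigio registration.  The
 * kqueue must already be detached from its file descriptor table.
 */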
static void
kqueue_destroy(struct kqueue *kq)
{

	KASSERT(kq->kq_fdp == NULL,
	    ("kqueue still attached to a file descriptor"));
	seldrain(&kq->kq_sel);
	knlist_destroy(&kq->kq_sel.si_note);
	mtx_destroy(&kq->kq_lock);

	if (kq->kq_knhash != NULL)
		free(kq->kq_knhash, M_KQUEUE);
	if (kq->kq_knlist != NULL)
		free(kq->kq_knlist, M_KQUEUE);

	funsetown(&kq->kq_sigio);
}

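/*
 * Last close of the kqueue file: drain all knotes, unlink the kqueue
 * from its file descriptor table, and release its memory.
 */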
/*ARGSUSED*/
static int
kqueue_close(struct file *fp, struct thread *td)
{
	struct kqueue *kq = fp->f_data;
	struct filedesc *fdp;
	int error;
	int filedesc_unlock;

	if ((error = kqueue_acquire(fp, &kq)))
		return (error);
	kqueue_drain(kq, td);

	/*
	 * We could be called due to the knote_drop() doing fdrop(),
	 * called from kqueue_register().  In this case the global
	 * lock is owned, and the filedesc sx was locked beforehand so
	 * that the sleepable lock is not taken after a non-sleepable
	 * one.
	 */
	fdp = kq->kq_fdp;
	kq->kq_fdp = NULL;
	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
		FILEDESC_XLOCK(fdp);
		filedesc_unlock = 1;
	} else
		filedesc_unlock = 0;
	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
	if (filedesc_unlock)
		FILEDESC_XUNLOCK(fdp);

	kqueue_destroy(kq);
	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
	crfree(kq->kq_cred);
	free(kq, M_KQUEUE);
	fp->f_data = NULL;

	return (0);
}

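/*
 * Fill in the kinfo_file entry describing a kqueue descriptor.
 */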
static int
kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_KQUEUE;
	return (0);
}

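/*
 * Notify all consumers waiting on this kqueue: threads sleeping in
 * kevent(2), select/poll waiters, kqueues monitoring this one (via the
 * task queue), and SIGIO listeners when async mode is enabled.
 */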
static void
kqueue_wakeup(struct kqueue *kq)
{
	KQ_OWNED(kq);

	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);
	}
	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		selwakeuppri(&kq->kq_sel, PSOCK);
		if (!SEL_WAITING(&kq->kq_sel))
			kq->kq_state &= ~KQ_SEL;
	}
	if (!knlist_empty(&kq->kq_sel.si_note))
		kqueue_schedtask(kq);
	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
		pgsigio(&kq->kq_sigio, SIGIO, 0);
	}
}

/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * There is an opportunity to optimize the case of one kq watching another:
 * instead of scheduling a task to wake it up, enough state could be passed
 * down the chain to wake the parent kqueue directly.  Make this code
 * functional first.
 */
void
knote(struct knlist *list, long hint, int lockflags)
{
	struct kqueue *kq;
	struct knote *kn, *tkn;
	int error;

	if (list == NULL)
		return;

	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);

	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_lock(list->kl_lockarg);

	/*
	 * If we unlock the list lock (and mark the knote in flux), we
	 * can eliminate the kqueue scheduling, but this introduces
	 * four lock/unlock operations for each knote tested.  A marker
	 * would also be needed to keep the iteration position, since
	 * filters or other threads could remove events.
	 */
	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
			/*
			 * Do not process in-flux knotes, except for
			 * those whose flux state comes from the kq
			 * unlock in kqueue_scan().  In the latter case
			 * we do not interfere with the scan, since the
			 * code fragment in kqueue_scan() locks the
			 * knlist and cannot proceed until we finish.
			 */
			KQ_UNLOCK(kq);
		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			error = kn->kn_fop->f_event(kn, hint);
			KQ_LOCK(kq);
			kn_leave_flux(kn);
			if (error)
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK_FLUX(kq);
		} else {
			if (kn->kn_fop->f_event(kn, hint))
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK(kq);
		}
	}
	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_unlock(list->kl_lockarg);
}

/*
 * add a knote to a knlist
 */
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{

	KNL_ASSERT_LOCK(knl, islocked);
	KQ_NOTOWNED(kn->kn_kq);
	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
	KASSERT((kn->kn_status & KN_DETACHED) != 0,
	    ("knote %p was not detached", kn));
	if (!islocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
	if (!islocked)
		knl->kl_unlock(knl->kl_lockarg);
	KQ_LOCK(kn->kn_kq);
	kn->kn_knlist = knl;
	kn->kn_status &= ~KN_DETACHED;
	KQ_UNLOCK(kn->kn_kq);
}

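/*
 * Remove a knote from a knlist and mark it detached, taking only the
 * locks the caller does not already hold.
 */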
static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
    int kqislocked)
{

	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
	KNL_ASSERT_LOCK(knl, knlislocked);
	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
	KASSERT((kn->kn_status & KN_DETACHED) == 0,
	    ("knote %p was already detached", kn));
	if (!knlislocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
	kn->kn_knlist = NULL;
	if (!knlislocked)
		kn_list_unlock(knl);
	if (!kqislocked)
		KQ_LOCK(kn->kn_kq);
	kn->kn_status |= KN_DETACHED;
	if (!kqislocked)
		KQ_UNLOCK(kn->kn_kq);
}

/*
 * remove knote from the specified knlist
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{

	knlist_remove_kq(knl, kn, islocked, 0);
}

int
knlist_empty(struct knlist *knl)
{

	KNL_ASSERT_LOCKED(knl);
	return (SLIST_EMPTY(&knl->kl_list));
}

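/*
 * Default locking for knlists initialized without an explicit lock: a
 * single shared mutex, together with wrappers that adapt mutexes and
 * rwlocks to the kl_lock/kl_unlock/kl_assert_lock method signatures.
 */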
static struct mtx knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
    MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);

static void
knlist_mtx_lock(void *arg)
{

	mtx_lock((struct mtx *)arg);
}

static void
knlist_mtx_unlock(void *arg)
{

	mtx_unlock((struct mtx *)arg);
}

static void
knlist_mtx_assert_lock(void *arg, int what)
{

	if (what == LA_LOCKED)
		mtx_assert((struct mtx *)arg, MA_OWNED);
	else
		mtx_assert((struct mtx *)arg, MA_NOTOWNED);
}

static void
knlist_rw_rlock(void *arg)
{

	rw_rlock((struct rwlock *)arg);
}

static void
knlist_rw_runlock(void *arg)
{

	rw_runlock((struct rwlock *)arg);
}

static void
knlist_rw_assert_lock(void *arg, int what)
{

	if (what == LA_LOCKED)
		rw_assert((struct rwlock *)arg, RA_LOCKED);
	else
		rw_assert((struct rwlock *)arg, RA_UNLOCKED);
}

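/*
 * Initialize a knlist.  Any lock argument or lock method left as NULL
 * falls back to the shared knlist_lock mutex and the mutex-based
 * methods above.
 */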
void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
    void (*kl_unlock)(void *),
    void (*kl_assert_lock)(void *, int))
{

	if (lock == NULL)
		knl->kl_lockarg = &knlist_lock;
	else
		knl->kl_lockarg = lock;

	if (kl_lock == NULL)
		knl->kl_lock = knlist_mtx_lock;
	else
		knl->kl_lock = kl_lock;
	if (kl_unlock == NULL)
		knl->kl_unlock = knlist_mtx_unlock;
	else
		knl->kl_unlock = kl_unlock;
	if (kl_assert_lock == NULL)
		knl->kl_assert_lock = knlist_mtx_assert_lock;
	else
		knl->kl_assert_lock = kl_assert_lock;

	knl->kl_autodestroy = 0;
	SLIST_INIT(&knl->kl_list);
}

void
knlist_init_mtx(struct knlist *knl, struct mtx *lock)
{

	knlist_init(knl, lock, NULL, NULL, NULL);
}

struct knlist *
knlist_alloc(struct mtx *lock)
{
	struct knlist *knl;

	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
	knlist_init_mtx(knl, lock);
	return (knl);
}

void
knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
{

	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
	    knlist_rw_assert_lock);
}

void
knlist_destroy(struct knlist *knl)
{

	KASSERT(KNLIST_EMPTY(knl),
	    ("destroying knlist %p with knotes on it", knl));
}

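/*
 * Mark an externally allocated knlist for automatic destruction; if the
 * list is already empty it is destroyed and freed immediately.
 */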
void
knlist_detach(struct knlist *knl)
{

	KNL_ASSERT_LOCKED(knl);
	knl->kl_autodestroy = 1;
	if (knlist_empty(knl)) {
		knlist_destroy(knl);
		free(knl, M_KQUEUE);
	}
}

/*
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if (kn_in_flux(kn)) {
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			knote_drop_detached(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= EV_EOF | EV_ONESHOT;
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still in flux knotes remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}

/*
 * Remove all knotes referencing a specified fd.  This must be called with
 * the FILEDESC lock held, which prevents a race where a new fd comes along,
 * occupies the entry, and we end up attaching a knote to the new fd.
 */
void
knote_fdclose(struct thread *td, int fd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	int influx;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We shouldn't have to worry about new kevents appearing on fd
	 * since filedesc is locked.
	 */
	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
		KQ_LOCK(kq);

again:
		influx = 0;
		while (kq->kq_knlistsize > fd &&
		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
			if (kn_in_flux(kn)) {
				/* someone else might be waiting on our knote */
				if (influx)
					wakeup(kq);
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
				goto again;
			}
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			influx = 1;
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
		KQ_UNLOCK_FLUX(kq);
	}
}

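/*
 * Link a knote into its kqueue's per-fd list or identifier hash.  The
 * knote must be in flux and the kqueue lock held; fails if the kqueue
 * is closing or the slot for this identifier is not available.
 */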
static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
	struct klist *list;

	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
	KQ_OWNED(kq);

	if ((kq->kq_state & KQ_CLOSING) != 0)
		return (EBADF);
	if (kn->kn_fop->f_isfd) {
		if (kn->kn_id >= kq->kq_knlistsize)
			return (ENOMEM);
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		if (kq->kq_knhash == NULL)
			return (ENOMEM);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	SLIST_INSERT_HEAD(list, kn, kn_link);
	return (0);
}

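/*
 * Detach a knote from the object it watches (unless already detached)
 * and then tear it down completely.
 */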
static void
knote_drop(struct knote *kn, struct thread *td)
{

	if ((kn->kn_status & KN_DETACHED) == 0)
		kn->kn_fop->f_detach(kn);
	knote_drop_detached(kn, td);
}

static void
knote_drop_detached(struct knote *kn, struct thread *td)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_DETACHED) != 0,
	    ("knote %p still attached", kn));
	KQ_NOTOWNED(kq);

	KQ_LOCK(kq);
	KASSERT(kn->kn_influx == 1,
	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));

	if (kn->kn_fop->f_isfd)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	if (!SLIST_EMPTY(list))
		SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	KQ_UNLOCK_FLUX(kq);

	if (kn->kn_fop->f_isfd) {
		fdrop(kn->kn_fp, td);
		kn->kn_fp = NULL;
	}
	kqueue_fo_release(kn->kn_kevent.filter);
	kn->kn_fop = NULL;
	knote_free(kn);
}

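/*
 * Append an activated knote to the kqueue's pending-event queue and
 * wake up any waiters.
 */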
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_wakeup(kq);
}

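/*
 * Remove a knote from the kqueue's pending-event queue.
 */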
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

static void
knote_init(void)
{

	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);

static struct knote *
knote_alloc(int mflag)
{

	return (uma_zalloc(knote_zone, mflag | M_ZERO));
}

static void
knote_free(struct knote *kn)
{

	uma_zfree(knote_zone, kn);
}

/*
 * Register the kev w/ the kq specified by fd.
 */
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
{
	struct kqueue *kq;
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
	    &fp);
	if (error != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto noacquire;

	error = kqueue_register(kq, kev, td, mflag);
	kqueue_release(kq, 0);

noacquire:
	fdrop(fp, td);
	return (error);
}
