1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
5 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
6 * Copyright (c) 2009 Apple, Inc.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include "opt_ktrace.h"
35#include "opt_kqueue.h"
36
37#ifdef COMPAT_FREEBSD11
38#define	_WANT_FREEBSD11_KEVENT
39#endif
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/capsicum.h>
44#include <sys/kernel.h>
45#include <sys/limits.h>
46#include <sys/lock.h>
47#include <sys/mutex.h>
48#include <sys/rwlock.h>
49#include <sys/proc.h>
50#include <sys/malloc.h>
51#include <sys/unistd.h>
52#include <sys/file.h>
53#include <sys/filedesc.h>
54#include <sys/filio.h>
55#include <sys/fcntl.h>
56#include <sys/kthread.h>
57#include <sys/selinfo.h>
58#include <sys/queue.h>
59#include <sys/event.h>
60#include <sys/eventvar.h>
61#include <sys/poll.h>
62#include <sys/protosw.h>
63#include <sys/resourcevar.h>
64#include <sys/sigio.h>
65#include <sys/signalvar.h>
66#include <sys/socket.h>
67#include <sys/socketvar.h>
68#include <sys/stat.h>
69#include <sys/sysctl.h>
70#include <sys/sysproto.h>
71#include <sys/syscallsubr.h>
72#include <sys/taskqueue.h>
73#include <sys/uio.h>
74#include <sys/user.h>
75#ifdef KTRACE
76#include <sys/ktrace.h>
77#endif
78#include <machine/atomic.h>
79
80#include <vm/uma.h>
81
82static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
83
84/*
 * This lock is used if multiple kq locks are required.  This possibly
 * should be made into a per-proc lock.
87 */
88static struct mtx	kq_global;
89MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
90#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
91	if (!haslck)				\
92		mtx_lock(lck);			\
93	haslck = 1;				\
94} while (0)
95#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
96	if (haslck)				\
97		mtx_unlock(lck);			\
98	haslck = 0;				\
99} while (0)
100
101TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
102
103static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
104static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
105static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
106		    struct thread *td, int mflag);
107static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
108static void	kqueue_release(struct kqueue *kq, int locked);
109static void	kqueue_destroy(struct kqueue *kq);
110static void	kqueue_drain(struct kqueue *kq, struct thread *td);
111static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
112		    uintptr_t ident, int mflag);
113static void	kqueue_task(void *arg, int pending);
114static int	kqueue_scan(struct kqueue *kq, int maxevents,
115		    struct kevent_copyops *k_ops,
116		    const struct timespec *timeout,
117		    struct kevent *keva, struct thread *td);
118static void 	kqueue_wakeup(struct kqueue *kq);
119static struct filterops *kqueue_fo_find(int filt);
120static void	kqueue_fo_release(int filt);
121struct g_kevent_args;
122static int	kern_kevent_generic(struct thread *td,
123		    struct g_kevent_args *uap,
124		    struct kevent_copyops *k_ops, const char *struct_name);
125
126static fo_ioctl_t	kqueue_ioctl;
127static fo_poll_t	kqueue_poll;
128static fo_kqfilter_t	kqueue_kqfilter;
129static fo_stat_t	kqueue_stat;
130static fo_close_t	kqueue_close;
131static fo_fill_kinfo_t	kqueue_fill_kinfo;
132
133static struct fileops kqueueops = {
134	.fo_read = invfo_rdwr,
135	.fo_write = invfo_rdwr,
136	.fo_truncate = invfo_truncate,
137	.fo_ioctl = kqueue_ioctl,
138	.fo_poll = kqueue_poll,
139	.fo_kqfilter = kqueue_kqfilter,
140	.fo_stat = kqueue_stat,
141	.fo_close = kqueue_close,
142	.fo_chmod = invfo_chmod,
143	.fo_chown = invfo_chown,
144	.fo_sendfile = invfo_sendfile,
145	.fo_fill_kinfo = kqueue_fill_kinfo,
146};
147
148static int 	knote_attach(struct knote *kn, struct kqueue *kq);
149static void 	knote_drop(struct knote *kn, struct thread *td);
150static void 	knote_drop_detached(struct knote *kn, struct thread *td);
151static void 	knote_enqueue(struct knote *kn);
152static void 	knote_dequeue(struct knote *kn);
153static void 	knote_init(void);
154static struct 	knote *knote_alloc(int mflag);
155static void 	knote_free(struct knote *kn);
156
157static void	filt_kqdetach(struct knote *kn);
158static int	filt_kqueue(struct knote *kn, long hint);
159static int	filt_procattach(struct knote *kn);
160static void	filt_procdetach(struct knote *kn);
161static int	filt_proc(struct knote *kn, long hint);
162static int	filt_fileattach(struct knote *kn);
163static void	filt_timerexpire(void *knx);
164static int	filt_timerattach(struct knote *kn);
165static void	filt_timerdetach(struct knote *kn);
166static void	filt_timerstart(struct knote *kn, sbintime_t to);
167static void	filt_timertouch(struct knote *kn, struct kevent *kev,
168		    u_long type);
169static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
170static int	filt_timer(struct knote *kn, long hint);
171static int	filt_userattach(struct knote *kn);
172static void	filt_userdetach(struct knote *kn);
173static int	filt_user(struct knote *kn, long hint);
174static void	filt_usertouch(struct knote *kn, struct kevent *kev,
175		    u_long type);
176
177static struct filterops file_filtops = {
178	.f_isfd = 1,
179	.f_attach = filt_fileattach,
180};
181static struct filterops kqread_filtops = {
182	.f_isfd = 1,
183	.f_detach = filt_kqdetach,
184	.f_event = filt_kqueue,
185};
186/* XXX - move to kern_proc.c?  */
187static struct filterops proc_filtops = {
188	.f_isfd = 0,
189	.f_attach = filt_procattach,
190	.f_detach = filt_procdetach,
191	.f_event = filt_proc,
192};
193static struct filterops timer_filtops = {
194	.f_isfd = 0,
195	.f_attach = filt_timerattach,
196	.f_detach = filt_timerdetach,
197	.f_event = filt_timer,
198	.f_touch = filt_timertouch,
199};
200static struct filterops user_filtops = {
201	.f_attach = filt_userattach,
202	.f_detach = filt_userdetach,
203	.f_event = filt_user,
204	.f_touch = filt_usertouch,
205};
206
207static uma_zone_t	knote_zone;
208static unsigned int	kq_ncallouts = 0;
209static unsigned int 	kq_calloutmax = 4 * 1024;
210SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
211    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
212
213/* XXX - ensure not influx ? */
214#define KNOTE_ACTIVATE(kn, islock) do { 				\
215	if ((islock))							\
216		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
217	else								\
218		KQ_LOCK((kn)->kn_kq);					\
219	(kn)->kn_status |= KN_ACTIVE;					\
220	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
221		knote_enqueue((kn));					\
222	if (!(islock))							\
223		KQ_UNLOCK((kn)->kn_kq);					\
} while (0)
225#define KQ_LOCK(kq) do {						\
226	mtx_lock(&(kq)->kq_lock);					\
227} while (0)
228#define KQ_FLUX_WAKEUP(kq) do {						\
229	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
230		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
231		wakeup((kq));						\
232	}								\
233} while (0)
234#define KQ_UNLOCK_FLUX(kq) do {						\
235	KQ_FLUX_WAKEUP(kq);						\
236	mtx_unlock(&(kq)->kq_lock);					\
237} while (0)
238#define KQ_UNLOCK(kq) do {						\
239	mtx_unlock(&(kq)->kq_lock);					\
240} while (0)
241#define KQ_OWNED(kq) do {						\
242	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
243} while (0)
244#define KQ_NOTOWNED(kq) do {						\
245	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
246} while (0)
247
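/*
 * Lock the knlist a knote is attached to and return it, or return NULL
 * if the knote is not currently on a list.
 */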
248static struct knlist *
249kn_list_lock(struct knote *kn)
250{
251	struct knlist *knl;
252
253	knl = kn->kn_knlist;
254	if (knl != NULL)
255		knl->kl_lock(knl->kl_lockarg);
256	return (knl);
257}
258
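/*
 * Unlock the knlist; if it was marked for autodestroy and is now empty,
 * destroy and free it as well.
 */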
259static void
260kn_list_unlock(struct knlist *knl)
261{
262	bool do_free;
263
264	if (knl == NULL)
265		return;
266	do_free = knl->kl_autodestroy && knlist_empty(knl);
267	knl->kl_unlock(knl->kl_lockarg);
268	if (do_free) {
269		knlist_destroy(knl);
270		free(knl, M_KQUEUE);
271	}
272}
273
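/*
 * A knote is "in flux" while some thread is detaching, dropping, or
 * otherwise operating on it.  Other threads must not modify or free an
 * in-flux knote; they wait on the kqueue (KQ_FLUXWAIT) until the flux
 * count returns to zero.
 */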
274static bool
275kn_in_flux(struct knote *kn)
276{
277
278	return (kn->kn_influx > 0);
279}
280
281static void
282kn_enter_flux(struct knote *kn)
283{
284
285	KQ_OWNED(kn->kn_kq);
286	MPASS(kn->kn_influx < INT_MAX);
287	kn->kn_influx++;
288}
289
290static bool
291kn_leave_flux(struct knote *kn)
292{
293
294	KQ_OWNED(kn->kn_kq);
295	MPASS(kn->kn_influx > 0);
296	kn->kn_influx--;
297	return (kn->kn_influx == 0);
298}
299
300#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
301	if (islocked)							\
302		KNL_ASSERT_LOCKED(knl);				\
303	else								\
304		KNL_ASSERT_UNLOCKED(knl);				\
305} while (0)
306#ifdef INVARIANTS
307#define	KNL_ASSERT_LOCKED(knl) do {					\
308	knl->kl_assert_locked((knl)->kl_lockarg);			\
309} while (0)
310#define	KNL_ASSERT_UNLOCKED(knl) do {					\
311	knl->kl_assert_unlocked((knl)->kl_lockarg);			\
312} while (0)
313#else /* !INVARIANTS */
#define	KNL_ASSERT_LOCKED(knl) do {} while (0)
315#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
316#endif /* INVARIANTS */
317
318#ifndef	KN_HASHSIZE
319#define	KN_HASHSIZE		64		/* XXX should be tunable */
320#endif
321
322#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
323
324static int
325filt_nullattach(struct knote *kn)
326{
327
328	return (ENXIO);
329};
330
331struct filterops null_filtops = {
332	.f_isfd = 0,
333	.f_attach = filt_nullattach,
334};
335
336/* XXX - make SYSINIT to add these, and move into respective modules. */
337extern struct filterops sig_filtops;
338extern struct filterops fs_filtops;
339
340/*
 * Table for all system-defined filters.
342 */
343static struct mtx	filterops_lock;
344MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
345	MTX_DEF);
346static struct {
347	struct filterops *for_fop;
348	int for_nolock;
349	int for_refcnt;
350} sysfilt_ops[EVFILT_SYSCOUNT] = {
351	{ &file_filtops, 1 },			/* EVFILT_READ */
352	{ &file_filtops, 1 },			/* EVFILT_WRITE */
353	{ &null_filtops },			/* EVFILT_AIO */
354	{ &file_filtops, 1 },			/* EVFILT_VNODE */
355	{ &proc_filtops, 1 },			/* EVFILT_PROC */
356	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
357	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
358	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
359	{ &fs_filtops, 1 },			/* EVFILT_FS */
360	{ &null_filtops },			/* EVFILT_LIO */
361	{ &user_filtops, 1 },			/* EVFILT_USER */
362	{ &null_filtops },			/* EVFILT_SENDFILE */
	{ &file_filtops, 1 },			/* EVFILT_EMPTY */
364};
365
366/*
367 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
368 * method.
369 */
370static int
371filt_fileattach(struct knote *kn)
372{
373
374	return (fo_kqfilter(kn->kn_fp, kn));
375}
376
377/*ARGSUSED*/
378static int
379kqueue_kqfilter(struct file *fp, struct knote *kn)
380{
381	struct kqueue *kq = kn->kn_fp->f_data;
382
383	if (kn->kn_filter != EVFILT_READ)
384		return (EINVAL);
385
386	kn->kn_status |= KN_KQUEUE;
387	kn->kn_fop = &kqread_filtops;
388	knlist_add(&kq->kq_sel.si_note, kn, 0);
389
390	return (0);
391}
392
393static void
394filt_kqdetach(struct knote *kn)
395{
396	struct kqueue *kq = kn->kn_fp->f_data;
397
398	knlist_remove(&kq->kq_sel.si_note, kn, 0);
399}
400
401/*ARGSUSED*/
402static int
403filt_kqueue(struct knote *kn, long hint)
404{
405	struct kqueue *kq = kn->kn_fp->f_data;
406
407	kn->kn_data = kq->kq_count;
408	return (kn->kn_data > 0);
409}
410
411/* XXX - move to kern_proc.c?  */
412static int
413filt_procattach(struct knote *kn)
414{
415	struct proc *p;
416	int error;
417	bool exiting, immediate;
418
419	exiting = immediate = false;
420	if (kn->kn_sfflags & NOTE_EXIT)
421		p = pfind_any(kn->kn_id);
422	else
423		p = pfind(kn->kn_id);
424	if (p == NULL)
425		return (ESRCH);
426	if (p->p_flag & P_WEXIT)
427		exiting = true;
428
429	if ((error = p_cansee(curthread, p))) {
430		PROC_UNLOCK(p);
431		return (error);
432	}
433
434	kn->kn_ptr.p_proc = p;
435	kn->kn_flags |= EV_CLEAR;		/* automatically set */
436
437	/*
438	 * Internal flag indicating registration done by kernel for the
439	 * purposes of getting a NOTE_CHILD notification.
440	 */
441	if (kn->kn_flags & EV_FLAG2) {
442		kn->kn_flags &= ~EV_FLAG2;
443		kn->kn_data = kn->kn_sdata;		/* ppid */
444		kn->kn_fflags = NOTE_CHILD;
445		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
446		immediate = true; /* Force immediate activation of child note. */
447	}
448	/*
449	 * Internal flag indicating registration done by kernel (for other than
450	 * NOTE_CHILD).
451	 */
452	if (kn->kn_flags & EV_FLAG1) {
453		kn->kn_flags &= ~EV_FLAG1;
454	}
455
456	knlist_add(p->p_klist, kn, 1);
457
458	/*
459	 * Immediately activate any child notes or, in the case of a zombie
460	 * target process, exit notes.  The latter is necessary to handle the
461	 * case where the target process, e.g. a child, dies before the kevent
462	 * is registered.
463	 */
464	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
465		KNOTE_ACTIVATE(kn, 0);
466
467	PROC_UNLOCK(p);
468
469	return (0);
470}
471
472/*
473 * The knote may be attached to a different process, which may exit,
474 * leaving nothing for the knote to be attached to.  So when the process
475 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
476 * it will be deleted when read out.  However, as part of the knote deletion,
477 * this routine is called, so a check is needed to avoid actually performing
478 * a detach, because the original process does not exist any more.
479 */
480/* XXX - move to kern_proc.c?  */
481static void
482filt_procdetach(struct knote *kn)
483{
484
485	knlist_remove(kn->kn_knlist, kn, 0);
486	kn->kn_ptr.p_proc = NULL;
487}
488
489/* XXX - move to kern_proc.c?  */
490static int
491filt_proc(struct knote *kn, long hint)
492{
493	struct proc *p;
494	u_int event;
495
496	p = kn->kn_ptr.p_proc;
497	if (p == NULL) /* already activated, from attach filter */
498		return (0);
499
500	/* Mask off extra data. */
501	event = (u_int)hint & NOTE_PCTRLMASK;
502
503	/* If the user is interested in this event, record it. */
504	if (kn->kn_sfflags & event)
505		kn->kn_fflags |= event;
506
507	/* Process is gone, so flag the event as finished. */
508	if (event == NOTE_EXIT) {
509		kn->kn_flags |= EV_EOF | EV_ONESHOT;
510		kn->kn_ptr.p_proc = NULL;
511		if (kn->kn_fflags & NOTE_EXIT)
512			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
513		if (kn->kn_fflags == 0)
514			kn->kn_flags |= EV_DROP;
515		return (1);
516	}
517
518	return (kn->kn_fflags != 0);
519}
520
521/*
 * Called when a process forks.  It mostly does the same as knote(),
 * activating all knotes registered to be activated when the process
 * forks.  Additionally, for each knote attached to the parent, check
 * whether the user wants to track the new process.  If so, attach a
 * new knote to the child and immediately report an event with the
 * child's pid.
528 */
529void
530knote_fork(struct knlist *list, int pid)
531{
532	struct kqueue *kq;
533	struct knote *kn;
534	struct kevent kev;
535	int error;
536
537	if (list == NULL)
538		return;
539
540	memset(&kev, 0, sizeof(kev));
541	list->kl_lock(list->kl_lockarg);
542	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
543		kq = kn->kn_kq;
544		KQ_LOCK(kq);
545		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
546			KQ_UNLOCK(kq);
547			continue;
548		}
549
550		/*
551		 * The same as knote(), activate the event.
552		 */
553		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
554			if (kn->kn_fop->f_event(kn, NOTE_FORK))
555				KNOTE_ACTIVATE(kn, 1);
556			KQ_UNLOCK(kq);
557			continue;
558		}
559
560		/*
561		 * The NOTE_TRACK case. In addition to the activation
562		 * of the event, we need to register new events to
563		 * track the child. Drop the locks in preparation for
564		 * the call to kqueue_register().
565		 */
566		kn_enter_flux(kn);
567		KQ_UNLOCK(kq);
568		list->kl_unlock(list->kl_lockarg);
569
570		/*
571		 * Activate existing knote and register tracking knotes with
572		 * new process.
573		 *
574		 * First register a knote to get just the child notice. This
575		 * must be a separate note from a potential NOTE_EXIT
576		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
577		 * to use the data field (in conflicting ways).
578		 */
579		kev.ident = pid;
580		kev.filter = kn->kn_filter;
581		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
582		    EV_FLAG2;
583		kev.fflags = kn->kn_sfflags;
584		kev.data = kn->kn_id;		/* parent */
585		kev.udata = kn->kn_kevent.udata;/* preserve udata */
586		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
587		if (error)
588			kn->kn_fflags |= NOTE_TRACKERR;
589
590		/*
591		 * Then register another knote to track other potential events
592		 * from the new process.
593		 */
594		kev.ident = pid;
595		kev.filter = kn->kn_filter;
596		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
597		kev.fflags = kn->kn_sfflags;
598		kev.data = kn->kn_id;		/* parent */
599		kev.udata = kn->kn_kevent.udata;/* preserve udata */
600		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
601		if (error)
602			kn->kn_fflags |= NOTE_TRACKERR;
603		if (kn->kn_fop->f_event(kn, NOTE_FORK))
604			KNOTE_ACTIVATE(kn, 0);
605		list->kl_lock(list->kl_lockarg);
606		KQ_LOCK(kq);
607		kn_leave_flux(kn);
608		KQ_UNLOCK_FLUX(kq);
609	}
610	list->kl_unlock(list->kl_lockarg);
611}
612
613/*
614 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
615 * interval timer support code.
616 */
617
618#define NOTE_TIMER_PRECMASK						\
619    (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
620
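/*
 * Convert a timer value, expressed in the units selected by the NOTE_*
 * precision flags (milliseconds when no unit is specified), into an
 * sbintime_t.  Returns -1 for an unrecognized unit.
 */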
621static sbintime_t
622timer2sbintime(int64_t data, int flags)
623{
624	int64_t secs;
625
	/*
	 * Macros for converting to the fractional second portion of an
	 * sbintime_t using 64-bit multiplication to improve precision.
	 */
630#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
631#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
632#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
633	switch (flags & NOTE_TIMER_PRECMASK) {
634	case NOTE_SECONDS:
635#ifdef __LP64__
636		if (data > (SBT_MAX / SBT_1S))
637			return (SBT_MAX);
638#endif
639		return ((sbintime_t)data << 32);
640	case NOTE_MSECONDS: /* FALLTHROUGH */
641	case 0:
642		if (data >= 1000) {
643			secs = data / 1000;
644#ifdef __LP64__
645			if (secs > (SBT_MAX / SBT_1S))
646				return (SBT_MAX);
647#endif
648			return (secs << 32 | MS_TO_SBT(data % 1000));
649		}
650		return (MS_TO_SBT(data));
651	case NOTE_USECONDS:
652		if (data >= 1000000) {
653			secs = data / 1000000;
654#ifdef __LP64__
655			if (secs > (SBT_MAX / SBT_1S))
656				return (SBT_MAX);
657#endif
658			return (secs << 32 | US_TO_SBT(data % 1000000));
659		}
660		return (US_TO_SBT(data));
661	case NOTE_NSECONDS:
662		if (data >= 1000000000) {
663			secs = data / 1000000000;
664#ifdef __LP64__
665			if (secs > (SBT_MAX / SBT_1S))
666				return (SBT_MAX);
667#endif
668			return (secs << 32 | NS_TO_SBT(data % 1000000000));
669		}
670		return (NS_TO_SBT(data));
671	default:
672		break;
673	}
674	return (-1);
675}
676
677struct kq_timer_cb_data {
678	struct callout c;
679	sbintime_t next;	/* next timer event fires at */
680	sbintime_t to;		/* precalculated timer period, 0 for abs */
681};
682
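/*
 * Callout handler for EVFILT_TIMER knotes: count the expiration,
 * activate the knote and, for periodic timers, re-arm the callout for
 * the next period.
 */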
683static void
684filt_timerexpire(void *knx)
685{
686	struct knote *kn;
687	struct kq_timer_cb_data *kc;
688
689	kn = knx;
690	kn->kn_data++;
691	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
692
693	if ((kn->kn_flags & EV_ONESHOT) != 0)
694		return;
695	kc = kn->kn_ptr.p_v;
696	if (kc->to == 0)
697		return;
698	kc->next += kc->to;
699	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
700	    PCPU_GET(cpuid), C_ABSOLUTE);
701}
702
/*
 * data contains the amount of time to sleep, in the units selected by
 * the NOTE_* precision flags (milliseconds by default).
 */
706static int
707filt_timervalidate(struct knote *kn, sbintime_t *to)
708{
709	struct bintime bt;
710	sbintime_t sbt;
711
712	if (kn->kn_sdata < 0)
713		return (EINVAL);
714	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
715		kn->kn_sdata = 1;
716	/*
717	 * The only fflags values supported are the timer unit
718	 * (precision) and the absolute time indicator.
719	 */
720	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
721		return (EINVAL);
722
723	*to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
724	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
725		getboottimebin(&bt);
726		sbt = bttosbt(bt);
727		*to -= sbt;
728	}
729	if (*to < 0)
730		return (EINVAL);
731	return (0);
732}
733
734static int
735filt_timerattach(struct knote *kn)
736{
737	struct kq_timer_cb_data *kc;
738	sbintime_t to;
739	unsigned int ncallouts;
740	int error;
741
742	error = filt_timervalidate(kn, &to);
743	if (error != 0)
744		return (error);
745
746	do {
747		ncallouts = kq_ncallouts;
748		if (ncallouts >= kq_calloutmax)
749			return (ENOMEM);
750	} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
751
752	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
753		kn->kn_flags |= EV_CLEAR;	/* automatically set */
754	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
755	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
756	callout_init(&kc->c, 1);
757	filt_timerstart(kn, to);
758
759	return (0);
760}
761
762static void
763filt_timerstart(struct knote *kn, sbintime_t to)
764{
765	struct kq_timer_cb_data *kc;
766
767	kc = kn->kn_ptr.p_v;
768	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
769		kc->next = to;
770		kc->to = 0;
771	} else {
772		kc->next = to + sbinuptime();
773		kc->to = to;
774	}
775	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
776	    PCPU_GET(cpuid), C_ABSOLUTE);
777}
778
779static void
780filt_timerdetach(struct knote *kn)
781{
782	struct kq_timer_cb_data *kc;
783	unsigned int old __unused;
784
785	kc = kn->kn_ptr.p_v;
786	callout_drain(&kc->c);
787	free(kc, M_KQUEUE);
788	old = atomic_fetchadd_int(&kq_ncallouts, -1);
789	KASSERT(old > 0, ("Number of callouts cannot become negative"));
790	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
791}
792
793static void
794filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
795{
796	struct kq_timer_cb_data *kc;
797	struct kqueue *kq;
798	sbintime_t to;
799	int error;
800
801	switch (type) {
802	case EVENT_REGISTER:
803		/* Handle re-added timers that update data/fflags */
804		if (kev->flags & EV_ADD) {
805			kc = kn->kn_ptr.p_v;
806
807			/* Drain any existing callout. */
808			callout_drain(&kc->c);
809
			/*
			 * Throw away any existing undelivered record
			 * of the timer expiration.  This is done under
			 * the presumption that if a process is
			 * re-adding this timer with new parameters,
			 * it is no longer interested in what may have
			 * happened under the old parameters.  If it is
			 * interested, it can wait for the expiration,
			 * delete the old timer definition, and then
			 * add the new one.
			 *
			 * This has to be done while the kq is locked:
			 *   - if enqueued, dequeue
			 *   - make it no longer active
			 *   - clear the count of expiration events
			 */
825			kq = kn->kn_kq;
826			KQ_LOCK(kq);
827			if (kn->kn_status & KN_QUEUED)
828				knote_dequeue(kn);
829
830			kn->kn_status &= ~KN_ACTIVE;
831			kn->kn_data = 0;
832			KQ_UNLOCK(kq);
833
834			/* Reschedule timer based on new data/fflags */
835			kn->kn_sfflags = kev->fflags;
836			kn->kn_sdata = kev->data;
837			error = filt_timervalidate(kn, &to);
			if (error != 0) {
				kn->kn_flags |= EV_ERROR;
				kn->kn_data = error;
			} else
				filt_timerstart(kn, to);
843		}
844		break;
845
	case EVENT_PROCESS:
847		*kev = kn->kn_kevent;
848		if (kn->kn_flags & EV_CLEAR) {
849			kn->kn_data = 0;
850			kn->kn_fflags = 0;
851		}
852		break;
853
854	default:
855		panic("filt_timertouch() - invalid type (%ld)", type);
856		break;
857	}
858}
859
860static int
861filt_timer(struct knote *kn, long hint)
862{
863
864	return (kn->kn_data != 0);
865}
866
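/*
 * EVFILT_USER filter.  An illustrative userspace usage sketch (not part
 * of this file):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);		(register)
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);		(trigger)
 */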
867static int
868filt_userattach(struct knote *kn)
869{
870
871	/*
872	 * EVFILT_USER knotes are not attached to anything in the kernel.
873	 */
874	kn->kn_hook = NULL;
875	if (kn->kn_fflags & NOTE_TRIGGER)
876		kn->kn_hookid = 1;
877	else
878		kn->kn_hookid = 0;
879	return (0);
880}
881
882static void
883filt_userdetach(__unused struct knote *kn)
884{
885
886	/*
887	 * EVFILT_USER knotes are not attached to anything in the kernel.
888	 */
889}
890
891static int
892filt_user(struct knote *kn, __unused long hint)
893{
894
895	return (kn->kn_hookid);
896}
897
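/*
 * EVFILT_USER touch handler: on EVENT_REGISTER apply NOTE_TRIGGER and
 * the NOTE_FF* fflags control operations; on EVENT_PROCESS report the
 * saved fflags/data, clearing state for EV_CLEAR knotes.
 */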
898static void
899filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
900{
901	u_int ffctrl;
902
903	switch (type) {
904	case EVENT_REGISTER:
905		if (kev->fflags & NOTE_TRIGGER)
906			kn->kn_hookid = 1;
907
908		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
909		kev->fflags &= NOTE_FFLAGSMASK;
910		switch (ffctrl) {
911		case NOTE_FFNOP:
912			break;
913
914		case NOTE_FFAND:
915			kn->kn_sfflags &= kev->fflags;
916			break;
917
918		case NOTE_FFOR:
919			kn->kn_sfflags |= kev->fflags;
920			break;
921
922		case NOTE_FFCOPY:
923			kn->kn_sfflags = kev->fflags;
924			break;
925
926		default:
927			/* XXX Return error? */
928			break;
929		}
930		kn->kn_sdata = kev->data;
931		if (kev->flags & EV_CLEAR) {
932			kn->kn_hookid = 0;
933			kn->kn_data = 0;
934			kn->kn_fflags = 0;
935		}
936		break;
937
	case EVENT_PROCESS:
939		*kev = kn->kn_kevent;
940		kev->fflags = kn->kn_sfflags;
941		kev->data = kn->kn_sdata;
942		if (kn->kn_flags & EV_CLEAR) {
943			kn->kn_hookid = 0;
944			kn->kn_data = 0;
945			kn->kn_fflags = 0;
946		}
947		break;
948
949	default:
950		panic("filt_usertouch() - invalid type (%ld)", type);
951		break;
952	}
953}
954
955int
956sys_kqueue(struct thread *td, struct kqueue_args *uap)
957{
958
959	return (kern_kqueue(td, 0, NULL));
960}
961
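/*
 * Initialize the parts of a kqueue that are shared by descriptor-backed
 * and anonymous (kern_kevent_anonymous()) kqueues.
 */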
962static void
963kqueue_init(struct kqueue *kq)
964{
965
966	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
967	TAILQ_INIT(&kq->kq_head);
968	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
969	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
970}
971
972int
973kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
974{
975	struct filedesc *fdp;
976	struct kqueue *kq;
977	struct file *fp;
978	struct ucred *cred;
979	int fd, error;
980
981	fdp = td->td_proc->p_fd;
982	cred = td->td_ucred;
983	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
984		return (ENOMEM);
985
986	error = falloc_caps(td, &fp, &fd, flags, fcaps);
987	if (error != 0) {
988		chgkqcnt(cred->cr_ruidinfo, -1, 0);
989		return (error);
990	}
991
992	/* An extra reference on `fp' has been held for us by falloc(). */
993	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
994	kqueue_init(kq);
995	kq->kq_fdp = fdp;
996	kq->kq_cred = crhold(cred);
997
998	FILEDESC_XLOCK(fdp);
999	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
1000	FILEDESC_XUNLOCK(fdp);
1001
1002	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
1003	fdrop(fp, td);
1004
1005	td->td_retval[0] = fd;
1006	return (0);
1007}
1008
1009struct g_kevent_args {
1010	int	fd;
1011	void	*changelist;
1012	int	nchanges;
1013	void	*eventlist;
1014	int	nevents;
1015	const struct timespec *timeout;
1016};
1017
1018int
1019sys_kevent(struct thread *td, struct kevent_args *uap)
1020{
1021	struct kevent_copyops k_ops = {
1022		.arg = uap,
1023		.k_copyout = kevent_copyout,
1024		.k_copyin = kevent_copyin,
1025		.kevent_size = sizeof(struct kevent),
1026	};
1027	struct g_kevent_args gk_args = {
1028		.fd = uap->fd,
1029		.changelist = uap->changelist,
1030		.nchanges = uap->nchanges,
1031		.eventlist = uap->eventlist,
1032		.nevents = uap->nevents,
1033		.timeout = uap->timeout,
1034	};
1035
1036	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
1037}
1038
1039static int
1040kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
1041    struct kevent_copyops *k_ops, const char *struct_name)
1042{
1043	struct timespec ts, *tsp;
1044#ifdef KTRACE
1045	struct kevent *eventlist = uap->eventlist;
1046#endif
1047	int error;
1048
1049	if (uap->timeout != NULL) {
1050		error = copyin(uap->timeout, &ts, sizeof(ts));
1051		if (error)
1052			return (error);
1053		tsp = &ts;
1054	} else
1055		tsp = NULL;
1056
1057#ifdef KTRACE
1058	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
1059		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
1060		    uap->nchanges, k_ops->kevent_size);
1061#endif
1062
1063	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
1064	    k_ops, tsp);
1065
1066#ifdef KTRACE
1067	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
1068		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
1069		    td->td_retval[0], k_ops->kevent_size);
1070#endif
1071
1072	return (error);
1073}
1074
1075/*
1076 * Copy 'count' items into the destination list pointed to by uap->eventlist.
1077 */
1078static int
1079kevent_copyout(void *arg, struct kevent *kevp, int count)
1080{
1081	struct kevent_args *uap;
1082	int error;
1083
1084	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1085	uap = (struct kevent_args *)arg;
1086
1087	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
1088	if (error == 0)
1089		uap->eventlist += count;
1090	return (error);
1091}
1092
1093/*
1094 * Copy 'count' items from the list pointed to by uap->changelist.
1095 */
1096static int
1097kevent_copyin(void *arg, struct kevent *kevp, int count)
1098{
1099	struct kevent_args *uap;
1100	int error;
1101
1102	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1103	uap = (struct kevent_args *)arg;
1104
1105	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
1106	if (error == 0)
1107		uap->changelist += count;
1108	return (error);
1109}
1110
1111#ifdef COMPAT_FREEBSD11
1112static int
1113kevent11_copyout(void *arg, struct kevent *kevp, int count)
1114{
1115	struct freebsd11_kevent_args *uap;
1116	struct kevent_freebsd11 kev11;
1117	int error, i;
1118
1119	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1120	uap = (struct freebsd11_kevent_args *)arg;
1121
1122	for (i = 0; i < count; i++) {
1123		kev11.ident = kevp->ident;
1124		kev11.filter = kevp->filter;
1125		kev11.flags = kevp->flags;
1126		kev11.fflags = kevp->fflags;
1127		kev11.data = kevp->data;
1128		kev11.udata = kevp->udata;
1129		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
1130		if (error != 0)
1131			break;
1132		uap->eventlist++;
1133		kevp++;
1134	}
1135	return (error);
1136}
1137
1138/*
1139 * Copy 'count' items from the list pointed to by uap->changelist.
1140 */
1141static int
1142kevent11_copyin(void *arg, struct kevent *kevp, int count)
1143{
1144	struct freebsd11_kevent_args *uap;
1145	struct kevent_freebsd11 kev11;
1146	int error, i;
1147
1148	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1149	uap = (struct freebsd11_kevent_args *)arg;
1150
1151	for (i = 0; i < count; i++) {
1152		error = copyin(uap->changelist, &kev11, sizeof(kev11));
1153		if (error != 0)
1154			break;
1155		kevp->ident = kev11.ident;
1156		kevp->filter = kev11.filter;
1157		kevp->flags = kev11.flags;
1158		kevp->fflags = kev11.fflags;
1159		kevp->data = (uintptr_t)kev11.data;
1160		kevp->udata = kev11.udata;
1161		bzero(&kevp->ext, sizeof(kevp->ext));
1162		uap->changelist++;
1163		kevp++;
1164	}
1165	return (error);
1166}
1167
1168int
1169freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
1170{
1171	struct kevent_copyops k_ops = {
1172		.arg = uap,
1173		.k_copyout = kevent11_copyout,
1174		.k_copyin = kevent11_copyin,
1175		.kevent_size = sizeof(struct kevent_freebsd11),
1176	};
1177	struct g_kevent_args gk_args = {
1178		.fd = uap->fd,
1179		.changelist = uap->changelist,
1180		.nchanges = uap->nchanges,
1181		.eventlist = uap->eventlist,
1182		.nevents = uap->nevents,
1183		.timeout = uap->timeout,
1184	};
1185
1186	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11"));
1187}
1188#endif
1189
1190int
1191kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
1192    struct kevent_copyops *k_ops, const struct timespec *timeout)
1193{
1194	cap_rights_t rights;
1195	struct file *fp;
1196	int error;
1197
1198	cap_rights_init(&rights);
1199	if (nchanges > 0)
1200		cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
1201	if (nevents > 0)
1202		cap_rights_set(&rights, CAP_KQUEUE_EVENT);
1203	error = fget(td, fd, &rights, &fp);
1204	if (error != 0)
1205		return (error);
1206
1207	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
1208	fdrop(fp, td);
1209
1210	return (error);
1211}
1212
1213static int
1214kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
1215    struct kevent_copyops *k_ops, const struct timespec *timeout)
1216{
1217	struct kevent keva[KQ_NEVENTS];
1218	struct kevent *kevp, *changes;
1219	int i, n, nerrors, error;
1220
1221	nerrors = 0;
1222	while (nchanges > 0) {
1223		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1224		error = k_ops->k_copyin(k_ops->arg, keva, n);
1225		if (error)
1226			return (error);
1227		changes = keva;
1228		for (i = 0; i < n; i++) {
1229			kevp = &changes[i];
1230			if (!kevp->filter)
1231				continue;
1232			kevp->flags &= ~EV_SYSFLAGS;
1233			error = kqueue_register(kq, kevp, td, M_WAITOK);
1234			if (error || (kevp->flags & EV_RECEIPT)) {
1235				if (nevents == 0)
1236					return (error);
1237				kevp->flags = EV_ERROR;
1238				kevp->data = error;
1239				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1240				nevents--;
1241				nerrors++;
1242			}
1243		}
1244		nchanges -= n;
1245	}
1246	if (nerrors) {
1247		td->td_retval[0] = nerrors;
1248		return (0);
1249	}
1250
1251	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1252}
1253
1254int
1255kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
1256    struct kevent_copyops *k_ops, const struct timespec *timeout)
1257{
1258	struct kqueue *kq;
1259	int error;
1260
1261	error = kqueue_acquire(fp, &kq);
1262	if (error != 0)
1263		return (error);
1264	error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
1265	kqueue_release(kq, 0);
1266	return (error);
1267}
1268
1269/*
1270 * Performs a kevent() call on a temporarily created kqueue. This can be
1271 * used to perform one-shot polling, similar to poll() and select().
1272 */
1273int
1274kern_kevent_anonymous(struct thread *td, int nevents,
1275    struct kevent_copyops *k_ops)
1276{
1277	struct kqueue kq = {};
1278	int error;
1279
1280	kqueue_init(&kq);
1281	kq.kq_refcnt = 1;
1282	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1283	kqueue_drain(&kq, td);
1284	kqueue_destroy(&kq);
1285	return (error);
1286}
1287
1288int
1289kqueue_add_filteropts(int filt, struct filterops *filtops)
1290{
1291	int error;
1292
1293	error = 0;
1294	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1295		printf(
1296"trying to add a filterop that is out of range: %d is beyond %d\n",
1297		    ~filt, EVFILT_SYSCOUNT);
1298		return EINVAL;
1299	}
1300	mtx_lock(&filterops_lock);
1301	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1302	    sysfilt_ops[~filt].for_fop != NULL)
1303		error = EEXIST;
1304	else {
1305		sysfilt_ops[~filt].for_fop = filtops;
1306		sysfilt_ops[~filt].for_refcnt = 0;
1307	}
1308	mtx_unlock(&filterops_lock);
1309
1310	return (error);
1311}
1312
1313int
1314kqueue_del_filteropts(int filt)
1315{
1316	int error;
1317
1318	error = 0;
1319	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1320		return EINVAL;
1321
1322	mtx_lock(&filterops_lock);
1323	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1324	    sysfilt_ops[~filt].for_fop == NULL)
1325		error = EINVAL;
1326	else if (sysfilt_ops[~filt].for_refcnt != 0)
1327		error = EBUSY;
1328	else {
1329		sysfilt_ops[~filt].for_fop = &null_filtops;
1330		sysfilt_ops[~filt].for_refcnt = 0;
1331	}
1332	mtx_unlock(&filterops_lock);
1333
1334	return error;
1335}
1336
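/*
 * Resolve a filter number to its filterops, taking a reference on the
 * entry unless it is marked as not requiring locking.  Each successful
 * lookup must be paired with a call to kqueue_fo_release().
 */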
1337static struct filterops *
1338kqueue_fo_find(int filt)
1339{
1340
1341	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1342		return NULL;
1343
1344	if (sysfilt_ops[~filt].for_nolock)
1345		return sysfilt_ops[~filt].for_fop;
1346
1347	mtx_lock(&filterops_lock);
1348	sysfilt_ops[~filt].for_refcnt++;
1349	if (sysfilt_ops[~filt].for_fop == NULL)
1350		sysfilt_ops[~filt].for_fop = &null_filtops;
1351	mtx_unlock(&filterops_lock);
1352
1353	return sysfilt_ops[~filt].for_fop;
1354}
1355
1356static void
1357kqueue_fo_release(int filt)
1358{
1359
1360	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1361		return;
1362
1363	if (sysfilt_ops[~filt].for_nolock)
1364		return;
1365
1366	mtx_lock(&filterops_lock);
1367	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1368	    ("filter object refcount not valid on release"));
1369	sysfilt_ops[~filt].for_refcnt--;
1370	mtx_unlock(&filterops_lock);
1371}
1372
1373/*
1374 * A ref to kq (obtained via kqueue_acquire) must be held.
1375 */
1376static int
1377kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
1378    int mflag)
1379{
1380	struct filterops *fops;
1381	struct file *fp;
1382	struct knote *kn, *tkn;
1383	struct knlist *knl;
1384	int error, filt, event;
1385	int haskqglobal, filedesc_unlock;
1386
1387	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1388		return (EINVAL);
1389
1390	fp = NULL;
1391	kn = NULL;
1392	knl = NULL;
1393	error = 0;
1394	haskqglobal = 0;
1395	filedesc_unlock = 0;
1396
1397	filt = kev->filter;
1398	fops = kqueue_fo_find(filt);
1399	if (fops == NULL)
1400		return EINVAL;
1401
1402	if (kev->flags & EV_ADD) {
1403		/*
1404		 * Prevent waiting with locks.  Non-sleepable
1405		 * allocation failures are handled in the loop, only
1406		 * if the spare knote appears to be actually required.
1407		 */
1408		tkn = knote_alloc(mflag);
1409	} else {
1410		tkn = NULL;
1411	}
1412
1413findkn:
1414	if (fops->f_isfd) {
1415		KASSERT(td != NULL, ("td is NULL"));
1416		if (kev->ident > INT_MAX)
1417			error = EBADF;
1418		else
1419			error = fget(td, kev->ident, &cap_event_rights, &fp);
1420		if (error)
1421			goto done;
1422
1423		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1424		    kev->ident, M_NOWAIT) != 0) {
1425			/* try again */
1426			fdrop(fp, td);
1427			fp = NULL;
1428			error = kqueue_expand(kq, fops, kev->ident, mflag);
1429			if (error)
1430				goto done;
1431			goto findkn;
1432		}
1433
1434		if (fp->f_type == DTYPE_KQUEUE) {
1435			/*
1436			 * If we add some intelligence about what we are doing,
1437			 * we should be able to support events on ourselves.
1438			 * We need to know when we are doing this to prevent
1439			 * getting both the knlist lock and the kq lock since
1440			 * they are the same thing.
1441			 */
1442			if (fp->f_data == kq) {
1443				error = EINVAL;
1444				goto done;
1445			}
1446
1447			/*
1448			 * Pre-lock the filedesc before the global
1449			 * lock mutex, see the comment in
1450			 * kqueue_close().
1451			 */
1452			FILEDESC_XLOCK(td->td_proc->p_fd);
1453			filedesc_unlock = 1;
1454			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1455		}
1456
1457		KQ_LOCK(kq);
1458		if (kev->ident < kq->kq_knlistsize) {
1459			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1460				if (kev->filter == kn->kn_filter)
1461					break;
1462		}
1463	} else {
1464		if ((kev->flags & EV_ADD) == EV_ADD) {
1465			error = kqueue_expand(kq, fops, kev->ident, mflag);
1466			if (error != 0)
1467				goto done;
1468		}
1469
1470		KQ_LOCK(kq);
1471
1472		/*
1473		 * If possible, find an existing knote to use for this kevent.
1474		 */
1475		if (kev->filter == EVFILT_PROC &&
1476		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
			/*
			 * This is an internal creation of a process tracking
			 * note.  Don't attempt to coalesce this with an
			 * existing note.
			 */
1481			;
1482		} else if (kq->kq_knhashmask != 0) {
1483			struct klist *list;
1484
1485			list = &kq->kq_knhash[
1486			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1487			SLIST_FOREACH(kn, list, kn_link)
1488				if (kev->ident == kn->kn_id &&
1489				    kev->filter == kn->kn_filter)
1490					break;
1491		}
1492	}
1493
1494	/* knote is in the process of changing, wait for it to stabilize. */
1495	if (kn != NULL && kn_in_flux(kn)) {
1496		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1497		if (filedesc_unlock) {
1498			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1499			filedesc_unlock = 0;
1500		}
1501		kq->kq_state |= KQ_FLUXWAIT;
1502		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1503		if (fp != NULL) {
1504			fdrop(fp, td);
1505			fp = NULL;
1506		}
1507		goto findkn;
1508	}
1509
1510	/*
1511	 * kn now contains the matching knote, or NULL if no match
1512	 */
1513	if (kn == NULL) {
1514		if (kev->flags & EV_ADD) {
1515			kn = tkn;
1516			tkn = NULL;
1517			if (kn == NULL) {
1518				KQ_UNLOCK(kq);
1519				error = ENOMEM;
1520				goto done;
1521			}
1522			kn->kn_fp = fp;
1523			kn->kn_kq = kq;
1524			kn->kn_fop = fops;
1525			/*
1526			 * apply reference counts to knote structure, and
1527			 * do not release it at the end of this routine.
1528			 */
1529			fops = NULL;
1530			fp = NULL;
1531
1532			kn->kn_sfflags = kev->fflags;
1533			kn->kn_sdata = kev->data;
1534			kev->fflags = 0;
1535			kev->data = 0;
1536			kn->kn_kevent = *kev;
1537			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1538			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1539			kn->kn_status = KN_DETACHED;
1540			if ((kev->flags & EV_DISABLE) != 0)
1541				kn->kn_status |= KN_DISABLED;
1542			kn_enter_flux(kn);
1543
1544			error = knote_attach(kn, kq);
1545			KQ_UNLOCK(kq);
1546			if (error != 0) {
1547				tkn = kn;
1548				goto done;
1549			}
1550
1551			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1552				knote_drop_detached(kn, td);
1553				goto done;
1554			}
1555			knl = kn_list_lock(kn);
1556			goto done_ev_add;
1557		} else {
1558			/* No matching knote and the EV_ADD flag is not set. */
1559			KQ_UNLOCK(kq);
1560			error = ENOENT;
1561			goto done;
1562		}
1563	}
1564
1565	if (kev->flags & EV_DELETE) {
1566		kn_enter_flux(kn);
1567		KQ_UNLOCK(kq);
1568		knote_drop(kn, td);
1569		goto done;
1570	}
1571
1572	if (kev->flags & EV_FORCEONESHOT) {
1573		kn->kn_flags |= EV_ONESHOT;
1574		KNOTE_ACTIVATE(kn, 1);
1575	}
1576
1577	if ((kev->flags & EV_ENABLE) != 0)
1578		kn->kn_status &= ~KN_DISABLED;
1579	else if ((kev->flags & EV_DISABLE) != 0)
1580		kn->kn_status |= KN_DISABLED;
1581
1582	/*
1583	 * The user may change some filter values after the initial EV_ADD,
1584	 * but doing so will not reset any filter which has already been
1585	 * triggered.
1586	 */
1587	kn->kn_status |= KN_SCAN;
1588	kn_enter_flux(kn);
1589	KQ_UNLOCK(kq);
1590	knl = kn_list_lock(kn);
1591	kn->kn_kevent.udata = kev->udata;
1592	if (!fops->f_isfd && fops->f_touch != NULL) {
1593		fops->f_touch(kn, kev, EVENT_REGISTER);
1594	} else {
1595		kn->kn_sfflags = kev->fflags;
1596		kn->kn_sdata = kev->data;
1597	}
1598
1599done_ev_add:
1600	/*
1601	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1602	 * the initial attach event decides that the event is "completed"
1603	 * already, e.g., filt_procattach() is called on a zombie process.  It
1604	 * will call filt_proc() which will remove it from the list, and NULL
1605	 * kn_knlist.
1606	 *
1607	 * KN_DISABLED will be stable while the knote is in flux, so the
1608	 * unlocked read will not race with an update.
1609	 */
1610	if ((kn->kn_status & KN_DISABLED) == 0)
1611		event = kn->kn_fop->f_event(kn, 0);
1612	else
1613		event = 0;
1614
1615	KQ_LOCK(kq);
1616	if (event)
1617		kn->kn_status |= KN_ACTIVE;
1618	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1619	    KN_ACTIVE)
1620		knote_enqueue(kn);
1621	kn->kn_status &= ~KN_SCAN;
1622	kn_leave_flux(kn);
1623	kn_list_unlock(knl);
1624	KQ_UNLOCK_FLUX(kq);
1625
1626done:
1627	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1628	if (filedesc_unlock)
1629		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1630	if (fp != NULL)
1631		fdrop(fp, td);
1632	knote_free(tkn);
1633	if (fops != NULL)
1634		kqueue_fo_release(filt);
1635	return (error);
1636}
1637
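/*
 * Resolve a file to its kqueue and take a reference on it, failing with
 * EBADF if the file is not a kqueue or the kqueue is being closed.  The
 * reference is dropped with kqueue_release().
 */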
1638static int
1639kqueue_acquire(struct file *fp, struct kqueue **kqp)
1640{
1641	int error;
1642	struct kqueue *kq;
1643
1644	error = 0;
1645
1646	kq = fp->f_data;
1647	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1648		return (EBADF);
1649	*kqp = kq;
1650	KQ_LOCK(kq);
1651	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1652		KQ_UNLOCK(kq);
1653		return (EBADF);
1654	}
1655	kq->kq_refcnt++;
1656	KQ_UNLOCK(kq);
1657
1658	return error;
1659}
1660
1661static void
1662kqueue_release(struct kqueue *kq, int locked)
1663{
1664	if (locked)
1665		KQ_OWNED(kq);
1666	else
1667		KQ_LOCK(kq);
1668	kq->kq_refcnt--;
1669	if (kq->kq_refcnt == 1)
1670		wakeup(&kq->kq_refcnt);
1671	if (!locked)
1672		KQ_UNLOCK(kq);
1673}
1674
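/*
 * Schedule the kqueue's notification task if it is not already
 * scheduled.  Must not be called while the task is being drained.
 */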
1675static void
1676kqueue_schedtask(struct kqueue *kq)
1677{
1678
1679	KQ_OWNED(kq);
1680	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1681	    ("scheduling kqueue task while draining"));
1682
1683	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1684		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1685		kq->kq_state |= KQ_TASKSCHED;
1686	}
1687}
1688
1689/*
1690 * Expand the kq to make sure we have storage for fops/ident pair.
1691 *
1692 * Return 0 on success (or no work necessary), return errno on failure.
1693 */
1694static int
1695kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1696    int mflag)
1697{
1698	struct klist *list, *tmp_knhash, *to_free;
1699	u_long tmp_knhashmask;
1700	int error, fd, size;
1701
1702	KQ_NOTOWNED(kq);
1703
1704	error = 0;
1705	to_free = NULL;
1706	if (fops->f_isfd) {
1707		fd = ident;
1708		if (kq->kq_knlistsize <= fd) {
1709			size = kq->kq_knlistsize;
1710			while (size <= fd)
1711				size += KQEXTENT;
1712			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1713			if (list == NULL)
1714				return ENOMEM;
1715			KQ_LOCK(kq);
1716			if ((kq->kq_state & KQ_CLOSING) != 0) {
1717				to_free = list;
1718				error = EBADF;
1719			} else if (kq->kq_knlistsize > fd) {
1720				to_free = list;
1721			} else {
1722				if (kq->kq_knlist != NULL) {
1723					bcopy(kq->kq_knlist, list,
1724					    kq->kq_knlistsize * sizeof(*list));
1725					to_free = kq->kq_knlist;
1726					kq->kq_knlist = NULL;
1727				}
1728				bzero((caddr_t)list +
1729				    kq->kq_knlistsize * sizeof(*list),
1730				    (size - kq->kq_knlistsize) * sizeof(*list));
1731				kq->kq_knlistsize = size;
1732				kq->kq_knlist = list;
1733			}
1734			KQ_UNLOCK(kq);
1735		}
1736	} else {
1737		if (kq->kq_knhashmask == 0) {
1738			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
1739			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
1740			    HASH_WAITOK : HASH_NOWAIT);
1741			if (tmp_knhash == NULL)
1742				return (ENOMEM);
1743			KQ_LOCK(kq);
1744			if ((kq->kq_state & KQ_CLOSING) != 0) {
1745				to_free = tmp_knhash;
1746				error = EBADF;
1747			} else if (kq->kq_knhashmask == 0) {
1748				kq->kq_knhash = tmp_knhash;
1749				kq->kq_knhashmask = tmp_knhashmask;
1750			} else {
1751				to_free = tmp_knhash;
1752			}
1753			KQ_UNLOCK(kq);
1754		}
1755	}
1756	free(to_free, M_KQUEUE);
1757
1758	KQ_NOTOWNED(kq);
1759	return (error);
1760}
1761
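/*
 * Taskqueue handler: deliver events to any kqueues monitoring this
 * kqueue and wake up a thread waiting in KQ_TASKDRAIN, if any.
 */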
1762static void
1763kqueue_task(void *arg, int pending)
1764{
1765	struct kqueue *kq;
1766	int haskqglobal;
1767
1768	haskqglobal = 0;
1769	kq = arg;
1770
1771	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1772	KQ_LOCK(kq);
1773
1774	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1775
1776	kq->kq_state &= ~KQ_TASKSCHED;
1777	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1778		wakeup(&kq->kq_state);
1779	}
1780	KQ_UNLOCK(kq);
1781	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1782}
1783
1784/*
1785 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1786 * We treat KN_MARKER knotes as if they are in flux.
1787 */
1788static int
1789kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1790    const struct timespec *tsp, struct kevent *keva, struct thread *td)
1791{
1792	struct kevent *kevp;
1793	struct knote *kn, *marker;
1794	struct knlist *knl;
1795	sbintime_t asbt, rsbt;
1796	int count, error, haskqglobal, influx, nkev, touch;
1797
1798	count = maxevents;
1799	nkev = 0;
1800	error = 0;
1801	haskqglobal = 0;
1802
1803	if (maxevents == 0)
1804		goto done_nl;
1805
1806	rsbt = 0;
1807	if (tsp != NULL) {
1808		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
1809		    tsp->tv_nsec >= 1000000000) {
1810			error = EINVAL;
1811			goto done_nl;
1812		}
1813		if (timespecisset(tsp)) {
1814			if (tsp->tv_sec <= INT32_MAX) {
1815				rsbt = tstosbt(*tsp);
1816				if (TIMESEL(&asbt, rsbt))
1817					asbt += tc_tick_sbt;
1818				if (asbt <= SBT_MAX - rsbt)
1819					asbt += rsbt;
1820				else
1821					asbt = 0;
1822				rsbt >>= tc_precexp;
1823			} else
1824				asbt = 0;
1825		} else
1826			asbt = -1;
1827	} else
1828		asbt = 0;
1829	marker = knote_alloc(M_WAITOK);
1830	marker->kn_status = KN_MARKER;
1831	KQ_LOCK(kq);
1832
1833retry:
1834	kevp = keva;
1835	if (kq->kq_count == 0) {
1836		if (asbt == -1) {
1837			error = EWOULDBLOCK;
1838		} else {
1839			kq->kq_state |= KQ_SLEEP;
1840			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1841			    "kqread", asbt, rsbt, C_ABSOLUTE);
1842		}
1843		if (error == 0)
1844			goto retry;
1845		/* don't restart after signals... */
1846		if (error == ERESTART)
1847			error = EINTR;
1848		else if (error == EWOULDBLOCK)
1849			error = 0;
1850		goto done;
1851	}
1852
1853	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1854	influx = 0;
1855	while (count) {
1856		KQ_OWNED(kq);
1857		kn = TAILQ_FIRST(&kq->kq_head);
1858
1859		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1860		    kn_in_flux(kn)) {
1861			if (influx) {
1862				influx = 0;
1863				KQ_FLUX_WAKEUP(kq);
1864			}
1865			kq->kq_state |= KQ_FLUXWAIT;
1866			error = msleep(kq, &kq->kq_lock, PSOCK,
1867			    "kqflxwt", 0);
1868			continue;
1869		}
1870
1871		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1872		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1873			kn->kn_status &= ~KN_QUEUED;
1874			kq->kq_count--;
1875			continue;
1876		}
1877		if (kn == marker) {
1878			KQ_FLUX_WAKEUP(kq);
1879			if (count == maxevents)
1880				goto retry;
1881			goto done;
1882		}
1883		KASSERT(!kn_in_flux(kn),
1884		    ("knote %p is unexpectedly in flux", kn));
1885
1886		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
1887			kn->kn_status &= ~KN_QUEUED;
1888			kn_enter_flux(kn);
1889			kq->kq_count--;
1890			KQ_UNLOCK(kq);
1891			/*
1892			 * We don't need to lock the list since we've
1893			 * marked it as in flux.
1894			 */
1895			knote_drop(kn, td);
1896			KQ_LOCK(kq);
1897			continue;
1898		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1899			kn->kn_status &= ~KN_QUEUED;
1900			kn_enter_flux(kn);
1901			kq->kq_count--;
1902			KQ_UNLOCK(kq);
1903			/*
1904			 * We don't need to lock the list since we've
1905			 * marked the knote as being in flux.
1906			 */
1907			*kevp = kn->kn_kevent;
1908			knote_drop(kn, td);
1909			KQ_LOCK(kq);
1910			kn = NULL;
1911		} else {
1912			kn->kn_status |= KN_SCAN;
1913			kn_enter_flux(kn);
1914			KQ_UNLOCK(kq);
1915			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1916				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1917			knl = kn_list_lock(kn);
1918			if (kn->kn_fop->f_event(kn, 0) == 0) {
1919				KQ_LOCK(kq);
1920				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1921				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
1922				    KN_SCAN);
1923				kn_leave_flux(kn);
1924				kq->kq_count--;
1925				kn_list_unlock(knl);
1926				influx = 1;
1927				continue;
1928			}
1929			touch = (!kn->kn_fop->f_isfd &&
1930			    kn->kn_fop->f_touch != NULL);
1931			if (touch)
1932				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
1933			else
1934				*kevp = kn->kn_kevent;
1935			KQ_LOCK(kq);
1936			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1937			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
				/*
				 * Manually clear knotes that weren't
				 * 'touch'ed.
				 */
1942				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
1943					kn->kn_data = 0;
1944					kn->kn_fflags = 0;
1945				}
1946				if (kn->kn_flags & EV_DISPATCH)
1947					kn->kn_status |= KN_DISABLED;
1948				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1949				kq->kq_count--;
1950			} else
1951				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1952
1953			kn->kn_status &= ~KN_SCAN;
1954			kn_leave_flux(kn);
1955			kn_list_unlock(knl);
1956			influx = 1;
1957		}
1958
1959		/* we are returning a copy to the user */
1960		kevp++;
1961		nkev++;
1962		count--;
1963
1964		if (nkev == KQ_NEVENTS) {
1965			influx = 0;
1966			KQ_UNLOCK_FLUX(kq);
1967			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1968			nkev = 0;
1969			kevp = keva;
1970			KQ_LOCK(kq);
1971			if (error)
1972				break;
1973		}
1974	}
1975	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1976done:
1977	KQ_OWNED(kq);
1978	KQ_UNLOCK_FLUX(kq);
1979	knote_free(marker);
1980done_nl:
1981	KQ_NOTOWNED(kq);
1982	if (nkev != 0)
1983		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1984	td->td_retval[0] = maxevents - count;
1985	return (error);
1986}
1987
1988/*ARGSUSED*/
1989static int
1990kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1991	struct ucred *active_cred, struct thread *td)
1992{
1993	/*
1994	 * Enabling sigio causes two major problems:
1995	 * 1) infinite recursion:
	 * Synopsis: kevent is being used to track signals and has FIOASYNC
1997	 * set.  On receipt of a signal this will cause a kqueue to recurse
1998	 * into itself over and over.  Sending the sigio causes the kqueue
1999	 * to become ready, which in turn posts sigio again, forever.
2000	 * Solution: this can be solved by setting a flag in the kqueue that
2001	 * we have a SIGIO in progress.
2002	 * 2) locking problems:
	 * Synopsis: kqueue is a leaf subsystem, but adding signalling puts
2004	 * us above the proc and pgrp locks.
2005	 * Solution: Post a signal using an async mechanism, being sure to
2006	 * record a generation count in the delivery so that we do not deliver
2007	 * a signal to the wrong process.
2008	 *
2009	 * Note, these two mechanisms are somewhat mutually exclusive!
2010	 */
2011#if 0
2012	struct kqueue *kq;
2013
2014	kq = fp->f_data;
2015	switch (cmd) {
2016	case FIOASYNC:
2017		if (*(int *)data) {
2018			kq->kq_state |= KQ_ASYNC;
2019		} else {
2020			kq->kq_state &= ~KQ_ASYNC;
2021		}
2022		return (0);
2023
2024	case FIOSETOWN:
2025		return (fsetown(*(int *)data, &kq->kq_sigio));
2026
2027	case FIOGETOWN:
2028		*(int *)data = fgetown(&kq->kq_sigio);
2029		return (0);
2030	}
2031#endif
2032
2033	return (ENOTTY);
2034}
2035
2036/*ARGSUSED*/
2037static int
2038kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
2039	struct thread *td)
2040{
2041	struct kqueue *kq;
2042	int revents = 0;
2043	int error;
2044
2045	if ((error = kqueue_acquire(fp, &kq)))
2046		return POLLERR;
2047
2048	KQ_LOCK(kq);
2049	if (events & (POLLIN | POLLRDNORM)) {
2050		if (kq->kq_count) {
2051			revents |= events & (POLLIN | POLLRDNORM);
2052		} else {
2053			selrecord(td, &kq->kq_sel);
2054			if (SEL_WAITING(&kq->kq_sel))
2055				kq->kq_state |= KQ_SEL;
2056		}
2057	}
2058	kqueue_release(kq, 1);
2059	KQ_UNLOCK(kq);
2060	return (revents);
2061}
2062
2063/*ARGSUSED*/
2064static int
2065kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
2066	struct thread *td)
2067{
2068
2069	bzero((void *)st, sizeof *st);
2070	/*
2071	 * We no longer return kq_count because the unlocked value is useless.
2072	 * If you spent all this time getting the count, why not spend your
2073	 * syscall better by calling kevent?
2074	 *
2075	 * XXX - This is needed for libc_r.
2076	 */
2077	st->st_mode = S_IFIFO;
2078	return (0);
2079}
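
/*
 * Illustrative consequence (userland sketch): fstat(2) on a kqueue
 * descriptor therefore reports only a FIFO-like mode and no count or size
 * information.
 *
 *	struct stat sb;
 *
 *	if (fstat(kq, &sb) == 0)
 *		assert(S_ISFIFO(sb.st_mode));
 */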
2080
2081static void
2082kqueue_drain(struct kqueue *kq, struct thread *td)
2083{
2084	struct knote *kn;
2085	int i;
2086
2087	KQ_LOCK(kq);
2088
2089	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
2090	    ("kqueue already closing"));
2091	kq->kq_state |= KQ_CLOSING;
2092	if (kq->kq_refcnt > 1)
2093		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
2094
2095	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
2096
2097	KASSERT(knlist_empty(&kq->kq_sel.si_note),
2098	    ("kqueue's knlist not empty"));
2099
2100	for (i = 0; i < kq->kq_knlistsize; i++) {
2101		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
2102			if (kn_in_flux(kn)) {
2103				kq->kq_state |= KQ_FLUXWAIT;
2104				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
2105				continue;
2106			}
2107			kn_enter_flux(kn);
2108			KQ_UNLOCK(kq);
2109			knote_drop(kn, td);
2110			KQ_LOCK(kq);
2111		}
2112	}
2113	if (kq->kq_knhashmask != 0) {
2114		for (i = 0; i <= kq->kq_knhashmask; i++) {
2115			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
2116				if (kn_in_flux(kn)) {
2117					kq->kq_state |= KQ_FLUXWAIT;
2118					msleep(kq, &kq->kq_lock, PSOCK,
2119					       "kqclo2", 0);
2120					continue;
2121				}
2122				kn_enter_flux(kn);
2123				KQ_UNLOCK(kq);
2124				knote_drop(kn, td);
2125				KQ_LOCK(kq);
2126			}
2127		}
2128	}
2129
2130	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
2131		kq->kq_state |= KQ_TASKDRAIN;
2132		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
2133	}
2134
2135	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2136		selwakeuppri(&kq->kq_sel, PSOCK);
2137		if (!SEL_WAITING(&kq->kq_sel))
2138			kq->kq_state &= ~KQ_SEL;
2139	}
2140
2141	KQ_UNLOCK(kq);
2142}
2143
2144static void
2145kqueue_destroy(struct kqueue *kq)
2146{
2147
2148	KASSERT(kq->kq_fdp == NULL,
2149	    ("kqueue still attached to a file descriptor"));
2150	seldrain(&kq->kq_sel);
2151	knlist_destroy(&kq->kq_sel.si_note);
2152	mtx_destroy(&kq->kq_lock);
2153
2154	if (kq->kq_knhash != NULL)
2155		free(kq->kq_knhash, M_KQUEUE);
2156	if (kq->kq_knlist != NULL)
2157		free(kq->kq_knlist, M_KQUEUE);
2158
2159	funsetown(&kq->kq_sigio);
2160}
2161
2162/*ARGSUSED*/
2163static int
2164kqueue_close(struct file *fp, struct thread *td)
2165{
2166	struct kqueue *kq = fp->f_data;
2167	struct filedesc *fdp;
2168	int error;
2169	int filedesc_unlock;
2170
2171	if ((error = kqueue_acquire(fp, &kq)))
		return (error);
2173	kqueue_drain(kq, td);
2174
2175	/*
2176	 * We could be called due to the knote_drop() doing fdrop(),
2177	 * called from kqueue_register().  In this case the global
	 * lock is owned, and the filedesc sx is locked beforehand, so that
	 * the sleepable lock is not taken after the non-sleepable one.
2180	 */
2181	fdp = kq->kq_fdp;
2182	kq->kq_fdp = NULL;
2183	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
2184		FILEDESC_XLOCK(fdp);
2185		filedesc_unlock = 1;
2186	} else
2187		filedesc_unlock = 0;
2188	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
2189	if (filedesc_unlock)
2190		FILEDESC_XUNLOCK(fdp);
2191
2192	kqueue_destroy(kq);
2193	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
2194	crfree(kq->kq_cred);
2195	free(kq, M_KQUEUE);
2196	fp->f_data = NULL;
2197
2198	return (0);
2199}
2200
2201static int
2202kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2203{
2204
2205	kif->kf_type = KF_TYPE_KQUEUE;
2206	return (0);
2207}
2208
2209static void
2210kqueue_wakeup(struct kqueue *kq)
2211{
2212	KQ_OWNED(kq);
2213
2214	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
2215		kq->kq_state &= ~KQ_SLEEP;
2216		wakeup(kq);
2217	}
2218	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2219		selwakeuppri(&kq->kq_sel, PSOCK);
2220		if (!SEL_WAITING(&kq->kq_sel))
2221			kq->kq_state &= ~KQ_SEL;
2222	}
2223	if (!knlist_empty(&kq->kq_sel.si_note))
2224		kqueue_schedtask(kq);
2225	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
2226		pgsigio(&kq->kq_sigio, SIGIO, 0);
2227	}
2228}
2229
2230/*
2231 * Walk down a list of knotes, activating them if their event has triggered.
2232 *
2233 * There is a possibility to optimize in the case of one kq watching another.
2234 * Instead of scheduling a task to wake it up, you could pass enough state
 * down the chain to wake up the parent kqueue.  Make this code functional
2236 * first.
2237 */
2238void
2239knote(struct knlist *list, long hint, int lockflags)
2240{
2241	struct kqueue *kq;
2242	struct knote *kn, *tkn;
2243	int error;
2244
2245	if (list == NULL)
2246		return;
2247
2248	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
2249
2250	if ((lockflags & KNF_LISTLOCKED) == 0)
2251		list->kl_lock(list->kl_lockarg);
2252
2253	/*
2254	 * If we unlock the list lock (and enter influx), we can
2255	 * eliminate the kqueue scheduling, but this will introduce
	 * four lock/unlock cycles for each knote to test.  Also, a marker
	 * would be needed to keep the iteration position, since filters
2258	 * or other threads could remove events.
2259	 */
2260	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
2261		kq = kn->kn_kq;
2262		KQ_LOCK(kq);
2263		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
2264			/*
2265			 * Do not process the influx notes, except for
2266			 * the influx coming from the kq unlock in the
			 * kqueue_scan().  In the latter case, we do
2268			 * not interfere with the scan, since the code
2269			 * fragment in kqueue_scan() locks the knlist,
			 * and cannot proceed until we finish.
2271			 */
2272			KQ_UNLOCK(kq);
2273		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
2274			kn_enter_flux(kn);
2275			KQ_UNLOCK(kq);
2276			error = kn->kn_fop->f_event(kn, hint);
2277			KQ_LOCK(kq);
2278			kn_leave_flux(kn);
2279			if (error)
2280				KNOTE_ACTIVATE(kn, 1);
2281			KQ_UNLOCK_FLUX(kq);
2282		} else {
2283			if (kn->kn_fop->f_event(kn, hint))
2284				KNOTE_ACTIVATE(kn, 1);
2285			KQ_UNLOCK(kq);
2286		}
2287	}
2288	if ((lockflags & KNF_LISTLOCKED) == 0)
2289		list->kl_unlock(list->kl_lockarg);
2290}
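
/*
 * Illustrative producer-side sketch (hypothetical driver softc `sc', not
 * taken from this file): assuming the knlist behind sc_rsel was set up
 * with knlist_init_mtx() over sc_lock, an event source marks its knotes
 * active with the KNOTE_LOCKED() wrapper from <sys/event.h>, which calls
 * knote() with KNF_LISTLOCKED.
 *
 *	mtx_lock(&sc->sc_lock);
 *	sc->sc_avail += len;
 *	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);
 *	mtx_unlock(&sc->sc_lock);
 */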
2291
2292/*
2293 * add a knote to a knlist
2294 */
2295void
2296knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2297{
2298
2299	KNL_ASSERT_LOCK(knl, islocked);
2300	KQ_NOTOWNED(kn->kn_kq);
2301	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
2302	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2303	    ("knote %p was not detached", kn));
2304	if (!islocked)
2305		knl->kl_lock(knl->kl_lockarg);
2306	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2307	if (!islocked)
2308		knl->kl_unlock(knl->kl_lockarg);
2309	KQ_LOCK(kn->kn_kq);
2310	kn->kn_knlist = knl;
2311	kn->kn_status &= ~KN_DETACHED;
2312	KQ_UNLOCK(kn->kn_kq);
2313}
2314
2315static void
2316knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
2317    int kqislocked)
2318{
2319
2320	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
2321	KNL_ASSERT_LOCK(knl, knlislocked);
2322	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2323	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
2324	KASSERT((kn->kn_status & KN_DETACHED) == 0,
2325	    ("knote %p was already detached", kn));
2326	if (!knlislocked)
2327		knl->kl_lock(knl->kl_lockarg);
2328	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2329	kn->kn_knlist = NULL;
2330	if (!knlislocked)
2331		kn_list_unlock(knl);
2332	if (!kqislocked)
2333		KQ_LOCK(kn->kn_kq);
2334	kn->kn_status |= KN_DETACHED;
2335	if (!kqislocked)
2336		KQ_UNLOCK(kn->kn_kq);
2337}
2338
2339/*
2340 * remove knote from the specified knlist
2341 */
2342void
2343knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
2344{
2345
2346	knlist_remove_kq(knl, kn, islocked, 0);
2347}
2348
2349int
2350knlist_empty(struct knlist *knl)
2351{
2352
2353	KNL_ASSERT_LOCKED(knl);
2354	return (SLIST_EMPTY(&knl->kl_list));
2355}
2356
2357static struct mtx knlist_lock;
2358MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2359    MTX_DEF);
2360static void knlist_mtx_lock(void *arg);
2361static void knlist_mtx_unlock(void *arg);
2362
2363static void
2364knlist_mtx_lock(void *arg)
2365{
2366
2367	mtx_lock((struct mtx *)arg);
2368}
2369
2370static void
2371knlist_mtx_unlock(void *arg)
2372{
2373
2374	mtx_unlock((struct mtx *)arg);
2375}
2376
2377static void
2378knlist_mtx_assert_locked(void *arg)
2379{
2380
2381	mtx_assert((struct mtx *)arg, MA_OWNED);
2382}
2383
2384static void
2385knlist_mtx_assert_unlocked(void *arg)
2386{
2387
2388	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2389}
2390
2391static void
2392knlist_rw_rlock(void *arg)
2393{
2394
2395	rw_rlock((struct rwlock *)arg);
2396}
2397
2398static void
2399knlist_rw_runlock(void *arg)
2400{
2401
2402	rw_runlock((struct rwlock *)arg);
2403}
2404
2405static void
2406knlist_rw_assert_locked(void *arg)
2407{
2408
2409	rw_assert((struct rwlock *)arg, RA_LOCKED);
2410}
2411
2412static void
2413knlist_rw_assert_unlocked(void *arg)
2414{
2415
2416	rw_assert((struct rwlock *)arg, RA_UNLOCKED);
2417}
2418
2419void
2420knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2421    void (*kl_unlock)(void *),
2422    void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
2423{
2424
2425	if (lock == NULL)
2426		knl->kl_lockarg = &knlist_lock;
2427	else
2428		knl->kl_lockarg = lock;
2429
2430	if (kl_lock == NULL)
2431		knl->kl_lock = knlist_mtx_lock;
2432	else
2433		knl->kl_lock = kl_lock;
2434	if (kl_unlock == NULL)
2435		knl->kl_unlock = knlist_mtx_unlock;
2436	else
2437		knl->kl_unlock = kl_unlock;
2438	if (kl_assert_locked == NULL)
2439		knl->kl_assert_locked = knlist_mtx_assert_locked;
2440	else
2441		knl->kl_assert_locked = kl_assert_locked;
2442	if (kl_assert_unlocked == NULL)
2443		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
2444	else
2445		knl->kl_assert_unlocked = kl_assert_unlocked;
2446
2447	knl->kl_autodestroy = 0;
2448	SLIST_INIT(&knl->kl_list);
2449}
2450
2451void
2452knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2453{
2454
2455	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
2456}
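
/*
 * Illustrative lifecycle sketch for the common mutex-backed case; `sc' is
 * a hypothetical driver softc carrying a struct selinfo sc_rsel and a
 * struct mtx sc_lock (none of these names come from this file).
 *
 * On attach:
 *	mtx_init(&sc->sc_lock, "sc lock", NULL, MTX_DEF);
 *	knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_lock);
 *
 * On detach, once the event source is quiesced:
 *	knlist_clear(&sc->sc_rsel.si_note, 0);
 *	seldrain(&sc->sc_rsel);
 *	knlist_destroy(&sc->sc_rsel.si_note);
 *
 * knlist_clear() is assumed to be the <sys/event.h> wrapper around
 * knlist_cleardel() below.
 */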
2457
2458struct knlist *
2459knlist_alloc(struct mtx *lock)
2460{
2461	struct knlist *knl;
2462
2463	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
2464	knlist_init_mtx(knl, lock);
2465	return (knl);
2466}
2467
2468void
2469knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
2470{
2471
2472	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
2473	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
2474}
2475
2476void
2477knlist_destroy(struct knlist *knl)
2478{
2479
2480	KASSERT(KNLIST_EMPTY(knl),
2481	    ("destroying knlist %p with knotes on it", knl));
2482}
2483
2484void
2485knlist_detach(struct knlist *knl)
2486{
2487
2488	KNL_ASSERT_LOCKED(knl);
2489	knl->kl_autodestroy = 1;
2490	if (knlist_empty(knl)) {
2491		knlist_destroy(knl);
2492		free(knl, M_KQUEUE);
2493	}
2494}
2495
2496/*
2497 * Even if we are locked, we may need to drop the lock to allow any influx
2498 * knotes time to "settle".
2499 */
2500void
2501knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2502{
2503	struct knote *kn, *kn2;
2504	struct kqueue *kq;
2505
2506	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
2507	if (islocked)
2508		KNL_ASSERT_LOCKED(knl);
2509	else {
2510		KNL_ASSERT_UNLOCKED(knl);
2511again:		/* need to reacquire lock since we have dropped it */
2512		knl->kl_lock(knl->kl_lockarg);
2513	}
2514
2515	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2516		kq = kn->kn_kq;
2517		KQ_LOCK(kq);
2518		if (kn_in_flux(kn)) {
2519			KQ_UNLOCK(kq);
2520			continue;
2521		}
2522		knlist_remove_kq(knl, kn, 1, 1);
2523		if (killkn) {
2524			kn_enter_flux(kn);
2525			KQ_UNLOCK(kq);
2526			knote_drop_detached(kn, td);
2527		} else {
2528			/* Make sure cleared knotes disappear soon */
2529			kn->kn_flags |= EV_EOF | EV_ONESHOT;
2530			KQ_UNLOCK(kq);
2531		}
2532		kq = NULL;
2533	}
2534
2535	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still in-flux knotes remaining */
2537		kn = SLIST_FIRST(&knl->kl_list);
2538		kq = kn->kn_kq;
2539		KQ_LOCK(kq);
2540		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
2541		knl->kl_unlock(knl->kl_lockarg);
2542		kq->kq_state |= KQ_FLUXWAIT;
2543		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2544		kq = NULL;
2545		goto again;
2546	}
2547
2548	if (islocked)
2549		KNL_ASSERT_LOCKED(knl);
2550	else {
2551		knl->kl_unlock(knl->kl_lockarg);
2552		KNL_ASSERT_UNLOCKED(knl);
2553	}
2554}
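
/*
 * Callers normally reach knlist_cleardel() through wrappers; the
 * knlist_clear() and knlist_delete() macros in <sys/event.h> are assumed
 * to expand roughly as follows (a sketch, not a copy of the header):
 *
 *	knlist_clear(knl, islocked)      -> knlist_cleardel(knl, NULL, islocked, 0)
 *	knlist_delete(knl, td, islocked) -> knlist_cleardel(knl, td, islocked, 1)
 *
 * That is, knlist_delete() additionally drops the knotes (killkn != 0),
 * while knlist_clear() only detaches them and marks them EV_EOF |
 * EV_ONESHOT so they disappear soon.
 */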
2555
2556/*
 * Remove all knotes referencing a specified fd.  This must be called with
 * the FILEDESC lock held, which prevents a race where a new fd comes along
 * and occupies the entry while we attach a knote to the fd.
2560 */
2561void
2562knote_fdclose(struct thread *td, int fd)
2563{
2564	struct filedesc *fdp = td->td_proc->p_fd;
2565	struct kqueue *kq;
2566	struct knote *kn;
2567	int influx;
2568
2569	FILEDESC_XLOCK_ASSERT(fdp);
2570
2571	/*
	 * We shouldn't have to worry about new kevents appearing on the fd
	 * since the filedesc is locked.
2574	 */
2575	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2576		KQ_LOCK(kq);
2577
2578again:
2579		influx = 0;
2580		while (kq->kq_knlistsize > fd &&
2581		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2582			if (kn_in_flux(kn)) {
2583				/* someone else might be waiting on our knote */
2584				if (influx)
2585					wakeup(kq);
2586				kq->kq_state |= KQ_FLUXWAIT;
2587				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2588				goto again;
2589			}
2590			kn_enter_flux(kn);
2591			KQ_UNLOCK(kq);
2592			influx = 1;
2593			knote_drop(kn, td);
2594			KQ_LOCK(kq);
2595		}
2596		KQ_UNLOCK_FLUX(kq);
2597	}
2598}
2599
2600static int
2601knote_attach(struct knote *kn, struct kqueue *kq)
2602{
2603	struct klist *list;
2604
2605	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
2606	KQ_OWNED(kq);
2607
2608	if ((kq->kq_state & KQ_CLOSING) != 0)
2609		return (EBADF);
2610	if (kn->kn_fop->f_isfd) {
2611		if (kn->kn_id >= kq->kq_knlistsize)
2612			return (ENOMEM);
2613		list = &kq->kq_knlist[kn->kn_id];
2614	} else {
2615		if (kq->kq_knhash == NULL)
2616			return (ENOMEM);
2617		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2618	}
2619	SLIST_INSERT_HEAD(list, kn, kn_link);
2620	return (0);
2621}
2622
2623static void
2624knote_drop(struct knote *kn, struct thread *td)
2625{
2626
2627	if ((kn->kn_status & KN_DETACHED) == 0)
2628		kn->kn_fop->f_detach(kn);
2629	knote_drop_detached(kn, td);
2630}
2631
2632static void
2633knote_drop_detached(struct knote *kn, struct thread *td)
2634{
2635	struct kqueue *kq;
2636	struct klist *list;
2637
2638	kq = kn->kn_kq;
2639
2640	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2641	    ("knote %p still attached", kn));
2642	KQ_NOTOWNED(kq);
2643
2644	KQ_LOCK(kq);
2645	KASSERT(kn->kn_influx == 1,
2646	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));
2647
2648	if (kn->kn_fop->f_isfd)
2649		list = &kq->kq_knlist[kn->kn_id];
2650	else
2651		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2652
2653	if (!SLIST_EMPTY(list))
2654		SLIST_REMOVE(list, kn, knote, kn_link);
2655	if (kn->kn_status & KN_QUEUED)
2656		knote_dequeue(kn);
2657	KQ_UNLOCK_FLUX(kq);
2658
2659	if (kn->kn_fop->f_isfd) {
2660		fdrop(kn->kn_fp, td);
2661		kn->kn_fp = NULL;
2662	}
2663	kqueue_fo_release(kn->kn_kevent.filter);
2664	kn->kn_fop = NULL;
2665	knote_free(kn);
2666}
2667
2668static void
2669knote_enqueue(struct knote *kn)
2670{
2671	struct kqueue *kq = kn->kn_kq;
2672
2673	KQ_OWNED(kn->kn_kq);
2674	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2675
2676	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2677	kn->kn_status |= KN_QUEUED;
2678	kq->kq_count++;
2679	kqueue_wakeup(kq);
2680}
2681
2682static void
2683knote_dequeue(struct knote *kn)
2684{
2685	struct kqueue *kq = kn->kn_kq;
2686
2687	KQ_OWNED(kn->kn_kq);
2688	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2689
2690	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2691	kn->kn_status &= ~KN_QUEUED;
2692	kq->kq_count--;
2693}
2694
2695static void
2696knote_init(void)
2697{
2698
2699	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2700	    NULL, NULL, UMA_ALIGN_PTR, 0);
2701}
2702SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2703
2704static struct knote *
2705knote_alloc(int mflag)
2706{
2707
2708	return (uma_zalloc(knote_zone, mflag | M_ZERO));
2709}
2710
2711static void
2712knote_free(struct knote *kn)
2713{
2714
2715	uma_zfree(knote_zone, kn);
2716}
2717
2718/*
2719 * Register the kev w/ the kq specified by fd.
2720 */
2721int
2722kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
2723{
2724	struct kqueue *kq;
2725	struct file *fp;
2726	cap_rights_t rights;
2727	int error;
2728
2729	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
2730	if (error != 0)
2731		return (error);
2732	if ((error = kqueue_acquire(fp, &kq)) != 0)
2733		goto noacquire;
2734
2735	error = kqueue_register(kq, kev, td, mflag);
2736	kqueue_release(kq, 0);
2737
2738noacquire:
2739	fdrop(fp, td);
2740	return (error);
2741}
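
/*
 * Illustrative in-kernel caller sketch; `ufd' (a user-supplied kqueue
 * descriptor) and `ident' are placeholders, and EVFILT_USER is chosen only
 * as an example filter.
 *
 *	struct kevent kev;
 *	int error;
 *
 *	EV_SET(&kev, ident, EVFILT_USER, EV_ADD, 0, 0, NULL);
 *	error = kqfd_register(ufd, &kev, curthread, M_WAITOK);
 */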
2742