1/*	$NetBSD: kern_event.c,v 1.75 2012/01/25 00:28:35 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
58 */
59
60#include <sys/cdefs.h>
61__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.75 2012/01/25 00:28:35 christos Exp $");
62
63#include <sys/param.h>
64#include <sys/systm.h>
65#include <sys/kernel.h>
66#include <sys/proc.h>
67#include <sys/file.h>
68#include <sys/select.h>
69#include <sys/queue.h>
70#include <sys/event.h>
71#include <sys/eventvar.h>
72#include <sys/poll.h>
73#include <sys/kmem.h>
74#include <sys/stat.h>
75#include <sys/filedesc.h>
76#include <sys/syscallargs.h>
77#include <sys/kauth.h>
78#include <sys/conf.h>
79#include <sys/atomic.h>
80
81static int	kqueue_scan(file_t *, size_t, struct kevent *,
82			    const struct timespec *, register_t *,
83			    const struct kevent_ops *, struct kevent *,
84			    size_t);
85static int	kqueue_ioctl(file_t *, u_long, void *);
86static int	kqueue_fcntl(file_t *, u_int, void *);
87static int	kqueue_poll(file_t *, int);
88static int	kqueue_kqfilter(file_t *, struct knote *);
89static int	kqueue_stat(file_t *, struct stat *);
90static int	kqueue_close(file_t *);
91static int	kqueue_register(struct kqueue *, struct kevent *);
92static void	kqueue_doclose(struct kqueue *, struct klist *, int);
93
94static void	knote_detach(struct knote *, filedesc_t *fdp, bool);
95static void	knote_enqueue(struct knote *);
96static void	knote_activate(struct knote *);
97
98static void	filt_kqdetach(struct knote *);
99static int	filt_kqueue(struct knote *, long hint);
100static int	filt_procattach(struct knote *);
101static void	filt_procdetach(struct knote *);
102static int	filt_proc(struct knote *, long hint);
103static int	filt_fileattach(struct knote *);
104static void	filt_timerexpire(void *x);
105static int	filt_timerattach(struct knote *);
106static void	filt_timerdetach(struct knote *);
107static int	filt_timer(struct knote *, long hint);
108
109static const struct fileops kqueueops = {
110	.fo_read = (void *)enxio,
111	.fo_write = (void *)enxio,
112	.fo_ioctl = kqueue_ioctl,
113	.fo_fcntl = kqueue_fcntl,
114	.fo_poll = kqueue_poll,
115	.fo_stat = kqueue_stat,
116	.fo_close = kqueue_close,
117	.fo_kqfilter = kqueue_kqfilter,
118	.fo_restart = fnullop_restart,
119};
120
121static const struct filterops kqread_filtops =
122	{ 1, NULL, filt_kqdetach, filt_kqueue };
123static const struct filterops proc_filtops =
124	{ 0, filt_procattach, filt_procdetach, filt_proc };
125static const struct filterops file_filtops =
126	{ 1, filt_fileattach, NULL, NULL };
127static const struct filterops timer_filtops =
128	{ 0, filt_timerattach, filt_timerdetach, filt_timer };
129
130static u_int	kq_ncallouts = 0;
131static int	kq_calloutmax = (4 * 1024);
132
133#define	KN_HASHSIZE		64		/* XXX should be tunable */
134#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
135
136extern const struct filterops sig_filtops;
137
138/*
139 * Table for all system-defined filters.
140 * These should be listed in the numeric order of the EVFILT_* defines.
141 * If filtops is NULL, the filter isn't implemented in NetBSD.
142 * End of list is when name is NULL.
143 *
144 * Note that 'refcnt' is meaningless for built-in filters.
145 */
146struct kfilter {
147	const char	*name;		/* name of filter */
148	uint32_t	filter;		/* id of filter */
149	unsigned	refcnt;		/* reference count */
150	const struct filterops *filtops;/* operations for filter */
151	size_t		namelen;	/* length of name string */
152};
153
154/* System defined filters */
155static struct kfilter sys_kfilters[] = {
156	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
157	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0 },
158	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },
159	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
160	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
161	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
162	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
163	{ NULL,			0,		0, NULL, 0 },
164};
165
166/* User defined kfilters */
167static struct kfilter	*user_kfilters;		/* array */
168static int		user_kfilterc;		/* current offset */
169static int		user_kfiltermaxc;	/* max size so far */
170static size_t		user_kfiltersz;		/* size of allocated memory */
171
172/* Locks */
173static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */
174static kmutex_t		kqueue_misc_lock;	/* miscellaneous */
175
176static kauth_listener_t	kqueue_listener;
177
178static int
179kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
180    void *arg0, void *arg1, void *arg2, void *arg3)
181{
182	struct proc *p;
183	int result;
184
185	result = KAUTH_RESULT_DEFER;
186	p = arg0;
187
188	if (action != KAUTH_PROCESS_KEVENT_FILTER)
189		return result;
190
191	if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
192	    ISSET(p->p_flag, PK_SUGID)))
193		return result;
194
195	result = KAUTH_RESULT_ALLOW;
196
197	return result;
198}
199
200/*
201 * Initialize the kqueue subsystem.
202 */
203void
204kqueue_init(void)
205{
206
207	rw_init(&kqueue_filter_lock);
208	mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
209
210	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
211	    kqueue_listener_cb, NULL);
212}
213
214/*
215 * Find kfilter entry by name, or NULL if not found.
216 */
217static struct kfilter *
218kfilter_byname_sys(const char *name)
219{
220	int i;
221
222	KASSERT(rw_lock_held(&kqueue_filter_lock));
223
224	for (i = 0; sys_kfilters[i].name != NULL; i++) {
225		if (strcmp(name, sys_kfilters[i].name) == 0)
226			return &sys_kfilters[i];
227	}
228	return NULL;
229}
230
231static struct kfilter *
232kfilter_byname_user(const char *name)
233{
234	int i;
235
236	KASSERT(rw_lock_held(&kqueue_filter_lock));
237
238	/* user filter slots have a NULL name if previously deregistered */
239	for (i = 0; i < user_kfilterc ; i++) {
240		if (user_kfilters[i].name != NULL &&
241		    strcmp(name, user_kfilters[i].name) == 0)
242			return &user_kfilters[i];
243	}
244	return NULL;
245}
246
247static struct kfilter *
248kfilter_byname(const char *name)
249{
250	struct kfilter *kfilter;
251
252	KASSERT(rw_lock_held(&kqueue_filter_lock));
253
254	if ((kfilter = kfilter_byname_sys(name)) != NULL)
255		return kfilter;
256
257	return kfilter_byname_user(name);
258}
259
260/*
261 * Find kfilter entry by filter id, or NULL if not found.
262 * Assumes entries are indexed in filter id order, for speed.
263 */
264static struct kfilter *
265kfilter_byfilter(uint32_t filter)
266{
267	struct kfilter *kfilter;
268
269	KASSERT(rw_lock_held(&kqueue_filter_lock));
270
271	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
272		kfilter = &sys_kfilters[filter];
273	else if (user_kfilters != NULL &&
274	    filter < EVFILT_SYSCOUNT + user_kfilterc)
275					/* it's a user filter */
276		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
277	else
278		return (NULL);		/* out of range */
279	KASSERT(kfilter->filter == filter);	/* sanity check! */
280	return (kfilter);
281}
282
283/*
284 * Register a new kfilter. Stores the entry in user_kfilters.
285 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
286 * If retfilter != NULL, the new filterid is returned in it.
287 */
288int
289kfilter_register(const char *name, const struct filterops *filtops,
290		 int *retfilter)
291{
292	struct kfilter *kfilter;
293	size_t len;
294	int i;
295
296	if (name == NULL || name[0] == '\0' || filtops == NULL)
297		return (EINVAL);	/* invalid args */
298
299	rw_enter(&kqueue_filter_lock, RW_WRITER);
300	if (kfilter_byname(name) != NULL) {
301		rw_exit(&kqueue_filter_lock);
302		return (EEXIST);	/* already exists */
303	}
304	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
305		rw_exit(&kqueue_filter_lock);
306		return (EINVAL);	/* too many */
307	}
308
309	for (i = 0; i < user_kfilterc; i++) {
310		kfilter = &user_kfilters[i];
311		if (kfilter->name == NULL) {
312			/* Previously deregistered slot.  Reuse. */
313			goto reuse;
314		}
315	}
316
317	/* check if we need to grow user_kfilters */
318	if (user_kfilterc + 1 > user_kfiltermaxc) {
319		/* Grow in KFILTER_EXTENT chunks. */
320		user_kfiltermaxc += KFILTER_EXTENT;
321		len = user_kfiltermaxc * sizeof(*kfilter);
322		kfilter = kmem_alloc(len, KM_SLEEP);
323		memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
324		if (user_kfilters != NULL) {
325			memcpy(kfilter, user_kfilters, user_kfiltersz);
326			kmem_free(user_kfilters, user_kfiltersz);
327		}
328		user_kfiltersz = len;
329		user_kfilters = kfilter;
330	}
331	/* Adding new slot */
332	kfilter = &user_kfilters[user_kfilterc++];
333reuse:
334	kfilter->namelen = strlen(name) + 1;
335	kfilter->name = kmem_alloc(kfilter->namelen, KM_SLEEP);
336	memcpy(__UNCONST(kfilter->name), name, kfilter->namelen);
337
338	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
339
340	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
341	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
342
343	if (retfilter != NULL)
344		*retfilter = kfilter->filter;
345	rw_exit(&kqueue_filter_lock);
346
347	return (0);
348}
349
350/*
351 * Unregister a kfilter previously registered with kfilter_register.
352 * This retains the filter id, but clears the name and frees filtops (filter
353 * operations), so that the number isn't reused during a boot.
354 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
355 */
356int
357kfilter_unregister(const char *name)
358{
359	struct kfilter *kfilter;
360
361	if (name == NULL || name[0] == '\0')
362		return (EINVAL);	/* invalid name */
363
364	rw_enter(&kqueue_filter_lock, RW_WRITER);
365	if (kfilter_byname_sys(name) != NULL) {
366		rw_exit(&kqueue_filter_lock);
367		return (EINVAL);	/* can't detach system filters */
368	}
369
370	kfilter = kfilter_byname_user(name);
371	if (kfilter == NULL) {
372		rw_exit(&kqueue_filter_lock);
373		return (ENOENT);
374	}
375	if (kfilter->refcnt != 0) {
376		rw_exit(&kqueue_filter_lock);
377		return (EBUSY);
378	}
379
380	/* Cast away const (but we know it's safe). */
381	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
382	kfilter->name = NULL;	/* mark as `not implemented' */
383
384	if (kfilter->filtops != NULL) {
385		/* Cast away const (but we know it's safe). */
386		kmem_free(__UNCONST(kfilter->filtops),
387		    sizeof(*kfilter->filtops));
388		kfilter->filtops = NULL; /* mark as `not implemented' */
389	}
390	rw_exit(&kqueue_filter_lock);
391
392	return (0);
393}
394
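/*
 * Illustrative sketch (not part of this file's interfaces): a kernel
 * module might register a custom filter with kfilter_register() at
 * attach time and remove it again with kfilter_unregister() at detach
 * time.  The filter name, filterops and variables below are
 * hypothetical.
 *
 *	static const struct filterops example_filtops =
 *		{ 1, example_filt_attach, example_filt_detach,
 *		  example_filt_event };
 *	static int example_filter;
 *
 *	error = kfilter_register("EVFILT_EXAMPLE", &example_filtops,
 *	    &example_filter);
 *	if (error != 0)
 *		return error;
 *	...
 *	error = kfilter_unregister("EVFILT_EXAMPLE");
 */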
395
396/*
397 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
398 * descriptors. Calls fileops kqfilter method for given file descriptor.
399 */
400static int
401filt_fileattach(struct knote *kn)
402{
403	file_t *fp;
404
405	fp = kn->kn_obj;
406
407	return (*fp->f_ops->fo_kqfilter)(fp, kn);
408}
409
410/*
411 * Filter detach method for EVFILT_READ on kqueue descriptor.
412 */
413static void
414filt_kqdetach(struct knote *kn)
415{
416	struct kqueue *kq;
417
418	kq = ((file_t *)kn->kn_obj)->f_data;
419
420	mutex_spin_enter(&kq->kq_lock);
421	SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
422	mutex_spin_exit(&kq->kq_lock);
423}
424
425/*
426 * Filter event method for EVFILT_READ on kqueue descriptor.
427 */
428/*ARGSUSED*/
429static int
430filt_kqueue(struct knote *kn, long hint)
431{
432	struct kqueue *kq;
433	int rv;
434
435	kq = ((file_t *)kn->kn_obj)->f_data;
436
437	if (hint != NOTE_SUBMIT)
438		mutex_spin_enter(&kq->kq_lock);
439	kn->kn_data = kq->kq_count;
440	rv = (kn->kn_data > 0);
441	if (hint != NOTE_SUBMIT)
442		mutex_spin_exit(&kq->kq_lock);
443
444	return rv;
445}
446
447/*
448 * Filter attach method for EVFILT_PROC.
449 */
450static int
451filt_procattach(struct knote *kn)
452{
453	struct proc *p, *curp;
454	struct lwp *curl;
455
456	curl = curlwp;
457	curp = curl->l_proc;
458
459	mutex_enter(proc_lock);
460	p = proc_find(kn->kn_id);
461	if (p == NULL) {
462		mutex_exit(proc_lock);
463		return ESRCH;
464	}
465
466	/*
467	 * Fail if it's not owned by you, or the last exec gave us
468	 * setuid/setgid privs (unless you're root).
469	 */
470	mutex_enter(p->p_lock);
471	mutex_exit(proc_lock);
472	if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
473	    p, NULL, NULL, NULL) != 0) {
474	    	mutex_exit(p->p_lock);
475		return EACCES;
476	}
477
478	kn->kn_obj = p;
479	kn->kn_flags |= EV_CLEAR;	/* automatically set */
480
481	/*
482	 * internal flag indicating registration done by kernel
483	 */
484	if (kn->kn_flags & EV_FLAG1) {
485		kn->kn_data = kn->kn_sdata;	/* ppid */
486		kn->kn_fflags = NOTE_CHILD;
487		kn->kn_flags &= ~EV_FLAG1;
488	}
489	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
490    	mutex_exit(p->p_lock);
491
492	return 0;
493}
494
495/*
496 * Filter detach method for EVFILT_PROC.
497 *
498 * The knote may be attached to a different process, which may exit,
499 * leaving nothing for the knote to be attached to.  So when the process
500 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
501 * it will be deleted when read out.  However, as part of the knote deletion,
502 * this routine is called, so a check is needed to avoid actually performing
503 * a detach, because the original process might not exist any more.
504 */
505static void
506filt_procdetach(struct knote *kn)
507{
508	struct proc *p;
509
510	if (kn->kn_status & KN_DETACHED)
511		return;
512
513	p = kn->kn_obj;
514
515	mutex_enter(p->p_lock);
516	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
517	mutex_exit(p->p_lock);
518}
519
520/*
521 * Filter event method for EVFILT_PROC.
522 */
523static int
524filt_proc(struct knote *kn, long hint)
525{
526	u_int event, fflag;
527	struct kevent kev;
528	struct kqueue *kq;
529	int error;
530
531	event = (u_int)hint & NOTE_PCTRLMASK;
532	kq = kn->kn_kq;
533	fflag = 0;
534
535	/* If the user is interested in this event, record it. */
536	if (kn->kn_sfflags & event)
537		fflag |= event;
538
539	if (event == NOTE_EXIT) {
540		/*
541		 * Process is gone, so flag the event as finished.
542		 *
543		 * Detach the knote from watched process and mark
544		 * it as such. We can't leave this to kqueue_scan(),
545		 * since the process might not exist by then. And we
546		 * have to do this now, since psignal KNOTE() is called
547		 * also for zombies and we might end up reading freed
548		 * memory if the kevent would already be picked up
549		 * and knote g/c'ed.
550		 */
551		filt_procdetach(kn);
552
553		mutex_spin_enter(&kq->kq_lock);
554		kn->kn_status |= KN_DETACHED;
555		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
556		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
557		kn->kn_fflags |= fflag;
558		mutex_spin_exit(&kq->kq_lock);
559
560		return 1;
561	}
562
563	mutex_spin_enter(&kq->kq_lock);
564	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
565		/*
566		 * Process forked, and user wants to track the new process,
567		 * so attach a new knote to it, and immediately report an
568		 * event with the parent's pid.  Register knote with new
569		 * process.
570		 */
571		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
572		kev.filter = kn->kn_filter;
573		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
574		kev.fflags = kn->kn_sfflags;
575		kev.data = kn->kn_id;			/* parent */
576		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
577		mutex_spin_exit(&kq->kq_lock);
578		error = kqueue_register(kq, &kev);
579		mutex_spin_enter(&kq->kq_lock);
580		if (error != 0)
581			kn->kn_fflags |= NOTE_TRACKERR;
582	}
583	kn->kn_fflags |= fflag;
584	fflag = kn->kn_fflags;
585	mutex_spin_exit(&kq->kq_lock);
586
587	return fflag != 0;
588}
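
/*
 * Userland usage sketch for the EVFILT_PROC filter implemented above
 * (illustrative only; kq and pid are assumed to be a valid kqueue
 * descriptor and process id):
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, 0);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *	...
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) > 0 &&
 *	    (ev.fflags & NOTE_EXIT) != 0)
 *		printf("pid %d exited\n", (int)ev.ident);
 */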
589
590static void
591filt_timerexpire(void *knx)
592{
593	struct knote *kn = knx;
594	int tticks;
595
596	mutex_enter(&kqueue_misc_lock);
597	kn->kn_data++;
598	knote_activate(kn);
599	if ((kn->kn_flags & EV_ONESHOT) == 0) {
600		tticks = mstohz(kn->kn_sdata);
601		if (tticks <= 0)
602			tticks = 1;
603		callout_schedule((callout_t *)kn->kn_hook, tticks);
604	}
605	mutex_exit(&kqueue_misc_lock);
606}
607
608/*
609 * data contains the amount of time to sleep, in milliseconds
610 */
611static int
612filt_timerattach(struct knote *kn)
613{
614	callout_t *calloutp;
615	struct kqueue *kq;
616	int tticks;
617
618	tticks = mstohz(kn->kn_sdata);
619
620	/* if the supplied value is under our resolution, use 1 tick */
621	if (tticks == 0) {
622		if (kn->kn_sdata == 0)
623			return EINVAL;
624		tticks = 1;
625	}
626
627	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
628	    (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
629		atomic_dec_uint(&kq_ncallouts);
630		return ENOMEM;
631	}
632	callout_init(calloutp, CALLOUT_MPSAFE);
633
634	kq = kn->kn_kq;
635	mutex_spin_enter(&kq->kq_lock);
636	kn->kn_flags |= EV_CLEAR;		/* automatically set */
637	kn->kn_hook = calloutp;
638	mutex_spin_exit(&kq->kq_lock);
639
640	callout_reset(calloutp, tticks, filt_timerexpire, kn);
641
642	return (0);
643}
644
645static void
646filt_timerdetach(struct knote *kn)
647{
648	callout_t *calloutp;
649
650	calloutp = (callout_t *)kn->kn_hook;
651	callout_halt(calloutp, NULL);
652	callout_destroy(calloutp);
653	kmem_free(calloutp, sizeof(*calloutp));
654	atomic_dec_uint(&kq_ncallouts);
655}
656
657static int
658filt_timer(struct knote *kn, long hint)
659{
660	int rv;
661
662	mutex_enter(&kqueue_misc_lock);
663	rv = (kn->kn_data != 0);
664	mutex_exit(&kqueue_misc_lock);
665
666	return rv;
667}
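
/*
 * Userland usage sketch for EVFILT_TIMER (illustrative only): the data
 * field carries the period in milliseconds, as consumed by
 * filt_timerattach() above.  kq is assumed to be a kqueue descriptor.
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, 0);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 * Each retrieved event reports the number of expirations since the
 * last retrieval in ev.data.
 */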
668
669/*
670 * filt_seltrue:
671 *
672 *	This filter "event" routine simulates seltrue().
673 */
674int
675filt_seltrue(struct knote *kn, long hint)
676{
677
678	/*
679	 * We don't know how much data can be read/written,
680	 * but we know that it *can* be.  This is about as
681	 * good as select/poll does as well.
682	 */
683	kn->kn_data = 0;
684	return (1);
685}
686
687/*
688 * This provides full kqfilter entry for device switch tables, which
689 * has same effect as filter using filt_seltrue() as filter method.
690 */
691static void
692filt_seltruedetach(struct knote *kn)
693{
694	/* Nothing to do */
695}
696
697const struct filterops seltrue_filtops =
698	{ 1, NULL, filt_seltruedetach, filt_seltrue };
699
700int
701seltrue_kqfilter(dev_t dev, struct knote *kn)
702{
703	switch (kn->kn_filter) {
704	case EVFILT_READ:
705	case EVFILT_WRITE:
706		kn->kn_fop = &seltrue_filtops;
707		break;
708	default:
709		return (EINVAL);
710	}
711
712	/* Nothing more to do */
713	return (0);
714}
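
/*
 * Illustrative sketch: a device that is always ready for I/O can use
 * seltrue_kqfilter() directly as its d_kqfilter entry point.  The
 * device name is hypothetical and the other cdevsw members are elided.
 *
 *	const struct cdevsw example_cdevsw = {
 *		...
 *		.d_kqfilter = seltrue_kqfilter,
 *		...
 *	};
 */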
715
716/*
717 * Common code for the kqueue(2) and kqueue1(2) system calls.
718 */
719static int
720kqueue1(struct lwp *l, int flags, register_t *retval)
721{
722	struct kqueue *kq;
723	file_t *fp;
724	int fd, error;
725
726	if ((error = fd_allocfile(&fp, &fd)) != 0)
727		return error;
728	fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
729	fp->f_type = DTYPE_KQUEUE;
730	fp->f_ops = &kqueueops;
731	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
732	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
733	cv_init(&kq->kq_cv, "kqueue");
734	selinit(&kq->kq_sel);
735	TAILQ_INIT(&kq->kq_head);
736	fp->f_data = kq;
737	*retval = fd;
738	kq->kq_fdp = curlwp->l_fd;
739	fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
740	fd_affix(curproc, fp, fd);
741	return error;
742}
743
744/*
745 * kqueue(2) system call.
746 */
747int
748sys_kqueue(struct lwp *l, const void *v, register_t *retval)
749{
750	return kqueue1(l, 0, retval);
751}
752
753int
754sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
755    register_t *retval)
756{
757	/* {
758		syscallarg(int) flags;
759	} */
760	return kqueue1(l, SCARG(uap, flags), retval);
761}
762
763/*
764 * kevent(2) system call.
765 */
766int
767kevent_fetch_changes(void *private, const struct kevent *changelist,
768    struct kevent *changes, size_t index, int n)
769{
770
771	return copyin(changelist + index, changes, n * sizeof(*changes));
772}
773
774int
775kevent_put_events(void *private, struct kevent *events,
776    struct kevent *eventlist, size_t index, int n)
777{
778
779	return copyout(events, eventlist + index, n * sizeof(*events));
780}
781
782static const struct kevent_ops kevent_native_ops = {
783	.keo_private = NULL,
784	.keo_fetch_timeout = copyin,
785	.keo_fetch_changes = kevent_fetch_changes,
786	.keo_put_events = kevent_put_events,
787};
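
/*
 * Illustrative sketch: a compatibility layer can supply its own
 * struct kevent_ops so that kevent1() fetches changes from, and copies
 * results to, a differently laid out user structure.  The compat_*
 * names below are hypothetical placeholders.
 *
 *	static const struct kevent_ops compat_kevent_ops = {
 *		.keo_private = NULL,
 *		.keo_fetch_timeout = compat_fetch_timeout,
 *		.keo_fetch_changes = compat_fetch_changes,
 *		.keo_put_events = compat_put_events,
 *	};
 *
 *	return kevent1(retval, fd, changelist, nchanges, eventlist,
 *	    nevents, timeout, &compat_kevent_ops);
 */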
788
789int
790sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
791    register_t *retval)
792{
793	/* {
794		syscallarg(int) fd;
795		syscallarg(const struct kevent *) changelist;
796		syscallarg(size_t) nchanges;
797		syscallarg(struct kevent *) eventlist;
798		syscallarg(size_t) nevents;
799		syscallarg(const struct timespec *) timeout;
800	} */
801
802	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
803	    SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
804	    SCARG(uap, timeout), &kevent_native_ops);
805}
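
/*
 * Userland usage sketch tying the kqueue(2) and kevent(2) system calls
 * implemented here together (illustrative only; error handling is
 * abbreviated and fd is an already open descriptor):
 *
 *	struct kevent ev;
 *	int kq;
 *
 *	if ((kq = kqueue()) == -1)
 *		err(1, "kqueue");
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, 0);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent: register");
 *	for (;;) {
 *		if (kevent(kq, NULL, 0, &ev, 1, NULL) > 0)
 *			handle_input(fd, (size_t)ev.data);
 *	}
 *
 * where ev.data reports the number of bytes available to read and
 * handle_input() is a hypothetical consumer.
 */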
806
807int
808kevent1(register_t *retval, int fd,
809	const struct kevent *changelist, size_t nchanges,
810	struct kevent *eventlist, size_t nevents,
811	const struct timespec *timeout,
812	const struct kevent_ops *keops)
813{
814	struct kevent *kevp;
815	struct kqueue *kq;
816	struct timespec	ts;
817	size_t i, n, ichange;
818	int nerrors, error;
819	struct kevent kevbuf[8];	/* approx 300 bytes on 64-bit */
820	file_t *fp;
821
822	/* check that we're dealing with a kq */
823	fp = fd_getfile(fd);
824	if (fp == NULL)
825		return (EBADF);
826
827	if (fp->f_type != DTYPE_KQUEUE) {
828		fd_putfile(fd);
829		return (EBADF);
830	}
831
832	if (timeout != NULL) {
833		error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
834		if (error)
835			goto done;
836		timeout = &ts;
837	}
838
839	kq = (struct kqueue *)fp->f_data;
840	nerrors = 0;
841	ichange = 0;
842
843	/* traverse list of events to register */
844	while (nchanges > 0) {
845		n = MIN(nchanges, __arraycount(kevbuf));
846		error = (*keops->keo_fetch_changes)(keops->keo_private,
847		    changelist, kevbuf, ichange, n);
848		if (error)
849			goto done;
850		for (i = 0; i < n; i++) {
851			kevp = &kevbuf[i];
852			kevp->flags &= ~EV_SYSFLAGS;
853			/* register each knote */
854			error = kqueue_register(kq, kevp);
855			if (error) {
856				if (nevents != 0) {
857					kevp->flags = EV_ERROR;
858					kevp->data = error;
859					error = (*keops->keo_put_events)
860					    (keops->keo_private, kevp,
861					    eventlist, nerrors, 1);
862					if (error)
863						goto done;
864					nevents--;
865					nerrors++;
866				} else {
867					goto done;
868				}
869			}
870		}
871		nchanges -= n;	/* update the results */
872		ichange += n;
873	}
874	if (nerrors) {
875		*retval = nerrors;
876		error = 0;
877		goto done;
878	}
879
880	/* actually scan through the events */
881	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
882	    kevbuf, __arraycount(kevbuf));
883 done:
884	fd_putfile(fd);
885	return (error);
886}
887
888/*
889 * Register a given kevent kev onto the kqueue
890 */
891static int
892kqueue_register(struct kqueue *kq, struct kevent *kev)
893{
894	struct kfilter *kfilter;
895	filedesc_t *fdp;
896	file_t *fp;
897	fdfile_t *ff;
898	struct knote *kn, *newkn;
899	struct klist *list;
900	int error, fd, rv;
901
902	fdp = kq->kq_fdp;
903	fp = NULL;
904	kn = NULL;
905	error = 0;
906	fd = 0;
907
908	newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
909
910	rw_enter(&kqueue_filter_lock, RW_READER);
911	kfilter = kfilter_byfilter(kev->filter);
912	if (kfilter == NULL || kfilter->filtops == NULL) {
913		/* filter not found nor implemented */
914		rw_exit(&kqueue_filter_lock);
915		kmem_free(newkn, sizeof(*newkn));
916		return (EINVAL);
917	}
918
919	/* search if knote already exists */
920	if (kfilter->filtops->f_isfd) {
921		/* monitoring a file descriptor */
922		fd = kev->ident;
923		if ((fp = fd_getfile(fd)) == NULL) {
924			rw_exit(&kqueue_filter_lock);
925			kmem_free(newkn, sizeof(*newkn));
926			return EBADF;
927		}
928		mutex_enter(&fdp->fd_lock);
929		ff = fdp->fd_dt->dt_ff[fd];
930		if (fd <= fdp->fd_lastkqfile) {
931			SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
932				if (kq == kn->kn_kq &&
933				    kev->filter == kn->kn_filter)
934					break;
935			}
936		}
937	} else {
938		/*
939		 * not monitoring a file descriptor, so
940		 * lookup knotes in internal hash table
941		 */
942		mutex_enter(&fdp->fd_lock);
943		if (fdp->fd_knhashmask != 0) {
944			list = &fdp->fd_knhash[
945			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
946			SLIST_FOREACH(kn, list, kn_link) {
947				if (kev->ident == kn->kn_id &&
948				    kq == kn->kn_kq &&
949				    kev->filter == kn->kn_filter)
950					break;
951			}
952		}
953	}
954
955	/*
956	 * kn now contains the matching knote, or NULL if no match
957	 */
958	if (kev->flags & EV_ADD) {
959		if (kn == NULL) {
960			/* create new knote */
961			kn = newkn;
962			newkn = NULL;
963			kn->kn_obj = fp;
964			kn->kn_id = kev->ident;
965			kn->kn_kq = kq;
966			kn->kn_fop = kfilter->filtops;
967			kn->kn_kfilter = kfilter;
968			kn->kn_sfflags = kev->fflags;
969			kn->kn_sdata = kev->data;
970			kev->fflags = 0;
971			kev->data = 0;
972			kn->kn_kevent = *kev;
973
974			/*
975			 * apply reference count to knote structure, and
976			 * do not release it at the end of this routine.
977			 */
978			fp = NULL;
979
980			if (!kn->kn_fop->f_isfd) {
981				/*
982				 * If knote is not on an fd, store on
983				 * internal hash table.
984				 */
985				if (fdp->fd_knhashmask == 0) {
986					/* XXXAD can block with fd_lock held */
987					fdp->fd_knhash = hashinit(KN_HASHSIZE,
988					    HASH_LIST, true,
989					    &fdp->fd_knhashmask);
990				}
991				list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
992				    fdp->fd_knhashmask)];
993			} else {
994				/* Otherwise, knote is on an fd. */
995				list = (struct klist *)
996				    &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
997				if ((int)kn->kn_id > fdp->fd_lastkqfile)
998					fdp->fd_lastkqfile = kn->kn_id;
999			}
1000			SLIST_INSERT_HEAD(list, kn, kn_link);
1001
1002			KERNEL_LOCK(1, NULL);		/* XXXSMP */
1003			error = (*kfilter->filtops->f_attach)(kn);
1004			KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
1005			if (error != 0) {
1006#ifdef DIAGNOSTIC
1007				printf("%s: event not supported for file type"
1008				    " %d\n", __func__, fp ? fp->f_type : -1);
1009#endif
1010				/* knote_detach() drops fdp->fd_lock */
1011				knote_detach(kn, fdp, false);
1012				goto done;
1013			}
1014			atomic_inc_uint(&kfilter->refcnt);
1015		} else {
1016			/*
1017			 * The user may change some filter values after the
1018			 * initial EV_ADD, but doing so will not reset any
1019			 * filters which have already been triggered.
1020			 */
1021			kn->kn_sfflags = kev->fflags;
1022			kn->kn_sdata = kev->data;
1023			kn->kn_kevent.udata = kev->udata;
1024		}
1025		/*
1026		 * We can get here if we are trying to attach
1027		 * an event to a file descriptor that does not
1028		 * support events, and the attach routine is
1029		 * broken and does not return an error.
1030		 */
1031		KASSERT(kn->kn_fop->f_event != NULL);
1032		KERNEL_LOCK(1, NULL);			/* XXXSMP */
1033		rv = (*kn->kn_fop->f_event)(kn, 0);
1034		KERNEL_UNLOCK_ONE(NULL);		/* XXXSMP */
1035		if (rv)
1036			knote_activate(kn);
1037	} else {
1038		if (kn == NULL) {
1039			error = ENOENT;
1040		 	mutex_exit(&fdp->fd_lock);
1041			goto done;
1042		}
1043		if (kev->flags & EV_DELETE) {
1044			/* knote_detach() drops fdp->fd_lock */
1045			knote_detach(kn, fdp, true);
1046			goto done;
1047		}
1048	}
1049
1050	/* disable knote */
1051	if ((kev->flags & EV_DISABLE)) {
1052		mutex_spin_enter(&kq->kq_lock);
1053		if ((kn->kn_status & KN_DISABLED) == 0)
1054			kn->kn_status |= KN_DISABLED;
1055		mutex_spin_exit(&kq->kq_lock);
1056	}
1057
1058	/* enable knote */
1059	if ((kev->flags & EV_ENABLE)) {
1060		knote_enqueue(kn);
1061	}
1062	mutex_exit(&fdp->fd_lock);
1063 done:
1064	rw_exit(&kqueue_filter_lock);
1065	if (newkn != NULL)
1066		kmem_free(newkn, sizeof(*newkn));
1067	if (fp != NULL)
1068		fd_putfile(fd);
1069	return (error);
1070}
1071
1072#if defined(DEBUG)
1073static void
1074kq_check(struct kqueue *kq)
1075{
1076	const struct knote *kn;
1077	int count;
1078	int nmarker;
1079
1080	KASSERT(mutex_owned(&kq->kq_lock));
1081	KASSERT(kq->kq_count >= 0);
1082
1083	count = 0;
1084	nmarker = 0;
1085	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
1086		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
1087			panic("%s: kq=%p kn=%p inconsist 1", __func__, kq, kn);
1088		}
1089		if ((kn->kn_status & KN_MARKER) == 0) {
1090			if (kn->kn_kq != kq) {
1091				panic("%s: kq=%p kn=%p inconsist 2",
1092				    __func__, kq, kn);
1093			}
1094			if ((kn->kn_status & KN_ACTIVE) == 0) {
1095				panic("%s: kq=%p kn=%p: not active",
1096				    __func__, kq, kn);
1097			}
1098			count++;
1099			if (count > kq->kq_count) {
1100				goto bad;
1101			}
1102		} else {
1103			nmarker++;
1104#if 0
1105			if (nmarker > 10000) {
1106				panic("%s: kq=%p too many markers: %d != %d, "
1107				    "nmarker=%d",
1108				    __func__, kq, kq->kq_count, count, nmarker);
1109			}
1110#endif
1111		}
1112	}
1113	if (kq->kq_count != count) {
1114bad:
1115		panic("%s: kq=%p inconsist 3: %d != %d, nmarker=%d",
1116		    __func__, kq, kq->kq_count, count, nmarker);
1117	}
1118}
1119#else /* defined(DEBUG) */
1120#define	kq_check(a)	/* nothing */
1121#endif /* defined(DEBUG) */
1122
1123/*
1124 * Scan through the list of events on fp (for a maximum of maxevents),
1125 * returning the results in ulistp. The timeout is determined by tsp; if
1126 * NULL, wait indefinitely; if 0-valued, perform a poll; otherwise wait
1127 * as appropriate.
1128 */
1129static int
1130kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
1131	    const struct timespec *tsp, register_t *retval,
1132	    const struct kevent_ops *keops, struct kevent *kevbuf,
1133	    size_t kevcnt)
1134{
1135	struct kqueue	*kq;
1136	struct kevent	*kevp;
1137	struct timespec	ats, sleepts;
1138	struct knote	*kn, *marker;
1139	size_t		count, nkev, nevents;
1140	int		timeout, error, rv;
1141	filedesc_t	*fdp;
1142
1143	fdp = curlwp->l_fd;
1144	kq = fp->f_data;
1145	count = maxevents;
1146	nkev = nevents = error = 0;
1147	if (count == 0) {
1148		*retval = 0;
1149		return 0;
1150	}
1151
1152	if (tsp) {				/* timeout supplied */
1153		ats = *tsp;
1154		if (inittimeleft(&ats, &sleepts) == -1) {
1155			*retval = maxevents;
1156			return EINVAL;
1157		}
1158		timeout = tstohz(&ats);
1159		if (timeout <= 0)
1160			timeout = -1;           /* do poll */
1161	} else {
1162		/* no timeout, wait forever */
1163		timeout = 0;
1164	}
1165
1166	marker = kmem_zalloc(sizeof(*marker), KM_SLEEP);
1167	marker->kn_status = KN_MARKER;
1168	mutex_spin_enter(&kq->kq_lock);
1169 retry:
1170	kevp = kevbuf;
1171	if (kq->kq_count == 0) {
1172		if (timeout >= 0) {
1173			error = cv_timedwait_sig(&kq->kq_cv,
1174			    &kq->kq_lock, timeout);
1175			if (error == 0) {
1176				 if (tsp == NULL || (timeout =
1177				     gettimeleft(&ats, &sleepts)) > 0)
1178					goto retry;
1179			} else {
1180				/* don't restart after signals... */
1181				if (error == ERESTART)
1182					error = EINTR;
1183				if (error == EWOULDBLOCK)
1184					error = 0;
1185			}
1186		}
1187	} else {
1188		/* mark end of knote list */
1189		TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1190
1191		while (count != 0) {
1192			kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */
1193			while ((kn->kn_status & KN_MARKER) != 0) {
1194				if (kn == marker) {
1195					/* it's our marker, stop */
1196					TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1197					if (count < maxevents || (tsp != NULL &&
1198					    (timeout = gettimeleft(&ats,
1199					    &sleepts)) <= 0))
1200						goto done;
1201					goto retry;
1202				}
1203				/* someone else's marker. */
1204				kn = TAILQ_NEXT(kn, kn_tqe);
1205			}
1206			kq_check(kq);
1207			TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1208			kq->kq_count--;
1209			kn->kn_status &= ~KN_QUEUED;
1210			kq_check(kq);
1211			if (kn->kn_status & KN_DISABLED) {
1212				/* don't want disabled events */
1213				continue;
1214			}
1215			if ((kn->kn_flags & EV_ONESHOT) == 0) {
1216				mutex_spin_exit(&kq->kq_lock);
1217				KERNEL_LOCK(1, NULL);		/* XXXSMP */
1218				rv = (*kn->kn_fop->f_event)(kn, 0);
1219				KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
1220				mutex_spin_enter(&kq->kq_lock);
1221				/* Re-poll if note was re-enqueued. */
1222				if ((kn->kn_status & KN_QUEUED) != 0)
1223					continue;
1224				if (rv == 0) {
1225					/*
1226					 * non-ONESHOT event that hasn't
1227					 * triggered again, so de-queue.
1228					 */
1229					kn->kn_status &= ~KN_ACTIVE;
1230					continue;
1231				}
1232			}
1233			/* XXXAD should be got from f_event if !oneshot. */
1234			*kevp++ = kn->kn_kevent;
1235			nkev++;
1236			if (kn->kn_flags & EV_ONESHOT) {
1237				/* delete ONESHOT events after retrieval */
1238				mutex_spin_exit(&kq->kq_lock);
1239				mutex_enter(&fdp->fd_lock);
1240				knote_detach(kn, fdp, true);
1241				mutex_spin_enter(&kq->kq_lock);
1242			} else if (kn->kn_flags & EV_CLEAR) {
1243				/* clear state after retrieval */
1244				kn->kn_data = 0;
1245				kn->kn_fflags = 0;
1246				kn->kn_status &= ~KN_ACTIVE;
1247			} else {
1248				/* add event back on list */
1249				kq_check(kq);
1250				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1251				kq->kq_count++;
1252				kn->kn_status |= KN_QUEUED;
1253				kq_check(kq);
1254			}
1255			if (nkev == kevcnt) {
1256				/* do copyouts in kevcnt chunks */
1257				mutex_spin_exit(&kq->kq_lock);
1258				error = (*keops->keo_put_events)
1259				    (keops->keo_private,
1260				    kevbuf, ulistp, nevents, nkev);
1261				mutex_spin_enter(&kq->kq_lock);
1262				nevents += nkev;
1263				nkev = 0;
1264				kevp = kevbuf;
1265			}
1266			count--;
1267			if (error != 0 || count == 0) {
1268				/* remove marker */
1269				TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1270				break;
1271			}
1272		}
1273	}
1274 done:
1275 	mutex_spin_exit(&kq->kq_lock);
1276	if (marker != NULL)
1277		kmem_free(marker, sizeof(*marker));
1278	if (nkev != 0) {
1279		/* copyout remaining events */
1280		error = (*keops->keo_put_events)(keops->keo_private,
1281		    kevbuf, ulistp, nevents, nkev);
1282	}
1283	*retval = maxevents - count;
1284
1285	return error;
1286}
1287
1288/*
1289 * fileops ioctl method for a kqueue descriptor.
1290 *
1291 * Two ioctls are currently supported. They both use struct kfilter_mapping:
1292 *	KFILTER_BYFILTER	find the name for a filter, and return the result
1293 *				in name, which is of size len.
1294 *	KFILTER_BYNAME		find the filter for a given name. len is ignored.
1295 */
1296/*ARGSUSED*/
1297static int
1298kqueue_ioctl(file_t *fp, u_long com, void *data)
1299{
1300	struct kfilter_mapping	*km;
1301	const struct kfilter	*kfilter;
1302	char			*name;
1303	int			error;
1304
1305	km = data;
1306	error = 0;
1307	name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
1308
1309	switch (com) {
1310	case KFILTER_BYFILTER:	/* convert filter -> name */
1311		rw_enter(&kqueue_filter_lock, RW_READER);
1312		kfilter = kfilter_byfilter(km->filter);
1313		if (kfilter != NULL) {
1314			strlcpy(name, kfilter->name, KFILTER_MAXNAME);
1315			rw_exit(&kqueue_filter_lock);
1316			error = copyoutstr(name, km->name, km->len, NULL);
1317		} else {
1318			rw_exit(&kqueue_filter_lock);
1319			error = ENOENT;
1320		}
1321		break;
1322
1323	case KFILTER_BYNAME:	/* convert name -> filter */
1324		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
1325		if (error) {
1326			break;
1327		}
1328		rw_enter(&kqueue_filter_lock, RW_READER);
1329		kfilter = kfilter_byname(name);
1330		if (kfilter != NULL)
1331			km->filter = kfilter->filter;
1332		else
1333			error = ENOENT;
1334		rw_exit(&kqueue_filter_lock);
1335		break;
1336
1337	default:
1338		error = ENOTTY;
1339		break;
1340
1341	}
1342	kmem_free(name, KFILTER_MAXNAME);
1343	return (error);
1344}
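
/*
 * Userland usage sketch for the mapping ioctls handled above
 * (illustrative only; kq is assumed to be a kqueue descriptor and the
 * exact struct kfilter_mapping layout is defined in <sys/event.h>):
 *
 *	struct kfilter_mapping km;
 *	char buf[KFILTER_MAXNAME];
 *
 *	km.name = buf;
 *	km.len = sizeof(buf);
 *	km.filter = EVFILT_READ;
 *	if (ioctl(kq, KFILTER_BYFILTER, &km) == 0)
 *		printf("filter %u is named %s\n", km.filter, buf);
 */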
1345
1346/*
1347 * fileops fcntl method for a kqueue descriptor.
1348 */
1349static int
1350kqueue_fcntl(file_t *fp, u_int com, void *data)
1351{
1352
1353	return (ENOTTY);
1354}
1355
1356/*
1357 * fileops poll method for a kqueue descriptor.
1358 * Determine if kqueue has events pending.
1359 */
1360static int
1361kqueue_poll(file_t *fp, int events)
1362{
1363	struct kqueue	*kq;
1364	int		revents;
1365
1366	kq = fp->f_data;
1367
1368	revents = 0;
1369	if (events & (POLLIN | POLLRDNORM)) {
1370		mutex_spin_enter(&kq->kq_lock);
1371		if (kq->kq_count != 0) {
1372			revents |= events & (POLLIN | POLLRDNORM);
1373		} else {
1374			selrecord(curlwp, &kq->kq_sel);
1375		}
1376		kq_check(kq);
1377		mutex_spin_exit(&kq->kq_lock);
1378	}
1379
1380	return revents;
1381}
1382
1383/*
1384 * fileops stat method for a kqueue descriptor.
1385 * Returns dummy info, with st_size being number of events pending.
1386 */
1387static int
1388kqueue_stat(file_t *fp, struct stat *st)
1389{
1390	struct kqueue *kq;
1391
1392	kq = fp->f_data;
1393
1394	memset(st, 0, sizeof(*st));
1395	st->st_size = kq->kq_count;
1396	st->st_blksize = sizeof(struct kevent);
1397	st->st_mode = S_IFIFO;
1398
1399	return 0;
1400}
1401
1402static void
1403kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
1404{
1405	struct knote *kn;
1406	filedesc_t *fdp;
1407
1408	fdp = kq->kq_fdp;
1409
1410	KASSERT(mutex_owned(&fdp->fd_lock));
1411
1412	for (kn = SLIST_FIRST(list); kn != NULL;) {
1413		if (kq != kn->kn_kq) {
1414			kn = SLIST_NEXT(kn, kn_link);
1415			continue;
1416		}
1417		knote_detach(kn, fdp, true);
1418		mutex_enter(&fdp->fd_lock);
1419		kn = SLIST_FIRST(list);
1420	}
1421}
1422
1423
1424/*
1425 * fileops close method for a kqueue descriptor.
1426 */
1427static int
1428kqueue_close(file_t *fp)
1429{
1430	struct kqueue *kq;
1431	filedesc_t *fdp;
1432	fdfile_t *ff;
1433	int i;
1434
1435	kq = fp->f_data;
1436	fp->f_data = NULL;
1437	fp->f_type = 0;
1438	fdp = curlwp->l_fd;
1439
1440	mutex_enter(&fdp->fd_lock);
1441	for (i = 0; i <= fdp->fd_lastkqfile; i++) {
1442		if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
1443			continue;
1444		kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
1445	}
1446	if (fdp->fd_knhashmask != 0) {
1447		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
1448			kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
1449		}
1450	}
1451	mutex_exit(&fdp->fd_lock);
1452
1453	KASSERT(kq->kq_count == 0);
1454	mutex_destroy(&kq->kq_lock);
1455	cv_destroy(&kq->kq_cv);
1456	seldestroy(&kq->kq_sel);
1457	kmem_free(kq, sizeof(*kq));
1458
1459	return (0);
1460}
1461
1462/*
1463 * struct fileops kqfilter method for a kqueue descriptor.
1464 * Event triggered when monitored kqueue changes.
1465 */
1466static int
1467kqueue_kqfilter(file_t *fp, struct knote *kn)
1468{
1469	struct kqueue *kq;
1470	filedesc_t *fdp;
1471
1472	kq = ((file_t *)kn->kn_obj)->f_data;
1473
1474	KASSERT(fp == kn->kn_obj);
1475
1476	if (kn->kn_filter != EVFILT_READ)
1477		return 1;
1478
1479	kn->kn_fop = &kqread_filtops;
1480	fdp = curlwp->l_fd;
1481	mutex_enter(&kq->kq_lock);
1482	SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
1483	mutex_exit(&kq->kq_lock);
1484
1485	return 0;
1486}
1487
1488
1489/*
1490 * Walk down a list of knotes, activating them if their event has
1491 * triggered.  The caller's object lock (e.g. device driver lock)
1492 * must be held.
1493 */
1494void
1495knote(struct klist *list, long hint)
1496{
1497	struct knote *kn, *tmpkn;
1498
1499	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
1500		if ((*kn->kn_fop->f_event)(kn, hint))
1501			knote_activate(kn);
1502	}
1503}
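
/*
 * Illustrative sketch of how a subsystem delivers an event through the
 * helper above (the softc fields are hypothetical).  The caller's own
 * lock, which also protects the klist, must be held across the call:
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_events++;
 *	knote(&sc->sc_klist, 0);
 *	mutex_exit(&sc->sc_lock);
 *
 * Filters attached to sc_klist then run their f_event methods, and any
 * that return true are activated onto their owning kqueues.
 */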
1504
1505/*
1506 * Remove all knotes referencing a specified fd
1507 */
1508void
1509knote_fdclose(int fd)
1510{
1511	struct klist *list;
1512	struct knote *kn;
1513	filedesc_t *fdp;
1514
1515	fdp = curlwp->l_fd;
1516	list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
1517	mutex_enter(&fdp->fd_lock);
1518	while ((kn = SLIST_FIRST(list)) != NULL) {
1519		knote_detach(kn, fdp, true);
1520		mutex_enter(&fdp->fd_lock);
1521	}
1522	mutex_exit(&fdp->fd_lock);
1523}
1524
1525/*
1526 * Drop knote.  Called with fdp->fd_lock held, and will drop before
1527 * returning.
1528 */
1529static void
1530knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
1531{
1532	struct klist *list;
1533	struct kqueue *kq;
1534
1535	kq = kn->kn_kq;
1536
1537	KASSERT((kn->kn_status & KN_MARKER) == 0);
1538	KASSERT(mutex_owned(&fdp->fd_lock));
1539
1540	/* Remove from monitored object. */
1541	if (dofop) {
1542		KERNEL_LOCK(1, NULL);		/* XXXSMP */
1543		(*kn->kn_fop->f_detach)(kn);
1544		KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
1545	}
1546
1547	/* Remove from descriptor table. */
1548	if (kn->kn_fop->f_isfd)
1549		list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1550	else
1551		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1552
1553	SLIST_REMOVE(list, kn, knote, kn_link);
1554
1555	/* Remove from kqueue. */
1556	/* XXXAD should verify not in use by kqueue_scan. */
1557	mutex_spin_enter(&kq->kq_lock);
1558	if ((kn->kn_status & KN_QUEUED) != 0) {
1559		kq_check(kq);
1560		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1561		kn->kn_status &= ~KN_QUEUED;
1562		kq->kq_count--;
1563		kq_check(kq);
1564	}
1565	mutex_spin_exit(&kq->kq_lock);
1566
1567	mutex_exit(&fdp->fd_lock);
1568	if (kn->kn_fop->f_isfd)
1569		fd_putfile(kn->kn_id);
1570	atomic_dec_uint(&kn->kn_kfilter->refcnt);
1571	kmem_free(kn, sizeof(*kn));
1572}
1573
1574/*
1575 * Re-enable a knote (for EV_ENABLE) and queue it if it is active but not yet queued.
1576 */
1577static void
1578knote_enqueue(struct knote *kn)
1579{
1580	struct kqueue *kq;
1581
1582	KASSERT((kn->kn_status & KN_MARKER) == 0);
1583
1584	kq = kn->kn_kq;
1585
1586	mutex_spin_enter(&kq->kq_lock);
1587	if ((kn->kn_status & KN_DISABLED) != 0) {
1588		kn->kn_status &= ~KN_DISABLED;
1589	}
1590	if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
1591		kq_check(kq);
1592		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1593		kn->kn_status |= KN_QUEUED;
1594		kq->kq_count++;
1595		kq_check(kq);
1596		cv_broadcast(&kq->kq_cv);
1597		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1598	}
1599	mutex_spin_exit(&kq->kq_lock);
1600}
1601/*
1602 * Activate a knote and, unless it is disabled or already queued, enqueue it.
1603 */
1604static void
1605knote_activate(struct knote *kn)
1606{
1607	struct kqueue *kq;
1608
1609	KASSERT((kn->kn_status & KN_MARKER) == 0);
1610
1611	kq = kn->kn_kq;
1612
1613	mutex_spin_enter(&kq->kq_lock);
1614	kn->kn_status |= KN_ACTIVE;
1615	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
1616		kq_check(kq);
1617		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1618		kn->kn_status |= KN_QUEUED;
1619		kq->kq_count++;
1620		kq_check(kq);
1621		cv_broadcast(&kq->kq_cv);
1622		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1623	}
1624	mutex_spin_exit(&kq->kq_lock);
1625}
1626