1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29/*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54/*
55 *	@(#)kern_event.c       1.0 (3/31/2000)
56 */
57#include <stdint.h>
58
59#include <sys/param.h>
60#include <sys/systm.h>
61#include <sys/filedesc.h>
62#include <sys/kernel.h>
63#include <sys/proc_internal.h>
64#include <sys/kauth.h>
65#include <sys/malloc.h>
66#include <sys/unistd.h>
67#include <sys/file_internal.h>
68#include <sys/fcntl.h>
69#include <sys/select.h>
70#include <sys/queue.h>
71#include <sys/event.h>
72#include <sys/eventvar.h>
73#include <sys/protosw.h>
74#include <sys/socket.h>
75#include <sys/socketvar.h>
76#include <sys/stat.h>
77#include <sys/sysctl.h>
78#include <sys/uio.h>
79#include <sys/sysproto.h>
80#include <sys/user.h>
81#include <sys/vnode_internal.h>
82#include <string.h>
83#include <sys/proc_info.h>
84#include <sys/codesign.h>
85
86#include <kern/locks.h>
87#include <kern/clock.h>
88#include <kern/thread_call.h>
89#include <kern/sched_prim.h>
90#include <kern/zalloc.h>
91#include <kern/assert.h>
92
93#include <libkern/libkern.h>
94#include "net/net_str_id.h"
95
96#include <mach/task.h>
97
98#if VM_PRESSURE_EVENTS
99#include <kern/vm_pressure.h>
100#endif
101
102#if CONFIG_MEMORYSTATUS
103#include <sys/kern_memorystatus.h>
104#endif
105
106MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
107
108#define	KQ_EVENT	NULL
109
110static inline void kqlock(struct kqueue *kq);
111static inline void kqunlock(struct kqueue *kq);
112
113static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn);
114static int kqlock2knoteusewait(struct kqueue *kq, struct knote *kn);
115static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
116static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn);
117
118static void kqueue_wakeup(struct kqueue *kq, int closed);
119static int kqueue_read(struct fileproc *fp, struct uio *uio,
120    int flags, vfs_context_t ctx);
121static int kqueue_write(struct fileproc *fp, struct uio *uio,
122    int flags, vfs_context_t ctx);
123static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
124    vfs_context_t ctx);
125static int kqueue_select(struct fileproc *fp, int which, void *wql,
126    vfs_context_t ctx);
127static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
128static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
129	vfs_context_t ctx);
130static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
131
132static const struct fileops kqueueops = {
133	.fo_type = DTYPE_KQUEUE,
134	.fo_read = kqueue_read,
135	.fo_write = kqueue_write,
136	.fo_ioctl = kqueue_ioctl,
137	.fo_select = kqueue_select,
138	.fo_close = kqueue_close,
139	.fo_kqfilter = kqueue_kqfilter,
140	.fo_drain = kqueue_drain,
141};
142
143static int kevent_internal(struct proc *p, int iskev64, user_addr_t changelist,
144    int nchanges, user_addr_t eventlist, int nevents, int fd,
145    user_addr_t utimeout, unsigned int flags, int32_t *retval);
146static int kevent_copyin(user_addr_t *addrp, struct kevent64_s *kevp,
147    struct proc *p, int iskev64);
148static int kevent_copyout(struct kevent64_s *kevp, user_addr_t *addrp,
149    struct proc *p, int iskev64);
150char * kevent_description(struct kevent64_s *kevp, char *s, size_t n);
151
152static int kevent_callback(struct kqueue *kq, struct kevent64_s *kevp,
153    void *data);
154static void kevent_continue(struct kqueue *kq, void *data, int error);
155static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
156static int kqueue_process(struct kqueue *kq, kevent_callback_t callback,
157    void *data, int *countp, struct proc *p);
158static int kqueue_begin_processing(struct kqueue *kq);
159static void kqueue_end_processing(struct kqueue *kq);
160static int knote_process(struct knote *kn, kevent_callback_t callback,
161    void *data, struct kqtailq *inprocessp, struct proc *p);
162static void knote_put(struct knote *kn);
163static int knote_fdpattach(struct knote *kn, struct filedesc *fdp,
164    struct proc *p);
165static void knote_drop(struct knote *kn, struct proc *p);
166static void knote_activate(struct knote *kn, int);
167static void knote_deactivate(struct knote *kn);
168static void knote_enqueue(struct knote *kn);
169static void knote_dequeue(struct knote *kn);
170static struct knote *knote_alloc(void);
171static void knote_free(struct knote *kn);
172
173static int filt_fileattach(struct knote *kn);
174static struct filterops file_filtops = {
175	.f_isfd = 1,
176	.f_attach = filt_fileattach,
177};
178
179static void filt_kqdetach(struct knote *kn);
180static int filt_kqueue(struct knote *kn, long hint);
181static struct filterops kqread_filtops = {
182	.f_isfd = 1,
183	.f_detach = filt_kqdetach,
184	.f_event = filt_kqueue,
185};
186
187/* placeholder for not-yet-implemented filters */
188static int filt_badattach(struct knote *kn);
189static struct filterops bad_filtops = {
190	.f_attach = filt_badattach,
191};
192
193static int filt_procattach(struct knote *kn);
194static void filt_procdetach(struct knote *kn);
195static int filt_proc(struct knote *kn, long hint);
196static struct filterops proc_filtops = {
197	.f_attach = filt_procattach,
198	.f_detach = filt_procdetach,
199	.f_event = filt_proc,
200};
201
202#if VM_PRESSURE_EVENTS
203static int filt_vmattach(struct knote *kn);
204static void filt_vmdetach(struct knote *kn);
205static int filt_vm(struct knote *kn, long hint);
206static struct filterops vm_filtops = {
207	.f_attach = filt_vmattach,
208	.f_detach = filt_vmdetach,
209	.f_event = filt_vm,
210};
211#endif /* VM_PRESSURE_EVENTS */
212
213#if CONFIG_MEMORYSTATUS
214extern struct filterops memorystatus_filtops;
215#endif /* CONFIG_MEMORYSTATUS */
216
217extern struct filterops fs_filtops;
218
219extern struct filterops sig_filtops;
220
221/* Timer filter */
222static int filt_timerattach(struct knote *kn);
223static void filt_timerdetach(struct knote *kn);
224static int filt_timer(struct knote *kn, long hint);
225static void filt_timertouch(struct knote *kn, struct kevent64_s *kev,
226    long type);
227static struct filterops timer_filtops = {
228	.f_attach = filt_timerattach,
229	.f_detach = filt_timerdetach,
230	.f_event = filt_timer,
231	.f_touch = filt_timertouch,
232};
233
234/* Helpers */
235static void filt_timerexpire(void *knx, void *param1);
236static int filt_timervalidate(struct knote *kn);
237static void filt_timerupdate(struct knote *kn);
238static void filt_timercancel(struct knote *kn);
239
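/* timer filter state bits, kept in the knote's kn_hookid field */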
240#define	TIMER_RUNNING		0x1
241#define	TIMER_CANCELWAIT	0x2
242
243static lck_mtx_t _filt_timerlock;
244static void filt_timerlock(void);
245static void filt_timerunlock(void);
246
247static zone_t knote_zone;
248
#define	KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
250
251#if 0
252extern struct filterops aio_filtops;
253#endif
254
255/* Mach portset filter */
256extern struct filterops machport_filtops;
257
258/* User filter */
259static int filt_userattach(struct knote *kn);
260static void filt_userdetach(struct knote *kn);
261static int filt_user(struct knote *kn, long hint);
262static void filt_usertouch(struct knote *kn, struct kevent64_s *kev,
263    long type);
264static struct filterops user_filtops = {
265	.f_attach = filt_userattach,
266	.f_detach = filt_userdetach,
267	.f_event = filt_user,
268	.f_touch = filt_usertouch,
269};
270
271/*
272 * Table for all system-defined filters.
273 */
274static struct filterops *sysfilt_ops[] = {
275	&file_filtops,			/* EVFILT_READ */
276	&file_filtops,			/* EVFILT_WRITE */
277#if 0
278	&aio_filtops,			/* EVFILT_AIO */
279#else
280	&bad_filtops,			/* EVFILT_AIO */
281#endif
282	&file_filtops,			/* EVFILT_VNODE */
283	&proc_filtops,			/* EVFILT_PROC */
284	&sig_filtops,			/* EVFILT_SIGNAL */
285	&timer_filtops,			/* EVFILT_TIMER */
286	&machport_filtops,		/* EVFILT_MACHPORT */
287	&fs_filtops,			/* EVFILT_FS */
288	&user_filtops,			/* EVFILT_USER */
289	&bad_filtops,			/* unused */
290#if VM_PRESSURE_EVENTS
291	&vm_filtops,			/* EVFILT_VM */
292#else
293	&bad_filtops,			/* EVFILT_VM */
294#endif
295	&file_filtops,			/* EVFILT_SOCK */
296#if CONFIG_MEMORYSTATUS
297	&memorystatus_filtops,  /* EVFILT_MEMORYSTATUS */
298#else
299	&bad_filtops,			/* EVFILT_MEMORYSTATUS */
300#endif
301};
302
303/*
 * kqueue/knote lock attributes and implementations
 *
 *	kqueues have locks, while knotes have use counts.
 *	Most of the knote state is guarded by the object lock;
 *	the knote "inuse" count and status use the kqueue lock.
309 */
310lck_grp_attr_t * kq_lck_grp_attr;
311lck_grp_t * kq_lck_grp;
312lck_attr_t * kq_lck_attr;
313
314static inline void
315kqlock(struct kqueue *kq)
316{
317	lck_spin_lock(&kq->kq_lock);
318}
319
320static inline void
321kqunlock(struct kqueue *kq)
322{
323	lck_spin_unlock(&kq->kq_lock);
324}
325
326/*
 * Convert a kq lock to a knote use reference.
328 *
329 *	If the knote is being dropped, we can't get
330 *	a use reference, so just return with it
331 *	still locked.
332 *	- kq locked at entry
333 *	- unlock on exit if we get the use reference
334 */
335static int
336kqlock2knoteuse(struct kqueue *kq, struct knote *kn)
337{
338	if (kn->kn_status & KN_DROPPING)
339		return (0);
340	kn->kn_inuse++;
341	kqunlock(kq);
342	return (1);
343}
344
345/*
 * Convert a kq lock to a knote use reference,
347 * but wait for attach and drop events to complete.
348 *
 *	If the knote is being dropped or attached, we
 *	can't get a use reference; block until that
 *	operation completes, then return 0 (unlocked).
352 *	- kq locked at entry
353 *	- kq always unlocked on exit
354 */
355static int
356kqlock2knoteusewait(struct kqueue *kq, struct knote *kn)
357{
358	if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
359		kn->kn_status |= KN_USEWAIT;
360		wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
361		    &kn->kn_status, THREAD_UNINT, 0);
362		kqunlock(kq);
363		thread_block(THREAD_CONTINUE_NULL);
364		return (0);
365	}
366	kn->kn_inuse++;
367	kqunlock(kq);
368	return (1);
369}
370
371/*
372 * Convert from a knote use reference back to kq lock.
373 *
374 *	Drop a use reference and wake any waiters if
375 *	this is the last one.
376 *
377 *	The exit return indicates if the knote is
378 *	still alive - but the kqueue lock is taken
379 *	unconditionally.
380 */
381static int
382knoteuse2kqlock(struct kqueue *kq, struct knote *kn)
383{
384	kqlock(kq);
385	if (--kn->kn_inuse == 0) {
386		if ((kn->kn_status & KN_ATTACHING) != 0) {
387			kn->kn_status &= ~KN_ATTACHING;
388		}
389		if ((kn->kn_status & KN_USEWAIT) != 0) {
390			kn->kn_status &= ~KN_USEWAIT;
391			wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs,
392			    &kn->kn_status, THREAD_AWAKENED);
393		}
394	}
395	return ((kn->kn_status & KN_DROPPING) == 0);
396}
397
398/*
399 * Convert a kq lock to a knote drop reference.
400 *
401 *	If the knote is in use, wait for the use count
402 *	to subside.  We first mark our intention to drop
403 *	it - keeping other users from "piling on."
404 *	If we are too late, we have to wait for the
405 *	other drop to complete.
406 *
407 *	- kq locked at entry
408 *	- always unlocked on exit.
409 *	- caller can't hold any locks that would prevent
410 *	  the other dropper from completing.
411 */
412static int
413kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
414{
415	int oktodrop;
416
417	oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
418	kn->kn_status |= KN_DROPPING;
419	if (oktodrop) {
420		if (kn->kn_inuse == 0) {
421			kqunlock(kq);
422			return (oktodrop);
423		}
424	}
425	kn->kn_status |= KN_USEWAIT;
426	wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kn->kn_status,
427	    THREAD_UNINT, 0);
428	kqunlock(kq);
429	thread_block(THREAD_CONTINUE_NULL);
430	return (oktodrop);
431}
432
433/*
434 * Release a knote use count reference.
435 */
436static void
437knote_put(struct knote *kn)
438{
439	struct kqueue *kq = kn->kn_kq;
440
441	kqlock(kq);
442	if (--kn->kn_inuse == 0) {
443		if ((kn->kn_status & KN_USEWAIT) != 0) {
444			kn->kn_status &= ~KN_USEWAIT;
445			wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs,
446			    &kn->kn_status, THREAD_AWAKENED);
447		}
448	}
449	kqunlock(kq);
450}
451
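/*
 * filt_fileattach - attach a knote to a file-based event source
 *
 *	Delegates to the fileops kqfilter routine of the underlying file
 *	(socket, vnode, pipe, ...), which installs the type-specific
 *	filter operations.
 */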
452static int
453filt_fileattach(struct knote *kn)
454{
455	return (fo_kqfilter(kn->kn_fp, kn, vfs_context_current()));
456}
457
458#define	f_flag f_fglob->fg_flag
459#define	f_msgcount f_fglob->fg_msgcount
460#define	f_cred f_fglob->fg_cred
461#define	f_ops f_fglob->fg_ops
462#define	f_offset f_fglob->fg_offset
463#define	f_data f_fglob->fg_data
464
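/*
 * EVFILT_READ support for kqueue file descriptors themselves: the knote
 * is hung off the kqueue's selinfo klist, and the event reports the
 * number of events currently queued on that kqueue.
 */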
465static void
466filt_kqdetach(struct knote *kn)
467{
468	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
469
470	kqlock(kq);
471	KNOTE_DETACH(&kq->kq_sel.si_note, kn);
472	kqunlock(kq);
473}
474
475/*ARGSUSED*/
476static int
477filt_kqueue(struct knote *kn, __unused long hint)
478{
479	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
480
481	kn->kn_data = kq->kq_count;
482	return (kn->kn_data > 0);
483}
484
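/*
 * filt_procattach - attach an EVFILT_PROC knote to its target process
 *
 *	The target must exist, and a request that includes NOTE_EXITSTATUS
 *	is only honored for the target's parent (or tracing parent).  The
 *	knote is linked onto the process klist and is implicitly EV_CLEAR.
 */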
485static int
486filt_procattach(struct knote *kn)
487{
488	struct proc *p;
489
490	assert(PID_MAX < NOTE_PDATAMASK);
491
492	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0)
493		return (ENOTSUP);
494
495	p = proc_find(kn->kn_id);
496	if (p == NULL) {
497		return (ESRCH);
498	}
499
500	const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
501
502	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
503		do {
504			pid_t selfpid = proc_selfpid();
505
506			if (p->p_ppid == selfpid)
507				break;	/* parent => ok */
508
509			if ((p->p_lflag & P_LTRACED) != 0 &&
510			    (p->p_oppid == selfpid))
511				break;	/* parent-in-waiting => ok */
512
513			proc_rele(p);
514			return (EACCES);
515		} while (0);
516
517	proc_klist_lock();
518
519	kn->kn_flags |= EV_CLEAR;	/* automatically set */
520	kn->kn_ptr.p_proc = p;		/* store the proc handle */
521
522	KNOTE_ATTACH(&p->p_klist, kn);
523
524	proc_klist_unlock();
525
526	proc_rele(p);
527
528	return (0);
529}
530
531/*
532 * The knote may be attached to a different process, which may exit,
533 * leaving nothing for the knote to be attached to.  In that case,
534 * the pointer to the process will have already been nulled out.
535 */
536static void
537filt_procdetach(struct knote *kn)
538{
539	struct proc *p;
540
541	proc_klist_lock();
542
543	p = kn->kn_ptr.p_proc;
544	if (p != PROC_NULL) {
545		kn->kn_ptr.p_proc = PROC_NULL;
546		KNOTE_DETACH(&p->p_klist, kn);
547	}
548
549	proc_klist_unlock();
550}
551
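/*
 * filt_proc - the event routine for EVFILT_PROC knotes
 *
 *	hint == 0 is a query from above; a non-zero hint carries a real
 *	process-lifecycle event and is always delivered with the
 *	proc_klist_lock held.
 */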
552static int
553filt_proc(struct knote *kn, long hint)
554{
	/*
	 * Note: a lot of bits in hint may be obtained from the knote.
	 * To free some of those bits, see <rdar://problem/12592988>
	 * (Freeing up bits in hint for filt_proc).
	 */
560	/* hint is 0 when called from above */
561	if (hint != 0) {
562		u_int event;
563
564		/* ALWAYS CALLED WITH proc_klist_lock when (hint != 0) */
565
566		/*
567		 * mask off extra data
568		 */
569		event = (u_int)hint & NOTE_PCTRLMASK;
570
571		/*
572		 * termination lifecycle events can happen while a debugger
573		 * has reparented a process, in which case notifications
574		 * should be quashed except to the tracing parent. When
575		 * the debugger reaps the child (either via wait4(2) or
576		 * process exit), the child will be reparented to the original
577		 * parent and these knotes re-fired.
578		 */
579		if (event & NOTE_EXIT) {
580			if ((kn->kn_ptr.p_proc->p_oppid != 0)
581				&& (kn->kn_kq->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
582				/*
583				 * This knote is not for the current ptrace(2) parent, ignore.
584				 */
585				return 0;
586			}
587		}
588
589		/*
590		 * if the user is interested in this event, record it.
591		 */
592		if (kn->kn_sfflags & event)
593			kn->kn_fflags |= event;
594
595#pragma clang diagnostic push
596#pragma clang diagnostic ignored "-Wdeprecated-declarations"
597		if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
598			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
599		}
600#pragma clang diagnostic pop
601
602
603		/*
604		 * The kernel has a wrapper in place that returns the same data
605		 * as is collected here, in kn_data.  Any changes to how
606		 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
607		 * should also be reflected in the proc_pidnoteexit() wrapper.
608		 */
609		if (event == NOTE_EXIT) {
610			kn->kn_data = 0;
611			if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
612				kn->kn_fflags |= NOTE_EXITSTATUS;
613				kn->kn_data |= (hint & NOTE_PDATAMASK);
614			}
615			if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
616				kn->kn_fflags |= NOTE_EXIT_DETAIL;
617				if ((kn->kn_ptr.p_proc->p_lflag &
618				    P_LTERM_DECRYPTFAIL) != 0) {
619					kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
620				}
621				if ((kn->kn_ptr.p_proc->p_lflag &
622				    P_LTERM_JETSAM) != 0) {
623					kn->kn_data |= NOTE_EXIT_MEMORY;
624					switch (kn->kn_ptr.p_proc->p_lflag &
625					    P_JETSAM_MASK) {
626						case P_JETSAM_VMPAGESHORTAGE:
627							kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
628							break;
629						case P_JETSAM_VMTHRASHING:
630							kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
631							break;
632						case P_JETSAM_FCTHRASHING:
633							kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
634							break;
635						case P_JETSAM_VNODE:
636							kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
637							break;
638						case P_JETSAM_HIWAT:
639							kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
640							break;
641						case P_JETSAM_PID:
642							kn->kn_data |= NOTE_EXIT_MEMORY_PID;
643							break;
644						case P_JETSAM_IDLEEXIT:
645							kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
646							break;
647					}
648				}
649				if ((kn->kn_ptr.p_proc->p_csflags &
650				    CS_KILLED) != 0) {
651					kn->kn_data |= NOTE_EXIT_CSERROR;
652				}
653			}
654		}
655	}
656
657	/* atomic check, no locking need when called from above */
658	return (kn->kn_fflags != 0);
659}
660
661#if VM_PRESSURE_EVENTS
662/*
663 * Virtual memory kevents
664 *
665 * author: Matt Jacobson [matthew_jacobson@apple.com]
666 */
667
668static int
669filt_vmattach(struct knote *kn)
670{
671	/*
672	 * The note will be cleared once the information has been flushed to
673	 * the client. If there is still pressure, we will be re-alerted.
674	 */
675	kn->kn_flags |= EV_CLEAR;
676	return (vm_knote_register(kn));
677}
678
679static void
680filt_vmdetach(struct knote *kn)
681{
682	vm_knote_unregister(kn);
683}
684
685static int
686filt_vm(struct knote *kn, long hint)
687{
688	/* hint == 0 means this is just an alive? check (always true) */
689	if (hint != 0) {
690		const pid_t pid = (pid_t)hint;
691		if ((kn->kn_sfflags & NOTE_VM_PRESSURE) &&
692		    (kn->kn_kq->kq_p->p_pid == pid)) {
693			kn->kn_fflags |= NOTE_VM_PRESSURE;
694		}
695	}
696
697	return (kn->kn_fflags != 0);
698}
699#endif /* VM_PRESSURE_EVENTS */
700
701/*
702 * filt_timervalidate - process data from user
703 *
704 *	Converts to either interval or deadline format.
705 *
 *	The saved-data field in the knote contains the
 *	time value.  The saved filter-flags indicate the
 *	unit of measurement (NOTE_SECONDS, NOTE_USECONDS,
 *	NOTE_NSECONDS; milliseconds if none is set).
709 *
710 *	After validation, either the saved-data field
711 *	contains the interval in absolute time, or ext[0]
712 *	contains the expected deadline. If that deadline
713 *	is in the past, ext[0] is 0.
714 *
715 *	Returns EINVAL for unrecognized units of time.
716 *
717 *	Timer filter lock is held.
718 *
719 */
720static int
721filt_timervalidate(struct knote *kn)
722{
723	uint64_t multiplier;
724	uint64_t raw = 0;
725
726	switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) {
727	case NOTE_SECONDS:
728		multiplier = NSEC_PER_SEC;
729		break;
730	case NOTE_USECONDS:
731		multiplier = NSEC_PER_USEC;
732		break;
733	case NOTE_NSECONDS:
734		multiplier = 1;
735		break;
736	case 0: /* milliseconds (default) */
737		multiplier = NSEC_PER_SEC / 1000;
738		break;
739	default:
740		return (EINVAL);
741	}
742
	/*
	 * If a leeway value was passed (NOTE_LEEWAY), convert the slop
	 * delta in kn_ext[1] to the same (absolute) time scale as well.
	 */
	if (kn->kn_sfflags & NOTE_LEEWAY) {
		nanoseconds_to_absolutetime((uint64_t)kn->kn_ext[1] * multiplier, &raw);
		kn->kn_ext[1] = raw;
	}
748
749	nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * multiplier, &raw);
750
751	kn->kn_ext[0] = 0;
752	kn->kn_sdata = 0;
753
754	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
755		clock_sec_t seconds;
756		clock_nsec_t nanoseconds;
757		uint64_t now;
758
759		clock_get_calendar_nanotime(&seconds, &nanoseconds);
760		nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC +
761		    nanoseconds, &now);
762
763		if (raw < now) {
764			/* time has already passed */
765			kn->kn_ext[0] = 0;
766		} else {
767			raw -= now;
768			clock_absolutetime_interval_to_deadline(raw,
769			    &kn->kn_ext[0]);
770		}
771	} else {
772		kn->kn_sdata = raw;
773	}
774
775	return (0);
776}
777
778/*
779 * filt_timerupdate - compute the next deadline
780 *
781 * 	Repeating timers store their interval in kn_sdata. Absolute
782 * 	timers have already calculated the deadline, stored in ext[0].
783 *
784 * 	On return, the next deadline (or zero if no deadline is needed)
785 * 	is stored in kn_ext[0].
786 *
787 * 	Timer filter lock is held.
788 */
789static void
790filt_timerupdate(struct knote *kn)
791{
792	/* if there's no interval, deadline is just in kn_ext[0] */
793	if (kn->kn_sdata == 0)
794		return;
795
796	/* if timer hasn't fired before, fire in interval nsecs */
797	if (kn->kn_ext[0] == 0) {
798		clock_absolutetime_interval_to_deadline(kn->kn_sdata,
799		    &kn->kn_ext[0]);
800	} else {
801		/*
802		 * If timer has fired before, schedule the next pop
803		 * relative to the last intended deadline.
804		 *
805		 * We could check for whether the deadline has expired,
806		 * but the thread call layer can handle that.
807		 */
808		kn->kn_ext[0] += kn->kn_sdata;
809	}
810}
811
812/*
813 * filt_timerexpire - the timer callout routine
814 *
815 * Just propagate the timer event into the knote
816 * filter routine (by going through the knote
817 * synchronization point).  Pass a hint to
818 * indicate this is a real event, not just a
819 * query from above.
820 */
821static void
822filt_timerexpire(void *knx, __unused void *spare)
823{
824	struct klist timer_list;
825	struct knote *kn = knx;
826
827	filt_timerlock();
828
829	kn->kn_hookid &= ~TIMER_RUNNING;
830
831	/* no "object" for timers, so fake a list */
832	SLIST_INIT(&timer_list);
833	SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
834	KNOTE(&timer_list, 1);
835
836	/* if someone is waiting for timer to pop */
837	if (kn->kn_hookid & TIMER_CANCELWAIT) {
838		struct kqueue *kq = kn->kn_kq;
839		wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_hook,
840		    THREAD_AWAKENED);
841	}
842
843	filt_timerunlock();
844}
845
846/*
847 * Cancel a running timer (or wait for the pop).
848 * Timer filter lock is held.
849 */
850static void
851filt_timercancel(struct knote *kn)
852{
853	struct kqueue *kq = kn->kn_kq;
854	thread_call_t callout = kn->kn_hook;
855	boolean_t cancelled;
856
857	if (kn->kn_hookid & TIMER_RUNNING) {
858		/* cancel the callout if we can */
859		cancelled = thread_call_cancel(callout);
860		if (cancelled) {
861			kn->kn_hookid &= ~TIMER_RUNNING;
862		} else {
863			/* we have to wait for the expire routine.  */
864			kn->kn_hookid |= TIMER_CANCELWAIT;
865			wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
866			    &kn->kn_hook, THREAD_UNINT, 0);
867			filt_timerunlock();
868			thread_block(THREAD_CONTINUE_NULL);
869			filt_timerlock();
870			assert((kn->kn_hookid & TIMER_RUNNING) == 0);
871		}
872	}
873}
874
875/*
876 * Allocate a thread call for the knote's lifetime, and kick off the timer.
877 */
878static int
879filt_timerattach(struct knote *kn)
880{
881	thread_call_t callout;
882	int error;
883
884	callout = thread_call_allocate(filt_timerexpire, kn);
885	if (NULL == callout)
886		return (ENOMEM);
887
888	filt_timerlock();
889	error = filt_timervalidate(kn);
890	if (error != 0) {
891		filt_timerunlock();
892		return (error);
893	}
894
895	kn->kn_hook = (void*)callout;
896	kn->kn_hookid = 0;
897
	/* absolute timers are implicitly EV_ONESHOT */
899	if (kn->kn_sfflags & NOTE_ABSOLUTE)
900		kn->kn_flags |= EV_ONESHOT;
901
902	filt_timerupdate(kn);
903	if (kn->kn_ext[0]) {
904		kn->kn_flags |= EV_CLEAR;
905		unsigned int timer_flags = 0;
906		if (kn->kn_sfflags & NOTE_CRITICAL)
907			timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
908		else if (kn->kn_sfflags & NOTE_BACKGROUND)
909			timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
910		else
911			timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
912
913		if (kn->kn_sfflags & NOTE_LEEWAY)
914			timer_flags |= THREAD_CALL_DELAY_LEEWAY;
915
916		thread_call_enter_delayed_with_leeway(callout, NULL,
917				kn->kn_ext[0], kn->kn_ext[1], timer_flags);
918
919		kn->kn_hookid |= TIMER_RUNNING;
920	} else {
921		/* fake immediate */
922		kn->kn_data = 1;
923	}
924
925	filt_timerunlock();
926	return (0);
927}
928
929/*
930 * Shut down the timer if it's running, and free the callout.
931 */
932static void
933filt_timerdetach(struct knote *kn)
934{
935	thread_call_t callout;
936
937	filt_timerlock();
938
939	callout = (thread_call_t)kn->kn_hook;
940	filt_timercancel(kn);
941
942	filt_timerunlock();
943
944	thread_call_free(callout);
945}
946
947
948
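/*
 * filt_timer - the event routine for EVFILT_TIMER knotes
 *
 *	A non-zero hint is a real timer pop, called from filt_timerexpire
 *	with the timer filter lock already held: count the pop in kn_data
 *	and, for repeating timers, re-arm the thread call.  A zero hint is
 *	a query from above, so just report whether any pops are pending.
 *
 *	Illustrative registration from userspace (a sketch, not taken from
 *	this file): a 500ms repeating timer could be requested with
 *		EV_SET64(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_USECONDS,
 *		    500000, 0, 0, 0);
 */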
949static int
950filt_timer(struct knote *kn, long hint)
951{
952	int result;
953
954	if (hint) {
955		/* real timer pop -- timer lock held by filt_timerexpire */
956		kn->kn_data++;
957
958		if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) &&
959				((kn->kn_flags & EV_ONESHOT) == 0)) {
960
961			/* evaluate next time to fire */
962			filt_timerupdate(kn);
963
964			if (kn->kn_ext[0]) {
965				unsigned int timer_flags = 0;
966
967				/* keep the callout and re-arm */
968				if (kn->kn_sfflags & NOTE_CRITICAL)
969					timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
970				else if (kn->kn_sfflags & NOTE_BACKGROUND)
971					timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
972				else
973					timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
974
975				if (kn->kn_sfflags & NOTE_LEEWAY)
976					timer_flags |= THREAD_CALL_DELAY_LEEWAY;
977
978				thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
979						kn->kn_ext[0], kn->kn_ext[1], timer_flags);
980
981				kn->kn_hookid |= TIMER_RUNNING;
982			}
983		}
984
985		return (1);
986	}
987
988	/* user-query */
989	filt_timerlock();
990
991	result = (kn->kn_data != 0);
992
993	filt_timerunlock();
994
995	return (result);
996}
997
998
999/*
1000 * filt_timertouch - update knote with new user input
1001 *
1002 * Cancel and restart the timer based on new user data. When
1003 * the user picks up a knote, clear the count of how many timer
1004 * pops have gone off (in kn_data).
1005 */
1006static void
1007filt_timertouch(struct knote *kn, struct kevent64_s *kev, long type)
1008{
1009	int error;
1010	filt_timerlock();
1011
1012	switch (type) {
1013	case EVENT_REGISTER:
1014		/* cancel current call */
1015		filt_timercancel(kn);
1016
1017		/* recalculate deadline */
1018		kn->kn_sdata = kev->data;
1019		kn->kn_sfflags = kev->fflags;
1020		kn->kn_ext[0] = kev->ext[0];
1021		kn->kn_ext[1] = kev->ext[1];
1022
1023		error = filt_timervalidate(kn);
1024		if (error) {
1025			/* no way to report error, so mark it in the knote */
1026			kn->kn_flags |= EV_ERROR;
1027			kn->kn_data = error;
1028			break;
1029		}
1030
1031		/* start timer if necessary */
1032		filt_timerupdate(kn);
1033
1034		if (kn->kn_ext[0]) {
1035			unsigned int timer_flags = 0;
1036			if (kn->kn_sfflags & NOTE_CRITICAL)
1037				timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1038			else if (kn->kn_sfflags & NOTE_BACKGROUND)
1039				timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1040			else
1041				timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1042
1043			if (kn->kn_sfflags & NOTE_LEEWAY)
1044				timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1045
1046			thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
1047					kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1048
1049			kn->kn_hookid |= TIMER_RUNNING;
1050		} else {
1051			/* pretend the timer has fired */
1052			kn->kn_data = 1;
1053		}
1054
1055		break;
1056
1057	case EVENT_PROCESS:
1058		/* reset the timer pop count in kn_data */
1059		*kev = kn->kn_kevent;
1060		kev->ext[0] = 0;
1061		kn->kn_data = 0;
1062		if (kn->kn_flags & EV_CLEAR)
1063			kn->kn_fflags = 0;
1064		break;
1065	default:
1066		panic("%s: - invalid type (%ld)", __func__, type);
1067		break;
1068	}
1069
1070	filt_timerunlock();
1071}
1072
1073static void
1074filt_timerlock(void)
1075{
1076	lck_mtx_lock(&_filt_timerlock);
1077}
1078
1079static void
1080filt_timerunlock(void)
1081{
1082	lck_mtx_unlock(&_filt_timerlock);
1083}
1084
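/*
 * EVFILT_USER - events triggered explicitly from userspace
 *
 *	The knote is not attached to any kernel object; kn_hookid simply
 *	records whether NOTE_TRIGGER has been seen, and the NOTE_FF*
 *	control bits select how new fflags are folded into the saved ones.
 *
 *	Illustrative userspace usage (a sketch, not taken from this file):
 *
 *		struct kevent64_s kev;
 *		EV_SET64(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR,
 *		    0, 0, 0, 0, 0);
 *		kevent64(kq, &kev, 1, NULL, 0, 0, NULL);	// register
 *		EV_SET64(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER,
 *		    0, 0, 0, 0);
 *		kevent64(kq, &kev, 1, NULL, 0, 0, NULL);	// fire it
 */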
1085static int
1086filt_userattach(struct knote *kn)
1087{
1088	/* EVFILT_USER knotes are not attached to anything in the kernel */
1089	kn->kn_hook = NULL;
1090	if (kn->kn_fflags & NOTE_TRIGGER) {
1091		kn->kn_hookid = 1;
1092	} else {
1093		kn->kn_hookid = 0;
1094	}
1095	return (0);
1096}
1097
1098static void
1099filt_userdetach(__unused struct knote *kn)
1100{
1101	/* EVFILT_USER knotes are not attached to anything in the kernel */
1102}
1103
1104static int
1105filt_user(struct knote *kn, __unused long hint)
1106{
1107	return (kn->kn_hookid);
1108}
1109
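/*
 * filt_usertouch - fold new user input into an EVFILT_USER knote
 *
 *	On EVENT_REGISTER, note a NOTE_TRIGGER and apply the requested
 *	NOTE_FFAND/NOTE_FFOR/NOTE_FFCOPY operation to the saved fflags.
 *	On EVENT_PROCESS, report the saved state and, for EV_CLEAR
 *	knotes, reset it.
 */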
1110static void
1111filt_usertouch(struct knote *kn, struct kevent64_s *kev, long type)
1112{
1113	uint32_t ffctrl;
1114	switch (type) {
1115	case EVENT_REGISTER:
1116		if (kev->fflags & NOTE_TRIGGER) {
1117			kn->kn_hookid = 1;
1118		}
1119
1120		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1121		kev->fflags &= NOTE_FFLAGSMASK;
1122		switch (ffctrl) {
1123		case NOTE_FFNOP:
1124			break;
1125		case NOTE_FFAND:
1126			OSBitAndAtomic(kev->fflags, &kn->kn_sfflags);
1127			break;
1128		case NOTE_FFOR:
1129			OSBitOrAtomic(kev->fflags, &kn->kn_sfflags);
1130			break;
1131		case NOTE_FFCOPY:
1132			kn->kn_sfflags = kev->fflags;
1133			break;
1134		}
1135		kn->kn_sdata = kev->data;
1136		break;
1137	case EVENT_PROCESS:
1138		*kev = kn->kn_kevent;
1139		kev->fflags = (volatile UInt32)kn->kn_sfflags;
1140		kev->data = kn->kn_sdata;
1141		if (kn->kn_flags & EV_CLEAR) {
1142			kn->kn_hookid = 0;
1143			kn->kn_data = 0;
1144			kn->kn_fflags = 0;
1145		}
1146		break;
1147	default:
1148		panic("%s: - invalid type (%ld)", __func__, type);
1149		break;
1150	}
1151}
1152
1153/*
1154 * JMM - placeholder for not-yet-implemented filters
1155 */
1156static int
1157filt_badattach(__unused struct knote *kn)
1158{
1159	return (ENOTSUP);
1160}
1161
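/*
 * kqueue_alloc - allocate and initialize a kqueue for this process
 *
 *	Returns NULL if the kqueue or its wait queue set cannot be
 *	allocated.  Also marks the process as having used kqueues by
 *	initializing fd_knlistsize.
 */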
1162struct kqueue *
1163kqueue_alloc(struct proc *p)
1164{
1165	struct filedesc *fdp = p->p_fd;
1166	struct kqueue *kq;
1167
1168	MALLOC_ZONE(kq, struct kqueue *, sizeof (struct kqueue), M_KQUEUE,
1169	    M_WAITOK);
1170	if (kq != NULL) {
1171		wait_queue_set_t wqs;
1172
1173		wqs = wait_queue_set_alloc(SYNC_POLICY_FIFO |
1174		    SYNC_POLICY_PREPOST);
1175		if (wqs != NULL) {
1176			bzero(kq, sizeof (struct kqueue));
1177			lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
1178			TAILQ_INIT(&kq->kq_head);
1179			kq->kq_wqs = wqs;
1180			kq->kq_p = p;
		} else {
			FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
			kq = NULL;	/* don't hand back a freed kqueue */
		}
1184	}
1185
1186	if (fdp->fd_knlistsize < 0) {
1187		proc_fdlock(p);
1188		if (fdp->fd_knlistsize < 0)
1189			fdp->fd_knlistsize = 0;	/* this process has had a kq */
1190		proc_fdunlock(p);
1191	}
1192
1193	return (kq);
1194}
1195
1196/*
1197 * kqueue_dealloc - detach all knotes from a kqueue and free it
1198 *
 *	We walk each list looking for knotes referencing this
 *	kqueue.  If we find one, we try to drop it.  But if we
 *	fail to get a drop reference, the attempt blocks until
 *	the other dropper has finished.  So we can just restart
 *	the scan, safe in the assumption that the list will
 *	eventually not contain any more references to this
 *	kqueue (either we dropped them all, or someone else did).
1206 *
1207 *	Assumes no new events are being added to the kqueue.
1208 *	Nothing locked on entry or exit.
1209 */
1210void
1211kqueue_dealloc(struct kqueue *kq)
1212{
1213	struct proc *p = kq->kq_p;
1214	struct filedesc *fdp = p->p_fd;
1215	struct knote *kn;
1216	int i;
1217
1218	proc_fdlock(p);
1219	for (i = 0; i < fdp->fd_knlistsize; i++) {
1220		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1221		while (kn != NULL) {
1222			if (kq == kn->kn_kq) {
1223				kqlock(kq);
1224				proc_fdunlock(p);
1225				/* drop it ourselves or wait */
1226				if (kqlock2knotedrop(kq, kn)) {
1227					kn->kn_fop->f_detach(kn);
1228					knote_drop(kn, p);
1229				}
1230				proc_fdlock(p);
1231				/* start over at beginning of list */
1232				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1233				continue;
1234			}
1235			kn = SLIST_NEXT(kn, kn_link);
1236		}
1237	}
1238	if (fdp->fd_knhashmask != 0) {
1239		for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
1240			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1241			while (kn != NULL) {
1242				if (kq == kn->kn_kq) {
1243					kqlock(kq);
1244					proc_fdunlock(p);
1245					/* drop it ourselves or wait */
1246					if (kqlock2knotedrop(kq, kn)) {
1247						kn->kn_fop->f_detach(kn);
1248						knote_drop(kn, p);
1249					}
1250					proc_fdlock(p);
1251					/* start over at beginning of list */
1252					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1253					continue;
1254				}
1255				kn = SLIST_NEXT(kn, kn_link);
1256			}
1257		}
1258	}
1259	proc_fdunlock(p);
1260
1261	/*
1262	 * before freeing the wait queue set for this kqueue,
1263	 * make sure it is unlinked from all its containing (select) sets.
1264	 */
1265	wait_queue_unlink_all((wait_queue_t)kq->kq_wqs);
1266	wait_queue_set_free(kq->kq_wqs);
1267	lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
1268	FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
1269}
1270
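/*
 * kqueue_body - common implementation of the kqueue() system call
 *
 *	Allocates a file descriptor (using the supplied fileproc
 *	allocator), attaches a freshly allocated kqueue to it, marks the
 *	descriptor close-on-exec, and returns the fd to the caller.
 */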
1271int
1272kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
1273{
1274	struct kqueue *kq;
1275	struct fileproc *fp;
1276	int fd, error;
1277
1278	error = falloc_withalloc(p,
1279	    &fp, &fd, vfs_context_current(), fp_zalloc, cra);
1280	if (error) {
1281		return (error);
1282	}
1283
1284	kq = kqueue_alloc(p);
1285	if (kq == NULL) {
1286		fp_free(p, fd, fp);
1287		return (ENOMEM);
1288	}
1289
1290	fp->f_flag = FREAD | FWRITE;
1291	fp->f_ops = &kqueueops;
1292	fp->f_data = kq;
1293
1294	proc_fdlock(p);
1295	*fdflags(p, fd) |= UF_EXCLOSE;
1296	procfdtbl_releasefd(p, fd, NULL);
1297	fp_drop(p, fd, fp, 1);
1298	proc_fdunlock(p);
1299
1300	*retval = fd;
1301	return (error);
1302}
1303
1304int
1305kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
1306{
1307	return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
1308}
1309
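/*
 * kevent_copyin - copy in the next kevent from user space
 *
 *	Normalizes the user structure (32-bit, 64-bit, or kevent64_s
 *	layout, selected by iskev64 and the process ABI) into a
 *	kevent64_s and advances *addrp past the bytes consumed.
 */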
1310static int
1311kevent_copyin(user_addr_t *addrp, struct kevent64_s *kevp, struct proc *p,
1312    int iskev64)
1313{
1314	int advance;
1315	int error;
1316
1317	if (iskev64) {
1318		advance = sizeof (struct kevent64_s);
1319		error = copyin(*addrp, (caddr_t)kevp, advance);
1320	} else if (IS_64BIT_PROCESS(p)) {
1321		struct user64_kevent kev64;
1322		bzero(kevp, sizeof (struct kevent64_s));
1323
1324		advance = sizeof (kev64);
1325		error = copyin(*addrp, (caddr_t)&kev64, advance);
1326		if (error)
1327			return (error);
1328		kevp->ident = kev64.ident;
1329		kevp->filter = kev64.filter;
1330		kevp->flags = kev64.flags;
1331		kevp->fflags = kev64.fflags;
1332		kevp->data = kev64.data;
1333		kevp->udata = kev64.udata;
1334	} else {
1335		struct user32_kevent kev32;
1336		bzero(kevp, sizeof (struct kevent64_s));
1337
1338		advance = sizeof (kev32);
1339		error = copyin(*addrp, (caddr_t)&kev32, advance);
1340		if (error)
1341			return (error);
1342		kevp->ident = (uintptr_t)kev32.ident;
1343		kevp->filter = kev32.filter;
1344		kevp->flags = kev32.flags;
1345		kevp->fflags = kev32.fflags;
1346		kevp->data = (intptr_t)kev32.data;
1347		kevp->udata = CAST_USER_ADDR_T(kev32.udata);
1348	}
1349	if (!error)
1350		*addrp += advance;
1351	return (error);
1352}
1353
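/*
 * kevent_copyout - copy a result kevent back out to user space
 *
 *	Converts the internal kevent64_s into the layout the caller
 *	expects (mirroring kevent_copyin) and advances *addrp on success.
 */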
1354static int
1355kevent_copyout(struct kevent64_s *kevp, user_addr_t *addrp, struct proc *p,
1356    int iskev64)
1357{
1358	int advance;
1359	int error;
1360
1361	if (iskev64) {
1362		advance = sizeof (struct kevent64_s);
1363		error = copyout((caddr_t)kevp, *addrp, advance);
1364	} else if (IS_64BIT_PROCESS(p)) {
1365		struct user64_kevent kev64;
1366
1367		/*
1368		 * deal with the special case of a user-supplied
1369		 * value of (uintptr_t)-1.
1370		 */
1371		kev64.ident = (kevp->ident == (uintptr_t)-1) ?
1372		    (uint64_t)-1LL : (uint64_t)kevp->ident;
1373
1374		kev64.filter = kevp->filter;
1375		kev64.flags = kevp->flags;
1376		kev64.fflags = kevp->fflags;
1377		kev64.data = (int64_t) kevp->data;
1378		kev64.udata = kevp->udata;
1379		advance = sizeof (kev64);
1380		error = copyout((caddr_t)&kev64, *addrp, advance);
1381	} else {
1382		struct user32_kevent kev32;
1383
1384		kev32.ident = (uint32_t)kevp->ident;
1385		kev32.filter = kevp->filter;
1386		kev32.flags = kevp->flags;
1387		kev32.fflags = kevp->fflags;
1388		kev32.data = (int32_t)kevp->data;
1389		kev32.udata = kevp->udata;
1390		advance = sizeof (kev32);
1391		error = copyout((caddr_t)&kev32, *addrp, advance);
1392	}
1393	if (!error)
1394		*addrp += advance;
1395	return (error);
1396}
1397
1398/*
1399 * kevent_continue - continue a kevent syscall after blocking
1400 *
1401 *	assume we inherit a use count on the kq fileglob.
1402 */
1403
1404static void
1405kevent_continue(__unused struct kqueue *kq, void *data, int error)
1406{
1407	struct _kevent *cont_args;
1408	struct fileproc *fp;
1409	int32_t *retval;
1410	int noutputs;
1411	int fd;
1412	struct proc *p = current_proc();
1413
1414	cont_args = (struct _kevent *)data;
1415	noutputs = cont_args->eventout;
1416	retval = cont_args->retval;
1417	fd = cont_args->fd;
1418	fp = cont_args->fp;
1419
1420	fp_drop(p, fd, fp, 0);
1421
1422	/* don't restart after signals... */
1423	if (error == ERESTART)
1424		error = EINTR;
1425	else if (error == EWOULDBLOCK)
1426		error = 0;
1427	if (error == 0)
1428		*retval = noutputs;
1429	unix_syscall_return(error);
1430}
1431
1432/*
1433 * kevent - [syscall] register and wait for kernel events
1434 *
1435 */
1436int
1437kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
1438{
1439	return (kevent_internal(p,
1440	    0,
1441	    uap->changelist,
1442	    uap->nchanges,
1443	    uap->eventlist,
1444	    uap->nevents,
1445	    uap->fd,
1446	    uap->timeout,
1447	    0, /* no flags from old kevent() call */
1448	    retval));
1449}
1450
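/*
 * kevent64 - [syscall] register and wait for kernel events, using the
 * extended kevent64_s structure and caller-supplied flags
 */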
1451int
1452kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
1453{
1454	return (kevent_internal(p,
1455	    1,
1456	    uap->changelist,
1457	    uap->nchanges,
1458	    uap->eventlist,
1459	    uap->nevents,
1460	    uap->fd,
1461	    uap->timeout,
1462	    uap->flags,
1463	    retval));
1464}
1465
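/*
 * kevent_internal - common body of the kevent() and kevent64() syscalls
 *
 *	Converts any timeout to an absolute deadline, registers each
 *	change from the user's changelist (reporting per-change errors
 *	in the event list when registration fails or EV_RECEIPT is set),
 *	and then scans the kqueue for triggered events until the event
 *	list fills, the deadline passes, or an error occurs.
 */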
1466static int
1467kevent_internal(struct proc *p, int iskev64, user_addr_t changelist,
1468    int nchanges, user_addr_t ueventlist, int nevents, int fd,
1469    user_addr_t utimeout, __unused unsigned int flags,
1470    int32_t *retval)
1471{
1472	struct _kevent *cont_args;
1473	uthread_t ut;
1474	struct kqueue *kq;
1475	struct fileproc *fp;
1476	struct kevent64_s kev;
1477	int error, noutputs;
1478	struct timeval atv;
1479
1480	/* convert timeout to absolute - if we have one */
1481	if (utimeout != USER_ADDR_NULL) {
1482		struct timeval rtv;
1483		if (IS_64BIT_PROCESS(p)) {
1484			struct user64_timespec ts;
1485			error = copyin(utimeout, &ts, sizeof(ts));
1486			if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
1487				error = EINVAL;
1488			else
1489				TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1490		} else {
1491			struct user32_timespec ts;
1492			error = copyin(utimeout, &ts, sizeof(ts));
1493			TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1494		}
1495		if (error)
1496			return (error);
1497		if (itimerfix(&rtv))
1498			return (EINVAL);
1499		getmicrouptime(&atv);
1500		timevaladd(&atv, &rtv);
1501	} else {
1502		atv.tv_sec = 0;
1503		atv.tv_usec = 0;
1504	}
1505
1506	/* get a usecount for the kq itself */
1507	if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
1508		return (error);
1509
1510	/* each kq should only be used for events of one type */
1511	kqlock(kq);
1512	if (kq->kq_state & (KQ_KEV32 | KQ_KEV64)) {
1513		if (((iskev64 && (kq->kq_state & KQ_KEV32)) ||
1514			(!iskev64 && (kq->kq_state & KQ_KEV64)))) {
1515			error = EINVAL;
1516			kqunlock(kq);
1517			goto errorout;
1518		}
1519	} else {
1520		kq->kq_state |= (iskev64 ? KQ_KEV64 : KQ_KEV32);
1521	}
1522	kqunlock(kq);
1523
1524	/* register all the change requests the user provided... */
1525	noutputs = 0;
1526	while (nchanges > 0 && error == 0) {
1527		error = kevent_copyin(&changelist, &kev, p, iskev64);
1528		if (error)
1529			break;
1530
1531		kev.flags &= ~EV_SYSFLAGS;
1532		error = kevent_register(kq, &kev, p);
1533		if ((error || (kev.flags & EV_RECEIPT)) && nevents > 0) {
1534			kev.flags = EV_ERROR;
1535			kev.data = error;
1536			error = kevent_copyout(&kev, &ueventlist, p, iskev64);
1537			if (error == 0) {
1538				nevents--;
1539				noutputs++;
1540			}
1541		}
1542		nchanges--;
1543	}
1544
1545	/* store the continuation/completion data in the uthread */
1546	ut = (uthread_t)get_bsdthread_info(current_thread());
1547	cont_args = &ut->uu_kevent.ss_kevent;
1548	cont_args->fp = fp;
1549	cont_args->fd = fd;
1550	cont_args->retval = retval;
1551	cont_args->eventlist = ueventlist;
1552	cont_args->eventcount = nevents;
1553	cont_args->eventout = noutputs;
1554	cont_args->eventsize = iskev64;
1555
1556	if (nevents > 0 && noutputs == 0 && error == 0)
1557		error = kqueue_scan(kq, kevent_callback,
1558		    kevent_continue, cont_args,
1559		    &atv, p);
1560	kevent_continue(kq, cont_args, error);
1561
1562errorout:
1563	fp_drop(p, fd, fp, 0);
1564	return (error);
1565}
1566
1567
1568/*
1569 * kevent_callback - callback for each individual event
1570 *
1571 * called with nothing locked
1572 * caller holds a reference on the kqueue
1573 */
1574static int
1575kevent_callback(__unused struct kqueue *kq, struct kevent64_s *kevp,
1576    void *data)
1577{
1578	struct _kevent *cont_args;
1579	int error;
1580	int iskev64;
1581
1582	cont_args = (struct _kevent *)data;
1583	assert(cont_args->eventout < cont_args->eventcount);
1584
1585	iskev64 = cont_args->eventsize;
1586
1587	/*
1588	 * Copy out the appropriate amount of event data for this user.
1589	 */
1590	error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
1591	    iskev64);
1592
1593	/*
1594	 * If there isn't space for additional events, return
1595	 * a harmless error to stop the processing here
1596	 */
1597	if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
1598		error = EWOULDBLOCK;
1599	return (error);
1600}
1601
1602/*
1603 * kevent_description - format a description of a kevent for diagnostic output
1604 *
1605 * called with a 128-byte string buffer
1606 */
1607
1608char *
1609kevent_description(struct kevent64_s *kevp, char *s, size_t n)
1610{
1611	snprintf(s, n,
1612	    "kevent="
1613	    "{.ident=%#llx, .filter=%d, .flags=%#x, .fflags=%#x, .data=%#llx, .udata=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
1614	    kevp->ident,
1615	    kevp->filter,
1616	    kevp->flags,
1617	    kevp->fflags,
1618	    kevp->data,
1619	    kevp->udata,
1620	    kevp->ext[0],
1621	    kevp->ext[1]);
1622
1623	return (s);
1624}
1625
1626/*
1627 * kevent_register - add a new event to a kqueue
1628 *
1629 *	Creates a mapping between the event source and
1630 *	the kqueue via a knote data structure.
1631 *
 *	Because most event sources are file-descriptor
 *	related, the knote is linked off the file
 *	descriptor table for quick access.
1635 *
1636 *	called with nothing locked
1637 *	caller holds a reference on the kqueue
1638 */
1639
1640int
1641kevent_register(struct kqueue *kq, struct kevent64_s *kev,
1642    __unused struct proc *ctxp)
1643{
1644	struct proc *p = kq->kq_p;
1645	struct filedesc *fdp = p->p_fd;
1646	struct filterops *fops;
1647	struct fileproc *fp = NULL;
1648	struct knote *kn = NULL;
1649	int error = 0;
1650
1651	if (kev->filter < 0) {
1652		if (kev->filter + EVFILT_SYSCOUNT < 0)
1653			return (EINVAL);
1654		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
1655	} else {
1656		/*
1657		 * XXX
		 * the filter attach routine is responsible for ensuring that
1659		 * the identifier can be attached to it.
1660		 */
1661		printf("unknown filter: %d\n", kev->filter);
1662		return (EINVAL);
1663	}
1664
1665restart:
1666	/* this iocount needs to be dropped if it is not registered */
1667	proc_fdlock(p);
1668	if (fops->f_isfd && (error = fp_lookup(p, kev->ident, &fp, 1)) != 0) {
1669		proc_fdunlock(p);
1670		return (error);
1671	}
1672
1673	if (fops->f_isfd) {
1674		/* fd-based knotes are linked off the fd table */
1675		if (kev->ident < (u_int)fdp->fd_knlistsize) {
1676			SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
1677				if (kq == kn->kn_kq &&
1678				    kev->filter == kn->kn_filter)
1679					break;
1680		}
1681	} else {
1682		/* hash non-fd knotes here too */
1683		if (fdp->fd_knhashmask != 0) {
1684			struct klist *list;
1685
1686			list = &fdp->fd_knhash[
1687			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1688			SLIST_FOREACH(kn, list, kn_link)
1689				if (kev->ident == kn->kn_id &&
1690				    kq == kn->kn_kq &&
1691				    kev->filter == kn->kn_filter)
1692					break;
1693		}
1694	}
1695
1696	/*
1697	 * kn now contains the matching knote, or NULL if no match
1698	 */
1699	if (kn == NULL) {
1700		if ((kev->flags & (EV_ADD|EV_DELETE)) == EV_ADD) {
1701			kn = knote_alloc();
1702			if (kn == NULL) {
1703				proc_fdunlock(p);
1704				error = ENOMEM;
1705				goto done;
1706			}
1707			kn->kn_fp = fp;
1708			kn->kn_kq = kq;
1709			kn->kn_tq = &kq->kq_head;
1710			kn->kn_fop = fops;
1711			kn->kn_sfflags = kev->fflags;
1712			kn->kn_sdata = kev->data;
1713			kev->fflags = 0;
1714			kev->data = 0;
1715			kn->kn_kevent = *kev;
1716			kn->kn_inuse = 1;  /* for f_attach() */
1717			kn->kn_status = KN_ATTACHING;
1718
1719			/* before anyone can find it */
1720			if (kev->flags & EV_DISABLE)
1721				kn->kn_status |= KN_DISABLED;
1722
1723			error = knote_fdpattach(kn, fdp, p);
1724			proc_fdunlock(p);
1725
1726			if (error) {
1727				knote_free(kn);
1728				goto done;
1729			}
1730
1731			/*
1732			 * apply reference count to knote structure, and
1733			 * do not release it at the end of this routine.
1734			 */
1735			fp = NULL;
1736
1737			error = fops->f_attach(kn);
1738
1739			kqlock(kq);
1740
1741			if (error != 0) {
1742				/*
1743				 * Failed to attach correctly, so drop.
1744				 * All other possible users/droppers
1745				 * have deferred to us.
1746				 */
1747				kn->kn_status |= KN_DROPPING;
1748				kqunlock(kq);
1749				knote_drop(kn, p);
1750				goto done;
1751			} else if (kn->kn_status & KN_DROPPING) {
1752				/*
1753				 * Attach succeeded, but someone else
1754				 * deferred their drop - now we have
1755				 * to do it for them (after detaching).
1756				 */
1757				kqunlock(kq);
1758				kn->kn_fop->f_detach(kn);
1759				knote_drop(kn, p);
1760				goto done;
1761			}
1762			kn->kn_status &= ~KN_ATTACHING;
1763			kqunlock(kq);
1764		} else {
1765			proc_fdunlock(p);
1766			error = ENOENT;
1767			goto done;
1768		}
1769	} else {
1770		/* existing knote - get kqueue lock */
1771		kqlock(kq);
1772		proc_fdunlock(p);
1773
1774		if (kev->flags & EV_DELETE) {
1775			knote_dequeue(kn);
1776			kn->kn_status |= KN_DISABLED;
1777			if (kqlock2knotedrop(kq, kn)) {
1778				kn->kn_fop->f_detach(kn);
1779				knote_drop(kn, p);
1780			}
1781			goto done;
1782		}
1783
1784		/* update status flags for existing knote */
1785		if (kev->flags & EV_DISABLE) {
1786			knote_dequeue(kn);
1787			kn->kn_status |= KN_DISABLED;
1788		} else if (kev->flags & EV_ENABLE) {
1789			kn->kn_status &= ~KN_DISABLED;
1790			if (kn->kn_status & KN_ACTIVE)
1791				knote_enqueue(kn);
1792		}
1793
1794		/*
1795		 * The user may change some filter values after the
1796		 * initial EV_ADD, but doing so will not reset any
		 * filters that have already been triggered.
1798		 */
1799		kn->kn_kevent.udata = kev->udata;
1800		if (fops->f_isfd || fops->f_touch == NULL) {
1801			kn->kn_sfflags = kev->fflags;
1802			kn->kn_sdata = kev->data;
1803		}
1804
1805		/*
1806		 * If somebody is in the middle of dropping this
		 * knote - go find/insert a new one.  But we have
		 * to wait for this one to go away first.  Attaches
1809		 * running in parallel may also drop/modify the
1810		 * knote.  Wait for those to complete as well and
1811		 * then start over if we encounter one.
1812		 */
1813		if (!kqlock2knoteusewait(kq, kn)) {
1814			/* kqueue, proc_fdlock both unlocked */
1815			goto restart;
1816		}
1817
1818		/*
1819		 * Call touch routine to notify filter of changes
1820		 * in filter values.
1821		 */
1822		if (!fops->f_isfd && fops->f_touch != NULL)
1823			fops->f_touch(kn, kev, EVENT_REGISTER);
1824	}
1825	/* still have use ref on knote */
1826
1827	/*
1828	 * If the knote is not marked to always stay enqueued,
1829	 * invoke the filter routine to see if it should be
1830	 * enqueued now.
1831	 */
1832	if ((kn->kn_status & KN_STAYQUEUED) == 0 && kn->kn_fop->f_event(kn, 0)) {
1833		if (knoteuse2kqlock(kq, kn))
1834			knote_activate(kn, 1);
1835		kqunlock(kq);
1836	} else {
1837		knote_put(kn);
1838	}
1839
1840done:
1841	if (fp != NULL)
1842		fp_drop(p, kev->ident, fp, 0);
1843	return (error);
1844}
1845
1846
1847/*
1848 * knote_process - process a triggered event
1849 *
1850 *	Validate that it is really still a triggered event
1851 *	by calling the filter routines (if necessary).  Hold
1852 *	a use reference on the knote to avoid it being detached.
1853 *	If it is still considered triggered, invoke the callback
1854 *	routine provided and move it to the provided inprocess
1855 *	queue.
1856 *
1857 *	caller holds a reference on the kqueue.
1858 *	kqueue locked on entry and exit - but may be dropped
1859 */
1860static int
1861knote_process(struct knote *kn,
1862    kevent_callback_t callback,
1863    void *data,
1864    struct kqtailq *inprocessp,
1865    struct proc *p)
1866{
1867	struct kqueue *kq = kn->kn_kq;
1868	struct kevent64_s kev;
1869	int touch;
1870	int result;
1871	int error;
1872
1873	/*
1874	 * Determine the kevent state we want to return.
1875	 *
	 * Some event states need to be revalidated before returning
	 * them; for others we use the snapshot taken at the time the
	 * event was enqueued.
1879	 *
1880	 * Events with non-NULL f_touch operations must be touched.
1881	 * Triggered events must fill in kev for the callback.
1882	 *
1883	 * Convert our lock to a use-count and call the event's
1884	 * filter routine(s) to update.
1885	 */
1886	if ((kn->kn_status & KN_DISABLED) != 0) {
1887		result = 0;
1888		touch = 0;
1889	} else {
1890		int revalidate;
1891
1892		result = 1;
1893		revalidate = ((kn->kn_status & KN_STAYQUEUED) != 0 ||
1894		    (kn->kn_flags & EV_ONESHOT) == 0);
1895		touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL);
1896
1897		if (revalidate || touch) {
1898			if (revalidate)
1899				knote_deactivate(kn);
1900
1901			/* call the filter/touch routines with just a ref */
1902			if (kqlock2knoteuse(kq, kn)) {
1903				/* if we have to revalidate, call the filter */
1904				if (revalidate) {
1905					result = kn->kn_fop->f_event(kn, 0);
1906				}
1907
1908				/*
1909				 * capture the kevent data - using touch if
1910				 * specified
1911				 */
1912				if (result && touch) {
1913					kn->kn_fop->f_touch(kn, &kev,
1914					    EVENT_PROCESS);
1915				}
1916
1917				/*
1918				 * convert back to a kqlock - bail if the knote
1919				 * went away
1920				 */
1921				if (!knoteuse2kqlock(kq, kn)) {
1922					return (EJUSTRETURN);
1923				} else if (result) {
1924					/*
1925					 * if revalidated as alive, make sure
1926					 * it's active
1927					 */
1928					if (!(kn->kn_status & KN_ACTIVE)) {
1929						knote_activate(kn, 0);
1930					}
1931
1932					/*
1933					 * capture all events that occurred
1934					 * during filter
1935					 */
1936					if (!touch) {
1937						kev = kn->kn_kevent;
1938					}
1939
1940				} else if ((kn->kn_status & KN_STAYQUEUED) == 0) {
1941					/*
1942					 * was already dequeued, so just bail on
1943					 * this one
1944					 */
1945					return (EJUSTRETURN);
1946				}
1947			} else {
1948				return (EJUSTRETURN);
1949			}
1950		} else {
1951			kev = kn->kn_kevent;
1952		}
1953	}
1954
1955	/* move knote onto inprocess queue */
1956	assert(kn->kn_tq == &kq->kq_head);
1957	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1958	kn->kn_tq = inprocessp;
1959	TAILQ_INSERT_TAIL(inprocessp, kn, kn_tqe);
1960
1961	/*
1962	 * Determine how to dispatch the knote for future event handling.
	 * Not fired: just return (do not invoke the callback).
1964	 * One-shot: deactivate it.
1965	 * Clear: deactivate and clear the state.
1966	 * Dispatch: don't clear state, just deactivate it and mark it disabled.
1967	 * All others: just leave where they are.
1968	 */
1969
1970	if (result == 0) {
1971		return (EJUSTRETURN);
1972	} else if ((kn->kn_flags & EV_ONESHOT) != 0) {
1973		knote_deactivate(kn);
1974		if (kqlock2knotedrop(kq, kn)) {
1975			kn->kn_fop->f_detach(kn);
1976			knote_drop(kn, p);
1977		}
1978	} else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) {
1979		if ((kn->kn_flags & EV_DISPATCH) != 0) {
1980			/* deactivate and disable all dispatch knotes */
1981			knote_deactivate(kn);
1982			kn->kn_status |= KN_DISABLED;
1983		} else if (!touch || kn->kn_fflags == 0) {
1984			/* only deactivate if nothing since the touch */
1985			knote_deactivate(kn);
1986		}
1987		if (!touch && (kn->kn_flags & EV_CLEAR) != 0) {
1988			/* manually clear non-touch knotes */
1989			kn->kn_data = 0;
1990			kn->kn_fflags = 0;
1991		}
1992		kqunlock(kq);
1993	} else {
1994		/*
		 * leave on the inprocess queue.  We'll
		 * move all the remaining ones back to
		 * the kq queue and wake up any
1998		 * waiters when we are done.
1999		 */
2000		kqunlock(kq);
2001	}
2002
2003	/* callback to handle each event as we find it */
2004	error = (callback)(kq, &kev, data);
2005
2006	kqlock(kq);
2007	return (error);
2008}
2009
2010/*
2011 * Return 0 to indicate that processing should proceed,
2012 * -1 if there is nothing to process.
2013 *
2014 * Called with kqueue locked and returns the same way,
2015 * but may drop lock temporarily.
2016 */
2017static int
2018kqueue_begin_processing(struct kqueue *kq)
2019{
2020	for (;;) {
2021		if (kq->kq_count == 0) {
2022			return (-1);
2023		}
2024
2025		/* if someone else is processing the queue, wait */
2026		if (kq->kq_nprocess != 0) {
2027			wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
2028			    &kq->kq_nprocess, THREAD_UNINT, 0);
2029			kq->kq_state |= KQ_PROCWAIT;
2030			kqunlock(kq);
2031			thread_block(THREAD_CONTINUE_NULL);
2032			kqlock(kq);
2033		} else {
2034			kq->kq_nprocess = 1;
2035			return (0);
2036		}
2037	}
2038}
2039
2040/*
2041 * Called with kqueue lock held.
2042 */
2043static void
2044kqueue_end_processing(struct kqueue *kq)
2045{
2046	kq->kq_nprocess = 0;
2047	if (kq->kq_state & KQ_PROCWAIT) {
2048		kq->kq_state &= ~KQ_PROCWAIT;
2049		wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs,
2050		    &kq->kq_nprocess, THREAD_AWAKENED);
2051	}
2052}
2053
2054/*
2055 * kqueue_process - process the triggered events in a kqueue
2056 *
2057 *	Walk the queued knotes and validate that they are
2058 *	really still triggered events by calling the filter
2059 *	routines (if necessary).  Hold a use reference on
2060 *	the knote to avoid it being detached. For each event
2061 *	that is still considered triggered, invoke the
2062 *	callback routine provided.
2063 *
2064 *	caller holds a reference on the kqueue.
2065 *	kqueue locked on entry and exit - but may be dropped
2066 *	kqueue list locked (held for duration of call)
2067 */
2068
2069static int
2070kqueue_process(struct kqueue *kq,
2071    kevent_callback_t callback,
2072    void *data,
2073    int *countp,
2074    struct proc *p)
2075{
2076	struct kqtailq inprocess;
2077	struct knote *kn;
2078	int nevents;
2079	int error;
2080
2081	TAILQ_INIT(&inprocess);
2082
2083	if (kqueue_begin_processing(kq) == -1) {
2084		*countp = 0;
2085		/* Nothing to process */
2086		return (0);
2087	}
2088
2089	/*
2090	 * Clear any pre-posted status from previous runs, so we
2091	 * only detect events that occur during this run.
2092	 */
2093	wait_queue_sub_clearrefs(kq->kq_wqs);
2094
2095	/*
2096	 * loop through the enqueued knotes, processing each one and
2097	 * revalidating those that need it. As they are processed,
2098	 * they get moved to the inprocess queue (so the loop can end).
2099	 */
2100	error = 0;
2101	nevents = 0;
2102
2103	while (error == 0 &&
2104	    (kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
2105		error = knote_process(kn, callback, data, &inprocess, p);
2106		if (error == EJUSTRETURN)
2107			error = 0;
2108		else
2109			nevents++;
2110	}
2111
2112	/*
2113	 * With the kqueue still locked, move any knotes
2114	 * remaining on the inprocess queue back to the
2115	 * kq's queue and wake up any waiters.
2116	 */
2117	while ((kn = TAILQ_FIRST(&inprocess)) != NULL) {
2118		assert(kn->kn_tq == &inprocess);
2119		TAILQ_REMOVE(&inprocess, kn, kn_tqe);
2120		kn->kn_tq = &kq->kq_head;
2121		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2122	}
2123
2124	kqueue_end_processing(kq);
2125
2126	*countp = nevents;
2127	return (error);
2128}
2129
2130
2131static void
2132kqueue_scan_continue(void *data, wait_result_t wait_result)
2133{
2134	thread_t self = current_thread();
2135	uthread_t ut = (uthread_t)get_bsdthread_info(self);
2136	struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
2137	struct kqueue *kq = (struct kqueue *)data;
2138	int error;
2139	int count;
2140
2141	/* convert the (previous) wait_result to a proper error */
2142	switch (wait_result) {
2143	case THREAD_AWAKENED:
2144		kqlock(kq);
2145		error = kqueue_process(kq, cont_args->call, cont_args, &count,
2146		    current_proc());
2147		if (error == 0 && count == 0) {
2148			wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
2149			    KQ_EVENT, THREAD_ABORTSAFE, cont_args->deadline);
2150			kq->kq_state |= KQ_SLEEP;
2151			kqunlock(kq);
2152			thread_block_parameter(kqueue_scan_continue, kq);
2153			/* NOTREACHED */
2154		}
2155		kqunlock(kq);
2156		break;
2157	case THREAD_TIMED_OUT:
2158		error = EWOULDBLOCK;
2159		break;
2160	case THREAD_INTERRUPTED:
2161		error = EINTR;
2162		break;
2163	default:
2164		panic("%s: - invalid wait_result (%d)", __func__,
2165		    wait_result);
2166		error = 0;
2167	}
2168
2169	/* call the continuation with the results */
2170	assert(cont_args->cont != NULL);
2171	(cont_args->cont)(kq, cont_args->data, error);
2172}
2173
2174
2175/*
2176 * kqueue_scan - scan and wait for events in a kqueue
2177 *
2178 *	Process the triggered events in a kqueue.
2179 *
2180 *	If there are no events triggered arrange to
2181 *	wait for them. If the caller provided a
2182 *	continuation routine, then kevent_scan will
2183 *	also.
2184 *
2185 *	The callback routine must be valid.
2186 *	The caller must hold a use-count reference on the kq.
2187 */
2188
2189int
2190kqueue_scan(struct kqueue *kq,
2191	    kevent_callback_t callback,
2192	    kqueue_continue_t continuation,
2193	    void *data,
2194	    struct timeval *atvp,
2195	    struct proc *p)
2196{
2197	thread_continue_t cont = THREAD_CONTINUE_NULL;
2198	uint64_t deadline;
2199	int error;
2200	int first;
2201
2202	assert(callback != NULL);
2203
2204	first = 1;
2205	for (;;) {
2206		wait_result_t wait_result;
2207		int count;
2208
2209		/*
2210		 * Make a pass through the kq to find events already
2211		 * triggered.
2212		 */
2213		kqlock(kq);
2214		error = kqueue_process(kq, callback, data, &count, p);
2215		if (error || count)
2216			break; /* lock still held */
2217
2218		/* looks like we have to consider blocking */
2219		if (first) {
2220			first = 0;
2221			/* convert the timeout to a deadline once */
2222			if (atvp->tv_sec || atvp->tv_usec) {
2223				uint64_t now;
2224
2225				clock_get_uptime(&now);
2226				nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
2227							    atvp->tv_usec * (long)NSEC_PER_USEC,
2228							    &deadline);
2229				if (now >= deadline) {
2230					/* non-blocking call */
2231					error = EWOULDBLOCK;
2232					break; /* lock still held */
2233				}
2234				deadline -= now;
2235				clock_absolutetime_interval_to_deadline(deadline, &deadline);
2236			} else {
2237				deadline = 0; 	/* block forever */
2238			}
2239
2240			if (continuation) {
2241				uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
2242				struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
2243
2244				cont_args->call = callback;
2245				cont_args->cont = continuation;
2246				cont_args->deadline = deadline;
2247				cont_args->data = data;
2248				cont = kqueue_scan_continue;
2249			}
2250		}
2251
2252		/* go ahead and wait */
2253		wait_queue_assert_wait_with_leeway((wait_queue_t)kq->kq_wqs,
2254		    KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL,
2255		    deadline, 0);
2256		kq->kq_state |= KQ_SLEEP;
2257		kqunlock(kq);
2258		wait_result = thread_block_parameter(cont, kq);
2259		/* NOTREACHED if (continuation != NULL) */
2260
2261		switch (wait_result) {
2262		case THREAD_AWAKENED:
2263			continue;
2264		case THREAD_TIMED_OUT:
2265			return (EWOULDBLOCK);
2266		case THREAD_INTERRUPTED:
2267			return (EINTR);
2268		default:
2269			panic("%s: - bad wait_result (%d)", __func__,
2270			    wait_result);
2271			error = 0;
2272		}
2273	}
2274	kqunlock(kq);
2275	return (error);
2276}
2277
2278
2279/*
2280 * XXX
2281 * This could be expanded to call kqueue_scan, if desired.
2282 */
2283/*ARGSUSED*/
2284static int
2285kqueue_read(__unused struct fileproc *fp,
2286    __unused struct uio *uio,
2287    __unused int flags,
2288    __unused vfs_context_t ctx)
2289{
2290	return (ENXIO);
2291}
2292
2293/*ARGSUSED*/
2294static int
2295kqueue_write(__unused struct fileproc *fp,
2296    __unused struct uio *uio,
2297    __unused int flags,
2298    __unused vfs_context_t ctx)
2299{
2300	return (ENXIO);
2301}
2302
2303/*ARGSUSED*/
2304static int
2305kqueue_ioctl(__unused struct fileproc *fp,
2306    __unused u_long com,
2307    __unused caddr_t data,
2308    __unused vfs_context_t ctx)
2309{
2310	return (ENOTTY);
2311}
2312
2313/*ARGSUSED*/
2314static int
2315kqueue_select(struct fileproc *fp, int which, void *wql,
2316    __unused vfs_context_t ctx)
2317{
2318	struct kqueue *kq = (struct kqueue *)fp->f_data;
2319	struct knote *kn;
2320	struct kqtailq inprocessq;
2321	int retnum = 0;
2322
2323	if (which != FREAD)
2324		return (0);
2325
2326	TAILQ_INIT(&inprocessq);
2327
2328	kqlock(kq);
2329	/*
2330	 * If this is the first pass, link the wait queue associated with the
	 * kqueue onto the wait queue set for the select().  Normally we
2332	 * use selrecord() for this, but it uses the wait queue within the
2333	 * selinfo structure and we need to use the main one for the kqueue to
2334	 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
2335	 * (The select() call will unlink them when it ends).
2336	 */
2337	if (wql != NULL) {
2338		thread_t cur_act = current_thread();
2339		struct uthread * ut = get_bsdthread_info(cur_act);
2340
2341		kq->kq_state |= KQ_SEL;
2342		wait_queue_link_noalloc((wait_queue_t)kq->kq_wqs, ut->uu_wqset,
2343		    (wait_queue_link_t)wql);
2344	}
2345
2346	if (kqueue_begin_processing(kq) == -1) {
2347		kqunlock(kq);
2348		return (0);
2349	}
2350
2351	if (kq->kq_count != 0) {
2352		/*
2353		 * there is something queued - but it might be a
2354		 * KN_STAYQUEUED knote, which may or may not have
2355		 * any events pending.  So, we have to walk the
2356		 * list of knotes to see, and peek at the stay-
2357		 * queued ones to be really sure.
2358		 */
2359		while ((kn = (struct knote *)TAILQ_FIRST(&kq->kq_head)) != NULL) {
2360			if ((kn->kn_status & KN_STAYQUEUED) == 0) {
2361				retnum = 1;
2362				goto out;
2363			}
2364
2365			TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2366			TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe);
2367
2368			if (kqlock2knoteuse(kq, kn)) {
2369				unsigned peek;
2370
2371				peek = kn->kn_fop->f_peek(kn);
2372				if (knoteuse2kqlock(kq, kn)) {
2373					if (peek > 0) {
2374						retnum = 1;
2375						goto out;
2376					}
2377				} else {
2378					retnum = 0;
2379				}
2380			}
2381		}
2382	}
2383
2384out:
2385	/* Return knotes to active queue */
2386	while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) {
2387		TAILQ_REMOVE(&inprocessq, kn, kn_tqe);
2388		kn->kn_tq = &kq->kq_head;
2389		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2390	}
2391
2392	kqueue_end_processing(kq);
2393	kqunlock(kq);
2394	return (retnum);
2395}
2396
2397/*
 * kqueue_close - tear down the kqueue when its last file reference is released
2399 */
2400/*ARGSUSED*/
2401static int
2402kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
2403{
2404	struct kqueue *kq = (struct kqueue *)fg->fg_data;
2405
2406	kqueue_dealloc(kq);
2407	fg->fg_data = NULL;
2408	return (0);
2409}
2410
2411/*ARGSUSED*/
2412/*
 * The caller has taken a use-count reference on this kqueue and will donate it
2414 * to the kqueue we are being added to.  This keeps the kqueue from closing until
2415 * that relationship is torn down.
2416 */
2417static int
2418kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
2419{
2420	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
2421	struct kqueue *parentkq = kn->kn_kq;
2422
2423	if (parentkq == kq ||
2424	    kn->kn_filter != EVFILT_READ)
2425		return (1);
2426
2427	/*
2428	 * We have to avoid creating a cycle when nesting kqueues
2429	 * inside another.  Rather than trying to walk the whole
2430	 * potential DAG of nested kqueues, we just use a simple
2431	 * ceiling protocol.  When a kqueue is inserted into another,
2432	 * we check that the (future) parent is not already nested
 * into another kqueue at a lower level than the potential
2434	 * child (because it could indicate a cycle).  If that test
2435	 * passes, we just mark the nesting levels accordingly.
2436	 */
2437
2438	kqlock(parentkq);
2439	if (parentkq->kq_level > 0 &&
2440	    parentkq->kq_level < kq->kq_level)
2441	{
2442		kqunlock(parentkq);
2443		return (1);
2444	} else {
2445		/* set parent level appropriately */
2446		if (parentkq->kq_level == 0)
2447			parentkq->kq_level = 2;
2448		if (parentkq->kq_level < kq->kq_level + 1)
2449			parentkq->kq_level = kq->kq_level + 1;
2450		kqunlock(parentkq);
2451
2452		kn->kn_fop = &kqread_filtops;
2453		kqlock(kq);
2454		KNOTE_ATTACH(&kq->kq_sel.si_note, kn);
2455		/* indicate nesting in child, if needed */
2456		if (kq->kq_level == 0)
2457			kq->kq_level = 1;
2458		kqunlock(kq);
2459		return (0);
2460	}
2461}
2462
2463/*
2464 * kqueue_drain - called when kq is closed
2465 */
2466/*ARGSUSED*/
2467static int
2468kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
2469{
2470	struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
2471	kqlock(kq);
2472	kqueue_wakeup(kq, 1);
2473	kqunlock(kq);
2474	return (0);
2475}
2476
2477/*ARGSUSED*/
2478int
2479kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
2480{
2481	kqlock(kq);
2482	if (isstat64 != 0) {
2483		struct stat64 *sb64 = (struct stat64 *)ub;
2484
2485		bzero((void *)sb64, sizeof(*sb64));
2486		sb64->st_size = kq->kq_count;
2487		if (kq->kq_state & KQ_KEV64)
2488			sb64->st_blksize = sizeof(struct kevent64_s);
2489		else
2490			sb64->st_blksize = IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) : sizeof(struct user32_kevent);
2491		sb64->st_mode = S_IFIFO;
2492	} else {
2493		struct stat *sb = (struct stat *)ub;
2494
2495		bzero((void *)sb, sizeof(*sb));
2496		sb->st_size = kq->kq_count;
2497		if (kq->kq_state & KQ_KEV64)
2498			sb->st_blksize = sizeof(struct kevent64_s);
2499		else
2500			sb->st_blksize = IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) : sizeof(struct user32_kevent);
2501		sb->st_mode = S_IFIFO;
2502	}
2503	kqunlock(kq);
2504	return (0);
2505}
2506
2507/*
2508 * Called with the kqueue locked
2509 */
2510static void
2511kqueue_wakeup(struct kqueue *kq, int closed)
2512{
2513	if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0 || kq->kq_nprocess > 0) {
2514		kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
2515		wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, KQ_EVENT,
2516		    (closed) ? THREAD_INTERRUPTED : THREAD_AWAKENED);
2517	}
2518}
2519
2520void
2521klist_init(struct klist *list)
2522{
2523	SLIST_INIT(list);
2524}
2525
2526
2527/*
2528 * Query/Post each knote in the object's list
2529 *
2530 *	The object lock protects the list. It is assumed
2531 *	that the filter/event routine for the object can
2532 *	determine that the object is already locked (via
2533 *	the hint) and not deadlock itself.
2534 *
2535 *	The object lock should also hold off pending
2536 *	detach/drop operations.  But we'll prevent it here
2537 *	too - just in case.
2538 */
2539void
2540knote(struct klist *list, long hint)
2541{
2542	struct knote *kn;
2543
2544	SLIST_FOREACH(kn, list, kn_selnext) {
2545		struct kqueue *kq = kn->kn_kq;
2546
2547		kqlock(kq);
2548		if (kqlock2knoteuse(kq, kn)) {
2549			int result;
2550
2551			/* call the event with only a use count */
2552			result = kn->kn_fop->f_event(kn, hint);
2553
		/* if it's not going away and is triggered */
2555			if (knoteuse2kqlock(kq, kn) && result)
2556				knote_activate(kn, 1);
2557			/* lock held again */
2558		}
2559		kqunlock(kq);
2560	}
2561}
2562
2563/*
2564 * attach a knote to the specified list.  Return true if this is the first entry.
2565 * The list is protected by whatever lock the object it is associated with uses.
2566 */
2567int
2568knote_attach(struct klist *list, struct knote *kn)
2569{
2570	int ret = SLIST_EMPTY(list);
2571	SLIST_INSERT_HEAD(list, kn, kn_selnext);
2572	return (ret);
2573}
2574
2575/*
2576 * detach a knote from the specified list.  Return true if that was the last entry.
2577 * The list is protected by whatever lock the object it is associated with uses.
2578 */
2579int
2580knote_detach(struct klist *list, struct knote *kn)
2581{
2582	SLIST_REMOVE(list, kn, knote, kn_selnext);
2583	return (SLIST_EMPTY(list));
2584}
2585
2586/*
2587 * For a given knote, link a provided wait queue directly with the kqueue.
2588 * Wakeups will happen via recursive wait queue support.  But nothing will move
2589 * the knote to the active list at wakeup (nothing calls knote()).  Instead,
 * the knote is left permanently enqueued here.
2591 *
2592 * kqueue and knote references are held by caller.
2593 *
2594 * caller provides the wait queue link structure.
2595 */
2596int
2597knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t wql)
2598{
2599	struct kqueue *kq = kn->kn_kq;
2600	kern_return_t kr;
2601
2602	kr = wait_queue_link_noalloc(wq, kq->kq_wqs, wql);
2603	if (kr == KERN_SUCCESS) {
2604		knote_markstayqueued(kn);
2605		return (0);
2606	} else {
2607		return (EINVAL);
2608	}
2609}
2610
2611/*
2612 * Unlink the provided wait queue from the kqueue associated with a knote.
 * Also clear the stay-queued state and dequeue the knote.
2614 *
2615 * Note that the unlink may have already happened from the other side, so
2616 * ignore any failures to unlink and just remove it from the kqueue list.
2617 *
2618 * On success, caller is responsible for the link structure
2619 */
2620int
2621knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp)
2622{
2623	struct kqueue *kq = kn->kn_kq;
2624	kern_return_t kr;
2625
2626	kr = wait_queue_unlink_nofree(wq, kq->kq_wqs, wqlp);
2627	kqlock(kq);
2628	kn->kn_status &= ~KN_STAYQUEUED;
2629	knote_dequeue(kn);
2630	kqunlock(kq);
2631	return ((kr != KERN_SUCCESS) ? EINVAL : 0);
2632}
2633
2634/*
2635 * remove all knotes referencing a specified fd
2636 *
2637 * Essentially an inlined knote_remove & knote_drop
2638 * when we know for sure that the thing is a file
2639 *
2640 * Entered with the proc_fd lock already held.
2641 * It returns the same way, but may drop it temporarily.
2642 */
2643void
2644knote_fdclose(struct proc *p, int fd)
2645{
2646	struct filedesc *fdp = p->p_fd;
2647	struct klist *list;
2648	struct knote *kn;
2649
2650	list = &fdp->fd_knlist[fd];
2651	while ((kn = SLIST_FIRST(list)) != NULL) {
2652		struct kqueue *kq = kn->kn_kq;
2653
2654		if (kq->kq_p != p)
2655			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
2656			    __func__, kq->kq_p, p);
2657
2658		kqlock(kq);
2659		proc_fdunlock(p);
2660
2661		/*
2662		 * Convert the lock to a drop ref.
2663		 * If we get it, go ahead and drop it.
2664		 * Otherwise, we waited for it to
		 * be dropped by the other thread, so
2666		 * it is safe to move on in the list.
2667		 */
2668		if (kqlock2knotedrop(kq, kn)) {
2669			kn->kn_fop->f_detach(kn);
2670			knote_drop(kn, p);
2671		}
2672
2673		proc_fdlock(p);
2674
2675		/* the fd tables may have changed - start over */
2676		list = &fdp->fd_knlist[fd];
2677	}
2678}
2679
2680/* proc_fdlock held on entry (and exit) */
2681static int
2682knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p)
2683{
2684	struct klist *list = NULL;
2685
2686	if (! kn->kn_fop->f_isfd) {
2687		if (fdp->fd_knhashmask == 0)
2688			fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
2689			    &fdp->fd_knhashmask);
2690		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
2691	} else {
2692		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
2693			u_int size = 0;
2694
2695			if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
2696			    || kn->kn_id >= (uint64_t)maxfiles)
2697				return (EINVAL);
2698
2699			/* have to grow the fd_knlist */
2700			size = fdp->fd_knlistsize;
2701			while (size <= kn->kn_id)
2702				size += KQEXTENT;
2703
2704			if (size >= (UINT_MAX/sizeof(struct klist *)))
2705				return (EINVAL);
2706
2707			MALLOC(list, struct klist *,
2708			    size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
2709			if (list == NULL)
2710				return (ENOMEM);
2711
2712			bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
2713			    fdp->fd_knlistsize * sizeof(struct klist *));
2714			bzero((caddr_t)list +
2715			    fdp->fd_knlistsize * sizeof(struct klist *),
2716			    (size - fdp->fd_knlistsize) * sizeof(struct klist *));
2717			FREE(fdp->fd_knlist, M_KQUEUE);
2718			fdp->fd_knlist = list;
2719			fdp->fd_knlistsize = size;
2720		}
2721		list = &fdp->fd_knlist[kn->kn_id];
2722	}
2723	SLIST_INSERT_HEAD(list, kn, kn_link);
2724	return (0);
2725}
2726
2727
2728
2729/*
2730 * should be called at spl == 0, since we don't want to hold spl
2731 * while calling fdrop and free.
2732 */
2733static void
2734knote_drop(struct knote *kn, __unused struct proc *ctxp)
2735{
2736	struct kqueue *kq = kn->kn_kq;
2737	struct proc *p = kq->kq_p;
2738	struct filedesc *fdp = p->p_fd;
2739	struct klist *list;
2740	int needswakeup;
2741
2742	proc_fdlock(p);
2743	if (kn->kn_fop->f_isfd)
2744		list = &fdp->fd_knlist[kn->kn_id];
2745	else
2746		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
2747
2748	SLIST_REMOVE(list, kn, knote, kn_link);
2749	kqlock(kq);
2750	knote_dequeue(kn);
2751	needswakeup = (kn->kn_status & KN_USEWAIT);
2752	kqunlock(kq);
2753	proc_fdunlock(p);
2754
2755	if (needswakeup)
2756		wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_status,
2757		    THREAD_AWAKENED);
2758
2759	if (kn->kn_fop->f_isfd)
2760		fp_drop(p, kn->kn_id, kn->kn_fp, 0);
2761
2762	knote_free(kn);
2763}
2764
2765/* called with kqueue lock held */
2766static void
2767knote_activate(struct knote *kn, int propagate)
2768{
2769	struct kqueue *kq = kn->kn_kq;
2770
2771	kn->kn_status |= KN_ACTIVE;
2772	knote_enqueue(kn);
2773	kqueue_wakeup(kq, 0);
2774
2775	/* this is a real event: wake up the parent kq, too */
2776	if (propagate)
2777		KNOTE(&kq->kq_sel.si_note, 0);
2778}
2779
2780/* called with kqueue lock held */
2781static void
2782knote_deactivate(struct knote *kn)
2783{
2784	kn->kn_status &= ~KN_ACTIVE;
2785	knote_dequeue(kn);
2786}
2787
2788/* called with kqueue lock held */
2789static void
2790knote_enqueue(struct knote *kn)
2791{
2792	if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_STAYQUEUED ||
2793	    (kn->kn_status & (KN_QUEUED | KN_STAYQUEUED | KN_DISABLED)) == 0) {
2794		struct kqtailq *tq = kn->kn_tq;
2795		struct kqueue *kq = kn->kn_kq;
2796
2797		TAILQ_INSERT_TAIL(tq, kn, kn_tqe);
2798		kn->kn_status |= KN_QUEUED;
2799		kq->kq_count++;
2800	}
2801}
2802
2803/* called with kqueue lock held */
2804static void
2805knote_dequeue(struct knote *kn)
2806{
2807	struct kqueue *kq = kn->kn_kq;
2808
2809	if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_QUEUED) {
2810		struct kqtailq *tq = kn->kn_tq;
2811
2812		TAILQ_REMOVE(tq, kn, kn_tqe);
2813		kn->kn_tq = &kq->kq_head;
2814		kn->kn_status &= ~KN_QUEUED;
2815		kq->kq_count--;
2816	}
2817}
2818
2819void
2820knote_init(void)
2821{
2822	knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
2823	    8192, "knote zone");
2824
2825	/* allocate kq lock group attribute and group */
2826	kq_lck_grp_attr = lck_grp_attr_alloc_init();
2827
2828	kq_lck_grp = lck_grp_alloc_init("kqueue",  kq_lck_grp_attr);
2829
2830	/* Allocate kq lock attribute */
2831	kq_lck_attr = lck_attr_alloc_init();
2832
2833	/* Initialize the timer filter lock */
2834	lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
2835
2836#if VM_PRESSURE_EVENTS
2837	/* Initialize the vm pressure list lock */
2838	vm_pressure_init(kq_lck_grp, kq_lck_attr);
2839#endif
2840
2841#if CONFIG_MEMORYSTATUS
2842	/* Initialize the memorystatus list lock */
2843	memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
2844#endif
2845}
2846SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
2847
2848static struct knote *
2849knote_alloc(void)
2850{
2851	return ((struct knote *)zalloc(knote_zone));
2852}
2853
2854static void
2855knote_free(struct knote *kn)
2856{
2857	zfree(knote_zone, kn);
2858}
2859
2860#if SOCKETS
2861#include <sys/param.h>
2862#include <sys/socket.h>
2863#include <sys/protosw.h>
2864#include <sys/domain.h>
2865#include <sys/mbuf.h>
2866#include <sys/kern_event.h>
2867#include <sys/malloc.h>
2868#include <sys/sys_domain.h>
2869#include <sys/syslog.h>
2870
2871#ifndef ROUNDUP64
2872#define	ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
2873#endif
2874
2875#ifndef ADVANCE64
2876#define	ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
2877#endif
2878
2879static lck_grp_attr_t *kev_lck_grp_attr;
2880static lck_attr_t *kev_lck_attr;
2881static lck_grp_t *kev_lck_grp;
2882static decl_lck_rw_data(,kev_lck_data);
2883static lck_rw_t *kev_rwlock = &kev_lck_data;
2884
2885static int kev_attach(struct socket *so, int proto, struct proc *p);
2886static int kev_detach(struct socket *so);
2887static int kev_control(struct socket *so, u_long cmd, caddr_t data,
2888    struct ifnet *ifp, struct proc *p);
2889static lck_mtx_t * event_getlock(struct socket *, int);
2890static int event_lock(struct socket *, int, void *);
2891static int event_unlock(struct socket *, int, void *);
2892
2893static int event_sofreelastref(struct socket *);
2894static void kev_delete(struct kern_event_pcb *);
2895
2896static struct pr_usrreqs event_usrreqs = {
2897	.pru_attach =		kev_attach,
2898	.pru_control =		kev_control,
2899	.pru_detach =		kev_detach,
2900	.pru_soreceive =	soreceive,
2901};
2902
2903static struct protosw eventsw[] = {
2904{
2905	.pr_type =		SOCK_RAW,
2906	.pr_protocol =		SYSPROTO_EVENT,
2907	.pr_flags =		PR_ATOMIC,
2908	.pr_usrreqs =		&event_usrreqs,
2909	.pr_lock =		event_lock,
2910	.pr_unlock =		event_unlock,
2911	.pr_getlock =		event_getlock,
2912}
2913};
2914
2915__private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
2916__private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
2917
2918SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
2919	CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");
2920
2921struct kevtstat kevtstat;
2922SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
2923    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2924    kevt_getstat, "S,kevtstat", "");
2925
2926SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
2927	CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2928	kevt_pcblist, "S,xkevtpcb", "");
2929
2930static lck_mtx_t *
2931event_getlock(struct socket *so, int locktype)
2932{
2933#pragma unused(locktype)
2934	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
2935
2936	if (so->so_pcb != NULL)  {
2937		if (so->so_usecount < 0)
2938			panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
2939			    so, so->so_usecount, solockhistory_nr(so));
2940			/* NOTREACHED */
2941	} else {
2942		panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
2943		    so, solockhistory_nr(so));
2944		/* NOTREACHED */
2945	}
2946	return (&ev_pcb->evp_mtx);
2947}
2948
2949static int
2950event_lock(struct socket *so, int refcount, void *lr)
2951{
2952	void *lr_saved;
2953
2954	if (lr == NULL)
2955		lr_saved = __builtin_return_address(0);
2956	else
2957		lr_saved = lr;
2958
2959	if (so->so_pcb != NULL) {
2960		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
2961	} else  {
2962		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
2963		    so, lr_saved, solockhistory_nr(so));
2964		/* NOTREACHED */
2965	}
2966
2967	if (so->so_usecount < 0) {
2968		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
2969		    so, so->so_pcb, lr_saved, so->so_usecount,
2970		    solockhistory_nr(so));
2971		/* NOTREACHED */
2972	}
2973
2974	if (refcount)
2975		so->so_usecount++;
2976
2977	so->lock_lr[so->next_lock_lr] = lr_saved;
2978	so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
2979	return (0);
2980}
2981
2982static int
2983event_unlock(struct socket *so, int refcount, void *lr)
2984{
2985	void *lr_saved;
2986	lck_mtx_t *mutex_held;
2987
2988	if (lr == NULL)
2989		lr_saved = __builtin_return_address(0);
2990	else
2991		lr_saved = lr;
2992
2993	if (refcount)
2994		so->so_usecount--;
2995
2996	if (so->so_usecount < 0) {
2997		panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
2998		    so, so->so_usecount, solockhistory_nr(so));
2999		/* NOTREACHED */
3000	}
3001	if (so->so_pcb == NULL) {
3002		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
3003		    so, so->so_usecount, (void *)lr_saved,
3004		    solockhistory_nr(so));
3005		/* NOTREACHED */
3006	}
3007	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
3008
3009	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3010	so->unlock_lr[so->next_unlock_lr] = lr_saved;
3011	so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
3012
3013	if (so->so_usecount == 0) {
3014		VERIFY(so->so_flags & SOF_PCBCLEARING);
3015		event_sofreelastref(so);
3016	} else {
3017		lck_mtx_unlock(mutex_held);
3018	}
3019
3020	return (0);
3021}
3022
3023static int
3024event_sofreelastref(struct socket *so)
3025{
3026	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
3027
3028	lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
3029
3030	so->so_pcb = NULL;
3031
3032	/*
3033	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending a record to the receive socket buffer, since sbwakeup()
3035	 * may release the socket lock otherwise.
3036	 */
3037	so->so_rcv.sb_flags &= ~SB_UPCALL;
3038	so->so_snd.sb_flags &= ~SB_UPCALL;
3039	so->so_event = sonullevent;
3040	lck_mtx_unlock(&(ev_pcb->evp_mtx));
3041
3042	lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
3043	lck_rw_lock_exclusive(kev_rwlock);
3044	LIST_REMOVE(ev_pcb, evp_link);
3045	kevtstat.kes_pcbcount--;
3046	kevtstat.kes_gencnt++;
3047	lck_rw_done(kev_rwlock);
3048	kev_delete(ev_pcb);
3049
3050	sofreelastref(so, 1);
3051	return (0);
3052}
3053
3054static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
3055
3056static
3057struct kern_event_head kern_event_head;
3058
3059static u_int32_t static_event_id = 0;
3060
3061#define	EVPCB_ZONE_MAX		65536
3062#define	EVPCB_ZONE_NAME		"kerneventpcb"
3063static struct zone *ev_pcb_zone;
3064
3065/*
3066 * Install the protosw's for the NKE manager.  Invoked at extension load time
3067 */
3068void
3069kern_event_init(struct domain *dp)
3070{
3071	struct protosw *pr;
3072	int i;
3073
3074	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
3075	VERIFY(dp == systemdomain);
3076
3077	kev_lck_grp_attr = lck_grp_attr_alloc_init();
3078	if (kev_lck_grp_attr == NULL) {
3079		panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
3080		/* NOTREACHED */
3081	}
3082
3083	kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
3084	    kev_lck_grp_attr);
3085	if (kev_lck_grp == NULL) {
3086		panic("%s: lck_grp_alloc_init failed\n", __func__);
3087		/* NOTREACHED */
3088	}
3089
3090	kev_lck_attr = lck_attr_alloc_init();
3091	if (kev_lck_attr == NULL) {
3092		panic("%s: lck_attr_alloc_init failed\n", __func__);
3093		/* NOTREACHED */
3094	}
3095
3096	lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
3097	if (kev_rwlock == NULL) {
		panic("%s: lck_rw_init failed\n", __func__);
3099		/* NOTREACHED */
3100	}
3101
3102	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
3103		net_add_proto(pr, dp, 1);
3104
3105	ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
3106	    EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
3107	if (ev_pcb_zone == NULL) {
3108		panic("%s: failed allocating ev_pcb_zone", __func__);
3109		/* NOTREACHED */
3110	}
3111	zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
3112	zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
3113}
3114
3115static int
3116kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
3117{
3118	int error = 0;
3119	struct kern_event_pcb *ev_pcb;
3120
3121	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
3122	if (error != 0)
3123		return (error);
3124
3125	if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
3126		return (ENOBUFS);
3127	}
3128	bzero(ev_pcb, sizeof(struct kern_event_pcb));
3129	lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
3130
3131	ev_pcb->evp_socket = so;
3132	ev_pcb->evp_vendor_code_filter = 0xffffffff;
3133
3134	so->so_pcb = (caddr_t) ev_pcb;
3135	lck_rw_lock_exclusive(kev_rwlock);
3136	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
3137	kevtstat.kes_pcbcount++;
3138	kevtstat.kes_gencnt++;
3139	lck_rw_done(kev_rwlock);
3140
3141	return (error);
3142}
3143
3144static void
3145kev_delete(struct kern_event_pcb *ev_pcb)
3146{
3147	VERIFY(ev_pcb != NULL);
3148	lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
3149	zfree(ev_pcb_zone, ev_pcb);
3150}
3151
3152static int
3153kev_detach(struct socket *so)
3154{
3155	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3156
3157	if (ev_pcb != NULL) {
3158		soisdisconnected(so);
3159		so->so_flags |= SOF_PCBCLEARING;
3160	}
3161
3162	return (0);
3163}
3164
3165/*
3166 * For now, kev_vendor_code and mbuf_tags use the same
3167 * mechanism.
3168 */
3169errno_t kev_vendor_code_find(
3170	const char	*string,
3171	u_int32_t 	*out_vendor_code)
3172{
3173	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
3174		return (EINVAL);
3175	}
3176	return (net_str_id_find_internal(string, out_vendor_code,
3177	    NSI_VENDOR_CODE, 1));
3178}
3179
3180errno_t
3181kev_msg_post(struct kev_msg *event_msg)
3182{
3183	mbuf_tag_id_t min_vendor, max_vendor;
3184
3185	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
3186
3187	if (event_msg == NULL)
3188		return (EINVAL);
3189
3190	/*
3191	 * Limit third parties to posting events for registered vendor codes
3192	 * only
3193	 */
3194	if (event_msg->vendor_code < min_vendor ||
3195	    event_msg->vendor_code > max_vendor) {
3196		OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
3197		return (EINVAL);
3198	}
3199	return (kev_post_msg(event_msg));
3200}
3201
3202int
3203kev_post_msg(struct kev_msg *event_msg)
3204{
3205	struct mbuf *m, *m2;
3206	struct kern_event_pcb *ev_pcb;
3207	struct kern_event_msg *ev;
3208	char *tmp;
3209	u_int32_t total_size;
3210	int i;
3211
3212	/* Verify the message is small enough to fit in one mbuf w/o cluster */
3213	total_size = KEV_MSG_HEADER_SIZE;
3214
3215	for (i = 0; i < 5; i++) {
3216		if (event_msg->dv[i].data_length == 0)
3217			break;
3218		total_size += event_msg->dv[i].data_length;
3219	}
3220
3221	if (total_size > MLEN) {
3222		OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
3223		return (EMSGSIZE);
3224	}
3225
3226	m = m_get(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
3228		OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
3229		return (ENOMEM);
3230	}
3231	ev = mtod(m, struct kern_event_msg *);
3232	total_size = KEV_MSG_HEADER_SIZE;
3233
3234	tmp = (char *) &ev->event_data[0];
3235	for (i = 0; i < 5; i++) {
3236		if (event_msg->dv[i].data_length == 0)
3237			break;
3238
3239		total_size += event_msg->dv[i].data_length;
3240		bcopy(event_msg->dv[i].data_ptr, tmp,
3241		    event_msg->dv[i].data_length);
3242		tmp += event_msg->dv[i].data_length;
3243	}
3244
3245	ev->id = ++static_event_id;
3246	ev->total_size   = total_size;
3247	ev->vendor_code  = event_msg->vendor_code;
3248	ev->kev_class    = event_msg->kev_class;
3249	ev->kev_subclass = event_msg->kev_subclass;
3250	ev->event_code   = event_msg->event_code;
3251
3252	m->m_len = total_size;
3253	lck_rw_lock_shared(kev_rwlock);
3254	for (ev_pcb = LIST_FIRST(&kern_event_head);
3255	    ev_pcb;
3256	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
3257		lck_mtx_lock(&ev_pcb->evp_mtx);
3258		if (ev_pcb->evp_socket->so_pcb == NULL) {
3259			lck_mtx_unlock(&ev_pcb->evp_mtx);
3260			continue;
3261		}
3262		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
3263			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
3264				lck_mtx_unlock(&ev_pcb->evp_mtx);
3265				continue;
3266			}
3267
3268			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
3269				if (ev_pcb->evp_class_filter != ev->kev_class) {
3270					lck_mtx_unlock(&ev_pcb->evp_mtx);
3271					continue;
3272				}
3273
3274				if ((ev_pcb->evp_subclass_filter !=
3275				    KEV_ANY_SUBCLASS) &&
3276				    (ev_pcb->evp_subclass_filter !=
3277				    ev->kev_subclass)) {
3278					lck_mtx_unlock(&ev_pcb->evp_mtx);
3279					continue;
3280				}
3281			}
3282		}
3283
3284		m2 = m_copym(m, 0, m->m_len, M_NOWAIT);
		if (m2 == NULL) {
3286			OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
3287			m_free(m);
3288			lck_mtx_unlock(&ev_pcb->evp_mtx);
3289			lck_rw_done(kev_rwlock);
3290			return (ENOMEM);
3291		}
3292		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
3293			/*
3294			 * We use "m" for the socket stats as it would be
3295			 * unsafe to use "m2"
3296			 */
3297			so_inc_recv_data_stat(ev_pcb->evp_socket,
3298			    1, m->m_len, SO_TC_BE);
3299
3300			sorwakeup(ev_pcb->evp_socket);
3301			OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
3302		} else {
3303			OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
3304		}
3305		lck_mtx_unlock(&ev_pcb->evp_mtx);
3306	}
3307	m_free(m);
3308	lck_rw_done(kev_rwlock);
3309
3310	return (0);
3311}
3312
3313static int
3314kev_control(struct socket *so,
3315    u_long cmd,
3316    caddr_t data,
3317    __unused struct ifnet *ifp,
3318    __unused struct proc *p)
3319{
3320	struct kev_request *kev_req = (struct kev_request *) data;
3321	struct kern_event_pcb  *ev_pcb;
3322	struct kev_vendor_code *kev_vendor;
3323	u_int32_t  *id_value = (u_int32_t *) data;
3324
3325	switch (cmd) {
3326		case SIOCGKEVID:
3327			*id_value = static_event_id;
3328			break;
3329		case SIOCSKEVFILT:
3330			ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3331			ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
3332			ev_pcb->evp_class_filter = kev_req->kev_class;
3333			ev_pcb->evp_subclass_filter  = kev_req->kev_subclass;
3334			break;
3335		case SIOCGKEVFILT:
3336			ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3337			kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
3338			kev_req->kev_class   = ev_pcb->evp_class_filter;
3339			kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
3340			break;
3341		case SIOCGKEVVENDOR:
3342			kev_vendor = (struct kev_vendor_code *)data;
3343			/* Make sure string is NULL terminated */
3344			kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
3345			return (net_str_id_find_internal(kev_vendor->vendor_string,
3346			    &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
3347		default:
3348			return (ENOTSUP);
3349	}
3350
3351	return (0);
3352}
3353
3354int
3355kevt_getstat SYSCTL_HANDLER_ARGS
3356{
3357#pragma unused(oidp, arg1, arg2)
3358	int error = 0;
3359
3360	lck_rw_lock_shared(kev_rwlock);
3361
3362	if (req->newptr != USER_ADDR_NULL) {
3363		error = EPERM;
3364		goto done;
3365	}
3366	if (req->oldptr == USER_ADDR_NULL) {
3367		req->oldidx = sizeof(struct kevtstat);
3368		goto done;
3369	}
3370
3371	error = SYSCTL_OUT(req, &kevtstat,
3372	    MIN(sizeof(struct kevtstat), req->oldlen));
3373done:
3374	lck_rw_done(kev_rwlock);
3375
3376	return (error);
3377}
3378
3379__private_extern__ int
3380kevt_pcblist SYSCTL_HANDLER_ARGS
3381{
3382#pragma unused(oidp, arg1, arg2)
3383	int error = 0;
3384	int n, i;
3385	struct xsystmgen xsg;
3386	void *buf = NULL;
3387	size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
3388		ROUNDUP64(sizeof (struct xsocket_n)) +
3389		2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
3390		ROUNDUP64(sizeof (struct xsockstat_n));
3391	struct kern_event_pcb  *ev_pcb;
3392
3393	buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
3394	if (buf == NULL)
3395		return (ENOMEM);
3396
3397	lck_rw_lock_shared(kev_rwlock);
3398
3399	n = kevtstat.kes_pcbcount;
3400
3401	if (req->oldptr == USER_ADDR_NULL) {
3402		req->oldidx = (n + n/8) * item_size;
3403		goto done;
3404	}
3405	if (req->newptr != USER_ADDR_NULL) {
3406		error = EPERM;
3407		goto done;
3408	}
3409	bzero(&xsg, sizeof (xsg));
3410	xsg.xg_len = sizeof (xsg);
3411	xsg.xg_count = n;
3412	xsg.xg_gen = kevtstat.kes_gencnt;
3413	xsg.xg_sogen = so_gencnt;
3414	error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
3415	if (error) {
3416		goto done;
3417	}
3418	/*
3419	 * We are done if there is no pcb
3420	 */
3421	if (n == 0) {
3422		goto done;
3423	}
3424
3426	for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
3427	    i < n && ev_pcb != NULL;
3428	    i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
3429		struct xkevtpcb *xk = (struct xkevtpcb *)buf;
3430		struct xsocket_n *xso = (struct xsocket_n *)
3431			ADVANCE64(xk, sizeof (*xk));
3432		struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
3433			ADVANCE64(xso, sizeof (*xso));
3434		struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
3435			ADVANCE64(xsbrcv, sizeof (*xsbrcv));
3436		struct xsockstat_n *xsostats = (struct xsockstat_n *)
3437			ADVANCE64(xsbsnd, sizeof (*xsbsnd));
3438
3439		bzero(buf, item_size);
3440
3441		lck_mtx_lock(&ev_pcb->evp_mtx);
3442
3443		xk->kep_len = sizeof(struct xkevtpcb);
3444		xk->kep_kind = XSO_EVT;
3445		xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
3446		xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
3447		xk->kep_class_filter = ev_pcb->evp_class_filter;
3448		xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
3449
3450		sotoxsocket_n(ev_pcb->evp_socket, xso);
3451		sbtoxsockbuf_n(ev_pcb->evp_socket ?
3452			&ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
3453		sbtoxsockbuf_n(ev_pcb->evp_socket ?
3454			&ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
3455		sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
3456
3457		lck_mtx_unlock(&ev_pcb->evp_mtx);
3458
3459		error = SYSCTL_OUT(req, buf, item_size);
3460	}
3461
3462	if (error == 0) {
3463		/*
3464		 * Give the user an updated idea of our state.
3465		 * If the generation differs from what we told
3466		 * her before, she knows that something happened
3467		 * while we were processing this request, and it
3468		 * might be necessary to retry.
3469		 */
3470		bzero(&xsg, sizeof (xsg));
3471		xsg.xg_len = sizeof (xsg);
3472		xsg.xg_count = n;
3473		xsg.xg_gen = kevtstat.kes_gencnt;
3474		xsg.xg_sogen = so_gencnt;
3475		error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
3476		if (error) {
3477			goto done;
3478		}
3479	}
3480
3481done:
3482	lck_rw_done(kev_rwlock);
3483
3484	return (error);
3485}
3486
3487#endif /* SOCKETS */
3488
3489
3490int
3491fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
3492{
3493	struct vinfo_stat * st;
3494
3495	st = &kinfo->kq_stat;
3496
3497	st->vst_size = kq->kq_count;
3498	if (kq->kq_state & KQ_KEV64)
3499		st->vst_blksize = sizeof(struct kevent64_s);
3500	else
3501		st->vst_blksize = sizeof(struct kevent);
3502	st->vst_mode = S_IFIFO;
3503	if (kq->kq_state & KQ_SEL)
3504		kinfo->kq_state |=  PROC_KQUEUE_SELECT;
3505	if (kq->kq_state & KQ_SLEEP)
3506		kinfo->kq_state |= PROC_KQUEUE_SLEEP;
3507
3508	return (0);
3509}
3510
3511
3512void
3513knote_markstayqueued(struct knote *kn)
3514{
3515	kqlock(kn->kn_kq);
3516	kn->kn_status |= KN_STAYQUEUED;
3517	knote_enqueue(kn);
3518	kqunlock(kn->kn_kq);
3519}
3520