1/*- 2 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> 3 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org> 4 * Copyright (c) 2009 Apple, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/11/sys/kern/kern_event.c 369183 2021-01-30 23:42:30Z gbe $"); 31 32#include "opt_ktrace.h" 33#include "opt_kqueue.h" 34 35#include <sys/param.h> 36#include <sys/systm.h> 37#include <sys/capsicum.h> 38#include <sys/kernel.h> 39#include <sys/lock.h> 40#include <sys/mutex.h> 41#include <sys/rwlock.h> 42#include <sys/proc.h> 43#include <sys/malloc.h> 44#include <sys/unistd.h> 45#include <sys/file.h> 46#include <sys/filedesc.h> 47#include <sys/filio.h> 48#include <sys/fcntl.h> 49#include <sys/kthread.h> 50#include <sys/selinfo.h> 51#include <sys/queue.h> 52#include <sys/event.h> 53#include <sys/eventvar.h> 54#include <sys/poll.h> 55#include <sys/protosw.h> 56#include <sys/resourcevar.h> 57#include <sys/sigio.h> 58#include <sys/signalvar.h> 59#include <sys/socket.h> 60#include <sys/socketvar.h> 61#include <sys/stat.h> 62#include <sys/sysctl.h> 63#include <sys/sysproto.h> 64#include <sys/syscallsubr.h> 65#include <sys/taskqueue.h> 66#include <sys/uio.h> 67#include <sys/user.h> 68#ifdef KTRACE 69#include <sys/ktrace.h> 70#endif 71#include <machine/atomic.h> 72 73#include <vm/uma.h> 74 75static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); 76 77/* 78 * This lock is used if multiple kq locks are required. This possibly 79 * should be made into a per proc lock. 
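 *
 * It is taken through KQ_GLOBAL_LOCK() below, which is used when one
 * kqueue monitors another (see the DTYPE_KQUEUE case in kqueue_register(),
 * the KN_KQUEUE case in kqueue_scan(), and kqueue_task()).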
80 */ 81static struct mtx kq_global; 82MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); 83#define KQ_GLOBAL_LOCK(lck, haslck) do { \ 84 if (!haslck) \ 85 mtx_lock(lck); \ 86 haslck = 1; \ 87} while (0) 88#define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ 89 if (haslck) \ 90 mtx_unlock(lck); \ 91 haslck = 0; \ 92} while (0) 93 94TASKQUEUE_DEFINE_THREAD(kqueue_ctx); 95 96static int kevent_copyout(void *arg, struct kevent *kevp, int count); 97static int kevent_copyin(void *arg, struct kevent *kevp, int count); 98static int kqueue_register(struct kqueue *kq, struct kevent *kev, 99 struct thread *td, int waitok); 100static int kqueue_acquire(struct file *fp, struct kqueue **kqp); 101static void kqueue_release(struct kqueue *kq, int locked); 102static void kqueue_destroy(struct kqueue *kq); 103static void kqueue_drain(struct kqueue *kq, struct thread *td); 104static int kqueue_expand(struct kqueue *kq, struct filterops *fops, 105 uintptr_t ident, int waitok); 106static void kqueue_task(void *arg, int pending); 107static int kqueue_scan(struct kqueue *kq, int maxevents, 108 struct kevent_copyops *k_ops, 109 const struct timespec *timeout, 110 struct kevent *keva, struct thread *td); 111static void kqueue_wakeup(struct kqueue *kq); 112static struct filterops *kqueue_fo_find(int filt); 113static void kqueue_fo_release(int filt); 114 115static fo_ioctl_t kqueue_ioctl; 116static fo_poll_t kqueue_poll; 117static fo_kqfilter_t kqueue_kqfilter; 118static fo_stat_t kqueue_stat; 119static fo_close_t kqueue_close; 120static fo_fill_kinfo_t kqueue_fill_kinfo; 121 122static struct fileops kqueueops = { 123 .fo_read = invfo_rdwr, 124 .fo_write = invfo_rdwr, 125 .fo_truncate = invfo_truncate, 126 .fo_ioctl = kqueue_ioctl, 127 .fo_poll = kqueue_poll, 128 .fo_kqfilter = kqueue_kqfilter, 129 .fo_stat = kqueue_stat, 130 .fo_close = kqueue_close, 131 .fo_chmod = invfo_chmod, 132 .fo_chown = invfo_chown, 133 .fo_sendfile = invfo_sendfile, 134 .fo_fill_kinfo = kqueue_fill_kinfo, 135}; 136 137static int knote_attach(struct knote *kn, struct kqueue *kq); 138static void knote_drop(struct knote *kn, struct thread *td); 139static void knote_enqueue(struct knote *kn); 140static void knote_dequeue(struct knote *kn); 141static void knote_init(void); 142static struct knote *knote_alloc(int waitok); 143static void knote_free(struct knote *kn); 144 145static void filt_kqdetach(struct knote *kn); 146static int filt_kqueue(struct knote *kn, long hint); 147static int filt_procattach(struct knote *kn); 148static void filt_procdetach(struct knote *kn); 149static int filt_proc(struct knote *kn, long hint); 150static int filt_fileattach(struct knote *kn); 151static void filt_timerexpire(void *knx); 152static int filt_timerattach(struct knote *kn); 153static void filt_timerdetach(struct knote *kn); 154static void filt_timerstart(struct knote *kn, sbintime_t to); 155static void filt_timertouch(struct knote *kn, struct kevent *kev, 156 u_long type); 157static int filt_timervalidate(struct knote *kn, sbintime_t *to); 158static int filt_timer(struct knote *kn, long hint); 159static int filt_userattach(struct knote *kn); 160static void filt_userdetach(struct knote *kn); 161static int filt_user(struct knote *kn, long hint); 162static void filt_usertouch(struct knote *kn, struct kevent *kev, 163 u_long type); 164 165static struct filterops file_filtops = { 166 .f_isfd = 1, 167 .f_attach = filt_fileattach, 168}; 169static struct filterops kqread_filtops = { 170 .f_isfd = 1, 171 .f_detach = filt_kqdetach, 172 .f_event = filt_kqueue, 173}; 
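/*
 * Illustrative sketch, not part of this file: roughly what a minimal
 * filterops implementation for some other subsystem looks like.  The
 * "example_softc" structure and example_lookup() are hypothetical; only
 * the filterops/knote/knlist interfaces used below are real.  Kept under
 * "#if 0" in the same spirit as the disabled FIOASYNC code in
 * kqueue_ioctl() below.
 */
#if 0
struct example_softc {
	struct mtx	sc_mtx;		/* protects sc_note and sc_avail */
	struct knlist	sc_note;	/* set up with knlist_init_mtx() */
	int		sc_avail;	/* units of data ready for consumers */
};

/* Hypothetical helper that maps a kevent ident to a softc. */
extern struct example_softc *example_lookup(uintptr_t ident);

static int
filt_exampleattach(struct knote *kn)
{
	struct example_softc *sc;

	sc = example_lookup(kn->kn_id);
	if (sc == NULL)
		return (ENXIO);
	kn->kn_hook = sc;
	knlist_add(&sc->sc_note, kn, 0);
	return (0);
}

static void
filt_exampledetach(struct knote *kn)
{
	struct example_softc *sc = kn->kn_hook;

	knlist_remove(&sc->sc_note, kn, 0);
}

static int
filt_example(struct knote *kn, long hint)
{
	struct example_softc *sc = kn->kn_hook;

	kn->kn_data = sc->sc_avail;
	return (kn->kn_data > 0);
}

static struct filterops example_filtops = {
	.f_isfd = 0,
	.f_attach = filt_exampleattach,
	.f_detach = filt_exampledetach,
	.f_event = filt_example,
};
#endif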
174/* XXX - move to kern_proc.c? */ 175static struct filterops proc_filtops = { 176 .f_isfd = 0, 177 .f_attach = filt_procattach, 178 .f_detach = filt_procdetach, 179 .f_event = filt_proc, 180}; 181static struct filterops timer_filtops = { 182 .f_isfd = 0, 183 .f_attach = filt_timerattach, 184 .f_detach = filt_timerdetach, 185 .f_event = filt_timer, 186 .f_touch = filt_timertouch, 187}; 188static struct filterops user_filtops = { 189 .f_attach = filt_userattach, 190 .f_detach = filt_userdetach, 191 .f_event = filt_user, 192 .f_touch = filt_usertouch, 193}; 194 195static uma_zone_t knote_zone; 196static unsigned int kq_ncallouts = 0; 197static unsigned int kq_calloutmax = 4 * 1024; 198SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, 199 &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); 200 201/* XXX - ensure not KN_INFLUX?? */ 202#define KNOTE_ACTIVATE(kn, islock) do { \ 203 if ((islock)) \ 204 mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ 205 else \ 206 KQ_LOCK((kn)->kn_kq); \ 207 (kn)->kn_status |= KN_ACTIVE; \ 208 if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ 209 knote_enqueue((kn)); \ 210 if (!(islock)) \ 211 KQ_UNLOCK((kn)->kn_kq); \ 212} while(0) 213#define KQ_LOCK(kq) do { \ 214 mtx_lock(&(kq)->kq_lock); \ 215} while (0) 216#define KQ_FLUX_WAKEUP(kq) do { \ 217 if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ 218 (kq)->kq_state &= ~KQ_FLUXWAIT; \ 219 wakeup((kq)); \ 220 } \ 221} while (0) 222#define KQ_UNLOCK_FLUX(kq) do { \ 223 KQ_FLUX_WAKEUP(kq); \ 224 mtx_unlock(&(kq)->kq_lock); \ 225} while (0) 226#define KQ_UNLOCK(kq) do { \ 227 mtx_unlock(&(kq)->kq_lock); \ 228} while (0) 229#define KQ_OWNED(kq) do { \ 230 mtx_assert(&(kq)->kq_lock, MA_OWNED); \ 231} while (0) 232#define KQ_NOTOWNED(kq) do { \ 233 mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ 234} while (0) 235 236static struct knlist * 237kn_list_lock(struct knote *kn) 238{ 239 struct knlist *knl; 240 241 knl = kn->kn_knlist; 242 if (knl != NULL) 243 knl->kl_lock(knl->kl_lockarg); 244 return (knl); 245} 246 247static void 248kn_list_unlock(struct knlist *knl) 249{ 250 bool do_free; 251 252 if (knl == NULL) 253 return; 254 do_free = knl->kl_autodestroy && knlist_empty(knl); 255 knl->kl_unlock(knl->kl_lockarg); 256 if (do_free) { 257 knlist_destroy(knl); 258 free(knl, M_KQUEUE); 259 } 260} 261 262#define KNL_ASSERT_LOCK(knl, islocked) do { \ 263 if (islocked) \ 264 KNL_ASSERT_LOCKED(knl); \ 265 else \ 266 KNL_ASSERT_UNLOCKED(knl); \ 267} while (0) 268#ifdef INVARIANTS 269#define KNL_ASSERT_LOCKED(knl) do { \ 270 knl->kl_assert_locked((knl)->kl_lockarg); \ 271} while (0) 272#define KNL_ASSERT_UNLOCKED(knl) do { \ 273 knl->kl_assert_unlocked((knl)->kl_lockarg); \ 274} while (0) 275#else /* !INVARIANTS */ 276#define KNL_ASSERT_LOCKED(knl) do {} while(0) 277#define KNL_ASSERT_UNLOCKED(knl) do {} while (0) 278#endif /* INVARIANTS */ 279 280#ifndef KN_HASHSIZE 281#define KN_HASHSIZE 64 /* XXX should be tunable */ 282#endif 283 284#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) 285 286static int 287filt_nullattach(struct knote *kn) 288{ 289 290 return (ENXIO); 291}; 292 293struct filterops null_filtops = { 294 .f_isfd = 0, 295 .f_attach = filt_nullattach, 296}; 297 298/* XXX - make SYSINIT to add these, and move into respective modules. */ 299extern struct filterops sig_filtops; 300extern struct filterops fs_filtops; 301 302/* 303 * Table for for all system-defined filters. 
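 *
 * Filters are identified by negative EVFILT_* constants (EVFILT_READ is
 * -1, EVFILT_WRITE is -2, and so on), so the table is indexed by the
 * one's complement of the filter number: sysfilt_ops[~EVFILT_READ] is
 * entry 0, and valid indices run up to EVFILT_SYSCOUNT - 1.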
304 */ 305static struct mtx filterops_lock; 306MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", 307 MTX_DEF); 308static struct { 309 struct filterops *for_fop; 310 int for_nolock; 311 int for_refcnt; 312} sysfilt_ops[EVFILT_SYSCOUNT] = { 313 { &file_filtops, 1 }, /* EVFILT_READ */ 314 { &file_filtops, 1 }, /* EVFILT_WRITE */ 315 { &null_filtops }, /* EVFILT_AIO */ 316 { &file_filtops, 1 }, /* EVFILT_VNODE */ 317 { &proc_filtops, 1 }, /* EVFILT_PROC */ 318 { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ 319 { &timer_filtops, 1 }, /* EVFILT_TIMER */ 320 { &file_filtops, 1 }, /* EVFILT_PROCDESC */ 321 { &fs_filtops, 1 }, /* EVFILT_FS */ 322 { &null_filtops }, /* EVFILT_LIO */ 323 { &user_filtops, 1 }, /* EVFILT_USER */ 324 { &null_filtops }, /* EVFILT_SENDFILE */ 325}; 326 327/* 328 * Simple redirection for all cdevsw style objects to call their fo_kqfilter 329 * method. 330 */ 331static int 332filt_fileattach(struct knote *kn) 333{ 334 335 return (fo_kqfilter(kn->kn_fp, kn)); 336} 337 338/*ARGSUSED*/ 339static int 340kqueue_kqfilter(struct file *fp, struct knote *kn) 341{ 342 struct kqueue *kq = kn->kn_fp->f_data; 343 344 if (kn->kn_filter != EVFILT_READ) 345 return (EINVAL); 346 347 kn->kn_status |= KN_KQUEUE; 348 kn->kn_fop = &kqread_filtops; 349 knlist_add(&kq->kq_sel.si_note, kn, 0); 350 351 return (0); 352} 353 354static void 355filt_kqdetach(struct knote *kn) 356{ 357 struct kqueue *kq = kn->kn_fp->f_data; 358 359 knlist_remove(&kq->kq_sel.si_note, kn, 0); 360} 361 362/*ARGSUSED*/ 363static int 364filt_kqueue(struct knote *kn, long hint) 365{ 366 struct kqueue *kq = kn->kn_fp->f_data; 367 368 kn->kn_data = kq->kq_count; 369 return (kn->kn_data > 0); 370} 371 372/* XXX - move to kern_proc.c? */ 373static int 374filt_procattach(struct knote *kn) 375{ 376 struct proc *p; 377 int error; 378 bool exiting, immediate; 379 380 exiting = immediate = false; 381 p = pfind(kn->kn_id); 382 if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { 383 p = zpfind(kn->kn_id); 384 exiting = true; 385 } else if (p != NULL && (p->p_flag & P_WEXIT)) { 386 exiting = true; 387 } 388 389 if (p == NULL) 390 return (ESRCH); 391 if ((error = p_cansee(curthread, p))) { 392 PROC_UNLOCK(p); 393 return (error); 394 } 395 396 kn->kn_ptr.p_proc = p; 397 kn->kn_flags |= EV_CLEAR; /* automatically set */ 398 399 /* 400 * Internal flag indicating registration done by kernel for the 401 * purposes of getting a NOTE_CHILD notification. 402 */ 403 if (kn->kn_flags & EV_FLAG2) { 404 kn->kn_flags &= ~EV_FLAG2; 405 kn->kn_data = kn->kn_sdata; /* ppid */ 406 kn->kn_fflags = NOTE_CHILD; 407 kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); 408 immediate = true; /* Force immediate activation of child note. */ 409 } 410 /* 411 * Internal flag indicating registration done by kernel (for other than 412 * NOTE_CHILD). 413 */ 414 if (kn->kn_flags & EV_FLAG1) { 415 kn->kn_flags &= ~EV_FLAG1; 416 } 417 418 knlist_add(p->p_klist, kn, 1); 419 420 /* 421 * Immediately activate any child notes or, in the case of a zombie 422 * target process, exit notes. The latter is necessary to handle the 423 * case where the target process, e.g. a child, dies before the kevent 424 * is registered. 425 */ 426 if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) 427 KNOTE_ACTIVATE(kn, 0); 428 429 PROC_UNLOCK(p); 430 431 return (0); 432} 433 434/* 435 * The knote may be attached to a different process, which may exit, 436 * leaving nothing for the knote to be attached to. 
So when the process 437 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so 438 * it will be deleted when read out. However, as part of the knote deletion, 439 * this routine is called, so a check is needed to avoid actually performing 440 * a detach, because the original process does not exist any more. 441 */ 442/* XXX - move to kern_proc.c? */ 443static void 444filt_procdetach(struct knote *kn) 445{ 446 447 knlist_remove(kn->kn_knlist, kn, 0); 448 kn->kn_ptr.p_proc = NULL; 449} 450 451/* XXX - move to kern_proc.c? */ 452static int 453filt_proc(struct knote *kn, long hint) 454{ 455 struct proc *p; 456 u_int event; 457 458 p = kn->kn_ptr.p_proc; 459 if (p == NULL) /* already activated, from attach filter */ 460 return (0); 461 462 /* Mask off extra data. */ 463 event = (u_int)hint & NOTE_PCTRLMASK; 464 465 /* If the user is interested in this event, record it. */ 466 if (kn->kn_sfflags & event) 467 kn->kn_fflags |= event; 468 469 /* Process is gone, so flag the event as finished. */ 470 if (event == NOTE_EXIT) { 471 kn->kn_flags |= EV_EOF | EV_ONESHOT; 472 kn->kn_ptr.p_proc = NULL; 473 if (kn->kn_fflags & NOTE_EXIT) 474 kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); 475 if (kn->kn_fflags == 0) 476 kn->kn_flags |= EV_DROP; 477 return (1); 478 } 479 480 return (kn->kn_fflags != 0); 481} 482 483/* 484 * Called when the process forked. It mostly does the same as the 485 * knote(), activating all knotes registered to be activated when the 486 * process forked. Additionally, for each knote attached to the 487 * parent, check whether user wants to track the new process. If so 488 * attach a new knote to it, and immediately report an event with the 489 * child's pid. 490 */ 491void 492knote_fork(struct knlist *list, int pid) 493{ 494 struct kqueue *kq; 495 struct knote *kn; 496 struct kevent kev; 497 int error; 498 499 if (list == NULL) 500 return; 501 502 memset(&kev, 0, sizeof(kev)); 503 list->kl_lock(list->kl_lockarg); 504 SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { 505 kq = kn->kn_kq; 506 KQ_LOCK(kq); 507 if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) { 508 KQ_UNLOCK(kq); 509 continue; 510 } 511 512 /* 513 * The same as knote(), activate the event. 514 */ 515 if ((kn->kn_sfflags & NOTE_TRACK) == 0) { 516 kn->kn_status |= KN_HASKQLOCK; 517 if (kn->kn_fop->f_event(kn, NOTE_FORK)) 518 KNOTE_ACTIVATE(kn, 1); 519 kn->kn_status &= ~KN_HASKQLOCK; 520 KQ_UNLOCK(kq); 521 continue; 522 } 523 524 /* 525 * The NOTE_TRACK case. In addition to the activation 526 * of the event, we need to register new events to 527 * track the child. Drop the locks in preparation for 528 * the call to kqueue_register(). 529 */ 530 kn->kn_status |= KN_INFLUX; 531 KQ_UNLOCK(kq); 532 list->kl_unlock(list->kl_lockarg); 533 534 /* 535 * Activate existing knote and register tracking knotes with 536 * new process. 537 * 538 * First register a knote to get just the child notice. This 539 * must be a separate note from a potential NOTE_EXIT 540 * notification since both NOTE_CHILD and NOTE_EXIT are defined 541 * to use the data field (in conflicting ways). 
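 *
 * (For reference, this path is reached when a consumer asked to follow
 * forks with something along the lines of
 *	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_TRACK | NOTE_FORK | NOTE_EXIT, 0, NULL);
 *	kevent(kqfd, &kev, 1, NULL, 0, NULL);
 * the child then reports NOTE_CHILD with the parent's pid in the data
 * field.)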
542 */ 543 kev.ident = pid; 544 kev.filter = kn->kn_filter; 545 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | 546 EV_FLAG2; 547 kev.fflags = kn->kn_sfflags; 548 kev.data = kn->kn_id; /* parent */ 549 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 550 error = kqueue_register(kq, &kev, NULL, 0); 551 if (error) 552 kn->kn_fflags |= NOTE_TRACKERR; 553 554 /* 555 * Then register another knote to track other potential events 556 * from the new process. 557 */ 558 kev.ident = pid; 559 kev.filter = kn->kn_filter; 560 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; 561 kev.fflags = kn->kn_sfflags; 562 kev.data = kn->kn_id; /* parent */ 563 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 564 error = kqueue_register(kq, &kev, NULL, 0); 565 if (error) 566 kn->kn_fflags |= NOTE_TRACKERR; 567 if (kn->kn_fop->f_event(kn, NOTE_FORK)) 568 KNOTE_ACTIVATE(kn, 0); 569 list->kl_lock(list->kl_lockarg); 570 KQ_LOCK(kq); 571 kn->kn_status &= ~KN_INFLUX; 572 KQ_UNLOCK_FLUX(kq); 573 } 574 list->kl_unlock(list->kl_lockarg); 575} 576 577/* 578 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the 579 * interval timer support code. 580 */ 581 582#define NOTE_TIMER_PRECMASK \ 583 (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) 584 585static sbintime_t 586timer2sbintime(intptr_t data, int flags) 587{ 588 int64_t secs; 589 590 /* 591 * Macros for converting to the fractional second portion of an 592 * sbintime_t using 64bit multiplication to improve precision. 593 */ 594#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) 595#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) 596#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) 597 switch (flags & NOTE_TIMER_PRECMASK) { 598 case NOTE_SECONDS: 599#ifdef __LP64__ 600 if (data > (SBT_MAX / SBT_1S)) 601 return (SBT_MAX); 602#endif 603 return ((sbintime_t)data << 32); 604 case NOTE_MSECONDS: /* FALLTHROUGH */ 605 case 0: 606 if (data >= 1000) { 607 secs = data / 1000; 608#ifdef __LP64__ 609 if (secs > (SBT_MAX / SBT_1S)) 610 return (SBT_MAX); 611#endif 612 return (secs << 32 | MS_TO_SBT(data % 1000)); 613 } 614 return (MS_TO_SBT(data)); 615 case NOTE_USECONDS: 616 if (data >= 1000000) { 617 secs = data / 1000000; 618#ifdef __LP64__ 619 if (secs > (SBT_MAX / SBT_1S)) 620 return (SBT_MAX); 621#endif 622 return (secs << 32 | US_TO_SBT(data % 1000000)); 623 } 624 return (US_TO_SBT(data)); 625 case NOTE_NSECONDS: 626 if (data >= 1000000000) { 627 secs = data / 1000000000; 628#ifdef __LP64__ 629 if (secs > (SBT_MAX / SBT_1S)) 630 return (SBT_MAX); 631#endif 632 return (secs << 32 | NS_TO_SBT(data % 1000000000)); 633 } 634 return (NS_TO_SBT(data)); 635 default: 636 break; 637 } 638 return (-1); 639} 640 641struct kq_timer_cb_data { 642 struct callout c; 643 sbintime_t next; /* next timer event fires at */ 644 sbintime_t to; /* precalculated timer period */ 645}; 646 647static void 648filt_timerexpire(void *knx) 649{ 650 struct knote *kn; 651 struct kq_timer_cb_data *kc; 652 653 kn = knx; 654 kn->kn_data++; 655 KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ 656 657 if ((kn->kn_flags & EV_ONESHOT) != 0) 658 return; 659 660 kc = kn->kn_ptr.p_v; 661 kc->next += kc->to; 662 callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, 663 PCPU_GET(cpuid), C_ABSOLUTE); 664} 665 666/* 667 * data contains amount of time to sleep 668 */ 669static int 670filt_timervalidate(struct knote *kn, sbintime_t *to) 671{ 672 673 if (kn->kn_sdata < 0) 674 return (EINVAL); 675 if 
(kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) 676 kn->kn_sdata = 1; 677 /* 678 * The only fflags values supported are the timer unit 679 * (precision) and the absolute time indicator. 680 */ 681 if ((kn->kn_sfflags & ~NOTE_TIMER_PRECMASK) != 0) 682 return (EINVAL); 683 684 *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); 685 if (*to < 0) 686 return (EINVAL); 687 return (0); 688} 689 690static int 691filt_timerattach(struct knote *kn) 692{ 693 struct kq_timer_cb_data *kc; 694 sbintime_t to; 695 unsigned int ncallouts; 696 int error; 697 698 error = filt_timervalidate(kn, &to); 699 if (error != 0) 700 return (error); 701 702 do { 703 ncallouts = kq_ncallouts; 704 if (ncallouts >= kq_calloutmax) 705 return (ENOMEM); 706 } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); 707 708 kn->kn_flags |= EV_CLEAR; /* automatically set */ 709 kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ 710 kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); 711 callout_init(&kc->c, 1); 712 filt_timerstart(kn, to); 713 714 return (0); 715} 716 717static void 718filt_timerstart(struct knote *kn, sbintime_t to) 719{ 720 struct kq_timer_cb_data *kc; 721 722 kc = kn->kn_ptr.p_v; 723 kc->next = to + sbinuptime(); 724 kc->to = to; 725 callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, 726 PCPU_GET(cpuid), C_ABSOLUTE); 727} 728 729static void 730filt_timerdetach(struct knote *kn) 731{ 732 struct kq_timer_cb_data *kc; 733 unsigned int old; 734 735 kc = kn->kn_ptr.p_v; 736 callout_drain(&kc->c); 737 free(kc, M_KQUEUE); 738 old = atomic_fetchadd_int(&kq_ncallouts, -1); 739 KASSERT(old > 0, ("Number of callouts cannot become negative")); 740 kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ 741} 742 743static void 744filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) 745{ 746 struct kq_timer_cb_data *kc; 747 struct kqueue *kq; 748 sbintime_t to; 749 int error; 750 751 switch (type) { 752 case EVENT_REGISTER: 753 /* Handle re-added timers that update data/fflags */ 754 if (kev->flags & EV_ADD) { 755 kc = kn->kn_ptr.p_v; 756 757 /* Drain any existing callout. */ 758 callout_drain(&kc->c); 759 760 /* Throw away any existing undelivered record 761 * of the timer expiration. This is done under 762 * the presumption that if a process is 763 * re-adding this timer with new parameters, 764 * it is no longer interested in what may have 765 * happened under the old parameters. If it is 766 * interested, it can wait for the expiration, 767 * delete the old timer definition, and then 768 * add the new one. 
769 * 770 * This has to be done while the kq is locked: 771 * - if enqueued, dequeue 772 * - make it no longer active 773 * - clear the count of expiration events 774 */ 775 kq = kn->kn_kq; 776 KQ_LOCK(kq); 777 if (kn->kn_status & KN_QUEUED) 778 knote_dequeue(kn); 779 780 kn->kn_status &= ~KN_ACTIVE; 781 kn->kn_data = 0; 782 KQ_UNLOCK(kq); 783 784 /* Reschedule timer based on new data/fflags */ 785 kn->kn_sfflags = kev->fflags; 786 kn->kn_sdata = kev->data; 787 error = filt_timervalidate(kn, &to); 788 if (error != 0) { 789 kn->kn_flags |= EV_ERROR; 790 kn->kn_data = error; 791 } else 792 filt_timerstart(kn, to); 793 } 794 break; 795 796 case EVENT_PROCESS: 797 *kev = kn->kn_kevent; 798 if (kn->kn_flags & EV_CLEAR) { 799 kn->kn_data = 0; 800 kn->kn_fflags = 0; 801 } 802 break; 803 804 default: 805 panic("filt_timertouch() - invalid type (%ld)", type); 806 break; 807 } 808} 809 810static int 811filt_timer(struct knote *kn, long hint) 812{ 813 814 return (kn->kn_data != 0); 815} 816 817static int 818filt_userattach(struct knote *kn) 819{ 820 821 /* 822 * EVFILT_USER knotes are not attached to anything in the kernel. 823 */ 824 kn->kn_hook = NULL; 825 if (kn->kn_fflags & NOTE_TRIGGER) 826 kn->kn_hookid = 1; 827 else 828 kn->kn_hookid = 0; 829 return (0); 830} 831 832static void 833filt_userdetach(__unused struct knote *kn) 834{ 835 836 /* 837 * EVFILT_USER knotes are not attached to anything in the kernel. 838 */ 839} 840 841static int 842filt_user(struct knote *kn, __unused long hint) 843{ 844 845 return (kn->kn_hookid); 846} 847 848static void 849filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) 850{ 851 u_int ffctrl; 852 853 switch (type) { 854 case EVENT_REGISTER: 855 if (kev->fflags & NOTE_TRIGGER) 856 kn->kn_hookid = 1; 857 858 ffctrl = kev->fflags & NOTE_FFCTRLMASK; 859 kev->fflags &= NOTE_FFLAGSMASK; 860 switch (ffctrl) { 861 case NOTE_FFNOP: 862 break; 863 864 case NOTE_FFAND: 865 kn->kn_sfflags &= kev->fflags; 866 break; 867 868 case NOTE_FFOR: 869 kn->kn_sfflags |= kev->fflags; 870 break; 871 872 case NOTE_FFCOPY: 873 kn->kn_sfflags = kev->fflags; 874 break; 875 876 default: 877 /* XXX Return error? 
*/ 878 break; 879 } 880 kn->kn_sdata = kev->data; 881 if (kev->flags & EV_CLEAR) { 882 kn->kn_hookid = 0; 883 kn->kn_data = 0; 884 kn->kn_fflags = 0; 885 } 886 break; 887 888 case EVENT_PROCESS: 889 *kev = kn->kn_kevent; 890 kev->fflags = kn->kn_sfflags; 891 kev->data = kn->kn_sdata; 892 if (kn->kn_flags & EV_CLEAR) { 893 kn->kn_hookid = 0; 894 kn->kn_data = 0; 895 kn->kn_fflags = 0; 896 } 897 break; 898 899 default: 900 panic("filt_usertouch() - invalid type (%ld)", type); 901 break; 902 } 903} 904 905int 906sys_kqueue(struct thread *td, struct kqueue_args *uap) 907{ 908 909 return (kern_kqueue(td, 0, NULL)); 910} 911 912static void 913kqueue_init(struct kqueue *kq) 914{ 915 916 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); 917 TAILQ_INIT(&kq->kq_head); 918 knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); 919 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); 920} 921 922int 923kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) 924{ 925 struct filedesc *fdp; 926 struct kqueue *kq; 927 struct file *fp; 928 struct ucred *cred; 929 int fd, error; 930 931 fdp = td->td_proc->p_fd; 932 cred = td->td_ucred; 933 if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) 934 return (ENOMEM); 935 936 error = falloc_caps(td, &fp, &fd, flags, fcaps); 937 if (error != 0) { 938 chgkqcnt(cred->cr_ruidinfo, -1, 0); 939 return (error); 940 } 941 942 /* An extra reference on `fp' has been held for us by falloc(). */ 943 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); 944 kqueue_init(kq); 945 kq->kq_fdp = fdp; 946 kq->kq_cred = crhold(cred); 947 948 FILEDESC_XLOCK(fdp); 949 TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); 950 FILEDESC_XUNLOCK(fdp); 951 952 finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); 953 fdrop(fp, td); 954 955 td->td_retval[0] = fd; 956 return (0); 957} 958 959#ifndef _SYS_SYSPROTO_H_ 960struct kevent_args { 961 int fd; 962 const struct kevent *changelist; 963 int nchanges; 964 struct kevent *eventlist; 965 int nevents; 966 const struct timespec *timeout; 967}; 968#endif 969int 970sys_kevent(struct thread *td, struct kevent_args *uap) 971{ 972 struct timespec ts, *tsp; 973 struct kevent_copyops k_ops = { 974 .arg = uap, 975 .k_copyout = kevent_copyout, 976 .k_copyin = kevent_copyin, 977 }; 978#ifdef KTRACE 979 struct kevent *eventlist = uap->eventlist; 980#endif 981 int error; 982 983 if (uap->timeout != NULL) { 984 error = copyin(uap->timeout, &ts, sizeof(ts)); 985 if (error) 986 return (error); 987 tsp = &ts; 988 } else 989 tsp = NULL; 990 991#ifdef KTRACE 992 if (KTRPOINT(td, KTR_STRUCT_ARRAY)) 993 ktrstructarray("kevent", UIO_USERSPACE, uap->changelist, 994 uap->nchanges, sizeof(struct kevent)); 995#endif 996 997 error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, 998 &k_ops, tsp); 999 1000#ifdef KTRACE 1001 if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) 1002 ktrstructarray("kevent", UIO_USERSPACE, eventlist, 1003 td->td_retval[0], sizeof(struct kevent)); 1004#endif 1005 1006 return (error); 1007} 1008 1009/* 1010 * Copy 'count' items into the destination list pointed to by uap->eventlist. 
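 *
 * (These helpers serve the native kevent(2) path; a typical userland
 * caller looks something like
 *	struct kevent ch[2], ev;
 *	EV_SET(&ch[0], fd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	EV_SET(&ch[1], 1, EVFILT_TIMER, EV_ADD, NOTE_MSECONDS, 500, NULL);
 *	n = kevent(kqfd, ch, 2, &ev, 1, NULL);
 * with changes copied in and triggered events copied back out in
 * KQ_NEVENTS-sized batches.)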
1011 */ 1012static int 1013kevent_copyout(void *arg, struct kevent *kevp, int count) 1014{ 1015 struct kevent_args *uap; 1016 int error; 1017 1018 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1019 uap = (struct kevent_args *)arg; 1020 1021 error = copyout(kevp, uap->eventlist, count * sizeof *kevp); 1022 if (error == 0) 1023 uap->eventlist += count; 1024 return (error); 1025} 1026 1027/* 1028 * Copy 'count' items from the list pointed to by uap->changelist. 1029 */ 1030static int 1031kevent_copyin(void *arg, struct kevent *kevp, int count) 1032{ 1033 struct kevent_args *uap; 1034 int error; 1035 1036 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1037 uap = (struct kevent_args *)arg; 1038 1039 error = copyin(uap->changelist, kevp, count * sizeof *kevp); 1040 if (error == 0) 1041 uap->changelist += count; 1042 return (error); 1043} 1044 1045int 1046kern_kevent(struct thread *td, int fd, int nchanges, int nevents, 1047 struct kevent_copyops *k_ops, const struct timespec *timeout) 1048{ 1049 cap_rights_t rights; 1050 struct file *fp; 1051 int error; 1052 1053 cap_rights_init(&rights); 1054 if (nchanges > 0) 1055 cap_rights_set(&rights, CAP_KQUEUE_CHANGE); 1056 if (nevents > 0) 1057 cap_rights_set(&rights, CAP_KQUEUE_EVENT); 1058 error = fget(td, fd, &rights, &fp); 1059 if (error != 0) 1060 return (error); 1061 1062 error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); 1063 fdrop(fp, td); 1064 1065 return (error); 1066} 1067 1068static int 1069kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, 1070 struct kevent_copyops *k_ops, const struct timespec *timeout) 1071{ 1072 struct kevent keva[KQ_NEVENTS]; 1073 struct kevent *kevp, *changes; 1074 int i, n, nerrors, error; 1075 1076 nerrors = 0; 1077 while (nchanges > 0) { 1078 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; 1079 error = k_ops->k_copyin(k_ops->arg, keva, n); 1080 if (error) 1081 return (error); 1082 changes = keva; 1083 for (i = 0; i < n; i++) { 1084 kevp = &changes[i]; 1085 if (!kevp->filter) 1086 continue; 1087 kevp->flags &= ~EV_SYSFLAGS; 1088 error = kqueue_register(kq, kevp, td, 1); 1089 if (error || (kevp->flags & EV_RECEIPT)) { 1090 if (nevents == 0) 1091 return (error); 1092 kevp->flags = EV_ERROR; 1093 kevp->data = error; 1094 (void)k_ops->k_copyout(k_ops->arg, kevp, 1); 1095 nevents--; 1096 nerrors++; 1097 } 1098 } 1099 nchanges -= n; 1100 } 1101 if (nerrors) { 1102 td->td_retval[0] = nerrors; 1103 return (0); 1104 } 1105 1106 return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); 1107} 1108 1109int 1110kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, 1111 struct kevent_copyops *k_ops, const struct timespec *timeout) 1112{ 1113 struct kqueue *kq; 1114 int error; 1115 1116 error = kqueue_acquire(fp, &kq); 1117 if (error != 0) 1118 return (error); 1119 error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); 1120 kqueue_release(kq, 0); 1121 return (error); 1122} 1123 1124/* 1125 * Performs a kevent() call on a temporarily created kqueue. This can be 1126 * used to perform one-shot polling, similar to poll() and select(). 
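 *
 * (A sketch of an in-kernel caller; "kev_buf" and the helper names are
 * hypothetical.  The copyops move kevents to and from kernel memory
 * instead of using copyin()/copyout(), e.g.
 *	static int
 *	kev_copyin_kern(void *arg, struct kevent *kevp, int count)
 *	{
 *		memcpy(kevp, arg, count * sizeof(*kevp));
 *		return (0);
 *	}
 * with a mirror-image kev_copyout_kern(), and then
 *	struct kevent_copyops k_ops = {
 *		.arg = kev_buf,
 *		.k_copyout = kev_copyout_kern,
 *		.k_copyin = kev_copyin_kern,
 *	};
 *	error = kern_kevent_anonymous(curthread, 1, &k_ops);
 * The temporary kqueue lives on this function's stack and is drained and
 * destroyed before the call returns.)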
1127 */ 1128int 1129kern_kevent_anonymous(struct thread *td, int nevents, 1130 struct kevent_copyops *k_ops) 1131{ 1132 struct kqueue kq = {}; 1133 int error; 1134 1135 kqueue_init(&kq); 1136 kq.kq_refcnt = 1; 1137 error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); 1138 kqueue_drain(&kq, td); 1139 kqueue_destroy(&kq); 1140 return (error); 1141} 1142 1143int 1144kqueue_add_filteropts(int filt, struct filterops *filtops) 1145{ 1146 int error; 1147 1148 error = 0; 1149 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { 1150 printf( 1151"trying to add a filterop that is out of range: %d is beyond %d\n", 1152 ~filt, EVFILT_SYSCOUNT); 1153 return EINVAL; 1154 } 1155 mtx_lock(&filterops_lock); 1156 if (sysfilt_ops[~filt].for_fop != &null_filtops && 1157 sysfilt_ops[~filt].for_fop != NULL) 1158 error = EEXIST; 1159 else { 1160 sysfilt_ops[~filt].for_fop = filtops; 1161 sysfilt_ops[~filt].for_refcnt = 0; 1162 } 1163 mtx_unlock(&filterops_lock); 1164 1165 return (error); 1166} 1167 1168int 1169kqueue_del_filteropts(int filt) 1170{ 1171 int error; 1172 1173 error = 0; 1174 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1175 return EINVAL; 1176 1177 mtx_lock(&filterops_lock); 1178 if (sysfilt_ops[~filt].for_fop == &null_filtops || 1179 sysfilt_ops[~filt].for_fop == NULL) 1180 error = EINVAL; 1181 else if (sysfilt_ops[~filt].for_refcnt != 0) 1182 error = EBUSY; 1183 else { 1184 sysfilt_ops[~filt].for_fop = &null_filtops; 1185 sysfilt_ops[~filt].for_refcnt = 0; 1186 } 1187 mtx_unlock(&filterops_lock); 1188 1189 return error; 1190} 1191 1192static struct filterops * 1193kqueue_fo_find(int filt) 1194{ 1195 1196 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1197 return NULL; 1198 1199 if (sysfilt_ops[~filt].for_nolock) 1200 return sysfilt_ops[~filt].for_fop; 1201 1202 mtx_lock(&filterops_lock); 1203 sysfilt_ops[~filt].for_refcnt++; 1204 if (sysfilt_ops[~filt].for_fop == NULL) 1205 sysfilt_ops[~filt].for_fop = &null_filtops; 1206 mtx_unlock(&filterops_lock); 1207 1208 return sysfilt_ops[~filt].for_fop; 1209} 1210 1211static void 1212kqueue_fo_release(int filt) 1213{ 1214 1215 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1216 return; 1217 1218 if (sysfilt_ops[~filt].for_nolock) 1219 return; 1220 1221 mtx_lock(&filterops_lock); 1222 KASSERT(sysfilt_ops[~filt].for_refcnt > 0, 1223 ("filter object refcount not valid on release")); 1224 sysfilt_ops[~filt].for_refcnt--; 1225 mtx_unlock(&filterops_lock); 1226} 1227 1228/* 1229 * A ref to kq (obtained via kqueue_acquire) must be held. waitok will 1230 * influence if memory allocation should wait. Make sure it is 0 if you 1231 * hold any mutexes. 1232 */ 1233static int 1234kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) 1235{ 1236 struct filterops *fops; 1237 struct file *fp; 1238 struct knote *kn, *tkn; 1239 struct knlist *knl; 1240 cap_rights_t rights; 1241 int error, filt, event; 1242 int haskqglobal, filedesc_unlock; 1243 1244 if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) 1245 return (EINVAL); 1246 1247 fp = NULL; 1248 kn = NULL; 1249 knl = NULL; 1250 error = 0; 1251 haskqglobal = 0; 1252 filedesc_unlock = 0; 1253 1254 filt = kev->filter; 1255 fops = kqueue_fo_find(filt); 1256 if (fops == NULL) 1257 return EINVAL; 1258 1259 if (kev->flags & EV_ADD) { 1260 /* 1261 * Prevent waiting with locks. Non-sleepable 1262 * allocation failures are handled in the loop, only 1263 * if the spare knote appears to be actually required. 
1264 */ 1265 tkn = knote_alloc(waitok); 1266 } else { 1267 tkn = NULL; 1268 } 1269 1270findkn: 1271 if (fops->f_isfd) { 1272 KASSERT(td != NULL, ("td is NULL")); 1273 if (kev->ident > INT_MAX) 1274 error = EBADF; 1275 else 1276 error = fget(td, kev->ident, 1277 cap_rights_init(&rights, CAP_EVENT), &fp); 1278 if (error) 1279 goto done; 1280 1281 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, 1282 kev->ident, 0) != 0) { 1283 /* try again */ 1284 fdrop(fp, td); 1285 fp = NULL; 1286 error = kqueue_expand(kq, fops, kev->ident, waitok); 1287 if (error) 1288 goto done; 1289 goto findkn; 1290 } 1291 1292 if (fp->f_type == DTYPE_KQUEUE) { 1293 /* 1294 * If we add some intelligence about what we are doing, 1295 * we should be able to support events on ourselves. 1296 * We need to know when we are doing this to prevent 1297 * getting both the knlist lock and the kq lock since 1298 * they are the same thing. 1299 */ 1300 if (fp->f_data == kq) { 1301 error = EINVAL; 1302 goto done; 1303 } 1304 1305 /* 1306 * Pre-lock the filedesc before the global 1307 * lock mutex, see the comment in 1308 * kqueue_close(). 1309 */ 1310 FILEDESC_XLOCK(td->td_proc->p_fd); 1311 filedesc_unlock = 1; 1312 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1313 } 1314 1315 KQ_LOCK(kq); 1316 if (kev->ident < kq->kq_knlistsize) { 1317 SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) 1318 if (kev->filter == kn->kn_filter) 1319 break; 1320 } 1321 } else { 1322 if ((kev->flags & EV_ADD) == EV_ADD) { 1323 error = kqueue_expand(kq, fops, kev->ident, waitok); 1324 if (error != 0) 1325 goto done; 1326 } 1327 1328 KQ_LOCK(kq); 1329 1330 /* 1331 * If possible, find an existing knote to use for this kevent. 1332 */ 1333 if (kev->filter == EVFILT_PROC && 1334 (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { 1335 /* This is an internal creation of a process tracking 1336 * note. Don't attempt to coalesce this with an 1337 * existing note. 1338 */ 1339 ; 1340 } else if (kq->kq_knhashmask != 0) { 1341 struct klist *list; 1342 1343 list = &kq->kq_knhash[ 1344 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; 1345 SLIST_FOREACH(kn, list, kn_link) 1346 if (kev->ident == kn->kn_id && 1347 kev->filter == kn->kn_filter) 1348 break; 1349 } 1350 } 1351 1352 /* knote is in the process of changing, wait for it to stabilize. */ 1353 if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) { 1354 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1355 if (filedesc_unlock) { 1356 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1357 filedesc_unlock = 0; 1358 } 1359 kq->kq_state |= KQ_FLUXWAIT; 1360 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); 1361 if (fp != NULL) { 1362 fdrop(fp, td); 1363 fp = NULL; 1364 } 1365 goto findkn; 1366 } 1367 1368 /* 1369 * kn now contains the matching knote, or NULL if no match 1370 */ 1371 if (kn == NULL) { 1372 if (kev->flags & EV_ADD) { 1373 kn = tkn; 1374 tkn = NULL; 1375 if (kn == NULL) { 1376 KQ_UNLOCK(kq); 1377 error = ENOMEM; 1378 goto done; 1379 } 1380 kn->kn_fp = fp; 1381 kn->kn_kq = kq; 1382 kn->kn_fop = fops; 1383 /* 1384 * apply reference counts to knote structure, and 1385 * do not release it at the end of this routine. 
1386 */ 1387 fops = NULL; 1388 fp = NULL; 1389 1390 kn->kn_sfflags = kev->fflags; 1391 kn->kn_sdata = kev->data; 1392 kev->fflags = 0; 1393 kev->data = 0; 1394 kn->kn_kevent = *kev; 1395 kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | 1396 EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); 1397 kn->kn_status = KN_INFLUX|KN_DETACHED; 1398 if ((kev->flags & EV_DISABLE) != 0) 1399 kn->kn_status |= KN_DISABLED; 1400 1401 error = knote_attach(kn, kq); 1402 KQ_UNLOCK(kq); 1403 if (error != 0) { 1404 tkn = kn; 1405 goto done; 1406 } 1407 1408 if ((error = kn->kn_fop->f_attach(kn)) != 0) { 1409 knote_drop(kn, td); 1410 goto done; 1411 } 1412 knl = kn_list_lock(kn); 1413 goto done_ev_add; 1414 } else { 1415 /* No matching knote and the EV_ADD flag is not set. */ 1416 KQ_UNLOCK(kq); 1417 error = ENOENT; 1418 goto done; 1419 } 1420 } 1421 1422 if (kev->flags & EV_DELETE) { 1423 kn->kn_status |= KN_INFLUX; 1424 KQ_UNLOCK(kq); 1425 if (!(kn->kn_status & KN_DETACHED)) 1426 kn->kn_fop->f_detach(kn); 1427 knote_drop(kn, td); 1428 goto done; 1429 } 1430 1431 if (kev->flags & EV_FORCEONESHOT) { 1432 kn->kn_flags |= EV_ONESHOT; 1433 KNOTE_ACTIVATE(kn, 1); 1434 } 1435 1436 if ((kev->flags & EV_ENABLE) != 0) 1437 kn->kn_status &= ~KN_DISABLED; 1438 else if ((kev->flags & EV_DISABLE) != 0) 1439 kn->kn_status |= KN_DISABLED; 1440 1441 /* 1442 * The user may change some filter values after the initial EV_ADD, 1443 * but doing so will not reset any filter which has already been 1444 * triggered. 1445 */ 1446 kn->kn_status |= KN_INFLUX | KN_SCAN; 1447 KQ_UNLOCK(kq); 1448 knl = kn_list_lock(kn); 1449 kn->kn_kevent.udata = kev->udata; 1450 if (!fops->f_isfd && fops->f_touch != NULL) { 1451 fops->f_touch(kn, kev, EVENT_REGISTER); 1452 } else { 1453 kn->kn_sfflags = kev->fflags; 1454 kn->kn_sdata = kev->data; 1455 } 1456 1457done_ev_add: 1458 /* 1459 * We can get here with kn->kn_knlist == NULL. This can happen when 1460 * the initial attach event decides that the event is "completed" 1461 * already, e.g., filt_procattach() is called on a zombie process. It 1462 * will call filt_proc() which will remove it from the list, and NULL 1463 * kn_knlist. 1464 * 1465 * KN_DISABLED will be stable while the knote is in flux, so the 1466 * unlocked read will not race with an update. 
1467 */ 1468 if ((kn->kn_status & KN_DISABLED) == 0) 1469 event = kn->kn_fop->f_event(kn, 0); 1470 else 1471 event = 0; 1472 1473 KQ_LOCK(kq); 1474 if (event) 1475 kn->kn_status |= KN_ACTIVE; 1476 if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == 1477 KN_ACTIVE) 1478 knote_enqueue(kn); 1479 kn->kn_status &= ~(KN_INFLUX | KN_SCAN); 1480 kn_list_unlock(knl); 1481 KQ_UNLOCK_FLUX(kq); 1482 1483done: 1484 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1485 if (filedesc_unlock) 1486 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1487 if (fp != NULL) 1488 fdrop(fp, td); 1489 knote_free(tkn); 1490 if (fops != NULL) 1491 kqueue_fo_release(filt); 1492 return (error); 1493} 1494 1495static int 1496kqueue_acquire(struct file *fp, struct kqueue **kqp) 1497{ 1498 int error; 1499 struct kqueue *kq; 1500 1501 error = 0; 1502 1503 kq = fp->f_data; 1504 if (fp->f_type != DTYPE_KQUEUE || kq == NULL) 1505 return (EBADF); 1506 *kqp = kq; 1507 KQ_LOCK(kq); 1508 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { 1509 KQ_UNLOCK(kq); 1510 return (EBADF); 1511 } 1512 kq->kq_refcnt++; 1513 KQ_UNLOCK(kq); 1514 1515 return error; 1516} 1517 1518static void 1519kqueue_release(struct kqueue *kq, int locked) 1520{ 1521 if (locked) 1522 KQ_OWNED(kq); 1523 else 1524 KQ_LOCK(kq); 1525 kq->kq_refcnt--; 1526 if (kq->kq_refcnt == 1) 1527 wakeup(&kq->kq_refcnt); 1528 if (!locked) 1529 KQ_UNLOCK(kq); 1530} 1531 1532static void 1533kqueue_schedtask(struct kqueue *kq) 1534{ 1535 1536 KQ_OWNED(kq); 1537 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), 1538 ("scheduling kqueue task while draining")); 1539 1540 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { 1541 taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); 1542 kq->kq_state |= KQ_TASKSCHED; 1543 } 1544} 1545 1546/* 1547 * Expand the kq to make sure we have storage for fops/ident pair. 1548 * 1549 * Return 0 on success (or no work necessary), return errno on failure. 1550 * 1551 * Not calling hashinit w/ waitok (proper malloc flag) should be safe. 1552 * If kqueue_register is called from a non-fd context, there usually/should 1553 * be no locks held. 1554 */ 1555static int 1556kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, 1557 int waitok) 1558{ 1559 struct klist *list, *tmp_knhash, *to_free; 1560 u_long tmp_knhashmask; 1561 int error, fd, size; 1562 int mflag = waitok ? 
M_WAITOK : M_NOWAIT; 1563 1564 KQ_NOTOWNED(kq); 1565 1566 error = 0; 1567 to_free = NULL; 1568 if (fops->f_isfd) { 1569 fd = ident; 1570 if (kq->kq_knlistsize <= fd) { 1571 size = kq->kq_knlistsize; 1572 while (size <= fd) 1573 size += KQEXTENT; 1574 list = malloc(size * sizeof(*list), M_KQUEUE, mflag); 1575 if (list == NULL) 1576 return ENOMEM; 1577 KQ_LOCK(kq); 1578 if ((kq->kq_state & KQ_CLOSING) != 0) { 1579 to_free = list; 1580 error = EBADF; 1581 } else if (kq->kq_knlistsize > fd) { 1582 to_free = list; 1583 } else { 1584 if (kq->kq_knlist != NULL) { 1585 bcopy(kq->kq_knlist, list, 1586 kq->kq_knlistsize * sizeof(*list)); 1587 to_free = kq->kq_knlist; 1588 kq->kq_knlist = NULL; 1589 } 1590 bzero((caddr_t)list + 1591 kq->kq_knlistsize * sizeof(*list), 1592 (size - kq->kq_knlistsize) * sizeof(*list)); 1593 kq->kq_knlistsize = size; 1594 kq->kq_knlist = list; 1595 } 1596 KQ_UNLOCK(kq); 1597 } 1598 } else { 1599 if (kq->kq_knhashmask == 0) { 1600 tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, 1601 &tmp_knhashmask); 1602 if (tmp_knhash == NULL) 1603 return (ENOMEM); 1604 KQ_LOCK(kq); 1605 if ((kq->kq_state & KQ_CLOSING) != 0) { 1606 to_free = tmp_knhash; 1607 error = EBADF; 1608 } else if (kq->kq_knhashmask == 0) { 1609 kq->kq_knhash = tmp_knhash; 1610 kq->kq_knhashmask = tmp_knhashmask; 1611 } else { 1612 to_free = tmp_knhash; 1613 } 1614 KQ_UNLOCK(kq); 1615 } 1616 } 1617 free(to_free, M_KQUEUE); 1618 1619 KQ_NOTOWNED(kq); 1620 return (error); 1621} 1622 1623static void 1624kqueue_task(void *arg, int pending) 1625{ 1626 struct kqueue *kq; 1627 int haskqglobal; 1628 1629 haskqglobal = 0; 1630 kq = arg; 1631 1632 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1633 KQ_LOCK(kq); 1634 1635 KNOTE_LOCKED(&kq->kq_sel.si_note, 0); 1636 1637 kq->kq_state &= ~KQ_TASKSCHED; 1638 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { 1639 wakeup(&kq->kq_state); 1640 } 1641 KQ_UNLOCK(kq); 1642 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1643} 1644 1645/* 1646 * Scan, update kn_data (if not ONESHOT), and copyout triggered events. 1647 * We treat KN_MARKER knotes as if they are INFLUX. 1648 */ 1649static int 1650kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, 1651 const struct timespec *tsp, struct kevent *keva, struct thread *td) 1652{ 1653 struct kevent *kevp; 1654 struct knote *kn, *marker; 1655 struct knlist *knl; 1656 sbintime_t asbt, rsbt; 1657 int count, error, haskqglobal, influx, nkev, touch; 1658 1659 count = maxevents; 1660 nkev = 0; 1661 error = 0; 1662 haskqglobal = 0; 1663 1664 if (maxevents == 0) 1665 goto done_nl; 1666 1667 rsbt = 0; 1668 if (tsp != NULL) { 1669 if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || 1670 tsp->tv_nsec >= 1000000000) { 1671 error = EINVAL; 1672 goto done_nl; 1673 } 1674 if (timespecisset(tsp)) { 1675 if (tsp->tv_sec <= INT32_MAX) { 1676 rsbt = tstosbt(*tsp); 1677 if (TIMESEL(&asbt, rsbt)) 1678 asbt += tc_tick_sbt; 1679 if (asbt <= SBT_MAX - rsbt) 1680 asbt += rsbt; 1681 else 1682 asbt = 0; 1683 rsbt >>= tc_precexp; 1684 } else 1685 asbt = 0; 1686 } else 1687 asbt = -1; 1688 } else 1689 asbt = 0; 1690 marker = knote_alloc(1); 1691 marker->kn_status = KN_MARKER; 1692 KQ_LOCK(kq); 1693 1694retry: 1695 kevp = keva; 1696 if (kq->kq_count == 0) { 1697 if (asbt == -1) { 1698 error = EWOULDBLOCK; 1699 } else { 1700 kq->kq_state |= KQ_SLEEP; 1701 error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, 1702 "kqread", asbt, rsbt, C_ABSOLUTE); 1703 } 1704 if (error == 0) 1705 goto retry; 1706 /* don't restart after signals... 
*/ 1707 if (error == ERESTART) 1708 error = EINTR; 1709 else if (error == EWOULDBLOCK) 1710 error = 0; 1711 goto done; 1712 } 1713 1714 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); 1715 influx = 0; 1716 while (count) { 1717 KQ_OWNED(kq); 1718 kn = TAILQ_FIRST(&kq->kq_head); 1719 1720 if ((kn->kn_status == KN_MARKER && kn != marker) || 1721 (kn->kn_status & KN_INFLUX) == KN_INFLUX) { 1722 if (influx) { 1723 influx = 0; 1724 KQ_FLUX_WAKEUP(kq); 1725 } 1726 kq->kq_state |= KQ_FLUXWAIT; 1727 error = msleep(kq, &kq->kq_lock, PSOCK, 1728 "kqflxwt", 0); 1729 continue; 1730 } 1731 1732 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 1733 if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { 1734 kn->kn_status &= ~KN_QUEUED; 1735 kq->kq_count--; 1736 continue; 1737 } 1738 if (kn == marker) { 1739 KQ_FLUX_WAKEUP(kq); 1740 if (count == maxevents) 1741 goto retry; 1742 goto done; 1743 } 1744 KASSERT((kn->kn_status & KN_INFLUX) == 0, 1745 ("KN_INFLUX set when not suppose to be")); 1746 1747 if ((kn->kn_flags & EV_DROP) == EV_DROP) { 1748 kn->kn_status &= ~KN_QUEUED; 1749 kn->kn_status |= KN_INFLUX; 1750 kq->kq_count--; 1751 KQ_UNLOCK(kq); 1752 /* 1753 * We don't need to lock the list since we've marked 1754 * it _INFLUX. 1755 */ 1756 if (!(kn->kn_status & KN_DETACHED)) 1757 kn->kn_fop->f_detach(kn); 1758 knote_drop(kn, td); 1759 KQ_LOCK(kq); 1760 continue; 1761 } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { 1762 kn->kn_status &= ~KN_QUEUED; 1763 kn->kn_status |= KN_INFLUX; 1764 kq->kq_count--; 1765 KQ_UNLOCK(kq); 1766 /* 1767 * We don't need to lock the list since we've marked 1768 * it _INFLUX. 1769 */ 1770 *kevp = kn->kn_kevent; 1771 if (!(kn->kn_status & KN_DETACHED)) 1772 kn->kn_fop->f_detach(kn); 1773 knote_drop(kn, td); 1774 KQ_LOCK(kq); 1775 kn = NULL; 1776 } else { 1777 kn->kn_status |= KN_INFLUX | KN_SCAN; 1778 KQ_UNLOCK(kq); 1779 if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) 1780 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1781 knl = kn_list_lock(kn); 1782 if (kn->kn_fop->f_event(kn, 0) == 0) { 1783 KQ_LOCK(kq); 1784 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1785 kn->kn_status &= 1786 ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX | 1787 KN_SCAN); 1788 kq->kq_count--; 1789 kn_list_unlock(knl); 1790 influx = 1; 1791 continue; 1792 } 1793 touch = (!kn->kn_fop->f_isfd && 1794 kn->kn_fop->f_touch != NULL); 1795 if (touch) 1796 kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); 1797 else 1798 *kevp = kn->kn_kevent; 1799 KQ_LOCK(kq); 1800 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1801 if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { 1802 /* 1803 * Manually clear knotes who weren't 1804 * 'touch'ed. 
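 *
 * (EV_CLEAR consumers rely on this: after delivery kn_data and
 * kn_fflags are reset, so only new activity re-reports the event.
 * EV_DISPATCH instead disables the knote until the consumer re-enables
 * it with EV_ENABLE.)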
1805 */ 1806 if (touch == 0 && kn->kn_flags & EV_CLEAR) { 1807 kn->kn_data = 0; 1808 kn->kn_fflags = 0; 1809 } 1810 if (kn->kn_flags & EV_DISPATCH) 1811 kn->kn_status |= KN_DISABLED; 1812 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); 1813 kq->kq_count--; 1814 } else 1815 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 1816 1817 kn->kn_status &= ~(KN_INFLUX | KN_SCAN); 1818 kn_list_unlock(knl); 1819 influx = 1; 1820 } 1821 1822 /* we are returning a copy to the user */ 1823 kevp++; 1824 nkev++; 1825 count--; 1826 1827 if (nkev == KQ_NEVENTS) { 1828 influx = 0; 1829 KQ_UNLOCK_FLUX(kq); 1830 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 1831 nkev = 0; 1832 kevp = keva; 1833 KQ_LOCK(kq); 1834 if (error) 1835 break; 1836 } 1837 } 1838 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); 1839done: 1840 KQ_OWNED(kq); 1841 KQ_UNLOCK_FLUX(kq); 1842 knote_free(marker); 1843done_nl: 1844 KQ_NOTOWNED(kq); 1845 if (nkev != 0) 1846 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 1847 td->td_retval[0] = maxevents - count; 1848 return (error); 1849} 1850 1851/*ARGSUSED*/ 1852static int 1853kqueue_ioctl(struct file *fp, u_long cmd, void *data, 1854 struct ucred *active_cred, struct thread *td) 1855{ 1856 /* 1857 * Enabling sigio causes two major problems: 1858 * 1) infinite recursion: 1859 * Synopsys: kevent is being used to track signals and have FIOASYNC 1860 * set. On receipt of a signal this will cause a kqueue to recurse 1861 * into itself over and over. Sending the sigio causes the kqueue 1862 * to become ready, which in turn posts sigio again, forever. 1863 * Solution: this can be solved by setting a flag in the kqueue that 1864 * we have a SIGIO in progress. 1865 * 2) locking problems: 1866 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts 1867 * us above the proc and pgrp locks. 1868 * Solution: Post a signal using an async mechanism, being sure to 1869 * record a generation count in the delivery so that we do not deliver 1870 * a signal to the wrong process. 1871 * 1872 * Note, these two mechanisms are somewhat mutually exclusive! 1873 */ 1874#if 0 1875 struct kqueue *kq; 1876 1877 kq = fp->f_data; 1878 switch (cmd) { 1879 case FIOASYNC: 1880 if (*(int *)data) { 1881 kq->kq_state |= KQ_ASYNC; 1882 } else { 1883 kq->kq_state &= ~KQ_ASYNC; 1884 } 1885 return (0); 1886 1887 case FIOSETOWN: 1888 return (fsetown(*(int *)data, &kq->kq_sigio)); 1889 1890 case FIOGETOWN: 1891 *(int *)data = fgetown(&kq->kq_sigio); 1892 return (0); 1893 } 1894#endif 1895 1896 return (ENOTTY); 1897} 1898 1899/*ARGSUSED*/ 1900static int 1901kqueue_poll(struct file *fp, int events, struct ucred *active_cred, 1902 struct thread *td) 1903{ 1904 struct kqueue *kq; 1905 int revents = 0; 1906 int error; 1907 1908 if ((error = kqueue_acquire(fp, &kq))) 1909 return POLLERR; 1910 1911 KQ_LOCK(kq); 1912 if (events & (POLLIN | POLLRDNORM)) { 1913 if (kq->kq_count) { 1914 revents |= events & (POLLIN | POLLRDNORM); 1915 } else { 1916 selrecord(td, &kq->kq_sel); 1917 if (SEL_WAITING(&kq->kq_sel)) 1918 kq->kq_state |= KQ_SEL; 1919 } 1920 } 1921 kqueue_release(kq, 1); 1922 KQ_UNLOCK(kq); 1923 return (revents); 1924} 1925 1926/*ARGSUSED*/ 1927static int 1928kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 1929 struct thread *td) 1930{ 1931 1932 bzero((void *)st, sizeof *st); 1933 /* 1934 * We no longer return kq_count because the unlocked value is useless. 1935 * If you spent all this time getting the count, why not spend your 1936 * syscall better by calling kevent? 
1937 * 1938 * XXX - This is needed for libc_r. 1939 */ 1940 st->st_mode = S_IFIFO; 1941 return (0); 1942} 1943 1944static void 1945kqueue_drain(struct kqueue *kq, struct thread *td) 1946{ 1947 struct knote *kn; 1948 int i; 1949 1950 KQ_LOCK(kq); 1951 1952 KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, 1953 ("kqueue already closing")); 1954 kq->kq_state |= KQ_CLOSING; 1955 if (kq->kq_refcnt > 1) 1956 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); 1957 1958 KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); 1959 1960 KASSERT(knlist_empty(&kq->kq_sel.si_note), 1961 ("kqueue's knlist not empty")); 1962 1963 for (i = 0; i < kq->kq_knlistsize; i++) { 1964 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { 1965 if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { 1966 kq->kq_state |= KQ_FLUXWAIT; 1967 msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); 1968 continue; 1969 } 1970 kn->kn_status |= KN_INFLUX; 1971 KQ_UNLOCK(kq); 1972 if (!(kn->kn_status & KN_DETACHED)) 1973 kn->kn_fop->f_detach(kn); 1974 knote_drop(kn, td); 1975 KQ_LOCK(kq); 1976 } 1977 } 1978 if (kq->kq_knhashmask != 0) { 1979 for (i = 0; i <= kq->kq_knhashmask; i++) { 1980 while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { 1981 if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { 1982 kq->kq_state |= KQ_FLUXWAIT; 1983 msleep(kq, &kq->kq_lock, PSOCK, 1984 "kqclo2", 0); 1985 continue; 1986 } 1987 kn->kn_status |= KN_INFLUX; 1988 KQ_UNLOCK(kq); 1989 if (!(kn->kn_status & KN_DETACHED)) 1990 kn->kn_fop->f_detach(kn); 1991 knote_drop(kn, td); 1992 KQ_LOCK(kq); 1993 } 1994 } 1995 } 1996 1997 if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { 1998 kq->kq_state |= KQ_TASKDRAIN; 1999 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); 2000 } 2001 2002 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2003 selwakeuppri(&kq->kq_sel, PSOCK); 2004 if (!SEL_WAITING(&kq->kq_sel)) 2005 kq->kq_state &= ~KQ_SEL; 2006 } 2007 2008 KQ_UNLOCK(kq); 2009} 2010 2011static void 2012kqueue_destroy(struct kqueue *kq) 2013{ 2014 2015 KASSERT(kq->kq_fdp == NULL, 2016 ("kqueue still attached to a file descriptor")); 2017 seldrain(&kq->kq_sel); 2018 knlist_destroy(&kq->kq_sel.si_note); 2019 mtx_destroy(&kq->kq_lock); 2020 2021 if (kq->kq_knhash != NULL) 2022 free(kq->kq_knhash, M_KQUEUE); 2023 if (kq->kq_knlist != NULL) 2024 free(kq->kq_knlist, M_KQUEUE); 2025 2026 funsetown(&kq->kq_sigio); 2027} 2028 2029/*ARGSUSED*/ 2030static int 2031kqueue_close(struct file *fp, struct thread *td) 2032{ 2033 struct kqueue *kq = fp->f_data; 2034 struct filedesc *fdp; 2035 int error; 2036 int filedesc_unlock; 2037 2038 if ((error = kqueue_acquire(fp, &kq))) 2039 return error; 2040 kqueue_drain(kq, td); 2041 2042 /* 2043 * We could be called due to the knote_drop() doing fdrop(), 2044 * called from kqueue_register(). In this case the global 2045 * lock is owned, and filedesc sx is locked before, to not 2046 * take the sleepable lock after non-sleepable. 
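 *
 * (The matching side of this ordering is the DTYPE_KQUEUE case in
 * kqueue_register(), which takes the filedesc lock before kq_global for
 * the same reason.)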
2047 */ 2048 fdp = kq->kq_fdp; 2049 kq->kq_fdp = NULL; 2050 if (!sx_xlocked(FILEDESC_LOCK(fdp))) { 2051 FILEDESC_XLOCK(fdp); 2052 filedesc_unlock = 1; 2053 } else 2054 filedesc_unlock = 0; 2055 TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); 2056 if (filedesc_unlock) 2057 FILEDESC_XUNLOCK(fdp); 2058 2059 kqueue_destroy(kq); 2060 chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); 2061 crfree(kq->kq_cred); 2062 free(kq, M_KQUEUE); 2063 fp->f_data = NULL; 2064 2065 return (0); 2066} 2067 2068static int 2069kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 2070{ 2071 2072 kif->kf_type = KF_TYPE_KQUEUE; 2073 return (0); 2074} 2075 2076static void 2077kqueue_wakeup(struct kqueue *kq) 2078{ 2079 KQ_OWNED(kq); 2080 2081 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { 2082 kq->kq_state &= ~KQ_SLEEP; 2083 wakeup(kq); 2084 } 2085 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2086 selwakeuppri(&kq->kq_sel, PSOCK); 2087 if (!SEL_WAITING(&kq->kq_sel)) 2088 kq->kq_state &= ~KQ_SEL; 2089 } 2090 if (!knlist_empty(&kq->kq_sel.si_note)) 2091 kqueue_schedtask(kq); 2092 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { 2093 pgsigio(&kq->kq_sigio, SIGIO, 0); 2094 } 2095} 2096 2097/* 2098 * Walk down a list of knotes, activating them if their event has triggered. 2099 * 2100 * There is a possibility to optimize in the case of one kq watching another. 2101 * Instead of scheduling a task to wake it up, you could pass enough state 2102 * down the chain to make up the parent kqueue. Make this code functional 2103 * first. 2104 */ 2105void 2106knote(struct knlist *list, long hint, int lockflags) 2107{ 2108 struct kqueue *kq; 2109 struct knote *kn, *tkn; 2110 int error; 2111 bool own_influx; 2112 2113 if (list == NULL) 2114 return; 2115 2116 KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); 2117 2118 if ((lockflags & KNF_LISTLOCKED) == 0) 2119 list->kl_lock(list->kl_lockarg); 2120 2121 /* 2122 * If we unlock the list lock (and set KN_INFLUX), we can 2123 * eliminate the kqueue scheduling, but this will introduce 2124 * four lock/unlock's for each knote to test. Also, marker 2125 * would be needed to keep iteration position, since filters 2126 * or other threads could remove events. 2127 */ 2128 SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { 2129 kq = kn->kn_kq; 2130 KQ_LOCK(kq); 2131 if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) { 2132 /* 2133 * Do not process the influx notes, except for 2134 * the influx coming from the kq unlock in the 2135 * kqueue_scan(). In the later case, we do 2136 * not interfere with the scan, since the code 2137 * fragment in kqueue_scan() locks the knlist, 2138 * and cannot proceed until we finished. 
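 *
 * (Producers normally reach this function through wrappers such as
 * KNOTE_LOCKED(), as kqueue_task() does above, after setting up their
 * knlist with knlist_init_mtx() or knlist_init_rw_reader().)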

/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * There is an opportunity to optimize in the case of one kq watching another.
 * Instead of scheduling a task to wake it up, you could pass enough state
 * down the chain to make up the parent kqueue.  Make this code functional
 * first.
 */
void
knote(struct knlist *list, long hint, int lockflags)
{
	struct kqueue *kq;
	struct knote *kn, *tkn;
	int error;
	bool own_influx;

	if (list == NULL)
		return;

	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);

	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_lock(list->kl_lockarg);

	/*
	 * If we unlock the list lock (and set KN_INFLUX), we can
	 * eliminate the kqueue scheduling, but this will introduce
	 * four lock/unlock operations for each knote to test.  Also, a
	 * marker would be needed to keep the iteration position, since
	 * filters or other threads could remove events.
	 */
	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
			/*
			 * Do not process the influx notes, except for
			 * the influx coming from the kq unlock in
			 * kqueue_scan().  In the latter case, we do not
			 * interfere with the scan, since the code
			 * fragment in kqueue_scan() locks the knlist,
			 * and cannot proceed until we are finished.
			 */
			KQ_UNLOCK(kq);
		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
			own_influx = (kn->kn_status & KN_INFLUX) == 0;
			if (own_influx)
				kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			error = kn->kn_fop->f_event(kn, hint);
			KQ_LOCK(kq);
			if (own_influx)
				kn->kn_status &= ~KN_INFLUX;
			if (error)
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK_FLUX(kq);
		} else {
			kn->kn_status |= KN_HASKQLOCK;
			if (kn->kn_fop->f_event(kn, hint))
				KNOTE_ACTIVATE(kn, 1);
			kn->kn_status &= ~KN_HASKQLOCK;
			KQ_UNLOCK(kq);
		}
	}
	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_unlock(list->kl_lockarg);
}
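
/*
 * Illustrative sketch, not built as part of this file: a minimal filterops
 * implementation of the kind whose f_event hook knote() invokes above.
 * The xx_filt_* names are invented, and xx_note is the invented knlist
 * from the earlier sketch; real filters elsewhere in this file follow the
 * same shape.
 */
#if 0
static int
xx_filt_attach(struct knote *kn)
{

	knlist_add(&xx_note, kn, 0);	/* hook the knote onto the list knote() walks */
	return (0);
}

static void
xx_filt_detach(struct knote *kn)
{

	knlist_remove(&xx_note, kn, 0);
}

static int
xx_filt_event(struct knote *kn, long hint)
{

	kn->kn_data = hint;		/* report the hint as the data value */
	return (kn->kn_data != 0);	/* nonzero means "activate the knote" */
}

static struct filterops xx_filtops = {
	.f_isfd = 0,
	.f_attach = xx_filt_attach,
	.f_detach = xx_filt_detach,
	.f_event = xx_filt_event,
};
#endif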

/*
 * add a knote to a knlist
 */
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
	KNL_ASSERT_LOCK(knl, islocked);
	KQ_NOTOWNED(kn->kn_kq);
	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
	if (!islocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
	if (!islocked)
		knl->kl_unlock(knl->kl_lockarg);
	KQ_LOCK(kn->kn_kq);
	kn->kn_knlist = knl;
	kn->kn_status &= ~KN_DETACHED;
	KQ_UNLOCK(kn->kn_kq);
}

static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
    int kqislocked)
{
	KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
	KNL_ASSERT_LOCK(knl, knlislocked);
	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
	if (!kqislocked)
		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
		    ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
	if (!knlislocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
	kn->kn_knlist = NULL;
	if (!knlislocked)
		kn_list_unlock(knl);
	if (!kqislocked)
		KQ_LOCK(kn->kn_kq);
	kn->kn_status |= KN_DETACHED;
	if (!kqislocked)
		KQ_UNLOCK(kn->kn_kq);
}

/*
 * remove knote from the specified knlist
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{

	knlist_remove_kq(knl, kn, islocked, 0);
}

int
knlist_empty(struct knlist *knl)
{

	KNL_ASSERT_LOCKED(knl);
	return (SLIST_EMPTY(&knl->kl_list));
}

static struct mtx knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
    MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);

static void
knlist_mtx_lock(void *arg)
{

	mtx_lock((struct mtx *)arg);
}

static void
knlist_mtx_unlock(void *arg)
{

	mtx_unlock((struct mtx *)arg);
}

static void
knlist_mtx_assert_locked(void *arg)
{

	mtx_assert((struct mtx *)arg, MA_OWNED);
}

static void
knlist_mtx_assert_unlocked(void *arg)
{

	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
}

static void
knlist_rw_rlock(void *arg)
{

	rw_rlock((struct rwlock *)arg);
}

static void
knlist_rw_runlock(void *arg)
{

	rw_runlock((struct rwlock *)arg);
}

static void
knlist_rw_assert_locked(void *arg)
{

	rw_assert((struct rwlock *)arg, RA_LOCKED);
}

static void
knlist_rw_assert_unlocked(void *arg)
{

	rw_assert((struct rwlock *)arg, RA_UNLOCKED);
}

void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
    void (*kl_unlock)(void *),
    void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
{

	if (lock == NULL)
		knl->kl_lockarg = &knlist_lock;
	else
		knl->kl_lockarg = lock;

	if (kl_lock == NULL)
		knl->kl_lock = knlist_mtx_lock;
	else
		knl->kl_lock = kl_lock;
	if (kl_unlock == NULL)
		knl->kl_unlock = knlist_mtx_unlock;
	else
		knl->kl_unlock = kl_unlock;
	if (kl_assert_locked == NULL)
		knl->kl_assert_locked = knlist_mtx_assert_locked;
	else
		knl->kl_assert_locked = kl_assert_locked;
	if (kl_assert_unlocked == NULL)
		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
	else
		knl->kl_assert_unlocked = kl_assert_unlocked;

	knl->kl_autodestroy = 0;
	SLIST_INIT(&knl->kl_list);
}

void
knlist_init_mtx(struct knlist *knl, struct mtx *lock)
{

	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
}

struct knlist *
knlist_alloc(struct mtx *lock)
{
	struct knlist *knl;

	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
	knlist_init_mtx(knl, lock);
	return (knl);
}

void
knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
{

	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
}

void
knlist_destroy(struct knlist *knl)
{

	KASSERT(KNLIST_EMPTY(knl),
	    ("destroying knlist %p with knotes on it", knl));
}

void
knlist_detach(struct knlist *knl)
{

	KNL_ASSERT_LOCKED(knl);
	knl->kl_autodestroy = 1;
	if (knlist_empty(knl)) {
		knlist_destroy(knl);
		free(knl, M_KQUEUE);
	}
}
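
/*
 * Illustrative sketch, not built as part of this file: typical knlist setup
 * in an invented subsystem (the xx_* names are made up).  A caller that
 * already has a mutex protecting its state hands it to knlist_init_mtx();
 * a reader-writer lock can be used via knlist_init_rw_reader(); passing a
 * NULL lock to knlist_init() falls back to the shared knlist_lock mutex
 * defined above.
 */
#if 0
static struct xx_softc {
	struct mtx	xx_mtx;		/* protects xx_note */
	struct knlist	xx_note;	/* knotes watching this subsystem */
} xx_sc;

static void
xx_attach(void)
{

	mtx_init(&xx_sc.xx_mtx, "xx", NULL, MTX_DEF);
	knlist_init_mtx(&xx_sc.xx_note, &xx_sc.xx_mtx);
}
#endif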

/*
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & KN_INFLUX)) {
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn->kn_status |= KN_INFLUX | KN_DETACHED;
			KQ_UNLOCK(kq);
			knote_drop(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still KN_INFLUX remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn->kn_status & KN_INFLUX,
		    ("knote removed w/o list lock"));
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}
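
/*
 * Illustrative sketch, not built as part of this file: teardown for the
 * invented xx_* subsystem from the earlier sketches.  Callers normally
 * reach knlist_cleardel() through the knlist_clear() and knlist_delete()
 * wrapper macros in <sys/event.h>; once every knote has settled, the
 * knlist and its backing lock can be torn down.
 */
#if 0
static void
xx_detach(void)
{

	knlist_clear(&xx_sc.xx_note, 0);	/* knlist_cleardel() with killkn == 0 */
	knlist_destroy(&xx_sc.xx_note);
	mtx_destroy(&xx_sc.xx_mtx);
}
#endif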

/*
 * Remove all knotes referencing a specified fd.  This must be called with
 * the FILEDESC lock held, which prevents a race where a new fd comes along,
 * occupies the entry, and we attach a knote to that fd.
 */
void
knote_fdclose(struct thread *td, int fd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	int influx;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We shouldn't have to worry about new kevents appearing on fd
	 * since filedesc is locked.
	 */
	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
		KQ_LOCK(kq);

again:
		influx = 0;
		while (kq->kq_knlistsize > fd &&
		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
			if (kn->kn_status & KN_INFLUX) {
				/* someone else might be waiting on our knote */
				if (influx)
					wakeup(kq);
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
				goto again;
			}
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			influx = 1;
			KQ_LOCK(kq);
		}
		KQ_UNLOCK_FLUX(kq);
	}
}

static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
	struct klist *list;

	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
	KQ_OWNED(kq);

	if ((kq->kq_state & KQ_CLOSING) != 0)
		return (EBADF);
	if (kn->kn_fop->f_isfd) {
		if (kn->kn_id >= kq->kq_knlistsize)
			return (ENOMEM);
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		if (kq->kq_knhash == NULL)
			return (ENOMEM);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	SLIST_INSERT_HEAD(list, kn, kn_link);
	return (0);
}

/*
 * The knote must already have been detached using the f_detach method.
 * No lock needs to be held; it is assumed that the KN_INFLUX flag is set
 * to prevent other threads from removing the knote.
 */
static void
knote_drop(struct knote *kn, struct thread *td)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	KQ_NOTOWNED(kq);
	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
	    ("knote_drop called without KN_INFLUX set in kn_status"));

	KQ_LOCK(kq);
	if (kn->kn_fop->f_isfd)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	if (!SLIST_EMPTY(list))
		SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	KQ_UNLOCK_FLUX(kq);

	if (kn->kn_fop->f_isfd) {
		fdrop(kn->kn_fp, td);
		kn->kn_fp = NULL;
	}
	kqueue_fo_release(kn->kn_kevent.filter);
	kn->kn_fop = NULL;
	knote_free(kn);
}

static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_wakeup(kq);
}

static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

static void
knote_init(void)
{

	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);

static struct knote *
knote_alloc(int waitok)
{

	return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
	    M_ZERO));
}

static void
knote_free(struct knote *kn)
{

	uma_zfree(knote_zone, kn);
}

/*
 * Register the kev w/ the kq specified by fd.
 */
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
	struct kqueue *kq;
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
	if (error != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto noacquire;

	error = kqueue_register(kq, kev, td, waitok);
	kqueue_release(kq, 0);

noacquire:
	fdrop(fp, td);
	return (error);
}
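
/*
 * Illustrative sketch, not built as part of this file: the userland
 * counterpart of the registration path.  EV_SET() fills in a struct kevent
 * and kevent(2) hands it to the kernel, where it ends up in
 * kqueue_register(); the same call then waits for the one-shot timer to
 * fire (EVFILT_TIMER data is in milliseconds by default).
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent ev;
	int kq, n;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* One-shot timer, identifier 1, fires once after 500 ms. */
	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT, 0, 500, NULL);
	n = kevent(kq, &ev, 1, &ev, 1, NULL);	/* register and wait */
	if (n == -1)
		err(1, "kevent");
	printf("timer %ju fired\n", (uintmax_t)ev.ident);
	close(kq);
	return (0);
}
#endif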