1/*- 2 * Copyright (c) 2007 Roman Divacky 3 * Copyright (c) 2014 Dmitry Chagin 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
26 */ 27 28#include <sys/cdefs.h> 29__FBSDID("$FreeBSD: stable/11/sys/compat/linux/linux_event.c 346832 2019-04-28 14:03:32Z dchagin $"); 30 31#include "opt_compat.h" 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/imgact.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/lock.h> 39#include <sys/mutex.h> 40#include <sys/callout.h> 41#include <sys/capsicum.h> 42#include <sys/types.h> 43#include <sys/user.h> 44#include <sys/file.h> 45#include <sys/filedesc.h> 46#include <sys/filio.h> 47#include <sys/errno.h> 48#include <sys/event.h> 49#include <sys/poll.h> 50#include <sys/proc.h> 51#include <sys/selinfo.h> 52#include <sys/sx.h> 53#include <sys/syscallsubr.h> 54#include <sys/timespec.h> 55 56#ifdef COMPAT_LINUX32 57#include <machine/../linux32/linux.h> 58#include <machine/../linux32/linux32_proto.h> 59#else 60#include <machine/../linux/linux.h> 61#include <machine/../linux/linux_proto.h> 62#endif 63 64#include <compat/linux/linux_emul.h> 65#include <compat/linux/linux_event.h> 66#include <compat/linux/linux_file.h> 67#include <compat/linux/linux_timer.h> 68#include <compat/linux/linux_util.h> 69 70/* 71 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits 72 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only 73 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied 74 * data verbatim. Therefore we allocate 64-bit memory block to pass 75 * user supplied data for every file descriptor. 
76 */ 77 78typedef uint64_t epoll_udata_t; 79 80struct epoll_emuldata { 81 uint32_t fdc; /* epoll udata max index */ 82 epoll_udata_t udata[1]; /* epoll user data vector */ 83}; 84 85#define EPOLL_DEF_SZ 16 86#define EPOLL_SIZE(fdn) \ 87 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 88 89struct epoll_event { 90 uint32_t events; 91 epoll_udata_t data; 92} 93#if defined(__amd64__) 94__attribute__((packed)) 95#endif 96; 97 98#define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 99 100static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 101static int epoll_to_kevent(struct thread *td, struct file *epfp, 102 int fd, struct epoll_event *l_event, int *kev_flags, 103 struct kevent *kevent, int *nkevents); 104static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 105static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 106static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 107static int epoll_delete_event(struct thread *td, struct file *epfp, 108 int fd, int filter); 109static int epoll_delete_all_events(struct thread *td, struct file *epfp, 110 int fd); 111 112struct epoll_copyin_args { 113 struct kevent *changelist; 114}; 115 116struct epoll_copyout_args { 117 struct epoll_event *leventlist; 118 struct proc *p; 119 uint32_t count; 120 int error; 121}; 122 123/* eventfd */ 124typedef uint64_t eventfd_t; 125 126static fo_rdwr_t eventfd_read; 127static fo_rdwr_t eventfd_write; 128static fo_ioctl_t eventfd_ioctl; 129static fo_poll_t eventfd_poll; 130static fo_kqfilter_t eventfd_kqfilter; 131static fo_stat_t eventfd_stat; 132static fo_close_t eventfd_close; 133static fo_fill_kinfo_t eventfd_fill_kinfo; 134 135static struct fileops eventfdops = { 136 .fo_read = eventfd_read, 137 .fo_write = eventfd_write, 138 .fo_truncate = invfo_truncate, 139 .fo_ioctl = eventfd_ioctl, 140 .fo_poll = eventfd_poll, 141 .fo_kqfilter = eventfd_kqfilter, 142 .fo_stat = eventfd_stat, 
143 .fo_close = eventfd_close, 144 .fo_chmod = invfo_chmod, 145 .fo_chown = invfo_chown, 146 .fo_sendfile = invfo_sendfile, 147 .fo_fill_kinfo = eventfd_fill_kinfo, 148 .fo_flags = DFLAG_PASSABLE 149}; 150 151static void filt_eventfddetach(struct knote *kn); 152static int filt_eventfdread(struct knote *kn, long hint); 153static int filt_eventfdwrite(struct knote *kn, long hint); 154 155static struct filterops eventfd_rfiltops = { 156 .f_isfd = 1, 157 .f_detach = filt_eventfddetach, 158 .f_event = filt_eventfdread 159}; 160static struct filterops eventfd_wfiltops = { 161 .f_isfd = 1, 162 .f_detach = filt_eventfddetach, 163 .f_event = filt_eventfdwrite 164}; 165 166/* timerfd */ 167typedef uint64_t timerfd_t; 168 169static fo_rdwr_t timerfd_read; 170static fo_poll_t timerfd_poll; 171static fo_kqfilter_t timerfd_kqfilter; 172static fo_stat_t timerfd_stat; 173static fo_close_t timerfd_close; 174static fo_fill_kinfo_t timerfd_fill_kinfo; 175 176static struct fileops timerfdops = { 177 .fo_read = timerfd_read, 178 .fo_write = invfo_rdwr, 179 .fo_truncate = invfo_truncate, 180 .fo_ioctl = eventfd_ioctl, 181 .fo_poll = timerfd_poll, 182 .fo_kqfilter = timerfd_kqfilter, 183 .fo_stat = timerfd_stat, 184 .fo_close = timerfd_close, 185 .fo_chmod = invfo_chmod, 186 .fo_chown = invfo_chown, 187 .fo_sendfile = invfo_sendfile, 188 .fo_fill_kinfo = timerfd_fill_kinfo, 189 .fo_flags = DFLAG_PASSABLE 190}; 191 192static void filt_timerfddetach(struct knote *kn); 193static int filt_timerfdread(struct knote *kn, long hint); 194 195static struct filterops timerfd_rfiltops = { 196 .f_isfd = 1, 197 .f_detach = filt_timerfddetach, 198 .f_event = filt_timerfdread 199}; 200 201struct eventfd { 202 eventfd_t efd_count; 203 uint32_t efd_flags; 204 struct selinfo efd_sel; 205 struct mtx efd_lock; 206}; 207 208struct timerfd { 209 clockid_t tfd_clockid; 210 struct itimerspec tfd_time; 211 struct callout tfd_callout; 212 timerfd_t tfd_count; 213 bool tfd_canceled; 214 struct selinfo tfd_sel; 215 
struct mtx tfd_lock; 216}; 217 218static int eventfd_create(struct thread *td, uint32_t initval, int flags); 219static void linux_timerfd_expire(void *); 220static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); 221 222 223static void 224epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 225{ 226 struct linux_pemuldata *pem; 227 struct epoll_emuldata *emd; 228 struct proc *p; 229 230 p = td->td_proc; 231 232 pem = pem_find(p); 233 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 234 235 LINUX_PEM_XLOCK(pem); 236 if (pem->epoll == NULL) { 237 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 238 emd->fdc = fd; 239 pem->epoll = emd; 240 } else { 241 emd = pem->epoll; 242 if (fd > emd->fdc) { 243 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 244 emd->fdc = fd; 245 pem->epoll = emd; 246 } 247 } 248 emd->udata[fd] = udata; 249 LINUX_PEM_XUNLOCK(pem); 250} 251 252static int 253epoll_create_common(struct thread *td, int flags) 254{ 255 int error; 256 257 error = kern_kqueue(td, flags, NULL); 258 if (error != 0) 259 return (error); 260 261 epoll_fd_install(td, EPOLL_DEF_SZ, 0); 262 263 return (0); 264} 265 266#ifdef LINUX_LEGACY_SYSCALLS 267int 268linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 269{ 270 271 /* 272 * args->size is unused. Linux just tests it 273 * and then forgets it as well. 274 */ 275 if (args->size <= 0) 276 return (EINVAL); 277 278 return (epoll_create_common(td, 0)); 279} 280#endif 281 282int 283linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 284{ 285 int flags; 286 287 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 288 return (EINVAL); 289 290 flags = 0; 291 if ((args->flags & LINUX_O_CLOEXEC) != 0) 292 flags |= O_CLOEXEC; 293 294 return (epoll_create_common(td, flags)); 295} 296 297/* Structure converting function from epoll to kevent. 
*/ 298static int 299epoll_to_kevent(struct thread *td, struct file *epfp, 300 int fd, struct epoll_event *l_event, int *kev_flags, 301 struct kevent *kevent, int *nkevents) 302{ 303 uint32_t levents = l_event->events; 304 struct linux_pemuldata *pem; 305 struct proc *p; 306 307 /* flags related to how event is registered */ 308 if ((levents & LINUX_EPOLLONESHOT) != 0) 309 *kev_flags |= EV_ONESHOT; 310 if ((levents & LINUX_EPOLLET) != 0) 311 *kev_flags |= EV_CLEAR; 312 if ((levents & LINUX_EPOLLERR) != 0) 313 *kev_flags |= EV_ERROR; 314 if ((levents & LINUX_EPOLLRDHUP) != 0) 315 *kev_flags |= EV_EOF; 316 317 /* flags related to what event is registered */ 318 if ((levents & LINUX_EPOLL_EVRD) != 0) { 319 EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0); 320 ++(*nkevents); 321 } 322 if ((levents & LINUX_EPOLL_EVWR) != 0) { 323 EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0); 324 ++(*nkevents); 325 } 326 327 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { 328 p = td->td_proc; 329 330 pem = pem_find(p); 331 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 332 KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); 333 334 LINUX_PEM_XLOCK(pem); 335 if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { 336 pem->flags |= LINUX_XUNSUP_EPOLL; 337 LINUX_PEM_XUNLOCK(pem); 338 linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n", 339 levents); 340 } else 341 LINUX_PEM_XUNLOCK(pem); 342 return (EINVAL); 343 } 344 345 return (0); 346} 347 348/* 349 * Structure converting function from kevent to epoll. In a case 350 * this is called on error in registration we store the error in 351 * event->data and pick it up later in linux_epoll_ctl(). 
352 */ 353static void 354kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 355{ 356 357 if ((kevent->flags & EV_ERROR) != 0) { 358 l_event->events = LINUX_EPOLLERR; 359 return; 360 } 361 362 /* XXX EPOLLPRI, EPOLLHUP */ 363 switch (kevent->filter) { 364 case EVFILT_READ: 365 l_event->events = LINUX_EPOLLIN; 366 if ((kevent->flags & EV_EOF) != 0) 367 l_event->events |= LINUX_EPOLLRDHUP; 368 break; 369 case EVFILT_WRITE: 370 l_event->events = LINUX_EPOLLOUT; 371 break; 372 } 373} 374 375/* 376 * Copyout callback used by kevent. This converts kevent 377 * events to epoll events and copies them back to the 378 * userspace. This is also called on error on registering 379 * of the filter. 380 */ 381static int 382epoll_kev_copyout(void *arg, struct kevent *kevp, int count) 383{ 384 struct epoll_copyout_args *args; 385 struct linux_pemuldata *pem; 386 struct epoll_emuldata *emd; 387 struct epoll_event *eep; 388 int error, fd, i; 389 390 args = (struct epoll_copyout_args*) arg; 391 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 392 393 pem = pem_find(args->p); 394 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 395 LINUX_PEM_SLOCK(pem); 396 emd = pem->epoll; 397 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 398 399 for (i = 0; i < count; i++) { 400 kevent_to_epoll(&kevp[i], &eep[i]); 401 402 fd = kevp[i].ident; 403 KASSERT(fd <= emd->fdc, ("epoll user data vector" 404 " is too small.\n")); 405 eep[i].data = emd->udata[fd]; 406 } 407 LINUX_PEM_SUNLOCK(pem); 408 409 error = copyout(eep, args->leventlist, count * sizeof(*eep)); 410 if (error == 0) { 411 args->leventlist += count; 412 args->count += count; 413 } else if (args->error == 0) 414 args->error = error; 415 416 free(eep, M_EPOLL); 417 return (error); 418} 419 420/* 421 * Copyin callback used by kevent. This copies already 422 * converted filters from kernel memory to the kevent 423 * internal kernel memory. Hence the memcpy instead of 424 * copyin. 
425 */ 426static int 427epoll_kev_copyin(void *arg, struct kevent *kevp, int count) 428{ 429 struct epoll_copyin_args *args; 430 431 args = (struct epoll_copyin_args*) arg; 432 433 memcpy(kevp, args->changelist, count * sizeof(*kevp)); 434 args->changelist += count; 435 436 return (0); 437} 438 439/* 440 * Load epoll filter, convert it to kevent filter 441 * and load it into kevent subsystem. 442 */ 443int 444linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 445{ 446 struct file *epfp, *fp; 447 struct epoll_copyin_args ciargs; 448 struct kevent kev[2]; 449 struct kevent_copyops k_ops = { &ciargs, 450 NULL, 451 epoll_kev_copyin}; 452 struct epoll_event le; 453 cap_rights_t rights; 454 int kev_flags; 455 int nchanges = 0; 456 int error; 457 458 if (args->op != LINUX_EPOLL_CTL_DEL) { 459 error = copyin(args->event, &le, sizeof(le)); 460 if (error != 0) 461 return (error); 462 } 463 464 error = fget(td, args->epfd, 465 cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); 466 if (error != 0) 467 return (error); 468 if (epfp->f_type != DTYPE_KQUEUE) { 469 error = EINVAL; 470 goto leave1; 471 } 472 473 /* Protect user data vector from incorrectly supplied fd. */ 474 error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); 475 if (error != 0) 476 goto leave1; 477 478 /* Linux disallows spying on himself */ 479 if (epfp == fp) { 480 error = EINVAL; 481 goto leave0; 482 } 483 484 ciargs.changelist = kev; 485 486 if (args->op != LINUX_EPOLL_CTL_DEL) { 487 kev_flags = EV_ADD | EV_ENABLE; 488 error = epoll_to_kevent(td, epfp, args->fd, &le, 489 &kev_flags, kev, &nchanges); 490 if (error != 0) 491 goto leave0; 492 } 493 494 switch (args->op) { 495 case LINUX_EPOLL_CTL_MOD: 496 error = epoll_delete_all_events(td, epfp, args->fd); 497 if (error != 0) 498 goto leave0; 499 break; 500 501 case LINUX_EPOLL_CTL_ADD: 502 /* 503 * kqueue_register() return ENOENT if event does not exists 504 * and the EV_ADD flag is not set. 
505 */ 506 kev[0].flags &= ~EV_ADD; 507 error = kqfd_register(args->epfd, &kev[0], td, 1); 508 if (error != ENOENT) { 509 error = EEXIST; 510 goto leave0; 511 } 512 error = 0; 513 kev[0].flags |= EV_ADD; 514 break; 515 516 case LINUX_EPOLL_CTL_DEL: 517 /* CTL_DEL means unregister this fd with this epoll */ 518 error = epoll_delete_all_events(td, epfp, args->fd); 519 goto leave0; 520 521 default: 522 error = EINVAL; 523 goto leave0; 524 } 525 526 epoll_fd_install(td, args->fd, le.data); 527 528 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 529 530leave0: 531 fdrop(fp, td); 532 533leave1: 534 fdrop(epfp, td); 535 return (error); 536} 537 538/* 539 * Wait for a filter to be triggered on the epoll file descriptor. 540 */ 541static int 542linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 543 int maxevents, int timeout, sigset_t *uset) 544{ 545 struct epoll_copyout_args coargs; 546 struct kevent_copyops k_ops = { &coargs, 547 epoll_kev_copyout, 548 NULL}; 549 struct timespec ts, *tsp; 550 cap_rights_t rights; 551 struct file *epfp; 552 sigset_t omask; 553 int error; 554 555 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 556 return (EINVAL); 557 558 error = fget(td, epfd, 559 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 560 if (error != 0) 561 return (error); 562 if (epfp->f_type != DTYPE_KQUEUE) { 563 error = EINVAL; 564 goto leave1; 565 } 566 if (uset != NULL) { 567 error = kern_sigprocmask(td, SIG_SETMASK, uset, 568 &omask, 0); 569 if (error != 0) 570 goto leave1; 571 td->td_pflags |= TDP_OLDMASK; 572 /* 573 * Make sure that ast() is called on return to 574 * usermode and TDP_OLDMASK is cleared, restoring old 575 * sigmask. 
576 */ 577 thread_lock(td); 578 td->td_flags |= TDF_ASTPENDING; 579 thread_unlock(td); 580 } 581 582 583 coargs.leventlist = events; 584 coargs.p = td->td_proc; 585 coargs.count = 0; 586 coargs.error = 0; 587 588 if (timeout != -1) { 589 if (timeout < 0) { 590 error = EINVAL; 591 goto leave0; 592 } 593 /* Convert from milliseconds to timespec. */ 594 ts.tv_sec = timeout / 1000; 595 ts.tv_nsec = (timeout % 1000) * 1000000; 596 tsp = &ts; 597 } else { 598 tsp = NULL; 599 } 600 601 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 602 if (error == 0 && coargs.error != 0) 603 error = coargs.error; 604 605 /* 606 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 607 * Maybe we should translate that but I don't think it matters at all. 608 */ 609 if (error == 0) 610 td->td_retval[0] = coargs.count; 611 612leave0: 613 if (uset != NULL) 614 error = kern_sigprocmask(td, SIG_SETMASK, &omask, 615 NULL, 0); 616leave1: 617 fdrop(epfp, td); 618 return (error); 619} 620 621#ifdef LINUX_LEGACY_SYSCALLS 622int 623linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 624{ 625 626 return (linux_epoll_wait_common(td, args->epfd, args->events, 627 args->maxevents, args->timeout, NULL)); 628} 629#endif 630 631int 632linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 633{ 634 sigset_t mask, *pmask; 635 l_sigset_t lmask; 636 int error; 637 638 if (args->mask != NULL) { 639 if (args->sigsetsize != sizeof(l_sigset_t)) 640 return (EINVAL); 641 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 642 if (error != 0) 643 return (error); 644 linux_to_bsd_sigset(&lmask, &mask); 645 pmask = &mask; 646 } else 647 pmask = NULL; 648 return (linux_epoll_wait_common(td, args->epfd, args->events, 649 args->maxevents, args->timeout, pmask)); 650} 651 652static int 653epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) 654{ 655 struct epoll_copyin_args ciargs; 656 struct kevent kev; 657 struct 
kevent_copyops k_ops = { &ciargs, 658 NULL, 659 epoll_kev_copyin}; 660 661 ciargs.changelist = &kev; 662 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); 663 664 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); 665} 666 667static int 668epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 669{ 670 int error1, error2; 671 672 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); 673 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); 674 675 /* return 0 if at least one result positive */ 676 return (error1 == 0 ? 0 : error2); 677} 678 679static int 680eventfd_create(struct thread *td, uint32_t initval, int flags) 681{ 682 struct filedesc *fdp; 683 struct eventfd *efd; 684 struct file *fp; 685 int fflags, fd, error; 686 687 fflags = 0; 688 if ((flags & LINUX_O_CLOEXEC) != 0) 689 fflags |= O_CLOEXEC; 690 691 fdp = td->td_proc->p_fd; 692 error = falloc(td, &fp, &fd, fflags); 693 if (error != 0) 694 return (error); 695 696 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); 697 efd->efd_flags = flags; 698 efd->efd_count = initval; 699 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); 700 701 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); 702 703 fflags = FREAD | FWRITE; 704 if ((flags & LINUX_O_NONBLOCK) != 0) 705 fflags |= FNONBLOCK; 706 707 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); 708 fdrop(fp, td); 709 710 td->td_retval[0] = fd; 711 return (error); 712} 713 714#ifdef LINUX_LEGACY_SYSCALLS 715int 716linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 717{ 718 719 return (eventfd_create(td, args->initval, 0)); 720} 721#endif 722 723int 724linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 725{ 726 727 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) 728 return (EINVAL); 729 730 return (eventfd_create(td, args->initval, args->flags)); 731} 732 733static int 734eventfd_close(struct file *fp, struct thread *td) 735{ 736 struct eventfd *efd; 
737 738 efd = fp->f_data; 739 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 740 return (EINVAL); 741 742 seldrain(&efd->efd_sel); 743 knlist_destroy(&efd->efd_sel.si_note); 744 745 fp->f_ops = &badfileops; 746 mtx_destroy(&efd->efd_lock); 747 free(efd, M_EPOLL); 748 749 return (0); 750} 751 752static int 753eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 754 int flags, struct thread *td) 755{ 756 struct eventfd *efd; 757 eventfd_t count; 758 int error; 759 760 efd = fp->f_data; 761 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 762 return (EINVAL); 763 764 if (uio->uio_resid < sizeof(eventfd_t)) 765 return (EINVAL); 766 767 error = 0; 768 mtx_lock(&efd->efd_lock); 769retry: 770 if (efd->efd_count == 0) { 771 if ((fp->f_flag & FNONBLOCK) != 0) { 772 mtx_unlock(&efd->efd_lock); 773 return (EAGAIN); 774 } 775 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); 776 if (error == 0) 777 goto retry; 778 } 779 if (error == 0) { 780 if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { 781 count = 1; 782 --efd->efd_count; 783 } else { 784 count = efd->efd_count; 785 efd->efd_count = 0; 786 } 787 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 788 selwakeup(&efd->efd_sel); 789 wakeup(&efd->efd_count); 790 mtx_unlock(&efd->efd_lock); 791 error = uiomove(&count, sizeof(eventfd_t), uio); 792 } else 793 mtx_unlock(&efd->efd_lock); 794 795 return (error); 796} 797 798static int 799eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 800 int flags, struct thread *td) 801{ 802 struct eventfd *efd; 803 eventfd_t count; 804 int error; 805 806 efd = fp->f_data; 807 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 808 return (EINVAL); 809 810 if (uio->uio_resid < sizeof(eventfd_t)) 811 return (EINVAL); 812 813 error = uiomove(&count, sizeof(eventfd_t), uio); 814 if (error != 0) 815 return (error); 816 if (count == UINT64_MAX) 817 return (EINVAL); 818 819 mtx_lock(&efd->efd_lock); 820retry: 821 if (UINT64_MAX - 
efd->efd_count <= count) { 822 if ((fp->f_flag & FNONBLOCK) != 0) { 823 mtx_unlock(&efd->efd_lock); 824 /* Do not not return the number of bytes written */ 825 uio->uio_resid += sizeof(eventfd_t); 826 return (EAGAIN); 827 } 828 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, 829 PCATCH, "lefdwr", 0); 830 if (error == 0) 831 goto retry; 832 } 833 if (error == 0) { 834 efd->efd_count += count; 835 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 836 selwakeup(&efd->efd_sel); 837 wakeup(&efd->efd_count); 838 } 839 mtx_unlock(&efd->efd_lock); 840 841 return (error); 842} 843 844static int 845eventfd_poll(struct file *fp, int events, struct ucred *active_cred, 846 struct thread *td) 847{ 848 struct eventfd *efd; 849 int revents = 0; 850 851 efd = fp->f_data; 852 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 853 return (POLLERR); 854 855 mtx_lock(&efd->efd_lock); 856 if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) 857 revents |= events & (POLLIN|POLLRDNORM); 858 if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) 859 revents |= events & (POLLOUT|POLLWRNORM); 860 if (revents == 0) 861 selrecord(td, &efd->efd_sel); 862 mtx_unlock(&efd->efd_lock); 863 864 return (revents); 865} 866 867/*ARGSUSED*/ 868static int 869eventfd_kqfilter(struct file *fp, struct knote *kn) 870{ 871 struct eventfd *efd; 872 873 efd = fp->f_data; 874 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 875 return (EINVAL); 876 877 mtx_lock(&efd->efd_lock); 878 switch (kn->kn_filter) { 879 case EVFILT_READ: 880 kn->kn_fop = &eventfd_rfiltops; 881 break; 882 case EVFILT_WRITE: 883 kn->kn_fop = &eventfd_wfiltops; 884 break; 885 default: 886 mtx_unlock(&efd->efd_lock); 887 return (EINVAL); 888 } 889 890 kn->kn_hook = efd; 891 knlist_add(&efd->efd_sel.si_note, kn, 1); 892 mtx_unlock(&efd->efd_lock); 893 894 return (0); 895} 896 897static void 898filt_eventfddetach(struct knote *kn) 899{ 900 struct eventfd *efd = kn->kn_hook; 901 902 mtx_lock(&efd->efd_lock); 903 
knlist_remove(&efd->efd_sel.si_note, kn, 1); 904 mtx_unlock(&efd->efd_lock); 905} 906 907/*ARGSUSED*/ 908static int 909filt_eventfdread(struct knote *kn, long hint) 910{ 911 struct eventfd *efd = kn->kn_hook; 912 int ret; 913 914 mtx_assert(&efd->efd_lock, MA_OWNED); 915 ret = (efd->efd_count > 0); 916 917 return (ret); 918} 919 920/*ARGSUSED*/ 921static int 922filt_eventfdwrite(struct knote *kn, long hint) 923{ 924 struct eventfd *efd = kn->kn_hook; 925 int ret; 926 927 mtx_assert(&efd->efd_lock, MA_OWNED); 928 ret = (UINT64_MAX - 1 > efd->efd_count); 929 930 return (ret); 931} 932 933/*ARGSUSED*/ 934static int 935eventfd_ioctl(struct file *fp, u_long cmd, void *data, 936 struct ucred *active_cred, struct thread *td) 937{ 938 939 if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD && 940 fp->f_type != DTYPE_LINUXTFD)) 941 return (EINVAL); 942 943 switch (cmd) 944 { 945 case FIONBIO: 946 if ((*(int *)data)) 947 atomic_set_int(&fp->f_flag, FNONBLOCK); 948 else 949 atomic_clear_int(&fp->f_flag, FNONBLOCK); 950 case FIOASYNC: 951 return (0); 952 default: 953 return (ENXIO); 954 } 955} 956 957/*ARGSUSED*/ 958static int 959eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 960 struct thread *td) 961{ 962 963 return (ENXIO); 964} 965 966/*ARGSUSED*/ 967static int 968eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 969{ 970 971 kif->kf_type = KF_TYPE_UNKNOWN; 972 return (0); 973} 974 975int 976linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) 977{ 978 struct filedesc *fdp; 979 struct timerfd *tfd; 980 struct file *fp; 981 clockid_t clockid; 982 int fflags, fd, error; 983 984 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) 985 return (EINVAL); 986 987 error = linux_to_native_clockid(&clockid, args->clockid); 988 if (error != 0) 989 return (error); 990 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 991 return (EINVAL); 992 993 fflags = 0; 994 if ((args->flags & 
LINUX_TFD_CLOEXEC) != 0) 995 fflags |= O_CLOEXEC; 996 997 fdp = td->td_proc->p_fd; 998 error = falloc(td, &fp, &fd, fflags); 999 if (error != 0) 1000 return (error); 1001 1002 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); 1003 tfd->tfd_clockid = clockid; 1004 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); 1005 1006 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); 1007 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); 1008 1009 fflags = FREAD; 1010 if ((args->flags & LINUX_O_NONBLOCK) != 0) 1011 fflags |= FNONBLOCK; 1012 1013 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); 1014 fdrop(fp, td); 1015 1016 td->td_retval[0] = fd; 1017 return (error); 1018} 1019 1020static int 1021timerfd_close(struct file *fp, struct thread *td) 1022{ 1023 struct timerfd *tfd; 1024 1025 tfd = fp->f_data; 1026 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1027 return (EINVAL); 1028 1029 timespecclear(&tfd->tfd_time.it_value); 1030 timespecclear(&tfd->tfd_time.it_interval); 1031 1032 mtx_lock(&tfd->tfd_lock); 1033 callout_drain(&tfd->tfd_callout); 1034 mtx_unlock(&tfd->tfd_lock); 1035 1036 seldrain(&tfd->tfd_sel); 1037 knlist_destroy(&tfd->tfd_sel.si_note); 1038 1039 fp->f_ops = &badfileops; 1040 mtx_destroy(&tfd->tfd_lock); 1041 free(tfd, M_EPOLL); 1042 1043 return (0); 1044} 1045 1046static int 1047timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 1048 int flags, struct thread *td) 1049{ 1050 struct timerfd *tfd; 1051 timerfd_t count; 1052 int error; 1053 1054 tfd = fp->f_data; 1055 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1056 return (EINVAL); 1057 1058 if (uio->uio_resid < sizeof(timerfd_t)) 1059 return (EINVAL); 1060 1061 error = 0; 1062 mtx_lock(&tfd->tfd_lock); 1063retry: 1064 if (tfd->tfd_canceled) { 1065 tfd->tfd_count = 0; 1066 mtx_unlock(&tfd->tfd_lock); 1067 return (ECANCELED); 1068 } 1069 if (tfd->tfd_count == 0) { 1070 if ((fp->f_flag & FNONBLOCK) != 0) { 1071 mtx_unlock(&tfd->tfd_lock); 1072 return (EAGAIN); 
1073 } 1074 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); 1075 if (error == 0) 1076 goto retry; 1077 } 1078 if (error == 0) { 1079 count = tfd->tfd_count; 1080 tfd->tfd_count = 0; 1081 mtx_unlock(&tfd->tfd_lock); 1082 error = uiomove(&count, sizeof(timerfd_t), uio); 1083 } else 1084 mtx_unlock(&tfd->tfd_lock); 1085 1086 return (error); 1087} 1088 1089static int 1090timerfd_poll(struct file *fp, int events, struct ucred *active_cred, 1091 struct thread *td) 1092{ 1093 struct timerfd *tfd; 1094 int revents = 0; 1095 1096 tfd = fp->f_data; 1097 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1098 return (POLLERR); 1099 1100 mtx_lock(&tfd->tfd_lock); 1101 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) 1102 revents |= events & (POLLIN|POLLRDNORM); 1103 if (revents == 0) 1104 selrecord(td, &tfd->tfd_sel); 1105 mtx_unlock(&tfd->tfd_lock); 1106 1107 return (revents); 1108} 1109 1110/*ARGSUSED*/ 1111static int 1112timerfd_kqfilter(struct file *fp, struct knote *kn) 1113{ 1114 struct timerfd *tfd; 1115 1116 tfd = fp->f_data; 1117 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1118 return (EINVAL); 1119 1120 if (kn->kn_filter == EVFILT_READ) 1121 kn->kn_fop = &timerfd_rfiltops; 1122 else 1123 return (EINVAL); 1124 1125 kn->kn_hook = tfd; 1126 knlist_add(&tfd->tfd_sel.si_note, kn, 0); 1127 1128 return (0); 1129} 1130 1131static void 1132filt_timerfddetach(struct knote *kn) 1133{ 1134 struct timerfd *tfd = kn->kn_hook; 1135 1136 mtx_lock(&tfd->tfd_lock); 1137 knlist_remove(&tfd->tfd_sel.si_note, kn, 1); 1138 mtx_unlock(&tfd->tfd_lock); 1139} 1140 1141/*ARGSUSED*/ 1142static int 1143filt_timerfdread(struct knote *kn, long hint) 1144{ 1145 struct timerfd *tfd = kn->kn_hook; 1146 1147 return (tfd->tfd_count > 0); 1148} 1149 1150/*ARGSUSED*/ 1151static int 1152timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 1153 struct thread *td) 1154{ 1155 1156 return (ENXIO); 1157} 1158 1159/*ARGSUSED*/ 1160static int 
1161timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1162{ 1163 1164 kif->kf_type = KF_TYPE_UNKNOWN; 1165 return (0); 1166} 1167 1168static void 1169linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) 1170{ 1171 1172 if (tfd->tfd_clockid == CLOCK_REALTIME) 1173 getnanotime(ts); 1174 else /* CLOCK_MONOTONIC */ 1175 getnanouptime(ts); 1176} 1177 1178static void 1179linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) 1180{ 1181 struct timespec cts; 1182 1183 linux_timerfd_clocktime(tfd, &cts); 1184 *ots = tfd->tfd_time; 1185 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { 1186 timespecsub(&ots->it_value, &cts); 1187 if (ots->it_value.tv_sec < 0 || 1188 (ots->it_value.tv_sec == 0 && 1189 ots->it_value.tv_nsec == 0)) { 1190 ots->it_value.tv_sec = 0; 1191 ots->it_value.tv_nsec = 1; 1192 } 1193 } 1194} 1195 1196int 1197linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) 1198{ 1199 cap_rights_t rights; 1200 struct l_itimerspec lots; 1201 struct itimerspec ots; 1202 struct timerfd *tfd; 1203 struct file *fp; 1204 int error; 1205 1206 error = fget(td, args->fd, cap_rights_init(&rights, CAP_READ), &fp); 1207 if (error != 0) 1208 return (error); 1209 tfd = fp->f_data; 1210 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1211 error = EINVAL; 1212 goto out; 1213 } 1214 1215 mtx_lock(&tfd->tfd_lock); 1216 linux_timerfd_curval(tfd, &ots); 1217 mtx_unlock(&tfd->tfd_lock); 1218 1219 error = native_to_linux_itimerspec(&lots, &ots); 1220 if (error == 0) 1221 error = copyout(&lots, args->old_value, sizeof(lots)); 1222 1223out: 1224 fdrop(fp, td); 1225 return (error); 1226} 1227 1228int 1229linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) 1230{ 1231 struct l_itimerspec lots; 1232 struct itimerspec nts, ots; 1233 struct timespec cts, ts; 1234 cap_rights_t rights; 1235 struct timerfd *tfd; 1236 struct timeval tv; 1237 struct file *fp; 1238 int 
error; 1239 1240 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) 1241 return (EINVAL); 1242 1243 error = copyin(args->new_value, &lots, sizeof(lots)); 1244 if (error != 0) 1245 return (error); 1246 error = linux_to_native_itimerspec(&nts, &lots); 1247 if (error != 0) 1248 return (error); 1249 1250 error = fget(td, args->fd, cap_rights_init(&rights, CAP_WRITE), &fp); 1251 if (error != 0) 1252 return (error); 1253 tfd = fp->f_data; 1254 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1255 error = EINVAL; 1256 goto out; 1257 } 1258 1259 mtx_lock(&tfd->tfd_lock); 1260 if (!timespecisset(&nts.it_value)) 1261 timespecclear(&nts.it_interval); 1262 if (args->old_value != NULL) 1263 linux_timerfd_curval(tfd, &ots); 1264 1265 tfd->tfd_time = nts; 1266 if (timespecisset(&nts.it_value)) { 1267 linux_timerfd_clocktime(tfd, &cts); 1268 ts = nts.it_value; 1269 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { 1270 timespecadd(&tfd->tfd_time.it_value, &cts); 1271 } else { 1272 timespecsub(&ts, &cts); 1273 } 1274 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1275 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1276 linux_timerfd_expire, tfd); 1277 tfd->tfd_canceled = false; 1278 } else { 1279 tfd->tfd_canceled = true; 1280 callout_stop(&tfd->tfd_callout); 1281 } 1282 mtx_unlock(&tfd->tfd_lock); 1283 1284 if (args->old_value != NULL) { 1285 error = native_to_linux_itimerspec(&lots, &ots); 1286 if (error == 0) 1287 error = copyout(&lots, args->old_value, sizeof(lots)); 1288 } 1289 1290out: 1291 fdrop(fp, td); 1292 return (error); 1293} 1294 1295static void 1296linux_timerfd_expire(void *arg) 1297{ 1298 struct timespec cts, ts; 1299 struct timeval tv; 1300 struct timerfd *tfd; 1301 1302 tfd = (struct timerfd *)arg; 1303 1304 linux_timerfd_clocktime(tfd, &cts); 1305 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { 1306 if (timespecisset(&tfd->tfd_time.it_interval)) 1307 timespecadd(&tfd->tfd_time.it_value, 1308 &tfd->tfd_time.it_interval); 1309 else 1310 /* single shot timer */ 1311 
timespecclear(&tfd->tfd_time.it_value); 1312 if (timespecisset(&tfd->tfd_time.it_value)) { 1313 ts = tfd->tfd_time.it_value; 1314 timespecsub(&ts, &cts); 1315 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1316 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1317 linux_timerfd_expire, tfd); 1318 } 1319 tfd->tfd_count++; 1320 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); 1321 selwakeup(&tfd->tfd_sel); 1322 wakeup(&tfd->tfd_count); 1323 } else if (timespecisset(&tfd->tfd_time.it_value)) { 1324 ts = tfd->tfd_time.it_value; 1325 timespecsub(&ts, &cts); 1326 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1327 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1328 linux_timerfd_expire, tfd); 1329 } 1330} 1331