/* sys_eventfd.c revision 1.10 */
1/* $NetBSD: sys_eventfd.c,v 1.10 2023/11/19 04:13:37 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32#include <sys/cdefs.h> 33__KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.10 2023/11/19 04:13:37 riastradh Exp $"); 34 35/* 36 * eventfd 37 * 38 * Eventfd objects present a simple counting object associated with a 39 * file descriptor. Writes and reads to this file descriptor increment 40 * and decrement the count, respectively. 
When the count is non-zero, 41 * the descriptor is considered "readable", and when less than the max 42 * value (EVENTFD_MAXVAL), is considered "writable". 43 * 44 * This implementation is API compatible with the Linux eventfd(2) 45 * interface. 46 */ 47 48#include <sys/param.h> 49#include <sys/types.h> 50#include <sys/condvar.h> 51#include <sys/eventfd.h> 52#include <sys/file.h> 53#include <sys/filedesc.h> 54#include <sys/kauth.h> 55#include <sys/mutex.h> 56#include <sys/poll.h> 57#include <sys/proc.h> 58#include <sys/select.h> 59#include <sys/stat.h> 60#include <sys/syscallargs.h> 61#include <sys/uio.h> 62 63struct eventfd { 64 kmutex_t efd_lock; 65 kcondvar_t efd_read_wait; 66 kcondvar_t efd_write_wait; 67 struct selinfo efd_read_sel; 68 struct selinfo efd_write_sel; 69 eventfd_t efd_val; 70 int64_t efd_nwaiters; 71 bool efd_restarting; 72 bool efd_is_semaphore; 73 74 /* 75 * Information kept for stat(2). 76 */ 77 struct timespec efd_btime; /* time created */ 78 struct timespec efd_mtime; /* last write */ 79 struct timespec efd_atime; /* last read */ 80}; 81 82#define EVENTFD_MAXVAL (UINT64_MAX - 1) 83 84/* 85 * eventfd_create: 86 * 87 * Create an eventfd object. 88 */ 89static struct eventfd * 90eventfd_create(unsigned int const val, int const flags) 91{ 92 struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP); 93 94 mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE); 95 cv_init(&efd->efd_read_wait, "efdread"); 96 cv_init(&efd->efd_write_wait, "efdwrite"); 97 selinit(&efd->efd_read_sel); 98 selinit(&efd->efd_write_sel); 99 efd->efd_val = val; 100 efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE); 101 getnanotime(&efd->efd_btime); 102 103 /* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */ 104 105 return efd; 106} 107 108/* 109 * eventfd_destroy: 110 * 111 * Destroy an eventfd object. 
112 */ 113static void 114eventfd_destroy(struct eventfd * const efd) 115{ 116 117 KASSERT(efd->efd_nwaiters == 0); 118 119 cv_destroy(&efd->efd_read_wait); 120 cv_destroy(&efd->efd_write_wait); 121 122 seldestroy(&efd->efd_read_sel); 123 seldestroy(&efd->efd_write_sel); 124 125 mutex_destroy(&efd->efd_lock); 126 127 kmem_free(efd, sizeof(*efd)); 128} 129 130/* 131 * eventfd_wait: 132 * 133 * Block on an eventfd. Handles non-blocking, as well as 134 * the restart cases. 135 */ 136static int 137eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write) 138{ 139 kcondvar_t *waitcv; 140 int error; 141 142 if (fflag & FNONBLOCK) { 143 return EAGAIN; 144 } 145 146 /* 147 * We're going to block. Check if we need to return ERESTART. 148 */ 149 if (efd->efd_restarting) { 150 return ERESTART; 151 } 152 153 if (is_write) { 154 waitcv = &efd->efd_write_wait; 155 } else { 156 waitcv = &efd->efd_read_wait; 157 } 158 159 efd->efd_nwaiters++; 160 KASSERT(efd->efd_nwaiters > 0); 161 error = cv_wait_sig(waitcv, &efd->efd_lock); 162 efd->efd_nwaiters--; 163 KASSERT(efd->efd_nwaiters >= 0); 164 165 /* 166 * If a restart was triggered while we were asleep, we need 167 * to return ERESTART if no other error was returned. 168 */ 169 if (efd->efd_restarting) { 170 if (error == 0) { 171 error = ERESTART; 172 } 173 } 174 175 return error; 176} 177 178/* 179 * eventfd_wake: 180 * 181 * Wake LWPs block on an eventfd. 
182 */ 183static void 184eventfd_wake(struct eventfd * const efd, bool const is_write) 185{ 186 kcondvar_t *waitcv = NULL; 187 struct selinfo *sel; 188 int pollev; 189 190 if (is_write) { 191 waitcv = &efd->efd_read_wait; 192 sel = &efd->efd_read_sel; 193 pollev = POLLIN | POLLRDNORM; 194 } else { 195 waitcv = &efd->efd_write_wait; 196 sel = &efd->efd_write_sel; 197 pollev = POLLOUT | POLLWRNORM; 198 } 199 if (waitcv != NULL) { 200 cv_broadcast(waitcv); 201 } 202 selnotify(sel, pollev, NOTE_SUBMIT); 203} 204 205/* 206 * eventfd file operations 207 */ 208 209static int 210eventfd_fop_read(file_t * const fp, off_t * const offset, 211 struct uio * const uio, kauth_cred_t const cred, int const flags) 212{ 213 struct eventfd * const efd = fp->f_eventfd; 214 int const fflag = fp->f_flag; 215 eventfd_t return_value; 216 int error; 217 218 if (uio->uio_resid < sizeof(eventfd_t)) { 219 return EINVAL; 220 } 221 222 mutex_enter(&efd->efd_lock); 223 224 while (efd->efd_val == 0) { 225 if ((error = eventfd_wait(efd, fflag, false)) != 0) { 226 mutex_exit(&efd->efd_lock); 227 return error; 228 } 229 } 230 231 if (efd->efd_is_semaphore) { 232 return_value = 1; 233 efd->efd_val--; 234 } else { 235 return_value = efd->efd_val; 236 efd->efd_val = 0; 237 } 238 239 getnanotime(&efd->efd_atime); 240 eventfd_wake(efd, false); 241 242 mutex_exit(&efd->efd_lock); 243 244 error = uiomove(&return_value, sizeof(return_value), uio); 245 246 return error; 247} 248 249static int 250eventfd_fop_write(file_t * const fp, off_t * const offset, 251 struct uio * const uio, kauth_cred_t const cred, int const flags) 252{ 253 struct eventfd * const efd = fp->f_eventfd; 254 int const fflag = fp->f_flag; 255 eventfd_t write_value; 256 int error; 257 258 if (uio->uio_resid < sizeof(eventfd_t)) { 259 return EINVAL; 260 } 261 262 if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) { 263 return error; 264 } 265 266 if (write_value > EVENTFD_MAXVAL) { 267 error = EINVAL; 268 goto out; 269 } 270 
271 mutex_enter(&efd->efd_lock); 272 273 KASSERT(efd->efd_val <= EVENTFD_MAXVAL); 274 while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) { 275 if ((error = eventfd_wait(efd, fflag, true)) != 0) { 276 mutex_exit(&efd->efd_lock); 277 goto out; 278 } 279 } 280 281 efd->efd_val += write_value; 282 KASSERT(efd->efd_val <= EVENTFD_MAXVAL); 283 284 getnanotime(&efd->efd_mtime); 285 eventfd_wake(efd, true); 286 287 mutex_exit(&efd->efd_lock); 288 289 out: 290 if (error) { 291 /* 292 * Undo the effect of uiomove() so that the error 293 * gets reported correctly; see dofilewrite(). 294 */ 295 uio->uio_resid += sizeof(write_value); 296 } 297 return error; 298} 299 300static int 301eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data) 302{ 303 struct eventfd * const efd = fp->f_eventfd; 304 305 switch (cmd) { 306 case FIONBIO: 307 return 0; 308 309 case FIONREAD: 310 mutex_enter(&efd->efd_lock); 311 *(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0; 312 mutex_exit(&efd->efd_lock); 313 return 0; 314 315 case FIONWRITE: 316 *(int *)data = 0; 317 return 0; 318 319 case FIONSPACE: 320 /* 321 * FIONSPACE doesn't really work for eventfd, because the 322 * writability depends on the contents (value) being written. 323 */ 324 break; 325 326 default: 327 break; 328 } 329 330 return EPASSTHROUGH; 331} 332 333static int 334eventfd_fop_poll(file_t * const fp, int const events) 335{ 336 struct eventfd * const efd = fp->f_eventfd; 337 int revents = 0; 338 339 /* 340 * Note that Linux will return POLLERR if the eventfd count 341 * overflows, but that is not possible in the normal read/write 342 * API, only with Linux kernel-internal interfaces. So, this 343 * implementation never returns POLLERR. 344 * 345 * Also note that the Linux eventfd(2) man page does not 346 * specifically discuss returning POLLRDNORM, but we check 347 * for that event in addition to POLLIN. 
348 */ 349 350 mutex_enter(&efd->efd_lock); 351 352 if (events & (POLLIN | POLLRDNORM)) { 353 if (efd->efd_val != 0) { 354 revents |= events & (POLLIN | POLLRDNORM); 355 } else { 356 selrecord(curlwp, &efd->efd_read_sel); 357 } 358 } 359 360 if (events & (POLLOUT | POLLWRNORM)) { 361 if (efd->efd_val < EVENTFD_MAXVAL) { 362 revents |= events & (POLLOUT | POLLWRNORM); 363 } else { 364 selrecord(curlwp, &efd->efd_write_sel); 365 } 366 } 367 368 mutex_exit(&efd->efd_lock); 369 370 return revents; 371} 372 373static int 374eventfd_fop_stat(file_t * const fp, struct stat * const st) 375{ 376 struct eventfd * const efd = fp->f_eventfd; 377 378 memset(st, 0, sizeof(*st)); 379 380 mutex_enter(&efd->efd_lock); 381 st->st_size = (off_t)efd->efd_val; 382 st->st_blksize = sizeof(eventfd_t); 383 st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR; 384 st->st_blocks = 1; 385 st->st_birthtimespec = st->st_ctimespec = efd->efd_btime; 386 st->st_atimespec = efd->efd_atime; 387 st->st_mtimespec = efd->efd_mtime; 388 st->st_uid = kauth_cred_geteuid(fp->f_cred); 389 st->st_gid = kauth_cred_getegid(fp->f_cred); 390 mutex_exit(&efd->efd_lock); 391 392 return 0; 393} 394 395static int 396eventfd_fop_close(file_t * const fp) 397{ 398 struct eventfd * const efd = fp->f_eventfd; 399 400 fp->f_eventfd = NULL; 401 eventfd_destroy(efd); 402 403 return 0; 404} 405 406static void 407eventfd_filt_read_detach(struct knote * const kn) 408{ 409 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 410 411 mutex_enter(&efd->efd_lock); 412 KASSERT(kn->kn_hook == efd); 413 selremove_knote(&efd->efd_read_sel, kn); 414 mutex_exit(&efd->efd_lock); 415} 416 417static int 418eventfd_filt_read(struct knote * const kn, long const hint) 419{ 420 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 421 int rv; 422 423 if (hint & NOTE_SUBMIT) { 424 KASSERT(mutex_owned(&efd->efd_lock)); 425 } else { 426 mutex_enter(&efd->efd_lock); 427 } 428 429 kn->kn_data = (int64_t)efd->efd_val; 430 rv = 
(eventfd_t)kn->kn_data > 0; 431 432 if ((hint & NOTE_SUBMIT) == 0) { 433 mutex_exit(&efd->efd_lock); 434 } 435 436 return rv; 437} 438 439static const struct filterops eventfd_read_filterops = { 440 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 441 .f_detach = eventfd_filt_read_detach, 442 .f_event = eventfd_filt_read, 443}; 444 445static void 446eventfd_filt_write_detach(struct knote * const kn) 447{ 448 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 449 450 mutex_enter(&efd->efd_lock); 451 KASSERT(kn->kn_hook == efd); 452 selremove_knote(&efd->efd_write_sel, kn); 453 mutex_exit(&efd->efd_lock); 454} 455 456static int 457eventfd_filt_write(struct knote * const kn, long const hint) 458{ 459 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 460 int rv; 461 462 if (hint & NOTE_SUBMIT) { 463 KASSERT(mutex_owned(&efd->efd_lock)); 464 } else { 465 mutex_enter(&efd->efd_lock); 466 } 467 468 kn->kn_data = (int64_t)efd->efd_val; 469 rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL; 470 471 if ((hint & NOTE_SUBMIT) == 0) { 472 mutex_exit(&efd->efd_lock); 473 } 474 475 return rv; 476} 477 478static const struct filterops eventfd_write_filterops = { 479 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 480 .f_detach = eventfd_filt_write_detach, 481 .f_event = eventfd_filt_write, 482}; 483 484static int 485eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn) 486{ 487 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 488 struct selinfo *sel; 489 490 switch (kn->kn_filter) { 491 case EVFILT_READ: 492 sel = &efd->efd_read_sel; 493 kn->kn_fop = &eventfd_read_filterops; 494 break; 495 496 case EVFILT_WRITE: 497 sel = &efd->efd_write_sel; 498 kn->kn_fop = &eventfd_write_filterops; 499 break; 500 501 default: 502 return EINVAL; 503 } 504 505 kn->kn_hook = efd; 506 507 mutex_enter(&efd->efd_lock); 508 selrecord_knote(sel, kn); 509 mutex_exit(&efd->efd_lock); 510 511 return 0; 512} 513 514static void 515eventfd_fop_restart(file_t * const fp) 
516{ 517 struct eventfd * const efd = fp->f_eventfd; 518 519 /* 520 * Unblock blocked reads/writes in order to allow close() to complete. 521 * System calls return ERESTART so that the fd is revalidated. 522 */ 523 524 mutex_enter(&efd->efd_lock); 525 526 if (efd->efd_nwaiters != 0) { 527 efd->efd_restarting = true; 528 cv_broadcast(&efd->efd_read_wait); 529 cv_broadcast(&efd->efd_write_wait); 530 } 531 532 mutex_exit(&efd->efd_lock); 533} 534 535static const struct fileops eventfd_fileops = { 536 .fo_name = "eventfd", 537 .fo_read = eventfd_fop_read, 538 .fo_write = eventfd_fop_write, 539 .fo_ioctl = eventfd_ioctl, 540 .fo_fcntl = fnullop_fcntl, 541 .fo_poll = eventfd_fop_poll, 542 .fo_stat = eventfd_fop_stat, 543 .fo_close = eventfd_fop_close, 544 .fo_kqfilter = eventfd_fop_kqfilter, 545 .fo_restart = eventfd_fop_restart, 546}; 547 548/* 549 * eventfd(2) system call 550 */ 551int 552do_eventfd(struct lwp * const l, unsigned int const val, int const flags, 553 register_t *retval) 554{ 555 file_t *fp; 556 int fd, error; 557 558 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) { 559 return EINVAL; 560 } 561 562 if ((error = fd_allocfile(&fp, &fd)) != 0) { 563 return error; 564 } 565 566 fp->f_flag = FREAD | FWRITE; 567 if (flags & EFD_NONBLOCK) { 568 fp->f_flag |= FNONBLOCK; 569 } 570 fp->f_type = DTYPE_EVENTFD; 571 fp->f_ops = &eventfd_fileops; 572 fp->f_eventfd = eventfd_create(val, flags); 573 fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC)); 574 fd_affix(curproc, fp, fd); 575 576 *retval = fd; 577 return 0; 578} 579 580int 581sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap, 582 register_t *retval) 583{ 584 /* { 585 syscallarg(unsigned int) val; 586 syscallarg(int) flags; 587 } */ 588 589 return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval); 590} 591