/* sys_eventfd.c, revision 1.11 */
1/* $NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32#include <sys/cdefs.h> 33__KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $"); 34 35/* 36 * eventfd 37 * 38 * Eventfd objects present a simple counting object associated with a 39 * file descriptor. Writes and reads to this file descriptor increment 40 * and decrement the count, respectively. 
When the count is non-zero, 41 * the descriptor is considered "readable", and when less than the max 42 * value (EVENTFD_MAXVAL), is considered "writable". 43 * 44 * This implementation is API compatible with the Linux eventfd(2) 45 * interface. 46 */ 47 48#include <sys/param.h> 49#include <sys/types.h> 50#include <sys/condvar.h> 51#include <sys/eventfd.h> 52#include <sys/file.h> 53#include <sys/filedesc.h> 54#include <sys/kauth.h> 55#include <sys/mutex.h> 56#include <sys/poll.h> 57#include <sys/proc.h> 58#include <sys/select.h> 59#include <sys/stat.h> 60#include <sys/syscallargs.h> 61#include <sys/uio.h> 62 63struct eventfd { 64 kmutex_t efd_lock; 65 kcondvar_t efd_read_wait; 66 kcondvar_t efd_write_wait; 67 struct selinfo efd_read_sel; 68 struct selinfo efd_write_sel; 69 eventfd_t efd_val; 70 int64_t efd_nwaiters; 71 bool efd_restarting; 72 bool efd_is_semaphore; 73 74 /* 75 * Information kept for stat(2). 76 */ 77 struct timespec efd_btime; /* time created */ 78 struct timespec efd_mtime; /* last write */ 79 struct timespec efd_atime; /* last read */ 80}; 81 82#define EVENTFD_MAXVAL (UINT64_MAX - 1) 83 84/* 85 * eventfd_create: 86 * 87 * Create an eventfd object. 88 */ 89static struct eventfd * 90eventfd_create(unsigned int const val, int const flags) 91{ 92 struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP); 93 94 mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE); 95 cv_init(&efd->efd_read_wait, "efdread"); 96 cv_init(&efd->efd_write_wait, "efdwrite"); 97 selinit(&efd->efd_read_sel); 98 selinit(&efd->efd_write_sel); 99 efd->efd_val = val; 100 efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE); 101 getnanotime(&efd->efd_btime); 102 103 /* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */ 104 105 return efd; 106} 107 108/* 109 * eventfd_destroy: 110 * 111 * Destroy an eventfd object. 
 */
static void
eventfd_destroy(struct eventfd * const efd)
{

	/* No LWP may still be sleeping on either condvar. */
	KASSERT(efd->efd_nwaiters == 0);

	cv_destroy(&efd->efd_read_wait);
	cv_destroy(&efd->efd_write_wait);

	seldestroy(&efd->efd_read_sel);
	seldestroy(&efd->efd_write_sel);

	mutex_destroy(&efd->efd_lock);

	kmem_free(efd, sizeof(*efd));
}

/*
 * eventfd_wait:
 *
 *	Block on an eventfd.  Handles non-blocking, as well as
 *	the restart cases.
 *
 *	Called with efd_lock held; cv_wait_sig() drops and re-acquires
 *	it around the sleep.  Returns 0 on a normal wakeup, EAGAIN for
 *	non-blocking descriptors, ERESTART when fo_restart is in
 *	progress, or the error from cv_wait_sig().
 */
static int
eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
{
	kcondvar_t *waitcv;
	int error;

	if (fflag & FNONBLOCK) {
		return EAGAIN;
	}

	/*
	 * We're going to block.  Check if we need to return ERESTART.
	 */
	if (efd->efd_restarting) {
		return ERESTART;
	}

	/* Writers sleep on the write cv, readers on the read cv. */
	if (is_write) {
		waitcv = &efd->efd_write_wait;
	} else {
		waitcv = &efd->efd_read_wait;
	}

	/* Account for ourselves so fo_restart knows to broadcast. */
	efd->efd_nwaiters++;
	KASSERT(efd->efd_nwaiters > 0);
	error = cv_wait_sig(waitcv, &efd->efd_lock);
	efd->efd_nwaiters--;
	KASSERT(efd->efd_nwaiters >= 0);

	/*
	 * If a restart was triggered while we were asleep, we need
	 * to return ERESTART if no other error was returned.
	 */
	if (efd->efd_restarting) {
		if (error == 0) {
			error = ERESTART;
		}
	}

	return error;
}

/*
 * eventfd_wake:
 *
 *	Wake LWPs blocked on an eventfd.
 */
static void
eventfd_wake(struct eventfd * const efd, bool const is_write)
{
	kcondvar_t *waitcv = NULL;
	struct selinfo *sel;
	int pollev;

	/*
	 * A completed write makes the object readable, so it wakes the
	 * readers; a completed read makes it writable, so it wakes the
	 * writers.
	 */
	if (is_write) {
		waitcv = &efd->efd_read_wait;
		sel = &efd->efd_read_sel;
		pollev = POLLIN | POLLRDNORM;
	} else {
		waitcv = &efd->efd_write_wait;
		sel = &efd->efd_write_sel;
		pollev = POLLOUT | POLLWRNORM;
	}
	/* NOTE_SUBMIT: caller holds efd_lock (see the knote filters). */
	cv_broadcast(waitcv);
	selnotify(sel, pollev, NOTE_SUBMIT);
}

/*
 * eventfd file operations
 */

static int
eventfd_fop_read(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t return_value;
	int error;

	/* The buffer must be able to hold the full 64-bit value. */
	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	mutex_enter(&efd->efd_lock);

	/* Sleep until the count is non-zero (or fail / restart). */
	while (efd->efd_val == 0) {
		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
			mutex_exit(&efd->efd_lock);
			return error;
		}
	}

	if (efd->efd_is_semaphore) {
		/* Semaphore semantics: return 1, decrement by 1. */
		return_value = 1;
		efd->efd_val--;
	} else {
		/* Normal semantics: return the count, reset it to 0. */
		return_value = efd->efd_val;
		efd->efd_val = 0;
	}

	getnanotime(&efd->efd_atime);
	/* The count went down; blocked writers may now have room. */
	eventfd_wake(efd, false);

	mutex_exit(&efd->efd_lock);

	/* Copy out with the lock dropped. */
	error = uiomove(&return_value, sizeof(return_value), uio);

	return error;
}

static int
eventfd_fop_write(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t write_value;
	int error;

	/* The caller must supply a full 64-bit value. */
	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
		return error;
	}

	/* Values above EVENTFD_MAXVAL are rejected outright. */
	if (write_value > EVENTFD_MAXVAL) {
		error = EINVAL;
		goto out;
	}

	mutex_enter(&efd->efd_lock);

	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
	/* Sleep until there is room to add write_value to the count. */
	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
			mutex_exit(&efd->efd_lock);
			goto out;
		}
	}

	efd->efd_val += write_value;
	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);

	getnanotime(&efd->efd_mtime);
	/* The count went up; blocked readers may now proceed. */
	eventfd_wake(efd, true);

	mutex_exit(&efd->efd_lock);

 out:
	if (error) {
		/*
		 * Undo the effect of uiomove() so that the error
		 * gets reported correctly; see dofilewrite().
		 */
		uio->uio_resid += sizeof(write_value);
	}
	return error;
}

static int
eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data)
{
	struct eventfd * const efd = fp->f_eventfd;

	switch (cmd) {
	case FIONBIO:
		/* Non-blocking state lives in fp->f_flag; nothing to do. */
		return 0;

	case FIONREAD:
		/* One eventfd_t is readable iff the count is non-zero. */
		mutex_enter(&efd->efd_lock);
		*(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0;
		mutex_exit(&efd->efd_lock);
		return 0;

	case FIONWRITE:
		/* There is never buffered, not-yet-written data. */
		*(int *)data = 0;
		return 0;

	case FIONSPACE:
		/*
		 * FIONSPACE doesn't really work for eventfd, because the
		 * writability depends on the contents (value) being written.
		 */
		break;

	default:
		break;
	}

	return EPASSTHROUGH;
}

static int
eventfd_fop_poll(file_t * const fp, int const events)
{
	struct eventfd * const efd = fp->f_eventfd;
	int revents = 0;

	/*
	 * Note that Linux will return POLLERR if the eventfd count
	 * overflows, but that is not possible in the normal read/write
	 * API, only with Linux kernel-internal interfaces.  So, this
	 * implementation never returns POLLERR.
	 *
	 * Also note that the Linux eventfd(2) man page does not
	 * specifically discuss returning POLLRDNORM, but we check
	 * for that event in addition to POLLIN.
	 */

	mutex_enter(&efd->efd_lock);

	if (events & (POLLIN | POLLRDNORM)) {
		if (efd->efd_val != 0) {
			/* Readable: the count is non-zero. */
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &efd->efd_read_sel);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (efd->efd_val < EVENTFD_MAXVAL) {
			/* Writable: at least a write of 1 would fit. */
			revents |= events & (POLLOUT | POLLWRNORM);
		} else {
			selrecord(curlwp, &efd->efd_write_sel);
		}
	}

	mutex_exit(&efd->efd_lock);

	return revents;
}

static int
eventfd_fop_stat(file_t * const fp, struct stat * const st)
{
	struct eventfd * const efd = fp->f_eventfd;

	memset(st, 0, sizeof(*st));

	/* Snapshot the counter and timestamps under the lock. */
	mutex_enter(&efd->efd_lock);
	st->st_size = (off_t)efd->efd_val;
	st->st_blksize = sizeof(eventfd_t);
	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	st->st_blocks = 1;
	st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
	st->st_atimespec = efd->efd_atime;
	st->st_mtimespec = efd->efd_mtime;
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	mutex_exit(&efd->efd_lock);

	return 0;
}

static int
eventfd_fop_close(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	fp->f_eventfd = NULL;
	eventfd_destroy(efd);

	return 0;
}

static void
eventfd_filt_read_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_read_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_read(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	int rv;

	/*
	 * NOTE_SUBMIT means we were called from selnotify() in
	 * eventfd_wake() with efd_lock already held; otherwise we
	 * must take it ourselves.
	 */
	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	/* Report the counter value; readable when it is non-zero. */
	kn->kn_data = (int64_t)efd->efd_val;
	rv = (eventfd_t)kn->kn_data > 0;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return rv;
}

static const struct filterops eventfd_read_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_read_detach,
	.f_event = eventfd_filt_read,
};

static void
eventfd_filt_write_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_write_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_write(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	int rv;

	/* Same NOTE_SUBMIT locking protocol as eventfd_filt_read(). */
	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	/* Writable while the counter is below the maximum. */
	kn->kn_data = (int64_t)efd->efd_val;
	rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return rv;
}

static const struct filterops eventfd_write_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_write_detach,
	.f_event = eventfd_filt_write,
};

static int
eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	struct selinfo *sel;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		sel = &efd->efd_read_sel;
		kn->kn_fop = &eventfd_read_filterops;
		break;

	case EVFILT_WRITE:
		sel = &efd->efd_write_sel;
		kn->kn_fop = &eventfd_write_filterops;
		break;

	default:
		return EINVAL;
	}

	kn->kn_hook = efd;

	mutex_enter(&efd->efd_lock);
	selrecord_knote(sel, kn);
	mutex_exit(&efd->efd_lock);

	return 0;
}

static void
eventfd_fop_restart(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	/*
	 * Unblock blocked reads/writes in order to allow close() to complete.
	 * System calls return ERESTART so that the fd is revalidated.
	 */

	mutex_enter(&efd->efd_lock);

	if (efd->efd_nwaiters != 0) {
		efd->efd_restarting = true;
		cv_broadcast(&efd->efd_read_wait);
		cv_broadcast(&efd->efd_write_wait);
	}

	mutex_exit(&efd->efd_lock);
}

static const struct fileops eventfd_fileops = {
	.fo_name = "eventfd",
	.fo_read = eventfd_fop_read,
	.fo_write = eventfd_fop_write,
	.fo_ioctl = eventfd_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = eventfd_fop_poll,
	.fo_stat = eventfd_fop_stat,
	.fo_close = eventfd_fop_close,
	.fo_kqfilter = eventfd_fop_kqfilter,
	.fo_restart = eventfd_fop_restart,
};

/*
 * eventfd(2) system call
 */
int
do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
    register_t *retval)
{
	file_t *fp;
	int fd, error;

	/* Only EFD_CLOEXEC, EFD_NONBLOCK, and EFD_SEMAPHORE are valid. */
	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
		return EINVAL;
	}

	if ((error = fd_allocfile(&fp, &fd)) != 0) {
		return error;
	}

	fp->f_flag = FREAD | FWRITE;
	if (flags & EFD_NONBLOCK) {
		fp->f_flag |= FNONBLOCK;
	}
	fp->f_type = DTYPE_EVENTFD;
	fp->f_ops = &eventfd_fileops;
	fp->f_eventfd = eventfd_create(val, flags);
	/* Set close-on-exec before publishing the fd via fd_affix(). */
	fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
	fd_affix(curproc, fp, fd);

	*retval = fd;
	return 0;
}

int
sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(unsigned int)	val;
		syscallarg(int)			flags;
	} */

	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
}