sys_eventfd.c revision 1.10
1/*	$NetBSD: sys_eventfd.c,v 1.10 2023/11/19 04:13:37 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.10 2023/11/19 04:13:37 riastradh Exp $");
34
35/*
36 * eventfd
37 *
38 * Eventfd objects present a simple counting object associated with a
39 * file descriptor.  Writes and reads to this file descriptor increment
40 * and decrement the count, respectively.  When the count is non-zero,
41 * the descriptor is considered "readable", and when less than the max
42 * value (EVENTFD_MAXVAL), is considered "writable".
43 *
44 * This implementation is API compatible with the Linux eventfd(2)
45 * interface.
46 */
47
48#include <sys/param.h>
49#include <sys/types.h>
50#include <sys/condvar.h>
51#include <sys/eventfd.h>
52#include <sys/file.h>
53#include <sys/filedesc.h>
54#include <sys/kauth.h>
55#include <sys/mutex.h>
56#include <sys/poll.h>
57#include <sys/proc.h>
58#include <sys/select.h>
59#include <sys/stat.h>
60#include <sys/syscallargs.h>
61#include <sys/uio.h>
62
63struct eventfd {
64	kmutex_t	efd_lock;
65	kcondvar_t	efd_read_wait;
66	kcondvar_t	efd_write_wait;
67	struct selinfo	efd_read_sel;
68	struct selinfo	efd_write_sel;
69	eventfd_t	efd_val;
70	int64_t		efd_nwaiters;
71	bool		efd_restarting;
72	bool		efd_is_semaphore;
73
74	/*
75	 * Information kept for stat(2).
76	 */
77	struct timespec efd_btime;	/* time created */
78	struct timespec	efd_mtime;	/* last write */
79	struct timespec	efd_atime;	/* last read */
80};
81
/*
 * Largest value the counter may hold.  A write of more than this is
 * rejected with EINVAL; a write that would push the counter past it
 * has to wait for a reader to make room.
 */
#define	EVENTFD_MAXVAL	(UINT64_MAX - 1)
83
84/*
85 * eventfd_create:
86 *
87 *	Create an eventfd object.
88 */
89static struct eventfd *
90eventfd_create(unsigned int const val, int const flags)
91{
92	struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);
93
94	mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
95	cv_init(&efd->efd_read_wait, "efdread");
96	cv_init(&efd->efd_write_wait, "efdwrite");
97	selinit(&efd->efd_read_sel);
98	selinit(&efd->efd_write_sel);
99	efd->efd_val = val;
100	efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
101	getnanotime(&efd->efd_btime);
102
103	/* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */
104
105	return efd;
106}
107
108/*
109 * eventfd_destroy:
110 *
111 *	Destroy an eventfd object.
112 */
113static void
114eventfd_destroy(struct eventfd * const efd)
115{
116
117	KASSERT(efd->efd_nwaiters == 0);
118
119	cv_destroy(&efd->efd_read_wait);
120	cv_destroy(&efd->efd_write_wait);
121
122	seldestroy(&efd->efd_read_sel);
123	seldestroy(&efd->efd_write_sel);
124
125	mutex_destroy(&efd->efd_lock);
126
127	kmem_free(efd, sizeof(*efd));
128}
129
130/*
131 * eventfd_wait:
132 *
133 *	Block on an eventfd.  Handles non-blocking, as well as
134 *	the restart cases.
135 */
136static int
137eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
138{
139	kcondvar_t *waitcv;
140	int error;
141
142	if (fflag & FNONBLOCK) {
143		return EAGAIN;
144	}
145
146	/*
147	 * We're going to block.  Check if we need to return ERESTART.
148	 */
149	if (efd->efd_restarting) {
150		return ERESTART;
151	}
152
153	if (is_write) {
154		waitcv = &efd->efd_write_wait;
155	} else {
156		waitcv = &efd->efd_read_wait;
157	}
158
159	efd->efd_nwaiters++;
160	KASSERT(efd->efd_nwaiters > 0);
161	error = cv_wait_sig(waitcv, &efd->efd_lock);
162	efd->efd_nwaiters--;
163	KASSERT(efd->efd_nwaiters >= 0);
164
165	/*
166	 * If a restart was triggered while we were asleep, we need
167	 * to return ERESTART if no other error was returned.
168	 */
169	if (efd->efd_restarting) {
170		if (error == 0) {
171			error = ERESTART;
172		}
173	}
174
175	return error;
176}
177
178/*
179 * eventfd_wake:
180 *
181 *	Wake LWPs block on an eventfd.
182 */
183static void
184eventfd_wake(struct eventfd * const efd, bool const is_write)
185{
186	kcondvar_t *waitcv = NULL;
187	struct selinfo *sel;
188	int pollev;
189
190	if (is_write) {
191		waitcv = &efd->efd_read_wait;
192		sel = &efd->efd_read_sel;
193		pollev = POLLIN | POLLRDNORM;
194	} else {
195		waitcv = &efd->efd_write_wait;
196		sel = &efd->efd_write_sel;
197		pollev = POLLOUT | POLLWRNORM;
198	}
199	if (waitcv != NULL) {
200		cv_broadcast(waitcv);
201	}
202	selnotify(sel, pollev, NOTE_SUBMIT);
203}
204
205/*
206 * eventfd file operations
207 */
208
209static int
210eventfd_fop_read(file_t * const fp, off_t * const offset,
211    struct uio * const uio, kauth_cred_t const cred, int const flags)
212{
213	struct eventfd * const efd = fp->f_eventfd;
214	int const fflag = fp->f_flag;
215	eventfd_t return_value;
216	int error;
217
218	if (uio->uio_resid < sizeof(eventfd_t)) {
219		return EINVAL;
220	}
221
222	mutex_enter(&efd->efd_lock);
223
224	while (efd->efd_val == 0) {
225		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
226			mutex_exit(&efd->efd_lock);
227			return error;
228		}
229	}
230
231	if (efd->efd_is_semaphore) {
232		return_value = 1;
233		efd->efd_val--;
234	} else {
235		return_value = efd->efd_val;
236		efd->efd_val = 0;
237	}
238
239	getnanotime(&efd->efd_atime);
240	eventfd_wake(efd, false);
241
242	mutex_exit(&efd->efd_lock);
243
244	error = uiomove(&return_value, sizeof(return_value), uio);
245
246	return error;
247}
248
249static int
250eventfd_fop_write(file_t * const fp, off_t * const offset,
251    struct uio * const uio, kauth_cred_t const cred, int const flags)
252{
253	struct eventfd * const efd = fp->f_eventfd;
254	int const fflag = fp->f_flag;
255	eventfd_t write_value;
256	int error;
257
258	if (uio->uio_resid < sizeof(eventfd_t)) {
259		return EINVAL;
260	}
261
262	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
263		return error;
264	}
265
266	if (write_value > EVENTFD_MAXVAL) {
267		error = EINVAL;
268		goto out;
269	}
270
271	mutex_enter(&efd->efd_lock);
272
273	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
274	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
275		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
276			mutex_exit(&efd->efd_lock);
277			goto out;
278		}
279	}
280
281	efd->efd_val += write_value;
282	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
283
284	getnanotime(&efd->efd_mtime);
285	eventfd_wake(efd, true);
286
287	mutex_exit(&efd->efd_lock);
288
289 out:
290	if (error) {
291		/*
292		 * Undo the effect of uiomove() so that the error
293		 * gets reported correctly; see dofilewrite().
294		 */
295		uio->uio_resid += sizeof(write_value);
296	}
297	return error;
298}
299
300static int
301eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data)
302{
303	struct eventfd * const efd = fp->f_eventfd;
304
305	switch (cmd) {
306	case FIONBIO:
307		return 0;
308
309	case FIONREAD:
310		mutex_enter(&efd->efd_lock);
311		*(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0;
312		mutex_exit(&efd->efd_lock);
313		return 0;
314
315	case FIONWRITE:
316		*(int *)data = 0;
317		return 0;
318
319	case FIONSPACE:
320		/*
321		 * FIONSPACE doesn't really work for eventfd, because the
322		 * writability depends on the contents (value) being written.
323		 */
324		break;
325
326	default:
327		break;
328	}
329
330	return EPASSTHROUGH;
331}
332
333static int
334eventfd_fop_poll(file_t * const fp, int const events)
335{
336	struct eventfd * const efd = fp->f_eventfd;
337	int revents = 0;
338
339	/*
340	 * Note that Linux will return POLLERR if the eventfd count
341	 * overflows, but that is not possible in the normal read/write
342	 * API, only with Linux kernel-internal interfaces.  So, this
343	 * implementation never returns POLLERR.
344	 *
345	 * Also note that the Linux eventfd(2) man page does not
346	 * specifically discuss returning POLLRDNORM, but we check
347	 * for that event in addition to POLLIN.
348	 */
349
350	mutex_enter(&efd->efd_lock);
351
352	if (events & (POLLIN | POLLRDNORM)) {
353		if (efd->efd_val != 0) {
354			revents |= events & (POLLIN | POLLRDNORM);
355		} else {
356			selrecord(curlwp, &efd->efd_read_sel);
357		}
358	}
359
360	if (events & (POLLOUT | POLLWRNORM)) {
361		if (efd->efd_val < EVENTFD_MAXVAL) {
362			revents |= events & (POLLOUT | POLLWRNORM);
363		} else {
364			selrecord(curlwp, &efd->efd_write_sel);
365		}
366	}
367
368	mutex_exit(&efd->efd_lock);
369
370	return revents;
371}
372
373static int
374eventfd_fop_stat(file_t * const fp, struct stat * const st)
375{
376	struct eventfd * const efd = fp->f_eventfd;
377
378	memset(st, 0, sizeof(*st));
379
380	mutex_enter(&efd->efd_lock);
381	st->st_size = (off_t)efd->efd_val;
382	st->st_blksize = sizeof(eventfd_t);
383	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
384	st->st_blocks = 1;
385	st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
386	st->st_atimespec = efd->efd_atime;
387	st->st_mtimespec = efd->efd_mtime;
388	st->st_uid = kauth_cred_geteuid(fp->f_cred);
389	st->st_gid = kauth_cred_getegid(fp->f_cred);
390	mutex_exit(&efd->efd_lock);
391
392	return 0;
393}
394
395static int
396eventfd_fop_close(file_t * const fp)
397{
398	struct eventfd * const efd = fp->f_eventfd;
399
400	fp->f_eventfd = NULL;
401	eventfd_destroy(efd);
402
403	return 0;
404}
405
406static void
407eventfd_filt_read_detach(struct knote * const kn)
408{
409	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
410
411	mutex_enter(&efd->efd_lock);
412	KASSERT(kn->kn_hook == efd);
413	selremove_knote(&efd->efd_read_sel, kn);
414	mutex_exit(&efd->efd_lock);
415}
416
417static int
418eventfd_filt_read(struct knote * const kn, long const hint)
419{
420	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
421	int rv;
422
423	if (hint & NOTE_SUBMIT) {
424		KASSERT(mutex_owned(&efd->efd_lock));
425	} else {
426		mutex_enter(&efd->efd_lock);
427	}
428
429	kn->kn_data = (int64_t)efd->efd_val;
430	rv = (eventfd_t)kn->kn_data > 0;
431
432	if ((hint & NOTE_SUBMIT) == 0) {
433		mutex_exit(&efd->efd_lock);
434	}
435
436	return rv;
437}
438
439static const struct filterops eventfd_read_filterops = {
440	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
441	.f_detach = eventfd_filt_read_detach,
442	.f_event = eventfd_filt_read,
443};
444
445static void
446eventfd_filt_write_detach(struct knote * const kn)
447{
448	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
449
450	mutex_enter(&efd->efd_lock);
451	KASSERT(kn->kn_hook == efd);
452	selremove_knote(&efd->efd_write_sel, kn);
453	mutex_exit(&efd->efd_lock);
454}
455
456static int
457eventfd_filt_write(struct knote * const kn, long const hint)
458{
459	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
460	int rv;
461
462	if (hint & NOTE_SUBMIT) {
463		KASSERT(mutex_owned(&efd->efd_lock));
464	} else {
465		mutex_enter(&efd->efd_lock);
466	}
467
468	kn->kn_data = (int64_t)efd->efd_val;
469	rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;
470
471	if ((hint & NOTE_SUBMIT) == 0) {
472		mutex_exit(&efd->efd_lock);
473	}
474
475	return rv;
476}
477
478static const struct filterops eventfd_write_filterops = {
479	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
480	.f_detach = eventfd_filt_write_detach,
481	.f_event = eventfd_filt_write,
482};
483
484static int
485eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
486{
487	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
488	struct selinfo *sel;
489
490	switch (kn->kn_filter) {
491	case EVFILT_READ:
492		sel = &efd->efd_read_sel;
493		kn->kn_fop = &eventfd_read_filterops;
494		break;
495
496	case EVFILT_WRITE:
497		sel = &efd->efd_write_sel;
498		kn->kn_fop = &eventfd_write_filterops;
499		break;
500
501	default:
502		return EINVAL;
503	}
504
505	kn->kn_hook = efd;
506
507	mutex_enter(&efd->efd_lock);
508	selrecord_knote(sel, kn);
509	mutex_exit(&efd->efd_lock);
510
511	return 0;
512}
513
514static void
515eventfd_fop_restart(file_t * const fp)
516{
517	struct eventfd * const efd = fp->f_eventfd;
518
519	/*
520	 * Unblock blocked reads/writes in order to allow close() to complete.
521	 * System calls return ERESTART so that the fd is revalidated.
522	 */
523
524	mutex_enter(&efd->efd_lock);
525
526	if (efd->efd_nwaiters != 0) {
527		efd->efd_restarting = true;
528		cv_broadcast(&efd->efd_read_wait);
529		cv_broadcast(&efd->efd_write_wait);
530	}
531
532	mutex_exit(&efd->efd_lock);
533}
534
535static const struct fileops eventfd_fileops = {
536	.fo_name = "eventfd",
537	.fo_read = eventfd_fop_read,
538	.fo_write = eventfd_fop_write,
539	.fo_ioctl = eventfd_ioctl,
540	.fo_fcntl = fnullop_fcntl,
541	.fo_poll = eventfd_fop_poll,
542	.fo_stat = eventfd_fop_stat,
543	.fo_close = eventfd_fop_close,
544	.fo_kqfilter = eventfd_fop_kqfilter,
545	.fo_restart = eventfd_fop_restart,
546};
547
548/*
549 * eventfd(2) system call
550 */
551int
552do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
553    register_t *retval)
554{
555	file_t *fp;
556	int fd, error;
557
558	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
559		return EINVAL;
560	}
561
562	if ((error = fd_allocfile(&fp, &fd)) != 0) {
563		return error;
564	}
565
566	fp->f_flag = FREAD | FWRITE;
567	if (flags & EFD_NONBLOCK) {
568		fp->f_flag |= FNONBLOCK;
569	}
570	fp->f_type = DTYPE_EVENTFD;
571	fp->f_ops = &eventfd_fileops;
572	fp->f_eventfd = eventfd_create(val, flags);
573	fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
574	fd_affix(curproc, fp, fd);
575
576	*retval = fd;
577	return 0;
578}
579
580int
581sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
582    register_t *retval)
583{
584	/* {
585		syscallarg(unsigned int) val;
586		syscallarg(int) flags;
587	} */
588
589	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
590}
591