sys_eventfd.c revision 1.11
1/*	$NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $");
34
35/*
36 * eventfd
37 *
38 * Eventfd objects present a simple counting object associated with a
39 * file descriptor.  Writes and reads to this file descriptor increment
40 * and decrement the count, respectively.  When the count is non-zero,
41 * the descriptor is considered "readable", and when less than the max
42 * value (EVENTFD_MAXVAL), is considered "writable".
43 *
44 * This implementation is API compatible with the Linux eventfd(2)
45 * interface.
46 */
47
48#include <sys/param.h>
49#include <sys/types.h>
50#include <sys/condvar.h>
51#include <sys/eventfd.h>
52#include <sys/file.h>
53#include <sys/filedesc.h>
54#include <sys/kauth.h>
55#include <sys/mutex.h>
56#include <sys/poll.h>
57#include <sys/proc.h>
58#include <sys/select.h>
59#include <sys/stat.h>
60#include <sys/syscallargs.h>
61#include <sys/uio.h>
62
63struct eventfd {
64	kmutex_t	efd_lock;
65	kcondvar_t	efd_read_wait;
66	kcondvar_t	efd_write_wait;
67	struct selinfo	efd_read_sel;
68	struct selinfo	efd_write_sel;
69	eventfd_t	efd_val;
70	int64_t		efd_nwaiters;
71	bool		efd_restarting;
72	bool		efd_is_semaphore;
73
74	/*
75	 * Information kept for stat(2).
76	 */
77	struct timespec efd_btime;	/* time created */
78	struct timespec	efd_mtime;	/* last write */
79	struct timespec	efd_atime;	/* last read */
80};
81
82#define	EVENTFD_MAXVAL	(UINT64_MAX - 1)
83
84/*
85 * eventfd_create:
86 *
87 *	Create an eventfd object.
88 */
89static struct eventfd *
90eventfd_create(unsigned int const val, int const flags)
91{
92	struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);
93
94	mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
95	cv_init(&efd->efd_read_wait, "efdread");
96	cv_init(&efd->efd_write_wait, "efdwrite");
97	selinit(&efd->efd_read_sel);
98	selinit(&efd->efd_write_sel);
99	efd->efd_val = val;
100	efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
101	getnanotime(&efd->efd_btime);
102
103	/* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */
104
105	return efd;
106}
107
108/*
109 * eventfd_destroy:
110 *
111 *	Destroy an eventfd object.
112 */
113static void
114eventfd_destroy(struct eventfd * const efd)
115{
116
117	KASSERT(efd->efd_nwaiters == 0);
118
119	cv_destroy(&efd->efd_read_wait);
120	cv_destroy(&efd->efd_write_wait);
121
122	seldestroy(&efd->efd_read_sel);
123	seldestroy(&efd->efd_write_sel);
124
125	mutex_destroy(&efd->efd_lock);
126
127	kmem_free(efd, sizeof(*efd));
128}
129
130/*
131 * eventfd_wait:
132 *
133 *	Block on an eventfd.  Handles non-blocking, as well as
134 *	the restart cases.
135 */
136static int
137eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
138{
139	kcondvar_t *waitcv;
140	int error;
141
142	if (fflag & FNONBLOCK) {
143		return EAGAIN;
144	}
145
146	/*
147	 * We're going to block.  Check if we need to return ERESTART.
148	 */
149	if (efd->efd_restarting) {
150		return ERESTART;
151	}
152
153	if (is_write) {
154		waitcv = &efd->efd_write_wait;
155	} else {
156		waitcv = &efd->efd_read_wait;
157	}
158
159	efd->efd_nwaiters++;
160	KASSERT(efd->efd_nwaiters > 0);
161	error = cv_wait_sig(waitcv, &efd->efd_lock);
162	efd->efd_nwaiters--;
163	KASSERT(efd->efd_nwaiters >= 0);
164
165	/*
166	 * If a restart was triggered while we were asleep, we need
167	 * to return ERESTART if no other error was returned.
168	 */
169	if (efd->efd_restarting) {
170		if (error == 0) {
171			error = ERESTART;
172		}
173	}
174
175	return error;
176}
177
178/*
179 * eventfd_wake:
180 *
181 *	Wake LWPs block on an eventfd.
182 */
183static void
184eventfd_wake(struct eventfd * const efd, bool const is_write)
185{
186	kcondvar_t *waitcv = NULL;
187	struct selinfo *sel;
188	int pollev;
189
190	if (is_write) {
191		waitcv = &efd->efd_read_wait;
192		sel = &efd->efd_read_sel;
193		pollev = POLLIN | POLLRDNORM;
194	} else {
195		waitcv = &efd->efd_write_wait;
196		sel = &efd->efd_write_sel;
197		pollev = POLLOUT | POLLWRNORM;
198	}
199	cv_broadcast(waitcv);
200	selnotify(sel, pollev, NOTE_SUBMIT);
201}
202
203/*
204 * eventfd file operations
205 */
206
207static int
208eventfd_fop_read(file_t * const fp, off_t * const offset,
209    struct uio * const uio, kauth_cred_t const cred, int const flags)
210{
211	struct eventfd * const efd = fp->f_eventfd;
212	int const fflag = fp->f_flag;
213	eventfd_t return_value;
214	int error;
215
216	if (uio->uio_resid < sizeof(eventfd_t)) {
217		return EINVAL;
218	}
219
220	mutex_enter(&efd->efd_lock);
221
222	while (efd->efd_val == 0) {
223		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
224			mutex_exit(&efd->efd_lock);
225			return error;
226		}
227	}
228
229	if (efd->efd_is_semaphore) {
230		return_value = 1;
231		efd->efd_val--;
232	} else {
233		return_value = efd->efd_val;
234		efd->efd_val = 0;
235	}
236
237	getnanotime(&efd->efd_atime);
238	eventfd_wake(efd, false);
239
240	mutex_exit(&efd->efd_lock);
241
242	error = uiomove(&return_value, sizeof(return_value), uio);
243
244	return error;
245}
246
247static int
248eventfd_fop_write(file_t * const fp, off_t * const offset,
249    struct uio * const uio, kauth_cred_t const cred, int const flags)
250{
251	struct eventfd * const efd = fp->f_eventfd;
252	int const fflag = fp->f_flag;
253	eventfd_t write_value;
254	int error;
255
256	if (uio->uio_resid < sizeof(eventfd_t)) {
257		return EINVAL;
258	}
259
260	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
261		return error;
262	}
263
264	if (write_value > EVENTFD_MAXVAL) {
265		error = EINVAL;
266		goto out;
267	}
268
269	mutex_enter(&efd->efd_lock);
270
271	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
272	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
273		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
274			mutex_exit(&efd->efd_lock);
275			goto out;
276		}
277	}
278
279	efd->efd_val += write_value;
280	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
281
282	getnanotime(&efd->efd_mtime);
283	eventfd_wake(efd, true);
284
285	mutex_exit(&efd->efd_lock);
286
287 out:
288	if (error) {
289		/*
290		 * Undo the effect of uiomove() so that the error
291		 * gets reported correctly; see dofilewrite().
292		 */
293		uio->uio_resid += sizeof(write_value);
294	}
295	return error;
296}
297
298static int
299eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data)
300{
301	struct eventfd * const efd = fp->f_eventfd;
302
303	switch (cmd) {
304	case FIONBIO:
305		return 0;
306
307	case FIONREAD:
308		mutex_enter(&efd->efd_lock);
309		*(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0;
310		mutex_exit(&efd->efd_lock);
311		return 0;
312
313	case FIONWRITE:
314		*(int *)data = 0;
315		return 0;
316
317	case FIONSPACE:
318		/*
319		 * FIONSPACE doesn't really work for eventfd, because the
320		 * writability depends on the contents (value) being written.
321		 */
322		break;
323
324	default:
325		break;
326	}
327
328	return EPASSTHROUGH;
329}
330
331static int
332eventfd_fop_poll(file_t * const fp, int const events)
333{
334	struct eventfd * const efd = fp->f_eventfd;
335	int revents = 0;
336
337	/*
338	 * Note that Linux will return POLLERR if the eventfd count
339	 * overflows, but that is not possible in the normal read/write
340	 * API, only with Linux kernel-internal interfaces.  So, this
341	 * implementation never returns POLLERR.
342	 *
343	 * Also note that the Linux eventfd(2) man page does not
344	 * specifically discuss returning POLLRDNORM, but we check
345	 * for that event in addition to POLLIN.
346	 */
347
348	mutex_enter(&efd->efd_lock);
349
350	if (events & (POLLIN | POLLRDNORM)) {
351		if (efd->efd_val != 0) {
352			revents |= events & (POLLIN | POLLRDNORM);
353		} else {
354			selrecord(curlwp, &efd->efd_read_sel);
355		}
356	}
357
358	if (events & (POLLOUT | POLLWRNORM)) {
359		if (efd->efd_val < EVENTFD_MAXVAL) {
360			revents |= events & (POLLOUT | POLLWRNORM);
361		} else {
362			selrecord(curlwp, &efd->efd_write_sel);
363		}
364	}
365
366	mutex_exit(&efd->efd_lock);
367
368	return revents;
369}
370
371static int
372eventfd_fop_stat(file_t * const fp, struct stat * const st)
373{
374	struct eventfd * const efd = fp->f_eventfd;
375
376	memset(st, 0, sizeof(*st));
377
378	mutex_enter(&efd->efd_lock);
379	st->st_size = (off_t)efd->efd_val;
380	st->st_blksize = sizeof(eventfd_t);
381	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
382	st->st_blocks = 1;
383	st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
384	st->st_atimespec = efd->efd_atime;
385	st->st_mtimespec = efd->efd_mtime;
386	st->st_uid = kauth_cred_geteuid(fp->f_cred);
387	st->st_gid = kauth_cred_getegid(fp->f_cred);
388	mutex_exit(&efd->efd_lock);
389
390	return 0;
391}
392
393static int
394eventfd_fop_close(file_t * const fp)
395{
396	struct eventfd * const efd = fp->f_eventfd;
397
398	fp->f_eventfd = NULL;
399	eventfd_destroy(efd);
400
401	return 0;
402}
403
404static void
405eventfd_filt_read_detach(struct knote * const kn)
406{
407	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
408
409	mutex_enter(&efd->efd_lock);
410	KASSERT(kn->kn_hook == efd);
411	selremove_knote(&efd->efd_read_sel, kn);
412	mutex_exit(&efd->efd_lock);
413}
414
415static int
416eventfd_filt_read(struct knote * const kn, long const hint)
417{
418	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
419	int rv;
420
421	if (hint & NOTE_SUBMIT) {
422		KASSERT(mutex_owned(&efd->efd_lock));
423	} else {
424		mutex_enter(&efd->efd_lock);
425	}
426
427	kn->kn_data = (int64_t)efd->efd_val;
428	rv = (eventfd_t)kn->kn_data > 0;
429
430	if ((hint & NOTE_SUBMIT) == 0) {
431		mutex_exit(&efd->efd_lock);
432	}
433
434	return rv;
435}
436
437static const struct filterops eventfd_read_filterops = {
438	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
439	.f_detach = eventfd_filt_read_detach,
440	.f_event = eventfd_filt_read,
441};
442
443static void
444eventfd_filt_write_detach(struct knote * const kn)
445{
446	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
447
448	mutex_enter(&efd->efd_lock);
449	KASSERT(kn->kn_hook == efd);
450	selremove_knote(&efd->efd_write_sel, kn);
451	mutex_exit(&efd->efd_lock);
452}
453
454static int
455eventfd_filt_write(struct knote * const kn, long const hint)
456{
457	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
458	int rv;
459
460	if (hint & NOTE_SUBMIT) {
461		KASSERT(mutex_owned(&efd->efd_lock));
462	} else {
463		mutex_enter(&efd->efd_lock);
464	}
465
466	kn->kn_data = (int64_t)efd->efd_val;
467	rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;
468
469	if ((hint & NOTE_SUBMIT) == 0) {
470		mutex_exit(&efd->efd_lock);
471	}
472
473	return rv;
474}
475
476static const struct filterops eventfd_write_filterops = {
477	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
478	.f_detach = eventfd_filt_write_detach,
479	.f_event = eventfd_filt_write,
480};
481
482static int
483eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
484{
485	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
486	struct selinfo *sel;
487
488	switch (kn->kn_filter) {
489	case EVFILT_READ:
490		sel = &efd->efd_read_sel;
491		kn->kn_fop = &eventfd_read_filterops;
492		break;
493
494	case EVFILT_WRITE:
495		sel = &efd->efd_write_sel;
496		kn->kn_fop = &eventfd_write_filterops;
497		break;
498
499	default:
500		return EINVAL;
501	}
502
503	kn->kn_hook = efd;
504
505	mutex_enter(&efd->efd_lock);
506	selrecord_knote(sel, kn);
507	mutex_exit(&efd->efd_lock);
508
509	return 0;
510}
511
512static void
513eventfd_fop_restart(file_t * const fp)
514{
515	struct eventfd * const efd = fp->f_eventfd;
516
517	/*
518	 * Unblock blocked reads/writes in order to allow close() to complete.
519	 * System calls return ERESTART so that the fd is revalidated.
520	 */
521
522	mutex_enter(&efd->efd_lock);
523
524	if (efd->efd_nwaiters != 0) {
525		efd->efd_restarting = true;
526		cv_broadcast(&efd->efd_read_wait);
527		cv_broadcast(&efd->efd_write_wait);
528	}
529
530	mutex_exit(&efd->efd_lock);
531}
532
533static const struct fileops eventfd_fileops = {
534	.fo_name = "eventfd",
535	.fo_read = eventfd_fop_read,
536	.fo_write = eventfd_fop_write,
537	.fo_ioctl = eventfd_ioctl,
538	.fo_fcntl = fnullop_fcntl,
539	.fo_poll = eventfd_fop_poll,
540	.fo_stat = eventfd_fop_stat,
541	.fo_close = eventfd_fop_close,
542	.fo_kqfilter = eventfd_fop_kqfilter,
543	.fo_restart = eventfd_fop_restart,
544};
545
546/*
547 * eventfd(2) system call
548 */
549int
550do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
551    register_t *retval)
552{
553	file_t *fp;
554	int fd, error;
555
556	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
557		return EINVAL;
558	}
559
560	if ((error = fd_allocfile(&fp, &fd)) != 0) {
561		return error;
562	}
563
564	fp->f_flag = FREAD | FWRITE;
565	if (flags & EFD_NONBLOCK) {
566		fp->f_flag |= FNONBLOCK;
567	}
568	fp->f_type = DTYPE_EVENTFD;
569	fp->f_ops = &eventfd_fileops;
570	fp->f_eventfd = eventfd_create(val, flags);
571	fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
572	fd_affix(curproc, fp, fd);
573
574	*retval = fd;
575	return 0;
576}
577
578int
579sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
580    register_t *retval)
581{
582	/* {
583		syscallarg(unsigned int) val;
584		syscallarg(int) flags;
585	} */
586
587	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
588}
589