1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2007 Roman Divacky
5 * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/callout.h>
31#include <sys/capsicum.h>
32#include <sys/errno.h>
33#include <sys/event.h>
34#include <sys/eventfd.h>
35#include <sys/file.h>
36#include <sys/filedesc.h>
37#include <sys/filio.h>
38#include <sys/limits.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/poll.h>
42#include <sys/proc.h>
43#include <sys/selinfo.h>
44#include <sys/specialfd.h>
45#include <sys/sx.h>
46#include <sys/syscallsubr.h>
47#include <sys/timerfd.h>
48#include <sys/timespec.h>
49#include <sys/user.h>
50
51#ifdef COMPAT_LINUX32
52#include <machine/../linux32/linux.h>
53#include <machine/../linux32/linux32_proto.h>
54#else
55#include <machine/../linux/linux.h>
56#include <machine/../linux/linux_proto.h>
57#endif
58
59#include <compat/linux/linux_emul.h>
60#include <compat/linux/linux_event.h>
61#include <compat/linux/linux_file.h>
62#include <compat/linux/linux_signal.h>
63#include <compat/linux/linux_time.h>
64#include <compat/linux/linux_util.h>
65
/* 64-bit user payload carried with every epoll event. */
typedef uint64_t	epoll_udata_t;

/*
 * Event record exchanged with userspace: read by copyin() in
 * linux_epoll_ctl() and written by copyout() in epoll_kev_copyout().
 *
 * NOTE(review): the structure is packed on amd64 only, presumably to match
 * the layout Linux uses on that architecture - confirm.
 */
struct epoll_event {
	uint32_t	events;	/* LINUX_EPOLL* bit mask */
	epoll_udata_t	data;	/* user payload, stored in kevent ext[0] */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Largest maxevents value accepted by linux_epoll_wait_ts(). */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
78
/*
 * Forward declarations of the file-local helpers: the epoll <-> kevent
 * converters, the kevent copy callbacks, and the single-filter
 * registration, lookup and removal helpers.
 */
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);
91
/*
 * State handed to epoll_kev_copyin(): a cursor into an array of kevents
 * that already lives in kernel memory.  The callback advances the cursor
 * by the number of entries it consumed.
 */
struct epoll_copyin_args {
	struct kevent	*changelist;
};
95
/*
 * State handed to epoll_kev_copyout() and filled in by
 * linux_epoll_wait_ts().
 */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next userspace slot to fill */
	struct proc		*p;		/* td->td_proc of the waiter */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout() failure, or 0 */
};
102
103static int
104epoll_create_common(struct thread *td, int flags)
105{
106
107	return (kern_kqueue(td, flags, NULL));
108}
109
110#ifdef LINUX_LEGACY_SYSCALLS
111int
112linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
113{
114
115	/*
116	 * args->size is unused. Linux just tests it
117	 * and then forgets it as well.
118	 */
119	if (args->size <= 0)
120		return (EINVAL);
121
122	return (epoll_create_common(td, 0));
123}
124#endif
125
126int
127linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
128{
129	int flags;
130
131	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
132		return (EINVAL);
133
134	flags = 0;
135	if ((args->flags & LINUX_O_CLOEXEC) != 0)
136		flags |= O_CLOEXEC;
137
138	return (epoll_create_common(td, flags));
139}
140
141/* Structure converting function from epoll to kevent. */
142static int
143epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
144    struct kevent *kevent, int *nkevents)
145{
146	uint32_t levents = l_event->events;
147	struct linux_pemuldata *pem;
148	struct proc *p;
149	unsigned short kev_flags = EV_ADD | EV_ENABLE;
150
151	/* flags related to how event is registered */
152	if ((levents & LINUX_EPOLLONESHOT) != 0)
153		kev_flags |= EV_DISPATCH;
154	if ((levents & LINUX_EPOLLET) != 0)
155		kev_flags |= EV_CLEAR;
156	if ((levents & LINUX_EPOLLERR) != 0)
157		kev_flags |= EV_ERROR;
158	if ((levents & LINUX_EPOLLRDHUP) != 0)
159		kev_flags |= EV_EOF;
160
161	/* flags related to what event is registered */
162	if ((levents & LINUX_EPOLL_EVRD) != 0) {
163		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
164		kevent->ext[0] = l_event->data;
165		++kevent;
166		++(*nkevents);
167	}
168	if ((levents & LINUX_EPOLL_EVWR) != 0) {
169		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
170		kevent->ext[0] = l_event->data;
171		++kevent;
172		++(*nkevents);
173	}
174	/* zero event mask is legal */
175	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
176		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
177		++(*nkevents);
178	}
179
180	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
181		p = td->td_proc;
182
183		pem = pem_find(p);
184		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
185
186		LINUX_PEM_XLOCK(pem);
187		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
188			pem->flags |= LINUX_XUNSUP_EPOLL;
189			LINUX_PEM_XUNLOCK(pem);
190			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
191			    levents);
192		} else
193			LINUX_PEM_XUNLOCK(pem);
194		return (EINVAL);
195	}
196
197	return (0);
198}
199
200/*
201 * Structure converting function from kevent to epoll. In a case
202 * this is called on error in registration we store the error in
203 * event->data and pick it up later in linux_epoll_ctl().
204 */
205static void
206kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
207{
208
209	l_event->data = kevent->ext[0];
210
211	if ((kevent->flags & EV_ERROR) != 0) {
212		l_event->events = LINUX_EPOLLERR;
213		return;
214	}
215
216	/* XXX EPOLLPRI, EPOLLHUP */
217	switch (kevent->filter) {
218	case EVFILT_READ:
219		l_event->events = LINUX_EPOLLIN;
220		if ((kevent->flags & EV_EOF) != 0)
221			l_event->events |= LINUX_EPOLLRDHUP;
222	break;
223	case EVFILT_WRITE:
224		l_event->events = LINUX_EPOLLOUT;
225	break;
226	}
227}
228
229/*
230 * Copyout callback used by kevent. This converts kevent
231 * events to epoll events and copies them back to the
232 * userspace. This is also called on error on registering
233 * of the filter.
234 */
235static int
236epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
237{
238	struct epoll_copyout_args *args;
239	struct epoll_event *eep;
240	int error, i;
241
242	args = (struct epoll_copyout_args*) arg;
243	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
244
245	for (i = 0; i < count; i++)
246		kevent_to_epoll(&kevp[i], &eep[i]);
247
248	error = copyout(eep, args->leventlist, count * sizeof(*eep));
249	if (error == 0) {
250		args->leventlist += count;
251		args->count += count;
252	} else if (args->error == 0)
253		args->error = error;
254
255	free(eep, M_EPOLL);
256	return (error);
257}
258
259/*
260 * Copyin callback used by kevent. This copies already
261 * converted filters from kernel memory to the kevent
262 * internal kernel memory. Hence the memcpy instead of
263 * copyin.
264 */
265static int
266epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
267{
268	struct epoll_copyin_args *args;
269
270	args = (struct epoll_copyin_args*) arg;
271
272	memcpy(kevp, args->changelist, count * sizeof(*kevp));
273	args->changelist += count;
274
275	return (0);
276}
277
278/*
279 * Load epoll filter, convert it to kevent filter
280 * and load it into kevent subsystem.
281 */
282int
283linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
284{
285	struct file *epfp, *fp;
286	struct epoll_copyin_args ciargs;
287	struct kevent kev[2];
288	struct kevent_copyops k_ops = { &ciargs,
289					NULL,
290					epoll_kev_copyin};
291	struct epoll_event le;
292	cap_rights_t rights;
293	int nchanges = 0;
294	int error;
295
296	if (args->op != LINUX_EPOLL_CTL_DEL) {
297		error = copyin(args->event, &le, sizeof(le));
298		if (error != 0)
299			return (error);
300	}
301
302	error = fget(td, args->epfd,
303	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
304	if (error != 0)
305		return (error);
306	if (epfp->f_type != DTYPE_KQUEUE) {
307		error = EINVAL;
308		goto leave1;
309	}
310
311	 /* Protect user data vector from incorrectly supplied fd. */
312	error = fget(td, args->fd,
313		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
314	if (error != 0)
315		goto leave1;
316
317	/* Linux disallows spying on himself */
318	if (epfp == fp) {
319		error = EINVAL;
320		goto leave0;
321	}
322
323	ciargs.changelist = kev;
324
325	if (args->op != LINUX_EPOLL_CTL_DEL) {
326		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
327		if (error != 0)
328			goto leave0;
329	}
330
331	switch (args->op) {
332	case LINUX_EPOLL_CTL_MOD:
333		error = epoll_delete_all_events(td, epfp, args->fd);
334		if (error != 0)
335			goto leave0;
336		break;
337
338	case LINUX_EPOLL_CTL_ADD:
339		if (epoll_fd_registered(td, epfp, args->fd)) {
340			error = EEXIST;
341			goto leave0;
342		}
343		break;
344
345	case LINUX_EPOLL_CTL_DEL:
346		/* CTL_DEL means unregister this fd with this epoll */
347		error = epoll_delete_all_events(td, epfp, args->fd);
348		goto leave0;
349
350	default:
351		error = EINVAL;
352		goto leave0;
353	}
354
355	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
356
357leave0:
358	fdrop(fp, td);
359
360leave1:
361	fdrop(epfp, td);
362	return (error);
363}
364
365/*
366 * Wait for a filter to be triggered on the epoll file descriptor.
367 */
368
369static int
370linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
371    int maxevents, struct timespec *tsp, sigset_t *uset)
372{
373	struct epoll_copyout_args coargs;
374	struct kevent_copyops k_ops = { &coargs,
375					epoll_kev_copyout,
376					NULL};
377	cap_rights_t rights;
378	struct file *epfp;
379	sigset_t omask;
380	int error;
381
382	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
383		return (EINVAL);
384
385	error = fget(td, epfd,
386	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
387	if (error != 0)
388		return (error);
389	if (epfp->f_type != DTYPE_KQUEUE) {
390		error = EINVAL;
391		goto leave;
392	}
393	if (uset != NULL) {
394		error = kern_sigprocmask(td, SIG_SETMASK, uset,
395		    &omask, 0);
396		if (error != 0)
397			goto leave;
398		td->td_pflags |= TDP_OLDMASK;
399		/*
400		 * Make sure that ast() is called on return to
401		 * usermode and TDP_OLDMASK is cleared, restoring old
402		 * sigmask.
403		 */
404		ast_sched(td, TDA_SIGSUSPEND);
405	}
406
407	coargs.leventlist = events;
408	coargs.p = td->td_proc;
409	coargs.count = 0;
410	coargs.error = 0;
411
412	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
413	if (error == 0 && coargs.error != 0)
414		error = coargs.error;
415
416	/*
417	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
418	 * Maybe we should translate that but I don't think it matters at all.
419	 */
420	if (error == 0)
421		td->td_retval[0] = coargs.count;
422
423	if (uset != NULL)
424		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
425		    NULL, 0);
426leave:
427	fdrop(epfp, td);
428	return (error);
429}
430
431static int
432linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
433    int maxevents, int timeout, sigset_t *uset)
434{
435	struct timespec ts, *tsp;
436
437	/*
438	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
439	 * to block indefinitely. Real implementation does it if any negative
440	 * timeout value is passed.
441	 */
442	if (timeout >= 0) {
443		/* Convert from milliseconds to timespec. */
444		ts.tv_sec = timeout / 1000;
445		ts.tv_nsec = (timeout % 1000) * 1000000;
446		tsp = &ts;
447	} else {
448		tsp = NULL;
449	}
450	return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
451
452}
453
454#ifdef LINUX_LEGACY_SYSCALLS
455int
456linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
457{
458
459	return (linux_epoll_wait_common(td, args->epfd, args->events,
460	    args->maxevents, args->timeout, NULL));
461}
462#endif
463
464int
465linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
466{
467	sigset_t mask, *pmask;
468	int error;
469
470	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
471	    &mask, &pmask);
472	if (error != 0)
473		return (error);
474
475	return (linux_epoll_wait_common(td, args->epfd, args->events,
476	    args->maxevents, args->timeout, pmask));
477}
478
479#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
480int
481linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
482{
483	struct timespec ts, *tsa;
484	sigset_t mask, *pmask;
485	int error;
486
487	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
488	    &mask, &pmask);
489	if (error != 0)
490		return (error);
491
492	if (args->timeout) {
493		error = linux_get_timespec64(&ts, args->timeout);
494		if (error != 0)
495			return (error);
496		tsa = &ts;
497	} else
498		tsa = NULL;
499
500	return (linux_epoll_wait_ts(td, args->epfd, args->events,
501	    args->maxevents, tsa, pmask));
502}
503#else
504int
505linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
506{
507	struct timespec ts, *tsa;
508	sigset_t mask, *pmask;
509	int error;
510
511	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
512	    &mask, &pmask);
513	if (error != 0)
514		return (error);
515
516	if (args->timeout) {
517		error = linux_get_timespec(&ts, args->timeout);
518		if (error != 0)
519			return (error);
520		tsa = &ts;
521	} else
522		tsa = NULL;
523
524	return (linux_epoll_wait_ts(td, args->epfd, args->events,
525	    args->maxevents, tsa, pmask));
526}
527#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
528
529static int
530epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
531    unsigned int flags)
532{
533	struct epoll_copyin_args ciargs;
534	struct kevent kev;
535	struct kevent_copyops k_ops = { &ciargs,
536					NULL,
537					epoll_kev_copyin};
538
539	ciargs.changelist = &kev;
540	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
541
542	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
543}
544
545static int
546epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
547{
548	/*
549	 * Set empty filter flags to avoid accidental modification of already
550	 * registered events. In the case of event re-registration:
551	 * 1. If event does not exists kevent() does nothing and returns ENOENT
552	 * 2. If event does exists, it's enabled/disabled state is preserved
553	 *    but fflags, data and udata fields are overwritten. So we can not
554	 *    set socket lowats and store user's context pointer in udata.
555	 */
556	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
557	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
558		return (1);
559
560	return (0);
561}
562
563static int
564epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
565{
566	int error1, error2;
567
568	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
569	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
570
571	/* return 0 if at least one result positive */
572	return (error1 == 0 ? 0 : error2);
573}
574
575#ifdef LINUX_LEGACY_SYSCALLS
576int
577linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
578{
579	struct specialfd_eventfd ae;
580
581	bzero(&ae, sizeof(ae));
582	ae.initval = args->initval;
583	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
584}
585#endif
586
587int
588linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
589{
590	struct specialfd_eventfd ae;
591	int flags;
592
593	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
594	    LINUX_EFD_SEMAPHORE)) != 0)
595		return (EINVAL);
596	flags = 0;
597	if ((args->flags & LINUX_O_CLOEXEC) != 0)
598		flags |= EFD_CLOEXEC;
599	if ((args->flags & LINUX_O_NONBLOCK) != 0)
600		flags |= EFD_NONBLOCK;
601	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
602		flags |= EFD_SEMAPHORE;
603
604	bzero(&ae, sizeof(ae));
605	ae.flags = flags;
606	ae.initval = args->initval;
607	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
608}
609
610int
611linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
612{
613	clockid_t clockid;
614	int error, flags;
615
616	error = linux_to_native_clockid(&clockid, args->clockid);
617	if (error != 0)
618		return (error);
619	flags = 0;
620	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
621		flags |= O_CLOEXEC;
622	if ((args->flags & LINUX_TFD_NONBLOCK) != 0)
623		flags |= TFD_NONBLOCK;
624
625	return (kern_timerfd_create(td, clockid, flags));
626}
627
628int
629linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
630{
631	struct l_itimerspec lots;
632	struct itimerspec ots;
633	int error;
634
635	error = kern_timerfd_gettime(td, args->fd, &ots);
636	if (error != 0)
637		return (error);
638
639	error = native_to_linux_itimerspec(&lots, &ots);
640	if (error == 0)
641		error = copyout(&lots, args->old_value, sizeof(lots));
642
643	return (error);
644}
645
646int
647linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
648{
649	struct l_itimerspec lots;
650	struct itimerspec nts, ots;
651	int error;
652
653	error = copyin(args->new_value, &lots, sizeof(lots));
654	if (error != 0)
655		return (error);
656	error = linux_to_native_itimerspec(&nts, &lots);
657	if (error != 0)
658		return (error);
659	if (args->old_value == NULL)
660		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
661	else
662		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
663	if (error == 0 && args->old_value != NULL) {
664		error = native_to_linux_itimerspec(&lots, &ots);
665		if (error == 0)
666			error = copyout(&lots, args->old_value, sizeof(lots));
667	}
668
669	return (error);
670}
671
672#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
673int
674linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
675{
676	struct l_itimerspec64 lots;
677	struct itimerspec ots;
678	int error;
679
680	error = kern_timerfd_gettime(td, args->fd, &ots);
681	if (error != 0)
682		return (error);
683
684	error = native_to_linux_itimerspec64(&lots, &ots);
685	if (error == 0)
686		error = copyout(&lots, args->old_value, sizeof(lots));
687
688	return (error);
689}
690
691int
692linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
693{
694	struct l_itimerspec64 lots;
695	struct itimerspec nts, ots;
696	int error;
697
698	error = copyin(args->new_value, &lots, sizeof(lots));
699	if (error != 0)
700		return (error);
701	error = linux_to_native_itimerspec64(&nts, &lots);
702	if (error != 0)
703		return (error);
704	if (args->old_value == NULL)
705		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
706	else
707		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
708	if (error == 0 && args->old_value != NULL) {
709		error = native_to_linux_itimerspec64(&lots, &ots);
710		if (error == 0)
711			error = copyout(&lots, args->old_value, sizeof(lots));
712	}
713
714	return (error);
715}
716#endif
717