mevent.c revision 330449
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD: stable/11/usr.sbin/bhyve/mevent.c 330449 2018-03-05 07:26:05Z eadler $
29 */
30
31/*
32 * Micro event library for FreeBSD, designed for a single i/o thread
33 * using kqueue, and having events be persistent by default.
34 */
35
36#include <sys/cdefs.h>
37__FBSDID("$FreeBSD: stable/11/usr.sbin/bhyve/mevent.c 330449 2018-03-05 07:26:05Z eadler $");
38
39#include <assert.h>
40#include <err.h>
41#include <errno.h>
42#include <stdlib.h>
43#include <stdio.h>
44#include <string.h>
45#include <sysexits.h>
46#include <unistd.h>
47
48#include <sys/types.h>
49#ifndef WITHOUT_CAPSICUM
50#include <sys/capsicum.h>
51#endif
52#include <sys/event.h>
53#include <sys/time.h>
54
55#include <pthread.h>
56#include <pthread_np.h>
57
58#include "mevent.h"
59
/*
 * Number of slots in the kevent change/event arrays used by
 * mevent_dispatch(), and the size of the wakeup-pipe drain buffer.
 */
#define	MEVENT_MAX	64

/*
 * Values kept in mevent.me_state.  mevent_kq_flags() maps each of them
 * onto the kevent flag that is sent to the kernel for the entry.
 */
enum {
	MEV_ADD		= 1,
	MEV_ENABLE	= 2,
	MEV_DISABLE	= 3,
	MEV_DEL_PENDING	= 4
};

extern char *vmname;

/* Thread running mevent_dispatch(); recorded when that function starts. */
static pthread_t mevent_tid;

/* Next kevent ident to hand to an EVF_TIMER event. */
static int mevent_timid = 43;

/*
 * Wakeup pipe: [0] is watched by the dispatch loop, [1] is written by
 * mevent_notify().
 */
static int mevent_pipefd[2];

/* Serializes access to the two event lists and the entries on them. */
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
73
74struct mevent {
75	void	(*me_func)(int, enum ev_type, void *);
76#define me_msecs me_fd
77	int	me_fd;
78	int	me_timid;
79	enum ev_type me_type;
80	void    *me_param;
81	int	me_cq;
82	int	me_state;
83	int	me_closefd;
84	LIST_ENTRY(mevent) me_list;
85};
86
87static LIST_HEAD(listhead, mevent) global_head, change_head;
88
89static void
90mevent_qlock(void)
91{
92	pthread_mutex_lock(&mevent_lmutex);
93}
94
95static void
96mevent_qunlock(void)
97{
98	pthread_mutex_unlock(&mevent_lmutex);
99}
100
101static void
102mevent_pipe_read(int fd, enum ev_type type, void *param)
103{
104	char buf[MEVENT_MAX];
105	int status;
106
107	/*
108	 * Drain the pipe read side. The fd is non-blocking so this is
109	 * safe to do.
110	 */
111	do {
112		status = read(fd, buf, sizeof(buf));
113	} while (status == MEVENT_MAX);
114}
115
116static void
117mevent_notify(void)
118{
119	char c;
120
121	/*
122	 * If calling from outside the i/o thread, write a byte on the
123	 * pipe to force the i/o thread to exit the blocking kevent call.
124	 */
125	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
126		write(mevent_pipefd[1], &c, 1);
127	}
128}
129
/*
 * Translate an event's me_type into the matching kevent filter.
 * Returns 0 for a type that has no mapping.
 */
static int
mevent_kq_filter(struct mevent *mevp)
{

	switch (mevp->me_type) {
	case EVF_READ:
		return (EVFILT_READ);
	case EVF_WRITE:
		return (EVFILT_WRITE);
	case EVF_TIMER:
		return (EVFILT_TIMER);
	case EVF_SIGNAL:
		return (EVFILT_SIGNAL);
	default:
		return (0);
	}
}
151
/*
 * Translate an event's me_state into the kevent action flag.
 *
 * An unknown state trips the assertion.  When assertions are compiled out
 * the function returns 0 rather than an indeterminate value.
 */
static int
mevent_kq_flags(struct mevent *mevp)
{
	int ret;

	/* Defined result even if the default case is reached under NDEBUG. */
	ret = 0;

	switch (mevp->me_state) {
	case MEV_ADD:
		ret = EV_ADD;		/* implicitly enabled */
		break;
	case MEV_ENABLE:
		ret = EV_ENABLE;
		break;
	case MEV_DISABLE:
		ret = EV_DISABLE;
		break;
	case MEV_DEL_PENDING:
		ret = EV_DELETE;
		break;
	default:
		assert(0);
		break;
	}

	return (ret);
}
177
/*
 * Filter-specific kevent flags for an event.  No event type uses any yet,
 * so the argument is ignored and the result is always 0.
 */
static int
mevent_kq_fflags(struct mevent *mevp)
{
	const int no_fflags = 0;

	/* XXX nothing yet, perhaps EV_EOF for reads ? */
	return (no_fflags);
}
184
/*
 * Drain the change list into a kevent changelist.
 *
 * mfd is the kqueue descriptor (currently unused here); kev is an array
 * with room for MEVENT_MAX entries.  Returns the number of entries filled.
 *
 * Each entry on change_head is handled as follows, under the queue lock:
 *  - if it is marked me_closefd, its descriptor is closed and no kevent
 *    is generated;
 *  - otherwise a kevent describing its pending state is appended to kev.
 * The entry is then taken off the change list and either freed (pending
 * delete) or moved to global_head.
 */
static int
mevent_build(int mfd, struct kevent *kev)
{
	struct mevent *mevp, *tmpp;
	int i;

	(void)mfd;
	i = 0;

	mevent_qlock();

	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
		if (mevp->me_closefd) {
			/*
			 * A close of the file descriptor will remove the
			 * event
			 */
			close(mevp->me_fd);
		} else {
			/*
			 * Check the bound before storing: kev[] has
			 * MEVENT_MAX slots, so index MEVENT_MAX - 1 is the
			 * last valid one and a full array is legal.
			 */
			assert(i < MEVENT_MAX);

			/*
			 * EV_SET initializes every member of the kevent,
			 * so nothing is passed to the kernel unset.
			 * Timers are identified by me_timid and carry
			 * their period in the data field; everything
			 * else is identified by descriptor.
			 */
			if (mevp->me_type == EVF_TIMER) {
				EV_SET(&kev[i], mevp->me_timid,
				    mevent_kq_filter(mevp),
				    mevent_kq_flags(mevp),
				    mevent_kq_fflags(mevp),
				    mevp->me_msecs, mevp);
			} else {
				EV_SET(&kev[i], mevp->me_fd,
				    mevent_kq_filter(mevp),
				    mevent_kq_flags(mevp),
				    mevent_kq_fflags(mevp),
				    0, mevp);
			}
			i++;
		}

		mevp->me_cq = 0;
		LIST_REMOVE(mevp, me_list);

		if (mevp->me_state == MEV_DEL_PENDING) {
			free(mevp);
		} else {
			LIST_INSERT_HEAD(&global_head, mevp, me_list);
		}
	}

	mevent_qunlock();

	return (i);
}
233
/*
 * Run the callback of every event returned by kevent.
 *
 * kev holds the returned events and numev their count; a count of zero or
 * less results in no callbacks.  The mevent is recovered from the udata
 * field of each kevent.
 */
static void
mevent_handle(struct kevent *kev, int numev)
{
	struct mevent *ev;
	int idx;

	idx = 0;
	while (idx < numev) {
		ev = kev[idx].udata;
		idx++;

		/* XXX check for EV_ERROR ? */

		ev->me_func(ev->me_fd, ev->me_type, ev->me_param);
	}
}
248
249struct mevent *
250mevent_add(int tfd, enum ev_type type,
251	   void (*func)(int, enum ev_type, void *), void *param)
252{
253	struct mevent *lp, *mevp;
254
255	if (tfd < 0 || func == NULL) {
256		return (NULL);
257	}
258
259	mevp = NULL;
260
261	mevent_qlock();
262
263	/*
264	 * Verify that the fd/type tuple is not present in any list
265	 */
266	LIST_FOREACH(lp, &global_head, me_list) {
267		if (type != EVF_TIMER && lp->me_fd == tfd &&
268		    lp->me_type == type) {
269			goto exit;
270		}
271	}
272
273	LIST_FOREACH(lp, &change_head, me_list) {
274		if (type != EVF_TIMER && lp->me_fd == tfd &&
275		    lp->me_type == type) {
276			goto exit;
277		}
278	}
279
280	/*
281	 * Allocate an entry, populate it, and add it to the change list.
282	 */
283	mevp = calloc(1, sizeof(struct mevent));
284	if (mevp == NULL) {
285		goto exit;
286	}
287
288	if (type == EVF_TIMER) {
289		mevp->me_msecs = tfd;
290		mevp->me_timid = mevent_timid++;
291	} else
292		mevp->me_fd = tfd;
293	mevp->me_type = type;
294	mevp->me_func = func;
295	mevp->me_param = param;
296
297	LIST_INSERT_HEAD(&change_head, mevp, me_list);
298	mevp->me_cq = 1;
299	mevp->me_state = MEV_ADD;
300	mevent_notify();
301
302exit:
303	mevent_qunlock();
304
305	return (mevp);
306}
307
308static int
309mevent_update(struct mevent *evp, int newstate)
310{
311	/*
312	 * It's not possible to enable/disable a deleted event
313	 */
314	if (evp->me_state == MEV_DEL_PENDING)
315		return (EINVAL);
316
317	/*
318	 * No update needed if state isn't changing
319	 */
320	if (evp->me_state == newstate)
321		return (0);
322
323	mevent_qlock();
324
325	evp->me_state = newstate;
326
327	/*
328	 * Place the entry onto the changed list if not already there.
329	 */
330	if (evp->me_cq == 0) {
331		evp->me_cq = 1;
332		LIST_REMOVE(evp, me_list);
333		LIST_INSERT_HEAD(&change_head, evp, me_list);
334		mevent_notify();
335	}
336
337	mevent_qunlock();
338
339	return (0);
340}
341
342int
343mevent_enable(struct mevent *evp)
344{
345
346	return (mevent_update(evp, MEV_ENABLE));
347}
348
349int
350mevent_disable(struct mevent *evp)
351{
352
353	return (mevent_update(evp, MEV_DISABLE));
354}
355
356static int
357mevent_delete_event(struct mevent *evp, int closefd)
358{
359	mevent_qlock();
360
361	/*
362         * Place the entry onto the changed list if not already there, and
363	 * mark as to be deleted.
364         */
365        if (evp->me_cq == 0) {
366		evp->me_cq = 1;
367		LIST_REMOVE(evp, me_list);
368		LIST_INSERT_HEAD(&change_head, evp, me_list);
369		mevent_notify();
370        }
371	evp->me_state = MEV_DEL_PENDING;
372
373	if (closefd)
374		evp->me_closefd = 1;
375
376	mevent_qunlock();
377
378	return (0);
379}
380
/*
 * Delete an event and leave its descriptor open.  Returns the result of
 * mevent_delete_event(), which is always 0.
 */
int
mevent_delete(struct mevent *evp)
{
	const int closefd = 0;

	return (mevent_delete_event(evp, closefd));
}
387
/*
 * Delete an event and have its descriptor closed when the deletion is
 * processed.  Returns the result of mevent_delete_event(), which is
 * always 0.
 */
int
mevent_delete_close(struct mevent *evp)
{
	const int closefd = 1;

	return (mevent_delete_event(evp, closefd));
}
394
395static void
396mevent_set_name(void)
397{
398
399	pthread_set_name_np(mevent_tid, "mevent");
400}
401
/*
 * Event loop of the i/o thread.  Does not return.
 *
 * Records the calling thread as the i/o thread, creates the kqueue and the
 * wakeup pipe, registers the pipe's read end as an internal event, and then
 * loops forever: push queued changes to the kernel, block for events, run
 * their callbacks.
 *
 * Failure to create the kqueue or the pipe, or to restrict their rights,
 * terminates the process with EX_OSERR.
 */
void
mevent_dispatch(void)
{
	struct kevent changelist[MEVENT_MAX];
	struct kevent eventlist[MEVENT_MAX];
	struct mevent *pipev;
	int mfd;
	int numev;
	int ret;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	mevent_tid = pthread_self();
	mevent_set_name();

	/*
	 * kqueue() reports failure with -1; descriptor 0 is a valid
	 * result, so only -1 is treated as an error.  The check is done
	 * explicitly so it is not lost when assertions are compiled out.
	 */
	mfd = kqueue();
	if (mfd == -1)
		err(EX_OSERR, "kqueue");

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_KQUEUE);
	if (cap_rights_limit(mfd, &rights) == -1 && errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Open the pipe that will be used for other threads to force
	 * the blocking kevent call to return by writing to it.  Both
	 * descriptors are left in their default (blocking) mode.
	 */
	ret = pipe(mevent_pipefd);
	if (ret < 0) {
		perror("pipe");
		/* Report the failure with a non-zero status. */
		exit(EX_OSERR);
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (cap_rights_limit(mevent_pipefd[0], &rights) == -1 && errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
	if (cap_rights_limit(mevent_pipefd[1], &rights) == -1 && errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Add internal event handler for the read end of the pipe
	 */
	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
	assert(pipev != NULL);

	for (;;) {
		/*
		 * Build changelist if required.
		 * XXX the changelist can be put into the blocking call
		 * to eliminate the extra syscall. Currently better for
		 * debug.
		 */
		numev = mevent_build(mfd, changelist);
		if (numev) {
			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
			if (ret == -1) {
				perror("Error return from kevent change");
			}
		}

		/*
		 * Block awaiting events
		 */
		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
		if (ret == -1 && errno != EINTR) {
			perror("Error return from kevent monitor");
		}

		/*
		 * Handle reported events.  On error ret is -1 and no
		 * callbacks are run.
		 */
		mevent_handle(eventlist, ret);
	}
}
481