1/*
2 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3 * Copyright 2007-2012 Niels Provos, Nick Mathewson
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 *    derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27#include "event2/event-config.h"
28
29#include <stdint.h>
30#include <sys/types.h>
31#include <sys/resource.h>
32#ifdef _EVENT_HAVE_SYS_TIME_H
33#include <sys/time.h>
34#endif
35#include <sys/queue.h>
36#include <sys/epoll.h>
37#include <signal.h>
38#include <limits.h>
39#include <stdio.h>
40#include <stdlib.h>
41#include <string.h>
42#include <unistd.h>
43#include <errno.h>
44#ifdef _EVENT_HAVE_FCNTL_H
45#include <fcntl.h>
46#endif
47
48#include "event-internal.h"
49#include "evsignal-internal.h"
50#include "event2/thread.h"
51#include "evthread-internal.h"
52#include "log-internal.h"
53#include "evmap-internal.h"
54#include "changelist-internal.h"
55
/* Per-event_base private state for the epoll backend.  Allocated by
 * epoll_init() and released by epoll_dealloc(). */
struct epollop {
	/* Result buffer handed to epoll_wait(); grown in epoll_dispatch(). */
	struct epoll_event *events;
	/* Number of entries that 'events' can hold. */
	int nevents;
	/* The epoll descriptor obtained from epoll_create(). */
	int epfd;
};
61
/* Forward declarations of the backend entry points, needed because the
 * eventop tables that reference them appear before their definitions. */
static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);
65
/* Backend table used when the changelist is enabled.  Add and delete
 * requests go through event_changelist_add/event_changelist_del, and the
 * accumulated changes are pushed to the kernel from epoll_dispatch().
 * epoll_init() points base->evsel at this table when the
 * EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST flag or the
 * EVENT_EPOLL_USE_CHANGELIST environment variable requests it.
 *
 * NOTE(review): the initializer is positional; the per-field roles noted
 * below are inferred from the values -- confirm against the declaration of
 * struct eventop. */
static const struct eventop epollops_changelist = {
	"epoll (with changelist)",	/* backend name */
	epoll_init,			/* set-up hook */
	event_changelist_add,		/* add hook */
	event_changelist_del,		/* delete hook */
	epoll_dispatch,			/* dispatch hook */
	epoll_dealloc,			/* tear-down hook */
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,	/* advertised feature bits */
	EVENT_CHANGELIST_FDINFO_SIZE	/* presumably per-fd extra storage -- confirm */
};
77
78
/* Forward declarations of the add/delete hooks used by the plain
 * (non-changelist) backend table; each applies its change to the kernel
 * immediately via epoll_apply_one_change(). */
static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
83
/* Default epoll backend table (no changelist): every add/delete request is
 * turned into an epoll_ctl() call right away.  This symbol has external
 * linkage, unlike the changelist variant.
 *
 * NOTE(review): the initializer is positional; the per-field roles noted
 * below are inferred from the values -- confirm against the declaration of
 * struct eventop. */
const struct eventop epollops = {
	"epoll",			/* backend name */
	epoll_init,			/* set-up hook */
	epoll_nochangelist_add,		/* add hook */
	epoll_nochangelist_del,		/* delete hook */
	epoll_dispatch,			/* dispatch hook */
	epoll_dealloc,			/* tear-down hook */
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,	/* advertised feature bits */
	0				/* presumably: no per-fd extra storage -- confirm */
};
95
/* Number of epoll_event slots allocated by epoll_init(). */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the result buffer once it holds at
 * least this many slots. */
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
106
107static void *
108epoll_init(struct event_base *base)
109{
110	int epfd;
111	struct epollop *epollop;
112
113	/* Initialize the kernel queue.  (The size field is ignored since
114	 * 2.6.8.) */
115	if ((epfd = epoll_create(32000)) == -1) {
116		if (errno != ENOSYS)
117			event_warn("epoll_create");
118		return (NULL);
119	}
120
121	evutil_make_socket_closeonexec(epfd);
122
123	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
124		close(epfd);
125		return (NULL);
126	}
127
128	epollop->epfd = epfd;
129
130	/* Initialize fields */
131	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
132	if (epollop->events == NULL) {
133		mm_free(epollop);
134		close(epfd);
135		return (NULL);
136	}
137	epollop->nevents = INITIAL_NEVENT;
138
139	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
140	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
141		evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
142		base->evsel = &epollops_changelist;
143
144	evsig_init(base);
145
146	return (epollop);
147}
148
149static const char *
150change_to_string(int change)
151{
152	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
153	if (change == EV_CHANGE_ADD) {
154		return "add";
155	} else if (change == EV_CHANGE_DEL) {
156		return "del";
157	} else if (change == 0) {
158		return "none";
159	} else {
160		return "???";
161	}
162}
163
/*
 * Map an epoll_ctl() operation code to a short static name for log
 * messages: "ADD", "DEL" or "MOD"; any other value yields "???".
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
172
173static int
174epoll_apply_one_change(struct event_base *base,
175    struct epollop *epollop,
176    const struct event_change *ch)
177{
178	struct epoll_event epev;
179	int op, events = 0;
180
181	if (1) {
182		/* The logic here is a little tricky.  If we had no events set
183		   on the fd before, we need to set op="ADD" and set
184		   events=the events we want to add.  If we had any events set
185		   on the fd before, and we want any events to remain on the
186		   fd, we need to say op="MOD" and set events=the events we
187		   want to remain.  But if we want to delete the last event,
188		   we say op="DEL" and set events=the remaining events.  What
189		   fun!
190		*/
191
192		/* TODO: Turn this into a switch or a table lookup. */
193
194		if ((ch->read_change & EV_CHANGE_ADD) ||
195		    (ch->write_change & EV_CHANGE_ADD)) {
196			/* If we are adding anything at all, we'll want to do
197			 * either an ADD or a MOD. */
198			events = 0;
199			op = EPOLL_CTL_ADD;
200			if (ch->read_change & EV_CHANGE_ADD) {
201				events |= EPOLLIN;
202			} else if (ch->read_change & EV_CHANGE_DEL) {
203				;
204			} else if (ch->old_events & EV_READ) {
205				events |= EPOLLIN;
206			}
207			if (ch->write_change & EV_CHANGE_ADD) {
208				events |= EPOLLOUT;
209			} else if (ch->write_change & EV_CHANGE_DEL) {
210				;
211			} else if (ch->old_events & EV_WRITE) {
212				events |= EPOLLOUT;
213			}
214			if ((ch->read_change|ch->write_change) & EV_ET)
215				events |= EPOLLET;
216
217			if (ch->old_events) {
218				/* If MOD fails, we retry as an ADD, and if
219				 * ADD fails we will retry as a MOD.  So the
220				 * only hard part here is to guess which one
221				 * will work.  As a heuristic, we'll try
222				 * MOD first if we think there were old
223				 * events and ADD if we think there were none.
224				 *
225				 * We can be wrong about the MOD if the file
226				 * has in fact been closed and re-opened.
227				 *
228				 * We can be wrong about the ADD if the
229				 * the fd has been re-created with a dup()
230				 * of the same file that it was before.
231				 */
232				op = EPOLL_CTL_MOD;
233			}
234		} else if ((ch->read_change & EV_CHANGE_DEL) ||
235		    (ch->write_change & EV_CHANGE_DEL)) {
236			/* If we're deleting anything, we'll want to do a MOD
237			 * or a DEL. */
238			op = EPOLL_CTL_DEL;
239
240			if (ch->read_change & EV_CHANGE_DEL) {
241				if (ch->write_change & EV_CHANGE_DEL) {
242					events = EPOLLIN|EPOLLOUT;
243				} else if (ch->old_events & EV_WRITE) {
244					events = EPOLLOUT;
245					op = EPOLL_CTL_MOD;
246				} else {
247					events = EPOLLIN;
248				}
249			} else if (ch->write_change & EV_CHANGE_DEL) {
250				if (ch->old_events & EV_READ) {
251					events = EPOLLIN;
252					op = EPOLL_CTL_MOD;
253				} else {
254					events = EPOLLOUT;
255				}
256			}
257		}
258
259		if (!events)
260			return 0;
261
262		memset(&epev, 0, sizeof(epev));
263		epev.data.fd = ch->fd;
264		epev.events = events;
265		if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
266			if (op == EPOLL_CTL_MOD && errno == ENOENT) {
267				/* If a MOD operation fails with ENOENT, the
268				 * fd was probably closed and re-opened.  We
269				 * should retry the operation as an ADD.
270				 */
271				if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
272					event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
273					    (int)epev.events, ch->fd);
274					return -1;
275				} else {
276					event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
277						(int)epev.events,
278						ch->fd));
279				}
280			} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
281				/* If an ADD operation fails with EEXIST,
282				 * either the operation was redundant (as with a
283				 * precautionary add), or we ran into a fun
284				 * kernel bug where using dup*() to duplicate the
285				 * same file into the same fd gives you the same epitem
286				 * rather than a fresh one.  For the second case,
287				 * we must retry with MOD. */
288				if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
289					event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
290					    (int)epev.events, ch->fd);
291					return -1;
292				} else {
293					event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
294						(int)epev.events,
295						ch->fd));
296				}
297			} else if (op == EPOLL_CTL_DEL &&
298			    (errno == ENOENT || errno == EBADF ||
299				errno == EPERM)) {
300				/* If a delete fails with one of these errors,
301				 * that's fine too: we closed the fd before we
302				 * got around to calling epoll_dispatch. */
303				event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
304					(int)epev.events,
305					ch->fd,
306					strerror(errno)));
307			} else {
308				event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
309				    epoll_op_to_string(op),
310				    (int)epev.events,
311				    ch->fd,
312				    ch->old_events,
313				    ch->read_change,
314				    change_to_string(ch->read_change),
315				    ch->write_change,
316				    change_to_string(ch->write_change));
317				return -1;
318			}
319		} else {
320			event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
321				epoll_op_to_string(op),
322				(int)epev.events,
323				(int)ch->fd,
324				ch->old_events,
325				ch->read_change,
326				ch->write_change));
327		}
328	}
329	return 0;
330}
331
332static int
333epoll_apply_changes(struct event_base *base)
334{
335	struct event_changelist *changelist = &base->changelist;
336	struct epollop *epollop = base->evbase;
337	struct event_change *ch;
338
339	int r = 0;
340	int i;
341
342	for (i = 0; i < changelist->n_changes; ++i) {
343		ch = &changelist->changes[i];
344		if (epoll_apply_one_change(base, epollop, ch) < 0)
345			r = -1;
346	}
347
348	return (r);
349}
350
351static int
352epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
353    short old, short events, void *p)
354{
355	struct event_change ch;
356	ch.fd = fd;
357	ch.old_events = old;
358	ch.read_change = ch.write_change = 0;
359	if (events & EV_WRITE)
360		ch.write_change = EV_CHANGE_ADD |
361		    (events & EV_ET);
362	if (events & EV_READ)
363		ch.read_change = EV_CHANGE_ADD |
364		    (events & EV_ET);
365
366	return epoll_apply_one_change(base, base->evbase, &ch);
367}
368
369static int
370epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
371    short old, short events, void *p)
372{
373	struct event_change ch;
374	ch.fd = fd;
375	ch.old_events = old;
376	ch.read_change = ch.write_change = 0;
377	if (events & EV_WRITE)
378		ch.write_change = EV_CHANGE_DEL;
379	if (events & EV_READ)
380		ch.read_change = EV_CHANGE_DEL;
381
382	return epoll_apply_one_change(base, base->evbase, &ch);
383}
384
385static int
386epoll_dispatch(struct event_base *base, struct timeval *tv)
387{
388	struct epollop *epollop = base->evbase;
389	struct epoll_event *events = epollop->events;
390	int i, res;
391	long timeout = -1;
392
393	if (tv != NULL) {
394		timeout = evutil_tv_to_msec(tv);
395		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
396			/* Linux kernels can wait forever if the timeout is
397			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
398			timeout = MAX_EPOLL_TIMEOUT_MSEC;
399		}
400	}
401
402	epoll_apply_changes(base);
403	event_changelist_remove_all(&base->changelist, base);
404
405	EVBASE_RELEASE_LOCK(base, th_base_lock);
406
407	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
408
409	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
410
411	if (res == -1) {
412		if (errno != EINTR) {
413			event_warn("epoll_wait");
414			return (-1);
415		}
416
417		return (0);
418	}
419
420	event_debug(("%s: epoll_wait reports %d", __func__, res));
421	EVUTIL_ASSERT(res <= epollop->nevents);
422
423	for (i = 0; i < res; i++) {
424		int what = events[i].events;
425		short ev = 0;
426
427		if (what & (EPOLLHUP|EPOLLERR)) {
428			ev = EV_READ | EV_WRITE;
429		} else {
430			if (what & EPOLLIN)
431				ev |= EV_READ;
432			if (what & EPOLLOUT)
433				ev |= EV_WRITE;
434		}
435
436		if (!ev)
437			continue;
438
439		evmap_io_active(base, events[i].data.fd, ev | EV_ET);
440	}
441
442	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
443		/* We used all of the event space this time.  We should
444		   be ready for more events next time. */
445		int new_nevents = epollop->nevents * 2;
446		struct epoll_event *new_events;
447
448		new_events = mm_realloc(epollop->events,
449		    new_nevents * sizeof(struct epoll_event));
450		if (new_events) {
451			epollop->events = new_events;
452			epollop->nevents = new_nevents;
453		}
454	}
455
456	return (0);
457}
458
459
460static void
461epoll_dealloc(struct event_base *base)
462{
463	struct epollop *epollop = base->evbase;
464
465	evsig_dealloc(base);
466	if (epollop->events)
467		mm_free(epollop->events);
468	if (epollop->epfd >= 0)
469		close(epollop->epfd);
470
471	memset(epollop, 0, sizeof(struct epollop));
472	mm_free(epollop);
473}
474