1/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */
2/* dbus-socket-set-epoll.c - a socket set implemented via Linux epoll(4)
3 *
 * Copyright © 2011 Nokia Corporation
5 *
6 * Licensed under the Academic Free License version 2.1
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 * MA  02110-1301  USA
22 *
23 */
24
25#include <config.h>
26#include "dbus-socket-set.h"
27
28#include <dbus/dbus-internals.h>
29#include <dbus/dbus-sysdeps.h>
30
31#ifndef __linux__
32# error This file is for Linux epoll(4)
33#endif
34
35#include <errno.h>
36#include <fcntl.h>
37#include <sys/epoll.h>
38#include <unistd.h>
39
40#ifndef DOXYGEN_SHOULD_SKIP_THIS
41
/* A DBusSocketSet backed by a Linux epoll instance.
 *
 * parent must remain the first member: socket_set_epoll_cast() and
 * _dbus_socket_set_epoll_new() convert between DBusSocketSet * and
 * DBusSocketSetEpoll * with plain pointer casts. */
typedef struct {
    /* generic part; parent.cls points at _dbus_socket_set_epoll_class */
    DBusSocketSet parent;
    /* the epoll file descriptor, or -1 if no epoll instance could be
     * created */
    int epfd;
} DBusSocketSetEpoll;
46
47static inline DBusSocketSetEpoll *
48socket_set_epoll_cast (DBusSocketSet *set)
49{
50  _dbus_assert (set->cls == &_dbus_socket_set_epoll_class);
51  return (DBusSocketSetEpoll *) set;
52}
53
54/* this is safe to call on a partially-allocated socket set */
55static void
56socket_set_epoll_free (DBusSocketSet *set)
57{
58  DBusSocketSetEpoll *self = socket_set_epoll_cast (set);
59
60  if (self == NULL)
61    return;
62
63  if (self->epfd != -1)
64    close (self->epfd);
65
66  dbus_free (self);
67}
68
69DBusSocketSet *
70_dbus_socket_set_epoll_new (void)
71{
72  DBusSocketSetEpoll *self;
73
74  self = dbus_new0 (DBusSocketSetEpoll, 1);
75
76  if (self == NULL)
77    return NULL;
78
79  self->parent.cls = &_dbus_socket_set_epoll_class;
80
81  self->epfd = epoll_create1 (EPOLL_CLOEXEC);
82
83  if (self->epfd == -1)
84    {
85      int flags;
86
87      /* the size hint is ignored unless you have a rather old kernel,
88       * but must be positive on some versions, so just pick something
89       * arbitrary; it's a hint, not a limit */
90      self->epfd = epoll_create (42);
91
92      flags = fcntl (self->epfd, F_GETFD, 0);
93
94      if (flags != -1)
95        fcntl (self->epfd, F_SETFD, flags | FD_CLOEXEC);
96    }
97
98  if (self->epfd == -1)
99    {
100      socket_set_epoll_free ((DBusSocketSet *) self);
101      return NULL;
102    }
103
104  return (DBusSocketSet *) self;
105}
106
107static uint32_t
108watch_flags_to_epoll_events (unsigned int flags)
109{
110  uint32_t events = 0;
111
112  if (flags & DBUS_WATCH_READABLE)
113    events |= EPOLLIN;
114  if (flags & DBUS_WATCH_WRITABLE)
115    events |= EPOLLOUT;
116
117  return events;
118}
119
120static unsigned int
121epoll_events_to_watch_flags (uint32_t events)
122{
123  short flags = 0;
124
125  if (events & EPOLLIN)
126    flags |= DBUS_WATCH_READABLE;
127  if (events & EPOLLOUT)
128    flags |= DBUS_WATCH_WRITABLE;
129  if (events & EPOLLHUP)
130    flags |= DBUS_WATCH_HANGUP;
131  if (events & EPOLLERR)
132    flags |= DBUS_WATCH_ERROR;
133
134  return flags;
135}
136
137static dbus_bool_t
138socket_set_epoll_add (DBusSocketSet  *set,
139                      int             fd,
140                      unsigned int    flags,
141                      dbus_bool_t     enabled)
142{
143  DBusSocketSetEpoll *self = socket_set_epoll_cast (set);
144  struct epoll_event event;
145  int err;
146
147  event.data.fd = fd;
148
149  if (enabled)
150    {
151      event.events = watch_flags_to_epoll_events (flags);
152    }
153  else
154    {
155      /* We need to add *something* to reserve space in the kernel's data
156       * structures: see socket_set_epoll_disable for more details */
157      event.events = EPOLLET;
158    }
159
160  if (epoll_ctl (self->epfd, EPOLL_CTL_ADD, fd, &event) == 0)
161    return TRUE;
162
163  /* Anything except ENOMEM, ENOSPC means we have an internal error. */
164  err = errno;
165  switch (err)
166    {
167      case ENOMEM:
168      case ENOSPC:
169        /* be silent: this is basically OOM, which our callers are expected
170         * to cope with */
171        break;
172
173      case EBADF:
174        _dbus_warn ("Bad fd %d\n", fd);
175        break;
176
177      case EEXIST:
178        _dbus_warn ("fd %d added and then added again\n", fd);
179        break;
180
181      default:
182        _dbus_warn ("Misc error when trying to watch fd %d: %s\n", fd,
183                    strerror (err));
184        break;
185    }
186
187  return FALSE;
188}
189
190static void
191socket_set_epoll_enable (DBusSocketSet  *set,
192                         int             fd,
193                         unsigned int    flags)
194{
195  DBusSocketSetEpoll *self = socket_set_epoll_cast (set);
196  struct epoll_event event;
197  int err;
198
199  event.data.fd = fd;
200  event.events = watch_flags_to_epoll_events (flags);
201
202  if (epoll_ctl (self->epfd, EPOLL_CTL_MOD, fd, &event) == 0)
203    return;
204
205  err = errno;
206
207  /* Enabling a file descriptor isn't allowed to fail, even for OOM, so we
208   * do our best to avoid all of these. */
209  switch (err)
210    {
211      case EBADF:
212        _dbus_warn ("Bad fd %d\n", fd);
213        break;
214
215      case ENOENT:
216        _dbus_warn ("fd %d enabled before it was added\n", fd);
217        break;
218
219      case ENOMEM:
220        _dbus_warn ("Insufficient memory to change watch for fd %d\n", fd);
221        break;
222
223      default:
224        _dbus_warn ("Misc error when trying to watch fd %d: %s\n", fd,
225                    strerror (err));
226        break;
227    }
228}
229
230static void
231socket_set_epoll_disable (DBusSocketSet  *set,
232                          int             fd)
233{
234  DBusSocketSetEpoll *self = socket_set_epoll_cast (set);
235  struct epoll_event event;
236  int err;
237
238  /* The naive thing to do would be EPOLL_CTL_DEL, but that'll probably
239   * free resources in the kernel. When we come to do socket_set_epoll_enable,
240   * there might not be enough resources to bring it back!
241   *
242   * The next idea you might have is to set the flags to 0. However, events
243   * always trigger on EPOLLERR and EPOLLHUP, even if libdbus isn't actually
244   * delivering them to a DBusWatch. Because epoll is level-triggered by
245   * default, we'll busy-loop on an unhandled error or hangup; not good.
246   *
247   * So, let's set it to be edge-triggered: then the worst case is that
248   * we return from poll immediately on one iteration, ignore it because no
249   * watch is enabled, then go back to normal. When we re-enable a watch
250   * we'll switch back to level-triggered and be notified again (verified to
251   * work on 2.6.32). Compile this file with -DTEST_BEHAVIOUR_OF_EPOLLET for
252   * test code.
253   */
254  event.data.fd = fd;
255  event.events = EPOLLET;
256
257  if (epoll_ctl (self->epfd, EPOLL_CTL_MOD, fd, &event) == 0)
258    return;
259
260  err = errno;
261  _dbus_warn ("Error when trying to watch fd %d: %s\n", fd,
262              strerror (err));
263}
264
265static void
266socket_set_epoll_remove (DBusSocketSet  *set,
267                         int             fd)
268{
269  DBusSocketSetEpoll *self = socket_set_epoll_cast (set);
270  int err;
271  /* Kernels < 2.6.9 require a non-NULL struct pointer, even though its
272   * contents are ignored */
273  struct epoll_event dummy = { 0 };
274
275  if (epoll_ctl (self->epfd, EPOLL_CTL_DEL, fd, &dummy) == 0)
276    return;
277
278  err = errno;
279  _dbus_warn ("Error when trying to remove fd %d: %s\n", fd, strerror (err));
280}
281
282/* Optimally, this should be the same as in DBusLoop: we use it to translate
283 * between struct epoll_event and DBusSocketEvent without allocating heap
284 * memory. */
285#define N_STACK_DESCRIPTORS 64
286
287static int
288socket_set_epoll_poll (DBusSocketSet   *set,
289                       DBusSocketEvent *revents,
290                       int              max_events,
291                       int              timeout_ms)
292{
293  DBusSocketSetEpoll *self = socket_set_epoll_cast (set);
294  struct epoll_event events[N_STACK_DESCRIPTORS];
295  int n_ready;
296  int i;
297
298  _dbus_assert (max_events > 0);
299
300  n_ready = epoll_wait (self->epfd, events,
301                        MIN (_DBUS_N_ELEMENTS (events), max_events),
302                        timeout_ms);
303
304  if (n_ready <= 0)
305    return n_ready;
306
307  for (i = 0; i < n_ready; i++)
308    {
309      revents[i].fd = events[i].data.fd;
310      revents[i].flags = epoll_events_to_watch_flags (events[i].events);
311    }
312
313  return n_ready;
314}
315
/* Method table for the epoll-based socket set. Sets created by
 * _dbus_socket_set_epoll_new point at it, and socket_set_epoll_cast uses
 * its address to check that a set belongs to this implementation.
 *
 * The initializer is positional, so the entries must stay in the same
 * order as the members of DBusSocketSetClass (declared outside this file). */
DBusSocketSetClass _dbus_socket_set_epoll_class = {
    socket_set_epoll_free,      /* close the epoll fd and free the set */
    socket_set_epoll_add,       /* EPOLL_CTL_ADD */
    socket_set_epoll_remove,    /* EPOLL_CTL_DEL */
    socket_set_epoll_enable,    /* EPOLL_CTL_MOD to the requested events */
    socket_set_epoll_disable,   /* EPOLL_CTL_MOD to EPOLLET only */
    socket_set_epoll_poll       /* epoll_wait and translate the results */
};
324
325#ifdef TEST_BEHAVIOUR_OF_EPOLLET
326/* usage: cat /dev/null | ./epoll
327 *
328 * desired output:
329 * ctl ADD: 0
330 * wait for HUP, edge-triggered: 1
331 * wait for HUP again: 0
332 * ctl MOD: 0
333 * wait for HUP: 1
334 */
335
336#include <sys/epoll.h>
337
338#include <stdio.h>
339
/* Standalone experiment: register stdin edge-triggered, wait for a hangup
 * twice, then switch the registration back to level-triggered and wait
 * again, printing the result of every call so that it can be compared
 * with the desired output listed above.
 *
 * Returns 0 once all steps have run, or 1 if no epoll instance could be
 * created. */
int
main (void)
{
  /* Zero-initialized so that no indeterminate bytes (in particular the
   * unused data member) are handed to the kernel */
  struct epoll_event input = { 0 };
  struct epoll_event output = { 0 };
  int epfd = epoll_create1 (EPOLL_CLOEXEC);
  int fd = 0; /* stdin */
  int ret;

  if (epfd == -1)
    {
      /* Without an epoll instance every later step would just fail with
       * EBADF, which would make the output misleading */
      perror ("epoll_create1");
      return 1;
    }

  input.events = EPOLLHUP | EPOLLET;
  ret = epoll_ctl (epfd, EPOLL_CTL_ADD, fd, &input);
  printf ("ctl ADD: %d\n", ret);

  ret = epoll_wait (epfd, &output, 1, -1);
  printf ("wait for HUP, edge-triggered: %d\n", ret);

  /* Short timeout: in edge-triggered mode the hangup should not be
   * reported a second time */
  ret = epoll_wait (epfd, &output, 1, 1);
  printf ("wait for HUP again: %d\n", ret);

  /* Back to level-triggered: the hangup should be reported once more */
  input.events = EPOLLHUP;
  ret = epoll_ctl (epfd, EPOLL_CTL_MOD, fd, &input);
  printf ("ctl MOD: %d\n", ret);

  ret = epoll_wait (epfd, &output, 1, -1);
  printf ("wait for HUP: %d\n", ret);

  return 0;
}
368
369#endif /* TEST_BEHAVIOUR_OF_EPOLLET */
370
371#endif /* !DOXYGEN_SHOULD_SKIP_THIS */
372