1/*
2 * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1998-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id$ */
19
20/*! \file */
21
22#include <config.h>
23
24#include <sys/param.h>
25#include <sys/types.h>
26#include <sys/socket.h>
27#include <sys/stat.h>
28#include <sys/time.h>
29#include <sys/uio.h>
30
31#include <errno.h>
32#include <fcntl.h>
33#include <stddef.h>
34#include <stdlib.h>
35#include <string.h>
36#include <unistd.h>
37
38#include <isc/buffer.h>
39#include <isc/bufferlist.h>
40#include <isc/condition.h>
41#include <isc/formatcheck.h>
42#include <isc/list.h>
43#include <isc/log.h>
44#include <isc/mem.h>
45#include <isc/msgs.h>
46#include <isc/mutex.h>
47#include <isc/net.h>
48#include <isc/once.h>
49#include <isc/platform.h>
50#include <isc/print.h>
51#include <isc/region.h>
52#include <isc/socket.h>
53#include <isc/stats.h>
54#include <isc/strerror.h>
55#include <isc/task.h>
56#include <isc/thread.h>
57#include <isc/util.h>
58#include <isc/xml.h>
59
60#ifdef ISC_PLATFORM_HAVESYSUNH
61#include <sys/un.h>
62#endif
63#ifdef ISC_PLATFORM_HAVEKQUEUE
64#include <sys/event.h>
65#endif
66#ifdef ISC_PLATFORM_HAVEEPOLL
67#include <sys/epoll.h>
68#endif
69#ifdef ISC_PLATFORM_HAVEDEVPOLL
70#if defined(HAVE_SYS_DEVPOLL_H)
71#include <sys/devpoll.h>
72#elif defined(HAVE_DEVPOLL_H)
73#include <devpoll.h>
74#endif
75#endif
76
77#include "errno2result.h"
78
79/* See task.c about the following definition: */
80#ifdef BIND9
81#ifdef ISC_PLATFORM_USETHREADS
82#define USE_WATCHER_THREAD
83#else
84#define USE_SHARED_MANAGER
85#endif	/* ISC_PLATFORM_USETHREADS */
86#endif	/* BIND9 */
87
88#ifndef USE_WATCHER_THREAD
89#include "socket_p.h"
90#include "../task_p.h"
91#endif /* USE_WATCHER_THREAD */
92
93#if defined(SO_BSDCOMPAT) && defined(__linux__)
94#include <sys/utsname.h>
95#endif
96
97/*%
98 * Choose the most preferable multiplex method.
99 */
100#ifdef ISC_PLATFORM_HAVEKQUEUE
101#define USE_KQUEUE
102#elif defined (ISC_PLATFORM_HAVEEPOLL)
103#define USE_EPOLL
104#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
105#define USE_DEVPOLL
/*
 * Per-FD bookkeeping for /dev/poll: which directions are currently
 * registered with the kernel.  Needed because /dev/poll can only cancel
 * *all* polling for an FD at once (POLLREMOVE), so the surviving
 * interest must be re-registered afterwards (see unwatch_fd()).
 */
typedef struct {
	unsigned int want_read : 1,	/* POLLIN currently requested */
		want_write : 1;		/* POLLOUT currently requested */
} pollinfo_t;
110#else
111#define USE_SELECT
112#endif	/* ISC_PLATFORM_HAVEKQUEUE */
113
#ifndef USE_WATCHER_THREAD
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
/*
 * State passed between the wait and dispatch halves of the
 * non-threaded event API (presumably declared in socket_p.h — confirm).
 * For the kernel-queue back ends only an event count is needed.
 */
struct isc_socketwait {
	int nevents;		/* events reported by the kernel queue */
};
#elif defined (USE_SELECT)
/*
 * select() variant: the FD sets to hand to select() plus its results.
 */
struct isc_socketwait {
	fd_set *readset;	/* FDs to test for readability */
	fd_set *writeset;	/* FDs to test for writability */
	int nfds;		/* NOTE(review): presumably select()'s return — confirm */
	int maxfd;		/* highest FD of interest (select() bound) */
};
#endif	/* USE_KQUEUE */
#endif /* !USE_WATCHER_THREAD */
128
129/*%
130 * Maximum number of allowable open sockets.  This is also the maximum
131 * allowable socket file descriptor.
132 *
133 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure that select() accepts more than (the system default
135 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
136 * the vast majority of cases.  This constant should therefore be increased only
137 * when absolutely necessary and possible, i.e., the server is exhausting all
138 * available file descriptors (up to FD_SETSIZE) and the select() function
139 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
141 * portability as possible).  Note also that overall server performance
142 * may be rather worsened with a larger value of this constant due to
143 * inherent scalability problems of select().
144 *
145 * As a special note, this value shouldn't have to be touched if
146 * this is a build for an authoritative only DNS server.
147 */
148#ifndef ISC_SOCKET_MAXSOCKETS
149#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
150#define ISC_SOCKET_MAXSOCKETS 4096
151#elif defined(USE_SELECT)
152#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
153#endif	/* USE_KQUEUE... */
154#endif	/* ISC_SOCKET_MAXSOCKETS */
155
156#ifdef USE_SELECT
157/*%
158 * Mac OS X needs a special definition to support larger values in select().
159 * We always define this because a larger value can be specified run-time.
160 */
161#ifdef __APPLE__
162#define _DARWIN_UNLIMITED_SELECT
163#endif	/* __APPLE__ */
164#endif	/* USE_SELECT */
165
166#ifdef ISC_SOCKET_USE_POLLWATCH
167/*%
168 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
169 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
170 * some of the specified FD.  The idea is based on the observation that it's
171 * likely for a busy server to keep receiving packets.  It specifically works
172 * as follows: the socket watcher is first initialized with the state of
173 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
174 * event occurs.  When it wakes up for a socket I/O event, it moves to the
175 * poll_active state, and sets the poll timeout to a short period
176 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
177 * watcher goes to the poll_checking state with the same timeout period.
178 * In this state, the watcher tries to detect whether this is a break
179 * during intermittent events or the kernel bug is triggered.  If the next
180 * polling reports an event within the short period, the previous timeout is
181 * likely to be a kernel bug, and so the watcher goes back to the active state.
182 * Otherwise, it moves to the idle state again.
183 *
184 * It's not clear whether this is a thread-related bug, but since we've only
185 * seen this with threads, this workaround is used only when enabling threads.
186 */
187
188typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
189
190#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
191#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
192#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
193#endif	/* ISC_SOCKET_USE_POLLWATCH */
194
195/*%
196 * Size of per-FD lock buckets.
197 */
198#ifdef ISC_PLATFORM_USETHREADS
199#define FDLOCK_COUNT		1024
200#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
201#else
202#define FDLOCK_COUNT		1
203#define FDLOCK_ID(fd)		0
204#endif	/* ISC_PLATFORM_USETHREADS */
205
206/*%
207 * Maximum number of events communicated with the kernel.  There should normally
208 * be no need for having a large number.
209 */
210#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
211#ifndef ISC_SOCKET_MAXEVENTS
212#define ISC_SOCKET_MAXEVENTS	64
213#endif
214#endif
215
216/*%
217 * Some systems define the socket length argument as an int, some as size_t,
218 * some as socklen_t.  This is here so it can be easily changed if needed.
219 */
220#ifndef ISC_SOCKADDR_LEN_T
221#define ISC_SOCKADDR_LEN_T unsigned int
222#endif
223
224/*%
225 * Define what the possible "soft" errors can be.  These are non-fatal returns
226 * of various network related functions, like recv() and so on.
227 *
228 * For some reason, BSDI (and perhaps others) will sometimes return <0
229 * from recv() but will have errno==0.  This is broken, but we have to
230 * work around it here.
231 */
232#define SOFT_ERROR(e)	((e) == EAGAIN || \
233			 (e) == EWOULDBLOCK || \
234			 (e) == EINTR || \
235			 (e) == 0)
236
237#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
238
239/*!<
240 * DLVL(90)  --  Function entry/exit and other tracing.
241 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
242 * DLVL(60)  --  Socket data send/receive
243 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
244 * DLVL(20)  --  Socket creation/destruction.
245 */
246#define TRACE_LEVEL		90
247#define CORRECTNESS_LEVEL	70
248#define IOEVENT_LEVEL		60
249#define EVENT_LEVEL		50
250#define CREATION_LEVEL		20
251
252#define TRACE		DLVL(TRACE_LEVEL)
253#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
254#define IOEVENT		DLVL(IOEVENT_LEVEL)
255#define EVENT		DLVL(EVENT_LEVEL)
256#define CREATION	DLVL(CREATION_LEVEL)
257
258typedef isc_event_t intev_t;
259
260#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
261#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
262
263/*!
264 * IPv6 control information.  If the socket is an IPv6 socket we want
265 * to collect the destination address and interface so the client can
266 * set them on outgoing packets.
267 */
268#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
269#ifndef USE_CMSG
270#define USE_CMSG	1
271#endif
272#endif
273
274/*%
275 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
276 * a setsockopt() like interface to request timestamps, and if the OS
277 * doesn't do it for us, call gettimeofday() on every UDP receive?
278 */
279#ifdef SO_TIMESTAMP
280#ifndef USE_CMSG
281#define USE_CMSG	1
282#endif
283#endif
284
285/*%
286 * The size to raise the receive buffer to (from BIND 8).
287 */
288#define RCVBUFSIZE (32*1024)
289
290/*%
291 * The number of times a send operation is repeated if the result is EINTR.
292 */
293#define NRETRIES 10
294
295typedef struct isc__socket isc__socket_t;
296typedef struct isc__socketmgr isc__socketmgr_t;
297
298#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
299
/*
 * Internal representation of a socket.  The public isc_socket_t view is
 * embedded as the first member ('common'); the rest is private to this
 * unit.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t		common;		/* public view of this object */
	isc__socketmgr_t	*manager;	/* owning manager */
	isc_mutex_t		lock;		/* protects the fields below */
	isc_sockettype_t	type;		/* socket type (udp/tcp/...) */
	const isc_statscounter_t	*statsindex;	/* STATID_* -> counter map */

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t)	link;		/* entry in manager->socklist */
	unsigned int		references;	/* refcount; 0 means SOCK_DEAD() */
	int			fd;		/* underlying OS descriptor */
	int			pf;		/* protocol family (e.g. AF_INET6) */
	char				name[16];	/* debugging name */
	void *				tag;		/* opaque user tag */

	/* Pending completion events, per operation kind. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;	/* at most one */

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;  /* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */

#ifdef ISC_NET_RECVOVERFLOW
	unsigned char		overflow; /* used for MSG_TRUNC fake */
#endif

	/* Ancillary (cmsg) data buffers for recvmsg()/sendmsg(). */
	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;

	/* Callback state for fdwatch sockets (isc__socket_fdwatchcreate). */
	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
};
353
354#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
355#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
356
/*
 * Internal representation of a socket manager; the public
 * isc_socketmgr_t view is embedded as 'common'.
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t		common;		/* public view of the manager */
	isc_mem_t	       *mctx;		/* memory context for allocations */
	isc_mutex_t		lock;		/* manager-wide lock */
	isc_mutex_t		*fdlock;	/* FDLOCK_COUNT bucket locks; see FDLOCK_ID() */
	isc_stats_t		*stats;		/* statistics counters (may be NULL) */
#ifdef USE_KQUEUE
	int			kqueue_fd;	/* kqueue() descriptor */
	int			nevents;	/* capacity of events[] — TODO confirm */
	struct kevent		*events;	/* kernel event return buffer */
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;	/* epoll instance descriptor */
	int			nevents;	/* capacity of events[] — TODO confirm */
	struct epoll_event	*events;	/* kernel event return buffer */
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;	/* /dev/poll descriptor */
	int			nevents;	/* capacity of events[] — TODO confirm */
	struct pollfd		*events;	/* kernel event return buffer */
#endif	/* USE_DEVPOLL */
#ifdef USE_SELECT
	int			fd_bufsize;	/* byte size of each fd_set buffer */
#endif	/* USE_SELECT */
	unsigned int		maxsocks;	/* max FDs; bounds fds[]/fdstate[] */
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];	/* watcher wakeup pipe: [0]=read, [1]=write */
#endif

	/* Locked by fdlock. */
	isc__socket_t	       **fds;		/* FD -> socket map */
	int			*fdstate;	/* FD -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;	/* FD -> registered poll interest */
#endif

	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t)	socklist;	/* all live sockets */
#ifdef USE_SELECT
	fd_set			*read_fds;	/* master read interest set */
	fd_set			*read_fds_copy;	/* scratch copy passed to select() */
	fd_set			*write_fds;	/* master write interest set */
	fd_set			*write_fds_copy;	/* scratch copy passed to select() */
	int			maxfd;		/* highest watched FD */
#endif	/* USE_SELECT */
	int			reserved;	/* unlocked */
#ifdef USE_WATCHER_THREAD
	isc_thread_t		watcher;	/* dedicated watcher thread */
	isc_condition_t		shutdown_ok;	/* signaled when shutdown may proceed */
#else /* USE_WATCHER_THREAD */
	unsigned int		refs;		/* shared-manager reference count */
#endif /* USE_WATCHER_THREAD */
	int			maxudp;		/* NOTE(review): presumably a UDP size cap — confirm */
};
412
#ifdef USE_SHARED_MANAGER
static isc__socketmgr_t *socketmgr = NULL;	/* the single shared manager */
#endif /* USE_SHARED_MANAGER */

/*
 * Values for the per-FD state table (manager->fdstate).
 */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1	/* FD belongs to a live socket */
#define CLOSE_PENDING		2	/* close requested; wakeup_socket() will close */
420
421/*
422 * send() and recv() iovec counts
423 */
424#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
425#ifdef ISC_NET_RECVOVERFLOW
426# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
427#else
428# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
429#endif
430
431static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
432static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
433static void free_socket(isc__socket_t **);
434static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
435				    isc__socket_t **);
436static void destroy(isc__socket_t **);
437static void internal_accept(isc_task_t *, isc_event_t *);
438static void internal_connect(isc_task_t *, isc_event_t *);
439static void internal_recv(isc_task_t *, isc_event_t *);
440static void internal_send(isc_task_t *, isc_event_t *);
441static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
442static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
443static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
444static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
445			      struct msghdr *, struct iovec *, size_t *);
446static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
447			      struct msghdr *, struct iovec *, size_t *);
448#ifdef USE_WATCHER_THREAD
449static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
450#endif
451
452/*%
453 * The following can be either static or public, depending on build environment.
454 */
455
456#ifdef BIND9
457#define ISC_SOCKETFUNC_SCOPE
458#else
459#define ISC_SOCKETFUNC_SCOPE static
460#endif
461
462ISC_SOCKETFUNC_SCOPE isc_result_t
463isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
464		   isc_socket_t **socketp);
465ISC_SOCKETFUNC_SCOPE void
466isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
467ISC_SOCKETFUNC_SCOPE void
468isc__socket_detach(isc_socket_t **socketp);
469ISC_SOCKETFUNC_SCOPE isc_result_t
470isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
471ISC_SOCKETFUNC_SCOPE isc_result_t
472isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
473		       unsigned int maxsocks);
474ISC_SOCKETFUNC_SCOPE void
475isc__socketmgr_destroy(isc_socketmgr_t **managerp);
476ISC_SOCKETFUNC_SCOPE isc_result_t
477isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
478		 unsigned int minimum, isc_task_t *task,
479		  isc_taskaction_t action, const void *arg);
480ISC_SOCKETFUNC_SCOPE isc_result_t
481isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
482		 unsigned int minimum, isc_task_t *task,
483		 isc_taskaction_t action, const void *arg);
484ISC_SOCKETFUNC_SCOPE isc_result_t
485isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
486		  unsigned int minimum, isc_task_t *task,
487		  isc_socketevent_t *event, unsigned int flags);
488ISC_SOCKETFUNC_SCOPE isc_result_t
489isc__socket_send(isc_socket_t *sock, isc_region_t *region,
490		 isc_task_t *task, isc_taskaction_t action, const void *arg);
491ISC_SOCKETFUNC_SCOPE isc_result_t
492isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
493		   isc_task_t *task, isc_taskaction_t action, const void *arg,
494		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
495ISC_SOCKETFUNC_SCOPE isc_result_t
496isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
497		  isc_task_t *task, isc_taskaction_t action, const void *arg);
498ISC_SOCKETFUNC_SCOPE isc_result_t
499isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
500		    isc_task_t *task, isc_taskaction_t action, const void *arg,
501		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
502ISC_SOCKETFUNC_SCOPE isc_result_t
503isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
504		    isc_task_t *task,
505		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
506		    isc_socketevent_t *event, unsigned int flags);
507ISC_SOCKETFUNC_SCOPE void
508isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
509ISC_SOCKETFUNC_SCOPE isc_result_t
510isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
511		     isc_uint32_t owner, isc_uint32_t group);
512ISC_SOCKETFUNC_SCOPE isc_result_t
513isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
514		 unsigned int options);
515ISC_SOCKETFUNC_SCOPE isc_result_t
516isc__socket_filter(isc_socket_t *sock, const char *filter);
517ISC_SOCKETFUNC_SCOPE isc_result_t
518isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
519ISC_SOCKETFUNC_SCOPE isc_result_t
520isc__socket_accept(isc_socket_t *sock,
521		   isc_task_t *task, isc_taskaction_t action, const void *arg);
522ISC_SOCKETFUNC_SCOPE isc_result_t
523isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
524		    isc_task_t *task, isc_taskaction_t action,
525		    const void *arg);
526ISC_SOCKETFUNC_SCOPE isc_result_t
527isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
528ISC_SOCKETFUNC_SCOPE isc_result_t
529isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
530ISC_SOCKETFUNC_SCOPE void
531isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
532ISC_SOCKETFUNC_SCOPE isc_sockettype_t
533isc__socket_gettype(isc_socket_t *sock);
534ISC_SOCKETFUNC_SCOPE isc_boolean_t
535isc__socket_isbound(isc_socket_t *sock);
536ISC_SOCKETFUNC_SCOPE void
537isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
538#if defined(HAVE_LIBXML2) && defined(BIND9)
539ISC_SOCKETFUNC_SCOPE void
540isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
541#endif
542
543ISC_SOCKETFUNC_SCOPE isc_result_t
544isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
545			  isc_sockfdwatch_t callback, void *cbarg,
546			  isc_task_t *task, isc_socket_t **socketp);
547ISC_SOCKETFUNC_SCOPE isc_result_t
548isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
549
/*
 * Method table through which the generic isc_socket API dispatches into
 * this implementation.  Entry order must match isc_socketmethods_t.
 */
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
#ifndef BIND9
	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
		*listen, *accept, *getpeername, *isbound;
#endif
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_sendto,
		isc__socket_connect,
		isc__socket_recv,
		isc__socket_cancel,
		isc__socket_getsockname,
		isc__socket_gettype,
		isc__socket_ipv6only,
		isc__socket_fdwatchpoke
	}
#ifndef BIND9
	,
	(void *)isc__socket_recvv, (void *)isc__socket_send,
	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
	(void *)isc__socket_filter, (void *)isc__socket_listen,
	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
	(void *)isc__socket_isbound
#endif
};
584
/*
 * Manager-level method table for the generic isc_socketmgr API.
 */
static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
	isc__socket_create,
	isc__socket_fdwatchcreate
};
590
591#define SELECT_POKE_SHUTDOWN		(-1)
592#define SELECT_POKE_NOTHING		(-2)
593#define SELECT_POKE_READ		(-3)
594#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
595#define SELECT_POKE_WRITE		(-4)
596#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
597#define SELECT_POKE_CLOSE		(-5)
598
599#define SOCK_DEAD(s)			((s)->references == 0)
600
601/*%
602 * Shortcut index arrays to get access to statistics counters.
603 */
enum {
	STATID_OPEN = 0,	/* socket successfully opened */
	STATID_OPENFAIL = 1,	/* open attempt failed */
	STATID_CLOSE = 2,	/* socket closed */
	STATID_BINDFAIL = 3,	/* bind() failed */
	STATID_CONNECTFAIL = 4,	/* connect() failed */
	STATID_CONNECT = 5,	/* connect() succeeded */
	STATID_ACCEPTFAIL = 6,	/* accept() failed (-1 for non-listening types) */
	STATID_ACCEPT = 7,	/* accept() succeeded (-1 for non-listening types) */
	STATID_SENDFAIL = 8,	/* send operation failed */
	STATID_RECVFAIL = 9	/* receive operation failed */
};
/*
 * STATID_* -> counter map for UDP/IPv4 sockets.  (The "upd4" spelling is
 * a long-standing transposition of "udp4", kept because the identifier
 * is referenced elsewhere.)  -1 entries mark counters that do not apply;
 * inc_stats() must never be called with them.
 */
static const isc_statscounter_t upd4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,	/* UDP sockets do not accept */
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail
};
/*
 * STATID_* -> counter map for UDP/IPv6 sockets ("upd6" is the same
 * historical transposition of "udp6" as above).
 */
static const isc_statscounter_t upd6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,	/* UDP sockets do not accept */
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail
};
/* STATID_* -> counter map for TCP/IPv4 sockets. */
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
	isc_sockstatscounter_tcp4recvfail
};
/* STATID_* -> counter map for TCP/IPv6 sockets. */
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
	isc_sockstatscounter_tcp6recvfail
};
/* STATID_* -> counter map for Unix-domain sockets. */
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
	isc_sockstatscounter_unixrecvfail
};
/*
 * STATID_* -> counter map for fdwatch sockets.  fdwatch wraps an FD that
 * was opened elsewhere, so open/close/accept counters do not apply (-1).
 */
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,	/* FD was opened by the caller, not here */
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,	/* fdwatch sockets do not accept */
	-1,
	isc_sockstatscounter_fdwatchsendfail,
	isc_sockstatscounter_fdwatchrecvfail
};
688
689#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
690    defined(USE_WATCHER_THREAD)
691static void
692manager_log(isc__socketmgr_t *sockmgr,
693	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
694	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
695static void
696manager_log(isc__socketmgr_t *sockmgr,
697	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
698	    const char *fmt, ...)
699{
700	char msgbuf[2048];
701	va_list ap;
702
703	if (! isc_log_wouldlog(isc_lctx, level))
704		return;
705
706	va_start(ap, fmt);
707	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
708	va_end(ap);
709
710	isc_log_write(isc_lctx, category, module, level,
711		      "sockmgr %p: %s", sockmgr, msgbuf);
712}
713#endif
714
715static void
716socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
717	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
718	   isc_msgcat_t *msgcat, int msgset, int message,
719	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
720static void
721socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
722	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
723	   isc_msgcat_t *msgcat, int msgset, int message,
724	   const char *fmt, ...)
725{
726	char msgbuf[2048];
727	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
728	va_list ap;
729
730	if (! isc_log_wouldlog(isc_lctx, level))
731		return;
732
733	va_start(ap, fmt);
734	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
735	va_end(ap);
736
737	if (address == NULL) {
738		isc_log_iwrite(isc_lctx, category, module, level,
739			       msgcat, msgset, message,
740			       "socket %p: %s", sock, msgbuf);
741	} else {
742		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
743		isc_log_iwrite(isc_lctx, category, module, level,
744			       msgcat, msgset, message,
745			       "socket %p %s: %s", sock, peerbuf, msgbuf);
746	}
747}
748
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	/* Only IPv6 UDP sockets are affected by the AIX bug. */
	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	/* Re-assert IPV6_RECVPKTINFO; log (but tolerate) failure. */
	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) < 0) {

		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
	}
}
#else
/* No-op on platforms without the AIX IPV6_V6ONLY/RECVPKTINFO bug. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
781
782/*%
783 * Increment socket-related statistics counters.
784 */
785static inline void
786inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
787	REQUIRE(counterid != -1);
788
789	if (stats != NULL)
790		isc_stats_increment(stats, counterid);
791}
792
/*%
 * Register kernel-level interest in 'fd': readability when msg is
 * SELECT_POKE_READ, otherwise writability.  Exactly one implementation
 * is compiled in, matching the multiplex method chosen above.
 */
static inline isc_result_t
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/*
	 * EEXIST is tolerated: the FD may already be registered.
	 * NOTE(review): EPOLL_CTL_ADD on an existing FD does not merge
	 * event masks; callers appear to rely on this being benign — confirm.
	 */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	/* fdpollinfo must track the kernel state; update under the fdlock. */
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	/* The master fd_sets are protected by the manager lock. */
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}
860
/*%
 * Cancel kernel-level interest in 'fd' for the direction selected by
 * 'msg' (SELECT_POKE_READ or SELECT_POKE_WRITE); counterpart of
 * watch_fd().
 */
static inline isc_result_t
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/* ENOENT just means the FD was never (or no longer) registered. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;	/* drops ALL interest for this FD */
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		/* Still want writes: re-register POLLOUT in the same write. */
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		/* Still want reads: re-register POLLIN in the same write. */
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	/* The master fd_sets are protected by the manager lock. */
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}
947
/*%
 * Act on a wakeup request for 'fd'.  SELECT_POKE_CLOSE finishes a
 * pending close (unwatch both directions, then close the descriptor);
 * other msg values (re)register kernel interest via watch_fd(), but
 * only while the FD is in the MANAGED state.
 */
static void
wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
		return;
	}
	UNLOCK(&manager->fdlock[lockid]);

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}
1009
1010#ifdef USE_WATCHER_THREAD
1011/*
1012 * Poke the select loop when there is something for us to do.
1013 * The write is required (by POSIX) to complete.  That is, we
1014 * will not get partial writes.
1015 */
1016static void
1017select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
1018	int cc;
1019	int buf[2];
1020	char strbuf[ISC_STRERRORSIZE];
1021
1022	buf[0] = fd;
1023	buf[1] = msg;
1024
1025	do {
1026		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
1027#ifdef ENOSR
1028		/*
1029		 * Treat ENOSR as EAGAIN but loop slowly as it is
1030		 * unlikely to clear fast.
1031		 */
1032		if (cc < 0 && errno == ENOSR) {
1033			sleep(1);
1034			errno = EAGAIN;
1035		}
1036#endif
1037	} while (cc < 0 && SOFT_ERROR(errno));
1038
1039	if (cc < 0) {
1040		isc__strerror(errno, strbuf, sizeof(strbuf));
1041		FATAL_ERROR(__FILE__, __LINE__,
1042			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1043					   ISC_MSG_WRITEFAILED,
1044					   "write() failed "
1045					   "during watcher poke: %s"),
1046			    strbuf);
1047	}
1048
1049	INSIST(cc == sizeof(buf));
1050}
1051
1052/*
1053 * Read a message on the internal fd.
1054 */
1055static void
1056select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
1057	int buf[2];
1058	int cc;
1059	char strbuf[ISC_STRERRORSIZE];
1060
1061	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
1062	if (cc < 0) {
1063		*msg = SELECT_POKE_NOTHING;
1064		*fd = -1;	/* Silence compiler. */
1065		if (SOFT_ERROR(errno))
1066			return;
1067
1068		isc__strerror(errno, strbuf, sizeof(strbuf));
1069		FATAL_ERROR(__FILE__, __LINE__,
1070			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1071					   ISC_MSG_READFAILED,
1072					   "read() failed "
1073					   "during watcher poke: %s"),
1074			    strbuf);
1075
1076		return;
1077	}
1078	INSIST(cc == sizeof(buf));
1079
1080	*fd = buf[0];
1081	*msg = buf[1];
1082}
1083#else /* USE_WATCHER_THREAD */
1084/*
1085 * Update the state of the socketmgr when something changes.
1086 */
1087static void
1088select_poke(isc__socketmgr_t *manager, int fd, int msg) {
1089	if (msg == SELECT_POKE_SHUTDOWN)
1090		return;
1091	else if (fd >= 0)
1092		wakeup_socket(manager, fd, msg);
1093	return;
1094}
1095#endif /* USE_WATCHER_THREAD */
1096
1097/*
1098 * Make a fd non-blocking.
1099 */
1100static isc_result_t
1101make_nonblock(int fd) {
1102	int ret;
1103	int flags;
1104	char strbuf[ISC_STRERRORSIZE];
1105#ifdef USE_FIONBIO_IOCTL
1106	int on = 1;
1107
1108	ret = ioctl(fd, FIONBIO, (char *)&on);
1109#else
1110	flags = fcntl(fd, F_GETFL, 0);
1111	flags |= PORT_NONBLOCK;
1112	ret = fcntl(fd, F_SETFL, flags);
1113#endif
1114
1115	if (ret == -1) {
1116		isc__strerror(errno, strbuf, sizeof(strbuf));
1117		UNEXPECTED_ERROR(__FILE__, __LINE__,
1118#ifdef USE_FIONBIO_IOCTL
1119				 "ioctl(%d, FIONBIO, &on): %s", fd,
1120#else
1121				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1122#endif
1123				 strbuf);
1124
1125		return (ISC_R_UNEXPECTED);
1126	}
1127
1128	return (ISC_R_SUCCESS);
1129}
1130
1131#ifdef USE_CMSG
1132/*
1133 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1134 * In order to ensure as much portability as possible, we provide wrapper
1135 * functions of these macros.
1136 * Note that cmsg_space() could run slow on OSes that do not have
1137 * CMSG_SPACE.
1138 */
/*
 * Portable CMSG_LEN(): length of a cmsghdr (header plus padding)
 * carrying 'len' bytes of payload.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	ISC_SOCKADDR_LEN_T hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	/*
	 * CMSG_DATA(NULL) yields the offset of the data area from the
	 * start of the header, i.e. the padded header size.
	 */
	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif
}
1154
/*
 * Portable CMSG_SPACE(): number of bytes a cmsghdr with 'len' bytes of
 * payload occupies in a control buffer, including trailing alignment.
 * The fallback probes the system's own CMSG_NXTHDR to discover the
 * platform padding; see the header comment above about its cost.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	/*
	 * Where CMSG_NXTHDR places the next header tells us how much
	 * space the first one consumed.  Returns 0 if it wouldn't fit.
	 */
	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL)
		return ((char *)cmsgp - (char *)msg.msg_control);
	else
		return (0);
#endif
}
1182#endif /* USE_CMSG */
1183
1184/*
1185 * Process control messages received on a socket.
1186 */
1187static void
1188process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1189#ifdef USE_CMSG
1190	struct cmsghdr *cmsgp;
1191#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1192	struct in6_pktinfo *pktinfop;
1193#endif
1194#ifdef SO_TIMESTAMP
1195	struct timeval *timevalp;
1196#endif
1197#endif
1198
1199	/*
1200	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1201	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1202	 * They are all here, outside of the CPP tests, because it is
1203	 * more consistent with the usual ISC coding style.
1204	 */
1205	UNUSED(sock);
1206	UNUSED(msg);
1207	UNUSED(dev);
1208
1209#ifdef ISC_NET_BSD44MSGHDR
1210
1211#ifdef MSG_TRUNC
1212	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1213		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1214#endif
1215
1216#ifdef MSG_CTRUNC
1217	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1218		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1219#endif
1220
1221#ifndef USE_CMSG
1222	return;
1223#else
1224	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1225		return;
1226
1227#ifdef SO_TIMESTAMP
1228	timevalp = NULL;
1229#endif
1230#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1231	pktinfop = NULL;
1232#endif
1233
1234	cmsgp = CMSG_FIRSTHDR(msg);
1235	while (cmsgp != NULL) {
1236		socket_log(sock, NULL, TRACE,
1237			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1238			   "processing cmsg %p", cmsgp);
1239
1240#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1241		if (cmsgp->cmsg_level == IPPROTO_IPV6
1242		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
1243
1244			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1245			memcpy(&dev->pktinfo, pktinfop,
1246			       sizeof(struct in6_pktinfo));
1247			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1248			socket_log(sock, NULL, TRACE,
1249				   isc_msgcat, ISC_MSGSET_SOCKET,
1250				   ISC_MSG_IFRECEIVED,
1251				   "interface received on ifindex %u",
1252				   dev->pktinfo.ipi6_ifindex);
1253			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1254				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1255			goto next;
1256		}
1257#endif
1258
1259#ifdef SO_TIMESTAMP
1260		if (cmsgp->cmsg_level == SOL_SOCKET
1261		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1262			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1263			dev->timestamp.seconds = timevalp->tv_sec;
1264			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1265			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1266			goto next;
1267		}
1268#endif
1269
1270	next:
1271		cmsgp = CMSG_NXTHDR(msg, cmsgp);
1272	}
1273#endif /* USE_CMSG */
1274
1275#endif /* ISC_NET_BSD44MSGHDR */
1276}
1277
1278/*
1279 * Construct an iov array and attach it to the msghdr passed in.  This is
1280 * the SEND constructor, which will use the used region of the buffer
1281 * (if using a buffer list) or will use the internal region (if a single
1282 * buffer I/O is requested).
1283 *
1284 * Nothing can be NULL, and the done event must list at least one buffer
1285 * on the buffer linked list for this function to be meaningful.
1286 *
1287 * If write_countp != NULL, *write_countp will hold the number of bytes
1288 * this transaction can send.
1289 */
1290static void
1291build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
1292		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1293{
1294	unsigned int iovcount;
1295	isc_buffer_t *buffer;
1296	isc_region_t used;
1297	size_t write_count;
1298	size_t skip_count;
1299
1300	memset(msg, 0, sizeof(*msg));
1301
1302	if (!sock->connected) {
1303		msg->msg_name = (void *)&dev->address.type.sa;
1304		msg->msg_namelen = dev->address.length;
1305	} else {
1306		msg->msg_name = NULL;
1307		msg->msg_namelen = 0;
1308	}
1309
1310	buffer = ISC_LIST_HEAD(dev->bufferlist);
1311	write_count = 0;
1312	iovcount = 0;
1313
1314	/*
1315	 * Single buffer I/O?  Skip what we've done so far in this region.
1316	 */
1317	if (buffer == NULL) {
1318		write_count = dev->region.length - dev->n;
1319		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1320		iov[0].iov_len = write_count;
1321		iovcount = 1;
1322
1323		goto config;
1324	}
1325
1326	/*
1327	 * Multibuffer I/O.
1328	 * Skip the data in the buffer list that we have already written.
1329	 */
1330	skip_count = dev->n;
1331	while (buffer != NULL) {
1332		REQUIRE(ISC_BUFFER_VALID(buffer));
1333		if (skip_count < isc_buffer_usedlength(buffer))
1334			break;
1335		skip_count -= isc_buffer_usedlength(buffer);
1336		buffer = ISC_LIST_NEXT(buffer, link);
1337	}
1338
1339	while (buffer != NULL) {
1340		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1341
1342		isc_buffer_usedregion(buffer, &used);
1343
1344		if (used.length > 0) {
1345			iov[iovcount].iov_base = (void *)(used.base
1346							  + skip_count);
1347			iov[iovcount].iov_len = used.length - skip_count;
1348			write_count += (used.length - skip_count);
1349			skip_count = 0;
1350			iovcount++;
1351		}
1352		buffer = ISC_LIST_NEXT(buffer, link);
1353	}
1354
1355	INSIST(skip_count == 0U);
1356
1357 config:
1358	msg->msg_iov = iov;
1359	msg->msg_iovlen = iovcount;
1360
1361#ifdef ISC_NET_BSD44MSGHDR
1362	msg->msg_control = NULL;
1363	msg->msg_controllen = 0;
1364	msg->msg_flags = 0;
1365#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1366	if ((sock->type == isc_sockettype_udp)
1367	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1368#if defined(IPV6_USE_MIN_MTU)
1369		int use_min_mtu = 1;	/* -1, 0, 1 */
1370#endif
1371		struct cmsghdr *cmsgp;
1372		struct in6_pktinfo *pktinfop;
1373
1374		socket_log(sock, NULL, TRACE,
1375			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1376			   "sendto pktinfo data, ifindex %u",
1377			   dev->pktinfo.ipi6_ifindex);
1378
1379		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1380		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1381		msg->msg_control = (void *)sock->sendcmsgbuf;
1382
1383		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1384		cmsgp->cmsg_level = IPPROTO_IPV6;
1385		cmsgp->cmsg_type = IPV6_PKTINFO;
1386		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1387		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1388		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1389#if defined(IPV6_USE_MIN_MTU)
1390		/*
1391		 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
1392		 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
1393		 * is used.
1394		 */
1395		cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
1396					   msg->msg_controllen);
1397		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1398		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1399
1400		cmsgp->cmsg_level = IPPROTO_IPV6;
1401		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1402		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1403		memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1404#endif
1405	}
1406#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1407#else /* ISC_NET_BSD44MSGHDR */
1408	msg->msg_accrights = NULL;
1409	msg->msg_accrightslen = 0;
1410#endif /* ISC_NET_BSD44MSGHDR */
1411
1412	if (write_countp != NULL)
1413		*write_countp = write_count;
1414}
1415
1416/*
1417 * Construct an iov array and attach it to the msghdr passed in.  This is
1418 * the RECV constructor, which will use the available region of the buffer
1419 * (if using a buffer list) or will use the internal region (if a single
1420 * buffer I/O is requested).
1421 *
1422 * Nothing can be NULL, and the done event must list at least one buffer
1423 * on the buffer linked list for this function to be meaningful.
1424 *
1425 * If read_countp != NULL, *read_countp will hold the number of bytes
1426 * this transaction can receive.
1427 */
1428static void
1429build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
1430		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1431{
1432	unsigned int iovcount;
1433	isc_buffer_t *buffer;
1434	isc_region_t available;
1435	size_t read_count;
1436
1437	memset(msg, 0, sizeof(struct msghdr));
1438
1439	if (sock->type == isc_sockettype_udp) {
1440		memset(&dev->address, 0, sizeof(dev->address));
1441#ifdef BROKEN_RECVMSG
1442		if (sock->pf == AF_INET) {
1443			msg->msg_name = (void *)&dev->address.type.sin;
1444			msg->msg_namelen = sizeof(dev->address.type.sin6);
1445		} else if (sock->pf == AF_INET6) {
1446			msg->msg_name = (void *)&dev->address.type.sin6;
1447			msg->msg_namelen = sizeof(dev->address.type.sin6);
1448#ifdef ISC_PLATFORM_HAVESYSUNH
1449		} else if (sock->pf == AF_UNIX) {
1450			msg->msg_name = (void *)&dev->address.type.sunix;
1451			msg->msg_namelen = sizeof(dev->address.type.sunix);
1452#endif
1453		} else {
1454			msg->msg_name = (void *)&dev->address.type.sa;
1455			msg->msg_namelen = sizeof(dev->address.type);
1456		}
1457#else
1458		msg->msg_name = (void *)&dev->address.type.sa;
1459		msg->msg_namelen = sizeof(dev->address.type);
1460#endif
1461#ifdef ISC_NET_RECVOVERFLOW
1462		/* If needed, steal one iovec for overflow detection. */
1463		maxiov--;
1464#endif
1465	} else { /* TCP */
1466		msg->msg_name = NULL;
1467		msg->msg_namelen = 0;
1468		dev->address = sock->peer_address;
1469	}
1470
1471	buffer = ISC_LIST_HEAD(dev->bufferlist);
1472	read_count = 0;
1473
1474	/*
1475	 * Single buffer I/O?  Skip what we've done so far in this region.
1476	 */
1477	if (buffer == NULL) {
1478		read_count = dev->region.length - dev->n;
1479		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1480		iov[0].iov_len = read_count;
1481		iovcount = 1;
1482
1483		goto config;
1484	}
1485
1486	/*
1487	 * Multibuffer I/O.
1488	 * Skip empty buffers.
1489	 */
1490	while (buffer != NULL) {
1491		REQUIRE(ISC_BUFFER_VALID(buffer));
1492		if (isc_buffer_availablelength(buffer) != 0)
1493			break;
1494		buffer = ISC_LIST_NEXT(buffer, link);
1495	}
1496
1497	iovcount = 0;
1498	while (buffer != NULL) {
1499		INSIST(iovcount < MAXSCATTERGATHER_RECV);
1500
1501		isc_buffer_availableregion(buffer, &available);
1502
1503		if (available.length > 0) {
1504			iov[iovcount].iov_base = (void *)(available.base);
1505			iov[iovcount].iov_len = available.length;
1506			read_count += available.length;
1507			iovcount++;
1508		}
1509		buffer = ISC_LIST_NEXT(buffer, link);
1510	}
1511
1512 config:
1513
1514	/*
1515	 * If needed, set up to receive that one extra byte.  Note that
1516	 * we know there is at least one iov left, since we stole it
1517	 * at the top of this function.
1518	 */
1519#ifdef ISC_NET_RECVOVERFLOW
1520	if (sock->type == isc_sockettype_udp) {
1521		iov[iovcount].iov_base = (void *)(&sock->overflow);
1522		iov[iovcount].iov_len = 1;
1523		iovcount++;
1524	}
1525#endif
1526
1527	msg->msg_iov = iov;
1528	msg->msg_iovlen = iovcount;
1529
1530#ifdef ISC_NET_BSD44MSGHDR
1531	msg->msg_control = NULL;
1532	msg->msg_controllen = 0;
1533	msg->msg_flags = 0;
1534#if defined(USE_CMSG)
1535	if (sock->type == isc_sockettype_udp) {
1536		msg->msg_control = sock->recvcmsgbuf;
1537		msg->msg_controllen = sock->recvcmsgbuflen;
1538	}
1539#endif /* USE_CMSG */
1540#else /* ISC_NET_BSD44MSGHDR */
1541	msg->msg_accrights = NULL;
1542	msg->msg_accrightslen = 0;
1543#endif /* ISC_NET_BSD44MSGHDR */
1544
1545	if (read_countp != NULL)
1546		*read_countp = read_count;
1547}
1548
1549static void
1550set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
1551		isc_socketevent_t *dev)
1552{
1553	if (sock->type == isc_sockettype_udp) {
1554		if (address != NULL)
1555			dev->address = *address;
1556		else
1557			dev->address = sock->peer_address;
1558	} else if (sock->type == isc_sockettype_tcp) {
1559		INSIST(address == NULL);
1560		dev->address = sock->peer_address;
1561	}
1562}
1563
1564static void
1565destroy_socketevent(isc_event_t *event) {
1566	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1567
1568	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1569
1570	(ev->destroy)(event);
1571}
1572
1573static isc_socketevent_t *
1574allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
1575		     isc_taskaction_t action, const void *arg)
1576{
1577	isc_socketevent_t *ev;
1578
1579	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1580						     sock, eventtype,
1581						     action, arg,
1582						     sizeof(*ev));
1583
1584	if (ev == NULL)
1585		return (NULL);
1586
1587	ev->result = ISC_R_UNSET;
1588	ISC_LINK_INIT(ev, ev_link);
1589	ISC_LIST_INIT(ev->bufferlist);
1590	ev->region.base = NULL;
1591	ev->n = 0;
1592	ev->offset = 0;
1593	ev->attributes = 0;
1594	ev->destroy = ev->ev_destroy;
1595	ev->ev_destroy = destroy_socketevent;
1596
1597	return (ev);
1598}
1599
1600#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging helper: print every field of 'msg' (name, iovec array and,
 * where available, control buffer) to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int idx;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov,
	       (long) msg->msg_iovlen);
	idx = 0;
	while (idx < (unsigned int)msg->msg_iovlen) {
		printf("\t\t%d\tbase %p, len %ld\n", idx,
		       msg->msg_iov[idx].iov_base,
		       (long) msg->msg_iov[idx].iov_len);
		idx++;
	}
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
1619#endif
1620
1621#define DOIO_SUCCESS		0	/* i/o ok, event sent */
1622#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
1623#define DOIO_HARD		2	/* i/o error, event sent */
1624#define DOIO_EOF		3	/* EOF, no event sent */
1625
/*
 * Receive as much data as fits into the buffers described by 'dev' with
 * a single recvmsg() on 'sock', updating dev->n and the buffer list.
 *
 * Returns one of the DOIO_* codes defined above:
 *	DOIO_SUCCESS	read completed (or acceptable partial);
 *			dev->result is ISC_R_SUCCESS.
 *	DOIO_SOFT	transient condition (soft errno, dropped packet,
 *			short read below dev->minimum); caller retries.
 *	DOIO_HARD	fatal I/O error; dev->result holds the mapping.
 *	DOIO_EOF	zero-length read on a stream socket.
 */
static int
doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno at once; later library calls may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_DOIORECV,
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * Map system errnos to ISC results.  SOFT_OR_HARD is hard only for
 * connected sockets (an unconnected UDP socket may see stale ICMP
 * errors that should not kill the operation); ALWAYS_HARD always is.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
#ifdef EPROTO
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
#endif
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything unrecognized is a hard failure. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0)
			return (DOIO_EOF);
		break;
	case isc_sockettype_udp:
		break;
	case isc_sockettype_fdwatch:
	default:
		INSIST(0);
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Drop spoof-prone packets claiming source port 0. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_ZEROPORT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 512 bytes.
		 */
		if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
			return (DOIO_SOFT);
	}

	socket_log(sock, &dev->address, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	/* Distribute the cc received bytes across the buffer list. */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0U) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			POST(actual_count);
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1803
1804/*
1805 * Returns:
1806 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1807 *			ISC_R_SUCCESS.
1808 *
1809 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1810 *			dev->result contains the appropriate error.
1811 *
1812 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1813 *			event was sent.  The operation should be retried.
1814 *
1815 *	No other return values are possible.
1816 */
1817static int
1818doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1819	int cc;
1820	struct iovec iov[MAXSCATTERGATHER_SEND];
1821	size_t write_count;
1822	struct msghdr msghdr;
1823	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1824	int attempts = 0;
1825	int send_errno;
1826	char strbuf[ISC_STRERRORSIZE];
1827
1828	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1829
1830 resend:
1831	cc = sendmsg(sock->fd, &msghdr, 0);
1832	send_errno = errno;
1833
1834	/*
1835	 * Check for error or block condition.
1836	 */
1837	if (cc < 0) {
1838		if (send_errno == EINTR && ++attempts < NRETRIES)
1839			goto resend;
1840
1841		if (SOFT_ERROR(send_errno))
1842			return (DOIO_SOFT);
1843
1844#define SOFT_OR_HARD(_system, _isc) \
1845	if (send_errno == _system) { \
1846		if (sock->connected) { \
1847			dev->result = _isc; \
1848			inc_stats(sock->manager->stats, \
1849				  sock->statsindex[STATID_SENDFAIL]); \
1850			return (DOIO_HARD); \
1851		} \
1852		return (DOIO_SOFT); \
1853	}
1854#define ALWAYS_HARD(_system, _isc) \
1855	if (send_errno == _system) { \
1856		dev->result = _isc; \
1857		inc_stats(sock->manager->stats, \
1858			  sock->statsindex[STATID_SENDFAIL]); \
1859		return (DOIO_HARD); \
1860	}
1861
1862		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1863		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1864		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1865		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1866		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1867#ifdef EHOSTDOWN
1868		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1869#endif
1870		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1871		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1872		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1873		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1874		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1875
1876#undef SOFT_OR_HARD
1877#undef ALWAYS_HARD
1878
1879		/*
1880		 * The other error types depend on whether or not the
1881		 * socket is UDP or TCP.  If it is UDP, some errors
1882		 * that we expect to be fatal under TCP are merely
1883		 * annoying, and are really soft errors.
1884		 *
1885		 * However, these soft errors are still returned as
1886		 * a status.
1887		 */
1888		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1889		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1890		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1891				 addrbuf, strbuf);
1892		dev->result = isc__errno2result(send_errno);
1893		inc_stats(sock->manager->stats,
1894			  sock->statsindex[STATID_SENDFAIL]);
1895		return (DOIO_HARD);
1896	}
1897
1898	if (cc == 0) {
1899		inc_stats(sock->manager->stats,
1900			  sock->statsindex[STATID_SENDFAIL]);
1901		UNEXPECTED_ERROR(__FILE__, __LINE__,
1902				 "doio_send: send() %s 0",
1903				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1904						ISC_MSG_RETURNED, "returned"));
1905	}
1906
1907	/*
1908	 * If we write less than we expected, update counters, poke.
1909	 */
1910	dev->n += cc;
1911	if ((size_t)cc != write_count)
1912		return (DOIO_SOFT);
1913
1914	/*
1915	 * Exactly what we wanted to write.  We're done with this
1916	 * entry.  Post its completion event.
1917	 */
1918	dev->result = ISC_R_SUCCESS;
1919	return (DOIO_SUCCESS);
1920}
1921
1922/*
1923 * Kill.
1924 *
1925 * Caller must ensure that the socket is not locked and no external
1926 * references exist.
1927 */
1928static void
1929closesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1930	isc_sockettype_t type = sock->type;
1931	int lockid = FDLOCK_ID(fd);
1932
1933	/*
1934	 * No one has this socket open, so the watcher doesn't have to be
1935	 * poked, and the socket doesn't have to be locked.
1936	 */
1937	LOCK(&manager->fdlock[lockid]);
1938	manager->fds[fd] = NULL;
1939	if (type == isc_sockettype_fdwatch)
1940		manager->fdstate[fd] = CLOSED;
1941	else
1942		manager->fdstate[fd] = CLOSE_PENDING;
1943	UNLOCK(&manager->fdlock[lockid]);
1944	if (type == isc_sockettype_fdwatch) {
1945		/*
1946		 * The caller may close the socket once this function returns,
1947		 * and `fd' may be reassigned for a new socket.  So we do
1948		 * unwatch_fd() here, rather than defer it via select_poke().
1949		 * Note: this may complicate data protection among threads and
1950		 * may reduce performance due to additional locks.  One way to
1951		 * solve this would be to dup() the watched descriptor, but we
1952		 * take a simpler approach at this moment.
1953		 */
1954		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1955		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1956	} else
1957		select_poke(manager, fd, SELECT_POKE_CLOSE);
1958
1959	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1960
1961	/*
1962	 * update manager->maxfd here (XXX: this should be implemented more
1963	 * efficiently)
1964	 */
1965#ifdef USE_SELECT
1966	LOCK(&manager->lock);
1967	if (manager->maxfd == fd) {
1968		int i;
1969
1970		manager->maxfd = 0;
1971		for (i = fd - 1; i >= 0; i--) {
1972			lockid = FDLOCK_ID(i);
1973
1974			LOCK(&manager->fdlock[lockid]);
1975			if (manager->fdstate[i] == MANAGED) {
1976				manager->maxfd = i;
1977				UNLOCK(&manager->fdlock[lockid]);
1978				break;
1979			}
1980			UNLOCK(&manager->fdlock[lockid]);
1981		}
1982#ifdef ISC_PLATFORM_USETHREADS
1983		if (manager->maxfd < manager->pipe_fds[0])
1984			manager->maxfd = manager->pipe_fds[0];
1985#endif
1986	}
1987	UNLOCK(&manager->lock);
1988#endif	/* USE_SELECT */
1989}
1990
/*
 * Final destruction of a socket with no remaining references or pending
 * events: close its descriptor (if any), unlink it from the manager's
 * socket list, and free it.  '*sockp' is set to NULL by free_socket().
 */
static void
destroy(isc__socket_t **sockp) {
	int fd;
	isc__socket_t *sock = *sockp;
	isc__socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DESTROYING, "destroying");

	/* All I/O must have completed or been cancelled by now. */
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		/* Clear sock->fd first so no one else can race on it. */
		fd = sock->fd;
		sock->fd = -1;
		closesocket(manager, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

#ifdef USE_WATCHER_THREAD
	/* A shutting-down manager waits for the list to drain. */
	if (ISC_LIST_EMPTY(manager->socklist))
		SIGNAL(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/* can't unlock manager as its memory context is still used */
	free_socket(sockp);

	UNLOCK(&manager->lock);
}
2026
/*
 * Allocate and initialize a new isc__socket_t of the given 'type' for
 * 'manager' and return it via '*socketp'.  No file descriptor is opened
 * here (sock->fd is left at -1); opensocket() or the fdwatch code
 * supplies one later.  On failure, all partial allocations are released
 * and an error result is returned.
 */
static isc_result_t
allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
		isc__socket_t **socketp)
{
	isc__socket_t *sock;
	isc_result_t result;
	ISC_SOCKADDR_LEN_T cmsgbuflen;

	sock = isc_mem_get(manager->mctx, sizeof(*sock));

	if (sock == NULL)
		return (ISC_R_NOMEMORY);

	/* Magic numbers stay clear until initialization completes. */
	sock->common.magic = 0;
	sock->common.impmagic = 0;
	sock->references = 0;

	sock->manager = manager;
	sock->type = type;
	sock->fd = -1;
	sock->statsindex = NULL;

	ISC_LINK_INIT(sock, link);

	/* NULL now so the error path can tell what was allocated. */
	sock->recvcmsgbuf = NULL;
	sock->sendcmsgbuf = NULL;

	/*
	 * set up cmsg buffers: the receive side needs room for the IPv6
	 * packet info and (optionally) a kernel timestamp.
	 */
	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#endif
#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
	cmsgbuflen += cmsg_space(sizeof(struct timeval));
#endif
	sock->recvcmsgbuflen = cmsgbuflen;
	if (sock->recvcmsgbuflen != 0U) {
		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->recvcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#if defined(IPV6_USE_MIN_MTU)
	/*
	 * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
	 * support.
	 */
	cmsgbuflen += cmsg_space(sizeof(int));
#endif
#endif
	sock->sendcmsgbuflen = cmsgbuflen;
	if (sock->sendcmsgbuflen != 0U) {
		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->sendcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;

	/*
	 * set up list of readers and writers to be initially empty
	 */
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
	ISC_LIST_INIT(sock->accept_list);
	sock->connect_ev = NULL;
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;

	/*
	 * initialize the lock
	 */
	result = isc_mutex_init(&sock->lock);
	if (result != ISC_R_SUCCESS) {
		sock->common.magic = 0;
		sock->common.impmagic = 0;
		goto error;
	}

	/*
	 * Initialize readable and writable events
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
		       NULL, sock, sock, NULL, NULL);
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
		       NULL, sock, sock, NULL, NULL);

	/* Fully initialized; set the magic numbers to mark the object valid. */
	sock->common.magic = ISCAPI_SOCKET_MAGIC;
	sock->common.impmagic = SOCKET_MAGIC;
	*socketp = sock;

	return (ISC_R_SUCCESS);

 error:
	/* Release whatever was allocated before the failure. */
	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);
	isc_mem_put(manager->mctx, sock, sizeof(*sock));

	return (result);
}
2148
2149/*
2150 * This event requires that the various lists be empty, that the reference
2151 * count be 1, and that the magic number is valid.  The other socket bits,
2152 * like the lock, must be initialized as well.  The fd associated must be
2153 * marked as closed, by setting it to -1 on close, or this routine will
2154 * also close the socket.
2155 */
2156static void
2157free_socket(isc__socket_t **socketp) {
2158	isc__socket_t *sock = *socketp;
2159
2160	INSIST(sock->references == 0);
2161	INSIST(VALID_SOCKET(sock));
2162	INSIST(!sock->connecting);
2163	INSIST(!sock->pending_recv);
2164	INSIST(!sock->pending_send);
2165	INSIST(!sock->pending_accept);
2166	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2167	INSIST(ISC_LIST_EMPTY(sock->send_list));
2168	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2169	INSIST(!ISC_LINK_LINKED(sock, link));
2170
2171	if (sock->recvcmsgbuf != NULL)
2172		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
2173			    sock->recvcmsgbuflen);
2174	if (sock->sendcmsgbuf != NULL)
2175		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
2176			    sock->sendcmsgbuflen);
2177
2178	sock->common.magic = 0;
2179	sock->common.impmagic = 0;
2180
2181	DESTROYLOCK(&sock->lock);
2182
2183	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
2184
2185	*socketp = NULL;
2186}
2187
2188#ifdef SO_BSDCOMPAT
2189/*
2190 * This really should not be necessary to do.  Having to workout
2191 * which kernel version we are on at run time so that we don't cause
2192 * the kernel to issue a warning about us using a deprecated socket option.
2193 * Such warnings should *never* be on by default in production kernels.
2194 *
2195 * We can't do this a build time because executables are moved between
2196 * machines and hence kernels.
2197 *
2198 * We can't just not set SO_BSDCOMAT because some kernels require it.
2199 */
2200
2201static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
2202isc_boolean_t bsdcompat = ISC_TRUE;
2203
2204static void
2205clear_bsdcompat(void) {
2206#ifdef __linux__
2207	 struct utsname buf;
2208	 char *endp;
2209	 long int major;
2210	 long int minor;
2211
2212	 uname(&buf);    /* Can only fail if buf is bad in Linux. */
2213
2214	 /* Paranoia in parsing can be increased, but we trust uname(). */
2215	 major = strtol(buf.release, &endp, 10);
2216	 if (*endp == '.') {
2217		minor = strtol(endp+1, &endp, 10);
2218		if ((major > 2) || ((major == 2) && (minor >= 4))) {
2219			bsdcompat = ISC_FALSE;
2220		}
2221	 }
2222#endif /* __linux __ */
2223}
2224#endif
2225
/*
 * Open and configure the OS-level descriptor for 'sock' according to
 * its type.  On success the fd is left in non-blocking mode with the
 * platform-specific socket options applied and ISC_R_SUCCESS is
 * returned; on failure no descriptor is left open and a result code
 * describing the failure is returned.  Must not be called for fdwatch
 * sockets.
 */
static isc_result_t
opensocket(isc__socketmgr_t *manager, isc__socket_t *sock) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";	/* name of the last call attempted */
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif

 again:
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	case isc_sockettype_unix:
		sock->fd = socket(sock->pf, SOCK_STREAM, 0);
		break;
	case isc_sockettype_fdwatch:
		/*
		 * We should not be called for isc_sockettype_fdwatch sockets.
		 */
		INSIST(0);
		break;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
		goto again;

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno;	/* preserve errno across close() */
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;	/* preserve errno across close() */
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl";
	}
#endif

	/* Refuse descriptors beyond the manager's configured limit. */
	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			       isc_msgcat, ISC_MSGSET_SOCKET,
			       ISC_MSG_TOOMANYFDS,
			       "socket: file descriptor exceeds limit (%d/%u)",
			       sock->fd, manager->maxsocks);
		return (ISC_R_NORESOURCES);
	}

	/* Map errno from the failed socket()/fcntl() to an isc_result. */
	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* All descriptors are used in non-blocking mode. */
	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (result);
	}

#ifdef SO_BSDCOMPAT
	/* Only set SO_BSDCOMPAT on kernels that still want it. */
	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
				  clear_bsdcompat) == ISC_R_SUCCESS);
	if (sock->type != isc_sockettype_unix && bsdcompat &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#if defined(USE_CMSG) || defined(SO_RCVBUF)
	if (sock->type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* Ask the kernel to timestamp incoming datagrams. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
			       (void *)&on, sizeof(on)) < 0
		    && errno != ENOPROTOOPT) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#if defined(ISC_PLATFORM_HAVEIPV6)
		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
			/*
			 * Warn explicitly because this anomaly can be hidden
			 * in usual operation (and unexpectedly appear later).
			 */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "No buffer available to receive "
					 "IPv6 destination");
		}
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* RFC 2292 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
#ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
		/* use minimum MTU */
		if (sock->pf == AF_INET6 &&
		    setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
			       (void *)&on, sizeof(on)) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_USE_MIN_MTU) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif
#if defined(IPV6_MTU)
		/*
		 * Use minimum MTU on IPv6 sockets.
		 */
		if (sock->pf == AF_INET6) {
			int mtu = 1280;
			(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
					 &mtu, sizeof(mtu));
		}
#endif
#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv6/UDP sockets.
		 */
		if (sock->pf == AF_INET6) {
			int action = IPV6_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_MTU_DISCOVER, &action,
					 sizeof(action));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int action = IP_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
					 &action, sizeof(action));
		}
#endif
#if defined(IP_DONTFRAG)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int off = 0;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
					 &off, sizeof(off));
		}
#endif

#if defined(SO_RCVBUF)
		/* Grow the receive buffer to at least RCVBUFSIZE. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			       (void *)&size, &optlen) >= 0 &&
		     size < RCVBUFSIZE) {
			size = RCVBUFSIZE;
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&size, sizeof(size)) == -1) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
					sock->fd, size,
					isc_msgcat_get(isc_msgcat,
						       ISC_MSGSET_GENERAL,
						       ISC_MSG_FAILED,
						       "failed"),
					strbuf);
			}
		}
#endif
	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

	/* Record the successful open in the statistics counters. */
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);

	return (ISC_R_SUCCESS);
}
2522
2523/*%
2524 * Create a new 'type' socket managed by 'manager'.  Events
2525 * will be posted to 'task' and when dispatched 'action' will be
2526 * called with 'arg' as the arg value.  The new socket is returned
2527 * in 'socketp'.
2528 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp)
{
	isc__socket_t *sock = NULL;
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	/* fdwatch sockets are created via isc__socket_fdwatchcreate(). */
	REQUIRE(type != isc_sockettype_fdwatch);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/*
	 * Select the statistics counter table for this type/family
	 * combination.  (The tables are declared elsewhere in this file;
	 * "upd" looks like a historical typo for "udp" -- confirm against
	 * the declarations before renaming.)
	 */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex =
			(pf == AF_INET) ? upd4statsindex : upd6statsindex;
		break;
	case isc_sockettype_tcp:
		sock->statsindex =
			(pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	default:
		INSIST(0);
	}

	sock->pf = pf;
	result = opensocket(manager, sock);
	if (result != ISC_R_SUCCESS) {
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		free_socket(&sock);
		return (result);
	}

	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the new descriptor in the manager's fd tables. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "created");

	return (ISC_R_SUCCESS);
}
2602
2603#ifdef BIND9
/*
 * (Re)open the OS descriptor for an existing socket object.  The caller
 * must hold the only reference, and the socket must currently have no
 * descriptor (fd == -1).  On success the new fd is registered with the
 * manager's bookkeeping.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_open(isc_socket_t *sock0) {
	isc_result_t result;
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	UNLOCK(&sock->lock);
	/*
	 * We don't need to retain the lock hereafter, since no one else has
	 * this socket.
	 */
	REQUIRE(sock->fd == -1);

	result = opensocket(sock->manager, sock);
	if (result != ISC_R_SUCCESS)
		sock->fd = -1;

	if (result == ISC_R_SUCCESS) {
		int lockid = FDLOCK_ID(sock->fd);

		/* Register the new descriptor in the manager's fd tables. */
		LOCK(&sock->manager->fdlock[lockid]);
		sock->manager->fds[sock->fd] = sock;
		sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
		INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
		       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
		UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (sock->manager->maxfd < sock->fd)
			sock->manager->maxfd = sock->fd;
		UNLOCK(&sock->manager->lock);
#endif
	}

	return (result);
}
2647#endif	/* BIND9 */
2648
2649/*
2650 * Create a new 'type' socket managed by 'manager'.  Events
2651 * will be posted to 'task' and when dispatched 'action' will be
2652 * called with 'arg' as the arg value.  The new socket is returned
2653 * in 'socketp'.
2654 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	isc__socket_t *sock = NULL;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/* Adopt the caller-supplied descriptor and callback state. */
	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;
	sock->statsindex = fdwatchstatsindex;

	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the descriptor in the manager's fd tables. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Start watching immediately in the requested directions. */
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "fdwatch-created");

	return (ISC_R_SUCCESS);
}
2712
2713/*
2714 * Indicate to the manager that it should watch the socket again.
2715 * This can be used to restart watching if the previous event handler
2716 * didn't indicate there was more data to be processed.  Primarily
2717 * it is for writing but could be used for reading if desired
2718 */
2719
2720ISC_SOCKETFUNC_SCOPE isc_result_t
2721isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags)
2722{
2723	isc__socket_t *sock = (isc__socket_t *)sock0;
2724
2725	REQUIRE(VALID_SOCKET(sock));
2726
2727	/*
2728	 * We check both flags first to allow us to get the lock
2729	 * once but only if we need it.
2730	 */
2731
2732	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
2733		LOCK(&sock->lock);
2734		if (((flags & ISC_SOCKFDWATCH_READ) != 0) &&
2735		    !sock->pending_recv)
2736			select_poke(sock->manager, sock->fd,
2737				    SELECT_POKE_READ);
2738		if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) &&
2739		    !sock->pending_send)
2740			select_poke(sock->manager, sock->fd,
2741				    SELECT_POKE_WRITE);
2742		UNLOCK(&sock->lock);
2743	}
2744
2745	socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2746		   ISC_MSG_POKED, "fdwatch-poked flags: %d", flags);
2747
2748	return (ISC_R_SUCCESS);
2749}
2750
2751/*
2752 * Attach to a socket.  Caller must explicitly detach when it is done.
2753 */
2754ISC_SOCKETFUNC_SCOPE void
2755isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
2756	isc__socket_t *sock = (isc__socket_t *)sock0;
2757
2758	REQUIRE(VALID_SOCKET(sock));
2759	REQUIRE(socketp != NULL && *socketp == NULL);
2760
2761	LOCK(&sock->lock);
2762	sock->references++;
2763	UNLOCK(&sock->lock);
2764
2765	*socketp = (isc_socket_t *)sock;
2766}
2767
2768/*
2769 * Dereference a socket.  If this is the last reference to it, clean things
2770 * up by destroying the socket.
2771 */
2772ISC_SOCKETFUNC_SCOPE void
2773isc__socket_detach(isc_socket_t **socketp) {
2774	isc__socket_t *sock;
2775	isc_boolean_t kill_socket = ISC_FALSE;
2776
2777	REQUIRE(socketp != NULL);
2778	sock = (isc__socket_t *)*socketp;
2779	REQUIRE(VALID_SOCKET(sock));
2780
2781	LOCK(&sock->lock);
2782	REQUIRE(sock->references > 0);
2783	sock->references--;
2784	if (sock->references == 0)
2785		kill_socket = ISC_TRUE;
2786	UNLOCK(&sock->lock);
2787
2788	if (kill_socket)
2789		destroy(&sock);
2790
2791	*socketp = NULL;
2792}
2793
2794#ifdef BIND9
/*
 * Close the descriptor of 'sock0' without destroying the socket object.
 * The caller must hold the only reference and all I/O must be quiescent.
 * Afterwards sock->fd is -1, so the object can presumably be re-opened
 * via isc__socket_open() (which REQUIREs fd == -1) -- confirm with
 * callers.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_close(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	int fd;
	isc__socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

	/* The socket must be idle: no pending or queued I/O of any kind. */
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(sock->connect_ev == NULL);

	/*
	 * Reset the per-connection state while the lock is held; the
	 * actual close of the descriptor happens after the lock is
	 * released.
	 */
	manager = sock->manager;
	fd = sock->fd;
	sock->fd = -1;
	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;
	isc_sockaddr_any(&sock->peer_address);

	UNLOCK(&sock->lock);

	closesocket(manager, sock, fd);

	return (ISC_R_SUCCESS);
}
2835#endif	/* BIND9 */
2836
2837/*
2838 * I/O is possible on a given socket.  Schedule an event to this task that
2839 * will call an internal function to do the I/O.  This will charge the
2840 * task with the I/O operation and let our select loop handler get back
2841 * to doing something real as fast as possible.
2842 *
2843 * The socket and manager must be locked before calling this function.
2844 */
2845static void
2846dispatch_recv(isc__socket_t *sock) {
2847	intev_t *iev;
2848	isc_socketevent_t *ev;
2849	isc_task_t *sender;
2850
2851	INSIST(!sock->pending_recv);
2852
2853	if (sock->type != isc_sockettype_fdwatch) {
2854		ev = ISC_LIST_HEAD(sock->recv_list);
2855		if (ev == NULL)
2856			return;
2857		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2858			   "dispatch_recv:  event %p -> task %p",
2859			   ev, ev->ev_sender);
2860		sender = ev->ev_sender;
2861	} else {
2862		sender = sock->fdwatchtask;
2863	}
2864
2865	sock->pending_recv = 1;
2866	iev = &sock->readable_ev;
2867
2868	sock->references++;
2869	iev->ev_sender = sock;
2870	if (sock->type == isc_sockettype_fdwatch)
2871		iev->ev_action = internal_fdwatch_read;
2872	else
2873		iev->ev_action = internal_recv;
2874	iev->ev_arg = sock;
2875
2876	isc_task_send(sender, (isc_event_t **)&iev);
2877}
2878
2879static void
2880dispatch_send(isc__socket_t *sock) {
2881	intev_t *iev;
2882	isc_socketevent_t *ev;
2883	isc_task_t *sender;
2884
2885	INSIST(!sock->pending_send);
2886
2887	if (sock->type != isc_sockettype_fdwatch) {
2888		ev = ISC_LIST_HEAD(sock->send_list);
2889		if (ev == NULL)
2890			return;
2891		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2892			   "dispatch_send:  event %p -> task %p",
2893			   ev, ev->ev_sender);
2894		sender = ev->ev_sender;
2895	} else {
2896		sender = sock->fdwatchtask;
2897	}
2898
2899	sock->pending_send = 1;
2900	iev = &sock->writable_ev;
2901
2902	sock->references++;
2903	iev->ev_sender = sock;
2904	if (sock->type == isc_sockettype_fdwatch)
2905		iev->ev_action = internal_fdwatch_write;
2906	else
2907		iev->ev_action = internal_send;
2908	iev->ev_arg = sock;
2909
2910	isc_task_send(sender, (isc_event_t **)&iev);
2911}
2912
2913/*
2914 * Dispatch an internal accept event.
2915 */
2916static void
2917dispatch_accept(isc__socket_t *sock) {
2918	intev_t *iev;
2919	isc_socket_newconnev_t *ev;
2920
2921	INSIST(!sock->pending_accept);
2922
2923	/*
2924	 * Are there any done events left, or were they all canceled
2925	 * before the manager got the socket lock?
2926	 */
2927	ev = ISC_LIST_HEAD(sock->accept_list);
2928	if (ev == NULL)
2929		return;
2930
2931	sock->pending_accept = 1;
2932	iev = &sock->readable_ev;
2933
2934	sock->references++;  /* keep socket around for this internal event */
2935	iev->ev_sender = sock;
2936	iev->ev_action = internal_accept;
2937	iev->ev_arg = sock;
2938
2939	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2940}
2941
2942static void
2943dispatch_connect(isc__socket_t *sock) {
2944	intev_t *iev;
2945	isc_socket_connev_t *ev;
2946
2947	iev = &sock->writable_ev;
2948
2949	ev = sock->connect_ev;
2950	INSIST(ev != NULL); /* XXX */
2951
2952	INSIST(sock->connecting);
2953
2954	sock->references++;  /* keep socket around for this internal event */
2955	iev->ev_sender = sock;
2956	iev->ev_action = internal_connect;
2957	iev->ev_arg = sock;
2958
2959	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2960}
2961
2962/*
2963 * Dequeue an item off the given socket's read queue, set the result code
2964 * in the done event to the one provided, and send it to the task it was
2965 * destined for.
2966 *
2967 * If the event to be sent is on a list, remove it before sending.  If
2968 * asked to, send and detach from the socket as well.
2969 *
2970 * Caller must have the socket locked if the event is attached to the socket.
2971 */
2972static void
2973send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
2974	isc_task_t *task;
2975
2976	task = (*dev)->ev_sender;
2977
2978	(*dev)->ev_sender = sock;
2979
2980	if (ISC_LINK_LINKED(*dev, ev_link))
2981		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2982
2983	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2984	    == ISC_SOCKEVENTATTR_ATTACHED)
2985		isc_task_sendanddetach(&task, (isc_event_t **)dev);
2986	else
2987		isc_task_send(task, (isc_event_t **)dev);
2988}
2989
2990/*
2991 * See comments for send_recvdone_event() above.
2992 *
2993 * Caller must have the socket locked if the event is attached to the socket.
2994 */
2995static void
2996send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
2997	isc_task_t *task;
2998
2999	INSIST(dev != NULL && *dev != NULL);
3000
3001	task = (*dev)->ev_sender;
3002	(*dev)->ev_sender = sock;
3003
3004	if (ISC_LINK_LINKED(*dev, ev_link))
3005		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
3006
3007	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3008	    == ISC_SOCKEVENTATTR_ATTACHED)
3009		isc_task_sendanddetach(&task, (isc_event_t **)dev);
3010	else
3011		isc_task_send(task, (isc_event_t **)dev);
3012}
3013
3014/*
3015 * Call accept() on a socket, to get the new file descriptor.  The listen
3016 * socket is used as a prototype to create a new isc_socket_t.  The new
3017 * socket has one outstanding reference.  The task receiving the event
3018 * will be detached from just after the event is delivered.
3019 *
3020 * On entry to this function, the event delivered is the internal
3021 * readable event, and the first item on the accept_list should be
3022 * the done event we want to send.  If the list is empty, this is a no-op,
3023 * so just unlock and return.
3024 */
3025static void
3026internal_accept(isc_task_t *me, isc_event_t *ev) {
3027	isc__socket_t *sock;
3028	isc__socketmgr_t *manager;
3029	isc_socket_newconnev_t *dev;
3030	isc_task_t *task;
3031	ISC_SOCKADDR_LEN_T addrlen;
3032	int fd;
3033	isc_result_t result = ISC_R_SUCCESS;
3034	char strbuf[ISC_STRERRORSIZE];
3035	const char *err = "accept";
3036
3037	UNUSED(me);
3038
3039	sock = ev->ev_sender;
3040	INSIST(VALID_SOCKET(sock));
3041
3042	LOCK(&sock->lock);
3043	socket_log(sock, NULL, TRACE,
3044		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
3045		   "internal_accept called, locked socket");
3046
3047	manager = sock->manager;
3048	INSIST(VALID_MANAGER(manager));
3049
3050	INSIST(sock->listener);
3051	INSIST(sock->pending_accept == 1);
3052	sock->pending_accept = 0;
3053
3054	INSIST(sock->references > 0);
3055	sock->references--;  /* the internal event is done with this socket */
3056	if (sock->references == 0) {
3057		UNLOCK(&sock->lock);
3058		destroy(&sock);
3059		return;
3060	}
3061
3062	/*
3063	 * Get the first item off the accept list.
3064	 * If it is empty, unlock the socket and return.
3065	 */
3066	dev = ISC_LIST_HEAD(sock->accept_list);
3067	if (dev == NULL) {
3068		UNLOCK(&sock->lock);
3069		return;
3070	}
3071
3072	/*
3073	 * Try to accept the new connection.  If the accept fails with
3074	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
3075	 * again.  Also ignore ECONNRESET, which has been reported to
3076	 * be spuriously returned on Linux 2.2.19 although it is not
3077	 * a documented error for accept().  ECONNABORTED has been
3078	 * reported for Solaris 8.  The rest are thrown in not because
3079	 * we have seen them but because they are ignored by other
3080	 * daemons such as BIND 8 and Apache.
3081	 */
3082
3083	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
3084	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
3085	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
3086		    (void *)&addrlen);
3087
3088#ifdef F_DUPFD
3089	/*
3090	 * Leave a space for stdio to work in.
3091	 */
3092	if (fd >= 0 && fd < 20) {
3093		int new, tmp;
3094		new = fcntl(fd, F_DUPFD, 20);
3095		tmp = errno;
3096		(void)close(fd);
3097		errno = tmp;
3098		fd = new;
3099		err = "accept/fcntl";
3100	}
3101#endif
3102
3103	if (fd < 0) {
3104		if (SOFT_ERROR(errno))
3105			goto soft_error;
3106		switch (errno) {
3107		case ENFILE:
3108		case EMFILE:
3109			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3110				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3111				       isc_msgcat, ISC_MSGSET_SOCKET,
3112				       ISC_MSG_TOOMANYFDS,
3113				       "%s: too many open file descriptors",
3114				       err);
3115			goto soft_error;
3116
3117		case ENOBUFS:
3118		case ENOMEM:
3119		case ECONNRESET:
3120		case ECONNABORTED:
3121		case EHOSTUNREACH:
3122		case EHOSTDOWN:
3123		case ENETUNREACH:
3124		case ENETDOWN:
3125		case ECONNREFUSED:
3126#ifdef EPROTO
3127		case EPROTO:
3128#endif
3129#ifdef ENONET
3130		case ENONET:
3131#endif
3132			goto soft_error;
3133		default:
3134			break;
3135		}
3136		isc__strerror(errno, strbuf, sizeof(strbuf));
3137		UNEXPECTED_ERROR(__FILE__, __LINE__,
3138				 "internal_accept: %s() %s: %s", err,
3139				 isc_msgcat_get(isc_msgcat,
3140						ISC_MSGSET_GENERAL,
3141						ISC_MSG_FAILED,
3142						"failed"),
3143				 strbuf);
3144		fd = -1;
3145		result = ISC_R_UNEXPECTED;
3146	} else {
3147		if (addrlen == 0U) {
3148			UNEXPECTED_ERROR(__FILE__, __LINE__,
3149					 "internal_accept(): "
3150					 "accept() failed to return "
3151					 "remote address");
3152
3153			(void)close(fd);
3154			goto soft_error;
3155		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
3156			   sock->pf)
3157		{
3158			UNEXPECTED_ERROR(__FILE__, __LINE__,
3159					 "internal_accept(): "
3160					 "accept() returned peer address "
3161					 "family %u (expected %u)",
3162					 NEWCONNSOCK(dev)->peer_address.
3163					 type.sa.sa_family,
3164					 sock->pf);
3165			(void)close(fd);
3166			goto soft_error;
3167		} else if (fd >= (int)manager->maxsocks) {
3168			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3169				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3170				       isc_msgcat, ISC_MSGSET_SOCKET,
3171				       ISC_MSG_TOOMANYFDS,
3172				       "accept: "
3173				       "file descriptor exceeds limit (%d/%u)",
3174				       fd, manager->maxsocks);
3175			(void)close(fd);
3176			goto soft_error;
3177		}
3178	}
3179
3180	if (fd != -1) {
3181		NEWCONNSOCK(dev)->peer_address.length = addrlen;
3182		NEWCONNSOCK(dev)->pf = sock->pf;
3183	}
3184
3185	/*
3186	 * Pull off the done event.
3187	 */
3188	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
3189
3190	/*
3191	 * Poke watcher if there are more pending accepts.
3192	 */
3193	if (!ISC_LIST_EMPTY(sock->accept_list))
3194		select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3195
3196	UNLOCK(&sock->lock);
3197
3198	if (fd != -1) {
3199		result = make_nonblock(fd);
3200		if (result != ISC_R_SUCCESS) {
3201			(void)close(fd);
3202			fd = -1;
3203		}
3204	}
3205
3206	/*
3207	 * -1 means the new socket didn't happen.
3208	 */
3209	if (fd != -1) {
3210		int lockid = FDLOCK_ID(fd);
3211
3212		LOCK(&manager->fdlock[lockid]);
3213		manager->fds[fd] = NEWCONNSOCK(dev);
3214		manager->fdstate[fd] = MANAGED;
3215		UNLOCK(&manager->fdlock[lockid]);
3216
3217		LOCK(&manager->lock);
3218		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
3219
3220		NEWCONNSOCK(dev)->fd = fd;
3221		NEWCONNSOCK(dev)->bound = 1;
3222		NEWCONNSOCK(dev)->connected = 1;
3223
3224		/*
3225		 * Save away the remote address
3226		 */
3227		dev->address = NEWCONNSOCK(dev)->peer_address;
3228
3229#ifdef USE_SELECT
3230		if (manager->maxfd < fd)
3231			manager->maxfd = fd;
3232#endif
3233
3234		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
3235			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
3236			   "accepted connection, new socket %p",
3237			   dev->newsocket);
3238
3239		UNLOCK(&manager->lock);
3240
3241		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3242	} else {
3243		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3244		NEWCONNSOCK(dev)->references--;
3245		free_socket((isc__socket_t **)&dev->newsocket);
3246	}
3247
3248	/*
3249	 * Fill in the done event details and send it off.
3250	 */
3251	dev->result = result;
3252	task = dev->ev_sender;
3253	dev->ev_sender = sock;
3254
3255	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3256	return;
3257
3258 soft_error:
3259	select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3260	UNLOCK(&sock->lock);
3261
3262	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3263	return;
3264}
3265
/*
 * Handle an internal "readable" event on a non-listening socket:
 * clear the pending-recv flag, drop the reference the event held,
 * and perform as many queued receives as the socket will allow.
 */
static void
internal_recv(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc__socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	sock = ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
		   "internal_recv: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);
	sock->pending_recv = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->recv_list);
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			/* Would block; re-arm the watcher below. */
			goto poke;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto poke;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

 poke:
	/*
	 * If receives are still queued, ask the watcher to keep
	 * monitoring this descriptor for readability.
	 */
	if (!ISC_LIST_EMPTY(sock->recv_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);

	UNLOCK(&sock->lock);
}
3330
/*
 * Handle an internal "writable" event: clear the pending-send flag,
 * drop the reference the event held, and flush as many queued sends
 * as the socket will accept.
 */
static void
internal_send(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc__socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc__socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
		   "internal_send: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);
	sock->pending_send = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->send_list);
	while (dev != NULL) {
		switch (doio_send(sock, dev)) {
		case DOIO_SOFT:
			/* Would block; re-arm the watcher below. */
			goto poke;

		case DOIO_HARD:
		case DOIO_SUCCESS:
			send_senddone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->send_list);
	}

 poke:
	/*
	 * If sends remain queued, keep watching the descriptor for
	 * writability.
	 */
	if (!ISC_LIST_EMPTY(sock->send_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}
3385
/*
 * Handle an internal "writable" event on an fdwatch socket: run the
 * user callback with the socket lock released, then drop the
 * reference the event held and optionally re-arm write watching.
 */
static void
internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc__socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
		   "internal_fdwatch_write: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);

	/*
	 * The callback runs unlocked; note that pending_send remains
	 * set until it returns and the lock is re-taken.
	 */
	UNLOCK(&sock->lock);
	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
	LOCK(&sock->lock);

	sock->pending_send = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/* A non-zero callback return requests further write events. */
	if (more_data)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}
3426
/*
 * Handle an internal "readable" event on an fdwatch socket: run the
 * user callback with the socket lock released, then drop the
 * reference the event held and optionally re-arm read watching.
 */
static void
internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc__socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
		   "internal_fdwatch_read: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);

	/*
	 * The callback runs unlocked; note that pending_recv remains
	 * set until it returns and the lock is re-taken.
	 */
	UNLOCK(&sock->lock);
	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
				      sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
	LOCK(&sock->lock);

	sock->pending_recv = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/* A non-zero callback return requests further read events. */
	if (more_data)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);

	UNLOCK(&sock->lock);
}
3467
3468/*
3469 * Process read/writes on each fd here.  Avoid locking
3470 * and unlocking twice if both reads and writes are possible.
3471 */
static void
process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
	   isc_boolean_t writeable)
{
	isc__socket_t *sock;
	isc_boolean_t unlock_sock;
	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = manager->fds[fd];
	unlock_sock = ISC_FALSE;
	if (readable) {
		/* No socket attached to this fd: just stop watching reads. */
		if (sock == NULL) {
			unwatch_read = ISC_TRUE;
			goto check_write;
		}
		unlock_sock = ISC_TRUE;
		LOCK(&sock->lock);
		if (!SOCK_DEAD(sock)) {
			/*
			 * A readable listener means an incoming connection;
			 * anything else is data to receive.
			 */
			if (sock->listener)
				dispatch_accept(sock);
			else
				dispatch_recv(sock);
		}
		unwatch_read = ISC_TRUE;
	}
check_write:
	if (writeable) {
		if (sock == NULL) {
			unwatch_write = ISC_TRUE;
			goto unlock_fd;
		}
		/* Reuse the socket lock taken for the read side, if any. */
		if (!unlock_sock) {
			unlock_sock = ISC_TRUE;
			LOCK(&sock->lock);
		}
		if (!SOCK_DEAD(sock)) {
			/*
			 * Writability on a connecting socket signals
			 * connect() completion; otherwise it's room to send.
			 */
			if (sock->connecting)
				dispatch_connect(sock);
			else
				dispatch_send(sock);
		}
		unwatch_write = ISC_TRUE;
	}
	if (unlock_sock)
		UNLOCK(&sock->lock);

 unlock_fd:
	/*
	 * The watcher is told to stop monitoring only after the fd lock
	 * has been released.
	 */
	UNLOCK(&manager->fdlock[lockid]);
	if (unwatch_read)
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
	if (unwatch_write)
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);

}
3539
3540#ifdef USE_KQUEUE
/*
 * Dispatch a batch of kevent() results; returns ISC_TRUE if a shutdown
 * message was read from the internal control pipe, ISC_FALSE otherwise.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
	int i;
	isc_boolean_t readable, writable;
	isc_boolean_t done = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		/* Defer control-pipe processing until after the sockets. */
		if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		/*
		 * kqueue delivers read and write readiness as separate
		 * entries, one filter per kevent.
		 */
		readable = ISC_TF(events[i].filter == EVFILT_READ);
		writable = ISC_TF(events[i].filter == EVFILT_WRITE);
		process_fd(manager, events[i].ident, readable, writable);
	}

#ifdef USE_WATCHER_THREAD
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
3582#elif defined(USE_EPOLL)
/*
 * Dispatch a batch of epoll_wait() results; returns ISC_TRUE if a
 * shutdown message was read from the internal control pipe.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents)
{
	int i;
	isc_boolean_t done = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	/* Filling the whole buffer may mean ISC_SOCKET_MAXEVENTS is low. */
	if (nevents == manager->nevents) {
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		/* Defer control-pipe processing until after the sockets. */
		if (events[i].data.fd == manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0) {
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			events[i].events |= (EPOLLIN | EPOLLOUT);
		}
		process_fd(manager, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
3630#elif defined(USE_DEVPOLL)
3631static isc_boolean_t
3632process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
3633	int i;
3634	isc_boolean_t done = ISC_FALSE;
3635#ifdef USE_WATCHER_THREAD
3636	isc_boolean_t have_ctlevent = ISC_FALSE;
3637#endif
3638
3639	if (nevents == manager->nevents) {
3640		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3641			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3642			    "maximum number of FD events (%d) received",
3643			    nevents);
3644	}
3645
3646	for (i = 0; i < nevents; i++) {
3647		REQUIRE(events[i].fd < (int)manager->maxsocks);
3648#ifdef USE_WATCHER_THREAD
3649		if (events[i].fd == manager->pipe_fds[0]) {
3650			have_ctlevent = ISC_TRUE;
3651			continue;
3652		}
3653#endif
3654		process_fd(manager, events[i].fd,
3655			   (events[i].events & POLLIN) != 0,
3656			   (events[i].events & POLLOUT) != 0);
3657	}
3658
3659#ifdef USE_WATCHER_THREAD
3660	if (have_ctlevent)
3661		done = process_ctlfd(manager);
3662#endif
3663
3664	return (done);
3665}
3666#elif defined(USE_SELECT)
3667static void
3668process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
3669	    fd_set *writefds)
3670{
3671	int i;
3672
3673	REQUIRE(maxfd <= (int)manager->maxsocks);
3674
3675	for (i = 0; i < maxfd; i++) {
3676#ifdef USE_WATCHER_THREAD
3677		if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3678			continue;
3679#endif /* USE_WATCHER_THREAD */
3680		process_fd(manager, i, FD_ISSET(i, readfds),
3681			   FD_ISSET(i, writefds));
3682	}
3683}
3684#endif
3685
3686#ifdef USE_WATCHER_THREAD
3687static isc_boolean_t
3688process_ctlfd(isc__socketmgr_t *manager) {
3689	int msg, fd;
3690
3691	for (;;) {
3692		select_readmsg(manager, &fd, &msg);
3693
3694		manager_log(manager, IOEVENT,
3695			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3696					   ISC_MSG_WATCHERMSG,
3697					   "watcher got message %d "
3698					   "for socket %d"), msg, fd);
3699
3700		/*
3701		 * Nothing to read?
3702		 */
3703		if (msg == SELECT_POKE_NOTHING)
3704			break;
3705
3706		/*
3707		 * Handle shutdown message.  We really should
3708		 * jump out of this loop right away, but
3709		 * it doesn't matter if we have to do a little
3710		 * more work first.
3711		 */
3712		if (msg == SELECT_POKE_SHUTDOWN)
3713			return (ISC_TRUE);
3714
3715		/*
3716		 * This is a wakeup on a socket.  Look
3717		 * at the event queue for both read and write,
3718		 * and decide if we need to watch on it now
3719		 * or not.
3720		 */
3721		wakeup_socket(manager, fd, msg);
3722	}
3723
3724	return (ISC_FALSE);
3725}
3726
3727/*
3728 * This is the thread that will loop forever, always in a select or poll
3729 * call.
3730 *
3731 * When select returns something to do, track down what thread gets to do
3732 * this I/O and post the event to it.
3733 */
static isc_threadresult_t
watcher(void *uap) {
	isc__socketmgr_t *manager = uap;
	isc_boolean_t done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined (USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
#elif defined (USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif
	char strbuf[ISC_STRERRORSIZE];
#ifdef ISC_SOCKET_USE_POLLWATCH
	pollstate_t pollstate = poll_idle;
#endif

#if defined (USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = manager->pipe_fds[0];
#endif
	done = ISC_FALSE;
	while (!done) {
		/* Wait (retrying on soft errors) until events arrive. */
		do {
#ifdef USE_KQUEUE
			cc = kevent(manager->kqueue_fd, NULL, 0,
				    manager->events, manager->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(manager->epoll_fd, manager->events,
					manager->nevents, -1);
#elif defined(USE_DEVPOLL)
			dvp.dp_fds = manager->events;
			dvp.dp_nfds = manager->nevents;
#ifndef ISC_SOCKET_USE_POLLWATCH
			dvp.dp_timeout = -1;
#else
			if (pollstate == poll_idle)
				dvp.dp_timeout = -1;
			else
				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
#endif	/* ISC_SOCKET_USE_POLLWATCH */
			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
#elif defined(USE_SELECT)
			/*
			 * Work on copies of the master sets so that
			 * select(), which modifies its arguments in
			 * place, does not clobber them.
			 */
			LOCK(&manager->lock);
			memcpy(manager->read_fds_copy, manager->read_fds,
			       manager->fd_bufsize);
			memcpy(manager->write_fds_copy, manager->write_fds,
			       manager->fd_bufsize);
			maxfd = manager->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, manager->read_fds_copy,
				    manager->write_fds_copy, NULL, NULL);
#endif	/* USE_KQUEUE */

			if (cc < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "%s %s: %s", fnname,
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"), strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active)
					pollstate = poll_checking;
				else if (pollstate == poll_checking)
					pollstate = poll_idle;
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					manager_log(manager,
						    ISC_LOGCATEGORY_GENERAL,
						    ISC_LOGMODULE_SOCKET,
						    ISC_LOG_DEBUG(1),
						    "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
		done = process_fds(manager, manager->events, cc);
#elif defined(USE_SELECT)
		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy))
			done = process_ctlfd(manager);
#endif
	}

	manager_log(manager, TRACE, "%s",
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	return ((isc_threadresult_t)0);
}
3853#endif /* USE_WATCHER_THREAD */
3854
3855#ifdef BIND9
3856ISC_SOCKETFUNC_SCOPE void
3857isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
3858	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3859
3860	REQUIRE(VALID_MANAGER(manager));
3861
3862	manager->reserved = reserved;
3863}
3864
3865ISC_SOCKETFUNC_SCOPE void
3866isc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
3867	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3868
3869	REQUIRE(VALID_MANAGER(manager));
3870
3871	manager->maxudp = maxudp;
3872}
3873#endif	/* BIND9 */
3874
3875/*
3876 * Create a new socket manager.
3877 */
3878
/*
 * Allocate and initialize the event-notification back-end (kqueue,
 * epoll, /dev/poll, or select) for a new socket manager; on builds
 * with a watcher thread, also register the internal control pipe.
 * On any failure, everything acquired so far is released before
 * returning the error.
 */
static isc_result_t
setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
	isc_result_t result;
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	char strbuf[ISC_STRERRORSIZE];
#endif

#ifdef USE_KQUEUE
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->kqueue_fd = kqueue();
	if (manager->kqueue_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "kqueue %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}

#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->kqueue_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_EPOLL)
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->epoll_fd = epoll_create(manager->nevents);
	if (manager->epoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_create %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->epoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_DEVPOLL)
	/*
	 * XXXJT: /dev/poll seems to reject large numbers of events,
	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
	 */
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	/*
	 * Note: fdpollinfo should be able to support all possible FDs, so
	 * it must have maxsocks entries (not nevents).
	 */
	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
					  manager->maxsocks);
	if (manager->fdpollinfo == NULL) {
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		return (ISC_R_NOMEMORY);
	}
	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
	manager->devpoll_fd = open("/dev/poll", O_RDWR);
	if (manager->devpoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "open(/dev/poll) %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->devpoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_SELECT)
	UNUSED(result);

#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
	/*
	 * Note: this code should also cover the case of MAXSOCKETS <=
	 * FD_SETSIZE, but we separate the cases to avoid possible portability
	 * issues regarding howmany() and the actual representation of fd_set.
	 */
	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
		sizeof(fd_mask);
#else
	manager->fd_bufsize = sizeof(fd_set);
#endif

	manager->read_fds = NULL;
	manager->read_fds_copy = NULL;
	manager->write_fds = NULL;
	manager->write_fds_copy = NULL;

	/*
	 * Allocate the four fd_set buffers; each step runs only if the
	 * previous one succeeded, so a NULL write_fds_copy at the end
	 * implies some allocation failed.
	 */
	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds != NULL)
		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->write_fds != NULL) {
		manager->write_fds_copy = isc_mem_get(mctx,
						      manager->fd_bufsize);
	}
	if (manager->write_fds_copy == NULL) {
		if (manager->write_fds != NULL) {
			isc_mem_put(mctx, manager->write_fds,
				    manager->fd_bufsize);
		}
		if (manager->read_fds_copy != NULL) {
			isc_mem_put(mctx, manager->read_fds_copy,
				    manager->fd_bufsize);
		}
		if (manager->read_fds != NULL) {
			isc_mem_put(mctx, manager->read_fds,
				    manager->fd_bufsize);
		}
		return (ISC_R_NOMEMORY);
	}
	memset(manager->read_fds, 0, manager->fd_bufsize);
	memset(manager->write_fds, 0, manager->fd_bufsize);

#ifdef USE_WATCHER_THREAD
	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	manager->maxfd = manager->pipe_fds[0];
#else /* USE_WATCHER_THREAD */
	manager->maxfd = 0;
#endif /* USE_WATCHER_THREAD */
#endif	/* USE_KQUEUE */

	return (ISC_R_SUCCESS);
}
4048
4049static void
4050cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
4051#ifdef USE_WATCHER_THREAD
4052	isc_result_t result;
4053
4054	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4055	if (result != ISC_R_SUCCESS) {
4056		UNEXPECTED_ERROR(__FILE__, __LINE__,
4057				 "epoll_ctl(DEL) %s",
4058				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4059						ISC_MSG_FAILED, "failed"));
4060	}
4061#endif	/* USE_WATCHER_THREAD */
4062
4063#ifdef USE_KQUEUE
4064	close(manager->kqueue_fd);
4065	isc_mem_put(mctx, manager->events,
4066		    sizeof(struct kevent) * manager->nevents);
4067#elif defined(USE_EPOLL)
4068	close(manager->epoll_fd);
4069	isc_mem_put(mctx, manager->events,
4070		    sizeof(struct epoll_event) * manager->nevents);
4071#elif defined(USE_DEVPOLL)
4072	close(manager->devpoll_fd);
4073	isc_mem_put(mctx, manager->events,
4074		    sizeof(struct pollfd) * manager->nevents);
4075	isc_mem_put(mctx, manager->fdpollinfo,
4076		    sizeof(pollinfo_t) * manager->maxsocks);
4077#elif defined(USE_SELECT)
4078	if (manager->read_fds != NULL)
4079		isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
4080	if (manager->read_fds_copy != NULL)
4081		isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
4082	if (manager->write_fds != NULL)
4083		isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
4084	if (manager->write_fds_copy != NULL)
4085		isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
4086#endif	/* USE_KQUEUE */
4087}
4088
/*
 * Create a socket manager with the default socket limit; a thin
 * wrapper around isc__socketmgr_create2() (maxsocks == 0 selects
 * the default).
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	return (isc__socketmgr_create2(mctx, managerp, 0));
}
4093
4094ISC_SOCKETFUNC_SCOPE isc_result_t
4095isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
4096		       unsigned int maxsocks)
4097{
4098	int i;
4099	isc__socketmgr_t *manager;
4100#ifdef USE_WATCHER_THREAD
4101	char strbuf[ISC_STRERRORSIZE];
4102#endif
4103	isc_result_t result;
4104
4105	REQUIRE(managerp != NULL && *managerp == NULL);
4106
4107#ifdef USE_SHARED_MANAGER
4108	if (socketmgr != NULL) {
4109		/* Don't allow maxsocks to be updated */
4110		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
4111			return (ISC_R_EXISTS);
4112
4113		socketmgr->refs++;
4114		*managerp = (isc_socketmgr_t *)socketmgr;
4115		return (ISC_R_SUCCESS);
4116	}
4117#endif /* USE_SHARED_MANAGER */
4118
4119	if (maxsocks == 0)
4120		maxsocks = ISC_SOCKET_MAXSOCKETS;
4121
4122	manager = isc_mem_get(mctx, sizeof(*manager));
4123	if (manager == NULL)
4124		return (ISC_R_NOMEMORY);
4125
4126	/* zero-clear so that necessary cleanup on failure will be easy */
4127	memset(manager, 0, sizeof(*manager));
4128	manager->maxsocks = maxsocks;
4129	manager->reserved = 0;
4130	manager->maxudp = 0;
4131	manager->fds = isc_mem_get(mctx,
4132				   manager->maxsocks * sizeof(isc__socket_t *));
4133	if (manager->fds == NULL) {
4134		result = ISC_R_NOMEMORY;
4135		goto free_manager;
4136	}
4137	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
4138	if (manager->fdstate == NULL) {
4139		result = ISC_R_NOMEMORY;
4140		goto free_manager;
4141	}
4142	manager->stats = NULL;
4143
4144	manager->common.methods = &socketmgrmethods;
4145	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
4146	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
4147	manager->mctx = NULL;
4148	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
4149	ISC_LIST_INIT(manager->socklist);
4150	result = isc_mutex_init(&manager->lock);
4151	if (result != ISC_R_SUCCESS)
4152		goto free_manager;
4153	manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
4154	if (manager->fdlock == NULL) {
4155		result = ISC_R_NOMEMORY;
4156		goto cleanup_lock;
4157	}
4158	for (i = 0; i < FDLOCK_COUNT; i++) {
4159		result = isc_mutex_init(&manager->fdlock[i]);
4160		if (result != ISC_R_SUCCESS) {
4161			while (--i >= 0)
4162				DESTROYLOCK(&manager->fdlock[i]);
4163			isc_mem_put(mctx, manager->fdlock,
4164				    FDLOCK_COUNT * sizeof(isc_mutex_t));
4165			manager->fdlock = NULL;
4166			goto cleanup_lock;
4167		}
4168	}
4169
4170#ifdef USE_WATCHER_THREAD
4171	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
4172		UNEXPECTED_ERROR(__FILE__, __LINE__,
4173				 "isc_condition_init() %s",
4174				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4175						ISC_MSG_FAILED, "failed"));
4176		result = ISC_R_UNEXPECTED;
4177		goto cleanup_lock;
4178	}
4179
4180	/*
4181	 * Create the special fds that will be used to wake up the
4182	 * select/poll loop when something internal needs to be done.
4183	 */
4184	if (pipe(manager->pipe_fds) != 0) {
4185		isc__strerror(errno, strbuf, sizeof(strbuf));
4186		UNEXPECTED_ERROR(__FILE__, __LINE__,
4187				 "pipe() %s: %s",
4188				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4189						ISC_MSG_FAILED, "failed"),
4190				 strbuf);
4191		result = ISC_R_UNEXPECTED;
4192		goto cleanup_condition;
4193	}
4194
4195	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
4196#if 0
4197	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
4198#endif
4199#endif	/* USE_WATCHER_THREAD */
4200
4201#ifdef USE_SHARED_MANAGER
4202	manager->refs = 1;
4203#endif /* USE_SHARED_MANAGER */
4204
4205	/*
4206	 * Set up initial state for the select loop
4207	 */
4208	result = setup_watcher(mctx, manager);
4209	if (result != ISC_R_SUCCESS)
4210		goto cleanup;
4211	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
4212#ifdef USE_WATCHER_THREAD
4213	/*
4214	 * Start up the select/poll thread.
4215	 */
4216	if (isc_thread_create(watcher, manager, &manager->watcher) !=
4217	    ISC_R_SUCCESS) {
4218		UNEXPECTED_ERROR(__FILE__, __LINE__,
4219				 "isc_thread_create() %s",
4220				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4221						ISC_MSG_FAILED, "failed"));
4222		cleanup_watcher(mctx, manager);
4223		result = ISC_R_UNEXPECTED;
4224		goto cleanup;
4225	}
4226#endif /* USE_WATCHER_THREAD */
4227	isc_mem_attach(mctx, &manager->mctx);
4228
4229#ifdef USE_SHARED_MANAGER
4230	socketmgr = manager;
4231#endif /* USE_SHARED_MANAGER */
4232	*managerp = (isc_socketmgr_t *)manager;
4233
4234	return (ISC_R_SUCCESS);
4235
4236cleanup:
4237#ifdef USE_WATCHER_THREAD
4238	(void)close(manager->pipe_fds[0]);
4239	(void)close(manager->pipe_fds[1]);
4240#endif	/* USE_WATCHER_THREAD */
4241
4242#ifdef USE_WATCHER_THREAD
4243cleanup_condition:
4244	(void)isc_condition_destroy(&manager->shutdown_ok);
4245#endif	/* USE_WATCHER_THREAD */
4246
4247
4248cleanup_lock:
4249	if (manager->fdlock != NULL) {
4250		for (i = 0; i < FDLOCK_COUNT; i++)
4251			DESTROYLOCK(&manager->fdlock[i]);
4252	}
4253	DESTROYLOCK(&manager->lock);
4254
4255free_manager:
4256	if (manager->fdlock != NULL) {
4257		isc_mem_put(mctx, manager->fdlock,
4258			    FDLOCK_COUNT * sizeof(isc_mutex_t));
4259	}
4260	if (manager->fdstate != NULL) {
4261		isc_mem_put(mctx, manager->fdstate,
4262			    manager->maxsocks * sizeof(int));
4263	}
4264	if (manager->fds != NULL) {
4265		isc_mem_put(mctx, manager->fds,
4266			    manager->maxsocks * sizeof(isc_socket_t *));
4267	}
4268	isc_mem_put(mctx, manager, sizeof(*manager));
4269
4270	return (result);
4271}
4272
4273#ifdef BIND9
4274isc_result_t
4275isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
4276	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4277	REQUIRE(VALID_MANAGER(manager));
4278	REQUIRE(nsockp != NULL);
4279
4280	*nsockp = manager->maxsocks;
4281
4282	return (ISC_R_SUCCESS);
4283}
4284
4285void
4286isc__socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
4287	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4288
4289	REQUIRE(VALID_MANAGER(manager));
4290	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
4291	REQUIRE(manager->stats == NULL);
4292	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
4293
4294	isc_stats_attach(stats, &manager->stats);
4295}
4296#endif
4297
/*
 * Destroy a socket manager: wait until every socket created from it has
 * gone away, stop the watcher thread (when one exists), close any file
 * descriptors whose close was deferred, and release all manager state.
 *
 * Blocks until manager->socklist is empty, so callers must already have
 * arranged for all sockets to be detached/destroyed.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc__socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = (isc__socketmgr_t *)*managerp;
	REQUIRE(VALID_MANAGER(manager));

#ifdef USE_SHARED_MANAGER
	/* With a shared manager, only the final reference tears down. */
	manager->refs--;
	if (manager->refs > 0) {
		*managerp = NULL;
		return;
	}
	socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
#ifdef USE_WATCHER_THREAD
		/* The watcher signals shutdown_ok as sockets go away. */
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
#else /* USE_WATCHER_THREAD */
		/* No watcher thread: drive the task manager ourselves so
		 * pending socket-destruction events can run. */
		UNLOCK(&manager->lock);
		isc__taskmgr_dispatch(NULL);
		LOCK(&manager->lock);
#endif /* USE_WATCHER_THREAD */
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);

#ifdef USE_WATCHER_THREAD
	/*
	 * Wait for thread to exit.
	 */
	if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
#endif /* USE_WATCHER_THREAD */

	/*
	 * Clean up.
	 */
	cleanup_watcher(manager->mctx, manager);

#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/* Close descriptors whose close was deferred to the watcher;
	 * the watcher is gone now, so no locking is required. */
	for (i = 0; i < (int)manager->maxsocks; i++)
		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
			(void)close(i);

	isc_mem_put(manager->mctx, manager->fds,
		    manager->maxsocks * sizeof(isc__socket_t *));
	isc_mem_put(manager->mctx, manager->fdstate,
		    manager->maxsocks * sizeof(int));

	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);

	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
		isc_mem_put(manager->mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	DESTROYLOCK(&manager->lock);
	manager->common.magic = 0;
	manager->common.impmagic = 0;
	/* Keep a local mctx handle so the manager structure itself can be
	 * freed before the memory-context reference is dropped. */
	mctx= manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;

#ifdef USE_SHARED_MANAGER
	socketmgr = NULL;
#endif
}
4403
/*
 * Issue or queue a receive request on 'sock', delivering the completion
 * event 'dev' to 'task'.
 *
 * UDP reads are attempted immediately without taking the socket lock;
 * stream reads take the lock and are only attempted when no earlier
 * reads are still queued, preserving ordering.  A request that cannot
 * complete now (DOIO_SOFT) is queued and the watcher is poked to start
 * watching the descriptor.
 *
 * Returns ISC_R_INPROGRESS instead of ISC_R_SUCCESS when the caller
 * passed ISC_SOCKFLAG_IMMEDIATE but the I/O had to be queued.
 */
static isc_result_t
socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		/* Only try the read now if nothing is queued ahead of
		 * us; otherwise this request must wait its turn. */
		if (ISC_LIST_EMPTY(sock->recv_list))
			io_state = doio_recv(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = ISC_TRUE;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "socket_recv: event %p -> task %p",
			   dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
			result = ISC_R_INPROGRESS;
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Completed (or failed hard) right away: deliver the done
		 * event now unless the caller handles it itself. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_recvdone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}
4475
4476ISC_SOCKETFUNC_SCOPE isc_result_t
4477isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4478		  unsigned int minimum, isc_task_t *task,
4479		  isc_taskaction_t action, const void *arg)
4480{
4481	isc__socket_t *sock = (isc__socket_t *)sock0;
4482	isc_socketevent_t *dev;
4483	isc__socketmgr_t *manager;
4484	unsigned int iocount;
4485	isc_buffer_t *buffer;
4486
4487	REQUIRE(VALID_SOCKET(sock));
4488	REQUIRE(buflist != NULL);
4489	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4490	REQUIRE(task != NULL);
4491	REQUIRE(action != NULL);
4492
4493	manager = sock->manager;
4494	REQUIRE(VALID_MANAGER(manager));
4495
4496	iocount = isc_bufferlist_availablecount(buflist);
4497	REQUIRE(iocount > 0);
4498
4499	INSIST(sock->bound);
4500
4501	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4502	if (dev == NULL)
4503		return (ISC_R_NOMEMORY);
4504
4505	/*
4506	 * UDP sockets are always partial read
4507	 */
4508	if (sock->type == isc_sockettype_udp)
4509		dev->minimum = 1;
4510	else {
4511		if (minimum == 0)
4512			dev->minimum = iocount;
4513		else
4514			dev->minimum = minimum;
4515	}
4516
4517	/*
4518	 * Move each buffer from the passed in list to our internal one.
4519	 */
4520	buffer = ISC_LIST_HEAD(*buflist);
4521	while (buffer != NULL) {
4522		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4523		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4524		buffer = ISC_LIST_HEAD(*buflist);
4525	}
4526
4527	return (socket_recv(sock, dev, task, 0));
4528}
4529
4530ISC_SOCKETFUNC_SCOPE isc_result_t
4531isc__socket_recv(isc_socket_t *sock0, isc_region_t *region,
4532		 unsigned int minimum, isc_task_t *task,
4533		 isc_taskaction_t action, const void *arg)
4534{
4535	isc__socket_t *sock = (isc__socket_t *)sock0;
4536	isc_socketevent_t *dev;
4537	isc__socketmgr_t *manager;
4538
4539	REQUIRE(VALID_SOCKET(sock));
4540	REQUIRE(action != NULL);
4541
4542	manager = sock->manager;
4543	REQUIRE(VALID_MANAGER(manager));
4544
4545	INSIST(sock->bound);
4546
4547	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4548	if (dev == NULL)
4549		return (ISC_R_NOMEMORY);
4550
4551	return (isc__socket_recv2(sock0, region, minimum, task, dev, 0));
4552}
4553
4554ISC_SOCKETFUNC_SCOPE isc_result_t
4555isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region,
4556		  unsigned int minimum, isc_task_t *task,
4557		  isc_socketevent_t *event, unsigned int flags)
4558{
4559	isc__socket_t *sock = (isc__socket_t *)sock0;
4560
4561	event->ev_sender = sock;
4562	event->result = ISC_R_UNSET;
4563	ISC_LIST_INIT(event->bufferlist);
4564	event->region = *region;
4565	event->n = 0;
4566	event->offset = 0;
4567	event->attributes = 0;
4568
4569	/*
4570	 * UDP sockets are always partial read.
4571	 */
4572	if (sock->type == isc_sockettype_udp)
4573		event->minimum = 1;
4574	else {
4575		if (minimum == 0)
4576			event->minimum = region->length;
4577		else
4578			event->minimum = minimum;
4579	}
4580
4581	return (socket_recv(sock, event, task, flags));
4582}
4583
/*
 * Issue or queue a send request on 'sock', delivering the completion
 * event 'dev' to 'task'.  'address' overrides the destination (UDP) and
 * 'pktinfo', when non-NULL, selects the IPv6 source/interface.
 *
 * As with socket_recv(): UDP sends are tried immediately without the
 * lock; stream sends take the lock and only run when nothing is queued
 * ahead.  A soft failure is queued unless ISC_SOCKFLAG_NORETRY is set,
 * in which case it is reported like a completed send.
 */
static isc_result_t
socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address)) {
			socket_log(sock, NULL, TRACE, isc_msgcat,
				   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)", pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp)
		io_state = doio_send(sock, dev);
	else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		/* Only try the write now if nothing is queued ahead of
		 * us; otherwise this request must wait its turn. */
		if (ISC_LIST_EMPTY(sock->send_list))
			io_state = doio_send(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = ISC_TRUE;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			if (ISC_LIST_EMPTY(sock->send_list) &&
			    !sock->pending_send)
				select_poke(sock->manager, sock->fd,
					    SELECT_POKE_WRITE);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

			socket_log(sock, NULL, EVENT, NULL, 0, 0,
				   "socket_send: event %p -> task %p",
				   dev, ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
				result = ISC_R_INPROGRESS;
			break;
		}

		/* FALLTHROUGH: with NORETRY set, a soft failure is
		 * delivered to the caller just like a completed send. */
	case DOIO_HARD:
	case DOIO_SUCCESS:
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_senddone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}
4675
4676ISC_SOCKETFUNC_SCOPE isc_result_t
4677isc__socket_send(isc_socket_t *sock, isc_region_t *region,
4678		 isc_task_t *task, isc_taskaction_t action, const void *arg)
4679{
4680	/*
4681	 * REQUIRE() checking is performed in isc_socket_sendto().
4682	 */
4683	return (isc__socket_sendto(sock, region, task, action, arg, NULL,
4684				   NULL));
4685}
4686
4687ISC_SOCKETFUNC_SCOPE isc_result_t
4688isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region,
4689		   isc_task_t *task, isc_taskaction_t action, const void *arg,
4690		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4691{
4692	isc__socket_t *sock = (isc__socket_t *)sock0;
4693	isc_socketevent_t *dev;
4694	isc__socketmgr_t *manager;
4695
4696	REQUIRE(VALID_SOCKET(sock));
4697	REQUIRE(region != NULL);
4698	REQUIRE(task != NULL);
4699	REQUIRE(action != NULL);
4700
4701	manager = sock->manager;
4702	REQUIRE(VALID_MANAGER(manager));
4703
4704	INSIST(sock->bound);
4705
4706	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4707	if (dev == NULL)
4708		return (ISC_R_NOMEMORY);
4709
4710	dev->region = *region;
4711
4712	return (socket_send(sock, dev, task, address, pktinfo, 0));
4713}
4714
4715ISC_SOCKETFUNC_SCOPE isc_result_t
4716isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4717		  isc_task_t *task, isc_taskaction_t action, const void *arg)
4718{
4719	return (isc__socket_sendtov(sock, buflist, task, action, arg, NULL,
4720				    NULL));
4721}
4722
4723ISC_SOCKETFUNC_SCOPE isc_result_t
4724isc__socket_sendtov(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4725		    isc_task_t *task, isc_taskaction_t action, const void *arg,
4726		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4727{
4728	isc__socket_t *sock = (isc__socket_t *)sock0;
4729	isc_socketevent_t *dev;
4730	isc__socketmgr_t *manager;
4731	unsigned int iocount;
4732	isc_buffer_t *buffer;
4733
4734	REQUIRE(VALID_SOCKET(sock));
4735	REQUIRE(buflist != NULL);
4736	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4737	REQUIRE(task != NULL);
4738	REQUIRE(action != NULL);
4739
4740	manager = sock->manager;
4741	REQUIRE(VALID_MANAGER(manager));
4742
4743	iocount = isc_bufferlist_usedcount(buflist);
4744	REQUIRE(iocount > 0);
4745
4746	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4747	if (dev == NULL)
4748		return (ISC_R_NOMEMORY);
4749
4750	/*
4751	 * Move each buffer from the passed in list to our internal one.
4752	 */
4753	buffer = ISC_LIST_HEAD(*buflist);
4754	while (buffer != NULL) {
4755		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4756		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4757		buffer = ISC_LIST_HEAD(*buflist);
4758	}
4759
4760	return (socket_send(sock, dev, task, address, pktinfo, 0));
4761}
4762
4763ISC_SOCKETFUNC_SCOPE isc_result_t
4764isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region,
4765		    isc_task_t *task,
4766		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4767		    isc_socketevent_t *event, unsigned int flags)
4768{
4769	isc__socket_t *sock = (isc__socket_t *)sock0;
4770
4771	REQUIRE(VALID_SOCKET(sock));
4772	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4773	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4774		REQUIRE(sock->type == isc_sockettype_udp);
4775	event->ev_sender = sock;
4776	event->result = ISC_R_UNSET;
4777	ISC_LIST_INIT(event->bufferlist);
4778	event->region = *region;
4779	event->n = 0;
4780	event->offset = 0;
4781	event->attributes = 0;
4782
4783	return (socket_send(sock, event, task, address, pktinfo, flags));
4784}
4785
/*
 * Remove a stale filesystem entry left behind by an AF_UNIX socket.
 *
 * If 'active' is true the path is ours: it is sanity-checked to be a
 * socket (or fifo) and then unlinked unconditionally.  If 'active' is
 * false the path is probed first, and only unlinked when a connect()
 * attempt proves no live listener is behind it (ECONNREFUSED/
 * ECONNRESET).  Non-AF_UNIX addresses are ignored.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX)
		return;

/* Provide S_ISSOCK/S_ISFIFO on platforms whose <sys/stat.h> lacks
 * them, deriving from the mode bits where possible. */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
#endif
#endif

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
#endif
#endif

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif

	if (active) {
		/* Active cleanup: this is our own socket path; verify and
		 * unlink it, logging (but not failing) on errors. */
		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/* Passive cleanup: create a probe socket to test whether some
	 * other process is still listening on the path. */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:    /* We exited cleanly last time */
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
		goto cleanup;
	}

	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: %s: not a socket",
			      sockaddr->type.sunix.sun_path);
		goto cleanup;
	}

	/* Connection refused/reset means the path is an orphan from a
	 * previous run: safe to remove.  Anything else is just logged. */
	if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0) {
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
					      ISC_LOGMODULE_SOCKET,
					      ISC_LOG_WARNING,
					      "isc_socket_cleanunix: "
					      "unlink(%s): %s",
					      sockaddr->type.sunix.sun_path,
					      strbuf);
			}
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
 cleanup:
	close(s);
#else
	UNUSED(sockaddr);
	UNUSED(active);
#endif
}
4915
/*
 * Apply permissions and ownership to the filesystem object backing an
 * AF_UNIX socket.  With NEED_SECURE_DIRECTORY defined, the chmod/chown
 * are applied to the containing directory rather than the socket file
 * itself.  Both operations are attempted even if the first fails;
 * returns ISC_R_FAILURE if either failed, ISC_R_SUCCESS otherwise.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		    isc_uint32_t owner, isc_uint32_t group)
{
#ifdef ISC_PLATFORM_HAVESYSUNH
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	char path[sizeof(sockaddr->type.sunix.sun_path)];
#ifdef NEED_SECURE_DIRECTORY
	char *slash;
#endif

	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
	/* The INSIST guarantees the strcpy below cannot overflow. */
	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
	strcpy(path, sockaddr->type.sunix.sun_path);

#ifdef NEED_SECURE_DIRECTORY
	/* Strip the final path component so the mode/owner changes land
	 * on the parent directory ("/" and "." as degenerate cases). */
	slash = strrchr(path, '/');
	if (slash != NULL) {
		if (slash != path)
			*slash = '\0';
		else
			strcpy(path, "/");
	} else
		strcpy(path, ".");
#endif

	if (chmod(path, perm) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chmod(%s, %d): %s",
			      path, perm, strbuf);
		result = ISC_R_FAILURE;
	}
	if (chown(path, owner, group) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chown(%s, %d, %d): %s",
			      path, owner, group,
			      strbuf);
		result = ISC_R_FAILURE;
	}
	return (result);
#else
	UNUSED(sockaddr);
	UNUSED(perm);
	UNUSED(owner);
	UNUSED(group);
	return (ISC_R_NOTIMPLEMENTED);
#endif
}
4969
/*
 * Bind 'sock' to 'sockaddr'.  SO_REUSEADDR is set first when the
 * caller requested it (ISC_SOCKET_REUSEADDRESS) and a specific port is
 * being bound; AF_UNIX sockets skip that step entirely.  The common
 * bind() errno values are mapped onto ISC result codes.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
		 unsigned int options) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}
	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX)
		goto bind_socket;
#endif
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
		       sizeof(on)) < 0) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d) %s", sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		/* Press on... */
	}
#ifdef AF_UNIX
 bind_socket:
#endif
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Translate the usual bind() failures; anything else is
		 * logged as unexpected. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
5036
5037/*
5038 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
5041 */
5042#undef ENABLE_ACCEPTFILTER
5043
/*
 * Install an accept filter (e.g. "dataready") on a listening socket,
 * where the platform supports SO_ACCEPTFILTER.  Compiled out unless
 * ENABLE_ACCEPTFILTER is defined (see the note above); otherwise
 * returns ISC_R_NOTIMPLEMENTED.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_filter(isc_socket_t *sock0, const char *filter) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else
	UNUSED(sock);
	UNUSED(filter);
#endif

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	/* NOTE(review): strncpy does not NUL-terminate when 'filter' is
	 * as long as af_name; presumably the kernel treats af_name as a
	 * fixed-width field, but confirm before enabling this code. */
	bzero(&afa, sizeof(afa));
	strncpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
			 &afa, sizeof(afa)) == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
			   ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
			   strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else
	return (ISC_R_NOTIMPLEMENTED);
#endif
}
5073
5074/*
5075 * Set up to listen on a given socket.  We do this by creating an internal
5076 * event that will be dispatched when the socket has read activity.  The
5077 * watcher will send the internal event to the task when there is a new
5078 * connection.
5079 *
5080 * Unlike in read, we don't preallocate a done event here.  Every time there
5081 * is a new connection we'll have to allocate a new one anyway, so we might
5082 * as well keep things simple rather than having to track them.
5083 */
5084ISC_SOCKETFUNC_SCOPE isc_result_t
5085isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) {
5086	isc__socket_t *sock = (isc__socket_t *)sock0;
5087	char strbuf[ISC_STRERRORSIZE];
5088
5089	REQUIRE(VALID_SOCKET(sock));
5090
5091	LOCK(&sock->lock);
5092
5093	REQUIRE(!sock->listener);
5094	REQUIRE(sock->bound);
5095	REQUIRE(sock->type == isc_sockettype_tcp ||
5096		sock->type == isc_sockettype_unix);
5097
5098	if (backlog == 0)
5099		backlog = SOMAXCONN;
5100
5101	if (listen(sock->fd, (int)backlog) < 0) {
5102		UNLOCK(&sock->lock);
5103		isc__strerror(errno, strbuf, sizeof(strbuf));
5104
5105		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
5106
5107		return (ISC_R_UNEXPECTED);
5108	}
5109
5110	sock->listener = 1;
5111
5112	UNLOCK(&sock->lock);
5113	return (ISC_R_SUCCESS);
5114}
5115
5116/*
5117 * This should try to do aggressive accept() XXXMLG
5118 */
/*
 * Queue an accept request on a listening socket.  A NEWCONN event
 * carrying the pre-allocated child socket is delivered to 'task' when
 * a connection arrives; the watcher is poked to start watching the
 * descriptor if this is the first pending accept.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_accept(isc_socket_t *sock0,
		  isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_newconnev_t *dev;
	isc__socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc__socket_t *nsock;
	isc_result_t result;
	isc_boolean_t do_poke = ISC_FALSE;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)
		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
				   action, arg, sizeof(*dev));
	if (dev == NULL) {
		UNLOCK(&sock->lock);
		return (ISC_R_NOMEMORY);
	}
	ISC_LINK_INIT(dev, ev_link);

	/* Pre-allocate the child socket now so the accept path in the
	 * watcher never has to allocate. */
	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* The task is shutting down: undo everything. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	nsock->references++;
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = (isc_socket_t *)nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list))
		do_poke = ISC_TRUE;

	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);

	if (do_poke)
		select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
5193
/*
 * Initiate a connection on 'sock' to 'addr', delivering an
 * ISC_SOCKEVENT_CONNECT event to 'task' when the attempt resolves.
 *
 * The connect() is attempted immediately: if it completes (or fails
 * hard with a recognized errno) the event is posted right away with the
 * mapped result; if it is in progress, the request is queued and the
 * watcher is poked to watch for writability.  Only one connect may be
 * outstanding per socket.  Multicast destinations are rejected with
 * ISC_R_MULTICAST.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc__socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr))
		return (ISC_R_MULTICAST);

	LOCK(&sock->lock);

	REQUIRE(!sock->connecting);

	dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
							ISC_SOCKEVENT_CONNECT,
							action,	arg,
							sizeof(*dev));
	if (dev == NULL) {
		UNLOCK(&sock->lock);
		return (ISC_R_NOMEMORY);
	}
	ISC_LINK_INIT(dev, ev_link);

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * HP-UX "fails" to connect a UDP socket and sets errno to
		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
		 * a success and let the user detect it if it's really an error
		 * at the time of sending a packet on the socket.
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		/* Retryable or in-progress: queue and wait for the
		 * watcher to report writability. */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
			goto queue;

		/* Hard failure: map errno to an ISC result and post the
		 * event immediately via err_exit. */
		switch (errno) {
#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: log it and fail the call itself
		 * (no event is posted; the event is freed). */
		sock->connected = 0;

		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/* Recognized failure: the call "succeeds" and the error
		 * is reported through the posted event's result. */
		sock->connected = 0;
		isc_task_send(task, ISC_EVENT_PTR(&dev));

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
 success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_send(task, ISC_EVENT_PTR(&dev));

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

 queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	sock->connecting = 1;

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	if (sock->connect_ev == NULL)
		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);

	sock->connect_ev = dev;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
5335
5336/*
5337 * Called when a socket with a pending connect() finishes.
5338 */
static void
internal_connect(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	isc_socket_connev_t *dev;	/* the user's pending connect event */
	isc_task_t *task;
	int cc;				/* deferred connect() error from SO_ERROR */
	ISC_SOCKADDR_LEN_T optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	UNUSED(me);
	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	sock = ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	/*
	 * When the internal event was sent the reference count was bumped
	 * to keep the socket around for us.  Decrement the count here.
	 */
	INSIST(sock->references > 0);
	sock->references--;
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Has this event been canceled?
	 */
	dev = sock->connect_ev;
	if (dev == NULL) {
		INSIST(!sock->connecting);
		UNLOCK(&sock->lock);
		return;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
		       (void *)&cc, (void *)&optlen) < 0)
		cc = errno;	/* getsockopt() itself failed; use its errno */
	else
		errno = cc;	/* publish the deferred connect() result */

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			/* Still connecting: re-arm the watcher and wait. */
			sock->connecting = 1;
			select_poke(sock->manager, sock->fd,
				    SELECT_POKE_CONNECT);
			UNLOCK(&sock->lock);

			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b) case a: dev->result = b; break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			dev->result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		/* connect() completed successfully. */
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		dev->result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	sock->connect_ev = NULL;	/* event ownership passes to the task */

	UNLOCK(&sock->lock);

	/*
	 * Deliver the done event to the requesting task, detaching the
	 * task reference that was attached when the connect was queued.
	 */
	task = dev->ev_sender;
	dev->ev_sender = sock;
	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
}
5454
5455ISC_SOCKETFUNC_SCOPE isc_result_t
5456isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5457	isc__socket_t *sock = (isc__socket_t *)sock0;
5458	isc_result_t result;
5459
5460	REQUIRE(VALID_SOCKET(sock));
5461	REQUIRE(addressp != NULL);
5462
5463	LOCK(&sock->lock);
5464
5465	if (sock->connected) {
5466		*addressp = sock->peer_address;
5467		result = ISC_R_SUCCESS;
5468	} else {
5469		result = ISC_R_NOTCONNECTED;
5470	}
5471
5472	UNLOCK(&sock->lock);
5473
5474	return (result);
5475}
5476
5477ISC_SOCKETFUNC_SCOPE isc_result_t
5478isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5479	isc__socket_t *sock = (isc__socket_t *)sock0;
5480	ISC_SOCKADDR_LEN_T len;
5481	isc_result_t result;
5482	char strbuf[ISC_STRERRORSIZE];
5483
5484	REQUIRE(VALID_SOCKET(sock));
5485	REQUIRE(addressp != NULL);
5486
5487	LOCK(&sock->lock);
5488
5489	if (!sock->bound) {
5490		result = ISC_R_NOTBOUND;
5491		goto out;
5492	}
5493
5494	result = ISC_R_SUCCESS;
5495
5496	len = sizeof(addressp->type);
5497	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5498		isc__strerror(errno, strbuf, sizeof(strbuf));
5499		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5500				 strbuf);
5501		result = ISC_R_UNEXPECTED;
5502		goto out;
5503	}
5504	addressp->length = (unsigned int)len;
5505
5506 out:
5507	UNLOCK(&sock->lock);
5508
5509	return (result);
5510}
5511
5512/*
5513 * Run through the list of events on this socket, and cancel the ones
5514 * queued for task "task" of type "how".  "how" is a bitmask.
5515 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0)
		return;

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
	    && !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		/* task == NULL cancels for every task. */
		while (dev != NULL) {
			current_task = dev->ev_sender;
			/* Grab the next link before the event is consumed. */
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
	    && !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
	    && !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {

				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/*
				 * The child socket pre-allocated for this
				 * accept was never handed to the user; drop
				 * its reference and free it here.
				 */
				NEWCONNSOCK(dev)->references--;
				free_socket((isc__socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendanddetach(&current_task,
						       ISC_EVENT_PTR(&dev));
			}

			dev = next;
		}
	}

	/*
	 * Connecting is not a list.
	 */
	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
	    && sock->connect_ev != NULL) {
		isc_socket_connev_t    *dev;
		isc_task_t	       *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = sock->connect_ev;
		current_task = dev->ev_sender;

		if ((task == NULL) || (task == current_task)) {
			sock->connect_ev = NULL;

			dev->result = ISC_R_CANCELED;
			dev->ev_sender = sock;
			isc_task_sendanddetach(&current_task,
					       ISC_EVENT_PTR(&dev));
		}
	}

	UNLOCK(&sock->lock);
}
5636
5637ISC_SOCKETFUNC_SCOPE isc_sockettype_t
5638isc__socket_gettype(isc_socket_t *sock0) {
5639	isc__socket_t *sock = (isc__socket_t *)sock0;
5640
5641	REQUIRE(VALID_SOCKET(sock));
5642
5643	return (sock->type);
5644}
5645
5646ISC_SOCKETFUNC_SCOPE isc_boolean_t
5647isc__socket_isbound(isc_socket_t *sock0) {
5648	isc__socket_t *sock = (isc__socket_t *)sock0;
5649	isc_boolean_t val;
5650
5651	REQUIRE(VALID_SOCKET(sock));
5652
5653	LOCK(&sock->lock);
5654	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5655	UNLOCK(&sock->lock);
5656
5657	return (val);
5658}
5659
ISC_SOCKETFUNC_SCOPE void
isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(IPV6_V6ONLY)
	int onoff = yes ? 1 : 0;
#else
	UNUSED(yes);
	UNUSED(sock);
#endif

	/*
	 * Set or clear the IPV6_V6ONLY option on an AF_INET6 socket.
	 * A no-op when IPV6_V6ONLY is not available, or when the socket
	 * is not an IPv6 socket.  A setsockopt() failure is logged but
	 * not reported to the caller.
	 */

	REQUIRE(VALID_SOCKET(sock));

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0) {
			char strbuf[ISC_STRERRORSIZE];
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
	}
	FIX_IPV6_RECVPKTINFO(sock);	/* AIX */
#endif
}
5691
#ifndef USE_WATCHER_THREAD
/*
 * In our assumed scenario, we can simply use a single static object.
 * XXX: this is not true if the application uses multiple threads with
 *      'multi-context' mode.  Fixing this is a future TODO item.
 */
/* Scratch state handed from isc__socketmgr_waitevents() to _dispatch(). */
static isc_socketwait_t swait_private;
5699
/*
 * Wait for I/O events on the manager's descriptors on behalf of an
 * application-driven event loop (no watcher thread).  'tvp' is the
 * maximum time to wait (NULL blocks indefinitely).  On return, '*swaitp'
 * points at the static wait-state object to pass to
 * isc__socketmgr_dispatch().  Returns the backend's raw event count
 * (the kevent/epoll_wait/ioctl/select return value), or 0 if there is
 * no manager.
 */
int
isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
			  isc_socketwait_t **swaitp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;


	int n;
#ifdef USE_KQUEUE
	struct timespec ts, *tsp;
#endif
#ifdef USE_EPOLL
	int timeout;
#endif
#ifdef USE_DEVPOLL
	struct dvpoll dvp;
#endif

	REQUIRE(swaitp != NULL && *swaitp == NULL);

#ifdef USE_SHARED_MANAGER
	if (manager == NULL)
		manager = socketmgr;
#endif
	if (manager == NULL)
		return (0);

#ifdef USE_KQUEUE
	/* kqueue takes a timespec; convert from the timeval. */
	if (tvp != NULL) {
		ts.tv_sec = tvp->tv_sec;
		ts.tv_nsec = tvp->tv_usec * 1000;
		tsp = &ts;
	} else
		tsp = NULL;
	swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
				       manager->events, manager->nevents,
				       tsp);
	n = swait_private.nevents;
#elif defined(USE_EPOLL)
	/* epoll takes milliseconds; round the microseconds up. */
	if (tvp != NULL)
		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
	else
		timeout = -1;
	swait_private.nevents = epoll_wait(manager->epoll_fd,
					   manager->events,
					   manager->nevents, timeout);
	n = swait_private.nevents;
#elif defined(USE_DEVPOLL)
	/* /dev/poll also takes milliseconds (-1 == block forever). */
	dvp.dp_fds = manager->events;
	dvp.dp_nfds = manager->nevents;
	if (tvp != NULL) {
		dvp.dp_timeout = tvp->tv_sec * 1000 +
			(tvp->tv_usec + 999) / 1000;
	} else
		dvp.dp_timeout = -1;
	swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
	n = swait_private.nevents;
#elif defined(USE_SELECT)
	/*
	 * select() modifies its fd sets in place, so work on copies and
	 * keep the manager's master sets intact.
	 */
	memcpy(manager->read_fds_copy, manager->read_fds,  manager->fd_bufsize);
	memcpy(manager->write_fds_copy, manager->write_fds,
	       manager->fd_bufsize);

	swait_private.readset = manager->read_fds_copy;
	swait_private.writeset = manager->write_fds_copy;
	swait_private.maxfd = manager->maxfd + 1;

	n = select(swait_private.maxfd, swait_private.readset,
		   swait_private.writeset, NULL, tvp);
#endif

	*swaitp = &swait_private;
	return (n);
}
5773
/*
 * Process the events previously collected by isc__socketmgr_waitevents()
 * ('swait' must be the pointer that call returned).  Returns
 * ISC_R_NOTFOUND if there is no manager, ISC_R_SUCCESS otherwise.
 */
isc_result_t
isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	/* Only the single static wait object is supported. */
	REQUIRE(swait == &swait_private);

#ifdef USE_SHARED_MANAGER
	if (manager == NULL)
		manager = socketmgr;
#endif
	if (manager == NULL)
		return (ISC_R_NOTFOUND);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	(void)process_fds(manager, manager->events, swait->nevents);
	return (ISC_R_SUCCESS);
#elif defined(USE_SELECT)
	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
	return (ISC_R_SUCCESS);
#endif
}
5795#endif /* USE_WATCHER_THREAD */
5796
5797#ifdef BIND9
5798void
5799isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
5800	isc__socket_t *socket = (isc__socket_t *)socket0;
5801
5802	/*
5803	 * Name 'socket'.
5804	 */
5805
5806	REQUIRE(VALID_SOCKET(socket));
5807
5808	LOCK(&socket->lock);
5809	memset(socket->name, 0, sizeof(socket->name));
5810	strncpy(socket->name, name, sizeof(socket->name) - 1);
5811	socket->tag = tag;
5812	UNLOCK(&socket->lock);
5813}
5814
5815ISC_SOCKETFUNC_SCOPE const char *
5816isc__socket_getname(isc_socket_t *socket0) {
5817	isc__socket_t *socket = (isc__socket_t *)socket0;
5818
5819	return (socket->name);
5820}
5821
5822void *
5823isc__socket_gettag(isc_socket_t *socket0) {
5824	isc__socket_t *socket = (isc__socket_t *)socket0;
5825
5826	return (socket->tag);
5827}
5828#endif	/* BIND9 */
5829
5830#ifdef USE_SOCKETIMPREGISTER
5831isc_result_t
5832isc__socket_register() {
5833	return (isc_socket_register(isc__socketmgr_create));
5834}
5835#endif
5836
5837#if defined(HAVE_LIBXML2) && defined(BIND9)
5838
5839static const char *
5840_socktype(isc_sockettype_t type)
5841{
5842	if (type == isc_sockettype_udp)
5843		return ("udp");
5844	else if (type == isc_sockettype_tcp)
5845		return ("tcp");
5846	else if (type == isc_sockettype_unix)
5847		return ("unix");
5848	else if (type == isc_sockettype_fdwatch)
5849		return ("fdwatch");
5850	else
5851		return ("not-initialized");
5852}
5853
/*
 * Render the manager's socket list as XML via 'writer', for the
 * statistics channel.  The manager lock is held for the whole dump and
 * each socket's lock while its fields are read.
 *
 * NOTE(review): xmlTextWriter* return codes are ignored throughout, so
 * XML generation errors go undetected.
 */
ISC_SOCKETFUNC_SCOPE void
isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) {
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	ISC_SOCKADDR_LEN_T len;

	LOCK(&mgr->lock);

#ifdef USE_SHARED_MANAGER
	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
	xmlTextWriterEndElement(writer);
#endif	/* USE_SHARED_MANAGER */

	/* One <socket> element per socket on the manager's list. */
	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");

		/* The socket's address doubles as its identifier. */
		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
		xmlTextWriterWriteFormatString(writer, "%p", sock);
		xmlTextWriterEndElement(writer);

		if (sock->name[0] != 0) {
			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
			xmlTextWriterWriteFormatString(writer, "%s",
						       sock->name);
			xmlTextWriterEndElement(writer); /* name */
		}

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
		xmlTextWriterEndElement(writer);

		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
					  ISC_XMLCHAR _socktype(sock->type));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "peer-address",
						  ISC_XMLCHAR peerbuf);
		}

		/* Local address, if the fd can report one. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "local-address",
						  ISC_XMLCHAR peerbuf);
		}

		/* One <state> element per active state flag. */
		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
		if (sock->pending_recv)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						ISC_XMLCHAR "pending-receive");
		if (sock->pending_send)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending-send");
		if (sock->pending_accept)
			/*
			 * NOTE(review): underscore ("pending_accept") where
			 * the other states use hyphens; kept as-is since
			 * consumers may match the historical spelling.
			 */
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						 ISC_XMLCHAR "pending_accept");
		if (sock->listener)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "listener");
		if (sock->connected)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connected");
		if (sock->connecting)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connecting");
		if (sock->bound)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "bound");

		xmlTextWriterEndElement(writer); /* states */

		xmlTextWriterEndElement(writer); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	xmlTextWriterEndElement(writer); /* sockets */

	UNLOCK(&mgr->lock);
}
#endif /* HAVE_LIBXML2 && BIND9 */
5945