1/*
2 * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1998-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id$ */
19
20/*! \file */
21
22#include <config.h>
23
24#include <sys/param.h>
25#include <sys/types.h>
26#include <sys/socket.h>
27#include <sys/stat.h>
28#include <sys/time.h>
29#include <sys/uio.h>
30
31#include <errno.h>
32#include <fcntl.h>
33#include <stddef.h>
34#include <stdlib.h>
35#include <string.h>
36#include <unistd.h>
37
38#include <isc/buffer.h>
39#include <isc/bufferlist.h>
40#include <isc/condition.h>
41#include <isc/formatcheck.h>
42#include <isc/list.h>
43#include <isc/log.h>
44#include <isc/mem.h>
45#include <isc/msgs.h>
46#include <isc/mutex.h>
47#include <isc/net.h>
48#include <isc/once.h>
49#include <isc/platform.h>
50#include <isc/print.h>
51#include <isc/region.h>
52#include <isc/socket.h>
53#include <isc/stats.h>
54#include <isc/strerror.h>
55#include <isc/task.h>
56#include <isc/thread.h>
57#include <isc/util.h>
58#include <isc/xml.h>
59
60#ifdef ISC_PLATFORM_HAVESYSUNH
61#include <sys/un.h>
62#endif
63#ifdef ISC_PLATFORM_HAVEKQUEUE
64#include <sys/event.h>
65#endif
66#ifdef ISC_PLATFORM_HAVEEPOLL
67#include <sys/epoll.h>
68#endif
69#ifdef ISC_PLATFORM_HAVEDEVPOLL
70#if defined(HAVE_SYS_DEVPOLL_H)
71#include <sys/devpoll.h>
72#elif defined(HAVE_DEVPOLL_H)
73#include <devpoll.h>
74#endif
75#endif
76
77#include "errno2result.h"
78
79/* See task.c about the following definition: */
80#ifdef BIND9
81#ifdef ISC_PLATFORM_USETHREADS
82#define USE_WATCHER_THREAD
83#else
84#define USE_SHARED_MANAGER
85#endif	/* ISC_PLATFORM_USETHREADS */
86#endif	/* BIND9 */
87
88#ifndef USE_WATCHER_THREAD
89#include "socket_p.h"
90#include "../task_p.h"
91#endif /* USE_WATCHER_THREAD */
92
93#if defined(SO_BSDCOMPAT) && defined(__linux__)
94#include <sys/utsname.h>
95#endif
96
97/*%
98 * Choose the most preferable multiplex method.
99 */
100#ifdef ISC_PLATFORM_HAVEKQUEUE
101#define USE_KQUEUE
102#elif defined (ISC_PLATFORM_HAVEEPOLL)
103#define USE_EPOLL
104#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
105#define USE_DEVPOLL
/*%
 * Per-descriptor record of which directions are currently being polled
 * through /dev/poll.  watch_fd() sets a flag and unwatch_fd() clears it.
 */
typedef struct {
	unsigned int want_read : 1;	/* polling for readability */
	unsigned int want_write : 1;	/* polling for writability */
} pollinfo_t;
110#else
111#define USE_SELECT
112#endif	/* ISC_PLATFORM_HAVEKQUEUE */
113
114#ifndef USE_WATCHER_THREAD
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
/*%
 * Wait state for the kqueue, epoll and /dev/poll backends.
 */
struct isc_socketwait {
	int nevents;
};
#elif defined (USE_SELECT)
/*%
 * Wait state for the select() backend.
 */
struct isc_socketwait {
	fd_set *readset, *writeset;
	int nfds, maxfd;
};
#endif	/* USE_KQUEUE */
127#endif /* !USE_WATCHER_THREAD */
128
129/*%
130 * Maximum number of allowable open sockets.  This is also the maximum
131 * allowable socket file descriptor.
132 *
133 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system default
135 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
136 * the vast majority of cases.  This constant should therefore be increased only
137 * when absolutely necessary and possible, i.e., the server is exhausting all
138 * available file descriptors (up to FD_SETSIZE) and the select() function
139 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
141 * portability as possible).  Note also that overall server performance
142 * may be rather worsened with a larger value of this constant due to
143 * inherent scalability problems of select().
144 *
145 * As a special note, this value shouldn't have to be touched if
146 * this is a build for an authoritative only DNS server.
147 */
148#ifndef ISC_SOCKET_MAXSOCKETS
149#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
150#define ISC_SOCKET_MAXSOCKETS 4096
151#elif defined(USE_SELECT)
152#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
153#endif	/* USE_KQUEUE... */
154#endif	/* ISC_SOCKET_MAXSOCKETS */
155
156#ifdef USE_SELECT
157/*%
158 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified at run time.
160 */
161#ifdef __APPLE__
162#define _DARWIN_UNLIMITED_SELECT
163#endif	/* __APPLE__ */
164#endif	/* USE_SELECT */
165
166#ifdef ISC_SOCKET_USE_POLLWATCH
167/*%
168 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
169 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
170 * some of the specified FD.  The idea is based on the observation that it's
171 * likely for a busy server to keep receiving packets.  It specifically works
172 * as follows: the socket watcher is first initialized with the state of
173 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
174 * event occurs.  When it wakes up for a socket I/O event, it moves to the
175 * poll_active state, and sets the poll timeout to a short period
176 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
177 * watcher goes to the poll_checking state with the same timeout period.
178 * In this state, the watcher tries to detect whether this is a break
179 * during intermittent events or the kernel bug is triggered.  If the next
180 * polling reports an event within the short period, the previous timeout is
181 * likely to be a kernel bug, and so the watcher goes back to the active state.
182 * Otherwise, it moves to the idle state again.
183 *
184 * It's not clear whether this is a thread-related bug, but since we've only
185 * seen this with threads, this workaround is used only when enabling threads.
186 */
187
/*% States of the /dev/poll watcher workaround. */
typedef enum {
	poll_idle,
	poll_active,
	poll_checking
} pollstate_t;
189
190#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
191#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
192#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
193#endif	/* ISC_SOCKET_USE_POLLWATCH */
194
195/*%
196 * Size of per-FD lock buckets.
197 */
198#ifdef ISC_PLATFORM_USETHREADS
199#define FDLOCK_COUNT		1024
200#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
201#else
202#define FDLOCK_COUNT		1
203#define FDLOCK_ID(fd)		0
204#endif	/* ISC_PLATFORM_USETHREADS */
205
206/*%
207 * Maximum number of events communicated with the kernel.  There should normally
208 * be no need for having a large number.
209 */
210#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
211#ifndef ISC_SOCKET_MAXEVENTS
212#define ISC_SOCKET_MAXEVENTS	64
213#endif
214#endif
215
216/*%
217 * Some systems define the socket length argument as an int, some as size_t,
218 * some as socklen_t.  This is here so it can be easily changed if needed.
219 */
220#ifndef ISC_SOCKADDR_LEN_T
221#define ISC_SOCKADDR_LEN_T unsigned int
222#endif
223
224/*%
225 * Define what the possible "soft" errors can be.  These are non-fatal returns
226 * of various network related functions, like recv() and so on.
227 *
228 * For some reason, BSDI (and perhaps others) will sometimes return <0
229 * from recv() but will have errno==0.  This is broken, but we have to
230 * work around it here.
231 */
232#define SOFT_ERROR(e)	((e) == EAGAIN || \
233			 (e) == EWOULDBLOCK || \
234			 (e) == EINTR || \
235			 (e) == 0)
236
237#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
238
239/*!<
240 * DLVL(90)  --  Function entry/exit and other tracing.
241 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
242 * DLVL(60)  --  Socket data send/receive
243 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
244 * DLVL(20)  --  Socket creation/destruction.
245 */
246#define TRACE_LEVEL		90
247#define CORRECTNESS_LEVEL	70
248#define IOEVENT_LEVEL		60
249#define EVENT_LEVEL		50
250#define CREATION_LEVEL		20
251
252#define TRACE		DLVL(TRACE_LEVEL)
253#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
254#define IOEVENT		DLVL(IOEVENT_LEVEL)
255#define EVENT		DLVL(EVENT_LEVEL)
256#define CREATION	DLVL(CREATION_LEVEL)
257
258typedef isc_event_t intev_t;
259
260#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
261#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
262
263/*!
264 * IPv6 control information.  If the socket is an IPv6 socket we want
265 * to collect the destination address and interface so the client can
266 * set them on outgoing packets.
267 */
268#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
269#ifndef USE_CMSG
270#define USE_CMSG	1
271#endif
272#endif
273
274/*%
275 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
276 * a setsockopt() like interface to request timestamps, and if the OS
277 * doesn't do it for us, call gettimeofday() on every UDP receive?
278 */
279#ifdef SO_TIMESTAMP
280#ifndef USE_CMSG
281#define USE_CMSG	1
282#endif
283#endif
284
285/*%
286 * The size to raise the receive buffer to (from BIND 8).
287 */
288#define RCVBUFSIZE (32*1024)
289
290/*%
291 * The number of times a send operation is repeated if the result is EINTR.
292 */
293#define NRETRIES 10
294
295typedef struct isc__socket isc__socket_t;
296typedef struct isc__socketmgr isc__socketmgr_t;
297
298#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
299
300struct isc__socket {
301	/* Not locked. */
302	isc_socket_t		common;
303	isc__socketmgr_t	*manager;
304	isc_mutex_t		lock;
305	isc_sockettype_t	type;
306	const isc_statscounter_t	*statsindex;
307
308	/* Locked by socket lock. */
309	ISC_LINK(isc__socket_t)	link;
310	unsigned int		references;
311	int			fd;
312	int			pf;
313	char				name[16];
314	void *				tag;
315
316	ISC_LIST(isc_socketevent_t)		send_list;
317	ISC_LIST(isc_socketevent_t)		recv_list;
318	ISC_LIST(isc_socket_newconnev_t)	accept_list;
319	isc_socket_connev_t		       *connect_ev;
320
321	/*
322	 * Internal events.  Posted when a descriptor is readable or
323	 * writable.  These are statically allocated and never freed.
324	 * They will be set to non-purgable before use.
325	 */
326	intev_t			readable_ev;
327	intev_t			writable_ev;
328
329	isc_sockaddr_t		peer_address;  /* remote address */
330
331	unsigned int		pending_recv : 1,
332				pending_send : 1,
333				pending_accept : 1,
334				listener : 1, /* listener socket */
335				connected : 1,
336				connecting : 1, /* connect pending */
337				bound : 1, /* bound to local addr */
338				dupped : 1;
339
340#ifdef ISC_NET_RECVOVERFLOW
341	unsigned char		overflow; /* used for MSG_TRUNC fake */
342#endif
343
344	char			*recvcmsgbuf;
345	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
346	char			*sendcmsgbuf;
347	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
348
349	void			*fdwatcharg;
350	isc_sockfdwatch_t	fdwatchcb;
351	int			fdwatchflags;
352	isc_task_t		*fdwatchtask;
353};
354
355#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
356#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
357
358struct isc__socketmgr {
359	/* Not locked. */
360	isc_socketmgr_t		common;
361	isc_mem_t	       *mctx;
362	isc_mutex_t		lock;
363	isc_mutex_t		*fdlock;
364	isc_stats_t		*stats;
365#ifdef USE_KQUEUE
366	int			kqueue_fd;
367	int			nevents;
368	struct kevent		*events;
369#endif	/* USE_KQUEUE */
370#ifdef USE_EPOLL
371	int			epoll_fd;
372	int			nevents;
373	struct epoll_event	*events;
374#endif	/* USE_EPOLL */
375#ifdef USE_DEVPOLL
376	int			devpoll_fd;
377	int			nevents;
378	struct pollfd		*events;
379#endif	/* USE_DEVPOLL */
380#ifdef USE_SELECT
381	int			fd_bufsize;
382#endif	/* USE_SELECT */
383	unsigned int		maxsocks;
384#ifdef ISC_PLATFORM_USETHREADS
385	int			pipe_fds[2];
386#endif
387
388	/* Locked by fdlock. */
389	isc__socket_t	       **fds;
390	int			*fdstate;
391#ifdef USE_DEVPOLL
392	pollinfo_t		*fdpollinfo;
393#endif
394
395	/* Locked by manager lock. */
396	ISC_LIST(isc__socket_t)	socklist;
397#ifdef USE_SELECT
398	fd_set			*read_fds;
399	fd_set			*read_fds_copy;
400	fd_set			*write_fds;
401	fd_set			*write_fds_copy;
402	int			maxfd;
403#endif	/* USE_SELECT */
404	int			reserved;	/* unlocked */
405#ifdef USE_WATCHER_THREAD
406	isc_thread_t		watcher;
407	isc_condition_t		shutdown_ok;
408#else /* USE_WATCHER_THREAD */
409	unsigned int		refs;
410#endif /* USE_WATCHER_THREAD */
411	int			maxudp;
412};
413
414#ifdef USE_SHARED_MANAGER
415static isc__socketmgr_t *socketmgr = NULL;
416#endif /* USE_SHARED_MANAGER */
417
418#define CLOSED			0	/* this one must be zero */
419#define MANAGED			1
420#define CLOSE_PENDING		2
421
422/*
423 * send() and recv() iovec counts
424 */
425#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
426#ifdef ISC_NET_RECVOVERFLOW
427# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
428#else
429# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
430#endif
431
432static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
433				  isc_sockettype_t type,
434				  isc_socket_t **socketp,
435				  isc_socket_t *dup_socket);
436static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
437static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
438static void free_socket(isc__socket_t **);
439static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
440				    isc__socket_t **);
441static void destroy(isc__socket_t **);
442static void internal_accept(isc_task_t *, isc_event_t *);
443static void internal_connect(isc_task_t *, isc_event_t *);
444static void internal_recv(isc_task_t *, isc_event_t *);
445static void internal_send(isc_task_t *, isc_event_t *);
446static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
447static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
448static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
449static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
450			      struct msghdr *, struct iovec *, size_t *);
451static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
452			      struct msghdr *, struct iovec *, size_t *);
453#ifdef USE_WATCHER_THREAD
454static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
455#endif
456
457/*%
458 * The following can be either static or public, depending on build environment.
459 */
460
461#ifdef BIND9
462#define ISC_SOCKETFUNC_SCOPE
463#else
464#define ISC_SOCKETFUNC_SCOPE static
465#endif
466
467ISC_SOCKETFUNC_SCOPE isc_result_t
468isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
469		   isc_socket_t **socketp);
470ISC_SOCKETFUNC_SCOPE void
471isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
472ISC_SOCKETFUNC_SCOPE void
473isc__socket_detach(isc_socket_t **socketp);
474ISC_SOCKETFUNC_SCOPE isc_result_t
475isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
476ISC_SOCKETFUNC_SCOPE isc_result_t
477isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
478		       unsigned int maxsocks);
479ISC_SOCKETFUNC_SCOPE void
480isc__socketmgr_destroy(isc_socketmgr_t **managerp);
481ISC_SOCKETFUNC_SCOPE isc_result_t
482isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
483		 unsigned int minimum, isc_task_t *task,
484		  isc_taskaction_t action, const void *arg);
485ISC_SOCKETFUNC_SCOPE isc_result_t
486isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
487		 unsigned int minimum, isc_task_t *task,
488		 isc_taskaction_t action, const void *arg);
489ISC_SOCKETFUNC_SCOPE isc_result_t
490isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
491		  unsigned int minimum, isc_task_t *task,
492		  isc_socketevent_t *event, unsigned int flags);
493ISC_SOCKETFUNC_SCOPE isc_result_t
494isc__socket_send(isc_socket_t *sock, isc_region_t *region,
495		 isc_task_t *task, isc_taskaction_t action, const void *arg);
496ISC_SOCKETFUNC_SCOPE isc_result_t
497isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
498		   isc_task_t *task, isc_taskaction_t action, const void *arg,
499		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
500ISC_SOCKETFUNC_SCOPE isc_result_t
501isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
502		  isc_task_t *task, isc_taskaction_t action, const void *arg);
503ISC_SOCKETFUNC_SCOPE isc_result_t
504isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
505		    isc_task_t *task, isc_taskaction_t action, const void *arg,
506		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
507ISC_SOCKETFUNC_SCOPE isc_result_t
508isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
509		    isc_task_t *task,
510		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
511		    isc_socketevent_t *event, unsigned int flags);
512ISC_SOCKETFUNC_SCOPE void
513isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
514ISC_SOCKETFUNC_SCOPE isc_result_t
515isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
516		     isc_uint32_t owner, isc_uint32_t group);
517ISC_SOCKETFUNC_SCOPE isc_result_t
518isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
519		 unsigned int options);
520ISC_SOCKETFUNC_SCOPE isc_result_t
521isc__socket_filter(isc_socket_t *sock, const char *filter);
522ISC_SOCKETFUNC_SCOPE isc_result_t
523isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
524ISC_SOCKETFUNC_SCOPE isc_result_t
525isc__socket_accept(isc_socket_t *sock,
526		   isc_task_t *task, isc_taskaction_t action, const void *arg);
527ISC_SOCKETFUNC_SCOPE isc_result_t
528isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
529		    isc_task_t *task, isc_taskaction_t action,
530		    const void *arg);
531ISC_SOCKETFUNC_SCOPE isc_result_t
532isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
533ISC_SOCKETFUNC_SCOPE isc_result_t
534isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
535ISC_SOCKETFUNC_SCOPE void
536isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
537ISC_SOCKETFUNC_SCOPE isc_sockettype_t
538isc__socket_gettype(isc_socket_t *sock);
539ISC_SOCKETFUNC_SCOPE isc_boolean_t
540isc__socket_isbound(isc_socket_t *sock);
541ISC_SOCKETFUNC_SCOPE void
542isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
543#if defined(HAVE_LIBXML2) && defined(BIND9)
544ISC_SOCKETFUNC_SCOPE void
545isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
546#endif
547
548ISC_SOCKETFUNC_SCOPE isc_result_t
549isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
550			  isc_sockfdwatch_t callback, void *cbarg,
551			  isc_task_t *task, isc_socket_t **socketp);
552ISC_SOCKETFUNC_SCOPE isc_result_t
553isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
554ISC_SOCKETFUNC_SCOPE isc_result_t
555isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
556ISC_SOCKETFUNC_SCOPE int
557isc__socket_getfd(isc_socket_t *sock);
558
559static struct {
560	isc_socketmethods_t methods;
561
562	/*%
563	 * The following are defined just for avoiding unused static functions.
564	 */
565#ifndef BIND9
566	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
567		*listen, *accept, *getpeername, *isbound;
568#endif
569} socketmethods = {
570	{
571		isc__socket_attach,
572		isc__socket_detach,
573		isc__socket_bind,
574		isc__socket_sendto,
575		isc__socket_connect,
576		isc__socket_recv,
577		isc__socket_cancel,
578		isc__socket_getsockname,
579		isc__socket_gettype,
580		isc__socket_ipv6only,
581		isc__socket_fdwatchpoke,
582		isc__socket_dup,
583		isc__socket_getfd
584	}
585#ifndef BIND9
586	,
587	(void *)isc__socket_recvv, (void *)isc__socket_send,
588	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
589	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
590	(void *)isc__socket_filter, (void *)isc__socket_listen,
591	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
592	(void *)isc__socket_isbound
593#endif
594};
595
596static isc_socketmgrmethods_t socketmgrmethods = {
597	isc__socketmgr_destroy,
598	isc__socket_create,
599	isc__socket_fdwatchcreate
600};
601
602#define SELECT_POKE_SHUTDOWN		(-1)
603#define SELECT_POKE_NOTHING		(-2)
604#define SELECT_POKE_READ		(-3)
605#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
606#define SELECT_POKE_WRITE		(-4)
607#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
608#define SELECT_POKE_CLOSE		(-5)
609
610#define SOCK_DEAD(s)			((s)->references == 0)
611
612/*%
613 * Shortcut index arrays to get access to statistics counters.
614 */
enum {
	STATID_OPEN = 0,	/* 0 */
	STATID_OPENFAIL,	/* 1 */
	STATID_CLOSE,		/* 2 */
	STATID_BINDFAIL,	/* 3 */
	STATID_CONNECTFAIL,	/* 4 */
	STATID_CONNECT,		/* 5 */
	STATID_ACCEPTFAIL,	/* 6 */
	STATID_ACCEPT,		/* 7 */
	STATID_SENDFAIL,	/* 8 */
	STATID_RECVFAIL		/* 9 */
};
627static const isc_statscounter_t upd4statsindex[] = {
628	isc_sockstatscounter_udp4open,
629	isc_sockstatscounter_udp4openfail,
630	isc_sockstatscounter_udp4close,
631	isc_sockstatscounter_udp4bindfail,
632	isc_sockstatscounter_udp4connectfail,
633	isc_sockstatscounter_udp4connect,
634	-1,
635	-1,
636	isc_sockstatscounter_udp4sendfail,
637	isc_sockstatscounter_udp4recvfail
638};
639static const isc_statscounter_t upd6statsindex[] = {
640	isc_sockstatscounter_udp6open,
641	isc_sockstatscounter_udp6openfail,
642	isc_sockstatscounter_udp6close,
643	isc_sockstatscounter_udp6bindfail,
644	isc_sockstatscounter_udp6connectfail,
645	isc_sockstatscounter_udp6connect,
646	-1,
647	-1,
648	isc_sockstatscounter_udp6sendfail,
649	isc_sockstatscounter_udp6recvfail
650};
651static const isc_statscounter_t tcp4statsindex[] = {
652	isc_sockstatscounter_tcp4open,
653	isc_sockstatscounter_tcp4openfail,
654	isc_sockstatscounter_tcp4close,
655	isc_sockstatscounter_tcp4bindfail,
656	isc_sockstatscounter_tcp4connectfail,
657	isc_sockstatscounter_tcp4connect,
658	isc_sockstatscounter_tcp4acceptfail,
659	isc_sockstatscounter_tcp4accept,
660	isc_sockstatscounter_tcp4sendfail,
661	isc_sockstatscounter_tcp4recvfail
662};
663static const isc_statscounter_t tcp6statsindex[] = {
664	isc_sockstatscounter_tcp6open,
665	isc_sockstatscounter_tcp6openfail,
666	isc_sockstatscounter_tcp6close,
667	isc_sockstatscounter_tcp6bindfail,
668	isc_sockstatscounter_tcp6connectfail,
669	isc_sockstatscounter_tcp6connect,
670	isc_sockstatscounter_tcp6acceptfail,
671	isc_sockstatscounter_tcp6accept,
672	isc_sockstatscounter_tcp6sendfail,
673	isc_sockstatscounter_tcp6recvfail
674};
675static const isc_statscounter_t unixstatsindex[] = {
676	isc_sockstatscounter_unixopen,
677	isc_sockstatscounter_unixopenfail,
678	isc_sockstatscounter_unixclose,
679	isc_sockstatscounter_unixbindfail,
680	isc_sockstatscounter_unixconnectfail,
681	isc_sockstatscounter_unixconnect,
682	isc_sockstatscounter_unixacceptfail,
683	isc_sockstatscounter_unixaccept,
684	isc_sockstatscounter_unixsendfail,
685	isc_sockstatscounter_unixrecvfail
686};
687static const isc_statscounter_t fdwatchstatsindex[] = {
688	-1,
689	-1,
690	isc_sockstatscounter_fdwatchclose,
691	isc_sockstatscounter_fdwatchbindfail,
692	isc_sockstatscounter_fdwatchconnectfail,
693	isc_sockstatscounter_fdwatchconnect,
694	-1,
695	-1,
696	isc_sockstatscounter_fdwatchsendfail,
697	isc_sockstatscounter_fdwatchrecvfail
698};
699
700#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
701    defined(USE_WATCHER_THREAD)
702static void
703manager_log(isc__socketmgr_t *sockmgr,
704	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
705	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
706static void
707manager_log(isc__socketmgr_t *sockmgr,
708	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
709	    const char *fmt, ...)
710{
711	char msgbuf[2048];
712	va_list ap;
713
714	if (! isc_log_wouldlog(isc_lctx, level))
715		return;
716
717	va_start(ap, fmt);
718	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
719	va_end(ap);
720
721	isc_log_write(isc_lctx, category, module, level,
722		      "sockmgr %p: %s", sockmgr, msgbuf);
723}
724#endif
725
726static void
727socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
728	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
729	   isc_msgcat_t *msgcat, int msgset, int message,
730	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
731static void
732socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
733	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
734	   isc_msgcat_t *msgcat, int msgset, int message,
735	   const char *fmt, ...)
736{
737	char msgbuf[2048];
738	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
739	va_list ap;
740
741	if (! isc_log_wouldlog(isc_lctx, level))
742		return;
743
744	va_start(ap, fmt);
745	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
746	va_end(ap);
747
748	if (address == NULL) {
749		isc_log_iwrite(isc_lctx, category, module, level,
750			       msgcat, msgset, message,
751			       "socket %p: %s", sock, msgbuf);
752	} else {
753		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
754		isc_log_iwrite(isc_lctx, category, module, level,
755			       msgcat, msgset, message,
756			       "socket %p %s: %s", sock, peerbuf, msgbuf);
757	}
758}
759
760#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
761    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
762/*
763 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
764 * setting IPV6_V6ONLY.
765 */
766static void
767FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
768{
769	char strbuf[ISC_STRERRORSIZE];
770	int on = 1;
771
772	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
773		return;
774
775	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
776		       (void *)&on, sizeof(on)) < 0) {
777
778		isc__strerror(errno, strbuf, sizeof(strbuf));
779		UNEXPECTED_ERROR(__FILE__, __LINE__,
780				 "setsockopt(%d, IPV6_RECVPKTINFO) "
781				 "%s: %s", sock->fd,
782				 isc_msgcat_get(isc_msgcat,
783						ISC_MSGSET_GENERAL,
784						ISC_MSG_FAILED,
785						"failed"),
786				 strbuf);
787	}
788}
789#else
790#define FIX_IPV6_RECVPKTINFO(sock) (void)0
791#endif
792
793/*%
794 * Increment socket-related statistics counters.
795 */
796static inline void
797inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
798	REQUIRE(counterid != -1);
799
800	if (stats != NULL)
801		isc_stats_increment(stats, counterid);
802}
803
804static inline isc_result_t
805watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
806	isc_result_t result = ISC_R_SUCCESS;
807
808#ifdef USE_KQUEUE
809	struct kevent evchange;
810
811	memset(&evchange, 0, sizeof(evchange));
812	if (msg == SELECT_POKE_READ)
813		evchange.filter = EVFILT_READ;
814	else
815		evchange.filter = EVFILT_WRITE;
816	evchange.flags = EV_ADD;
817	evchange.ident = fd;
818	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
819		result = isc__errno2result(errno);
820
821	return (result);
822#elif defined(USE_EPOLL)
823	struct epoll_event event;
824
825	if (msg == SELECT_POKE_READ)
826		event.events = EPOLLIN;
827	else
828		event.events = EPOLLOUT;
829	memset(&event.data, 0, sizeof(event.data));
830	event.data.fd = fd;
831	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
832	    errno != EEXIST) {
833		result = isc__errno2result(errno);
834	}
835
836	return (result);
837#elif defined(USE_DEVPOLL)
838	struct pollfd pfd;
839	int lockid = FDLOCK_ID(fd);
840
841	memset(&pfd, 0, sizeof(pfd));
842	if (msg == SELECT_POKE_READ)
843		pfd.events = POLLIN;
844	else
845		pfd.events = POLLOUT;
846	pfd.fd = fd;
847	pfd.revents = 0;
848	LOCK(&manager->fdlock[lockid]);
849	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
850		result = isc__errno2result(errno);
851	else {
852		if (msg == SELECT_POKE_READ)
853			manager->fdpollinfo[fd].want_read = 1;
854		else
855			manager->fdpollinfo[fd].want_write = 1;
856	}
857	UNLOCK(&manager->fdlock[lockid]);
858
859	return (result);
860#elif defined(USE_SELECT)
861	LOCK(&manager->lock);
862	if (msg == SELECT_POKE_READ)
863		FD_SET(fd, manager->read_fds);
864	if (msg == SELECT_POKE_WRITE)
865		FD_SET(fd, manager->write_fds);
866	UNLOCK(&manager->lock);
867
868	return (result);
869#endif
870}
871
872static inline isc_result_t
873unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
874	isc_result_t result = ISC_R_SUCCESS;
875
876#ifdef USE_KQUEUE
877	struct kevent evchange;
878
879	memset(&evchange, 0, sizeof(evchange));
880	if (msg == SELECT_POKE_READ)
881		evchange.filter = EVFILT_READ;
882	else
883		evchange.filter = EVFILT_WRITE;
884	evchange.flags = EV_DELETE;
885	evchange.ident = fd;
886	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
887		result = isc__errno2result(errno);
888
889	return (result);
890#elif defined(USE_EPOLL)
891	struct epoll_event event;
892
893	if (msg == SELECT_POKE_READ)
894		event.events = EPOLLIN;
895	else
896		event.events = EPOLLOUT;
897	memset(&event.data, 0, sizeof(event.data));
898	event.data.fd = fd;
899	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
900	    errno != ENOENT) {
901		char strbuf[ISC_STRERRORSIZE];
902		isc__strerror(errno, strbuf, sizeof(strbuf));
903		UNEXPECTED_ERROR(__FILE__, __LINE__,
904				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
905		result = ISC_R_UNEXPECTED;
906	}
907	return (result);
908#elif defined(USE_DEVPOLL)
909	struct pollfd pfds[2];
910	size_t writelen = sizeof(pfds[0]);
911	int lockid = FDLOCK_ID(fd);
912
913	memset(pfds, 0, sizeof(pfds));
914	pfds[0].events = POLLREMOVE;
915	pfds[0].fd = fd;
916
917	/*
918	 * Canceling read or write polling via /dev/poll is tricky.  Since it
919	 * only provides a way of canceling per FD, we may need to re-poll the
920	 * socket for the other operation.
921	 */
922	LOCK(&manager->fdlock[lockid]);
923	if (msg == SELECT_POKE_READ &&
924	    manager->fdpollinfo[fd].want_write == 1) {
925		pfds[1].events = POLLOUT;
926		pfds[1].fd = fd;
927		writelen += sizeof(pfds[1]);
928	}
929	if (msg == SELECT_POKE_WRITE &&
930	    manager->fdpollinfo[fd].want_read == 1) {
931		pfds[1].events = POLLIN;
932		pfds[1].fd = fd;
933		writelen += sizeof(pfds[1]);
934	}
935
936	if (write(manager->devpoll_fd, pfds, writelen) == -1)
937		result = isc__errno2result(errno);
938	else {
939		if (msg == SELECT_POKE_READ)
940			manager->fdpollinfo[fd].want_read = 0;
941		else
942			manager->fdpollinfo[fd].want_write = 0;
943	}
944	UNLOCK(&manager->fdlock[lockid]);
945
946	return (result);
947#elif defined(USE_SELECT)
948	LOCK(&manager->lock);
949	if (msg == SELECT_POKE_READ)
950		FD_CLR(fd, manager->read_fds);
951	else if (msg == SELECT_POKE_WRITE)
952		FD_CLR(fd, manager->write_fds);
953	UNLOCK(&manager->lock);
954
955	return (result);
956#endif
957}
958
959static void
960wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
961	isc_result_t result;
962	int lockid = FDLOCK_ID(fd);
963
964	/*
965	 * This is a wakeup on a socket.  If the socket is not in the
966	 * process of being closed, start watching it for either reads
967	 * or writes.
968	 */
969
970	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
971
972	if (msg == SELECT_POKE_CLOSE) {
973		/* No one should be updating fdstate, so no need to lock it */
974		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
975		manager->fdstate[fd] = CLOSED;
976		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
977		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
978		(void)close(fd);
979		return;
980	}
981
982	LOCK(&manager->fdlock[lockid]);
983	if (manager->fdstate[fd] == CLOSE_PENDING) {
984		UNLOCK(&manager->fdlock[lockid]);
985
986		/*
987		 * We accept (and ignore) any error from unwatch_fd() as we are
988		 * closing the socket, hoping it doesn't leave dangling state in
989		 * the kernel.
990		 * Note that unwatch_fd() must be called after releasing the
991		 * fdlock; otherwise it could cause deadlock due to a lock order
992		 * reversal.
993		 */
994		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
995		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
996		return;
997	}
998	if (manager->fdstate[fd] != MANAGED) {
999		UNLOCK(&manager->fdlock[lockid]);
1000		return;
1001	}
1002	UNLOCK(&manager->fdlock[lockid]);
1003
1004	/*
1005	 * Set requested bit.
1006	 */
1007	result = watch_fd(manager, fd, msg);
1008	if (result != ISC_R_SUCCESS) {
1009		/*
1010		 * XXXJT: what should we do?  Ignoring the failure of watching
1011		 * a socket will make the application dysfunctional, but there
1012		 * seems to be no reasonable recovery process.
1013		 */
1014		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1015			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1016			      "failed to start watching FD (%d): %s",
1017			      fd, isc_result_totext(result));
1018	}
1019}
1020
1021#ifdef USE_WATCHER_THREAD
1022/*
1023 * Poke the select loop when there is something for us to do.
1024 * The write is required (by POSIX) to complete.  That is, we
1025 * will not get partial writes.
1026 */
1027static void
1028select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
1029	int cc;
1030	int buf[2];
1031	char strbuf[ISC_STRERRORSIZE];
1032
1033	buf[0] = fd;
1034	buf[1] = msg;
1035
1036	do {
1037		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
1038#ifdef ENOSR
1039		/*
1040		 * Treat ENOSR as EAGAIN but loop slowly as it is
1041		 * unlikely to clear fast.
1042		 */
1043		if (cc < 0 && errno == ENOSR) {
1044			sleep(1);
1045			errno = EAGAIN;
1046		}
1047#endif
1048	} while (cc < 0 && SOFT_ERROR(errno));
1049
1050	if (cc < 0) {
1051		isc__strerror(errno, strbuf, sizeof(strbuf));
1052		FATAL_ERROR(__FILE__, __LINE__,
1053			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1054					   ISC_MSG_WRITEFAILED,
1055					   "write() failed "
1056					   "during watcher poke: %s"),
1057			    strbuf);
1058	}
1059
1060	INSIST(cc == sizeof(buf));
1061}
1062
1063/*
1064 * Read a message on the internal fd.
1065 */
1066static void
1067select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
1068	int buf[2];
1069	int cc;
1070	char strbuf[ISC_STRERRORSIZE];
1071
1072	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
1073	if (cc < 0) {
1074		*msg = SELECT_POKE_NOTHING;
1075		*fd = -1;	/* Silence compiler. */
1076		if (SOFT_ERROR(errno))
1077			return;
1078
1079		isc__strerror(errno, strbuf, sizeof(strbuf));
1080		FATAL_ERROR(__FILE__, __LINE__,
1081			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1082					   ISC_MSG_READFAILED,
1083					   "read() failed "
1084					   "during watcher poke: %s"),
1085			    strbuf);
1086
1087		return;
1088	}
1089	INSIST(cc == sizeof(buf));
1090
1091	*fd = buf[0];
1092	*msg = buf[1];
1093}
1094#else /* USE_WATCHER_THREAD */
1095/*
1096 * Update the state of the socketmgr when something changes.
1097 */
1098static void
1099select_poke(isc__socketmgr_t *manager, int fd, int msg) {
1100	if (msg == SELECT_POKE_SHUTDOWN)
1101		return;
1102	else if (fd >= 0)
1103		wakeup_socket(manager, fd, msg);
1104	return;
1105}
1106#endif /* USE_WATCHER_THREAD */
1107
1108/*
1109 * Make a fd non-blocking.
1110 */
1111static isc_result_t
1112make_nonblock(int fd) {
1113	int ret;
1114	int flags;
1115	char strbuf[ISC_STRERRORSIZE];
1116#ifdef USE_FIONBIO_IOCTL
1117	int on = 1;
1118
1119	ret = ioctl(fd, FIONBIO, (char *)&on);
1120#else
1121	flags = fcntl(fd, F_GETFL, 0);
1122	flags |= PORT_NONBLOCK;
1123	ret = fcntl(fd, F_SETFL, flags);
1124#endif
1125
1126	if (ret == -1) {
1127		isc__strerror(errno, strbuf, sizeof(strbuf));
1128		UNEXPECTED_ERROR(__FILE__, __LINE__,
1129#ifdef USE_FIONBIO_IOCTL
1130				 "ioctl(%d, FIONBIO, &on): %s", fd,
1131#else
1132				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1133#endif
1134				 strbuf);
1135
1136		return (ISC_R_UNEXPECTED);
1137	}
1138
1139	return (ISC_R_SUCCESS);
1140}
1141
1142#ifdef USE_CMSG
1143/*
1144 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1145 * In order to ensure as much portability as possible, we provide wrapper
1146 * functions of these macros.
1147 * Note that cmsg_space() could run slow on OSes that do not have
1148 * CMSG_SPACE.
1149 */
1150static inline ISC_SOCKADDR_LEN_T
1151cmsg_len(ISC_SOCKADDR_LEN_T len) {
1152#ifdef CMSG_LEN
1153	return (CMSG_LEN(len));
1154#else
1155	ISC_SOCKADDR_LEN_T hdrlen;
1156
1157	/*
1158	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
1159	 * is correct.
1160	 */
1161	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
1162	return (hdrlen + len);
1163#endif
1164}
1165
1166static inline ISC_SOCKADDR_LEN_T
1167cmsg_space(ISC_SOCKADDR_LEN_T len) {
1168#ifdef CMSG_SPACE
1169	return (CMSG_SPACE(len));
1170#else
1171	struct msghdr msg;
1172	struct cmsghdr *cmsgp;
1173	/*
1174	 * XXX: The buffer length is an ad-hoc value, but should be enough
1175	 * in a practical sense.
1176	 */
1177	char dummybuf[sizeof(struct cmsghdr) + 1024];
1178
1179	memset(&msg, 0, sizeof(msg));
1180	msg.msg_control = dummybuf;
1181	msg.msg_controllen = sizeof(dummybuf);
1182
1183	cmsgp = (struct cmsghdr *)dummybuf;
1184	cmsgp->cmsg_len = cmsg_len(len);
1185
1186	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1187	if (cmsgp != NULL)
1188		return ((char *)cmsgp - (char *)msg.msg_control);
1189	else
1190		return (0);
1191#endif
1192}
1193#endif /* USE_CMSG */
1194
1195/*
1196 * Process control messages received on a socket.
1197 */
1198static void
1199process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1200#ifdef USE_CMSG
1201	struct cmsghdr *cmsgp;
1202#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1203	struct in6_pktinfo *pktinfop;
1204#endif
1205#ifdef SO_TIMESTAMP
1206	struct timeval *timevalp;
1207#endif
1208#endif
1209
1210	/*
1211	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1212	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1213	 * They are all here, outside of the CPP tests, because it is
1214	 * more consistent with the usual ISC coding style.
1215	 */
1216	UNUSED(sock);
1217	UNUSED(msg);
1218	UNUSED(dev);
1219
1220#ifdef ISC_NET_BSD44MSGHDR
1221
1222#ifdef MSG_TRUNC
1223	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1224		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1225#endif
1226
1227#ifdef MSG_CTRUNC
1228	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1229		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1230#endif
1231
1232#ifndef USE_CMSG
1233	return;
1234#else
1235	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1236		return;
1237
1238#ifdef SO_TIMESTAMP
1239	timevalp = NULL;
1240#endif
1241#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1242	pktinfop = NULL;
1243#endif
1244
1245	cmsgp = CMSG_FIRSTHDR(msg);
1246	while (cmsgp != NULL) {
1247		socket_log(sock, NULL, TRACE,
1248			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1249			   "processing cmsg %p", cmsgp);
1250
1251#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1252		if (cmsgp->cmsg_level == IPPROTO_IPV6
1253		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
1254
1255			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1256			memcpy(&dev->pktinfo, pktinfop,
1257			       sizeof(struct in6_pktinfo));
1258			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1259			socket_log(sock, NULL, TRACE,
1260				   isc_msgcat, ISC_MSGSET_SOCKET,
1261				   ISC_MSG_IFRECEIVED,
1262				   "interface received on ifindex %u",
1263				   dev->pktinfo.ipi6_ifindex);
1264			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1265				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1266			goto next;
1267		}
1268#endif
1269
1270#ifdef SO_TIMESTAMP
1271		if (cmsgp->cmsg_level == SOL_SOCKET
1272		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1273			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1274			dev->timestamp.seconds = timevalp->tv_sec;
1275			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1276			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1277			goto next;
1278		}
1279#endif
1280
1281	next:
1282		cmsgp = CMSG_NXTHDR(msg, cmsgp);
1283	}
1284#endif /* USE_CMSG */
1285
1286#endif /* ISC_NET_BSD44MSGHDR */
1287}
1288
1289/*
1290 * Construct an iov array and attach it to the msghdr passed in.  This is
1291 * the SEND constructor, which will use the used region of the buffer
1292 * (if using a buffer list) or will use the internal region (if a single
1293 * buffer I/O is requested).
1294 *
1295 * Nothing can be NULL, and the done event must list at least one buffer
1296 * on the buffer linked list for this function to be meaningful.
1297 *
1298 * If write_countp != NULL, *write_countp will hold the number of bytes
1299 * this transaction can send.
1300 */
1301static void
1302build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
1303		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1304{
1305	unsigned int iovcount;
1306	isc_buffer_t *buffer;
1307	isc_region_t used;
1308	size_t write_count;
1309	size_t skip_count;
1310
1311	memset(msg, 0, sizeof(*msg));
1312
1313	if (!sock->connected) {
1314		msg->msg_name = (void *)&dev->address.type.sa;
1315		msg->msg_namelen = dev->address.length;
1316	} else {
1317		msg->msg_name = NULL;
1318		msg->msg_namelen = 0;
1319	}
1320
1321	buffer = ISC_LIST_HEAD(dev->bufferlist);
1322	write_count = 0;
1323	iovcount = 0;
1324
1325	/*
1326	 * Single buffer I/O?  Skip what we've done so far in this region.
1327	 */
1328	if (buffer == NULL) {
1329		write_count = dev->region.length - dev->n;
1330		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1331		iov[0].iov_len = write_count;
1332		iovcount = 1;
1333
1334		goto config;
1335	}
1336
1337	/*
1338	 * Multibuffer I/O.
1339	 * Skip the data in the buffer list that we have already written.
1340	 */
1341	skip_count = dev->n;
1342	while (buffer != NULL) {
1343		REQUIRE(ISC_BUFFER_VALID(buffer));
1344		if (skip_count < isc_buffer_usedlength(buffer))
1345			break;
1346		skip_count -= isc_buffer_usedlength(buffer);
1347		buffer = ISC_LIST_NEXT(buffer, link);
1348	}
1349
1350	while (buffer != NULL) {
1351		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1352
1353		isc_buffer_usedregion(buffer, &used);
1354
1355		if (used.length > 0) {
1356			iov[iovcount].iov_base = (void *)(used.base
1357							  + skip_count);
1358			iov[iovcount].iov_len = used.length - skip_count;
1359			write_count += (used.length - skip_count);
1360			skip_count = 0;
1361			iovcount++;
1362		}
1363		buffer = ISC_LIST_NEXT(buffer, link);
1364	}
1365
1366	INSIST(skip_count == 0U);
1367
1368 config:
1369	msg->msg_iov = iov;
1370	msg->msg_iovlen = iovcount;
1371
1372#ifdef ISC_NET_BSD44MSGHDR
1373	msg->msg_control = NULL;
1374	msg->msg_controllen = 0;
1375	msg->msg_flags = 0;
1376#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1377	if ((sock->type == isc_sockettype_udp)
1378	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1379#if defined(IPV6_USE_MIN_MTU)
1380		int use_min_mtu = 1;	/* -1, 0, 1 */
1381#endif
1382		struct cmsghdr *cmsgp;
1383		struct in6_pktinfo *pktinfop;
1384
1385		socket_log(sock, NULL, TRACE,
1386			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1387			   "sendto pktinfo data, ifindex %u",
1388			   dev->pktinfo.ipi6_ifindex);
1389
1390		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1391		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1392		msg->msg_control = (void *)sock->sendcmsgbuf;
1393
1394		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1395		cmsgp->cmsg_level = IPPROTO_IPV6;
1396		cmsgp->cmsg_type = IPV6_PKTINFO;
1397		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1398		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1399		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1400#if defined(IPV6_USE_MIN_MTU)
1401		/*
1402		 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
1403		 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
1404		 * is used.
1405		 */
1406		cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
1407					   msg->msg_controllen);
1408		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1409		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1410
1411		cmsgp->cmsg_level = IPPROTO_IPV6;
1412		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1413		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1414		memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1415#endif
1416	}
1417#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1418#else /* ISC_NET_BSD44MSGHDR */
1419	msg->msg_accrights = NULL;
1420	msg->msg_accrightslen = 0;
1421#endif /* ISC_NET_BSD44MSGHDR */
1422
1423	if (write_countp != NULL)
1424		*write_countp = write_count;
1425}
1426
1427/*
1428 * Construct an iov array and attach it to the msghdr passed in.  This is
1429 * the RECV constructor, which will use the available region of the buffer
1430 * (if using a buffer list) or will use the internal region (if a single
1431 * buffer I/O is requested).
1432 *
1433 * Nothing can be NULL, and the done event must list at least one buffer
1434 * on the buffer linked list for this function to be meaningful.
1435 *
1436 * If read_countp != NULL, *read_countp will hold the number of bytes
1437 * this transaction can receive.
1438 */
1439static void
1440build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
1441		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1442{
1443	unsigned int iovcount;
1444	isc_buffer_t *buffer;
1445	isc_region_t available;
1446	size_t read_count;
1447
1448	memset(msg, 0, sizeof(struct msghdr));
1449
1450	if (sock->type == isc_sockettype_udp) {
1451		memset(&dev->address, 0, sizeof(dev->address));
1452#ifdef BROKEN_RECVMSG
1453		if (sock->pf == AF_INET) {
1454			msg->msg_name = (void *)&dev->address.type.sin;
1455			msg->msg_namelen = sizeof(dev->address.type.sin6);
1456		} else if (sock->pf == AF_INET6) {
1457			msg->msg_name = (void *)&dev->address.type.sin6;
1458			msg->msg_namelen = sizeof(dev->address.type.sin6);
1459#ifdef ISC_PLATFORM_HAVESYSUNH
1460		} else if (sock->pf == AF_UNIX) {
1461			msg->msg_name = (void *)&dev->address.type.sunix;
1462			msg->msg_namelen = sizeof(dev->address.type.sunix);
1463#endif
1464		} else {
1465			msg->msg_name = (void *)&dev->address.type.sa;
1466			msg->msg_namelen = sizeof(dev->address.type);
1467		}
1468#else
1469		msg->msg_name = (void *)&dev->address.type.sa;
1470		msg->msg_namelen = sizeof(dev->address.type);
1471#endif
1472#ifdef ISC_NET_RECVOVERFLOW
1473		/* If needed, steal one iovec for overflow detection. */
1474		maxiov--;
1475#endif
1476	} else { /* TCP */
1477		msg->msg_name = NULL;
1478		msg->msg_namelen = 0;
1479		dev->address = sock->peer_address;
1480	}
1481
1482	buffer = ISC_LIST_HEAD(dev->bufferlist);
1483	read_count = 0;
1484
1485	/*
1486	 * Single buffer I/O?  Skip what we've done so far in this region.
1487	 */
1488	if (buffer == NULL) {
1489		read_count = dev->region.length - dev->n;
1490		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1491		iov[0].iov_len = read_count;
1492		iovcount = 1;
1493
1494		goto config;
1495	}
1496
1497	/*
1498	 * Multibuffer I/O.
1499	 * Skip empty buffers.
1500	 */
1501	while (buffer != NULL) {
1502		REQUIRE(ISC_BUFFER_VALID(buffer));
1503		if (isc_buffer_availablelength(buffer) != 0)
1504			break;
1505		buffer = ISC_LIST_NEXT(buffer, link);
1506	}
1507
1508	iovcount = 0;
1509	while (buffer != NULL) {
1510		INSIST(iovcount < MAXSCATTERGATHER_RECV);
1511
1512		isc_buffer_availableregion(buffer, &available);
1513
1514		if (available.length > 0) {
1515			iov[iovcount].iov_base = (void *)(available.base);
1516			iov[iovcount].iov_len = available.length;
1517			read_count += available.length;
1518			iovcount++;
1519		}
1520		buffer = ISC_LIST_NEXT(buffer, link);
1521	}
1522
1523 config:
1524
1525	/*
1526	 * If needed, set up to receive that one extra byte.  Note that
1527	 * we know there is at least one iov left, since we stole it
1528	 * at the top of this function.
1529	 */
1530#ifdef ISC_NET_RECVOVERFLOW
1531	if (sock->type == isc_sockettype_udp) {
1532		iov[iovcount].iov_base = (void *)(&sock->overflow);
1533		iov[iovcount].iov_len = 1;
1534		iovcount++;
1535	}
1536#endif
1537
1538	msg->msg_iov = iov;
1539	msg->msg_iovlen = iovcount;
1540
1541#ifdef ISC_NET_BSD44MSGHDR
1542	msg->msg_control = NULL;
1543	msg->msg_controllen = 0;
1544	msg->msg_flags = 0;
1545#if defined(USE_CMSG)
1546	if (sock->type == isc_sockettype_udp) {
1547		msg->msg_control = sock->recvcmsgbuf;
1548		msg->msg_controllen = sock->recvcmsgbuflen;
1549	}
1550#endif /* USE_CMSG */
1551#else /* ISC_NET_BSD44MSGHDR */
1552	msg->msg_accrights = NULL;
1553	msg->msg_accrightslen = 0;
1554#endif /* ISC_NET_BSD44MSGHDR */
1555
1556	if (read_countp != NULL)
1557		*read_countp = read_count;
1558}
1559
1560static void
1561set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
1562		isc_socketevent_t *dev)
1563{
1564	if (sock->type == isc_sockettype_udp) {
1565		if (address != NULL)
1566			dev->address = *address;
1567		else
1568			dev->address = sock->peer_address;
1569	} else if (sock->type == isc_sockettype_tcp) {
1570		INSIST(address == NULL);
1571		dev->address = sock->peer_address;
1572	}
1573}
1574
1575static void
1576destroy_socketevent(isc_event_t *event) {
1577	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1578
1579	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1580
1581	(ev->destroy)(event);
1582}
1583
1584static isc_socketevent_t *
1585allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
1586		     isc_taskaction_t action, const void *arg)
1587{
1588	isc_socketevent_t *ev;
1589
1590	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1591						     sock, eventtype,
1592						     action, arg,
1593						     sizeof(*ev));
1594
1595	if (ev == NULL)
1596		return (NULL);
1597
1598	ev->result = ISC_R_UNSET;
1599	ISC_LINK_INIT(ev, ev_link);
1600	ISC_LIST_INIT(ev->bufferlist);
1601	ev->region.base = NULL;
1602	ev->n = 0;
1603	ev->offset = 0;
1604	ev->attributes = 0;
1605	ev->destroy = ev->ev_destroy;
1606	ev->ev_destroy = destroy_socketevent;
1607
1608	return (ev);
1609}
1610
1611#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr (name, iovec array and,
 * where available, the control buffer) to standard output.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	/* %p requires a 'void *' argument, hence the explicit casts. */
	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
1630#endif
1631
1632#define DOIO_SUCCESS		0	/* i/o ok, event sent */
1633#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
1634#define DOIO_HARD		2	/* i/o error, event sent */
1635#define DOIO_EOF		3	/* EOF, no event sent */
1636
/*
 * Attempt one recvmsg() on 'sock' into the buffers described by 'dev'.
 *
 * Returns:
 *	DOIO_SUCCESS	Data was received; dev->result is ISC_R_SUCCESS and
 *			dev->n and the event's buffers have been updated.
 *
 *	DOIO_HARD	recvmsg() failed with a hard error; dev->result
 *			holds the error and the receive-failure statistic
 *			has been incremented.
 *
 *	DOIO_SOFT	Nothing is to be reported: a soft error occurred,
 *			the error is one that is ignored on an unconnected
 *			socket, a UDP packet was dropped (source port zero
 *			or larger than the manager's maxudp), or fewer than
 *			dev->minimum bytes have arrived so far.
 *
 *	DOIO_EOF	A zero-length read on a TCP or UNIX socket.
 */
static int
doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno right away; the calls below may overwrite it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_DOIORECV,
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * SOFT_OR_HARD: the error is hard only on a connected socket.
 * ALWAYS_HARD: the error is hard regardless of connection state.
 * Both return from doio_recv() when recv_errno matches.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
#ifdef EPROTO
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
#endif
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Any other errno is a hard error. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0)
			return (DOIO_EOF);
		break;
	case isc_sockettype_udp:
		break;
	case isc_sockettype_fdwatch:
	default:
		INSIST(0);
	}

	if (sock->type == isc_sockettype_udp) {
		/* Record the source address length the kernel reported. */
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_ZEROPORT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 512 bytes.
		 */
		if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
			return (DOIO_SOFT);
	}

	socket_log(sock, &dev->address, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	/*
	 * Distribute the received bytes over the buffer list in order:
	 * each buffer is filled completely before moving to the next.
	 */
	while (buffer != NULL && actual_count > 0U) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			POST(actual_count);
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			/* All received bytes must fit in the buffer list. */
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1814
1815/*
1816 * Returns:
1817 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1818 *			ISC_R_SUCCESS.
1819 *
1820 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1821 *			dev->result contains the appropriate error.
1822 *
1823 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1824 *			event was sent.  The operation should be retried.
1825 *
1826 *	No other return values are possible.
1827 */
1828static int
1829doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1830	int cc;
1831	struct iovec iov[MAXSCATTERGATHER_SEND];
1832	size_t write_count;
1833	struct msghdr msghdr;
1834	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1835	int attempts = 0;
1836	int send_errno;
1837	char strbuf[ISC_STRERRORSIZE];
1838
1839	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1840
1841 resend:
1842	cc = sendmsg(sock->fd, &msghdr, 0);
1843	send_errno = errno;
1844
1845	/*
1846	 * Check for error or block condition.
1847	 */
1848	if (cc < 0) {
1849		if (send_errno == EINTR && ++attempts < NRETRIES)
1850			goto resend;
1851
1852		if (SOFT_ERROR(send_errno))
1853			return (DOIO_SOFT);
1854
1855#define SOFT_OR_HARD(_system, _isc) \
1856	if (send_errno == _system) { \
1857		if (sock->connected) { \
1858			dev->result = _isc; \
1859			inc_stats(sock->manager->stats, \
1860				  sock->statsindex[STATID_SENDFAIL]); \
1861			return (DOIO_HARD); \
1862		} \
1863		return (DOIO_SOFT); \
1864	}
1865#define ALWAYS_HARD(_system, _isc) \
1866	if (send_errno == _system) { \
1867		dev->result = _isc; \
1868		inc_stats(sock->manager->stats, \
1869			  sock->statsindex[STATID_SENDFAIL]); \
1870		return (DOIO_HARD); \
1871	}
1872
1873		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1874		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1875		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1876		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1877		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1878#ifdef EHOSTDOWN
1879		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1880#endif
1881		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1882		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1883		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1884		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1885		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1886
1887#undef SOFT_OR_HARD
1888#undef ALWAYS_HARD
1889
1890		/*
1891		 * The other error types depend on whether or not the
1892		 * socket is UDP or TCP.  If it is UDP, some errors
1893		 * that we expect to be fatal under TCP are merely
1894		 * annoying, and are really soft errors.
1895		 *
1896		 * However, these soft errors are still returned as
1897		 * a status.
1898		 */
1899		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1900		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1901		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1902				 addrbuf, strbuf);
1903		dev->result = isc__errno2result(send_errno);
1904		inc_stats(sock->manager->stats,
1905			  sock->statsindex[STATID_SENDFAIL]);
1906		return (DOIO_HARD);
1907	}
1908
1909	if (cc == 0) {
1910		inc_stats(sock->manager->stats,
1911			  sock->statsindex[STATID_SENDFAIL]);
1912		UNEXPECTED_ERROR(__FILE__, __LINE__,
1913				 "doio_send: send() %s 0",
1914				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1915						ISC_MSG_RETURNED, "returned"));
1916	}
1917
1918	/*
1919	 * If we write less than we expected, update counters, poke.
1920	 */
1921	dev->n += cc;
1922	if ((size_t)cc != write_count)
1923		return (DOIO_SOFT);
1924
1925	/*
1926	 * Exactly what we wanted to write.  We're done with this
1927	 * entry.  Post its completion event.
1928	 */
1929	dev->result = ISC_R_SUCCESS;
1930	return (DOIO_SUCCESS);
1931}
1932
1933/*
1934 * Kill.
1935 *
1936 * Caller must ensure that the socket is not locked and no external
1937 * references exist.
1938 */
1939static void
1940closesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1941	isc_sockettype_t type = sock->type;
1942	int lockid = FDLOCK_ID(fd);
1943
1944	/*
1945	 * No one has this socket open, so the watcher doesn't have to be
1946	 * poked, and the socket doesn't have to be locked.
1947	 */
1948	LOCK(&manager->fdlock[lockid]);
1949	manager->fds[fd] = NULL;
1950	if (type == isc_sockettype_fdwatch)
1951		manager->fdstate[fd] = CLOSED;
1952	else
1953		manager->fdstate[fd] = CLOSE_PENDING;
1954	UNLOCK(&manager->fdlock[lockid]);
1955	if (type == isc_sockettype_fdwatch) {
1956		/*
1957		 * The caller may close the socket once this function returns,
1958		 * and `fd' may be reassigned for a new socket.  So we do
1959		 * unwatch_fd() here, rather than defer it via select_poke().
1960		 * Note: this may complicate data protection among threads and
1961		 * may reduce performance due to additional locks.  One way to
1962		 * solve this would be to dup() the watched descriptor, but we
1963		 * take a simpler approach at this moment.
1964		 */
1965		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1966		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1967	} else
1968		select_poke(manager, fd, SELECT_POKE_CLOSE);
1969
1970	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1971
1972	/*
1973	 * update manager->maxfd here (XXX: this should be implemented more
1974	 * efficiently)
1975	 */
1976#ifdef USE_SELECT
1977	LOCK(&manager->lock);
1978	if (manager->maxfd == fd) {
1979		int i;
1980
1981		manager->maxfd = 0;
1982		for (i = fd - 1; i >= 0; i--) {
1983			lockid = FDLOCK_ID(i);
1984
1985			LOCK(&manager->fdlock[lockid]);
1986			if (manager->fdstate[i] == MANAGED) {
1987				manager->maxfd = i;
1988				UNLOCK(&manager->fdlock[lockid]);
1989				break;
1990			}
1991			UNLOCK(&manager->fdlock[lockid]);
1992		}
1993#ifdef ISC_PLATFORM_USETHREADS
1994		if (manager->maxfd < manager->pipe_fds[0])
1995			manager->maxfd = manager->pipe_fds[0];
1996#endif
1997	}
1998	UNLOCK(&manager->lock);
1999#endif	/* USE_SELECT */
2000}
2001
2002static void
2003destroy(isc__socket_t **sockp) {
2004	int fd;
2005	isc__socket_t *sock = *sockp;
2006	isc__socketmgr_t *manager = sock->manager;
2007
2008	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2009		   ISC_MSG_DESTROYING, "destroying");
2010
2011	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2012	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2013	INSIST(ISC_LIST_EMPTY(sock->send_list));
2014	INSIST(sock->connect_ev == NULL);
2015	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
2016
2017	if (sock->fd >= 0) {
2018		fd = sock->fd;
2019		sock->fd = -1;
2020		closesocket(manager, sock, fd);
2021	}
2022
2023	LOCK(&manager->lock);
2024
2025	ISC_LIST_UNLINK(manager->socklist, sock, link);
2026
2027#ifdef USE_WATCHER_THREAD
2028	if (ISC_LIST_EMPTY(manager->socklist))
2029		SIGNAL(&manager->shutdown_ok);
2030#endif /* USE_WATCHER_THREAD */
2031
2032	/* can't unlock manager as its memory context is still used */
2033	free_socket(sockp);
2034
2035	UNLOCK(&manager->lock);
2036}
2037
2038static isc_result_t
2039allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
2040		isc__socket_t **socketp)
2041{
2042	isc__socket_t *sock;
2043	isc_result_t result;
2044	ISC_SOCKADDR_LEN_T cmsgbuflen;
2045
2046	sock = isc_mem_get(manager->mctx, sizeof(*sock));
2047
2048	if (sock == NULL)
2049		return (ISC_R_NOMEMORY);
2050
2051	sock->common.magic = 0;
2052	sock->common.impmagic = 0;
2053	sock->references = 0;
2054
2055	sock->manager = manager;
2056	sock->type = type;
2057	sock->fd = -1;
2058	sock->dupped = 0;
2059	sock->statsindex = NULL;
2060
2061	ISC_LINK_INIT(sock, link);
2062
2063	sock->recvcmsgbuf = NULL;
2064	sock->sendcmsgbuf = NULL;
2065
2066	/*
2067	 * set up cmsg buffers
2068	 */
2069	cmsgbuflen = 0;
2070#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2071	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2072#endif
2073#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
2074	cmsgbuflen += cmsg_space(sizeof(struct timeval));
2075#endif
2076	sock->recvcmsgbuflen = cmsgbuflen;
2077	if (sock->recvcmsgbuflen != 0U) {
2078		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2079		if (sock->recvcmsgbuf == NULL) {
2080			result = ISC_R_NOMEMORY;
2081			goto error;
2082		}
2083	}
2084
2085	cmsgbuflen = 0;
2086#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2087	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2088#if defined(IPV6_USE_MIN_MTU)
2089	/*
2090	 * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
2091	 * support.
2092	 */
2093	cmsgbuflen += cmsg_space(sizeof(int));
2094#endif
2095#endif
2096	sock->sendcmsgbuflen = cmsgbuflen;
2097	if (sock->sendcmsgbuflen != 0U) {
2098		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2099		if (sock->sendcmsgbuf == NULL) {
2100			result = ISC_R_NOMEMORY;
2101			goto error;
2102		}
2103	}
2104
2105	memset(sock->name, 0, sizeof(sock->name));
2106	sock->tag = NULL;
2107
2108	/*
2109	 * set up list of readers and writers to be initially empty
2110	 */
2111	ISC_LIST_INIT(sock->recv_list);
2112	ISC_LIST_INIT(sock->send_list);
2113	ISC_LIST_INIT(sock->accept_list);
2114	sock->connect_ev = NULL;
2115	sock->pending_recv = 0;
2116	sock->pending_send = 0;
2117	sock->pending_accept = 0;
2118	sock->listener = 0;
2119	sock->connected = 0;
2120	sock->connecting = 0;
2121	sock->bound = 0;
2122
2123	/*
2124	 * initialize the lock
2125	 */
2126	result = isc_mutex_init(&sock->lock);
2127	if (result != ISC_R_SUCCESS) {
2128		sock->common.magic = 0;
2129		sock->common.impmagic = 0;
2130		goto error;
2131	}
2132
2133	/*
2134	 * Initialize readable and writable events
2135	 */
2136	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
2137		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
2138		       NULL, sock, sock, NULL, NULL);
2139	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
2140		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
2141		       NULL, sock, sock, NULL, NULL);
2142
2143	sock->common.magic = ISCAPI_SOCKET_MAGIC;
2144	sock->common.impmagic = SOCKET_MAGIC;
2145	*socketp = sock;
2146
2147	return (ISC_R_SUCCESS);
2148
2149 error:
2150	if (sock->recvcmsgbuf != NULL)
2151		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
2152			    sock->recvcmsgbuflen);
2153	if (sock->sendcmsgbuf != NULL)
2154		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
2155			    sock->sendcmsgbuflen);
2156	isc_mem_put(manager->mctx, sock, sizeof(*sock));
2157
2158	return (result);
2159}
2160
2161/*
2162 * This event requires that the various lists be empty, that the reference
2163 * count be 1, and that the magic number is valid.  The other socket bits,
2164 * like the lock, must be initialized as well.  The fd associated must be
2165 * marked as closed, by setting it to -1 on close, or this routine will
2166 * also close the socket.
2167 */
2168static void
2169free_socket(isc__socket_t **socketp) {
2170	isc__socket_t *sock = *socketp;
2171
2172	INSIST(sock->references == 0);
2173	INSIST(VALID_SOCKET(sock));
2174	INSIST(!sock->connecting);
2175	INSIST(!sock->pending_recv);
2176	INSIST(!sock->pending_send);
2177	INSIST(!sock->pending_accept);
2178	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2179	INSIST(ISC_LIST_EMPTY(sock->send_list));
2180	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2181	INSIST(!ISC_LINK_LINKED(sock, link));
2182
2183	if (sock->recvcmsgbuf != NULL)
2184		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
2185			    sock->recvcmsgbuflen);
2186	if (sock->sendcmsgbuf != NULL)
2187		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
2188			    sock->sendcmsgbuflen);
2189
2190	sock->common.magic = 0;
2191	sock->common.impmagic = 0;
2192
2193	DESTROYLOCK(&sock->lock);
2194
2195	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
2196
2197	*socketp = NULL;
2198}
2199
#ifdef SO_BSDCOMPAT
/*
 * Work out at run time whether SO_BSDCOMPAT should still be set.
 *
 * This really should not be necessary: the only reason to check the
 * kernel version is to keep the kernel from issuing a warning about the
 * use of a deprecated socket option, and such warnings should never be
 * enabled by default in production kernels.
 *
 * The check cannot be done at build time because executables are moved
 * between machines, and hence between kernels.
 *
 * Simply never setting SO_BSDCOMPAT is not an option either, because
 * some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, clear 'bsdcompat' when the kernel release string reported by
 * uname() starts with a version of 2.4 or later.  On every other
 * platform this function does nothing.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname uts;
	char *cursor;
	long int kmajor;
	long int kminor;

	uname(&uts);	/* On Linux this can only fail if the buffer is bad. */

	/*
	 * Parsing could be more paranoid, but uname() is trusted.  A
	 * release string that is not of the form "<major>.<minor>..."
	 * leaves the flag set.
	 */
	kmajor = strtol(uts.release, &cursor, 10);
	if (*cursor != '.') {
		return;
	}

	kminor = strtol(cursor + 1, &cursor, 10);
	if (kmajor > 2 || (kmajor == 2 && kminor >= 4)) {
		bsdcompat = ISC_FALSE;
	}
#endif /* __linux__ */
}
#endif /* SO_BSDCOMPAT */
2237
2238static isc_result_t
2239opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
2240	   isc__socket_t *dup_socket)
2241{
2242	isc_result_t result;
2243	char strbuf[ISC_STRERRORSIZE];
2244	const char *err = "socket";
2245	int tries = 0;
2246#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2247	int on = 1;
2248#endif
2249#if defined(SO_RCVBUF)
2250	ISC_SOCKADDR_LEN_T optlen;
2251	int size;
2252#endif
2253
2254 again:
2255	if (dup_socket == NULL) {
2256		switch (sock->type) {
2257		case isc_sockettype_udp:
2258			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2259			break;
2260		case isc_sockettype_tcp:
2261			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2262			break;
2263		case isc_sockettype_unix:
2264			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2265			break;
2266		case isc_sockettype_fdwatch:
2267			/*
2268			 * We should not be called for isc_sockettype_fdwatch
2269			 * sockets.
2270			 */
2271			INSIST(0);
2272			break;
2273		}
2274	} else {
2275		sock->fd = dup(dup_socket->fd);
2276		sock->dupped = 1;
2277		sock->bound = dup_socket->bound;
2278	}
2279	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2280		goto again;
2281
2282#ifdef F_DUPFD
2283	/*
2284	 * Leave a space for stdio and TCP to work in.
2285	 */
2286	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2287	    sock->fd >= 0 && sock->fd < manager->reserved) {
2288		int new, tmp;
2289		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2290		tmp = errno;
2291		(void)close(sock->fd);
2292		errno = tmp;
2293		sock->fd = new;
2294		err = "isc_socket_create: fcntl/reserved";
2295	} else if (sock->fd >= 0 && sock->fd < 20) {
2296		int new, tmp;
2297		new = fcntl(sock->fd, F_DUPFD, 20);
2298		tmp = errno;
2299		(void)close(sock->fd);
2300		errno = tmp;
2301		sock->fd = new;
2302		err = "isc_socket_create: fcntl";
2303	}
2304#endif
2305
2306	if (sock->fd >= (int)manager->maxsocks) {
2307		(void)close(sock->fd);
2308		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2309			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2310			       isc_msgcat, ISC_MSGSET_SOCKET,
2311			       ISC_MSG_TOOMANYFDS,
2312			       "socket: file descriptor exceeds limit (%d/%u)",
2313			       sock->fd, manager->maxsocks);
2314		return (ISC_R_NORESOURCES);
2315	}
2316
2317	if (sock->fd < 0) {
2318		switch (errno) {
2319		case EMFILE:
2320		case ENFILE:
2321			isc__strerror(errno, strbuf, sizeof(strbuf));
2322			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2323				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2324				       isc_msgcat, ISC_MSGSET_SOCKET,
2325				       ISC_MSG_TOOMANYFDS,
2326				       "%s: %s", err, strbuf);
2327			/* fallthrough */
2328		case ENOBUFS:
2329			return (ISC_R_NORESOURCES);
2330
2331		case EPROTONOSUPPORT:
2332		case EPFNOSUPPORT:
2333		case EAFNOSUPPORT:
2334		/*
2335		 * Linux 2.2 (and maybe others) return EINVAL instead of
2336		 * EAFNOSUPPORT.
2337		 */
2338		case EINVAL:
2339			return (ISC_R_FAMILYNOSUPPORT);
2340
2341		default:
2342			isc__strerror(errno, strbuf, sizeof(strbuf));
2343			UNEXPECTED_ERROR(__FILE__, __LINE__,
2344					 "%s() %s: %s", err,
2345					 isc_msgcat_get(isc_msgcat,
2346							ISC_MSGSET_GENERAL,
2347							ISC_MSG_FAILED,
2348							"failed"),
2349					 strbuf);
2350			return (ISC_R_UNEXPECTED);
2351		}
2352	}
2353
2354	if (dup_socket != NULL)
2355		goto setup_done;
2356
2357	result = make_nonblock(sock->fd);
2358	if (result != ISC_R_SUCCESS) {
2359		(void)close(sock->fd);
2360		return (result);
2361	}
2362
2363#ifdef SO_BSDCOMPAT
2364	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2365				  clear_bsdcompat) == ISC_R_SUCCESS);
2366	if (sock->type != isc_sockettype_unix && bsdcompat &&
2367	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2368		       (void *)&on, sizeof(on)) < 0) {
2369		isc__strerror(errno, strbuf, sizeof(strbuf));
2370		UNEXPECTED_ERROR(__FILE__, __LINE__,
2371				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2372				 sock->fd,
2373				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2374						ISC_MSG_FAILED, "failed"),
2375				 strbuf);
2376		/* Press on... */
2377	}
2378#endif
2379
2380#ifdef SO_NOSIGPIPE
2381	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2382		       (void *)&on, sizeof(on)) < 0) {
2383		isc__strerror(errno, strbuf, sizeof(strbuf));
2384		UNEXPECTED_ERROR(__FILE__, __LINE__,
2385				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2386				 sock->fd,
2387				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2388						ISC_MSG_FAILED, "failed"),
2389				 strbuf);
2390		/* Press on... */
2391	}
2392#endif
2393
2394#if defined(USE_CMSG) || defined(SO_RCVBUF)
2395	if (sock->type == isc_sockettype_udp) {
2396
2397#if defined(USE_CMSG)
2398#if defined(SO_TIMESTAMP)
2399		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2400			       (void *)&on, sizeof(on)) < 0
2401		    && errno != ENOPROTOOPT) {
2402			isc__strerror(errno, strbuf, sizeof(strbuf));
2403			UNEXPECTED_ERROR(__FILE__, __LINE__,
2404					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2405					 sock->fd,
2406					 isc_msgcat_get(isc_msgcat,
2407							ISC_MSGSET_GENERAL,
2408							ISC_MSG_FAILED,
2409							"failed"),
2410					 strbuf);
2411			/* Press on... */
2412		}
2413#endif /* SO_TIMESTAMP */
2414
2415#if defined(ISC_PLATFORM_HAVEIPV6)
2416		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2417			/*
2418			 * Warn explicitly because this anomaly can be hidden
2419			 * in usual operation (and unexpectedly appear later).
2420			 */
2421			UNEXPECTED_ERROR(__FILE__, __LINE__,
2422					 "No buffer available to receive "
2423					 "IPv6 destination");
2424		}
2425#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2426#ifdef IPV6_RECVPKTINFO
2427		/* RFC 3542 */
2428		if ((sock->pf == AF_INET6)
2429		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2430				   (void *)&on, sizeof(on)) < 0)) {
2431			isc__strerror(errno, strbuf, sizeof(strbuf));
2432			UNEXPECTED_ERROR(__FILE__, __LINE__,
2433					 "setsockopt(%d, IPV6_RECVPKTINFO) "
2434					 "%s: %s", sock->fd,
2435					 isc_msgcat_get(isc_msgcat,
2436							ISC_MSGSET_GENERAL,
2437							ISC_MSG_FAILED,
2438							"failed"),
2439					 strbuf);
2440		}
2441#else
2442		/* RFC 2292 */
2443		if ((sock->pf == AF_INET6)
2444		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2445				   (void *)&on, sizeof(on)) < 0)) {
2446			isc__strerror(errno, strbuf, sizeof(strbuf));
2447			UNEXPECTED_ERROR(__FILE__, __LINE__,
2448					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2449					 sock->fd,
2450					 isc_msgcat_get(isc_msgcat,
2451							ISC_MSGSET_GENERAL,
2452							ISC_MSG_FAILED,
2453							"failed"),
2454					 strbuf);
2455		}
2456#endif /* IPV6_RECVPKTINFO */
2457#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2458#ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2459		/* use minimum MTU */
2460		if (sock->pf == AF_INET6 &&
2461		    setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2462			       (void *)&on, sizeof(on)) < 0) {
2463			isc__strerror(errno, strbuf, sizeof(strbuf));
2464			UNEXPECTED_ERROR(__FILE__, __LINE__,
2465					 "setsockopt(%d, IPV6_USE_MIN_MTU) "
2466					 "%s: %s", sock->fd,
2467					 isc_msgcat_get(isc_msgcat,
2468							ISC_MSGSET_GENERAL,
2469							ISC_MSG_FAILED,
2470							"failed"),
2471					 strbuf);
2472		}
2473#endif
2474#if defined(IPV6_MTU)
2475		/*
2476		 * Use minimum MTU on IPv6 sockets.
2477		 */
2478		if (sock->pf == AF_INET6) {
2479			int mtu = 1280;
2480			(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
2481					 &mtu, sizeof(mtu));
2482		}
2483#endif
2484#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
2485		/*
2486		 * Turn off Path MTU discovery on IPv6/UDP sockets.
2487		 */
2488		if (sock->pf == AF_INET6) {
2489			int action = IPV6_PMTUDISC_DONT;
2490			(void)setsockopt(sock->fd, IPPROTO_IPV6,
2491					 IPV6_MTU_DISCOVER, &action,
2492					 sizeof(action));
2493		}
2494#endif
2495#endif /* ISC_PLATFORM_HAVEIPV6 */
2496#endif /* defined(USE_CMSG) */
2497
2498#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2499		/*
2500		 * Turn off Path MTU discovery on IPv4/UDP sockets.
2501		 */
2502		if (sock->pf == AF_INET) {
2503			int action = IP_PMTUDISC_DONT;
2504			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2505					 &action, sizeof(action));
2506		}
2507#endif
2508#if defined(IP_DONTFRAG)
2509		/*
2510		 * Turn off Path MTU discovery on IPv4/UDP sockets.
2511		 */
2512		if (sock->pf == AF_INET) {
2513			int off = 0;
2514			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2515					 &off, sizeof(off));
2516		}
2517#endif
2518
2519#if defined(SO_RCVBUF)
2520		optlen = sizeof(size);
2521		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2522			       (void *)&size, &optlen) >= 0 &&
2523		     size < RCVBUFSIZE) {
2524			size = RCVBUFSIZE;
2525			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2526				       (void *)&size, sizeof(size)) == -1) {
2527				isc__strerror(errno, strbuf, sizeof(strbuf));
2528				UNEXPECTED_ERROR(__FILE__, __LINE__,
2529					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2530					sock->fd, size,
2531					isc_msgcat_get(isc_msgcat,
2532						       ISC_MSGSET_GENERAL,
2533						       ISC_MSG_FAILED,
2534						       "failed"),
2535					strbuf);
2536			}
2537		}
2538#endif
2539	}
2540#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2541
2542setup_done:
2543	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2544
2545	return (ISC_R_SUCCESS);
2546}
2547
2548/*
2549 * Create a 'type' socket or duplicate an existing socket, managed
2550 * by 'manager'.  Events will be posted to 'task' and when dispatched
2551 * 'action' will be called with 'arg' as the arg value.  The new
2552 * socket is returned in 'socketp'.
2553 */
2554static isc_result_t
2555socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2556	      isc_socket_t **socketp, isc_socket_t *dup_socket)
2557{
2558	isc__socket_t *sock = NULL;
2559	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2560	isc_result_t result;
2561	int lockid;
2562
2563	REQUIRE(VALID_MANAGER(manager));
2564	REQUIRE(socketp != NULL && *socketp == NULL);
2565	REQUIRE(type != isc_sockettype_fdwatch);
2566
2567	result = allocate_socket(manager, type, &sock);
2568	if (result != ISC_R_SUCCESS)
2569		return (result);
2570
2571	switch (sock->type) {
2572	case isc_sockettype_udp:
2573		sock->statsindex =
2574			(pf == AF_INET) ? upd4statsindex : upd6statsindex;
2575		break;
2576	case isc_sockettype_tcp:
2577		sock->statsindex =
2578			(pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2579		break;
2580	case isc_sockettype_unix:
2581		sock->statsindex = unixstatsindex;
2582		break;
2583	default:
2584		INSIST(0);
2585	}
2586
2587	sock->pf = pf;
2588
2589	result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
2590	if (result != ISC_R_SUCCESS) {
2591		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2592		free_socket(&sock);
2593		return (result);
2594	}
2595
2596	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2597	sock->references = 1;
2598	*socketp = (isc_socket_t *)sock;
2599
2600	/*
2601	 * Note we don't have to lock the socket like we normally would because
2602	 * there are no external references to it yet.
2603	 */
2604
2605	lockid = FDLOCK_ID(sock->fd);
2606	LOCK(&manager->fdlock[lockid]);
2607	manager->fds[sock->fd] = sock;
2608	manager->fdstate[sock->fd] = MANAGED;
2609#ifdef USE_DEVPOLL
2610	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2611	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
2612#endif
2613	UNLOCK(&manager->fdlock[lockid]);
2614
2615	LOCK(&manager->lock);
2616	ISC_LIST_APPEND(manager->socklist, sock, link);
2617#ifdef USE_SELECT
2618	if (manager->maxfd < sock->fd)
2619		manager->maxfd = sock->fd;
2620#endif
2621	UNLOCK(&manager->lock);
2622
2623	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2624		   ISC_MSG_CREATED, dup_socket == NULL ? "dupped" : "created");
2625
2626	return (ISC_R_SUCCESS);
2627}
2628
2629/*%
2630 * Create a new 'type' socket managed by 'manager'.  Events
2631 * will be posted to 'task' and when dispatched 'action' will be
2632 * called with 'arg' as the arg value.  The new socket is returned
2633 * in 'socketp'.
2634 */
2635ISC_SOCKETFUNC_SCOPE isc_result_t
2636isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2637		   isc_socket_t **socketp)
2638{
2639	return (socket_create(manager0, pf, type, socketp, NULL));
2640}
2641
2642/*%
2643 * Duplicate an existing socket.  The new socket is returned
2644 * in 'socketp'.
2645 */
2646ISC_SOCKETFUNC_SCOPE isc_result_t
2647isc__socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
2648	isc__socket_t *sock = (isc__socket_t *)sock0;
2649
2650	REQUIRE(VALID_SOCKET(sock));
2651	REQUIRE(socketp != NULL && *socketp == NULL);
2652
2653	return (socket_create((isc_socketmgr_t *) sock->manager,
2654			      sock->pf, sock->type, socketp,
2655			      sock0));
2656}
2657
#ifdef BIND9
/*
 * Open a descriptor for an existing socket object whose fd is currently
 * -1.  The caller must hold the only reference, and the socket must not
 * be an fdwatch socket.  On success the new descriptor is registered in
 * the manager's per-fd tables; on failure sock->fd is left at -1 and the
 * opensocket() result is returned.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_open(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	UNLOCK(&sock->lock);

	/*
	 * With a single reference nobody else has this socket, so the
	 * lock does not need to be retained from here on.
	 */
	REQUIRE(sock->fd == -1);

	result = opensocket(sock->manager, sock, NULL);
	if (result != ISC_R_SUCCESS) {
		sock->fd = -1;
		return (result);
	}

	/* Register the new descriptor with the manager. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&sock->manager->fdlock[lockid]);
	sock->manager->fds[sock->fd] = sock;
	sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
	UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
	/* Keep the highest descriptor number seen up to date. */
	LOCK(&sock->manager->lock);
	if (sock->manager->maxfd < sock->fd) {
		sock->manager->maxfd = sock->fd;
	}
	UNLOCK(&sock->manager->lock);
#endif

	return (ISC_R_SUCCESS);
}
#endif	/* BIND9 */
2703
2704/*
2705 * Create a new 'type' socket managed by 'manager'.  Events
2706 * will be posted to 'task' and when dispatched 'action' will be
2707 * called with 'arg' as the arg value.  The new socket is returned
2708 * in 'socketp'.
2709 */
2710ISC_SOCKETFUNC_SCOPE isc_result_t
2711isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
2712			  isc_sockfdwatch_t callback, void *cbarg,
2713			  isc_task_t *task, isc_socket_t **socketp)
2714{
2715	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2716	isc__socket_t *sock = NULL;
2717	isc_result_t result;
2718	int lockid;
2719
2720	REQUIRE(VALID_MANAGER(manager));
2721	REQUIRE(socketp != NULL && *socketp == NULL);
2722
2723	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2724	if (result != ISC_R_SUCCESS)
2725		return (result);
2726
2727	sock->fd = fd;
2728	sock->fdwatcharg = cbarg;
2729	sock->fdwatchcb = callback;
2730	sock->fdwatchflags = flags;
2731	sock->fdwatchtask = task;
2732	sock->statsindex = fdwatchstatsindex;
2733
2734	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2735	sock->references = 1;
2736	*socketp = (isc_socket_t *)sock;
2737
2738	/*
2739	 * Note we don't have to lock the socket like we normally would because
2740	 * there are no external references to it yet.
2741	 */
2742
2743	lockid = FDLOCK_ID(sock->fd);
2744	LOCK(&manager->fdlock[lockid]);
2745	manager->fds[sock->fd] = sock;
2746	manager->fdstate[sock->fd] = MANAGED;
2747	UNLOCK(&manager->fdlock[lockid]);
2748
2749	LOCK(&manager->lock);
2750	ISC_LIST_APPEND(manager->socklist, sock, link);
2751#ifdef USE_SELECT
2752	if (manager->maxfd < sock->fd)
2753		manager->maxfd = sock->fd;
2754#endif
2755	UNLOCK(&manager->lock);
2756
2757	if (flags & ISC_SOCKFDWATCH_READ)
2758		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2759	if (flags & ISC_SOCKFDWATCH_WRITE)
2760		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2761
2762	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2763		   ISC_MSG_CREATED, "fdwatch-created");
2764
2765	return (ISC_R_SUCCESS);
2766}
2767
2768/*
2769 * Indicate to the manager that it should watch the socket again.
2770 * This can be used to restart watching if the previous event handler
2771 * didn't indicate there was more data to be processed.  Primarily
2772 * it is for writing but could be used for reading if desired
2773 */
2774
2775ISC_SOCKETFUNC_SCOPE isc_result_t
2776isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags)
2777{
2778	isc__socket_t *sock = (isc__socket_t *)sock0;
2779
2780	REQUIRE(VALID_SOCKET(sock));
2781
2782	/*
2783	 * We check both flags first to allow us to get the lock
2784	 * once but only if we need it.
2785	 */
2786
2787	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
2788		LOCK(&sock->lock);
2789		if (((flags & ISC_SOCKFDWATCH_READ) != 0) &&
2790		    !sock->pending_recv)
2791			select_poke(sock->manager, sock->fd,
2792				    SELECT_POKE_READ);
2793		if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) &&
2794		    !sock->pending_send)
2795			select_poke(sock->manager, sock->fd,
2796				    SELECT_POKE_WRITE);
2797		UNLOCK(&sock->lock);
2798	}
2799
2800	socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2801		   ISC_MSG_POKED, "fdwatch-poked flags: %d", flags);
2802
2803	return (ISC_R_SUCCESS);
2804}
2805
2806/*
2807 * Attach to a socket.  Caller must explicitly detach when it is done.
2808 */
2809ISC_SOCKETFUNC_SCOPE void
2810isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
2811	isc__socket_t *sock = (isc__socket_t *)sock0;
2812
2813	REQUIRE(VALID_SOCKET(sock));
2814	REQUIRE(socketp != NULL && *socketp == NULL);
2815
2816	LOCK(&sock->lock);
2817	sock->references++;
2818	UNLOCK(&sock->lock);
2819
2820	*socketp = (isc_socket_t *)sock;
2821}
2822
2823/*
2824 * Dereference a socket.  If this is the last reference to it, clean things
2825 * up by destroying the socket.
2826 */
2827ISC_SOCKETFUNC_SCOPE void
2828isc__socket_detach(isc_socket_t **socketp) {
2829	isc__socket_t *sock;
2830	isc_boolean_t kill_socket = ISC_FALSE;
2831
2832	REQUIRE(socketp != NULL);
2833	sock = (isc__socket_t *)*socketp;
2834	REQUIRE(VALID_SOCKET(sock));
2835
2836	LOCK(&sock->lock);
2837	REQUIRE(sock->references > 0);
2838	sock->references--;
2839	if (sock->references == 0)
2840		kill_socket = ISC_TRUE;
2841	UNLOCK(&sock->lock);
2842
2843	if (kill_socket)
2844		destroy(&sock);
2845
2846	*socketp = NULL;
2847}
2848
#ifdef BIND9
/*
 * Close the descriptor of a socket while keeping the socket object
 * itself, and reset its per-connection state (name, tag, listener /
 * connected / connecting / bound flags, peer address).  The caller must
 * hold the only reference, the socket must not be an fdwatch socket, its
 * descriptor must be open, and no I/O may be queued or pending.
 *
 * Always returns ISC_R_SUCCESS.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_close(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc__socketmgr_t *manager;
	int fd;

	/*
	 * NOTE(review): flushing stdout here looks like a debugging
	 * leftover; kept because removing it would change behavior.
	 */
	fflush(stdout);
	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(sock->connect_ev == NULL);

	/* Take the descriptor away from the socket ... */
	manager = sock->manager;
	fd = sock->fd;
	sock->fd = -1;

	/* ... and return the object to its unopened state. */
	sock->dupped = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;
	sock->tag = NULL;
	memset(sock->name, 0, sizeof(sock->name));
	isc_sockaddr_any(&sock->peer_address);

	UNLOCK(&sock->lock);

	/* The actual close happens without the socket lock held. */
	closesocket(manager, sock, fd);

	return (ISC_R_SUCCESS);
}
#endif	/* BIND9 */
2893
2894/*
2895 * I/O is possible on a given socket.  Schedule an event to this task that
2896 * will call an internal function to do the I/O.  This will charge the
2897 * task with the I/O operation and let our select loop handler get back
2898 * to doing something real as fast as possible.
2899 *
2900 * The socket and manager must be locked before calling this function.
2901 */
2902static void
2903dispatch_recv(isc__socket_t *sock) {
2904	intev_t *iev;
2905	isc_socketevent_t *ev;
2906	isc_task_t *sender;
2907
2908	INSIST(!sock->pending_recv);
2909
2910	if (sock->type != isc_sockettype_fdwatch) {
2911		ev = ISC_LIST_HEAD(sock->recv_list);
2912		if (ev == NULL)
2913			return;
2914		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2915			   "dispatch_recv:  event %p -> task %p",
2916			   ev, ev->ev_sender);
2917		sender = ev->ev_sender;
2918	} else {
2919		sender = sock->fdwatchtask;
2920	}
2921
2922	sock->pending_recv = 1;
2923	iev = &sock->readable_ev;
2924
2925	sock->references++;
2926	iev->ev_sender = sock;
2927	if (sock->type == isc_sockettype_fdwatch)
2928		iev->ev_action = internal_fdwatch_read;
2929	else
2930		iev->ev_action = internal_recv;
2931	iev->ev_arg = sock;
2932
2933	isc_task_send(sender, (isc_event_t **)&iev);
2934}
2935
2936static void
2937dispatch_send(isc__socket_t *sock) {
2938	intev_t *iev;
2939	isc_socketevent_t *ev;
2940	isc_task_t *sender;
2941
2942	INSIST(!sock->pending_send);
2943
2944	if (sock->type != isc_sockettype_fdwatch) {
2945		ev = ISC_LIST_HEAD(sock->send_list);
2946		if (ev == NULL)
2947			return;
2948		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2949			   "dispatch_send:  event %p -> task %p",
2950			   ev, ev->ev_sender);
2951		sender = ev->ev_sender;
2952	} else {
2953		sender = sock->fdwatchtask;
2954	}
2955
2956	sock->pending_send = 1;
2957	iev = &sock->writable_ev;
2958
2959	sock->references++;
2960	iev->ev_sender = sock;
2961	if (sock->type == isc_sockettype_fdwatch)
2962		iev->ev_action = internal_fdwatch_write;
2963	else
2964		iev->ev_action = internal_send;
2965	iev->ev_arg = sock;
2966
2967	isc_task_send(sender, (isc_event_t **)&iev);
2968}
2969
2970/*
2971 * Dispatch an internal accept event.
2972 */
2973static void
2974dispatch_accept(isc__socket_t *sock) {
2975	intev_t *iev;
2976	isc_socket_newconnev_t *ev;
2977
2978	INSIST(!sock->pending_accept);
2979
2980	/*
2981	 * Are there any done events left, or were they all canceled
2982	 * before the manager got the socket lock?
2983	 */
2984	ev = ISC_LIST_HEAD(sock->accept_list);
2985	if (ev == NULL)
2986		return;
2987
2988	sock->pending_accept = 1;
2989	iev = &sock->readable_ev;
2990
2991	sock->references++;  /* keep socket around for this internal event */
2992	iev->ev_sender = sock;
2993	iev->ev_action = internal_accept;
2994	iev->ev_arg = sock;
2995
2996	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2997}
2998
2999static void
3000dispatch_connect(isc__socket_t *sock) {
3001	intev_t *iev;
3002	isc_socket_connev_t *ev;
3003
3004	iev = &sock->writable_ev;
3005
3006	ev = sock->connect_ev;
3007	INSIST(ev != NULL); /* XXX */
3008
3009	INSIST(sock->connecting);
3010
3011	sock->references++;  /* keep socket around for this internal event */
3012	iev->ev_sender = sock;
3013	iev->ev_action = internal_connect;
3014	iev->ev_arg = sock;
3015
3016	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
3017}
3018
3019/*
3020 * Dequeue an item off the given socket's read queue, set the result code
3021 * in the done event to the one provided, and send it to the task it was
3022 * destined for.
3023 *
3024 * If the event to be sent is on a list, remove it before sending.  If
3025 * asked to, send and detach from the socket as well.
3026 *
3027 * Caller must have the socket locked if the event is attached to the socket.
3028 */
3029static void
3030send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
3031	isc_task_t *task;
3032
3033	task = (*dev)->ev_sender;
3034
3035	(*dev)->ev_sender = sock;
3036
3037	if (ISC_LINK_LINKED(*dev, ev_link))
3038		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
3039
3040	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3041	    == ISC_SOCKEVENTATTR_ATTACHED)
3042		isc_task_sendanddetach(&task, (isc_event_t **)dev);
3043	else
3044		isc_task_send(task, (isc_event_t **)dev);
3045}
3046
3047/*
3048 * See comments for send_recvdone_event() above.
3049 *
3050 * Caller must have the socket locked if the event is attached to the socket.
3051 */
3052static void
3053send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
3054	isc_task_t *task;
3055
3056	INSIST(dev != NULL && *dev != NULL);
3057
3058	task = (*dev)->ev_sender;
3059	(*dev)->ev_sender = sock;
3060
3061	if (ISC_LINK_LINKED(*dev, ev_link))
3062		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
3063
3064	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3065	    == ISC_SOCKEVENTATTR_ATTACHED)
3066		isc_task_sendanddetach(&task, (isc_event_t **)dev);
3067	else
3068		isc_task_send(task, (isc_event_t **)dev);
3069}
3070
3071/*
3072 * Call accept() on a socket, to get the new file descriptor.  The listen
3073 * socket is used as a prototype to create a new isc_socket_t.  The new
3074 * socket has one outstanding reference.  The task receiving the event
3075 * will be detached from just after the event is delivered.
3076 *
3077 * On entry to this function, the event delivered is the internal
3078 * readable event, and the first item on the accept_list should be
3079 * the done event we want to send.  If the list is empty, this is a no-op,
3080 * so just unlock and return.
3081 */
3082static void
3083internal_accept(isc_task_t *me, isc_event_t *ev) {
3084	isc__socket_t *sock;
3085	isc__socketmgr_t *manager;
3086	isc_socket_newconnev_t *dev;
3087	isc_task_t *task;
3088	ISC_SOCKADDR_LEN_T addrlen;
3089	int fd;
3090	isc_result_t result = ISC_R_SUCCESS;
3091	char strbuf[ISC_STRERRORSIZE];
3092	const char *err = "accept";
3093
3094	UNUSED(me);
3095
3096	sock = ev->ev_sender;
3097	INSIST(VALID_SOCKET(sock));
3098
3099	LOCK(&sock->lock);
3100	socket_log(sock, NULL, TRACE,
3101		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
3102		   "internal_accept called, locked socket");
3103
3104	manager = sock->manager;
3105	INSIST(VALID_MANAGER(manager));
3106
3107	INSIST(sock->listener);
3108	INSIST(sock->pending_accept == 1);
3109	sock->pending_accept = 0;
3110
3111	INSIST(sock->references > 0);
3112	sock->references--;  /* the internal event is done with this socket */
3113	if (sock->references == 0) {
3114		UNLOCK(&sock->lock);
3115		destroy(&sock);
3116		return;
3117	}
3118
3119	/*
3120	 * Get the first item off the accept list.
3121	 * If it is empty, unlock the socket and return.
3122	 */
3123	dev = ISC_LIST_HEAD(sock->accept_list);
3124	if (dev == NULL) {
3125		UNLOCK(&sock->lock);
3126		return;
3127	}
3128
3129	/*
3130	 * Try to accept the new connection.  If the accept fails with
3131	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
3132	 * again.  Also ignore ECONNRESET, which has been reported to
3133	 * be spuriously returned on Linux 2.2.19 although it is not
3134	 * a documented error for accept().  ECONNABORTED has been
3135	 * reported for Solaris 8.  The rest are thrown in not because
3136	 * we have seen them but because they are ignored by other
3137	 * daemons such as BIND 8 and Apache.
3138	 */
3139
3140	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
3141	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
3142	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
3143		    (void *)&addrlen);
3144
3145#ifdef F_DUPFD
3146	/*
3147	 * Leave a space for stdio to work in.
3148	 */
3149	if (fd >= 0 && fd < 20) {
3150		int new, tmp;
3151		new = fcntl(fd, F_DUPFD, 20);
3152		tmp = errno;
3153		(void)close(fd);
3154		errno = tmp;
3155		fd = new;
3156		err = "accept/fcntl";
3157	}
3158#endif
3159
3160	if (fd < 0) {
3161		if (SOFT_ERROR(errno))
3162			goto soft_error;
3163		switch (errno) {
3164		case ENFILE:
3165		case EMFILE:
3166			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3167				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3168				       isc_msgcat, ISC_MSGSET_SOCKET,
3169				       ISC_MSG_TOOMANYFDS,
3170				       "%s: too many open file descriptors",
3171				       err);
3172			goto soft_error;
3173
3174		case ENOBUFS:
3175		case ENOMEM:
3176		case ECONNRESET:
3177		case ECONNABORTED:
3178		case EHOSTUNREACH:
3179		case EHOSTDOWN:
3180		case ENETUNREACH:
3181		case ENETDOWN:
3182		case ECONNREFUSED:
3183#ifdef EPROTO
3184		case EPROTO:
3185#endif
3186#ifdef ENONET
3187		case ENONET:
3188#endif
3189			goto soft_error;
3190		default:
3191			break;
3192		}
3193		isc__strerror(errno, strbuf, sizeof(strbuf));
3194		UNEXPECTED_ERROR(__FILE__, __LINE__,
3195				 "internal_accept: %s() %s: %s", err,
3196				 isc_msgcat_get(isc_msgcat,
3197						ISC_MSGSET_GENERAL,
3198						ISC_MSG_FAILED,
3199						"failed"),
3200				 strbuf);
3201		fd = -1;
3202		result = ISC_R_UNEXPECTED;
3203	} else {
3204		if (addrlen == 0U) {
3205			UNEXPECTED_ERROR(__FILE__, __LINE__,
3206					 "internal_accept(): "
3207					 "accept() failed to return "
3208					 "remote address");
3209
3210			(void)close(fd);
3211			goto soft_error;
3212		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
3213			   sock->pf)
3214		{
3215			UNEXPECTED_ERROR(__FILE__, __LINE__,
3216					 "internal_accept(): "
3217					 "accept() returned peer address "
3218					 "family %u (expected %u)",
3219					 NEWCONNSOCK(dev)->peer_address.
3220					 type.sa.sa_family,
3221					 sock->pf);
3222			(void)close(fd);
3223			goto soft_error;
3224		} else if (fd >= (int)manager->maxsocks) {
3225			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3226				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3227				       isc_msgcat, ISC_MSGSET_SOCKET,
3228				       ISC_MSG_TOOMANYFDS,
3229				       "accept: "
3230				       "file descriptor exceeds limit (%d/%u)",
3231				       fd, manager->maxsocks);
3232			(void)close(fd);
3233			goto soft_error;
3234		}
3235	}
3236
3237	if (fd != -1) {
3238		NEWCONNSOCK(dev)->peer_address.length = addrlen;
3239		NEWCONNSOCK(dev)->pf = sock->pf;
3240	}
3241
3242	/*
3243	 * Pull off the done event.
3244	 */
3245	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
3246
3247	/*
3248	 * Poke watcher if there are more pending accepts.
3249	 */
3250	if (!ISC_LIST_EMPTY(sock->accept_list))
3251		select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3252
3253	UNLOCK(&sock->lock);
3254
3255	if (fd != -1) {
3256		result = make_nonblock(fd);
3257		if (result != ISC_R_SUCCESS) {
3258			(void)close(fd);
3259			fd = -1;
3260		}
3261	}
3262
3263	/*
3264	 * -1 means the new socket didn't happen.
3265	 */
3266	if (fd != -1) {
3267		int lockid = FDLOCK_ID(fd);
3268
3269		LOCK(&manager->fdlock[lockid]);
3270		manager->fds[fd] = NEWCONNSOCK(dev);
3271		manager->fdstate[fd] = MANAGED;
3272		UNLOCK(&manager->fdlock[lockid]);
3273
3274		LOCK(&manager->lock);
3275		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
3276
3277		NEWCONNSOCK(dev)->fd = fd;
3278		NEWCONNSOCK(dev)->bound = 1;
3279		NEWCONNSOCK(dev)->connected = 1;
3280
3281		/*
3282		 * Save away the remote address
3283		 */
3284		dev->address = NEWCONNSOCK(dev)->peer_address;
3285
3286#ifdef USE_SELECT
3287		if (manager->maxfd < fd)
3288			manager->maxfd = fd;
3289#endif
3290
3291		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
3292			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
3293			   "accepted connection, new socket %p",
3294			   dev->newsocket);
3295
3296		UNLOCK(&manager->lock);
3297
3298		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3299	} else {
3300		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3301		NEWCONNSOCK(dev)->references--;
3302		free_socket((isc__socket_t **)&dev->newsocket);
3303	}
3304
3305	/*
3306	 * Fill in the done event details and send it off.
3307	 */
3308	dev->result = result;
3309	task = dev->ev_sender;
3310	dev->ev_sender = sock;
3311
3312	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3313	return;
3314
3315 soft_error:
3316	select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3317	UNLOCK(&sock->lock);
3318
3319	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3320	return;
3321}
3322
3323static void
3324internal_recv(isc_task_t *me, isc_event_t *ev) {
3325	isc_socketevent_t *dev;
3326	isc__socket_t *sock;
3327
3328	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3329
3330	sock = ev->ev_sender;
3331	INSIST(VALID_SOCKET(sock));
3332
3333	LOCK(&sock->lock);
3334	socket_log(sock, NULL, IOEVENT,
3335		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3336		   "internal_recv: task %p got event %p", me, ev);
3337
3338	INSIST(sock->pending_recv == 1);
3339	sock->pending_recv = 0;
3340
3341	INSIST(sock->references > 0);
3342	sock->references--;  /* the internal event is done with this socket */
3343	if (sock->references == 0) {
3344		UNLOCK(&sock->lock);
3345		destroy(&sock);
3346		return;
3347	}
3348
3349	/*
3350	 * Try to do as much I/O as possible on this socket.  There are no
3351	 * limits here, currently.
3352	 */
3353	dev = ISC_LIST_HEAD(sock->recv_list);
3354	while (dev != NULL) {
3355		switch (doio_recv(sock, dev)) {
3356		case DOIO_SOFT:
3357			goto poke;
3358
3359		case DOIO_EOF:
3360			/*
3361			 * read of 0 means the remote end was closed.
3362			 * Run through the event queue and dispatch all
3363			 * the events with an EOF result code.
3364			 */
3365			do {
3366				dev->result = ISC_R_EOF;
3367				send_recvdone_event(sock, &dev);
3368				dev = ISC_LIST_HEAD(sock->recv_list);
3369			} while (dev != NULL);
3370			goto poke;
3371
3372		case DOIO_SUCCESS:
3373		case DOIO_HARD:
3374			send_recvdone_event(sock, &dev);
3375			break;
3376		}
3377
3378		dev = ISC_LIST_HEAD(sock->recv_list);
3379	}
3380
3381 poke:
3382	if (!ISC_LIST_EMPTY(sock->recv_list))
3383		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3384
3385	UNLOCK(&sock->lock);
3386}
3387
3388static void
3389internal_send(isc_task_t *me, isc_event_t *ev) {
3390	isc_socketevent_t *dev;
3391	isc__socket_t *sock;
3392
3393	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3394
3395	/*
3396	 * Find out what socket this is and lock it.
3397	 */
3398	sock = (isc__socket_t *)ev->ev_sender;
3399	INSIST(VALID_SOCKET(sock));
3400
3401	LOCK(&sock->lock);
3402	socket_log(sock, NULL, IOEVENT,
3403		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3404		   "internal_send: task %p got event %p", me, ev);
3405
3406	INSIST(sock->pending_send == 1);
3407	sock->pending_send = 0;
3408
3409	INSIST(sock->references > 0);
3410	sock->references--;  /* the internal event is done with this socket */
3411	if (sock->references == 0) {
3412		UNLOCK(&sock->lock);
3413		destroy(&sock);
3414		return;
3415	}
3416
3417	/*
3418	 * Try to do as much I/O as possible on this socket.  There are no
3419	 * limits here, currently.
3420	 */
3421	dev = ISC_LIST_HEAD(sock->send_list);
3422	while (dev != NULL) {
3423		switch (doio_send(sock, dev)) {
3424		case DOIO_SOFT:
3425			goto poke;
3426
3427		case DOIO_HARD:
3428		case DOIO_SUCCESS:
3429			send_senddone_event(sock, &dev);
3430			break;
3431		}
3432
3433		dev = ISC_LIST_HEAD(sock->send_list);
3434	}
3435
3436 poke:
3437	if (!ISC_LIST_EMPTY(sock->send_list))
3438		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3439
3440	UNLOCK(&sock->lock);
3441}
3442
3443static void
3444internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3445	isc__socket_t *sock;
3446	int more_data;
3447
3448	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3449
3450	/*
3451	 * Find out what socket this is and lock it.
3452	 */
3453	sock = (isc__socket_t *)ev->ev_sender;
3454	INSIST(VALID_SOCKET(sock));
3455
3456	LOCK(&sock->lock);
3457	socket_log(sock, NULL, IOEVENT,
3458		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3459		   "internal_fdwatch_write: task %p got event %p", me, ev);
3460
3461	INSIST(sock->pending_send == 1);
3462
3463	UNLOCK(&sock->lock);
3464	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3465				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
3466	LOCK(&sock->lock);
3467
3468	sock->pending_send = 0;
3469
3470	INSIST(sock->references > 0);
3471	sock->references--;  /* the internal event is done with this socket */
3472	if (sock->references == 0) {
3473		UNLOCK(&sock->lock);
3474		destroy(&sock);
3475		return;
3476	}
3477
3478	if (more_data)
3479		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3480
3481	UNLOCK(&sock->lock);
3482}
3483
3484static void
3485internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3486	isc__socket_t *sock;
3487	int more_data;
3488
3489	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3490
3491	/*
3492	 * Find out what socket this is and lock it.
3493	 */
3494	sock = (isc__socket_t *)ev->ev_sender;
3495	INSIST(VALID_SOCKET(sock));
3496
3497	LOCK(&sock->lock);
3498	socket_log(sock, NULL, IOEVENT,
3499		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3500		   "internal_fdwatch_read: task %p got event %p", me, ev);
3501
3502	INSIST(sock->pending_recv == 1);
3503
3504	UNLOCK(&sock->lock);
3505	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3506				      sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
3507	LOCK(&sock->lock);
3508
3509	sock->pending_recv = 0;
3510
3511	INSIST(sock->references > 0);
3512	sock->references--;  /* the internal event is done with this socket */
3513	if (sock->references == 0) {
3514		UNLOCK(&sock->lock);
3515		destroy(&sock);
3516		return;
3517	}
3518
3519	if (more_data)
3520		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3521
3522	UNLOCK(&sock->lock);
3523}
3524
3525/*
3526 * Process read/writes on each fd here.  Avoid locking
3527 * and unlocking twice if both reads and writes are possible.
3528 */
3529static void
3530process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
3531	   isc_boolean_t writeable)
3532{
3533	isc__socket_t *sock;
3534	isc_boolean_t unlock_sock;
3535	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3536	int lockid = FDLOCK_ID(fd);
3537
3538	/*
3539	 * If the socket is going to be closed, don't do more I/O.
3540	 */
3541	LOCK(&manager->fdlock[lockid]);
3542	if (manager->fdstate[fd] == CLOSE_PENDING) {
3543		UNLOCK(&manager->fdlock[lockid]);
3544
3545		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3546		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3547		return;
3548	}
3549
3550	sock = manager->fds[fd];
3551	unlock_sock = ISC_FALSE;
3552	if (readable) {
3553		if (sock == NULL) {
3554			unwatch_read = ISC_TRUE;
3555			goto check_write;
3556		}
3557		unlock_sock = ISC_TRUE;
3558		LOCK(&sock->lock);
3559		if (!SOCK_DEAD(sock)) {
3560			if (sock->listener)
3561				dispatch_accept(sock);
3562			else
3563				dispatch_recv(sock);
3564		}
3565		unwatch_read = ISC_TRUE;
3566	}
3567check_write:
3568	if (writeable) {
3569		if (sock == NULL) {
3570			unwatch_write = ISC_TRUE;
3571			goto unlock_fd;
3572		}
3573		if (!unlock_sock) {
3574			unlock_sock = ISC_TRUE;
3575			LOCK(&sock->lock);
3576		}
3577		if (!SOCK_DEAD(sock)) {
3578			if (sock->connecting)
3579				dispatch_connect(sock);
3580			else
3581				dispatch_send(sock);
3582		}
3583		unwatch_write = ISC_TRUE;
3584	}
3585	if (unlock_sock)
3586		UNLOCK(&sock->lock);
3587
3588 unlock_fd:
3589	UNLOCK(&manager->fdlock[lockid]);
3590	if (unwatch_read)
3591		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3592	if (unwatch_write)
3593		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3594
3595}
3596
3597#ifdef USE_KQUEUE
3598static isc_boolean_t
3599process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
3600	int i;
3601	isc_boolean_t readable, writable;
3602	isc_boolean_t done = ISC_FALSE;
3603#ifdef USE_WATCHER_THREAD
3604	isc_boolean_t have_ctlevent = ISC_FALSE;
3605#endif
3606
3607	if (nevents == manager->nevents) {
3608		/*
3609		 * This is not an error, but something unexpected.  If this
3610		 * happens, it may indicate the need for increasing
3611		 * ISC_SOCKET_MAXEVENTS.
3612		 */
3613		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3614			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3615			    "maximum number of FD events (%d) received",
3616			    nevents);
3617	}
3618
3619	for (i = 0; i < nevents; i++) {
3620		REQUIRE(events[i].ident < manager->maxsocks);
3621#ifdef USE_WATCHER_THREAD
3622		if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3623			have_ctlevent = ISC_TRUE;
3624			continue;
3625		}
3626#endif
3627		readable = ISC_TF(events[i].filter == EVFILT_READ);
3628		writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3629		process_fd(manager, events[i].ident, readable, writable);
3630	}
3631
3632#ifdef USE_WATCHER_THREAD
3633	if (have_ctlevent)
3634		done = process_ctlfd(manager);
3635#endif
3636
3637	return (done);
3638}
3639#elif defined(USE_EPOLL)
3640static isc_boolean_t
3641process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents)
3642{
3643	int i;
3644	isc_boolean_t done = ISC_FALSE;
3645#ifdef USE_WATCHER_THREAD
3646	isc_boolean_t have_ctlevent = ISC_FALSE;
3647#endif
3648
3649	if (nevents == manager->nevents) {
3650		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3651			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3652			    "maximum number of FD events (%d) received",
3653			    nevents);
3654	}
3655
3656	for (i = 0; i < nevents; i++) {
3657		REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3658#ifdef USE_WATCHER_THREAD
3659		if (events[i].data.fd == manager->pipe_fds[0]) {
3660			have_ctlevent = ISC_TRUE;
3661			continue;
3662		}
3663#endif
3664		if ((events[i].events & EPOLLERR) != 0 ||
3665		    (events[i].events & EPOLLHUP) != 0) {
3666			/*
3667			 * epoll does not set IN/OUT bits on an erroneous
3668			 * condition, so we need to try both anyway.  This is a
3669			 * bit inefficient, but should be okay for such rare
3670			 * events.  Note also that the read or write attempt
3671			 * won't block because we use non-blocking sockets.
3672			 */
3673			events[i].events |= (EPOLLIN | EPOLLOUT);
3674		}
3675		process_fd(manager, events[i].data.fd,
3676			   (events[i].events & EPOLLIN) != 0,
3677			   (events[i].events & EPOLLOUT) != 0);
3678	}
3679
3680#ifdef USE_WATCHER_THREAD
3681	if (have_ctlevent)
3682		done = process_ctlfd(manager);
3683#endif
3684
3685	return (done);
3686}
3687#elif defined(USE_DEVPOLL)
3688static isc_boolean_t
3689process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
3690	int i;
3691	isc_boolean_t done = ISC_FALSE;
3692#ifdef USE_WATCHER_THREAD
3693	isc_boolean_t have_ctlevent = ISC_FALSE;
3694#endif
3695
3696	if (nevents == manager->nevents) {
3697		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3698			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3699			    "maximum number of FD events (%d) received",
3700			    nevents);
3701	}
3702
3703	for (i = 0; i < nevents; i++) {
3704		REQUIRE(events[i].fd < (int)manager->maxsocks);
3705#ifdef USE_WATCHER_THREAD
3706		if (events[i].fd == manager->pipe_fds[0]) {
3707			have_ctlevent = ISC_TRUE;
3708			continue;
3709		}
3710#endif
3711		process_fd(manager, events[i].fd,
3712			   (events[i].events & POLLIN) != 0,
3713			   (events[i].events & POLLOUT) != 0);
3714	}
3715
3716#ifdef USE_WATCHER_THREAD
3717	if (have_ctlevent)
3718		done = process_ctlfd(manager);
3719#endif
3720
3721	return (done);
3722}
3723#elif defined(USE_SELECT)
3724static void
3725process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
3726	    fd_set *writefds)
3727{
3728	int i;
3729
3730	REQUIRE(maxfd <= (int)manager->maxsocks);
3731
3732	for (i = 0; i < maxfd; i++) {
3733#ifdef USE_WATCHER_THREAD
3734		if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3735			continue;
3736#endif /* USE_WATCHER_THREAD */
3737		process_fd(manager, i, FD_ISSET(i, readfds),
3738			   FD_ISSET(i, writefds));
3739	}
3740}
3741#endif
3742
3743#ifdef USE_WATCHER_THREAD
3744static isc_boolean_t
3745process_ctlfd(isc__socketmgr_t *manager) {
3746	int msg, fd;
3747
3748	for (;;) {
3749		select_readmsg(manager, &fd, &msg);
3750
3751		manager_log(manager, IOEVENT,
3752			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3753					   ISC_MSG_WATCHERMSG,
3754					   "watcher got message %d "
3755					   "for socket %d"), msg, fd);
3756
3757		/*
3758		 * Nothing to read?
3759		 */
3760		if (msg == SELECT_POKE_NOTHING)
3761			break;
3762
3763		/*
3764		 * Handle shutdown message.  We really should
3765		 * jump out of this loop right away, but
3766		 * it doesn't matter if we have to do a little
3767		 * more work first.
3768		 */
3769		if (msg == SELECT_POKE_SHUTDOWN)
3770			return (ISC_TRUE);
3771
3772		/*
3773		 * This is a wakeup on a socket.  Look
3774		 * at the event queue for both read and write,
3775		 * and decide if we need to watch on it now
3776		 * or not.
3777		 */
3778		wakeup_socket(manager, fd, msg);
3779	}
3780
3781	return (ISC_FALSE);
3782}
3783
3784/*
3785 * This is the thread that will loop forever, always in a select or poll
3786 * call.
3787 *
3788 * When select returns something to do, track down what thread gets to do
3789 * this I/O and post the event to it.
3790 */
3791static isc_threadresult_t
3792watcher(void *uap) {
3793	isc__socketmgr_t *manager = uap;
3794	isc_boolean_t done;
3795	int cc;
3796#ifdef USE_KQUEUE
3797	const char *fnname = "kevent()";
3798#elif defined (USE_EPOLL)
3799	const char *fnname = "epoll_wait()";
3800#elif defined(USE_DEVPOLL)
3801	const char *fnname = "ioctl(DP_POLL)";
3802	struct dvpoll dvp;
3803#elif defined (USE_SELECT)
3804	const char *fnname = "select()";
3805	int maxfd;
3806	int ctlfd;
3807#endif
3808	char strbuf[ISC_STRERRORSIZE];
3809#ifdef ISC_SOCKET_USE_POLLWATCH
3810	pollstate_t pollstate = poll_idle;
3811#endif
3812
3813#if defined (USE_SELECT)
3814	/*
3815	 * Get the control fd here.  This will never change.
3816	 */
3817	ctlfd = manager->pipe_fds[0];
3818#endif
3819	done = ISC_FALSE;
3820	while (!done) {
3821		do {
3822#ifdef USE_KQUEUE
3823			cc = kevent(manager->kqueue_fd, NULL, 0,
3824				    manager->events, manager->nevents, NULL);
3825#elif defined(USE_EPOLL)
3826			cc = epoll_wait(manager->epoll_fd, manager->events,
3827					manager->nevents, -1);
3828#elif defined(USE_DEVPOLL)
3829			dvp.dp_fds = manager->events;
3830			dvp.dp_nfds = manager->nevents;
3831#ifndef ISC_SOCKET_USE_POLLWATCH
3832			dvp.dp_timeout = -1;
3833#else
3834			if (pollstate == poll_idle)
3835				dvp.dp_timeout = -1;
3836			else
3837				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3838#endif	/* ISC_SOCKET_USE_POLLWATCH */
3839			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3840#elif defined(USE_SELECT)
3841			LOCK(&manager->lock);
3842			memcpy(manager->read_fds_copy, manager->read_fds,
3843			       manager->fd_bufsize);
3844			memcpy(manager->write_fds_copy, manager->write_fds,
3845			       manager->fd_bufsize);
3846			maxfd = manager->maxfd + 1;
3847			UNLOCK(&manager->lock);
3848
3849			cc = select(maxfd, manager->read_fds_copy,
3850				    manager->write_fds_copy, NULL, NULL);
3851#endif	/* USE_KQUEUE */
3852
3853			if (cc < 0 && !SOFT_ERROR(errno)) {
3854				isc__strerror(errno, strbuf, sizeof(strbuf));
3855				FATAL_ERROR(__FILE__, __LINE__,
3856					    "%s %s: %s", fnname,
3857					    isc_msgcat_get(isc_msgcat,
3858							   ISC_MSGSET_GENERAL,
3859							   ISC_MSG_FAILED,
3860							   "failed"), strbuf);
3861			}
3862
3863#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3864			if (cc == 0) {
3865				if (pollstate == poll_active)
3866					pollstate = poll_checking;
3867				else if (pollstate == poll_checking)
3868					pollstate = poll_idle;
3869			} else if (cc > 0) {
3870				if (pollstate == poll_checking) {
3871					/*
3872					 * XXX: We'd like to use a more
3873					 * verbose log level as it's actually an
3874					 * unexpected event, but the kernel bug
3875					 * reportedly happens pretty frequently
3876					 * (and it can also be a false positive)
3877					 * so it would be just too noisy.
3878					 */
3879					manager_log(manager,
3880						    ISC_LOGCATEGORY_GENERAL,
3881						    ISC_LOGMODULE_SOCKET,
3882						    ISC_LOG_DEBUG(1),
3883						    "unexpected POLL timeout");
3884				}
3885				pollstate = poll_active;
3886			}
3887#endif
3888		} while (cc < 0);
3889
3890#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3891		done = process_fds(manager, manager->events, cc);
3892#elif defined(USE_SELECT)
3893		process_fds(manager, maxfd, manager->read_fds_copy,
3894			    manager->write_fds_copy);
3895
3896		/*
3897		 * Process reads on internal, control fd.
3898		 */
3899		if (FD_ISSET(ctlfd, manager->read_fds_copy))
3900			done = process_ctlfd(manager);
3901#endif
3902	}
3903
3904	manager_log(manager, TRACE, "%s",
3905		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3906				   ISC_MSG_EXITING, "watcher exiting"));
3907
3908	return ((isc_threadresult_t)0);
3909}
3910#endif /* USE_WATCHER_THREAD */
3911
3912#ifdef BIND9
3913ISC_SOCKETFUNC_SCOPE void
3914isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
3915	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3916
3917	REQUIRE(VALID_MANAGER(manager));
3918
3919	manager->reserved = reserved;
3920}
3921
3922ISC_SOCKETFUNC_SCOPE void
3923isc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
3924	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3925
3926	REQUIRE(VALID_MANAGER(manager));
3927
3928	manager->maxudp = maxudp;
3929}
3930#endif	/* BIND9 */
3931
/*
 * Socket manager creation: the watcher setup and cleanup helpers come
 * first, followed by the functions that create a new socket manager.
 */
3935
3936static isc_result_t
3937setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
3938	isc_result_t result;
3939#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3940	char strbuf[ISC_STRERRORSIZE];
3941#endif
3942
3943#ifdef USE_KQUEUE
3944	manager->nevents = ISC_SOCKET_MAXEVENTS;
3945	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3946				      manager->nevents);
3947	if (manager->events == NULL)
3948		return (ISC_R_NOMEMORY);
3949	manager->kqueue_fd = kqueue();
3950	if (manager->kqueue_fd == -1) {
3951		result = isc__errno2result(errno);
3952		isc__strerror(errno, strbuf, sizeof(strbuf));
3953		UNEXPECTED_ERROR(__FILE__, __LINE__,
3954				 "kqueue %s: %s",
3955				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3956						ISC_MSG_FAILED, "failed"),
3957				 strbuf);
3958		isc_mem_put(mctx, manager->events,
3959			    sizeof(struct kevent) * manager->nevents);
3960		return (result);
3961	}
3962
3963#ifdef USE_WATCHER_THREAD
3964	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3965	if (result != ISC_R_SUCCESS) {
3966		close(manager->kqueue_fd);
3967		isc_mem_put(mctx, manager->events,
3968			    sizeof(struct kevent) * manager->nevents);
3969		return (result);
3970	}
3971#endif	/* USE_WATCHER_THREAD */
3972#elif defined(USE_EPOLL)
3973	manager->nevents = ISC_SOCKET_MAXEVENTS;
3974	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3975				      manager->nevents);
3976	if (manager->events == NULL)
3977		return (ISC_R_NOMEMORY);
3978	manager->epoll_fd = epoll_create(manager->nevents);
3979	if (manager->epoll_fd == -1) {
3980		result = isc__errno2result(errno);
3981		isc__strerror(errno, strbuf, sizeof(strbuf));
3982		UNEXPECTED_ERROR(__FILE__, __LINE__,
3983				 "epoll_create %s: %s",
3984				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3985						ISC_MSG_FAILED, "failed"),
3986				 strbuf);
3987		isc_mem_put(mctx, manager->events,
3988			    sizeof(struct epoll_event) * manager->nevents);
3989		return (result);
3990	}
3991#ifdef USE_WATCHER_THREAD
3992	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3993	if (result != ISC_R_SUCCESS) {
3994		close(manager->epoll_fd);
3995		isc_mem_put(mctx, manager->events,
3996			    sizeof(struct epoll_event) * manager->nevents);
3997		return (result);
3998	}
3999#endif	/* USE_WATCHER_THREAD */
4000#elif defined(USE_DEVPOLL)
4001	/*
4002	 * XXXJT: /dev/poll seems to reject large numbers of events,
4003	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
4004	 */
4005	manager->nevents = ISC_SOCKET_MAXEVENTS;
4006	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
4007				      manager->nevents);
4008	if (manager->events == NULL)
4009		return (ISC_R_NOMEMORY);
4010	/*
4011	 * Note: fdpollinfo should be able to support all possible FDs, so
4012	 * it must have maxsocks entries (not nevents).
4013	 */
4014	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
4015					  manager->maxsocks);
4016	if (manager->fdpollinfo == NULL) {
4017		isc_mem_put(mctx, manager->events,
4018			    sizeof(struct pollfd) * manager->nevents);
4019		return (ISC_R_NOMEMORY);
4020	}
4021	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
4022	manager->devpoll_fd = open("/dev/poll", O_RDWR);
4023	if (manager->devpoll_fd == -1) {
4024		result = isc__errno2result(errno);
4025		isc__strerror(errno, strbuf, sizeof(strbuf));
4026		UNEXPECTED_ERROR(__FILE__, __LINE__,
4027				 "open(/dev/poll) %s: %s",
4028				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4029						ISC_MSG_FAILED, "failed"),
4030				 strbuf);
4031		isc_mem_put(mctx, manager->events,
4032			    sizeof(struct pollfd) * manager->nevents);
4033		isc_mem_put(mctx, manager->fdpollinfo,
4034			    sizeof(pollinfo_t) * manager->maxsocks);
4035		return (result);
4036	}
4037#ifdef USE_WATCHER_THREAD
4038	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4039	if (result != ISC_R_SUCCESS) {
4040		close(manager->devpoll_fd);
4041		isc_mem_put(mctx, manager->events,
4042			    sizeof(struct pollfd) * manager->nevents);
4043		isc_mem_put(mctx, manager->fdpollinfo,
4044			    sizeof(pollinfo_t) * manager->maxsocks);
4045		return (result);
4046	}
4047#endif	/* USE_WATCHER_THREAD */
4048#elif defined(USE_SELECT)
4049	UNUSED(result);
4050
4051#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
4052	/*
4053	 * Note: this code should also cover the case of MAXSOCKETS <=
4054	 * FD_SETSIZE, but we separate the cases to avoid possible portability
4055	 * issues regarding howmany() and the actual representation of fd_set.
4056	 */
4057	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
4058		sizeof(fd_mask);
4059#else
4060	manager->fd_bufsize = sizeof(fd_set);
4061#endif
4062
4063	manager->read_fds = NULL;
4064	manager->read_fds_copy = NULL;
4065	manager->write_fds = NULL;
4066	manager->write_fds_copy = NULL;
4067
4068	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
4069	if (manager->read_fds != NULL)
4070		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
4071	if (manager->read_fds_copy != NULL)
4072		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
4073	if (manager->write_fds != NULL) {
4074		manager->write_fds_copy = isc_mem_get(mctx,
4075						      manager->fd_bufsize);
4076	}
4077	if (manager->write_fds_copy == NULL) {
4078		if (manager->write_fds != NULL) {
4079			isc_mem_put(mctx, manager->write_fds,
4080				    manager->fd_bufsize);
4081		}
4082		if (manager->read_fds_copy != NULL) {
4083			isc_mem_put(mctx, manager->read_fds_copy,
4084				    manager->fd_bufsize);
4085		}
4086		if (manager->read_fds != NULL) {
4087			isc_mem_put(mctx, manager->read_fds,
4088				    manager->fd_bufsize);
4089		}
4090		return (ISC_R_NOMEMORY);
4091	}
4092	memset(manager->read_fds, 0, manager->fd_bufsize);
4093	memset(manager->write_fds, 0, manager->fd_bufsize);
4094
4095#ifdef USE_WATCHER_THREAD
4096	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4097	manager->maxfd = manager->pipe_fds[0];
4098#else /* USE_WATCHER_THREAD */
4099	manager->maxfd = 0;
4100#endif /* USE_WATCHER_THREAD */
4101#endif	/* USE_KQUEUE */
4102
4103	return (ISC_R_SUCCESS);
4104}
4105
4106static void
4107cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
4108#ifdef USE_WATCHER_THREAD
4109	isc_result_t result;
4110
4111	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4112	if (result != ISC_R_SUCCESS) {
4113		UNEXPECTED_ERROR(__FILE__, __LINE__,
4114				 "epoll_ctl(DEL) %s",
4115				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4116						ISC_MSG_FAILED, "failed"));
4117	}
4118#endif	/* USE_WATCHER_THREAD */
4119
4120#ifdef USE_KQUEUE
4121	close(manager->kqueue_fd);
4122	isc_mem_put(mctx, manager->events,
4123		    sizeof(struct kevent) * manager->nevents);
4124#elif defined(USE_EPOLL)
4125	close(manager->epoll_fd);
4126	isc_mem_put(mctx, manager->events,
4127		    sizeof(struct epoll_event) * manager->nevents);
4128#elif defined(USE_DEVPOLL)
4129	close(manager->devpoll_fd);
4130	isc_mem_put(mctx, manager->events,
4131		    sizeof(struct pollfd) * manager->nevents);
4132	isc_mem_put(mctx, manager->fdpollinfo,
4133		    sizeof(pollinfo_t) * manager->maxsocks);
4134#elif defined(USE_SELECT)
4135	if (manager->read_fds != NULL)
4136		isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
4137	if (manager->read_fds_copy != NULL)
4138		isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
4139	if (manager->write_fds != NULL)
4140		isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
4141	if (manager->write_fds_copy != NULL)
4142		isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
4143#endif	/* USE_KQUEUE */
4144}
4145
4146ISC_SOCKETFUNC_SCOPE isc_result_t
4147isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
4148	return (isc__socketmgr_create2(mctx, managerp, 0));
4149}
4150
4151ISC_SOCKETFUNC_SCOPE isc_result_t
4152isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
4153		       unsigned int maxsocks)
4154{
4155	int i;
4156	isc__socketmgr_t *manager;
4157#ifdef USE_WATCHER_THREAD
4158	char strbuf[ISC_STRERRORSIZE];
4159#endif
4160	isc_result_t result;
4161
4162	REQUIRE(managerp != NULL && *managerp == NULL);
4163
4164#ifdef USE_SHARED_MANAGER
4165	if (socketmgr != NULL) {
4166		/* Don't allow maxsocks to be updated */
4167		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
4168			return (ISC_R_EXISTS);
4169
4170		socketmgr->refs++;
4171		*managerp = (isc_socketmgr_t *)socketmgr;
4172		return (ISC_R_SUCCESS);
4173	}
4174#endif /* USE_SHARED_MANAGER */
4175
4176	if (maxsocks == 0)
4177		maxsocks = ISC_SOCKET_MAXSOCKETS;
4178
4179	manager = isc_mem_get(mctx, sizeof(*manager));
4180	if (manager == NULL)
4181		return (ISC_R_NOMEMORY);
4182
4183	/* zero-clear so that necessary cleanup on failure will be easy */
4184	memset(manager, 0, sizeof(*manager));
4185	manager->maxsocks = maxsocks;
4186	manager->reserved = 0;
4187	manager->maxudp = 0;
4188	manager->fds = isc_mem_get(mctx,
4189				   manager->maxsocks * sizeof(isc__socket_t *));
4190	if (manager->fds == NULL) {
4191		result = ISC_R_NOMEMORY;
4192		goto free_manager;
4193	}
4194	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
4195	if (manager->fdstate == NULL) {
4196		result = ISC_R_NOMEMORY;
4197		goto free_manager;
4198	}
4199	manager->stats = NULL;
4200
4201	manager->common.methods = &socketmgrmethods;
4202	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
4203	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
4204	manager->mctx = NULL;
4205	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
4206	ISC_LIST_INIT(manager->socklist);
4207	result = isc_mutex_init(&manager->lock);
4208	if (result != ISC_R_SUCCESS)
4209		goto free_manager;
4210	manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
4211	if (manager->fdlock == NULL) {
4212		result = ISC_R_NOMEMORY;
4213		goto cleanup_lock;
4214	}
4215	for (i = 0; i < FDLOCK_COUNT; i++) {
4216		result = isc_mutex_init(&manager->fdlock[i]);
4217		if (result != ISC_R_SUCCESS) {
4218			while (--i >= 0)
4219				DESTROYLOCK(&manager->fdlock[i]);
4220			isc_mem_put(mctx, manager->fdlock,
4221				    FDLOCK_COUNT * sizeof(isc_mutex_t));
4222			manager->fdlock = NULL;
4223			goto cleanup_lock;
4224		}
4225	}
4226
4227#ifdef USE_WATCHER_THREAD
4228	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
4229		UNEXPECTED_ERROR(__FILE__, __LINE__,
4230				 "isc_condition_init() %s",
4231				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4232						ISC_MSG_FAILED, "failed"));
4233		result = ISC_R_UNEXPECTED;
4234		goto cleanup_lock;
4235	}
4236
4237	/*
4238	 * Create the special fds that will be used to wake up the
4239	 * select/poll loop when something internal needs to be done.
4240	 */
4241	if (pipe(manager->pipe_fds) != 0) {
4242		isc__strerror(errno, strbuf, sizeof(strbuf));
4243		UNEXPECTED_ERROR(__FILE__, __LINE__,
4244				 "pipe() %s: %s",
4245				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4246						ISC_MSG_FAILED, "failed"),
4247				 strbuf);
4248		result = ISC_R_UNEXPECTED;
4249		goto cleanup_condition;
4250	}
4251
4252	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
4253#if 0
4254	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
4255#endif
4256#endif	/* USE_WATCHER_THREAD */
4257
4258#ifdef USE_SHARED_MANAGER
4259	manager->refs = 1;
4260#endif /* USE_SHARED_MANAGER */
4261
4262	/*
4263	 * Set up initial state for the select loop
4264	 */
4265	result = setup_watcher(mctx, manager);
4266	if (result != ISC_R_SUCCESS)
4267		goto cleanup;
4268	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
4269#ifdef USE_WATCHER_THREAD
4270	/*
4271	 * Start up the select/poll thread.
4272	 */
4273	if (isc_thread_create(watcher, manager, &manager->watcher) !=
4274	    ISC_R_SUCCESS) {
4275		UNEXPECTED_ERROR(__FILE__, __LINE__,
4276				 "isc_thread_create() %s",
4277				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4278						ISC_MSG_FAILED, "failed"));
4279		cleanup_watcher(mctx, manager);
4280		result = ISC_R_UNEXPECTED;
4281		goto cleanup;
4282	}
4283#endif /* USE_WATCHER_THREAD */
4284	isc_mem_attach(mctx, &manager->mctx);
4285
4286#ifdef USE_SHARED_MANAGER
4287	socketmgr = manager;
4288#endif /* USE_SHARED_MANAGER */
4289	*managerp = (isc_socketmgr_t *)manager;
4290
4291	return (ISC_R_SUCCESS);
4292
4293cleanup:
4294#ifdef USE_WATCHER_THREAD
4295	(void)close(manager->pipe_fds[0]);
4296	(void)close(manager->pipe_fds[1]);
4297#endif	/* USE_WATCHER_THREAD */
4298
4299#ifdef USE_WATCHER_THREAD
4300cleanup_condition:
4301	(void)isc_condition_destroy(&manager->shutdown_ok);
4302#endif	/* USE_WATCHER_THREAD */
4303
4304
4305cleanup_lock:
4306	if (manager->fdlock != NULL) {
4307		for (i = 0; i < FDLOCK_COUNT; i++)
4308			DESTROYLOCK(&manager->fdlock[i]);
4309	}
4310	DESTROYLOCK(&manager->lock);
4311
4312free_manager:
4313	if (manager->fdlock != NULL) {
4314		isc_mem_put(mctx, manager->fdlock,
4315			    FDLOCK_COUNT * sizeof(isc_mutex_t));
4316	}
4317	if (manager->fdstate != NULL) {
4318		isc_mem_put(mctx, manager->fdstate,
4319			    manager->maxsocks * sizeof(int));
4320	}
4321	if (manager->fds != NULL) {
4322		isc_mem_put(mctx, manager->fds,
4323			    manager->maxsocks * sizeof(isc_socket_t *));
4324	}
4325	isc_mem_put(mctx, manager, sizeof(*manager));
4326
4327	return (result);
4328}
4329
4330#ifdef BIND9
4331isc_result_t
4332isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
4333	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4334	REQUIRE(VALID_MANAGER(manager));
4335	REQUIRE(nsockp != NULL);
4336
4337	*nsockp = manager->maxsocks;
4338
4339	return (ISC_R_SUCCESS);
4340}
4341
4342void
4343isc__socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
4344	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4345
4346	REQUIRE(VALID_MANAGER(manager));
4347	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
4348	REQUIRE(manager->stats == NULL);
4349	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
4350
4351	isc_stats_attach(stats, &manager->stats);
4352}
4353#endif
4354
/*
 * Destroy the socket manager referenced by '*managerp' and set
 * '*managerp' to NULL.
 *
 * With USE_SHARED_MANAGER, the call only drops one reference unless it
 * was the last one.  Otherwise the function waits until the manager's
 * socket list is empty, shuts the watcher down, and then releases every
 * resource the manager owns, ending with the manager structure itself
 * and its memory-context reference.
 *
 * Requires 'managerp' to be non-NULL and to point at a valid manager.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc__socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = (isc__socketmgr_t *)*managerp;
	REQUIRE(VALID_MANAGER(manager));

#ifdef USE_SHARED_MANAGER
	/* Other holders remain: just give up this caller's reference. */
	manager->refs--;
	if (manager->refs > 0) {
		*managerp = NULL;
		return;
	}
	socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 *
	 * With a watcher thread this blocks on the shutdown_ok condition;
	 * without one, the lock is dropped while isc__taskmgr_dispatch()
	 * runs and the list is re-checked afterwards.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
#ifdef USE_WATCHER_THREAD
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
#else /* USE_WATCHER_THREAD */
		UNLOCK(&manager->lock);
		isc__taskmgr_dispatch(NULL);
		LOCK(&manager->lock);
#endif /* USE_WATCHER_THREAD */
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 *
	 * NOTE(review): the call below passes SELECT_POKE_SHUTDOWN rather
	 * than closing a descriptor (the pipe ends are closed further
	 * down) -- confirm the wording above against select_poke().
	 */
	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);

#ifdef USE_WATCHER_THREAD
	/*
	 * Wait for thread to exit.
	 */
	if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
#endif /* USE_WATCHER_THREAD */

	/*
	 * Clean up.
	 */
	cleanup_watcher(manager->mctx, manager);

#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/*
	 * fdstate is indexed by descriptor number: close every descriptor
	 * still marked CLOSE_PENDING.
	 */
	for (i = 0; i < (int)manager->maxsocks; i++)
		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
			(void)close(i);

	isc_mem_put(manager->mctx, manager->fds,
		    manager->maxsocks * sizeof(isc__socket_t *));
	isc_mem_put(manager->mctx, manager->fdstate,
		    manager->maxsocks * sizeof(int));

	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);

	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
		isc_mem_put(manager->mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	DESTROYLOCK(&manager->lock);
	/* Invalidate the magic numbers before the memory is released. */
	manager->common.magic = 0;
	manager->common.impmagic = 0;
	/*
	 * Keep a local copy of the memory context: it is needed to free
	 * the manager and must be detached after the manager is gone.
	 */
	mctx= manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;

#ifdef USE_SHARED_MANAGER
	socketmgr = NULL;
#endif
}
4460
4461static isc_result_t
4462socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4463	    unsigned int flags)
4464{
4465	int io_state;
4466	isc_boolean_t have_lock = ISC_FALSE;
4467	isc_task_t *ntask = NULL;
4468	isc_result_t result = ISC_R_SUCCESS;
4469
4470	dev->ev_sender = task;
4471
4472	if (sock->type == isc_sockettype_udp) {
4473		io_state = doio_recv(sock, dev);
4474	} else {
4475		LOCK(&sock->lock);
4476		have_lock = ISC_TRUE;
4477
4478		if (ISC_LIST_EMPTY(sock->recv_list))
4479			io_state = doio_recv(sock, dev);
4480		else
4481			io_state = DOIO_SOFT;
4482	}
4483
4484	switch (io_state) {
4485	case DOIO_SOFT:
4486		/*
4487		 * We couldn't read all or part of the request right now, so
4488		 * queue it.
4489		 *
4490		 * Attach to socket and to task
4491		 */
4492		isc_task_attach(task, &ntask);
4493		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4494
4495		if (!have_lock) {
4496			LOCK(&sock->lock);
4497			have_lock = ISC_TRUE;
4498		}
4499
4500		/*
4501		 * Enqueue the request.  If the socket was previously not being
4502		 * watched, poke the watcher to start paying attention to it.
4503		 */
4504		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4505			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4506		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4507
4508		socket_log(sock, NULL, EVENT, NULL, 0, 0,
4509			   "socket_recv: event %p -> task %p",
4510			   dev, ntask);
4511
4512		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4513			result = ISC_R_INPROGRESS;
4514		break;
4515
4516	case DOIO_EOF:
4517		dev->result = ISC_R_EOF;
4518		/* fallthrough */
4519
4520	case DOIO_HARD:
4521	case DOIO_SUCCESS:
4522		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4523			send_recvdone_event(sock, &dev);
4524		break;
4525	}
4526
4527	if (have_lock)
4528		UNLOCK(&sock->lock);
4529
4530	return (result);
4531}
4532
4533ISC_SOCKETFUNC_SCOPE isc_result_t
4534isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4535		  unsigned int minimum, isc_task_t *task,
4536		  isc_taskaction_t action, const void *arg)
4537{
4538	isc__socket_t *sock = (isc__socket_t *)sock0;
4539	isc_socketevent_t *dev;
4540	isc__socketmgr_t *manager;
4541	unsigned int iocount;
4542	isc_buffer_t *buffer;
4543
4544	REQUIRE(VALID_SOCKET(sock));
4545	REQUIRE(buflist != NULL);
4546	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4547	REQUIRE(task != NULL);
4548	REQUIRE(action != NULL);
4549
4550	manager = sock->manager;
4551	REQUIRE(VALID_MANAGER(manager));
4552
4553	iocount = isc_bufferlist_availablecount(buflist);
4554	REQUIRE(iocount > 0);
4555
4556	INSIST(sock->bound);
4557
4558	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4559	if (dev == NULL)
4560		return (ISC_R_NOMEMORY);
4561
4562	/*
4563	 * UDP sockets are always partial read
4564	 */
4565	if (sock->type == isc_sockettype_udp)
4566		dev->minimum = 1;
4567	else {
4568		if (minimum == 0)
4569			dev->minimum = iocount;
4570		else
4571			dev->minimum = minimum;
4572	}
4573
4574	/*
4575	 * Move each buffer from the passed in list to our internal one.
4576	 */
4577	buffer = ISC_LIST_HEAD(*buflist);
4578	while (buffer != NULL) {
4579		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4580		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4581		buffer = ISC_LIST_HEAD(*buflist);
4582	}
4583
4584	return (socket_recv(sock, dev, task, 0));
4585}
4586
4587ISC_SOCKETFUNC_SCOPE isc_result_t
4588isc__socket_recv(isc_socket_t *sock0, isc_region_t *region,
4589		 unsigned int minimum, isc_task_t *task,
4590		 isc_taskaction_t action, const void *arg)
4591{
4592	isc__socket_t *sock = (isc__socket_t *)sock0;
4593	isc_socketevent_t *dev;
4594	isc__socketmgr_t *manager;
4595
4596	REQUIRE(VALID_SOCKET(sock));
4597	REQUIRE(action != NULL);
4598
4599	manager = sock->manager;
4600	REQUIRE(VALID_MANAGER(manager));
4601
4602	INSIST(sock->bound);
4603
4604	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4605	if (dev == NULL)
4606		return (ISC_R_NOMEMORY);
4607
4608	return (isc__socket_recv2(sock0, region, minimum, task, dev, 0));
4609}
4610
4611ISC_SOCKETFUNC_SCOPE isc_result_t
4612isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region,
4613		  unsigned int minimum, isc_task_t *task,
4614		  isc_socketevent_t *event, unsigned int flags)
4615{
4616	isc__socket_t *sock = (isc__socket_t *)sock0;
4617
4618	event->ev_sender = sock;
4619	event->result = ISC_R_UNSET;
4620	ISC_LIST_INIT(event->bufferlist);
4621	event->region = *region;
4622	event->n = 0;
4623	event->offset = 0;
4624	event->attributes = 0;
4625
4626	/*
4627	 * UDP sockets are always partial read.
4628	 */
4629	if (sock->type == isc_sockettype_udp)
4630		event->minimum = 1;
4631	else {
4632		if (minimum == 0)
4633			event->minimum = region->length;
4634		else
4635			event->minimum = minimum;
4636	}
4637
4638	return (socket_recv(sock, event, task, flags));
4639}
4640
4641static isc_result_t
4642socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4643	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4644	    unsigned int flags)
4645{
4646	int io_state;
4647	isc_boolean_t have_lock = ISC_FALSE;
4648	isc_task_t *ntask = NULL;
4649	isc_result_t result = ISC_R_SUCCESS;
4650
4651	dev->ev_sender = task;
4652
4653	set_dev_address(address, sock, dev);
4654	if (pktinfo != NULL) {
4655		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4656		dev->pktinfo = *pktinfo;
4657
4658		if (!isc_sockaddr_issitelocal(&dev->address) &&
4659		    !isc_sockaddr_islinklocal(&dev->address)) {
4660			socket_log(sock, NULL, TRACE, isc_msgcat,
4661				   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4662				   "pktinfo structure provided, ifindex %u "
4663				   "(set to 0)", pktinfo->ipi6_ifindex);
4664
4665			/*
4666			 * Set the pktinfo index to 0 here, to let the
4667			 * kernel decide what interface it should send on.
4668			 */
4669			dev->pktinfo.ipi6_ifindex = 0;
4670		}
4671	}
4672
4673	if (sock->type == isc_sockettype_udp)
4674		io_state = doio_send(sock, dev);
4675	else {
4676		LOCK(&sock->lock);
4677		have_lock = ISC_TRUE;
4678
4679		if (ISC_LIST_EMPTY(sock->send_list))
4680			io_state = doio_send(sock, dev);
4681		else
4682			io_state = DOIO_SOFT;
4683	}
4684
4685	switch (io_state) {
4686	case DOIO_SOFT:
4687		/*
4688		 * We couldn't send all or part of the request right now, so
4689		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
4690		 */
4691		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4692			isc_task_attach(task, &ntask);
4693			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4694
4695			if (!have_lock) {
4696				LOCK(&sock->lock);
4697				have_lock = ISC_TRUE;
4698			}
4699
4700			/*
4701			 * Enqueue the request.  If the socket was previously
4702			 * not being watched, poke the watcher to start
4703			 * paying attention to it.
4704			 */
4705			if (ISC_LIST_EMPTY(sock->send_list) &&
4706			    !sock->pending_send)
4707				select_poke(sock->manager, sock->fd,
4708					    SELECT_POKE_WRITE);
4709			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4710
4711			socket_log(sock, NULL, EVENT, NULL, 0, 0,
4712				   "socket_send: event %p -> task %p",
4713				   dev, ntask);
4714
4715			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4716				result = ISC_R_INPROGRESS;
4717			break;
4718		}
4719
4720	case DOIO_HARD:
4721	case DOIO_SUCCESS:
4722		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4723			send_senddone_event(sock, &dev);
4724		break;
4725	}
4726
4727	if (have_lock)
4728		UNLOCK(&sock->lock);
4729
4730	return (result);
4731}
4732
4733ISC_SOCKETFUNC_SCOPE isc_result_t
4734isc__socket_send(isc_socket_t *sock, isc_region_t *region,
4735		 isc_task_t *task, isc_taskaction_t action, const void *arg)
4736{
4737	/*
4738	 * REQUIRE() checking is performed in isc_socket_sendto().
4739	 */
4740	return (isc__socket_sendto(sock, region, task, action, arg, NULL,
4741				   NULL));
4742}
4743
4744ISC_SOCKETFUNC_SCOPE isc_result_t
4745isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region,
4746		   isc_task_t *task, isc_taskaction_t action, const void *arg,
4747		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4748{
4749	isc__socket_t *sock = (isc__socket_t *)sock0;
4750	isc_socketevent_t *dev;
4751	isc__socketmgr_t *manager;
4752
4753	REQUIRE(VALID_SOCKET(sock));
4754	REQUIRE(region != NULL);
4755	REQUIRE(task != NULL);
4756	REQUIRE(action != NULL);
4757
4758	manager = sock->manager;
4759	REQUIRE(VALID_MANAGER(manager));
4760
4761	INSIST(sock->bound);
4762
4763	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4764	if (dev == NULL)
4765		return (ISC_R_NOMEMORY);
4766
4767	dev->region = *region;
4768
4769	return (socket_send(sock, dev, task, address, pktinfo, 0));
4770}
4771
4772ISC_SOCKETFUNC_SCOPE isc_result_t
4773isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4774		  isc_task_t *task, isc_taskaction_t action, const void *arg)
4775{
4776	return (isc__socket_sendtov(sock, buflist, task, action, arg, NULL,
4777				    NULL));
4778}
4779
4780ISC_SOCKETFUNC_SCOPE isc_result_t
4781isc__socket_sendtov(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4782		    isc_task_t *task, isc_taskaction_t action, const void *arg,
4783		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4784{
4785	isc__socket_t *sock = (isc__socket_t *)sock0;
4786	isc_socketevent_t *dev;
4787	isc__socketmgr_t *manager;
4788	unsigned int iocount;
4789	isc_buffer_t *buffer;
4790
4791	REQUIRE(VALID_SOCKET(sock));
4792	REQUIRE(buflist != NULL);
4793	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4794	REQUIRE(task != NULL);
4795	REQUIRE(action != NULL);
4796
4797	manager = sock->manager;
4798	REQUIRE(VALID_MANAGER(manager));
4799
4800	iocount = isc_bufferlist_usedcount(buflist);
4801	REQUIRE(iocount > 0);
4802
4803	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4804	if (dev == NULL)
4805		return (ISC_R_NOMEMORY);
4806
4807	/*
4808	 * Move each buffer from the passed in list to our internal one.
4809	 */
4810	buffer = ISC_LIST_HEAD(*buflist);
4811	while (buffer != NULL) {
4812		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4813		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4814		buffer = ISC_LIST_HEAD(*buflist);
4815	}
4816
4817	return (socket_send(sock, dev, task, address, pktinfo, 0));
4818}
4819
4820ISC_SOCKETFUNC_SCOPE isc_result_t
4821isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region,
4822		    isc_task_t *task,
4823		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4824		    isc_socketevent_t *event, unsigned int flags)
4825{
4826	isc__socket_t *sock = (isc__socket_t *)sock0;
4827
4828	REQUIRE(VALID_SOCKET(sock));
4829	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4830	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4831		REQUIRE(sock->type == isc_sockettype_udp);
4832	event->ev_sender = sock;
4833	event->result = ISC_R_UNSET;
4834	ISC_LIST_INIT(event->bufferlist);
4835	event->region = *region;
4836	event->n = 0;
4837	event->offset = 0;
4838	event->attributes = 0;
4839
4840	return (socket_send(sock, event, task, address, pktinfo, flags));
4841}
4842
/*
 * Remove the filesystem entry of a UNIX-domain socket address.
 *
 * Addresses of any other family are ignored.  Without
 * ISC_PLATFORM_HAVESYSUNH the function does nothing.
 *
 * active != 0:  the path is removed unconditionally, provided stat()
 *               reports it to be a socket or a FIFO.  Failures are
 *               logged at error level.
 * active == 0:  the path is only removed when it looks stale, i.e. when
 *               it is a socket or FIFO and a connect() attempt on it
 *               fails with ECONNREFUSED or ECONNRESET.  A missing path
 *               (ENOENT) is silently accepted; other failures are logged
 *               at warning level.
 *
 * Nothing is returned; every problem is reported through isc_log_write().
 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX)
		return;

/*
 * Supply S_ISSOCK/S_ISFIFO on platforms whose <sys/stat.h> lacks them,
 * building them from the S_IFMT (or _S_IFMT) mask when available.
 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
#endif
#endif

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
#endif
#endif

/* At least one of the two tests must be available. */
#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif

/* Whichever test is still missing is stubbed out as "never matches". */
#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif

	if (active) {
		/* Unconditional removal, after a file-type sanity check. */
		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/*
	 * Passive mode: open a throw-away socket used only to probe the
	 * path with connect().  It is closed at the 'cleanup' label.
	 */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:    /* We exited cleanly last time */
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
		goto cleanup;
	}

	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: %s: not a socket",
			      sockaddr->type.sunix.sun_path);
		goto cleanup;
	}

	/*
	 * A successful connect() leaves the path alone.  Only a refused
	 * or reset connection is taken as the sign of a stale entry.
	 */
	if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0) {
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
					      ISC_LOGMODULE_SOCKET,
					      ISC_LOG_WARNING,
					      "isc_socket_cleanunix: "
					      "unlink(%s): %s",
					      sockaddr->type.sunix.sun_path,
					      strbuf);
			}
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
 cleanup:
	close(s);
#else
	UNUSED(sockaddr);
	UNUSED(active);
#endif
}
4972
4973ISC_SOCKETFUNC_SCOPE isc_result_t
4974isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4975		    isc_uint32_t owner, isc_uint32_t group)
4976{
4977#ifdef ISC_PLATFORM_HAVESYSUNH
4978	isc_result_t result = ISC_R_SUCCESS;
4979	char strbuf[ISC_STRERRORSIZE];
4980	char path[sizeof(sockaddr->type.sunix.sun_path)];
4981#ifdef NEED_SECURE_DIRECTORY
4982	char *slash;
4983#endif
4984
4985	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4986	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4987	strcpy(path, sockaddr->type.sunix.sun_path);
4988
4989#ifdef NEED_SECURE_DIRECTORY
4990	slash = strrchr(path, '/');
4991	if (slash != NULL) {
4992		if (slash != path)
4993			*slash = '\0';
4994		else
4995			strcpy(path, "/");
4996	} else
4997		strcpy(path, ".");
4998#endif
4999
5000	if (chmod(path, perm) < 0) {
5001		isc__strerror(errno, strbuf, sizeof(strbuf));
5002		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
5003			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
5004			      "isc_socket_permunix: chmod(%s, %d): %s",
5005			      path, perm, strbuf);
5006		result = ISC_R_FAILURE;
5007	}
5008	if (chown(path, owner, group) < 0) {
5009		isc__strerror(errno, strbuf, sizeof(strbuf));
5010		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
5011			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
5012			      "isc_socket_permunix: chown(%s, %d, %d): %s",
5013			      path, owner, group,
5014			      strbuf);
5015		result = ISC_R_FAILURE;
5016	}
5017	return (result);
5018#else
5019	UNUSED(sockaddr);
5020	UNUSED(perm);
5021	UNUSED(owner);
5022	UNUSED(group);
5023	return (ISC_R_NOTIMPLEMENTED);
5024#endif
5025}
5026
5027ISC_SOCKETFUNC_SCOPE isc_result_t
5028isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
5029		 unsigned int options) {
5030	isc__socket_t *sock = (isc__socket_t *)sock0;
5031	char strbuf[ISC_STRERRORSIZE];
5032	int on = 1;
5033
5034	REQUIRE(VALID_SOCKET(sock));
5035
5036	LOCK(&sock->lock);
5037
5038	INSIST(!sock->bound);
5039	INSIST(!sock->dupped);
5040
5041	if (sock->pf != sockaddr->type.sa.sa_family) {
5042		UNLOCK(&sock->lock);
5043		return (ISC_R_FAMILYMISMATCH);
5044	}
5045
5046	/*
5047	 * Only set SO_REUSEADDR when we want a specific port.
5048	 */
5049#ifdef AF_UNIX
5050	if (sock->pf == AF_UNIX)
5051		goto bind_socket;
5052#endif
5053	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
5054	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
5055	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
5056		       sizeof(on)) < 0) {
5057		UNEXPECTED_ERROR(__FILE__, __LINE__,
5058				 "setsockopt(%d) %s", sock->fd,
5059				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
5060						ISC_MSG_FAILED, "failed"));
5061		/* Press on... */
5062	}
5063#ifdef AF_UNIX
5064 bind_socket:
5065#endif
5066	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
5067		inc_stats(sock->manager->stats,
5068			  sock->statsindex[STATID_BINDFAIL]);
5069
5070		UNLOCK(&sock->lock);
5071		switch (errno) {
5072		case EACCES:
5073			return (ISC_R_NOPERM);
5074		case EADDRNOTAVAIL:
5075			return (ISC_R_ADDRNOTAVAIL);
5076		case EADDRINUSE:
5077			return (ISC_R_ADDRINUSE);
5078		case EINVAL:
5079			return (ISC_R_BOUND);
5080		default:
5081			isc__strerror(errno, strbuf, sizeof(strbuf));
5082			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
5083					 strbuf);
5084			return (ISC_R_UNEXPECTED);
5085		}
5086	}
5087
5088	socket_log(sock, sockaddr, TRACE,
5089		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
5090	sock->bound = 1;
5091
5092	UNLOCK(&sock->lock);
5093	return (ISC_R_SUCCESS);
5094}
5095
/*
 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
 */
5101#undef ENABLE_ACCEPTFILTER
5102
5103ISC_SOCKETFUNC_SCOPE isc_result_t
5104isc__socket_filter(isc_socket_t *sock0, const char *filter) {
5105	isc__socket_t *sock = (isc__socket_t *)sock0;
5106#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5107	char strbuf[ISC_STRERRORSIZE];
5108	struct accept_filter_arg afa;
5109#else
5110	UNUSED(sock);
5111	UNUSED(filter);
5112#endif
5113
5114	REQUIRE(VALID_SOCKET(sock));
5115
5116#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5117	bzero(&afa, sizeof(afa));
5118	strncpy(afa.af_name, filter, sizeof(afa.af_name));
5119	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
5120			 &afa, sizeof(afa)) == -1) {
5121		isc__strerror(errno, strbuf, sizeof(strbuf));
5122		socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
5123			   ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
5124			   strbuf);
5125		return (ISC_R_FAILURE);
5126	}
5127	return (ISC_R_SUCCESS);
5128#else
5129	return (ISC_R_NOTIMPLEMENTED);
5130#endif
5131}
5132
5133/*
5134 * Set up to listen on a given socket.  We do this by creating an internal
5135 * event that will be dispatched when the socket has read activity.  The
5136 * watcher will send the internal event to the task when there is a new
5137 * connection.
5138 *
5139 * Unlike in read, we don't preallocate a done event here.  Every time there
5140 * is a new connection we'll have to allocate a new one anyway, so we might
5141 * as well keep things simple rather than having to track them.
5142 */
5143ISC_SOCKETFUNC_SCOPE isc_result_t
5144isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) {
5145	isc__socket_t *sock = (isc__socket_t *)sock0;
5146	char strbuf[ISC_STRERRORSIZE];
5147
5148	REQUIRE(VALID_SOCKET(sock));
5149
5150	LOCK(&sock->lock);
5151
5152	REQUIRE(!sock->listener);
5153	REQUIRE(sock->bound);
5154	REQUIRE(sock->type == isc_sockettype_tcp ||
5155		sock->type == isc_sockettype_unix);
5156
5157	if (backlog == 0)
5158		backlog = SOMAXCONN;
5159
5160	if (listen(sock->fd, (int)backlog) < 0) {
5161		UNLOCK(&sock->lock);
5162		isc__strerror(errno, strbuf, sizeof(strbuf));
5163
5164		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
5165
5166		return (ISC_R_UNEXPECTED);
5167	}
5168
5169	sock->listener = 1;
5170
5171	UNLOCK(&sock->lock);
5172	return (ISC_R_SUCCESS);
5173}
5174
5175/*
5176 * This should try to do aggressive accept() XXXMLG
5177 */
5178ISC_SOCKETFUNC_SCOPE isc_result_t
5179isc__socket_accept(isc_socket_t *sock0,
5180		  isc_task_t *task, isc_taskaction_t action, const void *arg)
5181{
5182	isc__socket_t *sock = (isc__socket_t *)sock0;
5183	isc_socket_newconnev_t *dev;
5184	isc__socketmgr_t *manager;
5185	isc_task_t *ntask = NULL;
5186	isc__socket_t *nsock;
5187	isc_result_t result;
5188	isc_boolean_t do_poke = ISC_FALSE;
5189
5190	REQUIRE(VALID_SOCKET(sock));
5191	manager = sock->manager;
5192	REQUIRE(VALID_MANAGER(manager));
5193
5194	LOCK(&sock->lock);
5195
5196	REQUIRE(sock->listener);
5197
5198	/*
5199	 * Sender field is overloaded here with the task we will be sending
5200	 * this event to.  Just before the actual event is delivered the
5201	 * actual ev_sender will be touched up to be the socket.
5202	 */
5203	dev = (isc_socket_newconnev_t *)
5204		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
5205				   action, arg, sizeof(*dev));
5206	if (dev == NULL) {
5207		UNLOCK(&sock->lock);
5208		return (ISC_R_NOMEMORY);
5209	}
5210	ISC_LINK_INIT(dev, ev_link);
5211
5212	result = allocate_socket(manager, sock->type, &nsock);
5213	if (result != ISC_R_SUCCESS) {
5214		isc_event_free(ISC_EVENT_PTR(&dev));
5215		UNLOCK(&sock->lock);
5216		return (result);
5217	}
5218
5219	/*
5220	 * Attach to socket and to task.
5221	 */
5222	isc_task_attach(task, &ntask);
5223	if (isc_task_exiting(ntask)) {
5224		free_socket(&nsock);
5225		isc_task_detach(&ntask);
5226		isc_event_free(ISC_EVENT_PTR(&dev));
5227		UNLOCK(&sock->lock);
5228		return (ISC_R_SHUTTINGDOWN);
5229	}
5230	nsock->references++;
5231	nsock->statsindex = sock->statsindex;
5232
5233	dev->ev_sender = ntask;
5234	dev->newsocket = (isc_socket_t *)nsock;
5235
5236	/*
5237	 * Poke watcher here.  We still have the socket locked, so there
5238	 * is no race condition.  We will keep the lock for such a short
5239	 * bit of time waking it up now or later won't matter all that much.
5240	 */
5241	if (ISC_LIST_EMPTY(sock->accept_list))
5242		do_poke = ISC_TRUE;
5243
5244	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
5245
5246	if (do_poke)
5247		select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
5248
5249	UNLOCK(&sock->lock);
5250	return (ISC_R_SUCCESS);
5251}
5252
5253ISC_SOCKETFUNC_SCOPE isc_result_t
5254isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
5255		   isc_task_t *task, isc_taskaction_t action, const void *arg)
5256{
5257	isc__socket_t *sock = (isc__socket_t *)sock0;
5258	isc_socket_connev_t *dev;
5259	isc_task_t *ntask = NULL;
5260	isc__socketmgr_t *manager;
5261	int cc;
5262	char strbuf[ISC_STRERRORSIZE];
5263	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
5264
5265	REQUIRE(VALID_SOCKET(sock));
5266	REQUIRE(addr != NULL);
5267	REQUIRE(task != NULL);
5268	REQUIRE(action != NULL);
5269
5270	manager = sock->manager;
5271	REQUIRE(VALID_MANAGER(manager));
5272	REQUIRE(addr != NULL);
5273
5274	if (isc_sockaddr_ismulticast(addr))
5275		return (ISC_R_MULTICAST);
5276
5277	LOCK(&sock->lock);
5278
5279	REQUIRE(!sock->connecting);
5280
5281	dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
5282							ISC_SOCKEVENT_CONNECT,
5283							action,	arg,
5284							sizeof(*dev));
5285	if (dev == NULL) {
5286		UNLOCK(&sock->lock);
5287		return (ISC_R_NOMEMORY);
5288	}
5289	ISC_LINK_INIT(dev, ev_link);
5290
5291	/*
5292	 * Try to do the connect right away, as there can be only one
5293	 * outstanding, and it might happen to complete.
5294	 */
5295	sock->peer_address = *addr;
5296	cc = connect(sock->fd, &addr->type.sa, addr->length);
5297	if (cc < 0) {
5298		/*
5299		 * HP-UX "fails" to connect a UDP socket and sets errno to
5300		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
5301		 * a success and let the user detect it if it's really an error
5302		 * at the time of sending a packet on the socket.
5303		 */
5304		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
5305			cc = 0;
5306			goto success;
5307		}
5308		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
5309			goto queue;
5310
5311		switch (errno) {
5312#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
5313			ERROR_MATCH(EACCES, ISC_R_NOPERM);
5314			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5315			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5316			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5317			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5318#ifdef EHOSTDOWN
5319			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5320#endif
5321			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5322			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5323			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5324			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5325			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5326#undef ERROR_MATCH
5327		}
5328
5329		sock->connected = 0;
5330
5331		isc__strerror(errno, strbuf, sizeof(strbuf));
5332		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
5333		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
5334				 addrbuf, errno, strbuf);
5335
5336		UNLOCK(&sock->lock);
5337		inc_stats(sock->manager->stats,
5338			  sock->statsindex[STATID_CONNECTFAIL]);
5339		isc_event_free(ISC_EVENT_PTR(&dev));
5340		return (ISC_R_UNEXPECTED);
5341
5342	err_exit:
5343		sock->connected = 0;
5344		isc_task_send(task, ISC_EVENT_PTR(&dev));
5345
5346		UNLOCK(&sock->lock);
5347		inc_stats(sock->manager->stats,
5348			  sock->statsindex[STATID_CONNECTFAIL]);
5349		return (ISC_R_SUCCESS);
5350	}
5351
5352	/*
5353	 * If connect completed, fire off the done event.
5354	 */
5355 success:
5356	if (cc == 0) {
5357		sock->connected = 1;
5358		sock->bound = 1;
5359		dev->result = ISC_R_SUCCESS;
5360		isc_task_send(task, ISC_EVENT_PTR(&dev));
5361
5362		UNLOCK(&sock->lock);
5363
5364		inc_stats(sock->manager->stats,
5365			  sock->statsindex[STATID_CONNECT]);
5366
5367		return (ISC_R_SUCCESS);
5368	}
5369
5370 queue:
5371
5372	/*
5373	 * Attach to task.
5374	 */
5375	isc_task_attach(task, &ntask);
5376
5377	sock->connecting = 1;
5378
5379	dev->ev_sender = ntask;
5380
5381	/*
5382	 * Poke watcher here.  We still have the socket locked, so there
5383	 * is no race condition.  We will keep the lock for such a short
5384	 * bit of time waking it up now or later won't matter all that much.
5385	 */
5386	if (sock->connect_ev == NULL)
5387		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
5388
5389	sock->connect_ev = dev;
5390
5391	UNLOCK(&sock->lock);
5392	return (ISC_R_SUCCESS);
5393}
5394
5395/*
5396 * Called when a socket with a pending connect() finishes.
5397 */
5398static void
5399internal_connect(isc_task_t *me, isc_event_t *ev) {
5400	isc__socket_t *sock;
5401	isc_socket_connev_t *dev;
5402	isc_task_t *task;
5403	int cc;
5404	ISC_SOCKADDR_LEN_T optlen;
5405	char strbuf[ISC_STRERRORSIZE];
5406	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5407
5408	UNUSED(me);
5409	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5410
5411	sock = ev->ev_sender;
5412	INSIST(VALID_SOCKET(sock));
5413
5414	LOCK(&sock->lock);
5415
5416	/*
5417	 * When the internal event was sent the reference count was bumped
5418	 * to keep the socket around for us.  Decrement the count here.
5419	 */
5420	INSIST(sock->references > 0);
5421	sock->references--;
5422	if (sock->references == 0) {
5423		UNLOCK(&sock->lock);
5424		destroy(&sock);
5425		return;
5426	}
5427
5428	/*
5429	 * Has this event been canceled?
5430	 */
5431	dev = sock->connect_ev;
5432	if (dev == NULL) {
5433		INSIST(!sock->connecting);
5434		UNLOCK(&sock->lock);
5435		return;
5436	}
5437
5438	INSIST(sock->connecting);
5439	sock->connecting = 0;
5440
5441	/*
5442	 * Get any possible error status here.
5443	 */
5444	optlen = sizeof(cc);
5445	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5446		       (void *)&cc, (void *)&optlen) < 0)
5447		cc = errno;
5448	else
5449		errno = cc;
5450
5451	if (errno != 0) {
5452		/*
5453		 * If the error is EAGAIN, just re-select on this
5454		 * fd and pretend nothing strange happened.
5455		 */
5456		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5457			sock->connecting = 1;
5458			select_poke(sock->manager, sock->fd,
5459				    SELECT_POKE_CONNECT);
5460			UNLOCK(&sock->lock);
5461
5462			return;
5463		}
5464
5465		inc_stats(sock->manager->stats,
5466			  sock->statsindex[STATID_CONNECTFAIL]);
5467
5468		/*
5469		 * Translate other errors into ISC_R_* flavors.
5470		 */
5471		switch (errno) {
5472#define ERROR_MATCH(a, b) case a: dev->result = b; break;
5473			ERROR_MATCH(EACCES, ISC_R_NOPERM);
5474			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5475			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5476			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5477			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5478#ifdef EHOSTDOWN
5479			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5480#endif
5481			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5482			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5483			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5484			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5485			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5486			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5487#undef ERROR_MATCH
5488		default:
5489			dev->result = ISC_R_UNEXPECTED;
5490			isc_sockaddr_format(&sock->peer_address, peerbuf,
5491					    sizeof(peerbuf));
5492			isc__strerror(errno, strbuf, sizeof(strbuf));
5493			UNEXPECTED_ERROR(__FILE__, __LINE__,
5494					 "internal_connect: connect(%s) %s",
5495					 peerbuf, strbuf);
5496		}
5497	} else {
5498		inc_stats(sock->manager->stats,
5499			  sock->statsindex[STATID_CONNECT]);
5500		dev->result = ISC_R_SUCCESS;
5501		sock->connected = 1;
5502		sock->bound = 1;
5503	}
5504
5505	sock->connect_ev = NULL;
5506
5507	UNLOCK(&sock->lock);
5508
5509	task = dev->ev_sender;
5510	dev->ev_sender = sock;
5511	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5512}
5513
5514ISC_SOCKETFUNC_SCOPE isc_result_t
5515isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5516	isc__socket_t *sock = (isc__socket_t *)sock0;
5517	isc_result_t result;
5518
5519	REQUIRE(VALID_SOCKET(sock));
5520	REQUIRE(addressp != NULL);
5521
5522	LOCK(&sock->lock);
5523
5524	if (sock->connected) {
5525		*addressp = sock->peer_address;
5526		result = ISC_R_SUCCESS;
5527	} else {
5528		result = ISC_R_NOTCONNECTED;
5529	}
5530
5531	UNLOCK(&sock->lock);
5532
5533	return (result);
5534}
5535
5536ISC_SOCKETFUNC_SCOPE isc_result_t
5537isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5538	isc__socket_t *sock = (isc__socket_t *)sock0;
5539	ISC_SOCKADDR_LEN_T len;
5540	isc_result_t result;
5541	char strbuf[ISC_STRERRORSIZE];
5542
5543	REQUIRE(VALID_SOCKET(sock));
5544	REQUIRE(addressp != NULL);
5545
5546	LOCK(&sock->lock);
5547
5548	if (!sock->bound) {
5549		result = ISC_R_NOTBOUND;
5550		goto out;
5551	}
5552
5553	result = ISC_R_SUCCESS;
5554
5555	len = sizeof(addressp->type);
5556	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5557		isc__strerror(errno, strbuf, sizeof(strbuf));
5558		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5559				 strbuf);
5560		result = ISC_R_UNEXPECTED;
5561		goto out;
5562	}
5563	addressp->length = (unsigned int)len;
5564
5565 out:
5566	UNLOCK(&sock->lock);
5567
5568	return (result);
5569}
5570
5571/*
5572 * Run through the list of events on this socket, and cancel the ones
5573 * queued for task "task" of type "how".  "how" is a bitmask.
5574 */
5575ISC_SOCKETFUNC_SCOPE void
5576isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
5577	isc__socket_t *sock = (isc__socket_t *)sock0;
5578
5579	REQUIRE(VALID_SOCKET(sock));
5580
5581	/*
5582	 * Quick exit if there is nothing to do.  Don't even bother locking
5583	 * in this case.
5584	 */
5585	if (how == 0)
5586		return;
5587
5588	LOCK(&sock->lock);
5589
5590	/*
5591	 * All of these do the same thing, more or less.
5592	 * Each will:
5593	 *	o If the internal event is marked as "posted" try to
5594	 *	  remove it from the task's queue.  If this fails, mark it
5595	 *	  as canceled instead, and let the task clean it up later.
5596	 *	o For each I/O request for that task of that type, post
5597	 *	  its done event with status of "ISC_R_CANCELED".
5598	 *	o Reset any state needed.
5599	 */
5600	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5601	    && !ISC_LIST_EMPTY(sock->recv_list)) {
5602		isc_socketevent_t      *dev;
5603		isc_socketevent_t      *next;
5604		isc_task_t	       *current_task;
5605
5606		dev = ISC_LIST_HEAD(sock->recv_list);
5607
5608		while (dev != NULL) {
5609			current_task = dev->ev_sender;
5610			next = ISC_LIST_NEXT(dev, ev_link);
5611
5612			if ((task == NULL) || (task == current_task)) {
5613				dev->result = ISC_R_CANCELED;
5614				send_recvdone_event(sock, &dev);
5615			}
5616			dev = next;
5617		}
5618	}
5619
5620	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5621	    && !ISC_LIST_EMPTY(sock->send_list)) {
5622		isc_socketevent_t      *dev;
5623		isc_socketevent_t      *next;
5624		isc_task_t	       *current_task;
5625
5626		dev = ISC_LIST_HEAD(sock->send_list);
5627
5628		while (dev != NULL) {
5629			current_task = dev->ev_sender;
5630			next = ISC_LIST_NEXT(dev, ev_link);
5631
5632			if ((task == NULL) || (task == current_task)) {
5633				dev->result = ISC_R_CANCELED;
5634				send_senddone_event(sock, &dev);
5635			}
5636			dev = next;
5637		}
5638	}
5639
5640	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5641	    && !ISC_LIST_EMPTY(sock->accept_list)) {
5642		isc_socket_newconnev_t *dev;
5643		isc_socket_newconnev_t *next;
5644		isc_task_t	       *current_task;
5645
5646		dev = ISC_LIST_HEAD(sock->accept_list);
5647		while (dev != NULL) {
5648			current_task = dev->ev_sender;
5649			next = ISC_LIST_NEXT(dev, ev_link);
5650
5651			if ((task == NULL) || (task == current_task)) {
5652
5653				ISC_LIST_UNLINK(sock->accept_list, dev,
5654						ev_link);
5655
5656				NEWCONNSOCK(dev)->references--;
5657				free_socket((isc__socket_t **)&dev->newsocket);
5658
5659				dev->result = ISC_R_CANCELED;
5660				dev->ev_sender = sock;
5661				isc_task_sendanddetach(&current_task,
5662						       ISC_EVENT_PTR(&dev));
5663			}
5664
5665			dev = next;
5666		}
5667	}
5668
5669	/*
5670	 * Connecting is not a list.
5671	 */
5672	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5673	    && sock->connect_ev != NULL) {
5674		isc_socket_connev_t    *dev;
5675		isc_task_t	       *current_task;
5676
5677		INSIST(sock->connecting);
5678		sock->connecting = 0;
5679
5680		dev = sock->connect_ev;
5681		current_task = dev->ev_sender;
5682
5683		if ((task == NULL) || (task == current_task)) {
5684			sock->connect_ev = NULL;
5685
5686			dev->result = ISC_R_CANCELED;
5687			dev->ev_sender = sock;
5688			isc_task_sendanddetach(&current_task,
5689					       ISC_EVENT_PTR(&dev));
5690		}
5691	}
5692
5693	UNLOCK(&sock->lock);
5694}
5695
5696ISC_SOCKETFUNC_SCOPE isc_sockettype_t
5697isc__socket_gettype(isc_socket_t *sock0) {
5698	isc__socket_t *sock = (isc__socket_t *)sock0;
5699
5700	REQUIRE(VALID_SOCKET(sock));
5701
5702	return (sock->type);
5703}
5704
5705ISC_SOCKETFUNC_SCOPE isc_boolean_t
5706isc__socket_isbound(isc_socket_t *sock0) {
5707	isc__socket_t *sock = (isc__socket_t *)sock0;
5708	isc_boolean_t val;
5709
5710	REQUIRE(VALID_SOCKET(sock));
5711
5712	LOCK(&sock->lock);
5713	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5714	UNLOCK(&sock->lock);
5715
5716	return (val);
5717}
5718
5719ISC_SOCKETFUNC_SCOPE void
5720isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
5721	isc__socket_t *sock = (isc__socket_t *)sock0;
5722#if defined(IPV6_V6ONLY)
5723	int onoff = yes ? 1 : 0;
5724#else
5725	UNUSED(yes);
5726	UNUSED(sock);
5727#endif
5728
5729	REQUIRE(VALID_SOCKET(sock));
5730	INSIST(!sock->dupped);
5731
5732#ifdef IPV6_V6ONLY
5733	if (sock->pf == AF_INET6) {
5734		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5735			       (void *)&onoff, sizeof(int)) < 0) {
5736			char strbuf[ISC_STRERRORSIZE];
5737			isc__strerror(errno, strbuf, sizeof(strbuf));
5738			UNEXPECTED_ERROR(__FILE__, __LINE__,
5739					 "setsockopt(%d, IPV6_V6ONLY) "
5740					 "%s: %s", sock->fd,
5741					 isc_msgcat_get(isc_msgcat,
5742							ISC_MSGSET_GENERAL,
5743							ISC_MSG_FAILED,
5744							"failed"),
5745					 strbuf);
5746		}
5747	}
5748	FIX_IPV6_RECVPKTINFO(sock);	/* AIX */
5749#endif
5750}
5751
5752#ifndef USE_WATCHER_THREAD
5753/*
5754 * In our assumed scenario, we can simply use a single static object.
5755 * XXX: this is not true if the application uses multiple threads with
5756 *      'multi-context' mode.  Fixing this is a future TODO item.
5757 */
5758static isc_socketwait_t swait_private;
5759
5760int
5761isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
5762			  isc_socketwait_t **swaitp)
5763{
5764	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5765
5766
5767	int n;
5768#ifdef USE_KQUEUE
5769	struct timespec ts, *tsp;
5770#endif
5771#ifdef USE_EPOLL
5772	int timeout;
5773#endif
5774#ifdef USE_DEVPOLL
5775	struct dvpoll dvp;
5776#endif
5777
5778	REQUIRE(swaitp != NULL && *swaitp == NULL);
5779
5780#ifdef USE_SHARED_MANAGER
5781	if (manager == NULL)
5782		manager = socketmgr;
5783#endif
5784	if (manager == NULL)
5785		return (0);
5786
5787#ifdef USE_KQUEUE
5788	if (tvp != NULL) {
5789		ts.tv_sec = tvp->tv_sec;
5790		ts.tv_nsec = tvp->tv_usec * 1000;
5791		tsp = &ts;
5792	} else
5793		tsp = NULL;
5794	swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
5795				       manager->events, manager->nevents,
5796				       tsp);
5797	n = swait_private.nevents;
5798#elif defined(USE_EPOLL)
5799	if (tvp != NULL)
5800		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5801	else
5802		timeout = -1;
5803	swait_private.nevents = epoll_wait(manager->epoll_fd,
5804					   manager->events,
5805					   manager->nevents, timeout);
5806	n = swait_private.nevents;
5807#elif defined(USE_DEVPOLL)
5808	dvp.dp_fds = manager->events;
5809	dvp.dp_nfds = manager->nevents;
5810	if (tvp != NULL) {
5811		dvp.dp_timeout = tvp->tv_sec * 1000 +
5812			(tvp->tv_usec + 999) / 1000;
5813	} else
5814		dvp.dp_timeout = -1;
5815	swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
5816	n = swait_private.nevents;
5817#elif defined(USE_SELECT)
5818	memcpy(manager->read_fds_copy, manager->read_fds,  manager->fd_bufsize);
5819	memcpy(manager->write_fds_copy, manager->write_fds,
5820	       manager->fd_bufsize);
5821
5822	swait_private.readset = manager->read_fds_copy;
5823	swait_private.writeset = manager->write_fds_copy;
5824	swait_private.maxfd = manager->maxfd + 1;
5825
5826	n = select(swait_private.maxfd, swait_private.readset,
5827		   swait_private.writeset, NULL, tvp);
5828#endif
5829
5830	*swaitp = &swait_private;
5831	return (n);
5832}
5833
5834isc_result_t
5835isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
5836	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5837
5838	REQUIRE(swait == &swait_private);
5839
5840#ifdef USE_SHARED_MANAGER
5841	if (manager == NULL)
5842		manager = socketmgr;
5843#endif
5844	if (manager == NULL)
5845		return (ISC_R_NOTFOUND);
5846
5847#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5848	(void)process_fds(manager, manager->events, swait->nevents);
5849	return (ISC_R_SUCCESS);
5850#elif defined(USE_SELECT)
5851	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
5852	return (ISC_R_SUCCESS);
5853#endif
5854}
5855#endif /* USE_WATCHER_THREAD */
5856
5857#ifdef BIND9
5858void
5859isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
5860	isc__socket_t *socket = (isc__socket_t *)socket0;
5861
5862	/*
5863	 * Name 'socket'.
5864	 */
5865
5866	REQUIRE(VALID_SOCKET(socket));
5867
5868	LOCK(&socket->lock);
5869	memset(socket->name, 0, sizeof(socket->name));
5870	strncpy(socket->name, name, sizeof(socket->name) - 1);
5871	socket->tag = tag;
5872	UNLOCK(&socket->lock);
5873}
5874
5875ISC_SOCKETFUNC_SCOPE const char *
5876isc__socket_getname(isc_socket_t *socket0) {
5877	isc__socket_t *socket = (isc__socket_t *)socket0;
5878
5879	return (socket->name);
5880}
5881
5882void *
5883isc__socket_gettag(isc_socket_t *socket0) {
5884	isc__socket_t *socket = (isc__socket_t *)socket0;
5885
5886	return (socket->tag);
5887}
5888#endif	/* BIND9 */
5889
5890#ifdef USE_SOCKETIMPREGISTER
5891isc_result_t
5892isc__socket_register() {
5893	return (isc_socket_register(isc__socketmgr_create));
5894}
5895#endif
5896
5897ISC_SOCKETFUNC_SCOPE int
5898isc__socket_getfd(isc_socket_t *socket0) {
5899	isc__socket_t *socket = (isc__socket_t *)socket0;
5900
5901	return ((short) socket->fd);
5902}
5903
5904#if defined(HAVE_LIBXML2) && defined(BIND9)
5905
5906static const char *
5907_socktype(isc_sockettype_t type)
5908{
5909	if (type == isc_sockettype_udp)
5910		return ("udp");
5911	else if (type == isc_sockettype_tcp)
5912		return ("tcp");
5913	else if (type == isc_sockettype_unix)
5914		return ("unix");
5915	else if (type == isc_sockettype_fdwatch)
5916		return ("fdwatch");
5917	else
5918		return ("not-initialized");
5919}
5920
5921ISC_SOCKETFUNC_SCOPE void
5922isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) {
5923	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
5924	isc__socket_t *sock;
5925	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5926	isc_sockaddr_t addr;
5927	ISC_SOCKADDR_LEN_T len;
5928
5929	LOCK(&mgr->lock);
5930
5931#ifdef USE_SHARED_MANAGER
5932	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5933	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5934	xmlTextWriterEndElement(writer);
5935#endif	/* USE_SHARED_MANAGER */
5936
5937	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5938	sock = ISC_LIST_HEAD(mgr->socklist);
5939	while (sock != NULL) {
5940		LOCK(&sock->lock);
5941		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5942
5943		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5944		xmlTextWriterWriteFormatString(writer, "%p", sock);
5945		xmlTextWriterEndElement(writer);
5946
5947		if (sock->name[0] != 0) {
5948			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5949			xmlTextWriterWriteFormatString(writer, "%s",
5950						       sock->name);
5951			xmlTextWriterEndElement(writer); /* name */
5952		}
5953
5954		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5955		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5956		xmlTextWriterEndElement(writer);
5957
5958		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5959					  ISC_XMLCHAR _socktype(sock->type));
5960
5961		if (sock->connected) {
5962			isc_sockaddr_format(&sock->peer_address, peerbuf,
5963					    sizeof(peerbuf));
5964			xmlTextWriterWriteElement(writer,
5965						  ISC_XMLCHAR "peer-address",
5966						  ISC_XMLCHAR peerbuf);
5967		}
5968
5969		len = sizeof(addr);
5970		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5971			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5972			xmlTextWriterWriteElement(writer,
5973						  ISC_XMLCHAR "local-address",
5974						  ISC_XMLCHAR peerbuf);
5975		}
5976
5977		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5978		if (sock->pending_recv)
5979			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5980						ISC_XMLCHAR "pending-receive");
5981		if (sock->pending_send)
5982			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5983						  ISC_XMLCHAR "pending-send");
5984		if (sock->pending_accept)
5985			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5986						 ISC_XMLCHAR "pending_accept");
5987		if (sock->listener)
5988			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5989						  ISC_XMLCHAR "listener");
5990		if (sock->connected)
5991			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5992						  ISC_XMLCHAR "connected");
5993		if (sock->connecting)
5994			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5995						  ISC_XMLCHAR "connecting");
5996		if (sock->bound)
5997			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5998						  ISC_XMLCHAR "bound");
5999
6000		xmlTextWriterEndElement(writer); /* states */
6001
6002		xmlTextWriterEndElement(writer); /* socket */
6003
6004		UNLOCK(&sock->lock);
6005		sock = ISC_LIST_NEXT(sock, link);
6006	}
6007	xmlTextWriterEndElement(writer); /* sockets */
6008
6009	UNLOCK(&mgr->lock);
6010}
6011#endif /* HAVE_LIBXML2 */
6012