1/*	$NetBSD: socket.c,v 1.2 2011/08/16 04:45:17 christos Exp $	*/
2
3/*
4 * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
5 * Copyright (C) 1998-2003  Internet Software Consortium.
6 *
7 * Permission to use, copy, modify, and/or distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
12 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
13 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
14 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
16 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
17 * PERFORMANCE OF THIS SOFTWARE.
18 */
19
20/* Id: socket.c,v 1.308.12.8 2009/04/18 01:29:26 jinmei Exp */
21
22/*! \file */
23
24#include <config.h>
25
26#include <sys/param.h>
27#include <sys/types.h>
28#include <sys/socket.h>
29#include <sys/stat.h>
30#include <sys/time.h>
31#include <sys/uio.h>
32
33#include <errno.h>
34#include <fcntl.h>
35#include <stddef.h>
36#include <stdlib.h>
37#include <string.h>
38#include <unistd.h>
39
40#include <isc/buffer.h>
41#include <isc/bufferlist.h>
42#include <isc/condition.h>
43#include <isc/formatcheck.h>
44#include <isc/list.h>
45#include <isc/log.h>
46#include <isc/mem.h>
47#include <isc/msgs.h>
48#include <isc/mutex.h>
49#include <isc/net.h>
50#include <isc/once.h>
51#include <isc/platform.h>
52#include <isc/print.h>
53#include <isc/region.h>
54#include <isc/socket.h>
55#include <isc/stats.h>
56#include <isc/strerror.h>
57#include <isc/task.h>
58#include <isc/thread.h>
59#include <isc/util.h>
60#include <isc/xml.h>
61
62#ifdef ISC_PLATFORM_HAVESYSUNH
63#include <sys/un.h>
64#endif
65#ifdef ISC_PLATFORM_HAVEKQUEUE
66#include <sys/event.h>
67#endif
68#ifdef ISC_PLATFORM_HAVEEPOLL
69#include <sys/epoll.h>
70#endif
71#ifdef ISC_PLATFORM_HAVEDEVPOLL
72#include <sys/devpoll.h>
73#endif
74
75#include "errno2result.h"
76
77#ifndef ISC_PLATFORM_USETHREADS
78#include "socket_p.h"
79#endif /* ISC_PLATFORM_USETHREADS */
80
81#if defined(SO_BSDCOMPAT) && defined(__linux__)
82#include <sys/utsname.h>
83#endif
84
85/*%
86 * Choose the most preferable multiplex method.
87 */
88#ifdef ISC_PLATFORM_HAVEKQUEUE
89#define USE_KQUEUE
90#elif defined (ISC_PLATFORM_HAVEEPOLL)
91#define USE_EPOLL
92#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
93#define USE_DEVPOLL
/*
 * Per-FD /dev/poll bookkeeping: records which event directions this FD
 * is currently registered for, so watch_fd()/unwatch_fd() can restore
 * the other direction when /dev/poll's per-FD removal cancels both.
 */
typedef struct {
	unsigned int want_read : 1,
		want_write : 1;
} pollinfo_t;
98#else
99#define USE_SELECT
100#endif	/* ISC_PLATFORM_HAVEKQUEUE */
101
102#ifndef ISC_PLATFORM_USETHREADS
103#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int nevents;	/* events returned by the last kernel wait call */
};
#elif defined (USE_SELECT)
struct isc_socketwait {
	fd_set *readset;	/* snapshot of FDs ready for read */
	fd_set *writeset;	/* snapshot of FDs ready for write */
	int nfds;		/* select() return value */
	int maxfd;		/* highest FD + 1 passed to select() */
};
114#endif	/* USE_KQUEUE */
115#endif /* !ISC_PLATFORM_USETHREADS */
116
117/*%
118 * Maximum number of allowable open sockets.  This is also the maximum
119 * allowable socket file descriptor.
120 *
121 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system default
123 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
124 * the vast majority of cases.  This constant should therefore be increased only
125 * when absolutely necessary and possible, i.e., the server is exhausting all
126 * available file descriptors (up to FD_SETSIZE) and the select() function
127 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
129 * portability as possible).  Note also that overall server performance
130 * may be rather worsened with a larger value of this constant due to
131 * inherent scalability problems of select().
132 *
133 * As a special note, this value shouldn't have to be touched if
134 * this is a build for an authoritative only DNS server.
135 */
136#ifndef ISC_SOCKET_MAXSOCKETS
137#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
138#define ISC_SOCKET_MAXSOCKETS 4096
139#elif defined(USE_SELECT)
140#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
141#endif	/* USE_KQUEUE... */
142#endif	/* ISC_SOCKET_MAXSOCKETS */
143
144#ifdef USE_SELECT
145/*%
146 * Mac OS X needs a special definition to support larger values in select().
147 * We always define this because a larger value can be specified run-time.
148 */
149#ifdef __APPLE__
150#define _DARWIN_UNLIMITED_SELECT
151#endif	/* __APPLE__ */
152#endif	/* USE_SELECT */
153
154#ifdef ISC_SOCKET_USE_POLLWATCH
155/*%
156 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
157 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
158 * some of the specified FD.  The idea is based on the observation that it's
159 * likely for a busy server to keep receiving packets.  It specifically works
160 * as follows: the socket watcher is first initialized with the state of
161 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
162 * event occurs.  When it wakes up for a socket I/O event, it moves to the
163 * poll_active state, and sets the poll timeout to a short period
164 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
165 * watcher goes to the poll_checking state with the same timeout period.
166 * In this state, the watcher tries to detect whether this is a break
167 * during intermittent events or the kernel bug is triggered.  If the next
168 * polling reports an event within the short period, the previous timeout is
169 * likely to be a kernel bug, and so the watcher goes back to the active state.
170 * Otherwise, it moves to the idle state again.
171 *
172 * It's not clear whether this is a thread-related bug, but since we've only
173 * seen this with threads, this workaround is used only when enabling threads.
174 */
175
/* Watcher state machine for the /dev/poll workaround described above. */
typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
177
178#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
179#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
180#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
181#endif	/* ISC_SOCKET_USE_POLLWATCH */
182
183/*%
184 * Size of per-FD lock buckets.
185 */
186#ifdef ISC_PLATFORM_USETHREADS
187#define FDLOCK_COUNT		1024
188#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
189#else
190#define FDLOCK_COUNT		1
191#define FDLOCK_ID(fd)		0
192#endif	/* ISC_PLATFORM_USETHREADS */
193
194/*%
195 * Maximum number of events communicated with the kernel.  There should normally
196 * be no need for having a large number.
197 */
198#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
199#ifndef ISC_SOCKET_MAXEVENTS
200#define ISC_SOCKET_MAXEVENTS	64
201#endif
202#endif
203
204/*%
205 * Some systems define the socket length argument as an int, some as size_t,
206 * some as socklen_t.  This is here so it can be easily changed if needed.
207 */
208#ifndef ISC_SOCKADDR_LEN_T
209#define ISC_SOCKADDR_LEN_T unsigned int
210#endif
211
212/*%
213 * Define what the possible "soft" errors can be.  These are non-fatal returns
214 * of various network related functions, like recv() and so on.
215 *
216 * For some reason, BSDI (and perhaps others) will sometimes return <0
217 * from recv() but will have errno==0.  This is broken, but we have to
218 * work around it here.
219 */
220#define SOFT_ERROR(e)	((e) == EAGAIN || \
221			 (e) == EWOULDBLOCK || \
222			 (e) == EINTR || \
223			 (e) == 0)
224
225#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
226
227/*!<
228 * DLVL(90)  --  Function entry/exit and other tracing.
229 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
230 * DLVL(60)  --  Socket data send/receive
231 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
232 * DLVL(20)  --  Socket creation/destruction.
233 */
234#define TRACE_LEVEL		90
235#define CORRECTNESS_LEVEL	70
236#define IOEVENT_LEVEL		60
237#define EVENT_LEVEL		50
238#define CREATION_LEVEL		20
239
240#define TRACE		DLVL(TRACE_LEVEL)
241#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
242#define IOEVENT		DLVL(IOEVENT_LEVEL)
243#define EVENT		DLVL(EVENT_LEVEL)
244#define CREATION	DLVL(CREATION_LEVEL)
245
246typedef isc_event_t intev_t;
247
248#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
249#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
250
251/*!
252 * IPv6 control information.  If the socket is an IPv6 socket we want
253 * to collect the destination address and interface so the client can
254 * set them on outgoing packets.
255 */
256#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
257#ifndef USE_CMSG
258#define USE_CMSG	1
259#endif
260#endif
261
262/*%
263 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
264 * a setsockopt() like interface to request timestamps, and if the OS
265 * doesn't do it for us, call gettimeofday() on every UDP receive?
266 */
267#ifdef SO_TIMESTAMP
268#ifndef USE_CMSG
269#define USE_CMSG	1
270#endif
271#endif
272
273/*%
274 * The size to raise the receive buffer to (from BIND 8).
275 */
276#define RCVBUFSIZE (32*1024)
277
278/*%
279 * The number of times a send operation is repeated if the result is EINTR.
280 */
281#define NRETRIES 10
282
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;		/* SOCKET_MAGIC when valid */
	isc_socketmgr_t	       *manager;	/* owning manager */
	isc_mutex_t		lock;
	isc_sockettype_t	type;
	/* Points at one of the *statsindex tables below, selected by type/pf. */
	const isc_statscounter_t	*statsindex;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;		/* linkage on manager->socklist */
	unsigned int		references;
	int			fd;		/* OS descriptor, -1-free slot index into manager->fds */
	int			pf;		/* protocol family (e.g. AF_INET6) */
	char				name[16];
	void *				tag;

	/* Pending I/O completion events, queued in arrival order. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;  /* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */

#ifdef ISC_NET_RECVOVERFLOW
	unsigned char		overflow; /* used for MSG_TRUNC fake */
#endif

	/* Pre-sized ancillary-data buffers for recvmsg()/sendmsg(). */
	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;

	/* fdwatch-type sockets: user callback, its argument, and task. */
	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
};
336
337#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
338#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
339
struct isc_socketmgr {
	/* Not locked. */
	unsigned int		magic;		/* SOCKET_MANAGER_MAGIC when valid */
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
	isc_mutex_t		*fdlock;	/* FDLOCK_COUNT bucket locks; see FDLOCK_ID() */
	isc_stats_t		*stats;
#ifdef USE_KQUEUE
	int			kqueue_fd;	/* kqueue() descriptor */
	int			nevents;	/* capacity of events[] */
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;	/* epoll_create() descriptor */
	int			nevents;	/* capacity of events[] */
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;	/* /dev/poll descriptor */
	int			nevents;	/* capacity of events[] */
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
#ifdef USE_SELECT
	int			fd_bufsize;	/* byte size of each fd_set buffer */
#endif	/* USE_SELECT */
	unsigned int		maxsocks;	/* fds[]/fdstate[] array bound */
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];	/* watcher wakeup pipe; see select_poke() */
#endif

	/* Locked by fdlock. */
	isc_socket_t	       **fds;		/* FD -> socket map */
	int			*fdstate;	/* FD -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;	/* FD -> current /dev/poll registration */
#endif

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)	socklist;
#ifdef USE_SELECT
	fd_set			*read_fds;
	fd_set			*read_fds_copy;	/* scratch copy passed to select() */
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
	int			maxfd;
#endif	/* USE_SELECT */
	int			reserved;	/* unlocked */
#ifdef ISC_PLATFORM_USETHREADS
	isc_thread_t		watcher;	/* dedicated watcher thread */
	isc_condition_t		shutdown_ok;
#else /* ISC_PLATFORM_USETHREADS */
	unsigned int		refs;
#endif /* ISC_PLATFORM_USETHREADS */
};
394
395#ifndef ISC_PLATFORM_USETHREADS
396static isc_socketmgr_t *socketmgr = NULL;
397#endif /* ISC_PLATFORM_USETHREADS */
398
399#define CLOSED			0	/* this one must be zero */
400#define MANAGED			1
401#define CLOSE_PENDING		2
402
403/*
404 * send() and recv() iovec counts
405 */
406#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
407#ifdef ISC_NET_RECVOVERFLOW
408# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
409#else
410# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
411#endif
412
413static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
414static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
415static void free_socket(isc_socket_t **);
416static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
417				    isc_socket_t **);
418static void destroy(isc_socket_t **);
419static void internal_accept(isc_task_t *, isc_event_t *);
420static void internal_connect(isc_task_t *, isc_event_t *);
421static void internal_recv(isc_task_t *, isc_event_t *);
422static void internal_send(isc_task_t *, isc_event_t *);
423static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
424static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
425static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
426static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
427			      struct msghdr *, struct iovec *, size_t *);
428static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
429			      struct msghdr *, struct iovec *, size_t *);
430#ifdef ISC_PLATFORM_USETHREADS
431static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
432#endif
433
434#define SELECT_POKE_SHUTDOWN		(-1)
435#define SELECT_POKE_NOTHING		(-2)
436#define SELECT_POKE_READ		(-3)
437#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
438#define SELECT_POKE_WRITE		(-4)
439#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
440#define SELECT_POKE_CLOSE		(-5)
441
442#define SOCK_DEAD(s)			((s)->references == 0)
443
444/*%
445 * Shortcut index arrays to get access to statistics counters.
446 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9
};
/*
 * Counter-id tables indexed by the STATID_* values above.  A -1 entry
 * means the counter does not apply to that socket type (e.g. UDP has
 * no accept counters); inc_stats() REQUIREs it is never passed -1.
 *
 * NOTE(review): "upd4"/"upd6" look like typos for "udp4"/"udp6"; the
 * names are kept as-is because later code in this file refers to them.
 */
static const isc_statscounter_t upd4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail
};
static const isc_statscounter_t upd6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
	isc_sockstatscounter_tcp4recvfail
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
	isc_sockstatscounter_tcp6recvfail
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
	isc_sockstatscounter_unixrecvfail
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
	isc_sockstatscounter_fdwatchrecvfail
};
531
532static void
533manager_log(isc_socketmgr_t *sockmgr,
534	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
535	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
536static void
537manager_log(isc_socketmgr_t *sockmgr,
538	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
539	    const char *fmt, ...)
540{
541	char msgbuf[2048];
542	va_list ap;
543
544	if (! isc_log_wouldlog(isc_lctx, level))
545		return;
546
547	va_start(ap, fmt);
548	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
549	va_end(ap);
550
551	isc_log_write(isc_lctx, category, module, level,
552		      "sockmgr %p: %s", sockmgr, msgbuf);
553}
554
555static void
556socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
557	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
558	   isc_msgcat_t *msgcat, int msgset, int message,
559	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
560static void
561socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
562	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
563	   isc_msgcat_t *msgcat, int msgset, int message,
564	   const char *fmt, ...)
565{
566	char msgbuf[2048];
567	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
568	va_list ap;
569
570	if (! isc_log_wouldlog(isc_lctx, level))
571		return;
572
573	va_start(ap, fmt);
574	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
575	va_end(ap);
576
577	if (address == NULL) {
578		isc_log_iwrite(isc_lctx, category, module, level,
579			       msgcat, msgset, message,
580			       "socket %p: %s", sock, msgbuf);
581	} else {
582		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
583		isc_log_iwrite(isc_lctx, category, module, level,
584			       msgcat, msgset, message,
585			       "socket %p %s: %s", sock, peerbuf, msgbuf);
586	}
587}
588
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 */
static void
FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
{
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	/* Only IPv6 UDP sockets are affected; others need no fixup. */
	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	/* Re-assert IPV6_RECVPKTINFO; log (but otherwise ignore) failure. */
	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
	}
}
#else
/* On unaffected platforms the fixup is a no-op macro. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
620
621/*%
622 * Increment socket-related statistics counters.
623 */
624static inline void
625inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
626	REQUIRE(counterid != -1);
627
628	if (stats != NULL)
629		isc_stats_increment(stats, counterid);
630}
631
/*%
 * Register 'fd' with the platform event mechanism for readability
 * (msg == SELECT_POKE_READ) or writability (any other value; in
 * practice SELECT_POKE_WRITE).  Exactly one of the four #ifdef branches
 * below is compiled in.  Returns ISC_R_SUCCESS or an errno-derived
 * result code.
 */
static inline isc_result_t
watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	event.data.fd = fd;
	/* EEXIST just means the FD is already registered; not an error. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	/*
	 * The write() and the fdpollinfo bookkeeping must be atomic with
	 * respect to unwatch_fd(), hence the per-FD bucket lock.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	/* The select() fd_sets are shared state guarded by the manager lock. */
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}
698
/*%
 * Deregister 'fd' from the platform event mechanism for the direction
 * named by 'msg' (SELECT_POKE_READ or SELECT_POKE_WRITE).  Counterpart
 * of watch_fd(); exactly one #ifdef branch is compiled in.
 */
static inline isc_result_t
unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	event.data.fd = fd;
	/* ENOENT just means the FD was not registered; not an error. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	/* One write delivers both the POLLREMOVE and any re-registration. */
	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}
784
/*%
 * Act on a poke for 'fd': close it (SELECT_POKE_CLOSE), or start
 * watching it for the direction encoded in 'msg', unless the FD is
 * being closed or is no longer managed.
 */
static void
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		/* FD already closed or never managed; nothing to do. */
		UNLOCK(&manager->fdlock[lockid]);
		return;
	}
	UNLOCK(&manager->fdlock[lockid]);

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}
846
847#ifdef ISC_PLATFORM_USETHREADS
848/*
849 * Poke the select loop when there is something for us to do.
850 * The write is required (by POSIX) to complete.  That is, we
851 * will not get partial writes.
852 */
853static void
854select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
855	int cc;
856	int buf[2];
857	char strbuf[ISC_STRERRORSIZE];
858
859	buf[0] = fd;
860	buf[1] = msg;
861
862	do {
863		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
864#ifdef ENOSR
865		/*
866		 * Treat ENOSR as EAGAIN but loop slowly as it is
867		 * unlikely to clear fast.
868		 */
869		if (cc < 0 && errno == ENOSR) {
870			sleep(1);
871			errno = EAGAIN;
872		}
873#endif
874	} while (cc < 0 && SOFT_ERROR(errno));
875
876	if (cc < 0) {
877		isc__strerror(errno, strbuf, sizeof(strbuf));
878		FATAL_ERROR(__FILE__, __LINE__,
879			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
880					   ISC_MSG_WRITEFAILED,
881					   "write() failed "
882					   "during watcher poke: %s"),
883			    strbuf);
884	}
885
886	INSIST(cc == sizeof(buf));
887}
888
889/*
890 * Read a message on the internal fd.
891 */
892static void
893select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
894	int buf[2];
895	int cc;
896	char strbuf[ISC_STRERRORSIZE];
897
898	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
899	if (cc < 0) {
900		*msg = SELECT_POKE_NOTHING;
901		*fd = -1;	/* Silence compiler. */
902		if (SOFT_ERROR(errno))
903			return;
904
905		isc__strerror(errno, strbuf, sizeof(strbuf));
906		FATAL_ERROR(__FILE__, __LINE__,
907			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
908					   ISC_MSG_READFAILED,
909					   "read() failed "
910					   "during watcher poke: %s"),
911			    strbuf);
912
913		return;
914	}
915	INSIST(cc == sizeof(buf));
916
917	*fd = buf[0];
918	*msg = buf[1];
919}
920#else /* ISC_PLATFORM_USETHREADS */
921/*
922 * Update the state of the socketmgr when something changes.
923 */
924static void
925select_poke(isc_socketmgr_t *manager, int fd, int msg) {
926	if (msg == SELECT_POKE_SHUTDOWN)
927		return;
928	else if (fd >= 0)
929		wakeup_socket(manager, fd, msg);
930	return;
931}
932#endif /* ISC_PLATFORM_USETHREADS */
933
934/*
935 * Make a fd non-blocking.
936 */
937static isc_result_t
938make_nonblock(int fd) {
939	int ret;
940	int flags;
941	char strbuf[ISC_STRERRORSIZE];
942#ifdef USE_FIONBIO_IOCTL
943	int on = 1;
944
945	ret = ioctl(fd, FIONBIO, (char *)&on);
946#else
947	flags = fcntl(fd, F_GETFL, 0);
948	flags |= PORT_NONBLOCK;
949	ret = fcntl(fd, F_SETFL, flags);
950#endif
951
952	if (ret == -1) {
953		isc__strerror(errno, strbuf, sizeof(strbuf));
954		UNEXPECTED_ERROR(__FILE__, __LINE__,
955#ifdef USE_FIONBIO_IOCTL
956				 "ioctl(%d, FIONBIO, &on): %s", fd,
957#else
958				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
959#endif
960				 strbuf);
961
962		return (ISC_R_UNEXPECTED);
963	}
964
965	return (ISC_R_SUCCESS);
966}
967
#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	ISC_SOCKADDR_LEN_T hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif
}
991
/*
 * Portable CMSG_SPACE(): the padded space a control message of data
 * length 'len' occupies.  Without CMSG_SPACE it is derived by laying a
 * cmsghdr into a dummy buffer and measuring where CMSG_NXTHDR lands.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL)
		return ((char *)cmsgp - (char *)msg.msg_control);
	else
		/* 'len' too large for the dummy buffer; report no space. */
		return (0);
#endif
}
1019#endif /* USE_CMSG */
1020
1021/*
1022 * Process control messages received on a socket.
1023 */
1024static void
1025process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1026#ifdef USE_CMSG
1027	struct cmsghdr *cmsgp;
1028#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1029	struct in6_pktinfo *pktinfop;
1030#endif
1031#ifdef SO_TIMESTAMP
1032	struct timeval *timevalp;
1033#endif
1034#endif
1035
1036	/*
1037	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1038	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1039	 * They are all here, outside of the CPP tests, because it is
1040	 * more consistent with the usual ISC coding style.
1041	 */
1042	UNUSED(sock);
1043	UNUSED(msg);
1044	UNUSED(dev);
1045
1046#ifdef ISC_NET_BSD44MSGHDR
1047
1048#ifdef MSG_TRUNC
1049	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1050		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1051#endif
1052
1053#ifdef MSG_CTRUNC
1054	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1055		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1056#endif
1057
1058#ifndef USE_CMSG
1059	return;
1060#else
1061	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1062		return;
1063
1064#ifdef SO_TIMESTAMP
1065	timevalp = NULL;
1066#endif
1067#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1068	pktinfop = NULL;
1069#endif
1070
1071	cmsgp = CMSG_FIRSTHDR(msg);
1072	while (cmsgp != NULL) {
1073		socket_log(sock, NULL, TRACE,
1074			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1075			   "processing cmsg %p", cmsgp);
1076
1077#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1078		if (cmsgp->cmsg_level == IPPROTO_IPV6
1079		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
1080
1081			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1082			memcpy(&dev->pktinfo, pktinfop,
1083			       sizeof(struct in6_pktinfo));
1084			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1085			socket_log(sock, NULL, TRACE,
1086				   isc_msgcat, ISC_MSGSET_SOCKET,
1087				   ISC_MSG_IFRECEIVED,
1088				   "interface received on ifindex %u",
1089				   dev->pktinfo.ipi6_ifindex);
1090			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1091				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1092			goto next;
1093		}
1094#endif
1095
1096#ifdef SO_TIMESTAMP
1097		if (cmsgp->cmsg_level == SOL_SOCKET
1098		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1099			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1100			dev->timestamp.seconds = timevalp->tv_sec;
1101			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1102			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1103			goto next;
1104		}
1105#endif
1106
1107	next:
1108		cmsgp = CMSG_NXTHDR(msg, cmsgp);
1109	}
1110#endif /* USE_CMSG */
1111
1112#endif /* ISC_NET_BSD44MSGHDR */
1113}
1114
1115/*
1116 * Construct an iov array and attach it to the msghdr passed in.  This is
1117 * the SEND constructor, which will use the used region of the buffer
1118 * (if using a buffer list) or will use the internal region (if a single
1119 * buffer I/O is requested).
1120 *
1121 * Nothing can be NULL, and the done event must list at least one buffer
1122 * on the buffer linked list for this function to be meaningful.
1123 *
1124 * If write_countp != NULL, *write_countp will hold the number of bytes
1125 * this transaction can send.
1126 */
static void
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;

	memset(msg, 0, sizeof(*msg));

	/*
	 * Only an unconnected socket needs an explicit destination;
	 * sendmsg() on a connected socket must have a NULL name.
	 */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		/* dev->n bytes of dev->region were sent previously. */
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	/*
	 * Build one iovec per remaining non-empty buffer; the first one
	 * starts skip_count bytes in (the partially-sent buffer).
	 */
	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0U);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	/*
	 * For UDP replies carrying pktinfo, attach an IPV6_PKTINFO
	 * control message so the kernel sends from the interface and
	 * source address the query arrived on.
	 */
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
		msg->msg_control = (void *)sock->sendcmsgbuf;

		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIN6PKTINFO */
#else /* ISC_NET_BSD44MSGHDR */
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
#endif /* ISC_NET_BSD44MSGHDR */

	if (write_countp != NULL)
		*write_countp = write_count;
}
1233
1234/*
1235 * Construct an iov array and attach it to the msghdr passed in.  This is
1236 * the RECV constructor, which will use the available region of the buffer
1237 * (if using a buffer list) or will use the internal region (if a single
1238 * buffer I/O is requested).
1239 *
1240 * Nothing can be NULL, and the done event must list at least one buffer
1241 * on the buffer linked list for this function to be meaningful.
1242 *
1243 * If read_countp != NULL, *read_countp will hold the number of bytes
1244 * this transaction can receive.
1245 */
1246static void
1247build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1248		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1249{
1250	unsigned int iovcount;
1251	isc_buffer_t *buffer;
1252	isc_region_t available;
1253	size_t read_count;
1254
1255	memset(msg, 0, sizeof(struct msghdr));
1256
1257	if (sock->type == isc_sockettype_udp) {
1258		memset(&dev->address, 0, sizeof(dev->address));
1259#ifdef BROKEN_RECVMSG
1260		if (sock->pf == AF_INET) {
1261			msg->msg_name = (void *)&dev->address.type.sin;
1262			msg->msg_namelen = sizeof(dev->address.type.sin6);
1263		} else if (sock->pf == AF_INET6) {
1264			msg->msg_name = (void *)&dev->address.type.sin6;
1265			msg->msg_namelen = sizeof(dev->address.type.sin6);
1266#ifdef ISC_PLATFORM_HAVESYSUNH
1267		} else if (sock->pf == AF_UNIX) {
1268			msg->msg_name = (void *)&dev->address.type.sunix;
1269			msg->msg_namelen = sizeof(dev->address.type.sunix);
1270#endif
1271		} else {
1272			msg->msg_name = (void *)&dev->address.type.sa;
1273			msg->msg_namelen = sizeof(dev->address.type);
1274		}
1275#else
1276		msg->msg_name = (void *)&dev->address.type.sa;
1277		msg->msg_namelen = sizeof(dev->address.type);
1278#endif
1279#ifdef ISC_NET_RECVOVERFLOW
1280		/* If needed, steal one iovec for overflow detection. */
1281		maxiov--;
1282#endif
1283	} else { /* TCP */
1284		msg->msg_name = NULL;
1285		msg->msg_namelen = 0;
1286		dev->address = sock->peer_address;
1287	}
1288
1289	buffer = ISC_LIST_HEAD(dev->bufferlist);
1290	read_count = 0;
1291
1292	/*
1293	 * Single buffer I/O?  Skip what we've done so far in this region.
1294	 */
1295	if (buffer == NULL) {
1296		read_count = dev->region.length - dev->n;
1297		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1298		iov[0].iov_len = read_count;
1299		iovcount = 1;
1300
1301		goto config;
1302	}
1303
1304	/*
1305	 * Multibuffer I/O.
1306	 * Skip empty buffers.
1307	 */
1308	while (buffer != NULL) {
1309		REQUIRE(ISC_BUFFER_VALID(buffer));
1310		if (isc_buffer_availablelength(buffer) != 0)
1311			break;
1312		buffer = ISC_LIST_NEXT(buffer, link);
1313	}
1314
1315	iovcount = 0;
1316	while (buffer != NULL) {
1317		INSIST(iovcount < MAXSCATTERGATHER_RECV);
1318
1319		isc_buffer_availableregion(buffer, &available);
1320
1321		if (available.length > 0) {
1322			iov[iovcount].iov_base = (void *)(available.base);
1323			iov[iovcount].iov_len = available.length;
1324			read_count += available.length;
1325			iovcount++;
1326		}
1327		buffer = ISC_LIST_NEXT(buffer, link);
1328	}
1329
1330 config:
1331
1332	/*
1333	 * If needed, set up to receive that one extra byte.  Note that
1334	 * we know there is at least one iov left, since we stole it
1335	 * at the top of this function.
1336	 */
1337#ifdef ISC_NET_RECVOVERFLOW
1338	if (sock->type == isc_sockettype_udp) {
1339		iov[iovcount].iov_base = (void *)(&sock->overflow);
1340		iov[iovcount].iov_len = 1;
1341		iovcount++;
1342	}
1343#endif
1344
1345	msg->msg_iov = iov;
1346	msg->msg_iovlen = iovcount;
1347
1348#ifdef ISC_NET_BSD44MSGHDR
1349	msg->msg_control = NULL;
1350	msg->msg_controllen = 0;
1351	msg->msg_flags = 0;
1352#if defined(USE_CMSG)
1353	if (sock->type == isc_sockettype_udp) {
1354		msg->msg_control = sock->recvcmsgbuf;
1355		msg->msg_controllen = sock->recvcmsgbuflen;
1356	}
1357#endif /* USE_CMSG */
1358#else /* ISC_NET_BSD44MSGHDR */
1359	msg->msg_accrights = NULL;
1360	msg->msg_accrightslen = 0;
1361#endif /* ISC_NET_BSD44MSGHDR */
1362
1363	if (read_countp != NULL)
1364		*read_countp = read_count;
1365}
1366
1367static void
1368set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1369		isc_socketevent_t *dev)
1370{
1371	if (sock->type == isc_sockettype_udp) {
1372		if (address != NULL)
1373			dev->address = *address;
1374		else
1375			dev->address = sock->peer_address;
1376	} else if (sock->type == isc_sockettype_tcp) {
1377		INSIST(address == NULL);
1378		dev->address = sock->peer_address;
1379	}
1380}
1381
1382static void
1383destroy_socketevent(isc_event_t *event) {
1384	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1385
1386	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1387
1388	(ev->destroy)(event);
1389}
1390
1391static isc_socketevent_t *
1392allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1393		     isc_taskaction_t action, const void *arg)
1394{
1395	isc_socketevent_t *ev;
1396
1397	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1398						     sock, eventtype,
1399						     action, arg,
1400						     sizeof(*ev));
1401
1402	if (ev == NULL)
1403		return (NULL);
1404
1405	ev->result = ISC_R_UNEXPECTED;
1406	ISC_LINK_INIT(ev, ev_link);
1407	ISC_LIST_INIT(ev->bufferlist);
1408	ev->region.base = NULL;
1409	ev->n = 0;
1410	ev->offset = 0;
1411	ev->attributes = 0;
1412	ev->destroy = ev->ev_destroy;
1413	ev->ev_destroy = destroy_socketevent;
1414
1415	return (ev);
1416}
1417
1418#if defined(ISC_SOCKET_DEBUG)
static void
dump_msg(struct msghdr *msg) {
	/*
	 * Debug-only helper: dump every field of a msghdr, including
	 * each scatter/gather entry, to stdout.
	 */
	unsigned int idx = 0;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov,
	       (long) msg->msg_iovlen);
	while (idx < (unsigned int)msg->msg_iovlen) {
		printf("\t\t%d\tbase %p, len %ld\n", idx,
		       msg->msg_iov[idx].iov_base,
		       (long) msg->msg_iov[idx].iov_len);
		idx++;
	}
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
1437#endif
1438
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

/*
 * Attempt one recvmsg() for the pending receive described by 'dev'.
 * Returns one of the DOIO_* codes above; on DOIO_HARD/DOIO_SUCCESS,
 * dev->result is set accordingly.  DOIO_SOFT means "retry later"
 * (would-block, transient ICMP-driven error on an unconnected socket,
 * source-port-zero drop, or a short read below dev->minimum).
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; logging below may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_DOIORECV,
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * SOFT_OR_HARD: hard error only if the socket is connected (the error
 * then definitely refers to our peer); soft otherwise, since on an
 * unconnected UDP socket it may be stale ICMP noise from any peer.
 * ALWAYS_HARD: unconditionally fatal for this operation.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
#ifdef EPROTO
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
#endif
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything not classified above: map errno generically. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0))
		return (DOIO_EOF);

	if (sock->type == isc_sockettype_udp) {
		/* Kernel wrote the source address; record its length. */
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_ZEROPORT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	/* Distribute the cc received bytes across the buffer list in order. */
	while (buffer != NULL && actual_count > 0U) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1604
1605/*
1606 * Returns:
1607 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1608 *			ISC_R_SUCCESS.
1609 *
1610 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1611 *			dev->result contains the appropriate error.
1612 *
1613 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1614 *			event was sent.  The operation should be retried.
1615 *
1616 *	No other return values are possible.
1617 */
static int
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;	/* EINTR retry counter, capped at NRETRIES */
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);

 resend:
	cc = sendmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may clobber it. */
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry a bounded number of times on signal interruption. */
		if (send_errno == EINTR && ++attempts < NRETRIES)
			goto resend;

		if (SOFT_ERROR(send_errno))
			return (DOIO_SOFT);

/*
 * SOFT_OR_HARD: hard error only when connected (error definitely
 * refers to our peer); soft otherwise.  ALWAYS_HARD: always fatal
 * for this operation.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (send_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (send_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		isc__strerror(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	/* A zero-byte send is unexpected; log it but fall through. */
	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() %s 0",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_RETURNED, "returned"));
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1722
1723/*
1724 * Kill.
1725 *
1726 * Caller must ensure that the socket is not locked and no external
1727 * references exist.
1728 */
static void
closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
	isc_sockettype_t type = sock->type;
	int lockid = FDLOCK_ID(fd);

	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&manager->fdlock[lockid]);
	manager->fds[fd] = NULL;
	/*
	 * fdwatch sockets are closed by their owner, so we can mark the
	 * fd CLOSED right away; for our own sockets the watcher thread
	 * performs the close, hence CLOSE_PENDING.
	 */
	if (type == isc_sockettype_fdwatch)
		manager->fdstate[fd] = CLOSED;
	else
		manager->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&manager->fdlock[lockid]);
	if (type == isc_sockettype_fdwatch) {
		/*
		 * The caller may close the socket once this function returns,
		 * and `fd' may be reassigned for a new socket.  So we do
		 * unwatch_fd() here, rather than defer it via select_poke().
		 * Note: this may complicate data protection among threads and
		 * may reduce performance due to additional locks.  One way to
		 * solve this would be to dup() the watched descriptor, but we
		 * take a simpler approach at this moment.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
	} else
		select_poke(manager, fd, SELECT_POKE_CLOSE);

	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&manager->lock);
	if (manager->maxfd == fd) {
		int i;

		/* Scan downward for the next managed fd. */
		manager->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&manager->fdlock[lockid]);
			if (manager->fdstate[i] == MANAGED) {
				manager->maxfd = i;
				UNLOCK(&manager->fdlock[lockid]);
				break;
			}
			UNLOCK(&manager->fdlock[lockid]);
		}
#ifdef ISC_PLATFORM_USETHREADS
		/* The internal wakeup pipe must always stay watched. */
		if (manager->maxfd < manager->pipe_fds[0])
			manager->maxfd = manager->pipe_fds[0];
#endif
	}
	UNLOCK(&manager->lock);
#endif	/* USE_SELECT */
}
1791
/*
 * Tear down a socket with no remaining references or queued I/O:
 * close its fd (if still open), unlink it from the manager, and free
 * all of its memory.  Caller must not hold the socket lock.
 */
static void
destroy(isc_socket_t **sockp) {
	int fd;
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DESTROYING, "destroying");

	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);

	/* Mark the fd closed before handing it to closesocket(). */
	if (sock->fd >= 0) {
		fd = sock->fd;
		sock->fd = -1;
		closesocket(manager, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

#ifdef ISC_PLATFORM_USETHREADS
	/* Wake a shutdown waiter once the last socket is gone. */
	if (ISC_LIST_EMPTY(manager->socklist))
		SIGNAL(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */

	UNLOCK(&manager->lock);

	free_socket(sockp);
}
1826
1827static isc_result_t
1828allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1829		isc_socket_t **socketp)
1830{
1831	isc_socket_t *sock;
1832	isc_result_t result;
1833	ISC_SOCKADDR_LEN_T cmsgbuflen;
1834
1835	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1836
1837	if (sock == NULL)
1838		return (ISC_R_NOMEMORY);
1839
1840	result = ISC_R_UNEXPECTED;
1841
1842	sock->magic = 0;
1843	sock->references = 0;
1844
1845	sock->manager = manager;
1846	sock->type = type;
1847	sock->fd = -1;
1848	sock->statsindex = NULL;
1849
1850	ISC_LINK_INIT(sock, link);
1851
1852	sock->recvcmsgbuf = NULL;
1853	sock->sendcmsgbuf = NULL;
1854
1855	/*
1856	 * set up cmsg buffers
1857	 */
1858	cmsgbuflen = 0;
1859#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1860	cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1861#endif
1862#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1863	cmsgbuflen += cmsg_space(sizeof(struct timeval));
1864#endif
1865	sock->recvcmsgbuflen = cmsgbuflen;
1866	if (sock->recvcmsgbuflen != 0U) {
1867		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1868		if (sock->recvcmsgbuf == NULL)
1869			goto error;
1870	}
1871
1872	cmsgbuflen = 0;
1873#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1874	cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1875#endif
1876	sock->sendcmsgbuflen = cmsgbuflen;
1877	if (sock->sendcmsgbuflen != 0U) {
1878		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1879		if (sock->sendcmsgbuf == NULL)
1880			goto error;
1881	}
1882
1883	memset(sock->name, 0, sizeof(sock->name));
1884	sock->tag = NULL;
1885
1886	/*
1887	 * set up list of readers and writers to be initially empty
1888	 */
1889	ISC_LIST_INIT(sock->recv_list);
1890	ISC_LIST_INIT(sock->send_list);
1891	ISC_LIST_INIT(sock->accept_list);
1892	sock->connect_ev = NULL;
1893	sock->pending_recv = 0;
1894	sock->pending_send = 0;
1895	sock->pending_accept = 0;
1896	sock->listener = 0;
1897	sock->connected = 0;
1898	sock->connecting = 0;
1899	sock->bound = 0;
1900
1901	/*
1902	 * initialize the lock
1903	 */
1904	result = isc_mutex_init(&sock->lock);
1905	if (result != ISC_R_SUCCESS) {
1906		sock->magic = 0;
1907		goto error;
1908	}
1909
1910	/*
1911	 * Initialize readable and writable events
1912	 */
1913	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1914		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1915		       NULL, sock, sock, NULL, NULL);
1916	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1917		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1918		       NULL, sock, sock, NULL, NULL);
1919
1920	sock->magic = SOCKET_MAGIC;
1921	*socketp = sock;
1922
1923	return (ISC_R_SUCCESS);
1924
1925 error:
1926	if (sock->recvcmsgbuf != NULL)
1927		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1928			    sock->recvcmsgbuflen);
1929	if (sock->sendcmsgbuf != NULL)
1930		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1931			    sock->sendcmsgbuflen);
1932	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1933
1934	return (result);
1935}
1936
1937/*
1938 * This event requires that the various lists be empty, that the reference
1939 * count be 1, and that the magic number is valid.  The other socket bits,
1940 * like the lock, must be initialized as well.  The fd associated must be
1941 * marked as closed, by setting it to -1 on close, or this routine will
1942 * also close the socket.
1943 */
1944static void
1945free_socket(isc_socket_t **socketp) {
1946	isc_socket_t *sock = *socketp;
1947
1948	INSIST(sock->references == 0);
1949	INSIST(VALID_SOCKET(sock));
1950	INSIST(!sock->connecting);
1951	INSIST(!sock->pending_recv);
1952	INSIST(!sock->pending_send);
1953	INSIST(!sock->pending_accept);
1954	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1955	INSIST(ISC_LIST_EMPTY(sock->send_list));
1956	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1957	INSIST(!ISC_LINK_LINKED(sock, link));
1958
1959	if (sock->recvcmsgbuf != NULL)
1960		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1961			    sock->recvcmsgbuflen);
1962	if (sock->sendcmsgbuf != NULL)
1963		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1964			    sock->sendcmsgbuflen);
1965
1966	sock->magic = 0;
1967
1968	DESTROYLOCK(&sock->lock);
1969
1970	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1971
1972	*socketp = NULL;
1973}
1974
1975#ifdef SO_BSDCOMPAT
1976/*
1977 * This really should not be necessary to do.  Having to workout
1978 * which kernel version we are on at run time so that we don't cause
1979 * the kernel to issue a warning about us using a deprecated socket option.
1980 * Such warnings should *never* be on by default in production kernels.
1981 *
1982 * We can't do this a build time because executables are moved between
1983 * machines and hence kernels.
1984 *
1985 * We can't just not set SO_BSDCOMAT because some kernels require it.
1986 */
1987
1988static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
1989isc_boolean_t bsdcompat = ISC_TRUE;
1990
1991static void
1992clear_bsdcompat(void) {
1993#ifdef __linux__
1994	 struct utsname buf;
1995	 char *endp;
1996	 long int major;
1997	 long int minor;
1998
1999	 uname(&buf);    /* Can only fail if buf is bad in Linux. */
2000
2001	 /* Paranoia in parsing can be increased, but we trust uname(). */
2002	 major = strtol(buf.release, &endp, 10);
2003	 if (*endp == '.') {
2004		minor = strtol(endp+1, &endp, 10);
2005		if ((major > 2) || ((major == 2) && (minor >= 4))) {
2006			bsdcompat = ISC_FALSE;
2007		}
2008	 }
2009#endif /* __linux __ */
2010}
2011#endif
2012
2013static isc_result_t
2014opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
2015	char strbuf[ISC_STRERRORSIZE];
2016	const char *err = "socket";
2017	int tries = 0;
2018#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2019	int on = 1;
2020#endif
2021#if defined(SO_RCVBUF)
2022	ISC_SOCKADDR_LEN_T optlen;
2023	int size;
2024#endif
2025
2026 again:
2027	switch (sock->type) {
2028	case isc_sockettype_udp:
2029		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2030		break;
2031	case isc_sockettype_tcp:
2032		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2033		break;
2034	case isc_sockettype_unix:
2035		sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2036		break;
2037	case isc_sockettype_fdwatch:
2038		/*
2039		 * We should not be called for isc_sockettype_fdwatch sockets.
2040		 */
2041		INSIST(0);
2042		break;
2043	}
2044	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2045		goto again;
2046
2047#ifdef F_DUPFD
2048	/*
2049	 * Leave a space for stdio and TCP to work in.
2050	 */
2051	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2052	    sock->fd >= 0 && sock->fd < manager->reserved) {
2053		int new, tmp;
2054		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2055		tmp = errno;
2056		(void)close(sock->fd);
2057		errno = tmp;
2058		sock->fd = new;
2059		err = "isc_socket_create: fcntl/reserved";
2060	} else if (sock->fd >= 0 && sock->fd < 20) {
2061		int new, tmp;
2062		new = fcntl(sock->fd, F_DUPFD, 20);
2063		tmp = errno;
2064		(void)close(sock->fd);
2065		errno = tmp;
2066		sock->fd = new;
2067		err = "isc_socket_create: fcntl";
2068	}
2069#endif
2070
2071	if (sock->fd >= (int)manager->maxsocks) {
2072		(void)close(sock->fd);
2073		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2074			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2075			       isc_msgcat, ISC_MSGSET_SOCKET,
2076			       ISC_MSG_TOOMANYFDS,
2077			       "socket: file descriptor exceeds limit (%d/%u)",
2078			       sock->fd, manager->maxsocks);
2079		return (ISC_R_NORESOURCES);
2080	}
2081
2082	if (sock->fd < 0) {
2083		switch (errno) {
2084		case EMFILE:
2085		case ENFILE:
2086			isc__strerror(errno, strbuf, sizeof(strbuf));
2087			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2088				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2089				       isc_msgcat, ISC_MSGSET_SOCKET,
2090				       ISC_MSG_TOOMANYFDS,
2091				       "%s: %s", err, strbuf);
2092			/* fallthrough */
2093		case ENOBUFS:
2094			return (ISC_R_NORESOURCES);
2095
2096		case EPROTONOSUPPORT:
2097		case EPFNOSUPPORT:
2098		case EAFNOSUPPORT:
2099		/*
2100		 * Linux 2.2 (and maybe others) return EINVAL instead of
2101		 * EAFNOSUPPORT.
2102		 */
2103		case EINVAL:
2104			return (ISC_R_FAMILYNOSUPPORT);
2105
2106		default:
2107			isc__strerror(errno, strbuf, sizeof(strbuf));
2108			UNEXPECTED_ERROR(__FILE__, __LINE__,
2109					 "%s() %s: %s", err,
2110					 isc_msgcat_get(isc_msgcat,
2111							ISC_MSGSET_GENERAL,
2112							ISC_MSG_FAILED,
2113							"failed"),
2114					 strbuf);
2115			return (ISC_R_UNEXPECTED);
2116		}
2117	}
2118
2119	if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
2120		(void)close(sock->fd);
2121		return (ISC_R_UNEXPECTED);
2122	}
2123
2124#ifdef SO_BSDCOMPAT
2125	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2126				  clear_bsdcompat) == ISC_R_SUCCESS);
2127	if (sock->type != isc_sockettype_unix && bsdcompat &&
2128	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2129		       (void *)&on, sizeof(on)) < 0) {
2130		isc__strerror(errno, strbuf, sizeof(strbuf));
2131		UNEXPECTED_ERROR(__FILE__, __LINE__,
2132				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2133				 sock->fd,
2134				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2135						ISC_MSG_FAILED, "failed"),
2136				 strbuf);
2137		/* Press on... */
2138	}
2139#endif
2140
2141#ifdef SO_NOSIGPIPE
2142	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2143		       (void *)&on, sizeof(on)) < 0) {
2144		isc__strerror(errno, strbuf, sizeof(strbuf));
2145		UNEXPECTED_ERROR(__FILE__, __LINE__,
2146				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2147				 sock->fd,
2148				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2149						ISC_MSG_FAILED, "failed"),
2150				 strbuf);
2151		/* Press on... */
2152	}
2153#endif
2154
2155#if defined(USE_CMSG) || defined(SO_RCVBUF)
2156	if (sock->type == isc_sockettype_udp) {
2157
2158#if defined(USE_CMSG)
2159#if defined(SO_TIMESTAMP)
2160		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2161			       (void *)&on, sizeof(on)) < 0
2162		    && errno != ENOPROTOOPT) {
2163			isc__strerror(errno, strbuf, sizeof(strbuf));
2164			UNEXPECTED_ERROR(__FILE__, __LINE__,
2165					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2166					 sock->fd,
2167					 isc_msgcat_get(isc_msgcat,
2168							ISC_MSGSET_GENERAL,
2169							ISC_MSG_FAILED,
2170							"failed"),
2171					 strbuf);
2172			/* Press on... */
2173		}
2174#endif /* SO_TIMESTAMP */
2175
2176#if defined(ISC_PLATFORM_HAVEIPV6)
2177		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2178			/*
2179			 * Warn explicitly because this anomaly can be hidden
2180			 * in usual operation (and unexpectedly appear later).
2181			 */
2182			UNEXPECTED_ERROR(__FILE__, __LINE__,
2183					 "No buffer available to receive "
2184					 "IPv6 destination");
2185		}
2186#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2187#ifdef IPV6_RECVPKTINFO
2188		/* RFC 3542 */
2189		if ((sock->pf == AF_INET6)
2190		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2191				   (void *)&on, sizeof(on)) < 0)) {
2192			isc__strerror(errno, strbuf, sizeof(strbuf));
2193			UNEXPECTED_ERROR(__FILE__, __LINE__,
2194					 "setsockopt(%d, IPV6_RECVPKTINFO) "
2195					 "%s: %s", sock->fd,
2196					 isc_msgcat_get(isc_msgcat,
2197							ISC_MSGSET_GENERAL,
2198							ISC_MSG_FAILED,
2199							"failed"),
2200					 strbuf);
2201		}
2202#else
2203		/* RFC 2292 */
2204		if ((sock->pf == AF_INET6)
2205		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2206				   (void *)&on, sizeof(on)) < 0)) {
2207			isc__strerror(errno, strbuf, sizeof(strbuf));
2208			UNEXPECTED_ERROR(__FILE__, __LINE__,
2209					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2210					 sock->fd,
2211					 isc_msgcat_get(isc_msgcat,
2212							ISC_MSGSET_GENERAL,
2213							ISC_MSG_FAILED,
2214							"failed"),
2215					 strbuf);
2216		}
2217#endif /* IPV6_RECVPKTINFO */
2218#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2219#ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2220		/* use minimum MTU */
2221		if (sock->pf == AF_INET6) {
2222			(void)setsockopt(sock->fd, IPPROTO_IPV6,
2223					 IPV6_USE_MIN_MTU,
2224					 (void *)&on, sizeof(on));
2225		}
2226#endif
2227#endif /* ISC_PLATFORM_HAVEIPV6 */
2228#endif /* defined(USE_CMSG) */
2229
2230#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2231		/*
2232		 * Turn off Path MTU discovery on IPv4/UDP sockets.
2233		 */
2234		if (sock->pf == AF_INET) {
2235			int action = IP_PMTUDISC_DONT;
2236			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2237					 &action, sizeof(action));
2238		}
2239#endif
2240#if defined(IP_DONTFRAG)
2241		/*
2242		 * Turn off Path MTU discovery on IPv4/UDP sockets.
2243		 */
2244		if (sock->pf == AF_INET) {
2245			int off = 0;
2246			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2247					 &off, sizeof(off));
2248		}
2249#endif
2250
2251#if defined(SO_RCVBUF)
2252		optlen = sizeof(size);
2253		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2254			       (void *)&size, &optlen) >= 0 &&
2255		     size < RCVBUFSIZE) {
2256			size = RCVBUFSIZE;
2257			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2258				       (void *)&size, sizeof(size)) == -1) {
2259				isc__strerror(errno, strbuf, sizeof(strbuf));
2260				UNEXPECTED_ERROR(__FILE__, __LINE__,
2261					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2262					sock->fd, size,
2263					isc_msgcat_get(isc_msgcat,
2264						       ISC_MSGSET_GENERAL,
2265						       ISC_MSG_FAILED,
2266						       "failed"),
2267					strbuf);
2268			}
2269		}
2270#endif
2271	}
2272#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2273
2274	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2275
2276	return (ISC_R_SUCCESS);
2277}
2278
2279/*%
2280 * Create a new 'type' socket managed by 'manager'.  Events
2281 * will be posted to 'task' and when dispatched 'action' will be
2282 * called with 'arg' as the arg value.  The new socket is returned
2283 * in 'socketp'.
2284 */
isc_result_t
isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		  isc_socket_t **socketp)
{
	isc_socket_t *sock = NULL;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	/* fdwatch sockets are created via isc_socket_fdwatchcreate(). */
	REQUIRE(type != isc_sockettype_fdwatch);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/*
	 * Select the statistics counter set for this socket type and,
	 * for UDP/TCP, the protocol family.
	 * NOTE(review): "upd[46]statsindex" looks like a typo for "udp",
	 * but the tables are declared elsewhere under these names.
	 */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex =
			(pf == AF_INET) ? upd4statsindex : upd6statsindex;
		break;
	case isc_sockettype_tcp:
		sock->statsindex =
			(pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	default:
		INSIST(0);
	}

	sock->pf = pf;
	result = opensocket(manager, sock);
	if (result != ISC_R_SUCCESS) {
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		free_socket(&sock);
		return (result);
	}

	sock->references = 1;
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the new fd with the manager under the per-fd lock. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	/* select() needs to be told the highest fd in use. */
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "created");

	return (ISC_R_SUCCESS);
}
2356
/*
 * (Re)open a previously closed socket on the same manager.  The caller
 * must hold the only reference, and sock->fd must be -1 (as left by
 * isc_socket_close()).
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
	isc_result_t result;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	UNLOCK(&sock->lock);
	/*
	 * We don't need to retain the lock hereafter, since no one else has
	 * this socket.
	 */
	REQUIRE(sock->fd == -1);

	result = opensocket(sock->manager, sock);
	if (result != ISC_R_SUCCESS)
		sock->fd = -1;

	if (result == ISC_R_SUCCESS) {
		int lockid = FDLOCK_ID(sock->fd);

		/* Register the fd with the manager, as in isc_socket_create(). */
		LOCK(&sock->manager->fdlock[lockid]);
		sock->manager->fds[sock->fd] = sock;
		sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
		INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
		       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
		UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
		/* Keep select()'s high-water fd mark up to date. */
		LOCK(&sock->manager->lock);
		if (sock->manager->maxfd < sock->fd)
			sock->manager->maxfd = sock->fd;
		UNLOCK(&sock->manager->lock);
#endif
	}

	return (result);
}
2399
2400/*
2401 * Create a new 'type' socket managed by 'manager'.  Events
2402 * will be posted to 'task' and when dispatched 'action' will be
2403 * called with 'arg' as the arg value.  The new socket is returned
2404 * in 'socketp'.
2405 */
isc_result_t
isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			 isc_sockfdwatch_t callback, void *cbarg,
			 isc_task_t *task, isc_socket_t **socketp)
{
	isc_socket_t *sock = NULL;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/* Adopt the caller's descriptor and record the callback state. */
	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;
	sock->statsindex = fdwatchstatsindex;

	sock->references = 1;
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the fd with the manager under the per-fd lock. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Start watching immediately for the requested conditions. */
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "fdwatch-created");

	return (ISC_R_SUCCESS);
}
2461
2462/*
2463 * Attach to a socket.  Caller must explicitly detach when it is done.
2464 */
void
isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(socketp != NULL && *socketp == NULL);

	/* The reference count is protected by the socket lock. */
	LOCK(&sock->lock);
	sock->references++;
	UNLOCK(&sock->lock);

	*socketp = sock;
}
2476
2477/*
2478 * Dereference a socket.  If this is the last reference to it, clean things
2479 * up by destroying the socket.
2480 */
2481void
2482isc_socket_detach(isc_socket_t **socketp) {
2483	isc_socket_t *sock;
2484	isc_boolean_t kill_socket = ISC_FALSE;
2485
2486	REQUIRE(socketp != NULL);
2487	sock = *socketp;
2488	REQUIRE(VALID_SOCKET(sock));
2489
2490	LOCK(&sock->lock);
2491	REQUIRE(sock->references > 0);
2492	sock->references--;
2493	if (sock->references == 0)
2494		kill_socket = ISC_TRUE;
2495	UNLOCK(&sock->lock);
2496
2497	if (kill_socket)
2498		destroy(&sock);
2499
2500	*socketp = NULL;
2501}
2502
isc_result_t
isc_socket_close(isc_socket_t *sock) {
	int fd;
	isc_socketmgr_t *manager;
	isc_sockettype_t type;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	/* Only a quiescent, singly-referenced, non-fdwatch socket may close. */
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(sock->connect_ev == NULL);

	/*
	 * Reset the per-connection state so the handle can be reused via
	 * isc_socket_open(); fd = -1 marks it as closed.
	 * NOTE(review): 'type' is saved here but not used below in this
	 * function -- confirm whether it is still needed.
	 */
	manager = sock->manager;
	type = sock->type;
	fd = sock->fd;
	sock->fd = -1;
	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;
	isc_sockaddr_any(&sock->peer_address);

	UNLOCK(&sock->lock);

	/* Close the descriptor after releasing the socket lock. */
	closesocket(manager, sock, fd);

	return (ISC_R_SUCCESS);
}
2544
2545/*
2546 * I/O is possible on a given socket.  Schedule an event to this task that
2547 * will call an internal function to do the I/O.  This will charge the
2548 * task with the I/O operation and let our select loop handler get back
2549 * to doing something real as fast as possible.
2550 *
2551 * The socket and manager must be locked before calling this function.
2552 */
2553static void
2554dispatch_recv(isc_socket_t *sock) {
2555	intev_t *iev;
2556	isc_socketevent_t *ev;
2557	isc_task_t *sender;
2558
2559	INSIST(!sock->pending_recv);
2560
2561	if (sock->type != isc_sockettype_fdwatch) {
2562		ev = ISC_LIST_HEAD(sock->recv_list);
2563		if (ev == NULL)
2564			return;
2565		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2566			   "dispatch_recv:  event %p -> task %p",
2567			   ev, ev->ev_sender);
2568		sender = ev->ev_sender;
2569	} else {
2570		sender = sock->fdwatchtask;
2571	}
2572
2573	sock->pending_recv = 1;
2574	iev = &sock->readable_ev;
2575
2576	sock->references++;
2577	iev->ev_sender = sock;
2578	if (sock->type == isc_sockettype_fdwatch)
2579		iev->ev_action = internal_fdwatch_read;
2580	else
2581		iev->ev_action = internal_recv;
2582	iev->ev_arg = sock;
2583
2584	isc_task_send(sender, (isc_event_t **)&iev);
2585}
2586
2587static void
2588dispatch_send(isc_socket_t *sock) {
2589	intev_t *iev;
2590	isc_socketevent_t *ev;
2591	isc_task_t *sender;
2592
2593	INSIST(!sock->pending_send);
2594
2595	if (sock->type != isc_sockettype_fdwatch) {
2596		ev = ISC_LIST_HEAD(sock->send_list);
2597		if (ev == NULL)
2598			return;
2599		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2600			   "dispatch_send:  event %p -> task %p",
2601			   ev, ev->ev_sender);
2602		sender = ev->ev_sender;
2603	} else {
2604		sender = sock->fdwatchtask;
2605	}
2606
2607	sock->pending_send = 1;
2608	iev = &sock->writable_ev;
2609
2610	sock->references++;
2611	iev->ev_sender = sock;
2612	if (sock->type == isc_sockettype_fdwatch)
2613		iev->ev_action = internal_fdwatch_write;
2614	else
2615		iev->ev_action = internal_send;
2616	iev->ev_arg = sock;
2617
2618	isc_task_send(sender, (isc_event_t **)&iev);
2619}
2620
2621/*
2622 * Dispatch an internal accept event.
2623 */
2624static void
2625dispatch_accept(isc_socket_t *sock) {
2626	intev_t *iev;
2627	isc_socket_newconnev_t *ev;
2628
2629	INSIST(!sock->pending_accept);
2630
2631	/*
2632	 * Are there any done events left, or were they all canceled
2633	 * before the manager got the socket lock?
2634	 */
2635	ev = ISC_LIST_HEAD(sock->accept_list);
2636	if (ev == NULL)
2637		return;
2638
2639	sock->pending_accept = 1;
2640	iev = &sock->readable_ev;
2641
2642	sock->references++;  /* keep socket around for this internal event */
2643	iev->ev_sender = sock;
2644	iev->ev_action = internal_accept;
2645	iev->ev_arg = sock;
2646
2647	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2648}
2649
2650static void
2651dispatch_connect(isc_socket_t *sock) {
2652	intev_t *iev;
2653	isc_socket_connev_t *ev;
2654
2655	iev = &sock->writable_ev;
2656
2657	ev = sock->connect_ev;
2658	INSIST(ev != NULL); /* XXX */
2659
2660	INSIST(sock->connecting);
2661
2662	sock->references++;  /* keep socket around for this internal event */
2663	iev->ev_sender = sock;
2664	iev->ev_action = internal_connect;
2665	iev->ev_arg = sock;
2666
2667	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2668}
2669
2670/*
2671 * Dequeue an item off the given socket's read queue, set the result code
2672 * in the done event to the one provided, and send it to the task it was
2673 * destined for.
2674 *
2675 * If the event to be sent is on a list, remove it before sending.  If
2676 * asked to, send and detach from the socket as well.
2677 *
2678 * Caller must have the socket locked if the event is attached to the socket.
2679 */
2680static void
2681send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2682	isc_task_t *task;
2683
2684	task = (*dev)->ev_sender;
2685
2686	(*dev)->ev_sender = sock;
2687
2688	if (ISC_LINK_LINKED(*dev, ev_link))
2689		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2690
2691	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2692	    == ISC_SOCKEVENTATTR_ATTACHED)
2693		isc_task_sendanddetach(&task, (isc_event_t **)dev);
2694	else
2695		isc_task_send(task, (isc_event_t **)dev);
2696}
2697
2698/*
2699 * See comments for send_recvdone_event() above.
2700 *
2701 * Caller must have the socket locked if the event is attached to the socket.
2702 */
2703static void
2704send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2705	isc_task_t *task;
2706
2707	INSIST(dev != NULL && *dev != NULL);
2708
2709	task = (*dev)->ev_sender;
2710	(*dev)->ev_sender = sock;
2711
2712	if (ISC_LINK_LINKED(*dev, ev_link))
2713		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2714
2715	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2716	    == ISC_SOCKEVENTATTR_ATTACHED)
2717		isc_task_sendanddetach(&task, (isc_event_t **)dev);
2718	else
2719		isc_task_send(task, (isc_event_t **)dev);
2720}
2721
2722/*
2723 * Call accept() on a socket, to get the new file descriptor.  The listen
2724 * socket is used as a prototype to create a new isc_socket_t.  The new
2725 * socket has one outstanding reference.  The task receiving the event
2726 * will be detached from just after the event is delivered.
2727 *
2728 * On entry to this function, the event delivered is the internal
2729 * readable event, and the first item on the accept_list should be
2730 * the done event we want to send.  If the list is empty, this is a no-op,
2731 * so just unlock and return.
2732 */
static void
internal_accept(isc_task_t *me, isc_event_t *ev) {
	isc_socket_t *sock;
	isc_socketmgr_t *manager;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	ISC_SOCKADDR_LEN_T addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "accept";

	UNUSED(me);

	sock = ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, TRACE,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
		   "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));

	INSIST(sock->listener);
	INSIST(sock->pending_accept == 1);
	sock->pending_accept = 0;

	/* Drop the reference taken by dispatch_accept(). */
	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection.  If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again.  Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept().  ECONNABORTED has been
	 * reported for Solaris 8.  The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	addrlen = sizeof(dev->newsocket->peer_address.type);
	memset(&dev->newsocket->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in.
	 */
	if (fd >= 0 && fd < 20) {
		int new, tmp;
		new = fcntl(fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(fd);
		errno = tmp;
		fd = new;
		err = "accept/fcntl";
	}
#endif

	if (fd < 0) {
		/* Soft errors: just re-arm the watcher and try again later. */
		if (SOFT_ERROR(errno))
			goto soft_error;
		switch (errno) {
		case ENFILE:
		case EMFILE:
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: too many open file descriptors",
				       err);
			goto soft_error;

		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif
#ifdef ENONET
		case ENONET:
#endif
			goto soft_error;
		default:
			break;
		}
		/* Anything else is a hard, unexpected failure. */
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() %s: %s", err,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/* Sanity-check the peer address accept() handed back. */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (dev->newsocket->peer_address.type.sa.sa_family !=
			   sock->pf)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() returned peer address "
					 "family %u (expected %u)",
					 dev->newsocket->peer_address.
					 type.sa.sa_family,
					 sock->pf);
			(void)close(fd);
			goto soft_error;
		} else if (fd >= (int)manager->maxsocks) {
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "accept: "
				       "file descriptor exceeds limit (%d/%u)",
				       fd, manager->maxsocks);
			(void)close(fd);
			goto soft_error;
		}
	}

	/* Record the peer info on the new socket while we hold the lock. */
	if (fd != -1) {
		dev->newsocket->peer_address.length = addrlen;
		dev->newsocket->pf = sock->pf;
	}

	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

	/*
	 * Poke watcher if there are more pending accepts.
	 */
	if (!ISC_LIST_EMPTY(sock->accept_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);

	UNLOCK(&sock->lock);

	/* Accepted descriptors must be nonblocking, like the listener. */
	if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
		(void)close(fd);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	}

	/*
	 * -1 means the new socket didn't happen.
	 */
	if (fd != -1) {
		int lockid = FDLOCK_ID(fd);

		/* Register the new fd and finish the accepted socket. */
		LOCK(&manager->fdlock[lockid]);
		manager->fds[fd] = dev->newsocket;
		manager->fdstate[fd] = MANAGED;
		UNLOCK(&manager->fdlock[lockid]);

		LOCK(&manager->lock);
		ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);

		dev->newsocket->fd = fd;
		dev->newsocket->bound = 1;
		dev->newsocket->connected = 1;

		/*
		 * Save away the remote address
		 */
		dev->address = dev->newsocket->peer_address;

#ifdef USE_SELECT
		if (manager->maxfd < fd)
			manager->maxfd = fd;
#endif

		socket_log(sock, &dev->newsocket->peer_address, CREATION,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
			   "accepted connection, new socket %p",
			   dev->newsocket);

		UNLOCK(&manager->lock);

		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
	} else {
		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
		dev->newsocket->references--;
		free_socket(&dev->newsocket);
	}

	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
	return;

 soft_error:
	select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}
2971
static void
internal_recv(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc_socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	sock = ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
		   "internal_recv: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);
	sock->pending_recv = 0;

	/* Drop the reference taken by dispatch_recv(). */
	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->recv_list);
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			/* Soft result: stop and re-poke the watcher below. */
			goto poke;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto poke;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

 poke:
	/* Re-arm the read watcher if requests remain queued. */
	if (!ISC_LIST_EMPTY(sock->recv_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);

	UNLOCK(&sock->lock);
}
3036
static void
internal_send(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc_socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc_socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
		   "internal_send: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);
	sock->pending_send = 0;

	/* Drop the reference taken by dispatch_send(). */
	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->send_list);
	while (dev != NULL) {
		switch (doio_send(sock, dev)) {
		case DOIO_SOFT:
			/* Soft result: stop and re-poke the watcher below. */
			goto poke;

		case DOIO_HARD:
		case DOIO_SUCCESS:
			send_senddone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->send_list);
	}

 poke:
	/* Re-arm the write watcher if requests remain queued. */
	if (!ISC_LIST_EMPTY(sock->send_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}
3091
static void
internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
	isc_socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc_socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
		   "internal_fdwatch_write: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);

	/* Invoke the user callback with the socket lock released. */
	UNLOCK(&sock->lock);
	more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
	LOCK(&sock->lock);

	sock->pending_send = 0;

	/* Drop the reference taken by dispatch_send(). */
	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/* A nonzero callback return asks to keep watching for writability. */
	if (more_data)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}
3131
static void
internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
	isc_socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc_socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
		   "internal_fdwatch_read: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);

	/* Invoke the user callback with the socket lock released. */
	UNLOCK(&sock->lock);
	more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
	LOCK(&sock->lock);

	sock->pending_recv = 0;

	/* Drop the reference taken by dispatch_recv(). */
	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/* A nonzero callback return asks to keep watching for readability. */
	if (more_data)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);

	UNLOCK(&sock->lock);
}
3171
3172/*
3173 * Process read/writes on each fd here.  Avoid locking
3174 * and unlocking twice if both reads and writes are possible.
3175 */
static void
process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
	   isc_boolean_t writeable)
{
	isc_socket_t *sock;
	isc_boolean_t unlock_sock;
	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = manager->fds[fd];
	unlock_sock = ISC_FALSE;
	if (readable) {
		/* No socket for this fd: just stop watching for reads. */
		if (sock == NULL) {
			unwatch_read = ISC_TRUE;
			goto check_write;
		}
		unlock_sock = ISC_TRUE;
		LOCK(&sock->lock);
		/* Readable means a new connection on listeners, data otherwise. */
		if (!SOCK_DEAD(sock)) {
			if (sock->listener)
				dispatch_accept(sock);
			else
				dispatch_recv(sock);
		}
		unwatch_read = ISC_TRUE;
	}
check_write:
	if (writeable) {
		/* No socket for this fd: just stop watching for writes. */
		if (sock == NULL) {
			unwatch_write = ISC_TRUE;
			goto unlock_fd;
		}
		/* The socket may already be locked from the readable path. */
		if (!unlock_sock) {
			unlock_sock = ISC_TRUE;
			LOCK(&sock->lock);
		}
		/* Writable completes a pending connect, or flushes sends. */
		if (!SOCK_DEAD(sock)) {
			if (sock->connecting)
				dispatch_connect(sock);
			else
				dispatch_send(sock);
		}
		unwatch_write = ISC_TRUE;
	}
	if (unlock_sock)
		UNLOCK(&sock->lock);

 unlock_fd:
	UNLOCK(&manager->fdlock[lockid]);
	/* unwatch_fd() is called outside the fd lock. */
	if (unwatch_read)
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
	if (unwatch_write)
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);

}
3243
3244#ifdef USE_KQUEUE
static isc_boolean_t
process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
	int i;
	isc_boolean_t readable, writable;
	isc_boolean_t done = ISC_FALSE;
#ifdef ISC_PLATFORM_USETHREADS
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < manager->maxsocks);
#ifdef ISC_PLATFORM_USETHREADS
		/* Defer the control pipe until all socket fds are handled. */
		if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		readable = ISC_TF(events[i].filter == EVFILT_READ);
		writable = ISC_TF(events[i].filter == EVFILT_WRITE);
		process_fd(manager, events[i].ident, readable, writable);
	}

#ifdef ISC_PLATFORM_USETHREADS
	/* The control event may request watcher shutdown (done == TRUE). */
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
3286#elif defined(USE_EPOLL)
3287static isc_boolean_t
3288process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
3289	int i;
3290	isc_boolean_t done = ISC_FALSE;
3291#ifdef ISC_PLATFORM_USETHREADS
3292	isc_boolean_t have_ctlevent = ISC_FALSE;
3293#endif
3294
3295	if (nevents == manager->nevents) {
3296		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3297			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3298			    "maximum number of FD events (%d) received",
3299			    nevents);
3300	}
3301
3302	for (i = 0; i < nevents; i++) {
3303		REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3304#ifdef ISC_PLATFORM_USETHREADS
3305		if (events[i].data.fd == manager->pipe_fds[0]) {
3306			have_ctlevent = ISC_TRUE;
3307			continue;
3308		}
3309#endif
3310		if ((events[i].events & EPOLLERR) != 0 ||
3311		    (events[i].events & EPOLLHUP) != 0) {
3312			/*
3313			 * epoll does not set IN/OUT bits on an erroneous
3314			 * condition, so we need to try both anyway.  This is a
3315			 * bit inefficient, but should be okay for such rare
3316			 * events.  Note also that the read or write attempt
3317			 * won't block because we use non-blocking sockets.
3318			 */
3319			events[i].events |= (EPOLLIN | EPOLLOUT);
3320		}
3321		process_fd(manager, events[i].data.fd,
3322			   (events[i].events & EPOLLIN) != 0,
3323			   (events[i].events & EPOLLOUT) != 0);
3324	}
3325
3326#ifdef ISC_PLATFORM_USETHREADS
3327	if (have_ctlevent)
3328		done = process_ctlfd(manager);
3329#endif
3330
3331	return (done);
3332}
3333#elif defined(USE_DEVPOLL)
3334static isc_boolean_t
3335process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
3336	int i;
3337	isc_boolean_t done = ISC_FALSE;
3338#ifdef ISC_PLATFORM_USETHREADS
3339	isc_boolean_t have_ctlevent = ISC_FALSE;
3340#endif
3341
3342	if (nevents == manager->nevents) {
3343		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3344			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3345			    "maximum number of FD events (%d) received",
3346			    nevents);
3347	}
3348
3349	for (i = 0; i < nevents; i++) {
3350		REQUIRE(events[i].fd < (int)manager->maxsocks);
3351#ifdef ISC_PLATFORM_USETHREADS
3352		if (events[i].fd == manager->pipe_fds[0]) {
3353			have_ctlevent = ISC_TRUE;
3354			continue;
3355		}
3356#endif
3357		process_fd(manager, events[i].fd,
3358			   (events[i].events & POLLIN) != 0,
3359			   (events[i].events & POLLOUT) != 0);
3360	}
3361
3362#ifdef ISC_PLATFORM_USETHREADS
3363	if (have_ctlevent)
3364		done = process_ctlfd(manager);
3365#endif
3366
3367	return (done);
3368}
3369#elif defined(USE_SELECT)
3370static void
3371process_fds(isc_socketmgr_t *manager, int maxfd,
3372	    fd_set *readfds, fd_set *writefds)
3373{
3374	int i;
3375
3376	REQUIRE(maxfd <= (int)manager->maxsocks);
3377
3378	for (i = 0; i < maxfd; i++) {
3379#ifdef ISC_PLATFORM_USETHREADS
3380		if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3381			continue;
3382#endif /* ISC_PLATFORM_USETHREADS */
3383		process_fd(manager, i, FD_ISSET(i, readfds),
3384			   FD_ISSET(i, writefds));
3385	}
3386}
3387#endif
3388
3389#ifdef ISC_PLATFORM_USETHREADS
3390static isc_boolean_t
3391process_ctlfd(isc_socketmgr_t *manager) {
3392	int msg, fd;
3393
3394	for (;;) {
3395		select_readmsg(manager, &fd, &msg);
3396
3397		manager_log(manager, IOEVENT,
3398			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3399					   ISC_MSG_WATCHERMSG,
3400					   "watcher got message %d "
3401					   "for socket %d"), msg, fd);
3402
3403		/*
3404		 * Nothing to read?
3405		 */
3406		if (msg == SELECT_POKE_NOTHING)
3407			break;
3408
3409		/*
3410		 * Handle shutdown message.  We really should
3411		 * jump out of this loop right away, but
3412		 * it doesn't matter if we have to do a little
3413		 * more work first.
3414		 */
3415		if (msg == SELECT_POKE_SHUTDOWN)
3416			return (ISC_TRUE);
3417
3418		/*
3419		 * This is a wakeup on a socket.  Look
3420		 * at the event queue for both read and write,
3421		 * and decide if we need to watch on it now
3422		 * or not.
3423		 */
3424		wakeup_socket(manager, fd, msg);
3425	}
3426
3427	return (ISC_FALSE);
3428}
3429
3430/*
3431 * This is the thread that will loop forever, always in a select or poll
3432 * call.
3433 *
3434 * When select returns something to do, track down what thread gets to do
3435 * this I/O and post the event to it.
3436 */
static isc_threadresult_t
watcher(void *uap) {
	isc_socketmgr_t *manager = uap;
	isc_boolean_t done;
	int ctlfd;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined (USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
#elif defined (USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
#endif
	char strbuf[ISC_STRERRORSIZE];
#ifdef ISC_SOCKET_USE_POLLWATCH
	pollstate_t pollstate = poll_idle;
#endif

	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = manager->pipe_fds[0];
	done = ISC_FALSE;
	while (!done) {
		do {
#ifdef USE_KQUEUE
			cc = kevent(manager->kqueue_fd, NULL, 0,
				    manager->events, manager->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(manager->epoll_fd, manager->events,
					manager->nevents, -1);
#elif defined(USE_DEVPOLL)
			dvp.dp_fds = manager->events;
			dvp.dp_nfds = manager->nevents;
#ifndef ISC_SOCKET_USE_POLLWATCH
			dvp.dp_timeout = -1;
#else
			/*
			 * Use a finite timeout while in the POLLWATCH
			 * workaround states so a missed event cannot stall
			 * the watcher forever.
			 */
			if (pollstate == poll_idle)
				dvp.dp_timeout = -1;
			else
				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
#endif	/* ISC_SOCKET_USE_POLLWATCH */
			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
#elif defined(USE_SELECT)
			/*
			 * Snapshot the fd sets under the manager lock;
			 * select() mutates its arguments, so it operates on
			 * the copies.
			 */
			LOCK(&manager->lock);
			memcpy(manager->read_fds_copy, manager->read_fds,
			       manager->fd_bufsize);
			memcpy(manager->write_fds_copy, manager->write_fds,
			       manager->fd_bufsize);
			maxfd = manager->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, manager->read_fds_copy,
				    manager->write_fds_copy, NULL, NULL);
#endif	/* USE_KQUEUE */

			/* Soft errors (e.g. interrupted call) just retry. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "%s %s: %s", fnname,
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"), strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active)
					pollstate = poll_checking;
				else if (pollstate == poll_checking)
					pollstate = poll_idle;
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					manager_log(manager,
						    ISC_LOGCATEGORY_GENERAL,
						    ISC_LOGMODULE_SOCKET,
						    ISC_LOG_DEBUG(1),
						    "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
		done = process_fds(manager, manager->events, cc);
#elif defined(USE_SELECT)
		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy))
			done = process_ctlfd(manager);
#endif
	}

	manager_log(manager, TRACE, "%s",
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	return ((isc_threadresult_t)0);
}
3554#endif /* ISC_PLATFORM_USETHREADS */
3555
void
isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {

	REQUIRE(VALID_MANAGER(manager));

	/* Record the number of fds to hold in reserve for internal use. */
	manager->reserved = reserved;
}
3563
3564/*
3565 * Create a new socket manager.
3566 */
3567
3568static isc_result_t
3569setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3570	isc_result_t result;
3571#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3572	char strbuf[ISC_STRERRORSIZE];
3573#endif
3574
3575#ifdef USE_KQUEUE
3576	manager->nevents = ISC_SOCKET_MAXEVENTS;
3577	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3578				      manager->nevents);
3579	if (manager->events == NULL)
3580		return (ISC_R_NOMEMORY);
3581	manager->kqueue_fd = kqueue();
3582	if (manager->kqueue_fd == -1) {
3583		result = isc__errno2result(errno);
3584		isc__strerror(errno, strbuf, sizeof(strbuf));
3585		UNEXPECTED_ERROR(__FILE__, __LINE__,
3586				 "kqueue %s: %s",
3587				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3588						ISC_MSG_FAILED, "failed"),
3589				 strbuf);
3590		isc_mem_put(mctx, manager->events,
3591			    sizeof(struct kevent) * manager->nevents);
3592		return (result);
3593	}
3594
3595#ifdef ISC_PLATFORM_USETHREADS
3596	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3597	if (result != ISC_R_SUCCESS) {
3598		close(manager->kqueue_fd);
3599		isc_mem_put(mctx, manager->events,
3600			    sizeof(struct kevent) * manager->nevents);
3601		return (result);
3602	}
3603#endif	/* ISC_PLATFORM_USETHREADS */
3604#elif defined(USE_EPOLL)
3605	manager->nevents = ISC_SOCKET_MAXEVENTS;
3606	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3607				      manager->nevents);
3608	if (manager->events == NULL)
3609		return (ISC_R_NOMEMORY);
3610	manager->epoll_fd = epoll_create(manager->nevents);
3611	if (manager->epoll_fd == -1) {
3612		result = isc__errno2result(errno);
3613		isc__strerror(errno, strbuf, sizeof(strbuf));
3614		UNEXPECTED_ERROR(__FILE__, __LINE__,
3615				 "epoll_create %s: %s",
3616				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3617						ISC_MSG_FAILED, "failed"),
3618				 strbuf);
3619		isc_mem_put(mctx, manager->events,
3620			    sizeof(struct epoll_event) * manager->nevents);
3621		return (result);
3622	}
3623#ifdef ISC_PLATFORM_USETHREADS
3624	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3625	if (result != ISC_R_SUCCESS) {
3626		close(manager->epoll_fd);
3627		isc_mem_put(mctx, manager->events,
3628			    sizeof(struct epoll_event) * manager->nevents);
3629		return (result);
3630	}
3631#endif	/* ISC_PLATFORM_USETHREADS */
3632#elif defined(USE_DEVPOLL)
3633	/*
3634	 * XXXJT: /dev/poll seems to reject large numbers of events,
3635	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3636	 */
3637	manager->nevents = ISC_SOCKET_MAXEVENTS;
3638	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3639				      manager->nevents);
3640	if (manager->events == NULL)
3641		return (ISC_R_NOMEMORY);
3642	/*
3643	 * Note: fdpollinfo should be able to support all possible FDs, so
3644	 * it must have maxsocks entries (not nevents).
3645	 */
3646	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3647					  manager->maxsocks);
3648	if (manager->fdpollinfo == NULL) {
3649		isc_mem_put(mctx, manager->events,
3650			    sizeof(pollinfo_t) * manager->maxsocks);
3651		return (ISC_R_NOMEMORY);
3652	}
3653	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3654	manager->devpoll_fd = open("/dev/poll", O_RDWR);
3655	if (manager->devpoll_fd == -1) {
3656		result = isc__errno2result(errno);
3657		isc__strerror(errno, strbuf, sizeof(strbuf));
3658		UNEXPECTED_ERROR(__FILE__, __LINE__,
3659				 "open(/dev/poll) %s: %s",
3660				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3661						ISC_MSG_FAILED, "failed"),
3662				 strbuf);
3663		isc_mem_put(mctx, manager->events,
3664			    sizeof(struct pollfd) * manager->nevents);
3665		isc_mem_put(mctx, manager->fdpollinfo,
3666			    sizeof(pollinfo_t) * manager->maxsocks);
3667		return (result);
3668	}
3669#ifdef ISC_PLATFORM_USETHREADS
3670	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3671	if (result != ISC_R_SUCCESS) {
3672		close(manager->devpoll_fd);
3673		isc_mem_put(mctx, manager->events,
3674			    sizeof(struct pollfd) * manager->nevents);
3675		isc_mem_put(mctx, manager->fdpollinfo,
3676			    sizeof(pollinfo_t) * manager->maxsocks);
3677		return (result);
3678	}
3679#endif	/* ISC_PLATFORM_USETHREADS */
3680#elif defined(USE_SELECT)
3681	UNUSED(result);
3682
3683#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3684	/*
3685	 * Note: this code should also cover the case of MAXSOCKETS <=
3686	 * FD_SETSIZE, but we separate the cases to avoid possible portability
3687	 * issues regarding howmany() and the actual representation of fd_set.
3688	 */
3689	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3690		sizeof(fd_mask);
3691#else
3692	manager->fd_bufsize = sizeof(fd_set);
3693#endif
3694
3695	manager->read_fds = NULL;
3696	manager->read_fds_copy = NULL;
3697	manager->write_fds = NULL;
3698	manager->write_fds_copy = NULL;
3699
3700	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3701	if (manager->read_fds != NULL)
3702		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3703	if (manager->read_fds_copy != NULL)
3704		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3705	if (manager->write_fds != NULL) {
3706		manager->write_fds_copy = isc_mem_get(mctx,
3707						      manager->fd_bufsize);
3708	}
3709	if (manager->write_fds_copy == NULL) {
3710		if (manager->write_fds != NULL) {
3711			isc_mem_put(mctx, manager->write_fds,
3712				    manager->fd_bufsize);
3713		}
3714		if (manager->read_fds_copy != NULL) {
3715			isc_mem_put(mctx, manager->read_fds_copy,
3716				    manager->fd_bufsize);
3717		}
3718		if (manager->read_fds != NULL) {
3719			isc_mem_put(mctx, manager->read_fds,
3720				    manager->fd_bufsize);
3721		}
3722		return (ISC_R_NOMEMORY);
3723	}
3724	memset(manager->read_fds, 0, manager->fd_bufsize);
3725	memset(manager->write_fds, 0, manager->fd_bufsize);
3726
3727#ifdef ISC_PLATFORM_USETHREADS
3728	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3729	manager->maxfd = manager->pipe_fds[0];
3730#else /* ISC_PLATFORM_USETHREADS */
3731	manager->maxfd = 0;
3732#endif /* ISC_PLATFORM_USETHREADS */
3733#endif	/* USE_KQUEUE */
3734
3735	return (ISC_R_SUCCESS);
3736}
3737
3738static void
3739cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3740#ifdef ISC_PLATFORM_USETHREADS
3741	isc_result_t result;
3742
3743	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3744	if (result != ISC_R_SUCCESS) {
3745		UNEXPECTED_ERROR(__FILE__, __LINE__,
3746				 "epoll_ctl(DEL) %s",
3747				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3748						ISC_MSG_FAILED, "failed"));
3749	}
3750#endif	/* ISC_PLATFORM_USETHREADS */
3751
3752#ifdef USE_KQUEUE
3753	close(manager->kqueue_fd);
3754	isc_mem_put(mctx, manager->events,
3755		    sizeof(struct kevent) * manager->nevents);
3756#elif defined(USE_EPOLL)
3757	close(manager->epoll_fd);
3758	isc_mem_put(mctx, manager->events,
3759		    sizeof(struct epoll_event) * manager->nevents);
3760#elif defined(USE_DEVPOLL)
3761	close(manager->devpoll_fd);
3762	isc_mem_put(mctx, manager->events,
3763		    sizeof(struct pollfd) * manager->nevents);
3764	isc_mem_put(mctx, manager->fdpollinfo,
3765		    sizeof(pollinfo_t) * manager->maxsocks);
3766#elif defined(USE_SELECT)
3767	if (manager->read_fds != NULL)
3768		isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
3769	if (manager->read_fds_copy != NULL)
3770		isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
3771	if (manager->write_fds != NULL)
3772		isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
3773	if (manager->write_fds_copy != NULL)
3774		isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
3775#endif	/* USE_KQUEUE */
3776}
3777
isc_result_t
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	/*
	 * Convenience entry point: create a manager with maxsocks == 0,
	 * i.e. the compiled-in default socket limit.
	 */
	return (isc_socketmgr_create2(mctx, managerp, 0));
}
3782
isc_result_t
isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		      unsigned int maxsocks)
{
	int i;
	isc_socketmgr_t *manager;
#ifdef ISC_PLATFORM_USETHREADS
	char strbuf[ISC_STRERRORSIZE];
#endif
	isc_result_t result;

	REQUIRE(managerp != NULL && *managerp == NULL);

#ifndef ISC_PLATFORM_USETHREADS
	/*
	 * Non-threaded builds share a single manager; hand out another
	 * reference instead of creating a new one.
	 */
	if (socketmgr != NULL) {
		/* Don't allow maxsocks to be updated */
		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
			return (ISC_R_EXISTS);

		socketmgr->refs++;
		*managerp = socketmgr;
		return (ISC_R_SUCCESS);
	}
#endif /* ISC_PLATFORM_USETHREADS */

	if (maxsocks == 0)
		maxsocks = ISC_SOCKET_MAXSOCKETS;

	manager = isc_mem_get(mctx, sizeof(*manager));
	if (manager == NULL)
		return (ISC_R_NOMEMORY);

	/* zero-clear so that necessary cleanup on failure will be easy */
	memset(manager, 0, sizeof(*manager));
	manager->maxsocks = maxsocks;
	manager->reserved = 0;
	manager->fds = isc_mem_get(mctx,
				   manager->maxsocks * sizeof(isc_socket_t *));
	if (manager->fds == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
	if (manager->fdstate == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->stats = NULL;

	manager->magic = SOCKET_MANAGER_MAGIC;
	manager->mctx = NULL;
	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
	ISC_LIST_INIT(manager->socklist);
	result = isc_mutex_init(&manager->lock);
	if (result != ISC_R_SUCCESS)
		goto free_manager;
	/* Per-fd locks are bucketed to reduce contention; see FDLOCK_ID(). */
	manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
	if (manager->fdlock == NULL) {
		result = ISC_R_NOMEMORY;
		goto cleanup_lock;
	}
	for (i = 0; i < FDLOCK_COUNT; i++) {
		result = isc_mutex_init(&manager->fdlock[i]);
		if (result != ISC_R_SUCCESS) {
			/* Destroy only the locks initialized so far. */
			while (--i >= 0)
				DESTROYLOCK(&manager->fdlock[i]);
			isc_mem_put(mctx, manager->fdlock,
				    FDLOCK_COUNT * sizeof(isc_mutex_t));
			manager->fdlock = NULL;
			goto cleanup_lock;
		}
	}

#ifdef ISC_PLATFORM_USETHREADS
	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_condition_init() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		result = ISC_R_UNEXPECTED;
		goto cleanup_lock;
	}

	/*
	 * Create the special fds that will be used to wake up the
	 * select/poll loop when something internal needs to be done.
	 */
	if (pipe(manager->pipe_fds) != 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "pipe() %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto cleanup_condition;
	}

	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
#if 0
	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
#endif
#else /* ISC_PLATFORM_USETHREADS */
	manager->refs = 1;
#endif /* ISC_PLATFORM_USETHREADS */

	/*
	 * Set up initial state for the select loop
	 */
	result = setup_watcher(mctx, manager);
	if (result != ISC_R_SUCCESS)
		goto cleanup;
	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
#ifdef ISC_PLATFORM_USETHREADS
	/*
	 * Start up the select/poll thread.
	 */
	if (isc_thread_create(watcher, manager, &manager->watcher) !=
	    ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_create() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		cleanup_watcher(mctx, manager);
		result = ISC_R_UNEXPECTED;
		goto cleanup;
	}
#endif /* ISC_PLATFORM_USETHREADS */
	isc_mem_attach(mctx, &manager->mctx);

#ifndef ISC_PLATFORM_USETHREADS
	socketmgr = manager;
#endif /* ISC_PLATFORM_USETHREADS */
	*managerp = manager;

	return (ISC_R_SUCCESS);

	/*
	 * Error paths below unwind in reverse order of construction; the
	 * labels fall through intentionally.
	 */
cleanup:
#ifdef ISC_PLATFORM_USETHREADS
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
#endif	/* ISC_PLATFORM_USETHREADS */

#ifdef ISC_PLATFORM_USETHREADS
cleanup_condition:
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif	/* ISC_PLATFORM_USETHREADS */


cleanup_lock:
	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
	}
	DESTROYLOCK(&manager->lock);

free_manager:
	if (manager->fdlock != NULL) {
		isc_mem_put(mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	if (manager->fdstate != NULL) {
		isc_mem_put(mctx, manager->fdstate,
			    manager->maxsocks * sizeof(int));
	}
	if (manager->fds != NULL) {
		isc_mem_put(mctx, manager->fds,
			    manager->maxsocks * sizeof(isc_socket_t *));
	}
	isc_mem_put(mctx, manager, sizeof(*manager));

	return (result);
}
3956
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	/* Report the manager's socket limit (fixed at creation time). */
	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}
3966
void
isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
	/*
	 * Attach a statistics counter set to the manager.  Only permitted
	 * before any sockets exist and only once per manager.
	 */
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
3976
void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

#ifndef ISC_PLATFORM_USETHREADS
	/*
	 * Non-threaded builds share one manager; just drop a reference
	 * unless this is the last one.
	 */
	if (manager->refs > 1) {
		manager->refs--;
		*managerp = NULL;
		return;
	}
#endif /* ISC_PLATFORM_USETHREADS */

	LOCK(&manager->lock);

#ifdef ISC_PLATFORM_USETHREADS
	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
	}
#else /* ISC_PLATFORM_USETHREADS */
	/*
	 * Hope all sockets have been destroyed.
	 */
	if (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		INSIST(0);
	}
#endif /* ISC_PLATFORM_USETHREADS */

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);

#ifdef ISC_PLATFORM_USETHREADS
	/*
	 * Wait for thread to exit.
	 */
	if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
#endif /* ISC_PLATFORM_USETHREADS */

	/*
	 * Clean up.
	 */
	cleanup_watcher(manager->mctx, manager);

#ifdef ISC_PLATFORM_USETHREADS
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */

	/* Complete any deferred closes; the watcher thread is gone by now. */
	for (i = 0; i < (int)manager->maxsocks; i++)
		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
			(void)close(i);

	isc_mem_put(manager->mctx, manager->fds,
		    manager->maxsocks * sizeof(isc_socket_t *));
	isc_mem_put(manager->mctx, manager->fdstate,
		    manager->maxsocks * sizeof(int));

	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);

	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
		isc_mem_put(manager->mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	DESTROYLOCK(&manager->lock);
	manager->magic = 0;
	/* Save the memory context; the manager itself lives in it. */
	mctx= manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;
}
4083
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	/*
	 * UDP: try the read immediately without locking.  Stream sockets
	 * must preserve request ordering, so only read directly when no
	 * earlier requests are queued.
	 */
	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		if (ISC_LIST_EMPTY(sock->recv_list))
			io_state = doio_recv(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = ISC_TRUE;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "socket_recv: event %p -> task %p",
			   dev, ntask);

		/* With IMMEDIATE the caller wants to know it must wait. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
			result = ISC_R_INPROGRESS;
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/*
		 * Completed synchronously; deliver the event unless the
		 * caller asked for immediate results (then dev is theirs).
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_recvdone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}
4155
4156isc_result_t
4157isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4158		 unsigned int minimum, isc_task_t *task,
4159		 isc_taskaction_t action, const void *arg)
4160{
4161	isc_socketevent_t *dev;
4162	isc_socketmgr_t *manager;
4163	unsigned int iocount;
4164	isc_buffer_t *buffer;
4165
4166	REQUIRE(VALID_SOCKET(sock));
4167	REQUIRE(buflist != NULL);
4168	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4169	REQUIRE(task != NULL);
4170	REQUIRE(action != NULL);
4171
4172	manager = sock->manager;
4173	REQUIRE(VALID_MANAGER(manager));
4174
4175	iocount = isc_bufferlist_availablecount(buflist);
4176	REQUIRE(iocount > 0);
4177
4178	INSIST(sock->bound);
4179
4180	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4181	if (dev == NULL) {
4182		return (ISC_R_NOMEMORY);
4183	}
4184
4185	/*
4186	 * UDP sockets are always partial read
4187	 */
4188	if (sock->type == isc_sockettype_udp)
4189		dev->minimum = 1;
4190	else {
4191		if (minimum == 0)
4192			dev->minimum = iocount;
4193		else
4194			dev->minimum = minimum;
4195	}
4196
4197	/*
4198	 * Move each buffer from the passed in list to our internal one.
4199	 */
4200	buffer = ISC_LIST_HEAD(*buflist);
4201	while (buffer != NULL) {
4202		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4203		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4204		buffer = ISC_LIST_HEAD(*buflist);
4205	}
4206
4207	return (socket_recv(sock, dev, task, 0));
4208}
4209
4210isc_result_t
4211isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4212		isc_task_t *task, isc_taskaction_t action, const void *arg)
4213{
4214	isc_socketevent_t *dev;
4215	isc_socketmgr_t *manager;
4216
4217	REQUIRE(VALID_SOCKET(sock));
4218	REQUIRE(action != NULL);
4219
4220	manager = sock->manager;
4221	REQUIRE(VALID_MANAGER(manager));
4222
4223	INSIST(sock->bound);
4224
4225	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4226	if (dev == NULL)
4227		return (ISC_R_NOMEMORY);
4228
4229	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4230}
4231
4232isc_result_t
4233isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
4234		 unsigned int minimum, isc_task_t *task,
4235		 isc_socketevent_t *event, unsigned int flags)
4236{
4237	event->ev_sender = sock;
4238	event->result = ISC_R_UNEXPECTED;
4239	ISC_LIST_INIT(event->bufferlist);
4240	event->region = *region;
4241	event->n = 0;
4242	event->offset = 0;
4243	event->attributes = 0;
4244
4245	/*
4246	 * UDP sockets are always partial read.
4247	 */
4248	if (sock->type == isc_sockettype_udp)
4249		event->minimum = 1;
4250	else {
4251		if (minimum == 0)
4252			event->minimum = region->length;
4253		else
4254			event->minimum = minimum;
4255	}
4256
4257	return (socket_recv(sock, event, task, flags));
4258}
4259
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address)) {
			socket_log(sock, NULL, TRACE, isc_msgcat,
				   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)", pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	/*
	 * UDP: try the write immediately without locking.  Stream sockets
	 * must preserve request ordering, so only write directly when no
	 * earlier requests are queued.
	 */
	if (sock->type == isc_sockettype_udp)
		io_state = doio_send(sock, dev);
	else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		if (ISC_LIST_EMPTY(sock->send_list))
			io_state = doio_send(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = ISC_TRUE;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			if (ISC_LIST_EMPTY(sock->send_list) &&
			    !sock->pending_send)
				select_poke(sock->manager, sock->fd,
					    SELECT_POKE_WRITE);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

			socket_log(sock, NULL, EVENT, NULL, 0, 0,
				   "socket_send: event %p -> task %p",
				   dev, ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
				result = ISC_R_INPROGRESS;
			break;
		}
		/*
		 * FALLTHROUGH: with ISC_SOCKFLAG_NORETRY a soft failure is
		 * reported to the caller just like a completed operation.
		 */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_senddone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}
4351
isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region,
		isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	/*
	 * Send 'region' with no explicit destination or pktinfo.
	 *
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
				  NULL));
}
4362
4363isc_result_t
4364isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
4365		  isc_task_t *task, isc_taskaction_t action, const void *arg,
4366		  isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4367{
4368	isc_socketevent_t *dev;
4369	isc_socketmgr_t *manager;
4370
4371	REQUIRE(VALID_SOCKET(sock));
4372	REQUIRE(region != NULL);
4373	REQUIRE(task != NULL);
4374	REQUIRE(action != NULL);
4375
4376	manager = sock->manager;
4377	REQUIRE(VALID_MANAGER(manager));
4378
4379	INSIST(sock->bound);
4380
4381	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4382	if (dev == NULL) {
4383		return (ISC_R_NOMEMORY);
4384	}
4385
4386	dev->region = *region;
4387
4388	return (socket_send(sock, dev, task, address, pktinfo, 0));
4389}
4390
isc_result_t
isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	/*
	 * Scatter/gather send with no explicit destination or pktinfo.
	 *
	 * REQUIRE() checking is performed in isc_socket_sendtov().
	 */
	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
				   NULL));
}
4398
4399isc_result_t
4400isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4401		   isc_task_t *task, isc_taskaction_t action, const void *arg,
4402		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4403{
4404	isc_socketevent_t *dev;
4405	isc_socketmgr_t *manager;
4406	unsigned int iocount;
4407	isc_buffer_t *buffer;
4408
4409	REQUIRE(VALID_SOCKET(sock));
4410	REQUIRE(buflist != NULL);
4411	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4412	REQUIRE(task != NULL);
4413	REQUIRE(action != NULL);
4414
4415	manager = sock->manager;
4416	REQUIRE(VALID_MANAGER(manager));
4417
4418	iocount = isc_bufferlist_usedcount(buflist);
4419	REQUIRE(iocount > 0);
4420
4421	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4422	if (dev == NULL) {
4423		return (ISC_R_NOMEMORY);
4424	}
4425
4426	/*
4427	 * Move each buffer from the passed in list to our internal one.
4428	 */
4429	buffer = ISC_LIST_HEAD(*buflist);
4430	while (buffer != NULL) {
4431		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4432		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4433		buffer = ISC_LIST_HEAD(*buflist);
4434	}
4435
4436	return (socket_send(sock, dev, task, address, pktinfo, 0));
4437}
4438
4439isc_result_t
4440isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4441		   isc_task_t *task,
4442		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4443		   isc_socketevent_t *event, unsigned int flags)
4444{
4445	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4446	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4447		REQUIRE(sock->type == isc_sockettype_udp);
4448	event->ev_sender = sock;
4449	event->result = ISC_R_UNEXPECTED;
4450	ISC_LIST_INIT(event->bufferlist);
4451	event->region = *region;
4452	event->n = 0;
4453	event->offset = 0;
4454	event->attributes = 0;
4455
4456	return (socket_send(sock, event, task, address, pktinfo, flags));
4457}
4458
/*
 * Remove a stale AF_UNIX socket file so a new one can be bound.
 *
 * If 'active' is true the path is assumed to belong to this process:
 * it is stat()ed, verified to be a socket/FIFO, and unlinked.  If
 * 'active' is false the path may belong to another (possibly live)
 * server: it is only unlinked when a test connect() shows nothing is
 * listening (ECONNREFUSED/ECONNRESET).  No-op for non-AF_UNIX
 * addresses, and a stub on platforms without <sys/un.h>.
 */
void
isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX)
		return;

/*
 * Synthesize S_ISSOCK()/S_ISFIFO() on platforms that only provide the
 * raw S_IFMT/S_IFSOCK/S_IFIFO mode bits.
 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
#endif
#endif

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
#endif
#endif

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif

	if (active) {
		/* Path is ours: verify the file type and remove it. */
		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/*
	 * Not ours: create a scratch socket to probe whether anything
	 * is still listening on the path before removing it.
	 */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:    /* We exited cleanly last time */
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
		goto cleanup;
	}

	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: %s: not a socket",
			      sockaddr->type.sunix.sun_path);
		goto cleanup;
	}

	if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0) {
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			/* Nobody is listening; safe to remove the file. */
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
					      ISC_LOGMODULE_SOCKET,
					      ISC_LOG_WARNING,
					      "isc_socket_cleanunix: "
					      "unlink(%s): %s",
					      sockaddr->type.sunix.sun_path,
					      strbuf);
			}
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
 cleanup:
	close(s);
#else
	UNUSED(sockaddr);
	UNUSED(active);
#endif
}
4588
/*
 * Set permissions ('perm') and ownership ('owner'/'group') on the file
 * underlying an AF_UNIX socket.  With NEED_SECURE_DIRECTORY defined,
 * the mode/ownership is applied to the containing directory instead of
 * the socket file itself.  Returns ISC_R_FAILURE if either chmod() or
 * chown() fails (both are still attempted), ISC_R_NOTIMPLEMENTED on
 * platforms without <sys/un.h>.
 */
isc_result_t
isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		    isc_uint32_t owner, isc_uint32_t group)
{
#ifdef ISC_PLATFORM_HAVESYSUNH
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	char path[sizeof(sockaddr->type.sunix.sun_path)];
#ifdef NEED_SECURE_DIRECTORY
	char *slash;
#endif

	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
	/* sun_path fits in 'path' by construction; strcpy is bounded. */
	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
	strcpy(path, sockaddr->type.sunix.sun_path);

#ifdef NEED_SECURE_DIRECTORY
	/* Strip the final path component to operate on the directory. */
	slash = strrchr(path, '/');
	if (slash != NULL) {
		if (slash != path)
			*slash = '\0';
		else
			strcpy(path, "/");
	} else
		strcpy(path, ".");
#endif

	if (chmod(path, perm) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chmod(%s, %d): %s",
			      path, perm, strbuf);
		result = ISC_R_FAILURE;
	}
	if (chown(path, owner, group) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chown(%s, %d, %d): %s",
			      path, owner, group,
			      strbuf);
		result = ISC_R_FAILURE;
	}
	return (result);
#else
	UNUSED(sockaddr);
	UNUSED(perm);
	UNUSED(owner);
	UNUSED(group);
	return (ISC_R_NOTIMPLEMENTED);
#endif
}
4642
/*
 * Bind 'sock' to 'sockaddr'.  SO_REUSEADDR is set first when the
 * caller passed ISC_SOCKET_REUSEADDRESS and requested a specific
 * (non-zero) port; AF_UNIX sockets skip that step.  Common bind(2)
 * errno values are translated to ISC_R_* results.
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
		unsigned int options) {
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	LOCK(&sock->lock);

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}
	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX)
		goto bind_socket;
#endif
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
		       sizeof(on)) < 0) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d) %s", sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		/* Press on... */
	}
#ifdef AF_UNIX
 bind_socket:
#endif
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		/* Record the failure in the bind-failure statistics. */
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4706
/*
 * Install a kernel accept filter (e.g. "dataready") on a listening
 * socket.  Only available where SO_ACCEPTFILTER exists (BSD);
 * elsewhere returns ISC_R_NOTIMPLEMENTED.
 */
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
#ifdef SO_ACCEPTFILTER
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else
	UNUSED(sock);
	UNUSED(filter);
#endif

	REQUIRE(VALID_SOCKET(sock));

#ifdef SO_ACCEPTFILTER
	/*
	 * NOTE(review): strncpy can leave af_name unterminated if
	 * 'filter' fills the whole field; af_name appears to be treated
	 * as a fixed-width kernel field here -- confirm against
	 * accept_filter(9) before changing.
	 */
	bzero(&afa, sizeof(afa));
	strncpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
			 &afa, sizeof(afa)) == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
			   ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
			   strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else
	return (ISC_R_NOTIMPLEMENTED);
#endif
}
4735
4736/*
4737 * Set up to listen on a given socket.  We do this by creating an internal
4738 * event that will be dispatched when the socket has read activity.  The
4739 * watcher will send the internal event to the task when there is a new
4740 * connection.
4741 *
4742 * Unlike in read, we don't preallocate a done event here.  Every time there
4743 * is a new connection we'll have to allocate a new one anyway, so we might
4744 * as well keep things simple rather than having to track them.
4745 */
4746isc_result_t
4747isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4748	char strbuf[ISC_STRERRORSIZE];
4749
4750	REQUIRE(VALID_SOCKET(sock));
4751
4752	LOCK(&sock->lock);
4753
4754	REQUIRE(!sock->listener);
4755	REQUIRE(sock->bound);
4756	REQUIRE(sock->type == isc_sockettype_tcp ||
4757		sock->type == isc_sockettype_unix);
4758
4759	if (backlog == 0)
4760		backlog = SOMAXCONN;
4761
4762	if (listen(sock->fd, (int)backlog) < 0) {
4763		UNLOCK(&sock->lock);
4764		isc__strerror(errno, strbuf, sizeof(strbuf));
4765
4766		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4767
4768		return (ISC_R_UNEXPECTED);
4769	}
4770
4771	sock->listener = 1;
4772
4773	UNLOCK(&sock->lock);
4774	return (ISC_R_SUCCESS);
4775}
4776
4777/*
4778 * This should try to do aggressive accept() XXXMLG
4779 */
4780isc_result_t
4781isc_socket_accept(isc_socket_t *sock,
4782		  isc_task_t *task, isc_taskaction_t action, const void *arg)
4783{
4784	isc_socket_newconnev_t *dev;
4785	isc_socketmgr_t *manager;
4786	isc_task_t *ntask = NULL;
4787	isc_socket_t *nsock;
4788	isc_result_t result;
4789	isc_boolean_t do_poke = ISC_FALSE;
4790
4791	REQUIRE(VALID_SOCKET(sock));
4792	manager = sock->manager;
4793	REQUIRE(VALID_MANAGER(manager));
4794
4795	LOCK(&sock->lock);
4796
4797	REQUIRE(sock->listener);
4798
4799	/*
4800	 * Sender field is overloaded here with the task we will be sending
4801	 * this event to.  Just before the actual event is delivered the
4802	 * actual ev_sender will be touched up to be the socket.
4803	 */
4804	dev = (isc_socket_newconnev_t *)
4805		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4806				   action, arg, sizeof(*dev));
4807	if (dev == NULL) {
4808		UNLOCK(&sock->lock);
4809		return (ISC_R_NOMEMORY);
4810	}
4811	ISC_LINK_INIT(dev, ev_link);
4812
4813	result = allocate_socket(manager, sock->type, &nsock);
4814	if (result != ISC_R_SUCCESS) {
4815		isc_event_free(ISC_EVENT_PTR(&dev));
4816		UNLOCK(&sock->lock);
4817		return (result);
4818	}
4819
4820	/*
4821	 * Attach to socket and to task.
4822	 */
4823	isc_task_attach(task, &ntask);
4824	nsock->references++;
4825	nsock->statsindex = sock->statsindex;
4826
4827	dev->ev_sender = ntask;
4828	dev->newsocket = nsock;
4829
4830	/*
4831	 * Poke watcher here.  We still have the socket locked, so there
4832	 * is no race condition.  We will keep the lock for such a short
4833	 * bit of time waking it up now or later won't matter all that much.
4834	 */
4835	if (ISC_LIST_EMPTY(sock->accept_list))
4836		do_poke = ISC_TRUE;
4837
4838	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4839
4840	if (do_poke)
4841		select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4842
4843	UNLOCK(&sock->lock);
4844	return (ISC_R_SUCCESS);
4845}
4846
4847isc_result_t
4848isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4849		   isc_task_t *task, isc_taskaction_t action, const void *arg)
4850{
4851	isc_socket_connev_t *dev;
4852	isc_task_t *ntask = NULL;
4853	isc_socketmgr_t *manager;
4854	int cc;
4855	char strbuf[ISC_STRERRORSIZE];
4856
4857	REQUIRE(VALID_SOCKET(sock));
4858	REQUIRE(addr != NULL);
4859	REQUIRE(task != NULL);
4860	REQUIRE(action != NULL);
4861
4862	manager = sock->manager;
4863	REQUIRE(VALID_MANAGER(manager));
4864	REQUIRE(addr != NULL);
4865
4866	if (isc_sockaddr_ismulticast(addr))
4867		return (ISC_R_MULTICAST);
4868
4869	LOCK(&sock->lock);
4870
4871	REQUIRE(!sock->connecting);
4872
4873	dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4874							ISC_SOCKEVENT_CONNECT,
4875							action,	arg,
4876							sizeof(*dev));
4877	if (dev == NULL) {
4878		UNLOCK(&sock->lock);
4879		return (ISC_R_NOMEMORY);
4880	}
4881	ISC_LINK_INIT(dev, ev_link);
4882
4883	/*
4884	 * Try to do the connect right away, as there can be only one
4885	 * outstanding, and it might happen to complete.
4886	 */
4887	sock->peer_address = *addr;
4888	cc = connect(sock->fd, &addr->type.sa, addr->length);
4889	if (cc < 0) {
4890		/*
4891		 * HP-UX "fails" to connect a UDP socket and sets errno to
4892		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4893		 * a success and let the user detect it if it's really an error
4894		 * at the time of sending a packet on the socket.
4895		 */
4896		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4897			cc = 0;
4898			goto success;
4899		}
4900		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4901			goto queue;
4902
4903		switch (errno) {
4904#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4905			ERROR_MATCH(EACCES, ISC_R_NOPERM);
4906			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4907			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4908			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4909			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4910#ifdef EHOSTDOWN
4911			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4912#endif
4913			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4914			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4915			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4916			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4917			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4918#undef ERROR_MATCH
4919		}
4920
4921		sock->connected = 0;
4922
4923		isc__strerror(errno, strbuf, sizeof(strbuf));
4924		UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
4925
4926		UNLOCK(&sock->lock);
4927		inc_stats(sock->manager->stats,
4928			  sock->statsindex[STATID_CONNECTFAIL]);
4929		isc_event_free(ISC_EVENT_PTR(&dev));
4930		return (ISC_R_UNEXPECTED);
4931
4932	err_exit:
4933		sock->connected = 0;
4934		isc_task_send(task, ISC_EVENT_PTR(&dev));
4935
4936		UNLOCK(&sock->lock);
4937		inc_stats(sock->manager->stats,
4938			  sock->statsindex[STATID_CONNECTFAIL]);
4939		return (ISC_R_SUCCESS);
4940	}
4941
4942	/*
4943	 * If connect completed, fire off the done event.
4944	 */
4945 success:
4946	if (cc == 0) {
4947		sock->connected = 1;
4948		sock->bound = 1;
4949		dev->result = ISC_R_SUCCESS;
4950		isc_task_send(task, ISC_EVENT_PTR(&dev));
4951
4952		UNLOCK(&sock->lock);
4953
4954		inc_stats(sock->manager->stats,
4955			  sock->statsindex[STATID_CONNECT]);
4956
4957		return (ISC_R_SUCCESS);
4958	}
4959
4960 queue:
4961
4962	/*
4963	 * Attach to task.
4964	 */
4965	isc_task_attach(task, &ntask);
4966
4967	sock->connecting = 1;
4968
4969	dev->ev_sender = ntask;
4970
4971	/*
4972	 * Poke watcher here.  We still have the socket locked, so there
4973	 * is no race condition.  We will keep the lock for such a short
4974	 * bit of time waking it up now or later won't matter all that much.
4975	 */
4976	if (sock->connect_ev == NULL)
4977		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
4978
4979	sock->connect_ev = dev;
4980
4981	UNLOCK(&sock->lock);
4982	return (ISC_R_SUCCESS);
4983}
4984
4985/*
4986 * Called when a socket with a pending connect() finishes.
4987 */
4988static void
4989internal_connect(isc_task_t *me, isc_event_t *ev) {
4990	isc_socket_t *sock;
4991	isc_socket_connev_t *dev;
4992	isc_task_t *task;
4993	int cc;
4994	ISC_SOCKADDR_LEN_T optlen;
4995	char strbuf[ISC_STRERRORSIZE];
4996	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4997
4998	UNUSED(me);
4999	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5000
5001	sock = ev->ev_sender;
5002	INSIST(VALID_SOCKET(sock));
5003
5004	LOCK(&sock->lock);
5005
5006	/*
5007	 * When the internal event was sent the reference count was bumped
5008	 * to keep the socket around for us.  Decrement the count here.
5009	 */
5010	INSIST(sock->references > 0);
5011	sock->references--;
5012	if (sock->references == 0) {
5013		UNLOCK(&sock->lock);
5014		destroy(&sock);
5015		return;
5016	}
5017
5018	/*
5019	 * Has this event been canceled?
5020	 */
5021	dev = sock->connect_ev;
5022	if (dev == NULL) {
5023		INSIST(!sock->connecting);
5024		UNLOCK(&sock->lock);
5025		return;
5026	}
5027
5028	INSIST(sock->connecting);
5029	sock->connecting = 0;
5030
5031	/*
5032	 * Get any possible error status here.
5033	 */
5034	optlen = sizeof(cc);
5035	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5036		       (void *)&cc, (void *)&optlen) < 0)
5037		cc = errno;
5038	else
5039		errno = cc;
5040
5041	if (errno != 0) {
5042		/*
5043		 * If the error is EAGAIN, just re-select on this
5044		 * fd and pretend nothing strange happened.
5045		 */
5046		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5047			sock->connecting = 1;
5048			select_poke(sock->manager, sock->fd,
5049				    SELECT_POKE_CONNECT);
5050			UNLOCK(&sock->lock);
5051
5052			return;
5053		}
5054
5055		inc_stats(sock->manager->stats,
5056			  sock->statsindex[STATID_CONNECTFAIL]);
5057
5058		/*
5059		 * Translate other errors into ISC_R_* flavors.
5060		 */
5061		switch (errno) {
5062#define ERROR_MATCH(a, b) case a: dev->result = b; break;
5063			ERROR_MATCH(EACCES, ISC_R_NOPERM);
5064			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5065			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5066			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5067			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5068#ifdef EHOSTDOWN
5069			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5070#endif
5071			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5072			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5073			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5074			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5075			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5076			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5077#undef ERROR_MATCH
5078		default:
5079			dev->result = ISC_R_UNEXPECTED;
5080			isc_sockaddr_format(&sock->peer_address, peerbuf,
5081					    sizeof(peerbuf));
5082			isc__strerror(errno, strbuf, sizeof(strbuf));
5083			UNEXPECTED_ERROR(__FILE__, __LINE__,
5084					 "internal_connect: connect(%s) %s",
5085					 peerbuf, strbuf);
5086		}
5087	} else {
5088		inc_stats(sock->manager->stats,
5089			  sock->statsindex[STATID_CONNECT]);
5090		dev->result = ISC_R_SUCCESS;
5091		sock->connected = 1;
5092		sock->bound = 1;
5093	}
5094
5095	sock->connect_ev = NULL;
5096
5097	UNLOCK(&sock->lock);
5098
5099	task = dev->ev_sender;
5100	dev->ev_sender = sock;
5101	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5102}
5103
5104isc_result_t
5105isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5106	isc_result_t result;
5107
5108	REQUIRE(VALID_SOCKET(sock));
5109	REQUIRE(addressp != NULL);
5110
5111	LOCK(&sock->lock);
5112
5113	if (sock->connected) {
5114		*addressp = sock->peer_address;
5115		result = ISC_R_SUCCESS;
5116	} else {
5117		result = ISC_R_NOTCONNECTED;
5118	}
5119
5120	UNLOCK(&sock->lock);
5121
5122	return (result);
5123}
5124
5125isc_result_t
5126isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5127	ISC_SOCKADDR_LEN_T len;
5128	isc_result_t result;
5129	char strbuf[ISC_STRERRORSIZE];
5130
5131	REQUIRE(VALID_SOCKET(sock));
5132	REQUIRE(addressp != NULL);
5133
5134	LOCK(&sock->lock);
5135
5136	if (!sock->bound) {
5137		result = ISC_R_NOTBOUND;
5138		goto out;
5139	}
5140
5141	result = ISC_R_SUCCESS;
5142
5143	len = sizeof(addressp->type);
5144	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5145		isc__strerror(errno, strbuf, sizeof(strbuf));
5146		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5147				 strbuf);
5148		result = ISC_R_UNEXPECTED;
5149		goto out;
5150	}
5151	addressp->length = (unsigned int)len;
5152
5153 out:
5154	UNLOCK(&sock->lock);
5155
5156	return (result);
5157}
5158
5159/*
5160 * Run through the list of events on this socket, and cancel the ones
5161 * queued for task "task" of type "how".  "how" is a bitmask.
5162 */
5163void
5164isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
5165
5166	REQUIRE(VALID_SOCKET(sock));
5167
5168	/*
5169	 * Quick exit if there is nothing to do.  Don't even bother locking
5170	 * in this case.
5171	 */
5172	if (how == 0)
5173		return;
5174
5175	LOCK(&sock->lock);
5176
5177	/*
5178	 * All of these do the same thing, more or less.
5179	 * Each will:
5180	 *	o If the internal event is marked as "posted" try to
5181	 *	  remove it from the task's queue.  If this fails, mark it
5182	 *	  as canceled instead, and let the task clean it up later.
5183	 *	o For each I/O request for that task of that type, post
5184	 *	  its done event with status of "ISC_R_CANCELED".
5185	 *	o Reset any state needed.
5186	 */
5187	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5188	    && !ISC_LIST_EMPTY(sock->recv_list)) {
5189		isc_socketevent_t      *dev;
5190		isc_socketevent_t      *next;
5191		isc_task_t	       *current_task;
5192
5193		dev = ISC_LIST_HEAD(sock->recv_list);
5194
5195		while (dev != NULL) {
5196			current_task = dev->ev_sender;
5197			next = ISC_LIST_NEXT(dev, ev_link);
5198
5199			if ((task == NULL) || (task == current_task)) {
5200				dev->result = ISC_R_CANCELED;
5201				send_recvdone_event(sock, &dev);
5202			}
5203			dev = next;
5204		}
5205	}
5206
5207	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5208	    && !ISC_LIST_EMPTY(sock->send_list)) {
5209		isc_socketevent_t      *dev;
5210		isc_socketevent_t      *next;
5211		isc_task_t	       *current_task;
5212
5213		dev = ISC_LIST_HEAD(sock->send_list);
5214
5215		while (dev != NULL) {
5216			current_task = dev->ev_sender;
5217			next = ISC_LIST_NEXT(dev, ev_link);
5218
5219			if ((task == NULL) || (task == current_task)) {
5220				dev->result = ISC_R_CANCELED;
5221				send_senddone_event(sock, &dev);
5222			}
5223			dev = next;
5224		}
5225	}
5226
5227	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5228	    && !ISC_LIST_EMPTY(sock->accept_list)) {
5229		isc_socket_newconnev_t *dev;
5230		isc_socket_newconnev_t *next;
5231		isc_task_t	       *current_task;
5232
5233		dev = ISC_LIST_HEAD(sock->accept_list);
5234		while (dev != NULL) {
5235			current_task = dev->ev_sender;
5236			next = ISC_LIST_NEXT(dev, ev_link);
5237
5238			if ((task == NULL) || (task == current_task)) {
5239
5240				ISC_LIST_UNLINK(sock->accept_list, dev,
5241						ev_link);
5242
5243				dev->newsocket->references--;
5244				free_socket(&dev->newsocket);
5245
5246				dev->result = ISC_R_CANCELED;
5247				dev->ev_sender = sock;
5248				isc_task_sendanddetach(&current_task,
5249						       ISC_EVENT_PTR(&dev));
5250			}
5251
5252			dev = next;
5253		}
5254	}
5255
5256	/*
5257	 * Connecting is not a list.
5258	 */
5259	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5260	    && sock->connect_ev != NULL) {
5261		isc_socket_connev_t    *dev;
5262		isc_task_t	       *current_task;
5263
5264		INSIST(sock->connecting);
5265		sock->connecting = 0;
5266
5267		dev = sock->connect_ev;
5268		current_task = dev->ev_sender;
5269
5270		if ((task == NULL) || (task == current_task)) {
5271			sock->connect_ev = NULL;
5272
5273			dev->result = ISC_R_CANCELED;
5274			dev->ev_sender = sock;
5275			isc_task_sendanddetach(&current_task,
5276					       ISC_EVENT_PTR(&dev));
5277		}
5278	}
5279
5280	UNLOCK(&sock->lock);
5281}
5282
/*
 * Return the type (udp, tcp, unix, fdwatch) this socket was created as.
 */
isc_sockettype_t
isc_socket_gettype(isc_socket_t *sock) {
	REQUIRE(VALID_SOCKET(sock));

	return (sock->type);
}
5289
5290isc_boolean_t
5291isc_socket_isbound(isc_socket_t *sock) {
5292	isc_boolean_t val;
5293
5294	LOCK(&sock->lock);
5295	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5296	UNLOCK(&sock->lock);
5297
5298	return (val);
5299}
5300
5301void
5302isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
5303#if defined(IPV6_V6ONLY)
5304	int onoff = yes ? 1 : 0;
5305#else
5306	UNUSED(yes);
5307	UNUSED(sock);
5308#endif
5309
5310	REQUIRE(VALID_SOCKET(sock));
5311
5312#ifdef IPV6_V6ONLY
5313	if (sock->pf == AF_INET6) {
5314		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5315			       (void *)&onoff, sizeof(int)) < 0) {
5316			char strbuf[ISC_STRERRORSIZE];
5317
5318			UNEXPECTED_ERROR(__FILE__, __LINE__,
5319					 "setsockopt(%d, IPV6_V6ONLY) "
5320					 "%s: %s", sock->fd,
5321					 isc_msgcat_get(isc_msgcat,
5322							ISC_MSGSET_GENERAL,
5323							ISC_MSG_FAILED,
5324							"failed"),
5325					 strbuf);
5326		}
5327	}
5328	FIX_IPV6_RECVPKTINFO(sock);	/* AIX */
5329#endif
5330}
5331
5332#ifndef ISC_PLATFORM_USETHREADS
5333/* In our assumed scenario, we can simply use a single static object. */
5334static isc_socketwait_t swait_private;
5335
/*
 * Non-threaded mode: wait (up to 'tvp', or forever when NULL) for
 * socket events using whichever backend this build selected (kqueue,
 * epoll, /dev/poll, or select).  On return, *swaitp points at the
 * static wait-state object to be passed to isc__socketmgr_dispatch().
 * Returns the backend's event count (0 when no manager exists).
 */
int
isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
	int n;
#ifdef USE_KQUEUE
	struct timespec ts, *tsp;
#endif
#ifdef USE_EPOLL
	int timeout;
#endif
#ifdef USE_DEVPOLL
	struct dvpoll dvp;
#endif

	REQUIRE(swaitp != NULL && *swaitp == NULL);

	if (socketmgr == NULL)
		return (0);

#ifdef USE_KQUEUE
	/* kevent() takes a timespec; convert from the timeval. */
	if (tvp != NULL) {
		ts.tv_sec = tvp->tv_sec;
		ts.tv_nsec = tvp->tv_usec * 1000;
		tsp = &ts;
	} else
		tsp = NULL;
	swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
				       socketmgr->events, socketmgr->nevents,
				       tsp);
	n = swait_private.nevents;
#elif defined(USE_EPOLL)
	/* epoll_wait() takes milliseconds; round microseconds up. */
	if (tvp != NULL)
		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
	else
		timeout = -1;
	swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
					   socketmgr->events,
					   socketmgr->nevents, timeout);
	n = swait_private.nevents;
#elif defined(USE_DEVPOLL)
	/* /dev/poll timeout is also in milliseconds. */
	dvp.dp_fds = socketmgr->events;
	dvp.dp_nfds = socketmgr->nevents;
	if (tvp != NULL) {
		dvp.dp_timeout = tvp->tv_sec * 1000 +
			(tvp->tv_usec + 999) / 1000;
	} else
		dvp.dp_timeout = -1;
	swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
	n = swait_private.nevents;
#elif defined(USE_SELECT)
	/*
	 * select() modifies its fd sets in place, so operate on copies
	 * to preserve the manager's master sets.
	 */
	memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
	       socketmgr->fd_bufsize);
	memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
	       socketmgr->fd_bufsize);

	swait_private.readset = socketmgr->read_fds_copy;
	swait_private.writeset = socketmgr->write_fds_copy;
	swait_private.maxfd = socketmgr->maxfd + 1;

	n = select(swait_private.maxfd, swait_private.readset,
		   swait_private.writeset, NULL, tvp);
#endif

	*swaitp = &swait_private;
	return (n);
}
5401
5402isc_result_t
5403isc__socketmgr_dispatch(isc_socketwait_t *swait) {
5404	REQUIRE(swait == &swait_private);
5405
5406	if (socketmgr == NULL)
5407		return (ISC_R_NOTFOUND);
5408
5409#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5410	(void)process_fds(socketmgr, socketmgr->events, swait->nevents);
5411	return (ISC_R_SUCCESS);
5412#elif defined(USE_SELECT)
5413	process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5414	return (ISC_R_SUCCESS);
5415#endif
5416}
5417#endif /* ISC_PLATFORM_USETHREADS */
5418
5419void
5420isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
5421
5422	/*
5423	 * Name 'socket'.
5424	 */
5425
5426	REQUIRE(VALID_SOCKET(socket));
5427
5428	LOCK(&socket->lock);
5429	memset(socket->name, 0, sizeof(socket->name));
5430	strncpy(socket->name, name, sizeof(socket->name) - 1);
5431	socket->tag = tag;
5432	UNLOCK(&socket->lock);
5433}
5434
5435const char *
5436isc_socket_getname(isc_socket_t *socket) {
5437	return (socket->name);
5438}
5439
5440void *
5441isc_socket_gettag(isc_socket_t *socket) {
5442	return (socket->tag);
5443}
5444
5445#ifdef HAVE_LIBXML2
5446
5447static const char *
5448_socktype(isc_sockettype_t type)
5449{
5450	if (type == isc_sockettype_udp)
5451		return ("udp");
5452	else if (type == isc_sockettype_tcp)
5453		return ("tcp");
5454	else if (type == isc_sockettype_unix)
5455		return ("unix");
5456	else if (type == isc_sockettype_fdwatch)
5457		return ("fdwatch");
5458	else
5459		return ("not-initialized");
5460}
5461
void
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
{
	/*
	 * Render the socket manager's state as XML for the statistics
	 * channel: one <socket> element per socket on mgr->socklist,
	 * each with its id, optional name, reference count, type,
	 * peer/local addresses (when available), and current state
	 * flags.
	 *
	 * Locking: the manager lock is held for the whole traversal,
	 * and each socket's own lock is held while that socket is
	 * rendered (taken inside the manager lock, matching the
	 * mgr-then-socket order used elsewhere in this file).
	 *
	 * NOTE(review): xmlTextWriter* return codes are ignored
	 * throughout; a writer error is silently dropped rather than
	 * reported to the caller.  Confirm this best-effort behavior is
	 * intended before adding error propagation.
	 */
	isc_socket_t *sock;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	ISC_SOCKADDR_LEN_T len;

	LOCK(&mgr->lock);

#ifndef ISC_PLATFORM_USETHREADS
	/* Reference counting only exists on the non-threaded manager. */
	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
	xmlTextWriterEndElement(writer);
#endif

	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");

		/* The socket's address doubles as its unique id. */
		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
		xmlTextWriterWriteFormatString(writer, "%p", sock);
		xmlTextWriterEndElement(writer);

		/* <name> is emitted only if isc_socket_setname() was used. */
		if (sock->name[0] != 0) {
			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
			xmlTextWriterWriteFormatString(writer, "%s",
						       sock->name);
			xmlTextWriterEndElement(writer); /* name */
		}

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
		xmlTextWriterEndElement(writer);

		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
					  ISC_XMLCHAR _socktype(sock->type));

		/* peer_address is only meaningful once connected. */
		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "peer-address",
						  ISC_XMLCHAR peerbuf);
		}

		/* Local address comes from the kernel; skip on failure. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "local-address",
						  ISC_XMLCHAR peerbuf);
		}

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
		if (sock->pending_recv)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						ISC_XMLCHAR "pending-receive");
		if (sock->pending_send)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending-send");
		/*
		 * NOTE(review): "pending_accept" (underscore) is
		 * inconsistent with the hyphenated state names above,
		 * but it is part of the emitted XML — consumers may
		 * parse it, so do not change it without checking the
		 * statistics schema.
		 */
		if (sock->pending_accept)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						 ISC_XMLCHAR "pending_accept");
		if (sock->listener)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "listener");
		if (sock->connected)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connected");
		if (sock->connecting)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connecting");
		if (sock->bound)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "bound");

		xmlTextWriterEndElement(writer); /* states */

		xmlTextWriterEndElement(writer); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	xmlTextWriterEndElement(writer); /* sockets */

	UNLOCK(&mgr->lock);
}
5552#endif /* HAVE_LIBXML2 */
5553