1/*	$NetBSD: socket.c,v 1.1 2024/02/18 20:57:57 christos Exp $	*/
2
3/*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0.  If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16/*! \file */
17
18#include <inttypes.h>
19#include <stdbool.h>
20#include <sys/param.h>
21#include <sys/socket.h>
22#include <sys/stat.h>
23#include <sys/types.h>
24#if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
25#include <sys/sysctl.h>
26#endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
27#include <sys/time.h>
28#include <sys/uio.h>
29
30#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
31#include <linux/netlink.h>
32#include <linux/rtnetlink.h>
33#endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
34	*/
35
36#include <errno.h>
37#include <fcntl.h>
38#include <stddef.h>
39#include <stdlib.h>
40#include <unistd.h>
41
42#include <isc/app.h>
43#include <isc/buffer.h>
44#include <isc/condition.h>
45#include <isc/formatcheck.h>
46#include <isc/list.h>
47#include <isc/log.h>
48#include <isc/mem.h>
49#include <isc/mutex.h>
50#include <isc/net.h>
51#include <isc/once.h>
52#include <isc/platform.h>
53#include <isc/print.h>
54#include <isc/refcount.h>
55#include <isc/region.h>
56#include <isc/resource.h>
57#include <isc/socket.h>
58#include <isc/stats.h>
59#include <isc/strerr.h>
60#include <isc/string.h>
61#include <isc/task.h>
62#include <isc/thread.h>
63#include <isc/util.h>
64
65#ifdef ISC_PLATFORM_HAVESYSUNH
66#include <sys/un.h>
67#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
68#ifdef HAVE_KQUEUE
69#include <sys/event.h>
70#endif /* ifdef HAVE_KQUEUE */
71#ifdef HAVE_EPOLL_CREATE1
72#include <sys/epoll.h>
73#endif /* ifdef HAVE_EPOLL_CREATE1 */
74#if defined(HAVE_SYS_DEVPOLL_H)
75#include <sys/devpoll.h>
76#elif defined(HAVE_DEVPOLL_H)
77#include <devpoll.h>
78#endif /* if defined(HAVE_SYS_DEVPOLL_H) */
79
80#include <netinet/tcp.h>
81
82#include "errno2result.h"
83
84#ifdef ENABLE_TCP_FASTOPEN
85#include <netinet/tcp.h>
86#endif /* ifdef ENABLE_TCP_FASTOPEN */
87
88#ifdef HAVE_JSON_C
89#include <json_object.h>
90#endif /* HAVE_JSON_C */
91
92#ifdef HAVE_LIBXML2
93#include <libxml/xmlwriter.h>
94#define ISC_XMLCHAR (const xmlChar *)
95#endif /* HAVE_LIBXML2 */
96
97/*%
98 * Choose the most preferable multiplex method.
99 */
100#if defined(HAVE_KQUEUE)
101#define USE_KQUEUE
102#elif defined(HAVE_EPOLL_CREATE1)
103#define USE_EPOLL
104#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
105#define USE_DEVPOLL
106typedef struct {
107	unsigned int want_read : 1, want_write : 1;
108} pollinfo_t;
109#else /* if defined(HAVE_KQUEUE) */
110#define USE_SELECT
111#endif /* HAVE_KQUEUE */
112
113/*
114 * Set by the -T dscp option on the command line. If set to a value
115 * other than -1, we check to make sure DSCP values match it, and
116 * assert if not.
117 */
118int isc_dscp_check_value = -1;
119
120/*%
121 * Maximum number of allowable open sockets.  This is also the maximum
122 * allowable socket file descriptor.
123 *
124 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system default
126 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
127 * the vast majority of cases.  This constant should therefore be increased only
128 * when absolutely necessary and possible, i.e., the server is exhausting all
129 * available file descriptors (up to FD_SETSIZE) and the select() function
130 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
132 * portability as possible).  Note also that overall server performance
133 * may be rather worsened with a larger value of this constant due to
134 * inherent scalability problems of select().
135 *
136 * As a special note, this value shouldn't have to be touched if
137 * this is a build for an authoritative only DNS server.
138 */
139#ifndef ISC_SOCKET_MAXSOCKETS
140#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
141#ifdef TUNE_LARGE
142#define ISC_SOCKET_MAXSOCKETS 21000
143#else /* ifdef TUNE_LARGE */
144#define ISC_SOCKET_MAXSOCKETS 4096
145#endif /* TUNE_LARGE */
146#elif defined(USE_SELECT)
147#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
148#endif /* USE_KQUEUE... */
149#endif /* ISC_SOCKET_MAXSOCKETS */
150
151#ifdef USE_SELECT
152/*%
153 * Mac OS X needs a special definition to support larger values in select().
154 * We always define this because a larger value can be specified run-time.
155 */
156#ifdef __APPLE__
157#define _DARWIN_UNLIMITED_SELECT
158#endif /* __APPLE__ */
159#endif /* USE_SELECT */
160
161#ifdef ISC_SOCKET_USE_POLLWATCH
162/*%
163 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
164 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
165 * some of the specified FD.  The idea is based on the observation that it's
166 * likely for a busy server to keep receiving packets.  It specifically works
167 * as follows: the socket watcher is first initialized with the state of
168 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
169 * event occurs.  When it wakes up for a socket I/O event, it moves to the
170 * poll_active state, and sets the poll timeout to a short period
171 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
172 * watcher goes to the poll_checking state with the same timeout period.
173 * In this state, the watcher tries to detect whether this is a break
174 * during intermittent events or the kernel bug is triggered.  If the next
175 * polling reports an event within the short period, the previous timeout is
176 * likely to be a kernel bug, and so the watcher goes back to the active state.
177 * Otherwise, it moves to the idle state again.
178 *
179 * It's not clear whether this is a thread-related bug, but since we've only
180 * seen this with threads, this workaround is used only when enabling threads.
181 */
182
183typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
184
185#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
186#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
187#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
188#endif /* ISC_SOCKET_USE_POLLWATCH */
189
190/*%
191 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
192 */
193#define FDLOCK_BITS  10
194#define FDLOCK_COUNT (1 << FDLOCK_BITS)
195#define FDLOCK_ID(fd)                                   \
196	(((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
197	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
198
199/*%
200 * Maximum number of events communicated with the kernel.  There should normally
201 * be no need for having a large number.
202 */
203#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
204#ifndef ISC_SOCKET_MAXEVENTS
205#ifdef TUNE_LARGE
206#define ISC_SOCKET_MAXEVENTS 2048
207#else /* ifdef TUNE_LARGE */
208#define ISC_SOCKET_MAXEVENTS 64
209#endif /* TUNE_LARGE */
210#endif /* ifndef ISC_SOCKET_MAXEVENTS */
211#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
212	* */
213
214/*%
215 * Some systems define the socket length argument as an int, some as size_t,
216 * some as socklen_t.  This is here so it can be easily changed if needed.
217 */
218#ifndef socklen_t
219#define socklen_t unsigned int
220#endif /* ifndef socklen_t */
221
222/*%
223 * Define what the possible "soft" errors can be.  These are non-fatal returns
224 * of various network related functions, like recv() and so on.
225 *
226 * For some reason, BSDI (and perhaps others) will sometimes return <0
227 * from recv() but will have errno==0.  This is broken, but we have to
228 * work around it here.
229 */
230#define SOFT_ERROR(e)                                             \
231	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
232	 (e) == EINTR || (e) == 0)
233
234#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
235
236/*!<
237 * DLVL(90)  --  Function entry/exit and other tracing.
238 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
239 * DLVL(60)  --  Socket data send/receive
240 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
241 * DLVL(20)  --  Socket creation/destruction.
242 */
243#define TRACE_LEVEL	  90
244#define CORRECTNESS_LEVEL 70
245#define IOEVENT_LEVEL	  60
246#define EVENT_LEVEL	  50
247#define CREATION_LEVEL	  20
248
249#define TRACE	    DLVL(TRACE_LEVEL)
250#define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
251#define IOEVENT	    DLVL(IOEVENT_LEVEL)
252#define EVENT	    DLVL(EVENT_LEVEL)
253#define CREATION    DLVL(CREATION_LEVEL)
254
255typedef isc_event_t intev_t;
256
257#define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
258#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
259
260/*!
261 * IPv6 control information.  If the socket is an IPv6 socket we want
262 * to collect the destination address and interface so the client can
263 * set them on outgoing packets.
264 */
265#ifndef USE_CMSG
266#define USE_CMSG 1
267#endif /* ifndef USE_CMSG */
268
269/*%
270 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
271 * a setsockopt() like interface to request timestamps, and if the OS
272 * doesn't do it for us, call gettimeofday() on every UDP receive?
273 */
274#ifdef SO_TIMESTAMP
275#ifndef USE_CMSG
276#define USE_CMSG 1
277#endif /* ifndef USE_CMSG */
278#endif /* ifdef SO_TIMESTAMP */
279
280#if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
281#define SET_RCVBUF
282#endif
283
284#if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
285#define SET_SNDBUF
286#endif
287
288/*%
289 * Instead of calculating the cmsgbuf lengths every time we take
290 * a rule of thumb approach - sizes are taken from x86_64 linux,
291 * multiplied by 2, everything should fit. Those sizes are not
292 * large enough to cause any concern.
293 */
294#if defined(USE_CMSG)
295#define CMSG_SP_IN6PKT 40
296#else /* if defined(USE_CMSG) */
297#define CMSG_SP_IN6PKT 0
298#endif /* if defined(USE_CMSG) */
299
300#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
301#define CMSG_SP_TIMESTAMP 32
302#else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
303#define CMSG_SP_TIMESTAMP 0
304#endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
305
306#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
307#define CMSG_SP_TCTOS 24
308#else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
309#define CMSG_SP_TCTOS 0
310#endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
311
312#define CMSG_SP_INT 24
313
314/* Align cmsg buffers to be safe on SPARC etc. */
315#define RECVCMSGBUFLEN                                                       \
316	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
317			  1,                                                 \
318		  sizeof(void *))
319#define SENDCMSGBUFLEN                                                    \
320	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
321		  sizeof(void *))
322
323/*%
324 * The number of times a send operation is repeated if the result is EINTR.
325 */
326#define NRETRIES 10
327
328typedef struct isc__socketthread isc__socketthread_t;
329
330#define NEWCONNSOCK(ev) ((ev)->newsocket)
331
struct isc_socket {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MAGIC while the object is valid */
	isc_socketmgr_t *manager; /* owning manager; set at creation */
	isc_mutex_t lock;
	isc_sockettype_t type;
	const isc_statscounter_t *statsindex; /* STATID_* -> counter map */
	isc_refcount_t references;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t) link; /* chain on manager->socklist */
	int fd;			     /* OS file descriptor, -1-free */
	int pf;			     /* protocol family (AF_INET etc.) */
	int threadid;		     /* watcher thread serving this fd */
	char name[16];		     /* debugging name, not a key */
	void *tag;		     /* opaque user tag */

	/* Pending I/O operations, completed in FIFO order. */
	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1,	       /* listener socket */
		connected : 1, connecting : 1, /* connect pending
						* */
		bound  : 1,		       /* bound to local addr */
		dupped : 1, active : 1,	       /* currently active */
		pktdscp : 1;		       /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif				/* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/* fdwatch callback state (see internal_fdwatch_read/write). */
	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t              *fdwatchtask;
	unsigned int		dscp; /* DSCP value for outgoing packets */
};
373
374#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
375#define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
376
struct isc_socketmgr {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MANAGER_MAGIC while valid */
	isc_mem_t *mctx;    /* memory context for all allocations */
	isc_mutex_t lock;
	isc_stats_t *stats; /* optional socket statistics; may be NULL */
	int nthreads;	    /* number of watcher threads */
	isc__socketthread_t *threads; /* array of nthreads watcher states */
	unsigned int maxsocks;	      /* max fds (bounds fds/fdstate arrays) */
	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t) socklist; /* all live sockets */
	int reserved; /* unlocked */
	isc_condition_t shutdown_ok; /* signalled when socklist drains */
	size_t maxudp; /* max UDP payload; 0 presumably means no limit —
			* TODO confirm against callers */
};
392
/*
 * Per-watcher-thread state.  Exactly one of the USE_* sections below is
 * compiled in, matching the multiplex method chosen near the top of the
 * file.
 */
struct isc__socketthread {
	isc_socketmgr_t *manager; /* back-pointer to owning manager */
	int threadid;		  /* index into manager->threads */
	isc_thread_t thread;
	int pipe_fds[2];	  /* self-pipe for select_poke() messages */
	isc_mutex_t *fdlock;	  /* FDLOCK_COUNT bucket locks */
	/* Locked by fdlock. */
	isc_socket_t **fds; /* fd -> socket, indexed up to maxsocks */
	int *fdstate;	    /* fd -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_KQUEUE
	int kqueue_fd;
	int nevents;	       /* capacity of 'events' */
	struct kevent *events; /* kernel event return buffer */
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;
	int nevents;		    /* capacity of 'events' */
	struct epoll_event *events; /* kernel event return buffer */
	uint32_t *epoll_events;	    /* cached per-fd interest mask */
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;		  /* capacity of 'events' */
	struct pollfd *events;	  /* DP_POLL return buffer */
	pollinfo_t *fdpollinfo;	  /* per-fd want_read/want_write bits */
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize;		  /* byte size of each fd_set buffer */
	fd_set *read_fds;	  /* master read interest set */
	fd_set *read_fds_copy;	  /* scratch copy passed to select() */
	fd_set *write_fds;	  /* master write interest set */
	fd_set *write_fds_copy;	  /* scratch copy passed to select() */
	int maxfd;		  /* first argument for select() */
#endif /* USE_SELECT */
};
430
431#define CLOSED	      0 /* this one must be zero */
432#define MANAGED	      1
433#define CLOSE_PENDING 2
434
435/*
436 * send() and recv() iovec counts
437 */
438#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
439#ifdef ISC_PLATFORM_RECVOVERFLOW
440#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
441#else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
442#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
443#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
444
445static isc_result_t
446socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
447	      isc_socket_t **socketp, isc_socket_t *dup_socket);
448static void
449send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
450static void
451send_senddone_event(isc_socket_t *, isc_socketevent_t **);
452static void
453send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
454static void
455free_socket(isc_socket_t **);
456static isc_result_t
457allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
458static void
459destroy(isc_socket_t **);
460static void
461internal_accept(isc_socket_t *);
462static void
463internal_connect(isc_socket_t *);
464static void
465internal_recv(isc_socket_t *);
466static void
467internal_send(isc_socket_t *);
468static void
469process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
470static void
471build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
472		  struct iovec *, size_t *);
473static void
474build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
475		  struct iovec *, size_t *);
476static bool
477process_ctlfd(isc__socketthread_t *thread);
478static void
479setdscp(isc_socket_t *sock, isc_dscp_t dscp);
480static void
481dispatch_recv(isc_socket_t *sock);
482static void
483dispatch_send(isc_socket_t *sock);
484static void
485internal_fdwatch_read(isc_socket_t *sock);
486static void
487internal_fdwatch_write(isc_socket_t *sock);
488
489#define SELECT_POKE_SHUTDOWN (-1)
490#define SELECT_POKE_NOTHING  (-2)
491#define SELECT_POKE_READ     (-3)
492#define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
493#define SELECT_POKE_WRITE    (-4)
494#define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
495#define SELECT_POKE_CLOSE    (-5)
496
497/*%
498 * Shortcut index arrays to get access to statistics counters.
499 */
/*
 * Indices into the per-socket-type statsindex arrays below; each maps a
 * generic operation to that type's isc_sockstatscounter_* id.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
/*
 * Counter tables indexed by the STATID_* enum above.  A -1 entry marks an
 * operation that has no counter for that socket type (e.g. accept on UDP);
 * inc_stats()/dec_stats() REQUIRE the id != -1, so such slots must never
 * be referenced.
 */
static const isc_statscounter_t udp4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
};
static const isc_statscounter_t udp6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tc6active
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
/* Raw sockets are receive-only from the library's point of view. */
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
576
577static int
578gen_threadid(isc_socket_t *sock);
579
580static int
581gen_threadid(isc_socket_t *sock) {
582	return (sock->fd % sock->manager->nthreads);
583}
584
585static void
586manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
587	    isc_logmodule_t *module, int level, const char *fmt, ...)
588	ISC_FORMAT_PRINTF(5, 6);
589static void
590manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
591	    isc_logmodule_t *module, int level, const char *fmt, ...) {
592	char msgbuf[2048];
593	va_list ap;
594
595	if (!isc_log_wouldlog(isc_lctx, level)) {
596		return;
597	}
598
599	va_start(ap, fmt);
600	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
601	va_end(ap);
602
603	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
604		      sockmgr, msgbuf);
605}
606
607static void
608thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
609	   isc_logmodule_t *module, int level, const char *fmt, ...)
610	ISC_FORMAT_PRINTF(5, 6);
611static void
612thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
613	   isc_logmodule_t *module, int level, const char *fmt, ...) {
614	char msgbuf[2048];
615	va_list ap;
616
617	if (!isc_log_wouldlog(isc_lctx, level)) {
618		return;
619	}
620
621	va_start(ap, fmt);
622	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
623	va_end(ap);
624
625	isc_log_write(isc_lctx, category, module, level,
626		      "sockmgr %p thread %d: %s", thread->manager,
627		      thread->threadid, msgbuf);
628}
629
630static void
631socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
632	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
633	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
634static void
635socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
636	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
637	   const char *fmt, ...) {
638	char msgbuf[2048];
639	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
640	va_list ap;
641
642	if (!isc_log_wouldlog(isc_lctx, level)) {
643		return;
644	}
645
646	va_start(ap, fmt);
647	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
648	va_end(ap);
649
650	if (address == NULL) {
651		isc_log_write(isc_lctx, category, module, level,
652			      "socket %p: %s", sock, msgbuf);
653	} else {
654		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
655		isc_log_write(isc_lctx, category, module, level,
656			      "socket %p %s: %s", sock, peerbuf, msgbuf);
657	}
658}
659
660/*%
661 * Increment socket-related statistics counters.
662 */
663static void
664inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
665	REQUIRE(counterid != -1);
666
667	if (stats != NULL) {
668		isc_stats_increment(stats, counterid);
669	}
670}
671
672/*%
673 * Decrement socket-related statistics counters.
674 */
675static void
676dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
677	REQUIRE(counterid != -1);
678
679	if (stats != NULL) {
680		isc_stats_decrement(stats, counterid);
681	}
682}
683
/*
 * Register interest in 'fd' with the kernel's event mechanism.
 *
 * 'msg' selects the direction: SELECT_POKE_READ (== SELECT_POKE_ACCEPT)
 * requests read events; any other value requests write events.  Exactly
 * one of the platform branches below is compiled in.  Returns
 * ISC_R_SUCCESS or an errno-derived result code.
 */
static isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	/* Add a persistent kevent filter for the requested direction. */
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	uint32_t oldevents;
	int ret;
	int op;

	/*
	 * epoll keeps a single interest mask per fd, so merge the new
	 * bit into our cached copy; ADD only if the fd was previously
	 * unregistered, otherwise MOD.
	 */
	oldevents = thread->epoll_events[fd];
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] |= EPOLLIN;
	} else {
		thread->epoll_events[fd] |= EPOLLOUT;
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	/* Hold the socket lock (if a socket owns this fd) across the ctl. */
	if (thread->fds[fd] != NULL) {
		LOCK(&thread->fds[fd]->lock);
	}
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (thread->fds[fd] != NULL) {
		UNLOCK(&thread->fds[fd]->lock);
	}
	if (ret == -1) {
		if (errno == EEXIST) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d",
					 fd);
		}
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	/* /dev/poll: writing a pollfd adds its events to the set. */
	struct pollfd pfd;

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ) {
		pfd.events = POLLIN;
	} else {
		pfd.events = POLLOUT;
	}
	pfd.fd = fd;
	pfd.revents = 0;
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
		result = isc__errno2result(errno);
	} else {
		/* Remember what we asked for; needed by unwatch_fd(). */
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 1;
		} else {
			thread->fdpollinfo[fd].want_write = 1;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The fd_sets are shared with the watcher; manager lock guards them. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_SET(fd, thread->read_fds);
	}
	if (msg == SELECT_POKE_WRITE) {
		FD_SET(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
775
/*
 * Remove interest in 'fd' from the kernel's event mechanism; the inverse
 * of watch_fd().  'msg' selects the direction as in watch_fd().  Returns
 * ISC_R_SUCCESS, an errno-derived result, or ISC_R_UNEXPECTED.
 */
static isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	int ret;
	int op;

	/* Clear the bit in the cached mask; DEL when nothing remains. */
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	/* ENOENT just means the fd was already gone; not an error here. */
	if (ret == -1 && errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 0;
		} else {
			thread->fdpollinfo[fd].want_write = 0;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_CLR(fd, thread->read_fds);
	} else if (msg == SELECT_POKE_WRITE) {
		FD_CLR(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
868
869/*
870 * A poke message was received, perform a proper watch/unwatch
871 * on a fd provided
872 */
873static void
874wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
875	isc_result_t result;
876	int lockid = FDLOCK_ID(fd);
877
878	/*
879	 * This is a wakeup on a socket.  If the socket is not in the
880	 * process of being closed, start watching it for either reads
881	 * or writes.
882	 */
883
884	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
885
886	if (msg == SELECT_POKE_CLOSE) {
887		LOCK(&thread->fdlock[lockid]);
888		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
889		thread->fdstate[fd] = CLOSED;
890		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
891		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
892		(void)close(fd);
893		UNLOCK(&thread->fdlock[lockid]);
894		return;
895	}
896
897	LOCK(&thread->fdlock[lockid]);
898	if (thread->fdstate[fd] == CLOSE_PENDING) {
899		/*
900		 * We accept (and ignore) any error from unwatch_fd() as we are
901		 * closing the socket, hoping it doesn't leave dangling state in
902		 * the kernel.
903		 * Note that unwatch_fd() must be called after releasing the
904		 * fdlock; otherwise it could cause deadlock due to a lock order
905		 * reversal.
906		 */
907		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
908		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
909		UNLOCK(&thread->fdlock[lockid]);
910		return;
911	}
912	if (thread->fdstate[fd] != MANAGED) {
913		UNLOCK(&thread->fdlock[lockid]);
914		return;
915	}
916
917	/*
918	 * Set requested bit.
919	 */
920	result = watch_fd(thread, fd, msg);
921	if (result != ISC_R_SUCCESS) {
922		/*
923		 * XXXJT: what should we do?  Ignoring the failure of watching
924		 * a socket will make the application dysfunctional, but there
925		 * seems to be no reasonable recovery process.
926		 */
927		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
928			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
929			      "failed to start watching FD (%d): %s", fd,
930			      isc_result_totext(result));
931	}
932	UNLOCK(&thread->fdlock[lockid]);
933}
934
935/*
936 * Poke the select loop when there is something for us to do.
937 * The write is required (by POSIX) to complete.  That is, we
938 * will not get partial writes.
939 */
940static void
941select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
942	int cc;
943	int buf[2];
944	char strbuf[ISC_STRERRORSIZE];
945
946	buf[0] = fd;
947	buf[1] = msg;
948
949	do {
950		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
951			   sizeof(buf));
952#ifdef ENOSR
953		/*
954		 * Treat ENOSR as EAGAIN but loop slowly as it is
955		 * unlikely to clear fast.
956		 */
957		if (cc < 0 && errno == ENOSR) {
958			sleep(1);
959			errno = EAGAIN;
960		}
961#endif /* ifdef ENOSR */
962	} while (cc < 0 && SOFT_ERROR(errno));
963
964	if (cc < 0) {
965		strerror_r(errno, strbuf, sizeof(strbuf));
966		FATAL_ERROR(__FILE__, __LINE__,
967			    "write() failed during watcher poke: %s", strbuf);
968	}
969
970	INSIST(cc == sizeof(buf));
971}
972
973/*
974 * Read a message on the internal fd.
975 */
976static void
977select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
978	int buf[2];
979	int cc;
980	char strbuf[ISC_STRERRORSIZE];
981
982	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
983	if (cc < 0) {
984		*msg = SELECT_POKE_NOTHING;
985		*fd = -1; /* Silence compiler. */
986		if (SOFT_ERROR(errno)) {
987			return;
988		}
989
990		strerror_r(errno, strbuf, sizeof(strbuf));
991		FATAL_ERROR(__FILE__, __LINE__,
992			    "read() failed during watcher poke: %s", strbuf);
993	}
994	INSIST(cc == sizeof(buf));
995
996	*fd = buf[0];
997	*msg = buf[1];
998}
999
1000/*
1001 * Make a fd non-blocking.
1002 */
1003static isc_result_t
1004make_nonblock(int fd) {
1005	int ret;
1006	char strbuf[ISC_STRERRORSIZE];
1007#ifdef USE_FIONBIO_IOCTL
1008	int on = 1;
1009#else  /* ifdef USE_FIONBIO_IOCTL */
1010	int flags;
1011#endif /* ifdef USE_FIONBIO_IOCTL */
1012
1013#ifdef USE_FIONBIO_IOCTL
1014	ret = ioctl(fd, FIONBIO, (char *)&on);
1015#else  /* ifdef USE_FIONBIO_IOCTL */
1016	flags = fcntl(fd, F_GETFL, 0);
1017	flags |= PORT_NONBLOCK;
1018	ret = fcntl(fd, F_SETFL, flags);
1019#endif /* ifdef USE_FIONBIO_IOCTL */
1020
1021	if (ret == -1) {
1022		strerror_r(errno, strbuf, sizeof(strbuf));
1023		UNEXPECTED_ERROR(__FILE__, __LINE__,
1024#ifdef USE_FIONBIO_IOCTL
1025				 "ioctl(%d, FIONBIO, &on): %s", fd,
1026#else  /* ifdef USE_FIONBIO_IOCTL */
1027				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1028#endif /* ifdef USE_FIONBIO_IOCTL */
1029				 strbuf);
1030
1031		return (ISC_R_UNEXPECTED);
1032	}
1033
1034	return (ISC_R_SUCCESS);
1035}
1036
1037#ifdef USE_CMSG
1038/*
1039 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1040 * In order to ensure as much portability as possible, we provide wrapper
1041 * functions of these macros.
1042 * Note that cmsg_space() could run slow on OSes that do not have
1043 * CMSG_SPACE.
1044 */
1045static socklen_t
1046cmsg_len(socklen_t len) {
1047#ifdef CMSG_LEN
1048	return (CMSG_LEN(len));
1049#else  /* ifdef CMSG_LEN */
1050	socklen_t hdrlen;
1051
1052	/*
1053	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
1054	 * is correct.
1055	 */
1056	hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
1057	return (hdrlen + len);
1058#endif /* ifdef CMSG_LEN */
1059}
1060
/*
 * Return the buffer space consumed by a control message holding 'len'
 * bytes of data, i.e. cmsg_len() rounded up to the alignment required
 * for a following cmsghdr.  The fallback path returns 0 on failure.
 */
static socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else  /* ifdef CMSG_SPACE */
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	/*
	 * Let CMSG_NXTHDR compute where the next header would start; its
	 * offset from the start of the buffer is the space consumed.
	 */
	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL) {
		return ((char *)cmsgp - (char *)msg.msg_control);
	} else {
		return (0);
	}
#endif /* ifdef CMSG_SPACE */
}
1089#endif /* USE_CMSG */
1090
1091/*
1092 * Process control messages received on a socket.
1093 */
1094static void
1095process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1096#ifdef USE_CMSG
1097	struct cmsghdr *cmsgp;
1098	struct in6_pktinfo *pktinfop;
1099#ifdef SO_TIMESTAMP
1100	void *timevalp;
1101#endif /* ifdef SO_TIMESTAMP */
1102#endif /* ifdef USE_CMSG */
1103
1104	/*
1105	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1106	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1107	 * They are all here, outside of the CPP tests, because it is
1108	 * more consistent with the usual ISC coding style.
1109	 */
1110	UNUSED(sock);
1111	UNUSED(msg);
1112	UNUSED(dev);
1113
1114#ifdef MSG_TRUNC
1115	if ((msg->msg_flags & MSG_TRUNC) != 0) {
1116		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1117	}
1118#endif /* ifdef MSG_TRUNC */
1119
1120#ifdef MSG_CTRUNC
1121	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
1122		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1123	}
1124#endif /* ifdef MSG_CTRUNC */
1125
1126#ifndef USE_CMSG
1127	return;
1128#else /* ifndef USE_CMSG */
1129	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
1130		return;
1131	}
1132
1133#ifdef SO_TIMESTAMP
1134	timevalp = NULL;
1135#endif /* ifdef SO_TIMESTAMP */
1136	pktinfop = NULL;
1137
1138	cmsgp = CMSG_FIRSTHDR(msg);
1139	while (cmsgp != NULL) {
1140		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
1141
1142		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
1143		    cmsgp->cmsg_type == IPV6_PKTINFO)
1144		{
1145			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1146			memmove(&dev->pktinfo, pktinfop,
1147				sizeof(struct in6_pktinfo));
1148			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1149			socket_log(sock, NULL, TRACE,
1150				   "interface received on ifindex %u",
1151				   dev->pktinfo.ipi6_ifindex);
1152			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
1153				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1154			}
1155			goto next;
1156		}
1157
1158#ifdef SO_TIMESTAMP
1159		if (cmsgp->cmsg_level == SOL_SOCKET &&
1160		    cmsgp->cmsg_type == SCM_TIMESTAMP)
1161		{
1162			struct timeval tv;
1163			timevalp = CMSG_DATA(cmsgp);
1164			memmove(&tv, timevalp, sizeof(tv));
1165			dev->timestamp.seconds = tv.tv_sec;
1166			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
1167			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1168			goto next;
1169		}
1170#endif /* ifdef SO_TIMESTAMP */
1171
1172#ifdef IPV6_TCLASS
1173		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
1174		    cmsgp->cmsg_type == IPV6_TCLASS)
1175		{
1176			dev->dscp = *(int *)CMSG_DATA(cmsgp);
1177			dev->dscp >>= 2;
1178			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
1179			goto next;
1180		}
1181#endif /* ifdef IPV6_TCLASS */
1182
1183#ifdef IP_TOS
1184		if (cmsgp->cmsg_level == IPPROTO_IP &&
1185		    (cmsgp->cmsg_type == IP_TOS
1186#ifdef IP_RECVTOS
1187		     || cmsgp->cmsg_type == IP_RECVTOS
1188#endif /* ifdef IP_RECVTOS */
1189		     ))
1190		{
1191			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
1192			dev->dscp >>= 2;
1193			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
1194			goto next;
1195		}
1196#endif /* ifdef IP_TOS */
1197	next:
1198		cmsgp = CMSG_NXTHDR(msg, cmsgp);
1199	}
1200#endif /* USE_CMSG */
1201}
1202
1203/*
1204 * Construct an iov array and attach it to the msghdr passed in.  This is
1205 * the SEND constructor, which will use the used region of the buffer
1206 * (if using a buffer list) or will use the internal region (if a single
1207 * buffer I/O is requested).
1208 *
1209 * Nothing can be NULL, and the done event must list at least one buffer
1210 * on the buffer linked list for this function to be meaningful.
1211 *
1212 * If write_countp != NULL, *write_countp will hold the number of bytes
1213 * this transaction can send.
1214 */
1215static void
1216build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
1217		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
1218	unsigned int iovcount;
1219	size_t write_count;
1220	struct cmsghdr *cmsgp;
1221
1222	memset(msg, 0, sizeof(*msg));
1223
1224	if (!sock->connected) {
1225		msg->msg_name = (void *)&dev->address.type.sa;
1226		msg->msg_namelen = dev->address.length;
1227	} else {
1228		msg->msg_name = NULL;
1229		msg->msg_namelen = 0;
1230	}
1231
1232	write_count = dev->region.length - dev->n;
1233	iov[0].iov_base = (void *)(dev->region.base + dev->n);
1234	iov[0].iov_len = write_count;
1235	iovcount = 1;
1236
1237	msg->msg_iov = iov;
1238	msg->msg_iovlen = iovcount;
1239	msg->msg_control = NULL;
1240	msg->msg_controllen = 0;
1241	msg->msg_flags = 0;
1242#if defined(USE_CMSG)
1243
1244	if ((sock->type == isc_sockettype_udp) &&
1245	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
1246	{
1247		struct in6_pktinfo *pktinfop;
1248
1249		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
1250			   dev->pktinfo.ipi6_ifindex);
1251
1252		msg->msg_control = (void *)cmsgbuf;
1253		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1254		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1255
1256		cmsgp = (struct cmsghdr *)cmsgbuf;
1257		cmsgp->cmsg_level = IPPROTO_IPV6;
1258		cmsgp->cmsg_type = IPV6_PKTINFO;
1259		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1260		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1261		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1262	}
1263
1264#if defined(IPV6_USE_MIN_MTU)
1265	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
1266	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
1267	{
1268		int use_min_mtu = 1; /* -1, 0, 1 */
1269
1270		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
1271		msg->msg_control = (void *)cmsgbuf;
1272		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1273		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1274
1275		cmsgp->cmsg_level = IPPROTO_IPV6;
1276		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1277		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1278		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1279	}
1280#endif /* if defined(IPV6_USE_MIN_MTU) */
1281
1282	if (isc_dscp_check_value > -1) {
1283		if (sock->type == isc_sockettype_udp) {
1284			INSIST((int)dev->dscp == isc_dscp_check_value);
1285		} else if (sock->type == isc_sockettype_tcp) {
1286			INSIST((int)sock->dscp == isc_dscp_check_value);
1287		}
1288	}
1289
1290#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
1291	if ((sock->type == isc_sockettype_udp) &&
1292	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
1293	{
1294		int dscp = (dev->dscp << 2) & 0xff;
1295
1296		INSIST(dev->dscp < 0x40);
1297
1298#ifdef IP_TOS
1299		if (sock->pf == AF_INET && sock->pktdscp) {
1300			cmsgp = (struct cmsghdr *)(cmsgbuf +
1301						   msg->msg_controllen);
1302			msg->msg_control = (void *)cmsgbuf;
1303			msg->msg_controllen += cmsg_space(sizeof(dscp));
1304			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1305
1306			cmsgp->cmsg_level = IPPROTO_IP;
1307			cmsgp->cmsg_type = IP_TOS;
1308			cmsgp->cmsg_len = cmsg_len(sizeof(char));
1309			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
1310		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
1311			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
1312				       (void *)&dscp, sizeof(int)) < 0)
1313			{
1314				char strbuf[ISC_STRERRORSIZE];
1315				strerror_r(errno, strbuf, sizeof(strbuf));
1316				UNEXPECTED_ERROR(__FILE__, __LINE__,
1317						 "setsockopt(%d, IP_TOS, %.02x)"
1318						 " failed: %s",
1319						 sock->fd, dscp >> 2, strbuf);
1320			} else {
1321				sock->dscp = dscp;
1322			}
1323		}
1324#endif /* ifdef IP_TOS */
1325#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
1326		if (sock->pf == AF_INET6 && sock->pktdscp) {
1327			cmsgp = (struct cmsghdr *)(cmsgbuf +
1328						   msg->msg_controllen);
1329			msg->msg_control = (void *)cmsgbuf;
1330			msg->msg_controllen += cmsg_space(sizeof(dscp));
1331			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1332
1333			cmsgp->cmsg_level = IPPROTO_IPV6;
1334			cmsgp->cmsg_type = IPV6_TCLASS;
1335			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
1336			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
1337		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
1338			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
1339				       (void *)&dscp, sizeof(int)) < 0)
1340			{
1341				char strbuf[ISC_STRERRORSIZE];
1342				strerror_r(errno, strbuf, sizeof(strbuf));
1343				UNEXPECTED_ERROR(__FILE__, __LINE__,
1344						 "setsockopt(%d, IPV6_TCLASS, "
1345						 "%.02x) failed: %s",
1346						 sock->fd, dscp >> 2, strbuf);
1347			} else {
1348				sock->dscp = dscp;
1349			}
1350		}
1351#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
1352		if (msg->msg_controllen != 0 &&
1353		    msg->msg_controllen < SENDCMSGBUFLEN)
1354		{
1355			memset(cmsgbuf + msg->msg_controllen, 0,
1356			       SENDCMSGBUFLEN - msg->msg_controllen);
1357		}
1358	}
1359#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
1360	* defined(IPV6_TCLASS))                           \
1361	* */
1362#endif /* USE_CMSG */
1363
1364	if (write_countp != NULL) {
1365		*write_countp = write_count;
1366	}
1367}
1368
1369/*
1370 * Construct an iov array and attach it to the msghdr passed in.  This is
1371 * the RECV constructor, which will use the available region of the buffer
1372 * (if using a buffer list) or will use the internal region (if a single
1373 * buffer I/O is requested).
1374 *
1375 * Nothing can be NULL, and the done event must list at least one buffer
1376 * on the buffer linked list for this function to be meaningful.
1377 *
1378 * If read_countp != NULL, *read_countp will hold the number of bytes
1379 * this transaction can receive.
1380 */
1381static void
1382build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
1383		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
1384	unsigned int iovcount;
1385	size_t read_count;
1386
1387	memset(msg, 0, sizeof(struct msghdr));
1388
1389	if (sock->type == isc_sockettype_udp) {
1390		memset(&dev->address, 0, sizeof(dev->address));
1391		msg->msg_name = (void *)&dev->address.type.sa;
1392		msg->msg_namelen = sizeof(dev->address.type);
1393	} else { /* TCP */
1394		msg->msg_name = NULL;
1395		msg->msg_namelen = 0;
1396		dev->address = sock->peer_address;
1397	}
1398
1399	read_count = dev->region.length - dev->n;
1400	iov[0].iov_base = (void *)(dev->region.base + dev->n);
1401	iov[0].iov_len = read_count;
1402	iovcount = 1;
1403
1404	/*
1405	 * If needed, set up to receive that one extra byte.
1406	 */
1407#ifdef ISC_PLATFORM_RECVOVERFLOW
1408	if (sock->type == isc_sockettype_udp) {
1409		INSIST(iovcount < MAXSCATTERGATHER_RECV);
1410		iov[iovcount].iov_base = (void *)(&sock->overflow);
1411		iov[iovcount].iov_len = 1;
1412		iovcount++;
1413	}
1414#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
1415
1416	msg->msg_iov = iov;
1417	msg->msg_iovlen = iovcount;
1418
1419#if defined(USE_CMSG)
1420	msg->msg_control = cmsgbuf;
1421	msg->msg_controllen = RECVCMSGBUFLEN;
1422#else  /* if defined(USE_CMSG) */
1423	msg->msg_control = NULL;
1424	msg->msg_controllen = 0;
1425#endif /* USE_CMSG */
1426	msg->msg_flags = 0;
1427
1428	if (read_countp != NULL) {
1429		*read_countp = read_count;
1430	}
1431}
1432
1433static void
1434set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
1435		isc_socketevent_t *dev) {
1436	if (sock->type == isc_sockettype_udp) {
1437		if (address != NULL) {
1438			dev->address = *address;
1439		} else {
1440			dev->address = sock->peer_address;
1441		}
1442	} else if (sock->type == isc_sockettype_tcp) {
1443		INSIST(address == NULL);
1444		dev->address = sock->peer_address;
1445	}
1446}
1447
1448static void
1449destroy_socketevent(isc_event_t *event) {
1450	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1451
1452	(ev->destroy)(event);
1453}
1454
1455static isc_socketevent_t *
1456allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1457		     isc_taskaction_t action, void *arg) {
1458	isc_socketevent_t *ev;
1459
1460	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1461						     action, arg, sizeof(*ev));
1462
1463	ev->result = ISC_R_UNSET;
1464	ISC_LINK_INIT(ev, ev_link);
1465	ev->region.base = NULL;
1466	ev->n = 0;
1467	ev->offset = 0;
1468	ev->attributes = 0;
1469	ev->destroy = ev->ev_destroy;
1470	ev->ev_destroy = destroy_socketevent;
1471	ev->dscp = 0;
1472
1473	return (ev);
1474}
1475
1476#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper: print the layout of a msghdr (name, iov array and
 * control buffer) to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int idx;
	unsigned int niov = (unsigned int)msg->msg_iovlen;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (idx = 0; idx < niov; idx++) {
		struct iovec *v = &msg->msg_iov[idx];

		printf("\t\t%u\tbase %p, len %ld\n", idx, v->iov_base,
		       (long)v->iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
1492#endif /* if defined(ISC_SOCKET_DEBUG) */
1493
#define DOIO_SUCCESS 0 /* i/o ok, event sent */
#define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
#define DOIO_HARD    2 /* i/o error, event sent */
#define DOIO_EOF     3 /* EOF, no event sent */

/*
 * Attempt a single recvmsg() on 'sock' into the buffer described by
 * 'dev'.  Returns one of the DOIO_* codes above; on DOIO_SUCCESS and
 * DOIO_HARD the outcome is also recorded in dev->result.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later library calls may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * SOFT_OR_HARD: hard error only on connected sockets (where the error
 * is definitely ours); otherwise retryable.  ALWAYS_HARD: always fatal
 * for this request.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes are introduced in RFC 4443 from March 2006,
		 * and the document obsoletes RFC 1885. But unfortunately not
		 * all operating systems have caught up with the new standard
		 * (in 2020) and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything else: map errno to an ISC result and fail hard. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	case isc_sockettype_fdwatch:
	default:
		UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		/* Record the actual source-address length of the packet. */
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp)
		{
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1661
1662/*
1663 * Returns:
1664 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1665 *			ISC_R_SUCCESS.
1666 *
1667 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1668 *			dev->result contains the appropriate error.
1669 *
1670 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1671 *			event was sent.  The operation should be retried.
1672 *
1673 *	No other return values are possible.
1674 */
1675static int
1676doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1677	int cc;
1678	struct iovec iov[MAXSCATTERGATHER_SEND];
1679	size_t write_count;
1680	struct msghdr msghdr;
1681	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1682	int attempts = 0;
1683	int send_errno;
1684	char strbuf[ISC_STRERRORSIZE];
1685	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
1686
1687	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1688
1689resend:
1690	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
1691	    write_count > sock->manager->maxudp)
1692	{
1693		cc = write_count;
1694	} else {
1695		cc = sendmsg(sock->fd, &msghdr, 0);
1696	}
1697	send_errno = errno;
1698
1699	/*
1700	 * Check for error or block condition.
1701	 */
1702	if (cc < 0) {
1703		if (send_errno == EINTR && ++attempts < NRETRIES) {
1704			goto resend;
1705		}
1706
1707		if (SOFT_ERROR(send_errno)) {
1708			if (errno == EWOULDBLOCK || errno == EAGAIN) {
1709				dev->result = ISC_R_WOULDBLOCK;
1710			}
1711			return (DOIO_SOFT);
1712		}
1713
1714#define SOFT_OR_HARD(_system, _isc)                                   \
1715	if (send_errno == _system) {                                  \
1716		if (sock->connected) {                                \
1717			dev->result = _isc;                           \
1718			inc_stats(sock->manager->stats,               \
1719				  sock->statsindex[STATID_SENDFAIL]); \
1720			return (DOIO_HARD);                           \
1721		}                                                     \
1722		return (DOIO_SOFT);                                   \
1723	}
1724#define ALWAYS_HARD(_system, _isc)                            \
1725	if (send_errno == _system) {                          \
1726		dev->result = _isc;                           \
1727		inc_stats(sock->manager->stats,               \
1728			  sock->statsindex[STATID_SENDFAIL]); \
1729		return (DOIO_HARD);                           \
1730	}
1731
1732		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1733		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1734		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1735		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1736		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1737#ifdef EHOSTDOWN
1738		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1739#endif /* ifdef EHOSTDOWN */
1740		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1741		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
1742		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1743		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1744		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1745
1746#undef SOFT_OR_HARD
1747#undef ALWAYS_HARD
1748
1749		/*
1750		 * The other error types depend on whether or not the
1751		 * socket is UDP or TCP.  If it is UDP, some errors
1752		 * that we expect to be fatal under TCP are merely
1753		 * annoying, and are really soft errors.
1754		 *
1755		 * However, these soft errors are still returned as
1756		 * a status.
1757		 */
1758		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1759		strerror_r(send_errno, strbuf, sizeof(strbuf));
1760		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1761				 addrbuf, strbuf);
1762		dev->result = isc__errno2result(send_errno);
1763		inc_stats(sock->manager->stats,
1764			  sock->statsindex[STATID_SENDFAIL]);
1765		return (DOIO_HARD);
1766	}
1767
1768	if (cc == 0) {
1769		inc_stats(sock->manager->stats,
1770			  sock->statsindex[STATID_SENDFAIL]);
1771		UNEXPECTED_ERROR(__FILE__, __LINE__,
1772				 "doio_send: send() returned 0");
1773	}
1774
1775	/*
1776	 * If we write less than we expected, update counters, poke.
1777	 */
1778	dev->n += cc;
1779	if ((size_t)cc != write_count) {
1780		return (DOIO_SOFT);
1781	}
1782
1783	/*
1784	 * Exactly what we wanted to write.  We're done with this
1785	 * entry.  Post its completion event.
1786	 */
1787	dev->result = ISC_R_SUCCESS;
1788	return (DOIO_SUCCESS);
1789}
1790
1791/*
1792 * Kill.
1793 *
1794 * Caller must ensure that the socket is not locked and no external
1795 * references exist.
1796 */
1797static void
1798socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
1799	int lockid = FDLOCK_ID(fd);
1800	/*
1801	 * No one has this socket open, so the watcher doesn't have to be
1802	 * poked, and the socket doesn't have to be locked.
1803	 */
1804	LOCK(&thread->fdlock[lockid]);
1805	thread->fds[fd] = NULL;
1806	if (sock->type == isc_sockettype_fdwatch)
1807		thread->fdstate[fd] = CLOSED;
1808	else
1809		thread->fdstate[fd] = CLOSE_PENDING;
1810	UNLOCK(&thread->fdlock[lockid]);
1811	if (sock->type == isc_sockettype_fdwatch) {
1812		/*
1813		 * The caller may close the socket once this function returns,
1814		 * and `fd' may be reassigned for a new socket.  So we do
1815		 * unwatch_fd() here, rather than defer it via select_poke().
1816		 * Note: this may complicate data protection among threads and
1817		 * may reduce performance due to additional locks.  One way to
1818		 * solve this would be to dup() the watched descriptor, but we
1819		 * take a simpler approach at this moment.
1820		 */
1821		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
1822		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
1823	} else
1824		select_poke(thread->manager, thread->threadid, fd,
1825		    SELECT_POKE_CLOSE);
1826
1827	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);
1828
1829	LOCK(&sock->lock);
1830	if (sock->active == 1) {
1831		dec_stats(thread->manager->stats,
1832			  sock->statsindex[STATID_ACTIVE]);
1833		sock->active = 0;
1834	}
1835	UNLOCK(&sock->lock);
1836
1837	/*
1838	 * update manager->maxfd here (XXX: this should be implemented more
1839	 * efficiently)
1840	 */
1841#ifdef USE_SELECT
1842	LOCK(&thread->manager->lock);
1843	if (thread->maxfd == fd) {
1844		int i;
1845
1846		thread->maxfd = 0;
1847		for (i = fd - 1; i >= 0; i--) {
1848			lockid = FDLOCK_ID(i);
1849
1850			LOCK(&thread->fdlock[lockid]);
1851			if (thread->fdstate[i] == MANAGED) {
1852				thread->maxfd = i;
1853				UNLOCK(&thread->fdlock[lockid]);
1854				break;
1855			}
1856			UNLOCK(&thread->fdlock[lockid]);
1857		}
1858		if (thread->maxfd < thread->pipe_fds[0]) {
1859			thread->maxfd = thread->pipe_fds[0];
1860		}
1861	}
1862
1863	UNLOCK(&thread->manager->lock);
1864#endif /* USE_SELECT */
1865}
1866
1867static void
1868destroy(isc_socket_t **sockp) {
1869	int fd = 0;
1870	isc_socket_t *sock = *sockp;
1871	isc_socketmgr_t *manager = sock->manager;
1872	isc__socketthread_t *thread = NULL;
1873
1874	socket_log(sock, NULL, CREATION, "destroying");
1875
1876	isc_refcount_destroy(&sock->references);
1877
1878	LOCK(&sock->lock);
1879	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1880	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1881	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1882	INSIST(ISC_LIST_EMPTY(sock->send_list));
1883	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1884
1885	if (sock->fd >= 0) {
1886		fd = sock->fd;
1887		thread = &manager->threads[sock->threadid];
1888		sock->fd = -1;
1889		sock->threadid = -1;
1890	}
1891	UNLOCK(&sock->lock);
1892
1893	if (fd > 0) {
1894		socketclose(thread, sock, fd);
1895	}
1896
1897	LOCK(&manager->lock);
1898
1899	ISC_LIST_UNLINK(manager->socklist, sock, link);
1900
1901	if (ISC_LIST_EMPTY(manager->socklist)) {
1902		SIGNAL(&manager->shutdown_ok);
1903	}
1904
1905	/* can't unlock manager as its memory context is still used */
1906	free_socket(sockp);
1907
1908	UNLOCK(&manager->lock);
1909}
1910
1911static isc_result_t
1912allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1913		isc_socket_t **socketp) {
1914	isc_socket_t *sock;
1915
1916	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1917
1918	sock->magic = 0;
1919	isc_refcount_init(&sock->references, 0);
1920
1921	sock->manager = manager;
1922	sock->type = type;
1923	sock->fd = -1;
1924	sock->threadid = -1;
1925	sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1926	sock->dupped = 0;
1927	sock->statsindex = NULL;
1928	sock->active = 0;
1929
1930	ISC_LINK_INIT(sock, link);
1931
1932	memset(sock->name, 0, sizeof(sock->name));
1933	sock->tag = NULL;
1934
1935	/*
1936	 * Set up list of readers and writers to be initially empty.
1937	 */
1938	ISC_LIST_INIT(sock->recv_list);
1939	ISC_LIST_INIT(sock->send_list);
1940	ISC_LIST_INIT(sock->accept_list);
1941	ISC_LIST_INIT(sock->connect_list);
1942
1943	sock->listener = 0;
1944	sock->connected = 0;
1945	sock->connecting = 0;
1946	sock->bound = 0;
1947	sock->pktdscp = 0;
1948
1949	/*
1950	 * Initialize the lock.
1951	 */
1952	isc_mutex_init(&sock->lock);
1953
1954	sock->magic = SOCKET_MAGIC;
1955	*socketp = sock;
1956
1957	return (ISC_R_SUCCESS);
1958}
1959
1960/*
1961 * This event requires that the various lists be empty, that the reference
1962 * count be 1, and that the magic number is valid.  The other socket bits,
1963 * like the lock, must be initialized as well.  The fd associated must be
1964 * marked as closed, by setting it to -1 on close, or this routine will
1965 * also close the socket.
1966 */
1967static void
1968free_socket(isc_socket_t **socketp) {
1969	isc_socket_t *sock = *socketp;
1970	*socketp = NULL;
1971
1972	INSIST(VALID_SOCKET(sock));
1973	isc_refcount_destroy(&sock->references);
1974	LOCK(&sock->lock);
1975	INSIST(!sock->connecting);
1976	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1977	INSIST(ISC_LIST_EMPTY(sock->send_list));
1978	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1979	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1980	INSIST(!ISC_LINK_LINKED(sock, link));
1981	UNLOCK(&sock->lock);
1982
1983	sock->magic = 0;
1984
1985	isc_mutex_destroy(&sock->lock);
1986
1987	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1988}
1989
1990#if defined(SET_RCVBUF)
static isc_once_t rcvbuf_once = ISC_ONCE_INIT; /* run set_rcvbuf() once */
static int rcvbuf = ISC_RECV_BUFFER_SIZE;      /* desired SO_RCVBUF size */

/*
 * Probe, on a throwaway UDP socket, the largest SO_RCVBUF value the
 * system will accept up to ISC_RECV_BUFFER_SIZE, and leave the result
 * in 'rcvbuf'.  On ENOBUFS the value is binary-searched between the
 * system default and the target.
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support: probe with an IPv6 socket. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the system's current default buffer size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: search lower half. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: raise the lower bound. */
			min = rcvbuf;
		}
		if (min != max) {
			/* Keep searching until the bounds meet. */
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_RCVBUF) */
2047
2048#if defined(SET_SNDBUF)
static isc_once_t sndbuf_once = ISC_ONCE_INIT; /* run set_sndbuf() once */
static int sndbuf = ISC_SEND_BUFFER_SIZE;      /* desired SO_SNDBUF size */

/*
 * Probe, on a throwaway UDP socket, the largest SO_SNDBUF value the
 * system will accept up to ISC_SEND_BUFFER_SIZE, and leave the result
 * in 'sndbuf'.  On ENOBUFS the value is binary-searched between the
 * system default and the target.
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support: probe with an IPv6 socket. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the system's current default buffer size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: search lower half. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				sndbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: raise the lower bound. */
			min = sndbuf;
		}
		if (min != max) {
			/* Keep searching until the bounds meet. */
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
2104#endif /* ifdef SO_SNDBUF */
2105
/*
 * Request minimum-MTU (1280 octet) behavior on an IPv6 socket via
 * whichever of IPV6_USE_MIN_MTU / IPV6_MTU the platform provides.
 * No effect on non-IPv6 sockets; setsockopt() errors are deliberately
 * ignored (best effort).
 */
static void
use_min_mtu(isc_socket_t *sock) {
#if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
	UNUSED(sock);
#endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
#ifdef IPV6_USE_MIN_MTU
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
#endif /* ifdef IPV6_USE_MIN_MTU */
#if defined(IPV6_MTU)
	/*
	 * Use minimum MTU on IPv6 sockets.
	 */
	if (sock->pf == AF_INET6) {
		int mtu = 1280; /* IPv6 guaranteed minimum link MTU */
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
				 sizeof(mtu));
	}
#endif /* if defined(IPV6_MTU) */
}
2130
2131static void
2132set_tcp_maxseg(isc_socket_t *sock, int size) {
2133#ifdef TCP_MAXSEG
2134	if (sock->type == isc_sockettype_tcp) {
2135		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2136				 (void *)&size, sizeof(size));
2137	}
2138#endif /* ifdef TCP_MAXSEG */
2139}
2140
/*
 * Disable Path MTU Discovery on 'sock': clear the don't-fragment
 * flag where available, and where PMTUDISC_OMIT exists tell the
 * kernel to ignore cached path-MTU information as well.  All
 * setsockopt() failures are deliberately ignored (best effort).
 */
static void
set_ip_disable_pmtud(isc_socket_t *sock) {
	/*
	 * Disable Path MTU Discover on IP packets
	 */
	if (sock->pf == AF_INET6) {
#if defined(IPV6_DONTFRAG)
		/* Permit fragmentation of outgoing IPv6 packets. */
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
				 &(int){ 0 }, sizeof(int));
#endif
#if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	} else if (sock->pf == AF_INET) {
#if defined(IP_DONTFRAG)
		/* Permit fragmentation of outgoing IPv4 packets. */
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
				 sizeof(int));
#endif
#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	}
}
2166
/*
 * Open the underlying file descriptor for 'sock' -- or dup(2) the one
 * in 'dup_socket' if that is non-NULL -- and apply the standard
 * per-socket options: non-blocking mode, SIGPIPE suppression, minimum
 * IPv6 MTU and MSS clamping for TCP, receive/send buffer sizing and
 * ancillary-data options for UDP, and disabling of path-MTU discovery.
 * Updates the manager's open/active statistics.  On failure, returns a
 * result code describing the error and leaves no descriptor open.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock,
	   isc_socket_t *dup_socket) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket"; /* which call to blame in log messages */
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_raw:
			errno = EPFNOSUPPORT;
			/*
			 * PF_ROUTE is a alias for PF_NETLINK on linux.
			 */
#if defined(PF_ROUTE)
			if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
				sock->fd = socket(sock->pf, SOCK_RAW,
						  NETLINK_ROUTE);
#else  /* ifdef NETLINK_ROUTE */
				sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
				if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
					struct sockaddr_nl sa;
					int n;

					/*
					 * Do an implicit bind.
					 */
					memset(&sa, 0, sizeof(sa));
					sa.nl_family = AF_NETLINK;
					sa.nl_groups = RTMGRP_IPV4_IFADDR |
						       RTMGRP_IPV6_IFADDR;
					n = bind(sock->fd,
						 (struct sockaddr *)&sa,
						 sizeof(sa));
					if (n < 0) {
						close(sock->fd);
						sock->fd = -1;
					}
#endif /* ifdef NETLINK_ROUTE */
					sock->bound = 1;
				}
			}
#endif /* if defined(PF_ROUTE) */
			break;
		case isc_sockettype_fdwatch:
			/*
			 * We should not be called for isc_sockettype_fdwatch
			 * sockets.
			 */
			INSIST(0);
			break;
		}
	} else {
		/* Duplicating: inherit the descriptor and bound state. */
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno; /* preserve fcntl()'s errno across close() */
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	/* The manager's fd tables are sized to maxsocks; refuse bigger fds. */
	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	/* Map the creation failure, if any, to an isc_result_t. */
	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
			FALLTHROUGH;
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* A dup()ed descriptor inherits options from the original socket. */
	if (dup_socket != NULL) {
		goto setup_done;
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* Ask for packet arrival timestamps as ancillary data. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else  /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* defined(USE_CMSG) */

#if defined(SET_RCVBUF)
		/* Grow the receive buffer to the probed maximum (set_rcvbuf). */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		/* Grow the send buffer to the probed maximum (set_sndbuf). */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	/* Ask for the IPv6 traffic class as ancillary data. */
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	/* Ask for the IPv4 TOS byte as ancillary data. */
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

	set_ip_disable_pmtud(sock);

setup_done:
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}
2472
2473/*
2474 * Create a 'type' socket or duplicate an existing socket, managed
2475 * by 'manager'.  Events will be posted to 'task' and when dispatched
2476 * 'action' will be called with 'arg' as the arg value.  The new
2477 * socket is returned in 'socketp'.
2478 */
2479static isc_result_t
2480socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2481	      isc_socket_t **socketp, isc_socket_t *dup_socket) {
2482	isc_socket_t *sock = NULL;
2483	isc__socketthread_t *thread;
2484	isc_result_t result;
2485	int lockid;
2486
2487	REQUIRE(VALID_MANAGER(manager));
2488	REQUIRE(socketp != NULL && *socketp == NULL);
2489	REQUIRE(type != isc_sockettype_fdwatch);
2490
2491	result = allocate_socket(manager, type, &sock);
2492	if (result != ISC_R_SUCCESS) {
2493		return (result);
2494	}
2495
2496	switch (sock->type) {
2497	case isc_sockettype_udp:
2498		sock->statsindex = (pf == AF_INET) ? udp4statsindex
2499						   : udp6statsindex;
2500#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
2501		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
2502		break;
2503	case isc_sockettype_tcp:
2504		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
2505						   : tcp6statsindex;
2506		break;
2507	case isc_sockettype_unix:
2508		sock->statsindex = unixstatsindex;
2509		break;
2510	case isc_sockettype_raw:
2511		sock->statsindex = rawstatsindex;
2512		break;
2513	default:
2514		UNREACHABLE();
2515	}
2516
2517	sock->pf = pf;
2518
2519	result = opensocket(manager, sock, dup_socket);
2520	if (result != ISC_R_SUCCESS) {
2521		free_socket(&sock);
2522		return (result);
2523	}
2524
2525	if (sock->fd == -1) {
2526		abort();
2527	}
2528	sock->threadid = gen_threadid(sock);
2529	isc_refcount_increment0(&sock->references);
2530	thread = &manager->threads[sock->threadid];
2531	*socketp = sock;
2532
2533	/*
2534	 * Note we don't have to lock the socket like we normally would because
2535	 * there are no external references to it yet.
2536	 */
2537
2538	lockid = FDLOCK_ID(sock->fd);
2539	LOCK(&thread->fdlock[lockid]);
2540	thread->fds[sock->fd] = sock;
2541	thread->fdstate[sock->fd] = MANAGED;
2542#if defined(USE_EPOLL)
2543	thread->epoll_events[sock->fd] = 0;
2544#endif /* if defined(USE_EPOLL) */
2545#ifdef USE_DEVPOLL
2546	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
2547	       thread->fdpollinfo[sock->fd].want_write == 0);
2548#endif /* ifdef USE_DEVPOLL */
2549	UNLOCK(&thread->fdlock[lockid]);
2550
2551	LOCK(&manager->lock);
2552	ISC_LIST_APPEND(manager->socklist, sock, link);
2553#ifdef USE_SELECT
2554	if (thread->maxfd < sock->fd) {
2555		thread->maxfd = sock->fd;
2556	}
2557#endif /* ifdef USE_SELECT */
2558	UNLOCK(&manager->lock);
2559
2560	socket_log(sock, NULL, CREATION,
2561		   dup_socket != NULL ? "dupped" : "created");
2562
2563	return (ISC_R_SUCCESS);
2564}
2565
2566/*%
2567 * Create a new 'type' socket managed by 'manager'.  Events
2568 * will be posted to 'task' and when dispatched 'action' will be
2569 * called with 'arg' as the arg value.  The new socket is returned
2570 * in 'socketp'.
2571 */
2572isc_result_t
2573isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2574		  isc_socket_t **socketp) {
2575	return (socket_create(manager0, pf, type, socketp, NULL));
2576}
2577
2578/*%
2579 * Duplicate an existing socket.  The new socket is returned
2580 * in 'socketp'.
2581 */
2582isc_result_t
2583isc_socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
2584	REQUIRE(VALID_SOCKET(sock));
2585	REQUIRE(socketp != NULL && *socketp == NULL);
2586
2587	return (socket_create(sock->manager, sock->pf, sock->type, socketp,
2588			      sock));
2589}
2590
/*
 * (Re)open a previously closed socket: allocate a fresh descriptor via
 * opensocket() and register it with a watcher thread, mirroring the
 * registration done in socket_create().  The socket must currently
 * have no descriptor (see isc_socket_close()).
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
	isc_result_t result;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(isc_refcount_current(&sock->references) >= 1);
	REQUIRE(sock->fd == -1);
	REQUIRE(sock->threadid == -1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);

	result = opensocket(sock->manager, sock, NULL);

	UNLOCK(&sock->lock);

	/*
	 * NOTE(review): sock->fd and sock->threadid are updated below
	 * after the socket lock has been released; presumably the caller
	 * guarantees no other thread touches the socket between close and
	 * open -- confirm.
	 */
	if (result != ISC_R_SUCCESS) {
		sock->fd = -1;
	} else {
		sock->threadid = gen_threadid(sock);
		thread = &sock->manager->threads[sock->threadid];
		int lockid = FDLOCK_ID(sock->fd);

		LOCK(&thread->fdlock[lockid]);
		thread->fds[sock->fd] = sock;
		thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
		thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
		       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
		UNLOCK(&thread->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (thread->maxfd < sock->fd) {
			thread->maxfd = sock->fd;
		}
		UNLOCK(&sock->manager->lock);
#endif /* ifdef USE_SELECT */
	}

	return (result);
}
2639
2640/*
2641 * Attach to a socket.  Caller must explicitly detach when it is done.
2642 */
2643void
2644isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2645	REQUIRE(VALID_SOCKET(sock));
2646	REQUIRE(socketp != NULL && *socketp == NULL);
2647
2648	int old_refs = isc_refcount_increment(&sock->references);
2649	REQUIRE(old_refs > 0);
2650
2651	*socketp = sock;
2652}
2653
2654/*
2655 * Dereference a socket.  If this is the last reference to it, clean things
2656 * up by destroying the socket.
2657 */
2658void
2659isc_socket_detach(isc_socket_t **socketp) {
2660	isc_socket_t *sock;
2661
2662	REQUIRE(socketp != NULL);
2663	sock = *socketp;
2664	REQUIRE(VALID_SOCKET(sock));
2665	if (isc_refcount_decrement(&sock->references) == 1) {
2666		destroy(&sock);
2667	}
2668
2669	*socketp = NULL;
2670}
2671
2672isc_result_t
2673isc_socket_close(isc_socket_t *sock) {
2674	int fd;
2675	isc_socketmgr_t *manager;
2676	isc__socketthread_t *thread;
2677	fflush(stdout);
2678	REQUIRE(VALID_SOCKET(sock));
2679
2680	LOCK(&sock->lock);
2681
2682	REQUIRE(sock->type != isc_sockettype_fdwatch);
2683	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2684
2685	INSIST(!sock->connecting);
2686	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2687	INSIST(ISC_LIST_EMPTY(sock->send_list));
2688	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2689	INSIST(ISC_LIST_EMPTY(sock->connect_list));
2690
2691	manager = sock->manager;
2692	thread = &manager->threads[sock->threadid];
2693	fd = sock->fd;
2694	sock->fd = -1;
2695	sock->threadid = -1;
2696
2697	sock->dupped = 0;
2698	memset(sock->name, 0, sizeof(sock->name));
2699	sock->tag = NULL;
2700	sock->listener = 0;
2701	sock->connected = 0;
2702	sock->connecting = 0;
2703	sock->bound = 0;
2704	isc_sockaddr_any(&sock->peer_address);
2705
2706	UNLOCK(&sock->lock);
2707
2708	socketclose(thread, sock, fd);
2709
2710	return (ISC_R_SUCCESS);
2711}
2712
2713static void
2714dispatch_recv(isc_socket_t *sock) {
2715	if (sock->type != isc_sockettype_fdwatch) {
2716		internal_recv(sock);
2717	} else {
2718		internal_fdwatch_read(sock);
2719	}
2720}
2721
2722static void
2723dispatch_send(isc_socket_t *sock) {
2724	if (sock->type != isc_sockettype_fdwatch) {
2725		internal_send(sock);
2726	} else {
2727		internal_fdwatch_write(sock);
2728	}
2729}
2730
2731/*
2732 * Dequeue an item off the given socket's read queue, set the result code
2733 * in the done event to the one provided, and send it to the task it was
2734 * destined for.
2735 *
2736 * If the event to be sent is on a list, remove it before sending.  If
2737 * asked to, send and detach from the socket as well.
2738 *
2739 * Caller must have the socket locked if the event is attached to the socket.
2740 */
2741static void
2742send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2743	isc_task_t *task;
2744
2745	task = (*dev)->ev_sender;
2746
2747	(*dev)->ev_sender = sock;
2748
2749	if (ISC_LINK_LINKED(*dev, ev_link)) {
2750		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2751	}
2752
2753	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2754		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2755					 sock->threadid);
2756	} else {
2757		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2758	}
2759}
2760
2761/*
2762 * See comments for send_recvdone_event() above.
2763 *
2764 * Caller must have the socket locked if the event is attached to the socket.
2765 */
2766static void
2767send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2768	isc_task_t *task;
2769
2770	INSIST(dev != NULL && *dev != NULL);
2771
2772	task = (*dev)->ev_sender;
2773	(*dev)->ev_sender = sock;
2774
2775	if (ISC_LINK_LINKED(*dev, ev_link)) {
2776		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2777	}
2778
2779	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2780		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2781					 sock->threadid);
2782	} else {
2783		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2784	}
2785}
2786
2787/*
2788 * See comments for send_recvdone_event() above.
2789 *
2790 * Caller must have the socket locked if the event is attached to the socket.
2791 */
2792static void
2793send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
2794	isc_task_t *task;
2795
2796	INSIST(dev != NULL && *dev != NULL);
2797
2798	task = (*dev)->ev_sender;
2799	(*dev)->ev_sender = sock;
2800
2801	if (ISC_LINK_LINKED(*dev, ev_link)) {
2802		ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2803	}
2804
2805	isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2806}
2807
2808/*
2809 * Call accept() on a socket, to get the new file descriptor.  The listen
2810 * socket is used as a prototype to create a new isc_socket_t.  The new
2811 * socket has one outstanding reference.  The task receiving the event
2812 * will be detached from just after the event is delivered.
2813 *
2814 * On entry to this function, the event delivered is the internal
2815 * readable event, and the first item on the accept_list should be
2816 * the done event we want to send.  If the list is empty, this is a no-op,
2817 * so just unlock and return.
2818 */
static void
internal_accept(isc_socket_t *sock) {
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread, *nthread;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	socklen_t addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "accept"; /* which call to blame in log messages */

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));
	thread = &manager->threads[sock->threadid];

	INSIST(sock->listener);

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection.  If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again.  Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept().  ECONNABORTED has been
	 * reported for Solaris 8.  The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in.
	 */
	if (fd >= 0 && fd < 20) {
		int newfd, tmp;
		newfd = fcntl(fd, F_DUPFD, 20);
		tmp = errno; /* preserve fcntl()'s errno across close() */
		(void)close(fd);
		errno = tmp;
		fd = newfd;
		err = "accept/fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (fd < 0) {
		if (SOFT_ERROR(errno)) {
			goto soft_error;
		}
		switch (errno) {
		case ENFILE:
		case EMFILE:
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: too many open file descriptors",
				      err);
			goto soft_error;

		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif /* ifdef EPROTO */
#ifdef ENONET
		case ENONET:
#endif /* ifdef ENONET */
			goto soft_error;
		default:
			break;
		}
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() failed: %s", err,
				 strbuf);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/* Sanity-check what accept() returned before keeping the fd. */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
			   sock->pf)
		{
			UNEXPECTED_ERROR(
				__FILE__, __LINE__,
				"internal_accept(): "
				"accept() returned peer address "
				"family %u (expected %u)",
				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
				sock->pf);
			(void)close(fd);
			goto soft_error;
		} else if (fd >= (int)manager->maxsocks) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "accept: file descriptor exceeds limit "
				      "(%d/%u)",
				      fd, manager->maxsocks);
			(void)close(fd);
			goto soft_error;
		}
	}

	if (fd != -1) {
		NEWCONNSOCK(dev)->peer_address.length = addrlen;
		NEWCONNSOCK(dev)->pf = sock->pf;
	}

	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

	/*
	 * Poke watcher if there are more pending accepts.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list)) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	}

	if (fd != -1) {
		result = make_nonblock(fd);
		if (result != ISC_R_SUCCESS) {
			(void)close(fd);
			fd = -1;
		}
	}

	/*
	 * We need to unlock sock->lock now to be able to lock manager->lock
	 * without risking a deadlock with xmlstats.
	 */
	UNLOCK(&sock->lock);

	/*
	 * -1 means the new socket didn't happen.
	 */
	if (fd != -1) {
		int lockid = FDLOCK_ID(fd);

		NEWCONNSOCK(dev)->fd = fd;
		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
		NEWCONNSOCK(dev)->bound = 1;
		NEWCONNSOCK(dev)->connected = 1;
		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];

		/*
		 * We already hold a lock on one fdlock in accepting thread,
		 * we need to make sure that we don't double lock.
		 */
		bool same_bucket = (sock->threadid ==
				    NEWCONNSOCK(dev)->threadid) &&
				   (FDLOCK_ID(sock->fd) == lockid);

		/*
		 * Use minimum mtu if possible.
		 */
		use_min_mtu(NEWCONNSOCK(dev));
		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);

		/*
		 * Ensure DSCP settings are inherited across accept.
		 */
		setdscp(NEWCONNSOCK(dev), sock->dscp);

		/*
		 * Save away the remote address
		 */
		dev->address = NEWCONNSOCK(dev)->peer_address;

		if (NEWCONNSOCK(dev)->active == 0) {
			inc_stats(manager->stats,
				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
			NEWCONNSOCK(dev)->active = 1;
		}

		/* Register the new fd with its watcher thread's tables. */
		if (!same_bucket) {
			LOCK(&nthread->fdlock[lockid]);
		}
		nthread->fds[fd] = NEWCONNSOCK(dev);
		nthread->fdstate[fd] = MANAGED;
#if defined(USE_EPOLL)
		nthread->epoll_events[fd] = 0;
#endif /* if defined(USE_EPOLL) */
		if (!same_bucket) {
			UNLOCK(&nthread->fdlock[lockid]);
		}

		LOCK(&manager->lock);

#ifdef USE_SELECT
		if (nthread->maxfd < fd) {
			nthread->maxfd = fd;
		}
#endif /* ifdef USE_SELECT */

		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
			   "accepted connection, new socket %p",
			   dev->newsocket);

		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);

		UNLOCK(&manager->lock);

		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
	} else {
		/* Accept failed: release the pre-allocated child socket. */
		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
		isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
		free_socket((isc_socket_t **)&dev->newsocket);
	}

	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
	return;

soft_error:
	/* Transient failure: keep watching the listener and try again. */
	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}
3079
/*
 * Complete as many queued receive requests as the kernel has data for.
 * Called from the watcher thread with the socket locked when the fd
 * becomes readable; stops watching the fd once the receive queue is
 * empty.
 */
static void
internal_recv(isc_socket_t *sock) {
	isc_socketevent_t *dev;

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	dev = ISC_LIST_HEAD(sock->recv_list);
	if (dev == NULL) {
		goto finish;
	}

	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
		   dev, dev->ev_sender);

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			/* Would block: wait for the next readable event. */
			goto finish;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto finish;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

finish:
	if (ISC_LIST_EMPTY(sock->recv_list)) {
		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
			   SELECT_POKE_READ);
	}
}
3132
3133static void
3134internal_send(isc_socket_t *sock) {
3135	isc_socketevent_t *dev;
3136
3137	INSIST(VALID_SOCKET(sock));
3138	REQUIRE(sock->fd >= 0);
3139
3140	dev = ISC_LIST_HEAD(sock->send_list);
3141	if (dev == NULL) {
3142		goto finish;
3143	}
3144	socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
3145		   dev->ev_sender);
3146
3147	/*
3148	 * Try to do as much I/O as possible on this socket.  There are no
3149	 * limits here, currently.
3150	 */
3151	while (dev != NULL) {
3152		switch (doio_send(sock, dev)) {
3153		case DOIO_SOFT:
3154			goto finish;
3155
3156		case DOIO_HARD:
3157		case DOIO_SUCCESS:
3158			send_senddone_event(sock, &dev);
3159			break;
3160		}
3161
3162		dev = ISC_LIST_HEAD(sock->send_list);
3163	}
3164
3165finish:
3166	if (ISC_LIST_EMPTY(sock->send_list)) {
3167		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3168			   SELECT_POKE_WRITE);
3169	}
3170}
3171
3172static void
3173internal_fdwatch_write(isc_socket_t *sock)
3174{
3175	int more_data;
3176
3177	INSIST(VALID_SOCKET(sock));
3178
3179	isc_refcount_increment(&sock->references);
3180	UNLOCK(&sock->lock);
3181
3182	more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
3183				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
3184
3185	LOCK(&sock->lock);
3186
3187	if (isc_refcount_decrement(&sock->references) == 0) {
3188		UNLOCK(&sock->lock);
3189		destroy(&sock);
3190		return;
3191	}
3192
3193	if (more_data)
3194		select_poke(sock->manager, sock->threadid, sock->fd,
3195		    SELECT_POKE_WRITE);
3196}
3197
3198static void
3199internal_fdwatch_read(isc_socket_t *sock)
3200{
3201	int more_data;
3202
3203	INSIST(VALID_SOCKET(sock));
3204
3205	isc_refcount_increment(&sock->references);
3206	UNLOCK(&sock->lock);
3207
3208	more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
3209				      sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
3210
3211	LOCK(&sock->lock);
3212
3213	if (isc_refcount_decrement(&sock->references) == 0) {
3214		UNLOCK(&sock->lock);
3215		destroy(&sock);
3216		return;
3217	}
3218
3219	if (more_data)
3220		select_poke(sock->manager, sock->threadid, sock->fd,
3221		    SELECT_POKE_READ);
3222}
3223
3224/*
3225 * Process read/writes on each fd here.  Avoid locking
3226 * and unlocking twice if both reads and writes are possible.
3227 */
3228static void
3229process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
3230	isc_socket_t *sock;
3231	int lockid = FDLOCK_ID(fd);
3232
3233	/*
3234	 * If the socket is going to be closed, don't do more I/O.
3235	 */
3236	LOCK(&thread->fdlock[lockid]);
3237	if (thread->fdstate[fd] == CLOSE_PENDING) {
3238		UNLOCK(&thread->fdlock[lockid]);
3239
3240		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
3241		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
3242		return;
3243	}
3244
3245	sock = thread->fds[fd];
3246	if (sock == NULL) {
3247		UNLOCK(&thread->fdlock[lockid]);
3248		return;
3249	}
3250
3251	LOCK(&sock->lock);
3252
3253	if (sock->fd < 0) {
3254		/*
3255		 * Sock is being closed - the final external reference
3256		 * is gone but it was not yet removed from event loop
3257		 * and fdstate[]/fds[] as destroy() is waiting on
3258		 * thread->fdlock[lockid] or sock->lock that we're holding.
3259		 * Just release the locks and bail.
3260		 */
3261		UNLOCK(&sock->lock);
3262		UNLOCK(&thread->fdlock[lockid]);
3263		return;
3264	}
3265
3266	REQUIRE(readable || writeable);
3267	if (writeable) {
3268		if (sock->connecting) {
3269			internal_connect(sock);
3270		} else {
3271			dispatch_send(sock);
3272		}
3273	}
3274
3275	if (readable) {
3276		if (sock->listener) {
3277			internal_accept(sock); /* unlocks sock */
3278		} else {
3279			dispatch_recv(sock);
3280			UNLOCK(&sock->lock);
3281		}
3282	} else {
3283		UNLOCK(&sock->lock);
3284	}
3285
3286	UNLOCK(&thread->fdlock[lockid]);
3287
3288	/*
3289	 * Socket destruction might be pending, it will resume
3290	 * after releasing fdlock and sock->lock.
3291	 */
3292}
3293
3294/*
3295 * process_fds is different for different event loops
3296 * it takes the events from event loops and for each FD
3297 * launches process_fd
3298 */
3299#ifdef USE_KQUEUE
/*
 * kqueue backend: translate kevent results into process_fd() calls.
 * Returns true if a shutdown message was seen on the control pipe.
 */
static bool
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
	int i;
	bool readable, writable;
	bool done = false;
	bool have_ctlevent = false;
	if (nevents == thread->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < thread->manager->maxsocks);
		/*
		 * Defer control-pipe processing until all socket fds have
		 * been handled (see have_ctlevent below).
		 */
		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		/* kqueue reports read and write as separate filter events. */
		readable = (events[i].filter == EVFILT_READ);
		writable = (events[i].filter == EVFILT_WRITE);
		process_fd(thread, events[i].ident, readable, writable);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
3335#elif defined(USE_EPOLL)
/*
 * epoll backend: translate epoll_wait() results into process_fd() calls.
 * Returns true if a shutdown message was seen on the control pipe.
 */
static bool
process_fds(isc__socketthread_t *thread, struct epoll_event *events,
	    int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	/* Full event buffer may mean ISC_SOCKET_MAXEVENTS is too small. */
	if (nevents == thread->nevents) {
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
		/* Control pipe is handled after all socket fds. */
		if (events[i].data.fd == thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0)
		{
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			int fd = events[i].data.fd;
			/* Merge in the interest bits we registered for fd. */
			events[i].events |= thread->epoll_events[fd];
		}
		process_fd(thread, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
3380#elif defined(USE_DEVPOLL)
3381static bool
3382process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
3383	int i;
3384	bool done = false;
3385	bool have_ctlevent = false;
3386
3387	if (nevents == thread->nevents) {
3388		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3389			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3390			   "maximum number of FD events (%d) received",
3391			   nevents);
3392	}
3393
3394	for (i = 0; i < nevents; i++) {
3395		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
3396		if (events[i].fd == thread->pipe_fds[0]) {
3397			have_ctlevent = true;
3398			continue;
3399		}
3400		process_fd(thread, events[i].fd,
3401			   (events[i].events & POLLIN) != 0,
3402			   (events[i].events & POLLOUT) != 0);
3403	}
3404
3405	if (have_ctlevent) {
3406		done = process_ctlfd(thread);
3407	}
3408
3409	return (done);
3410}
3411#elif defined(USE_SELECT)
3412static void
3413process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
3414	    fd_set *writefds) {
3415	int i;
3416
3417	REQUIRE(maxfd <= (int)thread->manager->maxsocks);
3418
3419	for (i = 0; i < maxfd; i++) {
3420		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
3421			continue;
3422		}
3423		process_fd(thread, i, FD_ISSET(i, readfds),
3424			   FD_ISSET(i, writefds));
3425	}
3426}
3427#endif /* ifdef USE_KQUEUE */
3428
3429static bool
3430process_ctlfd(isc__socketthread_t *thread) {
3431	int msg, fd;
3432
3433	for (;;) {
3434		select_readmsg(thread, &fd, &msg);
3435
3436		thread_log(thread, IOEVENT,
3437			   "watcher got message %d for socket %d", msg, fd);
3438
3439		/*
3440		 * Nothing to read?
3441		 */
3442		if (msg == SELECT_POKE_NOTHING) {
3443			break;
3444		}
3445
3446		/*
3447		 * Handle shutdown message.  We really should
3448		 * jump out of this loop right away, but
3449		 * it doesn't matter if we have to do a little
3450		 * more work first.
3451		 */
3452		if (msg == SELECT_POKE_SHUTDOWN) {
3453			return (true);
3454		}
3455
3456		/*
3457		 * This is a wakeup on a socket.  Look
3458		 * at the event queue for both read and write,
3459		 * and decide if we need to watch on it now
3460		 * or not.
3461		 */
3462		wakeup_socket(thread, fd, msg);
3463	}
3464
3465	return (false);
3466}
3467
3468/*
3469 * This is the thread that will loop forever, always in a select or poll
3470 * call.
3471 *
3472 * When select returns something to do, do whatever's necessary and post
3473 * an event to the task that was requesting the action.
3474 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	(void)manager; /* only referenced in the USE_SELECT build */
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Block until the backend reports at least one event. */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					/* fallback if the probe fails */
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			/* Two passes: retry once after an EINVAL re-probe. */
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else  /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped.  Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* Soft errors (e.g. EINTR) just retry; others are fatal. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/*
			 * Track whether DP_POLL is timing out unexpectedly,
			 * working around a reported kernel bug.
			 */
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

		/* Dispatch the collected events; 'done' set on shutdown poke. */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3631
3632void
3633isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
3634	REQUIRE(VALID_MANAGER(manager));
3635
3636	manager->reserved = reserved;
3637}
3638
3639void
3640isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
3641	REQUIRE(VALID_MANAGER(manager));
3642
3643	manager->maxudp = maxudp;
3644}
3645
3646/*
3647 * Setup socket thread, thread->manager and thread->threadid must be filled.
3648 */
3649
3650static isc_result_t
3651setup_thread(isc__socketthread_t *thread) {
3652	isc_result_t result = ISC_R_SUCCESS;
3653	int i;
3654	char strbuf[ISC_STRERRORSIZE];
3655
3656	REQUIRE(thread != NULL);
3657	REQUIRE(VALID_MANAGER(thread->manager));
3658	REQUIRE(thread->threadid >= 0 &&
3659		thread->threadid < thread->manager->nthreads);
3660
3661	thread->fds =
3662		isc_mem_get(thread->manager->mctx,
3663			    thread->manager->maxsocks * sizeof(isc_socket_t *));
3664
3665	memset(thread->fds, 0,
3666	       thread->manager->maxsocks * sizeof(isc_socket_t *));
3667
3668	thread->fdstate = isc_mem_get(thread->manager->mctx,
3669				      thread->manager->maxsocks * sizeof(int));
3670
3671	memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3672
3673	thread->fdlock = isc_mem_get(thread->manager->mctx,
3674				     FDLOCK_COUNT * sizeof(isc_mutex_t));
3675
3676	for (i = 0; i < FDLOCK_COUNT; i++) {
3677		isc_mutex_init(&thread->fdlock[i]);
3678	}
3679
3680	if (pipe(thread->pipe_fds) != 0) {
3681		strerror_r(errno, strbuf, sizeof(strbuf));
3682		UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3683				 strbuf);
3684		return (ISC_R_UNEXPECTED);
3685	}
3686	RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3687
3688#ifdef USE_KQUEUE
3689	thread->nevents = ISC_SOCKET_MAXEVENTS;
3690	thread->events = isc_mem_get(thread->manager->mctx,
3691				     sizeof(struct kevent) * thread->nevents);
3692
3693	thread->kqueue_fd = kqueue();
3694	if (thread->kqueue_fd == -1) {
3695		result = isc__errno2result(errno);
3696		strerror_r(errno, strbuf, sizeof(strbuf));
3697		UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3698				 strbuf);
3699		isc_mem_put(thread->manager->mctx, thread->events,
3700			    sizeof(struct kevent) * thread->nevents);
3701		return (result);
3702	}
3703
3704	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3705	if (result != ISC_R_SUCCESS) {
3706		close(thread->kqueue_fd);
3707		isc_mem_put(thread->manager->mctx, thread->events,
3708			    sizeof(struct kevent) * thread->nevents);
3709	}
3710	return (result);
3711
3712#elif defined(USE_EPOLL)
3713	thread->nevents = ISC_SOCKET_MAXEVENTS;
3714	thread->epoll_events =
3715		isc_mem_get(thread->manager->mctx,
3716			    (thread->manager->maxsocks * sizeof(uint32_t)));
3717
3718	memset(thread->epoll_events, 0,
3719	       thread->manager->maxsocks * sizeof(uint32_t));
3720
3721	thread->events =
3722		isc_mem_get(thread->manager->mctx,
3723			    sizeof(struct epoll_event) * thread->nevents);
3724
3725	thread->epoll_fd = epoll_create(thread->nevents);
3726	if (thread->epoll_fd == -1) {
3727		result = isc__errno2result(errno);
3728		strerror_r(errno, strbuf, sizeof(strbuf));
3729		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3730				 strbuf);
3731		return (result);
3732	}
3733
3734	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3735	return (result);
3736
3737#elif defined(USE_DEVPOLL)
3738	thread->nevents = ISC_SOCKET_MAXEVENTS;
3739	result = isc_resource_getcurlimit(isc_resource_openfiles,
3740					  &thread->open_max);
3741	if (result != ISC_R_SUCCESS) {
3742		thread->open_max = 64;
3743	}
3744	thread->calls = 0;
3745	thread->events = isc_mem_get(thread->manager->mctx,
3746				     sizeof(struct pollfd) * thread->nevents);
3747
3748	/*
3749	 * Note: fdpollinfo should be able to support all possible FDs, so
3750	 * it must have maxsocks entries (not nevents).
3751	 */
3752	thread->fdpollinfo =
3753		isc_mem_get(thread->manager->mctx,
3754			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3755	memset(thread->fdpollinfo, 0,
3756	       sizeof(pollinfo_t) * thread->manager->maxsocks);
3757	thread->devpoll_fd = open("/dev/poll", O_RDWR);
3758	if (thread->devpoll_fd == -1) {
3759		result = isc__errno2result(errno);
3760		strerror_r(errno, strbuf, sizeof(strbuf));
3761		UNEXPECTED_ERROR(__FILE__, __LINE__,
3762				 "open(/dev/poll) failed: %s", strbuf);
3763		isc_mem_put(thread->manager->mctx, thread->events,
3764			    sizeof(struct pollfd) * thread->nevents);
3765		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3766			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3767		return (result);
3768	}
3769	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3770	if (result != ISC_R_SUCCESS) {
3771		close(thread->devpoll_fd);
3772		isc_mem_put(thread->manager->mctx, thread->events,
3773			    sizeof(struct pollfd) * thread->nevents);
3774		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3775			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3776		return (result);
3777	}
3778
3779	return (ISC_R_SUCCESS);
3780#elif defined(USE_SELECT)
3781	UNUSED(result);
3782
3783#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3784	/*
3785	 * Note: this code should also cover the case of MAXSOCKETS <=
3786	 * FD_SETSIZE, but we separate the cases to avoid possible portability
3787	 * issues regarding howmany() and the actual representation of fd_set.
3788	 */
3789	thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3790			     sizeof(fd_mask);
3791#else  /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3792	thread->fd_bufsize = sizeof(fd_set);
3793#endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3794
3795	thread->read_fds = isc_mem_get(thread->manager->mctx,
3796				       thread->fd_bufsize);
3797	thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3798					    thread->fd_bufsize);
3799	thread->write_fds = isc_mem_get(thread->manager->mctx,
3800					thread->fd_bufsize);
3801	thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3802					     thread->fd_bufsize);
3803	memset(thread->read_fds, 0, thread->fd_bufsize);
3804	memset(thread->write_fds, 0, thread->fd_bufsize);
3805
3806	(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3807	thread->maxfd = thread->pipe_fds[0];
3808
3809	return (ISC_R_SUCCESS);
3810#endif /* USE_KQUEUE */
3811}
3812
3813static void
3814cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
3815	isc_result_t result;
3816	int i;
3817
3818	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3819	if (result != ISC_R_SUCCESS) {
3820		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
3821	}
3822#ifdef USE_KQUEUE
3823	close(thread->kqueue_fd);
3824	isc_mem_put(mctx, thread->events,
3825		    sizeof(struct kevent) * thread->nevents);
3826#elif defined(USE_EPOLL)
3827	close(thread->epoll_fd);
3828
3829	isc_mem_put(mctx, thread->events,
3830		    sizeof(struct epoll_event) * thread->nevents);
3831#elif defined(USE_DEVPOLL)
3832	close(thread->devpoll_fd);
3833	isc_mem_put(mctx, thread->events,
3834		    sizeof(struct pollfd) * thread->nevents);
3835	isc_mem_put(mctx, thread->fdpollinfo,
3836		    sizeof(pollinfo_t) * thread->manager->maxsocks);
3837#elif defined(USE_SELECT)
3838	if (thread->read_fds != NULL) {
3839		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
3840	}
3841	if (thread->read_fds_copy != NULL) {
3842		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
3843	}
3844	if (thread->write_fds != NULL) {
3845		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
3846	}
3847	if (thread->write_fds_copy != NULL) {
3848		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
3849	}
3850#endif /* USE_KQUEUE */
3851	for (i = 0; i < (int)thread->manager->maxsocks; i++) {
3852		if (thread->fdstate[i] == CLOSE_PENDING) {
3853			/* no need to lock */
3854			(void)close(i);
3855		}
3856	}
3857
3858#if defined(USE_EPOLL)
3859	isc_mem_put(thread->manager->mctx, thread->epoll_events,
3860		    thread->manager->maxsocks * sizeof(uint32_t));
3861#endif /* if defined(USE_EPOLL) */
3862	isc_mem_put(thread->manager->mctx, thread->fds,
3863		    thread->manager->maxsocks * sizeof(isc_socket_t *));
3864	isc_mem_put(thread->manager->mctx, thread->fdstate,
3865		    thread->manager->maxsocks * sizeof(int));
3866
3867	for (i = 0; i < FDLOCK_COUNT; i++) {
3868		isc_mutex_destroy(&thread->fdlock[i]);
3869	}
3870	isc_mem_put(thread->manager->mctx, thread->fdlock,
3871		    FDLOCK_COUNT * sizeof(isc_mutex_t));
3872}
3873
3874isc_result_t
3875isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3876	return (isc_socketmgr_create2(mctx, managerp, 0, 1));
3877}
3878
3879isc_result_t
3880isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3881		      unsigned int maxsocks, int nthreads) {
3882	int i;
3883	isc_socketmgr_t *manager;
3884
3885	REQUIRE(managerp != NULL && *managerp == NULL);
3886
3887	if (maxsocks == 0) {
3888		maxsocks = ISC_SOCKET_MAXSOCKETS;
3889	}
3890
3891	manager = isc_mem_get(mctx, sizeof(*manager));
3892
3893	/* zero-clear so that necessary cleanup on failure will be easy */
3894	memset(manager, 0, sizeof(*manager));
3895	manager->maxsocks = maxsocks;
3896	manager->reserved = 0;
3897	manager->maxudp = 0;
3898	manager->nthreads = nthreads;
3899	manager->stats = NULL;
3900
3901	manager->magic = SOCKET_MANAGER_MAGIC;
3902	manager->mctx = NULL;
3903	ISC_LIST_INIT(manager->socklist);
3904	isc_mutex_init(&manager->lock);
3905	isc_condition_init(&manager->shutdown_ok);
3906
3907	/*
3908	 * Start up the select/poll thread.
3909	 */
3910	manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3911						     manager->nthreads);
3912	isc_mem_attach(mctx, &manager->mctx);
3913
3914	for (i = 0; i < manager->nthreads; i++) {
3915		manager->threads[i].manager = manager;
3916		manager->threads[i].threadid = i;
3917		setup_thread(&manager->threads[i]);
3918		isc_thread_create(netthread, &manager->threads[i],
3919				  &manager->threads[i].thread);
3920		char tname[1024];
3921		sprintf(tname, "sock-%d", i);
3922		isc_thread_setname(manager->threads[i].thread, tname);
3923	}
3924
3925	*managerp = manager;
3926
3927	return (ISC_R_SUCCESS);
3928}
3929
3930isc_result_t
3931isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3932	REQUIRE(VALID_MANAGER(manager));
3933	REQUIRE(nsockp != NULL);
3934
3935	*nsockp = manager->maxsocks;
3936
3937	return (ISC_R_SUCCESS);
3938}
3939
3940void
3941isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
3942	REQUIRE(VALID_MANAGER(manager));
3943	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3944	REQUIRE(manager->stats == NULL);
3945	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3946
3947	isc_stats_attach(stats, &manager->stats);
3948}
3949
void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;

	/*
	 * Destroy a socket manager.  Blocks until every socket created by
	 * the manager has been destroyed, then shuts down and joins the
	 * watcher threads and frees all manager state.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 * (shutdown_ok is signalled as sockets go away.)
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for thread to exit.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	/* Invalidate the magic before freeing to catch stale users. */
	manager->magic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
4006
/*
 * Common receive path: attempt the I/O immediately where possible,
 * otherwise queue 'dev' on sock->recv_list for the watcher thread.
 *
 * UDP sockets are tried without sock->lock; stream sockets take the
 * lock first and only try immediate I/O when no requests are already
 * queued (to preserve ordering).  Returns ISC_R_INPROGRESS instead of
 * ISC_R_SUCCESS when ISC_SOCKFLAG_IMMEDIATE was requested but the I/O
 * had to be queued.
 */
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			/* Earlier requests pending; queue behind them. */
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		FALLTHROUGH;

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/*
		 * I/O finished immediately.  With IMMEDIATE the caller
		 * inspects 'dev' itself; otherwise post the done event.
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4083
4084isc_result_t
4085isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4086		isc_task_t *task, isc_taskaction_t action, void *arg) {
4087	isc_socketevent_t *dev;
4088	isc_socketmgr_t *manager;
4089
4090	REQUIRE(VALID_SOCKET(sock));
4091	REQUIRE(action != NULL);
4092
4093	manager = sock->manager;
4094	REQUIRE(VALID_MANAGER(manager));
4095
4096	INSIST(sock->bound);
4097
4098	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
4099				   action, arg);
4100	if (dev == NULL) {
4101		return (ISC_R_NOMEMORY);
4102	}
4103
4104	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4105}
4106
4107isc_result_t
4108isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4109		 isc_task_t *task, isc_socketevent_t *event,
4110		 unsigned int flags) {
4111	event->ev_sender = sock;
4112	event->result = ISC_R_UNSET;
4113	event->region = *region;
4114	event->n = 0;
4115	event->offset = 0;
4116	event->attributes = 0;
4117
4118	/*
4119	 * UDP sockets are always partial read.
4120	 */
4121	if (sock->type == isc_sockettype_udp) {
4122		event->minimum = 1;
4123	} else {
4124		if (minimum == 0) {
4125			event->minimum = region->length;
4126		} else {
4127			event->minimum = minimum;
4128		}
4129	}
4130
4131	return (socket_recv(sock, event, task, flags));
4132}
4133
/*
 * Common send path: attempt the I/O immediately where possible,
 * otherwise queue 'dev' on sock->send_list for the watcher thread
 * (unless ISC_SOCKFLAG_NORETRY is set).
 *
 * Mirrors socket_recv(): UDP sockets are tried without sock->lock;
 * stream sockets take the lock and only try immediate I/O when no
 * requests are already queued.  Returns ISC_R_INPROGRESS instead of
 * ISC_R_SUCCESS when ISC_SOCKFLAG_IMMEDIATE was requested but the I/O
 * had to be queued.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			/* Earlier requests pending; queue behind them. */
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* NORETRY: treat the soft failure like a completed send. */
		FALLTHROUGH;

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4235
isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		isc_taskaction_t action, void *arg) {
	/*
	 * Convenience wrapper: a send with no explicit destination
	 * address or pktinfo.
	 *
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
}
4244
4245isc_result_t
4246isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4247		  isc_taskaction_t action, void *arg,
4248		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4249	isc_socketevent_t *dev;
4250	isc_socketmgr_t *manager;
4251
4252	REQUIRE(VALID_SOCKET(sock));
4253	REQUIRE(region != NULL);
4254	REQUIRE(task != NULL);
4255	REQUIRE(action != NULL);
4256
4257	manager = sock->manager;
4258	REQUIRE(VALID_MANAGER(manager));
4259
4260	INSIST(sock->bound);
4261
4262	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4263				   action, arg);
4264	if (dev == NULL) {
4265		return (ISC_R_NOMEMORY);
4266	}
4267
4268	dev->region = *region;
4269
4270	return (socket_send(sock, dev, task, address, pktinfo, 0));
4271}
4272
4273isc_result_t
4274isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4275		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4276		   isc_socketevent_t *event, unsigned int flags) {
4277	REQUIRE(VALID_SOCKET(sock));
4278	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
4279		0);
4280	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
4281		REQUIRE(sock->type == isc_sockettype_udp);
4282	}
4283	event->ev_sender = sock;
4284	event->result = ISC_R_UNSET;
4285	event->region = *region;
4286	event->n = 0;
4287	event->offset = 0;
4288	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
4289
4290	return (socket_send(sock, event, task, address, pktinfo, flags));
4291}
4292
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	/*
	 * Clean up the filesystem entry of a UNIX-domain socket.
	 *
	 * If 'active', the path is ours: verify it is a socket/FIFO and
	 * unlink it unconditionally.  If not 'active', probe the path
	 * with a connect() and only unlink it when the connect fails
	 * with ECONNREFUSED/ECONNRESET, i.e. the file is a stale
	 * leftover from a dead process.
	 */
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

	/* Supply S_ISSOCK/S_ISFIFO on platforms that lack them. */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			if (active) { /* We exited cleanly last time */
				break;
			}
			FALLTHROUGH;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		/* Never unlink anything that is not a socket or a FIFO. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/* Passive mode: probe the path to see whether anyone is listening. */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			/* Nobody home: the socket file is stale; remove it. */
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
#else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}
4413
4414isc_result_t
4415isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4416		    uint32_t owner, uint32_t group) {
4417#ifdef ISC_PLATFORM_HAVESYSUNH
4418	isc_result_t result = ISC_R_SUCCESS;
4419	char strbuf[ISC_STRERRORSIZE];
4420	char path[sizeof(sockaddr->type.sunix.sun_path)];
4421#ifdef NEED_SECURE_DIRECTORY
4422	char *slash;
4423#endif /* ifdef NEED_SECURE_DIRECTORY */
4424
4425	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4426	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4427	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4428
4429#ifdef NEED_SECURE_DIRECTORY
4430	slash = strrchr(path, '/');
4431	if (slash != NULL) {
4432		if (slash != path) {
4433			*slash = '\0';
4434		} else {
4435			strlcpy(path, "/", sizeof(path));
4436		}
4437	} else {
4438		strlcpy(path, ".", sizeof(path));
4439	}
4440#endif /* ifdef NEED_SECURE_DIRECTORY */
4441
4442	if (chmod(path, perm) < 0) {
4443		strerror_r(errno, strbuf, sizeof(strbuf));
4444		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4445			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4446			      "isc_socket_permunix: chmod(%s, %d): %s", path,
4447			      perm, strbuf);
4448		result = ISC_R_FAILURE;
4449	}
4450	if (chown(path, owner, group) < 0) {
4451		strerror_r(errno, strbuf, sizeof(strbuf));
4452		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4453			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4454			      "isc_socket_permunix: chown(%s, %d, %d): %s",
4455			      path, owner, group, strbuf);
4456		result = ISC_R_FAILURE;
4457	}
4458	return (result);
4459#else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
4460	UNUSED(sockaddr);
4461	UNUSED(perm);
4462	UNUSED(owner);
4463	UNUSED(group);
4464	return (ISC_R_NOTIMPLEMENTED);
4465#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
4466}
4467
isc_result_t
isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	/*
	 * Bind 'sock' to 'sockaddr', optionally enabling address reuse
	 * first, and translate common bind(2) errno values into ISC
	 * result codes.  The socket must not already be bound or dupped.
	 */
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);
	INSIST(!sock->dupped);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
		/*
		 * Also set SO_REUSEPORT_LB (FreeBSD) or SO_REUSEPORT
		 * (Linux) where available; failures are logged but
		 * non-fatal.
		 */
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif		/* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Map well-known bind(2) failures to ISC result codes. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4551
4552/*
4553 * Enable this only for specific OS versions, and only when they have repaired
4554 * their problems with it.  Until then, this is is broken and needs to be
4555 * disabled by default.  See RT22589 for details.
4556 */
4557#undef ENABLE_ACCEPTFILTER
4558
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
	/*
	 * Install a BSD accept filter named 'filter' on a listening
	 * socket.  The feature is compiled out unless both
	 * SO_ACCEPTFILTER and ENABLE_ACCEPTFILTER are defined (the
	 * latter is force-undefined above; see RT22589), in which case
	 * this returns ISC_R_NOTIMPLEMENTED.
	 */
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	/* NOTE(review): bzero() is deprecated POSIX; consider memset(). */
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}
4587
4588/*
4589 * Try enabling TCP Fast Open for a given socket if the OS supports it.
4590 */
static void
set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
	/*
	 * Try enabling TCP Fast Open on 'sock' if compiled in and
	 * supported.  'backlog' is used to derive the TCP_FASTOPEN
	 * option value (halved, with a minimum of 1; fixed at 1 on
	 * macOS).  Failures are logged but deliberately ignored.
	 */
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

/*
 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
 * shipping a default kernel without TFO support, so we special-case it by
 * performing an additional runtime check for TFO support using sysctl to
 * prevent setsockopt() errors from being logged.
 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

#ifdef __APPLE__
	backlog = 1;
#else  /* ifdef __APPLE__ */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else  /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}
4652
4653/*
4654 * Set up to listen on a given socket.  We do this by creating an internal
4655 * event that will be dispatched when the socket has read activity.  The
4656 * watcher will send the internal event to the task when there is a new
4657 * connection.
4658 *
4659 * Unlike in read, we don't preallocate a done event here.  Every time there
4660 * is a new connection we'll have to allocate a new one anyway, so we might
4661 * as well keep things simple rather than having to track them.
4662 */
4663isc_result_t
4664isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4665	char strbuf[ISC_STRERRORSIZE];
4666
4667	REQUIRE(VALID_SOCKET(sock));
4668
4669	LOCK(&sock->lock);
4670
4671	REQUIRE(!sock->listener);
4672	REQUIRE(sock->bound);
4673	REQUIRE(sock->type == isc_sockettype_tcp ||
4674		sock->type == isc_sockettype_unix);
4675
4676	if (backlog == 0) {
4677		backlog = SOMAXCONN;
4678	}
4679
4680	if (listen(sock->fd, (int)backlog) < 0) {
4681		UNLOCK(&sock->lock);
4682		strerror_r(errno, strbuf, sizeof(strbuf));
4683
4684		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4685
4686		return (ISC_R_UNEXPECTED);
4687	}
4688
4689	set_tcp_fastopen(sock, backlog);
4690
4691	sock->listener = 1;
4692
4693	UNLOCK(&sock->lock);
4694	return (ISC_R_SUCCESS);
4695}
4696
4697/*
4698 * This should try to do aggressive accept() XXXMLG
4699 */
isc_result_t
isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
		  void *arg) {
	/*
	 * Queue an accept request on listening socket 'sock'.  When a new
	 * connection arrives, an ISC_SOCKEVENT_NEWCONN event carrying the
	 * pre-allocated child socket is delivered to 'task'.
	 */
	isc_socket_newconnev_t *dev;
	isc_socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc_socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* Pre-allocate the socket object the new connection will use. */
	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* Task is shutting down: undo all allocations above. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4766
/*
 * Initiate a connect() on 'sock' to 'addr' and deliver an
 * ISC_SOCKEVENT_CONNECT event to 'task' when the attempt completes.
 * The connect may finish immediately (the event is posted right away),
 * fail with a mapped ISC result, or be queued for the watcher to
 * complete asynchronously.
 */
isc_result_t
isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc_socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	/* NOTE(review): duplicates the REQUIRE(addr != NULL) above. */
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr)) {
		return (ISC_R_MULTICAST);
	}

	LOCK(&sock->lock);

	dev = (isc_socket_connev_t *)isc_event_allocate(
		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/*
	 * A connect is already outstanding; it must be to the same peer.
	 * Just queue this event behind it.
	 */
	if (sock->connecting) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		goto queue;
	}

	/* Already connected (to the same peer): complete immediately. */
	if (sock->connected) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		return (ISC_R_SUCCESS);
	}

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * The socket is nonblocking and the connection cannot be
		 * completed immediately.  It is possible to select(2) or
		 * poll(2) for completion by selecting the socket for writing.
		 * After select(2) indicates writability, use getsockopt(2) to
		 * read the SO_ERROR option at level SOL_SOCKET to determine
		 * whether connect() completed successfully (SO_ERROR is zero)
		 * or unsuccessfully (SO_ERROR is one of the usual error codes
		 * listed here, explaining the reason for the failure).
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			goto queue;
		}

		/* Map well-known errno values to ISC result codes. */
		switch (errno) {
#define ERROR_MATCH(a, b)        \
	case a:                  \
		dev->result = b; \
		goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: log it and fail the call itself. */
		sock->connected = 0;

		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/* Known failure: report it through the event, not here. */
		sock->connected = 0;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
	if (do_poke && !sock->connecting) {
		sock->connecting = 1;
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_CONNECT);
	}

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4925
4926/*
4927 * Called when a socket with a pending connect() finishes.
4928 */
static void
internal_connect(isc_socket_t *sock) {
	/*
	 * Complete a pending non-blocking connect(): read SO_ERROR to
	 * learn its outcome, translate it into an ISC result, and deliver
	 * that result to every queued connect event.
	 * NOTE(review): appears to be invoked from the watcher with the
	 * socket already locked -- confirm against the dispatch code.
	 */
	isc_socket_connev_t *dev;
	int cc;
	isc_result_t result;
	socklen_t optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	/*
	 * Get the first item off the connect list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->connect_list);
	if (dev == NULL) {
		INSIST(!sock->connecting);
		goto finish;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
		       (void *)&optlen) != 0)
	{
		cc = errno;
	} else {
		errno = cc;
	}

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b)   \
	case a:             \
		result = b; \
		break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	/* Deliver the same outcome to every waiting connect event. */
	do {
		dev->result = result;
		send_connectdone_event(sock, &dev);
		dev = ISC_LIST_HEAD(sock->connect_list);
	} while (dev != NULL);

finish:
	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
		   SELECT_POKE_CONNECT);
}
5029
5030isc_result_t
5031isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5032	isc_result_t result;
5033
5034	REQUIRE(VALID_SOCKET(sock));
5035	REQUIRE(addressp != NULL);
5036
5037	LOCK(&sock->lock);
5038
5039	if (sock->connected) {
5040		*addressp = sock->peer_address;
5041		result = ISC_R_SUCCESS;
5042	} else {
5043		result = ISC_R_NOTCONNECTED;
5044	}
5045
5046	UNLOCK(&sock->lock);
5047
5048	return (result);
5049}
5050
5051isc_result_t
5052isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5053	socklen_t len;
5054	isc_result_t result;
5055	char strbuf[ISC_STRERRORSIZE];
5056
5057	REQUIRE(VALID_SOCKET(sock));
5058	REQUIRE(addressp != NULL);
5059
5060	LOCK(&sock->lock);
5061
5062	if (!sock->bound) {
5063		result = ISC_R_NOTBOUND;
5064		goto out;
5065	}
5066
5067	result = ISC_R_SUCCESS;
5068
5069	len = sizeof(addressp->type);
5070	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5071		strerror_r(errno, strbuf, sizeof(strbuf));
5072		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
5073		result = ISC_R_UNEXPECTED;
5074		goto out;
5075	}
5076	addressp->length = (unsigned int)len;
5077
5078out:
5079	UNLOCK(&sock->lock);
5080
5081	return (result);
5082}
5083
5084/*
5085 * Run through the list of events on this socket, and cancel the ones
5086 * queued for task "task" of type "how".  "how" is a bitmask.
5087 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	/* Cancel queued receives (task == NULL matches every task). */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list))
	{
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	/* Cancel queued sends. */
	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list))
	{
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	/*
	 * Cancel queued accepts; release the pre-allocated child socket
	 * before posting the canceled event.
	 */
	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list))
	{
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				isc_refcount_decrementz(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc_socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	/* Cancel queued connects and clear the connecting flag. */
	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
5211
isc_sockettype_t
isc_socket_gettype(isc_socket_t *sock) {
	/* Accessor: the socket's type (udp, tcp, unix, ...). */
	REQUIRE(VALID_SOCKET(sock));

	return (sock->type);
}
5218
void
isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
	/*
	 * Set or clear the IPV6_V6ONLY option on an AF_INET6 socket.
	 * No-op where IPV6_V6ONLY is unavailable and on OpenBSD; any
	 * setsockopt() failure is logged but otherwise ignored.
	 */
#if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
	int onoff = yes ? 1 : 0;
#else  /* if defined(IPV6_V6ONLY) && !defined(__OpenBSD__) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) && !defined(__OpenBSD__) */

	REQUIRE(VALID_SOCKET(sock));
	INSIST(!sock->dupped);

#if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* if defined(IPV6_V6ONLY) && !defined(__OpenBSD__) */
}
5246
5247static void
5248setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
5249#if defined(IP_TOS) || defined(IPV6_TCLASS)
5250	int value = dscp << 2;
5251#endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5252
5253	sock->dscp = dscp;
5254
5255#ifdef IP_TOS
5256	if (sock->pf == AF_INET) {
5257		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5258			       sizeof(value)) < 0)
5259		{
5260			char strbuf[ISC_STRERRORSIZE];
5261			strerror_r(errno, strbuf, sizeof(strbuf));
5262			UNEXPECTED_ERROR(__FILE__, __LINE__,
5263					 "setsockopt(%d, IP_TOS, %.02x) "
5264					 "failed: %s",
5265					 sock->fd, value >> 2, strbuf);
5266		}
5267	}
5268#endif /* ifdef IP_TOS */
5269#ifdef IPV6_TCLASS
5270	if (sock->pf == AF_INET6) {
5271		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5272			       (void *)&value, sizeof(value)) < 0)
5273		{
5274			char strbuf[ISC_STRERRORSIZE];
5275			strerror_r(errno, strbuf, sizeof(strbuf));
5276			UNEXPECTED_ERROR(__FILE__, __LINE__,
5277					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5278					 "failed: %s",
5279					 sock->fd, dscp >> 2, strbuf);
5280		}
5281	}
5282#endif /* ifdef IPV6_TCLASS */
5283}
5284
void
isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
	/*
	 * Public entry point for setting the socket's DSCP value.
	 * 'dscp' must be below 0x40 (a six-bit code point); a negative
	 * value means "unset" and is ignored on platforms with
	 * IP_TOS/IPV6_TCLASS support.
	 */
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40);

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else  /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}
5309
5310isc_socketevent_t *
5311isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
5312		       isc_taskaction_t action, void *arg) {
5313	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
5314}
5315
5316void
5317isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
5318	/*
5319	 * Name 'sock'.
5320	 */
5321
5322	REQUIRE(VALID_SOCKET(sock));
5323
5324	LOCK(&sock->lock);
5325	strlcpy(sock->name, name, sizeof(sock->name));
5326	sock->tag = tag;
5327	UNLOCK(&sock->lock);
5328}
5329
5330const char *
5331isc_socket_getname(isc_socket_t *sock) {
5332	return (sock->name);
5333}
5334
5335void *
5336isc_socket_gettag(isc_socket_t *sock) {
5337	return (sock->tag);
5338}
5339
5340int
5341isc_socket_getfd(isc_socket_t *sock) {
5342	return ((short)sock->fd);
5343}
5344
/*
 * One-time probe state for SO_REUSEPORT support; 'hasreuseport' is set
 * by init_hasreuseport() and read via isc_socket_hasreuseport().
 */
static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
static bool hasreuseport = false;
5347
static void
init_hasreuseport(void) {
/*
 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
 * We only want to use it on Linux, if it's available; on FreeBSD the
 * equivalent is SO_REUSEPORT_LB.  On other BSDs we want to dup() sockets
 * instead of re-binding them, so no probe is done there.
 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
#if defined(__FreeBSD_kernel__)
	const int reuse_opt = SO_REUSEPORT_LB;
#else  /* if defined(__FreeBSD_kernel__) */
	const int reuse_opt = SO_REUSEPORT;
#endif /* if defined(__FreeBSD_kernel__) */
	int yes = 1;
	int fd;

	/* Probe with a throwaway UDP socket, trying IPv4 then IPv6. */
	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		fd = socket(AF_INET6, SOCK_DGRAM, 0);
	}
	if (fd < 0) {
		return;
	}

	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) == 0 &&
	    setsockopt(fd, SOL_SOCKET, reuse_opt, (void *)&yes,
		       sizeof(yes)) == 0)
	{
		hasreuseport = true;
	}
	close(fd);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}
5386
5387bool
5388isc_socket_hasreuseport() {
5389	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
5390		      ISC_R_SUCCESS);
5391	return (hasreuseport);
5392}
5393
5394#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
5395static const char *
5396_socktype(isc_sockettype_t type) {
5397	switch (type) {
5398	case isc_sockettype_udp:
5399		return ("udp");
5400	case isc_sockettype_tcp:
5401		return ("tcp");
5402	case isc_sockettype_unix:
5403		return ("unix");
5404	case isc_sockettype_fdwatch:
5405		return ("fdwatch");
5406	default:
5407		return ("not-initialized");
5408	}
5409}
5410#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5411
5412#ifdef HAVE_LIBXML2
/*
 * Evaluate a libxml2 text-writer call and jump to 'error' on failure
 * (the writer functions return a negative value on error).  The last
 * return code is left in 'xmlrc'.
 */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
/*
 * Render the socket list of 'mgr' as XML (a <sockets> element) using
 * the libxml2 text writer passed in 'writer0'.  Returns the last
 * libxml2 writer return code: negative on error, >= 0 on success.
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		/* Each socket is emitted under its own <socket> element. */
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		/* Local address comes from the kernel, not cached state. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/*
	 * TRY0 jumps here with 'sock' still locked when a writer call
	 * fails inside the loop; 'sock' is NULL once the loop has ended,
	 * so the unlock below is correct on both paths.
	 */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
5515#endif /* HAVE_LIBXML2 */
5516
5517#ifdef HAVE_JSON_C
/*
 * Set 'result' and jump to 'error' when a json-c allocation fails.
 */
#define CHECKMEM(m)                              \
	do {                                     \
		if (m == NULL) {                 \
			result = ISC_R_NOMEMORY; \
			goto error;              \
		}                                \
	} while (0)
5525
5526isc_result_t
5527isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
5528	isc_result_t result = ISC_R_SUCCESS;
5529	isc_socket_t *sock = NULL;
5530	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5531	isc_sockaddr_t addr;
5532	socklen_t len;
5533	json_object *obj, *array = json_object_new_array();
5534	json_object *stats = (json_object *)stats0;
5535
5536	CHECKMEM(array);
5537
5538	LOCK(&mgr->lock);
5539
5540	sock = ISC_LIST_HEAD(mgr->socklist);
5541	while (sock != NULL) {
5542		json_object *states, *entry = json_object_new_object();
5543		char buf[255];
5544
5545		CHECKMEM(entry);
5546		json_object_array_add(array, entry);
5547
5548		LOCK(&sock->lock);
5549
5550		snprintf(buf, sizeof(buf), "%p", sock);
5551		obj = json_object_new_string(buf);
5552		CHECKMEM(obj);
5553		json_object_object_add(entry, "id", obj);
5554
5555		if (sock->name[0] != 0) {
5556			obj = json_object_new_string(sock->name);
5557			CHECKMEM(obj);
5558			json_object_object_add(entry, "name", obj);
5559		}
5560
5561		obj = json_object_new_int(
5562			(int)isc_refcount_current(&sock->references));
5563		CHECKMEM(obj);
5564		json_object_object_add(entry, "references", obj);
5565
5566		obj = json_object_new_string(_socktype(sock->type));
5567		CHECKMEM(obj);
5568		json_object_object_add(entry, "type", obj);
5569
5570		if (sock->connected) {
5571			isc_sockaddr_format(&sock->peer_address, peerbuf,
5572					    sizeof(peerbuf));
5573			obj = json_object_new_string(peerbuf);
5574			CHECKMEM(obj);
5575			json_object_object_add(entry, "peer-address", obj);
5576		}
5577
5578		len = sizeof(addr);
5579		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5580			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5581			obj = json_object_new_string(peerbuf);
5582			CHECKMEM(obj);
5583			json_object_object_add(entry, "local-address", obj);
5584		}
5585
5586		states = json_object_new_array();
5587		CHECKMEM(states);
5588		json_object_object_add(entry, "states", states);
5589
5590		if (sock->listener) {
5591			obj = json_object_new_string("listener");
5592			CHECKMEM(obj);
5593			json_object_array_add(states, obj);
5594		}
5595
5596		if (sock->connected) {
5597			obj = json_object_new_string("connected");
5598			CHECKMEM(obj);
5599			json_object_array_add(states, obj);
5600		}
5601
5602		if (sock->connecting) {
5603			obj = json_object_new_string("connecting");
5604			CHECKMEM(obj);
5605			json_object_array_add(states, obj);
5606		}
5607
5608		if (sock->bound) {
5609			obj = json_object_new_string("bound");
5610			CHECKMEM(obj);
5611			json_object_array_add(states, obj);
5612		}
5613
5614		UNLOCK(&sock->lock);
5615		sock = ISC_LIST_NEXT(sock, link);
5616	}
5617
5618	json_object_object_add(stats, "sockets", array);
5619	array = NULL;
5620	result = ISC_R_SUCCESS;
5621
5622error:
5623	if (array != NULL) {
5624		json_object_put(array);
5625	}
5626
5627	if (sock != NULL) {
5628		UNLOCK(&sock->lock);
5629	}
5630
5631	UNLOCK(&mgr->lock);
5632
5633	return (result);
5634}
5635#endif /* HAVE_JSON_C */
5636
5637/*
5638 * Create a new 'type' socket managed by 'manager'.  Events
5639 * will be posted to 'task' and when dispatched 'action' will be
5640 * called with 'arg' as the arg value.  The new socket is returned
5641 * in 'socketp'.
5642 */
isc_result_t
isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			 isc_sockfdwatch_t callback, void *cbarg,
			 isc_task_t *task, isc_socket_t **socketp)
{
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	/* The descriptor must fit within the manager's fd tables. */
	if (fd < 0 || (unsigned int)fd >= manager->maxsocks)
		return (ISC_R_RANGE);

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/* Record the watched descriptor and the callback parameters. */
	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;

	sock->threadid = gen_threadid(sock);
	isc_refcount_init(&sock->references, 1);
	thread = &manager->threads[sock->threadid];
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the descriptor in its watcher thread's fd tables. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;

#if defined(USE_EPOLL)
	manager->epoll_events[sock->fd] = 0;
#endif
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd)
		thread->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Arm the requested read/write watches. */
	sock->active = 1;
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->threadid, sock->fd,
		    SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->threadid, sock->fd,
		    SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, "fdwatch-created");

	return (ISC_R_SUCCESS);
}
5713
5714/*
5715 * Indicate to the manager that it should watch the socket again.
5716 * This can be used to restart watching if the previous event handler
5717 * didn't indicate there was more data to be processed.  Primarily
5718 * it is for writing but could be used for reading if desired
5719 */
5720
5721isc_result_t
5722isc_socket_fdwatchpoke(isc_socket_t *sock, int flags)
5723{
5724	REQUIRE(VALID_SOCKET(sock));
5725
5726	/*
5727	 * We check both flags first to allow us to get the lock
5728	 * once but only if we need it.
5729	 */
5730
5731	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
5732		LOCK(&sock->lock);
5733		if ((flags & ISC_SOCKFDWATCH_READ) != 0)
5734			select_poke(sock->manager, sock->threadid, sock->fd,
5735				    SELECT_POKE_READ);
5736		if ((flags & ISC_SOCKFDWATCH_WRITE) != 0)
5737			select_poke(sock->manager, sock->threadid, sock->fd,
5738				    SELECT_POKE_WRITE);
5739		UNLOCK(&sock->lock);
5740	}
5741
5742	socket_log(sock, NULL, TRACE, "fdwatch-poked flags: %d", flags);
5743
5744	return (ISC_R_SUCCESS);
5745}
5746