1/*
2 * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 2000-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id$ */
19
20/* This code uses functions which are only available on Server 2003 and
21 * higher, and Windows XP and higher.
22 *
23 * This code is by nature multithreaded and takes advantage of various
24 * features to pass on information through the completion port for
25 * when I/O is completed.  All sends, receives, accepts, and connects are
26 * completed through the completion port.
27 *
28 * The number of Completion Port Worker threads used is the total number
29 * of CPU's + 1. This increases the likelihood that a Worker Thread is
30 * available for processing a completed request.
31 *
32 * XXXPDM 5 August, 2002
33 */
34
35#define MAKE_EXTERNAL 1
36#include <config.h>
37
38#include <sys/types.h>
39
40#ifndef _WINSOCKAPI_
41#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
42#endif
43
44#include <errno.h>
45#include <stddef.h>
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
49#include <io.h>
50#include <fcntl.h>
51#include <process.h>
52
53#include <isc/buffer.h>
54#include <isc/bufferlist.h>
55#include <isc/condition.h>
56#include <isc/list.h>
57#include <isc/log.h>
58#include <isc/mem.h>
59#include <isc/msgs.h>
60#include <isc/mutex.h>
61#include <isc/net.h>
62#include <isc/once.h>
63#include <isc/os.h>
64#include <isc/platform.h>
65#include <isc/print.h>
66#include <isc/region.h>
67#include <isc/socket.h>
68#include <isc/stats.h>
69#include <isc/strerror.h>
70#include <isc/syslog.h>
71#include <isc/task.h>
72#include <isc/thread.h>
73#include <isc/util.h>
74#include <isc/win32os.h>
75
76#include <mswsock.h>
77
78#include "errno2result.h"
79
/*
 * How in the world can Microsoft exist with APIs like this?
 * We can't actually call these directly, because it turns out
 * no library exports these functions.  Instead, we need to
 * issue a runtime call to get their addresses; initialise() stores
 * the results in the pointers below.
 */
LPFN_CONNECTEX ISCConnectEx;
LPFN_ACCEPTEX ISCAcceptEx;
LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
89
/*
 * Run expensive internal consistency checks.
 *
 * CONSISTENT(sock) calls consistent(sock) only when
 * ISC_SOCKET_CONSISTENCY_CHECKS is defined; otherwise it expands to
 * an empty statement.
 */
#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
#define CONSISTENT(sock) consistent(sock)
#else
#define CONSISTENT(sock) do {} while (0)
#endif
static void consistent(isc_socket_t *sock);
99
/*
 * Define this macro to control the behavior of connection
 * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
 * for details.
 * NOTE: This requires that Windows 2000 systems install Service Pack 2
 * or later.
 */
#ifndef SIO_UDP_CONNRESET
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
#endif

/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 * Note that an error value of 0 is also treated as soft.
 */
#define SOFT_ERROR(e)	((e) == WSAEINTR || \
			 (e) == WSAEWOULDBLOCK || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == EAGAIN || \
			 (e) == 0)

/*
 * Pending errors are not really errors and should be
 * kept separate.  An error value of 0 also counts as pending.
 */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)

/*
 * Status codes describing the outcome of an I/O attempt.
 */
#define DOIO_SUCCESS	  0       /* i/o ok, event sent */
#define DOIO_SOFT	  1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD	  2       /* i/o error, event sent */
#define DOIO_EOF	  3       /* EOF, no event sent */
#define DOIO_PENDING	  4       /* status when i/o is in process */
#define DOIO_NEEDMORE	  5       /* IO was processed, but we need more due to minimum */

/*
 * Expands to the (category, module, level) argument triple used by the
 * logging helpers in this file, at debug level x.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
163
/* Internal events are plain isc_event_t objects. */
typedef isc_event_t intev_t;

/*
 * Socket State
 *
 * Stored in isc_socket.state via _set_state(); per that field's own
 * comment it is used for debugging and consistency checking.
 */
enum {
  SOCK_INITIALIZED,	/* Socket Initialized */
  SOCK_OPEN,		/* Socket opened but nothing yet to do */
  SOCK_DATA,		/* Socket sending or receiving data */
  SOCK_LISTEN,		/* TCP Socket listening for connects */
  SOCK_ACCEPT,		/* TCP socket is waiting to accept */
  SOCK_CONNECT,		/* TCP Socket connecting */
  SOCK_CLOSED,		/* Socket has been closed */
};

/* Magic number stamped into every isc_socket, and its validity check. */
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)

/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * We really don't want to try and use these control messages. Win32
 * doesn't have this mechanism before XP.  The #undef below therefore
 * unconditionally overrides the definition made just above.
 */
#undef USE_CMSG
198
/*
 * Message header for recvmsg and sendmsg calls.
 * Used value-result for recvmsg, value only for sendmsg.
 *
 * NOTE(review): the trailing "msghdr" after the closing brace declares a
 * file-scope object of this type in addition to the struct tag; nothing
 * in the visible code uses that object -- confirm whether it is intended.
 */
struct msghdr {
	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
	int      to_addr_len;		/* length of the address */
	WSABUF  *msg_iov;		/* scatter/gather array */
	u_int   msg_iovlen;             /* # elements in msg_iov */
	void	*msg_control;           /* ancillary data, see below */
	u_int   msg_controllen;         /* ancillary data buffer len */
	int	msg_totallen;		/* total length of this message */
} msghdr;

/*
 * The size to raise the receive buffer to.
 */
#define RCVBUFSIZE (32*1024)

/*
 * The number of times a send operation is repeated if the result
 * is WSAEINTR.
 */
#define NRETRIES 10
223
/*
 * Per-socket state.  The field groups below are annotated with the lock
 * that is documented to protect them.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;

	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references; /* EXTERNAL references */
	SOCKET			fd;	/* file handle */
	int			pf;	/* protocol family */
	char			name[16];
	void *			tag;

	/*
	 * Each recv() call uses this buffer.  It is a per-socket receive
	 * buffer that allows us to decouple the system recv() from the
	 * recv_list done events.  This means the items on the recv_list
	 * can be removed without having to cancel pending system recv()
	 * calls.  It also allows us to read-ahead in some cases.
	 */
	struct {
		SOCKADDR_STORAGE	from_addr;	   /* UDP send/recv address */
		int		from_addr_len;	   /* length of the address */
		char		*base;		   /* the base of the buffer */
		char		*consume_position; /* where to start copying data from next */
		unsigned int	len;		   /* the actual size of this buffer */
		unsigned int	remaining;	   /* the number of bytes remaining */
	} recvbuf;

	/* Queues of requests waiting for completion. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	isc_sockaddr_t		address;  /* remote address */

	unsigned int		listener : 1,	/* listener socket */
				connected : 1,
				pending_connect : 1, /* connect pending */
				bound : 1;	/* bound to local addr */
	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
	unsigned int		pending_send;  /* Number of outstanding send() calls. */
	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
	unsigned int		state; /* Socket state. Debugging and consistency checking. */
	int			state_lineno;  /* line which last touched state */
};

/*
 * Set the socket's state and record the source line that did so in
 * state_lineno.
 */
#define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (0)
278
/*
 * Buffer structure
 *
 * One node of a list of heap-allocated data buffers; see
 * IoCompletionInfo.bufferlist.
 */
typedef struct buflist buflist_t;

struct buflist {
	void			*buf;		/* the data */
	unsigned int		buflen;		/* number of bytes in buf */
	ISC_LINK(buflist_t)	link;
};

/*
 * I/O Completion ports Info structures
 */

/* Private heap from which IoCompletionInfo and buflist_t are allocated. */
static HANDLE hHeapHandle = NULL;

/*
 * Per-request context.  The OVERLAPPED member comes first, and the code
 * in this file passes a pointer to this structure, cast to
 * LPWSAOVERLAPPED, to the overlapped Winsock calls.
 */
typedef struct IoCompletionInfo {
	OVERLAPPED		overlapped;
	isc_socketevent_t	*dev;  /* send()/recv() done event */
	isc_socket_connev_t	*cdev; /* connect() done event */
	isc_socket_newconnev_t	*adev; /* accept() done event */
	void			*acceptbuffer;
	DWORD			received_bytes;
	int			request_type;	/* one of the SOCKET_* request types */
	struct msghdr		messagehdr;
	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
} IoCompletionInfo;
306
/*
 * Define a maximum number of I/O Completion Port worker threads
 * to handle the load on the Completion Port. The actual number
 * used is the number of CPU's + 1, capped at this maximum.
 */
#define MAX_IOCPTHREADS 20

/* Magic number stamped into every socket manager, and its validity check. */
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*
 * Socket manager: owns the completion port, its worker threads and the
 * list of sockets.
 */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
	isc_stats_t		       *stats;

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)		socklist;
	isc_boolean_t			bShutdown;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;	/* number of worker threads in use */
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];

	/*
	 * Debugging.
	 * Modified by InterlockedIncrement() and InterlockedDecrement()
	 */
	LONG				totalSockets;
	LONG				iocp_total;
};

/*
 * Request types stored in IoCompletionInfo.request_type.
 */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
353
/*
 * Forward declarations of helpers that are used before they are defined.
 */
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
static void maybe_free_socket(isc_socket_t **, int);
static void free_socket(isc_socket_t **, int);
static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
static void queue_receive_request(isc_socket_t *sock);
367
368/*
369 * This is used to dump the contents of the sock structure
370 * You should make sure that the sock is locked before
371 * dumping it. Since the code uses simple printf() statements
372 * it should only be used interactively.
373 */
374void
375sock_dump(isc_socket_t *sock) {
376	isc_socketevent_t *ldev;
377	isc_socket_newconnev_t *ndev;
378
379#if 0
380	isc_sockaddr_t addr;
381	char socktext[256];
382
383	isc_socket_getpeername(sock, &addr);
384	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
385	printf("Remote Socket: %s\n", socktext);
386	isc_socket_getsockname(sock, &addr);
387	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
388	printf("This Socket: %s\n", socktext);
389#endif
390
391	printf("\n\t\tSock Dump\n");
392	printf("\t\tfd: %u\n", sock->fd);
393	printf("\t\treferences: %d\n", sock->references);
394	printf("\t\tpending_accept: %d\n", sock->pending_accept);
395	printf("\t\tconnecting: %d\n", sock->pending_connect);
396	printf("\t\tconnected: %d\n", sock->connected);
397	printf("\t\tbound: %d\n", sock->bound);
398	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
399	printf("\t\tsocket type: %d\n", sock->type);
400
401	printf("\n\t\tSock Recv List\n");
402	ldev = ISC_LIST_HEAD(sock->recv_list);
403	while (ldev != NULL) {
404		printf("\t\tdev: %p\n", ldev);
405		ldev = ISC_LIST_NEXT(ldev, ev_link);
406	}
407
408	printf("\n\t\tSock Send List\n");
409	ldev = ISC_LIST_HEAD(sock->send_list);
410	while (ldev != NULL) {
411		printf("\t\tdev: %p\n", ldev);
412		ldev = ISC_LIST_NEXT(ldev, ev_link);
413	}
414
415	printf("\n\t\tSock Accept List\n");
416	ndev = ISC_LIST_HEAD(sock->accept_list);
417	while (ndev != NULL) {
418		printf("\t\tdev: %p\n", ldev);
419		ndev = ISC_LIST_NEXT(ndev, ev_link);
420	}
421}
422
/*
 * Forward declaration of the per-socket logging helper, which is defined
 * further down in this file.
 * NOTE(review): ISC_FORMAT_PRINTF(9, 10) presumably marks argument 9 (fmt)
 * as a printf-style format whose variadic arguments start at argument 10 --
 * confirm against the macro's definition.
 */
static void
socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
428
429/*  This function will add an entry to the I/O completion port
430 *  that will signal the I/O thread to exit (gracefully)
431 */
432static void
433signal_iocompletionport_exit(isc_socketmgr_t *manager) {
434	int i;
435	int errval;
436	char strbuf[ISC_STRERRORSIZE];
437
438	REQUIRE(VALID_MANAGER(manager));
439	for (i = 0; i < manager->maxIOCPThreads; i++) {
440		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
441						0, 0, 0)) {
442			errval = GetLastError();
443			isc__strerror(errval, strbuf, sizeof(strbuf));
444			FATAL_ERROR(__FILE__, __LINE__,
445				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
446				ISC_MSG_FAILED,
447				"Can't request service thread to exit: %s"),
448				strbuf);
449		}
450	}
451}
452
453/*
454 * Create the worker threads for the I/O Completion Port
455 */
456void
457iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
458	int errval;
459	char strbuf[ISC_STRERRORSIZE];
460	int i;
461
462	INSIST(total_threads > 0);
463	REQUIRE(VALID_MANAGER(manager));
464	/*
465	 * We need at least one
466	 */
467	for (i = 0; i < total_threads; i++) {
468		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
469						manager, 0,
470						&manager->dwIOCPThreadIds[i]);
471		if (manager->hIOCPThreads[i] == NULL) {
472			errval = GetLastError();
473			isc__strerror(errval, strbuf, sizeof(strbuf));
474			FATAL_ERROR(__FILE__, __LINE__,
475				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
476				ISC_MSG_FAILED,
477				"Can't create IOCP thread: %s"),
478				strbuf);
479			exit(1);
480		}
481	}
482}
483
484/*
485 *  Create/initialise the I/O completion port
486 */
487void
488iocompletionport_init(isc_socketmgr_t *manager) {
489	int errval;
490	char strbuf[ISC_STRERRORSIZE];
491
492	REQUIRE(VALID_MANAGER(manager));
493	/*
494	 * Create a private heap to handle the socket overlapped structure
495	 * The minimum number of structures is 10, there is no maximum
496	 */
497	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
498	if (hHeapHandle == NULL) {
499		errval = GetLastError();
500		isc__strerror(errval, strbuf, sizeof(strbuf));
501		FATAL_ERROR(__FILE__, __LINE__,
502			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
503					   ISC_MSG_FAILED,
504					   "HeapCreate() failed during "
505					   "initialization: %s"),
506			    strbuf);
507		exit(1);
508	}
509
510	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
511
512	/* Now Create the Completion Port */
513	manager->hIoCompletionPort = CreateIoCompletionPort(
514			INVALID_HANDLE_VALUE, NULL,
515			0, manager->maxIOCPThreads);
516	if (manager->hIoCompletionPort == NULL) {
517		errval = GetLastError();
518		isc__strerror(errval, strbuf, sizeof(strbuf));
519		FATAL_ERROR(__FILE__, __LINE__,
520				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
521				ISC_MSG_FAILED,
522				"CreateIoCompletionPort() failed "
523				"during initialization: %s"),
524				strbuf);
525		exit(1);
526	}
527
528	/*
529	 * Worker threads for servicing the I/O
530	 */
531	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
532}
533
534/*
535 * Associate a socket with an IO Completion Port.  This allows us to queue events for it
536 * and have our worker pool of threads process them.
537 */
538void
539iocompletionport_update(isc_socket_t *sock) {
540	HANDLE hiocp;
541	char strbuf[ISC_STRERRORSIZE];
542
543	REQUIRE(VALID_SOCKET(sock));
544
545	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
546		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
547
548	if (hiocp == NULL) {
549		DWORD errval = GetLastError();
550		isc__strerror(errval, strbuf, sizeof(strbuf));
551		isc_log_iwrite(isc_lctx,
552				ISC_LOGCATEGORY_GENERAL,
553				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
554				isc_msgcat, ISC_MSGSET_SOCKET,
555				ISC_MSG_TOOMANYHANDLES,
556				"iocompletionport_update: failed to open"
557				" io completion port: %s",
558				strbuf);
559
560		/* XXXMLG temporary hack to make failures detected.
561		 * This function should return errors to the caller, not
562		 * exit here.
563		 */
564		FATAL_ERROR(__FILE__, __LINE__,
565				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
566				ISC_MSG_FAILED,
567				"CreateIoCompletionPort() failed "
568				"during initialization: %s"),
569				strbuf);
570		exit(1);
571	}
572
573	InterlockedIncrement(&sock->manager->iocp_total);
574}
575
576/*
577 * Routine to cleanup and then close the socket.
578 * Only close the socket here if it is NOT associated
579 * with an event, otherwise the WSAWaitForMultipleEvents
580 * may fail due to the fact that the Wait should not
581 * be running while closing an event or a socket.
582 * The socket is locked before calling this function
583 */
584void
585socket_close(isc_socket_t *sock) {
586
587	REQUIRE(sock != NULL);
588
589	if (sock->fd != INVALID_SOCKET) {
590		closesocket(sock->fd);
591		sock->fd = INVALID_SOCKET;
592		_set_state(sock, SOCK_CLOSED);
593		InterlockedDecrement(&sock->manager->totalSockets);
594	}
595}
596
597static isc_once_t initialise_once = ISC_ONCE_INIT;
598static isc_boolean_t initialised = ISC_FALSE;
599
600static void
601initialise(void) {
602	WORD wVersionRequested;
603	WSADATA wsaData;
604	int err;
605	SOCKET sock;
606	GUID GUIDConnectEx = WSAID_CONNECTEX;
607	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
608	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
609	DWORD dwBytes;
610
611	/* Need Winsock 2.2 or better */
612	wVersionRequested = MAKEWORD(2, 2);
613
614	err = WSAStartup(wVersionRequested, &wsaData);
615	if (err != 0) {
616		char strbuf[ISC_STRERRORSIZE];
617		isc__strerror(err, strbuf, sizeof(strbuf));
618		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
619			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
620					   ISC_MSG_FAILED, "failed"),
621			    strbuf);
622		exit(1);
623	}
624	/*
625	 * The following APIs do not exist as functions in a library, but we must
626	 * ask winsock for them.  They are "extensions" -- but why they cannot be
627	 * actual functions is beyond me.  So, ask winsock for the pointers to the
628	 * functions we need.
629	 */
630	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
631	INSIST(sock != INVALID_SOCKET);
632	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
633		 &GUIDConnectEx, sizeof(GUIDConnectEx),
634		 &ISCConnectEx, sizeof(ISCConnectEx),
635		 &dwBytes, NULL, NULL);
636	INSIST(err == 0);
637
638	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
639		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
640		 &ISCAcceptEx, sizeof(ISCAcceptEx),
641		 &dwBytes, NULL, NULL);
642	INSIST(err == 0);
643
644	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
645		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
646		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
647		 &dwBytes, NULL, NULL);
648	INSIST(err == 0);
649
650	closesocket(sock);
651
652	initialised = ISC_TRUE;
653}
654
655/*
656 * Initialize socket services
657 */
658void
659InitSockets(void) {
660	RUNTIME_CHECK(isc_once_do(&initialise_once,
661				  initialise) == ISC_R_SUCCESS);
662	if (!initialised)
663		exit(1);
664}
665
666int
667internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
668		 struct msghdr *messagehdr, int flags, int *Error)
669{
670	int Result;
671	DWORD BytesSent;
672	DWORD Flags = flags;
673	int total_sent;
674
675	*Error = 0;
676	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
677			   messagehdr->msg_iovlen, &BytesSent,
678			   Flags, (SOCKADDR *)&messagehdr->to_addr,
679			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
680			   NULL);
681
682	total_sent = (int)BytesSent;
683
684	/* Check for errors.*/
685	if (Result == SOCKET_ERROR) {
686		*Error = WSAGetLastError();
687
688		switch (*Error) {
689		case WSA_IO_INCOMPLETE:
690		case WSA_WAIT_IO_COMPLETION:
691		case WSA_IO_PENDING:
692		case NO_ERROR:		/* Strange, but okay */
693			sock->pending_iocp++;
694			sock->pending_send++;
695			break;
696
697		default:
698			return (-1);
699			break;
700		}
701	} else {
702		sock->pending_iocp++;
703		sock->pending_send++;
704	}
705
706	if (lpo != NULL)
707		return (0);
708	else
709		return (total_sent);
710}
711
/*
 * Post an overlapped WSARecvFrom() into the socket's private receive
 * buffer (sock->recvbuf), unless a receive is already outstanding or no
 * request is waiting on recv_list.
 *
 * The per-request IoCompletionInfo is allocated from the private heap.
 * If the call fails with a reset/unreachable style error on a socket
 * that is not connected, the same structure is re-zeroed and the request
 * is retried; on any other immediate failure the pending receive events
 * are aborted via send_recvdone_abort() and the structure is freed.
 *
 * NOTE(review): presumably the caller holds the socket lock, since the
 * pending counters are modified without locking here -- confirm.
 */
static void
queue_receive_request(isc_socket_t *sock) {
	DWORD Flags = 0;
	DWORD NumBytes = 0;
	int total_bytes = 0;
	int Result;
	int Error;
	int need_retry;
	WSABUF iov[1];
	IoCompletionInfo *lpo = NULL;
	isc_result_t isc_result;

 retry:
	need_retry = ISC_FALSE;

	/*
	 * If we already have a receive pending, do nothing.
	 * (lpo is non-NULL here only when we arrived via a retry.)
	 */
	if (sock->pending_recv > 0) {
		if (lpo != NULL)
			HeapFree(hHeapHandle, 0, lpo);
		return;
	}

	/*
	 * If no one is waiting, do nothing.
	 */
	if (ISC_LIST_EMPTY(sock->recv_list)) {
		if (lpo != NULL)
			HeapFree(hHeapHandle, 0, lpo);
		return;
	}

	/* The previous contents of recvbuf must have been consumed. */
	INSIST(sock->recvbuf.remaining == 0);
	INSIST(sock->fd != INVALID_SOCKET);

	/* Receive into the whole per-socket buffer. */
	iov[0].len = sock->recvbuf.len;
	iov[0].buf = sock->recvbuf.base;

	/* Allocate the request context, or reuse the one from a retry. */
	if (lpo == NULL) {
		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
						    HEAP_ZERO_MEMORY,
						    sizeof(IoCompletionInfo));
		RUNTIME_CHECK(lpo != NULL);
	} else
		ZeroMemory(lpo, sizeof(IoCompletionInfo));
	lpo->request_type = SOCKET_RECV;

	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);

	Error = 0;
	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
			     &NumBytes, &Flags,
			     (SOCKADDR *)&sock->recvbuf.from_addr,
			     &sock->recvbuf.from_addr_len,
			     (LPWSAOVERLAPPED)lpo, NULL);

	/* Check for errors. */
	if (Result == SOCKET_ERROR) {
		Error = WSAGetLastError();

		switch (Error) {
		case WSA_IO_PENDING:
			/* Queued: a completion will arrive later. */
			sock->pending_iocp++;
			sock->pending_recv++;
			break;

		/* direct error: no completion event */
		case ERROR_HOST_UNREACHABLE:
		case WSAENETRESET:
		case WSAECONNRESET:
			if (!sock->connected) {
				/* soft error */
				need_retry = ISC_TRUE;
				break;
			}
			/* FALLTHROUGH */

		default:
			/* Hard failure: abort the waiters and drop lpo. */
			isc_result = isc__errno2result(Error);
			if (isc_result == ISC_R_UNEXPECTED)
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"WSARecvFrom: Windows error code: %d, isc result %d",
					Error, isc_result);
			send_recvdone_abort(sock, isc_result);
			HeapFree(hHeapHandle, 0, lpo);
			lpo = NULL;
			break;
		}
	} else {
		/*
		 * The recv() finished immediately, but we will still get
		 * a completion event.  Rather than duplicate code, let
		 * that thread handle sending the data along its way.
		 */
		sock->pending_iocp++;
		sock->pending_recv++;
	}

	socket_log(__LINE__, sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DOIORECV,
		   "queue_io_request: fd %d result %d error %d",
		   sock->fd, Result, Error);

	CONSISTENT(sock);

	/* A soft error keeps lpo alive for reuse at the top. */
	if (need_retry)
		goto retry;
}
822
823static void
824manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
825	    isc_logmodule_t *module, int level, const char *fmt, ...)
826{
827	char msgbuf[2048];
828	va_list ap;
829
830	if (!isc_log_wouldlog(isc_lctx, level))
831		return;
832
833	va_start(ap, fmt);
834	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
835	va_end(ap);
836
837	isc_log_write(isc_lctx, category, module, level,
838		      "sockmgr %p: %s", sockmgr, msgbuf);
839}
840
841static void
842socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
843	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
844	   isc_msgcat_t *msgcat, int msgset, int message,
845	   const char *fmt, ...)
846{
847	char msgbuf[2048];
848	char peerbuf[256];
849	va_list ap;
850
851
852	if (!isc_log_wouldlog(isc_lctx, level))
853		return;
854
855	va_start(ap, fmt);
856	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
857	va_end(ap);
858
859	if (address == NULL) {
860		isc_log_iwrite(isc_lctx, category, module, level,
861			       msgcat, msgset, message,
862			       "socket %p line %d: %s", sock, lineno, msgbuf);
863	} else {
864		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
865		isc_log_iwrite(isc_lctx, category, module, level,
866			       msgcat, msgset, message,
867				   "socket %p line %d peer %s: %s", sock, lineno,
868				   peerbuf, msgbuf);
869	}
870
871}
872
873/*
874 * Make an fd SOCKET non-blocking.
875 */
876static isc_result_t
877make_nonblock(SOCKET fd) {
878	int ret;
879	unsigned long flags = 1;
880	char strbuf[ISC_STRERRORSIZE];
881
882	/* Set the socket to non-blocking */
883	ret = ioctlsocket(fd, FIONBIO, &flags);
884
885	if (ret == -1) {
886		isc__strerror(errno, strbuf, sizeof(strbuf));
887		UNEXPECTED_ERROR(__FILE__, __LINE__,
888				 "ioctlsocket(%d, FIOBIO, %d): %s",
889				 fd, flags, strbuf);
890
891		return (ISC_R_UNEXPECTED);
892	}
893
894	return (ISC_R_SUCCESS);
895}
896
897/*
898 * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
899 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
900 * fails with an "ICMP port unreachable" response and preventing the
901 * socket from using the WSARecvFrom in subsequent operations.
902 * The function below fixes this, but requires that Windows 2000
903 * Service Pack 2 or later be installed on the system.  NT 4.0
904 * systems are not affected by this and work correctly.
905 * See Microsoft Knowledge Base Article Q263823 for details of this.
906 */
907isc_result_t
908connection_reset_fix(SOCKET fd) {
909	DWORD dwBytesReturned = 0;
910	BOOL  bNewBehavior = FALSE;
911	DWORD status;
912
913	if (isc_win32os_majorversion() < 5)
914		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
915
916	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
917	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
918			  sizeof(bNewBehavior), NULL, 0,
919			  &dwBytesReturned, NULL, NULL);
920	if (status != SOCKET_ERROR)
921		return (ISC_R_SUCCESS);
922	else {
923		UNEXPECTED_ERROR(__FILE__, __LINE__,
924				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
925				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
926						ISC_MSG_FAILED, "failed"));
927		return (ISC_R_UNEXPECTED);
928	}
929}
930
931/*
932 * Construct an iov array and attach it to the msghdr passed in.  This is
933 * the SEND constructor, which will use the used region of the buffer
934 * (if using a buffer list) or will use the internal region (if a single
935 * buffer I/O is requested).
936 *
937 * Nothing can be NULL, and the done event must list at least one buffer
938 * on the buffer linked list for this function to be meaningful.
939 */
940static void
941build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
942		  struct msghdr *msg, char *cmsg, WSABUF *iov,
943		  IoCompletionInfo  *lpo)
944{
945	unsigned int iovcount;
946	isc_buffer_t *buffer;
947	buflist_t  *cpbuffer;
948	isc_region_t used;
949	size_t write_count;
950	size_t skip_count;
951
952	memset(msg, 0, sizeof(*msg));
953
954	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
955	msg->to_addr_len = dev->address.length;
956
957	buffer = ISC_LIST_HEAD(dev->bufferlist);
958	write_count = 0;
959	iovcount = 0;
960
961	/*
962	 * Single buffer I/O?  Skip what we've done so far in this region.
963	 */
964	if (buffer == NULL) {
965		write_count = dev->region.length - dev->n;
966		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
967		RUNTIME_CHECK(cpbuffer != NULL);
968		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
969		RUNTIME_CHECK(cpbuffer->buf != NULL);
970
971		socket_log(__LINE__, sock, NULL, TRACE,
972		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
973		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
974		   cpbuffer->buf, write_count);
975
976		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
977		cpbuffer->buflen = write_count;
978		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
979		iov[0].buf = cpbuffer->buf;
980		iov[0].len = write_count;
981		iovcount = 1;
982
983		goto config;
984	}
985
986	/*
987	 * Multibuffer I/O.
988	 * Skip the data in the buffer list that we have already written.
989	 */
990	skip_count = dev->n;
991	while (buffer != NULL) {
992		REQUIRE(ISC_BUFFER_VALID(buffer));
993		if (skip_count < isc_buffer_usedlength(buffer))
994			break;
995		skip_count -= isc_buffer_usedlength(buffer);
996		buffer = ISC_LIST_NEXT(buffer, link);
997	}
998
999	while (buffer != NULL) {
1000		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1001
1002		isc_buffer_usedregion(buffer, &used);
1003
1004		if (used.length > 0) {
1005			int uselen = used.length - skip_count;
1006			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1007			RUNTIME_CHECK(cpbuffer != NULL);
1008			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1009			RUNTIME_CHECK(cpbuffer->buf != NULL);
1010
1011			socket_log(__LINE__, sock, NULL, TRACE,
1012			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1013			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1014			   cpbuffer->buf, write_count);
1015
1016			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
1017			cpbuffer->buflen = uselen;
1018			iov[iovcount].buf = cpbuffer->buf;
1019			iov[iovcount].len = used.length - skip_count;
1020			write_count += uselen;
1021			skip_count = 0;
1022			iovcount++;
1023		}
1024		buffer = ISC_LIST_NEXT(buffer, link);
1025	}
1026
1027	INSIST(skip_count == 0);
1028
1029 config:
1030	msg->msg_iov = iov;
1031	msg->msg_iovlen = iovcount;
1032	msg->msg_totallen = write_count;
1033}
1034
1035static void
1036set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1037		isc_socketevent_t *dev)
1038{
1039	if (sock->type == isc_sockettype_udp) {
1040		if (address != NULL)
1041			dev->address = *address;
1042		else
1043			dev->address = sock->address;
1044	} else if (sock->type == isc_sockettype_tcp) {
1045		INSIST(address == NULL);
1046		dev->address = sock->address;
1047	}
1048}
1049
1050static void
1051destroy_socketevent(isc_event_t *event) {
1052	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1053
1054	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1055
1056	(ev->destroy)(event);
1057}
1058
1059static isc_socketevent_t *
1060allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1061		     isc_taskaction_t action, const void *arg)
1062{
1063	isc_socketevent_t *ev;
1064
1065	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1066						     sock, eventtype,
1067						     action, arg,
1068						     sizeof(*ev));
1069	if (ev == NULL)
1070		return (NULL);
1071
1072	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1073	ISC_LINK_INIT(ev, ev_link);
1074	ISC_LIST_INIT(ev->bufferlist);
1075	ev->region.base = NULL;
1076	ev->n = 0;
1077	ev->offset = 0;
1078	ev->attributes = 0;
1079	ev->destroy = ev->ev_destroy;
1080	ev->ev_destroy = destroy_socketevent;
1081
1082	return (ev);
1083}
1084
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print a message header and each of its scatter/gather
 * entries to stdout.  Only compiled when ISC_SOCKET_DEBUG is defined.
 *
 * Every argument is cast to the exact type its conversion specifier
 * expects, since a printf() specifier/argument mismatch is undefined
 * behavior.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int i;

	printf("MSGHDR %p, Socket #: %u\n", (void *)msg,
	       (unsigned int)sock->fd);
	printf("\tname %p, namelen %d\n", (void *)msg->msg_name,
	       (int)msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", (void *)msg->msg_iov,
	       (int)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].buf,
		       (unsigned long)msg->msg_iov[i].len);
}
#endif
1099
1100/*
1101 * map the error code
1102 */
1103int
1104map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1105		 char *errorstring, size_t bufsize) {
1106
1107	int doreturn;
1108	switch (windows_errno) {
1109	case WSAECONNREFUSED:
1110		*isc_errno = ISC_R_CONNREFUSED;
1111		if (sock->connected)
1112			doreturn = DOIO_HARD;
1113		else
1114			doreturn = DOIO_SOFT;
1115		break;
1116	case WSAENETUNREACH:
1117	case ERROR_NETWORK_UNREACHABLE:
1118		*isc_errno = ISC_R_NETUNREACH;
1119		if (sock->connected)
1120			doreturn = DOIO_HARD;
1121		else
1122			doreturn = DOIO_SOFT;
1123		break;
1124	case ERROR_PORT_UNREACHABLE:
1125	case ERROR_HOST_UNREACHABLE:
1126	case WSAEHOSTUNREACH:
1127		*isc_errno = ISC_R_HOSTUNREACH;
1128		if (sock->connected)
1129			doreturn = DOIO_HARD;
1130		else
1131			doreturn = DOIO_SOFT;
1132		break;
1133	case WSAENETDOWN:
1134		*isc_errno = ISC_R_NETDOWN;
1135		if (sock->connected)
1136			doreturn = DOIO_HARD;
1137		else
1138			doreturn = DOIO_SOFT;
1139		break;
1140	case WSAEHOSTDOWN:
1141		*isc_errno = ISC_R_HOSTDOWN;
1142		if (sock->connected)
1143			doreturn = DOIO_HARD;
1144		else
1145			doreturn = DOIO_SOFT;
1146		break;
1147	case WSAEACCES:
1148		*isc_errno = ISC_R_NOPERM;
1149		if (sock->connected)
1150			doreturn = DOIO_HARD;
1151		else
1152			doreturn = DOIO_SOFT;
1153		break;
1154	case WSAECONNRESET:
1155	case WSAENETRESET:
1156	case WSAECONNABORTED:
1157	case WSAEDISCON:
1158		*isc_errno = ISC_R_CONNECTIONRESET;
1159		if (sock->connected)
1160			doreturn = DOIO_HARD;
1161		else
1162			doreturn = DOIO_SOFT;
1163		break;
1164	case WSAENOTCONN:
1165		*isc_errno = ISC_R_NOTCONNECTED;
1166		if (sock->connected)
1167			doreturn = DOIO_HARD;
1168		else
1169			doreturn = DOIO_SOFT;
1170		break;
1171	case ERROR_OPERATION_ABORTED:
1172	case ERROR_CONNECTION_ABORTED:
1173	case ERROR_REQUEST_ABORTED:
1174		*isc_errno = ISC_R_CONNECTIONRESET;
1175		doreturn = DOIO_HARD;
1176		break;
1177	case WSAENOBUFS:
1178		*isc_errno = ISC_R_NORESOURCES;
1179		doreturn = DOIO_HARD;
1180		break;
1181	case WSAEAFNOSUPPORT:
1182		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1183		doreturn = DOIO_HARD;
1184		break;
1185	case WSAEADDRNOTAVAIL:
1186		*isc_errno = ISC_R_ADDRNOTAVAIL;
1187		doreturn = DOIO_HARD;
1188		break;
1189	case WSAEDESTADDRREQ:
1190		*isc_errno = ISC_R_BADADDRESSFORM;
1191		doreturn = DOIO_HARD;
1192		break;
1193	case ERROR_NETNAME_DELETED:
1194		*isc_errno = ISC_R_NETDOWN;
1195		doreturn = DOIO_HARD;
1196		break;
1197	default:
1198		*isc_errno = ISC_R_IOERROR;
1199		doreturn = DOIO_HARD;
1200		break;
1201	}
1202	if (doreturn == DOIO_HARD) {
1203		isc__strerror(windows_errno, errorstring, bufsize);
1204	}
1205	return (doreturn);
1206}
1207
1208static void
1209fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1210	isc_region_t r;
1211	int copylen;
1212	isc_buffer_t *buffer;
1213
1214	INSIST(dev->n < dev->minimum);
1215	INSIST(sock->recvbuf.remaining > 0);
1216	INSIST(sock->pending_recv == 0);
1217
1218	if (sock->type == isc_sockettype_udp) {
1219		dev->address.length = sock->recvbuf.from_addr_len;
1220		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1221		    sock->recvbuf.from_addr_len);
1222		if (isc_sockaddr_getport(&dev->address) == 0) {
1223			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1224				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1225					   isc_msgcat, ISC_MSGSET_SOCKET,
1226					   ISC_MSG_ZEROPORT,
1227					   "dropping source port zero packet");
1228			}
1229			sock->recvbuf.remaining = 0;
1230			return;
1231		}
1232	} else if (sock->type == isc_sockettype_tcp) {
1233		dev->address = sock->address;
1234	}
1235
1236	/*
1237	 * Run through the list of buffers we were given, and find the
1238	 * first one with space.  Once it is found, loop through, filling
1239	 * the buffers as much as possible.
1240	 */
1241	buffer = ISC_LIST_HEAD(dev->bufferlist);
1242	if (buffer != NULL) { // Multi-buffer receive
1243		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1244			REQUIRE(ISC_BUFFER_VALID(buffer));
1245			if (isc_buffer_availablelength(buffer) > 0) {
1246				isc_buffer_availableregion(buffer, &r);
1247				copylen = min(r.length, sock->recvbuf.remaining);
1248				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1249				sock->recvbuf.consume_position += copylen;
1250				sock->recvbuf.remaining -= copylen;
1251				isc_buffer_add(buffer, copylen);
1252				dev->n += copylen;
1253			}
1254			buffer = ISC_LIST_NEXT(buffer, link);
1255		}
1256	} else { // Single-buffer receive
1257		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1258		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1259		sock->recvbuf.consume_position += copylen;
1260		sock->recvbuf.remaining -= copylen;
1261		dev->n += copylen;
1262	}
1263
1264	/*
1265	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1266	 * data in our receive buffer, and the caller only gave us
1267	 * 1k of space, we will toss the remaining 3k of data.  TCP
1268	 * will keep the extra data around and use it for later requests.
1269	 */
1270	if (sock->type == isc_sockettype_udp)
1271		sock->recvbuf.remaining = 0;
1272}
1273
1274/*
1275 * Copy out as much data from the internal buffer to done events.
1276 * As each done event is filled, send it along its way.
1277 */
1278static void
1279completeio_recv(isc_socket_t *sock)
1280{
1281	isc_socketevent_t *dev;
1282
1283	/*
1284	 * If we are in the process of filling our buffer, we cannot
1285	 * touch it yet, so don't.
1286	 */
1287	if (sock->pending_recv > 0)
1288		return;
1289
1290	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1291		dev = ISC_LIST_HEAD(sock->recv_list);
1292
1293		/*
1294		 * See if we have sufficient data in our receive buffer
1295		 * to handle this.  If we do, copy out the data.
1296		 */
1297		fill_recv(sock, dev);
1298
1299		/*
1300		 * Did we satisfy it?
1301		 */
1302		if (dev->n >= dev->minimum) {
1303			dev->result = ISC_R_SUCCESS;
1304			send_recvdone_event(sock, &dev);
1305		}
1306	}
1307}
1308
1309/*
1310 * Returns:
1311 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1312 *			ISC_R_SUCCESS.
1313 *
1314 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1315 *			dev->result contains the appropriate error.
1316 *
1317 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1318 *			event was sent.  The operation should be retried.
1319 *
1320 *	No other return values are possible.
1321 */
1322static int
1323completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1324		struct msghdr *messagehdr, int cc, int send_errno)
1325{
1326	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1327	char strbuf[ISC_STRERRORSIZE];
1328
1329	if (send_errno != 0) {
1330		if (SOFT_ERROR(send_errno))
1331			return (DOIO_SOFT);
1332
1333		return (map_socket_error(sock, send_errno, &dev->result,
1334			strbuf, sizeof(strbuf)));
1335
1336		/*
1337		 * The other error types depend on whether or not the
1338		 * socket is UDP or TCP.  If it is UDP, some errors
1339		 * that we expect to be fatal under TCP are merely
1340		 * annoying, and are really soft errors.
1341		 *
1342		 * However, these soft errors are still returned as
1343		 * a status.
1344		 */
1345		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1346		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1347		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1348				 addrbuf, strbuf);
1349		dev->result = isc__errno2result(send_errno);
1350		return (DOIO_HARD);
1351	}
1352
1353	/*
1354	 * If we write less than we expected, update counters, poke.
1355	 */
1356	dev->n += cc;
1357	if (cc != messagehdr->msg_totallen)
1358		return (DOIO_SOFT);
1359
1360	/*
1361	 * Exactly what we wanted to write.  We're done with this
1362	 * entry.  Post its completion event.
1363	 */
1364	dev->result = ISC_R_SUCCESS;
1365	return (DOIO_SUCCESS);
1366}
1367
1368static int
1369startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1370	     int *send_errno)
1371{
1372	char *cmsg = NULL;
1373	char strbuf[ISC_STRERRORSIZE];
1374	IoCompletionInfo *lpo;
1375	int status;
1376	struct msghdr *msghdr;
1377
1378	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1379					    HEAP_ZERO_MEMORY,
1380					    sizeof(IoCompletionInfo));
1381	RUNTIME_CHECK(lpo != NULL);
1382	lpo->request_type = SOCKET_SEND;
1383	lpo->dev = dev;
1384	msghdr = &lpo->messagehdr;
1385	memset(msghdr, 0, sizeof(struct msghdr));
1386	ISC_LIST_INIT(lpo->bufferlist);
1387
1388	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1389
1390	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1391
1392	if (*nbytes < 0) {
1393		/*
1394		 * I/O has been initiated
1395		 * completion will be through the completion port
1396		 */
1397		if (PENDING_ERROR(*send_errno)) {
1398			status = DOIO_PENDING;
1399			goto done;
1400		}
1401
1402		if (SOFT_ERROR(*send_errno)) {
1403			status = DOIO_SOFT;
1404			goto done;
1405		}
1406
1407		/*
1408		 * If we got this far then something is wrong
1409		 */
1410		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1411			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1412			socket_log(__LINE__, sock, NULL, IOEVENT,
1413				   isc_msgcat, ISC_MSGSET_SOCKET,
1414				   ISC_MSG_INTERNALSEND,
1415				   "startio_send: internal_sendmsg(%d) %d "
1416				   "bytes, err %d/%s",
1417				   sock->fd, *nbytes, *send_errno, strbuf);
1418		}
1419		status = DOIO_HARD;
1420		goto done;
1421	}
1422	dev->result = ISC_R_SUCCESS;
1423	status = DOIO_SOFT;
1424 done:
1425	_set_state(sock, SOCK_DATA);
1426	return (status);
1427}
1428
1429static isc_result_t
1430allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1431		isc_socket_t **socketp) {
1432	isc_socket_t *sock;
1433	isc_result_t result;
1434
1435	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1436
1437	if (sock == NULL)
1438		return (ISC_R_NOMEMORY);
1439
1440	sock->magic = 0;
1441	sock->references = 0;
1442
1443	sock->manager = manager;
1444	sock->type = type;
1445	sock->fd = INVALID_SOCKET;
1446
1447	ISC_LINK_INIT(sock, link);
1448
1449	/*
1450	 * set up list of readers and writers to be initially empty
1451	 */
1452	ISC_LIST_INIT(sock->recv_list);
1453	ISC_LIST_INIT(sock->send_list);
1454	ISC_LIST_INIT(sock->accept_list);
1455	sock->connect_ev = NULL;
1456	sock->pending_accept = 0;
1457	sock->pending_recv = 0;
1458	sock->pending_send = 0;
1459	sock->pending_iocp = 0;
1460	sock->listener = 0;
1461	sock->connected = 0;
1462	sock->pending_connect = 0;
1463	sock->bound = 0;
1464	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1465	_set_state(sock, SOCK_INITIALIZED);
1466
1467	sock->recvbuf.len = 65536;
1468	sock->recvbuf.consume_position = sock->recvbuf.base;
1469	sock->recvbuf.remaining = 0;
1470	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1471	if (sock->recvbuf.base == NULL) {
1472		sock->magic = 0;
1473		goto error;
1474	}
1475
1476	/*
1477	 * initialize the lock
1478	 */
1479	result = isc_mutex_init(&sock->lock);
1480	if (result != ISC_R_SUCCESS) {
1481		sock->magic = 0;
1482		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1483		sock->recvbuf.base = NULL;
1484		goto error;
1485	}
1486
1487	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1488		   "allocated");
1489
1490	sock->magic = SOCKET_MAGIC;
1491	*socketp = sock;
1492
1493	return (ISC_R_SUCCESS);
1494
1495 error:
1496	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1497
1498	return (result);
1499}
1500
1501/*
1502 * Verify that the socket state is consistent.
1503 */
1504static void
1505consistent(isc_socket_t *sock) {
1506
1507	isc_socketevent_t *dev;
1508	isc_socket_newconnev_t *nev;
1509	unsigned int count;
1510	char *crash_reason;
1511	isc_boolean_t crash = ISC_FALSE;
1512
1513	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1514		+ sock->pending_accept + sock->pending_connect);
1515
1516	dev = ISC_LIST_HEAD(sock->send_list);
1517	count = 0;
1518	while (dev != NULL) {
1519		count++;
1520		dev = ISC_LIST_NEXT(dev, ev_link);
1521	}
1522	if (count > sock->pending_send) {
1523		crash = ISC_TRUE;
1524		crash_reason = "send_list > sock->pending_send";
1525	}
1526
1527	nev = ISC_LIST_HEAD(sock->accept_list);
1528	count = 0;
1529	while (nev != NULL) {
1530		count++;
1531		nev = ISC_LIST_NEXT(nev, ev_link);
1532	}
1533	if (count > sock->pending_accept) {
1534		crash = ISC_TRUE;
1535		crash_reason = "send_list > sock->pending_send";
1536	}
1537
1538	if (crash) {
1539		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1540			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1541			   crash_reason);
1542		sock_dump(sock);
1543		INSIST(crash == ISC_FALSE);
1544	}
1545}
1546
1547/*
1548 * Maybe free the socket.
1549 *
1550 * This function will verify tht the socket is no longer in use in any way,
1551 * either internally or externally.  This is the only place where this
1552 * check is to be made; if some bit of code believes that IT is done with
1553 * the socket (e.g., some reference counter reaches zero), it should call
1554 * this function.
1555 *
1556 * When calling this function, the socket must be locked, and the manager
1557 * must be unlocked.
1558 *
1559 * When this function returns, *socketp will be NULL.  No tricks to try
1560 * to hold on to this pointer are allowed.
1561 */
1562static void
1563maybe_free_socket(isc_socket_t **socketp, int lineno) {
1564	isc_socket_t *sock = *socketp;
1565	*socketp = NULL;
1566
1567	INSIST(VALID_SOCKET(sock));
1568	CONSISTENT(sock);
1569
1570	if (sock->pending_iocp > 0
1571	    || sock->pending_recv > 0
1572	    || sock->pending_send > 0
1573	    || sock->pending_accept > 0
1574	    || sock->references > 0
1575	    || sock->pending_connect == 1
1576	    || !ISC_LIST_EMPTY(sock->recv_list)
1577	    || !ISC_LIST_EMPTY(sock->send_list)
1578	    || !ISC_LIST_EMPTY(sock->accept_list)
1579	    || sock->fd != INVALID_SOCKET) {
1580		UNLOCK(&sock->lock);
1581		return;
1582	}
1583	UNLOCK(&sock->lock);
1584
1585	free_socket(&sock, lineno);
1586}
1587
1588void
1589free_socket(isc_socket_t **sockp, int lineno) {
1590	isc_socketmgr_t *manager;
1591	isc_socket_t *sock = *sockp;
1592	*sockp = NULL;
1593
1594	manager = sock->manager;
1595
1596	/*
1597	 * Seems we can free the socket after all.
1598	 */
1599	manager = sock->manager;
1600	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1601		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1602		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1603
1604	sock->magic = 0;
1605	DESTROYLOCK(&sock->lock);
1606
1607	if (sock->recvbuf.base != NULL)
1608		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1609
1610	LOCK(&manager->lock);
1611	if (ISC_LINK_LINKED(sock, link))
1612		ISC_LIST_UNLINK(manager->socklist, sock, link);
1613	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1614
1615	if (ISC_LIST_EMPTY(manager->socklist))
1616		SIGNAL(&manager->shutdown_ok);
1617	UNLOCK(&manager->lock);
1618}
1619
1620/*
1621 * Create a new 'type' socket managed by 'manager'.  Events
1622 * will be posted to 'task' and when dispatched 'action' will be
1623 * called with 'arg' as the arg value.  The new socket is returned
1624 * in 'socketp'.
1625 */
1626isc_result_t
1627isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1628		  isc_socket_t **socketp) {
1629	isc_socket_t *sock = NULL;
1630	isc_result_t result;
1631#if defined(USE_CMSG)
1632	int on = 1;
1633#endif
1634#if defined(SO_RCVBUF)
1635	ISC_SOCKADDR_LEN_T optlen;
1636	int size;
1637#endif
1638	int socket_errno;
1639	char strbuf[ISC_STRERRORSIZE];
1640
1641	REQUIRE(VALID_MANAGER(manager));
1642	REQUIRE(socketp != NULL && *socketp == NULL);
1643	REQUIRE(type != isc_sockettype_fdwatch);
1644
1645	result = allocate_socket(manager, type, &sock);
1646	if (result != ISC_R_SUCCESS)
1647		return (result);
1648
1649	sock->pf = pf;
1650	switch (type) {
1651	case isc_sockettype_udp:
1652		sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1653		if (sock->fd != INVALID_SOCKET) {
1654			result = connection_reset_fix(sock->fd);
1655			if (result != ISC_R_SUCCESS) {
1656				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1657					"closed %d %d %d con_reset_fix_failed",
1658					sock->pending_recv, sock->pending_send,
1659					sock->references);
1660				closesocket(sock->fd);
1661				_set_state(sock, SOCK_CLOSED);
1662				sock->fd = INVALID_SOCKET;
1663				free_socket(&sock, __LINE__);
1664				return (result);
1665			}
1666		}
1667		break;
1668	case isc_sockettype_tcp:
1669		sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1670		break;
1671	}
1672
1673	if (sock->fd == INVALID_SOCKET) {
1674		socket_errno = WSAGetLastError();
1675		free_socket(&sock, __LINE__);
1676
1677		switch (socket_errno) {
1678		case WSAEMFILE:
1679		case WSAENOBUFS:
1680			return (ISC_R_NORESOURCES);
1681
1682		case WSAEPROTONOSUPPORT:
1683		case WSAEPFNOSUPPORT:
1684		case WSAEAFNOSUPPORT:
1685			return (ISC_R_FAMILYNOSUPPORT);
1686
1687		default:
1688			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
1689			UNEXPECTED_ERROR(__FILE__, __LINE__,
1690					 "socket() %s: %s",
1691					 isc_msgcat_get(isc_msgcat,
1692							ISC_MSGSET_GENERAL,
1693							ISC_MSG_FAILED,
1694							"failed"),
1695					 strbuf);
1696			return (ISC_R_UNEXPECTED);
1697		}
1698	}
1699
1700	result = make_nonblock(sock->fd);
1701	if (result != ISC_R_SUCCESS) {
1702		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1703			"closed %d %d %d make_nonblock_failed",
1704			sock->pending_recv, sock->pending_send,
1705			sock->references);
1706		closesocket(sock->fd);
1707		sock->fd = INVALID_SOCKET;
1708		free_socket(&sock, __LINE__);
1709		return (result);
1710	}
1711
1712
1713#if defined(USE_CMSG) || defined(SO_RCVBUF)
1714	if (type == isc_sockettype_udp) {
1715
1716#if defined(USE_CMSG)
1717#if defined(ISC_PLATFORM_HAVEIPV6)
1718#ifdef IPV6_RECVPKTINFO
1719		/* 2292bis */
1720		if ((pf == AF_INET6)
1721		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1722				   (char *)&on, sizeof(on)) < 0)) {
1723			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1724			UNEXPECTED_ERROR(__FILE__, __LINE__,
1725					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1726					 "%s: %s", sock->fd,
1727					 isc_msgcat_get(isc_msgcat,
1728							ISC_MSGSET_GENERAL,
1729							ISC_MSG_FAILED,
1730							"failed"),
1731					 strbuf);
1732		}
1733#else
1734		/* 2292 */
1735		if ((pf == AF_INET6)
1736		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1737				   (char *)&on, sizeof(on)) < 0)) {
1738			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1739			UNEXPECTED_ERROR(__FILE__, __LINE__,
1740					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1741					 sock->fd,
1742					 isc_msgcat_get(isc_msgcat,
1743							ISC_MSGSET_GENERAL,
1744							ISC_MSG_FAILED,
1745							"failed"),
1746					 strbuf);
1747		}
1748#endif /* IPV6_RECVPKTINFO */
1749#ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
1750		/* use minimum MTU */
1751		if (pf == AF_INET6) {
1752			(void)setsockopt(sock->fd, IPPROTO_IPV6,
1753					 IPV6_USE_MIN_MTU,
1754					 (char *)&on, sizeof(on));
1755		}
1756#endif
1757#endif /* ISC_PLATFORM_HAVEIPV6 */
1758#endif /* defined(USE_CMSG) */
1759
1760#if defined(SO_RCVBUF)
1761	       optlen = sizeof(size);
1762	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1763			      (char *)&size, &optlen) >= 0 &&
1764		    size < RCVBUFSIZE) {
1765		       size = RCVBUFSIZE;
1766		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1767					(char *)&size, sizeof(size));
1768	       }
1769#endif
1770
1771	}
1772#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1773
1774	_set_state(sock, SOCK_OPEN);
1775	sock->references = 1;
1776	*socketp = sock;
1777
1778	iocompletionport_update(sock);
1779
1780	/*
1781	 * Note we don't have to lock the socket like we normally would because
1782	 * there are no external references to it yet.
1783	 */
1784	LOCK(&manager->lock);
1785	ISC_LIST_APPEND(manager->socklist, sock, link);
1786	InterlockedIncrement(&manager->totalSockets);
1787	UNLOCK(&manager->lock);
1788
1789	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1790		   ISC_MSG_CREATED, "created %u type %u", sock->fd, type);
1791
1792	return (ISC_R_SUCCESS);
1793}
1794
1795isc_result_t
1796isc_socket_open(isc_socket_t *sock) {
1797	REQUIRE(VALID_SOCKET(sock));
1798	REQUIRE(sock->type != isc_sockettype_fdwatch);
1799
1800	return (ISC_R_NOTIMPLEMENTED);
1801}
1802
1803/*
1804 * Attach to a socket.  Caller must explicitly detach when it is done.
1805 */
1806void
1807isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1808	REQUIRE(VALID_SOCKET(sock));
1809	REQUIRE(socketp != NULL && *socketp == NULL);
1810
1811	LOCK(&sock->lock);
1812	CONSISTENT(sock);
1813	sock->references++;
1814	UNLOCK(&sock->lock);
1815
1816	*socketp = sock;
1817}
1818
1819/*
1820 * Dereference a socket.  If this is the last reference to it, clean things
1821 * up by destroying the socket.
1822 */
1823void
1824isc__socket_detach(isc_socket_t **socketp) {
1825	isc_socket_t *sock;
1826	isc_boolean_t kill_socket = ISC_FALSE;
1827
1828	REQUIRE(socketp != NULL);
1829	sock = *socketp;
1830	REQUIRE(VALID_SOCKET(sock));
1831	REQUIRE(sock->type != isc_sockettype_fdwatch);
1832
1833	LOCK(&sock->lock);
1834	CONSISTENT(sock);
1835	REQUIRE(sock->references > 0);
1836	sock->references--;
1837
1838	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1839		"detach_socket %d %d %d",
1840		sock->pending_recv, sock->pending_send,
1841		sock->references);
1842
1843	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1844		closesocket(sock->fd);
1845		sock->fd = INVALID_SOCKET;
1846		_set_state(sock, SOCK_CLOSED);
1847	}
1848
1849	maybe_free_socket(&sock, __LINE__);
1850
1851	*socketp = NULL;
1852}
1853
1854isc_result_t
1855isc_socket_close(isc_socket_t *sock) {
1856	REQUIRE(VALID_SOCKET(sock));
1857	REQUIRE(sock->type != isc_sockettype_fdwatch);
1858
1859	return (ISC_R_NOTIMPLEMENTED);
1860}
1861
1862/*
1863 * Dequeue an item off the given socket's read queue, set the result code
1864 * in the done event to the one provided, and send it to the task it was
1865 * destined for.
1866 *
1867 * If the event to be sent is on a list, remove it before sending.  If
1868 * asked to, send and detach from the task as well.
1869 *
1870 * Caller must have the socket locked if the event is attached to the socket.
1871 */
1872static void
1873send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1874	isc_task_t *task;
1875
1876	task = (*dev)->ev_sender;
1877	(*dev)->ev_sender = sock;
1878
1879	if (ISC_LINK_LINKED(*dev, ev_link))
1880		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1881
1882	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1883	    == ISC_SOCKEVENTATTR_ATTACHED)
1884		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1885	else
1886		isc_task_send(task, (isc_event_t **)dev);
1887
1888	CONSISTENT(sock);
1889}
1890
1891/*
1892 * See comments for send_recvdone_event() above.
1893 */
1894static void
1895send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1896	isc_task_t *task;
1897
1898	INSIST(dev != NULL && *dev != NULL);
1899
1900	task = (*dev)->ev_sender;
1901	(*dev)->ev_sender = sock;
1902
1903	if (ISC_LINK_LINKED(*dev, ev_link))
1904		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1905
1906	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1907	    == ISC_SOCKEVENTATTR_ATTACHED)
1908		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1909	else
1910		isc_task_send(task, (isc_event_t **)dev);
1911
1912	CONSISTENT(sock);
1913}
1914
1915/*
1916 * See comments for send_recvdone_event() above.
1917 */
1918static void
1919send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1920	isc_task_t *task;
1921
1922	INSIST(adev != NULL && *adev != NULL);
1923
1924	task = (*adev)->ev_sender;
1925	(*adev)->ev_sender = sock;
1926
1927	if (ISC_LINK_LINKED(*adev, ev_link))
1928		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1929
1930	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1931
1932	CONSISTENT(sock);
1933}
1934
1935/*
1936 * See comments for send_recvdone_event() above.
1937 */
1938static void
1939send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1940	isc_task_t *task;
1941
1942	INSIST(cdev != NULL && *cdev != NULL);
1943
1944	task = (*cdev)->ev_sender;
1945	(*cdev)->ev_sender = sock;
1946
1947	sock->connect_ev = NULL;
1948
1949	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1950
1951	CONSISTENT(sock);
1952}
1953
1954/*
1955 * On entry to this function, the event delivered is the internal
1956 * readable event, and the first item on the accept_list should be
1957 * the done event we want to send.  If the list is empty, this is a no-op,
1958 * so just close the new connection, unlock, and return.
1959 *
1960 * Note the socket is locked before entering here
1961 */
1962static void
1963internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
1964	isc_socket_newconnev_t *adev;
1965	isc_result_t result = ISC_R_SUCCESS;
1966	isc_socket_t *nsock;
1967	struct sockaddr *localaddr;
1968	int localaddr_len = sizeof(*localaddr);
1969	struct sockaddr *remoteaddr;
1970	int remoteaddr_len = sizeof(*remoteaddr);
1971
1972	INSIST(VALID_SOCKET(sock));
1973	LOCK(&sock->lock);
1974	CONSISTENT(sock);
1975
1976	socket_log(__LINE__, sock, NULL, TRACE,
1977		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1978		   "internal_accept called");
1979
1980	INSIST(sock->listener);
1981
1982	INSIST(sock->pending_iocp > 0);
1983	sock->pending_iocp--;
1984	INSIST(sock->pending_accept > 0);
1985	sock->pending_accept--;
1986
1987	adev = lpo->adev;
1988
1989	/*
1990	 * If the event is no longer in the list we can just return.
1991	 */
1992	if (!acceptdone_is_active(sock, adev))
1993		goto done;
1994
1995	nsock = adev->newsocket;
1996
1997	/*
1998	 * Pull off the done event.
1999	 */
2000	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2001
2002	/*
2003	 * Extract the addresses from the socket, copy them into the structure,
2004	 * and return the new socket.
2005	 */
2006	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2007		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2008		(LPSOCKADDR *)&localaddr, &localaddr_len,
2009		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2010	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
2011	adev->address.length = remoteaddr_len;
2012	nsock->address = adev->address;
2013	nsock->pf = adev->address.type.sa.sa_family;
2014
2015	socket_log(__LINE__, nsock, &nsock->address, TRACE,
2016		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2017		   "internal_accept parent %p", sock);
2018
2019	result = make_nonblock(adev->newsocket->fd);
2020	INSIST(result == ISC_R_SUCCESS);
2021
2022	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2023			  (char *)&sock->fd, sizeof(sock->fd)) == 0);
2024
2025	/*
2026	 * Hook it up into the manager.
2027	 */
2028	nsock->bound = 1;
2029	nsock->connected = 1;
2030	_set_state(nsock, SOCK_OPEN);
2031
2032	LOCK(&nsock->manager->lock);
2033	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2034	InterlockedIncrement(&nsock->manager->totalSockets);
2035	UNLOCK(&nsock->manager->lock);
2036
2037	socket_log(__LINE__, sock, &nsock->address, CREATION,
2038		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2039		   "accepted_connection new_socket %p fd %d",
2040		   nsock, nsock->fd);
2041
2042	adev->result = result;
2043	send_acceptdone_event(sock, &adev);
2044
2045done:
2046	CONSISTENT(sock);
2047	UNLOCK(&sock->lock);
2048
2049	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2050	lpo->acceptbuffer = NULL;
2051}
2052
2053/*
2054 * Called when a socket with a pending connect() finishes.
2055 * Note that the socket is locked before entering.
2056 */
2057static void
2058internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2059	isc_socket_connev_t *cdev;
2060	char strbuf[ISC_STRERRORSIZE];
2061
2062	INSIST(VALID_SOCKET(sock));
2063
2064	LOCK(&sock->lock);
2065
2066	INSIST(sock->pending_iocp > 0);
2067	sock->pending_iocp--;
2068	INSIST(sock->pending_connect == 1);
2069	sock->pending_connect = 0;
2070
2071	/*
2072	 * Has this event been canceled?
2073	 */
2074	cdev = lpo->cdev;
2075	if (!connectdone_is_active(sock, cdev)) {
2076		sock->pending_connect = 0;
2077		if (sock->fd != INVALID_SOCKET) {
2078			closesocket(sock->fd);
2079			sock->fd = INVALID_SOCKET;
2080			_set_state(sock, SOCK_CLOSED);
2081		}
2082		CONSISTENT(sock);
2083		UNLOCK(&sock->lock);
2084		return;
2085	}
2086
2087	/*
2088	 * Check possible Windows network event error status here.
2089	 */
2090	if (connect_errno != 0) {
2091		/*
2092		 * If the error is SOFT, just try again on this
2093		 * fd and pretend nothing strange happened.
2094		 */
2095		if (SOFT_ERROR(connect_errno) ||
2096		    connect_errno == WSAEINPROGRESS) {
2097			sock->pending_connect = 1;
2098			CONSISTENT(sock);
2099			UNLOCK(&sock->lock);
2100			return;
2101		}
2102
2103		/*
2104		 * Translate other errors into ISC_R_* flavors.
2105		 */
2106		switch (connect_errno) {
2107#define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2108			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2109			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2110			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2111			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2112			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2113			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2114			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2115			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2116			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2117			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2118			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2119			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2120#undef ERROR_MATCH
2121		default:
2122			cdev->result = ISC_R_UNEXPECTED;
2123			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2124			UNEXPECTED_ERROR(__FILE__, __LINE__,
2125					 "internal_connect: connect() %s",
2126					 strbuf);
2127		}
2128	} else {
2129		INSIST(setsockopt(sock->fd, SOL_SOCKET,
2130				  SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2131		cdev->result = ISC_R_SUCCESS;
2132		sock->connected = 1;
2133		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2134			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2135			   "internal_connect: success");
2136	}
2137
2138	send_connectdone_event(sock, &cdev);
2139
2140	UNLOCK(&sock->lock);
2141}
2142
2143/*
2144 * Loop through the socket, returning ISC_R_EOF for each done event pending.
2145 */
2146static void
2147send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2148	isc_socketevent_t *dev;
2149
2150	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2151		dev = ISC_LIST_HEAD(sock->recv_list);
2152		dev->result = result;
2153		send_recvdone_event(sock, &dev);
2154	}
2155}
2156
/*
 * Completion handler for a successful overlapped receive of 'nbytes' bytes.
 *
 * Take the data we received in our private buffer, and if any recv() calls
 * on our list are satisfied, send the corresponding done event.
 *
 * If we need more data (there are still items on the recv_list after we
 * consume all our data) then arrange for another system recv() call to fill
 * our buffers.
 *
 * The socket lock is taken on entry; every exit path ends with a call to
 * maybe_free_socket().
 */
static void
internal_recv(isc_socket_t *sock, int nbytes)
{
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	CONSISTENT(sock);

	socket_log(__LINE__, sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
		   "internal_recv: %d bytes received", nbytes);

	/*
	 * If we got here, the I/O operation succeeded.  However, we might
	 * still have removed this event from our notification list (or never
	 * placed it on it due to immediate completion.)  Handle the reference
	 * counting here, and handle the cancellation event just after.
	 */
	INSIST(sock->pending_iocp > 0);
	sock->pending_iocp--;
	INSIST(sock->pending_recv > 0);
	sock->pending_recv--;

	/*
	 * The only way we could have gotten here is that our I/O has
	 * successfully completed.  Update our pointers, and move on.  The
	 * only odd case here is that we might not have received enough data
	 * on a TCP stream to satisfy the minimum requirements.  If this is
	 * the case, we will re-issue the recv() call for what we need.
	 *
	 * We do check for a recv() of 0 bytes on a TCP stream.  This means
	 * the remote end has closed; all pending receives get ISC_R_EOF.
	 */
	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
		send_recvdone_abort(sock, ISC_R_EOF);
		maybe_free_socket(&sock, __LINE__);
		return;
	}

	/* Expose the newly received bytes to completeio_recv(). */
	sock->recvbuf.remaining = nbytes;
	sock->recvbuf.consume_position = sock->recvbuf.base;
	completeio_recv(sock);

	/*
	 * If there are more receivers waiting for data, queue another receive
	 * here.
	 */
	queue_receive_request(sock);

	/*
	 * Unlock and/or destroy if we are the last thing this socket has
	 * left to do.
	 */
	maybe_free_socket(&sock, __LINE__);
}
2215
2216static void
2217internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2218	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2219{
2220	buflist_t *buffer;
2221
2222	/*
2223	 * Find out what socket this is and lock it.
2224	 */
2225	INSIST(VALID_SOCKET(sock));
2226
2227	LOCK(&sock->lock);
2228	CONSISTENT(sock);
2229
2230	socket_log(__LINE__, sock, NULL, IOEVENT,
2231		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2232		   "internal_send: task got socket event %p", dev);
2233
2234	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2235	while (buffer != NULL) {
2236		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2237
2238		socket_log(__LINE__, sock, NULL, TRACE,
2239		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2240		   "free_buffer %p %p", buffer, buffer->buf);
2241
2242		HeapFree(hHeapHandle, 0, buffer->buf);
2243		HeapFree(hHeapHandle, 0, buffer);
2244		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2245	}
2246
2247	INSIST(sock->pending_iocp > 0);
2248	sock->pending_iocp--;
2249	INSIST(sock->pending_send > 0);
2250	sock->pending_send--;
2251
2252	/* If the event is no longer in the list we can just return */
2253	if (!senddone_is_active(sock, dev))
2254		goto done;
2255
2256	/*
2257	 * Set the error code and send things on its way.
2258	 */
2259	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2260	case DOIO_SOFT:
2261		break;
2262	case DOIO_HARD:
2263	case DOIO_SUCCESS:
2264		send_senddone_event(sock, &dev);
2265		break;
2266	}
2267
2268 done:
2269	maybe_free_socket(&sock, __LINE__);
2270}
2271
2272/*
2273 * These return if the done event passed in is on the list (or for connect, is
2274 * the one we're waiting for.  Using these ensures we will not double-send an
2275 * event.
2276 */
2277static isc_boolean_t
2278senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2279{
2280	isc_socketevent_t *ldev;
2281
2282	ldev = ISC_LIST_HEAD(sock->send_list);
2283	while (ldev != NULL && ldev != dev)
2284		ldev = ISC_LIST_NEXT(ldev, ev_link);
2285
2286	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2287}
2288
2289static isc_boolean_t
2290acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2291{
2292	isc_socket_newconnev_t *ldev;
2293
2294	ldev = ISC_LIST_HEAD(sock->accept_list);
2295	while (ldev != NULL && ldev != dev)
2296		ldev = ISC_LIST_NEXT(ldev, ev_link);
2297
2298	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2299}
2300
2301static isc_boolean_t
2302connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2303{
2304	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2305}
2306
2307//
2308// The Windows network stack seems to have two very distinct paths depending
2309// on what is installed.  Specifically, if something is looking at network
2310// connections (like an anti-virus or anti-malware application, such as
2311// McAfee products) Windows may return additional error conditions which
2312// were not previously returned.
2313//
2314// One specific one is when a TCP SYN scan is used.  In this situation,
2315// Windows responds with the SYN-ACK, but the scanner never responds with
2316// the 3rd packet, the ACK.  Windows consiers this a partially open connection.
2317// Most Unix networking stacks, and Windows without McAfee installed, will
2318// not return this to the caller.  However, with this product installed,
2319// Windows returns this as a failed status on the Accept() call.  Here, we
2320// will just re-issue the ISCAcceptEx() call as if nothing had happened.
2321//
2322// This code should only be called when the listening socket has received
2323// such an error.  Additionally, the "parent" socket must be locked.
2324// Additionally, the lpo argument is re-used here, and must not be freed
2325// by the caller.
2326//
2327static isc_result_t
2328restart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2329{
2330	isc_socket_t *nsock = lpo->adev->newsocket;
2331	SOCKET new_fd;
2332
2333	/*
2334	 * AcceptEx() requires we pass in a socket.  Note that we carefully
2335	 * do not close the previous socket in case of an error message returned by
2336	 * our new socket() call.  If we return an error here, our caller will
2337	 * clean up.
2338	 */
2339	new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2340	if (nsock->fd == INVALID_SOCKET) {
2341		return (ISC_R_FAILURE); // parent will ask windows for error message
2342	}
2343	closesocket(nsock->fd);
2344	nsock->fd = new_fd;
2345
2346	memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2347
2348	ISCAcceptEx(parent->fd,
2349		    nsock->fd,				/* Accepted Socket */
2350		    lpo->acceptbuffer,			/* Buffer for initial Recv */
2351		    0,					/* Length of Buffer */
2352		    sizeof(SOCKADDR_STORAGE) + 16,	/* Local address length + 16 */
2353		    sizeof(SOCKADDR_STORAGE) + 16,	/* Remote address lengh + 16 */
2354		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
2355		    (LPOVERLAPPED)lpo			/* Overlapped structure */
2356		    );
2357
2358	InterlockedDecrement(&nsock->manager->iocp_total);
2359	iocompletionport_update(nsock);
2360
2361	return (ISC_R_SUCCESS);
2362}
2363
/*
 * This is the I/O Completion Port Worker Function. It loops forever
 * waiting for I/O to complete and then forwards them for further
 * processing. There are a number of these in separate threads.
 *
 * 'ThreadContext' is the socket manager (isc_socketmgr_t *) whose
 * completion port is serviced.  The loop ends when a completion is dequeued
 * with a NULL overlapped pointer, after which the function returns 0.
 *
 * Failed completions are handled inline below; successful ones are passed
 * to internal_recv(), internal_send(), internal_accept() or
 * internal_connect().  In both cases the IoCompletionInfo is freed
 * afterwards, except when restart_accept() succeeds and re-uses it.
 */
static isc_threadresult_t WINAPI
SocketIoThread(LPVOID ThreadContext) {
	isc_socketmgr_t *manager = ThreadContext;
	BOOL bSuccess = FALSE;
	DWORD nbytes;
	IoCompletionInfo *lpo = NULL;
	isc_socket_t *sock = NULL;
	int request;
	struct msghdr *messagehdr = NULL;
	int errval;
	char strbuf[ISC_STRERRORSIZE];
	int errstatus;

	REQUIRE(VALID_MANAGER(manager));

	/*
	 * Set the thread priority high enough so I/O will
	 * preempt normal recv packet processing, but not
	 * higher than the timer sync thread.
	 */
	if (!SetThreadPriority(GetCurrentThread(),
			       THREAD_PRIORITY_ABOVE_NORMAL)) {
		errval = GetLastError();
		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"Can't set thread priority: %s"),
				strbuf);
	}

	/*
	 * Loop forever waiting on I/O Completions and then processing them
	 */
	while (TRUE) {
		/*
		 * Re-entered directly (skipping the HeapFree() at the bottom
		 * of the loop) when a failed accept has been re-issued with
		 * the same lpo.
		 */
		wait_again:
		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
						     &nbytes, (LPDWORD)&sock,
						     (LPWSAOVERLAPPED *)&lpo,
						     INFINITE);
		if (lpo == NULL) /* Received request to exit */
			break;

		REQUIRE(VALID_SOCKET(sock));

		request = lpo->request_type;

		errstatus = 0;
		if (!bSuccess) {
			isc_result_t isc_result;

			/*
			 * Did the I/O operation complete?
			 */
			errstatus = GetLastError();
			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);

			/*
			 * The failed request is accounted for and reported
			 * under the socket lock; every case below ends at the
			 * maybe_free_socket() call after the switch, except
			 * the restarted-accept path which unlocks explicitly.
			 */
			LOCK(&sock->lock);
			CONSISTENT(sock);
			switch (request) {
			case SOCKET_RECV:
				INSIST(sock->pending_iocp > 0);
				sock->pending_iocp--;
				INSIST(sock->pending_recv > 0);
				sock->pending_recv--;
				/*
				 * On a socket that is not connected, these
				 * errors do not abort the pending receives;
				 * another receive is queued instead.
				 */
				if (!sock->connected &&
				    ((errstatus == ERROR_HOST_UNREACHABLE) ||
				     (errstatus == WSAENETRESET) ||
				     (errstatus == WSAECONNRESET))) {
					/* ignore soft errors */
					queue_receive_request(sock);
					break;
				}
				/* Fail every pending receive with isc_result. */
				send_recvdone_abort(sock, isc_result);
				if (isc_result == ISC_R_UNEXPECTED) {
					UNEXPECTED_ERROR(__FILE__, __LINE__,
						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
						errstatus, isc_result);
				}
				break;

			case SOCKET_SEND:
				INSIST(sock->pending_iocp > 0);
				sock->pending_iocp--;
				INSIST(sock->pending_send > 0);
				sock->pending_send--;
				/* Report the failure only if still queued. */
				if (senddone_is_active(sock, lpo->dev)) {
					lpo->dev->result = isc_result;
					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
						"canceled_send");
					send_senddone_event(sock, &lpo->dev);
				}
				break;

			case SOCKET_ACCEPT:
				INSIST(sock->pending_iocp > 0);
				INSIST(sock->pending_accept > 0);

				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
					"Accept: errstatus=%d isc_result=%d", errstatus, isc_result);

				/*
				 * If the accept event is still queued, try to
				 * re-issue the accept with the same lpo.  On
				 * success the pending counters are left
				 * untouched and lpo is not freed.
				 */
				if (acceptdone_is_active(sock, lpo->adev)) {
					if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
						UNLOCK(&sock->lock);
						goto wait_again;
					} else {
						errstatus = GetLastError();
						isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
						socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
							"restart_accept() failed: errstatus=%d isc_result=%d",
							errstatus, isc_result);
					}
				}

				/*
				 * The accept could not be restarted (or was
				 * no longer queued): account for it, release
				 * the pre-created socket and report the error.
				 */
				sock->pending_iocp--;
				sock->pending_accept--;
				if (acceptdone_is_active(sock, lpo->adev)) {
					closesocket(lpo->adev->newsocket->fd);
					lpo->adev->newsocket->fd = INVALID_SOCKET;
					lpo->adev->newsocket->references--;
					free_socket(&lpo->adev->newsocket, __LINE__);
					lpo->adev->result = isc_result;
					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
						"canceled_accept");
					send_acceptdone_event(sock, &lpo->adev);
				}
				break;

			case SOCKET_CONNECT:
				INSIST(sock->pending_iocp > 0);
				sock->pending_iocp--;
				INSIST(sock->pending_connect == 1);
				sock->pending_connect = 0;
				/* Report the failure only if still awaited. */
				if (connectdone_is_active(sock, lpo->cdev)) {
					lpo->cdev->result = isc_result;
					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
						"canceled_connect");
					send_connectdone_event(sock, &lpo->cdev);
				}
				break;
			}
			maybe_free_socket(&sock, __LINE__);

			if (lpo != NULL)
				HeapFree(hHeapHandle, 0, lpo);
			continue;
		}

		/*
		 * Successful completion: dispatch to the per-request handler.
		 * errstatus is 0 on this path.
		 */
		messagehdr = &lpo->messagehdr;

		switch (request) {
		case SOCKET_RECV:
			internal_recv(sock, nbytes);
			break;
		case SOCKET_SEND:
			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
			break;
		case SOCKET_ACCEPT:
			internal_accept(sock, lpo, errstatus);
			break;
		case SOCKET_CONNECT:
			internal_connect(sock, lpo, errstatus);
			break;
		}

		if (lpo != NULL)
			HeapFree(hHeapHandle, 0, lpo);
	}

	/*
	 * Exit Completion Port Thread
	 */
	manager_log(manager, TRACE,
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "SocketIoThread exiting"));
	return ((isc_threadresult_t)0);
}
2546
2547/*
2548 * Create a new socket manager.
2549 */
2550isc_result_t
2551isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2552	return (isc_socketmgr_create2(mctx, managerp, 0));
2553}
2554
2555isc_result_t
2556isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2557		       unsigned int maxsocks)
2558{
2559	isc_socketmgr_t *manager;
2560	isc_result_t result;
2561
2562	REQUIRE(managerp != NULL && *managerp == NULL);
2563
2564	if (maxsocks != 0)
2565		return (ISC_R_NOTIMPLEMENTED);
2566
2567	manager = isc_mem_get(mctx, sizeof(*manager));
2568	if (manager == NULL)
2569		return (ISC_R_NOMEMORY);
2570
2571	InitSockets();
2572
2573	manager->magic = SOCKET_MANAGER_MAGIC;
2574	manager->mctx = NULL;
2575	manager->stats = NULL;
2576	ISC_LIST_INIT(manager->socklist);
2577	result = isc_mutex_init(&manager->lock);
2578	if (result != ISC_R_SUCCESS) {
2579		isc_mem_put(mctx, manager, sizeof(*manager));
2580		return (result);
2581	}
2582	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2583		DESTROYLOCK(&manager->lock);
2584		isc_mem_put(mctx, manager, sizeof(*manager));
2585		UNEXPECTED_ERROR(__FILE__, __LINE__,
2586				 "isc_condition_init() %s",
2587				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2588						ISC_MSG_FAILED, "failed"));
2589		return (ISC_R_UNEXPECTED);
2590	}
2591
2592	isc_mem_attach(mctx, &manager->mctx);
2593
2594	iocompletionport_init(manager);	/* Create the Completion Ports */
2595
2596	manager->bShutdown = ISC_FALSE;
2597	manager->totalSockets = 0;
2598	manager->iocp_total = 0;
2599
2600	*managerp = manager;
2601
2602	return (ISC_R_SUCCESS);
2603}
2604
/*
 * Query the maximum number of sockets of 'manager'.
 *
 * Not implemented here: after validating its arguments the function
 * always returns ISC_R_NOTIMPLEMENTED and never writes to '*nsockp'.
 */
isc_result_t
isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	return (ISC_R_NOTIMPLEMENTED);
}
2612
/*
 * Attach the statistics object 'stats' to 'manager'.
 *
 * Requires that the manager has no sockets on its list, has no statistics
 * object attached yet, and that 'stats' holds exactly
 * isc_sockstatscounter_max counters.
 */
void
isc__socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
2622
/*
 * Destroy the socket manager '*managerp' and set '*managerp' to NULL.
 *
 * Blocks until the manager's socket list is empty, then signals the
 * completion port for exit, joins the maxIOCPThreads worker threads and
 * releases the manager's resources.
 */
void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, we need to add some wait code for the completion port
	 * thread.  For now, signal the completion port for exit and mark
	 * the manager as shutting down; the threads are joined below.
	 */
	signal_iocompletionport_exit(manager);
	manager->bShutdown = ISC_TRUE;

	/*
	 * Wait for threads to exit.  A failed join is logged but does not
	 * stop the teardown.
	 */
	for (i = 0; i < manager->maxIOCPThreads; i++) {
		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
			NULL) != ISC_R_SUCCESS)
			UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() for Completion Port %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
	}
	/*
	 * Clean up.
	 */

	CloseHandle(manager->hIoCompletionPort);

	(void)isc_condition_destroy(&manager->shutdown_ok);

	DESTROYLOCK(&manager->lock);
	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);
	manager->magic = 0;
	/*
	 * Keep the memory context in a local: it is needed to free the
	 * manager and must be detached only after that.
	 */
	mctx= manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;
}
2689
2690static void
2691queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2692{
2693	isc_task_t *ntask = NULL;
2694
2695	isc_task_attach(task, &ntask);
2696	dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2697
2698	/*
2699	 * Enqueue the request.
2700	 */
2701	INSIST(!ISC_LINK_LINKED(dev, ev_link));
2702	ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2703
2704	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2705		   "queue_receive_event: event %p -> task %p",
2706		   dev, ntask);
2707}
2708
2709/*
2710 * Check the pending receive queue, and if we have data pending, give it to this
2711 * caller.  If we have none, queue an I/O request.  If this caller is not the first
2712 * on the list, then we will just queue this event and return.
2713 *
2714 * Caller must have the socket locked.
2715 */
2716static isc_result_t
2717socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2718	    unsigned int flags)
2719{
2720	int cc = 0;
2721	isc_task_t *ntask = NULL;
2722	isc_result_t result = ISC_R_SUCCESS;
2723	int recv_errno = 0;
2724
2725	dev->ev_sender = task;
2726
2727	if (sock->fd == INVALID_SOCKET)
2728		return (ISC_R_EOF);
2729
2730	/*
2731	 * Queue our event on the list of things to do.  Call our function to
2732	 * attempt to fill buffers as much as possible, and return done events.
2733	 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2734	 * here and tell our caller that we could not satisfy it immediately.
2735	 */
2736	queue_receive_event(sock, task, dev);
2737	if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2738		result = ISC_R_INPROGRESS;
2739
2740	completeio_recv(sock);
2741
2742	/*
2743	 * If there are more receivers waiting for data, queue another receive
2744	 * here.  If the
2745	 */
2746	queue_receive_request(sock);
2747
2748	return (result);
2749}
2750
2751isc_result_t
2752isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2753		 unsigned int minimum, isc_task_t *task,
2754		 isc_taskaction_t action, const void *arg)
2755{
2756	isc_socketevent_t *dev;
2757	isc_socketmgr_t *manager;
2758	unsigned int iocount;
2759	isc_buffer_t *buffer;
2760	isc_result_t ret;
2761
2762	REQUIRE(VALID_SOCKET(sock));
2763	LOCK(&sock->lock);
2764	CONSISTENT(sock);
2765
2766	/*
2767	 * Make sure that the socket is not closed.  XXXMLG change error here?
2768	 */
2769	if (sock->fd == INVALID_SOCKET) {
2770		UNLOCK(&sock->lock);
2771		return (ISC_R_CONNREFUSED);
2772	}
2773
2774	REQUIRE(buflist != NULL);
2775	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2776	REQUIRE(task != NULL);
2777	REQUIRE(action != NULL);
2778
2779	manager = sock->manager;
2780	REQUIRE(VALID_MANAGER(manager));
2781
2782	iocount = isc_bufferlist_availablecount(buflist);
2783	REQUIRE(iocount > 0);
2784
2785	INSIST(sock->bound);
2786
2787	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2788	if (dev == NULL) {
2789		UNLOCK(&sock->lock);
2790		return (ISC_R_NOMEMORY);
2791	}
2792
2793	/*
2794	 * UDP sockets are always partial read
2795	 */
2796	if (sock->type == isc_sockettype_udp)
2797		dev->minimum = 1;
2798	else {
2799		if (minimum == 0)
2800			dev->minimum = iocount;
2801		else
2802			dev->minimum = minimum;
2803	}
2804
2805	/*
2806	 * Move each buffer from the passed in list to our internal one.
2807	 */
2808	buffer = ISC_LIST_HEAD(*buflist);
2809	while (buffer != NULL) {
2810		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2811		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2812		buffer = ISC_LIST_HEAD(*buflist);
2813	}
2814
2815	ret = socket_recv(sock, dev, task, 0);
2816
2817	UNLOCK(&sock->lock);
2818	return (ret);
2819}
2820
/*
 * Start a receive into 'region'.  A receive-done event is allocated for
 * 'action'/'arg' and the request is passed on to isc_socket_recv2() with
 * flags of 0.
 *
 * Returns ISC_R_CONNREFUSED if the socket has no valid descriptor,
 * ISC_R_NOMEMORY if the event cannot be allocated, and otherwise the result
 * of isc_socket_recv2().
 */
isc_result_t
isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
		 isc_taskaction_t action, const void *arg)
{
	isc_socketevent_t *dev;
	isc_socketmgr_t *manager;
	isc_result_t ret;

	REQUIRE(VALID_SOCKET(sock));
	LOCK(&sock->lock);
	CONSISTENT(sock);

	/*
	 * make sure that the socket's not closed
	 */
	if (sock->fd == INVALID_SOCKET) {
		UNLOCK(&sock->lock);
		return (ISC_R_CONNREFUSED);
	}
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
	if (dev == NULL) {
		UNLOCK(&sock->lock);
		return (ISC_R_NOMEMORY);
	}

	/*
	 * NOTE(review): this call is made with sock->lock held.  If
	 * isc_socket_recv2() resolves to isc__socket_recv2() in this file,
	 * that function locks sock->lock again -- confirm the mutex is
	 * recursive on this platform.
	 */
	ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
	UNLOCK(&sock->lock);
	return (ret);
}
2858
2859isc_result_t
2860isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
2861		  unsigned int minimum, isc_task_t *task,
2862		  isc_socketevent_t *event, unsigned int flags)
2863{
2864	isc_result_t ret;
2865
2866	REQUIRE(VALID_SOCKET(sock));
2867	LOCK(&sock->lock);
2868	CONSISTENT(sock);
2869
2870	event->result = ISC_R_UNEXPECTED;
2871	event->ev_sender = sock;
2872	/*
2873	 * make sure that the socket's not closed
2874	 */
2875	if (sock->fd == INVALID_SOCKET) {
2876		UNLOCK(&sock->lock);
2877		return (ISC_R_CONNREFUSED);
2878	}
2879
2880	ISC_LIST_INIT(event->bufferlist);
2881	event->region = *region;
2882	event->n = 0;
2883	event->offset = 0;
2884	event->attributes = 0;
2885
2886	/*
2887	 * UDP sockets are always partial read.
2888	 */
2889	if (sock->type == isc_sockettype_udp)
2890		event->minimum = 1;
2891	else {
2892		if (minimum == 0)
2893			event->minimum = region->length;
2894		else
2895			event->minimum = minimum;
2896	}
2897
2898	ret = socket_recv(sock, event, task, flags);
2899	UNLOCK(&sock->lock);
2900	return (ret);
2901}
2902
2903/*
2904 * Caller must have the socket locked.
2905 */
2906static isc_result_t
2907socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2908	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2909	    unsigned int flags)
2910{
2911	int io_state;
2912	int send_errno = 0;
2913	int cc = 0;
2914	isc_task_t *ntask = NULL;
2915	isc_result_t result = ISC_R_SUCCESS;
2916
2917	dev->ev_sender = task;
2918
2919	set_dev_address(address, sock, dev);
2920	if (pktinfo != NULL) {
2921		socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2922			   ISC_MSG_PKTINFOPROVIDED,
2923			   "pktinfo structure provided, ifindex %u (set to 0)",
2924			   pktinfo->ipi6_ifindex);
2925
2926		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2927		dev->pktinfo = *pktinfo;
2928		/*
2929		 * Set the pktinfo index to 0 here, to let the kernel decide
2930		 * what interface it should send on.
2931		 */
2932		dev->pktinfo.ipi6_ifindex = 0;
2933	}
2934
2935	io_state = startio_send(sock, dev, &cc, &send_errno);
2936	switch (io_state) {
2937	case DOIO_PENDING:	/* I/O started. Nothing more to do */
2938	case DOIO_SOFT:
2939		/*
2940		 * We couldn't send all or part of the request right now, so
2941		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2942		 */
2943		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2944			isc_task_attach(task, &ntask);
2945			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2946
2947			/*
2948			 * Enqueue the request.
2949			 */
2950			INSIST(!ISC_LINK_LINKED(dev, ev_link));
2951			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2952
2953			socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2954				   "socket_send: event %p -> task %p",
2955				   dev, ntask);
2956
2957			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2958				result = ISC_R_INPROGRESS;
2959			break;
2960		}
2961
2962	case DOIO_SUCCESS:
2963		break;
2964	}
2965
2966	return (result);
2967}
2968
/*
 * Send 'region' on 'sock'.  Equivalent to isc_socket_sendto() with a NULL
 * address and NULL pktinfo; its result is returned unchanged.
 */
isc_result_t
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
		 isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	/*
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
				  NULL));
}
2979
2980isc_result_t
2981isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
2982		   isc_task_t *task, isc_taskaction_t action, const void *arg,
2983		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2984{
2985	isc_socketevent_t *dev;
2986	isc_socketmgr_t *manager;
2987	isc_result_t ret;
2988
2989	REQUIRE(VALID_SOCKET(sock));
2990	REQUIRE(sock->type != isc_sockettype_fdwatch);
2991
2992	LOCK(&sock->lock);
2993	CONSISTENT(sock);
2994
2995	/*
2996	 * make sure that the socket's not closed
2997	 */
2998	if (sock->fd == INVALID_SOCKET) {
2999		UNLOCK(&sock->lock);
3000		return (ISC_R_CONNREFUSED);
3001	}
3002	REQUIRE(region != NULL);
3003	REQUIRE(task != NULL);
3004	REQUIRE(action != NULL);
3005
3006	manager = sock->manager;
3007	REQUIRE(VALID_MANAGER(manager));
3008
3009	INSIST(sock->bound);
3010
3011	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3012	if (dev == NULL) {
3013		UNLOCK(&sock->lock);
3014		return (ISC_R_NOMEMORY);
3015	}
3016	dev->region = *region;
3017
3018	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3019	UNLOCK(&sock->lock);
3020	return (ret);
3021}
3022
/*
 * Send the buffers of 'buflist' on 'sock'.  Equivalent to
 * isc_socket_sendtov() with a NULL address and NULL pktinfo; its result is
 * returned unchanged.
 */
isc_result_t
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		  isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
				   NULL));
}
3030
3031isc_result_t
3032isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3033		    isc_task_t *task, isc_taskaction_t action, const void *arg,
3034		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3035{
3036	isc_socketevent_t *dev;
3037	isc_socketmgr_t *manager;
3038	unsigned int iocount;
3039	isc_buffer_t *buffer;
3040	isc_result_t ret;
3041
3042	REQUIRE(VALID_SOCKET(sock));
3043
3044	LOCK(&sock->lock);
3045	CONSISTENT(sock);
3046
3047	/*
3048	 * make sure that the socket's not closed
3049	 */
3050	if (sock->fd == INVALID_SOCKET) {
3051		UNLOCK(&sock->lock);
3052		return (ISC_R_CONNREFUSED);
3053	}
3054	REQUIRE(buflist != NULL);
3055	REQUIRE(!ISC_LIST_EMPTY(*buflist));
3056	REQUIRE(task != NULL);
3057	REQUIRE(action != NULL);
3058
3059	manager = sock->manager;
3060	REQUIRE(VALID_MANAGER(manager));
3061
3062	iocount = isc_bufferlist_usedcount(buflist);
3063	REQUIRE(iocount > 0);
3064
3065	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3066	if (dev == NULL) {
3067		UNLOCK(&sock->lock);
3068		return (ISC_R_NOMEMORY);
3069	}
3070
3071	/*
3072	 * Move each buffer from the passed in list to our internal one.
3073	 */
3074	buffer = ISC_LIST_HEAD(*buflist);
3075	while (buffer != NULL) {
3076		ISC_LIST_DEQUEUE(*buflist, buffer, link);
3077		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3078		buffer = ISC_LIST_HEAD(*buflist);
3079	}
3080
3081	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3082	UNLOCK(&sock->lock);
3083	return (ret);
3084}
3085
3086isc_result_t
3087isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3088		    isc_task_t *task,
3089		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3090		    isc_socketevent_t *event, unsigned int flags)
3091{
3092	isc_result_t ret;
3093
3094	REQUIRE(VALID_SOCKET(sock));
3095	LOCK(&sock->lock);
3096	CONSISTENT(sock);
3097
3098	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3099	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3100		REQUIRE(sock->type == isc_sockettype_udp);
3101	event->ev_sender = sock;
3102	event->result = ISC_R_UNEXPECTED;
3103	/*
3104	 * make sure that the socket's not closed
3105	 */
3106	if (sock->fd == INVALID_SOCKET) {
3107		UNLOCK(&sock->lock);
3108		return (ISC_R_CONNREFUSED);
3109	}
3110	ISC_LIST_INIT(event->bufferlist);
3111	event->region = *region;
3112	event->n = 0;
3113	event->offset = 0;
3114	event->attributes = 0;
3115
3116	ret = socket_send(sock, event, task, address, pktinfo, flags);
3117	UNLOCK(&sock->lock);
3118	return (ret);
3119}
3120
3121isc_result_t
3122isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3123		 unsigned int options) {
3124	int bind_errno;
3125	char strbuf[ISC_STRERRORSIZE];
3126	int on = 1;
3127
3128	REQUIRE(VALID_SOCKET(sock));
3129	LOCK(&sock->lock);
3130	CONSISTENT(sock);
3131
3132	/*
3133	 * make sure that the socket's not closed
3134	 */
3135	if (sock->fd == INVALID_SOCKET) {
3136		UNLOCK(&sock->lock);
3137		return (ISC_R_CONNREFUSED);
3138	}
3139
3140	INSIST(!sock->bound);
3141
3142	if (sock->pf != sockaddr->type.sa.sa_family) {
3143		UNLOCK(&sock->lock);
3144		return (ISC_R_FAMILYMISMATCH);
3145	}
3146	/*
3147	 * Only set SO_REUSEADDR when we want a specific port.
3148	 */
3149	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3150	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3151	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
3152		       sizeof(on)) < 0) {
3153		UNEXPECTED_ERROR(__FILE__, __LINE__,
3154				 "setsockopt(%d) %s", sock->fd,
3155				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3156						ISC_MSG_FAILED, "failed"));
3157		/* Press on... */
3158	}
3159	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3160		bind_errno = WSAGetLastError();
3161		UNLOCK(&sock->lock);
3162		switch (bind_errno) {
3163		case WSAEACCES:
3164			return (ISC_R_NOPERM);
3165		case WSAEADDRNOTAVAIL:
3166			return (ISC_R_ADDRNOTAVAIL);
3167		case WSAEADDRINUSE:
3168			return (ISC_R_ADDRINUSE);
3169		case WSAEINVAL:
3170			return (ISC_R_BOUND);
3171		default:
3172			isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3173			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3174					 strbuf);
3175			return (ISC_R_UNEXPECTED);
3176		}
3177	}
3178
3179	socket_log(__LINE__, sock, sockaddr, TRACE,
3180		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3181	sock->bound = 1;
3182
3183	UNLOCK(&sock->lock);
3184	return (ISC_R_SUCCESS);
3185}
3186
/*
 * Set a filter on 'sock'.
 *
 * Not implemented here: 'filter' is ignored and, after validating 'sock',
 * the function always returns ISC_R_NOTIMPLEMENTED.
 */
isc_result_t
isc__socket_filter(isc_socket_t *sock, const char *filter) {
	UNUSED(sock);
	UNUSED(filter);

	REQUIRE(VALID_SOCKET(sock));
	return (ISC_R_NOTIMPLEMENTED);
}
3195
3196/*
3197 * Set up to listen on a given socket.  We do this by creating an internal
3198 * event that will be dispatched when the socket has read activity.  The
3199 * watcher will send the internal event to the task when there is a new
3200 * connection.
3201 *
3202 * Unlike in read, we don't preallocate a done event here.  Every time there
3203 * is a new connection we'll have to allocate a new one anyway, so we might
3204 * as well keep things simple rather than having to track them.
3205 */
3206isc_result_t
3207isc__socket_listen(isc_socket_t *sock, unsigned int backlog) {
3208	char strbuf[ISC_STRERRORSIZE];
3209
3210	REQUIRE(VALID_SOCKET(sock));
3211
3212	LOCK(&sock->lock);
3213	CONSISTENT(sock);
3214
3215	/*
3216	 * make sure that the socket's not closed
3217	 */
3218	if (sock->fd == INVALID_SOCKET) {
3219		UNLOCK(&sock->lock);
3220		return (ISC_R_CONNREFUSED);
3221	}
3222
3223	REQUIRE(!sock->listener);
3224	REQUIRE(sock->bound);
3225	REQUIRE(sock->type == isc_sockettype_tcp);
3226
3227	if (backlog == 0)
3228		backlog = SOMAXCONN;
3229
3230	if (listen(sock->fd, (int)backlog) < 0) {
3231		UNLOCK(&sock->lock);
3232		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3233
3234		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3235
3236		return (ISC_R_UNEXPECTED);
3237	}
3238
3239	socket_log(__LINE__, sock, NULL, TRACE,
3240		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3241	sock->listener = 1;
3242	_set_state(sock, SOCK_LISTEN);
3243
3244	UNLOCK(&sock->lock);
3245	return (ISC_R_SUCCESS);
3246}
3247
3248/*
3249 * This should try to do aggressive accept() XXXMLG
3250 */
3251isc_result_t
3252isc__socket_accept(isc_socket_t *sock,
3253		   isc_task_t *task, isc_taskaction_t action, const void *arg)
3254{
3255	isc_socket_newconnev_t *adev;
3256	isc_socketmgr_t *manager;
3257	isc_task_t *ntask = NULL;
3258	isc_socket_t *nsock;
3259	isc_result_t result;
3260	IoCompletionInfo *lpo;
3261
3262	REQUIRE(VALID_SOCKET(sock));
3263
3264	manager = sock->manager;
3265	REQUIRE(VALID_MANAGER(manager));
3266
3267	LOCK(&sock->lock);
3268	CONSISTENT(sock);
3269
3270	/*
3271	 * make sure that the socket's not closed
3272	 */
3273	if (sock->fd == INVALID_SOCKET) {
3274		UNLOCK(&sock->lock);
3275		return (ISC_R_CONNREFUSED);
3276	}
3277
3278	REQUIRE(sock->listener);
3279
3280	/*
3281	 * Sender field is overloaded here with the task we will be sending
3282	 * this event to.  Just before the actual event is delivered the
3283	 * actual ev_sender will be touched up to be the socket.
3284	 */
3285	adev = (isc_socket_newconnev_t *)
3286		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3287				   action, arg, sizeof(*adev));
3288	if (adev == NULL) {
3289		UNLOCK(&sock->lock);
3290		return (ISC_R_NOMEMORY);
3291	}
3292	ISC_LINK_INIT(adev, ev_link);
3293
3294	result = allocate_socket(manager, sock->type, &nsock);
3295	if (result != ISC_R_SUCCESS) {
3296		isc_event_free((isc_event_t **)&adev);
3297		UNLOCK(&sock->lock);
3298		return (result);
3299	}
3300
3301	/*
3302	 * AcceptEx() requires we pass in a socket.
3303	 */
3304	nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3305	if (nsock->fd == INVALID_SOCKET) {
3306		free_socket(&nsock, __LINE__);
3307		isc_event_free((isc_event_t **)&adev);
3308		UNLOCK(&sock->lock);
3309		return (ISC_R_FAILURE); // XXXMLG need real error message
3310	}
3311
3312	/*
3313	 * Attach to socket and to task.
3314	 */
3315	isc_task_attach(task, &ntask);
3316	if (isc_task_exiting(ntask)) {
3317		free_socket(&nsock, __LINE__);
3318		isc_task_detach(&ntask);
3319		isc_event_free(ISC_EVENT_PTR(&adev));
3320		UNLOCK(&sock->lock);
3321		return (ISC_R_SHUTTINGDOWN);
3322	}
3323	nsock->references++;
3324
3325	adev->ev_sender = ntask;
3326	adev->newsocket = nsock;
3327	_set_state(nsock, SOCK_ACCEPT);
3328
3329	/*
3330	 * Queue io completion for an accept().
3331	 */
3332	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3333					    HEAP_ZERO_MEMORY,
3334					    sizeof(IoCompletionInfo));
3335	RUNTIME_CHECK(lpo != NULL);
3336	lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3337		(sizeof(SOCKADDR_STORAGE) + 16) * 2);
3338	RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3339
3340	lpo->adev = adev;
3341	lpo->request_type = SOCKET_ACCEPT;
3342
3343	ISCAcceptEx(sock->fd,
3344		    nsock->fd,				/* Accepted Socket */
3345		    lpo->acceptbuffer,			/* Buffer for initial Recv */
3346		    0,					/* Length of Buffer */
3347		    sizeof(SOCKADDR_STORAGE) + 16,		/* Local address length + 16 */
3348		    sizeof(SOCKADDR_STORAGE) + 16,		/* Remote address lengh + 16 */
3349		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
3350		    (LPOVERLAPPED)lpo			/* Overlapped structure */
3351		    );
3352	iocompletionport_update(nsock);
3353
3354	socket_log(__LINE__, sock, NULL, TRACE,
3355		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3356		   "accepting for nsock %p fd %d", nsock, nsock->fd);
3357
3358	/*
3359	 * Enqueue the event
3360	 */
3361	ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3362	sock->pending_accept++;
3363	sock->pending_iocp++;
3364
3365	UNLOCK(&sock->lock);
3366	return (ISC_R_SUCCESS);
3367}
3368
3369isc_result_t
3370isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3371		    isc_task_t *task, isc_taskaction_t action, const void *arg)
3372{
3373	char strbuf[ISC_STRERRORSIZE];
3374	isc_socket_connev_t *cdev;
3375	isc_task_t *ntask = NULL;
3376	isc_socketmgr_t *manager;
3377	IoCompletionInfo *lpo;
3378	int bind_errno;
3379
3380	REQUIRE(VALID_SOCKET(sock));
3381	REQUIRE(addr != NULL);
3382	REQUIRE(task != NULL);
3383	REQUIRE(action != NULL);
3384
3385	manager = sock->manager;
3386	REQUIRE(VALID_MANAGER(manager));
3387	REQUIRE(addr != NULL);
3388
3389	if (isc_sockaddr_ismulticast(addr))
3390		return (ISC_R_MULTICAST);
3391
3392	LOCK(&sock->lock);
3393	CONSISTENT(sock);
3394
3395	/*
3396	 * make sure that the socket's not closed
3397	 */
3398	if (sock->fd == INVALID_SOCKET) {
3399		UNLOCK(&sock->lock);
3400		return (ISC_R_CONNREFUSED);
3401	}
3402
3403	/*
3404	 * Windows sockets won't connect unless the socket is bound.
3405	 */
3406	if (!sock->bound) {
3407		isc_sockaddr_t any;
3408
3409		isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3410		if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3411			bind_errno = WSAGetLastError();
3412			UNLOCK(&sock->lock);
3413			switch (bind_errno) {
3414			case WSAEACCES:
3415				return (ISC_R_NOPERM);
3416			case WSAEADDRNOTAVAIL:
3417				return (ISC_R_ADDRNOTAVAIL);
3418			case WSAEADDRINUSE:
3419				return (ISC_R_ADDRINUSE);
3420			case WSAEINVAL:
3421				return (ISC_R_BOUND);
3422			default:
3423				isc__strerror(bind_errno, strbuf,
3424					      sizeof(strbuf));
3425				UNEXPECTED_ERROR(__FILE__, __LINE__,
3426						 "bind: %s", strbuf);
3427				return (ISC_R_UNEXPECTED);
3428			}
3429		}
3430		sock->bound = 1;
3431	}
3432
3433	REQUIRE(!sock->pending_connect);
3434
3435	cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3436							ISC_SOCKEVENT_CONNECT,
3437							action,	arg,
3438							sizeof(*cdev));
3439	if (cdev == NULL) {
3440		UNLOCK(&sock->lock);
3441		return (ISC_R_NOMEMORY);
3442	}
3443	ISC_LINK_INIT(cdev, ev_link);
3444
3445	if (sock->type == isc_sockettype_tcp) {
3446		/*
3447		 * Queue io completion for an accept().
3448		 */
3449		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3450						    HEAP_ZERO_MEMORY,
3451						    sizeof(IoCompletionInfo));
3452		lpo->cdev = cdev;
3453		lpo->request_type = SOCKET_CONNECT;
3454
3455		sock->address = *addr;
3456		ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3457			NULL, 0, NULL, (LPOVERLAPPED)lpo);
3458
3459		/*
3460		 * Attach to task.
3461		 */
3462		isc_task_attach(task, &ntask);
3463		cdev->ev_sender = ntask;
3464
3465		sock->pending_connect = 1;
3466		_set_state(sock, SOCK_CONNECT);
3467
3468		/*
3469		 * Enqueue the request.
3470		 */
3471		sock->connect_ev = cdev;
3472		sock->pending_iocp++;
3473	} else {
3474		WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3475		cdev->result = ISC_R_SUCCESS;
3476		isc_task_send(task, (isc_event_t **)&cdev);
3477	}
3478	CONSISTENT(sock);
3479	UNLOCK(&sock->lock);
3480
3481	return (ISC_R_SUCCESS);
3482}
3483
3484isc_result_t
3485isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3486	isc_result_t result;
3487
3488	REQUIRE(VALID_SOCKET(sock));
3489	REQUIRE(addressp != NULL);
3490
3491	LOCK(&sock->lock);
3492	CONSISTENT(sock);
3493
3494	/*
3495	 * make sure that the socket's not closed
3496	 */
3497	if (sock->fd == INVALID_SOCKET) {
3498		UNLOCK(&sock->lock);
3499		return (ISC_R_CONNREFUSED);
3500	}
3501
3502	if (sock->connected) {
3503		*addressp = sock->address;
3504		result = ISC_R_SUCCESS;
3505	} else {
3506		result = ISC_R_NOTCONNECTED;
3507	}
3508
3509	UNLOCK(&sock->lock);
3510
3511	return (result);
3512}
3513
3514isc_result_t
3515isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3516	ISC_SOCKADDR_LEN_T len;
3517	isc_result_t result;
3518	char strbuf[ISC_STRERRORSIZE];
3519
3520	REQUIRE(VALID_SOCKET(sock));
3521	REQUIRE(addressp != NULL);
3522
3523	LOCK(&sock->lock);
3524	CONSISTENT(sock);
3525
3526	/*
3527	 * make sure that the socket's not closed
3528	 */
3529	if (sock->fd == INVALID_SOCKET) {
3530		UNLOCK(&sock->lock);
3531		return (ISC_R_CONNREFUSED);
3532	}
3533
3534	if (!sock->bound) {
3535		result = ISC_R_NOTBOUND;
3536		goto out;
3537	}
3538
3539	result = ISC_R_SUCCESS;
3540
3541	len = sizeof(addressp->type);
3542	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3543		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3544		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3545				 strbuf);
3546		result = ISC_R_UNEXPECTED;
3547		goto out;
3548	}
3549	addressp->length = (unsigned int)len;
3550
3551 out:
3552	UNLOCK(&sock->lock);
3553
3554	return (result);
3555}
3556
3557/*
3558 * Run through the list of events on this socket, and cancel the ones
3559 * queued for task "task" of type "how".  "how" is a bitmask.
3560 */
3561void
3562isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3563
3564	REQUIRE(VALID_SOCKET(sock));
3565
3566	/*
3567	 * Quick exit if there is nothing to do.  Don't even bother locking
3568	 * in this case.
3569	 */
3570	if (how == 0)
3571		return;
3572
3573	LOCK(&sock->lock);
3574	CONSISTENT(sock);
3575
3576	/*
3577	 * make sure that the socket's not closed
3578	 */
3579	if (sock->fd == INVALID_SOCKET) {
3580		UNLOCK(&sock->lock);
3581		return;
3582	}
3583
3584	/*
3585	 * All of these do the same thing, more or less.
3586	 * Each will:
3587	 *	o If the internal event is marked as "posted" try to
3588	 *	  remove it from the task's queue.  If this fails, mark it
3589	 *	  as canceled instead, and let the task clean it up later.
3590	 *	o For each I/O request for that task of that type, post
3591	 *	  its done event with status of "ISC_R_CANCELED".
3592	 *	o Reset any state needed.
3593	 */
3594
3595	if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3596		isc_socketevent_t      *dev;
3597		isc_socketevent_t      *next;
3598		isc_task_t	       *current_task;
3599
3600		dev = ISC_LIST_HEAD(sock->recv_list);
3601		while (dev != NULL) {
3602			current_task = dev->ev_sender;
3603			next = ISC_LIST_NEXT(dev, ev_link);
3604			if ((task == NULL) || (task == current_task)) {
3605				dev->result = ISC_R_CANCELED;
3606				send_recvdone_event(sock, &dev);
3607			}
3608			dev = next;
3609		}
3610	}
3611	how &= ~ISC_SOCKCANCEL_RECV;
3612
3613	if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3614		isc_socketevent_t      *dev;
3615		isc_socketevent_t      *next;
3616		isc_task_t	       *current_task;
3617
3618		dev = ISC_LIST_HEAD(sock->send_list);
3619
3620		while (dev != NULL) {
3621			current_task = dev->ev_sender;
3622			next = ISC_LIST_NEXT(dev, ev_link);
3623			if ((task == NULL) || (task == current_task)) {
3624				dev->result = ISC_R_CANCELED;
3625				send_senddone_event(sock, &dev);
3626			}
3627			dev = next;
3628		}
3629	}
3630	how &= ~ISC_SOCKCANCEL_SEND;
3631
3632	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3633	    && !ISC_LIST_EMPTY(sock->accept_list)) {
3634		isc_socket_newconnev_t *dev;
3635		isc_socket_newconnev_t *next;
3636		isc_task_t	       *current_task;
3637
3638		dev = ISC_LIST_HEAD(sock->accept_list);
3639		while (dev != NULL) {
3640			current_task = dev->ev_sender;
3641			next = ISC_LIST_NEXT(dev, ev_link);
3642
3643			if ((task == NULL) || (task == current_task)) {
3644
3645				dev->newsocket->references--;
3646				closesocket(dev->newsocket->fd);
3647				dev->newsocket->fd = INVALID_SOCKET;
3648				free_socket(&dev->newsocket, __LINE__);
3649
3650				dev->result = ISC_R_CANCELED;
3651				send_acceptdone_event(sock, &dev);
3652			}
3653
3654			dev = next;
3655		}
3656	}
3657	how &= ~ISC_SOCKCANCEL_ACCEPT;
3658
3659	/*
3660	 * Connecting is not a list.
3661	 */
3662	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3663	    && sock->connect_ev != NULL) {
3664		isc_socket_connev_t    *dev;
3665		isc_task_t	       *current_task;
3666
3667		INSIST(sock->pending_connect);
3668
3669		dev = sock->connect_ev;
3670		current_task = dev->ev_sender;
3671
3672		if ((task == NULL) || (task == current_task)) {
3673			closesocket(sock->fd);
3674			sock->fd = INVALID_SOCKET;
3675			_set_state(sock, SOCK_CLOSED);
3676
3677			sock->connect_ev = NULL;
3678			dev->result = ISC_R_CANCELED;
3679			send_connectdone_event(sock, &dev);
3680		}
3681	}
3682	how &= ~ISC_SOCKCANCEL_CONNECT;
3683
3684	maybe_free_socket(&sock, __LINE__);
3685}
3686
3687isc_sockettype_t
3688isc__socket_gettype(isc_socket_t *sock) {
3689	isc_sockettype_t type;
3690
3691	REQUIRE(VALID_SOCKET(sock));
3692
3693	LOCK(&sock->lock);
3694
3695	/*
3696	 * make sure that the socket's not closed
3697	 */
3698	if (sock->fd == INVALID_SOCKET) {
3699		UNLOCK(&sock->lock);
3700		return (ISC_R_CONNREFUSED);
3701	}
3702
3703	type = sock->type;
3704	UNLOCK(&sock->lock);
3705	return (type);
3706}
3707
3708isc_boolean_t
3709isc__socket_isbound(isc_socket_t *sock) {
3710	isc_boolean_t val;
3711
3712	REQUIRE(VALID_SOCKET(sock));
3713
3714	LOCK(&sock->lock);
3715	CONSISTENT(sock);
3716
3717	/*
3718	 * make sure that the socket's not closed
3719	 */
3720	if (sock->fd == INVALID_SOCKET) {
3721		UNLOCK(&sock->lock);
3722		return (ISC_FALSE);
3723	}
3724
3725	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3726	UNLOCK(&sock->lock);
3727
3728	return (val);
3729}
3730
3731void
3732isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3733#if defined(IPV6_V6ONLY)
3734	int onoff = yes ? 1 : 0;
3735#else
3736	UNUSED(yes);
3737#endif
3738
3739	REQUIRE(VALID_SOCKET(sock));
3740
3741#ifdef IPV6_V6ONLY
3742	if (sock->pf == AF_INET6) {
3743		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3744				 (char *)&onoff, sizeof(onoff));
3745	}
3746#endif
3747}
3748
3749void
3750isc__socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3751	UNUSED(addr);
3752	UNUSED(active);
3753}
3754
3755isc_result_t
3756isc__socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3757		     isc_uint32_t owner,	isc_uint32_t group)
3758{
3759	UNUSED(addr);
3760	UNUSED(perm);
3761	UNUSED(owner);
3762	UNUSED(group);
3763	return (ISC_R_NOTIMPLEMENTED);
3764}
3765
3766void
3767isc__socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3768
3769	/*
3770	 * Name 'socket'.
3771	 */
3772
3773	REQUIRE(VALID_SOCKET(socket));
3774
3775	LOCK(&socket->lock);
3776	memset(socket->name, 0, sizeof(socket->name));
3777	strncpy(socket->name, name, sizeof(socket->name) - 1);
3778	socket->tag = tag;
3779	UNLOCK(&socket->lock);
3780}
3781
3782const char *
3783isc__socket_getname(isc_socket_t *socket) {
3784	return (socket->name);
3785}
3786
3787void *
3788isc__socket_gettag(isc_socket_t *socket) {
3789	return (socket->tag);
3790}
3791
3792void
3793isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3794	UNUSED(manager);
3795	UNUSED(reserved);
3796}
3797
3798void
3799isc___socketmgr_maxudp(isc_socketmgr_t *manager, int maxudp) {
3800
3801	UNUSED(manager);
3802	UNUSED(maxudp);
3803}
3804
3805#ifdef HAVE_LIBXML2
3806
3807static const char *
3808_socktype(isc_sockettype_t type)
3809{
3810	if (type == isc_sockettype_udp)
3811		return ("udp");
3812	else if (type == isc_sockettype_tcp)
3813		return ("tcp");
3814	else if (type == isc_sockettype_unix)
3815		return ("unix");
3816	else if (type == isc_sockettype_fdwatch)
3817		return ("fdwatch");
3818	else
3819		return ("not-initialized");
3820}
3821
3822void
3823isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
3824{
3825	isc_socket_t *sock;
3826	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3827	isc_sockaddr_t addr;
3828	ISC_SOCKADDR_LEN_T len;
3829
3830	LOCK(&mgr->lock);
3831
3832#ifndef ISC_PLATFORM_USETHREADS
3833	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3834	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
3835	xmlTextWriterEndElement(writer);
3836#endif
3837
3838	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
3839	sock = ISC_LIST_HEAD(mgr->socklist);
3840	while (sock != NULL) {
3841		LOCK(&sock->lock);
3842		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
3843
3844		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
3845		xmlTextWriterWriteFormatString(writer, "%p", sock);
3846		xmlTextWriterEndElement(writer);
3847
3848		if (sock->name[0] != 0) {
3849			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
3850			xmlTextWriterWriteFormatString(writer, "%s",
3851						       sock->name);
3852			xmlTextWriterEndElement(writer); /* name */
3853		}
3854
3855		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3856		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
3857		xmlTextWriterEndElement(writer);
3858
3859		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
3860					  ISC_XMLCHAR _socktype(sock->type));
3861
3862		if (sock->connected) {
3863			isc_sockaddr_format(&sock->address, peerbuf,
3864					    sizeof(peerbuf));
3865			xmlTextWriterWriteElement(writer,
3866						  ISC_XMLCHAR "peer-address",
3867						  ISC_XMLCHAR peerbuf);
3868		}
3869
3870		len = sizeof(addr);
3871		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
3872			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
3873			xmlTextWriterWriteElement(writer,
3874						  ISC_XMLCHAR "local-address",
3875						  ISC_XMLCHAR peerbuf);
3876		}
3877
3878		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
3879		if (sock->pending_recv)
3880			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3881						ISC_XMLCHAR "pending-receive");
3882		if (sock->pending_send)
3883			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3884						  ISC_XMLCHAR "pending-send");
3885		if (sock->pending_accept)
3886			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3887						 ISC_XMLCHAR "pending_accept");
3888		if (sock->listener)
3889			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3890						  ISC_XMLCHAR "listener");
3891		if (sock->connected)
3892			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3893						  ISC_XMLCHAR "connected");
3894		if (sock->pending_connect)
3895			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3896						  ISC_XMLCHAR "connecting");
3897		if (sock->bound)
3898			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3899						  ISC_XMLCHAR "bound");
3900
3901		xmlTextWriterEndElement(writer); /* states */
3902
3903		xmlTextWriterEndElement(writer); /* socket */
3904
3905		UNLOCK(&sock->lock);
3906		sock = ISC_LIST_NEXT(sock, link);
3907	}
3908	xmlTextWriterEndElement(writer); /* sockets */
3909
3910	UNLOCK(&mgr->lock);
3911}
3912#endif /* HAVE_LIBXML2 */
3913