1/*	$NetBSD$	*/
2
3/*
4 * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
5 * Copyright (C) 2000-2003  Internet Software Consortium.
6 *
7 * Permission to use, copy, modify, and/or distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
12 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
13 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
14 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
16 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
17 * PERFORMANCE OF THIS SOFTWARE.
18 */
19
20/* Id */
21
22/* This code uses functions which are only available on Server 2003 and
23 * higher, and Windows XP and higher.
24 *
25 * This code is by nature multithreaded and takes advantage of various
26 * features to pass on information through the completion port for
27 * when I/O is completed.  All sends, receives, accepts, and connects are
28 * completed through the completion port.
29 *
 * The number of Completion Port Worker threads used is the total number
 * of CPUs + 1 (capped at MAX_IOCPTHREADS). This increases the likelihood
 * that a Worker Thread is available for processing a completed request.
33 *
34 * XXXPDM 5 August, 2002
35 */
36
37#define MAKE_EXTERNAL 1
38#include <config.h>
39
40#include <sys/types.h>
41
42#ifndef _WINSOCKAPI_
43#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
44#endif
45
46#include <errno.h>
47#include <stddef.h>
48#include <stdlib.h>
49#include <string.h>
50#include <unistd.h>
51#include <io.h>
52#include <fcntl.h>
53#include <process.h>
54
55#include <isc/buffer.h>
56#include <isc/bufferlist.h>
57#include <isc/condition.h>
58#include <isc/list.h>
59#include <isc/log.h>
60#include <isc/mem.h>
61#include <isc/msgs.h>
62#include <isc/mutex.h>
63#include <isc/net.h>
64#include <isc/once.h>
65#include <isc/os.h>
66#include <isc/platform.h>
67#include <isc/print.h>
68#include <isc/region.h>
69#include <isc/socket.h>
70#include <isc/stats.h>
71#include <isc/strerror.h>
72#include <isc/syslog.h>
73#include <isc/task.h>
74#include <isc/thread.h>
75#include <isc/util.h>
76#include <isc/win32os.h>
77
78#include <mswsock.h>
79
80#include "errno2result.h"
81
/*
 * How in the world can Microsoft exist with APIs like this?
 * We can't actually call this directly, because it turns out
 * no library exports this function.  Instead, we need to
 * issue a runtime call to get the address.
 *
 * These file-scope pointers hold the addresses of the ConnectEx,
 * AcceptEx and GetAcceptExSockaddrs extension functions.  They are
 * filled in by initialise() through
 * WSAIoctl(SIO_GET_EXTENSION_FUNCTION_POINTER).
 */
LPFN_CONNECTEX ISCConnectEx;
LPFN_ACCEPTEX ISCAcceptEx;
LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
91
/*
 * Run expensive internal consistency checks.
 *
 * CONSISTENT(sock) expands to a call to consistent() only when
 * ISC_SOCKET_CONSISTENCY_CHECKS is defined; otherwise it expands to an
 * empty do/while statement.  consistent() itself is declared
 * unconditionally.
 */
#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
#define CONSISTENT(sock) consistent(sock)
#else
#define CONSISTENT(sock) do {} while (/*CONSTCOND*/0)
#endif
static void consistent(isc_socket_t *sock);
101
102/*
103 * Define this macro to control the behavior of connection
104 * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
105 * for details.
106 * NOTE: This requires that Windows 2000 systems install Service Pack 2
107 * or later.
108 */
109#ifndef SIO_UDP_CONNRESET
110#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
111#endif
112
113/*
114 * Some systems define the socket length argument as an int, some as size_t,
115 * some as socklen_t.  This is here so it can be easily changed if needed.
116 */
117#ifndef ISC_SOCKADDR_LEN_T
118#define ISC_SOCKADDR_LEN_T unsigned int
119#endif
120
/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * Note that the argument is evaluated several times, and that an error
 * value of 0 also counts as "soft".
 */
#define SOFT_ERROR(e)	((e) == WSAEINTR || \
			 (e) == WSAEWOULDBLOCK || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == EAGAIN || \
			 (e) == 0)

/*
 * Pending errors are not really errors and should be
 * kept separate.  True for WSA_IO_PENDING and for 0; the argument is
 * evaluated twice.
 */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)
137
138#define DOIO_SUCCESS	  0       /* i/o ok, event sent */
139#define DOIO_SOFT	  1       /* i/o ok, soft error, no event sent */
140#define DOIO_HARD	  2       /* i/o error, event sent */
141#define DOIO_EOF	  3       /* EOF, no event sent */
142#define DOIO_PENDING	  4       /* status when i/o is in process */
143#define DOIO_NEEDMORE	  5       /* IO was processed, but we need more due to minimum */
144
145#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
146
147/*
148 * DLVL(90)  --  Function entry/exit and other tracing.
149 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
150 * DLVL(60)  --  Socket data send/receive
151 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
152 * DLVL(20)  --  Socket creation/destruction.
153 */
154#define TRACE_LEVEL		90
155#define CORRECTNESS_LEVEL	70
156#define IOEVENT_LEVEL		60
157#define EVENT_LEVEL		50
158#define CREATION_LEVEL		20
159
160#define TRACE		DLVL(TRACE_LEVEL)
161#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
162#define IOEVENT		DLVL(IOEVENT_LEVEL)
163#define EVENT		DLVL(EVENT_LEVEL)
164#define CREATION	DLVL(CREATION_LEVEL)
165
166typedef isc_event_t intev_t;
167
/*
 * Socket State
 *
 * Values stored in isc_socket.state through the _set_state() macro,
 * which also records the source line of the transition.
 */
enum {
  SOCK_INITIALIZED,	/* Socket Initialized */
  SOCK_OPEN,		/* Socket opened but nothing yet to do */
  SOCK_DATA,		/* Socket sending or receiving data */
  SOCK_LISTEN,		/* TCP Socket listening for connects */
  SOCK_ACCEPT,		/* TCP socket is waiting to accept */
  SOCK_CONNECT,		/* TCP Socket connecting */
  SOCK_CLOSED,		/* Socket has been closed (set by socket_close()) */
};
180
181#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
182#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
183
184/*
185 * IPv6 control information.  If the socket is an IPv6 socket we want
186 * to collect the destination address and interface so the client can
187 * set them on outgoing packets.
188 */
189#ifdef ISC_PLATFORM_HAVEIPV6
190#ifndef USE_CMSG
191#define USE_CMSG	1
192#endif
193#endif
194
195/*
 * We really don't want to try to use these control messages. Win32
 * doesn't have this mechanism before XP.
198 */
199#undef USE_CMSG
200
/*
 * Message header for recvmsg and sendmsg calls.
 * Used value-result for recvmsg, value only for sendmsg.
 *
 * Note: the trailing declarator below also defines a file-scope object
 * named "msghdr" in addition to the struct tag.
 */
struct msghdr {
	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
	int      to_addr_len;		/* length of the address */
	WSABUF  *msg_iov;		/* scatter/gather array */
	u_int   msg_iovlen;             /* # elements in msg_iov */
	void	*msg_control;           /* ancillary data (USE_CMSG is #undef'd above) */
	u_int   msg_controllen;         /* ancillary data buffer len */
	int	msg_totallen;		/* total length of this message */
} msghdr;
214
215/*
216 * The size to raise the receive buffer to.
217 */
218#define RCVBUFSIZE (32*1024)
219
220/*
221 * The number of times a send operation is repeated if the result
222 * is WSAEINTR.
223 */
224#define NRETRIES 10
225
/*
 * Per-socket state.  Fields are grouped by the locking rule noted in
 * the section comments below.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;		/* SOCKET_MAGIC when valid */
	isc_socketmgr_t	       *manager;	/* owning socket manager */
	isc_mutex_t		lock;
	isc_sockettype_t	type;

	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references; /* EXTERNAL references */
	SOCKET			fd;	/* file handle; INVALID_SOCKET once closed */
	int			pf;	/* protocol family */
	char			name[16];
	void *			tag;

	/*
	 * Each recv() call uses this buffer.  It is a per-socket receive
	 * buffer that allows us to decouple the system recv() from the
	 * recv_list done events.  This means the items on the recv_list
	 * can be removed without having to cancel pending system recv()
	 * calls.  It also allows us to read-ahead in some cases.
	 */
	struct {
		SOCKADDR_STORAGE	from_addr;	   /* UDP send/recv address */
		int		from_addr_len;	   /* length of the address */
		char		*base;		   /* the base of the buffer */
		char		*consume_position; /* where to start copying data from next */
		unsigned int	len;		   /* the actual size of this buffer */
		unsigned int	remaining;	   /* the number of bytes remaining */
	} recvbuf;

	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	isc_sockaddr_t		address;  /* remote address */

	unsigned int		listener : 1,	/* listener socket */
				connected : 1,
				pending_connect : 1, /* connect pending */
				bound : 1,	/* bound to local addr */
				dupped : 1;     /* created by isc_socket_dup() */
	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
	unsigned int		pending_send;  /* Number of outstanding send() calls. */
	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
	unsigned int		state; /* Socket state. Debugging and consistency checking. */
	int			state_lineno;  /* line which last touched state */
};
279
280#define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (/*CONSTCOND*/0)
281
/*
 * Buffer structure
 *
 * One node per private copy of outgoing data.  build_msghdr_send()
 * allocates these from hHeapHandle and links them on the bufferlist of
 * an IoCompletionInfo.
 */
typedef struct buflist buflist_t;

struct buflist {
	void			*buf;		/* copied data */
	unsigned int		buflen;		/* number of bytes in buf */
	ISC_LINK(buflist_t)	link;
};
292
/*
 * I/O Completion ports Info structures
 *
 * hHeapHandle is the private heap (created in iocompletionport_init())
 * from which IoCompletionInfo blocks and send-buffer copies are
 * allocated.
 *
 * An IoCompletionInfo is passed to the overlapped Winsock calls cast to
 * LPWSAOVERLAPPED, which is why the OVERLAPPED member must stay first.
 */

static HANDLE hHeapHandle = NULL;
typedef struct IoCompletionInfo {
	OVERLAPPED		overlapped;
	isc_socketevent_t	*dev;  /* send()/recv() done event */
	isc_socket_connev_t	*cdev; /* connect() done event */
	isc_socket_newconnev_t	*adev; /* accept() done event */
	void			*acceptbuffer;
	DWORD			received_bytes;
	int			request_type;	/* SOCKET_RECV, SOCKET_SEND, ... */
	struct msghdr		messagehdr;
	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
} IoCompletionInfo;
309
310/*
311 * Define a maximum number of I/O Completion Port worker threads
312 * to handle the load on the Completion Port. The actual number
313 * used is the number of CPU's + 1.
314 */
315#define MAX_IOCPTHREADS 20
316
317#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
318#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
319
/*
 * Socket manager: owns the I/O completion port, its worker threads and
 * the list of sockets.
 */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;	/* SOCKET_MANAGER_MAGIC when valid */
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
	isc_stats_t		       *stats;

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)		socklist;
	isc_boolean_t			bShutdown;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;	/* min(ncpus + 1, MAX_IOCPTHREADS) */
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];

	/*
	 * Debugging.
	 * Modified by InterlockedIncrement() and InterlockedDecrement()
	 */
	LONG				totalSockets;
	LONG				iocp_total;
};
343
/*
 * Kind of overlapped operation, stored in IoCompletionInfo.request_type.
 */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};
350
351/*
352 * send() and recv() iovec counts
353 */
354#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
355#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
356
/*
 * Forward declarations of file-local (static) helpers.
 */
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
static void maybe_free_socket(isc_socket_t **, int);
static void free_socket(isc_socket_t **, int);
static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
static void queue_receive_request(isc_socket_t *sock);
374
375/*
376 * This is used to dump the contents of the sock structure
377 * You should make sure that the sock is locked before
378 * dumping it. Since the code uses simple printf() statements
379 * it should only be used interactively.
380 */
381void
382sock_dump(isc_socket_t *sock) {
383	isc_socketevent_t *ldev;
384	isc_socket_newconnev_t *ndev;
385
386#if 0
387	isc_sockaddr_t addr;
388	char socktext[256];
389
390	isc_socket_getpeername(sock, &addr);
391	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
392	printf("Remote Socket: %s\n", socktext);
393	isc_socket_getsockname(sock, &addr);
394	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
395	printf("This Socket: %s\n", socktext);
396#endif
397
398	printf("\n\t\tSock Dump\n");
399	printf("\t\tfd: %u\n", sock->fd);
400	printf("\t\treferences: %d\n", sock->references);
401	printf("\t\tpending_accept: %d\n", sock->pending_accept);
402	printf("\t\tconnecting: %d\n", sock->pending_connect);
403	printf("\t\tconnected: %d\n", sock->connected);
404	printf("\t\tbound: %d\n", sock->bound);
405	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
406	printf("\t\tsocket type: %d\n", sock->type);
407
408	printf("\n\t\tSock Recv List\n");
409	ldev = ISC_LIST_HEAD(sock->recv_list);
410	while (ldev != NULL) {
411		printf("\t\tdev: %p\n", ldev);
412		ldev = ISC_LIST_NEXT(ldev, ev_link);
413	}
414
415	printf("\n\t\tSock Send List\n");
416	ldev = ISC_LIST_HEAD(sock->send_list);
417	while (ldev != NULL) {
418		printf("\t\tdev: %p\n", ldev);
419		ldev = ISC_LIST_NEXT(ldev, ev_link);
420	}
421
422	printf("\n\t\tSock Accept List\n");
423	ndev = ISC_LIST_HEAD(sock->accept_list);
424	while (ndev != NULL) {
425		printf("\t\tdev: %p\n", ldev);
426		ndev = ISC_LIST_NEXT(ndev, ev_link);
427	}
428}
429
/*
 * Forward declaration of socket_log().  The format string is argument 9
 * and the variadic arguments start at argument 10; ISC_FORMAT_PRINTF
 * presumably lets the compiler check them against each other.
 */
static void
socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
435
436/*  This function will add an entry to the I/O completion port
437 *  that will signal the I/O thread to exit (gracefully)
438 */
439static void
440signal_iocompletionport_exit(isc_socketmgr_t *manager) {
441	int i;
442	int errval;
443	char strbuf[ISC_STRERRORSIZE];
444
445	REQUIRE(VALID_MANAGER(manager));
446	for (i = 0; i < manager->maxIOCPThreads; i++) {
447		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
448						0, 0, 0)) {
449			errval = GetLastError();
450			isc__strerror(errval, strbuf, sizeof(strbuf));
451			FATAL_ERROR(__FILE__, __LINE__,
452				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
453				ISC_MSG_FAILED,
454				"Can't request service thread to exit: %s"),
455				strbuf);
456		}
457	}
458}
459
460/*
461 * Create the worker threads for the I/O Completion Port
462 */
463void
464iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
465	int errval;
466	char strbuf[ISC_STRERRORSIZE];
467	int i;
468
469	INSIST(total_threads > 0);
470	REQUIRE(VALID_MANAGER(manager));
471	/*
472	 * We need at least one
473	 */
474	for (i = 0; i < total_threads; i++) {
475		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
476						manager, 0,
477						&manager->dwIOCPThreadIds[i]);
478		if (manager->hIOCPThreads[i] == NULL) {
479			errval = GetLastError();
480			isc__strerror(errval, strbuf, sizeof(strbuf));
481			FATAL_ERROR(__FILE__, __LINE__,
482				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
483				ISC_MSG_FAILED,
484				"Can't create IOCP thread: %s"),
485				strbuf);
486			exit(1);
487		}
488	}
489}
490
491/*
492 *  Create/initialise the I/O completion port
493 */
494void
495iocompletionport_init(isc_socketmgr_t *manager) {
496	int errval;
497	char strbuf[ISC_STRERRORSIZE];
498
499	REQUIRE(VALID_MANAGER(manager));
500	/*
501	 * Create a private heap to handle the socket overlapped structure
502	 * The minimum number of structures is 10, there is no maximum
503	 */
504	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
505	if (hHeapHandle == NULL) {
506		errval = GetLastError();
507		isc__strerror(errval, strbuf, sizeof(strbuf));
508		FATAL_ERROR(__FILE__, __LINE__,
509			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
510					   ISC_MSG_FAILED,
511					   "HeapCreate() failed during "
512					   "initialization: %s"),
513			    strbuf);
514		exit(1);
515	}
516
517	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
518
519	/* Now Create the Completion Port */
520	manager->hIoCompletionPort = CreateIoCompletionPort(
521			INVALID_HANDLE_VALUE, NULL,
522			0, manager->maxIOCPThreads);
523	if (manager->hIoCompletionPort == NULL) {
524		errval = GetLastError();
525		isc__strerror(errval, strbuf, sizeof(strbuf));
526		FATAL_ERROR(__FILE__, __LINE__,
527				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
528				ISC_MSG_FAILED,
529				"CreateIoCompletionPort() failed "
530				"during initialization: %s"),
531				strbuf);
532		exit(1);
533	}
534
535	/*
536	 * Worker threads for servicing the I/O
537	 */
538	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
539}
540
541/*
542 * Associate a socket with an IO Completion Port.  This allows us to queue events for it
543 * and have our worker pool of threads process them.
544 */
545void
546iocompletionport_update(isc_socket_t *sock) {
547	HANDLE hiocp;
548	char strbuf[ISC_STRERRORSIZE];
549
550	REQUIRE(VALID_SOCKET(sock));
551
552	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
553		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
554
555	if (hiocp == NULL) {
556		DWORD errval = GetLastError();
557		isc__strerror(errval, strbuf, sizeof(strbuf));
558		isc_log_iwrite(isc_lctx,
559				ISC_LOGCATEGORY_GENERAL,
560				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
561				isc_msgcat, ISC_MSGSET_SOCKET,
562				ISC_MSG_TOOMANYHANDLES,
563				"iocompletionport_update: failed to open"
564				" io completion port: %s",
565				strbuf);
566
567		/* XXXMLG temporary hack to make failures detected.
568		 * This function should return errors to the caller, not
569		 * exit here.
570		 */
571		FATAL_ERROR(__FILE__, __LINE__,
572				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
573				ISC_MSG_FAILED,
574				"CreateIoCompletionPort() failed "
575				"during initialization: %s"),
576				strbuf);
577		exit(1);
578	}
579
580	InterlockedIncrement(&sock->manager->iocp_total);
581}
582
583/*
584 * Routine to cleanup and then close the socket.
585 * Only close the socket here if it is NOT associated
586 * with an event, otherwise the WSAWaitForMultipleEvents
587 * may fail due to the fact that the Wait should not
588 * be running while closing an event or a socket.
589 * The socket is locked before calling this function
590 */
591void
592socket_close(isc_socket_t *sock) {
593
594	REQUIRE(sock != NULL);
595
596	if (sock->fd != INVALID_SOCKET) {
597		closesocket(sock->fd);
598		sock->fd = INVALID_SOCKET;
599		_set_state(sock, SOCK_CLOSED);
600		InterlockedDecrement(&sock->manager->totalSockets);
601	}
602}
603
604static isc_once_t initialise_once = ISC_ONCE_INIT;
605static isc_boolean_t initialised = ISC_FALSE;
606
607static void
608initialise(void) {
609	WORD wVersionRequested;
610	WSADATA wsaData;
611	int err;
612	SOCKET sock;
613	GUID GUIDConnectEx = WSAID_CONNECTEX;
614	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
615	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
616	DWORD dwBytes;
617
618	/* Need Winsock 2.2 or better */
619	wVersionRequested = MAKEWORD(2, 2);
620
621	err = WSAStartup(wVersionRequested, &wsaData);
622	if (err != 0) {
623		char strbuf[ISC_STRERRORSIZE];
624		isc__strerror(err, strbuf, sizeof(strbuf));
625		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
626			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
627					   ISC_MSG_FAILED, "failed"),
628			    strbuf);
629		exit(1);
630	}
631	/*
632	 * The following APIs do not exist as functions in a library, but we must
633	 * ask winsock for them.  They are "extensions" -- but why they cannot be
634	 * actual functions is beyond me.  So, ask winsock for the pointers to the
635	 * functions we need.
636	 */
637	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
638	INSIST(sock != INVALID_SOCKET);
639	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
640		 &GUIDConnectEx, sizeof(GUIDConnectEx),
641		 &ISCConnectEx, sizeof(ISCConnectEx),
642		 &dwBytes, NULL, NULL);
643	INSIST(err == 0);
644
645	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
646		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
647		 &ISCAcceptEx, sizeof(ISCAcceptEx),
648		 &dwBytes, NULL, NULL);
649	INSIST(err == 0);
650
651	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
652		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
653		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
654		 &dwBytes, NULL, NULL);
655	INSIST(err == 0);
656
657	closesocket(sock);
658
659	initialised = ISC_TRUE;
660}
661
662/*
663 * Initialize socket services
664 */
665void
666InitSockets(void) {
667	RUNTIME_CHECK(isc_once_do(&initialise_once,
668				  initialise) == ISC_R_SUCCESS);
669	if (!initialised)
670		exit(1);
671}
672
673int
674internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
675		 struct msghdr *messagehdr, int flags, int *Error)
676{
677	int Result;
678	DWORD BytesSent;
679	DWORD Flags = flags;
680	int total_sent;
681
682	*Error = 0;
683	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
684			   messagehdr->msg_iovlen, &BytesSent,
685			   Flags, (SOCKADDR *)&messagehdr->to_addr,
686			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
687			   NULL);
688
689	total_sent = (int)BytesSent;
690
691	/* Check for errors.*/
692	if (Result == SOCKET_ERROR) {
693		*Error = WSAGetLastError();
694
695		switch (*Error) {
696		case WSA_IO_INCOMPLETE:
697		case WSA_WAIT_IO_COMPLETION:
698		case WSA_IO_PENDING:
699		case NO_ERROR:		/* Strange, but okay */
700			sock->pending_iocp++;
701			sock->pending_send++;
702			break;
703
704		default:
705			return (-1);
706			break;
707		}
708	} else {
709		sock->pending_iocp++;
710		sock->pending_send++;
711	}
712
713	if (lpo != NULL)
714		return (0);
715	else
716		return (total_sent);
717}
718
719static void
720queue_receive_request(isc_socket_t *sock) {
721	DWORD Flags = 0;
722	DWORD NumBytes = 0;
723	int total_bytes = 0;
724	int Result;
725	int Error;
726	int need_retry;
727	WSABUF iov[1];
728	IoCompletionInfo *lpo = NULL;
729	isc_result_t isc_result;
730
731 retry:
732	need_retry = ISC_FALSE;
733
734	/*
735	 * If we already have a receive pending, do nothing.
736	 */
737	if (sock->pending_recv > 0) {
738		if (lpo != NULL)
739			HeapFree(hHeapHandle, 0, lpo);
740		return;
741	}
742
743	/*
744	 * If no one is waiting, do nothing.
745	 */
746	if (ISC_LIST_EMPTY(sock->recv_list)) {
747		if (lpo != NULL)
748			HeapFree(hHeapHandle, 0, lpo);
749		return;
750	}
751
752	INSIST(sock->recvbuf.remaining == 0);
753	INSIST(sock->fd != INVALID_SOCKET);
754
755	iov[0].len = sock->recvbuf.len;
756	iov[0].buf = sock->recvbuf.base;
757
758	if (lpo == NULL) {
759		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
760						    HEAP_ZERO_MEMORY,
761						    sizeof(IoCompletionInfo));
762		RUNTIME_CHECK(lpo != NULL);
763	} else
764		ZeroMemory(lpo, sizeof(IoCompletionInfo));
765	lpo->request_type = SOCKET_RECV;
766
767	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
768
769	Error = 0;
770	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
771			     &NumBytes, &Flags,
772			     (SOCKADDR *)&sock->recvbuf.from_addr,
773			     &sock->recvbuf.from_addr_len,
774			     (LPWSAOVERLAPPED)lpo, NULL);
775
776	/* Check for errors. */
777	if (Result == SOCKET_ERROR) {
778		Error = WSAGetLastError();
779
780		switch (Error) {
781		case WSA_IO_PENDING:
782			sock->pending_iocp++;
783			sock->pending_recv++;
784			break;
785
786		/* direct error: no completion event */
787		case ERROR_HOST_UNREACHABLE:
788		case WSAENETRESET:
789		case WSAECONNRESET:
790			if (!sock->connected) {
791				/* soft error */
792				need_retry = ISC_TRUE;
793				break;
794			}
795			/* FALLTHROUGH */
796
797		default:
798			isc_result = isc__errno2result(Error);
799			if (isc_result == ISC_R_UNEXPECTED)
800				UNEXPECTED_ERROR(__FILE__, __LINE__,
801					"WSARecvFrom: Windows error code: %d, isc result %d",
802					Error, isc_result);
803			send_recvdone_abort(sock, isc_result);
804			HeapFree(hHeapHandle, 0, lpo);
805			lpo = NULL;
806			break;
807		}
808	} else {
809		/*
810		 * The recv() finished immediately, but we will still get
811		 * a completion event.  Rather than duplicate code, let
812		 * that thread handle sending the data along its way.
813		 */
814		sock->pending_iocp++;
815		sock->pending_recv++;
816	}
817
818	socket_log(__LINE__, sock, NULL, IOEVENT,
819		   isc_msgcat, ISC_MSGSET_SOCKET,
820		   ISC_MSG_DOIORECV,
821		   "queue_io_request: fd %d result %d error %d",
822		   sock->fd, Result, Error);
823
824	CONSISTENT(sock);
825
826	if (need_retry)
827		goto retry;
828}
829
830static void
831manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
832	    isc_logmodule_t *module, int level, const char *fmt, ...)
833{
834	char msgbuf[2048];
835	va_list ap;
836
837	if (!isc_log_wouldlog(isc_lctx, level))
838		return;
839
840	va_start(ap, fmt);
841	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
842	va_end(ap);
843
844	isc_log_write(isc_lctx, category, module, level,
845		      "sockmgr %p: %s", sockmgr, msgbuf);
846}
847
848static void
849socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
850	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
851	   isc_msgcat_t *msgcat, int msgset, int message,
852	   const char *fmt, ...)
853{
854	char msgbuf[2048];
855	char peerbuf[256];
856	va_list ap;
857
858
859	if (!isc_log_wouldlog(isc_lctx, level))
860		return;
861
862	va_start(ap, fmt);
863	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
864	va_end(ap);
865
866	if (address == NULL) {
867		isc_log_iwrite(isc_lctx, category, module, level,
868			       msgcat, msgset, message,
869			       "socket %p line %d: %s", sock, lineno, msgbuf);
870	} else {
871		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
872		isc_log_iwrite(isc_lctx, category, module, level,
873			       msgcat, msgset, message,
874				   "socket %p line %d peer %s: %s", sock, lineno,
875				   peerbuf, msgbuf);
876	}
877
878}
879
880/*
881 * Make an fd SOCKET non-blocking.
882 */
883static isc_result_t
884make_nonblock(SOCKET fd) {
885	int ret;
886	unsigned long flags = 1;
887	char strbuf[ISC_STRERRORSIZE];
888
889	/* Set the socket to non-blocking */
890	ret = ioctlsocket(fd, FIONBIO, &flags);
891
892	if (ret == -1) {
893		isc__strerror(errno, strbuf, sizeof(strbuf));
894		UNEXPECTED_ERROR(__FILE__, __LINE__,
895				 "ioctlsocket(%d, FIOBIO, %d): %s",
896				 fd, flags, strbuf);
897
898		return (ISC_R_UNEXPECTED);
899	}
900
901	return (ISC_R_SUCCESS);
902}
903
904/*
905 * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
906 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
907 * fails with an "ICMP port unreachable" response and preventing the
908 * socket from using the WSARecvFrom in subsequent operations.
909 * The function below fixes this, but requires that Windows 2000
910 * Service Pack 2 or later be installed on the system.  NT 4.0
911 * systems are not affected by this and work correctly.
912 * See Microsoft Knowledge Base Article Q263823 for details of this.
913 */
914isc_result_t
915connection_reset_fix(SOCKET fd) {
916	DWORD dwBytesReturned = 0;
917	BOOL  bNewBehavior = FALSE;
918	DWORD status;
919
920	if (isc_win32os_majorversion() < 5)
921		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
922
923	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
924	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
925			  sizeof(bNewBehavior), NULL, 0,
926			  &dwBytesReturned, NULL, NULL);
927	if (status != SOCKET_ERROR)
928		return (ISC_R_SUCCESS);
929	else {
930		UNEXPECTED_ERROR(__FILE__, __LINE__,
931				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
932				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
933						ISC_MSG_FAILED, "failed"));
934		return (ISC_R_UNEXPECTED);
935	}
936}
937
938/*
939 * Construct an iov array and attach it to the msghdr passed in.  This is
940 * the SEND constructor, which will use the used region of the buffer
941 * (if using a buffer list) or will use the internal region (if a single
942 * buffer I/O is requested).
943 *
944 * Nothing can be NULL, and the done event must list at least one buffer
945 * on the buffer linked list for this function to be meaningful.
946 */
947static void
948build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
949		  struct msghdr *msg, char *cmsg, WSABUF *iov,
950		  IoCompletionInfo  *lpo)
951{
952	unsigned int iovcount;
953	isc_buffer_t *buffer;
954	buflist_t  *cpbuffer;
955	isc_region_t used;
956	size_t write_count;
957	size_t skip_count;
958
959	memset(msg, 0, sizeof(*msg));
960
961	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
962	msg->to_addr_len = dev->address.length;
963
964	buffer = ISC_LIST_HEAD(dev->bufferlist);
965	write_count = 0;
966	iovcount = 0;
967
968	/*
969	 * Single buffer I/O?  Skip what we've done so far in this region.
970	 */
971	if (buffer == NULL) {
972		write_count = dev->region.length - dev->n;
973		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
974		RUNTIME_CHECK(cpbuffer != NULL);
975		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
976		RUNTIME_CHECK(cpbuffer->buf != NULL);
977
978		socket_log(__LINE__, sock, NULL, TRACE,
979		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
980		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
981		   cpbuffer->buf, write_count);
982
983		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
984		cpbuffer->buflen = write_count;
985		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
986		iov[0].buf = cpbuffer->buf;
987		iov[0].len = write_count;
988		iovcount = 1;
989
990		goto config;
991	}
992
993	/*
994	 * Multibuffer I/O.
995	 * Skip the data in the buffer list that we have already written.
996	 */
997	skip_count = dev->n;
998	while (buffer != NULL) {
999		REQUIRE(ISC_BUFFER_VALID(buffer));
1000		if (skip_count < isc_buffer_usedlength(buffer))
1001			break;
1002		skip_count -= isc_buffer_usedlength(buffer);
1003		buffer = ISC_LIST_NEXT(buffer, link);
1004	}
1005
1006	while (buffer != NULL) {
1007		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1008
1009		isc_buffer_usedregion(buffer, &used);
1010
1011		if (used.length > 0) {
1012			int uselen = used.length - skip_count;
1013			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1014			RUNTIME_CHECK(cpbuffer != NULL);
1015			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1016			RUNTIME_CHECK(cpbuffer->buf != NULL);
1017
1018			socket_log(__LINE__, sock, NULL, TRACE,
1019			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1020			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1021			   cpbuffer->buf, write_count);
1022
1023			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
1024			cpbuffer->buflen = uselen;
1025			iov[iovcount].buf = cpbuffer->buf;
1026			iov[iovcount].len = used.length - skip_count;
1027			write_count += uselen;
1028			skip_count = 0;
1029			iovcount++;
1030		}
1031		buffer = ISC_LIST_NEXT(buffer, link);
1032	}
1033
1034	INSIST(skip_count == 0);
1035
1036 config:
1037	msg->msg_iov = iov;
1038	msg->msg_iovlen = iovcount;
1039	msg->msg_totallen = write_count;
1040}
1041
1042static void
1043set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1044		isc_socketevent_t *dev)
1045{
1046	if (sock->type == isc_sockettype_udp) {
1047		if (address != NULL)
1048			dev->address = *address;
1049		else
1050			dev->address = sock->address;
1051	} else if (sock->type == isc_sockettype_tcp) {
1052		INSIST(address == NULL);
1053		dev->address = sock->address;
1054	}
1055}
1056
1057static void
1058destroy_socketevent(isc_event_t *event) {
1059	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1060
1061	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1062
1063	(ev->destroy)(event);
1064}
1065
1066static isc_socketevent_t *
1067allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1068		     isc_taskaction_t action, const void *arg)
1069{
1070	isc_socketevent_t *ev;
1071
1072	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1073						     sock, eventtype,
1074						     action, arg,
1075						     sizeof(*ev));
1076	if (ev == NULL)
1077		return (NULL);
1078
1079	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1080	ISC_LINK_INIT(ev, ev_link);
1081	ISC_LIST_INIT(ev->bufferlist);
1082	ev->region.base = NULL;
1083	ev->n = 0;
1084	ev->offset = 0;
1085	ev->attributes = 0;
1086	ev->destroy = ev->ev_destroy;
1087	ev->ev_destroy = destroy_socketevent;
1088
1089	return (ev);
1090}
1091
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a message header and each of its
 * scatter/gather entries to stdout.  Only built when ISC_SOCKET_DEBUG is
 * defined.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int i;

	/* %p requires a void * argument, so pointers are cast explicitly. */
	printf("MSGHDR %p, Socket #: %u\n", (void *)msg, sock->fd);
	printf("\tname %p, namelen %d\n", (void *)msg->msg_name,
	       msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", (void *)msg->msg_iov,
	       msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		/* 'i' is unsigned, so it is printed with %u. */
		printf("\t\t%u\tbase %p, len %d\n", i,
		       (void *)msg->msg_iov[i].buf,
		       msg->msg_iov[i].len);
}
#endif
1106
1107/*
1108 * map the error code
1109 */
1110int
1111map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1112		 char *errorstring, size_t bufsize) {
1113
1114	int doreturn;
1115	switch (windows_errno) {
1116	case WSAECONNREFUSED:
1117		*isc_errno = ISC_R_CONNREFUSED;
1118		if (sock->connected)
1119			doreturn = DOIO_HARD;
1120		else
1121			doreturn = DOIO_SOFT;
1122		break;
1123	case WSAENETUNREACH:
1124	case ERROR_NETWORK_UNREACHABLE:
1125		*isc_errno = ISC_R_NETUNREACH;
1126		if (sock->connected)
1127			doreturn = DOIO_HARD;
1128		else
1129			doreturn = DOIO_SOFT;
1130		break;
1131	case ERROR_PORT_UNREACHABLE:
1132	case ERROR_HOST_UNREACHABLE:
1133	case WSAEHOSTUNREACH:
1134		*isc_errno = ISC_R_HOSTUNREACH;
1135		if (sock->connected)
1136			doreturn = DOIO_HARD;
1137		else
1138			doreturn = DOIO_SOFT;
1139		break;
1140	case WSAENETDOWN:
1141		*isc_errno = ISC_R_NETDOWN;
1142		if (sock->connected)
1143			doreturn = DOIO_HARD;
1144		else
1145			doreturn = DOIO_SOFT;
1146		break;
1147	case WSAEHOSTDOWN:
1148		*isc_errno = ISC_R_HOSTDOWN;
1149		if (sock->connected)
1150			doreturn = DOIO_HARD;
1151		else
1152			doreturn = DOIO_SOFT;
1153		break;
1154	case WSAEACCES:
1155		*isc_errno = ISC_R_NOPERM;
1156		if (sock->connected)
1157			doreturn = DOIO_HARD;
1158		else
1159			doreturn = DOIO_SOFT;
1160		break;
1161	case WSAECONNRESET:
1162	case WSAENETRESET:
1163	case WSAECONNABORTED:
1164	case WSAEDISCON:
1165		*isc_errno = ISC_R_CONNECTIONRESET;
1166		if (sock->connected)
1167			doreturn = DOIO_HARD;
1168		else
1169			doreturn = DOIO_SOFT;
1170		break;
1171	case WSAENOTCONN:
1172		*isc_errno = ISC_R_NOTCONNECTED;
1173		if (sock->connected)
1174			doreturn = DOIO_HARD;
1175		else
1176			doreturn = DOIO_SOFT;
1177		break;
1178	case ERROR_OPERATION_ABORTED:
1179	case ERROR_CONNECTION_ABORTED:
1180	case ERROR_REQUEST_ABORTED:
1181		*isc_errno = ISC_R_CONNECTIONRESET;
1182		doreturn = DOIO_HARD;
1183		break;
1184	case WSAENOBUFS:
1185		*isc_errno = ISC_R_NORESOURCES;
1186		doreturn = DOIO_HARD;
1187		break;
1188	case WSAEAFNOSUPPORT:
1189		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1190		doreturn = DOIO_HARD;
1191		break;
1192	case WSAEADDRNOTAVAIL:
1193		*isc_errno = ISC_R_ADDRNOTAVAIL;
1194		doreturn = DOIO_HARD;
1195		break;
1196	case WSAEDESTADDRREQ:
1197		*isc_errno = ISC_R_BADADDRESSFORM;
1198		doreturn = DOIO_HARD;
1199		break;
1200	case ERROR_NETNAME_DELETED:
1201		*isc_errno = ISC_R_NETDOWN;
1202		doreturn = DOIO_HARD;
1203		break;
1204	default:
1205		*isc_errno = ISC_R_IOERROR;
1206		doreturn = DOIO_HARD;
1207		break;
1208	}
1209	if (doreturn == DOIO_HARD) {
1210		isc__strerror(windows_errno, errorstring, bufsize);
1211	}
1212	return (doreturn);
1213}
1214
1215static void
1216fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1217	isc_region_t r;
1218	int copylen;
1219	isc_buffer_t *buffer;
1220
1221	INSIST(dev->n < dev->minimum);
1222	INSIST(sock->recvbuf.remaining > 0);
1223	INSIST(sock->pending_recv == 0);
1224
1225	if (sock->type == isc_sockettype_udp) {
1226		dev->address.length = sock->recvbuf.from_addr_len;
1227		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1228		    sock->recvbuf.from_addr_len);
1229		if (isc_sockaddr_getport(&dev->address) == 0) {
1230			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1231				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1232					   isc_msgcat, ISC_MSGSET_SOCKET,
1233					   ISC_MSG_ZEROPORT,
1234					   "dropping source port zero packet");
1235			}
1236			sock->recvbuf.remaining = 0;
1237			return;
1238		}
1239	} else if (sock->type == isc_sockettype_tcp) {
1240		dev->address = sock->address;
1241	}
1242
1243	/*
1244	 * Run through the list of buffers we were given, and find the
1245	 * first one with space.  Once it is found, loop through, filling
1246	 * the buffers as much as possible.
1247	 */
1248	buffer = ISC_LIST_HEAD(dev->bufferlist);
1249	if (buffer != NULL) { // Multi-buffer receive
1250		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1251			REQUIRE(ISC_BUFFER_VALID(buffer));
1252			if (isc_buffer_availablelength(buffer) > 0) {
1253				isc_buffer_availableregion(buffer, &r);
1254				copylen = min(r.length, sock->recvbuf.remaining);
1255				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1256				sock->recvbuf.consume_position += copylen;
1257				sock->recvbuf.remaining -= copylen;
1258				isc_buffer_add(buffer, copylen);
1259				dev->n += copylen;
1260			}
1261			buffer = ISC_LIST_NEXT(buffer, link);
1262		}
1263	} else { // Single-buffer receive
1264		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1265		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1266		sock->recvbuf.consume_position += copylen;
1267		sock->recvbuf.remaining -= copylen;
1268		dev->n += copylen;
1269	}
1270
1271	/*
1272	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1273	 * data in our receive buffer, and the caller only gave us
1274	 * 1k of space, we will toss the remaining 3k of data.  TCP
1275	 * will keep the extra data around and use it for later requests.
1276	 */
1277	if (sock->type == isc_sockettype_udp)
1278		sock->recvbuf.remaining = 0;
1279}
1280
1281/*
1282 * Copy out as much data from the internal buffer to done events.
1283 * As each done event is filled, send it along its way.
1284 */
1285static void
1286completeio_recv(isc_socket_t *sock)
1287{
1288	isc_socketevent_t *dev;
1289
1290	/*
1291	 * If we are in the process of filling our buffer, we cannot
1292	 * touch it yet, so don't.
1293	 */
1294	if (sock->pending_recv > 0)
1295		return;
1296
1297	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1298		dev = ISC_LIST_HEAD(sock->recv_list);
1299
1300		/*
1301		 * See if we have sufficient data in our receive buffer
1302		 * to handle this.  If we do, copy out the data.
1303		 */
1304		fill_recv(sock, dev);
1305
1306		/*
1307		 * Did we satisfy it?
1308		 */
1309		if (dev->n >= dev->minimum) {
1310			dev->result = ISC_R_SUCCESS;
1311			send_recvdone_event(sock, &dev);
1312		}
1313	}
1314}
1315
1316/*
1317 * Returns:
1318 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1319 *			ISC_R_SUCCESS.
1320 *
1321 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1322 *			dev->result contains the appropriate error.
1323 *
1324 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1325 *			event was sent.  The operation should be retried.
1326 *
1327 *	No other return values are possible.
1328 */
1329static int
1330completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1331		struct msghdr *messagehdr, int cc, int send_errno)
1332{
1333	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1334	char strbuf[ISC_STRERRORSIZE];
1335
1336	if (send_errno != 0) {
1337		if (SOFT_ERROR(send_errno))
1338			return (DOIO_SOFT);
1339
1340		return (map_socket_error(sock, send_errno, &dev->result,
1341			strbuf, sizeof(strbuf)));
1342
1343		/*
1344		 * The other error types depend on whether or not the
1345		 * socket is UDP or TCP.  If it is UDP, some errors
1346		 * that we expect to be fatal under TCP are merely
1347		 * annoying, and are really soft errors.
1348		 *
1349		 * However, these soft errors are still returned as
1350		 * a status.
1351		 */
1352		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1353		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1354		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1355				 addrbuf, strbuf);
1356		dev->result = isc__errno2result(send_errno);
1357		return (DOIO_HARD);
1358	}
1359
1360	/*
1361	 * If we write less than we expected, update counters, poke.
1362	 */
1363	dev->n += cc;
1364	if (cc != messagehdr->msg_totallen)
1365		return (DOIO_SOFT);
1366
1367	/*
1368	 * Exactly what we wanted to write.  We're done with this
1369	 * entry.  Post its completion event.
1370	 */
1371	dev->result = ISC_R_SUCCESS;
1372	return (DOIO_SUCCESS);
1373}
1374
1375static int
1376startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1377	     int *send_errno)
1378{
1379	char *cmsg = NULL;
1380	char strbuf[ISC_STRERRORSIZE];
1381	IoCompletionInfo *lpo;
1382	int status;
1383	struct msghdr *msghdr;
1384
1385	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1386					    HEAP_ZERO_MEMORY,
1387					    sizeof(IoCompletionInfo));
1388	RUNTIME_CHECK(lpo != NULL);
1389	lpo->request_type = SOCKET_SEND;
1390	lpo->dev = dev;
1391	msghdr = &lpo->messagehdr;
1392	memset(msghdr, 0, sizeof(struct msghdr));
1393	ISC_LIST_INIT(lpo->bufferlist);
1394
1395	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1396
1397	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1398
1399	if (*nbytes < 0) {
1400		/*
1401		 * I/O has been initiated
1402		 * completion will be through the completion port
1403		 */
1404		if (PENDING_ERROR(*send_errno)) {
1405			status = DOIO_PENDING;
1406			goto done;
1407		}
1408
1409		if (SOFT_ERROR(*send_errno)) {
1410			status = DOIO_SOFT;
1411			goto done;
1412		}
1413
1414		/*
1415		 * If we got this far then something is wrong
1416		 */
1417		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1418			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1419			socket_log(__LINE__, sock, NULL, IOEVENT,
1420				   isc_msgcat, ISC_MSGSET_SOCKET,
1421				   ISC_MSG_INTERNALSEND,
1422				   "startio_send: internal_sendmsg(%d) %d "
1423				   "bytes, err %d/%s",
1424				   sock->fd, *nbytes, *send_errno, strbuf);
1425		}
1426		status = DOIO_HARD;
1427		goto done;
1428	}
1429	dev->result = ISC_R_SUCCESS;
1430	status = DOIO_SOFT;
1431 done:
1432	_set_state(sock, SOCK_DATA);
1433	return (status);
1434}
1435
1436static isc_result_t
1437allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1438		isc_socket_t **socketp) {
1439	isc_socket_t *sock;
1440	isc_result_t result;
1441
1442	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1443
1444	if (sock == NULL)
1445		return (ISC_R_NOMEMORY);
1446
1447	sock->magic = 0;
1448	sock->references = 0;
1449
1450	sock->manager = manager;
1451	sock->type = type;
1452	sock->fd = INVALID_SOCKET;
1453
1454	ISC_LINK_INIT(sock, link);
1455
1456	/*
1457	 * set up list of readers and writers to be initially empty
1458	 */
1459	ISC_LIST_INIT(sock->recv_list);
1460	ISC_LIST_INIT(sock->send_list);
1461	ISC_LIST_INIT(sock->accept_list);
1462	sock->connect_ev = NULL;
1463	sock->pending_accept = 0;
1464	sock->pending_recv = 0;
1465	sock->pending_send = 0;
1466	sock->pending_iocp = 0;
1467	sock->listener = 0;
1468	sock->connected = 0;
1469	sock->pending_connect = 0;
1470	sock->bound = 0;
1471	sock->dupped = 0;
1472	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1473	_set_state(sock, SOCK_INITIALIZED);
1474
1475	sock->recvbuf.len = 65536;
1476	sock->recvbuf.consume_position = sock->recvbuf.base;
1477	sock->recvbuf.remaining = 0;
1478	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1479	if (sock->recvbuf.base == NULL) {
1480		sock->magic = 0;
1481		goto error;
1482	}
1483
1484	/*
1485	 * initialize the lock
1486	 */
1487	result = isc_mutex_init(&sock->lock);
1488	if (result != ISC_R_SUCCESS) {
1489		sock->magic = 0;
1490		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1491		sock->recvbuf.base = NULL;
1492		goto error;
1493	}
1494
1495	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1496		   "allocated");
1497
1498	sock->magic = SOCKET_MAGIC;
1499	*socketp = sock;
1500
1501	return (ISC_R_SUCCESS);
1502
1503 error:
1504	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1505
1506	return (result);
1507}
1508
1509/*
1510 * Verify that the socket state is consistent.
1511 */
1512static void
1513consistent(isc_socket_t *sock) {
1514
1515	isc_socketevent_t *dev;
1516	isc_socket_newconnev_t *nev;
1517	unsigned int count;
1518	char *crash_reason;
1519	isc_boolean_t crash = ISC_FALSE;
1520
1521	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1522		+ sock->pending_accept + sock->pending_connect);
1523
1524	dev = ISC_LIST_HEAD(sock->send_list);
1525	count = 0;
1526	while (dev != NULL) {
1527		count++;
1528		dev = ISC_LIST_NEXT(dev, ev_link);
1529	}
1530	if (count > sock->pending_send) {
1531		crash = ISC_TRUE;
1532		crash_reason = "send_list > sock->pending_send";
1533	}
1534
1535	nev = ISC_LIST_HEAD(sock->accept_list);
1536	count = 0;
1537	while (nev != NULL) {
1538		count++;
1539		nev = ISC_LIST_NEXT(nev, ev_link);
1540	}
1541	if (count > sock->pending_accept) {
1542		crash = ISC_TRUE;
1543		crash_reason = "send_list > sock->pending_send";
1544	}
1545
1546	if (crash) {
1547		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1548			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1549			   crash_reason);
1550		sock_dump(sock);
1551		INSIST(crash == ISC_FALSE);
1552	}
1553}
1554
1555/*
1556 * Maybe free the socket.
1557 *
1558 * This function will verify tht the socket is no longer in use in any way,
1559 * either internally or externally.  This is the only place where this
1560 * check is to be made; if some bit of code believes that IT is done with
1561 * the socket (e.g., some reference counter reaches zero), it should call
1562 * this function.
1563 *
1564 * When calling this function, the socket must be locked, and the manager
1565 * must be unlocked.
1566 *
1567 * When this function returns, *socketp will be NULL.  No tricks to try
1568 * to hold on to this pointer are allowed.
1569 */
1570static void
1571maybe_free_socket(isc_socket_t **socketp, int lineno) {
1572	isc_socket_t *sock = *socketp;
1573	*socketp = NULL;
1574
1575	INSIST(VALID_SOCKET(sock));
1576	CONSISTENT(sock);
1577
1578	if (sock->pending_iocp > 0
1579	    || sock->pending_recv > 0
1580	    || sock->pending_send > 0
1581	    || sock->pending_accept > 0
1582	    || sock->references > 0
1583	    || sock->pending_connect == 1
1584	    || !ISC_LIST_EMPTY(sock->recv_list)
1585	    || !ISC_LIST_EMPTY(sock->send_list)
1586	    || !ISC_LIST_EMPTY(sock->accept_list)
1587	    || sock->fd != INVALID_SOCKET) {
1588		UNLOCK(&sock->lock);
1589		return;
1590	}
1591	UNLOCK(&sock->lock);
1592
1593	free_socket(&sock, lineno);
1594}
1595
1596void
1597free_socket(isc_socket_t **sockp, int lineno) {
1598	isc_socketmgr_t *manager;
1599	isc_socket_t *sock = *sockp;
1600	*sockp = NULL;
1601
1602	manager = sock->manager;
1603
1604	/*
1605	 * Seems we can free the socket after all.
1606	 */
1607	manager = sock->manager;
1608	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1609		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1610		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1611
1612	sock->magic = 0;
1613	DESTROYLOCK(&sock->lock);
1614
1615	if (sock->recvbuf.base != NULL)
1616		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1617
1618	LOCK(&manager->lock);
1619	if (ISC_LINK_LINKED(sock, link))
1620		ISC_LIST_UNLINK(manager->socklist, sock, link);
1621	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1622
1623	if (ISC_LIST_EMPTY(manager->socklist))
1624		SIGNAL(&manager->shutdown_ok);
1625	UNLOCK(&manager->lock);
1626}
1627
/*
 * Create a new 'type' socket of protocol family 'pf' managed by
 * 'manager'.  The new socket is returned in '*socketp' with a reference
 * count of one.
 *
 * 'dup_socket' must currently be NULL; duplicating a socket is not
 * implemented and yields ISC_R_NOTIMPLEMENTED.
 *
 * Returns ISC_R_SUCCESS, or an error result if the socket structure or
 * the OS socket could not be created or configured.  On failure
 * '*socketp' is left untouched.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket)
{
	isc_socket_t *sock = NULL;
	isc_result_t result;
#if defined(USE_CMSG)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif
	int socket_errno;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	REQUIRE(type != isc_sockettype_fdwatch);

	/* Duplication is disabled; see the "#if 0" sections below. */
	if (dup_socket != NULL)
		return (ISC_R_NOTIMPLEMENTED);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	sock->pf = pf;
#if 0
	if (dup_socket == NULL) {
#endif
		/*
		 * Create the OS socket.  There is no default case: for any
		 * other type sock->fd keeps the INVALID_SOCKET value set by
		 * allocate_socket() and the failure path below is taken.
		 */
		switch (type) {
		case isc_sockettype_udp:
			sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
			if (sock->fd != INVALID_SOCKET) {
				result = connection_reset_fix(sock->fd);
				if (result != ISC_R_SUCCESS) {
					socket_log(__LINE__, sock,
						NULL, EVENT, NULL, 0, 0,
						"closed %d %d %d "
						"con_reset_fix_failed",
						sock->pending_recv,
						sock->pending_send,
						sock->references);
					closesocket(sock->fd);
					_set_state(sock, SOCK_CLOSED);
					sock->fd = INVALID_SOCKET;
					free_socket(&sock, __LINE__);
					return (result);
				}
			}
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		}
#if 0
	} else {
		/*
		 * XXX: dup() is deprecated in windows, use _dup()
		 * instead.  In future we may want to investigate
		 * WSADuplicateSocket().
		 */
		sock->fd = _dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
#endif

	if (sock->fd == INVALID_SOCKET) {
		/* Fetch the error code before free_socket() runs. */
		socket_errno = WSAGetLastError();
		free_socket(&sock, __LINE__);

		switch (socket_errno) {
		case WSAEMFILE:
		case WSAENOBUFS:
			return (ISC_R_NORESOURCES);

		case WSAEPROTONOSUPPORT:
		case WSAEPFNOSUPPORT:
		case WSAEAFNOSUPPORT:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "socket() %s: %s",
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* If this fails the new socket is closed and freed again. */
	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
			"closed %d %d %d make_nonblock_failed",
			sock->pending_recv, sock->pending_send,
			sock->references);
		closesocket(sock->fd);
		sock->fd = INVALID_SOCKET;
		free_socket(&sock, __LINE__);
		return (result);
	}


#if defined(USE_CMSG) || defined(SO_RCVBUF)
	/*
	 * UDP-only socket options.  A failure to set one is reported or
	 * ignored; none of them makes socket creation fail.
	 */
	if (type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(ISC_PLATFORM_HAVEIPV6)
#ifdef IPV6_RECVPKTINFO
		/* 2292bis */
		if ((pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (char *)&on, sizeof(on)) < 0)) {
			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* 2292 */
		if ((pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (char *)&on, sizeof(on)) < 0)) {
			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
		/* use minimum MTU */
		if (pf == AF_INET6) {
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_USE_MIN_MTU,
					 (char *)&on, sizeof(on));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(SO_RCVBUF)
	       /* Raise the receive buffer to RCVBUFSIZE if it is smaller. */
	       optlen = sizeof(size);
	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			      (char *)&size, &optlen) >= 0 &&
		    size < RCVBUFSIZE) {
		       size = RCVBUFSIZE;
		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
					(char *)&size, sizeof(size));
	       }
#endif

	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

	/* The socket is usable: hand the single reference to the caller. */
	_set_state(sock, SOCK_OPEN);
	sock->references = 1;
	*socketp = sock;

	/*
	 * NOTE(review): presumably associates the socket with the I/O
	 * completion port -- confirm against iocompletionport_update().
	 */
	iocompletionport_update(sock);

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */
	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
	InterlockedIncrement(&manager->totalSockets);
	UNLOCK(&manager->lock);

	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
		   ISC_MSGSET_SOCKET, ISC_MSG_CREATED,
		   "created %u type %u", sock->fd, type);

	return (ISC_R_SUCCESS);
}
1825
1826isc_result_t
1827isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1828		   isc_socket_t **socketp)
1829{
1830	return (socket_create(manager, pf, type, socketp, NULL));
1831}
1832
1833isc_result_t
1834isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
1835	REQUIRE(VALID_SOCKET(sock));
1836	REQUIRE(socketp != NULL && *socketp == NULL);
1837
1838#if 1
1839	return (ISC_R_NOTIMPLEMENTED);
1840#else
1841	return (socket_create(sock->manager, sock->pf, sock->type,
1842			      socketp, sock));
1843#endif
1844}
1845
1846isc_result_t
1847isc_socket_open(isc_socket_t *sock) {
1848	REQUIRE(VALID_SOCKET(sock));
1849	REQUIRE(sock->type != isc_sockettype_fdwatch);
1850
1851	return (ISC_R_NOTIMPLEMENTED);
1852}
1853
1854/*
1855 * Attach to a socket.  Caller must explicitly detach when it is done.
1856 */
1857void
1858isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1859	REQUIRE(VALID_SOCKET(sock));
1860	REQUIRE(socketp != NULL && *socketp == NULL);
1861
1862	LOCK(&sock->lock);
1863	CONSISTENT(sock);
1864	sock->references++;
1865	UNLOCK(&sock->lock);
1866
1867	*socketp = sock;
1868}
1869
1870/*
1871 * Dereference a socket.  If this is the last reference to it, clean things
1872 * up by destroying the socket.
1873 */
1874void
1875isc__socket_detach(isc_socket_t **socketp) {
1876	isc_socket_t *sock;
1877	isc_boolean_t kill_socket = ISC_FALSE;
1878
1879	REQUIRE(socketp != NULL);
1880	sock = *socketp;
1881	REQUIRE(VALID_SOCKET(sock));
1882	REQUIRE(sock->type != isc_sockettype_fdwatch);
1883
1884	LOCK(&sock->lock);
1885	CONSISTENT(sock);
1886	REQUIRE(sock->references > 0);
1887	sock->references--;
1888
1889	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1890		"detach_socket %d %d %d",
1891		sock->pending_recv, sock->pending_send,
1892		sock->references);
1893
1894	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1895		closesocket(sock->fd);
1896		sock->fd = INVALID_SOCKET;
1897		_set_state(sock, SOCK_CLOSED);
1898	}
1899
1900	maybe_free_socket(&sock, __LINE__);
1901
1902	*socketp = NULL;
1903}
1904
1905isc_result_t
1906isc_socket_close(isc_socket_t *sock) {
1907	REQUIRE(VALID_SOCKET(sock));
1908	REQUIRE(sock->type != isc_sockettype_fdwatch);
1909
1910	return (ISC_R_NOTIMPLEMENTED);
1911}
1912
1913/*
1914 * Dequeue an item off the given socket's read queue, set the result code
1915 * in the done event to the one provided, and send it to the task it was
1916 * destined for.
1917 *
1918 * If the event to be sent is on a list, remove it before sending.  If
1919 * asked to, send and detach from the task as well.
1920 *
1921 * Caller must have the socket locked if the event is attached to the socket.
1922 */
1923static void
1924send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1925	isc_task_t *task;
1926
1927	task = (*dev)->ev_sender;
1928	(*dev)->ev_sender = sock;
1929
1930	if (ISC_LINK_LINKED(*dev, ev_link))
1931		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1932
1933	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1934	    == ISC_SOCKEVENTATTR_ATTACHED)
1935		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1936	else
1937		isc_task_send(task, (isc_event_t **)dev);
1938
1939	CONSISTENT(sock);
1940}
1941
1942/*
1943 * See comments for send_recvdone_event() above.
1944 */
1945static void
1946send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1947	isc_task_t *task;
1948
1949	INSIST(dev != NULL && *dev != NULL);
1950
1951	task = (*dev)->ev_sender;
1952	(*dev)->ev_sender = sock;
1953
1954	if (ISC_LINK_LINKED(*dev, ev_link))
1955		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1956
1957	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1958	    == ISC_SOCKEVENTATTR_ATTACHED)
1959		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1960	else
1961		isc_task_send(task, (isc_event_t **)dev);
1962
1963	CONSISTENT(sock);
1964}
1965
1966/*
1967 * See comments for send_recvdone_event() above.
1968 */
1969static void
1970send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1971	isc_task_t *task;
1972
1973	INSIST(adev != NULL && *adev != NULL);
1974
1975	task = (*adev)->ev_sender;
1976	(*adev)->ev_sender = sock;
1977
1978	if (ISC_LINK_LINKED(*adev, ev_link))
1979		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1980
1981	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1982
1983	CONSISTENT(sock);
1984}
1985
1986/*
1987 * See comments for send_recvdone_event() above.
1988 */
1989static void
1990send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1991	isc_task_t *task;
1992
1993	INSIST(cdev != NULL && *cdev != NULL);
1994
1995	task = (*cdev)->ev_sender;
1996	(*cdev)->ev_sender = sock;
1997
1998	sock->connect_ev = NULL;
1999
2000	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
2001
2002	CONSISTENT(sock);
2003}
2004
2005/*
2006 * On entry to this function, the event delivered is the internal
2007 * readable event, and the first item on the accept_list should be
2008 * the done event we want to send.  If the list is empty, this is a no-op,
2009 * so just close the new connection, unlock, and return.
2010 *
2011 * Note the socket is locked before entering here
2012 */
2013static void
2014internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
2015	isc_socket_newconnev_t *adev;
2016	isc_result_t result = ISC_R_SUCCESS;
2017	isc_socket_t *nsock;
2018	struct sockaddr *localaddr;
2019	int localaddr_len = sizeof(*localaddr);
2020	struct sockaddr *remoteaddr;
2021	int remoteaddr_len = sizeof(*remoteaddr);
2022
2023	INSIST(VALID_SOCKET(sock));
2024	LOCK(&sock->lock);
2025	CONSISTENT(sock);
2026
2027	socket_log(__LINE__, sock, NULL, TRACE,
2028		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2029		   "internal_accept called");
2030
2031	INSIST(sock->listener);
2032
2033	INSIST(sock->pending_iocp > 0);
2034	sock->pending_iocp--;
2035	INSIST(sock->pending_accept > 0);
2036	sock->pending_accept--;
2037
2038	adev = lpo->adev;
2039
2040	/*
2041	 * If the event is no longer in the list we can just return.
2042	 */
2043	if (!acceptdone_is_active(sock, adev))
2044		goto done;
2045
2046	nsock = adev->newsocket;
2047
2048	/*
2049	 * Pull off the done event.
2050	 */
2051	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2052
2053	/*
2054	 * Extract the addresses from the socket, copy them into the structure,
2055	 * and return the new socket.
2056	 */
2057	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2058		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2059		(LPSOCKADDR *)&localaddr, &localaddr_len,
2060		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2061	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
2062	adev->address.length = remoteaddr_len;
2063	nsock->address = adev->address;
2064	nsock->pf = adev->address.type.sa.sa_family;
2065
2066	socket_log(__LINE__, nsock, &nsock->address, TRACE,
2067		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2068		   "internal_accept parent %p", sock);
2069
2070	result = make_nonblock(adev->newsocket->fd);
2071	INSIST(result == ISC_R_SUCCESS);
2072
2073	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2074			  (char *)&sock->fd, sizeof(sock->fd)) == 0);
2075
2076	/*
2077	 * Hook it up into the manager.
2078	 */
2079	nsock->bound = 1;
2080	nsock->connected = 1;
2081	_set_state(nsock, SOCK_OPEN);
2082
2083	LOCK(&nsock->manager->lock);
2084	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2085	InterlockedIncrement(&nsock->manager->totalSockets);
2086	UNLOCK(&nsock->manager->lock);
2087
2088	socket_log(__LINE__, sock, &nsock->address, CREATION,
2089		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2090		   "accepted_connection new_socket %p fd %d",
2091		   nsock, nsock->fd);
2092
2093	adev->result = result;
2094	send_acceptdone_event(sock, &adev);
2095
2096done:
2097	CONSISTENT(sock);
2098	UNLOCK(&sock->lock);
2099
2100	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2101	lpo->acceptbuffer = NULL;
2102}
2103
2104/*
2105 * Called when a socket with a pending connect() finishes.
2106 * Note that the socket is locked before entering.
2107 */
2108static void
2109internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2110	isc_socket_connev_t *cdev;
2111	char strbuf[ISC_STRERRORSIZE];
2112
2113	INSIST(VALID_SOCKET(sock));
2114
2115	LOCK(&sock->lock);
2116
2117	INSIST(sock->pending_iocp > 0);
2118	sock->pending_iocp--;
2119	INSIST(sock->pending_connect == 1);
2120	sock->pending_connect = 0;
2121
2122	/*
2123	 * Has this event been canceled?
2124	 */
2125	cdev = lpo->cdev;
2126	if (!connectdone_is_active(sock, cdev)) {
2127		sock->pending_connect = 0;
2128		if (sock->fd != INVALID_SOCKET) {
2129			closesocket(sock->fd);
2130			sock->fd = INVALID_SOCKET;
2131			_set_state(sock, SOCK_CLOSED);
2132		}
2133		CONSISTENT(sock);
2134		UNLOCK(&sock->lock);
2135		return;
2136	}
2137
2138	/*
2139	 * Check possible Windows network event error status here.
2140	 */
2141	if (connect_errno != 0) {
2142		/*
2143		 * If the error is SOFT, just try again on this
2144		 * fd and pretend nothing strange happened.
2145		 */
2146		if (SOFT_ERROR(connect_errno) ||
2147		    connect_errno == WSAEINPROGRESS) {
2148			sock->pending_connect = 1;
2149			CONSISTENT(sock);
2150			UNLOCK(&sock->lock);
2151			return;
2152		}
2153
2154		/*
2155		 * Translate other errors into ISC_R_* flavors.
2156		 */
2157		switch (connect_errno) {
2158#define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2159			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2160			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2161			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2162			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2163			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2164			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2165			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2166			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2167			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2168			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2169			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2170			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2171#undef ERROR_MATCH
2172		default:
2173			cdev->result = ISC_R_UNEXPECTED;
2174			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2175			UNEXPECTED_ERROR(__FILE__, __LINE__,
2176					 "internal_connect: connect() %s",
2177					 strbuf);
2178		}
2179	} else {
2180		INSIST(setsockopt(sock->fd, SOL_SOCKET,
2181				  SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2182		cdev->result = ISC_R_SUCCESS;
2183		sock->connected = 1;
2184		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2185			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2186			   "internal_connect: success");
2187	}
2188
2189	send_connectdone_event(sock, &cdev);
2190
2191	UNLOCK(&sock->lock);
2192}
2193
2194/*
2195 * Loop through the socket, returning ISC_R_EOF for each done event pending.
2196 */
2197static void
2198send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2199	isc_socketevent_t *dev;
2200
2201	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2202		dev = ISC_LIST_HEAD(sock->recv_list);
2203		dev->result = result;
2204		send_recvdone_event(sock, &dev);
2205	}
2206}
2207
2208/*
2209 * Take the data we received in our private buffer, and if any recv() calls on
2210 * our list are satisfied, send the corresponding done event.
2211 *
2212 * If we need more data (there are still items on the recv_list after we consume all
2213 * our data) then arrange for another system recv() call to fill our buffers.
2214 */
2215static void
2216internal_recv(isc_socket_t *sock, int nbytes)
2217{
2218	INSIST(VALID_SOCKET(sock));
2219
2220	LOCK(&sock->lock);
2221	CONSISTENT(sock);
2222
2223	socket_log(__LINE__, sock, NULL, IOEVENT,
2224		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2225		   "internal_recv: %d bytes received", nbytes);
2226
2227	/*
2228	 * If we got here, the I/O operation succeeded.  However, we might still have removed this
2229	 * event from our notification list (or never placed it on it due to immediate completion.)
2230	 * Handle the reference counting here, and handle the cancellation event just after.
2231	 */
2232	INSIST(sock->pending_iocp > 0);
2233	sock->pending_iocp--;
2234	INSIST(sock->pending_recv > 0);
2235	sock->pending_recv--;
2236
2237	/*
2238	 * The only way we could have gotten here is that our I/O has successfully completed.
2239	 * Update our pointers, and move on.  The only odd case here is that we might not
2240	 * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2241	 * this is the case, we will re-issue the recv() call for what we need.
2242	 *
2243	 * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2244	 * has closed.
2245	 */
2246	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2247		send_recvdone_abort(sock, ISC_R_EOF);
2248		maybe_free_socket(&sock, __LINE__);
2249		return;
2250	}
2251	sock->recvbuf.remaining = nbytes;
2252	sock->recvbuf.consume_position = sock->recvbuf.base;
2253	completeio_recv(sock);
2254
2255	/*
2256	 * If there are more receivers waiting for data, queue another receive
2257	 * here.
2258	 */
2259	queue_receive_request(sock);
2260
2261	/*
2262	 * Unlock and/or destroy if we are the last thing this socket has left to do.
2263	 */
2264	maybe_free_socket(&sock, __LINE__);
2265}
2266
2267static void
2268internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2269	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2270{
2271	buflist_t *buffer;
2272
2273	/*
2274	 * Find out what socket this is and lock it.
2275	 */
2276	INSIST(VALID_SOCKET(sock));
2277
2278	LOCK(&sock->lock);
2279	CONSISTENT(sock);
2280
2281	socket_log(__LINE__, sock, NULL, IOEVENT,
2282		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2283		   "internal_send: task got socket event %p", dev);
2284
2285	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2286	while (buffer != NULL) {
2287		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2288
2289		socket_log(__LINE__, sock, NULL, TRACE,
2290		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2291		   "free_buffer %p %p", buffer, buffer->buf);
2292
2293		HeapFree(hHeapHandle, 0, buffer->buf);
2294		HeapFree(hHeapHandle, 0, buffer);
2295		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2296	}
2297
2298	INSIST(sock->pending_iocp > 0);
2299	sock->pending_iocp--;
2300	INSIST(sock->pending_send > 0);
2301	sock->pending_send--;
2302
2303	/* If the event is no longer in the list we can just return */
2304	if (!senddone_is_active(sock, dev))
2305		goto done;
2306
2307	/*
2308	 * Set the error code and send things on its way.
2309	 */
2310	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2311	case DOIO_SOFT:
2312		break;
2313	case DOIO_HARD:
2314	case DOIO_SUCCESS:
2315		send_senddone_event(sock, &dev);
2316		break;
2317	}
2318
2319 done:
2320	maybe_free_socket(&sock, __LINE__);
2321}
2322
2323/*
2324 * These return if the done event passed in is on the list (or for connect, is
2325 * the one we're waiting for.  Using these ensures we will not double-send an
2326 * event.
2327 */
2328static isc_boolean_t
2329senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2330{
2331	isc_socketevent_t *ldev;
2332
2333	ldev = ISC_LIST_HEAD(sock->send_list);
2334	while (ldev != NULL && ldev != dev)
2335		ldev = ISC_LIST_NEXT(ldev, ev_link);
2336
2337	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2338}
2339
2340static isc_boolean_t
2341acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2342{
2343	isc_socket_newconnev_t *ldev;
2344
2345	ldev = ISC_LIST_HEAD(sock->accept_list);
2346	while (ldev != NULL && ldev != dev)
2347		ldev = ISC_LIST_NEXT(ldev, ev_link);
2348
2349	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2350}
2351
2352static isc_boolean_t
2353connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2354{
2355	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2356}
2357
2358//
2359// The Windows network stack seems to have two very distinct paths depending
2360// on what is installed.  Specifically, if something is looking at network
2361// connections (like an anti-virus or anti-malware application, such as
2362// McAfee products) Windows may return additional error conditions which
2363// were not previously returned.
2364//
2365// One specific one is when a TCP SYN scan is used.  In this situation,
2366// Windows responds with the SYN-ACK, but the scanner never responds with
// the 3rd packet, the ACK.  Windows considers this a partially open connection.
2368// Most Unix networking stacks, and Windows without McAfee installed, will
2369// not return this to the caller.  However, with this product installed,
2370// Windows returns this as a failed status on the Accept() call.  Here, we
2371// will just re-issue the ISCAcceptEx() call as if nothing had happened.
2372//
2373// This code should only be called when the listening socket has received
2374// such an error.  Additionally, the "parent" socket must be locked.
2375// Additionally, the lpo argument is re-used here, and must not be freed
2376// by the caller.
2377//
2378static isc_result_t
2379restart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2380{
2381	isc_socket_t *nsock = lpo->adev->newsocket;
2382	SOCKET new_fd;
2383
2384	/*
2385	 * AcceptEx() requires we pass in a socket.  Note that we carefully
2386	 * do not close the previous socket in case of an error message returned by
2387	 * our new socket() call.  If we return an error here, our caller will
2388	 * clean up.
2389	 */
2390	new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2391	if (nsock->fd == INVALID_SOCKET) {
2392		return (ISC_R_FAILURE); // parent will ask windows for error message
2393	}
2394	closesocket(nsock->fd);
2395	nsock->fd = new_fd;
2396
2397	memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2398
2399	ISCAcceptEx(parent->fd,
2400		    nsock->fd,				/* Accepted Socket */
2401		    lpo->acceptbuffer,			/* Buffer for initial Recv */
2402		    0,					/* Length of Buffer */
2403		    sizeof(SOCKADDR_STORAGE) + 16,	/* Local address length + 16 */
2404		    sizeof(SOCKADDR_STORAGE) + 16,	/* Remote address lengh + 16 */
2405		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
2406		    (LPOVERLAPPED)lpo			/* Overlapped structure */
2407		    );
2408
2409	InterlockedDecrement(&nsock->manager->iocp_total);
2410	iocompletionport_update(nsock);
2411
2412	return (ISC_R_SUCCESS);
2413}
2414
2415/*
2416 * This is the I/O Completion Port Worker Function. It loops forever
2417 * waiting for I/O to complete and then forwards them for further
2418 * processing. There are a number of these in separate threads.
2419 */
2420static isc_threadresult_t WINAPI
2421SocketIoThread(LPVOID ThreadContext) {
2422	isc_socketmgr_t *manager = ThreadContext;
2423	BOOL bSuccess = FALSE;
2424	DWORD nbytes;
2425	IoCompletionInfo *lpo = NULL;
2426	isc_socket_t *sock = NULL;
2427	int request;
2428	struct msghdr *messagehdr = NULL;
2429	int errval;
2430	char strbuf[ISC_STRERRORSIZE];
2431	int errstatus;
2432
2433	REQUIRE(VALID_MANAGER(manager));
2434
2435	/*
2436	 * Set the thread priority high enough so I/O will
2437	 * preempt normal recv packet processing, but not
2438	 * higher than the timer sync thread.
2439	 */
2440	if (!SetThreadPriority(GetCurrentThread(),
2441			       THREAD_PRIORITY_ABOVE_NORMAL)) {
2442		errval = GetLastError();
2443		isc__strerror(errval, strbuf, sizeof(strbuf));
2444		FATAL_ERROR(__FILE__, __LINE__,
2445				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2446				ISC_MSG_FAILED,
2447				"Can't set thread priority: %s"),
2448				strbuf);
2449	}
2450
2451	/*
2452	 * Loop forever waiting on I/O Completions and then processing them
2453	 */
2454	while (TRUE) {
2455		wait_again:
2456		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2457						     &nbytes, (LPDWORD)&sock,
2458						     (LPWSAOVERLAPPED *)&lpo,
2459						     INFINITE);
2460		if (lpo == NULL) /* Received request to exit */
2461			break;
2462
2463		REQUIRE(VALID_SOCKET(sock));
2464
2465		request = lpo->request_type;
2466
2467		errstatus = 0;
2468		if (!bSuccess) {
2469			isc_result_t isc_result;
2470
2471			/*
2472			 * Did the I/O operation complete?
2473			 */
2474			errstatus = GetLastError();
2475			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2476
2477			LOCK(&sock->lock);
2478			CONSISTENT(sock);
2479			switch (request) {
2480			case SOCKET_RECV:
2481				INSIST(sock->pending_iocp > 0);
2482				sock->pending_iocp--;
2483				INSIST(sock->pending_recv > 0);
2484				sock->pending_recv--;
2485				if (!sock->connected &&
2486				    ((errstatus == ERROR_HOST_UNREACHABLE) ||
2487				     (errstatus == WSAENETRESET) ||
2488				     (errstatus == WSAECONNRESET))) {
2489					/* ignore soft errors */
2490					queue_receive_request(sock);
2491					break;
2492				}
2493				send_recvdone_abort(sock, isc_result);
2494				if (isc_result == ISC_R_UNEXPECTED) {
2495					UNEXPECTED_ERROR(__FILE__, __LINE__,
2496						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2497						errstatus, isc_result);
2498				}
2499				break;
2500
2501			case SOCKET_SEND:
2502				INSIST(sock->pending_iocp > 0);
2503				sock->pending_iocp--;
2504				INSIST(sock->pending_send > 0);
2505				sock->pending_send--;
2506				if (senddone_is_active(sock, lpo->dev)) {
2507					lpo->dev->result = isc_result;
2508					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2509						"canceled_send");
2510					send_senddone_event(sock, &lpo->dev);
2511				}
2512				break;
2513
2514			case SOCKET_ACCEPT:
2515				INSIST(sock->pending_iocp > 0);
2516				INSIST(sock->pending_accept > 0);
2517
2518				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2519					"Accept: errstatus=%d isc_result=%d", errstatus, isc_result);
2520
2521				if (acceptdone_is_active(sock, lpo->adev)) {
2522					if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
2523						UNLOCK(&sock->lock);
2524						goto wait_again;
2525					} else {
2526						errstatus = GetLastError();
2527						isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2528						socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2529							"restart_accept() failed: errstatus=%d isc_result=%d",
2530							errstatus, isc_result);
2531					}
2532				}
2533
2534				sock->pending_iocp--;
2535				sock->pending_accept--;
2536				if (acceptdone_is_active(sock, lpo->adev)) {
2537					closesocket(lpo->adev->newsocket->fd);
2538					lpo->adev->newsocket->fd = INVALID_SOCKET;
2539					lpo->adev->newsocket->references--;
2540					free_socket(&lpo->adev->newsocket, __LINE__);
2541					lpo->adev->result = isc_result;
2542					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2543						"canceled_accept");
2544					send_acceptdone_event(sock, &lpo->adev);
2545				}
2546				break;
2547
2548			case SOCKET_CONNECT:
2549				INSIST(sock->pending_iocp > 0);
2550				sock->pending_iocp--;
2551				INSIST(sock->pending_connect == 1);
2552				sock->pending_connect = 0;
2553				if (connectdone_is_active(sock, lpo->cdev)) {
2554					lpo->cdev->result = isc_result;
2555					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2556						"canceled_connect");
2557					send_connectdone_event(sock, &lpo->cdev);
2558				}
2559				break;
2560			}
2561			maybe_free_socket(&sock, __LINE__);
2562
2563			if (lpo != NULL)
2564				HeapFree(hHeapHandle, 0, lpo);
2565			continue;
2566		}
2567
2568		messagehdr = &lpo->messagehdr;
2569
2570		switch (request) {
2571		case SOCKET_RECV:
2572			internal_recv(sock, nbytes);
2573			break;
2574		case SOCKET_SEND:
2575			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2576			break;
2577		case SOCKET_ACCEPT:
2578			internal_accept(sock, lpo, errstatus);
2579			break;
2580		case SOCKET_CONNECT:
2581			internal_connect(sock, lpo, errstatus);
2582			break;
2583		}
2584
2585		if (lpo != NULL)
2586			HeapFree(hHeapHandle, 0, lpo);
2587	}
2588
2589	/*
2590	 * Exit Completion Port Thread
2591	 */
2592	manager_log(manager, TRACE,
2593		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2594				   ISC_MSG_EXITING, "SocketIoThread exiting"));
2595	return ((isc_threadresult_t)0);
2596}
2597
2598/*
2599 * Create a new socket manager.
2600 */
2601isc_result_t
2602isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2603	return (isc_socketmgr_create2(mctx, managerp, 0));
2604}
2605
2606isc_result_t
2607isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2608		       unsigned int maxsocks)
2609{
2610	isc_socketmgr_t *manager;
2611	isc_result_t result;
2612
2613	REQUIRE(managerp != NULL && *managerp == NULL);
2614
2615	if (maxsocks != 0)
2616		return (ISC_R_NOTIMPLEMENTED);
2617
2618	manager = isc_mem_get(mctx, sizeof(*manager));
2619	if (manager == NULL)
2620		return (ISC_R_NOMEMORY);
2621
2622	InitSockets();
2623
2624	manager->magic = SOCKET_MANAGER_MAGIC;
2625	manager->mctx = NULL;
2626	manager->stats = NULL;
2627	ISC_LIST_INIT(manager->socklist);
2628	result = isc_mutex_init(&manager->lock);
2629	if (result != ISC_R_SUCCESS) {
2630		isc_mem_put(mctx, manager, sizeof(*manager));
2631		return (result);
2632	}
2633	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2634		DESTROYLOCK(&manager->lock);
2635		isc_mem_put(mctx, manager, sizeof(*manager));
2636		UNEXPECTED_ERROR(__FILE__, __LINE__,
2637				 "isc_condition_init() %s",
2638				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2639						ISC_MSG_FAILED, "failed"));
2640		return (ISC_R_UNEXPECTED);
2641	}
2642
2643	isc_mem_attach(mctx, &manager->mctx);
2644
2645	iocompletionport_init(manager);	/* Create the Completion Ports */
2646
2647	manager->bShutdown = ISC_FALSE;
2648	manager->totalSockets = 0;
2649	manager->iocp_total = 0;
2650
2651	*managerp = manager;
2652
2653	return (ISC_R_SUCCESS);
2654}
2655
/*
 * Query the maximum number of sockets the manager supports.
 *
 * Not implemented in this file: the arguments are validated, '*nsockp' is
 * left untouched, and ISC_R_NOTIMPLEMENTED is always returned.
 */
isc_result_t
isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	return (ISC_R_NOTIMPLEMENTED);
}
2663
/*
 * Attach the statistics object 'stats' to 'manager'.
 *
 * Requires that the manager is valid, that no sockets are on its socket
 * list yet, that no statistics object is already attached, and that
 * 'stats' has exactly isc_sockstatscounter_max counters.
 */
void
isc__socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	/* Stored in manager->stats; released in isc__socketmgr_destroy(). */
	isc_stats_attach(stats, &manager->stats);
}
2673
2674void
2675isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2676	isc_socketmgr_t *manager;
2677	int i;
2678	isc_mem_t *mctx;
2679
2680	/*
2681	 * Destroy a socket manager.
2682	 */
2683
2684	REQUIRE(managerp != NULL);
2685	manager = *managerp;
2686	REQUIRE(VALID_MANAGER(manager));
2687
2688	LOCK(&manager->lock);
2689
2690	/*
2691	 * Wait for all sockets to be destroyed.
2692	 */
2693	while (!ISC_LIST_EMPTY(manager->socklist)) {
2694		manager_log(manager, CREATION,
2695			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2696					   ISC_MSG_SOCKETSREMAIN,
2697					   "sockets exist"));
2698		WAIT(&manager->shutdown_ok, &manager->lock);
2699	}
2700
2701	UNLOCK(&manager->lock);
2702
2703	/*
2704	 * Here, we need to had some wait code for the completion port
2705	 * thread.
2706	 */
2707	signal_iocompletionport_exit(manager);
2708	manager->bShutdown = ISC_TRUE;
2709
2710	/*
2711	 * Wait for threads to exit.
2712	 */
2713	for (i = 0; i < manager->maxIOCPThreads; i++) {
2714		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2715			NULL) != ISC_R_SUCCESS)
2716			UNEXPECTED_ERROR(__FILE__, __LINE__,
2717				 "isc_thread_join() for Completion Port %s",
2718				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2719						ISC_MSG_FAILED, "failed"));
2720	}
2721	/*
2722	 * Clean up.
2723	 */
2724
2725	CloseHandle(manager->hIoCompletionPort);
2726
2727	(void)isc_condition_destroy(&manager->shutdown_ok);
2728
2729	DESTROYLOCK(&manager->lock);
2730	if (manager->stats != NULL)
2731		isc_stats_detach(&manager->stats);
2732	manager->magic = 0;
2733	mctx= manager->mctx;
2734	isc_mem_put(mctx, manager, sizeof(*manager));
2735
2736	isc_mem_detach(&mctx);
2737
2738	*managerp = NULL;
2739}
2740
2741static void
2742queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2743{
2744	isc_task_t *ntask = NULL;
2745
2746	isc_task_attach(task, &ntask);
2747	dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2748
2749	/*
2750	 * Enqueue the request.
2751	 */
2752	INSIST(!ISC_LINK_LINKED(dev, ev_link));
2753	ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2754
2755	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2756		   "queue_receive_event: event %p -> task %p",
2757		   dev, ntask);
2758}
2759
2760/*
2761 * Check the pending receive queue, and if we have data pending, give it to this
2762 * caller.  If we have none, queue an I/O request.  If this caller is not the first
2763 * on the list, then we will just queue this event and return.
2764 *
2765 * Caller must have the socket locked.
2766 */
2767static isc_result_t
2768socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2769	    unsigned int flags)
2770{
2771	int cc = 0;
2772	isc_task_t *ntask = NULL;
2773	isc_result_t result = ISC_R_SUCCESS;
2774	int recv_errno = 0;
2775
2776	dev->ev_sender = task;
2777
2778	if (sock->fd == INVALID_SOCKET)
2779		return (ISC_R_EOF);
2780
2781	/*
2782	 * Queue our event on the list of things to do.  Call our function to
2783	 * attempt to fill buffers as much as possible, and return done events.
2784	 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2785	 * here and tell our caller that we could not satisfy it immediately.
2786	 */
2787	queue_receive_event(sock, task, dev);
2788	if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2789		result = ISC_R_INPROGRESS;
2790
2791	completeio_recv(sock);
2792
2793	/*
2794	 * If there are more receivers waiting for data, queue another receive
2795	 * here.  If the
2796	 */
2797	queue_receive_request(sock);
2798
2799	return (result);
2800}
2801
2802isc_result_t
2803isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2804		 unsigned int minimum, isc_task_t *task,
2805		 isc_taskaction_t action, const void *arg)
2806{
2807	isc_socketevent_t *dev;
2808	isc_socketmgr_t *manager;
2809	unsigned int iocount;
2810	isc_buffer_t *buffer;
2811	isc_result_t ret;
2812
2813	REQUIRE(VALID_SOCKET(sock));
2814	LOCK(&sock->lock);
2815	CONSISTENT(sock);
2816
2817	/*
2818	 * Make sure that the socket is not closed.  XXXMLG change error here?
2819	 */
2820	if (sock->fd == INVALID_SOCKET) {
2821		UNLOCK(&sock->lock);
2822		return (ISC_R_CONNREFUSED);
2823	}
2824
2825	REQUIRE(buflist != NULL);
2826	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2827	REQUIRE(task != NULL);
2828	REQUIRE(action != NULL);
2829
2830	manager = sock->manager;
2831	REQUIRE(VALID_MANAGER(manager));
2832
2833	iocount = isc_bufferlist_availablecount(buflist);
2834	REQUIRE(iocount > 0);
2835
2836	INSIST(sock->bound);
2837
2838	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2839	if (dev == NULL) {
2840		UNLOCK(&sock->lock);
2841		return (ISC_R_NOMEMORY);
2842	}
2843
2844	/*
2845	 * UDP sockets are always partial read
2846	 */
2847	if (sock->type == isc_sockettype_udp)
2848		dev->minimum = 1;
2849	else {
2850		if (minimum == 0)
2851			dev->minimum = iocount;
2852		else
2853			dev->minimum = minimum;
2854	}
2855
2856	/*
2857	 * Move each buffer from the passed in list to our internal one.
2858	 */
2859	buffer = ISC_LIST_HEAD(*buflist);
2860	while (buffer != NULL) {
2861		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2862		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2863		buffer = ISC_LIST_HEAD(*buflist);
2864	}
2865
2866	ret = socket_recv(sock, dev, task, 0);
2867
2868	UNLOCK(&sock->lock);
2869	return (ret);
2870}
2871
2872isc_result_t
2873isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
2874		 unsigned int minimum, isc_task_t *task,
2875		 isc_taskaction_t action, const void *arg)
2876{
2877	isc_socketevent_t *dev;
2878	isc_socketmgr_t *manager;
2879	isc_result_t ret;
2880
2881	REQUIRE(VALID_SOCKET(sock));
2882	LOCK(&sock->lock);
2883	CONSISTENT(sock);
2884
2885	/*
2886	 * make sure that the socket's not closed
2887	 */
2888	if (sock->fd == INVALID_SOCKET) {
2889		UNLOCK(&sock->lock);
2890		return (ISC_R_CONNREFUSED);
2891	}
2892	REQUIRE(action != NULL);
2893
2894	manager = sock->manager;
2895	REQUIRE(VALID_MANAGER(manager));
2896
2897	INSIST(sock->bound);
2898
2899	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2900	if (dev == NULL) {
2901		UNLOCK(&sock->lock);
2902		return (ISC_R_NOMEMORY);
2903	}
2904
2905	ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2906	UNLOCK(&sock->lock);
2907	return (ret);
2908}
2909
2910isc_result_t
2911isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
2912		  unsigned int minimum, isc_task_t *task,
2913		  isc_socketevent_t *event, unsigned int flags)
2914{
2915	isc_result_t ret;
2916
2917	REQUIRE(VALID_SOCKET(sock));
2918	LOCK(&sock->lock);
2919	CONSISTENT(sock);
2920
2921	event->result = ISC_R_UNEXPECTED;
2922	event->ev_sender = sock;
2923	/*
2924	 * make sure that the socket's not closed
2925	 */
2926	if (sock->fd == INVALID_SOCKET) {
2927		UNLOCK(&sock->lock);
2928		return (ISC_R_CONNREFUSED);
2929	}
2930
2931	ISC_LIST_INIT(event->bufferlist);
2932	event->region = *region;
2933	event->n = 0;
2934	event->offset = 0;
2935	event->attributes = 0;
2936
2937	/*
2938	 * UDP sockets are always partial read.
2939	 */
2940	if (sock->type == isc_sockettype_udp)
2941		event->minimum = 1;
2942	else {
2943		if (minimum == 0)
2944			event->minimum = region->length;
2945		else
2946			event->minimum = minimum;
2947	}
2948
2949	ret = socket_recv(sock, event, task, flags);
2950	UNLOCK(&sock->lock);
2951	return (ret);
2952}
2953
2954/*
2955 * Caller must have the socket locked.
2956 */
2957static isc_result_t
2958socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2959	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2960	    unsigned int flags)
2961{
2962	int io_state;
2963	int send_errno = 0;
2964	int cc = 0;
2965	isc_task_t *ntask = NULL;
2966	isc_result_t result = ISC_R_SUCCESS;
2967
2968	dev->ev_sender = task;
2969
2970	set_dev_address(address, sock, dev);
2971	if (pktinfo != NULL) {
2972		socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2973			   ISC_MSG_PKTINFOPROVIDED,
2974			   "pktinfo structure provided, ifindex %u (set to 0)",
2975			   pktinfo->ipi6_ifindex);
2976
2977		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2978		dev->pktinfo = *pktinfo;
2979		/*
2980		 * Set the pktinfo index to 0 here, to let the kernel decide
2981		 * what interface it should send on.
2982		 */
2983		dev->pktinfo.ipi6_ifindex = 0;
2984	}
2985
2986	io_state = startio_send(sock, dev, &cc, &send_errno);
2987	switch (io_state) {
2988	case DOIO_PENDING:	/* I/O started. Nothing more to do */
2989	case DOIO_SOFT:
2990		/*
2991		 * We couldn't send all or part of the request right now, so
2992		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2993		 */
2994		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2995			isc_task_attach(task, &ntask);
2996			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2997
2998			/*
2999			 * Enqueue the request.
3000			 */
3001			INSIST(!ISC_LINK_LINKED(dev, ev_link));
3002			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3003
3004			socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
3005				   "socket_send: event %p -> task %p",
3006				   dev, ntask);
3007
3008			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3009				result = ISC_R_INPROGRESS;
3010			break;
3011		}
3012
3013	case DOIO_SUCCESS:
3014		break;
3015	}
3016
3017	return (result);
3018}
3019
3020isc_result_t
3021isc__socket_send(isc_socket_t *sock, isc_region_t *region,
3022		 isc_task_t *task, isc_taskaction_t action, const void *arg)
3023{
3024	/*
3025	 * REQUIRE() checking is performed in isc_socket_sendto().
3026	 */
3027	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3028				  NULL));
3029}
3030
3031isc_result_t
3032isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
3033		   isc_task_t *task, isc_taskaction_t action, const void *arg,
3034		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3035{
3036	isc_socketevent_t *dev;
3037	isc_socketmgr_t *manager;
3038	isc_result_t ret;
3039
3040	REQUIRE(VALID_SOCKET(sock));
3041	REQUIRE(sock->type != isc_sockettype_fdwatch);
3042
3043	LOCK(&sock->lock);
3044	CONSISTENT(sock);
3045
3046	/*
3047	 * make sure that the socket's not closed
3048	 */
3049	if (sock->fd == INVALID_SOCKET) {
3050		UNLOCK(&sock->lock);
3051		return (ISC_R_CONNREFUSED);
3052	}
3053	REQUIRE(region != NULL);
3054	REQUIRE(task != NULL);
3055	REQUIRE(action != NULL);
3056
3057	manager = sock->manager;
3058	REQUIRE(VALID_MANAGER(manager));
3059
3060	INSIST(sock->bound);
3061
3062	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3063	if (dev == NULL) {
3064		UNLOCK(&sock->lock);
3065		return (ISC_R_NOMEMORY);
3066	}
3067	dev->region = *region;
3068
3069	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3070	UNLOCK(&sock->lock);
3071	return (ret);
3072}
3073
3074isc_result_t
3075isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3076		  isc_task_t *task, isc_taskaction_t action, const void *arg)
3077{
3078	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3079				   NULL));
3080}
3081
3082isc_result_t
3083isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3084		    isc_task_t *task, isc_taskaction_t action, const void *arg,
3085		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3086{
3087	isc_socketevent_t *dev;
3088	isc_socketmgr_t *manager;
3089	unsigned int iocount;
3090	isc_buffer_t *buffer;
3091	isc_result_t ret;
3092
3093	REQUIRE(VALID_SOCKET(sock));
3094
3095	LOCK(&sock->lock);
3096	CONSISTENT(sock);
3097
3098	/*
3099	 * make sure that the socket's not closed
3100	 */
3101	if (sock->fd == INVALID_SOCKET) {
3102		UNLOCK(&sock->lock);
3103		return (ISC_R_CONNREFUSED);
3104	}
3105	REQUIRE(buflist != NULL);
3106	REQUIRE(!ISC_LIST_EMPTY(*buflist));
3107	REQUIRE(task != NULL);
3108	REQUIRE(action != NULL);
3109
3110	manager = sock->manager;
3111	REQUIRE(VALID_MANAGER(manager));
3112
3113	iocount = isc_bufferlist_usedcount(buflist);
3114	REQUIRE(iocount > 0);
3115
3116	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3117	if (dev == NULL) {
3118		UNLOCK(&sock->lock);
3119		return (ISC_R_NOMEMORY);
3120	}
3121
3122	/*
3123	 * Move each buffer from the passed in list to our internal one.
3124	 */
3125	buffer = ISC_LIST_HEAD(*buflist);
3126	while (buffer != NULL) {
3127		ISC_LIST_DEQUEUE(*buflist, buffer, link);
3128		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3129		buffer = ISC_LIST_HEAD(*buflist);
3130	}
3131
3132	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3133	UNLOCK(&sock->lock);
3134	return (ret);
3135}
3136
3137isc_result_t
3138isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3139		    isc_task_t *task,
3140		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3141		    isc_socketevent_t *event, unsigned int flags)
3142{
3143	isc_result_t ret;
3144
3145	REQUIRE(VALID_SOCKET(sock));
3146	LOCK(&sock->lock);
3147	CONSISTENT(sock);
3148
3149	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3150	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3151		REQUIRE(sock->type == isc_sockettype_udp);
3152	event->ev_sender = sock;
3153	event->result = ISC_R_UNEXPECTED;
3154	/*
3155	 * make sure that the socket's not closed
3156	 */
3157	if (sock->fd == INVALID_SOCKET) {
3158		UNLOCK(&sock->lock);
3159		return (ISC_R_CONNREFUSED);
3160	}
3161	ISC_LIST_INIT(event->bufferlist);
3162	event->region = *region;
3163	event->n = 0;
3164	event->offset = 0;
3165	event->attributes = 0;
3166
3167	ret = socket_send(sock, event, task, address, pktinfo, flags);
3168	UNLOCK(&sock->lock);
3169	return (ret);
3170}
3171
3172isc_result_t
3173isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3174		 unsigned int options) {
3175	int bind_errno;
3176	char strbuf[ISC_STRERRORSIZE];
3177	int on = 1;
3178
3179	REQUIRE(VALID_SOCKET(sock));
3180	LOCK(&sock->lock);
3181	CONSISTENT(sock);
3182
3183	/*
3184	 * make sure that the socket's not closed
3185	 */
3186	if (sock->fd == INVALID_SOCKET) {
3187		UNLOCK(&sock->lock);
3188		return (ISC_R_CONNREFUSED);
3189	}
3190
3191	INSIST(!sock->bound);
3192	INSIST(!sock->dupped);
3193
3194	if (sock->pf != sockaddr->type.sa.sa_family) {
3195		UNLOCK(&sock->lock);
3196		return (ISC_R_FAMILYMISMATCH);
3197	}
3198	/*
3199	 * Only set SO_REUSEADDR when we want a specific port.
3200	 */
3201	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3202	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3203	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
3204		       sizeof(on)) < 0) {
3205		UNEXPECTED_ERROR(__FILE__, __LINE__,
3206				 "setsockopt(%d) %s", sock->fd,
3207				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3208						ISC_MSG_FAILED, "failed"));
3209		/* Press on... */
3210	}
3211	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3212		bind_errno = WSAGetLastError();
3213		UNLOCK(&sock->lock);
3214		switch (bind_errno) {
3215		case WSAEACCES:
3216			return (ISC_R_NOPERM);
3217		case WSAEADDRNOTAVAIL:
3218			return (ISC_R_ADDRNOTAVAIL);
3219		case WSAEADDRINUSE:
3220			return (ISC_R_ADDRINUSE);
3221		case WSAEINVAL:
3222			return (ISC_R_BOUND);
3223		default:
3224			isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3225			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3226					 strbuf);
3227			return (ISC_R_UNEXPECTED);
3228		}
3229	}
3230
3231	socket_log(__LINE__, sock, sockaddr, TRACE,
3232		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3233	sock->bound = 1;
3234
3235	UNLOCK(&sock->lock);
3236	return (ISC_R_SUCCESS);
3237}
3238
3239isc_result_t
3240isc__socket_filter(isc_socket_t *sock, const char *filter) {
3241	UNUSED(sock);
3242	UNUSED(filter);
3243
3244	REQUIRE(VALID_SOCKET(sock));
3245	return (ISC_R_NOTIMPLEMENTED);
3246}
3247
3248/*
3249 * Set up to listen on a given socket.  We do this by creating an internal
3250 * event that will be dispatched when the socket has read activity.  The
3251 * watcher will send the internal event to the task when there is a new
3252 * connection.
3253 *
3254 * Unlike in read, we don't preallocate a done event here.  Every time there
3255 * is a new connection we'll have to allocate a new one anyway, so we might
3256 * as well keep things simple rather than having to track them.
3257 */
3258isc_result_t
3259isc__socket_listen(isc_socket_t *sock, unsigned int backlog) {
3260	char strbuf[ISC_STRERRORSIZE];
3261
3262	REQUIRE(VALID_SOCKET(sock));
3263
3264	LOCK(&sock->lock);
3265	CONSISTENT(sock);
3266
3267	/*
3268	 * make sure that the socket's not closed
3269	 */
3270	if (sock->fd == INVALID_SOCKET) {
3271		UNLOCK(&sock->lock);
3272		return (ISC_R_CONNREFUSED);
3273	}
3274
3275	REQUIRE(!sock->listener);
3276	REQUIRE(sock->bound);
3277	REQUIRE(sock->type == isc_sockettype_tcp);
3278
3279	if (backlog == 0)
3280		backlog = SOMAXCONN;
3281
3282	if (listen(sock->fd, (int)backlog) < 0) {
3283		UNLOCK(&sock->lock);
3284		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3285
3286		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3287
3288		return (ISC_R_UNEXPECTED);
3289	}
3290
3291	socket_log(__LINE__, sock, NULL, TRACE,
3292		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3293	sock->listener = 1;
3294	_set_state(sock, SOCK_LISTEN);
3295
3296	UNLOCK(&sock->lock);
3297	return (ISC_R_SUCCESS);
3298}
3299
3300/*
3301 * This should try to do aggressive accept() XXXMLG
3302 */
3303isc_result_t
3304isc__socket_accept(isc_socket_t *sock,
3305		   isc_task_t *task, isc_taskaction_t action, const void *arg)
3306{
3307	isc_socket_newconnev_t *adev;
3308	isc_socketmgr_t *manager;
3309	isc_task_t *ntask = NULL;
3310	isc_socket_t *nsock;
3311	isc_result_t result;
3312	IoCompletionInfo *lpo;
3313
3314	REQUIRE(VALID_SOCKET(sock));
3315
3316	manager = sock->manager;
3317	REQUIRE(VALID_MANAGER(manager));
3318
3319	LOCK(&sock->lock);
3320	CONSISTENT(sock);
3321
3322	/*
3323	 * make sure that the socket's not closed
3324	 */
3325	if (sock->fd == INVALID_SOCKET) {
3326		UNLOCK(&sock->lock);
3327		return (ISC_R_CONNREFUSED);
3328	}
3329
3330	REQUIRE(sock->listener);
3331
3332	/*
3333	 * Sender field is overloaded here with the task we will be sending
3334	 * this event to.  Just before the actual event is delivered the
3335	 * actual ev_sender will be touched up to be the socket.
3336	 */
3337	adev = (isc_socket_newconnev_t *)
3338		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3339				   action, arg, sizeof(*adev));
3340	if (adev == NULL) {
3341		UNLOCK(&sock->lock);
3342		return (ISC_R_NOMEMORY);
3343	}
3344	ISC_LINK_INIT(adev, ev_link);
3345
3346	result = allocate_socket(manager, sock->type, &nsock);
3347	if (result != ISC_R_SUCCESS) {
3348		isc_event_free((isc_event_t **)&adev);
3349		UNLOCK(&sock->lock);
3350		return (result);
3351	}
3352
3353	/*
3354	 * AcceptEx() requires we pass in a socket.
3355	 */
3356	nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3357	if (nsock->fd == INVALID_SOCKET) {
3358		free_socket(&nsock, __LINE__);
3359		isc_event_free((isc_event_t **)&adev);
3360		UNLOCK(&sock->lock);
3361		return (ISC_R_FAILURE); // XXXMLG need real error message
3362	}
3363
3364	/*
3365	 * Attach to socket and to task.
3366	 */
3367	isc_task_attach(task, &ntask);
3368	if (isc_task_exiting(ntask)) {
3369		free_socket(&nsock, __LINE__);
3370		isc_task_detach(&ntask);
3371		isc_event_free(ISC_EVENT_PTR(&adev));
3372		UNLOCK(&sock->lock);
3373		return (ISC_R_SHUTTINGDOWN);
3374	}
3375	nsock->references++;
3376
3377	adev->ev_sender = ntask;
3378	adev->newsocket = nsock;
3379	_set_state(nsock, SOCK_ACCEPT);
3380
3381	/*
3382	 * Queue io completion for an accept().
3383	 */
3384	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3385					    HEAP_ZERO_MEMORY,
3386					    sizeof(IoCompletionInfo));
3387	RUNTIME_CHECK(lpo != NULL);
3388	lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3389		(sizeof(SOCKADDR_STORAGE) + 16) * 2);
3390	RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3391
3392	lpo->adev = adev;
3393	lpo->request_type = SOCKET_ACCEPT;
3394
3395	ISCAcceptEx(sock->fd,
3396		    nsock->fd,				/* Accepted Socket */
3397		    lpo->acceptbuffer,			/* Buffer for initial Recv */
3398		    0,					/* Length of Buffer */
3399		    sizeof(SOCKADDR_STORAGE) + 16,		/* Local address length + 16 */
3400		    sizeof(SOCKADDR_STORAGE) + 16,		/* Remote address lengh + 16 */
3401		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
3402		    (LPOVERLAPPED)lpo			/* Overlapped structure */
3403		    );
3404	iocompletionport_update(nsock);
3405
3406	socket_log(__LINE__, sock, NULL, TRACE,
3407		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3408		   "accepting for nsock %p fd %d", nsock, nsock->fd);
3409
3410	/*
3411	 * Enqueue the event
3412	 */
3413	ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3414	sock->pending_accept++;
3415	sock->pending_iocp++;
3416
3417	UNLOCK(&sock->lock);
3418	return (ISC_R_SUCCESS);
3419}
3420
3421isc_result_t
3422isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3423		    isc_task_t *task, isc_taskaction_t action, const void *arg)
3424{
3425	char strbuf[ISC_STRERRORSIZE];
3426	isc_socket_connev_t *cdev;
3427	isc_task_t *ntask = NULL;
3428	isc_socketmgr_t *manager;
3429	IoCompletionInfo *lpo;
3430	int bind_errno;
3431
3432	REQUIRE(VALID_SOCKET(sock));
3433	REQUIRE(addr != NULL);
3434	REQUIRE(task != NULL);
3435	REQUIRE(action != NULL);
3436
3437	manager = sock->manager;
3438	REQUIRE(VALID_MANAGER(manager));
3439	REQUIRE(addr != NULL);
3440
3441	if (isc_sockaddr_ismulticast(addr))
3442		return (ISC_R_MULTICAST);
3443
3444	LOCK(&sock->lock);
3445	CONSISTENT(sock);
3446
3447	/*
3448	 * make sure that the socket's not closed
3449	 */
3450	if (sock->fd == INVALID_SOCKET) {
3451		UNLOCK(&sock->lock);
3452		return (ISC_R_CONNREFUSED);
3453	}
3454
3455	/*
3456	 * Windows sockets won't connect unless the socket is bound.
3457	 */
3458	if (!sock->bound) {
3459		isc_sockaddr_t any;
3460
3461		isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3462		if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3463			bind_errno = WSAGetLastError();
3464			UNLOCK(&sock->lock);
3465			switch (bind_errno) {
3466			case WSAEACCES:
3467				return (ISC_R_NOPERM);
3468			case WSAEADDRNOTAVAIL:
3469				return (ISC_R_ADDRNOTAVAIL);
3470			case WSAEADDRINUSE:
3471				return (ISC_R_ADDRINUSE);
3472			case WSAEINVAL:
3473				return (ISC_R_BOUND);
3474			default:
3475				isc__strerror(bind_errno, strbuf,
3476					      sizeof(strbuf));
3477				UNEXPECTED_ERROR(__FILE__, __LINE__,
3478						 "bind: %s", strbuf);
3479				return (ISC_R_UNEXPECTED);
3480			}
3481		}
3482		sock->bound = 1;
3483	}
3484
3485	REQUIRE(!sock->pending_connect);
3486
3487	cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3488							ISC_SOCKEVENT_CONNECT,
3489							action,	arg,
3490							sizeof(*cdev));
3491	if (cdev == NULL) {
3492		UNLOCK(&sock->lock);
3493		return (ISC_R_NOMEMORY);
3494	}
3495	ISC_LINK_INIT(cdev, ev_link);
3496
3497	if (sock->type == isc_sockettype_tcp) {
3498		/*
3499		 * Queue io completion for an accept().
3500		 */
3501		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3502						    HEAP_ZERO_MEMORY,
3503						    sizeof(IoCompletionInfo));
3504		lpo->cdev = cdev;
3505		lpo->request_type = SOCKET_CONNECT;
3506
3507		sock->address = *addr;
3508		ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3509			NULL, 0, NULL, (LPOVERLAPPED)lpo);
3510
3511		/*
3512		 * Attach to task.
3513		 */
3514		isc_task_attach(task, &ntask);
3515		cdev->ev_sender = ntask;
3516
3517		sock->pending_connect = 1;
3518		_set_state(sock, SOCK_CONNECT);
3519
3520		/*
3521		 * Enqueue the request.
3522		 */
3523		sock->connect_ev = cdev;
3524		sock->pending_iocp++;
3525	} else {
3526		WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3527		cdev->result = ISC_R_SUCCESS;
3528		isc_task_send(task, (isc_event_t **)&cdev);
3529	}
3530	CONSISTENT(sock);
3531	UNLOCK(&sock->lock);
3532
3533	return (ISC_R_SUCCESS);
3534}
3535
3536isc_result_t
3537isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3538	isc_result_t result;
3539
3540	REQUIRE(VALID_SOCKET(sock));
3541	REQUIRE(addressp != NULL);
3542
3543	LOCK(&sock->lock);
3544	CONSISTENT(sock);
3545
3546	/*
3547	 * make sure that the socket's not closed
3548	 */
3549	if (sock->fd == INVALID_SOCKET) {
3550		UNLOCK(&sock->lock);
3551		return (ISC_R_CONNREFUSED);
3552	}
3553
3554	if (sock->connected) {
3555		*addressp = sock->address;
3556		result = ISC_R_SUCCESS;
3557	} else {
3558		result = ISC_R_NOTCONNECTED;
3559	}
3560
3561	UNLOCK(&sock->lock);
3562
3563	return (result);
3564}
3565
3566isc_result_t
3567isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3568	ISC_SOCKADDR_LEN_T len;
3569	isc_result_t result;
3570	char strbuf[ISC_STRERRORSIZE];
3571
3572	REQUIRE(VALID_SOCKET(sock));
3573	REQUIRE(addressp != NULL);
3574
3575	LOCK(&sock->lock);
3576	CONSISTENT(sock);
3577
3578	/*
3579	 * make sure that the socket's not closed
3580	 */
3581	if (sock->fd == INVALID_SOCKET) {
3582		UNLOCK(&sock->lock);
3583		return (ISC_R_CONNREFUSED);
3584	}
3585
3586	if (!sock->bound) {
3587		result = ISC_R_NOTBOUND;
3588		goto out;
3589	}
3590
3591	result = ISC_R_SUCCESS;
3592
3593	len = sizeof(addressp->type);
3594	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3595		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3596		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3597				 strbuf);
3598		result = ISC_R_UNEXPECTED;
3599		goto out;
3600	}
3601	addressp->length = (unsigned int)len;
3602
3603 out:
3604	UNLOCK(&sock->lock);
3605
3606	return (result);
3607}
3608
3609/*
3610 * Run through the list of events on this socket, and cancel the ones
3611 * queued for task "task" of type "how".  "how" is a bitmask.
3612 */
3613void
3614isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3615
3616	REQUIRE(VALID_SOCKET(sock));
3617
3618	/*
3619	 * Quick exit if there is nothing to do.  Don't even bother locking
3620	 * in this case.
3621	 */
3622	if (how == 0)
3623		return;
3624
3625	LOCK(&sock->lock);
3626	CONSISTENT(sock);
3627
3628	/*
3629	 * make sure that the socket's not closed
3630	 */
3631	if (sock->fd == INVALID_SOCKET) {
3632		UNLOCK(&sock->lock);
3633		return;
3634	}
3635
3636	/*
3637	 * All of these do the same thing, more or less.
3638	 * Each will:
3639	 *	o If the internal event is marked as "posted" try to
3640	 *	  remove it from the task's queue.  If this fails, mark it
3641	 *	  as canceled instead, and let the task clean it up later.
3642	 *	o For each I/O request for that task of that type, post
3643	 *	  its done event with status of "ISC_R_CANCELED".
3644	 *	o Reset any state needed.
3645	 */
3646
3647	if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3648		isc_socketevent_t      *dev;
3649		isc_socketevent_t      *next;
3650		isc_task_t	       *current_task;
3651
3652		dev = ISC_LIST_HEAD(sock->recv_list);
3653		while (dev != NULL) {
3654			current_task = dev->ev_sender;
3655			next = ISC_LIST_NEXT(dev, ev_link);
3656			if ((task == NULL) || (task == current_task)) {
3657				dev->result = ISC_R_CANCELED;
3658				send_recvdone_event(sock, &dev);
3659			}
3660			dev = next;
3661		}
3662	}
3663	how &= ~ISC_SOCKCANCEL_RECV;
3664
3665	if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3666		isc_socketevent_t      *dev;
3667		isc_socketevent_t      *next;
3668		isc_task_t	       *current_task;
3669
3670		dev = ISC_LIST_HEAD(sock->send_list);
3671
3672		while (dev != NULL) {
3673			current_task = dev->ev_sender;
3674			next = ISC_LIST_NEXT(dev, ev_link);
3675			if ((task == NULL) || (task == current_task)) {
3676				dev->result = ISC_R_CANCELED;
3677				send_senddone_event(sock, &dev);
3678			}
3679			dev = next;
3680		}
3681	}
3682	how &= ~ISC_SOCKCANCEL_SEND;
3683
3684	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3685	    && !ISC_LIST_EMPTY(sock->accept_list)) {
3686		isc_socket_newconnev_t *dev;
3687		isc_socket_newconnev_t *next;
3688		isc_task_t	       *current_task;
3689
3690		dev = ISC_LIST_HEAD(sock->accept_list);
3691		while (dev != NULL) {
3692			current_task = dev->ev_sender;
3693			next = ISC_LIST_NEXT(dev, ev_link);
3694
3695			if ((task == NULL) || (task == current_task)) {
3696
3697				dev->newsocket->references--;
3698				closesocket(dev->newsocket->fd);
3699				dev->newsocket->fd = INVALID_SOCKET;
3700				free_socket(&dev->newsocket, __LINE__);
3701
3702				dev->result = ISC_R_CANCELED;
3703				send_acceptdone_event(sock, &dev);
3704			}
3705
3706			dev = next;
3707		}
3708	}
3709	how &= ~ISC_SOCKCANCEL_ACCEPT;
3710
3711	/*
3712	 * Connecting is not a list.
3713	 */
3714	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3715	    && sock->connect_ev != NULL) {
3716		isc_socket_connev_t    *dev;
3717		isc_task_t	       *current_task;
3718
3719		INSIST(sock->pending_connect);
3720
3721		dev = sock->connect_ev;
3722		current_task = dev->ev_sender;
3723
3724		if ((task == NULL) || (task == current_task)) {
3725			closesocket(sock->fd);
3726			sock->fd = INVALID_SOCKET;
3727			_set_state(sock, SOCK_CLOSED);
3728
3729			sock->connect_ev = NULL;
3730			dev->result = ISC_R_CANCELED;
3731			send_connectdone_event(sock, &dev);
3732		}
3733	}
3734	how &= ~ISC_SOCKCANCEL_CONNECT;
3735
3736	maybe_free_socket(&sock, __LINE__);
3737}
3738
3739isc_sockettype_t
3740isc__socket_gettype(isc_socket_t *sock) {
3741	isc_sockettype_t type;
3742
3743	REQUIRE(VALID_SOCKET(sock));
3744
3745	LOCK(&sock->lock);
3746
3747	/*
3748	 * make sure that the socket's not closed
3749	 */
3750	if (sock->fd == INVALID_SOCKET) {
3751		UNLOCK(&sock->lock);
3752		return (ISC_R_CONNREFUSED);
3753	}
3754
3755	type = sock->type;
3756	UNLOCK(&sock->lock);
3757	return (type);
3758}
3759
3760isc_boolean_t
3761isc__socket_isbound(isc_socket_t *sock) {
3762	isc_boolean_t val;
3763
3764	REQUIRE(VALID_SOCKET(sock));
3765
3766	LOCK(&sock->lock);
3767	CONSISTENT(sock);
3768
3769	/*
3770	 * make sure that the socket's not closed
3771	 */
3772	if (sock->fd == INVALID_SOCKET) {
3773		UNLOCK(&sock->lock);
3774		return (ISC_FALSE);
3775	}
3776
3777	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3778	UNLOCK(&sock->lock);
3779
3780	return (val);
3781}
3782
3783void
3784isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3785#if defined(IPV6_V6ONLY)
3786	int onoff = yes ? 1 : 0;
3787#else
3788	UNUSED(yes);
3789#endif
3790
3791	REQUIRE(VALID_SOCKET(sock));
3792
3793#ifdef IPV6_V6ONLY
3794	if (sock->pf == AF_INET6) {
3795		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3796				 (char *)&onoff, sizeof(onoff));
3797	}
3798#endif
3799}
3800
3801void
3802isc__socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3803	UNUSED(addr);
3804	UNUSED(active);
3805}
3806
3807isc_result_t
3808isc__socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3809		     isc_uint32_t owner,	isc_uint32_t group)
3810{
3811	UNUSED(addr);
3812	UNUSED(perm);
3813	UNUSED(owner);
3814	UNUSED(group);
3815	return (ISC_R_NOTIMPLEMENTED);
3816}
3817
3818void
3819isc__socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3820
3821	/*
3822	 * Name 'socket'.
3823	 */
3824
3825	REQUIRE(VALID_SOCKET(socket));
3826
3827	LOCK(&socket->lock);
3828	memset(socket->name, 0, sizeof(socket->name));
3829	strncpy(socket->name, name, sizeof(socket->name) - 1);
3830	socket->tag = tag;
3831	UNLOCK(&socket->lock);
3832}
3833
3834const char *
3835isc__socket_getname(isc_socket_t *socket) {
3836	return (socket->name);
3837}
3838
3839void *
3840isc__socket_gettag(isc_socket_t *socket) {
3841	return (socket->tag);
3842}
3843
3844int
3845isc__socket_getfd(isc_socket_t *socket) {
3846	return ((short) socket->fd);
3847}
3848
3849void
3850isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3851	UNUSED(manager);
3852	UNUSED(reserved);
3853}
3854
3855void
3856isc___socketmgr_maxudp(isc_socketmgr_t *manager, int maxudp) {
3857
3858	UNUSED(manager);
3859	UNUSED(maxudp);
3860}
3861
3862#ifdef HAVE_LIBXML2
3863
3864static const char *
3865_socktype(isc_sockettype_t type)
3866{
3867	if (type == isc_sockettype_udp)
3868		return ("udp");
3869	else if (type == isc_sockettype_tcp)
3870		return ("tcp");
3871	else if (type == isc_sockettype_unix)
3872		return ("unix");
3873	else if (type == isc_sockettype_fdwatch)
3874		return ("fdwatch");
3875	else
3876		return ("not-initialized");
3877}
3878
3879void
3880isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
3881{
3882	isc_socket_t *sock;
3883	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3884	isc_sockaddr_t addr;
3885	ISC_SOCKADDR_LEN_T len;
3886
3887	LOCK(&mgr->lock);
3888
3889#ifndef ISC_PLATFORM_USETHREADS
3890	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3891	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
3892	xmlTextWriterEndElement(writer);
3893#endif
3894
3895	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
3896	sock = ISC_LIST_HEAD(mgr->socklist);
3897	while (sock != NULL) {
3898		LOCK(&sock->lock);
3899		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
3900
3901		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
3902		xmlTextWriterWriteFormatString(writer, "%p", sock);
3903		xmlTextWriterEndElement(writer);
3904
3905		if (sock->name[0] != 0) {
3906			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
3907			xmlTextWriterWriteFormatString(writer, "%s",
3908						       sock->name);
3909			xmlTextWriterEndElement(writer); /* name */
3910		}
3911
3912		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3913		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
3914		xmlTextWriterEndElement(writer);
3915
3916		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
3917					  ISC_XMLCHAR _socktype(sock->type));
3918
3919		if (sock->connected) {
3920			isc_sockaddr_format(&sock->address, peerbuf,
3921					    sizeof(peerbuf));
3922			xmlTextWriterWriteElement(writer,
3923						  ISC_XMLCHAR "peer-address",
3924						  ISC_XMLCHAR peerbuf);
3925		}
3926
3927		len = sizeof(addr);
3928		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
3929			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
3930			xmlTextWriterWriteElement(writer,
3931						  ISC_XMLCHAR "local-address",
3932						  ISC_XMLCHAR peerbuf);
3933		}
3934
3935		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
3936		if (sock->pending_recv)
3937			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3938						ISC_XMLCHAR "pending-receive");
3939		if (sock->pending_send)
3940			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3941						  ISC_XMLCHAR "pending-send");
3942		if (sock->pending_accept)
3943			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3944						 ISC_XMLCHAR "pending_accept");
3945		if (sock->listener)
3946			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3947						  ISC_XMLCHAR "listener");
3948		if (sock->connected)
3949			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3950						  ISC_XMLCHAR "connected");
3951		if (sock->pending_connect)
3952			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3953						  ISC_XMLCHAR "connecting");
3954		if (sock->bound)
3955			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3956						  ISC_XMLCHAR "bound");
3957
3958		xmlTextWriterEndElement(writer); /* states */
3959
3960		xmlTextWriterEndElement(writer); /* socket */
3961
3962		UNLOCK(&sock->lock);
3963		sock = ISC_LIST_NEXT(sock, link);
3964	}
3965	xmlTextWriterEndElement(writer); /* sockets */
3966
3967	UNLOCK(&mgr->lock);
3968}
3969#endif /* HAVE_LIBXML2 */
3970