socket.c revision 290001
1/*
2 * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 2000-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id$ */
19
20/* This code uses functions which are only available on Server 2003 and
21 * higher, and Windows XP and higher.
22 *
23 * This code is by nature multithreaded and takes advantage of various
24 * features to pass on information through the completion port for
25 * when I/O is completed.  All sends, receives, accepts, and connects are
26 * completed through the completion port.
27 *
 * The number of Completion Port Worker threads used is the total number
 * of CPUs + 1. This increases the likelihood that a Worker Thread is
 * available for processing a completed request.
31 *
32 * XXXPDM 5 August, 2002
33 */
34
35#define MAKE_EXTERNAL 1
36#include <config.h>
37
38#include <sys/types.h>
39
40#ifndef _WINSOCKAPI_
41#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
42#endif
43
44#include <errno.h>
45#include <stddef.h>
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
49#include <io.h>
50#include <fcntl.h>
51#include <process.h>
52
53#include <isc/buffer.h>
54#include <isc/bufferlist.h>
55#include <isc/condition.h>
56#include <isc/list.h>
57#include <isc/log.h>
58#include <isc/mem.h>
59#include <isc/msgs.h>
60#include <isc/mutex.h>
61#include <isc/net.h>
62#include <isc/once.h>
63#include <isc/os.h>
64#include <isc/platform.h>
65#include <isc/print.h>
66#include <isc/region.h>
67#include <isc/socket.h>
68#include <isc/stats.h>
69#include <isc/strerror.h>
70#include <isc/syslog.h>
71#include <isc/task.h>
72#include <isc/thread.h>
73#include <isc/util.h>
74#include <isc/win32os.h>
75
76#include <mswsock.h>
77
78#include "errno2result.h"
79
80/*
81 * How in the world can Microsoft exist with APIs like this?
82 * We can't actually call this directly, because it turns out
83 * no library exports this function.  Instead, we need to
84 * issue a runtime call to get the address.
85 */
86LPFN_CONNECTEX ISCConnectEx;
87LPFN_ACCEPTEX ISCAcceptEx;
88LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
89
90/*
91 * Run expensive internal consistency checks.
92 */
93#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
94#define CONSISTENT(sock) consistent(sock)
95#else
96#define CONSISTENT(sock) do {} while (0)
97#endif
98static void consistent(isc_socket_t *sock);
99
/*
 * I/O control code used to control the behavior of connection resets on
 * UDP sockets.  See Microsoft KnowledgeBase Article Q263823 for details.
 * It is only defined here when the headers have not already provided it.
 * NOTE: Windows 2000 systems must have Service Pack 2 or later installed.
 */
#if !defined(SIO_UDP_CONNRESET)
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
#endif

/*
 * The socket length argument is an int on some systems, a size_t on
 * others and a socklen_t on yet others.  Keeping the type behind this
 * macro makes it easy to change if needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
118
/*
 * "Soft" errors: non-fatal returns of various network related functions
 * such as recv().  A value of 0 also counts as soft.
 * Note that the argument is evaluated more than once.
 */
#define SOFT_ERROR(e) \
	((e) == WSAEINTR || \
	 (e) == WSAEWOULDBLOCK || \
	 (e) == EWOULDBLOCK || \
	 (e) == EINTR || \
	 (e) == EAGAIN || \
	 (e) == 0)

/*
 * Pending "errors" are not really errors and are kept separate from the
 * soft errors above.
 */
#define PENDING_ERROR(e) \
	((e) == WSA_IO_PENDING || (e) == 0)
135
/*
 * Status codes describing the outcome of an I/O attempt.
 */
#define DOIO_SUCCESS	0	/* i/o ok, event sent */
#define DOIO_SOFT	1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD	2	/* i/o error, event sent */
#define DOIO_EOF	3	/* EOF, no event sent */
#define DOIO_PENDING	4	/* status when i/o is in process */
#define DOIO_NEEDMORE	5	/* IO was processed, but we need more due to minimum */

/*
 * Expands to the (category, module, level) argument triple used when
 * logging socket debug messages at debug level x.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * Debug levels used by this file:
 *
 *	90  --  Function entry/exit and other tracing.
 *	70  --  Socket "correctness" -- including returning of events, etc.
 *	60  --  Socket data send/receive
 *	50  --  Event tracing, including receiving/sending completion events.
 *	20  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

/* Ready-made DLVL() triples, one per level above. */
#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
163
164typedef isc_event_t intev_t;
165
166/*
167 * Socket State
168 */
169enum {
170  SOCK_INITIALIZED,	/* Socket Initialized */
171  SOCK_OPEN,		/* Socket opened but nothing yet to do */
172  SOCK_DATA,		/* Socket sending or receiving data */
173  SOCK_LISTEN,		/* TCP Socket listening for connects */
174  SOCK_ACCEPT,		/* TCP socket is waiting to accept */
175  SOCK_CONNECT,		/* TCP Socket connecting */
176  SOCK_CLOSED,		/* Socket has been closed */
177};
178
179#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
180#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
181
/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#if defined(ISC_PLATFORM_HAVEIPV6) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*
 * We really don't want to try and use these control messages: Win32
 * doesn't have this mechanism before XP.  USE_CMSG is therefore always
 * left undefined, whatever was decided above.
 */
#undef USE_CMSG
198
199/*
200 * Message header for recvmsg and sendmsg calls.
201 * Used value-result for recvmsg, value only for sendmsg.
202 */
203struct msghdr {
204	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
205	int      to_addr_len;		/* length of the address */
206	WSABUF  *msg_iov;		/* scatter/gather array */
207	u_int   msg_iovlen;             /* # elements in msg_iov */
208	void	*msg_control;           /* ancillary data, see below */
209	u_int   msg_controllen;         /* ancillary data buffer len */
210	int	msg_totallen;		/* total length of this message */
211} msghdr;
212
/*
 * The size to raise the receive buffer to (32 KiB).
 */
#define RCVBUFSIZE	(32 * 1024)

/*
 * How many times a send operation is repeated when the result is
 * WSAEINTR.
 */
#define NRETRIES	10
223
224struct isc_socket {
225	/* Not locked. */
226	unsigned int		magic;
227	isc_socketmgr_t	       *manager;
228	isc_mutex_t		lock;
229	isc_sockettype_t	type;
230
231	/* Pointers to scatter/gather buffers */
232	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];
233
234	/* Locked by socket lock. */
235	ISC_LINK(isc_socket_t)	link;
236	unsigned int		references; /* EXTERNAL references */
237	SOCKET			fd;	/* file handle */
238	int			pf;	/* protocol family */
239	char			name[16];
240	void *			tag;
241
242	/*
243	 * Each recv() call uses this buffer.  It is a per-socket receive
244	 * buffer that allows us to decouple the system recv() from the
245	 * recv_list done events.  This means the items on the recv_list
246	 * can be removed without having to cancel pending system recv()
247	 * calls.  It also allows us to read-ahead in some cases.
248	 */
249	struct {
250		SOCKADDR_STORAGE	from_addr;	   // UDP send/recv address
251		int		from_addr_len;	   // length of the address
252		char		*base;		   // the base of the buffer
253		char		*consume_position; // where to start copying data from next
254		unsigned int	len;		   // the actual size of this buffer
255		unsigned int	remaining;	   // the number of bytes remaining
256	} recvbuf;
257
258	ISC_LIST(isc_socketevent_t)		send_list;
259	ISC_LIST(isc_socketevent_t)		recv_list;
260	ISC_LIST(isc_socket_newconnev_t)	accept_list;
261	isc_socket_connev_t		       *connect_ev;
262
263	isc_sockaddr_t		address;  /* remote address */
264
265	unsigned int		listener : 1,	/* listener socket */
266				connected : 1,
267				pending_connect : 1, /* connect pending */
268				bound : 1,	/* bound to local addr */
269				dupped : 1;     /* created by isc_socket_dup() */
270	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
271	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
272	unsigned int		pending_send;  /* Number of outstanding send() calls. */
273	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
274	unsigned int		state; /* Socket state. Debugging and consistency checking. */
275	int			state_lineno;  /* line which last touched state */
276};
277
/*
 * Record a new socket state together with the source line that set it.
 * __LINE__ expands to the line of the macro invocation.
 */
#define _set_state(sock, _state) \
	do { \
		(sock)->state = (_state); \
		(sock)->state_lineno = __LINE__; \
	} while (0)
279
280/*
281 * Buffer structure
282 */
283typedef struct buflist buflist_t;
284
285struct buflist {
286	void			*buf;
287	unsigned int		buflen;
288	ISC_LINK(buflist_t)	link;
289};
290
291/*
292 * I/O Completion ports Info structures
293 */
294
295static HANDLE hHeapHandle = NULL;
296typedef struct IoCompletionInfo {
297	OVERLAPPED		overlapped;
298	isc_socketevent_t	*dev;  /* send()/recv() done event */
299	isc_socket_connev_t	*cdev; /* connect() done event */
300	isc_socket_newconnev_t	*adev; /* accept() done event */
301	void			*acceptbuffer;
302	DWORD			received_bytes;
303	int			request_type;
304	struct msghdr		messagehdr;
305	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
306} IoCompletionInfo;
307
/*
 * Upper bound on the number of I/O Completion Port worker threads that
 * handle the load on the Completion Port.  The number actually used is
 * the number of CPUs + 1, capped at this value.
 */
#define MAX_IOCPTHREADS		20

/* Magic number stored in every manager and the check that validates it. */
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
317
318struct isc_socketmgr {
319	/* Not locked. */
320	unsigned int			magic;
321	isc_mem_t		       *mctx;
322	isc_mutex_t			lock;
323	isc_stats_t		       *stats;
324
325	/* Locked by manager lock. */
326	ISC_LIST(isc_socket_t)		socklist;
327	isc_boolean_t			bShutdown;
328	isc_condition_t			shutdown_ok;
329	HANDLE				hIoCompletionPort;
330	int				maxIOCPThreads;
331	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
332	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];
333
334	/*
335	 * Debugging.
336	 * Modified by InterlockedIncrement() and InterlockedDecrement()
337	 */
338	LONG				totalSockets;
339	LONG				iocp_total;
340};
341
/*
 * Kind of request carried by an IoCompletionInfo (request_type).
 * The values are spelled out explicitly; they match the implicit ones.
 */
enum {
	SOCKET_RECV = 0,
	SOCKET_SEND = 1,
	SOCKET_ACCEPT = 2,
	SOCKET_CONNECT = 3
};

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
354
355static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
356				  isc_sockettype_t type,
357				  isc_socket_t **socketp,
358				  isc_socket_t *dup_socket);
359static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
360static void maybe_free_socket(isc_socket_t **, int);
361static void free_socket(isc_socket_t **, int);
362static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
363static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
364static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
365static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
366static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
367static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
368static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
369static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
370static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
371static void queue_receive_request(isc_socket_t *sock);
372
373/*
374 * This is used to dump the contents of the sock structure
375 * You should make sure that the sock is locked before
376 * dumping it. Since the code uses simple printf() statements
377 * it should only be used interactively.
378 */
379void
380sock_dump(isc_socket_t *sock) {
381	isc_socketevent_t *ldev;
382	isc_socket_newconnev_t *ndev;
383
384#if 0
385	isc_sockaddr_t addr;
386	char socktext[256];
387
388	isc_socket_getpeername(sock, &addr);
389	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
390	printf("Remote Socket: %s\n", socktext);
391	isc_socket_getsockname(sock, &addr);
392	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
393	printf("This Socket: %s\n", socktext);
394#endif
395
396	printf("\n\t\tSock Dump\n");
397	printf("\t\tfd: %u\n", sock->fd);
398	printf("\t\treferences: %d\n", sock->references);
399	printf("\t\tpending_accept: %d\n", sock->pending_accept);
400	printf("\t\tconnecting: %d\n", sock->pending_connect);
401	printf("\t\tconnected: %d\n", sock->connected);
402	printf("\t\tbound: %d\n", sock->bound);
403	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
404	printf("\t\tsocket type: %d\n", sock->type);
405
406	printf("\n\t\tSock Recv List\n");
407	ldev = ISC_LIST_HEAD(sock->recv_list);
408	while (ldev != NULL) {
409		printf("\t\tdev: %p\n", ldev);
410		ldev = ISC_LIST_NEXT(ldev, ev_link);
411	}
412
413	printf("\n\t\tSock Send List\n");
414	ldev = ISC_LIST_HEAD(sock->send_list);
415	while (ldev != NULL) {
416		printf("\t\tdev: %p\n", ldev);
417		ldev = ISC_LIST_NEXT(ldev, ev_link);
418	}
419
420	printf("\n\t\tSock Accept List\n");
421	ndev = ISC_LIST_HEAD(sock->accept_list);
422	while (ndev != NULL) {
423		printf("\t\tdev: %p\n", ldev);
424		ndev = ISC_LIST_NEXT(ndev, ev_link);
425	}
426}
427
428static void
429socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
430	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
431	   isc_msgcat_t *msgcat, int msgset, int message,
432	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
433
434/*  This function will add an entry to the I/O completion port
435 *  that will signal the I/O thread to exit (gracefully)
436 */
437static void
438signal_iocompletionport_exit(isc_socketmgr_t *manager) {
439	int i;
440	int errval;
441	char strbuf[ISC_STRERRORSIZE];
442
443	REQUIRE(VALID_MANAGER(manager));
444	for (i = 0; i < manager->maxIOCPThreads; i++) {
445		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
446						0, 0, 0)) {
447			errval = GetLastError();
448			isc__strerror(errval, strbuf, sizeof(strbuf));
449			FATAL_ERROR(__FILE__, __LINE__,
450				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
451				ISC_MSG_FAILED,
452				"Can't request service thread to exit: %s"),
453				strbuf);
454		}
455	}
456}
457
458/*
459 * Create the worker threads for the I/O Completion Port
460 */
461void
462iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
463	int errval;
464	char strbuf[ISC_STRERRORSIZE];
465	int i;
466
467	INSIST(total_threads > 0);
468	REQUIRE(VALID_MANAGER(manager));
469	/*
470	 * We need at least one
471	 */
472	for (i = 0; i < total_threads; i++) {
473		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
474						manager, 0,
475						&manager->dwIOCPThreadIds[i]);
476		if (manager->hIOCPThreads[i] == NULL) {
477			errval = GetLastError();
478			isc__strerror(errval, strbuf, sizeof(strbuf));
479			FATAL_ERROR(__FILE__, __LINE__,
480				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
481				ISC_MSG_FAILED,
482				"Can't create IOCP thread: %s"),
483				strbuf);
484			exit(1);
485		}
486	}
487}
488
489/*
490 *  Create/initialise the I/O completion port
491 */
492void
493iocompletionport_init(isc_socketmgr_t *manager) {
494	int errval;
495	char strbuf[ISC_STRERRORSIZE];
496
497	REQUIRE(VALID_MANAGER(manager));
498	/*
499	 * Create a private heap to handle the socket overlapped structure
500	 * The minimum number of structures is 10, there is no maximum
501	 */
502	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
503	if (hHeapHandle == NULL) {
504		errval = GetLastError();
505		isc__strerror(errval, strbuf, sizeof(strbuf));
506		FATAL_ERROR(__FILE__, __LINE__,
507			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
508					   ISC_MSG_FAILED,
509					   "HeapCreate() failed during "
510					   "initialization: %s"),
511			    strbuf);
512		exit(1);
513	}
514
515	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
516
517	/* Now Create the Completion Port */
518	manager->hIoCompletionPort = CreateIoCompletionPort(
519			INVALID_HANDLE_VALUE, NULL,
520			0, manager->maxIOCPThreads);
521	if (manager->hIoCompletionPort == NULL) {
522		errval = GetLastError();
523		isc__strerror(errval, strbuf, sizeof(strbuf));
524		FATAL_ERROR(__FILE__, __LINE__,
525				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
526				ISC_MSG_FAILED,
527				"CreateIoCompletionPort() failed "
528				"during initialization: %s"),
529				strbuf);
530		exit(1);
531	}
532
533	/*
534	 * Worker threads for servicing the I/O
535	 */
536	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
537}
538
539/*
540 * Associate a socket with an IO Completion Port.  This allows us to queue events for it
541 * and have our worker pool of threads process them.
542 */
543void
544iocompletionport_update(isc_socket_t *sock) {
545	HANDLE hiocp;
546	char strbuf[ISC_STRERRORSIZE];
547
548	REQUIRE(VALID_SOCKET(sock));
549
550	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
551		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
552
553	if (hiocp == NULL) {
554		DWORD errval = GetLastError();
555		isc__strerror(errval, strbuf, sizeof(strbuf));
556		isc_log_iwrite(isc_lctx,
557				ISC_LOGCATEGORY_GENERAL,
558				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
559				isc_msgcat, ISC_MSGSET_SOCKET,
560				ISC_MSG_TOOMANYHANDLES,
561				"iocompletionport_update: failed to open"
562				" io completion port: %s",
563				strbuf);
564
565		/* XXXMLG temporary hack to make failures detected.
566		 * This function should return errors to the caller, not
567		 * exit here.
568		 */
569		FATAL_ERROR(__FILE__, __LINE__,
570				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
571				ISC_MSG_FAILED,
572				"CreateIoCompletionPort() failed "
573				"during initialization: %s"),
574				strbuf);
575		exit(1);
576	}
577
578	InterlockedIncrement(&sock->manager->iocp_total);
579}
580
581/*
582 * Routine to cleanup and then close the socket.
583 * Only close the socket here if it is NOT associated
584 * with an event, otherwise the WSAWaitForMultipleEvents
585 * may fail due to the fact that the Wait should not
586 * be running while closing an event or a socket.
587 * The socket is locked before calling this function
588 */
589void
590socket_close(isc_socket_t *sock) {
591
592	REQUIRE(sock != NULL);
593
594	if (sock->fd != INVALID_SOCKET) {
595		closesocket(sock->fd);
596		sock->fd = INVALID_SOCKET;
597		_set_state(sock, SOCK_CLOSED);
598		InterlockedDecrement(&sock->manager->totalSockets);
599	}
600}
601
602static isc_once_t initialise_once = ISC_ONCE_INIT;
603static isc_boolean_t initialised = ISC_FALSE;
604
605static void
606initialise(void) {
607	WORD wVersionRequested;
608	WSADATA wsaData;
609	int err;
610	SOCKET sock;
611	GUID GUIDConnectEx = WSAID_CONNECTEX;
612	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
613	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
614	DWORD dwBytes;
615
616	/* Need Winsock 2.2 or better */
617	wVersionRequested = MAKEWORD(2, 2);
618
619	err = WSAStartup(wVersionRequested, &wsaData);
620	if (err != 0) {
621		char strbuf[ISC_STRERRORSIZE];
622		isc__strerror(err, strbuf, sizeof(strbuf));
623		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
624			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
625					   ISC_MSG_FAILED, "failed"),
626			    strbuf);
627		exit(1);
628	}
629	/*
630	 * The following APIs do not exist as functions in a library, but we must
631	 * ask winsock for them.  They are "extensions" -- but why they cannot be
632	 * actual functions is beyond me.  So, ask winsock for the pointers to the
633	 * functions we need.
634	 */
635	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
636	INSIST(sock != INVALID_SOCKET);
637	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
638		 &GUIDConnectEx, sizeof(GUIDConnectEx),
639		 &ISCConnectEx, sizeof(ISCConnectEx),
640		 &dwBytes, NULL, NULL);
641	INSIST(err == 0);
642
643	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
644		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
645		 &ISCAcceptEx, sizeof(ISCAcceptEx),
646		 &dwBytes, NULL, NULL);
647	INSIST(err == 0);
648
649	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
650		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
651		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
652		 &dwBytes, NULL, NULL);
653	INSIST(err == 0);
654
655	closesocket(sock);
656
657	initialised = ISC_TRUE;
658}
659
660/*
661 * Initialize socket services
662 */
663void
664InitSockets(void) {
665	RUNTIME_CHECK(isc_once_do(&initialise_once,
666				  initialise) == ISC_R_SUCCESS);
667	if (!initialised)
668		exit(1);
669}
670
671int
672internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
673		 struct msghdr *messagehdr, int flags, int *Error)
674{
675	int Result;
676	DWORD BytesSent;
677	DWORD Flags = flags;
678	int total_sent;
679
680	*Error = 0;
681	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
682			   messagehdr->msg_iovlen, &BytesSent,
683			   Flags, (SOCKADDR *)&messagehdr->to_addr,
684			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
685			   NULL);
686
687	total_sent = (int)BytesSent;
688
689	/* Check for errors.*/
690	if (Result == SOCKET_ERROR) {
691		*Error = WSAGetLastError();
692
693		switch (*Error) {
694		case WSA_IO_INCOMPLETE:
695		case WSA_WAIT_IO_COMPLETION:
696		case WSA_IO_PENDING:
697		case NO_ERROR:		/* Strange, but okay */
698			sock->pending_iocp++;
699			sock->pending_send++;
700			break;
701
702		default:
703			return (-1);
704			break;
705		}
706	} else {
707		sock->pending_iocp++;
708		sock->pending_send++;
709	}
710
711	if (lpo != NULL)
712		return (0);
713	else
714		return (total_sent);
715}
716
717static void
718queue_receive_request(isc_socket_t *sock) {
719	DWORD Flags = 0;
720	DWORD NumBytes = 0;
721	int total_bytes = 0;
722	int Result;
723	int Error;
724	int need_retry;
725	WSABUF iov[1];
726	IoCompletionInfo *lpo = NULL;
727	isc_result_t isc_result;
728
729 retry:
730	need_retry = ISC_FALSE;
731
732	/*
733	 * If we already have a receive pending, do nothing.
734	 */
735	if (sock->pending_recv > 0) {
736		if (lpo != NULL)
737			HeapFree(hHeapHandle, 0, lpo);
738		return;
739	}
740
741	/*
742	 * If no one is waiting, do nothing.
743	 */
744	if (ISC_LIST_EMPTY(sock->recv_list)) {
745		if (lpo != NULL)
746			HeapFree(hHeapHandle, 0, lpo);
747		return;
748	}
749
750	INSIST(sock->recvbuf.remaining == 0);
751	INSIST(sock->fd != INVALID_SOCKET);
752
753	iov[0].len = sock->recvbuf.len;
754	iov[0].buf = sock->recvbuf.base;
755
756	if (lpo == NULL) {
757		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
758						    HEAP_ZERO_MEMORY,
759						    sizeof(IoCompletionInfo));
760		RUNTIME_CHECK(lpo != NULL);
761	} else
762		ZeroMemory(lpo, sizeof(IoCompletionInfo));
763	lpo->request_type = SOCKET_RECV;
764
765	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
766
767	Error = 0;
768	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
769			     &NumBytes, &Flags,
770			     (SOCKADDR *)&sock->recvbuf.from_addr,
771			     &sock->recvbuf.from_addr_len,
772			     (LPWSAOVERLAPPED)lpo, NULL);
773
774	/* Check for errors. */
775	if (Result == SOCKET_ERROR) {
776		Error = WSAGetLastError();
777
778		switch (Error) {
779		case WSA_IO_PENDING:
780			sock->pending_iocp++;
781			sock->pending_recv++;
782			break;
783
784		/* direct error: no completion event */
785		case ERROR_HOST_UNREACHABLE:
786		case WSAENETRESET:
787		case WSAECONNRESET:
788			if (!sock->connected) {
789				/* soft error */
790				need_retry = ISC_TRUE;
791				break;
792			}
793			/* FALLTHROUGH */
794
795		default:
796			isc_result = isc__errno2result(Error);
797			if (isc_result == ISC_R_UNEXPECTED)
798				UNEXPECTED_ERROR(__FILE__, __LINE__,
799					"WSARecvFrom: Windows error code: %d, isc result %d",
800					Error, isc_result);
801			send_recvdone_abort(sock, isc_result);
802			HeapFree(hHeapHandle, 0, lpo);
803			lpo = NULL;
804			break;
805		}
806	} else {
807		/*
808		 * The recv() finished immediately, but we will still get
809		 * a completion event.  Rather than duplicate code, let
810		 * that thread handle sending the data along its way.
811		 */
812		sock->pending_iocp++;
813		sock->pending_recv++;
814	}
815
816	socket_log(__LINE__, sock, NULL, IOEVENT,
817		   isc_msgcat, ISC_MSGSET_SOCKET,
818		   ISC_MSG_DOIORECV,
819		   "queue_io_request: fd %d result %d error %d",
820		   sock->fd, Result, Error);
821
822	CONSISTENT(sock);
823
824	if (need_retry)
825		goto retry;
826}
827
828static void
829manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
830	    isc_logmodule_t *module, int level, const char *fmt, ...)
831{
832	char msgbuf[2048];
833	va_list ap;
834
835	if (!isc_log_wouldlog(isc_lctx, level))
836		return;
837
838	va_start(ap, fmt);
839	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
840	va_end(ap);
841
842	isc_log_write(isc_lctx, category, module, level,
843		      "sockmgr %p: %s", sockmgr, msgbuf);
844}
845
846static void
847socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
848	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
849	   isc_msgcat_t *msgcat, int msgset, int message,
850	   const char *fmt, ...)
851{
852	char msgbuf[2048];
853	char peerbuf[256];
854	va_list ap;
855
856
857	if (!isc_log_wouldlog(isc_lctx, level))
858		return;
859
860	va_start(ap, fmt);
861	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
862	va_end(ap);
863
864	if (address == NULL) {
865		isc_log_iwrite(isc_lctx, category, module, level,
866			       msgcat, msgset, message,
867			       "socket %p line %d: %s", sock, lineno, msgbuf);
868	} else {
869		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
870		isc_log_iwrite(isc_lctx, category, module, level,
871			       msgcat, msgset, message,
872				   "socket %p line %d peer %s: %s", sock, lineno,
873				   peerbuf, msgbuf);
874	}
875
876}
877
878/*
879 * Make an fd SOCKET non-blocking.
880 */
881static isc_result_t
882make_nonblock(SOCKET fd) {
883	int ret;
884	unsigned long flags = 1;
885	char strbuf[ISC_STRERRORSIZE];
886
887	/* Set the socket to non-blocking */
888	ret = ioctlsocket(fd, FIONBIO, &flags);
889
890	if (ret == -1) {
891		isc__strerror(errno, strbuf, sizeof(strbuf));
892		UNEXPECTED_ERROR(__FILE__, __LINE__,
893				 "ioctlsocket(%d, FIOBIO, %d): %s",
894				 fd, flags, strbuf);
895
896		return (ISC_R_UNEXPECTED);
897	}
898
899	return (ISC_R_SUCCESS);
900}
901
902/*
903 * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
904 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
905 * fails with an "ICMP port unreachable" response and preventing the
906 * socket from using the WSARecvFrom in subsequent operations.
907 * The function below fixes this, but requires that Windows 2000
908 * Service Pack 2 or later be installed on the system.  NT 4.0
909 * systems are not affected by this and work correctly.
910 * See Microsoft Knowledge Base Article Q263823 for details of this.
911 */
912isc_result_t
913connection_reset_fix(SOCKET fd) {
914	DWORD dwBytesReturned = 0;
915	BOOL  bNewBehavior = FALSE;
916	DWORD status;
917
918	if (isc_win32os_majorversion() < 5)
919		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
920
921	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
922	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
923			  sizeof(bNewBehavior), NULL, 0,
924			  &dwBytesReturned, NULL, NULL);
925	if (status != SOCKET_ERROR)
926		return (ISC_R_SUCCESS);
927	else {
928		UNEXPECTED_ERROR(__FILE__, __LINE__,
929				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
930				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
931						ISC_MSG_FAILED, "failed"));
932		return (ISC_R_UNEXPECTED);
933	}
934}
935
936/*
937 * Construct an iov array and attach it to the msghdr passed in.  This is
938 * the SEND constructor, which will use the used region of the buffer
939 * (if using a buffer list) or will use the internal region (if a single
940 * buffer I/O is requested).
941 *
942 * Nothing can be NULL, and the done event must list at least one buffer
943 * on the buffer linked list for this function to be meaningful.
944 */
945static void
946build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
947		  struct msghdr *msg, char *cmsg, WSABUF *iov,
948		  IoCompletionInfo  *lpo)
949{
950	unsigned int iovcount;
951	isc_buffer_t *buffer;
952	buflist_t  *cpbuffer;
953	isc_region_t used;
954	size_t write_count;
955	size_t skip_count;
956
957	memset(msg, 0, sizeof(*msg));
958
959	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
960	msg->to_addr_len = dev->address.length;
961
962	buffer = ISC_LIST_HEAD(dev->bufferlist);
963	write_count = 0;
964	iovcount = 0;
965
966	/*
967	 * Single buffer I/O?  Skip what we've done so far in this region.
968	 */
969	if (buffer == NULL) {
970		write_count = dev->region.length - dev->n;
971		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
972		RUNTIME_CHECK(cpbuffer != NULL);
973		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
974		RUNTIME_CHECK(cpbuffer->buf != NULL);
975
976		socket_log(__LINE__, sock, NULL, TRACE,
977		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
978		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
979		   cpbuffer->buf, write_count);
980
981		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
982		cpbuffer->buflen = write_count;
983		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
984		iov[0].buf = cpbuffer->buf;
985		iov[0].len = write_count;
986		iovcount = 1;
987
988		goto config;
989	}
990
991	/*
992	 * Multibuffer I/O.
993	 * Skip the data in the buffer list that we have already written.
994	 */
995	skip_count = dev->n;
996	while (buffer != NULL) {
997		REQUIRE(ISC_BUFFER_VALID(buffer));
998		if (skip_count < isc_buffer_usedlength(buffer))
999			break;
1000		skip_count -= isc_buffer_usedlength(buffer);
1001		buffer = ISC_LIST_NEXT(buffer, link);
1002	}
1003
1004	while (buffer != NULL) {
1005		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1006
1007		isc_buffer_usedregion(buffer, &used);
1008
1009		if (used.length > 0) {
1010			int uselen = used.length - skip_count;
1011			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1012			RUNTIME_CHECK(cpbuffer != NULL);
1013			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1014			RUNTIME_CHECK(cpbuffer->buf != NULL);
1015
1016			socket_log(__LINE__, sock, NULL, TRACE,
1017			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1018			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1019			   cpbuffer->buf, write_count);
1020
1021			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
1022			cpbuffer->buflen = uselen;
1023			iov[iovcount].buf = cpbuffer->buf;
1024			iov[iovcount].len = used.length - skip_count;
1025			write_count += uselen;
1026			skip_count = 0;
1027			iovcount++;
1028		}
1029		buffer = ISC_LIST_NEXT(buffer, link);
1030	}
1031
1032	INSIST(skip_count == 0);
1033
1034 config:
1035	msg->msg_iov = iov;
1036	msg->msg_iovlen = iovcount;
1037	msg->msg_totallen = write_count;
1038}
1039
1040static void
1041set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1042		isc_socketevent_t *dev)
1043{
1044	if (sock->type == isc_sockettype_udp) {
1045		if (address != NULL)
1046			dev->address = *address;
1047		else
1048			dev->address = sock->address;
1049	} else if (sock->type == isc_sockettype_tcp) {
1050		INSIST(address == NULL);
1051		dev->address = sock->address;
1052	}
1053}
1054
1055static void
1056destroy_socketevent(isc_event_t *event) {
1057	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1058
1059	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1060
1061	(ev->destroy)(event);
1062}
1063
1064static isc_socketevent_t *
1065allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1066		     isc_taskaction_t action, const void *arg)
1067{
1068	isc_socketevent_t *ev;
1069
1070	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1071						     sock, eventtype,
1072						     action, arg,
1073						     sizeof(*ev));
1074	if (ev == NULL)
1075		return (NULL);
1076
1077	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1078	ISC_LINK_INIT(ev, ev_link);
1079	ISC_LIST_INIT(ev->bufferlist);
1080	ev->region.base = NULL;
1081	ev->n = 0;
1082	ev->offset = 0;
1083	ev->attributes = 0;
1084	ev->destroy = ev->ev_destroy;
1085	ev->ev_destroy = destroy_socketevent;
1086
1087	return (ev);
1088}
1089
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper, only built when ISC_SOCKET_DEBUG is defined.
 *
 * Prints the message header 'msg' used for I/O on 'sock' to stdout,
 * followed by one line per scatter/gather element it references.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int i;

	/* %p requires a 'void *' argument, hence the explicit casts. */
	printf("MSGHDR %p, Socket #: %u\n", (void *)msg, sock->fd);
	printf("\tname %p, namelen %d\n", (void *)msg->msg_name,
	       msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", (void *)msg->msg_iov,
	       msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		/* 'i' is unsigned, so it is printed with %u. */
		printf("\t\t%u\tbase %p, len %d\n", i,
		       (void *)msg->msg_iov[i].buf,
		       msg->msg_iov[i].len);
	}
}
#endif
1104
1105/*
1106 * map the error code
1107 */
1108int
1109map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1110		 char *errorstring, size_t bufsize) {
1111
1112	int doreturn;
1113	switch (windows_errno) {
1114	case WSAECONNREFUSED:
1115		*isc_errno = ISC_R_CONNREFUSED;
1116		if (sock->connected)
1117			doreturn = DOIO_HARD;
1118		else
1119			doreturn = DOIO_SOFT;
1120		break;
1121	case WSAENETUNREACH:
1122	case ERROR_NETWORK_UNREACHABLE:
1123		*isc_errno = ISC_R_NETUNREACH;
1124		if (sock->connected)
1125			doreturn = DOIO_HARD;
1126		else
1127			doreturn = DOIO_SOFT;
1128		break;
1129	case ERROR_PORT_UNREACHABLE:
1130	case ERROR_HOST_UNREACHABLE:
1131	case WSAEHOSTUNREACH:
1132		*isc_errno = ISC_R_HOSTUNREACH;
1133		if (sock->connected)
1134			doreturn = DOIO_HARD;
1135		else
1136			doreturn = DOIO_SOFT;
1137		break;
1138	case WSAENETDOWN:
1139		*isc_errno = ISC_R_NETDOWN;
1140		if (sock->connected)
1141			doreturn = DOIO_HARD;
1142		else
1143			doreturn = DOIO_SOFT;
1144		break;
1145	case WSAEHOSTDOWN:
1146		*isc_errno = ISC_R_HOSTDOWN;
1147		if (sock->connected)
1148			doreturn = DOIO_HARD;
1149		else
1150			doreturn = DOIO_SOFT;
1151		break;
1152	case WSAEACCES:
1153		*isc_errno = ISC_R_NOPERM;
1154		if (sock->connected)
1155			doreturn = DOIO_HARD;
1156		else
1157			doreturn = DOIO_SOFT;
1158		break;
1159	case WSAECONNRESET:
1160	case WSAENETRESET:
1161	case WSAECONNABORTED:
1162	case WSAEDISCON:
1163		*isc_errno = ISC_R_CONNECTIONRESET;
1164		if (sock->connected)
1165			doreturn = DOIO_HARD;
1166		else
1167			doreturn = DOIO_SOFT;
1168		break;
1169	case WSAENOTCONN:
1170		*isc_errno = ISC_R_NOTCONNECTED;
1171		if (sock->connected)
1172			doreturn = DOIO_HARD;
1173		else
1174			doreturn = DOIO_SOFT;
1175		break;
1176	case ERROR_OPERATION_ABORTED:
1177	case ERROR_CONNECTION_ABORTED:
1178	case ERROR_REQUEST_ABORTED:
1179		*isc_errno = ISC_R_CONNECTIONRESET;
1180		doreturn = DOIO_HARD;
1181		break;
1182	case WSAENOBUFS:
1183		*isc_errno = ISC_R_NORESOURCES;
1184		doreturn = DOIO_HARD;
1185		break;
1186	case WSAEAFNOSUPPORT:
1187		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1188		doreturn = DOIO_HARD;
1189		break;
1190	case WSAEADDRNOTAVAIL:
1191		*isc_errno = ISC_R_ADDRNOTAVAIL;
1192		doreturn = DOIO_HARD;
1193		break;
1194	case WSAEDESTADDRREQ:
1195		*isc_errno = ISC_R_BADADDRESSFORM;
1196		doreturn = DOIO_HARD;
1197		break;
1198	case ERROR_NETNAME_DELETED:
1199		*isc_errno = ISC_R_NETDOWN;
1200		doreturn = DOIO_HARD;
1201		break;
1202	default:
1203		*isc_errno = ISC_R_IOERROR;
1204		doreturn = DOIO_HARD;
1205		break;
1206	}
1207	if (doreturn == DOIO_HARD) {
1208		isc__strerror(windows_errno, errorstring, bufsize);
1209	}
1210	return (doreturn);
1211}
1212
1213static void
1214fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1215	isc_region_t r;
1216	int copylen;
1217	isc_buffer_t *buffer;
1218
1219	INSIST(dev->n < dev->minimum);
1220	INSIST(sock->recvbuf.remaining > 0);
1221	INSIST(sock->pending_recv == 0);
1222
1223	if (sock->type == isc_sockettype_udp) {
1224		dev->address.length = sock->recvbuf.from_addr_len;
1225		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1226		    sock->recvbuf.from_addr_len);
1227		if (isc_sockaddr_getport(&dev->address) == 0) {
1228			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1229				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1230					   isc_msgcat, ISC_MSGSET_SOCKET,
1231					   ISC_MSG_ZEROPORT,
1232					   "dropping source port zero packet");
1233			}
1234			sock->recvbuf.remaining = 0;
1235			return;
1236		}
1237	} else if (sock->type == isc_sockettype_tcp) {
1238		dev->address = sock->address;
1239	}
1240
1241	/*
1242	 * Run through the list of buffers we were given, and find the
1243	 * first one with space.  Once it is found, loop through, filling
1244	 * the buffers as much as possible.
1245	 */
1246	buffer = ISC_LIST_HEAD(dev->bufferlist);
1247	if (buffer != NULL) { // Multi-buffer receive
1248		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1249			REQUIRE(ISC_BUFFER_VALID(buffer));
1250			if (isc_buffer_availablelength(buffer) > 0) {
1251				isc_buffer_availableregion(buffer, &r);
1252				copylen = min(r.length, sock->recvbuf.remaining);
1253				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1254				sock->recvbuf.consume_position += copylen;
1255				sock->recvbuf.remaining -= copylen;
1256				isc_buffer_add(buffer, copylen);
1257				dev->n += copylen;
1258			}
1259			buffer = ISC_LIST_NEXT(buffer, link);
1260		}
1261	} else { // Single-buffer receive
1262		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1263		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1264		sock->recvbuf.consume_position += copylen;
1265		sock->recvbuf.remaining -= copylen;
1266		dev->n += copylen;
1267	}
1268
1269	/*
1270	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1271	 * data in our receive buffer, and the caller only gave us
1272	 * 1k of space, we will toss the remaining 3k of data.  TCP
1273	 * will keep the extra data around and use it for later requests.
1274	 */
1275	if (sock->type == isc_sockettype_udp)
1276		sock->recvbuf.remaining = 0;
1277}
1278
1279/*
1280 * Copy out as much data from the internal buffer to done events.
1281 * As each done event is filled, send it along its way.
1282 */
1283static void
1284completeio_recv(isc_socket_t *sock)
1285{
1286	isc_socketevent_t *dev;
1287
1288	/*
1289	 * If we are in the process of filling our buffer, we cannot
1290	 * touch it yet, so don't.
1291	 */
1292	if (sock->pending_recv > 0)
1293		return;
1294
1295	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1296		dev = ISC_LIST_HEAD(sock->recv_list);
1297
1298		/*
1299		 * See if we have sufficient data in our receive buffer
1300		 * to handle this.  If we do, copy out the data.
1301		 */
1302		fill_recv(sock, dev);
1303
1304		/*
1305		 * Did we satisfy it?
1306		 */
1307		if (dev->n >= dev->minimum) {
1308			dev->result = ISC_R_SUCCESS;
1309			send_recvdone_event(sock, &dev);
1310		}
1311	}
1312}
1313
1314/*
1315 * Returns:
1316 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1317 *			ISC_R_SUCCESS.
1318 *
1319 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1320 *			dev->result contains the appropriate error.
1321 *
1322 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1323 *			event was sent.  The operation should be retried.
1324 *
1325 *	No other return values are possible.
1326 */
1327static int
1328completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1329		struct msghdr *messagehdr, int cc, int send_errno)
1330{
1331	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1332	char strbuf[ISC_STRERRORSIZE];
1333
1334	if (send_errno != 0) {
1335		if (SOFT_ERROR(send_errno))
1336			return (DOIO_SOFT);
1337
1338		return (map_socket_error(sock, send_errno, &dev->result,
1339			strbuf, sizeof(strbuf)));
1340
1341		/*
1342		 * The other error types depend on whether or not the
1343		 * socket is UDP or TCP.  If it is UDP, some errors
1344		 * that we expect to be fatal under TCP are merely
1345		 * annoying, and are really soft errors.
1346		 *
1347		 * However, these soft errors are still returned as
1348		 * a status.
1349		 */
1350		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1351		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1352		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1353				 addrbuf, strbuf);
1354		dev->result = isc__errno2result(send_errno);
1355		return (DOIO_HARD);
1356	}
1357
1358	/*
1359	 * If we write less than we expected, update counters, poke.
1360	 */
1361	dev->n += cc;
1362	if (cc != messagehdr->msg_totallen)
1363		return (DOIO_SOFT);
1364
1365	/*
1366	 * Exactly what we wanted to write.  We're done with this
1367	 * entry.  Post its completion event.
1368	 */
1369	dev->result = ISC_R_SUCCESS;
1370	return (DOIO_SUCCESS);
1371}
1372
1373static int
1374startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1375	     int *send_errno)
1376{
1377	char *cmsg = NULL;
1378	char strbuf[ISC_STRERRORSIZE];
1379	IoCompletionInfo *lpo;
1380	int status;
1381	struct msghdr *msghdr;
1382
1383	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1384					    HEAP_ZERO_MEMORY,
1385					    sizeof(IoCompletionInfo));
1386	RUNTIME_CHECK(lpo != NULL);
1387	lpo->request_type = SOCKET_SEND;
1388	lpo->dev = dev;
1389	msghdr = &lpo->messagehdr;
1390	memset(msghdr, 0, sizeof(struct msghdr));
1391	ISC_LIST_INIT(lpo->bufferlist);
1392
1393	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1394
1395	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1396
1397	if (*nbytes < 0) {
1398		/*
1399		 * I/O has been initiated
1400		 * completion will be through the completion port
1401		 */
1402		if (PENDING_ERROR(*send_errno)) {
1403			status = DOIO_PENDING;
1404			goto done;
1405		}
1406
1407		if (SOFT_ERROR(*send_errno)) {
1408			status = DOIO_SOFT;
1409			goto done;
1410		}
1411
1412		/*
1413		 * If we got this far then something is wrong
1414		 */
1415		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1416			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1417			socket_log(__LINE__, sock, NULL, IOEVENT,
1418				   isc_msgcat, ISC_MSGSET_SOCKET,
1419				   ISC_MSG_INTERNALSEND,
1420				   "startio_send: internal_sendmsg(%d) %d "
1421				   "bytes, err %d/%s",
1422				   sock->fd, *nbytes, *send_errno, strbuf);
1423		}
1424		status = DOIO_HARD;
1425		goto done;
1426	}
1427	dev->result = ISC_R_SUCCESS;
1428	status = DOIO_SOFT;
1429 done:
1430	_set_state(sock, SOCK_DATA);
1431	return (status);
1432}
1433
1434static isc_result_t
1435allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1436		isc_socket_t **socketp) {
1437	isc_socket_t *sock;
1438	isc_result_t result;
1439
1440	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1441
1442	if (sock == NULL)
1443		return (ISC_R_NOMEMORY);
1444
1445	sock->magic = 0;
1446	sock->references = 0;
1447
1448	sock->manager = manager;
1449	sock->type = type;
1450	sock->fd = INVALID_SOCKET;
1451
1452	ISC_LINK_INIT(sock, link);
1453
1454	/*
1455	 * set up list of readers and writers to be initially empty
1456	 */
1457	ISC_LIST_INIT(sock->recv_list);
1458	ISC_LIST_INIT(sock->send_list);
1459	ISC_LIST_INIT(sock->accept_list);
1460	sock->connect_ev = NULL;
1461	sock->pending_accept = 0;
1462	sock->pending_recv = 0;
1463	sock->pending_send = 0;
1464	sock->pending_iocp = 0;
1465	sock->listener = 0;
1466	sock->connected = 0;
1467	sock->pending_connect = 0;
1468	sock->bound = 0;
1469	sock->dupped = 0;
1470	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1471	_set_state(sock, SOCK_INITIALIZED);
1472
1473	sock->recvbuf.len = 65536;
1474	sock->recvbuf.consume_position = sock->recvbuf.base;
1475	sock->recvbuf.remaining = 0;
1476	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1477	if (sock->recvbuf.base == NULL) {
1478		sock->magic = 0;
1479		goto error;
1480	}
1481
1482	/*
1483	 * initialize the lock
1484	 */
1485	result = isc_mutex_init(&sock->lock);
1486	if (result != ISC_R_SUCCESS) {
1487		sock->magic = 0;
1488		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1489		sock->recvbuf.base = NULL;
1490		goto error;
1491	}
1492
1493	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1494		   "allocated");
1495
1496	sock->magic = SOCKET_MAGIC;
1497	*socketp = sock;
1498
1499	return (ISC_R_SUCCESS);
1500
1501 error:
1502	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1503
1504	return (result);
1505}
1506
1507/*
1508 * Verify that the socket state is consistent.
1509 */
1510static void
1511consistent(isc_socket_t *sock) {
1512
1513	isc_socketevent_t *dev;
1514	isc_socket_newconnev_t *nev;
1515	unsigned int count;
1516	char *crash_reason;
1517	isc_boolean_t crash = ISC_FALSE;
1518
1519	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1520		+ sock->pending_accept + sock->pending_connect);
1521
1522	dev = ISC_LIST_HEAD(sock->send_list);
1523	count = 0;
1524	while (dev != NULL) {
1525		count++;
1526		dev = ISC_LIST_NEXT(dev, ev_link);
1527	}
1528	if (count > sock->pending_send) {
1529		crash = ISC_TRUE;
1530		crash_reason = "send_list > sock->pending_send";
1531	}
1532
1533	nev = ISC_LIST_HEAD(sock->accept_list);
1534	count = 0;
1535	while (nev != NULL) {
1536		count++;
1537		nev = ISC_LIST_NEXT(nev, ev_link);
1538	}
1539	if (count > sock->pending_accept) {
1540		crash = ISC_TRUE;
1541		crash_reason = "send_list > sock->pending_send";
1542	}
1543
1544	if (crash) {
1545		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1546			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1547			   crash_reason);
1548		sock_dump(sock);
1549		INSIST(crash == ISC_FALSE);
1550	}
1551}
1552
1553/*
1554 * Maybe free the socket.
1555 *
1556 * This function will verify tht the socket is no longer in use in any way,
1557 * either internally or externally.  This is the only place where this
1558 * check is to be made; if some bit of code believes that IT is done with
1559 * the socket (e.g., some reference counter reaches zero), it should call
1560 * this function.
1561 *
1562 * When calling this function, the socket must be locked, and the manager
1563 * must be unlocked.
1564 *
1565 * When this function returns, *socketp will be NULL.  No tricks to try
1566 * to hold on to this pointer are allowed.
1567 */
1568static void
1569maybe_free_socket(isc_socket_t **socketp, int lineno) {
1570	isc_socket_t *sock = *socketp;
1571	*socketp = NULL;
1572
1573	INSIST(VALID_SOCKET(sock));
1574	CONSISTENT(sock);
1575
1576	if (sock->pending_iocp > 0
1577	    || sock->pending_recv > 0
1578	    || sock->pending_send > 0
1579	    || sock->pending_accept > 0
1580	    || sock->references > 0
1581	    || sock->pending_connect == 1
1582	    || !ISC_LIST_EMPTY(sock->recv_list)
1583	    || !ISC_LIST_EMPTY(sock->send_list)
1584	    || !ISC_LIST_EMPTY(sock->accept_list)
1585	    || sock->fd != INVALID_SOCKET) {
1586		UNLOCK(&sock->lock);
1587		return;
1588	}
1589	UNLOCK(&sock->lock);
1590
1591	free_socket(&sock, lineno);
1592}
1593
1594void
1595free_socket(isc_socket_t **sockp, int lineno) {
1596	isc_socketmgr_t *manager;
1597	isc_socket_t *sock = *sockp;
1598	*sockp = NULL;
1599
1600	manager = sock->manager;
1601
1602	/*
1603	 * Seems we can free the socket after all.
1604	 */
1605	manager = sock->manager;
1606	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1607		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1608		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1609
1610	sock->magic = 0;
1611	DESTROYLOCK(&sock->lock);
1612
1613	if (sock->recvbuf.base != NULL)
1614		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1615
1616	LOCK(&manager->lock);
1617	if (ISC_LINK_LINKED(sock, link))
1618		ISC_LIST_UNLINK(manager->socklist, sock, link);
1619	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1620
1621	if (ISC_LIST_EMPTY(manager->socklist))
1622		SIGNAL(&manager->shutdown_ok);
1623	UNLOCK(&manager->lock);
1624}
1625
/*
 * Create a new 'type' socket of protocol family 'pf' managed by
 * 'manager'.  The new socket is returned in '*socketp' holding one
 * reference, and is appended to the manager's socket list.
 *
 * 'dup_socket' must be NULL: duplicating a socket is not implemented and
 * a non-NULL value makes the function return ISC_R_NOTIMPLEMENTED.
 *
 * Returns ISC_R_SUCCESS, or: the failure reported by allocate_socket(),
 * connection_reset_fix() or make_nonblock(); ISC_R_NORESOURCES or
 * ISC_R_FAMILYNOSUPPORT for the matching socket() errors; or
 * ISC_R_UNEXPECTED for any other socket() error.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket)
{
	isc_socket_t *sock = NULL;
	isc_result_t result;
#if defined(USE_CMSG)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif
	int socket_errno;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	REQUIRE(type != isc_sockettype_fdwatch);

	/* Duplication is rejected before anything is allocated. */
	if (dup_socket != NULL)
		return (ISC_R_NOTIMPLEMENTED);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	sock->pf = pf;
	/* The '#if 0' regions below are the disabled dup code path. */
#if 0
	if (dup_socket == NULL) {
#endif
		switch (type) {
		case isc_sockettype_udp:
			sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
			if (sock->fd != INVALID_SOCKET) {
				result = connection_reset_fix(sock->fd);
				if (result != ISC_R_SUCCESS) {
					/*
					 * Log, close the descriptor and
					 * release the socket again.
					 */
					socket_log(__LINE__, sock,
						NULL, EVENT, NULL, 0, 0,
						"closed %d %d %d "
						"con_reset_fix_failed",
						sock->pending_recv,
						sock->pending_send,
						sock->references);
					closesocket(sock->fd);
					_set_state(sock, SOCK_CLOSED);
					sock->fd = INVALID_SOCKET;
					free_socket(&sock, __LINE__);
					return (result);
				}
			}
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		}
#if 0
	} else {
		/*
		 * XXX: dup() is deprecated in windows, use _dup()
		 * instead.  In future we may want to investigate
		 * WSADuplicateSocket().
		 */
		sock->fd = _dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
#endif

	if (sock->fd == INVALID_SOCKET) {
		/* Fetch the error code before free_socket() is called. */
		socket_errno = WSAGetLastError();
		free_socket(&sock, __LINE__);

		switch (socket_errno) {
		case WSAEMFILE:
		case WSAENOBUFS:
			return (ISC_R_NORESOURCES);

		case WSAEPROTONOSUPPORT:
		case WSAEPFNOSUPPORT:
		case WSAEAFNOSUPPORT:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "socket() %s: %s",
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
			"closed %d %d %d make_nonblock_failed",
			sock->pending_recv, sock->pending_send,
			sock->references);
		closesocket(sock->fd);
		sock->fd = INVALID_SOCKET;
		free_socket(&sock, __LINE__);
		return (result);
	}


	/*
	 * UDP-only socket options.  A failing setsockopt() below is
	 * reported (or ignored) but never aborts socket creation.
	 */
#if defined(USE_CMSG) || defined(SO_RCVBUF)
	if (type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(ISC_PLATFORM_HAVEIPV6)
#ifdef IPV6_RECVPKTINFO
		/* 2292bis */
		if ((pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (char *)&on, sizeof(on)) < 0)) {
			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* 2292 */
		if ((pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (char *)&on, sizeof(on)) < 0)) {
			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
		/* use minimum MTU */
		if (pf == AF_INET6) {
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_USE_MIN_MTU,
					 (char *)&on, sizeof(on));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(SO_RCVBUF)
	       /*
		* Raise the receive buffer to RCVBUFSIZE when the current
		* size can be read and is smaller than that.
		*/
	       optlen = sizeof(size);
	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			      (char *)&size, &optlen) >= 0 &&
		    size < RCVBUFSIZE) {
		       size = RCVBUFSIZE;
		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
					(char *)&size, sizeof(size));
	       }
#endif

	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

	_set_state(sock, SOCK_OPEN);
	sock->references = 1;
	*socketp = sock;

	iocompletionport_update(sock);

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */
	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
	InterlockedIncrement(&manager->totalSockets);
	UNLOCK(&manager->lock);

	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
		   ISC_MSGSET_SOCKET, ISC_MSG_CREATED,
		   "created %u type %u", sock->fd, type);

	return (ISC_R_SUCCESS);
}
1823
1824isc_result_t
1825isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1826		   isc_socket_t **socketp)
1827{
1828	return (socket_create(manager, pf, type, socketp, NULL));
1829}
1830
1831isc_result_t
1832isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
1833	REQUIRE(VALID_SOCKET(sock));
1834	REQUIRE(socketp != NULL && *socketp == NULL);
1835
1836#if 1
1837	return (ISC_R_NOTIMPLEMENTED);
1838#else
1839	return (socket_create(sock->manager, sock->pf, sock->type,
1840			      socketp, sock));
1841#endif
1842}
1843
1844isc_result_t
1845isc_socket_open(isc_socket_t *sock) {
1846	REQUIRE(VALID_SOCKET(sock));
1847	REQUIRE(sock->type != isc_sockettype_fdwatch);
1848
1849	return (ISC_R_NOTIMPLEMENTED);
1850}
1851
1852/*
1853 * Attach to a socket.  Caller must explicitly detach when it is done.
1854 */
1855void
1856isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1857	REQUIRE(VALID_SOCKET(sock));
1858	REQUIRE(socketp != NULL && *socketp == NULL);
1859
1860	LOCK(&sock->lock);
1861	CONSISTENT(sock);
1862	sock->references++;
1863	UNLOCK(&sock->lock);
1864
1865	*socketp = sock;
1866}
1867
1868/*
1869 * Dereference a socket.  If this is the last reference to it, clean things
1870 * up by destroying the socket.
1871 */
1872void
1873isc__socket_detach(isc_socket_t **socketp) {
1874	isc_socket_t *sock;
1875	isc_boolean_t kill_socket = ISC_FALSE;
1876
1877	REQUIRE(socketp != NULL);
1878	sock = *socketp;
1879	REQUIRE(VALID_SOCKET(sock));
1880	REQUIRE(sock->type != isc_sockettype_fdwatch);
1881
1882	LOCK(&sock->lock);
1883	CONSISTENT(sock);
1884	REQUIRE(sock->references > 0);
1885	sock->references--;
1886
1887	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1888		"detach_socket %d %d %d",
1889		sock->pending_recv, sock->pending_send,
1890		sock->references);
1891
1892	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1893		closesocket(sock->fd);
1894		sock->fd = INVALID_SOCKET;
1895		_set_state(sock, SOCK_CLOSED);
1896	}
1897
1898	maybe_free_socket(&sock, __LINE__);
1899
1900	*socketp = NULL;
1901}
1902
1903isc_result_t
1904isc_socket_close(isc_socket_t *sock) {
1905	REQUIRE(VALID_SOCKET(sock));
1906	REQUIRE(sock->type != isc_sockettype_fdwatch);
1907
1908	return (ISC_R_NOTIMPLEMENTED);
1909}
1910
1911/*
1912 * Dequeue an item off the given socket's read queue, set the result code
1913 * in the done event to the one provided, and send it to the task it was
1914 * destined for.
1915 *
1916 * If the event to be sent is on a list, remove it before sending.  If
1917 * asked to, send and detach from the task as well.
1918 *
1919 * Caller must have the socket locked if the event is attached to the socket.
1920 */
1921static void
1922send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1923	isc_task_t *task;
1924
1925	task = (*dev)->ev_sender;
1926	(*dev)->ev_sender = sock;
1927
1928	if (ISC_LINK_LINKED(*dev, ev_link))
1929		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1930
1931	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1932	    == ISC_SOCKEVENTATTR_ATTACHED)
1933		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1934	else
1935		isc_task_send(task, (isc_event_t **)dev);
1936
1937	CONSISTENT(sock);
1938}
1939
1940/*
1941 * See comments for send_recvdone_event() above.
1942 */
1943static void
1944send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1945	isc_task_t *task;
1946
1947	INSIST(dev != NULL && *dev != NULL);
1948
1949	task = (*dev)->ev_sender;
1950	(*dev)->ev_sender = sock;
1951
1952	if (ISC_LINK_LINKED(*dev, ev_link))
1953		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1954
1955	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1956	    == ISC_SOCKEVENTATTR_ATTACHED)
1957		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1958	else
1959		isc_task_send(task, (isc_event_t **)dev);
1960
1961	CONSISTENT(sock);
1962}
1963
1964/*
1965 * See comments for send_recvdone_event() above.
1966 */
1967static void
1968send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1969	isc_task_t *task;
1970
1971	INSIST(adev != NULL && *adev != NULL);
1972
1973	task = (*adev)->ev_sender;
1974	(*adev)->ev_sender = sock;
1975
1976	if (ISC_LINK_LINKED(*adev, ev_link))
1977		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1978
1979	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1980
1981	CONSISTENT(sock);
1982}
1983
1984/*
1985 * See comments for send_recvdone_event() above.
1986 */
1987static void
1988send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1989	isc_task_t *task;
1990
1991	INSIST(cdev != NULL && *cdev != NULL);
1992
1993	task = (*cdev)->ev_sender;
1994	(*cdev)->ev_sender = sock;
1995
1996	sock->connect_ev = NULL;
1997
1998	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1999
2000	CONSISTENT(sock);
2001}
2002
2003/*
2004 * On entry to this function, the event delivered is the internal
2005 * readable event, and the first item on the accept_list should be
2006 * the done event we want to send.  If the list is empty, this is a no-op,
2007 * so just close the new connection, unlock, and return.
2008 *
2009 * Note the socket is locked before entering here
2010 */
2011static void
2012internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
2013	isc_socket_newconnev_t *adev;
2014	isc_result_t result = ISC_R_SUCCESS;
2015	isc_socket_t *nsock;
2016	struct sockaddr *localaddr;
2017	int localaddr_len = sizeof(*localaddr);
2018	struct sockaddr *remoteaddr;
2019	int remoteaddr_len = sizeof(*remoteaddr);
2020
2021	INSIST(VALID_SOCKET(sock));
2022	LOCK(&sock->lock);
2023	CONSISTENT(sock);
2024
2025	socket_log(__LINE__, sock, NULL, TRACE,
2026		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2027		   "internal_accept called");
2028
2029	INSIST(sock->listener);
2030
2031	INSIST(sock->pending_iocp > 0);
2032	sock->pending_iocp--;
2033	INSIST(sock->pending_accept > 0);
2034	sock->pending_accept--;
2035
2036	adev = lpo->adev;
2037
2038	/*
2039	 * If the event is no longer in the list we can just return.
2040	 */
2041	if (!acceptdone_is_active(sock, adev))
2042		goto done;
2043
2044	nsock = adev->newsocket;
2045
2046	/*
2047	 * Pull off the done event.
2048	 */
2049	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2050
2051	/*
2052	 * Extract the addresses from the socket, copy them into the structure,
2053	 * and return the new socket.
2054	 */
2055	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2056		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2057		(LPSOCKADDR *)&localaddr, &localaddr_len,
2058		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2059	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
2060	adev->address.length = remoteaddr_len;
2061	nsock->address = adev->address;
2062	nsock->pf = adev->address.type.sa.sa_family;
2063
2064	socket_log(__LINE__, nsock, &nsock->address, TRACE,
2065		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2066		   "internal_accept parent %p", sock);
2067
2068	result = make_nonblock(adev->newsocket->fd);
2069	INSIST(result == ISC_R_SUCCESS);
2070
2071	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2072			  (char *)&sock->fd, sizeof(sock->fd)) == 0);
2073
2074	/*
2075	 * Hook it up into the manager.
2076	 */
2077	nsock->bound = 1;
2078	nsock->connected = 1;
2079	_set_state(nsock, SOCK_OPEN);
2080
2081	LOCK(&nsock->manager->lock);
2082	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2083	InterlockedIncrement(&nsock->manager->totalSockets);
2084	UNLOCK(&nsock->manager->lock);
2085
2086	socket_log(__LINE__, sock, &nsock->address, CREATION,
2087		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2088		   "accepted_connection new_socket %p fd %d",
2089		   nsock, nsock->fd);
2090
2091	adev->result = result;
2092	send_acceptdone_event(sock, &adev);
2093
2094done:
2095	CONSISTENT(sock);
2096	UNLOCK(&sock->lock);
2097
2098	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2099	lpo->acceptbuffer = NULL;
2100}
2101
2102/*
2103 * Called when a socket with a pending connect() finishes.
2104 * Note that the socket is locked before entering.
2105 */
2106static void
2107internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2108	isc_socket_connev_t *cdev;
2109	char strbuf[ISC_STRERRORSIZE];
2110
2111	INSIST(VALID_SOCKET(sock));
2112
2113	LOCK(&sock->lock);
2114
2115	INSIST(sock->pending_iocp > 0);
2116	sock->pending_iocp--;
2117	INSIST(sock->pending_connect == 1);
2118	sock->pending_connect = 0;
2119
2120	/*
2121	 * Has this event been canceled?
2122	 */
2123	cdev = lpo->cdev;
2124	if (!connectdone_is_active(sock, cdev)) {
2125		sock->pending_connect = 0;
2126		if (sock->fd != INVALID_SOCKET) {
2127			closesocket(sock->fd);
2128			sock->fd = INVALID_SOCKET;
2129			_set_state(sock, SOCK_CLOSED);
2130		}
2131		CONSISTENT(sock);
2132		UNLOCK(&sock->lock);
2133		return;
2134	}
2135
2136	/*
2137	 * Check possible Windows network event error status here.
2138	 */
2139	if (connect_errno != 0) {
2140		/*
2141		 * If the error is SOFT, just try again on this
2142		 * fd and pretend nothing strange happened.
2143		 */
2144		if (SOFT_ERROR(connect_errno) ||
2145		    connect_errno == WSAEINPROGRESS) {
2146			sock->pending_connect = 1;
2147			CONSISTENT(sock);
2148			UNLOCK(&sock->lock);
2149			return;
2150		}
2151
2152		/*
2153		 * Translate other errors into ISC_R_* flavors.
2154		 */
2155		switch (connect_errno) {
2156#define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2157			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2158			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2159			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2160			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2161			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2162			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2163			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2164			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2165			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2166			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2167			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2168			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2169#undef ERROR_MATCH
2170		default:
2171			cdev->result = ISC_R_UNEXPECTED;
2172			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2173			UNEXPECTED_ERROR(__FILE__, __LINE__,
2174					 "internal_connect: connect() %s",
2175					 strbuf);
2176		}
2177	} else {
2178		INSIST(setsockopt(sock->fd, SOL_SOCKET,
2179				  SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2180		cdev->result = ISC_R_SUCCESS;
2181		sock->connected = 1;
2182		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2183			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2184			   "internal_connect: success");
2185	}
2186
2187	send_connectdone_event(sock, &cdev);
2188
2189	UNLOCK(&sock->lock);
2190}
2191
2192/*
2193 * Loop through the socket, returning ISC_R_EOF for each done event pending.
2194 */
2195static void
2196send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2197	isc_socketevent_t *dev;
2198
2199	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2200		dev = ISC_LIST_HEAD(sock->recv_list);
2201		dev->result = result;
2202		send_recvdone_event(sock, &dev);
2203	}
2204}
2205
2206/*
2207 * Take the data we received in our private buffer, and if any recv() calls on
2208 * our list are satisfied, send the corresponding done event.
2209 *
2210 * If we need more data (there are still items on the recv_list after we consume all
2211 * our data) then arrange for another system recv() call to fill our buffers.
2212 */
2213static void
2214internal_recv(isc_socket_t *sock, int nbytes)
2215{
2216	INSIST(VALID_SOCKET(sock));
2217
2218	LOCK(&sock->lock);
2219	CONSISTENT(sock);
2220
2221	socket_log(__LINE__, sock, NULL, IOEVENT,
2222		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2223		   "internal_recv: %d bytes received", nbytes);
2224
2225	/*
2226	 * If we got here, the I/O operation succeeded.  However, we might still have removed this
2227	 * event from our notification list (or never placed it on it due to immediate completion.)
2228	 * Handle the reference counting here, and handle the cancellation event just after.
2229	 */
2230	INSIST(sock->pending_iocp > 0);
2231	sock->pending_iocp--;
2232	INSIST(sock->pending_recv > 0);
2233	sock->pending_recv--;
2234
2235	/*
2236	 * The only way we could have gotten here is that our I/O has successfully completed.
2237	 * Update our pointers, and move on.  The only odd case here is that we might not
2238	 * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2239	 * this is the case, we will re-issue the recv() call for what we need.
2240	 *
2241	 * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2242	 * has closed.
2243	 */
2244	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2245		send_recvdone_abort(sock, ISC_R_EOF);
2246		maybe_free_socket(&sock, __LINE__);
2247		return;
2248	}
2249	sock->recvbuf.remaining = nbytes;
2250	sock->recvbuf.consume_position = sock->recvbuf.base;
2251	completeio_recv(sock);
2252
2253	/*
2254	 * If there are more receivers waiting for data, queue another receive
2255	 * here.
2256	 */
2257	queue_receive_request(sock);
2258
2259	/*
2260	 * Unlock and/or destroy if we are the last thing this socket has left to do.
2261	 */
2262	maybe_free_socket(&sock, __LINE__);
2263}
2264
2265static void
2266internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2267	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2268{
2269	buflist_t *buffer;
2270
2271	/*
2272	 * Find out what socket this is and lock it.
2273	 */
2274	INSIST(VALID_SOCKET(sock));
2275
2276	LOCK(&sock->lock);
2277	CONSISTENT(sock);
2278
2279	socket_log(__LINE__, sock, NULL, IOEVENT,
2280		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2281		   "internal_send: task got socket event %p", dev);
2282
2283	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2284	while (buffer != NULL) {
2285		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2286
2287		socket_log(__LINE__, sock, NULL, TRACE,
2288		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2289		   "free_buffer %p %p", buffer, buffer->buf);
2290
2291		HeapFree(hHeapHandle, 0, buffer->buf);
2292		HeapFree(hHeapHandle, 0, buffer);
2293		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2294	}
2295
2296	INSIST(sock->pending_iocp > 0);
2297	sock->pending_iocp--;
2298	INSIST(sock->pending_send > 0);
2299	sock->pending_send--;
2300
2301	/* If the event is no longer in the list we can just return */
2302	if (!senddone_is_active(sock, dev))
2303		goto done;
2304
2305	/*
2306	 * Set the error code and send things on its way.
2307	 */
2308	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2309	case DOIO_SOFT:
2310		break;
2311	case DOIO_HARD:
2312	case DOIO_SUCCESS:
2313		send_senddone_event(sock, &dev);
2314		break;
2315	}
2316
2317 done:
2318	maybe_free_socket(&sock, __LINE__);
2319}
2320
2321/*
2322 * These return if the done event passed in is on the list (or for connect, is
2323 * the one we're waiting for.  Using these ensures we will not double-send an
2324 * event.
2325 */
2326static isc_boolean_t
2327senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2328{
2329	isc_socketevent_t *ldev;
2330
2331	ldev = ISC_LIST_HEAD(sock->send_list);
2332	while (ldev != NULL && ldev != dev)
2333		ldev = ISC_LIST_NEXT(ldev, ev_link);
2334
2335	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2336}
2337
2338static isc_boolean_t
2339acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2340{
2341	isc_socket_newconnev_t *ldev;
2342
2343	ldev = ISC_LIST_HEAD(sock->accept_list);
2344	while (ldev != NULL && ldev != dev)
2345		ldev = ISC_LIST_NEXT(ldev, ev_link);
2346
2347	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2348}
2349
2350static isc_boolean_t
2351connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2352{
2353	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2354}
2355
2356//
2357// The Windows network stack seems to have two very distinct paths depending
2358// on what is installed.  Specifically, if something is looking at network
2359// connections (like an anti-virus or anti-malware application, such as
2360// McAfee products) Windows may return additional error conditions which
2361// were not previously returned.
2362//
2363// One specific one is when a TCP SYN scan is used.  In this situation,
2364// Windows responds with the SYN-ACK, but the scanner never responds with
2365// the 3rd packet, the ACK.  Windows consiers this a partially open connection.
2366// Most Unix networking stacks, and Windows without McAfee installed, will
2367// not return this to the caller.  However, with this product installed,
2368// Windows returns this as a failed status on the Accept() call.  Here, we
2369// will just re-issue the ISCAcceptEx() call as if nothing had happened.
2370//
2371// This code should only be called when the listening socket has received
2372// such an error.  Additionally, the "parent" socket must be locked.
2373// Additionally, the lpo argument is re-used here, and must not be freed
2374// by the caller.
2375//
2376static isc_result_t
2377restart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2378{
2379	isc_socket_t *nsock = lpo->adev->newsocket;
2380	SOCKET new_fd;
2381
2382	/*
2383	 * AcceptEx() requires we pass in a socket.  Note that we carefully
2384	 * do not close the previous socket in case of an error message returned by
2385	 * our new socket() call.  If we return an error here, our caller will
2386	 * clean up.
2387	 */
2388	new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2389	if (nsock->fd == INVALID_SOCKET) {
2390		return (ISC_R_FAILURE); // parent will ask windows for error message
2391	}
2392	closesocket(nsock->fd);
2393	nsock->fd = new_fd;
2394
2395	memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2396
2397	ISCAcceptEx(parent->fd,
2398		    nsock->fd,				/* Accepted Socket */
2399		    lpo->acceptbuffer,			/* Buffer for initial Recv */
2400		    0,					/* Length of Buffer */
2401		    sizeof(SOCKADDR_STORAGE) + 16,	/* Local address length + 16 */
2402		    sizeof(SOCKADDR_STORAGE) + 16,	/* Remote address lengh + 16 */
2403		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
2404		    (LPOVERLAPPED)lpo			/* Overlapped structure */
2405		    );
2406
2407	InterlockedDecrement(&nsock->manager->iocp_total);
2408	iocompletionport_update(nsock);
2409
2410	return (ISC_R_SUCCESS);
2411}
2412
2413/*
2414 * This is the I/O Completion Port Worker Function. It loops forever
2415 * waiting for I/O to complete and then forwards them for further
2416 * processing. There are a number of these in separate threads.
2417 */
2418static isc_threadresult_t WINAPI
2419SocketIoThread(LPVOID ThreadContext) {
2420	isc_socketmgr_t *manager = ThreadContext;
2421	BOOL bSuccess = FALSE;
2422	DWORD nbytes;
2423	IoCompletionInfo *lpo = NULL;
2424	isc_socket_t *sock = NULL;
2425	int request;
2426	struct msghdr *messagehdr = NULL;
2427	int errval;
2428	char strbuf[ISC_STRERRORSIZE];
2429	int errstatus;
2430
2431	REQUIRE(VALID_MANAGER(manager));
2432
2433	/*
2434	 * Set the thread priority high enough so I/O will
2435	 * preempt normal recv packet processing, but not
2436	 * higher than the timer sync thread.
2437	 */
2438	if (!SetThreadPriority(GetCurrentThread(),
2439			       THREAD_PRIORITY_ABOVE_NORMAL)) {
2440		errval = GetLastError();
2441		isc__strerror(errval, strbuf, sizeof(strbuf));
2442		FATAL_ERROR(__FILE__, __LINE__,
2443				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2444				ISC_MSG_FAILED,
2445				"Can't set thread priority: %s"),
2446				strbuf);
2447	}
2448
2449	/*
2450	 * Loop forever waiting on I/O Completions and then processing them
2451	 */
2452	while (TRUE) {
2453		wait_again:
2454		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2455						     &nbytes, (LPDWORD)&sock,
2456						     (LPWSAOVERLAPPED *)&lpo,
2457						     INFINITE);
2458		if (lpo == NULL) /* Received request to exit */
2459			break;
2460
2461		REQUIRE(VALID_SOCKET(sock));
2462
2463		request = lpo->request_type;
2464
2465		errstatus = 0;
2466		if (!bSuccess) {
2467			isc_result_t isc_result;
2468
2469			/*
2470			 * Did the I/O operation complete?
2471			 */
2472			errstatus = GetLastError();
2473			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2474
2475			LOCK(&sock->lock);
2476			CONSISTENT(sock);
2477			switch (request) {
2478			case SOCKET_RECV:
2479				INSIST(sock->pending_iocp > 0);
2480				sock->pending_iocp--;
2481				INSIST(sock->pending_recv > 0);
2482				sock->pending_recv--;
2483				if (!sock->connected &&
2484				    ((errstatus == ERROR_HOST_UNREACHABLE) ||
2485				     (errstatus == WSAENETRESET) ||
2486				     (errstatus == WSAECONNRESET))) {
2487					/* ignore soft errors */
2488					queue_receive_request(sock);
2489					break;
2490				}
2491				send_recvdone_abort(sock, isc_result);
2492				if (isc_result == ISC_R_UNEXPECTED) {
2493					UNEXPECTED_ERROR(__FILE__, __LINE__,
2494						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2495						errstatus, isc_result);
2496				}
2497				break;
2498
2499			case SOCKET_SEND:
2500				INSIST(sock->pending_iocp > 0);
2501				sock->pending_iocp--;
2502				INSIST(sock->pending_send > 0);
2503				sock->pending_send--;
2504				if (senddone_is_active(sock, lpo->dev)) {
2505					lpo->dev->result = isc_result;
2506					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2507						"canceled_send");
2508					send_senddone_event(sock, &lpo->dev);
2509				}
2510				break;
2511
2512			case SOCKET_ACCEPT:
2513				INSIST(sock->pending_iocp > 0);
2514				INSIST(sock->pending_accept > 0);
2515
2516				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2517					"Accept: errstatus=%d isc_result=%d", errstatus, isc_result);
2518
2519				if (acceptdone_is_active(sock, lpo->adev)) {
2520					if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
2521						UNLOCK(&sock->lock);
2522						goto wait_again;
2523					} else {
2524						errstatus = GetLastError();
2525						isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2526						socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2527							"restart_accept() failed: errstatus=%d isc_result=%d",
2528							errstatus, isc_result);
2529					}
2530				}
2531
2532				sock->pending_iocp--;
2533				sock->pending_accept--;
2534				if (acceptdone_is_active(sock, lpo->adev)) {
2535					closesocket(lpo->adev->newsocket->fd);
2536					lpo->adev->newsocket->fd = INVALID_SOCKET;
2537					lpo->adev->newsocket->references--;
2538					free_socket(&lpo->adev->newsocket, __LINE__);
2539					lpo->adev->result = isc_result;
2540					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2541						"canceled_accept");
2542					send_acceptdone_event(sock, &lpo->adev);
2543				}
2544				break;
2545
2546			case SOCKET_CONNECT:
2547				INSIST(sock->pending_iocp > 0);
2548				sock->pending_iocp--;
2549				INSIST(sock->pending_connect == 1);
2550				sock->pending_connect = 0;
2551				if (connectdone_is_active(sock, lpo->cdev)) {
2552					lpo->cdev->result = isc_result;
2553					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2554						"canceled_connect");
2555					send_connectdone_event(sock, &lpo->cdev);
2556				}
2557				break;
2558			}
2559			maybe_free_socket(&sock, __LINE__);
2560
2561			if (lpo != NULL)
2562				HeapFree(hHeapHandle, 0, lpo);
2563			continue;
2564		}
2565
2566		messagehdr = &lpo->messagehdr;
2567
2568		switch (request) {
2569		case SOCKET_RECV:
2570			internal_recv(sock, nbytes);
2571			break;
2572		case SOCKET_SEND:
2573			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2574			break;
2575		case SOCKET_ACCEPT:
2576			internal_accept(sock, lpo, errstatus);
2577			break;
2578		case SOCKET_CONNECT:
2579			internal_connect(sock, lpo, errstatus);
2580			break;
2581		}
2582
2583		if (lpo != NULL)
2584			HeapFree(hHeapHandle, 0, lpo);
2585	}
2586
2587	/*
2588	 * Exit Completion Port Thread
2589	 */
2590	manager_log(manager, TRACE,
2591		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2592				   ISC_MSG_EXITING, "SocketIoThread exiting"));
2593	return ((isc_threadresult_t)0);
2594}
2595
2596/*
2597 * Create a new socket manager.
2598 */
2599isc_result_t
2600isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2601	return (isc_socketmgr_create2(mctx, managerp, 0));
2602}
2603
2604isc_result_t
2605isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2606		       unsigned int maxsocks)
2607{
2608	isc_socketmgr_t *manager;
2609	isc_result_t result;
2610
2611	REQUIRE(managerp != NULL && *managerp == NULL);
2612
2613	if (maxsocks != 0)
2614		return (ISC_R_NOTIMPLEMENTED);
2615
2616	manager = isc_mem_get(mctx, sizeof(*manager));
2617	if (manager == NULL)
2618		return (ISC_R_NOMEMORY);
2619
2620	InitSockets();
2621
2622	manager->magic = SOCKET_MANAGER_MAGIC;
2623	manager->mctx = NULL;
2624	manager->stats = NULL;
2625	ISC_LIST_INIT(manager->socklist);
2626	result = isc_mutex_init(&manager->lock);
2627	if (result != ISC_R_SUCCESS) {
2628		isc_mem_put(mctx, manager, sizeof(*manager));
2629		return (result);
2630	}
2631	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2632		DESTROYLOCK(&manager->lock);
2633		isc_mem_put(mctx, manager, sizeof(*manager));
2634		UNEXPECTED_ERROR(__FILE__, __LINE__,
2635				 "isc_condition_init() %s",
2636				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2637						ISC_MSG_FAILED, "failed"));
2638		return (ISC_R_UNEXPECTED);
2639	}
2640
2641	isc_mem_attach(mctx, &manager->mctx);
2642
2643	iocompletionport_init(manager);	/* Create the Completion Ports */
2644
2645	manager->bShutdown = ISC_FALSE;
2646	manager->totalSockets = 0;
2647	manager->iocp_total = 0;
2648
2649	*managerp = manager;
2650
2651	return (ISC_R_SUCCESS);
2652}
2653
2654isc_result_t
2655isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2656	REQUIRE(VALID_MANAGER(manager));
2657	REQUIRE(nsockp != NULL);
2658
2659	return (ISC_R_NOTIMPLEMENTED);
2660}
2661
2662void
2663isc__socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2664	REQUIRE(VALID_MANAGER(manager));
2665	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2666	REQUIRE(manager->stats == NULL);
2667	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2668
2669	isc_stats_attach(stats, &manager->stats);
2670}
2671
2672void
2673isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2674	isc_socketmgr_t *manager;
2675	int i;
2676	isc_mem_t *mctx;
2677
2678	/*
2679	 * Destroy a socket manager.
2680	 */
2681
2682	REQUIRE(managerp != NULL);
2683	manager = *managerp;
2684	REQUIRE(VALID_MANAGER(manager));
2685
2686	LOCK(&manager->lock);
2687
2688	/*
2689	 * Wait for all sockets to be destroyed.
2690	 */
2691	while (!ISC_LIST_EMPTY(manager->socklist)) {
2692		manager_log(manager, CREATION,
2693			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2694					   ISC_MSG_SOCKETSREMAIN,
2695					   "sockets exist"));
2696		WAIT(&manager->shutdown_ok, &manager->lock);
2697	}
2698
2699	UNLOCK(&manager->lock);
2700
2701	/*
2702	 * Here, we need to had some wait code for the completion port
2703	 * thread.
2704	 */
2705	signal_iocompletionport_exit(manager);
2706	manager->bShutdown = ISC_TRUE;
2707
2708	/*
2709	 * Wait for threads to exit.
2710	 */
2711	for (i = 0; i < manager->maxIOCPThreads; i++) {
2712		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2713			NULL) != ISC_R_SUCCESS)
2714			UNEXPECTED_ERROR(__FILE__, __LINE__,
2715				 "isc_thread_join() for Completion Port %s",
2716				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2717						ISC_MSG_FAILED, "failed"));
2718	}
2719	/*
2720	 * Clean up.
2721	 */
2722
2723	CloseHandle(manager->hIoCompletionPort);
2724
2725	(void)isc_condition_destroy(&manager->shutdown_ok);
2726
2727	DESTROYLOCK(&manager->lock);
2728	if (manager->stats != NULL)
2729		isc_stats_detach(&manager->stats);
2730	manager->magic = 0;
2731	mctx= manager->mctx;
2732	isc_mem_put(mctx, manager, sizeof(*manager));
2733
2734	isc_mem_detach(&mctx);
2735
2736	*managerp = NULL;
2737}
2738
2739static void
2740queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2741{
2742	isc_task_t *ntask = NULL;
2743
2744	isc_task_attach(task, &ntask);
2745	dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2746
2747	/*
2748	 * Enqueue the request.
2749	 */
2750	INSIST(!ISC_LINK_LINKED(dev, ev_link));
2751	ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2752
2753	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2754		   "queue_receive_event: event %p -> task %p",
2755		   dev, ntask);
2756}
2757
2758/*
2759 * Check the pending receive queue, and if we have data pending, give it to this
2760 * caller.  If we have none, queue an I/O request.  If this caller is not the first
2761 * on the list, then we will just queue this event and return.
2762 *
2763 * Caller must have the socket locked.
2764 */
2765static isc_result_t
2766socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2767	    unsigned int flags)
2768{
2769	int cc = 0;
2770	isc_task_t *ntask = NULL;
2771	isc_result_t result = ISC_R_SUCCESS;
2772	int recv_errno = 0;
2773
2774	dev->ev_sender = task;
2775
2776	if (sock->fd == INVALID_SOCKET)
2777		return (ISC_R_EOF);
2778
2779	/*
2780	 * Queue our event on the list of things to do.  Call our function to
2781	 * attempt to fill buffers as much as possible, and return done events.
2782	 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2783	 * here and tell our caller that we could not satisfy it immediately.
2784	 */
2785	queue_receive_event(sock, task, dev);
2786	if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2787		result = ISC_R_INPROGRESS;
2788
2789	completeio_recv(sock);
2790
2791	/*
2792	 * If there are more receivers waiting for data, queue another receive
2793	 * here.  If the
2794	 */
2795	queue_receive_request(sock);
2796
2797	return (result);
2798}
2799
2800isc_result_t
2801isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2802		 unsigned int minimum, isc_task_t *task,
2803		 isc_taskaction_t action, const void *arg)
2804{
2805	isc_socketevent_t *dev;
2806	isc_socketmgr_t *manager;
2807	unsigned int iocount;
2808	isc_buffer_t *buffer;
2809	isc_result_t ret;
2810
2811	REQUIRE(VALID_SOCKET(sock));
2812	LOCK(&sock->lock);
2813	CONSISTENT(sock);
2814
2815	/*
2816	 * Make sure that the socket is not closed.  XXXMLG change error here?
2817	 */
2818	if (sock->fd == INVALID_SOCKET) {
2819		UNLOCK(&sock->lock);
2820		return (ISC_R_CONNREFUSED);
2821	}
2822
2823	REQUIRE(buflist != NULL);
2824	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2825	REQUIRE(task != NULL);
2826	REQUIRE(action != NULL);
2827
2828	manager = sock->manager;
2829	REQUIRE(VALID_MANAGER(manager));
2830
2831	iocount = isc_bufferlist_availablecount(buflist);
2832	REQUIRE(iocount > 0);
2833
2834	INSIST(sock->bound);
2835
2836	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2837	if (dev == NULL) {
2838		UNLOCK(&sock->lock);
2839		return (ISC_R_NOMEMORY);
2840	}
2841
2842	/*
2843	 * UDP sockets are always partial read
2844	 */
2845	if (sock->type == isc_sockettype_udp)
2846		dev->minimum = 1;
2847	else {
2848		if (minimum == 0)
2849			dev->minimum = iocount;
2850		else
2851			dev->minimum = minimum;
2852	}
2853
2854	/*
2855	 * Move each buffer from the passed in list to our internal one.
2856	 */
2857	buffer = ISC_LIST_HEAD(*buflist);
2858	while (buffer != NULL) {
2859		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2860		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2861		buffer = ISC_LIST_HEAD(*buflist);
2862	}
2863
2864	ret = socket_recv(sock, dev, task, 0);
2865
2866	UNLOCK(&sock->lock);
2867	return (ret);
2868}
2869
2870isc_result_t
2871isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
2872		 unsigned int minimum, isc_task_t *task,
2873		 isc_taskaction_t action, const void *arg)
2874{
2875	isc_socketevent_t *dev;
2876	isc_socketmgr_t *manager;
2877	isc_result_t ret;
2878
2879	REQUIRE(VALID_SOCKET(sock));
2880	LOCK(&sock->lock);
2881	CONSISTENT(sock);
2882
2883	/*
2884	 * make sure that the socket's not closed
2885	 */
2886	if (sock->fd == INVALID_SOCKET) {
2887		UNLOCK(&sock->lock);
2888		return (ISC_R_CONNREFUSED);
2889	}
2890	REQUIRE(action != NULL);
2891
2892	manager = sock->manager;
2893	REQUIRE(VALID_MANAGER(manager));
2894
2895	INSIST(sock->bound);
2896
2897	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2898	if (dev == NULL) {
2899		UNLOCK(&sock->lock);
2900		return (ISC_R_NOMEMORY);
2901	}
2902
2903	ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2904	UNLOCK(&sock->lock);
2905	return (ret);
2906}
2907
2908isc_result_t
2909isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
2910		  unsigned int minimum, isc_task_t *task,
2911		  isc_socketevent_t *event, unsigned int flags)
2912{
2913	isc_result_t ret;
2914
2915	REQUIRE(VALID_SOCKET(sock));
2916	LOCK(&sock->lock);
2917	CONSISTENT(sock);
2918
2919	event->result = ISC_R_UNEXPECTED;
2920	event->ev_sender = sock;
2921	/*
2922	 * make sure that the socket's not closed
2923	 */
2924	if (sock->fd == INVALID_SOCKET) {
2925		UNLOCK(&sock->lock);
2926		return (ISC_R_CONNREFUSED);
2927	}
2928
2929	ISC_LIST_INIT(event->bufferlist);
2930	event->region = *region;
2931	event->n = 0;
2932	event->offset = 0;
2933	event->attributes = 0;
2934
2935	/*
2936	 * UDP sockets are always partial read.
2937	 */
2938	if (sock->type == isc_sockettype_udp)
2939		event->minimum = 1;
2940	else {
2941		if (minimum == 0)
2942			event->minimum = region->length;
2943		else
2944			event->minimum = minimum;
2945	}
2946
2947	ret = socket_recv(sock, event, task, flags);
2948	UNLOCK(&sock->lock);
2949	return (ret);
2950}
2951
2952/*
2953 * Caller must have the socket locked.
2954 */
2955static isc_result_t
2956socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2957	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2958	    unsigned int flags)
2959{
2960	int io_state;
2961	int send_errno = 0;
2962	int cc = 0;
2963	isc_task_t *ntask = NULL;
2964	isc_result_t result = ISC_R_SUCCESS;
2965
2966	dev->ev_sender = task;
2967
2968	set_dev_address(address, sock, dev);
2969	if (pktinfo != NULL) {
2970		socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2971			   ISC_MSG_PKTINFOPROVIDED,
2972			   "pktinfo structure provided, ifindex %u (set to 0)",
2973			   pktinfo->ipi6_ifindex);
2974
2975		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2976		dev->pktinfo = *pktinfo;
2977		/*
2978		 * Set the pktinfo index to 0 here, to let the kernel decide
2979		 * what interface it should send on.
2980		 */
2981		dev->pktinfo.ipi6_ifindex = 0;
2982	}
2983
2984	io_state = startio_send(sock, dev, &cc, &send_errno);
2985	switch (io_state) {
2986	case DOIO_PENDING:	/* I/O started. Nothing more to do */
2987	case DOIO_SOFT:
2988		/*
2989		 * We couldn't send all or part of the request right now, so
2990		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2991		 */
2992		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2993			isc_task_attach(task, &ntask);
2994			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2995
2996			/*
2997			 * Enqueue the request.
2998			 */
2999			INSIST(!ISC_LINK_LINKED(dev, ev_link));
3000			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3001
3002			socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
3003				   "socket_send: event %p -> task %p",
3004				   dev, ntask);
3005
3006			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3007				result = ISC_R_INPROGRESS;
3008			break;
3009		}
3010
3011	case DOIO_SUCCESS:
3012		break;
3013	}
3014
3015	return (result);
3016}
3017
3018isc_result_t
3019isc__socket_send(isc_socket_t *sock, isc_region_t *region,
3020		 isc_task_t *task, isc_taskaction_t action, const void *arg)
3021{
3022	/*
3023	 * REQUIRE() checking is performed in isc_socket_sendto().
3024	 */
3025	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3026				  NULL));
3027}
3028
3029isc_result_t
3030isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
3031		   isc_task_t *task, isc_taskaction_t action, const void *arg,
3032		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3033{
3034	isc_socketevent_t *dev;
3035	isc_socketmgr_t *manager;
3036	isc_result_t ret;
3037
3038	REQUIRE(VALID_SOCKET(sock));
3039	REQUIRE(sock->type != isc_sockettype_fdwatch);
3040
3041	LOCK(&sock->lock);
3042	CONSISTENT(sock);
3043
3044	/*
3045	 * make sure that the socket's not closed
3046	 */
3047	if (sock->fd == INVALID_SOCKET) {
3048		UNLOCK(&sock->lock);
3049		return (ISC_R_CONNREFUSED);
3050	}
3051	REQUIRE(region != NULL);
3052	REQUIRE(task != NULL);
3053	REQUIRE(action != NULL);
3054
3055	manager = sock->manager;
3056	REQUIRE(VALID_MANAGER(manager));
3057
3058	INSIST(sock->bound);
3059
3060	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3061	if (dev == NULL) {
3062		UNLOCK(&sock->lock);
3063		return (ISC_R_NOMEMORY);
3064	}
3065	dev->region = *region;
3066
3067	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3068	UNLOCK(&sock->lock);
3069	return (ret);
3070}
3071
3072isc_result_t
3073isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3074		  isc_task_t *task, isc_taskaction_t action, const void *arg)
3075{
3076	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3077				   NULL));
3078}
3079
3080isc_result_t
3081isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3082		    isc_task_t *task, isc_taskaction_t action, const void *arg,
3083		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3084{
3085	isc_socketevent_t *dev;
3086	isc_socketmgr_t *manager;
3087	unsigned int iocount;
3088	isc_buffer_t *buffer;
3089	isc_result_t ret;
3090
3091	REQUIRE(VALID_SOCKET(sock));
3092
3093	LOCK(&sock->lock);
3094	CONSISTENT(sock);
3095
3096	/*
3097	 * make sure that the socket's not closed
3098	 */
3099	if (sock->fd == INVALID_SOCKET) {
3100		UNLOCK(&sock->lock);
3101		return (ISC_R_CONNREFUSED);
3102	}
3103	REQUIRE(buflist != NULL);
3104	REQUIRE(!ISC_LIST_EMPTY(*buflist));
3105	REQUIRE(task != NULL);
3106	REQUIRE(action != NULL);
3107
3108	manager = sock->manager;
3109	REQUIRE(VALID_MANAGER(manager));
3110
3111	iocount = isc_bufferlist_usedcount(buflist);
3112	REQUIRE(iocount > 0);
3113
3114	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3115	if (dev == NULL) {
3116		UNLOCK(&sock->lock);
3117		return (ISC_R_NOMEMORY);
3118	}
3119
3120	/*
3121	 * Move each buffer from the passed in list to our internal one.
3122	 */
3123	buffer = ISC_LIST_HEAD(*buflist);
3124	while (buffer != NULL) {
3125		ISC_LIST_DEQUEUE(*buflist, buffer, link);
3126		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3127		buffer = ISC_LIST_HEAD(*buflist);
3128	}
3129
3130	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3131	UNLOCK(&sock->lock);
3132	return (ret);
3133}
3134
3135isc_result_t
3136isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3137		    isc_task_t *task,
3138		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3139		    isc_socketevent_t *event, unsigned int flags)
3140{
3141	isc_result_t ret;
3142
3143	REQUIRE(VALID_SOCKET(sock));
3144	LOCK(&sock->lock);
3145	CONSISTENT(sock);
3146
3147	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3148	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3149		REQUIRE(sock->type == isc_sockettype_udp);
3150	event->ev_sender = sock;
3151	event->result = ISC_R_UNEXPECTED;
3152	/*
3153	 * make sure that the socket's not closed
3154	 */
3155	if (sock->fd == INVALID_SOCKET) {
3156		UNLOCK(&sock->lock);
3157		return (ISC_R_CONNREFUSED);
3158	}
3159	ISC_LIST_INIT(event->bufferlist);
3160	event->region = *region;
3161	event->n = 0;
3162	event->offset = 0;
3163	event->attributes = 0;
3164
3165	ret = socket_send(sock, event, task, address, pktinfo, flags);
3166	UNLOCK(&sock->lock);
3167	return (ret);
3168}
3169
3170isc_result_t
3171isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3172		 unsigned int options) {
3173	int bind_errno;
3174	char strbuf[ISC_STRERRORSIZE];
3175	int on = 1;
3176
3177	REQUIRE(VALID_SOCKET(sock));
3178	LOCK(&sock->lock);
3179	CONSISTENT(sock);
3180
3181	/*
3182	 * make sure that the socket's not closed
3183	 */
3184	if (sock->fd == INVALID_SOCKET) {
3185		UNLOCK(&sock->lock);
3186		return (ISC_R_CONNREFUSED);
3187	}
3188
3189	INSIST(!sock->bound);
3190	INSIST(!sock->dupped);
3191
3192	if (sock->pf != sockaddr->type.sa.sa_family) {
3193		UNLOCK(&sock->lock);
3194		return (ISC_R_FAMILYMISMATCH);
3195	}
3196	/*
3197	 * Only set SO_REUSEADDR when we want a specific port.
3198	 */
3199	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3200	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3201	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
3202		       sizeof(on)) < 0) {
3203		UNEXPECTED_ERROR(__FILE__, __LINE__,
3204				 "setsockopt(%d) %s", sock->fd,
3205				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3206						ISC_MSG_FAILED, "failed"));
3207		/* Press on... */
3208	}
3209	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3210		bind_errno = WSAGetLastError();
3211		UNLOCK(&sock->lock);
3212		switch (bind_errno) {
3213		case WSAEACCES:
3214			return (ISC_R_NOPERM);
3215		case WSAEADDRNOTAVAIL:
3216			return (ISC_R_ADDRNOTAVAIL);
3217		case WSAEADDRINUSE:
3218			return (ISC_R_ADDRINUSE);
3219		case WSAEINVAL:
3220			return (ISC_R_BOUND);
3221		default:
3222			isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3223			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3224					 strbuf);
3225			return (ISC_R_UNEXPECTED);
3226		}
3227	}
3228
3229	socket_log(__LINE__, sock, sockaddr, TRACE,
3230		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3231	sock->bound = 1;
3232
3233	UNLOCK(&sock->lock);
3234	return (ISC_R_SUCCESS);
3235}
3236
3237isc_result_t
3238isc__socket_filter(isc_socket_t *sock, const char *filter) {
3239	UNUSED(sock);
3240	UNUSED(filter);
3241
3242	REQUIRE(VALID_SOCKET(sock));
3243	return (ISC_R_NOTIMPLEMENTED);
3244}
3245
3246/*
3247 * Set up to listen on a given socket.  We do this by creating an internal
3248 * event that will be dispatched when the socket has read activity.  The
3249 * watcher will send the internal event to the task when there is a new
3250 * connection.
3251 *
3252 * Unlike in read, we don't preallocate a done event here.  Every time there
3253 * is a new connection we'll have to allocate a new one anyway, so we might
3254 * as well keep things simple rather than having to track them.
3255 */
3256isc_result_t
3257isc__socket_listen(isc_socket_t *sock, unsigned int backlog) {
3258	char strbuf[ISC_STRERRORSIZE];
3259
3260	REQUIRE(VALID_SOCKET(sock));
3261
3262	LOCK(&sock->lock);
3263	CONSISTENT(sock);
3264
3265	/*
3266	 * make sure that the socket's not closed
3267	 */
3268	if (sock->fd == INVALID_SOCKET) {
3269		UNLOCK(&sock->lock);
3270		return (ISC_R_CONNREFUSED);
3271	}
3272
3273	REQUIRE(!sock->listener);
3274	REQUIRE(sock->bound);
3275	REQUIRE(sock->type == isc_sockettype_tcp);
3276
3277	if (backlog == 0)
3278		backlog = SOMAXCONN;
3279
3280	if (listen(sock->fd, (int)backlog) < 0) {
3281		UNLOCK(&sock->lock);
3282		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3283
3284		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3285
3286		return (ISC_R_UNEXPECTED);
3287	}
3288
3289	socket_log(__LINE__, sock, NULL, TRACE,
3290		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3291	sock->listener = 1;
3292	_set_state(sock, SOCK_LISTEN);
3293
3294	UNLOCK(&sock->lock);
3295	return (ISC_R_SUCCESS);
3296}
3297
3298/*
3299 * This should try to do aggressive accept() XXXMLG
3300 */
3301isc_result_t
3302isc__socket_accept(isc_socket_t *sock,
3303		   isc_task_t *task, isc_taskaction_t action, const void *arg)
3304{
3305	isc_socket_newconnev_t *adev;
3306	isc_socketmgr_t *manager;
3307	isc_task_t *ntask = NULL;
3308	isc_socket_t *nsock;
3309	isc_result_t result;
3310	IoCompletionInfo *lpo;
3311
3312	REQUIRE(VALID_SOCKET(sock));
3313
3314	manager = sock->manager;
3315	REQUIRE(VALID_MANAGER(manager));
3316
3317	LOCK(&sock->lock);
3318	CONSISTENT(sock);
3319
3320	/*
3321	 * make sure that the socket's not closed
3322	 */
3323	if (sock->fd == INVALID_SOCKET) {
3324		UNLOCK(&sock->lock);
3325		return (ISC_R_CONNREFUSED);
3326	}
3327
3328	REQUIRE(sock->listener);
3329
3330	/*
3331	 * Sender field is overloaded here with the task we will be sending
3332	 * this event to.  Just before the actual event is delivered the
3333	 * actual ev_sender will be touched up to be the socket.
3334	 */
3335	adev = (isc_socket_newconnev_t *)
3336		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3337				   action, arg, sizeof(*adev));
3338	if (adev == NULL) {
3339		UNLOCK(&sock->lock);
3340		return (ISC_R_NOMEMORY);
3341	}
3342	ISC_LINK_INIT(adev, ev_link);
3343
3344	result = allocate_socket(manager, sock->type, &nsock);
3345	if (result != ISC_R_SUCCESS) {
3346		isc_event_free((isc_event_t **)&adev);
3347		UNLOCK(&sock->lock);
3348		return (result);
3349	}
3350
3351	/*
3352	 * AcceptEx() requires we pass in a socket.
3353	 */
3354	nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3355	if (nsock->fd == INVALID_SOCKET) {
3356		free_socket(&nsock, __LINE__);
3357		isc_event_free((isc_event_t **)&adev);
3358		UNLOCK(&sock->lock);
3359		return (ISC_R_FAILURE); // XXXMLG need real error message
3360	}
3361
3362	/*
3363	 * Attach to socket and to task.
3364	 */
3365	isc_task_attach(task, &ntask);
3366	if (isc_task_exiting(ntask)) {
3367		free_socket(&nsock, __LINE__);
3368		isc_task_detach(&ntask);
3369		isc_event_free(ISC_EVENT_PTR(&adev));
3370		UNLOCK(&sock->lock);
3371		return (ISC_R_SHUTTINGDOWN);
3372	}
3373	nsock->references++;
3374
3375	adev->ev_sender = ntask;
3376	adev->newsocket = nsock;
3377	_set_state(nsock, SOCK_ACCEPT);
3378
3379	/*
3380	 * Queue io completion for an accept().
3381	 */
3382	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3383					    HEAP_ZERO_MEMORY,
3384					    sizeof(IoCompletionInfo));
3385	RUNTIME_CHECK(lpo != NULL);
3386	lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3387		(sizeof(SOCKADDR_STORAGE) + 16) * 2);
3388	RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3389
3390	lpo->adev = adev;
3391	lpo->request_type = SOCKET_ACCEPT;
3392
3393	ISCAcceptEx(sock->fd,
3394		    nsock->fd,				/* Accepted Socket */
3395		    lpo->acceptbuffer,			/* Buffer for initial Recv */
3396		    0,					/* Length of Buffer */
3397		    sizeof(SOCKADDR_STORAGE) + 16,		/* Local address length + 16 */
3398		    sizeof(SOCKADDR_STORAGE) + 16,		/* Remote address lengh + 16 */
3399		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
3400		    (LPOVERLAPPED)lpo			/* Overlapped structure */
3401		    );
3402	iocompletionport_update(nsock);
3403
3404	socket_log(__LINE__, sock, NULL, TRACE,
3405		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3406		   "accepting for nsock %p fd %d", nsock, nsock->fd);
3407
3408	/*
3409	 * Enqueue the event
3410	 */
3411	ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3412	sock->pending_accept++;
3413	sock->pending_iocp++;
3414
3415	UNLOCK(&sock->lock);
3416	return (ISC_R_SUCCESS);
3417}
3418
3419isc_result_t
3420isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3421		    isc_task_t *task, isc_taskaction_t action, const void *arg)
3422{
3423	char strbuf[ISC_STRERRORSIZE];
3424	isc_socket_connev_t *cdev;
3425	isc_task_t *ntask = NULL;
3426	isc_socketmgr_t *manager;
3427	IoCompletionInfo *lpo;
3428	int bind_errno;
3429
3430	REQUIRE(VALID_SOCKET(sock));
3431	REQUIRE(addr != NULL);
3432	REQUIRE(task != NULL);
3433	REQUIRE(action != NULL);
3434
3435	manager = sock->manager;
3436	REQUIRE(VALID_MANAGER(manager));
3437	REQUIRE(addr != NULL);
3438
3439	if (isc_sockaddr_ismulticast(addr))
3440		return (ISC_R_MULTICAST);
3441
3442	LOCK(&sock->lock);
3443	CONSISTENT(sock);
3444
3445	/*
3446	 * make sure that the socket's not closed
3447	 */
3448	if (sock->fd == INVALID_SOCKET) {
3449		UNLOCK(&sock->lock);
3450		return (ISC_R_CONNREFUSED);
3451	}
3452
3453	/*
3454	 * Windows sockets won't connect unless the socket is bound.
3455	 */
3456	if (!sock->bound) {
3457		isc_sockaddr_t any;
3458
3459		isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3460		if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3461			bind_errno = WSAGetLastError();
3462			UNLOCK(&sock->lock);
3463			switch (bind_errno) {
3464			case WSAEACCES:
3465				return (ISC_R_NOPERM);
3466			case WSAEADDRNOTAVAIL:
3467				return (ISC_R_ADDRNOTAVAIL);
3468			case WSAEADDRINUSE:
3469				return (ISC_R_ADDRINUSE);
3470			case WSAEINVAL:
3471				return (ISC_R_BOUND);
3472			default:
3473				isc__strerror(bind_errno, strbuf,
3474					      sizeof(strbuf));
3475				UNEXPECTED_ERROR(__FILE__, __LINE__,
3476						 "bind: %s", strbuf);
3477				return (ISC_R_UNEXPECTED);
3478			}
3479		}
3480		sock->bound = 1;
3481	}
3482
3483	REQUIRE(!sock->pending_connect);
3484
3485	cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3486							ISC_SOCKEVENT_CONNECT,
3487							action,	arg,
3488							sizeof(*cdev));
3489	if (cdev == NULL) {
3490		UNLOCK(&sock->lock);
3491		return (ISC_R_NOMEMORY);
3492	}
3493	ISC_LINK_INIT(cdev, ev_link);
3494
3495	if (sock->type == isc_sockettype_tcp) {
3496		/*
3497		 * Queue io completion for an accept().
3498		 */
3499		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3500						    HEAP_ZERO_MEMORY,
3501						    sizeof(IoCompletionInfo));
3502		lpo->cdev = cdev;
3503		lpo->request_type = SOCKET_CONNECT;
3504
3505		sock->address = *addr;
3506		ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3507			NULL, 0, NULL, (LPOVERLAPPED)lpo);
3508
3509		/*
3510		 * Attach to task.
3511		 */
3512		isc_task_attach(task, &ntask);
3513		cdev->ev_sender = ntask;
3514
3515		sock->pending_connect = 1;
3516		_set_state(sock, SOCK_CONNECT);
3517
3518		/*
3519		 * Enqueue the request.
3520		 */
3521		sock->connect_ev = cdev;
3522		sock->pending_iocp++;
3523	} else {
3524		WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3525		cdev->result = ISC_R_SUCCESS;
3526		isc_task_send(task, (isc_event_t **)&cdev);
3527	}
3528	CONSISTENT(sock);
3529	UNLOCK(&sock->lock);
3530
3531	return (ISC_R_SUCCESS);
3532}
3533
3534isc_result_t
3535isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3536	isc_result_t result;
3537
3538	REQUIRE(VALID_SOCKET(sock));
3539	REQUIRE(addressp != NULL);
3540
3541	LOCK(&sock->lock);
3542	CONSISTENT(sock);
3543
3544	/*
3545	 * make sure that the socket's not closed
3546	 */
3547	if (sock->fd == INVALID_SOCKET) {
3548		UNLOCK(&sock->lock);
3549		return (ISC_R_CONNREFUSED);
3550	}
3551
3552	if (sock->connected) {
3553		*addressp = sock->address;
3554		result = ISC_R_SUCCESS;
3555	} else {
3556		result = ISC_R_NOTCONNECTED;
3557	}
3558
3559	UNLOCK(&sock->lock);
3560
3561	return (result);
3562}
3563
3564isc_result_t
3565isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3566	ISC_SOCKADDR_LEN_T len;
3567	isc_result_t result;
3568	char strbuf[ISC_STRERRORSIZE];
3569
3570	REQUIRE(VALID_SOCKET(sock));
3571	REQUIRE(addressp != NULL);
3572
3573	LOCK(&sock->lock);
3574	CONSISTENT(sock);
3575
3576	/*
3577	 * make sure that the socket's not closed
3578	 */
3579	if (sock->fd == INVALID_SOCKET) {
3580		UNLOCK(&sock->lock);
3581		return (ISC_R_CONNREFUSED);
3582	}
3583
3584	if (!sock->bound) {
3585		result = ISC_R_NOTBOUND;
3586		goto out;
3587	}
3588
3589	result = ISC_R_SUCCESS;
3590
3591	len = sizeof(addressp->type);
3592	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3593		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3594		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3595				 strbuf);
3596		result = ISC_R_UNEXPECTED;
3597		goto out;
3598	}
3599	addressp->length = (unsigned int)len;
3600
3601 out:
3602	UNLOCK(&sock->lock);
3603
3604	return (result);
3605}
3606
/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 *
 * 'how' is tested against ISC_SOCKCANCEL_RECV, ISC_SOCKCANCEL_SEND,
 * ISC_SOCKCANCEL_ACCEPT and ISC_SOCKCANCEL_CONNECT.  A NULL 'task'
 * matches every queued event; otherwise only events whose ev_sender
 * equals 'task' are canceled.  Each canceled event has its result set
 * to ISC_R_CANCELED and is handed to the matching send_*done_event()
 * helper.
 *
 * Nothing is done if 'how' is 0 or the socket has already been closed.
 */
void
isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0)
		return;

	LOCK(&sock->lock);
	CONSISTENT(sock);

	/*
	 * make sure that the socket's not closed
	 */
	if (sock->fd == INVALID_SOCKET) {
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */

	if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			/*
			 * Fetch the successor before 'dev' is handed off;
			 * presumably send_recvdone_event() unlinks the
			 * event from recv_list -- confirm.
			 */
			next = ISC_LIST_NEXT(dev, ev_link);
			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}
	/* Clear the bit just handled; 'how' is not read again for it. */
	how &= ~ISC_SOCKCANCEL_RECV;

	if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			/* Same walk as for the receive list above. */
			next = ISC_LIST_NEXT(dev, ev_link);
			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}
	how &= ~ISC_SOCKCANCEL_SEND;

	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
	    && !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			/*
			 * For a queued accept, ev_sender holds the target
			 * task (isc__socket_accept() stores it there).
			 */
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {

				/*
				 * Tear down the socket that was created for
				 * this pending accept: drop the reference
				 * taken when the accept was queued, close
				 * the descriptor, mark it invalid, then
				 * release the socket object.
				 */
				dev->newsocket->references--;
				closesocket(dev->newsocket->fd);
				dev->newsocket->fd = INVALID_SOCKET;
				free_socket(&dev->newsocket, __LINE__);

				dev->result = ISC_R_CANCELED;
				send_acceptdone_event(sock, &dev);
			}

			dev = next;
		}
	}
	how &= ~ISC_SOCKCANCEL_ACCEPT;

	/*
	 * Connecting is not a list.
	 */
	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
	    && sock->connect_ev != NULL) {
		isc_socket_connev_t    *dev;
		isc_task_t	       *current_task;

		INSIST(sock->pending_connect);

		dev = sock->connect_ev;
		current_task = dev->ev_sender;

		if ((task == NULL) || (task == current_task)) {
			/*
			 * Canceling a connect closes the socket itself:
			 * the descriptor is closed and the socket is moved
			 * to the SOCK_CLOSED state before the event is
			 * posted.
			 */
			closesocket(sock->fd);
			sock->fd = INVALID_SOCKET;
			_set_state(sock, SOCK_CLOSED);

			sock->connect_ev = NULL;
			dev->result = ISC_R_CANCELED;
			send_connectdone_event(sock, &dev);
		}
	}
	how &= ~ISC_SOCKCANCEL_CONNECT;

	/*
	 * NOTE(review): there is no explicit UNLOCK on this path;
	 * presumably maybe_free_socket() releases sock->lock (and frees
	 * the socket when nothing references it any more) -- confirm
	 * against its definition.
	 */
	maybe_free_socket(&sock, __LINE__);
}
3736
3737isc_sockettype_t
3738isc__socket_gettype(isc_socket_t *sock) {
3739	isc_sockettype_t type;
3740
3741	REQUIRE(VALID_SOCKET(sock));
3742
3743	LOCK(&sock->lock);
3744
3745	/*
3746	 * make sure that the socket's not closed
3747	 */
3748	if (sock->fd == INVALID_SOCKET) {
3749		UNLOCK(&sock->lock);
3750		return (ISC_R_CONNREFUSED);
3751	}
3752
3753	type = sock->type;
3754	UNLOCK(&sock->lock);
3755	return (type);
3756}
3757
3758isc_boolean_t
3759isc__socket_isbound(isc_socket_t *sock) {
3760	isc_boolean_t val;
3761
3762	REQUIRE(VALID_SOCKET(sock));
3763
3764	LOCK(&sock->lock);
3765	CONSISTENT(sock);
3766
3767	/*
3768	 * make sure that the socket's not closed
3769	 */
3770	if (sock->fd == INVALID_SOCKET) {
3771		UNLOCK(&sock->lock);
3772		return (ISC_FALSE);
3773	}
3774
3775	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3776	UNLOCK(&sock->lock);
3777
3778	return (val);
3779}
3780
3781void
3782isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3783#if defined(IPV6_V6ONLY)
3784	int onoff = yes ? 1 : 0;
3785#else
3786	UNUSED(yes);
3787#endif
3788
3789	REQUIRE(VALID_SOCKET(sock));
3790
3791#ifdef IPV6_V6ONLY
3792	if (sock->pf == AF_INET6) {
3793		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3794				 (char *)&onoff, sizeof(onoff));
3795	}
3796#endif
3797}
3798
3799void
3800isc__socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3801	UNUSED(addr);
3802	UNUSED(active);
3803}
3804
3805isc_result_t
3806isc__socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3807		     isc_uint32_t owner,	isc_uint32_t group)
3808{
3809	UNUSED(addr);
3810	UNUSED(perm);
3811	UNUSED(owner);
3812	UNUSED(group);
3813	return (ISC_R_NOTIMPLEMENTED);
3814}
3815
3816void
3817isc__socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3818
3819	/*
3820	 * Name 'socket'.
3821	 */
3822
3823	REQUIRE(VALID_SOCKET(socket));
3824
3825	LOCK(&socket->lock);
3826	memset(socket->name, 0, sizeof(socket->name));
3827	strncpy(socket->name, name, sizeof(socket->name) - 1);
3828	socket->tag = tag;
3829	UNLOCK(&socket->lock);
3830}
3831
3832const char *
3833isc__socket_getname(isc_socket_t *socket) {
3834	return (socket->name);
3835}
3836
3837void *
3838isc__socket_gettag(isc_socket_t *socket) {
3839	return (socket->tag);
3840}
3841
3842int
3843isc__socket_getfd(isc_socket_t *socket) {
3844	return ((short) socket->fd);
3845}
3846
3847void
3848isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3849	UNUSED(manager);
3850	UNUSED(reserved);
3851}
3852
3853void
3854isc___socketmgr_maxudp(isc_socketmgr_t *manager, int maxudp) {
3855
3856	UNUSED(manager);
3857	UNUSED(maxudp);
3858}
3859
3860#ifdef HAVE_LIBXML2
3861
3862static const char *
3863_socktype(isc_sockettype_t type)
3864{
3865	if (type == isc_sockettype_udp)
3866		return ("udp");
3867	else if (type == isc_sockettype_tcp)
3868		return ("tcp");
3869	else if (type == isc_sockettype_unix)
3870		return ("unix");
3871	else if (type == isc_sockettype_fdwatch)
3872		return ("fdwatch");
3873	else
3874		return ("not-initialized");
3875}
3876
3877void
3878isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
3879{
3880	isc_socket_t *sock;
3881	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3882	isc_sockaddr_t addr;
3883	ISC_SOCKADDR_LEN_T len;
3884
3885	LOCK(&mgr->lock);
3886
3887#ifndef ISC_PLATFORM_USETHREADS
3888	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3889	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
3890	xmlTextWriterEndElement(writer);
3891#endif
3892
3893	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
3894	sock = ISC_LIST_HEAD(mgr->socklist);
3895	while (sock != NULL) {
3896		LOCK(&sock->lock);
3897		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
3898
3899		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
3900		xmlTextWriterWriteFormatString(writer, "%p", sock);
3901		xmlTextWriterEndElement(writer);
3902
3903		if (sock->name[0] != 0) {
3904			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
3905			xmlTextWriterWriteFormatString(writer, "%s",
3906						       sock->name);
3907			xmlTextWriterEndElement(writer); /* name */
3908		}
3909
3910		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3911		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
3912		xmlTextWriterEndElement(writer);
3913
3914		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
3915					  ISC_XMLCHAR _socktype(sock->type));
3916
3917		if (sock->connected) {
3918			isc_sockaddr_format(&sock->address, peerbuf,
3919					    sizeof(peerbuf));
3920			xmlTextWriterWriteElement(writer,
3921						  ISC_XMLCHAR "peer-address",
3922						  ISC_XMLCHAR peerbuf);
3923		}
3924
3925		len = sizeof(addr);
3926		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
3927			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
3928			xmlTextWriterWriteElement(writer,
3929						  ISC_XMLCHAR "local-address",
3930						  ISC_XMLCHAR peerbuf);
3931		}
3932
3933		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
3934		if (sock->pending_recv)
3935			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3936						ISC_XMLCHAR "pending-receive");
3937		if (sock->pending_send)
3938			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3939						  ISC_XMLCHAR "pending-send");
3940		if (sock->pending_accept)
3941			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3942						 ISC_XMLCHAR "pending_accept");
3943		if (sock->listener)
3944			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3945						  ISC_XMLCHAR "listener");
3946		if (sock->connected)
3947			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3948						  ISC_XMLCHAR "connected");
3949		if (sock->pending_connect)
3950			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3951						  ISC_XMLCHAR "connecting");
3952		if (sock->bound)
3953			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3954						  ISC_XMLCHAR "bound");
3955
3956		xmlTextWriterEndElement(writer); /* states */
3957
3958		xmlTextWriterEndElement(writer); /* socket */
3959
3960		UNLOCK(&sock->lock);
3961		sock = ISC_LIST_NEXT(sock, link);
3962	}
3963	xmlTextWriterEndElement(writer); /* sockets */
3964
3965	UNLOCK(&mgr->lock);
3966}
3967#endif /* HAVE_LIBXML2 */
3968