1/*	$NetBSD$	*/
2
3/*
4 * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
5 * Copyright (C) 2000-2003  Internet Software Consortium.
6 *
7 * Permission to use, copy, modify, and/or distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
12 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
13 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
14 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
16 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
17 * PERFORMANCE OF THIS SOFTWARE.
18 */
19
20/* Id: socket.c,v 1.70.54.4 2009/01/29 22:40:36 jinmei Exp */
21
22/* This code uses functions which are only available on Server 2003 and
23 * higher, and Windows XP and higher.
24 *
25 * This code is by nature multithreaded and takes advantage of various
26 * features to pass on information through the completion port for
27 * when I/O is completed.  All sends, receives, accepts, and connects are
28 * completed through the completion port.
29 *
30 * The number of Completion Port Worker threads used is the total number
31 * of CPU's + 1. This increases the likelihood that a Worker Thread is
32 * available for processing a completed request.
33 *
34 * XXXPDM 5 August, 2002
35 */
36
37#define MAKE_EXTERNAL 1
38#include <config.h>
39
40#include <sys/types.h>
41
42#ifndef _WINSOCKAPI_
43#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
44#endif
45
46#include <errno.h>
47#include <stddef.h>
48#include <stdlib.h>
49#include <string.h>
50#include <unistd.h>
51#include <io.h>
52#include <fcntl.h>
53#include <process.h>
54
55#include <isc/buffer.h>
56#include <isc/bufferlist.h>
57#include <isc/condition.h>
58#include <isc/list.h>
59#include <isc/log.h>
60#include <isc/mem.h>
61#include <isc/msgs.h>
62#include <isc/mutex.h>
63#include <isc/net.h>
64#include <isc/once.h>
65#include <isc/os.h>
66#include <isc/platform.h>
67#include <isc/print.h>
68#include <isc/region.h>
69#include <isc/socket.h>
70#include <isc/stats.h>
71#include <isc/strerror.h>
72#include <isc/syslog.h>
73#include <isc/task.h>
74#include <isc/thread.h>
75#include <isc/util.h>
76#include <isc/win32os.h>
77
78#include <mswsock.h>
79
80#include "errno2result.h"
81
/*
 * How in the world can Microsoft exist with APIs like this?
 * We can't actually call this directly, because it turns out
 * no library exports this function.  Instead, we need to
 * issue a runtime call to get the address.
 *
 * The three pointers below are filled in by initialise() using
 * WSAIoctl(SIO_GET_EXTENSION_FUNCTION_POINTER) and must not be
 * called before InitSockets() has run.
 */
LPFN_CONNECTEX ISCConnectEx;
LPFN_ACCEPTEX ISCAcceptEx;
LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
91
92/*
93 * Run expensive internal consistency checks.
94 */
95#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
96#define CONSISTENT(sock) consistent(sock)
97#else
98#define CONSISTENT(sock) do {} while (0)
99#endif
100static void consistent(isc_socket_t *sock);
101
102/*
103 * Define this macro to control the behavior of connection
104 * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
105 * for details.
106 * NOTE: This requires that Windows 2000 systems install Service Pack 2
107 * or later.
108 */
109#ifndef SIO_UDP_CONNRESET
110#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
111#endif
112
113/*
114 * Some systems define the socket length argument as an int, some as size_t,
115 * some as socklen_t.  This is here so it can be easily changed if needed.
116 */
117#ifndef ISC_SOCKADDR_LEN_T
118#define ISC_SOCKADDR_LEN_T unsigned int
119#endif
120
/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 * Both Winsock (WSAE*) and C runtime (E*) codes are accepted, and an
 * error value of 0 is also classified as soft.
 */
#define SOFT_ERROR(e)	((e) == WSAEINTR || \
			 (e) == WSAEWOULDBLOCK || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == EAGAIN || \
			 (e) == 0)

/*
 * Pending errors are not really errors and should be
 * kept separate.  WSA_IO_PENDING and 0 both count as "pending".
 */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)

/*
 * Status codes describing the outcome of an I/O attempt.
 */
#define DOIO_SUCCESS	  0       /* i/o ok, event sent */
#define DOIO_SOFT	  1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD	  2       /* i/o error, event sent */
#define DOIO_EOF	  3       /* EOF, no event sent */
#define DOIO_PENDING	  4       /* status when i/o is in process */
#define DOIO_NEEDMORE	  5       /* IO was processed, but we need more due to minimum */
144
145#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
146
147/*
148 * DLVL(90)  --  Function entry/exit and other tracing.
149 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
150 * DLVL(60)  --  Socket data send/receive
151 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
152 * DLVL(20)  --  Socket creation/destruction.
153 */
154#define TRACE_LEVEL		90
155#define CORRECTNESS_LEVEL	70
156#define IOEVENT_LEVEL		60
157#define EVENT_LEVEL		50
158#define CREATION_LEVEL		20
159
160#define TRACE		DLVL(TRACE_LEVEL)
161#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
162#define IOEVENT		DLVL(IOEVENT_LEVEL)
163#define EVENT		DLVL(EVENT_LEVEL)
164#define CREATION	DLVL(CREATION_LEVEL)
165
166typedef isc_event_t intev_t;
167
/*
 * Socket State
 *
 * Values stored in isc_socket.state through _set_state().  The structure
 * describes that field as being for debugging and consistency checking.
 */
enum {
  SOCK_INITIALIZED,	/* Socket Initialized */
  SOCK_OPEN,		/* Socket opened but nothing yet to do */
  SOCK_DATA,		/* Socket sending or receiving data */
  SOCK_LISTEN,		/* TCP Socket listening for connects */
  SOCK_ACCEPT,		/* TCP socket is waiting to accept */
  SOCK_CONNECT,		/* TCP Socket connecting */
  SOCK_CLOSED,		/* Socket has been closed */
};
180
181#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
182#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
183
/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * We really don't want to try and use these control messages. Win32
 * doesn't have this mechanism before XP.
 *
 * The unconditional #undef below overrides the definition above, so
 * USE_CMSG is never defined for the rest of this file.
 */
#undef USE_CMSG
200
/*
 * Message header for recvmsg and sendmsg calls.
 * Used value-result for recvmsg, value only for sendmsg.
 *
 * This is a local definition built around Winsock types (WSABUF,
 * SOCKADDR_STORAGE); the address fields are named to_addr/to_addr_len,
 * not msg_name/msg_namelen.
 *
 * NOTE(review): the trailing "msghdr" declarator also defines a file-scope
 * object of this type named msghdr.  That looks unintended -- confirm
 * nothing relies on it before removing it.
 */
struct msghdr {
	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
	int      to_addr_len;		/* length of the address */
	WSABUF  *msg_iov;		/* scatter/gather array */
	u_int   msg_iovlen;             /* # elements in msg_iov */
	void	*msg_control;           /* ancillary data, see below */
	u_int   msg_controllen;         /* ancillary data buffer len */
	int	msg_totallen;		/* total length of this message */
} msghdr;
214
215/*
216 * The size to raise the receive buffer to.
217 */
218#define RCVBUFSIZE (32*1024)
219
220/*
221 * The number of times a send operation is repeated if the result
222 * is WSAEINTR.
223 */
224#define NRETRIES 10
225
/*
 * Per-socket state.
 *
 * The section comments inside the structure state which members are
 * protected by the socket's own lock and which are not locked.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;		/* SOCKET_MAGIC while valid */
	isc_socketmgr_t	       *manager;	/* owning socket manager */
	isc_mutex_t		lock;
	isc_sockettype_t	type;

	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references; /* EXTERNAL references */
	SOCKET			fd;	/* file handle */
	int			pf;	/* protocol family */
	char			name[16];
	void *			tag;

	/*
	 * Each recv() call uses this buffer.  It is a per-socket receive
	 * buffer that allows us to decouple the system recv() from the
	 * recv_list done events.  This means the items on the recv_list
	 * can be removed without having to cancel pending system recv()
	 * calls.  It also allows us to read-ahead in some cases.
	 */
	struct {
		SOCKADDR_STORAGE	from_addr;	   // UDP send/recv address
		int		from_addr_len;	   // length of the address
		char		*base;		   // the base of the buffer
		char		*consume_position; // where to start copying data from next
		unsigned int	len;		   // the actual size of this buffer
		unsigned int	remaining;	   // the number of bytes remaining
	} recvbuf;

	/* Queues of requests waiting for completion. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	isc_sockaddr_t		address;  /* remote address */

	unsigned int		listener : 1,	/* listener socket */
				connected : 1,
				pending_connect : 1, /* connect pending */
				bound : 1;	/* bound to local addr */
	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
	unsigned int		pending_send;  /* Number of outstanding send() calls. */
	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
	unsigned int		state; /* Socket state. Debugging and consistency checking. */
	int			state_lineno;  /* line which last touched state */
};
278
/*
 * Record a new socket state together with the source line that set it,
 * so the last state transition can be located when debugging.
 */
#define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (0)
280
/*
 * Buffer structure
 *
 * One node of a list of private data copies.  build_msghdr_send()
 * allocates these from hHeapHandle to hold the bytes handed to an
 * overlapped send.
 */
typedef struct buflist buflist_t;

struct buflist {
	void			*buf;		/* copied data */
	unsigned int		buflen;		/* number of bytes in buf */
	ISC_LINK(buflist_t)	link;
};
291
/*
 * I/O Completion ports Info structures
 *
 * One IoCompletionInfo is allocated from hHeapHandle per overlapped
 * request.  The OVERLAPPED member is first so that a pointer to the
 * structure can be passed to Winsock as the LPWSAOVERLAPPED argument.
 */

/* Private heap created in iocompletionport_init(). */
static HANDLE hHeapHandle = NULL;
typedef struct IoCompletionInfo {
	OVERLAPPED		overlapped;
	isc_socketevent_t	*dev;  /* send()/recv() done event */
	isc_socket_connev_t	*cdev; /* connect() done event */
	isc_socket_newconnev_t	*adev; /* accept() done event */
	void			*acceptbuffer;
	DWORD			received_bytes;
	int			request_type;	/* SOCKET_RECV, SOCKET_SEND, ... */
	struct msghdr		messagehdr;
	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
} IoCompletionInfo;
308
309/*
310 * Define a maximum number of I/O Completion Port worker threads
311 * to handle the load on the Completion Port. The actual number
312 * used is the number of CPU's + 1.
313 */
314#define MAX_IOCPTHREADS 20
315
316#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
317#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
318
/*
 * Socket manager: owns the I/O completion port, its worker threads and
 * the list of sockets created through it.
 */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;	/* SOCKET_MANAGER_MAGIC while valid */
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
	isc_stats_t		       *stats;

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)		socklist;
	isc_boolean_t			bShutdown;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;	/* number of worker threads in use */
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];

	/*
	 * Debugging.
	 * Modified by InterlockedIncrement() and InterlockedDecrement()
	 */
	LONG				totalSockets;
	LONG				iocp_total;
};
342
/*
 * Kind of overlapped request, stored in IoCompletionInfo.request_type.
 */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};
349
350/*
351 * send() and recv() iovec counts
352 */
353#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
354#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
355
356static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
357static void maybe_free_socket(isc_socket_t **, int);
358static void free_socket(isc_socket_t **, int);
359static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
360static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
361static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
362static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
363static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
364static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
365static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
366static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
367static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
368static void queue_receive_request(isc_socket_t *sock);
369
370/*
371 * This is used to dump the contents of the sock structure
372 * You should make sure that the sock is locked before
373 * dumping it. Since the code uses simple printf() statements
374 * it should only be used interactively.
375 */
376void
377sock_dump(isc_socket_t *sock) {
378	isc_socketevent_t *ldev;
379	isc_socket_newconnev_t *ndev;
380
381#if 0
382	isc_sockaddr_t addr;
383	char socktext[256];
384
385	isc_socket_getpeername(sock, &addr);
386	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
387	printf("Remote Socket: %s\n", socktext);
388	isc_socket_getsockname(sock, &addr);
389	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
390	printf("This Socket: %s\n", socktext);
391#endif
392
393	printf("\n\t\tSock Dump\n");
394	printf("\t\tfd: %u\n", sock->fd);
395	printf("\t\treferences: %d\n", sock->references);
396	printf("\t\tpending_accept: %d\n", sock->pending_accept);
397	printf("\t\tconnecting: %d\n", sock->pending_connect);
398	printf("\t\tconnected: %d\n", sock->connected);
399	printf("\t\tbound: %d\n", sock->bound);
400	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
401	printf("\t\tsocket type: %d\n", sock->type);
402
403	printf("\n\t\tSock Recv List\n");
404	ldev = ISC_LIST_HEAD(sock->recv_list);
405	while (ldev != NULL) {
406		printf("\t\tdev: %p\n", ldev);
407		ldev = ISC_LIST_NEXT(ldev, ev_link);
408	}
409
410	printf("\n\t\tSock Send List\n");
411	ldev = ISC_LIST_HEAD(sock->send_list);
412	while (ldev != NULL) {
413		printf("\t\tdev: %p\n", ldev);
414		ldev = ISC_LIST_NEXT(ldev, ev_link);
415	}
416
417	printf("\n\t\tSock Accept List\n");
418	ndev = ISC_LIST_HEAD(sock->accept_list);
419	while (ndev != NULL) {
420		printf("\t\tdev: %p\n", ldev);
421		ndev = ISC_LIST_NEXT(ndev, ev_link);
422	}
423}
424
425static void
426socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
427	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
428	   isc_msgcat_t *msgcat, int msgset, int message,
429	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
430
431/*  This function will add an entry to the I/O completion port
432 *  that will signal the I/O thread to exit (gracefully)
433 */
434static void
435signal_iocompletionport_exit(isc_socketmgr_t *manager) {
436	int i;
437	int errval;
438	char strbuf[ISC_STRERRORSIZE];
439
440	REQUIRE(VALID_MANAGER(manager));
441	for (i = 0; i < manager->maxIOCPThreads; i++) {
442		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
443						0, 0, 0)) {
444			errval = GetLastError();
445			isc__strerror(errval, strbuf, sizeof(strbuf));
446			FATAL_ERROR(__FILE__, __LINE__,
447				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
448				ISC_MSG_FAILED,
449				"Can't request service thread to exit: %s"),
450				strbuf);
451		}
452	}
453}
454
455/*
456 * Create the worker threads for the I/O Completion Port
457 */
458void
459iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
460	int errval;
461	char strbuf[ISC_STRERRORSIZE];
462	int i;
463
464	INSIST(total_threads > 0);
465	REQUIRE(VALID_MANAGER(manager));
466	/*
467	 * We need at least one
468	 */
469	for (i = 0; i < total_threads; i++) {
470		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
471						manager, 0,
472						&manager->dwIOCPThreadIds[i]);
473		if (manager->hIOCPThreads[i] == NULL) {
474			errval = GetLastError();
475			isc__strerror(errval, strbuf, sizeof(strbuf));
476			FATAL_ERROR(__FILE__, __LINE__,
477				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
478				ISC_MSG_FAILED,
479				"Can't create IOCP thread: %s"),
480				strbuf);
481			exit(1);
482		}
483	}
484}
485
486/*
487 *  Create/initialise the I/O completion port
488 */
489void
490iocompletionport_init(isc_socketmgr_t *manager) {
491	int errval;
492	char strbuf[ISC_STRERRORSIZE];
493
494	REQUIRE(VALID_MANAGER(manager));
495	/*
496	 * Create a private heap to handle the socket overlapped structure
497	 * The minimum number of structures is 10, there is no maximum
498	 */
499	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
500	if (hHeapHandle == NULL) {
501		errval = GetLastError();
502		isc__strerror(errval, strbuf, sizeof(strbuf));
503		FATAL_ERROR(__FILE__, __LINE__,
504			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
505					   ISC_MSG_FAILED,
506					   "HeapCreate() failed during "
507					   "initialization: %s"),
508			    strbuf);
509		exit(1);
510	}
511
512	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
513
514	/* Now Create the Completion Port */
515	manager->hIoCompletionPort = CreateIoCompletionPort(
516			INVALID_HANDLE_VALUE, NULL,
517			0, manager->maxIOCPThreads);
518	if (manager->hIoCompletionPort == NULL) {
519		errval = GetLastError();
520		isc__strerror(errval, strbuf, sizeof(strbuf));
521		FATAL_ERROR(__FILE__, __LINE__,
522				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
523				ISC_MSG_FAILED,
524				"CreateIoCompletionPort() failed "
525				"during initialization: %s"),
526				strbuf);
527		exit(1);
528	}
529
530	/*
531	 * Worker threads for servicing the I/O
532	 */
533	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
534}
535
536/*
537 * Associate a socket with an IO Completion Port.  This allows us to queue events for it
538 * and have our worker pool of threads process them.
539 */
540void
541iocompletionport_update(isc_socket_t *sock) {
542	HANDLE hiocp;
543	char strbuf[ISC_STRERRORSIZE];
544
545	REQUIRE(VALID_SOCKET(sock));
546
547	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
548		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
549
550	if (hiocp == NULL) {
551		DWORD errval = GetLastError();
552		isc__strerror(errval, strbuf, sizeof(strbuf));
553		isc_log_iwrite(isc_lctx,
554				ISC_LOGCATEGORY_GENERAL,
555				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
556				isc_msgcat, ISC_MSGSET_SOCKET,
557				ISC_MSG_TOOMANYHANDLES,
558				"iocompletionport_update: failed to open"
559				" io completion port: %s",
560				strbuf);
561
562		/* XXXMLG temporary hack to make failures detected.
563		 * This function should return errors to the caller, not
564		 * exit here.
565		 */
566		FATAL_ERROR(__FILE__, __LINE__,
567				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
568				ISC_MSG_FAILED,
569				"CreateIoCompletionPort() failed "
570				"during initialization: %s"),
571				strbuf);
572		exit(1);
573	}
574
575	InterlockedIncrement(&sock->manager->iocp_total);
576}
577
578/*
579 * Routine to cleanup and then close the socket.
580 * Only close the socket here if it is NOT associated
581 * with an event, otherwise the WSAWaitForMultipleEvents
582 * may fail due to the fact that the Wait should not
583 * be running while closing an event or a socket.
584 * The socket is locked before calling this function
585 */
586void
587socket_close(isc_socket_t *sock) {
588
589	REQUIRE(sock != NULL);
590
591	if (sock->fd != INVALID_SOCKET) {
592		closesocket(sock->fd);
593		sock->fd = INVALID_SOCKET;
594		_set_state(sock, SOCK_CLOSED);
595		InterlockedDecrement(&sock->manager->totalSockets);
596	}
597}
598
599static isc_once_t initialise_once = ISC_ONCE_INIT;
600static isc_boolean_t initialised = ISC_FALSE;
601
602static void
603initialise(void) {
604	WORD wVersionRequested;
605	WSADATA wsaData;
606	int err;
607	SOCKET sock;
608	GUID GUIDConnectEx = WSAID_CONNECTEX;
609	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
610	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
611	DWORD dwBytes;
612
613	/* Need Winsock 2.2 or better */
614	wVersionRequested = MAKEWORD(2, 2);
615
616	err = WSAStartup(wVersionRequested, &wsaData);
617	if (err != 0) {
618		char strbuf[ISC_STRERRORSIZE];
619		isc__strerror(err, strbuf, sizeof(strbuf));
620		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
621			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
622					   ISC_MSG_FAILED, "failed"),
623			    strbuf);
624		exit(1);
625	}
626	/*
627	 * The following APIs do not exist as functions in a library, but we must
628	 * ask winsock for them.  They are "extensions" -- but why they cannot be
629	 * actual functions is beyond me.  So, ask winsock for the pointers to the
630	 * functions we need.
631	 */
632	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
633	INSIST(sock != INVALID_SOCKET);
634	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
635		 &GUIDConnectEx, sizeof(GUIDConnectEx),
636		 &ISCConnectEx, sizeof(ISCConnectEx),
637		 &dwBytes, NULL, NULL);
638	INSIST(err == 0);
639
640	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
641		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
642		 &ISCAcceptEx, sizeof(ISCAcceptEx),
643		 &dwBytes, NULL, NULL);
644	INSIST(err == 0);
645
646	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
647		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
648		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
649		 &dwBytes, NULL, NULL);
650	INSIST(err == 0);
651
652	closesocket(sock);
653
654	initialised = ISC_TRUE;
655}
656
657/*
658 * Initialize socket services
659 */
660void
661InitSockets(void) {
662	RUNTIME_CHECK(isc_once_do(&initialise_once,
663				  initialise) == ISC_R_SUCCESS);
664	if (!initialised)
665		exit(1);
666}
667
668int
669internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
670		 struct msghdr *messagehdr, int flags, int *Error)
671{
672	int Result;
673	DWORD BytesSent;
674	DWORD Flags = flags;
675	int total_sent;
676
677	*Error = 0;
678	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
679			   messagehdr->msg_iovlen, &BytesSent,
680			   Flags, (SOCKADDR *)&messagehdr->to_addr,
681			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
682			   NULL);
683
684	total_sent = (int)BytesSent;
685
686	/* Check for errors.*/
687	if (Result == SOCKET_ERROR) {
688		*Error = WSAGetLastError();
689
690		switch (*Error) {
691		case WSA_IO_INCOMPLETE:
692		case WSA_WAIT_IO_COMPLETION:
693		case WSA_IO_PENDING:
694		case NO_ERROR:		/* Strange, but okay */
695			sock->pending_iocp++;
696			sock->pending_send++;
697			break;
698
699		default:
700			return (-1);
701			break;
702		}
703	} else {
704		sock->pending_iocp++;
705		sock->pending_send++;
706	}
707
708	if (lpo != NULL)
709		return (0);
710	else
711		return (total_sent);
712}
713
714static void
715queue_receive_request(isc_socket_t *sock) {
716	DWORD Flags = 0;
717	DWORD NumBytes = 0;
718	int total_bytes = 0;
719	int Result;
720	int Error;
721	WSABUF iov[1];
722	IoCompletionInfo *lpo;
723	isc_result_t isc_result;
724
725	/*
726	 * If we already have a receive pending, do nothing.
727	 */
728	if (sock->pending_recv > 0)
729		return;
730
731	/*
732	 * If no one is waiting, do nothing.
733	 */
734	if (ISC_LIST_EMPTY(sock->recv_list))
735		return;
736
737	INSIST(sock->recvbuf.remaining == 0);
738	INSIST(sock->fd != INVALID_SOCKET);
739
740	iov[0].len = sock->recvbuf.len;
741	iov[0].buf = sock->recvbuf.base;
742
743	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
744					    HEAP_ZERO_MEMORY,
745					    sizeof(IoCompletionInfo));
746	RUNTIME_CHECK(lpo != NULL);
747	lpo->request_type = SOCKET_RECV;
748
749	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
750
751	Error = 0;
752	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
753			     &NumBytes, &Flags,
754			     (SOCKADDR *)&sock->recvbuf.from_addr,
755			     &sock->recvbuf.from_addr_len,
756			     (LPWSAOVERLAPPED)lpo, NULL);
757
758	/* Check for errors. */
759	if (Result == SOCKET_ERROR) {
760		Error = WSAGetLastError();
761
762		switch (Error) {
763		case WSA_IO_PENDING:
764			sock->pending_iocp++;
765			sock->pending_recv++;
766			break;
767
768		default:
769			isc_result = isc__errno2result(Error);
770			if (isc_result == ISC_R_UNEXPECTED)
771				UNEXPECTED_ERROR(__FILE__, __LINE__,
772					"WSARecvFrom: Windows error code: %d, isc result %d",
773					Error, isc_result);
774			send_recvdone_abort(sock, isc_result);
775			break;
776		}
777	} else {
778		/*
779		 * The recv() finished immediately, but we will still get
780		 * a completion event.  Rather than duplicate code, let
781		 * that thread handle sending the data along its way.
782		 */
783		sock->pending_iocp++;
784		sock->pending_recv++;
785	}
786
787	socket_log(__LINE__, sock, NULL, IOEVENT,
788		   isc_msgcat, ISC_MSGSET_SOCKET,
789		   ISC_MSG_DOIORECV,
790		   "queue_io_request: fd %d result %d error %d",
791		   sock->fd, Result, Error);
792
793	CONSISTENT(sock);
794}
795
796static void
797manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
798	    isc_logmodule_t *module, int level, const char *fmt, ...)
799{
800	char msgbuf[2048];
801	va_list ap;
802
803	if (!isc_log_wouldlog(isc_lctx, level))
804		return;
805
806	va_start(ap, fmt);
807	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
808	va_end(ap);
809
810	isc_log_write(isc_lctx, category, module, level,
811		      "sockmgr %p: %s", sockmgr, msgbuf);
812}
813
814static void
815socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
816	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
817	   isc_msgcat_t *msgcat, int msgset, int message,
818	   const char *fmt, ...)
819{
820	char msgbuf[2048];
821	char peerbuf[256];
822	va_list ap;
823
824
825	if (!isc_log_wouldlog(isc_lctx, level))
826		return;
827
828	va_start(ap, fmt);
829	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
830	va_end(ap);
831
832	if (address == NULL) {
833		isc_log_iwrite(isc_lctx, category, module, level,
834			       msgcat, msgset, message,
835			       "socket %p line %d: %s", sock, lineno, msgbuf);
836	} else {
837		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
838		isc_log_iwrite(isc_lctx, category, module, level,
839			       msgcat, msgset, message,
840				   "socket %p line %d peer %s: %s", sock, lineno,
841				   peerbuf, msgbuf);
842	}
843
844}
845
846/*
847 * Make an fd SOCKET non-blocking.
848 */
849static isc_result_t
850make_nonblock(SOCKET fd) {
851	int ret;
852	unsigned long flags = 1;
853	char strbuf[ISC_STRERRORSIZE];
854
855	/* Set the socket to non-blocking */
856	ret = ioctlsocket(fd, FIONBIO, &flags);
857
858	if (ret == -1) {
859		isc__strerror(errno, strbuf, sizeof(strbuf));
860		UNEXPECTED_ERROR(__FILE__, __LINE__,
861				 "ioctlsocket(%d, FIOBIO, %d): %s",
862				 fd, flags, strbuf);
863
864		return (ISC_R_UNEXPECTED);
865	}
866
867	return (ISC_R_SUCCESS);
868}
869
870/*
871 * Windows 2000 systems incorrectly cause UDP sockets using WASRecvFrom
872 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
873 * fails with an "ICMP port unreachable" response and preventing the
874 * socket from using the WSARecvFrom in subsequent operations.
875 * The function below fixes this, but requires that Windows 2000
876 * Service Pack 2 or later be installed on the system.  NT 4.0
877 * systems are not affected by this and work correctly.
878 * See Microsoft Knowledge Base Article Q263823 for details of this.
879 */
880isc_result_t
881connection_reset_fix(SOCKET fd) {
882	DWORD dwBytesReturned = 0;
883	BOOL  bNewBehavior = FALSE;
884	DWORD status;
885
886	if (isc_win32os_majorversion() < 5)
887		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
888
889	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
890	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
891			  sizeof(bNewBehavior), NULL, 0,
892			  &dwBytesReturned, NULL, NULL);
893	if (status != SOCKET_ERROR)
894		return (ISC_R_SUCCESS);
895	else {
896		UNEXPECTED_ERROR(__FILE__, __LINE__,
897				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
898				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
899						ISC_MSG_FAILED, "failed"));
900		return (ISC_R_UNEXPECTED);
901	}
902}
903
904/*
905 * Construct an iov array and attach it to the msghdr passed in.  This is
906 * the SEND constructor, which will use the used region of the buffer
907 * (if using a buffer list) or will use the internal region (if a single
908 * buffer I/O is requested).
909 *
910 * Nothing can be NULL, and the done event must list at least one buffer
911 * on the buffer linked list for this function to be meaningful.
912 */
913static void
914build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
915		  struct msghdr *msg, char *cmsg, WSABUF *iov,
916		  IoCompletionInfo  *lpo)
917{
918	unsigned int iovcount;
919	isc_buffer_t *buffer;
920	buflist_t  *cpbuffer;
921	isc_region_t used;
922	size_t write_count;
923	size_t skip_count;
924
925	memset(msg, 0, sizeof(*msg));
926
927	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
928	msg->to_addr_len = dev->address.length;
929
930	buffer = ISC_LIST_HEAD(dev->bufferlist);
931	write_count = 0;
932	iovcount = 0;
933
934	/*
935	 * Single buffer I/O?  Skip what we've done so far in this region.
936	 */
937	if (buffer == NULL) {
938		write_count = dev->region.length - dev->n;
939		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
940		RUNTIME_CHECK(cpbuffer != NULL);
941		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
942		RUNTIME_CHECK(cpbuffer->buf != NULL);
943
944		socket_log(__LINE__, sock, NULL, TRACE,
945		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
946		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
947		   cpbuffer->buf, write_count);
948
949		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
950		cpbuffer->buflen = write_count;
951		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
952		iov[0].buf = cpbuffer->buf;
953		iov[0].len = write_count;
954		iovcount = 1;
955
956		goto config;
957	}
958
959	/*
960	 * Multibuffer I/O.
961	 * Skip the data in the buffer list that we have already written.
962	 */
963	skip_count = dev->n;
964	while (buffer != NULL) {
965		REQUIRE(ISC_BUFFER_VALID(buffer));
966		if (skip_count < isc_buffer_usedlength(buffer))
967			break;
968		skip_count -= isc_buffer_usedlength(buffer);
969		buffer = ISC_LIST_NEXT(buffer, link);
970	}
971
972	while (buffer != NULL) {
973		INSIST(iovcount < MAXSCATTERGATHER_SEND);
974
975		isc_buffer_usedregion(buffer, &used);
976
977		if (used.length > 0) {
978			int uselen = used.length - skip_count;
979			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
980			RUNTIME_CHECK(cpbuffer != NULL);
981			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
982			RUNTIME_CHECK(cpbuffer->buf != NULL);
983
984			socket_log(__LINE__, sock, NULL, TRACE,
985			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
986			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
987			   cpbuffer->buf, write_count);
988
989			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
990			cpbuffer->buflen = uselen;
991			iov[iovcount].buf = cpbuffer->buf;
992			iov[iovcount].len = used.length - skip_count;
993			write_count += uselen;
994			skip_count = 0;
995			iovcount++;
996		}
997		buffer = ISC_LIST_NEXT(buffer, link);
998	}
999
1000	INSIST(skip_count == 0);
1001
1002 config:
1003	msg->msg_iov = iov;
1004	msg->msg_iovlen = iovcount;
1005	msg->msg_totallen = write_count;
1006}
1007
1008static void
1009set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1010		isc_socketevent_t *dev)
1011{
1012	if (sock->type == isc_sockettype_udp) {
1013		if (address != NULL)
1014			dev->address = *address;
1015		else
1016			dev->address = sock->address;
1017	} else if (sock->type == isc_sockettype_tcp) {
1018		INSIST(address == NULL);
1019		dev->address = sock->address;
1020	}
1021}
1022
1023static void
1024destroy_socketevent(isc_event_t *event) {
1025	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1026
1027	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1028
1029	(ev->destroy)(event);
1030}
1031
1032static isc_socketevent_t *
1033allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1034		     isc_taskaction_t action, const void *arg)
1035{
1036	isc_socketevent_t *ev;
1037
1038	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1039						     sock, eventtype,
1040						     action, arg,
1041						     sizeof(*ev));
1042	if (ev == NULL)
1043		return (NULL);
1044
1045	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1046	ISC_LINK_INIT(ev, ev_link);
1047	ISC_LIST_INIT(ev->bufferlist);
1048	ev->region.base = NULL;
1049	ev->n = 0;
1050	ev->offset = 0;
1051	ev->attributes = 0;
1052	ev->destroy = ev->ev_destroy;
1053	ev->ev_destroy = destroy_socketevent;
1054
1055	return (ev);
1056}
1057
1058#if defined(ISC_SOCKET_DEBUG)
1059static void
1060dump_msg(struct msghdr *msg, isc_socket_t *sock) {
1061	unsigned int i;
1062
1063	printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd);
1064	printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
1065	printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);
1066	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
1067		printf("\t\t%d\tbase %p, len %d\n", i,
1068		       msg->msg_iov[i].buf,
1069		       msg->msg_iov[i].len);
1070}
1071#endif
1072
1073/*
1074 * map the error code
1075 */
1076int
1077map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1078		 char *errorstring, size_t bufsize) {
1079
1080	int doreturn;
1081	switch (windows_errno) {
1082	case WSAECONNREFUSED:
1083		*isc_errno = ISC_R_CONNREFUSED;
1084		if (sock->connected)
1085			doreturn = DOIO_HARD;
1086		else
1087			doreturn = DOIO_SOFT;
1088		break;
1089	case WSAENETUNREACH:
1090	case ERROR_NETWORK_UNREACHABLE:
1091		*isc_errno = ISC_R_NETUNREACH;
1092		if (sock->connected)
1093			doreturn = DOIO_HARD;
1094		else
1095			doreturn = DOIO_SOFT;
1096		break;
1097	case ERROR_PORT_UNREACHABLE:
1098	case ERROR_HOST_UNREACHABLE:
1099	case WSAEHOSTUNREACH:
1100		*isc_errno = ISC_R_HOSTUNREACH;
1101		if (sock->connected)
1102			doreturn = DOIO_HARD;
1103		else
1104			doreturn = DOIO_SOFT;
1105		break;
1106	case WSAENETDOWN:
1107		*isc_errno = ISC_R_NETDOWN;
1108		if (sock->connected)
1109			doreturn = DOIO_HARD;
1110		else
1111			doreturn = DOIO_SOFT;
1112		break;
1113	case WSAEHOSTDOWN:
1114		*isc_errno = ISC_R_HOSTDOWN;
1115		if (sock->connected)
1116			doreturn = DOIO_HARD;
1117		else
1118			doreturn = DOIO_SOFT;
1119		break;
1120	case WSAEACCES:
1121		*isc_errno = ISC_R_NOPERM;
1122		if (sock->connected)
1123			doreturn = DOIO_HARD;
1124		else
1125			doreturn = DOIO_SOFT;
1126		break;
1127	case WSAECONNRESET:
1128	case WSAENETRESET:
1129	case WSAECONNABORTED:
1130	case WSAEDISCON:
1131		*isc_errno = ISC_R_CONNECTIONRESET;
1132		if (sock->connected)
1133			doreturn = DOIO_HARD;
1134		else
1135			doreturn = DOIO_SOFT;
1136		break;
1137	case WSAENOTCONN:
1138		*isc_errno = ISC_R_NOTCONNECTED;
1139		if (sock->connected)
1140			doreturn = DOIO_HARD;
1141		else
1142			doreturn = DOIO_SOFT;
1143		break;
1144	case ERROR_OPERATION_ABORTED:
1145	case ERROR_CONNECTION_ABORTED:
1146	case ERROR_REQUEST_ABORTED:
1147		*isc_errno = ISC_R_CONNECTIONRESET;
1148		doreturn = DOIO_HARD;
1149		break;
1150	case WSAENOBUFS:
1151		*isc_errno = ISC_R_NORESOURCES;
1152		doreturn = DOIO_HARD;
1153		break;
1154	case WSAEAFNOSUPPORT:
1155		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1156		doreturn = DOIO_HARD;
1157		break;
1158	case WSAEADDRNOTAVAIL:
1159		*isc_errno = ISC_R_ADDRNOTAVAIL;
1160		doreturn = DOIO_HARD;
1161		break;
1162	case WSAEDESTADDRREQ:
1163		*isc_errno = ISC_R_BADADDRESSFORM;
1164		doreturn = DOIO_HARD;
1165		break;
1166	case ERROR_NETNAME_DELETED:
1167		*isc_errno = ISC_R_NETDOWN;
1168		doreturn = DOIO_HARD;
1169		break;
1170	default:
1171		*isc_errno = ISC_R_IOERROR;
1172		doreturn = DOIO_HARD;
1173		break;
1174	}
1175	if (doreturn == DOIO_HARD) {
1176		isc__strerror(windows_errno, errorstring, bufsize);
1177	}
1178	return (doreturn);
1179}
1180
1181static void
1182fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1183	isc_region_t r;
1184	int copylen;
1185	isc_buffer_t *buffer;
1186
1187	INSIST(dev->n < dev->minimum);
1188	INSIST(sock->recvbuf.remaining > 0);
1189	INSIST(sock->pending_recv == 0);
1190
1191	if (sock->type == isc_sockettype_udp) {
1192		dev->address.length = sock->recvbuf.from_addr_len;
1193		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1194		    sock->recvbuf.from_addr_len);
1195		if (isc_sockaddr_getport(&dev->address) == 0) {
1196			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1197				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1198					   isc_msgcat, ISC_MSGSET_SOCKET,
1199					   ISC_MSG_ZEROPORT,
1200					   "dropping source port zero packet");
1201			}
1202			sock->recvbuf.remaining = 0;
1203			return;
1204		}
1205	} else if (sock->type == isc_sockettype_tcp) {
1206		dev->address = sock->address;
1207	}
1208
1209	/*
1210	 * Run through the list of buffers we were given, and find the
1211	 * first one with space.  Once it is found, loop through, filling
1212	 * the buffers as much as possible.
1213	 */
1214	buffer = ISC_LIST_HEAD(dev->bufferlist);
1215	if (buffer != NULL) { // Multi-buffer receive
1216		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1217			REQUIRE(ISC_BUFFER_VALID(buffer));
1218			if (isc_buffer_availablelength(buffer) > 0) {
1219				isc_buffer_availableregion(buffer, &r);
1220				copylen = min(r.length, sock->recvbuf.remaining);
1221				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1222				sock->recvbuf.consume_position += copylen;
1223				sock->recvbuf.remaining -= copylen;
1224				isc_buffer_add(buffer, copylen);
1225				dev->n += copylen;
1226			}
1227			buffer = ISC_LIST_NEXT(buffer, link);
1228		}
1229	} else { // Single-buffer receive
1230		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1231		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1232		sock->recvbuf.consume_position += copylen;
1233		sock->recvbuf.remaining -= copylen;
1234		dev->n += copylen;
1235	}
1236
1237	/*
1238	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1239	 * data in our receive buffer, and the caller only gave us
1240	 * 1k of space, we will toss the remaining 3k of data.  TCP
1241	 * will keep the extra data around and use it for later requests.
1242	 */
1243	if (sock->type == isc_sockettype_udp)
1244		sock->recvbuf.remaining = 0;
1245}
1246
1247/*
1248 * Copy out as much data from the internal buffer to done events.
1249 * As each done event is filled, send it along its way.
1250 */
1251static void
1252completeio_recv(isc_socket_t *sock)
1253{
1254	isc_socketevent_t *dev;
1255
1256	/*
1257	 * If we are in the process of filling our buffer, we cannot
1258	 * touch it yet, so don't.
1259	 */
1260	if (sock->pending_recv > 0)
1261		return;
1262
1263	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1264		dev = ISC_LIST_HEAD(sock->recv_list);
1265
1266		/*
1267		 * See if we have sufficient data in our receive buffer
1268		 * to handle this.  If we do, copy out the data.
1269		 */
1270		fill_recv(sock, dev);
1271
1272		/*
1273		 * Did we satisfy it?
1274		 */
1275		if (dev->n >= dev->minimum) {
1276			dev->result = ISC_R_SUCCESS;
1277			send_recvdone_event(sock, &dev);
1278		}
1279	}
1280}
1281
1282/*
1283 * Returns:
1284 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1285 *			ISC_R_SUCCESS.
1286 *
1287 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1288 *			dev->result contains the appropriate error.
1289 *
1290 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1291 *			event was sent.  The operation should be retried.
1292 *
1293 *	No other return values are possible.
1294 */
1295static int
1296completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1297		struct msghdr *messagehdr, int cc, int send_errno)
1298{
1299	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1300	char strbuf[ISC_STRERRORSIZE];
1301
1302	if (send_errno != 0) {
1303		if (SOFT_ERROR(send_errno))
1304			return (DOIO_SOFT);
1305
1306		return (map_socket_error(sock, send_errno, &dev->result,
1307			strbuf, sizeof(strbuf)));
1308
1309		/*
1310		 * The other error types depend on whether or not the
1311		 * socket is UDP or TCP.  If it is UDP, some errors
1312		 * that we expect to be fatal under TCP are merely
1313		 * annoying, and are really soft errors.
1314		 *
1315		 * However, these soft errors are still returned as
1316		 * a status.
1317		 */
1318		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1319		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1320		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1321				 addrbuf, strbuf);
1322		dev->result = isc__errno2result(send_errno);
1323	return (DOIO_HARD);
1324	}
1325
1326	/*
1327	 * If we write less than we expected, update counters, poke.
1328	 */
1329	dev->n += cc;
1330	if (cc != messagehdr->msg_totallen)
1331		return (DOIO_SOFT);
1332
1333	/*
1334	 * Exactly what we wanted to write.  We're done with this
1335	 * entry.  Post its completion event.
1336	 */
1337	dev->result = ISC_R_SUCCESS;
1338	return (DOIO_SUCCESS);
1339}
1340
1341static int
1342startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1343	     int *send_errno)
1344{
1345	char *cmsg = NULL;
1346	char strbuf[ISC_STRERRORSIZE];
1347	IoCompletionInfo *lpo;
1348	int status;
1349	struct msghdr *msghdr;
1350
1351	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1352					    HEAP_ZERO_MEMORY,
1353					    sizeof(IoCompletionInfo));
1354	RUNTIME_CHECK(lpo != NULL);
1355	lpo->request_type = SOCKET_SEND;
1356	lpo->dev = dev;
1357	msghdr = &lpo->messagehdr;
1358	memset(msghdr, 0, sizeof(struct msghdr));
1359	ISC_LIST_INIT(lpo->bufferlist);
1360
1361	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1362
1363	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1364
1365	if (*nbytes < 0) {
1366		/*
1367		 * I/O has been initiated
1368		 * completion will be through the completion port
1369		 */
1370		if (PENDING_ERROR(*send_errno)) {
1371			status = DOIO_PENDING;
1372			goto done;
1373		}
1374
1375		if (SOFT_ERROR(*send_errno)) {
1376			status = DOIO_SOFT;
1377			goto done;
1378		}
1379
1380		/*
1381		 * If we got this far then something is wrong
1382		 */
1383		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1384			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1385			socket_log(__LINE__, sock, NULL, IOEVENT,
1386				   isc_msgcat, ISC_MSGSET_SOCKET,
1387				   ISC_MSG_INTERNALSEND,
1388				   "startio_send: internal_sendmsg(%d) %d "
1389				   "bytes, err %d/%s",
1390				   sock->fd, *nbytes, *send_errno, strbuf);
1391		}
1392		goto done;
1393	}
1394	dev->result = ISC_R_SUCCESS;
1395	status = DOIO_SOFT;
1396 done:
1397	_set_state(sock, SOCK_DATA);
1398	return (status);
1399}
1400
1401static isc_result_t
1402allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1403		isc_socket_t **socketp) {
1404	isc_socket_t *sock;
1405	isc_result_t result;
1406
1407	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1408
1409	if (sock == NULL)
1410		return (ISC_R_NOMEMORY);
1411
1412	sock->magic = 0;
1413	sock->references = 0;
1414
1415	sock->manager = manager;
1416	sock->type = type;
1417	sock->fd = INVALID_SOCKET;
1418
1419	ISC_LINK_INIT(sock, link);
1420
1421	/*
1422	 * set up list of readers and writers to be initially empty
1423	 */
1424	ISC_LIST_INIT(sock->recv_list);
1425	ISC_LIST_INIT(sock->send_list);
1426	ISC_LIST_INIT(sock->accept_list);
1427	sock->connect_ev = NULL;
1428	sock->pending_accept = 0;
1429	sock->pending_recv = 0;
1430	sock->pending_send = 0;
1431	sock->pending_iocp = 0;
1432	sock->listener = 0;
1433	sock->connected = 0;
1434	sock->pending_connect = 0;
1435	sock->bound = 0;
1436	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1437	_set_state(sock, SOCK_INITIALIZED);
1438
1439	sock->recvbuf.len = 65536;
1440	sock->recvbuf.consume_position = sock->recvbuf.base;
1441	sock->recvbuf.remaining = 0;
1442	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1443	if (sock->recvbuf.base == NULL) {
1444		sock->magic = 0;
1445		goto error;
1446	}
1447
1448	/*
1449	 * initialize the lock
1450	 */
1451	result = isc_mutex_init(&sock->lock);
1452	if (result != ISC_R_SUCCESS) {
1453		sock->magic = 0;
1454		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1455		sock->recvbuf.base = NULL;
1456		goto error;
1457	}
1458
1459	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1460		   "allocated");
1461
1462	sock->magic = SOCKET_MAGIC;
1463	*socketp = sock;
1464
1465	return (ISC_R_SUCCESS);
1466
1467 error:
1468	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1469
1470	return (result);
1471}
1472
1473/*
1474 * Verify that the socket state is consistent.
1475 */
1476static void
1477consistent(isc_socket_t *sock) {
1478
1479	isc_socketevent_t *dev;
1480	isc_socket_newconnev_t *nev;
1481	unsigned int count;
1482	char *crash_reason;
1483	isc_boolean_t crash = ISC_FALSE;
1484
1485	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1486		+ sock->pending_accept + sock->pending_connect);
1487
1488	dev = ISC_LIST_HEAD(sock->send_list);
1489	count = 0;
1490	while (dev != NULL) {
1491		count++;
1492		dev = ISC_LIST_NEXT(dev, ev_link);
1493	}
1494	if (count > sock->pending_send) {
1495		crash = ISC_TRUE;
1496		crash_reason = "send_list > sock->pending_send";
1497	}
1498
1499	nev = ISC_LIST_HEAD(sock->accept_list);
1500	count = 0;
1501	while (nev != NULL) {
1502		count++;
1503		nev = ISC_LIST_NEXT(nev, ev_link);
1504	}
1505	if (count > sock->pending_accept) {
1506		crash = ISC_TRUE;
1507		crash_reason = "send_list > sock->pending_send";
1508	}
1509
1510	if (crash) {
1511		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1512			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1513			   crash_reason);
1514		sock_dump(sock);
1515		INSIST(crash == ISC_FALSE);
1516	}
1517}
1518
1519/*
1520 * Maybe free the socket.
1521 *
1522 * This function will verify tht the socket is no longer in use in any way,
1523 * either internally or externally.  This is the only place where this
1524 * check is to be made; if some bit of code believes that IT is done with
1525 * the socket (e.g., some reference counter reaches zero), it should call
1526 * this function.
1527 *
1528 * When calling this function, the socket must be locked, and the manager
1529 * must be unlocked.
1530 *
1531 * When this function returns, *socketp will be NULL.  No tricks to try
1532 * to hold on to this pointer are allowed.
1533 */
1534static void
1535maybe_free_socket(isc_socket_t **socketp, int lineno) {
1536	isc_socket_t *sock = *socketp;
1537	*socketp = NULL;
1538
1539	INSIST(VALID_SOCKET(sock));
1540	CONSISTENT(sock);
1541
1542	if (sock->pending_iocp > 0
1543	    || sock->pending_recv > 0
1544	    || sock->pending_send > 0
1545	    || sock->pending_accept > 0
1546	    || sock->references > 0
1547	    || sock->pending_connect == 1
1548	    || !ISC_LIST_EMPTY(sock->recv_list)
1549	    || !ISC_LIST_EMPTY(sock->send_list)
1550	    || !ISC_LIST_EMPTY(sock->accept_list)
1551	    || sock->fd != INVALID_SOCKET) {
1552		UNLOCK(&sock->lock);
1553		return;
1554	}
1555	UNLOCK(&sock->lock);
1556
1557	free_socket(&sock, lineno);
1558}
1559
1560void
1561free_socket(isc_socket_t **sockp, int lineno) {
1562	isc_socketmgr_t *manager;
1563	isc_socket_t *sock = *sockp;
1564	*sockp = NULL;
1565
1566	manager = sock->manager;
1567
1568	/*
1569	 * Seems we can free the socket after all.
1570	 */
1571	manager = sock->manager;
1572	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1573		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1574		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1575
1576	sock->magic = 0;
1577	DESTROYLOCK(&sock->lock);
1578
1579	if (sock->recvbuf.base != NULL)
1580		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1581
1582	LOCK(&manager->lock);
1583	if (ISC_LINK_LINKED(sock, link))
1584		ISC_LIST_UNLINK(manager->socklist, sock, link);
1585	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1586
1587	if (ISC_LIST_EMPTY(manager->socklist))
1588		SIGNAL(&manager->shutdown_ok);
1589	UNLOCK(&manager->lock);
1590}
1591
1592/*
1593 * Create a new 'type' socket managed by 'manager'.  Events
1594 * will be posted to 'task' and when dispatched 'action' will be
1595 * called with 'arg' as the arg value.  The new socket is returned
1596 * in 'socketp'.
1597 */
1598isc_result_t
1599isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1600		  isc_socket_t **socketp) {
1601	isc_socket_t *sock = NULL;
1602	isc_result_t result;
1603#if defined(USE_CMSG)
1604	int on = 1;
1605#endif
1606#if defined(SO_RCVBUF)
1607	ISC_SOCKADDR_LEN_T optlen;
1608	int size;
1609#endif
1610	int socket_errno;
1611	char strbuf[ISC_STRERRORSIZE];
1612
1613	REQUIRE(VALID_MANAGER(manager));
1614	REQUIRE(socketp != NULL && *socketp == NULL);
1615	REQUIRE(type != isc_sockettype_fdwatch);
1616
1617	result = allocate_socket(manager, type, &sock);
1618	if (result != ISC_R_SUCCESS)
1619		return (result);
1620
1621	sock->pf = pf;
1622	switch (type) {
1623	case isc_sockettype_udp:
1624		sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1625		if (sock->fd != INVALID_SOCKET) {
1626			result = connection_reset_fix(sock->fd);
1627			if (result != ISC_R_SUCCESS) {
1628				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1629					"closed %d %d %d con_reset_fix_failed",
1630					sock->pending_recv, sock->pending_send,
1631					sock->references);
1632				closesocket(sock->fd);
1633				_set_state(sock, SOCK_CLOSED);
1634				sock->fd = INVALID_SOCKET;
1635				free_socket(&sock, __LINE__);
1636				return (result);
1637			}
1638		}
1639		break;
1640	case isc_sockettype_tcp:
1641		sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1642		break;
1643	}
1644
1645	if (sock->fd == INVALID_SOCKET) {
1646		socket_errno = WSAGetLastError();
1647		free_socket(&sock, __LINE__);
1648
1649		switch (socket_errno) {
1650		case WSAEMFILE:
1651		case WSAENOBUFS:
1652			return (ISC_R_NORESOURCES);
1653
1654		case WSAEPROTONOSUPPORT:
1655		case WSAEPFNOSUPPORT:
1656		case WSAEAFNOSUPPORT:
1657			return (ISC_R_FAMILYNOSUPPORT);
1658
1659		default:
1660			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
1661			UNEXPECTED_ERROR(__FILE__, __LINE__,
1662					 "socket() %s: %s",
1663					 isc_msgcat_get(isc_msgcat,
1664							ISC_MSGSET_GENERAL,
1665							ISC_MSG_FAILED,
1666							"failed"),
1667					 strbuf);
1668			return (ISC_R_UNEXPECTED);
1669		}
1670	}
1671
1672	result = make_nonblock(sock->fd);
1673	if (result != ISC_R_SUCCESS) {
1674		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1675			"closed %d %d %d make_nonblock_failed",
1676			sock->pending_recv, sock->pending_send,
1677			sock->references);
1678		closesocket(sock->fd);
1679		sock->fd = INVALID_SOCKET;
1680		free_socket(&sock, __LINE__);
1681		return (result);
1682	}
1683
1684
1685#if defined(USE_CMSG) || defined(SO_RCVBUF)
1686	if (type == isc_sockettype_udp) {
1687
1688#if defined(USE_CMSG)
1689#if defined(ISC_PLATFORM_HAVEIPV6)
1690#ifdef IPV6_RECVPKTINFO
1691		/* 2292bis */
1692		if ((pf == AF_INET6)
1693		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1694				   (void *)&on, sizeof(on)) < 0)) {
1695			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1696			UNEXPECTED_ERROR(__FILE__, __LINE__,
1697					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1698					 "%s: %s", sock->fd,
1699					 isc_msgcat_get(isc_msgcat,
1700							ISC_MSGSET_GENERAL,
1701							ISC_MSG_FAILED,
1702							"failed"),
1703					 strbuf);
1704		}
1705#else
1706		/* 2292 */
1707		if ((pf == AF_INET6)
1708		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1709				   (void *)&on, sizeof(on)) < 0)) {
1710			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1711			UNEXPECTED_ERROR(__FILE__, __LINE__,
1712					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1713					 sock->fd,
1714					 isc_msgcat_get(isc_msgcat,
1715							ISC_MSGSET_GENERAL,
1716							ISC_MSG_FAILED,
1717							"failed"),
1718					 strbuf);
1719		}
1720#endif /* IPV6_RECVPKTINFO */
1721#ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
1722		/* use minimum MTU */
1723		if (pf == AF_INET6) {
1724			(void)setsockopt(sock->fd, IPPROTO_IPV6,
1725					 IPV6_USE_MIN_MTU,
1726					 (void *)&on, sizeof(on));
1727		}
1728#endif
1729#endif /* ISC_PLATFORM_HAVEIPV6 */
1730#endif /* defined(USE_CMSG) */
1731
1732#if defined(SO_RCVBUF)
1733	       optlen = sizeof(size);
1734	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1735			      (void *)&size, &optlen) >= 0 &&
1736		    size < RCVBUFSIZE) {
1737		       size = RCVBUFSIZE;
1738		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1739					(void *)&size, sizeof(size));
1740	       }
1741#endif
1742
1743	}
1744#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1745
1746	_set_state(sock, SOCK_OPEN);
1747	sock->references = 1;
1748	*socketp = sock;
1749
1750	iocompletionport_update(sock);
1751
1752	/*
1753	 * Note we don't have to lock the socket like we normally would because
1754	 * there are no external references to it yet.
1755	 */
1756	LOCK(&manager->lock);
1757	ISC_LIST_APPEND(manager->socklist, sock, link);
1758	InterlockedIncrement(&manager->totalSockets);
1759	UNLOCK(&manager->lock);
1760
1761	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1762		   ISC_MSG_CREATED, "created %u type %u", sock->fd, type);
1763
1764	return (ISC_R_SUCCESS);
1765}
1766
1767isc_result_t
1768isc_socket_open(isc_socket_t *sock) {
1769	REQUIRE(VALID_SOCKET(sock));
1770	REQUIRE(sock->type != isc_sockettype_fdwatch);
1771
1772	return (ISC_R_NOTIMPLEMENTED);
1773}
1774
1775/*
1776 * Attach to a socket.  Caller must explicitly detach when it is done.
1777 */
1778void
1779isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1780	REQUIRE(VALID_SOCKET(sock));
1781	REQUIRE(socketp != NULL && *socketp == NULL);
1782
1783	LOCK(&sock->lock);
1784	CONSISTENT(sock);
1785	sock->references++;
1786	UNLOCK(&sock->lock);
1787
1788	*socketp = sock;
1789}
1790
1791/*
1792 * Dereference a socket.  If this is the last reference to it, clean things
1793 * up by destroying the socket.
1794 */
1795void
1796isc_socket_detach(isc_socket_t **socketp) {
1797	isc_socket_t *sock;
1798	isc_boolean_t kill_socket = ISC_FALSE;
1799
1800	REQUIRE(socketp != NULL);
1801	sock = *socketp;
1802	REQUIRE(VALID_SOCKET(sock));
1803	REQUIRE(sock->type != isc_sockettype_fdwatch);
1804
1805	LOCK(&sock->lock);
1806	CONSISTENT(sock);
1807	REQUIRE(sock->references > 0);
1808	sock->references--;
1809
1810	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1811		"detach_socket %d %d %d",
1812		sock->pending_recv, sock->pending_send,
1813		sock->references);
1814
1815	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1816		closesocket(sock->fd);
1817		sock->fd = INVALID_SOCKET;
1818		_set_state(sock, SOCK_CLOSED);
1819	}
1820
1821	maybe_free_socket(&sock, __LINE__);
1822
1823	*socketp = NULL;
1824}
1825
1826isc_result_t
1827isc_socket_close(isc_socket_t *sock) {
1828	REQUIRE(VALID_SOCKET(sock));
1829	REQUIRE(sock->type != isc_sockettype_fdwatch);
1830
1831	return (ISC_R_NOTIMPLEMENTED);
1832}
1833
1834/*
1835 * Dequeue an item off the given socket's read queue, set the result code
1836 * in the done event to the one provided, and send it to the task it was
1837 * destined for.
1838 *
1839 * If the event to be sent is on a list, remove it before sending.  If
1840 * asked to, send and detach from the task as well.
1841 *
1842 * Caller must have the socket locked if the event is attached to the socket.
1843 */
1844static void
1845send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1846	isc_task_t *task;
1847
1848	task = (*dev)->ev_sender;
1849	(*dev)->ev_sender = sock;
1850
1851	if (ISC_LINK_LINKED(*dev, ev_link))
1852		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1853
1854	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1855	    == ISC_SOCKEVENTATTR_ATTACHED)
1856		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1857	else
1858		isc_task_send(task, (isc_event_t **)dev);
1859
1860	CONSISTENT(sock);
1861}
1862
1863/*
1864 * See comments for send_recvdone_event() above.
1865 */
1866static void
1867send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1868	isc_task_t *task;
1869
1870	INSIST(dev != NULL && *dev != NULL);
1871
1872	task = (*dev)->ev_sender;
1873	(*dev)->ev_sender = sock;
1874
1875	if (ISC_LINK_LINKED(*dev, ev_link))
1876		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1877
1878	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1879	    == ISC_SOCKEVENTATTR_ATTACHED)
1880		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1881	else
1882		isc_task_send(task, (isc_event_t **)dev);
1883
1884	CONSISTENT(sock);
1885}
1886
1887/*
1888 * See comments for send_recvdone_event() above.
1889 */
1890static void
1891send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1892	isc_task_t *task;
1893
1894	INSIST(adev != NULL && *adev != NULL);
1895
1896	task = (*adev)->ev_sender;
1897	(*adev)->ev_sender = sock;
1898
1899	if (ISC_LINK_LINKED(*adev, ev_link))
1900		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1901
1902	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1903
1904	CONSISTENT(sock);
1905}
1906
1907/*
1908 * See comments for send_recvdone_event() above.
1909 */
1910static void
1911send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1912	isc_task_t *task;
1913
1914	INSIST(cdev != NULL && *cdev != NULL);
1915
1916	task = (*cdev)->ev_sender;
1917	(*cdev)->ev_sender = sock;
1918
1919	sock->connect_ev = NULL;
1920
1921	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1922
1923	CONSISTENT(sock);
1924}
1925
1926/*
1927 * On entry to this function, the event delivered is the internal
1928 * readable event, and the first item on the accept_list should be
1929 * the done event we want to send.  If the list is empty, this is a no-op,
1930 * so just close the new connection, unlock, and return.
1931 *
1932 * Note the socket is locked before entering here
1933 */
1934static void
1935internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
1936	isc_socket_newconnev_t *adev;
1937	isc_result_t result = ISC_R_SUCCESS;
1938	isc_socket_t *nsock;
1939	struct sockaddr *localaddr;
1940	int localaddr_len = sizeof(*localaddr);
1941	struct sockaddr *remoteaddr;
1942	int remoteaddr_len = sizeof(*remoteaddr);
1943
1944	INSIST(VALID_SOCKET(sock));
1945	LOCK(&sock->lock);
1946	CONSISTENT(sock);
1947
1948	socket_log(__LINE__, sock, NULL, TRACE,
1949		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1950		   "internal_accept called");
1951
1952	INSIST(sock->listener);
1953
1954	INSIST(sock->pending_iocp > 0);
1955	sock->pending_iocp--;
1956	INSIST(sock->pending_accept > 0);
1957	sock->pending_accept--;
1958
1959	adev = lpo->adev;
1960
1961	/*
1962	 * If the event is no longer in the list we can just return.
1963	 */
1964	if (!acceptdone_is_active(sock, adev))
1965		goto done;
1966
1967	nsock = adev->newsocket;
1968
1969	/*
1970	 * Pull off the done event.
1971	 */
1972	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
1973
1974	/*
1975	 * Extract the addresses from the socket, copy them into the structure,
1976	 * and return the new socket.
1977	 */
1978	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
1979		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
1980		(LPSOCKADDR *)&localaddr, &localaddr_len,
1981		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
1982	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
1983	adev->address.length = remoteaddr_len;
1984	nsock->address = adev->address;
1985	nsock->pf = adev->address.type.sa.sa_family;
1986
1987	socket_log(__LINE__, nsock, &nsock->address, TRACE,
1988		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1989		   "internal_accept parent %p", sock);
1990
1991	result = make_nonblock(adev->newsocket->fd);
1992	INSIST(result == ISC_R_SUCCESS);
1993
1994	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
1995	       (char *)&sock->fd, sizeof(sock->fd)) == 0);
1996
1997	/*
1998	 * Hook it up into the manager.
1999	 */
2000	nsock->bound = 1;
2001	nsock->connected = 1;
2002	_set_state(nsock, SOCK_OPEN);
2003
2004	LOCK(&nsock->manager->lock);
2005	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2006	InterlockedIncrement(&nsock->manager->totalSockets);
2007	UNLOCK(&nsock->manager->lock);
2008
2009	socket_log(__LINE__, sock, &nsock->address, CREATION,
2010		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2011		   "accepted_connection new_socket %p fd %d",
2012		   nsock, nsock->fd);
2013
2014	adev->result = result;
2015	send_acceptdone_event(sock, &adev);
2016
2017done:
2018	CONSISTENT(sock);
2019	UNLOCK(&sock->lock);
2020
2021	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2022	lpo->acceptbuffer = NULL;
2023}
2024
2025/*
2026 * Called when a socket with a pending connect() finishes.
2027 * Note that the socket is locked before entering.
2028 */
2029static void
2030internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2031	isc_socket_connev_t *cdev;
2032	char strbuf[ISC_STRERRORSIZE];
2033
2034	INSIST(VALID_SOCKET(sock));
2035
2036	LOCK(&sock->lock);
2037
2038	INSIST(sock->pending_iocp > 0);
2039	sock->pending_iocp--;
2040	INSIST(sock->pending_connect == 1);
2041	sock->pending_connect = 0;
2042
2043	/*
2044	 * Has this event been canceled?
2045	 */
2046	cdev = lpo->cdev;
2047	if (!connectdone_is_active(sock, cdev)) {
2048		sock->pending_connect = 0;
2049		if (sock->fd != INVALID_SOCKET) {
2050			closesocket(sock->fd);
2051			sock->fd = INVALID_SOCKET;
2052			_set_state(sock, SOCK_CLOSED);
2053		}
2054		CONSISTENT(sock);
2055		UNLOCK(&sock->lock);
2056		return;
2057	}
2058
2059	/*
2060	 * Check possible Windows network event error status here.
2061	 */
2062	if (connect_errno != 0) {
2063		/*
2064		 * If the error is SOFT, just try again on this
2065		 * fd and pretend nothing strange happened.
2066		 */
2067		if (SOFT_ERROR(connect_errno) ||
2068		    connect_errno == WSAEINPROGRESS) {
2069			sock->pending_connect = 1;
2070			CONSISTENT(sock);
2071			UNLOCK(&sock->lock);
2072			return;
2073		}
2074
2075		/*
2076		 * Translate other errors into ISC_R_* flavors.
2077		 */
2078		switch (connect_errno) {
2079#define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2080			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2081			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2082			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2083			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2084			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2085			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2086			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2087			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2088			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2089			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2090			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2091			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2092#undef ERROR_MATCH
2093		default:
2094			cdev->result = ISC_R_UNEXPECTED;
2095			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2096			UNEXPECTED_ERROR(__FILE__, __LINE__,
2097					 "internal_connect: connect() %s",
2098					 strbuf);
2099		}
2100	} else {
2101		INSIST(setsockopt(sock->fd, SOL_SOCKET, SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2102		cdev->result = ISC_R_SUCCESS;
2103		sock->connected = 1;
2104		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2105			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2106			   "internal_connect: success");
2107	}
2108
2109	send_connectdone_event(sock, &cdev);
2110
2111	UNLOCK(&sock->lock);
2112}
2113
2114/*
2115 * Loop through the socket, returning ISC_R_EOF for each done event pending.
2116 */
2117static void
2118send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2119	isc_socketevent_t *dev;
2120
2121	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2122		dev = ISC_LIST_HEAD(sock->recv_list);
2123		dev->result = result;
2124		send_recvdone_event(sock, &dev);
2125	}
2126}
2127
2128/*
2129 * Take the data we received in our private buffer, and if any recv() calls on
2130 * our list are satisfied, send the corresponding done event.
2131 *
2132 * If we need more data (there are still items on the recv_list after we consume all
2133 * our data) then arrange for another system recv() call to fill our buffers.
2134 */
2135static void
2136internal_recv(isc_socket_t *sock, int nbytes)
2137{
2138	INSIST(VALID_SOCKET(sock));
2139
2140	LOCK(&sock->lock);
2141	CONSISTENT(sock);
2142
2143	socket_log(__LINE__, sock, NULL, IOEVENT,
2144		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2145		   "internal_recv: %d bytes received", nbytes);
2146
2147	/*
2148	 * If we got here, the I/O operation succeeded.  However, we might still have removed this
2149	 * event from our notification list (or never placed it on it due to immediate completion.)
2150	 * Handle the reference counting here, and handle the cancellation event just after.
2151	 */
2152	INSIST(sock->pending_iocp > 0);
2153	sock->pending_iocp--;
2154	INSIST(sock->pending_recv > 0);
2155	sock->pending_recv--;
2156
2157	/*
2158	 * The only way we could have gotten here is that our I/O has successfully completed.
2159	 * Update our pointers, and move on.  The only odd case here is that we might not
2160	 * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2161	 * this is the case, we will re-issue the recv() call for what we need.
2162	 *
2163	 * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2164	 * has closed.
2165	 */
2166	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2167		send_recvdone_abort(sock, ISC_R_EOF);
2168		maybe_free_socket(&sock, __LINE__);
2169		return;
2170	}
2171	sock->recvbuf.remaining = nbytes;
2172	sock->recvbuf.consume_position = sock->recvbuf.base;
2173	completeio_recv(sock);
2174
2175	/*
2176	 * If there are more receivers waiting for data, queue another receive
2177	 * here.
2178	 */
2179	queue_receive_request(sock);
2180
2181	/*
2182	 * Unlock and/or destroy if we are the last thing this socket has left to do.
2183	 */
2184	maybe_free_socket(&sock, __LINE__);
2185}
2186
2187static void
2188internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2189	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2190{
2191	buflist_t *buffer;
2192
2193	/*
2194	 * Find out what socket this is and lock it.
2195	 */
2196	INSIST(VALID_SOCKET(sock));
2197
2198	LOCK(&sock->lock);
2199	CONSISTENT(sock);
2200
2201	socket_log(__LINE__, sock, NULL, IOEVENT,
2202		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2203		   "internal_send: task got socket event %p", dev);
2204
2205	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2206	while (buffer != NULL) {
2207		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2208
2209		socket_log(__LINE__, sock, NULL, TRACE,
2210		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2211		   "free_buffer %p %p", buffer, buffer->buf);
2212
2213		HeapFree(hHeapHandle, 0, buffer->buf);
2214		HeapFree(hHeapHandle, 0, buffer);
2215		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2216	}
2217
2218	INSIST(sock->pending_iocp > 0);
2219	sock->pending_iocp--;
2220	INSIST(sock->pending_send > 0);
2221	sock->pending_send--;
2222
2223	/* If the event is no longer in the list we can just return */
2224	if (!senddone_is_active(sock, dev))
2225		goto done;
2226
2227	/*
2228	 * Set the error code and send things on its way.
2229	 */
2230	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2231	case DOIO_SOFT:
2232		break;
2233	case DOIO_HARD:
2234	case DOIO_SUCCESS:
2235		send_senddone_event(sock, &dev);
2236		break;
2237	}
2238
2239 done:
2240	maybe_free_socket(&sock, __LINE__);
2241}
2242
/*
 * These report whether the done event passed in is still on the relevant
 * list (or, for connect, is the one we're waiting for).  Using these ensures
 * we will not double-send an event.
 */
2248static isc_boolean_t
2249senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2250{
2251	isc_socketevent_t *ldev;
2252
2253	ldev = ISC_LIST_HEAD(sock->send_list);
2254	while (ldev != NULL && ldev != dev)
2255		ldev = ISC_LIST_NEXT(ldev, ev_link);
2256
2257	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2258}
2259
2260static isc_boolean_t
2261acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2262{
2263	isc_socket_newconnev_t *ldev;
2264
2265	ldev = ISC_LIST_HEAD(sock->accept_list);
2266	while (ldev != NULL && ldev != dev)
2267		ldev = ISC_LIST_NEXT(ldev, ev_link);
2268
2269	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2270}
2271
2272static isc_boolean_t
2273connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2274{
2275	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2276}
2277
2278/*
2279 * This is the I/O Completion Port Worker Function. It loops forever
2280 * waiting for I/O to complete and then forwards them for further
2281 * processing. There are a number of these in separate threads.
2282 */
2283static isc_threadresult_t WINAPI
2284SocketIoThread(LPVOID ThreadContext) {
2285	isc_socketmgr_t *manager = ThreadContext;
2286	BOOL bSuccess = FALSE;
2287	DWORD nbytes;
2288	IoCompletionInfo *lpo = NULL;
2289	isc_socket_t *sock = NULL;
2290	int request;
2291	struct msghdr *messagehdr = NULL;
2292	int errval;
2293	char strbuf[ISC_STRERRORSIZE];
2294	int errstatus;
2295
2296	REQUIRE(VALID_MANAGER(manager));
2297
2298	/*
2299	 * Set the thread priority high enough so I/O will
2300	 * preempt normal recv packet processing, but not
2301	 * higher than the timer sync thread.
2302	 */
2303	if (!SetThreadPriority(GetCurrentThread(),
2304			       THREAD_PRIORITY_ABOVE_NORMAL)) {
2305		errval = GetLastError();
2306		isc__strerror(errval, strbuf, sizeof(strbuf));
2307		FATAL_ERROR(__FILE__, __LINE__,
2308				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2309				ISC_MSG_FAILED,
2310				"Can't set thread priority: %s"),
2311				strbuf);
2312	}
2313
2314	/*
2315	 * Loop forever waiting on I/O Completions and then processing them
2316	 */
2317	while (TRUE) {
2318		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2319						     &nbytes, (LPDWORD)&sock,
2320						     (LPWSAOVERLAPPED *)&lpo,
2321						     INFINITE);
2322		if (lpo == NULL) /* Received request to exit */
2323			break;
2324
2325		REQUIRE(VALID_SOCKET(sock));
2326
2327		request = lpo->request_type;
2328
2329		errstatus = 0;
2330		if (!bSuccess) {
2331			isc_result_t isc_result;
2332
2333			/*
2334			 * Did the I/O operation complete?
2335			 */
2336			errstatus = WSAGetLastError();
2337			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2338
2339			LOCK(&sock->lock);
2340			CONSISTENT(sock);
2341			switch (request) {
2342			case SOCKET_RECV:
2343				INSIST(sock->pending_iocp > 0);
2344				sock->pending_iocp--;
2345				INSIST(sock->pending_recv > 0);
2346				sock->pending_recv--;
2347				send_recvdone_abort(sock, isc_result);
2348				if (isc_result == ISC_R_UNEXPECTED) {
2349					UNEXPECTED_ERROR(__FILE__, __LINE__,
2350						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2351						errstatus, isc_result);
2352				}
2353				break;
2354
2355			case SOCKET_SEND:
2356				INSIST(sock->pending_iocp > 0);
2357				sock->pending_iocp--;
2358				INSIST(sock->pending_send > 0);
2359				sock->pending_send--;
2360				if (senddone_is_active(sock, lpo->dev)) {
2361					lpo->dev->result = isc_result;
2362					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2363						"canceled_send");
2364					send_senddone_event(sock, &lpo->dev);
2365				}
2366				break;
2367
2368			case SOCKET_ACCEPT:
2369				INSIST(sock->pending_iocp > 0);
2370				sock->pending_iocp--;
2371				INSIST(sock->pending_accept > 0);
2372				sock->pending_accept--;
2373				if (acceptdone_is_active(sock, lpo->adev)) {
2374					closesocket(lpo->adev->newsocket->fd);
2375					lpo->adev->newsocket->fd = INVALID_SOCKET;
2376					lpo->adev->newsocket->references--;
2377					free_socket(&lpo->adev->newsocket, __LINE__);
2378					lpo->adev->result = isc_result;
2379					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2380						"canceled_accept");
2381					send_acceptdone_event(sock, &lpo->adev);
2382				}
2383				break;
2384
2385			case SOCKET_CONNECT:
2386				INSIST(sock->pending_iocp > 0);
2387				sock->pending_iocp--;
2388				INSIST(sock->pending_connect == 1);
2389				sock->pending_connect = 0;
2390				if (connectdone_is_active(sock, lpo->cdev)) {
2391					lpo->cdev->result = isc_result;
2392					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2393						"canceled_connect");
2394					send_connectdone_event(sock, &lpo->cdev);
2395				}
2396				break;
2397			}
2398			maybe_free_socket(&sock, __LINE__);
2399
2400			if (lpo != NULL)
2401				HeapFree(hHeapHandle, 0, lpo);
2402			continue;
2403		}
2404
2405		messagehdr = &lpo->messagehdr;
2406
2407		switch (request) {
2408		case SOCKET_RECV:
2409			internal_recv(sock, nbytes);
2410			break;
2411		case SOCKET_SEND:
2412			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2413			break;
2414		case SOCKET_ACCEPT:
2415			internal_accept(sock, lpo, errstatus);
2416			break;
2417		case SOCKET_CONNECT:
2418			internal_connect(sock, lpo, errstatus);
2419			break;
2420		}
2421
2422		if (lpo != NULL)
2423			HeapFree(hHeapHandle, 0, lpo);
2424	}
2425
2426	/*
2427	 * Exit Completion Port Thread
2428	 */
2429	manager_log(manager, TRACE,
2430		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2431				   ISC_MSG_EXITING, "SocketIoThread exiting"));
2432	return ((isc_threadresult_t)0);
2433}
2434
2435/*
2436 * Create a new socket manager.
2437 */
2438isc_result_t
2439isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2440	return (isc_socketmgr_create2(mctx, managerp, 0));
2441}
2442
2443isc_result_t
2444isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2445		     unsigned int maxsocks)
2446{
2447	isc_socketmgr_t *manager;
2448	isc_result_t result;
2449
2450	REQUIRE(managerp != NULL && *managerp == NULL);
2451
2452	if (maxsocks != 0)
2453		return (ISC_R_NOTIMPLEMENTED);
2454
2455	manager = isc_mem_get(mctx, sizeof(*manager));
2456	if (manager == NULL)
2457		return (ISC_R_NOMEMORY);
2458
2459	InitSockets();
2460
2461	manager->magic = SOCKET_MANAGER_MAGIC;
2462	manager->mctx = NULL;
2463	manager->stats = NULL;
2464	ISC_LIST_INIT(manager->socklist);
2465	result = isc_mutex_init(&manager->lock);
2466	if (result != ISC_R_SUCCESS) {
2467		isc_mem_put(mctx, manager, sizeof(*manager));
2468		return (result);
2469	}
2470	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2471		DESTROYLOCK(&manager->lock);
2472		isc_mem_put(mctx, manager, sizeof(*manager));
2473		UNEXPECTED_ERROR(__FILE__, __LINE__,
2474				 "isc_condition_init() %s",
2475				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2476						ISC_MSG_FAILED, "failed"));
2477		return (ISC_R_UNEXPECTED);
2478	}
2479
2480	isc_mem_attach(mctx, &manager->mctx);
2481
2482	iocompletionport_init(manager);	/* Create the Completion Ports */
2483
2484	manager->bShutdown = ISC_FALSE;
2485	manager->totalSockets = 0;
2486	manager->iocp_total = 0;
2487
2488	*managerp = manager;
2489
2490	return (ISC_R_SUCCESS);
2491}
2492
2493isc_result_t
2494isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2495	REQUIRE(VALID_MANAGER(manager));
2496	REQUIRE(nsockp != NULL);
2497
2498	return (ISC_R_NOTIMPLEMENTED);
2499}
2500
2501void
2502isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2503	REQUIRE(VALID_MANAGER(manager));
2504	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2505	REQUIRE(manager->stats == NULL);
2506	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2507
2508	isc_stats_attach(stats, &manager->stats);
2509}
2510
2511void
2512isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2513	isc_socketmgr_t *manager;
2514	int i;
2515	isc_mem_t *mctx;
2516
2517	/*
2518	 * Destroy a socket manager.
2519	 */
2520
2521	REQUIRE(managerp != NULL);
2522	manager = *managerp;
2523	REQUIRE(VALID_MANAGER(manager));
2524
2525	LOCK(&manager->lock);
2526
2527	/*
2528	 * Wait for all sockets to be destroyed.
2529	 */
2530	while (!ISC_LIST_EMPTY(manager->socklist)) {
2531		manager_log(manager, CREATION,
2532			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2533					   ISC_MSG_SOCKETSREMAIN,
2534					   "sockets exist"));
2535		WAIT(&manager->shutdown_ok, &manager->lock);
2536	}
2537
2538	UNLOCK(&manager->lock);
2539
2540	/*
2541	 * Here, we need to had some wait code for the completion port
2542	 * thread.
2543	 */
2544	signal_iocompletionport_exit(manager);
2545	manager->bShutdown = ISC_TRUE;
2546
2547	/*
2548	 * Wait for threads to exit.
2549	 */
2550	for (i = 0; i < manager->maxIOCPThreads; i++) {
2551		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2552			NULL) != ISC_R_SUCCESS)
2553			UNEXPECTED_ERROR(__FILE__, __LINE__,
2554				 "isc_thread_join() for Completion Port %s",
2555				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2556						ISC_MSG_FAILED, "failed"));
2557	}
2558	/*
2559	 * Clean up.
2560	 */
2561
2562	CloseHandle(manager->hIoCompletionPort);
2563
2564	(void)isc_condition_destroy(&manager->shutdown_ok);
2565
2566	DESTROYLOCK(&manager->lock);
2567	if (manager->stats != NULL)
2568		isc_stats_detach(&manager->stats);
2569	manager->magic = 0;
2570	mctx= manager->mctx;
2571	isc_mem_put(mctx, manager, sizeof(*manager));
2572
2573	isc_mem_detach(&mctx);
2574
2575	*managerp = NULL;
2576}
2577
2578static void
2579queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2580{
2581	isc_task_t *ntask = NULL;
2582
2583	isc_task_attach(task, &ntask);
2584	dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2585
2586	/*
2587	 * Enqueue the request.
2588	 */
2589	INSIST(!ISC_LINK_LINKED(dev, ev_link));
2590	ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2591
2592	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2593		   "queue_receive_event: event %p -> task %p",
2594		   dev, ntask);
2595}
2596
2597/*
2598 * Check the pending receive queue, and if we have data pending, give it to this
2599 * caller.  If we have none, queue an I/O request.  If this caller is not the first
2600 * on the list, then we will just queue this event and return.
2601 *
2602 * Caller must have the socket locked.
2603 */
2604static isc_result_t
2605socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2606	    unsigned int flags)
2607{
2608	int cc = 0;
2609	isc_task_t *ntask = NULL;
2610	isc_result_t result = ISC_R_SUCCESS;
2611	int recv_errno = 0;
2612
2613	dev->ev_sender = task;
2614
2615	if (sock->fd == INVALID_SOCKET)
2616		return (ISC_R_EOF);
2617
2618	/*
2619	 * Queue our event on the list of things to do.  Call our function to
2620	 * attempt to fill buffers as much as possible, and return done events.
2621	 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2622	 * here and tell our caller that we could not satisfy it immediately.
2623	 */
2624	queue_receive_event(sock, task, dev);
2625	if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2626		result = ISC_R_INPROGRESS;
2627
2628	completeio_recv(sock);
2629
2630	/*
2631	 * If there are more receivers waiting for data, queue another receive
2632	 * here.  If the
2633	 */
2634	queue_receive_request(sock);
2635
2636	return (result);
2637}
2638
2639isc_result_t
2640isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2641		 unsigned int minimum, isc_task_t *task,
2642		 isc_taskaction_t action, const void *arg)
2643{
2644	isc_socketevent_t *dev;
2645	isc_socketmgr_t *manager;
2646	unsigned int iocount;
2647	isc_buffer_t *buffer;
2648	isc_result_t ret;
2649
2650	REQUIRE(VALID_SOCKET(sock));
2651	LOCK(&sock->lock);
2652	CONSISTENT(sock);
2653
2654	/*
2655	 * Make sure that the socket is not closed.  XXXMLG change error here?
2656	 */
2657	if (sock->fd == INVALID_SOCKET) {
2658		UNLOCK(&sock->lock);
2659		return (ISC_R_CONNREFUSED);
2660	}
2661
2662	REQUIRE(buflist != NULL);
2663	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2664	REQUIRE(task != NULL);
2665	REQUIRE(action != NULL);
2666
2667	manager = sock->manager;
2668	REQUIRE(VALID_MANAGER(manager));
2669
2670	iocount = isc_bufferlist_availablecount(buflist);
2671	REQUIRE(iocount > 0);
2672
2673	INSIST(sock->bound);
2674
2675	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2676	if (dev == NULL) {
2677		UNLOCK(&sock->lock);
2678		return (ISC_R_NOMEMORY);
2679	}
2680
2681	/*
2682	 * UDP sockets are always partial read
2683	 */
2684	if (sock->type == isc_sockettype_udp)
2685		dev->minimum = 1;
2686	else {
2687		if (minimum == 0)
2688			dev->minimum = iocount;
2689		else
2690			dev->minimum = minimum;
2691	}
2692
2693	/*
2694	 * Move each buffer from the passed in list to our internal one.
2695	 */
2696	buffer = ISC_LIST_HEAD(*buflist);
2697	while (buffer != NULL) {
2698		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2699		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2700		buffer = ISC_LIST_HEAD(*buflist);
2701	}
2702
2703	ret = socket_recv(sock, dev, task, 0);
2704
2705	UNLOCK(&sock->lock);
2706	return (ret);
2707}
2708
2709isc_result_t
2710isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2711		isc_task_t *task, isc_taskaction_t action, const void *arg)
2712{
2713	isc_socketevent_t *dev;
2714	isc_socketmgr_t *manager;
2715	isc_result_t ret;
2716
2717	REQUIRE(VALID_SOCKET(sock));
2718	LOCK(&sock->lock);
2719	CONSISTENT(sock);
2720
2721	/*
2722	 * make sure that the socket's not closed
2723	 */
2724	if (sock->fd == INVALID_SOCKET) {
2725		UNLOCK(&sock->lock);
2726		return (ISC_R_CONNREFUSED);
2727	}
2728	REQUIRE(action != NULL);
2729
2730	manager = sock->manager;
2731	REQUIRE(VALID_MANAGER(manager));
2732
2733	INSIST(sock->bound);
2734
2735	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2736	if (dev == NULL) {
2737		UNLOCK(&sock->lock);
2738		return (ISC_R_NOMEMORY);
2739	}
2740
2741	ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2742	UNLOCK(&sock->lock);
2743	return (ret);
2744}
2745
2746isc_result_t
2747isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2748		 unsigned int minimum, isc_task_t *task,
2749		 isc_socketevent_t *event, unsigned int flags)
2750{
2751	isc_result_t ret;
2752
2753	REQUIRE(VALID_SOCKET(sock));
2754	LOCK(&sock->lock);
2755	CONSISTENT(sock);
2756
2757	event->result = ISC_R_UNEXPECTED;
2758	event->ev_sender = sock;
2759	/*
2760	 * make sure that the socket's not closed
2761	 */
2762	if (sock->fd == INVALID_SOCKET) {
2763		UNLOCK(&sock->lock);
2764		return (ISC_R_CONNREFUSED);
2765	}
2766
2767	ISC_LIST_INIT(event->bufferlist);
2768	event->region = *region;
2769	event->n = 0;
2770	event->offset = 0;
2771	event->attributes = 0;
2772
2773	/*
2774	 * UDP sockets are always partial read.
2775	 */
2776	if (sock->type == isc_sockettype_udp)
2777		event->minimum = 1;
2778	else {
2779		if (minimum == 0)
2780			event->minimum = region->length;
2781		else
2782			event->minimum = minimum;
2783	}
2784
2785	ret = socket_recv(sock, event, task, flags);
2786	UNLOCK(&sock->lock);
2787	return (ret);
2788}
2789
2790/*
2791 * Caller must have the socket locked.
2792 */
2793static isc_result_t
2794socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2795	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2796	    unsigned int flags)
2797{
2798	int io_state;
2799	int send_errno = 0;
2800	int cc = 0;
2801	isc_task_t *ntask = NULL;
2802	isc_result_t result = ISC_R_SUCCESS;
2803
2804	dev->ev_sender = task;
2805
2806	set_dev_address(address, sock, dev);
2807	if (pktinfo != NULL) {
2808		socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2809			   ISC_MSG_PKTINFOPROVIDED,
2810			   "pktinfo structure provided, ifindex %u (set to 0)",
2811			   pktinfo->ipi6_ifindex);
2812
2813		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2814		dev->pktinfo = *pktinfo;
2815		/*
2816		 * Set the pktinfo index to 0 here, to let the kernel decide
2817		 * what interface it should send on.
2818		 */
2819		dev->pktinfo.ipi6_ifindex = 0;
2820	}
2821
2822	io_state = startio_send(sock, dev, &cc, &send_errno);
2823	switch (io_state) {
2824	case DOIO_PENDING:	/* I/O started. Nothing more to do */
2825	case DOIO_SOFT:
2826		/*
2827		 * We couldn't send all or part of the request right now, so
2828		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2829		 */
2830		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2831			isc_task_attach(task, &ntask);
2832			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2833
2834			/*
2835			 * Enqueue the request.
2836			 */
2837			INSIST(!ISC_LINK_LINKED(dev, ev_link));
2838			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2839
2840			socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2841				   "socket_send: event %p -> task %p",
2842				   dev, ntask);
2843
2844			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2845				result = ISC_R_INPROGRESS;
2846			break;
2847		}
2848
2849	case DOIO_SUCCESS:
2850		break;
2851	}
2852
2853	return (result);
2854}
2855
2856isc_result_t
2857isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2858		isc_task_t *task, isc_taskaction_t action, const void *arg)
2859{
2860	/*
2861	 * REQUIRE() checking is performed in isc_socket_sendto().
2862	 */
2863	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
2864				  NULL));
2865}
2866
2867isc_result_t
2868isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
2869		  isc_task_t *task, isc_taskaction_t action, const void *arg,
2870		  isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2871{
2872	isc_socketevent_t *dev;
2873	isc_socketmgr_t *manager;
2874	isc_result_t ret;
2875
2876	REQUIRE(VALID_SOCKET(sock));
2877	REQUIRE(sock->type != isc_sockettype_fdwatch);
2878
2879	LOCK(&sock->lock);
2880	CONSISTENT(sock);
2881
2882	/*
2883	 * make sure that the socket's not closed
2884	 */
2885	if (sock->fd == INVALID_SOCKET) {
2886		UNLOCK(&sock->lock);
2887		return (ISC_R_CONNREFUSED);
2888	}
2889	REQUIRE(region != NULL);
2890	REQUIRE(task != NULL);
2891	REQUIRE(action != NULL);
2892
2893	manager = sock->manager;
2894	REQUIRE(VALID_MANAGER(manager));
2895
2896	INSIST(sock->bound);
2897
2898	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2899	if (dev == NULL) {
2900		UNLOCK(&sock->lock);
2901		return (ISC_R_NOMEMORY);
2902	}
2903	dev->region = *region;
2904
2905	ret = socket_send(sock, dev, task, address, pktinfo, 0);
2906	UNLOCK(&sock->lock);
2907	return (ret);
2908}
2909
2910isc_result_t
2911isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2912		 isc_task_t *task, isc_taskaction_t action, const void *arg)
2913{
2914	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
2915				   NULL));
2916}
2917
2918isc_result_t
2919isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
2920		   isc_task_t *task, isc_taskaction_t action, const void *arg,
2921		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2922{
2923	isc_socketevent_t *dev;
2924	isc_socketmgr_t *manager;
2925	unsigned int iocount;
2926	isc_buffer_t *buffer;
2927	isc_result_t ret;
2928
2929	REQUIRE(VALID_SOCKET(sock));
2930
2931	LOCK(&sock->lock);
2932	CONSISTENT(sock);
2933
2934	/*
2935	 * make sure that the socket's not closed
2936	 */
2937	if (sock->fd == INVALID_SOCKET) {
2938		UNLOCK(&sock->lock);
2939		return (ISC_R_CONNREFUSED);
2940	}
2941	REQUIRE(buflist != NULL);
2942	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2943	REQUIRE(task != NULL);
2944	REQUIRE(action != NULL);
2945
2946	manager = sock->manager;
2947	REQUIRE(VALID_MANAGER(manager));
2948
2949	iocount = isc_bufferlist_usedcount(buflist);
2950	REQUIRE(iocount > 0);
2951
2952	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2953	if (dev == NULL) {
2954		UNLOCK(&sock->lock);
2955		return (ISC_R_NOMEMORY);
2956	}
2957
2958	/*
2959	 * Move each buffer from the passed in list to our internal one.
2960	 */
2961	buffer = ISC_LIST_HEAD(*buflist);
2962	while (buffer != NULL) {
2963		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2964		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2965		buffer = ISC_LIST_HEAD(*buflist);
2966	}
2967
2968	ret = socket_send(sock, dev, task, address, pktinfo, 0);
2969	UNLOCK(&sock->lock);
2970	return (ret);
2971}
2972
2973isc_result_t
2974isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
2975		   isc_task_t *task,
2976		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2977		   isc_socketevent_t *event, unsigned int flags)
2978{
2979	isc_result_t ret;
2980
2981	REQUIRE(VALID_SOCKET(sock));
2982	LOCK(&sock->lock);
2983	CONSISTENT(sock);
2984
2985	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
2986	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
2987		REQUIRE(sock->type == isc_sockettype_udp);
2988	event->ev_sender = sock;
2989	event->result = ISC_R_UNEXPECTED;
2990	/*
2991	 * make sure that the socket's not closed
2992	 */
2993	if (sock->fd == INVALID_SOCKET) {
2994		UNLOCK(&sock->lock);
2995		return (ISC_R_CONNREFUSED);
2996	}
2997	ISC_LIST_INIT(event->bufferlist);
2998	event->region = *region;
2999	event->n = 0;
3000	event->offset = 0;
3001	event->attributes = 0;
3002
3003	ret = socket_send(sock, event, task, address, pktinfo, flags);
3004	UNLOCK(&sock->lock);
3005	return (ret);
3006}
3007
3008isc_result_t
3009isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3010		unsigned int options) {
3011	int bind_errno;
3012	char strbuf[ISC_STRERRORSIZE];
3013	int on = 1;
3014
3015	REQUIRE(VALID_SOCKET(sock));
3016	LOCK(&sock->lock);
3017	CONSISTENT(sock);
3018
3019	/*
3020	 * make sure that the socket's not closed
3021	 */
3022	if (sock->fd == INVALID_SOCKET) {
3023		UNLOCK(&sock->lock);
3024		return (ISC_R_CONNREFUSED);
3025	}
3026
3027	INSIST(!sock->bound);
3028
3029	if (sock->pf != sockaddr->type.sa.sa_family) {
3030		UNLOCK(&sock->lock);
3031		return (ISC_R_FAMILYMISMATCH);
3032	}
3033	/*
3034	 * Only set SO_REUSEADDR when we want a specific port.
3035	 */
3036	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3037	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3038	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
3039		       sizeof(on)) < 0) {
3040		UNEXPECTED_ERROR(__FILE__, __LINE__,
3041				 "setsockopt(%d) %s", sock->fd,
3042				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3043						ISC_MSG_FAILED, "failed"));
3044		/* Press on... */
3045	}
3046	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3047		bind_errno = WSAGetLastError();
3048		UNLOCK(&sock->lock);
3049		switch (bind_errno) {
3050		case WSAEACCES:
3051			return (ISC_R_NOPERM);
3052		case WSAEADDRNOTAVAIL:
3053			return (ISC_R_ADDRNOTAVAIL);
3054		case WSAEADDRINUSE:
3055			return (ISC_R_ADDRINUSE);
3056		case WSAEINVAL:
3057			return (ISC_R_BOUND);
3058		default:
3059			isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3060			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3061					 strbuf);
3062			return (ISC_R_UNEXPECTED);
3063		}
3064	}
3065
3066	socket_log(__LINE__, sock, sockaddr, TRACE,
3067		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3068	sock->bound = 1;
3069
3070	UNLOCK(&sock->lock);
3071	return (ISC_R_SUCCESS);
3072}
3073
3074isc_result_t
3075isc_socket_filter(isc_socket_t *sock, const char *filter) {
3076	UNUSED(sock);
3077	UNUSED(filter);
3078
3079	REQUIRE(VALID_SOCKET(sock));
3080	return (ISC_R_NOTIMPLEMENTED);
3081}
3082
3083/*
3084 * Set up to listen on a given socket.  We do this by creating an internal
3085 * event that will be dispatched when the socket has read activity.  The
3086 * watcher will send the internal event to the task when there is a new
3087 * connection.
3088 *
3089 * Unlike in read, we don't preallocate a done event here.  Every time there
3090 * is a new connection we'll have to allocate a new one anyway, so we might
3091 * as well keep things simple rather than having to track them.
3092 */
3093isc_result_t
3094isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
3095	char strbuf[ISC_STRERRORSIZE];
3096
3097	REQUIRE(VALID_SOCKET(sock));
3098
3099	LOCK(&sock->lock);
3100	CONSISTENT(sock);
3101
3102	/*
3103	 * make sure that the socket's not closed
3104	 */
3105	if (sock->fd == INVALID_SOCKET) {
3106		UNLOCK(&sock->lock);
3107		return (ISC_R_CONNREFUSED);
3108	}
3109
3110	REQUIRE(!sock->listener);
3111	REQUIRE(sock->bound);
3112	REQUIRE(sock->type == isc_sockettype_tcp);
3113
3114	if (backlog == 0)
3115		backlog = SOMAXCONN;
3116
3117	if (listen(sock->fd, (int)backlog) < 0) {
3118		UNLOCK(&sock->lock);
3119		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3120
3121		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3122
3123		return (ISC_R_UNEXPECTED);
3124	}
3125
3126	socket_log(__LINE__, sock, NULL, TRACE,
3127		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3128	sock->listener = 1;
3129	_set_state(sock, SOCK_LISTEN);
3130
3131	UNLOCK(&sock->lock);
3132	return (ISC_R_SUCCESS);
3133}
3134
3135/*
3136 * This should try to do aggressive accept() XXXMLG
3137 */
3138isc_result_t
3139isc_socket_accept(isc_socket_t *sock,
3140		  isc_task_t *task, isc_taskaction_t action, const void *arg)
3141{
3142	isc_socket_newconnev_t *adev;
3143	isc_socketmgr_t *manager;
3144	isc_task_t *ntask = NULL;
3145	isc_socket_t *nsock;
3146	isc_result_t result;
3147	IoCompletionInfo *lpo;
3148
3149	REQUIRE(VALID_SOCKET(sock));
3150
3151	manager = sock->manager;
3152	REQUIRE(VALID_MANAGER(manager));
3153
3154	LOCK(&sock->lock);
3155	CONSISTENT(sock);
3156
3157	/*
3158	 * make sure that the socket's not closed
3159	 */
3160	if (sock->fd == INVALID_SOCKET) {
3161		UNLOCK(&sock->lock);
3162		return (ISC_R_CONNREFUSED);
3163	}
3164
3165	REQUIRE(sock->listener);
3166
3167	/*
3168	 * Sender field is overloaded here with the task we will be sending
3169	 * this event to.  Just before the actual event is delivered the
3170	 * actual ev_sender will be touched up to be the socket.
3171	 */
3172	adev = (isc_socket_newconnev_t *)
3173		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3174				   action, arg, sizeof(*adev));
3175	if (adev == NULL) {
3176		UNLOCK(&sock->lock);
3177		return (ISC_R_NOMEMORY);
3178	}
3179	ISC_LINK_INIT(adev, ev_link);
3180
3181	result = allocate_socket(manager, sock->type, &nsock);
3182	if (result != ISC_R_SUCCESS) {
3183		isc_event_free((isc_event_t **)&adev);
3184		UNLOCK(&sock->lock);
3185		return (result);
3186	}
3187
3188	/*
3189	 * AcceptEx() requires we pass in a socket.
3190	 */
3191	nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3192	if (nsock->fd == INVALID_SOCKET) {
3193		free_socket(&nsock, __LINE__);
3194		isc_event_free((isc_event_t **)&adev);
3195		UNLOCK(&sock->lock);
3196		return (ISC_R_FAILURE); // XXXMLG need real error message
3197	}
3198
3199	/*
3200	 * Attach to socket and to task.
3201	 */
3202	isc_task_attach(task, &ntask);
3203	nsock->references++;
3204
3205	adev->ev_sender = ntask;
3206	adev->newsocket = nsock;
3207	_set_state(nsock, SOCK_ACCEPT);
3208
3209	/*
3210	 * Queue io completion for an accept().
3211	 */
3212	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3213					    HEAP_ZERO_MEMORY,
3214					    sizeof(IoCompletionInfo));
3215	RUNTIME_CHECK(lpo != NULL);
3216	lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3217		(sizeof(SOCKADDR_STORAGE) + 16) * 2);
3218	RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3219
3220	lpo->adev = adev;
3221	lpo->request_type = SOCKET_ACCEPT;
3222
3223	ISCAcceptEx(sock->fd,
3224		    nsock->fd,				/* Accepted Socket */
3225		    lpo->acceptbuffer,			/* Buffer for initial Recv */
3226		    0,					/* Length of Buffer */
3227		    sizeof(SOCKADDR_STORAGE) + 16,		/* Local address length + 16 */
3228		    sizeof(SOCKADDR_STORAGE) + 16,		/* Remote address lengh + 16 */
3229		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
3230		    (LPOVERLAPPED)lpo			/* Overlapped structure */
3231		    );
3232	iocompletionport_update(nsock);
3233
3234	socket_log(__LINE__, sock, NULL, TRACE,
3235		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3236		   "accepting for nsock %p fd %d", nsock, nsock->fd);
3237
3238	/*
3239	 * Enqueue the event
3240	 */
3241	ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3242	sock->pending_accept++;
3243	sock->pending_iocp++;
3244
3245	UNLOCK(&sock->lock);
3246	return (ISC_R_SUCCESS);
3247}
3248
3249isc_result_t
3250isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3251		   isc_task_t *task, isc_taskaction_t action, const void *arg)
3252{
3253	char strbuf[ISC_STRERRORSIZE];
3254	isc_socket_connev_t *cdev;
3255	isc_task_t *ntask = NULL;
3256	isc_socketmgr_t *manager;
3257	IoCompletionInfo *lpo;
3258	int bind_errno;
3259
3260	REQUIRE(VALID_SOCKET(sock));
3261	REQUIRE(addr != NULL);
3262	REQUIRE(task != NULL);
3263	REQUIRE(action != NULL);
3264
3265	manager = sock->manager;
3266	REQUIRE(VALID_MANAGER(manager));
3267	REQUIRE(addr != NULL);
3268
3269	if (isc_sockaddr_ismulticast(addr))
3270		return (ISC_R_MULTICAST);
3271
3272	LOCK(&sock->lock);
3273	CONSISTENT(sock);
3274
3275	/*
3276	 * make sure that the socket's not closed
3277	 */
3278	if (sock->fd == INVALID_SOCKET) {
3279		UNLOCK(&sock->lock);
3280		return (ISC_R_CONNREFUSED);
3281	}
3282
3283	/*
3284	 * Windows sockets won't connect unless the socket is bound.
3285	 */
3286	if (!sock->bound) {
3287		isc_sockaddr_t any;
3288
3289		isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3290		if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3291			bind_errno = WSAGetLastError();
3292			UNLOCK(&sock->lock);
3293			switch (bind_errno) {
3294			case WSAEACCES:
3295				return (ISC_R_NOPERM);
3296			case WSAEADDRNOTAVAIL:
3297				return (ISC_R_ADDRNOTAVAIL);
3298			case WSAEADDRINUSE:
3299				return (ISC_R_ADDRINUSE);
3300			case WSAEINVAL:
3301				return (ISC_R_BOUND);
3302			default:
3303				isc__strerror(bind_errno, strbuf,
3304					      sizeof(strbuf));
3305				UNEXPECTED_ERROR(__FILE__, __LINE__,
3306						 "bind: %s", strbuf);
3307				return (ISC_R_UNEXPECTED);
3308			}
3309		}
3310		sock->bound = 1;
3311	}
3312
3313	REQUIRE(!sock->pending_connect);
3314
3315	cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3316							ISC_SOCKEVENT_CONNECT,
3317							action,	arg,
3318							sizeof(*cdev));
3319	if (cdev == NULL) {
3320		UNLOCK(&sock->lock);
3321		return (ISC_R_NOMEMORY);
3322	}
3323	ISC_LINK_INIT(cdev, ev_link);
3324
3325	if (sock->type == isc_sockettype_tcp) {
3326		/*
3327		 * Queue io completion for an accept().
3328		 */
3329		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3330						    HEAP_ZERO_MEMORY,
3331						    sizeof(IoCompletionInfo));
3332		lpo->cdev = cdev;
3333		lpo->request_type = SOCKET_CONNECT;
3334
3335		sock->address = *addr;
3336		ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3337			NULL, 0, NULL, (LPOVERLAPPED)lpo);
3338
3339		/*
3340		 * Attach to task.
3341		 */
3342		isc_task_attach(task, &ntask);
3343		cdev->ev_sender = ntask;
3344
3345		sock->pending_connect = 1;
3346		_set_state(sock, SOCK_CONNECT);
3347
3348		/*
3349		 * Enqueue the request.
3350		 */
3351		sock->connect_ev = cdev;
3352		sock->pending_iocp++;
3353	} else {
3354		WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3355		cdev->result = ISC_R_SUCCESS;
3356		isc_task_send(task, (isc_event_t **)&cdev);
3357	}
3358	CONSISTENT(sock);
3359	UNLOCK(&sock->lock);
3360
3361	return (ISC_R_SUCCESS);
3362}
3363
3364isc_result_t
3365isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3366	isc_result_t result;
3367
3368	REQUIRE(VALID_SOCKET(sock));
3369	REQUIRE(addressp != NULL);
3370
3371	LOCK(&sock->lock);
3372	CONSISTENT(sock);
3373
3374	/*
3375	 * make sure that the socket's not closed
3376	 */
3377	if (sock->fd == INVALID_SOCKET) {
3378		UNLOCK(&sock->lock);
3379		return (ISC_R_CONNREFUSED);
3380	}
3381
3382	if (sock->connected) {
3383		*addressp = sock->address;
3384		result = ISC_R_SUCCESS;
3385	} else {
3386		result = ISC_R_NOTCONNECTED;
3387	}
3388
3389	UNLOCK(&sock->lock);
3390
3391	return (result);
3392}
3393
3394isc_result_t
3395isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3396	ISC_SOCKADDR_LEN_T len;
3397	isc_result_t result;
3398	char strbuf[ISC_STRERRORSIZE];
3399
3400	REQUIRE(VALID_SOCKET(sock));
3401	REQUIRE(addressp != NULL);
3402
3403	LOCK(&sock->lock);
3404	CONSISTENT(sock);
3405
3406	/*
3407	 * make sure that the socket's not closed
3408	 */
3409	if (sock->fd == INVALID_SOCKET) {
3410		UNLOCK(&sock->lock);
3411		return (ISC_R_CONNREFUSED);
3412	}
3413
3414	if (!sock->bound) {
3415		result = ISC_R_NOTBOUND;
3416		goto out;
3417	}
3418
3419	result = ISC_R_SUCCESS;
3420
3421	len = sizeof(addressp->type);
3422	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3423		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3424		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3425				 strbuf);
3426		result = ISC_R_UNEXPECTED;
3427		goto out;
3428	}
3429	addressp->length = (unsigned int)len;
3430
3431 out:
3432	UNLOCK(&sock->lock);
3433
3434	return (result);
3435}
3436
3437/*
3438 * Run through the list of events on this socket, and cancel the ones
3439 * queued for task "task" of type "how".  "how" is a bitmask.
3440 */
3441void
3442isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3443
3444	REQUIRE(VALID_SOCKET(sock));
3445
3446	/*
3447	 * Quick exit if there is nothing to do.  Don't even bother locking
3448	 * in this case.
3449	 */
3450	if (how == 0)
3451		return;
3452
3453	LOCK(&sock->lock);
3454	CONSISTENT(sock);
3455
3456	/*
3457	 * make sure that the socket's not closed
3458	 */
3459	if (sock->fd == INVALID_SOCKET) {
3460		UNLOCK(&sock->lock);
3461		return;
3462	}
3463
3464	/*
3465	 * All of these do the same thing, more or less.
3466	 * Each will:
3467	 *	o If the internal event is marked as "posted" try to
3468	 *	  remove it from the task's queue.  If this fails, mark it
3469	 *	  as canceled instead, and let the task clean it up later.
3470	 *	o For each I/O request for that task of that type, post
3471	 *	  its done event with status of "ISC_R_CANCELED".
3472	 *	o Reset any state needed.
3473	 */
3474
3475	if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3476		isc_socketevent_t      *dev;
3477		isc_socketevent_t      *next;
3478		isc_task_t	       *current_task;
3479
3480		dev = ISC_LIST_HEAD(sock->recv_list);
3481		while (dev != NULL) {
3482			current_task = dev->ev_sender;
3483			next = ISC_LIST_NEXT(dev, ev_link);
3484			if ((task == NULL) || (task == current_task)) {
3485				dev->result = ISC_R_CANCELED;
3486				send_recvdone_event(sock, &dev);
3487			}
3488			dev = next;
3489		}
3490	}
3491	how &= ~ISC_SOCKCANCEL_RECV;
3492
3493	if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3494		isc_socketevent_t      *dev;
3495		isc_socketevent_t      *next;
3496		isc_task_t	       *current_task;
3497
3498		dev = ISC_LIST_HEAD(sock->send_list);
3499
3500		while (dev != NULL) {
3501			current_task = dev->ev_sender;
3502			next = ISC_LIST_NEXT(dev, ev_link);
3503			if ((task == NULL) || (task == current_task)) {
3504				dev->result = ISC_R_CANCELED;
3505				send_senddone_event(sock, &dev);
3506			}
3507			dev = next;
3508		}
3509	}
3510	how &= ~ISC_SOCKCANCEL_SEND;
3511
3512	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3513	    && !ISC_LIST_EMPTY(sock->accept_list)) {
3514		isc_socket_newconnev_t *dev;
3515		isc_socket_newconnev_t *next;
3516		isc_task_t	       *current_task;
3517
3518		dev = ISC_LIST_HEAD(sock->accept_list);
3519		while (dev != NULL) {
3520			current_task = dev->ev_sender;
3521			next = ISC_LIST_NEXT(dev, ev_link);
3522
3523			if ((task == NULL) || (task == current_task)) {
3524
3525				dev->newsocket->references--;
3526				closesocket(dev->newsocket->fd);
3527				dev->newsocket->fd = INVALID_SOCKET;
3528				free_socket(&dev->newsocket, __LINE__);
3529
3530				dev->result = ISC_R_CANCELED;
3531				send_acceptdone_event(sock, &dev);
3532			}
3533
3534			dev = next;
3535		}
3536	}
3537	how &= ~ISC_SOCKCANCEL_ACCEPT;
3538
3539	/*
3540	 * Connecting is not a list.
3541	 */
3542	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3543	    && sock->connect_ev != NULL) {
3544		isc_socket_connev_t    *dev;
3545		isc_task_t	       *current_task;
3546
3547		INSIST(sock->pending_connect);
3548
3549		dev = sock->connect_ev;
3550		current_task = dev->ev_sender;
3551
3552		if ((task == NULL) || (task == current_task)) {
3553			closesocket(sock->fd);
3554			sock->fd = INVALID_SOCKET;
3555			_set_state(sock, SOCK_CLOSED);
3556
3557			sock->connect_ev = NULL;
3558			dev->result = ISC_R_CANCELED;
3559			send_connectdone_event(sock, &dev);
3560		}
3561	}
3562	how &= ~ISC_SOCKCANCEL_CONNECT;
3563
3564	maybe_free_socket(&sock, __LINE__);
3565}
3566
3567isc_sockettype_t
3568isc_socket_gettype(isc_socket_t *sock) {
3569	isc_sockettype_t type;
3570
3571	REQUIRE(VALID_SOCKET(sock));
3572
3573	LOCK(&sock->lock);
3574
3575	/*
3576	 * make sure that the socket's not closed
3577	 */
3578	if (sock->fd == INVALID_SOCKET) {
3579		UNLOCK(&sock->lock);
3580		return (ISC_R_CONNREFUSED);
3581	}
3582
3583	type = sock->type;
3584	UNLOCK(&sock->lock);
3585	return (type);
3586}
3587
3588isc_boolean_t
3589isc_socket_isbound(isc_socket_t *sock) {
3590	isc_boolean_t val;
3591
3592	REQUIRE(VALID_SOCKET(sock));
3593
3594	LOCK(&sock->lock);
3595	CONSISTENT(sock);
3596
3597	/*
3598	 * make sure that the socket's not closed
3599	 */
3600	if (sock->fd == INVALID_SOCKET) {
3601		UNLOCK(&sock->lock);
3602		return (ISC_FALSE);
3603	}
3604
3605	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3606	UNLOCK(&sock->lock);
3607
3608	return (val);
3609}
3610
3611void
3612isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3613#if defined(IPV6_V6ONLY)
3614	int onoff = yes ? 1 : 0;
3615#else
3616	UNUSED(yes);
3617#endif
3618
3619	REQUIRE(VALID_SOCKET(sock));
3620
3621#ifdef IPV6_V6ONLY
3622	if (sock->pf == AF_INET6) {
3623		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3624				 (void *)&onoff, sizeof(onoff));
3625	}
3626#endif
3627}
3628
3629void
3630isc_socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3631	UNUSED(addr);
3632	UNUSED(active);
3633}
3634
3635isc_result_t
3636isc_socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3637		    isc_uint32_t owner,	isc_uint32_t group)
3638{
3639	UNUSED(addr);
3640	UNUSED(perm);
3641	UNUSED(owner);
3642	UNUSED(group);
3643	return (ISC_R_NOTIMPLEMENTED);
3644}
3645
3646void
3647isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3648
3649	/*
3650	 * Name 'socket'.
3651	 */
3652
3653	REQUIRE(VALID_SOCKET(socket));
3654
3655	LOCK(&socket->lock);
3656	memset(socket->name, 0, sizeof(socket->name));
3657	strncpy(socket->name, name, sizeof(socket->name) - 1);
3658	socket->tag = tag;
3659	UNLOCK(&socket->lock);
3660}
3661
3662const char *
3663isc_socket_getname(isc_socket_t *socket) {
3664	return (socket->name);
3665}
3666
3667void *
3668isc_socket_gettag(isc_socket_t *socket) {
3669	return (socket->tag);
3670}
3671
3672void
3673isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3674	UNUSED(manager);
3675	UNUSED(reserved);
3676}
3677