1/*
2 * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 2000-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: socket.c,v 1.70.54.4 2009/01/29 22:40:36 jinmei Exp $ */
19
20/* This code uses functions which are only available on Server 2003 and
21 * higher, and Windows XP and higher.
22 *
23 * This code is by nature multithreaded and takes advantage of various
24 * features to pass on information through the completion port for
25 * when I/O is completed.  All sends, receives, accepts, and connects are
26 * completed through the completion port.
27 *
28 * The number of Completion Port Worker threads used is the total number
29 * of CPU's + 1. This increases the likelihood that a Worker Thread is
30 * available for processing a completed request.
31 *
32 * XXXPDM 5 August, 2002
33 */
34
35#define MAKE_EXTERNAL 1
36#include <config.h>
37
38#include <sys/types.h>
39
40#ifndef _WINSOCKAPI_
41#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
42#endif
43
44#include <errno.h>
45#include <stddef.h>
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
49#include <io.h>
50#include <fcntl.h>
51#include <process.h>
52
53#include <isc/buffer.h>
54#include <isc/bufferlist.h>
55#include <isc/condition.h>
56#include <isc/list.h>
57#include <isc/log.h>
58#include <isc/mem.h>
59#include <isc/msgs.h>
60#include <isc/mutex.h>
61#include <isc/net.h>
62#include <isc/once.h>
63#include <isc/os.h>
64#include <isc/platform.h>
65#include <isc/print.h>
66#include <isc/region.h>
67#include <isc/socket.h>
68#include <isc/stats.h>
69#include <isc/strerror.h>
70#include <isc/syslog.h>
71#include <isc/task.h>
72#include <isc/thread.h>
73#include <isc/util.h>
74#include <isc/win32os.h>
75
76#include <mswsock.h>
77
78#include "errno2result.h"
79
80/*
81 * How in the world can Microsoft exist with APIs like this?
82 * We can't actually call this directly, because it turns out
83 * no library exports this function.  Instead, we need to
84 * issue a runtime call to get the address.
85 */
/*
 * Pointers to the Winsock extension functions.  They are filled in by
 * initialise() through WSAIoctl(SIO_GET_EXTENSION_FUNCTION_POINTER).
 */
LPFN_CONNECTEX ISCConnectEx;
LPFN_ACCEPTEX ISCAcceptEx;
LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
89
90/*
91 * Run expensive internal consistency checks.
92 */
#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
/* Checks enabled: CONSISTENT() calls the checker function. */
#define CONSISTENT(sock) consistent(sock)
#else
/* Checks disabled: CONSISTENT() expands to an empty statement. */
#define CONSISTENT(sock) do {} while (0)
#endif
/* Declared unconditionally, whether or not CONSISTENT() calls it. */
static void consistent(isc_socket_t *sock);
99
100/*
101 * Define this macro to control the behavior of connection
102 * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
103 * for details.
104 * NOTE: This requires that Windows 2000 systems install Service Pack 2
105 * or later.
106 */
#ifndef SIO_UDP_CONNRESET
/* Fallback for headers that do not already define the ioctl code. */
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
#endif

/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
118
119/*
120 * Define what the possible "soft" errors can be.  These are non-fatal returns
121 * of various network related functions, like recv() and so on.
122 */
/*
 * True when 'e' is one of the non-fatal error codes listed below.
 * Note that an error value of 0 also counts as "soft".
 */
#define SOFT_ERROR(e)	((e) == WSAEINTR || \
			 (e) == WSAEWOULDBLOCK || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == EAGAIN || \
			 (e) == 0)

/*
 * Pending errors are not really errors and should be
 * kept separate.  True for WSA_IO_PENDING and for 0.
 */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)

/* Status codes describing the outcome of an I/O attempt. */
#define DOIO_SUCCESS	  0       /* i/o ok, event sent */
#define DOIO_SOFT	  1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD	  2       /* i/o error, event sent */
#define DOIO_EOF	  3       /* EOF, no event sent */
#define DOIO_PENDING	  4       /* status when i/o is in process */
#define DOIO_NEEDMORE	  5       /* IO was processed, but we need more due to minimum */
142
/*
 * Expands to the (category, module, level) argument triple used by the
 * logging helpers in this file, with a debug level of 'x'.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

/* Ready-made argument triples for each of the levels above. */
#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
163
/* Alias for isc_event_t. */
typedef isc_event_t intev_t;

/*
 * Socket State
 *
 * Values stored in isc_socket.state via _set_state().
 */
enum {
  SOCK_INITIALIZED,	/* Socket Initialized */
  SOCK_OPEN,		/* Socket opened but nothing yet to do */
  SOCK_DATA,		/* Socket sending or receiving data */
  SOCK_LISTEN,		/* TCP Socket listening for connects */
  SOCK_ACCEPT,		/* TCP socket is waiting to accept */
  SOCK_CONNECT,		/* TCP Socket connecting */
  SOCK_CLOSED,		/* Socket has been closed */
};

/* Magic number stored in isc_socket.magic and the matching validity test. */
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
181
182/*
183 * IPv6 control information.  If the socket is an IPv6 socket we want
184 * to collect the destination address and interface so the client can
185 * set them on outgoing packets.
186 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * We really don't want to try and use these control messages. Win32
 * doesn't have this mechanism before XP.  The #undef below therefore
 * overrides the definition above unconditionally.
 */
#undef USE_CMSG
198
199/*
200 * Message header for recvmsg and sendmsg calls.
201 * Used value-result for recvmsg, value only for sendmsg.
202 */
/*
 * NOTE(review): the trailing "msghdr" declarator also defines a file-scope
 * object of this type; it does not appear to be used in the visible part
 * of this file -- confirm before removing.
 */
struct msghdr {
	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
	int      to_addr_len;		/* length of the address */
	WSABUF  *msg_iov;		/* scatter/gather array */
	u_int   msg_iovlen;             /* # elements in msg_iov */
	void	*msg_control;           /* ancillary data, see below */
	u_int   msg_controllen;         /* ancillary data buffer len */
	int	msg_totallen;		/* total length of this message */
} msghdr;
212
213/*
214 * The size to raise the receive buffer to.
215 */
#define RCVBUFSIZE (32*1024)	/* 32 KiB */

/*
 * The number of times a send operation is repeated if the result
 * is WSAEINTR.
 */
#define NRETRIES 10
223
/*
 * Per-socket state.  The comments inside the structure say which members
 * are protected by the socket lock.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;

	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references; /* EXTERNAL references */
	SOCKET			fd;	/* file handle */
	int			pf;	/* protocol family */
	char			name[16];
	void *			tag;

	/*
	 * Each recv() call uses this buffer.  It is a per-socket receive
	 * buffer that allows us to decouple the system recv() from the
	 * recv_list done events.  This means the items on the recv_list
	 * can be removed without having to cancel pending system recv()
	 * calls.  It also allows us to read-ahead in some cases.
	 */
	struct {
		SOCKADDR_STORAGE	from_addr;	   /* UDP send/recv address */
		int		from_addr_len;	   /* length of the address */
		char		*base;		   /* the base of the buffer */
		char		*consume_position; /* where to start copying data from next */
		unsigned int	len;		   /* the actual size of this buffer */
		unsigned int	remaining;	   /* the number of bytes remaining */
	} recvbuf;

	/* Queued requests waiting for completion. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	isc_sockaddr_t		address;  /* remote address */

	unsigned int		listener : 1,	/* listener socket */
				connected : 1,
				pending_connect : 1, /* connect pending */
				bound : 1;	/* bound to local addr */
	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
	unsigned int		pending_send;  /* Number of outstanding send() calls. */
	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
	unsigned int		state; /* Socket state. Debugging and consistency checking. */
	int			state_lineno;  /* line which last touched state */
};

/*
 * Record a new socket state together with the source line that set it.
 */
#define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (0)
278
279/*
280 * Buffer structure
281 */
typedef struct buflist buflist_t;

/* One buffer, linkable into an ISC_LIST (see IoCompletionInfo.bufferlist). */
struct buflist {
	void			*buf;		/* the buffer itself */
	unsigned int		buflen;		/* its length in bytes */
	ISC_LINK(buflist_t)	link;
};
289
290/*
291 * I/O Completion ports Info structures
292 */
293
/* Private heap created in iocompletionport_init(). */
static HANDLE hHeapHandle = NULL;

/*
 * Per-request context.  The OVERLAPPED member comes first, and this file
 * casts a pointer to the structure to LPWSAOVERLAPPED when issuing
 * overlapped Winsock calls.
 */
typedef struct IoCompletionInfo {
	OVERLAPPED		overlapped;
	isc_socketevent_t	*dev;  /* send()/recv() done event */
	isc_socket_connev_t	*cdev; /* connect() done event */
	isc_socket_newconnev_t	*adev; /* accept() done event */
	void			*acceptbuffer;
	DWORD			received_bytes;
	int			request_type;	/* one of the SOCKET_* request types */
	struct msghdr		messagehdr;
	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
} IoCompletionInfo;
306
307/*
308 * Define a maximum number of I/O Completion Port worker threads
309 * to handle the load on the Completion Port. The actual number
310 * used is the number of CPU's + 1.
311 */
/* Also the size of the per-thread arrays in struct isc_socketmgr. */
#define MAX_IOCPTHREADS 20

/* Magic number stored in isc_socketmgr.magic and the matching validity test. */
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/* Socket manager state, including the completion port and its workers. */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
	isc_stats_t		       *stats;

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)		socklist;
	isc_boolean_t			bShutdown;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;	/* number of worker threads in use */
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];

	/*
	 * Debugging.
	 * Modified by InterlockedIncrement() and InterlockedDecrement()
	 */
	LONG				totalSockets;
	LONG				iocp_total;
};
340
/* Request types stored in IoCompletionInfo.request_type. */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};
347
348/*
349 * send() and recv() iovec counts
350 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)

/* Forward declarations for functions used before their definitions. */
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
static void maybe_free_socket(isc_socket_t **, int);
static void free_socket(isc_socket_t **, int);
static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
static void queue_receive_request(isc_socket_t *sock);
367
368/*
369 * This is used to dump the contents of the sock structure
370 * You should make sure that the sock is locked before
371 * dumping it. Since the code uses simple printf() statements
372 * it should only be used interactively.
373 */
374void
375sock_dump(isc_socket_t *sock) {
376	isc_socketevent_t *ldev;
377	isc_socket_newconnev_t *ndev;
378
379#if 0
380	isc_sockaddr_t addr;
381	char socktext[256];
382
383	isc_socket_getpeername(sock, &addr);
384	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
385	printf("Remote Socket: %s\n", socktext);
386	isc_socket_getsockname(sock, &addr);
387	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
388	printf("This Socket: %s\n", socktext);
389#endif
390
391	printf("\n\t\tSock Dump\n");
392	printf("\t\tfd: %u\n", sock->fd);
393	printf("\t\treferences: %d\n", sock->references);
394	printf("\t\tpending_accept: %d\n", sock->pending_accept);
395	printf("\t\tconnecting: %d\n", sock->pending_connect);
396	printf("\t\tconnected: %d\n", sock->connected);
397	printf("\t\tbound: %d\n", sock->bound);
398	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
399	printf("\t\tsocket type: %d\n", sock->type);
400
401	printf("\n\t\tSock Recv List\n");
402	ldev = ISC_LIST_HEAD(sock->recv_list);
403	while (ldev != NULL) {
404		printf("\t\tdev: %p\n", ldev);
405		ldev = ISC_LIST_NEXT(ldev, ev_link);
406	}
407
408	printf("\n\t\tSock Send List\n");
409	ldev = ISC_LIST_HEAD(sock->send_list);
410	while (ldev != NULL) {
411		printf("\t\tdev: %p\n", ldev);
412		ldev = ISC_LIST_NEXT(ldev, ev_link);
413	}
414
415	printf("\n\t\tSock Accept List\n");
416	ndev = ISC_LIST_HEAD(sock->accept_list);
417	while (ndev != NULL) {
418		printf("\t\tdev: %p\n", ldev);
419		ndev = ISC_LIST_NEXT(ndev, ev_link);
420	}
421}
422
/*
 * Forward declaration so the printf-style format attribute applies to
 * every call: argument 9 is the format string and the variadic arguments
 * start at argument 10.
 */
static void
socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
428
429/*  This function will add an entry to the I/O completion port
430 *  that will signal the I/O thread to exit (gracefully)
431 */
432static void
433signal_iocompletionport_exit(isc_socketmgr_t *manager) {
434	int i;
435	int errval;
436	char strbuf[ISC_STRERRORSIZE];
437
438	REQUIRE(VALID_MANAGER(manager));
439	for (i = 0; i < manager->maxIOCPThreads; i++) {
440		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
441						0, 0, 0)) {
442			errval = GetLastError();
443			isc__strerror(errval, strbuf, sizeof(strbuf));
444			FATAL_ERROR(__FILE__, __LINE__,
445				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
446				ISC_MSG_FAILED,
447				"Can't request service thread to exit: %s"),
448				strbuf);
449		}
450	}
451}
452
453/*
454 * Create the worker threads for the I/O Completion Port
455 */
456void
457iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
458	int errval;
459	char strbuf[ISC_STRERRORSIZE];
460	int i;
461
462	INSIST(total_threads > 0);
463	REQUIRE(VALID_MANAGER(manager));
464	/*
465	 * We need at least one
466	 */
467	for (i = 0; i < total_threads; i++) {
468		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
469						manager, 0,
470						&manager->dwIOCPThreadIds[i]);
471		if (manager->hIOCPThreads[i] == NULL) {
472			errval = GetLastError();
473			isc__strerror(errval, strbuf, sizeof(strbuf));
474			FATAL_ERROR(__FILE__, __LINE__,
475				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
476				ISC_MSG_FAILED,
477				"Can't create IOCP thread: %s"),
478				strbuf);
479			exit(1);
480		}
481	}
482}
483
484/*
485 *  Create/initialise the I/O completion port
486 */
487void
488iocompletionport_init(isc_socketmgr_t *manager) {
489	int errval;
490	char strbuf[ISC_STRERRORSIZE];
491
492	REQUIRE(VALID_MANAGER(manager));
493	/*
494	 * Create a private heap to handle the socket overlapped structure
495	 * The minimum number of structures is 10, there is no maximum
496	 */
497	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
498	if (hHeapHandle == NULL) {
499		errval = GetLastError();
500		isc__strerror(errval, strbuf, sizeof(strbuf));
501		FATAL_ERROR(__FILE__, __LINE__,
502			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
503					   ISC_MSG_FAILED,
504					   "HeapCreate() failed during "
505					   "initialization: %s"),
506			    strbuf);
507		exit(1);
508	}
509
510	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
511
512	/* Now Create the Completion Port */
513	manager->hIoCompletionPort = CreateIoCompletionPort(
514			INVALID_HANDLE_VALUE, NULL,
515			0, manager->maxIOCPThreads);
516	if (manager->hIoCompletionPort == NULL) {
517		errval = GetLastError();
518		isc__strerror(errval, strbuf, sizeof(strbuf));
519		FATAL_ERROR(__FILE__, __LINE__,
520				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
521				ISC_MSG_FAILED,
522				"CreateIoCompletionPort() failed "
523				"during initialization: %s"),
524				strbuf);
525		exit(1);
526	}
527
528	/*
529	 * Worker threads for servicing the I/O
530	 */
531	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
532}
533
534/*
535 * Associate a socket with an IO Completion Port.  This allows us to queue events for it
536 * and have our worker pool of threads process them.
537 */
538void
539iocompletionport_update(isc_socket_t *sock) {
540	HANDLE hiocp;
541	char strbuf[ISC_STRERRORSIZE];
542
543	REQUIRE(VALID_SOCKET(sock));
544
545	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
546		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
547
548	if (hiocp == NULL) {
549		DWORD errval = GetLastError();
550		isc__strerror(errval, strbuf, sizeof(strbuf));
551		isc_log_iwrite(isc_lctx,
552				ISC_LOGCATEGORY_GENERAL,
553				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
554				isc_msgcat, ISC_MSGSET_SOCKET,
555				ISC_MSG_TOOMANYHANDLES,
556				"iocompletionport_update: failed to open"
557				" io completion port: %s",
558				strbuf);
559
560		/* XXXMLG temporary hack to make failures detected.
561		 * This function should return errors to the caller, not
562		 * exit here.
563		 */
564		FATAL_ERROR(__FILE__, __LINE__,
565				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
566				ISC_MSG_FAILED,
567				"CreateIoCompletionPort() failed "
568				"during initialization: %s"),
569				strbuf);
570		exit(1);
571	}
572
573	InterlockedIncrement(&sock->manager->iocp_total);
574}
575
576/*
577 * Routine to cleanup and then close the socket.
578 * Only close the socket here if it is NOT associated
579 * with an event, otherwise the WSAWaitForMultipleEvents
580 * may fail due to the fact that the Wait should not
581 * be running while closing an event or a socket.
582 * The socket is locked before calling this function
583 */
584void
585socket_close(isc_socket_t *sock) {
586
587	REQUIRE(sock != NULL);
588
589	if (sock->fd != INVALID_SOCKET) {
590		closesocket(sock->fd);
591		sock->fd = INVALID_SOCKET;
592		_set_state(sock, SOCK_CLOSED);
593		InterlockedDecrement(&sock->manager->totalSockets);
594	}
595}
596
597static isc_once_t initialise_once = ISC_ONCE_INIT;
598static isc_boolean_t initialised = ISC_FALSE;
599
600static void
601initialise(void) {
602	WORD wVersionRequested;
603	WSADATA wsaData;
604	int err;
605	SOCKET sock;
606	GUID GUIDConnectEx = WSAID_CONNECTEX;
607	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
608	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
609	DWORD dwBytes;
610
611	/* Need Winsock 2.2 or better */
612	wVersionRequested = MAKEWORD(2, 2);
613
614	err = WSAStartup(wVersionRequested, &wsaData);
615	if (err != 0) {
616		char strbuf[ISC_STRERRORSIZE];
617		isc__strerror(err, strbuf, sizeof(strbuf));
618		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
619			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
620					   ISC_MSG_FAILED, "failed"),
621			    strbuf);
622		exit(1);
623	}
624	/*
625	 * The following APIs do not exist as functions in a library, but we must
626	 * ask winsock for them.  They are "extensions" -- but why they cannot be
627	 * actual functions is beyond me.  So, ask winsock for the pointers to the
628	 * functions we need.
629	 */
630	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
631	INSIST(sock != INVALID_SOCKET);
632	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
633		 &GUIDConnectEx, sizeof(GUIDConnectEx),
634		 &ISCConnectEx, sizeof(ISCConnectEx),
635		 &dwBytes, NULL, NULL);
636	INSIST(err == 0);
637
638	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
639		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
640		 &ISCAcceptEx, sizeof(ISCAcceptEx),
641		 &dwBytes, NULL, NULL);
642	INSIST(err == 0);
643
644	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
645		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
646		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
647		 &dwBytes, NULL, NULL);
648	INSIST(err == 0);
649
650	closesocket(sock);
651
652	initialised = ISC_TRUE;
653}
654
655/*
656 * Initialize socket services
657 */
658void
659InitSockets(void) {
660	RUNTIME_CHECK(isc_once_do(&initialise_once,
661				  initialise) == ISC_R_SUCCESS);
662	if (!initialised)
663		exit(1);
664}
665
666int
667internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
668		 struct msghdr *messagehdr, int flags, int *Error)
669{
670	int Result;
671	DWORD BytesSent;
672	DWORD Flags = flags;
673	int total_sent;
674
675	*Error = 0;
676	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
677			   messagehdr->msg_iovlen, &BytesSent,
678			   Flags, (SOCKADDR *)&messagehdr->to_addr,
679			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
680			   NULL);
681
682	total_sent = (int)BytesSent;
683
684	/* Check for errors.*/
685	if (Result == SOCKET_ERROR) {
686		*Error = WSAGetLastError();
687
688		switch (*Error) {
689		case WSA_IO_INCOMPLETE:
690		case WSA_WAIT_IO_COMPLETION:
691		case WSA_IO_PENDING:
692		case NO_ERROR:		/* Strange, but okay */
693			sock->pending_iocp++;
694			sock->pending_send++;
695			break;
696
697		default:
698			return (-1);
699			break;
700		}
701	} else {
702		sock->pending_iocp++;
703		sock->pending_send++;
704	}
705
706	if (lpo != NULL)
707		return (0);
708	else
709		return (total_sent);
710}
711
712static void
713queue_receive_request(isc_socket_t *sock) {
714	DWORD Flags = 0;
715	DWORD NumBytes = 0;
716	int total_bytes = 0;
717	int Result;
718	int Error;
719	WSABUF iov[1];
720	IoCompletionInfo *lpo;
721	isc_result_t isc_result;
722
723	/*
724	 * If we already have a receive pending, do nothing.
725	 */
726	if (sock->pending_recv > 0)
727		return;
728
729	/*
730	 * If no one is waiting, do nothing.
731	 */
732	if (ISC_LIST_EMPTY(sock->recv_list))
733		return;
734
735	INSIST(sock->recvbuf.remaining == 0);
736	INSIST(sock->fd != INVALID_SOCKET);
737
738	iov[0].len = sock->recvbuf.len;
739	iov[0].buf = sock->recvbuf.base;
740
741	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
742					    HEAP_ZERO_MEMORY,
743					    sizeof(IoCompletionInfo));
744	RUNTIME_CHECK(lpo != NULL);
745	lpo->request_type = SOCKET_RECV;
746
747	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
748
749	Error = 0;
750	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
751			     &NumBytes, &Flags,
752			     (SOCKADDR *)&sock->recvbuf.from_addr,
753			     &sock->recvbuf.from_addr_len,
754			     (LPWSAOVERLAPPED)lpo, NULL);
755
756	/* Check for errors. */
757	if (Result == SOCKET_ERROR) {
758		Error = WSAGetLastError();
759
760		switch (Error) {
761		case WSA_IO_PENDING:
762			sock->pending_iocp++;
763			sock->pending_recv++;
764			break;
765
766		default:
767			isc_result = isc__errno2result(Error);
768			if (isc_result == ISC_R_UNEXPECTED)
769				UNEXPECTED_ERROR(__FILE__, __LINE__,
770					"WSARecvFrom: Windows error code: %d, isc result %d",
771					Error, isc_result);
772			send_recvdone_abort(sock, isc_result);
773			break;
774		}
775	} else {
776		/*
777		 * The recv() finished immediately, but we will still get
778		 * a completion event.  Rather than duplicate code, let
779		 * that thread handle sending the data along its way.
780		 */
781		sock->pending_iocp++;
782		sock->pending_recv++;
783	}
784
785	socket_log(__LINE__, sock, NULL, IOEVENT,
786		   isc_msgcat, ISC_MSGSET_SOCKET,
787		   ISC_MSG_DOIORECV,
788		   "queue_io_request: fd %d result %d error %d",
789		   sock->fd, Result, Error);
790
791	CONSISTENT(sock);
792}
793
794static void
795manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
796	    isc_logmodule_t *module, int level, const char *fmt, ...)
797{
798	char msgbuf[2048];
799	va_list ap;
800
801	if (!isc_log_wouldlog(isc_lctx, level))
802		return;
803
804	va_start(ap, fmt);
805	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
806	va_end(ap);
807
808	isc_log_write(isc_lctx, category, module, level,
809		      "sockmgr %p: %s", sockmgr, msgbuf);
810}
811
812static void
813socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
814	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
815	   isc_msgcat_t *msgcat, int msgset, int message,
816	   const char *fmt, ...)
817{
818	char msgbuf[2048];
819	char peerbuf[256];
820	va_list ap;
821
822
823	if (!isc_log_wouldlog(isc_lctx, level))
824		return;
825
826	va_start(ap, fmt);
827	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
828	va_end(ap);
829
830	if (address == NULL) {
831		isc_log_iwrite(isc_lctx, category, module, level,
832			       msgcat, msgset, message,
833			       "socket %p line %d: %s", sock, lineno, msgbuf);
834	} else {
835		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
836		isc_log_iwrite(isc_lctx, category, module, level,
837			       msgcat, msgset, message,
838				   "socket %p line %d peer %s: %s", sock, lineno,
839				   peerbuf, msgbuf);
840	}
841
842}
843
844/*
845 * Make an fd SOCKET non-blocking.
846 */
847static isc_result_t
848make_nonblock(SOCKET fd) {
849	int ret;
850	unsigned long flags = 1;
851	char strbuf[ISC_STRERRORSIZE];
852
853	/* Set the socket to non-blocking */
854	ret = ioctlsocket(fd, FIONBIO, &flags);
855
856	if (ret == -1) {
857		isc__strerror(errno, strbuf, sizeof(strbuf));
858		UNEXPECTED_ERROR(__FILE__, __LINE__,
859				 "ioctlsocket(%d, FIOBIO, %d): %s",
860				 fd, flags, strbuf);
861
862		return (ISC_R_UNEXPECTED);
863	}
864
865	return (ISC_R_SUCCESS);
866}
867
868/*
869 * Windows 2000 systems incorrectly cause UDP sockets using WASRecvFrom
870 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
871 * fails with an "ICMP port unreachable" response and preventing the
872 * socket from using the WSARecvFrom in subsequent operations.
873 * The function below fixes this, but requires that Windows 2000
874 * Service Pack 2 or later be installed on the system.  NT 4.0
875 * systems are not affected by this and work correctly.
876 * See Microsoft Knowledge Base Article Q263823 for details of this.
877 */
878isc_result_t
879connection_reset_fix(SOCKET fd) {
880	DWORD dwBytesReturned = 0;
881	BOOL  bNewBehavior = FALSE;
882	DWORD status;
883
884	if (isc_win32os_majorversion() < 5)
885		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
886
887	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
888	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
889			  sizeof(bNewBehavior), NULL, 0,
890			  &dwBytesReturned, NULL, NULL);
891	if (status != SOCKET_ERROR)
892		return (ISC_R_SUCCESS);
893	else {
894		UNEXPECTED_ERROR(__FILE__, __LINE__,
895				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
896				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
897						ISC_MSG_FAILED, "failed"));
898		return (ISC_R_UNEXPECTED);
899	}
900}
901
902/*
903 * Construct an iov array and attach it to the msghdr passed in.  This is
904 * the SEND constructor, which will use the used region of the buffer
905 * (if using a buffer list) or will use the internal region (if a single
906 * buffer I/O is requested).
907 *
908 * Nothing can be NULL, and the done event must list at least one buffer
909 * on the buffer linked list for this function to be meaningful.
910 */
911static void
912build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
913		  struct msghdr *msg, char *cmsg, WSABUF *iov,
914		  IoCompletionInfo  *lpo)
915{
916	unsigned int iovcount;
917	isc_buffer_t *buffer;
918	buflist_t  *cpbuffer;
919	isc_region_t used;
920	size_t write_count;
921	size_t skip_count;
922
923	memset(msg, 0, sizeof(*msg));
924
925	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
926	msg->to_addr_len = dev->address.length;
927
928	buffer = ISC_LIST_HEAD(dev->bufferlist);
929	write_count = 0;
930	iovcount = 0;
931
932	/*
933	 * Single buffer I/O?  Skip what we've done so far in this region.
934	 */
935	if (buffer == NULL) {
936		write_count = dev->region.length - dev->n;
937		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
938		RUNTIME_CHECK(cpbuffer != NULL);
939		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
940		RUNTIME_CHECK(cpbuffer->buf != NULL);
941
942		socket_log(__LINE__, sock, NULL, TRACE,
943		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
944		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
945		   cpbuffer->buf, write_count);
946
947		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
948		cpbuffer->buflen = write_count;
949		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
950		iov[0].buf = cpbuffer->buf;
951		iov[0].len = write_count;
952		iovcount = 1;
953
954		goto config;
955	}
956
957	/*
958	 * Multibuffer I/O.
959	 * Skip the data in the buffer list that we have already written.
960	 */
961	skip_count = dev->n;
962	while (buffer != NULL) {
963		REQUIRE(ISC_BUFFER_VALID(buffer));
964		if (skip_count < isc_buffer_usedlength(buffer))
965			break;
966		skip_count -= isc_buffer_usedlength(buffer);
967		buffer = ISC_LIST_NEXT(buffer, link);
968	}
969
970	while (buffer != NULL) {
971		INSIST(iovcount < MAXSCATTERGATHER_SEND);
972
973		isc_buffer_usedregion(buffer, &used);
974
975		if (used.length > 0) {
976			int uselen = used.length - skip_count;
977			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
978			RUNTIME_CHECK(cpbuffer != NULL);
979			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
980			RUNTIME_CHECK(cpbuffer->buf != NULL);
981
982			socket_log(__LINE__, sock, NULL, TRACE,
983			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
984			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
985			   cpbuffer->buf, write_count);
986
987			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
988			cpbuffer->buflen = uselen;
989			iov[iovcount].buf = cpbuffer->buf;
990			iov[iovcount].len = used.length - skip_count;
991			write_count += uselen;
992			skip_count = 0;
993			iovcount++;
994		}
995		buffer = ISC_LIST_NEXT(buffer, link);
996	}
997
998	INSIST(skip_count == 0);
999
1000 config:
1001	msg->msg_iov = iov;
1002	msg->msg_iovlen = iovcount;
1003	msg->msg_totallen = write_count;
1004}
1005
1006static void
1007set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1008		isc_socketevent_t *dev)
1009{
1010	if (sock->type == isc_sockettype_udp) {
1011		if (address != NULL)
1012			dev->address = *address;
1013		else
1014			dev->address = sock->address;
1015	} else if (sock->type == isc_sockettype_tcp) {
1016		INSIST(address == NULL);
1017		dev->address = sock->address;
1018	}
1019}
1020
1021static void
1022destroy_socketevent(isc_event_t *event) {
1023	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1024
1025	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1026
1027	(ev->destroy)(event);
1028}
1029
1030static isc_socketevent_t *
1031allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1032		     isc_taskaction_t action, const void *arg)
1033{
1034	isc_socketevent_t *ev;
1035
1036	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1037						     sock, eventtype,
1038						     action, arg,
1039						     sizeof(*ev));
1040	if (ev == NULL)
1041		return (NULL);
1042
1043	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1044	ISC_LINK_INIT(ev, ev_link);
1045	ISC_LIST_INIT(ev->bufferlist);
1046	ev->region.base = NULL;
1047	ev->n = 0;
1048	ev->offset = 0;
1049	ev->attributes = 0;
1050	ev->destroy = ev->ev_destroy;
1051	ev->ev_destroy = destroy_socketevent;
1052
1053	return (ev);
1054}
1055
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug aid: print the contents of 'msg' (name, iovec array and each
 * iovec entry) together with the socket's descriptor to stdout.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int idx = 0;
	unsigned int total;

	printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd);
	printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);

	total = (unsigned int)msg->msg_iovlen;
	while (idx < total) {
		printf("\t\t%d\tbase %p, len %d\n", idx,
		       msg->msg_iov[idx].buf,
		       msg->msg_iov[idx].len);
		idx++;
	}
}
#endif
1070
1071/*
1072 * map the error code
1073 */
1074int
1075map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1076		 char *errorstring, size_t bufsize) {
1077
1078	int doreturn;
1079	switch (windows_errno) {
1080	case WSAECONNREFUSED:
1081		*isc_errno = ISC_R_CONNREFUSED;
1082		if (sock->connected)
1083			doreturn = DOIO_HARD;
1084		else
1085			doreturn = DOIO_SOFT;
1086		break;
1087	case WSAENETUNREACH:
1088	case ERROR_NETWORK_UNREACHABLE:
1089		*isc_errno = ISC_R_NETUNREACH;
1090		if (sock->connected)
1091			doreturn = DOIO_HARD;
1092		else
1093			doreturn = DOIO_SOFT;
1094		break;
1095	case ERROR_PORT_UNREACHABLE:
1096	case ERROR_HOST_UNREACHABLE:
1097	case WSAEHOSTUNREACH:
1098		*isc_errno = ISC_R_HOSTUNREACH;
1099		if (sock->connected)
1100			doreturn = DOIO_HARD;
1101		else
1102			doreturn = DOIO_SOFT;
1103		break;
1104	case WSAENETDOWN:
1105		*isc_errno = ISC_R_NETDOWN;
1106		if (sock->connected)
1107			doreturn = DOIO_HARD;
1108		else
1109			doreturn = DOIO_SOFT;
1110		break;
1111	case WSAEHOSTDOWN:
1112		*isc_errno = ISC_R_HOSTDOWN;
1113		if (sock->connected)
1114			doreturn = DOIO_HARD;
1115		else
1116			doreturn = DOIO_SOFT;
1117		break;
1118	case WSAEACCES:
1119		*isc_errno = ISC_R_NOPERM;
1120		if (sock->connected)
1121			doreturn = DOIO_HARD;
1122		else
1123			doreturn = DOIO_SOFT;
1124		break;
1125	case WSAECONNRESET:
1126	case WSAENETRESET:
1127	case WSAECONNABORTED:
1128	case WSAEDISCON:
1129		*isc_errno = ISC_R_CONNECTIONRESET;
1130		if (sock->connected)
1131			doreturn = DOIO_HARD;
1132		else
1133			doreturn = DOIO_SOFT;
1134		break;
1135	case WSAENOTCONN:
1136		*isc_errno = ISC_R_NOTCONNECTED;
1137		if (sock->connected)
1138			doreturn = DOIO_HARD;
1139		else
1140			doreturn = DOIO_SOFT;
1141		break;
1142	case ERROR_OPERATION_ABORTED:
1143	case ERROR_CONNECTION_ABORTED:
1144	case ERROR_REQUEST_ABORTED:
1145		*isc_errno = ISC_R_CONNECTIONRESET;
1146		doreturn = DOIO_HARD;
1147		break;
1148	case WSAENOBUFS:
1149		*isc_errno = ISC_R_NORESOURCES;
1150		doreturn = DOIO_HARD;
1151		break;
1152	case WSAEAFNOSUPPORT:
1153		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1154		doreturn = DOIO_HARD;
1155		break;
1156	case WSAEADDRNOTAVAIL:
1157		*isc_errno = ISC_R_ADDRNOTAVAIL;
1158		doreturn = DOIO_HARD;
1159		break;
1160	case WSAEDESTADDRREQ:
1161		*isc_errno = ISC_R_BADADDRESSFORM;
1162		doreturn = DOIO_HARD;
1163		break;
1164	case ERROR_NETNAME_DELETED:
1165		*isc_errno = ISC_R_NETDOWN;
1166		doreturn = DOIO_HARD;
1167		break;
1168	default:
1169		*isc_errno = ISC_R_IOERROR;
1170		doreturn = DOIO_HARD;
1171		break;
1172	}
1173	if (doreturn == DOIO_HARD) {
1174		isc__strerror(windows_errno, errorstring, bufsize);
1175	}
1176	return (doreturn);
1177}
1178
1179static void
1180fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1181	isc_region_t r;
1182	int copylen;
1183	isc_buffer_t *buffer;
1184
1185	INSIST(dev->n < dev->minimum);
1186	INSIST(sock->recvbuf.remaining > 0);
1187	INSIST(sock->pending_recv == 0);
1188
1189	if (sock->type == isc_sockettype_udp) {
1190		dev->address.length = sock->recvbuf.from_addr_len;
1191		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1192		    sock->recvbuf.from_addr_len);
1193		if (isc_sockaddr_getport(&dev->address) == 0) {
1194			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1195				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1196					   isc_msgcat, ISC_MSGSET_SOCKET,
1197					   ISC_MSG_ZEROPORT,
1198					   "dropping source port zero packet");
1199			}
1200			sock->recvbuf.remaining = 0;
1201			return;
1202		}
1203	} else if (sock->type == isc_sockettype_tcp) {
1204		dev->address = sock->address;
1205	}
1206
1207	/*
1208	 * Run through the list of buffers we were given, and find the
1209	 * first one with space.  Once it is found, loop through, filling
1210	 * the buffers as much as possible.
1211	 */
1212	buffer = ISC_LIST_HEAD(dev->bufferlist);
1213	if (buffer != NULL) { // Multi-buffer receive
1214		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1215			REQUIRE(ISC_BUFFER_VALID(buffer));
1216			if (isc_buffer_availablelength(buffer) > 0) {
1217				isc_buffer_availableregion(buffer, &r);
1218				copylen = min(r.length, sock->recvbuf.remaining);
1219				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1220				sock->recvbuf.consume_position += copylen;
1221				sock->recvbuf.remaining -= copylen;
1222				isc_buffer_add(buffer, copylen);
1223				dev->n += copylen;
1224			}
1225			buffer = ISC_LIST_NEXT(buffer, link);
1226		}
1227	} else { // Single-buffer receive
1228		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1229		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1230		sock->recvbuf.consume_position += copylen;
1231		sock->recvbuf.remaining -= copylen;
1232		dev->n += copylen;
1233	}
1234
1235	/*
1236	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1237	 * data in our receive buffer, and the caller only gave us
1238	 * 1k of space, we will toss the remaining 3k of data.  TCP
1239	 * will keep the extra data around and use it for later requests.
1240	 */
1241	if (sock->type == isc_sockettype_udp)
1242		sock->recvbuf.remaining = 0;
1243}
1244
1245/*
1246 * Copy out as much data from the internal buffer to done events.
1247 * As each done event is filled, send it along its way.
1248 */
1249static void
1250completeio_recv(isc_socket_t *sock)
1251{
1252	isc_socketevent_t *dev;
1253
1254	/*
1255	 * If we are in the process of filling our buffer, we cannot
1256	 * touch it yet, so don't.
1257	 */
1258	if (sock->pending_recv > 0)
1259		return;
1260
1261	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1262		dev = ISC_LIST_HEAD(sock->recv_list);
1263
1264		/*
1265		 * See if we have sufficient data in our receive buffer
1266		 * to handle this.  If we do, copy out the data.
1267		 */
1268		fill_recv(sock, dev);
1269
1270		/*
1271		 * Did we satisfy it?
1272		 */
1273		if (dev->n >= dev->minimum) {
1274			dev->result = ISC_R_SUCCESS;
1275			send_recvdone_event(sock, &dev);
1276		}
1277	}
1278}
1279
1280/*
1281 * Returns:
1282 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1283 *			ISC_R_SUCCESS.
1284 *
1285 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1286 *			dev->result contains the appropriate error.
1287 *
1288 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1289 *			event was sent.  The operation should be retried.
1290 *
1291 *	No other return values are possible.
1292 */
1293static int
1294completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1295		struct msghdr *messagehdr, int cc, int send_errno)
1296{
1297	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1298	char strbuf[ISC_STRERRORSIZE];
1299
1300	if (send_errno != 0) {
1301		if (SOFT_ERROR(send_errno))
1302			return (DOIO_SOFT);
1303
1304		return (map_socket_error(sock, send_errno, &dev->result,
1305			strbuf, sizeof(strbuf)));
1306
1307		/*
1308		 * The other error types depend on whether or not the
1309		 * socket is UDP or TCP.  If it is UDP, some errors
1310		 * that we expect to be fatal under TCP are merely
1311		 * annoying, and are really soft errors.
1312		 *
1313		 * However, these soft errors are still returned as
1314		 * a status.
1315		 */
1316		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1317		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1318		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1319				 addrbuf, strbuf);
1320		dev->result = isc__errno2result(send_errno);
1321	return (DOIO_HARD);
1322	}
1323
1324	/*
1325	 * If we write less than we expected, update counters, poke.
1326	 */
1327	dev->n += cc;
1328	if (cc != messagehdr->msg_totallen)
1329		return (DOIO_SOFT);
1330
1331	/*
1332	 * Exactly what we wanted to write.  We're done with this
1333	 * entry.  Post its completion event.
1334	 */
1335	dev->result = ISC_R_SUCCESS;
1336	return (DOIO_SUCCESS);
1337}
1338
1339static int
1340startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1341	     int *send_errno)
1342{
1343	char *cmsg = NULL;
1344	char strbuf[ISC_STRERRORSIZE];
1345	IoCompletionInfo *lpo;
1346	int status;
1347	struct msghdr *msghdr;
1348
1349	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1350					    HEAP_ZERO_MEMORY,
1351					    sizeof(IoCompletionInfo));
1352	RUNTIME_CHECK(lpo != NULL);
1353	lpo->request_type = SOCKET_SEND;
1354	lpo->dev = dev;
1355	msghdr = &lpo->messagehdr;
1356	memset(msghdr, 0, sizeof(struct msghdr));
1357	ISC_LIST_INIT(lpo->bufferlist);
1358
1359	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1360
1361	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1362
1363	if (*nbytes < 0) {
1364		/*
1365		 * I/O has been initiated
1366		 * completion will be through the completion port
1367		 */
1368		if (PENDING_ERROR(*send_errno)) {
1369			status = DOIO_PENDING;
1370			goto done;
1371		}
1372
1373		if (SOFT_ERROR(*send_errno)) {
1374			status = DOIO_SOFT;
1375			goto done;
1376		}
1377
1378		/*
1379		 * If we got this far then something is wrong
1380		 */
1381		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1382			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1383			socket_log(__LINE__, sock, NULL, IOEVENT,
1384				   isc_msgcat, ISC_MSGSET_SOCKET,
1385				   ISC_MSG_INTERNALSEND,
1386				   "startio_send: internal_sendmsg(%d) %d "
1387				   "bytes, err %d/%s",
1388				   sock->fd, *nbytes, *send_errno, strbuf);
1389		}
1390		goto done;
1391	}
1392	dev->result = ISC_R_SUCCESS;
1393	status = DOIO_SOFT;
1394 done:
1395	_set_state(sock, SOCK_DATA);
1396	return (status);
1397}
1398
1399static isc_result_t
1400allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1401		isc_socket_t **socketp) {
1402	isc_socket_t *sock;
1403	isc_result_t result;
1404
1405	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1406
1407	if (sock == NULL)
1408		return (ISC_R_NOMEMORY);
1409
1410	sock->magic = 0;
1411	sock->references = 0;
1412
1413	sock->manager = manager;
1414	sock->type = type;
1415	sock->fd = INVALID_SOCKET;
1416
1417	ISC_LINK_INIT(sock, link);
1418
1419	/*
1420	 * set up list of readers and writers to be initially empty
1421	 */
1422	ISC_LIST_INIT(sock->recv_list);
1423	ISC_LIST_INIT(sock->send_list);
1424	ISC_LIST_INIT(sock->accept_list);
1425	sock->connect_ev = NULL;
1426	sock->pending_accept = 0;
1427	sock->pending_recv = 0;
1428	sock->pending_send = 0;
1429	sock->pending_iocp = 0;
1430	sock->listener = 0;
1431	sock->connected = 0;
1432	sock->pending_connect = 0;
1433	sock->bound = 0;
1434	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1435	_set_state(sock, SOCK_INITIALIZED);
1436
1437	sock->recvbuf.len = 65536;
1438	sock->recvbuf.consume_position = sock->recvbuf.base;
1439	sock->recvbuf.remaining = 0;
1440	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1441	if (sock->recvbuf.base == NULL) {
1442		sock->magic = 0;
1443		goto error;
1444	}
1445
1446	/*
1447	 * initialize the lock
1448	 */
1449	result = isc_mutex_init(&sock->lock);
1450	if (result != ISC_R_SUCCESS) {
1451		sock->magic = 0;
1452		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1453		sock->recvbuf.base = NULL;
1454		goto error;
1455	}
1456
1457	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1458		   "allocated");
1459
1460	sock->magic = SOCKET_MAGIC;
1461	*socketp = sock;
1462
1463	return (ISC_R_SUCCESS);
1464
1465 error:
1466	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1467
1468	return (result);
1469}
1470
1471/*
1472 * Verify that the socket state is consistent.
1473 */
1474static void
1475consistent(isc_socket_t *sock) {
1476
1477	isc_socketevent_t *dev;
1478	isc_socket_newconnev_t *nev;
1479	unsigned int count;
1480	char *crash_reason;
1481	isc_boolean_t crash = ISC_FALSE;
1482
1483	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1484		+ sock->pending_accept + sock->pending_connect);
1485
1486	dev = ISC_LIST_HEAD(sock->send_list);
1487	count = 0;
1488	while (dev != NULL) {
1489		count++;
1490		dev = ISC_LIST_NEXT(dev, ev_link);
1491	}
1492	if (count > sock->pending_send) {
1493		crash = ISC_TRUE;
1494		crash_reason = "send_list > sock->pending_send";
1495	}
1496
1497	nev = ISC_LIST_HEAD(sock->accept_list);
1498	count = 0;
1499	while (nev != NULL) {
1500		count++;
1501		nev = ISC_LIST_NEXT(nev, ev_link);
1502	}
1503	if (count > sock->pending_accept) {
1504		crash = ISC_TRUE;
1505		crash_reason = "send_list > sock->pending_send";
1506	}
1507
1508	if (crash) {
1509		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1510			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1511			   crash_reason);
1512		sock_dump(sock);
1513		INSIST(crash == ISC_FALSE);
1514	}
1515}
1516
1517/*
1518 * Maybe free the socket.
1519 *
 * This function will verify that the socket is no longer in use in any way,
1521 * either internally or externally.  This is the only place where this
1522 * check is to be made; if some bit of code believes that IT is done with
1523 * the socket (e.g., some reference counter reaches zero), it should call
1524 * this function.
1525 *
1526 * When calling this function, the socket must be locked, and the manager
1527 * must be unlocked.
1528 *
1529 * When this function returns, *socketp will be NULL.  No tricks to try
1530 * to hold on to this pointer are allowed.
1531 */
1532static void
1533maybe_free_socket(isc_socket_t **socketp, int lineno) {
1534	isc_socket_t *sock = *socketp;
1535	*socketp = NULL;
1536
1537	INSIST(VALID_SOCKET(sock));
1538	CONSISTENT(sock);
1539
1540	if (sock->pending_iocp > 0
1541	    || sock->pending_recv > 0
1542	    || sock->pending_send > 0
1543	    || sock->pending_accept > 0
1544	    || sock->references > 0
1545	    || sock->pending_connect == 1
1546	    || !ISC_LIST_EMPTY(sock->recv_list)
1547	    || !ISC_LIST_EMPTY(sock->send_list)
1548	    || !ISC_LIST_EMPTY(sock->accept_list)
1549	    || sock->fd != INVALID_SOCKET) {
1550		UNLOCK(&sock->lock);
1551		return;
1552	}
1553	UNLOCK(&sock->lock);
1554
1555	free_socket(&sock, lineno);
1556}
1557
1558void
1559free_socket(isc_socket_t **sockp, int lineno) {
1560	isc_socketmgr_t *manager;
1561	isc_socket_t *sock = *sockp;
1562	*sockp = NULL;
1563
1564	manager = sock->manager;
1565
1566	/*
1567	 * Seems we can free the socket after all.
1568	 */
1569	manager = sock->manager;
1570	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1571		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1572		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1573
1574	sock->magic = 0;
1575	DESTROYLOCK(&sock->lock);
1576
1577	if (sock->recvbuf.base != NULL)
1578		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1579
1580	LOCK(&manager->lock);
1581	if (ISC_LINK_LINKED(sock, link))
1582		ISC_LIST_UNLINK(manager->socklist, sock, link);
1583	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1584
1585	if (ISC_LIST_EMPTY(manager->socklist))
1586		SIGNAL(&manager->shutdown_ok);
1587	UNLOCK(&manager->lock);
1588}
1589
/*
 * Create a new 'type' socket in protocol family 'pf', managed by
 * 'manager'.  The new socket is returned in 'socketp'.
 */
/*
 * Returns ISC_R_SUCCESS with the new socket (holding one reference) in
 * '*socketp', or:
 *	- the result of a failed allocate_socket(), connection_reset_fix()
 *	  or make_nonblock();
 *	- ISC_R_NORESOURCES or ISC_R_FAMILYNOSUPPORT for the corresponding
 *	  socket() failures;
 *	- ISC_R_UNEXPECTED for any other socket() failure.
 * On every failure path the partially built socket is freed and
 * '*socketp' is left untouched.
 */
isc_result_t
isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		  isc_socket_t **socketp) {
	isc_socket_t *sock = NULL;
	isc_result_t result;
#if defined(USE_CMSG)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif
	int socket_errno;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	REQUIRE(type != isc_sockettype_fdwatch);

	/* Build the bookkeeping structure; no OS socket exists yet. */
	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	sock->pf = pf;
	/*
	 * Open the OS socket.  Only UDP and TCP are handled here; for any
	 * other type sock->fd stays INVALID_SOCKET and the failure branch
	 * below is taken.
	 */
	switch (type) {
	case isc_sockettype_udp:
		sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
		if (sock->fd != INVALID_SOCKET) {
			/*
			 * Apply connection_reset_fix() to the new UDP
			 * socket; if that fails, close and free it.
			 */
			result = connection_reset_fix(sock->fd);
			if (result != ISC_R_SUCCESS) {
				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
					"closed %d %d %d con_reset_fix_failed",
					sock->pending_recv, sock->pending_send,
					sock->references);
				closesocket(sock->fd);
				_set_state(sock, SOCK_CLOSED);
				sock->fd = INVALID_SOCKET;
				free_socket(&sock, __LINE__);
				return (result);
			}
		}
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	}

	if (sock->fd == INVALID_SOCKET) {
		/* Capture the error before free_socket() makes more calls. */
		socket_errno = WSAGetLastError();
		free_socket(&sock, __LINE__);

		switch (socket_errno) {
		case WSAEMFILE:
		case WSAENOBUFS:
			return (ISC_R_NORESOURCES);

		case WSAEPROTONOSUPPORT:
		case WSAEPFNOSUPPORT:
		case WSAEAFNOSUPPORT:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "socket() %s: %s",
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* Put the descriptor into non-blocking mode; on failure undo everything. */
	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
			"closed %d %d %d make_nonblock_failed",
			sock->pending_recv, sock->pending_send,
			sock->references);
		closesocket(sock->fd);
		sock->fd = INVALID_SOCKET;
		free_socket(&sock, __LINE__);
		return (result);
	}


#if defined(USE_CMSG) || defined(SO_RCVBUF)
	/* UDP-only socket options; failures here do not fail the create. */
	if (type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(ISC_PLATFORM_HAVEIPV6)
#ifdef IPV6_RECVPKTINFO
		/* 2292bis */
		if ((pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* 2292 */
		if ((pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
		/* use minimum MTU */
		if (pf == AF_INET6) {
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_USE_MIN_MTU,
					 (void *)&on, sizeof(on));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(SO_RCVBUF)
	       /* Raise the receive buffer to RCVBUFSIZE if it is smaller. */
	       optlen = sizeof(size);
	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			      (void *)&size, &optlen) >= 0 &&
		    size < RCVBUFSIZE) {
		       size = RCVBUFSIZE;
		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
					(void *)&size, sizeof(size));
	       }
#endif

	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

	/* The socket is usable: hand the caller its (single) reference. */
	_set_state(sock, SOCK_OPEN);
	sock->references = 1;
	*socketp = sock;

	iocompletionport_update(sock);

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */
	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
	InterlockedIncrement(&manager->totalSockets);
	UNLOCK(&manager->lock);

	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "created %u type %u", sock->fd, type);

	return (ISC_R_SUCCESS);
}
1764
1765isc_result_t
1766isc_socket_open(isc_socket_t *sock) {
1767	REQUIRE(VALID_SOCKET(sock));
1768	REQUIRE(sock->type != isc_sockettype_fdwatch);
1769
1770	return (ISC_R_NOTIMPLEMENTED);
1771}
1772
1773/*
1774 * Attach to a socket.  Caller must explicitly detach when it is done.
1775 */
1776void
1777isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1778	REQUIRE(VALID_SOCKET(sock));
1779	REQUIRE(socketp != NULL && *socketp == NULL);
1780
1781	LOCK(&sock->lock);
1782	CONSISTENT(sock);
1783	sock->references++;
1784	UNLOCK(&sock->lock);
1785
1786	*socketp = sock;
1787}
1788
1789/*
1790 * Dereference a socket.  If this is the last reference to it, clean things
1791 * up by destroying the socket.
1792 */
1793void
1794isc_socket_detach(isc_socket_t **socketp) {
1795	isc_socket_t *sock;
1796	isc_boolean_t kill_socket = ISC_FALSE;
1797
1798	REQUIRE(socketp != NULL);
1799	sock = *socketp;
1800	REQUIRE(VALID_SOCKET(sock));
1801	REQUIRE(sock->type != isc_sockettype_fdwatch);
1802
1803	LOCK(&sock->lock);
1804	CONSISTENT(sock);
1805	REQUIRE(sock->references > 0);
1806	sock->references--;
1807
1808	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1809		"detach_socket %d %d %d",
1810		sock->pending_recv, sock->pending_send,
1811		sock->references);
1812
1813	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1814		closesocket(sock->fd);
1815		sock->fd = INVALID_SOCKET;
1816		_set_state(sock, SOCK_CLOSED);
1817	}
1818
1819	maybe_free_socket(&sock, __LINE__);
1820
1821	*socketp = NULL;
1822}
1823
1824isc_result_t
1825isc_socket_close(isc_socket_t *sock) {
1826	REQUIRE(VALID_SOCKET(sock));
1827	REQUIRE(sock->type != isc_sockettype_fdwatch);
1828
1829	return (ISC_R_NOTIMPLEMENTED);
1830}
1831
1832/*
1833 * Dequeue an item off the given socket's read queue, set the result code
1834 * in the done event to the one provided, and send it to the task it was
1835 * destined for.
1836 *
1837 * If the event to be sent is on a list, remove it before sending.  If
1838 * asked to, send and detach from the task as well.
1839 *
1840 * Caller must have the socket locked if the event is attached to the socket.
1841 */
1842static void
1843send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1844	isc_task_t *task;
1845
1846	task = (*dev)->ev_sender;
1847	(*dev)->ev_sender = sock;
1848
1849	if (ISC_LINK_LINKED(*dev, ev_link))
1850		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1851
1852	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1853	    == ISC_SOCKEVENTATTR_ATTACHED)
1854		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1855	else
1856		isc_task_send(task, (isc_event_t **)dev);
1857
1858	CONSISTENT(sock);
1859}
1860
1861/*
1862 * See comments for send_recvdone_event() above.
1863 */
1864static void
1865send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1866	isc_task_t *task;
1867
1868	INSIST(dev != NULL && *dev != NULL);
1869
1870	task = (*dev)->ev_sender;
1871	(*dev)->ev_sender = sock;
1872
1873	if (ISC_LINK_LINKED(*dev, ev_link))
1874		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1875
1876	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1877	    == ISC_SOCKEVENTATTR_ATTACHED)
1878		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1879	else
1880		isc_task_send(task, (isc_event_t **)dev);
1881
1882	CONSISTENT(sock);
1883}
1884
1885/*
1886 * See comments for send_recvdone_event() above.
1887 */
1888static void
1889send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1890	isc_task_t *task;
1891
1892	INSIST(adev != NULL && *adev != NULL);
1893
1894	task = (*adev)->ev_sender;
1895	(*adev)->ev_sender = sock;
1896
1897	if (ISC_LINK_LINKED(*adev, ev_link))
1898		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1899
1900	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1901
1902	CONSISTENT(sock);
1903}
1904
1905/*
1906 * See comments for send_recvdone_event() above.
1907 */
1908static void
1909send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1910	isc_task_t *task;
1911
1912	INSIST(cdev != NULL && *cdev != NULL);
1913
1914	task = (*cdev)->ev_sender;
1915	(*cdev)->ev_sender = sock;
1916
1917	sock->connect_ev = NULL;
1918
1919	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1920
1921	CONSISTENT(sock);
1922}
1923
1924/*
1925 * On entry to this function, the event delivered is the internal
1926 * readable event, and the first item on the accept_list should be
1927 * the done event we want to send.  If the list is empty, this is a no-op,
1928 * so just close the new connection, unlock, and return.
1929 *
1930 * Note the socket is locked before entering here
1931 */
1932static void
1933internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
1934	isc_socket_newconnev_t *adev;
1935	isc_result_t result = ISC_R_SUCCESS;
1936	isc_socket_t *nsock;
1937	struct sockaddr *localaddr;
1938	int localaddr_len = sizeof(*localaddr);
1939	struct sockaddr *remoteaddr;
1940	int remoteaddr_len = sizeof(*remoteaddr);
1941
1942	INSIST(VALID_SOCKET(sock));
1943	LOCK(&sock->lock);
1944	CONSISTENT(sock);
1945
1946	socket_log(__LINE__, sock, NULL, TRACE,
1947		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1948		   "internal_accept called");
1949
1950	INSIST(sock->listener);
1951
1952	INSIST(sock->pending_iocp > 0);
1953	sock->pending_iocp--;
1954	INSIST(sock->pending_accept > 0);
1955	sock->pending_accept--;
1956
1957	adev = lpo->adev;
1958
1959	/*
1960	 * If the event is no longer in the list we can just return.
1961	 */
1962	if (!acceptdone_is_active(sock, adev))
1963		goto done;
1964
1965	nsock = adev->newsocket;
1966
1967	/*
1968	 * Pull off the done event.
1969	 */
1970	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
1971
1972	/*
1973	 * Extract the addresses from the socket, copy them into the structure,
1974	 * and return the new socket.
1975	 */
1976	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
1977		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
1978		(LPSOCKADDR *)&localaddr, &localaddr_len,
1979		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
1980	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
1981	adev->address.length = remoteaddr_len;
1982	nsock->address = adev->address;
1983	nsock->pf = adev->address.type.sa.sa_family;
1984
1985	socket_log(__LINE__, nsock, &nsock->address, TRACE,
1986		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1987		   "internal_accept parent %p", sock);
1988
1989	result = make_nonblock(adev->newsocket->fd);
1990	INSIST(result == ISC_R_SUCCESS);
1991
1992	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
1993	       (char *)&sock->fd, sizeof(sock->fd)) == 0);
1994
1995	/*
1996	 * Hook it up into the manager.
1997	 */
1998	nsock->bound = 1;
1999	nsock->connected = 1;
2000	_set_state(nsock, SOCK_OPEN);
2001
2002	LOCK(&nsock->manager->lock);
2003	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2004	InterlockedIncrement(&nsock->manager->totalSockets);
2005	UNLOCK(&nsock->manager->lock);
2006
2007	socket_log(__LINE__, sock, &nsock->address, CREATION,
2008		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2009		   "accepted_connection new_socket %p fd %d",
2010		   nsock, nsock->fd);
2011
2012	adev->result = result;
2013	send_acceptdone_event(sock, &adev);
2014
2015done:
2016	CONSISTENT(sock);
2017	UNLOCK(&sock->lock);
2018
2019	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2020	lpo->acceptbuffer = NULL;
2021}
2022
2023/*
2024 * Called when a socket with a pending connect() finishes.
2025 * Note that the socket is locked before entering.
2026 */
2027static void
2028internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2029	isc_socket_connev_t *cdev;
2030	char strbuf[ISC_STRERRORSIZE];
2031
2032	INSIST(VALID_SOCKET(sock));
2033
2034	LOCK(&sock->lock);
2035
2036	INSIST(sock->pending_iocp > 0);
2037	sock->pending_iocp--;
2038	INSIST(sock->pending_connect == 1);
2039	sock->pending_connect = 0;
2040
2041	/*
2042	 * Has this event been canceled?
2043	 */
2044	cdev = lpo->cdev;
2045	if (!connectdone_is_active(sock, cdev)) {
2046		sock->pending_connect = 0;
2047		if (sock->fd != INVALID_SOCKET) {
2048			closesocket(sock->fd);
2049			sock->fd = INVALID_SOCKET;
2050			_set_state(sock, SOCK_CLOSED);
2051		}
2052		CONSISTENT(sock);
2053		UNLOCK(&sock->lock);
2054		return;
2055	}
2056
2057	/*
2058	 * Check possible Windows network event error status here.
2059	 */
2060	if (connect_errno != 0) {
2061		/*
2062		 * If the error is SOFT, just try again on this
2063		 * fd and pretend nothing strange happened.
2064		 */
2065		if (SOFT_ERROR(connect_errno) ||
2066		    connect_errno == WSAEINPROGRESS) {
2067			sock->pending_connect = 1;
2068			CONSISTENT(sock);
2069			UNLOCK(&sock->lock);
2070			return;
2071		}
2072
2073		/*
2074		 * Translate other errors into ISC_R_* flavors.
2075		 */
2076		switch (connect_errno) {
2077#define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2078			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2079			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2080			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2081			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2082			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2083			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2084			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2085			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2086			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2087			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2088			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2089			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2090#undef ERROR_MATCH
2091		default:
2092			cdev->result = ISC_R_UNEXPECTED;
2093			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2094			UNEXPECTED_ERROR(__FILE__, __LINE__,
2095					 "internal_connect: connect() %s",
2096					 strbuf);
2097		}
2098	} else {
2099		INSIST(setsockopt(sock->fd, SOL_SOCKET, SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2100		cdev->result = ISC_R_SUCCESS;
2101		sock->connected = 1;
2102		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2103			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2104			   "internal_connect: success");
2105	}
2106
2107	send_connectdone_event(sock, &cdev);
2108
2109	UNLOCK(&sock->lock);
2110}
2111
2112/*
2113 * Loop through the socket, returning ISC_R_EOF for each done event pending.
2114 */
2115static void
2116send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2117	isc_socketevent_t *dev;
2118
2119	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2120		dev = ISC_LIST_HEAD(sock->recv_list);
2121		dev->result = result;
2122		send_recvdone_event(sock, &dev);
2123	}
2124}
2125
2126/*
2127 * Take the data we received in our private buffer, and if any recv() calls on
2128 * our list are satisfied, send the corresponding done event.
2129 *
2130 * If we need more data (there are still items on the recv_list after we consume all
2131 * our data) then arrange for another system recv() call to fill our buffers.
2132 */
2133static void
2134internal_recv(isc_socket_t *sock, int nbytes)
2135{
2136	INSIST(VALID_SOCKET(sock));
2137
2138	LOCK(&sock->lock);
2139	CONSISTENT(sock);
2140
2141	socket_log(__LINE__, sock, NULL, IOEVENT,
2142		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2143		   "internal_recv: %d bytes received", nbytes);
2144
2145	/*
2146	 * If we got here, the I/O operation succeeded.  However, we might still have removed this
2147	 * event from our notification list (or never placed it on it due to immediate completion.)
2148	 * Handle the reference counting here, and handle the cancellation event just after.
2149	 */
2150	INSIST(sock->pending_iocp > 0);
2151	sock->pending_iocp--;
2152	INSIST(sock->pending_recv > 0);
2153	sock->pending_recv--;
2154
2155	/*
2156	 * The only way we could have gotten here is that our I/O has successfully completed.
2157	 * Update our pointers, and move on.  The only odd case here is that we might not
2158	 * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2159	 * this is the case, we will re-issue the recv() call for what we need.
2160	 *
2161	 * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2162	 * has closed.
2163	 */
2164	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2165		send_recvdone_abort(sock, ISC_R_EOF);
2166		maybe_free_socket(&sock, __LINE__);
2167		return;
2168	}
2169	sock->recvbuf.remaining = nbytes;
2170	sock->recvbuf.consume_position = sock->recvbuf.base;
2171	completeio_recv(sock);
2172
2173	/*
2174	 * If there are more receivers waiting for data, queue another receive
2175	 * here.
2176	 */
2177	queue_receive_request(sock);
2178
2179	/*
2180	 * Unlock and/or destroy if we are the last thing this socket has left to do.
2181	 */
2182	maybe_free_socket(&sock, __LINE__);
2183}
2184
2185static void
2186internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2187	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2188{
2189	buflist_t *buffer;
2190
2191	/*
2192	 * Find out what socket this is and lock it.
2193	 */
2194	INSIST(VALID_SOCKET(sock));
2195
2196	LOCK(&sock->lock);
2197	CONSISTENT(sock);
2198
2199	socket_log(__LINE__, sock, NULL, IOEVENT,
2200		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2201		   "internal_send: task got socket event %p", dev);
2202
2203	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2204	while (buffer != NULL) {
2205		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2206
2207		socket_log(__LINE__, sock, NULL, TRACE,
2208		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2209		   "free_buffer %p %p", buffer, buffer->buf);
2210
2211		HeapFree(hHeapHandle, 0, buffer->buf);
2212		HeapFree(hHeapHandle, 0, buffer);
2213		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2214	}
2215
2216	INSIST(sock->pending_iocp > 0);
2217	sock->pending_iocp--;
2218	INSIST(sock->pending_send > 0);
2219	sock->pending_send--;
2220
2221	/* If the event is no longer in the list we can just return */
2222	if (!senddone_is_active(sock, dev))
2223		goto done;
2224
2225	/*
2226	 * Set the error code and send things on its way.
2227	 */
2228	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2229	case DOIO_SOFT:
2230		break;
2231	case DOIO_HARD:
2232	case DOIO_SUCCESS:
2233		send_senddone_event(sock, &dev);
2234		break;
2235	}
2236
2237 done:
2238	maybe_free_socket(&sock, __LINE__);
2239}
2240
2241/*
2242 * These return if the done event passed in is on the list (or for connect, is
2243 * the one we're waiting for.  Using these ensures we will not double-send an
2244 * event.
2245 */
2246static isc_boolean_t
2247senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2248{
2249	isc_socketevent_t *ldev;
2250
2251	ldev = ISC_LIST_HEAD(sock->send_list);
2252	while (ldev != NULL && ldev != dev)
2253		ldev = ISC_LIST_NEXT(ldev, ev_link);
2254
2255	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2256}
2257
2258static isc_boolean_t
2259acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2260{
2261	isc_socket_newconnev_t *ldev;
2262
2263	ldev = ISC_LIST_HEAD(sock->accept_list);
2264	while (ldev != NULL && ldev != dev)
2265		ldev = ISC_LIST_NEXT(ldev, ev_link);
2266
2267	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2268}
2269
2270static isc_boolean_t
2271connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2272{
2273	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2274}
2275
2276/*
2277 * This is the I/O Completion Port Worker Function. It loops forever
2278 * waiting for I/O to complete and then forwards them for further
2279 * processing. There are a number of these in separate threads.
2280 */
2281static isc_threadresult_t WINAPI
2282SocketIoThread(LPVOID ThreadContext) {
2283	isc_socketmgr_t *manager = ThreadContext;
2284	BOOL bSuccess = FALSE;
2285	DWORD nbytes;
2286	IoCompletionInfo *lpo = NULL;
2287	isc_socket_t *sock = NULL;
2288	int request;
2289	struct msghdr *messagehdr = NULL;
2290	int errval;
2291	char strbuf[ISC_STRERRORSIZE];
2292	int errstatus;
2293
2294	REQUIRE(VALID_MANAGER(manager));
2295
2296	/*
2297	 * Set the thread priority high enough so I/O will
2298	 * preempt normal recv packet processing, but not
2299	 * higher than the timer sync thread.
2300	 */
2301	if (!SetThreadPriority(GetCurrentThread(),
2302			       THREAD_PRIORITY_ABOVE_NORMAL)) {
2303		errval = GetLastError();
2304		isc__strerror(errval, strbuf, sizeof(strbuf));
2305		FATAL_ERROR(__FILE__, __LINE__,
2306				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2307				ISC_MSG_FAILED,
2308				"Can't set thread priority: %s"),
2309				strbuf);
2310	}
2311
2312	/*
2313	 * Loop forever waiting on I/O Completions and then processing them
2314	 */
2315	while (TRUE) {
2316		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2317						     &nbytes, (LPDWORD)&sock,
2318						     (LPWSAOVERLAPPED *)&lpo,
2319						     INFINITE);
2320		if (lpo == NULL) /* Received request to exit */
2321			break;
2322
2323		REQUIRE(VALID_SOCKET(sock));
2324
2325		request = lpo->request_type;
2326
2327		errstatus = 0;
2328		if (!bSuccess) {
2329			isc_result_t isc_result;
2330
2331			/*
2332			 * Did the I/O operation complete?
2333			 */
2334			errstatus = WSAGetLastError();
2335			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2336
2337			LOCK(&sock->lock);
2338			CONSISTENT(sock);
2339			switch (request) {
2340			case SOCKET_RECV:
2341				INSIST(sock->pending_iocp > 0);
2342				sock->pending_iocp--;
2343				INSIST(sock->pending_recv > 0);
2344				sock->pending_recv--;
2345				send_recvdone_abort(sock, isc_result);
2346				if (isc_result == ISC_R_UNEXPECTED) {
2347					UNEXPECTED_ERROR(__FILE__, __LINE__,
2348						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2349						errstatus, isc_result);
2350				}
2351				break;
2352
2353			case SOCKET_SEND:
2354				INSIST(sock->pending_iocp > 0);
2355				sock->pending_iocp--;
2356				INSIST(sock->pending_send > 0);
2357				sock->pending_send--;
2358				if (senddone_is_active(sock, lpo->dev)) {
2359					lpo->dev->result = isc_result;
2360					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2361						"canceled_send");
2362					send_senddone_event(sock, &lpo->dev);
2363				}
2364				break;
2365
2366			case SOCKET_ACCEPT:
2367				INSIST(sock->pending_iocp > 0);
2368				sock->pending_iocp--;
2369				INSIST(sock->pending_accept > 0);
2370				sock->pending_accept--;
2371				if (acceptdone_is_active(sock, lpo->adev)) {
2372					closesocket(lpo->adev->newsocket->fd);
2373					lpo->adev->newsocket->fd = INVALID_SOCKET;
2374					lpo->adev->newsocket->references--;
2375					free_socket(&lpo->adev->newsocket, __LINE__);
2376					lpo->adev->result = isc_result;
2377					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2378						"canceled_accept");
2379					send_acceptdone_event(sock, &lpo->adev);
2380				}
2381				break;
2382
2383			case SOCKET_CONNECT:
2384				INSIST(sock->pending_iocp > 0);
2385				sock->pending_iocp--;
2386				INSIST(sock->pending_connect == 1);
2387				sock->pending_connect = 0;
2388				if (connectdone_is_active(sock, lpo->cdev)) {
2389					lpo->cdev->result = isc_result;
2390					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2391						"canceled_connect");
2392					send_connectdone_event(sock, &lpo->cdev);
2393				}
2394				break;
2395			}
2396			maybe_free_socket(&sock, __LINE__);
2397
2398			if (lpo != NULL)
2399				HeapFree(hHeapHandle, 0, lpo);
2400			continue;
2401		}
2402
2403		messagehdr = &lpo->messagehdr;
2404
2405		switch (request) {
2406		case SOCKET_RECV:
2407			internal_recv(sock, nbytes);
2408			break;
2409		case SOCKET_SEND:
2410			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2411			break;
2412		case SOCKET_ACCEPT:
2413			internal_accept(sock, lpo, errstatus);
2414			break;
2415		case SOCKET_CONNECT:
2416			internal_connect(sock, lpo, errstatus);
2417			break;
2418		}
2419
2420		if (lpo != NULL)
2421			HeapFree(hHeapHandle, 0, lpo);
2422	}
2423
2424	/*
2425	 * Exit Completion Port Thread
2426	 */
2427	manager_log(manager, TRACE,
2428		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2429				   ISC_MSG_EXITING, "SocketIoThread exiting"));
2430	return ((isc_threadresult_t)0);
2431}
2432
2433/*
2434 * Create a new socket manager.
2435 */
2436isc_result_t
2437isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2438	return (isc_socketmgr_create2(mctx, managerp, 0));
2439}
2440
2441isc_result_t
2442isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2443		     unsigned int maxsocks)
2444{
2445	isc_socketmgr_t *manager;
2446	isc_result_t result;
2447
2448	REQUIRE(managerp != NULL && *managerp == NULL);
2449
2450	if (maxsocks != 0)
2451		return (ISC_R_NOTIMPLEMENTED);
2452
2453	manager = isc_mem_get(mctx, sizeof(*manager));
2454	if (manager == NULL)
2455		return (ISC_R_NOMEMORY);
2456
2457	InitSockets();
2458
2459	manager->magic = SOCKET_MANAGER_MAGIC;
2460	manager->mctx = NULL;
2461	manager->stats = NULL;
2462	ISC_LIST_INIT(manager->socklist);
2463	result = isc_mutex_init(&manager->lock);
2464	if (result != ISC_R_SUCCESS) {
2465		isc_mem_put(mctx, manager, sizeof(*manager));
2466		return (result);
2467	}
2468	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2469		DESTROYLOCK(&manager->lock);
2470		isc_mem_put(mctx, manager, sizeof(*manager));
2471		UNEXPECTED_ERROR(__FILE__, __LINE__,
2472				 "isc_condition_init() %s",
2473				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2474						ISC_MSG_FAILED, "failed"));
2475		return (ISC_R_UNEXPECTED);
2476	}
2477
2478	isc_mem_attach(mctx, &manager->mctx);
2479
2480	iocompletionport_init(manager);	/* Create the Completion Ports */
2481
2482	manager->bShutdown = ISC_FALSE;
2483	manager->totalSockets = 0;
2484	manager->iocp_total = 0;
2485
2486	*managerp = manager;
2487
2488	return (ISC_R_SUCCESS);
2489}
2490
2491isc_result_t
2492isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2493	REQUIRE(VALID_MANAGER(manager));
2494	REQUIRE(nsockp != NULL);
2495
2496	return (ISC_R_NOTIMPLEMENTED);
2497}
2498
2499void
2500isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2501	REQUIRE(VALID_MANAGER(manager));
2502	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2503	REQUIRE(manager->stats == NULL);
2504	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2505
2506	isc_stats_attach(stats, &manager->stats);
2507}
2508
2509void
2510isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2511	isc_socketmgr_t *manager;
2512	int i;
2513	isc_mem_t *mctx;
2514
2515	/*
2516	 * Destroy a socket manager.
2517	 */
2518
2519	REQUIRE(managerp != NULL);
2520	manager = *managerp;
2521	REQUIRE(VALID_MANAGER(manager));
2522
2523	LOCK(&manager->lock);
2524
2525	/*
2526	 * Wait for all sockets to be destroyed.
2527	 */
2528	while (!ISC_LIST_EMPTY(manager->socklist)) {
2529		manager_log(manager, CREATION,
2530			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2531					   ISC_MSG_SOCKETSREMAIN,
2532					   "sockets exist"));
2533		WAIT(&manager->shutdown_ok, &manager->lock);
2534	}
2535
2536	UNLOCK(&manager->lock);
2537
2538	/*
2539	 * Here, we need to had some wait code for the completion port
2540	 * thread.
2541	 */
2542	signal_iocompletionport_exit(manager);
2543	manager->bShutdown = ISC_TRUE;
2544
2545	/*
2546	 * Wait for threads to exit.
2547	 */
2548	for (i = 0; i < manager->maxIOCPThreads; i++) {
2549		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2550			NULL) != ISC_R_SUCCESS)
2551			UNEXPECTED_ERROR(__FILE__, __LINE__,
2552				 "isc_thread_join() for Completion Port %s",
2553				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2554						ISC_MSG_FAILED, "failed"));
2555	}
2556	/*
2557	 * Clean up.
2558	 */
2559
2560	CloseHandle(manager->hIoCompletionPort);
2561
2562	(void)isc_condition_destroy(&manager->shutdown_ok);
2563
2564	DESTROYLOCK(&manager->lock);
2565	if (manager->stats != NULL)
2566		isc_stats_detach(&manager->stats);
2567	manager->magic = 0;
2568	mctx= manager->mctx;
2569	isc_mem_put(mctx, manager, sizeof(*manager));
2570
2571	isc_mem_detach(&mctx);
2572
2573	*managerp = NULL;
2574}
2575
2576static void
2577queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2578{
2579	isc_task_t *ntask = NULL;
2580
2581	isc_task_attach(task, &ntask);
2582	dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2583
2584	/*
2585	 * Enqueue the request.
2586	 */
2587	INSIST(!ISC_LINK_LINKED(dev, ev_link));
2588	ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2589
2590	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2591		   "queue_receive_event: event %p -> task %p",
2592		   dev, ntask);
2593}
2594
2595/*
2596 * Check the pending receive queue, and if we have data pending, give it to this
2597 * caller.  If we have none, queue an I/O request.  If this caller is not the first
2598 * on the list, then we will just queue this event and return.
2599 *
2600 * Caller must have the socket locked.
2601 */
2602static isc_result_t
2603socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2604	    unsigned int flags)
2605{
2606	int cc = 0;
2607	isc_task_t *ntask = NULL;
2608	isc_result_t result = ISC_R_SUCCESS;
2609	int recv_errno = 0;
2610
2611	dev->ev_sender = task;
2612
2613	if (sock->fd == INVALID_SOCKET)
2614		return (ISC_R_EOF);
2615
2616	/*
2617	 * Queue our event on the list of things to do.  Call our function to
2618	 * attempt to fill buffers as much as possible, and return done events.
2619	 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2620	 * here and tell our caller that we could not satisfy it immediately.
2621	 */
2622	queue_receive_event(sock, task, dev);
2623	if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2624		result = ISC_R_INPROGRESS;
2625
2626	completeio_recv(sock);
2627
2628	/*
2629	 * If there are more receivers waiting for data, queue another receive
2630	 * here.  If the
2631	 */
2632	queue_receive_request(sock);
2633
2634	return (result);
2635}
2636
2637isc_result_t
2638isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2639		 unsigned int minimum, isc_task_t *task,
2640		 isc_taskaction_t action, const void *arg)
2641{
2642	isc_socketevent_t *dev;
2643	isc_socketmgr_t *manager;
2644	unsigned int iocount;
2645	isc_buffer_t *buffer;
2646	isc_result_t ret;
2647
2648	REQUIRE(VALID_SOCKET(sock));
2649	LOCK(&sock->lock);
2650	CONSISTENT(sock);
2651
2652	/*
2653	 * Make sure that the socket is not closed.  XXXMLG change error here?
2654	 */
2655	if (sock->fd == INVALID_SOCKET) {
2656		UNLOCK(&sock->lock);
2657		return (ISC_R_CONNREFUSED);
2658	}
2659
2660	REQUIRE(buflist != NULL);
2661	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2662	REQUIRE(task != NULL);
2663	REQUIRE(action != NULL);
2664
2665	manager = sock->manager;
2666	REQUIRE(VALID_MANAGER(manager));
2667
2668	iocount = isc_bufferlist_availablecount(buflist);
2669	REQUIRE(iocount > 0);
2670
2671	INSIST(sock->bound);
2672
2673	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2674	if (dev == NULL) {
2675		UNLOCK(&sock->lock);
2676		return (ISC_R_NOMEMORY);
2677	}
2678
2679	/*
2680	 * UDP sockets are always partial read
2681	 */
2682	if (sock->type == isc_sockettype_udp)
2683		dev->minimum = 1;
2684	else {
2685		if (minimum == 0)
2686			dev->minimum = iocount;
2687		else
2688			dev->minimum = minimum;
2689	}
2690
2691	/*
2692	 * Move each buffer from the passed in list to our internal one.
2693	 */
2694	buffer = ISC_LIST_HEAD(*buflist);
2695	while (buffer != NULL) {
2696		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2697		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2698		buffer = ISC_LIST_HEAD(*buflist);
2699	}
2700
2701	ret = socket_recv(sock, dev, task, 0);
2702
2703	UNLOCK(&sock->lock);
2704	return (ret);
2705}
2706
2707isc_result_t
2708isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2709		isc_task_t *task, isc_taskaction_t action, const void *arg)
2710{
2711	isc_socketevent_t *dev;
2712	isc_socketmgr_t *manager;
2713	isc_result_t ret;
2714
2715	REQUIRE(VALID_SOCKET(sock));
2716	LOCK(&sock->lock);
2717	CONSISTENT(sock);
2718
2719	/*
2720	 * make sure that the socket's not closed
2721	 */
2722	if (sock->fd == INVALID_SOCKET) {
2723		UNLOCK(&sock->lock);
2724		return (ISC_R_CONNREFUSED);
2725	}
2726	REQUIRE(action != NULL);
2727
2728	manager = sock->manager;
2729	REQUIRE(VALID_MANAGER(manager));
2730
2731	INSIST(sock->bound);
2732
2733	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2734	if (dev == NULL) {
2735		UNLOCK(&sock->lock);
2736		return (ISC_R_NOMEMORY);
2737	}
2738
2739	ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2740	UNLOCK(&sock->lock);
2741	return (ret);
2742}
2743
2744isc_result_t
2745isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2746		 unsigned int minimum, isc_task_t *task,
2747		 isc_socketevent_t *event, unsigned int flags)
2748{
2749	isc_result_t ret;
2750
2751	REQUIRE(VALID_SOCKET(sock));
2752	LOCK(&sock->lock);
2753	CONSISTENT(sock);
2754
2755	event->result = ISC_R_UNEXPECTED;
2756	event->ev_sender = sock;
2757	/*
2758	 * make sure that the socket's not closed
2759	 */
2760	if (sock->fd == INVALID_SOCKET) {
2761		UNLOCK(&sock->lock);
2762		return (ISC_R_CONNREFUSED);
2763	}
2764
2765	ISC_LIST_INIT(event->bufferlist);
2766	event->region = *region;
2767	event->n = 0;
2768	event->offset = 0;
2769	event->attributes = 0;
2770
2771	/*
2772	 * UDP sockets are always partial read.
2773	 */
2774	if (sock->type == isc_sockettype_udp)
2775		event->minimum = 1;
2776	else {
2777		if (minimum == 0)
2778			event->minimum = region->length;
2779		else
2780			event->minimum = minimum;
2781	}
2782
2783	ret = socket_recv(sock, event, task, flags);
2784	UNLOCK(&sock->lock);
2785	return (ret);
2786}
2787
2788/*
2789 * Caller must have the socket locked.
2790 */
2791static isc_result_t
2792socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2793	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2794	    unsigned int flags)
2795{
2796	int io_state;
2797	int send_errno = 0;
2798	int cc = 0;
2799	isc_task_t *ntask = NULL;
2800	isc_result_t result = ISC_R_SUCCESS;
2801
2802	dev->ev_sender = task;
2803
2804	set_dev_address(address, sock, dev);
2805	if (pktinfo != NULL) {
2806		socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2807			   ISC_MSG_PKTINFOPROVIDED,
2808			   "pktinfo structure provided, ifindex %u (set to 0)",
2809			   pktinfo->ipi6_ifindex);
2810
2811		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2812		dev->pktinfo = *pktinfo;
2813		/*
2814		 * Set the pktinfo index to 0 here, to let the kernel decide
2815		 * what interface it should send on.
2816		 */
2817		dev->pktinfo.ipi6_ifindex = 0;
2818	}
2819
2820	io_state = startio_send(sock, dev, &cc, &send_errno);
2821	switch (io_state) {
2822	case DOIO_PENDING:	/* I/O started. Nothing more to do */
2823	case DOIO_SOFT:
2824		/*
2825		 * We couldn't send all or part of the request right now, so
2826		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2827		 */
2828		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2829			isc_task_attach(task, &ntask);
2830			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2831
2832			/*
2833			 * Enqueue the request.
2834			 */
2835			INSIST(!ISC_LINK_LINKED(dev, ev_link));
2836			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2837
2838			socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2839				   "socket_send: event %p -> task %p",
2840				   dev, ntask);
2841
2842			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2843				result = ISC_R_INPROGRESS;
2844			break;
2845		}
2846
2847	case DOIO_SUCCESS:
2848		break;
2849	}
2850
2851	return (result);
2852}
2853
2854isc_result_t
2855isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2856		isc_task_t *task, isc_taskaction_t action, const void *arg)
2857{
2858	/*
2859	 * REQUIRE() checking is performed in isc_socket_sendto().
2860	 */
2861	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
2862				  NULL));
2863}
2864
2865isc_result_t
2866isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
2867		  isc_task_t *task, isc_taskaction_t action, const void *arg,
2868		  isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2869{
2870	isc_socketevent_t *dev;
2871	isc_socketmgr_t *manager;
2872	isc_result_t ret;
2873
2874	REQUIRE(VALID_SOCKET(sock));
2875	REQUIRE(sock->type != isc_sockettype_fdwatch);
2876
2877	LOCK(&sock->lock);
2878	CONSISTENT(sock);
2879
2880	/*
2881	 * make sure that the socket's not closed
2882	 */
2883	if (sock->fd == INVALID_SOCKET) {
2884		UNLOCK(&sock->lock);
2885		return (ISC_R_CONNREFUSED);
2886	}
2887	REQUIRE(region != NULL);
2888	REQUIRE(task != NULL);
2889	REQUIRE(action != NULL);
2890
2891	manager = sock->manager;
2892	REQUIRE(VALID_MANAGER(manager));
2893
2894	INSIST(sock->bound);
2895
2896	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2897	if (dev == NULL) {
2898		UNLOCK(&sock->lock);
2899		return (ISC_R_NOMEMORY);
2900	}
2901	dev->region = *region;
2902
2903	ret = socket_send(sock, dev, task, address, pktinfo, 0);
2904	UNLOCK(&sock->lock);
2905	return (ret);
2906}
2907
2908isc_result_t
2909isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2910		 isc_task_t *task, isc_taskaction_t action, const void *arg)
2911{
2912	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
2913				   NULL));
2914}
2915
2916isc_result_t
2917isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
2918		   isc_task_t *task, isc_taskaction_t action, const void *arg,
2919		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2920{
2921	isc_socketevent_t *dev;
2922	isc_socketmgr_t *manager;
2923	unsigned int iocount;
2924	isc_buffer_t *buffer;
2925	isc_result_t ret;
2926
2927	REQUIRE(VALID_SOCKET(sock));
2928
2929	LOCK(&sock->lock);
2930	CONSISTENT(sock);
2931
2932	/*
2933	 * make sure that the socket's not closed
2934	 */
2935	if (sock->fd == INVALID_SOCKET) {
2936		UNLOCK(&sock->lock);
2937		return (ISC_R_CONNREFUSED);
2938	}
2939	REQUIRE(buflist != NULL);
2940	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2941	REQUIRE(task != NULL);
2942	REQUIRE(action != NULL);
2943
2944	manager = sock->manager;
2945	REQUIRE(VALID_MANAGER(manager));
2946
2947	iocount = isc_bufferlist_usedcount(buflist);
2948	REQUIRE(iocount > 0);
2949
2950	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2951	if (dev == NULL) {
2952		UNLOCK(&sock->lock);
2953		return (ISC_R_NOMEMORY);
2954	}
2955
2956	/*
2957	 * Move each buffer from the passed in list to our internal one.
2958	 */
2959	buffer = ISC_LIST_HEAD(*buflist);
2960	while (buffer != NULL) {
2961		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2962		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2963		buffer = ISC_LIST_HEAD(*buflist);
2964	}
2965
2966	ret = socket_send(sock, dev, task, address, pktinfo, 0);
2967	UNLOCK(&sock->lock);
2968	return (ret);
2969}
2970
2971isc_result_t
2972isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
2973		   isc_task_t *task,
2974		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2975		   isc_socketevent_t *event, unsigned int flags)
2976{
2977	isc_result_t ret;
2978
2979	REQUIRE(VALID_SOCKET(sock));
2980	LOCK(&sock->lock);
2981	CONSISTENT(sock);
2982
2983	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
2984	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
2985		REQUIRE(sock->type == isc_sockettype_udp);
2986	event->ev_sender = sock;
2987	event->result = ISC_R_UNEXPECTED;
2988	/*
2989	 * make sure that the socket's not closed
2990	 */
2991	if (sock->fd == INVALID_SOCKET) {
2992		UNLOCK(&sock->lock);
2993		return (ISC_R_CONNREFUSED);
2994	}
2995	ISC_LIST_INIT(event->bufferlist);
2996	event->region = *region;
2997	event->n = 0;
2998	event->offset = 0;
2999	event->attributes = 0;
3000
3001	ret = socket_send(sock, event, task, address, pktinfo, flags);
3002	UNLOCK(&sock->lock);
3003	return (ret);
3004}
3005
3006isc_result_t
3007isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3008		unsigned int options) {
3009	int bind_errno;
3010	char strbuf[ISC_STRERRORSIZE];
3011	int on = 1;
3012
3013	REQUIRE(VALID_SOCKET(sock));
3014	LOCK(&sock->lock);
3015	CONSISTENT(sock);
3016
3017	/*
3018	 * make sure that the socket's not closed
3019	 */
3020	if (sock->fd == INVALID_SOCKET) {
3021		UNLOCK(&sock->lock);
3022		return (ISC_R_CONNREFUSED);
3023	}
3024
3025	INSIST(!sock->bound);
3026
3027	if (sock->pf != sockaddr->type.sa.sa_family) {
3028		UNLOCK(&sock->lock);
3029		return (ISC_R_FAMILYMISMATCH);
3030	}
3031	/*
3032	 * Only set SO_REUSEADDR when we want a specific port.
3033	 */
3034	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3035	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3036	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
3037		       sizeof(on)) < 0) {
3038		UNEXPECTED_ERROR(__FILE__, __LINE__,
3039				 "setsockopt(%d) %s", sock->fd,
3040				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3041						ISC_MSG_FAILED, "failed"));
3042		/* Press on... */
3043	}
3044	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3045		bind_errno = WSAGetLastError();
3046		UNLOCK(&sock->lock);
3047		switch (bind_errno) {
3048		case WSAEACCES:
3049			return (ISC_R_NOPERM);
3050		case WSAEADDRNOTAVAIL:
3051			return (ISC_R_ADDRNOTAVAIL);
3052		case WSAEADDRINUSE:
3053			return (ISC_R_ADDRINUSE);
3054		case WSAEINVAL:
3055			return (ISC_R_BOUND);
3056		default:
3057			isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3058			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3059					 strbuf);
3060			return (ISC_R_UNEXPECTED);
3061		}
3062	}
3063
3064	socket_log(__LINE__, sock, sockaddr, TRACE,
3065		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3066	sock->bound = 1;
3067
3068	UNLOCK(&sock->lock);
3069	return (ISC_R_SUCCESS);
3070}
3071
3072isc_result_t
3073isc_socket_filter(isc_socket_t *sock, const char *filter) {
3074	UNUSED(sock);
3075	UNUSED(filter);
3076
3077	REQUIRE(VALID_SOCKET(sock));
3078	return (ISC_R_NOTIMPLEMENTED);
3079}
3080
3081/*
3082 * Set up to listen on a given socket.  We do this by creating an internal
3083 * event that will be dispatched when the socket has read activity.  The
3084 * watcher will send the internal event to the task when there is a new
3085 * connection.
3086 *
3087 * Unlike in read, we don't preallocate a done event here.  Every time there
3088 * is a new connection we'll have to allocate a new one anyway, so we might
3089 * as well keep things simple rather than having to track them.
3090 */
3091isc_result_t
3092isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
3093	char strbuf[ISC_STRERRORSIZE];
3094
3095	REQUIRE(VALID_SOCKET(sock));
3096
3097	LOCK(&sock->lock);
3098	CONSISTENT(sock);
3099
3100	/*
3101	 * make sure that the socket's not closed
3102	 */
3103	if (sock->fd == INVALID_SOCKET) {
3104		UNLOCK(&sock->lock);
3105		return (ISC_R_CONNREFUSED);
3106	}
3107
3108	REQUIRE(!sock->listener);
3109	REQUIRE(sock->bound);
3110	REQUIRE(sock->type == isc_sockettype_tcp);
3111
3112	if (backlog == 0)
3113		backlog = SOMAXCONN;
3114
3115	if (listen(sock->fd, (int)backlog) < 0) {
3116		UNLOCK(&sock->lock);
3117		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3118
3119		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3120
3121		return (ISC_R_UNEXPECTED);
3122	}
3123
3124	socket_log(__LINE__, sock, NULL, TRACE,
3125		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3126	sock->listener = 1;
3127	_set_state(sock, SOCK_LISTEN);
3128
3129	UNLOCK(&sock->lock);
3130	return (ISC_R_SUCCESS);
3131}
3132
3133/*
3134 * This should try to do aggressive accept() XXXMLG
3135 */
3136isc_result_t
3137isc_socket_accept(isc_socket_t *sock,
3138		  isc_task_t *task, isc_taskaction_t action, const void *arg)
3139{
3140	isc_socket_newconnev_t *adev;
3141	isc_socketmgr_t *manager;
3142	isc_task_t *ntask = NULL;
3143	isc_socket_t *nsock;
3144	isc_result_t result;
3145	IoCompletionInfo *lpo;
3146
3147	REQUIRE(VALID_SOCKET(sock));
3148
3149	manager = sock->manager;
3150	REQUIRE(VALID_MANAGER(manager));
3151
3152	LOCK(&sock->lock);
3153	CONSISTENT(sock);
3154
3155	/*
3156	 * make sure that the socket's not closed
3157	 */
3158	if (sock->fd == INVALID_SOCKET) {
3159		UNLOCK(&sock->lock);
3160		return (ISC_R_CONNREFUSED);
3161	}
3162
3163	REQUIRE(sock->listener);
3164
3165	/*
3166	 * Sender field is overloaded here with the task we will be sending
3167	 * this event to.  Just before the actual event is delivered the
3168	 * actual ev_sender will be touched up to be the socket.
3169	 */
3170	adev = (isc_socket_newconnev_t *)
3171		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3172				   action, arg, sizeof(*adev));
3173	if (adev == NULL) {
3174		UNLOCK(&sock->lock);
3175		return (ISC_R_NOMEMORY);
3176	}
3177	ISC_LINK_INIT(adev, ev_link);
3178
3179	result = allocate_socket(manager, sock->type, &nsock);
3180	if (result != ISC_R_SUCCESS) {
3181		isc_event_free((isc_event_t **)&adev);
3182		UNLOCK(&sock->lock);
3183		return (result);
3184	}
3185
3186	/*
3187	 * AcceptEx() requires we pass in a socket.
3188	 */
3189	nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3190	if (nsock->fd == INVALID_SOCKET) {
3191		free_socket(&nsock, __LINE__);
3192		isc_event_free((isc_event_t **)&adev);
3193		UNLOCK(&sock->lock);
3194		return (ISC_R_FAILURE); // XXXMLG need real error message
3195	}
3196
3197	/*
3198	 * Attach to socket and to task.
3199	 */
3200	isc_task_attach(task, &ntask);
3201	nsock->references++;
3202
3203	adev->ev_sender = ntask;
3204	adev->newsocket = nsock;
3205	_set_state(nsock, SOCK_ACCEPT);
3206
3207	/*
3208	 * Queue io completion for an accept().
3209	 */
3210	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3211					    HEAP_ZERO_MEMORY,
3212					    sizeof(IoCompletionInfo));
3213	RUNTIME_CHECK(lpo != NULL);
3214	lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3215		(sizeof(SOCKADDR_STORAGE) + 16) * 2);
3216	RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3217
3218	lpo->adev = adev;
3219	lpo->request_type = SOCKET_ACCEPT;
3220
3221	ISCAcceptEx(sock->fd,
3222		    nsock->fd,				/* Accepted Socket */
3223		    lpo->acceptbuffer,			/* Buffer for initial Recv */
3224		    0,					/* Length of Buffer */
3225		    sizeof(SOCKADDR_STORAGE) + 16,		/* Local address length + 16 */
3226		    sizeof(SOCKADDR_STORAGE) + 16,		/* Remote address lengh + 16 */
3227		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
3228		    (LPOVERLAPPED)lpo			/* Overlapped structure */
3229		    );
3230	iocompletionport_update(nsock);
3231
3232	socket_log(__LINE__, sock, NULL, TRACE,
3233		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3234		   "accepting for nsock %p fd %d", nsock, nsock->fd);
3235
3236	/*
3237	 * Enqueue the event
3238	 */
3239	ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3240	sock->pending_accept++;
3241	sock->pending_iocp++;
3242
3243	UNLOCK(&sock->lock);
3244	return (ISC_R_SUCCESS);
3245}
3246
3247isc_result_t
3248isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3249		   isc_task_t *task, isc_taskaction_t action, const void *arg)
3250{
3251	char strbuf[ISC_STRERRORSIZE];
3252	isc_socket_connev_t *cdev;
3253	isc_task_t *ntask = NULL;
3254	isc_socketmgr_t *manager;
3255	IoCompletionInfo *lpo;
3256	int bind_errno;
3257
3258	REQUIRE(VALID_SOCKET(sock));
3259	REQUIRE(addr != NULL);
3260	REQUIRE(task != NULL);
3261	REQUIRE(action != NULL);
3262
3263	manager = sock->manager;
3264	REQUIRE(VALID_MANAGER(manager));
3265	REQUIRE(addr != NULL);
3266
3267	if (isc_sockaddr_ismulticast(addr))
3268		return (ISC_R_MULTICAST);
3269
3270	LOCK(&sock->lock);
3271	CONSISTENT(sock);
3272
3273	/*
3274	 * make sure that the socket's not closed
3275	 */
3276	if (sock->fd == INVALID_SOCKET) {
3277		UNLOCK(&sock->lock);
3278		return (ISC_R_CONNREFUSED);
3279	}
3280
3281	/*
3282	 * Windows sockets won't connect unless the socket is bound.
3283	 */
3284	if (!sock->bound) {
3285		isc_sockaddr_t any;
3286
3287		isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3288		if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3289			bind_errno = WSAGetLastError();
3290			UNLOCK(&sock->lock);
3291			switch (bind_errno) {
3292			case WSAEACCES:
3293				return (ISC_R_NOPERM);
3294			case WSAEADDRNOTAVAIL:
3295				return (ISC_R_ADDRNOTAVAIL);
3296			case WSAEADDRINUSE:
3297				return (ISC_R_ADDRINUSE);
3298			case WSAEINVAL:
3299				return (ISC_R_BOUND);
3300			default:
3301				isc__strerror(bind_errno, strbuf,
3302					      sizeof(strbuf));
3303				UNEXPECTED_ERROR(__FILE__, __LINE__,
3304						 "bind: %s", strbuf);
3305				return (ISC_R_UNEXPECTED);
3306			}
3307		}
3308		sock->bound = 1;
3309	}
3310
3311	REQUIRE(!sock->pending_connect);
3312
3313	cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3314							ISC_SOCKEVENT_CONNECT,
3315							action,	arg,
3316							sizeof(*cdev));
3317	if (cdev == NULL) {
3318		UNLOCK(&sock->lock);
3319		return (ISC_R_NOMEMORY);
3320	}
3321	ISC_LINK_INIT(cdev, ev_link);
3322
3323	if (sock->type == isc_sockettype_tcp) {
3324		/*
3325		 * Queue io completion for an accept().
3326		 */
3327		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3328						    HEAP_ZERO_MEMORY,
3329						    sizeof(IoCompletionInfo));
3330		lpo->cdev = cdev;
3331		lpo->request_type = SOCKET_CONNECT;
3332
3333		sock->address = *addr;
3334		ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3335			NULL, 0, NULL, (LPOVERLAPPED)lpo);
3336
3337		/*
3338		 * Attach to task.
3339		 */
3340		isc_task_attach(task, &ntask);
3341		cdev->ev_sender = ntask;
3342
3343		sock->pending_connect = 1;
3344		_set_state(sock, SOCK_CONNECT);
3345
3346		/*
3347		 * Enqueue the request.
3348		 */
3349		sock->connect_ev = cdev;
3350		sock->pending_iocp++;
3351	} else {
3352		WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3353		cdev->result = ISC_R_SUCCESS;
3354		isc_task_send(task, (isc_event_t **)&cdev);
3355	}
3356	CONSISTENT(sock);
3357	UNLOCK(&sock->lock);
3358
3359	return (ISC_R_SUCCESS);
3360}
3361
3362isc_result_t
3363isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3364	isc_result_t result;
3365
3366	REQUIRE(VALID_SOCKET(sock));
3367	REQUIRE(addressp != NULL);
3368
3369	LOCK(&sock->lock);
3370	CONSISTENT(sock);
3371
3372	/*
3373	 * make sure that the socket's not closed
3374	 */
3375	if (sock->fd == INVALID_SOCKET) {
3376		UNLOCK(&sock->lock);
3377		return (ISC_R_CONNREFUSED);
3378	}
3379
3380	if (sock->connected) {
3381		*addressp = sock->address;
3382		result = ISC_R_SUCCESS;
3383	} else {
3384		result = ISC_R_NOTCONNECTED;
3385	}
3386
3387	UNLOCK(&sock->lock);
3388
3389	return (result);
3390}
3391
3392isc_result_t
3393isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3394	ISC_SOCKADDR_LEN_T len;
3395	isc_result_t result;
3396	char strbuf[ISC_STRERRORSIZE];
3397
3398	REQUIRE(VALID_SOCKET(sock));
3399	REQUIRE(addressp != NULL);
3400
3401	LOCK(&sock->lock);
3402	CONSISTENT(sock);
3403
3404	/*
3405	 * make sure that the socket's not closed
3406	 */
3407	if (sock->fd == INVALID_SOCKET) {
3408		UNLOCK(&sock->lock);
3409		return (ISC_R_CONNREFUSED);
3410	}
3411
3412	if (!sock->bound) {
3413		result = ISC_R_NOTBOUND;
3414		goto out;
3415	}
3416
3417	result = ISC_R_SUCCESS;
3418
3419	len = sizeof(addressp->type);
3420	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3421		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3422		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3423				 strbuf);
3424		result = ISC_R_UNEXPECTED;
3425		goto out;
3426	}
3427	addressp->length = (unsigned int)len;
3428
3429 out:
3430	UNLOCK(&sock->lock);
3431
3432	return (result);
3433}
3434
3435/*
3436 * Run through the list of events on this socket, and cancel the ones
3437 * queued for task "task" of type "how".  "how" is a bitmask.
3438 */
3439void
3440isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3441
3442	REQUIRE(VALID_SOCKET(sock));
3443
3444	/*
3445	 * Quick exit if there is nothing to do.  Don't even bother locking
3446	 * in this case.
3447	 */
3448	if (how == 0)
3449		return;
3450
3451	LOCK(&sock->lock);
3452	CONSISTENT(sock);
3453
3454	/*
3455	 * make sure that the socket's not closed
3456	 */
3457	if (sock->fd == INVALID_SOCKET) {
3458		UNLOCK(&sock->lock);
3459		return;
3460	}
3461
3462	/*
3463	 * All of these do the same thing, more or less.
3464	 * Each will:
3465	 *	o If the internal event is marked as "posted" try to
3466	 *	  remove it from the task's queue.  If this fails, mark it
3467	 *	  as canceled instead, and let the task clean it up later.
3468	 *	o For each I/O request for that task of that type, post
3469	 *	  its done event with status of "ISC_R_CANCELED".
3470	 *	o Reset any state needed.
3471	 */
3472
3473	if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3474		isc_socketevent_t      *dev;
3475		isc_socketevent_t      *next;
3476		isc_task_t	       *current_task;
3477
3478		dev = ISC_LIST_HEAD(sock->recv_list);
3479		while (dev != NULL) {
3480			current_task = dev->ev_sender;
3481			next = ISC_LIST_NEXT(dev, ev_link);
3482			if ((task == NULL) || (task == current_task)) {
3483				dev->result = ISC_R_CANCELED;
3484				send_recvdone_event(sock, &dev);
3485			}
3486			dev = next;
3487		}
3488	}
3489	how &= ~ISC_SOCKCANCEL_RECV;
3490
3491	if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3492		isc_socketevent_t      *dev;
3493		isc_socketevent_t      *next;
3494		isc_task_t	       *current_task;
3495
3496		dev = ISC_LIST_HEAD(sock->send_list);
3497
3498		while (dev != NULL) {
3499			current_task = dev->ev_sender;
3500			next = ISC_LIST_NEXT(dev, ev_link);
3501			if ((task == NULL) || (task == current_task)) {
3502				dev->result = ISC_R_CANCELED;
3503				send_senddone_event(sock, &dev);
3504			}
3505			dev = next;
3506		}
3507	}
3508	how &= ~ISC_SOCKCANCEL_SEND;
3509
3510	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3511	    && !ISC_LIST_EMPTY(sock->accept_list)) {
3512		isc_socket_newconnev_t *dev;
3513		isc_socket_newconnev_t *next;
3514		isc_task_t	       *current_task;
3515
3516		dev = ISC_LIST_HEAD(sock->accept_list);
3517		while (dev != NULL) {
3518			current_task = dev->ev_sender;
3519			next = ISC_LIST_NEXT(dev, ev_link);
3520
3521			if ((task == NULL) || (task == current_task)) {
3522
3523				dev->newsocket->references--;
3524				closesocket(dev->newsocket->fd);
3525				dev->newsocket->fd = INVALID_SOCKET;
3526				free_socket(&dev->newsocket, __LINE__);
3527
3528				dev->result = ISC_R_CANCELED;
3529				send_acceptdone_event(sock, &dev);
3530			}
3531
3532			dev = next;
3533		}
3534	}
3535	how &= ~ISC_SOCKCANCEL_ACCEPT;
3536
3537	/*
3538	 * Connecting is not a list.
3539	 */
3540	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3541	    && sock->connect_ev != NULL) {
3542		isc_socket_connev_t    *dev;
3543		isc_task_t	       *current_task;
3544
3545		INSIST(sock->pending_connect);
3546
3547		dev = sock->connect_ev;
3548		current_task = dev->ev_sender;
3549
3550		if ((task == NULL) || (task == current_task)) {
3551			closesocket(sock->fd);
3552			sock->fd = INVALID_SOCKET;
3553			_set_state(sock, SOCK_CLOSED);
3554
3555			sock->connect_ev = NULL;
3556			dev->result = ISC_R_CANCELED;
3557			send_connectdone_event(sock, &dev);
3558		}
3559	}
3560	how &= ~ISC_SOCKCANCEL_CONNECT;
3561
3562	maybe_free_socket(&sock, __LINE__);
3563}
3564
3565isc_sockettype_t
3566isc_socket_gettype(isc_socket_t *sock) {
3567	isc_sockettype_t type;
3568
3569	REQUIRE(VALID_SOCKET(sock));
3570
3571	LOCK(&sock->lock);
3572
3573	/*
3574	 * make sure that the socket's not closed
3575	 */
3576	if (sock->fd == INVALID_SOCKET) {
3577		UNLOCK(&sock->lock);
3578		return (ISC_R_CONNREFUSED);
3579	}
3580
3581	type = sock->type;
3582	UNLOCK(&sock->lock);
3583	return (type);
3584}
3585
3586isc_boolean_t
3587isc_socket_isbound(isc_socket_t *sock) {
3588	isc_boolean_t val;
3589
3590	REQUIRE(VALID_SOCKET(sock));
3591
3592	LOCK(&sock->lock);
3593	CONSISTENT(sock);
3594
3595	/*
3596	 * make sure that the socket's not closed
3597	 */
3598	if (sock->fd == INVALID_SOCKET) {
3599		UNLOCK(&sock->lock);
3600		return (ISC_FALSE);
3601	}
3602
3603	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3604	UNLOCK(&sock->lock);
3605
3606	return (val);
3607}
3608
3609void
3610isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3611#if defined(IPV6_V6ONLY)
3612	int onoff = yes ? 1 : 0;
3613#else
3614	UNUSED(yes);
3615#endif
3616
3617	REQUIRE(VALID_SOCKET(sock));
3618
3619#ifdef IPV6_V6ONLY
3620	if (sock->pf == AF_INET6) {
3621		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3622				 (void *)&onoff, sizeof(onoff));
3623	}
3624#endif
3625}
3626
3627void
3628isc_socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3629	UNUSED(addr);
3630	UNUSED(active);
3631}
3632
3633isc_result_t
3634isc_socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3635		    isc_uint32_t owner,	isc_uint32_t group)
3636{
3637	UNUSED(addr);
3638	UNUSED(perm);
3639	UNUSED(owner);
3640	UNUSED(group);
3641	return (ISC_R_NOTIMPLEMENTED);
3642}
3643
3644void
3645isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3646
3647	/*
3648	 * Name 'socket'.
3649	 */
3650
3651	REQUIRE(VALID_SOCKET(socket));
3652
3653	LOCK(&socket->lock);
3654	memset(socket->name, 0, sizeof(socket->name));
3655	strncpy(socket->name, name, sizeof(socket->name) - 1);
3656	socket->tag = tag;
3657	UNLOCK(&socket->lock);
3658}
3659
3660const char *
3661isc_socket_getname(isc_socket_t *socket) {
3662	return (socket->name);
3663}
3664
3665void *
3666isc_socket_gettag(isc_socket_t *socket) {
3667	return (socket->tag);
3668}
3669
3670void
3671isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3672	UNUSED(manager);
3673	UNUSED(reserved);
3674}
3675