1/*
2 * Copyright (C) 2004-2009, 2011-2014  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: dispatch.c,v 1.175 2011/11/29 01:03:47 marka Exp $ */
19
20/*! \file */
21
22#include <config.h>
23
24#include <stdlib.h>
25#include <sys/types.h>
26#include <unistd.h>
27#include <stdlib.h>
28
29#include <isc/entropy.h>
30#include <isc/mem.h>
31#include <isc/mutex.h>
32#include <isc/portset.h>
33#include <isc/print.h>
34#include <isc/random.h>
35#include <isc/socket.h>
36#include <isc/stats.h>
37#include <isc/string.h>
38#include <isc/task.h>
39#include <isc/time.h>
40#include <isc/util.h>
41
42#include <dns/acl.h>
43#include <dns/dispatch.h>
44#include <dns/events.h>
45#include <dns/log.h>
46#include <dns/message.h>
47#include <dns/portlist.h>
48#include <dns/stats.h>
49#include <dns/tcpmsg.h>
50#include <dns/types.h>
51
/*% List head type for dispatch entries (one per QID hash bucket). */
typedef ISC_LIST(dns_dispentry_t)	dns_displist_t;

/*% Dedicated dispatch socket, and the list head type for such sockets. */
typedef struct dispsocket		dispsocket_t;
typedef ISC_LIST(dispsocket_t)		dispsocketlist_t;

/*% Per-dispatch port table entry, and the list head type for them. */
typedef struct dispportentry		dispportentry_t;
typedef ISC_LIST(dispportentry_t)	dispportlist_t;
59
/*
 * ARC4 random generator state.
 *
 * 'i' and 'j' index into the 256-byte permutation 's'.  'count' is the
 * remaining output budget: it is decremented on every draw and the
 * generator is re-stirred once it drops to zero or below.
 */
typedef struct arc4ctx {
	isc_uint8_t	i;
	isc_uint8_t	j;
	isc_uint8_t	s[256];
	int		count;
	isc_entropy_t	*entropy;	/*%< entropy source for ARC4 */
	isc_mutex_t	*lock;		/*%< optional; NULL means unlocked use */
} arc4ctx_t;
69
/*%
 * Query ID table: two parallel arrays of 'qid_nbuckets' list heads, one
 * holding dispatch entries and one holding dispatch sockets, both indexed
 * by the bucket number computed by dns_hash().
 */
typedef struct dns_qid {
	unsigned int	magic;		/*%< QID_MAGIC when valid */
	unsigned int	qid_nbuckets;	/*%< hash table size */
	unsigned int	qid_increment;	/*%< id increment on collision */
	isc_mutex_t	lock;		/*%< guards both tables */
	dns_displist_t	*qid_table;	/*%< the table itself */
	dispsocketlist_t *sock_table;	/*%< socket table */
} dns_qid_t;
78
/*%
 * Dispatch manager: owns the list of dispatches, the memory pools they
 * allocate from, and the sets of local ports available for outgoing queries.
 */
struct dns_dispatchmgr {
	/* Unlocked. */
	unsigned int			magic;	/*%< DNS_DISPATCHMGR_MAGIC */
	isc_mem_t		       *mctx;
	dns_acl_t		       *blackhole;
	dns_portlist_t		       *portlist;
	isc_stats_t		       *stats;	/*%< may be NULL (no counting) */
	isc_entropy_t		       *entropy; /*%< entropy source */

	/* Locked by "lock". */
	isc_mutex_t			lock;
	unsigned int			state;	/*%< MGR_* flag bits */
	ISC_LIST(dns_dispatch_t)	list;	/*%< dispatches of this manager */

	/* Locked by arc4_lock. */
	isc_mutex_t			arc4_lock;
	arc4ctx_t			arc4ctx;    /*%< ARC4 context for QID */

	/* locked by buffer lock */
	dns_qid_t			*qid;
	isc_mutex_t			buffer_lock;
	unsigned int			buffers;    /*%< allocated buffers */
	unsigned int			buffersize; /*%< size of each buffer */
	unsigned int			maxbuffers; /*%< max buffers */

	/* Locked internally. */
	isc_mutex_t			depool_lock;
	isc_mempool_t		       *depool;	/*%< pool for dispatch events */
	isc_mutex_t			rpool_lock;
	isc_mempool_t		       *rpool;	/*%< pool for replies */
	isc_mutex_t			dpool_lock;
	isc_mempool_t		       *dpool;  /*%< dispatch allocations */
	isc_mutex_t			bpool_lock;
	isc_mempool_t		       *bpool;	/*%< pool for buffers */
	isc_mutex_t			spool_lock;
	isc_mempool_t		       *spool;	/*%< pool for dispsocks */

	/*%
	 * Locked by qid->lock if qid exists; otherwise, can be used without
	 * being locked.
	 * Memory footprint considerations: this is a simple implementation of
	 * available ports, i.e., an ordered array of the actual port numbers.
	 * This will require about 256KB of memory in the worst case (128KB for
	 * each of IPv4 and IPv6).  We could reduce it by representing it in a
	 * more sophisticated way, such as a list (or array) of ranges that are
	 * searched to identify a specific port.  Our decision here is that the
	 * saved memory isn't worth the implementation complexity, considering
	 * the fact that the whole BIND9 process (which is mainly named) already
	 * requires a pretty large memory footprint.  We may, however, have to
	 * revisit the decision when we want to use it as a separate module for
	 * an environment where memory requirements are stricter.
	 */
	in_port_t	*v4ports;	/*%< available ports for IPv4 */
	unsigned int	nv4ports;	/*%< # of available ports for IPv4 */
	in_port_t	*v6ports;	/*%< available ports for IPv6 */
	unsigned int	nv6ports;	/*%< # of available ports for IPv6 */
};
136
/*% Bit in dns_dispatchmgr 'state' marking the manager as shutting down. */
#define MGR_SHUTTINGDOWN		0x00000001U
/*% True when the MGR_SHUTTINGDOWN bit is set in (l)->state. */
#define MGR_IS_SHUTTINGDOWN(l)	(((l)->state & MGR_SHUTTINGDOWN) != 0)

/*% True when dispatch 'd' has the DNS_DISPATCHATTR_PRIVATE attribute set. */
#define IS_PRIVATE(d)	(((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
141
/*%
 * One outstanding request registered with a dispatch.  Entries are kept in
 * the QID hash table and are looked up by (id, host, port); see
 * entry_search().
 */
struct dns_dispentry {
	unsigned int			magic;	/*%< RESPONSE_MAGIC when valid */
	dns_dispatch_t		       *disp;
	dns_messageid_t			id;	/*%< query ID matched on lookup */
	in_port_t			port;	/*%< port matched on lookup */
	unsigned int			bucket;	/*%< index into qid_table */
	isc_sockaddr_t			host;	/*%< address matched on lookup */
	/* NOTE(review): task/action/arg presumably describe how events are
	 * delivered to the requester; not shown in this part of the file. */
	isc_task_t		       *task;
	isc_taskaction_t		action;
	void			       *arg;
	isc_boolean_t			item_out;
	dispsocket_t			*dispsocket; /*%< dedicated socket, or
						      *   NULL once deactivated */
	ISC_LIST(dns_dispatchevent_t)	items;
	ISC_LINK(dns_dispentry_t)	link;	/*%< qid_table bucket linkage */
};
157
/*%
 * Maximum number of dispatch sockets that can be pooled for reuse.  The
 * appropriate value may vary, but experiments have shown that a busy caching
 * server may need more than 1000 sockets open concurrently.  The maximum
 * allowable number of dispatch sockets (per manager) will be set to twice
 * this value.  May be overridden at build time.
 */
#ifndef DNS_DISPATCH_POOLSOCKS
#define DNS_DISPATCH_POOLSOCKS			2048
#endif
168
/*%
 * Quota to control the number of dispatch sockets.  If a dispatch has more
 * sockets than the quota, new queries will purge the oldest ones, so that
 * a massive number of outstanding queries won't prevent subsequent queries
 * (especially if the older ones take a long time and end in a timeout).
 * May be overridden at build time.
 */
#ifndef DNS_DISPATCH_SOCKSQUOTA
#define DNS_DISPATCH_SOCKSQUOTA			3072
#endif
178
/*%
 * A socket dedicated to a single transaction of a dispatch.  It is linked
 * both into one of the dispatch's socket lists ('link') and into a bucket
 * of the QID socket table ('blink').
 */
struct dispsocket {
	unsigned int			magic;	/*%< DISPSOCK_MAGIC when valid */
	isc_socket_t			*socket;
	dns_dispatch_t			*disp;	/*%< owning dispatch */
	isc_sockaddr_t			host;	/*%< destination address */
	in_port_t			localport; /* XXX: should be removed later */
	dispportentry_t			*portentry; /*%< local port in use */
	dns_dispentry_t			*resp;	/*%< request using this socket */
	isc_task_t			*task;	/*%< one of disp->task[] */
	ISC_LINK(dispsocket_t)		link;	/*%< active/inactive list */
	unsigned int			bucket;	/*%< index into sock_table */
	ISC_LINK(dispsocket_t)		blink;	/*%< sock_table bucket linkage */
};
192
/*%
 * A port table entry.  We remember every port we first open in a table with a
 * reference counter so that we can 'reuse' the same port (with different
 * destination addresses) using the SO_REUSEADDR socket option.
 */
struct dispportentry {
	in_port_t			port;	/*%< local port number */
	unsigned int			refs;	/*%< sockets using this port */
	ISC_LINK(struct dispportentry)	link;	/*%< port_table bucket linkage */
};
203
/*%
 * Number of buckets in a dispatch's port table; a port is placed in bucket
 * (port % DNS_DISPATCH_PORTTABLESIZE).  May be overridden at build time.
 */
#ifndef DNS_DISPATCH_PORTTABLESIZE
#define DNS_DISPATCH_PORTTABLESIZE	1024
#endif

/*% Sentinel bucket number that never refers to a real hash bucket. */
#define INVALID_BUCKET		(0xffffdead)
209
/*%
 * Number of tasks for each dispatch that uses separate sockets for different
 * transactions.  This must be a power of 2, as it will divide 32-bit numbers
 * to get a uniformly random task selection.  See get_dispsocket().
 */
#define MAX_INTERNAL_TASKS	64
216
/*%
 * A dispatch: the object through which requests are registered and sent.
 * A dispatch either uses its own QID table or the manager-wide one,
 * depending on its socket type (see DNS_QID()).
 */
struct dns_dispatch {
	/* Unlocked. */
	unsigned int		magic;		/*%< magic */
	dns_dispatchmgr_t      *mgr;		/*%< dispatch manager */
	int			ntasks;		/*%< valid entries in task[] */
	/*%
	 * internal task buckets.  We use multiple tasks to distribute various
	 * socket events well when using separate dispatch sockets.  We use the
	 * 1st task (task[0]) for internal control events.
	 */
	isc_task_t	       *task[MAX_INTERNAL_TASKS];
	isc_socket_t	       *socket;		/*%< isc socket attached to */
	isc_sockaddr_t		local;		/*%< local address */
	in_port_t		localport;	/*%< local UDP port */
	unsigned int		maxrequests;	/*%< max requests */
	isc_event_t	       *ctlevent;

	isc_mutex_t		sepool_lock;
	isc_mempool_t	       *sepool;		/*%< pool for socket events */

	/*% Locked by mgr->lock. */
	ISC_LINK(dns_dispatch_t) link;

	/* Locked by "lock". */
	isc_mutex_t		lock;		/*%< locks all below */
	isc_sockettype_t	socktype;
	unsigned int		attributes;
	unsigned int		refcount;	/*%< number of users */
	dns_dispatchevent_t    *failsafe_ev;	/*%< failsafe cancel event */
	unsigned int		shutting_down : 1,
				shutdown_out : 1,
				connected : 1,
				tcpmsg_valid : 1,
				recv_pending : 1; /*%< is a recv() pending? */
	isc_result_t		shutdown_why;
	ISC_LIST(dispsocket_t)	activesockets;	/*%< sockets in use */
	ISC_LIST(dispsocket_t)	inactivesockets; /*%< sockets kept for reuse */
	unsigned int		nsockets;	/*%< dispsockets allocated */
	unsigned int		requests;	/*%< how many requests we have */
	unsigned int		tcpbuffers;	/*%< allocated buffers */
	dns_tcpmsg_t		tcpmsg;		/*%< for tcp streams */
	dns_qid_t		*qid;
	arc4ctx_t		arc4ctx;	/*%< for QID/UDP port num */
	dispportlist_t		*port_table;	/*%< hold ports 'owned' by us */
	isc_mempool_t		*portpool;	/*%< port table entries  */
};
263
/*
 * Magic numbers and validity checks for the structures in this file.
 * Each VALID_*() macro tests the object's magic via ISC_MAGIC_VALID().
 */

/* dns_qid_t */
#define QID_MAGIC		ISC_MAGIC('Q', 'i', 'd', ' ')
#define VALID_QID(e)		ISC_MAGIC_VALID((e), QID_MAGIC)

/* dns_dispentry_t */
#define RESPONSE_MAGIC		ISC_MAGIC('D', 'r', 's', 'p')
#define VALID_RESPONSE(e)	ISC_MAGIC_VALID((e), RESPONSE_MAGIC)

/* dispsocket_t */
#define DISPSOCK_MAGIC		ISC_MAGIC('D', 's', 'o', 'c')
#define VALID_DISPSOCK(e)	ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)

/* dns_dispatch_t */
#define DISPATCH_MAGIC		ISC_MAGIC('D', 'i', 's', 'p')
#define VALID_DISPATCH(e)	ISC_MAGIC_VALID((e), DISPATCH_MAGIC)

/* dns_dispatchmgr_t */
#define DNS_DISPATCHMGR_MAGIC	ISC_MAGIC('D', 'M', 'g', 'r')
#define VALID_DISPATCHMGR(e)	ISC_MAGIC_VALID((e), DNS_DISPATCHMGR_MAGIC)
278
/*%
 * Select the QID table used by 'disp': a TCP dispatch uses its own table,
 * any other dispatch uses the manager-wide one.
 *
 * The whole conditional is parenthesized so that the macro can be used
 * safely as an operand of a larger expression.
 */
#define DNS_QID(disp) (((disp)->socktype == isc_sockettype_tcp) ? \
		       (disp)->qid : (disp)->mgr->qid)
/*%
 * Select the ARC4 context used by 'disp': a UDP dispatch uses its own
 * context, any other dispatch uses the manager-wide one.  Yields a pointer.
 */
#define DISP_ARC4CTX(disp) (((disp)->socktype == isc_sockettype_udp) ? \
			(&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx))
283
/*%
 * Locking a query port buffer is a bit tricky.  We access the buffer without
 * locking until qid is created.  Technically, there is a possibility of race
 * between the creation of qid and access to the port buffer; in practice,
 * however, this should be safe because qid isn't created until the first
 * dispatch is created and there should be no contending situation until then.
 *
 * Both macros are statements (use them as "PORTBUFLOCK(mgr);").  They are
 * wrapped in do { } while (0) so that a following "else" cannot bind to the
 * macro's internal "if".
 */
#define PORTBUFLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			LOCK(&((mgr)->qid->lock)); \
	} while (0)
#define PORTBUFUNLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			UNLOCK((&(mgr)->qid->lock)); \
	} while (0)
293
/*
 * Forward declarations of the file-local (static) functions.
 *
 * Note that the first parameter of linear_first() and linear_next() is
 * named 'disp' here although its type is dns_qid_t *; the definitions
 * name it 'qid'.
 */
static dns_dispentry_t *entry_search(dns_qid_t *, isc_sockaddr_t *,
				     dns_messageid_t, in_port_t, unsigned int);
static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
static void destroy_disp(isc_task_t *task, isc_event_t *event);
static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
static void udp_exrecv(isc_task_t *, isc_event_t *);
static void udp_shrecv(isc_task_t *, isc_event_t *);
static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
static void tcp_recv(isc_task_t *, isc_event_t *);
static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
			     in_port_t);
static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
static void *allocate_udp_buffer(dns_dispatch_t *disp);
static inline void free_devent(dns_dispatch_t *disp, dns_dispatchevent_t *ev);
static inline dns_dispatchevent_t *allocate_devent(dns_dispatch_t *disp);
static void do_cancel(dns_dispatch_t *disp);
static dns_dispentry_t *linear_first(dns_qid_t *disp);
static dns_dispentry_t *linear_next(dns_qid_t *disp,
				    dns_dispentry_t *resp);
static void dispatch_free(dns_dispatch_t **dispp);
static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
				  dns_dispatch_t *disp,
				  isc_socketmgr_t *sockmgr,
				  isc_sockaddr_t *localaddr,
				  isc_socket_t **sockp,
				  isc_socket_t *dup_socket);
static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
				       isc_socketmgr_t *sockmgr,
				       isc_taskmgr_t *taskmgr,
				       isc_sockaddr_t *localaddr,
				       unsigned int maxrequests,
				       unsigned int attributes,
				       dns_dispatch_t **dispp,
				       isc_socket_t *dup_socket);
static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
static void destroy_mgr(dns_dispatchmgr_t **mgrp);
static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
				 unsigned int increment, dns_qid_t **qidp,
				 isc_boolean_t needaddrtable);
static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
				unsigned int options, isc_socket_t **sockp,
				isc_socket_t *dup_socket);
static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
				   isc_sockaddr_t *sockaddrp);
344
/*% Shorthand for the ISC debug log level 'x'. */
#define LVL(x) ISC_LOG_DEBUG(x)
346
347static void
348mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...)
349     ISC_FORMAT_PRINTF(3, 4);
350
351static void
352mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...) {
353	char msgbuf[2048];
354	va_list ap;
355
356	if (! isc_log_wouldlog(dns_lctx, level))
357		return;
358
359	va_start(ap, fmt);
360	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
361	va_end(ap);
362
363	isc_log_write(dns_lctx,
364		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
365		      level, "dispatchmgr %p: %s", mgr, msgbuf);
366}
367
368static inline void
369inc_stats(dns_dispatchmgr_t *mgr, isc_statscounter_t counter) {
370	if (mgr->stats != NULL)
371		isc_stats_increment(mgr->stats, counter);
372}
373
374static void
375dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...)
376     ISC_FORMAT_PRINTF(3, 4);
377
378static void
379dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
380	char msgbuf[2048];
381	va_list ap;
382
383	if (! isc_log_wouldlog(dns_lctx, level))
384		return;
385
386	va_start(ap, fmt);
387	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
388	va_end(ap);
389
390	isc_log_write(dns_lctx,
391		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
392		      level, "dispatch %p: %s", disp, msgbuf);
393}
394
395static void
396request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
397	    int level, const char *fmt, ...)
398     ISC_FORMAT_PRINTF(4, 5);
399
400static void
401request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
402	    int level, const char *fmt, ...)
403{
404	char msgbuf[2048];
405	char peerbuf[256];
406	va_list ap;
407
408	if (! isc_log_wouldlog(dns_lctx, level))
409		return;
410
411	va_start(ap, fmt);
412	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
413	va_end(ap);
414
415	if (VALID_RESPONSE(resp)) {
416		isc_sockaddr_format(&resp->host, peerbuf, sizeof(peerbuf));
417		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
418			      DNS_LOGMODULE_DISPATCH, level,
419			      "dispatch %p response %p %s: %s", disp, resp,
420			      peerbuf, msgbuf);
421	} else {
422		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
423			      DNS_LOGMODULE_DISPATCH, level,
424			      "dispatch %p req/resp %p: %s", disp, resp,
425			      msgbuf);
426	}
427}
428
429/*%
430 * ARC4 random number generator derived from OpenBSD.
431 * Only dispatch_random() and dispatch_uniformrandom() are expected
432 * to be called from general dispatch routines; the rest of them are subroutines
433 * for these two.
434 *
435 * The original copyright follows:
436 * Copyright (c) 1996, David Mazieres <dm@uun.org>
437 * Copyright (c) 2008, Damien Miller <djm@openbsd.org>
438 *
439 * Permission to use, copy, modify, and distribute this software for any
440 * purpose with or without fee is hereby granted, provided that the above
441 * copyright notice and this permission notice appear in all copies.
442 *
443 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
444 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
445 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
446 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
447 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
448 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
449 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
450 */
451#ifdef BIND9
452static void
453dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
454		    isc_mutex_t *lock)
455{
456	int n;
457	for (n = 0; n < 256; n++)
458		actx->s[n] = n;
459	actx->i = 0;
460	actx->j = 0;
461	actx->count = 0;
462	actx->entropy = entropy; /* don't have to attach */
463	actx->lock = lock;
464}
465
466static void
467dispatch_arc4addrandom(arc4ctx_t *actx, unsigned char *dat, int datlen) {
468	int n;
469	isc_uint8_t si;
470
471	actx->i--;
472	for (n = 0; n < 256; n++) {
473		actx->i = (actx->i + 1);
474		si = actx->s[actx->i];
475		actx->j = (actx->j + si + dat[n % datlen]);
476		actx->s[actx->i] = actx->s[actx->j];
477		actx->s[actx->j] = si;
478	}
479	actx->j = actx->i;
480}
481
482static inline isc_uint8_t
483dispatch_arc4get8(arc4ctx_t *actx) {
484	isc_uint8_t si, sj;
485
486	actx->i = (actx->i + 1);
487	si = actx->s[actx->i];
488	actx->j = (actx->j + si);
489	sj = actx->s[actx->j];
490	actx->s[actx->i] = sj;
491	actx->s[actx->j] = si;
492
493	return (actx->s[(si + sj) & 0xff]);
494}
495
496static inline isc_uint16_t
497dispatch_arc4get16(arc4ctx_t *actx) {
498	isc_uint16_t val;
499
500	val = dispatch_arc4get8(actx) << 8;
501	val |= dispatch_arc4get8(actx);
502
503	return (val);
504}
505
506static void
507dispatch_arc4stir(arc4ctx_t *actx) {
508	int i;
509	union {
510		unsigned char rnd[128];
511		isc_uint32_t rnd32[32];
512	} rnd;
513	isc_result_t result;
514
515	if (actx->entropy != NULL) {
516		/*
517		 * We accept any quality of random data to avoid blocking.
518		 */
519		result = isc_entropy_getdata(actx->entropy, rnd.rnd,
520					     sizeof(rnd), NULL, 0);
521		RUNTIME_CHECK(result == ISC_R_SUCCESS);
522	} else {
523		for (i = 0; i < 32; i++)
524			isc_random_get(&rnd.rnd32[i]);
525	}
526	dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd));
527
528	/*
529	 * Discard early keystream, as per recommendations in:
530	 * http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps
531	 */
532	for (i = 0; i < 256; i++)
533		(void)dispatch_arc4get8(actx);
534
535	/*
536	 * Derived from OpenBSD's implementation.  The rationale is not clear,
537	 * but should be conservative enough in safety, and reasonably large
538	 * for efficiency.
539	 */
540	actx->count = 1600000;
541}
542
543static isc_uint16_t
544dispatch_random(arc4ctx_t *actx) {
545	isc_uint16_t result;
546
547	if (actx->lock != NULL)
548		LOCK(actx->lock);
549
550	actx->count -= sizeof(isc_uint16_t);
551	if (actx->count <= 0)
552		dispatch_arc4stir(actx);
553	result = dispatch_arc4get16(actx);
554
555	if (actx->lock != NULL)
556		UNLOCK(actx->lock);
557
558	return (result);
559}
560#else
561/*
562 * For general purpose library, we don't have to be too strict about the
563 * quality of random values.  Performance doesn't matter much, either.
564 * So we simply use the isc_random module to keep the library as small as
565 * possible.
566 */
567
568static void
569dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
570		    isc_mutex_t *lock)
571{
572	UNUSED(actx);
573	UNUSED(entropy);
574	UNUSED(lock);
575
576	return;
577}
578
579static isc_uint16_t
580dispatch_random(arc4ctx_t *actx) {
581	isc_uint32_t r;
582
583	UNUSED(actx);
584
585	isc_random_get(&r);
586	return (r & 0xffff);
587}
588#endif	/* BIND9 */
589
590static isc_uint16_t
591dispatch_uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) {
592	isc_uint16_t min, r;
593
594	if (upper_bound < 2)
595		return (0);
596
597	/*
598	 * Ensure the range of random numbers [min, 0xffff] be a multiple of
599	 * upper_bound and contain at least a half of the 16 bit range.
600	 */
601
602	if (upper_bound > 0x8000)
603		min = 1 + ~upper_bound; /* 0x8000 - upper_bound */
604	else
605		min = (isc_uint16_t)(0x10000 % (isc_uint32_t)upper_bound);
606
607	/*
608	 * This could theoretically loop forever but each retry has
609	 * p > 0.5 (worst case, usually far better) of selecting a
610	 * number inside the range we need, so it should rarely need
611	 * to re-roll.
612	 */
613	for (;;) {
614		r = dispatch_random(actx);
615		if (r >= min)
616			break;
617	}
618
619	return (r % upper_bound);
620}
621
622/*
623 * Return a hash of the destination and message id.
624 */
625static isc_uint32_t
626dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
627	 in_port_t port)
628{
629	unsigned int ret;
630
631	ret = isc_sockaddr_hash(dest, ISC_TRUE);
632	ret ^= (id << 16) | port;
633	ret %= qid->qid_nbuckets;
634
635	INSIST(ret < qid->qid_nbuckets);
636
637	return (ret);
638}
639
640/*
641 * Find the first entry in 'qid'.  Returns NULL if there are no entries.
642 */
643static dns_dispentry_t *
644linear_first(dns_qid_t *qid) {
645	dns_dispentry_t *ret;
646	unsigned int bucket;
647
648	bucket = 0;
649
650	while (bucket < qid->qid_nbuckets) {
651		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
652		if (ret != NULL)
653			return (ret);
654		bucket++;
655	}
656
657	return (NULL);
658}
659
660/*
661 * Find the next entry after 'resp' in 'qid'.  Return NULL if there are
662 * no more entries.
663 */
664static dns_dispentry_t *
665linear_next(dns_qid_t *qid, dns_dispentry_t *resp) {
666	dns_dispentry_t *ret;
667	unsigned int bucket;
668
669	ret = ISC_LIST_NEXT(resp, link);
670	if (ret != NULL)
671		return (ret);
672
673	bucket = resp->bucket;
674	bucket++;
675	while (bucket < qid->qid_nbuckets) {
676		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
677		if (ret != NULL)
678			return (ret);
679		bucket++;
680	}
681
682	return (NULL);
683}
684
685/*
686 * The dispatch must be locked.
687 */
688static isc_boolean_t
689destroy_disp_ok(dns_dispatch_t *disp)
690{
691	if (disp->refcount != 0)
692		return (ISC_FALSE);
693
694	if (disp->recv_pending != 0)
695		return (ISC_FALSE);
696
697	if (!ISC_LIST_EMPTY(disp->activesockets))
698		return (ISC_FALSE);
699
700	if (disp->shutting_down == 0)
701		return (ISC_FALSE);
702
703	return (ISC_TRUE);
704}
705
706/*
707 * Called when refcount reaches 0 (and safe to destroy).
708 *
709 * The dispatcher must be locked.
710 * The manager must not be locked.
711 */
712static void
713destroy_disp(isc_task_t *task, isc_event_t *event) {
714	dns_dispatch_t *disp;
715	dns_dispatchmgr_t *mgr;
716	isc_boolean_t killmgr;
717	dispsocket_t *dispsocket;
718	int i;
719
720	INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);
721
722	UNUSED(task);
723
724	disp = event->ev_arg;
725	mgr = disp->mgr;
726
727	LOCK(&mgr->lock);
728	ISC_LIST_UNLINK(mgr->list, disp, link);
729
730	dispatch_log(disp, LVL(90),
731		     "shutting down; detaching from sock %p, task %p",
732		     disp->socket, disp->task[0]); /* XXXX */
733
734	if (disp->sepool != NULL) {
735		isc_mempool_destroy(&disp->sepool);
736		(void)isc_mutex_destroy(&disp->sepool_lock);
737	}
738
739	if (disp->socket != NULL)
740		isc_socket_detach(&disp->socket);
741	while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
742		ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
743		destroy_dispsocket(disp, &dispsocket);
744	}
745	for (i = 0; i < disp->ntasks; i++)
746		isc_task_detach(&disp->task[i]);
747	isc_event_free(&event);
748
749	dispatch_free(&disp);
750
751	killmgr = destroy_mgr_ok(mgr);
752	UNLOCK(&mgr->lock);
753	if (killmgr)
754		destroy_mgr(&mgr);
755}
756
757/*%
758 * Manipulate port table per dispatch: find an entry for a given port number,
759 * create a new entry, and decrement a given entry with possible clean-up.
760 */
761static dispportentry_t *
762port_search(dns_dispatch_t *disp, in_port_t port) {
763	dispportentry_t *portentry;
764
765	REQUIRE(disp->port_table != NULL);
766
767	portentry = ISC_LIST_HEAD(disp->port_table[port %
768						   DNS_DISPATCH_PORTTABLESIZE]);
769	while (portentry != NULL) {
770		if (portentry->port == port)
771			return (portentry);
772		portentry = ISC_LIST_NEXT(portentry, link);
773	}
774
775	return (NULL);
776}
777
778static dispportentry_t *
779new_portentry(dns_dispatch_t *disp, in_port_t port) {
780	dispportentry_t *portentry;
781	dns_qid_t *qid;
782
783	REQUIRE(disp->port_table != NULL);
784
785	portentry = isc_mempool_get(disp->portpool);
786	if (portentry == NULL)
787		return (portentry);
788
789	portentry->port = port;
790	portentry->refs = 1;
791	ISC_LINK_INIT(portentry, link);
792	qid = DNS_QID(disp);
793	LOCK(&qid->lock);
794	ISC_LIST_APPEND(disp->port_table[port % DNS_DISPATCH_PORTTABLESIZE],
795			portentry, link);
796	UNLOCK(&qid->lock);
797
798	return (portentry);
799}
800
801/*%
802 * The caller must not hold the qid->lock.
803 */
804static void
805deref_portentry(dns_dispatch_t *disp, dispportentry_t **portentryp) {
806	dispportentry_t *portentry = *portentryp;
807	dns_qid_t *qid;
808
809	REQUIRE(disp->port_table != NULL);
810	REQUIRE(portentry != NULL && portentry->refs > 0);
811
812	qid = DNS_QID(disp);
813	LOCK(&qid->lock);
814	portentry->refs--;
815
816	if (portentry->refs == 0) {
817		ISC_LIST_UNLINK(disp->port_table[portentry->port %
818						 DNS_DISPATCH_PORTTABLESIZE],
819				portentry, link);
820		isc_mempool_put(disp->portpool, portentry);
821	}
822	UNLOCK(&qid->lock);
823
824	*portentryp = NULL;
825}
826
827/*%
828 * Find a dispsocket for socket address 'dest', and port number 'port'.
829 * Return NULL if no such entry exists.
830 */
831static dispsocket_t *
832socket_search(dns_qid_t *qid, isc_sockaddr_t *dest, in_port_t port,
833	      unsigned int bucket)
834{
835	dispsocket_t *dispsock;
836
837	REQUIRE(VALID_QID(qid));
838	REQUIRE(bucket < qid->qid_nbuckets);
839
840	dispsock = ISC_LIST_HEAD(qid->sock_table[bucket]);
841
842	while (dispsock != NULL) {
843		if (dispsock->portentry != NULL &&
844		    dispsock->portentry->port == port &&
845		    isc_sockaddr_equal(dest, &dispsock->host))
846			return (dispsock);
847		dispsock = ISC_LIST_NEXT(dispsock, blink);
848	}
849
850	return (NULL);
851}
852
/*%
 * Make a new socket for a single dispatch with a random port number.
 * The caller must hold the disp->lock
 *
 * A dispsocket structure is taken from the dispatch's inactive list when
 * one is available, and allocated from the manager's pool otherwise.  Up to
 * 64 randomly chosen local ports are then tried.
 *
 * On success '*dispsockp' is set to the dispsocket (now entered in the QID
 * socket table) and '*portp' to the chosen local port.  On failure the
 * dispsocket is destroyed and neither output is written.
 *
 * Returns ISC_R_ADDRNOTAVAIL when no ports are configured for the address
 * family of disp->local, ISC_R_NOMEMORY on allocation failure, or the
 * result of the last open_socket() attempt (ISC_R_FAILURE if no attempt was
 * made at all).
 */
static isc_result_t
get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
	       isc_socketmgr_t *sockmgr, dispsocket_t **dispsockp,
	       in_port_t *portp)
{
	int i;
	isc_uint32_t r;
	dns_dispatchmgr_t *mgr = disp->mgr;
	isc_socket_t *sock = NULL;
	isc_result_t result = ISC_R_FAILURE;
	in_port_t port;
	isc_sockaddr_t localaddr;
	unsigned int bucket = 0;
	dispsocket_t *dispsock;
	unsigned int nports;
	in_port_t *ports;
	unsigned int bindoptions;
	dispportentry_t *portentry = NULL;
	dns_qid_t *qid;

	/* Select the port set that matches the local address family. */
	if (isc_sockaddr_pf(&disp->local) == AF_INET) {
		nports = disp->mgr->nv4ports;
		ports = disp->mgr->v4ports;
	} else {
		nports = disp->mgr->nv6ports;
		ports = disp->mgr->v6ports;
	}
	if (nports == 0)
		return (ISC_R_ADDRNOTAVAIL);

	dispsock = ISC_LIST_HEAD(disp->inactivesockets);
	if (dispsock != NULL) {
		/*
		 * Reuse a pooled structure; its socket object is handed to
		 * open_socket() below through 'sock'.
		 */
		ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link);
		sock = dispsock->socket;
		dispsock->socket = NULL;
	} else {
		dispsock = isc_mempool_get(mgr->spool);
		if (dispsock == NULL)
			return (ISC_R_NOMEMORY);

		disp->nsockets++;
		dispsock->socket = NULL;
		dispsock->disp = disp;
		dispsock->resp = NULL;
		dispsock->portentry = NULL;
		/* Bind the new socket to a randomly chosen internal task. */
		isc_random_get(&r);
		dispsock->task = NULL;
		isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task);
		ISC_LINK_INIT(dispsock, link);
		ISC_LINK_INIT(dispsock, blink);
		dispsock->magic = DISPSOCK_MAGIC;
	}

	/*
	 * Pick up a random UDP port and open a new socket with it.  Avoid
	 * choosing ports that share the same destination because it will be
	 * very likely to fail in bind(2) or connect(2).
	 */
	localaddr = disp->local;
	qid = DNS_QID(disp);

	for (i = 0; i < 64; i++) {
		port = ports[dispatch_uniformrandom(DISP_ARC4CTX(disp),
							nports)];
		isc_sockaddr_setport(&localaddr, port);

		/* Skip ports already in use for this destination. */
		LOCK(&qid->lock);
		bucket = dns_hash(qid, dest, 0, port);
		if (socket_search(qid, dest, port, bucket) != NULL) {
			UNLOCK(&qid->lock);
			continue;
		}
		UNLOCK(&qid->lock);
		bindoptions = 0;
		portentry = port_search(disp, port);

		/* A port this dispatch already has open must be shared. */
		if (portentry != NULL)
			bindoptions |= ISC_SOCKET_REUSEADDRESS;
		result = open_socket(sockmgr, &localaddr, bindoptions, &sock,
				     NULL);
		if (result == ISC_R_SUCCESS) {
			if (portentry == NULL) {
				portentry = new_portentry(disp, port);
				if (portentry == NULL) {
					result = ISC_R_NOMEMORY;
					break;
				}
			} else {
				LOCK(&qid->lock);
				portentry->refs++;
				UNLOCK(&qid->lock);
			}
			break;
		} else if (result == ISC_R_NOPERM) {
			/* Log and try another port. */
			char buf[ISC_SOCKADDR_FORMATSIZE];
			isc_sockaddr_format(&localaddr, buf, sizeof(buf));
			dispatch_log(disp, ISC_LOG_WARNING,
				     "open_socket(%s) -> %s: continuing",
				     buf, isc_result_totext(result));
		} else if (result != ISC_R_ADDRINUSE)
			/* Only "address in use" is retried silently. */
			break;
	}

	if (result == ISC_R_SUCCESS) {
		dispsock->socket = sock;
		dispsock->host = *dest;
		dispsock->portentry = portentry;
		dispsock->bucket = bucket;
		LOCK(&qid->lock);
		ISC_LIST_APPEND(qid->sock_table[bucket], dispsock, blink);
		UNLOCK(&qid->lock);
		*dispsockp = dispsock;
		*portp = port;
	} else {
		/*
		 * We could keep it in the inactive list, but since this should
		 * be an exceptional case and might be resource shortage, we'd
		 * rather destroy it.
		 */
		if (sock != NULL)
			isc_socket_detach(&sock);
		destroy_dispsocket(disp, &dispsock);
	}

	return (result);
}
983
984/*%
985 * Destroy a dedicated dispatch socket.
986 */
987static void
988destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
989	dispsocket_t *dispsock;
990	dns_qid_t *qid;
991
992	/*
993	 * The dispatch must be locked.
994	 */
995
996	REQUIRE(dispsockp != NULL && *dispsockp != NULL);
997	dispsock = *dispsockp;
998	REQUIRE(!ISC_LINK_LINKED(dispsock, link));
999
1000	disp->nsockets--;
1001	dispsock->magic = 0;
1002	if (dispsock->portentry != NULL)
1003		deref_portentry(disp, &dispsock->portentry);
1004	if (dispsock->socket != NULL)
1005		isc_socket_detach(&dispsock->socket);
1006	if (ISC_LINK_LINKED(dispsock, blink)) {
1007		qid = DNS_QID(disp);
1008		LOCK(&qid->lock);
1009		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
1010				blink);
1011		UNLOCK(&qid->lock);
1012	}
1013	if (dispsock->task != NULL)
1014		isc_task_detach(&dispsock->task);
1015	isc_mempool_put(disp->mgr->spool, dispsock);
1016
1017	*dispsockp = NULL;
1018}
1019
1020/*%
1021 * Deactivate a dedicated dispatch socket.  Move it to the inactive list for
1022 * future reuse unless the total number of sockets are exceeding the maximum.
1023 */
1024static void
1025deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
1026	isc_result_t result;
1027	dns_qid_t *qid;
1028
1029	/*
1030	 * The dispatch must be locked.
1031	 */
1032	ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
1033	if (dispsock->resp != NULL) {
1034		INSIST(dispsock->resp->dispsocket == dispsock);
1035		dispsock->resp->dispsocket = NULL;
1036	}
1037
1038	INSIST(dispsock->portentry != NULL);
1039	deref_portentry(disp, &dispsock->portentry);
1040
1041#ifdef BIND9
1042	if (disp->nsockets > DNS_DISPATCH_POOLSOCKS)
1043		destroy_dispsocket(disp, &dispsock);
1044	else {
1045		result = isc_socket_close(dispsock->socket);
1046
1047		qid = DNS_QID(disp);
1048		LOCK(&qid->lock);
1049		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
1050				blink);
1051		UNLOCK(&qid->lock);
1052
1053		if (result == ISC_R_SUCCESS)
1054			ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
1055		else {
1056			/*
1057			 * If the underlying system does not allow this
1058			 * optimization, destroy this temporary structure (and
1059			 * create a new one for a new transaction).
1060			 */
1061			INSIST(result == ISC_R_NOTIMPLEMENTED);
1062			destroy_dispsocket(disp, &dispsock);
1063		}
1064	}
1065#else
1066	/* This kind of optimization isn't necessary for normal use */
1067	UNUSED(qid);
1068	UNUSED(result);
1069
1070	destroy_dispsocket(disp, &dispsock);
1071#endif
1072}
1073
1074/*
1075 * Find an entry for query ID 'id', socket address 'dest', and port number
1076 * 'port'.
1077 * Return NULL if no such entry exists.
1078 */
1079static dns_dispentry_t *
1080entry_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
1081	     in_port_t port, unsigned int bucket)
1082{
1083	dns_dispentry_t *res;
1084
1085	REQUIRE(VALID_QID(qid));
1086	REQUIRE(bucket < qid->qid_nbuckets);
1087
1088	res = ISC_LIST_HEAD(qid->qid_table[bucket]);
1089
1090	while (res != NULL) {
1091		if (res->id == id && isc_sockaddr_equal(dest, &res->host) &&
1092		    res->port == port) {
1093			return (res);
1094		}
1095		res = ISC_LIST_NEXT(res, link);
1096	}
1097
1098	return (NULL);
1099}
1100
1101static void
1102free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len) {
1103	isc_mempool_t *bpool;
1104	INSIST(buf != NULL && len != 0);
1105
1106
1107	switch (disp->socktype) {
1108	case isc_sockettype_tcp:
1109		INSIST(disp->tcpbuffers > 0);
1110		disp->tcpbuffers--;
1111		isc_mem_put(disp->mgr->mctx, buf, len);
1112		break;
1113	case isc_sockettype_udp:
1114		LOCK(&disp->mgr->buffer_lock);
1115		INSIST(disp->mgr->buffers > 0);
1116		INSIST(len == disp->mgr->buffersize);
1117		disp->mgr->buffers--;
1118		bpool = disp->mgr->bpool;
1119		UNLOCK(&disp->mgr->buffer_lock);
1120		isc_mempool_put(bpool, buf);
1121		break;
1122	default:
1123		INSIST(0);
1124		break;
1125	}
1126}
1127
1128static void *
1129allocate_udp_buffer(dns_dispatch_t *disp) {
1130	isc_mempool_t *bpool;
1131	void *temp;
1132
1133	LOCK(&disp->mgr->buffer_lock);
1134	bpool = disp->mgr->bpool;
1135	disp->mgr->buffers++;
1136	UNLOCK(&disp->mgr->buffer_lock);
1137
1138	temp = isc_mempool_get(bpool);
1139
1140	if (temp == NULL) {
1141		LOCK(&disp->mgr->buffer_lock);
1142		disp->mgr->buffers--;
1143		UNLOCK(&disp->mgr->buffer_lock);
1144	}
1145
1146	return (temp);
1147}
1148
1149static inline void
1150free_sevent(isc_event_t *ev) {
1151	isc_mempool_t *pool = ev->ev_destroy_arg;
1152	isc_socketevent_t *sev = (isc_socketevent_t *) ev;
1153	isc_mempool_put(pool, sev);
1154}
1155
1156static inline isc_socketevent_t *
1157allocate_sevent(dns_dispatch_t *disp, isc_socket_t *socket,
1158		isc_eventtype_t type, isc_taskaction_t action, const void *arg)
1159{
1160	isc_socketevent_t *ev;
1161	void *deconst_arg;
1162
1163	ev = isc_mempool_get(disp->sepool);
1164	if (ev == NULL)
1165		return (NULL);
1166	DE_CONST(arg, deconst_arg);
1167	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, type,
1168		       action, deconst_arg, socket,
1169		       free_sevent, disp->sepool);
1170	ev->result = ISC_R_UNSET;
1171	ISC_LINK_INIT(ev, ev_link);
1172	ISC_LIST_INIT(ev->bufferlist);
1173	ev->region.base = NULL;
1174	ev->n = 0;
1175	ev->offset = 0;
1176	ev->attributes = 0;
1177
1178	return (ev);
1179}
1180
1181
1182static inline void
1183free_devent(dns_dispatch_t *disp, dns_dispatchevent_t *ev) {
1184	if (disp->failsafe_ev == ev) {
1185		INSIST(disp->shutdown_out == 1);
1186		disp->shutdown_out = 0;
1187
1188		return;
1189	}
1190
1191	isc_mempool_put(disp->mgr->depool, ev);
1192}
1193
1194static inline dns_dispatchevent_t *
1195allocate_devent(dns_dispatch_t *disp) {
1196	dns_dispatchevent_t *ev;
1197
1198	ev = isc_mempool_get(disp->mgr->depool);
1199	if (ev == NULL)
1200		return (NULL);
1201	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, 0,
1202		       NULL, NULL, NULL, NULL, NULL);
1203
1204	return (ev);
1205}
1206
1207static void
1208udp_exrecv(isc_task_t *task, isc_event_t *ev) {
1209	dispsocket_t *dispsock = ev->ev_arg;
1210
1211	UNUSED(task);
1212
1213	REQUIRE(VALID_DISPSOCK(dispsock));
1214	udp_recv(ev, dispsock->disp, dispsock);
1215}
1216
1217static void
1218udp_shrecv(isc_task_t *task, isc_event_t *ev) {
1219	dns_dispatch_t *disp = ev->ev_arg;
1220
1221	UNUSED(task);
1222
1223	REQUIRE(VALID_DISPATCH(disp));
1224	udp_recv(ev, disp, NULL);
1225}
1226
/*
 * Handle a completed UDP receive for 'disp'.  'dispsock' is the dedicated
 * socket the packet arrived on, or NULL when it arrived on the dispatch's
 * shared socket (see udp_exrecv() and udp_shrecv()).
 *
 * disp->lock is taken on entry and released on every return path.  When the
 * response entry has to be looked up in the QID table, the QID lock is also
 * held from the lookup until the event has been queued or sent.  'ev_in' is
 * freed on every return path.
 *
 * General flow:
 *
 * If I/O result == CANCELED or error, free the buffer.
 *
 * If query, free the buffer, restart.
 *
 * If response:
 *	Allocate event, fill in details.
 *		If cannot allocate, free buffer, restart.
 *	find target.  If not found, free buffer, restart.
 *	if event queue is not empty, queue.  else, send.
 *	restart.
 */
static void
udp_recv(isc_event_t *ev_in, dns_dispatch_t *disp, dispsocket_t *dispsock) {
	isc_socketevent_t *ev = (isc_socketevent_t *)ev_in;
	dns_messageid_t id;
	isc_result_t dres;
	isc_buffer_t source;
	unsigned int flags;
	dns_dispentry_t *resp = NULL;
	dns_dispatchevent_t *rev;
	unsigned int bucket;
	isc_boolean_t killit;
	isc_boolean_t queue_response;
	dns_dispatchmgr_t *mgr;
	dns_qid_t *qid;
	isc_netaddr_t netaddr;
	int match;
	int result;
	/* Tracks whether qid->lock was taken, so 'unlock' releases it only then. */
	isc_boolean_t qidlocked = ISC_FALSE;

	LOCK(&disp->lock);

	mgr = disp->mgr;
	qid = mgr->qid;

	dispatch_log(disp, LVL(90),
		     "got packet: requests %d, buffers %d, recvs %d",
		     disp->requests, disp->mgr->buffers, disp->recv_pending);

	if (dispsock == NULL && ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
		/*
		 * Unless the receive event was imported from a listening
		 * interface, in which case the event type is
		 * DNS_EVENT_IMPORTRECVDONE, receive operation must be pending.
		 */
		INSIST(disp->recv_pending != 0);
		disp->recv_pending = 0;
	}

	if (dispsock != NULL &&
	    (ev->result == ISC_R_CANCELED || dispsock->resp == NULL)) {
		/*
		 * dispsock->resp can be NULL if this transaction was canceled
		 * just after receiving a response.  Since this socket is
		 * exclusively used and there should be at most one receive
		 * event, the canceled event should have had no effect.  So
		 * we can (and should) deactivate the socket right now.
		 */
		deactivate_dispsocket(disp, dispsock);
		/* From here on the packet is treated as having no dedicated socket. */
		dispsock = NULL;
	}

	if (disp->shutting_down) {
		/*
		 * This dispatcher is shutting down.
		 */
		free_buffer(disp, ev->region.base, ev->region.length);

		isc_event_free(&ev_in);
		ev = NULL;

		/* Decide under the lock; send the control event after unlocking. */
		killit = destroy_disp_ok(disp);
		UNLOCK(&disp->lock);
		if (killit)
			isc_task_send(disp->task[0], &disp->ctlevent);

		return;
	}

	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
		if (dispsock != NULL) {
			/*
			 * The dedicated socket identifies the response entry
			 * directly; 'id' is seeded from it so that the error
			 * path below has a value to report.
			 */
			resp = dispsock->resp;
			id = resp->id;
			if (ev->result != ISC_R_SUCCESS) {
				/*
				 * This is most likely a network error on a
				 * connected socket.  It makes no sense to
				 * check the address or parse the packet, but it
				 * will help to return the error to the caller.
				 */
				goto sendresponse;
			}
		} else {
			/*
			 * Exclusive dispatch but no (remaining) dedicated
			 * socket: drop the packet without restarting a receive.
			 */
			free_buffer(disp, ev->region.base, ev->region.length);

			UNLOCK(&disp->lock);
			isc_event_free(&ev_in);
			return;
		}
	} else if (ev->result != ISC_R_SUCCESS) {
		free_buffer(disp, ev->region.base, ev->region.length);

		if (ev->result != ISC_R_CANCELED)
			dispatch_log(disp, ISC_LOG_ERROR,
				     "odd socket result in udp_recv(): %s",
				     isc_result_totext(ev->result));

		UNLOCK(&disp->lock);
		isc_event_free(&ev_in);
		return;
	}

	/*
	 * If this is from a blackholed address, drop it.
	 */
	isc_netaddr_fromsockaddr(&netaddr, &ev->address);
	if (disp->mgr->blackhole != NULL &&
	    dns_acl_match(&netaddr, NULL, disp->mgr->blackhole,
			  NULL, &match, NULL) == ISC_R_SUCCESS &&
	    match > 0)
	{
		if (isc_log_wouldlog(dns_lctx, LVL(10))) {
			char netaddrstr[ISC_NETADDR_FORMATSIZE];
			isc_netaddr_format(&netaddr, netaddrstr,
					   sizeof(netaddrstr));
			dispatch_log(disp, LVL(10),
				     "blackholed packet from %s",
				     netaddrstr);
		}
		free_buffer(disp, ev->region.base, ev->region.length);
		goto restart;
	}

	/*
	 * Peek into the buffer to see what we can see.
	 */
	isc_buffer_init(&source, ev->region.base, ev->region.length);
	isc_buffer_add(&source, ev->n);
	dres = dns_message_peekheader(&source, &id, &flags);
	if (dres != ISC_R_SUCCESS) {
		free_buffer(disp, ev->region.base, ev->region.length);
		dispatch_log(disp, LVL(10), "got garbage packet");
		goto restart;
	}

	dispatch_log(disp, LVL(92),
		     "got valid DNS message header, /QR %c, id %u",
		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);

	/*
	 * Look at flags.  If query, drop it. If response,
	 * look to see where it goes.
	 */
	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
		/* query */
		free_buffer(disp, ev->region.base, ev->region.length);
		goto restart;
	}

	/*
	 * Search for the corresponding response.  If we are using an exclusive
	 * socket, we've already identified it and we can skip the search; but
	 * the ID and the address must match the expected ones.
	 */
	if (resp == NULL) {
		bucket = dns_hash(qid, &ev->address, id, disp->localport);
		LOCK(&qid->lock);
		qidlocked = ISC_TRUE;
		resp = entry_search(qid, &ev->address, id, disp->localport,
				    bucket);
		dispatch_log(disp, LVL(90),
			     "search for response in bucket %d: %s",
			     bucket, (resp == NULL ? "not found" : "found"));

		if (resp == NULL) {
			inc_stats(mgr, dns_resstatscounter_mismatch);
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
	} else if (resp->id != id || !isc_sockaddr_equal(&ev->address,
							 &resp->host)) {
		dispatch_log(disp, LVL(90),
			     "response to an exclusive socket doesn't match");
		inc_stats(mgr, dns_resstatscounter_mismatch);
		free_buffer(disp, ev->region.base, ev->region.length);
		goto unlock;
	}

	/*
	 * Now that we have the original dispatch the query was sent
	 * from check that the address and port the response was
	 * sent to make sense.
	 */
	if (disp != resp->disp) {
		isc_sockaddr_t a1;
		isc_sockaddr_t a2;

		/*
		 * Check that the socket types and ports match.
		 */
		if (disp->socktype != resp->disp->socktype ||
		    isc_sockaddr_getport(&disp->local) !=
		    isc_sockaddr_getport(&resp->disp->local)) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}

		/*
		 * If each dispatch is bound to a different address
		 * then fail.
		 *
		 * Note under Linux a packet can be sent out via IPv4 socket
		 * and the response be received via a IPv6 socket.
		 *
		 * Requests sent out via IPv6 should always come back in
		 * via IPv6.
		 */
		if (isc_sockaddr_pf(&resp->disp->local) == PF_INET6 &&
		    isc_sockaddr_pf(&disp->local) != PF_INET6) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
		/*
		 * Accept the packet if both dispatches share a local address,
		 * or if either one is bound to the wildcard address of its
		 * own protocol family.
		 */
		isc_sockaddr_anyofpf(&a1, isc_sockaddr_pf(&resp->disp->local));
		isc_sockaddr_anyofpf(&a2, isc_sockaddr_pf(&disp->local));
		if (!isc_sockaddr_eqaddr(&disp->local, &resp->disp->local) &&
		    !isc_sockaddr_eqaddr(&a1, &resp->disp->local) &&
		    !isc_sockaddr_eqaddr(&a2, &disp->local)) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
	}

  sendresponse:
	/*
	 * If an event is already out for this entry the new one is queued
	 * behind it rather than sent.
	 */
	queue_response = resp->item_out;
	rev = allocate_devent(resp->disp);
	if (rev == NULL) {
		free_buffer(disp, ev->region.base, ev->region.length);
		goto unlock;
	}

	/*
	 * At this point, rev contains the event we want to fill in, and
	 * resp contains the information on the place to send it to.
	 * Send the event off.  The receive buffer is handed over inside
	 * rev->buffer rather than freed here.
	 */
	isc_buffer_init(&rev->buffer, ev->region.base, ev->region.length);
	isc_buffer_add(&rev->buffer, ev->n);
	rev->result = ev->result;
	rev->id = id;
	rev->addr = ev->address;
	rev->pktinfo = ev->pktinfo;
	rev->attributes = ev->attributes;
	if (queue_response) {
		ISC_LIST_APPEND(resp->items, rev, ev_link);
	} else {
		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL,
			       DNS_EVENT_DISPATCH,
			       resp->action, resp->arg, resp, NULL, NULL);
		request_log(disp, resp, LVL(90),
			    "[a] Sent event %p buffer %p len %d to task %p",
			    rev, rev->buffer.base, rev->buffer.length,
			    resp->task);
		resp->item_out = ISC_TRUE;
		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
	}
 unlock:
	/* The QID lock is only held when the table search above was done. */
	if (qidlocked)
		UNLOCK(&qid->lock);

	/*
	 * Restart recv() to get the next packet.
	 */
 restart:
	result = startrecv(disp, dispsock);
	if (result != ISC_R_SUCCESS && dispsock != NULL) {
		/*
		 * XXX: weird. There seems to be no recovery process other than
		 * deactivating this socket anyway (since we cannot start
		 * receiving, we won't be able to receive a cancel event
		 * from the user).
		 */
		deactivate_dispsocket(disp, dispsock);
	}
	UNLOCK(&disp->lock);

	isc_event_free(&ev_in);
}
1517
/*
 * Task action run when a TCP message read for a dispatch completes.  The
 * event argument is the dispatch; the message and its result are read from
 * the dispatch's embedded tcpmsg.
 *
 * disp->lock is taken after the initial log call and released on every
 * return path; the QID lock is held from the response lookup until the
 * event has been queued or sent.  'ev_in' is freed on every return path.
 *
 * General flow:
 *
 * If I/O result == CANCELED, EOF, or error, notify everyone as the
 * various queues drain.
 *
 * If query, restart.
 *
 * If response:
 *	Allocate event, fill in details.
 *		If cannot allocate, restart.
 *	find target.  If not found, restart.
 *	if event queue is not empty, queue.  else, send.
 *	restart.
 */
static void
tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
	dns_dispatch_t *disp = ev_in->ev_arg;
	dns_tcpmsg_t *tcpmsg = &disp->tcpmsg;
	dns_messageid_t id;
	isc_result_t dres;
	unsigned int flags;
	dns_dispentry_t *resp;
	dns_dispatchevent_t *rev;
	unsigned int bucket;
	isc_boolean_t killit;
	isc_boolean_t queue_response;
	dns_qid_t *qid;
	int level;
	char buf[ISC_SOCKADDR_FORMATSIZE];

	UNUSED(task);

	REQUIRE(VALID_DISPATCH(disp));

	qid = disp->qid;

	/*
	 * NOTE(review): these counters are read before disp->lock is taken
	 * just below -- confirm that is acceptable for a debug log line.
	 */
	dispatch_log(disp, LVL(90),
		     "got TCP packet: requests %d, buffers %d, recvs %d",
		     disp->requests, disp->tcpbuffers, disp->recv_pending);

	LOCK(&disp->lock);

	INSIST(disp->recv_pending != 0);
	disp->recv_pending = 0;

	if (disp->refcount == 0) {
		/*
		 * This dispatcher is shutting down.  Force cancelation.
		 */
		tcpmsg->result = ISC_R_CANCELED;
	}

	if (tcpmsg->result != ISC_R_SUCCESS) {
		/*
		 * Any failure ends the dispatch.  Everything except a plain
		 * cancel is logged and followed by do_cancel(); only the log
		 * level differs between the cases.
		 */
		switch (tcpmsg->result) {
		case ISC_R_CANCELED:
			break;

		case ISC_R_EOF:
			dispatch_log(disp, LVL(90), "shutting down on EOF");
			do_cancel(disp);
			break;

		case ISC_R_CONNECTIONRESET:
			level = ISC_LOG_INFO;
			goto logit;

		default:
			level = ISC_LOG_ERROR;
		logit:
			isc_sockaddr_format(&tcpmsg->address, buf, sizeof(buf));
			dispatch_log(disp, level, "shutting down due to TCP "
				     "receive error: %s: %s", buf,
				     isc_result_totext(tcpmsg->result));
			do_cancel(disp);
			break;
		}

		/*
		 * The event is statically allocated in the tcpmsg
		 * structure, and destroy_disp() frees the tcpmsg, so we must
		 * free the event *before* calling destroy_disp().
		 */
		isc_event_free(&ev_in);

		disp->shutting_down = 1;
		disp->shutdown_why = tcpmsg->result;

		/*
		 * If the recv() was canceled pass the word on.
		 */
		killit = destroy_disp_ok(disp);
		UNLOCK(&disp->lock);
		if (killit)
			isc_task_send(disp->task[0], &disp->ctlevent);
		return;
	}

	dispatch_log(disp, LVL(90), "result %d, length == %d, addr = %p",
		     tcpmsg->result,
		     tcpmsg->buffer.length, tcpmsg->buffer.base);

	/*
	 * Peek into the buffer to see what we can see.
	 */
	dres = dns_message_peekheader(&tcpmsg->buffer, &id, &flags);
	if (dres != ISC_R_SUCCESS) {
		dispatch_log(disp, LVL(10), "got garbage packet");
		goto restart;
	}

	dispatch_log(disp, LVL(92),
		     "got valid DNS message header, /QR %c, id %u",
		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);

	/*
	 * Allocate an event to send to the query or response client, and
	 * allocate a new buffer for our use.
	 */

	/*
	 * Look at flags.  If query, drop it. If response,
	 * look to see where it goes.
	 */
	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
		/*
		 * Query.
		 */
		goto restart;
	}

	/*
	 * Response.
	 */
	bucket = dns_hash(qid, &tcpmsg->address, id, disp->localport);
	LOCK(&qid->lock);
	resp = entry_search(qid, &tcpmsg->address, id, disp->localport, bucket);
	dispatch_log(disp, LVL(90),
		     "search for response in bucket %d: %s",
		     bucket, (resp == NULL ? "not found" : "found"));

	if (resp == NULL)
		goto unlock;
	/*
	 * If an event is already out for this entry the new one is queued
	 * behind it rather than sent.
	 */
	queue_response = resp->item_out;
	rev = allocate_devent(disp);
	if (rev == NULL)
		goto unlock;

	/*
	 * At this point, rev contains the event we want to fill in, and
	 * resp contains the information on the place to send it to.
	 * Send the event off.
	 *
	 * NOTE(review): unlike udp_recv(), rev->pktinfo and rev->attributes
	 * are not assigned here -- confirm consumers do not read them for
	 * TCP responses.
	 */
	dns_tcpmsg_keepbuffer(tcpmsg, &rev->buffer);
	/* Balanced by the tcpbuffers decrement in free_buffer(). */
	disp->tcpbuffers++;
	rev->result = ISC_R_SUCCESS;
	rev->id = id;
	rev->addr = tcpmsg->address;
	if (queue_response) {
		ISC_LIST_APPEND(resp->items, rev, ev_link);
	} else {
		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL, DNS_EVENT_DISPATCH,
			       resp->action, resp->arg, resp, NULL, NULL);
		request_log(disp, resp, LVL(90),
			    "[b] Sent event %p buffer %p len %d to task %p",
			    rev, rev->buffer.base, rev->buffer.length,
			    resp->task);
		resp->item_out = ISC_TRUE;
		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
	}
 unlock:
	UNLOCK(&qid->lock);

	/*
	 * Restart recv() to get the next packet.
	 */
 restart:
	(void)startrecv(disp, NULL);

	UNLOCK(&disp->lock);

	isc_event_free(&ev_in);
}
1701
1702/*
1703 * disp must be locked.
1704 */
1705static isc_result_t
1706startrecv(dns_dispatch_t *disp, dispsocket_t *dispsock) {
1707	isc_result_t res;
1708	isc_region_t region;
1709	isc_socket_t *socket;
1710
1711	if (disp->shutting_down == 1)
1712		return (ISC_R_SUCCESS);
1713
1714	if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
1715		return (ISC_R_SUCCESS);
1716
1717	if (disp->recv_pending != 0 && dispsock == NULL)
1718		return (ISC_R_SUCCESS);
1719
1720	if (disp->mgr->buffers >= disp->mgr->maxbuffers)
1721		return (ISC_R_NOMEMORY);
1722
1723	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
1724	    dispsock == NULL)
1725		return (ISC_R_SUCCESS);
1726
1727	if (dispsock != NULL)
1728		socket = dispsock->socket;
1729	else
1730		socket = disp->socket;
1731	INSIST(socket != NULL);
1732
1733	switch (disp->socktype) {
1734		/*
1735		 * UDP reads are always maximal.
1736		 */
1737	case isc_sockettype_udp:
1738		region.length = disp->mgr->buffersize;
1739		region.base = allocate_udp_buffer(disp);
1740		if (region.base == NULL)
1741			return (ISC_R_NOMEMORY);
1742		if (dispsock != NULL) {
1743			isc_task_t *dt = dispsock->task;
1744			isc_socketevent_t *sev =
1745				allocate_sevent(disp, socket,
1746						ISC_SOCKEVENT_RECVDONE,
1747						udp_exrecv, dispsock);
1748			if (sev == NULL) {
1749				free_buffer(disp, region.base, region.length);
1750				return (ISC_R_NOMEMORY);
1751			}
1752
1753			res = isc_socket_recv2(socket, &region, 1, dt, sev, 0);
1754			if (res != ISC_R_SUCCESS) {
1755				free_buffer(disp, region.base, region.length);
1756				return (res);
1757			}
1758		} else {
1759			isc_task_t *dt = disp->task[0];
1760			isc_socketevent_t *sev =
1761				allocate_sevent(disp, socket,
1762						ISC_SOCKEVENT_RECVDONE,
1763						udp_shrecv, disp);
1764			if (sev == NULL) {
1765				free_buffer(disp, region.base, region.length);
1766				return (ISC_R_NOMEMORY);
1767			}
1768
1769			res = isc_socket_recv2(socket, &region, 1, dt, sev, 0);
1770			if (res != ISC_R_SUCCESS) {
1771				free_buffer(disp, region.base, region.length);
1772				disp->shutdown_why = res;
1773				disp->shutting_down = 1;
1774				do_cancel(disp);
1775				return (ISC_R_SUCCESS); /* recover by cancel */
1776			}
1777			INSIST(disp->recv_pending == 0);
1778			disp->recv_pending = 1;
1779		}
1780		break;
1781
1782	case isc_sockettype_tcp:
1783		res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task[0],
1784					     tcp_recv, disp);
1785		if (res != ISC_R_SUCCESS) {
1786			disp->shutdown_why = res;
1787			disp->shutting_down = 1;
1788			do_cancel(disp);
1789			return (ISC_R_SUCCESS); /* recover by cancel */
1790		}
1791		INSIST(disp->recv_pending == 0);
1792		disp->recv_pending = 1;
1793		break;
1794	default:
1795		INSIST(0);
1796		break;
1797	}
1798
1799	return (ISC_R_SUCCESS);
1800}
1801
1802/*
1803 * Mgr must be locked when calling this function.
1804 */
1805static isc_boolean_t
1806destroy_mgr_ok(dns_dispatchmgr_t *mgr) {
1807	mgr_log(mgr, LVL(90),
1808		"destroy_mgr_ok: shuttingdown=%d, listnonempty=%d, "
1809		"depool=%d, rpool=%d, dpool=%d",
1810		MGR_IS_SHUTTINGDOWN(mgr), !ISC_LIST_EMPTY(mgr->list),
1811		isc_mempool_getallocated(mgr->depool),
1812		isc_mempool_getallocated(mgr->rpool),
1813		isc_mempool_getallocated(mgr->dpool));
1814	if (!MGR_IS_SHUTTINGDOWN(mgr))
1815		return (ISC_FALSE);
1816	if (!ISC_LIST_EMPTY(mgr->list))
1817		return (ISC_FALSE);
1818	if (isc_mempool_getallocated(mgr->depool) != 0)
1819		return (ISC_FALSE);
1820	if (isc_mempool_getallocated(mgr->rpool) != 0)
1821		return (ISC_FALSE);
1822	if (isc_mempool_getallocated(mgr->dpool) != 0)
1823		return (ISC_FALSE);
1824
1825	return (ISC_TRUE);
1826}
1827
1828/*
1829 * Mgr must be unlocked when calling this function.
1830 */
1831static void
1832destroy_mgr(dns_dispatchmgr_t **mgrp) {
1833	isc_mem_t *mctx;
1834	dns_dispatchmgr_t *mgr;
1835
1836	mgr = *mgrp;
1837	*mgrp = NULL;
1838
1839	mctx = mgr->mctx;
1840
1841	mgr->magic = 0;
1842	mgr->mctx = NULL;
1843	DESTROYLOCK(&mgr->lock);
1844	mgr->state = 0;
1845
1846	DESTROYLOCK(&mgr->arc4_lock);
1847
1848	isc_mempool_destroy(&mgr->depool);
1849	isc_mempool_destroy(&mgr->rpool);
1850	isc_mempool_destroy(&mgr->dpool);
1851	if (mgr->bpool != NULL)
1852		isc_mempool_destroy(&mgr->bpool);
1853	if (mgr->spool != NULL)
1854		isc_mempool_destroy(&mgr->spool);
1855
1856	DESTROYLOCK(&mgr->spool_lock);
1857	DESTROYLOCK(&mgr->bpool_lock);
1858	DESTROYLOCK(&mgr->dpool_lock);
1859	DESTROYLOCK(&mgr->rpool_lock);
1860	DESTROYLOCK(&mgr->depool_lock);
1861
1862#ifdef BIND9
1863	if (mgr->entropy != NULL)
1864		isc_entropy_detach(&mgr->entropy);
1865#endif /* BIND9 */
1866	if (mgr->qid != NULL)
1867		qid_destroy(mctx, &mgr->qid);
1868
1869	DESTROYLOCK(&mgr->buffer_lock);
1870
1871	if (mgr->blackhole != NULL)
1872		dns_acl_detach(&mgr->blackhole);
1873
1874	if (mgr->stats != NULL)
1875		isc_stats_detach(&mgr->stats);
1876
1877	if (mgr->v4ports != NULL) {
1878		isc_mem_put(mctx, mgr->v4ports,
1879			    mgr->nv4ports * sizeof(in_port_t));
1880	}
1881	if (mgr->v6ports != NULL) {
1882		isc_mem_put(mctx, mgr->v6ports,
1883			    mgr->nv6ports * sizeof(in_port_t));
1884	}
1885	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
1886	isc_mem_detach(&mctx);
1887}
1888
1889static isc_result_t
1890open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
1891	    unsigned int options, isc_socket_t **sockp,
1892	    isc_socket_t *dup_socket)
1893{
1894	isc_socket_t *sock;
1895	isc_result_t result;
1896
1897	sock = *sockp;
1898	if (sock != NULL) {
1899#ifdef BIND9
1900		result = isc_socket_open(sock);
1901		if (result != ISC_R_SUCCESS)
1902			return (result);
1903#else
1904		INSIST(0);
1905#endif
1906	} else if (dup_socket != NULL) {
1907		result = isc_socket_dup(dup_socket, &sock);
1908		if (result != ISC_R_SUCCESS)
1909			return (result);
1910
1911		isc_socket_setname(sock, "dispatcher", NULL);
1912		*sockp = sock;
1913		return (ISC_R_SUCCESS);
1914	} else {
1915		result = isc_socket_create(mgr, isc_sockaddr_pf(local),
1916					isc_sockettype_udp, &sock);
1917		if (result != ISC_R_SUCCESS)
1918			return (result);
1919	}
1920
1921	isc_socket_setname(sock, "dispatcher", NULL);
1922
1923#ifndef ISC_ALLOW_MAPPED
1924	isc_socket_ipv6only(sock, ISC_TRUE);
1925#endif
1926	result = isc_socket_bind(sock, local, options);
1927	if (result != ISC_R_SUCCESS) {
1928		if (*sockp == NULL)
1929			isc_socket_detach(&sock);
1930		else {
1931#ifdef BIND9
1932			isc_socket_close(sock);
1933#else
1934			INSIST(0);
1935#endif
1936		}
1937		return (result);
1938	}
1939
1940	*sockp = sock;
1941	return (ISC_R_SUCCESS);
1942}
1943
1944/*%
1945 * Create a temporary port list to set the initial default set of dispatch
1946 * ports: [1024, 65535].  This is almost meaningless as the application will
1947 * normally set the ports explicitly, but is provided to fill some minor corner
1948 * cases.
1949 */
1950static isc_result_t
1951create_default_portset(isc_mem_t *mctx, isc_portset_t **portsetp) {
1952	isc_result_t result;
1953
1954	result = isc_portset_create(mctx, portsetp);
1955	if (result != ISC_R_SUCCESS)
1956		return (result);
1957	isc_portset_addrange(*portsetp, 1024, 65535);
1958
1959	return (ISC_R_SUCCESS);
1960}
1961
1962/*
1963 * Publics.
1964 */
1965
/*
 * Create a dispatch manager in memory context 'mctx' and return it through
 * '*mgrp' (which must be NULL on entry).
 *
 * The manager gets its locks, its dispatch-event, response-entry and
 * dispatch memory pools, and a default available-port list of [1024, 65535]
 * for both IPv4 and IPv6.  'entropy' is attached only in BIND9 builds and
 * only when non-NULL.
 *
 * Returns ISC_R_SUCCESS, ISC_R_NOMEMORY when the manager or one of its pools
 * cannot be allocated, or the failing result of a mutex initialization or of
 * the default port setup.  On failure everything created so far is undone
 * and '*mgrp' is left untouched.
 */
isc_result_t
dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy,
		       dns_dispatchmgr_t **mgrp)
{
	dns_dispatchmgr_t *mgr;
	isc_result_t result;
	isc_portset_t *v4portset = NULL;
	isc_portset_t *v6portset = NULL;

	REQUIRE(mctx != NULL);
	REQUIRE(mgrp != NULL && *mgrp == NULL);

	mgr = isc_mem_get(mctx, sizeof(dns_dispatchmgr_t));
	if (mgr == NULL)
		return (ISC_R_NOMEMORY);

	mgr->mctx = NULL;
	isc_mem_attach(mctx, &mgr->mctx);

	mgr->blackhole = NULL;
	mgr->stats = NULL;

	/*
	 * Initialize the locks one by one.  Each failure jumps to the label
	 * that destroys only the locks initialized before it, so the order
	 * here must mirror (in reverse) the cleanup ladder at the bottom.
	 */
	result = isc_mutex_init(&mgr->lock);
	if (result != ISC_R_SUCCESS)
		goto deallocate;

	result = isc_mutex_init(&mgr->arc4_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_lock;

	result = isc_mutex_init(&mgr->buffer_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_arc4_lock;

	result = isc_mutex_init(&mgr->depool_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_buffer_lock;

	result = isc_mutex_init(&mgr->rpool_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_depool_lock;

	result = isc_mutex_init(&mgr->dpool_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_rpool_lock;

	result = isc_mutex_init(&mgr->bpool_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_dpool_lock;

	result = isc_mutex_init(&mgr->spool_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_bpool_lock;

	/*
	 * Create the three pools that exist for the manager's whole life.
	 * Any pool creation failure is reported as ISC_R_NOMEMORY.
	 */
	mgr->depool = NULL;
	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatchevent_t),
			       &mgr->depool) != ISC_R_SUCCESS) {
		result = ISC_R_NOMEMORY;
		goto kill_spool_lock;
	}

	mgr->rpool = NULL;
	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispentry_t),
			       &mgr->rpool) != ISC_R_SUCCESS) {
		result = ISC_R_NOMEMORY;
		goto kill_depool;
	}

	mgr->dpool = NULL;
	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatch_t),
			       &mgr->dpool) != ISC_R_SUCCESS) {
		result = ISC_R_NOMEMORY;
		goto kill_rpool;
	}

	/* All three pools share the same limits and get their own lock. */
	isc_mempool_setname(mgr->depool, "dispmgr_depool");
	isc_mempool_setmaxalloc(mgr->depool, 32768);
	isc_mempool_setfreemax(mgr->depool, 32768);
	isc_mempool_associatelock(mgr->depool, &mgr->depool_lock);
	isc_mempool_setfillcount(mgr->depool, 256);

	isc_mempool_setname(mgr->rpool, "dispmgr_rpool");
	isc_mempool_setmaxalloc(mgr->rpool, 32768);
	isc_mempool_setfreemax(mgr->rpool, 32768);
	isc_mempool_associatelock(mgr->rpool, &mgr->rpool_lock);
	isc_mempool_setfillcount(mgr->rpool, 256);

	isc_mempool_setname(mgr->dpool, "dispmgr_dpool");
	isc_mempool_setmaxalloc(mgr->dpool, 32768);
	isc_mempool_setfreemax(mgr->dpool, 32768);
	isc_mempool_associatelock(mgr->dpool, &mgr->dpool_lock);
	isc_mempool_setfillcount(mgr->dpool, 256);

	/* The buffer and socket pools are not created here. */
	mgr->buffers = 0;
	mgr->buffersize = 0;
	mgr->maxbuffers = 0;
	mgr->bpool = NULL;
	mgr->spool = NULL;
	mgr->entropy = NULL;
	mgr->qid = NULL;
	mgr->state = 0;
	ISC_LIST_INIT(mgr->list);
	mgr->v4ports = NULL;
	mgr->v6ports = NULL;
	mgr->nv4ports = 0;
	mgr->nv6ports = 0;
	/*
	 * The magic number must be in place before
	 * dns_dispatchmgr_setavailports() is called below, because that
	 * function requires a valid manager.
	 */
	mgr->magic = DNS_DISPATCHMGR_MAGIC;

	/*
	 * Install the default port lists.  The temporary port sets are
	 * destroyed whether or not this succeeds.
	 */
	result = create_default_portset(mctx, &v4portset);
	if (result == ISC_R_SUCCESS) {
		result = create_default_portset(mctx, &v6portset);
		if (result == ISC_R_SUCCESS) {
			result = dns_dispatchmgr_setavailports(mgr,
							       v4portset,
							       v6portset);
		}
	}
	if (v4portset != NULL)
		isc_portset_destroy(mctx, &v4portset);
	if (v6portset != NULL)
		isc_portset_destroy(mctx, &v6portset);
	if (result != ISC_R_SUCCESS)
		goto kill_dpool;

#ifdef BIND9
	if (entropy != NULL)
		isc_entropy_attach(entropy, &mgr->entropy);
#else
	UNUSED(entropy);
#endif

	dispatch_initrandom(&mgr->arc4ctx, mgr->entropy, &mgr->arc4_lock);

	*mgrp = mgr;
	return (ISC_R_SUCCESS);

	/*
	 * Cleanup ladder: each label undoes one step and falls through to
	 * the labels for everything that was set up before it.
	 */
 kill_dpool:
	isc_mempool_destroy(&mgr->dpool);
 kill_rpool:
	isc_mempool_destroy(&mgr->rpool);
 kill_depool:
	isc_mempool_destroy(&mgr->depool);
 kill_spool_lock:
	DESTROYLOCK(&mgr->spool_lock);
 kill_bpool_lock:
	DESTROYLOCK(&mgr->bpool_lock);
 kill_dpool_lock:
	DESTROYLOCK(&mgr->dpool_lock);
 kill_rpool_lock:
	DESTROYLOCK(&mgr->rpool_lock);
 kill_depool_lock:
	DESTROYLOCK(&mgr->depool_lock);
 kill_buffer_lock:
	DESTROYLOCK(&mgr->buffer_lock);
 kill_arc4_lock:
	DESTROYLOCK(&mgr->arc4_lock);
 kill_lock:
	DESTROYLOCK(&mgr->lock);
 deallocate:
	/*
	 * Free the manager, then detach through the local 'mctx' copy; this
	 * balances the isc_mem_attach() into mgr->mctx made above.
	 */
	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
	isc_mem_detach(&mctx);

	return (result);
}
2130
2131void
2132dns_dispatchmgr_setblackhole(dns_dispatchmgr_t *mgr, dns_acl_t *blackhole) {
2133	REQUIRE(VALID_DISPATCHMGR(mgr));
2134	if (mgr->blackhole != NULL)
2135		dns_acl_detach(&mgr->blackhole);
2136	dns_acl_attach(blackhole, &mgr->blackhole);
2137}
2138
2139dns_acl_t *
2140dns_dispatchmgr_getblackhole(dns_dispatchmgr_t *mgr) {
2141	REQUIRE(VALID_DISPATCHMGR(mgr));
2142	return (mgr->blackhole);
2143}
2144
2145void
2146dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr,
2147				 dns_portlist_t *portlist)
2148{
2149	REQUIRE(VALID_DISPATCHMGR(mgr));
2150	UNUSED(portlist);
2151
2152	/* This function is deprecated: use dns_dispatchmgr_setavailports(). */
2153	return;
2154}
2155
2156dns_portlist_t *
2157dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr) {
2158	REQUIRE(VALID_DISPATCHMGR(mgr));
2159	return (NULL);		/* this function is deprecated */
2160}
2161
2162isc_result_t
2163dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
2164			      isc_portset_t *v6portset)
2165{
2166	in_port_t *v4ports, *v6ports, p;
2167	unsigned int nv4ports, nv6ports, i4, i6;
2168
2169	REQUIRE(VALID_DISPATCHMGR(mgr));
2170
2171	nv4ports = isc_portset_nports(v4portset);
2172	nv6ports = isc_portset_nports(v6portset);
2173
2174	v4ports = NULL;
2175	if (nv4ports != 0) {
2176		v4ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv4ports);
2177		if (v4ports == NULL)
2178			return (ISC_R_NOMEMORY);
2179	}
2180	v6ports = NULL;
2181	if (nv6ports != 0) {
2182		v6ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv6ports);
2183		if (v6ports == NULL) {
2184			if (v4ports != NULL) {
2185				isc_mem_put(mgr->mctx, v4ports,
2186					    sizeof(in_port_t) *
2187					    isc_portset_nports(v4portset));
2188			}
2189			return (ISC_R_NOMEMORY);
2190		}
2191	}
2192
2193	p = 0;
2194	i4 = 0;
2195	i6 = 0;
2196	do {
2197		if (isc_portset_isset(v4portset, p)) {
2198			INSIST(i4 < nv4ports);
2199			v4ports[i4++] = p;
2200		}
2201		if (isc_portset_isset(v6portset, p)) {
2202			INSIST(i6 < nv6ports);
2203			v6ports[i6++] = p;
2204		}
2205	} while (p++ < 65535);
2206	INSIST(i4 == nv4ports && i6 == nv6ports);
2207
2208	PORTBUFLOCK(mgr);
2209	if (mgr->v4ports != NULL) {
2210		isc_mem_put(mgr->mctx, mgr->v4ports,
2211			    mgr->nv4ports * sizeof(in_port_t));
2212	}
2213	mgr->v4ports = v4ports;
2214	mgr->nv4ports = nv4ports;
2215
2216	if (mgr->v6ports != NULL) {
2217		isc_mem_put(mgr->mctx, mgr->v6ports,
2218			    mgr->nv6ports * sizeof(in_port_t));
2219	}
2220	mgr->v6ports = v6ports;
2221	mgr->nv6ports = nv6ports;
2222	PORTBUFUNLOCK(mgr);
2223
2224	return (ISC_R_SUCCESS);
2225}
2226
2227static isc_result_t
2228dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
2229		       unsigned int buffersize, unsigned int maxbuffers,
2230		       unsigned int maxrequests, unsigned int buckets,
2231		       unsigned int increment)
2232{
2233	isc_result_t result;
2234
2235	REQUIRE(VALID_DISPATCHMGR(mgr));
2236	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2237	REQUIRE(maxbuffers > 0);
2238	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2239	REQUIRE(increment > buckets);
2240
2241	/*
2242	 * Keep some number of items around.  This should be a config
2243	 * option.  For now, keep 8, but later keep at least two even
2244	 * if the caller wants less.  This allows us to ensure certain
2245	 * things, like an event can be "freed" and the next allocation
2246	 * will always succeed.
2247	 *
2248	 * Note that if limits are placed on anything here, we use one
2249	 * event internally, so the actual limit should be "wanted + 1."
2250	 *
2251	 * XXXMLG
2252	 */
2253
2254	if (maxbuffers < 8)
2255		maxbuffers = 8;
2256
2257	LOCK(&mgr->buffer_lock);
2258
2259	/* Create or adjust buffer pool */
2260	if (mgr->bpool != NULL) {
2261		/*
2262		 * We only increase the maxbuffers to avoid accidental buffer
2263		 * shortage.  Ideally we'd separate the manager-wide maximum
2264		 * from per-dispatch limits and respect the latter within the
2265		 * global limit.  But at this moment that's deemed to be
2266		 * overkilling and isn't worth additional implementation
2267		 * complexity.
2268		 */
2269		if (maxbuffers > mgr->maxbuffers) {
2270			isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2271			isc_mempool_setfreemax(mgr->bpool, maxbuffers);
2272			mgr->maxbuffers = maxbuffers;
2273		}
2274	} else {
2275		result = isc_mempool_create(mgr->mctx, buffersize, &mgr->bpool);
2276		if (result != ISC_R_SUCCESS) {
2277			UNLOCK(&mgr->buffer_lock);
2278			return (result);
2279		}
2280		isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
2281		isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2282		isc_mempool_setfreemax(mgr->bpool, maxbuffers);
2283		isc_mempool_associatelock(mgr->bpool, &mgr->bpool_lock);
2284		isc_mempool_setfillcount(mgr->bpool, 256);
2285	}
2286
2287	/* Create or adjust socket pool */
2288	if (mgr->spool != NULL) {
2289		if (maxrequests < DNS_DISPATCH_POOLSOCKS * 2)
2290		  isc_mempool_setmaxalloc(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2);
2291		  isc_mempool_setfreemax(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2);
2292		UNLOCK(&mgr->buffer_lock);
2293		return (ISC_R_SUCCESS);
2294	}
2295	result = isc_mempool_create(mgr->mctx, sizeof(dispsocket_t),
2296				    &mgr->spool);
2297	if (result != ISC_R_SUCCESS) {
2298		UNLOCK(&mgr->buffer_lock);
2299		goto cleanup;
2300	}
2301	isc_mempool_setname(mgr->spool, "dispmgr_spool");
2302	isc_mempool_setmaxalloc(mgr->spool, maxrequests);
2303	isc_mempool_setfreemax(mgr->spool, maxrequests);
2304	isc_mempool_associatelock(mgr->spool, &mgr->spool_lock);
2305	isc_mempool_setfillcount(mgr->spool, 256);
2306
2307	result = qid_allocate(mgr, buckets, increment, &mgr->qid, ISC_TRUE);
2308	if (result != ISC_R_SUCCESS)
2309		goto cleanup;
2310
2311	mgr->buffersize = buffersize;
2312	mgr->maxbuffers = maxbuffers;
2313	UNLOCK(&mgr->buffer_lock);
2314	return (ISC_R_SUCCESS);
2315
2316 cleanup:
2317	isc_mempool_destroy(&mgr->bpool);
2318	if (mgr->spool != NULL)
2319		isc_mempool_destroy(&mgr->spool);
2320	UNLOCK(&mgr->buffer_lock);
2321	return (result);
2322}
2323
2324void
2325dns_dispatchmgr_destroy(dns_dispatchmgr_t **mgrp) {
2326	dns_dispatchmgr_t *mgr;
2327	isc_boolean_t killit;
2328
2329	REQUIRE(mgrp != NULL);
2330	REQUIRE(VALID_DISPATCHMGR(*mgrp));
2331
2332	mgr = *mgrp;
2333	*mgrp = NULL;
2334
2335	LOCK(&mgr->lock);
2336	mgr->state |= MGR_SHUTTINGDOWN;
2337
2338	killit = destroy_mgr_ok(mgr);
2339	UNLOCK(&mgr->lock);
2340
2341	mgr_log(mgr, LVL(90), "destroy: killit=%d", killit);
2342
2343	if (killit)
2344		destroy_mgr(&mgr);
2345}
2346
2347void
2348dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, isc_stats_t *stats) {
2349	REQUIRE(VALID_DISPATCHMGR(mgr));
2350	REQUIRE(ISC_LIST_EMPTY(mgr->list));
2351	REQUIRE(mgr->stats == NULL);
2352
2353	isc_stats_attach(stats, &mgr->stats);
2354}
2355
/*
 * bsearch() comparison callback for arrays of in_port_t.
 *
 * Returns -1, 0 or 1 when the port pointed to by 'key' is respectively
 * smaller than, equal to, or greater than the one pointed to by 'ent'.
 */
static int
port_cmp(const void *key, const void *ent) {
	const in_port_t wanted = *(const in_port_t *)key;
	const in_port_t entry = *(const in_port_t *)ent;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return ((wanted > entry) - (wanted < entry));
}
2368
2369static isc_boolean_t
2370portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2371	      isc_sockaddr_t *sockaddrp)
2372{
2373	isc_sockaddr_t sockaddr;
2374	isc_result_t result;
2375	in_port_t *ports, port;
2376	unsigned int nports;
2377	isc_boolean_t available = ISC_FALSE;
2378
2379	REQUIRE(sock != NULL || sockaddrp != NULL);
2380
2381	PORTBUFLOCK(mgr);
2382	if (sock != NULL) {
2383		sockaddrp = &sockaddr;
2384		result = isc_socket_getsockname(sock, sockaddrp);
2385		if (result != ISC_R_SUCCESS)
2386			goto unlock;
2387	}
2388
2389	if (isc_sockaddr_pf(sockaddrp) == AF_INET) {
2390		ports = mgr->v4ports;
2391		nports = mgr->nv4ports;
2392	} else {
2393		ports = mgr->v6ports;
2394		nports = mgr->nv6ports;
2395	}
2396	if (ports == NULL)
2397		goto unlock;
2398
2399	port = isc_sockaddr_getport(sockaddrp);
2400	if (bsearch(&port, ports, nports, sizeof(in_port_t), port_cmp) != NULL)
2401		available = ISC_TRUE;
2402
2403unlock:
2404	PORTBUFUNLOCK(mgr);
2405	return (available);
2406}
2407
/* True when 'lhs' and 'rhs' agree on every bit that is set in 'bits'. */
#define ATTRMATCH(lhs, rhs, bits) (((lhs) & (bits)) == ((rhs) & (bits)))
2409
2410static isc_boolean_t
2411local_addr_match(dns_dispatch_t *disp, isc_sockaddr_t *addr) {
2412	isc_sockaddr_t sockaddr;
2413	isc_result_t result;
2414
2415	REQUIRE(disp->socket != NULL);
2416
2417	if (addr == NULL)
2418		return (ISC_TRUE);
2419
2420	/*
2421	 * Don't match wildcard ports unless the port is available in the
2422	 * current configuration.
2423	 */
2424	if (isc_sockaddr_getport(addr) == 0 &&
2425	    isc_sockaddr_getport(&disp->local) == 0 &&
2426	    !portavailable(disp->mgr, disp->socket, NULL)) {
2427		return (ISC_FALSE);
2428	}
2429
2430	/*
2431	 * Check if we match the binding <address,port>.
2432	 * Wildcard ports match/fail here.
2433	 */
2434	if (isc_sockaddr_equal(&disp->local, addr))
2435		return (ISC_TRUE);
2436	if (isc_sockaddr_getport(addr) == 0)
2437		return (ISC_FALSE);
2438
2439	/*
2440	 * Check if we match a bound wildcard port <address,port>.
2441	 */
2442	if (!isc_sockaddr_eqaddr(&disp->local, addr))
2443		return (ISC_FALSE);
2444	result = isc_socket_getsockname(disp->socket, &sockaddr);
2445	if (result != ISC_R_SUCCESS)
2446		return (ISC_FALSE);
2447
2448	return (isc_sockaddr_equal(&sockaddr, addr));
2449}
2450
2451/*
2452 * Requires mgr be locked.
2453 *
2454 * No dispatcher can be locked by this thread when calling this function.
2455 *
2456 *
2457 * NOTE:
2458 *	If a matching dispatcher is found, it is locked after this function
2459 *	returns, and must be unlocked by the caller.
2460 */
2461static isc_result_t
2462dispatch_find(dns_dispatchmgr_t *mgr, isc_sockaddr_t *local,
2463	      unsigned int attributes, unsigned int mask,
2464	      dns_dispatch_t **dispp)
2465{
2466	dns_dispatch_t *disp;
2467	isc_result_t result;
2468
2469	/*
2470	 * Make certain that we will not match a private or exclusive dispatch.
2471	 */
2472	attributes &= ~(DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2473	mask |= (DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2474
2475	disp = ISC_LIST_HEAD(mgr->list);
2476	while (disp != NULL) {
2477		LOCK(&disp->lock);
2478		if ((disp->shutting_down == 0)
2479		    && ATTRMATCH(disp->attributes, attributes, mask)
2480		    && local_addr_match(disp, local))
2481			break;
2482		UNLOCK(&disp->lock);
2483		disp = ISC_LIST_NEXT(disp, link);
2484	}
2485
2486	if (disp == NULL) {
2487		result = ISC_R_NOTFOUND;
2488		goto out;
2489	}
2490
2491	*dispp = disp;
2492	result = ISC_R_SUCCESS;
2493 out:
2494
2495	return (result);
2496}
2497
2498static isc_result_t
2499qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
2500	     unsigned int increment, dns_qid_t **qidp,
2501	     isc_boolean_t needsocktable)
2502{
2503	dns_qid_t *qid;
2504	unsigned int i;
2505	isc_result_t result;
2506
2507	REQUIRE(VALID_DISPATCHMGR(mgr));
2508	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2509	REQUIRE(increment > buckets);
2510	REQUIRE(qidp != NULL && *qidp == NULL);
2511
2512	qid = isc_mem_get(mgr->mctx, sizeof(*qid));
2513	if (qid == NULL)
2514		return (ISC_R_NOMEMORY);
2515
2516	qid->qid_table = isc_mem_get(mgr->mctx,
2517				     buckets * sizeof(dns_displist_t));
2518	if (qid->qid_table == NULL) {
2519		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2520		return (ISC_R_NOMEMORY);
2521	}
2522
2523	qid->sock_table = NULL;
2524	if (needsocktable) {
2525		qid->sock_table = isc_mem_get(mgr->mctx, buckets *
2526					      sizeof(dispsocketlist_t));
2527		if (qid->sock_table == NULL) {
2528			isc_mem_put(mgr->mctx, qid->qid_table,
2529				    buckets * sizeof(dns_displist_t));
2530			isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2531			return (ISC_R_NOMEMORY);
2532		}
2533	}
2534
2535	result = isc_mutex_init(&qid->lock);
2536	if (result != ISC_R_SUCCESS) {
2537		if (qid->sock_table != NULL) {
2538			isc_mem_put(mgr->mctx, qid->sock_table,
2539				    buckets * sizeof(dispsocketlist_t));
2540		}
2541		isc_mem_put(mgr->mctx, qid->qid_table,
2542			    buckets * sizeof(dns_displist_t));
2543		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2544		return (result);
2545	}
2546
2547	for (i = 0; i < buckets; i++) {
2548		ISC_LIST_INIT(qid->qid_table[i]);
2549		if (qid->sock_table != NULL)
2550			ISC_LIST_INIT(qid->sock_table[i]);
2551	}
2552
2553	qid->qid_nbuckets = buckets;
2554	qid->qid_increment = increment;
2555	qid->magic = QID_MAGIC;
2556	*qidp = qid;
2557	return (ISC_R_SUCCESS);
2558}
2559
2560static void
2561qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp) {
2562	dns_qid_t *qid;
2563
2564	REQUIRE(qidp != NULL);
2565	qid = *qidp;
2566
2567	REQUIRE(VALID_QID(qid));
2568
2569	*qidp = NULL;
2570	qid->magic = 0;
2571	isc_mem_put(mctx, qid->qid_table,
2572		    qid->qid_nbuckets * sizeof(dns_displist_t));
2573	if (qid->sock_table != NULL) {
2574		isc_mem_put(mctx, qid->sock_table,
2575			    qid->qid_nbuckets * sizeof(dispsocketlist_t));
2576	}
2577	DESTROYLOCK(&qid->lock);
2578	isc_mem_put(mctx, qid, sizeof(*qid));
2579}
2580
2581/*
2582 * Allocate and set important limits.
2583 */
2584static isc_result_t
2585dispatch_allocate(dns_dispatchmgr_t *mgr, unsigned int maxrequests,
2586		  dns_dispatch_t **dispp)
2587{
2588	dns_dispatch_t *disp;
2589	isc_result_t result;
2590
2591	REQUIRE(VALID_DISPATCHMGR(mgr));
2592	REQUIRE(dispp != NULL && *dispp == NULL);
2593
2594	/*
2595	 * Set up the dispatcher, mostly.  Don't bother setting some of
2596	 * the options that are controlled by tcp vs. udp, etc.
2597	 */
2598
2599	disp = isc_mempool_get(mgr->dpool);
2600	if (disp == NULL)
2601		return (ISC_R_NOMEMORY);
2602
2603	disp->magic = 0;
2604	disp->mgr = mgr;
2605	disp->maxrequests = maxrequests;
2606	disp->attributes = 0;
2607	ISC_LINK_INIT(disp, link);
2608	disp->refcount = 1;
2609	disp->recv_pending = 0;
2610	memset(&disp->local, 0, sizeof(disp->local));
2611	disp->localport = 0;
2612	disp->shutting_down = 0;
2613	disp->shutdown_out = 0;
2614	disp->connected = 0;
2615	disp->tcpmsg_valid = 0;
2616	disp->shutdown_why = ISC_R_UNEXPECTED;
2617	disp->requests = 0;
2618	disp->tcpbuffers = 0;
2619	disp->qid = NULL;
2620	ISC_LIST_INIT(disp->activesockets);
2621	ISC_LIST_INIT(disp->inactivesockets);
2622	disp->nsockets = 0;
2623	dispatch_initrandom(&disp->arc4ctx, mgr->entropy, NULL);
2624	disp->port_table = NULL;
2625	disp->portpool = NULL;
2626
2627	result = isc_mutex_init(&disp->lock);
2628	if (result != ISC_R_SUCCESS)
2629		goto deallocate;
2630
2631	disp->failsafe_ev = allocate_devent(disp);
2632	if (disp->failsafe_ev == NULL) {
2633		result = ISC_R_NOMEMORY;
2634		goto kill_lock;
2635	}
2636
2637	disp->magic = DISPATCH_MAGIC;
2638
2639	*dispp = disp;
2640	return (ISC_R_SUCCESS);
2641
2642	/*
2643	 * error returns
2644	 */
2645 kill_lock:
2646	DESTROYLOCK(&disp->lock);
2647 deallocate:
2648	isc_mempool_put(mgr->dpool, disp);
2649
2650	return (result);
2651}
2652
2653
2654/*
2655 * MUST be unlocked, and not used by anything.
2656 */
2657static void
2658dispatch_free(dns_dispatch_t **dispp) {
2659	dns_dispatch_t *disp;
2660	dns_dispatchmgr_t *mgr;
2661	int i;
2662
2663	REQUIRE(VALID_DISPATCH(*dispp));
2664	disp = *dispp;
2665	*dispp = NULL;
2666
2667	mgr = disp->mgr;
2668	REQUIRE(VALID_DISPATCHMGR(mgr));
2669
2670	if (disp->tcpmsg_valid) {
2671		dns_tcpmsg_invalidate(&disp->tcpmsg);
2672		disp->tcpmsg_valid = 0;
2673	}
2674
2675	INSIST(disp->tcpbuffers == 0);
2676	INSIST(disp->requests == 0);
2677	INSIST(disp->recv_pending == 0);
2678	INSIST(ISC_LIST_EMPTY(disp->activesockets));
2679	INSIST(ISC_LIST_EMPTY(disp->inactivesockets));
2680
2681	isc_mempool_put(mgr->depool, disp->failsafe_ev);
2682	disp->failsafe_ev = NULL;
2683
2684	if (disp->qid != NULL)
2685		qid_destroy(mgr->mctx, &disp->qid);
2686
2687	if (disp->port_table != NULL) {
2688		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
2689			INSIST(ISC_LIST_EMPTY(disp->port_table[i]));
2690		isc_mem_put(mgr->mctx, disp->port_table,
2691			    sizeof(disp->port_table[0]) *
2692			    DNS_DISPATCH_PORTTABLESIZE);
2693	}
2694
2695	if (disp->portpool != NULL)
2696		isc_mempool_destroy(&disp->portpool);
2697
2698	disp->mgr = NULL;
2699	DESTROYLOCK(&disp->lock);
2700	disp->magic = 0;
2701	isc_mempool_put(mgr->dpool, disp);
2702}
2703
2704isc_result_t
2705dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2706		       isc_taskmgr_t *taskmgr, unsigned int buffersize,
2707		       unsigned int maxbuffers, unsigned int maxrequests,
2708		       unsigned int buckets, unsigned int increment,
2709		       unsigned int attributes, dns_dispatch_t **dispp)
2710{
2711	isc_result_t result;
2712	dns_dispatch_t *disp;
2713
2714	UNUSED(maxbuffers);
2715	UNUSED(buffersize);
2716
2717	REQUIRE(VALID_DISPATCHMGR(mgr));
2718	REQUIRE(isc_socket_gettype(sock) == isc_sockettype_tcp);
2719	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) != 0);
2720	REQUIRE((attributes & DNS_DISPATCHATTR_UDP) == 0);
2721
2722	attributes |= DNS_DISPATCHATTR_PRIVATE;  /* XXXMLG */
2723
2724	LOCK(&mgr->lock);
2725
2726	/*
2727	 * dispatch_allocate() checks mgr for us.
2728	 * qid_allocate() checks buckets and increment for us.
2729	 */
2730	disp = NULL;
2731	result = dispatch_allocate(mgr, maxrequests, &disp);
2732	if (result != ISC_R_SUCCESS) {
2733		UNLOCK(&mgr->lock);
2734		return (result);
2735	}
2736
2737	result = qid_allocate(mgr, buckets, increment, &disp->qid, ISC_FALSE);
2738	if (result != ISC_R_SUCCESS)
2739		goto deallocate_dispatch;
2740
2741	disp->socktype = isc_sockettype_tcp;
2742	disp->socket = NULL;
2743	isc_socket_attach(sock, &disp->socket);
2744
2745	disp->sepool = NULL;
2746
2747	disp->ntasks = 1;
2748	disp->task[0] = NULL;
2749	result = isc_task_create(taskmgr, 0, &disp->task[0]);
2750	if (result != ISC_R_SUCCESS)
2751		goto kill_socket;
2752
2753	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
2754					    DNS_EVENT_DISPATCHCONTROL,
2755					    destroy_disp, disp,
2756					    sizeof(isc_event_t));
2757	if (disp->ctlevent == NULL) {
2758		result = ISC_R_NOMEMORY;
2759		goto kill_task;
2760	}
2761
2762	isc_task_setname(disp->task[0], "tcpdispatch", disp);
2763
2764	dns_tcpmsg_init(mgr->mctx, disp->socket, &disp->tcpmsg);
2765	disp->tcpmsg_valid = 1;
2766
2767	disp->attributes = attributes;
2768
2769	/*
2770	 * Append it to the dispatcher list.
2771	 */
2772	ISC_LIST_APPEND(mgr->list, disp, link);
2773	UNLOCK(&mgr->lock);
2774
2775	mgr_log(mgr, LVL(90), "created TCP dispatcher %p", disp);
2776	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]);
2777
2778	*dispp = disp;
2779
2780	return (ISC_R_SUCCESS);
2781
2782	/*
2783	 * Error returns.
2784	 */
2785 kill_task:
2786	isc_task_detach(&disp->task[0]);
2787 kill_socket:
2788	isc_socket_detach(&disp->socket);
2789 deallocate_dispatch:
2790	dispatch_free(&disp);
2791
2792	UNLOCK(&mgr->lock);
2793
2794	return (result);
2795}
2796
2797isc_result_t
2798dns_dispatch_getudp_dup(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2799		    isc_taskmgr_t *taskmgr, isc_sockaddr_t *localaddr,
2800		    unsigned int buffersize,
2801		    unsigned int maxbuffers, unsigned int maxrequests,
2802		    unsigned int buckets, unsigned int increment,
2803		    unsigned int attributes, unsigned int mask,
2804		    dns_dispatch_t **dispp, dns_dispatch_t *dup_dispatch)
2805{
2806	isc_result_t result;
2807	dns_dispatch_t *disp = NULL;
2808
2809	REQUIRE(VALID_DISPATCHMGR(mgr));
2810	REQUIRE(sockmgr != NULL);
2811	REQUIRE(localaddr != NULL);
2812	REQUIRE(taskmgr != NULL);
2813	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2814	REQUIRE(maxbuffers > 0);
2815	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2816	REQUIRE(increment > buckets);
2817	REQUIRE(dispp != NULL && *dispp == NULL);
2818	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) == 0);
2819
2820	result = dns_dispatchmgr_setudp(mgr, buffersize, maxbuffers,
2821					maxrequests, buckets, increment);
2822	if (result != ISC_R_SUCCESS)
2823		return (result);
2824
2825	LOCK(&mgr->lock);
2826
2827	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
2828		REQUIRE(isc_sockaddr_getport(localaddr) == 0);
2829		goto createudp;
2830	}
2831
2832	/*
2833	 * See if we have a dispatcher that matches.
2834	 */
2835	if (dup_dispatch == NULL) {
2836		result = dispatch_find(mgr, localaddr, attributes, mask, &disp);
2837		if (result == ISC_R_SUCCESS) {
2838			disp->refcount++;
2839
2840			if (disp->maxrequests < maxrequests)
2841				disp->maxrequests = maxrequests;
2842
2843			if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) == 0
2844			    && (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
2845			{
2846				disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
2847				if (disp->recv_pending != 0)
2848					isc_socket_cancel(disp->socket,
2849							  disp->task[0],
2850							  ISC_SOCKCANCEL_RECV);
2851			}
2852
2853			UNLOCK(&disp->lock);
2854			UNLOCK(&mgr->lock);
2855
2856			*dispp = disp;
2857
2858			return (ISC_R_SUCCESS);
2859		}
2860	}
2861
2862 createudp:
2863	/*
2864	 * Nope, create one.
2865	 */
2866	result = dispatch_createudp(mgr, sockmgr, taskmgr, localaddr,
2867				    maxrequests, attributes, &disp,
2868				    dup_dispatch == NULL
2869					    ? NULL
2870					    : dup_dispatch->socket);
2871
2872	if (result != ISC_R_SUCCESS) {
2873		UNLOCK(&mgr->lock);
2874		return (result);
2875	}
2876
2877	UNLOCK(&mgr->lock);
2878	*dispp = disp;
2879
2880	return (ISC_R_SUCCESS);
2881}
2882
2883isc_result_t
2884dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2885		    isc_taskmgr_t *taskmgr, isc_sockaddr_t *localaddr,
2886		    unsigned int buffersize,
2887		    unsigned int maxbuffers, unsigned int maxrequests,
2888		    unsigned int buckets, unsigned int increment,
2889		    unsigned int attributes, unsigned int mask,
2890		    dns_dispatch_t **dispp)
2891{
2892	return (dns_dispatch_getudp_dup(mgr, sockmgr, taskmgr, localaddr,
2893					buffersize, maxbuffers, maxrequests,
2894					buckets, increment, attributes,
2895					mask, dispp, NULL));
2896}
2897
2898/*
2899 * mgr should be locked.
2900 */
2901
2902#ifndef DNS_DISPATCH_HELD
2903#define DNS_DISPATCH_HELD 20U
2904#endif
2905
2906static isc_result_t
2907get_udpsocket(dns_dispatchmgr_t *mgr, dns_dispatch_t *disp,
2908	      isc_socketmgr_t *sockmgr, isc_sockaddr_t *localaddr,
2909	      isc_socket_t **sockp, isc_socket_t *dup_socket)
2910{
2911	unsigned int i, j;
2912	isc_socket_t *held[DNS_DISPATCH_HELD];
2913	isc_sockaddr_t localaddr_bound;
2914	isc_socket_t *sock = NULL;
2915	isc_result_t result = ISC_R_SUCCESS;
2916	isc_boolean_t anyport;
2917
2918	INSIST(sockp != NULL && *sockp == NULL);
2919
2920	localaddr_bound = *localaddr;
2921	anyport = ISC_TF(isc_sockaddr_getport(localaddr) == 0);
2922
2923	if (anyport) {
2924		unsigned int nports;
2925		in_port_t *ports;
2926
2927		/*
2928		 * If no port is specified, we first try to pick up a random
2929		 * port by ourselves.
2930		 */
2931		if (isc_sockaddr_pf(localaddr) == AF_INET) {
2932			nports = disp->mgr->nv4ports;
2933			ports = disp->mgr->v4ports;
2934		} else {
2935			nports = disp->mgr->nv6ports;
2936			ports = disp->mgr->v6ports;
2937		}
2938		if (nports == 0)
2939			return (ISC_R_ADDRNOTAVAIL);
2940
2941		for (i = 0; i < 1024; i++) {
2942			in_port_t prt;
2943
2944			prt = ports[dispatch_uniformrandom(
2945					DISP_ARC4CTX(disp),
2946					nports)];
2947			isc_sockaddr_setport(&localaddr_bound, prt);
2948			result = open_socket(sockmgr, &localaddr_bound,
2949					     0, &sock, NULL);
2950			/*
2951			 * Continue if the port choosen is already in use
2952			 * or the OS has reserved it.
2953			 */
2954			if (result == ISC_R_NOPERM ||
2955			    result == ISC_R_ADDRINUSE)
2956				continue;
2957			disp->localport = prt;
2958			*sockp = sock;
2959			return (result);
2960		}
2961
2962		/*
2963		 * If this fails 1024 times, we then ask the kernel for
2964		 * choosing one.
2965		 */
2966	} else {
2967		/* Allow to reuse address for non-random ports. */
2968		result = open_socket(sockmgr, localaddr,
2969				     ISC_SOCKET_REUSEADDRESS, &sock,
2970				     dup_socket);
2971
2972		if (result == ISC_R_SUCCESS)
2973			*sockp = sock;
2974
2975		return (result);
2976	}
2977
2978	memset(held, 0, sizeof(held));
2979	i = 0;
2980
2981	for (j = 0; j < 0xffffU; j++) {
2982		result = open_socket(sockmgr, localaddr, 0, &sock, NULL);
2983		if (result != ISC_R_SUCCESS)
2984			goto end;
2985		else if (portavailable(mgr, sock, NULL))
2986			break;
2987		if (held[i] != NULL)
2988			isc_socket_detach(&held[i]);
2989		held[i++] = sock;
2990		sock = NULL;
2991		if (i == DNS_DISPATCH_HELD)
2992			i = 0;
2993	}
2994	if (j == 0xffffU) {
2995		mgr_log(mgr, ISC_LOG_ERROR,
2996			"avoid-v%s-udp-ports: unable to allocate "
2997			"an available port",
2998			isc_sockaddr_pf(localaddr) == AF_INET ? "4" : "6");
2999		result = ISC_R_FAILURE;
3000		goto end;
3001	}
3002	*sockp = sock;
3003
3004end:
3005	for (i = 0; i < DNS_DISPATCH_HELD; i++) {
3006		if (held[i] != NULL)
3007			isc_socket_detach(&held[i]);
3008	}
3009
3010	return (result);
3011}
3012
3013static isc_result_t
3014dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
3015		   isc_taskmgr_t *taskmgr,
3016		   isc_sockaddr_t *localaddr,
3017		   unsigned int maxrequests,
3018		   unsigned int attributes,
3019		   dns_dispatch_t **dispp,
3020		   isc_socket_t *dup_socket)
3021{
3022	isc_result_t result;
3023	dns_dispatch_t *disp;
3024	isc_socket_t *sock = NULL;
3025	int i = 0;
3026
3027	/*
3028	 * dispatch_allocate() checks mgr for us.
3029	 */
3030	disp = NULL;
3031	result = dispatch_allocate(mgr, maxrequests, &disp);
3032	if (result != ISC_R_SUCCESS)
3033		return (result);
3034
3035	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0) {
3036		result = get_udpsocket(mgr, disp, sockmgr, localaddr, &sock,
3037				       dup_socket);
3038		if (result != ISC_R_SUCCESS)
3039			goto deallocate_dispatch;
3040
3041		if (isc_log_wouldlog(dns_lctx, 90)) {
3042			char addrbuf[ISC_SOCKADDR_FORMATSIZE];
3043
3044			isc_sockaddr_format(localaddr, addrbuf,
3045					    ISC_SOCKADDR_FORMATSIZE);
3046			mgr_log(mgr, LVL(90), "dns_dispatch_createudp: Created"
3047				" UDP dispatch for %s with socket fd %d\n",
3048				addrbuf, isc_socket_getfd(sock));
3049		}
3050
3051	} else {
3052		isc_sockaddr_t sa_any;
3053
3054		/*
3055		 * For dispatches using exclusive sockets with a specific
3056		 * source address, we only check if the specified address is
3057		 * available on the system.  Query sockets will be created later
3058		 * on demand.
3059		 */
3060		isc_sockaddr_anyofpf(&sa_any, isc_sockaddr_pf(localaddr));
3061		if (!isc_sockaddr_eqaddr(&sa_any, localaddr)) {
3062			result = open_socket(sockmgr, localaddr, 0, &sock, NULL);
3063			if (sock != NULL)
3064				isc_socket_detach(&sock);
3065			if (result != ISC_R_SUCCESS)
3066				goto deallocate_dispatch;
3067		}
3068
3069		disp->port_table = isc_mem_get(mgr->mctx,
3070					       sizeof(disp->port_table[0]) *
3071					       DNS_DISPATCH_PORTTABLESIZE);
3072		if (disp->port_table == NULL)
3073			goto deallocate_dispatch;
3074		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
3075			ISC_LIST_INIT(disp->port_table[i]);
3076
3077		result = isc_mempool_create(mgr->mctx, sizeof(dispportentry_t),
3078					    &disp->portpool);
3079		if (result != ISC_R_SUCCESS)
3080			goto deallocate_dispatch;
3081		isc_mempool_setname(disp->portpool, "disp_portpool");
3082		isc_mempool_setfreemax(disp->portpool, 128);
3083	}
3084	disp->socktype = isc_sockettype_udp;
3085	disp->socket = sock;
3086	disp->local = *localaddr;
3087
3088	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3089		disp->ntasks = MAX_INTERNAL_TASKS;
3090	else
3091		disp->ntasks = 1;
3092	for (i = 0; i < disp->ntasks; i++) {
3093		disp->task[i] = NULL;
3094		result = isc_task_create(taskmgr, 0, &disp->task[i]);
3095		if (result != ISC_R_SUCCESS) {
3096			while (--i >= 0) {
3097				isc_task_shutdown(disp->task[i]);
3098				isc_task_detach(&disp->task[i]);
3099			}
3100			goto kill_socket;
3101		}
3102		isc_task_setname(disp->task[i], "udpdispatch", disp);
3103	}
3104
3105	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
3106					    DNS_EVENT_DISPATCHCONTROL,
3107					    destroy_disp, disp,
3108					    sizeof(isc_event_t));
3109	if (disp->ctlevent == NULL) {
3110		result = ISC_R_NOMEMORY;
3111		goto kill_task;
3112	}
3113
3114	disp->sepool = NULL;
3115	if (isc_mempool_create(mgr->mctx, sizeof(isc_socketevent_t),
3116			       &disp->sepool) != ISC_R_SUCCESS)
3117	{
3118		result = ISC_R_NOMEMORY;
3119		goto kill_ctlevent;
3120	}
3121
3122	result = isc_mutex_init(&disp->sepool_lock);
3123	if (result != ISC_R_SUCCESS)
3124		goto kill_sepool;
3125
3126	isc_mempool_setname(disp->sepool, "disp_sepool");
3127	isc_mempool_setmaxalloc(disp->sepool, 32768);
3128	isc_mempool_setfreemax(disp->sepool, 32768);
3129	isc_mempool_associatelock(disp->sepool, &disp->sepool_lock);
3130	isc_mempool_setfillcount(disp->sepool, 16);
3131
3132	attributes &= ~DNS_DISPATCHATTR_TCP;
3133	attributes |= DNS_DISPATCHATTR_UDP;
3134	disp->attributes = attributes;
3135
3136	/*
3137	 * Append it to the dispatcher list.
3138	 */
3139	ISC_LIST_APPEND(mgr->list, disp, link);
3140
3141	mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp);
3142	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); /* XXX */
3143	if (disp->socket != NULL)
3144		dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
3145
3146	*dispp = disp;
3147
3148	return (result);
3149
3150	/*
3151	 * Error returns.
3152	 */
3153 kill_sepool:
3154	isc_mempool_destroy(&disp->sepool);
3155 kill_ctlevent:
3156	isc_event_free(&disp->ctlevent);
3157 kill_task:
3158	for (i = 0; i < disp->ntasks; i++)
3159		isc_task_detach(&disp->task[i]);
3160 kill_socket:
3161	if (disp->socket != NULL)
3162		isc_socket_detach(&disp->socket);
3163 deallocate_dispatch:
3164	dispatch_free(&disp);
3165
3166	return (result);
3167}
3168
3169void
3170dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp) {
3171	REQUIRE(VALID_DISPATCH(disp));
3172	REQUIRE(dispp != NULL && *dispp == NULL);
3173
3174	LOCK(&disp->lock);
3175	disp->refcount++;
3176	UNLOCK(&disp->lock);
3177
3178	*dispp = disp;
3179}
3180
3181/*
3182 * It is important to lock the manager while we are deleting the dispatch,
3183 * since dns_dispatch_getudp will call dispatch_find, which returns to
3184 * the caller a dispatch but does not attach to it until later.  _getudp
3185 * locks the manager, however, so locking it here will keep us from attaching
3186 * to a dispatcher that is in the process of going away.
3187 */
3188void
3189dns_dispatch_detach(dns_dispatch_t **dispp) {
3190	dns_dispatch_t *disp;
3191	dispsocket_t *dispsock;
3192	isc_boolean_t killit;
3193
3194	REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp));
3195
3196	disp = *dispp;
3197	*dispp = NULL;
3198
3199	LOCK(&disp->lock);
3200
3201	INSIST(disp->refcount > 0);
3202	disp->refcount--;
3203	if (disp->refcount == 0) {
3204		if (disp->recv_pending > 0)
3205			isc_socket_cancel(disp->socket, disp->task[0],
3206					  ISC_SOCKCANCEL_RECV);
3207		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
3208		     dispsock != NULL;
3209		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
3210			isc_socket_cancel(dispsock->socket, dispsock->task,
3211					  ISC_SOCKCANCEL_RECV);
3212		}
3213		disp->shutting_down = 1;
3214	}
3215
3216	dispatch_log(disp, LVL(90), "detach: refcount %d", disp->refcount);
3217
3218	killit = destroy_disp_ok(disp);
3219	UNLOCK(&disp->lock);
3220	if (killit)
3221		isc_task_send(disp->task[0], &disp->ctlevent);
3222}
3223
3224isc_result_t
3225dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
3226			  isc_task_t *task, isc_taskaction_t action, void *arg,
3227			  dns_messageid_t *idp, dns_dispentry_t **resp,
3228			  isc_socketmgr_t *sockmgr)
3229{
3230	dns_dispentry_t *res;
3231	unsigned int bucket;
3232	in_port_t localport = 0;
3233	dns_messageid_t id;
3234	int i;
3235	isc_boolean_t ok;
3236	dns_qid_t *qid;
3237	dispsocket_t *dispsocket = NULL;
3238	isc_result_t result;
3239
3240	REQUIRE(VALID_DISPATCH(disp));
3241	REQUIRE(task != NULL);
3242	REQUIRE(dest != NULL);
3243	REQUIRE(resp != NULL && *resp == NULL);
3244	REQUIRE(idp != NULL);
3245	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3246		REQUIRE(sockmgr != NULL);
3247
3248	LOCK(&disp->lock);
3249
3250	if (disp->shutting_down == 1) {
3251		UNLOCK(&disp->lock);
3252		return (ISC_R_SHUTTINGDOWN);
3253	}
3254
3255	if (disp->requests >= disp->maxrequests) {
3256		UNLOCK(&disp->lock);
3257		return (ISC_R_QUOTA);
3258	}
3259
3260	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
3261	    disp->nsockets > DNS_DISPATCH_SOCKSQUOTA) {
3262		dispsocket_t *oldestsocket;
3263		dns_dispentry_t *oldestresp;
3264		dns_dispatchevent_t *rev;
3265
3266		/*
3267		 * Kill oldest outstanding query if the number of sockets
3268		 * exceeds the quota to keep the room for new queries.
3269		 */
3270		oldestsocket = ISC_LIST_HEAD(disp->activesockets);
3271		oldestresp = oldestsocket->resp;
3272		if (oldestresp != NULL && !oldestresp->item_out) {
3273			rev = allocate_devent(oldestresp->disp);
3274			if (rev != NULL) {
3275				rev->buffer.base = NULL;
3276				rev->result = ISC_R_CANCELED;
3277				rev->id = oldestresp->id;
3278				ISC_EVENT_INIT(rev, sizeof(*rev), 0,
3279					       NULL, DNS_EVENT_DISPATCH,
3280					       oldestresp->action,
3281					       oldestresp->arg, oldestresp,
3282					       NULL, NULL);
3283				oldestresp->item_out = ISC_TRUE;
3284				isc_task_send(oldestresp->task,
3285					      ISC_EVENT_PTR(&rev));
3286				inc_stats(disp->mgr,
3287					  dns_resstatscounter_dispabort);
3288			}
3289		}
3290
3291		/*
3292		 * Move this entry to the tail so that it won't (easily) be
3293		 * examined before actually being canceled.
3294		 */
3295		ISC_LIST_UNLINK(disp->activesockets, oldestsocket, link);
3296		ISC_LIST_APPEND(disp->activesockets, oldestsocket, link);
3297	}
3298
3299	qid = DNS_QID(disp);
3300
3301	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
3302		/*
3303		 * Get a separate UDP socket with a random port number.
3304		 */
3305		result = get_dispsocket(disp, dest, sockmgr, &dispsocket,
3306					&localport);
3307		if (result != ISC_R_SUCCESS) {
3308			UNLOCK(&disp->lock);
3309			inc_stats(disp->mgr, dns_resstatscounter_dispsockfail);
3310			return (result);
3311		}
3312	} else {
3313		localport = disp->localport;
3314	}
3315
3316	/*
3317	 * Try somewhat hard to find an unique ID.
3318	 */
3319	LOCK(&qid->lock);
3320	id = (dns_messageid_t)dispatch_random(DISP_ARC4CTX(disp));
3321	ok = ISC_FALSE;
3322	i = 0;
3323	do {
3324		bucket = dns_hash(qid, dest, id, localport);
3325		if (entry_search(qid, dest, id, localport, bucket) == NULL) {
3326			ok = ISC_TRUE;
3327			break;
3328		}
3329		id += qid->qid_increment;
3330		id &= 0x0000ffff;
3331	} while (i++ < 64);
3332	UNLOCK(&qid->lock);
3333
3334	if (!ok) {
3335		UNLOCK(&disp->lock);
3336		return (ISC_R_NOMORE);
3337	}
3338
3339	res = isc_mempool_get(disp->mgr->rpool);
3340	if (res == NULL) {
3341		if (dispsocket != NULL)
3342			destroy_dispsocket(disp, &dispsocket);
3343		UNLOCK(&disp->lock);
3344		return (ISC_R_NOMEMORY);
3345	}
3346
3347	disp->refcount++;
3348	disp->requests++;
3349	res->task = NULL;
3350	isc_task_attach(task, &res->task);
3351	res->disp = disp;
3352	res->id = id;
3353	res->port = localport;
3354	res->bucket = bucket;
3355	res->host = *dest;
3356	res->action = action;
3357	res->arg = arg;
3358	res->dispsocket = dispsocket;
3359	if (dispsocket != NULL)
3360		dispsocket->resp = res;
3361	res->item_out = ISC_FALSE;
3362	ISC_LIST_INIT(res->items);
3363	ISC_LINK_INIT(res, link);
3364	res->magic = RESPONSE_MAGIC;
3365
3366	LOCK(&qid->lock);
3367	ISC_LIST_APPEND(qid->qid_table[bucket], res, link);
3368	UNLOCK(&qid->lock);
3369
3370	request_log(disp, res, LVL(90),
3371		    "attached to task %p", res->task);
3372
3373	if (((disp->attributes & DNS_DISPATCHATTR_UDP) != 0) ||
3374	    ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) {
3375		result = startrecv(disp, dispsocket);
3376		if (result != ISC_R_SUCCESS) {
3377			LOCK(&qid->lock);
3378			ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
3379			UNLOCK(&qid->lock);
3380
3381			if (dispsocket != NULL)
3382				destroy_dispsocket(disp, &dispsocket);
3383
3384			disp->refcount--;
3385			disp->requests--;
3386
3387			UNLOCK(&disp->lock);
3388			isc_task_detach(&res->task);
3389			isc_mempool_put(disp->mgr->rpool, res);
3390			return (result);
3391		}
3392	}
3393
3394	if (dispsocket != NULL)
3395		ISC_LIST_APPEND(disp->activesockets, dispsocket, link);
3396
3397	UNLOCK(&disp->lock);
3398
3399	*idp = id;
3400	*resp = res;
3401
3402	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3403		INSIST(res->dispsocket != NULL);
3404
3405	return (ISC_R_SUCCESS);
3406}
3407
3408isc_result_t
3409dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
3410			 isc_task_t *task, isc_taskaction_t action, void *arg,
3411			 dns_messageid_t *idp, dns_dispentry_t **resp)
3412{
3413	REQUIRE(VALID_DISPATCH(disp));
3414	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3415
3416	return (dns_dispatch_addresponse2(disp, dest, task, action, arg,
3417					  idp, resp, NULL));
3418}
3419
3420void
3421dns_dispatch_starttcp(dns_dispatch_t *disp) {
3422
3423	REQUIRE(VALID_DISPATCH(disp));
3424
3425	dispatch_log(disp, LVL(90), "starttcp %p", disp->task[0]);
3426
3427	LOCK(&disp->lock);
3428	disp->attributes |= DNS_DISPATCHATTR_CONNECTED;
3429	(void)startrecv(disp, NULL);
3430	UNLOCK(&disp->lock);
3431}
3432
3433void
3434dns_dispatch_removeresponse(dns_dispentry_t **resp,
3435			    dns_dispatchevent_t **sockevent)
3436{
3437	dns_dispatchmgr_t *mgr;
3438	dns_dispatch_t *disp;
3439	dns_dispentry_t *res;
3440	dispsocket_t *dispsock;
3441	dns_dispatchevent_t *ev;
3442	unsigned int bucket;
3443	isc_boolean_t killit;
3444	unsigned int n;
3445	isc_eventlist_t events;
3446	dns_qid_t *qid;
3447
3448	REQUIRE(resp != NULL);
3449	REQUIRE(VALID_RESPONSE(*resp));
3450
3451	res = *resp;
3452	*resp = NULL;
3453
3454	disp = res->disp;
3455	REQUIRE(VALID_DISPATCH(disp));
3456	mgr = disp->mgr;
3457	REQUIRE(VALID_DISPATCHMGR(mgr));
3458
3459	qid = DNS_QID(disp);
3460
3461	if (sockevent != NULL) {
3462		REQUIRE(*sockevent != NULL);
3463		ev = *sockevent;
3464		*sockevent = NULL;
3465	} else {
3466		ev = NULL;
3467	}
3468
3469	LOCK(&disp->lock);
3470
3471	INSIST(disp->requests > 0);
3472	disp->requests--;
3473	INSIST(disp->refcount > 0);
3474	disp->refcount--;
3475	if (disp->refcount == 0) {
3476		if (disp->recv_pending > 0)
3477			isc_socket_cancel(disp->socket, disp->task[0],
3478					  ISC_SOCKCANCEL_RECV);
3479		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
3480		     dispsock != NULL;
3481		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
3482			isc_socket_cancel(dispsock->socket, dispsock->task,
3483					  ISC_SOCKCANCEL_RECV);
3484		}
3485		disp->shutting_down = 1;
3486	}
3487
3488	bucket = res->bucket;
3489
3490	LOCK(&qid->lock);
3491	ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
3492	UNLOCK(&qid->lock);
3493
3494	if (ev == NULL && res->item_out) {
3495		/*
3496		 * We've posted our event, but the caller hasn't gotten it
3497		 * yet.  Take it back.
3498		 */
3499		ISC_LIST_INIT(events);
3500		n = isc_task_unsend(res->task, res, DNS_EVENT_DISPATCH,
3501				    NULL, &events);
3502		/*
3503		 * We had better have gotten it back.
3504		 */
3505		INSIST(n == 1);
3506		ev = (dns_dispatchevent_t *)ISC_LIST_HEAD(events);
3507	}
3508
3509	if (ev != NULL) {
3510		REQUIRE(res->item_out == ISC_TRUE);
3511		res->item_out = ISC_FALSE;
3512		if (ev->buffer.base != NULL)
3513			free_buffer(disp, ev->buffer.base, ev->buffer.length);
3514		free_devent(disp, ev);
3515	}
3516
3517	request_log(disp, res, LVL(90), "detaching from task %p", res->task);
3518	isc_task_detach(&res->task);
3519
3520	if (res->dispsocket != NULL) {
3521		isc_socket_cancel(res->dispsocket->socket,
3522				  res->dispsocket->task, ISC_SOCKCANCEL_RECV);
3523		res->dispsocket->resp = NULL;
3524	}
3525
3526	/*
3527	 * Free any buffered requests as well
3528	 */
3529	ev = ISC_LIST_HEAD(res->items);
3530	while (ev != NULL) {
3531		ISC_LIST_UNLINK(res->items, ev, ev_link);
3532		if (ev->buffer.base != NULL)
3533			free_buffer(disp, ev->buffer.base, ev->buffer.length);
3534		free_devent(disp, ev);
3535		ev = ISC_LIST_HEAD(res->items);
3536	}
3537	res->magic = 0;
3538	isc_mempool_put(disp->mgr->rpool, res);
3539	if (disp->shutting_down == 1)
3540		do_cancel(disp);
3541	else
3542		(void)startrecv(disp, NULL);
3543
3544	killit = destroy_disp_ok(disp);
3545	UNLOCK(&disp->lock);
3546	if (killit)
3547		isc_task_send(disp->task[0], &disp->ctlevent);
3548}
3549
3550static void
3551do_cancel(dns_dispatch_t *disp) {
3552	dns_dispatchevent_t *ev;
3553	dns_dispentry_t *resp;
3554	dns_qid_t *qid;
3555
3556	if (disp->shutdown_out == 1)
3557		return;
3558
3559	qid = DNS_QID(disp);
3560
3561	/*
3562	 * Search for the first response handler without packets outstanding
3563	 * unless a specific hander is given.
3564	 */
3565	LOCK(&qid->lock);
3566	for (resp = linear_first(qid);
3567	     resp != NULL && resp->item_out;
3568	     /* Empty. */)
3569		resp = linear_next(qid, resp);
3570
3571	/*
3572	 * No one to send the cancel event to, so nothing to do.
3573	 */
3574	if (resp == NULL)
3575		goto unlock;
3576
3577	/*
3578	 * Send the shutdown failsafe event to this resp.
3579	 */
3580	ev = disp->failsafe_ev;
3581	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, DNS_EVENT_DISPATCH,
3582		       resp->action, resp->arg, resp, NULL, NULL);
3583	ev->result = disp->shutdown_why;
3584	ev->buffer.base = NULL;
3585	ev->buffer.length = 0;
3586	disp->shutdown_out = 1;
3587	request_log(disp, resp, LVL(10),
3588		    "cancel: failsafe event %p -> task %p",
3589		    ev, resp->task);
3590	resp->item_out = ISC_TRUE;
3591	isc_task_send(resp->task, ISC_EVENT_PTR(&ev));
3592 unlock:
3593	UNLOCK(&qid->lock);
3594}
3595
3596isc_socket_t *
3597dns_dispatch_getsocket(dns_dispatch_t *disp) {
3598	REQUIRE(VALID_DISPATCH(disp));
3599
3600	return (disp->socket);
3601}
3602
3603isc_socket_t *
3604dns_dispatch_getentrysocket(dns_dispentry_t *resp) {
3605	REQUIRE(VALID_RESPONSE(resp));
3606
3607	if (resp->dispsocket != NULL)
3608		return (resp->dispsocket->socket);
3609	else
3610		return (NULL);
3611}
3612
3613isc_result_t
3614dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) {
3615
3616	REQUIRE(VALID_DISPATCH(disp));
3617	REQUIRE(addrp != NULL);
3618
3619	if (disp->socktype == isc_sockettype_udp) {
3620		*addrp = disp->local;
3621		return (ISC_R_SUCCESS);
3622	}
3623	return (ISC_R_NOTIMPLEMENTED);
3624}
3625
3626void
3627dns_dispatch_cancel(dns_dispatch_t *disp) {
3628	REQUIRE(VALID_DISPATCH(disp));
3629
3630	LOCK(&disp->lock);
3631
3632	if (disp->shutting_down == 1) {
3633		UNLOCK(&disp->lock);
3634		return;
3635	}
3636
3637	disp->shutdown_why = ISC_R_CANCELED;
3638	disp->shutting_down = 1;
3639	do_cancel(disp);
3640
3641	UNLOCK(&disp->lock);
3642
3643	return;
3644}
3645
3646unsigned int
3647dns_dispatch_getattributes(dns_dispatch_t *disp) {
3648	REQUIRE(VALID_DISPATCH(disp));
3649
3650	/*
3651	 * We don't bother locking disp here; it's the caller's responsibility
3652	 * to use only non volatile flags.
3653	 */
3654	return (disp->attributes);
3655}
3656
3657void
3658dns_dispatch_changeattributes(dns_dispatch_t *disp,
3659			      unsigned int attributes, unsigned int mask)
3660{
3661	REQUIRE(VALID_DISPATCH(disp));
3662	/* Exclusive attribute can only be set on creation */
3663	REQUIRE((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3664	/* Also, a dispatch with randomport specified cannot start listening */
3665	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0 ||
3666		(attributes & DNS_DISPATCHATTR_NOLISTEN) == 0);
3667
3668	/* XXXMLG
3669	 * Should check for valid attributes here!
3670	 */
3671
3672	LOCK(&disp->lock);
3673
3674	if ((mask & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3675		if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0 &&
3676		    (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0) {
3677			disp->attributes &= ~DNS_DISPATCHATTR_NOLISTEN;
3678			(void)startrecv(disp, NULL);
3679		} else if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN)
3680			   == 0 &&
3681			   (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3682			disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
3683			if (disp->recv_pending != 0)
3684				isc_socket_cancel(disp->socket, disp->task[0],
3685						  ISC_SOCKCANCEL_RECV);
3686		}
3687	}
3688
3689	disp->attributes &= ~mask;
3690	disp->attributes |= (attributes & mask);
3691	UNLOCK(&disp->lock);
3692}
3693
3694void
3695dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) {
3696	void *buf;
3697	isc_socketevent_t *sevent, *newsevent;
3698
3699	REQUIRE(VALID_DISPATCH(disp));
3700	REQUIRE((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0);
3701	REQUIRE(event != NULL);
3702
3703	sevent = (isc_socketevent_t *)event;
3704
3705	INSIST(sevent->n <= disp->mgr->buffersize);
3706	newsevent = (isc_socketevent_t *)
3707		    isc_event_allocate(disp->mgr->mctx, NULL,
3708				      DNS_EVENT_IMPORTRECVDONE, udp_shrecv,
3709				      disp, sizeof(isc_socketevent_t));
3710	if (newsevent == NULL)
3711		return;
3712
3713	buf = allocate_udp_buffer(disp);
3714	if (buf == NULL) {
3715		isc_event_free(ISC_EVENT_PTR(&newsevent));
3716		return;
3717	}
3718	memmove(buf, sevent->region.base, sevent->n);
3719	newsevent->region.base = buf;
3720	newsevent->region.length = disp->mgr->buffersize;
3721	newsevent->n = sevent->n;
3722	newsevent->result = sevent->result;
3723	newsevent->address = sevent->address;
3724	newsevent->timestamp = sevent->timestamp;
3725	newsevent->pktinfo = sevent->pktinfo;
3726	newsevent->attributes = sevent->attributes;
3727
3728	isc_task_send(disp->task[0], ISC_EVENT_PTR(&newsevent));
3729}
3730
3731dns_dispatch_t *
3732dns_dispatchset_get(dns_dispatchset_t *dset) {
3733	dns_dispatch_t *disp;
3734
3735	/* check that dispatch set is configured */
3736	if (dset == NULL || dset->ndisp == 0)
3737		return (NULL);
3738
3739	LOCK(&dset->lock);
3740	disp = dset->dispatches[dset->cur];
3741	dset->cur++;
3742	if (dset->cur == dset->ndisp)
3743		dset->cur = 0;
3744	UNLOCK(&dset->lock);
3745
3746	return (disp);
3747}
3748
3749isc_result_t
3750dns_dispatchset_create(isc_mem_t *mctx, isc_socketmgr_t *sockmgr,
3751		       isc_taskmgr_t *taskmgr, dns_dispatch_t *source,
3752		       dns_dispatchset_t **dsetp, int n)
3753{
3754	isc_result_t result;
3755	dns_dispatchset_t *dset;
3756	dns_dispatchmgr_t *mgr;
3757	int i, j;
3758
3759	REQUIRE(VALID_DISPATCH(source));
3760	REQUIRE((source->attributes & DNS_DISPATCHATTR_UDP) != 0);
3761	REQUIRE(dsetp != NULL && *dsetp == NULL);
3762
3763	mgr = source->mgr;
3764
3765	dset = isc_mem_get(mctx, sizeof(dns_dispatchset_t));
3766	if (dset == NULL)
3767		return (ISC_R_NOMEMORY);
3768	memset(dset, 0, sizeof(*dset));
3769
3770	result = isc_mutex_init(&dset->lock);
3771	if (result != ISC_R_SUCCESS)
3772		goto fail_alloc;
3773
3774	dset->dispatches = isc_mem_get(mctx, sizeof(dns_dispatch_t *) * n);
3775	if (dset == NULL) {
3776		result = ISC_R_NOMEMORY;
3777		goto fail_lock;
3778	}
3779
3780	isc_mem_attach(mctx, &dset->mctx);
3781	dset->ndisp = n;
3782	dset->cur = 0;
3783
3784	dset->dispatches[0] = NULL;
3785	dns_dispatch_attach(source, &dset->dispatches[0]);
3786
3787	LOCK(&mgr->lock);
3788	for (i = 1; i < n; i++) {
3789		dset->dispatches[i] = NULL;
3790		result = dispatch_createudp(mgr, sockmgr, taskmgr,
3791					    &source->local,
3792					    source->maxrequests,
3793					    source->attributes,
3794					    &dset->dispatches[i],
3795					    source->socket);
3796		if (result != ISC_R_SUCCESS)
3797			goto fail;
3798	}
3799
3800	UNLOCK(&mgr->lock);
3801	*dsetp = dset;
3802
3803	return (ISC_R_SUCCESS);
3804
3805 fail:
3806	UNLOCK(&mgr->lock);
3807
3808	for (j = 0; j < i; j++)
3809		dns_dispatch_detach(&(dset->dispatches[j]));
3810	isc_mem_put(mctx, dset->dispatches, sizeof(dns_dispatch_t *) * n);
3811	if (dset->mctx == mctx)
3812		isc_mem_detach(&dset->mctx);
3813
3814 fail_lock:
3815	DESTROYLOCK(&dset->lock);
3816
3817 fail_alloc:
3818	isc_mem_put(mctx, dset, sizeof(dns_dispatchset_t));
3819	return (result);
3820}
3821
3822void
3823dns_dispatchset_cancelall(dns_dispatchset_t *dset, isc_task_t *task) {
3824	int i;
3825
3826	REQUIRE(dset != NULL);
3827
3828	for (i = 0; i < dset->ndisp; i++) {
3829		isc_socket_t *sock;
3830		sock = dns_dispatch_getsocket(dset->dispatches[i]);
3831		isc_socket_cancel(sock, task, ISC_SOCKCANCEL_ALL);
3832	}
3833}
3834
3835void
3836dns_dispatchset_destroy(dns_dispatchset_t **dsetp) {
3837	dns_dispatchset_t *dset;
3838	int i;
3839
3840	REQUIRE(dsetp != NULL && *dsetp != NULL);
3841
3842	dset = *dsetp;
3843	for (i = 0; i < dset->ndisp; i++)
3844		dns_dispatch_detach(&(dset->dispatches[i]));
3845	isc_mem_put(dset->mctx, dset->dispatches,
3846		    sizeof(dns_dispatch_t *) * dset->ndisp);
3847	DESTROYLOCK(&dset->lock);
3848	isc_mem_putanddetach(&dset->mctx, dset, sizeof(dns_dispatchset_t));
3849
3850	*dsetp = NULL;
3851}
3852
#if 0
/*
 * Debugging aid (compiled out): print the pointer and formatted local
 * address of every dispatch on the manager's list.
 */
void
dns_dispatchmgr_dump(dns_dispatchmgr_t *mgr) {
	dns_dispatch_t *disp;
	char foo[1024];

	for (disp = ISC_LIST_HEAD(mgr->list);
	     disp != NULL;
	     disp = ISC_LIST_NEXT(disp, link)) {
		isc_sockaddr_format(&disp->local, foo, sizeof(foo));
		printf("\tdispatch %p, addr %s\n", disp, foo);
	}
}
#endif
3867