1/*
2 * Copyright (C) 2004-2009, 2011, 2012  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id$ */
19
20/*! \file */
21
22#include <config.h>
23
24#include <stdlib.h>
25#include <sys/types.h>
26#include <unistd.h>
27#include <stdlib.h>
28
29#include <isc/entropy.h>
30#include <isc/mem.h>
31#include <isc/mutex.h>
32#include <isc/portset.h>
33#include <isc/print.h>
34#include <isc/random.h>
35#include <isc/stats.h>
36#include <isc/string.h>
37#include <isc/task.h>
38#include <isc/time.h>
39#include <isc/util.h>
40
41#include <dns/acl.h>
42#include <dns/dispatch.h>
43#include <dns/events.h>
44#include <dns/log.h>
45#include <dns/message.h>
46#include <dns/portlist.h>
47#include <dns/stats.h>
48#include <dns/tcpmsg.h>
49#include <dns/types.h>
50
51typedef ISC_LIST(dns_dispentry_t)	dns_displist_t;
52
53typedef struct dispsocket		dispsocket_t;
54typedef ISC_LIST(dispsocket_t)		dispsocketlist_t;
55
56typedef struct dispportentry		dispportentry_t;
57typedef ISC_LIST(dispportentry_t)	dispportlist_t;
58
59/* ARC4 Random generator state */
60typedef struct arc4ctx {
61	isc_uint8_t	i;
62	isc_uint8_t	j;
63	isc_uint8_t	s[256];
64	int		count;
65	isc_entropy_t	*entropy;	/*%< entropy source for ARC4 */
66	isc_mutex_t	*lock;
67} arc4ctx_t;
68
69typedef struct dns_qid {
70	unsigned int	magic;
71	unsigned int	qid_nbuckets;	/*%< hash table size */
72	unsigned int	qid_increment;	/*%< id increment on collision */
73	isc_mutex_t	lock;
74	dns_displist_t	*qid_table;	/*%< the table itself */
75	dispsocketlist_t *sock_table;	/*%< socket table */
76} dns_qid_t;
77
78struct dns_dispatchmgr {
79	/* Unlocked. */
80	unsigned int			magic;
81	isc_mem_t		       *mctx;
82	dns_acl_t		       *blackhole;
83	dns_portlist_t		       *portlist;
84	isc_stats_t		       *stats;
85	isc_entropy_t		       *entropy; /*%< entropy source */
86
87	/* Locked by "lock". */
88	isc_mutex_t			lock;
89	unsigned int			state;
90	ISC_LIST(dns_dispatch_t)	list;
91
92	/* Locked by arc4_lock. */
93	isc_mutex_t			arc4_lock;
94	arc4ctx_t			arc4ctx;    /*%< ARC4 context for QID */
95
96	/* locked by buffer lock */
97	dns_qid_t			*qid;
98	isc_mutex_t			buffer_lock;
99	unsigned int			buffers;    /*%< allocated buffers */
100	unsigned int			buffersize; /*%< size of each buffer */
101	unsigned int			maxbuffers; /*%< max buffers */
102
103	/* Locked internally. */
104	isc_mutex_t			pool_lock;
105	isc_mempool_t		       *epool;	/*%< memory pool for events */
106	isc_mempool_t		       *rpool;	/*%< memory pool for replies */
107	isc_mempool_t		       *dpool;  /*%< dispatch allocations */
108	isc_mempool_t		       *bpool;	/*%< memory pool for buffers */
109	isc_mempool_t		       *spool;	/*%< memory pool for dispsocs */
110
111	/*%
112	 * Locked by qid->lock if qid exists; otherwise, can be used without
113	 * being locked.
114	 * Memory footprint considerations: this is a simple implementation of
115	 * available ports, i.e., an ordered array of the actual port numbers.
116	 * This will require about 256KB of memory in the worst case (128KB for
117	 * each of IPv4 and IPv6).  We could reduce it by representing it as a
118	 * more sophisticated way such as a list (or array) of ranges that are
119	 * searched to identify a specific port.  Our decision here is the saved
120	 * memory isn't worth the implementation complexity, considering the
121	 * fact that the whole BIND9 process (which is mainly named) already
122	 * requires a pretty large memory footprint.  We may, however, have to
123	 * revisit the decision when we want to use it as a separate module for
124	 * an environment where memory requirement is severer.
125	 */
126	in_port_t	*v4ports;	/*%< available ports for IPv4 */
127	unsigned int	nv4ports;	/*%< # of available ports for IPv4 */
128	in_port_t	*v6ports;	/*%< available ports for IPv4 */
129	unsigned int	nv6ports;	/*%< # of available ports for IPv4 */
130};
131
/* Bit in dns_dispatchmgr 'state', and the test for it. */
#define MGR_SHUTTINGDOWN		0x00000001U
#define MGR_IS_SHUTTINGDOWN(l)	(((l)->state & MGR_SHUTTINGDOWN) != 0)

/* True if the dispatch 'd' has the DNS_DISPATCHATTR_PRIVATE attribute. */
#define IS_PRIVATE(d)	(((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
136
137struct dns_dispentry {
138	unsigned int			magic;
139	dns_dispatch_t		       *disp;
140	dns_messageid_t			id;
141	in_port_t			port;
142	unsigned int			bucket;
143	isc_sockaddr_t			host;
144	isc_task_t		       *task;
145	isc_taskaction_t		action;
146	void			       *arg;
147	isc_boolean_t			item_out;
148	dispsocket_t			*dispsocket;
149	ISC_LIST(dns_dispatchevent_t)	items;
150	ISC_LINK(dns_dispentry_t)	link;
151};
152
/*%
 * Upper limit on the number of dispatch sockets pooled for reuse.  The right
 * value may vary, but experiments have shown that a busy caching server may
 * need more than 1000 sockets open concurrently.  The maximum allowable
 * number of dispatch sockets (per manager) is set to twice this value.
 */
#ifndef DNS_DISPATCH_POOLSOCKS
#define DNS_DISPATCH_POOLSOCKS			2048
#endif

/*%
 * Quota on the number of dispatch sockets.  When a dispatch holds more
 * sockets than this, new queries purge the oldest ones, so that a massive
 * number of outstanding queries cannot block subsequent queries (especially
 * when the older ones are slow and end in a timeout).
 */
#ifndef DNS_DISPATCH_SOCKSQUOTA
#define DNS_DISPATCH_SOCKSQUOTA			3072
#endif
173
174struct dispsocket {
175	unsigned int			magic;
176	isc_socket_t			*socket;
177	dns_dispatch_t			*disp;
178	isc_sockaddr_t			host;
179	in_port_t			localport; /* XXX: should be removed later */
180	dispportentry_t			*portentry;
181	dns_dispentry_t			*resp;
182	isc_task_t			*task;
183	ISC_LINK(dispsocket_t)		link;
184	unsigned int			bucket;
185	ISC_LINK(dispsocket_t)		blink;
186};
187
188/*%
189 * A port table entry.  We remember every port we first open in a table with a
190 * reference counter so that we can 'reuse' the same port (with different
191 * destination addresses) using the SO_REUSEADDR socket option.
192 */
193struct dispportentry {
194	in_port_t			port;
195	unsigned int			refs;
196	ISC_LINK(struct dispportentry)	link;
197};
198
/* Number of buckets in a dispatch's port table. */
#ifndef DNS_DISPATCH_PORTTABLESIZE
#define DNS_DISPATCH_PORTTABLESIZE	1024
#endif

#define INVALID_BUCKET		(0xffffdead)

/*%
 * Number of tasks per dispatch when separate sockets are used for different
 * transactions.  This must be a power of 2 because 32 bit random numbers are
 * reduced by it to select a task uniformly.  See get_dispsocket().
 */
#define MAX_INTERNAL_TASKS	64
211
212struct dns_dispatch {
213	/* Unlocked. */
214	unsigned int		magic;		/*%< magic */
215	dns_dispatchmgr_t      *mgr;		/*%< dispatch manager */
216	int			ntasks;
217	/*%
218	 * internal task buckets.  We use multiple tasks to distribute various
219	 * socket events well when using separate dispatch sockets.  We use the
220	 * 1st task (task[0]) for internal control events.
221	 */
222	isc_task_t	       *task[MAX_INTERNAL_TASKS];
223	isc_socket_t	       *socket;		/*%< isc socket attached to */
224	isc_sockaddr_t		local;		/*%< local address */
225	in_port_t		localport;	/*%< local UDP port */
226	unsigned int		maxrequests;	/*%< max requests */
227	isc_event_t	       *ctlevent;
228
229	/*% Locked by mgr->lock. */
230	ISC_LINK(dns_dispatch_t) link;
231
232	/* Locked by "lock". */
233	isc_mutex_t		lock;		/*%< locks all below */
234	isc_sockettype_t	socktype;
235	unsigned int		attributes;
236	unsigned int		refcount;	/*%< number of users */
237	dns_dispatchevent_t    *failsafe_ev;	/*%< failsafe cancel event */
238	unsigned int		shutting_down : 1,
239				shutdown_out : 1,
240				connected : 1,
241				tcpmsg_valid : 1,
242				recv_pending : 1; /*%< is a recv() pending? */
243	isc_result_t		shutdown_why;
244	ISC_LIST(dispsocket_t)	activesockets;
245	ISC_LIST(dispsocket_t)	inactivesockets;
246	unsigned int		nsockets;
247	unsigned int		requests;	/*%< how many requests we have */
248	unsigned int		tcpbuffers;	/*%< allocated buffers */
249	dns_tcpmsg_t		tcpmsg;		/*%< for tcp streams */
250	dns_qid_t		*qid;
251	arc4ctx_t		arc4ctx;	/*%< for QID/UDP port num */
252	dispportlist_t		*port_table;	/*%< hold ports 'owned' by us */
253	isc_mempool_t		*portpool;	/*%< port table entries  */
254};
255
/*
 * Magic numbers and the matching validity checks for the structures
 * defined in this file.
 */
#define QID_MAGIC		ISC_MAGIC('Q', 'i', 'd', ' ')
#define VALID_QID(e)		ISC_MAGIC_VALID((e), QID_MAGIC)

#define RESPONSE_MAGIC		ISC_MAGIC('D', 'r', 's', 'p')
#define VALID_RESPONSE(e)	ISC_MAGIC_VALID((e), RESPONSE_MAGIC)

#define DISPSOCK_MAGIC		ISC_MAGIC('D', 's', 'o', 'c')
#define VALID_DISPSOCK(e)	ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)

#define DISPATCH_MAGIC		ISC_MAGIC('D', 'i', 's', 'p')
#define VALID_DISPATCH(e)	ISC_MAGIC_VALID((e), DISPATCH_MAGIC)

#define DNS_DISPATCHMGR_MAGIC	ISC_MAGIC('D', 'M', 'g', 'r')
#define VALID_DISPATCHMGR(e)	ISC_MAGIC_VALID((e), DNS_DISPATCHMGR_MAGIC)

/*
 * DNS_QID() yields the QID table to use for 'disp': its own table for a TCP
 * dispatch, the manager's table otherwise.  DISP_ARC4CTX() yields the ARC4
 * context to use: the dispatch's own for UDP, the manager's otherwise.
 *
 * Both expansions are wrapped in an outer pair of parentheses so that they
 * behave as a single operand wherever they are used (for example as the
 * operand of '*', '->' or a comparison).
 */
#define DNS_QID(disp) (((disp)->socktype == isc_sockettype_tcp) ? \
		       (disp)->qid : (disp)->mgr->qid)
#define DISP_ARC4CTX(disp) (((disp)->socktype == isc_sockettype_udp) ? \
			(&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx))

/*%
 * Locking a query port buffer is a bit tricky.  We access the buffer without
 * locking until qid is created.  Technically, there is a possibility of race
 * between the creation of qid and access to the port buffer; in practice,
 * however, this should be safe because qid isn't created until the first
 * dispatch is created and there should be no contending situation until then.
 *
 * The macros are wrapped in do { } while (0) so that each is one statement:
 * a bare 'if' would silently capture an 'else' that follows the macro call.
 */
#define PORTBUFLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			LOCK(&((mgr)->qid->lock)); \
	} while (0)
#define PORTBUFUNLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			UNLOCK((&(mgr)->qid->lock)); \
	} while (0)
285
286/*
287 * Statics.
288 */
289static dns_dispentry_t *entry_search(dns_qid_t *, isc_sockaddr_t *,
290				     dns_messageid_t, in_port_t, unsigned int);
291static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
292static void destroy_disp(isc_task_t *task, isc_event_t *event);
293static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
294static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
295static void udp_exrecv(isc_task_t *, isc_event_t *);
296static void udp_shrecv(isc_task_t *, isc_event_t *);
297static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
298static void tcp_recv(isc_task_t *, isc_event_t *);
299static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
300static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
301			     in_port_t);
302static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
303static void *allocate_udp_buffer(dns_dispatch_t *disp);
304static inline void free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev);
305static inline dns_dispatchevent_t *allocate_event(dns_dispatch_t *disp);
306static void do_cancel(dns_dispatch_t *disp);
307static dns_dispentry_t *linear_first(dns_qid_t *disp);
308static dns_dispentry_t *linear_next(dns_qid_t *disp,
309				    dns_dispentry_t *resp);
310static void dispatch_free(dns_dispatch_t **dispp);
311static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
312				  dns_dispatch_t *disp,
313				  isc_socketmgr_t *sockmgr,
314				  isc_sockaddr_t *localaddr,
315				  isc_socket_t **sockp);
316static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
317				       isc_socketmgr_t *sockmgr,
318				       isc_taskmgr_t *taskmgr,
319				       isc_sockaddr_t *localaddr,
320				       unsigned int maxrequests,
321				       unsigned int attributes,
322				       dns_dispatch_t **dispp);
323static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
324static void destroy_mgr(dns_dispatchmgr_t **mgrp);
325static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
326				 unsigned int increment, dns_qid_t **qidp,
327				 isc_boolean_t needaddrtable);
328static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
329static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
330				unsigned int options, isc_socket_t **sockp);
331static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
332				   isc_sockaddr_t *sockaddrp);
333
334#define LVL(x) ISC_LOG_DEBUG(x)
335
336static void
337mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...)
338     ISC_FORMAT_PRINTF(3, 4);
339
340static void
341mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...) {
342	char msgbuf[2048];
343	va_list ap;
344
345	if (! isc_log_wouldlog(dns_lctx, level))
346		return;
347
348	va_start(ap, fmt);
349	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
350	va_end(ap);
351
352	isc_log_write(dns_lctx,
353		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
354		      level, "dispatchmgr %p: %s", mgr, msgbuf);
355}
356
357static inline void
358inc_stats(dns_dispatchmgr_t *mgr, isc_statscounter_t counter) {
359	if (mgr->stats != NULL)
360		isc_stats_increment(mgr->stats, counter);
361}
362
363static void
364dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...)
365     ISC_FORMAT_PRINTF(3, 4);
366
367static void
368dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
369	char msgbuf[2048];
370	va_list ap;
371
372	if (! isc_log_wouldlog(dns_lctx, level))
373		return;
374
375	va_start(ap, fmt);
376	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
377	va_end(ap);
378
379	isc_log_write(dns_lctx,
380		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
381		      level, "dispatch %p: %s", disp, msgbuf);
382}
383
384static void
385request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
386	    int level, const char *fmt, ...)
387     ISC_FORMAT_PRINTF(4, 5);
388
389static void
390request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
391	    int level, const char *fmt, ...)
392{
393	char msgbuf[2048];
394	char peerbuf[256];
395	va_list ap;
396
397	if (! isc_log_wouldlog(dns_lctx, level))
398		return;
399
400	va_start(ap, fmt);
401	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
402	va_end(ap);
403
404	if (VALID_RESPONSE(resp)) {
405		isc_sockaddr_format(&resp->host, peerbuf, sizeof(peerbuf));
406		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
407			      DNS_LOGMODULE_DISPATCH, level,
408			      "dispatch %p response %p %s: %s", disp, resp,
409			      peerbuf, msgbuf);
410	} else {
411		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
412			      DNS_LOGMODULE_DISPATCH, level,
413			      "dispatch %p req/resp %p: %s", disp, resp,
414			      msgbuf);
415	}
416}
417
418/*%
419 * ARC4 random number generator derived from OpenBSD.
420 * Only dispatch_random() and dispatch_uniformrandom() are expected
421 * to be called from general dispatch routines; the rest of them are subroutines
422 * for these two.
423 *
424 * The original copyright follows:
425 * Copyright (c) 1996, David Mazieres <dm@uun.org>
426 * Copyright (c) 2008, Damien Miller <djm@openbsd.org>
427 *
428 * Permission to use, copy, modify, and distribute this software for any
429 * purpose with or without fee is hereby granted, provided that the above
430 * copyright notice and this permission notice appear in all copies.
431 *
432 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
433 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
434 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
435 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
436 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
437 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
438 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
439 */
440#ifdef BIND9
441static void
442dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
443		    isc_mutex_t *lock)
444{
445	int n;
446	for (n = 0; n < 256; n++)
447		actx->s[n] = n;
448	actx->i = 0;
449	actx->j = 0;
450	actx->count = 0;
451	actx->entropy = entropy; /* don't have to attach */
452	actx->lock = lock;
453}
454
455static void
456dispatch_arc4addrandom(arc4ctx_t *actx, unsigned char *dat, int datlen) {
457	int n;
458	isc_uint8_t si;
459
460	actx->i--;
461	for (n = 0; n < 256; n++) {
462		actx->i = (actx->i + 1);
463		si = actx->s[actx->i];
464		actx->j = (actx->j + si + dat[n % datlen]);
465		actx->s[actx->i] = actx->s[actx->j];
466		actx->s[actx->j] = si;
467	}
468	actx->j = actx->i;
469}
470
471static inline isc_uint8_t
472dispatch_arc4get8(arc4ctx_t *actx) {
473	isc_uint8_t si, sj;
474
475	actx->i = (actx->i + 1);
476	si = actx->s[actx->i];
477	actx->j = (actx->j + si);
478	sj = actx->s[actx->j];
479	actx->s[actx->i] = sj;
480	actx->s[actx->j] = si;
481
482	return (actx->s[(si + sj) & 0xff]);
483}
484
485static inline isc_uint16_t
486dispatch_arc4get16(arc4ctx_t *actx) {
487	isc_uint16_t val;
488
489	val = dispatch_arc4get8(actx) << 8;
490	val |= dispatch_arc4get8(actx);
491
492	return (val);
493}
494
495static void
496dispatch_arc4stir(arc4ctx_t *actx) {
497	int i;
498	union {
499		unsigned char rnd[128];
500		isc_uint32_t rnd32[32];
501	} rnd;
502	isc_result_t result;
503
504	if (actx->entropy != NULL) {
505		/*
506		 * We accept any quality of random data to avoid blocking.
507		 */
508		result = isc_entropy_getdata(actx->entropy, rnd.rnd,
509					     sizeof(rnd), NULL, 0);
510		RUNTIME_CHECK(result == ISC_R_SUCCESS);
511	} else {
512		for (i = 0; i < 32; i++)
513			isc_random_get(&rnd.rnd32[i]);
514	}
515	dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd));
516
517	/*
518	 * Discard early keystream, as per recommendations in:
519	 * http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps
520	 */
521	for (i = 0; i < 256; i++)
522		(void)dispatch_arc4get8(actx);
523
524	/*
525	 * Derived from OpenBSD's implementation.  The rationale is not clear,
526	 * but should be conservative enough in safety, and reasonably large
527	 * for efficiency.
528	 */
529	actx->count = 1600000;
530}
531
532static isc_uint16_t
533dispatch_random(arc4ctx_t *actx) {
534	isc_uint16_t result;
535
536	if (actx->lock != NULL)
537		LOCK(actx->lock);
538
539	actx->count -= sizeof(isc_uint16_t);
540	if (actx->count <= 0)
541		dispatch_arc4stir(actx);
542	result = dispatch_arc4get16(actx);
543
544	if (actx->lock != NULL)
545		UNLOCK(actx->lock);
546
547	return (result);
548}
549#else
550/*
551 * For general purpose library, we don't have to be too strict about the
552 * quality of random values.  Performance doesn't matter much, either.
553 * So we simply use the isc_random module to keep the library as small as
554 * possible.
555 */
556
557static void
558dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
559		    isc_mutex_t *lock)
560{
561	UNUSED(actx);
562	UNUSED(entropy);
563	UNUSED(lock);
564
565	return;
566}
567
568static isc_uint16_t
569dispatch_random(arc4ctx_t *actx) {
570	isc_uint32_t r;
571
572	UNUSED(actx);
573
574	isc_random_get(&r);
575	return (r & 0xffff);
576}
577#endif	/* BIND9 */
578
579static isc_uint16_t
580dispatch_uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) {
581	isc_uint16_t min, r;
582
583	if (upper_bound < 2)
584		return (0);
585
586	/*
587	 * Ensure the range of random numbers [min, 0xffff] be a multiple of
588	 * upper_bound and contain at least a half of the 16 bit range.
589	 */
590
591	if (upper_bound > 0x8000)
592		min = 1 + ~upper_bound; /* 0x8000 - upper_bound */
593	else
594		min = (isc_uint16_t)(0x10000 % (isc_uint32_t)upper_bound);
595
596	/*
597	 * This could theoretically loop forever but each retry has
598	 * p > 0.5 (worst case, usually far better) of selecting a
599	 * number inside the range we need, so it should rarely need
600	 * to re-roll.
601	 */
602	for (;;) {
603		r = dispatch_random(actx);
604		if (r >= min)
605			break;
606	}
607
608	return (r % upper_bound);
609}
610
611/*
612 * Return a hash of the destination and message id.
613 */
614static isc_uint32_t
615dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
616	 in_port_t port)
617{
618	unsigned int ret;
619
620	ret = isc_sockaddr_hash(dest, ISC_TRUE);
621	ret ^= (id << 16) | port;
622	ret %= qid->qid_nbuckets;
623
624	INSIST(ret < qid->qid_nbuckets);
625
626	return (ret);
627}
628
629/*
630 * Find the first entry in 'qid'.  Returns NULL if there are no entries.
631 */
632static dns_dispentry_t *
633linear_first(dns_qid_t *qid) {
634	dns_dispentry_t *ret;
635	unsigned int bucket;
636
637	bucket = 0;
638
639	while (bucket < qid->qid_nbuckets) {
640		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
641		if (ret != NULL)
642			return (ret);
643		bucket++;
644	}
645
646	return (NULL);
647}
648
649/*
650 * Find the next entry after 'resp' in 'qid'.  Return NULL if there are
651 * no more entries.
652 */
653static dns_dispentry_t *
654linear_next(dns_qid_t *qid, dns_dispentry_t *resp) {
655	dns_dispentry_t *ret;
656	unsigned int bucket;
657
658	ret = ISC_LIST_NEXT(resp, link);
659	if (ret != NULL)
660		return (ret);
661
662	bucket = resp->bucket;
663	bucket++;
664	while (bucket < qid->qid_nbuckets) {
665		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
666		if (ret != NULL)
667			return (ret);
668		bucket++;
669	}
670
671	return (NULL);
672}
673
674/*
675 * The dispatch must be locked.
676 */
677static isc_boolean_t
678destroy_disp_ok(dns_dispatch_t *disp)
679{
680	if (disp->refcount != 0)
681		return (ISC_FALSE);
682
683	if (disp->recv_pending != 0)
684		return (ISC_FALSE);
685
686	if (!ISC_LIST_EMPTY(disp->activesockets))
687		return (ISC_FALSE);
688
689	if (disp->shutting_down == 0)
690		return (ISC_FALSE);
691
692	return (ISC_TRUE);
693}
694
695/*
696 * Called when refcount reaches 0 (and safe to destroy).
697 *
698 * The dispatcher must not be locked.
699 * The manager must be locked.
700 */
701static void
702destroy_disp(isc_task_t *task, isc_event_t *event) {
703	dns_dispatch_t *disp;
704	dns_dispatchmgr_t *mgr;
705	isc_boolean_t killmgr;
706	dispsocket_t *dispsocket;
707	int i;
708
709	INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);
710
711	UNUSED(task);
712
713	disp = event->ev_arg;
714	mgr = disp->mgr;
715
716	LOCK(&mgr->lock);
717	ISC_LIST_UNLINK(mgr->list, disp, link);
718
719	dispatch_log(disp, LVL(90),
720		     "shutting down; detaching from sock %p, task %p",
721		     disp->socket, disp->task[0]); /* XXXX */
722
723	if (disp->socket != NULL)
724		isc_socket_detach(&disp->socket);
725	while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
726		ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
727		destroy_dispsocket(disp, &dispsocket);
728	}
729	for (i = 0; i < disp->ntasks; i++)
730		isc_task_detach(&disp->task[i]);
731	isc_event_free(&event);
732
733	dispatch_free(&disp);
734
735	killmgr = destroy_mgr_ok(mgr);
736	UNLOCK(&mgr->lock);
737	if (killmgr)
738		destroy_mgr(&mgr);
739}
740
741/*%
742 * Manipulate port table per dispatch: find an entry for a given port number,
743 * create a new entry, and decrement a given entry with possible clean-up.
744 */
745static dispportentry_t *
746port_search(dns_dispatch_t *disp, in_port_t port) {
747	dispportentry_t *portentry;
748
749	REQUIRE(disp->port_table != NULL);
750
751	portentry = ISC_LIST_HEAD(disp->port_table[port %
752						   DNS_DISPATCH_PORTTABLESIZE]);
753	while (portentry != NULL) {
754		if (portentry->port == port)
755			return (portentry);
756		portentry = ISC_LIST_NEXT(portentry, link);
757	}
758
759	return (NULL);
760}
761
762static dispportentry_t *
763new_portentry(dns_dispatch_t *disp, in_port_t port) {
764	dispportentry_t *portentry;
765
766	REQUIRE(disp->port_table != NULL);
767
768	portentry = isc_mempool_get(disp->portpool);
769	if (portentry == NULL)
770		return (portentry);
771
772	portentry->port = port;
773	portentry->refs = 0;
774	ISC_LINK_INIT(portentry, link);
775	ISC_LIST_APPEND(disp->port_table[port % DNS_DISPATCH_PORTTABLESIZE],
776			portentry, link);
777
778	return (portentry);
779}
780
781/*%
782 * The caller must not hold the qid->lock.
783 */
784static void
785deref_portentry(dns_dispatch_t *disp, dispportentry_t **portentryp) {
786	dispportentry_t *portentry = *portentryp;
787	dns_qid_t *qid;
788
789	REQUIRE(disp->port_table != NULL);
790	REQUIRE(portentry != NULL && portentry->refs > 0);
791
792	qid = DNS_QID(disp);
793	LOCK(&qid->lock);
794	portentry->refs--;
795	if (portentry->refs == 0) {
796		ISC_LIST_UNLINK(disp->port_table[portentry->port %
797						 DNS_DISPATCH_PORTTABLESIZE],
798				portentry, link);
799		isc_mempool_put(disp->portpool, portentry);
800	}
801
802	*portentryp = NULL;
803	UNLOCK(&qid->lock);
804}
805
806/*%
807 * Find a dispsocket for socket address 'dest', and port number 'port'.
808 * Return NULL if no such entry exists.
809 */
810static dispsocket_t *
811socket_search(dns_qid_t *qid, isc_sockaddr_t *dest, in_port_t port,
812	      unsigned int bucket)
813{
814	dispsocket_t *dispsock;
815
816	REQUIRE(bucket < qid->qid_nbuckets);
817
818	dispsock = ISC_LIST_HEAD(qid->sock_table[bucket]);
819
820	while (dispsock != NULL) {
821		if (dispsock->portentry != NULL &&
822		    dispsock->portentry->port == port &&
823		    isc_sockaddr_equal(dest, &dispsock->host))
824			return (dispsock);
825		dispsock = ISC_LIST_NEXT(dispsock, blink);
826	}
827
828	return (NULL);
829}
830
831/*%
832 * Make a new socket for a single dispatch with a random port number.
833 * The caller must hold the disp->lock and qid->lock.
834 */
835static isc_result_t
836get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
837	       isc_socketmgr_t *sockmgr, dns_qid_t *qid,
838	       dispsocket_t **dispsockp, in_port_t *portp)
839{
840	int i;
841	isc_uint32_t r;
842	dns_dispatchmgr_t *mgr = disp->mgr;
843	isc_socket_t *sock = NULL;
844	isc_result_t result = ISC_R_FAILURE;
845	in_port_t port;
846	isc_sockaddr_t localaddr;
847	unsigned int bucket = 0;
848	dispsocket_t *dispsock;
849	unsigned int nports;
850	in_port_t *ports;
851	unsigned int bindoptions;
852	dispportentry_t *portentry = NULL;
853
854	if (isc_sockaddr_pf(&disp->local) == AF_INET) {
855		nports = disp->mgr->nv4ports;
856		ports = disp->mgr->v4ports;
857	} else {
858		nports = disp->mgr->nv6ports;
859		ports = disp->mgr->v6ports;
860	}
861	if (nports == 0)
862		return (ISC_R_ADDRNOTAVAIL);
863
864	dispsock = ISC_LIST_HEAD(disp->inactivesockets);
865	if (dispsock != NULL) {
866		ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link);
867		sock = dispsock->socket;
868		dispsock->socket = NULL;
869	} else {
870		dispsock = isc_mempool_get(mgr->spool);
871		if (dispsock == NULL)
872			return (ISC_R_NOMEMORY);
873
874		disp->nsockets++;
875		dispsock->socket = NULL;
876		dispsock->disp = disp;
877		dispsock->resp = NULL;
878		dispsock->portentry = NULL;
879		isc_random_get(&r);
880		dispsock->task = NULL;
881		isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task);
882		ISC_LINK_INIT(dispsock, link);
883		ISC_LINK_INIT(dispsock, blink);
884		dispsock->magic = DISPSOCK_MAGIC;
885	}
886
887	/*
888	 * Pick up a random UDP port and open a new socket with it.  Avoid
889	 * choosing ports that share the same destination because it will be
890	 * very likely to fail in bind(2) or connect(2).
891	 */
892	localaddr = disp->local;
893	for (i = 0; i < 64; i++) {
894		port = ports[dispatch_uniformrandom(DISP_ARC4CTX(disp),
895							nports)];
896		isc_sockaddr_setport(&localaddr, port);
897
898		bucket = dns_hash(qid, dest, 0, port);
899		if (socket_search(qid, dest, port, bucket) != NULL)
900			continue;
901		bindoptions = 0;
902		portentry = port_search(disp, port);
903		if (portentry != NULL)
904			bindoptions |= ISC_SOCKET_REUSEADDRESS;
905		result = open_socket(sockmgr, &localaddr, bindoptions, &sock);
906		if (result == ISC_R_SUCCESS) {
907			if (portentry == NULL) {
908				portentry = new_portentry(disp, port);
909				if (portentry == NULL) {
910					result = ISC_R_NOMEMORY;
911					break;
912				}
913			}
914			portentry->refs++;
915			break;
916		} else if (result == ISC_R_NOPERM) {
917			char buf[ISC_SOCKADDR_FORMATSIZE];
918			isc_sockaddr_format(&localaddr, buf, sizeof(buf));
919			dispatch_log(disp, ISC_LOG_WARNING,
920				     "open_socket(%s) -> %s: continuing",
921				     buf, isc_result_totext(result));
922		} else if (result != ISC_R_ADDRINUSE)
923			break;
924	}
925
926	if (result == ISC_R_SUCCESS) {
927		dispsock->socket = sock;
928		dispsock->host = *dest;
929		dispsock->portentry = portentry;
930		dispsock->bucket = bucket;
931		ISC_LIST_APPEND(qid->sock_table[bucket], dispsock, blink);
932		*dispsockp = dispsock;
933		*portp = port;
934	} else {
935		/*
936		 * We could keep it in the inactive list, but since this should
937		 * be an exceptional case and might be resource shortage, we'd
938		 * rather destroy it.
939		 */
940		if (sock != NULL)
941			isc_socket_detach(&sock);
942		destroy_dispsocket(disp, &dispsock);
943	}
944
945	return (result);
946}
947
948/*%
949 * Destroy a dedicated dispatch socket.
950 */
951static void
952destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
953	dispsocket_t *dispsock;
954	dns_qid_t *qid;
955
956	/*
957	 * The dispatch must be locked.
958	 */
959
960	REQUIRE(dispsockp != NULL && *dispsockp != NULL);
961	dispsock = *dispsockp;
962	REQUIRE(!ISC_LINK_LINKED(dispsock, link));
963
964	disp->nsockets--;
965	dispsock->magic = 0;
966	if (dispsock->portentry != NULL)
967		deref_portentry(disp, &dispsock->portentry);
968	if (dispsock->socket != NULL)
969		isc_socket_detach(&dispsock->socket);
970	if (ISC_LINK_LINKED(dispsock, blink)) {
971		qid = DNS_QID(disp);
972		LOCK(&qid->lock);
973		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
974				blink);
975		UNLOCK(&qid->lock);
976	}
977	if (dispsock->task != NULL)
978		isc_task_detach(&dispsock->task);
979	isc_mempool_put(disp->mgr->spool, dispsock);
980
981	*dispsockp = NULL;
982}
983
984/*%
985 * Deactivate a dedicated dispatch socket.  Move it to the inactive list for
986 * future reuse unless the total number of sockets are exceeding the maximum.
987 */
988static void
989deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
990	isc_result_t result;
991	dns_qid_t *qid;
992
993	/*
994	 * The dispatch must be locked.
995	 */
996	ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
997	if (dispsock->resp != NULL) {
998		INSIST(dispsock->resp->dispsocket == dispsock);
999		dispsock->resp->dispsocket = NULL;
1000	}
1001
1002	INSIST(dispsock->portentry != NULL);
1003	deref_portentry(disp, &dispsock->portentry);
1004
1005#ifdef BIND9
1006	if (disp->nsockets > DNS_DISPATCH_POOLSOCKS)
1007		destroy_dispsocket(disp, &dispsock);
1008	else {
1009		result = isc_socket_close(dispsock->socket);
1010
1011		qid = DNS_QID(disp);
1012		LOCK(&qid->lock);
1013		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
1014				blink);
1015		UNLOCK(&qid->lock);
1016
1017		if (result == ISC_R_SUCCESS)
1018			ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
1019		else {
1020			/*
1021			 * If the underlying system does not allow this
1022			 * optimization, destroy this temporary structure (and
1023			 * create a new one for a new transaction).
1024			 */
1025			INSIST(result == ISC_R_NOTIMPLEMENTED);
1026			destroy_dispsocket(disp, &dispsock);
1027		}
1028	}
1029#else
1030	/* This kind of optimization isn't necessary for normal use */
1031	UNUSED(qid);
1032	UNUSED(result);
1033
1034	destroy_dispsocket(disp, &dispsock);
1035#endif
1036}
1037
1038/*
1039 * Find an entry for query ID 'id', socket address 'dest', and port number
1040 * 'port'.
1041 * Return NULL if no such entry exists.
1042 */
1043static dns_dispentry_t *
1044entry_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
1045	     in_port_t port, unsigned int bucket)
1046{
1047	dns_dispentry_t *res;
1048
1049	REQUIRE(bucket < qid->qid_nbuckets);
1050
1051	res = ISC_LIST_HEAD(qid->qid_table[bucket]);
1052
1053	while (res != NULL) {
1054		if (res->id == id && isc_sockaddr_equal(dest, &res->host) &&
1055		    res->port == port) {
1056			return (res);
1057		}
1058		res = ISC_LIST_NEXT(res, link);
1059	}
1060
1061	return (NULL);
1062}
1063
1064static void
1065free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len) {
1066	INSIST(buf != NULL && len != 0);
1067
1068
1069	switch (disp->socktype) {
1070	case isc_sockettype_tcp:
1071		INSIST(disp->tcpbuffers > 0);
1072		disp->tcpbuffers--;
1073		isc_mem_put(disp->mgr->mctx, buf, len);
1074		break;
1075	case isc_sockettype_udp:
1076		LOCK(&disp->mgr->buffer_lock);
1077		INSIST(disp->mgr->buffers > 0);
1078		INSIST(len == disp->mgr->buffersize);
1079		disp->mgr->buffers--;
1080		isc_mempool_put(disp->mgr->bpool, buf);
1081		UNLOCK(&disp->mgr->buffer_lock);
1082		break;
1083	default:
1084		INSIST(0);
1085		break;
1086	}
1087}
1088
1089static void *
1090allocate_udp_buffer(dns_dispatch_t *disp) {
1091	void *temp;
1092
1093	LOCK(&disp->mgr->buffer_lock);
1094	temp = isc_mempool_get(disp->mgr->bpool);
1095
1096	if (temp != NULL)
1097		disp->mgr->buffers++;
1098	UNLOCK(&disp->mgr->buffer_lock);
1099
1100	return (temp);
1101}
1102
1103static inline void
1104free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev) {
1105	if (disp->failsafe_ev == ev) {
1106		INSIST(disp->shutdown_out == 1);
1107		disp->shutdown_out = 0;
1108
1109		return;
1110	}
1111
1112	isc_mempool_put(disp->mgr->epool, ev);
1113}
1114
1115static inline dns_dispatchevent_t *
1116allocate_event(dns_dispatch_t *disp) {
1117	dns_dispatchevent_t *ev;
1118
1119	ev = isc_mempool_get(disp->mgr->epool);
1120	if (ev == NULL)
1121		return (NULL);
1122	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, 0,
1123		       NULL, NULL, NULL, NULL, NULL);
1124
1125	return (ev);
1126}
1127
1128static void
1129udp_exrecv(isc_task_t *task, isc_event_t *ev) {
1130	dispsocket_t *dispsock = ev->ev_arg;
1131
1132	UNUSED(task);
1133
1134	REQUIRE(VALID_DISPSOCK(dispsock));
1135	udp_recv(ev, dispsock->disp, dispsock);
1136}
1137
1138static void
1139udp_shrecv(isc_task_t *task, isc_event_t *ev) {
1140	dns_dispatch_t *disp = ev->ev_arg;
1141
1142	UNUSED(task);
1143
1144	REQUIRE(VALID_DISPATCH(disp));
1145	udp_recv(ev, disp, NULL);
1146}
1147
/*
 * Handle a completed UDP receive.  'dispsock' is NULL when the packet came
 * in via udp_shrecv() (the dispatch's own socket) and non-NULL when it came
 * in via udp_exrecv() (a dedicated dispatch socket).
 *
 * General flow:
 *
 * If I/O result == CANCELED or error, free the buffer.
 *
 * If query, free the buffer, restart.
 *
 * If response:
 *	Allocate event, fill in details.
 *		If cannot allocate, free buffer, restart.
 *	find target.  If not found, free buffer, restart.
 *	if event queue is not empty, queue.  else, send.
 *	restart.
 *
 * The dispatch lock is taken on entry and released on every return path,
 * and 'ev_in' is freed on every return path.
 */
static void
udp_recv(isc_event_t *ev_in, dns_dispatch_t *disp, dispsocket_t *dispsock) {
	isc_socketevent_t *ev = (isc_socketevent_t *)ev_in;
	dns_messageid_t id;
	isc_result_t dres;
	isc_buffer_t source;
	unsigned int flags;
	dns_dispentry_t *resp = NULL;
	dns_dispatchevent_t *rev;
	unsigned int bucket;
	isc_boolean_t killit;
	isc_boolean_t queue_response;
	dns_dispatchmgr_t *mgr;
	dns_qid_t *qid;
	isc_netaddr_t netaddr;
	int match;
	int result;	/* receives the isc_result_t returned by startrecv() */
	isc_boolean_t qidlocked = ISC_FALSE;

	LOCK(&disp->lock);

	mgr = disp->mgr;
	qid = mgr->qid;

	dispatch_log(disp, LVL(90),
		     "got packet: requests %d, buffers %d, recvs %d",
		     disp->requests, disp->mgr->buffers, disp->recv_pending);

	if (dispsock == NULL && ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
		/*
		 * Unless the receive event was imported from a listening
		 * interface, in which case the event type is
		 * DNS_EVENT_IMPORTRECVDONE, receive operation must be pending.
		 */
		INSIST(disp->recv_pending != 0);
		disp->recv_pending = 0;
	}

	if (dispsock != NULL &&
	    (ev->result == ISC_R_CANCELED || dispsock->resp == NULL)) {
		/*
		 * dispsock->resp can be NULL if this transaction was canceled
		 * just after receiving a response.  Since this socket is
		 * exclusively used and there should be at most one receive
		 * event the canceled event should have had no effect.  So
		 * we can (and should) deactivate the socket right now.
		 */
		deactivate_dispsocket(disp, dispsock);
		dispsock = NULL;
	}

	if (disp->shutting_down) {
		/*
		 * This dispatcher is shutting down.
		 */
		free_buffer(disp, ev->region.base, ev->region.length);

		isc_event_free(&ev_in);
		ev = NULL;

		killit = destroy_disp_ok(disp);
		UNLOCK(&disp->lock);
		if (killit)
			isc_task_send(disp->task[0], &disp->ctlevent);

		return;
	}

	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
		if (dispsock != NULL) {
			/*
			 * dispsock->resp is non-NULL here: the NULL case
			 * was handled above by clearing dispsock.
			 */
			resp = dispsock->resp;
			id = resp->id;
			if (ev->result != ISC_R_SUCCESS) {
				/*
				 * This is most likely a network error on a
				 * connected socket.  It makes no sense to
				 * check the address or parse the packet, but it
				 * will help to return the error to the caller.
				 */
				goto sendresponse;
			}
		} else {
			/*
			 * An exclusive dispatch without a dedicated socket
			 * has nobody to deliver to: drop the packet.
			 */
			free_buffer(disp, ev->region.base, ev->region.length);

			UNLOCK(&disp->lock);
			isc_event_free(&ev_in);
			return;
		}
	} else if (ev->result != ISC_R_SUCCESS) {
		free_buffer(disp, ev->region.base, ev->region.length);

		if (ev->result != ISC_R_CANCELED)
			dispatch_log(disp, ISC_LOG_ERROR,
				     "odd socket result in udp_recv(): %s",
				     isc_result_totext(ev->result));

		UNLOCK(&disp->lock);
		isc_event_free(&ev_in);
		return;
	}

	/*
	 * If this is from a blackholed address, drop it.
	 */
	isc_netaddr_fromsockaddr(&netaddr, &ev->address);
	if (disp->mgr->blackhole != NULL &&
	    dns_acl_match(&netaddr, NULL, disp->mgr->blackhole,
			  NULL, &match, NULL) == ISC_R_SUCCESS &&
	    match > 0)
	{
		if (isc_log_wouldlog(dns_lctx, LVL(10))) {
			char netaddrstr[ISC_NETADDR_FORMATSIZE];
			isc_netaddr_format(&netaddr, netaddrstr,
					   sizeof(netaddrstr));
			dispatch_log(disp, LVL(10),
				     "blackholed packet from %s",
				     netaddrstr);
		}
		free_buffer(disp, ev->region.base, ev->region.length);
		goto restart;
	}

	/*
	 * Peek into the buffer to see what we can see.
	 */
	isc_buffer_init(&source, ev->region.base, ev->region.length);
	isc_buffer_add(&source, ev->n);
	dres = dns_message_peekheader(&source, &id, &flags);
	if (dres != ISC_R_SUCCESS) {
		free_buffer(disp, ev->region.base, ev->region.length);
		dispatch_log(disp, LVL(10), "got garbage packet");
		goto restart;
	}

	dispatch_log(disp, LVL(92),
		     "got valid DNS message header, /QR %c, id %u",
		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);

	/*
	 * Look at flags.  If query, drop it. If response,
	 * look to see where it goes.
	 */
	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
		/* query */
		free_buffer(disp, ev->region.base, ev->region.length);
		goto restart;
	}

	/*
	 * Search for the corresponding response.  If we are using an exclusive
	 * socket, we've already identified it and we can skip the search; but
	 * the ID and the address must match the expected ones.
	 */
	if (resp == NULL) {
		bucket = dns_hash(qid, &ev->address, id, disp->localport);
		LOCK(&qid->lock);
		/* Remember that the 'unlock' label must drop qid->lock. */
		qidlocked = ISC_TRUE;
		resp = entry_search(qid, &ev->address, id, disp->localport,
				    bucket);
		dispatch_log(disp, LVL(90),
			     "search for response in bucket %d: %s",
			     bucket, (resp == NULL ? "not found" : "found"));

		if (resp == NULL) {
			inc_stats(mgr, dns_resstatscounter_mismatch);
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
	} else if (resp->id != id || !isc_sockaddr_equal(&ev->address,
							 &resp->host)) {
		dispatch_log(disp, LVL(90),
			     "response to an exclusive socket doesn't match");
		inc_stats(mgr, dns_resstatscounter_mismatch);
		free_buffer(disp, ev->region.base, ev->region.length);
		goto unlock;
	}

	/*
	 * Now that we have the original dispatch the query was sent
	 * from check that the address and port the response was
	 * sent to make sense.
	 */
	if (disp != resp->disp) {
		isc_sockaddr_t a1;
		isc_sockaddr_t a2;

		/*
		 * Check that the socket types and ports match.
		 */
		if (disp->socktype != resp->disp->socktype ||
		    isc_sockaddr_getport(&disp->local) !=
		    isc_sockaddr_getport(&resp->disp->local)) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}

		/*
		 * If both dispatches are bound to an address then fail as
		 * the addresses can't be equal (enforced by the IP stack).
		 *
		 * Note under Linux a packet can be sent out via IPv4 socket
		 * and the response be received via a IPv6 socket.
		 *
		 * Requests sent out via IPv6 should always come back in
		 * via IPv6.
		 */
		if (isc_sockaddr_pf(&resp->disp->local) == PF_INET6 &&
		    isc_sockaddr_pf(&disp->local) != PF_INET6) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
		isc_sockaddr_anyofpf(&a1, isc_sockaddr_pf(&resp->disp->local));
		isc_sockaddr_anyofpf(&a2, isc_sockaddr_pf(&disp->local));
		if (!isc_sockaddr_eqaddr(&a1, &resp->disp->local) &&
		    !isc_sockaddr_eqaddr(&a2, &disp->local)) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
	}

	/*
	 * Also reached directly from the exclusive-socket error case above,
	 * with 'resp' and 'id' taken from the socket's response entry,
	 * ev->result holding the error, and qid->lock not held.
	 */
  sendresponse:
	queue_response = resp->item_out;
	rev = allocate_event(resp->disp);
	if (rev == NULL) {
		free_buffer(disp, ev->region.base, ev->region.length);
		goto unlock;
	}

	/*
	 * At this point, rev contains the event we want to fill in, and
	 * resp contains the information on the place to send it to.
	 * Send the event off.
	 */
	isc_buffer_init(&rev->buffer, ev->region.base, ev->region.length);
	isc_buffer_add(&rev->buffer, ev->n);
	rev->result = ev->result;
	rev->id = id;
	rev->addr = ev->address;
	rev->pktinfo = ev->pktinfo;
	rev->attributes = ev->attributes;
	if (queue_response) {
		/* An event is already outstanding for resp: queue this one. */
		ISC_LIST_APPEND(resp->items, rev, ev_link);
	} else {
		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL,
			       DNS_EVENT_DISPATCH,
			       resp->action, resp->arg, resp, NULL, NULL);
		request_log(disp, resp, LVL(90),
			    "[a] Sent event %p buffer %p len %d to task %p",
			    rev, rev->buffer.base, rev->buffer.length,
			    resp->task);
		resp->item_out = ISC_TRUE;
		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
	}
 unlock:
	/* qid->lock is only held if the bucket search above was performed. */
	if (qidlocked)
		UNLOCK(&qid->lock);

	/*
	 * Restart recv() to get the next packet.
	 */
 restart:
	result = startrecv(disp, dispsock);
	if (result != ISC_R_SUCCESS && dispsock != NULL) {
		/*
		 * XXX: weird. There seems to be no recovery process other than
		 * deactivate this socket anyway (since we cannot start
		 * receiving, we won't be able to receive a cancel event
		 * from the user).
		 */
		deactivate_dispsocket(disp, dispsock);
	}
	UNLOCK(&disp->lock);

	isc_event_free(&ev_in);
}
1437
/*
 * Handle a completed TCP message read; 'ev_in->ev_arg' is the dispatch.
 *
 * General flow:
 *
 * If I/O result == CANCELED, EOF, or error, notify everyone as the
 * various queues drain.
 *
 * If query, restart.
 *
 * If response:
 *	Allocate event, fill in details.
 *		If cannot allocate, restart.
 *	find target.  If not found, restart.
 *	if event queue is not empty, queue.  else, send.
 *	restart.
 */
static void
tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
	dns_dispatch_t *disp = ev_in->ev_arg;
	dns_tcpmsg_t *tcpmsg = &disp->tcpmsg;
	dns_messageid_t id;
	isc_result_t dres;
	unsigned int flags;
	dns_dispentry_t *resp;
	dns_dispatchevent_t *rev;
	unsigned int bucket;
	isc_boolean_t killit;
	isc_boolean_t queue_response;
	dns_qid_t *qid;
	int level;
	char buf[ISC_SOCKADDR_FORMATSIZE];

	UNUSED(task);

	REQUIRE(VALID_DISPATCH(disp));

	qid = disp->qid;

	/*
	 * NOTE(review): this log call reads the dispatch counters before
	 * disp->lock is taken, whereas udp_recv() logs after locking.
	 */
	dispatch_log(disp, LVL(90),
		     "got TCP packet: requests %d, buffers %d, recvs %d",
		     disp->requests, disp->tcpbuffers, disp->recv_pending);

	LOCK(&disp->lock);

	INSIST(disp->recv_pending != 0);
	disp->recv_pending = 0;

	if (disp->refcount == 0) {
		/*
		 * This dispatcher is shutting down.  Force cancelation.
		 */
		tcpmsg->result = ISC_R_CANCELED;
	}

	if (tcpmsg->result != ISC_R_SUCCESS) {
		switch (tcpmsg->result) {
		case ISC_R_CANCELED:
			break;

		case ISC_R_EOF:
			dispatch_log(disp, LVL(90), "shutting down on EOF");
			do_cancel(disp);
			break;

		case ISC_R_CONNECTIONRESET:
			/* Same handling as 'default', logged at info level. */
			level = ISC_LOG_INFO;
			goto logit;

		default:
			level = ISC_LOG_ERROR;
		logit:
			isc_sockaddr_format(&tcpmsg->address, buf, sizeof(buf));
			dispatch_log(disp, level, "shutting down due to TCP "
				     "receive error: %s: %s", buf,
				     isc_result_totext(tcpmsg->result));
			do_cancel(disp);
			break;
		}

		/*
		 * The event is statically allocated in the tcpmsg
		 * structure, and destroy_disp() frees the tcpmsg, so we must
		 * free the event *before* calling destroy_disp().
		 */
		isc_event_free(&ev_in);

		disp->shutting_down = 1;
		disp->shutdown_why = tcpmsg->result;

		/*
		 * If the recv() was canceled pass the word on.
		 */
		killit = destroy_disp_ok(disp);
		UNLOCK(&disp->lock);
		if (killit)
			isc_task_send(disp->task[0], &disp->ctlevent);
		return;
	}

	dispatch_log(disp, LVL(90), "result %d, length == %d, addr = %p",
		     tcpmsg->result,
		     tcpmsg->buffer.length, tcpmsg->buffer.base);

	/*
	 * Peek into the buffer to see what we can see.
	 */
	dres = dns_message_peekheader(&tcpmsg->buffer, &id, &flags);
	if (dres != ISC_R_SUCCESS) {
		dispatch_log(disp, LVL(10), "got garbage packet");
		goto restart;
	}

	dispatch_log(disp, LVL(92),
		     "got valid DNS message header, /QR %c, id %u",
		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);

	/*
	 * Allocate an event to send to the query or response client, and
	 * allocate a new buffer for our use.
	 *
	 * NOTE(review): nothing is allocated at this point; the event is
	 * allocated further down, once a matching response entry is found.
	 */

	/*
	 * Look at flags.  If query, drop it. If response,
	 * look to see where it goes.
	 */
	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
		/*
		 * Query.
		 */
		goto restart;
	}

	/*
	 * Response.
	 */
	bucket = dns_hash(qid, &tcpmsg->address, id, disp->localport);
	LOCK(&qid->lock);
	resp = entry_search(qid, &tcpmsg->address, id, disp->localport, bucket);
	dispatch_log(disp, LVL(90),
		     "search for response in bucket %d: %s",
		     bucket, (resp == NULL ? "not found" : "found"));

	if (resp == NULL)
		goto unlock;
	queue_response = resp->item_out;
	rev = allocate_event(disp);
	if (rev == NULL)
		goto unlock;

	/*
	 * At this point, rev contains the event we want to fill in, and
	 * resp contains the information on the place to send it to.
	 * Send the event off.
	 *
	 * The message buffer is handed over from the tcpmsg to the event,
	 * so it is counted in disp->tcpbuffers from here on.  Unlike
	 * udp_recv(), rev->pktinfo and rev->attributes are not set.
	 */
	dns_tcpmsg_keepbuffer(tcpmsg, &rev->buffer);
	disp->tcpbuffers++;
	rev->result = ISC_R_SUCCESS;
	rev->id = id;
	rev->addr = tcpmsg->address;
	if (queue_response) {
		/* An event is already outstanding for resp: queue this one. */
		ISC_LIST_APPEND(resp->items, rev, ev_link);
	} else {
		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL, DNS_EVENT_DISPATCH,
			       resp->action, resp->arg, resp, NULL, NULL);
		request_log(disp, resp, LVL(90),
			    "[b] Sent event %p buffer %p len %d to task %p",
			    rev, rev->buffer.base, rev->buffer.length,
			    resp->task);
		resp->item_out = ISC_TRUE;
		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
	}
 unlock:
	UNLOCK(&qid->lock);

	/*
	 * Restart recv() to get the next packet.
	 */
 restart:
	/* The result is deliberately ignored here. */
	(void)startrecv(disp, NULL);

	UNLOCK(&disp->lock);

	isc_event_free(&ev_in);
}
1621
/*
 * Start a receive on the dispatch's own socket ('dispsock' == NULL) or on
 * the dedicated socket 'dispsock'.
 *
 * Returns ISC_R_SUCCESS when a receive was started, when none is needed
 * (shutting down, NOLISTEN, a receive already pending on the dispatch's own
 * socket, or an exclusive dispatch called without a dedicated socket), or
 * when starting a receive on the dispatch's own socket failed and the
 * dispatch was put into shutdown instead.  Returns ISC_R_NOMEMORY when the
 * manager's buffer limit is reached or no buffer could be allocated, and
 * the isc_socket_recv() error when a receive on a dedicated socket fails.
 *
 * disp must be locked.
 */
static isc_result_t
startrecv(dns_dispatch_t *disp, dispsocket_t *dispsock) {
	isc_result_t res;
	isc_region_t region;
	isc_socket_t *socket;

	if (disp->shutting_down == 1)
		return (ISC_R_SUCCESS);

	if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
		return (ISC_R_SUCCESS);

	if (disp->recv_pending != 0 && dispsock == NULL)
		return (ISC_R_SUCCESS);

	/*
	 * NOTE(review): this limit check is made for TCP dispatches too and
	 * reads the manager's counters without taking mgr->buffer_lock.
	 */
	if (disp->mgr->buffers >= disp->mgr->maxbuffers)
		return (ISC_R_NOMEMORY);

	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
	    dispsock == NULL)
		return (ISC_R_SUCCESS);

	if (dispsock != NULL)
		socket = dispsock->socket;
	else
		socket = disp->socket;
	INSIST(socket != NULL);

	switch (disp->socktype) {
		/*
		 * UDP reads are always maximal.
		 */
	case isc_sockettype_udp:
		region.length = disp->mgr->buffersize;
		region.base = allocate_udp_buffer(disp);
		if (region.base == NULL)
			return (ISC_R_NOMEMORY);
		if (dispsock != NULL) {
			/*
			 * Dedicated socket: a failure is reported to the
			 * caller; recv_pending is not used on this path.
			 */
			res = isc_socket_recv(socket, &region, 1,
					      dispsock->task, udp_exrecv,
					      dispsock);
			if (res != ISC_R_SUCCESS) {
				free_buffer(disp, region.base, region.length);
				return (res);
			}
		} else {
			res = isc_socket_recv(socket, &region, 1,
					      disp->task[0], udp_shrecv, disp);
			if (res != ISC_R_SUCCESS) {
				free_buffer(disp, region.base, region.length);
				disp->shutdown_why = res;
				disp->shutting_down = 1;
				do_cancel(disp);
				return (ISC_R_SUCCESS); /* recover by cancel */
			}
			INSIST(disp->recv_pending == 0);
			disp->recv_pending = 1;
		}
		break;

	case isc_sockettype_tcp:
		res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task[0],
					     tcp_recv, disp);
		if (res != ISC_R_SUCCESS) {
			disp->shutdown_why = res;
			disp->shutting_down = 1;
			do_cancel(disp);
			return (ISC_R_SUCCESS); /* recover by cancel */
		}
		INSIST(disp->recv_pending == 0);
		disp->recv_pending = 1;
		break;
	default:
		INSIST(0);
		break;
	}

	return (ISC_R_SUCCESS);
}
1704
1705/*
1706 * Mgr must be locked when calling this function.
1707 */
1708static isc_boolean_t
1709destroy_mgr_ok(dns_dispatchmgr_t *mgr) {
1710	mgr_log(mgr, LVL(90),
1711		"destroy_mgr_ok: shuttingdown=%d, listnonempty=%d, "
1712		"epool=%d, rpool=%d, dpool=%d",
1713		MGR_IS_SHUTTINGDOWN(mgr), !ISC_LIST_EMPTY(mgr->list),
1714		isc_mempool_getallocated(mgr->epool),
1715		isc_mempool_getallocated(mgr->rpool),
1716		isc_mempool_getallocated(mgr->dpool));
1717	if (!MGR_IS_SHUTTINGDOWN(mgr))
1718		return (ISC_FALSE);
1719	if (!ISC_LIST_EMPTY(mgr->list))
1720		return (ISC_FALSE);
1721	if (isc_mempool_getallocated(mgr->epool) != 0)
1722		return (ISC_FALSE);
1723	if (isc_mempool_getallocated(mgr->rpool) != 0)
1724		return (ISC_FALSE);
1725	if (isc_mempool_getallocated(mgr->dpool) != 0)
1726		return (ISC_FALSE);
1727
1728	return (ISC_TRUE);
1729}
1730
1731/*
1732 * Mgr must be unlocked when calling this function.
1733 */
1734static void
1735destroy_mgr(dns_dispatchmgr_t **mgrp) {
1736	isc_mem_t *mctx;
1737	dns_dispatchmgr_t *mgr;
1738
1739	mgr = *mgrp;
1740	*mgrp = NULL;
1741
1742	mctx = mgr->mctx;
1743
1744	mgr->magic = 0;
1745	mgr->mctx = NULL;
1746	DESTROYLOCK(&mgr->lock);
1747	mgr->state = 0;
1748
1749	DESTROYLOCK(&mgr->arc4_lock);
1750
1751	isc_mempool_destroy(&mgr->epool);
1752	isc_mempool_destroy(&mgr->rpool);
1753	isc_mempool_destroy(&mgr->dpool);
1754	if (mgr->bpool != NULL)
1755		isc_mempool_destroy(&mgr->bpool);
1756	if (mgr->spool != NULL)
1757		isc_mempool_destroy(&mgr->spool);
1758
1759	DESTROYLOCK(&mgr->pool_lock);
1760
1761#ifdef BIND9
1762	if (mgr->entropy != NULL)
1763		isc_entropy_detach(&mgr->entropy);
1764#endif /* BIND9 */
1765	if (mgr->qid != NULL)
1766		qid_destroy(mctx, &mgr->qid);
1767
1768	DESTROYLOCK(&mgr->buffer_lock);
1769
1770	if (mgr->blackhole != NULL)
1771		dns_acl_detach(&mgr->blackhole);
1772
1773	if (mgr->stats != NULL)
1774		isc_stats_detach(&mgr->stats);
1775
1776	if (mgr->v4ports != NULL) {
1777		isc_mem_put(mctx, mgr->v4ports,
1778			    mgr->nv4ports * sizeof(in_port_t));
1779	}
1780	if (mgr->v6ports != NULL) {
1781		isc_mem_put(mctx, mgr->v6ports,
1782			    mgr->nv6ports * sizeof(in_port_t));
1783	}
1784	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
1785	isc_mem_detach(&mctx);
1786}
1787
1788static isc_result_t
1789open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
1790	    unsigned int options, isc_socket_t **sockp)
1791{
1792	isc_socket_t *sock;
1793	isc_result_t result;
1794
1795	sock = *sockp;
1796	if (sock == NULL) {
1797		result = isc_socket_create(mgr, isc_sockaddr_pf(local),
1798					   isc_sockettype_udp, &sock);
1799		if (result != ISC_R_SUCCESS)
1800			return (result);
1801		isc_socket_setname(sock, "dispatcher", NULL);
1802	} else {
1803#ifdef BIND9
1804		result = isc_socket_open(sock);
1805		if (result != ISC_R_SUCCESS)
1806			return (result);
1807#else
1808		INSIST(0);
1809#endif
1810	}
1811
1812#ifndef ISC_ALLOW_MAPPED
1813	isc_socket_ipv6only(sock, ISC_TRUE);
1814#endif
1815	result = isc_socket_bind(sock, local, options);
1816	if (result != ISC_R_SUCCESS) {
1817		if (*sockp == NULL)
1818			isc_socket_detach(&sock);
1819		else {
1820#ifdef BIND9
1821			isc_socket_close(sock);
1822#else
1823			INSIST(0);
1824#endif
1825		}
1826		return (result);
1827	}
1828
1829	*sockp = sock;
1830	return (ISC_R_SUCCESS);
1831}
1832
1833/*%
1834 * Create a temporary port list to set the initial default set of dispatch
1835 * ports: [1024, 65535].  This is almost meaningless as the application will
1836 * normally set the ports explicitly, but is provided to fill some minor corner
1837 * cases.
1838 */
1839static isc_result_t
1840create_default_portset(isc_mem_t *mctx, isc_portset_t **portsetp) {
1841	isc_result_t result;
1842
1843	result = isc_portset_create(mctx, portsetp);
1844	if (result != ISC_R_SUCCESS)
1845		return (result);
1846	isc_portset_addrange(*portsetp, 1024, 65535);
1847
1848	return (ISC_R_SUCCESS);
1849}
1850
1851/*
1852 * Publics.
1853 */
1854
/*
 * Create a dispatch manager in memory context 'mctx' and return it through
 * '*mgrp', which must point to a NULL pointer.
 *
 * The manager gets its four mutexes, the event/response/dispatch memory
 * pools, a default set of available ports ([1024, 65535] for both address
 * families) and an initialised ARC4 random context.  In BIND9 builds
 * 'entropy', if non-NULL, is attached as the entropy source; otherwise it
 * is unused.
 *
 * Returns ISC_R_SUCCESS, or an error code after releasing everything that
 * was set up; '*mgrp' is only written on success.
 */
isc_result_t
dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy,
		       dns_dispatchmgr_t **mgrp)
{
	dns_dispatchmgr_t *mgr;
	isc_result_t result;
	isc_portset_t *v4portset = NULL;
	isc_portset_t *v6portset = NULL;

	REQUIRE(mctx != NULL);
	REQUIRE(mgrp != NULL && *mgrp == NULL);

	mgr = isc_mem_get(mctx, sizeof(dns_dispatchmgr_t));
	if (mgr == NULL)
		return (ISC_R_NOMEMORY);

	mgr->mctx = NULL;
	isc_mem_attach(mctx, &mgr->mctx);

	mgr->blackhole = NULL;
	mgr->stats = NULL;

	/* Each failure below jumps to the label that undoes the prior steps. */
	result = isc_mutex_init(&mgr->lock);
	if (result != ISC_R_SUCCESS)
		goto deallocate;

	result = isc_mutex_init(&mgr->arc4_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_lock;

	result = isc_mutex_init(&mgr->buffer_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_arc4_lock;

	result = isc_mutex_init(&mgr->pool_lock);
	if (result != ISC_R_SUCCESS)
		goto kill_buffer_lock;

	/* Any pool creation failure is reported as ISC_R_NOMEMORY. */
	mgr->epool = NULL;
	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatchevent_t),
			       &mgr->epool) != ISC_R_SUCCESS) {
		result = ISC_R_NOMEMORY;
		goto kill_pool_lock;
	}

	mgr->rpool = NULL;
	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispentry_t),
			       &mgr->rpool) != ISC_R_SUCCESS) {
		result = ISC_R_NOMEMORY;
		goto kill_epool;
	}

	mgr->dpool = NULL;
	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatch_t),
			       &mgr->dpool) != ISC_R_SUCCESS) {
		result = ISC_R_NOMEMORY;
		goto kill_rpool;
	}

	/* All three pools are associated with the same lock, pool_lock. */
	isc_mempool_setname(mgr->epool, "dispmgr_epool");
	isc_mempool_setfreemax(mgr->epool, 1024);
	isc_mempool_associatelock(mgr->epool, &mgr->pool_lock);

	isc_mempool_setname(mgr->rpool, "dispmgr_rpool");
	isc_mempool_setfreemax(mgr->rpool, 1024);
	isc_mempool_associatelock(mgr->rpool, &mgr->pool_lock);

	isc_mempool_setname(mgr->dpool, "dispmgr_dpool");
	isc_mempool_setfreemax(mgr->dpool, 1024);
	isc_mempool_associatelock(mgr->dpool, &mgr->pool_lock);

	mgr->buffers = 0;
	mgr->buffersize = 0;
	mgr->maxbuffers = 0;
	mgr->bpool = NULL;
	mgr->spool = NULL;
	mgr->entropy = NULL;
	mgr->qid = NULL;
	mgr->state = 0;
	ISC_LIST_INIT(mgr->list);
	mgr->v4ports = NULL;
	mgr->v6ports = NULL;
	mgr->nv4ports = 0;
	mgr->nv6ports = 0;
	/*
	 * The magic number is set before the port sets are installed
	 * because dns_dispatchmgr_setavailports() requires a valid manager.
	 */
	mgr->magic = DNS_DISPATCHMGR_MAGIC;

	result = create_default_portset(mctx, &v4portset);
	if (result == ISC_R_SUCCESS) {
		result = create_default_portset(mctx, &v6portset);
		if (result == ISC_R_SUCCESS) {
			result = dns_dispatchmgr_setavailports(mgr,
							       v4portset,
							       v6portset);
		}
	}
	/* The temporary port sets are destroyed whether or not that worked. */
	if (v4portset != NULL)
		isc_portset_destroy(mctx, &v4portset);
	if (v6portset != NULL)
		isc_portset_destroy(mctx, &v6portset);
	if (result != ISC_R_SUCCESS)
		goto kill_dpool;

#ifdef BIND9
	if (entropy != NULL)
		isc_entropy_attach(entropy, &mgr->entropy);
#else
	UNUSED(entropy);
#endif

	dispatch_initrandom(&mgr->arc4ctx, mgr->entropy, &mgr->arc4_lock);

	*mgrp = mgr;
	return (ISC_R_SUCCESS);

	/* Error unwinding, in reverse order of construction. */
 kill_dpool:
	isc_mempool_destroy(&mgr->dpool);
 kill_rpool:
	isc_mempool_destroy(&mgr->rpool);
 kill_epool:
	isc_mempool_destroy(&mgr->epool);
 kill_pool_lock:
	DESTROYLOCK(&mgr->pool_lock);
 kill_buffer_lock:
	DESTROYLOCK(&mgr->buffer_lock);
 kill_arc4_lock:
	DESTROYLOCK(&mgr->arc4_lock);
 kill_lock:
	DESTROYLOCK(&mgr->lock);
 deallocate:
	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
	/*
	 * This detaches through the local 'mctx' parameter rather than
	 * mgr->mctx; it pairs with the isc_mem_attach() made above.
	 */
	isc_mem_detach(&mctx);

	return (result);
}
1989
1990void
1991dns_dispatchmgr_setblackhole(dns_dispatchmgr_t *mgr, dns_acl_t *blackhole) {
1992	REQUIRE(VALID_DISPATCHMGR(mgr));
1993	if (mgr->blackhole != NULL)
1994		dns_acl_detach(&mgr->blackhole);
1995	dns_acl_attach(blackhole, &mgr->blackhole);
1996}
1997
1998dns_acl_t *
1999dns_dispatchmgr_getblackhole(dns_dispatchmgr_t *mgr) {
2000	REQUIRE(VALID_DISPATCHMGR(mgr));
2001	return (mgr->blackhole);
2002}
2003
2004void
2005dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr,
2006				 dns_portlist_t *portlist)
2007{
2008	REQUIRE(VALID_DISPATCHMGR(mgr));
2009	UNUSED(portlist);
2010
2011	/* This function is deprecated: use dns_dispatchmgr_setavailports(). */
2012	return;
2013}
2014
2015dns_portlist_t *
2016dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr) {
2017	REQUIRE(VALID_DISPATCHMGR(mgr));
2018	return (NULL);		/* this function is deprecated */
2019}
2020
2021isc_result_t
2022dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
2023			      isc_portset_t *v6portset)
2024{
2025	in_port_t *v4ports, *v6ports, p;
2026	unsigned int nv4ports, nv6ports, i4, i6;
2027
2028	REQUIRE(VALID_DISPATCHMGR(mgr));
2029
2030	nv4ports = isc_portset_nports(v4portset);
2031	nv6ports = isc_portset_nports(v6portset);
2032
2033	v4ports = NULL;
2034	if (nv4ports != 0) {
2035		v4ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv4ports);
2036		if (v4ports == NULL)
2037			return (ISC_R_NOMEMORY);
2038	}
2039	v6ports = NULL;
2040	if (nv6ports != 0) {
2041		v6ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv6ports);
2042		if (v6ports == NULL) {
2043			if (v4ports != NULL) {
2044				isc_mem_put(mgr->mctx, v4ports,
2045					    sizeof(in_port_t) *
2046					    isc_portset_nports(v4portset));
2047			}
2048			return (ISC_R_NOMEMORY);
2049		}
2050	}
2051
2052	p = 0;
2053	i4 = 0;
2054	i6 = 0;
2055	do {
2056		if (isc_portset_isset(v4portset, p)) {
2057			INSIST(i4 < nv4ports);
2058			v4ports[i4++] = p;
2059		}
2060		if (isc_portset_isset(v6portset, p)) {
2061			INSIST(i6 < nv6ports);
2062			v6ports[i6++] = p;
2063		}
2064	} while (p++ < 65535);
2065	INSIST(i4 == nv4ports && i6 == nv6ports);
2066
2067	PORTBUFLOCK(mgr);
2068	if (mgr->v4ports != NULL) {
2069		isc_mem_put(mgr->mctx, mgr->v4ports,
2070			    mgr->nv4ports * sizeof(in_port_t));
2071	}
2072	mgr->v4ports = v4ports;
2073	mgr->nv4ports = nv4ports;
2074
2075	if (mgr->v6ports != NULL) {
2076		isc_mem_put(mgr->mctx, mgr->v6ports,
2077			    mgr->nv6ports * sizeof(in_port_t));
2078	}
2079	mgr->v6ports = v6ports;
2080	mgr->nv6ports = nv6ports;
2081	PORTBUFUNLOCK(mgr);
2082
2083	return (ISC_R_SUCCESS);
2084}
2085
2086static isc_result_t
2087dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
2088		       unsigned int buffersize, unsigned int maxbuffers,
2089		       unsigned int maxrequests, unsigned int buckets,
2090		       unsigned int increment)
2091{
2092	isc_result_t result;
2093
2094	REQUIRE(VALID_DISPATCHMGR(mgr));
2095	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2096	REQUIRE(maxbuffers > 0);
2097	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2098	REQUIRE(increment > buckets);
2099
2100	/*
2101	 * Keep some number of items around.  This should be a config
2102	 * option.  For now, keep 8, but later keep at least two even
2103	 * if the caller wants less.  This allows us to ensure certain
2104	 * things, like an event can be "freed" and the next allocation
2105	 * will always succeed.
2106	 *
2107	 * Note that if limits are placed on anything here, we use one
2108	 * event internally, so the actual limit should be "wanted + 1."
2109	 *
2110	 * XXXMLG
2111	 */
2112
2113	if (maxbuffers < 8)
2114		maxbuffers = 8;
2115
2116	LOCK(&mgr->buffer_lock);
2117
2118	/* Create or adjust buffer pool */
2119	if (mgr->bpool != NULL) {
2120		/*
2121		 * We only increase the maxbuffers to avoid accidental buffer
2122		 * shortage.  Ideally we'd separate the manager-wide maximum
2123		 * from per-dispatch limits and respect the latter within the
2124		 * global limit.  But at this moment that's deemed to be
2125		 * overkilling and isn't worth additional implementation
2126		 * complexity.
2127		 */
2128		if (maxbuffers > mgr->maxbuffers) {
2129			isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2130			mgr->maxbuffers = maxbuffers;
2131		}
2132	} else {
2133		result = isc_mempool_create(mgr->mctx, buffersize, &mgr->bpool);
2134		if (result != ISC_R_SUCCESS) {
2135			UNLOCK(&mgr->buffer_lock);
2136			return (result);
2137		}
2138		isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
2139		isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2140		isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock);
2141	}
2142
2143	/* Create or adjust socket pool */
2144	if (mgr->spool != NULL) {
2145		isc_mempool_setmaxalloc(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2);
2146		UNLOCK(&mgr->buffer_lock);
2147		return (ISC_R_SUCCESS);
2148	}
2149	result = isc_mempool_create(mgr->mctx, sizeof(dispsocket_t),
2150				    &mgr->spool);
2151	if (result != ISC_R_SUCCESS) {
2152		UNLOCK(&mgr->buffer_lock);
2153		goto cleanup;
2154	}
2155	isc_mempool_setname(mgr->spool, "dispmgr_spool");
2156	isc_mempool_setmaxalloc(mgr->spool, maxrequests);
2157	isc_mempool_associatelock(mgr->spool, &mgr->pool_lock);
2158
2159	result = qid_allocate(mgr, buckets, increment, &mgr->qid, ISC_TRUE);
2160	if (result != ISC_R_SUCCESS)
2161		goto cleanup;
2162
2163	mgr->buffersize = buffersize;
2164	mgr->maxbuffers = maxbuffers;
2165	UNLOCK(&mgr->buffer_lock);
2166	return (ISC_R_SUCCESS);
2167
2168 cleanup:
2169	isc_mempool_destroy(&mgr->bpool);
2170	if (mgr->spool != NULL)
2171		isc_mempool_destroy(&mgr->spool);
2172	UNLOCK(&mgr->buffer_lock);
2173	return (result);
2174}
2175
2176void
2177dns_dispatchmgr_destroy(dns_dispatchmgr_t **mgrp) {
2178	dns_dispatchmgr_t *mgr;
2179	isc_boolean_t killit;
2180
2181	REQUIRE(mgrp != NULL);
2182	REQUIRE(VALID_DISPATCHMGR(*mgrp));
2183
2184	mgr = *mgrp;
2185	*mgrp = NULL;
2186
2187	LOCK(&mgr->lock);
2188	mgr->state |= MGR_SHUTTINGDOWN;
2189
2190	killit = destroy_mgr_ok(mgr);
2191	UNLOCK(&mgr->lock);
2192
2193	mgr_log(mgr, LVL(90), "destroy: killit=%d", killit);
2194
2195	if (killit)
2196		destroy_mgr(&mgr);
2197}
2198
2199void
2200dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, isc_stats_t *stats) {
2201	REQUIRE(VALID_DISPATCHMGR(mgr));
2202	REQUIRE(ISC_LIST_EMPTY(mgr->list));
2203	REQUIRE(mgr->stats == NULL);
2204
2205	isc_stats_attach(stats, &mgr->stats);
2206}
2207
/*
 * bsearch() comparison callback for arrays of in_port_t.
 *
 * Both arguments point at an in_port_t.  Returns -1, 0 or 1 when the key
 * is respectively less than, equal to, or greater than the array entry.
 */
static int
port_cmp(const void *key, const void *ent) {
	const in_port_t wanted = *(const in_port_t *)key;
	const in_port_t candidate = *(const in_port_t *)ent;

	if (wanted == candidate)
		return (0);

	return ((wanted < candidate) ? -1 : 1);
}
2220
2221static isc_boolean_t
2222portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2223	      isc_sockaddr_t *sockaddrp)
2224{
2225	isc_sockaddr_t sockaddr;
2226	isc_result_t result;
2227	in_port_t *ports, port;
2228	unsigned int nports;
2229	isc_boolean_t available = ISC_FALSE;
2230
2231	REQUIRE(sock != NULL || sockaddrp != NULL);
2232
2233	PORTBUFLOCK(mgr);
2234	if (sock != NULL) {
2235		sockaddrp = &sockaddr;
2236		result = isc_socket_getsockname(sock, sockaddrp);
2237		if (result != ISC_R_SUCCESS)
2238			goto unlock;
2239	}
2240
2241	if (isc_sockaddr_pf(sockaddrp) == AF_INET) {
2242		ports = mgr->v4ports;
2243		nports = mgr->nv4ports;
2244	} else {
2245		ports = mgr->v6ports;
2246		nports = mgr->nv6ports;
2247	}
2248	if (ports == NULL)
2249		goto unlock;
2250
2251	port = isc_sockaddr_getport(sockaddrp);
2252	if (bsearch(&port, ports, nports, sizeof(in_port_t), port_cmp) != NULL)
2253		available = ISC_TRUE;
2254
2255unlock:
2256	PORTBUFUNLOCK(mgr);
2257	return (available);
2258}
2259
2260#define ATTRMATCH(_a1, _a2, _mask) (((_a1) & (_mask)) == ((_a2) & (_mask)))
2261
2262static isc_boolean_t
2263local_addr_match(dns_dispatch_t *disp, isc_sockaddr_t *addr) {
2264	isc_sockaddr_t sockaddr;
2265	isc_result_t result;
2266
2267	REQUIRE(disp->socket != NULL);
2268
2269	if (addr == NULL)
2270		return (ISC_TRUE);
2271
2272	/*
2273	 * Don't match wildcard ports unless the port is available in the
2274	 * current configuration.
2275	 */
2276	if (isc_sockaddr_getport(addr) == 0 &&
2277	    isc_sockaddr_getport(&disp->local) == 0 &&
2278	    !portavailable(disp->mgr, disp->socket, NULL)) {
2279		return (ISC_FALSE);
2280	}
2281
2282	/*
2283	 * Check if we match the binding <address,port>.
2284	 * Wildcard ports match/fail here.
2285	 */
2286	if (isc_sockaddr_equal(&disp->local, addr))
2287		return (ISC_TRUE);
2288	if (isc_sockaddr_getport(addr) == 0)
2289		return (ISC_FALSE);
2290
2291	/*
2292	 * Check if we match a bound wildcard port <address,port>.
2293	 */
2294	if (!isc_sockaddr_eqaddr(&disp->local, addr))
2295		return (ISC_FALSE);
2296	result = isc_socket_getsockname(disp->socket, &sockaddr);
2297	if (result != ISC_R_SUCCESS)
2298		return (ISC_FALSE);
2299
2300	return (isc_sockaddr_equal(&sockaddr, addr));
2301}
2302
2303/*
2304 * Requires mgr be locked.
2305 *
2306 * No dispatcher can be locked by this thread when calling this function.
2307 *
2308 *
2309 * NOTE:
2310 *	If a matching dispatcher is found, it is locked after this function
2311 *	returns, and must be unlocked by the caller.
2312 */
2313static isc_result_t
2314dispatch_find(dns_dispatchmgr_t *mgr, isc_sockaddr_t *local,
2315	      unsigned int attributes, unsigned int mask,
2316	      dns_dispatch_t **dispp)
2317{
2318	dns_dispatch_t *disp;
2319	isc_result_t result;
2320
2321	/*
2322	 * Make certain that we will not match a private or exclusive dispatch.
2323	 */
2324	attributes &= ~(DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2325	mask |= (DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2326
2327	disp = ISC_LIST_HEAD(mgr->list);
2328	while (disp != NULL) {
2329		LOCK(&disp->lock);
2330		if ((disp->shutting_down == 0)
2331		    && ATTRMATCH(disp->attributes, attributes, mask)
2332		    && local_addr_match(disp, local))
2333			break;
2334		UNLOCK(&disp->lock);
2335		disp = ISC_LIST_NEXT(disp, link);
2336	}
2337
2338	if (disp == NULL) {
2339		result = ISC_R_NOTFOUND;
2340		goto out;
2341	}
2342
2343	*dispp = disp;
2344	result = ISC_R_SUCCESS;
2345 out:
2346
2347	return (result);
2348}
2349
2350static isc_result_t
2351qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
2352	     unsigned int increment, dns_qid_t **qidp,
2353	     isc_boolean_t needsocktable)
2354{
2355	dns_qid_t *qid;
2356	unsigned int i;
2357	isc_result_t result;
2358
2359	REQUIRE(VALID_DISPATCHMGR(mgr));
2360	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2361	REQUIRE(increment > buckets);
2362	REQUIRE(qidp != NULL && *qidp == NULL);
2363
2364	qid = isc_mem_get(mgr->mctx, sizeof(*qid));
2365	if (qid == NULL)
2366		return (ISC_R_NOMEMORY);
2367
2368	qid->qid_table = isc_mem_get(mgr->mctx,
2369				     buckets * sizeof(dns_displist_t));
2370	if (qid->qid_table == NULL) {
2371		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2372		return (ISC_R_NOMEMORY);
2373	}
2374
2375	qid->sock_table = NULL;
2376	if (needsocktable) {
2377		qid->sock_table = isc_mem_get(mgr->mctx, buckets *
2378					      sizeof(dispsocketlist_t));
2379		if (qid->sock_table == NULL) {
2380			isc_mem_put(mgr->mctx, qid->qid_table,
2381				    buckets * sizeof(dns_displist_t));
2382			isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2383			return (ISC_R_NOMEMORY);
2384		}
2385	}
2386
2387	result = isc_mutex_init(&qid->lock);
2388	if (result != ISC_R_SUCCESS) {
2389		if (qid->sock_table != NULL) {
2390			isc_mem_put(mgr->mctx, qid->sock_table,
2391				    buckets * sizeof(dispsocketlist_t));
2392		}
2393		isc_mem_put(mgr->mctx, qid->qid_table,
2394			    buckets * sizeof(dns_displist_t));
2395		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2396		return (result);
2397	}
2398
2399	for (i = 0; i < buckets; i++) {
2400		ISC_LIST_INIT(qid->qid_table[i]);
2401		if (qid->sock_table != NULL)
2402			ISC_LIST_INIT(qid->sock_table[i]);
2403	}
2404
2405	qid->qid_nbuckets = buckets;
2406	qid->qid_increment = increment;
2407	qid->magic = QID_MAGIC;
2408	*qidp = qid;
2409	return (ISC_R_SUCCESS);
2410}
2411
2412static void
2413qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp) {
2414	dns_qid_t *qid;
2415
2416	REQUIRE(qidp != NULL);
2417	qid = *qidp;
2418
2419	REQUIRE(VALID_QID(qid));
2420
2421	*qidp = NULL;
2422	qid->magic = 0;
2423	isc_mem_put(mctx, qid->qid_table,
2424		    qid->qid_nbuckets * sizeof(dns_displist_t));
2425	if (qid->sock_table != NULL) {
2426		isc_mem_put(mctx, qid->sock_table,
2427			    qid->qid_nbuckets * sizeof(dispsocketlist_t));
2428	}
2429	DESTROYLOCK(&qid->lock);
2430	isc_mem_put(mctx, qid, sizeof(*qid));
2431}
2432
2433/*
2434 * Allocate and set important limits.
2435 */
2436static isc_result_t
2437dispatch_allocate(dns_dispatchmgr_t *mgr, unsigned int maxrequests,
2438		  dns_dispatch_t **dispp)
2439{
2440	dns_dispatch_t *disp;
2441	isc_result_t result;
2442
2443	REQUIRE(VALID_DISPATCHMGR(mgr));
2444	REQUIRE(dispp != NULL && *dispp == NULL);
2445
2446	/*
2447	 * Set up the dispatcher, mostly.  Don't bother setting some of
2448	 * the options that are controlled by tcp vs. udp, etc.
2449	 */
2450
2451	disp = isc_mempool_get(mgr->dpool);
2452	if (disp == NULL)
2453		return (ISC_R_NOMEMORY);
2454
2455	disp->magic = 0;
2456	disp->mgr = mgr;
2457	disp->maxrequests = maxrequests;
2458	disp->attributes = 0;
2459	ISC_LINK_INIT(disp, link);
2460	disp->refcount = 1;
2461	disp->recv_pending = 0;
2462	memset(&disp->local, 0, sizeof(disp->local));
2463	disp->localport = 0;
2464	disp->shutting_down = 0;
2465	disp->shutdown_out = 0;
2466	disp->connected = 0;
2467	disp->tcpmsg_valid = 0;
2468	disp->shutdown_why = ISC_R_UNEXPECTED;
2469	disp->requests = 0;
2470	disp->tcpbuffers = 0;
2471	disp->qid = NULL;
2472	ISC_LIST_INIT(disp->activesockets);
2473	ISC_LIST_INIT(disp->inactivesockets);
2474	disp->nsockets = 0;
2475	dispatch_initrandom(&disp->arc4ctx, mgr->entropy, NULL);
2476	disp->port_table = NULL;
2477	disp->portpool = NULL;
2478
2479	result = isc_mutex_init(&disp->lock);
2480	if (result != ISC_R_SUCCESS)
2481		goto deallocate;
2482
2483	disp->failsafe_ev = allocate_event(disp);
2484	if (disp->failsafe_ev == NULL) {
2485		result = ISC_R_NOMEMORY;
2486		goto kill_lock;
2487	}
2488
2489	disp->magic = DISPATCH_MAGIC;
2490
2491	*dispp = disp;
2492	return (ISC_R_SUCCESS);
2493
2494	/*
2495	 * error returns
2496	 */
2497 kill_lock:
2498	DESTROYLOCK(&disp->lock);
2499 deallocate:
2500	isc_mempool_put(mgr->dpool, disp);
2501
2502	return (result);
2503}
2504
2505
2506/*
2507 * MUST be unlocked, and not used by anything.
2508 */
2509static void
2510dispatch_free(dns_dispatch_t **dispp)
2511{
2512	dns_dispatch_t *disp;
2513	dns_dispatchmgr_t *mgr;
2514	int i;
2515
2516	REQUIRE(VALID_DISPATCH(*dispp));
2517	disp = *dispp;
2518	*dispp = NULL;
2519
2520	mgr = disp->mgr;
2521	REQUIRE(VALID_DISPATCHMGR(mgr));
2522
2523	if (disp->tcpmsg_valid) {
2524		dns_tcpmsg_invalidate(&disp->tcpmsg);
2525		disp->tcpmsg_valid = 0;
2526	}
2527
2528	INSIST(disp->tcpbuffers == 0);
2529	INSIST(disp->requests == 0);
2530	INSIST(disp->recv_pending == 0);
2531	INSIST(ISC_LIST_EMPTY(disp->activesockets));
2532	INSIST(ISC_LIST_EMPTY(disp->inactivesockets));
2533
2534	isc_mempool_put(mgr->epool, disp->failsafe_ev);
2535	disp->failsafe_ev = NULL;
2536
2537	if (disp->qid != NULL)
2538		qid_destroy(mgr->mctx, &disp->qid);
2539
2540	if (disp->port_table != NULL) {
2541		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
2542			INSIST(ISC_LIST_EMPTY(disp->port_table[i]));
2543		isc_mem_put(mgr->mctx, disp->port_table,
2544			    sizeof(disp->port_table[0]) *
2545			    DNS_DISPATCH_PORTTABLESIZE);
2546	}
2547
2548	if (disp->portpool != NULL)
2549		isc_mempool_destroy(&disp->portpool);
2550
2551	disp->mgr = NULL;
2552	DESTROYLOCK(&disp->lock);
2553	disp->magic = 0;
2554	isc_mempool_put(mgr->dpool, disp);
2555}
2556
2557isc_result_t
2558dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2559		       isc_taskmgr_t *taskmgr, unsigned int buffersize,
2560		       unsigned int maxbuffers, unsigned int maxrequests,
2561		       unsigned int buckets, unsigned int increment,
2562		       unsigned int attributes, dns_dispatch_t **dispp)
2563{
2564	isc_result_t result;
2565	dns_dispatch_t *disp;
2566
2567	UNUSED(maxbuffers);
2568	UNUSED(buffersize);
2569
2570	REQUIRE(VALID_DISPATCHMGR(mgr));
2571	REQUIRE(isc_socket_gettype(sock) == isc_sockettype_tcp);
2572	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) != 0);
2573	REQUIRE((attributes & DNS_DISPATCHATTR_UDP) == 0);
2574
2575	attributes |= DNS_DISPATCHATTR_PRIVATE;  /* XXXMLG */
2576
2577	LOCK(&mgr->lock);
2578
2579	/*
2580	 * dispatch_allocate() checks mgr for us.
2581	 * qid_allocate() checks buckets and increment for us.
2582	 */
2583	disp = NULL;
2584	result = dispatch_allocate(mgr, maxrequests, &disp);
2585	if (result != ISC_R_SUCCESS) {
2586		UNLOCK(&mgr->lock);
2587		return (result);
2588	}
2589
2590	result = qid_allocate(mgr, buckets, increment, &disp->qid, ISC_FALSE);
2591	if (result != ISC_R_SUCCESS)
2592		goto deallocate_dispatch;
2593
2594	disp->socktype = isc_sockettype_tcp;
2595	disp->socket = NULL;
2596	isc_socket_attach(sock, &disp->socket);
2597
2598	disp->ntasks = 1;
2599	disp->task[0] = NULL;
2600	result = isc_task_create(taskmgr, 0, &disp->task[0]);
2601	if (result != ISC_R_SUCCESS)
2602		goto kill_socket;
2603
2604	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
2605					    DNS_EVENT_DISPATCHCONTROL,
2606					    destroy_disp, disp,
2607					    sizeof(isc_event_t));
2608	if (disp->ctlevent == NULL) {
2609		result = ISC_R_NOMEMORY;
2610		goto kill_task;
2611	}
2612
2613	isc_task_setname(disp->task[0], "tcpdispatch", disp);
2614
2615	dns_tcpmsg_init(mgr->mctx, disp->socket, &disp->tcpmsg);
2616	disp->tcpmsg_valid = 1;
2617
2618	disp->attributes = attributes;
2619
2620	/*
2621	 * Append it to the dispatcher list.
2622	 */
2623	ISC_LIST_APPEND(mgr->list, disp, link);
2624	UNLOCK(&mgr->lock);
2625
2626	mgr_log(mgr, LVL(90), "created TCP dispatcher %p", disp);
2627	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]);
2628
2629	*dispp = disp;
2630
2631	return (ISC_R_SUCCESS);
2632
2633	/*
2634	 * Error returns.
2635	 */
2636 kill_task:
2637	isc_task_detach(&disp->task[0]);
2638 kill_socket:
2639	isc_socket_detach(&disp->socket);
2640 deallocate_dispatch:
2641	dispatch_free(&disp);
2642
2643	UNLOCK(&mgr->lock);
2644
2645	return (result);
2646}
2647
2648isc_result_t
2649dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2650		    isc_taskmgr_t *taskmgr, isc_sockaddr_t *localaddr,
2651		    unsigned int buffersize,
2652		    unsigned int maxbuffers, unsigned int maxrequests,
2653		    unsigned int buckets, unsigned int increment,
2654		    unsigned int attributes, unsigned int mask,
2655		    dns_dispatch_t **dispp)
2656{
2657	isc_result_t result;
2658	dns_dispatch_t *disp = NULL;
2659
2660	REQUIRE(VALID_DISPATCHMGR(mgr));
2661	REQUIRE(sockmgr != NULL);
2662	REQUIRE(localaddr != NULL);
2663	REQUIRE(taskmgr != NULL);
2664	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2665	REQUIRE(maxbuffers > 0);
2666	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2667	REQUIRE(increment > buckets);
2668	REQUIRE(dispp != NULL && *dispp == NULL);
2669	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) == 0);
2670
2671	result = dns_dispatchmgr_setudp(mgr, buffersize, maxbuffers,
2672					maxrequests, buckets, increment);
2673	if (result != ISC_R_SUCCESS)
2674		return (result);
2675
2676	LOCK(&mgr->lock);
2677
2678	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
2679		REQUIRE(isc_sockaddr_getport(localaddr) == 0);
2680		goto createudp;
2681	}
2682
2683	/*
2684	 * See if we have a dispatcher that matches.
2685	 */
2686	result = dispatch_find(mgr, localaddr, attributes, mask, &disp);
2687	if (result == ISC_R_SUCCESS) {
2688		disp->refcount++;
2689
2690		if (disp->maxrequests < maxrequests)
2691			disp->maxrequests = maxrequests;
2692
2693		if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) == 0 &&
2694		    (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
2695		{
2696			disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
2697			if (disp->recv_pending != 0)
2698				isc_socket_cancel(disp->socket, disp->task[0],
2699						  ISC_SOCKCANCEL_RECV);
2700		}
2701
2702		UNLOCK(&disp->lock);
2703		UNLOCK(&mgr->lock);
2704
2705		*dispp = disp;
2706
2707		return (ISC_R_SUCCESS);
2708	}
2709
2710 createudp:
2711	/*
2712	 * Nope, create one.
2713	 */
2714	result = dispatch_createudp(mgr, sockmgr, taskmgr, localaddr,
2715				    maxrequests, attributes, &disp);
2716	if (result != ISC_R_SUCCESS) {
2717		UNLOCK(&mgr->lock);
2718		return (result);
2719	}
2720
2721	UNLOCK(&mgr->lock);
2722	*dispp = disp;
2723	return (ISC_R_SUCCESS);
2724}
2725
2726/*
2727 * mgr should be locked.
2728 */
2729
2730#ifndef DNS_DISPATCH_HELD
2731#define DNS_DISPATCH_HELD 20U
2732#endif
2733
2734static isc_result_t
2735get_udpsocket(dns_dispatchmgr_t *mgr, dns_dispatch_t *disp,
2736	      isc_socketmgr_t *sockmgr, isc_sockaddr_t *localaddr,
2737	      isc_socket_t **sockp)
2738{
2739	unsigned int i, j;
2740	isc_socket_t *held[DNS_DISPATCH_HELD];
2741	isc_sockaddr_t localaddr_bound;
2742	isc_socket_t *sock = NULL;
2743	isc_result_t result = ISC_R_SUCCESS;
2744	isc_boolean_t anyport;
2745
2746	INSIST(sockp != NULL && *sockp == NULL);
2747
2748	localaddr_bound = *localaddr;
2749	anyport = ISC_TF(isc_sockaddr_getport(localaddr) == 0);
2750
2751	if (anyport) {
2752		unsigned int nports;
2753		in_port_t *ports;
2754
2755		/*
2756		 * If no port is specified, we first try to pick up a random
2757		 * port by ourselves.
2758		 */
2759		if (isc_sockaddr_pf(&disp->local) == AF_INET) {
2760			nports = disp->mgr->nv4ports;
2761			ports = disp->mgr->v4ports;
2762		} else {
2763			nports = disp->mgr->nv6ports;
2764			ports = disp->mgr->v6ports;
2765		}
2766		if (nports == 0)
2767			return (ISC_R_ADDRNOTAVAIL);
2768
2769		for (i = 0; i < 1024; i++) {
2770			in_port_t prt;
2771
2772			prt = ports[dispatch_uniformrandom(
2773					DISP_ARC4CTX(disp),
2774					nports)];
2775			isc_sockaddr_setport(&localaddr_bound, prt);
2776			result = open_socket(sockmgr, &localaddr_bound,
2777					     0, &sock);
2778			if (result == ISC_R_SUCCESS ||
2779			    result != ISC_R_ADDRINUSE) {
2780				disp->localport = prt;
2781				*sockp = sock;
2782				return (result);
2783			}
2784		}
2785
2786		/*
2787		 * If this fails 1024 times, we then ask the kernel for
2788		 * choosing one.
2789		 */
2790	} else {
2791		/* Allow to reuse address for non-random ports. */
2792		result = open_socket(sockmgr, localaddr,
2793				     ISC_SOCKET_REUSEADDRESS, &sock);
2794
2795		if (result == ISC_R_SUCCESS)
2796			*sockp = sock;
2797
2798		return (result);
2799	}
2800
2801	memset(held, 0, sizeof(held));
2802	i = 0;
2803
2804	for (j = 0; j < 0xffffU; j++) {
2805		result = open_socket(sockmgr, localaddr, 0, &sock);
2806		if (result != ISC_R_SUCCESS)
2807			goto end;
2808		else if (!anyport)
2809			break;
2810		else if (portavailable(mgr, sock, NULL))
2811			break;
2812		if (held[i] != NULL)
2813			isc_socket_detach(&held[i]);
2814		held[i++] = sock;
2815		sock = NULL;
2816		if (i == DNS_DISPATCH_HELD)
2817			i = 0;
2818	}
2819	if (j == 0xffffU) {
2820		mgr_log(mgr, ISC_LOG_ERROR,
2821			"avoid-v%s-udp-ports: unable to allocate "
2822			"an available port",
2823			isc_sockaddr_pf(localaddr) == AF_INET ? "4" : "6");
2824		result = ISC_R_FAILURE;
2825		goto end;
2826	}
2827	*sockp = sock;
2828
2829end:
2830	for (i = 0; i < DNS_DISPATCH_HELD; i++) {
2831		if (held[i] != NULL)
2832			isc_socket_detach(&held[i]);
2833	}
2834
2835	return (result);
2836}
2837
2838static isc_result_t
2839dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2840		   isc_taskmgr_t *taskmgr,
2841		   isc_sockaddr_t *localaddr,
2842		   unsigned int maxrequests,
2843		   unsigned int attributes,
2844		   dns_dispatch_t **dispp)
2845{
2846	isc_result_t result;
2847	dns_dispatch_t *disp;
2848	isc_socket_t *sock = NULL;
2849	int i = 0;
2850
2851	/*
2852	 * dispatch_allocate() checks mgr for us.
2853	 */
2854	disp = NULL;
2855	result = dispatch_allocate(mgr, maxrequests, &disp);
2856	if (result != ISC_R_SUCCESS)
2857		return (result);
2858
2859	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0) {
2860		result = get_udpsocket(mgr, disp, sockmgr, localaddr, &sock);
2861		if (result != ISC_R_SUCCESS)
2862			goto deallocate_dispatch;
2863	} else {
2864		isc_sockaddr_t sa_any;
2865
2866		/*
2867		 * For dispatches using exclusive sockets with a specific
2868		 * source address, we only check if the specified address is
2869		 * available on the system.  Query sockets will be created later
2870		 * on demand.
2871		 */
2872		isc_sockaddr_anyofpf(&sa_any, isc_sockaddr_pf(localaddr));
2873		if (!isc_sockaddr_eqaddr(&sa_any, localaddr)) {
2874			result = open_socket(sockmgr, localaddr, 0, &sock);
2875			if (sock != NULL)
2876				isc_socket_detach(&sock);
2877			if (result != ISC_R_SUCCESS)
2878				goto deallocate_dispatch;
2879		}
2880
2881		disp->port_table = isc_mem_get(mgr->mctx,
2882					       sizeof(disp->port_table[0]) *
2883					       DNS_DISPATCH_PORTTABLESIZE);
2884		if (disp->port_table == NULL)
2885			goto deallocate_dispatch;
2886		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
2887			ISC_LIST_INIT(disp->port_table[i]);
2888
2889		result = isc_mempool_create(mgr->mctx, sizeof(dispportentry_t),
2890					    &disp->portpool);
2891		if (result != ISC_R_SUCCESS)
2892			goto deallocate_dispatch;
2893		isc_mempool_setname(disp->portpool, "disp_portpool");
2894		isc_mempool_setfreemax(disp->portpool, 128);
2895	}
2896	disp->socktype = isc_sockettype_udp;
2897	disp->socket = sock;
2898	disp->local = *localaddr;
2899
2900	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
2901		disp->ntasks = MAX_INTERNAL_TASKS;
2902	else
2903		disp->ntasks = 1;
2904	for (i = 0; i < disp->ntasks; i++) {
2905		disp->task[i] = NULL;
2906		result = isc_task_create(taskmgr, 0, &disp->task[i]);
2907		if (result != ISC_R_SUCCESS) {
2908			while (--i >= 0) {
2909				isc_task_shutdown(disp->task[i]);
2910				isc_task_detach(&disp->task[i]);
2911			}
2912			goto kill_socket;
2913		}
2914		isc_task_setname(disp->task[i], "udpdispatch", disp);
2915	}
2916
2917	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
2918					    DNS_EVENT_DISPATCHCONTROL,
2919					    destroy_disp, disp,
2920					    sizeof(isc_event_t));
2921	if (disp->ctlevent == NULL) {
2922		result = ISC_R_NOMEMORY;
2923		goto kill_task;
2924	}
2925
2926	attributes &= ~DNS_DISPATCHATTR_TCP;
2927	attributes |= DNS_DISPATCHATTR_UDP;
2928	disp->attributes = attributes;
2929
2930	/*
2931	 * Append it to the dispatcher list.
2932	 */
2933	ISC_LIST_APPEND(mgr->list, disp, link);
2934
2935	mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp);
2936	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); /* XXX */
2937	if (disp->socket != NULL)
2938		dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
2939
2940	*dispp = disp;
2941	return (result);
2942
2943	/*
2944	 * Error returns.
2945	 */
2946 kill_task:
2947	for (i = 0; i < disp->ntasks; i++)
2948		isc_task_detach(&disp->task[i]);
2949 kill_socket:
2950	if (disp->socket != NULL)
2951		isc_socket_detach(&disp->socket);
2952 deallocate_dispatch:
2953	dispatch_free(&disp);
2954
2955	return (result);
2956}
2957
2958void
2959dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp) {
2960	REQUIRE(VALID_DISPATCH(disp));
2961	REQUIRE(dispp != NULL && *dispp == NULL);
2962
2963	LOCK(&disp->lock);
2964	disp->refcount++;
2965	UNLOCK(&disp->lock);
2966
2967	*dispp = disp;
2968}
2969
2970/*
2971 * It is important to lock the manager while we are deleting the dispatch,
2972 * since dns_dispatch_getudp will call dispatch_find, which returns to
2973 * the caller a dispatch but does not attach to it until later.  _getudp
2974 * locks the manager, however, so locking it here will keep us from attaching
2975 * to a dispatcher that is in the process of going away.
2976 */
2977void
2978dns_dispatch_detach(dns_dispatch_t **dispp) {
2979	dns_dispatch_t *disp;
2980	dispsocket_t *dispsock;
2981	isc_boolean_t killit;
2982
2983	REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp));
2984
2985	disp = *dispp;
2986	*dispp = NULL;
2987
2988	LOCK(&disp->lock);
2989
2990	INSIST(disp->refcount > 0);
2991	disp->refcount--;
2992	if (disp->refcount == 0) {
2993		if (disp->recv_pending > 0)
2994			isc_socket_cancel(disp->socket, disp->task[0],
2995					  ISC_SOCKCANCEL_RECV);
2996		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
2997		     dispsock != NULL;
2998		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
2999			isc_socket_cancel(dispsock->socket, dispsock->task,
3000					  ISC_SOCKCANCEL_RECV);
3001		}
3002		disp->shutting_down = 1;
3003	}
3004
3005	dispatch_log(disp, LVL(90), "detach: refcount %d", disp->refcount);
3006
3007	killit = destroy_disp_ok(disp);
3008	UNLOCK(&disp->lock);
3009	if (killit)
3010		isc_task_send(disp->task[0], &disp->ctlevent);
3011}
3012
3013isc_result_t
3014dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
3015			  isc_task_t *task, isc_taskaction_t action, void *arg,
3016			  dns_messageid_t *idp, dns_dispentry_t **resp,
3017			  isc_socketmgr_t *sockmgr)
3018{
3019	dns_dispentry_t *res;
3020	unsigned int bucket;
3021	in_port_t localport = 0;
3022	dns_messageid_t id;
3023	int i;
3024	isc_boolean_t ok;
3025	dns_qid_t *qid;
3026	dispsocket_t *dispsocket = NULL;
3027	isc_result_t result;
3028
3029	REQUIRE(VALID_DISPATCH(disp));
3030	REQUIRE(task != NULL);
3031	REQUIRE(dest != NULL);
3032	REQUIRE(resp != NULL && *resp == NULL);
3033	REQUIRE(idp != NULL);
3034	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3035		REQUIRE(sockmgr != NULL);
3036
3037	LOCK(&disp->lock);
3038
3039	if (disp->shutting_down == 1) {
3040		UNLOCK(&disp->lock);
3041		return (ISC_R_SHUTTINGDOWN);
3042	}
3043
3044	if (disp->requests >= disp->maxrequests) {
3045		UNLOCK(&disp->lock);
3046		return (ISC_R_QUOTA);
3047	}
3048
3049	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
3050	    disp->nsockets > DNS_DISPATCH_SOCKSQUOTA) {
3051		dispsocket_t *oldestsocket;
3052		dns_dispentry_t *oldestresp;
3053		dns_dispatchevent_t *rev;
3054
3055		/*
3056		 * Kill oldest outstanding query if the number of sockets
3057		 * exceeds the quota to keep the room for new queries.
3058		 */
3059		oldestsocket = ISC_LIST_HEAD(disp->activesockets);
3060		oldestresp = oldestsocket->resp;
3061		if (oldestresp != NULL && !oldestresp->item_out) {
3062			rev = allocate_event(oldestresp->disp);
3063			if (rev != NULL) {
3064				rev->buffer.base = NULL;
3065				rev->result = ISC_R_CANCELED;
3066				rev->id = oldestresp->id;
3067				ISC_EVENT_INIT(rev, sizeof(*rev), 0,
3068					       NULL, DNS_EVENT_DISPATCH,
3069					       oldestresp->action,
3070					       oldestresp->arg, oldestresp,
3071					       NULL, NULL);
3072				oldestresp->item_out = ISC_TRUE;
3073				isc_task_send(oldestresp->task,
3074					      ISC_EVENT_PTR(&rev));
3075				inc_stats(disp->mgr,
3076					  dns_resstatscounter_dispabort);
3077			}
3078		}
3079
3080		/*
3081		 * Move this entry to the tail so that it won't (easily) be
3082		 * examined before actually being canceled.
3083		 */
3084		ISC_LIST_UNLINK(disp->activesockets, oldestsocket, link);
3085		ISC_LIST_APPEND(disp->activesockets, oldestsocket, link);
3086	}
3087
3088	qid = DNS_QID(disp);
3089	LOCK(&qid->lock);
3090
3091	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
3092		/*
3093		 * Get a separate UDP socket with a random port number.
3094		 */
3095		result = get_dispsocket(disp, dest, sockmgr, qid, &dispsocket,
3096					&localport);
3097		if (result != ISC_R_SUCCESS) {
3098			UNLOCK(&qid->lock);
3099			UNLOCK(&disp->lock);
3100			inc_stats(disp->mgr, dns_resstatscounter_dispsockfail);
3101			return (result);
3102		}
3103	} else {
3104		localport = disp->localport;
3105	}
3106
3107	/*
3108	 * Try somewhat hard to find an unique ID.
3109	 */
3110	id = (dns_messageid_t)dispatch_random(DISP_ARC4CTX(disp));
3111	bucket = dns_hash(qid, dest, id, localport);
3112	ok = ISC_FALSE;
3113	for (i = 0; i < 64; i++) {
3114		if (entry_search(qid, dest, id, localport, bucket) == NULL) {
3115			ok = ISC_TRUE;
3116			break;
3117		}
3118		id += qid->qid_increment;
3119		id &= 0x0000ffff;
3120		bucket = dns_hash(qid, dest, id, localport);
3121	}
3122
3123	if (!ok) {
3124		UNLOCK(&qid->lock);
3125		UNLOCK(&disp->lock);
3126		return (ISC_R_NOMORE);
3127	}
3128
3129	res = isc_mempool_get(disp->mgr->rpool);
3130	if (res == NULL) {
3131		UNLOCK(&qid->lock);
3132		UNLOCK(&disp->lock);
3133		if (dispsocket != NULL)
3134			destroy_dispsocket(disp, &dispsocket);
3135		return (ISC_R_NOMEMORY);
3136	}
3137
3138	disp->refcount++;
3139	disp->requests++;
3140	res->task = NULL;
3141	isc_task_attach(task, &res->task);
3142	res->disp = disp;
3143	res->id = id;
3144	res->port = localport;
3145	res->bucket = bucket;
3146	res->host = *dest;
3147	res->action = action;
3148	res->arg = arg;
3149	res->dispsocket = dispsocket;
3150	if (dispsocket != NULL)
3151		dispsocket->resp = res;
3152	res->item_out = ISC_FALSE;
3153	ISC_LIST_INIT(res->items);
3154	ISC_LINK_INIT(res, link);
3155	res->magic = RESPONSE_MAGIC;
3156	ISC_LIST_APPEND(qid->qid_table[bucket], res, link);
3157	UNLOCK(&qid->lock);
3158
3159	request_log(disp, res, LVL(90),
3160		    "attached to task %p", res->task);
3161
3162	if (((disp->attributes & DNS_DISPATCHATTR_UDP) != 0) ||
3163	    ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) {
3164		result = startrecv(disp, dispsocket);
3165		if (result != ISC_R_SUCCESS) {
3166			LOCK(&qid->lock);
3167			ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
3168			UNLOCK(&qid->lock);
3169
3170			if (dispsocket != NULL)
3171				destroy_dispsocket(disp, &dispsocket);
3172
3173			disp->refcount--;
3174			disp->requests--;
3175
3176			UNLOCK(&disp->lock);
3177			isc_task_detach(&res->task);
3178			isc_mempool_put(disp->mgr->rpool, res);
3179			return (result);
3180		}
3181	}
3182
3183	if (dispsocket != NULL)
3184		ISC_LIST_APPEND(disp->activesockets, dispsocket, link);
3185
3186	UNLOCK(&disp->lock);
3187
3188	*idp = id;
3189	*resp = res;
3190
3191	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3192		INSIST(res->dispsocket != NULL);
3193
3194	return (ISC_R_SUCCESS);
3195}
3196
3197isc_result_t
3198dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
3199			 isc_task_t *task, isc_taskaction_t action, void *arg,
3200			 dns_messageid_t *idp, dns_dispentry_t **resp)
3201{
3202	REQUIRE(VALID_DISPATCH(disp));
3203	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3204
3205	return (dns_dispatch_addresponse2(disp, dest, task, action, arg,
3206					  idp, resp, NULL));
3207}
3208
3209void
3210dns_dispatch_starttcp(dns_dispatch_t *disp) {
3211
3212	REQUIRE(VALID_DISPATCH(disp));
3213
3214	dispatch_log(disp, LVL(90), "starttcp %p", disp->task[0]);
3215
3216	LOCK(&disp->lock);
3217	disp->attributes |= DNS_DISPATCHATTR_CONNECTED;
3218	(void)startrecv(disp, NULL);
3219	UNLOCK(&disp->lock);
3220}
3221
/*
 * Tear down the response entry '*resp' and release the reference it
 * holds on its dispatch.
 *
 * '*resp' is set to NULL on entry to the teardown; the entry itself is
 * unlinked from the QID table, invalidated and returned to the
 * manager's response pool.
 *
 * If 'sockevent' is non-NULL, '*sockevent' must be non-NULL; that event
 * is taken over, '*sockevent' is set to NULL, and the event (and its
 * buffer, if any) is freed.  If 'sockevent' is NULL but the entry has an
 * event outstanding (item_out), the event is pulled back from the
 * entry's task and freed instead.
 *
 * Requires:
 *	'resp' is non-NULL and '*resp' is a valid response entry whose
 *	dispatch and dispatch manager are valid.
 */
void
dns_dispatch_removeresponse(dns_dispentry_t **resp,
			    dns_dispatchevent_t **sockevent)
{
	dns_dispatchmgr_t *mgr;
	dns_dispatch_t *disp;
	dns_dispentry_t *res;
	dispsocket_t *dispsock;
	dns_dispatchevent_t *ev;
	unsigned int bucket;
	isc_boolean_t killit;
	unsigned int n;
	isc_eventlist_t events;
	dns_qid_t *qid;

	REQUIRE(resp != NULL);
	REQUIRE(VALID_RESPONSE(*resp));

	/* Take the entry away from the caller. */
	res = *resp;
	*resp = NULL;

	disp = res->disp;
	REQUIRE(VALID_DISPATCH(disp));
	mgr = disp->mgr;
	REQUIRE(VALID_DISPATCHMGR(mgr));

	qid = DNS_QID(disp);

	/* Take the caller's event, if one was handed in. */
	if (sockevent != NULL) {
		REQUIRE(*sockevent != NULL);
		ev = *sockevent;
		*sockevent = NULL;
	} else {
		ev = NULL;
	}

	LOCK(&disp->lock);

	/*
	 * Drop this entry's request count and reference.  When the last
	 * reference goes away, cancel any pending receive on the dispatch
	 * socket and on every active per-entry socket, and mark the
	 * dispatch as shutting down.
	 */
	INSIST(disp->requests > 0);
	disp->requests--;
	INSIST(disp->refcount > 0);
	disp->refcount--;
	if (disp->refcount == 0) {
		if (disp->recv_pending > 0)
			isc_socket_cancel(disp->socket, disp->task[0],
					  ISC_SOCKCANCEL_RECV);
		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
		     dispsock != NULL;
		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
			isc_socket_cancel(dispsock->socket, dispsock->task,
					  ISC_SOCKCANCEL_RECV);
		}
		disp->shutting_down = 1;
	}

	bucket = res->bucket;

	/* The QID table is protected by its own lock. */
	LOCK(&qid->lock);
	ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
	UNLOCK(&qid->lock);

	if (ev == NULL && res->item_out) {
		/*
		 * We've posted our event, but the caller hasn't gotten it
		 * yet.  Take it back.
		 */
		ISC_LIST_INIT(events);
		n = isc_task_unsend(res->task, res, DNS_EVENT_DISPATCH,
				    NULL, &events);
		/*
		 * We had better have gotten it back.
		 */
		INSIST(n == 1);
		ev = (dns_dispatchevent_t *)ISC_LIST_HEAD(events);
	}

	/*
	 * Whether it was handed in or pulled back, an event in hand means
	 * the entry must have had one outstanding.  Release its buffer
	 * (if any) and the event itself.
	 */
	if (ev != NULL) {
		REQUIRE(res->item_out == ISC_TRUE);
		res->item_out = ISC_FALSE;
		if (ev->buffer.base != NULL)
			free_buffer(disp, ev->buffer.base, ev->buffer.length);
		free_event(disp, ev);
	}

	request_log(disp, res, LVL(90), "detaching from task %p", res->task);
	isc_task_detach(&res->task);

	/*
	 * If the entry has its own socket, cancel its pending receive and
	 * clear the socket's back-pointer to this entry.
	 */
	if (res->dispsocket != NULL) {
		isc_socket_cancel(res->dispsocket->socket,
				  res->dispsocket->task, ISC_SOCKCANCEL_RECV);
		res->dispsocket->resp = NULL;
	}

	/*
	 * Free any buffered requests as well
	 */
	ev = ISC_LIST_HEAD(res->items);
	while (ev != NULL) {
		ISC_LIST_UNLINK(res->items, ev, ev_link);
		if (ev->buffer.base != NULL)
			free_buffer(disp, ev->buffer.base, ev->buffer.length);
		free_event(disp, ev);
		ev = ISC_LIST_HEAD(res->items);
	}
	/* Invalidate the entry before handing it back to the pool. */
	res->magic = 0;
	isc_mempool_put(disp->mgr->rpool, res);
	/*
	 * A dispatch that is shutting down gets do_cancel(); otherwise
	 * startrecv() is called (its result is ignored).
	 */
	if (disp->shutting_down == 1)
		do_cancel(disp);
	else
		(void)startrecv(disp, NULL);

	/*
	 * Decide under the lock whether the dispatch can be destroyed, but
	 * post the control event only after the lock has been released.
	 */
	killit = destroy_disp_ok(disp);
	UNLOCK(&disp->lock);
	if (killit)
		isc_task_send(disp->task[0], &disp->ctlevent);
}
3338
3339static void
3340do_cancel(dns_dispatch_t *disp) {
3341	dns_dispatchevent_t *ev;
3342	dns_dispentry_t *resp;
3343	dns_qid_t *qid;
3344
3345	if (disp->shutdown_out == 1)
3346		return;
3347
3348	qid = DNS_QID(disp);
3349
3350	/*
3351	 * Search for the first response handler without packets outstanding
3352	 * unless a specific hander is given.
3353	 */
3354	LOCK(&qid->lock);
3355	for (resp = linear_first(qid);
3356	     resp != NULL && resp->item_out;
3357	     /* Empty. */)
3358		resp = linear_next(qid, resp);
3359
3360	/*
3361	 * No one to send the cancel event to, so nothing to do.
3362	 */
3363	if (resp == NULL)
3364		goto unlock;
3365
3366	/*
3367	 * Send the shutdown failsafe event to this resp.
3368	 */
3369	ev = disp->failsafe_ev;
3370	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, DNS_EVENT_DISPATCH,
3371		       resp->action, resp->arg, resp, NULL, NULL);
3372	ev->result = disp->shutdown_why;
3373	ev->buffer.base = NULL;
3374	ev->buffer.length = 0;
3375	disp->shutdown_out = 1;
3376	request_log(disp, resp, LVL(10),
3377		    "cancel: failsafe event %p -> task %p",
3378		    ev, resp->task);
3379	resp->item_out = ISC_TRUE;
3380	isc_task_send(resp->task, ISC_EVENT_PTR(&ev));
3381 unlock:
3382	UNLOCK(&qid->lock);
3383}
3384
3385isc_socket_t *
3386dns_dispatch_getsocket(dns_dispatch_t *disp) {
3387	REQUIRE(VALID_DISPATCH(disp));
3388
3389	return (disp->socket);
3390}
3391
3392isc_socket_t *
3393dns_dispatch_getentrysocket(dns_dispentry_t *resp) {
3394	REQUIRE(VALID_RESPONSE(resp));
3395
3396	if (resp->dispsocket != NULL)
3397		return (resp->dispsocket->socket);
3398	else
3399		return (NULL);
3400}
3401
3402isc_result_t
3403dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) {
3404
3405	REQUIRE(VALID_DISPATCH(disp));
3406	REQUIRE(addrp != NULL);
3407
3408	if (disp->socktype == isc_sockettype_udp) {
3409		*addrp = disp->local;
3410		return (ISC_R_SUCCESS);
3411	}
3412	return (ISC_R_NOTIMPLEMENTED);
3413}
3414
3415void
3416dns_dispatch_cancel(dns_dispatch_t *disp) {
3417	REQUIRE(VALID_DISPATCH(disp));
3418
3419	LOCK(&disp->lock);
3420
3421	if (disp->shutting_down == 1) {
3422		UNLOCK(&disp->lock);
3423		return;
3424	}
3425
3426	disp->shutdown_why = ISC_R_CANCELED;
3427	disp->shutting_down = 1;
3428	do_cancel(disp);
3429
3430	UNLOCK(&disp->lock);
3431
3432	return;
3433}
3434
3435unsigned int
3436dns_dispatch_getattributes(dns_dispatch_t *disp) {
3437	REQUIRE(VALID_DISPATCH(disp));
3438
3439	/*
3440	 * We don't bother locking disp here; it's the caller's responsibility
3441	 * to use only non volatile flags.
3442	 */
3443	return (disp->attributes);
3444}
3445
3446void
3447dns_dispatch_changeattributes(dns_dispatch_t *disp,
3448			      unsigned int attributes, unsigned int mask)
3449{
3450	REQUIRE(VALID_DISPATCH(disp));
3451	/* Exclusive attribute can only be set on creation */
3452	REQUIRE((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3453	/* Also, a dispatch with randomport specified cannot start listening */
3454	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0 ||
3455		(attributes & DNS_DISPATCHATTR_NOLISTEN) == 0);
3456
3457	/* XXXMLG
3458	 * Should check for valid attributes here!
3459	 */
3460
3461	LOCK(&disp->lock);
3462
3463	if ((mask & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3464		if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0 &&
3465		    (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0) {
3466			disp->attributes &= ~DNS_DISPATCHATTR_NOLISTEN;
3467			(void)startrecv(disp, NULL);
3468		} else if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN)
3469			   == 0 &&
3470			   (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3471			disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
3472			if (disp->recv_pending != 0)
3473				isc_socket_cancel(disp->socket, disp->task[0],
3474						  ISC_SOCKCANCEL_RECV);
3475		}
3476	}
3477
3478	disp->attributes &= ~mask;
3479	disp->attributes |= (attributes & mask);
3480	UNLOCK(&disp->lock);
3481}
3482
3483void
3484dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) {
3485	void *buf;
3486	isc_socketevent_t *sevent, *newsevent;
3487
3488	REQUIRE(VALID_DISPATCH(disp));
3489	REQUIRE((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0);
3490	REQUIRE(event != NULL);
3491
3492	sevent = (isc_socketevent_t *)event;
3493
3494	INSIST(sevent->n <= disp->mgr->buffersize);
3495	newsevent = (isc_socketevent_t *)
3496		    isc_event_allocate(disp->mgr->mctx, NULL,
3497				      DNS_EVENT_IMPORTRECVDONE, udp_shrecv,
3498				      disp, sizeof(isc_socketevent_t));
3499	if (newsevent == NULL)
3500		return;
3501
3502	buf = allocate_udp_buffer(disp);
3503	if (buf == NULL) {
3504		isc_event_free(ISC_EVENT_PTR(&newsevent));
3505		return;
3506	}
3507	memcpy(buf, sevent->region.base, sevent->n);
3508	newsevent->region.base = buf;
3509	newsevent->region.length = disp->mgr->buffersize;
3510	newsevent->n = sevent->n;
3511	newsevent->result = sevent->result;
3512	newsevent->address = sevent->address;
3513	newsevent->timestamp = sevent->timestamp;
3514	newsevent->pktinfo = sevent->pktinfo;
3515	newsevent->attributes = sevent->attributes;
3516
3517	isc_task_send(disp->task[0], ISC_EVENT_PTR(&newsevent));
3518}
3519
#if 0
/*
 * Debugging aid, currently compiled out: print every dispatch on the
 * manager's list together with its formatted local address.
 */
void
dns_dispatchmgr_dump(dns_dispatchmgr_t *mgr) {
	dns_dispatch_t *disp;
	char addrtext[1024];

	for (disp = ISC_LIST_HEAD(mgr->list);
	     disp != NULL;
	     disp = ISC_LIST_NEXT(disp, link)) {
		isc_sockaddr_format(&disp->local, addrtext, sizeof(addrtext));
		printf("\tdispatch %p, addr %s\n", disp, addrtext);
	}
}
#endif
3534