dispatch.c revision 204619
1/*
2 * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: dispatch.c,v 1.155.12.11 2009/12/02 23:26:28 marka Exp $ */
19
20/*! \file */
21
22#include <config.h>
23
24#include <stdlib.h>
25#include <sys/types.h>
26#include <unistd.h>
27#include <stdlib.h>
28
29#include <isc/entropy.h>
30#include <isc/mem.h>
31#include <isc/mutex.h>
32#include <isc/portset.h>
33#include <isc/print.h>
34#include <isc/random.h>
35#include <isc/stats.h>
36#include <isc/string.h>
37#include <isc/task.h>
38#include <isc/time.h>
39#include <isc/util.h>
40
41#include <dns/acl.h>
42#include <dns/dispatch.h>
43#include <dns/events.h>
44#include <dns/log.h>
45#include <dns/message.h>
46#include <dns/portlist.h>
47#include <dns/stats.h>
48#include <dns/tcpmsg.h>
49#include <dns/types.h>
50
/* List of dispatch entries; element type of the dns_qid 'qid_table'. */
typedef ISC_LIST(dns_dispentry_t)	dns_displist_t;

/* Dispatch socket and list of them; element type of 'sock_table'. */
typedef struct dispsocket		dispsocket_t;
typedef ISC_LIST(dispsocket_t)		dispsocketlist_t;

/* Port table entry and list of them; element type of 'port_table'. */
typedef struct dispportentry		dispportentry_t;
typedef ISC_LIST(dispportentry_t)	dispportlist_t;
58
/*
 * ARC4 random generator state.
 */
typedef struct arc4ctx {
	isc_uint8_t	i;		/*%< first index into s[] */
	isc_uint8_t	j;		/*%< second index into s[] */
	isc_uint8_t	s[256];		/*%< state table, starts as 0..255 */
	int		count;		/*%< budget left before re-stirring */
	isc_entropy_t	*entropy;	/*%< entropy source for ARC4 */
	isc_mutex_t	*lock;		/*%< taken by dispatch_arc4random()
					     when non-NULL */
} arc4ctx_t;
68
/*
 * Query-ID table: hash tables of outstanding dispatch entries and of
 * dispatch sockets.  Both tables are indexed by the value returned from
 * dns_hash(), which is always smaller than 'qid_nbuckets'.
 */
typedef struct dns_qid {
	unsigned int	magic;		/*%< QID_MAGIC when valid */
	unsigned int	qid_nbuckets;	/*%< hash table size */
	unsigned int	qid_increment;	/*%< id increment on collision */
	isc_mutex_t	lock;		/*%< protects the tables below */
	dns_displist_t	*qid_table;	/*%< the table itself */
	dispsocketlist_t *sock_table;	/*%< socket table */
} dns_qid_t;
77
/*
 * Dispatch manager state.  Every dns_dispatch_t points back to its manager
 * through its 'mgr' member and is linked into 'list'.
 */
struct dns_dispatchmgr {
	/* Unlocked. */
	unsigned int			magic;
	isc_mem_t		       *mctx;
	dns_acl_t		       *blackhole;
	dns_portlist_t		       *portlist;
	isc_stats_t		       *stats;	/*%< may be NULL; see inc_stats() */
	isc_entropy_t		       *entropy; /*%< entropy source */

	/* Locked by "lock". */
	isc_mutex_t			lock;
	unsigned int			state;	/*%< MGR_SHUTTINGDOWN flag bits */
	ISC_LIST(dns_dispatch_t)	list;

	/* Locked by arc4_lock. */
	isc_mutex_t			arc4_lock;
	arc4ctx_t			arc4ctx;    /*%< ARC4 context for QID */

	/* locked by buffer lock */
	dns_qid_t			*qid;
	isc_mutex_t			buffer_lock;
	unsigned int			buffers;    /*%< allocated buffers */
	unsigned int			buffersize; /*%< size of each buffer */
	unsigned int			maxbuffers; /*%< max buffers */

	/* Locked internally. */
	isc_mutex_t			pool_lock;
	isc_mempool_t		       *epool;	/*%< memory pool for events */
	isc_mempool_t		       *rpool;	/*%< memory pool for replies */
	isc_mempool_t		       *dpool;  /*%< dispatch allocations */
	isc_mempool_t		       *bpool;	/*%< memory pool for buffers */
	isc_mempool_t		       *spool;	/*%< memory pool for dispsocs */

	/*%
	 * Locked by qid->lock if qid exists; otherwise, can be used without
	 * being locked.
	 * Memory footprint considerations: this is a simple implementation of
	 * available ports, i.e., an ordered array of the actual port numbers.
	 * This will require about 256KB of memory in the worst case (128KB for
	 * each of IPv4 and IPv6).  We could reduce it by representing it in a
	 * more sophisticated way such as a list (or array) of ranges that are
	 * searched to identify a specific port.  Our decision here is that the
	 * saved memory isn't worth the implementation complexity, considering
	 * the fact that the whole BIND9 process (which is mainly named)
	 * already requires a pretty large memory footprint.  We may, however,
	 * have to revisit the decision when we want to use it as a separate
	 * module for an environment where memory requirements are more severe.
	 */
	in_port_t	*v4ports;	/*%< available ports for IPv4 */
	unsigned int	nv4ports;	/*%< # of available ports for IPv4 */
	in_port_t	*v6ports;	/*%< available ports for IPv6 */
	unsigned int	nv6ports;	/*%< # of available ports for IPv6 */
};
131
/* Bit in dns_dispatchmgr 'state' and the predicate that tests it. */
#define MGR_SHUTTINGDOWN		0x00000001U
#define MGR_IS_SHUTTINGDOWN(l)	(((l)->state & MGR_SHUTTINGDOWN) != 0)

/* True when DNS_DISPATCHATTR_PRIVATE is set in the dispatch attributes. */
#define IS_PRIVATE(d)	(((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
136
/*
 * One outstanding request registered with a dispatch.  entry_search()
 * matches entries on the (id, host, port) triple.
 */
struct dns_dispentry {
	unsigned int			magic;	/*%< RESPONSE_MAGIC when valid */
	dns_dispatch_t		       *disp;
	dns_messageid_t			id;	/*%< query ID */
	in_port_t			port;
	unsigned int			bucket;	/*%< index into qid_table */
	isc_sockaddr_t			host;
	isc_task_t		       *task;
	isc_taskaction_t		action;
	void			       *arg;
	isc_boolean_t			item_out;
	dispsocket_t			*dispsocket; /*%< its 'resp' points back
						      here; cleared in
						      deactivate_dispsocket() */
	ISC_LIST(dns_dispatchevent_t)	items;
	ISC_LINK(dns_dispentry_t)	link;	/*%< qid_table bucket linkage */
};
152
/*%
 * Maximum number of dispatch sockets that can be pooled for reuse.  The
 * appropriate value may vary, but experiments have shown a busy caching server
 * may need more than 1000 sockets concurrently opened.  The maximum allowable
 * number of dispatch sockets (per manager) will be set to twice this
 * value.
 */
#ifndef DNS_DISPATCH_POOLSOCKS
#define DNS_DISPATCH_POOLSOCKS			2048
#endif

/*%
 * Quota to control the number of dispatch sockets.  If a dispatch has more
 * sockets than the quota, new queries will purge the oldest ones, so that
 * a massive number of outstanding queries won't prevent subsequent queries
 * (especially if the older ones take a longer time and result in a timeout).
 */
#ifndef DNS_DISPATCH_SOCKSQUOTA
#define DNS_DISPATCH_SOCKSQUOTA			3072
#endif
173
/*
 * A dispatch socket dedicated to one destination/local-port pair.
 * socket_search() matches entries on 'host' and on 'portentry->port'.
 */
struct dispsocket {
	unsigned int			magic;	/*%< DISPSOCK_MAGIC when valid */
	isc_socket_t			*socket;
	dns_dispatch_t			*disp;	/*%< owning dispatch */
	isc_sockaddr_t			host;	/*%< destination address */
	in_port_t			localport; /* XXX: should be removed later */
	dispportentry_t			*portentry; /*%< counted reference, dropped
						     with deref_portentry() */
	dns_dispentry_t			*resp;
	isc_task_t			*task;
	ISC_LINK(dispsocket_t)		link;	/*%< active/inactive list linkage */
	unsigned int			bucket;	/*%< index into sock_table */
	ISC_LINK(dispsocket_t)		blink;	/*%< sock_table bucket linkage */
};
187
/*%
 * A port table entry.  We remember every port we first open in a table with a
 * reference counter so that we can 'reuse' the same port (with different
 * destination addresses) using the SO_REUSEADDR socket option.
 */
struct dispportentry {
	in_port_t			port;	/*%< local port number */
	unsigned int			refs;	/*%< entry is freed by
						     deref_portentry() when this
						     drops to 0 */
	ISC_LINK(struct dispportentry)	link;	/*%< port_table bucket linkage */
};
198
/* Number of buckets in a dispatch's port table (indexed by port % size). */
#ifndef DNS_DISPATCH_PORTTABLESIZE
#define DNS_DISPATCH_PORTTABLESIZE	1024
#endif

#define INVALID_BUCKET		(0xffffdead)

/*%
 * Number of tasks for each dispatch that use separate sockets for different
 * transactions.  This must be a power of 2 as it will divide 32-bit numbers
 * to get a uniformly random task selection.  See get_dispsocket().
 */
#define MAX_INTERNAL_TASKS	64
211
/*
 * A single dispatch.
 */
struct dns_dispatch {
	/* Unlocked. */
	unsigned int		magic;		/*%< magic */
	dns_dispatchmgr_t      *mgr;		/*%< dispatch manager */
	int			ntasks;		/*%< # of entries used in task[] */
	/*%
	 * internal task buckets.  We use multiple tasks to distribute various
	 * socket events well when using separate dispatch sockets.  We use the
	 * 1st task (task[0]) for internal control events.
	 */
	isc_task_t	       *task[MAX_INTERNAL_TASKS];
	isc_socket_t	       *socket;		/*%< isc socket attached to */
	isc_sockaddr_t		local;		/*%< local address */
	in_port_t		localport;	/*%< local UDP port */
	unsigned int		maxrequests;	/*%< max requests */
	isc_event_t	       *ctlevent;

	/*% Locked by mgr->lock. */
	ISC_LINK(dns_dispatch_t) link;

	/* Locked by "lock". */
	isc_mutex_t		lock;		/*%< locks all below */
	isc_sockettype_t	socktype;
	unsigned int		attributes;
	unsigned int		refcount;	/*%< number of users */
	dns_dispatchevent_t    *failsafe_ev;	/*%< failsafe cancel event */
	unsigned int		shutting_down : 1,
				shutdown_out : 1,
				connected : 1,
				tcpmsg_valid : 1,
				recv_pending : 1; /*%< is a recv() pending? */
	isc_result_t		shutdown_why;
	ISC_LIST(dispsocket_t)	activesockets;
	ISC_LIST(dispsocket_t)	inactivesockets; /*%< reused by get_dispsocket() */
	unsigned int		nsockets;	/*%< # of dispsocket_t owned */
	unsigned int		requests;	/*%< how many requests we have */
	unsigned int		tcpbuffers;	/*%< allocated buffers */
	dns_tcpmsg_t		tcpmsg;		/*%< for tcp streams */
	dns_qid_t		*qid;		/*%< selected by DNS_QID() for TCP */
	arc4ctx_t		arc4ctx;	/*%< for QID/UDP port num */
	dispportlist_t		*port_table;	/*%< hold ports 'owned' by us */
	isc_mempool_t		*portpool;	/*%< port table entries  */
};
255
/*
 * Magic numbers stored in the 'magic' member of each structure, and the
 * matching validity checks.
 */
#define QID_MAGIC		ISC_MAGIC('Q', 'i', 'd', ' ')
#define VALID_QID(e)		ISC_MAGIC_VALID((e), QID_MAGIC)

#define RESPONSE_MAGIC		ISC_MAGIC('D', 'r', 's', 'p')
#define VALID_RESPONSE(e)	ISC_MAGIC_VALID((e), RESPONSE_MAGIC)

#define DISPSOCK_MAGIC		ISC_MAGIC('D', 's', 'o', 'c')
#define VALID_DISPSOCK(e)	ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)

#define DISPATCH_MAGIC		ISC_MAGIC('D', 'i', 's', 'p')
#define VALID_DISPATCH(e)	ISC_MAGIC_VALID((e), DISPATCH_MAGIC)

#define DNS_DISPATCHMGR_MAGIC	ISC_MAGIC('D', 'M', 'g', 'r')
#define VALID_DISPATCHMGR(e)	ISC_MAGIC_VALID((e), DNS_DISPATCHMGR_MAGIC)
270
/*
 * Select the QID table of a dispatch: a TCP dispatch uses its own table,
 * any other dispatch uses the table owned by its manager.
 *
 * The whole expansion is parenthesized so that the macro keeps its meaning
 * when it is used as an operand of a larger expression.
 */
#define DNS_QID(disp) (((disp)->socktype == isc_sockettype_tcp) ? \
		       (disp)->qid : (disp)->mgr->qid)

/*
 * Select the ARC4 context of a dispatch: a UDP dispatch uses its own
 * context, any other dispatch uses the one owned by its manager.
 */
#define DISP_ARC4CTX(disp) (((disp)->socktype == isc_sockettype_udp) ? \
			    (&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx))

/*%
 * Locking a query port buffer is a bit tricky.  We access the buffer without
 * locking until qid is created.  Technically, there is a possibility of race
 * between the creation of qid and access to the port buffer; in practice,
 * however, this should be safe because qid isn't created until the first
 * dispatch is created and there should be no contending situation until then.
 *
 * Both macros are wrapped in do { } while (0) so that each one behaves as a
 * single statement and cannot capture an 'else' that follows it.  Use them
 * as statements terminated by a semicolon.
 */
#define PORTBUFLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			LOCK(&((mgr)->qid->lock)); \
	} while (0)
#define PORTBUFUNLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			UNLOCK((&(mgr)->qid->lock)); \
	} while (0)
285
/*
 * Statics.
 *
 * Forward declarations of the file-local helpers.
 */
static dns_dispentry_t *entry_search(dns_qid_t *, isc_sockaddr_t *,
				     dns_messageid_t, in_port_t, unsigned int);
static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
static void destroy_disp(isc_task_t *task, isc_event_t *event);
static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
static void udp_exrecv(isc_task_t *, isc_event_t *);
static void udp_shrecv(isc_task_t *, isc_event_t *);
static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
static void tcp_recv(isc_task_t *, isc_event_t *);
static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
			     in_port_t);
static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
static void *allocate_udp_buffer(dns_dispatch_t *disp);
static inline void free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev);
static inline dns_dispatchevent_t *allocate_event(dns_dispatch_t *disp);
static void do_cancel(dns_dispatch_t *disp);
/* NOTE: the first parameter of linear_first()/linear_next() is a QID table. */
static dns_dispentry_t *linear_first(dns_qid_t *disp);
static dns_dispentry_t *linear_next(dns_qid_t *disp,
				    dns_dispentry_t *resp);
static void dispatch_free(dns_dispatch_t **dispp);
static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
				  dns_dispatch_t *disp,
				  isc_socketmgr_t *sockmgr,
				  isc_sockaddr_t *localaddr,
				  isc_socket_t **sockp);
static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
				       isc_socketmgr_t *sockmgr,
				       isc_taskmgr_t *taskmgr,
				       isc_sockaddr_t *localaddr,
				       unsigned int maxrequests,
				       unsigned int attributes,
				       dns_dispatch_t **dispp);
static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
static void destroy_mgr(dns_dispatchmgr_t **mgrp);
static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
				 unsigned int increment, dns_qid_t **qidp,
				 isc_boolean_t needaddrtable);
static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
				unsigned int options, isc_socket_t **sockp);
static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
				   isc_sockaddr_t *sockaddrp);

/* Shorthand for a debug log level, e.g. LVL(90). */
#define LVL(x) ISC_LOG_DEBUG(x)
335
336static void
337mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...)
338     ISC_FORMAT_PRINTF(3, 4);
339
340static void
341mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...) {
342	char msgbuf[2048];
343	va_list ap;
344
345	if (! isc_log_wouldlog(dns_lctx, level))
346		return;
347
348	va_start(ap, fmt);
349	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
350	va_end(ap);
351
352	isc_log_write(dns_lctx,
353		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
354		      level, "dispatchmgr %p: %s", mgr, msgbuf);
355}
356
357static inline void
358inc_stats(dns_dispatchmgr_t *mgr, isc_statscounter_t counter) {
359	if (mgr->stats != NULL)
360		isc_stats_increment(mgr->stats, counter);
361}
362
363static void
364dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...)
365     ISC_FORMAT_PRINTF(3, 4);
366
367static void
368dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
369	char msgbuf[2048];
370	va_list ap;
371
372	if (! isc_log_wouldlog(dns_lctx, level))
373		return;
374
375	va_start(ap, fmt);
376	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
377	va_end(ap);
378
379	isc_log_write(dns_lctx,
380		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
381		      level, "dispatch %p: %s", disp, msgbuf);
382}
383
384static void
385request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
386	    int level, const char *fmt, ...)
387     ISC_FORMAT_PRINTF(4, 5);
388
389static void
390request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
391	    int level, const char *fmt, ...)
392{
393	char msgbuf[2048];
394	char peerbuf[256];
395	va_list ap;
396
397	if (! isc_log_wouldlog(dns_lctx, level))
398		return;
399
400	va_start(ap, fmt);
401	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
402	va_end(ap);
403
404	if (VALID_RESPONSE(resp)) {
405		isc_sockaddr_format(&resp->host, peerbuf, sizeof(peerbuf));
406		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
407			      DNS_LOGMODULE_DISPATCH, level,
408			      "dispatch %p response %p %s: %s", disp, resp,
409			      peerbuf, msgbuf);
410	} else {
411		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
412			      DNS_LOGMODULE_DISPATCH, level,
413			      "dispatch %p req/resp %p: %s", disp, resp,
414			      msgbuf);
415	}
416}
417
418/*%
419 * ARC4 random number generator derived from OpenBSD.
420 * Only dispatch_arc4random() and dispatch_arc4uniformrandom() are expected
421 * to be called from general dispatch routines; the rest of them are subroutines
422 * for these two.
423 *
424 * The original copyright follows:
425 * Copyright (c) 1996, David Mazieres <dm@uun.org>
426 * Copyright (c) 2008, Damien Miller <djm@openbsd.org>
427 *
428 * Permission to use, copy, modify, and distribute this software for any
429 * purpose with or without fee is hereby granted, provided that the above
430 * copyright notice and this permission notice appear in all copies.
431 *
432 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
433 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
434 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
435 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
436 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
437 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
438 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
439 */
440static void
441dispatch_arc4init(arc4ctx_t *actx, isc_entropy_t *entropy, isc_mutex_t *lock) {
442	int n;
443	for (n = 0; n < 256; n++)
444		actx->s[n] = n;
445	actx->i = 0;
446	actx->j = 0;
447	actx->count = 0;
448	actx->entropy = entropy; /* don't have to attach */
449	actx->lock = lock;
450}
451
452static void
453dispatch_arc4addrandom(arc4ctx_t *actx, unsigned char *dat, int datlen) {
454	int n;
455	isc_uint8_t si;
456
457	actx->i--;
458	for (n = 0; n < 256; n++) {
459		actx->i = (actx->i + 1);
460		si = actx->s[actx->i];
461		actx->j = (actx->j + si + dat[n % datlen]);
462		actx->s[actx->i] = actx->s[actx->j];
463		actx->s[actx->j] = si;
464	}
465	actx->j = actx->i;
466}
467
468static inline isc_uint8_t
469dispatch_arc4get8(arc4ctx_t *actx) {
470	isc_uint8_t si, sj;
471
472	actx->i = (actx->i + 1);
473	si = actx->s[actx->i];
474	actx->j = (actx->j + si);
475	sj = actx->s[actx->j];
476	actx->s[actx->i] = sj;
477	actx->s[actx->j] = si;
478
479	return (actx->s[(si + sj) & 0xff]);
480}
481
482static inline isc_uint16_t
483dispatch_arc4get16(arc4ctx_t *actx) {
484	isc_uint16_t val;
485
486	val = dispatch_arc4get8(actx) << 8;
487	val |= dispatch_arc4get8(actx);
488
489	return (val);
490}
491
492static void
493dispatch_arc4stir(arc4ctx_t *actx) {
494	int i;
495	union {
496		unsigned char rnd[128];
497		isc_uint32_t rnd32[32];
498	} rnd;
499	isc_result_t result;
500
501	if (actx->entropy != NULL) {
502		/*
503		 * We accept any quality of random data to avoid blocking.
504		 */
505		result = isc_entropy_getdata(actx->entropy, rnd.rnd,
506					     sizeof(rnd), NULL, 0);
507		RUNTIME_CHECK(result == ISC_R_SUCCESS);
508	} else {
509		for (i = 0; i < 32; i++)
510			isc_random_get(&rnd.rnd32[i]);
511	}
512	dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd));
513
514	/*
515	 * Discard early keystream, as per recommendations in:
516	 * http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps
517	 */
518	for (i = 0; i < 256; i++)
519		(void)dispatch_arc4get8(actx);
520
521	/*
522	 * Derived from OpenBSD's implementation.  The rationale is not clear,
523	 * but should be conservative enough in safety, and reasonably large
524	 * for efficiency.
525	 */
526	actx->count = 1600000;
527}
528
529static isc_uint16_t
530dispatch_arc4random(arc4ctx_t *actx) {
531	isc_uint16_t result;
532
533	if (actx->lock != NULL)
534		LOCK(actx->lock);
535
536	actx->count -= sizeof(isc_uint16_t);
537	if (actx->count <= 0)
538		dispatch_arc4stir(actx);
539	result = dispatch_arc4get16(actx);
540
541	if (actx->lock != NULL)
542		UNLOCK(actx->lock);
543
544	return (result);
545}
546
547static isc_uint16_t
548dispatch_arc4uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) {
549	isc_uint16_t min, r;
550
551	if (upper_bound < 2)
552		return (0);
553
554	/*
555	 * Ensure the range of random numbers [min, 0xffff] be a multiple of
556	 * upper_bound and contain at least a half of the 16 bit range.
557	 */
558
559	if (upper_bound > 0x8000)
560		min = 1 + ~upper_bound; /* 0x8000 - upper_bound */
561	else
562		min = (isc_uint16_t)(0x10000 % (isc_uint32_t)upper_bound);
563
564	/*
565	 * This could theoretically loop forever but each retry has
566	 * p > 0.5 (worst case, usually far better) of selecting a
567	 * number inside the range we need, so it should rarely need
568	 * to re-roll.
569	 */
570	for (;;) {
571		r = dispatch_arc4random(actx);
572		if (r >= min)
573			break;
574	}
575
576	return (r % upper_bound);
577}
578
579/*
580 * Return a hash of the destination and message id.
581 */
582static isc_uint32_t
583dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
584	 in_port_t port)
585{
586	unsigned int ret;
587
588	ret = isc_sockaddr_hash(dest, ISC_TRUE);
589	ret ^= (id << 16) | port;
590	ret %= qid->qid_nbuckets;
591
592	INSIST(ret < qid->qid_nbuckets);
593
594	return (ret);
595}
596
597/*
598 * Find the first entry in 'qid'.  Returns NULL if there are no entries.
599 */
600static dns_dispentry_t *
601linear_first(dns_qid_t *qid) {
602	dns_dispentry_t *ret;
603	unsigned int bucket;
604
605	bucket = 0;
606
607	while (bucket < qid->qid_nbuckets) {
608		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
609		if (ret != NULL)
610			return (ret);
611		bucket++;
612	}
613
614	return (NULL);
615}
616
617/*
618 * Find the next entry after 'resp' in 'qid'.  Return NULL if there are
619 * no more entries.
620 */
621static dns_dispentry_t *
622linear_next(dns_qid_t *qid, dns_dispentry_t *resp) {
623	dns_dispentry_t *ret;
624	unsigned int bucket;
625
626	ret = ISC_LIST_NEXT(resp, link);
627	if (ret != NULL)
628		return (ret);
629
630	bucket = resp->bucket;
631	bucket++;
632	while (bucket < qid->qid_nbuckets) {
633		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
634		if (ret != NULL)
635			return (ret);
636		bucket++;
637	}
638
639	return (NULL);
640}
641
642/*
643 * The dispatch must be locked.
644 */
645static isc_boolean_t
646destroy_disp_ok(dns_dispatch_t *disp)
647{
648	if (disp->refcount != 0)
649		return (ISC_FALSE);
650
651	if (disp->recv_pending != 0)
652		return (ISC_FALSE);
653
654	if (!ISC_LIST_EMPTY(disp->activesockets))
655		return (ISC_FALSE);
656
657	if (disp->shutting_down == 0)
658		return (ISC_FALSE);
659
660	return (ISC_TRUE);
661}
662
/*
 * Called when refcount reaches 0 (and safe to destroy).
 *
 * Task action for a DNS_EVENT_DISPATCHCONTROL event whose ev_arg is the
 * dispatch to destroy.  The event is freed here.
 *
 * The dispatcher must not be locked.
 * This function acquires mgr->lock itself.
 * NOTE(review): this comment used to say "The manager must be locked",
 * which does not match the LOCK() call below; presumably the caller must
 * not hold mgr->lock -- confirm against the callers.
 */
static void
destroy_disp(isc_task_t *task, isc_event_t *event) {
	dns_dispatch_t *disp;
	dns_dispatchmgr_t *mgr;
	isc_boolean_t killmgr;
	dispsocket_t *dispsocket;
	int i;

	INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);

	UNUSED(task);

	disp = event->ev_arg;
	mgr = disp->mgr;

	LOCK(&mgr->lock);
	ISC_LIST_UNLINK(mgr->list, disp, link);

	dispatch_log(disp, LVL(90),
		     "shutting down; detaching from sock %p, task %p",
		     disp->socket, disp->task[0]); /* XXXX */

	if (disp->socket != NULL)
		isc_socket_detach(&disp->socket);
	/* Destroy every pooled (inactive) dispatch socket. */
	while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
		ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
		destroy_dispsocket(disp, &dispsocket);
	}
	for (i = 0; i < disp->ntasks; i++)
		isc_task_detach(&disp->task[i]);
	isc_event_free(&event);

	dispatch_free(&disp);

	/*
	 * Decide under the lock whether the manager can go away as well,
	 * but destroy it only after the lock has been released.
	 */
	killmgr = destroy_mgr_ok(mgr);
	UNLOCK(&mgr->lock);
	if (killmgr)
		destroy_mgr(&mgr);
}
708
709/*%
710 * Manipulate port table per dispatch: find an entry for a given port number,
711 * create a new entry, and decrement a given entry with possible clean-up.
712 */
713static dispportentry_t *
714port_search(dns_dispatch_t *disp, in_port_t port) {
715	dispportentry_t *portentry;
716
717	REQUIRE(disp->port_table != NULL);
718
719	portentry = ISC_LIST_HEAD(disp->port_table[port %
720						   DNS_DISPATCH_PORTTABLESIZE]);
721	while (portentry != NULL) {
722		if (portentry->port == port)
723			return (portentry);
724		portentry = ISC_LIST_NEXT(portentry, link);
725	}
726
727	return (NULL);
728}
729
730static dispportentry_t *
731new_portentry(dns_dispatch_t *disp, in_port_t port) {
732	dispportentry_t *portentry;
733
734	REQUIRE(disp->port_table != NULL);
735
736	portentry = isc_mempool_get(disp->portpool);
737	if (portentry == NULL)
738		return (portentry);
739
740	portentry->port = port;
741	portentry->refs = 0;
742	ISC_LINK_INIT(portentry, link);
743	ISC_LIST_APPEND(disp->port_table[port % DNS_DISPATCH_PORTTABLESIZE],
744			portentry, link);
745
746	return (portentry);
747}
748
749/*%
750 * The caller must not hold the qid->lock.
751 */
752static void
753deref_portentry(dns_dispatch_t *disp, dispportentry_t **portentryp) {
754	dispportentry_t *portentry = *portentryp;
755	dns_qid_t *qid;
756
757	REQUIRE(disp->port_table != NULL);
758	REQUIRE(portentry != NULL && portentry->refs > 0);
759
760	qid = DNS_QID(disp);
761	LOCK(&qid->lock);
762	portentry->refs--;
763	if (portentry->refs == 0) {
764		ISC_LIST_UNLINK(disp->port_table[portentry->port %
765						 DNS_DISPATCH_PORTTABLESIZE],
766				portentry, link);
767		isc_mempool_put(disp->portpool, portentry);
768	}
769
770	*portentryp = NULL;
771	UNLOCK(&qid->lock);
772}
773
774/*%
775 * Find a dispsocket for socket address 'dest', and port number 'port'.
776 * Return NULL if no such entry exists.
777 */
778static dispsocket_t *
779socket_search(dns_qid_t *qid, isc_sockaddr_t *dest, in_port_t port,
780	      unsigned int bucket)
781{
782	dispsocket_t *dispsock;
783
784	REQUIRE(bucket < qid->qid_nbuckets);
785
786	dispsock = ISC_LIST_HEAD(qid->sock_table[bucket]);
787
788	while (dispsock != NULL) {
789		if (dispsock->portentry != NULL &&
790		    dispsock->portentry->port == port &&
791		    isc_sockaddr_equal(dest, &dispsock->host))
792			return (dispsock);
793		dispsock = ISC_LIST_NEXT(dispsock, blink);
794	}
795
796	return (NULL);
797}
798
/*%
 * Make a new socket for a single dispatch with a random port number.
 * The caller must hold the disp->lock and qid->lock.
 *
 * A dispsocket_t is taken from disp->inactivesockets when one is available
 * and is otherwise allocated from the manager's 'spool'.  Up to 64 randomly
 * chosen local ports are then tried.
 *
 * On success the entry is linked into qid->sock_table and returned through
 * 'dispsockp', and the chosen port is stored in '*portp'.  On failure the
 * dispsocket_t is destroyed and neither output is written.
 *
 * Returns:
 *	ISC_R_SUCCESS
 *	ISC_R_ADDRNOTAVAIL	no port is available for the address family
 *	ISC_R_NOMEMORY		a dispsocket_t or port entry could not be
 *				allocated
 *	otherwise the result of the last open_socket() call, or
 *	ISC_R_FAILURE when open_socket() was never reached.
 */
static isc_result_t
get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
	       isc_socketmgr_t *sockmgr, dns_qid_t *qid,
	       dispsocket_t **dispsockp, in_port_t *portp)
{
	int i;
	isc_uint32_t r;
	dns_dispatchmgr_t *mgr = disp->mgr;
	isc_socket_t *sock = NULL;
	isc_result_t result = ISC_R_FAILURE;
	in_port_t port;
	isc_sockaddr_t localaddr;
	unsigned int bucket = 0;
	dispsocket_t *dispsock;
	unsigned int nports;
	in_port_t *ports;
	unsigned int bindoptions;
	dispportentry_t *portentry = NULL;

	/* Pick the port array that matches the local address family. */
	if (isc_sockaddr_pf(&disp->local) == AF_INET) {
		nports = disp->mgr->nv4ports;
		ports = disp->mgr->v4ports;
	} else {
		nports = disp->mgr->nv6ports;
		ports = disp->mgr->v6ports;
	}
	if (nports == 0)
		return (ISC_R_ADDRNOTAVAIL);

	dispsock = ISC_LIST_HEAD(disp->inactivesockets);
	if (dispsock != NULL) {
		/*
		 * Reuse a pooled entry; its socket object is handed to
		 * open_socket() below through 'sock'.
		 */
		ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link);
		sock = dispsock->socket;
		dispsock->socket = NULL;
	} else {
		dispsock = isc_mempool_get(mgr->spool);
		if (dispsock == NULL)
			return (ISC_R_NOMEMORY);

		disp->nsockets++;
		dispsock->socket = NULL;
		dispsock->disp = disp;
		dispsock->resp = NULL;
		dispsock->portentry = NULL;
		/* Attach to a randomly chosen internal task. */
		isc_random_get(&r);
		dispsock->task = NULL;
		isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task);
		ISC_LINK_INIT(dispsock, link);
		ISC_LINK_INIT(dispsock, blink);
		dispsock->magic = DISPSOCK_MAGIC;
	}

	/*
	 * Pick up a random UDP port and open a new socket with it.  Avoid
	 * choosing ports that share the same destination because it will be
	 * very likely to fail in bind(2) or connect(2).
	 */
	localaddr = disp->local;
	for (i = 0; i < 64; i++) {
		/*
		 * NOTE(review): 'nports' is an unsigned int but the
		 * upper_bound parameter is 16 bits wide; confirm that the
		 * port arrays can never hold more than 65535 entries.
		 */
		port = ports[dispatch_arc4uniformrandom(DISP_ARC4CTX(disp),
							nports)];
		isc_sockaddr_setport(&localaddr, port);

		bucket = dns_hash(qid, dest, 0, port);
		if (socket_search(qid, dest, port, bucket) != NULL)
			continue;
		/*
		 * A port this dispatch already has an entry for is bound
		 * with ISC_SOCKET_REUSEADDRESS.
		 */
		bindoptions = 0;
		portentry = port_search(disp, port);
		if (portentry != NULL)
			bindoptions |= ISC_SOCKET_REUSEADDRESS;
		result = open_socket(sockmgr, &localaddr, bindoptions, &sock);
		if (result == ISC_R_SUCCESS) {
			if (portentry == NULL) {
				portentry = new_portentry(disp, port);
				if (portentry == NULL) {
					result = ISC_R_NOMEMORY;
					break;
				}
			}
			portentry->refs++;
			break;
		} else if (result != ISC_R_ADDRINUSE)
			break;	/* only "address in use" is retried */
	}

	if (result == ISC_R_SUCCESS) {
		dispsock->socket = sock;
		dispsock->host = *dest;
		dispsock->portentry = portentry;
		dispsock->bucket = bucket;
		ISC_LIST_APPEND(qid->sock_table[bucket], dispsock, blink);
		*dispsockp = dispsock;
		*portp = port;
	} else {
		/*
		 * We could keep it in the inactive list, but since this should
		 * be an exceptional case and might be resource shortage, we'd
		 * rather destroy it.
		 */
		if (sock != NULL)
			isc_socket_detach(&sock);
		destroy_dispsocket(disp, &dispsock);
	}

	return (result);
}
909
910/*%
911 * Destroy a dedicated dispatch socket.
912 */
913static void
914destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
915	dispsocket_t *dispsock;
916	dns_qid_t *qid;
917
918	/*
919	 * The dispatch must be locked.
920	 */
921
922	REQUIRE(dispsockp != NULL && *dispsockp != NULL);
923	dispsock = *dispsockp;
924	REQUIRE(!ISC_LINK_LINKED(dispsock, link));
925
926	disp->nsockets--;
927	dispsock->magic = 0;
928	if (dispsock->portentry != NULL)
929		deref_portentry(disp, &dispsock->portentry);
930	if (dispsock->socket != NULL)
931		isc_socket_detach(&dispsock->socket);
932	if (ISC_LINK_LINKED(dispsock, blink)) {
933		qid = DNS_QID(disp);
934		LOCK(&qid->lock);
935		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
936				blink);
937		UNLOCK(&qid->lock);
938	}
939	if (dispsock->task != NULL)
940		isc_task_detach(&dispsock->task);
941	isc_mempool_put(disp->mgr->spool, dispsock);
942
943	*dispsockp = NULL;
944}
945
946/*%
947 * Deactivate a dedicated dispatch socket.  Move it to the inactive list for
948 * future reuse unless the total number of sockets are exceeding the maximum.
949 */
950static void
951deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
952	isc_result_t result;
953	dns_qid_t *qid;
954
955	/*
956	 * The dispatch must be locked.
957	 */
958	ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
959	if (dispsock->resp != NULL) {
960		INSIST(dispsock->resp->dispsocket == dispsock);
961		dispsock->resp->dispsocket = NULL;
962	}
963
964	INSIST(dispsock->portentry != NULL);
965	deref_portentry(disp, &dispsock->portentry);
966
967	if (disp->nsockets > DNS_DISPATCH_POOLSOCKS)
968		destroy_dispsocket(disp, &dispsock);
969	else {
970		result = isc_socket_close(dispsock->socket);
971
972		qid = DNS_QID(disp);
973		LOCK(&qid->lock);
974		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
975				blink);
976		UNLOCK(&qid->lock);
977
978		if (result == ISC_R_SUCCESS)
979			ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
980		else {
981			/*
982			 * If the underlying system does not allow this
983			 * optimization, destroy this temporary structure (and
984			 * create a new one for a new transaction).
985			 */
986			INSIST(result == ISC_R_NOTIMPLEMENTED);
987			destroy_dispsocket(disp, &dispsock);
988		}
989	}
990}
991
992/*
993 * Find an entry for query ID 'id', socket address 'dest', and port number
994 * 'port'.
995 * Return NULL if no such entry exists.
996 */
997static dns_dispentry_t *
998entry_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
999	     in_port_t port, unsigned int bucket)
1000{
1001	dns_dispentry_t *res;
1002
1003	REQUIRE(bucket < qid->qid_nbuckets);
1004
1005	res = ISC_LIST_HEAD(qid->qid_table[bucket]);
1006
1007	while (res != NULL) {
1008		if (res->id == id && isc_sockaddr_equal(dest, &res->host) &&
1009		    res->port == port) {
1010			return (res);
1011		}
1012		res = ISC_LIST_NEXT(res, link);
1013	}
1014
1015	return (NULL);
1016}
1017
1018static void
1019free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len) {
1020	INSIST(buf != NULL && len != 0);
1021
1022
1023	switch (disp->socktype) {
1024	case isc_sockettype_tcp:
1025		INSIST(disp->tcpbuffers > 0);
1026		disp->tcpbuffers--;
1027		isc_mem_put(disp->mgr->mctx, buf, len);
1028		break;
1029	case isc_sockettype_udp:
1030		LOCK(&disp->mgr->buffer_lock);
1031		INSIST(disp->mgr->buffers > 0);
1032		INSIST(len == disp->mgr->buffersize);
1033		disp->mgr->buffers--;
1034		isc_mempool_put(disp->mgr->bpool, buf);
1035		UNLOCK(&disp->mgr->buffer_lock);
1036		break;
1037	default:
1038		INSIST(0);
1039		break;
1040	}
1041}
1042
1043static void *
1044allocate_udp_buffer(dns_dispatch_t *disp) {
1045	void *temp;
1046
1047	LOCK(&disp->mgr->buffer_lock);
1048	temp = isc_mempool_get(disp->mgr->bpool);
1049
1050	if (temp != NULL)
1051		disp->mgr->buffers++;
1052	UNLOCK(&disp->mgr->buffer_lock);
1053
1054	return (temp);
1055}
1056
1057static inline void
1058free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev) {
1059	if (disp->failsafe_ev == ev) {
1060		INSIST(disp->shutdown_out == 1);
1061		disp->shutdown_out = 0;
1062
1063		return;
1064	}
1065
1066	isc_mempool_put(disp->mgr->epool, ev);
1067}
1068
1069static inline dns_dispatchevent_t *
1070allocate_event(dns_dispatch_t *disp) {
1071	dns_dispatchevent_t *ev;
1072
1073	ev = isc_mempool_get(disp->mgr->epool);
1074	if (ev == NULL)
1075		return (NULL);
1076	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, 0,
1077		       NULL, NULL, NULL, NULL, NULL);
1078
1079	return (ev);
1080}
1081
1082static void
1083udp_exrecv(isc_task_t *task, isc_event_t *ev) {
1084	dispsocket_t *dispsock = ev->ev_arg;
1085
1086	UNUSED(task);
1087
1088	REQUIRE(VALID_DISPSOCK(dispsock));
1089	udp_recv(ev, dispsock->disp, dispsock);
1090}
1091
1092static void
1093udp_shrecv(isc_task_t *task, isc_event_t *ev) {
1094	dns_dispatch_t *disp = ev->ev_arg;
1095
1096	UNUSED(task);
1097
1098	REQUIRE(VALID_DISPATCH(disp));
1099	udp_recv(ev, disp, NULL);
1100}
1101
/*
 * Process a completed UDP receive event for dispatch 'disp'.
 *
 * 'ev_in' is the socket event handed over by udp_exrecv() or udp_shrecv().
 * 'dispsock' is the dispsocket_t the event arrived on, or NULL when it
 * arrived on the dispatch's own socket.  The dispatch lock is taken here and
 * released before returning, and 'ev_in' is freed on every path.
 *
 * General flow:
 *
 * If I/O result == CANCELED or error, free the buffer.
 *
 * If query, free the buffer, restart.
 *
 * If response:
 *	Allocate event, fill in details.
 *		If cannot allocate, free buffer, restart.
 *	find target.  If not found, free buffer, restart.
 *	if event queue is not empty, queue.  else, send.
 *	restart.
 */
static void
udp_recv(isc_event_t *ev_in, dns_dispatch_t *disp, dispsocket_t *dispsock) {
	isc_socketevent_t *ev = (isc_socketevent_t *)ev_in;
	dns_messageid_t id;
	isc_result_t dres;
	isc_buffer_t source;
	unsigned int flags;
	dns_dispentry_t *resp = NULL;
	dns_dispatchevent_t *rev;
	unsigned int bucket;
	isc_boolean_t killit;
	isc_boolean_t queue_response;
	dns_dispatchmgr_t *mgr;
	dns_qid_t *qid;
	isc_netaddr_t netaddr;
	int match;
	int result;
	/* Set once qid->lock is taken so the 'unlock' label knows to drop it. */
	isc_boolean_t qidlocked = ISC_FALSE;

	LOCK(&disp->lock);

	mgr = disp->mgr;
	qid = mgr->qid;

	dispatch_log(disp, LVL(90),
		     "got packet: requests %d, buffers %d, recvs %d",
		     disp->requests, disp->mgr->buffers, disp->recv_pending);

	if (dispsock == NULL && ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
		/*
		 * Unless the receive event was imported from a listening
		 * interface, in which case the event type is
		 * DNS_EVENT_IMPORTRECVDONE, receive operation must be pending.
		 */
		INSIST(disp->recv_pending != 0);
		disp->recv_pending = 0;
	}

	if (dispsock != NULL &&
	    (ev->result == ISC_R_CANCELED || dispsock->resp == NULL)) {
		/*
		 * dispsock->resp can be NULL if this transaction was canceled
		 * just after receiving a response.  Since this socket is
		 * exclusively used and there should be at most one receive
		 * event the canceled event should have been no effect.  So
		 * we can (and should) deactivate the socket right now.
		 */
		deactivate_dispsocket(disp, dispsock);
		dispsock = NULL;
	}

	if (disp->shutting_down) {
		/*
		 * This dispatcher is shutting down.
		 */
		free_buffer(disp, ev->region.base, ev->region.length);

		isc_event_free(&ev_in);
		ev = NULL;

		killit = destroy_disp_ok(disp);
		UNLOCK(&disp->lock);
		if (killit)
			isc_task_send(disp->task[0], &disp->ctlevent);

		return;
	}

	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
		if (dispsock != NULL) {
			resp = dispsock->resp;
			/*
			 * Start from the expected ID so that the error path
			 * below can fill in the event without parsing the
			 * packet.
			 */
			id = resp->id;
			if (ev->result != ISC_R_SUCCESS) {
				/*
				 * This is most likely a network error on a
				 * connected socket.  It makes no sense to
				 * check the address or parse the packet, but it
				 * will help to return the error to the caller.
				 */
				goto sendresponse;
			}
		} else {
			/*
			 * No dispsocket is associated with this event (none
			 * was given, or it was deactivated above): drop the
			 * data without restarting a receive.
			 */
			free_buffer(disp, ev->region.base, ev->region.length);

			UNLOCK(&disp->lock);
			isc_event_free(&ev_in);
			return;
		}
	} else if (ev->result != ISC_R_SUCCESS) {
		free_buffer(disp, ev->region.base, ev->region.length);

		if (ev->result != ISC_R_CANCELED)
			dispatch_log(disp, ISC_LOG_ERROR,
				     "odd socket result in udp_recv(): %s",
				     isc_result_totext(ev->result));

		UNLOCK(&disp->lock);
		isc_event_free(&ev_in);
		return;
	}

	/*
	 * If this is from a blackholed address, drop it.
	 */
	isc_netaddr_fromsockaddr(&netaddr, &ev->address);
	if (disp->mgr->blackhole != NULL &&
	    dns_acl_match(&netaddr, NULL, disp->mgr->blackhole,
			  NULL, &match, NULL) == ISC_R_SUCCESS &&
	    match > 0)
	{
		if (isc_log_wouldlog(dns_lctx, LVL(10))) {
			char netaddrstr[ISC_NETADDR_FORMATSIZE];
			isc_netaddr_format(&netaddr, netaddrstr,
					   sizeof(netaddrstr));
			dispatch_log(disp, LVL(10),
				     "blackholed packet from %s",
				     netaddrstr);
		}
		free_buffer(disp, ev->region.base, ev->region.length);
		goto restart;
	}

	/*
	 * Peek into the buffer to see what we can see.
	 */
	isc_buffer_init(&source, ev->region.base, ev->region.length);
	isc_buffer_add(&source, ev->n);
	dres = dns_message_peekheader(&source, &id, &flags);
	if (dres != ISC_R_SUCCESS) {
		free_buffer(disp, ev->region.base, ev->region.length);
		dispatch_log(disp, LVL(10), "got garbage packet");
		goto restart;
	}

	dispatch_log(disp, LVL(92),
		     "got valid DNS message header, /QR %c, id %u",
		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);

	/*
	 * Look at flags.  If query, drop it. If response,
	 * look to see where it goes.
	 */
	queue_response = ISC_FALSE;
	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
		/* query */
		free_buffer(disp, ev->region.base, ev->region.length);
		goto restart;
	}

	/*
	 * Search for the corresponding response.  If we are using an exclusive
	 * socket, we've already identified it and we can skip the search; but
	 * the ID and the address must match the expected ones.
	 */
	if (resp == NULL) {
		bucket = dns_hash(qid, &ev->address, id, disp->localport);
		LOCK(&qid->lock);
		qidlocked = ISC_TRUE;
		resp = entry_search(qid, &ev->address, id, disp->localport,
				    bucket);
		dispatch_log(disp, LVL(90),
			     "search for response in bucket %d: %s",
			     bucket, (resp == NULL ? "not found" : "found"));

		if (resp == NULL) {
			inc_stats(mgr, dns_resstatscounter_mismatch);
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
	} else if (resp->id != id || !isc_sockaddr_equal(&ev->address,
							 &resp->host)) {
		dispatch_log(disp, LVL(90),
			     "response to an exclusive socket doesn't match");
		inc_stats(mgr, dns_resstatscounter_mismatch);
		free_buffer(disp, ev->region.base, ev->region.length);
		goto unlock;
	}

	/*
	 * Now that we have the original dispatch the query was sent
	 * from check that the address and port the response was
	 * sent to make sense.
	 */
	if (disp != resp->disp) {
		isc_sockaddr_t a1;
		isc_sockaddr_t a2;

		/*
		 * Check that the socket types and ports match.
		 */
		if (disp->socktype != resp->disp->socktype ||
		    isc_sockaddr_getport(&disp->local) !=
		    isc_sockaddr_getport(&resp->disp->local)) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}

		/*
		 * If both dispatches are bound to an address then fail as
		 * the addresses can't be equal (enforced by the IP stack).
		 *
		 * Note under Linux a packet can be sent out via IPv4 socket
		 * and the response be received via a IPv6 socket.
		 *
		 * Requests sent out via IPv6 should always come back in
		 * via IPv6.
		 */
		if (isc_sockaddr_pf(&resp->disp->local) == PF_INET6 &&
		    isc_sockaddr_pf(&disp->local) != PF_INET6) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
		/*
		 * a1/a2 are the wildcard addresses of each dispatch's
		 * protocol family; a dispatch whose local address differs
		 * from its wildcard is bound to a specific address.
		 */
		isc_sockaddr_anyofpf(&a1, isc_sockaddr_pf(&resp->disp->local));
		isc_sockaddr_anyofpf(&a2, isc_sockaddr_pf(&disp->local));
		if (!isc_sockaddr_eqaddr(&a1, &resp->disp->local) &&
		    !isc_sockaddr_eqaddr(&a2, &disp->local)) {
			free_buffer(disp, ev->region.base, ev->region.length);
			goto unlock;
		}
	}

  sendresponse:
	/*
	 * If an event is already out for this entry (item_out), the new one
	 * is queued on the entry instead of being sent to the task.
	 */
	queue_response = resp->item_out;
	rev = allocate_event(resp->disp);
	if (rev == NULL) {
		free_buffer(disp, ev->region.base, ev->region.length);
		goto unlock;
	}

	/*
	 * At this point, rev contains the event we want to fill in, and
	 * resp contains the information on the place to send it to.
	 * Send the event off.
	 *
	 * The receive buffer is not freed on this path; rev->buffer is set
	 * up to refer to it.
	 */
	isc_buffer_init(&rev->buffer, ev->region.base, ev->region.length);
	isc_buffer_add(&rev->buffer, ev->n);
	rev->result = ev->result;
	rev->id = id;
	rev->addr = ev->address;
	rev->pktinfo = ev->pktinfo;
	rev->attributes = ev->attributes;
	if (queue_response) {
		ISC_LIST_APPEND(resp->items, rev, ev_link);
	} else {
		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL,
			       DNS_EVENT_DISPATCH,
			       resp->action, resp->arg, resp, NULL, NULL);
		request_log(disp, resp, LVL(90),
			    "[a] Sent event %p buffer %p len %d to task %p",
			    rev, rev->buffer.base, rev->buffer.length,
			    resp->task);
		resp->item_out = ISC_TRUE;
		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
	}
 unlock:
	if (qidlocked)
		UNLOCK(&qid->lock);

	/*
	 * Restart recv() to get the next packet.
	 */
 restart:
	result = startrecv(disp, dispsock);
	if (result != ISC_R_SUCCESS && dispsock != NULL) {
		/*
		 * XXX: weird. There seems to be no recovery process other than
		 * deactivate this socket anyway (since we cannot start
		 * receiving, we won't be able to receive a cancel event
		 * from the user).
		 */
		deactivate_dispsocket(disp, dispsock);
	}
	UNLOCK(&disp->lock);

	isc_event_free(&ev_in);
}
1392
/*
 * Task callback for a completed TCP message read on a dispatch.
 *
 * The event argument is the dispatch; the received message and its result
 * are read from the dispatch's embedded tcpmsg structure.  The dispatch lock
 * is taken here and released before returning, and 'ev_in' is freed on every
 * path.
 *
 * General flow:
 *
 * If I/O result == CANCELED, EOF, or error, notify everyone as the
 * various queues drain.
 *
 * If query, restart.
 *
 * If response:
 *	Allocate event, fill in details.
 *		If cannot allocate, restart.
 *	find target.  If not found, restart.
 *	if event queue is not empty, queue.  else, send.
 *	restart.
 */
static void
tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
	dns_dispatch_t *disp = ev_in->ev_arg;
	dns_tcpmsg_t *tcpmsg = &disp->tcpmsg;
	dns_messageid_t id;
	isc_result_t dres;
	unsigned int flags;
	dns_dispentry_t *resp;
	dns_dispatchevent_t *rev;
	unsigned int bucket;
	isc_boolean_t killit;
	isc_boolean_t queue_response;
	dns_qid_t *qid;
	int level;
	char buf[ISC_SOCKADDR_FORMATSIZE];

	UNUSED(task);

	REQUIRE(VALID_DISPATCH(disp));

	qid = disp->qid;

	/* NOTE(review): these fields are read before disp->lock is taken. */
	dispatch_log(disp, LVL(90),
		     "got TCP packet: requests %d, buffers %d, recvs %d",
		     disp->requests, disp->tcpbuffers, disp->recv_pending);

	LOCK(&disp->lock);

	INSIST(disp->recv_pending != 0);
	disp->recv_pending = 0;

	if (disp->refcount == 0) {
		/*
		 * This dispatcher is shutting down.  Force cancelation.
		 */
		tcpmsg->result = ISC_R_CANCELED;
	}

	if (tcpmsg->result != ISC_R_SUCCESS) {
		/*
		 * Every failure ends with the dispatch marked as shutting
		 * down; all but ISC_R_CANCELED also call do_cancel(), and
		 * all but CANCELED and EOF log the peer address and result.
		 */
		switch (tcpmsg->result) {
		case ISC_R_CANCELED:
			break;

		case ISC_R_EOF:
			dispatch_log(disp, LVL(90), "shutting down on EOF");
			do_cancel(disp);
			break;

		case ISC_R_CONNECTIONRESET:
			level = ISC_LOG_INFO;
			goto logit;

		default:
			level = ISC_LOG_ERROR;
		logit:
			isc_sockaddr_format(&tcpmsg->address, buf, sizeof(buf));
			dispatch_log(disp, level, "shutting down due to TCP "
				     "receive error: %s: %s", buf,
				     isc_result_totext(tcpmsg->result));
			do_cancel(disp);
			break;
		}

		/*
		 * The event is statically allocated in the tcpmsg
		 * structure, and destroy_disp() frees the tcpmsg, so we must
		 * free the event *before* calling destroy_disp().
		 */
		isc_event_free(&ev_in);

		disp->shutting_down = 1;
		disp->shutdown_why = tcpmsg->result;

		/*
		 * If the recv() was canceled pass the word on.
		 */
		killit = destroy_disp_ok(disp);
		UNLOCK(&disp->lock);
		if (killit)
			isc_task_send(disp->task[0], &disp->ctlevent);
		return;
	}

	dispatch_log(disp, LVL(90), "result %d, length == %d, addr = %p",
		     tcpmsg->result,
		     tcpmsg->buffer.length, tcpmsg->buffer.base);

	/*
	 * Peek into the buffer to see what we can see.
	 */
	dres = dns_message_peekheader(&tcpmsg->buffer, &id, &flags);
	if (dres != ISC_R_SUCCESS) {
		dispatch_log(disp, LVL(10), "got garbage packet");
		goto restart;
	}

	dispatch_log(disp, LVL(92),
		     "got valid DNS message header, /QR %c, id %u",
		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);

	/*
	 * Allocate an event to send to the query or response client, and
	 * allocate a new buffer for our use.
	 *
	 * NOTE(review): no code follows this comment directly; the event is
	 * allocated further down, once a matching entry has been found.
	 */

	/*
	 * Look at flags.  If query, drop it. If response,
	 * look to see where it goes.
	 */
	queue_response = ISC_FALSE;
	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
		/*
		 * Query.
		 */
		goto restart;
	}

	/*
	 * Response.
	 */
	bucket = dns_hash(qid, &tcpmsg->address, id, disp->localport);
	LOCK(&qid->lock);
	resp = entry_search(qid, &tcpmsg->address, id, disp->localport, bucket);
	dispatch_log(disp, LVL(90),
		     "search for response in bucket %d: %s",
		     bucket, (resp == NULL ? "not found" : "found"));

	if (resp == NULL)
		goto unlock;
	/*
	 * If an event is already out for this entry (item_out), the new one
	 * is queued on the entry instead of being sent to the task.
	 */
	queue_response = resp->item_out;
	rev = allocate_event(disp);
	if (rev == NULL)
		goto unlock;

	/*
	 * At this point, rev contains the event we want to fill in, and
	 * resp contains the information on the place to send it to.
	 * Send the event off.
	 */
	dns_tcpmsg_keepbuffer(tcpmsg, &rev->buffer);
	/* Counted here; free_buffer() decrements it for TCP dispatches. */
	disp->tcpbuffers++;
	rev->result = ISC_R_SUCCESS;
	rev->id = id;
	rev->addr = tcpmsg->address;
	if (queue_response) {
		ISC_LIST_APPEND(resp->items, rev, ev_link);
	} else {
		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL, DNS_EVENT_DISPATCH,
			       resp->action, resp->arg, resp, NULL, NULL);
		request_log(disp, resp, LVL(90),
			    "[b] Sent event %p buffer %p len %d to task %p",
			    rev, rev->buffer.base, rev->buffer.length,
			    resp->task);
		resp->item_out = ISC_TRUE;
		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
	}
 unlock:
	UNLOCK(&qid->lock);

	/*
	 * Restart recv() to get the next packet.
	 */
 restart:
	(void)startrecv(disp, NULL);

	UNLOCK(&disp->lock);

	isc_event_free(&ev_in);
}
1577
1578/*
1579 * disp must be locked.
1580 */
1581static isc_result_t
1582startrecv(dns_dispatch_t *disp, dispsocket_t *dispsock) {
1583	isc_result_t res;
1584	isc_region_t region;
1585	isc_socket_t *socket;
1586
1587	if (disp->shutting_down == 1)
1588		return (ISC_R_SUCCESS);
1589
1590	if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
1591		return (ISC_R_SUCCESS);
1592
1593	if (disp->recv_pending != 0 && dispsock == NULL)
1594		return (ISC_R_SUCCESS);
1595
1596	if (disp->mgr->buffers >= disp->mgr->maxbuffers)
1597		return (ISC_R_NOMEMORY);
1598
1599	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
1600	    dispsock == NULL)
1601		return (ISC_R_SUCCESS);
1602
1603	if (dispsock != NULL)
1604		socket = dispsock->socket;
1605	else
1606		socket = disp->socket;
1607	INSIST(socket != NULL);
1608
1609	switch (disp->socktype) {
1610		/*
1611		 * UDP reads are always maximal.
1612		 */
1613	case isc_sockettype_udp:
1614		region.length = disp->mgr->buffersize;
1615		region.base = allocate_udp_buffer(disp);
1616		if (region.base == NULL)
1617			return (ISC_R_NOMEMORY);
1618		if (dispsock != NULL) {
1619			res = isc_socket_recv(socket, &region, 1,
1620					      dispsock->task, udp_exrecv,
1621					      dispsock);
1622			if (res != ISC_R_SUCCESS) {
1623				free_buffer(disp, region.base, region.length);
1624				return (res);
1625			}
1626		} else {
1627			res = isc_socket_recv(socket, &region, 1,
1628					      disp->task[0], udp_shrecv, disp);
1629			if (res != ISC_R_SUCCESS) {
1630				free_buffer(disp, region.base, region.length);
1631				disp->shutdown_why = res;
1632				disp->shutting_down = 1;
1633				do_cancel(disp);
1634				return (ISC_R_SUCCESS); /* recover by cancel */
1635			}
1636			INSIST(disp->recv_pending == 0);
1637			disp->recv_pending = 1;
1638		}
1639		break;
1640
1641	case isc_sockettype_tcp:
1642		res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task[0],
1643					     tcp_recv, disp);
1644		if (res != ISC_R_SUCCESS) {
1645			disp->shutdown_why = res;
1646			disp->shutting_down = 1;
1647			do_cancel(disp);
1648			return (ISC_R_SUCCESS); /* recover by cancel */
1649		}
1650		INSIST(disp->recv_pending == 0);
1651		disp->recv_pending = 1;
1652		break;
1653	default:
1654		INSIST(0);
1655		break;
1656	}
1657
1658	return (ISC_R_SUCCESS);
1659}
1660
1661/*
1662 * Mgr must be locked when calling this function.
1663 */
1664static isc_boolean_t
1665destroy_mgr_ok(dns_dispatchmgr_t *mgr) {
1666	mgr_log(mgr, LVL(90),
1667		"destroy_mgr_ok: shuttingdown=%d, listnonempty=%d, "
1668		"epool=%d, rpool=%d, dpool=%d",
1669		MGR_IS_SHUTTINGDOWN(mgr), !ISC_LIST_EMPTY(mgr->list),
1670		isc_mempool_getallocated(mgr->epool),
1671		isc_mempool_getallocated(mgr->rpool),
1672		isc_mempool_getallocated(mgr->dpool));
1673	if (!MGR_IS_SHUTTINGDOWN(mgr))
1674		return (ISC_FALSE);
1675	if (!ISC_LIST_EMPTY(mgr->list))
1676		return (ISC_FALSE);
1677	if (isc_mempool_getallocated(mgr->epool) != 0)
1678		return (ISC_FALSE);
1679	if (isc_mempool_getallocated(mgr->rpool) != 0)
1680		return (ISC_FALSE);
1681	if (isc_mempool_getallocated(mgr->dpool) != 0)
1682		return (ISC_FALSE);
1683
1684	return (ISC_TRUE);
1685}
1686
1687/*
1688 * Mgr must be unlocked when calling this function.
1689 */
1690static void
1691destroy_mgr(dns_dispatchmgr_t **mgrp) {
1692	isc_mem_t *mctx;
1693	dns_dispatchmgr_t *mgr;
1694
1695	mgr = *mgrp;
1696	*mgrp = NULL;
1697
1698	mctx = mgr->mctx;
1699
1700	mgr->magic = 0;
1701	mgr->mctx = NULL;
1702	DESTROYLOCK(&mgr->lock);
1703	mgr->state = 0;
1704
1705	DESTROYLOCK(&mgr->arc4_lock);
1706
1707	isc_mempool_destroy(&mgr->epool);
1708	isc_mempool_destroy(&mgr->rpool);
1709	isc_mempool_destroy(&mgr->dpool);
1710	isc_mempool_destroy(&mgr->bpool);
1711	isc_mempool_destroy(&mgr->spool);
1712
1713	DESTROYLOCK(&mgr->pool_lock);
1714
1715	if (mgr->entropy != NULL)
1716		isc_entropy_detach(&mgr->entropy);
1717	if (mgr->qid != NULL)
1718		qid_destroy(mctx, &mgr->qid);
1719
1720	DESTROYLOCK(&mgr->buffer_lock);
1721
1722	if (mgr->blackhole != NULL)
1723		dns_acl_detach(&mgr->blackhole);
1724
1725	if (mgr->stats != NULL)
1726		isc_stats_detach(&mgr->stats);
1727
1728	if (mgr->v4ports != NULL) {
1729		isc_mem_put(mctx, mgr->v4ports,
1730			    mgr->nv4ports * sizeof(in_port_t));
1731	}
1732	if (mgr->v6ports != NULL) {
1733		isc_mem_put(mctx, mgr->v6ports,
1734			    mgr->nv6ports * sizeof(in_port_t));
1735	}
1736	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
1737	isc_mem_detach(&mctx);
1738}
1739
1740static isc_result_t
1741open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
1742	    unsigned int options, isc_socket_t **sockp)
1743{
1744	isc_socket_t *sock;
1745	isc_result_t result;
1746
1747	sock = *sockp;
1748	if (sock == NULL) {
1749		result = isc_socket_create(mgr, isc_sockaddr_pf(local),
1750					   isc_sockettype_udp, &sock);
1751		if (result != ISC_R_SUCCESS)
1752			return (result);
1753		isc_socket_setname(sock, "dispatcher", NULL);
1754	} else {
1755		result = isc_socket_open(sock);
1756		if (result != ISC_R_SUCCESS)
1757			return (result);
1758	}
1759
1760#ifndef ISC_ALLOW_MAPPED
1761	isc_socket_ipv6only(sock, ISC_TRUE);
1762#endif
1763	result = isc_socket_bind(sock, local, options);
1764	if (result != ISC_R_SUCCESS) {
1765		if (*sockp == NULL)
1766			isc_socket_detach(&sock);
1767		else
1768			isc_socket_close(sock);
1769		return (result);
1770	}
1771
1772	*sockp = sock;
1773	return (ISC_R_SUCCESS);
1774}
1775
1776/*%
1777 * Create a temporary port list to set the initial default set of dispatch
1778 * ports: [1024, 65535].  This is almost meaningless as the application will
1779 * normally set the ports explicitly, but is provided to fill some minor corner
1780 * cases.
1781 */
1782static isc_result_t
1783create_default_portset(isc_mem_t *mctx, isc_portset_t **portsetp) {
1784	isc_result_t result;
1785
1786	result = isc_portset_create(mctx, portsetp);
1787	if (result != ISC_R_SUCCESS)
1788		return (result);
1789	isc_portset_addrange(*portsetp, 1024, 65535);
1790
1791	return (ISC_R_SUCCESS);
1792}
1793
1794/*
1795 * Publics.
1796 */
1797
1798isc_result_t
1799dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy,
1800		       dns_dispatchmgr_t **mgrp)
1801{
1802	dns_dispatchmgr_t *mgr;
1803	isc_result_t result;
1804	isc_portset_t *v4portset = NULL;
1805	isc_portset_t *v6portset = NULL;
1806
1807	REQUIRE(mctx != NULL);
1808	REQUIRE(mgrp != NULL && *mgrp == NULL);
1809
1810	mgr = isc_mem_get(mctx, sizeof(dns_dispatchmgr_t));
1811	if (mgr == NULL)
1812		return (ISC_R_NOMEMORY);
1813
1814	mgr->mctx = NULL;
1815	isc_mem_attach(mctx, &mgr->mctx);
1816
1817	mgr->blackhole = NULL;
1818	mgr->stats = NULL;
1819
1820	result = isc_mutex_init(&mgr->lock);
1821	if (result != ISC_R_SUCCESS)
1822		goto deallocate;
1823
1824	result = isc_mutex_init(&mgr->arc4_lock);
1825	if (result != ISC_R_SUCCESS)
1826		goto kill_lock;
1827
1828	result = isc_mutex_init(&mgr->buffer_lock);
1829	if (result != ISC_R_SUCCESS)
1830		goto kill_arc4_lock;
1831
1832	result = isc_mutex_init(&mgr->pool_lock);
1833	if (result != ISC_R_SUCCESS)
1834		goto kill_buffer_lock;
1835
1836	mgr->epool = NULL;
1837	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatchevent_t),
1838			       &mgr->epool) != ISC_R_SUCCESS) {
1839		result = ISC_R_NOMEMORY;
1840		goto kill_pool_lock;
1841	}
1842
1843	mgr->rpool = NULL;
1844	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispentry_t),
1845			       &mgr->rpool) != ISC_R_SUCCESS) {
1846		result = ISC_R_NOMEMORY;
1847		goto kill_epool;
1848	}
1849
1850	mgr->dpool = NULL;
1851	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatch_t),
1852			       &mgr->dpool) != ISC_R_SUCCESS) {
1853		result = ISC_R_NOMEMORY;
1854		goto kill_rpool;
1855	}
1856
1857	isc_mempool_setname(mgr->epool, "dispmgr_epool");
1858	isc_mempool_setfreemax(mgr->epool, 1024);
1859	isc_mempool_associatelock(mgr->epool, &mgr->pool_lock);
1860
1861	isc_mempool_setname(mgr->rpool, "dispmgr_rpool");
1862	isc_mempool_setfreemax(mgr->rpool, 1024);
1863	isc_mempool_associatelock(mgr->rpool, &mgr->pool_lock);
1864
1865	isc_mempool_setname(mgr->dpool, "dispmgr_dpool");
1866	isc_mempool_setfreemax(mgr->dpool, 1024);
1867	isc_mempool_associatelock(mgr->dpool, &mgr->pool_lock);
1868
1869	mgr->buffers = 0;
1870	mgr->buffersize = 0;
1871	mgr->maxbuffers = 0;
1872	mgr->bpool = NULL;
1873	mgr->spool = NULL;
1874	mgr->entropy = NULL;
1875	mgr->qid = NULL;
1876	mgr->state = 0;
1877	ISC_LIST_INIT(mgr->list);
1878	mgr->v4ports = NULL;
1879	mgr->v6ports = NULL;
1880	mgr->nv4ports = 0;
1881	mgr->nv6ports = 0;
1882	mgr->magic = DNS_DISPATCHMGR_MAGIC;
1883
1884	result = create_default_portset(mctx, &v4portset);
1885	if (result == ISC_R_SUCCESS) {
1886		result = create_default_portset(mctx, &v6portset);
1887		if (result == ISC_R_SUCCESS) {
1888			result = dns_dispatchmgr_setavailports(mgr,
1889							       v4portset,
1890							       v6portset);
1891		}
1892	}
1893	if (v4portset != NULL)
1894		isc_portset_destroy(mctx, &v4portset);
1895	if (v6portset != NULL)
1896		isc_portset_destroy(mctx, &v6portset);
1897	if (result != ISC_R_SUCCESS)
1898		goto kill_dpool;
1899
1900	if (entropy != NULL)
1901		isc_entropy_attach(entropy, &mgr->entropy);
1902
1903	dispatch_arc4init(&mgr->arc4ctx, mgr->entropy, &mgr->arc4_lock);
1904
1905	*mgrp = mgr;
1906	return (ISC_R_SUCCESS);
1907
1908 kill_dpool:
1909	isc_mempool_destroy(&mgr->dpool);
1910 kill_rpool:
1911	isc_mempool_destroy(&mgr->rpool);
1912 kill_epool:
1913	isc_mempool_destroy(&mgr->epool);
1914 kill_pool_lock:
1915	DESTROYLOCK(&mgr->pool_lock);
1916 kill_buffer_lock:
1917	DESTROYLOCK(&mgr->buffer_lock);
1918 kill_arc4_lock:
1919	DESTROYLOCK(&mgr->arc4_lock);
1920 kill_lock:
1921	DESTROYLOCK(&mgr->lock);
1922 deallocate:
1923	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
1924	isc_mem_detach(&mctx);
1925
1926	return (result);
1927}
1928
1929void
1930dns_dispatchmgr_setblackhole(dns_dispatchmgr_t *mgr, dns_acl_t *blackhole) {
1931	REQUIRE(VALID_DISPATCHMGR(mgr));
1932	if (mgr->blackhole != NULL)
1933		dns_acl_detach(&mgr->blackhole);
1934	dns_acl_attach(blackhole, &mgr->blackhole);
1935}
1936
1937dns_acl_t *
1938dns_dispatchmgr_getblackhole(dns_dispatchmgr_t *mgr) {
1939	REQUIRE(VALID_DISPATCHMGR(mgr));
1940	return (mgr->blackhole);
1941}
1942
1943void
1944dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr,
1945				 dns_portlist_t *portlist)
1946{
1947	REQUIRE(VALID_DISPATCHMGR(mgr));
1948	UNUSED(portlist);
1949
1950	/* This function is deprecated: use dns_dispatchmgr_setavailports(). */
1951	return;
1952}
1953
1954dns_portlist_t *
1955dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr) {
1956	REQUIRE(VALID_DISPATCHMGR(mgr));
1957	return (NULL);		/* this function is deprecated */
1958}
1959
1960isc_result_t
1961dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
1962			      isc_portset_t *v6portset)
1963{
1964	in_port_t *v4ports, *v6ports, p;
1965	unsigned int nv4ports, nv6ports, i4, i6;
1966
1967	REQUIRE(VALID_DISPATCHMGR(mgr));
1968
1969	nv4ports = isc_portset_nports(v4portset);
1970	nv6ports = isc_portset_nports(v6portset);
1971
1972	v4ports = NULL;
1973	if (nv4ports != 0) {
1974		v4ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv4ports);
1975		if (v4ports == NULL)
1976			return (ISC_R_NOMEMORY);
1977	}
1978	v6ports = NULL;
1979	if (nv6ports != 0) {
1980		v6ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv6ports);
1981		if (v6ports == NULL) {
1982			if (v4ports != NULL) {
1983				isc_mem_put(mgr->mctx, v4ports,
1984					    sizeof(in_port_t) *
1985					    isc_portset_nports(v4portset));
1986			}
1987			return (ISC_R_NOMEMORY);
1988		}
1989	}
1990
1991	p = 0;
1992	i4 = 0;
1993	i6 = 0;
1994	do {
1995		if (isc_portset_isset(v4portset, p)) {
1996			INSIST(i4 < nv4ports);
1997			v4ports[i4++] = p;
1998		}
1999		if (isc_portset_isset(v6portset, p)) {
2000			INSIST(i6 < nv6ports);
2001			v6ports[i6++] = p;
2002		}
2003	} while (p++ < 65535);
2004	INSIST(i4 == nv4ports && i6 == nv6ports);
2005
2006	PORTBUFLOCK(mgr);
2007	if (mgr->v4ports != NULL) {
2008		isc_mem_put(mgr->mctx, mgr->v4ports,
2009			    mgr->nv4ports * sizeof(in_port_t));
2010	}
2011	mgr->v4ports = v4ports;
2012	mgr->nv4ports = nv4ports;
2013
2014	if (mgr->v6ports != NULL) {
2015		isc_mem_put(mgr->mctx, mgr->v6ports,
2016			    mgr->nv6ports * sizeof(in_port_t));
2017	}
2018	mgr->v6ports = v6ports;
2019	mgr->nv6ports = nv6ports;
2020	PORTBUFUNLOCK(mgr);
2021
2022	return (ISC_R_SUCCESS);
2023}
2024
2025static isc_result_t
2026dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
2027		       unsigned int buffersize, unsigned int maxbuffers,
2028		       unsigned int maxrequests, unsigned int buckets,
2029		       unsigned int increment)
2030{
2031	isc_result_t result;
2032
2033	REQUIRE(VALID_DISPATCHMGR(mgr));
2034	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2035	REQUIRE(maxbuffers > 0);
2036	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2037	REQUIRE(increment > buckets);
2038
2039	/*
2040	 * Keep some number of items around.  This should be a config
2041	 * option.  For now, keep 8, but later keep at least two even
2042	 * if the caller wants less.  This allows us to ensure certain
2043	 * things, like an event can be "freed" and the next allocation
2044	 * will always succeed.
2045	 *
2046	 * Note that if limits are placed on anything here, we use one
2047	 * event internally, so the actual limit should be "wanted + 1."
2048	 *
2049	 * XXXMLG
2050	 */
2051
2052	if (maxbuffers < 8)
2053		maxbuffers = 8;
2054
2055	LOCK(&mgr->buffer_lock);
2056
2057	/* Create or adjust buffer pool */
2058	if (mgr->bpool != NULL) {
2059		/*
2060		 * We only increase the maxbuffers to avoid accidental buffer
2061		 * shortage.  Ideally we'd separate the manager-wide maximum
2062		 * from per-dispatch limits and respect the latter within the
2063		 * global limit.  But at this moment that's deemed to be
2064		 * overkilling and isn't worth additional implementation
2065		 * complexity.
2066		 */
2067		if (maxbuffers > mgr->maxbuffers) {
2068			isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2069			mgr->maxbuffers = maxbuffers;
2070		}
2071	} else {
2072		result = isc_mempool_create(mgr->mctx, buffersize, &mgr->bpool);
2073		if (result != ISC_R_SUCCESS) {
2074			UNLOCK(&mgr->buffer_lock);
2075			return (result);
2076		}
2077		isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
2078		isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2079		isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock);
2080	}
2081
2082	/* Create or adjust socket pool */
2083	if (mgr->spool != NULL) {
2084		isc_mempool_setmaxalloc(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2);
2085		UNLOCK(&mgr->buffer_lock);
2086		return (ISC_R_SUCCESS);
2087	}
2088	result = isc_mempool_create(mgr->mctx, sizeof(dispsocket_t),
2089				    &mgr->spool);
2090	if (result != ISC_R_SUCCESS) {
2091		UNLOCK(&mgr->buffer_lock);
2092		goto cleanup;
2093	}
2094	isc_mempool_setname(mgr->spool, "dispmgr_spool");
2095	isc_mempool_setmaxalloc(mgr->spool, maxrequests);
2096	isc_mempool_associatelock(mgr->spool, &mgr->pool_lock);
2097
2098	result = qid_allocate(mgr, buckets, increment, &mgr->qid, ISC_TRUE);
2099	if (result != ISC_R_SUCCESS)
2100		goto cleanup;
2101
2102	mgr->buffersize = buffersize;
2103	mgr->maxbuffers = maxbuffers;
2104	UNLOCK(&mgr->buffer_lock);
2105	return (ISC_R_SUCCESS);
2106
2107 cleanup:
2108	isc_mempool_destroy(&mgr->bpool);
2109	if (mgr->spool != NULL)
2110		isc_mempool_destroy(&mgr->spool);
2111	UNLOCK(&mgr->buffer_lock);
2112	return (result);
2113}
2114
2115void
2116dns_dispatchmgr_destroy(dns_dispatchmgr_t **mgrp) {
2117	dns_dispatchmgr_t *mgr;
2118	isc_boolean_t killit;
2119
2120	REQUIRE(mgrp != NULL);
2121	REQUIRE(VALID_DISPATCHMGR(*mgrp));
2122
2123	mgr = *mgrp;
2124	*mgrp = NULL;
2125
2126	LOCK(&mgr->lock);
2127	mgr->state |= MGR_SHUTTINGDOWN;
2128
2129	killit = destroy_mgr_ok(mgr);
2130	UNLOCK(&mgr->lock);
2131
2132	mgr_log(mgr, LVL(90), "destroy: killit=%d", killit);
2133
2134	if (killit)
2135		destroy_mgr(&mgr);
2136}
2137
2138void
2139dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, isc_stats_t *stats) {
2140	REQUIRE(VALID_DISPATCHMGR(mgr));
2141	REQUIRE(ISC_LIST_EMPTY(mgr->list));
2142	REQUIRE(mgr->stats == NULL);
2143
2144	isc_stats_attach(stats, &mgr->stats);
2145}
2146
/*
 * bsearch() comparison callback for arrays of in_port_t.
 *
 * Returns -1, 0 or 1 when the port at 'key' is respectively smaller
 * than, equal to, or greater than the port at 'ent'.
 */
static int
port_cmp(const void *key, const void *ent) {
	const in_port_t wanted = *(const in_port_t *)key;
	const in_port_t candidate = *(const in_port_t *)ent;

	return ((wanted > candidate) - (wanted < candidate));
}
2159
2160static isc_boolean_t
2161portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2162	      isc_sockaddr_t *sockaddrp)
2163{
2164	isc_sockaddr_t sockaddr;
2165	isc_result_t result;
2166	in_port_t *ports, port;
2167	unsigned int nports;
2168	isc_boolean_t available = ISC_FALSE;
2169
2170	REQUIRE(sock != NULL || sockaddrp != NULL);
2171
2172	PORTBUFLOCK(mgr);
2173	if (sock != NULL) {
2174		sockaddrp = &sockaddr;
2175		result = isc_socket_getsockname(sock, sockaddrp);
2176		if (result != ISC_R_SUCCESS)
2177			goto unlock;
2178	}
2179
2180	if (isc_sockaddr_pf(sockaddrp) == AF_INET) {
2181		ports = mgr->v4ports;
2182		nports = mgr->nv4ports;
2183	} else {
2184		ports = mgr->v6ports;
2185		nports = mgr->nv6ports;
2186	}
2187	if (ports == NULL)
2188		goto unlock;
2189
2190	port = isc_sockaddr_getport(sockaddrp);
2191	if (bsearch(&port, ports, nports, sizeof(in_port_t), port_cmp) != NULL)
2192		available = ISC_TRUE;
2193
2194unlock:
2195	PORTBUFUNLOCK(mgr);
2196	return (available);
2197}
2198
/*
 * True when _a1 and _a2 agree on every bit selected by _mask, i.e. no
 * differing bit survives the mask.
 */
#define ATTRMATCH(_a1, _a2, _mask) ((((_a1) ^ (_a2)) & (_mask)) == 0)
2200
2201static isc_boolean_t
2202local_addr_match(dns_dispatch_t *disp, isc_sockaddr_t *addr) {
2203	isc_sockaddr_t sockaddr;
2204	isc_result_t result;
2205
2206	REQUIRE(disp->socket != NULL);
2207
2208	if (addr == NULL)
2209		return (ISC_TRUE);
2210
2211	/*
2212	 * Don't match wildcard ports unless the port is available in the
2213	 * current configuration.
2214	 */
2215	if (isc_sockaddr_getport(addr) == 0 &&
2216	    isc_sockaddr_getport(&disp->local) == 0 &&
2217	    !portavailable(disp->mgr, disp->socket, NULL)) {
2218		return (ISC_FALSE);
2219	}
2220
2221	/*
2222	 * Check if we match the binding <address,port>.
2223	 * Wildcard ports match/fail here.
2224	 */
2225	if (isc_sockaddr_equal(&disp->local, addr))
2226		return (ISC_TRUE);
2227	if (isc_sockaddr_getport(addr) == 0)
2228		return (ISC_FALSE);
2229
2230	/*
2231	 * Check if we match a bound wildcard port <address,port>.
2232	 */
2233	if (!isc_sockaddr_eqaddr(&disp->local, addr))
2234		return (ISC_FALSE);
2235	result = isc_socket_getsockname(disp->socket, &sockaddr);
2236	if (result != ISC_R_SUCCESS)
2237		return (ISC_FALSE);
2238
2239	return (isc_sockaddr_equal(&sockaddr, addr));
2240}
2241
2242/*
2243 * Requires mgr be locked.
2244 *
2245 * No dispatcher can be locked by this thread when calling this function.
2246 *
2247 *
2248 * NOTE:
2249 *	If a matching dispatcher is found, it is locked after this function
2250 *	returns, and must be unlocked by the caller.
2251 */
2252static isc_result_t
2253dispatch_find(dns_dispatchmgr_t *mgr, isc_sockaddr_t *local,
2254	      unsigned int attributes, unsigned int mask,
2255	      dns_dispatch_t **dispp)
2256{
2257	dns_dispatch_t *disp;
2258	isc_result_t result;
2259
2260	/*
2261	 * Make certain that we will not match a private or exclusive dispatch.
2262	 */
2263	attributes &= ~(DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2264	mask |= (DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2265
2266	disp = ISC_LIST_HEAD(mgr->list);
2267	while (disp != NULL) {
2268		LOCK(&disp->lock);
2269		if ((disp->shutting_down == 0)
2270		    && ATTRMATCH(disp->attributes, attributes, mask)
2271		    && local_addr_match(disp, local))
2272			break;
2273		UNLOCK(&disp->lock);
2274		disp = ISC_LIST_NEXT(disp, link);
2275	}
2276
2277	if (disp == NULL) {
2278		result = ISC_R_NOTFOUND;
2279		goto out;
2280	}
2281
2282	*dispp = disp;
2283	result = ISC_R_SUCCESS;
2284 out:
2285
2286	return (result);
2287}
2288
2289static isc_result_t
2290qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
2291	     unsigned int increment, dns_qid_t **qidp,
2292	     isc_boolean_t needsocktable)
2293{
2294	dns_qid_t *qid;
2295	unsigned int i;
2296	isc_result_t result;
2297
2298	REQUIRE(VALID_DISPATCHMGR(mgr));
2299	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2300	REQUIRE(increment > buckets);
2301	REQUIRE(qidp != NULL && *qidp == NULL);
2302
2303	qid = isc_mem_get(mgr->mctx, sizeof(*qid));
2304	if (qid == NULL)
2305		return (ISC_R_NOMEMORY);
2306
2307	qid->qid_table = isc_mem_get(mgr->mctx,
2308				     buckets * sizeof(dns_displist_t));
2309	if (qid->qid_table == NULL) {
2310		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2311		return (ISC_R_NOMEMORY);
2312	}
2313
2314	qid->sock_table = NULL;
2315	if (needsocktable) {
2316		qid->sock_table = isc_mem_get(mgr->mctx, buckets *
2317					      sizeof(dispsocketlist_t));
2318		if (qid->sock_table == NULL) {
2319			isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2320			isc_mem_put(mgr->mctx, qid->qid_table,
2321				    buckets * sizeof(dns_displist_t));
2322			return (ISC_R_NOMEMORY);
2323		}
2324	}
2325
2326	result = isc_mutex_init(&qid->lock);
2327	if (result != ISC_R_SUCCESS) {
2328		if (qid->sock_table != NULL) {
2329			isc_mem_put(mgr->mctx, qid->sock_table,
2330				    buckets * sizeof(dispsocketlist_t));
2331		}
2332		isc_mem_put(mgr->mctx, qid->qid_table,
2333			    buckets * sizeof(dns_displist_t));
2334		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2335		return (result);
2336	}
2337
2338	for (i = 0; i < buckets; i++) {
2339		ISC_LIST_INIT(qid->qid_table[i]);
2340		if (qid->sock_table != NULL)
2341			ISC_LIST_INIT(qid->sock_table[i]);
2342	}
2343
2344	qid->qid_nbuckets = buckets;
2345	qid->qid_increment = increment;
2346	qid->magic = QID_MAGIC;
2347	*qidp = qid;
2348	return (ISC_R_SUCCESS);
2349}
2350
2351static void
2352qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp) {
2353	dns_qid_t *qid;
2354
2355	REQUIRE(qidp != NULL);
2356	qid = *qidp;
2357
2358	REQUIRE(VALID_QID(qid));
2359
2360	*qidp = NULL;
2361	qid->magic = 0;
2362	isc_mem_put(mctx, qid->qid_table,
2363		    qid->qid_nbuckets * sizeof(dns_displist_t));
2364	if (qid->sock_table != NULL) {
2365		isc_mem_put(mctx, qid->sock_table,
2366			    qid->qid_nbuckets * sizeof(dispsocketlist_t));
2367	}
2368	DESTROYLOCK(&qid->lock);
2369	isc_mem_put(mctx, qid, sizeof(*qid));
2370}
2371
2372/*
2373 * Allocate and set important limits.
2374 */
2375static isc_result_t
2376dispatch_allocate(dns_dispatchmgr_t *mgr, unsigned int maxrequests,
2377		  dns_dispatch_t **dispp)
2378{
2379	dns_dispatch_t *disp;
2380	isc_result_t result;
2381
2382	REQUIRE(VALID_DISPATCHMGR(mgr));
2383	REQUIRE(dispp != NULL && *dispp == NULL);
2384
2385	/*
2386	 * Set up the dispatcher, mostly.  Don't bother setting some of
2387	 * the options that are controlled by tcp vs. udp, etc.
2388	 */
2389
2390	disp = isc_mempool_get(mgr->dpool);
2391	if (disp == NULL)
2392		return (ISC_R_NOMEMORY);
2393
2394	disp->magic = 0;
2395	disp->mgr = mgr;
2396	disp->maxrequests = maxrequests;
2397	disp->attributes = 0;
2398	ISC_LINK_INIT(disp, link);
2399	disp->refcount = 1;
2400	disp->recv_pending = 0;
2401	memset(&disp->local, 0, sizeof(disp->local));
2402	disp->localport = 0;
2403	disp->shutting_down = 0;
2404	disp->shutdown_out = 0;
2405	disp->connected = 0;
2406	disp->tcpmsg_valid = 0;
2407	disp->shutdown_why = ISC_R_UNEXPECTED;
2408	disp->requests = 0;
2409	disp->tcpbuffers = 0;
2410	disp->qid = NULL;
2411	ISC_LIST_INIT(disp->activesockets);
2412	ISC_LIST_INIT(disp->inactivesockets);
2413	disp->nsockets = 0;
2414	dispatch_arc4init(&disp->arc4ctx, mgr->entropy, NULL);
2415	disp->port_table = NULL;
2416	disp->portpool = NULL;
2417
2418	result = isc_mutex_init(&disp->lock);
2419	if (result != ISC_R_SUCCESS)
2420		goto deallocate;
2421
2422	disp->failsafe_ev = allocate_event(disp);
2423	if (disp->failsafe_ev == NULL) {
2424		result = ISC_R_NOMEMORY;
2425		goto kill_lock;
2426	}
2427
2428	disp->magic = DISPATCH_MAGIC;
2429
2430	*dispp = disp;
2431	return (ISC_R_SUCCESS);
2432
2433	/*
2434	 * error returns
2435	 */
2436 kill_lock:
2437	DESTROYLOCK(&disp->lock);
2438 deallocate:
2439	isc_mempool_put(mgr->dpool, disp);
2440
2441	return (result);
2442}
2443
2444
2445/*
2446 * MUST be unlocked, and not used by anything.
2447 */
2448static void
2449dispatch_free(dns_dispatch_t **dispp)
2450{
2451	dns_dispatch_t *disp;
2452	dns_dispatchmgr_t *mgr;
2453	int i;
2454
2455	REQUIRE(VALID_DISPATCH(*dispp));
2456	disp = *dispp;
2457	*dispp = NULL;
2458
2459	mgr = disp->mgr;
2460	REQUIRE(VALID_DISPATCHMGR(mgr));
2461
2462	if (disp->tcpmsg_valid) {
2463		dns_tcpmsg_invalidate(&disp->tcpmsg);
2464		disp->tcpmsg_valid = 0;
2465	}
2466
2467	INSIST(disp->tcpbuffers == 0);
2468	INSIST(disp->requests == 0);
2469	INSIST(disp->recv_pending == 0);
2470	INSIST(ISC_LIST_EMPTY(disp->activesockets));
2471	INSIST(ISC_LIST_EMPTY(disp->inactivesockets));
2472
2473	isc_mempool_put(mgr->epool, disp->failsafe_ev);
2474	disp->failsafe_ev = NULL;
2475
2476	if (disp->qid != NULL)
2477		qid_destroy(mgr->mctx, &disp->qid);
2478
2479	if (disp->port_table != NULL) {
2480		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
2481			INSIST(ISC_LIST_EMPTY(disp->port_table[i]));
2482		isc_mem_put(mgr->mctx, disp->port_table,
2483			    sizeof(disp->port_table[0]) *
2484			    DNS_DISPATCH_PORTTABLESIZE);
2485	}
2486
2487	if (disp->portpool != NULL)
2488		isc_mempool_destroy(&disp->portpool);
2489
2490	disp->mgr = NULL;
2491	DESTROYLOCK(&disp->lock);
2492	disp->magic = 0;
2493	isc_mempool_put(mgr->dpool, disp);
2494}
2495
2496isc_result_t
2497dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2498		       isc_taskmgr_t *taskmgr, unsigned int buffersize,
2499		       unsigned int maxbuffers, unsigned int maxrequests,
2500		       unsigned int buckets, unsigned int increment,
2501		       unsigned int attributes, dns_dispatch_t **dispp)
2502{
2503	isc_result_t result;
2504	dns_dispatch_t *disp;
2505
2506	UNUSED(maxbuffers);
2507	UNUSED(buffersize);
2508
2509	REQUIRE(VALID_DISPATCHMGR(mgr));
2510	REQUIRE(isc_socket_gettype(sock) == isc_sockettype_tcp);
2511	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) != 0);
2512	REQUIRE((attributes & DNS_DISPATCHATTR_UDP) == 0);
2513
2514	attributes |= DNS_DISPATCHATTR_PRIVATE;  /* XXXMLG */
2515
2516	LOCK(&mgr->lock);
2517
2518	/*
2519	 * dispatch_allocate() checks mgr for us.
2520	 * qid_allocate() checks buckets and increment for us.
2521	 */
2522	disp = NULL;
2523	result = dispatch_allocate(mgr, maxrequests, &disp);
2524	if (result != ISC_R_SUCCESS) {
2525		UNLOCK(&mgr->lock);
2526		return (result);
2527	}
2528
2529	result = qid_allocate(mgr, buckets, increment, &disp->qid, ISC_FALSE);
2530	if (result != ISC_R_SUCCESS)
2531		goto deallocate_dispatch;
2532
2533	disp->socktype = isc_sockettype_tcp;
2534	disp->socket = NULL;
2535	isc_socket_attach(sock, &disp->socket);
2536
2537	disp->ntasks = 1;
2538	disp->task[0] = NULL;
2539	result = isc_task_create(taskmgr, 0, &disp->task[0]);
2540	if (result != ISC_R_SUCCESS)
2541		goto kill_socket;
2542
2543	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
2544					    DNS_EVENT_DISPATCHCONTROL,
2545					    destroy_disp, disp,
2546					    sizeof(isc_event_t));
2547	if (disp->ctlevent == NULL) {
2548		result = ISC_R_NOMEMORY;
2549		goto kill_task;
2550	}
2551
2552	isc_task_setname(disp->task[0], "tcpdispatch", disp);
2553
2554	dns_tcpmsg_init(mgr->mctx, disp->socket, &disp->tcpmsg);
2555	disp->tcpmsg_valid = 1;
2556
2557	disp->attributes = attributes;
2558
2559	/*
2560	 * Append it to the dispatcher list.
2561	 */
2562	ISC_LIST_APPEND(mgr->list, disp, link);
2563	UNLOCK(&mgr->lock);
2564
2565	mgr_log(mgr, LVL(90), "created TCP dispatcher %p", disp);
2566	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]);
2567
2568	*dispp = disp;
2569
2570	return (ISC_R_SUCCESS);
2571
2572	/*
2573	 * Error returns.
2574	 */
2575 kill_task:
2576	isc_task_detach(&disp->task[0]);
2577 kill_socket:
2578	isc_socket_detach(&disp->socket);
2579 deallocate_dispatch:
2580	dispatch_free(&disp);
2581
2582	UNLOCK(&mgr->lock);
2583
2584	return (result);
2585}
2586
2587isc_result_t
2588dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2589		    isc_taskmgr_t *taskmgr, isc_sockaddr_t *localaddr,
2590		    unsigned int buffersize,
2591		    unsigned int maxbuffers, unsigned int maxrequests,
2592		    unsigned int buckets, unsigned int increment,
2593		    unsigned int attributes, unsigned int mask,
2594		    dns_dispatch_t **dispp)
2595{
2596	isc_result_t result;
2597	dns_dispatch_t *disp = NULL;
2598
2599	REQUIRE(VALID_DISPATCHMGR(mgr));
2600	REQUIRE(sockmgr != NULL);
2601	REQUIRE(localaddr != NULL);
2602	REQUIRE(taskmgr != NULL);
2603	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2604	REQUIRE(maxbuffers > 0);
2605	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2606	REQUIRE(increment > buckets);
2607	REQUIRE(dispp != NULL && *dispp == NULL);
2608	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) == 0);
2609
2610	result = dns_dispatchmgr_setudp(mgr, buffersize, maxbuffers,
2611					maxrequests, buckets, increment);
2612	if (result != ISC_R_SUCCESS)
2613		return (result);
2614
2615	LOCK(&mgr->lock);
2616
2617	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
2618		REQUIRE(isc_sockaddr_getport(localaddr) == 0);
2619		goto createudp;
2620	}
2621
2622	/*
2623	 * See if we have a dispatcher that matches.
2624	 */
2625	result = dispatch_find(mgr, localaddr, attributes, mask, &disp);
2626	if (result == ISC_R_SUCCESS) {
2627		disp->refcount++;
2628
2629		if (disp->maxrequests < maxrequests)
2630			disp->maxrequests = maxrequests;
2631
2632		if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) == 0 &&
2633		    (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
2634		{
2635			disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
2636			if (disp->recv_pending != 0)
2637				isc_socket_cancel(disp->socket, disp->task[0],
2638						  ISC_SOCKCANCEL_RECV);
2639		}
2640
2641		UNLOCK(&disp->lock);
2642		UNLOCK(&mgr->lock);
2643
2644		*dispp = disp;
2645
2646		return (ISC_R_SUCCESS);
2647	}
2648
2649 createudp:
2650	/*
2651	 * Nope, create one.
2652	 */
2653	result = dispatch_createudp(mgr, sockmgr, taskmgr, localaddr,
2654				    maxrequests, attributes, &disp);
2655	if (result != ISC_R_SUCCESS) {
2656		UNLOCK(&mgr->lock);
2657		return (result);
2658	}
2659
2660	UNLOCK(&mgr->lock);
2661	*dispp = disp;
2662	return (ISC_R_SUCCESS);
2663}
2664
2665/*
2666 * mgr should be locked.
2667 */
2668
2669#ifndef DNS_DISPATCH_HELD
2670#define DNS_DISPATCH_HELD 20U
2671#endif
2672
2673static isc_result_t
2674get_udpsocket(dns_dispatchmgr_t *mgr, dns_dispatch_t *disp,
2675	      isc_socketmgr_t *sockmgr, isc_sockaddr_t *localaddr,
2676	      isc_socket_t **sockp)
2677{
2678	unsigned int i, j;
2679	isc_socket_t *held[DNS_DISPATCH_HELD];
2680	isc_sockaddr_t localaddr_bound;
2681	isc_socket_t *sock = NULL;
2682	isc_result_t result = ISC_R_SUCCESS;
2683	isc_boolean_t anyport;
2684
2685	INSIST(sockp != NULL && *sockp == NULL);
2686
2687	localaddr_bound = *localaddr;
2688	anyport = ISC_TF(isc_sockaddr_getport(localaddr) == 0);
2689
2690	if (anyport) {
2691		unsigned int nports;
2692		in_port_t *ports;
2693
2694		/*
2695		 * If no port is specified, we first try to pick up a random
2696		 * port by ourselves.
2697		 */
2698		if (isc_sockaddr_pf(&disp->local) == AF_INET) {
2699			nports = disp->mgr->nv4ports;
2700			ports = disp->mgr->v4ports;
2701		} else {
2702			nports = disp->mgr->nv6ports;
2703			ports = disp->mgr->v6ports;
2704		}
2705		if (nports == 0)
2706			return (ISC_R_ADDRNOTAVAIL);
2707
2708		for (i = 0; i < 1024; i++) {
2709			in_port_t prt;
2710
2711			prt = ports[dispatch_arc4uniformrandom(
2712					DISP_ARC4CTX(disp),
2713					nports)];
2714			isc_sockaddr_setport(&localaddr_bound, prt);
2715			result = open_socket(sockmgr, &localaddr_bound,
2716					     0, &sock);
2717			if (result == ISC_R_SUCCESS ||
2718			    result != ISC_R_ADDRINUSE) {
2719				disp->localport = prt;
2720				*sockp = sock;
2721				return (result);
2722			}
2723		}
2724
2725		/*
2726		 * If this fails 1024 times, we then ask the kernel for
2727		 * choosing one.
2728		 */
2729	} else {
2730		/* Allow to reuse address for non-random ports. */
2731		result = open_socket(sockmgr, localaddr,
2732				     ISC_SOCKET_REUSEADDRESS, &sock);
2733
2734		if (result == ISC_R_SUCCESS)
2735			*sockp = sock;
2736
2737		return (result);
2738	}
2739
2740	memset(held, 0, sizeof(held));
2741	i = 0;
2742
2743	for (j = 0; j < 0xffffU; j++) {
2744		result = open_socket(sockmgr, localaddr, 0, &sock);
2745		if (result != ISC_R_SUCCESS)
2746			goto end;
2747		else if (!anyport)
2748			break;
2749		else if (portavailable(mgr, sock, NULL))
2750			break;
2751		if (held[i] != NULL)
2752			isc_socket_detach(&held[i]);
2753		held[i++] = sock;
2754		sock = NULL;
2755		if (i == DNS_DISPATCH_HELD)
2756			i = 0;
2757	}
2758	if (j == 0xffffU) {
2759		mgr_log(mgr, ISC_LOG_ERROR,
2760			"avoid-v%s-udp-ports: unable to allocate "
2761			"an available port",
2762			isc_sockaddr_pf(localaddr) == AF_INET ? "4" : "6");
2763		result = ISC_R_FAILURE;
2764		goto end;
2765	}
2766	*sockp = sock;
2767
2768end:
2769	for (i = 0; i < DNS_DISPATCH_HELD; i++) {
2770		if (held[i] != NULL)
2771			isc_socket_detach(&held[i]);
2772	}
2773
2774	return (result);
2775}
2776
2777static isc_result_t
2778dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2779		   isc_taskmgr_t *taskmgr,
2780		   isc_sockaddr_t *localaddr,
2781		   unsigned int maxrequests,
2782		   unsigned int attributes,
2783		   dns_dispatch_t **dispp)
2784{
2785	isc_result_t result;
2786	dns_dispatch_t *disp;
2787	isc_socket_t *sock = NULL;
2788	int i = 0;
2789
2790	/*
2791	 * dispatch_allocate() checks mgr for us.
2792	 */
2793	disp = NULL;
2794	result = dispatch_allocate(mgr, maxrequests, &disp);
2795	if (result != ISC_R_SUCCESS)
2796		return (result);
2797
2798	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0) {
2799		result = get_udpsocket(mgr, disp, sockmgr, localaddr, &sock);
2800		if (result != ISC_R_SUCCESS)
2801			goto deallocate_dispatch;
2802	} else {
2803		isc_sockaddr_t sa_any;
2804
2805		/*
2806		 * For dispatches using exclusive sockets with a specific
2807		 * source address, we only check if the specified address is
2808		 * available on the system.  Query sockets will be created later
2809		 * on demand.
2810		 */
2811		isc_sockaddr_anyofpf(&sa_any, isc_sockaddr_pf(localaddr));
2812		if (!isc_sockaddr_eqaddr(&sa_any, localaddr)) {
2813			result = open_socket(sockmgr, localaddr, 0, &sock);
2814			if (sock != NULL)
2815				isc_socket_detach(&sock);
2816			if (result != ISC_R_SUCCESS)
2817				goto deallocate_dispatch;
2818		}
2819
2820		disp->port_table = isc_mem_get(mgr->mctx,
2821					       sizeof(disp->port_table[0]) *
2822					       DNS_DISPATCH_PORTTABLESIZE);
2823		if (disp->port_table == NULL)
2824			goto deallocate_dispatch;
2825		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
2826			ISC_LIST_INIT(disp->port_table[i]);
2827
2828		result = isc_mempool_create(mgr->mctx, sizeof(dispportentry_t),
2829					    &disp->portpool);
2830		if (result != ISC_R_SUCCESS)
2831			goto deallocate_dispatch;
2832		isc_mempool_setname(disp->portpool, "disp_portpool");
2833		isc_mempool_setfreemax(disp->portpool, 128);
2834	}
2835	disp->socktype = isc_sockettype_udp;
2836	disp->socket = sock;
2837	disp->local = *localaddr;
2838
2839	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
2840		disp->ntasks = MAX_INTERNAL_TASKS;
2841	else
2842		disp->ntasks = 1;
2843	for (i = 0; i < disp->ntasks; i++) {
2844		disp->task[i] = NULL;
2845		result = isc_task_create(taskmgr, 0, &disp->task[i]);
2846		if (result != ISC_R_SUCCESS) {
2847			while (--i >= 0)
2848				isc_task_destroy(&disp->task[i]);
2849			goto kill_socket;
2850		}
2851		isc_task_setname(disp->task[i], "udpdispatch", disp);
2852	}
2853
2854	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
2855					    DNS_EVENT_DISPATCHCONTROL,
2856					    destroy_disp, disp,
2857					    sizeof(isc_event_t));
2858	if (disp->ctlevent == NULL) {
2859		result = ISC_R_NOMEMORY;
2860		goto kill_task;
2861	}
2862
2863	attributes &= ~DNS_DISPATCHATTR_TCP;
2864	attributes |= DNS_DISPATCHATTR_UDP;
2865	disp->attributes = attributes;
2866
2867	/*
2868	 * Append it to the dispatcher list.
2869	 */
2870	ISC_LIST_APPEND(mgr->list, disp, link);
2871
2872	mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp);
2873	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); /* XXX */
2874	if (disp->socket != NULL)
2875		dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
2876
2877	*dispp = disp;
2878	return (result);
2879
2880	/*
2881	 * Error returns.
2882	 */
2883 kill_task:
2884	for (i = 0; i < disp->ntasks; i++)
2885		isc_task_detach(&disp->task[i]);
2886 kill_socket:
2887	if (disp->socket != NULL)
2888		isc_socket_detach(&disp->socket);
2889 deallocate_dispatch:
2890	dispatch_free(&disp);
2891
2892	return (result);
2893}
2894
2895void
2896dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp) {
2897	REQUIRE(VALID_DISPATCH(disp));
2898	REQUIRE(dispp != NULL && *dispp == NULL);
2899
2900	LOCK(&disp->lock);
2901	disp->refcount++;
2902	UNLOCK(&disp->lock);
2903
2904	*dispp = disp;
2905}
2906
2907/*
2908 * It is important to lock the manager while we are deleting the dispatch,
2909 * since dns_dispatch_getudp will call dispatch_find, which returns to
2910 * the caller a dispatch but does not attach to it until later.  _getudp
2911 * locks the manager, however, so locking it here will keep us from attaching
2912 * to a dispatcher that is in the process of going away.
2913 */
2914void
2915dns_dispatch_detach(dns_dispatch_t **dispp) {
2916	dns_dispatch_t *disp;
2917	dispsocket_t *dispsock;
2918	isc_boolean_t killit;
2919
2920	REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp));
2921
2922	disp = *dispp;
2923	*dispp = NULL;
2924
2925	LOCK(&disp->lock);
2926
2927	INSIST(disp->refcount > 0);
2928	disp->refcount--;
2929	killit = ISC_FALSE;
2930	if (disp->refcount == 0) {
2931		if (disp->recv_pending > 0)
2932			isc_socket_cancel(disp->socket, disp->task[0],
2933					  ISC_SOCKCANCEL_RECV);
2934		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
2935		     dispsock != NULL;
2936		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
2937			isc_socket_cancel(dispsock->socket, dispsock->task,
2938					  ISC_SOCKCANCEL_RECV);
2939		}
2940		disp->shutting_down = 1;
2941	}
2942
2943	dispatch_log(disp, LVL(90), "detach: refcount %d", disp->refcount);
2944
2945	killit = destroy_disp_ok(disp);
2946	UNLOCK(&disp->lock);
2947	if (killit)
2948		isc_task_send(disp->task[0], &disp->ctlevent);
2949}
2950
2951isc_result_t
2952dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
2953			  isc_task_t *task, isc_taskaction_t action, void *arg,
2954			  dns_messageid_t *idp, dns_dispentry_t **resp,
2955			  isc_socketmgr_t *sockmgr)
2956{
2957	dns_dispentry_t *res;
2958	unsigned int bucket;
2959	in_port_t localport = 0;
2960	dns_messageid_t id;
2961	int i;
2962	isc_boolean_t ok;
2963	dns_qid_t *qid;
2964	dispsocket_t *dispsocket = NULL;
2965	isc_result_t result;
2966
2967	REQUIRE(VALID_DISPATCH(disp));
2968	REQUIRE(task != NULL);
2969	REQUIRE(dest != NULL);
2970	REQUIRE(resp != NULL && *resp == NULL);
2971	REQUIRE(idp != NULL);
2972	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
2973		REQUIRE(sockmgr != NULL);
2974
2975	LOCK(&disp->lock);
2976
2977	if (disp->shutting_down == 1) {
2978		UNLOCK(&disp->lock);
2979		return (ISC_R_SHUTTINGDOWN);
2980	}
2981
2982	if (disp->requests >= disp->maxrequests) {
2983		UNLOCK(&disp->lock);
2984		return (ISC_R_QUOTA);
2985	}
2986
2987	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
2988	    disp->nsockets > DNS_DISPATCH_SOCKSQUOTA) {
2989		dispsocket_t *oldestsocket;
2990		dns_dispentry_t *oldestresp;
2991		dns_dispatchevent_t *rev;
2992
2993		/*
2994		 * Kill oldest outstanding query if the number of sockets
2995		 * exceeds the quota to keep the room for new queries.
2996		 */
2997		oldestsocket = ISC_LIST_HEAD(disp->activesockets);
2998		oldestresp = oldestsocket->resp;
2999		if (oldestresp != NULL && !oldestresp->item_out) {
3000			rev = allocate_event(oldestresp->disp);
3001			if (rev != NULL) {
3002				rev->buffer.base = NULL;
3003				rev->result = ISC_R_CANCELED;
3004				rev->id = oldestresp->id;
3005				ISC_EVENT_INIT(rev, sizeof(*rev), 0,
3006					       NULL, DNS_EVENT_DISPATCH,
3007					       oldestresp->action,
3008					       oldestresp->arg, oldestresp,
3009					       NULL, NULL);
3010				oldestresp->item_out = ISC_TRUE;
3011				isc_task_send(oldestresp->task,
3012					      ISC_EVENT_PTR(&rev));
3013				inc_stats(disp->mgr,
3014					  dns_resstatscounter_dispabort);
3015			}
3016		}
3017
3018		/*
3019		 * Move this entry to the tail so that it won't (easily) be
3020		 * examined before actually being canceled.
3021		 */
3022		ISC_LIST_UNLINK(disp->activesockets, oldestsocket, link);
3023		ISC_LIST_APPEND(disp->activesockets, oldestsocket, link);
3024	}
3025
3026	qid = DNS_QID(disp);
3027	LOCK(&qid->lock);
3028
3029	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
3030		/*
3031		 * Get a separate UDP socket with a random port number.
3032		 */
3033		result = get_dispsocket(disp, dest, sockmgr, qid, &dispsocket,
3034					&localport);
3035		if (result != ISC_R_SUCCESS) {
3036			UNLOCK(&qid->lock);
3037			UNLOCK(&disp->lock);
3038			inc_stats(disp->mgr, dns_resstatscounter_dispsockfail);
3039			return (result);
3040		}
3041	} else {
3042		localport = disp->localport;
3043	}
3044
3045	/*
3046	 * Try somewhat hard to find an unique ID.
3047	 */
3048	id = (dns_messageid_t)dispatch_arc4random(DISP_ARC4CTX(disp));
3049	bucket = dns_hash(qid, dest, id, localport);
3050	ok = ISC_FALSE;
3051	for (i = 0; i < 64; i++) {
3052		if (entry_search(qid, dest, id, localport, bucket) == NULL) {
3053			ok = ISC_TRUE;
3054			break;
3055		}
3056		id += qid->qid_increment;
3057		id &= 0x0000ffff;
3058		bucket = dns_hash(qid, dest, id, localport);
3059	}
3060
3061	if (!ok) {
3062		UNLOCK(&qid->lock);
3063		UNLOCK(&disp->lock);
3064		return (ISC_R_NOMORE);
3065	}
3066
3067	res = isc_mempool_get(disp->mgr->rpool);
3068	if (res == NULL) {
3069		UNLOCK(&qid->lock);
3070		UNLOCK(&disp->lock);
3071		if (dispsocket != NULL)
3072			destroy_dispsocket(disp, &dispsocket);
3073		return (ISC_R_NOMEMORY);
3074	}
3075
3076	disp->refcount++;
3077	disp->requests++;
3078	res->task = NULL;
3079	isc_task_attach(task, &res->task);
3080	res->disp = disp;
3081	res->id = id;
3082	res->port = localport;
3083	res->bucket = bucket;
3084	res->host = *dest;
3085	res->action = action;
3086	res->arg = arg;
3087	res->dispsocket = dispsocket;
3088	if (dispsocket != NULL)
3089		dispsocket->resp = res;
3090	res->item_out = ISC_FALSE;
3091	ISC_LIST_INIT(res->items);
3092	ISC_LINK_INIT(res, link);
3093	res->magic = RESPONSE_MAGIC;
3094	ISC_LIST_APPEND(qid->qid_table[bucket], res, link);
3095	UNLOCK(&qid->lock);
3096
3097	request_log(disp, res, LVL(90),
3098		    "attached to task %p", res->task);
3099
3100	if (((disp->attributes & DNS_DISPATCHATTR_UDP) != 0) ||
3101	    ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) {
3102		result = startrecv(disp, dispsocket);
3103		if (result != ISC_R_SUCCESS) {
3104			LOCK(&qid->lock);
3105			ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
3106			UNLOCK(&qid->lock);
3107
3108			if (dispsocket != NULL)
3109				destroy_dispsocket(disp, &dispsocket);
3110
3111			disp->refcount--;
3112			disp->requests--;
3113
3114			UNLOCK(&disp->lock);
3115			isc_task_detach(&res->task);
3116			isc_mempool_put(disp->mgr->rpool, res);
3117			return (result);
3118		}
3119	}
3120
3121	if (dispsocket != NULL)
3122		ISC_LIST_APPEND(disp->activesockets, dispsocket, link);
3123
3124	UNLOCK(&disp->lock);
3125
3126	*idp = id;
3127	*resp = res;
3128
3129	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3130		INSIST(res->dispsocket != NULL);
3131
3132	return (ISC_R_SUCCESS);
3133}
3134
3135isc_result_t
3136dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
3137			 isc_task_t *task, isc_taskaction_t action, void *arg,
3138			 dns_messageid_t *idp, dns_dispentry_t **resp)
3139{
3140	REQUIRE(VALID_DISPATCH(disp));
3141	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3142
3143	return (dns_dispatch_addresponse2(disp, dest, task, action, arg,
3144					  idp, resp, NULL));
3145}
3146
3147void
3148dns_dispatch_starttcp(dns_dispatch_t *disp) {
3149
3150	REQUIRE(VALID_DISPATCH(disp));
3151
3152	dispatch_log(disp, LVL(90), "starttcp %p", disp->task[0]);
3153
3154	LOCK(&disp->lock);
3155	disp->attributes |= DNS_DISPATCHATTR_CONNECTED;
3156	(void)startrecv(disp, NULL);
3157	UNLOCK(&disp->lock);
3158}
3159
3160void
3161dns_dispatch_removeresponse(dns_dispentry_t **resp,
3162			    dns_dispatchevent_t **sockevent)
3163{
3164	dns_dispatchmgr_t *mgr;
3165	dns_dispatch_t *disp;
3166	dns_dispentry_t *res;
3167	dispsocket_t *dispsock;
3168	dns_dispatchevent_t *ev;
3169	unsigned int bucket;
3170	isc_boolean_t killit;
3171	unsigned int n;
3172	isc_eventlist_t events;
3173	dns_qid_t *qid;
3174
3175	REQUIRE(resp != NULL);
3176	REQUIRE(VALID_RESPONSE(*resp));
3177
3178	res = *resp;
3179	*resp = NULL;
3180
3181	disp = res->disp;
3182	REQUIRE(VALID_DISPATCH(disp));
3183	mgr = disp->mgr;
3184	REQUIRE(VALID_DISPATCHMGR(mgr));
3185
3186	qid = DNS_QID(disp);
3187
3188	if (sockevent != NULL) {
3189		REQUIRE(*sockevent != NULL);
3190		ev = *sockevent;
3191		*sockevent = NULL;
3192	} else {
3193		ev = NULL;
3194	}
3195
3196	LOCK(&disp->lock);
3197
3198	INSIST(disp->requests > 0);
3199	disp->requests--;
3200	INSIST(disp->refcount > 0);
3201	disp->refcount--;
3202	killit = ISC_FALSE;
3203	if (disp->refcount == 0) {
3204		if (disp->recv_pending > 0)
3205			isc_socket_cancel(disp->socket, disp->task[0],
3206					  ISC_SOCKCANCEL_RECV);
3207		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
3208		     dispsock != NULL;
3209		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
3210			isc_socket_cancel(dispsock->socket, dispsock->task,
3211					  ISC_SOCKCANCEL_RECV);
3212		}
3213		disp->shutting_down = 1;
3214	}
3215
3216	bucket = res->bucket;
3217
3218	LOCK(&qid->lock);
3219	ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
3220	UNLOCK(&qid->lock);
3221
3222	if (ev == NULL && res->item_out) {
3223		/*
3224		 * We've posted our event, but the caller hasn't gotten it
3225		 * yet.  Take it back.
3226		 */
3227		ISC_LIST_INIT(events);
3228		n = isc_task_unsend(res->task, res, DNS_EVENT_DISPATCH,
3229				    NULL, &events);
3230		/*
3231		 * We had better have gotten it back.
3232		 */
3233		INSIST(n == 1);
3234		ev = (dns_dispatchevent_t *)ISC_LIST_HEAD(events);
3235	}
3236
3237	if (ev != NULL) {
3238		REQUIRE(res->item_out == ISC_TRUE);
3239		res->item_out = ISC_FALSE;
3240		if (ev->buffer.base != NULL)
3241			free_buffer(disp, ev->buffer.base, ev->buffer.length);
3242		free_event(disp, ev);
3243	}
3244
3245	request_log(disp, res, LVL(90), "detaching from task %p", res->task);
3246	isc_task_detach(&res->task);
3247
3248	if (res->dispsocket != NULL) {
3249		isc_socket_cancel(res->dispsocket->socket,
3250				  res->dispsocket->task, ISC_SOCKCANCEL_RECV);
3251		res->dispsocket->resp = NULL;
3252	}
3253
3254	/*
3255	 * Free any buffered requests as well
3256	 */
3257	ev = ISC_LIST_HEAD(res->items);
3258	while (ev != NULL) {
3259		ISC_LIST_UNLINK(res->items, ev, ev_link);
3260		if (ev->buffer.base != NULL)
3261			free_buffer(disp, ev->buffer.base, ev->buffer.length);
3262		free_event(disp, ev);
3263		ev = ISC_LIST_HEAD(res->items);
3264	}
3265	res->magic = 0;
3266	isc_mempool_put(disp->mgr->rpool, res);
3267	if (disp->shutting_down == 1)
3268		do_cancel(disp);
3269	else
3270		(void)startrecv(disp, NULL);
3271
3272	killit = destroy_disp_ok(disp);
3273	UNLOCK(&disp->lock);
3274	if (killit)
3275		isc_task_send(disp->task[0], &disp->ctlevent);
3276}
3277
3278static void
3279do_cancel(dns_dispatch_t *disp) {
3280	dns_dispatchevent_t *ev;
3281	dns_dispentry_t *resp;
3282	dns_qid_t *qid;
3283
3284	if (disp->shutdown_out == 1)
3285		return;
3286
3287	qid = DNS_QID(disp);
3288
3289	/*
3290	 * Search for the first response handler without packets outstanding
3291	 * unless a specific hander is given.
3292	 */
3293	LOCK(&qid->lock);
3294	for (resp = linear_first(qid);
3295	     resp != NULL && resp->item_out;
3296	     /* Empty. */)
3297		resp = linear_next(qid, resp);
3298
3299	/*
3300	 * No one to send the cancel event to, so nothing to do.
3301	 */
3302	if (resp == NULL)
3303		goto unlock;
3304
3305	/*
3306	 * Send the shutdown failsafe event to this resp.
3307	 */
3308	ev = disp->failsafe_ev;
3309	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, DNS_EVENT_DISPATCH,
3310		       resp->action, resp->arg, resp, NULL, NULL);
3311	ev->result = disp->shutdown_why;
3312	ev->buffer.base = NULL;
3313	ev->buffer.length = 0;
3314	disp->shutdown_out = 1;
3315	request_log(disp, resp, LVL(10),
3316		    "cancel: failsafe event %p -> task %p",
3317		    ev, resp->task);
3318	resp->item_out = ISC_TRUE;
3319	isc_task_send(resp->task, ISC_EVENT_PTR(&ev));
3320 unlock:
3321	UNLOCK(&qid->lock);
3322}
3323
3324isc_socket_t *
3325dns_dispatch_getsocket(dns_dispatch_t *disp) {
3326	REQUIRE(VALID_DISPATCH(disp));
3327
3328	return (disp->socket);
3329}
3330
3331isc_socket_t *
3332dns_dispatch_getentrysocket(dns_dispentry_t *resp) {
3333	REQUIRE(VALID_RESPONSE(resp));
3334
3335	if (resp->dispsocket != NULL)
3336		return (resp->dispsocket->socket);
3337	else
3338		return (NULL);
3339}
3340
3341isc_result_t
3342dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) {
3343
3344	REQUIRE(VALID_DISPATCH(disp));
3345	REQUIRE(addrp != NULL);
3346
3347	if (disp->socktype == isc_sockettype_udp) {
3348		*addrp = disp->local;
3349		return (ISC_R_SUCCESS);
3350	}
3351	return (ISC_R_NOTIMPLEMENTED);
3352}
3353
3354void
3355dns_dispatch_cancel(dns_dispatch_t *disp) {
3356	REQUIRE(VALID_DISPATCH(disp));
3357
3358	LOCK(&disp->lock);
3359
3360	if (disp->shutting_down == 1) {
3361		UNLOCK(&disp->lock);
3362		return;
3363	}
3364
3365	disp->shutdown_why = ISC_R_CANCELED;
3366	disp->shutting_down = 1;
3367	do_cancel(disp);
3368
3369	UNLOCK(&disp->lock);
3370
3371	return;
3372}
3373
3374unsigned int
3375dns_dispatch_getattributes(dns_dispatch_t *disp) {
3376	REQUIRE(VALID_DISPATCH(disp));
3377
3378	/*
3379	 * We don't bother locking disp here; it's the caller's responsibility
3380	 * to use only non volatile flags.
3381	 */
3382	return (disp->attributes);
3383}
3384
3385void
3386dns_dispatch_changeattributes(dns_dispatch_t *disp,
3387			      unsigned int attributes, unsigned int mask)
3388{
3389	REQUIRE(VALID_DISPATCH(disp));
3390	/* Exclusive attribute can only be set on creation */
3391	REQUIRE((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3392	/* Also, a dispatch with randomport specified cannot start listening */
3393	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0 ||
3394		(attributes & DNS_DISPATCHATTR_NOLISTEN) == 0);
3395
3396	/* XXXMLG
3397	 * Should check for valid attributes here!
3398	 */
3399
3400	LOCK(&disp->lock);
3401
3402	if ((mask & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3403		if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0 &&
3404		    (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0) {
3405			disp->attributes &= ~DNS_DISPATCHATTR_NOLISTEN;
3406			(void)startrecv(disp, NULL);
3407		} else if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN)
3408			   == 0 &&
3409			   (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3410			disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
3411			if (disp->recv_pending != 0)
3412				isc_socket_cancel(disp->socket, disp->task[0],
3413						  ISC_SOCKCANCEL_RECV);
3414		}
3415	}
3416
3417	disp->attributes &= ~mask;
3418	disp->attributes |= (attributes & mask);
3419	UNLOCK(&disp->lock);
3420}
3421
3422void
3423dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) {
3424	void *buf;
3425	isc_socketevent_t *sevent, *newsevent;
3426
3427	REQUIRE(VALID_DISPATCH(disp));
3428	REQUIRE((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0);
3429	REQUIRE(event != NULL);
3430
3431	sevent = (isc_socketevent_t *)event;
3432
3433	INSIST(sevent->n <= disp->mgr->buffersize);
3434	newsevent = (isc_socketevent_t *)
3435		    isc_event_allocate(disp->mgr->mctx, NULL,
3436				      DNS_EVENT_IMPORTRECVDONE, udp_shrecv,
3437				      disp, sizeof(isc_socketevent_t));
3438	if (newsevent == NULL)
3439		return;
3440
3441	buf = allocate_udp_buffer(disp);
3442	if (buf == NULL) {
3443		isc_event_free(ISC_EVENT_PTR(&newsevent));
3444		return;
3445	}
3446	memcpy(buf, sevent->region.base, sevent->n);
3447	newsevent->region.base = buf;
3448	newsevent->region.length = disp->mgr->buffersize;
3449	newsevent->n = sevent->n;
3450	newsevent->result = sevent->result;
3451	newsevent->address = sevent->address;
3452	newsevent->timestamp = sevent->timestamp;
3453	newsevent->pktinfo = sevent->pktinfo;
3454	newsevent->attributes = sevent->attributes;
3455
3456	isc_task_send(disp->task[0], ISC_EVENT_PTR(&newsevent));
3457}
3458
#if 0
/*
 * Debugging aid (compiled out): print one line per dispatch on the
 * manager's list, giving the dispatch pointer and its formatted local
 * address, to stdout.
 */
void
dns_dispatchmgr_dump(dns_dispatchmgr_t *mgr) {
	dns_dispatch_t *disp;
	char addrbuf[1024];

	for (disp = ISC_LIST_HEAD(mgr->list);
	     disp != NULL;
	     disp = ISC_LIST_NEXT(disp, link)) {
		isc_sockaddr_format(&disp->local, addrbuf, sizeof(addrbuf));
		/* %p requires a pointer to void, hence the cast. */
		printf("\tdispatch %p, addr %s\n", (void *)disp, addrbuf);
	}
}
#endif
3473