dispatch.c revision 224092
1/*
2 * Copyright (C) 2004-2009, 2011  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2003  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: dispatch.c,v 1.168.248.1.2.1 2011-06-02 23:47:34 tbox Exp $ */
19
20/*! \file */
21
22#include <config.h>
23
24#include <stdlib.h>
25#include <sys/types.h>
26#include <unistd.h>
27#include <stdlib.h>
28
29#include <isc/entropy.h>
30#include <isc/mem.h>
31#include <isc/mutex.h>
32#include <isc/portset.h>
33#include <isc/print.h>
34#include <isc/random.h>
35#include <isc/stats.h>
36#include <isc/string.h>
37#include <isc/task.h>
38#include <isc/time.h>
39#include <isc/util.h>
40
41#include <dns/acl.h>
42#include <dns/dispatch.h>
43#include <dns/events.h>
44#include <dns/log.h>
45#include <dns/message.h>
46#include <dns/portlist.h>
47#include <dns/stats.h>
48#include <dns/tcpmsg.h>
49#include <dns/types.h>
50
51typedef ISC_LIST(dns_dispentry_t)	dns_displist_t;
52
53typedef struct dispsocket		dispsocket_t;
54typedef ISC_LIST(dispsocket_t)		dispsocketlist_t;
55
56typedef struct dispportentry		dispportentry_t;
57typedef ISC_LIST(dispportentry_t)	dispportlist_t;
58
59/* ARC4 Random generator state */
60typedef struct arc4ctx {
61	isc_uint8_t	i;
62	isc_uint8_t	j;
63	isc_uint8_t	s[256];
64	int		count;
65	isc_entropy_t	*entropy;	/*%< entropy source for ARC4 */
66	isc_mutex_t	*lock;
67} arc4ctx_t;
68
69typedef struct dns_qid {
70	unsigned int	magic;
71	unsigned int	qid_nbuckets;	/*%< hash table size */
72	unsigned int	qid_increment;	/*%< id increment on collision */
73	isc_mutex_t	lock;
74	dns_displist_t	*qid_table;	/*%< the table itself */
75	dispsocketlist_t *sock_table;	/*%< socket table */
76} dns_qid_t;
77
78struct dns_dispatchmgr {
79	/* Unlocked. */
80	unsigned int			magic;
81	isc_mem_t		       *mctx;
82	dns_acl_t		       *blackhole;
83	dns_portlist_t		       *portlist;
84	isc_stats_t		       *stats;
85	isc_entropy_t		       *entropy; /*%< entropy source */
86
87	/* Locked by "lock". */
88	isc_mutex_t			lock;
89	unsigned int			state;
90	ISC_LIST(dns_dispatch_t)	list;
91
92	/* Locked by arc4_lock. */
93	isc_mutex_t			arc4_lock;
94	arc4ctx_t			arc4ctx;    /*%< ARC4 context for QID */
95
96	/* locked by buffer lock */
97	dns_qid_t			*qid;
98	isc_mutex_t			buffer_lock;
99	unsigned int			buffers;    /*%< allocated buffers */
100	unsigned int			buffersize; /*%< size of each buffer */
101	unsigned int			maxbuffers; /*%< max buffers */
102
103	/* Locked internally. */
104	isc_mutex_t			pool_lock;
105	isc_mempool_t		       *epool;	/*%< memory pool for events */
106	isc_mempool_t		       *rpool;	/*%< memory pool for replies */
107	isc_mempool_t		       *dpool;  /*%< dispatch allocations */
108	isc_mempool_t		       *bpool;	/*%< memory pool for buffers */
109	isc_mempool_t		       *spool;	/*%< memory pool for dispsocs */
110
111	/*%
112	 * Locked by qid->lock if qid exists; otherwise, can be used without
113	 * being locked.
114	 * Memory footprint considerations: this is a simple implementation of
115	 * available ports, i.e., an ordered array of the actual port numbers.
116	 * This will require about 256KB of memory in the worst case (128KB for
117	 * each of IPv4 and IPv6).  We could reduce it by representing it as a
118	 * more sophisticated way such as a list (or array) of ranges that are
119	 * searched to identify a specific port.  Our decision here is the saved
120	 * memory isn't worth the implementation complexity, considering the
121	 * fact that the whole BIND9 process (which is mainly named) already
122	 * requires a pretty large memory footprint.  We may, however, have to
123	 * revisit the decision when we want to use it as a separate module for
124	 * an environment where memory requirement is severer.
125	 */
126	in_port_t	*v4ports;	/*%< available ports for IPv4 */
127	unsigned int	nv4ports;	/*%< # of available ports for IPv4 */
128	in_port_t	*v6ports;	/*%< available ports for IPv4 */
129	unsigned int	nv6ports;	/*%< # of available ports for IPv4 */
130};
131
/* Bit in dns_dispatchmgr 'state', and the test for it. */
#define MGR_SHUTTINGDOWN		0x00000001U
#define MGR_IS_SHUTTINGDOWN(l)	(((l)->state & MGR_SHUTTINGDOWN) != 0)

/* True when 'attributes' of (d) has DNS_DISPATCHATTR_PRIVATE set. */
#define IS_PRIVATE(d)	(((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
136
137struct dns_dispentry {
138	unsigned int			magic;
139	dns_dispatch_t		       *disp;
140	dns_messageid_t			id;
141	in_port_t			port;
142	unsigned int			bucket;
143	isc_sockaddr_t			host;
144	isc_task_t		       *task;
145	isc_taskaction_t		action;
146	void			       *arg;
147	isc_boolean_t			item_out;
148	dispsocket_t			*dispsocket;
149	ISC_LIST(dns_dispatchevent_t)	items;
150	ISC_LINK(dns_dispentry_t)	link;
151};
152
/*%
 * Upper limit on the number of dispatch sockets kept pooled for reuse.
 * The right value may vary, but experiments have shown that a busy caching
 * server may need more than 1000 sockets open concurrently.  The maximum
 * allowable number of dispatch sockets (per manager) will be set to twice
 * this value.
 */
#ifndef DNS_DISPATCH_POOLSOCKS
#define DNS_DISPATCH_POOLSOCKS			2048
#endif

/*%
 * Quota on the number of dispatch sockets.  Once a dispatch holds more
 * sockets than this, new queries purge the oldest ones, so that a massive
 * number of outstanding queries cannot block subsequent queries
 * (especially when the older ones are slow and end in a timeout).
 */
#ifndef DNS_DISPATCH_SOCKSQUOTA
#define DNS_DISPATCH_SOCKSQUOTA			3072
#endif
173
174struct dispsocket {
175	unsigned int			magic;
176	isc_socket_t			*socket;
177	dns_dispatch_t			*disp;
178	isc_sockaddr_t			host;
179	in_port_t			localport; /* XXX: should be removed later */
180	dispportentry_t			*portentry;
181	dns_dispentry_t			*resp;
182	isc_task_t			*task;
183	ISC_LINK(dispsocket_t)		link;
184	unsigned int			bucket;
185	ISC_LINK(dispsocket_t)		blink;
186};
187
188/*%
189 * A port table entry.  We remember every port we first open in a table with a
190 * reference counter so that we can 'reuse' the same port (with different
191 * destination addresses) using the SO_REUSEADDR socket option.
192 */
193struct dispportentry {
194	in_port_t			port;
195	unsigned int			refs;
196	ISC_LINK(struct dispportentry)	link;
197};
198
/* Number of buckets in the per-dispatch port table. */
#ifndef DNS_DISPATCH_PORTTABLESIZE
#define DNS_DISPATCH_PORTTABLESIZE	1024
#endif

#define INVALID_BUCKET		(0xffffdead)

/*%
 * Number of tasks per dispatch that uses separate sockets for different
 * transactions.  This must be a power of 2, because 32 bit random numbers
 * are reduced by it to obtain a uniformly random task selection.  See
 * get_dispsocket().
 */
#define MAX_INTERNAL_TASKS	64
211
212struct dns_dispatch {
213	/* Unlocked. */
214	unsigned int		magic;		/*%< magic */
215	dns_dispatchmgr_t      *mgr;		/*%< dispatch manager */
216	int			ntasks;
217	/*%
218	 * internal task buckets.  We use multiple tasks to distribute various
219	 * socket events well when using separate dispatch sockets.  We use the
220	 * 1st task (task[0]) for internal control events.
221	 */
222	isc_task_t	       *task[MAX_INTERNAL_TASKS];
223	isc_socket_t	       *socket;		/*%< isc socket attached to */
224	isc_sockaddr_t		local;		/*%< local address */
225	in_port_t		localport;	/*%< local UDP port */
226	unsigned int		maxrequests;	/*%< max requests */
227	isc_event_t	       *ctlevent;
228
229	/*% Locked by mgr->lock. */
230	ISC_LINK(dns_dispatch_t) link;
231
232	/* Locked by "lock". */
233	isc_mutex_t		lock;		/*%< locks all below */
234	isc_sockettype_t	socktype;
235	unsigned int		attributes;
236	unsigned int		refcount;	/*%< number of users */
237	dns_dispatchevent_t    *failsafe_ev;	/*%< failsafe cancel event */
238	unsigned int		shutting_down : 1,
239				shutdown_out : 1,
240				connected : 1,
241				tcpmsg_valid : 1,
242				recv_pending : 1; /*%< is a recv() pending? */
243	isc_result_t		shutdown_why;
244	ISC_LIST(dispsocket_t)	activesockets;
245	ISC_LIST(dispsocket_t)	inactivesockets;
246	unsigned int		nsockets;
247	unsigned int		requests;	/*%< how many requests we have */
248	unsigned int		tcpbuffers;	/*%< allocated buffers */
249	dns_tcpmsg_t		tcpmsg;		/*%< for tcp streams */
250	dns_qid_t		*qid;
251	arc4ctx_t		arc4ctx;	/*%< for QID/UDP port num */
252	dispportlist_t		*port_table;	/*%< hold ports 'owned' by us */
253	isc_mempool_t		*portpool;	/*%< port table entries  */
254};
255
/* Magic numbers of the structures in this file and their validity tests. */
#define QID_MAGIC		ISC_MAGIC('Q', 'i', 'd', ' ')
#define VALID_QID(e)		ISC_MAGIC_VALID((e), QID_MAGIC)

#define RESPONSE_MAGIC		ISC_MAGIC('D', 'r', 's', 'p')
#define VALID_RESPONSE(e)	ISC_MAGIC_VALID((e), RESPONSE_MAGIC)

#define DISPSOCK_MAGIC		ISC_MAGIC('D', 's', 'o', 'c')
#define VALID_DISPSOCK(e)	ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)

#define DISPATCH_MAGIC		ISC_MAGIC('D', 'i', 's', 'p')
#define VALID_DISPATCH(e)	ISC_MAGIC_VALID((e), DISPATCH_MAGIC)

#define DNS_DISPATCHMGR_MAGIC	ISC_MAGIC('D', 'M', 'g', 'r')
#define VALID_DISPATCHMGR(e)	ISC_MAGIC_VALID((e), DNS_DISPATCHMGR_MAGIC)

/*%
 * Select the QID table for a dispatch: its own table for a TCP dispatch,
 * the manager's table otherwise.  The whole expansion is parenthesized so
 * that the macro can be used inside a larger expression.
 */
#define DNS_QID(disp) (((disp)->socktype == isc_sockettype_tcp) ? \
		       (disp)->qid : (disp)->mgr->qid)

/*%
 * Select the ARC4 context for a dispatch: its own context for a UDP
 * dispatch, the manager's context otherwise.  Fully parenthesized for the
 * same reason as DNS_QID().
 */
#define DISP_ARC4CTX(disp) (((disp)->socktype == isc_sockettype_udp) ? \
			    (&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx))

/*%
 * Locking a query port buffer is a bit tricky.  We access the buffer without
 * locking until qid is created.  Technically, there is a possibility of race
 * between the creation of qid and access to the port buffer; in practice,
 * however, this should be safe because qid isn't created until the first
 * dispatch is created and there should be no contending situation until then.
 *
 * Both macros are wrapped in do { } while (0) so that each behaves as a
 * single statement (no dangling-else hazard); use them with a trailing ';'.
 */
#define PORTBUFLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			LOCK(&((mgr)->qid->lock)); \
	} while (0)
#define PORTBUFUNLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			UNLOCK((&(mgr)->qid->lock)); \
	} while (0)
285
286/*
287 * Statics.
288 */
289static dns_dispentry_t *entry_search(dns_qid_t *, isc_sockaddr_t *,
290				     dns_messageid_t, in_port_t, unsigned int);
291static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
292static void destroy_disp(isc_task_t *task, isc_event_t *event);
293static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
294static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
295static void udp_exrecv(isc_task_t *, isc_event_t *);
296static void udp_shrecv(isc_task_t *, isc_event_t *);
297static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
298static void tcp_recv(isc_task_t *, isc_event_t *);
299static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
300static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
301			     in_port_t);
302static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
303static void *allocate_udp_buffer(dns_dispatch_t *disp);
304static inline void free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev);
305static inline dns_dispatchevent_t *allocate_event(dns_dispatch_t *disp);
306static void do_cancel(dns_dispatch_t *disp);
307static dns_dispentry_t *linear_first(dns_qid_t *disp);
308static dns_dispentry_t *linear_next(dns_qid_t *disp,
309				    dns_dispentry_t *resp);
310static void dispatch_free(dns_dispatch_t **dispp);
311static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
312				  dns_dispatch_t *disp,
313				  isc_socketmgr_t *sockmgr,
314				  isc_sockaddr_t *localaddr,
315				  isc_socket_t **sockp);
316static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
317				       isc_socketmgr_t *sockmgr,
318				       isc_taskmgr_t *taskmgr,
319				       isc_sockaddr_t *localaddr,
320				       unsigned int maxrequests,
321				       unsigned int attributes,
322				       dns_dispatch_t **dispp);
323static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
324static void destroy_mgr(dns_dispatchmgr_t **mgrp);
325static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
326				 unsigned int increment, dns_qid_t **qidp,
327				 isc_boolean_t needaddrtable);
328static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
329static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
330				unsigned int options, isc_socket_t **sockp);
331static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
332				   isc_sockaddr_t *sockaddrp);
333
334#define LVL(x) ISC_LOG_DEBUG(x)
335
336static void
337mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...)
338     ISC_FORMAT_PRINTF(3, 4);
339
340static void
341mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...) {
342	char msgbuf[2048];
343	va_list ap;
344
345	if (! isc_log_wouldlog(dns_lctx, level))
346		return;
347
348	va_start(ap, fmt);
349	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
350	va_end(ap);
351
352	isc_log_write(dns_lctx,
353		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
354		      level, "dispatchmgr %p: %s", mgr, msgbuf);
355}
356
357static inline void
358inc_stats(dns_dispatchmgr_t *mgr, isc_statscounter_t counter) {
359	if (mgr->stats != NULL)
360		isc_stats_increment(mgr->stats, counter);
361}
362
363static void
364dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...)
365     ISC_FORMAT_PRINTF(3, 4);
366
367static void
368dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
369	char msgbuf[2048];
370	va_list ap;
371
372	if (! isc_log_wouldlog(dns_lctx, level))
373		return;
374
375	va_start(ap, fmt);
376	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
377	va_end(ap);
378
379	isc_log_write(dns_lctx,
380		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
381		      level, "dispatch %p: %s", disp, msgbuf);
382}
383
384static void
385request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
386	    int level, const char *fmt, ...)
387     ISC_FORMAT_PRINTF(4, 5);
388
389static void
390request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
391	    int level, const char *fmt, ...)
392{
393	char msgbuf[2048];
394	char peerbuf[256];
395	va_list ap;
396
397	if (! isc_log_wouldlog(dns_lctx, level))
398		return;
399
400	va_start(ap, fmt);
401	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
402	va_end(ap);
403
404	if (VALID_RESPONSE(resp)) {
405		isc_sockaddr_format(&resp->host, peerbuf, sizeof(peerbuf));
406		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
407			      DNS_LOGMODULE_DISPATCH, level,
408			      "dispatch %p response %p %s: %s", disp, resp,
409			      peerbuf, msgbuf);
410	} else {
411		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
412			      DNS_LOGMODULE_DISPATCH, level,
413			      "dispatch %p req/resp %p: %s", disp, resp,
414			      msgbuf);
415	}
416}
417
418/*%
419 * ARC4 random number generator derived from OpenBSD.
420 * Only dispatch_random() and dispatch_uniformrandom() are expected
421 * to be called from general dispatch routines; the rest of them are subroutines
422 * for these two.
423 *
424 * The original copyright follows:
425 * Copyright (c) 1996, David Mazieres <dm@uun.org>
426 * Copyright (c) 2008, Damien Miller <djm@openbsd.org>
427 *
428 * Permission to use, copy, modify, and distribute this software for any
429 * purpose with or without fee is hereby granted, provided that the above
430 * copyright notice and this permission notice appear in all copies.
431 *
432 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
433 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
434 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
435 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
436 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
437 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
438 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
439 */
440#ifdef BIND9
441static void
442dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
443		    isc_mutex_t *lock)
444{
445	int n;
446	for (n = 0; n < 256; n++)
447		actx->s[n] = n;
448	actx->i = 0;
449	actx->j = 0;
450	actx->count = 0;
451	actx->entropy = entropy; /* don't have to attach */
452	actx->lock = lock;
453}
454
455static void
456dispatch_arc4addrandom(arc4ctx_t *actx, unsigned char *dat, int datlen) {
457	int n;
458	isc_uint8_t si;
459
460	actx->i--;
461	for (n = 0; n < 256; n++) {
462		actx->i = (actx->i + 1);
463		si = actx->s[actx->i];
464		actx->j = (actx->j + si + dat[n % datlen]);
465		actx->s[actx->i] = actx->s[actx->j];
466		actx->s[actx->j] = si;
467	}
468	actx->j = actx->i;
469}
470
471static inline isc_uint8_t
472dispatch_arc4get8(arc4ctx_t *actx) {
473	isc_uint8_t si, sj;
474
475	actx->i = (actx->i + 1);
476	si = actx->s[actx->i];
477	actx->j = (actx->j + si);
478	sj = actx->s[actx->j];
479	actx->s[actx->i] = sj;
480	actx->s[actx->j] = si;
481
482	return (actx->s[(si + sj) & 0xff]);
483}
484
485static inline isc_uint16_t
486dispatch_arc4get16(arc4ctx_t *actx) {
487	isc_uint16_t val;
488
489	val = dispatch_arc4get8(actx) << 8;
490	val |= dispatch_arc4get8(actx);
491
492	return (val);
493}
494
495static void
496dispatch_arc4stir(arc4ctx_t *actx) {
497	int i;
498	union {
499		unsigned char rnd[128];
500		isc_uint32_t rnd32[32];
501	} rnd;
502	isc_result_t result;
503
504	if (actx->entropy != NULL) {
505		/*
506		 * We accept any quality of random data to avoid blocking.
507		 */
508		result = isc_entropy_getdata(actx->entropy, rnd.rnd,
509					     sizeof(rnd), NULL, 0);
510		RUNTIME_CHECK(result == ISC_R_SUCCESS);
511	} else {
512		for (i = 0; i < 32; i++)
513			isc_random_get(&rnd.rnd32[i]);
514	}
515	dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd));
516
517	/*
518	 * Discard early keystream, as per recommendations in:
519	 * http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps
520	 */
521	for (i = 0; i < 256; i++)
522		(void)dispatch_arc4get8(actx);
523
524	/*
525	 * Derived from OpenBSD's implementation.  The rationale is not clear,
526	 * but should be conservative enough in safety, and reasonably large
527	 * for efficiency.
528	 */
529	actx->count = 1600000;
530}
531
532static isc_uint16_t
533dispatch_random(arc4ctx_t *actx) {
534	isc_uint16_t result;
535
536	if (actx->lock != NULL)
537		LOCK(actx->lock);
538
539	actx->count -= sizeof(isc_uint16_t);
540	if (actx->count <= 0)
541		dispatch_arc4stir(actx);
542	result = dispatch_arc4get16(actx);
543
544	if (actx->lock != NULL)
545		UNLOCK(actx->lock);
546
547	return (result);
548}
549#else
550/*
551 * For general purpose library, we don't have to be too strict about the
552 * quality of random values.  Performance doesn't matter much, either.
553 * So we simply use the isc_random module to keep the library as small as
554 * possible.
555 */
556
557static void
558dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
559		    isc_mutex_t *lock)
560{
561	UNUSED(actx);
562	UNUSED(entropy);
563	UNUSED(lock);
564
565	return;
566}
567
568static isc_uint16_t
569dispatch_random(arc4ctx_t *actx) {
570	isc_uint32_t r;
571
572	UNUSED(actx);
573
574	isc_random_get(&r);
575	return (r & 0xffff);
576}
577#endif	/* BIND9 */
578
579static isc_uint16_t
580dispatch_uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) {
581	isc_uint16_t min, r;
582
583	if (upper_bound < 2)
584		return (0);
585
586	/*
587	 * Ensure the range of random numbers [min, 0xffff] be a multiple of
588	 * upper_bound and contain at least a half of the 16 bit range.
589	 */
590
591	if (upper_bound > 0x8000)
592		min = 1 + ~upper_bound; /* 0x8000 - upper_bound */
593	else
594		min = (isc_uint16_t)(0x10000 % (isc_uint32_t)upper_bound);
595
596	/*
597	 * This could theoretically loop forever but each retry has
598	 * p > 0.5 (worst case, usually far better) of selecting a
599	 * number inside the range we need, so it should rarely need
600	 * to re-roll.
601	 */
602	for (;;) {
603		r = dispatch_random(actx);
604		if (r >= min)
605			break;
606	}
607
608	return (r % upper_bound);
609}
610
611/*
612 * Return a hash of the destination and message id.
613 */
614static isc_uint32_t
615dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
616	 in_port_t port)
617{
618	unsigned int ret;
619
620	ret = isc_sockaddr_hash(dest, ISC_TRUE);
621	ret ^= (id << 16) | port;
622	ret %= qid->qid_nbuckets;
623
624	INSIST(ret < qid->qid_nbuckets);
625
626	return (ret);
627}
628
629/*
630 * Find the first entry in 'qid'.  Returns NULL if there are no entries.
631 */
632static dns_dispentry_t *
633linear_first(dns_qid_t *qid) {
634	dns_dispentry_t *ret;
635	unsigned int bucket;
636
637	bucket = 0;
638
639	while (bucket < qid->qid_nbuckets) {
640		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
641		if (ret != NULL)
642			return (ret);
643		bucket++;
644	}
645
646	return (NULL);
647}
648
649/*
650 * Find the next entry after 'resp' in 'qid'.  Return NULL if there are
651 * no more entries.
652 */
653static dns_dispentry_t *
654linear_next(dns_qid_t *qid, dns_dispentry_t *resp) {
655	dns_dispentry_t *ret;
656	unsigned int bucket;
657
658	ret = ISC_LIST_NEXT(resp, link);
659	if (ret != NULL)
660		return (ret);
661
662	bucket = resp->bucket;
663	bucket++;
664	while (bucket < qid->qid_nbuckets) {
665		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
666		if (ret != NULL)
667			return (ret);
668		bucket++;
669	}
670
671	return (NULL);
672}
673
674/*
675 * The dispatch must be locked.
676 */
677static isc_boolean_t
678destroy_disp_ok(dns_dispatch_t *disp)
679{
680	if (disp->refcount != 0)
681		return (ISC_FALSE);
682
683	if (disp->recv_pending != 0)
684		return (ISC_FALSE);
685
686	if (!ISC_LIST_EMPTY(disp->activesockets))
687		return (ISC_FALSE);
688
689	if (disp->shutting_down == 0)
690		return (ISC_FALSE);
691
692	return (ISC_TRUE);
693}
694
695/*
696 * Called when refcount reaches 0 (and safe to destroy).
697 *
698 * The dispatcher must not be locked.
699 * The manager must be locked.
700 */
701static void
702destroy_disp(isc_task_t *task, isc_event_t *event) {
703	dns_dispatch_t *disp;
704	dns_dispatchmgr_t *mgr;
705	isc_boolean_t killmgr;
706	dispsocket_t *dispsocket;
707	int i;
708
709	INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);
710
711	UNUSED(task);
712
713	disp = event->ev_arg;
714	mgr = disp->mgr;
715
716	LOCK(&mgr->lock);
717	ISC_LIST_UNLINK(mgr->list, disp, link);
718
719	dispatch_log(disp, LVL(90),
720		     "shutting down; detaching from sock %p, task %p",
721		     disp->socket, disp->task[0]); /* XXXX */
722
723	if (disp->socket != NULL)
724		isc_socket_detach(&disp->socket);
725	while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
726		ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
727		destroy_dispsocket(disp, &dispsocket);
728	}
729	for (i = 0; i < disp->ntasks; i++)
730		isc_task_detach(&disp->task[i]);
731	isc_event_free(&event);
732
733	dispatch_free(&disp);
734
735	killmgr = destroy_mgr_ok(mgr);
736	UNLOCK(&mgr->lock);
737	if (killmgr)
738		destroy_mgr(&mgr);
739}
740
741/*%
742 * Manipulate port table per dispatch: find an entry for a given port number,
743 * create a new entry, and decrement a given entry with possible clean-up.
744 */
745static dispportentry_t *
746port_search(dns_dispatch_t *disp, in_port_t port) {
747	dispportentry_t *portentry;
748
749	REQUIRE(disp->port_table != NULL);
750
751	portentry = ISC_LIST_HEAD(disp->port_table[port %
752						   DNS_DISPATCH_PORTTABLESIZE]);
753	while (portentry != NULL) {
754		if (portentry->port == port)
755			return (portentry);
756		portentry = ISC_LIST_NEXT(portentry, link);
757	}
758
759	return (NULL);
760}
761
762static dispportentry_t *
763new_portentry(dns_dispatch_t *disp, in_port_t port) {
764	dispportentry_t *portentry;
765
766	REQUIRE(disp->port_table != NULL);
767
768	portentry = isc_mempool_get(disp->portpool);
769	if (portentry == NULL)
770		return (portentry);
771
772	portentry->port = port;
773	portentry->refs = 0;
774	ISC_LINK_INIT(portentry, link);
775	ISC_LIST_APPEND(disp->port_table[port % DNS_DISPATCH_PORTTABLESIZE],
776			portentry, link);
777
778	return (portentry);
779}
780
781/*%
782 * The caller must not hold the qid->lock.
783 */
784static void
785deref_portentry(dns_dispatch_t *disp, dispportentry_t **portentryp) {
786	dispportentry_t *portentry = *portentryp;
787	dns_qid_t *qid;
788
789	REQUIRE(disp->port_table != NULL);
790	REQUIRE(portentry != NULL && portentry->refs > 0);
791
792	qid = DNS_QID(disp);
793	LOCK(&qid->lock);
794	portentry->refs--;
795	if (portentry->refs == 0) {
796		ISC_LIST_UNLINK(disp->port_table[portentry->port %
797						 DNS_DISPATCH_PORTTABLESIZE],
798				portentry, link);
799		isc_mempool_put(disp->portpool, portentry);
800	}
801
802	*portentryp = NULL;
803	UNLOCK(&qid->lock);
804}
805
806/*%
807 * Find a dispsocket for socket address 'dest', and port number 'port'.
808 * Return NULL if no such entry exists.
809 */
810static dispsocket_t *
811socket_search(dns_qid_t *qid, isc_sockaddr_t *dest, in_port_t port,
812	      unsigned int bucket)
813{
814	dispsocket_t *dispsock;
815
816	REQUIRE(bucket < qid->qid_nbuckets);
817
818	dispsock = ISC_LIST_HEAD(qid->sock_table[bucket]);
819
820	while (dispsock != NULL) {
821		if (dispsock->portentry != NULL &&
822		    dispsock->portentry->port == port &&
823		    isc_sockaddr_equal(dest, &dispsock->host))
824			return (dispsock);
825		dispsock = ISC_LIST_NEXT(dispsock, blink);
826	}
827
828	return (NULL);
829}
830
831/*%
832 * Make a new socket for a single dispatch with a random port number.
833 * The caller must hold the disp->lock and qid->lock.
834 */
835static isc_result_t
836get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
837	       isc_socketmgr_t *sockmgr, dns_qid_t *qid,
838	       dispsocket_t **dispsockp, in_port_t *portp)
839{
840	int i;
841	isc_uint32_t r;
842	dns_dispatchmgr_t *mgr = disp->mgr;
843	isc_socket_t *sock = NULL;
844	isc_result_t result = ISC_R_FAILURE;
845	in_port_t port;
846	isc_sockaddr_t localaddr;
847	unsigned int bucket = 0;
848	dispsocket_t *dispsock;
849	unsigned int nports;
850	in_port_t *ports;
851	unsigned int bindoptions;
852	dispportentry_t *portentry = NULL;
853
854	if (isc_sockaddr_pf(&disp->local) == AF_INET) {
855		nports = disp->mgr->nv4ports;
856		ports = disp->mgr->v4ports;
857	} else {
858		nports = disp->mgr->nv6ports;
859		ports = disp->mgr->v6ports;
860	}
861	if (nports == 0)
862		return (ISC_R_ADDRNOTAVAIL);
863
864	dispsock = ISC_LIST_HEAD(disp->inactivesockets);
865	if (dispsock != NULL) {
866		ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link);
867		sock = dispsock->socket;
868		dispsock->socket = NULL;
869	} else {
870		dispsock = isc_mempool_get(mgr->spool);
871		if (dispsock == NULL)
872			return (ISC_R_NOMEMORY);
873
874		disp->nsockets++;
875		dispsock->socket = NULL;
876		dispsock->disp = disp;
877		dispsock->resp = NULL;
878		dispsock->portentry = NULL;
879		isc_random_get(&r);
880		dispsock->task = NULL;
881		isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task);
882		ISC_LINK_INIT(dispsock, link);
883		ISC_LINK_INIT(dispsock, blink);
884		dispsock->magic = DISPSOCK_MAGIC;
885	}
886
887	/*
888	 * Pick up a random UDP port and open a new socket with it.  Avoid
889	 * choosing ports that share the same destination because it will be
890	 * very likely to fail in bind(2) or connect(2).
891	 */
892	localaddr = disp->local;
893	for (i = 0; i < 64; i++) {
894		port = ports[dispatch_uniformrandom(DISP_ARC4CTX(disp),
895							nports)];
896		isc_sockaddr_setport(&localaddr, port);
897
898		bucket = dns_hash(qid, dest, 0, port);
899		if (socket_search(qid, dest, port, bucket) != NULL)
900			continue;
901		bindoptions = 0;
902		portentry = port_search(disp, port);
903		if (portentry != NULL)
904			bindoptions |= ISC_SOCKET_REUSEADDRESS;
905		result = open_socket(sockmgr, &localaddr, bindoptions, &sock);
906		if (result == ISC_R_SUCCESS) {
907			if (portentry == NULL) {
908				portentry = new_portentry(disp, port);
909				if (portentry == NULL) {
910					result = ISC_R_NOMEMORY;
911					break;
912				}
913			}
914			portentry->refs++;
915			break;
916		} else if (result != ISC_R_ADDRINUSE)
917			break;
918	}
919
920	if (result == ISC_R_SUCCESS) {
921		dispsock->socket = sock;
922		dispsock->host = *dest;
923		dispsock->portentry = portentry;
924		dispsock->bucket = bucket;
925		ISC_LIST_APPEND(qid->sock_table[bucket], dispsock, blink);
926		*dispsockp = dispsock;
927		*portp = port;
928	} else {
929		/*
930		 * We could keep it in the inactive list, but since this should
931		 * be an exceptional case and might be resource shortage, we'd
932		 * rather destroy it.
933		 */
934		if (sock != NULL)
935			isc_socket_detach(&sock);
936		destroy_dispsocket(disp, &dispsock);
937	}
938
939	return (result);
940}
941
942/*%
943 * Destroy a dedicated dispatch socket.
944 */
945static void
946destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
947	dispsocket_t *dispsock;
948	dns_qid_t *qid;
949
950	/*
951	 * The dispatch must be locked.
952	 */
953
954	REQUIRE(dispsockp != NULL && *dispsockp != NULL);
955	dispsock = *dispsockp;
956	REQUIRE(!ISC_LINK_LINKED(dispsock, link));
957
958	disp->nsockets--;
959	dispsock->magic = 0;
960	if (dispsock->portentry != NULL)
961		deref_portentry(disp, &dispsock->portentry);
962	if (dispsock->socket != NULL)
963		isc_socket_detach(&dispsock->socket);
964	if (ISC_LINK_LINKED(dispsock, blink)) {
965		qid = DNS_QID(disp);
966		LOCK(&qid->lock);
967		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
968				blink);
969		UNLOCK(&qid->lock);
970	}
971	if (dispsock->task != NULL)
972		isc_task_detach(&dispsock->task);
973	isc_mempool_put(disp->mgr->spool, dispsock);
974
975	*dispsockp = NULL;
976}
977
978/*%
979 * Deactivate a dedicated dispatch socket.  Move it to the inactive list for
980 * future reuse unless the total number of sockets are exceeding the maximum.
981 */
982static void
983deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
984	isc_result_t result;
985	dns_qid_t *qid;
986
987	/*
988	 * The dispatch must be locked.
989	 */
990	ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
991	if (dispsock->resp != NULL) {
992		INSIST(dispsock->resp->dispsocket == dispsock);
993		dispsock->resp->dispsocket = NULL;
994	}
995
996	INSIST(dispsock->portentry != NULL);
997	deref_portentry(disp, &dispsock->portentry);
998
999#ifdef BIND9
1000	if (disp->nsockets > DNS_DISPATCH_POOLSOCKS)
1001		destroy_dispsocket(disp, &dispsock);
1002	else {
1003		result = isc_socket_close(dispsock->socket);
1004
1005		qid = DNS_QID(disp);
1006		LOCK(&qid->lock);
1007		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
1008				blink);
1009		UNLOCK(&qid->lock);
1010
1011		if (result == ISC_R_SUCCESS)
1012			ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
1013		else {
1014			/*
1015			 * If the underlying system does not allow this
1016			 * optimization, destroy this temporary structure (and
1017			 * create a new one for a new transaction).
1018			 */
1019			INSIST(result == ISC_R_NOTIMPLEMENTED);
1020			destroy_dispsocket(disp, &dispsock);
1021		}
1022	}
1023#else
1024	/* This kind of optimization isn't necessary for normal use */
1025	UNUSED(qid);
1026	UNUSED(result);
1027
1028	destroy_dispsocket(disp, &dispsock);
1029#endif
1030}
1031
1032/*
1033 * Find an entry for query ID 'id', socket address 'dest', and port number
1034 * 'port'.
1035 * Return NULL if no such entry exists.
1036 */
1037static dns_dispentry_t *
1038entry_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
1039	     in_port_t port, unsigned int bucket)
1040{
1041	dns_dispentry_t *res;
1042
1043	REQUIRE(bucket < qid->qid_nbuckets);
1044
1045	res = ISC_LIST_HEAD(qid->qid_table[bucket]);
1046
1047	while (res != NULL) {
1048		if (res->id == id && isc_sockaddr_equal(dest, &res->host) &&
1049		    res->port == port) {
1050			return (res);
1051		}
1052		res = ISC_LIST_NEXT(res, link);
1053	}
1054
1055	return (NULL);
1056}
1057
1058static void
1059free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len) {
1060	INSIST(buf != NULL && len != 0);
1061
1062
1063	switch (disp->socktype) {
1064	case isc_sockettype_tcp:
1065		INSIST(disp->tcpbuffers > 0);
1066		disp->tcpbuffers--;
1067		isc_mem_put(disp->mgr->mctx, buf, len);
1068		break;
1069	case isc_sockettype_udp:
1070		LOCK(&disp->mgr->buffer_lock);
1071		INSIST(disp->mgr->buffers > 0);
1072		INSIST(len == disp->mgr->buffersize);
1073		disp->mgr->buffers--;
1074		isc_mempool_put(disp->mgr->bpool, buf);
1075		UNLOCK(&disp->mgr->buffer_lock);
1076		break;
1077	default:
1078		INSIST(0);
1079		break;
1080	}
1081}
1082
1083static void *
1084allocate_udp_buffer(dns_dispatch_t *disp) {
1085	void *temp;
1086
1087	LOCK(&disp->mgr->buffer_lock);
1088	temp = isc_mempool_get(disp->mgr->bpool);
1089
1090	if (temp != NULL)
1091		disp->mgr->buffers++;
1092	UNLOCK(&disp->mgr->buffer_lock);
1093
1094	return (temp);
1095}
1096
1097static inline void
1098free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev) {
1099	if (disp->failsafe_ev == ev) {
1100		INSIST(disp->shutdown_out == 1);
1101		disp->shutdown_out = 0;
1102
1103		return;
1104	}
1105
1106	isc_mempool_put(disp->mgr->epool, ev);
1107}
1108
1109static inline dns_dispatchevent_t *
1110allocate_event(dns_dispatch_t *disp) {
1111	dns_dispatchevent_t *ev;
1112
1113	ev = isc_mempool_get(disp->mgr->epool);
1114	if (ev == NULL)
1115		return (NULL);
1116	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, 0,
1117		       NULL, NULL, NULL, NULL, NULL);
1118
1119	return (ev);
1120}
1121
1122static void
1123udp_exrecv(isc_task_t *task, isc_event_t *ev) {
1124	dispsocket_t *dispsock = ev->ev_arg;
1125
1126	UNUSED(task);
1127
1128	REQUIRE(VALID_DISPSOCK(dispsock));
1129	udp_recv(ev, dispsock->disp, dispsock);
1130}
1131
1132static void
1133udp_shrecv(isc_task_t *task, isc_event_t *ev) {
1134	dns_dispatch_t *disp = ev->ev_arg;
1135
1136	UNUSED(task);
1137
1138	REQUIRE(VALID_DISPATCH(disp));
1139	udp_recv(ev, disp, NULL);
1140}
1141
1142/*
1143 * General flow:
1144 *
1145 * If I/O result == CANCELED or error, free the buffer.
1146 *
1147 * If query, free the buffer, restart.
1148 *
1149 * If response:
1150 *	Allocate event, fill in details.
1151 *		If cannot allocate, free buffer, restart.
1152 *	find target.  If not found, free buffer, restart.
1153 *	if event queue is not empty, queue.  else, send.
1154 *	restart.
1155 */
1156static void
1157udp_recv(isc_event_t *ev_in, dns_dispatch_t *disp, dispsocket_t *dispsock) {
1158	isc_socketevent_t *ev = (isc_socketevent_t *)ev_in;
1159	dns_messageid_t id;
1160	isc_result_t dres;
1161	isc_buffer_t source;
1162	unsigned int flags;
1163	dns_dispentry_t *resp = NULL;
1164	dns_dispatchevent_t *rev;
1165	unsigned int bucket;
1166	isc_boolean_t killit;
1167	isc_boolean_t queue_response;
1168	dns_dispatchmgr_t *mgr;
1169	dns_qid_t *qid;
1170	isc_netaddr_t netaddr;
1171	int match;
1172	int result;
1173	isc_boolean_t qidlocked = ISC_FALSE;
1174
1175	LOCK(&disp->lock);
1176
1177	mgr = disp->mgr;
1178	qid = mgr->qid;
1179
1180	dispatch_log(disp, LVL(90),
1181		     "got packet: requests %d, buffers %d, recvs %d",
1182		     disp->requests, disp->mgr->buffers, disp->recv_pending);
1183
1184	if (dispsock == NULL && ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
1185		/*
1186		 * Unless the receive event was imported from a listening
1187		 * interface, in which case the event type is
1188		 * DNS_EVENT_IMPORTRECVDONE, receive operation must be pending.
1189		 */
1190		INSIST(disp->recv_pending != 0);
1191		disp->recv_pending = 0;
1192	}
1193
1194	if (dispsock != NULL &&
1195	    (ev->result == ISC_R_CANCELED || dispsock->resp == NULL)) {
1196		/*
1197		 * dispsock->resp can be NULL if this transaction was canceled
1198		 * just after receiving a response.  Since this socket is
1199		 * exclusively used and there should be at most one receive
1200		 * event the canceled event should have been no effect.  So
1201		 * we can (and should) deactivate the socket right now.
1202		 */
1203		deactivate_dispsocket(disp, dispsock);
1204		dispsock = NULL;
1205	}
1206
1207	if (disp->shutting_down) {
1208		/*
1209		 * This dispatcher is shutting down.
1210		 */
1211		free_buffer(disp, ev->region.base, ev->region.length);
1212
1213		isc_event_free(&ev_in);
1214		ev = NULL;
1215
1216		killit = destroy_disp_ok(disp);
1217		UNLOCK(&disp->lock);
1218		if (killit)
1219			isc_task_send(disp->task[0], &disp->ctlevent);
1220
1221		return;
1222	}
1223
1224	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
1225		if (dispsock != NULL) {
1226			resp = dispsock->resp;
1227			id = resp->id;
1228			if (ev->result != ISC_R_SUCCESS) {
1229				/*
1230				 * This is most likely a network error on a
1231				 * connected socket.  It makes no sense to
1232				 * check the address or parse the packet, but it
1233				 * will help to return the error to the caller.
1234				 */
1235				goto sendresponse;
1236			}
1237		} else {
1238			free_buffer(disp, ev->region.base, ev->region.length);
1239
1240			UNLOCK(&disp->lock);
1241			isc_event_free(&ev_in);
1242			return;
1243		}
1244	} else if (ev->result != ISC_R_SUCCESS) {
1245		free_buffer(disp, ev->region.base, ev->region.length);
1246
1247		if (ev->result != ISC_R_CANCELED)
1248			dispatch_log(disp, ISC_LOG_ERROR,
1249				     "odd socket result in udp_recv(): %s",
1250				     isc_result_totext(ev->result));
1251
1252		UNLOCK(&disp->lock);
1253		isc_event_free(&ev_in);
1254		return;
1255	}
1256
1257	/*
1258	 * If this is from a blackholed address, drop it.
1259	 */
1260	isc_netaddr_fromsockaddr(&netaddr, &ev->address);
1261	if (disp->mgr->blackhole != NULL &&
1262	    dns_acl_match(&netaddr, NULL, disp->mgr->blackhole,
1263			  NULL, &match, NULL) == ISC_R_SUCCESS &&
1264	    match > 0)
1265	{
1266		if (isc_log_wouldlog(dns_lctx, LVL(10))) {
1267			char netaddrstr[ISC_NETADDR_FORMATSIZE];
1268			isc_netaddr_format(&netaddr, netaddrstr,
1269					   sizeof(netaddrstr));
1270			dispatch_log(disp, LVL(10),
1271				     "blackholed packet from %s",
1272				     netaddrstr);
1273		}
1274		free_buffer(disp, ev->region.base, ev->region.length);
1275		goto restart;
1276	}
1277
1278	/*
1279	 * Peek into the buffer to see what we can see.
1280	 */
1281	isc_buffer_init(&source, ev->region.base, ev->region.length);
1282	isc_buffer_add(&source, ev->n);
1283	dres = dns_message_peekheader(&source, &id, &flags);
1284	if (dres != ISC_R_SUCCESS) {
1285		free_buffer(disp, ev->region.base, ev->region.length);
1286		dispatch_log(disp, LVL(10), "got garbage packet");
1287		goto restart;
1288	}
1289
1290	dispatch_log(disp, LVL(92),
1291		     "got valid DNS message header, /QR %c, id %u",
1292		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);
1293
1294	/*
1295	 * Look at flags.  If query, drop it. If response,
1296	 * look to see where it goes.
1297	 */
1298	queue_response = ISC_FALSE;
1299	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
1300		/* query */
1301		free_buffer(disp, ev->region.base, ev->region.length);
1302		goto restart;
1303	}
1304
1305	/*
1306	 * Search for the corresponding response.  If we are using an exclusive
1307	 * socket, we've already identified it and we can skip the search; but
1308	 * the ID and the address must match the expected ones.
1309	 */
1310	if (resp == NULL) {
1311		bucket = dns_hash(qid, &ev->address, id, disp->localport);
1312		LOCK(&qid->lock);
1313		qidlocked = ISC_TRUE;
1314		resp = entry_search(qid, &ev->address, id, disp->localport,
1315				    bucket);
1316		dispatch_log(disp, LVL(90),
1317			     "search for response in bucket %d: %s",
1318			     bucket, (resp == NULL ? "not found" : "found"));
1319
1320		if (resp == NULL) {
1321			inc_stats(mgr, dns_resstatscounter_mismatch);
1322			free_buffer(disp, ev->region.base, ev->region.length);
1323			goto unlock;
1324		}
1325	} else if (resp->id != id || !isc_sockaddr_equal(&ev->address,
1326							 &resp->host)) {
1327		dispatch_log(disp, LVL(90),
1328			     "response to an exclusive socket doesn't match");
1329		inc_stats(mgr, dns_resstatscounter_mismatch);
1330		free_buffer(disp, ev->region.base, ev->region.length);
1331		goto unlock;
1332	}
1333
1334	/*
1335	 * Now that we have the original dispatch the query was sent
1336	 * from check that the address and port the response was
1337	 * sent to make sense.
1338	 */
1339	if (disp != resp->disp) {
1340		isc_sockaddr_t a1;
1341		isc_sockaddr_t a2;
1342
1343		/*
1344		 * Check that the socket types and ports match.
1345		 */
1346		if (disp->socktype != resp->disp->socktype ||
1347		    isc_sockaddr_getport(&disp->local) !=
1348		    isc_sockaddr_getport(&resp->disp->local)) {
1349			free_buffer(disp, ev->region.base, ev->region.length);
1350			goto unlock;
1351		}
1352
1353		/*
1354		 * If both dispatches are bound to an address then fail as
1355		 * the addresses can't be equal (enforced by the IP stack).
1356		 *
1357		 * Note under Linux a packet can be sent out via IPv4 socket
1358		 * and the response be received via a IPv6 socket.
1359		 *
1360		 * Requests sent out via IPv6 should always come back in
1361		 * via IPv6.
1362		 */
1363		if (isc_sockaddr_pf(&resp->disp->local) == PF_INET6 &&
1364		    isc_sockaddr_pf(&disp->local) != PF_INET6) {
1365			free_buffer(disp, ev->region.base, ev->region.length);
1366			goto unlock;
1367		}
1368		isc_sockaddr_anyofpf(&a1, isc_sockaddr_pf(&resp->disp->local));
1369		isc_sockaddr_anyofpf(&a2, isc_sockaddr_pf(&disp->local));
1370		if (!isc_sockaddr_eqaddr(&a1, &resp->disp->local) &&
1371		    !isc_sockaddr_eqaddr(&a2, &disp->local)) {
1372			free_buffer(disp, ev->region.base, ev->region.length);
1373			goto unlock;
1374		}
1375	}
1376
1377  sendresponse:
1378	queue_response = resp->item_out;
1379	rev = allocate_event(resp->disp);
1380	if (rev == NULL) {
1381		free_buffer(disp, ev->region.base, ev->region.length);
1382		goto unlock;
1383	}
1384
1385	/*
1386	 * At this point, rev contains the event we want to fill in, and
1387	 * resp contains the information on the place to send it to.
1388	 * Send the event off.
1389	 */
1390	isc_buffer_init(&rev->buffer, ev->region.base, ev->region.length);
1391	isc_buffer_add(&rev->buffer, ev->n);
1392	rev->result = ev->result;
1393	rev->id = id;
1394	rev->addr = ev->address;
1395	rev->pktinfo = ev->pktinfo;
1396	rev->attributes = ev->attributes;
1397	if (queue_response) {
1398		ISC_LIST_APPEND(resp->items, rev, ev_link);
1399	} else {
1400		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL,
1401			       DNS_EVENT_DISPATCH,
1402			       resp->action, resp->arg, resp, NULL, NULL);
1403		request_log(disp, resp, LVL(90),
1404			    "[a] Sent event %p buffer %p len %d to task %p",
1405			    rev, rev->buffer.base, rev->buffer.length,
1406			    resp->task);
1407		resp->item_out = ISC_TRUE;
1408		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
1409	}
1410 unlock:
1411	if (qidlocked)
1412		UNLOCK(&qid->lock);
1413
1414	/*
1415	 * Restart recv() to get the next packet.
1416	 */
1417 restart:
1418	result = startrecv(disp, dispsock);
1419	if (result != ISC_R_SUCCESS && dispsock != NULL) {
1420		/*
1421		 * XXX: wired. There seems to be no recovery process other than
1422		 * deactivate this socket anyway (since we cannot start
1423		 * receiving, we won't be able to receive a cancel event
1424		 * from the user).
1425		 */
1426		deactivate_dispsocket(disp, dispsock);
1427	}
1428	UNLOCK(&disp->lock);
1429
1430	isc_event_free(&ev_in);
1431}
1432
1433/*
1434 * General flow:
1435 *
1436 * If I/O result == CANCELED, EOF, or error, notify everyone as the
1437 * various queues drain.
1438 *
1439 * If query, restart.
1440 *
1441 * If response:
1442 *	Allocate event, fill in details.
1443 *		If cannot allocate, restart.
1444 *	find target.  If not found, restart.
1445 *	if event queue is not empty, queue.  else, send.
1446 *	restart.
1447 */
1448static void
1449tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
1450	dns_dispatch_t *disp = ev_in->ev_arg;
1451	dns_tcpmsg_t *tcpmsg = &disp->tcpmsg;
1452	dns_messageid_t id;
1453	isc_result_t dres;
1454	unsigned int flags;
1455	dns_dispentry_t *resp;
1456	dns_dispatchevent_t *rev;
1457	unsigned int bucket;
1458	isc_boolean_t killit;
1459	isc_boolean_t queue_response;
1460	dns_qid_t *qid;
1461	int level;
1462	char buf[ISC_SOCKADDR_FORMATSIZE];
1463
1464	UNUSED(task);
1465
1466	REQUIRE(VALID_DISPATCH(disp));
1467
1468	qid = disp->qid;
1469
1470	dispatch_log(disp, LVL(90),
1471		     "got TCP packet: requests %d, buffers %d, recvs %d",
1472		     disp->requests, disp->tcpbuffers, disp->recv_pending);
1473
1474	LOCK(&disp->lock);
1475
1476	INSIST(disp->recv_pending != 0);
1477	disp->recv_pending = 0;
1478
1479	if (disp->refcount == 0) {
1480		/*
1481		 * This dispatcher is shutting down.  Force cancelation.
1482		 */
1483		tcpmsg->result = ISC_R_CANCELED;
1484	}
1485
1486	if (tcpmsg->result != ISC_R_SUCCESS) {
1487		switch (tcpmsg->result) {
1488		case ISC_R_CANCELED:
1489			break;
1490
1491		case ISC_R_EOF:
1492			dispatch_log(disp, LVL(90), "shutting down on EOF");
1493			do_cancel(disp);
1494			break;
1495
1496		case ISC_R_CONNECTIONRESET:
1497			level = ISC_LOG_INFO;
1498			goto logit;
1499
1500		default:
1501			level = ISC_LOG_ERROR;
1502		logit:
1503			isc_sockaddr_format(&tcpmsg->address, buf, sizeof(buf));
1504			dispatch_log(disp, level, "shutting down due to TCP "
1505				     "receive error: %s: %s", buf,
1506				     isc_result_totext(tcpmsg->result));
1507			do_cancel(disp);
1508			break;
1509		}
1510
1511		/*
1512		 * The event is statically allocated in the tcpmsg
1513		 * structure, and destroy_disp() frees the tcpmsg, so we must
1514		 * free the event *before* calling destroy_disp().
1515		 */
1516		isc_event_free(&ev_in);
1517
1518		disp->shutting_down = 1;
1519		disp->shutdown_why = tcpmsg->result;
1520
1521		/*
1522		 * If the recv() was canceled pass the word on.
1523		 */
1524		killit = destroy_disp_ok(disp);
1525		UNLOCK(&disp->lock);
1526		if (killit)
1527			isc_task_send(disp->task[0], &disp->ctlevent);
1528		return;
1529	}
1530
1531	dispatch_log(disp, LVL(90), "result %d, length == %d, addr = %p",
1532		     tcpmsg->result,
1533		     tcpmsg->buffer.length, tcpmsg->buffer.base);
1534
1535	/*
1536	 * Peek into the buffer to see what we can see.
1537	 */
1538	dres = dns_message_peekheader(&tcpmsg->buffer, &id, &flags);
1539	if (dres != ISC_R_SUCCESS) {
1540		dispatch_log(disp, LVL(10), "got garbage packet");
1541		goto restart;
1542	}
1543
1544	dispatch_log(disp, LVL(92),
1545		     "got valid DNS message header, /QR %c, id %u",
1546		     ((flags & DNS_MESSAGEFLAG_QR) ? '1' : '0'), id);
1547
1548	/*
1549	 * Allocate an event to send to the query or response client, and
1550	 * allocate a new buffer for our use.
1551	 */
1552
1553	/*
1554	 * Look at flags.  If query, drop it. If response,
1555	 * look to see where it goes.
1556	 */
1557	queue_response = ISC_FALSE;
1558	if ((flags & DNS_MESSAGEFLAG_QR) == 0) {
1559		/*
1560		 * Query.
1561		 */
1562		goto restart;
1563	}
1564
1565	/*
1566	 * Response.
1567	 */
1568	bucket = dns_hash(qid, &tcpmsg->address, id, disp->localport);
1569	LOCK(&qid->lock);
1570	resp = entry_search(qid, &tcpmsg->address, id, disp->localport, bucket);
1571	dispatch_log(disp, LVL(90),
1572		     "search for response in bucket %d: %s",
1573		     bucket, (resp == NULL ? "not found" : "found"));
1574
1575	if (resp == NULL)
1576		goto unlock;
1577	queue_response = resp->item_out;
1578	rev = allocate_event(disp);
1579	if (rev == NULL)
1580		goto unlock;
1581
1582	/*
1583	 * At this point, rev contains the event we want to fill in, and
1584	 * resp contains the information on the place to send it to.
1585	 * Send the event off.
1586	 */
1587	dns_tcpmsg_keepbuffer(tcpmsg, &rev->buffer);
1588	disp->tcpbuffers++;
1589	rev->result = ISC_R_SUCCESS;
1590	rev->id = id;
1591	rev->addr = tcpmsg->address;
1592	if (queue_response) {
1593		ISC_LIST_APPEND(resp->items, rev, ev_link);
1594	} else {
1595		ISC_EVENT_INIT(rev, sizeof(*rev), 0, NULL, DNS_EVENT_DISPATCH,
1596			       resp->action, resp->arg, resp, NULL, NULL);
1597		request_log(disp, resp, LVL(90),
1598			    "[b] Sent event %p buffer %p len %d to task %p",
1599			    rev, rev->buffer.base, rev->buffer.length,
1600			    resp->task);
1601		resp->item_out = ISC_TRUE;
1602		isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
1603	}
1604 unlock:
1605	UNLOCK(&qid->lock);
1606
1607	/*
1608	 * Restart recv() to get the next packet.
1609	 */
1610 restart:
1611	(void)startrecv(disp, NULL);
1612
1613	UNLOCK(&disp->lock);
1614
1615	isc_event_free(&ev_in);
1616}
1617
1618/*
1619 * disp must be locked.
1620 */
1621static isc_result_t
1622startrecv(dns_dispatch_t *disp, dispsocket_t *dispsock) {
1623	isc_result_t res;
1624	isc_region_t region;
1625	isc_socket_t *socket;
1626
1627	if (disp->shutting_down == 1)
1628		return (ISC_R_SUCCESS);
1629
1630	if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
1631		return (ISC_R_SUCCESS);
1632
1633	if (disp->recv_pending != 0 && dispsock == NULL)
1634		return (ISC_R_SUCCESS);
1635
1636	if (disp->mgr->buffers >= disp->mgr->maxbuffers)
1637		return (ISC_R_NOMEMORY);
1638
1639	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
1640	    dispsock == NULL)
1641		return (ISC_R_SUCCESS);
1642
1643	if (dispsock != NULL)
1644		socket = dispsock->socket;
1645	else
1646		socket = disp->socket;
1647	INSIST(socket != NULL);
1648
1649	switch (disp->socktype) {
1650		/*
1651		 * UDP reads are always maximal.
1652		 */
1653	case isc_sockettype_udp:
1654		region.length = disp->mgr->buffersize;
1655		region.base = allocate_udp_buffer(disp);
1656		if (region.base == NULL)
1657			return (ISC_R_NOMEMORY);
1658		if (dispsock != NULL) {
1659			res = isc_socket_recv(socket, &region, 1,
1660					      dispsock->task, udp_exrecv,
1661					      dispsock);
1662			if (res != ISC_R_SUCCESS) {
1663				free_buffer(disp, region.base, region.length);
1664				return (res);
1665			}
1666		} else {
1667			res = isc_socket_recv(socket, &region, 1,
1668					      disp->task[0], udp_shrecv, disp);
1669			if (res != ISC_R_SUCCESS) {
1670				free_buffer(disp, region.base, region.length);
1671				disp->shutdown_why = res;
1672				disp->shutting_down = 1;
1673				do_cancel(disp);
1674				return (ISC_R_SUCCESS); /* recover by cancel */
1675			}
1676			INSIST(disp->recv_pending == 0);
1677			disp->recv_pending = 1;
1678		}
1679		break;
1680
1681	case isc_sockettype_tcp:
1682		res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task[0],
1683					     tcp_recv, disp);
1684		if (res != ISC_R_SUCCESS) {
1685			disp->shutdown_why = res;
1686			disp->shutting_down = 1;
1687			do_cancel(disp);
1688			return (ISC_R_SUCCESS); /* recover by cancel */
1689		}
1690		INSIST(disp->recv_pending == 0);
1691		disp->recv_pending = 1;
1692		break;
1693	default:
1694		INSIST(0);
1695		break;
1696	}
1697
1698	return (ISC_R_SUCCESS);
1699}
1700
1701/*
1702 * Mgr must be locked when calling this function.
1703 */
1704static isc_boolean_t
1705destroy_mgr_ok(dns_dispatchmgr_t *mgr) {
1706	mgr_log(mgr, LVL(90),
1707		"destroy_mgr_ok: shuttingdown=%d, listnonempty=%d, "
1708		"epool=%d, rpool=%d, dpool=%d",
1709		MGR_IS_SHUTTINGDOWN(mgr), !ISC_LIST_EMPTY(mgr->list),
1710		isc_mempool_getallocated(mgr->epool),
1711		isc_mempool_getallocated(mgr->rpool),
1712		isc_mempool_getallocated(mgr->dpool));
1713	if (!MGR_IS_SHUTTINGDOWN(mgr))
1714		return (ISC_FALSE);
1715	if (!ISC_LIST_EMPTY(mgr->list))
1716		return (ISC_FALSE);
1717	if (isc_mempool_getallocated(mgr->epool) != 0)
1718		return (ISC_FALSE);
1719	if (isc_mempool_getallocated(mgr->rpool) != 0)
1720		return (ISC_FALSE);
1721	if (isc_mempool_getallocated(mgr->dpool) != 0)
1722		return (ISC_FALSE);
1723
1724	return (ISC_TRUE);
1725}
1726
1727/*
1728 * Mgr must be unlocked when calling this function.
1729 */
1730static void
1731destroy_mgr(dns_dispatchmgr_t **mgrp) {
1732	isc_mem_t *mctx;
1733	dns_dispatchmgr_t *mgr;
1734
1735	mgr = *mgrp;
1736	*mgrp = NULL;
1737
1738	mctx = mgr->mctx;
1739
1740	mgr->magic = 0;
1741	mgr->mctx = NULL;
1742	DESTROYLOCK(&mgr->lock);
1743	mgr->state = 0;
1744
1745	DESTROYLOCK(&mgr->arc4_lock);
1746
1747	isc_mempool_destroy(&mgr->epool);
1748	isc_mempool_destroy(&mgr->rpool);
1749	isc_mempool_destroy(&mgr->dpool);
1750	if (mgr->bpool != NULL)
1751		isc_mempool_destroy(&mgr->bpool);
1752	if (mgr->spool != NULL)
1753		isc_mempool_destroy(&mgr->spool);
1754
1755	DESTROYLOCK(&mgr->pool_lock);
1756
1757#ifdef BIND9
1758	if (mgr->entropy != NULL)
1759		isc_entropy_detach(&mgr->entropy);
1760#endif /* BIND9 */
1761	if (mgr->qid != NULL)
1762		qid_destroy(mctx, &mgr->qid);
1763
1764	DESTROYLOCK(&mgr->buffer_lock);
1765
1766	if (mgr->blackhole != NULL)
1767		dns_acl_detach(&mgr->blackhole);
1768
1769	if (mgr->stats != NULL)
1770		isc_stats_detach(&mgr->stats);
1771
1772	if (mgr->v4ports != NULL) {
1773		isc_mem_put(mctx, mgr->v4ports,
1774			    mgr->nv4ports * sizeof(in_port_t));
1775	}
1776	if (mgr->v6ports != NULL) {
1777		isc_mem_put(mctx, mgr->v6ports,
1778			    mgr->nv6ports * sizeof(in_port_t));
1779	}
1780	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
1781	isc_mem_detach(&mctx);
1782}
1783
1784static isc_result_t
1785open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
1786	    unsigned int options, isc_socket_t **sockp)
1787{
1788	isc_socket_t *sock;
1789	isc_result_t result;
1790
1791	sock = *sockp;
1792	if (sock == NULL) {
1793		result = isc_socket_create(mgr, isc_sockaddr_pf(local),
1794					   isc_sockettype_udp, &sock);
1795		if (result != ISC_R_SUCCESS)
1796			return (result);
1797		isc_socket_setname(sock, "dispatcher", NULL);
1798	} else {
1799#ifdef BIND9
1800		result = isc_socket_open(sock);
1801		if (result != ISC_R_SUCCESS)
1802			return (result);
1803#else
1804		INSIST(0);
1805#endif
1806	}
1807
1808#ifndef ISC_ALLOW_MAPPED
1809	isc_socket_ipv6only(sock, ISC_TRUE);
1810#endif
1811	result = isc_socket_bind(sock, local, options);
1812	if (result != ISC_R_SUCCESS) {
1813		if (*sockp == NULL)
1814			isc_socket_detach(&sock);
1815		else {
1816#ifdef BIND9
1817			isc_socket_close(sock);
1818#else
1819			INSIST(0);
1820#endif
1821		}
1822		return (result);
1823	}
1824
1825	*sockp = sock;
1826	return (ISC_R_SUCCESS);
1827}
1828
1829/*%
1830 * Create a temporary port list to set the initial default set of dispatch
1831 * ports: [1024, 65535].  This is almost meaningless as the application will
1832 * normally set the ports explicitly, but is provided to fill some minor corner
1833 * cases.
1834 */
1835static isc_result_t
1836create_default_portset(isc_mem_t *mctx, isc_portset_t **portsetp) {
1837	isc_result_t result;
1838
1839	result = isc_portset_create(mctx, portsetp);
1840	if (result != ISC_R_SUCCESS)
1841		return (result);
1842	isc_portset_addrange(*portsetp, 1024, 65535);
1843
1844	return (ISC_R_SUCCESS);
1845}
1846
1847/*
1848 * Publics.
1849 */
1850
1851isc_result_t
1852dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy,
1853		       dns_dispatchmgr_t **mgrp)
1854{
1855	dns_dispatchmgr_t *mgr;
1856	isc_result_t result;
1857	isc_portset_t *v4portset = NULL;
1858	isc_portset_t *v6portset = NULL;
1859
1860	REQUIRE(mctx != NULL);
1861	REQUIRE(mgrp != NULL && *mgrp == NULL);
1862
1863	mgr = isc_mem_get(mctx, sizeof(dns_dispatchmgr_t));
1864	if (mgr == NULL)
1865		return (ISC_R_NOMEMORY);
1866
1867	mgr->mctx = NULL;
1868	isc_mem_attach(mctx, &mgr->mctx);
1869
1870	mgr->blackhole = NULL;
1871	mgr->stats = NULL;
1872
1873	result = isc_mutex_init(&mgr->lock);
1874	if (result != ISC_R_SUCCESS)
1875		goto deallocate;
1876
1877	result = isc_mutex_init(&mgr->arc4_lock);
1878	if (result != ISC_R_SUCCESS)
1879		goto kill_lock;
1880
1881	result = isc_mutex_init(&mgr->buffer_lock);
1882	if (result != ISC_R_SUCCESS)
1883		goto kill_arc4_lock;
1884
1885	result = isc_mutex_init(&mgr->pool_lock);
1886	if (result != ISC_R_SUCCESS)
1887		goto kill_buffer_lock;
1888
1889	mgr->epool = NULL;
1890	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatchevent_t),
1891			       &mgr->epool) != ISC_R_SUCCESS) {
1892		result = ISC_R_NOMEMORY;
1893		goto kill_pool_lock;
1894	}
1895
1896	mgr->rpool = NULL;
1897	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispentry_t),
1898			       &mgr->rpool) != ISC_R_SUCCESS) {
1899		result = ISC_R_NOMEMORY;
1900		goto kill_epool;
1901	}
1902
1903	mgr->dpool = NULL;
1904	if (isc_mempool_create(mgr->mctx, sizeof(dns_dispatch_t),
1905			       &mgr->dpool) != ISC_R_SUCCESS) {
1906		result = ISC_R_NOMEMORY;
1907		goto kill_rpool;
1908	}
1909
1910	isc_mempool_setname(mgr->epool, "dispmgr_epool");
1911	isc_mempool_setfreemax(mgr->epool, 1024);
1912	isc_mempool_associatelock(mgr->epool, &mgr->pool_lock);
1913
1914	isc_mempool_setname(mgr->rpool, "dispmgr_rpool");
1915	isc_mempool_setfreemax(mgr->rpool, 1024);
1916	isc_mempool_associatelock(mgr->rpool, &mgr->pool_lock);
1917
1918	isc_mempool_setname(mgr->dpool, "dispmgr_dpool");
1919	isc_mempool_setfreemax(mgr->dpool, 1024);
1920	isc_mempool_associatelock(mgr->dpool, &mgr->pool_lock);
1921
1922	mgr->buffers = 0;
1923	mgr->buffersize = 0;
1924	mgr->maxbuffers = 0;
1925	mgr->bpool = NULL;
1926	mgr->spool = NULL;
1927	mgr->entropy = NULL;
1928	mgr->qid = NULL;
1929	mgr->state = 0;
1930	ISC_LIST_INIT(mgr->list);
1931	mgr->v4ports = NULL;
1932	mgr->v6ports = NULL;
1933	mgr->nv4ports = 0;
1934	mgr->nv6ports = 0;
1935	mgr->magic = DNS_DISPATCHMGR_MAGIC;
1936
1937	result = create_default_portset(mctx, &v4portset);
1938	if (result == ISC_R_SUCCESS) {
1939		result = create_default_portset(mctx, &v6portset);
1940		if (result == ISC_R_SUCCESS) {
1941			result = dns_dispatchmgr_setavailports(mgr,
1942							       v4portset,
1943							       v6portset);
1944		}
1945	}
1946	if (v4portset != NULL)
1947		isc_portset_destroy(mctx, &v4portset);
1948	if (v6portset != NULL)
1949		isc_portset_destroy(mctx, &v6portset);
1950	if (result != ISC_R_SUCCESS)
1951		goto kill_dpool;
1952
1953#ifdef BIND9
1954	if (entropy != NULL)
1955		isc_entropy_attach(entropy, &mgr->entropy);
1956#else
1957	UNUSED(entropy);
1958#endif
1959
1960	dispatch_initrandom(&mgr->arc4ctx, mgr->entropy, &mgr->arc4_lock);
1961
1962	*mgrp = mgr;
1963	return (ISC_R_SUCCESS);
1964
1965 kill_dpool:
1966	isc_mempool_destroy(&mgr->dpool);
1967 kill_rpool:
1968	isc_mempool_destroy(&mgr->rpool);
1969 kill_epool:
1970	isc_mempool_destroy(&mgr->epool);
1971 kill_pool_lock:
1972	DESTROYLOCK(&mgr->pool_lock);
1973 kill_buffer_lock:
1974	DESTROYLOCK(&mgr->buffer_lock);
1975 kill_arc4_lock:
1976	DESTROYLOCK(&mgr->arc4_lock);
1977 kill_lock:
1978	DESTROYLOCK(&mgr->lock);
1979 deallocate:
1980	isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
1981	isc_mem_detach(&mctx);
1982
1983	return (result);
1984}
1985
1986void
1987dns_dispatchmgr_setblackhole(dns_dispatchmgr_t *mgr, dns_acl_t *blackhole) {
1988	REQUIRE(VALID_DISPATCHMGR(mgr));
1989	if (mgr->blackhole != NULL)
1990		dns_acl_detach(&mgr->blackhole);
1991	dns_acl_attach(blackhole, &mgr->blackhole);
1992}
1993
1994dns_acl_t *
1995dns_dispatchmgr_getblackhole(dns_dispatchmgr_t *mgr) {
1996	REQUIRE(VALID_DISPATCHMGR(mgr));
1997	return (mgr->blackhole);
1998}
1999
2000void
2001dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr,
2002				 dns_portlist_t *portlist)
2003{
2004	REQUIRE(VALID_DISPATCHMGR(mgr));
2005	UNUSED(portlist);
2006
2007	/* This function is deprecated: use dns_dispatchmgr_setavailports(). */
2008	return;
2009}
2010
2011dns_portlist_t *
2012dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr) {
2013	REQUIRE(VALID_DISPATCHMGR(mgr));
2014	return (NULL);		/* this function is deprecated */
2015}
2016
2017isc_result_t
2018dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
2019			      isc_portset_t *v6portset)
2020{
2021	in_port_t *v4ports, *v6ports, p;
2022	unsigned int nv4ports, nv6ports, i4, i6;
2023
2024	REQUIRE(VALID_DISPATCHMGR(mgr));
2025
2026	nv4ports = isc_portset_nports(v4portset);
2027	nv6ports = isc_portset_nports(v6portset);
2028
2029	v4ports = NULL;
2030	if (nv4ports != 0) {
2031		v4ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv4ports);
2032		if (v4ports == NULL)
2033			return (ISC_R_NOMEMORY);
2034	}
2035	v6ports = NULL;
2036	if (nv6ports != 0) {
2037		v6ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv6ports);
2038		if (v6ports == NULL) {
2039			if (v4ports != NULL) {
2040				isc_mem_put(mgr->mctx, v4ports,
2041					    sizeof(in_port_t) *
2042					    isc_portset_nports(v4portset));
2043			}
2044			return (ISC_R_NOMEMORY);
2045		}
2046	}
2047
2048	p = 0;
2049	i4 = 0;
2050	i6 = 0;
2051	do {
2052		if (isc_portset_isset(v4portset, p)) {
2053			INSIST(i4 < nv4ports);
2054			v4ports[i4++] = p;
2055		}
2056		if (isc_portset_isset(v6portset, p)) {
2057			INSIST(i6 < nv6ports);
2058			v6ports[i6++] = p;
2059		}
2060	} while (p++ < 65535);
2061	INSIST(i4 == nv4ports && i6 == nv6ports);
2062
2063	PORTBUFLOCK(mgr);
2064	if (mgr->v4ports != NULL) {
2065		isc_mem_put(mgr->mctx, mgr->v4ports,
2066			    mgr->nv4ports * sizeof(in_port_t));
2067	}
2068	mgr->v4ports = v4ports;
2069	mgr->nv4ports = nv4ports;
2070
2071	if (mgr->v6ports != NULL) {
2072		isc_mem_put(mgr->mctx, mgr->v6ports,
2073			    mgr->nv6ports * sizeof(in_port_t));
2074	}
2075	mgr->v6ports = v6ports;
2076	mgr->nv6ports = nv6ports;
2077	PORTBUFUNLOCK(mgr);
2078
2079	return (ISC_R_SUCCESS);
2080}
2081
2082static isc_result_t
2083dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
2084		       unsigned int buffersize, unsigned int maxbuffers,
2085		       unsigned int maxrequests, unsigned int buckets,
2086		       unsigned int increment)
2087{
2088	isc_result_t result;
2089
2090	REQUIRE(VALID_DISPATCHMGR(mgr));
2091	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2092	REQUIRE(maxbuffers > 0);
2093	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2094	REQUIRE(increment > buckets);
2095
2096	/*
2097	 * Keep some number of items around.  This should be a config
2098	 * option.  For now, keep 8, but later keep at least two even
2099	 * if the caller wants less.  This allows us to ensure certain
2100	 * things, like an event can be "freed" and the next allocation
2101	 * will always succeed.
2102	 *
2103	 * Note that if limits are placed on anything here, we use one
2104	 * event internally, so the actual limit should be "wanted + 1."
2105	 *
2106	 * XXXMLG
2107	 */
2108
2109	if (maxbuffers < 8)
2110		maxbuffers = 8;
2111
2112	LOCK(&mgr->buffer_lock);
2113
2114	/* Create or adjust buffer pool */
2115	if (mgr->bpool != NULL) {
2116		/*
2117		 * We only increase the maxbuffers to avoid accidental buffer
2118		 * shortage.  Ideally we'd separate the manager-wide maximum
2119		 * from per-dispatch limits and respect the latter within the
2120		 * global limit.  But at this moment that's deemed to be
2121		 * overkilling and isn't worth additional implementation
2122		 * complexity.
2123		 */
2124		if (maxbuffers > mgr->maxbuffers) {
2125			isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2126			mgr->maxbuffers = maxbuffers;
2127		}
2128	} else {
2129		result = isc_mempool_create(mgr->mctx, buffersize, &mgr->bpool);
2130		if (result != ISC_R_SUCCESS) {
2131			UNLOCK(&mgr->buffer_lock);
2132			return (result);
2133		}
2134		isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
2135		isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
2136		isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock);
2137	}
2138
2139	/* Create or adjust socket pool */
2140	if (mgr->spool != NULL) {
2141		isc_mempool_setmaxalloc(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2);
2142		UNLOCK(&mgr->buffer_lock);
2143		return (ISC_R_SUCCESS);
2144	}
2145	result = isc_mempool_create(mgr->mctx, sizeof(dispsocket_t),
2146				    &mgr->spool);
2147	if (result != ISC_R_SUCCESS) {
2148		UNLOCK(&mgr->buffer_lock);
2149		goto cleanup;
2150	}
2151	isc_mempool_setname(mgr->spool, "dispmgr_spool");
2152	isc_mempool_setmaxalloc(mgr->spool, maxrequests);
2153	isc_mempool_associatelock(mgr->spool, &mgr->pool_lock);
2154
2155	result = qid_allocate(mgr, buckets, increment, &mgr->qid, ISC_TRUE);
2156	if (result != ISC_R_SUCCESS)
2157		goto cleanup;
2158
2159	mgr->buffersize = buffersize;
2160	mgr->maxbuffers = maxbuffers;
2161	UNLOCK(&mgr->buffer_lock);
2162	return (ISC_R_SUCCESS);
2163
2164 cleanup:
2165	isc_mempool_destroy(&mgr->bpool);
2166	if (mgr->spool != NULL)
2167		isc_mempool_destroy(&mgr->spool);
2168	UNLOCK(&mgr->buffer_lock);
2169	return (result);
2170}
2171
2172void
2173dns_dispatchmgr_destroy(dns_dispatchmgr_t **mgrp) {
2174	dns_dispatchmgr_t *mgr;
2175	isc_boolean_t killit;
2176
2177	REQUIRE(mgrp != NULL);
2178	REQUIRE(VALID_DISPATCHMGR(*mgrp));
2179
2180	mgr = *mgrp;
2181	*mgrp = NULL;
2182
2183	LOCK(&mgr->lock);
2184	mgr->state |= MGR_SHUTTINGDOWN;
2185
2186	killit = destroy_mgr_ok(mgr);
2187	UNLOCK(&mgr->lock);
2188
2189	mgr_log(mgr, LVL(90), "destroy: killit=%d", killit);
2190
2191	if (killit)
2192		destroy_mgr(&mgr);
2193}
2194
2195void
2196dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, isc_stats_t *stats) {
2197	REQUIRE(VALID_DISPATCHMGR(mgr));
2198	REQUIRE(ISC_LIST_EMPTY(mgr->list));
2199	REQUIRE(mgr->stats == NULL);
2200
2201	isc_stats_attach(stats, &mgr->stats);
2202}
2203
/*
 * bsearch() comparison callback for arrays of in_port_t.
 *
 * Returns -1, 0 or 1 when the port pointed to by 'key' is respectively
 * less than, equal to or greater than the one pointed to by 'ent'.
 */
static int
port_cmp(const void *key, const void *ent) {
	const in_port_t wanted = *(const in_port_t *)key;
	const in_port_t entry = *(const in_port_t *)ent;

	if (wanted == entry)
		return (0);

	return ((wanted < entry) ? -1 : 1);
}
2216
2217static isc_boolean_t
2218portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2219	      isc_sockaddr_t *sockaddrp)
2220{
2221	isc_sockaddr_t sockaddr;
2222	isc_result_t result;
2223	in_port_t *ports, port;
2224	unsigned int nports;
2225	isc_boolean_t available = ISC_FALSE;
2226
2227	REQUIRE(sock != NULL || sockaddrp != NULL);
2228
2229	PORTBUFLOCK(mgr);
2230	if (sock != NULL) {
2231		sockaddrp = &sockaddr;
2232		result = isc_socket_getsockname(sock, sockaddrp);
2233		if (result != ISC_R_SUCCESS)
2234			goto unlock;
2235	}
2236
2237	if (isc_sockaddr_pf(sockaddrp) == AF_INET) {
2238		ports = mgr->v4ports;
2239		nports = mgr->nv4ports;
2240	} else {
2241		ports = mgr->v6ports;
2242		nports = mgr->nv6ports;
2243	}
2244	if (ports == NULL)
2245		goto unlock;
2246
2247	port = isc_sockaddr_getport(sockaddrp);
2248	if (bsearch(&port, ports, nports, sizeof(in_port_t), port_cmp) != NULL)
2249		available = ISC_TRUE;
2250
2251unlock:
2252	PORTBUFUNLOCK(mgr);
2253	return (available);
2254}
2255
/*
 * True when attribute words '_a1' and '_a2' agree on every bit that is
 * set in '_mask'; bits outside the mask are ignored.
 */
#define ATTRMATCH(_a1, _a2, _mask) \
	(((_a1) & (_mask)) == ((_a2) & (_mask)))
2257
2258static isc_boolean_t
2259local_addr_match(dns_dispatch_t *disp, isc_sockaddr_t *addr) {
2260	isc_sockaddr_t sockaddr;
2261	isc_result_t result;
2262
2263	REQUIRE(disp->socket != NULL);
2264
2265	if (addr == NULL)
2266		return (ISC_TRUE);
2267
2268	/*
2269	 * Don't match wildcard ports unless the port is available in the
2270	 * current configuration.
2271	 */
2272	if (isc_sockaddr_getport(addr) == 0 &&
2273	    isc_sockaddr_getport(&disp->local) == 0 &&
2274	    !portavailable(disp->mgr, disp->socket, NULL)) {
2275		return (ISC_FALSE);
2276	}
2277
2278	/*
2279	 * Check if we match the binding <address,port>.
2280	 * Wildcard ports match/fail here.
2281	 */
2282	if (isc_sockaddr_equal(&disp->local, addr))
2283		return (ISC_TRUE);
2284	if (isc_sockaddr_getport(addr) == 0)
2285		return (ISC_FALSE);
2286
2287	/*
2288	 * Check if we match a bound wildcard port <address,port>.
2289	 */
2290	if (!isc_sockaddr_eqaddr(&disp->local, addr))
2291		return (ISC_FALSE);
2292	result = isc_socket_getsockname(disp->socket, &sockaddr);
2293	if (result != ISC_R_SUCCESS)
2294		return (ISC_FALSE);
2295
2296	return (isc_sockaddr_equal(&sockaddr, addr));
2297}
2298
2299/*
2300 * Requires mgr be locked.
2301 *
2302 * No dispatcher can be locked by this thread when calling this function.
2303 *
2304 *
2305 * NOTE:
2306 *	If a matching dispatcher is found, it is locked after this function
2307 *	returns, and must be unlocked by the caller.
2308 */
2309static isc_result_t
2310dispatch_find(dns_dispatchmgr_t *mgr, isc_sockaddr_t *local,
2311	      unsigned int attributes, unsigned int mask,
2312	      dns_dispatch_t **dispp)
2313{
2314	dns_dispatch_t *disp;
2315	isc_result_t result;
2316
2317	/*
2318	 * Make certain that we will not match a private or exclusive dispatch.
2319	 */
2320	attributes &= ~(DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2321	mask |= (DNS_DISPATCHATTR_PRIVATE|DNS_DISPATCHATTR_EXCLUSIVE);
2322
2323	disp = ISC_LIST_HEAD(mgr->list);
2324	while (disp != NULL) {
2325		LOCK(&disp->lock);
2326		if ((disp->shutting_down == 0)
2327		    && ATTRMATCH(disp->attributes, attributes, mask)
2328		    && local_addr_match(disp, local))
2329			break;
2330		UNLOCK(&disp->lock);
2331		disp = ISC_LIST_NEXT(disp, link);
2332	}
2333
2334	if (disp == NULL) {
2335		result = ISC_R_NOTFOUND;
2336		goto out;
2337	}
2338
2339	*dispp = disp;
2340	result = ISC_R_SUCCESS;
2341 out:
2342
2343	return (result);
2344}
2345
2346static isc_result_t
2347qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
2348	     unsigned int increment, dns_qid_t **qidp,
2349	     isc_boolean_t needsocktable)
2350{
2351	dns_qid_t *qid;
2352	unsigned int i;
2353	isc_result_t result;
2354
2355	REQUIRE(VALID_DISPATCHMGR(mgr));
2356	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2357	REQUIRE(increment > buckets);
2358	REQUIRE(qidp != NULL && *qidp == NULL);
2359
2360	qid = isc_mem_get(mgr->mctx, sizeof(*qid));
2361	if (qid == NULL)
2362		return (ISC_R_NOMEMORY);
2363
2364	qid->qid_table = isc_mem_get(mgr->mctx,
2365				     buckets * sizeof(dns_displist_t));
2366	if (qid->qid_table == NULL) {
2367		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2368		return (ISC_R_NOMEMORY);
2369	}
2370
2371	qid->sock_table = NULL;
2372	if (needsocktable) {
2373		qid->sock_table = isc_mem_get(mgr->mctx, buckets *
2374					      sizeof(dispsocketlist_t));
2375		if (qid->sock_table == NULL) {
2376			isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2377			isc_mem_put(mgr->mctx, qid->qid_table,
2378				    buckets * sizeof(dns_displist_t));
2379			return (ISC_R_NOMEMORY);
2380		}
2381	}
2382
2383	result = isc_mutex_init(&qid->lock);
2384	if (result != ISC_R_SUCCESS) {
2385		if (qid->sock_table != NULL) {
2386			isc_mem_put(mgr->mctx, qid->sock_table,
2387				    buckets * sizeof(dispsocketlist_t));
2388		}
2389		isc_mem_put(mgr->mctx, qid->qid_table,
2390			    buckets * sizeof(dns_displist_t));
2391		isc_mem_put(mgr->mctx, qid, sizeof(*qid));
2392		return (result);
2393	}
2394
2395	for (i = 0; i < buckets; i++) {
2396		ISC_LIST_INIT(qid->qid_table[i]);
2397		if (qid->sock_table != NULL)
2398			ISC_LIST_INIT(qid->sock_table[i]);
2399	}
2400
2401	qid->qid_nbuckets = buckets;
2402	qid->qid_increment = increment;
2403	qid->magic = QID_MAGIC;
2404	*qidp = qid;
2405	return (ISC_R_SUCCESS);
2406}
2407
2408static void
2409qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp) {
2410	dns_qid_t *qid;
2411
2412	REQUIRE(qidp != NULL);
2413	qid = *qidp;
2414
2415	REQUIRE(VALID_QID(qid));
2416
2417	*qidp = NULL;
2418	qid->magic = 0;
2419	isc_mem_put(mctx, qid->qid_table,
2420		    qid->qid_nbuckets * sizeof(dns_displist_t));
2421	if (qid->sock_table != NULL) {
2422		isc_mem_put(mctx, qid->sock_table,
2423			    qid->qid_nbuckets * sizeof(dispsocketlist_t));
2424	}
2425	DESTROYLOCK(&qid->lock);
2426	isc_mem_put(mctx, qid, sizeof(*qid));
2427}
2428
2429/*
2430 * Allocate and set important limits.
2431 */
2432static isc_result_t
2433dispatch_allocate(dns_dispatchmgr_t *mgr, unsigned int maxrequests,
2434		  dns_dispatch_t **dispp)
2435{
2436	dns_dispatch_t *disp;
2437	isc_result_t result;
2438
2439	REQUIRE(VALID_DISPATCHMGR(mgr));
2440	REQUIRE(dispp != NULL && *dispp == NULL);
2441
2442	/*
2443	 * Set up the dispatcher, mostly.  Don't bother setting some of
2444	 * the options that are controlled by tcp vs. udp, etc.
2445	 */
2446
2447	disp = isc_mempool_get(mgr->dpool);
2448	if (disp == NULL)
2449		return (ISC_R_NOMEMORY);
2450
2451	disp->magic = 0;
2452	disp->mgr = mgr;
2453	disp->maxrequests = maxrequests;
2454	disp->attributes = 0;
2455	ISC_LINK_INIT(disp, link);
2456	disp->refcount = 1;
2457	disp->recv_pending = 0;
2458	memset(&disp->local, 0, sizeof(disp->local));
2459	disp->localport = 0;
2460	disp->shutting_down = 0;
2461	disp->shutdown_out = 0;
2462	disp->connected = 0;
2463	disp->tcpmsg_valid = 0;
2464	disp->shutdown_why = ISC_R_UNEXPECTED;
2465	disp->requests = 0;
2466	disp->tcpbuffers = 0;
2467	disp->qid = NULL;
2468	ISC_LIST_INIT(disp->activesockets);
2469	ISC_LIST_INIT(disp->inactivesockets);
2470	disp->nsockets = 0;
2471	dispatch_initrandom(&disp->arc4ctx, mgr->entropy, NULL);
2472	disp->port_table = NULL;
2473	disp->portpool = NULL;
2474
2475	result = isc_mutex_init(&disp->lock);
2476	if (result != ISC_R_SUCCESS)
2477		goto deallocate;
2478
2479	disp->failsafe_ev = allocate_event(disp);
2480	if (disp->failsafe_ev == NULL) {
2481		result = ISC_R_NOMEMORY;
2482		goto kill_lock;
2483	}
2484
2485	disp->magic = DISPATCH_MAGIC;
2486
2487	*dispp = disp;
2488	return (ISC_R_SUCCESS);
2489
2490	/*
2491	 * error returns
2492	 */
2493 kill_lock:
2494	DESTROYLOCK(&disp->lock);
2495 deallocate:
2496	isc_mempool_put(mgr->dpool, disp);
2497
2498	return (result);
2499}
2500
2501
2502/*
2503 * MUST be unlocked, and not used by anything.
2504 */
2505static void
2506dispatch_free(dns_dispatch_t **dispp)
2507{
2508	dns_dispatch_t *disp;
2509	dns_dispatchmgr_t *mgr;
2510	int i;
2511
2512	REQUIRE(VALID_DISPATCH(*dispp));
2513	disp = *dispp;
2514	*dispp = NULL;
2515
2516	mgr = disp->mgr;
2517	REQUIRE(VALID_DISPATCHMGR(mgr));
2518
2519	if (disp->tcpmsg_valid) {
2520		dns_tcpmsg_invalidate(&disp->tcpmsg);
2521		disp->tcpmsg_valid = 0;
2522	}
2523
2524	INSIST(disp->tcpbuffers == 0);
2525	INSIST(disp->requests == 0);
2526	INSIST(disp->recv_pending == 0);
2527	INSIST(ISC_LIST_EMPTY(disp->activesockets));
2528	INSIST(ISC_LIST_EMPTY(disp->inactivesockets));
2529
2530	isc_mempool_put(mgr->epool, disp->failsafe_ev);
2531	disp->failsafe_ev = NULL;
2532
2533	if (disp->qid != NULL)
2534		qid_destroy(mgr->mctx, &disp->qid);
2535
2536	if (disp->port_table != NULL) {
2537		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
2538			INSIST(ISC_LIST_EMPTY(disp->port_table[i]));
2539		isc_mem_put(mgr->mctx, disp->port_table,
2540			    sizeof(disp->port_table[0]) *
2541			    DNS_DISPATCH_PORTTABLESIZE);
2542	}
2543
2544	if (disp->portpool != NULL)
2545		isc_mempool_destroy(&disp->portpool);
2546
2547	disp->mgr = NULL;
2548	DESTROYLOCK(&disp->lock);
2549	disp->magic = 0;
2550	isc_mempool_put(mgr->dpool, disp);
2551}
2552
2553isc_result_t
2554dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
2555		       isc_taskmgr_t *taskmgr, unsigned int buffersize,
2556		       unsigned int maxbuffers, unsigned int maxrequests,
2557		       unsigned int buckets, unsigned int increment,
2558		       unsigned int attributes, dns_dispatch_t **dispp)
2559{
2560	isc_result_t result;
2561	dns_dispatch_t *disp;
2562
2563	UNUSED(maxbuffers);
2564	UNUSED(buffersize);
2565
2566	REQUIRE(VALID_DISPATCHMGR(mgr));
2567	REQUIRE(isc_socket_gettype(sock) == isc_sockettype_tcp);
2568	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) != 0);
2569	REQUIRE((attributes & DNS_DISPATCHATTR_UDP) == 0);
2570
2571	attributes |= DNS_DISPATCHATTR_PRIVATE;  /* XXXMLG */
2572
2573	LOCK(&mgr->lock);
2574
2575	/*
2576	 * dispatch_allocate() checks mgr for us.
2577	 * qid_allocate() checks buckets and increment for us.
2578	 */
2579	disp = NULL;
2580	result = dispatch_allocate(mgr, maxrequests, &disp);
2581	if (result != ISC_R_SUCCESS) {
2582		UNLOCK(&mgr->lock);
2583		return (result);
2584	}
2585
2586	result = qid_allocate(mgr, buckets, increment, &disp->qid, ISC_FALSE);
2587	if (result != ISC_R_SUCCESS)
2588		goto deallocate_dispatch;
2589
2590	disp->socktype = isc_sockettype_tcp;
2591	disp->socket = NULL;
2592	isc_socket_attach(sock, &disp->socket);
2593
2594	disp->ntasks = 1;
2595	disp->task[0] = NULL;
2596	result = isc_task_create(taskmgr, 0, &disp->task[0]);
2597	if (result != ISC_R_SUCCESS)
2598		goto kill_socket;
2599
2600	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
2601					    DNS_EVENT_DISPATCHCONTROL,
2602					    destroy_disp, disp,
2603					    sizeof(isc_event_t));
2604	if (disp->ctlevent == NULL) {
2605		result = ISC_R_NOMEMORY;
2606		goto kill_task;
2607	}
2608
2609	isc_task_setname(disp->task[0], "tcpdispatch", disp);
2610
2611	dns_tcpmsg_init(mgr->mctx, disp->socket, &disp->tcpmsg);
2612	disp->tcpmsg_valid = 1;
2613
2614	disp->attributes = attributes;
2615
2616	/*
2617	 * Append it to the dispatcher list.
2618	 */
2619	ISC_LIST_APPEND(mgr->list, disp, link);
2620	UNLOCK(&mgr->lock);
2621
2622	mgr_log(mgr, LVL(90), "created TCP dispatcher %p", disp);
2623	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]);
2624
2625	*dispp = disp;
2626
2627	return (ISC_R_SUCCESS);
2628
2629	/*
2630	 * Error returns.
2631	 */
2632 kill_task:
2633	isc_task_detach(&disp->task[0]);
2634 kill_socket:
2635	isc_socket_detach(&disp->socket);
2636 deallocate_dispatch:
2637	dispatch_free(&disp);
2638
2639	UNLOCK(&mgr->lock);
2640
2641	return (result);
2642}
2643
2644isc_result_t
2645dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2646		    isc_taskmgr_t *taskmgr, isc_sockaddr_t *localaddr,
2647		    unsigned int buffersize,
2648		    unsigned int maxbuffers, unsigned int maxrequests,
2649		    unsigned int buckets, unsigned int increment,
2650		    unsigned int attributes, unsigned int mask,
2651		    dns_dispatch_t **dispp)
2652{
2653	isc_result_t result;
2654	dns_dispatch_t *disp = NULL;
2655
2656	REQUIRE(VALID_DISPATCHMGR(mgr));
2657	REQUIRE(sockmgr != NULL);
2658	REQUIRE(localaddr != NULL);
2659	REQUIRE(taskmgr != NULL);
2660	REQUIRE(buffersize >= 512 && buffersize < (64 * 1024));
2661	REQUIRE(maxbuffers > 0);
2662	REQUIRE(buckets < 2097169);  /* next prime > 65536 * 32 */
2663	REQUIRE(increment > buckets);
2664	REQUIRE(dispp != NULL && *dispp == NULL);
2665	REQUIRE((attributes & DNS_DISPATCHATTR_TCP) == 0);
2666
2667	result = dns_dispatchmgr_setudp(mgr, buffersize, maxbuffers,
2668					maxrequests, buckets, increment);
2669	if (result != ISC_R_SUCCESS)
2670		return (result);
2671
2672	LOCK(&mgr->lock);
2673
2674	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
2675		REQUIRE(isc_sockaddr_getport(localaddr) == 0);
2676		goto createudp;
2677	}
2678
2679	/*
2680	 * See if we have a dispatcher that matches.
2681	 */
2682	result = dispatch_find(mgr, localaddr, attributes, mask, &disp);
2683	if (result == ISC_R_SUCCESS) {
2684		disp->refcount++;
2685
2686		if (disp->maxrequests < maxrequests)
2687			disp->maxrequests = maxrequests;
2688
2689		if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) == 0 &&
2690		    (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
2691		{
2692			disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
2693			if (disp->recv_pending != 0)
2694				isc_socket_cancel(disp->socket, disp->task[0],
2695						  ISC_SOCKCANCEL_RECV);
2696		}
2697
2698		UNLOCK(&disp->lock);
2699		UNLOCK(&mgr->lock);
2700
2701		*dispp = disp;
2702
2703		return (ISC_R_SUCCESS);
2704	}
2705
2706 createudp:
2707	/*
2708	 * Nope, create one.
2709	 */
2710	result = dispatch_createudp(mgr, sockmgr, taskmgr, localaddr,
2711				    maxrequests, attributes, &disp);
2712	if (result != ISC_R_SUCCESS) {
2713		UNLOCK(&mgr->lock);
2714		return (result);
2715	}
2716
2717	UNLOCK(&mgr->lock);
2718	*dispp = disp;
2719	return (ISC_R_SUCCESS);
2720}
2721
2722/*
2723 * mgr should be locked.
2724 */
2725
2726#ifndef DNS_DISPATCH_HELD
2727#define DNS_DISPATCH_HELD 20U
2728#endif
2729
2730static isc_result_t
2731get_udpsocket(dns_dispatchmgr_t *mgr, dns_dispatch_t *disp,
2732	      isc_socketmgr_t *sockmgr, isc_sockaddr_t *localaddr,
2733	      isc_socket_t **sockp)
2734{
2735	unsigned int i, j;
2736	isc_socket_t *held[DNS_DISPATCH_HELD];
2737	isc_sockaddr_t localaddr_bound;
2738	isc_socket_t *sock = NULL;
2739	isc_result_t result = ISC_R_SUCCESS;
2740	isc_boolean_t anyport;
2741
2742	INSIST(sockp != NULL && *sockp == NULL);
2743
2744	localaddr_bound = *localaddr;
2745	anyport = ISC_TF(isc_sockaddr_getport(localaddr) == 0);
2746
2747	if (anyport) {
2748		unsigned int nports;
2749		in_port_t *ports;
2750
2751		/*
2752		 * If no port is specified, we first try to pick up a random
2753		 * port by ourselves.
2754		 */
2755		if (isc_sockaddr_pf(&disp->local) == AF_INET) {
2756			nports = disp->mgr->nv4ports;
2757			ports = disp->mgr->v4ports;
2758		} else {
2759			nports = disp->mgr->nv6ports;
2760			ports = disp->mgr->v6ports;
2761		}
2762		if (nports == 0)
2763			return (ISC_R_ADDRNOTAVAIL);
2764
2765		for (i = 0; i < 1024; i++) {
2766			in_port_t prt;
2767
2768			prt = ports[dispatch_uniformrandom(
2769					DISP_ARC4CTX(disp),
2770					nports)];
2771			isc_sockaddr_setport(&localaddr_bound, prt);
2772			result = open_socket(sockmgr, &localaddr_bound,
2773					     0, &sock);
2774			if (result == ISC_R_SUCCESS ||
2775			    result != ISC_R_ADDRINUSE) {
2776				disp->localport = prt;
2777				*sockp = sock;
2778				return (result);
2779			}
2780		}
2781
2782		/*
2783		 * If this fails 1024 times, we then ask the kernel for
2784		 * choosing one.
2785		 */
2786	} else {
2787		/* Allow to reuse address for non-random ports. */
2788		result = open_socket(sockmgr, localaddr,
2789				     ISC_SOCKET_REUSEADDRESS, &sock);
2790
2791		if (result == ISC_R_SUCCESS)
2792			*sockp = sock;
2793
2794		return (result);
2795	}
2796
2797	memset(held, 0, sizeof(held));
2798	i = 0;
2799
2800	for (j = 0; j < 0xffffU; j++) {
2801		result = open_socket(sockmgr, localaddr, 0, &sock);
2802		if (result != ISC_R_SUCCESS)
2803			goto end;
2804		else if (!anyport)
2805			break;
2806		else if (portavailable(mgr, sock, NULL))
2807			break;
2808		if (held[i] != NULL)
2809			isc_socket_detach(&held[i]);
2810		held[i++] = sock;
2811		sock = NULL;
2812		if (i == DNS_DISPATCH_HELD)
2813			i = 0;
2814	}
2815	if (j == 0xffffU) {
2816		mgr_log(mgr, ISC_LOG_ERROR,
2817			"avoid-v%s-udp-ports: unable to allocate "
2818			"an available port",
2819			isc_sockaddr_pf(localaddr) == AF_INET ? "4" : "6");
2820		result = ISC_R_FAILURE;
2821		goto end;
2822	}
2823	*sockp = sock;
2824
2825end:
2826	for (i = 0; i < DNS_DISPATCH_HELD; i++) {
2827		if (held[i] != NULL)
2828			isc_socket_detach(&held[i]);
2829	}
2830
2831	return (result);
2832}
2833
2834static isc_result_t
2835dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
2836		   isc_taskmgr_t *taskmgr,
2837		   isc_sockaddr_t *localaddr,
2838		   unsigned int maxrequests,
2839		   unsigned int attributes,
2840		   dns_dispatch_t **dispp)
2841{
2842	isc_result_t result;
2843	dns_dispatch_t *disp;
2844	isc_socket_t *sock = NULL;
2845	int i = 0;
2846
2847	/*
2848	 * dispatch_allocate() checks mgr for us.
2849	 */
2850	disp = NULL;
2851	result = dispatch_allocate(mgr, maxrequests, &disp);
2852	if (result != ISC_R_SUCCESS)
2853		return (result);
2854
2855	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0) {
2856		result = get_udpsocket(mgr, disp, sockmgr, localaddr, &sock);
2857		if (result != ISC_R_SUCCESS)
2858			goto deallocate_dispatch;
2859	} else {
2860		isc_sockaddr_t sa_any;
2861
2862		/*
2863		 * For dispatches using exclusive sockets with a specific
2864		 * source address, we only check if the specified address is
2865		 * available on the system.  Query sockets will be created later
2866		 * on demand.
2867		 */
2868		isc_sockaddr_anyofpf(&sa_any, isc_sockaddr_pf(localaddr));
2869		if (!isc_sockaddr_eqaddr(&sa_any, localaddr)) {
2870			result = open_socket(sockmgr, localaddr, 0, &sock);
2871			if (sock != NULL)
2872				isc_socket_detach(&sock);
2873			if (result != ISC_R_SUCCESS)
2874				goto deallocate_dispatch;
2875		}
2876
2877		disp->port_table = isc_mem_get(mgr->mctx,
2878					       sizeof(disp->port_table[0]) *
2879					       DNS_DISPATCH_PORTTABLESIZE);
2880		if (disp->port_table == NULL)
2881			goto deallocate_dispatch;
2882		for (i = 0; i < DNS_DISPATCH_PORTTABLESIZE; i++)
2883			ISC_LIST_INIT(disp->port_table[i]);
2884
2885		result = isc_mempool_create(mgr->mctx, sizeof(dispportentry_t),
2886					    &disp->portpool);
2887		if (result != ISC_R_SUCCESS)
2888			goto deallocate_dispatch;
2889		isc_mempool_setname(disp->portpool, "disp_portpool");
2890		isc_mempool_setfreemax(disp->portpool, 128);
2891	}
2892	disp->socktype = isc_sockettype_udp;
2893	disp->socket = sock;
2894	disp->local = *localaddr;
2895
2896	if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
2897		disp->ntasks = MAX_INTERNAL_TASKS;
2898	else
2899		disp->ntasks = 1;
2900	for (i = 0; i < disp->ntasks; i++) {
2901		disp->task[i] = NULL;
2902		result = isc_task_create(taskmgr, 0, &disp->task[i]);
2903		if (result != ISC_R_SUCCESS) {
2904			while (--i >= 0) {
2905				isc_task_shutdown(disp->task[i]);
2906				isc_task_detach(&disp->task[i]);
2907			}
2908			goto kill_socket;
2909		}
2910		isc_task_setname(disp->task[i], "udpdispatch", disp);
2911	}
2912
2913	disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
2914					    DNS_EVENT_DISPATCHCONTROL,
2915					    destroy_disp, disp,
2916					    sizeof(isc_event_t));
2917	if (disp->ctlevent == NULL) {
2918		result = ISC_R_NOMEMORY;
2919		goto kill_task;
2920	}
2921
2922	attributes &= ~DNS_DISPATCHATTR_TCP;
2923	attributes |= DNS_DISPATCHATTR_UDP;
2924	disp->attributes = attributes;
2925
2926	/*
2927	 * Append it to the dispatcher list.
2928	 */
2929	ISC_LIST_APPEND(mgr->list, disp, link);
2930
2931	mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp);
2932	dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); /* XXX */
2933	if (disp->socket != NULL)
2934		dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
2935
2936	*dispp = disp;
2937	return (result);
2938
2939	/*
2940	 * Error returns.
2941	 */
2942 kill_task:
2943	for (i = 0; i < disp->ntasks; i++)
2944		isc_task_detach(&disp->task[i]);
2945 kill_socket:
2946	if (disp->socket != NULL)
2947		isc_socket_detach(&disp->socket);
2948 deallocate_dispatch:
2949	dispatch_free(&disp);
2950
2951	return (result);
2952}
2953
2954void
2955dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp) {
2956	REQUIRE(VALID_DISPATCH(disp));
2957	REQUIRE(dispp != NULL && *dispp == NULL);
2958
2959	LOCK(&disp->lock);
2960	disp->refcount++;
2961	UNLOCK(&disp->lock);
2962
2963	*dispp = disp;
2964}
2965
2966/*
2967 * It is important to lock the manager while we are deleting the dispatch,
2968 * since dns_dispatch_getudp will call dispatch_find, which returns to
2969 * the caller a dispatch but does not attach to it until later.  _getudp
2970 * locks the manager, however, so locking it here will keep us from attaching
2971 * to a dispatcher that is in the process of going away.
2972 */
2973void
2974dns_dispatch_detach(dns_dispatch_t **dispp) {
2975	dns_dispatch_t *disp;
2976	dispsocket_t *dispsock;
2977	isc_boolean_t killit;
2978
2979	REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp));
2980
2981	disp = *dispp;
2982	*dispp = NULL;
2983
2984	LOCK(&disp->lock);
2985
2986	INSIST(disp->refcount > 0);
2987	disp->refcount--;
2988	killit = ISC_FALSE;
2989	if (disp->refcount == 0) {
2990		if (disp->recv_pending > 0)
2991			isc_socket_cancel(disp->socket, disp->task[0],
2992					  ISC_SOCKCANCEL_RECV);
2993		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
2994		     dispsock != NULL;
2995		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
2996			isc_socket_cancel(dispsock->socket, dispsock->task,
2997					  ISC_SOCKCANCEL_RECV);
2998		}
2999		disp->shutting_down = 1;
3000	}
3001
3002	dispatch_log(disp, LVL(90), "detach: refcount %d", disp->refcount);
3003
3004	killit = destroy_disp_ok(disp);
3005	UNLOCK(&disp->lock);
3006	if (killit)
3007		isc_task_send(disp->task[0], &disp->ctlevent);
3008}
3009
3010isc_result_t
3011dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
3012			  isc_task_t *task, isc_taskaction_t action, void *arg,
3013			  dns_messageid_t *idp, dns_dispentry_t **resp,
3014			  isc_socketmgr_t *sockmgr)
3015{
3016	dns_dispentry_t *res;
3017	unsigned int bucket;
3018	in_port_t localport = 0;
3019	dns_messageid_t id;
3020	int i;
3021	isc_boolean_t ok;
3022	dns_qid_t *qid;
3023	dispsocket_t *dispsocket = NULL;
3024	isc_result_t result;
3025
3026	REQUIRE(VALID_DISPATCH(disp));
3027	REQUIRE(task != NULL);
3028	REQUIRE(dest != NULL);
3029	REQUIRE(resp != NULL && *resp == NULL);
3030	REQUIRE(idp != NULL);
3031	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3032		REQUIRE(sockmgr != NULL);
3033
3034	LOCK(&disp->lock);
3035
3036	if (disp->shutting_down == 1) {
3037		UNLOCK(&disp->lock);
3038		return (ISC_R_SHUTTINGDOWN);
3039	}
3040
3041	if (disp->requests >= disp->maxrequests) {
3042		UNLOCK(&disp->lock);
3043		return (ISC_R_QUOTA);
3044	}
3045
3046	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
3047	    disp->nsockets > DNS_DISPATCH_SOCKSQUOTA) {
3048		dispsocket_t *oldestsocket;
3049		dns_dispentry_t *oldestresp;
3050		dns_dispatchevent_t *rev;
3051
3052		/*
3053		 * Kill oldest outstanding query if the number of sockets
3054		 * exceeds the quota to keep the room for new queries.
3055		 */
3056		oldestsocket = ISC_LIST_HEAD(disp->activesockets);
3057		oldestresp = oldestsocket->resp;
3058		if (oldestresp != NULL && !oldestresp->item_out) {
3059			rev = allocate_event(oldestresp->disp);
3060			if (rev != NULL) {
3061				rev->buffer.base = NULL;
3062				rev->result = ISC_R_CANCELED;
3063				rev->id = oldestresp->id;
3064				ISC_EVENT_INIT(rev, sizeof(*rev), 0,
3065					       NULL, DNS_EVENT_DISPATCH,
3066					       oldestresp->action,
3067					       oldestresp->arg, oldestresp,
3068					       NULL, NULL);
3069				oldestresp->item_out = ISC_TRUE;
3070				isc_task_send(oldestresp->task,
3071					      ISC_EVENT_PTR(&rev));
3072				inc_stats(disp->mgr,
3073					  dns_resstatscounter_dispabort);
3074			}
3075		}
3076
3077		/*
3078		 * Move this entry to the tail so that it won't (easily) be
3079		 * examined before actually being canceled.
3080		 */
3081		ISC_LIST_UNLINK(disp->activesockets, oldestsocket, link);
3082		ISC_LIST_APPEND(disp->activesockets, oldestsocket, link);
3083	}
3084
3085	qid = DNS_QID(disp);
3086	LOCK(&qid->lock);
3087
3088	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
3089		/*
3090		 * Get a separate UDP socket with a random port number.
3091		 */
3092		result = get_dispsocket(disp, dest, sockmgr, qid, &dispsocket,
3093					&localport);
3094		if (result != ISC_R_SUCCESS) {
3095			UNLOCK(&qid->lock);
3096			UNLOCK(&disp->lock);
3097			inc_stats(disp->mgr, dns_resstatscounter_dispsockfail);
3098			return (result);
3099		}
3100	} else {
3101		localport = disp->localport;
3102	}
3103
3104	/*
3105	 * Try somewhat hard to find an unique ID.
3106	 */
3107	id = (dns_messageid_t)dispatch_random(DISP_ARC4CTX(disp));
3108	bucket = dns_hash(qid, dest, id, localport);
3109	ok = ISC_FALSE;
3110	for (i = 0; i < 64; i++) {
3111		if (entry_search(qid, dest, id, localport, bucket) == NULL) {
3112			ok = ISC_TRUE;
3113			break;
3114		}
3115		id += qid->qid_increment;
3116		id &= 0x0000ffff;
3117		bucket = dns_hash(qid, dest, id, localport);
3118	}
3119
3120	if (!ok) {
3121		UNLOCK(&qid->lock);
3122		UNLOCK(&disp->lock);
3123		return (ISC_R_NOMORE);
3124	}
3125
3126	res = isc_mempool_get(disp->mgr->rpool);
3127	if (res == NULL) {
3128		UNLOCK(&qid->lock);
3129		UNLOCK(&disp->lock);
3130		if (dispsocket != NULL)
3131			destroy_dispsocket(disp, &dispsocket);
3132		return (ISC_R_NOMEMORY);
3133	}
3134
3135	disp->refcount++;
3136	disp->requests++;
3137	res->task = NULL;
3138	isc_task_attach(task, &res->task);
3139	res->disp = disp;
3140	res->id = id;
3141	res->port = localport;
3142	res->bucket = bucket;
3143	res->host = *dest;
3144	res->action = action;
3145	res->arg = arg;
3146	res->dispsocket = dispsocket;
3147	if (dispsocket != NULL)
3148		dispsocket->resp = res;
3149	res->item_out = ISC_FALSE;
3150	ISC_LIST_INIT(res->items);
3151	ISC_LINK_INIT(res, link);
3152	res->magic = RESPONSE_MAGIC;
3153	ISC_LIST_APPEND(qid->qid_table[bucket], res, link);
3154	UNLOCK(&qid->lock);
3155
3156	request_log(disp, res, LVL(90),
3157		    "attached to task %p", res->task);
3158
3159	if (((disp->attributes & DNS_DISPATCHATTR_UDP) != 0) ||
3160	    ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) {
3161		result = startrecv(disp, dispsocket);
3162		if (result != ISC_R_SUCCESS) {
3163			LOCK(&qid->lock);
3164			ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
3165			UNLOCK(&qid->lock);
3166
3167			if (dispsocket != NULL)
3168				destroy_dispsocket(disp, &dispsocket);
3169
3170			disp->refcount--;
3171			disp->requests--;
3172
3173			UNLOCK(&disp->lock);
3174			isc_task_detach(&res->task);
3175			isc_mempool_put(disp->mgr->rpool, res);
3176			return (result);
3177		}
3178	}
3179
3180	if (dispsocket != NULL)
3181		ISC_LIST_APPEND(disp->activesockets, dispsocket, link);
3182
3183	UNLOCK(&disp->lock);
3184
3185	*idp = id;
3186	*resp = res;
3187
3188	if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
3189		INSIST(res->dispsocket != NULL);
3190
3191	return (ISC_R_SUCCESS);
3192}
3193
3194isc_result_t
3195dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
3196			 isc_task_t *task, isc_taskaction_t action, void *arg,
3197			 dns_messageid_t *idp, dns_dispentry_t **resp)
3198{
3199	REQUIRE(VALID_DISPATCH(disp));
3200	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3201
3202	return (dns_dispatch_addresponse2(disp, dest, task, action, arg,
3203					  idp, resp, NULL));
3204}
3205
3206void
3207dns_dispatch_starttcp(dns_dispatch_t *disp) {
3208
3209	REQUIRE(VALID_DISPATCH(disp));
3210
3211	dispatch_log(disp, LVL(90), "starttcp %p", disp->task[0]);
3212
3213	LOCK(&disp->lock);
3214	disp->attributes |= DNS_DISPATCHATTR_CONNECTED;
3215	(void)startrecv(disp, NULL);
3216	UNLOCK(&disp->lock);
3217}
3218
3219void
3220dns_dispatch_removeresponse(dns_dispentry_t **resp,
3221			    dns_dispatchevent_t **sockevent)
3222{
3223	dns_dispatchmgr_t *mgr;
3224	dns_dispatch_t *disp;
3225	dns_dispentry_t *res;
3226	dispsocket_t *dispsock;
3227	dns_dispatchevent_t *ev;
3228	unsigned int bucket;
3229	isc_boolean_t killit;
3230	unsigned int n;
3231	isc_eventlist_t events;
3232	dns_qid_t *qid;
3233
3234	REQUIRE(resp != NULL);
3235	REQUIRE(VALID_RESPONSE(*resp));
3236
3237	res = *resp;
3238	*resp = NULL;
3239
3240	disp = res->disp;
3241	REQUIRE(VALID_DISPATCH(disp));
3242	mgr = disp->mgr;
3243	REQUIRE(VALID_DISPATCHMGR(mgr));
3244
3245	qid = DNS_QID(disp);
3246
3247	if (sockevent != NULL) {
3248		REQUIRE(*sockevent != NULL);
3249		ev = *sockevent;
3250		*sockevent = NULL;
3251	} else {
3252		ev = NULL;
3253	}
3254
3255	LOCK(&disp->lock);
3256
3257	INSIST(disp->requests > 0);
3258	disp->requests--;
3259	INSIST(disp->refcount > 0);
3260	disp->refcount--;
3261	killit = ISC_FALSE;
3262	if (disp->refcount == 0) {
3263		if (disp->recv_pending > 0)
3264			isc_socket_cancel(disp->socket, disp->task[0],
3265					  ISC_SOCKCANCEL_RECV);
3266		for (dispsock = ISC_LIST_HEAD(disp->activesockets);
3267		     dispsock != NULL;
3268		     dispsock = ISC_LIST_NEXT(dispsock, link)) {
3269			isc_socket_cancel(dispsock->socket, dispsock->task,
3270					  ISC_SOCKCANCEL_RECV);
3271		}
3272		disp->shutting_down = 1;
3273	}
3274
3275	bucket = res->bucket;
3276
3277	LOCK(&qid->lock);
3278	ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
3279	UNLOCK(&qid->lock);
3280
3281	if (ev == NULL && res->item_out) {
3282		/*
3283		 * We've posted our event, but the caller hasn't gotten it
3284		 * yet.  Take it back.
3285		 */
3286		ISC_LIST_INIT(events);
3287		n = isc_task_unsend(res->task, res, DNS_EVENT_DISPATCH,
3288				    NULL, &events);
3289		/*
3290		 * We had better have gotten it back.
3291		 */
3292		INSIST(n == 1);
3293		ev = (dns_dispatchevent_t *)ISC_LIST_HEAD(events);
3294	}
3295
3296	if (ev != NULL) {
3297		REQUIRE(res->item_out == ISC_TRUE);
3298		res->item_out = ISC_FALSE;
3299		if (ev->buffer.base != NULL)
3300			free_buffer(disp, ev->buffer.base, ev->buffer.length);
3301		free_event(disp, ev);
3302	}
3303
3304	request_log(disp, res, LVL(90), "detaching from task %p", res->task);
3305	isc_task_detach(&res->task);
3306
3307	if (res->dispsocket != NULL) {
3308		isc_socket_cancel(res->dispsocket->socket,
3309				  res->dispsocket->task, ISC_SOCKCANCEL_RECV);
3310		res->dispsocket->resp = NULL;
3311	}
3312
3313	/*
3314	 * Free any buffered requests as well
3315	 */
3316	ev = ISC_LIST_HEAD(res->items);
3317	while (ev != NULL) {
3318		ISC_LIST_UNLINK(res->items, ev, ev_link);
3319		if (ev->buffer.base != NULL)
3320			free_buffer(disp, ev->buffer.base, ev->buffer.length);
3321		free_event(disp, ev);
3322		ev = ISC_LIST_HEAD(res->items);
3323	}
3324	res->magic = 0;
3325	isc_mempool_put(disp->mgr->rpool, res);
3326	if (disp->shutting_down == 1)
3327		do_cancel(disp);
3328	else
3329		(void)startrecv(disp, NULL);
3330
3331	killit = destroy_disp_ok(disp);
3332	UNLOCK(&disp->lock);
3333	if (killit)
3334		isc_task_send(disp->task[0], &disp->ctlevent);
3335}
3336
3337static void
3338do_cancel(dns_dispatch_t *disp) {
3339	dns_dispatchevent_t *ev;
3340	dns_dispentry_t *resp;
3341	dns_qid_t *qid;
3342
3343	if (disp->shutdown_out == 1)
3344		return;
3345
3346	qid = DNS_QID(disp);
3347
3348	/*
3349	 * Search for the first response handler without packets outstanding
3350	 * unless a specific hander is given.
3351	 */
3352	LOCK(&qid->lock);
3353	for (resp = linear_first(qid);
3354	     resp != NULL && resp->item_out;
3355	     /* Empty. */)
3356		resp = linear_next(qid, resp);
3357
3358	/*
3359	 * No one to send the cancel event to, so nothing to do.
3360	 */
3361	if (resp == NULL)
3362		goto unlock;
3363
3364	/*
3365	 * Send the shutdown failsafe event to this resp.
3366	 */
3367	ev = disp->failsafe_ev;
3368	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, DNS_EVENT_DISPATCH,
3369		       resp->action, resp->arg, resp, NULL, NULL);
3370	ev->result = disp->shutdown_why;
3371	ev->buffer.base = NULL;
3372	ev->buffer.length = 0;
3373	disp->shutdown_out = 1;
3374	request_log(disp, resp, LVL(10),
3375		    "cancel: failsafe event %p -> task %p",
3376		    ev, resp->task);
3377	resp->item_out = ISC_TRUE;
3378	isc_task_send(resp->task, ISC_EVENT_PTR(&ev));
3379 unlock:
3380	UNLOCK(&qid->lock);
3381}
3382
3383isc_socket_t *
3384dns_dispatch_getsocket(dns_dispatch_t *disp) {
3385	REQUIRE(VALID_DISPATCH(disp));
3386
3387	return (disp->socket);
3388}
3389
3390isc_socket_t *
3391dns_dispatch_getentrysocket(dns_dispentry_t *resp) {
3392	REQUIRE(VALID_RESPONSE(resp));
3393
3394	if (resp->dispsocket != NULL)
3395		return (resp->dispsocket->socket);
3396	else
3397		return (NULL);
3398}
3399
3400isc_result_t
3401dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) {
3402
3403	REQUIRE(VALID_DISPATCH(disp));
3404	REQUIRE(addrp != NULL);
3405
3406	if (disp->socktype == isc_sockettype_udp) {
3407		*addrp = disp->local;
3408		return (ISC_R_SUCCESS);
3409	}
3410	return (ISC_R_NOTIMPLEMENTED);
3411}
3412
3413void
3414dns_dispatch_cancel(dns_dispatch_t *disp) {
3415	REQUIRE(VALID_DISPATCH(disp));
3416
3417	LOCK(&disp->lock);
3418
3419	if (disp->shutting_down == 1) {
3420		UNLOCK(&disp->lock);
3421		return;
3422	}
3423
3424	disp->shutdown_why = ISC_R_CANCELED;
3425	disp->shutting_down = 1;
3426	do_cancel(disp);
3427
3428	UNLOCK(&disp->lock);
3429
3430	return;
3431}
3432
3433unsigned int
3434dns_dispatch_getattributes(dns_dispatch_t *disp) {
3435	REQUIRE(VALID_DISPATCH(disp));
3436
3437	/*
3438	 * We don't bother locking disp here; it's the caller's responsibility
3439	 * to use only non volatile flags.
3440	 */
3441	return (disp->attributes);
3442}
3443
3444void
3445dns_dispatch_changeattributes(dns_dispatch_t *disp,
3446			      unsigned int attributes, unsigned int mask)
3447{
3448	REQUIRE(VALID_DISPATCH(disp));
3449	/* Exclusive attribute can only be set on creation */
3450	REQUIRE((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
3451	/* Also, a dispatch with randomport specified cannot start listening */
3452	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0 ||
3453		(attributes & DNS_DISPATCHATTR_NOLISTEN) == 0);
3454
3455	/* XXXMLG
3456	 * Should check for valid attributes here!
3457	 */
3458
3459	LOCK(&disp->lock);
3460
3461	if ((mask & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3462		if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0 &&
3463		    (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0) {
3464			disp->attributes &= ~DNS_DISPATCHATTR_NOLISTEN;
3465			(void)startrecv(disp, NULL);
3466		} else if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN)
3467			   == 0 &&
3468			   (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) {
3469			disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
3470			if (disp->recv_pending != 0)
3471				isc_socket_cancel(disp->socket, disp->task[0],
3472						  ISC_SOCKCANCEL_RECV);
3473		}
3474	}
3475
3476	disp->attributes &= ~mask;
3477	disp->attributes |= (attributes & mask);
3478	UNLOCK(&disp->lock);
3479}
3480
3481void
3482dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) {
3483	void *buf;
3484	isc_socketevent_t *sevent, *newsevent;
3485
3486	REQUIRE(VALID_DISPATCH(disp));
3487	REQUIRE((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0);
3488	REQUIRE(event != NULL);
3489
3490	sevent = (isc_socketevent_t *)event;
3491
3492	INSIST(sevent->n <= disp->mgr->buffersize);
3493	newsevent = (isc_socketevent_t *)
3494		    isc_event_allocate(disp->mgr->mctx, NULL,
3495				      DNS_EVENT_IMPORTRECVDONE, udp_shrecv,
3496				      disp, sizeof(isc_socketevent_t));
3497	if (newsevent == NULL)
3498		return;
3499
3500	buf = allocate_udp_buffer(disp);
3501	if (buf == NULL) {
3502		isc_event_free(ISC_EVENT_PTR(&newsevent));
3503		return;
3504	}
3505	memcpy(buf, sevent->region.base, sevent->n);
3506	newsevent->region.base = buf;
3507	newsevent->region.length = disp->mgr->buffersize;
3508	newsevent->n = sevent->n;
3509	newsevent->result = sevent->result;
3510	newsevent->address = sevent->address;
3511	newsevent->timestamp = sevent->timestamp;
3512	newsevent->pktinfo = sevent->pktinfo;
3513	newsevent->attributes = sevent->attributes;
3514
3515	isc_task_send(disp->task[0], ISC_EVENT_PTR(&newsevent));
3516}
3517
#if 0
/*
 * Debugging aid, currently compiled out: print every dispatch on the
 * manager's list together with its formatted local address.
 */
void
dns_dispatchmgr_dump(dns_dispatchmgr_t *mgr) {
	dns_dispatch_t *disp;
	char addrtext[1024];

	for (disp = ISC_LIST_HEAD(mgr->list);
	     disp != NULL;
	     disp = ISC_LIST_NEXT(disp, link)) {
		isc_sockaddr_format(&disp->local, addrtext, sizeof(addrtext));
		printf("\tdispatch %p, addr %s\n", disp, addrtext);
	}
}
#endif
3532