1/*
2 * server.c -- nsd(8) network input/output
3 *
4 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5 *
6 * See LICENSE for the license.
7 *
8 */
9
10#include "config.h"
11
12#include <sys/types.h>
13#include <sys/param.h>
14#include <limits.h>
15#include <sys/socket.h>
16#include <sys/uio.h>
17#include <sys/wait.h>
18
19#include <netinet/in.h>
20#ifdef USE_TCP_FASTOPEN
21  #include <netinet/tcp.h>
22#endif
23#include <arpa/inet.h>
24
25#include <assert.h>
26#include <ctype.h>
27#include <errno.h>
28#include <fcntl.h>
29#include <stddef.h>
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <time.h>
34#include <unistd.h>
35#include <signal.h>
36#include <netdb.h>
37#include <poll.h>
38#ifdef HAVE_SYS_RANDOM_H
39#include <sys/random.h>
40#endif
41#ifndef SHUT_WR
42#define SHUT_WR 1
43#endif
44#ifdef HAVE_MMAP
45#include <sys/mman.h>
46#endif /* HAVE_MMAP */
47#ifdef HAVE_OPENSSL_RAND_H
48#include <openssl/rand.h>
49#endif
50#ifdef HAVE_OPENSSL_SSL_H
51#include <openssl/ssl.h>
52#endif
53#ifdef HAVE_OPENSSL_ERR_H
54#include <openssl/err.h>
55#endif
56#ifdef HAVE_OPENSSL_OCSP_H
57#include <openssl/ocsp.h>
58#endif
59#ifndef USE_MINI_EVENT
60#  ifdef HAVE_EVENT_H
61#    include <event.h>
62#  else
63#    include <event2/event.h>
64#    include "event2/event_struct.h"
65#    include "event2/event_compat.h"
66#  endif
67#else
68#  include "mini_event.h"
69#endif
70
71#include "axfr.h"
72#include "namedb.h"
73#include "netio.h"
74#include "xfrd.h"
75#include "xfrd-tcp.h"
76#include "xfrd-disk.h"
77#include "difffile.h"
78#include "nsec3.h"
79#include "ipc.h"
80#include "udb.h"
81#include "remote.h"
82#include "lookup3.h"
83#include "rrl.h"
84#include "ixfr.h"
85#ifdef USE_DNSTAP
86#include "dnstap/dnstap_collector.h"
87#endif
88#include "verify.h"
89#include "util/proxy_protocol.h"
90
91#define RELOAD_SYNC_TIMEOUT 25 /* seconds */
92
93#ifdef USE_DNSTAP
94/*
95 * log_addr() - the function to print sockaddr_in/sockaddr_in6 structures content
96 * just like its done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
97 */
98static void
99log_addr(const char* descr,
100#ifdef INET6
101	struct sockaddr_storage* addr
102#else
103	struct sockaddr_in* addr
104#endif
105	)
106{
107	char str_buf[64];
108	if(verbosity < 6)
109		return;
110	if(
111#ifdef INET6
112		addr->ss_family == AF_INET
113#else
114		addr->sin_family == AF_INET
115#endif
116		) {
117		struct sockaddr_in* s = (struct sockaddr_in*)addr;
118		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
119		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
120#ifdef INET6
121	} else {
122		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
123		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
124		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
125#endif
126	}
127}
128#endif /* USE_DNSTAP */
129
130#ifdef USE_TCP_FASTOPEN
131  #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
132  #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
133#endif
134
135/* header state for the PROXYv2 header (for TCP) */
136enum pp2_header_state {
137	/* no header encounter yet */
138	pp2_header_none = 0,
139	/* read the static part of the header */
140	pp2_header_init,
141	/* read the full header */
142	pp2_header_done
143};
144
145/*
146 * Data for the UDP handlers.
147 */
148struct udp_handler_data
149{
150	struct nsd        *nsd;
151	struct nsd_socket *socket;
152	struct event       event;
153	/* if set, PROXYv2 is expected on this connection */
154	int pp2_enabled;
155};
156
157struct tcp_accept_handler_data {
158	struct nsd        *nsd;
159	struct nsd_socket *socket;
160	int                event_added;
161	struct event       event;
162#ifdef HAVE_SSL
163	/* handler accepts TLS connections on the dedicated port */
164	int                tls_accept;
165#endif
166	/* if set, PROXYv2 is expected on this connection */
167	int pp2_enabled;
168};
169
170/*
171 * These globals are used to enable the TCP accept handlers
172 * when the number of TCP connection drops below the maximum
173 * number of TCP connections.
174 */
175static size_t tcp_accept_handler_count;
176static struct tcp_accept_handler_data *tcp_accept_handlers;
177
178static struct event slowaccept_event;
179static int slowaccept;
180
181#ifdef HAVE_SSL
182static unsigned char *ocspdata = NULL;
183static long ocspdata_len = 0;
184#endif
185
186#ifdef NONBLOCKING_IS_BROKEN
187/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
188   read multiple times from a socket when reported ready by select. */
189# define NUM_RECV_PER_SELECT (1)
190#else /* !NONBLOCKING_IS_BROKEN */
191# define NUM_RECV_PER_SELECT (100)
192#endif /* NONBLOCKING_IS_BROKEN */
193
194#ifndef HAVE_MMSGHDR
195struct mmsghdr {
196	struct msghdr msg_hdr;
197	unsigned int  msg_len;
198};
199#endif
200
201static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
202static struct iovec iovecs[NUM_RECV_PER_SELECT];
203static struct query *queries[NUM_RECV_PER_SELECT];
204
205/*
206 * Data for the TCP connection handlers.
207 *
208 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
209 * blocking the entire server on a slow TCP connection, but does make
210 * reading from and writing to the socket more complicated.
211 *
212 * Basically, whenever a read/write would block (indicated by the
213 * EAGAIN errno variable) we remember the position we were reading
214 * from/writing to and return from the TCP reading/writing event
215 * handler.  When the socket becomes readable/writable again we
216 * continue from the same position.
217 */
218struct tcp_handler_data
219{
220	/*
221	 * The region used to allocate all TCP connection related
222	 * data, including this structure.  This region is destroyed
223	 * when the connection is closed.
224	 */
225	region_type*		region;
226
227	/*
228	 * The global nsd structure.
229	 */
230	struct nsd*			nsd;
231
232	/*
233	 * The current query data for this TCP connection.
234	 */
235	query_type*			query;
236
237	/*
238	 * The query_state is used to remember if we are performing an
239	 * AXFR, if we're done processing, or if we should discard the
240	 * query and connection.
241	 */
242	query_state_type	query_state;
243
244	/*
245	 * The event for the file descriptor and tcp timeout
246	 */
247	struct event event;
248
249	/*
250	 * The bytes_transmitted field is used to remember the number
251	 * of bytes transmitted when receiving or sending a DNS
252	 * packet.  The count includes the two additional bytes used
253	 * to specify the packet length on a TCP connection.
254	 */
255	size_t				bytes_transmitted;
256
257	/* If the query is restarted and needs a reset */
258	int query_needs_reset;
259
260	/*
261	 * The number of queries handled by this specific TCP connection.
262	 */
263	int					query_count;
264
265	/*
266	 * The timeout in msec for this tcp connection
267	 */
268	int	tcp_timeout;
269
270	/*
271	 * If the connection is allowed to have further queries on it.
272	 */
273	int tcp_no_more_queries;
274
275#ifdef USE_DNSTAP
276	/* the socket of the accept socket to find proper service (local) address the socket is bound to. */
277	struct nsd_socket *socket;
278#endif /* USE_DNSTAP */
279
280	/* if set, PROXYv2 is expected on this connection */
281	int pp2_enabled;
282
283	/* header state for the PROXYv2 header (for TCP) */
284	enum pp2_header_state pp2_header_state;
285
286#ifdef HAVE_SSL
287	/*
288	 * TLS object.
289	 */
290	SSL* tls;
291
292	/*
293	 * TLS handshake state.
294	 */
295	enum { tls_hs_none, tls_hs_read, tls_hs_write,
296		tls_hs_read_event, tls_hs_write_event } shake_state;
297#endif
298	/* list of connections, for service of remaining tcp channels */
299	struct tcp_handler_data *prev, *next;
300};
301/* global that is the list of active tcp channels */
302static struct tcp_handler_data *tcp_active_list = NULL;
303
304/*
305 * Handle incoming queries on the UDP server sockets.
306 */
307static void handle_udp(int fd, short event, void* arg);
308
309/*
310 * Handle incoming connections on the TCP sockets.  These handlers
311 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
312 * connection) but are disabled when the number of current TCP
313 * connections is equal to the maximum number of TCP connections.
314 * Disabling is done by changing the handler to wait for the
315 * NETIO_EVENT_NONE type.  This is done using the function
316 * configure_tcp_accept_handlers.
317 */
318static void handle_tcp_accept(int fd, short event, void* arg);
319
320/*
321 * Handle incoming queries on a TCP connection.  The TCP connections
322 * are configured to be non-blocking and the handler may be called
323 * multiple times before a complete query is received.
324 */
325static void handle_tcp_reading(int fd, short event, void* arg);
326
327/*
328 * Handle outgoing responses on a TCP connection.  The TCP connections
329 * are configured to be non-blocking and the handler may be called
330 * multiple times before a complete response is sent.
331 */
332static void handle_tcp_writing(int fd, short event, void* arg);
333
334#ifdef HAVE_SSL
335/* Create SSL object and associate fd */
336static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
337/*
338 * Handle TLS handshake. May be called multiple times if incomplete.
339 */
340static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
341
342/*
343 * Handle incoming queries on a TLS over TCP connection.  The TLS
344 * connections are configured to be non-blocking and the handler may
345 * be called multiple times before a complete query is received.
346 */
347static void handle_tls_reading(int fd, short event, void* arg);
348
349/*
350 * Handle outgoing responses on a TLS over TCP connection.  The TLS
351 * connections are configured to be non-blocking and the handler may
352 * be called multiple times before a complete response is sent.
353 */
354static void handle_tls_writing(int fd, short event, void* arg);
355#endif
356
357/*
358 * Send all children the quit nonblocking, then close pipe.
359 */
360static void send_children_quit(struct nsd* nsd);
361/* same, for shutdown time, waits for child to exit to avoid restart issues */
362static void send_children_quit_and_wait(struct nsd* nsd);
363
364/* set childrens flags to send NSD_STATS to them */
365#ifdef BIND8_STATS
366static void set_children_stats(struct nsd* nsd);
367#endif /* BIND8_STATS */
368
369/*
370 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
371 */
372static void configure_handler_event_types(short event_types);
373
374static uint16_t *compressed_dname_offsets = 0;
375static uint32_t compression_table_capacity = 0;
376static uint32_t compression_table_size = 0;
377static domain_type* compressed_dnames[MAXRRSPP];
378
379#ifdef USE_TCP_FASTOPEN
380/* Checks to see if the kernel value must be manually changed in order for
381   TCP Fast Open to support server mode */
382static void report_tcp_fastopen_config() {
383
384	int tcp_fastopen_fp;
385	uint8_t tcp_fastopen_value;
386
387	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
388		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
389	}
390	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
391		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
392		close(tcp_fastopen_fp);
393	}
394	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
395		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
396		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
397		log_msg(LOG_WARNING, "To enable TFO use the command:");
398		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
399		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
400		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
401		close(tcp_fastopen_fp);
402	}
403	close(tcp_fastopen_fp);
404}
405#endif
406
407/*
408 * Remove the specified pid from the list of child pids.  Returns -1 if
409 * the pid is not in the list, child_num otherwise.  The field is set to 0.
410 */
411static int
412delete_child_pid(struct nsd *nsd, pid_t pid)
413{
414	size_t i;
415	for (i = 0; i < nsd->child_count; ++i) {
416		if (nsd->children[i].pid == pid) {
417			nsd->children[i].pid = 0;
418			if(!nsd->children[i].need_to_exit) {
419				if(nsd->children[i].child_fd != -1)
420					close(nsd->children[i].child_fd);
421				nsd->children[i].child_fd = -1;
422				if(nsd->children[i].handler)
423					nsd->children[i].handler->fd = -1;
424			}
425			return i;
426		}
427	}
428	return -1;
429}
430
431/*
432 * Restart child servers if necessary.
433 */
434static int
435restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
436	int* xfrd_sock_p)
437{
438	struct main_ipc_handler_data *ipc_data;
439	size_t i;
440	int sv[2];
441
442	/* Fork the child processes... */
443	for (i = 0; i < nsd->child_count; ++i) {
444		if (nsd->children[i].pid <= 0) {
445			if (nsd->children[i].child_fd != -1)
446				close(nsd->children[i].child_fd);
447			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
448				log_msg(LOG_ERR, "socketpair: %s",
449					strerror(errno));
450				return -1;
451			}
452			nsd->children[i].child_fd = sv[0];
453			nsd->children[i].parent_fd = sv[1];
454			nsd->children[i].pid = fork();
455			switch (nsd->children[i].pid) {
456			default: /* SERVER MAIN */
457				close(nsd->children[i].parent_fd);
458				nsd->children[i].parent_fd = -1;
459				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
460					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
461				}
462				if(!nsd->children[i].handler)
463				{
464					ipc_data = (struct main_ipc_handler_data*) region_alloc(
465						region, sizeof(struct main_ipc_handler_data));
466					ipc_data->nsd = nsd;
467					ipc_data->child = &nsd->children[i];
468					ipc_data->child_num = i;
469					ipc_data->xfrd_sock = xfrd_sock_p;
470					ipc_data->packet = buffer_create(region, QIOBUFSZ);
471					ipc_data->forward_mode = 0;
472					ipc_data->got_bytes = 0;
473					ipc_data->total_bytes = 0;
474					ipc_data->acl_num = 0;
475					nsd->children[i].handler = (struct netio_handler*) region_alloc(
476						region, sizeof(struct netio_handler));
477					nsd->children[i].handler->fd = nsd->children[i].child_fd;
478					nsd->children[i].handler->timeout = NULL;
479					nsd->children[i].handler->user_data = ipc_data;
480					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
481					nsd->children[i].handler->event_handler = parent_handle_child_command;
482					netio_add_handler(netio, nsd->children[i].handler);
483				}
484				/* clear any ongoing ipc */
485				ipc_data = (struct main_ipc_handler_data*)
486					nsd->children[i].handler->user_data;
487				ipc_data->forward_mode = 0;
488				/* restart - update fd */
489				nsd->children[i].handler->fd = nsd->children[i].child_fd;
490				break;
491			case 0: /* CHILD */
492#ifdef MEMCLEAN /* OS collects memory pages */
493				region_destroy(region);
494#endif
495				nsd->pid = 0;
496				nsd->child_count = 0;
497				nsd->server_kind = nsd->children[i].kind;
498				nsd->this_child = &nsd->children[i];
499				nsd->this_child->child_num = i;
500				/* remove signal flags inherited from parent
501				   the parent will handle them. */
502				nsd->signal_hint_reload_hup = 0;
503				nsd->signal_hint_reload = 0;
504				nsd->signal_hint_child = 0;
505				nsd->signal_hint_quit = 0;
506				nsd->signal_hint_shutdown = 0;
507				nsd->signal_hint_stats = 0;
508				nsd->signal_hint_statsusr = 0;
509				close(*xfrd_sock_p);
510				close(nsd->this_child->child_fd);
511				nsd->this_child->child_fd = -1;
512				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
513					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
514				}
515				server_child(nsd);
516				/* NOTREACH */
517				exit(0);
518			case -1:
519				log_msg(LOG_ERR, "fork failed: %s",
520					strerror(errno));
521				return -1;
522			}
523		}
524	}
525	return 0;
526}
527
528#ifdef BIND8_STATS
529static void set_bind8_alarm(struct nsd* nsd)
530{
531	/* resync so that the next alarm is on the next whole minute */
532	if(nsd->st_period > 0) /* % by 0 gives divbyzero error */
533		alarm(nsd->st_period - (time(NULL) % nsd->st_period));
534}
535#endif
536
537/* set zone stat ids for zones initially read in */
538static void
539zonestatid_tree_set(struct nsd* nsd)
540{
541	struct radnode* n;
542	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
543		zone_type* zone = (zone_type*)n->elem;
544		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
545	}
546}
547
548#ifdef USE_ZONE_STATS
549void
550server_zonestat_alloc(struct nsd* nsd)
551{
552	size_t num = (nsd->options->zonestatnames->count==0?1:
553			nsd->options->zonestatnames->count);
554	size_t sz = sizeof(struct nsdst)*num;
555	char tmpfile[256];
556	uint8_t z = 0;
557
558	/* file names */
559	nsd->zonestatfname[0] = 0;
560	nsd->zonestatfname[1] = 0;
561	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
562		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
563	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
564	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
565		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
566	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
567
568	/* file descriptors */
569	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
570	if(nsd->zonestatfd[0] == -1) {
571		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
572			strerror(errno));
573		exit(1);
574	}
575	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
576	if(nsd->zonestatfd[0] == -1) {
577		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
578			strerror(errno));
579		close(nsd->zonestatfd[0]);
580		unlink(nsd->zonestatfname[0]);
581		exit(1);
582	}
583
584#ifdef HAVE_MMAP
585	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
586		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
587			strerror(errno));
588		exit(1);
589	}
590	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
591		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
592			nsd->zonestatfname[0], strerror(errno));
593		exit(1);
594	}
595	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
596		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
597			strerror(errno));
598		exit(1);
599	}
600	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
601		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
602			nsd->zonestatfname[1], strerror(errno));
603		exit(1);
604	}
605	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
606		MAP_SHARED, nsd->zonestatfd[0], 0);
607	if(nsd->zonestat[0] == MAP_FAILED) {
608		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
609		unlink(nsd->zonestatfname[0]);
610		unlink(nsd->zonestatfname[1]);
611		exit(1);
612	}
613	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
614		MAP_SHARED, nsd->zonestatfd[1], 0);
615	if(nsd->zonestat[1] == MAP_FAILED) {
616		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
617		unlink(nsd->zonestatfname[0]);
618		unlink(nsd->zonestatfname[1]);
619		exit(1);
620	}
621	memset(nsd->zonestat[0], 0, sz);
622	memset(nsd->zonestat[1], 0, sz);
623	nsd->zonestatsize[0] = num;
624	nsd->zonestatsize[1] = num;
625	nsd->zonestatdesired = num;
626	nsd->zonestatsizenow = num;
627	nsd->zonestatnow = nsd->zonestat[0];
628#endif /* HAVE_MMAP */
629}
630
631void
632zonestat_remap(struct nsd* nsd, int idx, size_t sz)
633{
634#ifdef HAVE_MMAP
635#ifdef MREMAP_MAYMOVE
636	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
637		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
638		MREMAP_MAYMOVE);
639	if(nsd->zonestat[idx] == MAP_FAILED) {
640		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
641		exit(1);
642	}
643#else /* !HAVE MREMAP */
644	if(msync(nsd->zonestat[idx],
645		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
646		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
647	if(munmap(nsd->zonestat[idx],
648		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
649		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
650	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
651		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
652	if(nsd->zonestat[idx] == MAP_FAILED) {
653		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
654		exit(1);
655	}
656#endif /* MREMAP */
657#endif /* HAVE_MMAP */
658}
659
660/* realloc the zonestat array for the one that is not currently in use,
661 * to match the desired new size of the array (if applicable) */
662void
663server_zonestat_realloc(struct nsd* nsd)
664{
665#ifdef HAVE_MMAP
666	uint8_t z = 0;
667	size_t sz;
668	int idx = 0; /* index of the zonestat array that is not in use */
669	if(nsd->zonestatnow == nsd->zonestat[0])
670		idx = 1;
671	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
672		return;
673	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
674	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
675		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
676			strerror(errno));
677		exit(1);
678	}
679	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
680		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
681			nsd->zonestatfname[idx], strerror(errno));
682		exit(1);
683	}
684	zonestat_remap(nsd, idx, sz);
685	/* zero the newly allocated region */
686	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
687		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
688			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
689			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
690	}
691	nsd->zonestatsize[idx] = nsd->zonestatdesired;
692#endif /* HAVE_MMAP */
693}
694
695/* switchover to use the other array for the new children, that
696 * briefly coexist with the old children.  And we want to avoid them
697 * both writing to the same statistics arrays. */
698void
699server_zonestat_switch(struct nsd* nsd)
700{
701	if(nsd->zonestatnow == nsd->zonestat[0]) {
702		nsd->zonestatnow = nsd->zonestat[1];
703		nsd->zonestatsizenow = nsd->zonestatsize[1];
704	} else {
705		nsd->zonestatnow = nsd->zonestat[0];
706		nsd->zonestatsizenow = nsd->zonestatsize[0];
707	}
708}
709#endif /* USE_ZONE_STATS */
710
711#ifdef BIND8_STATS
712void
713server_stat_alloc(struct nsd* nsd)
714{
715	char tmpfile[256];
716	size_t sz = sizeof(struct nsdst) * nsd->child_count * 2;
717	uint8_t z = 0;
718
719	/* file name */
720	nsd->statfname = 0;
721	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat",
722		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
723	nsd->statfname = region_strdup(nsd->region, tmpfile);
724
725	/* file descriptor */
726	nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600);
727	if(nsd->statfd == -1) {
728		log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname,
729			strerror(errno));
730		unlink(nsd->zonestatfname[0]);
731		unlink(nsd->zonestatfname[1]);
732		exit(1);
733	}
734
735#ifdef HAVE_MMAP
736	if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) {
737		log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname,
738			strerror(errno));
739		goto fail_exit;
740	}
741	if(write(nsd->statfd, &z, 1) == -1) {
742		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
743			nsd->statfname, strerror(errno));
744		goto fail_exit;
745	}
746	nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
747		MAP_SHARED, nsd->statfd, 0);
748	if(nsd->stat_map == MAP_FAILED) {
749		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
750fail_exit:
751		close(nsd->statfd);
752		unlink(nsd->statfname);
753		unlink(nsd->zonestatfname[0]);
754		unlink(nsd->zonestatfname[1]);
755		exit(1);
756	}
757	memset(nsd->stat_map, 0, sz);
758	nsd->stats_per_child[0] = nsd->stat_map;
759	nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count];
760	nsd->stat_current = 0;
761	nsd->st = &nsd->stats_per_child[nsd->stat_current][0];
762#endif /* HAVE_MMAP */
763}
764#endif /* BIND8_STATS */
765
766#ifdef BIND8_STATS
767void
768server_stat_free(struct nsd* nsd)
769{
770	unlink(nsd->statfname);
771}
772#endif /* BIND8_STATS */
773
774static void
775cleanup_dname_compression_tables(void *ptr)
776{
777	free(ptr);
778	compressed_dname_offsets = NULL;
779	compression_table_capacity = 0;
780}
781
782static void
783initialize_dname_compression_tables(struct nsd *nsd)
784{
785	size_t needed = domain_table_count(nsd->db->domains) + 1;
786	needed += EXTRA_DOMAIN_NUMBERS;
787	if(compression_table_capacity < needed) {
788		if(compressed_dname_offsets) {
789			region_remove_cleanup(nsd->db->region,
790				cleanup_dname_compression_tables,
791				compressed_dname_offsets);
792			free(compressed_dname_offsets);
793		}
794		compressed_dname_offsets = (uint16_t *) xmallocarray(
795			needed, sizeof(uint16_t));
796		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
797			compressed_dname_offsets);
798		compression_table_capacity = needed;
799		compression_table_size=domain_table_count(nsd->db->domains)+1;
800	}
801	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
802	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
803}
804
805static int
806set_cloexec(struct nsd_socket *sock)
807{
808	assert(sock != NULL);
809
810	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
811		const char *socktype =
812			sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp";
813		log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s",
814			socktype, strerror(errno));
815		return -1;
816	}
817
818	return 1;
819}
820
821static int
822set_reuseport(struct nsd_socket *sock)
823{
824#ifdef SO_REUSEPORT
825	int on = 1;
826#ifdef SO_REUSEPORT_LB
827	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
828	 * SO_REUSEPORT on Linux. This is what the users want with the config
829	 * option in nsd.conf; if we actually need local address and port reuse
830	 * they'll also need to have SO_REUSEPORT set for them, assume it was
831	 * _LB they want.
832	 */
833	int opt = SO_REUSEPORT_LB;
834	static const char optname[] = "SO_REUSEPORT_LB";
835#else /* !SO_REUSEPORT_LB */
836	int opt = SO_REUSEPORT;
837	static const char optname[] = "SO_REUSEPORT";
838#endif /* SO_REUSEPORT_LB */
839
840	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
841		return 1;
842	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
843		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
844			optname, strerror(errno));
845	}
846	return -1;
847#else
848	(void)sock;
849#endif /* SO_REUSEPORT */
850
851	return 0;
852}
853
854static int
855set_reuseaddr(struct nsd_socket *sock)
856{
857#ifdef SO_REUSEADDR
858	int on = 1;
859	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
860		return 1;
861	}
862	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
863		strerror(errno));
864	return -1;
865#endif /* SO_REUSEADDR */
866	return 0;
867}
868
869static int
870set_rcvbuf(struct nsd_socket *sock, int rcv)
871{
872#ifdef SO_RCVBUF
873#ifdef SO_RCVBUFFORCE
874	if(0 == setsockopt(
875		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
876	{
877		return 1;
878	}
879	if(errno == EPERM || errno == ENOBUFS) {
880		return 0;
881	}
882	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
883		strerror(errno));
884	return -1;
885#else /* !SO_RCVBUFFORCE */
886	if (0 == setsockopt(
887		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
888	{
889		return 1;
890	}
891	if(errno == ENOSYS || errno == ENOBUFS) {
892		return 0;
893	}
894	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
895		strerror(errno));
896	return -1;
897#endif /* SO_RCVBUFFORCE */
898#endif /* SO_RCVBUF */
899
900	return 0;
901}
902
903static int
904set_sndbuf(struct nsd_socket *sock, int snd)
905{
906#ifdef SO_SNDBUF
907#ifdef SO_SNDBUFFORCE
908	if(0 == setsockopt(
909		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
910	{
911		return 1;
912	}
913	if(errno == EPERM || errno == ENOBUFS) {
914		return 0;
915	}
916	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
917		strerror(errno));
918	return -1;
919#else /* !SO_SNDBUFFORCE */
920	if(0 == setsockopt(
921		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
922	{
923		return 1;
924	}
925	if(errno == ENOSYS || errno == ENOBUFS) {
926		return 0;
927	}
928	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
929		strerror(errno));
930	return -1;
931#endif /* SO_SNDBUFFORCE */
932#endif /* SO_SNDBUF */
933
934	return 0;
935}
936
937static int
938set_nonblock(struct nsd_socket *sock)
939{
940	const char *socktype =
941		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
942
943	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
944		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
945			socktype, strerror(errno));
946		return -1;
947	}
948
949	return 1;
950}
951
952#ifdef INET6
953static int
954set_ipv6_v6only(struct nsd_socket *sock)
955{
956#ifdef IPV6_V6ONLY
957	int on = 1;
958	const char *socktype =
959		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
960
961	if(0 == setsockopt(
962		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
963	{
964		return 1;
965	}
966
967	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
968		socktype, strerror(errno));
969	return -1;
970#else
971	(void)sock;
972#endif /* IPV6_V6ONLY */
973
974	return 0;
975}
976#endif /* INET6 */
977
978#ifdef INET6
979static int
980set_ipv6_use_min_mtu(struct nsd_socket *sock)
981{
982#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
983#if defined(IPV6_USE_MIN_MTU)
984	/* There is no fragmentation of IPv6 datagrams during forwarding in the
985	 * network. Therefore we do not send UDP datagrams larger than the
986	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
987	 * larger if the network stack supports IPV6_USE_MIN_MTU.
988	 */
989	int opt = IPV6_USE_MIN_MTU;
990	int optval = 1;
991	static const char optname[] = "IPV6_USE_MIN_MTU";
992#elif defined(IPV6_MTU)
993	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
994	 * to the MIN MTU to get the same.
995	 */
996	int opt = IPV6_MTU;
997	int optval = IPV6_MIN_MTU;
998	static const char optname[] = "IPV6_MTU";
999#endif
1000	if(0 == setsockopt(
1001		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
1002	{
1003		return 1;
1004	}
1005
1006	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
1007		optname, strerror(errno));
1008	return -1;
1009#else
1010	(void)sock;
1011#endif /* INET6 */
1012
1013	return 0;
1014}
1015#endif /* INET6 */
1016
1017static int
1018set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
1019{
1020	int ret = 0;
1021
1022#if defined(IP_MTU_DISCOVER)
1023	int opt = IP_MTU_DISCOVER;
1024	int optval;
1025# if defined(IP_PMTUDISC_OMIT)
1026	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
1027	 * information and send packets with DF=0. Fragmentation is allowed if
1028	 * and only if the packet size exceeds the outgoing interface MTU or
1029	 * the packet encounters smaller MTU link in network. This mitigates
1030	 * DNS fragmentation attacks by preventing forged PMTU information.
1031	 * FreeBSD already has same semantics without setting the option.
1032	 */
1033	optval = IP_PMTUDISC_OMIT;
1034	if(0 == setsockopt(
1035		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
1036	{
1037		return 1;
1038	}
1039
1040	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1041		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
1042# endif /* IP_PMTUDISC_OMIT */
1043# if defined(IP_PMTUDISC_DONT)
1044	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
1045	optval = IP_PMTUDISC_DONT;
1046	if(0 == setsockopt(
1047		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
1048	{
1049		return 1;
1050	}
1051
1052	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1053		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
1054# endif
1055	ret = -1;
1056#elif defined(IP_DONTFRAG)
1057	int off = 0;
1058	if (0 == setsockopt(
1059		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
1060	{
1061		return 1;
1062	}
1063
1064	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
1065		strerror(errno));
1066	ret = -1;
1067#else
1068	(void)sock;
1069#endif
1070
1071	return ret;
1072}
1073
1074static int
1075set_ip_freebind(struct nsd_socket *sock)
1076{
1077#ifdef IP_FREEBIND
1078	int on = 1;
1079	const char *socktype =
1080		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1081	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
1082	{
1083		return 1;
1084	}
1085	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
1086		socktype, strerror(errno));
1087	return -1;
1088#else
1089	(void)sock;
1090#endif /* IP_FREEBIND */
1091
1092	return 0;
1093}
1094
1095static int
1096set_ip_transparent(struct nsd_socket *sock)
1097{
1098	/*
1099	The scandalous preprocessor blob here calls for some explanation :)
1100	POSIX does not specify an option to bind non-local IPs, so
1101	platforms developed several implementation-specific options,
1102	all set in the same way, but with different names.
1103	For additional complexity, some platform manage this setting
1104	differently for different address families (IPv4 vs IPv6).
1105	This scandalous preprocessor blob below abstracts such variability
1106	in the way which leaves the C code as lean and clear as possible.
1107	*/
1108
1109#if defined(IP_TRANSPARENT)
1110#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
1111#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1112#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
1113// as of 2020-01, Linux does not support this on IPv6 programmatically
1114#elif defined(SO_BINDANY)
1115#	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
1116#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
1117#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
1118#elif defined(IP_BINDANY)
1119#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
1120#	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
1121#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1122#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
1123#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
1124#endif
1125
1126#ifndef NSD_SOCKET_OPTION_TRANSPARENT
1127	(void)sock;
1128#else
1129#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
1130#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
1131#	endif
1132#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
1133#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
1134#	endif
1135#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
1136#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
1137#	endif
1138
1139	int on = 1;
1140	const char *socktype =
1141		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1142	const int is_ip6 = (sock->addr.ai_family == AF_INET6);
1143
1144	if(0 == setsockopt(
1145		sock->s,
1146		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
1147		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
1148		&on, sizeof(on)))
1149	{
1150		return 1;
1151	}
1152
1153	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
1154		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
1155	return -1;
1156#endif
1157
1158	return 0;
1159}
1160
1161static int
1162set_tcp_maxseg(struct nsd_socket *sock, int mss)
1163{
1164#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1165	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1166		return 1;
1167	}
1168	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1169		strerror(errno));
1170	return -1;
1171#else
1172	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1173#endif
1174	return 0;
1175}
1176
1177#ifdef USE_TCP_FASTOPEN
1178static int
1179set_tcp_fastopen(struct nsd_socket *sock)
1180{
1181	/* qlen specifies how many outstanding TFO requests to allow. Limit is
1182	 * a defense against IP spoofing attacks as suggested in RFC7413.
1183	 */
1184	int qlen;
1185
1186#ifdef __APPLE__
1187	/* macOS X implementation only supports qlen of 1 via this call. The
1188	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1189	 * kernel parameter.
1190	 */
1191	qlen = 1;
1192#else
1193	/* 5 is recommended on Linux. */
1194	qlen = 5;
1195#endif
1196	if (0 == setsockopt(
1197		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1198	{
1199		return 1;
1200	}
1201
1202	if (errno == EPERM) {
1203		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1204				 "; this could likely be because sysctl "
1205				 "net.inet.tcp.fastopen.enabled, "
1206				 "net.inet.tcp.fastopen.server_enable, or "
1207				 "net.ipv4.tcp_fastopen is disabled",
1208			strerror(errno));
1209	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1210	 * disabled, except when verbosity enabled for debugging
1211	 */
1212	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1213		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1214			strerror(errno));
1215	}
1216
1217	return (errno == ENOPROTOOPT ? 0 : -1);
1218}
1219#endif /* USE_TCP_FASTOPEN */
1220
1221static int
1222set_bindtodevice(struct nsd_socket *sock)
1223{
1224#if defined(SO_BINDTODEVICE)
1225	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
1226		sock->device, strlen(sock->device)) == -1)
1227	{
1228		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1229		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
1230		return -1;
1231	}
1232
1233	return 1;
1234#else
1235	(void)sock;
1236	return 0;
1237#endif
1238}
1239
1240static int
1241set_setfib(struct nsd_socket *sock)
1242{
1243#if defined(SO_SETFIB)
1244	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
1245	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
1246	{
1247		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
1248		                 "SO_SETFIB", sock->fib, strerror(errno));
1249		return -1;
1250	}
1251
1252	return 1;
1253#else
1254	(void)sock;
1255	return 0;
1256#endif
1257}
1258
1259static int
1260open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1261{
1262	int rcv = 1*1024*1024, snd = 1*1024*1024;
1263
1264	if(-1 == (sock->s = socket(
1265		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1266	{
1267#ifdef INET6
1268		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1269		   (sock->addr.ai_family == AF_INET6) &&
1270		   (errno == EAFNOSUPPORT))
1271		{
1272			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1273				"not supported");
1274			return 0;
1275		}
1276#endif
1277		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1278		return -1;
1279	}
1280
1281	set_cloexec(sock);
1282
1283	if(nsd->reuseport && reuseport_works && *reuseport_works)
1284		*reuseport_works = (set_reuseport(sock) == 1);
1285
1286	if(nsd->options->receive_buffer_size > 0)
1287		rcv = nsd->options->receive_buffer_size;
1288	if(set_rcvbuf(sock, rcv) == -1)
1289		return -1;
1290
1291	if(nsd->options->send_buffer_size > 0)
1292		snd = nsd->options->send_buffer_size;
1293	if(set_sndbuf(sock, snd) == -1)
1294		return -1;
1295#ifdef INET6
1296	if(sock->addr.ai_family == AF_INET6) {
1297		if(set_ipv6_v6only(sock) == -1 ||
1298		   set_ipv6_use_min_mtu(sock) == -1)
1299			return -1;
1300	} else
1301#endif /* INET6 */
1302	if(sock->addr.ai_family == AF_INET) {
1303		if(set_ipv4_no_pmtu_disc(sock) == -1)
1304			return -1;
1305	}
1306
1307	/* Set socket to non-blocking. Otherwise, on operating systems
1308	 * with thundering herd problems, the UDP recv could block
1309	 * after select returns readable.
1310	 */
1311	set_nonblock(sock);
1312
1313	if(nsd->options->ip_freebind)
1314		(void)set_ip_freebind(sock);
1315	if(nsd->options->ip_transparent)
1316		(void)set_ip_transparent(sock);
1317	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1318		return -1;
1319	if(sock->fib != -1 && set_setfib(sock) == -1)
1320		return -1;
1321
1322	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1323		char buf[256];
1324		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1325		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1326			buf, strerror(errno));
1327		return -1;
1328	}
1329
1330	return 1;
1331}
1332
1333static int
1334open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1335{
1336#ifdef USE_TCP_FASTOPEN
1337	report_tcp_fastopen_config();
1338#endif
1339
1340	(void)reuseport_works;
1341
1342	if(-1 == (sock->s = socket(
1343		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1344	{
1345#ifdef INET6
1346		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1347		   (sock->addr.ai_family == AF_INET6) &&
1348		   (errno == EAFNOSUPPORT))
1349		{
1350			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1351			                     "not supported");
1352			return 0;
1353		}
1354#endif /* INET6 */
1355		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1356		return -1;
1357	}
1358
1359	set_cloexec(sock);
1360
1361	if(nsd->reuseport && reuseport_works && *reuseport_works)
1362		*reuseport_works = (set_reuseport(sock) == 1);
1363
1364	(void)set_reuseaddr(sock);
1365
1366#ifdef INET6
1367	if(sock->addr.ai_family == AF_INET6) {
1368		if (set_ipv6_v6only(sock) == -1 ||
1369		    set_ipv6_use_min_mtu(sock) == -1)
1370			return -1;
1371	}
1372#endif
1373
1374	if(nsd->tcp_mss > 0)
1375		set_tcp_maxseg(sock, nsd->tcp_mss);
1376	/* (StevensUNP p463), if TCP listening socket is blocking, then
1377	   it may block in accept, even if select() says readable. */
1378	(void)set_nonblock(sock);
1379	if(nsd->options->ip_freebind)
1380		(void)set_ip_freebind(sock);
1381	if(nsd->options->ip_transparent)
1382		(void)set_ip_transparent(sock);
1383	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1384		return -1;
1385	if(sock->fib != -1 && set_setfib(sock) == -1)
1386		return -1;
1387
1388	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1389		char buf[256];
1390		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1391		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1392			buf, strerror(errno));
1393		return -1;
1394	}
1395
1396#ifdef USE_TCP_FASTOPEN
1397	(void)set_tcp_fastopen(sock);
1398#endif
1399
1400	if(listen(sock->s, TCP_BACKLOG) == -1) {
1401		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1402		return -1;
1403	}
1404
1405	return 1;
1406}
1407
1408/*
1409 * Initialize the server, reuseport, create and bind the sockets.
1410 */
1411int
1412server_init(struct nsd *nsd)
1413{
1414	size_t i;
1415	int reuseport = 1; /* Determine if REUSEPORT works. */
1416
1417	/* open server interface ports */
1418	for(i = 0; i < nsd->ifs; i++) {
1419		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1420		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1421		{
1422			return -1;
1423		}
1424	}
1425
1426	if(nsd->reuseport && reuseport) {
1427		size_t ifs = nsd->ifs * nsd->reuseport;
1428
1429		/* increase the size of the interface arrays, there are going
1430		 * to be separate interface file descriptors for every server
1431		 * instance */
1432		region_remove_cleanup(nsd->region, free, nsd->udp);
1433		region_remove_cleanup(nsd->region, free, nsd->tcp);
1434
1435		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1436		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1437		region_add_cleanup(nsd->region, free, nsd->udp);
1438		region_add_cleanup(nsd->region, free, nsd->tcp);
1439		if(ifs > nsd->ifs) {
1440			memset(&nsd->udp[nsd->ifs], 0,
1441				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1442			memset(&nsd->tcp[nsd->ifs], 0,
1443				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1444		}
1445
1446		for(i = nsd->ifs; i < ifs; i++) {
1447			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1448			nsd->udp[i].s = -1;
1449			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1450				return -1;
1451			}
1452			/* Turn off REUSEPORT for TCP by copying the socket
1453			 * file descriptor.
1454			 * This means we should not close TCP used by
1455			 * other servers in reuseport enabled mode, in
1456			 * server_child().
1457			 */
1458			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1459		}
1460
1461		nsd->ifs = ifs;
1462	} else {
1463		nsd->reuseport = 0;
1464	}
1465
1466	/* open server interface ports for verifiers */
1467	for(i = 0; i < nsd->verify_ifs; i++) {
1468		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
1469		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
1470		{
1471			return -1;
1472		}
1473	}
1474
1475	return 0;
1476}
1477
1478/*
1479 * Prepare the server for take off.
1480 *
1481 */
1482int
1483server_prepare(struct nsd *nsd)
1484{
1485#ifdef RATELIMIT
1486	/* set secret modifier for hashing (rate limits) */
1487#ifdef HAVE_GETRANDOM
1488	uint32_t v;
1489	if(getrandom(&v, sizeof(v), 0) == -1) {
1490		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1491		exit(1);
1492	}
1493	hash_set_raninit(v);
1494#elif defined(HAVE_ARC4RANDOM)
1495	hash_set_raninit(arc4random());
1496#else
1497	uint32_t v = getpid() ^ time(NULL);
1498	srandom((unsigned long)v);
1499#  ifdef HAVE_SSL
1500	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1501		hash_set_raninit(v);
1502	else
1503#  endif
1504		hash_set_raninit(random());
1505#endif
1506	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1507		nsd->options->rrl_ratelimit,
1508		nsd->options->rrl_whitelist_ratelimit,
1509		nsd->options->rrl_slip,
1510		nsd->options->rrl_ipv4_prefix_length,
1511		nsd->options->rrl_ipv6_prefix_length);
1512#endif /* RATELIMIT */
1513
1514	/* Open the database... */
1515	if ((nsd->db = namedb_open(nsd->options)) == NULL) {
1516		log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno));
1517		unlink(nsd->task[0]->fname);
1518		unlink(nsd->task[1]->fname);
1519#ifdef USE_ZONE_STATS
1520		unlink(nsd->zonestatfname[0]);
1521		unlink(nsd->zonestatfname[1]);
1522#endif
1523#ifdef BIND8_STATS
1524		server_stat_free(nsd);
1525#endif
1526		xfrd_del_tempdir(nsd);
1527		return -1;
1528	}
1529	/* check if zone files can be read */
1530	/* NULL for taskudb because we send soainfo in a moment, batched up,
1531	 * for all zones */
1532	namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1533	zonestatid_tree_set(nsd);
1534
1535	compression_table_capacity = 0;
1536	initialize_dname_compression_tables(nsd);
1537
1538#ifdef	BIND8_STATS
1539	/* Initialize times... */
1540	time(&nsd->st->boot);
1541	set_bind8_alarm(nsd);
1542#endif /* BIND8_STATS */
1543
1544	return 0;
1545}
1546
1547/*
1548 * Fork the required number of servers.
1549 */
1550static int
1551server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1552	int* xfrd_sock_p)
1553{
1554	size_t i;
1555
1556	/* Start all child servers initially.  */
1557	for (i = 0; i < nsd->child_count; ++i) {
1558		nsd->children[i].pid = 0;
1559	}
1560
1561	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1562}
1563
1564static void
1565server_close_socket(struct nsd_socket *sock)
1566{
1567	if(sock->s != -1) {
1568		close(sock->s);
1569		sock->s = -1;
1570	}
1571}
1572
1573void
1574server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1575{
1576	size_t i;
1577
1578	/* Close all the sockets... */
1579	for (i = 0; i < n; ++i) {
1580		server_close_socket(&sockets[i]);
1581	}
1582}
1583
1584/*
1585 * Close the sockets, shutdown the server and exit.
1586 * Does not return.
1587 */
1588void
1589server_shutdown(struct nsd *nsd)
1590{
1591	size_t i;
1592
1593	server_close_all_sockets(nsd->udp, nsd->ifs);
1594	server_close_all_sockets(nsd->tcp, nsd->ifs);
1595	/* CHILD: close command channel to parent */
1596	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1597	{
1598		close(nsd->this_child->parent_fd);
1599		nsd->this_child->parent_fd = -1;
1600	}
1601	/* SERVER: close command channels to children */
1602	if(!nsd->this_child)
1603	{
1604		for(i=0; i < nsd->child_count; ++i)
1605			if(nsd->children[i].child_fd != -1)
1606			{
1607				close(nsd->children[i].child_fd);
1608				nsd->children[i].child_fd = -1;
1609			}
1610	}
1611
1612	tsig_finalize();
1613	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1614#ifdef HAVE_SSL
1615	if (nsd->tls_ctx)
1616		SSL_CTX_free(nsd->tls_ctx);
1617#endif
1618
1619#ifdef MEMCLEAN /* OS collects memory pages */
1620#ifdef RATELIMIT
1621	rrl_mmap_deinit_keep_mmap();
1622#endif
1623#ifdef USE_DNSTAP
1624	dt_collector_destroy(nsd->dt_collector, nsd);
1625#endif
1626	udb_base_free_keep_mmap(nsd->task[0]);
1627	udb_base_free_keep_mmap(nsd->task[1]);
1628	namedb_free_ixfr(nsd->db);
1629	namedb_close(nsd->db);
1630	nsd_options_destroy(nsd->options);
1631	region_destroy(nsd->region);
1632#endif
1633	log_finalize();
1634	exit(0);
1635}
1636
1637void
1638server_prepare_xfrd(struct nsd* nsd)
1639{
1640	char tmpfile[256];
1641	/* create task mmaps */
1642	nsd->mytask = 0;
1643	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1644		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1645	nsd->task[0] = task_file_create(tmpfile);
1646	if(!nsd->task[0]) {
1647#ifdef USE_ZONE_STATS
1648		unlink(nsd->zonestatfname[0]);
1649		unlink(nsd->zonestatfname[1]);
1650#endif
1651#ifdef BIND8_STATS
1652		server_stat_free(nsd);
1653#endif
1654		xfrd_del_tempdir(nsd);
1655		exit(1);
1656	}
1657	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1658		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1659	nsd->task[1] = task_file_create(tmpfile);
1660	if(!nsd->task[1]) {
1661		unlink(nsd->task[0]->fname);
1662#ifdef USE_ZONE_STATS
1663		unlink(nsd->zonestatfname[0]);
1664		unlink(nsd->zonestatfname[1]);
1665#endif
1666#ifdef BIND8_STATS
1667		server_stat_free(nsd);
1668#endif
1669		xfrd_del_tempdir(nsd);
1670		exit(1);
1671	}
1672	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1673	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1674	/* create xfrd listener structure */
1675	nsd->xfrd_listener = region_alloc(nsd->region,
1676		sizeof(netio_handler_type));
1677	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1678		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1679	nsd->xfrd_listener->fd = -1;
1680	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1681		nsd;
1682	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1683		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1684}
1685
1686
1687void
1688server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1689{
1690	pid_t pid;
1691	int sockets[2] = {0,0};
1692	struct ipc_handler_conn_data *data;
1693
1694	if(nsd->xfrd_listener->fd != -1)
1695		close(nsd->xfrd_listener->fd);
1696	if(del_db) {
1697		/* recreate taskdb that xfrd was using, it may be corrupt */
1698		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1699		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1700		nsd->task[1-nsd->mytask]->fname = NULL;
1701		/* free alloc already, so udb does not shrink itself */
1702		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1703		nsd->task[1-nsd->mytask]->alloc = NULL;
1704		udb_base_free(nsd->task[1-nsd->mytask]);
1705		/* create new file, overwrite the old one */
1706		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1707		free(tmpfile);
1708	}
1709	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1710		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1711		return;
1712	}
1713	pid = fork();
1714	switch (pid) {
1715	case -1:
1716		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1717		break;
1718	default:
1719		/* PARENT: close first socket, use second one */
1720		close(sockets[0]);
1721		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1722			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1723		}
1724		if(del_db) xfrd_free_namedb(nsd);
1725		/* use other task than I am using, since if xfrd died and is
1726		 * restarted, the reload is using nsd->mytask */
1727		nsd->mytask = 1 - nsd->mytask;
1728
1729#ifdef HAVE_SETPROCTITLE
1730		setproctitle("xfrd");
1731#endif
1732#ifdef HAVE_CPUSET_T
1733		if(nsd->use_cpu_affinity) {
1734			set_cpu_affinity(nsd->xfrd_cpuset);
1735		}
1736#endif
1737
1738		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1739		/* ENOTREACH */
1740		break;
1741	case 0:
1742		/* CHILD: close second socket, use first one */
1743		close(sockets[1]);
1744		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1745			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1746		}
1747		nsd->xfrd_listener->fd = sockets[0];
1748		break;
1749	}
1750	/* server-parent only */
1751	nsd->xfrd_listener->timeout = NULL;
1752	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1753	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1754	/* clear ongoing ipc reads */
1755	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1756	data->conn->is_reading = 0;
1757}
1758
1759/** add all soainfo to taskdb */
1760static void
1761add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1762{
1763	struct radnode* n;
1764	udb_ptr task_last; /* last task, mytask is empty so NULL */
1765	/* add all SOA INFO to mytask */
1766	udb_ptr_init(&task_last, taskudb);
1767	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1768		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1769	}
1770	udb_ptr_unlink(&task_last, taskudb);
1771}
1772
1773void
1774server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1775{
1776	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1777	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1778	 *   then they exchange and process.
1779	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1780	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1781	 *   expire notifications can be sent back via a normal reload later
1782	 *   (xfrd will wait for current running reload to finish if any).
1783	 */
1784	sig_atomic_t cmd = 0;
1785	pid_t mypid;
1786	int xfrd_sock = nsd->xfrd_listener->fd;
1787	struct udb_base* taskudb = nsd->task[nsd->mytask];
1788	udb_ptr t;
1789	if(!shortsoa) {
1790		if(nsd->signal_hint_shutdown) {
1791		shutdown:
1792			log_msg(LOG_WARNING, "signal received, shutting down...");
1793			server_close_all_sockets(nsd->udp, nsd->ifs);
1794			server_close_all_sockets(nsd->tcp, nsd->ifs);
1795			daemon_remote_close(nsd->rc);
1796			/* Unlink it if possible... */
1797			unlinkpid(nsd->pidfile);
1798			unlink(nsd->task[0]->fname);
1799			unlink(nsd->task[1]->fname);
1800#ifdef USE_ZONE_STATS
1801			unlink(nsd->zonestatfname[0]);
1802			unlink(nsd->zonestatfname[1]);
1803#endif
1804#ifdef BIND8_STATS
1805			server_stat_free(nsd);
1806#endif
1807			server_shutdown(nsd);
1808			/* ENOTREACH */
1809			exit(0);
1810		}
1811	}
1812	if(shortsoa) {
1813		/* put SOA in xfrd task because mytask may be in use */
1814		taskudb = nsd->task[1-nsd->mytask];
1815	}
1816
1817	add_all_soa_to_task(nsd, taskudb);
1818	if(!shortsoa) {
1819		/* wait for xfrd to signal task is ready, RELOAD signal */
1820		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1821			cmd != NSD_RELOAD) {
1822			log_msg(LOG_ERR, "did not get start signal from xfrd");
1823			exit(1);
1824		}
1825		if(nsd->signal_hint_shutdown) {
1826			goto shutdown;
1827		}
1828	}
1829	/* give xfrd our task, signal it with RELOAD_DONE */
1830	task_process_sync(taskudb);
1831	cmd = NSD_RELOAD_DONE;
1832	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1833		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1834			(int)nsd->pid, strerror(errno));
1835	}
1836	mypid = getpid();
1837	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1838		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1839			strerror(errno));
1840	}
1841
1842	if(!shortsoa) {
1843		/* process the xfrd task works (expiry data) */
1844		nsd->mytask = 1 - nsd->mytask;
1845		taskudb = nsd->task[nsd->mytask];
1846		task_remap(taskudb);
1847		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1848		while(!udb_ptr_is_null(&t)) {
1849			task_process_expire(nsd->db, TASKLIST(&t));
1850			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1851		}
1852		udb_ptr_unlink(&t, taskudb);
1853		task_clear(taskudb);
1854
1855		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1856		cmd = NSD_RELOAD_DONE;
1857		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1858			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1859				(int)nsd->pid, strerror(errno));
1860		}
1861	}
1862}
1863
1864#ifdef HAVE_SSL
1865static void
1866log_crypto_from_err(const char* str, unsigned long err)
1867{
1868	/* error:[error code]:[library name]:[function name]:[reason string] */
1869	char buf[128];
1870	unsigned long e;
1871	ERR_error_string_n(err, buf, sizeof(buf));
1872	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1873	while( (e=ERR_get_error()) ) {
1874		ERR_error_string_n(e, buf, sizeof(buf));
1875		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1876	}
1877}
1878
1879void
1880log_crypto_err(const char* str)
1881{
1882	log_crypto_from_err(str, ERR_get_error());
1883}
1884
1885/** true if the ssl handshake error has to be squelched from the logs */
1886static int
1887squelch_err_ssl_handshake(unsigned long err)
1888{
1889	if(verbosity >= 3)
1890		return 0; /* only squelch on low verbosity */
1891	/* this is very specific, we could filter on ERR_GET_REASON()
1892	 * (the third element in ERR_PACK) */
1893	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1894		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1895		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1896		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1897#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1898		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1899#endif
1900#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1901		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1902		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1903#  ifdef SSL_R_VERSION_TOO_LOW
1904		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1905#  endif
1906#endif
1907		)
1908		return 1;
1909	return 0;
1910}
1911
1912void
1913perform_openssl_init(void)
1914{
1915	/* init SSL library */
1916#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1917	ERR_load_crypto_strings();
1918#endif
1919#if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1920	ERR_load_SSL_strings();
1921#endif
1922#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1923	OpenSSL_add_all_algorithms();
1924#else
1925	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1926		| OPENSSL_INIT_ADD_ALL_DIGESTS
1927		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1928#endif
1929#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1930	(void)SSL_library_init();
1931#else
1932	OPENSSL_init_ssl(0, NULL);
1933#endif
1934
1935	if(!RAND_status()) {
1936		/* try to seed it */
1937		unsigned char buf[256];
1938		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1939		size_t i;
1940		v = seed;
1941		for(i=0; i<256/sizeof(v); i++) {
1942			memmove(buf+i*sizeof(v), &v, sizeof(v));
1943			v = v*seed + (unsigned int)i;
1944		}
1945		RAND_seed(buf, 256);
1946		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1947	}
1948}
1949
1950static int
1951get_ocsp(char *filename, unsigned char **ocsp)
1952{
1953	BIO *bio;
1954	OCSP_RESPONSE *response;
1955	int len = -1;
1956	unsigned char *p, *buf;
1957	assert(filename);
1958
1959	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1960		log_crypto_err("get_ocsp: BIO_new_file failed");
1961		return -1;
1962	}
1963
1964	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1965		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1966		BIO_free(bio);
1967		return -1;
1968	}
1969
1970	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1971		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1972		OCSP_RESPONSE_free(response);
1973		BIO_free(bio);
1974		return -1;
1975	}
1976
1977	if ((buf = malloc((size_t) len)) == NULL) {
1978		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1979		OCSP_RESPONSE_free(response);
1980		BIO_free(bio);
1981		return -1;
1982	}
1983
1984	p = buf;
1985	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1986		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1987		free(buf);
1988		OCSP_RESPONSE_free(response);
1989		BIO_free(bio);
1990		return -1;
1991	}
1992
1993	OCSP_RESPONSE_free(response);
1994	BIO_free(bio);
1995
1996	*ocsp = buf;
1997	return len;
1998}
1999
2000/* further setup ssl ctx after the keys are loaded */
2001static void
2002listen_sslctx_setup_2(void* ctxt)
2003{
2004	SSL_CTX* ctx = (SSL_CTX*)ctxt;
2005	(void)ctx;
2006#if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
2007	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
2008		/* ENOTREACH */
2009		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
2010	}
2011#elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
2012	if(1) {
2013		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
2014		if (!ecdh) {
2015			log_crypto_err("could not find p256, not enabling ECDHE");
2016		} else {
2017			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
2018				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
2019			}
2020			EC_KEY_free (ecdh);
2021		}
2022	}
2023#endif
2024}
2025
2026static int
2027add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
2028{
2029	if(ocspdata) {
2030		unsigned char *p;
2031		if ((p=malloc(ocspdata_len)) == NULL) {
2032			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
2033			return SSL_TLSEXT_ERR_NOACK;
2034		}
2035		memcpy(p, ocspdata, ocspdata_len);
2036		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
2037			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
2038			free(p);
2039			return SSL_TLSEXT_ERR_NOACK;
2040		}
2041		return SSL_TLSEXT_ERR_OK;
2042	} else {
2043		return SSL_TLSEXT_ERR_NOACK;
2044	}
2045}
2046
2047SSL_CTX*
2048server_tls_ctx_setup(char* key, char* pem, char* verifypem)
2049{
2050	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
2051	if(!ctx) {
2052		log_crypto_err("could not SSL_CTX_new");
2053		return NULL;
2054	}
2055	/* no SSLv2, SSLv3 because has defects */
2056#if SSL_OP_NO_SSLv2 != 0
2057	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
2058		log_crypto_err("could not set SSL_OP_NO_SSLv2");
2059		SSL_CTX_free(ctx);
2060		return NULL;
2061	}
2062#endif
2063	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
2064		!= SSL_OP_NO_SSLv3){
2065		log_crypto_err("could not set SSL_OP_NO_SSLv3");
2066		SSL_CTX_free(ctx);
2067		return 0;
2068	}
2069#if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
2070	/* if we have tls 1.1 disable 1.0 */
2071	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
2072		!= SSL_OP_NO_TLSv1){
2073		log_crypto_err("could not set SSL_OP_NO_TLSv1");
2074		SSL_CTX_free(ctx);
2075		return 0;
2076	}
2077#endif
2078#if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
2079	/* if we have tls 1.2 disable 1.1 */
2080	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
2081		!= SSL_OP_NO_TLSv1_1){
2082		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
2083		SSL_CTX_free(ctx);
2084		return 0;
2085	}
2086#endif
2087#if defined(SSL_OP_NO_RENEGOTIATION)
2088	/* disable client renegotiation */
2089	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2090		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2091		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2092		SSL_CTX_free(ctx);
2093		return 0;
2094	}
2095#endif
2096#if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2097	/* if we detect system-wide crypto policies, use those */
2098	if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) {
2099		/* if we have sha256, set the cipher list to have no known vulns */
2100		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2101			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2102	}
2103#endif
2104	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2105		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2106		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2107		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2108		SSL_CTX_free(ctx);
2109		return 0;
2110	}
2111#ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2112	SSL_CTX_set_security_level(ctx, 0);
2113#endif
2114	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2115		log_msg(LOG_ERR, "error for cert file: %s", pem);
2116		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2117		SSL_CTX_free(ctx);
2118		return NULL;
2119	}
2120	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2121		log_msg(LOG_ERR, "error for private key file: %s", key);
2122		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2123		SSL_CTX_free(ctx);
2124		return NULL;
2125	}
2126	if(!SSL_CTX_check_private_key(ctx)) {
2127		log_msg(LOG_ERR, "error for key file: %s", key);
2128		log_crypto_err("Error in SSL_CTX check_private_key");
2129		SSL_CTX_free(ctx);
2130		return NULL;
2131	}
2132	listen_sslctx_setup_2(ctx);
2133	if(verifypem && verifypem[0]) {
2134		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2135			log_crypto_err("Error in SSL_CTX verify locations");
2136			SSL_CTX_free(ctx);
2137			return NULL;
2138		}
2139		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2140		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2141	}
2142	return ctx;
2143}
2144
2145SSL_CTX*
2146server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2147{
2148	char *key, *pem;
2149	SSL_CTX *ctx;
2150
2151	key = nsd->options->tls_service_key;
2152	pem = nsd->options->tls_service_pem;
2153	if(!key || key[0] == 0) {
2154		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2155		return NULL;
2156	}
2157	if(!pem || pem[0] == 0) {
2158		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2159		return NULL;
2160	}
2161
2162	/* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but
2163	 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/
2164	ctx = server_tls_ctx_setup(key, pem, verifypem);
2165	if(!ctx) {
2166		log_msg(LOG_ERR, "could not setup server TLS context");
2167		return NULL;
2168	}
2169	if(ocspfile && ocspfile[0]) {
2170		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2171			log_crypto_err("Error reading OCSPfile");
2172			SSL_CTX_free(ctx);
2173			return NULL;
2174		} else {
2175			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2176			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2177				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2178				SSL_CTX_free(ctx);
2179				return NULL;
2180			}
2181		}
2182	}
2183	return ctx;
2184}
2185
2186/* check if tcp_handler_accept_data created for TLS dedicated port */
2187int
2188using_tls_port(struct sockaddr* addr, const char* tls_port)
2189{
2190	in_port_t port = 0;
2191
2192	if (addr->sa_family == AF_INET)
2193		port = ((struct sockaddr_in*)addr)->sin_port;
2194#ifndef HAVE_STRUCT_SOCKADDR_IN6
2195	else
2196		port = ((struct sockaddr_in6*)addr)->sin6_port;
2197#endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2198	if (atoi(tls_port) == ntohs(port))
2199		return 1;
2200
2201	return 0;
2202}
2203#endif
2204
2205/* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2206ssize_t
2207block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2208{
2209	uint8_t* buf = (uint8_t*) p;
2210	ssize_t total = 0;
2211	struct pollfd fd;
2212	memset(&fd, 0, sizeof(fd));
2213	fd.fd = s;
2214	fd.events = POLLIN;
2215
2216	while( total < sz) {
2217		ssize_t ret;
2218		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2219		if(ret == -1) {
2220			if(errno == EAGAIN)
2221				/* blocking read */
2222				continue;
2223			if(errno == EINTR) {
2224				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2225					return -1;
2226				/* other signals can be handled later */
2227				continue;
2228			}
2229			/* some error */
2230			return -1;
2231		}
2232		if(ret == 0) {
2233			/* operation timed out */
2234			return -2;
2235		}
2236		ret = read(s, buf+total, sz-total);
2237		if(ret == -1) {
2238			if(errno == EAGAIN)
2239				/* blocking read */
2240				continue;
2241			if(errno == EINTR) {
2242				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2243					return -1;
2244				/* other signals can be handled later */
2245				continue;
2246			}
2247			/* some error */
2248			return -1;
2249		}
2250		if(ret == 0) {
2251			/* closed connection! */
2252			return 0;
2253		}
2254		total += ret;
2255	}
2256	return total;
2257}
2258
2259static void
2260reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2261{
2262	sig_atomic_t cmd = NSD_QUIT_SYNC;
2263	udb_ptr t, next;
2264	udb_base* u = nsd->task[nsd->mytask];
2265	udb_ptr_init(&next, u);
2266	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2267	udb_base_set_userdata(u, 0);
2268	while(!udb_ptr_is_null(&t)) {
2269		/* store next in list so this one can be deleted or reused */
2270		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2271		udb_rptr_zero(&TASKLIST(&t)->next, u);
2272
2273		/* process task t */
2274		/* append results for task t and update last_task */
2275		task_process_in_reload(nsd, u, last_task, &t);
2276
2277		/* go to next */
2278		udb_ptr_set_ptr(&t, u, &next);
2279
2280		/* if the parent has quit, we must quit too, poll the fd for cmds */
2281		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2282			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2283			if(cmd == NSD_QUIT) {
2284				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2285				/* unlink files of remainder of tasks */
2286				while(!udb_ptr_is_null(&t)) {
2287					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2288						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2289					}
2290					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2291				}
2292				udb_ptr_unlink(&t, u);
2293				udb_ptr_unlink(&next, u);
2294				exit(0);
2295			}
2296		}
2297
2298	}
2299	udb_ptr_unlink(&t, u);
2300	udb_ptr_unlink(&next, u);
2301}
2302
2303void server_verify(struct nsd *nsd, int cmdsocket);
2304
2305/*
2306 * Reload the database, stop parent, re-fork children and continue.
2307 * as server_main.
2308 */
2309static void
2310server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2311	int cmdsocket)
2312{
2313	pid_t mypid;
2314	sig_atomic_t cmd = NSD_QUIT_SYNC;
2315	int ret;
2316	udb_ptr last_task;
2317	struct sigaction old_sigchld, ign_sigchld;
2318	struct radnode* node;
2319	zone_type* zone;
2320	enum soainfo_hint hint;
2321	/* ignore SIGCHLD from the previous server_main that used this pid */
2322	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2323	ign_sigchld.sa_handler = SIG_IGN;
2324	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2325
2326#ifdef HAVE_SETPROCTITLE
2327	setproctitle("main");
2328#endif
2329#ifdef HAVE_CPUSET_T
2330	if(nsd->use_cpu_affinity) {
2331		set_cpu_affinity(nsd->cpuset);
2332	}
2333#endif
2334
2335	/* see what tasks we got from xfrd */
2336	task_remap(nsd->task[nsd->mytask]);
2337	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2338	reload_process_tasks(nsd, &last_task, cmdsocket);
2339
2340#ifndef NDEBUG
2341	if(nsd_debug_level >= 1)
2342		region_log_stats(nsd->db->region);
2343#endif /* NDEBUG */
2344	initialize_dname_compression_tables(nsd);
2345
2346#ifdef BIND8_STATS
2347	/* Restart dumping stats if required.  */
2348	time(&nsd->st->boot);
2349	set_bind8_alarm(nsd);
2350	/* Switch to a different set of stat array for new server processes,
2351	 * because they can briefly coexist with the old processes. They
2352	 * have their own stat structure. */
2353	nsd->stat_current = (nsd->stat_current==0?1:0);
2354#endif
2355#ifdef USE_ZONE_STATS
2356	server_zonestat_realloc(nsd); /* realloc for new children */
2357	server_zonestat_switch(nsd);
2358#endif
2359
2360	if(nsd->options->verify_enable) {
2361#ifdef RATELIMIT
2362		/* allocate resources for rate limiting. use a slot that is guaranteed
2363		   not mapped to a file so no persistent data is overwritten */
2364		rrl_init(nsd->child_count + 1);
2365#endif
2366
2367		/* spin-up server and execute verifiers for each zone */
2368		server_verify(nsd, cmdsocket);
2369#ifdef RATELIMIT
2370		/* deallocate rate limiting resources */
2371		rrl_deinit(nsd->child_count + 1);
2372#endif
2373	}
2374
2375	for(node = radix_first(nsd->db->zonetree);
2376	    node != NULL;
2377	    node = radix_next(node))
2378	{
2379		zone = (zone_type *)node->elem;
2380		if(zone->is_updated) {
2381			if(zone->is_bad) {
2382				nsd->mode = NSD_RELOAD_FAILED;
2383				hint = soainfo_bad;
2384			} else {
2385				hint = soainfo_ok;
2386			}
2387			/* update(s), verified or not, possibly with subsequent
2388			   skipped update(s). skipped update(s) are picked up
2389			   by failed update check in xfrd */
2390			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2391			                 zone, hint);
2392		} else if(zone->is_skipped) {
2393			/* corrupt or inconsistent update without preceding
2394			   update(s), communicate soainfo_gone */
2395			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2396			                 zone, soainfo_gone);
2397		}
2398		zone->is_updated = 0;
2399		zone->is_skipped = 0;
2400	}
2401
2402	if(nsd->mode == NSD_RELOAD_FAILED) {
2403		exit(NSD_RELOAD_FAILED);
2404	}
2405
2406	/* listen for the signals of failed children again */
2407	sigaction(SIGCHLD, &old_sigchld, NULL);
2408#ifdef USE_DNSTAP
2409	if (nsd->dt_collector) {
2410		int *swap_fd_send;
2411		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2412		/* Swap fd_send with fd_swap so old serve child and new serve
2413		 * childs will not write to the same pipe ends simultaneously */
2414		swap_fd_send = nsd->dt_collector_fd_send;
2415		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2416		nsd->dt_collector_fd_swap = swap_fd_send;
2417
2418	}
2419#endif
2420	/* Start new child processes */
2421	if (server_start_children(nsd, server_region, netio, &nsd->
2422		xfrd_listener->fd) != 0) {
2423		send_children_quit(nsd);
2424		exit(1);
2425	}
2426
2427	/* if the parent has quit, we must quit too, poll the fd for cmds */
2428	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2429		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2430		if(cmd == NSD_QUIT) {
2431			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2432			send_children_quit(nsd);
2433			exit(0);
2434		}
2435	}
2436
2437	/* Send quit command to parent: blocking, wait for receipt. */
2438	do {
2439		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2440		cmd = NSD_QUIT_SYNC;
2441		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2442		{
2443			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2444				strerror(errno));
2445		}
2446		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2447		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2448		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2449			RELOAD_SYNC_TIMEOUT);
2450		if(ret == -2) {
2451			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2452		}
2453	} while (ret == -2);
2454	if(ret == -1) {
2455		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2456			strerror(errno));
2457	}
2458	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2459	if(cmd == NSD_QUIT) {
2460		/* small race condition possible here, parent got quit cmd. */
2461		send_children_quit(nsd);
2462		exit(1);
2463	}
2464	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2465	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2466	task_process_sync(nsd->task[nsd->mytask]);
2467#ifdef USE_ZONE_STATS
2468	server_zonestat_realloc(nsd); /* realloc for next children */
2469#endif
2470
2471	/* send soainfo to the xfrd process, signal it that reload is done,
2472	 * it picks up the taskudb */
2473	cmd = NSD_RELOAD_DONE;
2474	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2475		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2476			strerror(errno));
2477	}
2478	mypid = getpid();
2479	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2480		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2481			strerror(errno));
2482	}
2483
2484	/* try to reopen file */
2485	if (nsd->file_rotation_ok)
2486		log_reopen(nsd->log_filename, 1);
2487	/* exit reload, continue as new server_main */
2488}
2489
2490/*
2491 * Get the mode depending on the signal hints that have been received.
2492 * Multiple signal hints can be received and will be handled in turn.
2493 */
2494static sig_atomic_t
2495server_signal_mode(struct nsd *nsd)
2496{
2497	if(nsd->signal_hint_quit) {
2498		nsd->signal_hint_quit = 0;
2499		return NSD_QUIT;
2500	}
2501	else if(nsd->signal_hint_shutdown) {
2502		nsd->signal_hint_shutdown = 0;
2503		return NSD_SHUTDOWN;
2504	}
2505	else if(nsd->signal_hint_child) {
2506		nsd->signal_hint_child = 0;
2507		return NSD_REAP_CHILDREN;
2508	}
2509	else if(nsd->signal_hint_reload) {
2510		nsd->signal_hint_reload = 0;
2511		return NSD_RELOAD;
2512	}
2513	else if(nsd->signal_hint_reload_hup) {
2514		nsd->signal_hint_reload_hup = 0;
2515		return NSD_RELOAD_REQ;
2516	}
2517	else if(nsd->signal_hint_stats) {
2518		nsd->signal_hint_stats = 0;
2519#ifdef BIND8_STATS
2520		set_bind8_alarm(nsd);
2521#endif
2522		return NSD_STATS;
2523	}
2524	else if(nsd->signal_hint_statsusr) {
2525		nsd->signal_hint_statsusr = 0;
2526		return NSD_STATS;
2527	}
2528	return NSD_RUN;
2529}
2530
2531/*
2532 * The main server simply waits for signals and child processes to
2533 * terminate.  Child processes are restarted as necessary.
2534 */
2535void
2536server_main(struct nsd *nsd)
2537{
2538	region_type *server_region = region_create(xalloc, free);
2539	netio_type *netio = netio_create(server_region);
2540	netio_handler_type reload_listener;
2541	int reload_sockets[2] = {-1, -1};
2542	struct timespec timeout_spec;
2543	int status;
2544	pid_t child_pid;
2545	pid_t reload_pid = -1;
2546	sig_atomic_t mode;
2547
2548	/* Ensure we are the main process */
2549	assert(nsd->server_kind == NSD_SERVER_MAIN);
2550
2551	/* Add listener for the XFRD process */
2552	netio_add_handler(netio, nsd->xfrd_listener);
2553
2554#ifdef BIND8_STATS
2555	nsd->st = &nsd->stat_map[0];
2556	nsd->st->db_disk = 0;
2557	nsd->st->db_mem = region_get_mem(nsd->db->region);
2558#endif
2559
2560	/* Start the child processes that handle incoming queries */
2561	if (server_start_children(nsd, server_region, netio,
2562		&nsd->xfrd_listener->fd) != 0) {
2563		send_children_quit(nsd);
2564		exit(1);
2565	}
2566	reload_listener.fd = -1;
2567
2568	/* This_child MUST be 0, because this is the parent process */
2569	assert(nsd->this_child == 0);
2570
2571	/* Run the server until we get a shutdown signal */
2572	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2573		/* Did we receive a signal that changes our mode? */
2574		if(mode == NSD_RUN) {
2575			nsd->mode = mode = server_signal_mode(nsd);
2576		}
2577
2578		switch (mode) {
2579		case NSD_RUN:
2580			/* see if any child processes terminated */
2581			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2582				int is_child = delete_child_pid(nsd, child_pid);
2583				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2584					if(nsd->children[is_child].child_fd == -1)
2585						nsd->children[is_child].has_exited = 1;
2586					parent_check_all_children_exited(nsd);
2587				} else if(is_child != -1) {
2588					log_msg(LOG_WARNING,
2589					       "server %d died unexpectedly with status %d, restarting",
2590					       (int) child_pid, status);
2591					restart_child_servers(nsd, server_region, netio,
2592						&nsd->xfrd_listener->fd);
2593				} else if (child_pid == reload_pid) {
2594					sig_atomic_t cmd = NSD_RELOAD_FAILED;
2595					pid_t mypid;
2596					log_msg(LOG_WARNING,
2597					       "Reload process %d failed with status %d, continuing with old database",
2598					       (int) child_pid, status);
2599					reload_pid = -1;
2600					if(reload_listener.fd != -1) close(reload_listener.fd);
2601					netio_remove_handler(netio, &reload_listener);
2602					reload_listener.fd = -1;
2603					reload_listener.event_types = NETIO_EVENT_NONE;
2604					task_process_sync(nsd->task[nsd->mytask]);
2605					/* inform xfrd reload attempt ended */
2606					if(!write_socket(nsd->xfrd_listener->fd,
2607						&cmd, sizeof(cmd))) {
2608						log_msg(LOG_ERR, "problems "
2609						  "sending SOAEND to xfrd: %s",
2610						  strerror(errno));
2611					}
2612					mypid = getpid();
2613					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2614						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2615							strerror(errno));
2616					}
2617#ifdef USE_DNSTAP
2618				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2619					log_msg(LOG_WARNING,
2620					       "dnstap-collector %d terminated with status %d",
2621					       (int) child_pid, status);
2622					if(nsd->dt_collector) {
2623						dt_collector_close(nsd->dt_collector, nsd);
2624						dt_collector_destroy(nsd->dt_collector, nsd);
2625						nsd->dt_collector = NULL;
2626					}
2627					/* Only respawn a crashed (or exited)
2628					 * dnstap-collector when not reloading,
2629					 * to not induce a reload during a
2630					 * reload (which would seriously
2631					 * disrupt nsd procedures and lead to
2632					 * unpredictable results)!
2633					 *
2634					 * This will *leave* a dnstap-collector
2635					 * process terminated, but because
2636					 * signalling of the reload process to
2637					 * the main process to respawn in this
2638					 * situation will be cumbersome, and
2639					 * because this situation is so
2640					 * specific (and therefore hopefully
2641					 * extremely rare or non-existing at
2642					 * all), plus the fact that we are left
2643					 * with a perfectly function NSD
2644					 * (besides not logging dnstap
2645					 * messages), I consider it acceptable
2646					 * to leave this unresolved.
2647					 */
2648					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2649						nsd->dt_collector = dt_collector_create(nsd);
2650						dt_collector_start(nsd->dt_collector, nsd);
2651						nsd->mode = NSD_RELOAD_REQ;
2652					}
2653#endif
2654				} else if(status != 0) {
2655					/* check for status, because we get
2656					 * the old-servermain because reload
2657					 * is the process-parent of old-main,
2658					 * and we get older server-processes
2659					 * that are exiting after a reload */
2660					log_msg(LOG_WARNING,
2661					       "process %d terminated with status %d",
2662					       (int) child_pid, status);
2663				}
2664			}
2665			if (child_pid == -1) {
2666				if (errno == EINTR) {
2667					continue;
2668				}
2669				if (errno != ECHILD)
2670					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2671			}
2672			if (nsd->mode != NSD_RUN)
2673				break;
2674
2675			/* timeout to collect processes. In case no sigchild happens. */
2676			timeout_spec.tv_sec = 60;
2677			timeout_spec.tv_nsec = 0;
2678
2679			/* listen on ports, timeout for collecting terminated children */
2680			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2681				if (errno != EINTR) {
2682					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2683				}
2684			}
2685			if(nsd->restart_children) {
2686				restart_child_servers(nsd, server_region, netio,
2687					&nsd->xfrd_listener->fd);
2688				nsd->restart_children = 0;
2689			}
2690			if(nsd->reload_failed) {
2691				sig_atomic_t cmd = NSD_RELOAD_FAILED;
2692				pid_t mypid;
2693				nsd->reload_failed = 0;
2694				log_msg(LOG_WARNING,
2695				       "Reload process %d failed, continuing with old database",
2696				       (int) reload_pid);
2697				reload_pid = -1;
2698				if(reload_listener.fd != -1) close(reload_listener.fd);
2699				netio_remove_handler(netio, &reload_listener);
2700				reload_listener.fd = -1;
2701				reload_listener.event_types = NETIO_EVENT_NONE;
2702				task_process_sync(nsd->task[nsd->mytask]);
2703				/* inform xfrd reload attempt ended */
2704				if(!write_socket(nsd->xfrd_listener->fd,
2705					&cmd, sizeof(cmd))) {
2706					log_msg(LOG_ERR, "problems "
2707					  "sending SOAEND to xfrd: %s",
2708					  strerror(errno));
2709				}
2710				mypid = getpid();
2711				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2712					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2713						strerror(errno));
2714				}
2715			}
2716
2717			break;
2718		case NSD_RELOAD_REQ: {
2719			sig_atomic_t cmd = NSD_RELOAD_REQ;
2720			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2721			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2722				"main: ipc send reload_req to xfrd"));
2723			if(!write_socket(nsd->xfrd_listener->fd,
2724				&cmd, sizeof(cmd))) {
2725				log_msg(LOG_ERR, "server_main: could not send "
2726				"reload_req to xfrd: %s", strerror(errno));
2727			}
2728			nsd->mode = NSD_RUN;
2729			} break;
2730		case NSD_RELOAD:
2731			/* Continue to run nsd after reload */
2732			nsd->mode = NSD_RUN;
2733			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2734			if (reload_pid != -1) {
2735				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2736				       (int) reload_pid);
2737				break;
2738			}
2739
2740			/* switch the mytask to keep track of who owns task*/
2741			nsd->mytask = 1 - nsd->mytask;
2742			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2743				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2744				reload_pid = -1;
2745				break;
2746			}
2747
2748			/* Do actual reload */
2749			reload_pid = fork();
2750			switch (reload_pid) {
2751			case -1:
2752				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2753				break;
2754			default:
2755				/* PARENT */
2756				close(reload_sockets[0]);
2757				server_reload(nsd, server_region, netio,
2758					reload_sockets[1]);
2759				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2760				close(reload_sockets[1]);
2761				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2762				/* drop stale xfrd ipc data */
2763				((struct ipc_handler_conn_data*)nsd->
2764					xfrd_listener->user_data)
2765					->conn->is_reading = 0;
2766				reload_pid = -1;
2767				reload_listener.fd = -1;
2768				reload_listener.event_types = NETIO_EVENT_NONE;
2769				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2770				break;
2771			case 0:
2772				/* CHILD */
2773				/* server_main keep running until NSD_QUIT_SYNC
2774				 * received from reload. */
2775				close(reload_sockets[1]);
2776				reload_listener.fd = reload_sockets[0];
2777				reload_listener.timeout = NULL;
2778				reload_listener.user_data = nsd;
2779				reload_listener.event_types = NETIO_EVENT_READ;
2780				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2781				netio_add_handler(netio, &reload_listener);
2782				reload_pid = getppid();
2783				break;
2784			}
2785			break;
2786		case NSD_QUIT_SYNC:
2787			/* synchronisation of xfrd, parent and reload */
2788			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2789				sig_atomic_t cmd = NSD_RELOAD;
2790				/* stop xfrd ipc writes in progress */
2791				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2792					"main: ipc send indication reload"));
2793				if(!write_socket(nsd->xfrd_listener->fd,
2794					&cmd, sizeof(cmd))) {
2795					log_msg(LOG_ERR, "server_main: could not send reload "
2796					"indication to xfrd: %s", strerror(errno));
2797				}
2798				/* wait for ACK from xfrd */
2799				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2800				nsd->quit_sync_done = 1;
2801			}
2802			nsd->mode = NSD_RUN;
2803			break;
2804		case NSD_QUIT:
2805			/* silent shutdown during reload */
2806			if(reload_listener.fd != -1) {
2807				/* acknowledge the quit, to sync reload that we will really quit now */
2808				sig_atomic_t cmd = NSD_RELOAD;
2809				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2810				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2811					log_msg(LOG_ERR, "server_main: "
2812						"could not ack quit: %s", strerror(errno));
2813				}
2814				close(reload_listener.fd);
2815			}
2816			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2817			/* only quit children after xfrd has acked */
2818			send_children_quit(nsd);
2819
2820#ifdef MEMCLEAN /* OS collects memory pages */
2821			region_destroy(server_region);
2822#endif
2823			server_shutdown(nsd);
2824
2825			/* ENOTREACH */
2826			break;
2827		case NSD_SHUTDOWN:
2828			break;
2829		case NSD_REAP_CHILDREN:
2830			/* continue; wait for child in run loop */
2831			nsd->mode = NSD_RUN;
2832			break;
2833		case NSD_STATS:
2834#ifdef BIND8_STATS
2835			set_children_stats(nsd);
2836#endif
2837			nsd->mode = NSD_RUN;
2838			break;
2839		default:
2840			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2841			nsd->mode = NSD_RUN;
2842			break;
2843		}
2844	}
2845	log_msg(LOG_WARNING, "signal received, shutting down...");
2846
2847	/* close opened ports to avoid race with restart of nsd */
2848	server_close_all_sockets(nsd->udp, nsd->ifs);
2849	server_close_all_sockets(nsd->tcp, nsd->ifs);
2850	daemon_remote_close(nsd->rc);
2851	send_children_quit_and_wait(nsd);
2852
2853	/* Unlink it if possible... */
2854	unlinkpid(nsd->pidfile);
2855	unlink(nsd->task[0]->fname);
2856	unlink(nsd->task[1]->fname);
2857#ifdef USE_ZONE_STATS
2858	unlink(nsd->zonestatfname[0]);
2859	unlink(nsd->zonestatfname[1]);
2860#endif
2861#ifdef BIND8_STATS
2862	server_stat_free(nsd);
2863#endif
2864#ifdef USE_DNSTAP
2865	dt_collector_close(nsd->dt_collector, nsd);
2866#endif
2867
2868	if(reload_listener.fd != -1) {
2869		sig_atomic_t cmd = NSD_QUIT;
2870		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2871			"main: ipc send quit to reload-process"));
2872		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2873			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2874				strerror(errno));
2875		}
2876		fsync(reload_listener.fd);
2877		close(reload_listener.fd);
2878		/* wait for reload to finish processing */
2879		while(1) {
2880			if(waitpid(reload_pid, NULL, 0) == -1) {
2881				if(errno == EINTR) continue;
2882				if(errno == ECHILD) break;
2883				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2884					(int)reload_pid, strerror(errno));
2885			}
2886			break;
2887		}
2888	}
2889	if(nsd->xfrd_listener->fd != -1) {
2890		/* complete quit, stop xfrd */
2891		sig_atomic_t cmd = NSD_QUIT;
2892		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2893			"main: ipc send quit to xfrd"));
2894		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2895			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2896				strerror(errno));
2897		}
2898		fsync(nsd->xfrd_listener->fd);
2899		close(nsd->xfrd_listener->fd);
2900		(void)kill(nsd->pid, SIGTERM);
2901	}
2902
2903#ifdef MEMCLEAN /* OS collects memory pages */
2904	region_destroy(server_region);
2905#endif
2906	server_shutdown(nsd);
2907}
2908
2909static query_state_type
2910server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2911{
2912	return query_process(query, nsd, now_p);
2913}
2914
2915static query_state_type
2916server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2917{
2918#ifdef RATELIMIT
2919	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2920		if(query->edns.cookie_status != COOKIE_VALID
2921		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2922		&& rrl_process_query(query))
2923			return rrl_slip(query);
2924		else	return QUERY_PROCESSED;
2925	}
2926	return QUERY_DISCARDED;
2927#else
2928	return query_process(query, nsd, now_p);
2929#endif
2930}
2931
2932const char*
2933nsd_event_vs(void)
2934{
2935#ifdef USE_MINI_EVENT
2936	return "";
2937#else
2938	return event_get_version();
2939#endif
2940}
2941
2942#if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2943static const char* ub_ev_backend2str(int b)
2944{
2945	switch(b) {
2946	case EVBACKEND_SELECT:	return "select";
2947	case EVBACKEND_POLL:	return "poll";
2948	case EVBACKEND_EPOLL:	return "epoll";
2949	case EVBACKEND_KQUEUE:	return "kqueue";
2950	case EVBACKEND_DEVPOLL: return "devpoll";
2951	case EVBACKEND_PORT:	return "evport";
2952	}
2953	return "unknown";
2954}
2955#endif
2956
2957const char*
2958nsd_event_method(void)
2959{
2960#ifdef USE_MINI_EVENT
2961	return "select";
2962#else
2963	struct event_base* b = nsd_child_event_base();
2964	const char* m;
2965#  ifdef EV_FEATURE_BACKENDS
2966	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2967#  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2968	m = event_base_get_method(b);
2969#  else
2970	m = "?";
2971#  endif
2972#  ifdef MEMCLEAN
2973	event_base_free(b);
2974#  endif
2975	return m;
2976#endif
2977}
2978
2979struct event_base*
2980nsd_child_event_base(void)
2981{
2982	struct event_base* base;
2983#ifdef USE_MINI_EVENT
2984	static time_t secs;
2985	static struct timeval now;
2986	base = event_init(&secs, &now);
2987#else
2988#  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2989	/* libev */
2990	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2991#  else
2992	/* libevent */
2993#    ifdef HAVE_EVENT_BASE_NEW
2994	base = event_base_new();
2995#    else
2996	base = event_init();
2997#    endif
2998#  endif
2999#endif
3000	return base;
3001}
3002
3003static void
3004add_udp_handler(
3005	struct nsd *nsd,
3006	struct nsd_socket *sock,
3007	struct udp_handler_data *data)
3008{
3009	struct event *handler = &data->event;
3010
3011	data->nsd = nsd;
3012	data->socket = sock;
3013
3014	if(nsd->options->proxy_protocol_port &&
3015		sockaddr_uses_proxy_protocol_port(nsd->options,
3016		(struct sockaddr *)&sock->addr.ai_addr)) {
3017		data->pp2_enabled = 1;
3018	}
3019
3020	memset(handler, 0, sizeof(*handler));
3021	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
3022	if(event_base_set(nsd->event_base, handler) != 0)
3023		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
3024	if(event_add(handler, NULL) != 0)
3025		log_msg(LOG_ERR, "nsd udp: event_add failed");
3026}
3027
3028void
3029add_tcp_handler(
3030	struct nsd *nsd,
3031	struct nsd_socket *sock,
3032	struct tcp_accept_handler_data *data)
3033{
3034	struct event *handler = &data->event;
3035
3036	data->nsd = nsd;
3037	data->socket = sock;
3038
3039	if(nsd->options->proxy_protocol_port &&
3040		sockaddr_uses_proxy_protocol_port(nsd->options,
3041		(struct sockaddr *)&sock->addr.ai_addr)) {
3042		data->pp2_enabled = 1;
3043	}
3044
3045#ifdef HAVE_SSL
3046	if (nsd->tls_ctx &&
3047	    nsd->options->tls_port &&
3048	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
3049	{
3050		data->tls_accept = 1;
3051		if(verbosity >= 2) {
3052			char buf[48];
3053			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
3054			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3055		}
3056	} else {
3057		data->tls_accept = 0;
3058	}
3059#endif
3060
3061	memset(handler, 0, sizeof(*handler));
3062	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
3063	if(event_base_set(nsd->event_base, handler) != 0)
3064		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3065	if(event_add(handler, NULL) != 0)
3066		log_msg(LOG_ERR, "nsd tcp: event_add failed");
3067	data->event_added = 1;
3068}
3069
3070/*
3071 * Serve DNS request to verifiers (short-lived)
3072 */
3073void server_verify(struct nsd *nsd, int cmdsocket)
3074{
3075	size_t size = 0;
3076	struct event cmd_event, signal_event, exit_event;
3077	struct zone *zone;
3078
3079	assert(nsd != NULL);
3080
3081	zone = verify_next_zone(nsd, NULL);
3082	if(zone == NULL)
3083		return;
3084
3085	nsd->server_region = region_create(xalloc, free);
3086	nsd->event_base = nsd_child_event_base();
3087
3088	nsd->next_zone_to_verify = zone;
3089	nsd->verifier_count = 0;
3090	nsd->verifier_limit = nsd->options->verifier_count;
3091	size = sizeof(struct verifier) * nsd->verifier_limit;
3092	if(pipe(nsd->verifier_pipe) == -1) {
3093		log_msg(LOG_ERR, "verify: could not create pipe: %s",
3094				strerror(errno));
3095		goto fail_pipe;
3096	}
3097	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
3098	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
3099	nsd->verifiers = region_alloc_zero(nsd->server_region, size);
3100
3101	for(size_t i = 0; i < nsd->verifier_limit; i++) {
3102		nsd->verifiers[i].nsd = nsd;
3103		nsd->verifiers[i].zone = NULL;
3104		nsd->verifiers[i].pid = -1;
3105		nsd->verifiers[i].output_stream.fd = -1;
3106		nsd->verifiers[i].output_stream.priority = LOG_INFO;
3107		nsd->verifiers[i].error_stream.fd = -1;
3108		nsd->verifiers[i].error_stream.priority = LOG_ERR;
3109	}
3110
3111	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
3112	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
3113	   event_add(&cmd_event, NULL) != 0)
3114	{
3115		log_msg(LOG_ERR, "verify: could not add command event");
3116		goto fail;
3117	}
3118
3119	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
3120	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
3121	   signal_add(&signal_event, NULL) != 0)
3122	{
3123		log_msg(LOG_ERR, "verify: could not add signal event");
3124		goto fail;
3125	}
3126
3127	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
3128	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
3129	   event_add(&exit_event, NULL) != 0)
3130  {
3131		log_msg(LOG_ERR, "verify: could not add exit event");
3132		goto fail;
3133	}
3134
3135	memset(msgs, 0, sizeof(msgs));
3136	for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
3137		queries[i] = query_create(nsd->server_region,
3138			compressed_dname_offsets,
3139			compression_table_size, compressed_dnames);
3140		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3141		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3142		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3143		msgs[i].msg_hdr.msg_iov = &iovecs[i];
3144		msgs[i].msg_hdr.msg_iovlen = 1;
3145		msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr;
3146		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3147	}
3148
3149	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3150		struct udp_handler_data *data;
3151		data = region_alloc_zero(
3152			nsd->server_region, sizeof(*data));
3153		add_udp_handler(nsd, &nsd->verify_udp[i], data);
3154	}
3155
3156	tcp_accept_handler_count = nsd->verify_ifs;
3157	tcp_accept_handlers = region_alloc_array(nsd->server_region,
3158		nsd->verify_ifs, sizeof(*tcp_accept_handlers));
3159
3160	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3161		struct tcp_accept_handler_data *data;
3162		data = &tcp_accept_handlers[i];
3163		memset(data, 0, sizeof(*data));
3164		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
3165	}
3166
3167	while(nsd->next_zone_to_verify != NULL &&
3168	      nsd->verifier_count < nsd->verifier_limit)
3169	{
3170		verify_zone(nsd, nsd->next_zone_to_verify);
3171		nsd->next_zone_to_verify
3172			= verify_next_zone(nsd, nsd->next_zone_to_verify);
3173	}
3174
3175	/* short-lived main loop */
3176	event_base_dispatch(nsd->event_base);
3177
3178	/* remove command and exit event handlers */
3179	event_del(&exit_event);
3180	event_del(&signal_event);
3181	event_del(&cmd_event);
3182
3183	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
3184	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
3185fail:
3186	close(nsd->verifier_pipe[0]);
3187	close(nsd->verifier_pipe[1]);
3188fail_pipe:
3189	event_base_free(nsd->event_base);
3190	region_destroy(nsd->server_region);
3191
3192	nsd->event_base = NULL;
3193	nsd->server_region = NULL;
3194	nsd->verifier_limit = 0;
3195	nsd->verifier_pipe[0] = -1;
3196	nsd->verifier_pipe[1] = -1;
3197	nsd->verifiers = NULL;
3198}
3199
3200/*
3201 * Serve DNS requests.
3202 */
3203void
3204server_child(struct nsd *nsd)
3205{
3206	size_t i, from, numifs;
3207	region_type *server_region = region_create(xalloc, free);
3208	struct event_base* event_base = nsd_child_event_base();
3209	sig_atomic_t mode;
3210
3211	if(!event_base) {
3212		log_msg(LOG_ERR, "nsd server could not create event base");
3213		exit(1);
3214	}
3215	nsd->event_base = event_base;
3216	nsd->server_region = server_region;
3217
3218#ifdef RATELIMIT
3219	rrl_init(nsd->this_child->child_num);
3220#endif
3221
3222	assert(nsd->server_kind != NSD_SERVER_MAIN);
3223	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
3224
3225#ifdef HAVE_SETPROCTITLE
3226	setproctitle("server %d", nsd->this_child->child_num + 1);
3227#endif
3228#ifdef HAVE_CPUSET_T
3229	if(nsd->use_cpu_affinity) {
3230		set_cpu_affinity(nsd->this_child->cpuset);
3231	}
3232#endif
3233#ifdef BIND8_STATS
3234	nsd->st = &nsd->stats_per_child[nsd->stat_current]
3235		[nsd->this_child->child_num];
3236	nsd->st->boot = nsd->stat_map[0].boot;
3237	memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc));
3238#endif
3239
3240	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
3241		server_close_all_sockets(nsd->tcp, nsd->ifs);
3242	}
3243	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
3244		server_close_all_sockets(nsd->udp, nsd->ifs);
3245	}
3246
3247	if (nsd->this_child->parent_fd != -1) {
3248		struct event *handler;
3249		struct ipc_handler_conn_data* user_data =
3250			(struct ipc_handler_conn_data*)region_alloc(
3251			server_region, sizeof(struct ipc_handler_conn_data));
3252		user_data->nsd = nsd;
3253		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3254
3255		handler = (struct event*) region_alloc(
3256			server_region, sizeof(*handler));
3257		memset(handler, 0, sizeof(*handler));
3258		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3259			EV_READ, child_handle_parent_command, user_data);
3260		if(event_base_set(event_base, handler) != 0)
3261			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3262		if(event_add(handler, NULL) != 0)
3263			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3264	}
3265
3266	if(nsd->reuseport) {
3267		numifs = nsd->ifs / nsd->reuseport;
3268		from = numifs * nsd->this_child->child_num;
3269		if(from+numifs > nsd->ifs) { /* should not happen */
3270			from = 0;
3271			numifs = nsd->ifs;
3272		}
3273	} else {
3274		from = 0;
3275		numifs = nsd->ifs;
3276	}
3277
3278	if (nsd->server_kind & NSD_SERVER_UDP) {
3279		int child = nsd->this_child->child_num;
3280		memset(msgs, 0, sizeof(msgs));
3281		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3282			queries[i] = query_create(server_region,
3283				compressed_dname_offsets,
3284				compression_table_size, compressed_dnames);
3285			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3286			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3287			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3288			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3289			msgs[i].msg_hdr.msg_iovlen  = 1;
3290			msgs[i].msg_hdr.msg_name    = &queries[i]->remote_addr;
3291			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3292		}
3293
3294		for (i = 0; i < nsd->ifs; i++) {
3295			int listen;
3296			struct udp_handler_data *data;
3297
3298			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3299
3300			if(i >= from && i < (from + numifs) && listen) {
3301				data = region_alloc_zero(
3302					nsd->server_region, sizeof(*data));
3303				add_udp_handler(nsd, &nsd->udp[i], data);
3304			} else {
3305				/* close sockets intended for other servers */
3306				server_close_socket(&nsd->udp[i]);
3307			}
3308		}
3309	}
3310
3311	/*
3312	 * Keep track of all the TCP accept handlers so we can enable
3313	 * and disable them based on the current number of active TCP
3314	 * connections.
3315	 */
3316	if (nsd->server_kind & NSD_SERVER_TCP) {
3317		int child = nsd->this_child->child_num;
3318		tcp_accept_handler_count = numifs;
3319		tcp_accept_handlers = region_alloc_array(server_region,
3320			numifs, sizeof(*tcp_accept_handlers));
3321
3322		for (i = 0; i < nsd->ifs; i++) {
3323			int listen;
3324			struct tcp_accept_handler_data *data;
3325
3326			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3327
3328			if(i >= from && i < (from + numifs) && listen) {
3329				data = &tcp_accept_handlers[i-from];
3330				memset(data, 0, sizeof(*data));
3331				add_tcp_handler(nsd, &nsd->tcp[i], data);
3332			} else {
3333				/* close sockets intended for other servers */
3334				/*
3335				 * uncomment this once tcp servers are no
3336				 * longer copied in the tcp fd copy line
3337				 * in server_init().
3338				server_close_socket(&nsd->tcp[i]);
3339				*/
3340				/* close sockets not meant for this server*/
3341				if(!listen)
3342					server_close_socket(&nsd->tcp[i]);
3343			}
3344		}
3345	} else {
3346		tcp_accept_handler_count = 0;
3347	}
3348
3349	/* The main loop... */
3350	while ((mode = nsd->mode) != NSD_QUIT) {
3351		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3352
3353		/* Do we need to do the statistics... */
3354		if (mode == NSD_STATS) {
3355#ifdef BIND8_STATS
3356			int p = nsd->st_period;
3357			nsd->st_period = 1; /* force stats printout */
3358			/* Dump the statistics */
3359			bind8_stats(nsd);
3360			nsd->st_period = p;
3361#else /* !BIND8_STATS */
3362			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3363#endif /* BIND8_STATS */
3364
3365			nsd->mode = NSD_RUN;
3366		}
3367		else if (mode == NSD_REAP_CHILDREN) {
3368			/* got signal, notify parent. parent reaps terminated children. */
3369			if (nsd->this_child->parent_fd != -1) {
3370				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3371				if (write(nsd->this_child->parent_fd,
3372				    &parent_notify,
3373				    sizeof(parent_notify)) == -1)
3374				{
3375					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3376						(int) nsd->this_child->pid, strerror(errno));
3377				}
3378			} else /* no parent, so reap 'em */
3379				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3380			nsd->mode = NSD_RUN;
3381		}
3382		else if(mode == NSD_RUN) {
3383			/* Wait for a query... */
3384			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3385				if (errno != EINTR) {
3386					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3387					break;
3388				}
3389			}
3390		} else if(mode == NSD_QUIT) {
3391			/* ignore here, quit */
3392		} else {
3393			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3394				(int)mode);
3395			nsd->mode = NSD_RUN;
3396		}
3397	}
3398
3399	service_remaining_tcp(nsd);
3400#ifdef	BIND8_STATS
3401	bind8_stats(nsd);
3402#endif /* BIND8_STATS */
3403
3404#ifdef MEMCLEAN /* OS collects memory pages */
3405#ifdef RATELIMIT
3406	rrl_deinit(nsd->this_child->child_num);
3407#endif
3408	event_base_free(event_base);
3409	region_destroy(server_region);
3410#endif
3411	server_shutdown(nsd);
3412}
3413
3414static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3415{
3416	int* timed_out = (int*)arg;
3417        assert(event & EV_TIMEOUT); (void)event;
3418	/* wake up the service tcp thread, note event is no longer
3419	 * registered */
3420	*timed_out = 1;
3421}
3422
3423void
3424service_remaining_tcp(struct nsd* nsd)
3425{
3426	struct tcp_handler_data* p;
3427	struct event_base* event_base;
3428	/* check if it is needed */
3429	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3430		return;
3431	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3432#ifdef USE_DNSTAP
3433	/* remove dnstap collector, we cannot write there because the new
3434	 * child process is using the file descriptor, or the child
3435	 * process after that. */
3436	dt_collector_destroy(nsd->dt_collector, nsd);
3437	nsd->dt_collector = NULL;
3438#endif
3439	/* setup event base */
3440	event_base = nsd_child_event_base();
3441	if(!event_base) {
3442		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3443		return;
3444	}
3445	/* register tcp connections */
3446	for(p = tcp_active_list; p != NULL; p = p->next) {
3447		struct timeval timeout;
3448		int fd = p->event.ev_fd;
3449#ifdef USE_MINI_EVENT
3450		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3451#else
3452		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3453#endif
3454		void (*fn)(int, short, void*);
3455#ifdef HAVE_SSL
3456		if(p->tls) {
3457			if((event&EV_READ))
3458				fn = handle_tls_reading;
3459			else	fn = handle_tls_writing;
3460		} else {
3461#endif
3462			if((event&EV_READ))
3463				fn = handle_tcp_reading;
3464			else	fn = handle_tcp_writing;
3465#ifdef HAVE_SSL
3466		}
3467#endif
3468
3469		p->tcp_no_more_queries = 1;
3470		/* set timeout to 3 seconds (previously 1/10 second) */
3471		if(p->tcp_timeout > 3000)
3472			p->tcp_timeout = 3000;
3473		timeout.tv_sec = p->tcp_timeout / 1000;
3474		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3475		event_del(&p->event);
3476		memset(&p->event, 0, sizeof(p->event));
3477		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3478			fn, p);
3479		if(event_base_set(event_base, &p->event) != 0)
3480			log_msg(LOG_ERR, "event base set failed");
3481		if(event_add(&p->event, &timeout) != 0)
3482			log_msg(LOG_ERR, "event add failed");
3483	}
3484
3485	/* handle it */
3486	while(nsd->current_tcp_count > 0) {
3487		mode_t m = server_signal_mode(nsd);
3488		struct event timeout;
3489		struct timeval tv;
3490		int timed_out = 0;
3491		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3492			m == NSD_REAP_CHILDREN) {
3493			/* quit */
3494			break;
3495		}
3496		/* timer */
3497		/* have to do something every 3 seconds */
3498		tv.tv_sec = 3;
3499		tv.tv_usec = 0;
3500		memset(&timeout, 0, sizeof(timeout));
3501		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3502			&timed_out);
3503		if(event_base_set(event_base, &timeout) != 0)
3504			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3505		if(event_add(&timeout, &tv) != 0)
3506			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3507
3508		/* service loop */
3509		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3510			if (errno != EINTR) {
3511				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3512				break;
3513			}
3514		}
3515		if(!timed_out) {
3516			event_del(&timeout);
3517		} else {
3518			/* timed out, quit */
3519			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3520			break;
3521		}
3522	}
3523#ifdef MEMCLEAN
3524	event_base_free(event_base);
3525#endif
3526	/* continue to quit after return */
3527}
3528
3529/* Implement recvmmsg and sendmmsg if the platform does not. These functions
3530 * are always used, even if nonblocking operations are broken, in which case
3531 * NUM_RECV_PER_SELECT is defined to 1 (one).
3532 */
3533#if defined(HAVE_RECVMMSG)
3534#define nsd_recvmmsg recvmmsg
3535#else /* !HAVE_RECVMMSG */
3536
3537static int
3538nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3539             int flags, struct timespec *timeout)
3540{
3541	unsigned int vpos = 0;
3542	ssize_t rcvd;
3543
3544	/* timeout is ignored, ensure caller does not expect it to work */
3545	assert(timeout == NULL); (void)timeout;
3546
3547	while(vpos < vlen) {
3548		rcvd = recvfrom(sockfd,
3549		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3550		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3551		                flags,
3552		                msgvec[vpos].msg_hdr.msg_name,
3553		               &msgvec[vpos].msg_hdr.msg_namelen);
3554		if(rcvd < 0) {
3555			break;
3556		} else {
3557			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3558			msgvec[vpos].msg_len = (unsigned int)rcvd;
3559			vpos++;
3560		}
3561	}
3562
3563	if(vpos) {
3564		/* error will be picked up next time */
3565		return (int)vpos;
3566	} else if(errno == 0) {
3567		return 0;
3568	} else if(errno == EAGAIN) {
3569		return 0;
3570	}
3571
3572	return -1;
3573}
3574#endif /* HAVE_RECVMMSG */
3575
3576#ifdef HAVE_SENDMMSG
3577#define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3578#else /* !HAVE_SENDMMSG */
3579
3580static int
3581nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3582{
3583	unsigned int vpos = 0;
3584	ssize_t snd;
3585
3586	while(vpos < vlen) {
3587		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3588		snd = sendto(sockfd,
3589		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3590		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3591		             flags,
3592		             msgvec[vpos].msg_hdr.msg_name,
3593		             msgvec[vpos].msg_hdr.msg_namelen);
3594		if(snd < 0) {
3595			break;
3596		} else {
3597			msgvec[vpos].msg_len = (unsigned int)snd;
3598			vpos++;
3599		}
3600	}
3601
3602	if(vpos) {
3603		return (int)vpos;
3604	} else if(errno == 0) {
3605		return 0;
3606	}
3607
3608	return -1;
3609}
3610#endif /* HAVE_SENDMMSG */
3611
3612static int
3613port_is_zero(
3614#ifdef INET6
3615        struct sockaddr_storage *addr
3616#else
3617        struct sockaddr_in *addr
3618#endif
3619	)
3620{
3621#ifdef INET6
3622	if(addr->ss_family == AF_INET6) {
3623		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3624	} else if(addr->ss_family == AF_INET) {
3625		return (((struct sockaddr_in *)addr)->sin_port) == 0;
3626	}
3627	return 0;
3628#else
3629	if(addr->sin_family == AF_INET) {
3630		return addr->sin_port == 0;
3631	}
3632	return 0;
3633#endif
3634}
3635
3636/* Parses the PROXYv2 header from buf and updates the struct.
3637 * Returns 1 on success, 0 on failure. */
3638static int
3639consume_pp2_header(struct buffer* buf, struct query* q, int stream)
3640{
3641	size_t size;
3642	struct pp2_header* header;
3643	int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf));
3644	if(err) {
3645		VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse "
3646			"PROXYv2 header: %s", pp_lookup_error(err)));
3647		return 0;
3648	}
3649	header = (struct pp2_header*)buffer_begin(buf);
3650	size = PP2_HEADER_SIZE + read_uint16(&header->len);
3651	if(size > buffer_limit(buf)) {
3652		VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer "
3653			"size to read PROXYv2 header"));
3654		return 0;
3655	}
3656	if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) {
3657		/* A connection from the proxy itself.
3658		 * No need to do anything with addresses. */
3659		goto done;
3660	}
3661	if(header->fam_prot == PP2_UNSPEC_UNSPEC) {
3662		/* Unspecified family and protocol. This could be used for
3663		 * health checks by proxies.
3664		 * No need to do anything with addresses. */
3665		goto done;
3666	}
3667	/* Read the proxied address */
3668	switch(header->fam_prot) {
3669		case PP2_INET_STREAM:
3670		case PP2_INET_DGRAM:
3671			{
3672			struct sockaddr_in* addr =
3673				(struct sockaddr_in*)&q->client_addr;
3674			addr->sin_family = AF_INET;
3675			memmove(&addr->sin_addr.s_addr,
3676				&header->addr.addr4.src_addr, 4);
3677			memmove(&addr->sin_port, &header->addr.addr4.src_port,
3678				2);
3679			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in);
3680			}
3681			/* Ignore the destination address; it should be us. */
3682			break;
3683#ifdef INET6
3684		case PP2_INET6_STREAM:
3685		case PP2_INET6_DGRAM:
3686			{
3687			struct sockaddr_in6* addr =
3688				(struct sockaddr_in6*)&q->client_addr;
3689			memset(addr, 0, sizeof(*addr));
3690			addr->sin6_family = AF_INET6;
3691			memmove(&addr->sin6_addr,
3692				header->addr.addr6.src_addr, 16);
3693			memmove(&addr->sin6_port, &header->addr.addr6.src_port,
3694				2);
3695			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
3696			}
3697			/* Ignore the destination address; it should be us. */
3698			break;
3699#endif /* INET6 */
3700		default:
3701			VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported "
3702				"family and protocol 0x%x",
3703				(int)header->fam_prot));
3704			return 0;
3705	}
3706	q->is_proxied = 1;
3707done:
3708	if(!stream) {
3709		/* We are reading a whole packet;
3710		 * Move the rest of the data to overwrite the PROXYv2 header */
3711		/* XXX can we do better to avoid memmove? */
3712		memmove(header, ((char*)header)+size, buffer_limit(buf)-size);
3713		buffer_set_limit(buf, buffer_limit(buf)-size);
3714	}
3715	return 1;
3716}
3717
3718static void
3719handle_udp(int fd, short event, void* arg)
3720{
3721	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3722	int received, sent, recvcount, i;
3723	struct query *q;
3724	uint32_t now = 0;
3725
3726	if (!(event & EV_READ)) {
3727		return;
3728	}
3729	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3730	/* this printf strangely gave a performance increase on Linux */
3731	/* printf("recvcount %d \n", recvcount); */
3732	if (recvcount == -1) {
3733		if (errno != EAGAIN && errno != EINTR) {
3734			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3735			STATUP(data->nsd, rxerr);
3736			/* No zone statup */
3737		}
3738		/* Simply no data available */
3739		return;
3740	}
3741	for (i = 0; i < recvcount; i++) {
3742	loopstart:
3743		received = msgs[i].msg_len;
3744		queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen;
3745		queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr);
3746		queries[i]->is_proxied = 0;
3747		q = queries[i];
3748		if (received == -1) {
3749			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3750#if defined(HAVE_RECVMMSG)
3751				msgs[i].msg_hdr.msg_flags
3752#else
3753				errno
3754#endif
3755				));
3756			STATUP(data->nsd, rxerr);
3757			/* No zone statup */
3758			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3759			iovecs[i].iov_len = buffer_remaining(q->packet);
3760			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3761			goto swap_drop;
3762		}
3763
3764		/* Account... */
3765#ifdef BIND8_STATS
3766		if (data->socket->addr.ai_family == AF_INET) {
3767			STATUP(data->nsd, qudp);
3768		} else if (data->socket->addr.ai_family == AF_INET6) {
3769			STATUP(data->nsd, qudp6);
3770		}
3771#endif
3772
3773		buffer_skip(q->packet, received);
3774		buffer_flip(q->packet);
3775		if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) {
3776			VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not "
3777				"consume PROXYv2 header"));
3778			goto swap_drop;
3779		}
3780		if(!q->is_proxied) {
3781			q->client_addrlen = q->remote_addrlen;
3782			memmove(&q->client_addr, &q->remote_addr,
3783				q->remote_addrlen);
3784		}
3785#ifdef USE_DNSTAP
3786		/*
3787		 * sending UDP-query with server address (local) and client address to dnstap process
3788		 */
3789		log_addr("query from client", &q->client_addr);
3790		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3791		if(verbosity >= 6 && q->is_proxied)
3792			log_addr("query via proxy", &q->remote_addr);
3793		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen,
3794			q->tcp, q->packet);
3795#endif /* USE_DNSTAP */
3796
3797		/* Process and answer the query... */
3798		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3799			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3800				STATUP(data->nsd, nona);
3801				ZTATUP(data->nsd, q->zone, nona);
3802			}
3803
3804#ifdef USE_ZONE_STATS
3805			if (data->socket->addr.ai_family == AF_INET) {
3806				ZTATUP(data->nsd, q->zone, qudp);
3807			} else if (data->socket->addr.ai_family == AF_INET6) {
3808				ZTATUP(data->nsd, q->zone, qudp6);
3809			}
3810#endif
3811
3812			/* Add EDNS0 and TSIG info if necessary.  */
3813			query_add_optional(q, data->nsd, &now);
3814
3815			buffer_flip(q->packet);
3816			iovecs[i].iov_len = buffer_remaining(q->packet);
3817#ifdef BIND8_STATS
3818			/* Account the rcode & TC... */
3819			STATUP2(data->nsd, rcode, RCODE(q->packet));
3820			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3821			if (TC(q->packet)) {
3822				STATUP(data->nsd, truncated);
3823				ZTATUP(data->nsd, q->zone, truncated);
3824			}
3825#endif /* BIND8_STATS */
3826#ifdef USE_DNSTAP
3827			/*
3828			 * sending UDP-response with server address (local) and client address to dnstap process
3829			 */
3830			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3831			log_addr("response to client", &q->client_addr);
3832			if(verbosity >= 6 && q->is_proxied)
3833				log_addr("response via proxy", &q->remote_addr);
3834			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3835				&q->client_addr, q->client_addrlen, q->tcp, q->packet,
3836				q->zone);
3837#endif /* USE_DNSTAP */
3838		} else {
3839			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3840			iovecs[i].iov_len = buffer_remaining(q->packet);
3841			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3842		swap_drop:
3843			STATUP(data->nsd, dropped);
3844			ZTATUP(data->nsd, q->zone, dropped);
3845			if(i != recvcount-1) {
3846				/* swap with last and decrease recvcount */
3847				struct mmsghdr mtmp = msgs[i];
3848				struct iovec iotmp = iovecs[i];
3849				recvcount--;
3850				msgs[i] = msgs[recvcount];
3851				iovecs[i] = iovecs[recvcount];
3852				queries[i] = queries[recvcount];
3853				msgs[recvcount] = mtmp;
3854				iovecs[recvcount] = iotmp;
3855				queries[recvcount] = q;
3856				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3857				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3858				goto loopstart;
3859			} else { recvcount --; }
3860		}
3861	}
3862
3863	/* send until all are sent */
3864	i = 0;
3865	while(i<recvcount) {
3866		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3867		if(sent == -1) {
3868			if(errno == ENOBUFS ||
3869#ifdef EWOULDBLOCK
3870				errno == EWOULDBLOCK ||
3871#endif
3872				errno == EAGAIN) {
3873				/* block to wait until send buffer avail */
3874				int flag, errstore;
3875				if((flag = fcntl(fd, F_GETFL)) == -1) {
3876					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3877					flag = 0;
3878				}
3879				flag &= ~O_NONBLOCK;
3880				if(fcntl(fd, F_SETFL, flag) == -1)
3881					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3882				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3883				errstore = errno;
3884				flag |= O_NONBLOCK;
3885				if(fcntl(fd, F_SETFL, flag) == -1)
3886					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3887				if(sent != -1) {
3888					i += sent;
3889					continue;
3890				}
3891				errno = errstore;
3892			}
3893			if(errno == EINVAL) {
3894				/* skip the invalid argument entry,
3895				 * send the remaining packets in the list */
3896				if(!(port_is_zero((void*)&queries[i]->remote_addr) &&
3897					verbosity < 3)) {
3898					const char* es = strerror(errno);
3899					char a[64];
3900					addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
3901					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3902				}
3903				i += 1;
3904				continue;
3905			}
3906			/* don't log transient network full errors, unless
3907			 * on higher verbosity */
3908			if(!(errno == ENOBUFS && verbosity < 1) &&
3909#ifdef EWOULDBLOCK
3910			   errno != EWOULDBLOCK &&
3911#endif
3912			   errno != EAGAIN) {
3913				const char* es = strerror(errno);
3914				char a[64];
3915				addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
3916				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3917			}
3918#ifdef BIND8_STATS
3919			data->nsd->st->txerr += recvcount-i;
3920#endif /* BIND8_STATS */
3921			break;
3922		}
3923		i += sent;
3924	}
3925	for(i=0; i<recvcount; i++) {
3926		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3927		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3928		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3929	}
3930}
3931
3932#ifdef HAVE_SSL
3933/*
3934 * Setup an event for the tcp handler.
3935 */
3936static void
3937tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3938       int fd, short event)
3939{
3940	struct timeval timeout;
3941	struct event_base* ev_base;
3942
3943	timeout.tv_sec = data->nsd->tcp_timeout;
3944	timeout.tv_usec = 0L;
3945
3946	ev_base = data->event.ev_base;
3947	event_del(&data->event);
3948	memset(&data->event, 0, sizeof(data->event));
3949	event_set(&data->event, fd, event, fn, data);
3950	if(event_base_set(ev_base, &data->event) != 0)
3951		log_msg(LOG_ERR, "event base set failed");
3952	if(event_add(&data->event, &timeout) != 0)
3953		log_msg(LOG_ERR, "event add failed");
3954}
3955#endif /* HAVE_SSL */
3956
3957static void
3958cleanup_tcp_handler(struct tcp_handler_data* data)
3959{
3960	event_del(&data->event);
3961#ifdef HAVE_SSL
3962	if(data->tls) {
3963		SSL_shutdown(data->tls);
3964		SSL_free(data->tls);
3965		data->tls = NULL;
3966	}
3967#endif
3968	data->pp2_header_state = pp2_header_none;
3969	close(data->event.ev_fd);
3970	if(data->prev)
3971		data->prev->next = data->next;
3972	else	tcp_active_list = data->next;
3973	if(data->next)
3974		data->next->prev = data->prev;
3975
3976	/*
3977	 * Enable the TCP accept handlers when the current number of
3978	 * TCP connections is about to drop below the maximum number
3979	 * of TCP connections.
3980	 */
3981	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3982		configure_handler_event_types(EV_READ|EV_PERSIST);
3983		if(slowaccept) {
3984			event_del(&slowaccept_event);
3985			slowaccept = 0;
3986		}
3987	}
3988	--data->nsd->current_tcp_count;
3989	assert(data->nsd->current_tcp_count >= 0);
3990
3991	region_destroy(data->region);
3992}
3993
3994/* Read more data into the buffer for tcp read. Pass the amount of additional
3995 * data required. Returns false if nothing needs to be done this event, or
3996 * true if the additional data is in the buffer. */
3997static int
3998more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos,
3999	size_t add_amount, ssize_t* received)
4000{
4001	*received = read(fd, bufpos, add_amount);
4002	if (*received == -1) {
4003		if (errno == EAGAIN || errno == EINTR) {
4004			/*
4005			 * Read would block, wait until more
4006			 * data is available.
4007			 */
4008			return 0;
4009		} else {
4010			char buf[48];
4011			addr2str(&data->query->remote_addr, buf, sizeof(buf));
4012#ifdef ECONNRESET
4013			if (verbosity >= 2 || errno != ECONNRESET)
4014#endif /* ECONNRESET */
4015			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
4016			cleanup_tcp_handler(data);
4017			return 0;
4018		}
4019	} else if (*received == 0) {
4020		/* EOF */
4021		cleanup_tcp_handler(data);
4022		return 0;
4023	}
4024	return 1;
4025}
4026
4027static void
4028handle_tcp_reading(int fd, short event, void* arg)
4029{
4030	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4031	ssize_t received;
4032	struct event_base* ev_base;
4033	struct timeval timeout;
4034	uint32_t now = 0;
4035
4036	if ((event & EV_TIMEOUT)) {
4037		/* Connection timed out.  */
4038		cleanup_tcp_handler(data);
4039		return;
4040	}
4041
4042	if ((data->nsd->tcp_query_count > 0 &&
4043	     data->query_count >= data->nsd->tcp_query_count) ||
4044	    (data->query_count > 0 && data->tcp_no_more_queries))
4045  {
4046		/* No more queries allowed on this tcp connection. */
4047		cleanup_tcp_handler(data);
4048		return;
4049	}
4050
4051	assert((event & EV_READ));
4052
4053	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4054		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4055		data->query_needs_reset = 0;
4056	}
4057
4058	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4059		struct pp2_header* header = NULL;
4060		size_t want_read_size = 0;
4061		size_t current_read_size = 0;
4062		if(data->pp2_header_state == pp2_header_none) {
4063			want_read_size = PP2_HEADER_SIZE;
4064			if(buffer_remaining(data->query->packet) <
4065				want_read_size) {
4066				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4067				cleanup_tcp_handler(data);
4068				return;
4069			}
4070			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4071			current_read_size = want_read_size;
4072			if(data->bytes_transmitted < current_read_size) {
4073				if(!more_read_buf_tcp(fd, data,
4074					(void*)buffer_at(data->query->packet,
4075						data->bytes_transmitted),
4076					current_read_size - data->bytes_transmitted,
4077					&received))
4078					return;
4079				data->bytes_transmitted += received;
4080				buffer_skip(data->query->packet, received);
4081				if(data->bytes_transmitted != current_read_size)
4082					return;
4083				data->pp2_header_state = pp2_header_init;
4084			}
4085		}
4086		if(data->pp2_header_state == pp2_header_init) {
4087			int err;
4088			err = pp2_read_header(buffer_begin(data->query->packet),
4089				buffer_limit(data->query->packet));
4090			if(err) {
4091				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4092				cleanup_tcp_handler(data);
4093				return;
4094			}
4095			header = (struct pp2_header*)buffer_begin(data->query->packet);
4096			want_read_size = ntohs(header->len);
4097			if(buffer_limit(data->query->packet) <
4098				PP2_HEADER_SIZE + want_read_size) {
4099				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4100				cleanup_tcp_handler(data);
4101				return;
4102			}
4103			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4104			current_read_size = PP2_HEADER_SIZE + want_read_size;
4105			if(want_read_size == 0) {
4106				/* nothing more to read; header is complete */
4107				data->pp2_header_state = pp2_header_done;
4108			} else if(data->bytes_transmitted < current_read_size) {
4109				if(!more_read_buf_tcp(fd, data,
4110					(void*)buffer_at(data->query->packet,
4111						data->bytes_transmitted),
4112					current_read_size - data->bytes_transmitted,
4113					&received))
4114					return;
4115				data->bytes_transmitted += received;
4116				buffer_skip(data->query->packet, received);
4117				if(data->bytes_transmitted != current_read_size)
4118					return;
4119				data->pp2_header_state = pp2_header_done;
4120			}
4121		}
4122		if(data->pp2_header_state != pp2_header_done || !header) {
4123			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4124
4125			cleanup_tcp_handler(data);
4126			return;
4127		}
4128		buffer_flip(data->query->packet);
4129		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4130			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4131
4132			cleanup_tcp_handler(data);
4133			return;
4134		}
4135		/* Clear and reset the buffer to read the following
4136		 * DNS packet(s). */
4137		buffer_clear(data->query->packet);
4138		data->bytes_transmitted = 0;
4139	}
4140
4141	/*
4142	 * Check if we received the leading packet length bytes yet.
4143	 */
4144	if (data->bytes_transmitted < sizeof(uint16_t)) {
4145		if(!more_read_buf_tcp(fd, data,
4146			(char*) &data->query->tcplen + data->bytes_transmitted,
4147			sizeof(uint16_t) - data->bytes_transmitted, &received))
4148			return;
4149		data->bytes_transmitted += received;
4150		if (data->bytes_transmitted < sizeof(uint16_t)) {
4151			/*
4152			 * Not done with the tcplen yet, wait for more
4153			 * data to become available.
4154			 */
4155			return;
4156		}
4157		assert(data->bytes_transmitted == sizeof(uint16_t));
4158
4159		data->query->tcplen = ntohs(data->query->tcplen);
4160
4161		/*
4162		 * Minimum query size is:
4163		 *
4164		 *     Size of the header (12)
4165		 *   + Root domain name   (1)
4166		 *   + Query class        (2)
4167		 *   + Query type         (2)
4168		 */
4169		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4170			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4171			cleanup_tcp_handler(data);
4172			return;
4173		}
4174
4175		if (data->query->tcplen > data->query->maxlen) {
4176			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4177			cleanup_tcp_handler(data);
4178			return;
4179		}
4180
4181		buffer_set_limit(data->query->packet, data->query->tcplen);
4182	}
4183
4184	assert(buffer_remaining(data->query->packet) > 0);
4185
4186	/* Read the (remaining) query data.  */
4187	if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet),
4188		buffer_remaining(data->query->packet), &received))
4189		return;
4190	data->bytes_transmitted += received;
4191	buffer_skip(data->query->packet, received);
4192	if (buffer_remaining(data->query->packet) > 0) {
4193		/*
4194		 * Message not yet complete, wait for more data to
4195		 * become available.
4196		 */
4197		return;
4198	}
4199
4200	assert(buffer_position(data->query->packet) == data->query->tcplen);
4201
4202	/* Account... */
4203#ifdef BIND8_STATS
4204#ifndef INET6
4205	STATUP(data->nsd, ctcp);
4206#else
4207	if (data->query->remote_addr.ss_family == AF_INET) {
4208		STATUP(data->nsd, ctcp);
4209	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4210		STATUP(data->nsd, ctcp6);
4211	}
4212#endif
4213#endif /* BIND8_STATS */
4214
4215	/* We have a complete query, process it.  */
4216
4217	/* tcp-query-count: handle query counter ++ */
4218	data->query_count++;
4219
4220	buffer_flip(data->query->packet);
4221#ifdef USE_DNSTAP
4222	/*
4223	 * and send TCP-query with found address (local) and client address to dnstap process
4224	 */
4225	log_addr("query from client", &data->query->client_addr);
4226	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4227	if(verbosity >= 6 && data->query->is_proxied)
4228		log_addr("query via proxy", &data->query->remote_addr);
4229	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4230		data->query->client_addrlen, data->query->tcp, data->query->packet);
4231#endif /* USE_DNSTAP */
4232	data->query_state = server_process_query(data->nsd, data->query, &now);
4233	if (data->query_state == QUERY_DISCARDED) {
4234		/* Drop the packet and the entire connection... */
4235		STATUP(data->nsd, dropped);
4236		ZTATUP(data->nsd, data->query->zone, dropped);
4237		cleanup_tcp_handler(data);
4238		return;
4239	}
4240
4241#ifdef BIND8_STATS
4242	if (RCODE(data->query->packet) == RCODE_OK
4243	    && !AA(data->query->packet))
4244	{
4245		STATUP(data->nsd, nona);
4246		ZTATUP(data->nsd, data->query->zone, nona);
4247	}
4248#endif /* BIND8_STATS */
4249
4250#ifdef USE_ZONE_STATS
4251#ifndef INET6
4252	ZTATUP(data->nsd, data->query->zone, ctcp);
4253#else
4254	if (data->query->remote_addr.ss_family == AF_INET) {
4255		ZTATUP(data->nsd, data->query->zone, ctcp);
4256	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4257		ZTATUP(data->nsd, data->query->zone, ctcp6);
4258	}
4259#endif
4260#endif /* USE_ZONE_STATS */
4261
4262	query_add_optional(data->query, data->nsd, &now);
4263
4264	/* Switch to the tcp write handler.  */
4265	buffer_flip(data->query->packet);
4266	data->query->tcplen = buffer_remaining(data->query->packet);
4267#ifdef BIND8_STATS
4268	/* Account the rcode & TC... */
4269	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4270	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4271	if (TC(data->query->packet)) {
4272		STATUP(data->nsd, truncated);
4273		ZTATUP(data->nsd, data->query->zone, truncated);
4274	}
4275#endif /* BIND8_STATS */
4276#ifdef USE_DNSTAP
4277	/*
4278	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
4279	 */
4280	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4281	log_addr("response to client", &data->query->client_addr);
4282	if(verbosity >= 6 && data->query->is_proxied)
4283		log_addr("response via proxy", &data->query->remote_addr);
4284	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4285		data->query->client_addrlen, data->query->tcp, data->query->packet,
4286		data->query->zone);
4287#endif /* USE_DNSTAP */
4288	data->bytes_transmitted = 0;
4289
4290	timeout.tv_sec = data->tcp_timeout / 1000;
4291	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4292
4293	ev_base = data->event.ev_base;
4294	event_del(&data->event);
4295	memset(&data->event, 0, sizeof(data->event));
4296	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4297		handle_tcp_writing, data);
4298	if(event_base_set(ev_base, &data->event) != 0)
4299		log_msg(LOG_ERR, "event base set tcpr failed");
4300	if(event_add(&data->event, &timeout) != 0)
4301		log_msg(LOG_ERR, "event add tcpr failed");
4302	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
4303	handle_tcp_writing(fd, EV_WRITE, data);
4304}
4305
4306static void
4307handle_tcp_writing(int fd, short event, void* arg)
4308{
4309	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4310	ssize_t sent;
4311	struct query *q = data->query;
4312	struct timeval timeout;
4313	struct event_base* ev_base;
4314	uint32_t now = 0;
4315
4316	if ((event & EV_TIMEOUT)) {
4317		/* Connection timed out.  */
4318		cleanup_tcp_handler(data);
4319		return;
4320	}
4321
4322	assert((event & EV_WRITE));
4323
4324	if (data->bytes_transmitted < sizeof(q->tcplen)) {
4325		/* Writing the response packet length.  */
4326		uint16_t n_tcplen = htons(q->tcplen);
4327#ifdef HAVE_WRITEV
4328		struct iovec iov[2];
4329		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
4330		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
4331		iov[1].iov_base = buffer_begin(q->packet);
4332		iov[1].iov_len = buffer_limit(q->packet);
4333		sent = writev(fd, iov, 2);
4334#else /* HAVE_WRITEV */
4335		sent = write(fd,
4336			     (const char *) &n_tcplen + data->bytes_transmitted,
4337			     sizeof(n_tcplen) - data->bytes_transmitted);
4338#endif /* HAVE_WRITEV */
4339		if (sent == -1) {
4340			if (errno == EAGAIN || errno == EINTR) {
4341				/*
4342				 * Write would block, wait until
4343				 * socket becomes writable again.
4344				 */
4345				return;
4346			} else {
4347#ifdef ECONNRESET
4348				if(verbosity >= 2 || errno != ECONNRESET)
4349#endif /* ECONNRESET */
4350#ifdef EPIPE
4351				  if(verbosity >= 2 || errno != EPIPE)
4352#endif /* EPIPE 'broken pipe' */
4353				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4354				cleanup_tcp_handler(data);
4355				return;
4356			}
4357		}
4358
4359		data->bytes_transmitted += sent;
4360		if (data->bytes_transmitted < sizeof(q->tcplen)) {
4361			/*
4362			 * Writing not complete, wait until socket
4363			 * becomes writable again.
4364			 */
4365			return;
4366		}
4367
4368#ifdef HAVE_WRITEV
4369		sent -= sizeof(n_tcplen);
4370		/* handle potential 'packet done' code */
4371		goto packet_could_be_done;
4372#endif
4373 	}
4374
4375	sent = write(fd,
4376		     buffer_current(q->packet),
4377		     buffer_remaining(q->packet));
4378	if (sent == -1) {
4379		if (errno == EAGAIN || errno == EINTR) {
4380			/*
4381			 * Write would block, wait until
4382			 * socket becomes writable again.
4383			 */
4384			return;
4385		} else {
4386#ifdef ECONNRESET
4387			if(verbosity >= 2 || errno != ECONNRESET)
4388#endif /* ECONNRESET */
4389#ifdef EPIPE
4390				  if(verbosity >= 2 || errno != EPIPE)
4391#endif /* EPIPE 'broken pipe' */
4392			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4393			cleanup_tcp_handler(data);
4394			return;
4395		}
4396	}
4397
4398	data->bytes_transmitted += sent;
4399#ifdef HAVE_WRITEV
4400  packet_could_be_done:
4401#endif
4402	buffer_skip(q->packet, sent);
4403	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4404		/*
4405		 * Still more data to write when socket becomes
4406		 * writable again.
4407		 */
4408		return;
4409	}
4410
4411	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4412
4413	if (data->query_state == QUERY_IN_AXFR ||
4414		data->query_state == QUERY_IN_IXFR) {
4415		/* Continue processing AXFR and writing back results.  */
4416		buffer_clear(q->packet);
4417		if(data->query_state == QUERY_IN_AXFR)
4418			data->query_state = query_axfr(data->nsd, q, 0);
4419		else data->query_state = query_ixfr(data->nsd, q);
4420		if (data->query_state != QUERY_PROCESSED) {
4421			query_add_optional(data->query, data->nsd, &now);
4422
4423			/* Reset data. */
4424			buffer_flip(q->packet);
4425			q->tcplen = buffer_remaining(q->packet);
4426			data->bytes_transmitted = 0;
4427			/* Reset timeout.  */
4428			timeout.tv_sec = data->tcp_timeout / 1000;
4429			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4430			ev_base = data->event.ev_base;
4431			event_del(&data->event);
4432			memset(&data->event, 0, sizeof(data->event));
4433			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4434				handle_tcp_writing, data);
4435			if(event_base_set(ev_base, &data->event) != 0)
4436				log_msg(LOG_ERR, "event base set tcpw failed");
4437			if(event_add(&data->event, &timeout) != 0)
4438				log_msg(LOG_ERR, "event add tcpw failed");
4439
4440			/*
4441			 * Write data if/when the socket is writable
4442			 * again.
4443			 */
4444			return;
4445		}
4446	}
4447
4448	/*
4449	 * Done sending, wait for the next request to arrive on the
4450	 * TCP socket by installing the TCP read handler.
4451	 */
4452	if ((data->nsd->tcp_query_count > 0 &&
4453		data->query_count >= data->nsd->tcp_query_count) ||
4454		data->tcp_no_more_queries) {
4455
4456		(void) shutdown(fd, SHUT_WR);
4457	}
4458
4459	data->bytes_transmitted = 0;
4460	data->query_needs_reset = 1;
4461
4462	timeout.tv_sec = data->tcp_timeout / 1000;
4463	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4464	ev_base = data->event.ev_base;
4465	event_del(&data->event);
4466	memset(&data->event, 0, sizeof(data->event));
4467	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4468		handle_tcp_reading, data);
4469	if(event_base_set(ev_base, &data->event) != 0)
4470		log_msg(LOG_ERR, "event base set tcpw failed");
4471	if(event_add(&data->event, &timeout) != 0)
4472		log_msg(LOG_ERR, "event add tcpw failed");
4473}
4474
4475#ifdef HAVE_SSL
4476/** create SSL object and associate fd */
4477static SSL*
4478incoming_ssl_fd(SSL_CTX* ctx, int fd)
4479{
4480	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4481	if(!ssl) {
4482		log_crypto_err("could not SSL_new");
4483		return NULL;
4484	}
4485	SSL_set_accept_state(ssl);
4486	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4487	if(!SSL_set_fd(ssl, fd)) {
4488		log_crypto_err("could not SSL_set_fd");
4489		SSL_free(ssl);
4490		return NULL;
4491	}
4492	return ssl;
4493}
4494
4495/** TLS handshake to upgrade TCP connection */
4496static int
4497tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4498{
4499	int r;
4500	if(data->shake_state == tls_hs_read_event) {
4501		/* read condition satisfied back to writing */
4502		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4503		data->shake_state = tls_hs_none;
4504		return 1;
4505	}
4506	if(data->shake_state == tls_hs_write_event) {
4507		/* write condition satisfied back to reading */
4508		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4509		data->shake_state = tls_hs_none;
4510		return 1;
4511	}
4512
4513	/* (continue to) setup the TLS connection */
4514	ERR_clear_error();
4515	r = SSL_do_handshake(data->tls);
4516
4517	if(r != 1) {
4518		int want = SSL_get_error(data->tls, r);
4519		if(want == SSL_ERROR_WANT_READ) {
4520			if(data->shake_state == tls_hs_read) {
4521				/* try again later */
4522				return 1;
4523			}
4524			data->shake_state = tls_hs_read;
4525			/* switch back to reading mode */
4526			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4527			return 1;
4528		} else if(want == SSL_ERROR_WANT_WRITE) {
4529			if(data->shake_state == tls_hs_write) {
4530				/* try again later */
4531				return 1;
4532			}
4533			data->shake_state = tls_hs_write;
4534			/* switch back to writing mode */
4535			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4536			return 1;
4537		} else {
4538			if(r == 0)
4539				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4540			else {
4541				unsigned long err = ERR_get_error();
4542				if(!squelch_err_ssl_handshake(err)) {
4543					char a[64], s[256];
4544					addr2str(&data->query->remote_addr, a, sizeof(a));
4545					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4546					log_crypto_from_err(s, err);
4547				}
4548			}
4549			cleanup_tcp_handler(data);
4550			return 0;
4551		}
4552	}
4553
4554	/* Use to log successful upgrade for testing - could be removed*/
4555	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4556	/* set back to the event we need to have when reading (or writing) */
4557	if(data->shake_state == tls_hs_read && writing) {
4558		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4559	} else if(data->shake_state == tls_hs_write && !writing) {
4560		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4561	}
4562	data->shake_state = tls_hs_none;
4563	return 1;
4564}
4565
4566/* Read more data into the buffer for tls read. Pass the amount of additional
4567 * data required. Returns false if nothing needs to be done this event, or
4568 * true if the additional data is in the buffer. */
4569static int
4570more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos,
4571	size_t add_amount, ssize_t* received)
4572{
4573	ERR_clear_error();
4574	if((*received=SSL_read(data->tls, bufpos, add_amount)) <= 0) {
4575		int want = SSL_get_error(data->tls, *received);
4576		if(want == SSL_ERROR_ZERO_RETURN) {
4577			cleanup_tcp_handler(data);
4578			return 0; /* shutdown, closed */
4579		} else if(want == SSL_ERROR_WANT_READ) {
4580			/* wants to be called again */
4581			return 0;
4582		}
4583		else if(want == SSL_ERROR_WANT_WRITE) {
4584			/* switch to writing */
4585			data->shake_state = tls_hs_write_event;
4586			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4587			return 0;
4588		}
4589		cleanup_tcp_handler(data);
4590		log_crypto_err("could not SSL_read");
4591		return 0;
4592	}
4593	return 1;
4594}
4595
4596/** handle TLS reading of incoming query */
4597static void
4598handle_tls_reading(int fd, short event, void* arg)
4599{
4600	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4601	ssize_t received;
4602	uint32_t now = 0;
4603
4604	if ((event & EV_TIMEOUT)) {
4605		/* Connection timed out.  */
4606		cleanup_tcp_handler(data);
4607		return;
4608	}
4609
4610	if ((data->nsd->tcp_query_count > 0 &&
4611	     data->query_count >= data->nsd->tcp_query_count) ||
4612	    (data->query_count > 0 && data->tcp_no_more_queries))
4613	{
4614		/* No more queries allowed on this tcp connection. */
4615		cleanup_tcp_handler(data);
4616		return;
4617	}
4618
4619	assert((event & EV_READ));
4620
4621	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4622		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4623		data->query_needs_reset = 0;
4624	}
4625
4626	if(data->shake_state != tls_hs_none) {
4627		if(!tls_handshake(data, fd, 0))
4628			return;
4629		if(data->shake_state != tls_hs_none)
4630			return;
4631	}
4632
4633	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4634		struct pp2_header* header = NULL;
4635		size_t want_read_size = 0;
4636		size_t current_read_size = 0;
4637		if(data->pp2_header_state == pp2_header_none) {
4638			want_read_size = PP2_HEADER_SIZE;
4639			if(buffer_remaining(data->query->packet) <
4640				want_read_size) {
4641				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4642				cleanup_tcp_handler(data);
4643				return;
4644			}
4645			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4646			current_read_size = want_read_size;
4647			if(data->bytes_transmitted < current_read_size) {
4648				if(!more_read_buf_tls(fd, data,
4649					buffer_at(data->query->packet,
4650						data->bytes_transmitted),
4651					current_read_size - data->bytes_transmitted,
4652					&received))
4653					return;
4654				data->bytes_transmitted += received;
4655				buffer_skip(data->query->packet, received);
4656				if(data->bytes_transmitted != current_read_size)
4657					return;
4658				data->pp2_header_state = pp2_header_init;
4659			}
4660		}
4661		if(data->pp2_header_state == pp2_header_init) {
4662			int err;
4663			err = pp2_read_header(buffer_begin(data->query->packet),
4664				buffer_limit(data->query->packet));
4665			if(err) {
4666				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4667				cleanup_tcp_handler(data);
4668				return;
4669			}
4670			header = (struct pp2_header*)buffer_begin(data->query->packet);
4671			want_read_size = ntohs(header->len);
4672			if(buffer_limit(data->query->packet) <
4673				PP2_HEADER_SIZE + want_read_size) {
4674				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4675				cleanup_tcp_handler(data);
4676				return;
4677			}
4678			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4679			current_read_size = PP2_HEADER_SIZE + want_read_size;
4680			if(want_read_size == 0) {
4681				/* nothing more to read; header is complete */
4682				data->pp2_header_state = pp2_header_done;
4683			} else if(data->bytes_transmitted < current_read_size) {
4684				if(!more_read_buf_tls(fd, data,
4685					buffer_at(data->query->packet,
4686						data->bytes_transmitted),
4687					current_read_size - data->bytes_transmitted,
4688					&received))
4689					return;
4690				data->bytes_transmitted += received;
4691				buffer_skip(data->query->packet, received);
4692				if(data->bytes_transmitted != current_read_size)
4693					return;
4694				data->pp2_header_state = pp2_header_done;
4695			}
4696		}
4697		if(data->pp2_header_state != pp2_header_done || !header) {
4698			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4699			cleanup_tcp_handler(data);
4700			return;
4701		}
4702		buffer_flip(data->query->packet);
4703		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4704			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4705			cleanup_tcp_handler(data);
4706			return;
4707		}
4708		/* Clear and reset the buffer to read the following
4709		 * DNS packet(s). */
4710		buffer_clear(data->query->packet);
4711		data->bytes_transmitted = 0;
4712	}
4713	/*
4714	 * Check if we received the leading packet length bytes yet.
4715	 */
4716	if(data->bytes_transmitted < sizeof(uint16_t)) {
4717		if(!more_read_buf_tls(fd, data,
4718		    (char *) &data->query->tcplen + data->bytes_transmitted,
4719		    sizeof(uint16_t) - data->bytes_transmitted, &received))
4720			return;
4721		data->bytes_transmitted += received;
4722		if (data->bytes_transmitted < sizeof(uint16_t)) {
4723			/*
4724			 * Not done with the tcplen yet, wait for more
4725			 * data to become available.
4726			 */
4727			return;
4728		}
4729
4730		assert(data->bytes_transmitted == sizeof(uint16_t));
4731
4732		data->query->tcplen = ntohs(data->query->tcplen);
4733
4734		/*
4735		 * Minimum query size is:
4736		 *
4737		 *     Size of the header (12)
4738		 *   + Root domain name   (1)
4739		 *   + Query class        (2)
4740		 *   + Query type         (2)
4741		 */
4742		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4743			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4744			cleanup_tcp_handler(data);
4745			return;
4746		}
4747
4748		if (data->query->tcplen > data->query->maxlen) {
4749			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4750			cleanup_tcp_handler(data);
4751			return;
4752		}
4753
4754		buffer_set_limit(data->query->packet, data->query->tcplen);
4755	}
4756
4757	assert(buffer_remaining(data->query->packet) > 0);
4758
4759	/* Read the (remaining) query data.  */
4760	if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet),
4761		buffer_remaining(data->query->packet), &received))
4762		return;
4763	data->bytes_transmitted += received;
4764	buffer_skip(data->query->packet, received);
4765	if (buffer_remaining(data->query->packet) > 0) {
4766		/*
4767		 * Message not yet complete, wait for more data to
4768		 * become available.
4769		 */
4770		return;
4771	}
4772
4773	assert(buffer_position(data->query->packet) == data->query->tcplen);
4774
4775	/* Account... */
4776#ifndef INET6
4777	STATUP(data->nsd, ctls);
4778#else
4779	if (data->query->remote_addr.ss_family == AF_INET) {
4780		STATUP(data->nsd, ctls);
4781	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4782		STATUP(data->nsd, ctls6);
4783	}
4784#endif
4785
4786	/* We have a complete query, process it.  */
4787
4788	/* tcp-query-count: handle query counter ++ */
4789	data->query_count++;
4790
4791	buffer_flip(data->query->packet);
4792#ifdef USE_DNSTAP
4793	/*
4794	 * and send TCP-query with found address (local) and client address to dnstap process
4795	 */
4796	log_addr("query from client", &data->query->client_addr);
4797	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4798	if(verbosity >= 6 && data->query->is_proxied)
4799		log_addr("query via proxy", &data->query->remote_addr);
4800	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4801		data->query->client_addrlen, data->query->tcp, data->query->packet);
4802#endif /* USE_DNSTAP */
4803	data->query_state = server_process_query(data->nsd, data->query, &now);
4804	if (data->query_state == QUERY_DISCARDED) {
4805		/* Drop the packet and the entire connection... */
4806		STATUP(data->nsd, dropped);
4807		ZTATUP(data->nsd, data->query->zone, dropped);
4808		cleanup_tcp_handler(data);
4809		return;
4810	}
4811
4812#ifdef BIND8_STATS
4813	if (RCODE(data->query->packet) == RCODE_OK
4814	    && !AA(data->query->packet))
4815	{
4816		STATUP(data->nsd, nona);
4817		ZTATUP(data->nsd, data->query->zone, nona);
4818	}
4819#endif /* BIND8_STATS */
4820
4821#ifdef USE_ZONE_STATS
4822#ifndef INET6
4823	ZTATUP(data->nsd, data->query->zone, ctls);
4824#else
4825	if (data->query->remote_addr.ss_family == AF_INET) {
4826		ZTATUP(data->nsd, data->query->zone, ctls);
4827	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4828		ZTATUP(data->nsd, data->query->zone, ctls6);
4829	}
4830#endif
4831#endif /* USE_ZONE_STATS */
4832
4833	query_add_optional(data->query, data->nsd, &now);
4834
4835	/* Switch to the tcp write handler.  */
4836	buffer_flip(data->query->packet);
4837	data->query->tcplen = buffer_remaining(data->query->packet);
4838#ifdef BIND8_STATS
4839	/* Account the rcode & TC... */
4840	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4841	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4842	if (TC(data->query->packet)) {
4843		STATUP(data->nsd, truncated);
4844		ZTATUP(data->nsd, data->query->zone, truncated);
4845	}
4846#endif /* BIND8_STATS */
4847#ifdef USE_DNSTAP
4848	/*
4849	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
4850	 */
4851	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4852	log_addr("response to client", &data->query->client_addr);
4853	if(verbosity >= 6 && data->query->is_proxied)
4854		log_addr("response via proxy", &data->query->remote_addr);
4855	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4856		data->query->client_addrlen, data->query->tcp, data->query->packet,
4857		data->query->zone);
4858#endif /* USE_DNSTAP */
4859	data->bytes_transmitted = 0;
4860
4861	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4862
4863	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
4864	handle_tls_writing(fd, EV_WRITE, data);
4865}
4866
4867/** handle TLS writing of outgoing response */
4868static void
4869handle_tls_writing(int fd, short event, void* arg)
4870{
4871	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4872	ssize_t sent;
4873	struct query *q = data->query;
4874	/* static variable that holds reassembly buffer used to put the
4875	 * TCP length in front of the packet, like writev. */
4876	static buffer_type* global_tls_temp_buffer = NULL;
4877	buffer_type* write_buffer;
4878	uint32_t now = 0;
4879
4880	if ((event & EV_TIMEOUT)) {
4881		/* Connection timed out.  */
4882		cleanup_tcp_handler(data);
4883		return;
4884	}
4885
4886	assert((event & EV_WRITE));
4887
4888	if(data->shake_state != tls_hs_none) {
4889		if(!tls_handshake(data, fd, 1))
4890			return;
4891		if(data->shake_state != tls_hs_none)
4892			return;
4893	}
4894
4895	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4896
4897	/* If we are writing the start of a message, we must include the length
4898	 * this is done with a copy into write_buffer. */
4899	write_buffer = NULL;
4900	if (data->bytes_transmitted == 0) {
4901		if(!global_tls_temp_buffer) {
4902			/* gets deallocated when nsd shuts down from
4903			 * nsd.region */
4904			global_tls_temp_buffer = buffer_create(nsd.region,
4905				QIOBUFSZ + sizeof(q->tcplen));
4906			if (!global_tls_temp_buffer) {
4907				return;
4908			}
4909		}
4910		write_buffer = global_tls_temp_buffer;
4911		buffer_clear(write_buffer);
4912		buffer_write_u16(write_buffer, q->tcplen);
4913		buffer_write(write_buffer, buffer_current(q->packet),
4914			(int)buffer_remaining(q->packet));
4915		buffer_flip(write_buffer);
4916	} else {
4917		write_buffer = q->packet;
4918	}
4919
4920	/* Write the response */
4921	ERR_clear_error();
4922	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4923	if(sent <= 0) {
4924		int want = SSL_get_error(data->tls, sent);
4925		if(want == SSL_ERROR_ZERO_RETURN) {
4926			cleanup_tcp_handler(data);
4927			/* closed */
4928		} else if(want == SSL_ERROR_WANT_READ) {
4929			/* switch back to reading */
4930			data->shake_state = tls_hs_read_event;
4931			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4932		} else if(want != SSL_ERROR_WANT_WRITE) {
4933			cleanup_tcp_handler(data);
4934			log_crypto_err("could not SSL_write");
4935		}
4936		return;
4937	}
4938
4939	buffer_skip(write_buffer, sent);
4940	if(buffer_remaining(write_buffer) != 0) {
4941		/* If not all sent, sync up the real buffer if it wasn't used.*/
4942		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4943			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4944		}
4945	}
4946
4947	data->bytes_transmitted += sent;
4948	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4949		/*
4950		 * Still more data to write when socket becomes
4951		 * writable again.
4952		 */
4953		return;
4954	}
4955
4956	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4957
4958	if (data->query_state == QUERY_IN_AXFR ||
4959		data->query_state == QUERY_IN_IXFR) {
4960		/* Continue processing AXFR and writing back results.  */
4961		buffer_clear(q->packet);
4962		if(data->query_state == QUERY_IN_AXFR)
4963			data->query_state = query_axfr(data->nsd, q, 0);
4964		else data->query_state = query_ixfr(data->nsd, q);
4965		if (data->query_state != QUERY_PROCESSED) {
4966			query_add_optional(data->query, data->nsd, &now);
4967
4968			/* Reset data. */
4969			buffer_flip(q->packet);
4970			q->tcplen = buffer_remaining(q->packet);
4971			data->bytes_transmitted = 0;
4972			/* Reset to writing mode.  */
4973			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4974
4975			/*
4976			 * Write data if/when the socket is writable
4977			 * again.
4978			 */
4979			return;
4980		}
4981	}
4982
4983	/*
4984	 * Done sending, wait for the next request to arrive on the
4985	 * TCP socket by installing the TCP read handler.
4986	 */
4987	if ((data->nsd->tcp_query_count > 0 &&
4988		data->query_count >= data->nsd->tcp_query_count) ||
4989		data->tcp_no_more_queries) {
4990
4991		(void) shutdown(fd, SHUT_WR);
4992	}
4993
4994	data->bytes_transmitted = 0;
4995	data->query_needs_reset = 1;
4996
4997	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4998}
4999#endif
5000
5001static void
5002handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
5003	void* ATTR_UNUSED(arg))
5004{
5005	if(slowaccept) {
5006		configure_handler_event_types(EV_PERSIST | EV_READ);
5007		slowaccept = 0;
5008	}
5009}
5010
5011static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
5012{
5013#ifndef HAVE_ACCEPT4
5014	int s = accept(fd, addr, addrlen);
5015	if (s != -1) {
5016		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
5017			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
5018			close(s);
5019			s = -1;
5020			errno=EINTR; /* stop error printout as error in accept4
5021				by setting this errno, it omits printout, in
5022				later code that calls nsd_accept4 */
5023		}
5024	}
5025	return s;
5026#else
5027	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
5028#endif /* HAVE_ACCEPT4 */
5029}
5030
5031/*
5032 * Handle an incoming TCP connection.  The connection is accepted and
5033 * a new TCP reader event handler is added.  The TCP handler
5034 * is responsible for cleanup when the connection is closed.
5035 */
5036static void
5037handle_tcp_accept(int fd, short event, void* arg)
5038{
5039	struct tcp_accept_handler_data *data
5040		= (struct tcp_accept_handler_data *) arg;
5041	int s;
5042	int reject = 0;
5043	struct tcp_handler_data *tcp_data;
5044	region_type *tcp_region;
5045#ifdef INET6
5046	struct sockaddr_storage addr;
5047#else
5048	struct sockaddr_in addr;
5049#endif
5050	socklen_t addrlen;
5051	struct timeval timeout;
5052
5053	if (!(event & EV_READ)) {
5054		return;
5055	}
5056
5057	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
5058		reject = data->nsd->options->tcp_reject_overflow;
5059		if (!reject) {
5060			return;
5061		}
5062	}
5063
5064	/* Accept it... */
5065	addrlen = sizeof(addr);
5066	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
5067	if (s == -1) {
5068		/**
5069		 * EMFILE and ENFILE is a signal that the limit of open
5070		 * file descriptors has been reached. Pause accept().
5071		 * EINTR is a signal interrupt. The others are various OS ways
5072		 * of saying that the client has closed the connection.
5073		 */
5074		if (errno == EMFILE || errno == ENFILE) {
5075			if (!slowaccept) {
5076				/* disable accept events */
5077				struct timeval tv;
5078				configure_handler_event_types(0);
5079				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
5080				tv.tv_usec = 0L;
5081				memset(&slowaccept_event, 0,
5082					sizeof(slowaccept_event));
5083				event_set(&slowaccept_event, -1, EV_TIMEOUT,
5084					handle_slowaccept_timeout, NULL);
5085				(void)event_base_set(data->event.ev_base,
5086					&slowaccept_event);
5087				(void)event_add(&slowaccept_event, &tv);
5088				slowaccept = 1;
5089				/* We don't want to spam the logs here */
5090			}
5091		} else if (errno != EINTR
5092			&& errno != EWOULDBLOCK
5093#ifdef ECONNABORTED
5094			&& errno != ECONNABORTED
5095#endif /* ECONNABORTED */
5096#ifdef EPROTO
5097			&& errno != EPROTO
5098#endif /* EPROTO */
5099			) {
5100			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
5101		}
5102		return;
5103	}
5104
5105	if (reject) {
5106		shutdown(s, SHUT_RDWR);
5107		close(s);
5108		return;
5109	}
5110
5111	/*
5112	 * This region is deallocated when the TCP connection is
5113	 * closed by the TCP handler.
5114	 */
5115	tcp_region = region_create(xalloc, free);
5116	tcp_data = (struct tcp_handler_data *) region_alloc(
5117		tcp_region, sizeof(struct tcp_handler_data));
5118	tcp_data->region = tcp_region;
5119	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
5120		compression_table_size, compressed_dnames);
5121	tcp_data->nsd = data->nsd;
5122	tcp_data->query_count = 0;
5123#ifdef HAVE_SSL
5124	tcp_data->shake_state = tls_hs_none;
5125	tcp_data->tls = NULL;
5126#endif
5127	tcp_data->query_needs_reset = 1;
5128	tcp_data->pp2_enabled = data->pp2_enabled;
5129	tcp_data->pp2_header_state = pp2_header_none;
5130	tcp_data->prev = NULL;
5131	tcp_data->next = NULL;
5132
5133	tcp_data->query_state = QUERY_PROCESSED;
5134	tcp_data->bytes_transmitted = 0;
5135	memcpy(&tcp_data->query->remote_addr, &addr, addrlen);
5136	tcp_data->query->remote_addrlen = addrlen;
5137	/* Copy remote_address to client_address.
5138	 * Simplest way/time for streams to do that. */
5139	memcpy(&tcp_data->query->client_addr, &addr, addrlen);
5140	tcp_data->query->client_addrlen = addrlen;
5141	tcp_data->query->is_proxied = 0;
5142
5143	tcp_data->tcp_no_more_queries = 0;
5144	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
5145	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
5146		/* very busy, give smaller timeout */
5147		tcp_data->tcp_timeout = 200;
5148	}
5149	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5150	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
5151	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
5152
5153#ifdef USE_DNSTAP
5154	/* save the address of the connection */
5155	tcp_data->socket = data->socket;
5156#endif /* USE_DNSTAP */
5157
5158#ifdef HAVE_SSL
5159	if (data->tls_accept) {
5160		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
5161		if(!tcp_data->tls) {
5162			close(s);
5163			return;
5164		}
5165		tcp_data->shake_state = tls_hs_read;
5166		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5167		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
5168			  handle_tls_reading, tcp_data);
5169	} else {
5170#endif
5171		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5172		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
5173			  handle_tcp_reading, tcp_data);
5174#ifdef HAVE_SSL
5175	}
5176#endif
5177	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
5178		log_msg(LOG_ERR, "cannot set tcp event base");
5179		close(s);
5180		region_destroy(tcp_region);
5181		return;
5182	}
5183	if(event_add(&tcp_data->event, &timeout) != 0) {
5184		log_msg(LOG_ERR, "cannot add tcp to event base");
5185		close(s);
5186		region_destroy(tcp_region);
5187		return;
5188	}
5189	if(tcp_active_list) {
5190		tcp_active_list->prev = tcp_data;
5191		tcp_data->next = tcp_active_list;
5192	}
5193	tcp_active_list = tcp_data;
5194
5195	/*
5196	 * Keep track of the total number of TCP handlers installed so
5197	 * we can stop accepting connections when the maximum number
5198	 * of simultaneous TCP connections is reached.
5199	 *
5200	 * If tcp-reject-overflow is enabled, however, then we do not
5201	 * change the handler event type; we keep it as-is and accept
5202	 * overflow TCP connections only so that we can forcibly kill
5203	 * them off.
5204	 */
5205	++data->nsd->current_tcp_count;
5206	if (!data->nsd->options->tcp_reject_overflow &&
5207	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
5208	{
5209		configure_handler_event_types(0);
5210	}
5211}
5212
5213static void
5214send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
5215{
5216	size_t i;
5217	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
5218	for (i = 0; i < nsd->child_count; ++i) {
5219		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
5220			if (write(nsd->children[i].child_fd,
5221				&command,
5222				sizeof(command)) == -1)
5223			{
5224				if(errno != EAGAIN && errno != EINTR)
5225					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
5226					(int) command,
5227					(int) nsd->children[i].pid,
5228					strerror(errno));
5229			} else if (timeout > 0) {
5230				(void)block_read(NULL,
5231					nsd->children[i].child_fd,
5232					&command, sizeof(command), timeout);
5233			}
5234			fsync(nsd->children[i].child_fd);
5235			close(nsd->children[i].child_fd);
5236			nsd->children[i].child_fd = -1;
5237		}
5238	}
5239}
5240
5241static void
5242send_children_quit(struct nsd* nsd)
5243{
5244	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
5245	send_children_command(nsd, NSD_QUIT, 0);
5246}
5247
5248static void
5249send_children_quit_and_wait(struct nsd* nsd)
5250{
5251	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
5252	send_children_command(nsd, NSD_QUIT_CHILD, 3);
5253}
5254
5255#ifdef BIND8_STATS
5256static void
5257set_children_stats(struct nsd* nsd)
5258{
5259	size_t i;
5260	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
5261	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
5262	for (i = 0; i < nsd->child_count; ++i) {
5263		nsd->children[i].need_to_send_STATS = 1;
5264		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
5265	}
5266}
5267#endif /* BIND8_STATS */
5268
5269static void
5270configure_handler_event_types(short event_types)
5271{
5272	size_t i;
5273
5274	for (i = 0; i < tcp_accept_handler_count; ++i) {
5275		struct event* handler = &tcp_accept_handlers[i].event;
5276		if(event_types) {
5277			/* reassign */
5278			int fd = handler->ev_fd;
5279			struct event_base* base = handler->ev_base;
5280			if(tcp_accept_handlers[i].event_added)
5281				event_del(handler);
5282			memset(handler, 0, sizeof(*handler));
5283			event_set(handler, fd, event_types,
5284				handle_tcp_accept, &tcp_accept_handlers[i]);
5285			if(event_base_set(base, handler) != 0)
5286				log_msg(LOG_ERR, "conhand: cannot event_base");
5287			if(event_add(handler, NULL) != 0)
5288				log_msg(LOG_ERR, "conhand: cannot event_add");
5289			tcp_accept_handlers[i].event_added = 1;
5290		} else {
5291			/* remove */
5292			if(tcp_accept_handlers[i].event_added) {
5293				event_del(handler);
5294				tcp_accept_handlers[i].event_added = 0;
5295			}
5296		}
5297	}
5298}
5299