/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
  #include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#include "ixfr.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif
#include "verify.h"
#include "util/proxy_protocol.h"

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_DNSTAP
/*
 * log_addr() - prints the contents of a sockaddr_in/sockaddr_in6 structure,
 * just like it is done in Unbound via the same
 * log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */

#ifdef USE_TCP_FASTOPEN
  #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
  #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif

/* header state for the PROXYv2 header (for TCP) */
enum pp2_header_state {
	/* no header encountered yet */
	pp2_header_none = 0,
	/* read the static part of the header */
	pp2_header_init,
	/* read the full header */
	pp2_header_done
};

/*
 * Data for the UDP handlers.
147 */ 148struct udp_handler_data 149{ 150 struct nsd *nsd; 151 struct nsd_socket *socket; 152 struct event event; 153 /* if set, PROXYv2 is expected on this connection */ 154 int pp2_enabled; 155}; 156 157struct tcp_accept_handler_data { 158 struct nsd *nsd; 159 struct nsd_socket *socket; 160 int event_added; 161 struct event event; 162#ifdef HAVE_SSL 163 /* handler accepts TLS connections on the dedicated port */ 164 int tls_accept; 165#endif 166 /* if set, PROXYv2 is expected on this connection */ 167 int pp2_enabled; 168}; 169 170/* 171 * These globals are used to enable the TCP accept handlers 172 * when the number of TCP connection drops below the maximum 173 * number of TCP connections. 174 */ 175static size_t tcp_accept_handler_count; 176static struct tcp_accept_handler_data *tcp_accept_handlers; 177 178static struct event slowaccept_event; 179static int slowaccept; 180 181#ifdef HAVE_SSL 182static unsigned char *ocspdata = NULL; 183static long ocspdata_len = 0; 184#endif 185 186#ifdef NONBLOCKING_IS_BROKEN 187/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to 188 read multiple times from a socket when reported ready by select. */ 189# define NUM_RECV_PER_SELECT (1) 190#else /* !NONBLOCKING_IS_BROKEN */ 191# define NUM_RECV_PER_SELECT (100) 192#endif /* NONBLOCKING_IS_BROKEN */ 193 194#ifndef HAVE_MMSGHDR 195struct mmsghdr { 196 struct msghdr msg_hdr; 197 unsigned int msg_len; 198}; 199#endif 200 201static struct mmsghdr msgs[NUM_RECV_PER_SELECT]; 202static struct iovec iovecs[NUM_RECV_PER_SELECT]; 203static struct query *queries[NUM_RECV_PER_SELECT]; 204 205/* 206 * Data for the TCP connection handlers. 207 * 208 * The TCP handlers use non-blocking I/O. This is necessary to avoid 209 * blocking the entire server on a slow TCP connection, but does make 210 * reading from and writing to the socket more complicated. 211 * 212 * Basically, whenever a read/write would block (indicated by the 213 * EAGAIN errno variable) we remember the position we were reading 214 * from/writing to and return from the TCP reading/writing event 215 * handler. When the socket becomes readable/writable again we 216 * continue from the same position. 217 */ 218struct tcp_handler_data 219{ 220 /* 221 * The region used to allocate all TCP connection related 222 * data, including this structure. This region is destroyed 223 * when the connection is closed. 224 */ 225 region_type* region; 226 227 /* 228 * The global nsd structure. 229 */ 230 struct nsd* nsd; 231 232 /* 233 * The current query data for this TCP connection. 234 */ 235 query_type* query; 236 237 /* 238 * The query_state is used to remember if we are performing an 239 * AXFR, if we're done processing, or if we should discard the 240 * query and connection. 241 */ 242 query_state_type query_state; 243 244 /* 245 * The event for the file descriptor and tcp timeout 246 */ 247 struct event event; 248 249 /* 250 * The bytes_transmitted field is used to remember the number 251 * of bytes transmitted when receiving or sending a DNS 252 * packet. The count includes the two additional bytes used 253 * to specify the packet length on a TCP connection. 254 */ 255 size_t bytes_transmitted; 256 257 /* If the query is restarted and needs a reset */ 258 int query_needs_reset; 259 260 /* 261 * The number of queries handled by this specific TCP connection. 
262 */ 263 int query_count; 264 265 /* 266 * The timeout in msec for this tcp connection 267 */ 268 int tcp_timeout; 269 270 /* 271 * If the connection is allowed to have further queries on it. 272 */ 273 int tcp_no_more_queries; 274 275#ifdef USE_DNSTAP 276 /* the socket of the accept socket to find proper service (local) address the socket is bound to. */ 277 struct nsd_socket *socket; 278#endif /* USE_DNSTAP */ 279 280 /* if set, PROXYv2 is expected on this connection */ 281 int pp2_enabled; 282 283 /* header state for the PROXYv2 header (for TCP) */ 284 enum pp2_header_state pp2_header_state; 285 286#ifdef HAVE_SSL 287 /* 288 * TLS object. 289 */ 290 SSL* tls; 291 292 /* 293 * TLS handshake state. 294 */ 295 enum { tls_hs_none, tls_hs_read, tls_hs_write, 296 tls_hs_read_event, tls_hs_write_event } shake_state; 297#endif 298 /* list of connections, for service of remaining tcp channels */ 299 struct tcp_handler_data *prev, *next; 300}; 301/* global that is the list of active tcp channels */ 302static struct tcp_handler_data *tcp_active_list = NULL; 303 304/* 305 * Handle incoming queries on the UDP server sockets. 306 */ 307static void handle_udp(int fd, short event, void* arg); 308 309/* 310 * Handle incoming connections on the TCP sockets. These handlers 311 * usually wait for the NETIO_EVENT_READ event (indicating an incoming 312 * connection) but are disabled when the number of current TCP 313 * connections is equal to the maximum number of TCP connections. 314 * Disabling is done by changing the handler to wait for the 315 * NETIO_EVENT_NONE type. This is done using the function 316 * configure_tcp_accept_handlers. 317 */ 318static void handle_tcp_accept(int fd, short event, void* arg); 319 320/* 321 * Handle incoming queries on a TCP connection. The TCP connections 322 * are configured to be non-blocking and the handler may be called 323 * multiple times before a complete query is received. 324 */ 325static void handle_tcp_reading(int fd, short event, void* arg); 326 327/* 328 * Handle outgoing responses on a TCP connection. The TCP connections 329 * are configured to be non-blocking and the handler may be called 330 * multiple times before a complete response is sent. 331 */ 332static void handle_tcp_writing(int fd, short event, void* arg); 333 334#ifdef HAVE_SSL 335/* Create SSL object and associate fd */ 336static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd); 337/* 338 * Handle TLS handshake. May be called multiple times if incomplete. 339 */ 340static int tls_handshake(struct tcp_handler_data* data, int fd, int writing); 341 342/* 343 * Handle incoming queries on a TLS over TCP connection. The TLS 344 * connections are configured to be non-blocking and the handler may 345 * be called multiple times before a complete query is received. 346 */ 347static void handle_tls_reading(int fd, short event, void* arg); 348 349/* 350 * Handle outgoing responses on a TLS over TCP connection. The TLS 351 * connections are configured to be non-blocking and the handler may 352 * be called multiple times before a complete response is sent. 353 */ 354static void handle_tls_writing(int fd, short event, void* arg); 355#endif 356 357/* 358 * Send all children the quit nonblocking, then close pipe. 
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config() {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
		close(tcp_fastopen_fp);
		return;
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids. Returns -1 if
 * the pid is not in the list, child_num otherwise. The pid field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes...
*/ 443 for (i = 0; i < nsd->child_count; ++i) { 444 if (nsd->children[i].pid <= 0) { 445 if (nsd->children[i].child_fd != -1) 446 close(nsd->children[i].child_fd); 447 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) { 448 log_msg(LOG_ERR, "socketpair: %s", 449 strerror(errno)); 450 return -1; 451 } 452 nsd->children[i].child_fd = sv[0]; 453 nsd->children[i].parent_fd = sv[1]; 454 nsd->children[i].pid = fork(); 455 switch (nsd->children[i].pid) { 456 default: /* SERVER MAIN */ 457 close(nsd->children[i].parent_fd); 458 nsd->children[i].parent_fd = -1; 459 if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) { 460 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 461 } 462 if(!nsd->children[i].handler) 463 { 464 ipc_data = (struct main_ipc_handler_data*) region_alloc( 465 region, sizeof(struct main_ipc_handler_data)); 466 ipc_data->nsd = nsd; 467 ipc_data->child = &nsd->children[i]; 468 ipc_data->child_num = i; 469 ipc_data->xfrd_sock = xfrd_sock_p; 470 ipc_data->packet = buffer_create(region, QIOBUFSZ); 471 ipc_data->forward_mode = 0; 472 ipc_data->got_bytes = 0; 473 ipc_data->total_bytes = 0; 474 ipc_data->acl_num = 0; 475 nsd->children[i].handler = (struct netio_handler*) region_alloc( 476 region, sizeof(struct netio_handler)); 477 nsd->children[i].handler->fd = nsd->children[i].child_fd; 478 nsd->children[i].handler->timeout = NULL; 479 nsd->children[i].handler->user_data = ipc_data; 480 nsd->children[i].handler->event_types = NETIO_EVENT_READ; 481 nsd->children[i].handler->event_handler = parent_handle_child_command; 482 netio_add_handler(netio, nsd->children[i].handler); 483 } 484 /* clear any ongoing ipc */ 485 ipc_data = (struct main_ipc_handler_data*) 486 nsd->children[i].handler->user_data; 487 ipc_data->forward_mode = 0; 488 /* restart - update fd */ 489 nsd->children[i].handler->fd = nsd->children[i].child_fd; 490 break; 491 case 0: /* CHILD */ 492#ifdef MEMCLEAN /* OS collects memory pages */ 493 region_destroy(region); 494#endif 495 nsd->pid = 0; 496 nsd->child_count = 0; 497 nsd->server_kind = nsd->children[i].kind; 498 nsd->this_child = &nsd->children[i]; 499 nsd->this_child->child_num = i; 500 /* remove signal flags inherited from parent 501 the parent will handle them. 
			 */
			nsd->signal_hint_reload_hup = 0;
			nsd->signal_hint_reload = 0;
			nsd->signal_hint_child = 0;
			nsd->signal_hint_quit = 0;
			nsd->signal_hint_shutdown = 0;
			nsd->signal_hint_stats = 0;
			nsd->signal_hint_statsusr = 0;
			close(*xfrd_sock_p);
			close(nsd->this_child->child_fd);
			nsd->this_child->child_fd = -1;
			if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
				log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
			}
			server_child(nsd);
			/* NOTREACH */
			exit(0);
		case -1:
			log_msg(LOG_ERR, "fork failed: %s",
				strerror(errno));
			return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st_period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st_period - (time(NULL) % nsd->st_period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
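	/* Two copies of the zonestat array are kept mmapped; during a reload
	 * the new set of children is switched to the other copy (see
	 * server_zonestat_switch below), so the old and new children never
	 * write to the same statistics array. */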
nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, 614 MAP_SHARED, nsd->zonestatfd[1], 0); 615 if(nsd->zonestat[1] == MAP_FAILED) { 616 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 617 unlink(nsd->zonestatfname[0]); 618 unlink(nsd->zonestatfname[1]); 619 exit(1); 620 } 621 memset(nsd->zonestat[0], 0, sz); 622 memset(nsd->zonestat[1], 0, sz); 623 nsd->zonestatsize[0] = num; 624 nsd->zonestatsize[1] = num; 625 nsd->zonestatdesired = num; 626 nsd->zonestatsizenow = num; 627 nsd->zonestatnow = nsd->zonestat[0]; 628#endif /* HAVE_MMAP */ 629} 630 631void 632zonestat_remap(struct nsd* nsd, int idx, size_t sz) 633{ 634#ifdef HAVE_MMAP 635#ifdef MREMAP_MAYMOVE 636 nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx], 637 sizeof(struct nsdst)*nsd->zonestatsize[idx], sz, 638 MREMAP_MAYMOVE); 639 if(nsd->zonestat[idx] == MAP_FAILED) { 640 log_msg(LOG_ERR, "mremap failed: %s", strerror(errno)); 641 exit(1); 642 } 643#else /* !HAVE MREMAP */ 644 if(msync(nsd->zonestat[idx], 645 sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0) 646 log_msg(LOG_ERR, "msync failed: %s", strerror(errno)); 647 if(munmap(nsd->zonestat[idx], 648 sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0) 649 log_msg(LOG_ERR, "munmap failed: %s", strerror(errno)); 650 nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz, 651 PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0); 652 if(nsd->zonestat[idx] == MAP_FAILED) { 653 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 654 exit(1); 655 } 656#endif /* MREMAP */ 657#endif /* HAVE_MMAP */ 658} 659 660/* realloc the zonestat array for the one that is not currently in use, 661 * to match the desired new size of the array (if applicable) */ 662void 663server_zonestat_realloc(struct nsd* nsd) 664{ 665#ifdef HAVE_MMAP 666 uint8_t z = 0; 667 size_t sz; 668 int idx = 0; /* index of the zonestat array that is not in use */ 669 if(nsd->zonestatnow == nsd->zonestat[0]) 670 idx = 1; 671 if(nsd->zonestatsize[idx] == nsd->zonestatdesired) 672 return; 673 sz = sizeof(struct nsdst)*nsd->zonestatdesired; 674 if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) { 675 log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx], 676 strerror(errno)); 677 exit(1); 678 } 679 if(write(nsd->zonestatfd[idx], &z, 1) == -1) { 680 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 681 nsd->zonestatfname[idx], strerror(errno)); 682 exit(1); 683 } 684 zonestat_remap(nsd, idx, sz); 685 /* zero the newly allocated region */ 686 if(nsd->zonestatdesired > nsd->zonestatsize[idx]) { 687 memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) * 688 nsd->zonestatsize[idx], 0, sizeof(struct nsdst) * 689 (nsd->zonestatdesired - nsd->zonestatsize[idx])); 690 } 691 nsd->zonestatsize[idx] = nsd->zonestatdesired; 692#endif /* HAVE_MMAP */ 693} 694 695/* switchover to use the other array for the new children, that 696 * briefly coexist with the old children. And we want to avoid them 697 * both writing to the same statistics arrays. 
*/ 698void 699server_zonestat_switch(struct nsd* nsd) 700{ 701 if(nsd->zonestatnow == nsd->zonestat[0]) { 702 nsd->zonestatnow = nsd->zonestat[1]; 703 nsd->zonestatsizenow = nsd->zonestatsize[1]; 704 } else { 705 nsd->zonestatnow = nsd->zonestat[0]; 706 nsd->zonestatsizenow = nsd->zonestatsize[0]; 707 } 708} 709#endif /* USE_ZONE_STATS */ 710 711#ifdef BIND8_STATS 712void 713server_stat_alloc(struct nsd* nsd) 714{ 715 char tmpfile[256]; 716 size_t sz = sizeof(struct nsdst) * nsd->child_count * 2; 717 uint8_t z = 0; 718 719 /* file name */ 720 nsd->statfname = 0; 721 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat", 722 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 723 nsd->statfname = region_strdup(nsd->region, tmpfile); 724 725 /* file descriptor */ 726 nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600); 727 if(nsd->statfd == -1) { 728 log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname, 729 strerror(errno)); 730 unlink(nsd->zonestatfname[0]); 731 unlink(nsd->zonestatfname[1]); 732 exit(1); 733 } 734 735#ifdef HAVE_MMAP 736 if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) { 737 log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname, 738 strerror(errno)); 739 goto fail_exit; 740 } 741 if(write(nsd->statfd, &z, 1) == -1) { 742 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 743 nsd->statfname, strerror(errno)); 744 goto fail_exit; 745 } 746 nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, 747 MAP_SHARED, nsd->statfd, 0); 748 if(nsd->stat_map == MAP_FAILED) { 749 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 750fail_exit: 751 close(nsd->statfd); 752 unlink(nsd->statfname); 753 unlink(nsd->zonestatfname[0]); 754 unlink(nsd->zonestatfname[1]); 755 exit(1); 756 } 757 memset(nsd->stat_map, 0, sz); 758 nsd->stats_per_child[0] = nsd->stat_map; 759 nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count]; 760 nsd->stat_current = 0; 761 nsd->st = &nsd->stats_per_child[nsd->stat_current][0]; 762#endif /* HAVE_MMAP */ 763} 764#endif /* BIND8_STATS */ 765 766#ifdef BIND8_STATS 767void 768server_stat_free(struct nsd* nsd) 769{ 770 unlink(nsd->statfname); 771} 772#endif /* BIND8_STATS */ 773 774static void 775cleanup_dname_compression_tables(void *ptr) 776{ 777 free(ptr); 778 compressed_dname_offsets = NULL; 779 compression_table_capacity = 0; 780} 781 782static void 783initialize_dname_compression_tables(struct nsd *nsd) 784{ 785 size_t needed = domain_table_count(nsd->db->domains) + 1; 786 needed += EXTRA_DOMAIN_NUMBERS; 787 if(compression_table_capacity < needed) { 788 if(compressed_dname_offsets) { 789 region_remove_cleanup(nsd->db->region, 790 cleanup_dname_compression_tables, 791 compressed_dname_offsets); 792 free(compressed_dname_offsets); 793 } 794 compressed_dname_offsets = (uint16_t *) xmallocarray( 795 needed, sizeof(uint16_t)); 796 region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables, 797 compressed_dname_offsets); 798 compression_table_capacity = needed; 799 compression_table_size=domain_table_count(nsd->db->domains)+1; 800 } 801 memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t)); 802 compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */ 803} 804 805static int 806set_cloexec(struct nsd_socket *sock) 807{ 808 assert(sock != NULL); 809 810 if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) { 811 const char *socktype = 812 sock->addr.ai_family == SOCK_DGRAM ? 
"udp" : "tcp"; 813 log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s", 814 socktype, strerror(errno)); 815 return -1; 816 } 817 818 return 1; 819} 820 821static int 822set_reuseport(struct nsd_socket *sock) 823{ 824#ifdef SO_REUSEPORT 825 int on = 1; 826#ifdef SO_REUSEPORT_LB 827 /* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like 828 * SO_REUSEPORT on Linux. This is what the users want with the config 829 * option in nsd.conf; if we actually need local address and port reuse 830 * they'll also need to have SO_REUSEPORT set for them, assume it was 831 * _LB they want. 832 */ 833 int opt = SO_REUSEPORT_LB; 834 static const char optname[] = "SO_REUSEPORT_LB"; 835#else /* !SO_REUSEPORT_LB */ 836 int opt = SO_REUSEPORT; 837 static const char optname[] = "SO_REUSEPORT"; 838#endif /* SO_REUSEPORT_LB */ 839 840 if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) { 841 return 1; 842 } else if(verbosity >= 3 || errno != ENOPROTOOPT) { 843 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 844 optname, strerror(errno)); 845 } 846 return -1; 847#else 848 (void)sock; 849#endif /* SO_REUSEPORT */ 850 851 return 0; 852} 853 854static int 855set_reuseaddr(struct nsd_socket *sock) 856{ 857#ifdef SO_REUSEADDR 858 int on = 1; 859 if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) { 860 return 1; 861 } 862 log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", 863 strerror(errno)); 864 return -1; 865#endif /* SO_REUSEADDR */ 866 return 0; 867} 868 869static int 870set_rcvbuf(struct nsd_socket *sock, int rcv) 871{ 872#ifdef SO_RCVBUF 873#ifdef SO_RCVBUFFORCE 874 if(0 == setsockopt( 875 sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv))) 876 { 877 return 1; 878 } 879 if(errno == EPERM || errno == ENOBUFS) { 880 return 0; 881 } 882 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s", 883 strerror(errno)); 884 return -1; 885#else /* !SO_RCVBUFFORCE */ 886 if (0 == setsockopt( 887 sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv))) 888 { 889 return 1; 890 } 891 if(errno == ENOSYS || errno == ENOBUFS) { 892 return 0; 893 } 894 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s", 895 strerror(errno)); 896 return -1; 897#endif /* SO_RCVBUFFORCE */ 898#endif /* SO_RCVBUF */ 899 900 return 0; 901} 902 903static int 904set_sndbuf(struct nsd_socket *sock, int snd) 905{ 906#ifdef SO_SNDBUF 907#ifdef SO_SNDBUFFORCE 908 if(0 == setsockopt( 909 sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd))) 910 { 911 return 1; 912 } 913 if(errno == EPERM || errno == ENOBUFS) { 914 return 0; 915 } 916 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s", 917 strerror(errno)); 918 return -1; 919#else /* !SO_SNDBUFFORCE */ 920 if(0 == setsockopt( 921 sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd))) 922 { 923 return 1; 924 } 925 if(errno == ENOSYS || errno == ENOBUFS) { 926 return 0; 927 } 928 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s", 929 strerror(errno)); 930 return -1; 931#endif /* SO_SNDBUFFORCE */ 932#endif /* SO_SNDBUF */ 933 934 return 0; 935} 936 937static int 938set_nonblock(struct nsd_socket *sock) 939{ 940 const char *socktype = 941 sock->addr.ai_socktype == SOCK_DGRAM ? 
"udp" : "tcp"; 942 943 if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) { 944 log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s", 945 socktype, strerror(errno)); 946 return -1; 947 } 948 949 return 1; 950} 951 952#ifdef INET6 953static int 954set_ipv6_v6only(struct nsd_socket *sock) 955{ 956#ifdef IPV6_V6ONLY 957 int on = 1; 958 const char *socktype = 959 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 960 961 if(0 == setsockopt( 962 sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on))) 963 { 964 return 1; 965 } 966 967 log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s", 968 socktype, strerror(errno)); 969 return -1; 970#else 971 (void)sock; 972#endif /* IPV6_V6ONLY */ 973 974 return 0; 975} 976#endif /* INET6 */ 977 978#ifdef INET6 979static int 980set_ipv6_use_min_mtu(struct nsd_socket *sock) 981{ 982#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) 983#if defined(IPV6_USE_MIN_MTU) 984 /* There is no fragmentation of IPv6 datagrams during forwarding in the 985 * network. Therefore we do not send UDP datagrams larger than the 986 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be 987 * larger if the network stack supports IPV6_USE_MIN_MTU. 988 */ 989 int opt = IPV6_USE_MIN_MTU; 990 int optval = 1; 991 static const char optname[] = "IPV6_USE_MIN_MTU"; 992#elif defined(IPV6_MTU) 993 /* On Linux, PMTUD is disabled by default for datagrams so set the MTU 994 * to the MIN MTU to get the same. 995 */ 996 int opt = IPV6_MTU; 997 int optval = IPV6_MIN_MTU; 998 static const char optname[] = "IPV6_MTU"; 999#endif 1000 if(0 == setsockopt( 1001 sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval))) 1002 { 1003 return 1; 1004 } 1005 1006 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 1007 optname, strerror(errno)); 1008 return -1; 1009#else 1010 (void)sock; 1011#endif /* INET6 */ 1012 1013 return 0; 1014} 1015#endif /* INET6 */ 1016 1017static int 1018set_ipv4_no_pmtu_disc(struct nsd_socket *sock) 1019{ 1020 int ret = 0; 1021 1022#if defined(IP_MTU_DISCOVER) 1023 int opt = IP_MTU_DISCOVER; 1024 int optval; 1025# if defined(IP_PMTUDISC_OMIT) 1026 /* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU 1027 * information and send packets with DF=0. Fragmentation is allowed if 1028 * and only if the packet size exceeds the outgoing interface MTU or 1029 * the packet encounters smaller MTU link in network. This mitigates 1030 * DNS fragmentation attacks by preventing forged PMTU information. 1031 * FreeBSD already has same semantics without setting the option. 1032 */ 1033 optval = IP_PMTUDISC_OMIT; 1034 if(0 == setsockopt( 1035 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 1036 { 1037 return 1; 1038 } 1039 1040 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 1041 "IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno)); 1042# endif /* IP_PMTUDISC_OMIT */ 1043# if defined(IP_PMTUDISC_DONT) 1044 /* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */ 1045 optval = IP_PMTUDISC_DONT; 1046 if(0 == setsockopt( 1047 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 1048 { 1049 return 1; 1050 } 1051 1052 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 1053 "IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno)); 1054# endif 1055 ret = -1; 1056#elif defined(IP_DONTFRAG) 1057 int off = 0; 1058 if (0 == setsockopt( 1059 sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off))) 1060 { 1061 return 1; 1062 } 1063 1064 log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) 
failed: %s", 1065 strerror(errno)); 1066 ret = -1; 1067#else 1068 (void)sock; 1069#endif 1070 1071 return ret; 1072} 1073 1074static int 1075set_ip_freebind(struct nsd_socket *sock) 1076{ 1077#ifdef IP_FREEBIND 1078 int on = 1; 1079 const char *socktype = 1080 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1081 if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0) 1082 { 1083 return 1; 1084 } 1085 log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s", 1086 socktype, strerror(errno)); 1087 return -1; 1088#else 1089 (void)sock; 1090#endif /* IP_FREEBIND */ 1091 1092 return 0; 1093} 1094 1095static int 1096set_ip_transparent(struct nsd_socket *sock) 1097{ 1098 /* 1099 The scandalous preprocessor blob here calls for some explanation :) 1100 POSIX does not specify an option to bind non-local IPs, so 1101 platforms developed several implementation-specific options, 1102 all set in the same way, but with different names. 1103 For additional complexity, some platform manage this setting 1104 differently for different address families (IPv4 vs IPv6). 1105 This scandalous preprocessor blob below abstracts such variability 1106 in the way which leaves the C code as lean and clear as possible. 1107 */ 1108 1109#if defined(IP_TRANSPARENT) 1110# define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT 1111# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1112# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT" 1113// as of 2020-01, Linux does not support this on IPv6 programmatically 1114#elif defined(SO_BINDANY) 1115# define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY 1116# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET 1117# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY" 1118#elif defined(IP_BINDANY) 1119# define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY 1120# define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY 1121# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1122# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6 1123# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY" 1124#endif 1125 1126#ifndef NSD_SOCKET_OPTION_TRANSPARENT 1127 (void)sock; 1128#else 1129# ifndef NSD_SOCKET_OPTION_TRANSPARENT6 1130# define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT 1131# endif 1132# ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 1133# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL 1134# endif 1135# ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6 1136# define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME 1137# endif 1138 1139 int on = 1; 1140 const char *socktype = 1141 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1142 const int is_ip6 = (sock->addr.ai_family == AF_INET6); 1143 1144 if(0 == setsockopt( 1145 sock->s, 1146 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL, 1147 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT, 1148 &on, sizeof(on))) 1149 { 1150 return 1; 1151 } 1152 1153 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s", 1154 is_ip6 ? 
NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno)); 1155 return -1; 1156#endif 1157 1158 return 0; 1159} 1160 1161static int 1162set_tcp_maxseg(struct nsd_socket *sock, int mss) 1163{ 1164#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG) 1165 if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) { 1166 return 1; 1167 } 1168 log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s", 1169 strerror(errno)); 1170 return -1; 1171#else 1172 log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported"); 1173#endif 1174 return 0; 1175} 1176 1177#ifdef USE_TCP_FASTOPEN 1178static int 1179set_tcp_fastopen(struct nsd_socket *sock) 1180{ 1181 /* qlen specifies how many outstanding TFO requests to allow. Limit is 1182 * a defense against IP spoofing attacks as suggested in RFC7413. 1183 */ 1184 int qlen; 1185 1186#ifdef __APPLE__ 1187 /* macOS X implementation only supports qlen of 1 via this call. The 1188 * actual value is configured by the net.inet.tcp.fastopen_backlog 1189 * kernel parameter. 1190 */ 1191 qlen = 1; 1192#else 1193 /* 5 is recommended on Linux. */ 1194 qlen = 5; 1195#endif 1196 if (0 == setsockopt( 1197 sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen))) 1198 { 1199 return 1; 1200 } 1201 1202 if (errno == EPERM) { 1203 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s " 1204 "; this could likely be because sysctl " 1205 "net.inet.tcp.fastopen.enabled, " 1206 "net.inet.tcp.fastopen.server_enable, or " 1207 "net.ipv4.tcp_fastopen is disabled", 1208 strerror(errno)); 1209 /* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support 1210 * disabled, except when verbosity enabled for debugging 1211 */ 1212 } else if(errno != ENOPROTOOPT || verbosity >= 3) { 1213 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s", 1214 strerror(errno)); 1215 } 1216 1217 return (errno == ENOPROTOOPT ? 0 : -1); 1218} 1219#endif /* USE_TCP_FASTOPEN */ 1220 1221static int 1222set_bindtodevice(struct nsd_socket *sock) 1223{ 1224#if defined(SO_BINDTODEVICE) 1225 if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE, 1226 sock->device, strlen(sock->device)) == -1) 1227 { 1228 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 1229 "SO_BINDTODEVICE", sock->device, strerror(errno)); 1230 return -1; 1231 } 1232 1233 return 1; 1234#else 1235 (void)sock; 1236 return 0; 1237#endif 1238} 1239 1240static int 1241set_setfib(struct nsd_socket *sock) 1242{ 1243#if defined(SO_SETFIB) 1244 if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB, 1245 (const void *)&sock->fib, sizeof(sock->fib)) == -1) 1246 { 1247 log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) 
failed: %s", 1248 "SO_SETFIB", sock->fib, strerror(errno)); 1249 return -1; 1250 } 1251 1252 return 1; 1253#else 1254 (void)sock; 1255 return 0; 1256#endif 1257} 1258 1259static int 1260open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1261{ 1262 int rcv = 1*1024*1024, snd = 1*1024*1024; 1263 1264 if(-1 == (sock->s = socket( 1265 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1266 { 1267#ifdef INET6 1268 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1269 (sock->addr.ai_family == AF_INET6) && 1270 (errno == EAFNOSUPPORT)) 1271 { 1272 log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: " 1273 "not supported"); 1274 return 0; 1275 } 1276#endif 1277 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1278 return -1; 1279 } 1280 1281 set_cloexec(sock); 1282 1283 if(nsd->reuseport && reuseport_works && *reuseport_works) 1284 *reuseport_works = (set_reuseport(sock) == 1); 1285 1286 if(nsd->options->receive_buffer_size > 0) 1287 rcv = nsd->options->receive_buffer_size; 1288 if(set_rcvbuf(sock, rcv) == -1) 1289 return -1; 1290 1291 if(nsd->options->send_buffer_size > 0) 1292 snd = nsd->options->send_buffer_size; 1293 if(set_sndbuf(sock, snd) == -1) 1294 return -1; 1295#ifdef INET6 1296 if(sock->addr.ai_family == AF_INET6) { 1297 if(set_ipv6_v6only(sock) == -1 || 1298 set_ipv6_use_min_mtu(sock) == -1) 1299 return -1; 1300 } else 1301#endif /* INET6 */ 1302 if(sock->addr.ai_family == AF_INET) { 1303 if(set_ipv4_no_pmtu_disc(sock) == -1) 1304 return -1; 1305 } 1306 1307 /* Set socket to non-blocking. Otherwise, on operating systems 1308 * with thundering herd problems, the UDP recv could block 1309 * after select returns readable. 1310 */ 1311 set_nonblock(sock); 1312 1313 if(nsd->options->ip_freebind) 1314 (void)set_ip_freebind(sock); 1315 if(nsd->options->ip_transparent) 1316 (void)set_ip_transparent(sock); 1317 if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1) 1318 return -1; 1319 if(sock->fib != -1 && set_setfib(sock) == -1) 1320 return -1; 1321 1322 if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) { 1323 char buf[256]; 1324 addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf)); 1325 log_msg(LOG_ERR, "can't bind udp socket %s: %s", 1326 buf, strerror(errno)); 1327 return -1; 1328 } 1329 1330 return 1; 1331} 1332 1333static int 1334open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1335{ 1336#ifdef USE_TCP_FASTOPEN 1337 report_tcp_fastopen_config(); 1338#endif 1339 1340 (void)reuseport_works; 1341 1342 if(-1 == (sock->s = socket( 1343 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1344 { 1345#ifdef INET6 1346 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1347 (sock->addr.ai_family == AF_INET6) && 1348 (errno == EAFNOSUPPORT)) 1349 { 1350 log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: " 1351 "not supported"); 1352 return 0; 1353 } 1354#endif /* INET6 */ 1355 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1356 return -1; 1357 } 1358 1359 set_cloexec(sock); 1360 1361 if(nsd->reuseport && reuseport_works && *reuseport_works) 1362 *reuseport_works = (set_reuseport(sock) == 1); 1363 1364 (void)set_reuseaddr(sock); 1365 1366#ifdef INET6 1367 if(sock->addr.ai_family == AF_INET6) { 1368 if (set_ipv6_v6only(sock) == -1 || 1369 set_ipv6_use_min_mtu(sock) == -1) 1370 return -1; 1371 } 1372#endif 1373 1374 if(nsd->tcp_mss > 0) 1375 set_tcp_maxseg(sock, nsd->tcp_mss); 1376 /* (StevensUNP p463), if TCP listening socket is blocking, then 1377 it may block in 
accept, even if select() says readable. */ 1378 (void)set_nonblock(sock); 1379 if(nsd->options->ip_freebind) 1380 (void)set_ip_freebind(sock); 1381 if(nsd->options->ip_transparent) 1382 (void)set_ip_transparent(sock); 1383 if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1) 1384 return -1; 1385 if(sock->fib != -1 && set_setfib(sock) == -1) 1386 return -1; 1387 1388 if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) { 1389 char buf[256]; 1390 addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf)); 1391 log_msg(LOG_ERR, "can't bind tcp socket %s: %s", 1392 buf, strerror(errno)); 1393 return -1; 1394 } 1395 1396#ifdef USE_TCP_FASTOPEN 1397 (void)set_tcp_fastopen(sock); 1398#endif 1399 1400 if(listen(sock->s, TCP_BACKLOG) == -1) { 1401 log_msg(LOG_ERR, "can't listen: %s", strerror(errno)); 1402 return -1; 1403 } 1404 1405 return 1; 1406} 1407 1408/* 1409 * Initialize the server, reuseport, create and bind the sockets. 1410 */ 1411int 1412server_init(struct nsd *nsd) 1413{ 1414 size_t i; 1415 int reuseport = 1; /* Determine if REUSEPORT works. */ 1416 1417 /* open server interface ports */ 1418 for(i = 0; i < nsd->ifs; i++) { 1419 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 || 1420 open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1) 1421 { 1422 return -1; 1423 } 1424 } 1425 1426 if(nsd->reuseport && reuseport) { 1427 size_t ifs = nsd->ifs * nsd->reuseport; 1428 1429 /* increase the size of the interface arrays, there are going 1430 * to be separate interface file descriptors for every server 1431 * instance */ 1432 region_remove_cleanup(nsd->region, free, nsd->udp); 1433 region_remove_cleanup(nsd->region, free, nsd->tcp); 1434 1435 nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp)); 1436 nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp)); 1437 region_add_cleanup(nsd->region, free, nsd->udp); 1438 region_add_cleanup(nsd->region, free, nsd->tcp); 1439 if(ifs > nsd->ifs) { 1440 memset(&nsd->udp[nsd->ifs], 0, 1441 (ifs-nsd->ifs)*sizeof(*nsd->udp)); 1442 memset(&nsd->tcp[nsd->ifs], 0, 1443 (ifs-nsd->ifs)*sizeof(*nsd->tcp)); 1444 } 1445 1446 for(i = nsd->ifs; i < ifs; i++) { 1447 nsd->udp[i] = nsd->udp[i%nsd->ifs]; 1448 nsd->udp[i].s = -1; 1449 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) { 1450 return -1; 1451 } 1452 /* Turn off REUSEPORT for TCP by copying the socket 1453 * file descriptor. 1454 * This means we should not close TCP used by 1455 * other servers in reuseport enabled mode, in 1456 * server_child(). 1457 */ 1458 nsd->tcp[i] = nsd->tcp[i%nsd->ifs]; 1459 } 1460 1461 nsd->ifs = ifs; 1462 } else { 1463 nsd->reuseport = 0; 1464 } 1465 1466 /* open server interface ports for verifiers */ 1467 for(i = 0; i < nsd->verify_ifs; i++) { 1468 if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 || 1469 open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1) 1470 { 1471 return -1; 1472 } 1473 } 1474 1475 return 0; 1476} 1477 1478/* 1479 * Prepare the server for take off. 
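 * (Seeds the rate-limit hash, opens the database and checks the zone files,
 * sets the zone stat ids, builds the dname compression table and starts the
 * statistics timer.)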
1480 * 1481 */ 1482int 1483server_prepare(struct nsd *nsd) 1484{ 1485#ifdef RATELIMIT 1486 /* set secret modifier for hashing (rate limits) */ 1487#ifdef HAVE_GETRANDOM 1488 uint32_t v; 1489 if(getrandom(&v, sizeof(v), 0) == -1) { 1490 log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno)); 1491 exit(1); 1492 } 1493 hash_set_raninit(v); 1494#elif defined(HAVE_ARC4RANDOM) 1495 hash_set_raninit(arc4random()); 1496#else 1497 uint32_t v = getpid() ^ time(NULL); 1498 srandom((unsigned long)v); 1499# ifdef HAVE_SSL 1500 if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0) 1501 hash_set_raninit(v); 1502 else 1503# endif 1504 hash_set_raninit(random()); 1505#endif 1506 rrl_mmap_init(nsd->child_count, nsd->options->rrl_size, 1507 nsd->options->rrl_ratelimit, 1508 nsd->options->rrl_whitelist_ratelimit, 1509 nsd->options->rrl_slip, 1510 nsd->options->rrl_ipv4_prefix_length, 1511 nsd->options->rrl_ipv6_prefix_length); 1512#endif /* RATELIMIT */ 1513 1514 /* Open the database... */ 1515 if ((nsd->db = namedb_open(nsd->options)) == NULL) { 1516 log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno)); 1517 unlink(nsd->task[0]->fname); 1518 unlink(nsd->task[1]->fname); 1519#ifdef USE_ZONE_STATS 1520 unlink(nsd->zonestatfname[0]); 1521 unlink(nsd->zonestatfname[1]); 1522#endif 1523#ifdef BIND8_STATS 1524 server_stat_free(nsd); 1525#endif 1526 xfrd_del_tempdir(nsd); 1527 return -1; 1528 } 1529 /* check if zone files can be read */ 1530 /* NULL for taskudb because we send soainfo in a moment, batched up, 1531 * for all zones */ 1532 namedb_check_zonefiles(nsd, nsd->options, NULL, NULL); 1533 zonestatid_tree_set(nsd); 1534 1535 compression_table_capacity = 0; 1536 initialize_dname_compression_tables(nsd); 1537 1538#ifdef BIND8_STATS 1539 /* Initialize times... */ 1540 time(&nsd->st->boot); 1541 set_bind8_alarm(nsd); 1542#endif /* BIND8_STATS */ 1543 1544 return 0; 1545} 1546 1547/* 1548 * Fork the required number of servers. 1549 */ 1550static int 1551server_start_children(struct nsd *nsd, region_type* region, netio_type* netio, 1552 int* xfrd_sock_p) 1553{ 1554 size_t i; 1555 1556 /* Start all child servers initially. */ 1557 for (i = 0; i < nsd->child_count; ++i) { 1558 nsd->children[i].pid = 0; 1559 } 1560 1561 return restart_child_servers(nsd, region, netio, xfrd_sock_p); 1562} 1563 1564static void 1565server_close_socket(struct nsd_socket *sock) 1566{ 1567 if(sock->s != -1) { 1568 close(sock->s); 1569 sock->s = -1; 1570 } 1571} 1572 1573void 1574server_close_all_sockets(struct nsd_socket sockets[], size_t n) 1575{ 1576 size_t i; 1577 1578 /* Close all the sockets... */ 1579 for (i = 0; i < n; ++i) { 1580 server_close_socket(&sockets[i]); 1581 } 1582} 1583 1584/* 1585 * Close the sockets, shutdown the server and exit. 1586 * Does not return. 
1587 */ 1588void 1589server_shutdown(struct nsd *nsd) 1590{ 1591 size_t i; 1592 1593 server_close_all_sockets(nsd->udp, nsd->ifs); 1594 server_close_all_sockets(nsd->tcp, nsd->ifs); 1595 /* CHILD: close command channel to parent */ 1596 if(nsd->this_child && nsd->this_child->parent_fd != -1) 1597 { 1598 close(nsd->this_child->parent_fd); 1599 nsd->this_child->parent_fd = -1; 1600 } 1601 /* SERVER: close command channels to children */ 1602 if(!nsd->this_child) 1603 { 1604 for(i=0; i < nsd->child_count; ++i) 1605 if(nsd->children[i].child_fd != -1) 1606 { 1607 close(nsd->children[i].child_fd); 1608 nsd->children[i].child_fd = -1; 1609 } 1610 } 1611 1612 tsig_finalize(); 1613 daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */ 1614#ifdef HAVE_SSL 1615 if (nsd->tls_ctx) 1616 SSL_CTX_free(nsd->tls_ctx); 1617#endif 1618 1619#ifdef MEMCLEAN /* OS collects memory pages */ 1620#ifdef RATELIMIT 1621 rrl_mmap_deinit_keep_mmap(); 1622#endif 1623#ifdef USE_DNSTAP 1624 dt_collector_destroy(nsd->dt_collector, nsd); 1625#endif 1626 udb_base_free_keep_mmap(nsd->task[0]); 1627 udb_base_free_keep_mmap(nsd->task[1]); 1628 namedb_free_ixfr(nsd->db); 1629 namedb_close(nsd->db); 1630 nsd_options_destroy(nsd->options); 1631 region_destroy(nsd->region); 1632#endif 1633 log_finalize(); 1634 exit(0); 1635} 1636 1637void 1638server_prepare_xfrd(struct nsd* nsd) 1639{ 1640 char tmpfile[256]; 1641 /* create task mmaps */ 1642 nsd->mytask = 0; 1643 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0", 1644 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 1645 nsd->task[0] = task_file_create(tmpfile); 1646 if(!nsd->task[0]) { 1647#ifdef USE_ZONE_STATS 1648 unlink(nsd->zonestatfname[0]); 1649 unlink(nsd->zonestatfname[1]); 1650#endif 1651#ifdef BIND8_STATS 1652 server_stat_free(nsd); 1653#endif 1654 xfrd_del_tempdir(nsd); 1655 exit(1); 1656 } 1657 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1", 1658 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 1659 nsd->task[1] = task_file_create(tmpfile); 1660 if(!nsd->task[1]) { 1661 unlink(nsd->task[0]->fname); 1662#ifdef USE_ZONE_STATS 1663 unlink(nsd->zonestatfname[0]); 1664 unlink(nsd->zonestatfname[1]); 1665#endif 1666#ifdef BIND8_STATS 1667 server_stat_free(nsd); 1668#endif 1669 xfrd_del_tempdir(nsd); 1670 exit(1); 1671 } 1672 assert(udb_base_get_userdata(nsd->task[0])->data == 0); 1673 assert(udb_base_get_userdata(nsd->task[1])->data == 0); 1674 /* create xfrd listener structure */ 1675 nsd->xfrd_listener = region_alloc(nsd->region, 1676 sizeof(netio_handler_type)); 1677 nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*) 1678 region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data)); 1679 nsd->xfrd_listener->fd = -1; 1680 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd = 1681 nsd; 1682 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn = 1683 xfrd_tcp_create(nsd->region, QIOBUFSZ); 1684} 1685 1686 1687void 1688server_start_xfrd(struct nsd *nsd, int del_db, int reload_active) 1689{ 1690 pid_t pid; 1691 int sockets[2] = {0,0}; 1692 struct ipc_handler_conn_data *data; 1693 1694 if(nsd->xfrd_listener->fd != -1) 1695 close(nsd->xfrd_listener->fd); 1696 if(del_db) { 1697 /* recreate taskdb that xfrd was using, it may be corrupt */ 1698 /* we (or reload) use nsd->mytask, and xfrd uses the other */ 1699 char* tmpfile = nsd->task[1-nsd->mytask]->fname; 1700 nsd->task[1-nsd->mytask]->fname = NULL; 1701 /* free alloc already, so udb does not shrink itself */ 1702 
udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc); 1703 nsd->task[1-nsd->mytask]->alloc = NULL; 1704 udb_base_free(nsd->task[1-nsd->mytask]); 1705 /* create new file, overwrite the old one */ 1706 nsd->task[1-nsd->mytask] = task_file_create(tmpfile); 1707 free(tmpfile); 1708 } 1709 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) { 1710 log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno)); 1711 return; 1712 } 1713 pid = fork(); 1714 switch (pid) { 1715 case -1: 1716 log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno)); 1717 break; 1718 default: 1719 /* PARENT: close first socket, use second one */ 1720 close(sockets[0]); 1721 if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) { 1722 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1723 } 1724 if(del_db) xfrd_free_namedb(nsd); 1725 /* use other task than I am using, since if xfrd died and is 1726 * restarted, the reload is using nsd->mytask */ 1727 nsd->mytask = 1 - nsd->mytask; 1728 1729#ifdef HAVE_SETPROCTITLE 1730 setproctitle("xfrd"); 1731#endif 1732#ifdef HAVE_CPUSET_T 1733 if(nsd->use_cpu_affinity) { 1734 set_cpu_affinity(nsd->xfrd_cpuset); 1735 } 1736#endif 1737 1738 xfrd_init(sockets[1], nsd, del_db, reload_active, pid); 1739 /* ENOTREACH */ 1740 break; 1741 case 0: 1742 /* CHILD: close second socket, use first one */ 1743 close(sockets[1]); 1744 if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) { 1745 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1746 } 1747 nsd->xfrd_listener->fd = sockets[0]; 1748 break; 1749 } 1750 /* server-parent only */ 1751 nsd->xfrd_listener->timeout = NULL; 1752 nsd->xfrd_listener->event_types = NETIO_EVENT_READ; 1753 nsd->xfrd_listener->event_handler = parent_handle_xfrd_command; 1754 /* clear ongoing ipc reads */ 1755 data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data; 1756 data->conn->is_reading = 0; 1757} 1758 1759/** add all soainfo to taskdb */ 1760static void 1761add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb) 1762{ 1763 struct radnode* n; 1764 udb_ptr task_last; /* last task, mytask is empty so NULL */ 1765 /* add all SOA INFO to mytask */ 1766 udb_ptr_init(&task_last, taskudb); 1767 for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { 1768 task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0); 1769 } 1770 udb_ptr_unlink(&task_last, taskudb); 1771} 1772 1773void 1774server_send_soa_xfrd(struct nsd* nsd, int shortsoa) 1775{ 1776 /* normally this exchanges the SOA from nsd->xfrd and the expire back. 1777 * parent fills one taskdb with soas, xfrd fills other with expires. 1778 * then they exchange and process. 1779 * shortsoa: xfrd crashes and needs to be restarted and one taskdb 1780 * may be in use by reload. Fill SOA in taskdb and give to xfrd. 1781 * expire notifications can be sent back via a normal reload later 1782 * (xfrd will wait for current running reload to finish if any). 1783 */ 1784 sig_atomic_t cmd = 0; 1785 pid_t mypid; 1786 int xfrd_sock = nsd->xfrd_listener->fd; 1787 struct udb_base* taskudb = nsd->task[nsd->mytask]; 1788 udb_ptr t; 1789 if(!shortsoa) { 1790 if(nsd->signal_hint_shutdown) { 1791 shutdown: 1792 log_msg(LOG_WARNING, "signal received, shutting down..."); 1793 server_close_all_sockets(nsd->udp, nsd->ifs); 1794 server_close_all_sockets(nsd->tcp, nsd->ifs); 1795 daemon_remote_close(nsd->rc); 1796 /* Unlink it if possible... 
*/ 1797 unlinkpid(nsd->pidfile); 1798 unlink(nsd->task[0]->fname); 1799 unlink(nsd->task[1]->fname); 1800#ifdef USE_ZONE_STATS 1801 unlink(nsd->zonestatfname[0]); 1802 unlink(nsd->zonestatfname[1]); 1803#endif 1804#ifdef BIND8_STATS 1805 server_stat_free(nsd); 1806#endif 1807 server_shutdown(nsd); 1808 /* ENOTREACH */ 1809 exit(0); 1810 } 1811 } 1812 if(shortsoa) { 1813 /* put SOA in xfrd task because mytask may be in use */ 1814 taskudb = nsd->task[1-nsd->mytask]; 1815 } 1816 1817 add_all_soa_to_task(nsd, taskudb); 1818 if(!shortsoa) { 1819 /* wait for xfrd to signal task is ready, RELOAD signal */ 1820 if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) || 1821 cmd != NSD_RELOAD) { 1822 log_msg(LOG_ERR, "did not get start signal from xfrd"); 1823 exit(1); 1824 } 1825 if(nsd->signal_hint_shutdown) { 1826 goto shutdown; 1827 } 1828 } 1829 /* give xfrd our task, signal it with RELOAD_DONE */ 1830 task_process_sync(taskudb); 1831 cmd = NSD_RELOAD_DONE; 1832 if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { 1833 log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", 1834 (int)nsd->pid, strerror(errno)); 1835 } 1836 mypid = getpid(); 1837 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 1838 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 1839 strerror(errno)); 1840 } 1841 1842 if(!shortsoa) { 1843 /* process the xfrd task works (expiry data) */ 1844 nsd->mytask = 1 - nsd->mytask; 1845 taskudb = nsd->task[nsd->mytask]; 1846 task_remap(taskudb); 1847 udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb)); 1848 while(!udb_ptr_is_null(&t)) { 1849 task_process_expire(nsd->db, TASKLIST(&t)); 1850 udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next); 1851 } 1852 udb_ptr_unlink(&t, taskudb); 1853 task_clear(taskudb); 1854 1855 /* tell xfrd that the task is emptied, signal with RELOAD_DONE */ 1856 cmd = NSD_RELOAD_DONE; 1857 if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { 1858 log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", 1859 (int)nsd->pid, strerror(errno)); 1860 } 1861 } 1862} 1863 1864#ifdef HAVE_SSL 1865static void 1866log_crypto_from_err(const char* str, unsigned long err) 1867{ 1868 /* error:[error code]:[library name]:[function name]:[reason string] */ 1869 char buf[128]; 1870 unsigned long e; 1871 ERR_error_string_n(err, buf, sizeof(buf)); 1872 log_msg(LOG_ERR, "%s crypto %s", str, buf); 1873 while( (e=ERR_get_error()) ) { 1874 ERR_error_string_n(e, buf, sizeof(buf)); 1875 log_msg(LOG_ERR, "and additionally crypto %s", buf); 1876 } 1877} 1878 1879void 1880log_crypto_err(const char* str) 1881{ 1882 log_crypto_from_err(str, ERR_get_error()); 1883} 1884 1885/** true if the ssl handshake error has to be squelched from the logs */ 1886static int 1887squelch_err_ssl_handshake(unsigned long err) 1888{ 1889 if(verbosity >= 3) 1890 return 0; /* only squelch on low verbosity */ 1891 /* this is very specific, we could filter on ERR_GET_REASON() 1892 * (the third element in ERR_PACK) */ 1893 if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) || 1894 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) || 1895 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) || 1896 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE) 1897#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO 1898 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER) 1899#endif 1900#ifdef 
SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO 1901 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL) 1902 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL) 1903# ifdef SSL_R_VERSION_TOO_LOW 1904 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW) 1905# endif 1906#endif 1907 ) 1908 return 1; 1909 return 0; 1910} 1911 1912void 1913perform_openssl_init(void) 1914{ 1915 /* init SSL library */ 1916#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS 1917 ERR_load_crypto_strings(); 1918#endif 1919#if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS) 1920 ERR_load_SSL_strings(); 1921#endif 1922#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO) 1923 OpenSSL_add_all_algorithms(); 1924#else 1925 OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS 1926 | OPENSSL_INIT_ADD_ALL_DIGESTS 1927 | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL); 1928#endif 1929#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL) 1930 (void)SSL_library_init(); 1931#else 1932 OPENSSL_init_ssl(0, NULL); 1933#endif 1934 1935 if(!RAND_status()) { 1936 /* try to seed it */ 1937 unsigned char buf[256]; 1938 unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid(); 1939 size_t i; 1940 v = seed; 1941 for(i=0; i<256/sizeof(v); i++) { 1942 memmove(buf+i*sizeof(v), &v, sizeof(v)); 1943 v = v*seed + (unsigned int)i; 1944 } 1945 RAND_seed(buf, 256); 1946 log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time"); 1947 } 1948} 1949 1950static int 1951get_ocsp(char *filename, unsigned char **ocsp) 1952{ 1953 BIO *bio; 1954 OCSP_RESPONSE *response; 1955 int len = -1; 1956 unsigned char *p, *buf; 1957 assert(filename); 1958 1959 if ((bio = BIO_new_file(filename, "r")) == NULL) { 1960 log_crypto_err("get_ocsp: BIO_new_file failed"); 1961 return -1; 1962 } 1963 1964 if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) { 1965 log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed"); 1966 BIO_free(bio); 1967 return -1; 1968 } 1969 1970 if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) { 1971 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed"); 1972 OCSP_RESPONSE_free(response); 1973 BIO_free(bio); 1974 return -1; 1975 } 1976 1977 if ((buf = malloc((size_t) len)) == NULL) { 1978 log_msg(LOG_ERR, "get_ocsp: malloc failed"); 1979 OCSP_RESPONSE_free(response); 1980 BIO_free(bio); 1981 return -1; 1982 } 1983 1984 p = buf; 1985 if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) { 1986 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed"); 1987 free(buf); 1988 OCSP_RESPONSE_free(response); 1989 BIO_free(bio); 1990 return -1; 1991 } 1992 1993 OCSP_RESPONSE_free(response); 1994 BIO_free(bio); 1995 1996 *ocsp = buf; 1997 return len; 1998} 1999 2000/* further setup ssl ctx after the keys are loaded */ 2001static void 2002listen_sslctx_setup_2(void* ctxt) 2003{ 2004 SSL_CTX* ctx = (SSL_CTX*)ctxt; 2005 (void)ctx; 2006#if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO 2007 if(!SSL_CTX_set_ecdh_auto(ctx,1)) { 2008 /* ENOTREACH */ 2009 log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE"); 2010 } 2011#elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME) 2012 if(1) { 2013 EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1); 2014 if (!ecdh) { 2015 log_crypto_err("could not find p256, not enabling ECDHE"); 2016 } else { 2017 if (1 != SSL_CTX_set_tmp_ecdh (ctx, 
ecdh)) { 2018 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); 2019 } 2020 EC_KEY_free (ecdh); 2021 } 2022 } 2023#endif 2024} 2025 2026static int 2027add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) 2028{ 2029 if(ocspdata) { 2030 unsigned char *p; 2031 if ((p=malloc(ocspdata_len)) == NULL) { 2032 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); 2033 return SSL_TLSEXT_ERR_NOACK; 2034 } 2035 memcpy(p, ocspdata, ocspdata_len); 2036 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { 2037 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); 2038 free(p); 2039 return SSL_TLSEXT_ERR_NOACK; 2040 } 2041 return SSL_TLSEXT_ERR_OK; 2042 } else { 2043 return SSL_TLSEXT_ERR_NOACK; 2044 } 2045} 2046 2047SSL_CTX* 2048server_tls_ctx_setup(char* key, char* pem, char* verifypem) 2049{ 2050 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); 2051 if(!ctx) { 2052 log_crypto_err("could not SSL_CTX_new"); 2053 return NULL; 2054 } 2055 /* no SSLv2, SSLv3 because has defects */ 2056#if SSL_OP_NO_SSLv2 != 0 2057 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ 2058 log_crypto_err("could not set SSL_OP_NO_SSLv2"); 2059 SSL_CTX_free(ctx); 2060 return NULL; 2061 } 2062#endif 2063 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) 2064 != SSL_OP_NO_SSLv3){ 2065 log_crypto_err("could not set SSL_OP_NO_SSLv3"); 2066 SSL_CTX_free(ctx); 2067 return 0; 2068 } 2069#if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) 2070 /* if we have tls 1.1 disable 1.0 */ 2071 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) 2072 != SSL_OP_NO_TLSv1){ 2073 log_crypto_err("could not set SSL_OP_NO_TLSv1"); 2074 SSL_CTX_free(ctx); 2075 return 0; 2076 } 2077#endif 2078#if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) 2079 /* if we have tls 1.2 disable 1.1 */ 2080 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) 2081 != SSL_OP_NO_TLSv1_1){ 2082 log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); 2083 SSL_CTX_free(ctx); 2084 return 0; 2085 } 2086#endif 2087#if defined(SSL_OP_NO_RENEGOTIATION) 2088 /* disable client renegotiation */ 2089 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & 2090 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { 2091 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); 2092 SSL_CTX_free(ctx); 2093 return 0; 2094 } 2095#endif 2096#if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) 2097 /* if we detect system-wide crypto policies, use those */ 2098 if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) { 2099 /* if we have sha256, set the cipher list to have no known vulns */ 2100 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) 2101 log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list"); 2102 } 2103#endif 2104 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & 2105 SSL_OP_CIPHER_SERVER_PREFERENCE) != 2106 SSL_OP_CIPHER_SERVER_PREFERENCE) { 2107 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); 2108 SSL_CTX_free(ctx); 2109 return 0; 2110 } 2111#ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL 2112 SSL_CTX_set_security_level(ctx, 0); 2113#endif 2114 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { 2115 log_msg(LOG_ERR, "error for cert file: %s", pem); 2116 log_crypto_err("error in SSL_CTX use_certificate_chain_file"); 2117 SSL_CTX_free(ctx); 2118 return NULL; 2119 } 2120 if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) { 2121 log_msg(LOG_ERR, "error for private key file: %s", key); 
		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_check_private_key(ctx)) {
		log_msg(LOG_ERR, "error for key file: %s", key);
		log_crypto_err("Error in SSL_CTX check_private_key");
		SSL_CTX_free(ctx);
		return NULL;
	}
	listen_sslctx_setup_2(ctx);
	if(verifypem && verifypem[0]) {
		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
			log_crypto_err("Error in SSL_CTX verify locations");
			SSL_CTX_free(ctx);
			return NULL;
		}
		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
	}
	return ctx;
}

SSL_CTX*
server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
{
	char *key, *pem;
	SSL_CTX *ctx;

	key = nsd->options->tls_service_key;
	pem = nsd->options->tls_service_pem;
	if(!key || key[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-key file specified");
		return NULL;
	}
	if(!pem || pem[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
		return NULL;
	}

	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL but
	 * draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
	ctx = server_tls_ctx_setup(key, pem, verifypem);
	if(!ctx) {
		log_msg(LOG_ERR, "could not setup server TLS context");
		return NULL;
	}
	if(ocspfile && ocspfile[0]) {
		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
			log_crypto_err("Error reading OCSPfile");
			SSL_CTX_free(ctx);
			return NULL;
		} else {
			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
				SSL_CTX_free(ctx);
				return NULL;
			}
		}
	}
	return ctx;
}

/* check if tcp_handler_accept_data created for TLS dedicated port */
int
using_tls_port(struct sockaddr* addr, const char* tls_port)
{
	in_port_t port = 0;

	if (addr->sa_family == AF_INET)
		port = ((struct sockaddr_in*)addr)->sin_port;
#ifdef HAVE_STRUCT_SOCKADDR_IN6
	else
		port = ((struct sockaddr_in6*)addr)->sin6_port;
#endif /* HAVE_STRUCT_SOCKADDR_IN6 */
	if (atoi(tls_port) == ntohs(port))
		return 1;

	return 0;
}
#endif

/* pass timeout=-1 for blocking.
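 * The descriptor is polled before each read and reads interrupted by EINTR
 * are retried, unless a quit or shutdown hint has been set. Callers such as
 * server_reload() use it with RELOAD_SYNC_TIMEOUT to wait for an acknowledgement.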
Returns size, 0, -1(err), or -2(timeout) */ 2206ssize_t 2207block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2208{ 2209 uint8_t* buf = (uint8_t*) p; 2210 ssize_t total = 0; 2211 struct pollfd fd; 2212 memset(&fd, 0, sizeof(fd)); 2213 fd.fd = s; 2214 fd.events = POLLIN; 2215 2216 while( total < sz) { 2217 ssize_t ret; 2218 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2219 if(ret == -1) { 2220 if(errno == EAGAIN) 2221 /* blocking read */ 2222 continue; 2223 if(errno == EINTR) { 2224 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2225 return -1; 2226 /* other signals can be handled later */ 2227 continue; 2228 } 2229 /* some error */ 2230 return -1; 2231 } 2232 if(ret == 0) { 2233 /* operation timed out */ 2234 return -2; 2235 } 2236 ret = read(s, buf+total, sz-total); 2237 if(ret == -1) { 2238 if(errno == EAGAIN) 2239 /* blocking read */ 2240 continue; 2241 if(errno == EINTR) { 2242 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2243 return -1; 2244 /* other signals can be handled later */ 2245 continue; 2246 } 2247 /* some error */ 2248 return -1; 2249 } 2250 if(ret == 0) { 2251 /* closed connection! */ 2252 return 0; 2253 } 2254 total += ret; 2255 } 2256 return total; 2257} 2258 2259static void 2260reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2261{ 2262 sig_atomic_t cmd = NSD_QUIT_SYNC; 2263 udb_ptr t, next; 2264 udb_base* u = nsd->task[nsd->mytask]; 2265 udb_ptr_init(&next, u); 2266 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2267 udb_base_set_userdata(u, 0); 2268 while(!udb_ptr_is_null(&t)) { 2269 /* store next in list so this one can be deleted or reused */ 2270 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2271 udb_rptr_zero(&TASKLIST(&t)->next, u); 2272 2273 /* process task t */ 2274 /* append results for task t and update last_task */ 2275 task_process_in_reload(nsd, u, last_task, &t); 2276 2277 /* go to next */ 2278 udb_ptr_set_ptr(&t, u, &next); 2279 2280 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2281 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2282 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2283 if(cmd == NSD_QUIT) { 2284 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2285 /* unlink files of remainder of tasks */ 2286 while(!udb_ptr_is_null(&t)) { 2287 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2288 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2289 } 2290 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2291 } 2292 udb_ptr_unlink(&t, u); 2293 udb_ptr_unlink(&next, u); 2294 exit(0); 2295 } 2296 } 2297 2298 } 2299 udb_ptr_unlink(&t, u); 2300 udb_ptr_unlink(&next, u); 2301} 2302 2303void server_verify(struct nsd *nsd, int cmdsocket); 2304 2305/* 2306 * Reload the database, stop parent, re-fork children and continue. 2307 * as server_main. 
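 * The reload process sends NSD_QUIT_SYNC to the old main process and, after
 * the acknowledgement, takes over as the new main process.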
2308 */ 2309static void 2310server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio, 2311 int cmdsocket) 2312{ 2313 pid_t mypid; 2314 sig_atomic_t cmd = NSD_QUIT_SYNC; 2315 int ret; 2316 udb_ptr last_task; 2317 struct sigaction old_sigchld, ign_sigchld; 2318 struct radnode* node; 2319 zone_type* zone; 2320 enum soainfo_hint hint; 2321 /* ignore SIGCHLD from the previous server_main that used this pid */ 2322 memset(&ign_sigchld, 0, sizeof(ign_sigchld)); 2323 ign_sigchld.sa_handler = SIG_IGN; 2324 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld); 2325 2326#ifdef HAVE_SETPROCTITLE 2327 setproctitle("main"); 2328#endif 2329#ifdef HAVE_CPUSET_T 2330 if(nsd->use_cpu_affinity) { 2331 set_cpu_affinity(nsd->cpuset); 2332 } 2333#endif 2334 2335 /* see what tasks we got from xfrd */ 2336 task_remap(nsd->task[nsd->mytask]); 2337 udb_ptr_init(&last_task, nsd->task[nsd->mytask]); 2338 reload_process_tasks(nsd, &last_task, cmdsocket); 2339 2340#ifndef NDEBUG 2341 if(nsd_debug_level >= 1) 2342 region_log_stats(nsd->db->region); 2343#endif /* NDEBUG */ 2344 initialize_dname_compression_tables(nsd); 2345 2346#ifdef BIND8_STATS 2347 /* Restart dumping stats if required. */ 2348 time(&nsd->st->boot); 2349 set_bind8_alarm(nsd); 2350 /* Switch to a different set of stat array for new server processes, 2351 * because they can briefly coexist with the old processes. They 2352 * have their own stat structure. */ 2353 nsd->stat_current = (nsd->stat_current==0?1:0); 2354#endif 2355#ifdef USE_ZONE_STATS 2356 server_zonestat_realloc(nsd); /* realloc for new children */ 2357 server_zonestat_switch(nsd); 2358#endif 2359 2360 if(nsd->options->verify_enable) { 2361#ifdef RATELIMIT 2362 /* allocate resources for rate limiting. use a slot that is guaranteed 2363 not mapped to a file so no persistent data is overwritten */ 2364 rrl_init(nsd->child_count + 1); 2365#endif 2366 2367 /* spin-up server and execute verifiers for each zone */ 2368 server_verify(nsd, cmdsocket); 2369#ifdef RATELIMIT 2370 /* deallocate rate limiting resources */ 2371 rrl_deinit(nsd->child_count + 1); 2372#endif 2373 } 2374 2375 for(node = radix_first(nsd->db->zonetree); 2376 node != NULL; 2377 node = radix_next(node)) 2378 { 2379 zone = (zone_type *)node->elem; 2380 if(zone->is_updated) { 2381 if(zone->is_bad) { 2382 nsd->mode = NSD_RELOAD_FAILED; 2383 hint = soainfo_bad; 2384 } else { 2385 hint = soainfo_ok; 2386 } 2387 /* update(s), verified or not, possibly with subsequent 2388 skipped update(s). 
skipped update(s) are picked up 2389 by failed update check in xfrd */ 2390 task_new_soainfo(nsd->task[nsd->mytask], &last_task, 2391 zone, hint); 2392 } else if(zone->is_skipped) { 2393 /* corrupt or inconsistent update without preceding 2394 update(s), communicate soainfo_gone */ 2395 task_new_soainfo(nsd->task[nsd->mytask], &last_task, 2396 zone, soainfo_gone); 2397 } 2398 zone->is_updated = 0; 2399 zone->is_skipped = 0; 2400 } 2401 2402 if(nsd->mode == NSD_RELOAD_FAILED) { 2403 exit(NSD_RELOAD_FAILED); 2404 } 2405 2406 /* listen for the signals of failed children again */ 2407 sigaction(SIGCHLD, &old_sigchld, NULL); 2408#ifdef USE_DNSTAP 2409 if (nsd->dt_collector) { 2410 int *swap_fd_send; 2411 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes")); 2412 /* Swap fd_send with fd_swap so old serve child and new serve 2413 * childs will not write to the same pipe ends simultaneously */ 2414 swap_fd_send = nsd->dt_collector_fd_send; 2415 nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap; 2416 nsd->dt_collector_fd_swap = swap_fd_send; 2417 2418 } 2419#endif 2420 /* Start new child processes */ 2421 if (server_start_children(nsd, server_region, netio, &nsd-> 2422 xfrd_listener->fd) != 0) { 2423 send_children_quit(nsd); 2424 exit(1); 2425 } 2426 2427 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2428 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2429 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2430 if(cmd == NSD_QUIT) { 2431 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2432 send_children_quit(nsd); 2433 exit(0); 2434 } 2435 } 2436 2437 /* Send quit command to parent: blocking, wait for receipt. */ 2438 do { 2439 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main")); 2440 cmd = NSD_QUIT_SYNC; 2441 if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) 2442 { 2443 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s", 2444 strerror(errno)); 2445 } 2446 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */ 2447 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main")); 2448 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 2449 RELOAD_SYNC_TIMEOUT); 2450 if(ret == -2) { 2451 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry")); 2452 } 2453 } while (ret == -2); 2454 if(ret == -1) { 2455 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s", 2456 strerror(errno)); 2457 } 2458 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd)); 2459 if(cmd == NSD_QUIT) { 2460 /* small race condition possible here, parent got quit cmd. 
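 * In that case the freshly started children are told to quit as well and
 * the reload process exits.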
*/ 2461 send_children_quit(nsd); 2462 exit(1); 2463 } 2464 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD); 2465 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]); 2466 task_process_sync(nsd->task[nsd->mytask]); 2467#ifdef USE_ZONE_STATS 2468 server_zonestat_realloc(nsd); /* realloc for next children */ 2469#endif 2470 2471 /* send soainfo to the xfrd process, signal it that reload is done, 2472 * it picks up the taskudb */ 2473 cmd = NSD_RELOAD_DONE; 2474 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2475 log_msg(LOG_ERR, "problems sending reload_done xfrd: %s", 2476 strerror(errno)); 2477 } 2478 mypid = getpid(); 2479 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2480 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2481 strerror(errno)); 2482 } 2483 2484 /* try to reopen file */ 2485 if (nsd->file_rotation_ok) 2486 log_reopen(nsd->log_filename, 1); 2487 /* exit reload, continue as new server_main */ 2488} 2489 2490/* 2491 * Get the mode depending on the signal hints that have been received. 2492 * Multiple signal hints can be received and will be handled in turn. 2493 */ 2494static sig_atomic_t 2495server_signal_mode(struct nsd *nsd) 2496{ 2497 if(nsd->signal_hint_quit) { 2498 nsd->signal_hint_quit = 0; 2499 return NSD_QUIT; 2500 } 2501 else if(nsd->signal_hint_shutdown) { 2502 nsd->signal_hint_shutdown = 0; 2503 return NSD_SHUTDOWN; 2504 } 2505 else if(nsd->signal_hint_child) { 2506 nsd->signal_hint_child = 0; 2507 return NSD_REAP_CHILDREN; 2508 } 2509 else if(nsd->signal_hint_reload) { 2510 nsd->signal_hint_reload = 0; 2511 return NSD_RELOAD; 2512 } 2513 else if(nsd->signal_hint_reload_hup) { 2514 nsd->signal_hint_reload_hup = 0; 2515 return NSD_RELOAD_REQ; 2516 } 2517 else if(nsd->signal_hint_stats) { 2518 nsd->signal_hint_stats = 0; 2519#ifdef BIND8_STATS 2520 set_bind8_alarm(nsd); 2521#endif 2522 return NSD_STATS; 2523 } 2524 else if(nsd->signal_hint_statsusr) { 2525 nsd->signal_hint_statsusr = 0; 2526 return NSD_STATS; 2527 } 2528 return NSD_RUN; 2529} 2530 2531/* 2532 * The main server simply waits for signals and child processes to 2533 * terminate. Child processes are restarted as necessary. 2534 */ 2535void 2536server_main(struct nsd *nsd) 2537{ 2538 region_type *server_region = region_create(xalloc, free); 2539 netio_type *netio = netio_create(server_region); 2540 netio_handler_type reload_listener; 2541 int reload_sockets[2] = {-1, -1}; 2542 struct timespec timeout_spec; 2543 int status; 2544 pid_t child_pid; 2545 pid_t reload_pid = -1; 2546 sig_atomic_t mode; 2547 2548 /* Ensure we are the main process */ 2549 assert(nsd->server_kind == NSD_SERVER_MAIN); 2550 2551 /* Add listener for the XFRD process */ 2552 netio_add_handler(netio, nsd->xfrd_listener); 2553 2554#ifdef BIND8_STATS 2555 nsd->st = &nsd->stat_map[0]; 2556 nsd->st->db_disk = 0; 2557 nsd->st->db_mem = region_get_mem(nsd->db->region); 2558#endif 2559 2560 /* Start the child processes that handle incoming queries */ 2561 if (server_start_children(nsd, server_region, netio, 2562 &nsd->xfrd_listener->fd) != 0) { 2563 send_children_quit(nsd); 2564 exit(1); 2565 } 2566 reload_listener.fd = -1; 2567 2568 /* This_child MUST be 0, because this is the parent process */ 2569 assert(nsd->this_child == 0); 2570 2571 /* Run the server until we get a shutdown signal */ 2572 while ((mode = nsd->mode) != NSD_SHUTDOWN) { 2573 /* Did we receive a signal that changes our mode? 
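 * Signal handlers only set hint flags; server_signal_mode() translates
 * them into a mode here.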
*/ 2574 if(mode == NSD_RUN) { 2575 nsd->mode = mode = server_signal_mode(nsd); 2576 } 2577 2578 switch (mode) { 2579 case NSD_RUN: 2580 /* see if any child processes terminated */ 2581 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { 2582 int is_child = delete_child_pid(nsd, child_pid); 2583 if (is_child != -1 && nsd->children[is_child].need_to_exit) { 2584 if(nsd->children[is_child].child_fd == -1) 2585 nsd->children[is_child].has_exited = 1; 2586 parent_check_all_children_exited(nsd); 2587 } else if(is_child != -1) { 2588 log_msg(LOG_WARNING, 2589 "server %d died unexpectedly with status %d, restarting", 2590 (int) child_pid, status); 2591 restart_child_servers(nsd, server_region, netio, 2592 &nsd->xfrd_listener->fd); 2593 } else if (child_pid == reload_pid) { 2594 sig_atomic_t cmd = NSD_RELOAD_FAILED; 2595 pid_t mypid; 2596 log_msg(LOG_WARNING, 2597 "Reload process %d failed with status %d, continuing with old database", 2598 (int) child_pid, status); 2599 reload_pid = -1; 2600 if(reload_listener.fd != -1) close(reload_listener.fd); 2601 netio_remove_handler(netio, &reload_listener); 2602 reload_listener.fd = -1; 2603 reload_listener.event_types = NETIO_EVENT_NONE; 2604 task_process_sync(nsd->task[nsd->mytask]); 2605 /* inform xfrd reload attempt ended */ 2606 if(!write_socket(nsd->xfrd_listener->fd, 2607 &cmd, sizeof(cmd))) { 2608 log_msg(LOG_ERR, "problems " 2609 "sending SOAEND to xfrd: %s", 2610 strerror(errno)); 2611 } 2612 mypid = getpid(); 2613 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2614 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2615 strerror(errno)); 2616 } 2617#ifdef USE_DNSTAP 2618 } else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) { 2619 log_msg(LOG_WARNING, 2620 "dnstap-collector %d terminated with status %d", 2621 (int) child_pid, status); 2622 if(nsd->dt_collector) { 2623 dt_collector_close(nsd->dt_collector, nsd); 2624 dt_collector_destroy(nsd->dt_collector, nsd); 2625 nsd->dt_collector = NULL; 2626 } 2627 /* Only respawn a crashed (or exited) 2628 * dnstap-collector when not reloading, 2629 * to not induce a reload during a 2630 * reload (which would seriously 2631 * disrupt nsd procedures and lead to 2632 * unpredictable results)! 2633 * 2634 * This will *leave* a dnstap-collector 2635 * process terminated, but because 2636 * signalling of the reload process to 2637 * the main process to respawn in this 2638 * situation will be cumbersome, and 2639 * because this situation is so 2640 * specific (and therefore hopefully 2641 * extremely rare or non-existing at 2642 * all), plus the fact that we are left 2643 * with a perfectly function NSD 2644 * (besides not logging dnstap 2645 * messages), I consider it acceptable 2646 * to leave this unresolved. 
2647 */ 2648 if(reload_pid == -1 && nsd->options->dnstap_enable) { 2649 nsd->dt_collector = dt_collector_create(nsd); 2650 dt_collector_start(nsd->dt_collector, nsd); 2651 nsd->mode = NSD_RELOAD_REQ; 2652 } 2653#endif 2654 } else if(status != 0) { 2655 /* check for status, because we get 2656 * the old-servermain because reload 2657 * is the process-parent of old-main, 2658 * and we get older server-processes 2659 * that are exiting after a reload */ 2660 log_msg(LOG_WARNING, 2661 "process %d terminated with status %d", 2662 (int) child_pid, status); 2663 } 2664 } 2665 if (child_pid == -1) { 2666 if (errno == EINTR) { 2667 continue; 2668 } 2669 if (errno != ECHILD) 2670 log_msg(LOG_WARNING, "wait failed: %s", strerror(errno)); 2671 } 2672 if (nsd->mode != NSD_RUN) 2673 break; 2674 2675 /* timeout to collect processes. In case no sigchild happens. */ 2676 timeout_spec.tv_sec = 60; 2677 timeout_spec.tv_nsec = 0; 2678 2679 /* listen on ports, timeout for collecting terminated children */ 2680 if(netio_dispatch(netio, &timeout_spec, 0) == -1) { 2681 if (errno != EINTR) { 2682 log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno)); 2683 } 2684 } 2685 if(nsd->restart_children) { 2686 restart_child_servers(nsd, server_region, netio, 2687 &nsd->xfrd_listener->fd); 2688 nsd->restart_children = 0; 2689 } 2690 if(nsd->reload_failed) { 2691 sig_atomic_t cmd = NSD_RELOAD_FAILED; 2692 pid_t mypid; 2693 nsd->reload_failed = 0; 2694 log_msg(LOG_WARNING, 2695 "Reload process %d failed, continuing with old database", 2696 (int) reload_pid); 2697 reload_pid = -1; 2698 if(reload_listener.fd != -1) close(reload_listener.fd); 2699 netio_remove_handler(netio, &reload_listener); 2700 reload_listener.fd = -1; 2701 reload_listener.event_types = NETIO_EVENT_NONE; 2702 task_process_sync(nsd->task[nsd->mytask]); 2703 /* inform xfrd reload attempt ended */ 2704 if(!write_socket(nsd->xfrd_listener->fd, 2705 &cmd, sizeof(cmd))) { 2706 log_msg(LOG_ERR, "problems " 2707 "sending SOAEND to xfrd: %s", 2708 strerror(errno)); 2709 } 2710 mypid = getpid(); 2711 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2712 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2713 strerror(errno)); 2714 } 2715 } 2716 2717 break; 2718 case NSD_RELOAD_REQ: { 2719 sig_atomic_t cmd = NSD_RELOAD_REQ; 2720 log_msg(LOG_WARNING, "SIGHUP received, reloading..."); 2721 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2722 "main: ipc send reload_req to xfrd")); 2723 if(!write_socket(nsd->xfrd_listener->fd, 2724 &cmd, sizeof(cmd))) { 2725 log_msg(LOG_ERR, "server_main: could not send " 2726 "reload_req to xfrd: %s", strerror(errno)); 2727 } 2728 nsd->mode = NSD_RUN; 2729 } break; 2730 case NSD_RELOAD: 2731 /* Continue to run nsd after reload */ 2732 nsd->mode = NSD_RUN; 2733 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading...")); 2734 if (reload_pid != -1) { 2735 log_msg(LOG_WARNING, "Reload already in progress (pid = %d)", 2736 (int) reload_pid); 2737 break; 2738 } 2739 2740 /* switch the mytask to keep track of who owns task*/ 2741 nsd->mytask = 1 - nsd->mytask; 2742 if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) { 2743 log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno)); 2744 reload_pid = -1; 2745 break; 2746 } 2747 2748 /* Do actual reload */ 2749 reload_pid = fork(); 2750 switch (reload_pid) { 2751 case -1: 2752 log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); 2753 break; 2754 default: 2755 /* PARENT */ 2756 close(reload_sockets[0]); 2757 server_reload(nsd, server_region, netio, 2758 
reload_sockets[1]); 2759 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main")); 2760 close(reload_sockets[1]); 2761 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed")); 2762 /* drop stale xfrd ipc data */ 2763 ((struct ipc_handler_conn_data*)nsd-> 2764 xfrd_listener->user_data) 2765 ->conn->is_reading = 0; 2766 reload_pid = -1; 2767 reload_listener.fd = -1; 2768 reload_listener.event_types = NETIO_EVENT_NONE; 2769 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run")); 2770 break; 2771 case 0: 2772 /* CHILD */ 2773 /* server_main keep running until NSD_QUIT_SYNC 2774 * received from reload. */ 2775 close(reload_sockets[1]); 2776 reload_listener.fd = reload_sockets[0]; 2777 reload_listener.timeout = NULL; 2778 reload_listener.user_data = nsd; 2779 reload_listener.event_types = NETIO_EVENT_READ; 2780 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ 2781 netio_add_handler(netio, &reload_listener); 2782 reload_pid = getppid(); 2783 break; 2784 } 2785 break; 2786 case NSD_QUIT_SYNC: 2787 /* synchronisation of xfrd, parent and reload */ 2788 if(!nsd->quit_sync_done && reload_listener.fd != -1) { 2789 sig_atomic_t cmd = NSD_RELOAD; 2790 /* stop xfrd ipc writes in progress */ 2791 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2792 "main: ipc send indication reload")); 2793 if(!write_socket(nsd->xfrd_listener->fd, 2794 &cmd, sizeof(cmd))) { 2795 log_msg(LOG_ERR, "server_main: could not send reload " 2796 "indication to xfrd: %s", strerror(errno)); 2797 } 2798 /* wait for ACK from xfrd */ 2799 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); 2800 nsd->quit_sync_done = 1; 2801 } 2802 nsd->mode = NSD_RUN; 2803 break; 2804 case NSD_QUIT: 2805 /* silent shutdown during reload */ 2806 if(reload_listener.fd != -1) { 2807 /* acknowledge the quit, to sync reload that we will really quit now */ 2808 sig_atomic_t cmd = NSD_RELOAD; 2809 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); 2810 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2811 log_msg(LOG_ERR, "server_main: " 2812 "could not ack quit: %s", strerror(errno)); 2813 } 2814 close(reload_listener.fd); 2815 } 2816 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); 2817 /* only quit children after xfrd has acked */ 2818 send_children_quit(nsd); 2819 2820#ifdef MEMCLEAN /* OS collects memory pages */ 2821 region_destroy(server_region); 2822#endif 2823 server_shutdown(nsd); 2824 2825 /* ENOTREACH */ 2826 break; 2827 case NSD_SHUTDOWN: 2828 break; 2829 case NSD_REAP_CHILDREN: 2830 /* continue; wait for child in run loop */ 2831 nsd->mode = NSD_RUN; 2832 break; 2833 case NSD_STATS: 2834#ifdef BIND8_STATS 2835 set_children_stats(nsd); 2836#endif 2837 nsd->mode = NSD_RUN; 2838 break; 2839 default: 2840 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); 2841 nsd->mode = NSD_RUN; 2842 break; 2843 } 2844 } 2845 log_msg(LOG_WARNING, "signal received, shutting down..."); 2846 2847 /* close opened ports to avoid race with restart of nsd */ 2848 server_close_all_sockets(nsd->udp, nsd->ifs); 2849 server_close_all_sockets(nsd->tcp, nsd->ifs); 2850 daemon_remote_close(nsd->rc); 2851 send_children_quit_and_wait(nsd); 2852 2853 /* Unlink it if possible... 
*/ 2854 unlinkpid(nsd->pidfile); 2855 unlink(nsd->task[0]->fname); 2856 unlink(nsd->task[1]->fname); 2857#ifdef USE_ZONE_STATS 2858 unlink(nsd->zonestatfname[0]); 2859 unlink(nsd->zonestatfname[1]); 2860#endif 2861#ifdef BIND8_STATS 2862 server_stat_free(nsd); 2863#endif 2864#ifdef USE_DNSTAP 2865 dt_collector_close(nsd->dt_collector, nsd); 2866#endif 2867 2868 if(reload_listener.fd != -1) { 2869 sig_atomic_t cmd = NSD_QUIT; 2870 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2871 "main: ipc send quit to reload-process")); 2872 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2873 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", 2874 strerror(errno)); 2875 } 2876 fsync(reload_listener.fd); 2877 close(reload_listener.fd); 2878 /* wait for reload to finish processing */ 2879 while(1) { 2880 if(waitpid(reload_pid, NULL, 0) == -1) { 2881 if(errno == EINTR) continue; 2882 if(errno == ECHILD) break; 2883 log_msg(LOG_ERR, "waitpid(reload %d): %s", 2884 (int)reload_pid, strerror(errno)); 2885 } 2886 break; 2887 } 2888 } 2889 if(nsd->xfrd_listener->fd != -1) { 2890 /* complete quit, stop xfrd */ 2891 sig_atomic_t cmd = NSD_QUIT; 2892 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2893 "main: ipc send quit to xfrd")); 2894 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2895 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", 2896 strerror(errno)); 2897 } 2898 fsync(nsd->xfrd_listener->fd); 2899 close(nsd->xfrd_listener->fd); 2900 (void)kill(nsd->pid, SIGTERM); 2901 } 2902 2903#ifdef MEMCLEAN /* OS collects memory pages */ 2904 region_destroy(server_region); 2905#endif 2906 server_shutdown(nsd); 2907} 2908 2909static query_state_type 2910server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p) 2911{ 2912 return query_process(query, nsd, now_p); 2913} 2914 2915static query_state_type 2916server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p) 2917{ 2918#ifdef RATELIMIT 2919 if(query_process(query, nsd, now_p) != QUERY_DISCARDED) { 2920 if(query->edns.cookie_status != COOKIE_VALID 2921 && query->edns.cookie_status != COOKIE_VALID_REUSE 2922 && rrl_process_query(query)) 2923 return rrl_slip(query); 2924 else return QUERY_PROCESSED; 2925 } 2926 return QUERY_DISCARDED; 2927#else 2928 return query_process(query, nsd, now_p); 2929#endif 2930} 2931 2932const char* 2933nsd_event_vs(void) 2934{ 2935#ifdef USE_MINI_EVENT 2936 return ""; 2937#else 2938 return event_get_version(); 2939#endif 2940} 2941 2942#if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) 2943static const char* ub_ev_backend2str(int b) 2944{ 2945 switch(b) { 2946 case EVBACKEND_SELECT: return "select"; 2947 case EVBACKEND_POLL: return "poll"; 2948 case EVBACKEND_EPOLL: return "epoll"; 2949 case EVBACKEND_KQUEUE: return "kqueue"; 2950 case EVBACKEND_DEVPOLL: return "devpoll"; 2951 case EVBACKEND_PORT: return "evport"; 2952 } 2953 return "unknown"; 2954} 2955#endif 2956 2957const char* 2958nsd_event_method(void) 2959{ 2960#ifdef USE_MINI_EVENT 2961 return "select"; 2962#else 2963 struct event_base* b = nsd_child_event_base(); 2964 const char* m; 2965# ifdef EV_FEATURE_BACKENDS 2966 m = ub_ev_backend2str(ev_backend((struct ev_loop*)b)); 2967# elif defined(HAVE_EVENT_BASE_GET_METHOD) 2968 m = event_base_get_method(b); 2969# else 2970 m = "?"; 2971# endif 2972# ifdef MEMCLEAN 2973 event_base_free(b); 2974# endif 2975 return m; 2976#endif 2977} 2978 2979struct event_base* 2980nsd_child_event_base(void) 2981{ 2982 struct event_base* base; 2983#ifdef USE_MINI_EVENT 2984 static 
time_t secs; 2985 static struct timeval now; 2986 base = event_init(&secs, &now); 2987#else 2988# if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 2989 /* libev */ 2990 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 2991# else 2992 /* libevent */ 2993# ifdef HAVE_EVENT_BASE_NEW 2994 base = event_base_new(); 2995# else 2996 base = event_init(); 2997# endif 2998# endif 2999#endif 3000 return base; 3001} 3002 3003static void 3004add_udp_handler( 3005 struct nsd *nsd, 3006 struct nsd_socket *sock, 3007 struct udp_handler_data *data) 3008{ 3009 struct event *handler = &data->event; 3010 3011 data->nsd = nsd; 3012 data->socket = sock; 3013 3014 if(nsd->options->proxy_protocol_port && 3015 sockaddr_uses_proxy_protocol_port(nsd->options, 3016 (struct sockaddr *)&sock->addr.ai_addr)) { 3017 data->pp2_enabled = 1; 3018 } 3019 3020 memset(handler, 0, sizeof(*handler)); 3021 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 3022 if(event_base_set(nsd->event_base, handler) != 0) 3023 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 3024 if(event_add(handler, NULL) != 0) 3025 log_msg(LOG_ERR, "nsd udp: event_add failed"); 3026} 3027 3028void 3029add_tcp_handler( 3030 struct nsd *nsd, 3031 struct nsd_socket *sock, 3032 struct tcp_accept_handler_data *data) 3033{ 3034 struct event *handler = &data->event; 3035 3036 data->nsd = nsd; 3037 data->socket = sock; 3038 3039 if(nsd->options->proxy_protocol_port && 3040 sockaddr_uses_proxy_protocol_port(nsd->options, 3041 (struct sockaddr *)&sock->addr.ai_addr)) { 3042 data->pp2_enabled = 1; 3043 } 3044 3045#ifdef HAVE_SSL 3046 if (nsd->tls_ctx && 3047 nsd->options->tls_port && 3048 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 3049 { 3050 data->tls_accept = 1; 3051 if(verbosity >= 2) { 3052 char buf[48]; 3053 addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 3054 VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 3055 } 3056 } else { 3057 data->tls_accept = 0; 3058 } 3059#endif 3060 3061 memset(handler, 0, sizeof(*handler)); 3062 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 3063 if(event_base_set(nsd->event_base, handler) != 0) 3064 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 3065 if(event_add(handler, NULL) != 0) 3066 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 3067 data->event_added = 1; 3068} 3069 3070/* 3071 * Serve DNS request to verifiers (short-lived) 3072 */ 3073void server_verify(struct nsd *nsd, int cmdsocket) 3074{ 3075 size_t size = 0; 3076 struct event cmd_event, signal_event, exit_event; 3077 struct zone *zone; 3078 3079 assert(nsd != NULL); 3080 3081 zone = verify_next_zone(nsd, NULL); 3082 if(zone == NULL) 3083 return; 3084 3085 nsd->server_region = region_create(xalloc, free); 3086 nsd->event_base = nsd_child_event_base(); 3087 3088 nsd->next_zone_to_verify = zone; 3089 nsd->verifier_count = 0; 3090 nsd->verifier_limit = nsd->options->verifier_count; 3091 size = sizeof(struct verifier) * nsd->verifier_limit; 3092 if(pipe(nsd->verifier_pipe) == -1) { 3093 log_msg(LOG_ERR, "verify: could not create pipe: %s", 3094 strerror(errno)); 3095 goto fail_pipe; 3096 } 3097 fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC); 3098 fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC); 3099 nsd->verifiers = region_alloc_zero(nsd->server_region, size); 3100 3101 for(size_t i = 0; i < nsd->verifier_limit; i++) { 3102 nsd->verifiers[i].nsd = nsd; 3103 nsd->verifiers[i].zone = NULL; 3104 
nsd->verifiers[i].pid = -1; 3105 nsd->verifiers[i].output_stream.fd = -1; 3106 nsd->verifiers[i].output_stream.priority = LOG_INFO; 3107 nsd->verifiers[i].error_stream.fd = -1; 3108 nsd->verifiers[i].error_stream.priority = LOG_ERR; 3109 } 3110 3111 event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd); 3112 if(event_base_set(nsd->event_base, &cmd_event) != 0 || 3113 event_add(&cmd_event, NULL) != 0) 3114 { 3115 log_msg(LOG_ERR, "verify: could not add command event"); 3116 goto fail; 3117 } 3118 3119 event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd); 3120 if(event_base_set(nsd->event_base, &signal_event) != 0 || 3121 signal_add(&signal_event, NULL) != 0) 3122 { 3123 log_msg(LOG_ERR, "verify: could not add signal event"); 3124 goto fail; 3125 } 3126 3127 event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd); 3128 if(event_base_set(nsd->event_base, &exit_event) != 0 || 3129 event_add(&exit_event, NULL) != 0) 3130 { 3131 log_msg(LOG_ERR, "verify: could not add exit event"); 3132 goto fail; 3133 } 3134 3135 memset(msgs, 0, sizeof(msgs)); 3136 for (int i = 0; i < NUM_RECV_PER_SELECT; i++) { 3137 queries[i] = query_create(nsd->server_region, 3138 compressed_dname_offsets, 3139 compression_table_size, compressed_dnames); 3140 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3141 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3142 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3143 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3144 msgs[i].msg_hdr.msg_iovlen = 1; 3145 msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr; 3146 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3147 } 3148 3149 for (size_t i = 0; i < nsd->verify_ifs; i++) { 3150 struct udp_handler_data *data; 3151 data = region_alloc_zero( 3152 nsd->server_region, sizeof(*data)); 3153 add_udp_handler(nsd, &nsd->verify_udp[i], data); 3154 } 3155 3156 tcp_accept_handler_count = nsd->verify_ifs; 3157 tcp_accept_handlers = region_alloc_array(nsd->server_region, 3158 nsd->verify_ifs, sizeof(*tcp_accept_handlers)); 3159 3160 for (size_t i = 0; i < nsd->verify_ifs; i++) { 3161 struct tcp_accept_handler_data *data; 3162 data = &tcp_accept_handlers[i]; 3163 memset(data, 0, sizeof(*data)); 3164 add_tcp_handler(nsd, &nsd->verify_tcp[i], data); 3165 } 3166 3167 while(nsd->next_zone_to_verify != NULL && 3168 nsd->verifier_count < nsd->verifier_limit) 3169 { 3170 verify_zone(nsd, nsd->next_zone_to_verify); 3171 nsd->next_zone_to_verify 3172 = verify_next_zone(nsd, nsd->next_zone_to_verify); 3173 } 3174 3175 /* short-lived main loop */ 3176 event_base_dispatch(nsd->event_base); 3177 3178 /* remove command and exit event handlers */ 3179 event_del(&exit_event); 3180 event_del(&signal_event); 3181 event_del(&cmd_event); 3182 3183 assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT); 3184 assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT); 3185fail: 3186 close(nsd->verifier_pipe[0]); 3187 close(nsd->verifier_pipe[1]); 3188fail_pipe: 3189 event_base_free(nsd->event_base); 3190 region_destroy(nsd->server_region); 3191 3192 nsd->event_base = NULL; 3193 nsd->server_region = NULL; 3194 nsd->verifier_limit = 0; 3195 nsd->verifier_pipe[0] = -1; 3196 nsd->verifier_pipe[1] = -1; 3197 nsd->verifiers = NULL; 3198} 3199 3200/* 3201 * Serve DNS requests. 
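 * Each child process runs its own event loop and serves only the sockets
 * whose server bitset includes this child.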
3202 */ 3203void 3204server_child(struct nsd *nsd) 3205{ 3206 size_t i, from, numifs; 3207 region_type *server_region = region_create(xalloc, free); 3208 struct event_base* event_base = nsd_child_event_base(); 3209 sig_atomic_t mode; 3210 3211 if(!event_base) { 3212 log_msg(LOG_ERR, "nsd server could not create event base"); 3213 exit(1); 3214 } 3215 nsd->event_base = event_base; 3216 nsd->server_region = server_region; 3217 3218#ifdef RATELIMIT 3219 rrl_init(nsd->this_child->child_num); 3220#endif 3221 3222 assert(nsd->server_kind != NSD_SERVER_MAIN); 3223 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 3224 3225#ifdef HAVE_SETPROCTITLE 3226 setproctitle("server %d", nsd->this_child->child_num + 1); 3227#endif 3228#ifdef HAVE_CPUSET_T 3229 if(nsd->use_cpu_affinity) { 3230 set_cpu_affinity(nsd->this_child->cpuset); 3231 } 3232#endif 3233#ifdef BIND8_STATS 3234 nsd->st = &nsd->stats_per_child[nsd->stat_current] 3235 [nsd->this_child->child_num]; 3236 nsd->st->boot = nsd->stat_map[0].boot; 3237 memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc)); 3238#endif 3239 3240 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 3241 server_close_all_sockets(nsd->tcp, nsd->ifs); 3242 } 3243 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 3244 server_close_all_sockets(nsd->udp, nsd->ifs); 3245 } 3246 3247 if (nsd->this_child->parent_fd != -1) { 3248 struct event *handler; 3249 struct ipc_handler_conn_data* user_data = 3250 (struct ipc_handler_conn_data*)region_alloc( 3251 server_region, sizeof(struct ipc_handler_conn_data)); 3252 user_data->nsd = nsd; 3253 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 3254 3255 handler = (struct event*) region_alloc( 3256 server_region, sizeof(*handler)); 3257 memset(handler, 0, sizeof(*handler)); 3258 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 3259 EV_READ, child_handle_parent_command, user_data); 3260 if(event_base_set(event_base, handler) != 0) 3261 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 3262 if(event_add(handler, NULL) != 0) 3263 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 3264 } 3265 3266 if(nsd->reuseport) { 3267 numifs = nsd->ifs / nsd->reuseport; 3268 from = numifs * nsd->this_child->child_num; 3269 if(from+numifs > nsd->ifs) { /* should not happen */ 3270 from = 0; 3271 numifs = nsd->ifs; 3272 } 3273 } else { 3274 from = 0; 3275 numifs = nsd->ifs; 3276 } 3277 3278 if (nsd->server_kind & NSD_SERVER_UDP) { 3279 int child = nsd->this_child->child_num; 3280 memset(msgs, 0, sizeof(msgs)); 3281 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 3282 queries[i] = query_create(server_region, 3283 compressed_dname_offsets, 3284 compression_table_size, compressed_dnames); 3285 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3286 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3287 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3288 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3289 msgs[i].msg_hdr.msg_iovlen = 1; 3290 msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr; 3291 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3292 } 3293 3294 for (i = 0; i < nsd->ifs; i++) { 3295 int listen; 3296 struct udp_handler_data *data; 3297 3298 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 3299 3300 if(i >= from && i < (from + numifs) && listen) { 3301 data = region_alloc_zero( 3302 nsd->server_region, sizeof(*data)); 3303 add_udp_handler(nsd, &nsd->udp[i], data); 3304 } else { 3305 /* close sockets intended for other servers */ 3306 server_close_socket(&nsd->udp[i]); 3307 } 3308 } 3309 } 3310 3311 /* 
3312 * Keep track of all the TCP accept handlers so we can enable 3313 * and disable them based on the current number of active TCP 3314 * connections. 3315 */ 3316 if (nsd->server_kind & NSD_SERVER_TCP) { 3317 int child = nsd->this_child->child_num; 3318 tcp_accept_handler_count = numifs; 3319 tcp_accept_handlers = region_alloc_array(server_region, 3320 numifs, sizeof(*tcp_accept_handlers)); 3321 3322 for (i = 0; i < nsd->ifs; i++) { 3323 int listen; 3324 struct tcp_accept_handler_data *data; 3325 3326 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 3327 3328 if(i >= from && i < (from + numifs) && listen) { 3329 data = &tcp_accept_handlers[i-from]; 3330 memset(data, 0, sizeof(*data)); 3331 add_tcp_handler(nsd, &nsd->tcp[i], data); 3332 } else { 3333 /* close sockets intended for other servers */ 3334 /* 3335 * uncomment this once tcp servers are no 3336 * longer copied in the tcp fd copy line 3337 * in server_init(). 3338 server_close_socket(&nsd->tcp[i]); 3339 */ 3340 /* close sockets not meant for this server*/ 3341 if(!listen) 3342 server_close_socket(&nsd->tcp[i]); 3343 } 3344 } 3345 } else { 3346 tcp_accept_handler_count = 0; 3347 } 3348 3349 /* The main loop... */ 3350 while ((mode = nsd->mode) != NSD_QUIT) { 3351 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 3352 3353 /* Do we need to do the statistics... */ 3354 if (mode == NSD_STATS) { 3355#ifdef BIND8_STATS 3356 int p = nsd->st_period; 3357 nsd->st_period = 1; /* force stats printout */ 3358 /* Dump the statistics */ 3359 bind8_stats(nsd); 3360 nsd->st_period = p; 3361#else /* !BIND8_STATS */ 3362 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 3363#endif /* BIND8_STATS */ 3364 3365 nsd->mode = NSD_RUN; 3366 } 3367 else if (mode == NSD_REAP_CHILDREN) { 3368 /* got signal, notify parent. parent reaps terminated children. */ 3369 if (nsd->this_child->parent_fd != -1) { 3370 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 3371 if (write(nsd->this_child->parent_fd, 3372 &parent_notify, 3373 sizeof(parent_notify)) == -1) 3374 { 3375 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 3376 (int) nsd->this_child->pid, strerror(errno)); 3377 } 3378 } else /* no parent, so reap 'em */ 3379 while (waitpid(-1, NULL, WNOHANG) > 0) ; 3380 nsd->mode = NSD_RUN; 3381 } 3382 else if(mode == NSD_RUN) { 3383 /* Wait for a query... 
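 * One pass of the event loop (EVLOOP_ONCE) handles the pending events and
 * returns, so mode changes are picked up between iterations.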
*/ 3384 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3385 if (errno != EINTR) { 3386 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3387 break; 3388 } 3389 } 3390 } else if(mode == NSD_QUIT) { 3391 /* ignore here, quit */ 3392 } else { 3393 log_msg(LOG_ERR, "mode bad value %d, back to service.", 3394 (int)mode); 3395 nsd->mode = NSD_RUN; 3396 } 3397 } 3398 3399 service_remaining_tcp(nsd); 3400#ifdef BIND8_STATS 3401 bind8_stats(nsd); 3402#endif /* BIND8_STATS */ 3403 3404#ifdef MEMCLEAN /* OS collects memory pages */ 3405#ifdef RATELIMIT 3406 rrl_deinit(nsd->this_child->child_num); 3407#endif 3408 event_base_free(event_base); 3409 region_destroy(server_region); 3410#endif 3411 server_shutdown(nsd); 3412} 3413 3414static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) 3415{ 3416 int* timed_out = (int*)arg; 3417 assert(event & EV_TIMEOUT); (void)event; 3418 /* wake up the service tcp thread, note event is no longer 3419 * registered */ 3420 *timed_out = 1; 3421} 3422 3423void 3424service_remaining_tcp(struct nsd* nsd) 3425{ 3426 struct tcp_handler_data* p; 3427 struct event_base* event_base; 3428 /* check if it is needed */ 3429 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) 3430 return; 3431 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections")); 3432#ifdef USE_DNSTAP 3433 /* remove dnstap collector, we cannot write there because the new 3434 * child process is using the file descriptor, or the child 3435 * process after that. */ 3436 dt_collector_destroy(nsd->dt_collector, nsd); 3437 nsd->dt_collector = NULL; 3438#endif 3439 /* setup event base */ 3440 event_base = nsd_child_event_base(); 3441 if(!event_base) { 3442 log_msg(LOG_ERR, "nsd remain tcp could not create event base"); 3443 return; 3444 } 3445 /* register tcp connections */ 3446 for(p = tcp_active_list; p != NULL; p = p->next) { 3447 struct timeval timeout; 3448 int fd = p->event.ev_fd; 3449#ifdef USE_MINI_EVENT 3450 short event = p->event.ev_flags & (EV_READ|EV_WRITE); 3451#else 3452 short event = p->event.ev_events & (EV_READ|EV_WRITE); 3453#endif 3454 void (*fn)(int, short, void*); 3455#ifdef HAVE_SSL 3456 if(p->tls) { 3457 if((event&EV_READ)) 3458 fn = handle_tls_reading; 3459 else fn = handle_tls_writing; 3460 } else { 3461#endif 3462 if((event&EV_READ)) 3463 fn = handle_tcp_reading; 3464 else fn = handle_tcp_writing; 3465#ifdef HAVE_SSL 3466 } 3467#endif 3468 3469 p->tcp_no_more_queries = 1; 3470 /* set timeout to 3 seconds (previously 1/10 second) */ 3471 if(p->tcp_timeout > 3000) 3472 p->tcp_timeout = 3000; 3473 timeout.tv_sec = p->tcp_timeout / 1000; 3474 timeout.tv_usec = (p->tcp_timeout % 1000)*1000; 3475 event_del(&p->event); 3476 memset(&p->event, 0, sizeof(p->event)); 3477 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, 3478 fn, p); 3479 if(event_base_set(event_base, &p->event) != 0) 3480 log_msg(LOG_ERR, "event base set failed"); 3481 if(event_add(&p->event, &timeout) != 0) 3482 log_msg(LOG_ERR, "event add failed"); 3483 } 3484 3485 /* handle it */ 3486 while(nsd->current_tcp_count > 0) { 3487 mode_t m = server_signal_mode(nsd); 3488 struct event timeout; 3489 struct timeval tv; 3490 int timed_out = 0; 3491 if(m == NSD_QUIT || m == NSD_SHUTDOWN || 3492 m == NSD_REAP_CHILDREN) { 3493 /* quit */ 3494 break; 3495 } 3496 /* timer */ 3497 /* have to do something every 3 seconds */ 3498 tv.tv_sec = 3; 3499 tv.tv_usec = 0; 3500 memset(&timeout, 0, sizeof(timeout)); 3501 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, 3502 &timed_out); 3503 
if(event_base_set(event_base, &timeout) != 0) 3504 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3505 if(event_add(&timeout, &tv) != 0) 3506 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3507 3508 /* service loop */ 3509 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3510 if (errno != EINTR) { 3511 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3512 break; 3513 } 3514 } 3515 if(!timed_out) { 3516 event_del(&timeout); 3517 } else { 3518 /* timed out, quit */ 3519 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3520 break; 3521 } 3522 } 3523#ifdef MEMCLEAN 3524 event_base_free(event_base); 3525#endif 3526 /* continue to quit after return */ 3527} 3528 3529/* Implement recvmmsg and sendmmsg if the platform does not. These functions 3530 * are always used, even if nonblocking operations are broken, in which case 3531 * NUM_RECV_PER_SELECT is defined to 1 (one). 3532 */ 3533#if defined(HAVE_RECVMMSG) 3534#define nsd_recvmmsg recvmmsg 3535#else /* !HAVE_RECVMMSG */ 3536 3537static int 3538nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3539 int flags, struct timespec *timeout) 3540{ 3541 unsigned int vpos = 0; 3542 ssize_t rcvd; 3543 3544 /* timeout is ignored, ensure caller does not expect it to work */ 3545 assert(timeout == NULL); (void)timeout; 3546 3547 while(vpos < vlen) { 3548 rcvd = recvfrom(sockfd, 3549 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3550 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3551 flags, 3552 msgvec[vpos].msg_hdr.msg_name, 3553 &msgvec[vpos].msg_hdr.msg_namelen); 3554 if(rcvd < 0) { 3555 break; 3556 } else { 3557 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3558 msgvec[vpos].msg_len = (unsigned int)rcvd; 3559 vpos++; 3560 } 3561 } 3562 3563 if(vpos) { 3564 /* error will be picked up next time */ 3565 return (int)vpos; 3566 } else if(errno == 0) { 3567 return 0; 3568 } else if(errno == EAGAIN) { 3569 return 0; 3570 } 3571 3572 return -1; 3573} 3574#endif /* HAVE_RECVMMSG */ 3575 3576#ifdef HAVE_SENDMMSG 3577#define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__) 3578#else /* !HAVE_SENDMMSG */ 3579 3580static int 3581nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3582{ 3583 unsigned int vpos = 0; 3584 ssize_t snd; 3585 3586 while(vpos < vlen) { 3587 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3588 snd = sendto(sockfd, 3589 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3590 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3591 flags, 3592 msgvec[vpos].msg_hdr.msg_name, 3593 msgvec[vpos].msg_hdr.msg_namelen); 3594 if(snd < 0) { 3595 break; 3596 } else { 3597 msgvec[vpos].msg_len = (unsigned int)snd; 3598 vpos++; 3599 } 3600 } 3601 3602 if(vpos) { 3603 return (int)vpos; 3604 } else if(errno == 0) { 3605 return 0; 3606 } 3607 3608 return -1; 3609} 3610#endif /* HAVE_SENDMMSG */ 3611 3612static int 3613port_is_zero( 3614#ifdef INET6 3615 struct sockaddr_storage *addr 3616#else 3617 struct sockaddr_in *addr 3618#endif 3619 ) 3620{ 3621#ifdef INET6 3622 if(addr->ss_family == AF_INET6) { 3623 return (((struct sockaddr_in6 *)addr)->sin6_port) == 0; 3624 } else if(addr->ss_family == AF_INET) { 3625 return (((struct sockaddr_in *)addr)->sin_port) == 0; 3626 } 3627 return 0; 3628#else 3629 if(addr->sin_family == AF_INET) { 3630 return addr->sin_port == 0; 3631 } 3632 return 0; 3633#endif 3634} 3635 3636/* Parses the PROXYv2 header from buf and updates the struct. 3637 * Returns 1 on success, 0 on failure. 
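 * For datagram queries the remaining packet data is shifted over the
 * consumed header; for stream connections the header bytes are left in
 * the buffer.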
*/ 3638static int 3639consume_pp2_header(struct buffer* buf, struct query* q, int stream) 3640{ 3641 size_t size; 3642 struct pp2_header* header; 3643 int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf)); 3644 if(err) { 3645 VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse " 3646 "PROXYv2 header: %s", pp_lookup_error(err))); 3647 return 0; 3648 } 3649 header = (struct pp2_header*)buffer_begin(buf); 3650 size = PP2_HEADER_SIZE + read_uint16(&header->len); 3651 if(size > buffer_limit(buf)) { 3652 VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer " 3653 "size to read PROXYv2 header")); 3654 return 0; 3655 } 3656 if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) { 3657 /* A connection from the proxy itself. 3658 * No need to do anything with addresses. */ 3659 goto done; 3660 } 3661 if(header->fam_prot == PP2_UNSPEC_UNSPEC) { 3662 /* Unspecified family and protocol. This could be used for 3663 * health checks by proxies. 3664 * No need to do anything with addresses. */ 3665 goto done; 3666 } 3667 /* Read the proxied address */ 3668 switch(header->fam_prot) { 3669 case PP2_INET_STREAM: 3670 case PP2_INET_DGRAM: 3671 { 3672 struct sockaddr_in* addr = 3673 (struct sockaddr_in*)&q->client_addr; 3674 addr->sin_family = AF_INET; 3675 memmove(&addr->sin_addr.s_addr, 3676 &header->addr.addr4.src_addr, 4); 3677 memmove(&addr->sin_port, &header->addr.addr4.src_port, 3678 2); 3679 q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in); 3680 } 3681 /* Ignore the destination address; it should be us. */ 3682 break; 3683#ifdef INET6 3684 case PP2_INET6_STREAM: 3685 case PP2_INET6_DGRAM: 3686 { 3687 struct sockaddr_in6* addr = 3688 (struct sockaddr_in6*)&q->client_addr; 3689 memset(addr, 0, sizeof(*addr)); 3690 addr->sin6_family = AF_INET6; 3691 memmove(&addr->sin6_addr, 3692 header->addr.addr6.src_addr, 16); 3693 memmove(&addr->sin6_port, &header->addr.addr6.src_port, 3694 2); 3695 q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6); 3696 } 3697 /* Ignore the destination address; it should be us. */ 3698 break; 3699#endif /* INET6 */ 3700 default: 3701 VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported " 3702 "family and protocol 0x%x", 3703 (int)header->fam_prot)); 3704 return 0; 3705 } 3706 q->is_proxied = 1; 3707done: 3708 if(!stream) { 3709 /* We are reading a whole packet; 3710 * Move the rest of the data to overwrite the PROXYv2 header */ 3711 /* XXX can we do better to avoid memmove? 
*/ 3712 memmove(header, ((char*)header)+size, buffer_limit(buf)-size); 3713 buffer_set_limit(buf, buffer_limit(buf)-size); 3714 } 3715 return 1; 3716} 3717 3718static void 3719handle_udp(int fd, short event, void* arg) 3720{ 3721 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3722 int received, sent, recvcount, i; 3723 struct query *q; 3724 uint32_t now = 0; 3725 3726 if (!(event & EV_READ)) { 3727 return; 3728 } 3729 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3730 /* this printf strangely gave a performance increase on Linux */ 3731 /* printf("recvcount %d \n", recvcount); */ 3732 if (recvcount == -1) { 3733 if (errno != EAGAIN && errno != EINTR) { 3734 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3735 STATUP(data->nsd, rxerr); 3736 /* No zone statup */ 3737 } 3738 /* Simply no data available */ 3739 return; 3740 } 3741 for (i = 0; i < recvcount; i++) { 3742 loopstart: 3743 received = msgs[i].msg_len; 3744 queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen; 3745 queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr); 3746 queries[i]->is_proxied = 0; 3747 q = queries[i]; 3748 if (received == -1) { 3749 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3750#if defined(HAVE_RECVMMSG) 3751 msgs[i].msg_hdr.msg_flags 3752#else 3753 errno 3754#endif 3755 )); 3756 STATUP(data->nsd, rxerr); 3757 /* No zone statup */ 3758 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3759 iovecs[i].iov_len = buffer_remaining(q->packet); 3760 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3761 goto swap_drop; 3762 } 3763 3764 /* Account... */ 3765#ifdef BIND8_STATS 3766 if (data->socket->addr.ai_family == AF_INET) { 3767 STATUP(data->nsd, qudp); 3768 } else if (data->socket->addr.ai_family == AF_INET6) { 3769 STATUP(data->nsd, qudp6); 3770 } 3771#endif 3772 3773 buffer_skip(q->packet, received); 3774 buffer_flip(q->packet); 3775 if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) { 3776 VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not " 3777 "consume PROXYv2 header")); 3778 goto swap_drop; 3779 } 3780 if(!q->is_proxied) { 3781 q->client_addrlen = q->remote_addrlen; 3782 memmove(&q->client_addr, &q->remote_addr, 3783 q->remote_addrlen); 3784 } 3785#ifdef USE_DNSTAP 3786 /* 3787 * sending UDP-query with server address (local) and client address to dnstap process 3788 */ 3789 log_addr("query from client", &q->client_addr); 3790 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 3791 if(verbosity >= 6 && q->is_proxied) 3792 log_addr("query via proxy", &q->remote_addr); 3793 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen, 3794 q->tcp, q->packet); 3795#endif /* USE_DNSTAP */ 3796 3797 /* Process and answer the query... */ 3798 if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) { 3799 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3800 STATUP(data->nsd, nona); 3801 ZTATUP(data->nsd, q->zone, nona); 3802 } 3803 3804#ifdef USE_ZONE_STATS 3805 if (data->socket->addr.ai_family == AF_INET) { 3806 ZTATUP(data->nsd, q->zone, qudp); 3807 } else if (data->socket->addr.ai_family == AF_INET6) { 3808 ZTATUP(data->nsd, q->zone, qudp6); 3809 } 3810#endif 3811 3812 /* Add EDNS0 and TSIG info if necessary. */ 3813 query_add_optional(q, data->nsd, &now); 3814 3815 buffer_flip(q->packet); 3816 iovecs[i].iov_len = buffer_remaining(q->packet); 3817#ifdef BIND8_STATS 3818 /* Account the rcode & TC... 
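 * (both the global counters and the per-zone counters are updated)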
*/ 3819 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3820 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3821 if (TC(q->packet)) { 3822 STATUP(data->nsd, truncated); 3823 ZTATUP(data->nsd, q->zone, truncated); 3824 } 3825#endif /* BIND8_STATS */ 3826#ifdef USE_DNSTAP 3827 /* 3828 * sending UDP-response with server address (local) and client address to dnstap process 3829 */ 3830 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 3831 log_addr("response to client", &q->client_addr); 3832 if(verbosity >= 6 && q->is_proxied) 3833 log_addr("response via proxy", &q->remote_addr); 3834 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, 3835 &q->client_addr, q->client_addrlen, q->tcp, q->packet, 3836 q->zone); 3837#endif /* USE_DNSTAP */ 3838 } else { 3839 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3840 iovecs[i].iov_len = buffer_remaining(q->packet); 3841 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3842 swap_drop: 3843 STATUP(data->nsd, dropped); 3844 ZTATUP(data->nsd, q->zone, dropped); 3845 if(i != recvcount-1) { 3846 /* swap with last and decrease recvcount */ 3847 struct mmsghdr mtmp = msgs[i]; 3848 struct iovec iotmp = iovecs[i]; 3849 recvcount--; 3850 msgs[i] = msgs[recvcount]; 3851 iovecs[i] = iovecs[recvcount]; 3852 queries[i] = queries[recvcount]; 3853 msgs[recvcount] = mtmp; 3854 iovecs[recvcount] = iotmp; 3855 queries[recvcount] = q; 3856 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3857 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3858 goto loopstart; 3859 } else { recvcount --; } 3860 } 3861 } 3862 3863 /* send until all are sent */ 3864 i = 0; 3865 while(i<recvcount) { 3866 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3867 if(sent == -1) { 3868 if(errno == ENOBUFS || 3869#ifdef EWOULDBLOCK 3870 errno == EWOULDBLOCK || 3871#endif 3872 errno == EAGAIN) { 3873 /* block to wait until send buffer avail */ 3874 int flag, errstore; 3875 if((flag = fcntl(fd, F_GETFL)) == -1) { 3876 log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno)); 3877 flag = 0; 3878 } 3879 flag &= ~O_NONBLOCK; 3880 if(fcntl(fd, F_SETFL, flag) == -1) 3881 log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno)); 3882 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3883 errstore = errno; 3884 flag |= O_NONBLOCK; 3885 if(fcntl(fd, F_SETFL, flag) == -1) 3886 log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno)); 3887 if(sent != -1) { 3888 i += sent; 3889 continue; 3890 } 3891 errno = errstore; 3892 } 3893 if(errno == EINVAL) { 3894 /* skip the invalid argument entry, 3895 * send the remaining packets in the list */ 3896 if(!(port_is_zero((void*)&queries[i]->remote_addr) && 3897 verbosity < 3)) { 3898 const char* es = strerror(errno); 3899 char a[64]; 3900 addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a)); 3901 log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3902 } 3903 i += 1; 3904 continue; 3905 } 3906 /* don't log transient network full errors, unless 3907 * on higher verbosity */ 3908 if(!(errno == ENOBUFS && verbosity < 1) && 3909#ifdef EWOULDBLOCK 3910 errno != EWOULDBLOCK && 3911#endif 3912 errno != EAGAIN) { 3913 const char* es = strerror(errno); 3914 char a[64]; 3915 addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a)); 3916 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3917 } 3918#ifdef BIND8_STATS 3919 data->nsd->st->txerr += recvcount-i; 3920#endif /* BIND8_STATS */ 3921 break; 
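			/* A hard send error: the remaining recvcount-i answers in this
			 * batch are abandoned and, with statistics enabled, accounted
			 * as transmit errors; the per-query buffers are reset below. */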
3922 } 3923 i += sent; 3924 } 3925 for(i=0; i<recvcount; i++) { 3926 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3927 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3928 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3929 } 3930} 3931 3932#ifdef HAVE_SSL 3933/* 3934 * Setup an event for the tcp handler. 3935 */ 3936static void 3937tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3938 int fd, short event) 3939{ 3940 struct timeval timeout; 3941 struct event_base* ev_base; 3942 3943 timeout.tv_sec = data->nsd->tcp_timeout; 3944 timeout.tv_usec = 0L; 3945 3946 ev_base = data->event.ev_base; 3947 event_del(&data->event); 3948 memset(&data->event, 0, sizeof(data->event)); 3949 event_set(&data->event, fd, event, fn, data); 3950 if(event_base_set(ev_base, &data->event) != 0) 3951 log_msg(LOG_ERR, "event base set failed"); 3952 if(event_add(&data->event, &timeout) != 0) 3953 log_msg(LOG_ERR, "event add failed"); 3954} 3955#endif /* HAVE_SSL */ 3956 3957static void 3958cleanup_tcp_handler(struct tcp_handler_data* data) 3959{ 3960 event_del(&data->event); 3961#ifdef HAVE_SSL 3962 if(data->tls) { 3963 SSL_shutdown(data->tls); 3964 SSL_free(data->tls); 3965 data->tls = NULL; 3966 } 3967#endif 3968 data->pp2_header_state = pp2_header_none; 3969 close(data->event.ev_fd); 3970 if(data->prev) 3971 data->prev->next = data->next; 3972 else tcp_active_list = data->next; 3973 if(data->next) 3974 data->next->prev = data->prev; 3975 3976 /* 3977 * Enable the TCP accept handlers when the current number of 3978 * TCP connections is about to drop below the maximum number 3979 * of TCP connections. 3980 */ 3981 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3982 configure_handler_event_types(EV_READ|EV_PERSIST); 3983 if(slowaccept) { 3984 event_del(&slowaccept_event); 3985 slowaccept = 0; 3986 } 3987 } 3988 --data->nsd->current_tcp_count; 3989 assert(data->nsd->current_tcp_count >= 0); 3990 3991 region_destroy(data->region); 3992} 3993 3994/* Read more data into the buffer for tcp read. Pass the amount of additional 3995 * data required. Returns false if nothing needs to be done this event, or 3996 * true if the additional data is in the buffer. */ 3997static int 3998more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos, 3999 size_t add_amount, ssize_t* received) 4000{ 4001 *received = read(fd, bufpos, add_amount); 4002 if (*received == -1) { 4003 if (errno == EAGAIN || errno == EINTR) { 4004 /* 4005 * Read would block, wait until more 4006 * data is available. 4007 */ 4008 return 0; 4009 } else { 4010 char buf[48]; 4011 addr2str(&data->query->remote_addr, buf, sizeof(buf)); 4012#ifdef ECONNRESET 4013 if (verbosity >= 2 || errno != ECONNRESET) 4014#endif /* ECONNRESET */ 4015 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 4016 cleanup_tcp_handler(data); 4017 return 0; 4018 } 4019 } else if (*received == 0) { 4020 /* EOF */ 4021 cleanup_tcp_handler(data); 4022 return 0; 4023 } 4024 return 1; 4025} 4026 4027static void 4028handle_tcp_reading(int fd, short event, void* arg) 4029{ 4030 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4031 ssize_t received; 4032 struct event_base* ev_base; 4033 struct timeval timeout; 4034 uint32_t now = 0; 4035 4036 if ((event & EV_TIMEOUT)) { 4037 /* Connection timed out. 
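		 * The same timeout guards both waiting for the next query on a
		 * kept-open connection and waiting for the remainder of a
		 * partially received message; in either case the connection is
		 * torn down and its region freed by cleanup_tcp_handler().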
*/ 4038 cleanup_tcp_handler(data); 4039 return; 4040 } 4041 4042 if ((data->nsd->tcp_query_count > 0 && 4043 data->query_count >= data->nsd->tcp_query_count) || 4044 (data->query_count > 0 && data->tcp_no_more_queries)) 4045 { 4046 /* No more queries allowed on this tcp connection. */ 4047 cleanup_tcp_handler(data); 4048 return; 4049 } 4050 4051 assert((event & EV_READ)); 4052 4053 if (data->bytes_transmitted == 0 && data->query_needs_reset) { 4054 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 4055 data->query_needs_reset = 0; 4056 } 4057 4058 if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) { 4059 struct pp2_header* header = NULL; 4060 size_t want_read_size = 0; 4061 size_t current_read_size = 0; 4062 if(data->pp2_header_state == pp2_header_none) { 4063 want_read_size = PP2_HEADER_SIZE; 4064 if(buffer_remaining(data->query->packet) < 4065 want_read_size) { 4066 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4067 cleanup_tcp_handler(data); 4068 return; 4069 } 4070 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4071 current_read_size = want_read_size; 4072 if(data->bytes_transmitted < current_read_size) { 4073 if(!more_read_buf_tcp(fd, data, 4074 (void*)buffer_at(data->query->packet, 4075 data->bytes_transmitted), 4076 current_read_size - data->bytes_transmitted, 4077 &received)) 4078 return; 4079 data->bytes_transmitted += received; 4080 buffer_skip(data->query->packet, received); 4081 if(data->bytes_transmitted != current_read_size) 4082 return; 4083 data->pp2_header_state = pp2_header_init; 4084 } 4085 } 4086 if(data->pp2_header_state == pp2_header_init) { 4087 int err; 4088 err = pp2_read_header(buffer_begin(data->query->packet), 4089 buffer_limit(data->query->packet)); 4090 if(err) { 4091 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err))); 4092 cleanup_tcp_handler(data); 4093 return; 4094 } 4095 header = (struct pp2_header*)buffer_begin(data->query->packet); 4096 want_read_size = ntohs(header->len); 4097 if(buffer_limit(data->query->packet) < 4098 PP2_HEADER_SIZE + want_read_size) { 4099 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4100 cleanup_tcp_handler(data); 4101 return; 4102 } 4103 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4104 current_read_size = PP2_HEADER_SIZE + want_read_size; 4105 if(want_read_size == 0) { 4106 /* nothing more to read; header is complete */ 4107 data->pp2_header_state = pp2_header_done; 4108 } else if(data->bytes_transmitted < current_read_size) { 4109 if(!more_read_buf_tcp(fd, data, 4110 (void*)buffer_at(data->query->packet, 4111 data->bytes_transmitted), 4112 current_read_size - data->bytes_transmitted, 4113 &received)) 4114 return; 4115 data->bytes_transmitted += received; 4116 buffer_skip(data->query->packet, received); 4117 if(data->bytes_transmitted != current_read_size) 4118 return; 4119 data->pp2_header_state = pp2_header_done; 4120 } 4121 } 4122 if(data->pp2_header_state != pp2_header_done || !header) { 4123 VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header")); 4124 4125 cleanup_tcp_handler(data); 4126 return; 4127 } 4128 buffer_flip(data->query->packet); 4129 if(!consume_pp2_header(data->query->packet, data->query, 1)) { 4130 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header")); 4131 4132 
cleanup_tcp_handler(data); 4133 return; 4134 } 4135 /* Clear and reset the buffer to read the following 4136 * DNS packet(s). */ 4137 buffer_clear(data->query->packet); 4138 data->bytes_transmitted = 0; 4139 } 4140 4141 /* 4142 * Check if we received the leading packet length bytes yet. 4143 */ 4144 if (data->bytes_transmitted < sizeof(uint16_t)) { 4145 if(!more_read_buf_tcp(fd, data, 4146 (char*) &data->query->tcplen + data->bytes_transmitted, 4147 sizeof(uint16_t) - data->bytes_transmitted, &received)) 4148 return; 4149 data->bytes_transmitted += received; 4150 if (data->bytes_transmitted < sizeof(uint16_t)) { 4151 /* 4152 * Not done with the tcplen yet, wait for more 4153 * data to become available. 4154 */ 4155 return; 4156 } 4157 assert(data->bytes_transmitted == sizeof(uint16_t)); 4158 4159 data->query->tcplen = ntohs(data->query->tcplen); 4160 4161 /* 4162 * Minimum query size is: 4163 * 4164 * Size of the header (12) 4165 * + Root domain name (1) 4166 * + Query class (2) 4167 * + Query type (2) 4168 */ 4169 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 4170 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 4171 cleanup_tcp_handler(data); 4172 return; 4173 } 4174 4175 if (data->query->tcplen > data->query->maxlen) { 4176 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 4177 cleanup_tcp_handler(data); 4178 return; 4179 } 4180 4181 buffer_set_limit(data->query->packet, data->query->tcplen); 4182 } 4183 4184 assert(buffer_remaining(data->query->packet) > 0); 4185 4186 /* Read the (remaining) query data. */ 4187 if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet), 4188 buffer_remaining(data->query->packet), &received)) 4189 return; 4190 data->bytes_transmitted += received; 4191 buffer_skip(data->query->packet, received); 4192 if (buffer_remaining(data->query->packet) > 0) { 4193 /* 4194 * Message not yet complete, wait for more data to 4195 * become available. 4196 */ 4197 return; 4198 } 4199 4200 assert(buffer_position(data->query->packet) == data->query->tcplen); 4201 4202 /* Account... */ 4203#ifdef BIND8_STATS 4204#ifndef INET6 4205 STATUP(data->nsd, ctcp); 4206#else 4207 if (data->query->remote_addr.ss_family == AF_INET) { 4208 STATUP(data->nsd, ctcp); 4209 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4210 STATUP(data->nsd, ctcp6); 4211 } 4212#endif 4213#endif /* BIND8_STATS */ 4214 4215 /* We have a complete query, process it. */ 4216 4217 /* tcp-query-count: handle query counter ++ */ 4218 data->query_count++; 4219 4220 buffer_flip(data->query->packet); 4221#ifdef USE_DNSTAP 4222 /* 4223 * and send TCP-query with found address (local) and client address to dnstap process 4224 */ 4225 log_addr("query from client", &data->query->client_addr); 4226 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 4227 if(verbosity >= 6 && data->query->is_proxied) 4228 log_addr("query via proxy", &data->query->remote_addr); 4229 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4230 data->query->client_addrlen, data->query->tcp, data->query->packet); 4231#endif /* USE_DNSTAP */ 4232 data->query_state = server_process_query(data->nsd, data->query, &now); 4233 if (data->query_state == QUERY_DISCARDED) { 4234 /* Drop the packet and the entire connection... 
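		 * QUERY_DISCARDED means this query must not be answered at all
		 * (for example a malformed packet); on a stream the safest
		 * reaction is to count it as dropped and close the whole
		 * connection.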
*/ 4235 STATUP(data->nsd, dropped); 4236 ZTATUP(data->nsd, data->query->zone, dropped); 4237 cleanup_tcp_handler(data); 4238 return; 4239 } 4240 4241#ifdef BIND8_STATS 4242 if (RCODE(data->query->packet) == RCODE_OK 4243 && !AA(data->query->packet)) 4244 { 4245 STATUP(data->nsd, nona); 4246 ZTATUP(data->nsd, data->query->zone, nona); 4247 } 4248#endif /* BIND8_STATS */ 4249 4250#ifdef USE_ZONE_STATS 4251#ifndef INET6 4252 ZTATUP(data->nsd, data->query->zone, ctcp); 4253#else 4254 if (data->query->remote_addr.ss_family == AF_INET) { 4255 ZTATUP(data->nsd, data->query->zone, ctcp); 4256 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4257 ZTATUP(data->nsd, data->query->zone, ctcp6); 4258 } 4259#endif 4260#endif /* USE_ZONE_STATS */ 4261 4262 query_add_optional(data->query, data->nsd, &now); 4263 4264 /* Switch to the tcp write handler. */ 4265 buffer_flip(data->query->packet); 4266 data->query->tcplen = buffer_remaining(data->query->packet); 4267#ifdef BIND8_STATS 4268 /* Account the rcode & TC... */ 4269 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 4270 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 4271 if (TC(data->query->packet)) { 4272 STATUP(data->nsd, truncated); 4273 ZTATUP(data->nsd, data->query->zone, truncated); 4274 } 4275#endif /* BIND8_STATS */ 4276#ifdef USE_DNSTAP 4277 /* 4278 * sending TCP-response with found (earlier) address (local) and client address to dnstap process 4279 */ 4280 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 4281 log_addr("response to client", &data->query->client_addr); 4282 if(verbosity >= 6 && data->query->is_proxied) 4283 log_addr("response via proxy", &data->query->remote_addr); 4284 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4285 data->query->client_addrlen, data->query->tcp, data->query->packet, 4286 data->query->zone); 4287#endif /* USE_DNSTAP */ 4288 data->bytes_transmitted = 0; 4289 4290 timeout.tv_sec = data->tcp_timeout / 1000; 4291 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 4292 4293 ev_base = data->event.ev_base; 4294 event_del(&data->event); 4295 memset(&data->event, 0, sizeof(data->event)); 4296 event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT, 4297 handle_tcp_writing, data); 4298 if(event_base_set(ev_base, &data->event) != 0) 4299 log_msg(LOG_ERR, "event base set tcpr failed"); 4300 if(event_add(&data->event, &timeout) != 0) 4301 log_msg(LOG_ERR, "event add tcpr failed"); 4302 /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ 4303 handle_tcp_writing(fd, EV_WRITE, data); 4304} 4305 4306static void 4307handle_tcp_writing(int fd, short event, void* arg) 4308{ 4309 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4310 ssize_t sent; 4311 struct query *q = data->query; 4312 struct timeval timeout; 4313 struct event_base* ev_base; 4314 uint32_t now = 0; 4315 4316 if ((event & EV_TIMEOUT)) { 4317 /* Connection timed out. */ 4318 cleanup_tcp_handler(data); 4319 return; 4320 } 4321 4322 assert((event & EV_WRITE)); 4323 4324 if (data->bytes_transmitted < sizeof(q->tcplen)) { 4325 /* Writing the response packet length. 
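		 * DNS over TCP prefixes every message with a two octet length
		 * field in network byte order (RFC 1035, section 4.2.2):
		 *
		 *   +--------+--------+------------------------------ - -
		 *   |  length (16bit) |  DNS message of <length> octets
		 *   +--------+--------+------------------------------ - -
		 *
		 * With writev() the prefix and the message body go to the kernel
		 * in a single call; without it the prefix is written first and
		 * the body follows from the write() further below.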
*/ 4326 uint16_t n_tcplen = htons(q->tcplen); 4327#ifdef HAVE_WRITEV 4328 struct iovec iov[2]; 4329 iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted; 4330 iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted; 4331 iov[1].iov_base = buffer_begin(q->packet); 4332 iov[1].iov_len = buffer_limit(q->packet); 4333 sent = writev(fd, iov, 2); 4334#else /* HAVE_WRITEV */ 4335 sent = write(fd, 4336 (const char *) &n_tcplen + data->bytes_transmitted, 4337 sizeof(n_tcplen) - data->bytes_transmitted); 4338#endif /* HAVE_WRITEV */ 4339 if (sent == -1) { 4340 if (errno == EAGAIN || errno == EINTR) { 4341 /* 4342 * Write would block, wait until 4343 * socket becomes writable again. 4344 */ 4345 return; 4346 } else { 4347#ifdef ECONNRESET 4348 if(verbosity >= 2 || errno != ECONNRESET) 4349#endif /* ECONNRESET */ 4350#ifdef EPIPE 4351 if(verbosity >= 2 || errno != EPIPE) 4352#endif /* EPIPE 'broken pipe' */ 4353 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 4354 cleanup_tcp_handler(data); 4355 return; 4356 } 4357 } 4358 4359 data->bytes_transmitted += sent; 4360 if (data->bytes_transmitted < sizeof(q->tcplen)) { 4361 /* 4362 * Writing not complete, wait until socket 4363 * becomes writable again. 4364 */ 4365 return; 4366 } 4367 4368#ifdef HAVE_WRITEV 4369 sent -= sizeof(n_tcplen); 4370 /* handle potential 'packet done' code */ 4371 goto packet_could_be_done; 4372#endif 4373 } 4374 4375 sent = write(fd, 4376 buffer_current(q->packet), 4377 buffer_remaining(q->packet)); 4378 if (sent == -1) { 4379 if (errno == EAGAIN || errno == EINTR) { 4380 /* 4381 * Write would block, wait until 4382 * socket becomes writable again. 4383 */ 4384 return; 4385 } else { 4386#ifdef ECONNRESET 4387 if(verbosity >= 2 || errno != ECONNRESET) 4388#endif /* ECONNRESET */ 4389#ifdef EPIPE 4390 if(verbosity >= 2 || errno != EPIPE) 4391#endif /* EPIPE 'broken pipe' */ 4392 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 4393 cleanup_tcp_handler(data); 4394 return; 4395 } 4396 } 4397 4398 data->bytes_transmitted += sent; 4399#ifdef HAVE_WRITEV 4400 packet_could_be_done: 4401#endif 4402 buffer_skip(q->packet, sent); 4403 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 4404 /* 4405 * Still more data to write when socket becomes 4406 * writable again. 4407 */ 4408 return; 4409 } 4410 4411 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 4412 4413 if (data->query_state == QUERY_IN_AXFR || 4414 data->query_state == QUERY_IN_IXFR) { 4415 /* Continue processing AXFR and writing back results. */ 4416 buffer_clear(q->packet); 4417 if(data->query_state == QUERY_IN_AXFR) 4418 data->query_state = query_axfr(data->nsd, q, 0); 4419 else data->query_state = query_ixfr(data->nsd, q); 4420 if (data->query_state != QUERY_PROCESSED) { 4421 query_add_optional(data->query, data->nsd, &now); 4422 4423 /* Reset data. */ 4424 buffer_flip(q->packet); 4425 q->tcplen = buffer_remaining(q->packet); 4426 data->bytes_transmitted = 0; 4427 /* Reset timeout. 
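			 * Each additional AXFR/IXFR batch re-arms the write event
			 * with a fresh tcp_timeout, so a long zone transfer is only
			 * dropped when a single batch stalls for the whole timeout,
			 * not when the transfer as a whole takes longer than that.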
*/ 4428 timeout.tv_sec = data->tcp_timeout / 1000; 4429 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 4430 ev_base = data->event.ev_base; 4431 event_del(&data->event); 4432 memset(&data->event, 0, sizeof(data->event)); 4433 event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT, 4434 handle_tcp_writing, data); 4435 if(event_base_set(ev_base, &data->event) != 0) 4436 log_msg(LOG_ERR, "event base set tcpw failed"); 4437 if(event_add(&data->event, &timeout) != 0) 4438 log_msg(LOG_ERR, "event add tcpw failed"); 4439 4440 /* 4441 * Write data if/when the socket is writable 4442 * again. 4443 */ 4444 return; 4445 } 4446 } 4447 4448 /* 4449 * Done sending, wait for the next request to arrive on the 4450 * TCP socket by installing the TCP read handler. 4451 */ 4452 if ((data->nsd->tcp_query_count > 0 && 4453 data->query_count >= data->nsd->tcp_query_count) || 4454 data->tcp_no_more_queries) { 4455 4456 (void) shutdown(fd, SHUT_WR); 4457 } 4458 4459 data->bytes_transmitted = 0; 4460 data->query_needs_reset = 1; 4461 4462 timeout.tv_sec = data->tcp_timeout / 1000; 4463 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 4464 ev_base = data->event.ev_base; 4465 event_del(&data->event); 4466 memset(&data->event, 0, sizeof(data->event)); 4467 event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, 4468 handle_tcp_reading, data); 4469 if(event_base_set(ev_base, &data->event) != 0) 4470 log_msg(LOG_ERR, "event base set tcpw failed"); 4471 if(event_add(&data->event, &timeout) != 0) 4472 log_msg(LOG_ERR, "event add tcpw failed"); 4473} 4474 4475#ifdef HAVE_SSL 4476/** create SSL object and associate fd */ 4477static SSL* 4478incoming_ssl_fd(SSL_CTX* ctx, int fd) 4479{ 4480 SSL* ssl = SSL_new((SSL_CTX*)ctx); 4481 if(!ssl) { 4482 log_crypto_err("could not SSL_new"); 4483 return NULL; 4484 } 4485 SSL_set_accept_state(ssl); 4486 (void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY); 4487 if(!SSL_set_fd(ssl, fd)) { 4488 log_crypto_err("could not SSL_set_fd"); 4489 SSL_free(ssl); 4490 return NULL; 4491 } 4492 return ssl; 4493} 4494 4495/** TLS handshake to upgrade TCP connection */ 4496static int 4497tls_handshake(struct tcp_handler_data* data, int fd, int writing) 4498{ 4499 int r; 4500 if(data->shake_state == tls_hs_read_event) { 4501 /* read condition satisfied back to writing */ 4502 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 4503 data->shake_state = tls_hs_none; 4504 return 1; 4505 } 4506 if(data->shake_state == tls_hs_write_event) { 4507 /* write condition satisfied back to reading */ 4508 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 4509 data->shake_state = tls_hs_none; 4510 return 1; 4511 } 4512 4513 /* (continue to) setup the TLS connection */ 4514 ERR_clear_error(); 4515 r = SSL_do_handshake(data->tls); 4516 4517 if(r != 1) { 4518 int want = SSL_get_error(data->tls, r); 4519 if(want == SSL_ERROR_WANT_READ) { 4520 if(data->shake_state == tls_hs_read) { 4521 /* try again later */ 4522 return 1; 4523 } 4524 data->shake_state = tls_hs_read; 4525 /* switch back to reading mode */ 4526 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 4527 return 1; 4528 } else if(want == SSL_ERROR_WANT_WRITE) { 4529 if(data->shake_state == tls_hs_write) { 4530 /* try again later */ 4531 return 1; 4532 } 4533 data->shake_state = tls_hs_write; 4534 /* switch back to writing mode */ 4535 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 4536 return 1; 4537 } 
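		/* Neither WANT_READ nor WANT_WRITE: the handshake failed for
		 * real, or the peer closed the connection; handled below. */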
else { 4538 if(r == 0) 4539 VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely")); 4540 else { 4541 unsigned long err = ERR_get_error(); 4542 if(!squelch_err_ssl_handshake(err)) { 4543 char a[64], s[256]; 4544 addr2str(&data->query->remote_addr, a, sizeof(a)); 4545 snprintf(s, sizeof(s), "TLS handshake failed from %s", a); 4546 log_crypto_from_err(s, err); 4547 } 4548 } 4549 cleanup_tcp_handler(data); 4550 return 0; 4551 } 4552 } 4553 4554 /* Use to log successful upgrade for testing - could be removed*/ 4555 VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded.")); 4556 /* set back to the event we need to have when reading (or writing) */ 4557 if(data->shake_state == tls_hs_read && writing) { 4558 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 4559 } else if(data->shake_state == tls_hs_write && !writing) { 4560 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 4561 } 4562 data->shake_state = tls_hs_none; 4563 return 1; 4564} 4565 4566/* Read more data into the buffer for tls read. Pass the amount of additional 4567 * data required. Returns false if nothing needs to be done this event, or 4568 * true if the additional data is in the buffer. */ 4569static int 4570more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos, 4571 size_t add_amount, ssize_t* received) 4572{ 4573 ERR_clear_error(); 4574 if((*received=SSL_read(data->tls, bufpos, add_amount)) <= 0) { 4575 int want = SSL_get_error(data->tls, *received); 4576 if(want == SSL_ERROR_ZERO_RETURN) { 4577 cleanup_tcp_handler(data); 4578 return 0; /* shutdown, closed */ 4579 } else if(want == SSL_ERROR_WANT_READ) { 4580 /* wants to be called again */ 4581 return 0; 4582 } 4583 else if(want == SSL_ERROR_WANT_WRITE) { 4584 /* switch to writing */ 4585 data->shake_state = tls_hs_write_event; 4586 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4587 return 0; 4588 } 4589 cleanup_tcp_handler(data); 4590 log_crypto_err("could not SSL_read"); 4591 return 0; 4592 } 4593 return 1; 4594} 4595 4596/** handle TLS reading of incoming query */ 4597static void 4598handle_tls_reading(int fd, short event, void* arg) 4599{ 4600 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4601 ssize_t received; 4602 uint32_t now = 0; 4603 4604 if ((event & EV_TIMEOUT)) { 4605 /* Connection timed out. */ 4606 cleanup_tcp_handler(data); 4607 return; 4608 } 4609 4610 if ((data->nsd->tcp_query_count > 0 && 4611 data->query_count >= data->nsd->tcp_query_count) || 4612 (data->query_count > 0 && data->tcp_no_more_queries)) 4613 { 4614 /* No more queries allowed on this tcp connection. 
*/ 4615 cleanup_tcp_handler(data); 4616 return; 4617 } 4618 4619 assert((event & EV_READ)); 4620 4621 if (data->bytes_transmitted == 0 && data->query_needs_reset) { 4622 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 4623 data->query_needs_reset = 0; 4624 } 4625 4626 if(data->shake_state != tls_hs_none) { 4627 if(!tls_handshake(data, fd, 0)) 4628 return; 4629 if(data->shake_state != tls_hs_none) 4630 return; 4631 } 4632 4633 if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) { 4634 struct pp2_header* header = NULL; 4635 size_t want_read_size = 0; 4636 size_t current_read_size = 0; 4637 if(data->pp2_header_state == pp2_header_none) { 4638 want_read_size = PP2_HEADER_SIZE; 4639 if(buffer_remaining(data->query->packet) < 4640 want_read_size) { 4641 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4642 cleanup_tcp_handler(data); 4643 return; 4644 } 4645 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4646 current_read_size = want_read_size; 4647 if(data->bytes_transmitted < current_read_size) { 4648 if(!more_read_buf_tls(fd, data, 4649 buffer_at(data->query->packet, 4650 data->bytes_transmitted), 4651 current_read_size - data->bytes_transmitted, 4652 &received)) 4653 return; 4654 data->bytes_transmitted += received; 4655 buffer_skip(data->query->packet, received); 4656 if(data->bytes_transmitted != current_read_size) 4657 return; 4658 data->pp2_header_state = pp2_header_init; 4659 } 4660 } 4661 if(data->pp2_header_state == pp2_header_init) { 4662 int err; 4663 err = pp2_read_header(buffer_begin(data->query->packet), 4664 buffer_limit(data->query->packet)); 4665 if(err) { 4666 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err))); 4667 cleanup_tcp_handler(data); 4668 return; 4669 } 4670 header = (struct pp2_header*)buffer_begin(data->query->packet); 4671 want_read_size = ntohs(header->len); 4672 if(buffer_limit(data->query->packet) < 4673 PP2_HEADER_SIZE + want_read_size) { 4674 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4675 cleanup_tcp_handler(data); 4676 return; 4677 } 4678 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4679 current_read_size = PP2_HEADER_SIZE + want_read_size; 4680 if(want_read_size == 0) { 4681 /* nothing more to read; header is complete */ 4682 data->pp2_header_state = pp2_header_done; 4683 } else if(data->bytes_transmitted < current_read_size) { 4684 if(!more_read_buf_tls(fd, data, 4685 buffer_at(data->query->packet, 4686 data->bytes_transmitted), 4687 current_read_size - data->bytes_transmitted, 4688 &received)) 4689 return; 4690 data->bytes_transmitted += received; 4691 buffer_skip(data->query->packet, received); 4692 if(data->bytes_transmitted != current_read_size) 4693 return; 4694 data->pp2_header_state = pp2_header_done; 4695 } 4696 } 4697 if(data->pp2_header_state != pp2_header_done || !header) { 4698 VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header")); 4699 cleanup_tcp_handler(data); 4700 return; 4701 } 4702 buffer_flip(data->query->packet); 4703 if(!consume_pp2_header(data->query->packet, data->query, 1)) { 4704 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header")); 4705 cleanup_tcp_handler(data); 4706 return; 4707 } 4708 /* Clear and reset the buffer to read the following 4709 * DNS packet(s). 
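		 * bytes_transmitted starts over as well, so the next reads first
		 * collect the two byte length prefix of the DNS message.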
*/ 4710 buffer_clear(data->query->packet); 4711 data->bytes_transmitted = 0; 4712 } 4713 /* 4714 * Check if we received the leading packet length bytes yet. 4715 */ 4716 if(data->bytes_transmitted < sizeof(uint16_t)) { 4717 if(!more_read_buf_tls(fd, data, 4718 (char *) &data->query->tcplen + data->bytes_transmitted, 4719 sizeof(uint16_t) - data->bytes_transmitted, &received)) 4720 return; 4721 data->bytes_transmitted += received; 4722 if (data->bytes_transmitted < sizeof(uint16_t)) { 4723 /* 4724 * Not done with the tcplen yet, wait for more 4725 * data to become available. 4726 */ 4727 return; 4728 } 4729 4730 assert(data->bytes_transmitted == sizeof(uint16_t)); 4731 4732 data->query->tcplen = ntohs(data->query->tcplen); 4733 4734 /* 4735 * Minimum query size is: 4736 * 4737 * Size of the header (12) 4738 * + Root domain name (1) 4739 * + Query class (2) 4740 * + Query type (2) 4741 */ 4742 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 4743 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 4744 cleanup_tcp_handler(data); 4745 return; 4746 } 4747 4748 if (data->query->tcplen > data->query->maxlen) { 4749 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 4750 cleanup_tcp_handler(data); 4751 return; 4752 } 4753 4754 buffer_set_limit(data->query->packet, data->query->tcplen); 4755 } 4756 4757 assert(buffer_remaining(data->query->packet) > 0); 4758 4759 /* Read the (remaining) query data. */ 4760 if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet), 4761 buffer_remaining(data->query->packet), &received)) 4762 return; 4763 data->bytes_transmitted += received; 4764 buffer_skip(data->query->packet, received); 4765 if (buffer_remaining(data->query->packet) > 0) { 4766 /* 4767 * Message not yet complete, wait for more data to 4768 * become available. 4769 */ 4770 return; 4771 } 4772 4773 assert(buffer_position(data->query->packet) == data->query->tcplen); 4774 4775 /* Account... */ 4776#ifndef INET6 4777 STATUP(data->nsd, ctls); 4778#else 4779 if (data->query->remote_addr.ss_family == AF_INET) { 4780 STATUP(data->nsd, ctls); 4781 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4782 STATUP(data->nsd, ctls6); 4783 } 4784#endif 4785 4786 /* We have a complete query, process it. */ 4787 4788 /* tcp-query-count: handle query counter ++ */ 4789 data->query_count++; 4790 4791 buffer_flip(data->query->packet); 4792#ifdef USE_DNSTAP 4793 /* 4794 * and send TCP-query with found address (local) and client address to dnstap process 4795 */ 4796 log_addr("query from client", &data->query->client_addr); 4797 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 4798 if(verbosity >= 6 && data->query->is_proxied) 4799 log_addr("query via proxy", &data->query->remote_addr); 4800 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4801 data->query->client_addrlen, data->query->tcp, data->query->packet); 4802#endif /* USE_DNSTAP */ 4803 data->query_state = server_process_query(data->nsd, data->query, &now); 4804 if (data->query_state == QUERY_DISCARDED) { 4805 /* Drop the packet and the entire connection... 
*/ 4806 STATUP(data->nsd, dropped); 4807 ZTATUP(data->nsd, data->query->zone, dropped); 4808 cleanup_tcp_handler(data); 4809 return; 4810 } 4811 4812#ifdef BIND8_STATS 4813 if (RCODE(data->query->packet) == RCODE_OK 4814 && !AA(data->query->packet)) 4815 { 4816 STATUP(data->nsd, nona); 4817 ZTATUP(data->nsd, data->query->zone, nona); 4818 } 4819#endif /* BIND8_STATS */ 4820 4821#ifdef USE_ZONE_STATS 4822#ifndef INET6 4823 ZTATUP(data->nsd, data->query->zone, ctls); 4824#else 4825 if (data->query->remote_addr.ss_family == AF_INET) { 4826 ZTATUP(data->nsd, data->query->zone, ctls); 4827 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4828 ZTATUP(data->nsd, data->query->zone, ctls6); 4829 } 4830#endif 4831#endif /* USE_ZONE_STATS */ 4832 4833 query_add_optional(data->query, data->nsd, &now); 4834 4835 /* Switch to the tcp write handler. */ 4836 buffer_flip(data->query->packet); 4837 data->query->tcplen = buffer_remaining(data->query->packet); 4838#ifdef BIND8_STATS 4839 /* Account the rcode & TC... */ 4840 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 4841 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 4842 if (TC(data->query->packet)) { 4843 STATUP(data->nsd, truncated); 4844 ZTATUP(data->nsd, data->query->zone, truncated); 4845 } 4846#endif /* BIND8_STATS */ 4847#ifdef USE_DNSTAP 4848 /* 4849 * sending TCP-response with found (earlier) address (local) and client address to dnstap process 4850 */ 4851 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 4852 log_addr("response to client", &data->query->client_addr); 4853 if(verbosity >= 6 && data->query->is_proxied) 4854 log_addr("response via proxy", &data->query->remote_addr); 4855 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4856 data->query->client_addrlen, data->query->tcp, data->query->packet, 4857 data->query->zone); 4858#endif /* USE_DNSTAP */ 4859 data->bytes_transmitted = 0; 4860 4861 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4862 4863 /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ 4864 handle_tls_writing(fd, EV_WRITE, data); 4865} 4866 4867/** handle TLS writing of outgoing response */ 4868static void 4869handle_tls_writing(int fd, short event, void* arg) 4870{ 4871 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4872 ssize_t sent; 4873 struct query *q = data->query; 4874 /* static variable that holds reassembly buffer used to put the 4875 * TCP length in front of the packet, like writev. */ 4876 static buffer_type* global_tls_temp_buffer = NULL; 4877 buffer_type* write_buffer; 4878 uint32_t now = 0; 4879 4880 if ((event & EV_TIMEOUT)) { 4881 /* Connection timed out. */ 4882 cleanup_tcp_handler(data); 4883 return; 4884 } 4885 4886 assert((event & EV_WRITE)); 4887 4888 if(data->shake_state != tls_hs_none) { 4889 if(!tls_handshake(data, fd, 1)) 4890 return; 4891 if(data->shake_state != tls_hs_none) 4892 return; 4893 } 4894 4895 (void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE); 4896 4897 /* If we are writing the start of a message, we must include the length 4898 * this is done with a copy into write_buffer. 
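	 * Unlike the plain TCP path there is no scatter/gather interface for
	 * SSL_write(), so for the first write the two byte length and the
	 * start of the packet are assembled in one shared temporary buffer
	 * (allocated once from nsd.region); after that, remaining partial
	 * writes are sent straight from q->packet.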
*/ 4899 write_buffer = NULL; 4900 if (data->bytes_transmitted == 0) { 4901 if(!global_tls_temp_buffer) { 4902 /* gets deallocated when nsd shuts down from 4903 * nsd.region */ 4904 global_tls_temp_buffer = buffer_create(nsd.region, 4905 QIOBUFSZ + sizeof(q->tcplen)); 4906 if (!global_tls_temp_buffer) { 4907 return; 4908 } 4909 } 4910 write_buffer = global_tls_temp_buffer; 4911 buffer_clear(write_buffer); 4912 buffer_write_u16(write_buffer, q->tcplen); 4913 buffer_write(write_buffer, buffer_current(q->packet), 4914 (int)buffer_remaining(q->packet)); 4915 buffer_flip(write_buffer); 4916 } else { 4917 write_buffer = q->packet; 4918 } 4919 4920 /* Write the response */ 4921 ERR_clear_error(); 4922 sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer)); 4923 if(sent <= 0) { 4924 int want = SSL_get_error(data->tls, sent); 4925 if(want == SSL_ERROR_ZERO_RETURN) { 4926 cleanup_tcp_handler(data); 4927 /* closed */ 4928 } else if(want == SSL_ERROR_WANT_READ) { 4929 /* switch back to reading */ 4930 data->shake_state = tls_hs_read_event; 4931 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 4932 } else if(want != SSL_ERROR_WANT_WRITE) { 4933 cleanup_tcp_handler(data); 4934 log_crypto_err("could not SSL_write"); 4935 } 4936 return; 4937 } 4938 4939 buffer_skip(write_buffer, sent); 4940 if(buffer_remaining(write_buffer) != 0) { 4941 /* If not all sent, sync up the real buffer if it wasn't used.*/ 4942 if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) { 4943 buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen)); 4944 } 4945 } 4946 4947 data->bytes_transmitted += sent; 4948 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 4949 /* 4950 * Still more data to write when socket becomes 4951 * writable again. 4952 */ 4953 return; 4954 } 4955 4956 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 4957 4958 if (data->query_state == QUERY_IN_AXFR || 4959 data->query_state == QUERY_IN_IXFR) { 4960 /* Continue processing AXFR and writing back results. */ 4961 buffer_clear(q->packet); 4962 if(data->query_state == QUERY_IN_AXFR) 4963 data->query_state = query_axfr(data->nsd, q, 0); 4964 else data->query_state = query_ixfr(data->nsd, q); 4965 if (data->query_state != QUERY_PROCESSED) { 4966 query_add_optional(data->query, data->nsd, &now); 4967 4968 /* Reset data. */ 4969 buffer_flip(q->packet); 4970 q->tcplen = buffer_remaining(q->packet); 4971 data->bytes_transmitted = 0; 4972 /* Reset to writing mode. */ 4973 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4974 4975 /* 4976 * Write data if/when the socket is writable 4977 * again. 4978 */ 4979 return; 4980 } 4981 } 4982 4983 /* 4984 * Done sending, wait for the next request to arrive on the 4985 * TCP socket by installing the TCP read handler. 
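	 * If the query limit for this connection has been reached, only the
	 * write direction is shut down, so the client still receives the
	 * final answer and then sees end of stream.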
4986 */ 4987 if ((data->nsd->tcp_query_count > 0 && 4988 data->query_count >= data->nsd->tcp_query_count) || 4989 data->tcp_no_more_queries) { 4990 4991 (void) shutdown(fd, SHUT_WR); 4992 } 4993 4994 data->bytes_transmitted = 0; 4995 data->query_needs_reset = 1; 4996 4997 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 4998} 4999#endif 5000 5001static void 5002handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event), 5003 void* ATTR_UNUSED(arg)) 5004{ 5005 if(slowaccept) { 5006 configure_handler_event_types(EV_PERSIST | EV_READ); 5007 slowaccept = 0; 5008 } 5009} 5010 5011static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) 5012{ 5013#ifndef HAVE_ACCEPT4 5014 int s = accept(fd, addr, addrlen); 5015 if (s != -1) { 5016 if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) { 5017 log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno)); 5018 close(s); 5019 s = -1; 5020 errno=EINTR; /* stop error printout as error in accept4 5021 by setting this errno, it omits printout, in 5022 later code that calls nsd_accept4 */ 5023 } 5024 } 5025 return s; 5026#else 5027 return accept4(fd, addr, addrlen, SOCK_NONBLOCK); 5028#endif /* HAVE_ACCEPT4 */ 5029} 5030 5031/* 5032 * Handle an incoming TCP connection. The connection is accepted and 5033 * a new TCP reader event handler is added. The TCP handler 5034 * is responsible for cleanup when the connection is closed. 5035 */ 5036static void 5037handle_tcp_accept(int fd, short event, void* arg) 5038{ 5039 struct tcp_accept_handler_data *data 5040 = (struct tcp_accept_handler_data *) arg; 5041 int s; 5042 int reject = 0; 5043 struct tcp_handler_data *tcp_data; 5044 region_type *tcp_region; 5045#ifdef INET6 5046 struct sockaddr_storage addr; 5047#else 5048 struct sockaddr_in addr; 5049#endif 5050 socklen_t addrlen; 5051 struct timeval timeout; 5052 5053 if (!(event & EV_READ)) { 5054 return; 5055 } 5056 5057 if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) { 5058 reject = data->nsd->options->tcp_reject_overflow; 5059 if (!reject) { 5060 return; 5061 } 5062 } 5063 5064 /* Accept it... */ 5065 addrlen = sizeof(addr); 5066 s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen); 5067 if (s == -1) { 5068 /** 5069 * EMFILE and ENFILE is a signal that the limit of open 5070 * file descriptors has been reached. Pause accept(). 5071 * EINTR is a signal interrupt. The others are various OS ways 5072 * of saying that the client has closed the connection. 
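		 * On EMFILE/ENFILE the accept handlers are switched off and a
		 * one-shot timer re-enables them after SLOW_ACCEPT_TIMEOUT, so
		 * the server does not spin on an accept() that cannot succeed
		 * until some file descriptors have been released.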
5073 */ 5074 if (errno == EMFILE || errno == ENFILE) { 5075 if (!slowaccept) { 5076 /* disable accept events */ 5077 struct timeval tv; 5078 configure_handler_event_types(0); 5079 tv.tv_sec = SLOW_ACCEPT_TIMEOUT; 5080 tv.tv_usec = 0L; 5081 memset(&slowaccept_event, 0, 5082 sizeof(slowaccept_event)); 5083 event_set(&slowaccept_event, -1, EV_TIMEOUT, 5084 handle_slowaccept_timeout, NULL); 5085 (void)event_base_set(data->event.ev_base, 5086 &slowaccept_event); 5087 (void)event_add(&slowaccept_event, &tv); 5088 slowaccept = 1; 5089 /* We don't want to spam the logs here */ 5090 } 5091 } else if (errno != EINTR 5092 && errno != EWOULDBLOCK 5093#ifdef ECONNABORTED 5094 && errno != ECONNABORTED 5095#endif /* ECONNABORTED */ 5096#ifdef EPROTO 5097 && errno != EPROTO 5098#endif /* EPROTO */ 5099 ) { 5100 log_msg(LOG_ERR, "accept failed: %s", strerror(errno)); 5101 } 5102 return; 5103 } 5104 5105 if (reject) { 5106 shutdown(s, SHUT_RDWR); 5107 close(s); 5108 return; 5109 } 5110 5111 /* 5112 * This region is deallocated when the TCP connection is 5113 * closed by the TCP handler. 5114 */ 5115 tcp_region = region_create(xalloc, free); 5116 tcp_data = (struct tcp_handler_data *) region_alloc( 5117 tcp_region, sizeof(struct tcp_handler_data)); 5118 tcp_data->region = tcp_region; 5119 tcp_data->query = query_create(tcp_region, compressed_dname_offsets, 5120 compression_table_size, compressed_dnames); 5121 tcp_data->nsd = data->nsd; 5122 tcp_data->query_count = 0; 5123#ifdef HAVE_SSL 5124 tcp_data->shake_state = tls_hs_none; 5125 tcp_data->tls = NULL; 5126#endif 5127 tcp_data->query_needs_reset = 1; 5128 tcp_data->pp2_enabled = data->pp2_enabled; 5129 tcp_data->pp2_header_state = pp2_header_none; 5130 tcp_data->prev = NULL; 5131 tcp_data->next = NULL; 5132 5133 tcp_data->query_state = QUERY_PROCESSED; 5134 tcp_data->bytes_transmitted = 0; 5135 memcpy(&tcp_data->query->remote_addr, &addr, addrlen); 5136 tcp_data->query->remote_addrlen = addrlen; 5137 /* Copy remote_address to client_address. 5138 * Simplest way/time for streams to do that. 
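	 * When PROXYv2 is enabled on this socket, client_addr is later
	 * overwritten with the proxied source address from the header,
	 * while remote_addr keeps the address of the proxy itself.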
*/ 5139 memcpy(&tcp_data->query->client_addr, &addr, addrlen); 5140 tcp_data->query->client_addrlen = addrlen; 5141 tcp_data->query->is_proxied = 0; 5142 5143 tcp_data->tcp_no_more_queries = 0; 5144 tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000; 5145 if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) { 5146 /* very busy, give smaller timeout */ 5147 tcp_data->tcp_timeout = 200; 5148 } 5149 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5150 timeout.tv_sec = tcp_data->tcp_timeout / 1000; 5151 timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000; 5152 5153#ifdef USE_DNSTAP 5154 /* save the address of the connection */ 5155 tcp_data->socket = data->socket; 5156#endif /* USE_DNSTAP */ 5157 5158#ifdef HAVE_SSL 5159 if (data->tls_accept) { 5160 tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s); 5161 if(!tcp_data->tls) { 5162 close(s); 5163 return; 5164 } 5165 tcp_data->shake_state = tls_hs_read; 5166 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5167 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 5168 handle_tls_reading, tcp_data); 5169 } else { 5170#endif 5171 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5172 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 5173 handle_tcp_reading, tcp_data); 5174#ifdef HAVE_SSL 5175 } 5176#endif 5177 if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) { 5178 log_msg(LOG_ERR, "cannot set tcp event base"); 5179 close(s); 5180 region_destroy(tcp_region); 5181 return; 5182 } 5183 if(event_add(&tcp_data->event, &timeout) != 0) { 5184 log_msg(LOG_ERR, "cannot add tcp to event base"); 5185 close(s); 5186 region_destroy(tcp_region); 5187 return; 5188 } 5189 if(tcp_active_list) { 5190 tcp_active_list->prev = tcp_data; 5191 tcp_data->next = tcp_active_list; 5192 } 5193 tcp_active_list = tcp_data; 5194 5195 /* 5196 * Keep track of the total number of TCP handlers installed so 5197 * we can stop accepting connections when the maximum number 5198 * of simultaneous TCP connections is reached. 5199 * 5200 * If tcp-reject-overflow is enabled, however, then we do not 5201 * change the handler event type; we keep it as-is and accept 5202 * overflow TCP connections only so that we can forcibly kill 5203 * them off. 
5204 */ 5205 ++data->nsd->current_tcp_count; 5206 if (!data->nsd->options->tcp_reject_overflow && 5207 data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) 5208 { 5209 configure_handler_event_types(0); 5210 } 5211} 5212 5213static void 5214send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout) 5215{ 5216 size_t i; 5217 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 5218 for (i = 0; i < nsd->child_count; ++i) { 5219 if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) { 5220 if (write(nsd->children[i].child_fd, 5221 &command, 5222 sizeof(command)) == -1) 5223 { 5224 if(errno != EAGAIN && errno != EINTR) 5225 log_msg(LOG_ERR, "problems sending command %d to server %d: %s", 5226 (int) command, 5227 (int) nsd->children[i].pid, 5228 strerror(errno)); 5229 } else if (timeout > 0) { 5230 (void)block_read(NULL, 5231 nsd->children[i].child_fd, 5232 &command, sizeof(command), timeout); 5233 } 5234 fsync(nsd->children[i].child_fd); 5235 close(nsd->children[i].child_fd); 5236 nsd->children[i].child_fd = -1; 5237 } 5238 } 5239} 5240 5241static void 5242send_children_quit(struct nsd* nsd) 5243{ 5244 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit")); 5245 send_children_command(nsd, NSD_QUIT, 0); 5246} 5247 5248static void 5249send_children_quit_and_wait(struct nsd* nsd) 5250{ 5251 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait")); 5252 send_children_command(nsd, NSD_QUIT_CHILD, 3); 5253} 5254 5255#ifdef BIND8_STATS 5256static void 5257set_children_stats(struct nsd* nsd) 5258{ 5259 size_t i; 5260 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 5261 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children")); 5262 for (i = 0; i < nsd->child_count; ++i) { 5263 nsd->children[i].need_to_send_STATS = 1; 5264 nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE; 5265 } 5266} 5267#endif /* BIND8_STATS */ 5268 5269static void 5270configure_handler_event_types(short event_types) 5271{ 5272 size_t i; 5273 5274 for (i = 0; i < tcp_accept_handler_count; ++i) { 5275 struct event* handler = &tcp_accept_handlers[i].event; 5276 if(event_types) { 5277 /* reassign */ 5278 int fd = handler->ev_fd; 5279 struct event_base* base = handler->ev_base; 5280 if(tcp_accept_handlers[i].event_added) 5281 event_del(handler); 5282 memset(handler, 0, sizeof(*handler)); 5283 event_set(handler, fd, event_types, 5284 handle_tcp_accept, &tcp_accept_handlers[i]); 5285 if(event_base_set(base, handler) != 0) 5286 log_msg(LOG_ERR, "conhand: cannot event_base"); 5287 if(event_add(handler, NULL) != 0) 5288 log_msg(LOG_ERR, "conhand: cannot event_add"); 5289 tcp_accept_handlers[i].event_added = 1; 5290 } else { 5291 /* remove */ 5292 if(tcp_accept_handlers[i].event_added) { 5293 event_del(handler); 5294 tcp_accept_handlers[i].event_added = 0; 5295 } 5296 } 5297 } 5298} 5299