listen_dnsport.c revision 356345
1/*
2 * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3 *
4 * Copyright (c) 2007, NLnet Labs. All rights reserved.
5 *
6 * This software is open source.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * Redistributions of source code must retain the above copyright notice,
13 * this list of conditions and the following disclaimer.
14 *
15 * Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 *
19 * Neither the name of the NLNET LABS nor the names of its contributors may
20 * be used to endorse or promote products derived from this software without
21 * specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/**
37 * \file
38 *
39 * This file has functions to get queries from clients.
40 */
41#include "config.h"
42#ifdef HAVE_SYS_TYPES_H
43#  include <sys/types.h>
44#endif
45#include <sys/time.h>
46#ifdef USE_TCP_FASTOPEN
47#include <netinet/tcp.h>
48#endif
49#include "services/listen_dnsport.h"
50#include "services/outside_network.h"
51#include "util/netevent.h"
52#include "util/log.h"
53#include "util/config_file.h"
54#include "util/net_help.h"
55#include "sldns/sbuffer.h"
56#include "services/mesh.h"
57#include "util/fptr_wlist.h"
58#include "util/locks.h"
59
60#ifdef HAVE_NETDB_H
61#include <netdb.h>
62#endif
63#include <fcntl.h>
64
65#ifdef HAVE_SYS_UN_H
66#include <sys/un.h>
67#endif
68
69#ifdef HAVE_SYSTEMD
70#include <systemd/sd-daemon.h>
71#endif
72
/** number of queued TCP connections for listen(); used as the backlog
 * argument of the listen() calls in this file */
#define TCP_BACKLOG 256

/** number of simultaneous requests a client can have */
#define TCP_MAX_REQ_SIMULTANEOUS 32

#ifndef THREADS_DISABLED
/** lock on the counter of stream buffer memory (only present when
 * threads are enabled) */
static lock_basic_type stream_wait_count_lock;
#endif
/** size (in bytes) of stream wait buffers, starts at zero */
static size_t stream_wait_count = 0;
/** is the lock initialised for stream wait buffers, starts as false */
static int stream_wait_lock_inited = 0;
87
88/**
89 * Debug print of the getaddrinfo returned address.
90 * @param addr: the address returned.
91 */
92static void
93verbose_print_addr(struct addrinfo *addr)
94{
95	if(verbosity >= VERB_ALGO) {
96		char buf[100];
97		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
98#ifdef INET6
99		if(addr->ai_family == AF_INET6)
100			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
101				sin6_addr;
102#endif /* INET6 */
103		if(inet_ntop(addr->ai_family, sinaddr, buf,
104			(socklen_t)sizeof(buf)) == 0) {
105			(void)strlcpy(buf, "(null)", sizeof(buf));
106		}
107		buf[sizeof(buf)-1] = 0;
108		verbose(VERB_ALGO, "creating %s%s socket %s %d",
109			addr->ai_socktype==SOCK_DGRAM?"udp":
110			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
111			addr->ai_family==AF_INET?"4":
112			addr->ai_family==AF_INET6?"6":
113			"_otherfam", buf,
114			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
115	}
116}
117
#ifdef HAVE_SYSTEMD
/**
 * Find a socket that systemd passed to this process (socket activation).
 * @param family: address family the socket must have.
 * @param socktype: socket type the socket must have.
 * @param listen: listening-state argument handed to sd_is_socket().
 *	Use this only for stream protocols; for UDP it should be -1.
 * @param addr: address used in the error message when no socket matches,
 *	or NULL to print path instead.
 * @param addrlen: length of addr.
 * @param path: printed in the error message when addr is NULL.
 * @return the file descriptor of the first matching socket, or -1 when
 *	systemd is not running, the LISTEN_PID or LISTEN_FDS environment
 *	variable is missing, no descriptors were passed, or none match.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		/* sd_is_socket() is positive for a match, zero for no match
		 * and negative on error; an error must not count as a match */
		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype,
			listen) > 0) {
			s = SD_LISTEN_FDS_START + i;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			log_err("systemd sd_listen_fds(): %s", path);
	}
	return s;
}
#endif
177
178int
179create_udp_sock(int family, int socktype, struct sockaddr* addr,
180        socklen_t addrlen, int v6only, int* inuse, int* noproto,
181	int rcv, int snd, int listen, int* reuseport, int transparent,
182	int freebind, int use_systemd)
183{
184	int s;
185#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
186	int on=1;
187#endif
188#ifdef IPV6_MTU
189	int mtu = IPV6_MIN_MTU;
190#endif
191#if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
192	(void)rcv;
193#endif
194#if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
195	(void)snd;
196#endif
197#ifndef IPV6_V6ONLY
198	(void)v6only;
199#endif
200#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
201	(void)transparent;
202#endif
203#if !defined(IP_FREEBIND)
204	(void)freebind;
205#endif
206#ifdef HAVE_SYSTEMD
207	int got_fd_from_systemd = 0;
208
209	if (!use_systemd
210	    || (use_systemd
211		&& (s = systemd_get_activated(family, socktype, -1, addr,
212					      addrlen, NULL)) == -1)) {
213#else
214	(void)use_systemd;
215#endif
216	if((s = socket(family, socktype, 0)) == -1) {
217		*inuse = 0;
218#ifndef USE_WINSOCK
219		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
220			*noproto = 1;
221			return -1;
222		}
223		log_err("can't create socket: %s", strerror(errno));
224#else
225		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
226			WSAGetLastError() == WSAEPROTONOSUPPORT) {
227			*noproto = 1;
228			return -1;
229		}
230		log_err("can't create socket: %s",
231			wsa_strerror(WSAGetLastError()));
232#endif
233		*noproto = 0;
234		return -1;
235	}
236#ifdef HAVE_SYSTEMD
237	} else {
238		got_fd_from_systemd = 1;
239	}
240#endif
241	if(listen) {
242#ifdef SO_REUSEADDR
243		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
244			(socklen_t)sizeof(on)) < 0) {
245#ifndef USE_WINSOCK
246			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
247				strerror(errno));
248			if(errno != ENOSYS) {
249				close(s);
250				*noproto = 0;
251				*inuse = 0;
252				return -1;
253			}
254#else
255			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
256				wsa_strerror(WSAGetLastError()));
257			closesocket(s);
258			*noproto = 0;
259			*inuse = 0;
260			return -1;
261#endif
262		}
263#endif /* SO_REUSEADDR */
264#ifdef SO_REUSEPORT
265#  ifdef SO_REUSEPORT_LB
266		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
267		 * like SO_REUSEPORT on Linux.  This is what the users want
268		 * with the config option in unbound.conf; if we actually
269		 * need local address and port reuse they'll also need to
270		 * have SO_REUSEPORT set for them, assume it was _LB they want.
271		 */
272		if (reuseport && *reuseport &&
273		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
274			(socklen_t)sizeof(on)) < 0) {
275#ifdef ENOPROTOOPT
276			if(errno != ENOPROTOOPT || verbosity >= 3)
277				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
278					strerror(errno));
279#endif
280			/* this option is not essential, we can continue */
281			*reuseport = 0;
282		}
283#  else /* no SO_REUSEPORT_LB */
284
285		/* try to set SO_REUSEPORT so that incoming
286		 * queries are distributed evenly among the receiving threads.
287		 * Each thread must have its own socket bound to the same port,
288		 * with SO_REUSEPORT set on each socket.
289		 */
290		if (reuseport && *reuseport &&
291		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
292			(socklen_t)sizeof(on)) < 0) {
293#ifdef ENOPROTOOPT
294			if(errno != ENOPROTOOPT || verbosity >= 3)
295				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
296					strerror(errno));
297#endif
298			/* this option is not essential, we can continue */
299			*reuseport = 0;
300		}
301#  endif /* SO_REUSEPORT_LB */
302#else
303		(void)reuseport;
304#endif /* defined(SO_REUSEPORT) */
305#ifdef IP_TRANSPARENT
306		if (transparent &&
307		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
308		    (socklen_t)sizeof(on)) < 0) {
309			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
310			strerror(errno));
311		}
312#elif defined(IP_BINDANY)
313		if (transparent &&
314		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
315		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
316		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
317			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
318			(family==AF_INET6?"V6":""), strerror(errno));
319		}
320#elif defined(SO_BINDANY)
321		if (transparent &&
322		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
323		    (socklen_t)sizeof(on)) < 0) {
324			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
325			strerror(errno));
326		}
327#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
328	}
329#ifdef IP_FREEBIND
330	if(freebind &&
331	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
332	    (socklen_t)sizeof(on)) < 0) {
333		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
334		strerror(errno));
335	}
336#endif /* IP_FREEBIND */
337	if(rcv) {
338#ifdef SO_RCVBUF
339		int got;
340		socklen_t slen = (socklen_t)sizeof(got);
341#  ifdef SO_RCVBUFFORCE
342		/* Linux specific: try to use root permission to override
343		 * system limits on rcvbuf. The limit is stored in
344		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
345		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
346			(socklen_t)sizeof(rcv)) < 0) {
347			if(errno != EPERM) {
348#    ifndef USE_WINSOCK
349				log_err("setsockopt(..., SO_RCVBUFFORCE, "
350					"...) failed: %s", strerror(errno));
351				close(s);
352#    else
353				log_err("setsockopt(..., SO_RCVBUFFORCE, "
354					"...) failed: %s",
355					wsa_strerror(WSAGetLastError()));
356				closesocket(s);
357#    endif
358				*noproto = 0;
359				*inuse = 0;
360				return -1;
361			}
362#  endif /* SO_RCVBUFFORCE */
363			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
364				(socklen_t)sizeof(rcv)) < 0) {
365#  ifndef USE_WINSOCK
366				log_err("setsockopt(..., SO_RCVBUF, "
367					"...) failed: %s", strerror(errno));
368				close(s);
369#  else
370				log_err("setsockopt(..., SO_RCVBUF, "
371					"...) failed: %s",
372					wsa_strerror(WSAGetLastError()));
373				closesocket(s);
374#  endif
375				*noproto = 0;
376				*inuse = 0;
377				return -1;
378			}
379			/* check if we got the right thing or if system
380			 * reduced to some system max.  Warn if so */
381			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
382				&slen) >= 0 && got < rcv/2) {
383				log_warn("so-rcvbuf %u was not granted. "
384					"Got %u. To fix: start with "
385					"root permissions(linux) or sysctl "
386					"bigger net.core.rmem_max(linux) or "
387					"kern.ipc.maxsockbuf(bsd) values.",
388					(unsigned)rcv, (unsigned)got);
389			}
390#  ifdef SO_RCVBUFFORCE
391		}
392#  endif
393#endif /* SO_RCVBUF */
394	}
395	/* first do RCVBUF as the receive buffer is more important */
396	if(snd) {
397#ifdef SO_SNDBUF
398		int got;
399		socklen_t slen = (socklen_t)sizeof(got);
400#  ifdef SO_SNDBUFFORCE
401		/* Linux specific: try to use root permission to override
402		 * system limits on sndbuf. The limit is stored in
403		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
404		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
405			(socklen_t)sizeof(snd)) < 0) {
406			if(errno != EPERM) {
407#    ifndef USE_WINSOCK
408				log_err("setsockopt(..., SO_SNDBUFFORCE, "
409					"...) failed: %s", strerror(errno));
410				close(s);
411#    else
412				log_err("setsockopt(..., SO_SNDBUFFORCE, "
413					"...) failed: %s",
414					wsa_strerror(WSAGetLastError()));
415				closesocket(s);
416#    endif
417				*noproto = 0;
418				*inuse = 0;
419				return -1;
420			}
421#  endif /* SO_SNDBUFFORCE */
422			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
423				(socklen_t)sizeof(snd)) < 0) {
424#  ifndef USE_WINSOCK
425				log_err("setsockopt(..., SO_SNDBUF, "
426					"...) failed: %s", strerror(errno));
427				close(s);
428#  else
429				log_err("setsockopt(..., SO_SNDBUF, "
430					"...) failed: %s",
431					wsa_strerror(WSAGetLastError()));
432				closesocket(s);
433#  endif
434				*noproto = 0;
435				*inuse = 0;
436				return -1;
437			}
438			/* check if we got the right thing or if system
439			 * reduced to some system max.  Warn if so */
440			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
441				&slen) >= 0 && got < snd/2) {
442				log_warn("so-sndbuf %u was not granted. "
443					"Got %u. To fix: start with "
444					"root permissions(linux) or sysctl "
445					"bigger net.core.wmem_max(linux) or "
446					"kern.ipc.maxsockbuf(bsd) values.",
447					(unsigned)snd, (unsigned)got);
448			}
449#  ifdef SO_SNDBUFFORCE
450		}
451#  endif
452#endif /* SO_SNDBUF */
453	}
454	if(family == AF_INET6) {
455# if defined(IPV6_V6ONLY)
456		if(v6only) {
457			int val=(v6only==2)?0:1;
458			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
459				(void*)&val, (socklen_t)sizeof(val)) < 0) {
460#ifndef USE_WINSOCK
461				log_err("setsockopt(..., IPV6_V6ONLY"
462					", ...) failed: %s", strerror(errno));
463				close(s);
464#else
465				log_err("setsockopt(..., IPV6_V6ONLY"
466					", ...) failed: %s",
467					wsa_strerror(WSAGetLastError()));
468				closesocket(s);
469#endif
470				*noproto = 0;
471				*inuse = 0;
472				return -1;
473			}
474		}
475# endif
476# if defined(IPV6_USE_MIN_MTU)
477		/*
478		 * There is no fragmentation of IPv6 datagrams
479		 * during forwarding in the network. Therefore
480		 * we do not send UDP datagrams larger than
481		 * the minimum IPv6 MTU of 1280 octets. The
482		 * EDNS0 message length can be larger if the
483		 * network stack supports IPV6_USE_MIN_MTU.
484		 */
485		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
486			(void*)&on, (socklen_t)sizeof(on)) < 0) {
487#  ifndef USE_WINSOCK
488			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
489				"...) failed: %s", strerror(errno));
490			close(s);
491#  else
492			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
493				"...) failed: %s",
494				wsa_strerror(WSAGetLastError()));
495			closesocket(s);
496#  endif
497			*noproto = 0;
498			*inuse = 0;
499			return -1;
500		}
501# elif defined(IPV6_MTU)
502		/*
503		 * On Linux, to send no larger than 1280, the PMTUD is
504		 * disabled by default for datagrams anyway, so we set
505		 * the MTU to use.
506		 */
507		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
508			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
509#  ifndef USE_WINSOCK
510			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
511				strerror(errno));
512			close(s);
513#  else
514			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
515				wsa_strerror(WSAGetLastError()));
516			closesocket(s);
517#  endif
518			*noproto = 0;
519			*inuse = 0;
520			return -1;
521		}
522# endif /* IPv6 MTU */
523	} else if(family == AF_INET) {
524#  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
525/* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
526 * PMTU information is not accepted, but fragmentation is allowed
527 * if and only if the packet size exceeds the outgoing interface MTU
528 * (and also uses the interface mtu to determine the size of the packets).
529 * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
530 * FreeBSD already has same semantics without setting the option. */
531		int omit_set = 0;
532		int action;
533#   if defined(IP_PMTUDISC_OMIT)
534		action = IP_PMTUDISC_OMIT;
535		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
536			&action, (socklen_t)sizeof(action)) < 0) {
537
538			if (errno != EINVAL) {
539				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
540					strerror(errno));
541
542#    ifndef USE_WINSOCK
543				close(s);
544#    else
545				closesocket(s);
546#    endif
547				*noproto = 0;
548				*inuse = 0;
549				return -1;
550			}
551		}
552		else
553		{
554		    omit_set = 1;
555		}
556#   endif
557		if (omit_set == 0) {
558   			action = IP_PMTUDISC_DONT;
559			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
560				&action, (socklen_t)sizeof(action)) < 0) {
561				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
562					strerror(errno));
563#    ifndef USE_WINSOCK
564				close(s);
565#    else
566				closesocket(s);
567#    endif
568				*noproto = 0;
569				*inuse = 0;
570				return -1;
571			}
572		}
573#  elif defined(IP_DONTFRAG)
574		int off = 0;
575		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
576			&off, (socklen_t)sizeof(off)) < 0) {
577			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
578				strerror(errno));
579#    ifndef USE_WINSOCK
580			close(s);
581#    else
582			closesocket(s);
583#    endif
584			*noproto = 0;
585			*inuse = 0;
586			return -1;
587		}
588#  endif /* IPv4 MTU */
589	}
590	if(
591#ifdef HAVE_SYSTEMD
592		!got_fd_from_systemd &&
593#endif
594		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
595		*noproto = 0;
596		*inuse = 0;
597#ifndef USE_WINSOCK
598#ifdef EADDRINUSE
599		*inuse = (errno == EADDRINUSE);
600		/* detect freebsd jail with no ipv6 permission */
601		if(family==AF_INET6 && errno==EINVAL)
602			*noproto = 1;
603		else if(errno != EADDRINUSE &&
604			!(errno == EACCES && verbosity < 4 && !listen)
605#ifdef EADDRNOTAVAIL
606			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
607#endif
608			) {
609			log_err_addr("can't bind socket", strerror(errno),
610				(struct sockaddr_storage*)addr, addrlen);
611		}
612#endif /* EADDRINUSE */
613		close(s);
614#else /* USE_WINSOCK */
615		if(WSAGetLastError() != WSAEADDRINUSE &&
616			WSAGetLastError() != WSAEADDRNOTAVAIL &&
617			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
618			log_err_addr("can't bind socket",
619				wsa_strerror(WSAGetLastError()),
620				(struct sockaddr_storage*)addr, addrlen);
621		}
622		closesocket(s);
623#endif /* USE_WINSOCK */
624		return -1;
625	}
626	if(!fd_set_nonblock(s)) {
627		*noproto = 0;
628		*inuse = 0;
629#ifndef USE_WINSOCK
630		close(s);
631#else
632		closesocket(s);
633#endif
634		return -1;
635	}
636	return s;
637}
638
639int
640create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
641	int* reuseport, int transparent, int mss, int freebind, int use_systemd)
642{
643	int s;
644#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
645	int on = 1;
646#endif
647#ifdef HAVE_SYSTEMD
648	int got_fd_from_systemd = 0;
649#endif
650#ifdef USE_TCP_FASTOPEN
651	int qlen;
652#endif
653#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
654	(void)transparent;
655#endif
656#if !defined(IP_FREEBIND)
657	(void)freebind;
658#endif
659	verbose_print_addr(addr);
660	*noproto = 0;
661#ifdef HAVE_SYSTEMD
662	if (!use_systemd ||
663	    (use_systemd
664	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
665					   addr->ai_addr, addr->ai_addrlen,
666					   NULL)) == -1)) {
667#else
668	(void)use_systemd;
669#endif
670	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
671#ifndef USE_WINSOCK
672		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
673			*noproto = 1;
674			return -1;
675		}
676		log_err("can't create socket: %s", strerror(errno));
677#else
678		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
679			WSAGetLastError() == WSAEPROTONOSUPPORT) {
680			*noproto = 1;
681			return -1;
682		}
683		log_err("can't create socket: %s",
684			wsa_strerror(WSAGetLastError()));
685#endif
686		return -1;
687	}
688	if (mss > 0) {
689#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
690		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
691			(socklen_t)sizeof(mss)) < 0) {
692			#ifndef USE_WINSOCK
693			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
694				strerror(errno));
695			#else
696			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
697				wsa_strerror(WSAGetLastError()));
698			#endif
699		} else {
700			verbose(VERB_ALGO,
701				" tcp socket mss set to %d", mss);
702		}
703#else
704		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
705#endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
706	}
707#ifdef HAVE_SYSTEMD
708	} else {
709		got_fd_from_systemd = 1;
710    }
711#endif
712#ifdef SO_REUSEADDR
713	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
714		(socklen_t)sizeof(on)) < 0) {
715#ifndef USE_WINSOCK
716		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
717			strerror(errno));
718		close(s);
719#else
720		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
721			wsa_strerror(WSAGetLastError()));
722		closesocket(s);
723#endif
724		return -1;
725	}
726#endif /* SO_REUSEADDR */
727#ifdef IP_FREEBIND
728	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
729	    (socklen_t)sizeof(on)) < 0) {
730		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
731		strerror(errno));
732	}
733#endif /* IP_FREEBIND */
734#ifdef SO_REUSEPORT
735	/* try to set SO_REUSEPORT so that incoming
736	 * connections are distributed evenly among the receiving threads.
737	 * Each thread must have its own socket bound to the same port,
738	 * with SO_REUSEPORT set on each socket.
739	 */
740	if (reuseport && *reuseport &&
741		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
742		(socklen_t)sizeof(on)) < 0) {
743#ifdef ENOPROTOOPT
744		if(errno != ENOPROTOOPT || verbosity >= 3)
745			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
746				strerror(errno));
747#endif
748		/* this option is not essential, we can continue */
749		*reuseport = 0;
750	}
751#else
752	(void)reuseport;
753#endif /* defined(SO_REUSEPORT) */
754#if defined(IPV6_V6ONLY)
755	if(addr->ai_family == AF_INET6 && v6only) {
756		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
757			(void*)&on, (socklen_t)sizeof(on)) < 0) {
758#ifndef USE_WINSOCK
759			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
760				strerror(errno));
761			close(s);
762#else
763			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
764				wsa_strerror(WSAGetLastError()));
765			closesocket(s);
766#endif
767			return -1;
768		}
769	}
770#else
771	(void)v6only;
772#endif /* IPV6_V6ONLY */
773#ifdef IP_TRANSPARENT
774	if (transparent &&
775	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
776	    (socklen_t)sizeof(on)) < 0) {
777		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
778			strerror(errno));
779	}
780#elif defined(IP_BINDANY)
781	if (transparent &&
782	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
783	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
784	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
785		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
786		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
787	}
788#elif defined(SO_BINDANY)
789	if (transparent &&
790	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
791	    sizeof(on)) < 0) {
792		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
793		strerror(errno));
794	}
795#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
796	if(
797#ifdef HAVE_SYSTEMD
798		!got_fd_from_systemd &&
799#endif
800        bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
801#ifndef USE_WINSOCK
802		/* detect freebsd jail with no ipv6 permission */
803		if(addr->ai_family==AF_INET6 && errno==EINVAL)
804			*noproto = 1;
805		else {
806			log_err_addr("can't bind socket", strerror(errno),
807				(struct sockaddr_storage*)addr->ai_addr,
808				addr->ai_addrlen);
809		}
810		close(s);
811#else
812		log_err_addr("can't bind socket",
813			wsa_strerror(WSAGetLastError()),
814			(struct sockaddr_storage*)addr->ai_addr,
815			addr->ai_addrlen);
816		closesocket(s);
817#endif
818		return -1;
819	}
820	if(!fd_set_nonblock(s)) {
821#ifndef USE_WINSOCK
822		close(s);
823#else
824		closesocket(s);
825#endif
826		return -1;
827	}
828	if(listen(s, TCP_BACKLOG) == -1) {
829#ifndef USE_WINSOCK
830		log_err("can't listen: %s", strerror(errno));
831		close(s);
832#else
833		log_err("can't listen: %s", wsa_strerror(WSAGetLastError()));
834		closesocket(s);
835#endif
836		return -1;
837	}
838#ifdef USE_TCP_FASTOPEN
839	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
840	   against IP spoofing attacks as suggested in RFC7413 */
841#ifdef __APPLE__
842	/* OS X implementation only supports qlen of 1 via this call. Actual
843	   value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
844	qlen = 1;
845#else
846	/* 5 is recommended on linux */
847	qlen = 5;
848#endif
849	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
850		  sizeof(qlen))) == -1 ) {
851#ifdef ENOPROTOOPT
852		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
853		   disabled, except when verbosity enabled for debugging */
854		if(errno != ENOPROTOOPT || verbosity >= 3) {
855#endif
856		  if(errno == EPERM) {
857		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
858		  } else {
859		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
860		  }
861#ifdef ENOPROTOOPT
862		}
863#endif
864	}
865#endif
866	return s;
867}
868
/**
 * Create a listening unix domain (local) stream socket at a path.
 * An existing file at the path is unlinked before binding.
 * @param path: filesystem path for the socket.  It must fit in the
 *	sun_path member of struct sockaddr_un, otherwise this fails.
 * @param noproto: set to 1 when the build has no local socket support;
 *	otherwise not written.
 * @param use_systemd: if nonzero (and compiled with systemd) a socket
 *	provided by systemd is returned when one is available.
 * @return the file descriptor, or -1 on failure.
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif
#ifdef HAVE_SYSTEMD
	int ret;

	if (use_systemd && (ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1, NULL, 0, path)) != -1)
		return ret;
#else
	(void)use_systemd;
#endif
#ifdef HAVE_SYS_UN_H
	verbose(VERB_ALGO, "creating unix socket %s", path);
	/* refuse a path that does not fit: a truncated copy would bind a
	 * different file than the one that is unlinked below */
	if(strlen(path) >= sizeof(usock.sun_path)) {
		log_err("Cannot create local socket %s (path too long)",
			path);
		return -1;
	}
	/* start from a zeroed address so no member is left undefined */
	memset(&usock, 0, sizeof(usock));
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD */
	(void)strlcpy(usock.sun_path, path, sizeof(usock.sun_path));

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	(void)noproto; /*unused*/
	return s;

err:
#ifndef USE_WINSOCK
	close(s);
#else
	closesocket(s);
#endif
	return -1;
#else /* no HAVE_SYS_UN_H */
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif /* HAVE_SYS_UN_H */
}
947
948
/**
 * Resolve an interface name and port with getaddrinfo and create a
 * socket of the requested type for the first result.
 * UDP sockets are made with create_udp_sock, all other types with
 * create_tcp_accept_sock.  The flag noip6 is set when creation failed
 * because the protocol is not available and the hints ask for AF_INET6.
 * Returns the file descriptor or -1 on failure.
 */
static int
make_sock(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int freebind, int use_systemd)
{
	struct addrinfo *ai = NULL;
	int gai, fd, inuse, noproto;

	hints->ai_socktype = stype;
	*noip6 = 0;
	gai = getaddrinfo(ifname, port, hints, &ai);
	if(gai != 0 || !ai) {
#ifdef USE_WINSOCK
		if(gai == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
		log_err("node %s:%s getaddrinfo: %s %s",
			ifname?ifname:"default", port, gai_strerror(gai),
#ifdef EAI_SYSTEM
			gai==EAI_SYSTEM?(char*)strerror(errno):""
#else
			""
#endif
		);
		return -1;
	}
	if(stype != SOCK_DGRAM) {
		fd = create_tcp_accept_sock(ai, v6only, &noproto, reuseport,
			transparent, tcp_mss, freebind, use_systemd);
		if(fd == -1 && noproto && hints->ai_family == AF_INET6)
			*noip6 = 1;
	} else {
		verbose_print_addr(ai);
		fd = create_udp_sock(ai->ai_family, ai->ai_socktype,
			(struct sockaddr*)ai->ai_addr, ai->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd);
		if(fd == -1) {
			/* the status flags are only looked at on failure */
			if(inuse)
				log_err("bind: address already in use");
			else if(noproto && hints->ai_family == AF_INET6)
				*noip6 = 1;
		}
	}
	freeaddrinfo(ai);
	return fd;
}
998}
999
/**
 * Make a socket; an interface name of the form ifspec@port overrides the
 * port argument.  The part before the '@' must be shorter than 128 and
 * the part after it shorter than 16 characters, otherwise an error is
 * logged, noip6 is cleared and -1 is returned.
 */
static int
make_sock_port(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int freebind, int use_systemd)
{
	char portbuf[16];
	char ifbuf[128];
	size_t iflen, portlen;
	const char* at = strchr(ifname, '@');

	/* no override present: use the arguments as they are */
	if(!at)
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, freebind,
			use_systemd);

	iflen = (size_t)(at - ifname);
	if(iflen >= sizeof(ifbuf)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	portlen = strlen(at+1);
	if(portlen >= sizeof(portbuf)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	/* split into the interface part and the port part */
	memcpy(ifbuf, ifname, iflen);
	ifbuf[iflen] = 0;
	memcpy(portbuf, at+1, portlen);
	portbuf[portlen] = 0;
	return make_sock(stype, ifbuf, portbuf, hints, v6only, noip6,
		rcv, snd, reuseport, transparent, tcp_mss, freebind,
		use_systemd);
}
1031
1032/**
1033 * Add port to open ports list.
1034 * @param list: list head. changed.
1035 * @param s: fd.
1036 * @param ftype: if fd is UDP.
1037 * @return false on failure. list in unchanged then.
1038 */
1039static int
1040port_insert(struct listen_port** list, int s, enum listen_type ftype)
1041{
1042	struct listen_port* item = (struct listen_port*)malloc(
1043		sizeof(struct listen_port));
1044	if(!item)
1045		return 0;
1046	item->next = *list;
1047	item->fd = s;
1048	item->ftype = ftype;
1049	*list = item;
1050	return 1;
1051}
1052
/**
 * Ask the socket to deliver packet info (ancillary data) with received
 * packets, using whichever socket option this platform defines.
 * @param s: the socket.
 * @param family: AF_INET or AF_INET6; other values are accepted and
 *	nothing is set for them.
 * @return 0 when setsockopt fails or when no suitable option exists for
 *	the family (an error is logged), otherwise 1.
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int enable = 1;
#else
	(void)s;
#endif
	if(family == AF_INET) {
#           ifdef IP_PKTINFO
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#           endif /* IP_PKTINFO */

	} else if(family == AF_INET6) {
#           ifdef IPV6_RECVPKTINFO
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           else
		log_err("no IPV6_RECVPKTINFO and no IPV6_PKTINFO option, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#           endif /* defined IPV6_RECVPKTINFO */

	}
	return 1;
}
1107
1108/** see if interface is ssl, its port number == the ssl port number */
1109static int
1110if_is_ssl(const char* ifname, const char* port, int ssl_port,
1111	struct config_strlist* tls_additional_port)
1112{
1113	struct config_strlist* s;
1114	char* p = strchr(ifname, '@');
1115	if(!p && atoi(port) == ssl_port)
1116		return 1;
1117	if(p && atoi(p+1) == ssl_port)
1118		return 1;
1119	for(s = tls_additional_port; s; s = s->next) {
1120		if(p && atoi(p+1) == atoi(s->str))
1121			return 1;
1122		if(!p && atoi(port) == atoi(s->str))
1123			return 1;
1124	}
1125	return 0;
1126}
1127
1128/**
1129 * Helper for ports_open. Creates one interface (or NULL for default).
1130 * @param ifname: The interface ip address.
1131 * @param do_auto: use automatic interface detection.
1132 * 	If enabled, then ifname must be the wildcard name.
1133 * @param do_udp: if udp should be used.
1134 * @param do_tcp: if udp should be used.
1135 * @param hints: for getaddrinfo. family and flags have to be set by caller.
1136 * @param port: Port number to use (as string).
1137 * @param list: list of open ports, appended to, changed to point to list head.
1138 * @param rcv: receive buffer size for UDP
1139 * @param snd: send buffer size for UDP
1140 * @param ssl_port: ssl service port number
1141 * @param tls_additional_port: list of additional ssl service port numbers.
1142 * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1143 * 	set to false on exit if reuseport failed due to no kernel support.
1144 * @param transparent: set IP_TRANSPARENT socket option.
1145 * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1146 * @param freebind: set IP_FREEBIND socket option.
1147 * @param use_systemd: if true, fetch sockets from systemd.
1148 * @param dnscrypt_port: dnscrypt service port number
1149 * @return: returns false on error.
1150 */
1151static int
1152ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
1153	struct addrinfo *hints, const char* port, struct listen_port** list,
1154	size_t rcv, size_t snd, int ssl_port,
1155	struct config_strlist* tls_additional_port, int* reuseport,
1156	int transparent, int tcp_mss, int freebind, int use_systemd,
1157	int dnscrypt_port)
1158{
1159	int s, noip6=0;
1160#ifdef USE_DNSCRYPT
1161	int is_dnscrypt = ((strchr(ifname, '@') &&
1162			atoi(strchr(ifname, '@')+1) == dnscrypt_port) ||
1163			(!strchr(ifname, '@') && atoi(port) == dnscrypt_port));
1164#else
1165	int is_dnscrypt = 0;
1166	(void)dnscrypt_port;
1167#endif
1168
1169	if(!do_udp && !do_tcp)
1170		return 0;
1171	if(do_auto) {
1172		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1173			&noip6, rcv, snd, reuseport, transparent,
1174			tcp_mss, freebind, use_systemd)) == -1) {
1175			if(noip6) {
1176				log_warn("IPv6 protocol not available");
1177				return 1;
1178			}
1179			return 0;
1180		}
1181		/* getting source addr packet info is highly non-portable */
1182		if(!set_recvpktinfo(s, hints->ai_family)) {
1183#ifndef USE_WINSOCK
1184			close(s);
1185#else
1186			closesocket(s);
1187#endif
1188			return 0;
1189		}
1190		if(!port_insert(list, s,
1191		   is_dnscrypt?listen_type_udpancil_dnscrypt:listen_type_udpancil)) {
1192#ifndef USE_WINSOCK
1193			close(s);
1194#else
1195			closesocket(s);
1196#endif
1197			return 0;
1198		}
1199	} else if(do_udp) {
1200		/* regular udp socket */
1201		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1202			&noip6, rcv, snd, reuseport, transparent,
1203			tcp_mss, freebind, use_systemd)) == -1) {
1204			if(noip6) {
1205				log_warn("IPv6 protocol not available");
1206				return 1;
1207			}
1208			return 0;
1209		}
1210		if(!port_insert(list, s,
1211		   is_dnscrypt?listen_type_udp_dnscrypt:listen_type_udp)) {
1212#ifndef USE_WINSOCK
1213			close(s);
1214#else
1215			closesocket(s);
1216#endif
1217			return 0;
1218		}
1219	}
1220	if(do_tcp) {
1221		int is_ssl = if_is_ssl(ifname, port, ssl_port,
1222			tls_additional_port);
1223		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
1224			&noip6, 0, 0, reuseport, transparent, tcp_mss,
1225			freebind, use_systemd)) == -1) {
1226			if(noip6) {
1227				/*log_warn("IPv6 protocol not available");*/
1228				return 1;
1229			}
1230			return 0;
1231		}
1232		if(is_ssl)
1233			verbose(VERB_ALGO, "setup TCP for SSL service");
1234		if(!port_insert(list, s, is_ssl?listen_type_ssl:
1235			(is_dnscrypt?listen_type_tcp_dnscrypt:listen_type_tcp))) {
1236#ifndef USE_WINSOCK
1237			close(s);
1238#else
1239			closesocket(s);
1240#endif
1241			return 0;
1242		}
1243	}
1244	return 1;
1245}
1246
1247/**
1248 * Add items to commpoint list in front.
1249 * @param c: commpoint to add.
1250 * @param front: listen struct.
1251 * @return: false on failure.
1252 */
1253static int
1254listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1255{
1256	struct listen_list* item = (struct listen_list*)malloc(
1257		sizeof(struct listen_list));
1258	if(!item)
1259		return 0;
1260	item->com = c;
1261	item->next = front->cps;
1262	front->cps = item;
1263	return 1;
1264}
1265
1266struct listen_dnsport*
1267listen_create(struct comm_base* base, struct listen_port* ports,
1268	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
1269	struct tcl_list* tcp_conn_limit, void* sslctx,
1270	struct dt_env* dtenv, comm_point_callback_type* cb, void *cb_arg)
1271{
1272	struct listen_dnsport* front = (struct listen_dnsport*)
1273		malloc(sizeof(struct listen_dnsport));
1274	if(!front)
1275		return NULL;
1276	front->cps = NULL;
1277	front->udp_buff = sldns_buffer_new(bufsize);
1278#ifdef USE_DNSCRYPT
1279	front->dnscrypt_udp_buff = NULL;
1280#endif
1281	if(!front->udp_buff) {
1282		free(front);
1283		return NULL;
1284	}
1285	if(!stream_wait_lock_inited) {
1286		lock_basic_init(&stream_wait_count_lock);
1287		stream_wait_lock_inited = 1;
1288	}
1289
1290	/* create comm points as needed */
1291	while(ports) {
1292		struct comm_point* cp = NULL;
1293		if(ports->ftype == listen_type_udp ||
1294		   ports->ftype == listen_type_udp_dnscrypt)
1295			cp = comm_point_create_udp(base, ports->fd,
1296				front->udp_buff, cb, cb_arg);
1297		else if(ports->ftype == listen_type_tcp ||
1298				ports->ftype == listen_type_tcp_dnscrypt)
1299			cp = comm_point_create_tcp(base, ports->fd,
1300				tcp_accept_count, tcp_idle_timeout,
1301				tcp_conn_limit, bufsize, front->udp_buff,
1302				cb, cb_arg);
1303		else if(ports->ftype == listen_type_ssl) {
1304			cp = comm_point_create_tcp(base, ports->fd,
1305				tcp_accept_count, tcp_idle_timeout,
1306				tcp_conn_limit, bufsize, front->udp_buff,
1307				cb, cb_arg);
1308			cp->ssl = sslctx;
1309		} else if(ports->ftype == listen_type_udpancil ||
1310				  ports->ftype == listen_type_udpancil_dnscrypt)
1311			cp = comm_point_create_udp_ancil(base, ports->fd,
1312				front->udp_buff, cb, cb_arg);
1313		if(!cp) {
1314			log_err("can't create commpoint");
1315			listen_delete(front);
1316			return NULL;
1317		}
1318		cp->dtenv = dtenv;
1319		cp->do_not_close = 1;
1320#ifdef USE_DNSCRYPT
1321		if (ports->ftype == listen_type_udp_dnscrypt ||
1322			ports->ftype == listen_type_tcp_dnscrypt ||
1323			ports->ftype == listen_type_udpancil_dnscrypt) {
1324			cp->dnscrypt = 1;
1325			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
1326			if(!cp->dnscrypt_buffer) {
1327				log_err("can't alloc dnscrypt_buffer");
1328				comm_point_delete(cp);
1329				listen_delete(front);
1330				return NULL;
1331			}
1332			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
1333		}
1334#endif
1335		if(!listen_cp_insert(cp, front)) {
1336			log_err("malloc failed");
1337			comm_point_delete(cp);
1338			listen_delete(front);
1339			return NULL;
1340		}
1341		ports = ports->next;
1342	}
1343	if(!front->cps) {
1344		log_err("Could not open sockets to accept queries.");
1345		listen_delete(front);
1346		return NULL;
1347	}
1348
1349	return front;
1350}
1351
1352void
1353listen_list_delete(struct listen_list* list)
1354{
1355	struct listen_list *p = list, *pn;
1356	while(p) {
1357		pn = p->next;
1358		comm_point_delete(p->com);
1359		free(p);
1360		p = pn;
1361	}
1362}
1363
1364void
1365listen_delete(struct listen_dnsport* front)
1366{
1367	if(!front)
1368		return;
1369	listen_list_delete(front->cps);
1370#ifdef USE_DNSCRYPT
1371	if(front->dnscrypt_udp_buff &&
1372		front->udp_buff != front->dnscrypt_udp_buff) {
1373		sldns_buffer_free(front->dnscrypt_udp_buff);
1374	}
1375#endif
1376	sldns_buffer_free(front->udp_buff);
1377	free(front);
1378	if(stream_wait_lock_inited) {
1379		stream_wait_lock_inited = 0;
1380		lock_basic_destroy(&stream_wait_count_lock);
1381	}
1382}
1383
1384struct listen_port*
1385listening_ports_open(struct config_file* cfg, int* reuseport)
1386{
1387	struct listen_port* list = NULL;
1388	struct addrinfo hints;
1389	int i, do_ip4, do_ip6;
1390	int do_tcp, do_auto;
1391	char portbuf[32];
1392	snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1393	do_ip4 = cfg->do_ip4;
1394	do_ip6 = cfg->do_ip6;
1395	do_tcp = cfg->do_tcp;
1396	do_auto = cfg->if_automatic && cfg->do_udp;
1397	if(cfg->incoming_num_tcp == 0)
1398		do_tcp = 0;
1399
1400	/* getaddrinfo */
1401	memset(&hints, 0, sizeof(hints));
1402	hints.ai_flags = AI_PASSIVE;
1403	/* no name lookups on our listening ports */
1404	if(cfg->num_ifs > 0)
1405		hints.ai_flags |= AI_NUMERICHOST;
1406	hints.ai_family = AF_UNSPEC;
1407#ifndef INET6
1408	do_ip6 = 0;
1409#endif
1410	if(!do_ip4 && !do_ip6) {
1411		return NULL;
1412	}
1413	/* create ip4 and ip6 ports so that return addresses are nice. */
1414	if(do_auto || cfg->num_ifs == 0) {
1415		if(do_ip6) {
1416			hints.ai_family = AF_INET6;
1417			if(!ports_create_if(do_auto?"::0":"::1",
1418				do_auto, cfg->do_udp, do_tcp,
1419				&hints, portbuf, &list,
1420				cfg->so_rcvbuf, cfg->so_sndbuf,
1421				cfg->ssl_port, cfg->tls_additional_port,
1422				reuseport, cfg->ip_transparent,
1423				cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1424				cfg->dnscrypt_port)) {
1425				listening_ports_free(list);
1426				return NULL;
1427			}
1428		}
1429		if(do_ip4) {
1430			hints.ai_family = AF_INET;
1431			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
1432				do_auto, cfg->do_udp, do_tcp,
1433				&hints, portbuf, &list,
1434				cfg->so_rcvbuf, cfg->so_sndbuf,
1435				cfg->ssl_port, cfg->tls_additional_port,
1436				reuseport, cfg->ip_transparent,
1437				cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1438				cfg->dnscrypt_port)) {
1439				listening_ports_free(list);
1440				return NULL;
1441			}
1442		}
1443	} else for(i = 0; i<cfg->num_ifs; i++) {
1444		if(str_is_ip6(cfg->ifs[i])) {
1445			if(!do_ip6)
1446				continue;
1447			hints.ai_family = AF_INET6;
1448			if(!ports_create_if(cfg->ifs[i], 0, cfg->do_udp,
1449				do_tcp, &hints, portbuf, &list,
1450				cfg->so_rcvbuf, cfg->so_sndbuf,
1451				cfg->ssl_port, cfg->tls_additional_port,
1452				reuseport, cfg->ip_transparent,
1453				cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1454				cfg->dnscrypt_port)) {
1455				listening_ports_free(list);
1456				return NULL;
1457			}
1458		} else {
1459			if(!do_ip4)
1460				continue;
1461			hints.ai_family = AF_INET;
1462			if(!ports_create_if(cfg->ifs[i], 0, cfg->do_udp,
1463				do_tcp, &hints, portbuf, &list,
1464				cfg->so_rcvbuf, cfg->so_sndbuf,
1465				cfg->ssl_port, cfg->tls_additional_port,
1466				reuseport, cfg->ip_transparent,
1467				cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1468				cfg->dnscrypt_port)) {
1469				listening_ports_free(list);
1470				return NULL;
1471			}
1472		}
1473	}
1474	return list;
1475}
1476
1477void listening_ports_free(struct listen_port* list)
1478{
1479	struct listen_port* nx;
1480	while(list) {
1481		nx = list->next;
1482		if(list->fd != -1) {
1483#ifndef USE_WINSOCK
1484			close(list->fd);
1485#else
1486			closesocket(list->fd);
1487#endif
1488		}
1489		free(list);
1490		list = nx;
1491	}
1492}
1493
1494size_t listen_get_mem(struct listen_dnsport* listen)
1495{
1496	struct listen_list* p;
1497	size_t s = sizeof(*listen) + sizeof(*listen->base) +
1498		sizeof(*listen->udp_buff) +
1499		sldns_buffer_capacity(listen->udp_buff);
1500#ifdef USE_DNSCRYPT
1501	s += sizeof(*listen->dnscrypt_udp_buff);
1502	if(listen->udp_buff != listen->dnscrypt_udp_buff){
1503		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1504	}
1505#endif
1506	for(p = listen->cps; p; p = p->next) {
1507		s += sizeof(*p);
1508		s += comm_point_get_mem(p->com);
1509	}
1510	return s;
1511}
1512
1513void listen_stop_accept(struct listen_dnsport* listen)
1514{
1515	/* do not stop the ones that have no tcp_free list
1516	 * (they have already stopped listening) */
1517	struct listen_list* p;
1518	for(p=listen->cps; p; p=p->next) {
1519		if(p->com->type == comm_tcp_accept &&
1520			p->com->tcp_free != NULL) {
1521			comm_point_stop_listening(p->com);
1522		}
1523	}
1524}
1525
1526void listen_start_accept(struct listen_dnsport* listen)
1527{
1528	/* do not start the ones that have no tcp_free list, it is no
1529	 * use to listen to them because they have no free tcp handlers */
1530	struct listen_list* p;
1531	for(p=listen->cps; p; p=p->next) {
1532		if(p->com->type == comm_tcp_accept &&
1533			p->com->tcp_free != NULL) {
1534			comm_point_start_listening(p->com, -1, -1);
1535		}
1536	}
1537}
1538
1539struct tcp_req_info*
1540tcp_req_info_create(struct sldns_buffer* spoolbuf)
1541{
1542	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
1543	if(!req) {
1544		log_err("malloc failure for new stream outoforder processing structure");
1545		return NULL;
1546	}
1547	memset(req, 0, sizeof(*req));
1548	req->spool_buffer = spoolbuf;
1549	return req;
1550}
1551
/**
 * Delete a tcp request info structure: clear its lists and free it.
 * @param req: structure to delete, may be NULL.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req == NULL)
		return;
	tcp_req_info_clear(req);
	/* req->cp points back to the commpoint that owns this struct and
	 * called delete on us; spool_buffer is the shared udp buffer.
	 * Neither is deleted here. */
	free(req);
}
1562
1563void tcp_req_info_clear(struct tcp_req_info* req)
1564{
1565	struct tcp_req_open_item* open, *nopen;
1566	struct tcp_req_done_item* item, *nitem;
1567	if(!req) return;
1568
1569	/* free outstanding request mesh reply entries */
1570	open = req->open_req_list;
1571	while(open) {
1572		nopen = open->next;
1573		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
1574		free(open);
1575		open = nopen;
1576	}
1577	req->open_req_list = NULL;
1578	req->num_open_req = 0;
1579
1580	/* free pending writable result packets */
1581	item = req->done_req_list;
1582	while(item) {
1583		nitem = item->next;
1584		lock_basic_lock(&stream_wait_count_lock);
1585		stream_wait_count -= (sizeof(struct tcp_req_done_item)
1586			+item->len);
1587		lock_basic_unlock(&stream_wait_count_lock);
1588		free(item->buf);
1589		free(item);
1590		item = nitem;
1591	}
1592	req->done_req_list = NULL;
1593	req->num_done_req = 0;
1594	req->read_is_closed = 0;
1595}
1596
1597void
1598tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
1599{
1600	struct tcp_req_open_item* open, *prev = NULL;
1601	if(!req || !m) return;
1602	open = req->open_req_list;
1603	while(open) {
1604		if(open->mesh_state == m) {
1605			struct tcp_req_open_item* next;
1606			if(prev) prev->next = open->next;
1607			else req->open_req_list = open->next;
1608			/* caller has to manage the mesh state reply entry */
1609			next = open->next;
1610			free(open);
1611			req->num_open_req --;
1612
1613			/* prev = prev; */
1614			open = next;
1615			continue;
1616		}
1617		prev = open;
1618		open = open->next;
1619	}
1620}
1621
1622/** setup listening for read or write */
1623static void
1624tcp_req_info_setup_listen(struct tcp_req_info* req)
1625{
1626	int wr = 0;
1627	int rd = 0;
1628
1629	if(req->cp->tcp_byte_count != 0) {
1630		/* cannot change, halfway through */
1631		return;
1632	}
1633
1634	if(!req->cp->tcp_is_reading)
1635		wr = 1;
1636	if(req->num_open_req + req->num_done_req < TCP_MAX_REQ_SIMULTANEOUS &&
1637		!req->read_is_closed)
1638		rd = 1;
1639
1640	if(wr) {
1641		req->cp->tcp_is_reading = 0;
1642		comm_point_stop_listening(req->cp);
1643		comm_point_start_listening(req->cp, -1,
1644			req->cp->tcp_timeout_msec);
1645	} else if(rd) {
1646		req->cp->tcp_is_reading = 1;
1647		comm_point_stop_listening(req->cp);
1648		comm_point_start_listening(req->cp, -1,
1649			req->cp->tcp_timeout_msec);
1650		/* and also read it (from SSL stack buffers), so
1651		 * no event read event is expected since the remainder of
1652		 * the TLS frame is sitting in the buffers. */
1653		req->read_again = 1;
1654	} else {
1655		comm_point_stop_listening(req->cp);
1656		comm_point_start_listening(req->cp, -1,
1657			req->cp->tcp_timeout_msec);
1658		comm_point_listen_for_rw(req->cp, 0, 0);
1659	}
1660}
1661
1662/** remove first item from list of pending results */
1663static struct tcp_req_done_item*
1664tcp_req_info_pop_done(struct tcp_req_info* req)
1665{
1666	struct tcp_req_done_item* item;
1667	log_assert(req->num_done_req > 0 && req->done_req_list);
1668	item = req->done_req_list;
1669	lock_basic_lock(&stream_wait_count_lock);
1670	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
1671	lock_basic_unlock(&stream_wait_count_lock);
1672	req->done_req_list = req->done_req_list->next;
1673	req->num_done_req --;
1674	return item;
1675}
1676
1677/** Send given buffer and setup to write */
1678static void
1679tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
1680	size_t len)
1681{
1682	sldns_buffer_clear(req->cp->buffer);
1683	sldns_buffer_write(req->cp->buffer, buf, len);
1684	sldns_buffer_flip(req->cp->buffer);
1685
1686	req->cp->tcp_is_reading = 0; /* we are now writing */
1687}
1688
1689/** pick up the next result and start writing it to the channel */
1690static void
1691tcp_req_pickup_next_result(struct tcp_req_info* req)
1692{
1693	if(req->num_done_req > 0) {
1694		/* unlist the done item from the list of pending results */
1695		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
1696		tcp_req_info_start_write_buf(req, item->buf, item->len);
1697		free(item->buf);
1698		free(item);
1699	}
1700}
1701
1702/** the read channel has closed */
1703int
1704tcp_req_info_handle_read_close(struct tcp_req_info* req)
1705{
1706	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
1707	/* reset byte count for (potential) partial read */
1708	req->cp->tcp_byte_count = 0;
1709	/* if we still have results to write, pick up next and write it */
1710	if(req->num_done_req != 0) {
1711		tcp_req_pickup_next_result(req);
1712		tcp_req_info_setup_listen(req);
1713		return 1;
1714	}
1715	/* if nothing to do, this closes the connection */
1716	if(req->num_open_req == 0 && req->num_done_req == 0)
1717		return 0;
1718	/* otherwise, we must be waiting for dns resolve, wait with timeout */
1719	req->read_is_closed = 1;
1720	tcp_req_info_setup_listen(req);
1721	return 1;
1722}
1723
1724void
1725tcp_req_info_handle_writedone(struct tcp_req_info* req)
1726{
1727	/* back to reading state, we finished this write event */
1728	sldns_buffer_clear(req->cp->buffer);
1729	if(req->num_done_req == 0 && req->read_is_closed) {
1730		/* no more to write and nothing to read, close it */
1731		comm_point_drop_reply(&req->cp->repinfo);
1732		return;
1733	}
1734	req->cp->tcp_is_reading = 1;
1735	/* see if another result needs writing */
1736	tcp_req_pickup_next_result(req);
1737
1738	/* see if there is more to write, if not stop_listening for writing */
1739	/* see if new requests are allowed, if so, start_listening
1740	 * for reading */
1741	tcp_req_info_setup_listen(req);
1742}
1743
1744void
1745tcp_req_info_handle_readdone(struct tcp_req_info* req)
1746{
1747	struct comm_point* c = req->cp;
1748
1749	/* we want to read up several requests, unless there are
1750	 * pending answers */
1751
1752	req->is_drop = 0;
1753	req->is_reply = 0;
1754	req->in_worker_handle = 1;
1755	sldns_buffer_set_limit(req->spool_buffer, 0);
1756	/* handle the current request */
1757	/* this calls the worker handle request routine that could give
1758	 * a cache response, or localdata response, or drop the reply,
1759	 * or schedule a mesh entry for later */
1760	fptr_ok(fptr_whitelist_comm_point(c->callback));
1761	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
1762		req->in_worker_handle = 0;
1763		/* there is an answer, put it up.  It is already in the
1764		 * c->buffer, just send it. */
1765		/* since we were just reading a query, the channel is
1766		 * clear to write to */
1767	send_it:
1768		c->tcp_is_reading = 0;
1769		comm_point_stop_listening(c);
1770		comm_point_start_listening(c, -1, c->tcp_timeout_msec);
1771		return;
1772	}
1773	req->in_worker_handle = 0;
1774	/* it should be waiting in the mesh for recursion.
1775	 * If mesh failed to add a new entry and called commpoint_drop_reply.
1776	 * Then the mesh state has been cleared. */
1777	if(req->is_drop) {
1778		/* the reply has been dropped, stream has been closed. */
1779		return;
1780	}
1781	/* If mesh failed(mallocfail) and called commpoint_send_reply with
1782	 * something like servfail then we pick up that reply below. */
1783	if(req->is_reply) {
1784		goto send_it;
1785	}
1786
1787	sldns_buffer_clear(c->buffer);
1788	/* if pending answers, pick up an answer and start sending it */
1789	tcp_req_pickup_next_result(req);
1790
1791	/* if answers pending, start sending answers */
1792	/* read more requests if we can have more requests */
1793	tcp_req_info_setup_listen(req);
1794}
1795
1796int
1797tcp_req_info_add_meshstate(struct tcp_req_info* req,
1798	struct mesh_area* mesh, struct mesh_state* m)
1799{
1800	struct tcp_req_open_item* item;
1801	log_assert(req && mesh && m);
1802	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
1803	if(!item) return 0;
1804	item->next = req->open_req_list;
1805	item->mesh = mesh;
1806	item->mesh_state = m;
1807	req->open_req_list = item;
1808	req->num_open_req++;
1809	return 1;
1810}
1811
1812/** Add a result to the result list.  At the end. */
1813static int
1814tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
1815{
1816	struct tcp_req_done_item* last = NULL;
1817	struct tcp_req_done_item* item;
1818	size_t space;
1819
1820	/* see if we have space */
1821	space = sizeof(struct tcp_req_done_item) + len;
1822	lock_basic_lock(&stream_wait_count_lock);
1823	if(stream_wait_count + space > stream_wait_max) {
1824		lock_basic_unlock(&stream_wait_count_lock);
1825		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
1826		return 0;
1827	}
1828	stream_wait_count += space;
1829	lock_basic_unlock(&stream_wait_count_lock);
1830
1831	/* find last element */
1832	last = req->done_req_list;
1833	while(last && last->next)
1834		last = last->next;
1835
1836	/* create new element */
1837	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
1838	if(!item) {
1839		log_err("malloc failure, for stream result list");
1840		return 0;
1841	}
1842	item->next = NULL;
1843	item->len = len;
1844	item->buf = memdup(buf, len);
1845	if(!item->buf) {
1846		free(item);
1847		log_err("malloc failure, adding reply to stream result list");
1848		return 0;
1849	}
1850
1851	/* link in */
1852	if(last) last->next = item;
1853	else req->done_req_list = item;
1854	req->num_done_req++;
1855	return 1;
1856}
1857
1858void
1859tcp_req_info_send_reply(struct tcp_req_info* req)
1860{
1861	if(req->in_worker_handle) {
1862		/* reply from mesh is in the spool_buffer */
1863		/* copy now, so that the spool buffer is free for other tasks
1864		 * before the callback is done */
1865		sldns_buffer_clear(req->cp->buffer);
1866		sldns_buffer_write(req->cp->buffer,
1867			sldns_buffer_begin(req->spool_buffer),
1868			sldns_buffer_limit(req->spool_buffer));
1869		sldns_buffer_flip(req->cp->buffer);
1870		req->is_reply = 1;
1871		return;
1872	}
1873	/* now that the query has been handled, that mesh_reply entry
1874	 * should be removed, from the tcp_req_info list,
1875	 * the mesh state cleanup removes then with region_cleanup and
1876	 * replies_sent true. */
1877	/* see if we can send it straight away (we are not doing
1878	 * anything else).  If so, copy to buffer and start */
1879	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
1880		/* buffer is free, and was ready to read new query into,
1881		 * but we are now going to use it to send this answer */
1882		tcp_req_info_start_write_buf(req,
1883			sldns_buffer_begin(req->spool_buffer),
1884			sldns_buffer_limit(req->spool_buffer));
1885		/* switch to listen to write events */
1886		comm_point_stop_listening(req->cp);
1887		comm_point_start_listening(req->cp, -1,
1888			req->cp->tcp_timeout_msec);
1889		return;
1890	}
1891	/* queue up the answer behind the others already pending */
1892	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
1893		sldns_buffer_limit(req->spool_buffer))) {
1894		/* drop the connection, we are out of resources */
1895		comm_point_drop_reply(&req->cp->repinfo);
1896	}
1897}
1898
1899size_t tcp_req_info_get_stream_buffer_size(void)
1900{
1901	size_t s;
1902	if(!stream_wait_lock_inited)
1903		return stream_wait_count;
1904	lock_basic_lock(&stream_wait_count_lock);
1905	s = stream_wait_count;
1906	lock_basic_unlock(&stream_wait_count_lock);
1907	return s;
1908}
1909