1/*
2 * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3 *
4 * Copyright (c) 2007, NLnet Labs. All rights reserved.
5 *
6 * This software is open source.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * Redistributions of source code must retain the above copyright notice,
13 * this list of conditions and the following disclaimer.
14 *
15 * Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 *
19 * Neither the name of the NLNET LABS nor the names of its contributors may
20 * be used to endorse or promote products derived from this software without
21 * specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/**
37 * \file
38 *
39 * This file has functions to get queries from clients.
40 */
41#include "config.h"
42#ifdef HAVE_SYS_TYPES_H
43#  include <sys/types.h>
44#endif
45#include <sys/time.h>
46#include <limits.h>
47#ifdef USE_TCP_FASTOPEN
48#include <netinet/tcp.h>
49#endif
50#include <ctype.h>
51#include "services/listen_dnsport.h"
52#include "services/outside_network.h"
53#include "util/netevent.h"
54#include "util/log.h"
55#include "util/config_file.h"
56#include "util/net_help.h"
57#include "sldns/sbuffer.h"
58#include "sldns/parseutil.h"
59#include "services/mesh.h"
60#include "util/fptr_wlist.h"
61#include "util/locks.h"
62
63#ifdef HAVE_NETDB_H
64#include <netdb.h>
65#endif
66#include <fcntl.h>
67
68#ifdef HAVE_SYS_UN_H
69#include <sys/un.h>
70#endif
71
72#ifdef HAVE_SYSTEMD
73#include <systemd/sd-daemon.h>
74#endif
75
76#ifdef HAVE_IFADDRS_H
77#include <ifaddrs.h>
78#endif
79#ifdef HAVE_NET_IF_H
80#include <net/if.h>
81#endif
82#ifdef HAVE_LINUX_NET_TSTAMP_H
83#include <linux/net_tstamp.h>
84#endif
/** number of queued TCP connections for listen(); used as the backlog
 * argument for the TCP and local (unix) accept sockets in this file */
#define TCP_BACKLOG 256

#ifndef THREADS_DISABLED
/* the locks below are only declared when threads are not disabled */
/** lock on the counter of stream buffer memory */
static lock_basic_type stream_wait_count_lock;
/** lock on the counter of HTTP2 query buffer memory */
static lock_basic_type http2_query_buffer_count_lock;
/** lock on the counter of HTTP2 response buffer memory */
static lock_basic_type http2_response_buffer_count_lock;
#endif
/** size (in bytes) of stream wait buffers, starts at 0 */
static size_t stream_wait_count = 0;
/** is the lock initialised for stream wait buffers (0 means no) */
static int stream_wait_lock_inited = 0;
/** size (in bytes) of HTTP2 query buffers, starts at 0 */
static size_t http2_query_buffer_count = 0;
/** is the lock initialised for HTTP2 query buffers (0 means no) */
static int http2_query_buffer_lock_inited = 0;
/** size (in bytes) of HTTP2 response buffers, starts at 0 */
static size_t http2_response_buffer_count = 0;
/** is the lock initialised for HTTP2 response buffers (0 means no) */
static int http2_response_buffer_lock_inited = 0;
108
109/**
110 * Debug print of the getaddrinfo returned address.
111 * @param addr: the address returned.
112 */
113static void
114verbose_print_addr(struct addrinfo *addr)
115{
116	if(verbosity >= VERB_ALGO) {
117		char buf[100];
118		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
119#ifdef INET6
120		if(addr->ai_family == AF_INET6)
121			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
122				sin6_addr;
123#endif /* INET6 */
124		if(inet_ntop(addr->ai_family, sinaddr, buf,
125			(socklen_t)sizeof(buf)) == 0) {
126			(void)strlcpy(buf, "(null)", sizeof(buf));
127		}
128		buf[sizeof(buf)-1] = 0;
129		verbose(VERB_ALGO, "creating %s%s socket %s %d",
130			addr->ai_socktype==SOCK_DGRAM?"udp":
131			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
132			addr->ai_family==AF_INET?"4":
133			addr->ai_family==AF_INET6?"6":
134			"_otherfam", buf,
135			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
136	}
137}
138
139void
140verbose_print_unbound_socket(struct unbound_socket* ub_sock)
141{
142	if(verbosity >= VERB_ALGO) {
143		log_info("listing of unbound_socket structure:");
144		verbose_print_addr(ub_sock->addr);
145		log_info("s is: %d, fam is: %s, acl: %s", ub_sock->s,
146			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
147			ub_sock->acl?"yes":"no");
148	}
149}
150
#ifdef HAVE_SYSTEMD
/**
 * Look for a socket passed in by systemd socket activation.
 * The descriptors reported by sd_listen_fds() are scanned in order and the
 * first one that sd_is_socket() accepts for the given family, socket type
 * and listen mode is returned. The address is not compared; addr, addrlen
 * and path are only used to make the error message when nothing matches.
 * @param family: address family the descriptor must have.
 * @param socktype: socket type the descriptor must have.
 * @param listen: listening mode to match; should only be used for stream
 *	protocols, for UDP it should be -1.
 * @param addr: address for the error message, or NULL to print path instead.
 * @param addrlen: length of addr.
 * @param path: path for the error message, printed when addr is NULL.
 * @return the file descriptor, or -1 if systemd is not running, the
 *	LISTEN_PID or LISTEN_FDS environment variables are missing, or no
 *	descriptor matches.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		/* sd_is_socket() returns a positive value for a match, 0 for
		 * no match and a negative value on failure; only a positive
		 * result may be treated as a match, otherwise a descriptor
		 * that could not be inspected would be handed out. */
		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype,
			listen) > 0) {
			s = SD_LISTEN_FDS_START + i;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			/* do not hand a NULL pointer to %s */
			log_err("systemd sd_listen_fds(): %s",
				path?path:"(null)");
	}
	return s;
}
#endif
210
/**
 * Create a UDP socket, set its socket options, bind it and make it
 * nonblocking.
 * When compiled with HAVE_SYSTEMD and use_systemd is set, a descriptor from
 * systemd socket activation is used if one is found; such a descriptor is
 * not bound here and IPV6_V6ONLY is not changed on it. Otherwise the socket
 * is made with socket().
 * @param family: address family for the socket (AF_INET or AF_INET6 get
 *	family specific MTU options).
 * @param socktype: socket type passed to socket().
 * @param addr: address to bind to.
 * @param addrlen: length of addr.
 * @param v6only: for AF_INET6 sockets: if nonzero IPV6_V6ONLY is set, to 0
 *	when v6only is 2 and to 1 for other nonzero values.
 * @param inuse: on failure set to 0, except that on non-winsock builds
 *	with EADDRINUSE defined it is set to true when bind() failed with
 *	EADDRINUSE.
 * @param noproto: on failure set to 1 when socket() reports the family or
 *	protocol is unsupported, or when an AF_INET6 bind fails with EINVAL
 *	(non-winsock); set to 0 for the other failures.
 * @param rcv: if nonzero, receive buffer size to request.
 * @param snd: if nonzero, send buffer size to request.
 * @param listen: if nonzero, SO_REUSEADDR, the reuseport option and the
 *	transparent option are applied; also, bind failures with EACCES or
 *	EADDRNOTAVAIL are only logged at low verbosity when listen is set.
 * @param reuseport: if not NULL and nonzero, SO_REUSEPORT (or
 *	SO_REUSEPORT_LB where defined) is attempted; set to 0 if that fails.
 * @param transparent: if nonzero, try IP_TRANSPARENT, IP_BINDANY or
 *	SO_BINDANY, whichever is defined; failure is only a warning.
 * @param freebind: if nonzero, try IP_FREEBIND; failure is only a warning.
 * @param use_systemd: if nonzero, try systemd socket activation first.
 * @param dscp: value passed to set_ip_dscp(); failure is only a warning.
 * @return the socket, or -1 on failure (the socket is closed then).
 */
int
create_udp_sock(int family, int socktype, struct sockaddr* addr,
        socklen_t addrlen, int v6only, int* inuse, int* noproto,
	int rcv, int snd, int listen, int* reuseport, int transparent,
	int freebind, int use_systemd, int dscp)
{
	int s;
	char* err;
#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
	int on=1;
#endif
#ifdef IPV6_MTU
	int mtu = IPV6_MIN_MTU;
#endif
	/* silence unused parameter warnings for options that are not
	 * available on this platform */
#if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
	(void)rcv;
#endif
#if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
	(void)snd;
#endif
#ifndef IPV6_V6ONLY
	(void)v6only;
#endif
#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
	(void)transparent;
#endif
#if !defined(IP_FREEBIND)
	(void)freebind;
#endif
#ifdef HAVE_SYSTEMD
	int got_fd_from_systemd = 0;

	/* with systemd support the socket() call below sits inside this if;
	 * its else branch, further down, notes that systemd gave the fd */
	if (!use_systemd
	    || (use_systemd
		&& (s = systemd_get_activated(family, socktype, -1, addr,
					      addrlen, NULL)) == -1)) {
#else
	(void)use_systemd;
#endif
	if((s = socket(family, socktype, 0)) == -1) {
		*inuse = 0;
#ifndef USE_WINSOCK
		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#else
		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
			WSAGetLastError() == WSAEPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#endif
		log_err("can't create socket: %s", sock_strerror(errno));
		*noproto = 0;
		return -1;
	}
#ifdef HAVE_SYSTEMD
	} else {
		got_fd_from_systemd = 1;
	}
#endif
	if(listen) {
#ifdef SO_REUSEADDR
		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
				sock_strerror(errno));
#ifndef USE_WINSOCK
			/* ENOSYS is tolerated, other errors are fatal */
			if(errno != ENOSYS) {
				close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
#else
			closesocket(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
#endif
		}
#endif /* SO_REUSEADDR */
#ifdef SO_REUSEPORT
#  ifdef SO_REUSEPORT_LB
		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
		 * like SO_REUSEPORT on Linux.  This is what the users want
		 * with the config option in unbound.conf; if we actually
		 * need local address and port reuse they'll also need to
		 * have SO_REUSEPORT set for them, assume it was _LB they want.
		 */
		if (reuseport && *reuseport &&
		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
#ifdef ENOPROTOOPT
			if(errno != ENOPROTOOPT || verbosity >= 3)
				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
					strerror(errno));
#endif
			/* this option is not essential, we can continue */
			*reuseport = 0;
		}
#  else /* no SO_REUSEPORT_LB */

		/* try to set SO_REUSEPORT so that incoming
		 * queries are distributed evenly among the receiving threads.
		 * Each thread must have its own socket bound to the same port,
		 * with SO_REUSEPORT set on each socket.
		 */
		if (reuseport && *reuseport &&
		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
#ifdef ENOPROTOOPT
			if(errno != ENOPROTOOPT || verbosity >= 3)
				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
					strerror(errno));
#endif
			/* this option is not essential, we can continue */
			*reuseport = 0;
		}
#  endif /* SO_REUSEPORT_LB */
#else
		(void)reuseport;
#endif /* defined(SO_REUSEPORT) */
		/* transparent option: failure only gives a warning */
#ifdef IP_TRANSPARENT
		if (transparent &&
		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
		    (socklen_t)sizeof(on)) < 0) {
			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
			strerror(errno));
		}
#elif defined(IP_BINDANY)
		if (transparent &&
		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
			(family==AF_INET6?"V6":""), strerror(errno));
		}
#elif defined(SO_BINDANY)
		if (transparent &&
		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
		    (socklen_t)sizeof(on)) < 0) {
			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
			strerror(errno));
		}
#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
	}
#ifdef IP_FREEBIND
	/* freebind is applied whether or not listen is set */
	if(freebind &&
	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
	    (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
		strerror(errno));
	}
#endif /* IP_FREEBIND */
	if(rcv) {
#ifdef SO_RCVBUF
		int got;
		socklen_t slen = (socklen_t)sizeof(got);
#  ifdef SO_RCVBUFFORCE
		/* Linux specific: try to use root permission to override
		 * system limits on rcvbuf. The limit is stored in
		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
			(socklen_t)sizeof(rcv)) < 0) {
			if(errno != EPERM) {
				log_err("setsockopt(..., SO_RCVBUFFORCE, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
			/* EPERM: fall back to the plain SO_RCVBUF below; when
			 * SO_RCVBUFFORCE is defined that code only runs in
			 * this failure branch */
#  endif /* SO_RCVBUFFORCE */
			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
				(socklen_t)sizeof(rcv)) < 0) {
				log_err("setsockopt(..., SO_RCVBUF, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
			/* check if we got the right thing or if system
			 * reduced to some system max.  Warn if so */
			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
				&slen) >= 0 && got < rcv/2) {
				log_warn("so-rcvbuf %u was not granted. "
					"Got %u. To fix: start with "
					"root permissions(linux) or sysctl "
					"bigger net.core.rmem_max(linux) or "
					"kern.ipc.maxsockbuf(bsd) values.",
					(unsigned)rcv, (unsigned)got);
			}
#  ifdef SO_RCVBUFFORCE
		}
#  endif
#endif /* SO_RCVBUF */
	}
	/* first do RCVBUF as the receive buffer is more important */
	if(snd) {
#ifdef SO_SNDBUF
		int got;
		socklen_t slen = (socklen_t)sizeof(got);
#  ifdef SO_SNDBUFFORCE
		/* Linux specific: try to use root permission to override
		 * system limits on sndbuf. The limit is stored in
		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
			(socklen_t)sizeof(snd)) < 0) {
			if(errno != EPERM) {
				log_err("setsockopt(..., SO_SNDBUFFORCE, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
			/* EPERM: fall back to the plain SO_SNDBUF below; when
			 * SO_SNDBUFFORCE is defined that code only runs in
			 * this failure branch */
#  endif /* SO_SNDBUFFORCE */
			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
				(socklen_t)sizeof(snd)) < 0) {
				log_err("setsockopt(..., SO_SNDBUF, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
			/* check if we got the right thing or if system
			 * reduced to some system max.  Warn if so */
			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
				&slen) >= 0 && got < snd/2) {
				log_warn("so-sndbuf %u was not granted. "
					"Got %u. To fix: start with "
					"root permissions(linux) or sysctl "
					"bigger net.core.wmem_max(linux) or "
					"kern.ipc.maxsockbuf(bsd) values.",
					(unsigned)snd, (unsigned)got);
			}
#  ifdef SO_SNDBUFFORCE
		}
#  endif
#endif /* SO_SNDBUF */
	}
	/* a failure to set the DiffServ codepoint is not fatal */
	err = set_ip_dscp(s, family, dscp);
	if(err != NULL)
		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
	if(family == AF_INET6) {
# if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		int omit6_set = 0;
		int action;
# endif
# if defined(IPV6_V6ONLY)
		if(v6only
#   ifdef HAVE_SYSTEMD
			/* Systemd wants to control if the socket is v6 only
			 * or both, with BindIPv6Only=default, ipv6-only or
			 * both in systemd.socket, so it is not set here. */
			&& !got_fd_from_systemd
#   endif
			) {
			/* v6only 2 turns the option off, other nonzero
			 * values turn it on */
			int val=(v6only==2)?0:1;
			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
				(void*)&val, (socklen_t)sizeof(val)) < 0) {
				log_err("setsockopt(..., IPV6_V6ONLY"
					", ...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
# endif
# if defined(IPV6_USE_MIN_MTU)
		/*
		 * There is no fragmentation of IPv6 datagrams
		 * during forwarding in the network. Therefore
		 * we do not send UDP datagrams larger than
		 * the minimum IPv6 MTU of 1280 octets. The
		 * EDNS0 message length can be larger if the
		 * network stack supports IPV6_USE_MIN_MTU.
		 */
		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
				"...) failed: %s", sock_strerror(errno));
			sock_close(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
		}
# elif defined(IPV6_MTU)
#   ifndef USE_WINSOCK
		/*
		 * On Linux, to send no larger than 1280, the PMTUD is
		 * disabled by default for datagrams anyway, so we set
		 * the MTU to use.
		 */
		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
				sock_strerror(errno));
			sock_close(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
		}
#   elif defined(IPV6_USER_MTU)
		/* As later versions of the mingw crosscompiler define
		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
		 * instead which is writable; IPV6_MTU is readonly there. */
		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
			if (WSAGetLastError() != WSAENOPROTOOPT) {
				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
					wsa_strerror(WSAGetLastError()));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
#   endif /* USE_WINSOCK */
# endif /* IPv6 MTU */
# if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
#  if defined(IP_PMTUDISC_OMIT)
		/* try OMIT first; EINVAL means it is not accepted and the
		 * DONT setting below is used instead */
		action = IP_PMTUDISC_OMIT;
		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
			&action, (socklen_t)sizeof(action)) < 0) {

			if (errno != EINVAL) {
				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
		else
		{
		    omit6_set = 1;
		}
#  endif
		if (omit6_set == 0) {
			action = IP_PMTUDISC_DONT;
			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
				&action, (socklen_t)sizeof(action)) < 0) {
				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
# endif /* IPV6_MTU_DISCOVER */
	} else if(family == AF_INET) {
#  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
/* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
 * PMTU information is not accepted, but fragmentation is allowed
 * if and only if the packet size exceeds the outgoing interface MTU
 * (and also uses the interface mtu to determine the size of the packets).
 * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
 * FreeBSD already has same semantics without setting the option. */
		int omit_set = 0;
		int action;
#   if defined(IP_PMTUDISC_OMIT)
		/* try OMIT first; EINVAL means it is not accepted and the
		 * DONT setting below is used instead */
		action = IP_PMTUDISC_OMIT;
		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
			&action, (socklen_t)sizeof(action)) < 0) {

			if (errno != EINVAL) {
				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
		else
		{
		    omit_set = 1;
		}
#   endif
		if (omit_set == 0) {
   			action = IP_PMTUDISC_DONT;
			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
				&action, (socklen_t)sizeof(action)) < 0) {
				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
#  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
		/* the IP_DONTFRAG option if defined in the 11.0 OSX headers,
		 * but does not work on that version, so we exclude it */
		int off = 0;
		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
			&off, (socklen_t)sizeof(off)) < 0) {
			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
				strerror(errno));
			sock_close(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
		}
#  endif /* IPv4 MTU */
	}
	/* a descriptor received from systemd is not bound here */
	if(
#ifdef HAVE_SYSTEMD
		!got_fd_from_systemd &&
#endif
		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
		*noproto = 0;
		*inuse = 0;
#ifndef USE_WINSOCK
#ifdef EADDRINUSE
		*inuse = (errno == EADDRINUSE);
		/* detect freebsd jail with no ipv6 permission */
		if(family==AF_INET6 && errno==EINVAL)
			*noproto = 1;
		/* EADDRINUSE is not logged here; EACCES and EADDRNOTAVAIL are
		 * only logged if listen is set or verbosity is 4 or more */
		else if(errno != EADDRINUSE &&
			!(errno == EACCES && verbosity < 4 && !listen)
#ifdef EADDRNOTAVAIL
			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
#endif
			) {
			log_err_addr("can't bind socket", strerror(errno),
				(struct sockaddr_storage*)addr, addrlen);
		}
#endif /* EADDRINUSE */
#else /* USE_WINSOCK */
		if(WSAGetLastError() != WSAEADDRINUSE &&
			WSAGetLastError() != WSAEADDRNOTAVAIL &&
			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
			log_err_addr("can't bind socket",
				wsa_strerror(WSAGetLastError()),
				(struct sockaddr_storage*)addr, addrlen);
		}
#endif /* USE_WINSOCK */
		sock_close(s);
		return -1;
	}
	if(!fd_set_nonblock(s)) {
		*noproto = 0;
		*inuse = 0;
		sock_close(s);
		return -1;
	}
	return s;
}
668
/**
 * Create a TCP socket that accepts connections: set its socket options,
 * bind it, make it nonblocking and call listen() with TCP_BACKLOG.
 * When compiled with HAVE_SYSTEMD and use_systemd is set, a descriptor from
 * systemd socket activation is used if one is found. For such a descriptor
 * the nodelay and mss options, IPV6_V6ONLY and bind() are skipped; the other
 * options, the nonblocking mode and listen() are still applied.
 * @param addr: address info with family, socket type and address to bind.
 * @param v6only: if nonzero and the family is AF_INET6, IPV6_V6ONLY is set.
 * @param noproto: set to 0 at the start; set to 1 when socket() reports
 *	the family or protocol is unsupported, or when an AF_INET6 bind fails
 *	with EINVAL (non-winsock).
 * @param reuseport: if not NULL and nonzero, SO_REUSEPORT is attempted;
 *	set to 0 if that fails.
 * @param transparent: if nonzero, try IP_TRANSPARENT, IP_BINDANY or
 *	SO_BINDANY, whichever is defined; failure is only a warning.
 * @param mss: if larger than 0, value to set with TCP_MAXSEG; failure is
 *	logged but not fatal.
 * @param nodelay: if nonzero, set TCP_NODELAY; failure is logged but not
 *	fatal.
 * @param freebind: if nonzero, try IP_FREEBIND; failure is only a warning.
 * @param use_systemd: if nonzero, try systemd socket activation first.
 * @param dscp: value passed to set_ip_dscp(); failure is only a warning.
 * @return the socket, or -1 on failure (the socket is closed then).
 */
int
create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
	int* reuseport, int transparent, int mss, int nodelay, int freebind,
	int use_systemd, int dscp)
{
	int s;
	char* err;
#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
	int on = 1;
#endif
#ifdef HAVE_SYSTEMD
	int got_fd_from_systemd = 0;
#endif
#ifdef USE_TCP_FASTOPEN
	int qlen;
#endif
	/* silence unused parameter warnings for options that are not
	 * available on this platform */
#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
	(void)transparent;
#endif
#if !defined(IP_FREEBIND)
	(void)freebind;
#endif
	verbose_print_addr(addr);
	*noproto = 0;
#ifdef HAVE_SYSTEMD
	/* with systemd support the socket creation and the nodelay and mss
	 * options below sit inside this if; its else branch, further down,
	 * notes that systemd gave the fd */
	if (!use_systemd ||
	    (use_systemd
	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
					   addr->ai_addr, addr->ai_addrlen,
					   NULL)) == -1)) {
#else
	(void)use_systemd;
#endif
	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#ifndef USE_WINSOCK
		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#else
		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
			WSAGetLastError() == WSAEPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#endif
		log_err("can't create socket: %s", sock_strerror(errno));
		return -1;
	}
	if(nodelay) {
#if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
			#ifndef USE_WINSOCK
			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
				strerror(errno));
			#else
			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
				wsa_strerror(WSAGetLastError()));
			#endif
		}
#else
		log_warn(" setsockopt(TCP_NODELAY) unsupported");
#endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
	}
	if (mss > 0) {
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
			(socklen_t)sizeof(mss)) < 0) {
			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
				sock_strerror(errno));
		} else {
			verbose(VERB_ALGO,
				" tcp socket mss set to %d", mss);
		}
#else
		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
#endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
	}
#ifdef HAVE_SYSTEMD
	} else {
		got_fd_from_systemd = 1;
    }
#endif
#ifdef SO_REUSEADDR
	/* unlike the other options below, failure here is fatal */
	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
		(socklen_t)sizeof(on)) < 0) {
		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
			sock_strerror(errno));
		sock_close(s);
		return -1;
	}
#endif /* SO_REUSEADDR */
#ifdef IP_FREEBIND
	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
	    (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
		strerror(errno));
	}
#endif /* IP_FREEBIND */
#ifdef SO_REUSEPORT
	/* try to set SO_REUSEPORT so that incoming
	 * connections are distributed evenly among the receiving threads.
	 * Each thread must have its own socket bound to the same port,
	 * with SO_REUSEPORT set on each socket.
	 */
	if (reuseport && *reuseport &&
		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
		(socklen_t)sizeof(on)) < 0) {
#ifdef ENOPROTOOPT
		if(errno != ENOPROTOOPT || verbosity >= 3)
			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
				strerror(errno));
#endif
		/* this option is not essential, we can continue */
		*reuseport = 0;
	}
#else
	(void)reuseport;
#endif /* defined(SO_REUSEPORT) */
#if defined(IPV6_V6ONLY)
	if(addr->ai_family == AF_INET6 && v6only
#  ifdef HAVE_SYSTEMD
		/* Systemd wants to control if the socket is v6 only
		 * or both, with BindIPv6Only=default, ipv6-only or
		 * both in systemd.socket, so it is not set here. */
		&& !got_fd_from_systemd
#  endif
		) {
		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
				sock_strerror(errno));
			sock_close(s);
			return -1;
		}
	}
#else
	(void)v6only;
#endif /* IPV6_V6ONLY */
	/* transparent option: failure only gives a warning */
#ifdef IP_TRANSPARENT
	if (transparent &&
	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
	    (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
			strerror(errno));
	}
#elif defined(IP_BINDANY)
	if (transparent &&
	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
	}
#elif defined(SO_BINDANY)
	if (transparent &&
	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
	    sizeof(on)) < 0) {
		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
		strerror(errno));
	}
#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
	/* a failure to set the DiffServ codepoint is not fatal */
	err = set_ip_dscp(s, addr->ai_family, dscp);
	if(err != NULL)
		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
	/* a descriptor received from systemd is not bound here */
	if(
#ifdef HAVE_SYSTEMD
		!got_fd_from_systemd &&
#endif
        bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
#ifndef USE_WINSOCK
		/* detect freebsd jail with no ipv6 permission */
		if(addr->ai_family==AF_INET6 && errno==EINVAL)
			*noproto = 1;
		else {
			log_err_addr("can't bind socket", strerror(errno),
				(struct sockaddr_storage*)addr->ai_addr,
				addr->ai_addrlen);
		}
#else
		log_err_addr("can't bind socket",
			wsa_strerror(WSAGetLastError()),
			(struct sockaddr_storage*)addr->ai_addr,
			addr->ai_addrlen);
#endif
		sock_close(s);
		return -1;
	}
	if(!fd_set_nonblock(s)) {
		sock_close(s);
		return -1;
	}
	if(listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", sock_strerror(errno));
		sock_close(s);
		return -1;
	}
#ifdef USE_TCP_FASTOPEN
	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
	   against IP spoofing attacks as suggested in RFC7413 */
#ifdef __APPLE__
	/* OS X implementation only supports qlen of 1 via this call. Actual
	   value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
	qlen = 1;
#else
	/* 5 is recommended on linux */
	qlen = 5;
#endif
	/* a failure to enable TCP Fast Open is logged, the socket is still
	 * returned */
	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
		  sizeof(qlen))) == -1 ) {
#ifdef ENOPROTOOPT
		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
		   disabled, except when verbosity enabled for debugging */
		if(errno != ENOPROTOOPT || verbosity >= 3) {
#endif
		  if(errno == EPERM) {
		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
		  } else {
		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
		  }
#ifdef ENOPROTOOPT
		}
#endif
	}
#endif
	return s;
}
897
/**
 * Set the DiffServ codepoint on a socket.
 * The codepoint is shifted left by two bits and set with IPV6_TCLASS for
 * AF_INET6 sockets and with IP_TOS for every other address family.
 * A dscp of 0 leaves the socket untouched.
 * @param socket: the socket to set the option on.
 * @param addrfamily: address family of the socket.
 * @param dscp: the codepoint, 0 to do nothing.
 * @return NULL on success, otherwise a string that describes the error.
 */
char*
set_ip_dscp(int socket, int addrfamily, int dscp)
{
	int tos;

	if(dscp == 0)
		return NULL;
	tos = dscp << 2;

	if(addrfamily == AF_INET6) {
#ifdef IPV6_TCLASS
		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&tos,
			sizeof(tos)) < 0)
			return sock_strerror(errno);
		return NULL;
#else
		return "IPV6_TCLASS not defined on this system";
#endif
	}

	/* all other address families use the IPv4 option */
	if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&tos, sizeof(tos)) < 0)
		return sock_strerror(errno);
	return NULL;
}
923
/**
 * Create a local (unix domain) stream socket that accepts connections.
 * When compiled with HAVE_SYSTEMD and use_systemd is set, a descriptor from
 * systemd socket activation is returned if one is found. Otherwise a
 * socket is created, an existing file at path is removed, and the socket is
 * bound to path, made nonblocking and put in listen mode with TCP_BACKLOG.
 * Without HAVE_SYS_UN_H local sockets are not supported.
 * @param path: filesystem path of the socket. A path that does not fit in
 *	sun_path is truncated for bind(); a warning is logged in that case.
 * @param noproto: set to 1 if local sockets are not supported on this
 *	platform; not modified otherwise.
 * @param use_systemd: if nonzero, try systemd socket activation first.
 * @return the socket, or -1 on failure.
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif

	/* the systemd lookup is a separate statement, so that the braces
	 * balance for every combination of HAVE_SYSTEMD and HAVE_SYS_UN_H */
#ifdef HAVE_SYSTEMD
	if(use_systemd) {
		int ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1,
			NULL, 0, path);
		if(ret != -1)
			return ret;
	}
#else
	(void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
	verbose(VERB_ALGO, "creating unix socket %s", path);
	/* no uninitialised bytes are passed to bind() */
	memset(&usock, 0, sizeof(usock));
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD */
	if(strlcpy(usock.sun_path, path, sizeof(usock.sun_path)) >=
		sizeof(usock.sun_path)) {
		/* the socket ends up at the truncated name, while the
		 * unlink below uses the full name; make that visible */
		log_warn("local socket path %s is too long, truncated to %s",
			path, usock.sun_path);
	}

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	(void)noproto; /*unused*/
	return s;

err:
	sock_close(s);
	return -1;
#else
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
}
998
999
/**
 * Create socket from getaddrinfo results.
 * The interface name and port are resolved with getaddrinfo, and the first
 * result is used to create a UDP socket (stype SOCK_DGRAM) or a TCP accept
 * socket (other stype values).
 * @param stype: socket type, stored in hints->ai_socktype.
 * @param ifname: interface name for getaddrinfo, may be NULL.
 * @param port: port for getaddrinfo.
 * @param hints: hints for getaddrinfo; the ai_socktype member is modified.
 * @param v6only: passed to the socket creation routine.
 * @param noip6: set to 0 at the start; set to 1 when the socket could not
 *	be made because the protocol is not available and hints asks for
 *	AF_INET6.
 * @param rcv: receive buffer size for UDP, passed on as an int.
 * @param snd: send buffer size for UDP, passed on as an int.
 * @param reuseport: passed to the socket creation routine.
 * @param transparent: passed to the socket creation routine.
 * @param tcp_mss: maximum segment size, used for TCP only.
 * @param nodelay: used for TCP only.
 * @param freebind: passed to the socket creation routine.
 * @param use_systemd: passed to the socket creation routine.
 * @param dscp: passed to the socket creation routine.
 * @param ub_sock: when getaddrinfo succeeded, this is filled in with the
 *	getaddrinfo result (not freed here), the socket, the family from
 *	hints and a NULL acl, also if the socket creation failed.
 * @return the socket, or -1 on failure.
 */
static int
make_sock(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	struct addrinfo *ai = NULL;
	int gai_rc, fd, inuse, noproto;

	hints->ai_socktype = stype;
	*noip6 = 0;

	gai_rc = getaddrinfo(ifname, port, hints, &ai);
	if(gai_rc != 0 || !ai) {
		const char* syserr = "";
#ifdef USE_WINSOCK
		if(gai_rc == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
#ifdef EAI_SYSTEM
		/* for EAI_SYSTEM the real reason is in errno */
		if(gai_rc == EAI_SYSTEM)
			syserr = (char*)strerror(errno);
#endif
		log_err("node %s:%s getaddrinfo: %s %s",
			ifname?ifname:"default", port, gai_strerror(gai_rc),
			syserr);
		return -1;
	}

	if(stype != SOCK_DGRAM) {
		fd = create_tcp_accept_sock(ai, v6only, &noproto, reuseport,
			transparent, tcp_mss, nodelay, freebind, use_systemd,
			dscp);
		if(fd == -1 && noproto && hints->ai_family == AF_INET6)
			*noip6 = 1;
	} else {
		verbose_print_addr(ai);
		fd = create_udp_sock(ai->ai_family, ai->ai_socktype,
			(struct sockaddr*)ai->ai_addr, ai->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd, dscp);
		if(fd == -1) {
			/* an address in use takes precedence over noproto */
			if(inuse)
				log_err("bind: address already in use");
			else if(noproto && hints->ai_family == AF_INET6)
				*noip6 = 1;
		}
	}

	/* record the result, also when the socket could not be made */
	ub_sock->addr = ai;
	ub_sock->s = fd;
	ub_sock->fam = hints->ai_family;
	ub_sock->acl = NULL;

	return fd;
}
1057
/**
 * Make socket, and first see if ifname contains port override info.
 * An ifname of the form ifspec@port is split and the port after the '@'
 * is used instead of the port argument. The remaining arguments are
 * passed to make_sock unchanged.
 * @return the socket, or -1 on failure (with *noip6 set to 0 if the
 *	interface or port part does not fit the local buffers).
 */
static int
make_sock_port(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	char portstr[16];
	char host[128];
	size_t hostlen;
	const char* at = strchr(ifname, '@');

	if(at == NULL) {
		/* no override, use the name and port as given */
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock);
	}

	/* override port with ifspec@port */
	hostlen = (size_t)(at-ifname);
	if(hostlen >= sizeof(host)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	if(strlen(at+1) >= sizeof(portstr)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	/* copy and cut off at the '@' */
	(void)strlcpy(host, ifname, sizeof(host));
	host[hostlen] = 0;
	(void)strlcpy(portstr, at+1, sizeof(portstr));
	portstr[strlen(at+1)] = 0;
	return make_sock(stype, host, portstr, hints, v6only, noip6, rcv,
		snd, reuseport, transparent, tcp_mss, nodelay, freebind,
		use_systemd, dscp, ub_sock);
}
1092
1093/**
1094 * Add port to open ports list.
1095 * @param list: list head. changed.
1096 * @param s: fd.
1097 * @param ftype: if fd is UDP.
1098 * @param pp2_enabled: if PROXYv2 is enabled for this port.
1099 * @param ub_sock: socket with address.
1100 * @return false on failure. list in unchanged then.
1101 */
1102static int
1103port_insert(struct listen_port** list, int s, enum listen_type ftype,
1104	int pp2_enabled, struct unbound_socket* ub_sock)
1105{
1106	struct listen_port* item = (struct listen_port*)malloc(
1107		sizeof(struct listen_port));
1108	if(!item)
1109		return 0;
1110	item->next = *list;
1111	item->fd = s;
1112	item->ftype = ftype;
1113	item->pp2_enabled = pp2_enabled;
1114	item->socket = ub_sock;
1115	*list = item;
1116	return 1;
1117}
1118
/**
 * Set fd to receive software timestamps.
 * @param s: the socket.
 * @return false on failure, or when the platform headers for
 *	timestamping are not available. An error is logged then.
 */
static int
set_recvtimestamp(int s)
{
#ifdef HAVE_LINUX_NET_TSTAMP_H
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	int rc = setsockopt(s, SOL_SOCKET, SO_TIMESTAMPNS, (void*)&flags,
		(socklen_t)sizeof(flags));
	if(rc >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMPNS, ...) failed: %s",
		strerror(errno));
	return 0;
#else
	(void)s;
	log_err("packets timestamping is not supported on this platform");
	return 0;
#endif
}
1137
/**
 * Set fd to receive source address packet info.
 * Which socket option is used depends on what the platform defines;
 * for IPv6 IPV6_RECVPKTINFO is preferred over IPV6_PKTINFO, for IPv4
 * IP_PKTINFO is preferred over IP_RECVDSTADDR.
 * @param s: the socket.
 * @param family: AF_INET6 or AF_INET. For other values nothing is set.
 * @return false on failure (logged), true otherwise.
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int on = 1;
#else
	(void)s;
#endif
	switch(family) {
	case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#endif /* defined IPV6_RECVPKTINFO */

	case AF_INET:
#if defined(IP_PKTINFO)
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#endif /* IP_PKTINFO */

	default:
		/* other families: nothing to set */
		break;
	}
	return 1;
}
1192
1193/** see if interface is ssl, its port number == the ssl port number */
1194static int
1195if_is_ssl(const char* ifname, const char* port, int ssl_port,
1196	struct config_strlist* tls_additional_port)
1197{
1198	struct config_strlist* s;
1199	char* p = strchr(ifname, '@');
1200	if(!p && atoi(port) == ssl_port)
1201		return 1;
1202	if(p && atoi(p+1) == ssl_port)
1203		return 1;
1204	for(s = tls_additional_port; s; s = s->next) {
1205		if(p && atoi(p+1) == atoi(s->str))
1206			return 1;
1207		if(!p && atoi(port) == atoi(s->str))
1208			return 1;
1209	}
1210	return 0;
1211}
1212
1213/**
1214 * Helper for ports_open. Creates one interface (or NULL for default).
1215 * @param ifname: The interface ip address.
1216 * @param do_auto: use automatic interface detection.
1217 * 	If enabled, then ifname must be the wildcard name.
1218 * @param do_udp: if udp should be used.
1219 * @param do_tcp: if tcp should be used.
1220 * @param hints: for getaddrinfo. family and flags have to be set by caller.
1221 * @param port: Port number to use (as string).
1222 * @param list: list of open ports, appended to, changed to point to list head.
1223 * @param rcv: receive buffer size for UDP
1224 * @param snd: send buffer size for UDP
1225 * @param ssl_port: ssl service port number
1226 * @param tls_additional_port: list of additional ssl service port numbers.
1227 * @param https_port: DoH service port number
1228 * @param proxy_protocol_port: list of PROXYv2 port numbers.
1229 * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1230 * 	set to false on exit if reuseport failed due to no kernel support.
1231 * @param transparent: set IP_TRANSPARENT socket option.
1232 * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1233 * @param freebind: set IP_FREEBIND socket option.
1234 * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1235 * @param use_systemd: if true, fetch sockets from systemd.
1236 * @param dnscrypt_port: dnscrypt service port number
1237 * @param dscp: DSCP to use.
1238 * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to
1239 * 	wait to discard if UDP packets have waited for long in the socket
1240 * 	buffer.
1241 * @return: returns false on error.
1242 */
1243static int
1244ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
1245	struct addrinfo *hints, const char* port, struct listen_port** list,
1246	size_t rcv, size_t snd, int ssl_port,
1247	struct config_strlist* tls_additional_port, int https_port,
1248	struct config_strlist* proxy_protocol_port,
1249	int* reuseport, int transparent, int tcp_mss, int freebind,
1250	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp,
1251	int sock_queue_timeout)
1252{
1253	int s, noip6=0;
1254	int is_https = if_is_https(ifname, port, https_port);
1255	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
1256	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
1257	int nodelay = is_https && http2_nodelay;
1258	struct unbound_socket* ub_sock;
1259
1260	if(!do_udp && !do_tcp)
1261		return 0;
1262
1263	if(is_pp2) {
1264		if(is_dnscrypt) {
1265			fatal_exit("PROXYv2 and DNSCrypt combination not "
1266				"supported!");
1267		} else if(is_https) {
1268			fatal_exit("PROXYv2 and DoH combination not "
1269				"supported!");
1270		}
1271	}
1272
1273	if(do_auto) {
1274		ub_sock = calloc(1, sizeof(struct unbound_socket));
1275		if(!ub_sock)
1276			return 0;
1277		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1278			&noip6, rcv, snd, reuseport, transparent,
1279			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1280			if(ub_sock->addr)
1281				freeaddrinfo(ub_sock->addr);
1282			free(ub_sock);
1283			if(noip6) {
1284				log_warn("IPv6 protocol not available");
1285				return 1;
1286			}
1287			return 0;
1288		}
1289		/* getting source addr packet info is highly non-portable */
1290		if(!set_recvpktinfo(s, hints->ai_family)) {
1291			sock_close(s);
1292			if(ub_sock->addr)
1293				freeaddrinfo(ub_sock->addr);
1294			free(ub_sock);
1295			return 0;
1296		}
1297		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1298			log_warn("socket timestamping is not available");
1299		}
1300		if(!port_insert(list, s, is_dnscrypt
1301			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
1302			is_pp2, ub_sock)) {
1303			sock_close(s);
1304			if(ub_sock->addr)
1305				freeaddrinfo(ub_sock->addr);
1306			free(ub_sock);
1307			return 0;
1308		}
1309	} else if(do_udp) {
1310		ub_sock = calloc(1, sizeof(struct unbound_socket));
1311		if(!ub_sock)
1312			return 0;
1313		/* regular udp socket */
1314		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1315			&noip6, rcv, snd, reuseport, transparent,
1316			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1317			if(ub_sock->addr)
1318				freeaddrinfo(ub_sock->addr);
1319			free(ub_sock);
1320			if(noip6) {
1321				log_warn("IPv6 protocol not available");
1322				return 1;
1323			}
1324			return 0;
1325		}
1326		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1327			log_warn("socket timestamping is not available");
1328		}
1329		if(!port_insert(list, s, is_dnscrypt
1330			?listen_type_udp_dnscrypt :
1331			(sock_queue_timeout ?
1332				listen_type_udpancil:listen_type_udp),
1333			is_pp2, ub_sock)) {
1334			sock_close(s);
1335			if(ub_sock->addr)
1336				freeaddrinfo(ub_sock->addr);
1337			free(ub_sock);
1338			return 0;
1339		}
1340	}
1341	if(do_tcp) {
1342		int is_ssl = if_is_ssl(ifname, port, ssl_port,
1343			tls_additional_port);
1344		enum listen_type port_type;
1345		ub_sock = calloc(1, sizeof(struct unbound_socket));
1346		if(!ub_sock)
1347			return 0;
1348		if(is_ssl)
1349			port_type = listen_type_ssl;
1350		else if(is_https)
1351			port_type = listen_type_http;
1352		else if(is_dnscrypt)
1353			port_type = listen_type_tcp_dnscrypt;
1354		else
1355			port_type = listen_type_tcp;
1356		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
1357			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
1358			freebind, use_systemd, dscp, ub_sock)) == -1) {
1359			if(ub_sock->addr)
1360				freeaddrinfo(ub_sock->addr);
1361			free(ub_sock);
1362			if(noip6) {
1363				/*log_warn("IPv6 protocol not available");*/
1364				return 1;
1365			}
1366			return 0;
1367		}
1368		if(is_ssl)
1369			verbose(VERB_ALGO, "setup TCP for SSL service");
1370		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
1371			sock_close(s);
1372			if(ub_sock->addr)
1373				freeaddrinfo(ub_sock->addr);
1374			free(ub_sock);
1375			return 0;
1376		}
1377	}
1378	return 1;
1379}
1380
1381/**
1382 * Add items to commpoint list in front.
1383 * @param c: commpoint to add.
1384 * @param front: listen struct.
1385 * @return: false on failure.
1386 */
1387static int
1388listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1389{
1390	struct listen_list* item = (struct listen_list*)malloc(
1391		sizeof(struct listen_list));
1392	if(!item)
1393		return 0;
1394	item->com = c;
1395	item->next = front->cps;
1396	front->cps = item;
1397	return 1;
1398}
1399
1400void listen_setup_locks(void)
1401{
1402	if(!stream_wait_lock_inited) {
1403		lock_basic_init(&stream_wait_count_lock);
1404		stream_wait_lock_inited = 1;
1405	}
1406	if(!http2_query_buffer_lock_inited) {
1407		lock_basic_init(&http2_query_buffer_count_lock);
1408		http2_query_buffer_lock_inited = 1;
1409	}
1410	if(!http2_response_buffer_lock_inited) {
1411		lock_basic_init(&http2_response_buffer_count_lock);
1412		http2_response_buffer_lock_inited = 1;
1413	}
1414}
1415
1416void listen_desetup_locks(void)
1417{
1418	if(stream_wait_lock_inited) {
1419		stream_wait_lock_inited = 0;
1420		lock_basic_destroy(&stream_wait_count_lock);
1421	}
1422	if(http2_query_buffer_lock_inited) {
1423		http2_query_buffer_lock_inited = 0;
1424		lock_basic_destroy(&http2_query_buffer_count_lock);
1425	}
1426	if(http2_response_buffer_lock_inited) {
1427		http2_response_buffer_lock_inited = 0;
1428		lock_basic_destroy(&http2_response_buffer_count_lock);
1429	}
1430}
1431
1432struct listen_dnsport*
1433listen_create(struct comm_base* base, struct listen_port* ports,
1434	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
1435	int harden_large_queries, uint32_t http_max_streams,
1436	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
1437	void* sslctx, struct dt_env* dtenv, comm_point_callback_type* cb,
1438	void *cb_arg)
1439{
1440	struct listen_dnsport* front = (struct listen_dnsport*)
1441		malloc(sizeof(struct listen_dnsport));
1442	if(!front)
1443		return NULL;
1444	front->cps = NULL;
1445	front->udp_buff = sldns_buffer_new(bufsize);
1446#ifdef USE_DNSCRYPT
1447	front->dnscrypt_udp_buff = NULL;
1448#endif
1449	if(!front->udp_buff) {
1450		free(front);
1451		return NULL;
1452	}
1453
1454	/* create comm points as needed */
1455	while(ports) {
1456		struct comm_point* cp = NULL;
1457		if(ports->ftype == listen_type_udp ||
1458		   ports->ftype == listen_type_udp_dnscrypt) {
1459			cp = comm_point_create_udp(base, ports->fd,
1460				front->udp_buff, ports->pp2_enabled, cb,
1461				cb_arg, ports->socket);
1462		} else if(ports->ftype == listen_type_tcp ||
1463				ports->ftype == listen_type_tcp_dnscrypt) {
1464			cp = comm_point_create_tcp(base, ports->fd,
1465				tcp_accept_count, tcp_idle_timeout,
1466				harden_large_queries, 0, NULL,
1467				tcp_conn_limit, bufsize, front->udp_buff,
1468				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1469				ports->socket);
1470		} else if(ports->ftype == listen_type_ssl ||
1471			ports->ftype == listen_type_http) {
1472			cp = comm_point_create_tcp(base, ports->fd,
1473				tcp_accept_count, tcp_idle_timeout,
1474				harden_large_queries,
1475				http_max_streams, http_endpoint,
1476				tcp_conn_limit, bufsize, front->udp_buff,
1477				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1478				ports->socket);
1479			if(ports->ftype == listen_type_http) {
1480				if(!sslctx && !http_notls) {
1481					log_warn("HTTPS port configured, but "
1482						"no TLS tls-service-key or "
1483						"tls-service-pem set");
1484				}
1485#ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
1486				if(!http_notls) {
1487					log_warn("Unbound is not compiled "
1488						"with an OpenSSL version "
1489						"supporting ALPN "
1490						"(OpenSSL >= 1.0.2). This "
1491						"is required to use "
1492						"DNS-over-HTTPS");
1493				}
1494#endif
1495#ifndef HAVE_NGHTTP2_NGHTTP2_H
1496				log_warn("Unbound is not compiled with "
1497					"nghttp2. This is required to use "
1498					"DNS-over-HTTPS.");
1499#endif
1500			}
1501		} else if(ports->ftype == listen_type_udpancil ||
1502				  ports->ftype == listen_type_udpancil_dnscrypt) {
1503#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
1504			cp = comm_point_create_udp_ancil(base, ports->fd,
1505				front->udp_buff, ports->pp2_enabled, cb,
1506				cb_arg, ports->socket);
1507#else
1508			log_warn("This system does not support UDP ancilliary data.");
1509#endif
1510		}
1511		if(!cp) {
1512			log_err("can't create commpoint");
1513			listen_delete(front);
1514			return NULL;
1515		}
1516		if((http_notls && ports->ftype == listen_type_http) ||
1517			(ports->ftype == listen_type_tcp) ||
1518			(ports->ftype == listen_type_udp) ||
1519			(ports->ftype == listen_type_udpancil) ||
1520			(ports->ftype == listen_type_tcp_dnscrypt) ||
1521			(ports->ftype == listen_type_udp_dnscrypt) ||
1522			(ports->ftype == listen_type_udpancil_dnscrypt))
1523			cp->ssl = NULL;
1524		else
1525			cp->ssl = sslctx;
1526		cp->dtenv = dtenv;
1527		cp->do_not_close = 1;
1528#ifdef USE_DNSCRYPT
1529		if (ports->ftype == listen_type_udp_dnscrypt ||
1530			ports->ftype == listen_type_tcp_dnscrypt ||
1531			ports->ftype == listen_type_udpancil_dnscrypt) {
1532			cp->dnscrypt = 1;
1533			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
1534			if(!cp->dnscrypt_buffer) {
1535				log_err("can't alloc dnscrypt_buffer");
1536				comm_point_delete(cp);
1537				listen_delete(front);
1538				return NULL;
1539			}
1540			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
1541		}
1542#endif
1543		if(!listen_cp_insert(cp, front)) {
1544			log_err("malloc failed");
1545			comm_point_delete(cp);
1546			listen_delete(front);
1547			return NULL;
1548		}
1549		ports = ports->next;
1550	}
1551	if(!front->cps) {
1552		log_err("Could not open sockets to accept queries.");
1553		listen_delete(front);
1554		return NULL;
1555	}
1556
1557	return front;
1558}
1559
1560void
1561listen_list_delete(struct listen_list* list)
1562{
1563	struct listen_list *p = list, *pn;
1564	while(p) {
1565		pn = p->next;
1566		comm_point_delete(p->com);
1567		free(p);
1568		p = pn;
1569	}
1570}
1571
1572void
1573listen_delete(struct listen_dnsport* front)
1574{
1575	if(!front)
1576		return;
1577	listen_list_delete(front->cps);
1578#ifdef USE_DNSCRYPT
1579	if(front->dnscrypt_udp_buff &&
1580		front->udp_buff != front->dnscrypt_udp_buff) {
1581		sldns_buffer_free(front->dnscrypt_udp_buff);
1582	}
1583#endif
1584	sldns_buffer_free(front->udp_buff);
1585	free(front);
1586}
1587
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of a string to a growing array of strings.
 * @param ip_addresses: the array, reallocated to hold one more item.
 * @param ip_addresses_size: number of items, incremented on success.
 * @param str: string to strdup into the array.
 * @return false if out of memory (logged). The count is not changed then.
 */
static int
ifa_append_str(char ***ip_addresses, int *ip_addresses_size, const char *str)
{
	void *grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	(*ip_addresses)[*ip_addresses_size] = strdup(str);
	if(!(*ip_addresses)[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Resolve an interface name to the addresses of that interface.
 * For every entry in ifas whose name equals search_ifa (the part before
 * the last '@', if there is one) and that has an IPv4 (or, with INET6,
 * IPv6) address, the address in text form is appended to ip_addresses.
 * An '@...' suffix of search_ifa is copied after the address. If nothing
 * matched, search_ifa itself is appended.
 * @param ifas: list from getifaddrs.
 * @param search_ifa: interface name, optionally with @port.
 * @param ip_addresses: array of allocated strings, appended to.
 * @param ip_addresses_size: number of items in the array, updated.
 * @return false on failure (inet_ntop failed or out of memory).
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	struct ifaddrs *ifa;
	const int size_before = *ip_addresses_size;
	/* the '@port' tail of search_ifa, or "" if it has none */
	const char* suffix = strrchr(search_ifa, '@');
	/* length of the interface name part of search_ifa */
	size_t namelen;

	if(suffix != NULL) {
		namelen = (size_t)(suffix-search_ifa);
	} else {
		namelen = strlen(search_ifa);
		suffix = "";
	}

	for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
		sa_family_t family;
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

		/* the interface name has to match exactly */
		if(strlen(ifa->ifa_name) != namelen
		   || strncmp(ifa->ifa_name, search_ifa, namelen) != 0)
			continue;

		if(ifa->ifa_addr == NULL)
			continue;

		family = ifa->ifa_addr->sa_family;
		switch(family) {
		case AF_INET: {
			char a4[INET_ADDRSTRLEN + 1];
			struct sockaddr_in *in4 = (struct sockaddr_in *)
				ifa->ifa_addr;
			if(!inet_ntop(family, &in4->sin_addr, a4, sizeof(a4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				a4, suffix);
			break;
		}
#ifdef INET6
		case AF_INET6: {
			struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
				ifa->ifa_addr;
			char a6[INET6_ADDRSTRLEN + 1];
			char if_index_name[IF_NAMESIZE + 1];
			if_index_name[0] = 0;
			if(!inet_ntop(family, &in6->sin6_addr, a6, sizeof(a6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			(void)if_indextoname(in6->sin6_scope_id,
				(char *)if_index_name);
			/* add the %scope only if a name was found for it */
			if(if_index_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", a6, if_index_name, suffix);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					a6, suffix);
			}
			break;
		}
#endif
		default:
			/* skip other address families */
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!ifa_append_str(ip_addresses, ip_addresses_size, addr_buf))
			return 0;
	}

	/* nothing found for the name, keep the name itself */
	if(*ip_addresses_size == size_before) {
		if(!ifa_append_str(ip_addresses, ip_addresses_size,
			search_ifa))
			return 0;
	}
	return 1;
}
#endif /* HAVE_GETIFADDRS */
1691
1692int resolve_interface_names(char** ifs, int num_ifs,
1693	struct config_strlist* list, char*** resif, int* num_resif)
1694{
1695#ifdef HAVE_GETIFADDRS
1696	struct ifaddrs *addrs = NULL;
1697	if(num_ifs == 0 && list == NULL) {
1698		*resif = NULL;
1699		*num_resif = 0;
1700		return 1;
1701	}
1702	if(getifaddrs(&addrs) == -1) {
1703		log_err("failed to list interfaces: getifaddrs: %s",
1704			strerror(errno));
1705		freeifaddrs(addrs);
1706		return 0;
1707	}
1708	if(ifs) {
1709		int i;
1710		for(i=0; i<num_ifs; i++) {
1711			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
1712				freeifaddrs(addrs);
1713				config_del_strarray(*resif, *num_resif);
1714				*resif = NULL;
1715				*num_resif = 0;
1716				return 0;
1717			}
1718		}
1719	}
1720	if(list) {
1721		struct config_strlist* p;
1722		for(p = list; p; p = p->next) {
1723			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
1724				freeifaddrs(addrs);
1725				config_del_strarray(*resif, *num_resif);
1726				*resif = NULL;
1727				*num_resif = 0;
1728				return 0;
1729			}
1730}
1731	}
1732	freeifaddrs(addrs);
1733	return 1;
1734#else
1735	struct config_strlist* p;
1736	if(num_ifs == 0 && list == NULL) {
1737		*resif = NULL;
1738		*num_resif = 0;
1739		return 1;
1740	}
1741	*num_resif = num_ifs;
1742	for(p = list; p; p = p->next) {
1743		(*num_resif)++;
1744	}
1745	*resif = calloc(*num_resif, sizeof(**resif));
1746	if(!*resif) {
1747		log_err("out of memory");
1748		return 0;
1749	}
1750	if(ifs) {
1751		int i;
1752		for(i=0; i<num_ifs; i++) {
1753			(*resif)[i] = strdup(ifs[i]);
1754			if(!((*resif)[i])) {
1755				log_err("out of memory");
1756				config_del_strarray(*resif, *num_resif);
1757				*resif = NULL;
1758				*num_resif = 0;
1759				return 0;
1760			}
1761		}
1762	}
1763	if(list) {
1764		int idx = num_ifs;
1765		for(p = list; p; p = p->next) {
1766			(*resif)[idx] = strdup(p->str);
1767			if(!((*resif)[idx])) {
1768				log_err("out of memory");
1769				config_del_strarray(*resif, *num_resif);
1770				*resif = NULL;
1771				*num_resif = 0;
1772				return 0;
1773			}
1774			idx++;
1775		}
1776	}
1777	return 1;
1778#endif /* HAVE_GETIFADDRS */
1779}
1780
1781struct listen_port*
1782listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
1783	int* reuseport)
1784{
1785	struct listen_port* list = NULL;
1786	struct addrinfo hints;
1787	int i, do_ip4, do_ip6;
1788	int do_tcp, do_auto;
1789	char portbuf[32];
1790	snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1791	do_ip4 = cfg->do_ip4;
1792	do_ip6 = cfg->do_ip6;
1793	do_tcp = cfg->do_tcp;
1794	do_auto = cfg->if_automatic && cfg->do_udp;
1795	if(cfg->incoming_num_tcp == 0)
1796		do_tcp = 0;
1797
1798	/* getaddrinfo */
1799	memset(&hints, 0, sizeof(hints));
1800	hints.ai_flags = AI_PASSIVE;
1801	/* no name lookups on our listening ports */
1802	if(num_ifs > 0)
1803		hints.ai_flags |= AI_NUMERICHOST;
1804	hints.ai_family = AF_UNSPEC;
1805#ifndef INET6
1806	do_ip6 = 0;
1807#endif
1808	if(!do_ip4 && !do_ip6) {
1809		return NULL;
1810	}
1811	/* create ip4 and ip6 ports so that return addresses are nice. */
1812	if(do_auto || num_ifs == 0) {
1813		if(do_auto && cfg->if_automatic_ports &&
1814			cfg->if_automatic_ports[0]!=0) {
1815			char* now = cfg->if_automatic_ports;
1816			while(now && *now) {
1817				char* after;
1818				int extraport;
1819				while(isspace((unsigned char)*now))
1820					now++;
1821				if(!*now)
1822					break;
1823				after = now;
1824				extraport = (int)strtol(now, &after, 10);
1825				if(extraport < 0 || extraport > 65535) {
1826					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1827					listening_ports_free(list);
1828					return NULL;
1829				}
1830				if(extraport == 0 && now == after) {
1831					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1832					listening_ports_free(list);
1833					return NULL;
1834				}
1835				now = after;
1836				snprintf(portbuf, sizeof(portbuf), "%d", extraport);
1837				if(do_ip6) {
1838					hints.ai_family = AF_INET6;
1839					if(!ports_create_if("::0",
1840						do_auto, cfg->do_udp, do_tcp,
1841						&hints, portbuf, &list,
1842						cfg->so_rcvbuf, cfg->so_sndbuf,
1843						cfg->ssl_port, cfg->tls_additional_port,
1844						cfg->https_port,
1845						cfg->proxy_protocol_port,
1846						reuseport, cfg->ip_transparent,
1847						cfg->tcp_mss, cfg->ip_freebind,
1848						cfg->http_nodelay, cfg->use_systemd,
1849						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1850						listening_ports_free(list);
1851						return NULL;
1852					}
1853				}
1854				if(do_ip4) {
1855					hints.ai_family = AF_INET;
1856					if(!ports_create_if("0.0.0.0",
1857						do_auto, cfg->do_udp, do_tcp,
1858						&hints, portbuf, &list,
1859						cfg->so_rcvbuf, cfg->so_sndbuf,
1860						cfg->ssl_port, cfg->tls_additional_port,
1861						cfg->https_port,
1862						cfg->proxy_protocol_port,
1863						reuseport, cfg->ip_transparent,
1864						cfg->tcp_mss, cfg->ip_freebind,
1865						cfg->http_nodelay, cfg->use_systemd,
1866						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1867						listening_ports_free(list);
1868						return NULL;
1869					}
1870				}
1871			}
1872			return list;
1873		}
1874		if(do_ip6) {
1875			hints.ai_family = AF_INET6;
1876			if(!ports_create_if(do_auto?"::0":"::1",
1877				do_auto, cfg->do_udp, do_tcp,
1878				&hints, portbuf, &list,
1879				cfg->so_rcvbuf, cfg->so_sndbuf,
1880				cfg->ssl_port, cfg->tls_additional_port,
1881				cfg->https_port, cfg->proxy_protocol_port,
1882				reuseport, cfg->ip_transparent,
1883				cfg->tcp_mss, cfg->ip_freebind,
1884				cfg->http_nodelay, cfg->use_systemd,
1885				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1886				listening_ports_free(list);
1887				return NULL;
1888			}
1889		}
1890		if(do_ip4) {
1891			hints.ai_family = AF_INET;
1892			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
1893				do_auto, cfg->do_udp, do_tcp,
1894				&hints, portbuf, &list,
1895				cfg->so_rcvbuf, cfg->so_sndbuf,
1896				cfg->ssl_port, cfg->tls_additional_port,
1897				cfg->https_port, cfg->proxy_protocol_port,
1898				reuseport, cfg->ip_transparent,
1899				cfg->tcp_mss, cfg->ip_freebind,
1900				cfg->http_nodelay, cfg->use_systemd,
1901				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1902				listening_ports_free(list);
1903				return NULL;
1904			}
1905		}
1906	} else for(i = 0; i<num_ifs; i++) {
1907		if(str_is_ip6(ifs[i])) {
1908			if(!do_ip6)
1909				continue;
1910			hints.ai_family = AF_INET6;
1911			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1912				do_tcp, &hints, portbuf, &list,
1913				cfg->so_rcvbuf, cfg->so_sndbuf,
1914				cfg->ssl_port, cfg->tls_additional_port,
1915				cfg->https_port, cfg->proxy_protocol_port,
1916				reuseport, cfg->ip_transparent,
1917				cfg->tcp_mss, cfg->ip_freebind,
1918				cfg->http_nodelay, cfg->use_systemd,
1919				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1920				listening_ports_free(list);
1921				return NULL;
1922			}
1923		} else {
1924			if(!do_ip4)
1925				continue;
1926			hints.ai_family = AF_INET;
1927			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1928				do_tcp, &hints, portbuf, &list,
1929				cfg->so_rcvbuf, cfg->so_sndbuf,
1930				cfg->ssl_port, cfg->tls_additional_port,
1931				cfg->https_port, cfg->proxy_protocol_port,
1932				reuseport, cfg->ip_transparent,
1933				cfg->tcp_mss, cfg->ip_freebind,
1934				cfg->http_nodelay, cfg->use_systemd,
1935				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1936				listening_ports_free(list);
1937				return NULL;
1938			}
1939		}
1940	}
1941
1942	return list;
1943}
1944
1945void listening_ports_free(struct listen_port* list)
1946{
1947	struct listen_port* nx;
1948	while(list) {
1949		nx = list->next;
1950		if(list->fd != -1) {
1951			sock_close(list->fd);
1952		}
1953		/* rc_ports don't have ub_socket */
1954		if(list->socket) {
1955			if(list->socket->addr)
1956				freeaddrinfo(list->socket->addr);
1957			free(list->socket);
1958		}
1959		free(list);
1960		list = nx;
1961	}
1962}
1963
1964size_t listen_get_mem(struct listen_dnsport* listen)
1965{
1966	struct listen_list* p;
1967	size_t s = sizeof(*listen) + sizeof(*listen->base) +
1968		sizeof(*listen->udp_buff) +
1969		sldns_buffer_capacity(listen->udp_buff);
1970#ifdef USE_DNSCRYPT
1971	s += sizeof(*listen->dnscrypt_udp_buff);
1972	if(listen->udp_buff != listen->dnscrypt_udp_buff){
1973		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1974	}
1975#endif
1976	for(p = listen->cps; p; p = p->next) {
1977		s += sizeof(*p);
1978		s += comm_point_get_mem(p->com);
1979	}
1980	return s;
1981}
1982
1983void listen_stop_accept(struct listen_dnsport* listen)
1984{
1985	/* do not stop the ones that have no tcp_free list
1986	 * (they have already stopped listening) */
1987	struct listen_list* p;
1988	for(p=listen->cps; p; p=p->next) {
1989		if(p->com->type == comm_tcp_accept &&
1990			p->com->tcp_free != NULL) {
1991			comm_point_stop_listening(p->com);
1992		}
1993	}
1994}
1995
1996void listen_start_accept(struct listen_dnsport* listen)
1997{
1998	/* do not start the ones that have no tcp_free list, it is no
1999	 * use to listen to them because they have no free tcp handlers */
2000	struct listen_list* p;
2001	for(p=listen->cps; p; p=p->next) {
2002		if(p->com->type == comm_tcp_accept &&
2003			p->com->tcp_free != NULL) {
2004			comm_point_start_listening(p->com, -1, -1);
2005		}
2006	}
2007}
2008
2009struct tcp_req_info*
2010tcp_req_info_create(struct sldns_buffer* spoolbuf)
2011{
2012	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
2013	if(!req) {
2014		log_err("malloc failure for new stream outoforder processing structure");
2015		return NULL;
2016	}
2017	memset(req, 0, sizeof(*req));
2018	req->spool_buffer = spoolbuf;
2019	return req;
2020}
2021
/**
 * Delete a tcp_req_info structure: its lists are cleared with
 * tcp_req_info_clear and the structure is freed.
 * @param req: structure to delete, NULL is ignored.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req == NULL)
		return;
	tcp_req_info_clear(req);
	/* cp is pointer back to commpoint that owns this struct and
	 * called delete on us */
	/* spool_buffer is shared udp buffer, not deleted here */
	free(req);
}
2032
2033void tcp_req_info_clear(struct tcp_req_info* req)
2034{
2035	struct tcp_req_open_item* open, *nopen;
2036	struct tcp_req_done_item* item, *nitem;
2037	if(!req) return;
2038
2039	/* free outstanding request mesh reply entries */
2040	open = req->open_req_list;
2041	while(open) {
2042		nopen = open->next;
2043		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
2044		free(open);
2045		open = nopen;
2046	}
2047	req->open_req_list = NULL;
2048	req->num_open_req = 0;
2049
2050	/* free pending writable result packets */
2051	item = req->done_req_list;
2052	while(item) {
2053		nitem = item->next;
2054		lock_basic_lock(&stream_wait_count_lock);
2055		stream_wait_count -= (sizeof(struct tcp_req_done_item)
2056			+item->len);
2057		lock_basic_unlock(&stream_wait_count_lock);
2058		free(item->buf);
2059		free(item);
2060		item = nitem;
2061	}
2062	req->done_req_list = NULL;
2063	req->num_done_req = 0;
2064	req->read_is_closed = 0;
2065}
2066
2067void
2068tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
2069{
2070	struct tcp_req_open_item* open, *prev = NULL;
2071	if(!req || !m) return;
2072	open = req->open_req_list;
2073	while(open) {
2074		if(open->mesh_state == m) {
2075			struct tcp_req_open_item* next;
2076			if(prev) prev->next = open->next;
2077			else req->open_req_list = open->next;
2078			/* caller has to manage the mesh state reply entry */
2079			next = open->next;
2080			free(open);
2081			req->num_open_req --;
2082
2083			/* prev = prev; */
2084			open = next;
2085			continue;
2086		}
2087		prev = open;
2088		open = open->next;
2089	}
2090}
2091
2092/** setup listening for read or write */
2093static void
2094tcp_req_info_setup_listen(struct tcp_req_info* req)
2095{
2096	int wr = 0;
2097	int rd = 0;
2098
2099	if(req->cp->tcp_byte_count != 0) {
2100		/* cannot change, halfway through */
2101		return;
2102	}
2103
2104	if(!req->cp->tcp_is_reading)
2105		wr = 1;
2106	if(!req->read_is_closed)
2107		rd = 1;
2108
2109	if(wr) {
2110		req->cp->tcp_is_reading = 0;
2111		comm_point_stop_listening(req->cp);
2112		comm_point_start_listening(req->cp, -1,
2113			adjusted_tcp_timeout(req->cp));
2114	} else if(rd) {
2115		req->cp->tcp_is_reading = 1;
2116		comm_point_stop_listening(req->cp);
2117		comm_point_start_listening(req->cp, -1,
2118			adjusted_tcp_timeout(req->cp));
2119		/* and also read it (from SSL stack buffers), so
2120		 * no event read event is expected since the remainder of
2121		 * the TLS frame is sitting in the buffers. */
2122		req->read_again = 1;
2123	} else {
2124		comm_point_stop_listening(req->cp);
2125		comm_point_start_listening(req->cp, -1,
2126			adjusted_tcp_timeout(req->cp));
2127		comm_point_listen_for_rw(req->cp, 0, 0);
2128	}
2129}
2130
2131/** remove first item from list of pending results */
2132static struct tcp_req_done_item*
2133tcp_req_info_pop_done(struct tcp_req_info* req)
2134{
2135	struct tcp_req_done_item* item;
2136	log_assert(req->num_done_req > 0 && req->done_req_list);
2137	item = req->done_req_list;
2138	lock_basic_lock(&stream_wait_count_lock);
2139	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
2140	lock_basic_unlock(&stream_wait_count_lock);
2141	req->done_req_list = req->done_req_list->next;
2142	req->num_done_req --;
2143	return item;
2144}
2145
2146/** Send given buffer and setup to write */
2147static void
2148tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
2149	size_t len)
2150{
2151	sldns_buffer_clear(req->cp->buffer);
2152	sldns_buffer_write(req->cp->buffer, buf, len);
2153	sldns_buffer_flip(req->cp->buffer);
2154
2155	req->cp->tcp_is_reading = 0; /* we are now writing */
2156}
2157
2158/** pick up the next result and start writing it to the channel */
2159static void
2160tcp_req_pickup_next_result(struct tcp_req_info* req)
2161{
2162	if(req->num_done_req > 0) {
2163		/* unlist the done item from the list of pending results */
2164		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
2165		tcp_req_info_start_write_buf(req, item->buf, item->len);
2166		free(item->buf);
2167		free(item);
2168	}
2169}
2170
2171/** the read channel has closed */
2172int
2173tcp_req_info_handle_read_close(struct tcp_req_info* req)
2174{
2175	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
2176	/* reset byte count for (potential) partial read */
2177	req->cp->tcp_byte_count = 0;
2178	/* if we still have results to write, pick up next and write it */
2179	if(req->num_done_req != 0) {
2180		tcp_req_pickup_next_result(req);
2181		tcp_req_info_setup_listen(req);
2182		return 1;
2183	}
2184	/* if nothing to do, this closes the connection */
2185	if(req->num_open_req == 0 && req->num_done_req == 0)
2186		return 0;
2187	/* otherwise, we must be waiting for dns resolve, wait with timeout */
2188	req->read_is_closed = 1;
2189	tcp_req_info_setup_listen(req);
2190	return 1;
2191}
2192
2193void
2194tcp_req_info_handle_writedone(struct tcp_req_info* req)
2195{
2196	/* back to reading state, we finished this write event */
2197	sldns_buffer_clear(req->cp->buffer);
2198	if(req->num_done_req == 0 && req->read_is_closed) {
2199		/* no more to write and nothing to read, close it */
2200		comm_point_drop_reply(&req->cp->repinfo);
2201		return;
2202	}
2203	req->cp->tcp_is_reading = 1;
2204	/* see if another result needs writing */
2205	tcp_req_pickup_next_result(req);
2206
2207	/* see if there is more to write, if not stop_listening for writing */
2208	/* see if new requests are allowed, if so, start_listening
2209	 * for reading */
2210	tcp_req_info_setup_listen(req);
2211}
2212
2213void
2214tcp_req_info_handle_readdone(struct tcp_req_info* req)
2215{
2216	struct comm_point* c = req->cp;
2217
2218	/* we want to read up several requests, unless there are
2219	 * pending answers */
2220
2221	req->is_drop = 0;
2222	req->is_reply = 0;
2223	req->in_worker_handle = 1;
2224	sldns_buffer_set_limit(req->spool_buffer, 0);
2225	/* handle the current request */
2226	/* this calls the worker handle request routine that could give
2227	 * a cache response, or localdata response, or drop the reply,
2228	 * or schedule a mesh entry for later */
2229	fptr_ok(fptr_whitelist_comm_point(c->callback));
2230	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
2231		req->in_worker_handle = 0;
2232		/* there is an answer, put it up.  It is already in the
2233		 * c->buffer, just send it. */
2234		/* since we were just reading a query, the channel is
2235		 * clear to write to */
2236	send_it:
2237		c->tcp_is_reading = 0;
2238		comm_point_stop_listening(c);
2239		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
2240		return;
2241	}
2242	req->in_worker_handle = 0;
2243	/* it should be waiting in the mesh for recursion.
2244	 * If mesh failed to add a new entry and called commpoint_drop_reply.
2245	 * Then the mesh state has been cleared. */
2246	if(req->is_drop) {
2247		/* the reply has been dropped, stream has been closed. */
2248		return;
2249	}
2250	/* If mesh failed(mallocfail) and called commpoint_send_reply with
2251	 * something like servfail then we pick up that reply below. */
2252	if(req->is_reply) {
2253		goto send_it;
2254	}
2255
2256	sldns_buffer_clear(c->buffer);
2257	/* if pending answers, pick up an answer and start sending it */
2258	tcp_req_pickup_next_result(req);
2259
2260	/* if answers pending, start sending answers */
2261	/* read more requests if we can have more requests */
2262	tcp_req_info_setup_listen(req);
2263}
2264
2265int
2266tcp_req_info_add_meshstate(struct tcp_req_info* req,
2267	struct mesh_area* mesh, struct mesh_state* m)
2268{
2269	struct tcp_req_open_item* item;
2270	log_assert(req && mesh && m);
2271	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
2272	if(!item) return 0;
2273	item->next = req->open_req_list;
2274	item->mesh = mesh;
2275	item->mesh_state = m;
2276	req->open_req_list = item;
2277	req->num_open_req++;
2278	return 1;
2279}
2280
2281/** Add a result to the result list.  At the end. */
2282static int
2283tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
2284{
2285	struct tcp_req_done_item* last = NULL;
2286	struct tcp_req_done_item* item;
2287	size_t space;
2288
2289	/* see if we have space */
2290	space = sizeof(struct tcp_req_done_item) + len;
2291	lock_basic_lock(&stream_wait_count_lock);
2292	if(stream_wait_count + space > stream_wait_max) {
2293		lock_basic_unlock(&stream_wait_count_lock);
2294		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
2295		return 0;
2296	}
2297	stream_wait_count += space;
2298	lock_basic_unlock(&stream_wait_count_lock);
2299
2300	/* find last element */
2301	last = req->done_req_list;
2302	while(last && last->next)
2303		last = last->next;
2304
2305	/* create new element */
2306	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2307	if(!item) {
2308		log_err("malloc failure, for stream result list");
2309		return 0;
2310	}
2311	item->next = NULL;
2312	item->len = len;
2313	item->buf = memdup(buf, len);
2314	if(!item->buf) {
2315		free(item);
2316		log_err("malloc failure, adding reply to stream result list");
2317		return 0;
2318	}
2319
2320	/* link in */
2321	if(last) last->next = item;
2322	else req->done_req_list = item;
2323	req->num_done_req++;
2324	return 1;
2325}
2326
2327void
2328tcp_req_info_send_reply(struct tcp_req_info* req)
2329{
2330	if(req->in_worker_handle) {
2331		/* reply from mesh is in the spool_buffer */
2332		/* copy now, so that the spool buffer is free for other tasks
2333		 * before the callback is done */
2334		sldns_buffer_clear(req->cp->buffer);
2335		sldns_buffer_write(req->cp->buffer,
2336			sldns_buffer_begin(req->spool_buffer),
2337			sldns_buffer_limit(req->spool_buffer));
2338		sldns_buffer_flip(req->cp->buffer);
2339		req->is_reply = 1;
2340		return;
2341	}
2342	/* now that the query has been handled, that mesh_reply entry
2343	 * should be removed, from the tcp_req_info list,
2344	 * the mesh state cleanup removes then with region_cleanup and
2345	 * replies_sent true. */
2346	/* see if we can send it straight away (we are not doing
2347	 * anything else).  If so, copy to buffer and start */
2348	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2349		/* buffer is free, and was ready to read new query into,
2350		 * but we are now going to use it to send this answer */
2351		tcp_req_info_start_write_buf(req,
2352			sldns_buffer_begin(req->spool_buffer),
2353			sldns_buffer_limit(req->spool_buffer));
2354		/* switch to listen to write events */
2355		comm_point_stop_listening(req->cp);
2356		comm_point_start_listening(req->cp, -1,
2357			adjusted_tcp_timeout(req->cp));
2358		return;
2359	}
2360	/* queue up the answer behind the others already pending */
2361	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2362		sldns_buffer_limit(req->spool_buffer))) {
2363		/* drop the connection, we are out of resources */
2364		comm_point_drop_reply(&req->cp->repinfo);
2365	}
2366}
2367
2368size_t tcp_req_info_get_stream_buffer_size(void)
2369{
2370	size_t s;
2371	if(!stream_wait_lock_inited)
2372		return stream_wait_count;
2373	lock_basic_lock(&stream_wait_count_lock);
2374	s = stream_wait_count;
2375	lock_basic_unlock(&stream_wait_count_lock);
2376	return s;
2377}
2378
2379size_t http2_get_query_buffer_size(void)
2380{
2381	size_t s;
2382	if(!http2_query_buffer_lock_inited)
2383		return http2_query_buffer_count;
2384	lock_basic_lock(&http2_query_buffer_count_lock);
2385	s = http2_query_buffer_count;
2386	lock_basic_unlock(&http2_query_buffer_count_lock);
2387	return s;
2388}
2389
2390size_t http2_get_response_buffer_size(void)
2391{
2392	size_t s;
2393	if(!http2_response_buffer_lock_inited)
2394		return http2_response_buffer_count;
2395	lock_basic_lock(&http2_response_buffer_count_lock);
2396	s = http2_response_buffer_count;
2397	lock_basic_unlock(&http2_response_buffer_count_lock);
2398	return s;
2399}
2400
2401#ifdef HAVE_NGHTTP2
2402/** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
2403static ssize_t http2_submit_response_read_callback(
2404	nghttp2_session* ATTR_UNUSED(session),
2405	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2406	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2407{
2408	struct http2_stream* h2_stream;
2409	struct http2_session* h2_session = source->ptr;
2410	size_t copylen = length;
2411	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2412		h2_session->session, stream_id))) {
2413		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2414			"stream");
2415		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2416	}
2417	if(!h2_stream->rbuffer ||
2418		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2419		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2420			"available in rbuffer");
2421		/* rbuffer will be free'd in frame close cb */
2422		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2423	}
2424
2425	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2426		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2427	if(copylen > SSIZE_MAX)
2428		copylen = SSIZE_MAX; /* will probably never happen */
2429
2430	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2431	sldns_buffer_skip(h2_stream->rbuffer, copylen);
2432
2433	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2434		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2435		lock_basic_lock(&http2_response_buffer_count_lock);
2436		http2_response_buffer_count -=
2437			sldns_buffer_capacity(h2_stream->rbuffer);
2438		lock_basic_unlock(&http2_response_buffer_count_lock);
2439		sldns_buffer_free(h2_stream->rbuffer);
2440		h2_stream->rbuffer = NULL;
2441	}
2442
2443	return copylen;
2444}
2445
2446/**
2447 * Send RST_STREAM frame for stream.
2448 * @param h2_session: http2 session to submit frame to
2449 * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2450 * @return 0 on error, 1 otherwise
2451 */
2452static int http2_submit_rst_stream(struct http2_session* h2_session,
2453		struct http2_stream* h2_stream)
2454{
2455	int ret = nghttp2_submit_rst_stream(h2_session->session,
2456		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2457		NGHTTP2_INTERNAL_ERROR);
2458	if(ret) {
2459		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2460			"error: %s", nghttp2_strerror(ret));
2461		return 0;
2462	}
2463	return 1;
2464}
2465
2466/**
2467 * DNS response ready to be submitted to nghttp2, to be prepared for sending
2468 * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2469 * might be used before this will be sent out.
2470 * @param h2_session: http2 session, containing c->buffer which contains answer
2471 * @return 0 on error, 1 otherwise
2472 */
2473int http2_submit_dns_response(struct http2_session* h2_session)
2474{
2475	int ret;
2476	nghttp2_data_provider data_prd;
2477	char status[4];
2478	nghttp2_nv headers[3];
2479	struct http2_stream* h2_stream = h2_session->c->h2_stream;
2480	size_t rlen;
2481	char rlen_str[32];
2482
2483	if(h2_stream->rbuffer) {
2484		log_err("http2 submit response error: rbuffer already "
2485			"exists");
2486		return 0;
2487	}
2488	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2489		log_err("http2 submit response error: c->buffer not complete");
2490		return 0;
2491	}
2492
2493	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2494		verbose(VERB_QUERY, "http2: submit response error: "
2495			"invalid status");
2496		return 0;
2497	}
2498
2499	rlen = sldns_buffer_remaining(h2_session->c->buffer);
2500	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
2501
2502	lock_basic_lock(&http2_response_buffer_count_lock);
2503	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2504		lock_basic_unlock(&http2_response_buffer_count_lock);
2505		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2506			"in https-response-buffer-size");
2507		return http2_submit_rst_stream(h2_session, h2_stream);
2508	}
2509	http2_response_buffer_count += rlen;
2510	lock_basic_unlock(&http2_response_buffer_count_lock);
2511
2512	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2513		lock_basic_lock(&http2_response_buffer_count_lock);
2514		http2_response_buffer_count -= rlen;
2515		lock_basic_unlock(&http2_response_buffer_count_lock);
2516		log_err("http2 submit response error: malloc failure");
2517		return 0;
2518	}
2519
2520	headers[0].name = (uint8_t*)":status";
2521	headers[0].namelen = 7;
2522	headers[0].value = (uint8_t*)status;
2523	headers[0].valuelen = 3;
2524	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2525
2526	headers[1].name = (uint8_t*)"content-type";
2527	headers[1].namelen = 12;
2528	headers[1].value = (uint8_t*)"application/dns-message";
2529	headers[1].valuelen = 23;
2530	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2531
2532	headers[2].name = (uint8_t*)"content-length";
2533	headers[2].namelen = 14;
2534	headers[2].value = (uint8_t*)rlen_str;
2535	headers[2].valuelen = strlen(rlen_str);
2536	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2537
2538	sldns_buffer_write(h2_stream->rbuffer,
2539		sldns_buffer_current(h2_session->c->buffer),
2540		sldns_buffer_remaining(h2_session->c->buffer));
2541	sldns_buffer_flip(h2_stream->rbuffer);
2542
2543	data_prd.source.ptr = h2_session;
2544	data_prd.read_callback = http2_submit_response_read_callback;
2545	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2546		headers, 3, &data_prd);
2547	if(ret) {
2548		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2549			"error: %s", nghttp2_strerror(ret));
2550		return 0;
2551	}
2552	return 1;
2553}
2554#else
/**
 * Variant of http2_submit_dns_response for builds where HAVE_NGHTTP2 is
 * not defined.  The argument is not used.
 * @return always 0.
 */
int http2_submit_dns_response(void* ATTR_UNUSED(v))
{
	/* nothing can be submitted in this build */
	return 0;
}
2559#endif
2560
2561#ifdef HAVE_NGHTTP2
2562/** HTTP status to descriptive string */
2563static char* http_status_to_str(enum http_status s)
2564{
2565	switch(s) {
2566		case HTTP_STATUS_OK:
2567			return "OK";
2568		case HTTP_STATUS_BAD_REQUEST:
2569			return "Bad Request";
2570		case HTTP_STATUS_NOT_FOUND:
2571			return "Not Found";
2572		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2573			return "Payload Too Large";
2574		case HTTP_STATUS_URI_TOO_LONG:
2575			return "URI Too Long";
2576		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2577			return "Unsupported Media Type";
2578		case HTTP_STATUS_NOT_IMPLEMENTED:
2579			return "Not Implemented";
2580	}
2581	return "Status Unknown";
2582}
2583
2584/** nghttp2 callback. Used to copy error message to nghttp2 session */
2585static ssize_t http2_submit_error_read_callback(
2586	nghttp2_session* ATTR_UNUSED(session),
2587	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2588	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2589{
2590	struct http2_stream* h2_stream;
2591	struct http2_session* h2_session = source->ptr;
2592	char* msg;
2593	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2594		h2_session->session, stream_id))) {
2595		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2596			"stream");
2597		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2598	}
2599	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2600	msg = http_status_to_str(h2_stream->status);
2601	if(length < strlen(msg))
2602		return 0; /* not worth trying over multiple frames */
2603	memcpy(buf, msg, strlen(msg));
2604	return strlen(msg);
2605
2606}
2607
2608/**
2609 * HTTP error response ready to be submitted to nghttp2, to be prepared for
2610 * sending out. Message body will contain descriptive string for HTTP status.
2611 * @param h2_session: http2 session to submit to
2612 * @param h2_stream: http2 stream containing HTTP status to use for error
2613 * @return 0 on error, 1 otherwise
2614 */
2615static int http2_submit_error(struct http2_session* h2_session,
2616	struct http2_stream* h2_stream)
2617{
2618	int ret;
2619	char status[4];
2620	nghttp2_data_provider data_prd;
2621	nghttp2_nv headers[1]; /* will be copied by nghttp */
2622	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2623		verbose(VERB_QUERY, "http2: submit error failed, "
2624			"invalid status");
2625		return 0;
2626	}
2627	headers[0].name = (uint8_t*)":status";
2628	headers[0].namelen = 7;
2629	headers[0].value = (uint8_t*)status;
2630	headers[0].valuelen = 3;
2631	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2632
2633	data_prd.source.ptr = h2_session;
2634	data_prd.read_callback = http2_submit_error_read_callback;
2635
2636	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2637		headers, 1, &data_prd);
2638	if(ret) {
2639		verbose(VERB_QUERY, "http2: submit error failed, "
2640			"error: %s", nghttp2_strerror(ret));
2641		return 0;
2642	}
2643	return 1;
2644}
2645
2646/**
2647 * Start query handling. Query is stored in the stream, and will be free'd here.
2648 * @param h2_session: http2 session, containing comm point
2649 * @param h2_stream: stream containing buffered query
2650 * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2651 * reply available (yet).
2652 */
2653static int http2_query_read_done(struct http2_session* h2_session,
2654	struct http2_stream* h2_stream)
2655{
2656	log_assert(h2_stream->qbuffer);
2657
2658	if(h2_session->c->h2_stream) {
2659		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2660			"buffer already assigned to stream");
2661		return -1;
2662	}
2663
2664    /* the c->buffer might be used by mesh_send_reply and no be cleard
2665	 * need to be cleared before use */
2666	sldns_buffer_clear(h2_session->c->buffer);
2667	if(sldns_buffer_remaining(h2_session->c->buffer) <
2668		sldns_buffer_remaining(h2_stream->qbuffer)) {
2669		/* qbuffer will be free'd in frame close cb */
2670		sldns_buffer_clear(h2_session->c->buffer);
2671		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2672			"qbuffer in c->buffer");
2673		return -1;
2674	}
2675
2676	sldns_buffer_write(h2_session->c->buffer,
2677		sldns_buffer_current(h2_stream->qbuffer),
2678		sldns_buffer_remaining(h2_stream->qbuffer));
2679
2680	lock_basic_lock(&http2_query_buffer_count_lock);
2681	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2682	lock_basic_unlock(&http2_query_buffer_count_lock);
2683	sldns_buffer_free(h2_stream->qbuffer);
2684	h2_stream->qbuffer = NULL;
2685
2686	sldns_buffer_flip(h2_session->c->buffer);
2687	h2_session->c->h2_stream = h2_stream;
2688	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2689	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2690		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2691		return 1; /* answer in c->buffer */
2692	}
2693	sldns_buffer_clear(h2_session->c->buffer);
2694	h2_session->c->h2_stream = NULL;
2695	return 0; /* mesh state added, or dropped */
2696}
2697
2698/** nghttp2 callback. Used to check if the received frame indicates the end of a
2699 * stream. Gather collected request data and start query handling. */
2700static int http2_req_frame_recv_cb(nghttp2_session* session,
2701	const nghttp2_frame* frame, void* cb_arg)
2702{
2703	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2704	struct http2_stream* h2_stream;
2705	int query_read_done;
2706
2707	if((frame->hd.type != NGHTTP2_DATA &&
2708		frame->hd.type != NGHTTP2_HEADERS) ||
2709		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
2710			return 0;
2711	}
2712
2713	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2714		session, frame->hd.stream_id)))
2715		return 0;
2716
2717	if(h2_stream->invalid_endpoint) {
2718		h2_stream->status = HTTP_STATUS_NOT_FOUND;
2719		goto submit_http_error;
2720	}
2721
2722	if(h2_stream->invalid_content_type) {
2723		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
2724		goto submit_http_error;
2725	}
2726
2727	if(h2_stream->http_method != HTTP_METHOD_GET &&
2728		h2_stream->http_method != HTTP_METHOD_POST) {
2729		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
2730		goto submit_http_error;
2731	}
2732
2733	if(h2_stream->query_too_large) {
2734		if(h2_stream->http_method == HTTP_METHOD_POST)
2735			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
2736		else
2737			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
2738		goto submit_http_error;
2739	}
2740
2741	if(!h2_stream->qbuffer) {
2742		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
2743		goto submit_http_error;
2744	}
2745
2746	if(h2_stream->status) {
2747submit_http_error:
2748		verbose(VERB_QUERY, "http2 request invalid, returning :status="
2749			"%d", h2_stream->status);
2750		if(!http2_submit_error(h2_session, h2_stream)) {
2751			return NGHTTP2_ERR_CALLBACK_FAILURE;
2752		}
2753		return 0;
2754	}
2755	h2_stream->status = HTTP_STATUS_OK;
2756
2757	sldns_buffer_flip(h2_stream->qbuffer);
2758	h2_session->postpone_drop = 1;
2759	query_read_done = http2_query_read_done(h2_session, h2_stream);
2760	if(query_read_done < 0)
2761		return NGHTTP2_ERR_CALLBACK_FAILURE;
2762	else if(!query_read_done) {
2763		if(h2_session->is_drop) {
2764			/* connection needs to be closed. Return failure to make
2765			 * sure no other action are taken anymore on comm point.
2766			 * failure will result in reclaiming (and closing)
2767			 * of comm point. */
2768			verbose(VERB_QUERY, "http2 query dropped in worker cb");
2769			h2_session->postpone_drop = 0;
2770			return NGHTTP2_ERR_CALLBACK_FAILURE;
2771		}
2772		/* nothing to submit right now, query added to mesh. */
2773		h2_session->postpone_drop = 0;
2774		return 0;
2775	}
2776	if(!http2_submit_dns_response(h2_session)) {
2777		sldns_buffer_clear(h2_session->c->buffer);
2778		h2_session->c->h2_stream = NULL;
2779		return NGHTTP2_ERR_CALLBACK_FAILURE;
2780	}
2781	verbose(VERB_QUERY, "http2 query submitted to session");
2782	sldns_buffer_clear(h2_session->c->buffer);
2783	h2_session->c->h2_stream = NULL;
2784	return 0;
2785}
2786
2787/** nghttp2 callback. Used to detect start of new streams. */
2788static int http2_req_begin_headers_cb(nghttp2_session* session,
2789	const nghttp2_frame* frame, void* cb_arg)
2790{
2791	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2792	struct http2_stream* h2_stream;
2793	int ret;
2794	if(frame->hd.type != NGHTTP2_HEADERS ||
2795		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2796		/* only interested in request headers */
2797		return 0;
2798	}
2799	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2800		log_err("malloc failure while creating http2 stream");
2801		return NGHTTP2_ERR_CALLBACK_FAILURE;
2802	}
2803	http2_session_add_stream(h2_session, h2_stream);
2804	ret = nghttp2_session_set_stream_user_data(session,
2805		frame->hd.stream_id, h2_stream);
2806	if(ret) {
2807		/* stream does not exist */
2808		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2809			"error: %s", nghttp2_strerror(ret));
2810		return NGHTTP2_ERR_CALLBACK_FAILURE;
2811	}
2812
2813	return 0;
2814}
2815
2816/**
2817 * base64url decode, store in qbuffer
2818 * @param h2_session: http2 session
2819 * @param h2_stream: http2 stream
2820 * @param start: start of the base64 string
2821 * @param length: length of the base64 string
2822 * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2823 * buffer will be NULL is unparseble.
2824 */
2825static int http2_buffer_uri_query(struct http2_session* h2_session,
2826	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2827{
2828	size_t expectb64len;
2829	int b64len;
2830	if(h2_stream->http_method == HTTP_METHOD_POST)
2831		return 1;
2832	if(length == 0)
2833		return 1;
2834	if(h2_stream->qbuffer) {
2835		verbose(VERB_ALGO, "http2_req_header fail, "
2836			"qbuffer already set");
2837		return 0;
2838	}
2839
2840	/* calculate size, might be a bit bigger than the real
2841	 * decoded buffer size */
2842	expectb64len = sldns_b64_pton_calculate_size(length);
2843	log_assert(expectb64len > 0);
2844	if(expectb64len >
2845		h2_session->c->http2_stream_max_qbuffer_size) {
2846		h2_stream->query_too_large = 1;
2847		return 1;
2848	}
2849
2850	lock_basic_lock(&http2_query_buffer_count_lock);
2851	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2852		lock_basic_unlock(&http2_query_buffer_count_lock);
2853		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2854			"in http2-query-buffer-size");
2855		return http2_submit_rst_stream(h2_session, h2_stream);
2856	}
2857	http2_query_buffer_count += expectb64len;
2858	lock_basic_unlock(&http2_query_buffer_count_lock);
2859	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2860		lock_basic_lock(&http2_query_buffer_count_lock);
2861		http2_query_buffer_count -= expectb64len;
2862		lock_basic_unlock(&http2_query_buffer_count_lock);
2863		log_err("http2_req_header fail, qbuffer "
2864			"malloc failure");
2865		return 0;
2866	}
2867
2868	if(sldns_b64_contains_nonurl((char const*)start, length)) {
2869		char buf[65536+4];
2870		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
2871		/* copy to the scratch buffer temporarily to terminate the
2872		 * string with a zero */
2873		if(length+1 > sizeof(buf)) {
2874			/* too long */
2875			lock_basic_lock(&http2_query_buffer_count_lock);
2876			http2_query_buffer_count -= expectb64len;
2877			lock_basic_unlock(&http2_query_buffer_count_lock);
2878			sldns_buffer_free(h2_stream->qbuffer);
2879			h2_stream->qbuffer = NULL;
2880			return 1;
2881		}
2882		memmove(buf, start, length);
2883		buf[length] = 0;
2884		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
2885			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
2886			lock_basic_lock(&http2_query_buffer_count_lock);
2887			http2_query_buffer_count -= expectb64len;
2888			lock_basic_unlock(&http2_query_buffer_count_lock);
2889			sldns_buffer_free(h2_stream->qbuffer);
2890			h2_stream->qbuffer = NULL;
2891			return 1;
2892		}
2893	} else {
2894		if(!(b64len = sldns_b64url_pton(
2895			(char const *)start, length,
2896			sldns_buffer_current(h2_stream->qbuffer),
2897			expectb64len)) || b64len < 0) {
2898			lock_basic_lock(&http2_query_buffer_count_lock);
2899			http2_query_buffer_count -= expectb64len;
2900			lock_basic_unlock(&http2_query_buffer_count_lock);
2901			sldns_buffer_free(h2_stream->qbuffer);
2902			h2_stream->qbuffer = NULL;
2903			/* return without error, method can be an
2904			 * unknown POST */
2905			return 1;
2906		}
2907	}
2908	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
2909	return 1;
2910}
2911
2912/** nghttp2 callback. Used to parse headers from HEADER frames. */
2913static int http2_req_header_cb(nghttp2_session* session,
2914	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
2915	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
2916	void* cb_arg)
2917{
2918	struct http2_stream* h2_stream = NULL;
2919	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2920	/* nghttp2 deals with CONTINUATION frames and provides them as part of
2921	 * the HEADER */
2922	if(frame->hd.type != NGHTTP2_HEADERS ||
2923		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2924		/* only interested in request headers */
2925		return 0;
2926	}
2927	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
2928		frame->hd.stream_id)))
2929		return 0;
2930
2931	/* earlier checks already indicate we can stop handling this query */
2932	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
2933		h2_stream->invalid_content_type ||
2934		h2_stream->invalid_endpoint)
2935		return 0;
2936
2937
2938	/* nghttp2 performs some sanity checks in the headers, including:
2939	 * name and value are guaranteed to be null terminated
2940	 * name is guaranteed to be lowercase
2941	 * content-length value is guaranteed to contain digits
2942	 */
2943
2944	if(!h2_stream->http_method && namelen == 7 &&
2945		memcmp(":method", name, namelen) == 0) {
2946		/* Case insensitive check on :method value to be on the safe
2947		 * side. I failed to find text about case sensitivity in specs.
2948		 */
2949		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
2950			h2_stream->http_method = HTTP_METHOD_GET;
2951		else if(valuelen == 4 &&
2952			strcasecmp("POST", (const char*)value) == 0) {
2953			h2_stream->http_method = HTTP_METHOD_POST;
2954			if(h2_stream->qbuffer) {
2955				/* POST method uses query from DATA frames */
2956				lock_basic_lock(&http2_query_buffer_count_lock);
2957				http2_query_buffer_count -=
2958					sldns_buffer_capacity(h2_stream->qbuffer);
2959				lock_basic_unlock(&http2_query_buffer_count_lock);
2960				sldns_buffer_free(h2_stream->qbuffer);
2961				h2_stream->qbuffer = NULL;
2962			}
2963		} else
2964			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
2965		return 0;
2966	}
2967	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
2968		/* :path may contain DNS query, depending on method. Method might
2969		 * not be known yet here, so check after finishing receiving
2970		 * stream. */
2971#define	HTTP_QUERY_PARAM "?dns="
2972		size_t el = strlen(h2_session->c->http_endpoint);
2973		size_t qpl = strlen(HTTP_QUERY_PARAM);
2974
2975		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
2976			value, el) != 0) {
2977			h2_stream->invalid_endpoint = 1;
2978			return 0;
2979		}
2980		/* larger than endpoint only allowed if it is for the query
2981		 * parameter */
2982		if(valuelen <= el+qpl ||
2983			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
2984			if(valuelen != el)
2985				h2_stream->invalid_endpoint = 1;
2986			return 0;
2987		}
2988
2989		if(!http2_buffer_uri_query(h2_session, h2_stream,
2990			value+(el+qpl), valuelen-(el+qpl))) {
2991			return NGHTTP2_ERR_CALLBACK_FAILURE;
2992		}
2993		return 0;
2994	}
2995	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
2996	 * and not needed when using GET. Don't enfore.
2997	 * If set only allow lowercase "application/dns-message".
2998	 *
2999	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
3000	 * be able to handle "application/dns-message". Since that is the only
3001	 * content-type supported we can ignore the accept header.
3002	 */
3003	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
3004		if(valuelen != 23 || memcmp("application/dns-message", value,
3005			valuelen) != 0) {
3006			h2_stream->invalid_content_type = 1;
3007		}
3008	}
3009
3010	/* Only interested in content-lentg for POST (on not yet known) method.
3011	 */
3012	if((!h2_stream->http_method ||
3013		h2_stream->http_method == HTTP_METHOD_POST) &&
3014		!h2_stream->content_length && namelen  == 14 &&
3015		memcmp("content-length", name, namelen) == 0) {
3016		if(valuelen > 5) {
3017			h2_stream->query_too_large = 1;
3018			return 0;
3019		}
3020		/* guaranteed to only contain digits and be null terminated */
3021		h2_stream->content_length = atoi((const char*)value);
3022		if(h2_stream->content_length >
3023			h2_session->c->http2_stream_max_qbuffer_size) {
3024			h2_stream->query_too_large = 1;
3025			return 0;
3026		}
3027	}
3028	return 0;
3029}
3030
3031/** nghttp2 callback. Used to get data from DATA frames, which can contain
3032 * queries in POST requests. */
3033static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
3034	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
3035	size_t len, void* cb_arg)
3036{
3037	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3038	struct http2_stream* h2_stream;
3039	size_t qlen = 0;
3040
3041	if(!(h2_stream = nghttp2_session_get_stream_user_data(
3042		h2_session->session, stream_id))) {
3043		return 0;
3044	}
3045
3046	if(h2_stream->query_too_large)
3047		return 0;
3048
3049	if(!h2_stream->qbuffer) {
3050		if(h2_stream->content_length) {
3051			if(h2_stream->content_length < len)
3052				/* getting more data in DATA frame than
3053				 * advertised in content-length header. */
3054				return NGHTTP2_ERR_CALLBACK_FAILURE;
3055			qlen = h2_stream->content_length;
3056		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
3057			/* setting this to msg-buffer-size can result in a lot
3058			 * of memory consuption. Most queries should fit in a
3059			 * single DATA frame, and most POST queries will
3060			 * contain content-length which does not impose this
3061			 * limit. */
3062			qlen = len;
3063		}
3064	}
3065	if(!h2_stream->qbuffer && qlen) {
3066		lock_basic_lock(&http2_query_buffer_count_lock);
3067		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
3068			lock_basic_unlock(&http2_query_buffer_count_lock);
3069			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
3070				"in http2-query-buffer-size");
3071			return http2_submit_rst_stream(h2_session, h2_stream);
3072		}
3073		http2_query_buffer_count += qlen;
3074		lock_basic_unlock(&http2_query_buffer_count_lock);
3075		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
3076			lock_basic_lock(&http2_query_buffer_count_lock);
3077			http2_query_buffer_count -= qlen;
3078			lock_basic_unlock(&http2_query_buffer_count_lock);
3079		}
3080	}
3081
3082	if(!h2_stream->qbuffer ||
3083		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
3084		verbose(VERB_ALGO, "http2 data_chunck_recv failed. Not enough "
3085			"buffer space for POST query. Can happen on multi "
3086			"frame requests without content-length header");
3087		h2_stream->query_too_large = 1;
3088		return 0;
3089	}
3090
3091	sldns_buffer_write(h2_stream->qbuffer, data, len);
3092
3093	return 0;
3094}
3095
3096void http2_req_stream_clear(struct http2_stream* h2_stream)
3097{
3098	if(h2_stream->qbuffer) {
3099		lock_basic_lock(&http2_query_buffer_count_lock);
3100		http2_query_buffer_count -=
3101			sldns_buffer_capacity(h2_stream->qbuffer);
3102		lock_basic_unlock(&http2_query_buffer_count_lock);
3103		sldns_buffer_free(h2_stream->qbuffer);
3104		h2_stream->qbuffer = NULL;
3105	}
3106	if(h2_stream->rbuffer) {
3107		lock_basic_lock(&http2_response_buffer_count_lock);
3108		http2_response_buffer_count -=
3109			sldns_buffer_capacity(h2_stream->rbuffer);
3110		lock_basic_unlock(&http2_response_buffer_count_lock);
3111		sldns_buffer_free(h2_stream->rbuffer);
3112		h2_stream->rbuffer = NULL;
3113	}
3114}
3115
3116nghttp2_session_callbacks* http2_req_callbacks_create(void)
3117{
3118	nghttp2_session_callbacks *callbacks;
3119	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
3120		log_err("failed to initialize nghttp2 callback");
3121		return NULL;
3122	}
3123	/* reception of header block started, used to create h2_stream */
3124	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
3125		http2_req_begin_headers_cb);
3126	/* complete frame received, used to get data from stream if frame
3127	 * has end stream flag, and start processing query */
3128	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
3129		http2_req_frame_recv_cb);
3130	/* get request info from headers */
3131	nghttp2_session_callbacks_set_on_header_callback(callbacks,
3132		http2_req_header_cb);
3133	/* get data from DATA frames, containing POST query */
3134	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
3135		http2_req_data_chunk_recv_cb);
3136
3137	/* generic HTTP2 callbacks */
3138	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
3139	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
3140	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
3141		http2_stream_close_cb);
3142
3143	return callbacks;
3144}
3145#endif /* HAVE_NGHTTP2 */
3146