1/*
2 * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3 *
4 * Copyright (c) 2007, NLnet Labs. All rights reserved.
5 *
6 * This software is open source.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * Redistributions of source code must retain the above copyright notice,
13 * this list of conditions and the following disclaimer.
14 *
15 * Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 *
19 * Neither the name of the NLNET LABS nor the names of its contributors may
20 * be used to endorse or promote products derived from this software without
21 * specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/**
37 * \file
38 *
39 * This file has functions to get queries from clients.
40 */
41#include "config.h"
42#ifdef HAVE_SYS_TYPES_H
43#  include <sys/types.h>
44#endif
45#include <sys/time.h>
46#include <limits.h>
47#ifdef USE_TCP_FASTOPEN
48#include <netinet/tcp.h>
49#endif
50#include <ctype.h>
51#include "services/listen_dnsport.h"
52#include "services/outside_network.h"
53#include "util/netevent.h"
54#include "util/log.h"
55#include "util/config_file.h"
56#include "util/net_help.h"
57#include "sldns/sbuffer.h"
58#include "sldns/parseutil.h"
59#include "services/mesh.h"
60#include "util/fptr_wlist.h"
61#include "util/locks.h"
62
63#ifdef HAVE_NETDB_H
64#include <netdb.h>
65#endif
66#include <fcntl.h>
67
68#ifdef HAVE_SYS_UN_H
69#include <sys/un.h>
70#endif
71
72#ifdef HAVE_SYSTEMD
73#include <systemd/sd-daemon.h>
74#endif
75
76#ifdef HAVE_IFADDRS_H
77#include <ifaddrs.h>
78#endif
79#ifdef HAVE_NET_IF_H
80#include <net/if.h>
81#endif
82#ifdef HAVE_LINUX_NET_TSTAMP_H
83#include <linux/net_tstamp.h>
84#endif
85/** number of queued TCP connections for listen() */
86#define TCP_BACKLOG 256
87
88#ifndef THREADS_DISABLED
89/** lock on the counter of stream buffer memory */
90static lock_basic_type stream_wait_count_lock;
91/** lock on the counter of HTTP2 query buffer memory */
92static lock_basic_type http2_query_buffer_count_lock;
93/** lock on the counter of HTTP2 response buffer memory */
94static lock_basic_type http2_response_buffer_count_lock;
95#endif
96/** size (in bytes) of stream wait buffers */
97static size_t stream_wait_count = 0;
98/** is the lock initialised for stream wait buffers */
99static int stream_wait_lock_inited = 0;
100/** size (in bytes) of HTTP2 query buffers */
101static size_t http2_query_buffer_count = 0;
102/** is the lock initialised for HTTP2 query buffers */
103static int http2_query_buffer_lock_inited = 0;
104/** size (in bytes) of HTTP2 response buffers */
105static size_t http2_response_buffer_count = 0;
106/** is the lock initialised for HTTP2 response buffers */
107static int http2_response_buffer_lock_inited = 0;
108
109/**
110 * Debug print of the getaddrinfo returned address.
111 * @param addr: the address returned.
112 */
113static void
114verbose_print_addr(struct addrinfo *addr)
115{
116	if(verbosity >= VERB_ALGO) {
117		char buf[100];
118		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
119#ifdef INET6
120		if(addr->ai_family == AF_INET6)
121			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
122				sin6_addr;
123#endif /* INET6 */
124		if(inet_ntop(addr->ai_family, sinaddr, buf,
125			(socklen_t)sizeof(buf)) == 0) {
126			(void)strlcpy(buf, "(null)", sizeof(buf));
127		}
128		buf[sizeof(buf)-1] = 0;
129		verbose(VERB_ALGO, "creating %s%s socket %s %d",
130			addr->ai_socktype==SOCK_DGRAM?"udp":
131			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
132			addr->ai_family==AF_INET?"4":
133			addr->ai_family==AF_INET6?"6":
134			"_otherfam", buf,
135			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
136	}
137}
138
139void
140verbose_print_unbound_socket(struct unbound_socket* ub_sock)
141{
142	if(verbosity >= VERB_ALGO) {
143		char buf[256];
144		log_info("listing of unbound_socket structure:");
145		addr_to_str((void*)ub_sock->addr, ub_sock->addrlen, buf,
146			sizeof(buf));
147		log_info("%s s is: %d, fam is: %s, acl: %s", buf, ub_sock->s,
148			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
149			ub_sock->acl?"yes":"no");
150	}
151}
152
#ifdef HAVE_SYSTEMD
/**
 * Look for a socket that systemd passed in through socket activation.
 * The descriptors reported by sd_listen_fds() are walked in order and the
 * first one that sd_is_socket() accepts for the given family, socket type
 * and listening state is returned.  The address is not compared; addr and
 * path are only used in the error message when nothing matches.
 * @param family: address family to look for.
 * @param socktype: socket type to look for.
 * @param listen: listening state handed to sd_is_socket().  The callers
 *	in this file pass 1 for stream sockets and -1 for UDP.
 * @param addr: address printed in the error message, or NULL.
 * @param addrlen: length of addr.
 * @param path: printed in the error message when addr is NULL.
 * @return the file descriptor, or -1 if systemd is not running, the
 *	LISTEN_PID or LISTEN_FDS environment variables are missing, or no
 *	descriptor matches.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		/* sd_is_socket() returns a positive value on a match, 0 on
		 * a mismatch and a negative errno value on failure; only a
		 * positive result may be taken as a match. */
		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype,
			listen) > 0) {
			s = SD_LISTEN_FDS_START + i;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			log_err("systemd sd_listen_fds(): %s", path);
	}
	return s;
}
#endif
212
213int
214create_udp_sock(int family, int socktype, struct sockaddr* addr,
215        socklen_t addrlen, int v6only, int* inuse, int* noproto,
216	int rcv, int snd, int listen, int* reuseport, int transparent,
217	int freebind, int use_systemd, int dscp)
218{
219	int s;
220	char* err;
221#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
222	int on=1;
223#endif
224#ifdef IPV6_MTU
225	int mtu = IPV6_MIN_MTU;
226#endif
227#if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
228	(void)rcv;
229#endif
230#if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
231	(void)snd;
232#endif
233#ifndef IPV6_V6ONLY
234	(void)v6only;
235#endif
236#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
237	(void)transparent;
238#endif
239#if !defined(IP_FREEBIND)
240	(void)freebind;
241#endif
242#ifdef HAVE_SYSTEMD
243	int got_fd_from_systemd = 0;
244
245	if (!use_systemd
246	    || (use_systemd
247		&& (s = systemd_get_activated(family, socktype, -1, addr,
248					      addrlen, NULL)) == -1)) {
249#else
250	(void)use_systemd;
251#endif
252	if((s = socket(family, socktype, 0)) == -1) {
253		*inuse = 0;
254#ifndef USE_WINSOCK
255		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
256			*noproto = 1;
257			return -1;
258		}
259#else
260		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
261			WSAGetLastError() == WSAEPROTONOSUPPORT) {
262			*noproto = 1;
263			return -1;
264		}
265#endif
266		log_err("can't create socket: %s", sock_strerror(errno));
267		*noproto = 0;
268		return -1;
269	}
270#ifdef HAVE_SYSTEMD
271	} else {
272		got_fd_from_systemd = 1;
273	}
274#endif
275	if(listen) {
276#ifdef SO_REUSEADDR
277		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
278			(socklen_t)sizeof(on)) < 0) {
279			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
280				sock_strerror(errno));
281#ifndef USE_WINSOCK
282			if(errno != ENOSYS) {
283				close(s);
284				*noproto = 0;
285				*inuse = 0;
286				return -1;
287			}
288#else
289			closesocket(s);
290			*noproto = 0;
291			*inuse = 0;
292			return -1;
293#endif
294		}
295#endif /* SO_REUSEADDR */
296#ifdef SO_REUSEPORT
297#  ifdef SO_REUSEPORT_LB
298		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
299		 * like SO_REUSEPORT on Linux.  This is what the users want
300		 * with the config option in unbound.conf; if we actually
301		 * need local address and port reuse they'll also need to
302		 * have SO_REUSEPORT set for them, assume it was _LB they want.
303		 */
304		if (reuseport && *reuseport &&
305		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
306			(socklen_t)sizeof(on)) < 0) {
307#ifdef ENOPROTOOPT
308			if(errno != ENOPROTOOPT || verbosity >= 3)
309				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
310					strerror(errno));
311#endif
312			/* this option is not essential, we can continue */
313			*reuseport = 0;
314		}
315#  else /* no SO_REUSEPORT_LB */
316
317		/* try to set SO_REUSEPORT so that incoming
318		 * queries are distributed evenly among the receiving threads.
319		 * Each thread must have its own socket bound to the same port,
320		 * with SO_REUSEPORT set on each socket.
321		 */
322		if (reuseport && *reuseport &&
323		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
324			(socklen_t)sizeof(on)) < 0) {
325#ifdef ENOPROTOOPT
326			if(errno != ENOPROTOOPT || verbosity >= 3)
327				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
328					strerror(errno));
329#endif
330			/* this option is not essential, we can continue */
331			*reuseport = 0;
332		}
333#  endif /* SO_REUSEPORT_LB */
334#else
335		(void)reuseport;
336#endif /* defined(SO_REUSEPORT) */
337#ifdef IP_TRANSPARENT
338		if (transparent &&
339		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
340		    (socklen_t)sizeof(on)) < 0) {
341			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
342			strerror(errno));
343		}
344#elif defined(IP_BINDANY)
345		if (transparent &&
346		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
347		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
348		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
349			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
350			(family==AF_INET6?"V6":""), strerror(errno));
351		}
352#elif defined(SO_BINDANY)
353		if (transparent &&
354		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
355		    (socklen_t)sizeof(on)) < 0) {
356			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
357			strerror(errno));
358		}
359#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
360	}
361#ifdef IP_FREEBIND
362	if(freebind &&
363	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
364	    (socklen_t)sizeof(on)) < 0) {
365		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
366		strerror(errno));
367	}
368#endif /* IP_FREEBIND */
369	if(rcv) {
370#ifdef SO_RCVBUF
371		int got;
372		socklen_t slen = (socklen_t)sizeof(got);
373#  ifdef SO_RCVBUFFORCE
374		/* Linux specific: try to use root permission to override
375		 * system limits on rcvbuf. The limit is stored in
376		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
377		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
378			(socklen_t)sizeof(rcv)) < 0) {
379			if(errno != EPERM) {
380				log_err("setsockopt(..., SO_RCVBUFFORCE, "
381					"...) failed: %s", sock_strerror(errno));
382				sock_close(s);
383				*noproto = 0;
384				*inuse = 0;
385				return -1;
386			}
387#  endif /* SO_RCVBUFFORCE */
388			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
389				(socklen_t)sizeof(rcv)) < 0) {
390				log_err("setsockopt(..., SO_RCVBUF, "
391					"...) failed: %s", sock_strerror(errno));
392				sock_close(s);
393				*noproto = 0;
394				*inuse = 0;
395				return -1;
396			}
397			/* check if we got the right thing or if system
398			 * reduced to some system max.  Warn if so */
399			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
400				&slen) >= 0 && got < rcv/2) {
401				log_warn("so-rcvbuf %u was not granted. "
402					"Got %u. To fix: start with "
403					"root permissions(linux) or sysctl "
404					"bigger net.core.rmem_max(linux) or "
405					"kern.ipc.maxsockbuf(bsd) values.",
406					(unsigned)rcv, (unsigned)got);
407			}
408#  ifdef SO_RCVBUFFORCE
409		}
410#  endif
411#endif /* SO_RCVBUF */
412	}
413	/* first do RCVBUF as the receive buffer is more important */
414	if(snd) {
415#ifdef SO_SNDBUF
416		int got;
417		socklen_t slen = (socklen_t)sizeof(got);
418#  ifdef SO_SNDBUFFORCE
419		/* Linux specific: try to use root permission to override
420		 * system limits on sndbuf. The limit is stored in
421		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
422		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
423			(socklen_t)sizeof(snd)) < 0) {
424			if(errno != EPERM) {
425				log_err("setsockopt(..., SO_SNDBUFFORCE, "
426					"...) failed: %s", sock_strerror(errno));
427				sock_close(s);
428				*noproto = 0;
429				*inuse = 0;
430				return -1;
431			}
432#  endif /* SO_SNDBUFFORCE */
433			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
434				(socklen_t)sizeof(snd)) < 0) {
435				log_err("setsockopt(..., SO_SNDBUF, "
436					"...) failed: %s", sock_strerror(errno));
437				sock_close(s);
438				*noproto = 0;
439				*inuse = 0;
440				return -1;
441			}
442			/* check if we got the right thing or if system
443			 * reduced to some system max.  Warn if so */
444			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
445				&slen) >= 0 && got < snd/2) {
446				log_warn("so-sndbuf %u was not granted. "
447					"Got %u. To fix: start with "
448					"root permissions(linux) or sysctl "
449					"bigger net.core.wmem_max(linux) or "
450					"kern.ipc.maxsockbuf(bsd) values.",
451					(unsigned)snd, (unsigned)got);
452			}
453#  ifdef SO_SNDBUFFORCE
454		}
455#  endif
456#endif /* SO_SNDBUF */
457	}
458	err = set_ip_dscp(s, family, dscp);
459	if(err != NULL)
460		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
461	if(family == AF_INET6) {
462# if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
463		int omit6_set = 0;
464		int action;
465# endif
466# if defined(IPV6_V6ONLY)
467		if(v6only
468#   ifdef HAVE_SYSTEMD
469			/* Systemd wants to control if the socket is v6 only
470			 * or both, with BindIPv6Only=default, ipv6-only or
471			 * both in systemd.socket, so it is not set here. */
472			&& !got_fd_from_systemd
473#   endif
474			) {
475			int val=(v6only==2)?0:1;
476			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
477				(void*)&val, (socklen_t)sizeof(val)) < 0) {
478				log_err("setsockopt(..., IPV6_V6ONLY"
479					", ...) failed: %s", sock_strerror(errno));
480				sock_close(s);
481				*noproto = 0;
482				*inuse = 0;
483				return -1;
484			}
485		}
486# endif
487# if defined(IPV6_USE_MIN_MTU)
488		/*
489		 * There is no fragmentation of IPv6 datagrams
490		 * during forwarding in the network. Therefore
491		 * we do not send UDP datagrams larger than
492		 * the minimum IPv6 MTU of 1280 octets. The
493		 * EDNS0 message length can be larger if the
494		 * network stack supports IPV6_USE_MIN_MTU.
495		 */
496		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
497			(void*)&on, (socklen_t)sizeof(on)) < 0) {
498			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
499				"...) failed: %s", sock_strerror(errno));
500			sock_close(s);
501			*noproto = 0;
502			*inuse = 0;
503			return -1;
504		}
505# elif defined(IPV6_MTU)
506#   ifndef USE_WINSOCK
507		/*
508		 * On Linux, to send no larger than 1280, the PMTUD is
509		 * disabled by default for datagrams anyway, so we set
510		 * the MTU to use.
511		 */
512		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
513			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
514			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
515				sock_strerror(errno));
516			sock_close(s);
517			*noproto = 0;
518			*inuse = 0;
519			return -1;
520		}
521#   elif defined(IPV6_USER_MTU)
522		/* As later versions of the mingw crosscompiler define
523		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
524		 * instead which is writable; IPV6_MTU is readonly there. */
525		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
526			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
527			if (WSAGetLastError() != WSAENOPROTOOPT) {
528				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
529					wsa_strerror(WSAGetLastError()));
530				sock_close(s);
531				*noproto = 0;
532				*inuse = 0;
533				return -1;
534			}
535		}
536#   endif /* USE_WINSOCK */
537# endif /* IPv6 MTU */
538# if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
539#  if defined(IP_PMTUDISC_OMIT)
540		action = IP_PMTUDISC_OMIT;
541		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
542			&action, (socklen_t)sizeof(action)) < 0) {
543
544			if (errno != EINVAL) {
545				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
546					strerror(errno));
547				sock_close(s);
548				*noproto = 0;
549				*inuse = 0;
550				return -1;
551			}
552		}
553		else
554		{
555		    omit6_set = 1;
556		}
557#  endif
558		if (omit6_set == 0) {
559			action = IP_PMTUDISC_DONT;
560			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
561				&action, (socklen_t)sizeof(action)) < 0) {
562				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
563					strerror(errno));
564				sock_close(s);
565				*noproto = 0;
566				*inuse = 0;
567				return -1;
568			}
569		}
570# endif /* IPV6_MTU_DISCOVER */
571	} else if(family == AF_INET) {
572#  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
573/* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
574 * PMTU information is not accepted, but fragmentation is allowed
575 * if and only if the packet size exceeds the outgoing interface MTU
576 * (and also uses the interface mtu to determine the size of the packets).
577 * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
578 * FreeBSD already has same semantics without setting the option. */
579		int omit_set = 0;
580		int action;
581#   if defined(IP_PMTUDISC_OMIT)
582		action = IP_PMTUDISC_OMIT;
583		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
584			&action, (socklen_t)sizeof(action)) < 0) {
585
586			if (errno != EINVAL) {
587				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
588					strerror(errno));
589				sock_close(s);
590				*noproto = 0;
591				*inuse = 0;
592				return -1;
593			}
594		}
595		else
596		{
597		    omit_set = 1;
598		}
599#   endif
600		if (omit_set == 0) {
601   			action = IP_PMTUDISC_DONT;
602			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
603				&action, (socklen_t)sizeof(action)) < 0) {
604				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
605					strerror(errno));
606				sock_close(s);
607				*noproto = 0;
608				*inuse = 0;
609				return -1;
610			}
611		}
612#  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
613		/* the IP_DONTFRAG option if defined in the 11.0 OSX headers,
614		 * but does not work on that version, so we exclude it */
615		/* a nonzero value disables fragmentation, according to
616		 * docs.oracle.com for ip(4). */
617		int off = 1;
618		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
619			&off, (socklen_t)sizeof(off)) < 0) {
620			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
621				strerror(errno));
622			sock_close(s);
623			*noproto = 0;
624			*inuse = 0;
625			return -1;
626		}
627#  endif /* IPv4 MTU */
628	}
629	if(
630#ifdef HAVE_SYSTEMD
631		!got_fd_from_systemd &&
632#endif
633		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
634		*noproto = 0;
635		*inuse = 0;
636#ifndef USE_WINSOCK
637#ifdef EADDRINUSE
638		*inuse = (errno == EADDRINUSE);
639		/* detect freebsd jail with no ipv6 permission */
640		if(family==AF_INET6 && errno==EINVAL)
641			*noproto = 1;
642		else if(errno != EADDRINUSE &&
643			!(errno == EACCES && verbosity < 4 && !listen)
644#ifdef EADDRNOTAVAIL
645			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
646#endif
647			) {
648			log_err_addr("can't bind socket", strerror(errno),
649				(struct sockaddr_storage*)addr, addrlen);
650		}
651#endif /* EADDRINUSE */
652#else /* USE_WINSOCK */
653		if(WSAGetLastError() != WSAEADDRINUSE &&
654			WSAGetLastError() != WSAEADDRNOTAVAIL &&
655			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
656			log_err_addr("can't bind socket",
657				wsa_strerror(WSAGetLastError()),
658				(struct sockaddr_storage*)addr, addrlen);
659		}
660#endif /* USE_WINSOCK */
661		sock_close(s);
662		return -1;
663	}
664	if(!fd_set_nonblock(s)) {
665		*noproto = 0;
666		*inuse = 0;
667		sock_close(s);
668		return -1;
669	}
670	return s;
671}
672
673int
674create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
675	int* reuseport, int transparent, int mss, int nodelay, int freebind,
676	int use_systemd, int dscp)
677{
678	int s;
679	char* err;
680#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
681	int on = 1;
682#endif
683#ifdef HAVE_SYSTEMD
684	int got_fd_from_systemd = 0;
685#endif
686#ifdef USE_TCP_FASTOPEN
687	int qlen;
688#endif
689#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
690	(void)transparent;
691#endif
692#if !defined(IP_FREEBIND)
693	(void)freebind;
694#endif
695	verbose_print_addr(addr);
696	*noproto = 0;
697#ifdef HAVE_SYSTEMD
698	if (!use_systemd ||
699	    (use_systemd
700	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
701					   addr->ai_addr, addr->ai_addrlen,
702					   NULL)) == -1)) {
703#else
704	(void)use_systemd;
705#endif
706	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
707#ifndef USE_WINSOCK
708		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
709			*noproto = 1;
710			return -1;
711		}
712#else
713		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
714			WSAGetLastError() == WSAEPROTONOSUPPORT) {
715			*noproto = 1;
716			return -1;
717		}
718#endif
719		log_err("can't create socket: %s", sock_strerror(errno));
720		return -1;
721	}
722	if(nodelay) {
723#if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
724		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
725			(socklen_t)sizeof(on)) < 0) {
726			#ifndef USE_WINSOCK
727			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
728				strerror(errno));
729			#else
730			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
731				wsa_strerror(WSAGetLastError()));
732			#endif
733		}
734#else
735		log_warn(" setsockopt(TCP_NODELAY) unsupported");
736#endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
737	}
738	if (mss > 0) {
739#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
740		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
741			(socklen_t)sizeof(mss)) < 0) {
742			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
743				sock_strerror(errno));
744		} else {
745			verbose(VERB_ALGO,
746				" tcp socket mss set to %d", mss);
747		}
748#else
749		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
750#endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
751	}
752#ifdef HAVE_SYSTEMD
753	} else {
754		got_fd_from_systemd = 1;
755    }
756#endif
757#ifdef SO_REUSEADDR
758	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
759		(socklen_t)sizeof(on)) < 0) {
760		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
761			sock_strerror(errno));
762		sock_close(s);
763		return -1;
764	}
765#endif /* SO_REUSEADDR */
766#ifdef IP_FREEBIND
767	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
768	    (socklen_t)sizeof(on)) < 0) {
769		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
770		strerror(errno));
771	}
772#endif /* IP_FREEBIND */
773#ifdef SO_REUSEPORT
774	/* try to set SO_REUSEPORT so that incoming
775	 * connections are distributed evenly among the receiving threads.
776	 * Each thread must have its own socket bound to the same port,
777	 * with SO_REUSEPORT set on each socket.
778	 */
779	if (reuseport && *reuseport &&
780		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
781		(socklen_t)sizeof(on)) < 0) {
782#ifdef ENOPROTOOPT
783		if(errno != ENOPROTOOPT || verbosity >= 3)
784			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
785				strerror(errno));
786#endif
787		/* this option is not essential, we can continue */
788		*reuseport = 0;
789	}
790#else
791	(void)reuseport;
792#endif /* defined(SO_REUSEPORT) */
793#if defined(IPV6_V6ONLY)
794	if(addr->ai_family == AF_INET6 && v6only
795#  ifdef HAVE_SYSTEMD
796		/* Systemd wants to control if the socket is v6 only
797		 * or both, with BindIPv6Only=default, ipv6-only or
798		 * both in systemd.socket, so it is not set here. */
799		&& !got_fd_from_systemd
800#  endif
801		) {
802		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
803			(void*)&on, (socklen_t)sizeof(on)) < 0) {
804			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
805				sock_strerror(errno));
806			sock_close(s);
807			return -1;
808		}
809	}
810#else
811	(void)v6only;
812#endif /* IPV6_V6ONLY */
813#ifdef IP_TRANSPARENT
814	if (transparent &&
815	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
816	    (socklen_t)sizeof(on)) < 0) {
817		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
818			strerror(errno));
819	}
820#elif defined(IP_BINDANY)
821	if (transparent &&
822	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
823	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
824	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
825		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
826		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
827	}
828#elif defined(SO_BINDANY)
829	if (transparent &&
830	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
831	    sizeof(on)) < 0) {
832		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
833		strerror(errno));
834	}
835#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
836	err = set_ip_dscp(s, addr->ai_family, dscp);
837	if(err != NULL)
838		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
839	if(
840#ifdef HAVE_SYSTEMD
841		!got_fd_from_systemd &&
842#endif
843        bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
844#ifndef USE_WINSOCK
845		/* detect freebsd jail with no ipv6 permission */
846		if(addr->ai_family==AF_INET6 && errno==EINVAL)
847			*noproto = 1;
848		else {
849			log_err_addr("can't bind socket", strerror(errno),
850				(struct sockaddr_storage*)addr->ai_addr,
851				addr->ai_addrlen);
852		}
853#else
854		log_err_addr("can't bind socket",
855			wsa_strerror(WSAGetLastError()),
856			(struct sockaddr_storage*)addr->ai_addr,
857			addr->ai_addrlen);
858#endif
859		sock_close(s);
860		return -1;
861	}
862	if(!fd_set_nonblock(s)) {
863		sock_close(s);
864		return -1;
865	}
866	if(listen(s, TCP_BACKLOG) == -1) {
867		log_err("can't listen: %s", sock_strerror(errno));
868		sock_close(s);
869		return -1;
870	}
871#ifdef USE_TCP_FASTOPEN
872	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
873	   against IP spoofing attacks as suggested in RFC7413 */
874#ifdef __APPLE__
875	/* OS X implementation only supports qlen of 1 via this call. Actual
876	   value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
877	qlen = 1;
878#else
879	/* 5 is recommended on linux */
880	qlen = 5;
881#endif
882	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
883		  sizeof(qlen))) == -1 ) {
884#ifdef ENOPROTOOPT
885		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
886		   disabled, except when verbosity enabled for debugging */
887		if(errno != ENOPROTOOPT || verbosity >= 3) {
888#endif
889		  if(errno == EPERM) {
890		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
891		  } else {
892		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
893		  }
894#ifdef ENOPROTOOPT
895		}
896#endif
897	}
898#endif
899	return s;
900}
901
/**
 * Set the DiffServ codepoint on a socket.
 * The codepoint is shifted left by two bits and written with IPV6_TCLASS
 * for AF_INET6 sockets and with IP_TOS for every other address family.
 * @param socket: the socket to change.
 * @param addrfamily: address family of the socket.
 * @param dscp: the codepoint; 0 means nothing is changed.
 * @return NULL on success (or when dscp is 0), otherwise an error string.
 */
char*
set_ip_dscp(int socket, int addrfamily, int dscp)
{
	int tos;

	if(dscp == 0)
		return NULL;
	tos = dscp << 2;

	if(addrfamily == AF_INET6) {
#ifdef IPV6_TCLASS
		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&tos,
			sizeof(tos)) < 0)
			return sock_strerror(errno);
		return NULL;
#else
		return "IPV6_TCLASS not defined on this system";
#endif
	}

	if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&tos, sizeof(tos)) < 0)
		return sock_strerror(errno);
	return NULL;
}
927
/**
 * Create a listening unix domain (AF_LOCAL) stream socket.
 * With systemd support compiled in and use_systemd set, a socket passed
 * in by systemd is returned when one is found.  Otherwise an existing
 * file at path is removed, and a new socket is bound there, made
 * nonblocking and put in listening state with TCP_BACKLOG.
 * @param path: filesystem path for the socket.  It is copied with
 *	strlcpy, so a path longer than sun_path is truncated for bind().
 * @param noproto: set to 1 if local sockets are not supported by this
 *	build; otherwise left untouched.
 * @param use_systemd: if nonzero, try systemd socket activation first.
 * @return the socket, or -1 on error.
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif

	/* Each preprocessor section below is brace-balanced on its own, so
	 * every combination of HAVE_SYSTEMD and HAVE_SYS_UN_H compiles. */
#ifdef HAVE_SYSTEMD
	if (use_systemd) {
		int ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1,
			NULL, 0, path);
		if (ret != -1)
			return ret;
	}
#else
	(void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
	verbose(VERB_ALGO, "creating unix socket %s", path);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD */
	(void)strlcpy(usock.sun_path, path, sizeof(usock.sun_path));

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	/* remove a leftover socket file; a missing file is fine */
	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	(void)noproto; /*unused*/
	return s;

err:
	sock_close(s);
	return -1;
#else
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
}
1002
1003
/**
 * Create socket from getaddrinfo results.
 * Resolves ifname and port with getaddrinfo, creates a UDP socket
 * (stype SOCK_DGRAM) or a TCP accept socket (any other stype) for the
 * first result, and records the address in ub_sock.
 * @param stype: socket type; stored in hints->ai_socktype.
 * @param ifname: interface or node name for getaddrinfo, may be NULL.
 * @param port: port string for getaddrinfo.
 * @param hints: getaddrinfo hints; ai_socktype is overwritten.
 * @param v6only: passed on to the socket creation routine.
 * @param noip6: set to 1 if creation failed because the protocol is not
 *	available and hints->ai_family is AF_INET6; otherwise 0.
 * @param rcv: receive buffer size for UDP sockets.
 * @param snd: send buffer size for UDP sockets.
 * @param reuseport: passed on to the socket creation routine.
 * @param transparent: passed on to the socket creation routine.
 * @param tcp_mss: maximum segment size, for TCP sockets.
 * @param nodelay: TCP_NODELAY setting, for TCP sockets.
 * @param freebind: passed on to the socket creation routine.
 * @param use_systemd: passed on to the socket creation routine.
 * @param dscp: passed on to the socket creation routine.
 * @param ub_sock: filled with a copy of the address, its length, the
 *	socket and the address family; acl is set to NULL.  The fields
 *	are filled in even if socket creation returned -1.
 * @return the socket, or -1 on error.
 */
static int
make_sock(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	struct addrinfo *res = NULL;
	int r, s;
	int inuse = 0, noproto = 0;
	hints->ai_socktype = stype;
	*noip6 = 0;
	if((r=getaddrinfo(ifname, port, hints, &res)) != 0 || !res) {
#ifdef USE_WINSOCK
		if(r == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
		log_err("node %s:%s getaddrinfo: %s %s",
			ifname?ifname:"default", port, gai_strerror(r),
#ifdef EAI_SYSTEM
			(r==EAI_SYSTEM?(char*)strerror(errno):"")
#else
			""
#endif
		);
		return -1;
	}
	/* The address is dereferenced by the debug print and by the socket
	 * creation below, so reject a missing address before any of that
	 * happens; no socket exists yet at this point. */
	if(!res->ai_addr) {
		log_err("getaddrinfo returned no address");
		freeaddrinfo(res);
		return -1;
	}
	if(stype == SOCK_DGRAM) {
		verbose_print_addr(res);
		s = create_udp_sock(res->ai_family, res->ai_socktype,
			(struct sockaddr*)res->ai_addr, res->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd, dscp);
		if(s == -1 && inuse) {
			log_err("bind: address already in use");
		} else if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	} else	{
		s = create_tcp_accept_sock(res, v6only, &noproto, reuseport,
			transparent, tcp_mss, nodelay, freebind, use_systemd,
			dscp);
		if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	}

	ub_sock->addr = memdup(res->ai_addr, res->ai_addrlen);
	ub_sock->addrlen = res->ai_addrlen;
	if(!ub_sock->addr) {
		log_err("out of memory: allocate listening address");
		freeaddrinfo(res);
		/* socket creation may already have failed */
		if(s != -1)
			sock_close(s);
		return -1;
	}
	freeaddrinfo(res);

	ub_sock->s = s;
	ub_sock->fam = hints->ai_family;
	ub_sock->acl = NULL;

	return s;
}
1076
/**
 * Make a socket; an "@port" suffix in ifname overrides the port argument.
 * All other arguments are handed to make_sock unchanged.
 * @return the socket fd, or -1 on failure (noip6 is set to 0 when the
 *	interface or port part does not fit the local buffers).
 */
static int
make_sock_port(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	char newif[128];
	char p[16];
	size_t iflen, portlen;
	const char* at = strchr(ifname, '@');

	if(!at) {
		/* plain interface spec, use the port that was passed in */
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock);
	}

	/* ifspec@port: split into the interface part and the port part */
	iflen = (size_t)(at - ifname);
	portlen = strlen(at+1);
	if(iflen >= sizeof(newif)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	if(portlen >= sizeof(p)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	(void)strlcpy(newif, ifname, sizeof(newif));
	newif[iflen] = 0;
	(void)strlcpy(p, at+1, sizeof(p));
	p[portlen] = 0;
	return make_sock(stype, newif, p, hints, v6only, noip6, rcv, snd,
		reuseport, transparent, tcp_mss, nodelay, freebind,
		use_systemd, dscp, ub_sock);
}
1111
1112/**
1113 * Add port to open ports list.
1114 * @param list: list head. changed.
1115 * @param s: fd.
1116 * @param ftype: if fd is UDP.
1117 * @param pp2_enabled: if PROXYv2 is enabled for this port.
1118 * @param ub_sock: socket with address.
1119 * @return false on failure. list in unchanged then.
1120 */
1121static int
1122port_insert(struct listen_port** list, int s, enum listen_type ftype,
1123	int pp2_enabled, struct unbound_socket* ub_sock)
1124{
1125	struct listen_port* item = (struct listen_port*)malloc(
1126		sizeof(struct listen_port));
1127	if(!item)
1128		return 0;
1129	item->next = *list;
1130	item->fd = s;
1131	item->ftype = ftype;
1132	item->pp2_enabled = pp2_enabled;
1133	item->socket = ub_sock;
1134	*list = item;
1135	return 1;
1136}
1137
/**
 * Set fd to receive software timestamps.
 * @param s: socket fd.
 * @return true if SO_TIMESTAMPNS was set, false (with an error logged) if
 *	setsockopt failed or the platform lacks linux/net_tstamp.h.
 */
static int
set_recvtimestamp(int s)
{
#ifndef HAVE_LINUX_NET_TSTAMP_H
	(void)s;
	log_err("packets timestamping is not supported on this platform");
	return 0;
#else
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	if (setsockopt(s, SOL_SOCKET, SO_TIMESTAMPNS, (void*)&flags,
		(socklen_t)sizeof(flags)) >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMPNS, ...) failed: %s",
		strerror(errno));
	return 0;
#endif
}
1156
/**
 * Set fd to receive source address packet info.
 * The socket option that is used depends on what the platform defines.
 * @param s: socket fd.
 * @param family: AF_INET6 or AF_INET; for other values nothing is set.
 * @return false (with an error logged) if the option could not be set or
 *	no suitable option exists for the family, true otherwise.
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int enable = 1;
#else
	(void)s;
#endif
	switch(family) {
	case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#endif /* defined IPV6_RECVPKTINFO */

	case AF_INET:
#if defined(IP_PKTINFO)
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#endif /* IP_PKTINFO */

	default:
		/* other address families need no option */
		break;
	}
	return 1;
}
1211
1212/** see if interface is ssl, its port number == the ssl port number */
1213static int
1214if_is_ssl(const char* ifname, const char* port, int ssl_port,
1215	struct config_strlist* tls_additional_port)
1216{
1217	struct config_strlist* s;
1218	char* p = strchr(ifname, '@');
1219	if(!p && atoi(port) == ssl_port)
1220		return 1;
1221	if(p && atoi(p+1) == ssl_port)
1222		return 1;
1223	for(s = tls_additional_port; s; s = s->next) {
1224		if(p && atoi(p+1) == atoi(s->str))
1225			return 1;
1226		if(!p && atoi(port) == atoi(s->str))
1227			return 1;
1228	}
1229	return 0;
1230}
1231
1232/**
1233 * Helper for ports_open. Creates one interface (or NULL for default).
1234 * @param ifname: The interface ip address.
1235 * @param do_auto: use automatic interface detection.
1236 * 	If enabled, then ifname must be the wildcard name.
1237 * @param do_udp: if udp should be used.
1238 * @param do_tcp: if tcp should be used.
1239 * @param hints: for getaddrinfo. family and flags have to be set by caller.
1240 * @param port: Port number to use (as string).
1241 * @param list: list of open ports, appended to, changed to point to list head.
1242 * @param rcv: receive buffer size for UDP
1243 * @param snd: send buffer size for UDP
1244 * @param ssl_port: ssl service port number
1245 * @param tls_additional_port: list of additional ssl service port numbers.
1246 * @param https_port: DoH service port number
1247 * @param proxy_protocol_port: list of PROXYv2 port numbers.
1248 * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1249 * 	set to false on exit if reuseport failed due to no kernel support.
1250 * @param transparent: set IP_TRANSPARENT socket option.
1251 * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1252 * @param freebind: set IP_FREEBIND socket option.
1253 * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1254 * @param use_systemd: if true, fetch sockets from systemd.
1255 * @param dnscrypt_port: dnscrypt service port number
1256 * @param dscp: DSCP to use.
1257 * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to
1258 * 	wait to discard if UDP packets have waited for long in the socket
1259 * 	buffer.
1260 * @return: returns false on error.
1261 */
1262static int
1263ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
1264	struct addrinfo *hints, const char* port, struct listen_port** list,
1265	size_t rcv, size_t snd, int ssl_port,
1266	struct config_strlist* tls_additional_port, int https_port,
1267	struct config_strlist* proxy_protocol_port,
1268	int* reuseport, int transparent, int tcp_mss, int freebind,
1269	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp,
1270	int sock_queue_timeout)
1271{
1272	int s, noip6=0;
1273	int is_https = if_is_https(ifname, port, https_port);
1274	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
1275	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
1276	int nodelay = is_https && http2_nodelay;
1277	struct unbound_socket* ub_sock;
1278
1279	if(!do_udp && !do_tcp)
1280		return 0;
1281
1282	if(is_pp2) {
1283		if(is_dnscrypt) {
1284			fatal_exit("PROXYv2 and DNSCrypt combination not "
1285				"supported!");
1286		} else if(is_https) {
1287			fatal_exit("PROXYv2 and DoH combination not "
1288				"supported!");
1289		}
1290	}
1291
1292	if(do_auto) {
1293		ub_sock = calloc(1, sizeof(struct unbound_socket));
1294		if(!ub_sock)
1295			return 0;
1296		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1297			&noip6, rcv, snd, reuseport, transparent,
1298			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1299			free(ub_sock->addr);
1300			free(ub_sock);
1301			if(noip6) {
1302				log_warn("IPv6 protocol not available");
1303				return 1;
1304			}
1305			return 0;
1306		}
1307		/* getting source addr packet info is highly non-portable */
1308		if(!set_recvpktinfo(s, hints->ai_family)) {
1309			sock_close(s);
1310			free(ub_sock->addr);
1311			free(ub_sock);
1312			return 0;
1313		}
1314		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1315			log_warn("socket timestamping is not available");
1316		}
1317		if(!port_insert(list, s, is_dnscrypt
1318			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
1319			is_pp2, ub_sock)) {
1320			sock_close(s);
1321			free(ub_sock->addr);
1322			free(ub_sock);
1323			return 0;
1324		}
1325	} else if(do_udp) {
1326		ub_sock = calloc(1, sizeof(struct unbound_socket));
1327		if(!ub_sock)
1328			return 0;
1329		/* regular udp socket */
1330		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1331			&noip6, rcv, snd, reuseport, transparent,
1332			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1333			free(ub_sock->addr);
1334			free(ub_sock);
1335			if(noip6) {
1336				log_warn("IPv6 protocol not available");
1337				return 1;
1338			}
1339			return 0;
1340		}
1341		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1342			log_warn("socket timestamping is not available");
1343		}
1344		if(!port_insert(list, s, is_dnscrypt
1345			?listen_type_udp_dnscrypt :
1346			(sock_queue_timeout ?
1347				listen_type_udpancil:listen_type_udp),
1348			is_pp2, ub_sock)) {
1349			sock_close(s);
1350			free(ub_sock->addr);
1351			free(ub_sock);
1352			return 0;
1353		}
1354	}
1355	if(do_tcp) {
1356		int is_ssl = if_is_ssl(ifname, port, ssl_port,
1357			tls_additional_port);
1358		enum listen_type port_type;
1359		ub_sock = calloc(1, sizeof(struct unbound_socket));
1360		if(!ub_sock)
1361			return 0;
1362		if(is_ssl)
1363			port_type = listen_type_ssl;
1364		else if(is_https)
1365			port_type = listen_type_http;
1366		else if(is_dnscrypt)
1367			port_type = listen_type_tcp_dnscrypt;
1368		else
1369			port_type = listen_type_tcp;
1370		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
1371			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
1372			freebind, use_systemd, dscp, ub_sock)) == -1) {
1373			free(ub_sock->addr);
1374			free(ub_sock);
1375			if(noip6) {
1376				/*log_warn("IPv6 protocol not available");*/
1377				return 1;
1378			}
1379			return 0;
1380		}
1381		if(is_ssl)
1382			verbose(VERB_ALGO, "setup TCP for SSL service");
1383		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
1384			sock_close(s);
1385			free(ub_sock->addr);
1386			free(ub_sock);
1387			return 0;
1388		}
1389	}
1390	return 1;
1391}
1392
1393/**
1394 * Add items to commpoint list in front.
1395 * @param c: commpoint to add.
1396 * @param front: listen struct.
1397 * @return: false on failure.
1398 */
1399static int
1400listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1401{
1402	struct listen_list* item = (struct listen_list*)malloc(
1403		sizeof(struct listen_list));
1404	if(!item)
1405		return 0;
1406	item->com = c;
1407	item->next = front->cps;
1408	front->cps = item;
1409	return 1;
1410}
1411
1412void listen_setup_locks(void)
1413{
1414	if(!stream_wait_lock_inited) {
1415		lock_basic_init(&stream_wait_count_lock);
1416		stream_wait_lock_inited = 1;
1417	}
1418	if(!http2_query_buffer_lock_inited) {
1419		lock_basic_init(&http2_query_buffer_count_lock);
1420		http2_query_buffer_lock_inited = 1;
1421	}
1422	if(!http2_response_buffer_lock_inited) {
1423		lock_basic_init(&http2_response_buffer_count_lock);
1424		http2_response_buffer_lock_inited = 1;
1425	}
1426}
1427
1428void listen_desetup_locks(void)
1429{
1430	if(stream_wait_lock_inited) {
1431		stream_wait_lock_inited = 0;
1432		lock_basic_destroy(&stream_wait_count_lock);
1433	}
1434	if(http2_query_buffer_lock_inited) {
1435		http2_query_buffer_lock_inited = 0;
1436		lock_basic_destroy(&http2_query_buffer_count_lock);
1437	}
1438	if(http2_response_buffer_lock_inited) {
1439		http2_response_buffer_lock_inited = 0;
1440		lock_basic_destroy(&http2_response_buffer_count_lock);
1441	}
1442}
1443
1444struct listen_dnsport*
1445listen_create(struct comm_base* base, struct listen_port* ports,
1446	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
1447	int harden_large_queries, uint32_t http_max_streams,
1448	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
1449	void* sslctx, struct dt_env* dtenv, comm_point_callback_type* cb,
1450	void *cb_arg)
1451{
1452	struct listen_dnsport* front = (struct listen_dnsport*)
1453		malloc(sizeof(struct listen_dnsport));
1454	if(!front)
1455		return NULL;
1456	front->cps = NULL;
1457	front->udp_buff = sldns_buffer_new(bufsize);
1458#ifdef USE_DNSCRYPT
1459	front->dnscrypt_udp_buff = NULL;
1460#endif
1461	if(!front->udp_buff) {
1462		free(front);
1463		return NULL;
1464	}
1465
1466	/* create comm points as needed */
1467	while(ports) {
1468		struct comm_point* cp = NULL;
1469		if(ports->ftype == listen_type_udp ||
1470		   ports->ftype == listen_type_udp_dnscrypt) {
1471			cp = comm_point_create_udp(base, ports->fd,
1472				front->udp_buff, ports->pp2_enabled, cb,
1473				cb_arg, ports->socket);
1474		} else if(ports->ftype == listen_type_tcp ||
1475				ports->ftype == listen_type_tcp_dnscrypt) {
1476			cp = comm_point_create_tcp(base, ports->fd,
1477				tcp_accept_count, tcp_idle_timeout,
1478				harden_large_queries, 0, NULL,
1479				tcp_conn_limit, bufsize, front->udp_buff,
1480				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1481				ports->socket);
1482		} else if(ports->ftype == listen_type_ssl ||
1483			ports->ftype == listen_type_http) {
1484			cp = comm_point_create_tcp(base, ports->fd,
1485				tcp_accept_count, tcp_idle_timeout,
1486				harden_large_queries,
1487				http_max_streams, http_endpoint,
1488				tcp_conn_limit, bufsize, front->udp_buff,
1489				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1490				ports->socket);
1491			if(ports->ftype == listen_type_http) {
1492				if(!sslctx && !http_notls) {
1493					log_warn("HTTPS port configured, but "
1494						"no TLS tls-service-key or "
1495						"tls-service-pem set");
1496				}
1497#ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
1498				if(!http_notls) {
1499					log_warn("Unbound is not compiled "
1500						"with an OpenSSL version "
1501						"supporting ALPN "
1502						"(OpenSSL >= 1.0.2). This "
1503						"is required to use "
1504						"DNS-over-HTTPS");
1505				}
1506#endif
1507#ifndef HAVE_NGHTTP2_NGHTTP2_H
1508				log_warn("Unbound is not compiled with "
1509					"nghttp2. This is required to use "
1510					"DNS-over-HTTPS.");
1511#endif
1512			}
1513		} else if(ports->ftype == listen_type_udpancil ||
1514				  ports->ftype == listen_type_udpancil_dnscrypt) {
1515#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
1516			cp = comm_point_create_udp_ancil(base, ports->fd,
1517				front->udp_buff, ports->pp2_enabled, cb,
1518				cb_arg, ports->socket);
1519#else
1520			log_warn("This system does not support UDP ancilliary data.");
1521#endif
1522		}
1523		if(!cp) {
1524			log_err("can't create commpoint");
1525			listen_delete(front);
1526			return NULL;
1527		}
1528		if((http_notls && ports->ftype == listen_type_http) ||
1529			(ports->ftype == listen_type_tcp) ||
1530			(ports->ftype == listen_type_udp) ||
1531			(ports->ftype == listen_type_udpancil) ||
1532			(ports->ftype == listen_type_tcp_dnscrypt) ||
1533			(ports->ftype == listen_type_udp_dnscrypt) ||
1534			(ports->ftype == listen_type_udpancil_dnscrypt))
1535			cp->ssl = NULL;
1536		else
1537			cp->ssl = sslctx;
1538		cp->dtenv = dtenv;
1539		cp->do_not_close = 1;
1540#ifdef USE_DNSCRYPT
1541		if (ports->ftype == listen_type_udp_dnscrypt ||
1542			ports->ftype == listen_type_tcp_dnscrypt ||
1543			ports->ftype == listen_type_udpancil_dnscrypt) {
1544			cp->dnscrypt = 1;
1545			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
1546			if(!cp->dnscrypt_buffer) {
1547				log_err("can't alloc dnscrypt_buffer");
1548				comm_point_delete(cp);
1549				listen_delete(front);
1550				return NULL;
1551			}
1552			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
1553		}
1554#endif
1555		if(!listen_cp_insert(cp, front)) {
1556			log_err("malloc failed");
1557			comm_point_delete(cp);
1558			listen_delete(front);
1559			return NULL;
1560		}
1561		ports = ports->next;
1562	}
1563	if(!front->cps) {
1564		log_err("Could not open sockets to accept queries.");
1565		listen_delete(front);
1566		return NULL;
1567	}
1568
1569	return front;
1570}
1571
1572void
1573listen_list_delete(struct listen_list* list)
1574{
1575	struct listen_list *p = list, *pn;
1576	while(p) {
1577		pn = p->next;
1578		comm_point_delete(p->com);
1579		free(p);
1580		p = pn;
1581	}
1582}
1583
1584void
1585listen_delete(struct listen_dnsport* front)
1586{
1587	if(!front)
1588		return;
1589	listen_list_delete(front->cps);
1590#ifdef USE_DNSCRYPT
1591	if(front->dnscrypt_udp_buff &&
1592		front->udp_buff != front->dnscrypt_udp_buff) {
1593		sldns_buffer_free(front->dnscrypt_udp_buff);
1594	}
1595#endif
1596	sldns_buffer_free(front->udp_buff);
1597	free(front);
1598}
1599
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of a string to the array of address strings.
 * @param ip_addresses: the array, grown with realloc.
 * @param ip_addresses_size: number of items, incremented on success.
 * @param str: string to copy.
 * @return false on allocation failure (logged).
 */
static int
ifa_list_append(char ***ip_addresses, int *ip_addresses_size, const char *str)
{
	char **grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	grown[*ip_addresses_size] = strdup(str);
	if(!grown[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Resolve an interface name to the IPv4 (and with INET6, IPv6) addresses
 * of that interface and append them, as strings, to the address array.
 * If the name has an "@port" suffix, it is appended to every address.
 * If no address is found, the name itself is appended unchanged.
 * @param ifas: list from getifaddrs.
 * @param search_ifa: interface name, optionally with "@port".
 * @param ip_addresses: array of strings that is appended to.
 * @param ip_addresses_size: number of items in the array, updated.
 * @return false on failure (inet_ntop or allocation failure).
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	const int count_on_entry = *ip_addresses_size;
	/* the last '@' starts the port part, it is kept on the addresses */
	const char *at = strrchr(search_ifa, '@');
	const char *suffix = at ? at : "";
	size_t namelen = at ? (size_t)(at - search_ifa) : 0;
	struct ifaddrs *ifa;

	for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif
		sa_family_t family;

		/* the interface name must match the part before the '@' */
		if(at) {
			if(strlen(ifa->ifa_name) != namelen ||
				strncmp(ifa->ifa_name, search_ifa,
				namelen) != 0)
				continue;
		} else if(strcmp(ifa->ifa_name, search_ifa) != 0) {
			continue;
		}

		if(ifa->ifa_addr == NULL)
			continue;

		family = ifa->ifa_addr->sa_family;
		if(family == AF_INET) {
			char a4[INET_ADDRSTRLEN + 1];
			struct sockaddr_in *in4 = (struct sockaddr_in *)
				ifa->ifa_addr;
			if(!inet_ntop(family, &in4->sin_addr, a4, sizeof(a4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				a4, suffix);
		}
#ifdef INET6
		else if(family == AF_INET6) {
			char a6[INET6_ADDRSTRLEN + 1];
			char if_index_name[IF_NAMESIZE + 1];
			struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
				ifa->ifa_addr;
			if_index_name[0] = 0;
			if(!inet_ntop(family, &in6->sin6_addr, a6, sizeof(a6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			/* stays empty if the scope id has no interface name */
			(void)if_indextoname(in6->sin6_scope_id,
				(char *)if_index_name);
			if(if_index_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", a6, if_index_name, suffix);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					a6, suffix);
			}
		}
#endif
		else {
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!ifa_list_append(ip_addresses, ip_addresses_size, addr_buf))
			return 0;
	}

	if(*ip_addresses_size == count_on_entry) {
		/* nothing was added for this name, keep the name as given */
		return ifa_list_append(ip_addresses, ip_addresses_size,
			search_ifa);
	}
	return 1;
}
#endif /* HAVE_GETIFADDRS */
1703
1704int resolve_interface_names(char** ifs, int num_ifs,
1705	struct config_strlist* list, char*** resif, int* num_resif)
1706{
1707#ifdef HAVE_GETIFADDRS
1708	struct ifaddrs *addrs = NULL;
1709	if(num_ifs == 0 && list == NULL) {
1710		*resif = NULL;
1711		*num_resif = 0;
1712		return 1;
1713	}
1714	if(getifaddrs(&addrs) == -1) {
1715		log_err("failed to list interfaces: getifaddrs: %s",
1716			strerror(errno));
1717		freeifaddrs(addrs);
1718		return 0;
1719	}
1720	if(ifs) {
1721		int i;
1722		for(i=0; i<num_ifs; i++) {
1723			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
1724				freeifaddrs(addrs);
1725				config_del_strarray(*resif, *num_resif);
1726				*resif = NULL;
1727				*num_resif = 0;
1728				return 0;
1729			}
1730		}
1731	}
1732	if(list) {
1733		struct config_strlist* p;
1734		for(p = list; p; p = p->next) {
1735			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
1736				freeifaddrs(addrs);
1737				config_del_strarray(*resif, *num_resif);
1738				*resif = NULL;
1739				*num_resif = 0;
1740				return 0;
1741			}
1742}
1743	}
1744	freeifaddrs(addrs);
1745	return 1;
1746#else
1747	struct config_strlist* p;
1748	if(num_ifs == 0 && list == NULL) {
1749		*resif = NULL;
1750		*num_resif = 0;
1751		return 1;
1752	}
1753	*num_resif = num_ifs;
1754	for(p = list; p; p = p->next) {
1755		(*num_resif)++;
1756	}
1757	*resif = calloc(*num_resif, sizeof(**resif));
1758	if(!*resif) {
1759		log_err("out of memory");
1760		return 0;
1761	}
1762	if(ifs) {
1763		int i;
1764		for(i=0; i<num_ifs; i++) {
1765			(*resif)[i] = strdup(ifs[i]);
1766			if(!((*resif)[i])) {
1767				log_err("out of memory");
1768				config_del_strarray(*resif, *num_resif);
1769				*resif = NULL;
1770				*num_resif = 0;
1771				return 0;
1772			}
1773		}
1774	}
1775	if(list) {
1776		int idx = num_ifs;
1777		for(p = list; p; p = p->next) {
1778			(*resif)[idx] = strdup(p->str);
1779			if(!((*resif)[idx])) {
1780				log_err("out of memory");
1781				config_del_strarray(*resif, *num_resif);
1782				*resif = NULL;
1783				*num_resif = 0;
1784				return 0;
1785			}
1786			idx++;
1787		}
1788	}
1789	return 1;
1790#endif /* HAVE_GETIFADDRS */
1791}
1792
1793struct listen_port*
1794listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
1795	int* reuseport)
1796{
1797	struct listen_port* list = NULL;
1798	struct addrinfo hints;
1799	int i, do_ip4, do_ip6;
1800	int do_tcp, do_auto;
1801	char portbuf[32];
1802	snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1803	do_ip4 = cfg->do_ip4;
1804	do_ip6 = cfg->do_ip6;
1805	do_tcp = cfg->do_tcp;
1806	do_auto = cfg->if_automatic && cfg->do_udp;
1807	if(cfg->incoming_num_tcp == 0)
1808		do_tcp = 0;
1809
1810	/* getaddrinfo */
1811	memset(&hints, 0, sizeof(hints));
1812	hints.ai_flags = AI_PASSIVE;
1813	/* no name lookups on our listening ports */
1814	if(num_ifs > 0)
1815		hints.ai_flags |= AI_NUMERICHOST;
1816	hints.ai_family = AF_UNSPEC;
1817#ifndef INET6
1818	do_ip6 = 0;
1819#endif
1820	if(!do_ip4 && !do_ip6) {
1821		return NULL;
1822	}
1823	/* create ip4 and ip6 ports so that return addresses are nice. */
1824	if(do_auto || num_ifs == 0) {
1825		if(do_auto && cfg->if_automatic_ports &&
1826			cfg->if_automatic_ports[0]!=0) {
1827			char* now = cfg->if_automatic_ports;
1828			while(now && *now) {
1829				char* after;
1830				int extraport;
1831				while(isspace((unsigned char)*now))
1832					now++;
1833				if(!*now)
1834					break;
1835				after = now;
1836				extraport = (int)strtol(now, &after, 10);
1837				if(extraport < 0 || extraport > 65535) {
1838					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1839					listening_ports_free(list);
1840					return NULL;
1841				}
1842				if(extraport == 0 && now == after) {
1843					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1844					listening_ports_free(list);
1845					return NULL;
1846				}
1847				now = after;
1848				snprintf(portbuf, sizeof(portbuf), "%d", extraport);
1849				if(do_ip6) {
1850					hints.ai_family = AF_INET6;
1851					if(!ports_create_if("::0",
1852						do_auto, cfg->do_udp, do_tcp,
1853						&hints, portbuf, &list,
1854						cfg->so_rcvbuf, cfg->so_sndbuf,
1855						cfg->ssl_port, cfg->tls_additional_port,
1856						cfg->https_port,
1857						cfg->proxy_protocol_port,
1858						reuseport, cfg->ip_transparent,
1859						cfg->tcp_mss, cfg->ip_freebind,
1860						cfg->http_nodelay, cfg->use_systemd,
1861						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1862						listening_ports_free(list);
1863						return NULL;
1864					}
1865				}
1866				if(do_ip4) {
1867					hints.ai_family = AF_INET;
1868					if(!ports_create_if("0.0.0.0",
1869						do_auto, cfg->do_udp, do_tcp,
1870						&hints, portbuf, &list,
1871						cfg->so_rcvbuf, cfg->so_sndbuf,
1872						cfg->ssl_port, cfg->tls_additional_port,
1873						cfg->https_port,
1874						cfg->proxy_protocol_port,
1875						reuseport, cfg->ip_transparent,
1876						cfg->tcp_mss, cfg->ip_freebind,
1877						cfg->http_nodelay, cfg->use_systemd,
1878						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1879						listening_ports_free(list);
1880						return NULL;
1881					}
1882				}
1883			}
1884			return list;
1885		}
1886		if(do_ip6) {
1887			hints.ai_family = AF_INET6;
1888			if(!ports_create_if(do_auto?"::0":"::1",
1889				do_auto, cfg->do_udp, do_tcp,
1890				&hints, portbuf, &list,
1891				cfg->so_rcvbuf, cfg->so_sndbuf,
1892				cfg->ssl_port, cfg->tls_additional_port,
1893				cfg->https_port, cfg->proxy_protocol_port,
1894				reuseport, cfg->ip_transparent,
1895				cfg->tcp_mss, cfg->ip_freebind,
1896				cfg->http_nodelay, cfg->use_systemd,
1897				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1898				listening_ports_free(list);
1899				return NULL;
1900			}
1901		}
1902		if(do_ip4) {
1903			hints.ai_family = AF_INET;
1904			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
1905				do_auto, cfg->do_udp, do_tcp,
1906				&hints, portbuf, &list,
1907				cfg->so_rcvbuf, cfg->so_sndbuf,
1908				cfg->ssl_port, cfg->tls_additional_port,
1909				cfg->https_port, cfg->proxy_protocol_port,
1910				reuseport, cfg->ip_transparent,
1911				cfg->tcp_mss, cfg->ip_freebind,
1912				cfg->http_nodelay, cfg->use_systemd,
1913				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1914				listening_ports_free(list);
1915				return NULL;
1916			}
1917		}
1918	} else for(i = 0; i<num_ifs; i++) {
1919		if(str_is_ip6(ifs[i])) {
1920			if(!do_ip6)
1921				continue;
1922			hints.ai_family = AF_INET6;
1923			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1924				do_tcp, &hints, portbuf, &list,
1925				cfg->so_rcvbuf, cfg->so_sndbuf,
1926				cfg->ssl_port, cfg->tls_additional_port,
1927				cfg->https_port, cfg->proxy_protocol_port,
1928				reuseport, cfg->ip_transparent,
1929				cfg->tcp_mss, cfg->ip_freebind,
1930				cfg->http_nodelay, cfg->use_systemd,
1931				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1932				listening_ports_free(list);
1933				return NULL;
1934			}
1935		} else {
1936			if(!do_ip4)
1937				continue;
1938			hints.ai_family = AF_INET;
1939			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1940				do_tcp, &hints, portbuf, &list,
1941				cfg->so_rcvbuf, cfg->so_sndbuf,
1942				cfg->ssl_port, cfg->tls_additional_port,
1943				cfg->https_port, cfg->proxy_protocol_port,
1944				reuseport, cfg->ip_transparent,
1945				cfg->tcp_mss, cfg->ip_freebind,
1946				cfg->http_nodelay, cfg->use_systemd,
1947				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1948				listening_ports_free(list);
1949				return NULL;
1950			}
1951		}
1952	}
1953
1954	return list;
1955}
1956
1957void listening_ports_free(struct listen_port* list)
1958{
1959	struct listen_port* nx;
1960	while(list) {
1961		nx = list->next;
1962		if(list->fd != -1) {
1963			sock_close(list->fd);
1964		}
1965		/* rc_ports don't have ub_socket */
1966		if(list->socket) {
1967			free(list->socket->addr);
1968			free(list->socket);
1969		}
1970		free(list);
1971		list = nx;
1972	}
1973}
1974
1975size_t listen_get_mem(struct listen_dnsport* listen)
1976{
1977	struct listen_list* p;
1978	size_t s = sizeof(*listen) + sizeof(*listen->base) +
1979		sizeof(*listen->udp_buff) +
1980		sldns_buffer_capacity(listen->udp_buff);
1981#ifdef USE_DNSCRYPT
1982	s += sizeof(*listen->dnscrypt_udp_buff);
1983	if(listen->udp_buff != listen->dnscrypt_udp_buff){
1984		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1985	}
1986#endif
1987	for(p = listen->cps; p; p = p->next) {
1988		s += sizeof(*p);
1989		s += comm_point_get_mem(p->com);
1990	}
1991	return s;
1992}
1993
1994void listen_stop_accept(struct listen_dnsport* listen)
1995{
1996	/* do not stop the ones that have no tcp_free list
1997	 * (they have already stopped listening) */
1998	struct listen_list* p;
1999	for(p=listen->cps; p; p=p->next) {
2000		if(p->com->type == comm_tcp_accept &&
2001			p->com->tcp_free != NULL) {
2002			comm_point_stop_listening(p->com);
2003		}
2004	}
2005}
2006
2007void listen_start_accept(struct listen_dnsport* listen)
2008{
2009	/* do not start the ones that have no tcp_free list, it is no
2010	 * use to listen to them because they have no free tcp handlers */
2011	struct listen_list* p;
2012	for(p=listen->cps; p; p=p->next) {
2013		if(p->com->type == comm_tcp_accept &&
2014			p->com->tcp_free != NULL) {
2015			comm_point_start_listening(p->com, -1, -1);
2016		}
2017	}
2018}
2019
2020struct tcp_req_info*
2021tcp_req_info_create(struct sldns_buffer* spoolbuf)
2022{
2023	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
2024	if(!req) {
2025		log_err("malloc failure for new stream outoforder processing structure");
2026		return NULL;
2027	}
2028	memset(req, 0, sizeof(*req));
2029	req->spool_buffer = spoolbuf;
2030	return req;
2031}
2032
/**
 * Clear and free a tcp_req_info.
 * @param req: structure to delete, NULL is ignored.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req != NULL) {
		tcp_req_info_clear(req);
		/* cp is pointer back to commpoint that owns this struct and
		 * called delete on us */
		/* spool_buffer is shared udp buffer, not deleted here */
		free(req);
	}
}
2043
2044void tcp_req_info_clear(struct tcp_req_info* req)
2045{
2046	struct tcp_req_open_item* open, *nopen;
2047	struct tcp_req_done_item* item, *nitem;
2048	if(!req) return;
2049
2050	/* free outstanding request mesh reply entries */
2051	open = req->open_req_list;
2052	while(open) {
2053		nopen = open->next;
2054		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
2055		free(open);
2056		open = nopen;
2057	}
2058	req->open_req_list = NULL;
2059	req->num_open_req = 0;
2060
2061	/* free pending writable result packets */
2062	item = req->done_req_list;
2063	while(item) {
2064		nitem = item->next;
2065		lock_basic_lock(&stream_wait_count_lock);
2066		stream_wait_count -= (sizeof(struct tcp_req_done_item)
2067			+item->len);
2068		lock_basic_unlock(&stream_wait_count_lock);
2069		free(item->buf);
2070		free(item);
2071		item = nitem;
2072	}
2073	req->done_req_list = NULL;
2074	req->num_done_req = 0;
2075	req->read_is_closed = 0;
2076}
2077
2078void
2079tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
2080{
2081	struct tcp_req_open_item* open, *prev = NULL;
2082	if(!req || !m) return;
2083	open = req->open_req_list;
2084	while(open) {
2085		if(open->mesh_state == m) {
2086			struct tcp_req_open_item* next;
2087			if(prev) prev->next = open->next;
2088			else req->open_req_list = open->next;
2089			/* caller has to manage the mesh state reply entry */
2090			next = open->next;
2091			free(open);
2092			req->num_open_req --;
2093
2094			/* prev = prev; */
2095			open = next;
2096			continue;
2097		}
2098		prev = open;
2099		open = open->next;
2100	}
2101}
2102
2103/** setup listening for read or write */
2104static void
2105tcp_req_info_setup_listen(struct tcp_req_info* req)
2106{
2107	int wr = 0;
2108	int rd = 0;
2109
2110	if(req->cp->tcp_byte_count != 0) {
2111		/* cannot change, halfway through */
2112		return;
2113	}
2114
2115	if(!req->cp->tcp_is_reading)
2116		wr = 1;
2117	if(!req->read_is_closed)
2118		rd = 1;
2119
2120	if(wr) {
2121		req->cp->tcp_is_reading = 0;
2122		comm_point_stop_listening(req->cp);
2123		comm_point_start_listening(req->cp, -1,
2124			adjusted_tcp_timeout(req->cp));
2125	} else if(rd) {
2126		req->cp->tcp_is_reading = 1;
2127		comm_point_stop_listening(req->cp);
2128		comm_point_start_listening(req->cp, -1,
2129			adjusted_tcp_timeout(req->cp));
2130		/* and also read it (from SSL stack buffers), so
2131		 * no event read event is expected since the remainder of
2132		 * the TLS frame is sitting in the buffers. */
2133		req->read_again = 1;
2134	} else {
2135		comm_point_stop_listening(req->cp);
2136		comm_point_start_listening(req->cp, -1,
2137			adjusted_tcp_timeout(req->cp));
2138		comm_point_listen_for_rw(req->cp, 0, 0);
2139	}
2140}
2141
2142/** remove first item from list of pending results */
2143static struct tcp_req_done_item*
2144tcp_req_info_pop_done(struct tcp_req_info* req)
2145{
2146	struct tcp_req_done_item* item;
2147	log_assert(req->num_done_req > 0 && req->done_req_list);
2148	item = req->done_req_list;
2149	lock_basic_lock(&stream_wait_count_lock);
2150	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
2151	lock_basic_unlock(&stream_wait_count_lock);
2152	req->done_req_list = req->done_req_list->next;
2153	req->num_done_req --;
2154	return item;
2155}
2156
2157/** Send given buffer and setup to write */
2158static void
2159tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
2160	size_t len)
2161{
2162	sldns_buffer_clear(req->cp->buffer);
2163	sldns_buffer_write(req->cp->buffer, buf, len);
2164	sldns_buffer_flip(req->cp->buffer);
2165
2166	req->cp->tcp_is_reading = 0; /* we are now writing */
2167}
2168
2169/** pick up the next result and start writing it to the channel */
2170static void
2171tcp_req_pickup_next_result(struct tcp_req_info* req)
2172{
2173	if(req->num_done_req > 0) {
2174		/* unlist the done item from the list of pending results */
2175		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
2176		tcp_req_info_start_write_buf(req, item->buf, item->len);
2177		free(item->buf);
2178		free(item);
2179	}
2180}
2181
2182/** the read channel has closed */
2183int
2184tcp_req_info_handle_read_close(struct tcp_req_info* req)
2185{
2186	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
2187	/* reset byte count for (potential) partial read */
2188	req->cp->tcp_byte_count = 0;
2189	/* if we still have results to write, pick up next and write it */
2190	if(req->num_done_req != 0) {
2191		tcp_req_pickup_next_result(req);
2192		tcp_req_info_setup_listen(req);
2193		return 1;
2194	}
2195	/* if nothing to do, this closes the connection */
2196	if(req->num_open_req == 0 && req->num_done_req == 0)
2197		return 0;
2198	/* otherwise, we must be waiting for dns resolve, wait with timeout */
2199	req->read_is_closed = 1;
2200	tcp_req_info_setup_listen(req);
2201	return 1;
2202}
2203
2204void
2205tcp_req_info_handle_writedone(struct tcp_req_info* req)
2206{
2207	/* back to reading state, we finished this write event */
2208	sldns_buffer_clear(req->cp->buffer);
2209	if(req->num_done_req == 0 && req->read_is_closed) {
2210		/* no more to write and nothing to read, close it */
2211		comm_point_drop_reply(&req->cp->repinfo);
2212		return;
2213	}
2214	req->cp->tcp_is_reading = 1;
2215	/* see if another result needs writing */
2216	tcp_req_pickup_next_result(req);
2217
2218	/* see if there is more to write, if not stop_listening for writing */
2219	/* see if new requests are allowed, if so, start_listening
2220	 * for reading */
2221	tcp_req_info_setup_listen(req);
2222}
2223
2224void
2225tcp_req_info_handle_readdone(struct tcp_req_info* req)
2226{
2227	struct comm_point* c = req->cp;
2228
2229	/* we want to read up several requests, unless there are
2230	 * pending answers */
2231
2232	req->is_drop = 0;
2233	req->is_reply = 0;
2234	req->in_worker_handle = 1;
2235	sldns_buffer_set_limit(req->spool_buffer, 0);
2236	/* handle the current request */
2237	/* this calls the worker handle request routine that could give
2238	 * a cache response, or localdata response, or drop the reply,
2239	 * or schedule a mesh entry for later */
2240	fptr_ok(fptr_whitelist_comm_point(c->callback));
2241	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
2242		req->in_worker_handle = 0;
2243		/* there is an answer, put it up.  It is already in the
2244		 * c->buffer, just send it. */
2245		/* since we were just reading a query, the channel is
2246		 * clear to write to */
2247	send_it:
2248		c->tcp_is_reading = 0;
2249		comm_point_stop_listening(c);
2250		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
2251		return;
2252	}
2253	req->in_worker_handle = 0;
2254	/* it should be waiting in the mesh for recursion.
2255	 * If mesh failed to add a new entry and called commpoint_drop_reply.
2256	 * Then the mesh state has been cleared. */
2257	if(req->is_drop) {
2258		/* the reply has been dropped, stream has been closed. */
2259		return;
2260	}
2261	/* If mesh failed(mallocfail) and called commpoint_send_reply with
2262	 * something like servfail then we pick up that reply below. */
2263	if(req->is_reply) {
2264		goto send_it;
2265	}
2266
2267	sldns_buffer_clear(c->buffer);
2268	/* if pending answers, pick up an answer and start sending it */
2269	tcp_req_pickup_next_result(req);
2270
2271	/* if answers pending, start sending answers */
2272	/* read more requests if we can have more requests */
2273	tcp_req_info_setup_listen(req);
2274}
2275
2276int
2277tcp_req_info_add_meshstate(struct tcp_req_info* req,
2278	struct mesh_area* mesh, struct mesh_state* m)
2279{
2280	struct tcp_req_open_item* item;
2281	log_assert(req && mesh && m);
2282	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
2283	if(!item) return 0;
2284	item->next = req->open_req_list;
2285	item->mesh = mesh;
2286	item->mesh_state = m;
2287	req->open_req_list = item;
2288	req->num_open_req++;
2289	return 1;
2290}
2291
2292/** Add a result to the result list.  At the end. */
2293static int
2294tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
2295{
2296	struct tcp_req_done_item* last = NULL;
2297	struct tcp_req_done_item* item;
2298	size_t space;
2299
2300	/* see if we have space */
2301	space = sizeof(struct tcp_req_done_item) + len;
2302	lock_basic_lock(&stream_wait_count_lock);
2303	if(stream_wait_count + space > stream_wait_max) {
2304		lock_basic_unlock(&stream_wait_count_lock);
2305		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
2306		return 0;
2307	}
2308	stream_wait_count += space;
2309	lock_basic_unlock(&stream_wait_count_lock);
2310
2311	/* find last element */
2312	last = req->done_req_list;
2313	while(last && last->next)
2314		last = last->next;
2315
2316	/* create new element */
2317	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2318	if(!item) {
2319		log_err("malloc failure, for stream result list");
2320		return 0;
2321	}
2322	item->next = NULL;
2323	item->len = len;
2324	item->buf = memdup(buf, len);
2325	if(!item->buf) {
2326		free(item);
2327		log_err("malloc failure, adding reply to stream result list");
2328		return 0;
2329	}
2330
2331	/* link in */
2332	if(last) last->next = item;
2333	else req->done_req_list = item;
2334	req->num_done_req++;
2335	return 1;
2336}
2337
2338void
2339tcp_req_info_send_reply(struct tcp_req_info* req)
2340{
2341	if(req->in_worker_handle) {
2342		/* reply from mesh is in the spool_buffer */
2343		/* copy now, so that the spool buffer is free for other tasks
2344		 * before the callback is done */
2345		sldns_buffer_clear(req->cp->buffer);
2346		sldns_buffer_write(req->cp->buffer,
2347			sldns_buffer_begin(req->spool_buffer),
2348			sldns_buffer_limit(req->spool_buffer));
2349		sldns_buffer_flip(req->cp->buffer);
2350		req->is_reply = 1;
2351		return;
2352	}
2353	/* now that the query has been handled, that mesh_reply entry
2354	 * should be removed, from the tcp_req_info list,
2355	 * the mesh state cleanup removes then with region_cleanup and
2356	 * replies_sent true. */
2357	/* see if we can send it straight away (we are not doing
2358	 * anything else).  If so, copy to buffer and start */
2359	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2360		/* buffer is free, and was ready to read new query into,
2361		 * but we are now going to use it to send this answer */
2362		tcp_req_info_start_write_buf(req,
2363			sldns_buffer_begin(req->spool_buffer),
2364			sldns_buffer_limit(req->spool_buffer));
2365		/* switch to listen to write events */
2366		comm_point_stop_listening(req->cp);
2367		comm_point_start_listening(req->cp, -1,
2368			adjusted_tcp_timeout(req->cp));
2369		return;
2370	}
2371	/* queue up the answer behind the others already pending */
2372	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2373		sldns_buffer_limit(req->spool_buffer))) {
2374		/* drop the connection, we are out of resources */
2375		comm_point_drop_reply(&req->cp->repinfo);
2376	}
2377}
2378
2379size_t tcp_req_info_get_stream_buffer_size(void)
2380{
2381	size_t s;
2382	if(!stream_wait_lock_inited)
2383		return stream_wait_count;
2384	lock_basic_lock(&stream_wait_count_lock);
2385	s = stream_wait_count;
2386	lock_basic_unlock(&stream_wait_count_lock);
2387	return s;
2388}
2389
2390size_t http2_get_query_buffer_size(void)
2391{
2392	size_t s;
2393	if(!http2_query_buffer_lock_inited)
2394		return http2_query_buffer_count;
2395	lock_basic_lock(&http2_query_buffer_count_lock);
2396	s = http2_query_buffer_count;
2397	lock_basic_unlock(&http2_query_buffer_count_lock);
2398	return s;
2399}
2400
2401size_t http2_get_response_buffer_size(void)
2402{
2403	size_t s;
2404	if(!http2_response_buffer_lock_inited)
2405		return http2_response_buffer_count;
2406	lock_basic_lock(&http2_response_buffer_count_lock);
2407	s = http2_response_buffer_count;
2408	lock_basic_unlock(&http2_response_buffer_count_lock);
2409	return s;
2410}
2411
2412#ifdef HAVE_NGHTTP2
2413/** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
2414static ssize_t http2_submit_response_read_callback(
2415	nghttp2_session* ATTR_UNUSED(session),
2416	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2417	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2418{
2419	struct http2_stream* h2_stream;
2420	struct http2_session* h2_session = source->ptr;
2421	size_t copylen = length;
2422	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2423		h2_session->session, stream_id))) {
2424		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2425			"stream");
2426		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2427	}
2428	if(!h2_stream->rbuffer ||
2429		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2430		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2431			"available in rbuffer");
2432		/* rbuffer will be free'd in frame close cb */
2433		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2434	}
2435
2436	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2437		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2438	if(copylen > SSIZE_MAX)
2439		copylen = SSIZE_MAX; /* will probably never happen */
2440
2441	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2442	sldns_buffer_skip(h2_stream->rbuffer, copylen);
2443
2444	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2445		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2446		lock_basic_lock(&http2_response_buffer_count_lock);
2447		http2_response_buffer_count -=
2448			sldns_buffer_capacity(h2_stream->rbuffer);
2449		lock_basic_unlock(&http2_response_buffer_count_lock);
2450		sldns_buffer_free(h2_stream->rbuffer);
2451		h2_stream->rbuffer = NULL;
2452	}
2453
2454	return copylen;
2455}
2456
2457/**
2458 * Send RST_STREAM frame for stream.
2459 * @param h2_session: http2 session to submit frame to
2460 * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2461 * @return 0 on error, 1 otherwise
2462 */
2463static int http2_submit_rst_stream(struct http2_session* h2_session,
2464		struct http2_stream* h2_stream)
2465{
2466	int ret = nghttp2_submit_rst_stream(h2_session->session,
2467		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2468		NGHTTP2_INTERNAL_ERROR);
2469	if(ret) {
2470		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2471			"error: %s", nghttp2_strerror(ret));
2472		return 0;
2473	}
2474	return 1;
2475}
2476
2477/**
2478 * DNS response ready to be submitted to nghttp2, to be prepared for sending
2479 * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2480 * might be used before this will be sent out.
2481 * @param h2_session: http2 session, containing c->buffer which contains answer
2482 * @return 0 on error, 1 otherwise
2483 */
2484int http2_submit_dns_response(struct http2_session* h2_session)
2485{
2486	int ret;
2487	nghttp2_data_provider data_prd;
2488	char status[4];
2489	nghttp2_nv headers[3];
2490	struct http2_stream* h2_stream = h2_session->c->h2_stream;
2491	size_t rlen;
2492	char rlen_str[32];
2493
2494	if(h2_stream->rbuffer) {
2495		log_err("http2 submit response error: rbuffer already "
2496			"exists");
2497		return 0;
2498	}
2499	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2500		log_err("http2 submit response error: c->buffer not complete");
2501		return 0;
2502	}
2503
2504	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2505		verbose(VERB_QUERY, "http2: submit response error: "
2506			"invalid status");
2507		return 0;
2508	}
2509
2510	rlen = sldns_buffer_remaining(h2_session->c->buffer);
2511	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
2512
2513	lock_basic_lock(&http2_response_buffer_count_lock);
2514	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2515		lock_basic_unlock(&http2_response_buffer_count_lock);
2516		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2517			"in https-response-buffer-size");
2518		return http2_submit_rst_stream(h2_session, h2_stream);
2519	}
2520	http2_response_buffer_count += rlen;
2521	lock_basic_unlock(&http2_response_buffer_count_lock);
2522
2523	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2524		lock_basic_lock(&http2_response_buffer_count_lock);
2525		http2_response_buffer_count -= rlen;
2526		lock_basic_unlock(&http2_response_buffer_count_lock);
2527		log_err("http2 submit response error: malloc failure");
2528		return 0;
2529	}
2530
2531	headers[0].name = (uint8_t*)":status";
2532	headers[0].namelen = 7;
2533	headers[0].value = (uint8_t*)status;
2534	headers[0].valuelen = 3;
2535	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2536
2537	headers[1].name = (uint8_t*)"content-type";
2538	headers[1].namelen = 12;
2539	headers[1].value = (uint8_t*)"application/dns-message";
2540	headers[1].valuelen = 23;
2541	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2542
2543	headers[2].name = (uint8_t*)"content-length";
2544	headers[2].namelen = 14;
2545	headers[2].value = (uint8_t*)rlen_str;
2546	headers[2].valuelen = strlen(rlen_str);
2547	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2548
2549	sldns_buffer_write(h2_stream->rbuffer,
2550		sldns_buffer_current(h2_session->c->buffer),
2551		sldns_buffer_remaining(h2_session->c->buffer));
2552	sldns_buffer_flip(h2_stream->rbuffer);
2553
2554	data_prd.source.ptr = h2_session;
2555	data_prd.read_callback = http2_submit_response_read_callback;
2556	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2557		headers, 3, &data_prd);
2558	if(ret) {
2559		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2560			"error: %s", nghttp2_strerror(ret));
2561		return 0;
2562	}
2563	return 1;
2564}
2565#else
/**
 * Replacement for builds without HAVE_NGHTTP2.
 * The argument is ignored.
 * @return always 0.
 */
int http2_submit_dns_response(void* ATTR_UNUSED(v))
{
	return 0;
}
2570#endif
2571
2572#ifdef HAVE_NGHTTP2
2573/** HTTP status to descriptive string */
2574static char* http_status_to_str(enum http_status s)
2575{
2576	switch(s) {
2577		case HTTP_STATUS_OK:
2578			return "OK";
2579		case HTTP_STATUS_BAD_REQUEST:
2580			return "Bad Request";
2581		case HTTP_STATUS_NOT_FOUND:
2582			return "Not Found";
2583		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2584			return "Payload Too Large";
2585		case HTTP_STATUS_URI_TOO_LONG:
2586			return "URI Too Long";
2587		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2588			return "Unsupported Media Type";
2589		case HTTP_STATUS_NOT_IMPLEMENTED:
2590			return "Not Implemented";
2591	}
2592	return "Status Unknown";
2593}
2594
2595/** nghttp2 callback. Used to copy error message to nghttp2 session */
2596static ssize_t http2_submit_error_read_callback(
2597	nghttp2_session* ATTR_UNUSED(session),
2598	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2599	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2600{
2601	struct http2_stream* h2_stream;
2602	struct http2_session* h2_session = source->ptr;
2603	char* msg;
2604	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2605		h2_session->session, stream_id))) {
2606		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2607			"stream");
2608		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2609	}
2610	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2611	msg = http_status_to_str(h2_stream->status);
2612	if(length < strlen(msg))
2613		return 0; /* not worth trying over multiple frames */
2614	memcpy(buf, msg, strlen(msg));
2615	return strlen(msg);
2616
2617}
2618
2619/**
2620 * HTTP error response ready to be submitted to nghttp2, to be prepared for
2621 * sending out. Message body will contain descriptive string for HTTP status.
2622 * @param h2_session: http2 session to submit to
2623 * @param h2_stream: http2 stream containing HTTP status to use for error
2624 * @return 0 on error, 1 otherwise
2625 */
2626static int http2_submit_error(struct http2_session* h2_session,
2627	struct http2_stream* h2_stream)
2628{
2629	int ret;
2630	char status[4];
2631	nghttp2_data_provider data_prd;
2632	nghttp2_nv headers[1]; /* will be copied by nghttp */
2633	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2634		verbose(VERB_QUERY, "http2: submit error failed, "
2635			"invalid status");
2636		return 0;
2637	}
2638	headers[0].name = (uint8_t*)":status";
2639	headers[0].namelen = 7;
2640	headers[0].value = (uint8_t*)status;
2641	headers[0].valuelen = 3;
2642	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2643
2644	data_prd.source.ptr = h2_session;
2645	data_prd.read_callback = http2_submit_error_read_callback;
2646
2647	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2648		headers, 1, &data_prd);
2649	if(ret) {
2650		verbose(VERB_QUERY, "http2: submit error failed, "
2651			"error: %s", nghttp2_strerror(ret));
2652		return 0;
2653	}
2654	return 1;
2655}
2656
2657/**
2658 * Start query handling. Query is stored in the stream, and will be free'd here.
2659 * @param h2_session: http2 session, containing comm point
2660 * @param h2_stream: stream containing buffered query
2661 * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2662 * reply available (yet).
2663 */
2664static int http2_query_read_done(struct http2_session* h2_session,
2665	struct http2_stream* h2_stream)
2666{
2667	log_assert(h2_stream->qbuffer);
2668
2669	if(h2_session->c->h2_stream) {
2670		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2671			"buffer already assigned to stream");
2672		return -1;
2673	}
2674
2675    /* the c->buffer might be used by mesh_send_reply and no be cleard
2676	 * need to be cleared before use */
2677	sldns_buffer_clear(h2_session->c->buffer);
2678	if(sldns_buffer_remaining(h2_session->c->buffer) <
2679		sldns_buffer_remaining(h2_stream->qbuffer)) {
2680		/* qbuffer will be free'd in frame close cb */
2681		sldns_buffer_clear(h2_session->c->buffer);
2682		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2683			"qbuffer in c->buffer");
2684		return -1;
2685	}
2686
2687	sldns_buffer_write(h2_session->c->buffer,
2688		sldns_buffer_current(h2_stream->qbuffer),
2689		sldns_buffer_remaining(h2_stream->qbuffer));
2690
2691	lock_basic_lock(&http2_query_buffer_count_lock);
2692	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2693	lock_basic_unlock(&http2_query_buffer_count_lock);
2694	sldns_buffer_free(h2_stream->qbuffer);
2695	h2_stream->qbuffer = NULL;
2696
2697	sldns_buffer_flip(h2_session->c->buffer);
2698	h2_session->c->h2_stream = h2_stream;
2699	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2700	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2701		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2702		return 1; /* answer in c->buffer */
2703	}
2704	sldns_buffer_clear(h2_session->c->buffer);
2705	h2_session->c->h2_stream = NULL;
2706	return 0; /* mesh state added, or dropped */
2707}
2708
/**
 * nghttp2 callback. Used to check if the received frame indicates the end of a
 * stream. Gather collected request data and start query handling.
 * Requests that were flagged as invalid while they were received get an
 * HTTP error response; otherwise the buffered query is handed to
 * http2_query_read_done and, when an answer is available straight away,
 * the answer is submitted to the session.
 * @param session: nghttp2 session, used to look up the stream user data.
 * @param frame: the received frame.
 * @param cb_arg: the http2 session.
 * @return 0 on success (also for frames that are not of interest),
 *	NGHTTP2_ERR_CALLBACK_FAILURE on failure.
 */
static int http2_req_frame_recv_cb(nghttp2_session* session,
	const nghttp2_frame* frame, void* cb_arg)
{
	struct http2_session* h2_session = (struct http2_session*)cb_arg;
	struct http2_stream* h2_stream;
	int query_read_done;

	/* only DATA or HEADERS frames that carry END_STREAM finish a
	 * request */
	if((frame->hd.type != NGHTTP2_DATA &&
		frame->hd.type != NGHTTP2_HEADERS) ||
		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
			return 0;
	}

	/* no stream user data for this stream id, nothing to handle */
	if(!(h2_stream = nghttp2_session_get_stream_user_data(
		session, frame->hd.stream_id)))
		return 0;

	/* the checks below pick the HTTP status for a request that cannot
	 * be handled, and jump to the common error submit code */
	if(h2_stream->invalid_endpoint) {
		h2_stream->status = HTTP_STATUS_NOT_FOUND;
		goto submit_http_error;
	}

	if(h2_stream->invalid_content_type) {
		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
		goto submit_http_error;
	}

	if(h2_stream->http_method != HTTP_METHOD_GET &&
		h2_stream->http_method != HTTP_METHOD_POST) {
		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
		goto submit_http_error;
	}

	if(h2_stream->query_too_large) {
		if(h2_stream->http_method == HTTP_METHOD_POST)
			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
		else
			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
		goto submit_http_error;
	}

	/* no query was buffered for this stream */
	if(!h2_stream->qbuffer) {
		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
		goto submit_http_error;
	}

	/* a status that is already set at this point is also answered
	 * with an error; the label is the target of the checks above */
	if(h2_stream->status) {
submit_http_error:
		verbose(VERB_QUERY, "http2 request invalid, returning :status="
			"%d", h2_stream->status);
		if(!http2_submit_error(h2_session, h2_stream)) {
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		return 0;
	}
	h2_stream->status = HTTP_STATUS_OK;

	/* make the buffered query readable, then hand it over */
	sldns_buffer_flip(h2_stream->qbuffer);
	/* NOTE(review): postpone_drop is set for the duration of the query
	 * handling and reset on the two "no answer yet" paths below; it is
	 * not reset on the error return or after an answer was submitted -
	 * confirm that this is intended. */
	h2_session->postpone_drop = 1;
	query_read_done = http2_query_read_done(h2_session, h2_stream);
	if(query_read_done < 0)
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	else if(!query_read_done) {
		if(h2_session->is_drop) {
			/* connection needs to be closed. Return failure to make
			 * sure no other action are taken anymore on comm point.
			 * failure will result in reclaiming (and closing)
			 * of comm point. */
			verbose(VERB_QUERY, "http2 query dropped in worker cb");
			h2_session->postpone_drop = 0;
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		/* nothing to submit right now, query added to mesh. */
		h2_session->postpone_drop = 0;
		return 0;
	}
	/* the answer is in c->buffer; after submitting (or failing to)
	 * the shared buffer is released from the stream again */
	if(!http2_submit_dns_response(h2_session)) {
		sldns_buffer_clear(h2_session->c->buffer);
		h2_session->c->h2_stream = NULL;
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	}
	verbose(VERB_QUERY, "http2 query submitted to session");
	sldns_buffer_clear(h2_session->c->buffer);
	h2_session->c->h2_stream = NULL;
	return 0;
}
2797
2798/** nghttp2 callback. Used to detect start of new streams. */
2799static int http2_req_begin_headers_cb(nghttp2_session* session,
2800	const nghttp2_frame* frame, void* cb_arg)
2801{
2802	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2803	struct http2_stream* h2_stream;
2804	int ret;
2805	if(frame->hd.type != NGHTTP2_HEADERS ||
2806		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2807		/* only interested in request headers */
2808		return 0;
2809	}
2810	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2811		log_err("malloc failure while creating http2 stream");
2812		return NGHTTP2_ERR_CALLBACK_FAILURE;
2813	}
2814	http2_session_add_stream(h2_session, h2_stream);
2815	ret = nghttp2_session_set_stream_user_data(session,
2816		frame->hd.stream_id, h2_stream);
2817	if(ret) {
2818		/* stream does not exist */
2819		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2820			"error: %s", nghttp2_strerror(ret));
2821		return NGHTTP2_ERR_CALLBACK_FAILURE;
2822	}
2823
2824	return 0;
2825}
2826
2827/**
2828 * base64url decode, store in qbuffer
2829 * @param h2_session: http2 session
2830 * @param h2_stream: http2 stream
2831 * @param start: start of the base64 string
2832 * @param length: length of the base64 string
2833 * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2834 * buffer will be NULL is unparseble.
2835 */
2836static int http2_buffer_uri_query(struct http2_session* h2_session,
2837	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2838{
2839	size_t expectb64len;
2840	int b64len;
2841	if(h2_stream->http_method == HTTP_METHOD_POST)
2842		return 1;
2843	if(length == 0)
2844		return 1;
2845	if(h2_stream->qbuffer) {
2846		verbose(VERB_ALGO, "http2_req_header fail, "
2847			"qbuffer already set");
2848		return 0;
2849	}
2850
2851	/* calculate size, might be a bit bigger than the real
2852	 * decoded buffer size */
2853	expectb64len = sldns_b64_pton_calculate_size(length);
2854	log_assert(expectb64len > 0);
2855	if(expectb64len >
2856		h2_session->c->http2_stream_max_qbuffer_size) {
2857		h2_stream->query_too_large = 1;
2858		return 1;
2859	}
2860
2861	lock_basic_lock(&http2_query_buffer_count_lock);
2862	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2863		lock_basic_unlock(&http2_query_buffer_count_lock);
2864		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2865			"in http2-query-buffer-size");
2866		return http2_submit_rst_stream(h2_session, h2_stream);
2867	}
2868	http2_query_buffer_count += expectb64len;
2869	lock_basic_unlock(&http2_query_buffer_count_lock);
2870	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2871		lock_basic_lock(&http2_query_buffer_count_lock);
2872		http2_query_buffer_count -= expectb64len;
2873		lock_basic_unlock(&http2_query_buffer_count_lock);
2874		log_err("http2_req_header fail, qbuffer "
2875			"malloc failure");
2876		return 0;
2877	}
2878
2879	if(sldns_b64_contains_nonurl((char const*)start, length)) {
2880		char buf[65536+4];
2881		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
2882		/* copy to the scratch buffer temporarily to terminate the
2883		 * string with a zero */
2884		if(length+1 > sizeof(buf)) {
2885			/* too long */
2886			lock_basic_lock(&http2_query_buffer_count_lock);
2887			http2_query_buffer_count -= expectb64len;
2888			lock_basic_unlock(&http2_query_buffer_count_lock);
2889			sldns_buffer_free(h2_stream->qbuffer);
2890			h2_stream->qbuffer = NULL;
2891			return 1;
2892		}
2893		memmove(buf, start, length);
2894		buf[length] = 0;
2895		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
2896			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
2897			lock_basic_lock(&http2_query_buffer_count_lock);
2898			http2_query_buffer_count -= expectb64len;
2899			lock_basic_unlock(&http2_query_buffer_count_lock);
2900			sldns_buffer_free(h2_stream->qbuffer);
2901			h2_stream->qbuffer = NULL;
2902			return 1;
2903		}
2904	} else {
2905		if(!(b64len = sldns_b64url_pton(
2906			(char const *)start, length,
2907			sldns_buffer_current(h2_stream->qbuffer),
2908			expectb64len)) || b64len < 0) {
2909			lock_basic_lock(&http2_query_buffer_count_lock);
2910			http2_query_buffer_count -= expectb64len;
2911			lock_basic_unlock(&http2_query_buffer_count_lock);
2912			sldns_buffer_free(h2_stream->qbuffer);
2913			h2_stream->qbuffer = NULL;
2914			/* return without error, method can be an
2915			 * unknown POST */
2916			return 1;
2917		}
2918	}
2919	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
2920	return 1;
2921}
2922
2923/** nghttp2 callback. Used to parse headers from HEADER frames. */
2924static int http2_req_header_cb(nghttp2_session* session,
2925	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
2926	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
2927	void* cb_arg)
2928{
2929	struct http2_stream* h2_stream = NULL;
2930	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2931	/* nghttp2 deals with CONTINUATION frames and provides them as part of
2932	 * the HEADER */
2933	if(frame->hd.type != NGHTTP2_HEADERS ||
2934		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2935		/* only interested in request headers */
2936		return 0;
2937	}
2938	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
2939		frame->hd.stream_id)))
2940		return 0;
2941
2942	/* earlier checks already indicate we can stop handling this query */
2943	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
2944		h2_stream->invalid_content_type ||
2945		h2_stream->invalid_endpoint)
2946		return 0;
2947
2948
2949	/* nghttp2 performs some sanity checks in the headers, including:
2950	 * name and value are guaranteed to be null terminated
2951	 * name is guaranteed to be lowercase
2952	 * content-length value is guaranteed to contain digits
2953	 */
2954
2955	if(!h2_stream->http_method && namelen == 7 &&
2956		memcmp(":method", name, namelen) == 0) {
2957		/* Case insensitive check on :method value to be on the safe
2958		 * side. I failed to find text about case sensitivity in specs.
2959		 */
2960		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
2961			h2_stream->http_method = HTTP_METHOD_GET;
2962		else if(valuelen == 4 &&
2963			strcasecmp("POST", (const char*)value) == 0) {
2964			h2_stream->http_method = HTTP_METHOD_POST;
2965			if(h2_stream->qbuffer) {
2966				/* POST method uses query from DATA frames */
2967				lock_basic_lock(&http2_query_buffer_count_lock);
2968				http2_query_buffer_count -=
2969					sldns_buffer_capacity(h2_stream->qbuffer);
2970				lock_basic_unlock(&http2_query_buffer_count_lock);
2971				sldns_buffer_free(h2_stream->qbuffer);
2972				h2_stream->qbuffer = NULL;
2973			}
2974		} else
2975			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
2976		return 0;
2977	}
2978	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
2979		/* :path may contain DNS query, depending on method. Method might
2980		 * not be known yet here, so check after finishing receiving
2981		 * stream. */
2982#define	HTTP_QUERY_PARAM "?dns="
2983		size_t el = strlen(h2_session->c->http_endpoint);
2984		size_t qpl = strlen(HTTP_QUERY_PARAM);
2985
2986		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
2987			value, el) != 0) {
2988			h2_stream->invalid_endpoint = 1;
2989			return 0;
2990		}
2991		/* larger than endpoint only allowed if it is for the query
2992		 * parameter */
2993		if(valuelen <= el+qpl ||
2994			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
2995			if(valuelen != el)
2996				h2_stream->invalid_endpoint = 1;
2997			return 0;
2998		}
2999
3000		if(!http2_buffer_uri_query(h2_session, h2_stream,
3001			value+(el+qpl), valuelen-(el+qpl))) {
3002			return NGHTTP2_ERR_CALLBACK_FAILURE;
3003		}
3004		return 0;
3005	}
3006	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
3007	 * and not needed when using GET. Don't enfore.
3008	 * If set only allow lowercase "application/dns-message".
3009	 *
3010	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
3011	 * be able to handle "application/dns-message". Since that is the only
3012	 * content-type supported we can ignore the accept header.
3013	 */
3014	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
3015		if(valuelen != 23 || memcmp("application/dns-message", value,
3016			valuelen) != 0) {
3017			h2_stream->invalid_content_type = 1;
3018		}
3019	}
3020
3021	/* Only interested in content-lentg for POST (on not yet known) method.
3022	 */
3023	if((!h2_stream->http_method ||
3024		h2_stream->http_method == HTTP_METHOD_POST) &&
3025		!h2_stream->content_length && namelen  == 14 &&
3026		memcmp("content-length", name, namelen) == 0) {
3027		if(valuelen > 5) {
3028			h2_stream->query_too_large = 1;
3029			return 0;
3030		}
3031		/* guaranteed to only contain digits and be null terminated */
3032		h2_stream->content_length = atoi((const char*)value);
3033		if(h2_stream->content_length >
3034			h2_session->c->http2_stream_max_qbuffer_size) {
3035			h2_stream->query_too_large = 1;
3036			return 0;
3037		}
3038	}
3039	return 0;
3040}
3041
3042/** nghttp2 callback. Used to get data from DATA frames, which can contain
3043 * queries in POST requests. */
3044static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
3045	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
3046	size_t len, void* cb_arg)
3047{
3048	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3049	struct http2_stream* h2_stream;
3050	size_t qlen = 0;
3051
3052	if(!(h2_stream = nghttp2_session_get_stream_user_data(
3053		h2_session->session, stream_id))) {
3054		return 0;
3055	}
3056
3057	if(h2_stream->query_too_large)
3058		return 0;
3059
3060	if(!h2_stream->qbuffer) {
3061		if(h2_stream->content_length) {
3062			if(h2_stream->content_length < len)
3063				/* getting more data in DATA frame than
3064				 * advertised in content-length header. */
3065				return NGHTTP2_ERR_CALLBACK_FAILURE;
3066			qlen = h2_stream->content_length;
3067		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
3068			/* setting this to msg-buffer-size can result in a lot
3069			 * of memory consuption. Most queries should fit in a
3070			 * single DATA frame, and most POST queries will
3071			 * contain content-length which does not impose this
3072			 * limit. */
3073			qlen = len;
3074		}
3075	}
3076	if(!h2_stream->qbuffer && qlen) {
3077		lock_basic_lock(&http2_query_buffer_count_lock);
3078		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
3079			lock_basic_unlock(&http2_query_buffer_count_lock);
3080			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
3081				"in http2-query-buffer-size");
3082			return http2_submit_rst_stream(h2_session, h2_stream);
3083		}
3084		http2_query_buffer_count += qlen;
3085		lock_basic_unlock(&http2_query_buffer_count_lock);
3086		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
3087			lock_basic_lock(&http2_query_buffer_count_lock);
3088			http2_query_buffer_count -= qlen;
3089			lock_basic_unlock(&http2_query_buffer_count_lock);
3090		}
3091	}
3092
3093	if(!h2_stream->qbuffer ||
3094		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
3095		verbose(VERB_ALGO, "http2 data_chunck_recv failed. Not enough "
3096			"buffer space for POST query. Can happen on multi "
3097			"frame requests without content-length header");
3098		h2_stream->query_too_large = 1;
3099		return 0;
3100	}
3101
3102	sldns_buffer_write(h2_stream->qbuffer, data, len);
3103
3104	return 0;
3105}
3106
3107void http2_req_stream_clear(struct http2_stream* h2_stream)
3108{
3109	if(h2_stream->qbuffer) {
3110		lock_basic_lock(&http2_query_buffer_count_lock);
3111		http2_query_buffer_count -=
3112			sldns_buffer_capacity(h2_stream->qbuffer);
3113		lock_basic_unlock(&http2_query_buffer_count_lock);
3114		sldns_buffer_free(h2_stream->qbuffer);
3115		h2_stream->qbuffer = NULL;
3116	}
3117	if(h2_stream->rbuffer) {
3118		lock_basic_lock(&http2_response_buffer_count_lock);
3119		http2_response_buffer_count -=
3120			sldns_buffer_capacity(h2_stream->rbuffer);
3121		lock_basic_unlock(&http2_response_buffer_count_lock);
3122		sldns_buffer_free(h2_stream->rbuffer);
3123		h2_stream->rbuffer = NULL;
3124	}
3125}
3126
3127nghttp2_session_callbacks* http2_req_callbacks_create(void)
3128{
3129	nghttp2_session_callbacks *callbacks;
3130	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
3131		log_err("failed to initialize nghttp2 callback");
3132		return NULL;
3133	}
3134	/* reception of header block started, used to create h2_stream */
3135	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
3136		http2_req_begin_headers_cb);
3137	/* complete frame received, used to get data from stream if frame
3138	 * has end stream flag, and start processing query */
3139	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
3140		http2_req_frame_recv_cb);
3141	/* get request info from headers */
3142	nghttp2_session_callbacks_set_on_header_callback(callbacks,
3143		http2_req_header_cb);
3144	/* get data from DATA frames, containing POST query */
3145	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
3146		http2_req_data_chunk_recv_cb);
3147
3148	/* generic HTTP2 callbacks */
3149	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
3150	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
3151	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
3152		http2_stream_close_cb);
3153
3154	return callbacks;
3155}
3156#endif /* HAVE_NGHTTP2 */
3157