netevent.c revision 307729
1/*
2 * util/netevent.c - event notification
3 *
4 * Copyright (c) 2007, NLnet Labs. All rights reserved.
5 *
6 * This software is open source.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * Redistributions of source code must retain the above copyright notice,
13 * this list of conditions and the following disclaimer.
14 *
15 * Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 *
19 * Neither the name of the NLNET LABS nor the names of its contributors may
20 * be used to endorse or promote products derived from this software without
21 * specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/**
37 * \file
38 *
39 * This file contains event notification functions.
40 */
41#include "config.h"
42#include "util/netevent.h"
43#include "util/ub_event.h"
44#include "util/log.h"
45#include "util/net_help.h"
46#include "util/fptr_wlist.h"
47#include "sldns/pkthdr.h"
48#include "sldns/sbuffer.h"
49#include "dnstap/dnstap.h"
50#ifdef HAVE_OPENSSL_SSL_H
51#include <openssl/ssl.h>
52#endif
53#ifdef HAVE_OPENSSL_ERR_H
54#include <openssl/err.h>
55#endif
56
57/* -------- Start of local definitions -------- */
/**
 * if CMSG_ALIGN is not defined on this platform, a workaround.
 * A platform-private alignment macro is used when one is available
 * (__CMSG_ALIGN or _CMSG_DATA_ALIGN); otherwise the length is rounded up
 * to a multiple of sizeof(long).
 * The guard tests the same name (_CMSG_DATA_ALIGN) that the definition
 * expands to, so the macro can never expand to an undefined identifier.
 */
#ifndef CMSG_ALIGN
#  ifdef __CMSG_ALIGN
#    define CMSG_ALIGN(n) __CMSG_ALIGN(n)
#  elif defined(_CMSG_DATA_ALIGN)
#    define CMSG_ALIGN(n) _CMSG_DATA_ALIGN(n)
#  else
#    define CMSG_ALIGN(len) (((len)+sizeof(long)-1) & ~(sizeof(long)-1))
#  endif
#endif

/** if CMSG_LEN is not defined on this platform, a workaround:
 * the aligned header size plus the (unpadded) payload length */
#ifndef CMSG_LEN
#  define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr))+(len))
#endif

/** if CMSG_SPACE is not defined on this platform, a workaround:
 * the aligned payload length plus the aligned header size */
#ifndef CMSG_SPACE
#  ifdef _CMSG_HDR_ALIGN
#    define CMSG_SPACE(l) (CMSG_ALIGN(l)+_CMSG_HDR_ALIGN(sizeof(struct cmsghdr)))
#  else
#    define CMSG_SPACE(l) (CMSG_ALIGN(l)+CMSG_ALIGN(sizeof(struct cmsghdr)))
#  endif
#endif
82
/** Timeout, in milliseconds, for reading or writing a TCP query */
#define TCP_QUERY_TIMEOUT 120000
/** Shorter TCP timeout in msec, used when more than half of the
 * TCP handlers are in use */
#define TCP_QUERY_TIMEOUT_FAST 200

/** number of UDP reads to perform per read indication from select;
 * a single read when nonblocking sockets are broken */
#ifdef NONBLOCKING_IS_BROKEN
#define NUM_UDP_PER_SELECT 1
#else
#define NUM_UDP_PER_SELECT 100
#endif
94
/**
 * Internal event record: keeps the ub_event for an event together with
 * the comm base.  Possibly part of other structures (list, tree).
 */
struct internal_event {
	/** comm base for this event */
	struct comm_base *base;
	/** the ub_event itself */
	struct ub_event *ev;
};
105
/**
 * Internal base structure; every thread has its own, and with it its
 * own events.
 */
struct internal_base {
	/** the ub_event event_base */
	struct ub_event_base *base;
	/** storage that the seconds time pointer points at */
	time_t secs;
	/** current time as a timeval */
	struct timeval now;
	/** timer event used for the slow_accept timeout */
	struct ub_event *slow_accept;
	/** nonzero while slow_accept is enabled */
	int slow_accept_enabled;
};
121
122/**
123 * Internal timer structure, to store timer event in.
124 */
125struct internal_timer {
126	/** the super struct from which derived */
127	struct comm_timer super;
128	/** the comm base */
129	struct comm_base* base;
130	/** ub_event event type */
131	struct ub_event* ev;
132	/** is timer enabled */
133	uint8_t enabled;
134};
135
/**
 * Internal signal structure; one element of a singly linked list of
 * signal events.
 */
struct internal_signal {
	/** the ub_event for the signal */
	struct ub_event *ev;
	/** following element of the signal list */
	struct internal_signal *next;
};
145
146/** create a tcp handler with a parent */
147static struct comm_point* comm_point_create_tcp_handler(
148	struct comm_base *base, struct comm_point* parent, size_t bufsize,
149        comm_point_callback_t* callback, void* callback_arg);
150
151/* -------- End of local definitions -------- */
152
153struct comm_base*
154comm_base_create(int sigs)
155{
156	struct comm_base* b = (struct comm_base*)calloc(1,
157		sizeof(struct comm_base));
158	const char *evnm="event", *evsys="", *evmethod="";
159
160	if(!b)
161		return NULL;
162	b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base));
163	if(!b->eb) {
164		free(b);
165		return NULL;
166	}
167	b->eb->base = ub_default_event_base(sigs, &b->eb->secs, &b->eb->now);
168	if(!b->eb->base) {
169		free(b->eb);
170		free(b);
171		return NULL;
172	}
173	ub_comm_base_now(b);
174	ub_get_event_sys(b->eb->base, &evnm, &evsys, &evmethod);
175	verbose(VERB_ALGO, "%s %s user %s method.", evnm, evsys, evmethod);
176	return b;
177}
178
179struct comm_base*
180comm_base_create_event(struct ub_event_base* base)
181{
182	struct comm_base* b = (struct comm_base*)calloc(1,
183		sizeof(struct comm_base));
184	if(!b)
185		return NULL;
186	b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base));
187	if(!b->eb) {
188		free(b);
189		return NULL;
190	}
191	b->eb->base = base;
192	ub_comm_base_now(b);
193	return b;
194}
195
196void
197comm_base_delete(struct comm_base* b)
198{
199	if(!b)
200		return;
201	if(b->eb->slow_accept_enabled) {
202		if(ub_event_del(b->eb->slow_accept) != 0) {
203			log_err("could not event_del slow_accept");
204		}
205		ub_event_free(b->eb->slow_accept);
206	}
207	ub_event_base_free(b->eb->base);
208	b->eb->base = NULL;
209	free(b->eb);
210	free(b);
211}
212
213void
214comm_base_delete_no_base(struct comm_base* b)
215{
216	if(!b)
217		return;
218	if(b->eb->slow_accept_enabled) {
219		if(ub_event_del(b->eb->slow_accept) != 0) {
220			log_err("could not event_del slow_accept");
221		}
222		ub_event_free(b->eb->slow_accept);
223	}
224	b->eb->base = NULL;
225	free(b->eb);
226	free(b);
227}
228
229void
230comm_base_timept(struct comm_base* b, time_t** tt, struct timeval** tv)
231{
232	*tt = &b->eb->secs;
233	*tv = &b->eb->now;
234}
235
236void
237comm_base_dispatch(struct comm_base* b)
238{
239	int retval;
240	retval = ub_event_base_dispatch(b->eb->base);
241	if(retval < 0) {
242		fatal_exit("event_dispatch returned error %d, "
243			"errno is %s", retval, strerror(errno));
244	}
245}
246
247void comm_base_exit(struct comm_base* b)
248{
249	if(ub_event_base_loopexit(b->eb->base) != 0) {
250		log_err("Could not loopexit");
251	}
252}
253
254void comm_base_set_slow_accept_handlers(struct comm_base* b,
255	void (*stop_acc)(void*), void (*start_acc)(void*), void* arg)
256{
257	b->stop_accept = stop_acc;
258	b->start_accept = start_acc;
259	b->cb_arg = arg;
260}
261
262struct ub_event_base* comm_base_internal(struct comm_base* b)
263{
264	return b->eb->base;
265}
266
267/** see if errno for udp has to be logged or not uses globals */
268static int
269udp_send_errno_needs_log(struct sockaddr* addr, socklen_t addrlen)
270{
271	/* do not log transient errors (unless high verbosity) */
272#if defined(ENETUNREACH) || defined(EHOSTDOWN) || defined(EHOSTUNREACH) || defined(ENETDOWN)
273	switch(errno) {
274#  ifdef ENETUNREACH
275		case ENETUNREACH:
276#  endif
277#  ifdef EHOSTDOWN
278		case EHOSTDOWN:
279#  endif
280#  ifdef EHOSTUNREACH
281		case EHOSTUNREACH:
282#  endif
283#  ifdef ENETDOWN
284		case ENETDOWN:
285#  endif
286			if(verbosity < VERB_ALGO)
287				return 0;
288		default:
289			break;
290	}
291#endif
292	/* permission denied is gotten for every send if the
293	 * network is disconnected (on some OS), squelch it */
294	if( ((errno == EPERM)
295#  ifdef EADDRNOTAVAIL
296		/* 'Cannot assign requested address' also when disconnected */
297		|| (errno == EADDRNOTAVAIL)
298#  endif
299		) && verbosity < VERB_DETAIL)
300		return 0;
301	/* squelch errors where people deploy AAAA ::ffff:bla for
302	 * authority servers, which we try for intranets. */
303	if(errno == EINVAL && addr_is_ip4mapped(
304		(struct sockaddr_storage*)addr, addrlen) &&
305		verbosity < VERB_DETAIL)
306		return 0;
307	/* SO_BROADCAST sockopt can give access to 255.255.255.255,
308	 * but a dns cache does not need it. */
309	if(errno == EACCES && addr_is_broadcast(
310		(struct sockaddr_storage*)addr, addrlen) &&
311		verbosity < VERB_DETAIL)
312		return 0;
313	return 1;
314}
315
/**
 * See if the errno of a failed tcp connect has to be logged; the rules
 * of udp_send_errno_needs_log are applied.
 * @param addr: destination address.
 * @param addrlen: length of addr.
 * @return 0 to squelch the error, 1 to log it.
 */
int tcp_connect_errno_needs_log(struct sockaddr* addr, socklen_t addrlen)
{
	int needs_log = udp_send_errno_needs_log(addr, addrlen);
	return needs_log;
}
320
321/* send a UDP reply */
322int
323comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet,
324	struct sockaddr* addr, socklen_t addrlen)
325{
326	ssize_t sent;
327	log_assert(c->fd != -1);
328#ifdef UNBOUND_DEBUG
329	if(sldns_buffer_remaining(packet) == 0)
330		log_err("error: send empty UDP packet");
331#endif
332	log_assert(addr && addrlen > 0);
333	sent = sendto(c->fd, (void*)sldns_buffer_begin(packet),
334		sldns_buffer_remaining(packet), 0,
335		addr, addrlen);
336	if(sent == -1) {
337		/* try again and block, waiting for IO to complete,
338		 * we want to send the answer, and we will wait for
339		 * the ethernet interface buffer to have space. */
340#ifndef USE_WINSOCK
341		if(errno == EAGAIN ||
342#  ifdef EWOULDBLOCK
343			errno == EWOULDBLOCK ||
344#  endif
345			errno == ENOBUFS) {
346#else
347		if(WSAGetLastError() == WSAEINPROGRESS ||
348			WSAGetLastError() == WSAENOBUFS ||
349			WSAGetLastError() == WSAEWOULDBLOCK) {
350#endif
351			int e;
352			fd_set_block(c->fd);
353			sent = sendto(c->fd, (void*)sldns_buffer_begin(packet),
354				sldns_buffer_remaining(packet), 0,
355				addr, addrlen);
356			e = errno;
357			fd_set_nonblock(c->fd);
358			errno = e;
359		}
360	}
361	if(sent == -1) {
362		if(!udp_send_errno_needs_log(addr, addrlen))
363			return 0;
364#ifndef USE_WINSOCK
365		verbose(VERB_OPS, "sendto failed: %s", strerror(errno));
366#else
367		verbose(VERB_OPS, "sendto failed: %s",
368			wsa_strerror(WSAGetLastError()));
369#endif
370		log_addr(VERB_OPS, "remote address is",
371			(struct sockaddr_storage*)addr, addrlen);
372		return 0;
373	} else if((size_t)sent != sldns_buffer_remaining(packet)) {
374		log_err("sent %d in place of %d bytes",
375			(int)sent, (int)sldns_buffer_remaining(packet));
376		return 0;
377	}
378	return 1;
379}
380
381#if defined(AF_INET6) && defined(IPV6_PKTINFO) && (defined(HAVE_RECVMSG) || defined(HAVE_SENDMSG))
382/** print debug ancillary info */
383static void p_ancil(const char* str, struct comm_reply* r)
384{
385	if(r->srctype != 4 && r->srctype != 6) {
386		log_info("%s: unknown srctype %d", str, r->srctype);
387		return;
388	}
389	if(r->srctype == 6) {
390		char buf[1024];
391		if(inet_ntop(AF_INET6, &r->pktinfo.v6info.ipi6_addr,
392			buf, (socklen_t)sizeof(buf)) == 0) {
393			(void)strlcpy(buf, "(inet_ntop error)", sizeof(buf));
394		}
395		buf[sizeof(buf)-1]=0;
396		log_info("%s: %s %d", str, buf, r->pktinfo.v6info.ipi6_ifindex);
397	} else if(r->srctype == 4) {
398#ifdef IP_PKTINFO
399		char buf1[1024], buf2[1024];
400		if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_addr,
401			buf1, (socklen_t)sizeof(buf1)) == 0) {
402			(void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1));
403		}
404		buf1[sizeof(buf1)-1]=0;
405#ifdef HAVE_STRUCT_IN_PKTINFO_IPI_SPEC_DST
406		if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_spec_dst,
407			buf2, (socklen_t)sizeof(buf2)) == 0) {
408			(void)strlcpy(buf2, "(inet_ntop error)", sizeof(buf2));
409		}
410		buf2[sizeof(buf2)-1]=0;
411#else
412		buf2[0]=0;
413#endif
414		log_info("%s: %d %s %s", str, r->pktinfo.v4info.ipi_ifindex,
415			buf1, buf2);
416#elif defined(IP_RECVDSTADDR)
417		char buf1[1024];
418		if(inet_ntop(AF_INET, &r->pktinfo.v4addr,
419			buf1, (socklen_t)sizeof(buf1)) == 0) {
420			(void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1));
421		}
422		buf1[sizeof(buf1)-1]=0;
423		log_info("%s: %s", str, buf1);
424#endif /* IP_PKTINFO or PI_RECVDSTDADDR */
425	}
426}
427#endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG||HAVE_SENDMSG */
428
429/** send a UDP reply over specified interface*/
430static int
431comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet,
432	struct sockaddr* addr, socklen_t addrlen, struct comm_reply* r)
433{
434#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_SENDMSG)
435	ssize_t sent;
436	struct msghdr msg;
437	struct iovec iov[1];
438	char control[256];
439#ifndef S_SPLINT_S
440	struct cmsghdr *cmsg;
441#endif /* S_SPLINT_S */
442
443	log_assert(c->fd != -1);
444#ifdef UNBOUND_DEBUG
445	if(sldns_buffer_remaining(packet) == 0)
446		log_err("error: send empty UDP packet");
447#endif
448	log_assert(addr && addrlen > 0);
449
450	msg.msg_name = addr;
451	msg.msg_namelen = addrlen;
452	iov[0].iov_base = sldns_buffer_begin(packet);
453	iov[0].iov_len = sldns_buffer_remaining(packet);
454	msg.msg_iov = iov;
455	msg.msg_iovlen = 1;
456	msg.msg_control = control;
457#ifndef S_SPLINT_S
458	msg.msg_controllen = sizeof(control);
459#endif /* S_SPLINT_S */
460	msg.msg_flags = 0;
461
462#ifndef S_SPLINT_S
463	cmsg = CMSG_FIRSTHDR(&msg);
464	if(r->srctype == 4) {
465#ifdef IP_PKTINFO
466		void* cmsg_data;
467		msg.msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo));
468		log_assert(msg.msg_controllen <= sizeof(control));
469		cmsg->cmsg_level = IPPROTO_IP;
470		cmsg->cmsg_type = IP_PKTINFO;
471		memmove(CMSG_DATA(cmsg), &r->pktinfo.v4info,
472			sizeof(struct in_pktinfo));
473		/* unset the ifindex to not bypass the routing tables */
474		cmsg_data = CMSG_DATA(cmsg);
475		((struct in_pktinfo *) cmsg_data)->ipi_ifindex = 0;
476		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
477#elif defined(IP_SENDSRCADDR)
478		msg.msg_controllen = CMSG_SPACE(sizeof(struct in_addr));
479		log_assert(msg.msg_controllen <= sizeof(control));
480		cmsg->cmsg_level = IPPROTO_IP;
481		cmsg->cmsg_type = IP_SENDSRCADDR;
482		memmove(CMSG_DATA(cmsg), &r->pktinfo.v4addr,
483			sizeof(struct in_addr));
484		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
485#else
486		verbose(VERB_ALGO, "no IP_PKTINFO or IP_SENDSRCADDR");
487		msg.msg_control = NULL;
488#endif /* IP_PKTINFO or IP_SENDSRCADDR */
489	} else if(r->srctype == 6) {
490		void* cmsg_data;
491		msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
492		log_assert(msg.msg_controllen <= sizeof(control));
493		cmsg->cmsg_level = IPPROTO_IPV6;
494		cmsg->cmsg_type = IPV6_PKTINFO;
495		memmove(CMSG_DATA(cmsg), &r->pktinfo.v6info,
496			sizeof(struct in6_pktinfo));
497		/* unset the ifindex to not bypass the routing tables */
498		cmsg_data = CMSG_DATA(cmsg);
499		((struct in6_pktinfo *) cmsg_data)->ipi6_ifindex = 0;
500		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
501	} else {
502		/* try to pass all 0 to use default route */
503		msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
504		log_assert(msg.msg_controllen <= sizeof(control));
505		cmsg->cmsg_level = IPPROTO_IPV6;
506		cmsg->cmsg_type = IPV6_PKTINFO;
507		memset(CMSG_DATA(cmsg), 0, sizeof(struct in6_pktinfo));
508		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
509	}
510#endif /* S_SPLINT_S */
511	if(verbosity >= VERB_ALGO)
512		p_ancil("send_udp over interface", r);
513	sent = sendmsg(c->fd, &msg, 0);
514	if(sent == -1) {
515		/* try again and block, waiting for IO to complete,
516		 * we want to send the answer, and we will wait for
517		 * the ethernet interface buffer to have space. */
518#ifndef USE_WINSOCK
519		if(errno == EAGAIN ||
520#  ifdef EWOULDBLOCK
521			errno == EWOULDBLOCK ||
522#  endif
523			errno == ENOBUFS) {
524#else
525		if(WSAGetLastError() == WSAEINPROGRESS ||
526			WSAGetLastError() == WSAENOBUFS ||
527			WSAGetLastError() == WSAEWOULDBLOCK) {
528#endif
529			int e;
530			fd_set_block(c->fd);
531			sent = sendmsg(c->fd, &msg, 0);
532			e = errno;
533			fd_set_nonblock(c->fd);
534			errno = e;
535		}
536	}
537	if(sent == -1) {
538		if(!udp_send_errno_needs_log(addr, addrlen))
539			return 0;
540		verbose(VERB_OPS, "sendmsg failed: %s", strerror(errno));
541		log_addr(VERB_OPS, "remote address is",
542			(struct sockaddr_storage*)addr, addrlen);
543#ifdef __NetBSD__
544		/* netbsd 7 has IP_PKTINFO for recv but not send */
545		if(errno == EINVAL && r->srctype == 4)
546			log_err("sendmsg: No support for sendmsg(IP_PKTINFO). "
547				"Please disable interface-automatic");
548#endif
549		return 0;
550	} else if((size_t)sent != sldns_buffer_remaining(packet)) {
551		log_err("sent %d in place of %d bytes",
552			(int)sent, (int)sldns_buffer_remaining(packet));
553		return 0;
554	}
555	return 1;
556#else
557	(void)c;
558	(void)packet;
559	(void)addr;
560	(void)addrlen;
561	(void)r;
562	log_err("sendmsg: IPV6_PKTINFO not supported");
563	return 0;
564#endif /* AF_INET6 && IPV6_PKTINFO && HAVE_SENDMSG */
565}
566
/**
 * Event callback for a UDP comm point that uses ancillary data.
 * Performs up to NUM_UDP_PER_SELECT recvmsg calls; for every datagram
 * the packet info from the ancillary data is stored in the reply and the
 * comm point callback is invoked.  When that callback returns nonzero,
 * the buffer is sent back with comm_point_send_udp_msg_if.
 * Without AF_INET6, IPV6_PKTINFO and recvmsg support this is a fatal
 * error.
 * @param fd: the socket, equal to the fd of the comm point.
 * @param event: event bits; nothing is done unless UB_EV_READ is set.
 * @param arg: the comm point (struct comm_point*).
 */
void
comm_point_udp_ancil_callback(int fd, short event, void* arg)
{
#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
	struct comm_reply rep;
	struct msghdr msg;
	struct iovec iov[1];
	ssize_t rcv;
	/* a union, so that the buffer has the alignment that the
	 * struct cmsghdr entries read from it need */
	union {
		struct cmsghdr hdr;
		char buf[256];
	} ancil;
	int i;
#ifndef S_SPLINT_S
	struct cmsghdr* cmsg;
#endif /* S_SPLINT_S */

	rep.c = (struct comm_point*)arg;
	log_assert(rep.c->type == comm_udp);

	if(!(event&UB_EV_READ))
		return;
	log_assert(rep.c && rep.c->buffer && rep.c->fd == fd);
	ub_comm_base_now(rep.c->ev->base);
	for(i=0; i<NUM_UDP_PER_SELECT; i++) {
		sldns_buffer_clear(rep.c->buffer);
		rep.addrlen = (socklen_t)sizeof(rep.addr);
		log_assert(fd != -1);
		log_assert(sldns_buffer_remaining(rep.c->buffer) > 0);
		msg.msg_name = &rep.addr;
		msg.msg_namelen = (socklen_t)sizeof(rep.addr);
		iov[0].iov_base = sldns_buffer_begin(rep.c->buffer);
		iov[0].iov_len = sldns_buffer_remaining(rep.c->buffer);
		msg.msg_iov = iov;
		msg.msg_iovlen = 1;
		msg.msg_control = ancil.buf;
#ifndef S_SPLINT_S
		msg.msg_controllen = sizeof(ancil.buf);
#endif /* S_SPLINT_S */
		msg.msg_flags = 0;
		rcv = recvmsg(fd, &msg, 0);
		if(rcv == -1) {
			/* EAGAIN and EINTR are not logged */
			if(errno != EAGAIN && errno != EINTR) {
				log_err("recvmsg failed: %s", strerror(errno));
			}
			return;
		}
		rep.addrlen = msg.msg_namelen;
		sldns_buffer_skip(rep.c->buffer, rcv);
		sldns_buffer_flip(rep.c->buffer);
		/* srctype stays 0 if no packet info is found below */
		rep.srctype = 0;
#ifndef S_SPLINT_S
		for(cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
			cmsg = CMSG_NXTHDR(&msg, cmsg)) {
			if( cmsg->cmsg_level == IPPROTO_IPV6 &&
				cmsg->cmsg_type == IPV6_PKTINFO) {
				rep.srctype = 6;
				memmove(&rep.pktinfo.v6info, CMSG_DATA(cmsg),
					sizeof(struct in6_pktinfo));
				break;
#ifdef IP_PKTINFO
			} else if( cmsg->cmsg_level == IPPROTO_IP &&
				cmsg->cmsg_type == IP_PKTINFO) {
				rep.srctype = 4;
				memmove(&rep.pktinfo.v4info, CMSG_DATA(cmsg),
					sizeof(struct in_pktinfo));
				break;
#elif defined(IP_RECVDSTADDR)
			} else if( cmsg->cmsg_level == IPPROTO_IP &&
				cmsg->cmsg_type == IP_RECVDSTADDR) {
				rep.srctype = 4;
				memmove(&rep.pktinfo.v4addr, CMSG_DATA(cmsg),
					sizeof(struct in_addr));
				break;
#endif /* IP_PKTINFO or IP_RECVDSTADDR */
			}
		}
		if(verbosity >= VERB_ALGO)
			p_ancil("receive_udp on interface", &rep);
#endif /* S_SPLINT_S */
		fptr_ok(fptr_whitelist_comm_point(rep.c->callback));
		if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) {
			/* send back immediate reply */
			(void)comm_point_send_udp_msg_if(rep.c, rep.c->buffer,
				(struct sockaddr*)&rep.addr, rep.addrlen, &rep);
		}
		/* same test as in comm_point_udp_callback: stop if the
		 * commpoint was closed (fd is -1) or is now in use for
		 * another socket, fd must not be read any further then */
		if(rep.c->fd != fd)
			break;
	}
#else
	(void)fd;
	(void)event;
	(void)arg;
	fatal_exit("recvmsg: No support for IPV6_PKTINFO. "
		"Please disable interface-automatic");
#endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG */
}
661
662void
663comm_point_udp_callback(int fd, short event, void* arg)
664{
665	struct comm_reply rep;
666	ssize_t rcv;
667	int i;
668
669	rep.c = (struct comm_point*)arg;
670	log_assert(rep.c->type == comm_udp);
671
672	if(!(event&UB_EV_READ))
673		return;
674	log_assert(rep.c && rep.c->buffer && rep.c->fd == fd);
675	ub_comm_base_now(rep.c->ev->base);
676	for(i=0; i<NUM_UDP_PER_SELECT; i++) {
677		sldns_buffer_clear(rep.c->buffer);
678		rep.addrlen = (socklen_t)sizeof(rep.addr);
679		log_assert(fd != -1);
680		log_assert(sldns_buffer_remaining(rep.c->buffer) > 0);
681		rcv = recvfrom(fd, (void*)sldns_buffer_begin(rep.c->buffer),
682			sldns_buffer_remaining(rep.c->buffer), 0,
683			(struct sockaddr*)&rep.addr, &rep.addrlen);
684		if(rcv == -1) {
685#ifndef USE_WINSOCK
686			if(errno != EAGAIN && errno != EINTR)
687				log_err("recvfrom %d failed: %s",
688					fd, strerror(errno));
689#else
690			if(WSAGetLastError() != WSAEINPROGRESS &&
691				WSAGetLastError() != WSAECONNRESET &&
692				WSAGetLastError()!= WSAEWOULDBLOCK)
693				log_err("recvfrom failed: %s",
694					wsa_strerror(WSAGetLastError()));
695#endif
696			return;
697		}
698		sldns_buffer_skip(rep.c->buffer, rcv);
699		sldns_buffer_flip(rep.c->buffer);
700		rep.srctype = 0;
701		fptr_ok(fptr_whitelist_comm_point(rep.c->callback));
702		if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) {
703			/* send back immediate reply */
704			(void)comm_point_send_udp_msg(rep.c, rep.c->buffer,
705				(struct sockaddr*)&rep.addr, rep.addrlen);
706		}
707		if(rep.c->fd != fd) /* commpoint closed to -1 or reused for
708		another UDP port. Note rep.c cannot be reused with TCP fd. */
709			break;
710	}
711}
712
713/** Use a new tcp handler for new query fd, set to read query */
714static void
715setup_tcp_handler(struct comm_point* c, int fd, int cur, int max)
716{
717	log_assert(c->type == comm_tcp);
718	log_assert(c->fd == -1);
719	sldns_buffer_clear(c->buffer);
720	c->tcp_is_reading = 1;
721	c->tcp_byte_count = 0;
722	c->tcp_timeout_msec = TCP_QUERY_TIMEOUT;
723	/* if more than half the tcp handlers are in use, use a shorter
724	 * timeout for this TCP connection, we need to make space for
725	 * other connections to be able to get attention */
726	if(cur > max/2)
727		c->tcp_timeout_msec = TCP_QUERY_TIMEOUT_FAST;
728	comm_point_start_listening(c, fd, c->tcp_timeout_msec);
729}
730
731void comm_base_handle_slow_accept(int ATTR_UNUSED(fd),
732	short ATTR_UNUSED(event), void* arg)
733{
734	struct comm_base* b = (struct comm_base*)arg;
735	/* timeout for the slow accept, re-enable accepts again */
736	if(b->start_accept) {
737		verbose(VERB_ALGO, "wait is over, slow accept disabled");
738		fptr_ok(fptr_whitelist_start_accept(b->start_accept));
739		(*b->start_accept)(b->cb_arg);
740		b->eb->slow_accept_enabled = 0;
741	}
742}
743
744int comm_point_perform_accept(struct comm_point* c,
745	struct sockaddr_storage* addr, socklen_t* addrlen)
746{
747	int new_fd;
748	*addrlen = (socklen_t)sizeof(*addr);
749	new_fd = accept(c->fd, (struct sockaddr*)addr, addrlen);
750	if(new_fd == -1) {
751#ifndef USE_WINSOCK
752		/* EINTR is signal interrupt. others are closed connection. */
753		if(	errno == EINTR || errno == EAGAIN
754#ifdef EWOULDBLOCK
755			|| errno == EWOULDBLOCK
756#endif
757#ifdef ECONNABORTED
758			|| errno == ECONNABORTED
759#endif
760#ifdef EPROTO
761			|| errno == EPROTO
762#endif /* EPROTO */
763			)
764			return -1;
765#if defined(ENFILE) && defined(EMFILE)
766		if(errno == ENFILE || errno == EMFILE) {
767			/* out of file descriptors, likely outside of our
768			 * control. stop accept() calls for some time */
769			if(c->ev->base->stop_accept) {
770				struct comm_base* b = c->ev->base;
771				struct timeval tv;
772				verbose(VERB_ALGO, "out of file descriptors: "
773					"slow accept");
774				b->eb->slow_accept_enabled = 1;
775				fptr_ok(fptr_whitelist_stop_accept(
776					b->stop_accept));
777				(*b->stop_accept)(b->cb_arg);
778				/* set timeout, no mallocs */
779				tv.tv_sec = NETEVENT_SLOW_ACCEPT_TIME/1000;
780				tv.tv_usec = (NETEVENT_SLOW_ACCEPT_TIME%1000)*1000;
781				b->eb->slow_accept = ub_event_new(b->eb->base,
782					-1, UB_EV_TIMEOUT,
783					comm_base_handle_slow_accept, b);
784				if(b->eb->slow_accept == NULL) {
785					/* we do not want to log here, because
786					 * that would spam the logfiles.
787					 * error: "event_base_set failed." */
788				}
789				else if(ub_event_add(b->eb->slow_accept, &tv)
790					!= 0) {
791					/* we do not want to log here,
792					 * error: "event_add failed." */
793				}
794			}
795			return -1;
796		}
797#endif
798		log_err_addr("accept failed", strerror(errno), addr, *addrlen);
799#else /* USE_WINSOCK */
800		if(WSAGetLastError() == WSAEINPROGRESS ||
801			WSAGetLastError() == WSAECONNRESET)
802			return -1;
803		if(WSAGetLastError() == WSAEWOULDBLOCK) {
804			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
805			return -1;
806		}
807		log_err_addr("accept failed", wsa_strerror(WSAGetLastError()),
808			addr, *addrlen);
809#endif
810		return -1;
811	}
812	fd_set_nonblock(new_fd);
813	return new_fd;
814}
815
816#ifdef USE_WINSOCK
817static long win_bio_cb(BIO *b, int oper, const char* ATTR_UNUSED(argp),
818        int ATTR_UNUSED(argi), long argl, long retvalue)
819{
820	verbose(VERB_ALGO, "bio_cb %d, %s %s %s", oper,
821		(oper&BIO_CB_RETURN)?"return":"before",
822		(oper&BIO_CB_READ)?"read":((oper&BIO_CB_WRITE)?"write":"other"),
823		WSAGetLastError()==WSAEWOULDBLOCK?"wsawb":"");
824	/* on windows, check if previous operation caused EWOULDBLOCK */
825	if( (oper == (BIO_CB_READ|BIO_CB_RETURN) && argl == 0) ||
826		(oper == (BIO_CB_GETS|BIO_CB_RETURN) && argl == 0)) {
827		if(WSAGetLastError() == WSAEWOULDBLOCK)
828			ub_winsock_tcp_wouldblock((struct ub_event*)
829				BIO_get_callback_arg(b), UB_EV_READ);
830	}
831	if( (oper == (BIO_CB_WRITE|BIO_CB_RETURN) && argl == 0) ||
832		(oper == (BIO_CB_PUTS|BIO_CB_RETURN) && argl == 0)) {
833		if(WSAGetLastError() == WSAEWOULDBLOCK)
834			ub_winsock_tcp_wouldblock((struct ub_event*)
835				BIO_get_callback_arg(b), UB_EV_WRITE);
836	}
837	/* return original return value */
838	return retvalue;
839}
840
841/** set win bio callbacks for nonblocking operations */
842void
843comm_point_tcp_win_bio_cb(struct comm_point* c, void* thessl)
844{
845	SSL* ssl = (SSL*)thessl;
846	/* set them both just in case, but usually they are the same BIO */
847	BIO_set_callback(SSL_get_rbio(ssl), &win_bio_cb);
848	BIO_set_callback_arg(SSL_get_rbio(ssl), (char*)c->ev->ev);
849	BIO_set_callback(SSL_get_wbio(ssl), &win_bio_cb);
850	BIO_set_callback_arg(SSL_get_wbio(ssl), (char*)c->ev->ev);
851}
852#endif
853
854void
855comm_point_tcp_accept_callback(int fd, short event, void* arg)
856{
857	struct comm_point* c = (struct comm_point*)arg, *c_hdl;
858	int new_fd;
859	log_assert(c->type == comm_tcp_accept);
860	if(!(event & UB_EV_READ)) {
861		log_info("ignoring tcp accept event %d", (int)event);
862		return;
863	}
864	ub_comm_base_now(c->ev->base);
865	/* find free tcp handler. */
866	if(!c->tcp_free) {
867		log_warn("accepted too many tcp, connections full");
868		return;
869	}
870	/* accept incoming connection. */
871	c_hdl = c->tcp_free;
872	log_assert(fd != -1);
873	(void)fd;
874	new_fd = comm_point_perform_accept(c, &c_hdl->repinfo.addr,
875		&c_hdl->repinfo.addrlen);
876	if(new_fd == -1)
877		return;
878	if(c->ssl) {
879		c_hdl->ssl = incoming_ssl_fd(c->ssl, new_fd);
880		if(!c_hdl->ssl) {
881			c_hdl->fd = new_fd;
882			comm_point_close(c_hdl);
883			return;
884		}
885		c_hdl->ssl_shake_state = comm_ssl_shake_read;
886#ifdef USE_WINSOCK
887		comm_point_tcp_win_bio_cb(c_hdl, c_hdl->ssl);
888#endif
889	}
890
891	/* grab the tcp handler buffers */
892	c->cur_tcp_count++;
893	c->tcp_free = c_hdl->tcp_free;
894	if(!c->tcp_free) {
895		/* stop accepting incoming queries for now. */
896		comm_point_stop_listening(c);
897	}
898	setup_tcp_handler(c_hdl, new_fd, c->cur_tcp_count, c->max_tcp_count);
899}
900
901/** Make tcp handler free for next assignment */
902static void
903reclaim_tcp_handler(struct comm_point* c)
904{
905	log_assert(c->type == comm_tcp);
906	if(c->ssl) {
907#ifdef HAVE_SSL
908		SSL_shutdown(c->ssl);
909		SSL_free(c->ssl);
910		c->ssl = NULL;
911#endif
912	}
913	comm_point_close(c);
914	if(c->tcp_parent) {
915		c->tcp_parent->cur_tcp_count--;
916		c->tcp_free = c->tcp_parent->tcp_free;
917		c->tcp_parent->tcp_free = c;
918		if(!c->tcp_free) {
919			/* re-enable listening on accept socket */
920			comm_point_start_listening(c->tcp_parent, -1, -1);
921		}
922	}
923}
924
925/** do the callback when writing is done */
926static void
927tcp_callback_writer(struct comm_point* c)
928{
929	log_assert(c->type == comm_tcp);
930	sldns_buffer_clear(c->buffer);
931	if(c->tcp_do_toggle_rw)
932		c->tcp_is_reading = 1;
933	c->tcp_byte_count = 0;
934	/* switch from listening(write) to listening(read) */
935	comm_point_stop_listening(c);
936	comm_point_start_listening(c, -1, -1);
937}
938
939/** do the callback when reading is done */
940static void
941tcp_callback_reader(struct comm_point* c)
942{
943	log_assert(c->type == comm_tcp || c->type == comm_local);
944	sldns_buffer_flip(c->buffer);
945	if(c->tcp_do_toggle_rw)
946		c->tcp_is_reading = 0;
947	c->tcp_byte_count = 0;
948	if(c->type == comm_tcp)
949		comm_point_stop_listening(c);
950	fptr_ok(fptr_whitelist_comm_point(c->callback));
951	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
952		comm_point_start_listening(c, -1, c->tcp_timeout_msec);
953	}
954}
955
/**
 * Continue the ssl handshake of a comm point.
 * @param c: the comm point with the ssl connection.
 * @return 0 if the connection is closed or the handshake failed, 1
 *	otherwise; the handshake is complete when 1 is returned and the
 *	ssl_shake_state of c is comm_ssl_shake_none.
 */
#ifdef HAVE_SSL
static int
ssl_handshake(struct comm_point* c)
{
	int rc, want;

	switch(c->ssl_shake_state) {
	case comm_ssl_shake_hs_read:
		/* read condition satisfied back to writing */
		comm_point_listen_for_rw(c, 1, 1);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	case comm_ssl_shake_hs_write:
		/* write condition satisfied, back to reading */
		comm_point_listen_for_rw(c, 1, 0);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	default:
		break;
	}

	ERR_clear_error();
	rc = SSL_do_handshake(c->ssl);
	if(rc == 1) {
		/* this is where peer verification could take place */
		log_addr(VERB_ALGO, "SSL DNS connection", &c->repinfo.addr,
			c->repinfo.addrlen);

		/* setup listen rw correctly */
		if(!c->tcp_is_reading)
			comm_point_listen_for_rw(c, 1, 1);
		else if(c->ssl_shake_state != comm_ssl_shake_read)
			comm_point_listen_for_rw(c, 1, 0);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	}

	want = SSL_get_error(c->ssl, rc);
	if(want == SSL_ERROR_WANT_READ) {
		/* wait for read; nothing to change if already doing so */
		if(c->ssl_shake_state != comm_ssl_shake_read) {
			c->ssl_shake_state = comm_ssl_shake_read;
			comm_point_listen_for_rw(c, 1, 0);
		}
		return 1;
	}
	if(want == SSL_ERROR_WANT_WRITE) {
		/* wait for write; nothing to change if already doing so */
		if(c->ssl_shake_state != comm_ssl_shake_write) {
			c->ssl_shake_state = comm_ssl_shake_write;
			comm_point_listen_for_rw(c, 0, 1);
		}
		return 1;
	}
	if(rc == 0)
		return 0; /* closed */
	if(want == SSL_ERROR_SYSCALL) {
		/* SYSCALL and errno==0 means closed uncleanly */
		if(errno != 0)
			log_err("SSL_handshake syscall: %s",
				strerror(errno));
		return 0;
	}
	log_crypto_err("ssl handshake failed");
	log_addr(1, "ssl handshake failed", &c->repinfo.addr,
		c->repinfo.addrlen);
	return 0;
}
#endif /* HAVE_SSL */
1021
/**
 * ssl read callback on TCP.
 *
 * Finishes a pending handshake first, then reads the two byte length prefix
 * and after that the message itself into c->buffer. Once the buffer holds
 * the whole message, tcp_callback_reader() is called.
 *
 * @param c: comm point with an ssl connection that is in reading mode.
 * @return: 0 if the connection must be closed (closed by peer, error, or a
 *	bad length prefix), 1 to keep the connection and wait for more events.
 *	Without HAVE_SSL this always returns 0.
 */
static int
ssl_handle_read(struct comm_point* c)
{
#ifdef HAVE_SSL
	int nread;

	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		/* handshake still in progress, wait for next event */
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}

	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* read length bytes */
		ERR_clear_error();
		nread = SSL_read(c->ssl,
			(void*)sldns_buffer_at(c->buffer, c->tcp_byte_count),
			(int)(sizeof(uint16_t) - c->tcp_byte_count));
		if(nread <= 0) {
			switch(SSL_get_error(c->ssl, nread)) {
			case SSL_ERROR_ZERO_RETURN:
				return 0; /* shutdown, closed */
			case SSL_ERROR_WANT_READ:
				return 1; /* read more later */
			case SSL_ERROR_WANT_WRITE:
				/* park until the socket is writable */
				c->ssl_shake_state = comm_ssl_shake_hs_write;
				comm_point_listen_for_rw(c, 0, 1);
				return 1;
			case SSL_ERROR_SYSCALL:
				if(errno != 0)
					log_err("SSL_read syscall: %s",
						strerror(errno));
				return 0;
			default:
				log_crypto_err("could not SSL_read");
				return 0;
			}
		}
		c->tcp_byte_count += nread;
		if(c->tcp_byte_count != sizeof(uint16_t))
			return 1; /* length prefix not complete yet */
		if(sldns_buffer_read_u16_at(c->buffer, 0) >
			sldns_buffer_capacity(c->buffer)) {
			verbose(VERB_QUERY, "ssl: dropped larger than buffer");
			return 0;
		}
		/* the limit marks where the message ends */
		sldns_buffer_set_limit(c->buffer,
			sldns_buffer_read_u16_at(c->buffer, 0));
		if(sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
			verbose(VERB_QUERY, "ssl: dropped bogus too short.");
			return 0;
		}
		verbose(VERB_ALGO, "Reading ssl tcp query of length %d",
			(int)sldns_buffer_limit(c->buffer));
	}

	/* read the message contents */
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	ERR_clear_error();
	nread = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer),
		(int)sldns_buffer_remaining(c->buffer));
	if(nread <= 0) {
		switch(SSL_get_error(c->ssl, nread)) {
		case SSL_ERROR_ZERO_RETURN:
			return 0; /* shutdown, closed */
		case SSL_ERROR_WANT_READ:
			return 1; /* read more later */
		case SSL_ERROR_WANT_WRITE:
			c->ssl_shake_state = comm_ssl_shake_hs_write;
			comm_point_listen_for_rw(c, 0, 1);
			return 1;
		case SSL_ERROR_SYSCALL:
			if(errno != 0)
				log_err("SSL_read syscall: %s",
					strerror(errno));
			return 0;
		default:
			log_crypto_err("could not SSL_read");
			return 0;
		}
	}
	sldns_buffer_skip(c->buffer, (ssize_t)nread);
	if(sldns_buffer_remaining(c->buffer) <= 0) {
		/* complete message received */
		tcp_callback_reader(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}
1108
/**
 * ssl write callback on TCP.
 *
 * Finishes a pending handshake first, then writes the two byte length prefix
 * followed by the contents of c->buffer. Once everything is written,
 * tcp_callback_writer() is called.
 *
 * When SSL_write needs the socket to become readable, the comm point is
 * parked in comm_ssl_shake_hs_read. ssl_handshake() treats that state as
 * "read condition satisfied, back to writing" and restores the event
 * direction without running the handshake again. This mirrors the
 * comm_ssl_shake_hs_write state that ssl_handle_read() uses.
 *
 * @param c: comm point with an ssl connection that is in writing mode.
 * @return: 0 if the connection must be closed, 1 to keep the connection and
 *	wait for more events. Without HAVE_SSL this always returns 0.
 */
static int
ssl_handle_write(struct comm_point* c)
{
#ifdef HAVE_SSL
	int r;
	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		/* handshake still in progress, wait for next event */
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
	/* ignore return, if fails we may simply block */
	(void)SSL_set_mode(c->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE);
	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* write (the rest of) the length prefix */
		uint16_t len = htons(sldns_buffer_limit(c->buffer));
		ERR_clear_error();
		r = SSL_write(c->ssl,
			(void*)(((uint8_t*)&len)+c->tcp_byte_count),
			(int)(sizeof(uint16_t)-c->tcp_byte_count));
		if(r <= 0) {
			int want = SSL_get_error(c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				return 0; /* closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* ssl_write wants to read: not a handshake */
				c->ssl_shake_state = comm_ssl_shake_hs_read;
				comm_point_listen_for_rw(c, 1, 0);
				return 1; /* wait for read condition */
			} else if(want == SSL_ERROR_WANT_WRITE) {
				return 1; /* write more later */
			} else if(want == SSL_ERROR_SYSCALL) {
				if(errno != 0)
					log_err("SSL_write syscall: %s",
						strerror(errno));
				return 0;
			}
			log_crypto_err("could not SSL_write");
			return 0;
		}
		c->tcp_byte_count += r;
		if(c->tcp_byte_count < sizeof(uint16_t))
			return 1; /* length prefix not fully written yet */
		sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
			sizeof(uint16_t));
		if(sldns_buffer_remaining(c->buffer) == 0) {
			tcp_callback_writer(c);
			return 1;
		}
	}
	/* write the message contents */
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	ERR_clear_error();
	r = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer),
		(int)sldns_buffer_remaining(c->buffer));
	if(r <= 0) {
		int want = SSL_get_error(c->ssl, r);
		if(want == SSL_ERROR_ZERO_RETURN) {
			return 0; /* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* ssl_write wants to read: not a handshake */
			c->ssl_shake_state = comm_ssl_shake_hs_read;
			comm_point_listen_for_rw(c, 1, 0);
			return 1; /* wait for read condition */
		} else if(want == SSL_ERROR_WANT_WRITE) {
			return 1; /* write more later */
		} else if(want == SSL_ERROR_SYSCALL) {
			if(errno != 0)
				log_err("SSL_write syscall: %s",
					strerror(errno));
			return 0;
		}
		log_crypto_err("could not SSL_write");
		return 0;
	}
	sldns_buffer_skip(c->buffer, (ssize_t)r);

	if(sldns_buffer_remaining(c->buffer) == 0) {
		/* complete message written */
		tcp_callback_writer(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}
1192
1193/** handle ssl tcp connection with dns contents */
1194static int
1195ssl_handle_it(struct comm_point* c)
1196{
1197	if(c->tcp_is_reading)
1198		return ssl_handle_read(c);
1199	return ssl_handle_write(c);
1200}
1201
1202/** Handle tcp reading callback.
1203 * @param fd: file descriptor of socket.
1204 * @param c: comm point to read from into buffer.
1205 * @param short_ok: if true, very short packets are OK (for comm_local).
1206 * @return: 0 on error
1207 */
1208static int
1209comm_point_tcp_handle_read(int fd, struct comm_point* c, int short_ok)
1210{
1211	ssize_t r;
1212	log_assert(c->type == comm_tcp || c->type == comm_local);
1213	if(c->ssl)
1214		return ssl_handle_it(c);
1215	if(!c->tcp_is_reading)
1216		return 0;
1217
1218	log_assert(fd != -1);
1219	if(c->tcp_byte_count < sizeof(uint16_t)) {
1220		/* read length bytes */
1221		r = recv(fd,(void*)sldns_buffer_at(c->buffer,c->tcp_byte_count),
1222			sizeof(uint16_t)-c->tcp_byte_count, 0);
1223		if(r == 0)
1224			return 0;
1225		else if(r == -1) {
1226#ifndef USE_WINSOCK
1227			if(errno == EINTR || errno == EAGAIN)
1228				return 1;
1229#ifdef ECONNRESET
1230			if(errno == ECONNRESET && verbosity < 2)
1231				return 0; /* silence reset by peer */
1232#endif
1233			log_err_addr("read (in tcp s)", strerror(errno),
1234				&c->repinfo.addr, c->repinfo.addrlen);
1235#else /* USE_WINSOCK */
1236			if(WSAGetLastError() == WSAECONNRESET)
1237				return 0;
1238			if(WSAGetLastError() == WSAEINPROGRESS)
1239				return 1;
1240			if(WSAGetLastError() == WSAEWOULDBLOCK) {
1241				ub_winsock_tcp_wouldblock(c->ev->ev,
1242					UB_EV_READ);
1243				return 1;
1244			}
1245			log_err_addr("read (in tcp s)",
1246				wsa_strerror(WSAGetLastError()),
1247				&c->repinfo.addr, c->repinfo.addrlen);
1248#endif
1249			return 0;
1250		}
1251		c->tcp_byte_count += r;
1252		if(c->tcp_byte_count != sizeof(uint16_t))
1253			return 1;
1254		if(sldns_buffer_read_u16_at(c->buffer, 0) >
1255			sldns_buffer_capacity(c->buffer)) {
1256			verbose(VERB_QUERY, "tcp: dropped larger than buffer");
1257			return 0;
1258		}
1259		sldns_buffer_set_limit(c->buffer,
1260			sldns_buffer_read_u16_at(c->buffer, 0));
1261		if(!short_ok &&
1262			sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
1263			verbose(VERB_QUERY, "tcp: dropped bogus too short.");
1264			return 0;
1265		}
1266		verbose(VERB_ALGO, "Reading tcp query of length %d",
1267			(int)sldns_buffer_limit(c->buffer));
1268	}
1269
1270	log_assert(sldns_buffer_remaining(c->buffer) > 0);
1271	r = recv(fd, (void*)sldns_buffer_current(c->buffer),
1272		sldns_buffer_remaining(c->buffer), 0);
1273	if(r == 0) {
1274		return 0;
1275	} else if(r == -1) {
1276#ifndef USE_WINSOCK
1277		if(errno == EINTR || errno == EAGAIN)
1278			return 1;
1279		log_err_addr("read (in tcp r)", strerror(errno),
1280			&c->repinfo.addr, c->repinfo.addrlen);
1281#else /* USE_WINSOCK */
1282		if(WSAGetLastError() == WSAECONNRESET)
1283			return 0;
1284		if(WSAGetLastError() == WSAEINPROGRESS)
1285			return 1;
1286		if(WSAGetLastError() == WSAEWOULDBLOCK) {
1287			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
1288			return 1;
1289		}
1290		log_err_addr("read (in tcp r)",
1291			wsa_strerror(WSAGetLastError()),
1292			&c->repinfo.addr, c->repinfo.addrlen);
1293#endif
1294		return 0;
1295	}
1296	sldns_buffer_skip(c->buffer, r);
1297	if(sldns_buffer_remaining(c->buffer) <= 0) {
1298		tcp_callback_reader(c);
1299	}
1300	return 1;
1301}
1302
1303/**
1304 * Handle tcp writing callback.
1305 * @param fd: file descriptor of socket.
1306 * @param c: comm point to write buffer out of.
1307 * @return: 0 on error
1308 */
1309static int
1310comm_point_tcp_handle_write(int fd, struct comm_point* c)
1311{
1312	ssize_t r;
1313	log_assert(c->type == comm_tcp);
1314	if(c->tcp_is_reading && !c->ssl)
1315		return 0;
1316	log_assert(fd != -1);
1317	if(c->tcp_byte_count == 0 && c->tcp_check_nb_connect) {
1318		/* check for pending error from nonblocking connect */
1319		/* from Stevens, unix network programming, vol1, 3rd ed, p450*/
1320		int error = 0;
1321		socklen_t len = (socklen_t)sizeof(error);
1322		if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error,
1323			&len) < 0){
1324#ifndef USE_WINSOCK
1325			error = errno; /* on solaris errno is error */
1326#else /* USE_WINSOCK */
1327			error = WSAGetLastError();
1328#endif
1329		}
1330#ifndef USE_WINSOCK
1331#if defined(EINPROGRESS) && defined(EWOULDBLOCK)
1332		if(error == EINPROGRESS || error == EWOULDBLOCK)
1333			return 1; /* try again later */
1334		else
1335#endif
1336		if(error != 0 && verbosity < 2)
1337			return 0; /* silence lots of chatter in the logs */
1338                else if(error != 0) {
1339			log_err_addr("tcp connect", strerror(error),
1340				&c->repinfo.addr, c->repinfo.addrlen);
1341#else /* USE_WINSOCK */
1342		/* examine error */
1343		if(error == WSAEINPROGRESS)
1344			return 1;
1345		else if(error == WSAEWOULDBLOCK) {
1346			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
1347			return 1;
1348		} else if(error != 0 && verbosity < 2)
1349			return 0;
1350		else if(error != 0) {
1351			log_err_addr("tcp connect", wsa_strerror(error),
1352				&c->repinfo.addr, c->repinfo.addrlen);
1353#endif /* USE_WINSOCK */
1354			return 0;
1355		}
1356	}
1357	if(c->ssl)
1358		return ssl_handle_it(c);
1359
1360#ifdef USE_MSG_FASTOPEN
1361	/* Only try this on first use of a connection that uses tfo,
1362	   otherwise fall through to normal write */
1363	/* Also, TFO support on WINDOWS not implemented at the moment */
1364	if(c->tcp_do_fastopen == 1) {
1365		/* this form of sendmsg() does both a connect() and send() so need to
1366		   look for various flavours of error*/
1367		uint16_t len = htons(sldns_buffer_limit(c->buffer));
1368		struct msghdr msg;
1369		struct iovec iov[2];
1370		c->tcp_do_fastopen = 0;
1371		memset(&msg, 0, sizeof(msg));
1372		iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count;
1373		iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count;
1374		iov[1].iov_base = sldns_buffer_begin(c->buffer);
1375		iov[1].iov_len = sldns_buffer_limit(c->buffer);
1376		log_assert(iov[0].iov_len > 0);
1377		log_assert(iov[1].iov_len > 0);
1378		msg.msg_name = &c->repinfo.addr;
1379		msg.msg_namelen = c->repinfo.addrlen;
1380		msg.msg_iov = iov;
1381		msg.msg_iovlen = 2;
1382		r = sendmsg(fd, &msg, MSG_FASTOPEN);
1383		if (r == -1) {
1384#if defined(EINPROGRESS) && defined(EWOULDBLOCK)
1385			/* Handshake is underway, maybe because no TFO cookie available.
1386			   Come back to write the messsage*/
1387			if(errno == EINPROGRESS || errno == EWOULDBLOCK)
1388				return 1;
1389#endif
1390			if(errno == EINTR || errno == EAGAIN)
1391				return 1;
1392			/* Not handling EISCONN here as shouldn't ever hit that case.*/
1393			if(errno != 0 && verbosity < 2)
1394				return 0; /* silence lots of chatter in the logs */
1395			else if(errno != 0)
1396				log_err_addr("tcp sendmsg", strerror(errno),
1397					&c->repinfo.addr, c->repinfo.addrlen);
1398			return 0;
1399		} else {
1400			c->tcp_byte_count += r;
1401			if(c->tcp_byte_count < sizeof(uint16_t))
1402				return 1;
1403			sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
1404				sizeof(uint16_t));
1405			if(sldns_buffer_remaining(c->buffer) == 0) {
1406				tcp_callback_writer(c);
1407				return 1;
1408			}
1409		}
1410	}
1411#endif /* USE_MSG_FASTOPEN */
1412
1413	if(c->tcp_byte_count < sizeof(uint16_t)) {
1414		uint16_t len = htons(sldns_buffer_limit(c->buffer));
1415#ifdef HAVE_WRITEV
1416		struct iovec iov[2];
1417		iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count;
1418		iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count;
1419		iov[1].iov_base = sldns_buffer_begin(c->buffer);
1420		iov[1].iov_len = sldns_buffer_limit(c->buffer);
1421		log_assert(iov[0].iov_len > 0);
1422		log_assert(iov[1].iov_len > 0);
1423		r = writev(fd, iov, 2);
1424#else /* HAVE_WRITEV */
1425		r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_byte_count),
1426			sizeof(uint16_t)-c->tcp_byte_count, 0);
1427#endif /* HAVE_WRITEV */
1428		if(r == -1) {
1429#ifndef USE_WINSOCK
1430#  ifdef EPIPE
1431                	if(errno == EPIPE && verbosity < 2)
1432                        	return 0; /* silence 'broken pipe' */
1433  #endif
1434			if(errno == EINTR || errno == EAGAIN)
1435				return 1;
1436#  ifdef HAVE_WRITEV
1437			log_err_addr("tcp writev", strerror(errno),
1438				&c->repinfo.addr, c->repinfo.addrlen);
1439#  else /* HAVE_WRITEV */
1440			log_err_addr("tcp send s", strerror(errno),
1441				&c->repinfo.addr, c->repinfo.addrlen);
1442#  endif /* HAVE_WRITEV */
1443#else
1444			if(WSAGetLastError() == WSAENOTCONN)
1445				return 1;
1446			if(WSAGetLastError() == WSAEINPROGRESS)
1447				return 1;
1448			if(WSAGetLastError() == WSAEWOULDBLOCK) {
1449				ub_winsock_tcp_wouldblock(c->ev->ev,
1450					UB_EV_WRITE);
1451				return 1;
1452			}
1453			log_err_addr("tcp send s",
1454				wsa_strerror(WSAGetLastError()),
1455				&c->repinfo.addr, c->repinfo.addrlen);
1456#endif
1457			return 0;
1458		}
1459		c->tcp_byte_count += r;
1460		if(c->tcp_byte_count < sizeof(uint16_t))
1461			return 1;
1462		sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
1463			sizeof(uint16_t));
1464		if(sldns_buffer_remaining(c->buffer) == 0) {
1465			tcp_callback_writer(c);
1466			return 1;
1467		}
1468	}
1469	log_assert(sldns_buffer_remaining(c->buffer) > 0);
1470	r = send(fd, (void*)sldns_buffer_current(c->buffer),
1471		sldns_buffer_remaining(c->buffer), 0);
1472	if(r == -1) {
1473#ifndef USE_WINSOCK
1474		if(errno == EINTR || errno == EAGAIN)
1475			return 1;
1476		log_err_addr("tcp send r", strerror(errno),
1477			&c->repinfo.addr, c->repinfo.addrlen);
1478#else
1479		if(WSAGetLastError() == WSAEINPROGRESS)
1480			return 1;
1481		if(WSAGetLastError() == WSAEWOULDBLOCK) {
1482			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
1483			return 1;
1484		}
1485		log_err_addr("tcp send r", wsa_strerror(WSAGetLastError()),
1486			&c->repinfo.addr, c->repinfo.addrlen);
1487#endif
1488		return 0;
1489	}
1490	sldns_buffer_skip(c->buffer, r);
1491
1492	if(sldns_buffer_remaining(c->buffer) == 0) {
1493		tcp_callback_writer(c);
1494	}
1495
1496	return 1;
1497}
1498
1499void
1500comm_point_tcp_handle_callback(int fd, short event, void* arg)
1501{
1502	struct comm_point* c = (struct comm_point*)arg;
1503	log_assert(c->type == comm_tcp);
1504	ub_comm_base_now(c->ev->base);
1505
1506	if(event&UB_EV_READ) {
1507		if(!comm_point_tcp_handle_read(fd, c, 0)) {
1508			reclaim_tcp_handler(c);
1509			if(!c->tcp_do_close) {
1510				fptr_ok(fptr_whitelist_comm_point(
1511					c->callback));
1512				(void)(*c->callback)(c, c->cb_arg,
1513					NETEVENT_CLOSED, NULL);
1514			}
1515		}
1516		return;
1517	}
1518	if(event&UB_EV_WRITE) {
1519		if(!comm_point_tcp_handle_write(fd, c)) {
1520			reclaim_tcp_handler(c);
1521			if(!c->tcp_do_close) {
1522				fptr_ok(fptr_whitelist_comm_point(
1523					c->callback));
1524				(void)(*c->callback)(c, c->cb_arg,
1525					NETEVENT_CLOSED, NULL);
1526			}
1527		}
1528		return;
1529	}
1530	if(event&UB_EV_TIMEOUT) {
1531		verbose(VERB_QUERY, "tcp took too long, dropped");
1532		reclaim_tcp_handler(c);
1533		if(!c->tcp_do_close) {
1534			fptr_ok(fptr_whitelist_comm_point(c->callback));
1535			(void)(*c->callback)(c, c->cb_arg,
1536				NETEVENT_TIMEOUT, NULL);
1537		}
1538		return;
1539	}
1540	log_err("Ignored event %d for tcphdl.", event);
1541}
1542
1543void comm_point_local_handle_callback(int fd, short event, void* arg)
1544{
1545	struct comm_point* c = (struct comm_point*)arg;
1546	log_assert(c->type == comm_local);
1547	ub_comm_base_now(c->ev->base);
1548
1549	if(event&UB_EV_READ) {
1550		if(!comm_point_tcp_handle_read(fd, c, 1)) {
1551			fptr_ok(fptr_whitelist_comm_point(c->callback));
1552			(void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED,
1553				NULL);
1554		}
1555		return;
1556	}
1557	log_err("Ignored event %d for localhdl.", event);
1558}
1559
1560void comm_point_raw_handle_callback(int ATTR_UNUSED(fd),
1561	short event, void* arg)
1562{
1563	struct comm_point* c = (struct comm_point*)arg;
1564	int err = NETEVENT_NOERROR;
1565	log_assert(c->type == comm_raw);
1566	ub_comm_base_now(c->ev->base);
1567
1568	if(event&UB_EV_TIMEOUT)
1569		err = NETEVENT_TIMEOUT;
1570	fptr_ok(fptr_whitelist_comm_point_raw(c->callback));
1571	(void)(*c->callback)(c, c->cb_arg, err, NULL);
1572}
1573
1574struct comm_point*
1575comm_point_create_udp(struct comm_base *base, int fd, sldns_buffer* buffer,
1576	comm_point_callback_t* callback, void* callback_arg)
1577{
1578	struct comm_point* c = (struct comm_point*)calloc(1,
1579		sizeof(struct comm_point));
1580	short evbits;
1581	if(!c)
1582		return NULL;
1583	c->ev = (struct internal_event*)calloc(1,
1584		sizeof(struct internal_event));
1585	if(!c->ev) {
1586		free(c);
1587		return NULL;
1588	}
1589	c->ev->base = base;
1590	c->fd = fd;
1591	c->buffer = buffer;
1592	c->timeout = NULL;
1593	c->tcp_is_reading = 0;
1594	c->tcp_byte_count = 0;
1595	c->tcp_parent = NULL;
1596	c->max_tcp_count = 0;
1597	c->cur_tcp_count = 0;
1598	c->tcp_handlers = NULL;
1599	c->tcp_free = NULL;
1600	c->type = comm_udp;
1601	c->tcp_do_close = 0;
1602	c->do_not_close = 0;
1603	c->tcp_do_toggle_rw = 0;
1604	c->tcp_check_nb_connect = 0;
1605#ifdef USE_MSG_FASTOPEN
1606	c->tcp_do_fastopen = 0;
1607#endif
1608	c->inuse = 0;
1609	c->callback = callback;
1610	c->cb_arg = callback_arg;
1611	evbits = UB_EV_READ | UB_EV_PERSIST;
1612	/* ub_event stuff */
1613	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
1614		comm_point_udp_callback, c);
1615	if(c->ev->ev == NULL) {
1616		log_err("could not baseset udp event");
1617		comm_point_delete(c);
1618		return NULL;
1619	}
1620	if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) {
1621		log_err("could not add udp event");
1622		comm_point_delete(c);
1623		return NULL;
1624	}
1625	return c;
1626}
1627
1628struct comm_point*
1629comm_point_create_udp_ancil(struct comm_base *base, int fd,
1630	sldns_buffer* buffer,
1631	comm_point_callback_t* callback, void* callback_arg)
1632{
1633	struct comm_point* c = (struct comm_point*)calloc(1,
1634		sizeof(struct comm_point));
1635	short evbits;
1636	if(!c)
1637		return NULL;
1638	c->ev = (struct internal_event*)calloc(1,
1639		sizeof(struct internal_event));
1640	if(!c->ev) {
1641		free(c);
1642		return NULL;
1643	}
1644	c->ev->base = base;
1645	c->fd = fd;
1646	c->buffer = buffer;
1647	c->timeout = NULL;
1648	c->tcp_is_reading = 0;
1649	c->tcp_byte_count = 0;
1650	c->tcp_parent = NULL;
1651	c->max_tcp_count = 0;
1652	c->cur_tcp_count = 0;
1653	c->tcp_handlers = NULL;
1654	c->tcp_free = NULL;
1655	c->type = comm_udp;
1656	c->tcp_do_close = 0;
1657	c->do_not_close = 0;
1658	c->inuse = 0;
1659	c->tcp_do_toggle_rw = 0;
1660	c->tcp_check_nb_connect = 0;
1661#ifdef USE_MSG_FASTOPEN
1662	c->tcp_do_fastopen = 0;
1663#endif
1664	c->callback = callback;
1665	c->cb_arg = callback_arg;
1666	evbits = UB_EV_READ | UB_EV_PERSIST;
1667	/* ub_event stuff */
1668	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
1669		comm_point_udp_ancil_callback, c);
1670	if(c->ev->ev == NULL) {
1671		log_err("could not baseset udp event");
1672		comm_point_delete(c);
1673		return NULL;
1674	}
1675	if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) {
1676		log_err("could not add udp event");
1677		comm_point_delete(c);
1678		return NULL;
1679	}
1680	return c;
1681}
1682
1683static struct comm_point*
1684comm_point_create_tcp_handler(struct comm_base *base,
1685	struct comm_point* parent, size_t bufsize,
1686        comm_point_callback_t* callback, void* callback_arg)
1687{
1688	struct comm_point* c = (struct comm_point*)calloc(1,
1689		sizeof(struct comm_point));
1690	short evbits;
1691	if(!c)
1692		return NULL;
1693	c->ev = (struct internal_event*)calloc(1,
1694		sizeof(struct internal_event));
1695	if(!c->ev) {
1696		free(c);
1697		return NULL;
1698	}
1699	c->ev->base = base;
1700	c->fd = -1;
1701	c->buffer = sldns_buffer_new(bufsize);
1702	if(!c->buffer) {
1703		free(c->ev);
1704		free(c);
1705		return NULL;
1706	}
1707	c->timeout = (struct timeval*)malloc(sizeof(struct timeval));
1708	if(!c->timeout) {
1709		sldns_buffer_free(c->buffer);
1710		free(c->ev);
1711		free(c);
1712		return NULL;
1713	}
1714	c->tcp_is_reading = 0;
1715	c->tcp_byte_count = 0;
1716	c->tcp_parent = parent;
1717	c->max_tcp_count = 0;
1718	c->cur_tcp_count = 0;
1719	c->tcp_handlers = NULL;
1720	c->tcp_free = NULL;
1721	c->type = comm_tcp;
1722	c->tcp_do_close = 0;
1723	c->do_not_close = 0;
1724	c->tcp_do_toggle_rw = 1;
1725	c->tcp_check_nb_connect = 0;
1726#ifdef USE_MSG_FASTOPEN
1727	c->tcp_do_fastopen = 0;
1728#endif
1729	c->repinfo.c = c;
1730	c->callback = callback;
1731	c->cb_arg = callback_arg;
1732	/* add to parent free list */
1733	c->tcp_free = parent->tcp_free;
1734	parent->tcp_free = c;
1735	/* ub_event stuff */
1736	evbits = UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT;
1737	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
1738		comm_point_tcp_handle_callback, c);
1739	if(c->ev->ev == NULL)
1740	{
1741		log_err("could not basetset tcphdl event");
1742		parent->tcp_free = c->tcp_free;
1743		free(c->ev);
1744		free(c);
1745		return NULL;
1746	}
1747	return c;
1748}
1749
1750struct comm_point*
1751comm_point_create_tcp(struct comm_base *base, int fd, int num, size_t bufsize,
1752        comm_point_callback_t* callback, void* callback_arg)
1753{
1754	struct comm_point* c = (struct comm_point*)calloc(1,
1755		sizeof(struct comm_point));
1756	short evbits;
1757	int i;
1758	/* first allocate the TCP accept listener */
1759	if(!c)
1760		return NULL;
1761	c->ev = (struct internal_event*)calloc(1,
1762		sizeof(struct internal_event));
1763	if(!c->ev) {
1764		free(c);
1765		return NULL;
1766	}
1767	c->ev->base = base;
1768	c->fd = fd;
1769	c->buffer = NULL;
1770	c->timeout = NULL;
1771	c->tcp_is_reading = 0;
1772	c->tcp_byte_count = 0;
1773	c->tcp_parent = NULL;
1774	c->max_tcp_count = num;
1775	c->cur_tcp_count = 0;
1776	c->tcp_handlers = (struct comm_point**)calloc((size_t)num,
1777		sizeof(struct comm_point*));
1778	if(!c->tcp_handlers) {
1779		free(c->ev);
1780		free(c);
1781		return NULL;
1782	}
1783	c->tcp_free = NULL;
1784	c->type = comm_tcp_accept;
1785	c->tcp_do_close = 0;
1786	c->do_not_close = 0;
1787	c->tcp_do_toggle_rw = 0;
1788	c->tcp_check_nb_connect = 0;
1789#ifdef USE_MSG_FASTOPEN
1790	c->tcp_do_fastopen = 0;
1791#endif
1792	c->callback = NULL;
1793	c->cb_arg = NULL;
1794	evbits = UB_EV_READ | UB_EV_PERSIST;
1795	/* ub_event stuff */
1796	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
1797		comm_point_tcp_accept_callback, c);
1798	if(c->ev->ev == NULL) {
1799		log_err("could not baseset tcpacc event");
1800		comm_point_delete(c);
1801		return NULL;
1802	}
1803	if (ub_event_add(c->ev->ev, c->timeout) != 0) {
1804		log_err("could not add tcpacc event");
1805		comm_point_delete(c);
1806		return NULL;
1807	}
1808	/* now prealloc the tcp handlers */
1809	for(i=0; i<num; i++) {
1810		c->tcp_handlers[i] = comm_point_create_tcp_handler(base,
1811			c, bufsize, callback, callback_arg);
1812		if(!c->tcp_handlers[i]) {
1813			comm_point_delete(c);
1814			return NULL;
1815		}
1816	}
1817
1818	return c;
1819}
1820
1821struct comm_point*
1822comm_point_create_tcp_out(struct comm_base *base, size_t bufsize,
1823        comm_point_callback_t* callback, void* callback_arg)
1824{
1825	struct comm_point* c = (struct comm_point*)calloc(1,
1826		sizeof(struct comm_point));
1827	short evbits;
1828	if(!c)
1829		return NULL;
1830	c->ev = (struct internal_event*)calloc(1,
1831		sizeof(struct internal_event));
1832	if(!c->ev) {
1833		free(c);
1834		return NULL;
1835	}
1836	c->ev->base = base;
1837	c->fd = -1;
1838	c->buffer = sldns_buffer_new(bufsize);
1839	if(!c->buffer) {
1840		free(c->ev);
1841		free(c);
1842		return NULL;
1843	}
1844	c->timeout = NULL;
1845	c->tcp_is_reading = 0;
1846	c->tcp_byte_count = 0;
1847	c->tcp_parent = NULL;
1848	c->max_tcp_count = 0;
1849	c->cur_tcp_count = 0;
1850	c->tcp_handlers = NULL;
1851	c->tcp_free = NULL;
1852	c->type = comm_tcp;
1853	c->tcp_do_close = 0;
1854	c->do_not_close = 0;
1855	c->tcp_do_toggle_rw = 1;
1856	c->tcp_check_nb_connect = 1;
1857#ifdef USE_MSG_FASTOPEN
1858	c->tcp_do_fastopen = 1;
1859#endif
1860	c->repinfo.c = c;
1861	c->callback = callback;
1862	c->cb_arg = callback_arg;
1863	evbits = UB_EV_PERSIST | UB_EV_WRITE;
1864	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
1865		comm_point_tcp_handle_callback, c);
1866	if(c->ev->ev == NULL)
1867	{
1868		log_err("could not baseset tcpout event");
1869		sldns_buffer_free(c->buffer);
1870		free(c->ev);
1871		free(c);
1872		return NULL;
1873	}
1874
1875	return c;
1876}
1877
1878struct comm_point*
1879comm_point_create_local(struct comm_base *base, int fd, size_t bufsize,
1880        comm_point_callback_t* callback, void* callback_arg)
1881{
1882	struct comm_point* c = (struct comm_point*)calloc(1,
1883		sizeof(struct comm_point));
1884	short evbits;
1885	if(!c)
1886		return NULL;
1887	c->ev = (struct internal_event*)calloc(1,
1888		sizeof(struct internal_event));
1889	if(!c->ev) {
1890		free(c);
1891		return NULL;
1892	}
1893	c->ev->base = base;
1894	c->fd = fd;
1895	c->buffer = sldns_buffer_new(bufsize);
1896	if(!c->buffer) {
1897		free(c->ev);
1898		free(c);
1899		return NULL;
1900	}
1901	c->timeout = NULL;
1902	c->tcp_is_reading = 1;
1903	c->tcp_byte_count = 0;
1904	c->tcp_parent = NULL;
1905	c->max_tcp_count = 0;
1906	c->cur_tcp_count = 0;
1907	c->tcp_handlers = NULL;
1908	c->tcp_free = NULL;
1909	c->type = comm_local;
1910	c->tcp_do_close = 0;
1911	c->do_not_close = 1;
1912	c->tcp_do_toggle_rw = 0;
1913	c->tcp_check_nb_connect = 0;
1914#ifdef USE_MSG_FASTOPEN
1915	c->tcp_do_fastopen = 0;
1916#endif
1917	c->callback = callback;
1918	c->cb_arg = callback_arg;
1919	/* ub_event stuff */
1920	evbits = UB_EV_PERSIST | UB_EV_READ;
1921	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
1922		comm_point_local_handle_callback, c);
1923	if(c->ev->ev == NULL) {
1924		log_err("could not baseset localhdl event");
1925		free(c->ev);
1926		free(c);
1927		return NULL;
1928	}
1929	if (ub_event_add(c->ev->ev, c->timeout) != 0) {
1930		log_err("could not add localhdl event");
1931		ub_event_free(c->ev->ev);
1932		free(c->ev);
1933		free(c);
1934		return NULL;
1935	}
1936	return c;
1937}
1938
1939struct comm_point*
1940comm_point_create_raw(struct comm_base* base, int fd, int writing,
1941	comm_point_callback_t* callback, void* callback_arg)
1942{
1943	struct comm_point* c = (struct comm_point*)calloc(1,
1944		sizeof(struct comm_point));
1945	short evbits;
1946	if(!c)
1947		return NULL;
1948	c->ev = (struct internal_event*)calloc(1,
1949		sizeof(struct internal_event));
1950	if(!c->ev) {
1951		free(c);
1952		return NULL;
1953	}
1954	c->ev->base = base;
1955	c->fd = fd;
1956	c->buffer = NULL;
1957	c->timeout = NULL;
1958	c->tcp_is_reading = 0;
1959	c->tcp_byte_count = 0;
1960	c->tcp_parent = NULL;
1961	c->max_tcp_count = 0;
1962	c->cur_tcp_count = 0;
1963	c->tcp_handlers = NULL;
1964	c->tcp_free = NULL;
1965	c->type = comm_raw;
1966	c->tcp_do_close = 0;
1967	c->do_not_close = 1;
1968	c->tcp_do_toggle_rw = 0;
1969	c->tcp_check_nb_connect = 0;
1970#ifdef USE_MSG_FASTOPEN
1971	c->tcp_do_fastopen = 0;
1972#endif
1973	c->callback = callback;
1974	c->cb_arg = callback_arg;
1975	/* ub_event stuff */
1976	if(writing)
1977		evbits = UB_EV_PERSIST | UB_EV_WRITE;
1978	else 	evbits = UB_EV_PERSIST | UB_EV_READ;
1979	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
1980		comm_point_raw_handle_callback, c);
1981	if(c->ev->ev == NULL) {
1982		log_err("could not baseset rawhdl event");
1983		free(c->ev);
1984		free(c);
1985		return NULL;
1986	}
1987	if (ub_event_add(c->ev->ev, c->timeout) != 0) {
1988		log_err("could not add rawhdl event");
1989		ub_event_free(c->ev->ev);
1990		free(c->ev);
1991		free(c);
1992		return NULL;
1993	}
1994	return c;
1995}
1996
1997void
1998comm_point_close(struct comm_point* c)
1999{
2000	if(!c)
2001		return;
2002	if(c->fd != -1)
2003		if(ub_event_del(c->ev->ev) != 0) {
2004			log_err("could not event_del on close");
2005		}
2006	/* close fd after removing from event lists, or epoll.. is messed up */
2007	if(c->fd != -1 && !c->do_not_close) {
2008		verbose(VERB_ALGO, "close fd %d", c->fd);
2009#ifndef USE_WINSOCK
2010		close(c->fd);
2011#else
2012		closesocket(c->fd);
2013#endif
2014	}
2015	c->fd = -1;
2016}
2017
2018void
2019comm_point_delete(struct comm_point* c)
2020{
2021	if(!c)
2022		return;
2023	if(c->type == comm_tcp && c->ssl) {
2024#ifdef HAVE_SSL
2025		SSL_shutdown(c->ssl);
2026		SSL_free(c->ssl);
2027#endif
2028	}
2029	comm_point_close(c);
2030	if(c->tcp_handlers) {
2031		int i;
2032		for(i=0; i<c->max_tcp_count; i++)
2033			comm_point_delete(c->tcp_handlers[i]);
2034		free(c->tcp_handlers);
2035	}
2036	free(c->timeout);
2037	if(c->type == comm_tcp || c->type == comm_local)
2038		sldns_buffer_free(c->buffer);
2039	ub_event_free(c->ev->ev);
2040	free(c->ev);
2041	free(c);
2042}
2043
2044void
2045comm_point_send_reply(struct comm_reply *repinfo)
2046{
2047	log_assert(repinfo && repinfo->c);
2048	if(repinfo->c->type == comm_udp) {
2049		if(repinfo->srctype)
2050			comm_point_send_udp_msg_if(repinfo->c,
2051			repinfo->c->buffer, (struct sockaddr*)&repinfo->addr,
2052			repinfo->addrlen, repinfo);
2053		else
2054			comm_point_send_udp_msg(repinfo->c, repinfo->c->buffer,
2055			(struct sockaddr*)&repinfo->addr, repinfo->addrlen);
2056#ifdef USE_DNSTAP
2057		if(repinfo->c->dtenv != NULL &&
2058		   repinfo->c->dtenv->log_client_response_messages)
2059			dt_msg_send_client_response(repinfo->c->dtenv,
2060			&repinfo->addr, repinfo->c->type, repinfo->c->buffer);
2061#endif
2062	} else {
2063#ifdef USE_DNSTAP
2064		if(repinfo->c->tcp_parent->dtenv != NULL &&
2065		   repinfo->c->tcp_parent->dtenv->log_client_response_messages)
2066			dt_msg_send_client_response(repinfo->c->tcp_parent->dtenv,
2067			&repinfo->addr, repinfo->c->type, repinfo->c->buffer);
2068#endif
2069		comm_point_start_listening(repinfo->c, -1,
2070			repinfo->c->tcp_timeout_msec);
2071	}
2072}
2073
2074void
2075comm_point_drop_reply(struct comm_reply* repinfo)
2076{
2077	if(!repinfo)
2078		return;
2079	log_assert(repinfo && repinfo->c);
2080	log_assert(repinfo->c->type != comm_tcp_accept);
2081	if(repinfo->c->type == comm_udp)
2082		return;
2083	reclaim_tcp_handler(repinfo->c);
2084}
2085
2086void
2087comm_point_stop_listening(struct comm_point* c)
2088{
2089	verbose(VERB_ALGO, "comm point stop listening %d", c->fd);
2090	if(ub_event_del(c->ev->ev) != 0) {
2091		log_err("event_del error to stoplisten");
2092	}
2093}
2094
2095void
2096comm_point_start_listening(struct comm_point* c, int newfd, int msec)
2097{
2098	verbose(VERB_ALGO, "comm point start listening %d",
2099		c->fd==-1?newfd:c->fd);
2100	if(c->type == comm_tcp_accept && !c->tcp_free) {
2101		/* no use to start listening no free slots. */
2102		return;
2103	}
2104	if(msec != -1 && msec != 0) {
2105		if(!c->timeout) {
2106			c->timeout = (struct timeval*)malloc(sizeof(
2107				struct timeval));
2108			if(!c->timeout) {
2109				log_err("cpsl: malloc failed. No net read.");
2110				return;
2111			}
2112		}
2113		ub_event_add_bits(c->ev->ev, UB_EV_TIMEOUT);
2114#ifndef S_SPLINT_S /* splint fails on struct timeval. */
2115		c->timeout->tv_sec = msec/1000;
2116		c->timeout->tv_usec = (msec%1000)*1000;
2117#endif /* S_SPLINT_S */
2118	}
2119	if(c->type == comm_tcp) {
2120		ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE);
2121		if(c->tcp_is_reading)
2122			ub_event_add_bits(c->ev->ev, UB_EV_READ);
2123		else	ub_event_add_bits(c->ev->ev, UB_EV_WRITE);
2124	}
2125	if(newfd != -1) {
2126		if(c->fd != -1) {
2127#ifndef USE_WINSOCK
2128			close(c->fd);
2129#else
2130			closesocket(c->fd);
2131#endif
2132		}
2133		c->fd = newfd;
2134		ub_event_set_fd(c->ev->ev, c->fd);
2135	}
2136	if(ub_event_add(c->ev->ev, msec==0?NULL:c->timeout) != 0) {
2137		log_err("event_add failed. in cpsl.");
2138	}
2139}
2140
2141void comm_point_listen_for_rw(struct comm_point* c, int rd, int wr)
2142{
2143	verbose(VERB_ALGO, "comm point listen_for_rw %d %d", c->fd, wr);
2144	if(ub_event_del(c->ev->ev) != 0) {
2145		log_err("event_del error to cplf");
2146	}
2147	ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE);
2148	if(rd) ub_event_add_bits(c->ev->ev, UB_EV_READ);
2149	if(wr) ub_event_add_bits(c->ev->ev, UB_EV_WRITE);
2150	if(ub_event_add(c->ev->ev, c->timeout) != 0) {
2151		log_err("event_add failed. in cplf.");
2152	}
2153}
2154
2155size_t comm_point_get_mem(struct comm_point* c)
2156{
2157	size_t s;
2158	if(!c)
2159		return 0;
2160	s = sizeof(*c) + sizeof(*c->ev);
2161	if(c->timeout)
2162		s += sizeof(*c->timeout);
2163	if(c->type == comm_tcp || c->type == comm_local)
2164		s += sizeof(*c->buffer) + sldns_buffer_capacity(c->buffer);
2165	if(c->type == comm_tcp_accept) {
2166		int i;
2167		for(i=0; i<c->max_tcp_count; i++)
2168			s += comm_point_get_mem(c->tcp_handlers[i]);
2169	}
2170	return s;
2171}
2172
2173struct comm_timer*
2174comm_timer_create(struct comm_base* base, void (*cb)(void*), void* cb_arg)
2175{
2176	struct internal_timer *tm = (struct internal_timer*)calloc(1,
2177		sizeof(struct internal_timer));
2178	if(!tm) {
2179		log_err("malloc failed");
2180		return NULL;
2181	}
2182	tm->super.ev_timer = tm;
2183	tm->base = base;
2184	tm->super.callback = cb;
2185	tm->super.cb_arg = cb_arg;
2186	tm->ev = ub_event_new(base->eb->base, -1, UB_EV_TIMEOUT,
2187		comm_timer_callback, &tm->super);
2188	if(tm->ev == NULL) {
2189		log_err("timer_create: event_base_set failed.");
2190		free(tm);
2191		return NULL;
2192	}
2193	return &tm->super;
2194}
2195
2196void
2197comm_timer_disable(struct comm_timer* timer)
2198{
2199	if(!timer)
2200		return;
2201	ub_timer_del(timer->ev_timer->ev);
2202	timer->ev_timer->enabled = 0;
2203}
2204
2205void
2206comm_timer_set(struct comm_timer* timer, struct timeval* tv)
2207{
2208	log_assert(tv);
2209	if(timer->ev_timer->enabled)
2210		comm_timer_disable(timer);
2211	if(ub_timer_add(timer->ev_timer->ev, timer->ev_timer->base->eb->base,
2212		comm_timer_callback, timer, tv) != 0)
2213		log_err("comm_timer_set: evtimer_add failed.");
2214	timer->ev_timer->enabled = 1;
2215}
2216
2217void
2218comm_timer_delete(struct comm_timer* timer)
2219{
2220	if(!timer)
2221		return;
2222	comm_timer_disable(timer);
2223	/* Free the sub struct timer->ev_timer derived from the super struct timer.
2224	 * i.e. assert(timer == timer->ev_timer)
2225	 */
2226	ub_event_free(timer->ev_timer->ev);
2227	free(timer->ev_timer);
2228}
2229
2230void
2231comm_timer_callback(int ATTR_UNUSED(fd), short event, void* arg)
2232{
2233	struct comm_timer* tm = (struct comm_timer*)arg;
2234	if(!(event&UB_EV_TIMEOUT))
2235		return;
2236	ub_comm_base_now(tm->ev_timer->base);
2237	tm->ev_timer->enabled = 0;
2238	fptr_ok(fptr_whitelist_comm_timer(tm->callback));
2239	(*tm->callback)(tm->cb_arg);
2240}
2241
2242int
2243comm_timer_is_set(struct comm_timer* timer)
2244{
2245	return (int)timer->ev_timer->enabled;
2246}
2247
2248size_t
2249comm_timer_get_mem(struct comm_timer* ATTR_UNUSED(timer))
2250{
2251	return sizeof(struct internal_timer);
2252}
2253
2254struct comm_signal*
2255comm_signal_create(struct comm_base* base,
2256        void (*callback)(int, void*), void* cb_arg)
2257{
2258	struct comm_signal* com = (struct comm_signal*)malloc(
2259		sizeof(struct comm_signal));
2260	if(!com) {
2261		log_err("malloc failed");
2262		return NULL;
2263	}
2264	com->base = base;
2265	com->callback = callback;
2266	com->cb_arg = cb_arg;
2267	com->ev_signal = NULL;
2268	return com;
2269}
2270
2271void
2272comm_signal_callback(int sig, short event, void* arg)
2273{
2274	struct comm_signal* comsig = (struct comm_signal*)arg;
2275	if(!(event & UB_EV_SIGNAL))
2276		return;
2277	ub_comm_base_now(comsig->base);
2278	fptr_ok(fptr_whitelist_comm_signal(comsig->callback));
2279	(*comsig->callback)(sig, comsig->cb_arg);
2280}
2281
2282int
2283comm_signal_bind(struct comm_signal* comsig, int sig)
2284{
2285	struct internal_signal* entry = (struct internal_signal*)calloc(1,
2286		sizeof(struct internal_signal));
2287	if(!entry) {
2288		log_err("malloc failed");
2289		return 0;
2290	}
2291	log_assert(comsig);
2292	/* add signal event */
2293	entry->ev = ub_signal_new(comsig->base->eb->base, sig,
2294		comm_signal_callback, comsig);
2295	if(entry->ev == NULL) {
2296		log_err("Could not create signal event");
2297		free(entry);
2298		return 0;
2299	}
2300	if(ub_signal_add(entry->ev, NULL) != 0) {
2301		log_err("Could not add signal handler");
2302		ub_event_free(entry->ev);
2303		free(entry);
2304		return 0;
2305	}
2306	/* link into list */
2307	entry->next = comsig->ev_signal;
2308	comsig->ev_signal = entry;
2309	return 1;
2310}
2311
2312void
2313comm_signal_delete(struct comm_signal* comsig)
2314{
2315	struct internal_signal* p, *np;
2316	if(!comsig)
2317		return;
2318	p=comsig->ev_signal;
2319	while(p) {
2320		np = p->next;
2321		ub_signal_del(p->ev);
2322		ub_event_free(p->ev);
2323		free(p);
2324		p = np;
2325	}
2326	free(comsig);
2327}
2328