1/* $NetBSD: socket.c,v 1.1 2024/02/18 20:57:57 christos Exp $ */ 2 3/* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16/*! \file */ 17 18#include <inttypes.h> 19#include <stdbool.h> 20#include <sys/param.h> 21#include <sys/socket.h> 22#include <sys/stat.h> 23#include <sys/types.h> 24#if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) 25#include <sys/sysctl.h> 26#endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */ 27#include <sys/time.h> 28#include <sys/uio.h> 29 30#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) 31#include <linux/netlink.h> 32#include <linux/rtnetlink.h> 33#endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \ 34 */ 35 36#include <errno.h> 37#include <fcntl.h> 38#include <stddef.h> 39#include <stdlib.h> 40#include <unistd.h> 41 42#include <isc/app.h> 43#include <isc/buffer.h> 44#include <isc/condition.h> 45#include <isc/formatcheck.h> 46#include <isc/list.h> 47#include <isc/log.h> 48#include <isc/mem.h> 49#include <isc/mutex.h> 50#include <isc/net.h> 51#include <isc/once.h> 52#include <isc/platform.h> 53#include <isc/print.h> 54#include <isc/refcount.h> 55#include <isc/region.h> 56#include <isc/resource.h> 57#include <isc/socket.h> 58#include <isc/stats.h> 59#include <isc/strerr.h> 60#include <isc/string.h> 61#include <isc/task.h> 62#include <isc/thread.h> 63#include <isc/util.h> 64 65#ifdef ISC_PLATFORM_HAVESYSUNH 66#include <sys/un.h> 67#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */ 68#ifdef HAVE_KQUEUE 69#include <sys/event.h> 70#endif /* ifdef HAVE_KQUEUE */ 71#ifdef HAVE_EPOLL_CREATE1 72#include 
<sys/epoll.h> 73#endif /* ifdef HAVE_EPOLL_CREATE1 */ 74#if defined(HAVE_SYS_DEVPOLL_H) 75#include <sys/devpoll.h> 76#elif defined(HAVE_DEVPOLL_H) 77#include <devpoll.h> 78#endif /* if defined(HAVE_SYS_DEVPOLL_H) */ 79 80#include <netinet/tcp.h> 81 82#include "errno2result.h" 83 84#ifdef ENABLE_TCP_FASTOPEN 85#include <netinet/tcp.h> 86#endif /* ifdef ENABLE_TCP_FASTOPEN */ 87 88#ifdef HAVE_JSON_C 89#include <json_object.h> 90#endif /* HAVE_JSON_C */ 91 92#ifdef HAVE_LIBXML2 93#include <libxml/xmlwriter.h> 94#define ISC_XMLCHAR (const xmlChar *) 95#endif /* HAVE_LIBXML2 */ 96 97/*% 98 * Choose the most preferable multiplex method. 99 */ 100#if defined(HAVE_KQUEUE) 101#define USE_KQUEUE 102#elif defined(HAVE_EPOLL_CREATE1) 103#define USE_EPOLL 104#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H) 105#define USE_DEVPOLL 106typedef struct { 107 unsigned int want_read : 1, want_write : 1; 108} pollinfo_t; 109#else /* if defined(HAVE_KQUEUE) */ 110#define USE_SELECT 111#endif /* HAVE_KQUEUE */ 112 113/* 114 * Set by the -T dscp option on the command line. If set to a value 115 * other than -1, we check to make sure DSCP values match it, and 116 * assert if not. 117 */ 118int isc_dscp_check_value = -1; 119 120/*% 121 * Maximum number of allowable open sockets. This is also the maximum 122 * allowable socket file descriptor. 123 * 124 * Care should be taken before modifying this value for select(): 125 * The API standard doesn't ensure select() accept more than (the system default 126 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in 127 * the vast majority of cases. 
This constant should therefore be increased only 128 * when absolutely necessary and possible, i.e., the server is exhausting all 129 * available file descriptors (up to FD_SETSIZE) and the select() function 130 * and FD_xxx macros support larger values than FD_SETSIZE (which may not 131 * always by true, but we keep using some of them to ensure as much 132 * portability as possible). Note also that overall server performance 133 * may be rather worsened with a larger value of this constant due to 134 * inherent scalability problems of select(). 135 * 136 * As a special note, this value shouldn't have to be touched if 137 * this is a build for an authoritative only DNS server. 138 */ 139#ifndef ISC_SOCKET_MAXSOCKETS 140#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 141#ifdef TUNE_LARGE 142#define ISC_SOCKET_MAXSOCKETS 21000 143#else /* ifdef TUNE_LARGE */ 144#define ISC_SOCKET_MAXSOCKETS 4096 145#endif /* TUNE_LARGE */ 146#elif defined(USE_SELECT) 147#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE 148#endif /* USE_KQUEUE... */ 149#endif /* ISC_SOCKET_MAXSOCKETS */ 150 151#ifdef USE_SELECT 152/*% 153 * Mac OS X needs a special definition to support larger values in select(). 154 * We always define this because a larger value can be specified run-time. 155 */ 156#ifdef __APPLE__ 157#define _DARWIN_UNLIMITED_SELECT 158#endif /* __APPLE__ */ 159#endif /* USE_SELECT */ 160 161#ifdef ISC_SOCKET_USE_POLLWATCH 162/*% 163 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel 164 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for 165 * some of the specified FD. The idea is based on the observation that it's 166 * likely for a busy server to keep receiving packets. It specifically works 167 * as follows: the socket watcher is first initialized with the state of 168 * "poll_idle". While it's in the idle state it keeps sleeping until a socket 169 * event occurs. 
When it wakes up for a socket I/O event, it moves to the 170 * poll_active state, and sets the poll timeout to a short period 171 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the 172 * watcher goes to the poll_checking state with the same timeout period. 173 * In this state, the watcher tries to detect whether this is a break 174 * during intermittent events or the kernel bug is triggered. If the next 175 * polling reports an event within the short period, the previous timeout is 176 * likely to be a kernel bug, and so the watcher goes back to the active state. 177 * Otherwise, it moves to the idle state again. 178 * 179 * It's not clear whether this is a thread-related bug, but since we've only 180 * seen this with threads, this workaround is used only when enabling threads. 181 */ 182 183typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; 184 185#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT 186#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 187#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ 188#endif /* ISC_SOCKET_USE_POLLWATCH */ 189 190/*% 191 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds. 192 */ 193#define FDLOCK_BITS 10 194#define FDLOCK_COUNT (1 << FDLOCK_BITS) 195#define FDLOCK_ID(fd) \ 196 (((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \ 197 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT))) 198 199/*% 200 * Maximum number of events communicated with the kernel. There should normally 201 * be no need for having a large number. 
202 */ 203#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 204#ifndef ISC_SOCKET_MAXEVENTS 205#ifdef TUNE_LARGE 206#define ISC_SOCKET_MAXEVENTS 2048 207#else /* ifdef TUNE_LARGE */ 208#define ISC_SOCKET_MAXEVENTS 64 209#endif /* TUNE_LARGE */ 210#endif /* ifndef ISC_SOCKET_MAXEVENTS */ 211#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \ 212 * */ 213 214/*% 215 * Some systems define the socket length argument as an int, some as size_t, 216 * some as socklen_t. This is here so it can be easily changed if needed. 217 */ 218#ifndef socklen_t 219#define socklen_t unsigned int 220#endif /* ifndef socklen_t */ 221 222/*% 223 * Define what the possible "soft" errors can be. These are non-fatal returns 224 * of various network related functions, like recv() and so on. 225 * 226 * For some reason, BSDI (and perhaps others) will sometimes return <0 227 * from recv() but will have errno==0. This is broken, but we have to 228 * work around it here. 229 */ 230#define SOFT_ERROR(e) \ 231 ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \ 232 (e) == EINTR || (e) == 0) 233 234#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x) 235 236/*!< 237 * DLVL(90) -- Function entry/exit and other tracing. 238 * DLVL(70) -- Socket "correctness" -- including returning of events, etc. 239 * DLVL(60) -- Socket data send/receive 240 * DLVL(50) -- Event tracing, including receiving/sending completion events. 241 * DLVL(20) -- Socket creation/destruction. 
242 */ 243#define TRACE_LEVEL 90 244#define CORRECTNESS_LEVEL 70 245#define IOEVENT_LEVEL 60 246#define EVENT_LEVEL 50 247#define CREATION_LEVEL 20 248 249#define TRACE DLVL(TRACE_LEVEL) 250#define CORRECTNESS DLVL(CORRECTNESS_LEVEL) 251#define IOEVENT DLVL(IOEVENT_LEVEL) 252#define EVENT DLVL(EVENT_LEVEL) 253#define CREATION DLVL(CREATION_LEVEL) 254 255typedef isc_event_t intev_t; 256 257#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o') 258#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC) 259 260/*! 261 * IPv6 control information. If the socket is an IPv6 socket we want 262 * to collect the destination address and interface so the client can 263 * set them on outgoing packets. 264 */ 265#ifndef USE_CMSG 266#define USE_CMSG 1 267#endif /* ifndef USE_CMSG */ 268 269/*% 270 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have 271 * a setsockopt() like interface to request timestamps, and if the OS 272 * doesn't do it for us, call gettimeofday() on every UDP receive? 273 */ 274#ifdef SO_TIMESTAMP 275#ifndef USE_CMSG 276#define USE_CMSG 1 277#endif /* ifndef USE_CMSG */ 278#endif /* ifdef SO_TIMESTAMP */ 279 280#if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE) 281#define SET_RCVBUF 282#endif 283 284#if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE) 285#define SET_SNDBUF 286#endif 287 288/*% 289 * Instead of calculating the cmsgbuf lengths every time we take 290 * a rule of thumb approach - sizes are taken from x86_64 linux, 291 * multiplied by 2, everything should fit. Those sizes are not 292 * large enough to cause any concern. 
293 */ 294#if defined(USE_CMSG) 295#define CMSG_SP_IN6PKT 40 296#else /* if defined(USE_CMSG) */ 297#define CMSG_SP_IN6PKT 0 298#endif /* if defined(USE_CMSG) */ 299 300#if defined(USE_CMSG) && defined(SO_TIMESTAMP) 301#define CMSG_SP_TIMESTAMP 32 302#else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */ 303#define CMSG_SP_TIMESTAMP 0 304#endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */ 305 306#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) 307#define CMSG_SP_TCTOS 24 308#else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */ 309#define CMSG_SP_TCTOS 0 310#endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */ 311 312#define CMSG_SP_INT 24 313 314/* Align cmsg buffers to be safe on SPARC etc. */ 315#define RECVCMSGBUFLEN \ 316 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \ 317 1, \ 318 sizeof(void *)) 319#define SENDCMSGBUFLEN \ 320 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \ 321 sizeof(void *)) 322 323/*% 324 * The number of times a send operation is repeated if the result is EINTR. 325 */ 326#define NRETRIES 10 327 328typedef struct isc__socketthread isc__socketthread_t; 329 330#define NEWCONNSOCK(ev) ((ev)->newsocket) 331 332struct isc_socket { 333 /* Not locked. */ 334 unsigned int magic; 335 isc_socketmgr_t *manager; 336 isc_mutex_t lock; 337 isc_sockettype_t type; 338 const isc_statscounter_t *statsindex; 339 isc_refcount_t references; 340 341 /* Locked by socket lock. 
*/ 342 ISC_LINK(isc_socket_t) link; 343 int fd; 344 int pf; 345 int threadid; 346 char name[16]; 347 void *tag; 348 349 ISC_LIST(isc_socketevent_t) send_list; 350 ISC_LIST(isc_socketevent_t) recv_list; 351 ISC_LIST(isc_socket_newconnev_t) accept_list; 352 ISC_LIST(isc_socket_connev_t) connect_list; 353 354 isc_sockaddr_t peer_address; /* remote address */ 355 356 unsigned int listener : 1, /* listener socket */ 357 connected : 1, connecting : 1, /* connect pending 358 * */ 359 bound : 1, /* bound to local addr */ 360 dupped : 1, active : 1, /* currently active */ 361 pktdscp : 1; /* per packet dscp */ 362 363#ifdef ISC_PLATFORM_RECVOVERFLOW 364 unsigned char overflow; /* used for MSG_TRUNC fake */ 365#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */ 366 367 void *fdwatcharg; 368 isc_sockfdwatch_t fdwatchcb; 369 int fdwatchflags; 370 isc_task_t *fdwatchtask; 371 unsigned int dscp; 372}; 373 374#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g') 375#define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC) 376 377struct isc_socketmgr { 378 /* Not locked. */ 379 unsigned int magic; 380 isc_mem_t *mctx; 381 isc_mutex_t lock; 382 isc_stats_t *stats; 383 int nthreads; 384 isc__socketthread_t *threads; 385 unsigned int maxsocks; 386 /* Locked by manager lock. */ 387 ISC_LIST(isc_socket_t) socklist; 388 int reserved; /* unlocked */ 389 isc_condition_t shutdown_ok; 390 size_t maxudp; 391}; 392 393struct isc__socketthread { 394 isc_socketmgr_t *manager; 395 int threadid; 396 isc_thread_t thread; 397 int pipe_fds[2]; 398 isc_mutex_t *fdlock; 399 /* Locked by fdlock. 
*/ 400 isc_socket_t **fds; 401 int *fdstate; 402#ifdef USE_KQUEUE 403 int kqueue_fd; 404 int nevents; 405 struct kevent *events; 406#endif /* USE_KQUEUE */ 407#ifdef USE_EPOLL 408 int epoll_fd; 409 int nevents; 410 struct epoll_event *events; 411 uint32_t *epoll_events; 412#endif /* USE_EPOLL */ 413#ifdef USE_DEVPOLL 414 int devpoll_fd; 415 isc_resourcevalue_t open_max; 416 unsigned int calls; 417 int nevents; 418 struct pollfd *events; 419 pollinfo_t *fdpollinfo; 420#endif /* USE_DEVPOLL */ 421#ifdef USE_SELECT 422 int fd_bufsize; 423 fd_set *read_fds; 424 fd_set *read_fds_copy; 425 fd_set *write_fds; 426 fd_set *write_fds_copy; 427 int maxfd; 428#endif /* USE_SELECT */ 429}; 430 431#define CLOSED 0 /* this one must be zero */ 432#define MANAGED 1 433#define CLOSE_PENDING 2 434 435/* 436 * send() and recv() iovec counts 437 */ 438#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER) 439#ifdef ISC_PLATFORM_RECVOVERFLOW 440#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1) 441#else /* ifdef ISC_PLATFORM_RECVOVERFLOW */ 442#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER) 443#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */ 444 445static isc_result_t 446socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type, 447 isc_socket_t **socketp, isc_socket_t *dup_socket); 448static void 449send_recvdone_event(isc_socket_t *, isc_socketevent_t **); 450static void 451send_senddone_event(isc_socket_t *, isc_socketevent_t **); 452static void 453send_connectdone_event(isc_socket_t *, isc_socket_connev_t **); 454static void 455free_socket(isc_socket_t **); 456static isc_result_t 457allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **); 458static void 459destroy(isc_socket_t **); 460static void 461internal_accept(isc_socket_t *); 462static void 463internal_connect(isc_socket_t *); 464static void 465internal_recv(isc_socket_t *); 466static void 467internal_send(isc_socket_t *); 468static void 469process_cmsg(isc_socket_t *, 
struct msghdr *, isc_socketevent_t *); 470static void 471build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *, 472 struct iovec *, size_t *); 473static void 474build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *, 475 struct iovec *, size_t *); 476static bool 477process_ctlfd(isc__socketthread_t *thread); 478static void 479setdscp(isc_socket_t *sock, isc_dscp_t dscp); 480static void 481dispatch_recv(isc_socket_t *sock); 482static void 483dispatch_send(isc_socket_t *sock); 484static void 485internal_fdwatch_read(isc_socket_t *sock); 486static void 487internal_fdwatch_write(isc_socket_t *sock); 488 489#define SELECT_POKE_SHUTDOWN (-1) 490#define SELECT_POKE_NOTHING (-2) 491#define SELECT_POKE_READ (-3) 492#define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */ 493#define SELECT_POKE_WRITE (-4) 494#define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */ 495#define SELECT_POKE_CLOSE (-5) 496 497/*% 498 * Shortcut index arrays to get access to statistics counters. 
499 */ 500enum { 501 STATID_OPEN = 0, 502 STATID_OPENFAIL = 1, 503 STATID_CLOSE = 2, 504 STATID_BINDFAIL = 3, 505 STATID_CONNECTFAIL = 4, 506 STATID_CONNECT = 5, 507 STATID_ACCEPTFAIL = 6, 508 STATID_ACCEPT = 7, 509 STATID_SENDFAIL = 8, 510 STATID_RECVFAIL = 9, 511 STATID_ACTIVE = 10 512}; 513static const isc_statscounter_t udp4statsindex[] = { 514 isc_sockstatscounter_udp4open, 515 isc_sockstatscounter_udp4openfail, 516 isc_sockstatscounter_udp4close, 517 isc_sockstatscounter_udp4bindfail, 518 isc_sockstatscounter_udp4connectfail, 519 isc_sockstatscounter_udp4connect, 520 -1, 521 -1, 522 isc_sockstatscounter_udp4sendfail, 523 isc_sockstatscounter_udp4recvfail, 524 isc_sockstatscounter_udp4active 525}; 526static const isc_statscounter_t udp6statsindex[] = { 527 isc_sockstatscounter_udp6open, 528 isc_sockstatscounter_udp6openfail, 529 isc_sockstatscounter_udp6close, 530 isc_sockstatscounter_udp6bindfail, 531 isc_sockstatscounter_udp6connectfail, 532 isc_sockstatscounter_udp6connect, 533 -1, 534 -1, 535 isc_sockstatscounter_udp6sendfail, 536 isc_sockstatscounter_udp6recvfail, 537 isc_sockstatscounter_udp6active 538}; 539static const isc_statscounter_t tcp4statsindex[] = { 540 isc_sockstatscounter_tcp4open, isc_sockstatscounter_tcp4openfail, 541 isc_sockstatscounter_tcp4close, isc_sockstatscounter_tcp4bindfail, 542 isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect, 543 isc_sockstatscounter_tcp4acceptfail, isc_sockstatscounter_tcp4accept, 544 isc_sockstatscounter_tcp4sendfail, isc_sockstatscounter_tcp4recvfail, 545 isc_sockstatscounter_tcp4active 546}; 547static const isc_statscounter_t tcp6statsindex[] = { 548 isc_sockstatscounter_tcp6open, isc_sockstatscounter_tcp6openfail, 549 isc_sockstatscounter_tcp6close, isc_sockstatscounter_tcp6bindfail, 550 isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect, 551 isc_sockstatscounter_tcp6acceptfail, isc_sockstatscounter_tcp6accept, 552 isc_sockstatscounter_tcp6sendfail, 
	isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
/*
 * Entries of -1 mark events that are never counted for this socket
 * type; inc_stats()/dec_stats() REQUIRE a counter id that is not -1,
 * so callers must not index into a -1 slot.
 */
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};

static int
gen_threadid(isc_socket_t *sock);

/*%
 * Choose the watcher thread responsible for this socket: a plain
 * modulo of the file descriptor over the number of manager threads.
 */
static int
gen_threadid(isc_socket_t *sock) {
	return (sock->fd % sock->manager->nthreads);
}

/*%
 * printf-style debug logging for a socket manager; the message is
 * prefixed with the manager's address.  Returns early -- before any
 * formatting work -- when the level would not be logged.
 */
static void
manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...)
	ISC_FORMAT_PRINTF(5, 6);
static void
manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...) {
	char msgbuf[2048];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
		      sockmgr, msgbuf);
}

/*%
 * Like manager_log(), but identifies the emitting watcher thread
 * (manager address plus thread id) in the message prefix.
 */
static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
	   isc_logmodule_t *module, int level, const char *fmt, ...)
	ISC_FORMAT_PRINTF(5, 6);
static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
	   isc_logmodule_t *module, int level, const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p thread %d: %s", thread->manager,
		      thread->threadid, msgbuf);
}

/*%
 * Per-socket logging; when 'address' is non-NULL it is formatted and
 * included so the peer of the socket is visible in the log line.
 */
static void
socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
static void
socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) {
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

/*%
 * Increment socket-related statistics counters.  'stats' may be NULL
 * (statistics disabled); 'counterid' must be a real counter index,
 * never the -1 placeholder used in the statsindex tables above.
 */
static void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats != NULL) {
		isc_stats_increment(stats, counterid);
	}
}

/*%
 * Decrement socket-related statistics counters.
 */
static void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats != NULL) {
		isc_stats_decrement(stats, counterid);
	}
}

/*%
 * Ask this watcher thread's multiplexer backend to start polling 'fd'
 * for readability (msg == SELECT_POKE_READ) or writability (any other
 * value; in practice SELECT_POKE_WRITE).  One implementation per
 * backend; all return ISC_R_SUCCESS or an error translated from errno.
 */
static isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	uint32_t oldevents;
	int ret;
	int op;

	/*
	 * epoll keeps one event mask per fd, so merge the new interest
	 * into the cached mask; first interest means ADD, otherwise MOD.
	 */
	oldevents = thread->epoll_events[fd];
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] |= EPOLLIN;
	} else {
		thread->epoll_events[fd] |= EPOLLOUT;
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	/*
	 * NOTE(review): the socket's lock is held across epoll_ctl(),
	 * presumably so the fd cannot be closed/reused mid-call --
	 * confirm against the close path.
	 */
	if (thread->fds[fd] != NULL) {
		LOCK(&thread->fds[fd]->lock);
	}
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (thread->fds[fd] != NULL) {
		UNLOCK(&thread->fds[fd]->lock);
	}
	if (ret == -1) {
		if (errno == EEXIST) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d",
					 fd);
		}
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ) {
		pfd.events = POLLIN;
	} else {
		pfd.events = POLLOUT;
	}
	pfd.fd = fd;
	pfd.revents = 0;
	/* Writing a pollfd to the /dev/poll fd registers interest. */
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 1;
		} else {
			thread->fdpollinfo[fd].want_write = 1;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The manager lock guards the shared fd_set bitmaps. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_SET(fd, thread->read_fds);
	}
	if (msg == SELECT_POKE_WRITE) {
		FD_SET(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}

/*%
 * Stop watching 'fd' for the given event; the inverse of watch_fd().
 */
static isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	int ret;
	int op;

	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	/* Nothing left to watch: remove the fd from the epoll set. */
	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	/* ENOENT (fd already gone from the set) is tolerated. */
	if (ret == -1 && errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 0;
		} else {
			thread->fdpollinfo[fd].want_write = 0;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_CLR(fd, thread->read_fds);
	} else if (msg == SELECT_POKE_WRITE) {
		FD_CLR(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}

/*
 * A poke message was received; perform a proper
 * watch/unwatch on the fd provided.
 */
static void
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* Final close: mark CLOSED, drop kernel interest, close fd. */
		LOCK(&thread->fdlock[lockid]);
		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
		thread->fdstate[fd] = CLOSED;
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 * NOTE(review): the calls below actually run with the fdlock
		 * still held, which contradicts the note above -- confirm the
		 * intended lock ordering.
		 */
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}
	if (thread->fdstate[fd] != MANAGED) {
		/* Not a live socket (already CLOSED); nothing to do. */
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	/*
	 * Set requested bit.
	 */
	result = watch_fd(thread, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s", fd,
			      isc_result_totext(result));
	}
	UNLOCK(&thread->fdlock[lockid]);
}

/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 */
static void
select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	/* The poke is an (fd, msg) pair written down the thread's pipe. */
	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
			   sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif /* ifdef ENOSR */
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s", strbuf);
	}

	INSIST(cc == sizeof(buf));
}

/*
 * Read a message on the internal fd; the counterpart of select_poke().
 * On a soft error, *msg is SELECT_POKE_NOTHING and *fd is -1.
 */
static void
select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		*msg = SELECT_POKE_NOTHING;
		*fd = -1; /* Silence compiler. */
		if (SOFT_ERROR(errno)) {
			return;
		}

		strerror_r(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s", strbuf);
	}
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}

/*
 * Make a fd non-blocking.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
#else /* ifdef USE_FIONBIO_IOCTL */
	int flags;
#endif /* ifdef USE_FIONBIO_IOCTL */

#ifdef USE_FIONBIO_IOCTL
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else /* ifdef USE_FIONBIO_IOCTL */
	/*
	 * PORT_NONBLOCK is the platform's O_NONBLOCK-style flag;
	 * read-modify-write the fd's file status flags.
	 */
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif /* ifdef USE_FIONBIO_IOCTL */

	if (ret == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else /* ifdef USE_FIONBIO_IOCTL */
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif /* ifdef USE_FIONBIO_IOCTL */
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */
static socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else /* ifdef CMSG_LEN */
	socklen_t hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}

static socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else /* ifdef CMSG_SPACE */
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
1071 */ 1072 char dummybuf[sizeof(struct cmsghdr) + 1024]; 1073 1074 memset(&msg, 0, sizeof(msg)); 1075 msg.msg_control = dummybuf; 1076 msg.msg_controllen = sizeof(dummybuf); 1077 1078 cmsgp = (struct cmsghdr *)dummybuf; 1079 cmsgp->cmsg_len = cmsg_len(len); 1080 1081 cmsgp = CMSG_NXTHDR(&msg, cmsgp); 1082 if (cmsgp != NULL) { 1083 return ((char *)cmsgp - (char *)msg.msg_control); 1084 } else { 1085 return (0); 1086 } 1087#endif /* ifdef CMSG_SPACE */ 1088} 1089#endif /* USE_CMSG */ 1090 1091/* 1092 * Process control messages received on a socket. 1093 */ 1094static void 1095process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { 1096#ifdef USE_CMSG 1097 struct cmsghdr *cmsgp; 1098 struct in6_pktinfo *pktinfop; 1099#ifdef SO_TIMESTAMP 1100 void *timevalp; 1101#endif /* ifdef SO_TIMESTAMP */ 1102#endif /* ifdef USE_CMSG */ 1103 1104 /* 1105 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined. 1106 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined. 1107 * They are all here, outside of the CPP tests, because it is 1108 * more consistent with the usual ISC coding style. 
1109 */ 1110 UNUSED(sock); 1111 UNUSED(msg); 1112 UNUSED(dev); 1113 1114#ifdef MSG_TRUNC 1115 if ((msg->msg_flags & MSG_TRUNC) != 0) { 1116 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 1117 } 1118#endif /* ifdef MSG_TRUNC */ 1119 1120#ifdef MSG_CTRUNC 1121 if ((msg->msg_flags & MSG_CTRUNC) != 0) { 1122 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC; 1123 } 1124#endif /* ifdef MSG_CTRUNC */ 1125 1126#ifndef USE_CMSG 1127 return; 1128#else /* ifndef USE_CMSG */ 1129 if (msg->msg_controllen == 0U || msg->msg_control == NULL) { 1130 return; 1131 } 1132 1133#ifdef SO_TIMESTAMP 1134 timevalp = NULL; 1135#endif /* ifdef SO_TIMESTAMP */ 1136 pktinfop = NULL; 1137 1138 cmsgp = CMSG_FIRSTHDR(msg); 1139 while (cmsgp != NULL) { 1140 socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp); 1141 1142 if (cmsgp->cmsg_level == IPPROTO_IPV6 && 1143 cmsgp->cmsg_type == IPV6_PKTINFO) 1144 { 1145 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); 1146 memmove(&dev->pktinfo, pktinfop, 1147 sizeof(struct in6_pktinfo)); 1148 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 1149 socket_log(sock, NULL, TRACE, 1150 "interface received on ifindex %u", 1151 dev->pktinfo.ipi6_ifindex); 1152 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) { 1153 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; 1154 } 1155 goto next; 1156 } 1157 1158#ifdef SO_TIMESTAMP 1159 if (cmsgp->cmsg_level == SOL_SOCKET && 1160 cmsgp->cmsg_type == SCM_TIMESTAMP) 1161 { 1162 struct timeval tv; 1163 timevalp = CMSG_DATA(cmsgp); 1164 memmove(&tv, timevalp, sizeof(tv)); 1165 dev->timestamp.seconds = tv.tv_sec; 1166 dev->timestamp.nanoseconds = tv.tv_usec * 1000; 1167 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP; 1168 goto next; 1169 } 1170#endif /* ifdef SO_TIMESTAMP */ 1171 1172#ifdef IPV6_TCLASS 1173 if (cmsgp->cmsg_level == IPPROTO_IPV6 && 1174 cmsgp->cmsg_type == IPV6_TCLASS) 1175 { 1176 dev->dscp = *(int *)CMSG_DATA(cmsgp); 1177 dev->dscp >>= 2; 1178 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 1179 goto next; 1180 } 
1181#endif /* ifdef IPV6_TCLASS */ 1182 1183#ifdef IP_TOS 1184 if (cmsgp->cmsg_level == IPPROTO_IP && 1185 (cmsgp->cmsg_type == IP_TOS 1186#ifdef IP_RECVTOS 1187 || cmsgp->cmsg_type == IP_RECVTOS 1188#endif /* ifdef IP_RECVTOS */ 1189 )) 1190 { 1191 dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp); 1192 dev->dscp >>= 2; 1193 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 1194 goto next; 1195 } 1196#endif /* ifdef IP_TOS */ 1197 next: 1198 cmsgp = CMSG_NXTHDR(msg, cmsgp); 1199 } 1200#endif /* USE_CMSG */ 1201} 1202 1203/* 1204 * Construct an iov array and attach it to the msghdr passed in. This is 1205 * the SEND constructor, which will use the used region of the buffer 1206 * (if using a buffer list) or will use the internal region (if a single 1207 * buffer I/O is requested). 1208 * 1209 * Nothing can be NULL, and the done event must list at least one buffer 1210 * on the buffer linked list for this function to be meaningful. 1211 * 1212 * If write_countp != NULL, *write_countp will hold the number of bytes 1213 * this transaction can send. 
 */
static void
build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
	unsigned int iovcount;
	size_t write_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/* Connected sockets must not pass an explicit destination. */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	/* Send the remainder of the region; dev->n bytes already done. */
	write_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = write_count;
	iovcount = 1;

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)

	/*
	 * Ancillary data is appended into cmsgbuf in order, with
	 * msg_controllen doubling as the running write offset; each
	 * addition is bounds-checked against SENDCMSGBUFLEN.
	 */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

#if defined(IPV6_USE_MIN_MTU)
	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1; /* -1, 0, 1 */

		/* Append after any PKTINFO cmsg written above. */
		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}
#endif /* if defined(IPV6_USE_MIN_MTU) */

	/* Debugging aid (-T dscp): assert the configured DSCP is in use. */
	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp) {
			INSIST((int)dev->dscp == isc_dscp_check_value);
		} else if (sock->type == isc_sockettype_tcp) {
			INSIST((int)sock->dscp == isc_dscp_check_value);
		}
	}

#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		/* Shift the 6-bit DSCP into the TOS/TCLASS octet. */
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
		/*
		 * Per-packet DSCP via cmsg when supported (pktdscp);
		 * otherwise fall back to a sticky setsockopt() whenever
		 * the requested value differs from the socket's current one.
		 */
		if (sock->pf == AF_INET && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
		if (sock->pf == AF_INET6 && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
		/* Zero the unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN)
		{
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
	* defined(IPV6_TCLASS)) \
	* */
#endif /* USE_CMSG */

	if (write_countp != NULL) {
		*write_countp = write_count;
	}
}

/*
 * Construct an iov array and attach it to the msghdr passed in. This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
static void
build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
	unsigned int iovcount;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		/* Let recvmsg() fill in the source address. */
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	/* Receive into the unfilled remainder of the region. */
	read_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = read_count;
	iovcount = 1;

	/*
	 * If needed, set up to receive that one extra byte.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#if defined(USE_CMSG)
	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
#else /* if defined(USE_CMSG) */
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
#endif /* USE_CMSG */
	msg->msg_flags = 0;

	if (read_countp != NULL) {
		*read_countp = read_count;
	}
}

/*
 * Fill in dev->address for a pending I/O event: the explicit address if
 * given (UDP only), otherwise the socket's peer address.
 */
static void
set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev) {
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL) {
			dev->address = *address;
		} else {
			dev->address = sock->peer_address;
		}
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->peer_address;
	}
}

/*
 * Event destructor trampoline: invokes the original destroy handler
 * saved by allocate_socketevent() below.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	(ev->destroy)(event);
}

/*
 * Allocate and initialize a socket event, chaining the allocator's
 * destructor behind destroy_socketevent().
 */
static isc_socketevent_t *
allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
		     isc_taskaction_t action, void *arg) {
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
						     action, arg, sizeof(*ev));

	ev->result = ISC_R_UNSET;
	ISC_LINK_INIT(ev, ev_link);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;
	/* Save the original destructor and interpose our own. */
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;
	ev->dscp = 0;

	return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper: dump a msghdr (name, iov array, control buffer) to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base, (long)msg->msg_iov[i].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
#endif /* if defined(ISC_SOCKET_DEBUG) */

#define DOIO_SUCCESS 0 /* i/o ok, event sent */
#define DOIO_SOFT 1    /* i/o ok, soft error, no event sent */
#define DOIO_HARD 2    /* i/o error, event sent */
#define DOIO_EOF 3     /* EOF, no event sent */

static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno before any call below can clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * SOFT_OR_HARD: hard error only on a connected socket (the error is then
 * definitely about our peer); otherwise retryable.  ALWAYS_HARD is defined
 * for symmetry with doio_send() but is unused in this function.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes are introduced in RFC 4443 from March 2006,
		 * and the document obsoletes RFC 1885. But unfortunately not
		 * all operating systems have caught up with the new standard
		 * (in 2020) and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	case isc_sockettype_fdwatch:
	default:
		UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Drop spoofed/broken packets claiming source port 0. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp)
		{
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
 *			ISC_R_SUCCESS.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			dev->result contains the appropriate error.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
1674 */ 1675static int 1676doio_send(isc_socket_t *sock, isc_socketevent_t *dev) { 1677 int cc; 1678 struct iovec iov[MAXSCATTERGATHER_SEND]; 1679 size_t write_count; 1680 struct msghdr msghdr; 1681 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 1682 int attempts = 0; 1683 int send_errno; 1684 char strbuf[ISC_STRERRORSIZE]; 1685 char cmsgbuf[SENDCMSGBUFLEN] = { 0 }; 1686 1687 build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count); 1688 1689resend: 1690 if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 && 1691 write_count > sock->manager->maxudp) 1692 { 1693 cc = write_count; 1694 } else { 1695 cc = sendmsg(sock->fd, &msghdr, 0); 1696 } 1697 send_errno = errno; 1698 1699 /* 1700 * Check for error or block condition. 1701 */ 1702 if (cc < 0) { 1703 if (send_errno == EINTR && ++attempts < NRETRIES) { 1704 goto resend; 1705 } 1706 1707 if (SOFT_ERROR(send_errno)) { 1708 if (errno == EWOULDBLOCK || errno == EAGAIN) { 1709 dev->result = ISC_R_WOULDBLOCK; 1710 } 1711 return (DOIO_SOFT); 1712 } 1713 1714#define SOFT_OR_HARD(_system, _isc) \ 1715 if (send_errno == _system) { \ 1716 if (sock->connected) { \ 1717 dev->result = _isc; \ 1718 inc_stats(sock->manager->stats, \ 1719 sock->statsindex[STATID_SENDFAIL]); \ 1720 return (DOIO_HARD); \ 1721 } \ 1722 return (DOIO_SOFT); \ 1723 } 1724#define ALWAYS_HARD(_system, _isc) \ 1725 if (send_errno == _system) { \ 1726 dev->result = _isc; \ 1727 inc_stats(sock->manager->stats, \ 1728 sock->statsindex[STATID_SENDFAIL]); \ 1729 return (DOIO_HARD); \ 1730 } 1731 1732 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED); 1733 ALWAYS_HARD(EACCES, ISC_R_NOPERM); 1734 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 1735 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 1736 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH); 1737#ifdef EHOSTDOWN 1738 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH); 1739#endif /* ifdef EHOSTDOWN */ 1740 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH); 1741 SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES); 1742 
ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH); 1743 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED); 1744 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET); 1745 1746#undef SOFT_OR_HARD 1747#undef ALWAYS_HARD 1748 1749 /* 1750 * The other error types depend on whether or not the 1751 * socket is UDP or TCP. If it is UDP, some errors 1752 * that we expect to be fatal under TCP are merely 1753 * annoying, and are really soft errors. 1754 * 1755 * However, these soft errors are still returned as 1756 * a status. 1757 */ 1758 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf)); 1759 strerror_r(send_errno, strbuf, sizeof(strbuf)); 1760 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s", 1761 addrbuf, strbuf); 1762 dev->result = isc__errno2result(send_errno); 1763 inc_stats(sock->manager->stats, 1764 sock->statsindex[STATID_SENDFAIL]); 1765 return (DOIO_HARD); 1766 } 1767 1768 if (cc == 0) { 1769 inc_stats(sock->manager->stats, 1770 sock->statsindex[STATID_SENDFAIL]); 1771 UNEXPECTED_ERROR(__FILE__, __LINE__, 1772 "doio_send: send() returned 0"); 1773 } 1774 1775 /* 1776 * If we write less than we expected, update counters, poke. 1777 */ 1778 dev->n += cc; 1779 if ((size_t)cc != write_count) { 1780 return (DOIO_SOFT); 1781 } 1782 1783 /* 1784 * Exactly what we wanted to write. We're done with this 1785 * entry. Post its completion event. 1786 */ 1787 dev->result = ISC_R_SUCCESS; 1788 return (DOIO_SUCCESS); 1789} 1790 1791/* 1792 * Kill. 1793 * 1794 * Caller must ensure that the socket is not locked and no external 1795 * references exist. 1796 */ 1797static void 1798socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) { 1799 int lockid = FDLOCK_ID(fd); 1800 /* 1801 * No one has this socket open, so the watcher doesn't have to be 1802 * poked, and the socket doesn't have to be locked. 
1803 */ 1804 LOCK(&thread->fdlock[lockid]); 1805 thread->fds[fd] = NULL; 1806 if (sock->type == isc_sockettype_fdwatch) 1807 thread->fdstate[fd] = CLOSED; 1808 else 1809 thread->fdstate[fd] = CLOSE_PENDING; 1810 UNLOCK(&thread->fdlock[lockid]); 1811 if (sock->type == isc_sockettype_fdwatch) { 1812 /* 1813 * The caller may close the socket once this function returns, 1814 * and `fd' may be reassigned for a new socket. So we do 1815 * unwatch_fd() here, rather than defer it via select_poke(). 1816 * Note: this may complicate data protection among threads and 1817 * may reduce performance due to additional locks. One way to 1818 * solve this would be to dup() the watched descriptor, but we 1819 * take a simpler approach at this moment. 1820 */ 1821 (void)unwatch_fd(thread, fd, SELECT_POKE_READ); 1822 (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE); 1823 } else 1824 select_poke(thread->manager, thread->threadid, fd, 1825 SELECT_POKE_CLOSE); 1826 1827 inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]); 1828 1829 LOCK(&sock->lock); 1830 if (sock->active == 1) { 1831 dec_stats(thread->manager->stats, 1832 sock->statsindex[STATID_ACTIVE]); 1833 sock->active = 0; 1834 } 1835 UNLOCK(&sock->lock); 1836 1837 /* 1838 * update manager->maxfd here (XXX: this should be implemented more 1839 * efficiently) 1840 */ 1841#ifdef USE_SELECT 1842 LOCK(&thread->manager->lock); 1843 if (thread->maxfd == fd) { 1844 int i; 1845 1846 thread->maxfd = 0; 1847 for (i = fd - 1; i >= 0; i--) { 1848 lockid = FDLOCK_ID(i); 1849 1850 LOCK(&thread->fdlock[lockid]); 1851 if (thread->fdstate[i] == MANAGED) { 1852 thread->maxfd = i; 1853 UNLOCK(&thread->fdlock[lockid]); 1854 break; 1855 } 1856 UNLOCK(&thread->fdlock[lockid]); 1857 } 1858 if (thread->maxfd < thread->pipe_fds[0]) { 1859 thread->maxfd = thread->pipe_fds[0]; 1860 } 1861 } 1862 1863 UNLOCK(&thread->manager->lock); 1864#endif /* USE_SELECT */ 1865} 1866 1867static void 1868destroy(isc_socket_t **sockp) { 1869 int fd = 0; 1870 
isc_socket_t *sock = *sockp; 1871 isc_socketmgr_t *manager = sock->manager; 1872 isc__socketthread_t *thread = NULL; 1873 1874 socket_log(sock, NULL, CREATION, "destroying"); 1875 1876 isc_refcount_destroy(&sock->references); 1877 1878 LOCK(&sock->lock); 1879 INSIST(ISC_LIST_EMPTY(sock->connect_list)); 1880 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 1881 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 1882 INSIST(ISC_LIST_EMPTY(sock->send_list)); 1883 INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks); 1884 1885 if (sock->fd >= 0) { 1886 fd = sock->fd; 1887 thread = &manager->threads[sock->threadid]; 1888 sock->fd = -1; 1889 sock->threadid = -1; 1890 } 1891 UNLOCK(&sock->lock); 1892 1893 if (fd > 0) { 1894 socketclose(thread, sock, fd); 1895 } 1896 1897 LOCK(&manager->lock); 1898 1899 ISC_LIST_UNLINK(manager->socklist, sock, link); 1900 1901 if (ISC_LIST_EMPTY(manager->socklist)) { 1902 SIGNAL(&manager->shutdown_ok); 1903 } 1904 1905 /* can't unlock manager as its memory context is still used */ 1906 free_socket(sockp); 1907 1908 UNLOCK(&manager->lock); 1909} 1910 1911static isc_result_t 1912allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type, 1913 isc_socket_t **socketp) { 1914 isc_socket_t *sock; 1915 1916 sock = isc_mem_get(manager->mctx, sizeof(*sock)); 1917 1918 sock->magic = 0; 1919 isc_refcount_init(&sock->references, 0); 1920 1921 sock->manager = manager; 1922 sock->type = type; 1923 sock->fd = -1; 1924 sock->threadid = -1; 1925 sock->dscp = 0; /* TOS/TCLASS is zero until set. */ 1926 sock->dupped = 0; 1927 sock->statsindex = NULL; 1928 sock->active = 0; 1929 1930 ISC_LINK_INIT(sock, link); 1931 1932 memset(sock->name, 0, sizeof(sock->name)); 1933 sock->tag = NULL; 1934 1935 /* 1936 * Set up list of readers and writers to be initially empty. 
1937 */ 1938 ISC_LIST_INIT(sock->recv_list); 1939 ISC_LIST_INIT(sock->send_list); 1940 ISC_LIST_INIT(sock->accept_list); 1941 ISC_LIST_INIT(sock->connect_list); 1942 1943 sock->listener = 0; 1944 sock->connected = 0; 1945 sock->connecting = 0; 1946 sock->bound = 0; 1947 sock->pktdscp = 0; 1948 1949 /* 1950 * Initialize the lock. 1951 */ 1952 isc_mutex_init(&sock->lock); 1953 1954 sock->magic = SOCKET_MAGIC; 1955 *socketp = sock; 1956 1957 return (ISC_R_SUCCESS); 1958} 1959 1960/* 1961 * This event requires that the various lists be empty, that the reference 1962 * count be 1, and that the magic number is valid. The other socket bits, 1963 * like the lock, must be initialized as well. The fd associated must be 1964 * marked as closed, by setting it to -1 on close, or this routine will 1965 * also close the socket. 1966 */ 1967static void 1968free_socket(isc_socket_t **socketp) { 1969 isc_socket_t *sock = *socketp; 1970 *socketp = NULL; 1971 1972 INSIST(VALID_SOCKET(sock)); 1973 isc_refcount_destroy(&sock->references); 1974 LOCK(&sock->lock); 1975 INSIST(!sock->connecting); 1976 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 1977 INSIST(ISC_LIST_EMPTY(sock->send_list)); 1978 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 1979 INSIST(ISC_LIST_EMPTY(sock->connect_list)); 1980 INSIST(!ISC_LINK_LINKED(sock, link)); 1981 UNLOCK(&sock->lock); 1982 1983 sock->magic = 0; 1984 1985 isc_mutex_destroy(&sock->lock); 1986 1987 isc_mem_put(sock->manager->mctx, sock, sizeof(*sock)); 1988} 1989 1990#if defined(SET_RCVBUF) 1991static isc_once_t rcvbuf_once = ISC_ONCE_INIT; 1992static int rcvbuf = ISC_RECV_BUFFER_SIZE; 1993 1994static void 1995set_rcvbuf(void) { 1996 int fd; 1997 int max = rcvbuf, min; 1998 socklen_t len; 1999 2000 fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 2001 if (fd == -1) { 2002 switch (errno) { 2003 case EPROTONOSUPPORT: 2004 case EPFNOSUPPORT: 2005 case EAFNOSUPPORT: 2006 /* 2007 * Linux 2.2 (and maybe others) return EINVAL instead of 2008 * EAFNOSUPPORT. 
2009 */ 2010 case EINVAL: 2011 fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); 2012 break; 2013 } 2014 } 2015 if (fd == -1) { 2016 return; 2017 } 2018 2019 len = sizeof(min); 2020 if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 && 2021 min < rcvbuf) 2022 { 2023 again: 2024 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf, 2025 sizeof(rcvbuf)) == -1) 2026 { 2027 if (errno == ENOBUFS && rcvbuf > min) { 2028 max = rcvbuf - 1; 2029 rcvbuf = (rcvbuf + min) / 2; 2030 goto again; 2031 } else { 2032 rcvbuf = min; 2033 goto cleanup; 2034 } 2035 } else { 2036 min = rcvbuf; 2037 } 2038 if (min != max) { 2039 rcvbuf = max; 2040 goto again; 2041 } 2042 } 2043cleanup: 2044 close(fd); 2045} 2046#endif /* ifdef SO_RCVBUF */ 2047 2048#if defined(SET_SNDBUF) 2049static isc_once_t sndbuf_once = ISC_ONCE_INIT; 2050static int sndbuf = ISC_SEND_BUFFER_SIZE; 2051 2052static void 2053set_sndbuf(void) { 2054 int fd; 2055 int max = sndbuf, min; 2056 socklen_t len; 2057 2058 fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 2059 if (fd == -1) { 2060 switch (errno) { 2061 case EPROTONOSUPPORT: 2062 case EPFNOSUPPORT: 2063 case EAFNOSUPPORT: 2064 /* 2065 * Linux 2.2 (and maybe others) return EINVAL instead of 2066 * EAFNOSUPPORT. 
 */
		case EINVAL:
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/*
	 * Binary-search the largest SO_SNDBUF the kernel will accept
	 * between the current value ('min') and the requested 'sndbuf'.
	 */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				sndbuf = min;
				goto cleanup;
			}
		} else {
			min = sndbuf;
		}
		if (min != max) {
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* ifdef SO_SNDBUF */

/*%
 * Ask the kernel to use the minimum IPv6 MTU (1280) on 'sock' so that
 * replies are not fragmented by path-MTU discovery.  No-op for IPv4
 * sockets and on platforms without the relevant socket options.
 */
static void
use_min_mtu(isc_socket_t *sock) {
#if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
	UNUSED(sock);
#endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
#ifdef IPV6_USE_MIN_MTU
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
#endif /* ifdef IPV6_USE_MIN_MTU */
#if defined(IPV6_MTU)
	/*
	 * Use minimum MTU on IPv6 sockets.
	 */
	if (sock->pf == AF_INET6) {
		int mtu = 1280;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
				 sizeof(mtu));
	}
#endif /* if defined(IPV6_MTU) */
}

/*%
 * Cap the TCP maximum segment size on 'sock' to 'size' bytes.
 * Best effort: the setsockopt() result is deliberately ignored.
 */
static void
set_tcp_maxseg(isc_socket_t *sock, int size) {
#ifdef TCP_MAXSEG
	if (sock->type == isc_sockettype_tcp) {
		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
				 (void *)&size, sizeof(size));
	}
#endif /* ifdef TCP_MAXSEG */
}

/*%
 * Disable path-MTU discovery (and clear the don't-fragment flag where
 * supported) on 'sock', for both IPv4 and IPv6.  All calls are best
 * effort.
 */
static void
set_ip_disable_pmtud(isc_socket_t *sock) {
	/*
	 * Disable Path MTU Discover on IP packets
	 */
	if (sock->pf == AF_INET6) {
#if defined(IPV6_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
				 &(int){ 0 }, sizeof(int));
#endif
#if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	} else if (sock->pf == AF_INET) {
#if defined(IP_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
				 sizeof(int));
#endif
#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	}
}

/*%
 * Open (or dup) the underlying file descriptor for 'sock' and apply
 * the standard per-socket options (non-blocking, buffer sizes, pktinfo,
 * PMTUD off, ...).  When 'dup_socket' is non-NULL the descriptor is
 * dup()ed from it and option setup is skipped.  On failure the relevant
 * OPENFAIL statistics counter is bumped and an isc_result_t describing
 * the failure class is returned.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock,
	   isc_socket_t *dup_socket) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_raw:
			errno = EPFNOSUPPORT;
			/*
			 * PF_ROUTE is an alias for PF_NETLINK on linux.
			 */
#if defined(PF_ROUTE)
			if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
				sock->fd = socket(sock->pf, SOCK_RAW,
						  NETLINK_ROUTE);
#else /* ifdef NETLINK_ROUTE */
				sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
				if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
					struct sockaddr_nl sa;
					int n;

					/*
					 * Do an implicit bind.
					 */
					memset(&sa, 0, sizeof(sa));
					sa.nl_family = AF_NETLINK;
					sa.nl_groups = RTMGRP_IPV4_IFADDR |
						       RTMGRP_IPV6_IFADDR;
					n = bind(sock->fd,
						 (struct sockaddr *)&sa,
						 sizeof(sa));
					if (n < 0) {
						close(sock->fd);
						sock->fd = -1;
					}
#endif /* ifdef NETLINK_ROUTE */
					sock->bound = 1;
				}
			}
#endif /* if defined(PF_ROUTE) */
			break;
		case isc_sockettype_fdwatch:
			/*
			 * We should not be called for isc_sockettype_fdwatch
			 * sockets.
			 */
			INSIST(0);
			break;
		}
	} else {
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry a bounded number of times if socket()/dup() was interrupted. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
			FALLTHROUGH;
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* A dup()ed descriptor inherits its options from the original. */
	if (dup_socket != NULL) {
		goto setup_done;
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* defined(USE_CMSG) */

#if defined(SET_RCVBUF)
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

	set_ip_disable_pmtud(sock);

setup_done:
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}

/*
 * Create a 'type' socket or duplicate an existing socket, managed
 * by 'manager'.  Events will be posted to 'task' and when dispatched
 * 'action' will be called with 'arg' as the arg value.  The new
 * socket is returned in 'socketp'.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket) {
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	REQUIRE(type != isc_sockettype_fdwatch);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/* Pick the statistics counter group matching type and family. */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex = (pf == AF_INET) ? udp4statsindex
						   : udp6statsindex;
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
		break;
	case isc_sockettype_tcp:
		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
						   : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	case isc_sockettype_raw:
		sock->statsindex = rawstatsindex;
		break;
	default:
		UNREACHABLE();
	}

	sock->pf = pf;

	result = opensocket(manager, sock, dup_socket);
	if (result != ISC_R_SUCCESS) {
		free_socket(&sock);
		return (result);
	}

	if (sock->fd == -1) {
		abort();
	}
	sock->threadid = gen_threadid(sock);
	isc_refcount_increment0(&sock->references);
	thread = &manager->threads[sock->threadid];
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the fd with its watcher thread under the fd bucket lock. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
	thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	/* fdlock released before taking manager->lock: fixed lock order. */
	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd) {
		thread->maxfd = sock->fd;
	}
#endif /* ifdef USE_SELECT */
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION,
		   dup_socket != NULL ? "dupped" : "created");

	return (ISC_R_SUCCESS);
}

/*%
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
isc_result_t
isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
		  isc_socket_t **socketp) {
	return (socket_create(manager0, pf, type, socketp, NULL));
}

/*%
 * Duplicate an existing socket.  The new socket is returned
 * in 'socketp'.
2581 */ 2582isc_result_t 2583isc_socket_dup(isc_socket_t *sock, isc_socket_t **socketp) { 2584 REQUIRE(VALID_SOCKET(sock)); 2585 REQUIRE(socketp != NULL && *socketp == NULL); 2586 2587 return (socket_create(sock->manager, sock->pf, sock->type, socketp, 2588 sock)); 2589} 2590 2591isc_result_t 2592isc_socket_open(isc_socket_t *sock) { 2593 isc_result_t result; 2594 isc__socketthread_t *thread; 2595 2596 REQUIRE(VALID_SOCKET(sock)); 2597 2598 LOCK(&sock->lock); 2599 2600 REQUIRE(isc_refcount_current(&sock->references) >= 1); 2601 REQUIRE(sock->fd == -1); 2602 REQUIRE(sock->threadid == -1); 2603 REQUIRE(sock->type != isc_sockettype_fdwatch); 2604 2605 result = opensocket(sock->manager, sock, NULL); 2606 2607 UNLOCK(&sock->lock); 2608 2609 if (result != ISC_R_SUCCESS) { 2610 sock->fd = -1; 2611 } else { 2612 sock->threadid = gen_threadid(sock); 2613 thread = &sock->manager->threads[sock->threadid]; 2614 int lockid = FDLOCK_ID(sock->fd); 2615 2616 LOCK(&thread->fdlock[lockid]); 2617 thread->fds[sock->fd] = sock; 2618 thread->fdstate[sock->fd] = MANAGED; 2619#if defined(USE_EPOLL) 2620 thread->epoll_events[sock->fd] = 0; 2621#endif /* if defined(USE_EPOLL) */ 2622#ifdef USE_DEVPOLL 2623 INSIST(thread->fdpollinfo[sock->fd].want_read == 0 && 2624 thread->fdpollinfo[sock->fd].want_write == 0); 2625#endif /* ifdef USE_DEVPOLL */ 2626 UNLOCK(&thread->fdlock[lockid]); 2627 2628#ifdef USE_SELECT 2629 LOCK(&sock->manager->lock); 2630 if (thread->maxfd < sock->fd) { 2631 thread->maxfd = sock->fd; 2632 } 2633 UNLOCK(&sock->manager->lock); 2634#endif /* ifdef USE_SELECT */ 2635 } 2636 2637 return (result); 2638} 2639 2640/* 2641 * Attach to a socket. Caller must explicitly detach when it is done. 
2642 */ 2643void 2644isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) { 2645 REQUIRE(VALID_SOCKET(sock)); 2646 REQUIRE(socketp != NULL && *socketp == NULL); 2647 2648 int old_refs = isc_refcount_increment(&sock->references); 2649 REQUIRE(old_refs > 0); 2650 2651 *socketp = sock; 2652} 2653 2654/* 2655 * Dereference a socket. If this is the last reference to it, clean things 2656 * up by destroying the socket. 2657 */ 2658void 2659isc_socket_detach(isc_socket_t **socketp) { 2660 isc_socket_t *sock; 2661 2662 REQUIRE(socketp != NULL); 2663 sock = *socketp; 2664 REQUIRE(VALID_SOCKET(sock)); 2665 if (isc_refcount_decrement(&sock->references) == 1) { 2666 destroy(&sock); 2667 } 2668 2669 *socketp = NULL; 2670} 2671 2672isc_result_t 2673isc_socket_close(isc_socket_t *sock) { 2674 int fd; 2675 isc_socketmgr_t *manager; 2676 isc__socketthread_t *thread; 2677 fflush(stdout); 2678 REQUIRE(VALID_SOCKET(sock)); 2679 2680 LOCK(&sock->lock); 2681 2682 REQUIRE(sock->type != isc_sockettype_fdwatch); 2683 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); 2684 2685 INSIST(!sock->connecting); 2686 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 2687 INSIST(ISC_LIST_EMPTY(sock->send_list)); 2688 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 2689 INSIST(ISC_LIST_EMPTY(sock->connect_list)); 2690 2691 manager = sock->manager; 2692 thread = &manager->threads[sock->threadid]; 2693 fd = sock->fd; 2694 sock->fd = -1; 2695 sock->threadid = -1; 2696 2697 sock->dupped = 0; 2698 memset(sock->name, 0, sizeof(sock->name)); 2699 sock->tag = NULL; 2700 sock->listener = 0; 2701 sock->connected = 0; 2702 sock->connecting = 0; 2703 sock->bound = 0; 2704 isc_sockaddr_any(&sock->peer_address); 2705 2706 UNLOCK(&sock->lock); 2707 2708 socketclose(thread, sock, fd); 2709 2710 return (ISC_R_SUCCESS); 2711} 2712 2713static void 2714dispatch_recv(isc_socket_t *sock) { 2715 if (sock->type != isc_sockettype_fdwatch) { 2716 internal_recv(sock); 2717 } else { 2718 internal_fdwatch_read(sock); 
2719 } 2720} 2721 2722static void 2723dispatch_send(isc_socket_t *sock) { 2724 if (sock->type != isc_sockettype_fdwatch) { 2725 internal_send(sock); 2726 } else { 2727 internal_fdwatch_write(sock); 2728 } 2729} 2730 2731/* 2732 * Dequeue an item off the given socket's read queue, set the result code 2733 * in the done event to the one provided, and send it to the task it was 2734 * destined for. 2735 * 2736 * If the event to be sent is on a list, remove it before sending. If 2737 * asked to, send and detach from the socket as well. 2738 * 2739 * Caller must have the socket locked if the event is attached to the socket. 2740 */ 2741static void 2742send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) { 2743 isc_task_t *task; 2744 2745 task = (*dev)->ev_sender; 2746 2747 (*dev)->ev_sender = sock; 2748 2749 if (ISC_LINK_LINKED(*dev, ev_link)) { 2750 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link); 2751 } 2752 2753 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) { 2754 isc_task_sendtoanddetach(&task, (isc_event_t **)dev, 2755 sock->threadid); 2756 } else { 2757 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid); 2758 } 2759} 2760 2761/* 2762 * See comments for send_recvdone_event() above. 2763 * 2764 * Caller must have the socket locked if the event is attached to the socket. 2765 */ 2766static void 2767send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) { 2768 isc_task_t *task; 2769 2770 INSIST(dev != NULL && *dev != NULL); 2771 2772 task = (*dev)->ev_sender; 2773 (*dev)->ev_sender = sock; 2774 2775 if (ISC_LINK_LINKED(*dev, ev_link)) { 2776 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link); 2777 } 2778 2779 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) { 2780 isc_task_sendtoanddetach(&task, (isc_event_t **)dev, 2781 sock->threadid); 2782 } else { 2783 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid); 2784 } 2785} 2786 2787/* 2788 * See comments for send_recvdone_event() above. 
2789 * 2790 * Caller must have the socket locked if the event is attached to the socket. 2791 */ 2792static void 2793send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) { 2794 isc_task_t *task; 2795 2796 INSIST(dev != NULL && *dev != NULL); 2797 2798 task = (*dev)->ev_sender; 2799 (*dev)->ev_sender = sock; 2800 2801 if (ISC_LINK_LINKED(*dev, ev_link)) { 2802 ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link); 2803 } 2804 2805 isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid); 2806} 2807 2808/* 2809 * Call accept() on a socket, to get the new file descriptor. The listen 2810 * socket is used as a prototype to create a new isc_socket_t. The new 2811 * socket has one outstanding reference. The task receiving the event 2812 * will be detached from just after the event is delivered. 2813 * 2814 * On entry to this function, the event delivered is the internal 2815 * readable event, and the first item on the accept_list should be 2816 * the done event we want to send. If the list is empty, this is a no-op, 2817 * so just unlock and return. 2818 */ 2819static void 2820internal_accept(isc_socket_t *sock) { 2821 isc_socketmgr_t *manager; 2822 isc__socketthread_t *thread, *nthread; 2823 isc_socket_newconnev_t *dev; 2824 isc_task_t *task; 2825 socklen_t addrlen; 2826 int fd; 2827 isc_result_t result = ISC_R_SUCCESS; 2828 char strbuf[ISC_STRERRORSIZE]; 2829 const char *err = "accept"; 2830 2831 INSIST(VALID_SOCKET(sock)); 2832 REQUIRE(sock->fd >= 0); 2833 2834 socket_log(sock, NULL, TRACE, "internal_accept called, locked socket"); 2835 2836 manager = sock->manager; 2837 INSIST(VALID_MANAGER(manager)); 2838 thread = &manager->threads[sock->threadid]; 2839 2840 INSIST(sock->listener); 2841 2842 /* 2843 * Get the first item off the accept list. 2844 * If it is empty, unlock the socket and return. 
2845 */ 2846 dev = ISC_LIST_HEAD(sock->accept_list); 2847 if (dev == NULL) { 2848 unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT); 2849 UNLOCK(&sock->lock); 2850 return; 2851 } 2852 2853 /* 2854 * Try to accept the new connection. If the accept fails with 2855 * EAGAIN or EINTR, simply poke the watcher to watch this socket 2856 * again. Also ignore ECONNRESET, which has been reported to 2857 * be spuriously returned on Linux 2.2.19 although it is not 2858 * a documented error for accept(). ECONNABORTED has been 2859 * reported for Solaris 8. The rest are thrown in not because 2860 * we have seen them but because they are ignored by other 2861 * daemons such as BIND 8 and Apache. 2862 */ 2863 2864 addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type); 2865 memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen); 2866 fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa, 2867 (void *)&addrlen); 2868 2869#ifdef F_DUPFD 2870 /* 2871 * Leave a space for stdio to work in. 2872 */ 2873 if (fd >= 0 && fd < 20) { 2874 int newfd, tmp; 2875 newfd = fcntl(fd, F_DUPFD, 20); 2876 tmp = errno; 2877 (void)close(fd); 2878 errno = tmp; 2879 fd = newfd; 2880 err = "accept/fcntl"; 2881 } 2882#endif /* ifdef F_DUPFD */ 2883 2884 if (fd < 0) { 2885 if (SOFT_ERROR(errno)) { 2886 goto soft_error; 2887 } 2888 switch (errno) { 2889 case ENFILE: 2890 case EMFILE: 2891 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2892 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2893 "%s: too many open file descriptors", 2894 err); 2895 goto soft_error; 2896 2897 case ENOBUFS: 2898 case ENOMEM: 2899 case ECONNRESET: 2900 case ECONNABORTED: 2901 case EHOSTUNREACH: 2902 case EHOSTDOWN: 2903 case ENETUNREACH: 2904 case ENETDOWN: 2905 case ECONNREFUSED: 2906#ifdef EPROTO 2907 case EPROTO: 2908#endif /* ifdef EPROTO */ 2909#ifdef ENONET 2910 case ENONET: 2911#endif /* ifdef ENONET */ 2912 goto soft_error; 2913 default: 2914 break; 2915 } 2916 strerror_r(errno, strbuf, sizeof(strbuf)); 2917 
UNEXPECTED_ERROR(__FILE__, __LINE__, 2918 "internal_accept: %s() failed: %s", err, 2919 strbuf); 2920 fd = -1; 2921 result = ISC_R_UNEXPECTED; 2922 } else { 2923 if (addrlen == 0U) { 2924 UNEXPECTED_ERROR(__FILE__, __LINE__, 2925 "internal_accept(): " 2926 "accept() failed to return " 2927 "remote address"); 2928 2929 (void)close(fd); 2930 goto soft_error; 2931 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family != 2932 sock->pf) 2933 { 2934 UNEXPECTED_ERROR( 2935 __FILE__, __LINE__, 2936 "internal_accept(): " 2937 "accept() returned peer address " 2938 "family %u (expected %u)", 2939 NEWCONNSOCK(dev)->peer_address.type.sa.sa_family, 2940 sock->pf); 2941 (void)close(fd); 2942 goto soft_error; 2943 } else if (fd >= (int)manager->maxsocks) { 2944 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2945 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2946 "accept: file descriptor exceeds limit " 2947 "(%d/%u)", 2948 fd, manager->maxsocks); 2949 (void)close(fd); 2950 goto soft_error; 2951 } 2952 } 2953 2954 if (fd != -1) { 2955 NEWCONNSOCK(dev)->peer_address.length = addrlen; 2956 NEWCONNSOCK(dev)->pf = sock->pf; 2957 } 2958 2959 /* 2960 * Pull off the done event. 2961 */ 2962 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); 2963 2964 /* 2965 * Poke watcher if there are more pending accepts. 2966 */ 2967 if (ISC_LIST_EMPTY(sock->accept_list)) { 2968 unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT); 2969 } 2970 2971 if (fd != -1) { 2972 result = make_nonblock(fd); 2973 if (result != ISC_R_SUCCESS) { 2974 (void)close(fd); 2975 fd = -1; 2976 } 2977 } 2978 2979 /* 2980 * We need to unlock sock->lock now to be able to lock manager->lock 2981 * without risking a deadlock with xmlstats. 2982 */ 2983 UNLOCK(&sock->lock); 2984 2985 /* 2986 * -1 means the new socket didn't happen. 
2987 */ 2988 if (fd != -1) { 2989 int lockid = FDLOCK_ID(fd); 2990 2991 NEWCONNSOCK(dev)->fd = fd; 2992 NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev)); 2993 NEWCONNSOCK(dev)->bound = 1; 2994 NEWCONNSOCK(dev)->connected = 1; 2995 nthread = &manager->threads[NEWCONNSOCK(dev)->threadid]; 2996 2997 /* 2998 * We already hold a lock on one fdlock in accepting thread, 2999 * we need to make sure that we don't double lock. 3000 */ 3001 bool same_bucket = (sock->threadid == 3002 NEWCONNSOCK(dev)->threadid) && 3003 (FDLOCK_ID(sock->fd) == lockid); 3004 3005 /* 3006 * Use minimum mtu if possible. 3007 */ 3008 use_min_mtu(NEWCONNSOCK(dev)); 3009 set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40); 3010 3011 /* 3012 * Ensure DSCP settings are inherited across accept. 3013 */ 3014 setdscp(NEWCONNSOCK(dev), sock->dscp); 3015 3016 /* 3017 * Save away the remote address 3018 */ 3019 dev->address = NEWCONNSOCK(dev)->peer_address; 3020 3021 if (NEWCONNSOCK(dev)->active == 0) { 3022 inc_stats(manager->stats, 3023 NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]); 3024 NEWCONNSOCK(dev)->active = 1; 3025 } 3026 3027 if (!same_bucket) { 3028 LOCK(&nthread->fdlock[lockid]); 3029 } 3030 nthread->fds[fd] = NEWCONNSOCK(dev); 3031 nthread->fdstate[fd] = MANAGED; 3032#if defined(USE_EPOLL) 3033 nthread->epoll_events[fd] = 0; 3034#endif /* if defined(USE_EPOLL) */ 3035 if (!same_bucket) { 3036 UNLOCK(&nthread->fdlock[lockid]); 3037 } 3038 3039 LOCK(&manager->lock); 3040 3041#ifdef USE_SELECT 3042 if (nthread->maxfd < fd) { 3043 nthread->maxfd = fd; 3044 } 3045#endif /* ifdef USE_SELECT */ 3046 3047 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION, 3048 "accepted connection, new socket %p", 3049 dev->newsocket); 3050 3051 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link); 3052 3053 UNLOCK(&manager->lock); 3054 3055 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]); 3056 } else { 3057 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3058 
isc_refcount_decrementz(&NEWCONNSOCK(dev)->references); 3059 free_socket((isc_socket_t **)&dev->newsocket); 3060 } 3061 3062 /* 3063 * Fill in the done event details and send it off. 3064 */ 3065 dev->result = result; 3066 task = dev->ev_sender; 3067 dev->ev_sender = sock; 3068 3069 isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid); 3070 return; 3071 3072soft_error: 3073 watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT); 3074 UNLOCK(&sock->lock); 3075 3076 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3077 return; 3078} 3079 3080static void 3081internal_recv(isc_socket_t *sock) { 3082 isc_socketevent_t *dev; 3083 3084 INSIST(VALID_SOCKET(sock)); 3085 REQUIRE(sock->fd >= 0); 3086 3087 dev = ISC_LIST_HEAD(sock->recv_list); 3088 if (dev == NULL) { 3089 goto finish; 3090 } 3091 3092 socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p", 3093 dev, dev->ev_sender); 3094 3095 /* 3096 * Try to do as much I/O as possible on this socket. There are no 3097 * limits here, currently. 3098 */ 3099 while (dev != NULL) { 3100 switch (doio_recv(sock, dev)) { 3101 case DOIO_SOFT: 3102 goto finish; 3103 3104 case DOIO_EOF: 3105 /* 3106 * read of 0 means the remote end was closed. 3107 * Run through the event queue and dispatch all 3108 * the events with an EOF result code. 
3109 */ 3110 do { 3111 dev->result = ISC_R_EOF; 3112 send_recvdone_event(sock, &dev); 3113 dev = ISC_LIST_HEAD(sock->recv_list); 3114 } while (dev != NULL); 3115 goto finish; 3116 3117 case DOIO_SUCCESS: 3118 case DOIO_HARD: 3119 send_recvdone_event(sock, &dev); 3120 break; 3121 } 3122 3123 dev = ISC_LIST_HEAD(sock->recv_list); 3124 } 3125 3126finish: 3127 if (ISC_LIST_EMPTY(sock->recv_list)) { 3128 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd, 3129 SELECT_POKE_READ); 3130 } 3131} 3132 3133static void 3134internal_send(isc_socket_t *sock) { 3135 isc_socketevent_t *dev; 3136 3137 INSIST(VALID_SOCKET(sock)); 3138 REQUIRE(sock->fd >= 0); 3139 3140 dev = ISC_LIST_HEAD(sock->send_list); 3141 if (dev == NULL) { 3142 goto finish; 3143 } 3144 socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev, 3145 dev->ev_sender); 3146 3147 /* 3148 * Try to do as much I/O as possible on this socket. There are no 3149 * limits here, currently. 3150 */ 3151 while (dev != NULL) { 3152 switch (doio_send(sock, dev)) { 3153 case DOIO_SOFT: 3154 goto finish; 3155 3156 case DOIO_HARD: 3157 case DOIO_SUCCESS: 3158 send_senddone_event(sock, &dev); 3159 break; 3160 } 3161 3162 dev = ISC_LIST_HEAD(sock->send_list); 3163 } 3164 3165finish: 3166 if (ISC_LIST_EMPTY(sock->send_list)) { 3167 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd, 3168 SELECT_POKE_WRITE); 3169 } 3170} 3171 3172static void 3173internal_fdwatch_write(isc_socket_t *sock) 3174{ 3175 int more_data; 3176 3177 INSIST(VALID_SOCKET(sock)); 3178 3179 isc_refcount_increment(&sock->references); 3180 UNLOCK(&sock->lock); 3181 3182 more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock, 3183 sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE); 3184 3185 LOCK(&sock->lock); 3186 3187 if (isc_refcount_decrement(&sock->references) == 0) { 3188 UNLOCK(&sock->lock); 3189 destroy(&sock); 3190 return; 3191 } 3192 3193 if (more_data) 3194 select_poke(sock->manager, sock->threadid, sock->fd, 3195 
SELECT_POKE_WRITE); 3196} 3197 3198static void 3199internal_fdwatch_read(isc_socket_t *sock) 3200{ 3201 int more_data; 3202 3203 INSIST(VALID_SOCKET(sock)); 3204 3205 isc_refcount_increment(&sock->references); 3206 UNLOCK(&sock->lock); 3207 3208 more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock, 3209 sock->fdwatcharg, ISC_SOCKFDWATCH_READ); 3210 3211 LOCK(&sock->lock); 3212 3213 if (isc_refcount_decrement(&sock->references) == 0) { 3214 UNLOCK(&sock->lock); 3215 destroy(&sock); 3216 return; 3217 } 3218 3219 if (more_data) 3220 select_poke(sock->manager, sock->threadid, sock->fd, 3221 SELECT_POKE_READ); 3222} 3223 3224/* 3225 * Process read/writes on each fd here. Avoid locking 3226 * and unlocking twice if both reads and writes are possible. 3227 */ 3228static void 3229process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) { 3230 isc_socket_t *sock; 3231 int lockid = FDLOCK_ID(fd); 3232 3233 /* 3234 * If the socket is going to be closed, don't do more I/O. 3235 */ 3236 LOCK(&thread->fdlock[lockid]); 3237 if (thread->fdstate[fd] == CLOSE_PENDING) { 3238 UNLOCK(&thread->fdlock[lockid]); 3239 3240 (void)unwatch_fd(thread, fd, SELECT_POKE_READ); 3241 (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE); 3242 return; 3243 } 3244 3245 sock = thread->fds[fd]; 3246 if (sock == NULL) { 3247 UNLOCK(&thread->fdlock[lockid]); 3248 return; 3249 } 3250 3251 LOCK(&sock->lock); 3252 3253 if (sock->fd < 0) { 3254 /* 3255 * Sock is being closed - the final external reference 3256 * is gone but it was not yet removed from event loop 3257 * and fdstate[]/fds[] as destroy() is waiting on 3258 * thread->fdlock[lockid] or sock->lock that we're holding. 3259 * Just release the locks and bail. 
 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	if (writeable) {
		if (sock->connecting) {
			/* Pending connect() has completed (or failed). */
			internal_connect(sock);
		} else {
			dispatch_send(sock);
		}
	}

	if (readable) {
		if (sock->listener) {
			internal_accept(sock); /* unlocks sock */
		} else {
			dispatch_recv(sock);
			UNLOCK(&sock->lock);
		}
	} else {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}

/*
 * process_fds is different for different event loops
 * it takes the events from event loops and for each FD
 * launches process_fd
 *
 * All variants skip the thread's control pipe (pipe_fds[0]) and handle
 * it last via process_ctlfd(); the returned bool is the "shutdown
 * requested" flag propagated from process_ctlfd().
 */
#ifdef USE_KQUEUE
static bool
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
	int i;
	bool readable, writable;
	bool done = false;
	bool have_ctlevent = false;
	if (nevents == thread->nevents) {
		/*
		 * This is not an error, but something unexpected. If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < thread->manager->maxsocks);
		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
			/* Control messages are drained after normal FDs. */
			have_ctlevent = true;
			continue;
		}
		/* kqueue reports read and write as separate filter events. */
		readable = (events[i].filter == EVFILT_READ);
		writable = (events[i].filter == EVFILT_WRITE);
		process_fd(thread, events[i].ident, readable, writable);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_EPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct epoll_event *events,
	    int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	if (nevents == thread->nevents) {
		/* See kqueue variant: may indicate ISC_SOCKET_MAXEVENTS
		 * is too small. */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
		if (events[i].data.fd == thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0)
		{
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway. This is a
			 * bit inefficient, but should be okay for such rare
			 * events. Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			int fd = events[i].data.fd;
			events[i].events |= thread->epoll_events[fd];
		}
		process_fd(thread, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_DEVPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	if (nevents == thread->nevents) {
		/* See kqueue variant: may indicate ISC_SOCKET_MAXEVENTS
		 * is too small. */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
		if (events[i].fd == thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		process_fd(thread, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_SELECT)
/*
 * select() variant: scans every descriptor up to maxfd; the caller
 * (netthread) checks the control pipe itself, so this returns void.
 */
static void
process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
	    fd_set *writefds) {
	int i;

	REQUIRE(maxfd <= (int)thread->manager->maxsocks);

	for (i = 0; i < maxfd; i++) {
		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
			continue;
		}
		process_fd(thread, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}
#endif /* ifdef USE_KQUEUE */

/*
 * Drain the control pipe: read (fd, msg) pairs until empty.
 * Returns true if a shutdown message was seen, false otherwise.
 */
static bool
process_ctlfd(isc__socketthread_t *thread) {
	int msg, fd;

	for (;;) {
		select_readmsg(thread, &fd, &msg);

		thread_log(thread, IOEVENT,
			   "watcher got message %d for socket %d", msg, fd);

		/*
		 * Nothing to read?
		 */
		if (msg == SELECT_POKE_NOTHING) {
			break;
		}

		/*
		 * Handle shutdown message. We really should
		 * jump out of this loop right away, but
		 * it doesn't matter if we have to do a little
		 * more work first.
		 */
		if (msg == SELECT_POKE_SHUTDOWN) {
			return (true);
		}

		/*
		 * This is a wakeup on a socket. Look
		 * at the event queue for both read and write,
		 * and decide if we need to watch on it now
		 * or not.
		 */
		wakeup_socket(thread, fd, msg);
	}

	return (false);
}

/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, do whatever's necessary and post
 * an event to the task that was requesting the action.
 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	/* manager is only referenced in the USE_SELECT branch below. */
	(void)manager;
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here. This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Inner loop: retry the wait call on soft errors (cc < 0). */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					/* Fallback cap when the rlimit
					 * cannot be read. */
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			/* Two passes: retry once after refreshing open_max
			 * if DP_POLL rejects the fd count with EINVAL. */
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped. Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/*
			 * Workaround state machine for a reported kernel bug
			 * where DP_POLL can time out spuriously:
			 * active -> checking -> idle on timeouts, back to
			 * active on real events.
			 */
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}

/* Set the number of descriptors reserved (kept unavailable to sockets). */
void
isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}

/* Set the maximum UDP size limit for this manager. */
void
isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}

/*
 * Setup socket thread, thread->manager and thread->threadid must be filled.
3648 */ 3649 3650static isc_result_t 3651setup_thread(isc__socketthread_t *thread) { 3652 isc_result_t result = ISC_R_SUCCESS; 3653 int i; 3654 char strbuf[ISC_STRERRORSIZE]; 3655 3656 REQUIRE(thread != NULL); 3657 REQUIRE(VALID_MANAGER(thread->manager)); 3658 REQUIRE(thread->threadid >= 0 && 3659 thread->threadid < thread->manager->nthreads); 3660 3661 thread->fds = 3662 isc_mem_get(thread->manager->mctx, 3663 thread->manager->maxsocks * sizeof(isc_socket_t *)); 3664 3665 memset(thread->fds, 0, 3666 thread->manager->maxsocks * sizeof(isc_socket_t *)); 3667 3668 thread->fdstate = isc_mem_get(thread->manager->mctx, 3669 thread->manager->maxsocks * sizeof(int)); 3670 3671 memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int)); 3672 3673 thread->fdlock = isc_mem_get(thread->manager->mctx, 3674 FDLOCK_COUNT * sizeof(isc_mutex_t)); 3675 3676 for (i = 0; i < FDLOCK_COUNT; i++) { 3677 isc_mutex_init(&thread->fdlock[i]); 3678 } 3679 3680 if (pipe(thread->pipe_fds) != 0) { 3681 strerror_r(errno, strbuf, sizeof(strbuf)); 3682 UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s", 3683 strbuf); 3684 return (ISC_R_UNEXPECTED); 3685 } 3686 RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS); 3687 3688#ifdef USE_KQUEUE 3689 thread->nevents = ISC_SOCKET_MAXEVENTS; 3690 thread->events = isc_mem_get(thread->manager->mctx, 3691 sizeof(struct kevent) * thread->nevents); 3692 3693 thread->kqueue_fd = kqueue(); 3694 if (thread->kqueue_fd == -1) { 3695 result = isc__errno2result(errno); 3696 strerror_r(errno, strbuf, sizeof(strbuf)); 3697 UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s", 3698 strbuf); 3699 isc_mem_put(thread->manager->mctx, thread->events, 3700 sizeof(struct kevent) * thread->nevents); 3701 return (result); 3702 } 3703 3704 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ); 3705 if (result != ISC_R_SUCCESS) { 3706 close(thread->kqueue_fd); 3707 isc_mem_put(thread->manager->mctx, thread->events, 3708 sizeof(struct 
kevent) * thread->nevents); 3709 } 3710 return (result); 3711 3712#elif defined(USE_EPOLL) 3713 thread->nevents = ISC_SOCKET_MAXEVENTS; 3714 thread->epoll_events = 3715 isc_mem_get(thread->manager->mctx, 3716 (thread->manager->maxsocks * sizeof(uint32_t))); 3717 3718 memset(thread->epoll_events, 0, 3719 thread->manager->maxsocks * sizeof(uint32_t)); 3720 3721 thread->events = 3722 isc_mem_get(thread->manager->mctx, 3723 sizeof(struct epoll_event) * thread->nevents); 3724 3725 thread->epoll_fd = epoll_create(thread->nevents); 3726 if (thread->epoll_fd == -1) { 3727 result = isc__errno2result(errno); 3728 strerror_r(errno, strbuf, sizeof(strbuf)); 3729 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s", 3730 strbuf); 3731 return (result); 3732 } 3733 3734 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ); 3735 return (result); 3736 3737#elif defined(USE_DEVPOLL) 3738 thread->nevents = ISC_SOCKET_MAXEVENTS; 3739 result = isc_resource_getcurlimit(isc_resource_openfiles, 3740 &thread->open_max); 3741 if (result != ISC_R_SUCCESS) { 3742 thread->open_max = 64; 3743 } 3744 thread->calls = 0; 3745 thread->events = isc_mem_get(thread->manager->mctx, 3746 sizeof(struct pollfd) * thread->nevents); 3747 3748 /* 3749 * Note: fdpollinfo should be able to support all possible FDs, so 3750 * it must have maxsocks entries (not nevents). 
3751 */ 3752 thread->fdpollinfo = 3753 isc_mem_get(thread->manager->mctx, 3754 sizeof(pollinfo_t) * thread->manager->maxsocks); 3755 memset(thread->fdpollinfo, 0, 3756 sizeof(pollinfo_t) * thread->manager->maxsocks); 3757 thread->devpoll_fd = open("/dev/poll", O_RDWR); 3758 if (thread->devpoll_fd == -1) { 3759 result = isc__errno2result(errno); 3760 strerror_r(errno, strbuf, sizeof(strbuf)); 3761 UNEXPECTED_ERROR(__FILE__, __LINE__, 3762 "open(/dev/poll) failed: %s", strbuf); 3763 isc_mem_put(thread->manager->mctx, thread->events, 3764 sizeof(struct pollfd) * thread->nevents); 3765 isc_mem_put(thread->manager->mctx, thread->fdpollinfo, 3766 sizeof(pollinfo_t) * thread->manager->maxsocks); 3767 return (result); 3768 } 3769 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ); 3770 if (result != ISC_R_SUCCESS) { 3771 close(thread->devpoll_fd); 3772 isc_mem_put(thread->manager->mctx, thread->events, 3773 sizeof(struct pollfd) * thread->nevents); 3774 isc_mem_put(thread->manager->mctx, thread->fdpollinfo, 3775 sizeof(pollinfo_t) * thread->manager->maxsocks); 3776 return (result); 3777 } 3778 3779 return (ISC_R_SUCCESS); 3780#elif defined(USE_SELECT) 3781 UNUSED(result); 3782 3783#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE 3784 /* 3785 * Note: this code should also cover the case of MAXSOCKETS <= 3786 * FD_SETSIZE, but we separate the cases to avoid possible portability 3787 * issues regarding howmany() and the actual representation of fd_set. 
3788 */ 3789 thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) * 3790 sizeof(fd_mask); 3791#else /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */ 3792 thread->fd_bufsize = sizeof(fd_set); 3793#endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */ 3794 3795 thread->read_fds = isc_mem_get(thread->manager->mctx, 3796 thread->fd_bufsize); 3797 thread->read_fds_copy = isc_mem_get(thread->manager->mctx, 3798 thread->fd_bufsize); 3799 thread->write_fds = isc_mem_get(thread->manager->mctx, 3800 thread->fd_bufsize); 3801 thread->write_fds_copy = isc_mem_get(thread->manager->mctx, 3802 thread->fd_bufsize); 3803 memset(thread->read_fds, 0, thread->fd_bufsize); 3804 memset(thread->write_fds, 0, thread->fd_bufsize); 3805 3806 (void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ); 3807 thread->maxfd = thread->pipe_fds[0]; 3808 3809 return (ISC_R_SUCCESS); 3810#endif /* USE_KQUEUE */ 3811} 3812 3813static void 3814cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) { 3815 isc_result_t result; 3816 int i; 3817 3818 result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ); 3819 if (result != ISC_R_SUCCESS) { 3820 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed"); 3821 } 3822#ifdef USE_KQUEUE 3823 close(thread->kqueue_fd); 3824 isc_mem_put(mctx, thread->events, 3825 sizeof(struct kevent) * thread->nevents); 3826#elif defined(USE_EPOLL) 3827 close(thread->epoll_fd); 3828 3829 isc_mem_put(mctx, thread->events, 3830 sizeof(struct epoll_event) * thread->nevents); 3831#elif defined(USE_DEVPOLL) 3832 close(thread->devpoll_fd); 3833 isc_mem_put(mctx, thread->events, 3834 sizeof(struct pollfd) * thread->nevents); 3835 isc_mem_put(mctx, thread->fdpollinfo, 3836 sizeof(pollinfo_t) * thread->manager->maxsocks); 3837#elif defined(USE_SELECT) 3838 if (thread->read_fds != NULL) { 3839 isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize); 3840 } 3841 if (thread->read_fds_copy != NULL) { 3842 isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize); 3843 
} 3844 if (thread->write_fds != NULL) { 3845 isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize); 3846 } 3847 if (thread->write_fds_copy != NULL) { 3848 isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize); 3849 } 3850#endif /* USE_KQUEUE */ 3851 for (i = 0; i < (int)thread->manager->maxsocks; i++) { 3852 if (thread->fdstate[i] == CLOSE_PENDING) { 3853 /* no need to lock */ 3854 (void)close(i); 3855 } 3856 } 3857 3858#if defined(USE_EPOLL) 3859 isc_mem_put(thread->manager->mctx, thread->epoll_events, 3860 thread->manager->maxsocks * sizeof(uint32_t)); 3861#endif /* if defined(USE_EPOLL) */ 3862 isc_mem_put(thread->manager->mctx, thread->fds, 3863 thread->manager->maxsocks * sizeof(isc_socket_t *)); 3864 isc_mem_put(thread->manager->mctx, thread->fdstate, 3865 thread->manager->maxsocks * sizeof(int)); 3866 3867 for (i = 0; i < FDLOCK_COUNT; i++) { 3868 isc_mutex_destroy(&thread->fdlock[i]); 3869 } 3870 isc_mem_put(thread->manager->mctx, thread->fdlock, 3871 FDLOCK_COUNT * sizeof(isc_mutex_t)); 3872} 3873 3874isc_result_t 3875isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { 3876 return (isc_socketmgr_create2(mctx, managerp, 0, 1)); 3877} 3878 3879isc_result_t 3880isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, 3881 unsigned int maxsocks, int nthreads) { 3882 int i; 3883 isc_socketmgr_t *manager; 3884 3885 REQUIRE(managerp != NULL && *managerp == NULL); 3886 3887 if (maxsocks == 0) { 3888 maxsocks = ISC_SOCKET_MAXSOCKETS; 3889 } 3890 3891 manager = isc_mem_get(mctx, sizeof(*manager)); 3892 3893 /* zero-clear so that necessary cleanup on failure will be easy */ 3894 memset(manager, 0, sizeof(*manager)); 3895 manager->maxsocks = maxsocks; 3896 manager->reserved = 0; 3897 manager->maxudp = 0; 3898 manager->nthreads = nthreads; 3899 manager->stats = NULL; 3900 3901 manager->magic = SOCKET_MANAGER_MAGIC; 3902 manager->mctx = NULL; 3903 ISC_LIST_INIT(manager->socklist); 3904 isc_mutex_init(&manager->lock); 3905 
isc_condition_init(&manager->shutdown_ok); 3906 3907 /* 3908 * Start up the select/poll thread. 3909 */ 3910 manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) * 3911 manager->nthreads); 3912 isc_mem_attach(mctx, &manager->mctx); 3913 3914 for (i = 0; i < manager->nthreads; i++) { 3915 manager->threads[i].manager = manager; 3916 manager->threads[i].threadid = i; 3917 setup_thread(&manager->threads[i]); 3918 isc_thread_create(netthread, &manager->threads[i], 3919 &manager->threads[i].thread); 3920 char tname[1024]; 3921 sprintf(tname, "sock-%d", i); 3922 isc_thread_setname(manager->threads[i].thread, tname); 3923 } 3924 3925 *managerp = manager; 3926 3927 return (ISC_R_SUCCESS); 3928} 3929 3930isc_result_t 3931isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) { 3932 REQUIRE(VALID_MANAGER(manager)); 3933 REQUIRE(nsockp != NULL); 3934 3935 *nsockp = manager->maxsocks; 3936 3937 return (ISC_R_SUCCESS); 3938} 3939 3940void 3941isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) { 3942 REQUIRE(VALID_MANAGER(manager)); 3943 REQUIRE(ISC_LIST_EMPTY(manager->socklist)); 3944 REQUIRE(manager->stats == NULL); 3945 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max); 3946 3947 isc_stats_attach(stats, &manager->stats); 3948} 3949 3950void 3951isc_socketmgr_destroy(isc_socketmgr_t **managerp) { 3952 isc_socketmgr_t *manager; 3953 3954 /* 3955 * Destroy a socket manager. 3956 */ 3957 3958 REQUIRE(managerp != NULL); 3959 manager = *managerp; 3960 REQUIRE(VALID_MANAGER(manager)); 3961 3962 LOCK(&manager->lock); 3963 3964 /* 3965 * Wait for all sockets to be destroyed. 3966 */ 3967 while (!ISC_LIST_EMPTY(manager->socklist)) { 3968 manager_log(manager, CREATION, "sockets exist"); 3969 WAIT(&manager->shutdown_ok, &manager->lock); 3970 } 3971 3972 UNLOCK(&manager->lock); 3973 3974 /* 3975 * Here, poke our select/poll thread. Do this by closing the write 3976 * half of the pipe, which will send EOF to the read half. 
3977 * This is currently a no-op in the non-threaded case. 3978 */ 3979 for (int i = 0; i < manager->nthreads; i++) { 3980 select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN); 3981 } 3982 3983 /* 3984 * Wait for thread to exit. 3985 */ 3986 for (int i = 0; i < manager->nthreads; i++) { 3987 isc_thread_join(manager->threads[i].thread, NULL); 3988 cleanup_thread(manager->mctx, &manager->threads[i]); 3989 } 3990 /* 3991 * Clean up. 3992 */ 3993 isc_mem_put(manager->mctx, manager->threads, 3994 sizeof(isc__socketthread_t) * manager->nthreads); 3995 (void)isc_condition_destroy(&manager->shutdown_ok); 3996 3997 if (manager->stats != NULL) { 3998 isc_stats_detach(&manager->stats); 3999 } 4000 isc_mutex_destroy(&manager->lock); 4001 manager->magic = 0; 4002 isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager)); 4003 4004 *managerp = NULL; 4005} 4006 4007static isc_result_t 4008socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 4009 unsigned int flags) { 4010 int io_state; 4011 bool have_lock = false; 4012 isc_task_t *ntask = NULL; 4013 isc_result_t result = ISC_R_SUCCESS; 4014 4015 dev->ev_sender = task; 4016 4017 if (sock->type == isc_sockettype_udp) { 4018 io_state = doio_recv(sock, dev); 4019 } else { 4020 LOCK(&sock->lock); 4021 have_lock = true; 4022 4023 if (ISC_LIST_EMPTY(sock->recv_list)) { 4024 io_state = doio_recv(sock, dev); 4025 } else { 4026 io_state = DOIO_SOFT; 4027 } 4028 } 4029 4030 switch (io_state) { 4031 case DOIO_SOFT: 4032 /* 4033 * We couldn't read all or part of the request right now, so 4034 * queue it. 4035 * 4036 * Attach to socket and to task 4037 */ 4038 isc_task_attach(task, &ntask); 4039 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 4040 4041 if (!have_lock) { 4042 LOCK(&sock->lock); 4043 have_lock = true; 4044 } 4045 4046 /* 4047 * Enqueue the request. If the socket was previously not being 4048 * watched, poke the watcher to start paying attention to it. 
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		FALLTHROUGH;

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Without IMMEDIATE the completion event is posted here;
		 * with IMMEDIATE the caller consumes the result directly. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}

/*
 * Public receive: allocate a RECVDONE event for (action, arg) and
 * delegate to isc_socket_recv2().
 */
isc_result_t
isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
		isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc_socketevent_t *dev;
	isc_socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
				   action, arg);
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}

	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
}

/*
 * Receive with a caller-supplied event: initialize the event fields
 * (minimum == 0 means "fill the whole region") and start the receive.
 */
isc_result_t
isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
		 isc_task_t *task, isc_socketevent_t *event,
		 unsigned int flags) {
	event->ev_sender = sock;
	event->result = ISC_R_UNSET;
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes = 0;

	/*
	 * UDP sockets are always partial read.
	 */
	if (sock->type == isc_sockettype_udp) {
		event->minimum = 1;
	} else {
		if (minimum == 0) {
			event->minimum = region->length;
		} else {
			event->minimum = minimum;
		}
	}

	return (socket_recv(sock, event, task, flags));
}

/*
 * Common send path, the mirror of socket_recv(): try the I/O
 * immediately when allowed, otherwise queue the event and wake the
 * watcher for this socket's fd.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/* Preserve ordering: only send now if nothing is queued. */
		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request. If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* NORETRY: fall through and complete (or drop) now. */
		FALLTHROUGH;

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}

isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		isc_taskaction_t action, void *arg) {
	/*
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
}

/*
 * Public send: allocate a SENDDONE event for (action, arg) and start
 * the send via socket_send() with no flags.
 */
isc_result_t
isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		  isc_taskaction_t action, void *arg,
		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
	isc_socketevent_t *dev;
	isc_socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(region != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
				   action, arg);
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}

	dev->region = *region;

	return (socket_send(sock, dev, task, address, pktinfo, 0));
}

/*
 * Send with a caller-supplied event; ISC_SOCKFLAG_NORETRY is only
 * permitted on UDP sockets.
 */
isc_result_t
isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		   isc_socketevent_t *event, unsigned int flags) {
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
		0);
	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
		REQUIRE(sock->type == isc_sockettype_udp);
	}
	event->ev_sender = sock;
	event->result = ISC_R_UNSET;
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;

	return (socket_send(sock, event, task, address, pktinfo, flags));
}

/*
 * Remove a stale UNIX-domain socket file.  With active == true the
 * path is unlinked unconditionally (we own it); otherwise it is only
 * unlinked after a probe connect() shows no live listener.
 */
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

/* Supply S_ISSOCK/S_ISFIFO on platforms whose <sys/stat.h> lacks them. */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			if (active) { /* We exited cleanly last time */
				break;
			}
			FALLTHROUGH;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		/* Refuse to unlink anything that isn't a socket/FIFO. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/* Probe: only unlink if connect() proves no one is listening. */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
#else /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}

/*
 * Apply mode/owner/group to a UNIX-domain socket path (or, with
 * NEED_SECURE_DIRECTORY, its containing directory).
 */
isc_result_t
isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
		    uint32_t owner, uint32_t group) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
char path[sizeof(sockaddr->type.sunix.sun_path)]; 4421#ifdef NEED_SECURE_DIRECTORY 4422 char *slash; 4423#endif /* ifdef NEED_SECURE_DIRECTORY */ 4424 4425 REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX); 4426 INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path)); 4427 strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path)); 4428 4429#ifdef NEED_SECURE_DIRECTORY 4430 slash = strrchr(path, '/'); 4431 if (slash != NULL) { 4432 if (slash != path) { 4433 *slash = '\0'; 4434 } else { 4435 strlcpy(path, "/", sizeof(path)); 4436 } 4437 } else { 4438 strlcpy(path, ".", sizeof(path)); 4439 } 4440#endif /* ifdef NEED_SECURE_DIRECTORY */ 4441 4442 if (chmod(path, perm) < 0) { 4443 strerror_r(errno, strbuf, sizeof(strbuf)); 4444 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4445 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4446 "isc_socket_permunix: chmod(%s, %d): %s", path, 4447 perm, strbuf); 4448 result = ISC_R_FAILURE; 4449 } 4450 if (chown(path, owner, group) < 0) { 4451 strerror_r(errno, strbuf, sizeof(strbuf)); 4452 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4453 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4454 "isc_socket_permunix: chown(%s, %d, %d): %s", 4455 path, owner, group, strbuf); 4456 result = ISC_R_FAILURE; 4457 } 4458 return (result); 4459#else /* ifdef ISC_PLATFORM_HAVESYSUNH */ 4460 UNUSED(sockaddr); 4461 UNUSED(perm); 4462 UNUSED(owner); 4463 UNUSED(group); 4464 return (ISC_R_NOTIMPLEMENTED); 4465#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */ 4466} 4467 4468isc_result_t 4469isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr, 4470 isc_socket_options_t options) { 4471 char strbuf[ISC_STRERRORSIZE]; 4472 int on = 1; 4473 4474 REQUIRE(VALID_SOCKET(sock)); 4475 4476 LOCK(&sock->lock); 4477 4478 INSIST(!sock->bound); 4479 INSIST(!sock->dupped); 4480 4481 if (sock->pf != sockaddr->type.sa.sa_family) { 4482 UNLOCK(&sock->lock); 4483 return (ISC_R_FAMILYMISMATCH); 4484 } 4485 4486 /* 4487 * Only set SO_REUSEADDR when we want a specific port. 
4488 */ 4489#ifdef AF_UNIX 4490 if (sock->pf == AF_UNIX) { 4491 goto bind_socket; 4492 } 4493#endif /* ifdef AF_UNIX */ 4494 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 && 4495 isc_sockaddr_getport(sockaddr) != (in_port_t)0) 4496 { 4497 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, 4498 sizeof(on)) < 0) 4499 { 4500 UNEXPECTED_ERROR(__FILE__, __LINE__, 4501 "setsockopt(%d) failed", sock->fd); 4502 } 4503#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) 4504 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB, 4505 (void *)&on, sizeof(on)) < 0) 4506 { 4507 UNEXPECTED_ERROR(__FILE__, __LINE__, 4508 "setsockopt(%d) failed", sock->fd); 4509 } 4510#elif defined(__linux__) && defined(SO_REUSEPORT) 4511 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on, 4512 sizeof(on)) < 0) 4513 { 4514 UNEXPECTED_ERROR(__FILE__, __LINE__, 4515 "setsockopt(%d) failed", sock->fd); 4516 } 4517#endif /* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */ 4518 /* Press on... */ 4519 } 4520#ifdef AF_UNIX 4521bind_socket: 4522#endif /* ifdef AF_UNIX */ 4523 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) { 4524 inc_stats(sock->manager->stats, 4525 sock->statsindex[STATID_BINDFAIL]); 4526 4527 UNLOCK(&sock->lock); 4528 switch (errno) { 4529 case EACCES: 4530 return (ISC_R_NOPERM); 4531 case EADDRNOTAVAIL: 4532 return (ISC_R_ADDRNOTAVAIL); 4533 case EADDRINUSE: 4534 return (ISC_R_ADDRINUSE); 4535 case EINVAL: 4536 return (ISC_R_BOUND); 4537 default: 4538 strerror_r(errno, strbuf, sizeof(strbuf)); 4539 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s", 4540 strbuf); 4541 return (ISC_R_UNEXPECTED); 4542 } 4543 } 4544 4545 socket_log(sock, sockaddr, TRACE, "bound"); 4546 sock->bound = 1; 4547 4548 UNLOCK(&sock->lock); 4549 return (ISC_R_SUCCESS); 4550} 4551 4552/* 4553 * Enable this only for specific OS versions, and only when they have repaired 4554 * their problems with it. 
 * Until then, this is broken and needs to be
 * disabled by default. See RT22589 for details.
 */
#undef ENABLE_ACCEPTFILTER

/*%
 * Install a BSD accept filter ('filter', e.g. "dataready") on a
 * listening socket.  Because ENABLE_ACCEPTFILTER is forced off above,
 * this currently always returns ISC_R_NOTIMPLEMENTED unless that
 * #undef is removed on a platform that also defines SO_ACCEPTFILTER.
 */
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}

/*
 * Try enabling TCP Fast Open for a given socket if the OS supports it.
 */
static void
set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

/*
 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
 * shipping a default kernel without TFO support, so we special-case it by
 * performing an additional runtime check for TFO support using sysctl to
 * prevent setsockopt() errors from being logged.
 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

	/*
	 * TCP_FASTOPEN takes the maximum queue length of pending TFO
	 * requests; use half the listen backlog (minimum 1).  On macOS
	 * the option is a boolean, so pass 1.
	 */
#ifdef __APPLE__
	backlog = 1;
#else /* ifdef __APPLE__ */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}

/*
 * Set up to listen on a given socket.  We do this by creating an internal
 * event that will be dispatched when the socket has read activity.  The
 * watcher will send the internal event to the task when there is a new
 * connection.
 *
 * Unlike in read, we don't preallocate a done event here.
 * Every time there
 * is a new connection we'll have to allocate a new one anyway, so we might
 * as well keep things simple rather than having to track them.
 */
isc_result_t
isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(!sock->listener);
	REQUIRE(sock->bound);
	REQUIRE(sock->type == isc_sockettype_tcp ||
		sock->type == isc_sockettype_unix);

	/* 0 means "use the system default queue length". */
	if (backlog == 0) {
		backlog = SOMAXCONN;
	}

	if (listen(sock->fd, (int)backlog) < 0) {
		UNLOCK(&sock->lock);
		strerror_r(errno, strbuf, sizeof(strbuf));

		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);

		return (ISC_R_UNEXPECTED);
	}

	set_tcp_fastopen(sock, backlog);

	sock->listener = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * This should try to do aggressive accept() XXXMLG
 */
isc_result_t
isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
		  void *arg) {
	isc_socket_newconnev_t *dev;
	isc_socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc_socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* Task is shutting down: undo everything allocated above. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*%
 * Connect 'sock' to 'addr', delivering an ISC_SOCKEVENT_CONNECT event
 * with 'action'/'arg' to 'task' when the connect completes (which may
 * be immediately for UDP or an already-connected socket).  Multicast
 * destinations are rejected with ISC_R_MULTICAST.
 */
isc_result_t
isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc_socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(addr != NULL); /* NOTE(review): duplicates the check above. */

	if (isc_sockaddr_ismulticast(addr)) {
		return (ISC_R_MULTICAST);
	}

	LOCK(&sock->lock);

	dev = (isc_socket_connev_t *)isc_event_allocate(
		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	if (sock->connecting) {
		/* A connect to the same peer is already in flight; queue. */
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		goto queue;
	}

	if (sock->connected) {
		/* Already connected to this peer: report success at once. */
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		return (ISC_R_SUCCESS);
	}

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * The socket is nonblocking and the connection cannot be
		 * completed immediately.  It is possible to select(2) or
		 * poll(2) for completion by selecting the socket for writing.
		 * After select(2) indicates writability, use getsockopt(2) to
		 * read the SO_ERROR option at level SOL_SOCKET to determine
		 * whether connect() completed successfully (SO_ERROR is zero)
		 * or unsuccessfully (SO_ERROR is one of the usual error codes
		 * listed here, explaining the reason for the failure).
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			goto queue;
		}

		switch (errno) {
#define ERROR_MATCH(a, b)                                                     \
	case a:                                                               \
		dev->result = b;                                              \
		goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: log it and fail the request. */
		sock->connected = 0;

		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		sock->connected = 0;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
	if (do_poke && !sock->connecting) {
		sock->connecting = 1;
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_CONNECT);
	}

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * Called when a socket with a pending connect() finishes.
 */
static void
internal_connect(isc_socket_t *sock) {
	isc_socket_connev_t *dev;
	int cc;
	isc_result_t result;
	socklen_t optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	/*
	 * Get the first item off the connect list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->connect_list);
	if (dev == NULL) {
		INSIST(!sock->connecting);
		goto finish;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
		       (void *)&optlen) != 0)
	{
		cc = errno;
	} else {
		errno = cc;
	}

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b)                                                     \
	case a:                                                               \
		result = b;                                                   \
		break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	/* Deliver the same result to every queued connect event. */
	do {
		dev->result = result;
		send_connectdone_event(sock, &dev);
		dev = ISC_LIST_HEAD(sock->connect_list);
	} while (dev != NULL);

finish:
	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
		   SELECT_POKE_CONNECT);
}

/*%
 * Copy the connected peer's address into '*addressp'; returns
 * ISC_R_NOTCONNECTED if the socket is not connected.
 */
isc_result_t
isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
	isc_result_t result;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (sock->connected) {
		*addressp = sock->peer_address;
		result = ISC_R_SUCCESS;
	} else {
		result = ISC_R_NOTCONNECTED;
	}

	UNLOCK(&sock->lock);

	return (result);
}

/*%
 * Fetch the socket's local address via getsockname() into '*addressp';
 * returns ISC_R_NOTBOUND if the socket has not been bound.
 */
isc_result_t
isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
	socklen_t len;
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (!sock->bound) {
		result = ISC_R_NOTBOUND;
		goto out;
	}

	result = ISC_R_SUCCESS;

	len = sizeof(addressp->type);
	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
		result = ISC_R_UNEXPECTED;
		goto out;
	}
	addressp->length = (unsigned int)len;

out:
	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list))
	{
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			/* task == NULL means "cancel for all tasks". */
			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list))
	{
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list))
	{
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/* Release the never-accepted child socket. */
				isc_refcount_decrementz(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc_socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}

/*% Return the socket's type (udp/tcp/unix/...). */
isc_sockettype_t
isc_socket_gettype(isc_socket_t *sock) {
	REQUIRE(VALID_SOCKET(sock));

	return (sock->type);
}

/*%
 * Set or clear the IPV6_V6ONLY option on an AF_INET6 socket.  A no-op
 * where IPV6_V6ONLY is unavailable, and on OpenBSD (excluded below).
 */
void
isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
#if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
	int onoff = yes ? 1 : 0;
#else /* if defined(IPV6_V6ONLY) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) */

	REQUIRE(VALID_SOCKET(sock));
	INSIST(!sock->dupped);

#if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* ifdef IPV6_V6ONLY */
}

/*%
 * Apply a DSCP code point to the socket: the 6-bit 'dscp' value is
 * shifted into the upper bits of the IPv4 TOS / IPv6 traffic-class
 * octet and set via IP_TOS or IPV6_TCLASS as appropriate.
 */
static void
setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
#if defined(IP_TOS) || defined(IPV6_TCLASS)
	int value = dscp << 2;
#endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */

	sock->dscp = dscp;

#ifdef IP_TOS
	if (sock->pf == AF_INET) {
		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
			       sizeof(value)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IP_TOS, %.02x) "
					 "failed: %s",
					 sock->fd, value >> 2, strbuf);
		}
	}
#endif /* ifdef IP_TOS */
#ifdef IPV6_TCLASS
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
			       (void *)&value, sizeof(value)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			/*
			 * NOTE(review): this logs dscp >> 2 while the IP_TOS
			 * branch above logs value >> 2 (== dscp); the two log
			 * messages look inconsistent -- verify which value was
			 * intended.
			 */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
					 "failed: %s",
					 sock->fd, dscp >> 2, strbuf);
		}
	}
#endif /* ifdef IPV6_TCLASS */
}

/*%
 * Public entry point for setting the socket's DSCP value.  Negative
 * values are ignored; if the -T dscp check value is set (see
 * isc_dscp_check_value above), the value must match it.
 */
void
isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40);

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}

/*% Allocate a socket event (thin wrapper around allocate_socketevent()). */
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
		       isc_taskaction_t action, void *arg) {
	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
}

void
isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
	/*
	 * Name 'sock'.
	 */

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	strlcpy(sock->name, name, sizeof(sock->name));
	sock->tag = tag;
	UNLOCK(&sock->lock);
}

/*% Return the name set by isc_socket_setname(). */
const char *
isc_socket_getname(isc_socket_t *sock) {
	return (sock->name);
}

/*% Return the tag set by isc_socket_setname(). */
void *
isc_socket_gettag(isc_socket_t *sock) {
	return (sock->tag);
}

/*%
 * Return the underlying file descriptor.
 * NOTE(review): the fd is narrowed through a cast to short before
 * being widened back to int -- presumably historical; verify this is
 * intentional for large descriptor values.
 */
int
isc_socket_getfd(isc_socket_t *sock) {
	return ((short)sock->fd);
}

static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
static bool hasreuseport = false;

/*%
 * One-time probe (via isc_once_do) that sets 'hasreuseport' if a
 * throwaway UDP socket accepts the load-balancing reuse-port option.
 */
static void
init_hasreuseport(void) {
/*
 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
 * We only want to use it on Linux, if it's available.  On BSD we want to dup()
 * sockets instead of re-binding them.
 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
	int sock, yes = 1;
	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0) {
		/* No IPv4 support?  Try IPv6 before giving up. */
		sock = socket(AF_INET6, SOCK_DGRAM, 0);
		if (sock < 0) {
			return;
		}
	}
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) < 0)
	{
		close(sock);
		return;
#if defined(__FreeBSD_kernel__)
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
			      sizeof(yes)) < 0)
#else /* if defined(__FreeBSD_kernel__) */
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
			      sizeof(yes)) < 0)
#endif /* if defined(__FreeBSD_kernel__) */
	{
		close(sock);
		return;
	}
	hasreuseport = true;
	close(sock);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}

/*% Report whether load-balancing SO_REUSEPORT(_LB) is usable here. */
bool
isc_socket_hasreuseport() {
	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
		      ISC_R_SUCCESS);
	return (hasreuseport);
}

#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
/*% Human-readable name for a socket type, used by the stats renderers. */
static const char *
_socktype(isc_sockettype_t type) {
	switch (type) {
	case isc_sockettype_udp:
		return ("udp");
	case isc_sockettype_tcp:
		return ("tcp");
	case isc_sockettype_unix:
		return ("unix");
	case isc_sockettype_fdwatch:
		return ("fdwatch");
	default:
		return ("not-initialized");
	}
}
#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */

#ifdef HAVE_LIBXML2
/* Evaluate a libxml2 call; on a negative return, jump to 'error'. */
#define TRY0(a)                         \
	do {                            \
		xmlrc = (a);            \
		if (xmlrc < 0)          \
			goto error;     \
	} while (0)
/*%
 * Render the manager's socket list as XML into 'writer0' (an
 * xmlTextWriterPtr).  Returns the last libxml2 return code (negative
 * on failure).
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/* On failure mid-list, 'sock' is still locked: release it. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */

#ifdef HAVE_JSON_C
/* Bail out to 'error' with ISC_R_NOMEMORY on a failed json-c allocation. */
#define CHECKMEM(m)                                \
	do {                                       \
		if (m == NULL) {                   \
			result = ISC_R_NOMEMORY;   \
			goto error;                \
		}                                  \
	} while (0)

/*%
 * Render the manager's socket list as a JSON array attached to
 * 'stats0' (a json_object) under the key "sockets".
 */
isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
	isc_result_t result = ISC_R_SUCCESS;
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats =
(json_object *)stats0; 5535 5536 CHECKMEM(array); 5537 5538 LOCK(&mgr->lock); 5539 5540 sock = ISC_LIST_HEAD(mgr->socklist); 5541 while (sock != NULL) { 5542 json_object *states, *entry = json_object_new_object(); 5543 char buf[255]; 5544 5545 CHECKMEM(entry); 5546 json_object_array_add(array, entry); 5547 5548 LOCK(&sock->lock); 5549 5550 snprintf(buf, sizeof(buf), "%p", sock); 5551 obj = json_object_new_string(buf); 5552 CHECKMEM(obj); 5553 json_object_object_add(entry, "id", obj); 5554 5555 if (sock->name[0] != 0) { 5556 obj = json_object_new_string(sock->name); 5557 CHECKMEM(obj); 5558 json_object_object_add(entry, "name", obj); 5559 } 5560 5561 obj = json_object_new_int( 5562 (int)isc_refcount_current(&sock->references)); 5563 CHECKMEM(obj); 5564 json_object_object_add(entry, "references", obj); 5565 5566 obj = json_object_new_string(_socktype(sock->type)); 5567 CHECKMEM(obj); 5568 json_object_object_add(entry, "type", obj); 5569 5570 if (sock->connected) { 5571 isc_sockaddr_format(&sock->peer_address, peerbuf, 5572 sizeof(peerbuf)); 5573 obj = json_object_new_string(peerbuf); 5574 CHECKMEM(obj); 5575 json_object_object_add(entry, "peer-address", obj); 5576 } 5577 5578 len = sizeof(addr); 5579 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) { 5580 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf)); 5581 obj = json_object_new_string(peerbuf); 5582 CHECKMEM(obj); 5583 json_object_object_add(entry, "local-address", obj); 5584 } 5585 5586 states = json_object_new_array(); 5587 CHECKMEM(states); 5588 json_object_object_add(entry, "states", states); 5589 5590 if (sock->listener) { 5591 obj = json_object_new_string("listener"); 5592 CHECKMEM(obj); 5593 json_object_array_add(states, obj); 5594 } 5595 5596 if (sock->connected) { 5597 obj = json_object_new_string("connected"); 5598 CHECKMEM(obj); 5599 json_object_array_add(states, obj); 5600 } 5601 5602 if (sock->connecting) { 5603 obj = json_object_new_string("connecting"); 5604 CHECKMEM(obj); 5605 
json_object_array_add(states, obj); 5606 } 5607 5608 if (sock->bound) { 5609 obj = json_object_new_string("bound"); 5610 CHECKMEM(obj); 5611 json_object_array_add(states, obj); 5612 } 5613 5614 UNLOCK(&sock->lock); 5615 sock = ISC_LIST_NEXT(sock, link); 5616 } 5617 5618 json_object_object_add(stats, "sockets", array); 5619 array = NULL; 5620 result = ISC_R_SUCCESS; 5621 5622error: 5623 if (array != NULL) { 5624 json_object_put(array); 5625 } 5626 5627 if (sock != NULL) { 5628 UNLOCK(&sock->lock); 5629 } 5630 5631 UNLOCK(&mgr->lock); 5632 5633 return (result); 5634} 5635#endif /* HAVE_JSON_C */ 5636 5637/* 5638 * Create a new 'type' socket managed by 'manager'. Events 5639 * will be posted to 'task' and when dispatched 'action' will be 5640 * called with 'arg' as the arg value. The new socket is returned 5641 * in 'socketp'. 5642 */ 5643isc_result_t 5644isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags, 5645 isc_sockfdwatch_t callback, void *cbarg, 5646 isc_task_t *task, isc_socket_t **socketp) 5647{ 5648 isc_socket_t *sock = NULL; 5649 isc__socketthread_t *thread; 5650 isc_result_t result; 5651 int lockid; 5652 5653 REQUIRE(VALID_MANAGER(manager)); 5654 REQUIRE(socketp != NULL && *socketp == NULL); 5655 5656 if (fd < 0 || (unsigned int)fd >= manager->maxsocks) 5657 return (ISC_R_RANGE); 5658 5659 result = allocate_socket(manager, isc_sockettype_fdwatch, &sock); 5660 if (result != ISC_R_SUCCESS) 5661 return (result); 5662 5663 sock->fd = fd; 5664 sock->fdwatcharg = cbarg; 5665 sock->fdwatchcb = callback; 5666 sock->fdwatchflags = flags; 5667 sock->fdwatchtask = task; 5668 5669 sock->threadid = gen_threadid(sock); 5670 isc_refcount_init(&sock->references, 1); 5671 thread = &manager->threads[sock->threadid]; 5672 *socketp = (isc_socket_t *)sock; 5673 5674 /* 5675 * Note we don't have to lock the socket like we normally would because 5676 * there are no external references to it yet. 
	 */

	/* Register the fd with its watcher thread's tables. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;

#if defined(USE_EPOLL)
	manager->epoll_events[sock->fd] = 0;
#endif
#ifdef USE_DEVPOLL
	/* A freshly managed fd must not already be armed in /dev/poll. */
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	/* select() needs the highest fd in the set. */
	if (thread->maxfd < sock->fd)
		thread->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Start watching in whichever direction(s) the caller asked for. */
	sock->active = 1;
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->threadid, sock->fd,
			    SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->threadid, sock->fd,
			    SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, "fdwatch-created");

	return (ISC_R_SUCCESS);
}

/*
 * Indicate to the manager that it should watch the socket again.
 * This can be used to restart watching if the previous event handler
 * didn't indicate there was more data to be processed.  Primarily
 * it is for writing but could be used for reading if desired
 */

isc_result_t
isc_socket_fdwatchpoke(isc_socket_t *sock, int flags)
{
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * We check both flags first to allow us to get the lock
	 * once but only if we need it.
	 */

	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
		LOCK(&sock->lock);
		if ((flags & ISC_SOCKFDWATCH_READ) != 0)
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		if ((flags & ISC_SOCKFDWATCH_WRITE) != 0)
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_WRITE);
		UNLOCK(&sock->lock);
	}

	/* Logged unconditionally, even when no flag was set. */
	socket_log(sock, NULL, TRACE, "fdwatch-poked flags: %d", flags);

	return (ISC_R_SUCCESS);
}