1/* 2 * Copyright (C) 2004-2012 Internet Systems Consortium, Inc. ("ISC") 3 * Copyright (C) 1998-2003 Internet Software Consortium. 4 * 5 * Permission to use, copy, modify, and/or distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 15 * PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18/* $Id$ */ 19 20/*! \file */ 21 22#include <config.h> 23 24#include <sys/param.h> 25#include <sys/types.h> 26#include <sys/socket.h> 27#include <sys/stat.h> 28#include <sys/time.h> 29#include <sys/uio.h> 30 31#include <errno.h> 32#include <fcntl.h> 33#include <stddef.h> 34#include <stdlib.h> 35#include <string.h> 36#include <unistd.h> 37 38#include <isc/buffer.h> 39#include <isc/bufferlist.h> 40#include <isc/condition.h> 41#include <isc/formatcheck.h> 42#include <isc/list.h> 43#include <isc/log.h> 44#include <isc/mem.h> 45#include <isc/msgs.h> 46#include <isc/mutex.h> 47#include <isc/net.h> 48#include <isc/once.h> 49#include <isc/platform.h> 50#include <isc/print.h> 51#include <isc/region.h> 52#include <isc/socket.h> 53#include <isc/stats.h> 54#include <isc/strerror.h> 55#include <isc/task.h> 56#include <isc/thread.h> 57#include <isc/util.h> 58#include <isc/xml.h> 59 60#ifdef ISC_PLATFORM_HAVESYSUNH 61#include <sys/un.h> 62#endif 63#ifdef ISC_PLATFORM_HAVEKQUEUE 64#include <sys/event.h> 65#endif 66#ifdef ISC_PLATFORM_HAVEEPOLL 67#include <sys/epoll.h> 68#endif 69#ifdef ISC_PLATFORM_HAVEDEVPOLL 70#if 
defined(HAVE_SYS_DEVPOLL_H) 71#include <sys/devpoll.h> 72#elif defined(HAVE_DEVPOLL_H) 73#include <devpoll.h> 74#endif 75#endif 76 77#include "errno2result.h" 78 79/* See task.c about the following definition: */ 80#ifdef BIND9 81#ifdef ISC_PLATFORM_USETHREADS 82#define USE_WATCHER_THREAD 83#else 84#define USE_SHARED_MANAGER 85#endif /* ISC_PLATFORM_USETHREADS */ 86#endif /* BIND9 */ 87 88#ifndef USE_WATCHER_THREAD 89#include "socket_p.h" 90#include "../task_p.h" 91#endif /* USE_WATCHER_THREAD */ 92 93#if defined(SO_BSDCOMPAT) && defined(__linux__) 94#include <sys/utsname.h> 95#endif 96 97/*% 98 * Choose the most preferable multiplex method. 99 */ 100#ifdef ISC_PLATFORM_HAVEKQUEUE 101#define USE_KQUEUE 102#elif defined (ISC_PLATFORM_HAVEEPOLL) 103#define USE_EPOLL 104#elif defined (ISC_PLATFORM_HAVEDEVPOLL) 105#define USE_DEVPOLL 106typedef struct { 107 unsigned int want_read : 1, 108 want_write : 1; 109} pollinfo_t; 110#else 111#define USE_SELECT 112#endif /* ISC_PLATFORM_HAVEKQUEUE */ 113 114#ifndef USE_WATCHER_THREAD 115#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 116struct isc_socketwait { 117 int nevents; 118}; 119#elif defined (USE_SELECT) 120struct isc_socketwait { 121 fd_set *readset; 122 fd_set *writeset; 123 int nfds; 124 int maxfd; 125}; 126#endif /* USE_KQUEUE */ 127#endif /* !USE_WATCHER_THREAD */ 128 129/*% 130 * Maximum number of allowable open sockets. This is also the maximum 131 * allowable socket file descriptor. 132 * 133 * Care should be taken before modifying this value for select(): 134 * The API standard doesn't ensure select() accept more than (the system default 135 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in 136 * the vast majority of cases. 
This constant should therefore be increased only 137 * when absolutely necessary and possible, i.e., the server is exhausting all 138 * available file descriptors (up to FD_SETSIZE) and the select() function 139 * and FD_xxx macros support larger values than FD_SETSIZE (which may not 140 * always by true, but we keep using some of them to ensure as much 141 * portability as possible). Note also that overall server performance 142 * may be rather worsened with a larger value of this constant due to 143 * inherent scalability problems of select(). 144 * 145 * As a special note, this value shouldn't have to be touched if 146 * this is a build for an authoritative only DNS server. 147 */ 148#ifndef ISC_SOCKET_MAXSOCKETS 149#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 150#define ISC_SOCKET_MAXSOCKETS 4096 151#elif defined(USE_SELECT) 152#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE 153#endif /* USE_KQUEUE... */ 154#endif /* ISC_SOCKET_MAXSOCKETS */ 155 156#ifdef USE_SELECT 157/*% 158 * Mac OS X needs a special definition to support larger values in select(). 159 * We always define this because a larger value can be specified run-time. 160 */ 161#ifdef __APPLE__ 162#define _DARWIN_UNLIMITED_SELECT 163#endif /* __APPLE__ */ 164#endif /* USE_SELECT */ 165 166#ifdef ISC_SOCKET_USE_POLLWATCH 167/*% 168 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel 169 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for 170 * some of the specified FD. The idea is based on the observation that it's 171 * likely for a busy server to keep receiving packets. It specifically works 172 * as follows: the socket watcher is first initialized with the state of 173 * "poll_idle". While it's in the idle state it keeps sleeping until a socket 174 * event occurs. When it wakes up for a socket I/O event, it moves to the 175 * poll_active state, and sets the poll timeout to a short period 176 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). 
If timeout occurs in this state, the 177 * watcher goes to the poll_checking state with the same timeout period. 178 * In this state, the watcher tries to detect whether this is a break 179 * during intermittent events or the kernel bug is triggered. If the next 180 * polling reports an event within the short period, the previous timeout is 181 * likely to be a kernel bug, and so the watcher goes back to the active state. 182 * Otherwise, it moves to the idle state again. 183 * 184 * It's not clear whether this is a thread-related bug, but since we've only 185 * seen this with threads, this workaround is used only when enabling threads. 186 */ 187 188typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; 189 190#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT 191#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 192#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ 193#endif /* ISC_SOCKET_USE_POLLWATCH */ 194 195/*% 196 * Size of per-FD lock buckets. 197 */ 198#ifdef ISC_PLATFORM_USETHREADS 199#define FDLOCK_COUNT 1024 200#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) 201#else 202#define FDLOCK_COUNT 1 203#define FDLOCK_ID(fd) 0 204#endif /* ISC_PLATFORM_USETHREADS */ 205 206/*% 207 * Maximum number of events communicated with the kernel. There should normally 208 * be no need for having a large number. 209 */ 210#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 211#ifndef ISC_SOCKET_MAXEVENTS 212#define ISC_SOCKET_MAXEVENTS 64 213#endif 214#endif 215 216/*% 217 * Some systems define the socket length argument as an int, some as size_t, 218 * some as socklen_t. This is here so it can be easily changed if needed. 219 */ 220#ifndef ISC_SOCKADDR_LEN_T 221#define ISC_SOCKADDR_LEN_T unsigned int 222#endif 223 224/*% 225 * Define what the possible "soft" errors can be. These are non-fatal returns 226 * of various network related functions, like recv() and so on. 
227 * 228 * For some reason, BSDI (and perhaps others) will sometimes return <0 229 * from recv() but will have errno==0. This is broken, but we have to 230 * work around it here. 231 */ 232#define SOFT_ERROR(e) ((e) == EAGAIN || \ 233 (e) == EWOULDBLOCK || \ 234 (e) == EINTR || \ 235 (e) == 0) 236 237#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x) 238 239/*!< 240 * DLVL(90) -- Function entry/exit and other tracing. 241 * DLVL(70) -- Socket "correctness" -- including returning of events, etc. 242 * DLVL(60) -- Socket data send/receive 243 * DLVL(50) -- Event tracing, including receiving/sending completion events. 244 * DLVL(20) -- Socket creation/destruction. 245 */ 246#define TRACE_LEVEL 90 247#define CORRECTNESS_LEVEL 70 248#define IOEVENT_LEVEL 60 249#define EVENT_LEVEL 50 250#define CREATION_LEVEL 20 251 252#define TRACE DLVL(TRACE_LEVEL) 253#define CORRECTNESS DLVL(CORRECTNESS_LEVEL) 254#define IOEVENT DLVL(IOEVENT_LEVEL) 255#define EVENT DLVL(EVENT_LEVEL) 256#define CREATION DLVL(CREATION_LEVEL) 257 258typedef isc_event_t intev_t; 259 260#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o') 261#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC) 262 263/*! 264 * IPv6 control information. If the socket is an IPv6 socket we want 265 * to collect the destination address and interface so the client can 266 * set them on outgoing packets. 267 */ 268#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 269#ifndef USE_CMSG 270#define USE_CMSG 1 271#endif 272#endif 273 274/*% 275 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have 276 * a setsockopt() like interface to request timestamps, and if the OS 277 * doesn't do it for us, call gettimeofday() on every UDP receive? 278 */ 279#ifdef SO_TIMESTAMP 280#ifndef USE_CMSG 281#define USE_CMSG 1 282#endif 283#endif 284 285/*% 286 * The size to raise the receive buffer to (from BIND 8). 
287 */ 288#define RCVBUFSIZE (32*1024) 289 290/*% 291 * The number of times a send operation is repeated if the result is EINTR. 292 */ 293#define NRETRIES 10 294 295typedef struct isc__socket isc__socket_t; 296typedef struct isc__socketmgr isc__socketmgr_t; 297 298#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket) 299 300struct isc__socket { 301 /* Not locked. */ 302 isc_socket_t common; 303 isc__socketmgr_t *manager; 304 isc_mutex_t lock; 305 isc_sockettype_t type; 306 const isc_statscounter_t *statsindex; 307 308 /* Locked by socket lock. */ 309 ISC_LINK(isc__socket_t) link; 310 unsigned int references; 311 int fd; 312 int pf; 313 char name[16]; 314 void * tag; 315 316 ISC_LIST(isc_socketevent_t) send_list; 317 ISC_LIST(isc_socketevent_t) recv_list; 318 ISC_LIST(isc_socket_newconnev_t) accept_list; 319 isc_socket_connev_t *connect_ev; 320 321 /* 322 * Internal events. Posted when a descriptor is readable or 323 * writable. These are statically allocated and never freed. 324 * They will be set to non-purgable before use. 325 */ 326 intev_t readable_ev; 327 intev_t writable_ev; 328 329 isc_sockaddr_t peer_address; /* remote address */ 330 331 unsigned int pending_recv : 1, 332 pending_send : 1, 333 pending_accept : 1, 334 listener : 1, /* listener socket */ 335 connected : 1, 336 connecting : 1, /* connect pending */ 337 bound : 1; /* bound to local addr */ 338 339#ifdef ISC_NET_RECVOVERFLOW 340 unsigned char overflow; /* used for MSG_TRUNC fake */ 341#endif 342 343 char *recvcmsgbuf; 344 ISC_SOCKADDR_LEN_T recvcmsgbuflen; 345 char *sendcmsgbuf; 346 ISC_SOCKADDR_LEN_T sendcmsgbuflen; 347 348 void *fdwatcharg; 349 isc_sockfdwatch_t fdwatchcb; 350 int fdwatchflags; 351 isc_task_t *fdwatchtask; 352}; 353 354#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g') 355#define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC) 356 357struct isc__socketmgr { 358 /* Not locked. 
*/ 359 isc_socketmgr_t common; 360 isc_mem_t *mctx; 361 isc_mutex_t lock; 362 isc_mutex_t *fdlock; 363 isc_stats_t *stats; 364#ifdef USE_KQUEUE 365 int kqueue_fd; 366 int nevents; 367 struct kevent *events; 368#endif /* USE_KQUEUE */ 369#ifdef USE_EPOLL 370 int epoll_fd; 371 int nevents; 372 struct epoll_event *events; 373#endif /* USE_EPOLL */ 374#ifdef USE_DEVPOLL 375 int devpoll_fd; 376 int nevents; 377 struct pollfd *events; 378#endif /* USE_DEVPOLL */ 379#ifdef USE_SELECT 380 int fd_bufsize; 381#endif /* USE_SELECT */ 382 unsigned int maxsocks; 383#ifdef ISC_PLATFORM_USETHREADS 384 int pipe_fds[2]; 385#endif 386 387 /* Locked by fdlock. */ 388 isc__socket_t **fds; 389 int *fdstate; 390#ifdef USE_DEVPOLL 391 pollinfo_t *fdpollinfo; 392#endif 393 394 /* Locked by manager lock. */ 395 ISC_LIST(isc__socket_t) socklist; 396#ifdef USE_SELECT 397 fd_set *read_fds; 398 fd_set *read_fds_copy; 399 fd_set *write_fds; 400 fd_set *write_fds_copy; 401 int maxfd; 402#endif /* USE_SELECT */ 403 int reserved; /* unlocked */ 404#ifdef USE_WATCHER_THREAD 405 isc_thread_t watcher; 406 isc_condition_t shutdown_ok; 407#else /* USE_WATCHER_THREAD */ 408 unsigned int refs; 409#endif /* USE_WATCHER_THREAD */ 410 int maxudp; 411}; 412 413#ifdef USE_SHARED_MANAGER 414static isc__socketmgr_t *socketmgr = NULL; 415#endif /* USE_SHARED_MANAGER */ 416 417#define CLOSED 0 /* this one must be zero */ 418#define MANAGED 1 419#define CLOSE_PENDING 2 420 421/* 422 * send() and recv() iovec counts 423 */ 424#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER) 425#ifdef ISC_NET_RECVOVERFLOW 426# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1) 427#else 428# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER) 429#endif 430 431static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **); 432static void send_senddone_event(isc__socket_t *, isc_socketevent_t **); 433static void free_socket(isc__socket_t **); 434static isc_result_t allocate_socket(isc__socketmgr_t 
*, isc_sockettype_t, 435 isc__socket_t **); 436static void destroy(isc__socket_t **); 437static void internal_accept(isc_task_t *, isc_event_t *); 438static void internal_connect(isc_task_t *, isc_event_t *); 439static void internal_recv(isc_task_t *, isc_event_t *); 440static void internal_send(isc_task_t *, isc_event_t *); 441static void internal_fdwatch_write(isc_task_t *, isc_event_t *); 442static void internal_fdwatch_read(isc_task_t *, isc_event_t *); 443static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *); 444static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *, 445 struct msghdr *, struct iovec *, size_t *); 446static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *, 447 struct msghdr *, struct iovec *, size_t *); 448#ifdef USE_WATCHER_THREAD 449static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager); 450#endif 451 452/*% 453 * The following can be either static or public, depending on build environment. 454 */ 455 456#ifdef BIND9 457#define ISC_SOCKETFUNC_SCOPE 458#else 459#define ISC_SOCKETFUNC_SCOPE static 460#endif 461 462ISC_SOCKETFUNC_SCOPE isc_result_t 463isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, 464 isc_socket_t **socketp); 465ISC_SOCKETFUNC_SCOPE void 466isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp); 467ISC_SOCKETFUNC_SCOPE void 468isc__socket_detach(isc_socket_t **socketp); 469ISC_SOCKETFUNC_SCOPE isc_result_t 470isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp); 471ISC_SOCKETFUNC_SCOPE isc_result_t 472isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, 473 unsigned int maxsocks); 474ISC_SOCKETFUNC_SCOPE void 475isc__socketmgr_destroy(isc_socketmgr_t **managerp); 476ISC_SOCKETFUNC_SCOPE isc_result_t 477isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist, 478 unsigned int minimum, isc_task_t *task, 479 isc_taskaction_t action, const void *arg); 480ISC_SOCKETFUNC_SCOPE isc_result_t 
481isc__socket_recv(isc_socket_t *sock, isc_region_t *region, 482 unsigned int minimum, isc_task_t *task, 483 isc_taskaction_t action, const void *arg); 484ISC_SOCKETFUNC_SCOPE isc_result_t 485isc__socket_recv2(isc_socket_t *sock, isc_region_t *region, 486 unsigned int minimum, isc_task_t *task, 487 isc_socketevent_t *event, unsigned int flags); 488ISC_SOCKETFUNC_SCOPE isc_result_t 489isc__socket_send(isc_socket_t *sock, isc_region_t *region, 490 isc_task_t *task, isc_taskaction_t action, const void *arg); 491ISC_SOCKETFUNC_SCOPE isc_result_t 492isc__socket_sendto(isc_socket_t *sock, isc_region_t *region, 493 isc_task_t *task, isc_taskaction_t action, const void *arg, 494 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo); 495ISC_SOCKETFUNC_SCOPE isc_result_t 496isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist, 497 isc_task_t *task, isc_taskaction_t action, const void *arg); 498ISC_SOCKETFUNC_SCOPE isc_result_t 499isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist, 500 isc_task_t *task, isc_taskaction_t action, const void *arg, 501 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo); 502ISC_SOCKETFUNC_SCOPE isc_result_t 503isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region, 504 isc_task_t *task, 505 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 506 isc_socketevent_t *event, unsigned int flags); 507ISC_SOCKETFUNC_SCOPE void 508isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active); 509ISC_SOCKETFUNC_SCOPE isc_result_t 510isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, 511 isc_uint32_t owner, isc_uint32_t group); 512ISC_SOCKETFUNC_SCOPE isc_result_t 513isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr, 514 unsigned int options); 515ISC_SOCKETFUNC_SCOPE isc_result_t 516isc__socket_filter(isc_socket_t *sock, const char *filter); 517ISC_SOCKETFUNC_SCOPE isc_result_t 518isc__socket_listen(isc_socket_t *sock, unsigned int backlog); 519ISC_SOCKETFUNC_SCOPE isc_result_t 
520isc__socket_accept(isc_socket_t *sock, 521 isc_task_t *task, isc_taskaction_t action, const void *arg); 522ISC_SOCKETFUNC_SCOPE isc_result_t 523isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, 524 isc_task_t *task, isc_taskaction_t action, 525 const void *arg); 526ISC_SOCKETFUNC_SCOPE isc_result_t 527isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp); 528ISC_SOCKETFUNC_SCOPE isc_result_t 529isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp); 530ISC_SOCKETFUNC_SCOPE void 531isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how); 532ISC_SOCKETFUNC_SCOPE isc_sockettype_t 533isc__socket_gettype(isc_socket_t *sock); 534ISC_SOCKETFUNC_SCOPE isc_boolean_t 535isc__socket_isbound(isc_socket_t *sock); 536ISC_SOCKETFUNC_SCOPE void 537isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes); 538#if defined(HAVE_LIBXML2) && defined(BIND9) 539ISC_SOCKETFUNC_SCOPE void 540isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer); 541#endif 542 543ISC_SOCKETFUNC_SCOPE isc_result_t 544isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags, 545 isc_sockfdwatch_t callback, void *cbarg, 546 isc_task_t *task, isc_socket_t **socketp); 547ISC_SOCKETFUNC_SCOPE isc_result_t 548isc__socket_fdwatchpoke(isc_socket_t *sock, int flags); 549 550static struct { 551 isc_socketmethods_t methods; 552 553 /*% 554 * The following are defined just for avoiding unused static functions. 
555 */ 556#ifndef BIND9 557 void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter, 558 *listen, *accept, *getpeername, *isbound; 559#endif 560} socketmethods = { 561 { 562 isc__socket_attach, 563 isc__socket_detach, 564 isc__socket_bind, 565 isc__socket_sendto, 566 isc__socket_connect, 567 isc__socket_recv, 568 isc__socket_cancel, 569 isc__socket_getsockname, 570 isc__socket_gettype, 571 isc__socket_ipv6only, 572 isc__socket_fdwatchpoke 573 } 574#ifndef BIND9 575 , 576 (void *)isc__socket_recvv, (void *)isc__socket_send, 577 (void *)isc__socket_sendv, (void *)isc__socket_sendto2, 578 (void *)isc__socket_cleanunix, (void *)isc__socket_permunix, 579 (void *)isc__socket_filter, (void *)isc__socket_listen, 580 (void *)isc__socket_accept, (void *)isc__socket_getpeername, 581 (void *)isc__socket_isbound 582#endif 583}; 584 585static isc_socketmgrmethods_t socketmgrmethods = { 586 isc__socketmgr_destroy, 587 isc__socket_create, 588 isc__socket_fdwatchcreate 589}; 590 591#define SELECT_POKE_SHUTDOWN (-1) 592#define SELECT_POKE_NOTHING (-2) 593#define SELECT_POKE_READ (-3) 594#define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */ 595#define SELECT_POKE_WRITE (-4) 596#define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */ 597#define SELECT_POKE_CLOSE (-5) 598 599#define SOCK_DEAD(s) ((s)->references == 0) 600 601/*% 602 * Shortcut index arrays to get access to statistics counters. 
 */
/*
 * Generic positions of each counter kind within the per-socket-type
 * statsindex arrays below.  A socket stores a pointer to the array
 * matching its type/address family and indexes it with these IDs.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9
};
/*
 * A -1 entry marks a counter that does not apply to the socket type
 * (e.g. accept for UDP sockets, open for fdwatch sockets).
 *
 * NOTE(review): "upd4"/"upd6" look like historical typos for
 * "udp4"/"udp6"; left unchanged because other code in this file
 * references these identifiers by name.
 */
static const isc_statscounter_t upd4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail
};
static const isc_statscounter_t upd6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
	isc_sockstatscounter_tcp4recvfail
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
	isc_sockstatscounter_tcp6recvfail
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
	isc_sockstatscounter_unixrecvfail
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
	isc_sockstatscounter_fdwatchrecvfail
};

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
	defined(USE_WATCHER_THREAD)
/*
 * Log a printf-style message tagged with the socket manager pointer.
 * Returns early (formatting nothing) if the logging context would not
 * emit at 'level'.  The prototype carries ISC_FORMAT_PRINTF(5, 6) so
 * compilers can check the format string against its arguments.
 */
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	/* Avoid the formatting cost when the message would be dropped. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}
#endif

/*
 * Log a printf-style message tagged with the socket pointer and,
 * when 'address' is non-NULL, the formatted peer address.  Uses the
 * message-catalog variant of the log call (isc_log_iwrite) so the
 * leader text can be localized.
 */
static void
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
static void
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	/* Avoid the formatting cost when the message would be dropped. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 *
 * Re-enable IPV6_RECVPKTINFO on an IPv6 UDP socket; a no-op for all
 * other socket kinds.  A setsockopt() failure is reported as an
 * unexpected error but otherwise ignored.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) < 0) {

		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
	}
}
#else
/* On platforms without the AIX bug this is compiled out entirely. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif

/*%
 * Increment socket-related statistics counters.
 */
/*
 * Increment 'counterid' in 'stats' if a stats context is attached.
 * The REQUIRE guards against accidentally bumping a -1 ("not
 * applicable") slot from the statsindex arrays.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats != NULL)
		isc_stats_increment(stats, counterid);
}

/*
 * Start watching 'fd' for readability (SELECT_POKE_READ) or
 * writability (any other msg value, normally SELECT_POKE_WRITE) in
 * whichever event backend this build uses.  Returns ISC_R_SUCCESS or
 * the translated errno from the backend registration call.
 */
static inline isc_result_t
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/* EEXIST just means the fd is already registered; not an error. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	/*
	 * fdpollinfo[] mirrors the kernel's per-FD interest set (needed
	 * by unwatch_fd() to re-arm the other direction), so the write
	 * and the bookkeeping update must be atomic w.r.t. the fdlock.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	/* The fd_sets are shared with the watcher; manager lock protects them. */
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}
/*
 * Stop watching 'fd' for the direction selected by 'msg' in the
 * active event backend.  Counterpart of watch_fd(); returns
 * ISC_R_SUCCESS or a translated/unexpected error.
 */
static inline isc_result_t
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/* ENOENT (fd not registered) is harmless; anything else is not. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		/* Removing read interest; re-arm write in the same write(). */
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		/* Removing write interest; re-arm read in the same write(). */
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*
 * Act on a poke for 'fd': close it (SELECT_POKE_CLOSE), or start
 * watching it for the requested direction if it is still MANAGED.
 * FDs in CLOSE_PENDING state are unwatched instead.
 */
static void
wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
		return;
	}
	UNLOCK(&manager->fdlock[lockid]);

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}

#ifdef USE_WATCHER_THREAD
/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 *
 * The message is a fixed two-int record {fd, msg} written to the
 * manager's internal pipe; select_readmsg() reads the same format.
 */
static void
select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
			    strbuf);
	}

	/* POSIX guarantees an atomic pipe write of this size. */
	INSIST(cc == sizeof(buf));
}

/*
 * Read a message on the internal fd.
 *
 * On a soft read error, *msg is set to SELECT_POKE_NOTHING so the
 * caller simply ignores the wakeup; a hard error is fatal.
 */
static void
select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		*msg = SELECT_POKE_NOTHING;
		*fd = -1;	/* Silence compiler. */
		if (SOFT_ERROR(errno))
			return;

		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
			    strbuf);

		return;
	}
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}
#else /* USE_WATCHER_THREAD */
/*
 * Update the state of the socketmgr when something changes.
 *
 * Without a watcher thread there is no pipe: apply the change
 * directly.  SELECT_POKE_SHUTDOWN is a no-op here since there is no
 * separate thread to wake.
 */
static void
select_poke(isc__socketmgr_t *manager, int fd, int msg) {
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
	return;
}
#endif /* USE_WATCHER_THREAD */

/*
 * Make a fd non-blocking.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	/*
	 * NOTE(review): with USE_FIONBIO_IOCTL defined, 'flags' is declared
	 * but never used; this can trigger an unused-variable warning.
	 */
	int flags;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;

	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif

	if (ret == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	ISC_SOCKADDR_LEN_T hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif
}

static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	/* Let CMSG_NXTHDR compute the aligned distance to the next header. */
	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL)
		return ((char *)cmsgp - (char *)msg.msg_control);
	else
		return (0);
#endif
}
#endif /* USE_CMSG */

/*
 * Process control messages received on a socket: extract IPv6 packet
 * info and/or the kernel receive timestamp into 'dev', and record
 * truncation flags from msg_flags.
 */
static void
process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

#ifdef ISC_NET_BSD44MSGHDR

#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
#endif

#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
#endif

#ifndef USE_CMSG
	return;
#else
	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
		return;

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);

#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {

			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

#endif /* ISC_NET_BSD44MSGHDR */
}

/*
 * Construct an iov array
and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;

	memset(msg, 0, sizeof(*msg));

	/* Connected sockets must not pass a destination address. */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			/* skip_count is nonzero only for the first buffer. */
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0U);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
#if defined(IPV6_USE_MIN_MTU)
		int use_min_mtu = 1;	/* -1, 0, 1 */
#endif
		struct cmsghdr *cmsgp;
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
		msg->msg_control = (void *)sock->sendcmsgbuf;

		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
#if defined(IPV6_USE_MIN_MTU)
		/*
		 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
		 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
		 * is used.
		 */
		cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
					   msg->msg_controllen);
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
#endif
	}
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIN6PKTINFO */
#else /* ISC_NET_BSD44MSGHDR */
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
#endif /* ISC_NET_BSD44MSGHDR */

	if (write_countp != NULL)
		*write_countp = write_count;
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
1427 */ 1428static void 1429build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev, 1430 struct msghdr *msg, struct iovec *iov, size_t *read_countp) 1431{ 1432 unsigned int iovcount; 1433 isc_buffer_t *buffer; 1434 isc_region_t available; 1435 size_t read_count; 1436 1437 memset(msg, 0, sizeof(struct msghdr)); 1438 1439 if (sock->type == isc_sockettype_udp) { 1440 memset(&dev->address, 0, sizeof(dev->address)); 1441#ifdef BROKEN_RECVMSG 1442 if (sock->pf == AF_INET) { 1443 msg->msg_name = (void *)&dev->address.type.sin; 1444 msg->msg_namelen = sizeof(dev->address.type.sin6); 1445 } else if (sock->pf == AF_INET6) { 1446 msg->msg_name = (void *)&dev->address.type.sin6; 1447 msg->msg_namelen = sizeof(dev->address.type.sin6); 1448#ifdef ISC_PLATFORM_HAVESYSUNH 1449 } else if (sock->pf == AF_UNIX) { 1450 msg->msg_name = (void *)&dev->address.type.sunix; 1451 msg->msg_namelen = sizeof(dev->address.type.sunix); 1452#endif 1453 } else { 1454 msg->msg_name = (void *)&dev->address.type.sa; 1455 msg->msg_namelen = sizeof(dev->address.type); 1456 } 1457#else 1458 msg->msg_name = (void *)&dev->address.type.sa; 1459 msg->msg_namelen = sizeof(dev->address.type); 1460#endif 1461#ifdef ISC_NET_RECVOVERFLOW 1462 /* If needed, steal one iovec for overflow detection. */ 1463 maxiov--; 1464#endif 1465 } else { /* TCP */ 1466 msg->msg_name = NULL; 1467 msg->msg_namelen = 0; 1468 dev->address = sock->peer_address; 1469 } 1470 1471 buffer = ISC_LIST_HEAD(dev->bufferlist); 1472 read_count = 0; 1473 1474 /* 1475 * Single buffer I/O? Skip what we've done so far in this region. 1476 */ 1477 if (buffer == NULL) { 1478 read_count = dev->region.length - dev->n; 1479 iov[0].iov_base = (void *)(dev->region.base + dev->n); 1480 iov[0].iov_len = read_count; 1481 iovcount = 1; 1482 1483 goto config; 1484 } 1485 1486 /* 1487 * Multibuffer I/O. 1488 * Skip empty buffers. 
1489 */ 1490 while (buffer != NULL) { 1491 REQUIRE(ISC_BUFFER_VALID(buffer)); 1492 if (isc_buffer_availablelength(buffer) != 0) 1493 break; 1494 buffer = ISC_LIST_NEXT(buffer, link); 1495 } 1496 1497 iovcount = 0; 1498 while (buffer != NULL) { 1499 INSIST(iovcount < MAXSCATTERGATHER_RECV); 1500 1501 isc_buffer_availableregion(buffer, &available); 1502 1503 if (available.length > 0) { 1504 iov[iovcount].iov_base = (void *)(available.base); 1505 iov[iovcount].iov_len = available.length; 1506 read_count += available.length; 1507 iovcount++; 1508 } 1509 buffer = ISC_LIST_NEXT(buffer, link); 1510 } 1511 1512 config: 1513 1514 /* 1515 * If needed, set up to receive that one extra byte. Note that 1516 * we know there is at least one iov left, since we stole it 1517 * at the top of this function. 1518 */ 1519#ifdef ISC_NET_RECVOVERFLOW 1520 if (sock->type == isc_sockettype_udp) { 1521 iov[iovcount].iov_base = (void *)(&sock->overflow); 1522 iov[iovcount].iov_len = 1; 1523 iovcount++; 1524 } 1525#endif 1526 1527 msg->msg_iov = iov; 1528 msg->msg_iovlen = iovcount; 1529 1530#ifdef ISC_NET_BSD44MSGHDR 1531 msg->msg_control = NULL; 1532 msg->msg_controllen = 0; 1533 msg->msg_flags = 0; 1534#if defined(USE_CMSG) 1535 if (sock->type == isc_sockettype_udp) { 1536 msg->msg_control = sock->recvcmsgbuf; 1537 msg->msg_controllen = sock->recvcmsgbuflen; 1538 } 1539#endif /* USE_CMSG */ 1540#else /* ISC_NET_BSD44MSGHDR */ 1541 msg->msg_accrights = NULL; 1542 msg->msg_accrightslen = 0; 1543#endif /* ISC_NET_BSD44MSGHDR */ 1544 1545 if (read_countp != NULL) 1546 *read_countp = read_count; 1547} 1548 1549static void 1550set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock, 1551 isc_socketevent_t *dev) 1552{ 1553 if (sock->type == isc_sockettype_udp) { 1554 if (address != NULL) 1555 dev->address = *address; 1556 else 1557 dev->address = sock->peer_address; 1558 } else if (sock->type == isc_sockettype_tcp) { 1559 INSIST(address == NULL); 1560 dev->address = sock->peer_address; 1561 
	}
}

/*
 * Event destructor installed by allocate_socketevent(); verifies the
 * buffer list has been drained, then chains to the original destructor.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	INSIST(ISC_LIST_EMPTY(ev->bufferlist));

	(ev->destroy)(event);
}

/*
 * Allocate and zero-initialize a socket event, interposing
 * destroy_socketevent as its destructor.  Returns NULL on allocation
 * failure.
 */
static isc_socketevent_t *
allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
		     isc_taskaction_t action, const void *arg)
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
						     sizeof(*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNSET;
	ISC_LINK_INIT(ev, ev_link);
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;
	/* Interpose our destructor, remembering the original. */
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;

	return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
/* Debug helper: print the contents of a msghdr to stdout. */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%d\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
#endif

#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

static int
doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t
*buffer;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_DOIORECV,
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/* Hard error only on connected sockets; otherwise retryable. */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
#ifdef EPROTO
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
#endif
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0)
			return (DOIO_EOF);
		break;
	case isc_sockettype_udp:
		break;
	case isc_sockettype_fdwatch:
	default:
		INSIST(0);
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_ZEROPORT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes, when a limit has been configured
		 * (manager->maxudp != 0).
		 */
		if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
			return (DOIO_SOFT);
	}

	socket_log(sock, &dev->address, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0U) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			POST(actual_count);
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
 *			ISC_R_SUCCESS.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			dev->result contains the appropriate error.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
 */
static int
doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);

 resend:
	cc = sendmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may clobber it. */
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry a bounded number of times on EINTR. */
		if (send_errno == EINTR && ++attempts < NRETRIES)
			goto resend;

		if (SOFT_ERROR(send_errno))
			return (DOIO_SOFT);

/* Hard error only on connected sockets; otherwise retryable. */
#define SOFT_OR_HARD(_system, _isc) \
	if (send_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (send_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		/*
		 * NOTE(review): EHOSTDOWN is #ifdef-guarded here but used
		 * unconditionally in doio_recv(); confirm which platforms
		 * lack it.
		 */
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		isc__strerror(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() %s 0",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_RETURNED, "returned"));
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Kill.
 *
 * Caller must ensure that the socket is not locked and no external
 * references exist.
 */
static void
closesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
	isc_sockettype_t type = sock->type;
	int lockid = FDLOCK_ID(fd);

	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&manager->fdlock[lockid]);
	manager->fds[fd] = NULL;
	if (type == isc_sockettype_fdwatch)
		manager->fdstate[fd] = CLOSED;
	else
		manager->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&manager->fdlock[lockid]);
	if (type == isc_sockettype_fdwatch) {
		/*
		 * The caller may close the socket once this function returns,
		 * and `fd' may be reassigned for a new socket.  So we do
		 * unwatch_fd() here, rather than defer it via select_poke().
		 * Note: this may complicate data protection among threads and
		 * may reduce performance due to additional locks.  One way to
		 * solve this would be to dup() the watched descriptor, but we
		 * take a simpler approach at this moment.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
	} else
		select_poke(manager, fd, SELECT_POKE_CLOSE);

	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&manager->lock);
	if (manager->maxfd == fd) {
		int i;

		/* Scan downward for the next managed descriptor. */
		manager->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&manager->fdlock[lockid]);
			if (manager->fdstate[i] == MANAGED) {
				manager->maxfd = i;
				UNLOCK(&manager->fdlock[lockid]);
				break;
			}
			UNLOCK(&manager->fdlock[lockid]);
		}
#ifdef ISC_PLATFORM_USETHREADS
		if (manager->maxfd < manager->pipe_fds[0])
			manager->maxfd = manager->pipe_fds[0];
#endif
	}
	UNLOCK(&manager->lock);
#endif	/* USE_SELECT */
}

/*
 * Tear down a socket whose reference count has reached zero: close its
 * descriptor (if still open), unlink it from the manager, and free it.
 */
static void
destroy(isc__socket_t **sockp) {
	int fd;
	isc__socket_t *sock = *sockp;
	isc__socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DESTROYING, "destroying");

	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		fd = sock->fd;
		sock->fd = -1;
		closesocket(manager, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

#ifdef USE_WATCHER_THREAD
	if (ISC_LIST_EMPTY(manager->socklist))
		SIGNAL(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/* can't unlock manager as its memory context is still used */
	free_socket(sockp);

	UNLOCK(&manager->lock);
}

/*
 * Allocate and initialize an isc__socket_t, including its cmsg buffers
 * and lock.  On failure all partial allocations are released.
 */
static isc_result_t
allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
		isc__socket_t **socketp)
{
	isc__socket_t *sock;
	isc_result_t result;
	ISC_SOCKADDR_LEN_T cmsgbuflen;

	sock = isc_mem_get(manager->mctx, sizeof(*sock));

	if (sock == NULL)
		return (ISC_R_NOMEMORY);

	sock->common.magic = 0;
	sock->common.impmagic = 0;
	sock->references = 0;

	sock->manager = manager;
	sock->type = type;
	sock->fd = -1;
	sock->statsindex = NULL;

	ISC_LINK_INIT(sock, link);

	sock->recvcmsgbuf = NULL;
	sock->sendcmsgbuf = NULL;

	/*
	 * set up cmsg buffers
	 */
	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#endif
#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
	cmsgbuflen += cmsg_space(sizeof(struct timeval));
#endif
	sock->recvcmsgbuflen = cmsgbuflen;
	if (sock->recvcmsgbuflen != 0U) {
		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->recvcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#if defined(IPV6_USE_MIN_MTU)
	/*
	 * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
	 * support.
	 */
	cmsgbuflen += cmsg_space(sizeof(int));
#endif
#endif
	sock->sendcmsgbuflen = cmsgbuflen;
	if (sock->sendcmsgbuflen != 0U) {
		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->sendcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;

	/*
	 * set up list of readers and writers to be initially empty
	 */
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
	ISC_LIST_INIT(sock->accept_list);
	sock->connect_ev = NULL;
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;

	/*
	 * initialize the lock
	 */
	result = isc_mutex_init(&sock->lock);
	if (result != ISC_R_SUCCESS) {
		sock->common.magic = 0;
		sock->common.impmagic = 0;
		goto error;
	}

	/*
	 * Initialize readable and writable events
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
		       NULL, sock, sock, NULL, NULL);
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
		       NULL, sock, sock, NULL, NULL);

	sock->common.magic = ISCAPI_SOCKET_MAGIC;
	sock->common.impmagic = SOCKET_MAGIC;
	*socketp = sock;

	return (ISC_R_SUCCESS);

 error:
	/* Free whatever was allocated before the failure. */
	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);
	isc_mem_put(manager->mctx, sock, sizeof(*sock));

	return (result);
}

/*
 * This event requires that the various lists be empty, that the reference
 * count be 1, and that the magic number is valid.  The other socket bits,
 * like the lock, must be initialized as well.  The fd associated must be
 * marked as closed, by setting it to -1 on close, or this routine will
 * also close the socket.
 */
static void
free_socket(isc__socket_t **socketp) {
	isc__socket_t *sock = *socketp;

	INSIST(sock->references == 0);
	INSIST(VALID_SOCKET(sock));
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(!ISC_LINK_LINKED(sock, link));

	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);

	sock->common.magic = 0;
	sock->common.impmagic = 0;

	DESTROYLOCK(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));

	*socketp = NULL;
}

#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  Having to workout
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this a build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMAT because some kernels require it.
 */

static isc_once_t bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * One-time probe (via bsdcompat_once) that clears the global 'bsdcompat'
 * flag on Linux kernels >= 2.4, where setting SO_BSDCOMPAT triggers a
 * deprecation warning.  On non-Linux platforms this is a no-op and the
 * flag stays ISC_TRUE.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname buf;
	char *endp;
	long int major;
	long int minor;

	uname(&buf);    /* Can only fail if buf is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname(). */
	major = strtol(buf.release, &endp, 10);
	if (*endp == '.') {
		minor = strtol(endp+1, &endp, 10);
		if ((major > 2) || ((major == 2) && (minor >= 4))) {
			bsdcompat = ISC_FALSE;
		}
	}
#endif /* __linux__ */
}
#endif

/*
 * Create the kernel socket for 'sock' according to its type and protocol
 * family, move the descriptor out of the low/reserved fd range where
 * possible, make it non-blocking, and apply the platform-dependent
 * socket options (SO_BSDCOMPAT, SO_NOSIGPIPE, cmsg/pktinfo, MTU and
 * receive-buffer tuning for UDP).  On success sock->fd holds the open
 * descriptor and the open is counted in the statistics; on failure an
 * isc_result_t describing the error is returned and no fd is left open.
 */
static isc_result_t
opensocket(isc__socketmgr_t *manager, isc__socket_t *sock) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif

 again:
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	case isc_sockettype_unix:
		sock->fd = socket(sock->pf, SOCK_STREAM, 0);
		break;
	case isc_sockettype_fdwatch:
		/*
		 * We should not be called for isc_sockettype_fdwatch sockets.
		 */
		INSIST(0);
		break;
	}
	/* Retry a bounded number of times if socket() was interrupted. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
		goto again;

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl";
	}
#endif

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			       isc_msgcat, ISC_MSGSET_SOCKET,
			       ISC_MSG_TOOMANYFDS,
			       "socket: file descriptor exceeds limit (%d/%u)",
			       sock->fd, manager->maxsocks);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (result);
	}

#ifdef SO_BSDCOMPAT
	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
				  clear_bsdcompat) == ISC_R_SUCCESS);
	if (sock->type != isc_sockettype_unix && bsdcompat &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#if defined(USE_CMSG) || defined(SO_RCVBUF)
	if (sock->type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
			       (void *)&on, sizeof(on)) < 0
		    && errno != ENOPROTOOPT) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#if defined(ISC_PLATFORM_HAVEIPV6)
		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
			/*
			 * Warn explicitly because this anomaly can be hidden
			 * in usual operation (and unexpectedly appear later).
			 */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "No buffer available to receive "
					 "IPv6 destination");
		}
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* RFC 2292 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
#ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
		/* use minimum MTU */
		if (sock->pf == AF_INET6 &&
		    setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
			       (void *)&on, sizeof(on)) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_USE_MIN_MTU) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif
#if defined(IPV6_MTU)
		/*
		 * Use minimum MTU on IPv6 sockets.
		 */
		if (sock->pf == AF_INET6) {
			int mtu = 1280;
			(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
					 &mtu, sizeof(mtu));
		}
#endif
#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv6/UDP sockets.
		 */
		if (sock->pf == AF_INET6) {
			int action = IPV6_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_MTU_DISCOVER, &action,
					 sizeof(action));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int action = IP_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
					 &action, sizeof(action));
		}
#endif
#if defined(IP_DONTFRAG)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int off = 0;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
					 &off, sizeof(off));
		}
#endif

#if defined(SO_RCVBUF)
		/* Grow the receive buffer to at least RCVBUFSIZE. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			       (void *)&size, &optlen) >= 0 &&
		    size < RCVBUFSIZE) {
			size = RCVBUFSIZE;
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&size, sizeof(size)) == -1) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
					sock->fd, size,
					isc_msgcat_get(isc_msgcat,
						       ISC_MSGSET_GENERAL,
						       ISC_MSG_FAILED,
						       "failed"),
					strbuf);
			}
		}
#endif
	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);

	return (ISC_R_SUCCESS);
}

/*%
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp)
{
	isc__socket_t *sock = NULL;
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	REQUIRE(type != isc_sockettype_fdwatch);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/*
	 * NOTE(review): "upd4statsindex"/"upd6statsindex" look like
	 * transpositions of "udp4"/"udp6" -- confirm against the
	 * declarations earlier in this file before renaming.
	 */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex =
			(pf == AF_INET) ? upd4statsindex : upd6statsindex;
		break;
	case isc_sockettype_tcp:
		sock->statsindex =
			(pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	default:
		INSIST(0);
	}

	sock->pf = pf;
	result = opensocket(manager, sock);
	if (result != ISC_R_SUCCESS) {
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		free_socket(&sock);
		return (result);
	}

	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "created");

	return (ISC_R_SUCCESS);
}

#ifdef BIND9
/*
 * (Re)open the kernel socket for an existing, closed isc_socket_t that
 * has exactly one reference.  Used after isc_socket_close() to reuse
 * the socket object with a fresh descriptor.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_open(isc_socket_t *sock0) {
	isc_result_t result;
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	UNLOCK(&sock->lock);
	/*
	 * We don't need to retain the lock hereafter, since no one else has
	 * this socket.
	 */
	REQUIRE(sock->fd == -1);

	result = opensocket(sock->manager, sock);
	if (result != ISC_R_SUCCESS)
		sock->fd = -1;

	if (result == ISC_R_SUCCESS) {
		int lockid = FDLOCK_ID(sock->fd);

		LOCK(&sock->manager->fdlock[lockid]);
		sock->manager->fds[sock->fd] = sock;
		sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
		INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
		       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
		UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (sock->manager->maxfd < sock->fd)
			sock->manager->maxfd = sock->fd;
		UNLOCK(&sock->manager->lock);
#endif
	}

	return (result);
}
#endif /* BIND9 */

/*
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
/*
 * Wrap an externally owned file descriptor 'fd' in an fdwatch-type
 * socket: 'callback(task, sock, cbarg, flags)' is invoked from 'task'
 * whenever the watched readiness condition ('flags': read and/or write)
 * occurs.  Unlike isc__socket_create(), no kernel socket is opened here;
 * the caller provides the descriptor.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	isc__socket_t *sock = NULL;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;
	sock->statsindex = fdwatchstatsindex;

	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Start watching immediately for the requested conditions. */
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "fdwatch-created");

	return (ISC_R_SUCCESS);
}

/*
 * Indicate to the manager that it should watch the socket again.
2715 * This can be used to restart watching if the previous event handler 2716 * didn't indicate there was more data to be processed. Primarily 2717 * it is for writing but could be used for reading if desired 2718 */ 2719 2720ISC_SOCKETFUNC_SCOPE isc_result_t 2721isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags) 2722{ 2723 isc__socket_t *sock = (isc__socket_t *)sock0; 2724 2725 REQUIRE(VALID_SOCKET(sock)); 2726 2727 /* 2728 * We check both flags first to allow us to get the lock 2729 * once but only if we need it. 2730 */ 2731 2732 if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) { 2733 LOCK(&sock->lock); 2734 if (((flags & ISC_SOCKFDWATCH_READ) != 0) && 2735 !sock->pending_recv) 2736 select_poke(sock->manager, sock->fd, 2737 SELECT_POKE_READ); 2738 if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) && 2739 !sock->pending_send) 2740 select_poke(sock->manager, sock->fd, 2741 SELECT_POKE_WRITE); 2742 UNLOCK(&sock->lock); 2743 } 2744 2745 socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET, 2746 ISC_MSG_POKED, "fdwatch-poked flags: %d", flags); 2747 2748 return (ISC_R_SUCCESS); 2749} 2750 2751/* 2752 * Attach to a socket. Caller must explicitly detach when it is done. 2753 */ 2754ISC_SOCKETFUNC_SCOPE void 2755isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) { 2756 isc__socket_t *sock = (isc__socket_t *)sock0; 2757 2758 REQUIRE(VALID_SOCKET(sock)); 2759 REQUIRE(socketp != NULL && *socketp == NULL); 2760 2761 LOCK(&sock->lock); 2762 sock->references++; 2763 UNLOCK(&sock->lock); 2764 2765 *socketp = (isc_socket_t *)sock; 2766} 2767 2768/* 2769 * Dereference a socket. If this is the last reference to it, clean things 2770 * up by destroying the socket. 
 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_detach(isc_socket_t **socketp) {
	isc__socket_t *sock;
	isc_boolean_t kill_socket = ISC_FALSE;

	REQUIRE(socketp != NULL);
	sock = (isc__socket_t *)*socketp;
	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references > 0);
	sock->references--;
	if (sock->references == 0)
		kill_socket = ISC_TRUE;
	UNLOCK(&sock->lock);

	/* destroy() must be called without the socket lock held. */
	if (kill_socket)
		destroy(&sock);

	*socketp = NULL;
}

#ifdef BIND9
/*
 * Close the kernel descriptor of a quiescent, singly-referenced socket
 * and reset the object so it can be reopened with isc_socket_open().
 * All pending I/O must already have completed or been canceled (enforced
 * by the INSISTs below).
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_close(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	int fd;
	isc__socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(sock->connect_ev == NULL);

	manager = sock->manager;
	fd = sock->fd;
	sock->fd = -1;
	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;
	isc_sockaddr_any(&sock->peer_address);

	UNLOCK(&sock->lock);

	/* Do the actual close after releasing the socket lock. */
	closesocket(manager, sock, fd);

	return (ISC_R_SUCCESS);
}
#endif /* BIND9 */

/*
 * I/O is possible on a given socket.  Schedule an event to this task that
 * will call an internal function to do the I/O.  This will charge the
 * task with the I/O operation and let our select loop handler get back
 * to doing something real as fast as possible.
 *
 * The socket and manager must be locked before calling this function.
 */
static void
dispatch_recv(isc__socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_recv);

	if (sock->type != isc_sockettype_fdwatch) {
		/* No queued receive request means nothing to dispatch. */
		ev = ISC_LIST_HEAD(sock->recv_list);
		if (ev == NULL)
			return;
		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "dispatch_recv:  event %p -> task %p",
			   ev, ev->ev_sender);
		sender = ev->ev_sender;
	} else {
		sender = sock->fdwatchtask;
	}

	sock->pending_recv = 1;
	iev = &sock->readable_ev;

	sock->references++;	/* keep socket around for this internal event */
	iev->ev_sender = sock;
	if (sock->type == isc_sockettype_fdwatch)
		iev->ev_action = internal_fdwatch_read;
	else
		iev->ev_action = internal_recv;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * Schedule the internal writable event; the mirror image of
 * dispatch_recv() for the send side.  The socket and manager must be
 * locked before calling this function.
 */
static void
dispatch_send(isc__socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_send);

	if (sock->type != isc_sockettype_fdwatch) {
		/* No queued send request means nothing to dispatch. */
		ev = ISC_LIST_HEAD(sock->send_list);
		if (ev == NULL)
			return;
		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "dispatch_send:  event %p -> task %p",
			   ev, ev->ev_sender);
		sender = ev->ev_sender;
	} else {
		sender = sock->fdwatchtask;
	}

	sock->pending_send = 1;
	iev = &sock->writable_ev;

	sock->references++;	/* keep socket around for this internal event */
	iev->ev_sender = sock;
	if (sock->type == isc_sockettype_fdwatch)
		iev->ev_action = internal_fdwatch_write;
	else
		iev->ev_action = internal_send;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * Dispatch an internal accept event.
 */
static void
dispatch_accept(isc__socket_t *sock) {
	intev_t *iev;
	isc_socket_newconnev_t *ev;

	INSIST(!sock->pending_accept);

	/*
	 * Are there any done events left, or were they all canceled
	 * before the manager got the socket lock?
	 */
	ev = ISC_LIST_HEAD(sock->accept_list);
	if (ev == NULL)
		return;

	sock->pending_accept = 1;
	iev = &sock->readable_ev;

	sock->references++;  /* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_accept;
	iev->ev_arg = sock;

	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

/*
 * Dispatch the internal connect event to the task that requested the
 * connect.  A connect event must be outstanding and the socket must be
 * in the connecting state.
 */
static void
dispatch_connect(isc__socket_t *sock) {
	intev_t *iev;
	isc_socket_connev_t *ev;

	iev = &sock->writable_ev;

	ev = sock->connect_ev;
	INSIST(ev != NULL); /* XXX */

	INSIST(sock->connecting);

	sock->references++;  /* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_connect;
	iev->ev_arg = sock;

	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

/*
 * Dequeue an item off the given socket's read queue, set the result code
 * in the done event to the one provided, and send it to the task it was
 * destined for.
 *
 * If the event to be sent is on a list, remove it before sending.  If
 * asked to, send and detach from the socket as well.
 *
 * Caller must have the socket locked if the event is attached to the socket.
2971 */ 2972static void 2973send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) { 2974 isc_task_t *task; 2975 2976 task = (*dev)->ev_sender; 2977 2978 (*dev)->ev_sender = sock; 2979 2980 if (ISC_LINK_LINKED(*dev, ev_link)) 2981 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link); 2982 2983 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 2984 == ISC_SOCKEVENTATTR_ATTACHED) 2985 isc_task_sendanddetach(&task, (isc_event_t **)dev); 2986 else 2987 isc_task_send(task, (isc_event_t **)dev); 2988} 2989 2990/* 2991 * See comments for send_recvdone_event() above. 2992 * 2993 * Caller must have the socket locked if the event is attached to the socket. 2994 */ 2995static void 2996send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) { 2997 isc_task_t *task; 2998 2999 INSIST(dev != NULL && *dev != NULL); 3000 3001 task = (*dev)->ev_sender; 3002 (*dev)->ev_sender = sock; 3003 3004 if (ISC_LINK_LINKED(*dev, ev_link)) 3005 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link); 3006 3007 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 3008 == ISC_SOCKEVENTATTR_ATTACHED) 3009 isc_task_sendanddetach(&task, (isc_event_t **)dev); 3010 else 3011 isc_task_send(task, (isc_event_t **)dev); 3012} 3013 3014/* 3015 * Call accept() on a socket, to get the new file descriptor. The listen 3016 * socket is used as a prototype to create a new isc_socket_t. The new 3017 * socket has one outstanding reference. The task receiving the event 3018 * will be detached from just after the event is delivered. 3019 * 3020 * On entry to this function, the event delivered is the internal 3021 * readable event, and the first item on the accept_list should be 3022 * the done event we want to send. If the list is empty, this is a no-op, 3023 * so just unlock and return. 
3024 */ 3025static void 3026internal_accept(isc_task_t *me, isc_event_t *ev) { 3027 isc__socket_t *sock; 3028 isc__socketmgr_t *manager; 3029 isc_socket_newconnev_t *dev; 3030 isc_task_t *task; 3031 ISC_SOCKADDR_LEN_T addrlen; 3032 int fd; 3033 isc_result_t result = ISC_R_SUCCESS; 3034 char strbuf[ISC_STRERRORSIZE]; 3035 const char *err = "accept"; 3036 3037 UNUSED(me); 3038 3039 sock = ev->ev_sender; 3040 INSIST(VALID_SOCKET(sock)); 3041 3042 LOCK(&sock->lock); 3043 socket_log(sock, NULL, TRACE, 3044 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK, 3045 "internal_accept called, locked socket"); 3046 3047 manager = sock->manager; 3048 INSIST(VALID_MANAGER(manager)); 3049 3050 INSIST(sock->listener); 3051 INSIST(sock->pending_accept == 1); 3052 sock->pending_accept = 0; 3053 3054 INSIST(sock->references > 0); 3055 sock->references--; /* the internal event is done with this socket */ 3056 if (sock->references == 0) { 3057 UNLOCK(&sock->lock); 3058 destroy(&sock); 3059 return; 3060 } 3061 3062 /* 3063 * Get the first item off the accept list. 3064 * If it is empty, unlock the socket and return. 3065 */ 3066 dev = ISC_LIST_HEAD(sock->accept_list); 3067 if (dev == NULL) { 3068 UNLOCK(&sock->lock); 3069 return; 3070 } 3071 3072 /* 3073 * Try to accept the new connection. If the accept fails with 3074 * EAGAIN or EINTR, simply poke the watcher to watch this socket 3075 * again. Also ignore ECONNRESET, which has been reported to 3076 * be spuriously returned on Linux 2.2.19 although it is not 3077 * a documented error for accept(). ECONNABORTED has been 3078 * reported for Solaris 8. The rest are thrown in not because 3079 * we have seen them but because they are ignored by other 3080 * daemons such as BIND 8 and Apache. 
3081 */ 3082 3083 addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type); 3084 memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen); 3085 fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa, 3086 (void *)&addrlen); 3087 3088#ifdef F_DUPFD 3089 /* 3090 * Leave a space for stdio to work in. 3091 */ 3092 if (fd >= 0 && fd < 20) { 3093 int new, tmp; 3094 new = fcntl(fd, F_DUPFD, 20); 3095 tmp = errno; 3096 (void)close(fd); 3097 errno = tmp; 3098 fd = new; 3099 err = "accept/fcntl"; 3100 } 3101#endif 3102 3103 if (fd < 0) { 3104 if (SOFT_ERROR(errno)) 3105 goto soft_error; 3106 switch (errno) { 3107 case ENFILE: 3108 case EMFILE: 3109 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 3110 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 3111 isc_msgcat, ISC_MSGSET_SOCKET, 3112 ISC_MSG_TOOMANYFDS, 3113 "%s: too many open file descriptors", 3114 err); 3115 goto soft_error; 3116 3117 case ENOBUFS: 3118 case ENOMEM: 3119 case ECONNRESET: 3120 case ECONNABORTED: 3121 case EHOSTUNREACH: 3122 case EHOSTDOWN: 3123 case ENETUNREACH: 3124 case ENETDOWN: 3125 case ECONNREFUSED: 3126#ifdef EPROTO 3127 case EPROTO: 3128#endif 3129#ifdef ENONET 3130 case ENONET: 3131#endif 3132 goto soft_error; 3133 default: 3134 break; 3135 } 3136 isc__strerror(errno, strbuf, sizeof(strbuf)); 3137 UNEXPECTED_ERROR(__FILE__, __LINE__, 3138 "internal_accept: %s() %s: %s", err, 3139 isc_msgcat_get(isc_msgcat, 3140 ISC_MSGSET_GENERAL, 3141 ISC_MSG_FAILED, 3142 "failed"), 3143 strbuf); 3144 fd = -1; 3145 result = ISC_R_UNEXPECTED; 3146 } else { 3147 if (addrlen == 0U) { 3148 UNEXPECTED_ERROR(__FILE__, __LINE__, 3149 "internal_accept(): " 3150 "accept() failed to return " 3151 "remote address"); 3152 3153 (void)close(fd); 3154 goto soft_error; 3155 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family != 3156 sock->pf) 3157 { 3158 UNEXPECTED_ERROR(__FILE__, __LINE__, 3159 "internal_accept(): " 3160 "accept() returned peer address " 3161 "family %u (expected %u)", 3162 
NEWCONNSOCK(dev)->peer_address. 3163 type.sa.sa_family, 3164 sock->pf); 3165 (void)close(fd); 3166 goto soft_error; 3167 } else if (fd >= (int)manager->maxsocks) { 3168 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 3169 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 3170 isc_msgcat, ISC_MSGSET_SOCKET, 3171 ISC_MSG_TOOMANYFDS, 3172 "accept: " 3173 "file descriptor exceeds limit (%d/%u)", 3174 fd, manager->maxsocks); 3175 (void)close(fd); 3176 goto soft_error; 3177 } 3178 } 3179 3180 if (fd != -1) { 3181 NEWCONNSOCK(dev)->peer_address.length = addrlen; 3182 NEWCONNSOCK(dev)->pf = sock->pf; 3183 } 3184 3185 /* 3186 * Pull off the done event. 3187 */ 3188 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); 3189 3190 /* 3191 * Poke watcher if there are more pending accepts. 3192 */ 3193 if (!ISC_LIST_EMPTY(sock->accept_list)) 3194 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 3195 3196 UNLOCK(&sock->lock); 3197 3198 if (fd != -1) { 3199 result = make_nonblock(fd); 3200 if (result != ISC_R_SUCCESS) { 3201 (void)close(fd); 3202 fd = -1; 3203 } 3204 } 3205 3206 /* 3207 * -1 means the new socket didn't happen. 
3208 */ 3209 if (fd != -1) { 3210 int lockid = FDLOCK_ID(fd); 3211 3212 LOCK(&manager->fdlock[lockid]); 3213 manager->fds[fd] = NEWCONNSOCK(dev); 3214 manager->fdstate[fd] = MANAGED; 3215 UNLOCK(&manager->fdlock[lockid]); 3216 3217 LOCK(&manager->lock); 3218 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link); 3219 3220 NEWCONNSOCK(dev)->fd = fd; 3221 NEWCONNSOCK(dev)->bound = 1; 3222 NEWCONNSOCK(dev)->connected = 1; 3223 3224 /* 3225 * Save away the remote address 3226 */ 3227 dev->address = NEWCONNSOCK(dev)->peer_address; 3228 3229#ifdef USE_SELECT 3230 if (manager->maxfd < fd) 3231 manager->maxfd = fd; 3232#endif 3233 3234 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION, 3235 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, 3236 "accepted connection, new socket %p", 3237 dev->newsocket); 3238 3239 UNLOCK(&manager->lock); 3240 3241 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]); 3242 } else { 3243 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3244 NEWCONNSOCK(dev)->references--; 3245 free_socket((isc__socket_t **)&dev->newsocket); 3246 } 3247 3248 /* 3249 * Fill in the done event details and send it off. 
3250 */ 3251 dev->result = result; 3252 task = dev->ev_sender; 3253 dev->ev_sender = sock; 3254 3255 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 3256 return; 3257 3258 soft_error: 3259 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 3260 UNLOCK(&sock->lock); 3261 3262 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3263 return; 3264} 3265 3266static void 3267internal_recv(isc_task_t *me, isc_event_t *ev) { 3268 isc_socketevent_t *dev; 3269 isc__socket_t *sock; 3270 3271 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 3272 3273 sock = ev->ev_sender; 3274 INSIST(VALID_SOCKET(sock)); 3275 3276 LOCK(&sock->lock); 3277 socket_log(sock, NULL, IOEVENT, 3278 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3279 "internal_recv: task %p got event %p", me, ev); 3280 3281 INSIST(sock->pending_recv == 1); 3282 sock->pending_recv = 0; 3283 3284 INSIST(sock->references > 0); 3285 sock->references--; /* the internal event is done with this socket */ 3286 if (sock->references == 0) { 3287 UNLOCK(&sock->lock); 3288 destroy(&sock); 3289 return; 3290 } 3291 3292 /* 3293 * Try to do as much I/O as possible on this socket. There are no 3294 * limits here, currently. 3295 */ 3296 dev = ISC_LIST_HEAD(sock->recv_list); 3297 while (dev != NULL) { 3298 switch (doio_recv(sock, dev)) { 3299 case DOIO_SOFT: 3300 goto poke; 3301 3302 case DOIO_EOF: 3303 /* 3304 * read of 0 means the remote end was closed. 3305 * Run through the event queue and dispatch all 3306 * the events with an EOF result code. 
3307 */ 3308 do { 3309 dev->result = ISC_R_EOF; 3310 send_recvdone_event(sock, &dev); 3311 dev = ISC_LIST_HEAD(sock->recv_list); 3312 } while (dev != NULL); 3313 goto poke; 3314 3315 case DOIO_SUCCESS: 3316 case DOIO_HARD: 3317 send_recvdone_event(sock, &dev); 3318 break; 3319 } 3320 3321 dev = ISC_LIST_HEAD(sock->recv_list); 3322 } 3323 3324 poke: 3325 if (!ISC_LIST_EMPTY(sock->recv_list)) 3326 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3327 3328 UNLOCK(&sock->lock); 3329} 3330 3331static void 3332internal_send(isc_task_t *me, isc_event_t *ev) { 3333 isc_socketevent_t *dev; 3334 isc__socket_t *sock; 3335 3336 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3337 3338 /* 3339 * Find out what socket this is and lock it. 3340 */ 3341 sock = (isc__socket_t *)ev->ev_sender; 3342 INSIST(VALID_SOCKET(sock)); 3343 3344 LOCK(&sock->lock); 3345 socket_log(sock, NULL, IOEVENT, 3346 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3347 "internal_send: task %p got event %p", me, ev); 3348 3349 INSIST(sock->pending_send == 1); 3350 sock->pending_send = 0; 3351 3352 INSIST(sock->references > 0); 3353 sock->references--; /* the internal event is done with this socket */ 3354 if (sock->references == 0) { 3355 UNLOCK(&sock->lock); 3356 destroy(&sock); 3357 return; 3358 } 3359 3360 /* 3361 * Try to do as much I/O as possible on this socket. There are no 3362 * limits here, currently. 
	 */
	dev = ISC_LIST_HEAD(sock->send_list);
	while (dev != NULL) {
		switch (doio_send(sock, dev)) {
		case DOIO_SOFT:
			/* Would block; stop and re-arm the watcher below. */
			goto poke;

		case DOIO_HARD:
		case DOIO_SUCCESS:
			send_senddone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->send_list);
	}

 poke:
	/* If sends remain queued, ask the watcher to watch for writability. */
	if (!ISC_LIST_EMPTY(sock->send_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}

/*
 * Dispatch a write-ready indication on an fdwatch-type socket: run the
 * registered callback (sock->fdwatchcb) and, if it returns non-zero,
 * re-arm the watcher so the fd is watched for writability again.
 * Drops the socket's internal reference and destroys the socket when it
 * was the last one.
 */
static void
internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc__socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
		   "internal_fdwatch_write: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);

	/*
	 * The callback runs with the socket lock released; note that
	 * pending_send remains set until we reacquire the lock below.
	 */
	UNLOCK(&sock->lock);
	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
	LOCK(&sock->lock);

	sock->pending_send = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/* Callback reported more data pending: keep watching for write. */
	if (more_data)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}

/*
 * Read-side counterpart of internal_fdwatch_write(): run the fdwatch
 * callback for a readable fd and optionally re-arm the read watch.
 */
static void
internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	/*
	 * Find out what socket this is and lock it.
3436 */ 3437 sock = (isc__socket_t *)ev->ev_sender; 3438 INSIST(VALID_SOCKET(sock)); 3439 3440 LOCK(&sock->lock); 3441 socket_log(sock, NULL, IOEVENT, 3442 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3443 "internal_fdwatch_read: task %p got event %p", me, ev); 3444 3445 INSIST(sock->pending_recv == 1); 3446 3447 UNLOCK(&sock->lock); 3448 more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock, 3449 sock->fdwatcharg, ISC_SOCKFDWATCH_READ); 3450 LOCK(&sock->lock); 3451 3452 sock->pending_recv = 0; 3453 3454 INSIST(sock->references > 0); 3455 sock->references--; /* the internal event is done with this socket */ 3456 if (sock->references == 0) { 3457 UNLOCK(&sock->lock); 3458 destroy(&sock); 3459 return; 3460 } 3461 3462 if (more_data) 3463 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3464 3465 UNLOCK(&sock->lock); 3466} 3467 3468/* 3469 * Process read/writes on each fd here. Avoid locking 3470 * and unlocking twice if both reads and writes are possible. 3471 */ 3472static void 3473process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable, 3474 isc_boolean_t writeable) 3475{ 3476 isc__socket_t *sock; 3477 isc_boolean_t unlock_sock; 3478 isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE; 3479 int lockid = FDLOCK_ID(fd); 3480 3481 /* 3482 * If the socket is going to be closed, don't do more I/O. 
	 */
	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		/* Socket is being closed: stop watching both directions. */
		UNLOCK(&manager->fdlock[lockid]);

		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}

	/* A NULL slot means no socket currently owns this fd; just unwatch. */
	sock = manager->fds[fd];
	unlock_sock = ISC_FALSE;
	if (readable) {
		if (sock == NULL) {
			unwatch_read = ISC_TRUE;
			goto check_write;
		}
		unlock_sock = ISC_TRUE;
		LOCK(&sock->lock);
		if (!SOCK_DEAD(sock)) {
			if (sock->listener)
				dispatch_accept(sock);
			else
				dispatch_recv(sock);
		}
		unwatch_read = ISC_TRUE;
	}
check_write:
	if (writeable) {
		if (sock == NULL) {
			unwatch_write = ISC_TRUE;
			goto unlock_fd;
		}
		/* Socket may already be locked from the read branch above. */
		if (!unlock_sock) {
			unlock_sock = ISC_TRUE;
			LOCK(&sock->lock);
		}
		if (!SOCK_DEAD(sock)) {
			if (sock->connecting)
				dispatch_connect(sock);
			else
				dispatch_send(sock);
		}
		unwatch_write = ISC_TRUE;
	}
	if (unlock_sock)
		UNLOCK(&sock->lock);

 unlock_fd:
	UNLOCK(&manager->fdlock[lockid]);
	/* unwatch_fd() is called after dropping the fd lock. */
	if (unwatch_read)
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
	if (unwatch_write)
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);

}

#ifdef USE_KQUEUE
/*
 * kqueue flavour of process_fds(): translate each returned kevent into
 * readable/writable indications for process_fd().  Returns ISC_TRUE when
 * a shutdown message arrived on the internal control pipe.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
	int i;
	isc_boolean_t readable, writable;
	isc_boolean_t done = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
3555 */ 3556 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3557 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3558 "maximum number of FD events (%d) received", 3559 nevents); 3560 } 3561 3562 for (i = 0; i < nevents; i++) { 3563 REQUIRE(events[i].ident < manager->maxsocks); 3564#ifdef USE_WATCHER_THREAD 3565 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) { 3566 have_ctlevent = ISC_TRUE; 3567 continue; 3568 } 3569#endif 3570 readable = ISC_TF(events[i].filter == EVFILT_READ); 3571 writable = ISC_TF(events[i].filter == EVFILT_WRITE); 3572 process_fd(manager, events[i].ident, readable, writable); 3573 } 3574 3575#ifdef USE_WATCHER_THREAD 3576 if (have_ctlevent) 3577 done = process_ctlfd(manager); 3578#endif 3579 3580 return (done); 3581} 3582#elif defined(USE_EPOLL) 3583static isc_boolean_t 3584process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents) 3585{ 3586 int i; 3587 isc_boolean_t done = ISC_FALSE; 3588#ifdef USE_WATCHER_THREAD 3589 isc_boolean_t have_ctlevent = ISC_FALSE; 3590#endif 3591 3592 if (nevents == manager->nevents) { 3593 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3594 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3595 "maximum number of FD events (%d) received", 3596 nevents); 3597 } 3598 3599 for (i = 0; i < nevents; i++) { 3600 REQUIRE(events[i].data.fd < (int)manager->maxsocks); 3601#ifdef USE_WATCHER_THREAD 3602 if (events[i].data.fd == manager->pipe_fds[0]) { 3603 have_ctlevent = ISC_TRUE; 3604 continue; 3605 } 3606#endif 3607 if ((events[i].events & EPOLLERR) != 0 || 3608 (events[i].events & EPOLLHUP) != 0) { 3609 /* 3610 * epoll does not set IN/OUT bits on an erroneous 3611 * condition, so we need to try both anyway. This is a 3612 * bit inefficient, but should be okay for such rare 3613 * events. Note also that the read or write attempt 3614 * won't block because we use non-blocking sockets. 
			 */
			events[i].events |= (EPOLLIN | EPOLLOUT);
		}
		process_fd(manager, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_DEVPOLL)
/*
 * /dev/poll flavour of process_fds(): POLLIN/POLLOUT bits on each
 * returned pollfd select read/write dispatch via process_fd().  Returns
 * ISC_TRUE when a shutdown message arrived on the control pipe.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
	int i;
	isc_boolean_t done = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	/* Filling the whole event buffer may mean MAXEVENTS is too small. */
	if (nevents == manager->nevents) {
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		/* Control-pipe wakeups are handled separately below. */
		if (events[i].fd == manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		process_fd(manager, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_SELECT)
/*
 * select() flavour of process_fds(): scan every descriptor below maxfd,
 * skipping the internal control pipe, and dispatch via process_fd().
 * (The control pipe is checked by the caller, hence the void return.)
 */
static void
process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
	    fd_set *writefds)
{
	int i;

	REQUIRE(maxfd <= (int)manager->maxsocks);

	for (i = 0; i < maxfd; i++) {
#ifdef USE_WATCHER_THREAD
		if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
			continue;
#endif /* USE_WATCHER_THREAD */
		process_fd(manager, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}
#endif

#ifdef USE_WATCHER_THREAD
/*
 * Drain the watcher's control pipe, acting on each message.  Returns
 * ISC_TRUE if a shutdown request was read, ISC_FALSE otherwise.
 */
static isc_boolean_t
process_ctlfd(isc__socketmgr_t *manager) {
	int msg, fd;

	for (;;) {
		select_readmsg(manager, &fd,
&msg); 3693 3694 manager_log(manager, IOEVENT, 3695 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 3696 ISC_MSG_WATCHERMSG, 3697 "watcher got message %d " 3698 "for socket %d"), msg, fd); 3699 3700 /* 3701 * Nothing to read? 3702 */ 3703 if (msg == SELECT_POKE_NOTHING) 3704 break; 3705 3706 /* 3707 * Handle shutdown message. We really should 3708 * jump out of this loop right away, but 3709 * it doesn't matter if we have to do a little 3710 * more work first. 3711 */ 3712 if (msg == SELECT_POKE_SHUTDOWN) 3713 return (ISC_TRUE); 3714 3715 /* 3716 * This is a wakeup on a socket. Look 3717 * at the event queue for both read and write, 3718 * and decide if we need to watch on it now 3719 * or not. 3720 */ 3721 wakeup_socket(manager, fd, msg); 3722 } 3723 3724 return (ISC_FALSE); 3725} 3726 3727/* 3728 * This is the thread that will loop forever, always in a select or poll 3729 * call. 3730 * 3731 * When select returns something to do, track down what thread gets to do 3732 * this I/O and post the event to it. 3733 */ 3734static isc_threadresult_t 3735watcher(void *uap) { 3736 isc__socketmgr_t *manager = uap; 3737 isc_boolean_t done; 3738 int cc; 3739#ifdef USE_KQUEUE 3740 const char *fnname = "kevent()"; 3741#elif defined (USE_EPOLL) 3742 const char *fnname = "epoll_wait()"; 3743#elif defined(USE_DEVPOLL) 3744 const char *fnname = "ioctl(DP_POLL)"; 3745 struct dvpoll dvp; 3746#elif defined (USE_SELECT) 3747 const char *fnname = "select()"; 3748 int maxfd; 3749 int ctlfd; 3750#endif 3751 char strbuf[ISC_STRERRORSIZE]; 3752#ifdef ISC_SOCKET_USE_POLLWATCH 3753 pollstate_t pollstate = poll_idle; 3754#endif 3755 3756#if defined (USE_SELECT) 3757 /* 3758 * Get the control fd here. This will never change. 
3759 */ 3760 ctlfd = manager->pipe_fds[0]; 3761#endif 3762 done = ISC_FALSE; 3763 while (!done) { 3764 do { 3765#ifdef USE_KQUEUE 3766 cc = kevent(manager->kqueue_fd, NULL, 0, 3767 manager->events, manager->nevents, NULL); 3768#elif defined(USE_EPOLL) 3769 cc = epoll_wait(manager->epoll_fd, manager->events, 3770 manager->nevents, -1); 3771#elif defined(USE_DEVPOLL) 3772 dvp.dp_fds = manager->events; 3773 dvp.dp_nfds = manager->nevents; 3774#ifndef ISC_SOCKET_USE_POLLWATCH 3775 dvp.dp_timeout = -1; 3776#else 3777 if (pollstate == poll_idle) 3778 dvp.dp_timeout = -1; 3779 else 3780 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT; 3781#endif /* ISC_SOCKET_USE_POLLWATCH */ 3782 cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp); 3783#elif defined(USE_SELECT) 3784 LOCK(&manager->lock); 3785 memcpy(manager->read_fds_copy, manager->read_fds, 3786 manager->fd_bufsize); 3787 memcpy(manager->write_fds_copy, manager->write_fds, 3788 manager->fd_bufsize); 3789 maxfd = manager->maxfd + 1; 3790 UNLOCK(&manager->lock); 3791 3792 cc = select(maxfd, manager->read_fds_copy, 3793 manager->write_fds_copy, NULL, NULL); 3794#endif /* USE_KQUEUE */ 3795 3796 if (cc < 0 && !SOFT_ERROR(errno)) { 3797 isc__strerror(errno, strbuf, sizeof(strbuf)); 3798 FATAL_ERROR(__FILE__, __LINE__, 3799 "%s %s: %s", fnname, 3800 isc_msgcat_get(isc_msgcat, 3801 ISC_MSGSET_GENERAL, 3802 ISC_MSG_FAILED, 3803 "failed"), strbuf); 3804 } 3805 3806#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) 3807 if (cc == 0) { 3808 if (pollstate == poll_active) 3809 pollstate = poll_checking; 3810 else if (pollstate == poll_checking) 3811 pollstate = poll_idle; 3812 } else if (cc > 0) { 3813 if (pollstate == poll_checking) { 3814 /* 3815 * XXX: We'd like to use a more 3816 * verbose log level as it's actually an 3817 * unexpected event, but the kernel bug 3818 * reportedly happens pretty frequently 3819 * (and it can also be a false positive) 3820 * so it would be just too noisy. 
				 */
				manager_log(manager,
					    ISC_LOGCATEGORY_GENERAL,
					    ISC_LOGMODULE_SOCKET,
					    ISC_LOG_DEBUG(1),
					    "unexpected POLL timeout");
			}
			pollstate = poll_active;
		}
#endif
		} while (cc < 0);	/* soft errors are simply retried */

#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
		done = process_fds(manager, manager->events, cc);
#elif defined(USE_SELECT)
		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy))
			done = process_ctlfd(manager);
#endif
	}

	manager_log(manager, TRACE, "%s",
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	return ((isc_threadresult_t)0);
}
#endif /* USE_WATCHER_THREAD */

#ifdef BIND9
/*
 * Record the number of file descriptors to hold in reserve for this
 * manager.  (Only stores the value; consumers of manager->reserved are
 * elsewhere in this file.)
 */
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}

/*
 * Record the manager's maximum UDP size.  (Only stores the value in
 * manager->maxudp; enforcement happens elsewhere.)
 */
ISC_SOCKETFUNC_SCOPE void
isc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}
#endif /* BIND9 */

/*
 * Create a new socket manager.
 */

/*
 * Allocate and initialize the manager's event-multiplexing state: the
 * event buffer plus the kqueue/epoll//dev/poll descriptor (or, for
 * select(), the read/write fd_set buffers), and — when a watcher thread
 * is in use — register the read end of the control pipe.  On any
 * failure, everything acquired here is released before returning.
 */
static isc_result_t
setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
	isc_result_t result;
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	char strbuf[ISC_STRERRORSIZE];
#endif

#ifdef USE_KQUEUE
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->kqueue_fd = kqueue();
	if (manager->kqueue_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "kqueue %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}

#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->kqueue_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}
#endif /* USE_WATCHER_THREAD */
#elif defined(USE_EPOLL)
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->epoll_fd = epoll_create(manager->nevents);
	if (manager->epoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_create %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->epoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#endif /* USE_WATCHER_THREAD */
#elif defined(USE_DEVPOLL)
	/*
	 * XXXJT: /dev/poll seems to reject large numbers of events,
	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
	 */
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	/*
	 * Note: fdpollinfo should be able to support all possible FDs, so
	 * it must have maxsocks entries (not nevents).
	 */
	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
					  manager->maxsocks);
	if (manager->fdpollinfo == NULL) {
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		return (ISC_R_NOMEMORY);
	}
	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
	manager->devpoll_fd = open("/dev/poll", O_RDWR);
	if (manager->devpoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "open(/dev/poll) %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->devpoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#endif /* USE_WATCHER_THREAD */
#elif defined(USE_SELECT)
	UNUSED(result);

#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
	/*
	 * Note: this code should also cover the case of MAXSOCKETS <=
	 * FD_SETSIZE, but we separate the cases to avoid possible portability
	 * issues regarding howmany() and the actual representation of fd_set.
	 */
	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
		sizeof(fd_mask);
#else
	manager->fd_bufsize = sizeof(fd_set);
#endif

	manager->read_fds = NULL;
	manager->read_fds_copy = NULL;
	manager->write_fds = NULL;
	manager->write_fds_copy = NULL;

	/*
	 * Allocate the four fd_set buffers; on any failure, free whatever
	 * was already obtained before returning ISC_R_NOMEMORY.
	 */
	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds != NULL)
		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->write_fds != NULL) {
		manager->write_fds_copy = isc_mem_get(mctx,
						      manager->fd_bufsize);
	}
	if (manager->write_fds_copy == NULL) {
		if (manager->write_fds != NULL) {
			isc_mem_put(mctx, manager->write_fds,
				    manager->fd_bufsize);
		}
		if (manager->read_fds_copy != NULL) {
			isc_mem_put(mctx, manager->read_fds_copy,
				    manager->fd_bufsize);
		}
		if (manager->read_fds != NULL) {
			isc_mem_put(mctx, manager->read_fds,
				    manager->fd_bufsize);
		}
		return (ISC_R_NOMEMORY);
	}
	memset(manager->read_fds, 0, manager->fd_bufsize);
	memset(manager->write_fds, 0, manager->fd_bufsize);

#ifdef USE_WATCHER_THREAD
	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	manager->maxfd = manager->pipe_fds[0];
#else /* USE_WATCHER_THREAD */
	manager->maxfd = 0;
#endif /* USE_WATCHER_THREAD */
#endif /* USE_KQUEUE */

	return
(ISC_R_SUCCESS); 4047} 4048 4049static void 4050cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) { 4051#ifdef USE_WATCHER_THREAD 4052 isc_result_t result; 4053 4054 result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 4055 if (result != ISC_R_SUCCESS) { 4056 UNEXPECTED_ERROR(__FILE__, __LINE__, 4057 "epoll_ctl(DEL) %s", 4058 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4059 ISC_MSG_FAILED, "failed")); 4060 } 4061#endif /* USE_WATCHER_THREAD */ 4062 4063#ifdef USE_KQUEUE 4064 close(manager->kqueue_fd); 4065 isc_mem_put(mctx, manager->events, 4066 sizeof(struct kevent) * manager->nevents); 4067#elif defined(USE_EPOLL) 4068 close(manager->epoll_fd); 4069 isc_mem_put(mctx, manager->events, 4070 sizeof(struct epoll_event) * manager->nevents); 4071#elif defined(USE_DEVPOLL) 4072 close(manager->devpoll_fd); 4073 isc_mem_put(mctx, manager->events, 4074 sizeof(struct pollfd) * manager->nevents); 4075 isc_mem_put(mctx, manager->fdpollinfo, 4076 sizeof(pollinfo_t) * manager->maxsocks); 4077#elif defined(USE_SELECT) 4078 if (manager->read_fds != NULL) 4079 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize); 4080 if (manager->read_fds_copy != NULL) 4081 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize); 4082 if (manager->write_fds != NULL) 4083 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize); 4084 if (manager->write_fds_copy != NULL) 4085 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize); 4086#endif /* USE_KQUEUE */ 4087} 4088 4089ISC_SOCKETFUNC_SCOPE isc_result_t 4090isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { 4091 return (isc__socketmgr_create2(mctx, managerp, 0)); 4092} 4093 4094ISC_SOCKETFUNC_SCOPE isc_result_t 4095isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, 4096 unsigned int maxsocks) 4097{ 4098 int i; 4099 isc__socketmgr_t *manager; 4100#ifdef USE_WATCHER_THREAD 4101 char strbuf[ISC_STRERRORSIZE]; 4102#endif 4103 isc_result_t result; 4104 4105 
REQUIRE(managerp != NULL && *managerp == NULL); 4106 4107#ifdef USE_SHARED_MANAGER 4108 if (socketmgr != NULL) { 4109 /* Don't allow maxsocks to be updated */ 4110 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks) 4111 return (ISC_R_EXISTS); 4112 4113 socketmgr->refs++; 4114 *managerp = (isc_socketmgr_t *)socketmgr; 4115 return (ISC_R_SUCCESS); 4116 } 4117#endif /* USE_SHARED_MANAGER */ 4118 4119 if (maxsocks == 0) 4120 maxsocks = ISC_SOCKET_MAXSOCKETS; 4121 4122 manager = isc_mem_get(mctx, sizeof(*manager)); 4123 if (manager == NULL) 4124 return (ISC_R_NOMEMORY); 4125 4126 /* zero-clear so that necessary cleanup on failure will be easy */ 4127 memset(manager, 0, sizeof(*manager)); 4128 manager->maxsocks = maxsocks; 4129 manager->reserved = 0; 4130 manager->maxudp = 0; 4131 manager->fds = isc_mem_get(mctx, 4132 manager->maxsocks * sizeof(isc__socket_t *)); 4133 if (manager->fds == NULL) { 4134 result = ISC_R_NOMEMORY; 4135 goto free_manager; 4136 } 4137 manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int)); 4138 if (manager->fdstate == NULL) { 4139 result = ISC_R_NOMEMORY; 4140 goto free_manager; 4141 } 4142 manager->stats = NULL; 4143 4144 manager->common.methods = &socketmgrmethods; 4145 manager->common.magic = ISCAPI_SOCKETMGR_MAGIC; 4146 manager->common.impmagic = SOCKET_MANAGER_MAGIC; 4147 manager->mctx = NULL; 4148 memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); 4149 ISC_LIST_INIT(manager->socklist); 4150 result = isc_mutex_init(&manager->lock); 4151 if (result != ISC_R_SUCCESS) 4152 goto free_manager; 4153 manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t)); 4154 if (manager->fdlock == NULL) { 4155 result = ISC_R_NOMEMORY; 4156 goto cleanup_lock; 4157 } 4158 for (i = 0; i < FDLOCK_COUNT; i++) { 4159 result = isc_mutex_init(&manager->fdlock[i]); 4160 if (result != ISC_R_SUCCESS) { 4161 while (--i >= 0) 4162 DESTROYLOCK(&manager->fdlock[i]); 4163 isc_mem_put(mctx, manager->fdlock, 4164 FDLOCK_COUNT * 
sizeof(isc_mutex_t)); 4165 manager->fdlock = NULL; 4166 goto cleanup_lock; 4167 } 4168 } 4169 4170#ifdef USE_WATCHER_THREAD 4171 if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) { 4172 UNEXPECTED_ERROR(__FILE__, __LINE__, 4173 "isc_condition_init() %s", 4174 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4175 ISC_MSG_FAILED, "failed")); 4176 result = ISC_R_UNEXPECTED; 4177 goto cleanup_lock; 4178 } 4179 4180 /* 4181 * Create the special fds that will be used to wake up the 4182 * select/poll loop when something internal needs to be done. 4183 */ 4184 if (pipe(manager->pipe_fds) != 0) { 4185 isc__strerror(errno, strbuf, sizeof(strbuf)); 4186 UNEXPECTED_ERROR(__FILE__, __LINE__, 4187 "pipe() %s: %s", 4188 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4189 ISC_MSG_FAILED, "failed"), 4190 strbuf); 4191 result = ISC_R_UNEXPECTED; 4192 goto cleanup_condition; 4193 } 4194 4195 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS); 4196#if 0 4197 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS); 4198#endif 4199#endif /* USE_WATCHER_THREAD */ 4200 4201#ifdef USE_SHARED_MANAGER 4202 manager->refs = 1; 4203#endif /* USE_SHARED_MANAGER */ 4204 4205 /* 4206 * Set up initial state for the select loop 4207 */ 4208 result = setup_watcher(mctx, manager); 4209 if (result != ISC_R_SUCCESS) 4210 goto cleanup; 4211 memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); 4212#ifdef USE_WATCHER_THREAD 4213 /* 4214 * Start up the select/poll thread. 
	 */
	if (isc_thread_create(watcher, manager, &manager->watcher) !=
	    ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_create() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		cleanup_watcher(mctx, manager);
		result = ISC_R_UNEXPECTED;
		goto cleanup;
	}
#endif /* USE_WATCHER_THREAD */
	isc_mem_attach(mctx, &manager->mctx);

#ifdef USE_SHARED_MANAGER
	socketmgr = manager;
#endif /* USE_SHARED_MANAGER */
	*managerp = (isc_socketmgr_t *)manager;

	return (ISC_R_SUCCESS);

	/*
	 * Error-path unwinding: the labels below fall through, each one
	 * releasing what was acquired after the previous label's resources.
	 */
cleanup:
#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
#endif /* USE_WATCHER_THREAD */

#ifdef USE_WATCHER_THREAD
cleanup_condition:
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */


cleanup_lock:
	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
	}
	DESTROYLOCK(&manager->lock);

free_manager:
	if (manager->fdlock != NULL) {
		isc_mem_put(mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	if (manager->fdstate != NULL) {
		isc_mem_put(mctx, manager->fdstate,
			    manager->maxsocks * sizeof(int));
	}
	if (manager->fds != NULL) {
		/*
		 * NOTE(review): the allocation used sizeof(isc__socket_t *)
		 * while this put uses sizeof(isc_socket_t *); the sizes are
		 * identical (both pointers), but keep them in sync.
		 */
		isc_mem_put(mctx, manager->fds,
			    manager->maxsocks * sizeof(isc_socket_t *));
	}
	isc_mem_put(mctx, manager, sizeof(*manager));

	return (result);
}

#ifdef BIND9
/*
 * Report the manager's maximum number of sockets via *nsockp.
 */
isc_result_t
isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}

void
isc__socketmgr_setstats(isc_socketmgr_t *manager0,
isc_stats_t *stats) { 4287 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 4288 4289 REQUIRE(VALID_MANAGER(manager)); 4290 REQUIRE(ISC_LIST_EMPTY(manager->socklist)); 4291 REQUIRE(manager->stats == NULL); 4292 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max); 4293 4294 isc_stats_attach(stats, &manager->stats); 4295} 4296#endif 4297 4298ISC_SOCKETFUNC_SCOPE void 4299isc__socketmgr_destroy(isc_socketmgr_t **managerp) { 4300 isc__socketmgr_t *manager; 4301 int i; 4302 isc_mem_t *mctx; 4303 4304 /* 4305 * Destroy a socket manager. 4306 */ 4307 4308 REQUIRE(managerp != NULL); 4309 manager = (isc__socketmgr_t *)*managerp; 4310 REQUIRE(VALID_MANAGER(manager)); 4311 4312#ifdef USE_SHARED_MANAGER 4313 manager->refs--; 4314 if (manager->refs > 0) { 4315 *managerp = NULL; 4316 return; 4317 } 4318 socketmgr = NULL; 4319#endif /* USE_SHARED_MANAGER */ 4320 4321 LOCK(&manager->lock); 4322 4323 /* 4324 * Wait for all sockets to be destroyed. 4325 */ 4326 while (!ISC_LIST_EMPTY(manager->socklist)) { 4327#ifdef USE_WATCHER_THREAD 4328 manager_log(manager, CREATION, "%s", 4329 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 4330 ISC_MSG_SOCKETSREMAIN, 4331 "sockets exist")); 4332 WAIT(&manager->shutdown_ok, &manager->lock); 4333#else /* USE_WATCHER_THREAD */ 4334 UNLOCK(&manager->lock); 4335 isc__taskmgr_dispatch(NULL); 4336 LOCK(&manager->lock); 4337#endif /* USE_WATCHER_THREAD */ 4338 } 4339 4340 UNLOCK(&manager->lock); 4341 4342 /* 4343 * Here, poke our select/poll thread. Do this by closing the write 4344 * half of the pipe, which will send EOF to the read half. 4345 * This is currently a no-op in the non-threaded case. 4346 */ 4347 select_poke(manager, 0, SELECT_POKE_SHUTDOWN); 4348 4349#ifdef USE_WATCHER_THREAD 4350 /* 4351 * Wait for thread to exit. 
4352 */ 4353 if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS) 4354 UNEXPECTED_ERROR(__FILE__, __LINE__, 4355 "isc_thread_join() %s", 4356 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4357 ISC_MSG_FAILED, "failed")); 4358#endif /* USE_WATCHER_THREAD */ 4359 4360 /* 4361 * Clean up. 4362 */ 4363 cleanup_watcher(manager->mctx, manager); 4364 4365#ifdef USE_WATCHER_THREAD 4366 (void)close(manager->pipe_fds[0]); 4367 (void)close(manager->pipe_fds[1]); 4368 (void)isc_condition_destroy(&manager->shutdown_ok); 4369#endif /* USE_WATCHER_THREAD */ 4370 4371 for (i = 0; i < (int)manager->maxsocks; i++) 4372 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ 4373 (void)close(i); 4374 4375 isc_mem_put(manager->mctx, manager->fds, 4376 manager->maxsocks * sizeof(isc__socket_t *)); 4377 isc_mem_put(manager->mctx, manager->fdstate, 4378 manager->maxsocks * sizeof(int)); 4379 4380 if (manager->stats != NULL) 4381 isc_stats_detach(&manager->stats); 4382 4383 if (manager->fdlock != NULL) { 4384 for (i = 0; i < FDLOCK_COUNT; i++) 4385 DESTROYLOCK(&manager->fdlock[i]); 4386 isc_mem_put(manager->mctx, manager->fdlock, 4387 FDLOCK_COUNT * sizeof(isc_mutex_t)); 4388 } 4389 DESTROYLOCK(&manager->lock); 4390 manager->common.magic = 0; 4391 manager->common.impmagic = 0; 4392 mctx= manager->mctx; 4393 isc_mem_put(mctx, manager, sizeof(*manager)); 4394 4395 isc_mem_detach(&mctx); 4396 4397 *managerp = NULL; 4398 4399#ifdef USE_SHARED_MANAGER 4400 socketmgr = NULL; 4401#endif 4402} 4403 4404static isc_result_t 4405socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 4406 unsigned int flags) 4407{ 4408 int io_state; 4409 isc_boolean_t have_lock = ISC_FALSE; 4410 isc_task_t *ntask = NULL; 4411 isc_result_t result = ISC_R_SUCCESS; 4412 4413 dev->ev_sender = task; 4414 4415 if (sock->type == isc_sockettype_udp) { 4416 io_state = doio_recv(sock, dev); 4417 } else { 4418 LOCK(&sock->lock); 4419 have_lock = ISC_TRUE; 4420 4421 if 
(ISC_LIST_EMPTY(sock->recv_list)) 4422 io_state = doio_recv(sock, dev); 4423 else 4424 io_state = DOIO_SOFT; 4425 } 4426 4427 switch (io_state) { 4428 case DOIO_SOFT: 4429 /* 4430 * We couldn't read all or part of the request right now, so 4431 * queue it. 4432 * 4433 * Attach to socket and to task 4434 */ 4435 isc_task_attach(task, &ntask); 4436 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 4437 4438 if (!have_lock) { 4439 LOCK(&sock->lock); 4440 have_lock = ISC_TRUE; 4441 } 4442 4443 /* 4444 * Enqueue the request. If the socket was previously not being 4445 * watched, poke the watcher to start paying attention to it. 4446 */ 4447 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv) 4448 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 4449 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link); 4450 4451 socket_log(sock, NULL, EVENT, NULL, 0, 0, 4452 "socket_recv: event %p -> task %p", 4453 dev, ntask); 4454 4455 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 4456 result = ISC_R_INPROGRESS; 4457 break; 4458 4459 case DOIO_EOF: 4460 dev->result = ISC_R_EOF; 4461 /* fallthrough */ 4462 4463 case DOIO_HARD: 4464 case DOIO_SUCCESS: 4465 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 4466 send_recvdone_event(sock, &dev); 4467 break; 4468 } 4469 4470 if (have_lock) 4471 UNLOCK(&sock->lock); 4472 4473 return (result); 4474} 4475 4476ISC_SOCKETFUNC_SCOPE isc_result_t 4477isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist, 4478 unsigned int minimum, isc_task_t *task, 4479 isc_taskaction_t action, const void *arg) 4480{ 4481 isc__socket_t *sock = (isc__socket_t *)sock0; 4482 isc_socketevent_t *dev; 4483 isc__socketmgr_t *manager; 4484 unsigned int iocount; 4485 isc_buffer_t *buffer; 4486 4487 REQUIRE(VALID_SOCKET(sock)); 4488 REQUIRE(buflist != NULL); 4489 REQUIRE(!ISC_LIST_EMPTY(*buflist)); 4490 REQUIRE(task != NULL); 4491 REQUIRE(action != NULL); 4492 4493 manager = sock->manager; 4494 REQUIRE(VALID_MANAGER(manager)); 4495 4496 iocount = 
isc_bufferlist_availablecount(buflist); 4497 REQUIRE(iocount > 0); 4498 4499 INSIST(sock->bound); 4500 4501 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg); 4502 if (dev == NULL) 4503 return (ISC_R_NOMEMORY); 4504 4505 /* 4506 * UDP sockets are always partial read 4507 */ 4508 if (sock->type == isc_sockettype_udp) 4509 dev->minimum = 1; 4510 else { 4511 if (minimum == 0) 4512 dev->minimum = iocount; 4513 else 4514 dev->minimum = minimum; 4515 } 4516 4517 /* 4518 * Move each buffer from the passed in list to our internal one. 4519 */ 4520 buffer = ISC_LIST_HEAD(*buflist); 4521 while (buffer != NULL) { 4522 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4523 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4524 buffer = ISC_LIST_HEAD(*buflist); 4525 } 4526 4527 return (socket_recv(sock, dev, task, 0)); 4528} 4529 4530ISC_SOCKETFUNC_SCOPE isc_result_t 4531isc__socket_recv(isc_socket_t *sock0, isc_region_t *region, 4532 unsigned int minimum, isc_task_t *task, 4533 isc_taskaction_t action, const void *arg) 4534{ 4535 isc__socket_t *sock = (isc__socket_t *)sock0; 4536 isc_socketevent_t *dev; 4537 isc__socketmgr_t *manager; 4538 4539 REQUIRE(VALID_SOCKET(sock)); 4540 REQUIRE(action != NULL); 4541 4542 manager = sock->manager; 4543 REQUIRE(VALID_MANAGER(manager)); 4544 4545 INSIST(sock->bound); 4546 4547 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg); 4548 if (dev == NULL) 4549 return (ISC_R_NOMEMORY); 4550 4551 return (isc__socket_recv2(sock0, region, minimum, task, dev, 0)); 4552} 4553 4554ISC_SOCKETFUNC_SCOPE isc_result_t 4555isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region, 4556 unsigned int minimum, isc_task_t *task, 4557 isc_socketevent_t *event, unsigned int flags) 4558{ 4559 isc__socket_t *sock = (isc__socket_t *)sock0; 4560 4561 event->ev_sender = sock; 4562 event->result = ISC_R_UNSET; 4563 ISC_LIST_INIT(event->bufferlist); 4564 event->region = *region; 4565 event->n = 0; 4566 event->offset = 0; 4567 
event->attributes = 0; 4568 4569 /* 4570 * UDP sockets are always partial read. 4571 */ 4572 if (sock->type == isc_sockettype_udp) 4573 event->minimum = 1; 4574 else { 4575 if (minimum == 0) 4576 event->minimum = region->length; 4577 else 4578 event->minimum = minimum; 4579 } 4580 4581 return (socket_recv(sock, event, task, flags)); 4582} 4583 4584static isc_result_t 4585socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 4586 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4587 unsigned int flags) 4588{ 4589 int io_state; 4590 isc_boolean_t have_lock = ISC_FALSE; 4591 isc_task_t *ntask = NULL; 4592 isc_result_t result = ISC_R_SUCCESS; 4593 4594 dev->ev_sender = task; 4595 4596 set_dev_address(address, sock, dev); 4597 if (pktinfo != NULL) { 4598 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 4599 dev->pktinfo = *pktinfo; 4600 4601 if (!isc_sockaddr_issitelocal(&dev->address) && 4602 !isc_sockaddr_islinklocal(&dev->address)) { 4603 socket_log(sock, NULL, TRACE, isc_msgcat, 4604 ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED, 4605 "pktinfo structure provided, ifindex %u " 4606 "(set to 0)", pktinfo->ipi6_ifindex); 4607 4608 /* 4609 * Set the pktinfo index to 0 here, to let the 4610 * kernel decide what interface it should send on. 4611 */ 4612 dev->pktinfo.ipi6_ifindex = 0; 4613 } 4614 } 4615 4616 if (sock->type == isc_sockettype_udp) 4617 io_state = doio_send(sock, dev); 4618 else { 4619 LOCK(&sock->lock); 4620 have_lock = ISC_TRUE; 4621 4622 if (ISC_LIST_EMPTY(sock->send_list)) 4623 io_state = doio_send(sock, dev); 4624 else 4625 io_state = DOIO_SOFT; 4626 } 4627 4628 switch (io_state) { 4629 case DOIO_SOFT: 4630 /* 4631 * We couldn't send all or part of the request right now, so 4632 * queue it unless ISC_SOCKFLAG_NORETRY is set. 
4633 */ 4634 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) { 4635 isc_task_attach(task, &ntask); 4636 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 4637 4638 if (!have_lock) { 4639 LOCK(&sock->lock); 4640 have_lock = ISC_TRUE; 4641 } 4642 4643 /* 4644 * Enqueue the request. If the socket was previously 4645 * not being watched, poke the watcher to start 4646 * paying attention to it. 4647 */ 4648 if (ISC_LIST_EMPTY(sock->send_list) && 4649 !sock->pending_send) 4650 select_poke(sock->manager, sock->fd, 4651 SELECT_POKE_WRITE); 4652 ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link); 4653 4654 socket_log(sock, NULL, EVENT, NULL, 0, 0, 4655 "socket_send: event %p -> task %p", 4656 dev, ntask); 4657 4658 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 4659 result = ISC_R_INPROGRESS; 4660 break; 4661 } 4662 4663 case DOIO_HARD: 4664 case DOIO_SUCCESS: 4665 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 4666 send_senddone_event(sock, &dev); 4667 break; 4668 } 4669 4670 if (have_lock) 4671 UNLOCK(&sock->lock); 4672 4673 return (result); 4674} 4675 4676ISC_SOCKETFUNC_SCOPE isc_result_t 4677isc__socket_send(isc_socket_t *sock, isc_region_t *region, 4678 isc_task_t *task, isc_taskaction_t action, const void *arg) 4679{ 4680 /* 4681 * REQUIRE() checking is performed in isc_socket_sendto(). 
4682 */ 4683 return (isc__socket_sendto(sock, region, task, action, arg, NULL, 4684 NULL)); 4685} 4686 4687ISC_SOCKETFUNC_SCOPE isc_result_t 4688isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region, 4689 isc_task_t *task, isc_taskaction_t action, const void *arg, 4690 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 4691{ 4692 isc__socket_t *sock = (isc__socket_t *)sock0; 4693 isc_socketevent_t *dev; 4694 isc__socketmgr_t *manager; 4695 4696 REQUIRE(VALID_SOCKET(sock)); 4697 REQUIRE(region != NULL); 4698 REQUIRE(task != NULL); 4699 REQUIRE(action != NULL); 4700 4701 manager = sock->manager; 4702 REQUIRE(VALID_MANAGER(manager)); 4703 4704 INSIST(sock->bound); 4705 4706 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg); 4707 if (dev == NULL) 4708 return (ISC_R_NOMEMORY); 4709 4710 dev->region = *region; 4711 4712 return (socket_send(sock, dev, task, address, pktinfo, 0)); 4713} 4714 4715ISC_SOCKETFUNC_SCOPE isc_result_t 4716isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist, 4717 isc_task_t *task, isc_taskaction_t action, const void *arg) 4718{ 4719 return (isc__socket_sendtov(sock, buflist, task, action, arg, NULL, 4720 NULL)); 4721} 4722 4723ISC_SOCKETFUNC_SCOPE isc_result_t 4724isc__socket_sendtov(isc_socket_t *sock0, isc_bufferlist_t *buflist, 4725 isc_task_t *task, isc_taskaction_t action, const void *arg, 4726 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 4727{ 4728 isc__socket_t *sock = (isc__socket_t *)sock0; 4729 isc_socketevent_t *dev; 4730 isc__socketmgr_t *manager; 4731 unsigned int iocount; 4732 isc_buffer_t *buffer; 4733 4734 REQUIRE(VALID_SOCKET(sock)); 4735 REQUIRE(buflist != NULL); 4736 REQUIRE(!ISC_LIST_EMPTY(*buflist)); 4737 REQUIRE(task != NULL); 4738 REQUIRE(action != NULL); 4739 4740 manager = sock->manager; 4741 REQUIRE(VALID_MANAGER(manager)); 4742 4743 iocount = isc_bufferlist_usedcount(buflist); 4744 REQUIRE(iocount > 0); 4745 4746 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, 
action, arg); 4747 if (dev == NULL) 4748 return (ISC_R_NOMEMORY); 4749 4750 /* 4751 * Move each buffer from the passed in list to our internal one. 4752 */ 4753 buffer = ISC_LIST_HEAD(*buflist); 4754 while (buffer != NULL) { 4755 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4756 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4757 buffer = ISC_LIST_HEAD(*buflist); 4758 } 4759 4760 return (socket_send(sock, dev, task, address, pktinfo, 0)); 4761} 4762 4763ISC_SOCKETFUNC_SCOPE isc_result_t 4764isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region, 4765 isc_task_t *task, 4766 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4767 isc_socketevent_t *event, unsigned int flags) 4768{ 4769 isc__socket_t *sock = (isc__socket_t *)sock0; 4770 4771 REQUIRE(VALID_SOCKET(sock)); 4772 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0); 4773 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) 4774 REQUIRE(sock->type == isc_sockettype_udp); 4775 event->ev_sender = sock; 4776 event->result = ISC_R_UNSET; 4777 ISC_LIST_INIT(event->bufferlist); 4778 event->region = *region; 4779 event->n = 0; 4780 event->offset = 0; 4781 event->attributes = 0; 4782 4783 return (socket_send(sock, event, task, address, pktinfo, flags)); 4784} 4785 4786ISC_SOCKETFUNC_SCOPE void 4787isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) { 4788#ifdef ISC_PLATFORM_HAVESYSUNH 4789 int s; 4790 struct stat sb; 4791 char strbuf[ISC_STRERRORSIZE]; 4792 4793 if (sockaddr->type.sa.sa_family != AF_UNIX) 4794 return; 4795 4796#ifndef S_ISSOCK 4797#if defined(S_IFMT) && defined(S_IFSOCK) 4798#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK) 4799#elif defined(_S_IFMT) && defined(S_IFSOCK) 4800#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK) 4801#endif 4802#endif 4803 4804#ifndef S_ISFIFO 4805#if defined(S_IFMT) && defined(S_IFIFO) 4806#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO) 4807#elif defined(_S_IFMT) && defined(S_IFIFO) 4808#define S_ISFIFO(mode) ((mode & 
_S_IFMT)==S_IFIFO) 4809#endif 4810#endif 4811 4812#if !defined(S_ISFIFO) && !defined(S_ISSOCK) 4813#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>. 4814#endif 4815 4816#ifndef S_ISFIFO 4817#define S_ISFIFO(mode) 0 4818#endif 4819 4820#ifndef S_ISSOCK 4821#define S_ISSOCK(mode) 0 4822#endif 4823 4824 if (active) { 4825 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 4826 isc__strerror(errno, strbuf, sizeof(strbuf)); 4827 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4828 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4829 "isc_socket_cleanunix: stat(%s): %s", 4830 sockaddr->type.sunix.sun_path, strbuf); 4831 return; 4832 } 4833 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 4834 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4835 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4836 "isc_socket_cleanunix: %s: not a socket", 4837 sockaddr->type.sunix.sun_path); 4838 return; 4839 } 4840 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 4841 isc__strerror(errno, strbuf, sizeof(strbuf)); 4842 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4843 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4844 "isc_socket_cleanunix: unlink(%s): %s", 4845 sockaddr->type.sunix.sun_path, strbuf); 4846 } 4847 return; 4848 } 4849 4850 s = socket(AF_UNIX, SOCK_STREAM, 0); 4851 if (s < 0) { 4852 isc__strerror(errno, strbuf, sizeof(strbuf)); 4853 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4854 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4855 "isc_socket_cleanunix: socket(%s): %s", 4856 sockaddr->type.sunix.sun_path, strbuf); 4857 return; 4858 } 4859 4860 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 4861 switch (errno) { 4862 case ENOENT: /* We exited cleanly last time */ 4863 break; 4864 default: 4865 isc__strerror(errno, strbuf, sizeof(strbuf)); 4866 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4867 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4868 "isc_socket_cleanunix: stat(%s): %s", 4869 sockaddr->type.sunix.sun_path, strbuf); 4870 break; 4871 } 
4872 goto cleanup; 4873 } 4874 4875 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 4876 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4877 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4878 "isc_socket_cleanunix: %s: not a socket", 4879 sockaddr->type.sunix.sun_path); 4880 goto cleanup; 4881 } 4882 4883 if (connect(s, (struct sockaddr *)&sockaddr->type.sunix, 4884 sizeof(sockaddr->type.sunix)) < 0) { 4885 switch (errno) { 4886 case ECONNREFUSED: 4887 case ECONNRESET: 4888 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 4889 isc__strerror(errno, strbuf, sizeof(strbuf)); 4890 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4891 ISC_LOGMODULE_SOCKET, 4892 ISC_LOG_WARNING, 4893 "isc_socket_cleanunix: " 4894 "unlink(%s): %s", 4895 sockaddr->type.sunix.sun_path, 4896 strbuf); 4897 } 4898 break; 4899 default: 4900 isc__strerror(errno, strbuf, sizeof(strbuf)); 4901 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4902 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4903 "isc_socket_cleanunix: connect(%s): %s", 4904 sockaddr->type.sunix.sun_path, strbuf); 4905 break; 4906 } 4907 } 4908 cleanup: 4909 close(s); 4910#else 4911 UNUSED(sockaddr); 4912 UNUSED(active); 4913#endif 4914} 4915 4916ISC_SOCKETFUNC_SCOPE isc_result_t 4917isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, 4918 isc_uint32_t owner, isc_uint32_t group) 4919{ 4920#ifdef ISC_PLATFORM_HAVESYSUNH 4921 isc_result_t result = ISC_R_SUCCESS; 4922 char strbuf[ISC_STRERRORSIZE]; 4923 char path[sizeof(sockaddr->type.sunix.sun_path)]; 4924#ifdef NEED_SECURE_DIRECTORY 4925 char *slash; 4926#endif 4927 4928 REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX); 4929 INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path)); 4930 strcpy(path, sockaddr->type.sunix.sun_path); 4931 4932#ifdef NEED_SECURE_DIRECTORY 4933 slash = strrchr(path, '/'); 4934 if (slash != NULL) { 4935 if (slash != path) 4936 *slash = '\0'; 4937 else 4938 strcpy(path, "/"); 4939 } else 4940 strcpy(path, "."); 4941#endif 4942 4943 if 
(chmod(path, perm) < 0) { 4944 isc__strerror(errno, strbuf, sizeof(strbuf)); 4945 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4946 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4947 "isc_socket_permunix: chmod(%s, %d): %s", 4948 path, perm, strbuf); 4949 result = ISC_R_FAILURE; 4950 } 4951 if (chown(path, owner, group) < 0) { 4952 isc__strerror(errno, strbuf, sizeof(strbuf)); 4953 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4954 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4955 "isc_socket_permunix: chown(%s, %d, %d): %s", 4956 path, owner, group, 4957 strbuf); 4958 result = ISC_R_FAILURE; 4959 } 4960 return (result); 4961#else 4962 UNUSED(sockaddr); 4963 UNUSED(perm); 4964 UNUSED(owner); 4965 UNUSED(group); 4966 return (ISC_R_NOTIMPLEMENTED); 4967#endif 4968} 4969 4970ISC_SOCKETFUNC_SCOPE isc_result_t 4971isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr, 4972 unsigned int options) { 4973 isc__socket_t *sock = (isc__socket_t *)sock0; 4974 char strbuf[ISC_STRERRORSIZE]; 4975 int on = 1; 4976 4977 REQUIRE(VALID_SOCKET(sock)); 4978 4979 LOCK(&sock->lock); 4980 4981 INSIST(!sock->bound); 4982 4983 if (sock->pf != sockaddr->type.sa.sa_family) { 4984 UNLOCK(&sock->lock); 4985 return (ISC_R_FAMILYMISMATCH); 4986 } 4987 /* 4988 * Only set SO_REUSEADDR when we want a specific port. 4989 */ 4990#ifdef AF_UNIX 4991 if (sock->pf == AF_UNIX) 4992 goto bind_socket; 4993#endif 4994 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 && 4995 isc_sockaddr_getport(sockaddr) != (in_port_t)0 && 4996 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, 4997 sizeof(on)) < 0) { 4998 UNEXPECTED_ERROR(__FILE__, __LINE__, 4999 "setsockopt(%d) %s", sock->fd, 5000 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 5001 ISC_MSG_FAILED, "failed")); 5002 /* Press on... 
*/ 5003 } 5004#ifdef AF_UNIX 5005 bind_socket: 5006#endif 5007 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) { 5008 inc_stats(sock->manager->stats, 5009 sock->statsindex[STATID_BINDFAIL]); 5010 5011 UNLOCK(&sock->lock); 5012 switch (errno) { 5013 case EACCES: 5014 return (ISC_R_NOPERM); 5015 case EADDRNOTAVAIL: 5016 return (ISC_R_ADDRNOTAVAIL); 5017 case EADDRINUSE: 5018 return (ISC_R_ADDRINUSE); 5019 case EINVAL: 5020 return (ISC_R_BOUND); 5021 default: 5022 isc__strerror(errno, strbuf, sizeof(strbuf)); 5023 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s", 5024 strbuf); 5025 return (ISC_R_UNEXPECTED); 5026 } 5027 } 5028 5029 socket_log(sock, sockaddr, TRACE, 5030 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound"); 5031 sock->bound = 1; 5032 5033 UNLOCK(&sock->lock); 5034 return (ISC_R_SUCCESS); 5035} 5036 5037/* 5038 * Enable this only for specific OS versions, and only when they have repaired 5039 * their problems with it. Until then, this is is broken and needs to be 5040 * diabled by default. See RT22589 for details. 
5041 */ 5042#undef ENABLE_ACCEPTFILTER 5043 5044ISC_SOCKETFUNC_SCOPE isc_result_t 5045isc__socket_filter(isc_socket_t *sock0, const char *filter) { 5046 isc__socket_t *sock = (isc__socket_t *)sock0; 5047#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) 5048 char strbuf[ISC_STRERRORSIZE]; 5049 struct accept_filter_arg afa; 5050#else 5051 UNUSED(sock); 5052 UNUSED(filter); 5053#endif 5054 5055 REQUIRE(VALID_SOCKET(sock)); 5056 5057#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) 5058 bzero(&afa, sizeof(afa)); 5059 strncpy(afa.af_name, filter, sizeof(afa.af_name)); 5060 if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, 5061 &afa, sizeof(afa)) == -1) { 5062 isc__strerror(errno, strbuf, sizeof(strbuf)); 5063 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 5064 ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s", 5065 strbuf); 5066 return (ISC_R_FAILURE); 5067 } 5068 return (ISC_R_SUCCESS); 5069#else 5070 return (ISC_R_NOTIMPLEMENTED); 5071#endif 5072} 5073 5074/* 5075 * Set up to listen on a given socket. We do this by creating an internal 5076 * event that will be dispatched when the socket has read activity. The 5077 * watcher will send the internal event to the task when there is a new 5078 * connection. 5079 * 5080 * Unlike in read, we don't preallocate a done event here. Every time there 5081 * is a new connection we'll have to allocate a new one anyway, so we might 5082 * as well keep things simple rather than having to track them. 
5083 */ 5084ISC_SOCKETFUNC_SCOPE isc_result_t 5085isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) { 5086 isc__socket_t *sock = (isc__socket_t *)sock0; 5087 char strbuf[ISC_STRERRORSIZE]; 5088 5089 REQUIRE(VALID_SOCKET(sock)); 5090 5091 LOCK(&sock->lock); 5092 5093 REQUIRE(!sock->listener); 5094 REQUIRE(sock->bound); 5095 REQUIRE(sock->type == isc_sockettype_tcp || 5096 sock->type == isc_sockettype_unix); 5097 5098 if (backlog == 0) 5099 backlog = SOMAXCONN; 5100 5101 if (listen(sock->fd, (int)backlog) < 0) { 5102 UNLOCK(&sock->lock); 5103 isc__strerror(errno, strbuf, sizeof(strbuf)); 5104 5105 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf); 5106 5107 return (ISC_R_UNEXPECTED); 5108 } 5109 5110 sock->listener = 1; 5111 5112 UNLOCK(&sock->lock); 5113 return (ISC_R_SUCCESS); 5114} 5115 5116/* 5117 * This should try to do aggressive accept() XXXMLG 5118 */ 5119ISC_SOCKETFUNC_SCOPE isc_result_t 5120isc__socket_accept(isc_socket_t *sock0, 5121 isc_task_t *task, isc_taskaction_t action, const void *arg) 5122{ 5123 isc__socket_t *sock = (isc__socket_t *)sock0; 5124 isc_socket_newconnev_t *dev; 5125 isc__socketmgr_t *manager; 5126 isc_task_t *ntask = NULL; 5127 isc__socket_t *nsock; 5128 isc_result_t result; 5129 isc_boolean_t do_poke = ISC_FALSE; 5130 5131 REQUIRE(VALID_SOCKET(sock)); 5132 manager = sock->manager; 5133 REQUIRE(VALID_MANAGER(manager)); 5134 5135 LOCK(&sock->lock); 5136 5137 REQUIRE(sock->listener); 5138 5139 /* 5140 * Sender field is overloaded here with the task we will be sending 5141 * this event to. Just before the actual event is delivered the 5142 * actual ev_sender will be touched up to be the socket. 
5143 */ 5144 dev = (isc_socket_newconnev_t *) 5145 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN, 5146 action, arg, sizeof(*dev)); 5147 if (dev == NULL) { 5148 UNLOCK(&sock->lock); 5149 return (ISC_R_NOMEMORY); 5150 } 5151 ISC_LINK_INIT(dev, ev_link); 5152 5153 result = allocate_socket(manager, sock->type, &nsock); 5154 if (result != ISC_R_SUCCESS) { 5155 isc_event_free(ISC_EVENT_PTR(&dev)); 5156 UNLOCK(&sock->lock); 5157 return (result); 5158 } 5159 5160 /* 5161 * Attach to socket and to task. 5162 */ 5163 isc_task_attach(task, &ntask); 5164 if (isc_task_exiting(ntask)) { 5165 free_socket(&nsock); 5166 isc_task_detach(&ntask); 5167 isc_event_free(ISC_EVENT_PTR(&dev)); 5168 UNLOCK(&sock->lock); 5169 return (ISC_R_SHUTTINGDOWN); 5170 } 5171 nsock->references++; 5172 nsock->statsindex = sock->statsindex; 5173 5174 dev->ev_sender = ntask; 5175 dev->newsocket = (isc_socket_t *)nsock; 5176 5177 /* 5178 * Poke watcher here. We still have the socket locked, so there 5179 * is no race condition. We will keep the lock for such a short 5180 * bit of time waking it up now or later won't matter all that much. 
5181 */ 5182 if (ISC_LIST_EMPTY(sock->accept_list)) 5183 do_poke = ISC_TRUE; 5184 5185 ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link); 5186 5187 if (do_poke) 5188 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT); 5189 5190 UNLOCK(&sock->lock); 5191 return (ISC_R_SUCCESS); 5192} 5193 5194ISC_SOCKETFUNC_SCOPE isc_result_t 5195isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr, 5196 isc_task_t *task, isc_taskaction_t action, const void *arg) 5197{ 5198 isc__socket_t *sock = (isc__socket_t *)sock0; 5199 isc_socket_connev_t *dev; 5200 isc_task_t *ntask = NULL; 5201 isc__socketmgr_t *manager; 5202 int cc; 5203 char strbuf[ISC_STRERRORSIZE]; 5204 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 5205 5206 REQUIRE(VALID_SOCKET(sock)); 5207 REQUIRE(addr != NULL); 5208 REQUIRE(task != NULL); 5209 REQUIRE(action != NULL); 5210 5211 manager = sock->manager; 5212 REQUIRE(VALID_MANAGER(manager)); 5213 REQUIRE(addr != NULL); 5214 5215 if (isc_sockaddr_ismulticast(addr)) 5216 return (ISC_R_MULTICAST); 5217 5218 LOCK(&sock->lock); 5219 5220 REQUIRE(!sock->connecting); 5221 5222 dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock, 5223 ISC_SOCKEVENT_CONNECT, 5224 action, arg, 5225 sizeof(*dev)); 5226 if (dev == NULL) { 5227 UNLOCK(&sock->lock); 5228 return (ISC_R_NOMEMORY); 5229 } 5230 ISC_LINK_INIT(dev, ev_link); 5231 5232 /* 5233 * Try to do the connect right away, as there can be only one 5234 * outstanding, and it might happen to complete. 5235 */ 5236 sock->peer_address = *addr; 5237 cc = connect(sock->fd, &addr->type.sa, addr->length); 5238 if (cc < 0) { 5239 /* 5240 * HP-UX "fails" to connect a UDP socket and sets errno to 5241 * EINPROGRESS if it's non-blocking. We'd rather regard this as 5242 * a success and let the user detect it if it's really an error 5243 * at the time of sending a packet on the socket. 
5244 */ 5245 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { 5246 cc = 0; 5247 goto success; 5248 } 5249 if (SOFT_ERROR(errno) || errno == EINPROGRESS) 5250 goto queue; 5251 5252 switch (errno) { 5253#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit; 5254 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5255 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5256 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5257 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5258 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5259#ifdef EHOSTDOWN 5260 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5261#endif 5262 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5263 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5264 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5265 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5266 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5267#undef ERROR_MATCH 5268 } 5269 5270 sock->connected = 0; 5271 5272 isc__strerror(errno, strbuf, sizeof(strbuf)); 5273 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf)); 5274 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s", 5275 addrbuf, errno, strbuf); 5276 5277 UNLOCK(&sock->lock); 5278 inc_stats(sock->manager->stats, 5279 sock->statsindex[STATID_CONNECTFAIL]); 5280 isc_event_free(ISC_EVENT_PTR(&dev)); 5281 return (ISC_R_UNEXPECTED); 5282 5283 err_exit: 5284 sock->connected = 0; 5285 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5286 5287 UNLOCK(&sock->lock); 5288 inc_stats(sock->manager->stats, 5289 sock->statsindex[STATID_CONNECTFAIL]); 5290 return (ISC_R_SUCCESS); 5291 } 5292 5293 /* 5294 * If connect completed, fire off the done event. 5295 */ 5296 success: 5297 if (cc == 0) { 5298 sock->connected = 1; 5299 sock->bound = 1; 5300 dev->result = ISC_R_SUCCESS; 5301 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5302 5303 UNLOCK(&sock->lock); 5304 5305 inc_stats(sock->manager->stats, 5306 sock->statsindex[STATID_CONNECT]); 5307 5308 return (ISC_R_SUCCESS); 5309 } 5310 5311 queue: 5312 5313 /* 5314 * Attach to task. 
5315 */ 5316 isc_task_attach(task, &ntask); 5317 5318 sock->connecting = 1; 5319 5320 dev->ev_sender = ntask; 5321 5322 /* 5323 * Poke watcher here. We still have the socket locked, so there 5324 * is no race condition. We will keep the lock for such a short 5325 * bit of time waking it up now or later won't matter all that much. 5326 */ 5327 if (sock->connect_ev == NULL) 5328 select_poke(manager, sock->fd, SELECT_POKE_CONNECT); 5329 5330 sock->connect_ev = dev; 5331 5332 UNLOCK(&sock->lock); 5333 return (ISC_R_SUCCESS); 5334} 5335 5336/* 5337 * Called when a socket with a pending connect() finishes. 5338 */ 5339static void 5340internal_connect(isc_task_t *me, isc_event_t *ev) { 5341 isc__socket_t *sock; 5342 isc_socket_connev_t *dev; 5343 isc_task_t *task; 5344 int cc; 5345 ISC_SOCKADDR_LEN_T optlen; 5346 char strbuf[ISC_STRERRORSIZE]; 5347 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 5348 5349 UNUSED(me); 5350 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 5351 5352 sock = ev->ev_sender; 5353 INSIST(VALID_SOCKET(sock)); 5354 5355 LOCK(&sock->lock); 5356 5357 /* 5358 * When the internal event was sent the reference count was bumped 5359 * to keep the socket around for us. Decrement the count here. 5360 */ 5361 INSIST(sock->references > 0); 5362 sock->references--; 5363 if (sock->references == 0) { 5364 UNLOCK(&sock->lock); 5365 destroy(&sock); 5366 return; 5367 } 5368 5369 /* 5370 * Has this event been canceled? 5371 */ 5372 dev = sock->connect_ev; 5373 if (dev == NULL) { 5374 INSIST(!sock->connecting); 5375 UNLOCK(&sock->lock); 5376 return; 5377 } 5378 5379 INSIST(sock->connecting); 5380 sock->connecting = 0; 5381 5382 /* 5383 * Get any possible error status here. 5384 */ 5385 optlen = sizeof(cc); 5386 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, 5387 (void *)&cc, (void *)&optlen) < 0) 5388 cc = errno; 5389 else 5390 errno = cc; 5391 5392 if (errno != 0) { 5393 /* 5394 * If the error is EAGAIN, just re-select on this 5395 * fd and pretend nothing strange happened. 
5396 */ 5397 if (SOFT_ERROR(errno) || errno == EINPROGRESS) { 5398 sock->connecting = 1; 5399 select_poke(sock->manager, sock->fd, 5400 SELECT_POKE_CONNECT); 5401 UNLOCK(&sock->lock); 5402 5403 return; 5404 } 5405 5406 inc_stats(sock->manager->stats, 5407 sock->statsindex[STATID_CONNECTFAIL]); 5408 5409 /* 5410 * Translate other errors into ISC_R_* flavors. 5411 */ 5412 switch (errno) { 5413#define ERROR_MATCH(a, b) case a: dev->result = b; break; 5414 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5415 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5416 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5417 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5418 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5419#ifdef EHOSTDOWN 5420 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5421#endif 5422 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5423 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5424 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5425 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5426 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT); 5427 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5428#undef ERROR_MATCH 5429 default: 5430 dev->result = ISC_R_UNEXPECTED; 5431 isc_sockaddr_format(&sock->peer_address, peerbuf, 5432 sizeof(peerbuf)); 5433 isc__strerror(errno, strbuf, sizeof(strbuf)); 5434 UNEXPECTED_ERROR(__FILE__, __LINE__, 5435 "internal_connect: connect(%s) %s", 5436 peerbuf, strbuf); 5437 } 5438 } else { 5439 inc_stats(sock->manager->stats, 5440 sock->statsindex[STATID_CONNECT]); 5441 dev->result = ISC_R_SUCCESS; 5442 sock->connected = 1; 5443 sock->bound = 1; 5444 } 5445 5446 sock->connect_ev = NULL; 5447 5448 UNLOCK(&sock->lock); 5449 5450 task = dev->ev_sender; 5451 dev->ev_sender = sock; 5452 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 5453} 5454 5455ISC_SOCKETFUNC_SCOPE isc_result_t 5456isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) { 5457 isc__socket_t *sock = (isc__socket_t *)sock0; 5458 isc_result_t result; 5459 5460 REQUIRE(VALID_SOCKET(sock)); 5461 
REQUIRE(addressp != NULL); 5462 5463 LOCK(&sock->lock); 5464 5465 if (sock->connected) { 5466 *addressp = sock->peer_address; 5467 result = ISC_R_SUCCESS; 5468 } else { 5469 result = ISC_R_NOTCONNECTED; 5470 } 5471 5472 UNLOCK(&sock->lock); 5473 5474 return (result); 5475} 5476 5477ISC_SOCKETFUNC_SCOPE isc_result_t 5478isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) { 5479 isc__socket_t *sock = (isc__socket_t *)sock0; 5480 ISC_SOCKADDR_LEN_T len; 5481 isc_result_t result; 5482 char strbuf[ISC_STRERRORSIZE]; 5483 5484 REQUIRE(VALID_SOCKET(sock)); 5485 REQUIRE(addressp != NULL); 5486 5487 LOCK(&sock->lock); 5488 5489 if (!sock->bound) { 5490 result = ISC_R_NOTBOUND; 5491 goto out; 5492 } 5493 5494 result = ISC_R_SUCCESS; 5495 5496 len = sizeof(addressp->type); 5497 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) { 5498 isc__strerror(errno, strbuf, sizeof(strbuf)); 5499 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", 5500 strbuf); 5501 result = ISC_R_UNEXPECTED; 5502 goto out; 5503 } 5504 addressp->length = (unsigned int)len; 5505 5506 out: 5507 UNLOCK(&sock->lock); 5508 5509 return (result); 5510} 5511 5512/* 5513 * Run through the list of events on this socket, and cancel the ones 5514 * queued for task "task" of type "how". "how" is a bitmask. 5515 */ 5516ISC_SOCKETFUNC_SCOPE void 5517isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) { 5518 isc__socket_t *sock = (isc__socket_t *)sock0; 5519 5520 REQUIRE(VALID_SOCKET(sock)); 5521 5522 /* 5523 * Quick exit if there is nothing to do. Don't even bother locking 5524 * in this case. 5525 */ 5526 if (how == 0) 5527 return; 5528 5529 LOCK(&sock->lock); 5530 5531 /* 5532 * All of these do the same thing, more or less. 5533 * Each will: 5534 * o If the internal event is marked as "posted" try to 5535 * remove it from the task's queue. If this fails, mark it 5536 * as canceled instead, and let the task clean it up later. 
5537 * o For each I/O request for that task of that type, post 5538 * its done event with status of "ISC_R_CANCELED". 5539 * o Reset any state needed. 5540 */ 5541 if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) 5542 && !ISC_LIST_EMPTY(sock->recv_list)) { 5543 isc_socketevent_t *dev; 5544 isc_socketevent_t *next; 5545 isc_task_t *current_task; 5546 5547 dev = ISC_LIST_HEAD(sock->recv_list); 5548 5549 while (dev != NULL) { 5550 current_task = dev->ev_sender; 5551 next = ISC_LIST_NEXT(dev, ev_link); 5552 5553 if ((task == NULL) || (task == current_task)) { 5554 dev->result = ISC_R_CANCELED; 5555 send_recvdone_event(sock, &dev); 5556 } 5557 dev = next; 5558 } 5559 } 5560 5561 if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) 5562 && !ISC_LIST_EMPTY(sock->send_list)) { 5563 isc_socketevent_t *dev; 5564 isc_socketevent_t *next; 5565 isc_task_t *current_task; 5566 5567 dev = ISC_LIST_HEAD(sock->send_list); 5568 5569 while (dev != NULL) { 5570 current_task = dev->ev_sender; 5571 next = ISC_LIST_NEXT(dev, ev_link); 5572 5573 if ((task == NULL) || (task == current_task)) { 5574 dev->result = ISC_R_CANCELED; 5575 send_senddone_event(sock, &dev); 5576 } 5577 dev = next; 5578 } 5579 } 5580 5581 if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT) 5582 && !ISC_LIST_EMPTY(sock->accept_list)) { 5583 isc_socket_newconnev_t *dev; 5584 isc_socket_newconnev_t *next; 5585 isc_task_t *current_task; 5586 5587 dev = ISC_LIST_HEAD(sock->accept_list); 5588 while (dev != NULL) { 5589 current_task = dev->ev_sender; 5590 next = ISC_LIST_NEXT(dev, ev_link); 5591 5592 if ((task == NULL) || (task == current_task)) { 5593 5594 ISC_LIST_UNLINK(sock->accept_list, dev, 5595 ev_link); 5596 5597 NEWCONNSOCK(dev)->references--; 5598 free_socket((isc__socket_t **)&dev->newsocket); 5599 5600 dev->result = ISC_R_CANCELED; 5601 dev->ev_sender = sock; 5602 isc_task_sendanddetach(¤t_task, 5603 ISC_EVENT_PTR(&dev)); 5604 } 5605 5606 dev = next; 5607 } 5608 } 5609 5610 /* 5611 * 
Connecting is not a list. 5612 */ 5613 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT) 5614 && sock->connect_ev != NULL) { 5615 isc_socket_connev_t *dev; 5616 isc_task_t *current_task; 5617 5618 INSIST(sock->connecting); 5619 sock->connecting = 0; 5620 5621 dev = sock->connect_ev; 5622 current_task = dev->ev_sender; 5623 5624 if ((task == NULL) || (task == current_task)) { 5625 sock->connect_ev = NULL; 5626 5627 dev->result = ISC_R_CANCELED; 5628 dev->ev_sender = sock; 5629 isc_task_sendanddetach(¤t_task, 5630 ISC_EVENT_PTR(&dev)); 5631 } 5632 } 5633 5634 UNLOCK(&sock->lock); 5635} 5636 5637ISC_SOCKETFUNC_SCOPE isc_sockettype_t 5638isc__socket_gettype(isc_socket_t *sock0) { 5639 isc__socket_t *sock = (isc__socket_t *)sock0; 5640 5641 REQUIRE(VALID_SOCKET(sock)); 5642 5643 return (sock->type); 5644} 5645 5646ISC_SOCKETFUNC_SCOPE isc_boolean_t 5647isc__socket_isbound(isc_socket_t *sock0) { 5648 isc__socket_t *sock = (isc__socket_t *)sock0; 5649 isc_boolean_t val; 5650 5651 REQUIRE(VALID_SOCKET(sock)); 5652 5653 LOCK(&sock->lock); 5654 val = ((sock->bound) ? ISC_TRUE : ISC_FALSE); 5655 UNLOCK(&sock->lock); 5656 5657 return (val); 5658} 5659 5660ISC_SOCKETFUNC_SCOPE void 5661isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) { 5662 isc__socket_t *sock = (isc__socket_t *)sock0; 5663#if defined(IPV6_V6ONLY) 5664 int onoff = yes ? 
			1 : 0;
#else
	UNUSED(yes);
	UNUSED(sock);
#endif

	REQUIRE(VALID_SOCKET(sock));

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0) {
			char strbuf[ISC_STRERRORSIZE];
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
	}
	FIX_IPV6_RECVPKTINFO(sock);	/* AIX */
#endif
}

#ifndef USE_WATCHER_THREAD
/*
 * In our assumed scenario, we can simply use a single static object.
 * XXX: this is not true if the application uses multiple threads with
 * 'multi-context' mode.  Fixing this is a future TODO item.
 */
static isc_socketwait_t swait_private;

/*
 * Wait for socket readiness events using whichever multiplex method was
 * selected at build time (kqueue/epoll/devpoll/select).  'tvp' is the
 * timeout (NULL means block indefinitely); on return '*swaitp' points
 * at the static wait-state object for a later isc__socketmgr_dispatch()
 * call.  Returns the raw event count from the underlying call (which
 * may be -1 on error, per the system call's convention).
 */
int
isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
			  isc_socketwait_t **swaitp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;


	int n;
#ifdef USE_KQUEUE
	struct timespec ts, *tsp;
#endif
#ifdef USE_EPOLL
	int timeout;
#endif
#ifdef USE_DEVPOLL
	struct dvpoll dvp;
#endif

	REQUIRE(swaitp != NULL && *swaitp == NULL);

#ifdef USE_SHARED_MANAGER
	/* Fall back to the process-wide shared manager. */
	if (manager == NULL)
		manager = socketmgr;
#endif
	if (manager == NULL)
		return (0);

#ifdef USE_KQUEUE
	/* kevent() takes a timespec; convert from the timeval argument. */
	if (tvp != NULL) {
		ts.tv_sec = tvp->tv_sec;
		ts.tv_nsec = tvp->tv_usec * 1000;
		tsp = &ts;
	} else
		tsp = NULL;
	swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
				       manager->events, manager->nevents,
				       tsp);
	n = swait_private.nevents;
#elif defined(USE_EPOLL)
	/* epoll_wait() takes milliseconds; round microseconds up. */
	if (tvp != NULL)
		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
	else
		timeout = -1;
	swait_private.nevents = epoll_wait(manager->epoll_fd,
					   manager->events,
					   manager->nevents, timeout);
	n = swait_private.nevents;
#elif defined(USE_DEVPOLL)
	dvp.dp_fds = manager->events;
	dvp.dp_nfds = manager->nevents;
	if (tvp != NULL) {
		dvp.dp_timeout = tvp->tv_sec * 1000 +
			(tvp->tv_usec + 999) / 1000;
	} else
		dvp.dp_timeout = -1;
	swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
	n = swait_private.nevents;
#elif defined(USE_SELECT)
	/*
	 * select() destroys its fd_set arguments, so operate on copies
	 * of the manager's interest sets.
	 */
	memcpy(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
	memcpy(manager->write_fds_copy, manager->write_fds,
	       manager->fd_bufsize);

	swait_private.readset = manager->read_fds_copy;
	swait_private.writeset = manager->write_fds_copy;
	swait_private.maxfd = manager->maxfd + 1;

	n = select(swait_private.maxfd, swait_private.readset,
		   swait_private.writeset, NULL, tvp);
#endif

	*swaitp = &swait_private;
	return (n);
}

/*
 * Process the events collected by a prior isc__socketmgr_waitevents()
 * call.  'swait' must be the pointer returned through that call's
 * 'swaitp'.  Returns ISC_R_NOTFOUND if no manager is available.
 */
isc_result_t
isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(swait == &swait_private);

#ifdef USE_SHARED_MANAGER
	if (manager == NULL)
		manager = socketmgr;
#endif
	if (manager == NULL)
		return (ISC_R_NOTFOUND);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	(void)process_fds(manager, manager->events, swait->nevents);
	return (ISC_R_SUCCESS);
#elif defined(USE_SELECT)
	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
	return (ISC_R_SUCCESS);
#endif
}
#endif /* USE_WATCHER_THREAD */

#ifdef BIND9
/*
 * Attach a debugging name and opaque tag to 'socket' (used by the
 * statistics/XML rendering code).  The name is truncated to fit the
 * fixed-size buffer and is always NUL-terminated (memset + strncpy of
 * sizeof - 1 bytes).
 */
void
isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
	isc__socket_t *socket = (isc__socket_t *)socket0;

	/*
	 * Name 'socket'.
	 */

	REQUIRE(VALID_SOCKET(socket));

	LOCK(&socket->lock);
	memset(socket->name, 0, sizeof(socket->name));
	strncpy(socket->name, name, sizeof(socket->name) - 1);
	socket->tag = tag;
	UNLOCK(&socket->lock);
}

/*
 * Return the socket's debugging name as set by isc__socket_setname().
 * NOTE(review): unlike the other accessors this takes no lock and does
 * not REQUIRE(VALID_SOCKET()) -- presumably callers guarantee a live,
 * named socket; confirm before relying on it with an unnamed socket.
 */
ISC_SOCKETFUNC_SCOPE const char *
isc__socket_getname(isc_socket_t *socket0) {
	isc__socket_t *socket = (isc__socket_t *)socket0;

	return (socket->name);
}

/*
 * Return the opaque tag set by isc__socket_setname().  Same caveat as
 * isc__socket_getname(): no lock, no validity check.
 */
void *
isc__socket_gettag(isc_socket_t *socket0) {
	isc__socket_t *socket = (isc__socket_t *)socket0;

	return (socket->tag);
}
#endif /* BIND9 */

#ifdef USE_SOCKETIMPREGISTER
/*
 * Register this (unix) socket implementation as the default socket
 * manager factory.
 */
isc_result_t
isc__socket_register() {
	return (isc_socket_register(isc__socketmgr_create));
}
#endif

#if defined(HAVE_LIBXML2) && defined(BIND9)

/*
 * Map an isc_sockettype_t to the string used in the XML statistics
 * output.
 */
static const char *
_socktype(isc_sockettype_t type)
{
	if (type == isc_sockettype_udp)
		return ("udp");
	else if (type == isc_sockettype_tcp)
		return ("tcp");
	else if (type == isc_sockettype_unix)
		return ("unix");
	else if (type == isc_sockettype_fdwatch)
		return ("fdwatch");
	else
		return ("not-initialized");
}

/*
 * Render the socket manager's state (and each socket's id, name,
 * reference count, type, addresses and state flags) as XML via
 * libxml2's text writer, for the statistics channel.
 */
ISC_SOCKETFUNC_SCOPE void
isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) {
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	ISC_SOCKADDR_LEN_T len;

	LOCK(&mgr->lock);

#ifdef USE_SHARED_MANAGER
	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
	xmlTextWriterEndElement(writer);
#endif	/* USE_SHARED_MANAGER */

	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
		xmlTextWriterWriteFormatString(writer, "%p", sock);
		xmlTextWriterEndElement(writer);

		if (sock->name[0] != 0) {
			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
			xmlTextWriterWriteFormatString(writer, "%s",
						       sock->name);
			xmlTextWriterEndElement(writer); /* name */
		}

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
		xmlTextWriterEndElement(writer);

		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
					  ISC_XMLCHAR _socktype(sock->type));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "peer-address",
						  ISC_XMLCHAR peerbuf);
		}

		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "local-address",
						  ISC_XMLCHAR peerbuf);
		}

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
		if (sock->pending_recv)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						ISC_XMLCHAR "pending-receive");
		if (sock->pending_send)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending-send");
		/*
		 * NOTE(review): "pending_accept" uses an underscore while
		 * the other states use hyphens; left unchanged because the
		 * XML format is consumed externally.
		 */
		if (sock->pending_accept)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending_accept");
		if (sock->listener)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "listener");
		if (sock->connected)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connected");
		if (sock->connecting)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connecting");
		if (sock->bound)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "bound");

		xmlTextWriterEndElement(writer); /* states */

		xmlTextWriterEndElement(writer); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	xmlTextWriterEndElement(writer); /* sockets */

	UNLOCK(&mgr->lock);
}
#endif /* HAVE_LIBXML2 */