1/* 2 * Copyright (C) 2004-2012 Internet Systems Consortium, Inc. ("ISC") 3 * Copyright (C) 1998-2003 Internet Software Consortium. 4 * 5 * Permission to use, copy, modify, and/or distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 15 * PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18/* $Id$ */ 19 20/*! \file */ 21 22#include <config.h> 23 24#include <sys/param.h> 25#include <sys/types.h> 26#include <sys/socket.h> 27#include <sys/stat.h> 28#include <sys/time.h> 29#include <sys/uio.h> 30 31#include <errno.h> 32#include <fcntl.h> 33#include <stddef.h> 34#include <stdlib.h> 35#include <string.h> 36#include <unistd.h> 37 38#include <isc/buffer.h> 39#include <isc/bufferlist.h> 40#include <isc/condition.h> 41#include <isc/formatcheck.h> 42#include <isc/list.h> 43#include <isc/log.h> 44#include <isc/mem.h> 45#include <isc/msgs.h> 46#include <isc/mutex.h> 47#include <isc/net.h> 48#include <isc/once.h> 49#include <isc/platform.h> 50#include <isc/print.h> 51#include <isc/region.h> 52#include <isc/socket.h> 53#include <isc/stats.h> 54#include <isc/strerror.h> 55#include <isc/task.h> 56#include <isc/thread.h> 57#include <isc/util.h> 58#include <isc/xml.h> 59 60#ifdef ISC_PLATFORM_HAVESYSUNH 61#include <sys/un.h> 62#endif 63#ifdef ISC_PLATFORM_HAVEKQUEUE 64#include <sys/event.h> 65#endif 66#ifdef ISC_PLATFORM_HAVEEPOLL 67#include <sys/epoll.h> 68#endif 69#ifdef ISC_PLATFORM_HAVEDEVPOLL 70#if 
defined(HAVE_SYS_DEVPOLL_H) 71#include <sys/devpoll.h> 72#elif defined(HAVE_DEVPOLL_H) 73#include <devpoll.h> 74#endif 75#endif 76 77#include "errno2result.h" 78 79/* See task.c about the following definition: */ 80#ifdef BIND9 81#ifdef ISC_PLATFORM_USETHREADS 82#define USE_WATCHER_THREAD 83#else 84#define USE_SHARED_MANAGER 85#endif /* ISC_PLATFORM_USETHREADS */ 86#endif /* BIND9 */ 87 88#ifndef USE_WATCHER_THREAD 89#include "socket_p.h" 90#include "../task_p.h" 91#endif /* USE_WATCHER_THREAD */ 92 93#if defined(SO_BSDCOMPAT) && defined(__linux__) 94#include <sys/utsname.h> 95#endif 96 97/*% 98 * Choose the most preferable multiplex method. 99 */ 100#ifdef ISC_PLATFORM_HAVEKQUEUE 101#define USE_KQUEUE 102#elif defined (ISC_PLATFORM_HAVEEPOLL) 103#define USE_EPOLL 104#elif defined (ISC_PLATFORM_HAVEDEVPOLL) 105#define USE_DEVPOLL 106typedef struct { 107 unsigned int want_read : 1, 108 want_write : 1; 109} pollinfo_t; 110#else 111#define USE_SELECT 112#endif /* ISC_PLATFORM_HAVEKQUEUE */ 113 114#ifndef USE_WATCHER_THREAD 115#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 116struct isc_socketwait { 117 int nevents; 118}; 119#elif defined (USE_SELECT) 120struct isc_socketwait { 121 fd_set *readset; 122 fd_set *writeset; 123 int nfds; 124 int maxfd; 125}; 126#endif /* USE_KQUEUE */ 127#endif /* !USE_WATCHER_THREAD */ 128 129/*% 130 * Maximum number of allowable open sockets. This is also the maximum 131 * allowable socket file descriptor. 132 * 133 * Care should be taken before modifying this value for select(): 134 * The API standard doesn't ensure select() accept more than (the system default 135 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in 136 * the vast majority of cases. 
This constant should therefore be increased only 137 * when absolutely necessary and possible, i.e., the server is exhausting all 138 * available file descriptors (up to FD_SETSIZE) and the select() function 139 * and FD_xxx macros support larger values than FD_SETSIZE (which may not 140 * always by true, but we keep using some of them to ensure as much 141 * portability as possible). Note also that overall server performance 142 * may be rather worsened with a larger value of this constant due to 143 * inherent scalability problems of select(). 144 * 145 * As a special note, this value shouldn't have to be touched if 146 * this is a build for an authoritative only DNS server. 147 */ 148#ifndef ISC_SOCKET_MAXSOCKETS 149#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 150#define ISC_SOCKET_MAXSOCKETS 4096 151#elif defined(USE_SELECT) 152#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE 153#endif /* USE_KQUEUE... */ 154#endif /* ISC_SOCKET_MAXSOCKETS */ 155 156#ifdef USE_SELECT 157/*% 158 * Mac OS X needs a special definition to support larger values in select(). 159 * We always define this because a larger value can be specified run-time. 160 */ 161#ifdef __APPLE__ 162#define _DARWIN_UNLIMITED_SELECT 163#endif /* __APPLE__ */ 164#endif /* USE_SELECT */ 165 166#ifdef ISC_SOCKET_USE_POLLWATCH 167/*% 168 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel 169 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for 170 * some of the specified FD. The idea is based on the observation that it's 171 * likely for a busy server to keep receiving packets. It specifically works 172 * as follows: the socket watcher is first initialized with the state of 173 * "poll_idle". While it's in the idle state it keeps sleeping until a socket 174 * event occurs. When it wakes up for a socket I/O event, it moves to the 175 * poll_active state, and sets the poll timeout to a short period 176 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). 
If timeout occurs in this state, the 177 * watcher goes to the poll_checking state with the same timeout period. 178 * In this state, the watcher tries to detect whether this is a break 179 * during intermittent events or the kernel bug is triggered. If the next 180 * polling reports an event within the short period, the previous timeout is 181 * likely to be a kernel bug, and so the watcher goes back to the active state. 182 * Otherwise, it moves to the idle state again. 183 * 184 * It's not clear whether this is a thread-related bug, but since we've only 185 * seen this with threads, this workaround is used only when enabling threads. 186 */ 187 188typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; 189 190#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT 191#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 192#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ 193#endif /* ISC_SOCKET_USE_POLLWATCH */ 194 195/*% 196 * Size of per-FD lock buckets. 197 */ 198#ifdef ISC_PLATFORM_USETHREADS 199#define FDLOCK_COUNT 1024 200#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) 201#else 202#define FDLOCK_COUNT 1 203#define FDLOCK_ID(fd) 0 204#endif /* ISC_PLATFORM_USETHREADS */ 205 206/*% 207 * Maximum number of events communicated with the kernel. There should normally 208 * be no need for having a large number. 209 */ 210#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 211#ifndef ISC_SOCKET_MAXEVENTS 212#define ISC_SOCKET_MAXEVENTS 64 213#endif 214#endif 215 216/*% 217 * Some systems define the socket length argument as an int, some as size_t, 218 * some as socklen_t. This is here so it can be easily changed if needed. 219 */ 220#ifndef ISC_SOCKADDR_LEN_T 221#define ISC_SOCKADDR_LEN_T unsigned int 222#endif 223 224/*% 225 * Define what the possible "soft" errors can be. These are non-fatal returns 226 * of various network related functions, like recv() and so on. 
227 * 228 * For some reason, BSDI (and perhaps others) will sometimes return <0 229 * from recv() but will have errno==0. This is broken, but we have to 230 * work around it here. 231 */ 232#define SOFT_ERROR(e) ((e) == EAGAIN || \ 233 (e) == EWOULDBLOCK || \ 234 (e) == EINTR || \ 235 (e) == 0) 236 237#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x) 238 239/*!< 240 * DLVL(90) -- Function entry/exit and other tracing. 241 * DLVL(70) -- Socket "correctness" -- including returning of events, etc. 242 * DLVL(60) -- Socket data send/receive 243 * DLVL(50) -- Event tracing, including receiving/sending completion events. 244 * DLVL(20) -- Socket creation/destruction. 245 */ 246#define TRACE_LEVEL 90 247#define CORRECTNESS_LEVEL 70 248#define IOEVENT_LEVEL 60 249#define EVENT_LEVEL 50 250#define CREATION_LEVEL 20 251 252#define TRACE DLVL(TRACE_LEVEL) 253#define CORRECTNESS DLVL(CORRECTNESS_LEVEL) 254#define IOEVENT DLVL(IOEVENT_LEVEL) 255#define EVENT DLVL(EVENT_LEVEL) 256#define CREATION DLVL(CREATION_LEVEL) 257 258typedef isc_event_t intev_t; 259 260#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o') 261#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC) 262 263/*! 264 * IPv6 control information. If the socket is an IPv6 socket we want 265 * to collect the destination address and interface so the client can 266 * set them on outgoing packets. 267 */ 268#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 269#ifndef USE_CMSG 270#define USE_CMSG 1 271#endif 272#endif 273 274/*% 275 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have 276 * a setsockopt() like interface to request timestamps, and if the OS 277 * doesn't do it for us, call gettimeofday() on every UDP receive? 278 */ 279#ifdef SO_TIMESTAMP 280#ifndef USE_CMSG 281#define USE_CMSG 1 282#endif 283#endif 284 285/*% 286 * The size to raise the receive buffer to (from BIND 8). 
 */
#define RCVBUFSIZE (32*1024)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

/*%
 * Internal representation of a socket.  The public isc_socket_t ('common')
 * must be the first member so that the two can be safely cast to each other.
 * Fields below the "Locked by socket lock" marker must only be touched while
 * holding 'lock'.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t		common;		/* public view; magic + methods */
	isc__socketmgr_t	*manager;	/* owning manager (never changes) */
	isc_mutex_t		lock;
	isc_sockettype_t	type;		/* udp/tcp/unix/fdwatch */
	const isc_statscounter_t	*statsindex;	/* one of the *statsindex arrays */

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t)	link;		/* manager->socklist linkage */
	unsigned int		references;	/* refcount; 0 => dead */
	int			fd;		/* OS descriptor, -1 if closed */
	int			pf;		/* protocol family (AF_INET etc.) */
	char			name[16];	/* debugging name, see socket_setname */
	void *			tag;		/* opaque user tag */

	/* Pending I/O queues; completion events are posted from these. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		*connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;	/* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1,	/* listener socket */
				connected : 1,
				connecting : 1,	/* connect pending */
				bound : 1,	/* bound to local addr */
				dupped : 1;	/* created via isc_socket_dup() */

#ifdef ISC_NET_RECVOVERFLOW
	unsigned char		overflow;	/* used for MSG_TRUNC fake */
#endif

	/* Ancillary-data (cmsg) buffers used by build_msghdr_*()/process_cmsg(). */
	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;

	/* fdwatch-type sockets only: user callback and its task/argument. */
	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
};

#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*%
 * Socket manager: owns the per-FD tables and whichever kernel event
 * mechanism (kqueue/epoll/devpoll/select) was selected at build time.
 * Lock order: fdlock[] buckets must not be held while calling into code
 * that takes the manager lock (see wakeup_socket()/unwatch_fd()).
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t		common;		/* public view; must be first */
	isc_mem_t		*mctx;
	isc_mutex_t		lock;
	isc_mutex_t		*fdlock;	/* FDLOCK_COUNT bucket locks */
	isc_stats_t		*stats;		/* may be NULL */
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
#ifdef USE_SELECT
	int			fd_bufsize;	/* byte size of the fd_set buffers */
#endif	/* USE_SELECT */
	unsigned int		maxsocks;	/* upper bound on FD values */
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];	/* watcher-thread wakeup pipe */
#endif

	/* Locked by fdlock. */
	isc__socket_t		**fds;		/* fd -> socket map */
	int			*fdstate;	/* CLOSED/MANAGED/CLOSE_PENDING per fd */
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;	/* which events are registered per fd */
#endif

	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t)	socklist;
#ifdef USE_SELECT
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
	int			maxfd;
#endif	/* USE_SELECT */
	int			reserved;	/* unlocked */
#ifdef USE_WATCHER_THREAD
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
#else /* USE_WATCHER_THREAD */
	unsigned int		refs;
#endif /* USE_WATCHER_THREAD */
	int			maxudp;		/* debug: clamp UDP sends to this size */
};

#ifdef USE_SHARED_MANAGER
static isc__socketmgr_t *socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

/* Values for isc__socketmgr_t.fdstate[]. */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
static void free_socket(isc__socket_t **);
static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
				    isc__socket_t **);
static void destroy(isc__socket_t **);
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
#ifdef USE_WATCHER_THREAD
static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
#endif

/*%
 * The following can be either static or public, depending on build environment.
 */

#ifdef BIND9
#define ISC_SOCKETFUNC_SCOPE
#else
#define ISC_SOCKETFUNC_SCOPE static
#endif

/*
 * Forward declarations of the implementation functions.  In a BIND9 build
 * they are exported symbols; otherwise they are file-static and reached
 * only through the method tables below.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE void
isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE void
isc__socket_detach(isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		       unsigned int maxsocks);
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_destroy(isc_socketmgr_t **managerp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		  unsigned int minimum, isc_task_t *task,
		  isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
		 isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
		  unsigned int minimum, isc_task_t *task,
		  isc_socketevent_t *event, unsigned int flags);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
		 isc_task_t *task, isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
		   isc_task_t *task, isc_taskaction_t action, const void *arg,
		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		  isc_task_t *task, isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
		    isc_task_t *task, isc_taskaction_t action, const void *arg,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
		    isc_task_t *task,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		    isc_socketevent_t *event, unsigned int flags);
ISC_SOCKETFUNC_SCOPE void
isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		     isc_uint32_t owner, isc_uint32_t group);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
		 unsigned int options);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_filter(isc_socket_t *sock, const char *filter);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_accept(isc_socket_t *sock,
		   isc_task_t *task, isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
		    isc_task_t *task, isc_taskaction_t action,
		    const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
ISC_SOCKETFUNC_SCOPE void
isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
ISC_SOCKETFUNC_SCOPE isc_sockettype_t
isc__socket_gettype(isc_socket_t *sock);
ISC_SOCKETFUNC_SCOPE isc_boolean_t
isc__socket_isbound(isc_socket_t *sock);
ISC_SOCKETFUNC_SCOPE void
isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
#if defined(HAVE_LIBXML2) && defined(BIND9)
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
#endif

ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE int
isc__socket_getfd(isc_socket_t *sock);

/*%
 * Method table installed into every socket's 'common' header; the public
 * isc_socket_*() wrappers dispatch through it.  Entry order must match
 * isc_socketmethods_t exactly.
 */
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
#ifndef BIND9
	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
	     *listen, *accept, *getpeername, *isbound;
#endif
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_sendto,
		isc__socket_connect,
		isc__socket_recv,
		isc__socket_cancel,
		isc__socket_getsockname,
		isc__socket_gettype,
		isc__socket_ipv6only,
		isc__socket_fdwatchpoke,
		isc__socket_dup,
		isc__socket_getfd
	}
#ifndef BIND9
	,
	(void *)isc__socket_recvv, (void *)isc__socket_send,
	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
	(void *)isc__socket_filter, (void *)isc__socket_listen,
	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
	(void *)isc__socket_isbound
#endif
};

static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
	isc__socket_create,
	isc__socket_fdwatchcreate
};

/*
 * Messages passed through select_poke()/select_readmsg(); non-negative
 * values are FD numbers, these negatives are control commands.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)

#define
SOCK_DEAD(s)			((s)->references == 0)

/*%
 * Shortcut index arrays to get access to statistics counters.
 * Each *statsindex array below is indexed by these STATID_* values;
 * a slot of -1 means "no such counter for this socket type" and must
 * never be passed to inc_stats() (see the REQUIRE there).
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9
};
/*
 * NOTE: "upd4"/"upd6" are long-standing misspellings of "udp4"/"udp6";
 * the names are referenced elsewhere in this file, so renaming them here
 * alone would break the build.
 */
static const isc_statscounter_t upd4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail
};
static const isc_statscounter_t upd6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
	isc_sockstatscounter_tcp4recvfail
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
	isc_sockstatscounter_tcp6recvfail
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
	isc_sockstatscounter_unixrecvfail
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
	isc_sockstatscounter_fdwatchrecvfail
};

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
	defined(USE_WATCHER_THREAD)
/*%
 * Log a printf-style message tagged with the socket manager's address.
 * Cheap early-out via isc_log_wouldlog() avoids the formatting cost when
 * the level is not being logged.
 */
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}
#endif

/*%
 * Log a message-catalog message tagged with the socket's address and,
 * when 'address' is non-NULL, the formatted peer address.
 */
static void
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
static void
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	/* Only IPv6 UDP sockets are affected. */
	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	/* Re-assert IPV6_RECVPKTINFO; failure is logged but not fatal. */
	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) < 0) {

		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
	}
}
#else
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif

/*%
 * Increment socket-related statistics counters.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	/* -1 marks "no counter" slots in the *statsindex arrays. */
	REQUIRE(counterid != -1);

	if (stats != NULL)
		isc_stats_increment(stats, counterid);
}

/*%
 * Register interest in read (SELECT_POKE_READ) or write (SELECT_POKE_WRITE)
 * readiness for 'fd' with the build-time-selected kernel event mechanism.
 * Returns ISC_R_SUCCESS or an errno-derived result on registration failure.
 * devpoll: takes the per-FD bucket lock to keep fdpollinfo[] in sync;
 * select: takes the manager lock to update the shared fd_sets.
 */
static inline isc_result_t
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/* EEXIST just means the fd is already registered; not an error. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	/* fdlock protects fdpollinfo[], which mirrors what the kernel has. */
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*%
 * Counterpart of watch_fd(): cancel interest in read or write readiness
 * for 'fd'.  Errors other than "not registered" are reported to the caller.
 */
static inline isc_result_t
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/* ENOENT just means the fd was never (or no longer) registered. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*%
 * Act on a poke message for 'fd': close the descriptor (SELECT_POKE_CLOSE)
 * or start watching it for reads/writes.  Called from the watcher loop
 * (or directly when there is no watcher thread).
 */
static void
wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
		return;
	}
	UNLOCK(&manager->fdlock[lockid]);

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}

#ifdef USE_WATCHER_THREAD
/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 */
static void
select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
			    strbuf);
	}

	/* POSIX guarantees a pipe write of this size is not partial. */
	INSIST(cc == sizeof(buf));
}

/*
 * Read a message on the internal fd.
 */
static void
select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		/* Soft errors just mean "nothing queued right now". */
		*msg = SELECT_POKE_NOTHING;
		*fd = -1;	/* Silence compiler. */
		if (SOFT_ERROR(errno))
			return;

		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
			    strbuf);

		return;
	}
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}
#else /* USE_WATCHER_THREAD */
/*
 * Update the state of the socketmgr when something changes.
 * Without a watcher thread the poke is applied synchronously.
 */
static void
select_poke(isc__socketmgr_t *manager, int fd, int msg) {
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
	return;
}
#endif /* USE_WATCHER_THREAD */

/*
 * Make a fd non-blocking.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	int flags;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;

	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
	/* PORT_NONBLOCK is presumably a platform alias for O_NONBLOCK. */
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif

	if (ret == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	ISC_SOCKADDR_LEN_T hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif
}

static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	/*
	 * Emulate CMSG_SPACE by building a dummy control buffer and letting
	 * CMSG_NXTHDR compute where the next (aligned) header would start.
	 */
	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL)
		return ((char *)cmsgp - (char *)msg.msg_control);
	else
		return (0);
#endif
}
#endif /* USE_CMSG */

/*
 * Process control messages received on a socket: record truncation flags
 * and pull out IPV6_PKTINFO / SO_TIMESTAMP ancillary data into 'dev'.
 */
static void
process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

#ifdef ISC_NET_BSD44MSGHDR

#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
#endif

#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
#endif

#ifndef USE_CMSG
	return;
#else
	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
		return;

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);

#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {

			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

#endif /* ISC_NET_BSD44MSGHDR */
}

/*
 * Construct an iov array
 * and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;

	memset(msg, 0, sizeof(*msg));

	/* Unconnected sockets need an explicit destination address. */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	/* Gather the remaining (unsent) data into the iovec array. */
	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0U);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	/* For UDP with pktinfo, tell the kernel which interface to use. */
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
#if defined(IPV6_USE_MIN_MTU)
		int use_min_mtu = 1;	/* -1, 0, 1 */
#endif
		struct cmsghdr *cmsgp;
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
		msg->msg_control = (void *)sock->sendcmsgbuf;

		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
#if defined(IPV6_USE_MIN_MTU)
		/*
		 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
		 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
		 * is used.
		 */
		cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
					   msg->msg_controllen);
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
#endif
	}
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
#else /* ISC_NET_BSD44MSGHDR */
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
#endif /* ISC_NET_BSD44MSGHDR */

	if (write_countp != NULL)
		*write_countp = write_count;
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
1438 */ 1439static void 1440build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev, 1441 struct msghdr *msg, struct iovec *iov, size_t *read_countp) 1442{ 1443 unsigned int iovcount; 1444 isc_buffer_t *buffer; 1445 isc_region_t available; 1446 size_t read_count; 1447 1448 memset(msg, 0, sizeof(struct msghdr)); 1449 1450 if (sock->type == isc_sockettype_udp) { 1451 memset(&dev->address, 0, sizeof(dev->address)); 1452#ifdef BROKEN_RECVMSG 1453 if (sock->pf == AF_INET) { 1454 msg->msg_name = (void *)&dev->address.type.sin; 1455 msg->msg_namelen = sizeof(dev->address.type.sin6); 1456 } else if (sock->pf == AF_INET6) { 1457 msg->msg_name = (void *)&dev->address.type.sin6; 1458 msg->msg_namelen = sizeof(dev->address.type.sin6); 1459#ifdef ISC_PLATFORM_HAVESYSUNH 1460 } else if (sock->pf == AF_UNIX) { 1461 msg->msg_name = (void *)&dev->address.type.sunix; 1462 msg->msg_namelen = sizeof(dev->address.type.sunix); 1463#endif 1464 } else { 1465 msg->msg_name = (void *)&dev->address.type.sa; 1466 msg->msg_namelen = sizeof(dev->address.type); 1467 } 1468#else 1469 msg->msg_name = (void *)&dev->address.type.sa; 1470 msg->msg_namelen = sizeof(dev->address.type); 1471#endif 1472#ifdef ISC_NET_RECVOVERFLOW 1473 /* If needed, steal one iovec for overflow detection. */ 1474 maxiov--; 1475#endif 1476 } else { /* TCP */ 1477 msg->msg_name = NULL; 1478 msg->msg_namelen = 0; 1479 dev->address = sock->peer_address; 1480 } 1481 1482 buffer = ISC_LIST_HEAD(dev->bufferlist); 1483 read_count = 0; 1484 1485 /* 1486 * Single buffer I/O? Skip what we've done so far in this region. 1487 */ 1488 if (buffer == NULL) { 1489 read_count = dev->region.length - dev->n; 1490 iov[0].iov_base = (void *)(dev->region.base + dev->n); 1491 iov[0].iov_len = read_count; 1492 iovcount = 1; 1493 1494 goto config; 1495 } 1496 1497 /* 1498 * Multibuffer I/O. 1499 * Skip empty buffers. 
1500 */ 1501 while (buffer != NULL) { 1502 REQUIRE(ISC_BUFFER_VALID(buffer)); 1503 if (isc_buffer_availablelength(buffer) != 0) 1504 break; 1505 buffer = ISC_LIST_NEXT(buffer, link); 1506 } 1507 1508 iovcount = 0; 1509 while (buffer != NULL) { 1510 INSIST(iovcount < MAXSCATTERGATHER_RECV); 1511 1512 isc_buffer_availableregion(buffer, &available); 1513 1514 if (available.length > 0) { 1515 iov[iovcount].iov_base = (void *)(available.base); 1516 iov[iovcount].iov_len = available.length; 1517 read_count += available.length; 1518 iovcount++; 1519 } 1520 buffer = ISC_LIST_NEXT(buffer, link); 1521 } 1522 1523 config: 1524 1525 /* 1526 * If needed, set up to receive that one extra byte. Note that 1527 * we know there is at least one iov left, since we stole it 1528 * at the top of this function. 1529 */ 1530#ifdef ISC_NET_RECVOVERFLOW 1531 if (sock->type == isc_sockettype_udp) { 1532 iov[iovcount].iov_base = (void *)(&sock->overflow); 1533 iov[iovcount].iov_len = 1; 1534 iovcount++; 1535 } 1536#endif 1537 1538 msg->msg_iov = iov; 1539 msg->msg_iovlen = iovcount; 1540 1541#ifdef ISC_NET_BSD44MSGHDR 1542 msg->msg_control = NULL; 1543 msg->msg_controllen = 0; 1544 msg->msg_flags = 0; 1545#if defined(USE_CMSG) 1546 if (sock->type == isc_sockettype_udp) { 1547 msg->msg_control = sock->recvcmsgbuf; 1548 msg->msg_controllen = sock->recvcmsgbuflen; 1549 } 1550#endif /* USE_CMSG */ 1551#else /* ISC_NET_BSD44MSGHDR */ 1552 msg->msg_accrights = NULL; 1553 msg->msg_accrightslen = 0; 1554#endif /* ISC_NET_BSD44MSGHDR */ 1555 1556 if (read_countp != NULL) 1557 *read_countp = read_count; 1558} 1559 1560static void 1561set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock, 1562 isc_socketevent_t *dev) 1563{ 1564 if (sock->type == isc_sockettype_udp) { 1565 if (address != NULL) 1566 dev->address = *address; 1567 else 1568 dev->address = sock->peer_address; 1569 } else if (sock->type == isc_sockettype_tcp) { 1570 INSIST(address == NULL); 1571 dev->address = sock->peer_address; 1572 
} 1573} 1574 1575static void 1576destroy_socketevent(isc_event_t *event) { 1577 isc_socketevent_t *ev = (isc_socketevent_t *)event; 1578 1579 INSIST(ISC_LIST_EMPTY(ev->bufferlist)); 1580 1581 (ev->destroy)(event); 1582} 1583 1584static isc_socketevent_t * 1585allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype, 1586 isc_taskaction_t action, const void *arg) 1587{ 1588 isc_socketevent_t *ev; 1589 1590 ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx, 1591 sock, eventtype, 1592 action, arg, 1593 sizeof(*ev)); 1594 1595 if (ev == NULL) 1596 return (NULL); 1597 1598 ev->result = ISC_R_UNSET; 1599 ISC_LINK_INIT(ev, ev_link); 1600 ISC_LIST_INIT(ev->bufferlist); 1601 ev->region.base = NULL; 1602 ev->n = 0; 1603 ev->offset = 0; 1604 ev->attributes = 0; 1605 ev->destroy = ev->ev_destroy; 1606 ev->ev_destroy = destroy_socketevent; 1607 1608 return (ev); 1609} 1610 1611#if defined(ISC_SOCKET_DEBUG) 1612static void 1613dump_msg(struct msghdr *msg) { 1614 unsigned int i; 1615 1616 printf("MSGHDR %p\n", msg); 1617 printf("\tname %p, namelen %ld\n", msg->msg_name, 1618 (long) msg->msg_namelen); 1619 printf("\tiov %p, iovlen %ld\n", msg->msg_iov, 1620 (long) msg->msg_iovlen); 1621 for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) 1622 printf("\t\t%d\tbase %p, len %ld\n", i, 1623 msg->msg_iov[i].iov_base, 1624 (long) msg->msg_iov[i].iov_len); 1625#ifdef ISC_NET_BSD44MSGHDR 1626 printf("\tcontrol %p, controllen %ld\n", msg->msg_control, 1627 (long) msg->msg_controllen); 1628#endif 1629} 1630#endif 1631 1632#define DOIO_SUCCESS 0 /* i/o ok, event sent */ 1633#define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */ 1634#define DOIO_HARD 2 /* i/o error, event sent */ 1635#define DOIO_EOF 3 /* EOF, no event sent */ 1636 1637static int 1638doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) { 1639 int cc; 1640 struct iovec iov[MAXSCATTERGATHER_RECV]; 1641 size_t read_count; 1642 size_t actual_count; 1643 struct msghdr msghdr; 1644 isc_buffer_t 
*buffer; 1645 int recv_errno; 1646 char strbuf[ISC_STRERRORSIZE]; 1647 1648 build_msghdr_recv(sock, dev, &msghdr, iov, &read_count); 1649 1650#if defined(ISC_SOCKET_DEBUG) 1651 dump_msg(&msghdr); 1652#endif 1653 1654 cc = recvmsg(sock->fd, &msghdr, 0); 1655 recv_errno = errno; 1656 1657#if defined(ISC_SOCKET_DEBUG) 1658 dump_msg(&msghdr); 1659#endif 1660 1661 if (cc < 0) { 1662 if (SOFT_ERROR(recv_errno)) 1663 return (DOIO_SOFT); 1664 1665 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { 1666 isc__strerror(recv_errno, strbuf, sizeof(strbuf)); 1667 socket_log(sock, NULL, IOEVENT, 1668 isc_msgcat, ISC_MSGSET_SOCKET, 1669 ISC_MSG_DOIORECV, 1670 "doio_recv: recvmsg(%d) %d bytes, err %d/%s", 1671 sock->fd, cc, recv_errno, strbuf); 1672 } 1673 1674#define SOFT_OR_HARD(_system, _isc) \ 1675 if (recv_errno == _system) { \ 1676 if (sock->connected) { \ 1677 dev->result = _isc; \ 1678 inc_stats(sock->manager->stats, \ 1679 sock->statsindex[STATID_RECVFAIL]); \ 1680 return (DOIO_HARD); \ 1681 } \ 1682 return (DOIO_SOFT); \ 1683 } 1684#define ALWAYS_HARD(_system, _isc) \ 1685 if (recv_errno == _system) { \ 1686 dev->result = _isc; \ 1687 inc_stats(sock->manager->stats, \ 1688 sock->statsindex[STATID_RECVFAIL]); \ 1689 return (DOIO_HARD); \ 1690 } 1691 1692 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED); 1693 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH); 1694 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH); 1695 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN); 1696 /* HPUX 11.11 can return EADDRNOTAVAIL. */ 1697 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 1698 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); 1699 /* 1700 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6 1701 * errors. 
1702 */ 1703#ifdef EPROTO 1704 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH); 1705#endif 1706 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH); 1707 1708#undef SOFT_OR_HARD 1709#undef ALWAYS_HARD 1710 1711 dev->result = isc__errno2result(recv_errno); 1712 inc_stats(sock->manager->stats, 1713 sock->statsindex[STATID_RECVFAIL]); 1714 return (DOIO_HARD); 1715 } 1716 1717 /* 1718 * On TCP and UNIX sockets, zero length reads indicate EOF, 1719 * while on UDP sockets, zero length reads are perfectly valid, 1720 * although strange. 1721 */ 1722 switch (sock->type) { 1723 case isc_sockettype_tcp: 1724 case isc_sockettype_unix: 1725 if (cc == 0) 1726 return (DOIO_EOF); 1727 break; 1728 case isc_sockettype_udp: 1729 break; 1730 case isc_sockettype_fdwatch: 1731 default: 1732 INSIST(0); 1733 } 1734 1735 if (sock->type == isc_sockettype_udp) { 1736 dev->address.length = msghdr.msg_namelen; 1737 if (isc_sockaddr_getport(&dev->address) == 0) { 1738 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { 1739 socket_log(sock, &dev->address, IOEVENT, 1740 isc_msgcat, ISC_MSGSET_SOCKET, 1741 ISC_MSG_ZEROPORT, 1742 "dropping source port zero packet"); 1743 } 1744 return (DOIO_SOFT); 1745 } 1746 /* 1747 * Simulate a firewall blocking UDP responses bigger than 1748 * 512 bytes. 1749 */ 1750 if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp) 1751 return (DOIO_SOFT); 1752 } 1753 1754 socket_log(sock, &dev->address, IOEVENT, 1755 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV, 1756 "packet received correctly"); 1757 1758 /* 1759 * Overflow bit detection. If we received MORE bytes than we should, 1760 * this indicates an overflow situation. Set the flag in the 1761 * dev entry and adjust how much we read by one. 
1762 */ 1763#ifdef ISC_NET_RECVOVERFLOW 1764 if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) { 1765 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 1766 cc--; 1767 } 1768#endif 1769 1770 /* 1771 * If there are control messages attached, run through them and pull 1772 * out the interesting bits. 1773 */ 1774 if (sock->type == isc_sockettype_udp) 1775 process_cmsg(sock, &msghdr, dev); 1776 1777 /* 1778 * update the buffers (if any) and the i/o count 1779 */ 1780 dev->n += cc; 1781 actual_count = cc; 1782 buffer = ISC_LIST_HEAD(dev->bufferlist); 1783 while (buffer != NULL && actual_count > 0U) { 1784 REQUIRE(ISC_BUFFER_VALID(buffer)); 1785 if (isc_buffer_availablelength(buffer) <= actual_count) { 1786 actual_count -= isc_buffer_availablelength(buffer); 1787 isc_buffer_add(buffer, 1788 isc_buffer_availablelength(buffer)); 1789 } else { 1790 isc_buffer_add(buffer, actual_count); 1791 actual_count = 0; 1792 POST(actual_count); 1793 break; 1794 } 1795 buffer = ISC_LIST_NEXT(buffer, link); 1796 if (buffer == NULL) { 1797 INSIST(actual_count == 0U); 1798 } 1799 } 1800 1801 /* 1802 * If we read less than we expected, update counters, 1803 * and let the upper layer poke the descriptor. 1804 */ 1805 if (((size_t)cc != read_count) && (dev->n < dev->minimum)) 1806 return (DOIO_SOFT); 1807 1808 /* 1809 * Full reads are posted, or partials if partials are ok. 1810 */ 1811 dev->result = ISC_R_SUCCESS; 1812 return (DOIO_SUCCESS); 1813} 1814 1815/* 1816 * Returns: 1817 * DOIO_SUCCESS The operation succeeded. dev->result contains 1818 * ISC_R_SUCCESS. 1819 * 1820 * DOIO_HARD A hard or unexpected I/O error was encountered. 1821 * dev->result contains the appropriate error. 1822 * 1823 * DOIO_SOFT A soft I/O error was encountered. No senddone 1824 * event was sent. The operation should be retried. 1825 * 1826 * No other return values are possible. 
 */
static int
doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);

 resend:
	cc = sendmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later library calls may clobber it. */
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry a bounded number of times on EINTR. */
		if (send_errno == EINTR && ++attempts < NRETRIES)
			goto resend;

		if (SOFT_ERROR(send_errno))
			return (DOIO_SOFT);

/* Connected sockets treat these as hard errors; unconnected retry. */
#define SOFT_OR_HARD(_system, _isc) \
	if (send_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (send_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		isc__strerror(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() %s 0",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_RETURNED, "returned"));
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Kill.
 *
 * Caller must ensure that the socket is not locked and no external
 * references exist.
 */
static void
closesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
	isc_sockettype_t type = sock->type;
	int lockid = FDLOCK_ID(fd);

	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&manager->fdlock[lockid]);
	manager->fds[fd] = NULL;
	if (type == isc_sockettype_fdwatch)
		manager->fdstate[fd] = CLOSED;
	else
		manager->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&manager->fdlock[lockid]);
	if (type == isc_sockettype_fdwatch) {
		/*
		 * The caller may close the socket once this function returns,
		 * and `fd' may be reassigned for a new socket.  So we do
		 * unwatch_fd() here, rather than defer it via select_poke().
		 * Note: this may complicate data protection among threads and
		 * may reduce performance due to additional locks.  One way to
		 * solve this would be to dup() the watched descriptor, but we
		 * take a simpler approach at this moment.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
	} else
		select_poke(manager, fd, SELECT_POKE_CLOSE);

	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&manager->lock);
	if (manager->maxfd == fd) {
		int i;

		/* Scan downward for the next highest managed descriptor. */
		manager->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&manager->fdlock[lockid]);
			if (manager->fdstate[i] == MANAGED) {
				manager->maxfd = i;
				UNLOCK(&manager->fdlock[lockid]);
				break;
			}
			UNLOCK(&manager->fdlock[lockid]);
		}
#ifdef ISC_PLATFORM_USETHREADS
		/* The watcher pipe must always remain in the select set. */
		if (manager->maxfd < manager->pipe_fds[0])
			manager->maxfd = manager->pipe_fds[0];
#endif
	}
	UNLOCK(&manager->lock);
#endif	/* USE_SELECT */
}

/*
 * Tear down a socket with no remaining external references: close its
 * descriptor, unlink it from the manager, and free its memory.
 */
static void
destroy(isc__socket_t **sockp) {
	int fd;
	isc__socket_t *sock = *sockp;
	isc__socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DESTROYING, "destroying");

	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		fd = sock->fd;
		sock->fd = -1;
		closesocket(manager, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

#ifdef USE_WATCHER_THREAD
	if (ISC_LIST_EMPTY(manager->socklist))
		SIGNAL(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/* can't unlock manager as its memory context is still used */
	free_socket(sockp);

	UNLOCK(&manager->lock);
}

/*
 * Allocate and initialize a new isc__socket_t of the given type (no fd is
 * opened here).  On failure all partial allocations are released.
 */
static isc_result_t
allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
		isc__socket_t **socketp)
{
	isc__socket_t *sock;
	isc_result_t result;
	ISC_SOCKADDR_LEN_T cmsgbuflen;

	sock = isc_mem_get(manager->mctx, sizeof(*sock));

	if (sock == NULL)
		return (ISC_R_NOMEMORY);

	sock->common.magic = 0;
	sock->common.impmagic = 0;
	sock->references = 0;

	sock->manager = manager;
	sock->type = type;
	sock->fd = -1;
	sock->dupped = 0;
	sock->statsindex = NULL;

	ISC_LINK_INIT(sock, link);

	sock->recvcmsgbuf = NULL;
	sock->sendcmsgbuf = NULL;

	/*
	 * set up cmsg buffers; receive side needs room for pktinfo and
	 * timestamp ancillary data.
	 */
	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#endif
#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
	cmsgbuflen += cmsg_space(sizeof(struct timeval));
#endif
	sock->recvcmsgbuflen = cmsgbuflen;
	if (sock->recvcmsgbuflen != 0U) {
		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->recvcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#if defined(IPV6_USE_MIN_MTU)
	/*
	 * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
	 * support.
	 */
	cmsgbuflen += cmsg_space(sizeof(int));
#endif
#endif
	sock->sendcmsgbuflen = cmsgbuflen;
	if (sock->sendcmsgbuflen != 0U) {
		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->sendcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;

	/*
	 * set up list of readers and writers to be initially empty
	 */
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
	ISC_LIST_INIT(sock->accept_list);
	sock->connect_ev = NULL;
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;

	/*
	 * initialize the lock
	 */
	result = isc_mutex_init(&sock->lock);
	if (result != ISC_R_SUCCESS) {
		sock->common.magic = 0;
		sock->common.impmagic = 0;
		goto error;
	}

	/*
	 * Initialize readable and writable events
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
		       NULL, sock, sock, NULL, NULL);
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
		       NULL, sock, sock, NULL, NULL);

	sock->common.magic = ISCAPI_SOCKET_MAGIC;
	sock->common.impmagic = SOCKET_MAGIC;
	*socketp = sock;

	return (ISC_R_SUCCESS);

 error:
	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);
	isc_mem_put(manager->mctx, sock, sizeof(*sock));

	return (result);
}

/*
 * This event requires that the various lists be empty, that the reference
 * count be 1, and that the magic number is valid.  The other socket bits,
 * like the lock, must be initialized as well.  The fd associated must be
 * marked as closed, by setting it to -1 on close, or this routine will
 * also close the socket.
 */
static void
free_socket(isc__socket_t **socketp) {
	isc__socket_t *sock = *socketp;

	INSIST(sock->references == 0);
	INSIST(VALID_SOCKET(sock));
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(!ISC_LINK_LINKED(sock, link));

	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);

	sock->common.magic = 0;
	sock->common.impmagic = 0;

	DESTROYLOCK(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));

	*socketp = NULL;
}

#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  Having to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * One-time probe: on Linux 2.4+ the SO_BSDCOMPAT option is deprecated,
 * so clear the 'bsdcompat' flag there; on other platforms (or older
 * Linux) leave it set.  Run via isc_once_do() from opensocket().
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname buf;
	char *endp;
	long int major;
	long int minor;

	uname(&buf);    /* Can only fail if buf is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname(). */
	major = strtol(buf.release, &endp, 10);
	if (*endp == '.') {
		minor = strtol(endp+1, &endp, 10);
		if ((major > 2) || ((major == 2) && (minor >= 4))) {
			bsdcompat = ISC_FALSE;
		}
	}
#endif /* __linux __ */
}
#endif

/*
 * Open (or dup) the underlying OS descriptor for 'sock'.  When
 * 'dup_socket' is non-NULL the fd is dup()ed from it and option setup
 * is skipped (the options travel with the fd).  Otherwise a new fd of
 * the appropriate kind is created, moved above reserved/low fd ranges,
 * made non-blocking, and given the platform-dependent per-type socket
 * options.  Returns ISC_R_SUCCESS or a mapped failure code.
 */
static isc_result_t
opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
	   isc__socket_t *dup_socket)
{
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";	/* name of last syscall, for logs */
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif

 again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_fdwatch:
			/*
			 * We should not be called for isc_sockettype_fdwatch
			 * sockets.
			 */
			INSIST(0);
			break;
		}
	} else {
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
		goto again;

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno;	/* preserve errno across close() */
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl";
	}
#endif

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			       isc_msgcat, ISC_MSGSET_SOCKET,
			       ISC_MSG_TOOMANYFDS,
			       "socket: file descriptor exceeds limit (%d/%u)",
			       sock->fd, manager->maxsocks);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		/* Map errno to an ISC result; resource exhaustion and
		 * unsupported families are expected, anything else is not. */
		switch (errno) {
		case EMFILE:
		case ENFILE:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
			/*
			 * Linux 2.2 (and maybe others) return EINVAL instead of
			 * EAFNOSUPPORT.
			 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* A dup()ed fd already carries the original's options. */
	if (dup_socket != NULL)
		goto setup_done;

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (result);
	}

#ifdef SO_BSDCOMPAT
	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
				  clear_bsdcompat) == ISC_R_SUCCESS);
	if (sock->type != isc_sockettype_unix && bsdcompat &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on...
*/ 2391 } 2392#endif 2393 2394#if defined(USE_CMSG) || defined(SO_RCVBUF) 2395 if (sock->type == isc_sockettype_udp) { 2396 2397#if defined(USE_CMSG) 2398#if defined(SO_TIMESTAMP) 2399 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, 2400 (void *)&on, sizeof(on)) < 0 2401 && errno != ENOPROTOOPT) { 2402 isc__strerror(errno, strbuf, sizeof(strbuf)); 2403 UNEXPECTED_ERROR(__FILE__, __LINE__, 2404 "setsockopt(%d, SO_TIMESTAMP) %s: %s", 2405 sock->fd, 2406 isc_msgcat_get(isc_msgcat, 2407 ISC_MSGSET_GENERAL, 2408 ISC_MSG_FAILED, 2409 "failed"), 2410 strbuf); 2411 /* Press on... */ 2412 } 2413#endif /* SO_TIMESTAMP */ 2414 2415#if defined(ISC_PLATFORM_HAVEIPV6) 2416 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { 2417 /* 2418 * Warn explicitly because this anomaly can be hidden 2419 * in usual operation (and unexpectedly appear later). 2420 */ 2421 UNEXPECTED_ERROR(__FILE__, __LINE__, 2422 "No buffer available to receive " 2423 "IPv6 destination"); 2424 } 2425#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 2426#ifdef IPV6_RECVPKTINFO 2427 /* RFC 3542 */ 2428 if ((sock->pf == AF_INET6) 2429 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, 2430 (void *)&on, sizeof(on)) < 0)) { 2431 isc__strerror(errno, strbuf, sizeof(strbuf)); 2432 UNEXPECTED_ERROR(__FILE__, __LINE__, 2433 "setsockopt(%d, IPV6_RECVPKTINFO) " 2434 "%s: %s", sock->fd, 2435 isc_msgcat_get(isc_msgcat, 2436 ISC_MSGSET_GENERAL, 2437 ISC_MSG_FAILED, 2438 "failed"), 2439 strbuf); 2440 } 2441#else 2442 /* RFC 2292 */ 2443 if ((sock->pf == AF_INET6) 2444 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO, 2445 (void *)&on, sizeof(on)) < 0)) { 2446 isc__strerror(errno, strbuf, sizeof(strbuf)); 2447 UNEXPECTED_ERROR(__FILE__, __LINE__, 2448 "setsockopt(%d, IPV6_PKTINFO) %s: %s", 2449 sock->fd, 2450 isc_msgcat_get(isc_msgcat, 2451 ISC_MSGSET_GENERAL, 2452 ISC_MSG_FAILED, 2453 "failed"), 2454 strbuf); 2455 } 2456#endif /* IPV6_RECVPKTINFO */ 2457#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */ 2458#ifdef 
IPV6_USE_MIN_MTU /* RFC 3542, not too common yet*/ 2459 /* use minimum MTU */ 2460 if (sock->pf == AF_INET6 && 2461 setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU, 2462 (void *)&on, sizeof(on)) < 0) { 2463 isc__strerror(errno, strbuf, sizeof(strbuf)); 2464 UNEXPECTED_ERROR(__FILE__, __LINE__, 2465 "setsockopt(%d, IPV6_USE_MIN_MTU) " 2466 "%s: %s", sock->fd, 2467 isc_msgcat_get(isc_msgcat, 2468 ISC_MSGSET_GENERAL, 2469 ISC_MSG_FAILED, 2470 "failed"), 2471 strbuf); 2472 } 2473#endif 2474#if defined(IPV6_MTU) 2475 /* 2476 * Use minimum MTU on IPv6 sockets. 2477 */ 2478 if (sock->pf == AF_INET6) { 2479 int mtu = 1280; 2480 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, 2481 &mtu, sizeof(mtu)); 2482 } 2483#endif 2484#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT) 2485 /* 2486 * Turn off Path MTU discovery on IPv6/UDP sockets. 2487 */ 2488 if (sock->pf == AF_INET6) { 2489 int action = IPV6_PMTUDISC_DONT; 2490 (void)setsockopt(sock->fd, IPPROTO_IPV6, 2491 IPV6_MTU_DISCOVER, &action, 2492 sizeof(action)); 2493 } 2494#endif 2495#endif /* ISC_PLATFORM_HAVEIPV6 */ 2496#endif /* defined(USE_CMSG) */ 2497 2498#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT) 2499 /* 2500 * Turn off Path MTU discovery on IPv4/UDP sockets. 2501 */ 2502 if (sock->pf == AF_INET) { 2503 int action = IP_PMTUDISC_DONT; 2504 (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER, 2505 &action, sizeof(action)); 2506 } 2507#endif 2508#if defined(IP_DONTFRAG) 2509 /* 2510 * Turn off Path MTU discovery on IPv4/UDP sockets. 
2511 */ 2512 if (sock->pf == AF_INET) { 2513 int off = 0; 2514 (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, 2515 &off, sizeof(off)); 2516 } 2517#endif 2518 2519#if defined(SO_RCVBUF) 2520 optlen = sizeof(size); 2521 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, 2522 (void *)&size, &optlen) >= 0 && 2523 size < RCVBUFSIZE) { 2524 size = RCVBUFSIZE; 2525 if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, 2526 (void *)&size, sizeof(size)) == -1) { 2527 isc__strerror(errno, strbuf, sizeof(strbuf)); 2528 UNEXPECTED_ERROR(__FILE__, __LINE__, 2529 "setsockopt(%d, SO_RCVBUF, %d) %s: %s", 2530 sock->fd, size, 2531 isc_msgcat_get(isc_msgcat, 2532 ISC_MSGSET_GENERAL, 2533 ISC_MSG_FAILED, 2534 "failed"), 2535 strbuf); 2536 } 2537 } 2538#endif 2539 } 2540#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */ 2541 2542setup_done: 2543 inc_stats(manager->stats, sock->statsindex[STATID_OPEN]); 2544 2545 return (ISC_R_SUCCESS); 2546} 2547 2548/* 2549 * Create a 'type' socket or duplicate an existing socket, managed 2550 * by 'manager'. Events will be posted to 'task' and when dispatched 2551 * 'action' will be called with 'arg' as the arg value. The new 2552 * socket is returned in 'socketp'. 2553 */ 2554static isc_result_t 2555socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type, 2556 isc_socket_t **socketp, isc_socket_t *dup_socket) 2557{ 2558 isc__socket_t *sock = NULL; 2559 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 2560 isc_result_t result; 2561 int lockid; 2562 2563 REQUIRE(VALID_MANAGER(manager)); 2564 REQUIRE(socketp != NULL && *socketp == NULL); 2565 REQUIRE(type != isc_sockettype_fdwatch); 2566 2567 result = allocate_socket(manager, type, &sock); 2568 if (result != ISC_R_SUCCESS) 2569 return (result); 2570 2571 switch (sock->type) { 2572 case isc_sockettype_udp: 2573 sock->statsindex = 2574 (pf == AF_INET) ? upd4statsindex : upd6statsindex; 2575 break; 2576 case isc_sockettype_tcp: 2577 sock->statsindex = 2578 (pf == AF_INET) ? 
tcp4statsindex : tcp6statsindex; 2579 break; 2580 case isc_sockettype_unix: 2581 sock->statsindex = unixstatsindex; 2582 break; 2583 default: 2584 INSIST(0); 2585 } 2586 2587 sock->pf = pf; 2588 2589 result = opensocket(manager, sock, (isc__socket_t *)dup_socket); 2590 if (result != ISC_R_SUCCESS) { 2591 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]); 2592 free_socket(&sock); 2593 return (result); 2594 } 2595 2596 sock->common.methods = (isc_socketmethods_t *)&socketmethods; 2597 sock->references = 1; 2598 *socketp = (isc_socket_t *)sock; 2599 2600 /* 2601 * Note we don't have to lock the socket like we normally would because 2602 * there are no external references to it yet. 2603 */ 2604 2605 lockid = FDLOCK_ID(sock->fd); 2606 LOCK(&manager->fdlock[lockid]); 2607 manager->fds[sock->fd] = sock; 2608 manager->fdstate[sock->fd] = MANAGED; 2609#ifdef USE_DEVPOLL 2610 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && 2611 sock->manager->fdpollinfo[sock->fd].want_write == 0); 2612#endif 2613 UNLOCK(&manager->fdlock[lockid]); 2614 2615 LOCK(&manager->lock); 2616 ISC_LIST_APPEND(manager->socklist, sock, link); 2617#ifdef USE_SELECT 2618 if (manager->maxfd < sock->fd) 2619 manager->maxfd = sock->fd; 2620#endif 2621 UNLOCK(&manager->lock); 2622 2623 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 2624 ISC_MSG_CREATED, dup_socket == NULL ? "dupped" : "created"); 2625 2626 return (ISC_R_SUCCESS); 2627} 2628 2629/*% 2630 * Create a new 'type' socket managed by 'manager'. Events 2631 * will be posted to 'task' and when dispatched 'action' will be 2632 * called with 'arg' as the arg value. The new socket is returned 2633 * in 'socketp'. 2634 */ 2635ISC_SOCKETFUNC_SCOPE isc_result_t 2636isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type, 2637 isc_socket_t **socketp) 2638{ 2639 return (socket_create(manager0, pf, type, socketp, NULL)); 2640} 2641 2642/*% 2643 * Duplicate an existing socket. 
 The new socket is returned
 * in 'socketp'.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(socketp != NULL && *socketp == NULL);

	/* Reuse the normal creation path; passing sock0 as 'dup_socket'
	 * makes socket_create() dup() the fd instead of opening a new one. */
	return (socket_create((isc_socketmgr_t *) sock->manager,
			      sock->pf, sock->type, socketp,
			      sock0));
}

#ifdef BIND9
/*
 * Re-open the underlying fd of a previously closed socket.  The socket
 * must be held by exactly one reference and have no fd (fd == -1).
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_open(isc_socket_t *sock0) {
	isc_result_t result;
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	UNLOCK(&sock->lock);
	/*
	 * We don't need to retain the lock hereafter, since no one else has
	 * this socket.
	 */
	REQUIRE(sock->fd == -1);

	result = opensocket(sock->manager, sock, NULL);
	if (result != ISC_R_SUCCESS)
		sock->fd = -1;

	if (result == ISC_R_SUCCESS) {
		/* Register the new fd with the manager's bookkeeping. */
		int lockid = FDLOCK_ID(sock->fd);

		LOCK(&sock->manager->fdlock[lockid]);
		sock->manager->fds[sock->fd] = sock;
		sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
		INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
		       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
		UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (sock->manager->maxfd < sock->fd)
			sock->manager->maxfd = sock->fd;
		UNLOCK(&sock->manager->lock);
#endif
	}

	return (result);
}
#endif /* BIND9 */

/*
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	isc__socket_t *sock = NULL;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/* The caller owns the fd; we only watch it and invoke 'callback'
	 * on 'task' when the requested readiness ('flags') occurs. */
	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;
	sock->statsindex = fdwatchstatsindex;

	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Arm the watcher immediately for whichever directions were asked. */
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "fdwatch-created");

	return (ISC_R_SUCCESS);
}

/*
 * Indicate to the manager that it should watch the socket again.
2770 * This can be used to restart watching if the previous event handler 2771 * didn't indicate there was more data to be processed. Primarily 2772 * it is for writing but could be used for reading if desired 2773 */ 2774 2775ISC_SOCKETFUNC_SCOPE isc_result_t 2776isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags) 2777{ 2778 isc__socket_t *sock = (isc__socket_t *)sock0; 2779 2780 REQUIRE(VALID_SOCKET(sock)); 2781 2782 /* 2783 * We check both flags first to allow us to get the lock 2784 * once but only if we need it. 2785 */ 2786 2787 if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) { 2788 LOCK(&sock->lock); 2789 if (((flags & ISC_SOCKFDWATCH_READ) != 0) && 2790 !sock->pending_recv) 2791 select_poke(sock->manager, sock->fd, 2792 SELECT_POKE_READ); 2793 if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) && 2794 !sock->pending_send) 2795 select_poke(sock->manager, sock->fd, 2796 SELECT_POKE_WRITE); 2797 UNLOCK(&sock->lock); 2798 } 2799 2800 socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET, 2801 ISC_MSG_POKED, "fdwatch-poked flags: %d", flags); 2802 2803 return (ISC_R_SUCCESS); 2804} 2805 2806/* 2807 * Attach to a socket. Caller must explicitly detach when it is done. 2808 */ 2809ISC_SOCKETFUNC_SCOPE void 2810isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) { 2811 isc__socket_t *sock = (isc__socket_t *)sock0; 2812 2813 REQUIRE(VALID_SOCKET(sock)); 2814 REQUIRE(socketp != NULL && *socketp == NULL); 2815 2816 LOCK(&sock->lock); 2817 sock->references++; 2818 UNLOCK(&sock->lock); 2819 2820 *socketp = (isc_socket_t *)sock; 2821} 2822 2823/* 2824 * Dereference a socket. If this is the last reference to it, clean things 2825 * up by destroying the socket. 
2826 */ 2827ISC_SOCKETFUNC_SCOPE void 2828isc__socket_detach(isc_socket_t **socketp) { 2829 isc__socket_t *sock; 2830 isc_boolean_t kill_socket = ISC_FALSE; 2831 2832 REQUIRE(socketp != NULL); 2833 sock = (isc__socket_t *)*socketp; 2834 REQUIRE(VALID_SOCKET(sock)); 2835 2836 LOCK(&sock->lock); 2837 REQUIRE(sock->references > 0); 2838 sock->references--; 2839 if (sock->references == 0) 2840 kill_socket = ISC_TRUE; 2841 UNLOCK(&sock->lock); 2842 2843 if (kill_socket) 2844 destroy(&sock); 2845 2846 *socketp = NULL; 2847} 2848 2849#ifdef BIND9 2850ISC_SOCKETFUNC_SCOPE isc_result_t 2851isc__socket_close(isc_socket_t *sock0) { 2852 isc__socket_t *sock = (isc__socket_t *)sock0; 2853 int fd; 2854 isc__socketmgr_t *manager; 2855 2856 fflush(stdout); 2857 REQUIRE(VALID_SOCKET(sock)); 2858 2859 LOCK(&sock->lock); 2860 2861 REQUIRE(sock->references == 1); 2862 REQUIRE(sock->type != isc_sockettype_fdwatch); 2863 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); 2864 2865 INSIST(!sock->connecting); 2866 INSIST(!sock->pending_recv); 2867 INSIST(!sock->pending_send); 2868 INSIST(!sock->pending_accept); 2869 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 2870 INSIST(ISC_LIST_EMPTY(sock->send_list)); 2871 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 2872 INSIST(sock->connect_ev == NULL); 2873 2874 manager = sock->manager; 2875 fd = sock->fd; 2876 sock->fd = -1; 2877 sock->dupped = 0; 2878 memset(sock->name, 0, sizeof(sock->name)); 2879 sock->tag = NULL; 2880 sock->listener = 0; 2881 sock->connected = 0; 2882 sock->connecting = 0; 2883 sock->bound = 0; 2884 isc_sockaddr_any(&sock->peer_address); 2885 2886 UNLOCK(&sock->lock); 2887 2888 closesocket(manager, sock, fd); 2889 2890 return (ISC_R_SUCCESS); 2891} 2892#endif /* BIND9 */ 2893 2894/* 2895 * I/O is possible on a given socket. Schedule an event to this task that 2896 * will call an internal function to do the I/O. 
 This will charge the
 * task with the I/O operation and let our select loop handler get back
 * to doing something real as fast as possible.
 *
 * The socket and manager must be locked before calling this function.
 */
static void
dispatch_recv(isc__socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_recv);

	if (sock->type != isc_sockettype_fdwatch) {
		/* Nothing queued means nobody is waiting: do nothing. */
		ev = ISC_LIST_HEAD(sock->recv_list);
		if (ev == NULL)
			return;
		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "dispatch_recv:  event %p -> task %p",
			   ev, ev->ev_sender);
		sender = ev->ev_sender;
	} else {
		/* fdwatch sockets always deliver to their fixed task. */
		sender = sock->fdwatchtask;
	}

	sock->pending_recv = 1;
	iev = &sock->readable_ev;	/* preallocated internal event */

	sock->references++;	/* keep socket alive until internal_* runs */
	iev->ev_sender = sock;
	if (sock->type == isc_sockettype_fdwatch)
		iev->ev_action = internal_fdwatch_read;
	else
		iev->ev_action = internal_recv;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * Mirror of dispatch_recv() for the write direction; see comments there.
 */
static void
dispatch_send(isc__socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_send);

	if (sock->type != isc_sockettype_fdwatch) {
		ev = ISC_LIST_HEAD(sock->send_list);
		if (ev == NULL)
			return;
		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "dispatch_send:  event %p -> task %p",
			   ev, ev->ev_sender);
		sender = ev->ev_sender;
	} else {
		sender = sock->fdwatchtask;
	}

	sock->pending_send = 1;
	iev = &sock->writable_ev;

	sock->references++;	/* keep socket alive until internal_* runs */
	iev->ev_sender = sock;
	if (sock->type == isc_sockettype_fdwatch)
		iev->ev_action = internal_fdwatch_write;
	else
		iev->ev_action = internal_send;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * Dispatch an internal accept event.
 */
static void
dispatch_accept(isc__socket_t *sock) {
	intev_t *iev;
	isc_socket_newconnev_t *ev;

	INSIST(!sock->pending_accept);

	/*
	 * Are there any done events left, or were they all canceled
	 * before the manager got the socket lock?
	 */
	ev = ISC_LIST_HEAD(sock->accept_list);
	if (ev == NULL)
		return;

	sock->pending_accept = 1;
	iev = &sock->readable_ev;

	sock->references++;  /* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_accept;
	iev->ev_arg = sock;

	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

/*
 * Dispatch an internal connect-completion event to the task that
 * started the connect.
 */
static void
dispatch_connect(isc__socket_t *sock) {
	intev_t *iev;
	isc_socket_connev_t *ev;

	iev = &sock->writable_ev;

	ev = sock->connect_ev;
	INSIST(ev != NULL); /* XXX */

	INSIST(sock->connecting);

	sock->references++;  /* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_connect;
	iev->ev_arg = sock;

	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

/*
 * Dequeue an item off the given socket's read queue, set the result code
 * in the done event to the one provided, and send it to the task it was
 * destined for.
 *
 * If the event to be sent is on a list, remove it before sending.  If
 * asked to, send and detach from the socket as well.
 *
 * Caller must have the socket locked if the event is attached to the socket.
 */
static void
send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
	isc_task_t *task;

	task = (*dev)->ev_sender;

	/* The event travels back with the socket as its sender. */
	(*dev)->ev_sender = sock;

	if (ISC_LINK_LINKED(*dev, ev_link))
		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);

	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
	    == ISC_SOCKEVENTATTR_ATTACHED)
		isc_task_sendanddetach(&task, (isc_event_t **)dev);
	else
		isc_task_send(task, (isc_event_t **)dev);
}

/*
 * See comments for send_recvdone_event() above.
 *
 * Caller must have the socket locked if the event is attached to the socket.
 */
static void
send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
	isc_task_t *task;

	INSIST(dev != NULL && *dev != NULL);

	task = (*dev)->ev_sender;
	(*dev)->ev_sender = sock;

	if (ISC_LINK_LINKED(*dev, ev_link))
		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);

	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
	    == ISC_SOCKEVENTATTR_ATTACHED)
		isc_task_sendanddetach(&task, (isc_event_t **)dev);
	else
		isc_task_send(task, (isc_event_t **)dev);
}

/*
 * Call accept() on a socket, to get the new file descriptor.  The listen
 * socket is used as a prototype to create a new isc_socket_t.  The new
 * socket has one outstanding reference.  The task receiving the event
 * will be detached from just after the event is delivered.
 *
 * On entry to this function, the event delivered is the internal
 * readable event, and the first item on the accept_list should be
 * the done event we want to send.  If the list is empty, this is a no-op,
 * so just unlock and return.
 */
static void
internal_accept(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	isc__socketmgr_t *manager;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	ISC_SOCKADDR_LEN_T addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "accept";	/* name of last syscall, for logs */

	UNUSED(me);

	sock = ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, TRACE,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
		   "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));

	INSIST(sock->listener);
	INSIST(sock->pending_accept == 1);
	sock->pending_accept = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection.  If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again.  Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept().  ECONNABORTED has been
	 * reported for Solaris 8.  The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in.
	 */
	if (fd >= 0 && fd < 20) {
		int new, tmp;
		new = fcntl(fd, F_DUPFD, 20);
		tmp = errno;	/* preserve errno across close() */
		(void)close(fd);
		errno = tmp;
		fd = new;
		err = "accept/fcntl";
	}
#endif

	if (fd < 0) {
		/* Transient failures re-arm the watcher; anything else is
		 * reported and delivered as ISC_R_UNEXPECTED. */
		if (SOFT_ERROR(errno))
			goto soft_error;
		switch (errno) {
		case ENFILE:
		case EMFILE:
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: too many open file descriptors",
				       err);
			goto soft_error;

		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif
#ifdef ENONET
		case ENONET:
#endif
			goto soft_error;
		default:
			break;
		}
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() %s: %s", err,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/* Sanity-check what accept() handed back before adopting
		 * the fd. */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
			   sock->pf)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() returned peer address "
					 "family %u (expected %u)",
NEWCONNSOCK(dev)->peer_address. 3220 type.sa.sa_family, 3221 sock->pf); 3222 (void)close(fd); 3223 goto soft_error; 3224 } else if (fd >= (int)manager->maxsocks) { 3225 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 3226 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 3227 isc_msgcat, ISC_MSGSET_SOCKET, 3228 ISC_MSG_TOOMANYFDS, 3229 "accept: " 3230 "file descriptor exceeds limit (%d/%u)", 3231 fd, manager->maxsocks); 3232 (void)close(fd); 3233 goto soft_error; 3234 } 3235 } 3236 3237 if (fd != -1) { 3238 NEWCONNSOCK(dev)->peer_address.length = addrlen; 3239 NEWCONNSOCK(dev)->pf = sock->pf; 3240 } 3241 3242 /* 3243 * Pull off the done event. 3244 */ 3245 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); 3246 3247 /* 3248 * Poke watcher if there are more pending accepts. 3249 */ 3250 if (!ISC_LIST_EMPTY(sock->accept_list)) 3251 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 3252 3253 UNLOCK(&sock->lock); 3254 3255 if (fd != -1) { 3256 result = make_nonblock(fd); 3257 if (result != ISC_R_SUCCESS) { 3258 (void)close(fd); 3259 fd = -1; 3260 } 3261 } 3262 3263 /* 3264 * -1 means the new socket didn't happen. 
3265 */ 3266 if (fd != -1) { 3267 int lockid = FDLOCK_ID(fd); 3268 3269 LOCK(&manager->fdlock[lockid]); 3270 manager->fds[fd] = NEWCONNSOCK(dev); 3271 manager->fdstate[fd] = MANAGED; 3272 UNLOCK(&manager->fdlock[lockid]); 3273 3274 LOCK(&manager->lock); 3275 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link); 3276 3277 NEWCONNSOCK(dev)->fd = fd; 3278 NEWCONNSOCK(dev)->bound = 1; 3279 NEWCONNSOCK(dev)->connected = 1; 3280 3281 /* 3282 * Save away the remote address 3283 */ 3284 dev->address = NEWCONNSOCK(dev)->peer_address; 3285 3286#ifdef USE_SELECT 3287 if (manager->maxfd < fd) 3288 manager->maxfd = fd; 3289#endif 3290 3291 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION, 3292 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, 3293 "accepted connection, new socket %p", 3294 dev->newsocket); 3295 3296 UNLOCK(&manager->lock); 3297 3298 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]); 3299 } else { 3300 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3301 NEWCONNSOCK(dev)->references--; 3302 free_socket((isc__socket_t **)&dev->newsocket); 3303 } 3304 3305 /* 3306 * Fill in the done event details and send it off. 
 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
	return;

 soft_error:
	select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}

/*
 * Readable wakeup on an ordinary socket: drain as many queued recv
 * requests as possible via doio_recv(), then re-arm the watcher if any
 * requests remain queued.  Runs with the socket lock held throughout.
 */
static void
internal_recv(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc__socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	sock = ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
		   "internal_recv: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);
	sock->pending_recv = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->recv_list);
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			goto poke;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto poke;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

 poke:
	if (!ISC_LIST_EMPTY(sock->recv_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);

	UNLOCK(&sock->lock);
}

/*
 * Writable wakeup on an ordinary socket: flush as many queued send
 * requests as possible via doio_send(), then re-arm the watcher if any
 * requests remain queued.
 */
static void
internal_send(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc__socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc__socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
		   "internal_send: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);
	sock->pending_send = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->send_list);
	while (dev != NULL) {
		switch (doio_send(sock, dev)) {
		case DOIO_SOFT:
			goto poke;

		case DOIO_HARD:
		case DOIO_SUCCESS:
			send_senddone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->send_list);
	}

 poke:
	if (!ISC_LIST_EMPTY(sock->send_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}

/*
 * Writable wakeup for an fdwatch-style socket: invoke the user callback
 * (with the socket lock released around the call) and re-arm the watcher
 * for writing if the callback reports more data pending.
 */
static void
internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc__socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
		   "internal_fdwatch_write: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);

	/* The user callback runs without the socket lock held. */
	UNLOCK(&sock->lock);
	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
	LOCK(&sock->lock);

	sock->pending_send = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	if (more_data)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	UNLOCK(&sock->lock);
}

/*
 * Readable wakeup for an fdwatch-style socket: invoke the user callback
 * (with the socket lock released around the call) and re-arm the watcher
 * for reading if the callback reports more data pending.
 */
static void
internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
	isc__socket_t *sock;
	int more_data;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc__socket_t *)ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	socket_log(sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
		   "internal_fdwatch_read: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);

	/* The user callback runs without the socket lock held. */
	UNLOCK(&sock->lock);
	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
				      sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
	LOCK(&sock->lock);

	sock->pending_recv = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	if (more_data)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);

	UNLOCK(&sock->lock);
}

/*
 * Process read/writes on each fd here.  Avoid locking
 * and unlocking twice if both reads and writes are possible.
 */
static void
process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
	   isc_boolean_t writeable)
{
	isc__socket_t *sock;
	isc_boolean_t unlock_sock;
	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = manager->fds[fd];
	unlock_sock = ISC_FALSE;
	if (readable) {
		if (sock == NULL) {
			unwatch_read = ISC_TRUE;
			goto check_write;
		}
		unlock_sock = ISC_TRUE;
		LOCK(&sock->lock);
		if (!SOCK_DEAD(sock)) {
			/* Listening sockets get accepts; others get reads. */
			if (sock->listener)
				dispatch_accept(sock);
			else
				dispatch_recv(sock);
		}
		unwatch_read = ISC_TRUE;
	}
check_write:
	if (writeable) {
		if (sock == NULL) {
			unwatch_write = ISC_TRUE;
			goto unlock_fd;
		}
		if (!unlock_sock) {
			unlock_sock = ISC_TRUE;
			LOCK(&sock->lock);
		}
		if (!SOCK_DEAD(sock)) {
			/* Connecting sockets complete connects; others send. */
			if (sock->connecting)
				dispatch_connect(sock);
			else
				dispatch_send(sock);
		}
		unwatch_write = ISC_TRUE;
	}
	if (unlock_sock)
		UNLOCK(&sock->lock);

 unlock_fd:
	UNLOCK(&manager->fdlock[lockid]);
	if (unwatch_read)
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
	if (unwatch_write)
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);

}

#ifdef USE_KQUEUE
/*
 * Dispatch a batch of kevent() results through process_fd().  Returns
 * ISC_TRUE when a shutdown message was received on the control pipe.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
	int i;
	isc_boolean_t readable, writable;
	isc_boolean_t done = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		/* Control-pipe events are handled after the loop. */
		if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		readable = ISC_TF(events[i].filter == EVFILT_READ);
		writable = ISC_TF(events[i].filter == EVFILT_WRITE);
		process_fd(manager, events[i].ident, readable, writable);
	}

#ifdef USE_WATCHER_THREAD
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_EPOLL)
/*
 * Dispatch a batch of epoll_wait() results through process_fd().  Returns
 * ISC_TRUE when a shutdown message was received on the control pipe.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents)
{
	int i;
	isc_boolean_t done = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/* May indicate ISC_SOCKET_MAXEVENTS should be larger. */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		/* Control-pipe events are handled after the loop. */
		if (events[i].data.fd == manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0) {
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			events[i].events |= (EPOLLIN | EPOLLOUT);
		}
		process_fd(manager, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_DEVPOLL)
/*
 * Dispatch a batch of /dev/poll results through process_fd().  Returns
 * ISC_TRUE when a shutdown message was received on the control pipe.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
	int i;
	isc_boolean_t done = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/* May indicate ISC_SOCKET_MAXEVENTS should be larger. */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		/* Control-pipe events are handled after the loop. */
		if (events[i].fd == manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		process_fd(manager, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_SELECT)
/*
 * Dispatch select() results through process_fd().  The control fd is
 * skipped here; the caller checks it separately (see watcher()).
 */
static void
process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
	    fd_set *writefds)
{
	int i;

	REQUIRE(maxfd <= (int)manager->maxsocks);

	for (i = 0; i < maxfd; i++) {
#ifdef USE_WATCHER_THREAD
		if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
			continue;
#endif /* USE_WATCHER_THREAD */
		process_fd(manager, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}
#endif

#ifdef USE_WATCHER_THREAD
/*
 * Drain all pending messages from the control pipe.  Returns ISC_TRUE
 * if a shutdown message was seen (telling the watcher thread to exit).
 */
static isc_boolean_t
process_ctlfd(isc__socketmgr_t *manager) {
	int msg, fd;

	for (;;) {
		select_readmsg(manager, &fd,
			       &msg);

		manager_log(manager, IOEVENT,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WATCHERMSG,
					   "watcher got message %d "
					   "for socket %d"), msg, fd);

		/*
		 * Nothing to read?
		 */
		if (msg == SELECT_POKE_NOTHING)
			break;

		/*
		 * Handle shutdown message.  We really should
		 * jump out of this loop right away, but
		 * it doesn't matter if we have to do a little
		 * more work first.
		 */
		if (msg == SELECT_POKE_SHUTDOWN)
			return (ISC_TRUE);

		/*
		 * This is a wakeup on a socket.  Look
		 * at the event queue for both read and write,
		 * and decide if we need to watch on it now
		 * or not.
		 */
		wakeup_socket(manager, fd, msg);
	}

	return (ISC_FALSE);
}

/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, track down what thread gets to do
 * this I/O and post the event to it.
 */
static isc_threadresult_t
watcher(void *uap) {
	isc__socketmgr_t *manager = uap;
	isc_boolean_t done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined (USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
#elif defined (USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif
	char strbuf[ISC_STRERRORSIZE];
#ifdef ISC_SOCKET_USE_POLLWATCH
	pollstate_t pollstate = poll_idle;
#endif

#if defined (USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = manager->pipe_fds[0];
#endif
	done = ISC_FALSE;
	while (!done) {
		do {
			/* Block in the platform's multiplex call. */
#ifdef USE_KQUEUE
			cc = kevent(manager->kqueue_fd, NULL, 0,
				    manager->events, manager->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(manager->epoll_fd, manager->events,
					manager->nevents, -1);
#elif defined(USE_DEVPOLL)
			dvp.dp_fds = manager->events;
			dvp.dp_nfds = manager->nevents;
#ifndef ISC_SOCKET_USE_POLLWATCH
			dvp.dp_timeout = -1;
#else
			if (pollstate == poll_idle)
				dvp.dp_timeout = -1;
			else
				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
#endif	/* ISC_SOCKET_USE_POLLWATCH */
			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
#elif defined(USE_SELECT)
			/*
			 * select() mutates its fd_sets, so work on copies
			 * snapshotted under the manager lock.
			 */
			LOCK(&manager->lock);
			memcpy(manager->read_fds_copy, manager->read_fds,
			       manager->fd_bufsize);
			memcpy(manager->write_fds_copy, manager->write_fds,
			       manager->fd_bufsize);
			maxfd = manager->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, manager->read_fds_copy,
				    manager->write_fds_copy, NULL, NULL);
#endif	/* USE_KQUEUE */

			if (cc < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "%s %s: %s", fnname,
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"), strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active)
					pollstate = poll_checking;
				else if (pollstate == poll_checking)
					pollstate = poll_idle;
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					manager_log(manager,
						    ISC_LOGCATEGORY_GENERAL,
						    ISC_LOGMODULE_SOCKET,
						    ISC_LOG_DEBUG(1),
						    "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
		done = process_fds(manager, manager->events, cc);
#elif defined(USE_SELECT)
		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy))
			done = process_ctlfd(manager);
#endif
	}

	manager_log(manager, TRACE, "%s",
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	return ((isc_threadresult_t)0);
}
#endif /* USE_WATCHER_THREAD */

#ifdef BIND9
/*
 * Set the number of fds reserved (kept unavailable to sockets) in the
 * manager.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}

/*
 * Set the manager's maximum UDP message size.
 */
ISC_SOCKETFUNC_SCOPE void
isc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}
#endif	/* BIND9 */

/*
 * Create a new socket manager.
 */

/*
 * Allocate and initialize the platform-specific event-multiplexing state
 * (kqueue/epoll/devpoll descriptor and event buffers, or select() fd_set
 * buffers) and, when a watcher thread is used, register the control pipe
 * for reading.  On failure all partially-acquired resources are released.
 */
static isc_result_t
setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
	isc_result_t result;
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	char strbuf[ISC_STRERRORSIZE];
#endif

#ifdef USE_KQUEUE
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->kqueue_fd = kqueue();
	if (manager->kqueue_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "kqueue %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}

#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->kqueue_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_EPOLL)
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->epoll_fd = epoll_create(manager->nevents);
	if (manager->epoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_create %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->epoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_DEVPOLL)
	/*
	 * XXXJT: /dev/poll seems to reject large numbers of events,
	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
	 */
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	/*
	 * Note: fdpollinfo should be able to support all possible FDs, so
	 * it must have maxsocks entries (not nevents).
	 */
	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
					  manager->maxsocks);
	if (manager->fdpollinfo == NULL) {
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		return (ISC_R_NOMEMORY);
	}
	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
	manager->devpoll_fd = open("/dev/poll", O_RDWR);
	if (manager->devpoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "open(/dev/poll) %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->devpoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_SELECT)
	UNUSED(result);

#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
	/*
	 * Note: this code should also cover the case of MAXSOCKETS <=
	 * FD_SETSIZE, but we separate the cases to avoid possible portability
	 * issues regarding howmany() and the actual representation of fd_set.
	 */
	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
		sizeof(fd_mask);
#else
	manager->fd_bufsize = sizeof(fd_set);
#endif

	manager->read_fds = NULL;
	manager->read_fds_copy = NULL;
	manager->write_fds = NULL;
	manager->write_fds_copy = NULL;

	/* Allocate the four fd_set buffers; stop at the first failure. */
	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds != NULL)
		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->write_fds != NULL) {
		manager->write_fds_copy = isc_mem_get(mctx,
						      manager->fd_bufsize);
	}
	if (manager->write_fds_copy == NULL) {
		/* One of the allocations failed; free whatever succeeded. */
		if (manager->write_fds != NULL) {
			isc_mem_put(mctx, manager->write_fds,
				    manager->fd_bufsize);
		}
		if (manager->read_fds_copy != NULL) {
			isc_mem_put(mctx, manager->read_fds_copy,
				    manager->fd_bufsize);
		}
		if (manager->read_fds != NULL) {
			isc_mem_put(mctx, manager->read_fds,
				    manager->fd_bufsize);
		}
		return (ISC_R_NOMEMORY);
	}
	memset(manager->read_fds, 0, manager->fd_bufsize);
	memset(manager->write_fds, 0, manager->fd_bufsize);

#ifdef USE_WATCHER_THREAD
	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	manager->maxfd = manager->pipe_fds[0];
#else /* USE_WATCHER_THREAD */
	manager->maxfd = 0;
#endif /* USE_WATCHER_THREAD */
#endif	/* USE_KQUEUE */

	return (ISC_R_SUCCESS);
}

/*
 * Release everything acquired by setup_watcher(): unregister the control
 * pipe (watcher-thread builds) and free the multiplexer descriptor and
 * event buffers for the compiled-in backend.
 */
static void
cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
#ifdef USE_WATCHER_THREAD
	isc_result_t result;

	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		/*
		 * NOTE(review): this message mentions epoll_ctl(DEL) even on
		 * non-epoll backends; unwatch_fd() is backend-generic.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL) %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
	}
#endif	/* USE_WATCHER_THREAD */

#ifdef USE_KQUEUE
	close(manager->kqueue_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct kevent) * manager->nevents);
#elif defined(USE_EPOLL)
	close(manager->epoll_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct epoll_event) * manager->nevents);
#elif defined(USE_DEVPOLL)
	close(manager->devpoll_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct pollfd) * manager->nevents);
	isc_mem_put(mctx, manager->fdpollinfo,
		    sizeof(pollinfo_t) * manager->maxsocks);
#elif defined(USE_SELECT)
	if (manager->read_fds != NULL)
		isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
	if (manager->write_fds != NULL)
		isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
	if (manager->write_fds_copy != NULL)
		isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
#endif	/* USE_KQUEUE */
}

/*
 * Create a socket manager with the default socket limit.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	return (isc__socketmgr_create2(mctx, managerp, 0));
}

/*
 * Create a socket manager supporting up to 'maxsocks' sockets (0 means
 * the compiled-in default).  Sets up locks, the control pipe, the
 * platform watcher state and (when threaded) the watcher thread.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		       unsigned int maxsocks)
{
	int i;
	isc__socketmgr_t *manager;
#ifdef USE_WATCHER_THREAD
	char strbuf[ISC_STRERRORSIZE];
#endif
	isc_result_t result;

	REQUIRE(managerp != NULL && *managerp == NULL);

#ifdef USE_SHARED_MANAGER
	/* In the shared-manager build, reuse the single global manager. */
	if (socketmgr != NULL) {
		/* Don't allow maxsocks to be updated */
		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
			return (ISC_R_EXISTS);

		socketmgr->refs++;
		*managerp = (isc_socketmgr_t *)socketmgr;
		return (ISC_R_SUCCESS);
	}
#endif /* USE_SHARED_MANAGER */

	if (maxsocks == 0)
		maxsocks = ISC_SOCKET_MAXSOCKETS;

	manager = isc_mem_get(mctx, sizeof(*manager));
	if (manager == NULL)
		return (ISC_R_NOMEMORY);

	/* zero-clear so that necessary cleanup on failure will be easy */
	memset(manager, 0, sizeof(*manager));
	manager->maxsocks = maxsocks;
	manager->reserved = 0;
	manager->maxudp = 0;
	manager->fds = isc_mem_get(mctx,
				   manager->maxsocks * sizeof(isc__socket_t *));
	if (manager->fds == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
	if (manager->fdstate == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->stats = NULL;

	manager->common.methods = &socketmgrmethods;
	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
	manager->mctx = NULL;
	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
	ISC_LIST_INIT(manager->socklist);
	result = isc_mutex_init(&manager->lock);
	if (result != ISC_R_SUCCESS)
		goto free_manager;
	/* Per-fd lock table, sharded to reduce contention. */
	manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
	if (manager->fdlock == NULL) {
		result = ISC_R_NOMEMORY;
		goto cleanup_lock;
	}
	for (i = 0; i < FDLOCK_COUNT; i++) {
		result = isc_mutex_init(&manager->fdlock[i]);
		if (result != ISC_R_SUCCESS) {
			/* Unwind the locks initialized so far. */
			while (--i >= 0)
				DESTROYLOCK(&manager->fdlock[i]);
			isc_mem_put(mctx, manager->fdlock,
				    FDLOCK_COUNT * sizeof(isc_mutex_t));
			manager->fdlock = NULL;
			goto cleanup_lock;
		}
	}

#ifdef USE_WATCHER_THREAD
	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_condition_init() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		result = ISC_R_UNEXPECTED;
		goto cleanup_lock;
	}

	/*
	 * Create the special fds that will be used to wake up the
	 * select/poll loop when something internal needs to be done.
	 */
	if (pipe(manager->pipe_fds) != 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "pipe() %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto cleanup_condition;
	}

	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
#if 0
	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
#endif
#endif /* USE_WATCHER_THREAD */

#ifdef USE_SHARED_MANAGER
	manager->refs = 1;
#endif /* USE_SHARED_MANAGER */

	/*
	 * Set up initial state for the select loop
	 */
	result = setup_watcher(mctx, manager);
	if (result != ISC_R_SUCCESS)
		goto cleanup;
	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
#ifdef USE_WATCHER_THREAD
	/*
	 * Start up the select/poll thread.
	 */
	if (isc_thread_create(watcher, manager, &manager->watcher) !=
	    ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_create() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		cleanup_watcher(mctx, manager);
		result = ISC_R_UNEXPECTED;
		goto cleanup;
	}
#endif /* USE_WATCHER_THREAD */
	isc_mem_attach(mctx, &manager->mctx);

#ifdef USE_SHARED_MANAGER
	socketmgr = manager;
#endif /* USE_SHARED_MANAGER */
	*managerp = (isc_socketmgr_t *)manager;

	return (ISC_R_SUCCESS);

	/* Error unwinding: each label releases one more layer of state. */
cleanup:
#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
#endif /* USE_WATCHER_THREAD */

#ifdef USE_WATCHER_THREAD
cleanup_condition:
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */


cleanup_lock:
	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
	}
	DESTROYLOCK(&manager->lock);

free_manager:
	if (manager->fdlock != NULL) {
		isc_mem_put(mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	if (manager->fdstate != NULL) {
		isc_mem_put(mctx, manager->fdstate,
			    manager->maxsocks * sizeof(int));
	}
	if (manager->fds != NULL) {
		isc_mem_put(mctx, manager->fds,
			    manager->maxsocks * sizeof(isc_socket_t *));
	}
	isc_mem_put(mctx, manager, sizeof(*manager));

	return (result);
}

#ifdef BIND9
/*
 * Report the manager's configured maximum number of sockets.
 */
isc_result_t
isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}

/*
 * Attach a statistics counter set to the manager; must be called before
 * any sockets are created.
 */
void
isc__socketmgr_setstats(isc_socketmgr_t *manager0,
			isc_stats_t *stats) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
#endif

/*
 * Tear down a socket manager: wait for all sockets to go away, stop the
 * watcher thread (if any), close pending fds, and free all manager state.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc__socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = (isc__socketmgr_t *)*managerp;
	REQUIRE(VALID_MANAGER(manager));

#ifdef USE_SHARED_MANAGER
	manager->refs--;
	if (manager->refs > 0) {
		*managerp = NULL;
		return;
	}
	socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
#ifdef USE_WATCHER_THREAD
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
#else /* USE_WATCHER_THREAD */
		UNLOCK(&manager->lock);
		isc__taskmgr_dispatch(NULL);
		LOCK(&manager->lock);
#endif /* USE_WATCHER_THREAD */
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);

#ifdef USE_WATCHER_THREAD
	/*
	 * Wait for thread to exit.
	 */
	if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
#endif /* USE_WATCHER_THREAD */

	/*
	 * Clean up.
	 */
	cleanup_watcher(manager->mctx, manager);

#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	for (i = 0; i < (int)manager->maxsocks; i++)
		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
			(void)close(i);

	isc_mem_put(manager->mctx, manager->fds,
		    manager->maxsocks * sizeof(isc__socket_t *));
	isc_mem_put(manager->mctx, manager->fdstate,
		    manager->maxsocks * sizeof(int));

	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);

	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
		isc_mem_put(manager->mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	DESTROYLOCK(&manager->lock);
	manager->common.magic = 0;
	manager->common.impmagic = 0;
	/* Keep mctx alive past the final put of the manager itself. */
	mctx= manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;

#ifdef USE_SHARED_MANAGER
	/*
	 * NOTE(review): socketmgr was already cleared above under
	 * USE_SHARED_MANAGER; this second clear is redundant but harmless.
	 */
	socketmgr = NULL;
#endif
}

/*
 * Common recv path: attempt immediate I/O (UDP always; TCP only when no
 * requests are already queued), and queue the request for the watcher
 * when the read could not complete now.  Returns ISC_R_INPROGRESS for a
 * queued request issued with ISC_SOCKFLAG_IMMEDIATE.
 */
static isc_result_t
socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		/* Preserve ordering: don't jump ahead of queued requests. */
		if (ISC_LIST_EMPTY(sock->recv_list))
			io_state = doio_recv(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = ISC_TRUE;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "socket_recv: event %p -> task %p",
			   dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
			result = ISC_R_INPROGRESS;
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* With IMMEDIATE, the caller consumes the result in place. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_recvdone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}

ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
		  unsigned int minimum, isc_task_t *task,
		  isc_taskaction_t action, const void *arg)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socketevent_t *dev;
	isc__socketmgr_t *manager;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	iocount =
isc_bufferlist_availablecount(buflist);
	REQUIRE(iocount > 0);

	INSIST(sock->bound);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
	if (dev == NULL)
		return (ISC_R_NOMEMORY);

	/*
	 * UDP sockets are always partial read
	 */
	if (sock->type == isc_sockettype_udp)
		dev->minimum = 1;
	else {
		if (minimum == 0)
			dev->minimum = iocount;
		else
			dev->minimum = minimum;
	}

	/*
	 * Move each buffer from the passed in list to our internal one.
	 * The caller's list is left empty; the buffers now belong to the
	 * event and travel with it until the receive completes.
	 */
	buffer = ISC_LIST_HEAD(*buflist);
	while (buffer != NULL) {
		ISC_LIST_DEQUEUE(*buflist, buffer, link);
		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
		buffer = ISC_LIST_HEAD(*buflist);
	}

	return (socket_recv(sock, dev, task, 0));
}

/*
 * Region-based receive.  Allocates a RECVDONE event for (action, arg)
 * and delegates the actual setup and I/O attempt to isc__socket_recv2().
 * Returns ISC_R_NOMEMORY if the event cannot be allocated.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recv(isc_socket_t *sock0, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
		 isc_taskaction_t action, const void *arg)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socketevent_t *dev;
	isc__socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
	if (dev == NULL)
		return (ISC_R_NOMEMORY);

	return (isc__socket_recv2(sock0, region, minimum, task, dev, 0));
}

/*
 * Receive using a caller-supplied (possibly reused) socket event.
 * Resets the event's bookkeeping fields (bufferlist, n, offset,
 * attributes), records the target region, and hands off to
 * socket_recv() with the given flags.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region,
		  unsigned int minimum, isc_task_t *task,
		  isc_socketevent_t *event, unsigned int flags)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;

	event->ev_sender = sock;
	event->result = ISC_R_UNSET;
	ISC_LIST_INIT(event->bufferlist);
	event->region = *region;
	event->n = 0;
	event->offset = 0;
event->attributes = 0;

	/*
	 * UDP sockets are always partial read.
	 */
	if (sock->type == isc_sockettype_udp)
		event->minimum = 1;
	else {
		if (minimum == 0)
			event->minimum = region->length;
		else
			event->minimum = minimum;
	}

	return (socket_recv(sock, event, task, flags));
}

/*
 * Common send path.  Records the destination (and optional IPv6
 * pktinfo) on the event, then attempts the I/O immediately for UDP
 * sockets, or for other types only when no send is already queued
 * (taking sock->lock in that case).  A DOIO_SOFT outcome queues the
 * event on sock->send_list; completed sends post a SENDDONE event
 * unless ISC_SOCKFLAG_IMMEDIATE was requested by the caller.
 */
static isc_result_t
socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address)) {
			socket_log(sock, NULL, TRACE, isc_msgcat,
				   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)", pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp)
		io_state = doio_send(sock, dev);
	else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		if (ISC_LIST_EMPTY(sock->send_list))
			io_state = doio_send(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = ISC_TRUE;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			if (ISC_LIST_EMPTY(sock->send_list) &&
			    !sock->pending_send)
				select_poke(sock->manager, sock->fd,
					    SELECT_POKE_WRITE);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

			socket_log(sock, NULL, EVENT, NULL, 0, 0,
				   "socket_send: event %p -> task %p",
				   dev, ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
				result = ISC_R_INPROGRESS;
			break;
		}

		/*
		 * FALLTHROUGH: with ISC_SOCKFLAG_NORETRY set a soft
		 * failure is not queued; it is completed immediately,
		 * exactly like DOIO_HARD / DOIO_SUCCESS below.
		 */
	case DOIO_HARD:
	case DOIO_SUCCESS:
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_senddone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}

ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
		 isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	/*
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc__socket_sendto(sock, region, task, action, arg, NULL,
				   NULL));
}

/*
 * Region-based send to an explicit destination.  Allocates a SENDDONE
 * event for (action, arg), copies the caller's region descriptor into
 * it, and hands off to socket_send().  address and pktinfo may be NULL
 * (socket_send() only records pktinfo when it is non-NULL).
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region,
		   isc_task_t *task, isc_taskaction_t action, const void *arg,
		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socketevent_t *dev;
	isc__socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(region != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
	if (dev == NULL)
		return (ISC_R_NOMEMORY);

	dev->region = *region;

	return (socket_send(sock, dev, task, address, pktinfo, 0));
}

/*
 * Buffer-list send with no explicit destination; thin wrapper around
 * isc__socket_sendtov() (which performs the REQUIRE() checking).
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		  isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	return (isc__socket_sendtov(sock, buflist, task, action, arg, NULL,
				    NULL));
}

/*
 * Buffer-list send to an explicit destination.  Note this counts the
 * *used* bytes of the buffer list (data to transmit), unlike the
 * receive path which counts available space.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendtov(isc_socket_t *sock0, isc_bufferlist_t *buflist,
		    isc_task_t *task, isc_taskaction_t action, const void *arg,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socketevent_t *dev;
	isc__socketmgr_t *manager;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	iocount = isc_bufferlist_usedcount(buflist);
	REQUIRE(iocount > 0);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE,
action, arg); 4804 if (dev == NULL) 4805 return (ISC_R_NOMEMORY); 4806 4807 /* 4808 * Move each buffer from the passed in list to our internal one. 4809 */ 4810 buffer = ISC_LIST_HEAD(*buflist); 4811 while (buffer != NULL) { 4812 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4813 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4814 buffer = ISC_LIST_HEAD(*buflist); 4815 } 4816 4817 return (socket_send(sock, dev, task, address, pktinfo, 0)); 4818} 4819 4820ISC_SOCKETFUNC_SCOPE isc_result_t 4821isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region, 4822 isc_task_t *task, 4823 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4824 isc_socketevent_t *event, unsigned int flags) 4825{ 4826 isc__socket_t *sock = (isc__socket_t *)sock0; 4827 4828 REQUIRE(VALID_SOCKET(sock)); 4829 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0); 4830 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) 4831 REQUIRE(sock->type == isc_sockettype_udp); 4832 event->ev_sender = sock; 4833 event->result = ISC_R_UNSET; 4834 ISC_LIST_INIT(event->bufferlist); 4835 event->region = *region; 4836 event->n = 0; 4837 event->offset = 0; 4838 event->attributes = 0; 4839 4840 return (socket_send(sock, event, task, address, pktinfo, flags)); 4841} 4842 4843ISC_SOCKETFUNC_SCOPE void 4844isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) { 4845#ifdef ISC_PLATFORM_HAVESYSUNH 4846 int s; 4847 struct stat sb; 4848 char strbuf[ISC_STRERRORSIZE]; 4849 4850 if (sockaddr->type.sa.sa_family != AF_UNIX) 4851 return; 4852 4853#ifndef S_ISSOCK 4854#if defined(S_IFMT) && defined(S_IFSOCK) 4855#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK) 4856#elif defined(_S_IFMT) && defined(S_IFSOCK) 4857#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK) 4858#endif 4859#endif 4860 4861#ifndef S_ISFIFO 4862#if defined(S_IFMT) && defined(S_IFIFO) 4863#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO) 4864#elif defined(_S_IFMT) && defined(S_IFIFO) 4865#define S_ISFIFO(mode) ((mode & 
_S_IFMT)==S_IFIFO) 4866#endif 4867#endif 4868 4869#if !defined(S_ISFIFO) && !defined(S_ISSOCK) 4870#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>. 4871#endif 4872 4873#ifndef S_ISFIFO 4874#define S_ISFIFO(mode) 0 4875#endif 4876 4877#ifndef S_ISSOCK 4878#define S_ISSOCK(mode) 0 4879#endif 4880 4881 if (active) { 4882 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 4883 isc__strerror(errno, strbuf, sizeof(strbuf)); 4884 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4885 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4886 "isc_socket_cleanunix: stat(%s): %s", 4887 sockaddr->type.sunix.sun_path, strbuf); 4888 return; 4889 } 4890 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 4891 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4892 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4893 "isc_socket_cleanunix: %s: not a socket", 4894 sockaddr->type.sunix.sun_path); 4895 return; 4896 } 4897 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 4898 isc__strerror(errno, strbuf, sizeof(strbuf)); 4899 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4900 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4901 "isc_socket_cleanunix: unlink(%s): %s", 4902 sockaddr->type.sunix.sun_path, strbuf); 4903 } 4904 return; 4905 } 4906 4907 s = socket(AF_UNIX, SOCK_STREAM, 0); 4908 if (s < 0) { 4909 isc__strerror(errno, strbuf, sizeof(strbuf)); 4910 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4911 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4912 "isc_socket_cleanunix: socket(%s): %s", 4913 sockaddr->type.sunix.sun_path, strbuf); 4914 return; 4915 } 4916 4917 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 4918 switch (errno) { 4919 case ENOENT: /* We exited cleanly last time */ 4920 break; 4921 default: 4922 isc__strerror(errno, strbuf, sizeof(strbuf)); 4923 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4924 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4925 "isc_socket_cleanunix: stat(%s): %s", 4926 sockaddr->type.sunix.sun_path, strbuf); 4927 break; 4928 } 
4929 goto cleanup; 4930 } 4931 4932 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 4933 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4934 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4935 "isc_socket_cleanunix: %s: not a socket", 4936 sockaddr->type.sunix.sun_path); 4937 goto cleanup; 4938 } 4939 4940 if (connect(s, (struct sockaddr *)&sockaddr->type.sunix, 4941 sizeof(sockaddr->type.sunix)) < 0) { 4942 switch (errno) { 4943 case ECONNREFUSED: 4944 case ECONNRESET: 4945 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 4946 isc__strerror(errno, strbuf, sizeof(strbuf)); 4947 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4948 ISC_LOGMODULE_SOCKET, 4949 ISC_LOG_WARNING, 4950 "isc_socket_cleanunix: " 4951 "unlink(%s): %s", 4952 sockaddr->type.sunix.sun_path, 4953 strbuf); 4954 } 4955 break; 4956 default: 4957 isc__strerror(errno, strbuf, sizeof(strbuf)); 4958 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4959 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4960 "isc_socket_cleanunix: connect(%s): %s", 4961 sockaddr->type.sunix.sun_path, strbuf); 4962 break; 4963 } 4964 } 4965 cleanup: 4966 close(s); 4967#else 4968 UNUSED(sockaddr); 4969 UNUSED(active); 4970#endif 4971} 4972 4973ISC_SOCKETFUNC_SCOPE isc_result_t 4974isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, 4975 isc_uint32_t owner, isc_uint32_t group) 4976{ 4977#ifdef ISC_PLATFORM_HAVESYSUNH 4978 isc_result_t result = ISC_R_SUCCESS; 4979 char strbuf[ISC_STRERRORSIZE]; 4980 char path[sizeof(sockaddr->type.sunix.sun_path)]; 4981#ifdef NEED_SECURE_DIRECTORY 4982 char *slash; 4983#endif 4984 4985 REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX); 4986 INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path)); 4987 strcpy(path, sockaddr->type.sunix.sun_path); 4988 4989#ifdef NEED_SECURE_DIRECTORY 4990 slash = strrchr(path, '/'); 4991 if (slash != NULL) { 4992 if (slash != path) 4993 *slash = '\0'; 4994 else 4995 strcpy(path, "/"); 4996 } else 4997 strcpy(path, "."); 4998#endif 4999 5000 if 
(chmod(path, perm) < 0) { 5001 isc__strerror(errno, strbuf, sizeof(strbuf)); 5002 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5003 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5004 "isc_socket_permunix: chmod(%s, %d): %s", 5005 path, perm, strbuf); 5006 result = ISC_R_FAILURE; 5007 } 5008 if (chown(path, owner, group) < 0) { 5009 isc__strerror(errno, strbuf, sizeof(strbuf)); 5010 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5011 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5012 "isc_socket_permunix: chown(%s, %d, %d): %s", 5013 path, owner, group, 5014 strbuf); 5015 result = ISC_R_FAILURE; 5016 } 5017 return (result); 5018#else 5019 UNUSED(sockaddr); 5020 UNUSED(perm); 5021 UNUSED(owner); 5022 UNUSED(group); 5023 return (ISC_R_NOTIMPLEMENTED); 5024#endif 5025} 5026 5027ISC_SOCKETFUNC_SCOPE isc_result_t 5028isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr, 5029 unsigned int options) { 5030 isc__socket_t *sock = (isc__socket_t *)sock0; 5031 char strbuf[ISC_STRERRORSIZE]; 5032 int on = 1; 5033 5034 REQUIRE(VALID_SOCKET(sock)); 5035 5036 LOCK(&sock->lock); 5037 5038 INSIST(!sock->bound); 5039 INSIST(!sock->dupped); 5040 5041 if (sock->pf != sockaddr->type.sa.sa_family) { 5042 UNLOCK(&sock->lock); 5043 return (ISC_R_FAMILYMISMATCH); 5044 } 5045 5046 /* 5047 * Only set SO_REUSEADDR when we want a specific port. 5048 */ 5049#ifdef AF_UNIX 5050 if (sock->pf == AF_UNIX) 5051 goto bind_socket; 5052#endif 5053 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 && 5054 isc_sockaddr_getport(sockaddr) != (in_port_t)0 && 5055 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, 5056 sizeof(on)) < 0) { 5057 UNEXPECTED_ERROR(__FILE__, __LINE__, 5058 "setsockopt(%d) %s", sock->fd, 5059 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 5060 ISC_MSG_FAILED, "failed")); 5061 /* Press on... 
	 */
	}
#ifdef AF_UNIX
 bind_socket:
#endif
	/*
	 * On failure the kernel errno is mapped to the closest ISC
	 * result; anything unrecognized is logged and reported as
	 * ISC_R_UNEXPECTED.  The bind-failure statistic is bumped in
	 * all error cases.
	 */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
 */
#undef ENABLE_ACCEPTFILTER

/*
 * Install a BSD accept filter on a listening socket.  Currently a
 * no-op returning ISC_R_NOTIMPLEMENTED everywhere, because
 * ENABLE_ACCEPTFILTER is forcibly #undef'd above (see RT22589).
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_filter(isc_socket_t *sock0, const char *filter) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else
	UNUSED(sock);
	UNUSED(filter);
#endif

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	/*
	 * NOTE(review): strncpy() does not guarantee NUL termination if
	 * strlen(filter) >= sizeof(afa.af_name).  Presumably afa was
	 * zeroed by bzero() above so shorter names are safe, but an
	 * over-long filter name would be passed unterminated -- confirm
	 * before re-enabling ENABLE_ACCEPTFILTER.
	 */
	strncpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
		       &afa, sizeof(afa)) == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
			   ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
			   strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else
	return (ISC_R_NOTIMPLEMENTED);
#endif
}

/*
 * Set up to listen on a given socket.  We do this by creating an internal
 * event that will be dispatched when the socket has read activity.  The
 * watcher will send the internal event to the task when there is a new
 * connection.
 *
 * Unlike in read, we don't preallocate a done event here.  Every time there
 * is a new connection we'll have to allocate a new one anyway, so we might
 * as well keep things simple rather than having to track them.
 */
/*
 * Mark a bound TCP or UNIX-domain socket as listening.  A backlog of 0
 * is replaced by the system default SOMAXCONN.  Fails with
 * ISC_R_UNEXPECTED if the listen(2) call itself fails.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(!sock->listener);
	REQUIRE(sock->bound);
	REQUIRE(sock->type == isc_sockettype_tcp ||
		sock->type == isc_sockettype_unix);

	if (backlog == 0)
		backlog = SOMAXCONN;

	if (listen(sock->fd, (int)backlog) < 0) {
		UNLOCK(&sock->lock);
		isc__strerror(errno, strbuf, sizeof(strbuf));

		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);

		return (ISC_R_UNEXPECTED);
	}

	sock->listener = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * This should try to do aggressive accept() XXXMLG
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_accept(isc_socket_t *sock0,
		   isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_newconnev_t *dev;
	isc__socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc__socket_t *nsock;
	isc_result_t result;
	isc_boolean_t do_poke = ISC_FALSE;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
5202 */ 5203 dev = (isc_socket_newconnev_t *) 5204 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN, 5205 action, arg, sizeof(*dev)); 5206 if (dev == NULL) { 5207 UNLOCK(&sock->lock); 5208 return (ISC_R_NOMEMORY); 5209 } 5210 ISC_LINK_INIT(dev, ev_link); 5211 5212 result = allocate_socket(manager, sock->type, &nsock); 5213 if (result != ISC_R_SUCCESS) { 5214 isc_event_free(ISC_EVENT_PTR(&dev)); 5215 UNLOCK(&sock->lock); 5216 return (result); 5217 } 5218 5219 /* 5220 * Attach to socket and to task. 5221 */ 5222 isc_task_attach(task, &ntask); 5223 if (isc_task_exiting(ntask)) { 5224 free_socket(&nsock); 5225 isc_task_detach(&ntask); 5226 isc_event_free(ISC_EVENT_PTR(&dev)); 5227 UNLOCK(&sock->lock); 5228 return (ISC_R_SHUTTINGDOWN); 5229 } 5230 nsock->references++; 5231 nsock->statsindex = sock->statsindex; 5232 5233 dev->ev_sender = ntask; 5234 dev->newsocket = (isc_socket_t *)nsock; 5235 5236 /* 5237 * Poke watcher here. We still have the socket locked, so there 5238 * is no race condition. We will keep the lock for such a short 5239 * bit of time waking it up now or later won't matter all that much. 
5240 */ 5241 if (ISC_LIST_EMPTY(sock->accept_list)) 5242 do_poke = ISC_TRUE; 5243 5244 ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link); 5245 5246 if (do_poke) 5247 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT); 5248 5249 UNLOCK(&sock->lock); 5250 return (ISC_R_SUCCESS); 5251} 5252 5253ISC_SOCKETFUNC_SCOPE isc_result_t 5254isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr, 5255 isc_task_t *task, isc_taskaction_t action, const void *arg) 5256{ 5257 isc__socket_t *sock = (isc__socket_t *)sock0; 5258 isc_socket_connev_t *dev; 5259 isc_task_t *ntask = NULL; 5260 isc__socketmgr_t *manager; 5261 int cc; 5262 char strbuf[ISC_STRERRORSIZE]; 5263 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 5264 5265 REQUIRE(VALID_SOCKET(sock)); 5266 REQUIRE(addr != NULL); 5267 REQUIRE(task != NULL); 5268 REQUIRE(action != NULL); 5269 5270 manager = sock->manager; 5271 REQUIRE(VALID_MANAGER(manager)); 5272 REQUIRE(addr != NULL); 5273 5274 if (isc_sockaddr_ismulticast(addr)) 5275 return (ISC_R_MULTICAST); 5276 5277 LOCK(&sock->lock); 5278 5279 REQUIRE(!sock->connecting); 5280 5281 dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock, 5282 ISC_SOCKEVENT_CONNECT, 5283 action, arg, 5284 sizeof(*dev)); 5285 if (dev == NULL) { 5286 UNLOCK(&sock->lock); 5287 return (ISC_R_NOMEMORY); 5288 } 5289 ISC_LINK_INIT(dev, ev_link); 5290 5291 /* 5292 * Try to do the connect right away, as there can be only one 5293 * outstanding, and it might happen to complete. 5294 */ 5295 sock->peer_address = *addr; 5296 cc = connect(sock->fd, &addr->type.sa, addr->length); 5297 if (cc < 0) { 5298 /* 5299 * HP-UX "fails" to connect a UDP socket and sets errno to 5300 * EINPROGRESS if it's non-blocking. We'd rather regard this as 5301 * a success and let the user detect it if it's really an error 5302 * at the time of sending a packet on the socket. 
5303 */ 5304 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { 5305 cc = 0; 5306 goto success; 5307 } 5308 if (SOFT_ERROR(errno) || errno == EINPROGRESS) 5309 goto queue; 5310 5311 switch (errno) { 5312#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit; 5313 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5314 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5315 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5316 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5317 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5318#ifdef EHOSTDOWN 5319 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5320#endif 5321 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5322 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5323 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5324 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5325 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5326#undef ERROR_MATCH 5327 } 5328 5329 sock->connected = 0; 5330 5331 isc__strerror(errno, strbuf, sizeof(strbuf)); 5332 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf)); 5333 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s", 5334 addrbuf, errno, strbuf); 5335 5336 UNLOCK(&sock->lock); 5337 inc_stats(sock->manager->stats, 5338 sock->statsindex[STATID_CONNECTFAIL]); 5339 isc_event_free(ISC_EVENT_PTR(&dev)); 5340 return (ISC_R_UNEXPECTED); 5341 5342 err_exit: 5343 sock->connected = 0; 5344 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5345 5346 UNLOCK(&sock->lock); 5347 inc_stats(sock->manager->stats, 5348 sock->statsindex[STATID_CONNECTFAIL]); 5349 return (ISC_R_SUCCESS); 5350 } 5351 5352 /* 5353 * If connect completed, fire off the done event. 5354 */ 5355 success: 5356 if (cc == 0) { 5357 sock->connected = 1; 5358 sock->bound = 1; 5359 dev->result = ISC_R_SUCCESS; 5360 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5361 5362 UNLOCK(&sock->lock); 5363 5364 inc_stats(sock->manager->stats, 5365 sock->statsindex[STATID_CONNECT]); 5366 5367 return (ISC_R_SUCCESS); 5368 } 5369 5370 queue: 5371 5372 /* 5373 * Attach to task. 
5374 */ 5375 isc_task_attach(task, &ntask); 5376 5377 sock->connecting = 1; 5378 5379 dev->ev_sender = ntask; 5380 5381 /* 5382 * Poke watcher here. We still have the socket locked, so there 5383 * is no race condition. We will keep the lock for such a short 5384 * bit of time waking it up now or later won't matter all that much. 5385 */ 5386 if (sock->connect_ev == NULL) 5387 select_poke(manager, sock->fd, SELECT_POKE_CONNECT); 5388 5389 sock->connect_ev = dev; 5390 5391 UNLOCK(&sock->lock); 5392 return (ISC_R_SUCCESS); 5393} 5394 5395/* 5396 * Called when a socket with a pending connect() finishes. 5397 */ 5398static void 5399internal_connect(isc_task_t *me, isc_event_t *ev) { 5400 isc__socket_t *sock; 5401 isc_socket_connev_t *dev; 5402 isc_task_t *task; 5403 int cc; 5404 ISC_SOCKADDR_LEN_T optlen; 5405 char strbuf[ISC_STRERRORSIZE]; 5406 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 5407 5408 UNUSED(me); 5409 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 5410 5411 sock = ev->ev_sender; 5412 INSIST(VALID_SOCKET(sock)); 5413 5414 LOCK(&sock->lock); 5415 5416 /* 5417 * When the internal event was sent the reference count was bumped 5418 * to keep the socket around for us. Decrement the count here. 5419 */ 5420 INSIST(sock->references > 0); 5421 sock->references--; 5422 if (sock->references == 0) { 5423 UNLOCK(&sock->lock); 5424 destroy(&sock); 5425 return; 5426 } 5427 5428 /* 5429 * Has this event been canceled? 5430 */ 5431 dev = sock->connect_ev; 5432 if (dev == NULL) { 5433 INSIST(!sock->connecting); 5434 UNLOCK(&sock->lock); 5435 return; 5436 } 5437 5438 INSIST(sock->connecting); 5439 sock->connecting = 0; 5440 5441 /* 5442 * Get any possible error status here. 5443 */ 5444 optlen = sizeof(cc); 5445 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, 5446 (void *)&cc, (void *)&optlen) < 0) 5447 cc = errno; 5448 else 5449 errno = cc; 5450 5451 if (errno != 0) { 5452 /* 5453 * If the error is EAGAIN, just re-select on this 5454 * fd and pretend nothing strange happened. 
5455 */ 5456 if (SOFT_ERROR(errno) || errno == EINPROGRESS) { 5457 sock->connecting = 1; 5458 select_poke(sock->manager, sock->fd, 5459 SELECT_POKE_CONNECT); 5460 UNLOCK(&sock->lock); 5461 5462 return; 5463 } 5464 5465 inc_stats(sock->manager->stats, 5466 sock->statsindex[STATID_CONNECTFAIL]); 5467 5468 /* 5469 * Translate other errors into ISC_R_* flavors. 5470 */ 5471 switch (errno) { 5472#define ERROR_MATCH(a, b) case a: dev->result = b; break; 5473 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5474 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5475 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5476 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5477 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5478#ifdef EHOSTDOWN 5479 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5480#endif 5481 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5482 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5483 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5484 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5485 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT); 5486 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5487#undef ERROR_MATCH 5488 default: 5489 dev->result = ISC_R_UNEXPECTED; 5490 isc_sockaddr_format(&sock->peer_address, peerbuf, 5491 sizeof(peerbuf)); 5492 isc__strerror(errno, strbuf, sizeof(strbuf)); 5493 UNEXPECTED_ERROR(__FILE__, __LINE__, 5494 "internal_connect: connect(%s) %s", 5495 peerbuf, strbuf); 5496 } 5497 } else { 5498 inc_stats(sock->manager->stats, 5499 sock->statsindex[STATID_CONNECT]); 5500 dev->result = ISC_R_SUCCESS; 5501 sock->connected = 1; 5502 sock->bound = 1; 5503 } 5504 5505 sock->connect_ev = NULL; 5506 5507 UNLOCK(&sock->lock); 5508 5509 task = dev->ev_sender; 5510 dev->ev_sender = sock; 5511 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 5512} 5513 5514ISC_SOCKETFUNC_SCOPE isc_result_t 5515isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) { 5516 isc__socket_t *sock = (isc__socket_t *)sock0; 5517 isc_result_t result; 5518 5519 REQUIRE(VALID_SOCKET(sock)); 5520 
REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (sock->connected) {
		/* Return the address cached at connect time. */
		*addressp = sock->peer_address;
		result = ISC_R_SUCCESS;
	} else {
		result = ISC_R_NOTCONNECTED;
	}

	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Fetch the local address of a bound socket via getsockname(2).
 * Returns ISC_R_NOTBOUND for unbound sockets and ISC_R_UNEXPECTED
 * (with a logged error) if the system call fails.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	ISC_SOCKADDR_LEN_T len;
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (!sock->bound) {
		result = ISC_R_NOTBOUND;
		goto out;
	}

	result = ISC_R_SUCCESS;

	len = sizeof(addressp->type);
	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto out;
	}
	addressp->length = (unsigned int)len;

 out:
	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0)
		return;

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	/*
	 * Cancel pending receives: a NULL 'task' matches every queued
	 * event; otherwise only events destined for 'task'.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
	    && !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			/* Grab 'next' first: posting may free 'dev'. */
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	/* Cancel pending sends, same matching rule as above. */
	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
	    && !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	/*
	 * Cancel pending accepts: the pre-created child socket attached
	 * to each queued accept event is released here (drop the
	 * reference taken when it was queued, then free it) before the
	 * canceled event is posted back to its task.
	 */
	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
	    && !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {

				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				NEWCONNSOCK(dev)->references--;
				free_socket((isc__socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendanddetach(&current_task,
						       ISC_EVENT_PTR(&dev));
			}

			dev = next;
		}
	}

	/*
	 * Connecting is not a list.
	 */
	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
	    && sock->connect_ev != NULL) {
		isc_socket_connev_t *dev;
		isc_task_t *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = sock->connect_ev;
		current_task = dev->ev_sender;

		if ((task == NULL) || (task == current_task)) {
			sock->connect_ev = NULL;

			dev->result = ISC_R_CANCELED;
			dev->ev_sender = sock;
			isc_task_sendanddetach(&current_task,
					       ISC_EVENT_PTR(&dev));
		}
	}

	UNLOCK(&sock->lock);
}

/*
 * Return the socket's type (udp/tcp/unix/fdwatch).  Read without the
 * lock; presumably the type is fixed at creation time -- NOTE(review):
 * confirm against socket creation code.
 */
ISC_SOCKETFUNC_SCOPE isc_sockettype_t
isc__socket_gettype(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	return (sock->type);
}

/*
 * Return ISC_TRUE iff the socket is currently bound to a local address.
 */
ISC_SOCKETFUNC_SCOPE isc_boolean_t
isc__socket_isbound(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_boolean_t val;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
	UNLOCK(&sock->lock);

	return (val);
}

/*
 * Set or clear the IPV6_V6ONLY option on an AF_INET6 socket.  A no-op
 * (other than the sanity checks) on platforms without IPV6_V6ONLY or
 * on non-IPv6 sockets; setsockopt() failure is logged, not returned.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(IPV6_V6ONLY)
	int onoff = yes ?
	    1 : 0;
#else
	UNUSED(yes);
	UNUSED(sock);
#endif

	REQUIRE(VALID_SOCKET(sock));
	INSIST(!sock->dupped);

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0) {
			char strbuf[ISC_STRERRORSIZE];
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
	}
	FIX_IPV6_RECVPKTINFO(sock);	/* AIX */
#endif
}

#ifndef USE_WATCHER_THREAD
/*
 * In our assumed scenario, we can simply use a single static object.
 * XXX: this is not true if the application uses multiple threads with
 * 'multi-context' mode.  Fixing this is a future TODO item.
 */
static isc_socketwait_t swait_private;

/*
 * Wait (up to 'tvp', or forever if NULL) for socket events using
 * whichever multiplex method was selected at build time (kqueue,
 * epoll, /dev/poll, or select).  On return '*swaitp' points at the
 * static swait_private holding the results for isc__socketmgr_dispatch.
 * Returns the raw event count from the underlying call (0 if there is
 * no manager).
 */
int
isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
			  isc_socketwait_t **swaitp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	int n;
#ifdef USE_KQUEUE
	struct timespec ts, *tsp;
#endif
#ifdef USE_EPOLL
	int timeout;
#endif
#ifdef USE_DEVPOLL
	struct dvpoll dvp;
#endif

	REQUIRE(swaitp != NULL && *swaitp == NULL);

#ifdef USE_SHARED_MANAGER
	if (manager == NULL)
		manager = socketmgr;
#endif
	if (manager == NULL)
		return (0);

#ifdef USE_KQUEUE
	/* kevent() takes a timespec; convert usec -> nsec. */
	if (tvp != NULL) {
		ts.tv_sec = tvp->tv_sec;
		ts.tv_nsec = tvp->tv_usec * 1000;
		tsp = &ts;
	} else
		tsp = NULL;
	swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
				       manager->events, manager->nevents,
				       tsp);
	n = swait_private.nevents;
#elif defined(USE_EPOLL)
	/* epoll_wait() takes milliseconds; round usec up. */
	if (tvp != NULL)
		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
	else
		timeout = -1;
	swait_private.nevents = epoll_wait(manager->epoll_fd,
					   manager->events,
					   manager->nevents, timeout);
	n = swait_private.nevents;
#elif defined(USE_DEVPOLL)
	dvp.dp_fds = manager->events;
	dvp.dp_nfds = manager->nevents;
	if (tvp != NULL) {
		dvp.dp_timeout = tvp->tv_sec * 1000 +
			(tvp->tv_usec + 999) / 1000;
	} else
		dvp.dp_timeout = -1;
	swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
	n = swait_private.nevents;
#elif defined(USE_SELECT)
	/* select() mutates its fd_sets, so work on copies. */
	memcpy(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
	memcpy(manager->write_fds_copy, manager->write_fds,
	       manager->fd_bufsize);

	swait_private.readset = manager->read_fds_copy;
	swait_private.writeset = manager->write_fds_copy;
	swait_private.maxfd = manager->maxfd + 1;

	n = select(swait_private.maxfd, swait_private.readset,
		   swait_private.writeset, NULL, tvp);
#endif

	*swaitp = &swait_private;
	return (n);
}

/*
 * Process the events collected by a prior isc__socketmgr_waitevents()
 * call ('swait' must be the same static object it returned).
 */
isc_result_t
isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(swait == &swait_private);

#ifdef USE_SHARED_MANAGER
	if (manager == NULL)
		manager = socketmgr;
#endif
	if (manager == NULL)
		return (ISC_R_NOTFOUND);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	(void)process_fds(manager, manager->events, swait->nevents);
	return (ISC_R_SUCCESS);
#elif defined(USE_SELECT)
	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
	return (ISC_R_SUCCESS);
#endif
}
#endif /* USE_WATCHER_THREAD */

#ifdef BIND9
/*
 * Attach a debugging name and opaque tag to 'socket' (used by the
 * XML stats renderer below).  The name is always NUL-terminated.
 */
void
isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
	isc__socket_t *socket = (isc__socket_t *)socket0;

	/*
	 * Name 'socket'.
5864 */ 5865 5866 REQUIRE(VALID_SOCKET(socket)); 5867 5868 LOCK(&socket->lock); 5869 memset(socket->name, 0, sizeof(socket->name)); 5870 strncpy(socket->name, name, sizeof(socket->name) - 1); 5871 socket->tag = tag; 5872 UNLOCK(&socket->lock); 5873} 5874 5875ISC_SOCKETFUNC_SCOPE const char * 5876isc__socket_getname(isc_socket_t *socket0) { 5877 isc__socket_t *socket = (isc__socket_t *)socket0; 5878 5879 return (socket->name); 5880} 5881 5882void * 5883isc__socket_gettag(isc_socket_t *socket0) { 5884 isc__socket_t *socket = (isc__socket_t *)socket0; 5885 5886 return (socket->tag); 5887} 5888#endif /* BIND9 */ 5889 5890#ifdef USE_SOCKETIMPREGISTER 5891isc_result_t 5892isc__socket_register() { 5893 return (isc_socket_register(isc__socketmgr_create)); 5894} 5895#endif 5896 5897ISC_SOCKETFUNC_SCOPE int 5898isc__socket_getfd(isc_socket_t *socket0) { 5899 isc__socket_t *socket = (isc__socket_t *)socket0; 5900 5901 return ((short) socket->fd); 5902} 5903 5904#if defined(HAVE_LIBXML2) && defined(BIND9) 5905 5906static const char * 5907_socktype(isc_sockettype_t type) 5908{ 5909 if (type == isc_sockettype_udp) 5910 return ("udp"); 5911 else if (type == isc_sockettype_tcp) 5912 return ("tcp"); 5913 else if (type == isc_sockettype_unix) 5914 return ("unix"); 5915 else if (type == isc_sockettype_fdwatch) 5916 return ("fdwatch"); 5917 else 5918 return ("not-initialized"); 5919} 5920 5921ISC_SOCKETFUNC_SCOPE void 5922isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) { 5923 isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0; 5924 isc__socket_t *sock; 5925 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 5926 isc_sockaddr_t addr; 5927 ISC_SOCKADDR_LEN_T len; 5928 5929 LOCK(&mgr->lock); 5930 5931#ifdef USE_SHARED_MANAGER 5932 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references"); 5933 xmlTextWriterWriteFormatString(writer, "%d", mgr->refs); 5934 xmlTextWriterEndElement(writer); 5935#endif /* USE_SHARED_MANAGER */ 5936 5937 xmlTextWriterStartElement(writer, 
ISC_XMLCHAR "sockets"); 5938 sock = ISC_LIST_HEAD(mgr->socklist); 5939 while (sock != NULL) { 5940 LOCK(&sock->lock); 5941 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"); 5942 5943 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"); 5944 xmlTextWriterWriteFormatString(writer, "%p", sock); 5945 xmlTextWriterEndElement(writer); 5946 5947 if (sock->name[0] != 0) { 5948 xmlTextWriterStartElement(writer, ISC_XMLCHAR "name"); 5949 xmlTextWriterWriteFormatString(writer, "%s", 5950 sock->name); 5951 xmlTextWriterEndElement(writer); /* name */ 5952 } 5953 5954 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references"); 5955 xmlTextWriterWriteFormatString(writer, "%d", sock->references); 5956 xmlTextWriterEndElement(writer); 5957 5958 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type", 5959 ISC_XMLCHAR _socktype(sock->type)); 5960 5961 if (sock->connected) { 5962 isc_sockaddr_format(&sock->peer_address, peerbuf, 5963 sizeof(peerbuf)); 5964 xmlTextWriterWriteElement(writer, 5965 ISC_XMLCHAR "peer-address", 5966 ISC_XMLCHAR peerbuf); 5967 } 5968 5969 len = sizeof(addr); 5970 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) { 5971 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf)); 5972 xmlTextWriterWriteElement(writer, 5973 ISC_XMLCHAR "local-address", 5974 ISC_XMLCHAR peerbuf); 5975 } 5976 5977 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"); 5978 if (sock->pending_recv) 5979 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5980 ISC_XMLCHAR "pending-receive"); 5981 if (sock->pending_send) 5982 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5983 ISC_XMLCHAR "pending-send"); 5984 if (sock->pending_accept) 5985 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5986 ISC_XMLCHAR "pending_accept"); 5987 if (sock->listener) 5988 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5989 ISC_XMLCHAR "listener"); 5990 if (sock->connected) 5991 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5992 ISC_XMLCHAR "connected"); 
5993 if (sock->connecting) 5994 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5995 ISC_XMLCHAR "connecting"); 5996 if (sock->bound) 5997 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5998 ISC_XMLCHAR "bound"); 5999 6000 xmlTextWriterEndElement(writer); /* states */ 6001 6002 xmlTextWriterEndElement(writer); /* socket */ 6003 6004 UNLOCK(&sock->lock); 6005 sock = ISC_LIST_NEXT(sock, link); 6006 } 6007 xmlTextWriterEndElement(writer); /* sockets */ 6008 6009 UNLOCK(&mgr->lock); 6010} 6011#endif /* HAVE_LIBXML2 */ 6012