1/* 2 * Copyright (C) 2004-2014 Internet Systems Consortium, Inc. ("ISC") 3 * Copyright (C) 1998-2003 Internet Software Consortium. 4 * 5 * Permission to use, copy, modify, and/or distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 15 * PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18/* $Id$ */ 19 20/*! \file */ 21 22#include <config.h> 23 24#include <sys/param.h> 25#include <sys/types.h> 26#include <sys/socket.h> 27#include <sys/stat.h> 28#include <sys/time.h> 29#include <sys/uio.h> 30 31#include <errno.h> 32#include <fcntl.h> 33#include <stddef.h> 34#include <stdlib.h> 35#include <string.h> 36#include <unistd.h> 37 38#include <isc/buffer.h> 39#include <isc/bufferlist.h> 40#include <isc/condition.h> 41#include <isc/formatcheck.h> 42#include <isc/list.h> 43#include <isc/log.h> 44#include <isc/mem.h> 45#include <isc/msgs.h> 46#include <isc/mutex.h> 47#include <isc/net.h> 48#include <isc/once.h> 49#include <isc/platform.h> 50#include <isc/print.h> 51#include <isc/region.h> 52#include <isc/socket.h> 53#include <isc/stats.h> 54#include <isc/strerror.h> 55#include <isc/task.h> 56#include <isc/thread.h> 57#include <isc/util.h> 58#include <isc/xml.h> 59 60#ifdef ISC_PLATFORM_HAVESYSUNH 61#include <sys/un.h> 62#endif 63#ifdef ISC_PLATFORM_HAVEKQUEUE 64#include <sys/event.h> 65#endif 66#ifdef ISC_PLATFORM_HAVEEPOLL 67#include <sys/epoll.h> 68#endif 69#ifdef ISC_PLATFORM_HAVEDEVPOLL 70#if 
defined(HAVE_SYS_DEVPOLL_H) 71#include <sys/devpoll.h> 72#elif defined(HAVE_DEVPOLL_H) 73#include <devpoll.h> 74#endif 75#endif 76 77#include "errno2result.h" 78 79/* See task.c about the following definition: */ 80#ifdef BIND9 81#ifdef ISC_PLATFORM_USETHREADS 82#define USE_WATCHER_THREAD 83#else 84#define USE_SHARED_MANAGER 85#endif /* ISC_PLATFORM_USETHREADS */ 86#endif /* BIND9 */ 87 88#ifndef USE_WATCHER_THREAD 89#include "socket_p.h" 90#include "../task_p.h" 91#endif /* USE_WATCHER_THREAD */ 92 93#if defined(SO_BSDCOMPAT) && defined(__linux__) 94#include <sys/utsname.h> 95#endif 96 97/*% 98 * Choose the most preferable multiplex method. 99 */ 100#ifdef ISC_PLATFORM_HAVEKQUEUE 101#define USE_KQUEUE 102#elif defined (ISC_PLATFORM_HAVEEPOLL) 103#define USE_EPOLL 104#elif defined (ISC_PLATFORM_HAVEDEVPOLL) 105#define USE_DEVPOLL 106typedef struct { 107 unsigned int want_read : 1, 108 want_write : 1; 109} pollinfo_t; 110#else 111#define USE_SELECT 112#endif /* ISC_PLATFORM_HAVEKQUEUE */ 113 114#ifndef USE_WATCHER_THREAD 115#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 116struct isc_socketwait { 117 int nevents; 118}; 119#elif defined (USE_SELECT) 120struct isc_socketwait { 121 fd_set *readset; 122 fd_set *writeset; 123 int nfds; 124 int maxfd; 125}; 126#endif /* USE_KQUEUE */ 127#endif /* !USE_WATCHER_THREAD */ 128 129/*% 130 * Maximum number of allowable open sockets. This is also the maximum 131 * allowable socket file descriptor. 132 * 133 * Care should be taken before modifying this value for select(): 134 * The API standard doesn't ensure select() accept more than (the system default 135 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in 136 * the vast majority of cases. 
This constant should therefore be increased only 137 * when absolutely necessary and possible, i.e., the server is exhausting all 138 * available file descriptors (up to FD_SETSIZE) and the select() function 139 * and FD_xxx macros support larger values than FD_SETSIZE (which may not 140 * always by true, but we keep using some of them to ensure as much 141 * portability as possible). Note also that overall server performance 142 * may be rather worsened with a larger value of this constant due to 143 * inherent scalability problems of select(). 144 * 145 * As a special note, this value shouldn't have to be touched if 146 * this is a build for an authoritative only DNS server. 147 */ 148#ifndef ISC_SOCKET_MAXSOCKETS 149#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 150#define ISC_SOCKET_MAXSOCKETS 4096 151#elif defined(USE_SELECT) 152#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE 153#endif /* USE_KQUEUE... */ 154#endif /* ISC_SOCKET_MAXSOCKETS */ 155 156#ifdef USE_SELECT 157/*% 158 * Mac OS X needs a special definition to support larger values in select(). 159 * We always define this because a larger value can be specified run-time. 160 */ 161#ifdef __APPLE__ 162#define _DARWIN_UNLIMITED_SELECT 163#endif /* __APPLE__ */ 164#endif /* USE_SELECT */ 165 166#ifdef ISC_SOCKET_USE_POLLWATCH 167/*% 168 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel 169 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for 170 * some of the specified FD. The idea is based on the observation that it's 171 * likely for a busy server to keep receiving packets. It specifically works 172 * as follows: the socket watcher is first initialized with the state of 173 * "poll_idle". While it's in the idle state it keeps sleeping until a socket 174 * event occurs. When it wakes up for a socket I/O event, it moves to the 175 * poll_active state, and sets the poll timeout to a short period 176 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). 
If timeout occurs in this state, the 177 * watcher goes to the poll_checking state with the same timeout period. 178 * In this state, the watcher tries to detect whether this is a break 179 * during intermittent events or the kernel bug is triggered. If the next 180 * polling reports an event within the short period, the previous timeout is 181 * likely to be a kernel bug, and so the watcher goes back to the active state. 182 * Otherwise, it moves to the idle state again. 183 * 184 * It's not clear whether this is a thread-related bug, but since we've only 185 * seen this with threads, this workaround is used only when enabling threads. 186 */ 187 188typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; 189 190#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT 191#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 192#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ 193#endif /* ISC_SOCKET_USE_POLLWATCH */ 194 195/*% 196 * Size of per-FD lock buckets. 197 */ 198#ifdef ISC_PLATFORM_USETHREADS 199#define FDLOCK_COUNT 1024 200#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) 201#else 202#define FDLOCK_COUNT 1 203#define FDLOCK_ID(fd) 0 204#endif /* ISC_PLATFORM_USETHREADS */ 205 206/*% 207 * Maximum number of events communicated with the kernel. There should normally 208 * be no need for having a large number. 209 */ 210#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 211#ifndef ISC_SOCKET_MAXEVENTS 212#define ISC_SOCKET_MAXEVENTS 64 213#endif 214#endif 215 216/*% 217 * Some systems define the socket length argument as an int, some as size_t, 218 * some as socklen_t. This is here so it can be easily changed if needed. 219 */ 220#ifndef ISC_SOCKADDR_LEN_T 221#define ISC_SOCKADDR_LEN_T unsigned int 222#endif 223 224/*% 225 * Define what the possible "soft" errors can be. These are non-fatal returns 226 * of various network related functions, like recv() and so on. 
227 * 228 * For some reason, BSDI (and perhaps others) will sometimes return <0 229 * from recv() but will have errno==0. This is broken, but we have to 230 * work around it here. 231 */ 232#define SOFT_ERROR(e) ((e) == EAGAIN || \ 233 (e) == EWOULDBLOCK || \ 234 (e) == EINTR || \ 235 (e) == 0) 236 237#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x) 238 239/*!< 240 * DLVL(90) -- Function entry/exit and other tracing. 241 * DLVL(70) -- Socket "correctness" -- including returning of events, etc. 242 * DLVL(60) -- Socket data send/receive 243 * DLVL(50) -- Event tracing, including receiving/sending completion events. 244 * DLVL(20) -- Socket creation/destruction. 245 */ 246#define TRACE_LEVEL 90 247#define CORRECTNESS_LEVEL 70 248#define IOEVENT_LEVEL 60 249#define EVENT_LEVEL 50 250#define CREATION_LEVEL 20 251 252#define TRACE DLVL(TRACE_LEVEL) 253#define CORRECTNESS DLVL(CORRECTNESS_LEVEL) 254#define IOEVENT DLVL(IOEVENT_LEVEL) 255#define EVENT DLVL(EVENT_LEVEL) 256#define CREATION DLVL(CREATION_LEVEL) 257 258typedef isc_event_t intev_t; 259 260#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o') 261#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC) 262 263/*! 264 * IPv6 control information. If the socket is an IPv6 socket we want 265 * to collect the destination address and interface so the client can 266 * set them on outgoing packets. 267 */ 268#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 269#ifndef USE_CMSG 270#define USE_CMSG 1 271#endif 272#endif 273 274/*% 275 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have 276 * a setsockopt() like interface to request timestamps, and if the OS 277 * doesn't do it for us, call gettimeofday() on every UDP receive? 278 */ 279#ifdef SO_TIMESTAMP 280#ifndef USE_CMSG 281#define USE_CMSG 1 282#endif 283#endif 284 285/*% 286 * The size to raise the receive buffer to (from BIND 8). 
287 */ 288#define RCVBUFSIZE (32*1024) 289 290/*% 291 * The number of times a send operation is repeated if the result is EINTR. 292 */ 293#define NRETRIES 10 294 295typedef struct isc__socket isc__socket_t; 296typedef struct isc__socketmgr isc__socketmgr_t; 297 298#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket) 299 300struct isc__socket { 301 /* Not locked. */ 302 isc_socket_t common; 303 isc__socketmgr_t *manager; 304 isc_mutex_t lock; 305 isc_sockettype_t type; 306 const isc_statscounter_t *statsindex; 307 308 /* Locked by socket lock. */ 309 ISC_LINK(isc__socket_t) link; 310 unsigned int references; 311 int fd; 312 int pf; 313 char name[16]; 314 void * tag; 315 316 ISC_LIST(isc_socketevent_t) send_list; 317 ISC_LIST(isc_socketevent_t) recv_list; 318 ISC_LIST(isc_socket_newconnev_t) accept_list; 319 isc_socket_connev_t *connect_ev; 320 321 /* 322 * Internal events. Posted when a descriptor is readable or 323 * writable. These are statically allocated and never freed. 324 * They will be set to non-purgable before use. 325 */ 326 intev_t readable_ev; 327 intev_t writable_ev; 328 329 isc_sockaddr_t peer_address; /* remote address */ 330 331 unsigned int pending_recv : 1, 332 pending_send : 1, 333 pending_accept : 1, 334 listener : 1, /* listener socket */ 335 connected : 1, 336 connecting : 1, /* connect pending */ 337 bound : 1, /* bound to local addr */ 338 dupped : 1; 339 340#ifdef ISC_NET_RECVOVERFLOW 341 unsigned char overflow; /* used for MSG_TRUNC fake */ 342#endif 343 344 char *recvcmsgbuf; 345 ISC_SOCKADDR_LEN_T recvcmsgbuflen; 346 char *sendcmsgbuf; 347 ISC_SOCKADDR_LEN_T sendcmsgbuflen; 348 349 void *fdwatcharg; 350 isc_sockfdwatch_t fdwatchcb; 351 int fdwatchflags; 352 isc_task_t *fdwatchtask; 353}; 354 355#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g') 356#define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC) 357 358struct isc__socketmgr { 359 /* Not locked. 
*/ 360 isc_socketmgr_t common; 361 isc_mem_t *mctx; 362 isc_mutex_t lock; 363 isc_mutex_t *fdlock; 364 isc_stats_t *stats; 365#ifdef USE_KQUEUE 366 int kqueue_fd; 367 int nevents; 368 struct kevent *events; 369#endif /* USE_KQUEUE */ 370#ifdef USE_EPOLL 371 int epoll_fd; 372 int nevents; 373 struct epoll_event *events; 374#endif /* USE_EPOLL */ 375#ifdef USE_DEVPOLL 376 int devpoll_fd; 377 int nevents; 378 struct pollfd *events; 379#endif /* USE_DEVPOLL */ 380#ifdef USE_SELECT 381 int fd_bufsize; 382#endif /* USE_SELECT */ 383 unsigned int maxsocks; 384#ifdef ISC_PLATFORM_USETHREADS 385 int pipe_fds[2]; 386#endif 387 388 /* Locked by fdlock. */ 389 isc__socket_t **fds; 390 int *fdstate; 391#ifdef USE_DEVPOLL 392 pollinfo_t *fdpollinfo; 393#endif 394 395 /* Locked by manager lock. */ 396 ISC_LIST(isc__socket_t) socklist; 397#ifdef USE_SELECT 398 fd_set *read_fds; 399 fd_set *read_fds_copy; 400 fd_set *write_fds; 401 fd_set *write_fds_copy; 402 int maxfd; 403#endif /* USE_SELECT */ 404 int reserved; /* unlocked */ 405#ifdef USE_WATCHER_THREAD 406 isc_thread_t watcher; 407 isc_condition_t shutdown_ok; 408#else /* USE_WATCHER_THREAD */ 409 unsigned int refs; 410#endif /* USE_WATCHER_THREAD */ 411 int maxudp; 412}; 413 414#ifdef USE_SHARED_MANAGER 415static isc__socketmgr_t *socketmgr = NULL; 416#endif /* USE_SHARED_MANAGER */ 417 418#define CLOSED 0 /* this one must be zero */ 419#define MANAGED 1 420#define CLOSE_PENDING 2 421 422/* 423 * send() and recv() iovec counts 424 */ 425#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER) 426#ifdef ISC_NET_RECVOVERFLOW 427# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1) 428#else 429# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER) 430#endif 431 432static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf, 433 isc_sockettype_t type, 434 isc_socket_t **socketp, 435 isc_socket_t *dup_socket); 436static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **); 437static void 
send_senddone_event(isc__socket_t *, isc_socketevent_t **); 438static void free_socket(isc__socket_t **); 439static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t, 440 isc__socket_t **); 441static void destroy(isc__socket_t **); 442static void internal_accept(isc_task_t *, isc_event_t *); 443static void internal_connect(isc_task_t *, isc_event_t *); 444static void internal_recv(isc_task_t *, isc_event_t *); 445static void internal_send(isc_task_t *, isc_event_t *); 446static void internal_fdwatch_write(isc_task_t *, isc_event_t *); 447static void internal_fdwatch_read(isc_task_t *, isc_event_t *); 448static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *); 449static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *, 450 struct msghdr *, struct iovec *, size_t *); 451static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *, 452 struct msghdr *, struct iovec *, size_t *); 453#ifdef USE_WATCHER_THREAD 454static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager); 455#endif 456 457/*% 458 * The following can be either static or public, depending on build environment. 
459 */ 460 461#ifdef BIND9 462#define ISC_SOCKETFUNC_SCOPE 463#else 464#define ISC_SOCKETFUNC_SCOPE static 465#endif 466 467ISC_SOCKETFUNC_SCOPE isc_result_t 468isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, 469 isc_socket_t **socketp); 470ISC_SOCKETFUNC_SCOPE void 471isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp); 472ISC_SOCKETFUNC_SCOPE void 473isc__socket_detach(isc_socket_t **socketp); 474ISC_SOCKETFUNC_SCOPE isc_result_t 475isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp); 476ISC_SOCKETFUNC_SCOPE isc_result_t 477isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, 478 unsigned int maxsocks); 479ISC_SOCKETFUNC_SCOPE void 480isc__socketmgr_destroy(isc_socketmgr_t **managerp); 481ISC_SOCKETFUNC_SCOPE isc_result_t 482isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist, 483 unsigned int minimum, isc_task_t *task, 484 isc_taskaction_t action, const void *arg); 485ISC_SOCKETFUNC_SCOPE isc_result_t 486isc__socket_recv(isc_socket_t *sock, isc_region_t *region, 487 unsigned int minimum, isc_task_t *task, 488 isc_taskaction_t action, const void *arg); 489ISC_SOCKETFUNC_SCOPE isc_result_t 490isc__socket_recv2(isc_socket_t *sock, isc_region_t *region, 491 unsigned int minimum, isc_task_t *task, 492 isc_socketevent_t *event, unsigned int flags); 493ISC_SOCKETFUNC_SCOPE isc_result_t 494isc__socket_send(isc_socket_t *sock, isc_region_t *region, 495 isc_task_t *task, isc_taskaction_t action, const void *arg); 496ISC_SOCKETFUNC_SCOPE isc_result_t 497isc__socket_sendto(isc_socket_t *sock, isc_region_t *region, 498 isc_task_t *task, isc_taskaction_t action, const void *arg, 499 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo); 500ISC_SOCKETFUNC_SCOPE isc_result_t 501isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist, 502 isc_task_t *task, isc_taskaction_t action, const void *arg); 503ISC_SOCKETFUNC_SCOPE isc_result_t 504isc__socket_sendtov(isc_socket_t *sock, 
isc_bufferlist_t *buflist, 505 isc_task_t *task, isc_taskaction_t action, const void *arg, 506 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo); 507ISC_SOCKETFUNC_SCOPE isc_result_t 508isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist, 509 isc_task_t *task, isc_taskaction_t action, const void *arg, 510 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 511 unsigned int flags); 512ISC_SOCKETFUNC_SCOPE isc_result_t 513isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region, 514 isc_task_t *task, 515 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 516 isc_socketevent_t *event, unsigned int flags); 517ISC_SOCKETFUNC_SCOPE void 518isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active); 519ISC_SOCKETFUNC_SCOPE isc_result_t 520isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, 521 isc_uint32_t owner, isc_uint32_t group); 522ISC_SOCKETFUNC_SCOPE isc_result_t 523isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr, 524 unsigned int options); 525ISC_SOCKETFUNC_SCOPE isc_result_t 526isc__socket_filter(isc_socket_t *sock, const char *filter); 527ISC_SOCKETFUNC_SCOPE isc_result_t 528isc__socket_listen(isc_socket_t *sock, unsigned int backlog); 529ISC_SOCKETFUNC_SCOPE isc_result_t 530isc__socket_accept(isc_socket_t *sock, 531 isc_task_t *task, isc_taskaction_t action, const void *arg); 532ISC_SOCKETFUNC_SCOPE isc_result_t 533isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, 534 isc_task_t *task, isc_taskaction_t action, 535 const void *arg); 536ISC_SOCKETFUNC_SCOPE isc_result_t 537isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp); 538ISC_SOCKETFUNC_SCOPE isc_result_t 539isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp); 540ISC_SOCKETFUNC_SCOPE void 541isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how); 542ISC_SOCKETFUNC_SCOPE isc_sockettype_t 543isc__socket_gettype(isc_socket_t *sock); 544ISC_SOCKETFUNC_SCOPE isc_boolean_t 
545isc__socket_isbound(isc_socket_t *sock); 546ISC_SOCKETFUNC_SCOPE void 547isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes); 548#if defined(HAVE_LIBXML2) && defined(BIND9) 549ISC_SOCKETFUNC_SCOPE void 550isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer); 551#endif 552 553ISC_SOCKETFUNC_SCOPE isc_result_t 554isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags, 555 isc_sockfdwatch_t callback, void *cbarg, 556 isc_task_t *task, isc_socket_t **socketp); 557ISC_SOCKETFUNC_SCOPE isc_result_t 558isc__socket_fdwatchpoke(isc_socket_t *sock, int flags); 559ISC_SOCKETFUNC_SCOPE isc_result_t 560isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp); 561ISC_SOCKETFUNC_SCOPE int 562isc__socket_getfd(isc_socket_t *sock); 563 564static struct { 565 isc_socketmethods_t methods; 566 567 /*% 568 * The following are defined just for avoiding unused static functions. 569 */ 570#ifndef BIND9 571 void *recvv, *send, *sendv, *sendto2, *sendtov, *cleanunix, *permunix, 572 *filter, *listen, *accept, *getpeername, *isbound; 573#endif 574} socketmethods = { 575 { 576 isc__socket_attach, 577 isc__socket_detach, 578 isc__socket_bind, 579 isc__socket_sendto, 580 isc__socket_sendto2, 581 isc__socket_connect, 582 isc__socket_recv, 583 isc__socket_recv2, 584 isc__socket_cancel, 585 isc__socket_getsockname, 586 isc__socket_gettype, 587 isc__socket_ipv6only, 588 isc__socket_fdwatchpoke, 589 isc__socket_dup, 590 isc__socket_getfd 591 } 592#ifndef BIND9 593 , 594 (void *)isc__socket_recvv, (void *)isc__socket_send, 595 (void *)isc__socket_sendv, (void *)isc__socket_sendto2, 596 (void *)isc__socket_sendtov, 597 (void *)isc__socket_cleanunix, (void *)isc__socket_permunix, 598 (void *)isc__socket_filter, (void *)isc__socket_listen, 599 (void *)isc__socket_accept, (void *)isc__socket_getpeername, 600 (void *)isc__socket_isbound 601#endif 602}; 603 604static isc_socketmgrmethods_t socketmgrmethods = { 605 isc__socketmgr_destroy, 606 
isc__socket_create, 607 isc__socket_fdwatchcreate 608}; 609 610#define SELECT_POKE_SHUTDOWN (-1) 611#define SELECT_POKE_NOTHING (-2) 612#define SELECT_POKE_READ (-3) 613#define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */ 614#define SELECT_POKE_WRITE (-4) 615#define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */ 616#define SELECT_POKE_CLOSE (-5) 617 618#define SOCK_DEAD(s) ((s)->references == 0) 619 620/*% 621 * Shortcut index arrays to get access to statistics counters. 622 */ 623enum { 624 STATID_OPEN = 0, 625 STATID_OPENFAIL = 1, 626 STATID_CLOSE = 2, 627 STATID_BINDFAIL = 3, 628 STATID_CONNECTFAIL = 4, 629 STATID_CONNECT = 5, 630 STATID_ACCEPTFAIL = 6, 631 STATID_ACCEPT = 7, 632 STATID_SENDFAIL = 8, 633 STATID_RECVFAIL = 9 634}; 635static const isc_statscounter_t udp4statsindex[] = { 636 isc_sockstatscounter_udp4open, 637 isc_sockstatscounter_udp4openfail, 638 isc_sockstatscounter_udp4close, 639 isc_sockstatscounter_udp4bindfail, 640 isc_sockstatscounter_udp4connectfail, 641 isc_sockstatscounter_udp4connect, 642 -1, 643 -1, 644 isc_sockstatscounter_udp4sendfail, 645 isc_sockstatscounter_udp4recvfail 646}; 647static const isc_statscounter_t udp6statsindex[] = { 648 isc_sockstatscounter_udp6open, 649 isc_sockstatscounter_udp6openfail, 650 isc_sockstatscounter_udp6close, 651 isc_sockstatscounter_udp6bindfail, 652 isc_sockstatscounter_udp6connectfail, 653 isc_sockstatscounter_udp6connect, 654 -1, 655 -1, 656 isc_sockstatscounter_udp6sendfail, 657 isc_sockstatscounter_udp6recvfail 658}; 659static const isc_statscounter_t tcp4statsindex[] = { 660 isc_sockstatscounter_tcp4open, 661 isc_sockstatscounter_tcp4openfail, 662 isc_sockstatscounter_tcp4close, 663 isc_sockstatscounter_tcp4bindfail, 664 isc_sockstatscounter_tcp4connectfail, 665 isc_sockstatscounter_tcp4connect, 666 isc_sockstatscounter_tcp4acceptfail, 667 isc_sockstatscounter_tcp4accept, 668 isc_sockstatscounter_tcp4sendfail, 669 isc_sockstatscounter_tcp4recvfail 670}; 671static const isc_statscounter_t 
tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
	isc_sockstatscounter_tcp6recvfail
};
/* Counter map for AF_UNIX sockets, indexed by the STATID_* enum. */
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
	isc_sockstatscounter_unixrecvfail
};
/*
 * Counter map for fdwatch sockets.  Open/openfail and accept/acceptfail
 * events are not counted for this type, hence the -1 placeholders
 * (inc_stats() REQUIREs the id is never -1, so those slots must not be used).
 */
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
	isc_sockstatscounter_fdwatchrecvfail
};

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
	defined(USE_WATCHER_THREAD)
/*
 * Log a printf-style message tagged with the socket manager pointer.
 * Only compiled in for configurations whose code paths actually call it.
 * Returns early (skipping the formatting work) when the log level would
 * not be written.
 */
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	/* Avoid formatting cost when nothing would be logged. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}
#endif

/*
 * Log a printf-style message tagged with the socket pointer and, when
 * 'address' is non-NULL, the formatted peer address.  Uses the message
 * catalog (isc_log_iwrite) so the fixed text can be localized.
 */
static void
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
static void
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	/* Avoid formatting cost when nothing would be logged. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	/* The AIX workaround is only relevant for IPv6 UDP sockets. */
	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) < 0) {

		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
	}
}
#else
/* On platforms without the AIX bug this is a no-op. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif

/*%
 * Increment socket-related statistics counters.
 * A NULL 'stats' (no statistics collection configured) is silently ignored;
 * counterid must not be the -1 placeholder used in the index arrays.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats != NULL)
		isc_stats_increment(stats, counterid);
}

/*
 * Register interest in read (SELECT_POKE_READ) or write (SELECT_POKE_WRITE)
 * readiness of 'fd' with whichever multiplex mechanism this build uses
 * (kqueue, epoll, /dev/poll, or select).  Returns ISC_R_SUCCESS or an
 * error converted from errno.
 */
static inline isc_result_t
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/* EEXIST just means the fd is already registered; not an error. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	/*
	 * fdpollinfo tracks which events we currently want, so that
	 * unwatch_fd() can re-register the remaining interest later.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*
 * Cancel interest in read/write readiness of 'fd'; the inverse of
 * watch_fd().  Returns ISC_R_SUCCESS or an error converted from errno.
 */
static inline isc_result_t
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;
	/* ENOENT just means the fd was not registered; not an error. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*
 * Act on a poke for 'fd': close it (SELECT_POKE_CLOSE) or start
 * watching it for the requested read/write event.  Pokes for sockets
 * already being closed are quietly dropped after deregistration.
 */
static void
wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
		return;
	}
	UNLOCK(&manager->fdlock[lockid]);

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}

#ifdef USE_WATCHER_THREAD
/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 */
static void
select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	/* Message format: { fd, msg } written atomically to the pipe. */
	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
			    strbuf);
	}

	INSIST(cc == sizeof(buf));
}

/*
 * Read a message on the internal fd.
 * On a soft read error *msg is SELECT_POKE_NOTHING and *fd is -1;
 * a hard error is fatal.
 */
static void
select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		*msg = SELECT_POKE_NOTHING;
		*fd = -1;	/* Silence compiler. */
		if (SOFT_ERROR(errno))
			return;

		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
			    strbuf);

		return;
	}
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}
#else /* USE_WATCHER_THREAD */
/*
 * Update the state of the socketmgr when something changes.
 * Without a watcher thread there is no pipe; act on the poke directly.
 */
static void
select_poke(isc__socketmgr_t *manager, int fd, int msg) {
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
	return;
}
#endif /* USE_WATCHER_THREAD */

/*
 * Make a fd non-blocking.
1118 */ 1119static isc_result_t 1120make_nonblock(int fd) { 1121 int ret; 1122 int flags; 1123 char strbuf[ISC_STRERRORSIZE]; 1124#ifdef USE_FIONBIO_IOCTL 1125 int on = 1; 1126 1127 ret = ioctl(fd, FIONBIO, (char *)&on); 1128#else 1129 flags = fcntl(fd, F_GETFL, 0); 1130 flags |= PORT_NONBLOCK; 1131 ret = fcntl(fd, F_SETFL, flags); 1132#endif 1133 1134 if (ret == -1) { 1135 isc__strerror(errno, strbuf, sizeof(strbuf)); 1136 UNEXPECTED_ERROR(__FILE__, __LINE__, 1137#ifdef USE_FIONBIO_IOCTL 1138 "ioctl(%d, FIONBIO, &on): %s", fd, 1139#else 1140 "fcntl(%d, F_SETFL, %d): %s", fd, flags, 1141#endif 1142 strbuf); 1143 1144 return (ISC_R_UNEXPECTED); 1145 } 1146 1147 return (ISC_R_SUCCESS); 1148} 1149 1150#ifdef USE_CMSG 1151/* 1152 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE. 1153 * In order to ensure as much portability as possible, we provide wrapper 1154 * functions of these macros. 1155 * Note that cmsg_space() could run slow on OSes that do not have 1156 * CMSG_SPACE. 1157 */ 1158static inline ISC_SOCKADDR_LEN_T 1159cmsg_len(ISC_SOCKADDR_LEN_T len) { 1160#ifdef CMSG_LEN 1161 return (CMSG_LEN(len)); 1162#else 1163 ISC_SOCKADDR_LEN_T hdrlen; 1164 1165 /* 1166 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA 1167 * is correct. 1168 */ 1169 hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL)); 1170 return (hdrlen + len); 1171#endif 1172} 1173 1174static inline ISC_SOCKADDR_LEN_T 1175cmsg_space(ISC_SOCKADDR_LEN_T len) { 1176#ifdef CMSG_SPACE 1177 return (CMSG_SPACE(len)); 1178#else 1179 struct msghdr msg; 1180 struct cmsghdr *cmsgp; 1181 /* 1182 * XXX: The buffer length is an ad-hoc value, but should be enough 1183 * in a practical sense. 
1184 */ 1185 char dummybuf[sizeof(struct cmsghdr) + 1024]; 1186 1187 memset(&msg, 0, sizeof(msg)); 1188 msg.msg_control = dummybuf; 1189 msg.msg_controllen = sizeof(dummybuf); 1190 1191 cmsgp = (struct cmsghdr *)dummybuf; 1192 cmsgp->cmsg_len = cmsg_len(len); 1193 1194 cmsgp = CMSG_NXTHDR(&msg, cmsgp); 1195 if (cmsgp != NULL) 1196 return ((char *)cmsgp - (char *)msg.msg_control); 1197 else 1198 return (0); 1199#endif 1200} 1201#endif /* USE_CMSG */ 1202 1203/* 1204 * Process control messages received on a socket. 1205 */ 1206static void 1207process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { 1208#ifdef USE_CMSG 1209 struct cmsghdr *cmsgp; 1210#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 1211 struct in6_pktinfo *pktinfop; 1212#endif 1213#ifdef SO_TIMESTAMP 1214 void *timevalp; 1215#endif 1216#endif 1217 1218 /* 1219 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined. 1220 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined. 1221 * They are all here, outside of the CPP tests, because it is 1222 * more consistent with the usual ISC coding style. 
1223 */ 1224 UNUSED(sock); 1225 UNUSED(msg); 1226 UNUSED(dev); 1227 1228#ifdef ISC_NET_BSD44MSGHDR 1229 1230#ifdef MSG_TRUNC 1231 if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC) 1232 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 1233#endif 1234 1235#ifdef MSG_CTRUNC 1236 if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC) 1237 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC; 1238#endif 1239 1240#ifndef USE_CMSG 1241 return; 1242#else 1243 if (msg->msg_controllen == 0U || msg->msg_control == NULL) 1244 return; 1245 1246#ifdef SO_TIMESTAMP 1247 timevalp = NULL; 1248#endif 1249#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 1250 pktinfop = NULL; 1251#endif 1252 1253 cmsgp = CMSG_FIRSTHDR(msg); 1254 while (cmsgp != NULL) { 1255 socket_log(sock, NULL, TRACE, 1256 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG, 1257 "processing cmsg %p", cmsgp); 1258 1259#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 1260 if (cmsgp->cmsg_level == IPPROTO_IPV6 1261 && cmsgp->cmsg_type == IPV6_PKTINFO) { 1262 1263 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); 1264 memmove(&dev->pktinfo, pktinfop, 1265 sizeof(struct in6_pktinfo)); 1266 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 1267 socket_log(sock, NULL, TRACE, 1268 isc_msgcat, ISC_MSGSET_SOCKET, 1269 ISC_MSG_IFRECEIVED, 1270 "interface received on ifindex %u", 1271 dev->pktinfo.ipi6_ifindex); 1272 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) 1273 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; 1274 goto next; 1275 } 1276#endif 1277 1278#ifdef SO_TIMESTAMP 1279 if (cmsgp->cmsg_level == SOL_SOCKET 1280 && cmsgp->cmsg_type == SCM_TIMESTAMP) { 1281 struct timeval tv; 1282 timevalp = CMSG_DATA(cmsgp); 1283 memmove(&tv, timevalp, sizeof(tv)); 1284 dev->timestamp.seconds = tv.tv_sec; 1285 dev->timestamp.nanoseconds = tv.tv_usec * 1000; 1286 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP; 1287 goto next; 1288 } 1289#endif 1290 1291 next: 1292 cmsgp = CMSG_NXTHDR(msg, cmsgp); 1293 } 1294#endif /* USE_CMSG */ 1295 1296#endif /* ISC_NET_BSD44MSGHDR */ 1297} 1298 
/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;

	memset(msg, 0, sizeof(*msg));

	/*
	 * A connected socket must not pass a destination address to
	 * sendmsg(); an unconnected one sends to dev->address.
	 */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 * (dev->n is the number of bytes already sent in earlier attempts.)
	 */
	if (buffer == NULL) {
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	/* Build one iovec per remaining non-empty buffer. */
	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0U);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	/*
	 * For UDP sends carrying pktinfo, attach an IPV6_PKTINFO cmsg
	 * (and, where available, a per-packet IPV6_USE_MIN_MTU cmsg)
	 * using the socket's preallocated send cmsg buffer.
	 */
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
#if defined(IPV6_USE_MIN_MTU)
		int use_min_mtu = 1;	/* -1, 0, 1 */
#endif
		struct cmsghdr *cmsgp;
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
		msg->msg_control = (void *)sock->sendcmsgbuf;

		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
#if defined(IPV6_USE_MIN_MTU)
		/*
		 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
		 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
		 * is used.
		 */
		cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
					   msg->msg_controllen);
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
#endif
	}
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIN6PKTINFO */
#else /* ISC_NET_BSD44MSGHDR */
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
#endif /* ISC_NET_BSD44MSGHDR */

	if (write_countp != NULL)
		*write_countp = write_count;
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
1448 */ 1449static void 1450build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev, 1451 struct msghdr *msg, struct iovec *iov, size_t *read_countp) 1452{ 1453 unsigned int iovcount; 1454 isc_buffer_t *buffer; 1455 isc_region_t available; 1456 size_t read_count; 1457 1458 memset(msg, 0, sizeof(struct msghdr)); 1459 1460 if (sock->type == isc_sockettype_udp) { 1461 memset(&dev->address, 0, sizeof(dev->address)); 1462#ifdef BROKEN_RECVMSG 1463 if (sock->pf == AF_INET) { 1464 msg->msg_name = (void *)&dev->address.type.sin; 1465 msg->msg_namelen = sizeof(dev->address.type.sin6); 1466 } else if (sock->pf == AF_INET6) { 1467 msg->msg_name = (void *)&dev->address.type.sin6; 1468 msg->msg_namelen = sizeof(dev->address.type.sin6); 1469#ifdef ISC_PLATFORM_HAVESYSUNH 1470 } else if (sock->pf == AF_UNIX) { 1471 msg->msg_name = (void *)&dev->address.type.sunix; 1472 msg->msg_namelen = sizeof(dev->address.type.sunix); 1473#endif 1474 } else { 1475 msg->msg_name = (void *)&dev->address.type.sa; 1476 msg->msg_namelen = sizeof(dev->address.type); 1477 } 1478#else 1479 msg->msg_name = (void *)&dev->address.type.sa; 1480 msg->msg_namelen = sizeof(dev->address.type); 1481#endif 1482#ifdef ISC_NET_RECVOVERFLOW 1483 /* If needed, steal one iovec for overflow detection. */ 1484 maxiov--; 1485#endif 1486 } else { /* TCP */ 1487 msg->msg_name = NULL; 1488 msg->msg_namelen = 0; 1489 dev->address = sock->peer_address; 1490 } 1491 1492 buffer = ISC_LIST_HEAD(dev->bufferlist); 1493 read_count = 0; 1494 1495 /* 1496 * Single buffer I/O? Skip what we've done so far in this region. 1497 */ 1498 if (buffer == NULL) { 1499 read_count = dev->region.length - dev->n; 1500 iov[0].iov_base = (void *)(dev->region.base + dev->n); 1501 iov[0].iov_len = read_count; 1502 iovcount = 1; 1503 1504 goto config; 1505 } 1506 1507 /* 1508 * Multibuffer I/O. 1509 * Skip empty buffers. 
1510 */ 1511 while (buffer != NULL) { 1512 REQUIRE(ISC_BUFFER_VALID(buffer)); 1513 if (isc_buffer_availablelength(buffer) != 0) 1514 break; 1515 buffer = ISC_LIST_NEXT(buffer, link); 1516 } 1517 1518 iovcount = 0; 1519 while (buffer != NULL) { 1520 INSIST(iovcount < MAXSCATTERGATHER_RECV); 1521 1522 isc_buffer_availableregion(buffer, &available); 1523 1524 if (available.length > 0) { 1525 iov[iovcount].iov_base = (void *)(available.base); 1526 iov[iovcount].iov_len = available.length; 1527 read_count += available.length; 1528 iovcount++; 1529 } 1530 buffer = ISC_LIST_NEXT(buffer, link); 1531 } 1532 1533 config: 1534 1535 /* 1536 * If needed, set up to receive that one extra byte. Note that 1537 * we know there is at least one iov left, since we stole it 1538 * at the top of this function. 1539 */ 1540#ifdef ISC_NET_RECVOVERFLOW 1541 if (sock->type == isc_sockettype_udp) { 1542 iov[iovcount].iov_base = (void *)(&sock->overflow); 1543 iov[iovcount].iov_len = 1; 1544 iovcount++; 1545 } 1546#endif 1547 1548 msg->msg_iov = iov; 1549 msg->msg_iovlen = iovcount; 1550 1551#ifdef ISC_NET_BSD44MSGHDR 1552 msg->msg_control = NULL; 1553 msg->msg_controllen = 0; 1554 msg->msg_flags = 0; 1555#if defined(USE_CMSG) 1556 if (sock->type == isc_sockettype_udp) { 1557 msg->msg_control = sock->recvcmsgbuf; 1558 msg->msg_controllen = sock->recvcmsgbuflen; 1559 } 1560#endif /* USE_CMSG */ 1561#else /* ISC_NET_BSD44MSGHDR */ 1562 msg->msg_accrights = NULL; 1563 msg->msg_accrightslen = 0; 1564#endif /* ISC_NET_BSD44MSGHDR */ 1565 1566 if (read_countp != NULL) 1567 *read_countp = read_count; 1568} 1569 1570static void 1571set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock, 1572 isc_socketevent_t *dev) 1573{ 1574 if (sock->type == isc_sockettype_udp) { 1575 if (address != NULL) 1576 dev->address = *address; 1577 else 1578 dev->address = sock->peer_address; 1579 } else if (sock->type == isc_sockettype_tcp) { 1580 INSIST(address == NULL); 1581 dev->address = sock->peer_address; 1582 
} 1583} 1584 1585static void 1586destroy_socketevent(isc_event_t *event) { 1587 isc_socketevent_t *ev = (isc_socketevent_t *)event; 1588 1589 INSIST(ISC_LIST_EMPTY(ev->bufferlist)); 1590 1591 (ev->destroy)(event); 1592} 1593 1594static isc_socketevent_t * 1595allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype, 1596 isc_taskaction_t action, const void *arg) 1597{ 1598 isc_socketevent_t *ev; 1599 1600 ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx, 1601 sock, eventtype, 1602 action, arg, 1603 sizeof(*ev)); 1604 1605 if (ev == NULL) 1606 return (NULL); 1607 1608 ev->result = ISC_R_UNSET; 1609 ISC_LINK_INIT(ev, ev_link); 1610 ISC_LIST_INIT(ev->bufferlist); 1611 ev->region.base = NULL; 1612 ev->n = 0; 1613 ev->offset = 0; 1614 ev->attributes = 0; 1615 ev->destroy = ev->ev_destroy; 1616 ev->ev_destroy = destroy_socketevent; 1617 1618 return (ev); 1619} 1620 1621#if defined(ISC_SOCKET_DEBUG) 1622static void 1623dump_msg(struct msghdr *msg) { 1624 unsigned int i; 1625 1626 printf("MSGHDR %p\n", msg); 1627 printf("\tname %p, namelen %ld\n", msg->msg_name, 1628 (long) msg->msg_namelen); 1629 printf("\tiov %p, iovlen %ld\n", msg->msg_iov, 1630 (long) msg->msg_iovlen); 1631 for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) 1632 printf("\t\t%d\tbase %p, len %ld\n", i, 1633 msg->msg_iov[i].iov_base, 1634 (long) msg->msg_iov[i].iov_len); 1635#ifdef ISC_NET_BSD44MSGHDR 1636 printf("\tcontrol %p, controllen %ld\n", msg->msg_control, 1637 (long) msg->msg_controllen); 1638#endif 1639} 1640#endif 1641 1642#define DOIO_SUCCESS 0 /* i/o ok, event sent */ 1643#define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */ 1644#define DOIO_HARD 2 /* i/o error, event sent */ 1645#define DOIO_EOF 3 /* EOF, no event sent */ 1646 1647static int 1648doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) { 1649 int cc; 1650 struct iovec iov[MAXSCATTERGATHER_RECV]; 1651 size_t read_count; 1652 size_t actual_count; 1653 struct msghdr msghdr; 1654 isc_buffer_t 
*buffer; 1655 int recv_errno; 1656 char strbuf[ISC_STRERRORSIZE]; 1657 1658 build_msghdr_recv(sock, dev, &msghdr, iov, &read_count); 1659 1660#if defined(ISC_SOCKET_DEBUG) 1661 dump_msg(&msghdr); 1662#endif 1663 1664 cc = recvmsg(sock->fd, &msghdr, 0); 1665 recv_errno = errno; 1666 1667#if defined(ISC_SOCKET_DEBUG) 1668 dump_msg(&msghdr); 1669#endif 1670 1671 if (cc < 0) { 1672 if (SOFT_ERROR(recv_errno)) 1673 return (DOIO_SOFT); 1674 1675 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { 1676 isc__strerror(recv_errno, strbuf, sizeof(strbuf)); 1677 socket_log(sock, NULL, IOEVENT, 1678 isc_msgcat, ISC_MSGSET_SOCKET, 1679 ISC_MSG_DOIORECV, 1680 "doio_recv: recvmsg(%d) %d bytes, err %d/%s", 1681 sock->fd, cc, recv_errno, strbuf); 1682 } 1683 1684#define SOFT_OR_HARD(_system, _isc) \ 1685 if (recv_errno == _system) { \ 1686 if (sock->connected) { \ 1687 dev->result = _isc; \ 1688 inc_stats(sock->manager->stats, \ 1689 sock->statsindex[STATID_RECVFAIL]); \ 1690 return (DOIO_HARD); \ 1691 } \ 1692 return (DOIO_SOFT); \ 1693 } 1694#define ALWAYS_HARD(_system, _isc) \ 1695 if (recv_errno == _system) { \ 1696 dev->result = _isc; \ 1697 inc_stats(sock->manager->stats, \ 1698 sock->statsindex[STATID_RECVFAIL]); \ 1699 return (DOIO_HARD); \ 1700 } 1701 1702 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED); 1703 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH); 1704 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH); 1705 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN); 1706 /* HPUX 11.11 can return EADDRNOTAVAIL. */ 1707 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 1708 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); 1709 /* Should never get this one but it was seen. */ 1710#ifdef ENOPROTOOPT 1711 SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH); 1712#endif 1713 /* 1714 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6 1715 * errors. 
1716 */ 1717#ifdef EPROTO 1718 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH); 1719#endif 1720 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH); 1721 1722#undef SOFT_OR_HARD 1723#undef ALWAYS_HARD 1724 1725 dev->result = isc__errno2result(recv_errno); 1726 inc_stats(sock->manager->stats, 1727 sock->statsindex[STATID_RECVFAIL]); 1728 return (DOIO_HARD); 1729 } 1730 1731 /* 1732 * On TCP and UNIX sockets, zero length reads indicate EOF, 1733 * while on UDP sockets, zero length reads are perfectly valid, 1734 * although strange. 1735 */ 1736 switch (sock->type) { 1737 case isc_sockettype_tcp: 1738 case isc_sockettype_unix: 1739 if (cc == 0) 1740 return (DOIO_EOF); 1741 break; 1742 case isc_sockettype_udp: 1743 break; 1744 case isc_sockettype_fdwatch: 1745 default: 1746 INSIST(0); 1747 } 1748 1749 if (sock->type == isc_sockettype_udp) { 1750 dev->address.length = msghdr.msg_namelen; 1751 if (isc_sockaddr_getport(&dev->address) == 0) { 1752 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { 1753 socket_log(sock, &dev->address, IOEVENT, 1754 isc_msgcat, ISC_MSGSET_SOCKET, 1755 ISC_MSG_ZEROPORT, 1756 "dropping source port zero packet"); 1757 } 1758 return (DOIO_SOFT); 1759 } 1760 /* 1761 * Simulate a firewall blocking UDP responses bigger than 1762 * 512 bytes. 1763 */ 1764 if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp) 1765 return (DOIO_SOFT); 1766 } 1767 1768 socket_log(sock, &dev->address, IOEVENT, 1769 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV, 1770 "packet received correctly"); 1771 1772 /* 1773 * Overflow bit detection. If we received MORE bytes than we should, 1774 * this indicates an overflow situation. Set the flag in the 1775 * dev entry and adjust how much we read by one. 
1776 */ 1777#ifdef ISC_NET_RECVOVERFLOW 1778 if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) { 1779 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 1780 cc--; 1781 } 1782#endif 1783 1784 /* 1785 * If there are control messages attached, run through them and pull 1786 * out the interesting bits. 1787 */ 1788 if (sock->type == isc_sockettype_udp) 1789 process_cmsg(sock, &msghdr, dev); 1790 1791 /* 1792 * update the buffers (if any) and the i/o count 1793 */ 1794 dev->n += cc; 1795 actual_count = cc; 1796 buffer = ISC_LIST_HEAD(dev->bufferlist); 1797 while (buffer != NULL && actual_count > 0U) { 1798 REQUIRE(ISC_BUFFER_VALID(buffer)); 1799 if (isc_buffer_availablelength(buffer) <= actual_count) { 1800 actual_count -= isc_buffer_availablelength(buffer); 1801 isc_buffer_add(buffer, 1802 isc_buffer_availablelength(buffer)); 1803 } else { 1804 isc_buffer_add(buffer, actual_count); 1805 actual_count = 0; 1806 POST(actual_count); 1807 break; 1808 } 1809 buffer = ISC_LIST_NEXT(buffer, link); 1810 if (buffer == NULL) { 1811 INSIST(actual_count == 0U); 1812 } 1813 } 1814 1815 /* 1816 * If we read less than we expected, update counters, 1817 * and let the upper layer poke the descriptor. 1818 */ 1819 if (((size_t)cc != read_count) && (dev->n < dev->minimum)) 1820 return (DOIO_SOFT); 1821 1822 /* 1823 * Full reads are posted, or partials if partials are ok. 1824 */ 1825 dev->result = ISC_R_SUCCESS; 1826 return (DOIO_SUCCESS); 1827} 1828 1829/* 1830 * Returns: 1831 * DOIO_SUCCESS The operation succeeded. dev->result contains 1832 * ISC_R_SUCCESS. 1833 * 1834 * DOIO_HARD A hard or unexpected I/O error was encountered. 1835 * dev->result contains the appropriate error. 1836 * 1837 * DOIO_SOFT A soft I/O error was encountered. No senddone 1838 * event was sent. The operation should be retried. 1839 * 1840 * No other return values are possible. 
 */
static int
doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);

 resend:
	cc = sendmsg(sock->fd, &msghdr, 0);
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry EINTR up to NRETRIES times before giving up. */
		if (send_errno == EINTR && ++attempts < NRETRIES)
			goto resend;

		if (SOFT_ERROR(send_errno))
			return (DOIO_SOFT);

#define SOFT_OR_HARD(_system, _isc) \
	if (send_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (send_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		isc__strerror(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	if (cc == 0) {
		/* A zero-byte sendmsg() is unexpected; log it but fall
		 * through to the partial-write accounting below. */
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() %s 0",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_RETURNED, "returned"));
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Kill.
 *
 * Caller must ensure that the socket is not locked and no external
 * references exist.
 */
static void
closesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
	isc_sockettype_t type = sock->type;
	int lockid = FDLOCK_ID(fd);

	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&manager->fdlock[lockid]);
	manager->fds[fd] = NULL;
	if (type == isc_sockettype_fdwatch)
		manager->fdstate[fd] = CLOSED;
	else
		manager->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&manager->fdlock[lockid]);
	if (type == isc_sockettype_fdwatch) {
		/*
		 * The caller may close the socket once this function returns,
		 * and `fd' may be reassigned for a new socket.  So we do
		 * unwatch_fd() here, rather than defer it via select_poke().
		 * Note: this may complicate data protection among threads and
		 * may reduce performance due to additional locks.  One way to
		 * solve this would be to dup() the watched descriptor, but we
		 * take a simpler approach at this moment.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
	} else
		select_poke(manager, fd, SELECT_POKE_CLOSE);

	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&manager->lock);
	if (manager->maxfd == fd) {
		int i;

		/* Scan downward for the next highest managed descriptor. */
		manager->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&manager->fdlock[lockid]);
			if (manager->fdstate[i] == MANAGED) {
				manager->maxfd = i;
				UNLOCK(&manager->fdlock[lockid]);
				break;
			}
			UNLOCK(&manager->fdlock[lockid]);
		}
#ifdef ISC_PLATFORM_USETHREADS
		if (manager->maxfd < manager->pipe_fds[0])
			manager->maxfd = manager->pipe_fds[0];
#endif
	}
	UNLOCK(&manager->lock);
#endif	/* USE_SELECT */
}

/*
 * Tear down a socket whose reference count has dropped to zero:
 * close its descriptor (if still open), unlink it from the manager's
 * socket list, and free it.
 */
static void
destroy(isc__socket_t **sockp) {
	int fd;
	isc__socket_t *sock = *sockp;
	isc__socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DESTROYING, "destroying");

	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		fd = sock->fd;
		sock->fd = -1;
		closesocket(manager, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

#ifdef USE_WATCHER_THREAD
	if (ISC_LIST_EMPTY(manager->socklist))
		SIGNAL(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/* can't unlock manager as its memory context is still used */
	free_socket(sockp);

	UNLOCK(&manager->lock);
}

/*
 * Allocate and initialize an isc__socket_t of the given type, including
 * its cmsg receive/send buffers and lock.  The descriptor is left at -1;
 * opensocket() fills it in later.
 */
static isc_result_t
allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
		isc__socket_t **socketp)
{
	isc__socket_t *sock;
	isc_result_t result;
	ISC_SOCKADDR_LEN_T cmsgbuflen;

	sock = isc_mem_get(manager->mctx, sizeof(*sock));

	if (sock == NULL)
		return (ISC_R_NOMEMORY);

	sock->common.magic = 0;
	sock->common.impmagic = 0;
	sock->references = 0;

	sock->manager = manager;
	sock->type = type;
	sock->fd = -1;
	sock->dupped = 0;
	sock->statsindex = NULL;

	ISC_LINK_INIT(sock, link);

	sock->recvcmsgbuf = NULL;
	sock->sendcmsgbuf = NULL;

	/*
	 * Set up cmsg buffers.
	 */
	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#endif
#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
	cmsgbuflen += cmsg_space(sizeof(struct timeval));
#endif
	sock->recvcmsgbuflen = cmsgbuflen;
	if (sock->recvcmsgbuflen != 0U) {
		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->recvcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
#if defined(IPV6_USE_MIN_MTU)
	/*
	 * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
	 * support.
	 */
	cmsgbuflen += cmsg_space(sizeof(int));
#endif
#endif
	sock->sendcmsgbuflen = cmsgbuflen;
	if (sock->sendcmsgbuflen != 0U) {
		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->sendcmsgbuf == NULL) {
			result = ISC_R_NOMEMORY;
			goto error;
		}
	}

	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;

	/*
	 * Set up list of readers and writers to be initially empty.
	 */
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
	ISC_LIST_INIT(sock->accept_list);
	sock->connect_ev = NULL;
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;

	/*
	 * Initialize the lock.
	 */
	result = isc_mutex_init(&sock->lock);
	if (result != ISC_R_SUCCESS) {
		sock->common.magic = 0;
		sock->common.impmagic = 0;
		goto error;
	}

	/*
	 * Initialize readable and writable events.
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
		       NULL, sock, sock, NULL, NULL);
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
		       NULL, sock, sock, NULL, NULL);

	sock->common.magic = ISCAPI_SOCKET_MAGIC;
	sock->common.impmagic = SOCKET_MAGIC;
	*socketp = sock;

	return (ISC_R_SUCCESS);

 error:
	/* Release any partially-built state before reporting failure. */
	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);
	isc_mem_put(manager->mctx, sock, sizeof(*sock));

	return (result);
}

/*
 * This event requires that the various lists be empty, that the reference
 * count be 1, and that the magic number is valid.  The other socket bits,
 * like the lock, must be initialized as well.  The fd associated must be
 * marked as closed, by setting it to -1 on close, or this routine will
 * also close the socket.
 */
static void
free_socket(isc__socket_t **socketp) {
	isc__socket_t *sock = *socketp;

	/* All I/O must be complete and all queues drained before freeing. */
	INSIST(sock->references == 0);
	INSIST(VALID_SOCKET(sock));
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(!ISC_LINK_LINKED(sock, link));

	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);

	/* Invalidate the magic numbers before the memory is returned. */
	sock->common.magic = 0;
	sock->common.impmagic = 0;

	DESTROYLOCK(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));

	*socketp = NULL;
}

#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  Having to workout
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this a build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMAT because some kernels require it.
 */

static isc_once_t bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * Run once via bsdcompat_once: on Linux 2.4 and later the SO_BSDCOMPAT
 * option is deprecated, so clear the 'bsdcompat' flag there.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	 struct utsname buf;
	 char *endp;
	 long int major;
	 long int minor;

	 uname(&buf);    /* Can only fail if buf is bad in Linux. */

	 /* Paranoia in parsing can be increased, but we trust uname(). */
	 major = strtol(buf.release, &endp, 10);
	 if (*endp == '.') {
		minor = strtol(endp+1, &endp, 10);
		if ((major > 2) || ((major == 2) && (minor >= 4))) {
			bsdcompat = ISC_FALSE;
		}
	 }
#endif /* __linux __ */
}
#endif

/*
 * Request the minimum MTU behavior on IPv6 sockets (IPV6_USE_MIN_MTU
 * and/or a fixed IPV6_MTU of 1280, whichever the platform provides).
 * Failures are deliberately ignored; this is best-effort.
 */
static void
use_min_mtu(isc__socket_t *sock) {
#if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
	UNUSED(sock);
#endif
#ifdef IPV6_USE_MIN_MTU
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
#endif
#if defined(IPV6_MTU)
	/*
	 * Use minimum MTU on IPv6 sockets.
	 */
	if (sock->pf == AF_INET6) {
		int mtu = 1280;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
				 &mtu, sizeof(mtu));
	}
#endif
}

/*
 * Open the OS-level descriptor for 'sock' (or dup() the descriptor of
 * 'dup_socket' when non-NULL) and apply the standard per-socket options.
 * On success sock->fd holds a nonblocking descriptor below the manager's
 * maxsocks limit.  Returns ISC_R_SUCCESS, ISC_R_NORESOURCES,
 * ISC_R_FAMILYNOSUPPORT or ISC_R_UNEXPECTED.
 */
static isc_result_t
opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
	   isc__socket_t *dup_socket)
{
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif

 again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_fdwatch:
			/*
			 * We should not be called for isc_sockettype_fdwatch
			 * sockets.
			 */
			INSIST(0);
			break;
		}
	} else {
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
		goto again;

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl";
	}
#endif

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			       isc_msgcat, ISC_MSGSET_SOCKET,
			       ISC_MSG_TOOMANYFDS,
			       "socket: file descriptor exceeds limit (%d/%u)",
			       sock->fd, manager->maxsocks);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* A dup()ed socket inherits all options from the original. */
	if (dup_socket != NULL)
		goto setup_done;

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (result);
	}

#ifdef SO_BSDCOMPAT
	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
				  clear_bsdcompat) == ISC_R_SUCCESS);
	if (sock->type != isc_sockettype_unix && bsdcompat &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

	/*
	 * Use minimum mtu if possible.
	 */
	use_min_mtu(sock);

#if defined(USE_CMSG) || defined(SO_RCVBUF)
	if (sock->type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
			       (void *)&on, sizeof(on)) < 0
		    && errno != ENOPROTOOPT) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#if defined(ISC_PLATFORM_HAVEIPV6)
		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
			/*
			 * Warn explicitly because this anomaly can be hidden
			 * in usual operation (and unexpectedly appear later).
			 */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "No buffer available to receive "
					 "IPv6 destination");
		}
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* RFC 2292 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv6/UDP sockets.
		 */
		if (sock->pf == AF_INET6) {
			int action = IPV6_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_MTU_DISCOVER, &action,
					 sizeof(action));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int action = IP_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
					 &action, sizeof(action));
		}
#endif
#if defined(IP_DONTFRAG)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int off = 0;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
					 &off, sizeof(off));
		}
#endif

#if defined(SO_RCVBUF)
		/* Grow the receive buffer to at least RCVBUFSIZE. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			       (void *)&size, &optlen) >= 0 &&
		    size < RCVBUFSIZE) {
			size = RCVBUFSIZE;
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&size, sizeof(size)) == -1) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
					sock->fd, size,
					isc_msgcat_get(isc_msgcat,
						       ISC_MSGSET_GENERAL,
						       ISC_MSG_FAILED,
						       "failed"),
					strbuf);
			}
		}
#endif
	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

setup_done:
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);

	return (ISC_R_SUCCESS);
}

/*
 * Create a 'type' socket or duplicate an existing socket, managed
 * by 'manager'.  Events will be posted to 'task' and when dispatched
 * 'action' will be called with 'arg' as the arg value.
 The new
 * socket is returned in 'socketp'.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket)
{
	isc__socket_t *sock = NULL;
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	REQUIRE(type != isc_sockettype_fdwatch);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/* Pick the statistics counter set matching type and family. */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex =
			(pf == AF_INET) ? udp4statsindex : udp6statsindex;
		break;
	case isc_sockettype_tcp:
		sock->statsindex =
			(pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	default:
		INSIST(0);
	}

	sock->pf = pf;

	result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
	if (result != ISC_R_SUCCESS) {
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		free_socket(&sock);
		return (result);
	}

	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the descriptor with the manager's fd tables. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, dup_socket != NULL ? "dupped" : "created");

	return (ISC_R_SUCCESS);
}

/*%
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp)
{
	return (socket_create(manager0, pf, type, socketp, NULL));
}

/*%
 * Duplicate an existing socket.  The new socket is returned
 * in 'socketp'.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(socketp != NULL && *socketp == NULL);

	/* Passing sock0 as dup_socket makes socket_create() dup() the fd. */
	return (socket_create((isc_socketmgr_t *) sock->manager,
			      sock->pf, sock->type, socketp,
			      sock0));
}

#ifdef BIND9
/*
 * (Re)open the OS-level descriptor of a previously closed socket.
 * The caller must hold the only reference, and sock->fd must be -1.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_open(isc_socket_t *sock0) {
	isc_result_t result;
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	UNLOCK(&sock->lock);
	/*
	 * We don't need to retain the lock hereafter, since no one else has
	 * this socket.
	 */
	REQUIRE(sock->fd == -1);

	result = opensocket(sock->manager, sock, NULL);
	if (result != ISC_R_SUCCESS)
		sock->fd = -1;

	if (result == ISC_R_SUCCESS) {
		int lockid = FDLOCK_ID(sock->fd);

		/* Re-register the new descriptor with the manager. */
		LOCK(&sock->manager->fdlock[lockid]);
		sock->manager->fds[sock->fd] = sock;
		sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
		INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
		       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
		UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (sock->manager->maxfd < sock->fd)
			sock->manager->maxfd = sock->fd;
		UNLOCK(&sock->manager->lock);
#endif
	}

	return (result);
}
#endif	/* BIND9 */

/*
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	isc__socket_t *sock = NULL;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	/* Adopt the caller's descriptor; 'callback' is invoked from 'task'. */
	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;
	sock->statsindex = fdwatchstatsindex;

	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	lockid = FDLOCK_ID(sock->fd);
	LOCK(&manager->fdlock[lockid]);
	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;
	UNLOCK(&manager->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Start watching immediately for whichever directions were asked. */
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_CREATED, "fdwatch-created");

	return (ISC_R_SUCCESS);
}

/*
 * Indicate to the manager that it should watch the socket again.
2788 * This can be used to restart watching if the previous event handler 2789 * didn't indicate there was more data to be processed. Primarily 2790 * it is for writing but could be used for reading if desired 2791 */ 2792 2793ISC_SOCKETFUNC_SCOPE isc_result_t 2794isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags) 2795{ 2796 isc__socket_t *sock = (isc__socket_t *)sock0; 2797 2798 REQUIRE(VALID_SOCKET(sock)); 2799 2800 /* 2801 * We check both flags first to allow us to get the lock 2802 * once but only if we need it. 2803 */ 2804 2805 if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) { 2806 LOCK(&sock->lock); 2807 if (((flags & ISC_SOCKFDWATCH_READ) != 0) && 2808 !sock->pending_recv) 2809 select_poke(sock->manager, sock->fd, 2810 SELECT_POKE_READ); 2811 if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) && 2812 !sock->pending_send) 2813 select_poke(sock->manager, sock->fd, 2814 SELECT_POKE_WRITE); 2815 UNLOCK(&sock->lock); 2816 } 2817 2818 socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET, 2819 ISC_MSG_POKED, "fdwatch-poked flags: %d", flags); 2820 2821 return (ISC_R_SUCCESS); 2822} 2823 2824/* 2825 * Attach to a socket. Caller must explicitly detach when it is done. 2826 */ 2827ISC_SOCKETFUNC_SCOPE void 2828isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) { 2829 isc__socket_t *sock = (isc__socket_t *)sock0; 2830 2831 REQUIRE(VALID_SOCKET(sock)); 2832 REQUIRE(socketp != NULL && *socketp == NULL); 2833 2834 LOCK(&sock->lock); 2835 sock->references++; 2836 UNLOCK(&sock->lock); 2837 2838 *socketp = (isc_socket_t *)sock; 2839} 2840 2841/* 2842 * Dereference a socket. If this is the last reference to it, clean things 2843 * up by destroying the socket. 
2844 */ 2845ISC_SOCKETFUNC_SCOPE void 2846isc__socket_detach(isc_socket_t **socketp) { 2847 isc__socket_t *sock; 2848 isc_boolean_t kill_socket = ISC_FALSE; 2849 2850 REQUIRE(socketp != NULL); 2851 sock = (isc__socket_t *)*socketp; 2852 REQUIRE(VALID_SOCKET(sock)); 2853 2854 LOCK(&sock->lock); 2855 REQUIRE(sock->references > 0); 2856 sock->references--; 2857 if (sock->references == 0) 2858 kill_socket = ISC_TRUE; 2859 UNLOCK(&sock->lock); 2860 2861 if (kill_socket) 2862 destroy(&sock); 2863 2864 *socketp = NULL; 2865} 2866 2867#ifdef BIND9 2868ISC_SOCKETFUNC_SCOPE isc_result_t 2869isc__socket_close(isc_socket_t *sock0) { 2870 isc__socket_t *sock = (isc__socket_t *)sock0; 2871 int fd; 2872 isc__socketmgr_t *manager; 2873 2874 fflush(stdout); 2875 REQUIRE(VALID_SOCKET(sock)); 2876 2877 LOCK(&sock->lock); 2878 2879 REQUIRE(sock->references == 1); 2880 REQUIRE(sock->type != isc_sockettype_fdwatch); 2881 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); 2882 2883 INSIST(!sock->connecting); 2884 INSIST(!sock->pending_recv); 2885 INSIST(!sock->pending_send); 2886 INSIST(!sock->pending_accept); 2887 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 2888 INSIST(ISC_LIST_EMPTY(sock->send_list)); 2889 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 2890 INSIST(sock->connect_ev == NULL); 2891 2892 manager = sock->manager; 2893 fd = sock->fd; 2894 sock->fd = -1; 2895 sock->dupped = 0; 2896 memset(sock->name, 0, sizeof(sock->name)); 2897 sock->tag = NULL; 2898 sock->listener = 0; 2899 sock->connected = 0; 2900 sock->connecting = 0; 2901 sock->bound = 0; 2902 isc_sockaddr_any(&sock->peer_address); 2903 2904 UNLOCK(&sock->lock); 2905 2906 closesocket(manager, sock, fd); 2907 2908 return (ISC_R_SUCCESS); 2909} 2910#endif /* BIND9 */ 2911 2912/* 2913 * I/O is possible on a given socket. Schedule an event to this task that 2914 * will call an internal function to do the I/O. 
 This will charge the
 * task with the I/O operation and let our select loop handler get back
 * to doing something real as fast as possible.
 *
 * The socket and manager must be locked before calling this function.
 */
static void
dispatch_recv(isc__socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_recv);

	/*
	 * For ordinary sockets the event goes to the task of the first
	 * queued receive; fdwatch sockets always use their fixed task.
	 */
	if (sock->type != isc_sockettype_fdwatch) {
		ev = ISC_LIST_HEAD(sock->recv_list);
		if (ev == NULL)
			return;
		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "dispatch_recv: event %p -> task %p",
			   ev, ev->ev_sender);
		sender = ev->ev_sender;
	} else {
		sender = sock->fdwatchtask;
	}

	sock->pending_recv = 1;
	iev = &sock->readable_ev;

	/* Hold a reference for the internal event; dropped in the handler. */
	sock->references++;
	iev->ev_sender = sock;
	if (sock->type == isc_sockettype_fdwatch)
		iev->ev_action = internal_fdwatch_read;
	else
		iev->ev_action = internal_recv;
	iev->ev_arg = sock;

	/* isc_task_send() consumes iev: it is NULL after this call. */
	isc_task_send(sender, (isc_event_t **)&iev);
}

static void
dispatch_send(isc__socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_send);

	/* Same task-selection logic as dispatch_recv(), for the send side. */
	if (sock->type != isc_sockettype_fdwatch) {
		ev = ISC_LIST_HEAD(sock->send_list);
		if (ev == NULL)
			return;
		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "dispatch_send: event %p -> task %p",
			   ev, ev->ev_sender);
		sender = ev->ev_sender;
	} else {
		sender = sock->fdwatchtask;
	}

	sock->pending_send = 1;
	iev = &sock->writable_ev;

	/* Hold a reference for the internal event; dropped in the handler. */
	sock->references++;
	iev->ev_sender = sock;
	if (sock->type == isc_sockettype_fdwatch)
		iev->ev_action = internal_fdwatch_write;
	else
		iev->ev_action = internal_send;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * Dispatch an internal accept event.
 */
static void
dispatch_accept(isc__socket_t *sock) {
	intev_t *iev;
	isc_socket_newconnev_t *ev;

	INSIST(!sock->pending_accept);

	/*
	 * Are there any done events left, or were they all canceled
	 * before the manager got the socket lock?
	 */
	ev = ISC_LIST_HEAD(sock->accept_list);
	if (ev == NULL)
		return;

	sock->pending_accept = 1;
	iev = &sock->readable_ev;

	sock->references++;  /* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_accept;
	iev->ev_arg = sock;

	/* isc_task_send() consumes iev; internal_accept drops the ref. */
	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

static void
dispatch_connect(isc__socket_t *sock) {
	intev_t *iev;
	isc_socket_connev_t *ev;

	iev = &sock->writable_ev;

	/* A connect must have been started for this to be dispatched. */
	ev = sock->connect_ev;
	INSIST(ev != NULL); /* XXX */

	INSIST(sock->connecting);

	sock->references++;  /* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_connect;
	iev->ev_arg = sock;

	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

/*
 * Dequeue an item off the given socket's read queue, set the result code
 * in the done event to the one provided, and send it to the task it was
 * destined for.
 *
 * If the event to be sent is on a list, remove it before sending.  If
 * asked to, send and detach from the socket as well.
 *
 * Caller must have the socket locked if the event is attached to the socket.
3046 */ 3047static void 3048send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) { 3049 isc_task_t *task; 3050 3051 task = (*dev)->ev_sender; 3052 3053 (*dev)->ev_sender = sock; 3054 3055 if (ISC_LINK_LINKED(*dev, ev_link)) 3056 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link); 3057 3058 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 3059 == ISC_SOCKEVENTATTR_ATTACHED) 3060 isc_task_sendanddetach(&task, (isc_event_t **)dev); 3061 else 3062 isc_task_send(task, (isc_event_t **)dev); 3063} 3064 3065/* 3066 * See comments for send_recvdone_event() above. 3067 * 3068 * Caller must have the socket locked if the event is attached to the socket. 3069 */ 3070static void 3071send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) { 3072 isc_task_t *task; 3073 3074 INSIST(dev != NULL && *dev != NULL); 3075 3076 task = (*dev)->ev_sender; 3077 (*dev)->ev_sender = sock; 3078 3079 if (ISC_LINK_LINKED(*dev, ev_link)) 3080 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link); 3081 3082 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 3083 == ISC_SOCKEVENTATTR_ATTACHED) 3084 isc_task_sendanddetach(&task, (isc_event_t **)dev); 3085 else 3086 isc_task_send(task, (isc_event_t **)dev); 3087} 3088 3089/* 3090 * Call accept() on a socket, to get the new file descriptor. The listen 3091 * socket is used as a prototype to create a new isc_socket_t. The new 3092 * socket has one outstanding reference. The task receiving the event 3093 * will be detached from just after the event is delivered. 3094 * 3095 * On entry to this function, the event delivered is the internal 3096 * readable event, and the first item on the accept_list should be 3097 * the done event we want to send. If the list is empty, this is a no-op, 3098 * so just unlock and return. 
3099 */ 3100static void 3101internal_accept(isc_task_t *me, isc_event_t *ev) { 3102 isc__socket_t *sock; 3103 isc__socketmgr_t *manager; 3104 isc_socket_newconnev_t *dev; 3105 isc_task_t *task; 3106 ISC_SOCKADDR_LEN_T addrlen; 3107 int fd; 3108 isc_result_t result = ISC_R_SUCCESS; 3109 char strbuf[ISC_STRERRORSIZE]; 3110 const char *err = "accept"; 3111 3112 UNUSED(me); 3113 3114 sock = ev->ev_sender; 3115 INSIST(VALID_SOCKET(sock)); 3116 3117 LOCK(&sock->lock); 3118 socket_log(sock, NULL, TRACE, 3119 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK, 3120 "internal_accept called, locked socket"); 3121 3122 manager = sock->manager; 3123 INSIST(VALID_MANAGER(manager)); 3124 3125 INSIST(sock->listener); 3126 INSIST(sock->pending_accept == 1); 3127 sock->pending_accept = 0; 3128 3129 INSIST(sock->references > 0); 3130 sock->references--; /* the internal event is done with this socket */ 3131 if (sock->references == 0) { 3132 UNLOCK(&sock->lock); 3133 destroy(&sock); 3134 return; 3135 } 3136 3137 /* 3138 * Get the first item off the accept list. 3139 * If it is empty, unlock the socket and return. 3140 */ 3141 dev = ISC_LIST_HEAD(sock->accept_list); 3142 if (dev == NULL) { 3143 UNLOCK(&sock->lock); 3144 return; 3145 } 3146 3147 /* 3148 * Try to accept the new connection. If the accept fails with 3149 * EAGAIN or EINTR, simply poke the watcher to watch this socket 3150 * again. Also ignore ECONNRESET, which has been reported to 3151 * be spuriously returned on Linux 2.2.19 although it is not 3152 * a documented error for accept(). ECONNABORTED has been 3153 * reported for Solaris 8. The rest are thrown in not because 3154 * we have seen them but because they are ignored by other 3155 * daemons such as BIND 8 and Apache. 
3156 */ 3157 3158 addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type); 3159 memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen); 3160 fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa, 3161 (void *)&addrlen); 3162 3163#ifdef F_DUPFD 3164 /* 3165 * Leave a space for stdio to work in. 3166 */ 3167 if (fd >= 0 && fd < 20) { 3168 int new, tmp; 3169 new = fcntl(fd, F_DUPFD, 20); 3170 tmp = errno; 3171 (void)close(fd); 3172 errno = tmp; 3173 fd = new; 3174 err = "accept/fcntl"; 3175 } 3176#endif 3177 3178 if (fd < 0) { 3179 if (SOFT_ERROR(errno)) 3180 goto soft_error; 3181 switch (errno) { 3182 case ENFILE: 3183 case EMFILE: 3184 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 3185 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 3186 isc_msgcat, ISC_MSGSET_SOCKET, 3187 ISC_MSG_TOOMANYFDS, 3188 "%s: too many open file descriptors", 3189 err); 3190 goto soft_error; 3191 3192 case ENOBUFS: 3193 case ENOMEM: 3194 case ECONNRESET: 3195 case ECONNABORTED: 3196 case EHOSTUNREACH: 3197 case EHOSTDOWN: 3198 case ENETUNREACH: 3199 case ENETDOWN: 3200 case ECONNREFUSED: 3201#ifdef EPROTO 3202 case EPROTO: 3203#endif 3204#ifdef ENONET 3205 case ENONET: 3206#endif 3207 goto soft_error; 3208 default: 3209 break; 3210 } 3211 isc__strerror(errno, strbuf, sizeof(strbuf)); 3212 UNEXPECTED_ERROR(__FILE__, __LINE__, 3213 "internal_accept: %s() %s: %s", err, 3214 isc_msgcat_get(isc_msgcat, 3215 ISC_MSGSET_GENERAL, 3216 ISC_MSG_FAILED, 3217 "failed"), 3218 strbuf); 3219 fd = -1; 3220 result = ISC_R_UNEXPECTED; 3221 } else { 3222 if (addrlen == 0U) { 3223 UNEXPECTED_ERROR(__FILE__, __LINE__, 3224 "internal_accept(): " 3225 "accept() failed to return " 3226 "remote address"); 3227 3228 (void)close(fd); 3229 goto soft_error; 3230 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family != 3231 sock->pf) 3232 { 3233 UNEXPECTED_ERROR(__FILE__, __LINE__, 3234 "internal_accept(): " 3235 "accept() returned peer address " 3236 "family %u (expected %u)", 3237 
NEWCONNSOCK(dev)->peer_address. 3238 type.sa.sa_family, 3239 sock->pf); 3240 (void)close(fd); 3241 goto soft_error; 3242 } else if (fd >= (int)manager->maxsocks) { 3243 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 3244 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 3245 isc_msgcat, ISC_MSGSET_SOCKET, 3246 ISC_MSG_TOOMANYFDS, 3247 "accept: " 3248 "file descriptor exceeds limit (%d/%u)", 3249 fd, manager->maxsocks); 3250 (void)close(fd); 3251 goto soft_error; 3252 } 3253 } 3254 3255 if (fd != -1) { 3256 NEWCONNSOCK(dev)->peer_address.length = addrlen; 3257 NEWCONNSOCK(dev)->pf = sock->pf; 3258 } 3259 3260 /* 3261 * Pull off the done event. 3262 */ 3263 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); 3264 3265 /* 3266 * Poke watcher if there are more pending accepts. 3267 */ 3268 if (!ISC_LIST_EMPTY(sock->accept_list)) 3269 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 3270 3271 UNLOCK(&sock->lock); 3272 3273 if (fd != -1) { 3274 result = make_nonblock(fd); 3275 if (result != ISC_R_SUCCESS) { 3276 (void)close(fd); 3277 fd = -1; 3278 } 3279 } 3280 3281 /* 3282 * -1 means the new socket didn't happen. 3283 */ 3284 if (fd != -1) { 3285 int lockid = FDLOCK_ID(fd); 3286 3287 NEWCONNSOCK(dev)->fd = fd; 3288 NEWCONNSOCK(dev)->bound = 1; 3289 NEWCONNSOCK(dev)->connected = 1; 3290 3291 /* 3292 * Use minimum mtu if possible. 
3293 */ 3294 use_min_mtu(NEWCONNSOCK(dev)); 3295 3296 /* 3297 * Save away the remote address 3298 */ 3299 dev->address = NEWCONNSOCK(dev)->peer_address; 3300 3301 LOCK(&manager->fdlock[lockid]); 3302 manager->fds[fd] = NEWCONNSOCK(dev); 3303 manager->fdstate[fd] = MANAGED; 3304 UNLOCK(&manager->fdlock[lockid]); 3305 3306 LOCK(&manager->lock); 3307 3308#ifdef USE_SELECT 3309 if (manager->maxfd < fd) 3310 manager->maxfd = fd; 3311#endif 3312 3313 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION, 3314 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, 3315 "accepted connection, new socket %p", 3316 dev->newsocket); 3317 3318 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link); 3319 3320 UNLOCK(&manager->lock); 3321 3322 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]); 3323 } else { 3324 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3325 NEWCONNSOCK(dev)->references--; 3326 free_socket((isc__socket_t **)&dev->newsocket); 3327 } 3328 3329 /* 3330 * Fill in the done event details and send it off. 
3331 */ 3332 dev->result = result; 3333 task = dev->ev_sender; 3334 dev->ev_sender = sock; 3335 3336 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 3337 return; 3338 3339 soft_error: 3340 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 3341 UNLOCK(&sock->lock); 3342 3343 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3344 return; 3345} 3346 3347static void 3348internal_recv(isc_task_t *me, isc_event_t *ev) { 3349 isc_socketevent_t *dev; 3350 isc__socket_t *sock; 3351 3352 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 3353 3354 sock = ev->ev_sender; 3355 INSIST(VALID_SOCKET(sock)); 3356 3357 LOCK(&sock->lock); 3358 socket_log(sock, NULL, IOEVENT, 3359 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3360 "internal_recv: task %p got event %p", me, ev); 3361 3362 INSIST(sock->pending_recv == 1); 3363 sock->pending_recv = 0; 3364 3365 INSIST(sock->references > 0); 3366 sock->references--; /* the internal event is done with this socket */ 3367 if (sock->references == 0) { 3368 UNLOCK(&sock->lock); 3369 destroy(&sock); 3370 return; 3371 } 3372 3373 /* 3374 * Try to do as much I/O as possible on this socket. There are no 3375 * limits here, currently. 3376 */ 3377 dev = ISC_LIST_HEAD(sock->recv_list); 3378 while (dev != NULL) { 3379 switch (doio_recv(sock, dev)) { 3380 case DOIO_SOFT: 3381 goto poke; 3382 3383 case DOIO_EOF: 3384 /* 3385 * read of 0 means the remote end was closed. 3386 * Run through the event queue and dispatch all 3387 * the events with an EOF result code. 
3388 */ 3389 do { 3390 dev->result = ISC_R_EOF; 3391 send_recvdone_event(sock, &dev); 3392 dev = ISC_LIST_HEAD(sock->recv_list); 3393 } while (dev != NULL); 3394 goto poke; 3395 3396 case DOIO_SUCCESS: 3397 case DOIO_HARD: 3398 send_recvdone_event(sock, &dev); 3399 break; 3400 } 3401 3402 dev = ISC_LIST_HEAD(sock->recv_list); 3403 } 3404 3405 poke: 3406 if (!ISC_LIST_EMPTY(sock->recv_list)) 3407 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3408 3409 UNLOCK(&sock->lock); 3410} 3411 3412static void 3413internal_send(isc_task_t *me, isc_event_t *ev) { 3414 isc_socketevent_t *dev; 3415 isc__socket_t *sock; 3416 3417 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3418 3419 /* 3420 * Find out what socket this is and lock it. 3421 */ 3422 sock = (isc__socket_t *)ev->ev_sender; 3423 INSIST(VALID_SOCKET(sock)); 3424 3425 LOCK(&sock->lock); 3426 socket_log(sock, NULL, IOEVENT, 3427 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3428 "internal_send: task %p got event %p", me, ev); 3429 3430 INSIST(sock->pending_send == 1); 3431 sock->pending_send = 0; 3432 3433 INSIST(sock->references > 0); 3434 sock->references--; /* the internal event is done with this socket */ 3435 if (sock->references == 0) { 3436 UNLOCK(&sock->lock); 3437 destroy(&sock); 3438 return; 3439 } 3440 3441 /* 3442 * Try to do as much I/O as possible on this socket. There are no 3443 * limits here, currently. 
3444 */ 3445 dev = ISC_LIST_HEAD(sock->send_list); 3446 while (dev != NULL) { 3447 switch (doio_send(sock, dev)) { 3448 case DOIO_SOFT: 3449 goto poke; 3450 3451 case DOIO_HARD: 3452 case DOIO_SUCCESS: 3453 send_senddone_event(sock, &dev); 3454 break; 3455 } 3456 3457 dev = ISC_LIST_HEAD(sock->send_list); 3458 } 3459 3460 poke: 3461 if (!ISC_LIST_EMPTY(sock->send_list)) 3462 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3463 3464 UNLOCK(&sock->lock); 3465} 3466 3467static void 3468internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) { 3469 isc__socket_t *sock; 3470 int more_data; 3471 3472 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3473 3474 /* 3475 * Find out what socket this is and lock it. 3476 */ 3477 sock = (isc__socket_t *)ev->ev_sender; 3478 INSIST(VALID_SOCKET(sock)); 3479 3480 LOCK(&sock->lock); 3481 socket_log(sock, NULL, IOEVENT, 3482 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3483 "internal_fdwatch_write: task %p got event %p", me, ev); 3484 3485 INSIST(sock->pending_send == 1); 3486 3487 UNLOCK(&sock->lock); 3488 more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock, 3489 sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE); 3490 LOCK(&sock->lock); 3491 3492 sock->pending_send = 0; 3493 3494 INSIST(sock->references > 0); 3495 sock->references--; /* the internal event is done with this socket */ 3496 if (sock->references == 0) { 3497 UNLOCK(&sock->lock); 3498 destroy(&sock); 3499 return; 3500 } 3501 3502 if (more_data) 3503 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3504 3505 UNLOCK(&sock->lock); 3506} 3507 3508static void 3509internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) { 3510 isc__socket_t *sock; 3511 int more_data; 3512 3513 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 3514 3515 /* 3516 * Find out what socket this is and lock it. 
3517 */ 3518 sock = (isc__socket_t *)ev->ev_sender; 3519 INSIST(VALID_SOCKET(sock)); 3520 3521 LOCK(&sock->lock); 3522 socket_log(sock, NULL, IOEVENT, 3523 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3524 "internal_fdwatch_read: task %p got event %p", me, ev); 3525 3526 INSIST(sock->pending_recv == 1); 3527 3528 UNLOCK(&sock->lock); 3529 more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock, 3530 sock->fdwatcharg, ISC_SOCKFDWATCH_READ); 3531 LOCK(&sock->lock); 3532 3533 sock->pending_recv = 0; 3534 3535 INSIST(sock->references > 0); 3536 sock->references--; /* the internal event is done with this socket */ 3537 if (sock->references == 0) { 3538 UNLOCK(&sock->lock); 3539 destroy(&sock); 3540 return; 3541 } 3542 3543 if (more_data) 3544 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3545 3546 UNLOCK(&sock->lock); 3547} 3548 3549/* 3550 * Process read/writes on each fd here. Avoid locking 3551 * and unlocking twice if both reads and writes are possible. 3552 */ 3553static void 3554process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable, 3555 isc_boolean_t writeable) 3556{ 3557 isc__socket_t *sock; 3558 isc_boolean_t unlock_sock; 3559 isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE; 3560 int lockid = FDLOCK_ID(fd); 3561 3562 /* 3563 * If the socket is going to be closed, don't do more I/O. 
3564 */ 3565 LOCK(&manager->fdlock[lockid]); 3566 if (manager->fdstate[fd] == CLOSE_PENDING) { 3567 UNLOCK(&manager->fdlock[lockid]); 3568 3569 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3570 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3571 return; 3572 } 3573 3574 sock = manager->fds[fd]; 3575 unlock_sock = ISC_FALSE; 3576 if (readable) { 3577 if (sock == NULL) { 3578 unwatch_read = ISC_TRUE; 3579 goto check_write; 3580 } 3581 unlock_sock = ISC_TRUE; 3582 LOCK(&sock->lock); 3583 if (!SOCK_DEAD(sock)) { 3584 if (sock->listener) 3585 dispatch_accept(sock); 3586 else 3587 dispatch_recv(sock); 3588 } 3589 unwatch_read = ISC_TRUE; 3590 } 3591check_write: 3592 if (writeable) { 3593 if (sock == NULL) { 3594 unwatch_write = ISC_TRUE; 3595 goto unlock_fd; 3596 } 3597 if (!unlock_sock) { 3598 unlock_sock = ISC_TRUE; 3599 LOCK(&sock->lock); 3600 } 3601 if (!SOCK_DEAD(sock)) { 3602 if (sock->connecting) 3603 dispatch_connect(sock); 3604 else 3605 dispatch_send(sock); 3606 } 3607 unwatch_write = ISC_TRUE; 3608 } 3609 if (unlock_sock) 3610 UNLOCK(&sock->lock); 3611 3612 unlock_fd: 3613 UNLOCK(&manager->fdlock[lockid]); 3614 if (unwatch_read) 3615 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3616 if (unwatch_write) 3617 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3618 3619} 3620 3621#ifdef USE_KQUEUE 3622static isc_boolean_t 3623process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) { 3624 int i; 3625 isc_boolean_t readable, writable; 3626 isc_boolean_t done = ISC_FALSE; 3627#ifdef USE_WATCHER_THREAD 3628 isc_boolean_t have_ctlevent = ISC_FALSE; 3629#endif 3630 3631 if (nevents == manager->nevents) { 3632 /* 3633 * This is not an error, but something unexpected. If this 3634 * happens, it may indicate the need for increasing 3635 * ISC_SOCKET_MAXEVENTS. 
3636 */ 3637 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3638 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3639 "maximum number of FD events (%d) received", 3640 nevents); 3641 } 3642 3643 for (i = 0; i < nevents; i++) { 3644 REQUIRE(events[i].ident < manager->maxsocks); 3645#ifdef USE_WATCHER_THREAD 3646 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) { 3647 have_ctlevent = ISC_TRUE; 3648 continue; 3649 } 3650#endif 3651 readable = ISC_TF(events[i].filter == EVFILT_READ); 3652 writable = ISC_TF(events[i].filter == EVFILT_WRITE); 3653 process_fd(manager, events[i].ident, readable, writable); 3654 } 3655 3656#ifdef USE_WATCHER_THREAD 3657 if (have_ctlevent) 3658 done = process_ctlfd(manager); 3659#endif 3660 3661 return (done); 3662} 3663#elif defined(USE_EPOLL) 3664static isc_boolean_t 3665process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents) 3666{ 3667 int i; 3668 isc_boolean_t done = ISC_FALSE; 3669#ifdef USE_WATCHER_THREAD 3670 isc_boolean_t have_ctlevent = ISC_FALSE; 3671#endif 3672 3673 if (nevents == manager->nevents) { 3674 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3675 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3676 "maximum number of FD events (%d) received", 3677 nevents); 3678 } 3679 3680 for (i = 0; i < nevents; i++) { 3681 REQUIRE(events[i].data.fd < (int)manager->maxsocks); 3682#ifdef USE_WATCHER_THREAD 3683 if (events[i].data.fd == manager->pipe_fds[0]) { 3684 have_ctlevent = ISC_TRUE; 3685 continue; 3686 } 3687#endif 3688 if ((events[i].events & EPOLLERR) != 0 || 3689 (events[i].events & EPOLLHUP) != 0) { 3690 /* 3691 * epoll does not set IN/OUT bits on an erroneous 3692 * condition, so we need to try both anyway. This is a 3693 * bit inefficient, but should be okay for such rare 3694 * events. Note also that the read or write attempt 3695 * won't block because we use non-blocking sockets. 
3696 */ 3697 events[i].events |= (EPOLLIN | EPOLLOUT); 3698 } 3699 process_fd(manager, events[i].data.fd, 3700 (events[i].events & EPOLLIN) != 0, 3701 (events[i].events & EPOLLOUT) != 0); 3702 } 3703 3704#ifdef USE_WATCHER_THREAD 3705 if (have_ctlevent) 3706 done = process_ctlfd(manager); 3707#endif 3708 3709 return (done); 3710} 3711#elif defined(USE_DEVPOLL) 3712static isc_boolean_t 3713process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) { 3714 int i; 3715 isc_boolean_t done = ISC_FALSE; 3716#ifdef USE_WATCHER_THREAD 3717 isc_boolean_t have_ctlevent = ISC_FALSE; 3718#endif 3719 3720 if (nevents == manager->nevents) { 3721 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3722 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3723 "maximum number of FD events (%d) received", 3724 nevents); 3725 } 3726 3727 for (i = 0; i < nevents; i++) { 3728 REQUIRE(events[i].fd < (int)manager->maxsocks); 3729#ifdef USE_WATCHER_THREAD 3730 if (events[i].fd == manager->pipe_fds[0]) { 3731 have_ctlevent = ISC_TRUE; 3732 continue; 3733 } 3734#endif 3735 process_fd(manager, events[i].fd, 3736 (events[i].events & POLLIN) != 0, 3737 (events[i].events & POLLOUT) != 0); 3738 } 3739 3740#ifdef USE_WATCHER_THREAD 3741 if (have_ctlevent) 3742 done = process_ctlfd(manager); 3743#endif 3744 3745 return (done); 3746} 3747#elif defined(USE_SELECT) 3748static void 3749process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds, 3750 fd_set *writefds) 3751{ 3752 int i; 3753 3754 REQUIRE(maxfd <= (int)manager->maxsocks); 3755 3756 for (i = 0; i < maxfd; i++) { 3757#ifdef USE_WATCHER_THREAD 3758 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) 3759 continue; 3760#endif /* USE_WATCHER_THREAD */ 3761 process_fd(manager, i, FD_ISSET(i, readfds), 3762 FD_ISSET(i, writefds)); 3763 } 3764} 3765#endif 3766 3767#ifdef USE_WATCHER_THREAD 3768static isc_boolean_t 3769process_ctlfd(isc__socketmgr_t *manager) { 3770 int msg, fd; 3771 3772 for (;;) { 3773 select_readmsg(manager, &fd, 
			       &msg);

		manager_log(manager, IOEVENT,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WATCHERMSG,
					   "watcher got message %d "
					   "for socket %d"), msg, fd);

		/*
		 * Nothing to read?
		 */
		if (msg == SELECT_POKE_NOTHING)
			break;

		/*
		 * Handle shutdown message.  We really should
		 * jump out of this loop right away, but
		 * it doesn't matter if we have to do a little
		 * more work first.
		 */
		if (msg == SELECT_POKE_SHUTDOWN)
			return (ISC_TRUE);

		/*
		 * This is a wakeup on a socket.  Look
		 * at the event queue for both read and write,
		 * and decide if we need to watch on it now
		 * or not.
		 */
		wakeup_socket(manager, fd, msg);
	}

	return (ISC_FALSE);
}

/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, track down what thread gets to do
 * this I/O and post the event to it.
 */
static isc_threadresult_t
watcher(void *uap) {
	isc__socketmgr_t *manager = uap;
	isc_boolean_t done;
	int cc;
	/* fnname is only used for the fatal-error message below. */
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined (USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
#elif defined (USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif
	char strbuf[ISC_STRERRORSIZE];
#ifdef ISC_SOCKET_USE_POLLWATCH
	pollstate_t pollstate = poll_idle;
#endif

#if defined (USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = manager->pipe_fds[0];
#endif
	done = ISC_FALSE;
	while (!done) {
		/* Retry the wait until it returns without a soft error. */
		do {
#ifdef USE_KQUEUE
			cc = kevent(manager->kqueue_fd, NULL, 0,
				    manager->events, manager->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(manager->epoll_fd, manager->events,
					manager->nevents, -1);
#elif defined(USE_DEVPOLL)
			dvp.dp_fds = manager->events;
			dvp.dp_nfds = manager->nevents;
#ifndef ISC_SOCKET_USE_POLLWATCH
			dvp.dp_timeout = -1;
#else
			/*
			 * With POLLWATCH enabled, poll with a timeout while
			 * in the checking/active states so stalls can be
			 * detected (see the state machine below).
			 */
			if (pollstate == poll_idle)
				dvp.dp_timeout = -1;
			else
				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
#endif	/* ISC_SOCKET_USE_POLLWATCH */
			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
#elif defined(USE_SELECT)
			/*
			 * select() mutates its fd_sets, so poll on copies of
			 * the master sets, snapshotted under the manager lock.
			 */
			LOCK(&manager->lock);
			memmove(manager->read_fds_copy, manager->read_fds,
				manager->fd_bufsize);
			memmove(manager->write_fds_copy, manager->write_fds,
				manager->fd_bufsize);
			maxfd = manager->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, manager->read_fds_copy,
				    manager->write_fds_copy, NULL, NULL);
#endif	/* USE_KQUEUE */

			if (cc < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "%s %s: %s", fnname,
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"), strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/*
			 * Workaround state machine for a /dev/poll kernel
			 * issue: idle -> (timeout) -> checking -> idle, with
			 * any real event forcing poll_active.
			 */
			if (cc == 0) {
				if (pollstate == poll_active)
					pollstate = poll_checking;
				else if (pollstate == poll_checking)
					pollstate = poll_idle;
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					manager_log(manager,
						    ISC_LOGCATEGORY_GENERAL,
						    ISC_LOGMODULE_SOCKET,
						    ISC_LOG_DEBUG(1),
						    "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
		done = process_fds(manager, manager->events, cc);
#elif defined(USE_SELECT)
		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy))
			done = process_ctlfd(manager);
#endif
	}

	manager_log(manager, TRACE, "%s",
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	return ((isc_threadresult_t)0);
}
#endif /* USE_WATCHER_THREAD */

#ifdef BIND9
/*
 * isc__socketmgr_setreserved: reserve 'reserved' file descriptors,
 * which the manager will keep out of use for sockets.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}

/*
 * isc___socketmgr_maxudp: set the manager's maximum UDP message size.
 * NOTE(review): written without locking -- presumably only called
 * during configuration; confirm against callers.
 */
ISC_SOCKETFUNC_SCOPE void
isc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}
#endif /* BIND9 */

/*
 * Create a new socket manager.
 */

/*
 * setup_watcher: allocate and initialize the multiplexer state for the
 * configured backend (kqueue, epoll, /dev/poll, or select), and -- when
 * a watcher thread is in use -- start watching the internal control
 * pipe.  On failure every resource acquired so far is released before
 * returning the error.
 */
static isc_result_t
setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
	isc_result_t result;
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	char strbuf[ISC_STRERRORSIZE];
#endif

#ifdef USE_KQUEUE
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->kqueue_fd = kqueue();
	if (manager->kqueue_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "kqueue %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}

#ifdef USE_WATCHER_THREAD
	/* Watch the read end of the control pipe for wakeups. */
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->kqueue_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct kevent) * manager->nevents);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_EPOLL)
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	manager->epoll_fd = epoll_create(manager->nevents);
	if (manager->epoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_create %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->epoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct epoll_event) * manager->nevents);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_DEVPOLL)
	/*
	 * XXXJT: /dev/poll seems to reject large numbers of events,
	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
	 */
	manager->nevents = ISC_SOCKET_MAXEVENTS;
	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
				      manager->nevents);
	if (manager->events == NULL)
		return (ISC_R_NOMEMORY);
	/*
	 * Note: fdpollinfo should be able to support all possible FDs, so
	 * it must have maxsocks entries (not nevents).
	 */
	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
					  manager->maxsocks);
	if (manager->fdpollinfo == NULL) {
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		return (ISC_R_NOMEMORY);
	}
	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
	manager->devpoll_fd = open("/dev/poll", O_RDWR);
	if (manager->devpoll_fd == -1) {
		result = isc__errno2result(errno);
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "open(/dev/poll) %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#ifdef USE_WATCHER_THREAD
	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		close(manager->devpoll_fd);
		isc_mem_put(mctx, manager->events,
			    sizeof(struct pollfd) * manager->nevents);
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#endif	/* USE_WATCHER_THREAD */
#elif defined(USE_SELECT)
	UNUSED(result);

#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
	/*
	 * Note: this code should also cover the case of MAXSOCKETS <=
	 * FD_SETSIZE, but we separate the cases to avoid possible portability
	 * issues regarding howmany() and the actual representation of fd_set.
	 */
	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
		sizeof(fd_mask);
#else
	manager->fd_bufsize = sizeof(fd_set);
#endif

	manager->read_fds = NULL;
	manager->read_fds_copy = NULL;
	manager->write_fds = NULL;
	manager->write_fds_copy = NULL;

	/*
	 * Allocate the four fd_set buffers; each allocation is attempted
	 * only if the previous one succeeded, and on any failure all of
	 * the ones that did succeed are released below.
	 */
	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds != NULL)
		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->write_fds != NULL) {
		manager->write_fds_copy = isc_mem_get(mctx,
						      manager->fd_bufsize);
	}
	if (manager->write_fds_copy == NULL) {
		if (manager->write_fds != NULL) {
			isc_mem_put(mctx, manager->write_fds,
				    manager->fd_bufsize);
		}
		if (manager->read_fds_copy != NULL) {
			isc_mem_put(mctx, manager->read_fds_copy,
				    manager->fd_bufsize);
		}
		if (manager->read_fds != NULL) {
			isc_mem_put(mctx, manager->read_fds,
				    manager->fd_bufsize);
		}
		return (ISC_R_NOMEMORY);
	}
	memset(manager->read_fds, 0, manager->fd_bufsize);
	memset(manager->write_fds, 0, manager->fd_bufsize);

#ifdef USE_WATCHER_THREAD
	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	manager->maxfd = manager->pipe_fds[0];
#else /* USE_WATCHER_THREAD */
	manager->maxfd = 0;
#endif /* USE_WATCHER_THREAD */
#endif	/* USE_KQUEUE */

	return (ISC_R_SUCCESS);
}

/*
 * cleanup_watcher: release everything setup_watcher() acquired, in the
 * reverse order, for whichever backend was compiled in.
 */
static void
cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
#ifdef USE_WATCHER_THREAD
	isc_result_t result;

	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL) %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
	}
#endif	/* USE_WATCHER_THREAD */

#ifdef USE_KQUEUE
	close(manager->kqueue_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct kevent) * manager->nevents);
#elif defined(USE_EPOLL)
	close(manager->epoll_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct epoll_event) * manager->nevents);
#elif defined(USE_DEVPOLL)
	close(manager->devpoll_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct pollfd) * manager->nevents);
	isc_mem_put(mctx, manager->fdpollinfo,
		    sizeof(pollinfo_t) * manager->maxsocks);
#elif defined(USE_SELECT)
	if (manager->read_fds != NULL)
		isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
	if (manager->write_fds != NULL)
		isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
	if (manager->write_fds_copy != NULL)
		isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
#endif	/* USE_KQUEUE */
}

/*
 * isc__socketmgr_create: create a manager with the default socket limit.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	return (isc__socketmgr_create2(mctx, managerp, 0));
}

/*
 * isc__socketmgr_create2: create a socket manager supporting up to
 * 'maxsocks' sockets (0 means ISC_SOCKET_MAXSOCKETS).  Under
 * USE_SHARED_MANAGER an existing manager is reference-counted and
 * returned instead of creating a new one.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		       unsigned int maxsocks)
{
	int i;
	isc__socketmgr_t *manager;
#ifdef USE_WATCHER_THREAD
	char strbuf[ISC_STRERRORSIZE];
#endif
	isc_result_t result;

	REQUIRE(managerp != NULL && *managerp == NULL);

#ifdef USE_SHARED_MANAGER
	if (socketmgr != NULL) {
		/* Don't allow maxsocks to be updated */
		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
			return (ISC_R_EXISTS);

		socketmgr->refs++;
		*managerp = (isc_socketmgr_t *)socketmgr;
		return (ISC_R_SUCCESS);
	}
#endif /* USE_SHARED_MANAGER */

	if (maxsocks == 0)
		maxsocks = ISC_SOCKET_MAXSOCKETS;

	manager = isc_mem_get(mctx, sizeof(*manager));
	if (manager == NULL)
		return (ISC_R_NOMEMORY);

	/* zero-clear so that necessary cleanup on failure will be easy */
	memset(manager, 0, sizeof(*manager));
	manager->maxsocks = maxsocks;
	manager->reserved = 0;
	manager->maxudp = 0;
	manager->fds = isc_mem_get(mctx,
				   manager->maxsocks * sizeof(isc__socket_t *));
	if (manager->fds == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
	if (manager->fdstate == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->stats = NULL;

	manager->common.methods = &socketmgrmethods;
	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
	manager->mctx = NULL;
	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
	ISC_LIST_INIT(manager->socklist);
	result = isc_mutex_init(&manager->lock);
	if (result != ISC_R_SUCCESS)
		goto free_manager;
	/* Per-fd lock table, striped FDLOCK_COUNT ways. */
	manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
	if (manager->fdlock == NULL) {
		result = ISC_R_NOMEMORY;
		goto cleanup_lock;
	}
	for (i = 0; i < FDLOCK_COUNT; i++) {
		result = isc_mutex_init(&manager->fdlock[i]);
		if (result != ISC_R_SUCCESS) {
			/* Unwind only the locks initialized so far. */
			while (--i >= 0)
				DESTROYLOCK(&manager->fdlock[i]);
			isc_mem_put(mctx, manager->fdlock,
				    FDLOCK_COUNT * sizeof(isc_mutex_t));
			manager->fdlock = NULL;
			goto cleanup_lock;
		}
	}

#ifdef USE_WATCHER_THREAD
	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_condition_init() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		result = ISC_R_UNEXPECTED;
		goto cleanup_lock;
	}

	/*
	 * Create the special fds that will be used to wake up the
	 * select/poll loop when something internal needs to be done.
	 */
	if (pipe(manager->pipe_fds) != 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "pipe() %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto cleanup_condition;
	}

	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
#if 0
	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
#endif
#endif /* USE_WATCHER_THREAD */

#ifdef USE_SHARED_MANAGER
	manager->refs = 1;
#endif /* USE_SHARED_MANAGER */

	/*
	 * Set up initial state for the select loop
	 */
	result = setup_watcher(mctx, manager);
	if (result != ISC_R_SUCCESS)
		goto cleanup;
	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
#ifdef USE_WATCHER_THREAD
	/*
	 * Start up the select/poll thread.
	 */
	if (isc_thread_create(watcher, manager, &manager->watcher) !=
	    ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_create() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		cleanup_watcher(mctx, manager);
		result = ISC_R_UNEXPECTED;
		goto cleanup;
	}
#endif /* USE_WATCHER_THREAD */
	isc_mem_attach(mctx, &manager->mctx);

#ifdef USE_SHARED_MANAGER
	socketmgr = manager;
#endif /* USE_SHARED_MANAGER */
	*managerp = (isc_socketmgr_t *)manager;

	return (ISC_R_SUCCESS);

	/*
	 * Error unwind: each label releases what was acquired after the
	 * previous one, falling through toward free_manager.
	 */
cleanup:
#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
#endif /* USE_WATCHER_THREAD */

#ifdef USE_WATCHER_THREAD
cleanup_condition:
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */


cleanup_lock:
	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
	}
	DESTROYLOCK(&manager->lock);

free_manager:
	if (manager->fdlock != NULL) {
		isc_mem_put(mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	if (manager->fdstate != NULL) {
		isc_mem_put(mctx, manager->fdstate,
			    manager->maxsocks * sizeof(int));
	}
	if (manager->fds != NULL) {
		isc_mem_put(mctx, manager->fds,
			    manager->maxsocks * sizeof(isc_socket_t *));
	}
	isc_mem_put(mctx, manager, sizeof(*manager));

	return (result);
}

#ifdef BIND9
/*
 * isc__socketmgr_getmaxsockets: report the manager's socket limit
 * through '*nsockp'.
 */
isc_result_t
isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}

/*
 * isc__socketmgr_setstats: attach a statistics counter set to the
 * manager.  May only be called once, before any sockets exist.
 */
void
isc__socketmgr_setstats(isc_socketmgr_t *manager0,
			isc_stats_t *stats) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
#endif

ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc__socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = (isc__socketmgr_t *)*managerp;
	REQUIRE(VALID_MANAGER(manager));

#ifdef USE_SHARED_MANAGER
	/* Shared manager: only the last reference tears it down. */
	manager->refs--;
	if (manager->refs > 0) {
		*managerp = NULL;
		return;
	}
	socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
#ifdef USE_WATCHER_THREAD
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
#else /* USE_WATCHER_THREAD */
		/* Non-threaded build: pump the task manager ourselves. */
		UNLOCK(&manager->lock);
		isc__taskmgr_dispatch(NULL);
		LOCK(&manager->lock);
#endif /* USE_WATCHER_THREAD */
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);

#ifdef USE_WATCHER_THREAD
	/*
	 * Wait for thread to exit.
	 */
	if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
#endif /* USE_WATCHER_THREAD */

	/*
	 * Clean up.
	 */
	cleanup_watcher(manager->mctx, manager);

#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/* Close any fds whose close was deferred by the watcher. */
	for (i = 0; i < (int)manager->maxsocks; i++)
		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
			(void)close(i);

	isc_mem_put(manager->mctx, manager->fds,
		    manager->maxsocks * sizeof(isc__socket_t *));
	isc_mem_put(manager->mctx, manager->fdstate,
		    manager->maxsocks * sizeof(int));

	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);

	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
		isc_mem_put(manager->mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	DESTROYLOCK(&manager->lock);
	manager->common.magic = 0;
	manager->common.impmagic = 0;
	/* Keep a local reference so we can free the manager from its mctx. */
	mctx = manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;

#ifdef USE_SHARED_MANAGER
	socketmgr = NULL;
#endif
}

/*
 * socket_recv: common receive entry point.  For UDP sockets the read
 * is attempted immediately; for others it is attempted only when no
 * earlier requests are queued (to preserve ordering).  A request that
 * cannot complete now (DOIO_SOFT) is queued and the watcher is poked;
 * with ISC_SOCKFLAG_IMMEDIATE the caller gets the result directly
 * (ISC_R_INPROGRESS when queued) instead of a done event.
 */
static isc_result_t
socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		if (ISC_LIST_EMPTY(sock->recv_list))
			io_state = doio_recv(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = ISC_TRUE;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "socket_recv: event %p -> task %p",
			   dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
			result = ISC_R_INPROGRESS;
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Immediate callers read the result from 'dev' directly. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_recvdone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}

/*
 * isc__socket_recvv: public scatter receive into a buffer list;
 * validates arguments and (continues past this view) builds a socket
 * event around 'buflist' before handing off to socket_recv().
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
		  unsigned int minimum, isc_task_t *task,
		  isc_taskaction_t action, const void *arg)
{
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socketevent_t *dev;
	isc__socketmgr_t *manager;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	iocount =
isc_bufferlist_availablecount(buflist); 4578 REQUIRE(iocount > 0); 4579 4580 INSIST(sock->bound); 4581 4582 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg); 4583 if (dev == NULL) 4584 return (ISC_R_NOMEMORY); 4585 4586 /* 4587 * UDP sockets are always partial read 4588 */ 4589 if (sock->type == isc_sockettype_udp) 4590 dev->minimum = 1; 4591 else { 4592 if (minimum == 0) 4593 dev->minimum = iocount; 4594 else 4595 dev->minimum = minimum; 4596 } 4597 4598 /* 4599 * Move each buffer from the passed in list to our internal one. 4600 */ 4601 buffer = ISC_LIST_HEAD(*buflist); 4602 while (buffer != NULL) { 4603 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4604 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4605 buffer = ISC_LIST_HEAD(*buflist); 4606 } 4607 4608 return (socket_recv(sock, dev, task, 0)); 4609} 4610 4611ISC_SOCKETFUNC_SCOPE isc_result_t 4612isc__socket_recv(isc_socket_t *sock0, isc_region_t *region, 4613 unsigned int minimum, isc_task_t *task, 4614 isc_taskaction_t action, const void *arg) 4615{ 4616 isc__socket_t *sock = (isc__socket_t *)sock0; 4617 isc_socketevent_t *dev; 4618 isc__socketmgr_t *manager; 4619 4620 REQUIRE(VALID_SOCKET(sock)); 4621 REQUIRE(action != NULL); 4622 4623 manager = sock->manager; 4624 REQUIRE(VALID_MANAGER(manager)); 4625 4626 INSIST(sock->bound); 4627 4628 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg); 4629 if (dev == NULL) 4630 return (ISC_R_NOMEMORY); 4631 4632 return (isc__socket_recv2(sock0, region, minimum, task, dev, 0)); 4633} 4634 4635ISC_SOCKETFUNC_SCOPE isc_result_t 4636isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region, 4637 unsigned int minimum, isc_task_t *task, 4638 isc_socketevent_t *event, unsigned int flags) 4639{ 4640 isc__socket_t *sock = (isc__socket_t *)sock0; 4641 4642 event->ev_sender = sock; 4643 event->result = ISC_R_UNSET; 4644 ISC_LIST_INIT(event->bufferlist); 4645 event->region = *region; 4646 event->n = 0; 4647 event->offset = 0; 4648 
event->attributes = 0; 4649 4650 /* 4651 * UDP sockets are always partial read. 4652 */ 4653 if (sock->type == isc_sockettype_udp) 4654 event->minimum = 1; 4655 else { 4656 if (minimum == 0) 4657 event->minimum = region->length; 4658 else 4659 event->minimum = minimum; 4660 } 4661 4662 return (socket_recv(sock, event, task, flags)); 4663} 4664 4665static isc_result_t 4666socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 4667 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4668 unsigned int flags) 4669{ 4670 int io_state; 4671 isc_boolean_t have_lock = ISC_FALSE; 4672 isc_task_t *ntask = NULL; 4673 isc_result_t result = ISC_R_SUCCESS; 4674 4675 dev->ev_sender = task; 4676 4677 set_dev_address(address, sock, dev); 4678 if (pktinfo != NULL) { 4679 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 4680 dev->pktinfo = *pktinfo; 4681 4682 if (!isc_sockaddr_issitelocal(&dev->address) && 4683 !isc_sockaddr_islinklocal(&dev->address)) { 4684 socket_log(sock, NULL, TRACE, isc_msgcat, 4685 ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED, 4686 "pktinfo structure provided, ifindex %u " 4687 "(set to 0)", pktinfo->ipi6_ifindex); 4688 4689 /* 4690 * Set the pktinfo index to 0 here, to let the 4691 * kernel decide what interface it should send on. 4692 */ 4693 dev->pktinfo.ipi6_ifindex = 0; 4694 } 4695 } 4696 4697 if (sock->type == isc_sockettype_udp) 4698 io_state = doio_send(sock, dev); 4699 else { 4700 LOCK(&sock->lock); 4701 have_lock = ISC_TRUE; 4702 4703 if (ISC_LIST_EMPTY(sock->send_list)) 4704 io_state = doio_send(sock, dev); 4705 else 4706 io_state = DOIO_SOFT; 4707 } 4708 4709 switch (io_state) { 4710 case DOIO_SOFT: 4711 /* 4712 * We couldn't send all or part of the request right now, so 4713 * queue it unless ISC_SOCKFLAG_NORETRY is set. 
4714 */ 4715 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) { 4716 isc_task_attach(task, &ntask); 4717 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 4718 4719 if (!have_lock) { 4720 LOCK(&sock->lock); 4721 have_lock = ISC_TRUE; 4722 } 4723 4724 /* 4725 * Enqueue the request. If the socket was previously 4726 * not being watched, poke the watcher to start 4727 * paying attention to it. 4728 */ 4729 if (ISC_LIST_EMPTY(sock->send_list) && 4730 !sock->pending_send) 4731 select_poke(sock->manager, sock->fd, 4732 SELECT_POKE_WRITE); 4733 ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link); 4734 4735 socket_log(sock, NULL, EVENT, NULL, 0, 0, 4736 "socket_send: event %p -> task %p", 4737 dev, ntask); 4738 4739 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 4740 result = ISC_R_INPROGRESS; 4741 break; 4742 } 4743 4744 case DOIO_HARD: 4745 case DOIO_SUCCESS: 4746 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 4747 send_senddone_event(sock, &dev); 4748 break; 4749 } 4750 4751 if (have_lock) 4752 UNLOCK(&sock->lock); 4753 4754 return (result); 4755} 4756 4757ISC_SOCKETFUNC_SCOPE isc_result_t 4758isc__socket_send(isc_socket_t *sock, isc_region_t *region, 4759 isc_task_t *task, isc_taskaction_t action, const void *arg) 4760{ 4761 /* 4762 * REQUIRE() checking is performed in isc_socket_sendto(). 
4763 */ 4764 return (isc__socket_sendto(sock, region, task, action, arg, NULL, 4765 NULL)); 4766} 4767 4768ISC_SOCKETFUNC_SCOPE isc_result_t 4769isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region, 4770 isc_task_t *task, isc_taskaction_t action, const void *arg, 4771 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 4772{ 4773 isc__socket_t *sock = (isc__socket_t *)sock0; 4774 isc_socketevent_t *dev; 4775 isc__socketmgr_t *manager; 4776 4777 REQUIRE(VALID_SOCKET(sock)); 4778 REQUIRE(region != NULL); 4779 REQUIRE(task != NULL); 4780 REQUIRE(action != NULL); 4781 4782 manager = sock->manager; 4783 REQUIRE(VALID_MANAGER(manager)); 4784 4785 INSIST(sock->bound); 4786 4787 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg); 4788 if (dev == NULL) 4789 return (ISC_R_NOMEMORY); 4790 4791 dev->region = *region; 4792 4793 return (socket_send(sock, dev, task, address, pktinfo, 0)); 4794} 4795 4796ISC_SOCKETFUNC_SCOPE isc_result_t 4797isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist, 4798 isc_task_t *task, isc_taskaction_t action, const void *arg) 4799{ 4800 return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL, 4801 NULL, 0)); 4802} 4803 4804ISC_SOCKETFUNC_SCOPE isc_result_t 4805isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist, 4806 isc_task_t *task, isc_taskaction_t action, const void *arg, 4807 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 4808{ 4809 return (isc__socket_sendtov2(sock, buflist, task, action, arg, address, 4810 pktinfo, 0)); 4811} 4812 4813ISC_SOCKETFUNC_SCOPE isc_result_t 4814isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist, 4815 isc_task_t *task, isc_taskaction_t action, const void *arg, 4816 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4817 unsigned int flags) 4818{ 4819 isc__socket_t *sock = (isc__socket_t *)sock0; 4820 isc_socketevent_t *dev; 4821 isc__socketmgr_t *manager; 4822 unsigned int iocount; 4823 isc_buffer_t *buffer; 4824 4825 
REQUIRE(VALID_SOCKET(sock)); 4826 REQUIRE(buflist != NULL); 4827 REQUIRE(!ISC_LIST_EMPTY(*buflist)); 4828 REQUIRE(task != NULL); 4829 REQUIRE(action != NULL); 4830 4831 manager = sock->manager; 4832 REQUIRE(VALID_MANAGER(manager)); 4833 4834 iocount = isc_bufferlist_usedcount(buflist); 4835 REQUIRE(iocount > 0); 4836 4837 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg); 4838 if (dev == NULL) 4839 return (ISC_R_NOMEMORY); 4840 4841 /* 4842 * Move each buffer from the passed in list to our internal one. 4843 */ 4844 buffer = ISC_LIST_HEAD(*buflist); 4845 while (buffer != NULL) { 4846 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4847 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4848 buffer = ISC_LIST_HEAD(*buflist); 4849 } 4850 4851 return (socket_send(sock, dev, task, address, pktinfo, flags)); 4852} 4853 4854ISC_SOCKETFUNC_SCOPE isc_result_t 4855isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region, 4856 isc_task_t *task, 4857 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4858 isc_socketevent_t *event, unsigned int flags) 4859{ 4860 isc__socket_t *sock = (isc__socket_t *)sock0; 4861 4862 REQUIRE(VALID_SOCKET(sock)); 4863 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0); 4864 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) 4865 REQUIRE(sock->type == isc_sockettype_udp); 4866 event->ev_sender = sock; 4867 event->result = ISC_R_UNSET; 4868 ISC_LIST_INIT(event->bufferlist); 4869 event->region = *region; 4870 event->n = 0; 4871 event->offset = 0; 4872 event->attributes = 0; 4873 4874 return (socket_send(sock, event, task, address, pktinfo, flags)); 4875} 4876 4877ISC_SOCKETFUNC_SCOPE void 4878isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) { 4879#ifdef ISC_PLATFORM_HAVESYSUNH 4880 int s; 4881 struct stat sb; 4882 char strbuf[ISC_STRERRORSIZE]; 4883 4884 if (sockaddr->type.sa.sa_family != AF_UNIX) 4885 return; 4886 4887#ifndef S_ISSOCK 4888#if defined(S_IFMT) && defined(S_IFSOCK) 4889#define 
S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK) 4890#elif defined(_S_IFMT) && defined(S_IFSOCK) 4891#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK) 4892#endif 4893#endif 4894 4895#ifndef S_ISFIFO 4896#if defined(S_IFMT) && defined(S_IFIFO) 4897#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO) 4898#elif defined(_S_IFMT) && defined(S_IFIFO) 4899#define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO) 4900#endif 4901#endif 4902 4903#if !defined(S_ISFIFO) && !defined(S_ISSOCK) 4904#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>. 4905#endif 4906 4907#ifndef S_ISFIFO 4908#define S_ISFIFO(mode) 0 4909#endif 4910 4911#ifndef S_ISSOCK 4912#define S_ISSOCK(mode) 0 4913#endif 4914 4915 if (active) { 4916 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 4917 isc__strerror(errno, strbuf, sizeof(strbuf)); 4918 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4919 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4920 "isc_socket_cleanunix: stat(%s): %s", 4921 sockaddr->type.sunix.sun_path, strbuf); 4922 return; 4923 } 4924 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 4925 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4926 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4927 "isc_socket_cleanunix: %s: not a socket", 4928 sockaddr->type.sunix.sun_path); 4929 return; 4930 } 4931 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 4932 isc__strerror(errno, strbuf, sizeof(strbuf)); 4933 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4934 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 4935 "isc_socket_cleanunix: unlink(%s): %s", 4936 sockaddr->type.sunix.sun_path, strbuf); 4937 } 4938 return; 4939 } 4940 4941 s = socket(AF_UNIX, SOCK_STREAM, 0); 4942 if (s < 0) { 4943 isc__strerror(errno, strbuf, sizeof(strbuf)); 4944 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4945 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4946 "isc_socket_cleanunix: socket(%s): %s", 4947 sockaddr->type.sunix.sun_path, strbuf); 4948 return; 4949 } 4950 4951 if 
(stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 4952 switch (errno) { 4953 case ENOENT: /* We exited cleanly last time */ 4954 break; 4955 default: 4956 isc__strerror(errno, strbuf, sizeof(strbuf)); 4957 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4958 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4959 "isc_socket_cleanunix: stat(%s): %s", 4960 sockaddr->type.sunix.sun_path, strbuf); 4961 break; 4962 } 4963 goto cleanup; 4964 } 4965 4966 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 4967 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4968 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4969 "isc_socket_cleanunix: %s: not a socket", 4970 sockaddr->type.sunix.sun_path); 4971 goto cleanup; 4972 } 4973 4974 if (connect(s, (struct sockaddr *)&sockaddr->type.sunix, 4975 sizeof(sockaddr->type.sunix)) < 0) { 4976 switch (errno) { 4977 case ECONNREFUSED: 4978 case ECONNRESET: 4979 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 4980 isc__strerror(errno, strbuf, sizeof(strbuf)); 4981 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4982 ISC_LOGMODULE_SOCKET, 4983 ISC_LOG_WARNING, 4984 "isc_socket_cleanunix: " 4985 "unlink(%s): %s", 4986 sockaddr->type.sunix.sun_path, 4987 strbuf); 4988 } 4989 break; 4990 default: 4991 isc__strerror(errno, strbuf, sizeof(strbuf)); 4992 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 4993 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 4994 "isc_socket_cleanunix: connect(%s): %s", 4995 sockaddr->type.sunix.sun_path, strbuf); 4996 break; 4997 } 4998 } 4999 cleanup: 5000 close(s); 5001#else 5002 UNUSED(sockaddr); 5003 UNUSED(active); 5004#endif 5005} 5006 5007ISC_SOCKETFUNC_SCOPE isc_result_t 5008isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, 5009 isc_uint32_t owner, isc_uint32_t group) 5010{ 5011#ifdef ISC_PLATFORM_HAVESYSUNH 5012 isc_result_t result = ISC_R_SUCCESS; 5013 char strbuf[ISC_STRERRORSIZE]; 5014 char path[sizeof(sockaddr->type.sunix.sun_path)]; 5015#ifdef NEED_SECURE_DIRECTORY 5016 char *slash; 5017#endif 5018 5019 
REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX); 5020 INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path)); 5021 strcpy(path, sockaddr->type.sunix.sun_path); 5022 5023#ifdef NEED_SECURE_DIRECTORY 5024 slash = strrchr(path, '/'); 5025 if (slash != NULL) { 5026 if (slash != path) 5027 *slash = '\0'; 5028 else 5029 strcpy(path, "/"); 5030 } else 5031 strcpy(path, "."); 5032#endif 5033 5034 if (chmod(path, perm) < 0) { 5035 isc__strerror(errno, strbuf, sizeof(strbuf)); 5036 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5037 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5038 "isc_socket_permunix: chmod(%s, %d): %s", 5039 path, perm, strbuf); 5040 result = ISC_R_FAILURE; 5041 } 5042 if (chown(path, owner, group) < 0) { 5043 isc__strerror(errno, strbuf, sizeof(strbuf)); 5044 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5045 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5046 "isc_socket_permunix: chown(%s, %d, %d): %s", 5047 path, owner, group, 5048 strbuf); 5049 result = ISC_R_FAILURE; 5050 } 5051 return (result); 5052#else 5053 UNUSED(sockaddr); 5054 UNUSED(perm); 5055 UNUSED(owner); 5056 UNUSED(group); 5057 return (ISC_R_NOTIMPLEMENTED); 5058#endif 5059} 5060 5061ISC_SOCKETFUNC_SCOPE isc_result_t 5062isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr, 5063 unsigned int options) { 5064 isc__socket_t *sock = (isc__socket_t *)sock0; 5065 char strbuf[ISC_STRERRORSIZE]; 5066 int on = 1; 5067 5068 REQUIRE(VALID_SOCKET(sock)); 5069 5070 LOCK(&sock->lock); 5071 5072 INSIST(!sock->bound); 5073 INSIST(!sock->dupped); 5074 5075 if (sock->pf != sockaddr->type.sa.sa_family) { 5076 UNLOCK(&sock->lock); 5077 return (ISC_R_FAMILYMISMATCH); 5078 } 5079 5080 /* 5081 * Only set SO_REUSEADDR when we want a specific port. 
5082 */ 5083#ifdef AF_UNIX 5084 if (sock->pf == AF_UNIX) 5085 goto bind_socket; 5086#endif 5087 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 && 5088 isc_sockaddr_getport(sockaddr) != (in_port_t)0 && 5089 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, 5090 sizeof(on)) < 0) { 5091 UNEXPECTED_ERROR(__FILE__, __LINE__, 5092 "setsockopt(%d) %s", sock->fd, 5093 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 5094 ISC_MSG_FAILED, "failed")); 5095 /* Press on... */ 5096 } 5097#ifdef AF_UNIX 5098 bind_socket: 5099#endif 5100 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) { 5101 inc_stats(sock->manager->stats, 5102 sock->statsindex[STATID_BINDFAIL]); 5103 5104 UNLOCK(&sock->lock); 5105 switch (errno) { 5106 case EACCES: 5107 return (ISC_R_NOPERM); 5108 case EADDRNOTAVAIL: 5109 return (ISC_R_ADDRNOTAVAIL); 5110 case EADDRINUSE: 5111 return (ISC_R_ADDRINUSE); 5112 case EINVAL: 5113 return (ISC_R_BOUND); 5114 default: 5115 isc__strerror(errno, strbuf, sizeof(strbuf)); 5116 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s", 5117 strbuf); 5118 return (ISC_R_UNEXPECTED); 5119 } 5120 } 5121 5122 socket_log(sock, sockaddr, TRACE, 5123 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound"); 5124 sock->bound = 1; 5125 5126 UNLOCK(&sock->lock); 5127 return (ISC_R_SUCCESS); 5128} 5129 5130/* 5131 * Enable this only for specific OS versions, and only when they have repaired 5132 * their problems with it. Until then, this is is broken and needs to be 5133 * diabled by default. See RT22589 for details. 
5134 */ 5135#undef ENABLE_ACCEPTFILTER 5136 5137ISC_SOCKETFUNC_SCOPE isc_result_t 5138isc__socket_filter(isc_socket_t *sock0, const char *filter) { 5139 isc__socket_t *sock = (isc__socket_t *)sock0; 5140#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) 5141 char strbuf[ISC_STRERRORSIZE]; 5142 struct accept_filter_arg afa; 5143#else 5144 UNUSED(sock); 5145 UNUSED(filter); 5146#endif 5147 5148 REQUIRE(VALID_SOCKET(sock)); 5149 5150#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) 5151 bzero(&afa, sizeof(afa)); 5152 strncpy(afa.af_name, filter, sizeof(afa.af_name)); 5153 if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, 5154 &afa, sizeof(afa)) == -1) { 5155 isc__strerror(errno, strbuf, sizeof(strbuf)); 5156 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 5157 ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s", 5158 strbuf); 5159 return (ISC_R_FAILURE); 5160 } 5161 return (ISC_R_SUCCESS); 5162#else 5163 return (ISC_R_NOTIMPLEMENTED); 5164#endif 5165} 5166 5167/* 5168 * Set up to listen on a given socket. We do this by creating an internal 5169 * event that will be dispatched when the socket has read activity. The 5170 * watcher will send the internal event to the task when there is a new 5171 * connection. 5172 * 5173 * Unlike in read, we don't preallocate a done event here. Every time there 5174 * is a new connection we'll have to allocate a new one anyway, so we might 5175 * as well keep things simple rather than having to track them. 
5176 */ 5177ISC_SOCKETFUNC_SCOPE isc_result_t 5178isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) { 5179 isc__socket_t *sock = (isc__socket_t *)sock0; 5180 char strbuf[ISC_STRERRORSIZE]; 5181 5182 REQUIRE(VALID_SOCKET(sock)); 5183 5184 LOCK(&sock->lock); 5185 5186 REQUIRE(!sock->listener); 5187 REQUIRE(sock->bound); 5188 REQUIRE(sock->type == isc_sockettype_tcp || 5189 sock->type == isc_sockettype_unix); 5190 5191 if (backlog == 0) 5192 backlog = SOMAXCONN; 5193 5194 if (listen(sock->fd, (int)backlog) < 0) { 5195 UNLOCK(&sock->lock); 5196 isc__strerror(errno, strbuf, sizeof(strbuf)); 5197 5198 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf); 5199 5200 return (ISC_R_UNEXPECTED); 5201 } 5202 5203 sock->listener = 1; 5204 5205 UNLOCK(&sock->lock); 5206 return (ISC_R_SUCCESS); 5207} 5208 5209/* 5210 * This should try to do aggressive accept() XXXMLG 5211 */ 5212ISC_SOCKETFUNC_SCOPE isc_result_t 5213isc__socket_accept(isc_socket_t *sock0, 5214 isc_task_t *task, isc_taskaction_t action, const void *arg) 5215{ 5216 isc__socket_t *sock = (isc__socket_t *)sock0; 5217 isc_socket_newconnev_t *dev; 5218 isc__socketmgr_t *manager; 5219 isc_task_t *ntask = NULL; 5220 isc__socket_t *nsock; 5221 isc_result_t result; 5222 isc_boolean_t do_poke = ISC_FALSE; 5223 5224 REQUIRE(VALID_SOCKET(sock)); 5225 manager = sock->manager; 5226 REQUIRE(VALID_MANAGER(manager)); 5227 5228 LOCK(&sock->lock); 5229 5230 REQUIRE(sock->listener); 5231 5232 /* 5233 * Sender field is overloaded here with the task we will be sending 5234 * this event to. Just before the actual event is delivered the 5235 * actual ev_sender will be touched up to be the socket. 
5236 */ 5237 dev = (isc_socket_newconnev_t *) 5238 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN, 5239 action, arg, sizeof(*dev)); 5240 if (dev == NULL) { 5241 UNLOCK(&sock->lock); 5242 return (ISC_R_NOMEMORY); 5243 } 5244 ISC_LINK_INIT(dev, ev_link); 5245 5246 result = allocate_socket(manager, sock->type, &nsock); 5247 if (result != ISC_R_SUCCESS) { 5248 isc_event_free(ISC_EVENT_PTR(&dev)); 5249 UNLOCK(&sock->lock); 5250 return (result); 5251 } 5252 5253 /* 5254 * Attach to socket and to task. 5255 */ 5256 isc_task_attach(task, &ntask); 5257 if (isc_task_exiting(ntask)) { 5258 free_socket(&nsock); 5259 isc_task_detach(&ntask); 5260 isc_event_free(ISC_EVENT_PTR(&dev)); 5261 UNLOCK(&sock->lock); 5262 return (ISC_R_SHUTTINGDOWN); 5263 } 5264 nsock->references++; 5265 nsock->statsindex = sock->statsindex; 5266 5267 dev->ev_sender = ntask; 5268 dev->newsocket = (isc_socket_t *)nsock; 5269 5270 /* 5271 * Poke watcher here. We still have the socket locked, so there 5272 * is no race condition. We will keep the lock for such a short 5273 * bit of time waking it up now or later won't matter all that much. 
5274 */ 5275 if (ISC_LIST_EMPTY(sock->accept_list)) 5276 do_poke = ISC_TRUE; 5277 5278 ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link); 5279 5280 if (do_poke) 5281 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT); 5282 5283 UNLOCK(&sock->lock); 5284 return (ISC_R_SUCCESS); 5285} 5286 5287ISC_SOCKETFUNC_SCOPE isc_result_t 5288isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr, 5289 isc_task_t *task, isc_taskaction_t action, const void *arg) 5290{ 5291 isc__socket_t *sock = (isc__socket_t *)sock0; 5292 isc_socket_connev_t *dev; 5293 isc_task_t *ntask = NULL; 5294 isc__socketmgr_t *manager; 5295 int cc; 5296 char strbuf[ISC_STRERRORSIZE]; 5297 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 5298 5299 REQUIRE(VALID_SOCKET(sock)); 5300 REQUIRE(addr != NULL); 5301 REQUIRE(task != NULL); 5302 REQUIRE(action != NULL); 5303 5304 manager = sock->manager; 5305 REQUIRE(VALID_MANAGER(manager)); 5306 REQUIRE(addr != NULL); 5307 5308 if (isc_sockaddr_ismulticast(addr)) 5309 return (ISC_R_MULTICAST); 5310 5311 LOCK(&sock->lock); 5312 5313 REQUIRE(!sock->connecting); 5314 5315 dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock, 5316 ISC_SOCKEVENT_CONNECT, 5317 action, arg, 5318 sizeof(*dev)); 5319 if (dev == NULL) { 5320 UNLOCK(&sock->lock); 5321 return (ISC_R_NOMEMORY); 5322 } 5323 ISC_LINK_INIT(dev, ev_link); 5324 5325 /* 5326 * Try to do the connect right away, as there can be only one 5327 * outstanding, and it might happen to complete. 5328 */ 5329 sock->peer_address = *addr; 5330 cc = connect(sock->fd, &addr->type.sa, addr->length); 5331 if (cc < 0) { 5332 /* 5333 * HP-UX "fails" to connect a UDP socket and sets errno to 5334 * EINPROGRESS if it's non-blocking. We'd rather regard this as 5335 * a success and let the user detect it if it's really an error 5336 * at the time of sending a packet on the socket. 
5337 */ 5338 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { 5339 cc = 0; 5340 goto success; 5341 } 5342 if (SOFT_ERROR(errno) || errno == EINPROGRESS) 5343 goto queue; 5344 5345 switch (errno) { 5346#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit; 5347 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5348 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5349 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5350 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5351 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5352#ifdef EHOSTDOWN 5353 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5354#endif 5355 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5356 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5357 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5358 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5359 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5360#undef ERROR_MATCH 5361 } 5362 5363 sock->connected = 0; 5364 5365 isc__strerror(errno, strbuf, sizeof(strbuf)); 5366 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf)); 5367 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s", 5368 addrbuf, errno, strbuf); 5369 5370 UNLOCK(&sock->lock); 5371 inc_stats(sock->manager->stats, 5372 sock->statsindex[STATID_CONNECTFAIL]); 5373 isc_event_free(ISC_EVENT_PTR(&dev)); 5374 return (ISC_R_UNEXPECTED); 5375 5376 err_exit: 5377 sock->connected = 0; 5378 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5379 5380 UNLOCK(&sock->lock); 5381 inc_stats(sock->manager->stats, 5382 sock->statsindex[STATID_CONNECTFAIL]); 5383 return (ISC_R_SUCCESS); 5384 } 5385 5386 /* 5387 * If connect completed, fire off the done event. 5388 */ 5389 success: 5390 if (cc == 0) { 5391 sock->connected = 1; 5392 sock->bound = 1; 5393 dev->result = ISC_R_SUCCESS; 5394 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5395 5396 UNLOCK(&sock->lock); 5397 5398 inc_stats(sock->manager->stats, 5399 sock->statsindex[STATID_CONNECT]); 5400 5401 return (ISC_R_SUCCESS); 5402 } 5403 5404 queue: 5405 5406 /* 5407 * Attach to task. 
5408 */ 5409 isc_task_attach(task, &ntask); 5410 5411 sock->connecting = 1; 5412 5413 dev->ev_sender = ntask; 5414 5415 /* 5416 * Poke watcher here. We still have the socket locked, so there 5417 * is no race condition. We will keep the lock for such a short 5418 * bit of time waking it up now or later won't matter all that much. 5419 */ 5420 if (sock->connect_ev == NULL) 5421 select_poke(manager, sock->fd, SELECT_POKE_CONNECT); 5422 5423 sock->connect_ev = dev; 5424 5425 UNLOCK(&sock->lock); 5426 return (ISC_R_SUCCESS); 5427} 5428 5429/* 5430 * Called when a socket with a pending connect() finishes. 5431 */ 5432static void 5433internal_connect(isc_task_t *me, isc_event_t *ev) { 5434 isc__socket_t *sock; 5435 isc_socket_connev_t *dev; 5436 isc_task_t *task; 5437 int cc; 5438 ISC_SOCKADDR_LEN_T optlen; 5439 char strbuf[ISC_STRERRORSIZE]; 5440 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 5441 5442 UNUSED(me); 5443 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 5444 5445 sock = ev->ev_sender; 5446 INSIST(VALID_SOCKET(sock)); 5447 5448 LOCK(&sock->lock); 5449 5450 /* 5451 * When the internal event was sent the reference count was bumped 5452 * to keep the socket around for us. Decrement the count here. 5453 */ 5454 INSIST(sock->references > 0); 5455 sock->references--; 5456 if (sock->references == 0) { 5457 UNLOCK(&sock->lock); 5458 destroy(&sock); 5459 return; 5460 } 5461 5462 /* 5463 * Has this event been canceled? 5464 */ 5465 dev = sock->connect_ev; 5466 if (dev == NULL) { 5467 INSIST(!sock->connecting); 5468 UNLOCK(&sock->lock); 5469 return; 5470 } 5471 5472 INSIST(sock->connecting); 5473 sock->connecting = 0; 5474 5475 /* 5476 * Get any possible error status here. 5477 */ 5478 optlen = sizeof(cc); 5479 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, 5480 (void *)&cc, (void *)&optlen) < 0) 5481 cc = errno; 5482 else 5483 errno = cc; 5484 5485 if (errno != 0) { 5486 /* 5487 * If the error is EAGAIN, just re-select on this 5488 * fd and pretend nothing strange happened. 
5489 */ 5490 if (SOFT_ERROR(errno) || errno == EINPROGRESS) { 5491 sock->connecting = 1; 5492 select_poke(sock->manager, sock->fd, 5493 SELECT_POKE_CONNECT); 5494 UNLOCK(&sock->lock); 5495 5496 return; 5497 } 5498 5499 inc_stats(sock->manager->stats, 5500 sock->statsindex[STATID_CONNECTFAIL]); 5501 5502 /* 5503 * Translate other errors into ISC_R_* flavors. 5504 */ 5505 switch (errno) { 5506#define ERROR_MATCH(a, b) case a: dev->result = b; break; 5507 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5508 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5509 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5510 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5511 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5512#ifdef EHOSTDOWN 5513 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5514#endif 5515 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5516 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5517 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5518 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5519 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT); 5520 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5521#undef ERROR_MATCH 5522 default: 5523 dev->result = ISC_R_UNEXPECTED; 5524 isc_sockaddr_format(&sock->peer_address, peerbuf, 5525 sizeof(peerbuf)); 5526 isc__strerror(errno, strbuf, sizeof(strbuf)); 5527 UNEXPECTED_ERROR(__FILE__, __LINE__, 5528 "internal_connect: connect(%s) %s", 5529 peerbuf, strbuf); 5530 } 5531 } else { 5532 inc_stats(sock->manager->stats, 5533 sock->statsindex[STATID_CONNECT]); 5534 dev->result = ISC_R_SUCCESS; 5535 sock->connected = 1; 5536 sock->bound = 1; 5537 } 5538 5539 sock->connect_ev = NULL; 5540 5541 UNLOCK(&sock->lock); 5542 5543 task = dev->ev_sender; 5544 dev->ev_sender = sock; 5545 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 5546} 5547 5548ISC_SOCKETFUNC_SCOPE isc_result_t 5549isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) { 5550 isc__socket_t *sock = (isc__socket_t *)sock0; 5551 isc_result_t result; 5552 5553 REQUIRE(VALID_SOCKET(sock)); 5554 
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (sock->connected) {
		*addressp = sock->peer_address;
		result = ISC_R_SUCCESS;
	} else {
		result = ISC_R_NOTCONNECTED;
	}

	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Return the socket's local address in '*addressp' via getsockname(),
 * or ISC_R_NOTBOUND if the socket has not been bound.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	ISC_SOCKADDR_LEN_T len;
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (!sock->bound) {
		result = ISC_R_NOTBOUND;
		goto out;
	}

	result = ISC_R_SUCCESS;

	len = sizeof(addressp->type);
	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto out;
	}
	addressp->length = (unsigned int)len;

 out:
	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 * A NULL 'task' cancels matching events for every task.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0)
		return;

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
	    && !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		/* Grab 'next' before the done event unlinks 'dev'. */
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
	    && !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
	    && !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {

				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/*
				 * Free the not-yet-delivered child socket
				 * before posting the canceled event.
				 */
				NEWCONNSOCK(dev)->references--;
				free_socket((isc__socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendanddetach(&current_task,
						       ISC_EVENT_PTR(&dev));
			}

			dev = next;
		}
	}

	/*
	 * Connecting is not a list.
	 */
	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
	    && sock->connect_ev != NULL) {
		isc_socket_connev_t    *dev;
		isc_task_t	       *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = sock->connect_ev;
		current_task = dev->ev_sender;

		if ((task == NULL) || (task == current_task)) {
			sock->connect_ev = NULL;

			dev->result = ISC_R_CANCELED;
			dev->ev_sender = sock;
			isc_task_sendanddetach(&current_task,
					       ISC_EVENT_PTR(&dev));
		}
	}

	UNLOCK(&sock->lock);
}

/*
 * Return the isc_sockettype_t this socket was created with.
 */
ISC_SOCKETFUNC_SCOPE isc_sockettype_t
isc__socket_gettype(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	return (sock->type);
}

/*
 * Report whether the socket is currently flagged as bound.
 */
ISC_SOCKETFUNC_SCOPE isc_boolean_t
isc__socket_isbound(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_boolean_t val;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
	UNLOCK(&sock->lock);

	return (val);
}

/*
 * Set or clear the IPV6_V6ONLY option on an AF_INET6 socket.
 * On platforms without IPV6_V6ONLY this only performs the sanity checks.
 * Must not be called on a duplicated socket (INSIST below).
 */
ISC_SOCKETFUNC_SCOPE void
isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(IPV6_V6ONLY)
	int onoff = yes ? 1 : 0;
#else
	UNUSED(yes);
	UNUSED(sock);
#endif

	REQUIRE(VALID_SOCKET(sock));
	INSIST(!sock->dupped);

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		/* A failure here is logged but deliberately non-fatal. */
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0) {
			char strbuf[ISC_STRERRORSIZE];
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
	}
	FIX_IPV6_RECVPKTINFO(sock);	/* AIX */
#endif
}

#ifndef USE_WATCHER_THREAD
/*
 * In our assumed scenario, we can simply use a single static object.
 * XXX: this is not true if the application uses multiple threads with
 * 'multi-context' mode.  Fixing this is a future TODO item.
 */
static isc_socketwait_t swait_private;

/*
 * Wait for events on the manager's descriptors for at most 'tvp'
 * (NULL means wait forever), using whichever multiplex mechanism
 * (kqueue/epoll/devpoll/select) was chosen at build time.  The raw
 * return value of the underlying wait call is returned, and the static
 * wait state is handed back through '*swaitp' for a later dispatch.
 */
int
isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
			  isc_socketwait_t **swaitp)
{
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	int n;
#ifdef USE_KQUEUE
	struct timespec ts, *tsp;
#endif
#ifdef USE_EPOLL
	int timeout;
#endif
#ifdef USE_DEVPOLL
	struct dvpoll dvp;
#endif

	REQUIRE(swaitp != NULL && *swaitp == NULL);

#ifdef USE_SHARED_MANAGER
	if (manager == NULL)
		manager = socketmgr;
#endif
	if (manager == NULL)
		return (0);

#ifdef USE_KQUEUE
	/* Convert the timeval to the timespec kevent() expects. */
	if (tvp != NULL) {
		ts.tv_sec = tvp->tv_sec;
		ts.tv_nsec = tvp->tv_usec * 1000;
		tsp = &ts;
	} else
		tsp = NULL;
	swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
				       manager->events, manager->nevents,
				       tsp);
	n = swait_private.nevents;
#elif defined(USE_EPOLL)
	/* Milliseconds, rounded up so a short timeout does not busy-spin. */
	if (tvp != NULL)
		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
	else
timeout = -1; 5837 swait_private.nevents = epoll_wait(manager->epoll_fd, 5838 manager->events, 5839 manager->nevents, timeout); 5840 n = swait_private.nevents; 5841#elif defined(USE_DEVPOLL) 5842 dvp.dp_fds = manager->events; 5843 dvp.dp_nfds = manager->nevents; 5844 if (tvp != NULL) { 5845 dvp.dp_timeout = tvp->tv_sec * 1000 + 5846 (tvp->tv_usec + 999) / 1000; 5847 } else 5848 dvp.dp_timeout = -1; 5849 swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp); 5850 n = swait_private.nevents; 5851#elif defined(USE_SELECT) 5852 memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize); 5853 memmove(manager->write_fds_copy, manager->write_fds, 5854 manager->fd_bufsize); 5855 5856 swait_private.readset = manager->read_fds_copy; 5857 swait_private.writeset = manager->write_fds_copy; 5858 swait_private.maxfd = manager->maxfd + 1; 5859 5860 n = select(swait_private.maxfd, swait_private.readset, 5861 swait_private.writeset, NULL, tvp); 5862#endif 5863 5864 *swaitp = &swait_private; 5865 return (n); 5866} 5867 5868isc_result_t 5869isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) { 5870 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 5871 5872 REQUIRE(swait == &swait_private); 5873 5874#ifdef USE_SHARED_MANAGER 5875 if (manager == NULL) 5876 manager = socketmgr; 5877#endif 5878 if (manager == NULL) 5879 return (ISC_R_NOTFOUND); 5880 5881#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 5882 (void)process_fds(manager, manager->events, swait->nevents); 5883 return (ISC_R_SUCCESS); 5884#elif defined(USE_SELECT) 5885 process_fds(manager, swait->maxfd, swait->readset, swait->writeset); 5886 return (ISC_R_SUCCESS); 5887#endif 5888} 5889#endif /* USE_WATCHER_THREAD */ 5890 5891#ifdef BIND9 5892void 5893isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) { 5894 isc__socket_t *socket = (isc__socket_t *)socket0; 5895 5896 /* 5897 * Name 'socket'. 
5898 */ 5899 5900 REQUIRE(VALID_SOCKET(socket)); 5901 5902 LOCK(&socket->lock); 5903 memset(socket->name, 0, sizeof(socket->name)); 5904 strncpy(socket->name, name, sizeof(socket->name) - 1); 5905 socket->tag = tag; 5906 UNLOCK(&socket->lock); 5907} 5908 5909ISC_SOCKETFUNC_SCOPE const char * 5910isc__socket_getname(isc_socket_t *socket0) { 5911 isc__socket_t *socket = (isc__socket_t *)socket0; 5912 5913 return (socket->name); 5914} 5915 5916void * 5917isc__socket_gettag(isc_socket_t *socket0) { 5918 isc__socket_t *socket = (isc__socket_t *)socket0; 5919 5920 return (socket->tag); 5921} 5922#endif /* BIND9 */ 5923 5924#ifdef USE_SOCKETIMPREGISTER 5925isc_result_t 5926isc__socket_register() { 5927 return (isc_socket_register(isc__socketmgr_create)); 5928} 5929#endif 5930 5931ISC_SOCKETFUNC_SCOPE int 5932isc__socket_getfd(isc_socket_t *socket0) { 5933 isc__socket_t *socket = (isc__socket_t *)socket0; 5934 5935 return ((short) socket->fd); 5936} 5937 5938#if defined(HAVE_LIBXML2) && defined(BIND9) 5939 5940static const char * 5941_socktype(isc_sockettype_t type) 5942{ 5943 if (type == isc_sockettype_udp) 5944 return ("udp"); 5945 else if (type == isc_sockettype_tcp) 5946 return ("tcp"); 5947 else if (type == isc_sockettype_unix) 5948 return ("unix"); 5949 else if (type == isc_sockettype_fdwatch) 5950 return ("fdwatch"); 5951 else 5952 return ("not-initialized"); 5953} 5954 5955#define TRY0(a) do { xmlrc = (a); if (xmlrc < 0) goto error; } while(0) 5956ISC_SOCKETFUNC_SCOPE int 5957isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) { 5958 isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0; 5959 isc__socket_t *sock = NULL; 5960 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 5961 isc_sockaddr_t addr; 5962 ISC_SOCKADDR_LEN_T len; 5963 int xmlrc; 5964 5965 LOCK(&mgr->lock); 5966 5967#ifdef USE_SHARED_MANAGER 5968 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "references")); 5969 TRY0(xmlTextWriterWriteFormatString(writer, "%d", mgr->refs)); 5970 
	TRY0(xmlTextWriterEndElement(writer));
#endif /* USE_SHARED_MANAGER */

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		/*
		 * Each socket is locked while its state is rendered; the
		 * error label below relies on 'sock' being non-NULL only
		 * while its lock is held.
		 */
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%d",
						    sock->references));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
					  ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(writer,
						   ISC_XMLCHAR "peer-address",
						   ISC_XMLCHAR peerbuf));
		}

		/* Best effort: skip local-address if getsockname() fails. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "local-address",
						  ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->pending_recv)
			TRY0(xmlTextWriterWriteElement(writer,
						ISC_XMLCHAR "state",
						ISC_XMLCHAR "pending-receive"));
		if (sock->pending_send)
			TRY0(xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending-send"));
		/*
		 * NOTE(review): "pending_accept" uses an underscore while
		 * the other states use hyphens; consumers may depend on the
		 * current spelling, so it is left as-is.
		 */
		if (sock->pending_accept)
			TRY0(xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending_accept"));
		if (sock->listener)
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		if (sock->connected)
			TRY0(xmlTextWriterWriteElement(writer,
						     ISC_XMLCHAR "state",
						     ISC_XMLCHAR "connected"));
		if (sock->connecting)
			TRY0(xmlTextWriterWriteElement(writer,
						    ISC_XMLCHAR "state",
						    ISC_XMLCHAR "connecting"));
		if (sock->bound)
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

 error:
	/* On error mid-loop the current socket is still locked; release it. */
	if (sock != NULL)
		UNLOCK(&sock->lock);

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */