1/* 2 * Copyright (C) 2004-2009 Internet Systems Consortium, Inc. ("ISC") 3 * Copyright (C) 1998-2003 Internet Software Consortium. 4 * 5 * Permission to use, copy, modify, and/or distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 15 * PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18/* $Id: socket.c,v 1.308.12.8 2009/04/18 01:29:26 jinmei Exp $ */ 19 20/*! \file */ 21 22#include <config.h> 23 24#include <sys/param.h> 25#include <sys/types.h> 26#include <sys/socket.h> 27#include <sys/stat.h> 28#include <sys/time.h> 29#include <sys/uio.h> 30 31#include <errno.h> 32#include <fcntl.h> 33#include <stddef.h> 34#include <stdlib.h> 35#include <string.h> 36#include <unistd.h> 37 38#include <isc/buffer.h> 39#include <isc/bufferlist.h> 40#include <isc/condition.h> 41#include <isc/formatcheck.h> 42#include <isc/list.h> 43#include <isc/log.h> 44#include <isc/mem.h> 45#include <isc/msgs.h> 46#include <isc/mutex.h> 47#include <isc/net.h> 48#include <isc/once.h> 49#include <isc/platform.h> 50#include <isc/print.h> 51#include <isc/region.h> 52#include <isc/socket.h> 53#include <isc/stats.h> 54#include <isc/strerror.h> 55#include <isc/task.h> 56#include <isc/thread.h> 57#include <isc/util.h> 58#include <isc/xml.h> 59 60#ifdef ISC_PLATFORM_HAVESYSUNH 61#include <sys/un.h> 62#endif 63#ifdef ISC_PLATFORM_HAVEKQUEUE 64#include <sys/event.h> 65#endif 66#ifdef ISC_PLATFORM_HAVEEPOLL 67#include 
<sys/epoll.h> 68#endif 69#ifdef ISC_PLATFORM_HAVEDEVPOLL 70#include <sys/devpoll.h> 71#endif 72 73#include "errno2result.h" 74 75#ifndef ISC_PLATFORM_USETHREADS 76#include "socket_p.h" 77#endif /* ISC_PLATFORM_USETHREADS */ 78 79#if defined(SO_BSDCOMPAT) && defined(__linux__) 80#include <sys/utsname.h> 81#endif 82 83/*% 84 * Choose the most preferable multiplex method. 85 */ 86#ifdef ISC_PLATFORM_HAVEKQUEUE 87#define USE_KQUEUE 88#elif defined (ISC_PLATFORM_HAVEEPOLL) 89#define USE_EPOLL 90#elif defined (ISC_PLATFORM_HAVEDEVPOLL) 91#define USE_DEVPOLL 92typedef struct { 93 unsigned int want_read : 1, 94 want_write : 1; 95} pollinfo_t; 96#else 97#define USE_SELECT 98#endif /* ISC_PLATFORM_HAVEKQUEUE */ 99 100#ifndef ISC_PLATFORM_USETHREADS 101#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 102struct isc_socketwait { 103 int nevents; 104}; 105#elif defined (USE_SELECT) 106struct isc_socketwait { 107 fd_set *readset; 108 fd_set *writeset; 109 int nfds; 110 int maxfd; 111}; 112#endif /* USE_KQUEUE */ 113#endif /* !ISC_PLATFORM_USETHREADS */ 114 115/*% 116 * Maximum number of allowable open sockets. This is also the maximum 117 * allowable socket file descriptor. 118 * 119 * Care should be taken before modifying this value for select(): 120 * The API standard doesn't ensure select() accept more than (the system default 121 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in 122 * the vast majority of cases. This constant should therefore be increased only 123 * when absolutely necessary and possible, i.e., the server is exhausting all 124 * available file descriptors (up to FD_SETSIZE) and the select() function 125 * and FD_xxx macros support larger values than FD_SETSIZE (which may not 126 * always by true, but we keep using some of them to ensure as much 127 * portability as possible). 
Note also that overall server performance 128 * may be rather worsened with a larger value of this constant due to 129 * inherent scalability problems of select(). 130 * 131 * As a special note, this value shouldn't have to be touched if 132 * this is a build for an authoritative only DNS server. 133 */ 134#ifndef ISC_SOCKET_MAXSOCKETS 135#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 136#define ISC_SOCKET_MAXSOCKETS 4096 137#elif defined(USE_SELECT) 138#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE 139#endif /* USE_KQUEUE... */ 140#endif /* ISC_SOCKET_MAXSOCKETS */ 141 142#ifdef USE_SELECT 143/*% 144 * Mac OS X needs a special definition to support larger values in select(). 145 * We always define this because a larger value can be specified run-time. 146 */ 147#ifdef __APPLE__ 148#define _DARWIN_UNLIMITED_SELECT 149#endif /* __APPLE__ */ 150#endif /* USE_SELECT */ 151 152#ifdef ISC_SOCKET_USE_POLLWATCH 153/*% 154 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel 155 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for 156 * some of the specified FD. The idea is based on the observation that it's 157 * likely for a busy server to keep receiving packets. It specifically works 158 * as follows: the socket watcher is first initialized with the state of 159 * "poll_idle". While it's in the idle state it keeps sleeping until a socket 160 * event occurs. When it wakes up for a socket I/O event, it moves to the 161 * poll_active state, and sets the poll timeout to a short period 162 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the 163 * watcher goes to the poll_checking state with the same timeout period. 164 * In this state, the watcher tries to detect whether this is a break 165 * during intermittent events or the kernel bug is triggered. 
If the next 166 * polling reports an event within the short period, the previous timeout is 167 * likely to be a kernel bug, and so the watcher goes back to the active state. 168 * Otherwise, it moves to the idle state again. 169 * 170 * It's not clear whether this is a thread-related bug, but since we've only 171 * seen this with threads, this workaround is used only when enabling threads. 172 */ 173 174typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; 175 176#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT 177#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 178#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ 179#endif /* ISC_SOCKET_USE_POLLWATCH */ 180 181/*% 182 * Size of per-FD lock buckets. 183 */ 184#ifdef ISC_PLATFORM_USETHREADS 185#define FDLOCK_COUNT 1024 186#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) 187#else 188#define FDLOCK_COUNT 1 189#define FDLOCK_ID(fd) 0 190#endif /* ISC_PLATFORM_USETHREADS */ 191 192/*% 193 * Maximum number of events communicated with the kernel. There should normally 194 * be no need for having a large number. 195 */ 196#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 197#ifndef ISC_SOCKET_MAXEVENTS 198#define ISC_SOCKET_MAXEVENTS 64 199#endif 200#endif 201 202/*% 203 * Some systems define the socket length argument as an int, some as size_t, 204 * some as socklen_t. This is here so it can be easily changed if needed. 205 */ 206#ifndef ISC_SOCKADDR_LEN_T 207#define ISC_SOCKADDR_LEN_T unsigned int 208#endif 209 210/*% 211 * Define what the possible "soft" errors can be. These are non-fatal returns 212 * of various network related functions, like recv() and so on. 213 * 214 * For some reason, BSDI (and perhaps others) will sometimes return <0 215 * from recv() but will have errno==0. This is broken, but we have to 216 * work around it here. 
217 */ 218#define SOFT_ERROR(e) ((e) == EAGAIN || \ 219 (e) == EWOULDBLOCK || \ 220 (e) == EINTR || \ 221 (e) == 0) 222 223#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x) 224 225/*!< 226 * DLVL(90) -- Function entry/exit and other tracing. 227 * DLVL(70) -- Socket "correctness" -- including returning of events, etc. 228 * DLVL(60) -- Socket data send/receive 229 * DLVL(50) -- Event tracing, including receiving/sending completion events. 230 * DLVL(20) -- Socket creation/destruction. 231 */ 232#define TRACE_LEVEL 90 233#define CORRECTNESS_LEVEL 70 234#define IOEVENT_LEVEL 60 235#define EVENT_LEVEL 50 236#define CREATION_LEVEL 20 237 238#define TRACE DLVL(TRACE_LEVEL) 239#define CORRECTNESS DLVL(CORRECTNESS_LEVEL) 240#define IOEVENT DLVL(IOEVENT_LEVEL) 241#define EVENT DLVL(EVENT_LEVEL) 242#define CREATION DLVL(CREATION_LEVEL) 243 244typedef isc_event_t intev_t; 245 246#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o') 247#define VALID_SOCKET(t) ISC_MAGIC_VALID(t, SOCKET_MAGIC) 248 249/*! 250 * IPv6 control information. If the socket is an IPv6 socket we want 251 * to collect the destination address and interface so the client can 252 * set them on outgoing packets. 253 */ 254#ifdef ISC_PLATFORM_HAVEIN6PKTINFO 255#ifndef USE_CMSG 256#define USE_CMSG 1 257#endif 258#endif 259 260/*% 261 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have 262 * a setsockopt() like interface to request timestamps, and if the OS 263 * doesn't do it for us, call gettimeofday() on every UDP receive? 264 */ 265#ifdef SO_TIMESTAMP 266#ifndef USE_CMSG 267#define USE_CMSG 1 268#endif 269#endif 270 271/*% 272 * The size to raise the receive buffer to (from BIND 8). 273 */ 274#define RCVBUFSIZE (32*1024) 275 276/*% 277 * The number of times a send operation is repeated if the result is EINTR. 278 */ 279#define NRETRIES 10 280 281struct isc_socket { 282 /* Not locked. 
*/ 283 unsigned int magic; 284 isc_socketmgr_t *manager; 285 isc_mutex_t lock; 286 isc_sockettype_t type; 287 const isc_statscounter_t *statsindex; 288 289 /* Locked by socket lock. */ 290 ISC_LINK(isc_socket_t) link; 291 unsigned int references; 292 int fd; 293 int pf; 294 char name[16]; 295 void * tag; 296 297 ISC_LIST(isc_socketevent_t) send_list; 298 ISC_LIST(isc_socketevent_t) recv_list; 299 ISC_LIST(isc_socket_newconnev_t) accept_list; 300 isc_socket_connev_t *connect_ev; 301 302 /* 303 * Internal events. Posted when a descriptor is readable or 304 * writable. These are statically allocated and never freed. 305 * They will be set to non-purgable before use. 306 */ 307 intev_t readable_ev; 308 intev_t writable_ev; 309 310 isc_sockaddr_t peer_address; /* remote address */ 311 312 unsigned int pending_recv : 1, 313 pending_send : 1, 314 pending_accept : 1, 315 listener : 1, /* listener socket */ 316 connected : 1, 317 connecting : 1, /* connect pending */ 318 bound : 1; /* bound to local addr */ 319 320#ifdef ISC_NET_RECVOVERFLOW 321 unsigned char overflow; /* used for MSG_TRUNC fake */ 322#endif 323 324 char *recvcmsgbuf; 325 ISC_SOCKADDR_LEN_T recvcmsgbuflen; 326 char *sendcmsgbuf; 327 ISC_SOCKADDR_LEN_T sendcmsgbuflen; 328 329 void *fdwatcharg; 330 isc_sockfdwatch_t fdwatchcb; 331 int fdwatchflags; 332 isc_task_t *fdwatchtask; 333}; 334 335#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g') 336#define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC) 337 338struct isc_socketmgr { 339 /* Not locked. 
*/ 340 unsigned int magic; 341 isc_mem_t *mctx; 342 isc_mutex_t lock; 343 isc_mutex_t *fdlock; 344 isc_stats_t *stats; 345#ifdef USE_KQUEUE 346 int kqueue_fd; 347 int nevents; 348 struct kevent *events; 349#endif /* USE_KQUEUE */ 350#ifdef USE_EPOLL 351 int epoll_fd; 352 int nevents; 353 struct epoll_event *events; 354#endif /* USE_EPOLL */ 355#ifdef USE_DEVPOLL 356 int devpoll_fd; 357 int nevents; 358 struct pollfd *events; 359#endif /* USE_DEVPOLL */ 360#ifdef USE_SELECT 361 int fd_bufsize; 362#endif /* USE_SELECT */ 363 unsigned int maxsocks; 364#ifdef ISC_PLATFORM_USETHREADS 365 int pipe_fds[2]; 366#endif 367 368 /* Locked by fdlock. */ 369 isc_socket_t **fds; 370 int *fdstate; 371#ifdef USE_DEVPOLL 372 pollinfo_t *fdpollinfo; 373#endif 374 375 /* Locked by manager lock. */ 376 ISC_LIST(isc_socket_t) socklist; 377#ifdef USE_SELECT 378 fd_set *read_fds; 379 fd_set *read_fds_copy; 380 fd_set *write_fds; 381 fd_set *write_fds_copy; 382 int maxfd; 383#endif /* USE_SELECT */ 384 int reserved; /* unlocked */ 385#ifdef ISC_PLATFORM_USETHREADS 386 isc_thread_t watcher; 387 isc_condition_t shutdown_ok; 388#else /* ISC_PLATFORM_USETHREADS */ 389 unsigned int refs; 390#endif /* ISC_PLATFORM_USETHREADS */ 391}; 392 393#ifndef ISC_PLATFORM_USETHREADS 394static isc_socketmgr_t *socketmgr = NULL; 395#endif /* ISC_PLATFORM_USETHREADS */ 396 397#define CLOSED 0 /* this one must be zero */ 398#define MANAGED 1 399#define CLOSE_PENDING 2 400 401/* 402 * send() and recv() iovec counts 403 */ 404#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER) 405#ifdef ISC_NET_RECVOVERFLOW 406# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1) 407#else 408# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER) 409#endif 410 411static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **); 412static void send_senddone_event(isc_socket_t *, isc_socketevent_t **); 413static void free_socket(isc_socket_t **); 414static isc_result_t allocate_socket(isc_socketmgr_t 
*, isc_sockettype_t, 415 isc_socket_t **); 416static void destroy(isc_socket_t **); 417static void internal_accept(isc_task_t *, isc_event_t *); 418static void internal_connect(isc_task_t *, isc_event_t *); 419static void internal_recv(isc_task_t *, isc_event_t *); 420static void internal_send(isc_task_t *, isc_event_t *); 421static void internal_fdwatch_write(isc_task_t *, isc_event_t *); 422static void internal_fdwatch_read(isc_task_t *, isc_event_t *); 423static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *); 424static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *, 425 struct msghdr *, struct iovec *, size_t *); 426static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *, 427 struct msghdr *, struct iovec *, size_t *); 428#ifdef ISC_PLATFORM_USETHREADS 429static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager); 430#endif 431 432#define SELECT_POKE_SHUTDOWN (-1) 433#define SELECT_POKE_NOTHING (-2) 434#define SELECT_POKE_READ (-3) 435#define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */ 436#define SELECT_POKE_WRITE (-4) 437#define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */ 438#define SELECT_POKE_CLOSE (-5) 439 440#define SOCK_DEAD(s) ((s)->references == 0) 441 442/*% 443 * Shortcut index arrays to get access to statistics counters. 
444 */ 445enum { 446 STATID_OPEN = 0, 447 STATID_OPENFAIL = 1, 448 STATID_CLOSE = 2, 449 STATID_BINDFAIL = 3, 450 STATID_CONNECTFAIL = 4, 451 STATID_CONNECT = 5, 452 STATID_ACCEPTFAIL = 6, 453 STATID_ACCEPT = 7, 454 STATID_SENDFAIL = 8, 455 STATID_RECVFAIL = 9 456}; 457static const isc_statscounter_t upd4statsindex[] = { 458 isc_sockstatscounter_udp4open, 459 isc_sockstatscounter_udp4openfail, 460 isc_sockstatscounter_udp4close, 461 isc_sockstatscounter_udp4bindfail, 462 isc_sockstatscounter_udp4connectfail, 463 isc_sockstatscounter_udp4connect, 464 -1, 465 -1, 466 isc_sockstatscounter_udp4sendfail, 467 isc_sockstatscounter_udp4recvfail 468}; 469static const isc_statscounter_t upd6statsindex[] = { 470 isc_sockstatscounter_udp6open, 471 isc_sockstatscounter_udp6openfail, 472 isc_sockstatscounter_udp6close, 473 isc_sockstatscounter_udp6bindfail, 474 isc_sockstatscounter_udp6connectfail, 475 isc_sockstatscounter_udp6connect, 476 -1, 477 -1, 478 isc_sockstatscounter_udp6sendfail, 479 isc_sockstatscounter_udp6recvfail 480}; 481static const isc_statscounter_t tcp4statsindex[] = { 482 isc_sockstatscounter_tcp4open, 483 isc_sockstatscounter_tcp4openfail, 484 isc_sockstatscounter_tcp4close, 485 isc_sockstatscounter_tcp4bindfail, 486 isc_sockstatscounter_tcp4connectfail, 487 isc_sockstatscounter_tcp4connect, 488 isc_sockstatscounter_tcp4acceptfail, 489 isc_sockstatscounter_tcp4accept, 490 isc_sockstatscounter_tcp4sendfail, 491 isc_sockstatscounter_tcp4recvfail 492}; 493static const isc_statscounter_t tcp6statsindex[] = { 494 isc_sockstatscounter_tcp6open, 495 isc_sockstatscounter_tcp6openfail, 496 isc_sockstatscounter_tcp6close, 497 isc_sockstatscounter_tcp6bindfail, 498 isc_sockstatscounter_tcp6connectfail, 499 isc_sockstatscounter_tcp6connect, 500 isc_sockstatscounter_tcp6acceptfail, 501 isc_sockstatscounter_tcp6accept, 502 isc_sockstatscounter_tcp6sendfail, 503 isc_sockstatscounter_tcp6recvfail 504}; 505static const isc_statscounter_t unixstatsindex[] = { 506 
isc_sockstatscounter_unixopen, 507 isc_sockstatscounter_unixopenfail, 508 isc_sockstatscounter_unixclose, 509 isc_sockstatscounter_unixbindfail, 510 isc_sockstatscounter_unixconnectfail, 511 isc_sockstatscounter_unixconnect, 512 isc_sockstatscounter_unixacceptfail, 513 isc_sockstatscounter_unixaccept, 514 isc_sockstatscounter_unixsendfail, 515 isc_sockstatscounter_unixrecvfail 516}; 517static const isc_statscounter_t fdwatchstatsindex[] = { 518 -1, 519 -1, 520 isc_sockstatscounter_fdwatchclose, 521 isc_sockstatscounter_fdwatchbindfail, 522 isc_sockstatscounter_fdwatchconnectfail, 523 isc_sockstatscounter_fdwatchconnect, 524 -1, 525 -1, 526 isc_sockstatscounter_fdwatchsendfail, 527 isc_sockstatscounter_fdwatchrecvfail 528}; 529 530static void 531manager_log(isc_socketmgr_t *sockmgr, 532 isc_logcategory_t *category, isc_logmodule_t *module, int level, 533 const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6); 534static void 535manager_log(isc_socketmgr_t *sockmgr, 536 isc_logcategory_t *category, isc_logmodule_t *module, int level, 537 const char *fmt, ...) 538{ 539 char msgbuf[2048]; 540 va_list ap; 541 542 if (! isc_log_wouldlog(isc_lctx, level)) 543 return; 544 545 va_start(ap, fmt); 546 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap); 547 va_end(ap); 548 549 isc_log_write(isc_lctx, category, module, level, 550 "sockmgr %p: %s", sockmgr, msgbuf); 551} 552 553static void 554socket_log(isc_socket_t *sock, isc_sockaddr_t *address, 555 isc_logcategory_t *category, isc_logmodule_t *module, int level, 556 isc_msgcat_t *msgcat, int msgset, int message, 557 const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10); 558static void 559socket_log(isc_socket_t *sock, isc_sockaddr_t *address, 560 isc_logcategory_t *category, isc_logmodule_t *module, int level, 561 isc_msgcat_t *msgcat, int msgset, int message, 562 const char *fmt, ...) 563{ 564 char msgbuf[2048]; 565 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 566 va_list ap; 567 568 if (! 
isc_log_wouldlog(isc_lctx, level)) 569 return; 570 571 va_start(ap, fmt); 572 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap); 573 va_end(ap); 574 575 if (address == NULL) { 576 isc_log_iwrite(isc_lctx, category, module, level, 577 msgcat, msgset, message, 578 "socket %p: %s", sock, msgbuf); 579 } else { 580 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf)); 581 isc_log_iwrite(isc_lctx, category, module, level, 582 msgcat, msgset, message, 583 "socket %p %s: %s", sock, peerbuf, msgbuf); 584 } 585} 586 587#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \ 588 defined(USE_CMSG) && defined(IPV6_RECVPKTINFO) 589/* 590 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by 591 * setting IPV6_V6ONLY. 592 */ 593static void 594FIX_IPV6_RECVPKTINFO(isc_socket_t *sock) 595{ 596 char strbuf[ISC_STRERRORSIZE]; 597 int on = 1; 598 599 if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp) 600 return; 601 602 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, 603 (void *)&on, sizeof(on)) < 0) { 604 605 UNEXPECTED_ERROR(__FILE__, __LINE__, 606 "setsockopt(%d, IPV6_RECVPKTINFO) " 607 "%s: %s", sock->fd, 608 isc_msgcat_get(isc_msgcat, 609 ISC_MSGSET_GENERAL, 610 ISC_MSG_FAILED, 611 "failed"), 612 strbuf); 613 } 614} 615#else 616#define FIX_IPV6_RECVPKTINFO(sock) (void)0 617#endif 618 619/*% 620 * Increment socket-related statistics counters. 
621 */ 622static inline void 623inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) { 624 REQUIRE(counterid != -1); 625 626 if (stats != NULL) 627 isc_stats_increment(stats, counterid); 628} 629 630static inline isc_result_t 631watch_fd(isc_socketmgr_t *manager, int fd, int msg) { 632 isc_result_t result = ISC_R_SUCCESS; 633 634#ifdef USE_KQUEUE 635 struct kevent evchange; 636 637 memset(&evchange, 0, sizeof(evchange)); 638 if (msg == SELECT_POKE_READ) 639 evchange.filter = EVFILT_READ; 640 else 641 evchange.filter = EVFILT_WRITE; 642 evchange.flags = EV_ADD; 643 evchange.ident = fd; 644 if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) 645 result = isc__errno2result(errno); 646 647 return (result); 648#elif defined(USE_EPOLL) 649 struct epoll_event event; 650 651 if (msg == SELECT_POKE_READ) 652 event.events = EPOLLIN; 653 else 654 event.events = EPOLLOUT; 655 event.data.fd = fd; 656 if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 && 657 errno != EEXIST) { 658 result = isc__errno2result(errno); 659 } 660 661 return (result); 662#elif defined(USE_DEVPOLL) 663 struct pollfd pfd; 664 int lockid = FDLOCK_ID(fd); 665 666 memset(&pfd, 0, sizeof(pfd)); 667 if (msg == SELECT_POKE_READ) 668 pfd.events = POLLIN; 669 else 670 pfd.events = POLLOUT; 671 pfd.fd = fd; 672 pfd.revents = 0; 673 LOCK(&manager->fdlock[lockid]); 674 if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1) 675 result = isc__errno2result(errno); 676 else { 677 if (msg == SELECT_POKE_READ) 678 manager->fdpollinfo[fd].want_read = 1; 679 else 680 manager->fdpollinfo[fd].want_write = 1; 681 } 682 UNLOCK(&manager->fdlock[lockid]); 683 684 return (result); 685#elif defined(USE_SELECT) 686 LOCK(&manager->lock); 687 if (msg == SELECT_POKE_READ) 688 FD_SET(fd, manager->read_fds); 689 if (msg == SELECT_POKE_WRITE) 690 FD_SET(fd, manager->write_fds); 691 UNLOCK(&manager->lock); 692 693 return (result); 694#endif 695} 696 697static inline isc_result_t 
unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	/*
	 * Stop watching 'fd' for the event implied by 'msg'
	 * (SELECT_POKE_READ or SELECT_POKE_WRITE), using whichever
	 * multiplexing backend this build selected.
	 */
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	event.data.fd = fd;
	/* ENOENT (not currently watched) is harmless and ignored. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*%
 * Handle a wakeup message for 'fd' delivered to the watcher: either
 * finish a pending close, or (re)arm watching for read/write.
 */
static void
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
		return;
	}
	UNLOCK(&manager->fdlock[lockid]);

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}

#ifdef ISC_PLATFORM_USETHREADS
/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 */
static void
select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
			    strbuf);
	}

	INSIST(cc == sizeof(buf));
}

/*
 * Read a message on the internal fd.
 */
static void
select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		*msg = SELECT_POKE_NOTHING;
		*fd = -1;	/* Silence compiler. */
		if (SOFT_ERROR(errno))
			return;

		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
			    strbuf);

		return;
	}
	/* POSIX guarantees the pipe write was atomic, so so is the read. */
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}
#else /* ISC_PLATFORM_USETHREADS */
/*
 * Update the state of the socketmgr when something changes.
 * (Non-threaded build: no pipe, act on the socket directly.)
 */
static void
select_poke(isc_socketmgr_t *manager, int fd, int msg) {
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
	return;
}
#endif /* ISC_PLATFORM_USETHREADS */

/*
 * Make a fd non-blocking.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	int flags;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;

	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif

	if (ret == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	ISC_SOCKADDR_LEN_T hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif
}

static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	/* CMSG_NXTHDR's offset of the next header equals this one's space. */
	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL)
		return ((char *)cmsgp - (char *)msg.msg_control);
	else
		return (0);
#endif
}
#endif /* USE_CMSG */

/*
 * Process control messages received on a socket.
 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

#ifdef ISC_NET_BSD44MSGHDR

#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
#endif

#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
#endif

#ifndef USE_CMSG
	return;
#else
	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
		return;

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);

#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {

			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

#endif /* ISC_NET_BSD44MSGHDR */
}

/*
 * Construct an iov array
 and attach it to the msghdr passed in. This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;

	memset(msg, 0, sizeof(*msg));

	/*
	 * Only supply a destination address on unconnected sockets; on a
	 * connected socket the kernel already knows the peer and some
	 * systems reject sendmsg() with an explicit name.
	 */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O? Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		/* dev->n bytes were already sent by a previous attempt. */
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			/*
			 * skip_count is nonzero only for the first,
			 * partially-sent buffer; all later buffers start
			 * at offset zero.
			 */
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0U);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	/*
	 * For UDP replies that carry pktinfo, attach an IPV6_PKTINFO
	 * control message so the kernel sends from the same interface and
	 * source address the corresponding query arrived on.
	 */
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
		msg->msg_control = (void *)sock->sendcmsgbuf;

		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
#else /* ISC_NET_BSD44MSGHDR */
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
1226#endif /* ISC_NET_BSD44MSGHDR */ 1227 1228 if (write_countp != NULL) 1229 *write_countp = write_count; 1230} 1231 1232/* 1233 * Construct an iov array and attach it to the msghdr passed in. This is 1234 * the RECV constructor, which will use the available region of the buffer 1235 * (if using a buffer list) or will use the internal region (if a single 1236 * buffer I/O is requested). 1237 * 1238 * Nothing can be NULL, and the done event must list at least one buffer 1239 * on the buffer linked list for this function to be meaningful. 1240 * 1241 * If read_countp != NULL, *read_countp will hold the number of bytes 1242 * this transaction can receive. 1243 */ 1244static void 1245build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev, 1246 struct msghdr *msg, struct iovec *iov, size_t *read_countp) 1247{ 1248 unsigned int iovcount; 1249 isc_buffer_t *buffer; 1250 isc_region_t available; 1251 size_t read_count; 1252 1253 memset(msg, 0, sizeof(struct msghdr)); 1254 1255 if (sock->type == isc_sockettype_udp) { 1256 memset(&dev->address, 0, sizeof(dev->address)); 1257#ifdef BROKEN_RECVMSG 1258 if (sock->pf == AF_INET) { 1259 msg->msg_name = (void *)&dev->address.type.sin; 1260 msg->msg_namelen = sizeof(dev->address.type.sin6); 1261 } else if (sock->pf == AF_INET6) { 1262 msg->msg_name = (void *)&dev->address.type.sin6; 1263 msg->msg_namelen = sizeof(dev->address.type.sin6); 1264#ifdef ISC_PLATFORM_HAVESYSUNH 1265 } else if (sock->pf == AF_UNIX) { 1266 msg->msg_name = (void *)&dev->address.type.sunix; 1267 msg->msg_namelen = sizeof(dev->address.type.sunix); 1268#endif 1269 } else { 1270 msg->msg_name = (void *)&dev->address.type.sa; 1271 msg->msg_namelen = sizeof(dev->address.type); 1272 } 1273#else 1274 msg->msg_name = (void *)&dev->address.type.sa; 1275 msg->msg_namelen = sizeof(dev->address.type); 1276#endif 1277#ifdef ISC_NET_RECVOVERFLOW 1278 /* If needed, steal one iovec for overflow detection. 
 */
		maxiov--;
		/*
		 * NOTE(review): 'maxiov' is not declared anywhere in this
		 * function, so builds with ISC_NET_RECVOVERFLOW defined
		 * appear not to compile -- verify before enabling that
		 * option.
		 */
#endif
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O? Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) != 0)
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);

		isc_buffer_availableregion(buffer, &available);

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

 config:

	/*
	 * If needed, set up to receive that one extra byte. Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)
	/* Receive ancillary data (pktinfo/timestamp) for UDP sockets. */
	if (sock->type == isc_sockettype_udp) {
		msg->msg_control = sock->recvcmsgbuf;
		msg->msg_controllen = sock->recvcmsgbuflen;
	}
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
#endif /* ISC_NET_BSD44MSGHDR */

	if (read_countp != NULL)
		*read_countp = read_count;
}

/*
 * Record the remote address for a completed operation in 'dev'.
 * UDP: use the supplied address, or the socket's peer address when the
 * caller passed none. TCP: always the peer address; a caller-supplied
 * address is forbidden.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL)
			dev->address = *address;
		else
			dev->address = sock->peer_address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->peer_address;
	}
}

/*
 * Interposed destructor for socket events: assert that no buffers are
 * still linked to the event, then chain to the original destructor that
 * allocate_socketevent() saved in ev->destroy.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	INSIST(ISC_LIST_EMPTY(ev->bufferlist));

	(ev->destroy)(event);
}

/*
 * Allocate and initialize a socket event of 'eventtype' for 'sock',
 * to be delivered via 'action'/'arg'. Returns NULL on memory
 * exhaustion. The event's destructor is hooked with
 * destroy_socketevent() (the original is preserved in ev->destroy).
 */
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
		     isc_taskaction_t action, const void *arg)
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
						     sizeof(*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
	ISC_LINK_INIT(ev, ev_link);
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;

	return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
/* Debug-only: print the layout of a msghdr to stdout. */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%d\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
#endif

/* Result codes shared by doio_recv()/doio_send(). */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

/*
 * Issue a single recvmsg() for 'dev' on 'sock' and post-process the
 * result. Returns one of the DOIO_* codes above; on DOIO_HARD the
 * specific error is left in dev->result, and on DOIO_SUCCESS dev->n and
 * the attached buffers are updated.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	cc = recvmsg(sock->fd, &msghdr, 0);
	recv_errno = errno;	/* save before any call can clobber errno */

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_DOIORECV,
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * SOFT_OR_HARD: hard error on a connected socket (the peer is known to
 * be unreachable), soft (retryable) otherwise. ALWAYS_HARD: hard either
 * way.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
#ifdef EPROTO
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
#endif
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0))
		return (DOIO_EOF);

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Drop datagrams with source port 0 (cannot be replied to). */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_ZEROPORT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
		   "packet received correctly");

	/*
	 * Overflow bit detection. If we received MORE bytes than we should,
	 * this indicates an overflow situation. Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	/* Distribute the received bytes across the buffer list in order. */
	while (buffer != NULL && actual_count > 0U) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded. dev->result contains
 *			ISC_R_SUCCESS.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			dev->result contains the appropriate error.
 *
 *	DOIO_SOFT	A soft I/O error was encountered. No senddone
 *			event was sent. The operation should be retried.
 *
 *	No other return values are possible.
 */
static int
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);

 resend:
	cc = sendmsg(sock->fd, &msghdr, 0);
	send_errno = errno;	/* save before any call can clobber errno */

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry on EINTR, but only a bounded number of times. */
		if (send_errno == EINTR && ++attempts < NRETRIES)
			goto resend;

		if (SOFT_ERROR(send_errno))
			return (DOIO_SOFT);

/*
 * SOFT_OR_HARD: hard error on a connected socket, soft (retryable)
 * otherwise. ALWAYS_HARD: hard either way.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (send_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (send_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP. If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		isc__strerror(send_errno, strbuf, sizeof(strbuf));
		/*
		 * NOTE(review): the message says "internal_send" although
		 * we are in doio_send(); kept as-is for log compatibility.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() %s 0",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_RETURNED, "returned"));
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write. We're done with this
	 * entry. Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Kill.
 *
 * Caller must ensure that the socket is not locked and no external
 * references exist.
 */
static void
closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
	isc_sockettype_t type = sock->type;
	int lockid = FDLOCK_ID(fd);

	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&manager->fdlock[lockid]);
	manager->fds[fd] = NULL;
	/*
	 * fdwatch sockets are closed by their owner, so they go straight
	 * to CLOSED; everything else is closed later by the watcher, so
	 * mark CLOSE_PENDING.
	 */
	if (type == isc_sockettype_fdwatch)
		manager->fdstate[fd] = CLOSED;
	else
		manager->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&manager->fdlock[lockid]);
	if (type == isc_sockettype_fdwatch) {
		/*
		 * The caller may close the socket once this function returns,
		 * and `fd' may be reassigned for a new socket. So we do
		 * unwatch_fd() here, rather than defer it via select_poke().
		 * Note: this may complicate data protection among threads and
		 * may reduce performance due to additional locks. One way to
		 * solve this would be to dup() the watched descriptor, but we
		 * take a simpler approach at this moment.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
	} else
		select_poke(manager, fd, SELECT_POKE_CLOSE);

	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&manager->lock);
	if (manager->maxfd == fd) {
		int i;

		manager->maxfd = 0;
		/* Scan downward for the next-highest managed descriptor. */
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&manager->fdlock[lockid]);
			if (manager->fdstate[i] == MANAGED) {
				manager->maxfd = i;
				UNLOCK(&manager->fdlock[lockid]);
				break;
			}
			UNLOCK(&manager->fdlock[lockid]);
		}
#ifdef ISC_PLATFORM_USETHREADS
		/* The watcher's wakeup pipe must always stay in the set. */
		if (manager->maxfd < manager->pipe_fds[0])
			manager->maxfd = manager->pipe_fds[0];
#endif
	}
	UNLOCK(&manager->lock);
#endif	/* USE_SELECT */
}

/*
 * Fully tear down a socket: close its descriptor (if still open),
 * unlink it from the manager, and free it. Caller must ensure there
 * are no remaining external references or pending events.
 */
static void
destroy(isc_socket_t **sockp) {
	int fd;
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DESTROYING, "destroying");

	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		/* Mark the fd invalid before handing it to closesocket(). */
		fd = sock->fd;
		sock->fd = -1;
		closesocket(manager, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

#ifdef ISC_PLATFORM_USETHREADS
	/* Wake a shutting-down manager waiting for the list to drain. */
	if (ISC_LIST_EMPTY(manager->socklist))
		SIGNAL(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */

	UNLOCK(&manager->lock);

	free_socket(sockp);
}

/*
 * Allocate a new isc_socket_t structure and initialize all of its
 * bookkeeping fields (no file descriptor is opened here; see
 * opensocket()). Returns ISC_R_NOMEMORY on allocation failure.
 */
static isc_result_t
allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
		isc_socket_t **socketp)
{
	isc_socket_t *sock;
	isc_result_t result;
	ISC_SOCKADDR_LEN_T cmsgbuflen;

	sock = isc_mem_get(manager->mctx, sizeof(*sock));

	if (sock == NULL)
		return (ISC_R_NOMEMORY);

	result = ISC_R_UNEXPECTED;

	sock->magic = 0;
	sock->references = 0;

	sock->manager = manager;
	sock->type = type;
	sock->fd = -1;
	sock->statsindex = NULL;

	ISC_LINK_INIT(sock, link);

	sock->recvcmsgbuf = NULL;
	sock->sendcmsgbuf = NULL;

	/*
	 * set up cmsg buffers
	 */
	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
#endif
#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
	cmsgbuflen += cmsg_space(sizeof(struct timeval));
#endif
	sock->recvcmsgbuflen = cmsgbuflen;
	if (sock->recvcmsgbuflen != 0U) {
		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->recvcmsgbuf == NULL)
			goto error;
	}

	/* The send side only ever carries pktinfo (no timestamp). */
	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
#endif
	sock->sendcmsgbuflen = cmsgbuflen;
	if (sock->sendcmsgbuflen != 0U) {
		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->sendcmsgbuf == NULL)
			goto error;
	}

	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;

	/*
	 * set up list of readers and writers to be initially empty
	 */
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
	ISC_LIST_INIT(sock->accept_list);
	sock->connect_ev = NULL;
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;

	/*
	 * initialize the lock
	 */
	result = isc_mutex_init(&sock->lock);
	if (result != ISC_R_SUCCESS) {
		sock->magic = 0;
		goto error;
	}

	/*
	 * Initialize readable and writable events
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
		       NULL, sock, sock, NULL, NULL);
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
		       NULL, sock, sock, NULL, NULL);

	sock->magic = SOCKET_MAGIC;
	*socketp = sock;

	return (ISC_R_SUCCESS);

 error:
	/* Unified failure path: release whatever was allocated so far. */
	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);
	isc_mem_put(manager->mctx, sock, sizeof(*sock));

	return (result);
}

/*
 * This event requires that the various lists be empty, that the reference
 * count be 1, and that the magic number is valid. The other socket bits,
 * like the lock, must be initialized as well. The fd associated must be
 * marked as closed, by setting it to -1 on close, or this routine will
 * also close the socket.
 */
static void
free_socket(isc_socket_t **socketp) {
	isc_socket_t *sock = *socketp;

	/* Sanity-check that the socket is truly quiescent before freeing. */
	INSIST(sock->references == 0);
	INSIST(VALID_SOCKET(sock));
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(!ISC_LINK_LINKED(sock, link));

	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);

	sock->magic = 0;

	DESTROYLOCK(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));

	*socketp = NULL;
}

#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do. Having to workout
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this a build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMAT because some kernels require it.
 */

/* One-time runtime probe: is SO_BSDCOMPAT needed on this kernel? */
static isc_once_t bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname buf;
	char *endp;
	long int major;
	long int minor;

	uname(&buf);	/* Can only fail if buf is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname().
	 */
	/* Linux 2.4 and later do not need (and warn about) SO_BSDCOMPAT. */
	major = strtol(buf.release, &endp, 10);
	if (*endp == '.') {
		minor = strtol(endp+1, &endp, 10);
		if ((major > 2) || ((major == 2) && (minor >= 4))) {
			bsdcompat = ISC_FALSE;
		}
	}
#endif /* __linux __ */
}
#endif

/*
 * Open the underlying OS socket for 'sock' and apply all of the socket
 * options this module relies on (non-blocking mode, SO_BSDCOMPAT,
 * SO_NOSIGPIPE, timestamp/pktinfo ancillary data, PMTU discovery
 * settings, receive buffer size). On success sock->fd is valid and
 * ISC_R_SUCCESS is returned; failures map to ISC_R_NORESOURCES,
 * ISC_R_FAMILYNOSUPPORT or ISC_R_UNEXPECTED.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif

 again:
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	case isc_sockettype_unix:
		sock->fd = socket(sock->pf, SOCK_STREAM, 0);
		break;
	case isc_sockettype_fdwatch:
		/*
		 * We should not be called for isc_sockettype_fdwatch sockets.
		 */
		INSIST(0);
		break;
	}
	/* Retry on EINTR, with an arbitrary bound to avoid spinning. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
		goto again;

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno;	/* preserve fcntl()'s errno across close() */
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl";
	}
#endif

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			       isc_msgcat, ISC_MSGSET_SOCKET,
			       ISC_MSG_TOOMANYFDS,
			       "socket: file descriptor exceeds limit (%d/%u)",
			       sock->fd, manager->maxsocks);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (ISC_R_UNEXPECTED);
	}

#ifdef SO_BSDCOMPAT
	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
				  clear_bsdcompat) == ISC_R_SUCCESS);
	if (sock->type != isc_sockettype_unix && bsdcompat &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#ifdef SO_NOSIGPIPE
	/* Avoid SIGPIPE on writes to a closed peer (BSD systems). */
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#if defined(USE_CMSG) || defined(SO_RCVBUF)
	if (sock->type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* Ask the kernel for receive timestamps (SCM_TIMESTAMP). */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
			       (void *)&on, sizeof(on)) < 0
		    && errno != ENOPROTOOPT) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#if defined(ISC_PLATFORM_HAVEIPV6)
		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
			/*
			 * Warn explicitly because this anomaly can be hidden
			 * in usual operation (and unexpectedly appear later).
			 */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "No buffer available to receive "
					 "IPv6 destination");
		}
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* RFC 2292 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
#ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
		/* use minimum MTU */
		if (sock->pf == AF_INET6) {
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_USE_MIN_MTU,
					 (void *)&on, sizeof(on));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int action = IP_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
					 &action, sizeof(action));
		}
#endif
#if defined(IP_DONTFRAG)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int off = 0;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
					 &off, sizeof(off));
		}
#endif

#if defined(SO_RCVBUF)
		/* Grow the kernel receive buffer to at least RCVBUFSIZE. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			       (void *)&size, &optlen) >= 0 &&
		    size < RCVBUFSIZE) {
			size = RCVBUFSIZE;
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&size, sizeof(size)) == -1) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
					sock->fd, size,
					isc_msgcat_get(isc_msgcat,
						       ISC_MSGSET_GENERAL,
						       ISC_MSG_FAILED,
						       "failed"),
					strbuf);
			}
		}
#endif
	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);

	return (ISC_R_SUCCESS);
}

/*%
 * Create a new 'type' socket managed by 'manager'. Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value. The new socket is returned
 * in 'socketp'.
2282 */ 2283isc_result_t 2284isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, 2285 isc_socket_t **socketp) 2286{ 2287 isc_socket_t *sock = NULL; 2288 isc_result_t result; 2289 int lockid; 2290 2291 REQUIRE(VALID_MANAGER(manager)); 2292 REQUIRE(socketp != NULL && *socketp == NULL); 2293 REQUIRE(type != isc_sockettype_fdwatch); 2294 2295 result = allocate_socket(manager, type, &sock); 2296 if (result != ISC_R_SUCCESS) 2297 return (result); 2298 2299 switch (sock->type) { 2300 case isc_sockettype_udp: 2301 sock->statsindex = 2302 (pf == AF_INET) ? upd4statsindex : upd6statsindex; 2303 break; 2304 case isc_sockettype_tcp: 2305 sock->statsindex = 2306 (pf == AF_INET) ? tcp4statsindex : tcp6statsindex; 2307 break; 2308 case isc_sockettype_unix: 2309 sock->statsindex = unixstatsindex; 2310 break; 2311 default: 2312 INSIST(0); 2313 } 2314 2315 sock->pf = pf; 2316 result = opensocket(manager, sock); 2317 if (result != ISC_R_SUCCESS) { 2318 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]); 2319 free_socket(&sock); 2320 return (result); 2321 } 2322 2323 sock->references = 1; 2324 *socketp = sock; 2325 2326 /* 2327 * Note we don't have to lock the socket like we normally would because 2328 * there are no external references to it yet. 
2329 */ 2330 2331 lockid = FDLOCK_ID(sock->fd); 2332 LOCK(&manager->fdlock[lockid]); 2333 manager->fds[sock->fd] = sock; 2334 manager->fdstate[sock->fd] = MANAGED; 2335#ifdef USE_DEVPOLL 2336 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && 2337 sock->manager->fdpollinfo[sock->fd].want_write == 0); 2338#endif 2339 UNLOCK(&manager->fdlock[lockid]); 2340 2341 LOCK(&manager->lock); 2342 ISC_LIST_APPEND(manager->socklist, sock, link); 2343#ifdef USE_SELECT 2344 if (manager->maxfd < sock->fd) 2345 manager->maxfd = sock->fd; 2346#endif 2347 UNLOCK(&manager->lock); 2348 2349 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 2350 ISC_MSG_CREATED, "created"); 2351 2352 return (ISC_R_SUCCESS); 2353} 2354 2355isc_result_t 2356isc_socket_open(isc_socket_t *sock) { 2357 isc_result_t result; 2358 2359 REQUIRE(VALID_SOCKET(sock)); 2360 2361 LOCK(&sock->lock); 2362 REQUIRE(sock->references == 1); 2363 REQUIRE(sock->type != isc_sockettype_fdwatch); 2364 UNLOCK(&sock->lock); 2365 /* 2366 * We don't need to retain the lock hereafter, since no one else has 2367 * this socket. 2368 */ 2369 REQUIRE(sock->fd == -1); 2370 2371 result = opensocket(sock->manager, sock); 2372 if (result != ISC_R_SUCCESS) 2373 sock->fd = -1; 2374 2375 if (result == ISC_R_SUCCESS) { 2376 int lockid = FDLOCK_ID(sock->fd); 2377 2378 LOCK(&sock->manager->fdlock[lockid]); 2379 sock->manager->fds[sock->fd] = sock; 2380 sock->manager->fdstate[sock->fd] = MANAGED; 2381#ifdef USE_DEVPOLL 2382 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && 2383 sock->manager->fdpollinfo[sock->fd].want_write == 0); 2384#endif 2385 UNLOCK(&sock->manager->fdlock[lockid]); 2386 2387#ifdef USE_SELECT 2388 LOCK(&sock->manager->lock); 2389 if (sock->manager->maxfd < sock->fd) 2390 sock->manager->maxfd = sock->fd; 2391 UNLOCK(&sock->manager->lock); 2392#endif 2393 } 2394 2395 return (result); 2396} 2397 2398/* 2399 * Create a new 'type' socket managed by 'manager'. 
Events 2400 * will be posted to 'task' and when dispatched 'action' will be 2401 * called with 'arg' as the arg value. The new socket is returned 2402 * in 'socketp'. 2403 */ 2404isc_result_t 2405isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags, 2406 isc_sockfdwatch_t callback, void *cbarg, 2407 isc_task_t *task, isc_socket_t **socketp) 2408{ 2409 isc_socket_t *sock = NULL; 2410 isc_result_t result; 2411 int lockid; 2412 2413 REQUIRE(VALID_MANAGER(manager)); 2414 REQUIRE(socketp != NULL && *socketp == NULL); 2415 2416 result = allocate_socket(manager, isc_sockettype_fdwatch, &sock); 2417 if (result != ISC_R_SUCCESS) 2418 return (result); 2419 2420 sock->fd = fd; 2421 sock->fdwatcharg = cbarg; 2422 sock->fdwatchcb = callback; 2423 sock->fdwatchflags = flags; 2424 sock->fdwatchtask = task; 2425 sock->statsindex = fdwatchstatsindex; 2426 2427 sock->references = 1; 2428 *socketp = sock; 2429 2430 /* 2431 * Note we don't have to lock the socket like we normally would because 2432 * there are no external references to it yet. 2433 */ 2434 2435 lockid = FDLOCK_ID(sock->fd); 2436 LOCK(&manager->fdlock[lockid]); 2437 manager->fds[sock->fd] = sock; 2438 manager->fdstate[sock->fd] = MANAGED; 2439 UNLOCK(&manager->fdlock[lockid]); 2440 2441 LOCK(&manager->lock); 2442 ISC_LIST_APPEND(manager->socklist, sock, link); 2443#ifdef USE_SELECT 2444 if (manager->maxfd < sock->fd) 2445 manager->maxfd = sock->fd; 2446#endif 2447 UNLOCK(&manager->lock); 2448 2449 if (flags & ISC_SOCKFDWATCH_READ) 2450 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 2451 if (flags & ISC_SOCKFDWATCH_WRITE) 2452 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 2453 2454 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 2455 ISC_MSG_CREATED, "fdwatch-created"); 2456 2457 return (ISC_R_SUCCESS); 2458} 2459 2460/* 2461 * Attach to a socket. Caller must explicitly detach when it is done. 
2462 */ 2463void 2464isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) { 2465 REQUIRE(VALID_SOCKET(sock)); 2466 REQUIRE(socketp != NULL && *socketp == NULL); 2467 2468 LOCK(&sock->lock); 2469 sock->references++; 2470 UNLOCK(&sock->lock); 2471 2472 *socketp = sock; 2473} 2474 2475/* 2476 * Dereference a socket. If this is the last reference to it, clean things 2477 * up by destroying the socket. 2478 */ 2479void 2480isc_socket_detach(isc_socket_t **socketp) { 2481 isc_socket_t *sock; 2482 isc_boolean_t kill_socket = ISC_FALSE; 2483 2484 REQUIRE(socketp != NULL); 2485 sock = *socketp; 2486 REQUIRE(VALID_SOCKET(sock)); 2487 2488 LOCK(&sock->lock); 2489 REQUIRE(sock->references > 0); 2490 sock->references--; 2491 if (sock->references == 0) 2492 kill_socket = ISC_TRUE; 2493 UNLOCK(&sock->lock); 2494 2495 if (kill_socket) 2496 destroy(&sock); 2497 2498 *socketp = NULL; 2499} 2500 2501isc_result_t 2502isc_socket_close(isc_socket_t *sock) { 2503 int fd; 2504 isc_socketmgr_t *manager; 2505 isc_sockettype_t type; 2506 2507 REQUIRE(VALID_SOCKET(sock)); 2508 2509 LOCK(&sock->lock); 2510 2511 REQUIRE(sock->references == 1); 2512 REQUIRE(sock->type != isc_sockettype_fdwatch); 2513 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); 2514 2515 INSIST(!sock->connecting); 2516 INSIST(!sock->pending_recv); 2517 INSIST(!sock->pending_send); 2518 INSIST(!sock->pending_accept); 2519 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 2520 INSIST(ISC_LIST_EMPTY(sock->send_list)); 2521 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 2522 INSIST(sock->connect_ev == NULL); 2523 2524 manager = sock->manager; 2525 type = sock->type; 2526 fd = sock->fd; 2527 sock->fd = -1; 2528 memset(sock->name, 0, sizeof(sock->name)); 2529 sock->tag = NULL; 2530 sock->listener = 0; 2531 sock->connected = 0; 2532 sock->connecting = 0; 2533 sock->bound = 0; 2534 isc_sockaddr_any(&sock->peer_address); 2535 2536 UNLOCK(&sock->lock); 2537 2538 closesocket(manager, sock, fd); 2539 2540 return 
(ISC_R_SUCCESS); 2541} 2542 2543/* 2544 * I/O is possible on a given socket. Schedule an event to this task that 2545 * will call an internal function to do the I/O. This will charge the 2546 * task with the I/O operation and let our select loop handler get back 2547 * to doing something real as fast as possible. 2548 * 2549 * The socket and manager must be locked before calling this function. 2550 */ 2551static void 2552dispatch_recv(isc_socket_t *sock) { 2553 intev_t *iev; 2554 isc_socketevent_t *ev; 2555 isc_task_t *sender; 2556 2557 INSIST(!sock->pending_recv); 2558 2559 if (sock->type != isc_sockettype_fdwatch) { 2560 ev = ISC_LIST_HEAD(sock->recv_list); 2561 if (ev == NULL) 2562 return; 2563 socket_log(sock, NULL, EVENT, NULL, 0, 0, 2564 "dispatch_recv: event %p -> task %p", 2565 ev, ev->ev_sender); 2566 sender = ev->ev_sender; 2567 } else { 2568 sender = sock->fdwatchtask; 2569 } 2570 2571 sock->pending_recv = 1; 2572 iev = &sock->readable_ev; 2573 2574 sock->references++; 2575 iev->ev_sender = sock; 2576 if (sock->type == isc_sockettype_fdwatch) 2577 iev->ev_action = internal_fdwatch_read; 2578 else 2579 iev->ev_action = internal_recv; 2580 iev->ev_arg = sock; 2581 2582 isc_task_send(sender, (isc_event_t **)&iev); 2583} 2584 2585static void 2586dispatch_send(isc_socket_t *sock) { 2587 intev_t *iev; 2588 isc_socketevent_t *ev; 2589 isc_task_t *sender; 2590 2591 INSIST(!sock->pending_send); 2592 2593 if (sock->type != isc_sockettype_fdwatch) { 2594 ev = ISC_LIST_HEAD(sock->send_list); 2595 if (ev == NULL) 2596 return; 2597 socket_log(sock, NULL, EVENT, NULL, 0, 0, 2598 "dispatch_send: event %p -> task %p", 2599 ev, ev->ev_sender); 2600 sender = ev->ev_sender; 2601 } else { 2602 sender = sock->fdwatchtask; 2603 } 2604 2605 sock->pending_send = 1; 2606 iev = &sock->writable_ev; 2607 2608 sock->references++; 2609 iev->ev_sender = sock; 2610 if (sock->type == isc_sockettype_fdwatch) 2611 iev->ev_action = internal_fdwatch_write; 2612 else 2613 iev->ev_action = 
internal_send; 2614 iev->ev_arg = sock; 2615 2616 isc_task_send(sender, (isc_event_t **)&iev); 2617} 2618 2619/* 2620 * Dispatch an internal accept event. 2621 */ 2622static void 2623dispatch_accept(isc_socket_t *sock) { 2624 intev_t *iev; 2625 isc_socket_newconnev_t *ev; 2626 2627 INSIST(!sock->pending_accept); 2628 2629 /* 2630 * Are there any done events left, or were they all canceled 2631 * before the manager got the socket lock? 2632 */ 2633 ev = ISC_LIST_HEAD(sock->accept_list); 2634 if (ev == NULL) 2635 return; 2636 2637 sock->pending_accept = 1; 2638 iev = &sock->readable_ev; 2639 2640 sock->references++; /* keep socket around for this internal event */ 2641 iev->ev_sender = sock; 2642 iev->ev_action = internal_accept; 2643 iev->ev_arg = sock; 2644 2645 isc_task_send(ev->ev_sender, (isc_event_t **)&iev); 2646} 2647 2648static void 2649dispatch_connect(isc_socket_t *sock) { 2650 intev_t *iev; 2651 isc_socket_connev_t *ev; 2652 2653 iev = &sock->writable_ev; 2654 2655 ev = sock->connect_ev; 2656 INSIST(ev != NULL); /* XXX */ 2657 2658 INSIST(sock->connecting); 2659 2660 sock->references++; /* keep socket around for this internal event */ 2661 iev->ev_sender = sock; 2662 iev->ev_action = internal_connect; 2663 iev->ev_arg = sock; 2664 2665 isc_task_send(ev->ev_sender, (isc_event_t **)&iev); 2666} 2667 2668/* 2669 * Dequeue an item off the given socket's read queue, set the result code 2670 * in the done event to the one provided, and send it to the task it was 2671 * destined for. 2672 * 2673 * If the event to be sent is on a list, remove it before sending. If 2674 * asked to, send and detach from the socket as well. 2675 * 2676 * Caller must have the socket locked if the event is attached to the socket. 
2677 */ 2678static void 2679send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) { 2680 isc_task_t *task; 2681 2682 task = (*dev)->ev_sender; 2683 2684 (*dev)->ev_sender = sock; 2685 2686 if (ISC_LINK_LINKED(*dev, ev_link)) 2687 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link); 2688 2689 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 2690 == ISC_SOCKEVENTATTR_ATTACHED) 2691 isc_task_sendanddetach(&task, (isc_event_t **)dev); 2692 else 2693 isc_task_send(task, (isc_event_t **)dev); 2694} 2695 2696/* 2697 * See comments for send_recvdone_event() above. 2698 * 2699 * Caller must have the socket locked if the event is attached to the socket. 2700 */ 2701static void 2702send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) { 2703 isc_task_t *task; 2704 2705 INSIST(dev != NULL && *dev != NULL); 2706 2707 task = (*dev)->ev_sender; 2708 (*dev)->ev_sender = sock; 2709 2710 if (ISC_LINK_LINKED(*dev, ev_link)) 2711 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link); 2712 2713 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 2714 == ISC_SOCKEVENTATTR_ATTACHED) 2715 isc_task_sendanddetach(&task, (isc_event_t **)dev); 2716 else 2717 isc_task_send(task, (isc_event_t **)dev); 2718} 2719 2720/* 2721 * Call accept() on a socket, to get the new file descriptor. The listen 2722 * socket is used as a prototype to create a new isc_socket_t. The new 2723 * socket has one outstanding reference. The task receiving the event 2724 * will be detached from just after the event is delivered. 2725 * 2726 * On entry to this function, the event delivered is the internal 2727 * readable event, and the first item on the accept_list should be 2728 * the done event we want to send. If the list is empty, this is a no-op, 2729 * so just unlock and return. 
2730 */ 2731static void 2732internal_accept(isc_task_t *me, isc_event_t *ev) { 2733 isc_socket_t *sock; 2734 isc_socketmgr_t *manager; 2735 isc_socket_newconnev_t *dev; 2736 isc_task_t *task; 2737 ISC_SOCKADDR_LEN_T addrlen; 2738 int fd; 2739 isc_result_t result = ISC_R_SUCCESS; 2740 char strbuf[ISC_STRERRORSIZE]; 2741 const char *err = "accept"; 2742 2743 UNUSED(me); 2744 2745 sock = ev->ev_sender; 2746 INSIST(VALID_SOCKET(sock)); 2747 2748 LOCK(&sock->lock); 2749 socket_log(sock, NULL, TRACE, 2750 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK, 2751 "internal_accept called, locked socket"); 2752 2753 manager = sock->manager; 2754 INSIST(VALID_MANAGER(manager)); 2755 2756 INSIST(sock->listener); 2757 INSIST(sock->pending_accept == 1); 2758 sock->pending_accept = 0; 2759 2760 INSIST(sock->references > 0); 2761 sock->references--; /* the internal event is done with this socket */ 2762 if (sock->references == 0) { 2763 UNLOCK(&sock->lock); 2764 destroy(&sock); 2765 return; 2766 } 2767 2768 /* 2769 * Get the first item off the accept list. 2770 * If it is empty, unlock the socket and return. 2771 */ 2772 dev = ISC_LIST_HEAD(sock->accept_list); 2773 if (dev == NULL) { 2774 UNLOCK(&sock->lock); 2775 return; 2776 } 2777 2778 /* 2779 * Try to accept the new connection. If the accept fails with 2780 * EAGAIN or EINTR, simply poke the watcher to watch this socket 2781 * again. Also ignore ECONNRESET, which has been reported to 2782 * be spuriously returned on Linux 2.2.19 although it is not 2783 * a documented error for accept(). ECONNABORTED has been 2784 * reported for Solaris 8. The rest are thrown in not because 2785 * we have seen them but because they are ignored by other 2786 * daemons such as BIND 8 and Apache. 
2787 */ 2788 2789 addrlen = sizeof(dev->newsocket->peer_address.type); 2790 memset(&dev->newsocket->peer_address.type, 0, addrlen); 2791 fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa, 2792 (void *)&addrlen); 2793 2794#ifdef F_DUPFD 2795 /* 2796 * Leave a space for stdio to work in. 2797 */ 2798 if (fd >= 0 && fd < 20) { 2799 int new, tmp; 2800 new = fcntl(fd, F_DUPFD, 20); 2801 tmp = errno; 2802 (void)close(fd); 2803 errno = tmp; 2804 fd = new; 2805 err = "accept/fcntl"; 2806 } 2807#endif 2808 2809 if (fd < 0) { 2810 if (SOFT_ERROR(errno)) 2811 goto soft_error; 2812 switch (errno) { 2813 case ENFILE: 2814 case EMFILE: 2815 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2816 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2817 isc_msgcat, ISC_MSGSET_SOCKET, 2818 ISC_MSG_TOOMANYFDS, 2819 "%s: too many open file descriptors", 2820 err); 2821 goto soft_error; 2822 2823 case ENOBUFS: 2824 case ENOMEM: 2825 case ECONNRESET: 2826 case ECONNABORTED: 2827 case EHOSTUNREACH: 2828 case EHOSTDOWN: 2829 case ENETUNREACH: 2830 case ENETDOWN: 2831 case ECONNREFUSED: 2832#ifdef EPROTO 2833 case EPROTO: 2834#endif 2835#ifdef ENONET 2836 case ENONET: 2837#endif 2838 goto soft_error; 2839 default: 2840 break; 2841 } 2842 isc__strerror(errno, strbuf, sizeof(strbuf)); 2843 UNEXPECTED_ERROR(__FILE__, __LINE__, 2844 "internal_accept: %s() %s: %s", err, 2845 isc_msgcat_get(isc_msgcat, 2846 ISC_MSGSET_GENERAL, 2847 ISC_MSG_FAILED, 2848 "failed"), 2849 strbuf); 2850 fd = -1; 2851 result = ISC_R_UNEXPECTED; 2852 } else { 2853 if (addrlen == 0U) { 2854 UNEXPECTED_ERROR(__FILE__, __LINE__, 2855 "internal_accept(): " 2856 "accept() failed to return " 2857 "remote address"); 2858 2859 (void)close(fd); 2860 goto soft_error; 2861 } else if (dev->newsocket->peer_address.type.sa.sa_family != 2862 sock->pf) 2863 { 2864 UNEXPECTED_ERROR(__FILE__, __LINE__, 2865 "internal_accept(): " 2866 "accept() returned peer address " 2867 "family %u (expected %u)", 2868 dev->newsocket->peer_address. 
2869 type.sa.sa_family, 2870 sock->pf); 2871 (void)close(fd); 2872 goto soft_error; 2873 } else if (fd >= (int)manager->maxsocks) { 2874 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2875 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2876 isc_msgcat, ISC_MSGSET_SOCKET, 2877 ISC_MSG_TOOMANYFDS, 2878 "accept: " 2879 "file descriptor exceeds limit (%d/%u)", 2880 fd, manager->maxsocks); 2881 (void)close(fd); 2882 goto soft_error; 2883 } 2884 } 2885 2886 if (fd != -1) { 2887 dev->newsocket->peer_address.length = addrlen; 2888 dev->newsocket->pf = sock->pf; 2889 } 2890 2891 /* 2892 * Pull off the done event. 2893 */ 2894 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); 2895 2896 /* 2897 * Poke watcher if there are more pending accepts. 2898 */ 2899 if (!ISC_LIST_EMPTY(sock->accept_list)) 2900 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 2901 2902 UNLOCK(&sock->lock); 2903 2904 if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) { 2905 (void)close(fd); 2906 fd = -1; 2907 result = ISC_R_UNEXPECTED; 2908 } 2909 2910 /* 2911 * -1 means the new socket didn't happen. 
2912 */ 2913 if (fd != -1) { 2914 int lockid = FDLOCK_ID(fd); 2915 2916 LOCK(&manager->fdlock[lockid]); 2917 manager->fds[fd] = dev->newsocket; 2918 manager->fdstate[fd] = MANAGED; 2919 UNLOCK(&manager->fdlock[lockid]); 2920 2921 LOCK(&manager->lock); 2922 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link); 2923 2924 dev->newsocket->fd = fd; 2925 dev->newsocket->bound = 1; 2926 dev->newsocket->connected = 1; 2927 2928 /* 2929 * Save away the remote address 2930 */ 2931 dev->address = dev->newsocket->peer_address; 2932 2933#ifdef USE_SELECT 2934 if (manager->maxfd < fd) 2935 manager->maxfd = fd; 2936#endif 2937 2938 socket_log(sock, &dev->newsocket->peer_address, CREATION, 2939 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, 2940 "accepted connection, new socket %p", 2941 dev->newsocket); 2942 2943 UNLOCK(&manager->lock); 2944 2945 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]); 2946 } else { 2947 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 2948 dev->newsocket->references--; 2949 free_socket(&dev->newsocket); 2950 } 2951 2952 /* 2953 * Fill in the done event details and send it off. 
2954 */ 2955 dev->result = result; 2956 task = dev->ev_sender; 2957 dev->ev_sender = sock; 2958 2959 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 2960 return; 2961 2962 soft_error: 2963 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 2964 UNLOCK(&sock->lock); 2965 2966 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 2967 return; 2968} 2969 2970static void 2971internal_recv(isc_task_t *me, isc_event_t *ev) { 2972 isc_socketevent_t *dev; 2973 isc_socket_t *sock; 2974 2975 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 2976 2977 sock = ev->ev_sender; 2978 INSIST(VALID_SOCKET(sock)); 2979 2980 LOCK(&sock->lock); 2981 socket_log(sock, NULL, IOEVENT, 2982 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 2983 "internal_recv: task %p got event %p", me, ev); 2984 2985 INSIST(sock->pending_recv == 1); 2986 sock->pending_recv = 0; 2987 2988 INSIST(sock->references > 0); 2989 sock->references--; /* the internal event is done with this socket */ 2990 if (sock->references == 0) { 2991 UNLOCK(&sock->lock); 2992 destroy(&sock); 2993 return; 2994 } 2995 2996 /* 2997 * Try to do as much I/O as possible on this socket. There are no 2998 * limits here, currently. 2999 */ 3000 dev = ISC_LIST_HEAD(sock->recv_list); 3001 while (dev != NULL) { 3002 switch (doio_recv(sock, dev)) { 3003 case DOIO_SOFT: 3004 goto poke; 3005 3006 case DOIO_EOF: 3007 /* 3008 * read of 0 means the remote end was closed. 3009 * Run through the event queue and dispatch all 3010 * the events with an EOF result code. 
3011 */ 3012 do { 3013 dev->result = ISC_R_EOF; 3014 send_recvdone_event(sock, &dev); 3015 dev = ISC_LIST_HEAD(sock->recv_list); 3016 } while (dev != NULL); 3017 goto poke; 3018 3019 case DOIO_SUCCESS: 3020 case DOIO_HARD: 3021 send_recvdone_event(sock, &dev); 3022 break; 3023 } 3024 3025 dev = ISC_LIST_HEAD(sock->recv_list); 3026 } 3027 3028 poke: 3029 if (!ISC_LIST_EMPTY(sock->recv_list)) 3030 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3031 3032 UNLOCK(&sock->lock); 3033} 3034 3035static void 3036internal_send(isc_task_t *me, isc_event_t *ev) { 3037 isc_socketevent_t *dev; 3038 isc_socket_t *sock; 3039 3040 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3041 3042 /* 3043 * Find out what socket this is and lock it. 3044 */ 3045 sock = (isc_socket_t *)ev->ev_sender; 3046 INSIST(VALID_SOCKET(sock)); 3047 3048 LOCK(&sock->lock); 3049 socket_log(sock, NULL, IOEVENT, 3050 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3051 "internal_send: task %p got event %p", me, ev); 3052 3053 INSIST(sock->pending_send == 1); 3054 sock->pending_send = 0; 3055 3056 INSIST(sock->references > 0); 3057 sock->references--; /* the internal event is done with this socket */ 3058 if (sock->references == 0) { 3059 UNLOCK(&sock->lock); 3060 destroy(&sock); 3061 return; 3062 } 3063 3064 /* 3065 * Try to do as much I/O as possible on this socket. There are no 3066 * limits here, currently. 
3067 */ 3068 dev = ISC_LIST_HEAD(sock->send_list); 3069 while (dev != NULL) { 3070 switch (doio_send(sock, dev)) { 3071 case DOIO_SOFT: 3072 goto poke; 3073 3074 case DOIO_HARD: 3075 case DOIO_SUCCESS: 3076 send_senddone_event(sock, &dev); 3077 break; 3078 } 3079 3080 dev = ISC_LIST_HEAD(sock->send_list); 3081 } 3082 3083 poke: 3084 if (!ISC_LIST_EMPTY(sock->send_list)) 3085 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3086 3087 UNLOCK(&sock->lock); 3088} 3089 3090static void 3091internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) { 3092 isc_socket_t *sock; 3093 int more_data; 3094 3095 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3096 3097 /* 3098 * Find out what socket this is and lock it. 3099 */ 3100 sock = (isc_socket_t *)ev->ev_sender; 3101 INSIST(VALID_SOCKET(sock)); 3102 3103 LOCK(&sock->lock); 3104 socket_log(sock, NULL, IOEVENT, 3105 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3106 "internal_fdwatch_write: task %p got event %p", me, ev); 3107 3108 INSIST(sock->pending_send == 1); 3109 3110 UNLOCK(&sock->lock); 3111 more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg); 3112 LOCK(&sock->lock); 3113 3114 sock->pending_send = 0; 3115 3116 INSIST(sock->references > 0); 3117 sock->references--; /* the internal event is done with this socket */ 3118 if (sock->references == 0) { 3119 UNLOCK(&sock->lock); 3120 destroy(&sock); 3121 return; 3122 } 3123 3124 if (more_data) 3125 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3126 3127 UNLOCK(&sock->lock); 3128} 3129 3130static void 3131internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) { 3132 isc_socket_t *sock; 3133 int more_data; 3134 3135 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 3136 3137 /* 3138 * Find out what socket this is and lock it. 
3139 */ 3140 sock = (isc_socket_t *)ev->ev_sender; 3141 INSIST(VALID_SOCKET(sock)); 3142 3143 LOCK(&sock->lock); 3144 socket_log(sock, NULL, IOEVENT, 3145 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3146 "internal_fdwatch_read: task %p got event %p", me, ev); 3147 3148 INSIST(sock->pending_recv == 1); 3149 3150 UNLOCK(&sock->lock); 3151 more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg); 3152 LOCK(&sock->lock); 3153 3154 sock->pending_recv = 0; 3155 3156 INSIST(sock->references > 0); 3157 sock->references--; /* the internal event is done with this socket */ 3158 if (sock->references == 0) { 3159 UNLOCK(&sock->lock); 3160 destroy(&sock); 3161 return; 3162 } 3163 3164 if (more_data) 3165 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3166 3167 UNLOCK(&sock->lock); 3168} 3169 3170/* 3171 * Process read/writes on each fd here. Avoid locking 3172 * and unlocking twice if both reads and writes are possible. 3173 */ 3174static void 3175process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable, 3176 isc_boolean_t writeable) 3177{ 3178 isc_socket_t *sock; 3179 isc_boolean_t unlock_sock; 3180 isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE; 3181 int lockid = FDLOCK_ID(fd); 3182 3183 /* 3184 * If the socket is going to be closed, don't do more I/O. 
3185 */ 3186 LOCK(&manager->fdlock[lockid]); 3187 if (manager->fdstate[fd] == CLOSE_PENDING) { 3188 UNLOCK(&manager->fdlock[lockid]); 3189 3190 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3191 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3192 return; 3193 } 3194 3195 sock = manager->fds[fd]; 3196 unlock_sock = ISC_FALSE; 3197 if (readable) { 3198 if (sock == NULL) { 3199 unwatch_read = ISC_TRUE; 3200 goto check_write; 3201 } 3202 unlock_sock = ISC_TRUE; 3203 LOCK(&sock->lock); 3204 if (!SOCK_DEAD(sock)) { 3205 if (sock->listener) 3206 dispatch_accept(sock); 3207 else 3208 dispatch_recv(sock); 3209 } 3210 unwatch_read = ISC_TRUE; 3211 } 3212check_write: 3213 if (writeable) { 3214 if (sock == NULL) { 3215 unwatch_write = ISC_TRUE; 3216 goto unlock_fd; 3217 } 3218 if (!unlock_sock) { 3219 unlock_sock = ISC_TRUE; 3220 LOCK(&sock->lock); 3221 } 3222 if (!SOCK_DEAD(sock)) { 3223 if (sock->connecting) 3224 dispatch_connect(sock); 3225 else 3226 dispatch_send(sock); 3227 } 3228 unwatch_write = ISC_TRUE; 3229 } 3230 if (unlock_sock) 3231 UNLOCK(&sock->lock); 3232 3233 unlock_fd: 3234 UNLOCK(&manager->fdlock[lockid]); 3235 if (unwatch_read) 3236 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3237 if (unwatch_write) 3238 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3239 3240} 3241 3242#ifdef USE_KQUEUE 3243static isc_boolean_t 3244process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) { 3245 int i; 3246 isc_boolean_t readable, writable; 3247 isc_boolean_t done = ISC_FALSE; 3248#ifdef ISC_PLATFORM_USETHREADS 3249 isc_boolean_t have_ctlevent = ISC_FALSE; 3250#endif 3251 3252 if (nevents == manager->nevents) { 3253 /* 3254 * This is not an error, but something unexpected. If this 3255 * happens, it may indicate the need for increasing 3256 * ISC_SOCKET_MAXEVENTS. 
 */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < manager->maxsocks);
#ifdef ISC_PLATFORM_USETHREADS
		/*
		 * Defer the internal control pipe until all socket
		 * events have been dispatched; see process_ctlfd().
		 */
		if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		readable = ISC_TF(events[i].filter == EVFILT_READ);
		writable = ISC_TF(events[i].filter == EVFILT_WRITE);
		process_fd(manager, events[i].ident, readable, writable);
	}

#ifdef ISC_PLATFORM_USETHREADS
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_EPOLL)
/*
 * epoll(7) variant of the event dispatcher: hand every ready descriptor
 * to process_fd(), handling the manager's control pipe last (threaded
 * build only).  Returns ISC_TRUE when a shutdown request was read from
 * the control pipe.
 */
static isc_boolean_t
process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
	int i;
	isc_boolean_t done = ISC_FALSE;
#ifdef ISC_PLATFORM_USETHREADS
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/*
		 * The event buffer is full; more events may be pending
		 * and will be picked up on the next watcher iteration.
		 */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)manager->maxsocks);
#ifdef ISC_PLATFORM_USETHREADS
		if (events[i].data.fd == manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0) {
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			events[i].events |= (EPOLLIN | EPOLLOUT);
		}
		process_fd(manager, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

#ifdef ISC_PLATFORM_USETHREADS
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_DEVPOLL)
/*
 * /dev/poll variant of the event dispatcher; same structure as the
 * epoll variant above.
 */
static isc_boolean_t
process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
	int i;
	isc_boolean_t done = ISC_FALSE;
#ifdef ISC_PLATFORM_USETHREADS
	isc_boolean_t have_ctlevent = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)manager->maxsocks);
#ifdef ISC_PLATFORM_USETHREADS
		if (events[i].fd == manager->pipe_fds[0]) {
			have_ctlevent = ISC_TRUE;
			continue;
		}
#endif
		process_fd(manager, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

#ifdef ISC_PLATFORM_USETHREADS
	if (have_ctlevent)
		done = process_ctlfd(manager);
#endif

	return (done);
}
#elif defined(USE_SELECT)
/*
 * select(2) variant: scan every descriptor up to maxfd against the
 * read/write fd_set snapshots.  Unlike the other variants this returns
 * void; the caller (watcher()) checks the control pipe itself.
 */
static void
process_fds(isc_socketmgr_t *manager, int maxfd,
	    fd_set *readfds, fd_set *writefds)
{
	int i;

	REQUIRE(maxfd <= (int)manager->maxsocks);

	for (i = 0; i < maxfd; i++) {
#ifdef ISC_PLATFORM_USETHREADS
		/* The internal pipe is handled by the caller. */
		if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
			continue;
#endif /* ISC_PLATFORM_USETHREADS */
		process_fd(manager, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}
#endif

#ifdef ISC_PLATFORM_USETHREADS
/*
 * Drain all pending messages from the internal control pipe and act on
 * them.  Returns ISC_TRUE if a SELECT_POKE_SHUTDOWN message was seen
 * (i.e. the watcher thread should exit), ISC_FALSE otherwise.
 */
static isc_boolean_t
process_ctlfd(isc_socketmgr_t *manager) {
	int msg, fd;

	for (;;) {
		select_readmsg(manager, &fd, &msg);

		manager_log(manager, IOEVENT,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WATCHERMSG,
					   "watcher got message %d "
					   "for socket %d"), msg, fd);

		/*
		 * Nothing to read?
		 */
		if (msg == SELECT_POKE_NOTHING)
			break;

		/*
		 * Handle shutdown message.  We really should
		 * jump out of this loop right away, but
		 * it doesn't matter if we have to do a little
		 * more work first.
		 */
		if (msg == SELECT_POKE_SHUTDOWN)
			return (ISC_TRUE);

		/*
		 * This is a wakeup on a socket.  Look
		 * at the event queue for both read and write,
		 * and decide if we need to watch on it now
		 * or not.
		 */
		wakeup_socket(manager, fd, msg);
	}

	return (ISC_FALSE);
}

/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, track down what thread gets to do
 * this I/O and post the event to it.
 */
static isc_threadresult_t
watcher(void *uap) {
	isc_socketmgr_t *manager = uap;
	isc_boolean_t done;
	int ctlfd;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined (USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
#elif defined (USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
#endif
	char strbuf[ISC_STRERRORSIZE];
#ifdef ISC_SOCKET_USE_POLLWATCH
	pollstate_t pollstate = poll_idle;
#endif

	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = manager->pipe_fds[0];
	done = ISC_FALSE;
	while (!done) {
		/*
		 * Wait for events, retrying on soft (transient) errors;
		 * any other failure from the multiplex call is fatal.
		 */
		do {
#ifdef USE_KQUEUE
			cc = kevent(manager->kqueue_fd, NULL, 0,
				    manager->events, manager->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(manager->epoll_fd, manager->events,
					manager->nevents, -1);
#elif defined(USE_DEVPOLL)
			dvp.dp_fds = manager->events;
			dvp.dp_nfds = manager->nevents;
#ifndef ISC_SOCKET_USE_POLLWATCH
			dvp.dp_timeout = -1;
#else
			/*
			 * Use a finite timeout while in the active or
			 * checking state to work around a reported
			 * kernel bug; see the POLLWATCH block below.
			 */
			if (pollstate == poll_idle)
				dvp.dp_timeout = -1;
			else
				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
#endif	/* ISC_SOCKET_USE_POLLWATCH */
			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
#elif defined(USE_SELECT)
			/*
			 * Snapshot the fd sets under the lock; select()
			 * mutates its arguments, so work on the copies.
			 */
			LOCK(&manager->lock);
			memcpy(manager->read_fds_copy, manager->read_fds,
			       manager->fd_bufsize);
			memcpy(manager->write_fds_copy, manager->write_fds,
			       manager->fd_bufsize);
			maxfd = manager->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, manager->read_fds_copy,
				    manager->write_fds_copy, NULL, NULL);
#endif	/* USE_KQUEUE */

			if (cc < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "%s %s: %s", fnname,
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"), strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active)
					pollstate = poll_checking;
				else if (pollstate == poll_checking)
					pollstate = poll_idle;
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					manager_log(manager,
						    ISC_LOGCATEGORY_GENERAL,
						    ISC_LOGMODULE_SOCKET,
						    ISC_LOG_DEBUG(1),
						    "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
		done = process_fds(manager, manager->events, cc);
#elif defined(USE_SELECT)
		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy))
			done = process_ctlfd(manager);
#endif
	}

	manager_log(manager, TRACE, "%s",
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */

/*
 * Reserve 'reserved' file descriptors for use outside the socket
 * manager; the manager will refuse to consume them for sockets.
 */
void
isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {

	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}

/*
 * Create a new socket manager.
3564 */ 3565 3566static isc_result_t 3567setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { 3568 isc_result_t result; 3569#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 3570 char strbuf[ISC_STRERRORSIZE]; 3571#endif 3572 3573#ifdef USE_KQUEUE 3574 manager->nevents = ISC_SOCKET_MAXEVENTS; 3575 manager->events = isc_mem_get(mctx, sizeof(struct kevent) * 3576 manager->nevents); 3577 if (manager->events == NULL) 3578 return (ISC_R_NOMEMORY); 3579 manager->kqueue_fd = kqueue(); 3580 if (manager->kqueue_fd == -1) { 3581 result = isc__errno2result(errno); 3582 isc__strerror(errno, strbuf, sizeof(strbuf)); 3583 UNEXPECTED_ERROR(__FILE__, __LINE__, 3584 "kqueue %s: %s", 3585 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3586 ISC_MSG_FAILED, "failed"), 3587 strbuf); 3588 isc_mem_put(mctx, manager->events, 3589 sizeof(struct kevent) * manager->nevents); 3590 return (result); 3591 } 3592 3593#ifdef ISC_PLATFORM_USETHREADS 3594 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3595 if (result != ISC_R_SUCCESS) { 3596 close(manager->kqueue_fd); 3597 isc_mem_put(mctx, manager->events, 3598 sizeof(struct kevent) * manager->nevents); 3599 return (result); 3600 } 3601#endif /* ISC_PLATFORM_USETHREADS */ 3602#elif defined(USE_EPOLL) 3603 manager->nevents = ISC_SOCKET_MAXEVENTS; 3604 manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) * 3605 manager->nevents); 3606 if (manager->events == NULL) 3607 return (ISC_R_NOMEMORY); 3608 manager->epoll_fd = epoll_create(manager->nevents); 3609 if (manager->epoll_fd == -1) { 3610 result = isc__errno2result(errno); 3611 isc__strerror(errno, strbuf, sizeof(strbuf)); 3612 UNEXPECTED_ERROR(__FILE__, __LINE__, 3613 "epoll_create %s: %s", 3614 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3615 ISC_MSG_FAILED, "failed"), 3616 strbuf); 3617 isc_mem_put(mctx, manager->events, 3618 sizeof(struct epoll_event) * manager->nevents); 3619 return (result); 3620 } 3621#ifdef ISC_PLATFORM_USETHREADS 
3622 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3623 if (result != ISC_R_SUCCESS) { 3624 close(manager->epoll_fd); 3625 isc_mem_put(mctx, manager->events, 3626 sizeof(struct epoll_event) * manager->nevents); 3627 return (result); 3628 } 3629#endif /* ISC_PLATFORM_USETHREADS */ 3630#elif defined(USE_DEVPOLL) 3631 /* 3632 * XXXJT: /dev/poll seems to reject large numbers of events, 3633 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS. 3634 */ 3635 manager->nevents = ISC_SOCKET_MAXEVENTS; 3636 manager->events = isc_mem_get(mctx, sizeof(struct pollfd) * 3637 manager->nevents); 3638 if (manager->events == NULL) 3639 return (ISC_R_NOMEMORY); 3640 /* 3641 * Note: fdpollinfo should be able to support all possible FDs, so 3642 * it must have maxsocks entries (not nevents). 3643 */ 3644 manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) * 3645 manager->maxsocks); 3646 if (manager->fdpollinfo == NULL) { 3647 isc_mem_put(mctx, manager->events, 3648 sizeof(pollinfo_t) * manager->maxsocks); 3649 return (ISC_R_NOMEMORY); 3650 } 3651 memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks); 3652 manager->devpoll_fd = open("/dev/poll", O_RDWR); 3653 if (manager->devpoll_fd == -1) { 3654 result = isc__errno2result(errno); 3655 isc__strerror(errno, strbuf, sizeof(strbuf)); 3656 UNEXPECTED_ERROR(__FILE__, __LINE__, 3657 "open(/dev/poll) %s: %s", 3658 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3659 ISC_MSG_FAILED, "failed"), 3660 strbuf); 3661 isc_mem_put(mctx, manager->events, 3662 sizeof(struct pollfd) * manager->nevents); 3663 isc_mem_put(mctx, manager->fdpollinfo, 3664 sizeof(pollinfo_t) * manager->maxsocks); 3665 return (result); 3666 } 3667#ifdef ISC_PLATFORM_USETHREADS 3668 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3669 if (result != ISC_R_SUCCESS) { 3670 close(manager->devpoll_fd); 3671 isc_mem_put(mctx, manager->events, 3672 sizeof(struct pollfd) * manager->nevents); 3673 
isc_mem_put(mctx, manager->fdpollinfo, 3674 sizeof(pollinfo_t) * manager->maxsocks); 3675 return (result); 3676 } 3677#endif /* ISC_PLATFORM_USETHREADS */ 3678#elif defined(USE_SELECT) 3679 UNUSED(result); 3680 3681#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE 3682 /* 3683 * Note: this code should also cover the case of MAXSOCKETS <= 3684 * FD_SETSIZE, but we separate the cases to avoid possible portability 3685 * issues regarding howmany() and the actual representation of fd_set. 3686 */ 3687 manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) * 3688 sizeof(fd_mask); 3689#else 3690 manager->fd_bufsize = sizeof(fd_set); 3691#endif 3692 3693 manager->read_fds = NULL; 3694 manager->read_fds_copy = NULL; 3695 manager->write_fds = NULL; 3696 manager->write_fds_copy = NULL; 3697 3698 manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize); 3699 if (manager->read_fds != NULL) 3700 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); 3701 if (manager->read_fds_copy != NULL) 3702 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize); 3703 if (manager->write_fds != NULL) { 3704 manager->write_fds_copy = isc_mem_get(mctx, 3705 manager->fd_bufsize); 3706 } 3707 if (manager->write_fds_copy == NULL) { 3708 if (manager->write_fds != NULL) { 3709 isc_mem_put(mctx, manager->write_fds, 3710 manager->fd_bufsize); 3711 } 3712 if (manager->read_fds_copy != NULL) { 3713 isc_mem_put(mctx, manager->read_fds_copy, 3714 manager->fd_bufsize); 3715 } 3716 if (manager->read_fds != NULL) { 3717 isc_mem_put(mctx, manager->read_fds, 3718 manager->fd_bufsize); 3719 } 3720 return (ISC_R_NOMEMORY); 3721 } 3722 memset(manager->read_fds, 0, manager->fd_bufsize); 3723 memset(manager->write_fds, 0, manager->fd_bufsize); 3724 3725#ifdef ISC_PLATFORM_USETHREADS 3726 (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3727 manager->maxfd = manager->pipe_fds[0]; 3728#else /* ISC_PLATFORM_USETHREADS */ 3729 manager->maxfd = 0; 3730#endif /* ISC_PLATFORM_USETHREADS */ 
3731#endif /* USE_KQUEUE */ 3732 3733 return (ISC_R_SUCCESS); 3734} 3735 3736static void 3737cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { 3738#ifdef ISC_PLATFORM_USETHREADS 3739 isc_result_t result; 3740 3741 result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3742 if (result != ISC_R_SUCCESS) { 3743 UNEXPECTED_ERROR(__FILE__, __LINE__, 3744 "epoll_ctl(DEL) %s", 3745 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3746 ISC_MSG_FAILED, "failed")); 3747 } 3748#endif /* ISC_PLATFORM_USETHREADS */ 3749 3750#ifdef USE_KQUEUE 3751 close(manager->kqueue_fd); 3752 isc_mem_put(mctx, manager->events, 3753 sizeof(struct kevent) * manager->nevents); 3754#elif defined(USE_EPOLL) 3755 close(manager->epoll_fd); 3756 isc_mem_put(mctx, manager->events, 3757 sizeof(struct epoll_event) * manager->nevents); 3758#elif defined(USE_DEVPOLL) 3759 close(manager->devpoll_fd); 3760 isc_mem_put(mctx, manager->events, 3761 sizeof(struct pollfd) * manager->nevents); 3762 isc_mem_put(mctx, manager->fdpollinfo, 3763 sizeof(pollinfo_t) * manager->maxsocks); 3764#elif defined(USE_SELECT) 3765 if (manager->read_fds != NULL) 3766 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize); 3767 if (manager->read_fds_copy != NULL) 3768 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize); 3769 if (manager->write_fds != NULL) 3770 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize); 3771 if (manager->write_fds_copy != NULL) 3772 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize); 3773#endif /* USE_KQUEUE */ 3774} 3775 3776isc_result_t 3777isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { 3778 return (isc_socketmgr_create2(mctx, managerp, 0)); 3779} 3780 3781isc_result_t 3782isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, 3783 unsigned int maxsocks) 3784{ 3785 int i; 3786 isc_socketmgr_t *manager; 3787#ifdef ISC_PLATFORM_USETHREADS 3788 char strbuf[ISC_STRERRORSIZE]; 3789#endif 3790 isc_result_t result; 
3791 3792 REQUIRE(managerp != NULL && *managerp == NULL); 3793 3794#ifndef ISC_PLATFORM_USETHREADS 3795 if (socketmgr != NULL) { 3796 /* Don't allow maxsocks to be updated */ 3797 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks) 3798 return (ISC_R_EXISTS); 3799 3800 socketmgr->refs++; 3801 *managerp = socketmgr; 3802 return (ISC_R_SUCCESS); 3803 } 3804#endif /* ISC_PLATFORM_USETHREADS */ 3805 3806 if (maxsocks == 0) 3807 maxsocks = ISC_SOCKET_MAXSOCKETS; 3808 3809 manager = isc_mem_get(mctx, sizeof(*manager)); 3810 if (manager == NULL) 3811 return (ISC_R_NOMEMORY); 3812 3813 /* zero-clear so that necessary cleanup on failure will be easy */ 3814 memset(manager, 0, sizeof(*manager)); 3815 manager->maxsocks = maxsocks; 3816 manager->reserved = 0; 3817 manager->fds = isc_mem_get(mctx, 3818 manager->maxsocks * sizeof(isc_socket_t *)); 3819 if (manager->fds == NULL) { 3820 result = ISC_R_NOMEMORY; 3821 goto free_manager; 3822 } 3823 manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int)); 3824 if (manager->fdstate == NULL) { 3825 result = ISC_R_NOMEMORY; 3826 goto free_manager; 3827 } 3828 manager->stats = NULL; 3829 3830 manager->magic = SOCKET_MANAGER_MAGIC; 3831 manager->mctx = NULL; 3832 memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); 3833 ISC_LIST_INIT(manager->socklist); 3834 result = isc_mutex_init(&manager->lock); 3835 if (result != ISC_R_SUCCESS) 3836 goto free_manager; 3837 manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t)); 3838 if (manager->fdlock == NULL) { 3839 result = ISC_R_NOMEMORY; 3840 goto cleanup_lock; 3841 } 3842 for (i = 0; i < FDLOCK_COUNT; i++) { 3843 result = isc_mutex_init(&manager->fdlock[i]); 3844 if (result != ISC_R_SUCCESS) { 3845 while (--i >= 0) 3846 DESTROYLOCK(&manager->fdlock[i]); 3847 isc_mem_put(mctx, manager->fdlock, 3848 FDLOCK_COUNT * sizeof(isc_mutex_t)); 3849 manager->fdlock = NULL; 3850 goto cleanup_lock; 3851 } 3852 } 3853 3854#ifdef ISC_PLATFORM_USETHREADS 3855 if 
(isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) { 3856 UNEXPECTED_ERROR(__FILE__, __LINE__, 3857 "isc_condition_init() %s", 3858 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3859 ISC_MSG_FAILED, "failed")); 3860 result = ISC_R_UNEXPECTED; 3861 goto cleanup_lock; 3862 } 3863 3864 /* 3865 * Create the special fds that will be used to wake up the 3866 * select/poll loop when something internal needs to be done. 3867 */ 3868 if (pipe(manager->pipe_fds) != 0) { 3869 isc__strerror(errno, strbuf, sizeof(strbuf)); 3870 UNEXPECTED_ERROR(__FILE__, __LINE__, 3871 "pipe() %s: %s", 3872 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3873 ISC_MSG_FAILED, "failed"), 3874 strbuf); 3875 result = ISC_R_UNEXPECTED; 3876 goto cleanup_condition; 3877 } 3878 3879 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS); 3880#if 0 3881 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS); 3882#endif 3883#else /* ISC_PLATFORM_USETHREADS */ 3884 manager->refs = 1; 3885#endif /* ISC_PLATFORM_USETHREADS */ 3886 3887 /* 3888 * Set up initial state for the select loop 3889 */ 3890 result = setup_watcher(mctx, manager); 3891 if (result != ISC_R_SUCCESS) 3892 goto cleanup; 3893 memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); 3894#ifdef ISC_PLATFORM_USETHREADS 3895 /* 3896 * Start up the select/poll thread. 
3897 */ 3898 if (isc_thread_create(watcher, manager, &manager->watcher) != 3899 ISC_R_SUCCESS) { 3900 UNEXPECTED_ERROR(__FILE__, __LINE__, 3901 "isc_thread_create() %s", 3902 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3903 ISC_MSG_FAILED, "failed")); 3904 cleanup_watcher(mctx, manager); 3905 result = ISC_R_UNEXPECTED; 3906 goto cleanup; 3907 } 3908#endif /* ISC_PLATFORM_USETHREADS */ 3909 isc_mem_attach(mctx, &manager->mctx); 3910 3911#ifndef ISC_PLATFORM_USETHREADS 3912 socketmgr = manager; 3913#endif /* ISC_PLATFORM_USETHREADS */ 3914 *managerp = manager; 3915 3916 return (ISC_R_SUCCESS); 3917 3918cleanup: 3919#ifdef ISC_PLATFORM_USETHREADS 3920 (void)close(manager->pipe_fds[0]); 3921 (void)close(manager->pipe_fds[1]); 3922#endif /* ISC_PLATFORM_USETHREADS */ 3923 3924#ifdef ISC_PLATFORM_USETHREADS 3925cleanup_condition: 3926 (void)isc_condition_destroy(&manager->shutdown_ok); 3927#endif /* ISC_PLATFORM_USETHREADS */ 3928 3929 3930cleanup_lock: 3931 if (manager->fdlock != NULL) { 3932 for (i = 0; i < FDLOCK_COUNT; i++) 3933 DESTROYLOCK(&manager->fdlock[i]); 3934 } 3935 DESTROYLOCK(&manager->lock); 3936 3937free_manager: 3938 if (manager->fdlock != NULL) { 3939 isc_mem_put(mctx, manager->fdlock, 3940 FDLOCK_COUNT * sizeof(isc_mutex_t)); 3941 } 3942 if (manager->fdstate != NULL) { 3943 isc_mem_put(mctx, manager->fdstate, 3944 manager->maxsocks * sizeof(int)); 3945 } 3946 if (manager->fds != NULL) { 3947 isc_mem_put(mctx, manager->fds, 3948 manager->maxsocks * sizeof(isc_socket_t *)); 3949 } 3950 isc_mem_put(mctx, manager, sizeof(*manager)); 3951 3952 return (result); 3953} 3954 3955isc_result_t 3956isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) { 3957 REQUIRE(VALID_MANAGER(manager)); 3958 REQUIRE(nsockp != NULL); 3959 3960 *nsockp = manager->maxsocks; 3961 3962 return (ISC_R_SUCCESS); 3963} 3964 3965void 3966isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) { 3967 REQUIRE(VALID_MANAGER(manager)); 3968 
REQUIRE(ISC_LIST_EMPTY(manager->socklist)); 3969 REQUIRE(manager->stats == NULL); 3970 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max); 3971 3972 isc_stats_attach(stats, &manager->stats); 3973} 3974 3975void 3976isc_socketmgr_destroy(isc_socketmgr_t **managerp) { 3977 isc_socketmgr_t *manager; 3978 int i; 3979 isc_mem_t *mctx; 3980 3981 /* 3982 * Destroy a socket manager. 3983 */ 3984 3985 REQUIRE(managerp != NULL); 3986 manager = *managerp; 3987 REQUIRE(VALID_MANAGER(manager)); 3988 3989#ifndef ISC_PLATFORM_USETHREADS 3990 if (manager->refs > 1) { 3991 manager->refs--; 3992 *managerp = NULL; 3993 return; 3994 } 3995#endif /* ISC_PLATFORM_USETHREADS */ 3996 3997 LOCK(&manager->lock); 3998 3999#ifdef ISC_PLATFORM_USETHREADS 4000 /* 4001 * Wait for all sockets to be destroyed. 4002 */ 4003 while (!ISC_LIST_EMPTY(manager->socklist)) { 4004 manager_log(manager, CREATION, "%s", 4005 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 4006 ISC_MSG_SOCKETSREMAIN, 4007 "sockets exist")); 4008 WAIT(&manager->shutdown_ok, &manager->lock); 4009 } 4010#else /* ISC_PLATFORM_USETHREADS */ 4011 /* 4012 * Hope all sockets have been destroyed. 4013 */ 4014 if (!ISC_LIST_EMPTY(manager->socklist)) { 4015 manager_log(manager, CREATION, "%s", 4016 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 4017 ISC_MSG_SOCKETSREMAIN, 4018 "sockets exist")); 4019 INSIST(0); 4020 } 4021#endif /* ISC_PLATFORM_USETHREADS */ 4022 4023 UNLOCK(&manager->lock); 4024 4025 /* 4026 * Here, poke our select/poll thread. Do this by closing the write 4027 * half of the pipe, which will send EOF to the read half. 4028 * This is currently a no-op in the non-threaded case. 4029 */ 4030 select_poke(manager, 0, SELECT_POKE_SHUTDOWN); 4031 4032#ifdef ISC_PLATFORM_USETHREADS 4033 /* 4034 * Wait for thread to exit. 
4035 */ 4036 if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS) 4037 UNEXPECTED_ERROR(__FILE__, __LINE__, 4038 "isc_thread_join() %s", 4039 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4040 ISC_MSG_FAILED, "failed")); 4041#endif /* ISC_PLATFORM_USETHREADS */ 4042 4043 /* 4044 * Clean up. 4045 */ 4046 cleanup_watcher(manager->mctx, manager); 4047 4048#ifdef ISC_PLATFORM_USETHREADS 4049 (void)close(manager->pipe_fds[0]); 4050 (void)close(manager->pipe_fds[1]); 4051 (void)isc_condition_destroy(&manager->shutdown_ok); 4052#endif /* ISC_PLATFORM_USETHREADS */ 4053 4054 for (i = 0; i < (int)manager->maxsocks; i++) 4055 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ 4056 (void)close(i); 4057 4058 isc_mem_put(manager->mctx, manager->fds, 4059 manager->maxsocks * sizeof(isc_socket_t *)); 4060 isc_mem_put(manager->mctx, manager->fdstate, 4061 manager->maxsocks * sizeof(int)); 4062 4063 if (manager->stats != NULL) 4064 isc_stats_detach(&manager->stats); 4065 4066 if (manager->fdlock != NULL) { 4067 for (i = 0; i < FDLOCK_COUNT; i++) 4068 DESTROYLOCK(&manager->fdlock[i]); 4069 isc_mem_put(manager->mctx, manager->fdlock, 4070 FDLOCK_COUNT * sizeof(isc_mutex_t)); 4071 } 4072 DESTROYLOCK(&manager->lock); 4073 manager->magic = 0; 4074 mctx= manager->mctx; 4075 isc_mem_put(mctx, manager, sizeof(*manager)); 4076 4077 isc_mem_detach(&mctx); 4078 4079 *managerp = NULL; 4080} 4081 4082static isc_result_t 4083socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 4084 unsigned int flags) 4085{ 4086 int io_state; 4087 isc_boolean_t have_lock = ISC_FALSE; 4088 isc_task_t *ntask = NULL; 4089 isc_result_t result = ISC_R_SUCCESS; 4090 4091 dev->ev_sender = task; 4092 4093 if (sock->type == isc_sockettype_udp) { 4094 io_state = doio_recv(sock, dev); 4095 } else { 4096 LOCK(&sock->lock); 4097 have_lock = ISC_TRUE; 4098 4099 if (ISC_LIST_EMPTY(sock->recv_list)) 4100 io_state = doio_recv(sock, dev); 4101 else 4102 io_state = DOIO_SOFT; 4103 } 4104 
4105 switch (io_state) { 4106 case DOIO_SOFT: 4107 /* 4108 * We couldn't read all or part of the request right now, so 4109 * queue it. 4110 * 4111 * Attach to socket and to task 4112 */ 4113 isc_task_attach(task, &ntask); 4114 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 4115 4116 if (!have_lock) { 4117 LOCK(&sock->lock); 4118 have_lock = ISC_TRUE; 4119 } 4120 4121 /* 4122 * Enqueue the request. If the socket was previously not being 4123 * watched, poke the watcher to start paying attention to it. 4124 */ 4125 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv) 4126 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 4127 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link); 4128 4129 socket_log(sock, NULL, EVENT, NULL, 0, 0, 4130 "socket_recv: event %p -> task %p", 4131 dev, ntask); 4132 4133 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 4134 result = ISC_R_INPROGRESS; 4135 break; 4136 4137 case DOIO_EOF: 4138 dev->result = ISC_R_EOF; 4139 /* fallthrough */ 4140 4141 case DOIO_HARD: 4142 case DOIO_SUCCESS: 4143 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 4144 send_recvdone_event(sock, &dev); 4145 break; 4146 } 4147 4148 if (have_lock) 4149 UNLOCK(&sock->lock); 4150 4151 return (result); 4152} 4153 4154isc_result_t 4155isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist, 4156 unsigned int minimum, isc_task_t *task, 4157 isc_taskaction_t action, const void *arg) 4158{ 4159 isc_socketevent_t *dev; 4160 isc_socketmgr_t *manager; 4161 unsigned int iocount; 4162 isc_buffer_t *buffer; 4163 4164 REQUIRE(VALID_SOCKET(sock)); 4165 REQUIRE(buflist != NULL); 4166 REQUIRE(!ISC_LIST_EMPTY(*buflist)); 4167 REQUIRE(task != NULL); 4168 REQUIRE(action != NULL); 4169 4170 manager = sock->manager; 4171 REQUIRE(VALID_MANAGER(manager)); 4172 4173 iocount = isc_bufferlist_availablecount(buflist); 4174 REQUIRE(iocount > 0); 4175 4176 INSIST(sock->bound); 4177 4178 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg); 4179 if (dev == NULL) { 4180 return 
(ISC_R_NOMEMORY); 4181 } 4182 4183 /* 4184 * UDP sockets are always partial read 4185 */ 4186 if (sock->type == isc_sockettype_udp) 4187 dev->minimum = 1; 4188 else { 4189 if (minimum == 0) 4190 dev->minimum = iocount; 4191 else 4192 dev->minimum = minimum; 4193 } 4194 4195 /* 4196 * Move each buffer from the passed in list to our internal one. 4197 */ 4198 buffer = ISC_LIST_HEAD(*buflist); 4199 while (buffer != NULL) { 4200 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4201 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4202 buffer = ISC_LIST_HEAD(*buflist); 4203 } 4204 4205 return (socket_recv(sock, dev, task, 0)); 4206} 4207 4208isc_result_t 4209isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum, 4210 isc_task_t *task, isc_taskaction_t action, const void *arg) 4211{ 4212 isc_socketevent_t *dev; 4213 isc_socketmgr_t *manager; 4214 4215 REQUIRE(VALID_SOCKET(sock)); 4216 REQUIRE(action != NULL); 4217 4218 manager = sock->manager; 4219 REQUIRE(VALID_MANAGER(manager)); 4220 4221 INSIST(sock->bound); 4222 4223 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg); 4224 if (dev == NULL) 4225 return (ISC_R_NOMEMORY); 4226 4227 return (isc_socket_recv2(sock, region, minimum, task, dev, 0)); 4228} 4229 4230isc_result_t 4231isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, 4232 unsigned int minimum, isc_task_t *task, 4233 isc_socketevent_t *event, unsigned int flags) 4234{ 4235 event->ev_sender = sock; 4236 event->result = ISC_R_UNEXPECTED; 4237 ISC_LIST_INIT(event->bufferlist); 4238 event->region = *region; 4239 event->n = 0; 4240 event->offset = 0; 4241 event->attributes = 0; 4242 4243 /* 4244 * UDP sockets are always partial read. 
4245 */ 4246 if (sock->type == isc_sockettype_udp) 4247 event->minimum = 1; 4248 else { 4249 if (minimum == 0) 4250 event->minimum = region->length; 4251 else 4252 event->minimum = minimum; 4253 } 4254 4255 return (socket_recv(sock, event, task, flags)); 4256} 4257 4258static isc_result_t 4259socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 4260 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4261 unsigned int flags) 4262{ 4263 int io_state; 4264 isc_boolean_t have_lock = ISC_FALSE; 4265 isc_task_t *ntask = NULL; 4266 isc_result_t result = ISC_R_SUCCESS; 4267 4268 dev->ev_sender = task; 4269 4270 set_dev_address(address, sock, dev); 4271 if (pktinfo != NULL) { 4272 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 4273 dev->pktinfo = *pktinfo; 4274 4275 if (!isc_sockaddr_issitelocal(&dev->address) && 4276 !isc_sockaddr_islinklocal(&dev->address)) { 4277 socket_log(sock, NULL, TRACE, isc_msgcat, 4278 ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED, 4279 "pktinfo structure provided, ifindex %u " 4280 "(set to 0)", pktinfo->ipi6_ifindex); 4281 4282 /* 4283 * Set the pktinfo index to 0 here, to let the 4284 * kernel decide what interface it should send on. 4285 */ 4286 dev->pktinfo.ipi6_ifindex = 0; 4287 } 4288 } 4289 4290 if (sock->type == isc_sockettype_udp) 4291 io_state = doio_send(sock, dev); 4292 else { 4293 LOCK(&sock->lock); 4294 have_lock = ISC_TRUE; 4295 4296 if (ISC_LIST_EMPTY(sock->send_list)) 4297 io_state = doio_send(sock, dev); 4298 else 4299 io_state = DOIO_SOFT; 4300 } 4301 4302 switch (io_state) { 4303 case DOIO_SOFT: 4304 /* 4305 * We couldn't send all or part of the request right now, so 4306 * queue it unless ISC_SOCKFLAG_NORETRY is set. 4307 */ 4308 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) { 4309 isc_task_attach(task, &ntask); 4310 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 4311 4312 if (!have_lock) { 4313 LOCK(&sock->lock); 4314 have_lock = ISC_TRUE; 4315 } 4316 4317 /* 4318 * Enqueue the request. 
If the socket was previously 4319 * not being watched, poke the watcher to start 4320 * paying attention to it. 4321 */ 4322 if (ISC_LIST_EMPTY(sock->send_list) && 4323 !sock->pending_send) 4324 select_poke(sock->manager, sock->fd, 4325 SELECT_POKE_WRITE); 4326 ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link); 4327 4328 socket_log(sock, NULL, EVENT, NULL, 0, 0, 4329 "socket_send: event %p -> task %p", 4330 dev, ntask); 4331 4332 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 4333 result = ISC_R_INPROGRESS; 4334 break; 4335 } 4336 4337 case DOIO_HARD: 4338 case DOIO_SUCCESS: 4339 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 4340 send_senddone_event(sock, &dev); 4341 break; 4342 } 4343 4344 if (have_lock) 4345 UNLOCK(&sock->lock); 4346 4347 return (result); 4348} 4349 4350isc_result_t 4351isc_socket_send(isc_socket_t *sock, isc_region_t *region, 4352 isc_task_t *task, isc_taskaction_t action, const void *arg) 4353{ 4354 /* 4355 * REQUIRE() checking is performed in isc_socket_sendto(). 4356 */ 4357 return (isc_socket_sendto(sock, region, task, action, arg, NULL, 4358 NULL)); 4359} 4360 4361isc_result_t 4362isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, 4363 isc_task_t *task, isc_taskaction_t action, const void *arg, 4364 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 4365{ 4366 isc_socketevent_t *dev; 4367 isc_socketmgr_t *manager; 4368 4369 REQUIRE(VALID_SOCKET(sock)); 4370 REQUIRE(region != NULL); 4371 REQUIRE(task != NULL); 4372 REQUIRE(action != NULL); 4373 4374 manager = sock->manager; 4375 REQUIRE(VALID_MANAGER(manager)); 4376 4377 INSIST(sock->bound); 4378 4379 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg); 4380 if (dev == NULL) { 4381 return (ISC_R_NOMEMORY); 4382 } 4383 4384 dev->region = *region; 4385 4386 return (socket_send(sock, dev, task, address, pktinfo, 0)); 4387} 4388 4389isc_result_t 4390isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist, 4391 isc_task_t *task, isc_taskaction_t action, const void 
*arg) 4392{ 4393 return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL, 4394 NULL)); 4395} 4396 4397isc_result_t 4398isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist, 4399 isc_task_t *task, isc_taskaction_t action, const void *arg, 4400 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 4401{ 4402 isc_socketevent_t *dev; 4403 isc_socketmgr_t *manager; 4404 unsigned int iocount; 4405 isc_buffer_t *buffer; 4406 4407 REQUIRE(VALID_SOCKET(sock)); 4408 REQUIRE(buflist != NULL); 4409 REQUIRE(!ISC_LIST_EMPTY(*buflist)); 4410 REQUIRE(task != NULL); 4411 REQUIRE(action != NULL); 4412 4413 manager = sock->manager; 4414 REQUIRE(VALID_MANAGER(manager)); 4415 4416 iocount = isc_bufferlist_usedcount(buflist); 4417 REQUIRE(iocount > 0); 4418 4419 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg); 4420 if (dev == NULL) { 4421 return (ISC_R_NOMEMORY); 4422 } 4423 4424 /* 4425 * Move each buffer from the passed in list to our internal one. 4426 */ 4427 buffer = ISC_LIST_HEAD(*buflist); 4428 while (buffer != NULL) { 4429 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4430 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4431 buffer = ISC_LIST_HEAD(*buflist); 4432 } 4433 4434 return (socket_send(sock, dev, task, address, pktinfo, 0)); 4435} 4436 4437isc_result_t 4438isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, 4439 isc_task_t *task, 4440 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 4441 isc_socketevent_t *event, unsigned int flags) 4442{ 4443 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0); 4444 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) 4445 REQUIRE(sock->type == isc_sockettype_udp); 4446 event->ev_sender = sock; 4447 event->result = ISC_R_UNEXPECTED; 4448 ISC_LIST_INIT(event->bufferlist); 4449 event->region = *region; 4450 event->n = 0; 4451 event->offset = 0; 4452 event->attributes = 0; 4453 4454 return (socket_send(sock, event, task, address, pktinfo, flags)); 4455} 4456 4457void 
isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	/* Only AF_UNIX paths need cleaning; anything else is a no-op. */
	if (sockaddr->type.sa.sa_family != AF_UNIX)
		return;

	/*
	 * Synthesize S_ISSOCK()/S_ISFIFO() on platforms whose <sys/stat.h>
	 * only provides the S_IFMT/S_IFSOCK/S_IFIFO constants.
	 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
#endif
#endif

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
#endif
#endif

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
#endif

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif

	/*
	 * active: we own this socket file and are shutting down -- verify
	 * it is really a socket/FIFO, then unlink it unconditionally.
	 */
	if (active) {
		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/*
	 * !active: a stale path may have been left behind by a previous
	 * run.  Only unlink it if connect() proves nobody is listening
	 * (ECONNREFUSED/ECONNRESET); a successful connect or other error
	 * means the path is (or may be) in use, so leave it alone.
	 */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:	/* We exited cleanly last time */
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
		goto cleanup;
	}

	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: %s: not a socket",
			      sockaddr->type.sunix.sun_path);
		goto cleanup;
	}

	if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0) {
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
					      ISC_LOGMODULE_SOCKET,
					      ISC_LOG_WARNING,
					      "isc_socket_cleanunix: "
					      "unlink(%s): %s",
					      sockaddr->type.sunix.sun_path,
					      strbuf);
			}
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
 cleanup:
	close(s);
#else
	UNUSED(sockaddr);
	UNUSED(active);
#endif
}

isc_result_t
isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		    isc_uint32_t owner, isc_uint32_t group)
{
#ifdef ISC_PLATFORM_HAVESYSUNH
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	char path[sizeof(sockaddr->type.sunix.sun_path)];
#ifdef NEED_SECURE_DIRECTORY
	char *slash;
#endif

	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
	strcpy(path, sockaddr->type.sunix.sun_path);

	/*
	 * On platforms where the socket file itself cannot carry useful
	 * permissions, apply them to the containing directory instead.
	 */
#ifdef NEED_SECURE_DIRECTORY
	slash = strrchr(path, '/');
	if (slash != NULL) {
		if (slash != path)
			*slash = '\0';
		else
			strcpy(path, "/");
	} else
		strcpy(path, ".");
#endif

	/* Both chmod() and chown() are attempted even if the first fails. */
	if (chmod(path, perm) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chmod(%s, %d): %s",
			      path, perm, strbuf);
		result = ISC_R_FAILURE;
	}
	if (chown(path, owner, group) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chown(%s, %d, %d): %s",
			      path, owner, group,
			      strbuf);
		result = ISC_R_FAILURE;
	}
	return (result);
#else
	UNUSED(sockaddr);
	UNUSED(perm);
	UNUSED(owner);
	UNUSED(group);
	return (ISC_R_NOTIMPLEMENTED);
#endif
}

/*
 * Bind 'sock' to 'sockaddr'.  SO_REUSEADDR is set first when the caller
 * passed ISC_SOCKET_REUSEADDRESS and asked for a specific (non-zero)
 * port; a setsockopt() failure there is logged but not fatal.  bind()
 * errno values are translated to ISC_R_* codes for the caller.
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
		unsigned int options) {
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	LOCK(&sock->lock);

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}
	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX)
		goto bind_socket;
#endif
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
		       sizeof(on)) < 0) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d) %s", sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		/* Press on... */
	}
#ifdef AF_UNIX
 bind_socket:
#endif
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * Install a BSD accept filter (e.g. "dataready") on a listening socket.
 * Only available where SO_ACCEPTFILTER exists; otherwise returns
 * ISC_R_NOTIMPLEMENTED.
 */
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
#ifdef SO_ACCEPTFILTER
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else
	UNUSED(sock);
	UNUSED(filter);
#endif

	REQUIRE(VALID_SOCKET(sock));

#ifdef SO_ACCEPTFILTER
	bzero(&afa, sizeof(afa));
	strncpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
		       &afa, sizeof(afa)) == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
			   ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
			   strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else
	return (ISC_R_NOTIMPLEMENTED);
#endif
}

/*
 * Set up to listen on a given socket.  We do this by creating an internal
 * event that will be dispatched when the socket has read activity.  The
 * watcher will send the internal event to the task when there is a new
 * connection.
 *
 * Unlike in read, we don't preallocate a done event here.  Every time there
 * is a new connection we'll have to allocate a new one anyway, so we might
 * as well keep things simple rather than having to track them.
 */
isc_result_t
isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(!sock->listener);
	REQUIRE(sock->bound);
	REQUIRE(sock->type == isc_sockettype_tcp ||
		sock->type == isc_sockettype_unix);

	/* A zero backlog means "use the system default". */
	if (backlog == 0)
		backlog = SOMAXCONN;

	if (listen(sock->fd, (int)backlog) < 0) {
		UNLOCK(&sock->lock);
		isc__strerror(errno, strbuf, sizeof(strbuf));

		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);

		return (ISC_R_UNEXPECTED);
	}

	sock->listener = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * This should try to do aggressive accept() XXXMLG
 */
isc_result_t
isc_socket_accept(isc_socket_t *sock,
		  isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	isc_socket_newconnev_t *dev;
	isc_socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc_socket_t *nsock;
	isc_result_t result;
	isc_boolean_t do_poke = ISC_FALSE;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will
	 * be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)
		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
				   action, arg, sizeof(*dev));
	if (dev == NULL) {
		UNLOCK(&sock->lock);
		return (ISC_R_NOMEMORY);
	}
	ISC_LINK_INIT(dev, ev_link);

	/* Pre-allocate the socket object the accepted fd will live in. */
	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	nsock->references++;
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list))
		do_poke = ISC_TRUE;

	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);

	if (do_poke)
		select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * Start a connect on 'sock' to 'addr'.  If the connect completes (or
 * fails) immediately the ISC_SOCKEVENT_CONNECT event is posted to 'task'
 * right away; otherwise the request is queued and finished later by
 * internal_connect() when the watcher reports writability.
 */
isc_result_t
isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc_socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr))
		return (ISC_R_MULTICAST);

	LOCK(&sock->lock);

	REQUIRE(!sock->connecting);

	dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
							ISC_SOCKEVENT_CONNECT,
							action, arg,
							sizeof(*dev));
	if (dev == NULL) {
		UNLOCK(&sock->lock);
		return (ISC_R_NOMEMORY);
	}
	ISC_LINK_INIT(dev, ev_link);

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * HP-UX "fails" to connect a UDP socket and sets errno to
		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
		 * a success and let the user detect it if it's really an error
		 * at the time of sending a packet on the socket.
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		/* Soft errors / EINPROGRESS: queue and let the watcher finish. */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
			goto queue;

		switch (errno) {
#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: report it and free the event. */
		sock->connected = 0;

		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/*
		 * Known failure: deliver the event (carrying dev->result)
		 * to the task; the call itself still returns success.
		 */
		sock->connected = 0;
		isc_task_send(task, ISC_EVENT_PTR(&dev));

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
 success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_send(task, ISC_EVENT_PTR(&dev));

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

 queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	sock->connecting = 1;

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	if (sock->connect_ev == NULL)
		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);

	sock->connect_ev = dev;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}

/*
 * Called when a socket with a pending connect() finishes.
 */
static void
internal_connect(isc_task_t *me, isc_event_t *ev) {
	isc_socket_t *sock;
	isc_socket_connev_t *dev;
	isc_task_t *task;
	int cc;
	ISC_SOCKADDR_LEN_T optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	UNUSED(me);
	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	sock = ev->ev_sender;
	INSIST(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	/*
	 * When the internal event was sent the reference count was bumped
	 * to keep the socket around for us.  Decrement the count here.
	 */
	INSIST(sock->references > 0);
	sock->references--;
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}

	/*
	 * Has this event been canceled?
	 */
	dev = sock->connect_ev;
	if (dev == NULL) {
		INSIST(!sock->connecting);
		UNLOCK(&sock->lock);
		return;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
		       (void *)&cc, (void *)&optlen) < 0)
		cc = errno;
	else
		errno = cc;

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			select_poke(sock->manager, sock->fd,
				    SELECT_POKE_CONNECT);
			UNLOCK(&sock->lock);

			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b) case a: dev->result = b; break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			dev->result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		dev->result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	sock->connect_ev = NULL;

	UNLOCK(&sock->lock);

	task = dev->ev_sender;
	dev->ev_sender = sock;
	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
}

/*
 * Copy the cached peer address into '*addressp'; fails with
 * ISC_R_NOTCONNECTED if the socket has never connected.
 */
isc_result_t
isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
	isc_result_t result;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if
	   (sock->connected) {
		*addressp = sock->peer_address;
		result = ISC_R_SUCCESS;
	} else {
		result = ISC_R_NOTCONNECTED;
	}

	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Retrieve the local address the socket is bound to via getsockname().
 */
isc_result_t
isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
	ISC_SOCKADDR_LEN_T len;
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (!sock->bound) {
		result = ISC_R_NOTBOUND;
		goto out;
	}

	result = ISC_R_SUCCESS;

	len = sizeof(addressp->type);
	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto out;
	}
	addressp->length = (unsigned int)len;

 out:
	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0)
		return;

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
	    && !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		/* NULL task means "cancel for every task". */
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
	    && !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t      *dev;
		isc_socketevent_t      *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
	    && !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t	       *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {

				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/* Drop the pre-allocated accept socket. */
				dev->newsocket->references--;
				free_socket(&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendanddetach(&current_task,
						       ISC_EVENT_PTR(&dev));
			}

			dev = next;
		}
	}

	/*
	 * Connecting is not a list.
5256 */ 5257 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT) 5258 && sock->connect_ev != NULL) { 5259 isc_socket_connev_t *dev; 5260 isc_task_t *current_task; 5261 5262 INSIST(sock->connecting); 5263 sock->connecting = 0; 5264 5265 dev = sock->connect_ev; 5266 current_task = dev->ev_sender; 5267 5268 if ((task == NULL) || (task == current_task)) { 5269 sock->connect_ev = NULL; 5270 5271 dev->result = ISC_R_CANCELED; 5272 dev->ev_sender = sock; 5273 isc_task_sendanddetach(¤t_task, 5274 ISC_EVENT_PTR(&dev)); 5275 } 5276 } 5277 5278 UNLOCK(&sock->lock); 5279} 5280 5281isc_sockettype_t 5282isc_socket_gettype(isc_socket_t *sock) { 5283 REQUIRE(VALID_SOCKET(sock)); 5284 5285 return (sock->type); 5286} 5287 5288isc_boolean_t 5289isc_socket_isbound(isc_socket_t *sock) { 5290 isc_boolean_t val; 5291 5292 LOCK(&sock->lock); 5293 val = ((sock->bound) ? ISC_TRUE : ISC_FALSE); 5294 UNLOCK(&sock->lock); 5295 5296 return (val); 5297} 5298 5299void 5300isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) { 5301#if defined(IPV6_V6ONLY) 5302 int onoff = yes ? 1 : 0; 5303#else 5304 UNUSED(yes); 5305 UNUSED(sock); 5306#endif 5307 5308 REQUIRE(VALID_SOCKET(sock)); 5309 5310#ifdef IPV6_V6ONLY 5311 if (sock->pf == AF_INET6) { 5312 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, 5313 (void *)&onoff, sizeof(int)) < 0) { 5314 char strbuf[ISC_STRERRORSIZE]; 5315 5316 UNEXPECTED_ERROR(__FILE__, __LINE__, 5317 "setsockopt(%d, IPV6_V6ONLY) " 5318 "%s: %s", sock->fd, 5319 isc_msgcat_get(isc_msgcat, 5320 ISC_MSGSET_GENERAL, 5321 ISC_MSG_FAILED, 5322 "failed"), 5323 strbuf); 5324 } 5325 } 5326 FIX_IPV6_RECVPKTINFO(sock); /* AIX */ 5327#endif 5328} 5329 5330#ifndef ISC_PLATFORM_USETHREADS 5331/* In our assumed scenario, we can simply use a single static object. 
 */
static isc_socketwait_t swait_private;

/*
 * Non-threaded mode: wait up to 'tvp' (NULL = forever) for socket events
 * using whichever multiplex mechanism was selected at build time
 * (kqueue/epoll/devpoll/select).  The results are stashed in the single
 * static swait_private object, whose address is returned via '*swaitp'.
 * Returns the raw count from the underlying call (<= 0 on timeout/error).
 */
int
isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
	int n;
#ifdef USE_KQUEUE
	struct timespec ts, *tsp;
#endif
#ifdef USE_EPOLL
	int timeout;
#endif
#ifdef USE_DEVPOLL
	struct dvpoll dvp;
#endif

	REQUIRE(swaitp != NULL && *swaitp == NULL);

	if (socketmgr == NULL)
		return (0);

#ifdef USE_KQUEUE
	if (tvp != NULL) {
		ts.tv_sec = tvp->tv_sec;
		ts.tv_nsec = tvp->tv_usec * 1000;
		tsp = &ts;
	} else
		tsp = NULL;
	swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
				       socketmgr->events, socketmgr->nevents,
				       tsp);
	n = swait_private.nevents;
#elif defined(USE_EPOLL)
	/* Round microseconds up so a short timeout never becomes zero. */
	if (tvp != NULL)
		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
	else
		timeout = -1;
	swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
					   socketmgr->events,
					   socketmgr->nevents, timeout);
	n = swait_private.nevents;
#elif defined(USE_DEVPOLL)
	dvp.dp_fds = socketmgr->events;
	dvp.dp_nfds = socketmgr->nevents;
	if (tvp != NULL) {
		dvp.dp_timeout = tvp->tv_sec * 1000 +
			(tvp->tv_usec + 999) / 1000;
	} else
		dvp.dp_timeout = -1;
	swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
	n = swait_private.nevents;
#elif defined(USE_SELECT)
	/* select() mutates its fd_sets, so work on copies. */
	memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
	       socketmgr->fd_bufsize);
	memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
	       socketmgr->fd_bufsize);

	swait_private.readset = socketmgr->read_fds_copy;
	swait_private.writeset = socketmgr->write_fds_copy;
	swait_private.maxfd = socketmgr->maxfd + 1;

	n = select(swait_private.maxfd, swait_private.readset,
		   swait_private.writeset, NULL, tvp);
#endif

	*swaitp = &swait_private;
	return (n);
}

/*
 * Non-threaded mode: dispatch the events gathered by the preceding
 * isc__socketmgr_waitevents() call.
 */
isc_result_t
isc__socketmgr_dispatch(isc_socketwait_t *swait) {
	REQUIRE(swait == &swait_private);

	if (socketmgr == NULL)
		return (ISC_R_NOTFOUND);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	(void)process_fds(socketmgr, socketmgr->events, swait->nevents);
	return (ISC_R_SUCCESS);
#elif defined(USE_SELECT)
	process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
	return (ISC_R_SUCCESS);
#endif
}
#endif /* ISC_PLATFORM_USETHREADS */

void
isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {

	/*
	 * Name 'socket'.
	 */

	REQUIRE(VALID_SOCKET(socket));

	LOCK(&socket->lock);
	/* memset + bounded strncpy guarantees NUL termination. */
	memset(socket->name, 0, sizeof(socket->name));
	strncpy(socket->name, name, sizeof(socket->name) - 1);
	socket->tag = tag;
	UNLOCK(&socket->lock);
}

/* Return the socket's name buffer; read without taking the socket lock. */
const char *
isc_socket_getname(isc_socket_t *socket) {
	return (socket->name);
}

/* Return the opaque tag set by isc_socket_setname(); no locking here. */
void *
isc_socket_gettag(isc_socket_t *socket) {
	return (socket->tag);
}

#ifdef HAVE_LIBXML2

/* Map a socket type enum to the label used in the XML statistics output. */
static const char *
_socktype(isc_sockettype_t type)
{
	if (type == isc_sockettype_udp)
		return ("udp");
	else if (type == isc_sockettype_tcp)
		return ("tcp");
	else if (type == isc_sockettype_unix)
		return ("unix");
	else if (type == isc_sockettype_fdwatch)
		return ("fdwatch");
	else
		return ("not-initialized");
}

/*
 * Render the socket manager's state (and every socket it holds) as XML
 * via libxml2's text writer, for the statistics channel.
 */
void
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
{
	isc_socket_t *sock;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	ISC_SOCKADDR_LEN_T len;

	LOCK(&mgr->lock);

#ifndef ISC_PLATFORM_USETHREADS
	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
	xmlTextWriterEndElement(writer);
#endif

	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		/* Each socket is dumped under its own lock. */
		LOCK(&sock->lock);
		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
		xmlTextWriterWriteFormatString(writer, "%p", sock);
		xmlTextWriterEndElement(writer);

		if (sock->name[0] != 0) {
			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
			xmlTextWriterWriteFormatString(writer, "%s",
						       sock->name);
			xmlTextWriterEndElement(writer); /* name */
		}

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
		xmlTextWriterEndElement(writer);

		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
					  ISC_XMLCHAR _socktype(sock->type));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "peer-address",
						  ISC_XMLCHAR peerbuf);
		}

		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			xmlTextWriterWriteElement(writer,
						  ISC_XMLCHAR "local-address",
						  ISC_XMLCHAR peerbuf);
		}

		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
		if (sock->pending_recv)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						ISC_XMLCHAR "pending-receive");
		if (sock->pending_send)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending-send");
		if (sock->pending_accept)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "pending_accept");
		if (sock->listener)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "listener");
		if (sock->connected)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connected");
		if (sock->connecting)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "connecting");
		if (sock->bound)
			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
						  ISC_XMLCHAR "bound");

		xmlTextWriterEndElement(writer); /* states */

		xmlTextWriterEndElement(writer); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	xmlTextWriterEndElement(writer); /* sockets */

	UNLOCK(&mgr->lock);
}
#endif /* HAVE_LIBXML2 */