1/* $NetBSD: socket.c,v 1.2 2011/08/16 04:45:17 christos Exp $ */ 2 3/* 4 * Copyright (C) 2004-2009 Internet Systems Consortium, Inc. ("ISC") 5 * Copyright (C) 1998-2003 Internet Software Consortium. 6 * 7 * Permission to use, copy, modify, and/or distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 12 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 13 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 14 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 15 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 16 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 17 * PERFORMANCE OF THIS SOFTWARE. 18 */ 19 20/* Id: socket.c,v 1.308.12.8 2009/04/18 01:29:26 jinmei Exp */ 21 22/*! 
\file */ 23 24#include <config.h> 25 26#include <sys/param.h> 27#include <sys/types.h> 28#include <sys/socket.h> 29#include <sys/stat.h> 30#include <sys/time.h> 31#include <sys/uio.h> 32 33#include <errno.h> 34#include <fcntl.h> 35#include <stddef.h> 36#include <stdlib.h> 37#include <string.h> 38#include <unistd.h> 39 40#include <isc/buffer.h> 41#include <isc/bufferlist.h> 42#include <isc/condition.h> 43#include <isc/formatcheck.h> 44#include <isc/list.h> 45#include <isc/log.h> 46#include <isc/mem.h> 47#include <isc/msgs.h> 48#include <isc/mutex.h> 49#include <isc/net.h> 50#include <isc/once.h> 51#include <isc/platform.h> 52#include <isc/print.h> 53#include <isc/region.h> 54#include <isc/socket.h> 55#include <isc/stats.h> 56#include <isc/strerror.h> 57#include <isc/task.h> 58#include <isc/thread.h> 59#include <isc/util.h> 60#include <isc/xml.h> 61 62#ifdef ISC_PLATFORM_HAVESYSUNH 63#include <sys/un.h> 64#endif 65#ifdef ISC_PLATFORM_HAVEKQUEUE 66#include <sys/event.h> 67#endif 68#ifdef ISC_PLATFORM_HAVEEPOLL 69#include <sys/epoll.h> 70#endif 71#ifdef ISC_PLATFORM_HAVEDEVPOLL 72#include <sys/devpoll.h> 73#endif 74 75#include "errno2result.h" 76 77#ifndef ISC_PLATFORM_USETHREADS 78#include "socket_p.h" 79#endif /* ISC_PLATFORM_USETHREADS */ 80 81#if defined(SO_BSDCOMPAT) && defined(__linux__) 82#include <sys/utsname.h> 83#endif 84 85/*% 86 * Choose the most preferable multiplex method. 
87 */ 88#ifdef ISC_PLATFORM_HAVEKQUEUE 89#define USE_KQUEUE 90#elif defined (ISC_PLATFORM_HAVEEPOLL) 91#define USE_EPOLL 92#elif defined (ISC_PLATFORM_HAVEDEVPOLL) 93#define USE_DEVPOLL 94typedef struct { 95 unsigned int want_read : 1, 96 want_write : 1; 97} pollinfo_t; 98#else 99#define USE_SELECT 100#endif /* ISC_PLATFORM_HAVEKQUEUE */ 101 102#ifndef ISC_PLATFORM_USETHREADS 103#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 104struct isc_socketwait { 105 int nevents; 106}; 107#elif defined (USE_SELECT) 108struct isc_socketwait { 109 fd_set *readset; 110 fd_set *writeset; 111 int nfds; 112 int maxfd; 113}; 114#endif /* USE_KQUEUE */ 115#endif /* !ISC_PLATFORM_USETHREADS */ 116 117/*% 118 * Maximum number of allowable open sockets. This is also the maximum 119 * allowable socket file descriptor. 120 * 121 * Care should be taken before modifying this value for select(): 122 * The API standard doesn't ensure select() accept more than (the system default 123 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in 124 * the vast majority of cases. This constant should therefore be increased only 125 * when absolutely necessary and possible, i.e., the server is exhausting all 126 * available file descriptors (up to FD_SETSIZE) and the select() function 127 * and FD_xxx macros support larger values than FD_SETSIZE (which may not 128 * always by true, but we keep using some of them to ensure as much 129 * portability as possible). Note also that overall server performance 130 * may be rather worsened with a larger value of this constant due to 131 * inherent scalability problems of select(). 132 * 133 * As a special note, this value shouldn't have to be touched if 134 * this is a build for an authoritative only DNS server. 
135 */ 136#ifndef ISC_SOCKET_MAXSOCKETS 137#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 138#define ISC_SOCKET_MAXSOCKETS 4096 139#elif defined(USE_SELECT) 140#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE 141#endif /* USE_KQUEUE... */ 142#endif /* ISC_SOCKET_MAXSOCKETS */ 143 144#ifdef USE_SELECT 145/*% 146 * Mac OS X needs a special definition to support larger values in select(). 147 * We always define this because a larger value can be specified run-time. 148 */ 149#ifdef __APPLE__ 150#define _DARWIN_UNLIMITED_SELECT 151#endif /* __APPLE__ */ 152#endif /* USE_SELECT */ 153 154#ifdef ISC_SOCKET_USE_POLLWATCH 155/*% 156 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel 157 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for 158 * some of the specified FD. The idea is based on the observation that it's 159 * likely for a busy server to keep receiving packets. It specifically works 160 * as follows: the socket watcher is first initialized with the state of 161 * "poll_idle". While it's in the idle state it keeps sleeping until a socket 162 * event occurs. When it wakes up for a socket I/O event, it moves to the 163 * poll_active state, and sets the poll timeout to a short period 164 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the 165 * watcher goes to the poll_checking state with the same timeout period. 166 * In this state, the watcher tries to detect whether this is a break 167 * during intermittent events or the kernel bug is triggered. If the next 168 * polling reports an event within the short period, the previous timeout is 169 * likely to be a kernel bug, and so the watcher goes back to the active state. 170 * Otherwise, it moves to the idle state again. 171 * 172 * It's not clear whether this is a thread-related bug, but since we've only 173 * seen this with threads, this workaround is used only when enabling threads. 
174 */ 175 176typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; 177 178#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT 179#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 180#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ 181#endif /* ISC_SOCKET_USE_POLLWATCH */ 182 183/*% 184 * Size of per-FD lock buckets. 185 */ 186#ifdef ISC_PLATFORM_USETHREADS 187#define FDLOCK_COUNT 1024 188#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) 189#else 190#define FDLOCK_COUNT 1 191#define FDLOCK_ID(fd) 0 192#endif /* ISC_PLATFORM_USETHREADS */ 193 194/*% 195 * Maximum number of events communicated with the kernel. There should normally 196 * be no need for having a large number. 197 */ 198#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 199#ifndef ISC_SOCKET_MAXEVENTS 200#define ISC_SOCKET_MAXEVENTS 64 201#endif 202#endif 203 204/*% 205 * Some systems define the socket length argument as an int, some as size_t, 206 * some as socklen_t. This is here so it can be easily changed if needed. 207 */ 208#ifndef ISC_SOCKADDR_LEN_T 209#define ISC_SOCKADDR_LEN_T unsigned int 210#endif 211 212/*% 213 * Define what the possible "soft" errors can be. These are non-fatal returns 214 * of various network related functions, like recv() and so on. 215 * 216 * For some reason, BSDI (and perhaps others) will sometimes return <0 217 * from recv() but will have errno==0. This is broken, but we have to 218 * work around it here. 219 */ 220#define SOFT_ERROR(e) ((e) == EAGAIN || \ 221 (e) == EWOULDBLOCK || \ 222 (e) == EINTR || \ 223 (e) == 0) 224 225#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x) 226 227/*!< 228 * DLVL(90) -- Function entry/exit and other tracing. 229 * DLVL(70) -- Socket "correctness" -- including returning of events, etc. 230 * DLVL(60) -- Socket data send/receive 231 * DLVL(50) -- Event tracing, including receiving/sending completion events. 232 * DLVL(20) -- Socket creation/destruction. 
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

/*
 * Internal event posted to a socket's task when its descriptor becomes
 * readable or writable (see readable_ev/writable_ev below).
 */
typedef isc_event_t intev_t;

#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)

/*!
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*%
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*%
 * The size to raise the receive buffer to (from BIND 8).
 */
#define RCVBUFSIZE (32*1024)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

struct isc_socket {
	/* Not locked. */
	unsigned int		magic;		/* SOCKET_MAGIC when valid */
	isc_socketmgr_t		*manager;	/* owning manager */
	isc_mutex_t		lock;		/* protects fields below */
	isc_sockettype_t	type;
	const isc_statscounter_t	*statsindex; /* STATID_* -> counter id */

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;		/* entry in manager->socklist */
	unsigned int		references;	/* object dies at zero (SOCK_DEAD) */
	int			fd;		/* underlying OS descriptor */
	int			pf;		/* protocol family (AF_INET/AF_INET6/...) */
	char			name[16];
	void *			tag;

	/* Pending I/O completion events, per direction. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t			*connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;	/* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1,	/* listener socket */
				connected : 1,
				connecting : 1,	/* connect pending */
				bound : 1;	/* bound to local addr */

#ifdef ISC_NET_RECVOVERFLOW
	unsigned char		overflow;	/* used for MSG_TRUNC fake */
#endif

	/* Buffers for ancillary (control-message) data, recv and send side. */
	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;

	/* fdwatch-type sockets: user callback, its argument and task. */
	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
};

#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

struct isc_socketmgr {
	/* Not locked. */
	unsigned int		magic;		/* SOCKET_MANAGER_MAGIC when valid */
	isc_mem_t		*mctx;
	isc_mutex_t		lock;
	isc_mutex_t		*fdlock;	/* array of FDLOCK_COUNT bucket locks */
	isc_stats_t		*stats;
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
#ifdef USE_SELECT
	int			fd_bufsize;
#endif	/* USE_SELECT */
	unsigned int		maxsocks;	/* max simultaneously open FDs */
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];	/* self-pipe used by select_poke() */
#endif

	/* Locked by fdlock. */
	isc_socket_t	       **fds;		/* fd -> socket map */
	int			*fdstate;	/* fd -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;	/* current read/write interest per fd */
#endif

	/* Locked by manager lock.
 */
	ISC_LIST(isc_socket_t)	socklist;
#ifdef USE_SELECT
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
	int			maxfd;
#endif	/* USE_SELECT */
	int			reserved;	/* unlocked */
#ifdef ISC_PLATFORM_USETHREADS
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
#else /* ISC_PLATFORM_USETHREADS */
	unsigned int		refs;
#endif /* ISC_PLATFORM_USETHREADS */
};

#ifndef ISC_PLATFORM_USETHREADS
/* Without threads there is a single, shared manager instance. */
static isc_socketmgr_t *socketmgr = NULL;
#endif /* ISC_PLATFORM_USETHREADS */

/* Values for manager->fdstate[]. */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
#ifdef ISC_PLATFORM_USETHREADS
static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
#endif

/*
 * Commands carried in the "msg" slot of a watcher poke (see select_poke()
 * and wakeup_socket()); _READ/_ACCEPT and _WRITE/_CONNECT share values.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)

#define SOCK_DEAD(s)			((s)->references == 0)

/*%
 * Shortcut index arrays to get access to statistics counters.
 * Each per-type table below is indexed by these STATID_* values;
 * a -1 entry means the event cannot occur for that socket type
 * (inc_stats() REQUIREs the id is not -1).
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9
};
/* NOTE(review): "upd4"/"upd6" look like transpositions of "udp4"/"udp6";
 * renaming would touch uses elsewhere in this file, so left as is. */
static const isc_statscounter_t upd4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail
};
static const isc_statscounter_t upd6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
	isc_sockstatscounter_tcp4recvfail
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
	isc_sockstatscounter_tcp6recvfail
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
	isc_sockstatscounter_unixrecvfail
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
	isc_sockstatscounter_fdwatchrecvfail
};

/*
 * Log a printf-style message tagged with the manager's address.
 * Returns early (cheaply) if nothing would be logged at 'level'.
 */
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

/*
 * Log a printf-style message tagged with the socket's address and,
 * if 'address' is non-NULL, the formatted peer address.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 */
static void
FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
{
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	/* Only IPv6 UDP sockets are affected by the AIX bug. */
	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
	}
}
#else
/* On other platforms the workaround is a no-op. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif

/*%
 * Increment socket-related statistics counters.
 * A NULL 'stats' is allowed (counting disabled); counterid must not be -1.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats != NULL)
		isc_stats_increment(stats, counterid);
}

/*%
 * Start watching 'fd' for readability (SELECT_POKE_READ) or writability
 * (any other msg value) using whichever event mechanism was selected at
 * build time.  Returns ISC_R_SUCCESS or an error converted from errno.
 */
static inline isc_result_t
watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	event.data.fd = fd;
	/* EEXIST just means the fd is already registered; not an error. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	/* fdlock also guards fdpollinfo[], updated below on success. */
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*%
 * Stop watching 'fd' for the given direction; the inverse of watch_fd().
 */
static inline isc_result_t
unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	event.data.fd = fd;
	/* ENOENT means the fd was never (or is no longer) registered. */
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*%
 * Act on a poke for 'fd': close it (SELECT_POKE_CLOSE) or start watching
 * it for the requested direction, skipping fds that are being closed or
 * are not managed.
 */
static void
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
		return;
	}
	UNLOCK(&manager->fdlock[lockid]);

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}

#ifdef ISC_PLATFORM_USETHREADS
/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 */
static void
select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	/* Wire format of a poke: { fd, msg } as two ints. */
	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
			    strbuf);
	}

	INSIST(cc == sizeof(buf));
}

/*
 * Read a message on the internal fd.
 * On soft errors *msg is set to SELECT_POKE_NOTHING and *fd to -1.
 */
static void
select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		*msg = SELECT_POKE_NOTHING;
		*fd = -1;	/* Silence compiler. */
		if (SOFT_ERROR(errno))
			return;

		isc__strerror(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
			    strbuf);

		return;
	}
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}
#else /* ISC_PLATFORM_USETHREADS */
/*
 * Update the state of the socketmgr when something changes.
 * Without threads there is no watcher to poke, so act directly.
 */
static void
select_poke(isc_socketmgr_t *manager, int fd, int msg) {
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
	return;
}
#endif /* ISC_PLATFORM_USETHREADS */

/*
 * Make a fd non-blocking.
 * Uses FIONBIO ioctl where required, O_NONBLOCK-style fcntl otherwise.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	int flags;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;

	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif

	if (ret == -1) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	ISC_SOCKADDR_LEN_T hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif
}

static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	/*
	 * Emulate CMSG_SPACE by building a dummy message and asking
	 * CMSG_NXTHDR where the next header would start.
	 */
	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL)
		return ((char *)cmsgp - (char *)msg.msg_control);
	else
		return (0);
#endif
}
#endif /* USE_CMSG */

/*
 * Process control messages received on a socket.
 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

#ifdef ISC_NET_BSD44MSGHDR

#ifdef MSG_TRUNC
	/* Datagram was larger than the supplied buffer. */
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
#endif

#ifdef MSG_CTRUNC
	/* Control (ancillary) data was truncated. */
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
#endif

#ifndef USE_CMSG
	return;
#else
	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
		return;

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
	pktinfop = NULL;
#endif

	/*
	 * Walk the ancillary data, copying any IPV6_PKTINFO (destination
	 * address/interface) and SCM_TIMESTAMP data into the event.
	 */
	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);

#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {

			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

#endif /* ISC_NET_BSD44MSGHDR */
}

/*
 * Construct an iov array
and attach it to the msghdr passed in. This is 1117 * the SEND constructor, which will use the used region of the buffer 1118 * (if using a buffer list) or will use the internal region (if a single 1119 * buffer I/O is requested). 1120 * 1121 * Nothing can be NULL, and the done event must list at least one buffer 1122 * on the buffer linked list for this function to be meaningful. 1123 * 1124 * If write_countp != NULL, *write_countp will hold the number of bytes 1125 * this transaction can send. 1126 */ 1127static void 1128build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, 1129 struct msghdr *msg, struct iovec *iov, size_t *write_countp) 1130{ 1131 unsigned int iovcount; 1132 isc_buffer_t *buffer; 1133 isc_region_t used; 1134 size_t write_count; 1135 size_t skip_count; 1136 1137 memset(msg, 0, sizeof(*msg)); 1138 1139 if (!sock->connected) { 1140 msg->msg_name = (void *)&dev->address.type.sa; 1141 msg->msg_namelen = dev->address.length; 1142 } else { 1143 msg->msg_name = NULL; 1144 msg->msg_namelen = 0; 1145 } 1146 1147 buffer = ISC_LIST_HEAD(dev->bufferlist); 1148 write_count = 0; 1149 iovcount = 0; 1150 1151 /* 1152 * Single buffer I/O? Skip what we've done so far in this region. 1153 */ 1154 if (buffer == NULL) { 1155 write_count = dev->region.length - dev->n; 1156 iov[0].iov_base = (void *)(dev->region.base + dev->n); 1157 iov[0].iov_len = write_count; 1158 iovcount = 1; 1159 1160 goto config; 1161 } 1162 1163 /* 1164 * Multibuffer I/O. 1165 * Skip the data in the buffer list that we have already written. 
1166 */ 1167 skip_count = dev->n; 1168 while (buffer != NULL) { 1169 REQUIRE(ISC_BUFFER_VALID(buffer)); 1170 if (skip_count < isc_buffer_usedlength(buffer)) 1171 break; 1172 skip_count -= isc_buffer_usedlength(buffer); 1173 buffer = ISC_LIST_NEXT(buffer, link); 1174 } 1175 1176 while (buffer != NULL) { 1177 INSIST(iovcount < MAXSCATTERGATHER_SEND); 1178 1179 isc_buffer_usedregion(buffer, &used); 1180 1181 if (used.length > 0) { 1182 iov[iovcount].iov_base = (void *)(used.base 1183 + skip_count); 1184 iov[iovcount].iov_len = used.length - skip_count; 1185 write_count += (used.length - skip_count); 1186 skip_count = 0; 1187 iovcount++; 1188 } 1189 buffer = ISC_LIST_NEXT(buffer, link); 1190 } 1191 1192 INSIST(skip_count == 0U); 1193 1194 config: 1195 msg->msg_iov = iov; 1196 msg->msg_iovlen = iovcount; 1197 1198#ifdef ISC_NET_BSD44MSGHDR 1199 msg->msg_control = NULL; 1200 msg->msg_controllen = 0; 1201 msg->msg_flags = 0; 1202#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO) 1203 if ((sock->type == isc_sockettype_udp) 1204 && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) { 1205 struct cmsghdr *cmsgp; 1206 struct in6_pktinfo *pktinfop; 1207 1208 socket_log(sock, NULL, TRACE, 1209 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA, 1210 "sendto pktinfo data, ifindex %u", 1211 dev->pktinfo.ipi6_ifindex); 1212 1213 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo)); 1214 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen); 1215 msg->msg_control = (void *)sock->sendcmsgbuf; 1216 1217 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf; 1218 cmsgp->cmsg_level = IPPROTO_IPV6; 1219 cmsgp->cmsg_type = IPV6_PKTINFO; 1220 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo)); 1221 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); 1222 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo)); 1223 } 1224#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */ 1225#else /* ISC_NET_BSD44MSGHDR */ 1226 msg->msg_accrights = NULL; 1227 msg->msg_accrightslen = 0; 
1228#endif /* ISC_NET_BSD44MSGHDR */ 1229 1230 if (write_countp != NULL) 1231 *write_countp = write_count; 1232} 1233 1234/* 1235 * Construct an iov array and attach it to the msghdr passed in. This is 1236 * the RECV constructor, which will use the available region of the buffer 1237 * (if using a buffer list) or will use the internal region (if a single 1238 * buffer I/O is requested). 1239 * 1240 * Nothing can be NULL, and the done event must list at least one buffer 1241 * on the buffer linked list for this function to be meaningful. 1242 * 1243 * If read_countp != NULL, *read_countp will hold the number of bytes 1244 * this transaction can receive. 1245 */ 1246static void 1247build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev, 1248 struct msghdr *msg, struct iovec *iov, size_t *read_countp) 1249{ 1250 unsigned int iovcount; 1251 isc_buffer_t *buffer; 1252 isc_region_t available; 1253 size_t read_count; 1254 1255 memset(msg, 0, sizeof(struct msghdr)); 1256 1257 if (sock->type == isc_sockettype_udp) { 1258 memset(&dev->address, 0, sizeof(dev->address)); 1259#ifdef BROKEN_RECVMSG 1260 if (sock->pf == AF_INET) { 1261 msg->msg_name = (void *)&dev->address.type.sin; 1262 msg->msg_namelen = sizeof(dev->address.type.sin6); 1263 } else if (sock->pf == AF_INET6) { 1264 msg->msg_name = (void *)&dev->address.type.sin6; 1265 msg->msg_namelen = sizeof(dev->address.type.sin6); 1266#ifdef ISC_PLATFORM_HAVESYSUNH 1267 } else if (sock->pf == AF_UNIX) { 1268 msg->msg_name = (void *)&dev->address.type.sunix; 1269 msg->msg_namelen = sizeof(dev->address.type.sunix); 1270#endif 1271 } else { 1272 msg->msg_name = (void *)&dev->address.type.sa; 1273 msg->msg_namelen = sizeof(dev->address.type); 1274 } 1275#else 1276 msg->msg_name = (void *)&dev->address.type.sa; 1277 msg->msg_namelen = sizeof(dev->address.type); 1278#endif 1279#ifdef ISC_NET_RECVOVERFLOW 1280 /* If needed, steal one iovec for overflow detection. 
*/ 1281 maxiov--; 1282#endif 1283 } else { /* TCP */ 1284 msg->msg_name = NULL; 1285 msg->msg_namelen = 0; 1286 dev->address = sock->peer_address; 1287 } 1288 1289 buffer = ISC_LIST_HEAD(dev->bufferlist); 1290 read_count = 0; 1291 1292 /* 1293 * Single buffer I/O? Skip what we've done so far in this region. 1294 */ 1295 if (buffer == NULL) { 1296 read_count = dev->region.length - dev->n; 1297 iov[0].iov_base = (void *)(dev->region.base + dev->n); 1298 iov[0].iov_len = read_count; 1299 iovcount = 1; 1300 1301 goto config; 1302 } 1303 1304 /* 1305 * Multibuffer I/O. 1306 * Skip empty buffers. 1307 */ 1308 while (buffer != NULL) { 1309 REQUIRE(ISC_BUFFER_VALID(buffer)); 1310 if (isc_buffer_availablelength(buffer) != 0) 1311 break; 1312 buffer = ISC_LIST_NEXT(buffer, link); 1313 } 1314 1315 iovcount = 0; 1316 while (buffer != NULL) { 1317 INSIST(iovcount < MAXSCATTERGATHER_RECV); 1318 1319 isc_buffer_availableregion(buffer, &available); 1320 1321 if (available.length > 0) { 1322 iov[iovcount].iov_base = (void *)(available.base); 1323 iov[iovcount].iov_len = available.length; 1324 read_count += available.length; 1325 iovcount++; 1326 } 1327 buffer = ISC_LIST_NEXT(buffer, link); 1328 } 1329 1330 config: 1331 1332 /* 1333 * If needed, set up to receive that one extra byte. Note that 1334 * we know there is at least one iov left, since we stole it 1335 * at the top of this function. 
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)
	/* Receive ancillary data (pktinfo/timestamp) for UDP sockets. */
	if (sock->type == isc_sockettype_udp) {
		msg->msg_control = sock->recvcmsgbuf;
		msg->msg_controllen = sock->recvcmsgbuflen;
	}
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
#endif /* ISC_NET_BSD44MSGHDR */

	if (read_countp != NULL)
		*read_countp = read_count;
}

/*
 * Fill in dev->address: for UDP use the supplied address (or the peer
 * address if none was given); for TCP the peer address is always used
 * and no explicit address may be supplied.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL)
			dev->address = *address;
		else
			dev->address = sock->peer_address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->peer_address;
	}
}

/*
 * Event destructor for socket events: insist that all buffers have been
 * consumed, then chain to the original destroy routine saved by
 * allocate_socketevent().
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	INSIST(ISC_LIST_EMPTY(ev->bufferlist));

	(ev->destroy)(event);
}

/*
 * Allocate and initialize a socket event of type 'eventtype' for 'sock',
 * to be delivered via 'action' with 'arg'.  Returns NULL on allocation
 * failure.
 */
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
		     isc_taskaction_t action, const void *arg)
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
						     sizeof(*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
	ISC_LINK_INIT(ev, ev_link);
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;

	/*
	 * Interpose destroy_socketevent() so the bufferlist invariant is
	 * checked before the event is really destroyed.
	 */
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;

	return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: dump the contents of a msghdr to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%d\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
#endif

#define DOIO_SUCCESS	0	/* i/o ok, event sent */
#define DOIO_SOFT	1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD	2	/* i/o error, event sent */
#define DOIO_EOF	3	/* EOF, no event sent */

/*
 * Attempt to complete the outstanding recv described by 'dev' on 'sock'.
 * Returns one of the DOIO_* codes above; on DOIO_HARD dev->result holds
 * the mapped error, and on DOIO_SUCCESS it holds ISC_R_SUCCESS.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	cc = recvmsg(sock->fd, &msghdr, 0);
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_DOIORECV,
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
#ifdef EPROTO
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
#endif
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0))
		return (DOIO_EOF);

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Silently drop datagrams with a source port of zero. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_ZEROPORT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0U) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
 *			ISC_R_SUCCESS.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			dev->result contains the appropriate error.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
 */
static int
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];

	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);

 resend:
	cc = sendmsg(sock->fd, &msghdr, 0);
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry a bounded number of times if interrupted. */
		if (send_errno == EINTR && ++attempts < NRETRIES)
			goto resend;

		if (SOFT_ERROR(send_errno))
			return (DOIO_SOFT);

#define SOFT_OR_HARD(_system, _isc) \
	if (send_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			inc_stats(sock->manager->stats, \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (send_errno == _system) { \
		dev->result = _isc; \
		inc_stats(sock->manager->stats, \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		isc__strerror(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() %s 0",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_RETURNED, "returned"));
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Kill.
 *
 * Caller must ensure that the socket is not locked and no external
 * references exist.
 */
static void
closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
	isc_sockettype_t type = sock->type;
	int lockid = FDLOCK_ID(fd);

	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&manager->fdlock[lockid]);
	manager->fds[fd] = NULL;
	if (type == isc_sockettype_fdwatch)
		manager->fdstate[fd] = CLOSED;
	else
		manager->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&manager->fdlock[lockid]);
	if (type == isc_sockettype_fdwatch) {
		/*
		 * The caller may close the socket once this function returns,
		 * and `fd' may be reassigned for a new socket.  So we do
		 * unwatch_fd() here, rather than defer it via select_poke().
		 * Note: this may complicate data protection among threads and
		 * may reduce performance due to additional locks.  One way to
		 * solve this would be to dup() the watched descriptor, but we
		 * take a simpler approach at this moment.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
	} else
		select_poke(manager, fd, SELECT_POKE_CLOSE);

	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&manager->lock);
	if (manager->maxfd == fd) {
		int i;

		manager->maxfd = 0;
		/* Scan downward for the next highest managed descriptor. */
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&manager->fdlock[lockid]);
			if (manager->fdstate[i] == MANAGED) {
				manager->maxfd = i;
				UNLOCK(&manager->fdlock[lockid]);
				break;
			}
			UNLOCK(&manager->fdlock[lockid]);
		}
#ifdef ISC_PLATFORM_USETHREADS
		if (manager->maxfd < manager->pipe_fds[0])
			manager->maxfd = manager->pipe_fds[0];
#endif
	}
	UNLOCK(&manager->lock);
#endif	/* USE_SELECT */
}

/*
 * Tear down a socket with no remaining external references: close its
 * descriptor (if still open), unlink it from the manager's socket list,
 * and free it.
 */
static void
destroy(isc_socket_t **sockp) {
	int fd;
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DESTROYING, "destroying");

	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		fd = sock->fd;
		sock->fd = -1;
		closesocket(manager, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

#ifdef ISC_PLATFORM_USETHREADS
	/* Wake anyone waiting for the socket list to drain. */
	if (ISC_LIST_EMPTY(manager->socklist))
		SIGNAL(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */

	UNLOCK(&manager->lock);

	free_socket(sockp);
}

/*
 * Allocate and initialize a new isc_socket_t of 'type' for 'manager',
 * including cmsg receive/send buffers sized for the ancillary data this
 * build can use.  On failure everything allocated here is freed again.
 */
static isc_result_t
allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
		isc_socket_t **socketp)
{
	isc_socket_t *sock;
	isc_result_t result;
	ISC_SOCKADDR_LEN_T cmsgbuflen;

	sock = isc_mem_get(manager->mctx, sizeof(*sock));

	if (sock == NULL)
		return (ISC_R_NOMEMORY);

	result = ISC_R_UNEXPECTED;

	sock->magic = 0;
	sock->references = 0;

	sock->manager = manager;
	sock->type = type;
	sock->fd = -1;
	sock->statsindex = NULL;

	ISC_LINK_INIT(sock, link);

	sock->recvcmsgbuf = NULL;
	sock->sendcmsgbuf = NULL;

	/*
	 * set up cmsg buffers
	 */
	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
#endif
#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
	cmsgbuflen += cmsg_space(sizeof(struct timeval));
#endif
	sock->recvcmsgbuflen = cmsgbuflen;
	if (sock->recvcmsgbuflen != 0U) {
		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->recvcmsgbuf == NULL)
			goto error;
	}

	cmsgbuflen = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
	cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
#endif
	sock->sendcmsgbuflen = cmsgbuflen;
	if (sock->sendcmsgbuflen != 0U) {
		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
		if (sock->sendcmsgbuf == NULL)
			goto error;
	}

	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;

	/*
	 * set up list of readers and writers to be initially empty
	 */
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
	ISC_LIST_INIT(sock->accept_list);
	sock->connect_ev = NULL;
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;

	/*
	 * initialize the lock
	 */
	result = isc_mutex_init(&sock->lock);
	if (result != ISC_R_SUCCESS) {
		sock->magic = 0;
		goto error;
	}

	/*
	 * Initialize readable and writable events
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
		       NULL, sock, sock, NULL, NULL);
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
		       NULL, sock, sock, NULL, NULL);

	sock->magic = SOCKET_MAGIC;
	*socketp = sock;

	return (ISC_R_SUCCESS);

 error:
	/* Undo any allocation done above before reporting failure. */
	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);
	isc_mem_put(manager->mctx, sock, sizeof(*sock));

	return (result);
}

/*
 * This event requires that the various lists be empty, that the reference
 * count be 1, and that the magic number is valid.  The other socket bits,
 * like the lock, must be initialized as well.  The fd associated must be
 * marked as closed, by setting it to -1 on close, or this routine will
 * also close the socket.
 */
static void
free_socket(isc_socket_t **socketp) {
	isc_socket_t *sock = *socketp;

	INSIST(sock->references == 0);
	INSIST(VALID_SOCKET(sock));
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(!ISC_LINK_LINKED(sock, link));

	if (sock->recvcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
			    sock->recvcmsgbuflen);
	if (sock->sendcmsgbuf != NULL)
		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
			    sock->sendcmsgbuflen);

	sock->magic = 0;

	DESTROYLOCK(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));

	*socketp = NULL;
}

#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  Having to workout
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this a build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * Run once: on Linux 2.4 and later, turn off the use of SO_BSDCOMPAT
 * (the kernel no longer needs it).
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname buf;
	char *endp;
	long int major;
	long int minor;

	uname(&buf);	/* Can only fail if buf is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname().
	 */
	major = strtol(buf.release, &endp, 10);
	if (*endp == '.') {
		minor = strtol(endp+1, &endp, 10);
		if ((major > 2) || ((major == 2) && (minor >= 4))) {
			bsdcompat = ISC_FALSE;
		}
	}
#endif /* __linux__ */
}
#endif

/*
 * Open the underlying OS descriptor for 'sock' and apply the socket
 * options this build uses (non-blocking mode, SO_BSDCOMPAT,
 * SO_NOSIGPIPE, receive timestamping, IPv6 packet info, path-MTU
 * settings and a minimum receive buffer size).
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
	int on = 1;
#endif
#if defined(SO_RCVBUF)
	ISC_SOCKADDR_LEN_T optlen;
	int size;
#endif

 again:
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	case isc_sockettype_unix:
		sock->fd = socket(sock->pf, SOCK_STREAM, 0);
		break;
	case isc_sockettype_fdwatch:
		/*
		 * We should not be called for isc_sockettype_fdwatch sockets.
		 */
		INSIST(0);
		break;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
		goto again;

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int new, tmp;
		new = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = new;
		err = "isc_socket_create: fcntl";
	}
#endif

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			       isc_msgcat, ISC_MSGSET_SOCKET,
			       ISC_MSG_TOOMANYFDS,
			       "socket: file descriptor exceeds limit (%d/%u)",
			       sock->fd, manager->maxsocks);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				       isc_msgcat, ISC_MSGSET_SOCKET,
				       ISC_MSG_TOOMANYFDS,
				       "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (ISC_R_UNEXPECTED);
	}

#ifdef SO_BSDCOMPAT
	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
				  clear_bsdcompat) == ISC_R_SUCCESS);
	if (sock->type != isc_sockettype_unix && bsdcompat &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
		       (void *)&on, sizeof(on)) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
				 sock->fd,
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		/* Press on... */
	}
#endif

#if defined(USE_CMSG) || defined(SO_RCVBUF)
	if (sock->type == isc_sockettype_udp) {

#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
			       (void *)&on, sizeof(on)) < 0
		    && errno != ENOPROTOOPT) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#if defined(ISC_PLATFORM_HAVEIPV6)
		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
			/*
			 * Warn explicitly because this anomaly can be hidden
			 * in usual operation (and unexpectedly appear later).
			 */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "No buffer available to receive "
					 "IPv6 destination");
		}
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#else
		/* RFC 2292 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
					 sock->fd,
					 isc_msgcat_get(isc_msgcat,
							ISC_MSGSET_GENERAL,
							ISC_MSG_FAILED,
							"failed"),
					 strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
#ifdef IPV6_USE_MIN_MTU	/* RFC 3542, not too common yet*/
		/* use minimum MTU */
		if (sock->pf == AF_INET6) {
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_USE_MIN_MTU,
					 (void *)&on, sizeof(on));
		}
#endif
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */

#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int action = IP_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
					 &action, sizeof(action));
		}
#endif
#if defined(IP_DONTFRAG)
		/*
		 * Clear the DF (don't-fragment) flag on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int off = 0;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
					 &off, sizeof(off));
		}
#endif

#if defined(SO_RCVBUF)
		/* Grow the receive buffer to at least RCVBUFSIZE. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
			       (void *)&size, &optlen) >= 0 &&
		     size < RCVBUFSIZE) {
			size = RCVBUFSIZE;
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&size, sizeof(size)) == -1) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
					sock->fd, size,
					isc_msgcat_get(isc_msgcat,
						       ISC_MSGSET_GENERAL,
						       ISC_MSG_FAILED,
						       "failed"),
					strbuf);
			}
		}
#endif
	}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */

	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);

	return (ISC_R_SUCCESS);
}

/*%
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
isc_result_t
isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
                  isc_socket_t **socketp)
{
        isc_socket_t *sock = NULL;
        isc_result_t result;
        int lockid;

        REQUIRE(VALID_MANAGER(manager));
        REQUIRE(socketp != NULL && *socketp == NULL);
        /* fdwatch sockets must be created via isc_socket_fdwatchcreate(). */
        REQUIRE(type != isc_sockettype_fdwatch);

        result = allocate_socket(manager, type, &sock);
        if (result != ISC_R_SUCCESS)
                return (result);

        /*
         * Select the statistics-counter index table matching the socket
         * type and protocol family.
         */
        switch (sock->type) {
        case isc_sockettype_udp:
                sock->statsindex =
                        (pf == AF_INET) ? upd4statsindex : upd6statsindex;
                break;
        case isc_sockettype_tcp:
                sock->statsindex =
                        (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
                break;
        case isc_sockettype_unix:
                sock->statsindex = unixstatsindex;
                break;
        default:
                /* Any other type is a programming error. */
                INSIST(0);
        }

        sock->pf = pf;
        result = opensocket(manager, sock);
        if (result != ISC_R_SUCCESS) {
                inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
                free_socket(&sock);
                return (result);
        }

        sock->references = 1;
        *socketp = sock;

        /*
         * Note we don't have to lock the socket like we normally would because
         * there are no external references to it yet.
         */

        /* Register the new fd with the manager under its fd-lock bucket. */
        lockid = FDLOCK_ID(sock->fd);
        LOCK(&manager->fdlock[lockid]);
        manager->fds[sock->fd] = sock;
        manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
        INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
               sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
        UNLOCK(&manager->fdlock[lockid]);

        LOCK(&manager->lock);
        ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
        if (manager->maxfd < sock->fd)
                manager->maxfd = sock->fd;
#endif
        UNLOCK(&manager->lock);

        socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
                   ISC_MSG_CREATED, "created");

        return (ISC_R_SUCCESS);
}

/*
 * Open an OS descriptor for 'sock', which must currently be closed
 * (sock->fd == -1) and held by exactly one reference.  On success the
 * new fd is registered with the manager; on failure sock->fd stays -1.
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
        isc_result_t result;

        REQUIRE(VALID_SOCKET(sock));

        LOCK(&sock->lock);
        REQUIRE(sock->references == 1);
        REQUIRE(sock->type != isc_sockettype_fdwatch);
        UNLOCK(&sock->lock);
        /*
         * We don't need to retain the lock hereafter, since no one else has
         * this socket.
         */
        REQUIRE(sock->fd == -1);

        result = opensocket(sock->manager, sock);
        if (result != ISC_R_SUCCESS)
                sock->fd = -1;

        if (result == ISC_R_SUCCESS) {
                int lockid = FDLOCK_ID(sock->fd);

                LOCK(&sock->manager->fdlock[lockid]);
                sock->manager->fds[sock->fd] = sock;
                sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
                INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
                       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
                UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
                LOCK(&sock->manager->lock);
                if (sock->manager->maxfd < sock->fd)
                        sock->manager->maxfd = sock->fd;
                UNLOCK(&sock->manager->lock);
#endif
        }

        return (result);
}

/*
 * Create a new 'type' socket managed by 'manager'.
 Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
isc_result_t
isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
                         isc_sockfdwatch_t callback, void *cbarg,
                         isc_task_t *task, isc_socket_t **socketp)
{
        isc_socket_t *sock = NULL;
        isc_result_t result;
        int lockid;

        REQUIRE(VALID_MANAGER(manager));
        REQUIRE(socketp != NULL && *socketp == NULL);

        result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
        if (result != ISC_R_SUCCESS)
                return (result);

        /*
         * An fdwatch socket wraps a descriptor supplied by the caller;
         * no OS socket is opened here.  The callback and its argument
         * are invoked from the internal_fdwatch_* handlers.
         */
        sock->fd = fd;
        sock->fdwatcharg = cbarg;
        sock->fdwatchcb = callback;
        sock->fdwatchflags = flags;
        sock->fdwatchtask = task;
        sock->statsindex = fdwatchstatsindex;

        sock->references = 1;
        *socketp = sock;

        /*
         * Note we don't have to lock the socket like we normally would because
         * there are no external references to it yet.
         */

        lockid = FDLOCK_ID(sock->fd);
        LOCK(&manager->fdlock[lockid]);
        manager->fds[sock->fd] = sock;
        manager->fdstate[sock->fd] = MANAGED;
        UNLOCK(&manager->fdlock[lockid]);

        LOCK(&manager->lock);
        ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
        if (manager->maxfd < sock->fd)
                manager->maxfd = sock->fd;
#endif
        UNLOCK(&manager->lock);

        /* Start watching the fd in whichever direction(s) were requested. */
        if (flags & ISC_SOCKFDWATCH_READ)
                select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
        if (flags & ISC_SOCKFDWATCH_WRITE)
                select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);

        socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
                   ISC_MSG_CREATED, "fdwatch-created");

        return (ISC_R_SUCCESS);
}

/*
 * Attach to a socket.  Caller must explicitly detach when it is done.
2464 */ 2465void 2466isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) { 2467 REQUIRE(VALID_SOCKET(sock)); 2468 REQUIRE(socketp != NULL && *socketp == NULL); 2469 2470 LOCK(&sock->lock); 2471 sock->references++; 2472 UNLOCK(&sock->lock); 2473 2474 *socketp = sock; 2475} 2476 2477/* 2478 * Dereference a socket. If this is the last reference to it, clean things 2479 * up by destroying the socket. 2480 */ 2481void 2482isc_socket_detach(isc_socket_t **socketp) { 2483 isc_socket_t *sock; 2484 isc_boolean_t kill_socket = ISC_FALSE; 2485 2486 REQUIRE(socketp != NULL); 2487 sock = *socketp; 2488 REQUIRE(VALID_SOCKET(sock)); 2489 2490 LOCK(&sock->lock); 2491 REQUIRE(sock->references > 0); 2492 sock->references--; 2493 if (sock->references == 0) 2494 kill_socket = ISC_TRUE; 2495 UNLOCK(&sock->lock); 2496 2497 if (kill_socket) 2498 destroy(&sock); 2499 2500 *socketp = NULL; 2501} 2502 2503isc_result_t 2504isc_socket_close(isc_socket_t *sock) { 2505 int fd; 2506 isc_socketmgr_t *manager; 2507 isc_sockettype_t type; 2508 2509 REQUIRE(VALID_SOCKET(sock)); 2510 2511 LOCK(&sock->lock); 2512 2513 REQUIRE(sock->references == 1); 2514 REQUIRE(sock->type != isc_sockettype_fdwatch); 2515 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); 2516 2517 INSIST(!sock->connecting); 2518 INSIST(!sock->pending_recv); 2519 INSIST(!sock->pending_send); 2520 INSIST(!sock->pending_accept); 2521 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 2522 INSIST(ISC_LIST_EMPTY(sock->send_list)); 2523 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 2524 INSIST(sock->connect_ev == NULL); 2525 2526 manager = sock->manager; 2527 type = sock->type; 2528 fd = sock->fd; 2529 sock->fd = -1; 2530 memset(sock->name, 0, sizeof(sock->name)); 2531 sock->tag = NULL; 2532 sock->listener = 0; 2533 sock->connected = 0; 2534 sock->connecting = 0; 2535 sock->bound = 0; 2536 isc_sockaddr_any(&sock->peer_address); 2537 2538 UNLOCK(&sock->lock); 2539 2540 closesocket(manager, sock, fd); 2541 2542 return 
(ISC_R_SUCCESS); 2543} 2544 2545/* 2546 * I/O is possible on a given socket. Schedule an event to this task that 2547 * will call an internal function to do the I/O. This will charge the 2548 * task with the I/O operation and let our select loop handler get back 2549 * to doing something real as fast as possible. 2550 * 2551 * The socket and manager must be locked before calling this function. 2552 */ 2553static void 2554dispatch_recv(isc_socket_t *sock) { 2555 intev_t *iev; 2556 isc_socketevent_t *ev; 2557 isc_task_t *sender; 2558 2559 INSIST(!sock->pending_recv); 2560 2561 if (sock->type != isc_sockettype_fdwatch) { 2562 ev = ISC_LIST_HEAD(sock->recv_list); 2563 if (ev == NULL) 2564 return; 2565 socket_log(sock, NULL, EVENT, NULL, 0, 0, 2566 "dispatch_recv: event %p -> task %p", 2567 ev, ev->ev_sender); 2568 sender = ev->ev_sender; 2569 } else { 2570 sender = sock->fdwatchtask; 2571 } 2572 2573 sock->pending_recv = 1; 2574 iev = &sock->readable_ev; 2575 2576 sock->references++; 2577 iev->ev_sender = sock; 2578 if (sock->type == isc_sockettype_fdwatch) 2579 iev->ev_action = internal_fdwatch_read; 2580 else 2581 iev->ev_action = internal_recv; 2582 iev->ev_arg = sock; 2583 2584 isc_task_send(sender, (isc_event_t **)&iev); 2585} 2586 2587static void 2588dispatch_send(isc_socket_t *sock) { 2589 intev_t *iev; 2590 isc_socketevent_t *ev; 2591 isc_task_t *sender; 2592 2593 INSIST(!sock->pending_send); 2594 2595 if (sock->type != isc_sockettype_fdwatch) { 2596 ev = ISC_LIST_HEAD(sock->send_list); 2597 if (ev == NULL) 2598 return; 2599 socket_log(sock, NULL, EVENT, NULL, 0, 0, 2600 "dispatch_send: event %p -> task %p", 2601 ev, ev->ev_sender); 2602 sender = ev->ev_sender; 2603 } else { 2604 sender = sock->fdwatchtask; 2605 } 2606 2607 sock->pending_send = 1; 2608 iev = &sock->writable_ev; 2609 2610 sock->references++; 2611 iev->ev_sender = sock; 2612 if (sock->type == isc_sockettype_fdwatch) 2613 iev->ev_action = internal_fdwatch_write; 2614 else 2615 iev->ev_action = 
internal_send; 2616 iev->ev_arg = sock; 2617 2618 isc_task_send(sender, (isc_event_t **)&iev); 2619} 2620 2621/* 2622 * Dispatch an internal accept event. 2623 */ 2624static void 2625dispatch_accept(isc_socket_t *sock) { 2626 intev_t *iev; 2627 isc_socket_newconnev_t *ev; 2628 2629 INSIST(!sock->pending_accept); 2630 2631 /* 2632 * Are there any done events left, or were they all canceled 2633 * before the manager got the socket lock? 2634 */ 2635 ev = ISC_LIST_HEAD(sock->accept_list); 2636 if (ev == NULL) 2637 return; 2638 2639 sock->pending_accept = 1; 2640 iev = &sock->readable_ev; 2641 2642 sock->references++; /* keep socket around for this internal event */ 2643 iev->ev_sender = sock; 2644 iev->ev_action = internal_accept; 2645 iev->ev_arg = sock; 2646 2647 isc_task_send(ev->ev_sender, (isc_event_t **)&iev); 2648} 2649 2650static void 2651dispatch_connect(isc_socket_t *sock) { 2652 intev_t *iev; 2653 isc_socket_connev_t *ev; 2654 2655 iev = &sock->writable_ev; 2656 2657 ev = sock->connect_ev; 2658 INSIST(ev != NULL); /* XXX */ 2659 2660 INSIST(sock->connecting); 2661 2662 sock->references++; /* keep socket around for this internal event */ 2663 iev->ev_sender = sock; 2664 iev->ev_action = internal_connect; 2665 iev->ev_arg = sock; 2666 2667 isc_task_send(ev->ev_sender, (isc_event_t **)&iev); 2668} 2669 2670/* 2671 * Dequeue an item off the given socket's read queue, set the result code 2672 * in the done event to the one provided, and send it to the task it was 2673 * destined for. 2674 * 2675 * If the event to be sent is on a list, remove it before sending. If 2676 * asked to, send and detach from the socket as well. 2677 * 2678 * Caller must have the socket locked if the event is attached to the socket. 
2679 */ 2680static void 2681send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) { 2682 isc_task_t *task; 2683 2684 task = (*dev)->ev_sender; 2685 2686 (*dev)->ev_sender = sock; 2687 2688 if (ISC_LINK_LINKED(*dev, ev_link)) 2689 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link); 2690 2691 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 2692 == ISC_SOCKEVENTATTR_ATTACHED) 2693 isc_task_sendanddetach(&task, (isc_event_t **)dev); 2694 else 2695 isc_task_send(task, (isc_event_t **)dev); 2696} 2697 2698/* 2699 * See comments for send_recvdone_event() above. 2700 * 2701 * Caller must have the socket locked if the event is attached to the socket. 2702 */ 2703static void 2704send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) { 2705 isc_task_t *task; 2706 2707 INSIST(dev != NULL && *dev != NULL); 2708 2709 task = (*dev)->ev_sender; 2710 (*dev)->ev_sender = sock; 2711 2712 if (ISC_LINK_LINKED(*dev, ev_link)) 2713 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link); 2714 2715 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 2716 == ISC_SOCKEVENTATTR_ATTACHED) 2717 isc_task_sendanddetach(&task, (isc_event_t **)dev); 2718 else 2719 isc_task_send(task, (isc_event_t **)dev); 2720} 2721 2722/* 2723 * Call accept() on a socket, to get the new file descriptor. The listen 2724 * socket is used as a prototype to create a new isc_socket_t. The new 2725 * socket has one outstanding reference. The task receiving the event 2726 * will be detached from just after the event is delivered. 2727 * 2728 * On entry to this function, the event delivered is the internal 2729 * readable event, and the first item on the accept_list should be 2730 * the done event we want to send. If the list is empty, this is a no-op, 2731 * so just unlock and return. 
2732 */ 2733static void 2734internal_accept(isc_task_t *me, isc_event_t *ev) { 2735 isc_socket_t *sock; 2736 isc_socketmgr_t *manager; 2737 isc_socket_newconnev_t *dev; 2738 isc_task_t *task; 2739 ISC_SOCKADDR_LEN_T addrlen; 2740 int fd; 2741 isc_result_t result = ISC_R_SUCCESS; 2742 char strbuf[ISC_STRERRORSIZE]; 2743 const char *err = "accept"; 2744 2745 UNUSED(me); 2746 2747 sock = ev->ev_sender; 2748 INSIST(VALID_SOCKET(sock)); 2749 2750 LOCK(&sock->lock); 2751 socket_log(sock, NULL, TRACE, 2752 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK, 2753 "internal_accept called, locked socket"); 2754 2755 manager = sock->manager; 2756 INSIST(VALID_MANAGER(manager)); 2757 2758 INSIST(sock->listener); 2759 INSIST(sock->pending_accept == 1); 2760 sock->pending_accept = 0; 2761 2762 INSIST(sock->references > 0); 2763 sock->references--; /* the internal event is done with this socket */ 2764 if (sock->references == 0) { 2765 UNLOCK(&sock->lock); 2766 destroy(&sock); 2767 return; 2768 } 2769 2770 /* 2771 * Get the first item off the accept list. 2772 * If it is empty, unlock the socket and return. 2773 */ 2774 dev = ISC_LIST_HEAD(sock->accept_list); 2775 if (dev == NULL) { 2776 UNLOCK(&sock->lock); 2777 return; 2778 } 2779 2780 /* 2781 * Try to accept the new connection. If the accept fails with 2782 * EAGAIN or EINTR, simply poke the watcher to watch this socket 2783 * again. Also ignore ECONNRESET, which has been reported to 2784 * be spuriously returned on Linux 2.2.19 although it is not 2785 * a documented error for accept(). ECONNABORTED has been 2786 * reported for Solaris 8. The rest are thrown in not because 2787 * we have seen them but because they are ignored by other 2788 * daemons such as BIND 8 and Apache. 
2789 */ 2790 2791 addrlen = sizeof(dev->newsocket->peer_address.type); 2792 memset(&dev->newsocket->peer_address.type, 0, addrlen); 2793 fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa, 2794 (void *)&addrlen); 2795 2796#ifdef F_DUPFD 2797 /* 2798 * Leave a space for stdio to work in. 2799 */ 2800 if (fd >= 0 && fd < 20) { 2801 int new, tmp; 2802 new = fcntl(fd, F_DUPFD, 20); 2803 tmp = errno; 2804 (void)close(fd); 2805 errno = tmp; 2806 fd = new; 2807 err = "accept/fcntl"; 2808 } 2809#endif 2810 2811 if (fd < 0) { 2812 if (SOFT_ERROR(errno)) 2813 goto soft_error; 2814 switch (errno) { 2815 case ENFILE: 2816 case EMFILE: 2817 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2818 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2819 isc_msgcat, ISC_MSGSET_SOCKET, 2820 ISC_MSG_TOOMANYFDS, 2821 "%s: too many open file descriptors", 2822 err); 2823 goto soft_error; 2824 2825 case ENOBUFS: 2826 case ENOMEM: 2827 case ECONNRESET: 2828 case ECONNABORTED: 2829 case EHOSTUNREACH: 2830 case EHOSTDOWN: 2831 case ENETUNREACH: 2832 case ENETDOWN: 2833 case ECONNREFUSED: 2834#ifdef EPROTO 2835 case EPROTO: 2836#endif 2837#ifdef ENONET 2838 case ENONET: 2839#endif 2840 goto soft_error; 2841 default: 2842 break; 2843 } 2844 isc__strerror(errno, strbuf, sizeof(strbuf)); 2845 UNEXPECTED_ERROR(__FILE__, __LINE__, 2846 "internal_accept: %s() %s: %s", err, 2847 isc_msgcat_get(isc_msgcat, 2848 ISC_MSGSET_GENERAL, 2849 ISC_MSG_FAILED, 2850 "failed"), 2851 strbuf); 2852 fd = -1; 2853 result = ISC_R_UNEXPECTED; 2854 } else { 2855 if (addrlen == 0U) { 2856 UNEXPECTED_ERROR(__FILE__, __LINE__, 2857 "internal_accept(): " 2858 "accept() failed to return " 2859 "remote address"); 2860 2861 (void)close(fd); 2862 goto soft_error; 2863 } else if (dev->newsocket->peer_address.type.sa.sa_family != 2864 sock->pf) 2865 { 2866 UNEXPECTED_ERROR(__FILE__, __LINE__, 2867 "internal_accept(): " 2868 "accept() returned peer address " 2869 "family %u (expected %u)", 2870 dev->newsocket->peer_address. 
2871 type.sa.sa_family, 2872 sock->pf); 2873 (void)close(fd); 2874 goto soft_error; 2875 } else if (fd >= (int)manager->maxsocks) { 2876 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2877 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2878 isc_msgcat, ISC_MSGSET_SOCKET, 2879 ISC_MSG_TOOMANYFDS, 2880 "accept: " 2881 "file descriptor exceeds limit (%d/%u)", 2882 fd, manager->maxsocks); 2883 (void)close(fd); 2884 goto soft_error; 2885 } 2886 } 2887 2888 if (fd != -1) { 2889 dev->newsocket->peer_address.length = addrlen; 2890 dev->newsocket->pf = sock->pf; 2891 } 2892 2893 /* 2894 * Pull off the done event. 2895 */ 2896 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); 2897 2898 /* 2899 * Poke watcher if there are more pending accepts. 2900 */ 2901 if (!ISC_LIST_EMPTY(sock->accept_list)) 2902 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 2903 2904 UNLOCK(&sock->lock); 2905 2906 if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) { 2907 (void)close(fd); 2908 fd = -1; 2909 result = ISC_R_UNEXPECTED; 2910 } 2911 2912 /* 2913 * -1 means the new socket didn't happen. 
2914 */ 2915 if (fd != -1) { 2916 int lockid = FDLOCK_ID(fd); 2917 2918 LOCK(&manager->fdlock[lockid]); 2919 manager->fds[fd] = dev->newsocket; 2920 manager->fdstate[fd] = MANAGED; 2921 UNLOCK(&manager->fdlock[lockid]); 2922 2923 LOCK(&manager->lock); 2924 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link); 2925 2926 dev->newsocket->fd = fd; 2927 dev->newsocket->bound = 1; 2928 dev->newsocket->connected = 1; 2929 2930 /* 2931 * Save away the remote address 2932 */ 2933 dev->address = dev->newsocket->peer_address; 2934 2935#ifdef USE_SELECT 2936 if (manager->maxfd < fd) 2937 manager->maxfd = fd; 2938#endif 2939 2940 socket_log(sock, &dev->newsocket->peer_address, CREATION, 2941 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, 2942 "accepted connection, new socket %p", 2943 dev->newsocket); 2944 2945 UNLOCK(&manager->lock); 2946 2947 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]); 2948 } else { 2949 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 2950 dev->newsocket->references--; 2951 free_socket(&dev->newsocket); 2952 } 2953 2954 /* 2955 * Fill in the done event details and send it off. 
2956 */ 2957 dev->result = result; 2958 task = dev->ev_sender; 2959 dev->ev_sender = sock; 2960 2961 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 2962 return; 2963 2964 soft_error: 2965 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 2966 UNLOCK(&sock->lock); 2967 2968 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 2969 return; 2970} 2971 2972static void 2973internal_recv(isc_task_t *me, isc_event_t *ev) { 2974 isc_socketevent_t *dev; 2975 isc_socket_t *sock; 2976 2977 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 2978 2979 sock = ev->ev_sender; 2980 INSIST(VALID_SOCKET(sock)); 2981 2982 LOCK(&sock->lock); 2983 socket_log(sock, NULL, IOEVENT, 2984 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 2985 "internal_recv: task %p got event %p", me, ev); 2986 2987 INSIST(sock->pending_recv == 1); 2988 sock->pending_recv = 0; 2989 2990 INSIST(sock->references > 0); 2991 sock->references--; /* the internal event is done with this socket */ 2992 if (sock->references == 0) { 2993 UNLOCK(&sock->lock); 2994 destroy(&sock); 2995 return; 2996 } 2997 2998 /* 2999 * Try to do as much I/O as possible on this socket. There are no 3000 * limits here, currently. 3001 */ 3002 dev = ISC_LIST_HEAD(sock->recv_list); 3003 while (dev != NULL) { 3004 switch (doio_recv(sock, dev)) { 3005 case DOIO_SOFT: 3006 goto poke; 3007 3008 case DOIO_EOF: 3009 /* 3010 * read of 0 means the remote end was closed. 3011 * Run through the event queue and dispatch all 3012 * the events with an EOF result code. 
3013 */ 3014 do { 3015 dev->result = ISC_R_EOF; 3016 send_recvdone_event(sock, &dev); 3017 dev = ISC_LIST_HEAD(sock->recv_list); 3018 } while (dev != NULL); 3019 goto poke; 3020 3021 case DOIO_SUCCESS: 3022 case DOIO_HARD: 3023 send_recvdone_event(sock, &dev); 3024 break; 3025 } 3026 3027 dev = ISC_LIST_HEAD(sock->recv_list); 3028 } 3029 3030 poke: 3031 if (!ISC_LIST_EMPTY(sock->recv_list)) 3032 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3033 3034 UNLOCK(&sock->lock); 3035} 3036 3037static void 3038internal_send(isc_task_t *me, isc_event_t *ev) { 3039 isc_socketevent_t *dev; 3040 isc_socket_t *sock; 3041 3042 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3043 3044 /* 3045 * Find out what socket this is and lock it. 3046 */ 3047 sock = (isc_socket_t *)ev->ev_sender; 3048 INSIST(VALID_SOCKET(sock)); 3049 3050 LOCK(&sock->lock); 3051 socket_log(sock, NULL, IOEVENT, 3052 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3053 "internal_send: task %p got event %p", me, ev); 3054 3055 INSIST(sock->pending_send == 1); 3056 sock->pending_send = 0; 3057 3058 INSIST(sock->references > 0); 3059 sock->references--; /* the internal event is done with this socket */ 3060 if (sock->references == 0) { 3061 UNLOCK(&sock->lock); 3062 destroy(&sock); 3063 return; 3064 } 3065 3066 /* 3067 * Try to do as much I/O as possible on this socket. There are no 3068 * limits here, currently. 
3069 */ 3070 dev = ISC_LIST_HEAD(sock->send_list); 3071 while (dev != NULL) { 3072 switch (doio_send(sock, dev)) { 3073 case DOIO_SOFT: 3074 goto poke; 3075 3076 case DOIO_HARD: 3077 case DOIO_SUCCESS: 3078 send_senddone_event(sock, &dev); 3079 break; 3080 } 3081 3082 dev = ISC_LIST_HEAD(sock->send_list); 3083 } 3084 3085 poke: 3086 if (!ISC_LIST_EMPTY(sock->send_list)) 3087 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3088 3089 UNLOCK(&sock->lock); 3090} 3091 3092static void 3093internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) { 3094 isc_socket_t *sock; 3095 int more_data; 3096 3097 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3098 3099 /* 3100 * Find out what socket this is and lock it. 3101 */ 3102 sock = (isc_socket_t *)ev->ev_sender; 3103 INSIST(VALID_SOCKET(sock)); 3104 3105 LOCK(&sock->lock); 3106 socket_log(sock, NULL, IOEVENT, 3107 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3108 "internal_fdwatch_write: task %p got event %p", me, ev); 3109 3110 INSIST(sock->pending_send == 1); 3111 3112 UNLOCK(&sock->lock); 3113 more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg); 3114 LOCK(&sock->lock); 3115 3116 sock->pending_send = 0; 3117 3118 INSIST(sock->references > 0); 3119 sock->references--; /* the internal event is done with this socket */ 3120 if (sock->references == 0) { 3121 UNLOCK(&sock->lock); 3122 destroy(&sock); 3123 return; 3124 } 3125 3126 if (more_data) 3127 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3128 3129 UNLOCK(&sock->lock); 3130} 3131 3132static void 3133internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) { 3134 isc_socket_t *sock; 3135 int more_data; 3136 3137 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 3138 3139 /* 3140 * Find out what socket this is and lock it. 
3141 */ 3142 sock = (isc_socket_t *)ev->ev_sender; 3143 INSIST(VALID_SOCKET(sock)); 3144 3145 LOCK(&sock->lock); 3146 socket_log(sock, NULL, IOEVENT, 3147 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3148 "internal_fdwatch_read: task %p got event %p", me, ev); 3149 3150 INSIST(sock->pending_recv == 1); 3151 3152 UNLOCK(&sock->lock); 3153 more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg); 3154 LOCK(&sock->lock); 3155 3156 sock->pending_recv = 0; 3157 3158 INSIST(sock->references > 0); 3159 sock->references--; /* the internal event is done with this socket */ 3160 if (sock->references == 0) { 3161 UNLOCK(&sock->lock); 3162 destroy(&sock); 3163 return; 3164 } 3165 3166 if (more_data) 3167 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3168 3169 UNLOCK(&sock->lock); 3170} 3171 3172/* 3173 * Process read/writes on each fd here. Avoid locking 3174 * and unlocking twice if both reads and writes are possible. 3175 */ 3176static void 3177process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable, 3178 isc_boolean_t writeable) 3179{ 3180 isc_socket_t *sock; 3181 isc_boolean_t unlock_sock; 3182 isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE; 3183 int lockid = FDLOCK_ID(fd); 3184 3185 /* 3186 * If the socket is going to be closed, don't do more I/O. 
3187 */ 3188 LOCK(&manager->fdlock[lockid]); 3189 if (manager->fdstate[fd] == CLOSE_PENDING) { 3190 UNLOCK(&manager->fdlock[lockid]); 3191 3192 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3193 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3194 return; 3195 } 3196 3197 sock = manager->fds[fd]; 3198 unlock_sock = ISC_FALSE; 3199 if (readable) { 3200 if (sock == NULL) { 3201 unwatch_read = ISC_TRUE; 3202 goto check_write; 3203 } 3204 unlock_sock = ISC_TRUE; 3205 LOCK(&sock->lock); 3206 if (!SOCK_DEAD(sock)) { 3207 if (sock->listener) 3208 dispatch_accept(sock); 3209 else 3210 dispatch_recv(sock); 3211 } 3212 unwatch_read = ISC_TRUE; 3213 } 3214check_write: 3215 if (writeable) { 3216 if (sock == NULL) { 3217 unwatch_write = ISC_TRUE; 3218 goto unlock_fd; 3219 } 3220 if (!unlock_sock) { 3221 unlock_sock = ISC_TRUE; 3222 LOCK(&sock->lock); 3223 } 3224 if (!SOCK_DEAD(sock)) { 3225 if (sock->connecting) 3226 dispatch_connect(sock); 3227 else 3228 dispatch_send(sock); 3229 } 3230 unwatch_write = ISC_TRUE; 3231 } 3232 if (unlock_sock) 3233 UNLOCK(&sock->lock); 3234 3235 unlock_fd: 3236 UNLOCK(&manager->fdlock[lockid]); 3237 if (unwatch_read) 3238 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3239 if (unwatch_write) 3240 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3241 3242} 3243 3244#ifdef USE_KQUEUE 3245static isc_boolean_t 3246process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) { 3247 int i; 3248 isc_boolean_t readable, writable; 3249 isc_boolean_t done = ISC_FALSE; 3250#ifdef ISC_PLATFORM_USETHREADS 3251 isc_boolean_t have_ctlevent = ISC_FALSE; 3252#endif 3253 3254 if (nevents == manager->nevents) { 3255 /* 3256 * This is not an error, but something unexpected. If this 3257 * happens, it may indicate the need for increasing 3258 * ISC_SOCKET_MAXEVENTS. 
3259 */ 3260 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3261 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3262 "maximum number of FD events (%d) received", 3263 nevents); 3264 } 3265 3266 for (i = 0; i < nevents; i++) { 3267 REQUIRE(events[i].ident < manager->maxsocks); 3268#ifdef ISC_PLATFORM_USETHREADS 3269 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) { 3270 have_ctlevent = ISC_TRUE; 3271 continue; 3272 } 3273#endif 3274 readable = ISC_TF(events[i].filter == EVFILT_READ); 3275 writable = ISC_TF(events[i].filter == EVFILT_WRITE); 3276 process_fd(manager, events[i].ident, readable, writable); 3277 } 3278 3279#ifdef ISC_PLATFORM_USETHREADS 3280 if (have_ctlevent) 3281 done = process_ctlfd(manager); 3282#endif 3283 3284 return (done); 3285} 3286#elif defined(USE_EPOLL) 3287static isc_boolean_t 3288process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) { 3289 int i; 3290 isc_boolean_t done = ISC_FALSE; 3291#ifdef ISC_PLATFORM_USETHREADS 3292 isc_boolean_t have_ctlevent = ISC_FALSE; 3293#endif 3294 3295 if (nevents == manager->nevents) { 3296 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3297 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3298 "maximum number of FD events (%d) received", 3299 nevents); 3300 } 3301 3302 for (i = 0; i < nevents; i++) { 3303 REQUIRE(events[i].data.fd < (int)manager->maxsocks); 3304#ifdef ISC_PLATFORM_USETHREADS 3305 if (events[i].data.fd == manager->pipe_fds[0]) { 3306 have_ctlevent = ISC_TRUE; 3307 continue; 3308 } 3309#endif 3310 if ((events[i].events & EPOLLERR) != 0 || 3311 (events[i].events & EPOLLHUP) != 0) { 3312 /* 3313 * epoll does not set IN/OUT bits on an erroneous 3314 * condition, so we need to try both anyway. This is a 3315 * bit inefficient, but should be okay for such rare 3316 * events. Note also that the read or write attempt 3317 * won't block because we use non-blocking sockets. 
3318 */ 3319 events[i].events |= (EPOLLIN | EPOLLOUT); 3320 } 3321 process_fd(manager, events[i].data.fd, 3322 (events[i].events & EPOLLIN) != 0, 3323 (events[i].events & EPOLLOUT) != 0); 3324 } 3325 3326#ifdef ISC_PLATFORM_USETHREADS 3327 if (have_ctlevent) 3328 done = process_ctlfd(manager); 3329#endif 3330 3331 return (done); 3332} 3333#elif defined(USE_DEVPOLL) 3334static isc_boolean_t 3335process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) { 3336 int i; 3337 isc_boolean_t done = ISC_FALSE; 3338#ifdef ISC_PLATFORM_USETHREADS 3339 isc_boolean_t have_ctlevent = ISC_FALSE; 3340#endif 3341 3342 if (nevents == manager->nevents) { 3343 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3344 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3345 "maximum number of FD events (%d) received", 3346 nevents); 3347 } 3348 3349 for (i = 0; i < nevents; i++) { 3350 REQUIRE(events[i].fd < (int)manager->maxsocks); 3351#ifdef ISC_PLATFORM_USETHREADS 3352 if (events[i].fd == manager->pipe_fds[0]) { 3353 have_ctlevent = ISC_TRUE; 3354 continue; 3355 } 3356#endif 3357 process_fd(manager, events[i].fd, 3358 (events[i].events & POLLIN) != 0, 3359 (events[i].events & POLLOUT) != 0); 3360 } 3361 3362#ifdef ISC_PLATFORM_USETHREADS 3363 if (have_ctlevent) 3364 done = process_ctlfd(manager); 3365#endif 3366 3367 return (done); 3368} 3369#elif defined(USE_SELECT) 3370static void 3371process_fds(isc_socketmgr_t *manager, int maxfd, 3372 fd_set *readfds, fd_set *writefds) 3373{ 3374 int i; 3375 3376 REQUIRE(maxfd <= (int)manager->maxsocks); 3377 3378 for (i = 0; i < maxfd; i++) { 3379#ifdef ISC_PLATFORM_USETHREADS 3380 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) 3381 continue; 3382#endif /* ISC_PLATFORM_USETHREADS */ 3383 process_fd(manager, i, FD_ISSET(i, readfds), 3384 FD_ISSET(i, writefds)); 3385 } 3386} 3387#endif 3388 3389#ifdef ISC_PLATFORM_USETHREADS 3390static isc_boolean_t 3391process_ctlfd(isc_socketmgr_t *manager) { 3392 int msg, fd; 3393 3394 for (;;) { 3395 
		select_readmsg(manager, &fd, &msg);

		manager_log(manager, IOEVENT,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WATCHERMSG,
					   "watcher got message %d "
					   "for socket %d"), msg, fd);

		/*
		 * Nothing to read?
		 */
		if (msg == SELECT_POKE_NOTHING)
			break;

		/*
		 * Handle shutdown message.  We really should
		 * jump out of this loop right away, but
		 * it doesn't matter if we have to do a little
		 * more work first.
		 */
		if (msg == SELECT_POKE_SHUTDOWN)
			return (ISC_TRUE);

		/*
		 * This is a wakeup on a socket.  Look
		 * at the event queue for both read and write,
		 * and decide if we need to watch on it now
		 * or not.
		 */
		wakeup_socket(manager, fd, msg);
	}

	return (ISC_FALSE);
}

/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, track down what thread gets to do
 * this I/O and post the event to it.
 */
static isc_threadresult_t
watcher(void *uap) {
	isc_socketmgr_t *manager = uap;
	isc_boolean_t done;
	int ctlfd;
	int cc;		/* event count (or -1) from the multiplex call */
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined (USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
#elif defined (USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
#endif
	char strbuf[ISC_STRERRORSIZE];
#ifdef ISC_SOCKET_USE_POLLWATCH
	/* Workaround state machine for a /dev/poll kernel bug; see below. */
	pollstate_t pollstate = poll_idle;
#endif

	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = manager->pipe_fds[0];
	done = ISC_FALSE;
	while (!done) {
		do {
#ifdef USE_KQUEUE
			cc = kevent(manager->kqueue_fd, NULL, 0,
				    manager->events, manager->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(manager->epoll_fd, manager->events,
					manager->nevents, -1);
#elif defined(USE_DEVPOLL)
			dvp.dp_fds = manager->events;
			dvp.dp_nfds = manager->nevents;
#ifndef ISC_SOCKET_USE_POLLWATCH
			dvp.dp_timeout = -1;
#else
			/*
			 * Use a finite timeout while in the workaround
			 * states so a lost wakeup can be detected.
			 */
			if (pollstate == poll_idle)
				dvp.dp_timeout = -1;
			else
				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
#endif	/* ISC_SOCKET_USE_POLLWATCH */
			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
#elif defined(USE_SELECT)
			/*
			 * select() mutates its fd_sets, so work on copies
			 * taken under the manager lock.
			 */
			LOCK(&manager->lock);
			memcpy(manager->read_fds_copy, manager->read_fds,
			       manager->fd_bufsize);
			memcpy(manager->write_fds_copy, manager->write_fds,
			       manager->fd_bufsize);
			maxfd = manager->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, manager->read_fds_copy,
				    manager->write_fds_copy, NULL, NULL);
#endif	/* USE_KQUEUE */

			/* Soft errors (e.g. EINTR) simply retry the wait. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "%s %s: %s", fnname,
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"), strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active)
					pollstate = poll_checking;
				else if (pollstate == poll_checking)
					pollstate = poll_idle;
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					manager_log(manager,
						    ISC_LOGCATEGORY_GENERAL,
						    ISC_LOGMODULE_SOCKET,
						    ISC_LOG_DEBUG(1),
						    "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
		done = process_fds(manager, manager->events, cc);
#elif defined(USE_SELECT)
		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy))
			done = process_ctlfd(manager);
#endif
	}

	manager_log(manager, TRACE, "%s",
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */

/*
 * Reserve 'reserved' file descriptors for use outside the socket
 * manager; only records the value here.
 */
void
isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {

	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}

/*
 * Create a new socket manager.
3566 */ 3567 3568static isc_result_t 3569setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { 3570 isc_result_t result; 3571#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 3572 char strbuf[ISC_STRERRORSIZE]; 3573#endif 3574 3575#ifdef USE_KQUEUE 3576 manager->nevents = ISC_SOCKET_MAXEVENTS; 3577 manager->events = isc_mem_get(mctx, sizeof(struct kevent) * 3578 manager->nevents); 3579 if (manager->events == NULL) 3580 return (ISC_R_NOMEMORY); 3581 manager->kqueue_fd = kqueue(); 3582 if (manager->kqueue_fd == -1) { 3583 result = isc__errno2result(errno); 3584 isc__strerror(errno, strbuf, sizeof(strbuf)); 3585 UNEXPECTED_ERROR(__FILE__, __LINE__, 3586 "kqueue %s: %s", 3587 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3588 ISC_MSG_FAILED, "failed"), 3589 strbuf); 3590 isc_mem_put(mctx, manager->events, 3591 sizeof(struct kevent) * manager->nevents); 3592 return (result); 3593 } 3594 3595#ifdef ISC_PLATFORM_USETHREADS 3596 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3597 if (result != ISC_R_SUCCESS) { 3598 close(manager->kqueue_fd); 3599 isc_mem_put(mctx, manager->events, 3600 sizeof(struct kevent) * manager->nevents); 3601 return (result); 3602 } 3603#endif /* ISC_PLATFORM_USETHREADS */ 3604#elif defined(USE_EPOLL) 3605 manager->nevents = ISC_SOCKET_MAXEVENTS; 3606 manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) * 3607 manager->nevents); 3608 if (manager->events == NULL) 3609 return (ISC_R_NOMEMORY); 3610 manager->epoll_fd = epoll_create(manager->nevents); 3611 if (manager->epoll_fd == -1) { 3612 result = isc__errno2result(errno); 3613 isc__strerror(errno, strbuf, sizeof(strbuf)); 3614 UNEXPECTED_ERROR(__FILE__, __LINE__, 3615 "epoll_create %s: %s", 3616 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3617 ISC_MSG_FAILED, "failed"), 3618 strbuf); 3619 isc_mem_put(mctx, manager->events, 3620 sizeof(struct epoll_event) * manager->nevents); 3621 return (result); 3622 } 3623#ifdef ISC_PLATFORM_USETHREADS 
3624 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3625 if (result != ISC_R_SUCCESS) { 3626 close(manager->epoll_fd); 3627 isc_mem_put(mctx, manager->events, 3628 sizeof(struct epoll_event) * manager->nevents); 3629 return (result); 3630 } 3631#endif /* ISC_PLATFORM_USETHREADS */ 3632#elif defined(USE_DEVPOLL) 3633 /* 3634 * XXXJT: /dev/poll seems to reject large numbers of events, 3635 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS. 3636 */ 3637 manager->nevents = ISC_SOCKET_MAXEVENTS; 3638 manager->events = isc_mem_get(mctx, sizeof(struct pollfd) * 3639 manager->nevents); 3640 if (manager->events == NULL) 3641 return (ISC_R_NOMEMORY); 3642 /* 3643 * Note: fdpollinfo should be able to support all possible FDs, so 3644 * it must have maxsocks entries (not nevents). 3645 */ 3646 manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) * 3647 manager->maxsocks); 3648 if (manager->fdpollinfo == NULL) { 3649 isc_mem_put(mctx, manager->events, 3650 sizeof(pollinfo_t) * manager->maxsocks); 3651 return (ISC_R_NOMEMORY); 3652 } 3653 memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks); 3654 manager->devpoll_fd = open("/dev/poll", O_RDWR); 3655 if (manager->devpoll_fd == -1) { 3656 result = isc__errno2result(errno); 3657 isc__strerror(errno, strbuf, sizeof(strbuf)); 3658 UNEXPECTED_ERROR(__FILE__, __LINE__, 3659 "open(/dev/poll) %s: %s", 3660 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 3661 ISC_MSG_FAILED, "failed"), 3662 strbuf); 3663 isc_mem_put(mctx, manager->events, 3664 sizeof(struct pollfd) * manager->nevents); 3665 isc_mem_put(mctx, manager->fdpollinfo, 3666 sizeof(pollinfo_t) * manager->maxsocks); 3667 return (result); 3668 } 3669#ifdef ISC_PLATFORM_USETHREADS 3670 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 3671 if (result != ISC_R_SUCCESS) { 3672 close(manager->devpoll_fd); 3673 isc_mem_put(mctx, manager->events, 3674 sizeof(struct pollfd) * manager->nevents); 3675 
		isc_mem_put(mctx, manager->fdpollinfo,
			    sizeof(pollinfo_t) * manager->maxsocks);
		return (result);
	}
#endif	/* ISC_PLATFORM_USETHREADS */
#elif defined(USE_SELECT)
	UNUSED(result);

#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
	/*
	 * Note: this code should also cover the case of MAXSOCKETS <=
	 * FD_SETSIZE, but we separate the cases to avoid possible portability
	 * issues regarding howmany() and the actual representation of fd_set.
	 */
	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
		sizeof(fd_mask);
#else
	manager->fd_bufsize = sizeof(fd_set);
#endif

	manager->read_fds = NULL;
	manager->read_fds_copy = NULL;
	manager->write_fds = NULL;
	manager->write_fds_copy = NULL;

	/*
	 * Allocate the four fd_set buffers; each allocation is attempted
	 * only if the previous one succeeded, and on any failure all of
	 * the earlier ones are released below.
	 */
	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds != NULL)
		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
	if (manager->write_fds != NULL) {
		manager->write_fds_copy = isc_mem_get(mctx,
						      manager->fd_bufsize);
	}
	if (manager->write_fds_copy == NULL) {
		if (manager->write_fds != NULL) {
			isc_mem_put(mctx, manager->write_fds,
				    manager->fd_bufsize);
		}
		if (manager->read_fds_copy != NULL) {
			isc_mem_put(mctx, manager->read_fds_copy,
				    manager->fd_bufsize);
		}
		if (manager->read_fds != NULL) {
			isc_mem_put(mctx, manager->read_fds,
				    manager->fd_bufsize);
		}
		return (ISC_R_NOMEMORY);
	}
	memset(manager->read_fds, 0, manager->fd_bufsize);
	memset(manager->write_fds, 0, manager->fd_bufsize);

#ifdef ISC_PLATFORM_USETHREADS
	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	manager->maxfd = manager->pipe_fds[0];
#else /* ISC_PLATFORM_USETHREADS */
	manager->maxfd = 0;
#endif /* ISC_PLATFORM_USETHREADS */
#endif	/* USE_KQUEUE */

	return (ISC_R_SUCCESS);
}

/*
 * Release everything acquired by setup_watcher(): unregister the
 * control pipe (threaded builds) and free the multiplexer descriptor
 * and event buffers for whichever backend was compiled in.
 */
static void
cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
#ifdef ISC_PLATFORM_USETHREADS
	isc_result_t result;

	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		/*
		 * NOTE(review): the message names epoll_ctl() even though
		 * unwatch_fd() may use another backend; the text predates
		 * the multi-backend code.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL) %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
	}
#endif	/* ISC_PLATFORM_USETHREADS */

#ifdef USE_KQUEUE
	close(manager->kqueue_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct kevent) * manager->nevents);
#elif defined(USE_EPOLL)
	close(manager->epoll_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct epoll_event) * manager->nevents);
#elif defined(USE_DEVPOLL)
	close(manager->devpoll_fd);
	isc_mem_put(mctx, manager->events,
		    sizeof(struct pollfd) * manager->nevents);
	isc_mem_put(mctx, manager->fdpollinfo,
		    sizeof(pollinfo_t) * manager->maxsocks);
#elif defined(USE_SELECT)
	if (manager->read_fds != NULL)
		isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
	if (manager->write_fds != NULL)
		isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
	if (manager->write_fds_copy != NULL)
		isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
#endif	/* USE_KQUEUE */
}

/*
 * Create a socket manager with the default maximum socket count.
 * Thin wrapper around isc_socketmgr_create2().
 */
isc_result_t
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	return (isc_socketmgr_create2(mctx, managerp, 0));
}

isc_result_t
isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		      unsigned int maxsocks)
{
	int i;
	isc_socketmgr_t *manager;
#ifdef ISC_PLATFORM_USETHREADS
	char strbuf[ISC_STRERRORSIZE];
#endif
	isc_result_t result;

	REQUIRE(managerp != NULL && *managerp == NULL);

#ifndef ISC_PLATFORM_USETHREADS
	/*
	 * Non-threaded builds share a single global manager; just take
	 * another reference if it already exists.
	 */
	if (socketmgr != NULL) {
		/* Don't allow maxsocks to be updated */
		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
			return (ISC_R_EXISTS);

		socketmgr->refs++;
		*managerp = socketmgr;
		return (ISC_R_SUCCESS);
	}
#endif /* ISC_PLATFORM_USETHREADS */

	if (maxsocks == 0)
		maxsocks = ISC_SOCKET_MAXSOCKETS;

	manager = isc_mem_get(mctx, sizeof(*manager));
	if (manager == NULL)
		return (ISC_R_NOMEMORY);

	/* zero-clear so that necessary cleanup on failure will be easy */
	memset(manager, 0, sizeof(*manager));
	manager->maxsocks = maxsocks;
	manager->reserved = 0;
	manager->fds = isc_mem_get(mctx,
				   manager->maxsocks * sizeof(isc_socket_t *));
	if (manager->fds == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
	if (manager->fdstate == NULL) {
		result = ISC_R_NOMEMORY;
		goto free_manager;
	}
	manager->stats = NULL;

	manager->magic = SOCKET_MANAGER_MAGIC;
	manager->mctx = NULL;
	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
	ISC_LIST_INIT(manager->socklist);
	result = isc_mutex_init(&manager->lock);
	if (result != ISC_R_SUCCESS)
		goto free_manager;
	/* Per-fd lock striping: FDLOCK_COUNT mutexes shared across fds. */
	manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
	if (manager->fdlock == NULL) {
		result = ISC_R_NOMEMORY;
		goto cleanup_lock;
	}
	for (i = 0; i < FDLOCK_COUNT; i++) {
		result = isc_mutex_init(&manager->fdlock[i]);
		if (result != ISC_R_SUCCESS) {
			/* Destroy only the locks initialized so far. */
			while (--i >= 0)
				DESTROYLOCK(&manager->fdlock[i]);
			isc_mem_put(mctx, manager->fdlock,
				    FDLOCK_COUNT * sizeof(isc_mutex_t));
			manager->fdlock = NULL;
			goto cleanup_lock;
		}
	}

#ifdef ISC_PLATFORM_USETHREADS
	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_condition_init() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		result = ISC_R_UNEXPECTED;
		goto cleanup_lock;
	}

	/*
	 * Create the special fds that will be used to wake up the
	 * select/poll loop when something internal needs to be done.
	 */
	if (pipe(manager->pipe_fds) != 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "pipe() %s: %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"),
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto cleanup_condition;
	}

	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
#if 0
	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
#endif
#else /* ISC_PLATFORM_USETHREADS */
	manager->refs = 1;
#endif /* ISC_PLATFORM_USETHREADS */

	/*
	 * Set up initial state for the select loop
	 */
	result = setup_watcher(mctx, manager);
	if (result != ISC_R_SUCCESS)
		goto cleanup;
	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
#ifdef ISC_PLATFORM_USETHREADS
	/*
	 * Start up the select/poll thread.
	 */
	if (isc_thread_create(watcher, manager, &manager->watcher) !=
	    ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_create() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		cleanup_watcher(mctx, manager);
		result = ISC_R_UNEXPECTED;
		goto cleanup;
	}
#endif /* ISC_PLATFORM_USETHREADS */
	isc_mem_attach(mctx, &manager->mctx);

#ifndef ISC_PLATFORM_USETHREADS
	socketmgr = manager;
#endif /* ISC_PLATFORM_USETHREADS */
	*managerp = manager;

	return (ISC_R_SUCCESS);

	/*
	 * Error-exit ladder: each label undoes one acquisition stage and
	 * falls through to the earlier stages below it.
	 */
cleanup:
#ifdef ISC_PLATFORM_USETHREADS
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
#endif /* ISC_PLATFORM_USETHREADS */

#ifdef ISC_PLATFORM_USETHREADS
cleanup_condition:
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */


cleanup_lock:
	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
	}
	DESTROYLOCK(&manager->lock);

free_manager:
	if (manager->fdlock != NULL) {
		isc_mem_put(mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	if (manager->fdstate != NULL) {
		isc_mem_put(mctx, manager->fdstate,
			    manager->maxsocks * sizeof(int));
	}
	if (manager->fds != NULL) {
		isc_mem_put(mctx, manager->fds,
			    manager->maxsocks * sizeof(isc_socket_t *));
	}
	isc_mem_put(mctx, manager, sizeof(*manager));

	return (result);
}

/*
 * Report the maximum number of sockets this manager supports.
 */
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}

/*
 * Attach a statistics counter set to the manager.  Must be called
 * before any sockets are created and only once per manager.
 */
void
isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}

void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

#ifndef ISC_PLATFORM_USETHREADS
	/* Shared global manager: just drop a reference if others remain. */
	if (manager->refs > 1) {
		manager->refs--;
		*managerp = NULL;
		return;
	}
#endif /* ISC_PLATFORM_USETHREADS */

	LOCK(&manager->lock);

#ifdef ISC_PLATFORM_USETHREADS
	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
	}
#else /* ISC_PLATFORM_USETHREADS */
	/*
	 * Hope all sockets have been destroyed.
	 */
	if (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		INSIST(0);
	}
#endif /* ISC_PLATFORM_USETHREADS */

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);

#ifdef ISC_PLATFORM_USETHREADS
	/*
	 * Wait for thread to exit.
	 */
	if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
#endif /* ISC_PLATFORM_USETHREADS */

	/*
	 * Clean up.
	 */
	cleanup_watcher(manager->mctx, manager);

#ifdef ISC_PLATFORM_USETHREADS
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */

	/* Close any descriptors whose close was deferred to the watcher. */
	for (i = 0; i < (int)manager->maxsocks; i++)
		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
			(void)close(i);

	isc_mem_put(manager->mctx, manager->fds,
		    manager->maxsocks * sizeof(isc_socket_t *));
	isc_mem_put(manager->mctx, manager->fdstate,
		    manager->maxsocks * sizeof(int));

	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);

	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
		isc_mem_put(manager->mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	DESTROYLOCK(&manager->lock);
	manager->magic = 0;
	/* Keep a local mctx handle: the manager itself is about to go. */
	mctx = manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;
}

/*
 * Common receive path for all the public recv entry points.  Attempts
 * an immediate read (UDP always; TCP only when no reads are queued);
 * on a soft failure the event is queued and the watcher is poked to
 * watch the fd.  'flags' may contain ISC_SOCKFLAG_IMMEDIATE, in which
 * case completion events are not posted for data handled here.
 */
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		/* Preserve ordering: don't read ahead of queued requests. */
		if (ISC_LIST_EMPTY(sock->recv_list))
			io_state = doio_recv(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = ISC_TRUE;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "socket_recv: event %p -> task %p",
			   dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
			result = ISC_R_INPROGRESS;
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Completed (or failed hard): post the done event unless
		 * the caller asked for immediate-mode handling. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_recvdone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}

/*
 * Scatter receive: read into a list of buffers.  The buffers are moved
 * from 'buflist' onto the event and returned to the caller via the
 * RECVDONE event posted to 'task'.
 */
isc_result_t
isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 unsigned int minimum, isc_task_t *task,
		 isc_taskaction_t action, const void *arg)
{
	isc_socketevent_t *dev;
	isc_socketmgr_t *manager;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	iocount = isc_bufferlist_availablecount(buflist);
	REQUIRE(iocount > 0);

	INSIST(sock->bound);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}

	/*
	 * UDP sockets are always partial read
	 */
	if (sock->type == isc_sockettype_udp)
		dev->minimum = 1;
	else {
		if (minimum == 0)
			dev->minimum = iocount;
		else
			dev->minimum = minimum;
	}

	/*
	 * Move each buffer from the passed in list to our internal one.
	 */
	buffer = ISC_LIST_HEAD(*buflist);
	while (buffer != NULL) {
		ISC_LIST_DEQUEUE(*buflist, buffer, link);
		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
		buffer = ISC_LIST_HEAD(*buflist);
	}

	return (socket_recv(sock, dev, task, 0));
}

/*
 * Receive into a single region; allocates the completion event and
 * delegates to isc_socket_recv2().
 */
isc_result_t
isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
		isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	isc_socketevent_t *dev;
	isc_socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
	if (dev == NULL)
		return (ISC_R_NOMEMORY);

	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
}

/*
 * Receive into a single region using a caller-supplied event, allowing
 * the event to be reused across calls.
 */
isc_result_t
isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
		 isc_socketevent_t *event, unsigned int flags)
{
	event->ev_sender = sock;
	event->result = ISC_R_UNEXPECTED;
	ISC_LIST_INIT(event->bufferlist);
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes = 0;

	/*
	 * UDP sockets are always partial read.
	 */
	if (sock->type == isc_sockettype_udp)
		event->minimum = 1;
	else {
		if (minimum == 0)
			event->minimum = region->length;
		else
			event->minimum = minimum;
	}

	return (socket_recv(sock, event, task, flags));
}

/*
 * Common send path for all the public send entry points.  Attempts an
 * immediate write (UDP always; TCP only when no writes are queued); on
 * a soft failure the event is queued unless ISC_SOCKFLAG_NORETRY is
 * set.  'pktinfo', when non-NULL, supplies the IPv6 source/interface
 * information to use for the datagram.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address)) {
			socket_log(sock, NULL, TRACE, isc_msgcat,
				   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)", pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp)
		io_state = doio_send(sock, dev);
	else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		/* Preserve ordering: don't write ahead of queued requests. */
		if (ISC_LIST_EMPTY(sock->send_list))
			io_state = doio_send(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = ISC_TRUE;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			if (ISC_LIST_EMPTY(sock->send_list) &&
			    !sock->pending_send)
				select_poke(sock->manager, sock->fd,
					    SELECT_POKE_WRITE);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

			socket_log(sock, NULL, EVENT, NULL, 0, 0,
				   "socket_send: event %p -> task %p",
				   dev, ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
				result = ISC_R_INPROGRESS;
			break;
		}
		/*
		 * FALLTHROUGH: with NORETRY set, a soft failure is treated
		 * as completed and the senddone event is posted below.
		 */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_senddone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}

isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region,
		isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	/*
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
				  NULL));
}

/*
 * Send a single region, optionally to an explicit destination address
 * and with explicit IPv6 packet info.
 */
isc_result_t
isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
		  isc_task_t *task, isc_taskaction_t action, const void *arg,
		  isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
{
	isc_socketevent_t *dev;
	isc_socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(region != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}

	dev->region = *region;

	return (socket_send(sock, dev, task, address, pktinfo, 0));
}

isc_result_t
isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 isc_task_t *task, isc_taskaction_t action, const void *arg)
{
	/* REQUIRE() checking is performed in isc_socket_sendtov(). */
	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
				   NULL));
}

/*
 * Gather send: write the used regions of a list of buffers.  The
 * buffers are moved from 'buflist' onto the event and returned via the
 * SENDDONE event posted to 'task'.
 */
isc_result_t
isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
		   isc_task_t *task, isc_taskaction_t action, const void *arg,
		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
{
	isc_socketevent_t *dev;
	isc_socketmgr_t *manager;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	iocount = isc_bufferlist_usedcount(buflist);
	REQUIRE(iocount > 0);

	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}

	/*
	 * Move each buffer from the passed in list to our internal one.
	 */
	buffer = ISC_LIST_HEAD(*buflist);
	while (buffer != NULL) {
		ISC_LIST_DEQUEUE(*buflist, buffer, link);
		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
		buffer = ISC_LIST_HEAD(*buflist);
	}

	return (socket_send(sock, dev, task, address, pktinfo, 0));
}

/*
 * Send a single region using a caller-supplied, reusable event.
 * ISC_SOCKFLAG_NORETRY (UDP only) drops the datagram instead of
 * queueing on a soft failure.
 */
isc_result_t
isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
		   isc_task_t *task,
		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		   isc_socketevent_t *event, unsigned int flags)
{
	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
		REQUIRE(sock->type == isc_sockettype_udp);
	event->ev_sender = sock;
	event->result = ISC_R_UNEXPECTED;
	ISC_LIST_INIT(event->bufferlist);
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes = 0;

	return (socket_send(sock, event, task, address, pktinfo, flags));
}

void
isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
	/*
	 * Remove a stale UNIX-domain socket file.  With 'active' set the
	 * path is unlinked unconditionally (after checking it really is a
	 * socket/fifo); otherwise a connect() probe is made first and the
	 * file is unlinked only if no live server answers.
	 */
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX)
		return;

#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
#endif
#endif

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
#endif
#endif

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif

	if (active) {
		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
		/* Refuse to unlink anything that isn't a socket or fifo. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/* Passive case: probe with a connect() before unlinking. */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:	/* We exited cleanly last time */
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
		goto cleanup;
	}

	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: %s: not a socket",
			      sockaddr->type.sunix.sun_path);
		goto cleanup;
	}

	if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0) {
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			/* Nobody is listening: the file is stale, remove it. */
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
					      ISC_LOGMODULE_SOCKET,
					      ISC_LOG_WARNING,
					      "isc_socket_cleanunix: "
					      "unlink(%s): %s",
					      sockaddr->type.sunix.sun_path,
					      strbuf);
			}
			break;
		default:
			isc__strerror(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
 cleanup:
	close(s);
#else
	UNUSED(sockaddr);
	UNUSED(active);
#endif
}

/*
 * Set the permissions and ownership of a UNIX-domain socket file (or,
 * with NEED_SECURE_DIRECTORY, of its containing directory).  Returns
 * ISC_R_FAILURE if either chmod() or chown() fails, after logging.
 */
isc_result_t
isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		    isc_uint32_t owner, isc_uint32_t group)
{
#ifdef ISC_PLATFORM_HAVESYSUNH
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	char path[sizeof(sockaddr->type.sunix.sun_path)];
#ifdef NEED_SECURE_DIRECTORY
	char *slash;
#endif

	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
	strcpy(path, sockaddr->type.sunix.sun_path);

#ifdef NEED_SECURE_DIRECTORY
	/* Operate on the parent directory rather than the socket itself. */
	slash = strrchr(path, '/');
	if (slash != NULL) {
		if (slash != path)
			*slash = '\0';
		else
			strcpy(path, "/");
	} else
		strcpy(path, ".");
#endif

	if (chmod(path, perm) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chmod(%s, %d): %s",
			      path, perm, strbuf);
		result = ISC_R_FAILURE;
	}
	if (chown(path, owner, group) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chown(%s, %d, %d): %s",
			      path, owner, group,
			      strbuf);
		result = ISC_R_FAILURE;
	}
	return (result);
#else
	UNUSED(sockaddr);
	UNUSED(perm);
	UNUSED(owner);
	UNUSED(group);
	return (ISC_R_NOTIMPLEMENTED);
#endif
}

isc_result_t
isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
		unsigned int options) {
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	LOCK(&sock->lock);

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}
	/*
	 * Only set SO_REUSEADDR when we want a specific port.
4659 */ 4660#ifdef AF_UNIX 4661 if (sock->pf == AF_UNIX) 4662 goto bind_socket; 4663#endif 4664 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 && 4665 isc_sockaddr_getport(sockaddr) != (in_port_t)0 && 4666 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, 4667 sizeof(on)) < 0) { 4668 UNEXPECTED_ERROR(__FILE__, __LINE__, 4669 "setsockopt(%d) %s", sock->fd, 4670 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4671 ISC_MSG_FAILED, "failed")); 4672 /* Press on... */ 4673 } 4674#ifdef AF_UNIX 4675 bind_socket: 4676#endif 4677 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) { 4678 inc_stats(sock->manager->stats, 4679 sock->statsindex[STATID_BINDFAIL]); 4680 4681 UNLOCK(&sock->lock); 4682 switch (errno) { 4683 case EACCES: 4684 return (ISC_R_NOPERM); 4685 case EADDRNOTAVAIL: 4686 return (ISC_R_ADDRNOTAVAIL); 4687 case EADDRINUSE: 4688 return (ISC_R_ADDRINUSE); 4689 case EINVAL: 4690 return (ISC_R_BOUND); 4691 default: 4692 isc__strerror(errno, strbuf, sizeof(strbuf)); 4693 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s", 4694 strbuf); 4695 return (ISC_R_UNEXPECTED); 4696 } 4697 } 4698 4699 socket_log(sock, sockaddr, TRACE, 4700 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound"); 4701 sock->bound = 1; 4702 4703 UNLOCK(&sock->lock); 4704 return (ISC_R_SUCCESS); 4705} 4706 4707isc_result_t 4708isc_socket_filter(isc_socket_t *sock, const char *filter) { 4709#ifdef SO_ACCEPTFILTER 4710 char strbuf[ISC_STRERRORSIZE]; 4711 struct accept_filter_arg afa; 4712#else 4713 UNUSED(sock); 4714 UNUSED(filter); 4715#endif 4716 4717 REQUIRE(VALID_SOCKET(sock)); 4718 4719#ifdef SO_ACCEPTFILTER 4720 bzero(&afa, sizeof(afa)); 4721 strncpy(afa.af_name, filter, sizeof(afa.af_name)); 4722 if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, 4723 &afa, sizeof(afa)) == -1) { 4724 isc__strerror(errno, strbuf, sizeof(strbuf)); 4725 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 4726 ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s", 4727 strbuf); 4728 
return (ISC_R_FAILURE); 4729 } 4730 return (ISC_R_SUCCESS); 4731#else 4732 return (ISC_R_NOTIMPLEMENTED); 4733#endif 4734} 4735 4736/* 4737 * Set up to listen on a given socket. We do this by creating an internal 4738 * event that will be dispatched when the socket has read activity. The 4739 * watcher will send the internal event to the task when there is a new 4740 * connection. 4741 * 4742 * Unlike in read, we don't preallocate a done event here. Every time there 4743 * is a new connection we'll have to allocate a new one anyway, so we might 4744 * as well keep things simple rather than having to track them. 4745 */ 4746isc_result_t 4747isc_socket_listen(isc_socket_t *sock, unsigned int backlog) { 4748 char strbuf[ISC_STRERRORSIZE]; 4749 4750 REQUIRE(VALID_SOCKET(sock)); 4751 4752 LOCK(&sock->lock); 4753 4754 REQUIRE(!sock->listener); 4755 REQUIRE(sock->bound); 4756 REQUIRE(sock->type == isc_sockettype_tcp || 4757 sock->type == isc_sockettype_unix); 4758 4759 if (backlog == 0) 4760 backlog = SOMAXCONN; 4761 4762 if (listen(sock->fd, (int)backlog) < 0) { 4763 UNLOCK(&sock->lock); 4764 isc__strerror(errno, strbuf, sizeof(strbuf)); 4765 4766 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf); 4767 4768 return (ISC_R_UNEXPECTED); 4769 } 4770 4771 sock->listener = 1; 4772 4773 UNLOCK(&sock->lock); 4774 return (ISC_R_SUCCESS); 4775} 4776 4777/* 4778 * This should try to do aggressive accept() XXXMLG 4779 */ 4780isc_result_t 4781isc_socket_accept(isc_socket_t *sock, 4782 isc_task_t *task, isc_taskaction_t action, const void *arg) 4783{ 4784 isc_socket_newconnev_t *dev; 4785 isc_socketmgr_t *manager; 4786 isc_task_t *ntask = NULL; 4787 isc_socket_t *nsock; 4788 isc_result_t result; 4789 isc_boolean_t do_poke = ISC_FALSE; 4790 4791 REQUIRE(VALID_SOCKET(sock)); 4792 manager = sock->manager; 4793 REQUIRE(VALID_MANAGER(manager)); 4794 4795 LOCK(&sock->lock); 4796 4797 REQUIRE(sock->listener); 4798 4799 /* 4800 * Sender field is overloaded here with the task we will 
be sending 4801 * this event to. Just before the actual event is delivered the 4802 * actual ev_sender will be touched up to be the socket. 4803 */ 4804 dev = (isc_socket_newconnev_t *) 4805 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN, 4806 action, arg, sizeof(*dev)); 4807 if (dev == NULL) { 4808 UNLOCK(&sock->lock); 4809 return (ISC_R_NOMEMORY); 4810 } 4811 ISC_LINK_INIT(dev, ev_link); 4812 4813 result = allocate_socket(manager, sock->type, &nsock); 4814 if (result != ISC_R_SUCCESS) { 4815 isc_event_free(ISC_EVENT_PTR(&dev)); 4816 UNLOCK(&sock->lock); 4817 return (result); 4818 } 4819 4820 /* 4821 * Attach to socket and to task. 4822 */ 4823 isc_task_attach(task, &ntask); 4824 nsock->references++; 4825 nsock->statsindex = sock->statsindex; 4826 4827 dev->ev_sender = ntask; 4828 dev->newsocket = nsock; 4829 4830 /* 4831 * Poke watcher here. We still have the socket locked, so there 4832 * is no race condition. We will keep the lock for such a short 4833 * bit of time waking it up now or later won't matter all that much. 
4834 */ 4835 if (ISC_LIST_EMPTY(sock->accept_list)) 4836 do_poke = ISC_TRUE; 4837 4838 ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link); 4839 4840 if (do_poke) 4841 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT); 4842 4843 UNLOCK(&sock->lock); 4844 return (ISC_R_SUCCESS); 4845} 4846 4847isc_result_t 4848isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, 4849 isc_task_t *task, isc_taskaction_t action, const void *arg) 4850{ 4851 isc_socket_connev_t *dev; 4852 isc_task_t *ntask = NULL; 4853 isc_socketmgr_t *manager; 4854 int cc; 4855 char strbuf[ISC_STRERRORSIZE]; 4856 4857 REQUIRE(VALID_SOCKET(sock)); 4858 REQUIRE(addr != NULL); 4859 REQUIRE(task != NULL); 4860 REQUIRE(action != NULL); 4861 4862 manager = sock->manager; 4863 REQUIRE(VALID_MANAGER(manager)); 4864 REQUIRE(addr != NULL); 4865 4866 if (isc_sockaddr_ismulticast(addr)) 4867 return (ISC_R_MULTICAST); 4868 4869 LOCK(&sock->lock); 4870 4871 REQUIRE(!sock->connecting); 4872 4873 dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock, 4874 ISC_SOCKEVENT_CONNECT, 4875 action, arg, 4876 sizeof(*dev)); 4877 if (dev == NULL) { 4878 UNLOCK(&sock->lock); 4879 return (ISC_R_NOMEMORY); 4880 } 4881 ISC_LINK_INIT(dev, ev_link); 4882 4883 /* 4884 * Try to do the connect right away, as there can be only one 4885 * outstanding, and it might happen to complete. 4886 */ 4887 sock->peer_address = *addr; 4888 cc = connect(sock->fd, &addr->type.sa, addr->length); 4889 if (cc < 0) { 4890 /* 4891 * HP-UX "fails" to connect a UDP socket and sets errno to 4892 * EINPROGRESS if it's non-blocking. We'd rather regard this as 4893 * a success and let the user detect it if it's really an error 4894 * at the time of sending a packet on the socket. 
4895 */ 4896 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { 4897 cc = 0; 4898 goto success; 4899 } 4900 if (SOFT_ERROR(errno) || errno == EINPROGRESS) 4901 goto queue; 4902 4903 switch (errno) { 4904#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit; 4905 ERROR_MATCH(EACCES, ISC_R_NOPERM); 4906 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 4907 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 4908 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 4909 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 4910#ifdef EHOSTDOWN 4911 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 4912#endif 4913 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 4914 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 4915 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 4916 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 4917 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 4918#undef ERROR_MATCH 4919 } 4920 4921 sock->connected = 0; 4922 4923 isc__strerror(errno, strbuf, sizeof(strbuf)); 4924 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf); 4925 4926 UNLOCK(&sock->lock); 4927 inc_stats(sock->manager->stats, 4928 sock->statsindex[STATID_CONNECTFAIL]); 4929 isc_event_free(ISC_EVENT_PTR(&dev)); 4930 return (ISC_R_UNEXPECTED); 4931 4932 err_exit: 4933 sock->connected = 0; 4934 isc_task_send(task, ISC_EVENT_PTR(&dev)); 4935 4936 UNLOCK(&sock->lock); 4937 inc_stats(sock->manager->stats, 4938 sock->statsindex[STATID_CONNECTFAIL]); 4939 return (ISC_R_SUCCESS); 4940 } 4941 4942 /* 4943 * If connect completed, fire off the done event. 4944 */ 4945 success: 4946 if (cc == 0) { 4947 sock->connected = 1; 4948 sock->bound = 1; 4949 dev->result = ISC_R_SUCCESS; 4950 isc_task_send(task, ISC_EVENT_PTR(&dev)); 4951 4952 UNLOCK(&sock->lock); 4953 4954 inc_stats(sock->manager->stats, 4955 sock->statsindex[STATID_CONNECT]); 4956 4957 return (ISC_R_SUCCESS); 4958 } 4959 4960 queue: 4961 4962 /* 4963 * Attach to task. 
4964 */ 4965 isc_task_attach(task, &ntask); 4966 4967 sock->connecting = 1; 4968 4969 dev->ev_sender = ntask; 4970 4971 /* 4972 * Poke watcher here. We still have the socket locked, so there 4973 * is no race condition. We will keep the lock for such a short 4974 * bit of time waking it up now or later won't matter all that much. 4975 */ 4976 if (sock->connect_ev == NULL) 4977 select_poke(manager, sock->fd, SELECT_POKE_CONNECT); 4978 4979 sock->connect_ev = dev; 4980 4981 UNLOCK(&sock->lock); 4982 return (ISC_R_SUCCESS); 4983} 4984 4985/* 4986 * Called when a socket with a pending connect() finishes. 4987 */ 4988static void 4989internal_connect(isc_task_t *me, isc_event_t *ev) { 4990 isc_socket_t *sock; 4991 isc_socket_connev_t *dev; 4992 isc_task_t *task; 4993 int cc; 4994 ISC_SOCKADDR_LEN_T optlen; 4995 char strbuf[ISC_STRERRORSIZE]; 4996 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 4997 4998 UNUSED(me); 4999 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 5000 5001 sock = ev->ev_sender; 5002 INSIST(VALID_SOCKET(sock)); 5003 5004 LOCK(&sock->lock); 5005 5006 /* 5007 * When the internal event was sent the reference count was bumped 5008 * to keep the socket around for us. Decrement the count here. 5009 */ 5010 INSIST(sock->references > 0); 5011 sock->references--; 5012 if (sock->references == 0) { 5013 UNLOCK(&sock->lock); 5014 destroy(&sock); 5015 return; 5016 } 5017 5018 /* 5019 * Has this event been canceled? 5020 */ 5021 dev = sock->connect_ev; 5022 if (dev == NULL) { 5023 INSIST(!sock->connecting); 5024 UNLOCK(&sock->lock); 5025 return; 5026 } 5027 5028 INSIST(sock->connecting); 5029 sock->connecting = 0; 5030 5031 /* 5032 * Get any possible error status here. 5033 */ 5034 optlen = sizeof(cc); 5035 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, 5036 (void *)&cc, (void *)&optlen) < 0) 5037 cc = errno; 5038 else 5039 errno = cc; 5040 5041 if (errno != 0) { 5042 /* 5043 * If the error is EAGAIN, just re-select on this 5044 * fd and pretend nothing strange happened. 
5045 */ 5046 if (SOFT_ERROR(errno) || errno == EINPROGRESS) { 5047 sock->connecting = 1; 5048 select_poke(sock->manager, sock->fd, 5049 SELECT_POKE_CONNECT); 5050 UNLOCK(&sock->lock); 5051 5052 return; 5053 } 5054 5055 inc_stats(sock->manager->stats, 5056 sock->statsindex[STATID_CONNECTFAIL]); 5057 5058 /* 5059 * Translate other errors into ISC_R_* flavors. 5060 */ 5061 switch (errno) { 5062#define ERROR_MATCH(a, b) case a: dev->result = b; break; 5063 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5064 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5065 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5066 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5067 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5068#ifdef EHOSTDOWN 5069 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5070#endif 5071 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5072 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5073 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5074 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5075 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT); 5076 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5077#undef ERROR_MATCH 5078 default: 5079 dev->result = ISC_R_UNEXPECTED; 5080 isc_sockaddr_format(&sock->peer_address, peerbuf, 5081 sizeof(peerbuf)); 5082 isc__strerror(errno, strbuf, sizeof(strbuf)); 5083 UNEXPECTED_ERROR(__FILE__, __LINE__, 5084 "internal_connect: connect(%s) %s", 5085 peerbuf, strbuf); 5086 } 5087 } else { 5088 inc_stats(sock->manager->stats, 5089 sock->statsindex[STATID_CONNECT]); 5090 dev->result = ISC_R_SUCCESS; 5091 sock->connected = 1; 5092 sock->bound = 1; 5093 } 5094 5095 sock->connect_ev = NULL; 5096 5097 UNLOCK(&sock->lock); 5098 5099 task = dev->ev_sender; 5100 dev->ev_sender = sock; 5101 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 5102} 5103 5104isc_result_t 5105isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) { 5106 isc_result_t result; 5107 5108 REQUIRE(VALID_SOCKET(sock)); 5109 REQUIRE(addressp != NULL); 5110 5111 LOCK(&sock->lock); 5112 5113 if 
(sock->connected) {
		/* Peer address was recorded when the connect completed. */
		*addressp = sock->peer_address;
		result = ISC_R_SUCCESS;
	} else {
		result = ISC_R_NOTCONNECTED;
	}

	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Fetch the local address the socket is bound to via getsockname(2).
 * Returns ISC_R_NOTBOUND if the socket has not been bound yet, and
 * ISC_R_UNEXPECTED if the system call itself fails.  On success,
 * *addressp holds the local sockaddr and its length.
 */
isc_result_t
isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
	ISC_SOCKADDR_LEN_T len;
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (!sock->bound) {
		result = ISC_R_NOTBOUND;
		goto out;
	}

	result = ISC_R_SUCCESS;

	/* 'len' is value-result: in = buffer size, out = actual length. */
	len = sizeof(addressp->type);
	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
				 strbuf);
		result = ISC_R_UNEXPECTED;
		goto out;
	}
	addressp->length = (unsigned int)len;

 out:
	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0)
		return;

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
5186 */ 5187 if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) 5188 && !ISC_LIST_EMPTY(sock->recv_list)) { 5189 isc_socketevent_t *dev; 5190 isc_socketevent_t *next; 5191 isc_task_t *current_task; 5192 5193 dev = ISC_LIST_HEAD(sock->recv_list); 5194 5195 while (dev != NULL) { 5196 current_task = dev->ev_sender; 5197 next = ISC_LIST_NEXT(dev, ev_link); 5198 5199 if ((task == NULL) || (task == current_task)) { 5200 dev->result = ISC_R_CANCELED; 5201 send_recvdone_event(sock, &dev); 5202 } 5203 dev = next; 5204 } 5205 } 5206 5207 if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) 5208 && !ISC_LIST_EMPTY(sock->send_list)) { 5209 isc_socketevent_t *dev; 5210 isc_socketevent_t *next; 5211 isc_task_t *current_task; 5212 5213 dev = ISC_LIST_HEAD(sock->send_list); 5214 5215 while (dev != NULL) { 5216 current_task = dev->ev_sender; 5217 next = ISC_LIST_NEXT(dev, ev_link); 5218 5219 if ((task == NULL) || (task == current_task)) { 5220 dev->result = ISC_R_CANCELED; 5221 send_senddone_event(sock, &dev); 5222 } 5223 dev = next; 5224 } 5225 } 5226 5227 if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT) 5228 && !ISC_LIST_EMPTY(sock->accept_list)) { 5229 isc_socket_newconnev_t *dev; 5230 isc_socket_newconnev_t *next; 5231 isc_task_t *current_task; 5232 5233 dev = ISC_LIST_HEAD(sock->accept_list); 5234 while (dev != NULL) { 5235 current_task = dev->ev_sender; 5236 next = ISC_LIST_NEXT(dev, ev_link); 5237 5238 if ((task == NULL) || (task == current_task)) { 5239 5240 ISC_LIST_UNLINK(sock->accept_list, dev, 5241 ev_link); 5242 5243 dev->newsocket->references--; 5244 free_socket(&dev->newsocket); 5245 5246 dev->result = ISC_R_CANCELED; 5247 dev->ev_sender = sock; 5248 isc_task_sendanddetach(¤t_task, 5249 ISC_EVENT_PTR(&dev)); 5250 } 5251 5252 dev = next; 5253 } 5254 } 5255 5256 /* 5257 * Connecting is not a list. 
5258 */ 5259 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT) 5260 && sock->connect_ev != NULL) { 5261 isc_socket_connev_t *dev; 5262 isc_task_t *current_task; 5263 5264 INSIST(sock->connecting); 5265 sock->connecting = 0; 5266 5267 dev = sock->connect_ev; 5268 current_task = dev->ev_sender; 5269 5270 if ((task == NULL) || (task == current_task)) { 5271 sock->connect_ev = NULL; 5272 5273 dev->result = ISC_R_CANCELED; 5274 dev->ev_sender = sock; 5275 isc_task_sendanddetach(¤t_task, 5276 ISC_EVENT_PTR(&dev)); 5277 } 5278 } 5279 5280 UNLOCK(&sock->lock); 5281} 5282 5283isc_sockettype_t 5284isc_socket_gettype(isc_socket_t *sock) { 5285 REQUIRE(VALID_SOCKET(sock)); 5286 5287 return (sock->type); 5288} 5289 5290isc_boolean_t 5291isc_socket_isbound(isc_socket_t *sock) { 5292 isc_boolean_t val; 5293 5294 LOCK(&sock->lock); 5295 val = ((sock->bound) ? ISC_TRUE : ISC_FALSE); 5296 UNLOCK(&sock->lock); 5297 5298 return (val); 5299} 5300 5301void 5302isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) { 5303#if defined(IPV6_V6ONLY) 5304 int onoff = yes ? 1 : 0; 5305#else 5306 UNUSED(yes); 5307 UNUSED(sock); 5308#endif 5309 5310 REQUIRE(VALID_SOCKET(sock)); 5311 5312#ifdef IPV6_V6ONLY 5313 if (sock->pf == AF_INET6) { 5314 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, 5315 (void *)&onoff, sizeof(int)) < 0) { 5316 char strbuf[ISC_STRERRORSIZE]; 5317 5318 UNEXPECTED_ERROR(__FILE__, __LINE__, 5319 "setsockopt(%d, IPV6_V6ONLY) " 5320 "%s: %s", sock->fd, 5321 isc_msgcat_get(isc_msgcat, 5322 ISC_MSGSET_GENERAL, 5323 ISC_MSG_FAILED, 5324 "failed"), 5325 strbuf); 5326 } 5327 } 5328 FIX_IPV6_RECVPKTINFO(sock); /* AIX */ 5329#endif 5330} 5331 5332#ifndef ISC_PLATFORM_USETHREADS 5333/* In our assumed scenario, we can simply use a single static object. 
*/ 5334static isc_socketwait_t swait_private; 5335 5336int 5337isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) { 5338 int n; 5339#ifdef USE_KQUEUE 5340 struct timespec ts, *tsp; 5341#endif 5342#ifdef USE_EPOLL 5343 int timeout; 5344#endif 5345#ifdef USE_DEVPOLL 5346 struct dvpoll dvp; 5347#endif 5348 5349 REQUIRE(swaitp != NULL && *swaitp == NULL); 5350 5351 if (socketmgr == NULL) 5352 return (0); 5353 5354#ifdef USE_KQUEUE 5355 if (tvp != NULL) { 5356 ts.tv_sec = tvp->tv_sec; 5357 ts.tv_nsec = tvp->tv_usec * 1000; 5358 tsp = &ts; 5359 } else 5360 tsp = NULL; 5361 swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0, 5362 socketmgr->events, socketmgr->nevents, 5363 tsp); 5364 n = swait_private.nevents; 5365#elif defined(USE_EPOLL) 5366 if (tvp != NULL) 5367 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000; 5368 else 5369 timeout = -1; 5370 swait_private.nevents = epoll_wait(socketmgr->epoll_fd, 5371 socketmgr->events, 5372 socketmgr->nevents, timeout); 5373 n = swait_private.nevents; 5374#elif defined(USE_DEVPOLL) 5375 dvp.dp_fds = socketmgr->events; 5376 dvp.dp_nfds = socketmgr->nevents; 5377 if (tvp != NULL) { 5378 dvp.dp_timeout = tvp->tv_sec * 1000 + 5379 (tvp->tv_usec + 999) / 1000; 5380 } else 5381 dvp.dp_timeout = -1; 5382 swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp); 5383 n = swait_private.nevents; 5384#elif defined(USE_SELECT) 5385 memcpy(socketmgr->read_fds_copy, socketmgr->read_fds, 5386 socketmgr->fd_bufsize); 5387 memcpy(socketmgr->write_fds_copy, socketmgr->write_fds, 5388 socketmgr->fd_bufsize); 5389 5390 swait_private.readset = socketmgr->read_fds_copy; 5391 swait_private.writeset = socketmgr->write_fds_copy; 5392 swait_private.maxfd = socketmgr->maxfd + 1; 5393 5394 n = select(swait_private.maxfd, swait_private.readset, 5395 swait_private.writeset, NULL, tvp); 5396#endif 5397 5398 *swaitp = &swait_private; 5399 return (n); 5400} 5401 5402isc_result_t 
5403isc__socketmgr_dispatch(isc_socketwait_t *swait) { 5404 REQUIRE(swait == &swait_private); 5405 5406 if (socketmgr == NULL) 5407 return (ISC_R_NOTFOUND); 5408 5409#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 5410 (void)process_fds(socketmgr, socketmgr->events, swait->nevents); 5411 return (ISC_R_SUCCESS); 5412#elif defined(USE_SELECT) 5413 process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset); 5414 return (ISC_R_SUCCESS); 5415#endif 5416} 5417#endif /* ISC_PLATFORM_USETHREADS */ 5418 5419void 5420isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) { 5421 5422 /* 5423 * Name 'socket'. 5424 */ 5425 5426 REQUIRE(VALID_SOCKET(socket)); 5427 5428 LOCK(&socket->lock); 5429 memset(socket->name, 0, sizeof(socket->name)); 5430 strncpy(socket->name, name, sizeof(socket->name) - 1); 5431 socket->tag = tag; 5432 UNLOCK(&socket->lock); 5433} 5434 5435const char * 5436isc_socket_getname(isc_socket_t *socket) { 5437 return (socket->name); 5438} 5439 5440void * 5441isc_socket_gettag(isc_socket_t *socket) { 5442 return (socket->tag); 5443} 5444 5445#ifdef HAVE_LIBXML2 5446 5447static const char * 5448_socktype(isc_sockettype_t type) 5449{ 5450 if (type == isc_sockettype_udp) 5451 return ("udp"); 5452 else if (type == isc_sockettype_tcp) 5453 return ("tcp"); 5454 else if (type == isc_sockettype_unix) 5455 return ("unix"); 5456 else if (type == isc_sockettype_fdwatch) 5457 return ("fdwatch"); 5458 else 5459 return ("not-initialized"); 5460} 5461 5462void 5463isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer) 5464{ 5465 isc_socket_t *sock; 5466 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 5467 isc_sockaddr_t addr; 5468 ISC_SOCKADDR_LEN_T len; 5469 5470 LOCK(&mgr->lock); 5471 5472#ifndef ISC_PLATFORM_USETHREADS 5473 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references"); 5474 xmlTextWriterWriteFormatString(writer, "%d", mgr->refs); 5475 xmlTextWriterEndElement(writer); 5476#endif 5477 5478 
xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"); 5479 sock = ISC_LIST_HEAD(mgr->socklist); 5480 while (sock != NULL) { 5481 LOCK(&sock->lock); 5482 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"); 5483 5484 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"); 5485 xmlTextWriterWriteFormatString(writer, "%p", sock); 5486 xmlTextWriterEndElement(writer); 5487 5488 if (sock->name[0] != 0) { 5489 xmlTextWriterStartElement(writer, ISC_XMLCHAR "name"); 5490 xmlTextWriterWriteFormatString(writer, "%s", 5491 sock->name); 5492 xmlTextWriterEndElement(writer); /* name */ 5493 } 5494 5495 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references"); 5496 xmlTextWriterWriteFormatString(writer, "%d", sock->references); 5497 xmlTextWriterEndElement(writer); 5498 5499 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type", 5500 ISC_XMLCHAR _socktype(sock->type)); 5501 5502 if (sock->connected) { 5503 isc_sockaddr_format(&sock->peer_address, peerbuf, 5504 sizeof(peerbuf)); 5505 xmlTextWriterWriteElement(writer, 5506 ISC_XMLCHAR "peer-address", 5507 ISC_XMLCHAR peerbuf); 5508 } 5509 5510 len = sizeof(addr); 5511 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) { 5512 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf)); 5513 xmlTextWriterWriteElement(writer, 5514 ISC_XMLCHAR "local-address", 5515 ISC_XMLCHAR peerbuf); 5516 } 5517 5518 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"); 5519 if (sock->pending_recv) 5520 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5521 ISC_XMLCHAR "pending-receive"); 5522 if (sock->pending_send) 5523 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5524 ISC_XMLCHAR "pending-send"); 5525 if (sock->pending_accept) 5526 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5527 ISC_XMLCHAR "pending_accept"); 5528 if (sock->listener) 5529 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5530 ISC_XMLCHAR "listener"); 5531 if (sock->connected) 5532 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 
5533 ISC_XMLCHAR "connected"); 5534 if (sock->connecting) 5535 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5536 ISC_XMLCHAR "connecting"); 5537 if (sock->bound) 5538 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state", 5539 ISC_XMLCHAR "bound"); 5540 5541 xmlTextWriterEndElement(writer); /* states */ 5542 5543 xmlTextWriterEndElement(writer); /* socket */ 5544 5545 UNLOCK(&sock->lock); 5546 sock = ISC_LIST_NEXT(sock, link); 5547 } 5548 xmlTextWriterEndElement(writer); /* sockets */ 5549 5550 UNLOCK(&mgr->lock); 5551} 5552#endif /* HAVE_LIBXML2 */ 5553