/*
 * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2017, SAP SE and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * This file contains implementations of the NET_... functions. The NET_...
 * functions are wrappers for common file and socket functions plus provisions
 * for non-blocking IO.
 *
 * (Basically, for every file descriptor the layer remembers all threads
 * currently blocked on it; all of those threads can be woken up by sending
 * them a signal. This is done e.g. when the fd is closed.)
 *
 * This was originally copied from the linux_close.c implementation.
 *
 * Side note: this code needs initialization. Under Linux this is done
 * automatically via __attribute((constructor)); on AIX it is done manually
 * (see aix_close_init).
 */

/*
 AIX needs a workaround for I/O cancellation, see:
 http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm
 ...
 The close subroutine is blocked until all subroutines which use the file
 descriptor return to usr space. For example, when a thread is calling close
 and another thread is calling select with the same file descriptor, the
 close subroutine does not return until the select call returns.
 ...
*/

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>
#include <poll.h>
#include "jvm.h"
#include "net_util.h"

/*
 * Stack allocated by thread when doing blocking operation
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread */
    struct threadEntry *next;           /* next thread */
    int intr;                           /* interrupted */
} threadEntry_t;

/*
 * Heap allocated during initialization - one entry per fd
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock */
    threadEntry_t *threads;             /* threads blocked on fd */
} fdEntry_t;

/*
 * Signal to unblock thread
 */
static int sigWakeup = (SIGRTMAX - 1);

/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for small value file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for large value file descriptors are kept in an overflow
 * table, which is organized as a sparse two dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */

/* Base table for low value file descriptors */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries) */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system. */
static int fdLimit = 0;

/* Overflow table, should the base table not be large enough. Organized as
 * an array of n slabs, each holding 64k entries.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;
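
/*
 * Illustrative example of the two-level lookup (values derived from the
 * constants above; see getFdEntry() below for the authoritative logic):
 *
 *   fd = 100      -> base table:     &fdTable[100]
 *   fd = 0x11000  -> overflow table: index = 0x11000 - fdTableMaxSize = 0x10000
 *                                    slab  = 0x10000 / fdOverflowTableSlabSize = 1
 *                                    slot  = 0x10000 % fdOverflowTableSlabSize = 0
 *                                    -> &fdOverflowTable[1][0]
 */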

/*
 * Null signal handler
 */
static void sig_wakeup(int sig) {
}

/*
 * Initialization routine (executed when library is loaded).
 * Allocates the fd tables and sets up the signal handler.
 *
 * On AIX we don't have __attribute((constructor)), so we need to initialize
 * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c').
 */
void aix_close_init() {
    struct rlimit nbr_files;
    sigset_t sigset;
    struct sigaction sa;
    int i = 0;

    /* Determine the maximum number of possible file descriptors. */
    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
        fprintf(stderr, "library initialization failed - "
                "unable to get max # of allocated fds\n");
        abort();
    }
    if (nbr_files.rlim_max != RLIM_INFINITY) {
        fdLimit = nbr_files.rlim_max;
    } else {
        /* We just do not know. */
        fdLimit = INT_MAX;
    }

    /* Allocate table for low value file descriptors. */
    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
    if (fdTable == NULL) {
        fprintf(stderr, "library initialization failed - "
                "unable to allocate file descriptor table - out of memory\n");
        abort();
    } else {
        for (i = 0; i < fdTableLen; i ++) {
            pthread_mutex_init(&fdTable[i].lock, NULL);
        }
    }

    /* Allocate overflow table, if needed */
    if (fdLimit > fdTableMaxSize) {
        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
        if (fdOverflowTable == NULL) {
            fprintf(stderr, "library initialization failed - "
                    "unable to allocate file descriptor overflow table - out of memory\n");
            abort();
        }
    }

    /*
     * Setup the signal handler
     */
    sa.sa_handler = sig_wakeup;
    sa.sa_flags = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(sigWakeup, &sa, NULL);

    sigemptyset(&sigset);
    sigaddset(&sigset, sigWakeup);
    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
}
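
/*
 * Sketch of the expected hookup, assuming the call site mentioned in the
 * comment above (src/share/native/java/net/net_util.c); illustrative only:
 *
 *   JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
 *       aix_close_init();   // allocate fd tables, install wakeup signal handler
 *       ...
 *   }
 */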

/*
 * Return the fd entry for the given fd.
 */
static inline fdEntry_t *getFdEntry(int fd)
{
    fdEntry_t* result = NULL;

    if (fd < 0) {
        return NULL;
    }

    /* This should not happen. If it does, our assumption about
     * max. fd value was wrong. */
    assert(fd < fdLimit);

    if (fd < fdTableMaxSize) {
        /* fd is in base table. */
        assert(fd < fdTableLen);
        result = &fdTable[fd];
    } else {
        /* fd is in overflow table. */
        const int indexInOverflowTable = fd - fdTableMaxSize;
        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
        fdEntry_t* slab = NULL;
        assert(rootindex < fdOverflowTableLen);
        assert(slabindex < fdOverflowTableSlabSize);
        pthread_mutex_lock(&fdOverflowTableLock);
        /* Allocate new slab in overflow table if needed */
        if (fdOverflowTable[rootindex] == NULL) {
            fdEntry_t* const newSlab =
                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
            if (newSlab == NULL) {
                fprintf(stderr, "Unable to allocate file descriptor overflow"
                        " table slab - out of memory\n");
                pthread_mutex_unlock(&fdOverflowTableLock);
                abort();
            } else {
                int i;
                for (i = 0; i < fdOverflowTableSlabSize; i ++) {
                    pthread_mutex_init(&newSlab[i].lock, NULL);
                }
                fdOverflowTable[rootindex] = newSlab;
            }
        }
        pthread_mutex_unlock(&fdOverflowTableLock);
        slab = fdOverflowTable[rootindex];
        result = &slab[slabindex];
    }

    return result;
}

/*
 * Start a blocking operation :-
 * Insert thread onto thread list for the fd.
 */
static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    self->thr = pthread_self();
    self->intr = 0;

    pthread_mutex_lock(&(fdEntry->lock));
    {
        self->next = fdEntry->threads;
        fdEntry->threads = self;
    }
    pthread_mutex_unlock(&(fdEntry->lock));
}

/*
 * End a blocking operation :-
 * Remove thread from thread list for the fd.
 * If fd has been interrupted then set errno to EBADF.
 */
static inline void endOp
    (fdEntry_t *fdEntry, threadEntry_t *self)
{
    int orig_errno = errno;
    pthread_mutex_lock(&(fdEntry->lock));
    {
        threadEntry_t *curr, *prev = NULL;
        curr = fdEntry->threads;
        while (curr != NULL) {
            if (curr == self) {
                if (curr->intr) {
                    orig_errno = EBADF;
                }
                if (prev == NULL) {
                    fdEntry->threads = curr->next;
                } else {
                    prev->next = curr->next;
                }
                break;
            }
            prev = curr;
            curr = curr->next;
        }
    }
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;
}

/*
 * Close or dup2 a file descriptor ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if operation fails.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold-off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /* On fast machines we see that we enter dup2 before the
         * accepting thread has had a chance to get and process the signal.
         * So in case we woke a thread up, give it some time to cope.
         * Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
        int num_woken = 0;

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill( curr->thr, sigWakeup );
            num_woken ++;
            curr = curr->next;
        }

        if (num_woken > 0) {
            usleep(num_woken * 50);
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);
    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

/*
 * Wrapper for dup2 - same semantics as the dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }
    return closefd(fd, fd2);
}

/*
 * Wrapper for close - same semantics as the close system call
 * except that any threads blocked in an I/O on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    return closefd(-1, fd);
}
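
/*
 * Illustrative interaction between closefd() and a blocked reader (assumed
 * scenario, not taken from a specific caller):
 *
 *   Thread A: NET_Read(fd, ...)       - registers itself via startOp() and
 *                                        blocks in recv()
 *   Thread B: NET_SocketClose(fd)     - closefd() marks A's threadEntry as
 *                                        interrupted and sends sigWakeup
 *   Thread A: recv() fails with EINTR - endOp() rewrites errno to EBADF, so
 *                                        NET_Read() returns -1/EBADF instead
 *                                        of restarting
 */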

/************** Basic I/O operations here ***************/

/*
 * Macro to perform a blocking IO operation. Restarts
 * automatically if interrupted by signal (other than
 * our wakeup signal)
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
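
/*
 * For illustration, NET_Read(s, buf, len) below therefore expands to roughly:
 *
 *   int ret;
 *   threadEntry_t self;
 *   fdEntry_t *fdEntry = getFdEntry(s);
 *   if (fdEntry == NULL) {
 *       errno = EBADF;
 *       return -1;
 *   }
 *   do {
 *       startOp(fdEntry, &self);
 *       ret = recv(s, buf, len, 0);
 *       endOp(fdEntry, &self);
 *   } while (ret == -1 && errno == EINTR);
 *   return ret;
 *
 * An interruption by closefd() arrives as EINTR, but endOp() rewrites errno
 * to EBADF, so the loop exits instead of restarting the operation.
 */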

int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, MSG_NONBLOCK) );
}

int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}

int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
                 struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}

int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}

int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}

int NET_SendTo(int s, const void *msg, int len, unsigned int flags,
               const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}

int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}

int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    int crc = -1, prc = -1;
    threadEntry_t self;
    fdEntry_t* fdEntry = getFdEntry(s);

    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /* On AIX, when the system call connect() is interrupted, the connection
     * is not aborted and it will be established asynchronously by the kernel.
     * Hence, there is no need to restart connect() when EINTR is received.
     */
    startOp(fdEntry, &self);
    crc = connect(s, addr, addrlen);
    endOp(fdEntry, &self);

    if (crc == -1 && errno == EINTR) {
        struct pollfd s_pollfd;
        int sockopt_arg = 0;
        socklen_t len;

        s_pollfd.fd = s;
        s_pollfd.events = POLLOUT | POLLERR;

        /* poll the file descriptor */
        do {
            startOp(fdEntry, &self);
            prc = poll(&s_pollfd, 1, -1);
            endOp(fdEntry, &self);
        } while (prc == -1 && errno == EINTR);

        if (prc < 0)
            return prc;

        len = sizeof(sockopt_arg);

        /* Check whether the connection has been established */
        if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1)
            return -1;

        if (sockopt_arg != 0) {
            errno = sockopt_arg;
            return -1;
        }
    } else {
        return crc;
    }

    /* At this point, fd is connected. Set successful return code. */
    return 0;
}

int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}

/*
 * Wrapper for poll(s, timeout).
 * Auto restarts with adjusted timeout if interrupted by
 * a signal other than our wakeup signal.
 */
int NET_Timeout(JNIEnv *env, int s, long timeout, jlong nanoTimeStamp) {
    jlong prevNanoTime = nanoTimeStamp;
    jlong nanoTimeout = (jlong) timeout * NET_NSEC_PER_MSEC;
    fdEntry_t *fdEntry = getFdEntry(s);

    /*
     * Check that fd hasn't been closed.
     */
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    for (;;) {
        struct pollfd pfd;
        int rv;
        threadEntry_t self;

        /*
         * Poll the fd. If interrupted by our wakeup signal
         * errno will be set to EBADF.
         */
        pfd.fd = s;
        pfd.events = POLLIN | POLLERR;

        startOp(fdEntry, &self);
        rv = poll(&pfd, 1, nanoTimeout / NET_NSEC_PER_MSEC);
        endOp(fdEntry, &self);

        /*
         * If interrupted then adjust timeout. If timeout
         * has expired return 0 (indicating timeout expired).
         */
        if (rv < 0 && errno == EINTR) {
            jlong newNanoTime = JVM_NanoTime(env, 0);
            nanoTimeout -= newNanoTime - prevNanoTime;
            if (nanoTimeout < NET_NSEC_PER_MSEC) {
                return 0;
            }
            prevNanoTime = newNanoTime;
        } else {
            return rv;
        }
    }
}
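
/*
 * Illustrative caller pattern for NET_Timeout (hypothetical snippet, not a
 * prescribed API; variable names are assumed, actual callers live in the
 * native socket implementation):
 *
 *   jlong start = JVM_NanoTime(env, 0);
 *   int rv = NET_Timeout(env, fd, timeout_millis, start);
 *   if (rv == 0) {
 *       // timeout expired
 *   } else if (rv < 0) {
 *       // error; errno is EBADF if the fd was closed concurrently
 *   }
 */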