/*
 * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2017, SAP SE and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * This file contains implementations of the NET_... functions. The NET_...
 * functions are wrappers for common file and socket functions plus
 * provisions for non-blocking IO.
 *
 * (Basically, the wrapper layer remembers all threads currently blocked on
 *  a particular fd; all threads waiting on a certain fd can be woken up by
 *  sending them a signal. This is done e.g. when the fd is closed.)
 *
 * This was originally copied from the linux_close.c implementation.
 *
 * Side note: this code needs initialization. Under Linux this is done
 * automatically via __attribute((constructor)); on AIX it is done manually
 * (see aix_close_init).
 */

/*
   AIX needs a workaround for I/O cancellation, see:
   http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm
   ...
   The close subroutine is blocked until all subroutines which use the file
   descriptor return to usr space. For example, when a thread is calling close
   and another thread is calling select with the same file descriptor, the
   close subroutine does not return until the select call returns.
   ...
*/

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>
#include <poll.h>
#include "jvm.h"
#include "net_util.h"

/*
 * Stack allocated by a thread when doing a blocking operation
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread */
    struct threadEntry *next;           /* next thread */
    int intr;                           /* interrupted */
} threadEntry_t;

/*
 * Heap allocated during initialization - one entry per fd
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock */
    threadEntry_t *threads;             /* threads blocked on fd */
} fdEntry_t;

/*
 * Signal to unblock thread
 */
static int sigWakeup = (SIGRTMAX - 1);

/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for small value file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for large value file descriptors are kept in an overflow
 * table, which is organized as a sparse two dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */
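
/*
 * For example, with the sizes defined below (fdTableMaxSize = 0x1000,
 * fdOverflowTableSlabSize = 0x10000): fd 1000 lives directly in
 * fdTable[1000], while fd 70000 maps to overflow index
 * 70000 - 4096 = 65904, i.e. slab 65904 / 65536 = 1, offset
 * 65904 % 65536 = 368.
 */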

/* Base table for low value file descriptors */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries) */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system. */
static int fdLimit = 0;

/* Overflow table, should the base table not be large enough. Organized as
 * an array of n slabs, each holding 64k entries.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
/* Lock guarding lazy slab allocation in the overflow table */
static pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Null signal handler. The body is intentionally empty: its only purpose
 * is to make a blocking system call return with EINTR.
 */
static void sig_wakeup(int sig) {
}

/*
 * Initialization routine (executed when the library is loaded).
 * Allocates the fd tables and sets up the signal handler.
 *
 * On AIX we don't have __attribute((constructor)), so we need to initialize
 * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c')
 */
void aix_close_init() {
    struct rlimit nbr_files;
    sigset_t sigset;
    struct sigaction sa;
    int i = 0;

    /* Determine the maximum number of possible file descriptors. */
    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
        fprintf(stderr, "library initialization failed - "
                "unable to get max # of allocated fds\n");
        abort();
    }
    if (nbr_files.rlim_max != RLIM_INFINITY) {
        fdLimit = nbr_files.rlim_max;
    } else {
        /* We just do not know. */
        fdLimit = INT_MAX;
    }

    /* Allocate the table for low value file descriptors. */
    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
    if (fdTable == NULL) {
        fprintf(stderr, "library initialization failed - "
                "unable to allocate file descriptor table - out of memory\n");
        abort();
    } else {
        for (i = 0; i < fdTableLen; i++) {
            pthread_mutex_init(&fdTable[i].lock, NULL);
        }
    }

    /* Allocate the overflow table, if needed */
    if (fdLimit > fdTableMaxSize) {
        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
        if (fdOverflowTable == NULL) {
            fprintf(stderr, "library initialization failed - "
                    "unable to allocate file descriptor overflow table - out of memory\n");
            abort();
        }
    }

    /*
     * Set up the signal handler
     */
    sa.sa_handler = sig_wakeup;
    sa.sa_flags   = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(sigWakeup, &sa, NULL);

    sigemptyset(&sigset);
    sigaddset(&sigset, sigWakeup);
    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
}
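
/*
 * Illustrative call site (a sketch only - the actual JNI_OnLoad in
 * net_util.c may look different):
 *
 *   JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
 *   #if defined(_AIX)
 *       aix_close_init();
 *   #endif
 *       ...
 *   }
 */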

/*
 * Return the fd table entry for this fd.
 */
static inline fdEntry_t *getFdEntry(int fd)
{
    fdEntry_t* result = NULL;

    if (fd < 0) {
        return NULL;
    }

    /* This should not happen. If it does, our assumption about the
     * max. fd value was wrong. */
    assert(fd < fdLimit);

    if (fd < fdTableMaxSize) {
        /* fd is in the base table. */
        assert(fd < fdTableLen);
        result = &fdTable[fd];
    } else {
        /* fd is in the overflow table. */
        const int indexInOverflowTable = fd - fdTableMaxSize;
        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
        fdEntry_t* slab = NULL;
        assert(rootindex < fdOverflowTableLen);
        assert(slabindex < fdOverflowTableSlabSize);
        pthread_mutex_lock(&fdOverflowTableLock);
        /* Allocate a new slab in the overflow table if needed */
        if (fdOverflowTable[rootindex] == NULL) {
            fdEntry_t* const newSlab =
                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
            if (newSlab == NULL) {
                fprintf(stderr, "Unable to allocate file descriptor overflow"
                        " table slab - out of memory\n");
                pthread_mutex_unlock(&fdOverflowTableLock);
                abort();
            } else {
                int i;
                for (i = 0; i < fdOverflowTableSlabSize; i++) {
                    pthread_mutex_init(&newSlab[i].lock, NULL);
                }
                fdOverflowTable[rootindex] = newSlab;
            }
        }
        pthread_mutex_unlock(&fdOverflowTableLock);
        slab = fdOverflowTable[rootindex];
        result = &slab[slabindex];
    }

    return result;
}

/*
 * Start a blocking operation :-
 *    Insert thread onto thread list for the fd.
 */
static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    self->thr = pthread_self();
    self->intr = 0;

    pthread_mutex_lock(&(fdEntry->lock));
    {
        self->next = fdEntry->threads;
        fdEntry->threads = self;
    }
    pthread_mutex_unlock(&(fdEntry->lock));
}

264
265/*
266 * End a blocking operation :-
267 *     Remove thread from thread list for the fd
268 *     If fd has been interrupted then set errno to EBADF
269 */
270static inline void endOp
271    (fdEntry_t *fdEntry, threadEntry_t *self)
272{
273    int orig_errno = errno;
274    pthread_mutex_lock(&(fdEntry->lock));
275    {
276        threadEntry_t *curr, *prev=NULL;
277        curr = fdEntry->threads;
278        while (curr != NULL) {
279            if (curr == self) {
280                if (curr->intr) {
281                    orig_errno = EBADF;
282                }
283                if (prev == NULL) {
284                    fdEntry->threads = curr->next;
285                } else {
286                    prev->next = curr->next;
287                }
288                break;
289            }
290            prev = curr;
291            curr = curr->next;
292        }
293    }
294    pthread_mutex_unlock(&(fdEntry->lock));
295    errno = orig_errno;
296}

/*
 * Close or dup2 a file descriptor, ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if the operation fails.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /* On fast machines we see that we enter dup2 before the
         * accepting thread had a chance to get and process the signal.
         * So in case we woke a thread up, give it some time to cope.
         * Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
        int num_woken = 0;

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill(curr->thr, sigWakeup);
            num_woken++;
            curr = curr->next;
        }

        if (num_woken > 0) {
            usleep(num_woken * 50);
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by a signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);
    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

/*
 * Wrapper for dup2 - same semantics as the dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }
    return closefd(fd, fd2);
}

/*
 * Wrapper for close - same semantics as the close system call
 * except that any threads blocked in an I/O on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    return closefd(-1, fd);
}
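
/*
 * Example interaction (for illustration): thread A blocks in
 * NET_Read(fd, ...) while thread B calls NET_SocketClose(fd). closefd()
 * marks A's thread entry as interrupted and signals it with sigWakeup;
 * A's recv() fails with EINTR, endOp() rewrites errno to EBADF, and
 * NET_Read returns -1/EBADF instead of restarting.
 */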

/************** Basic I/O operations here ***************/

/*
 * Macro to perform a blocking IO operation. Restarts
 * automatically if interrupted by a signal (other than
 * our wakeup signal).
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
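
/*
 * For illustration, NET_Read below effectively expands to:
 *
 *   int NET_Read(int s, void* buf, size_t len) {
 *       int ret;
 *       threadEntry_t self;
 *       fdEntry_t *fdEntry = getFdEntry(s);
 *       if (fdEntry == NULL) {
 *           errno = EBADF;
 *           return -1;
 *       }
 *       do {
 *           startOp(fdEntry, &self);
 *           ret = recv(s, buf, len, 0);
 *           endOp(fdEntry, &self);
 *       } while (ret == -1 && errno == EINTR);
 *       return ret;
 *   }
 */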

int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, MSG_NONBLOCK) );
}

int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}

int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}

int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}

int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}

int NET_SendTo(int s, const void *msg, int len, unsigned int flags,
       const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}

int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}

int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    int crc = -1, prc = -1;
    threadEntry_t self;
    fdEntry_t* fdEntry = getFdEntry(s);

    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /* On AIX, when the system call connect() is interrupted, the connection
     * is not aborted and it will be established asynchronously by the kernel.
     * Hence, there is no need to restart connect() when EINTR is received.
     */
    startOp(fdEntry, &self);
    crc = connect(s, addr, addrlen);
    endOp(fdEntry, &self);

    if (crc == -1 && errno == EINTR) {
        struct pollfd s_pollfd;
        int sockopt_arg = 0;
        socklen_t len;

        s_pollfd.fd = s;
        s_pollfd.events = POLLOUT | POLLERR;

        /* Poll the file descriptor */
        do {
            startOp(fdEntry, &self);
            prc = poll(&s_pollfd, 1, -1);
            endOp(fdEntry, &self);
        } while (prc == -1 && errno == EINTR);

        if (prc < 0)
            return prc;

        len = sizeof(sockopt_arg);

        /* Check whether the connection has been established */
        if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1)
            return -1;

        if (sockopt_arg != 0) {
            errno = sockopt_arg;
            return -1;
        }
    } else {
        return crc;
    }

    /* At this point, the fd is connected. Return the success code. */
    return 0;
}
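
/*
 * Note: the poll() + getsockopt(SO_ERROR) sequence above is the standard
 * idiom for completing an asynchronous connect: writability signals that
 * the handshake has finished, and SO_ERROR reports whether it succeeded
 * (0) or failed (the pending error code, which is copied into errno).
 */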

int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}

/*
 * Wrapper for poll(s, timeout).
 * Auto-restarts with an adjusted timeout if interrupted by a
 * signal other than our wakeup signal.
 */
int NET_Timeout(JNIEnv *env, int s, long timeout, jlong nanoTimeStamp) {
    jlong prevNanoTime = nanoTimeStamp;
    jlong nanoTimeout = (jlong) timeout * NET_NSEC_PER_MSEC;
    fdEntry_t *fdEntry = getFdEntry(s);

    /*
     * Check that the fd hasn't been closed.
     */
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    for (;;) {
        struct pollfd pfd;
        int rv;
        threadEntry_t self;

        /*
         * Poll the fd. If interrupted by our wakeup signal,
         * errno will be set to EBADF.
         */
        pfd.fd = s;
        pfd.events = POLLIN | POLLERR;

        startOp(fdEntry, &self);
        rv = poll(&pfd, 1, nanoTimeout / NET_NSEC_PER_MSEC);
        endOp(fdEntry, &self);

        /*
         * If interrupted then adjust the timeout. If the timeout
         * has expired, return 0 (indicating timeout expired).
         */
        if (rv < 0 && errno == EINTR) {
            jlong newNanoTime = JVM_NanoTime(env, 0);
            nanoTimeout -= newNanoTime - prevNanoTime;
            if (nanoTimeout < NET_NSEC_PER_MSEC) {
                return 0;
            }
            prevNanoTime = newNanoTime;
        } else {
            return rv;
        }
    }
}
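
/*
 * Illustrative caller of NET_Timeout (a sketch; the fd and timeout value
 * are made up for the example):
 *
 *   jlong start = JVM_NanoTime(env, 0);
 *   int rv = NET_Timeout(env, fd, 5000, start);   // wait up to 5 seconds
 *   if (rv == 0)  { ... timeout expired ... }
 *   if (rv == -1) { ... error; EBADF if fd was closed concurrently ... }
 */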