1/*
2 * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2016, SAP SE and/or its affiliates. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.  Oracle designates this
9 * particular file as subject to the "Classpath" exception as provided
10 * by Oracle in the LICENSE file that accompanied this code.
11 *
12 * This code is distributed in the hope that it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 * version 2 for more details (a copy is included in the LICENSE file that
16 * accompanied this code).
17 *
18 * You should have received a copy of the GNU General Public License version
19 * 2 along with this work; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23 * or visit www.oracle.com if you need additional information or have any
24 * questions.
25 */
26
27/*
28 * This file contains implementations of NET_... functions. The NET_.. functions are
29 * wrappers for common file- and socket functions plus provisions for non-blocking IO.
30 *
 * (basically, the layer remembers all threads waiting on a particular fd;
 *  all threads waiting on a certain fd can be woken up by sending them a signal; this
 *  is done e.g. when the fd is closed.)
34 *
35 * This was originally copied from the linux_close.c implementation.
36 *
 * Side Note: This code needs initialization. Under Linux this is done
38 * automatically via __attribute((constructor)), on AIX this is done manually
39 * (see aix_close_init).
40 *
41 */
42
43/*
44   AIX needs a workaround for I/O cancellation, see:
45   http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm
46   ...
47   The close subroutine is blocked until all subroutines which use the file
48   descriptor return to usr space. For example, when a thread is calling close
49   and another thread is calling select with the same file descriptor, the
50   close subroutine does not return until the select call returns.
51   ...
52*/
53
54#include <assert.h>
55#include <limits.h>
56#include <stdio.h>
57#include <stdlib.h>
58#include <signal.h>
59#include <pthread.h>
60#include <sys/types.h>
61#include <sys/socket.h>
62#include <sys/time.h>
63#include <sys/resource.h>
64#include <sys/uio.h>
65#include <unistd.h>
66#include <errno.h>
67#include <sys/poll.h>
68
69/*
70 * Stack allocated by thread when doing blocking operation
71 */
72typedef struct threadEntry {
73    pthread_t thr;                      /* this thread */
74    struct threadEntry *next;           /* next thread */
75    int intr;                           /* interrupted */
76} threadEntry_t;
77
78/*
79 * Heap allocated during initialized - one entry per fd
80 */
81typedef struct {
82    pthread_mutex_t lock;               /* fd lock */
83    threadEntry_t *threads;             /* threads blocked on fd */
84} fdEntry_t;
85
86/*
87 * Signal to unblock thread
88 */
89static int sigWakeup = (SIGRTMAX - 1);
90
91/*
92 * fdTable holds one entry per file descriptor, up to a certain
93 * maximum.
94 * Theoretically, the number of possible file descriptors can get
95 * large, though usually it does not. Entries for small value file
96 * descriptors are kept in a simple table, which covers most scenarios.
97 * Entries for large value file descriptors are kept in an overflow
98 * table, which is organized as a sparse two dimensional array whose
99 * slabs are allocated on demand. This covers all corner cases while
100 * keeping memory consumption reasonable.
101 */
102
103/* Base table for low value file descriptors */
104static fdEntry_t* fdTable = NULL;
105/* Maximum size of base table (in number of entries). */
106static const int fdTableMaxSize = 0x1000; /* 4K */
107/* Actual size of base table (in number of entries) */
108static int fdTableLen = 0;
109/* Max. theoretical number of file descriptors on system. */
110static int fdLimit = 0;
111
112/* Overflow table, should base table not be large enough. Organized as
113 *   an array of n slabs, each holding 64k entries.
114 */
115static fdEntry_t** fdOverflowTable = NULL;
116/* Number of slabs in the overflow table */
117static int fdOverflowTableLen = 0;
118/* Number of entries in one slab */
119static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
120pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;
121
122/*
123 * Null signal handler
124 */
125static void sig_wakeup(int sig) {
126}
127
128/*
129 * Initialization routine (executed when library is loaded)
130 * Allocate fd tables and sets up signal handler.
131 *
132 * On AIX we don't have __attribute((constructor)) so we need to initialize
133 * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c')
134 */
135void aix_close_init() {
136    struct rlimit nbr_files;
137    sigset_t sigset;
138    struct sigaction sa;
139    int i = 0;
140
141    /* Determine the maximum number of possible file descriptors. */
142    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
143        fprintf(stderr, "library initialization failed - "
144                "unable to get max # of allocated fds\n");
145        abort();
146    }
147    if (nbr_files.rlim_max != RLIM_INFINITY) {
148        fdLimit = nbr_files.rlim_max;
149    } else {
150        /* We just do not know. */
151        fdLimit = INT_MAX;
152    }
153
154    /* Allocate table for low value file descriptors. */
155    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
156    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
157    if (fdTable == NULL) {
158        fprintf(stderr, "library initialization failed - "
159                "unable to allocate file descriptor table - out of memory");
160        abort();
161    } else {
162        for (i = 0; i < fdTableLen; i ++) {
163            pthread_mutex_init(&fdTable[i].lock, NULL);
164        }
165    }
166
167    /* Allocate overflow table, if needed */
168    if (fdLimit > fdTableMaxSize) {
169        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
170        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
171        if (fdOverflowTable == NULL) {
172            fprintf(stderr, "library initialization failed - "
173                    "unable to allocate file descriptor overflow table - out of memory");
174            abort();
175        }
176    }
177
178    /*
179     * Setup the signal handler
180     */
181    sa.sa_handler = sig_wakeup;
182    sa.sa_flags   = 0;
183    sigemptyset(&sa.sa_mask);
184    sigaction(sigWakeup, &sa, NULL);
185
186    sigemptyset(&sigset);
187    sigaddset(&sigset, sigWakeup);
188    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
189}
190
191/*
192 * Return the fd table for this fd.
193 */
194static inline fdEntry_t *getFdEntry(int fd)
195{
196    fdEntry_t* result = NULL;
197
198    if (fd < 0) {
199        return NULL;
200    }
201
202    /* This should not happen. If it does, our assumption about
203     * max. fd value was wrong. */
204    assert(fd < fdLimit);
205
206    if (fd < fdTableMaxSize) {
207        /* fd is in base table. */
208        assert(fd < fdTableLen);
209        result = &fdTable[fd];
210    } else {
211        /* fd is in overflow table. */
212        const int indexInOverflowTable = fd - fdTableMaxSize;
213        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
214        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
215        fdEntry_t* slab = NULL;
216        assert(rootindex < fdOverflowTableLen);
217        assert(slabindex < fdOverflowTableSlabSize);
218        pthread_mutex_lock(&fdOverflowTableLock);
219        /* Allocate new slab in overflow table if needed */
220        if (fdOverflowTable[rootindex] == NULL) {
221            fdEntry_t* const newSlab =
222                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
223            if (newSlab == NULL) {
224                fprintf(stderr, "Unable to allocate file descriptor overflow"
225                        " table slab - out of memory");
226                pthread_mutex_unlock(&fdOverflowTableLock);
227                abort();
228            } else {
229                int i;
230                for (i = 0; i < fdOverflowTableSlabSize; i ++) {
231                    pthread_mutex_init(&newSlab[i].lock, NULL);
232                }
233                fdOverflowTable[rootindex] = newSlab;
234            }
235        }
236        pthread_mutex_unlock(&fdOverflowTableLock);
237        slab = fdOverflowTable[rootindex];
238        result = &slab[slabindex];
239    }
240
241    return result;
242
243}
244
245
246/*
247 * Start a blocking operation :-
248 *    Insert thread onto thread list for the fd.
249 */
250static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
251{
252    self->thr = pthread_self();
253    self->intr = 0;
254
255    pthread_mutex_lock(&(fdEntry->lock));
256    {
257        self->next = fdEntry->threads;
258        fdEntry->threads = self;
259    }
260    pthread_mutex_unlock(&(fdEntry->lock));
261}
262
263/*
264 * End a blocking operation :-
265 *     Remove thread from thread list for the fd
266 *     If fd has been interrupted then set errno to EBADF
267 */
268static inline void endOp
269    (fdEntry_t *fdEntry, threadEntry_t *self)
270{
271    int orig_errno = errno;
272    pthread_mutex_lock(&(fdEntry->lock));
273    {
274        threadEntry_t *curr, *prev=NULL;
275        curr = fdEntry->threads;
276        while (curr != NULL) {
277            if (curr == self) {
278                if (curr->intr) {
279                    orig_errno = EBADF;
280                }
281                if (prev == NULL) {
282                    fdEntry->threads = curr->next;
283                } else {
284                    prev->next = curr->next;
285                }
286                break;
287            }
288            prev = curr;
289            curr = curr->next;
290        }
291    }
292    pthread_mutex_unlock(&(fdEntry->lock));
293    errno = orig_errno;
294}
295
296/*
297 * Close or dup2 a file descriptor ensuring that all threads blocked on
298 * the file descriptor are notified via a wakeup signal.
299 *
300 *      fd1 < 0    => close(fd2)
301 *      fd1 >= 0   => dup2(fd1, fd2)
302 *
303 * Returns -1 with errno set if operation fails.
304 */
305static int closefd(int fd1, int fd2) {
306    int rv, orig_errno;
307    fdEntry_t *fdEntry = getFdEntry(fd2);
308    if (fdEntry == NULL) {
309        errno = EBADF;
310        return -1;
311    }
312
313    /*
314     * Lock the fd to hold-off additional I/O on this fd.
315     */
316    pthread_mutex_lock(&(fdEntry->lock));
317
318    {
319        /* On fast machines we see that we enter dup2 before the
320         * accepting thread had a chance to get and process the signal.
321         * So in case we woke a thread up, give it some time to cope.
322         * Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
323        int num_woken = 0;
324
325        /*
326         * Send a wakeup signal to all threads blocked on this
327         * file descriptor.
328         */
329        threadEntry_t *curr = fdEntry->threads;
330        while (curr != NULL) {
331            curr->intr = 1;
332            pthread_kill( curr->thr, sigWakeup );
333            num_woken ++;
334            curr = curr->next;
335        }
336
337        if (num_woken > 0) {
338          usleep(num_woken * 50);
339        }
340
341        /*
342         * And close/dup the file descriptor
343         * (restart if interrupted by signal)
344         */
345        do {
346            if (fd1 < 0) {
347                rv = close(fd2);
348            } else {
349                rv = dup2(fd1, fd2);
350            }
351        } while (rv == -1 && errno == EINTR);
352    }
353
354    /*
355     * Unlock without destroying errno
356     */
357    orig_errno = errno;
358    pthread_mutex_unlock(&(fdEntry->lock));
359    errno = orig_errno;
360
361    return rv;
362}
363
364/*
365 * Wrapper for dup2 - same semantics as dup2 system call except
366 * that any threads blocked in an I/O system call on fd2 will be
367 * preempted and return -1/EBADF;
368 */
369int NET_Dup2(int fd, int fd2) {
370    if (fd < 0) {
371        errno = EBADF;
372        return -1;
373    }
374    return closefd(fd, fd2);
375}
376
377/*
378 * Wrapper for close - same semantics as close system call
379 * except that any threads blocked in an I/O on fd will be
380 * preempted and the I/O system call will return -1/EBADF.
381 */
382int NET_SocketClose(int fd) {
383    return closefd(-1, fd);
384}
385
386/************** Basic I/O operations here ***************/
387
388/*
389 * Macro to perform a blocking IO operation. Restarts
390 * automatically if interrupted by signal (other than
391 * our wakeup signal)
392 */
393#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
394    int ret;                                    \
395    threadEntry_t self;                         \
396    fdEntry_t *fdEntry = getFdEntry(FD);        \
397    if (fdEntry == NULL) {                      \
398        errno = EBADF;                          \
399        return -1;                              \
400    }                                           \
401    do {                                        \
402        startOp(fdEntry, &self);                \
403        ret = FUNC;                             \
404        endOp(fdEntry, &self);                  \
405    } while (ret == -1 && errno == EINTR);      \
406    return ret;                                 \
407}
408
/*
 * recv() wrapper: blocks until data or error; returns -1/EBADF if the
 * fd is closed concurrently via NET_SocketClose()/NET_Dup2().
 */
int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}
412
/*
 * Non-blocking recv() wrapper: the MSG_NONBLOCK flag (AIX-specific)
 * makes this single call non-blocking regardless of the socket's mode.
 */
int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT(s, recv(s, buf, len, MSG_NONBLOCK));
}
416
/* readv() wrapper; interruptible by a concurrent close (-1/EBADF). */
int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}
420
/* recvfrom() wrapper; interruptible by a concurrent close (-1/EBADF). */
int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}
425
/* send() wrapper; interruptible by a concurrent close (-1/EBADF). */
int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}
429
/* writev() wrapper; interruptible by a concurrent close (-1/EBADF). */
int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}
433
/* sendto() wrapper; interruptible by a concurrent close (-1/EBADF). */
int NET_SendTo(int s, const void *msg, int len,  unsigned  int
       flags, const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}
438
/* accept() wrapper; interruptible by a concurrent close (-1/EBADF). */
int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}
442
/*
 * connect() wrapper with AIX-specific EINTR recovery: if the initial
 * connect() is interrupted, the kernel keeps establishing the connection
 * asynchronously, so we poll for writability and then read SO_ERROR to
 * learn the final outcome instead of retrying connect().
 * Returns 0 on success, -1 with errno set on failure; a wakeup from a
 * concurrent close surfaces as -1/EBADF via endOp().
 */
int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    int crc = -1, prc = -1;
    threadEntry_t self;
    fdEntry_t* fdEntry = getFdEntry(s);

    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /* On AIX, when the system call connect() is interrupted, the connection
     * is not aborted and it will be established asynchronously by the kernel.
     * Hence, no need to restart connect() when EINTR is received
     */
    startOp(fdEntry, &self);
    crc = connect(s, addr, addrlen);
    endOp(fdEntry, &self);

    if (crc == -1 && errno == EINTR) {
        struct pollfd s_pollfd;
        int sockopt_arg = 0;
        socklen_t len;

        s_pollfd.fd = s;
        s_pollfd.events = POLLOUT | POLLERR;

        /* poll the file descriptor until the asynchronous connect completes */
        do {
            startOp(fdEntry, &self);
            prc = poll(&s_pollfd, 1, -1);
            endOp(fdEntry, &self);
        } while (prc == -1  && errno == EINTR);

        if (prc < 0)
            return prc;

        len = sizeof(sockopt_arg);

        /* Check whether the connection has been established */
        if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1)
            return -1;

        /* a non-zero SO_ERROR means the async connect failed */
        if (sockopt_arg != 0 ) {
            errno = sockopt_arg;
            return -1;
        }
    } else {
        /* connect() finished synchronously - success or a non-EINTR error */
        return crc;
    }

    /* At this point, fd is connected. Set successful return code */
    return 0;
}
496
/*
 * poll() wrapper. Note: only ufds[0].fd is registered on the waiter
 * list, so only a close of that first fd interrupts the call with
 * -1/EBADF.
 */
int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}
500
501/*
502 * Wrapper for poll(s, timeout).
503 * Auto restarts with adjusted timeout if interrupted by
504 * signal other than our wakeup signal.
505 */
506int NET_Timeout0(int s, long timeout, long currentTime) {
507    long prevtime = currentTime, newtime;
508    struct timeval t;
509    fdEntry_t *fdEntry = getFdEntry(s);
510
511    /*
512     * Check that fd hasn't been closed.
513     */
514    if (fdEntry == NULL) {
515        errno = EBADF;
516        return -1;
517    }
518
519    for(;;) {
520        struct pollfd pfd;
521        int rv;
522        threadEntry_t self;
523
524        /*
525         * Poll the fd. If interrupted by our wakeup signal
526         * errno will be set to EBADF.
527         */
528        pfd.fd = s;
529        pfd.events = POLLIN | POLLERR;
530
531        startOp(fdEntry, &self);
532        rv = poll(&pfd, 1, timeout);
533        endOp(fdEntry, &self);
534
535        /*
536         * If interrupted then adjust timeout. If timeout
537         * has expired return 0 (indicating timeout expired).
538         */
539        if (rv < 0 && errno == EINTR) {
540            if (timeout > 0) {
541                gettimeofday(&t, NULL);
542                newtime = t.tv_sec * 1000  +  t.tv_usec / 1000;
543                timeout -= newtime - prevtime;
544                if (timeout <= 0) {
545                    return 0;
546                }
547                prevtime = newtime;
548            }
549        } else {
550            return rv;
551        }
552
553    }
554}
555