1/*
2 * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26#include <assert.h>
27#include <limits.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <signal.h>
31#include <pthread.h>
32#include <sys/types.h>
33#include <sys/socket.h>
34#include <sys/time.h>
35#include <sys/resource.h>
36#include <sys/uio.h>
37#include <unistd.h>
38#include <errno.h>
39#include <poll.h>
40#include "jvm.h"
41#include "net_util.h"
42
/*
 * Record describing one thread that is currently inside a blocking
 * operation on a file descriptor. The blocking wrappers in this file
 * declare it as a local variable, so it lives on the calling thread's
 * stack; startOp() links it into the fd's thread list and endOp()
 * unlinks it again.
 */
typedef struct threadEntry {
    pthread_t thr;                      /* the blocked thread (set by startOp) */
    struct threadEntry *next;           /* next entry in the fd's thread list */
    int intr;                           /* set to 1 by closefd() when the fd is closed/dup2'ed */
} threadEntry_t;
51
/*
 * Per-file-descriptor bookkeeping, one entry per fd. Entries are heap
 * allocated with calloc(): the base table during library initialization
 * (init) and overflow slabs on demand (getFdEntry).
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock; protects the threads list */
    threadEntry_t *threads;             /* head of list of threads blocked on fd */
} fdEntry_t;
59
/*
 * Signal used to unblock a thread: closefd() sends it with pthread_kill()
 * to every thread registered on the fd, and init() installs a no-op
 * handler for it and unblocks it.
 * NOTE(review): __SIGRTMAX is a C-library internal macro (presumably the
 * highest real-time signal number) - confirm it is available on all
 * supported libcs.
 */
static int sigWakeup = (__SIGRTMAX - 2);
64
/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for low-valued file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for high-valued file descriptors are kept in an overflow
 * table, which is organized as a sparse two-dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */

/* Base table for low-valued file descriptors; allocated in init(). */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries): min(fdLimit, fdTableMaxSize). */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system (from RLIMIT_NOFILE). */
static int fdLimit = 0;

/* Overflow table, used when the base table is not large enough. Organized
 *   as an array of n slab pointers, each slab holding 64k entries. A NULL
 *   slot means the slab has not been allocated yet.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
/* Serializes on-demand slab allocation in getFdEntry().
 * NOTE(review): this is the only file-scope object here without 'static';
 * confirm whether external linkage is intended. */
pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;
95
96/*
97 * Null signal handler
98 */
99static void sig_wakeup(int sig) {
100}
101
102/*
103 * Initialization routine (executed when library is loaded)
104 * Allocate fd tables and sets up signal handler.
105 */
106static void __attribute((constructor)) init() {
107    struct rlimit nbr_files;
108    sigset_t sigset;
109    struct sigaction sa;
110    int i = 0;
111
112    /* Determine the maximum number of possible file descriptors. */
113    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
114        fprintf(stderr, "library initialization failed - "
115                "unable to get max # of allocated fds\n");
116        abort();
117    }
118    if (nbr_files.rlim_max != RLIM_INFINITY) {
119        fdLimit = nbr_files.rlim_max;
120    } else {
121        /* We just do not know. */
122        fdLimit = INT_MAX;
123    }
124
125    /* Allocate table for low value file descriptors. */
126    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
127    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
128    if (fdTable == NULL) {
129        fprintf(stderr, "library initialization failed - "
130                "unable to allocate file descriptor table - out of memory");
131        abort();
132    } else {
133        for (i = 0; i < fdTableLen; i ++) {
134            pthread_mutex_init(&fdTable[i].lock, NULL);
135        }
136    }
137
138    /* Allocate overflow table, if needed */
139    if (fdLimit > fdTableMaxSize) {
140        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
141        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
142        if (fdOverflowTable == NULL) {
143            fprintf(stderr, "library initialization failed - "
144                    "unable to allocate file descriptor overflow table - out of memory");
145            abort();
146        }
147    }
148
149    /*
150     * Setup the signal handler
151     */
152    sa.sa_handler = sig_wakeup;
153    sa.sa_flags   = 0;
154    sigemptyset(&sa.sa_mask);
155    sigaction(sigWakeup, &sa, NULL);
156
157    sigemptyset(&sigset);
158    sigaddset(&sigset, sigWakeup);
159    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
160}
161
162/*
163 * Return the fd table for this fd.
164 */
165static inline fdEntry_t *getFdEntry(int fd)
166{
167    fdEntry_t* result = NULL;
168
169    if (fd < 0) {
170        return NULL;
171    }
172
173    /* This should not happen. If it does, our assumption about
174     * max. fd value was wrong. */
175    assert(fd < fdLimit);
176
177    if (fd < fdTableMaxSize) {
178        /* fd is in base table. */
179        assert(fd < fdTableLen);
180        result = &fdTable[fd];
181    } else {
182        /* fd is in overflow table. */
183        const int indexInOverflowTable = fd - fdTableMaxSize;
184        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
185        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
186        fdEntry_t* slab = NULL;
187        assert(rootindex < fdOverflowTableLen);
188        assert(slabindex < fdOverflowTableSlabSize);
189        pthread_mutex_lock(&fdOverflowTableLock);
190        /* Allocate new slab in overflow table if needed */
191        if (fdOverflowTable[rootindex] == NULL) {
192            fdEntry_t* const newSlab =
193                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
194            if (newSlab == NULL) {
195                fprintf(stderr, "Unable to allocate file descriptor overflow"
196                        " table slab - out of memory");
197                pthread_mutex_unlock(&fdOverflowTableLock);
198                abort();
199            } else {
200                int i;
201                for (i = 0; i < fdOverflowTableSlabSize; i ++) {
202                    pthread_mutex_init(&newSlab[i].lock, NULL);
203                }
204                fdOverflowTable[rootindex] = newSlab;
205            }
206        }
207        pthread_mutex_unlock(&fdOverflowTableLock);
208        slab = fdOverflowTable[rootindex];
209        result = &slab[slabindex];
210    }
211
212    return result;
213
214}
215
216/*
217 * Start a blocking operation :-
218 *    Insert thread onto thread list for the fd.
219 */
220static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
221{
222    self->thr = pthread_self();
223    self->intr = 0;
224
225    pthread_mutex_lock(&(fdEntry->lock));
226    {
227        self->next = fdEntry->threads;
228        fdEntry->threads = self;
229    }
230    pthread_mutex_unlock(&(fdEntry->lock));
231}
232
233/*
234 * End a blocking operation :-
235 *     Remove thread from thread list for the fd
236 *     If fd has been interrupted then set errno to EBADF
237 */
238static inline void endOp
239    (fdEntry_t *fdEntry, threadEntry_t *self)
240{
241    int orig_errno = errno;
242    pthread_mutex_lock(&(fdEntry->lock));
243    {
244        threadEntry_t *curr, *prev=NULL;
245        curr = fdEntry->threads;
246        while (curr != NULL) {
247            if (curr == self) {
248                if (curr->intr) {
249                    orig_errno = EBADF;
250                }
251                if (prev == NULL) {
252                    fdEntry->threads = curr->next;
253                } else {
254                    prev->next = curr->next;
255                }
256                break;
257            }
258            prev = curr;
259            curr = curr->next;
260        }
261    }
262    pthread_mutex_unlock(&(fdEntry->lock));
263    errno = orig_errno;
264}
265
266/*
267 * Close or dup2 a file descriptor ensuring that all threads blocked on
268 * the file descriptor are notified via a wakeup signal.
269 *
270 *      fd1 < 0    => close(fd2)
271 *      fd1 >= 0   => dup2(fd1, fd2)
272 *
273 * Returns -1 with errno set if operation fails.
274 */
275static int closefd(int fd1, int fd2) {
276    int rv, orig_errno;
277    fdEntry_t *fdEntry = getFdEntry(fd2);
278    if (fdEntry == NULL) {
279        errno = EBADF;
280        return -1;
281    }
282
283    /*
284     * Lock the fd to hold-off additional I/O on this fd.
285     */
286    pthread_mutex_lock(&(fdEntry->lock));
287
288    {
289        /*
290         * And close/dup the file descriptor
291         * (restart if interrupted by signal)
292         */
293        do {
294            if (fd1 < 0) {
295                rv = close(fd2);
296            } else {
297                rv = dup2(fd1, fd2);
298            }
299        } while (rv == -1 && errno == EINTR);
300
301        /*
302         * Send a wakeup signal to all threads blocked on this
303         * file descriptor.
304         */
305        threadEntry_t *curr = fdEntry->threads;
306        while (curr != NULL) {
307            curr->intr = 1;
308            pthread_kill( curr->thr, sigWakeup );
309            curr = curr->next;
310        }
311    }
312
313    /*
314     * Unlock without destroying errno
315     */
316    orig_errno = errno;
317    pthread_mutex_unlock(&(fdEntry->lock));
318    errno = orig_errno;
319
320    return rv;
321}
322
/*
 * dup2 wrapper. Fails with -1/EBADF for a negative source descriptor;
 * otherwise duplicates fd onto fd2 through closefd(), which also wakes
 * up any threads registered as blocked on fd2.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd >= 0) {
        return closefd(fd, fd2);
    }

    /* A negative first argument would make closefd() close fd2 instead. */
    errno = EBADF;
    return -1;
}
335
/*
 * close wrapper. Closes fd through closefd(), which also wakes up any
 * threads registered as blocked on fd.
 */
int NET_SocketClose(int fd) {
    /* A negative first argument selects the close() path of closefd(). */
    const int closeOnly = -1;

    return closefd(closeOnly, fd);
}
344
345/************** Basic I/O operations here ***************/
346
/*
 * Macro to perform a blocking IO operation on descriptor FD.
 *
 * Expands to a complete block that ends in 'return', so it is meant to
 * be the whole body of a function returning int:
 *   - looks up the fd table entry for FD; if there is none, sets errno
 *     to EBADF and returns -1 without evaluating FUNC;
 *   - registers the calling thread on the fd (startOp), evaluates the
 *     expression FUNC, and unregisters the thread again (endOp);
 *   - repeats that sequence while FUNC yields -1 with errno == EINTR,
 *     i.e. restarts automatically if interrupted by a signal other than
 *     our wakeup signal (after a wakeup, endOp replaces errno with
 *     EBADF, which ends the loop);
 *   - returns the last value of FUNC.
 *
 * FUNC is re-evaluated on every retry. The expansion declares the locals
 * 'ret', 'self' and 'fdEntry', so FUNC must not refer to outer variables
 * with those names.
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
367
/*
 * Interruptible recv(s, buf, len, 0), executed through
 * BLOCKING_IO_RETURN_INT (restarted on EINTR; -1/EBADF if there is no
 * table entry for s). The result of recv is returned as an int.
 */
int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}
371
/*
 * Same as NET_Read, but passes MSG_DONTWAIT to recv so that this
 * particular call does not block.
 */
int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, MSG_DONTWAIT) );
}
375
/*
 * Interruptible readv(s, vector, count), executed through
 * BLOCKING_IO_RETURN_INT (restarted on EINTR; -1/EBADF if there is no
 * table entry for s).
 */
int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}
379
/*
 * Interruptible recvfrom(s, buf, len, flags, from, fromlen), executed
 * through BLOCKING_IO_RETURN_INT (restarted on EINTR; -1/EBADF if there
 * is no table entry for s).
 */
int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}
384
/*
 * Interruptible send(s, msg, len, flags), executed through
 * BLOCKING_IO_RETURN_INT (restarted on EINTR; -1/EBADF if there is no
 * table entry for s).
 */
int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}
388
/*
 * Interruptible writev(s, vector, count), executed through
 * BLOCKING_IO_RETURN_INT (restarted on EINTR; -1/EBADF if there is no
 * table entry for s).
 */
int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}
392
/*
 * Interruptible sendto(s, msg, len, flags, to, tolen), executed through
 * BLOCKING_IO_RETURN_INT (restarted on EINTR; -1/EBADF if there is no
 * table entry for s).
 */
int NET_SendTo(int s, const void *msg, int len,  unsigned  int
       flags, const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}
397
/*
 * Interruptible accept(s, addr, addrlen), executed through
 * BLOCKING_IO_RETURN_INT (restarted on EINTR; -1/EBADF if there is no
 * table entry for s). Returns the value returned by accept.
 */
int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}
401
/*
 * Interruptible connect(s, addr, addrlen), executed through
 * BLOCKING_IO_RETURN_INT (-1/EBADF if there is no table entry for s).
 * Note that connect is called again if it fails with EINTR.
 */
int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    BLOCKING_IO_RETURN_INT( s, connect(s, addr, addrlen) );
}
405
/*
 * Interruptible poll(ufds, nfds, timeout), executed through
 * BLOCKING_IO_RETURN_INT. The calling thread is registered only on the
 * first descriptor, ufds[0].fd; a poll restarted after EINTR uses the
 * original timeout again.
 * NOTE(review): ufds[0] is read unconditionally, so ufds must contain at
 * least one element even when nfds is 0 - confirm callers guarantee this.
 */
int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}
409
410/*
411 * Wrapper for poll(s, timeout).
412 * Auto restarts with adjusted timeout if interrupted by
413 * signal other than our wakeup signal.
414 */
415int NET_Timeout(JNIEnv *env, int s, long timeout, jlong nanoTimeStamp) {
416    jlong prevNanoTime = nanoTimeStamp;
417    jlong nanoTimeout = (jlong)timeout * NET_NSEC_PER_MSEC;
418    fdEntry_t *fdEntry = getFdEntry(s);
419
420    /*
421     * Check that fd hasn't been closed.
422     */
423    if (fdEntry == NULL) {
424        errno = EBADF;
425        return -1;
426    }
427
428    for(;;) {
429        struct pollfd pfd;
430        int rv;
431        threadEntry_t self;
432
433        /*
434         * Poll the fd. If interrupted by our wakeup signal
435         * errno will be set to EBADF.
436         */
437        pfd.fd = s;
438        pfd.events = POLLIN | POLLERR;
439
440        startOp(fdEntry, &self);
441        rv = poll(&pfd, 1, nanoTimeout / NET_NSEC_PER_MSEC);
442        endOp(fdEntry, &self);
443        /*
444         * If interrupted then adjust timeout. If timeout
445         * has expired return 0 (indicating timeout expired).
446         */
447        if (rv < 0 && errno == EINTR) {
448            jlong newNanoTime = JVM_NanoTime(env, 0);
449            nanoTimeout -= newNanoTime - prevNanoTime;
450            if (nanoTimeout < NET_NSEC_PER_MSEC) {
451                return 0;
452            }
453            prevNanoTime = newNanoTime;
454        } else {
455            return rv;
456        }
457    }
458}
459