/*
 * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/param.h>
#include <signal.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/select.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>
#include <sys/poll.h>

/*
 * Stack allocated by thread when doing blocking operation
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread */
    struct threadEntry *next;           /* next thread */
    int intr;                           /* interrupted */
} threadEntry_t;

/*
 * Heap allocated during initialization - one entry per fd
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock */
    threadEntry_t *threads;             /* threads blocked on fd */
} fdEntry_t;

/*
 * Signal to unblock thread
 */
static int sigWakeup = SIGIO;

/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for small value file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for large value file descriptors are kept in an overflow
 * table, which is organized as a sparse two dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */
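/*
 * Example: with a base table of 0x1000 entries and slabs of 0x10000
 * entries, fd 70000 falls into the overflow table at slab
 * (70000 - 0x1000) / 0x10000 = 1, entry (70000 - 0x1000) % 0x10000 = 368.
 */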

/* Base table for low value file descriptors */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries) */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system. */
static int fdLimit = 0;

/* Overflow table, should base table not be large enough. Organized as
 *   an array of n slabs, each holding 64k entries.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Null signal handler. A handler must be installed (rather than
 * ignoring the signal) so that the wakeup signal interrupts blocked
 * system calls with EINTR.
 */
static void sig_wakeup(int sig) {
}

/*
 * Initialization routine (executed when library is loaded)
 * Allocates fd tables and sets up the signal handler.
 */
static void __attribute__((constructor)) init() {
    struct rlimit nbr_files;
    sigset_t sigset;
    struct sigaction sa;
    int i = 0;

    /* Determine the maximum number of possible file descriptors. */
    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
        fprintf(stderr, "library initialization failed - "
                "unable to get max # of allocated fds\n");
        abort();
    }
    if (nbr_files.rlim_max != RLIM_INFINITY) {
        fdLimit = nbr_files.rlim_max;
    } else {
        /* We just do not know. */
        fdLimit = INT_MAX;
    }

    /* Allocate table for low value file descriptors. */
    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
    if (fdTable == NULL) {
        fprintf(stderr, "library initialization failed - "
                "unable to allocate file descriptor table - out of memory\n");
        abort();
    } else {
        for (i = 0; i < fdTableLen; i ++) {
            pthread_mutex_init(&fdTable[i].lock, NULL);
        }
    }

    /* Allocate overflow table, if needed */
    if (fdLimit > fdTableMaxSize) {
        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
        if (fdOverflowTable == NULL) {
            fprintf(stderr, "library initialization failed - "
                    "unable to allocate file descriptor overflow table - out of memory\n");
            abort();
        }
    }

    /*
     * Set up the signal handler
     */
    sa.sa_handler = sig_wakeup;
    sa.sa_flags   = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(sigWakeup, &sa, NULL);

    sigemptyset(&sigset);
    sigaddset(&sigset, sigWakeup);
    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
}

/*
 * Return the fd table entry for this fd.
 */
static inline fdEntry_t *getFdEntry(int fd)
{
    fdEntry_t* result = NULL;

    if (fd < 0) {
        return NULL;
    }

    /* This should not happen. If it does, our assumption about
     * max. fd value was wrong. */
    assert(fd < fdLimit);

    if (fd < fdTableMaxSize) {
        /* fd is in base table. */
        assert(fd < fdTableLen);
        result = &fdTable[fd];
    } else {
        /* fd is in overflow table. */
        const int indexInOverflowTable = fd - fdTableMaxSize;
        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
        fdEntry_t* slab = NULL;
        assert(rootindex < fdOverflowTableLen);
        assert(slabindex < fdOverflowTableSlabSize);
        pthread_mutex_lock(&fdOverflowTableLock);
        /* Allocate new slab in overflow table if needed */
        if (fdOverflowTable[rootindex] == NULL) {
            fdEntry_t* const newSlab =
                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
            if (newSlab == NULL) {
                fprintf(stderr, "Unable to allocate file descriptor overflow"
                        " table slab - out of memory\n");
                pthread_mutex_unlock(&fdOverflowTableLock);
                abort();
            } else {
                int i;
                for (i = 0; i < fdOverflowTableSlabSize; i ++) {
                    pthread_mutex_init(&newSlab[i].lock, NULL);
                }
                fdOverflowTable[rootindex] = newSlab;
            }
        }
        pthread_mutex_unlock(&fdOverflowTableLock);
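        /* Slabs are only ever allocated, never freed, so the pointer
         * read below is safe without holding the overflow table lock. */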
        slab = fdOverflowTable[rootindex];
        result = &slab[slabindex];
    }

    return result;
}

/*
 * Start a blocking operation :-
 *    Insert thread onto thread list for the fd.
 */
static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    self->thr = pthread_self();
    self->intr = 0;

    pthread_mutex_lock(&(fdEntry->lock));
    {
        self->next = fdEntry->threads;
        fdEntry->threads = self;
    }
    pthread_mutex_unlock(&(fdEntry->lock));
}

/*
 * End a blocking operation :-
 *     Remove thread from thread list for the fd
 *     If fd has been interrupted then set errno to EBADF
 */
static inline void endOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    int orig_errno = errno;
    pthread_mutex_lock(&(fdEntry->lock));
    {
        threadEntry_t *curr, *prev = NULL;
        curr = fdEntry->threads;
        while (curr != NULL) {
            if (curr == self) {
                if (curr->intr) {
                    orig_errno = EBADF;
                }
                if (prev == NULL) {
                    fdEntry->threads = curr->next;
                } else {
                    prev->next = curr->next;
                }
                break;
            }
            prev = curr;
            curr = curr->next;
        }
    }
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;
}

/*
 * Close or dup2 a file descriptor ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if operation fails.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold-off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill(curr->thr, sigWakeup);
            curr = curr->next;
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);

    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

/*
 * Wrapper for dup2 - same semantics as the dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }
    return closefd(fd, fd2);
}

/*
 * Wrapper for close - same semantics as the close system call
 * except that any threads blocked in an I/O system call on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    return closefd(-1, fd);
}
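
/*
 * For example, a thread blocked in NET_Read(fd, ...) below will return
 * -1 with errno set to EBADF once another thread calls NET_SocketClose(fd).
 */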

/************** Basic I/O operations here ***************/

/*
 * Macro to perform a blocking IO operation. Restarts
 * automatically if interrupted by signal (other than
 * our wakeup signal)
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
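
/*
 * Note: endOp() maps an interruption caused by the wakeup signal to
 * EBADF, so the retry loop above only restarts when an unrelated signal
 * caused EINTR. A new wrapper (hypothetical example, not part of this
 * file) would follow the same pattern as the functions below:
 *
 *     int NET_Recv(int s, void *buf, size_t len, int flags) {
 *         BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, flags) );
 *     }
 */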

int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, MSG_DONTWAIT) );
}

int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}

int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}

int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}

int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}

int NET_SendTo(int s, const void *msg, int len, unsigned int flags,
       const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}

int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}

int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    BLOCKING_IO_RETURN_INT( s, connect(s, addr, addrlen) );
}

int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}
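
/*
 * Note: only ufds[0].fd is registered for wakeup above, so closing any
 * other descriptor in the pollfd array will not interrupt the poll.
 */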

/*
 * Wrapper for select(s, timeout). We are using select() on Mac OS due to Bug 7131399.
 * Auto restarts with adjusted timeout if interrupted by
 * signal other than our wakeup signal.
 */
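/*
 * timeout and currentTime are in milliseconds; currentTime is the caller's
 * notion of "now" and is used as the starting point for reducing the
 * timeout after an interruption by an unrelated signal. Returns a positive
 * value when the socket is readable, 0 if the timeout expires, and -1 with
 * errno set on error (EBADF if the descriptor is closed concurrently).
 */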
int NET_Timeout0(int s, long timeout, long currentTime) {
    long prevtime = currentTime, newtime;
    struct timeval t, *tp = &t;
    fd_set fds;
    fd_set* fdsp = NULL;
    int allocated = 0;
    threadEntry_t self;
    fdEntry_t *fdEntry = getFdEntry(s);

    /*
     * Check that fd hasn't been closed.
     */
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Set up the select timeout; it may need to be adjusted later
     * if select() is interrupted by an unrelated signal.
     */
    if (timeout > 0) {
        /* Timed */
        t.tv_sec = timeout / 1000;
        t.tv_usec = (timeout % 1000) * 1000;
    } else if (timeout < 0) {
        /* Blocking */
        tp = 0;
    } else {
        /* Poll */
        t.tv_sec = 0;
        t.tv_usec = 0;
    }

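    /*
     * The stack fd_set only covers descriptors below FD_SETSIZE; for a
     * larger descriptor a sufficiently large bit array is heap-allocated.
     */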
    if (s < FD_SETSIZE) {
        fdsp = &fds;
        FD_ZERO(fdsp);
    } else {
        int length = (howmany(s+1, NFDBITS)) * sizeof(int);
        fdsp = (fd_set *) calloc(1, length);
        if (fdsp == NULL) {
            return -1;   // errno will be set to ENOMEM
        }
        allocated = 1;
    }
    FD_SET(s, fdsp);

    for (;;) {
        int rv;

        /*
         * Call select on the fd. If interrupted by our wakeup signal
         * errno will be set to EBADF.
         */
        startOp(fdEntry, &self);
        rv = select(s+1, fdsp, 0, 0, tp);
        endOp(fdEntry, &self);

        /*
         * If interrupted then adjust timeout. If timeout
         * has expired return 0 (indicating timeout expired).
         */
        if (rv < 0 && errno == EINTR) {
            if (timeout > 0) {
                struct timeval now;
                gettimeofday(&now, NULL);
                newtime = now.tv_sec * 1000 + now.tv_usec / 1000;
                timeout -= newtime - prevtime;
                if (timeout <= 0) {
                    if (allocated != 0)
                        free(fdsp);
                    return 0;
                }
                prevtime = newtime;
                t.tv_sec = timeout / 1000;
                t.tv_usec = (timeout % 1000) * 1000;
            }
        } else {
            if (allocated != 0)
                free(fdsp);
            return rv;
        }
    }
}
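
/*
 * Typical timed-read pattern (hypothetical illustration, not part of this
 * file): wait for readability with NET_Timeout0() and only then read, e.g.
 *
 *     if (NET_Timeout0(fd, timeoutMillis, nowMillis) > 0) {
 *         n = NET_Read(fd, buf, len);
 *     }
 */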