1/*
2 * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26#include <assert.h>
27#include <limits.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <signal.h>
31#include <pthread.h>
32#include <sys/types.h>
33#include <sys/socket.h>
34#include <sys/time.h>
35#include <sys/resource.h>
36#include <sys/uio.h>
37#include <unistd.h>
38#include <errno.h>
39#include <sys/poll.h>
40
/*
 * Stack allocated by a thread when doing a blocking operation.
 * One node in the singly linked list of threads currently blocked
 * on a given file descriptor (see fdEntry_t.threads). The node lives
 * on the blocking thread's stack for the duration of the syscall;
 * closefd() sets 'intr' and signals 'thr' to interrupt the operation.
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread */
    struct threadEntry *next;           /* next thread blocked on same fd */
    int intr;                           /* interrupted: set by closefd(), read by endOp() */
} threadEntry_t;
49
/*
 * Heap allocated during initialization - one entry per fd.
 * The lock serializes close/dup2 against new blocking operations on
 * the fd; 'threads' is the head of the list of threads currently
 * blocked in a syscall on this fd.
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock */
    threadEntry_t *threads;             /* threads blocked on fd */
} fdEntry_t;
57
/*
 * Signal used to unblock a thread stuck in a syscall: the handler is a
 * no-op, so delivery simply makes the syscall fail with EINTR.
 * NOTE(review): __SIGRTMAX is a glibc-internal macro (highest real-time
 * signal number); (__SIGRTMAX - 2) is chosen to avoid signals reserved
 * by the threading library - confirm it does not collide with other
 * users of RT signals in the process.
 */
static int sigWakeup = (__SIGRTMAX - 2);
62
/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for small value file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for large value file descriptors are kept in an overflow
 * table, which is organized as a sparse two dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */

/* Base table for low value file descriptors */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries); min(fdLimit, fdTableMaxSize) */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system (from RLIMIT_NOFILE). */
static int fdLimit = 0;

/* Overflow table, should base table not be large enough. Organized as
 *   an array of n slabs, each holding 64k entries.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
/* Guards lazy allocation of overflow-table slabs in getFdEntry().
 * NOTE(review): not declared 'static' unlike the other globals -
 * presumably an oversight; confirm no other translation unit relies
 * on external linkage before tightening. */
pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;
93
/*
 * Deliberately empty handler for the wakeup signal: its sole purpose
 * is to make a blocking syscall on the target thread return -1/EINTR.
 */
static void sig_wakeup(int sig) {
    (void) sig;  /* unused */
}
99
100/*
101 * Initialization routine (executed when library is loaded)
102 * Allocate fd tables and sets up signal handler.
103 */
104static void __attribute((constructor)) init() {
105    struct rlimit nbr_files;
106    sigset_t sigset;
107    struct sigaction sa;
108    int i = 0;
109
110    /* Determine the maximum number of possible file descriptors. */
111    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
112        fprintf(stderr, "library initialization failed - "
113                "unable to get max # of allocated fds\n");
114        abort();
115    }
116    if (nbr_files.rlim_max != RLIM_INFINITY) {
117        fdLimit = nbr_files.rlim_max;
118    } else {
119        /* We just do not know. */
120        fdLimit = INT_MAX;
121    }
122
123    /* Allocate table for low value file descriptors. */
124    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
125    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
126    if (fdTable == NULL) {
127        fprintf(stderr, "library initialization failed - "
128                "unable to allocate file descriptor table - out of memory");
129        abort();
130    } else {
131        for (i = 0; i < fdTableLen; i ++) {
132            pthread_mutex_init(&fdTable[i].lock, NULL);
133        }
134    }
135
136    /* Allocate overflow table, if needed */
137    if (fdLimit > fdTableMaxSize) {
138        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
139        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
140        if (fdOverflowTable == NULL) {
141            fprintf(stderr, "library initialization failed - "
142                    "unable to allocate file descriptor overflow table - out of memory");
143            abort();
144        }
145    }
146
147    /*
148     * Setup the signal handler
149     */
150    sa.sa_handler = sig_wakeup;
151    sa.sa_flags   = 0;
152    sigemptyset(&sa.sa_mask);
153    sigaction(sigWakeup, &sa, NULL);
154
155    sigemptyset(&sigset);
156    sigaddset(&sigset, sigWakeup);
157    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
158}
159
/*
 * Return the fd table entry for this fd, or NULL if fd is negative.
 * Entries for fd < fdTableMaxSize come from the preallocated base
 * table; larger fds are mapped into the overflow table, whose 64k-entry
 * slabs are allocated lazily under fdOverflowTableLock.
 */
static inline fdEntry_t *getFdEntry(int fd)
{
    fdEntry_t* result = NULL;

    if (fd < 0) {
        return NULL;
    }

    /* This should not happen. If it does, our assumption about
     * max. fd value was wrong. */
    assert(fd < fdLimit);

    if (fd < fdTableMaxSize) {
        /* fd is in base table. */
        assert(fd < fdTableLen);
        result = &fdTable[fd];
    } else {
        /* fd is in overflow table. */
        const int indexInOverflowTable = fd - fdTableMaxSize;
        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
        fdEntry_t* slab = NULL;
        assert(rootindex < fdOverflowTableLen);
        assert(slabindex < fdOverflowTableSlabSize);
        pthread_mutex_lock(&fdOverflowTableLock);
        /* Allocate new slab in overflow table if needed; the check and
         * the store both happen under the lock so a slab is created at
         * most once. */
        if (fdOverflowTable[rootindex] == NULL) {
            fdEntry_t* const newSlab =
                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
            if (newSlab == NULL) {
                fprintf(stderr, "Unable to allocate file descriptor overflow"
                        " table slab - out of memory");
                pthread_mutex_unlock(&fdOverflowTableLock);
                abort();
            } else {
                int i;
                for (i = 0; i < fdOverflowTableSlabSize; i ++) {
                    pthread_mutex_init(&newSlab[i].lock, NULL);
                }
                fdOverflowTable[rootindex] = newSlab;
            }
        }
        pthread_mutex_unlock(&fdOverflowTableLock);
        /* NOTE(review): this re-read happens after the unlock; it is
         * benign in practice because slabs are installed exactly once
         * and never freed, but strictly it is an unlocked read of a
         * lock-protected slot. */
        slab = fdOverflowTable[rootindex];
        result = &slab[slabindex];
    }

    return result;

}
213
214/*
215 * Start a blocking operation :-
216 *    Insert thread onto thread list for the fd.
217 */
218static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
219{
220    self->thr = pthread_self();
221    self->intr = 0;
222
223    pthread_mutex_lock(&(fdEntry->lock));
224    {
225        self->next = fdEntry->threads;
226        fdEntry->threads = self;
227    }
228    pthread_mutex_unlock(&(fdEntry->lock));
229}
230
231/*
232 * End a blocking operation :-
233 *     Remove thread from thread list for the fd
234 *     If fd has been interrupted then set errno to EBADF
235 */
236static inline void endOp
237    (fdEntry_t *fdEntry, threadEntry_t *self)
238{
239    int orig_errno = errno;
240    pthread_mutex_lock(&(fdEntry->lock));
241    {
242        threadEntry_t *curr, *prev=NULL;
243        curr = fdEntry->threads;
244        while (curr != NULL) {
245            if (curr == self) {
246                if (curr->intr) {
247                    orig_errno = EBADF;
248                }
249                if (prev == NULL) {
250                    fdEntry->threads = curr->next;
251                } else {
252                    prev->next = curr->next;
253                }
254                break;
255            }
256            prev = curr;
257            curr = curr->next;
258        }
259    }
260    pthread_mutex_unlock(&(fdEntry->lock));
261    errno = orig_errno;
262}
263
/*
 * Close or dup2 a file descriptor ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if operation fails.
 *
 * The fd's lock is held across the close/dup2 AND the signalling of
 * blocked threads: holding it prevents new threads from starting an
 * operation on the fd (startOp blocks on the same lock) while the
 * current blockers are being interrupted.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold-off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor. Each blocker's intr flag is set first so
         * endOp() reports EBADF when the interrupted syscall returns.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill( curr->thr, sigWakeup );
            curr = curr->next;
        }
    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}
320
/*
 * Wrapper for dup2 - same semantics as the dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd >= 0) {
        return closefd(fd, fd2);
    }
    /* A negative source fd is rejected up front. */
    errno = EBADF;
    return -1;
}
333
/*
 * Wrapper for close - same semantics as the close system call
 * except that any threads blocked in an I/O on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    /* Passing fd1 < 0 selects the close() path in closefd(). */
    int result = closefd(-1, fd);
    return result;
}
342
343/************** Basic I/O operations here ***************/
344
/*
 * Macro to perform a blocking IO operation. Restarts
 * automatically if interrupted by signal (other than
 * our wakeup signal, which makes endOp() set errno to
 * EBADF so the loop exits).
 *
 * Note: the macro returns from the enclosing function, so it must be
 * the function's entire body. Wrapped in do { } while (0) so the
 * expansion plus the caller's trailing ';' forms exactly one statement
 * (avoids the dangling-else hazard of a bare brace block).
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) do {   \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
} while (0)
365
/* Interruptible recv(). */
int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

/* recv() with MSG_DONTWAIT: returns immediately with -1/EAGAIN rather
 * than blocking; still registered with startOp/endOp so a concurrent
 * close is reported as EBADF. */
int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, MSG_DONTWAIT) );
}

/* Interruptible readv(). */
int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}

/* Interruptible recvfrom(). */
int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}

/* Interruptible send(). */
int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}

/* Interruptible writev(). */
int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}

/* Interruptible sendto(). */
int NET_SendTo(int s, const void *msg, int len,  unsigned  int
       flags, const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}

/* Interruptible accept(). */
int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}

/* Interruptible connect(). */
int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    BLOCKING_IO_RETURN_INT( s, connect(s, addr, addrlen) );
}

/* Interruptible poll(); interruption is keyed to the first pollfd's fd. */
int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}
407
/*
 * Wrapper for poll(s, timeout).
 * Auto restarts with adjusted timeout if interrupted by
 * signal other than our wakeup signal (wakeup makes endOp set
 * errno=EBADF, which exits via the else branch with rv=-1).
 *
 *   s           - fd to poll for POLLIN|POLLERR
 *   timeout     - remaining timeout in milliseconds (<= 0 blocks
 *                 indefinitely / polls, per poll(2) semantics)
 *   currentTime - caller's notion of "now" in milliseconds, on the
 *                 same epoch as gettimeofday (used as the baseline for
 *                 timeout adjustment)
 *
 * Returns poll()'s result, 0 if the timeout expires across restarts,
 * or -1/EBADF if the fd was closed or invalid.
 *
 * NOTE(review): elapsed time is measured with gettimeofday (wall
 * clock), so a clock step would distort the remaining timeout; also
 * tv_sec * 1000 is computed in 'long', which could overflow on ILP32 -
 * presumably acceptable here, but confirm against callers.
 */
int NET_Timeout0(int s, long timeout, long currentTime) {
    long prevtime = currentTime, newtime;
    struct timeval t;
    fdEntry_t *fdEntry = getFdEntry(s);

    /*
     * Check that fd hasn't been closed.
     */
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    for(;;) {
        struct pollfd pfd;
        int rv;
        threadEntry_t self;

        /*
         * Poll the fd. If interrupted by our wakeup signal
         * errno will be set to EBADF.
         */
        pfd.fd = s;
        pfd.events = POLLIN | POLLERR;

        startOp(fdEntry, &self);
        rv = poll(&pfd, 1, timeout);
        endOp(fdEntry, &self);

        /*
         * If interrupted then adjust timeout. If timeout
         * has expired return 0 (indicating timeout expired).
         */
        if (rv < 0 && errno == EINTR) {
            if (timeout > 0) {
                gettimeofday(&t, NULL);
                newtime = t.tv_sec * 1000  +  t.tv_usec / 1000;
                timeout -= newtime - prevtime;
                if (timeout <= 0) {
                    return 0;
                }
                prevtime = newtime;
            }
        } else {
            return rv;
        }

    }
}
462