1321936Shselasky/*
2321936Shselasky * Copyright (c) 2011-2012 Intel Corporation.  All rights reserved.
3321936Shselasky *
4321936Shselasky * This software is available to you under a choice of one of two
5321936Shselasky * licenses.  You may choose to be licensed under the terms of the GNU
6321936Shselasky * General Public License (GPL) Version 2, available from the file
7321936Shselasky * COPYING in the main directory of this source tree, or the
8321936Shselasky * OpenIB.org BSD license below:
9321936Shselasky *
10321936Shselasky *     Redistribution and use in source and binary forms, with or
11321936Shselasky *     without modification, are permitted provided that the following
12321936Shselasky *     conditions are met:
13321936Shselasky *
14321936Shselasky *      - Redistributions of source code must retain the above
15321936Shselasky *        copyright notice, this list of conditions and the following
16321936Shselasky *        disclaimer.
17321936Shselasky *
18321936Shselasky *      - Redistributions in binary form must reproduce the above
19321936Shselasky *        copyright notice, this list of conditions and the following
20321936Shselasky *        disclaimer in the documentation and/or other materials
21321936Shselasky *        provided with the distribution.
22321936Shselasky *
23321936Shselasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24321936Shselasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25321936Shselasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26321936Shselasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27321936Shselasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28321936Shselasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29321936Shselasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30321936Shselasky * SOFTWARE.
31321936Shselasky *
32321936Shselasky */
33321936Shselasky#define _GNU_SOURCE
34321936Shselasky#include <config.h>
35321936Shselasky
36321936Shselasky#include <sys/types.h>
37321936Shselasky#include <sys/socket.h>
38321936Shselasky#include <sys/uio.h>
39321936Shselasky#include <sys/stat.h>
40321936Shselasky#include <sys/mman.h>
41321936Shselasky#include <stdarg.h>
42321936Shselasky#include <dlfcn.h>
43321936Shselasky#include <netdb.h>
44321936Shselasky#include <unistd.h>
45321936Shselasky#include <fcntl.h>
46321936Shselasky#include <string.h>
47321936Shselasky#include <netinet/tcp.h>
48321936Shselasky#include <unistd.h>
49321936Shselasky#include <semaphore.h>
50321936Shselasky#include <ctype.h>
51321936Shselasky#include <stdlib.h>
52321936Shselasky#include <stdio.h>
53321936Shselasky
54321936Shselasky#include <rdma/rdma_cma.h>
55321936Shselasky#include <rdma/rdma_verbs.h>
56321936Shselasky#include <rdma/rsocket.h>
57321936Shselasky#include "cma.h"
58321936Shselasky#include "indexer.h"
59321936Shselasky
/*
 * Table of socket-related entry points.  This file keeps two instances:
 * one filled with the next-in-chain library symbols and one filled with
 * the rsocket equivalents, so a caller can select an implementation at
 * run time through a pointer to the table.
 */
struct socket_calls {
	/* connection set-up */
	int (*socket)(int domain, int type, int protocol);
	int (*bind)(int fd, const struct sockaddr *addr, socklen_t addrlen);
	int (*listen)(int fd, int backlog);
	int (*accept)(int fd, struct sockaddr *addr, socklen_t *addrlen);
	int (*connect)(int fd, const struct sockaddr *addr, socklen_t addrlen);

	/* receive side */
	ssize_t (*recv)(int fd, void *buf, size_t len, int flags);
	ssize_t (*recvfrom)(int fd, void *buf, size_t len, int flags,
			    struct sockaddr *src_addr, socklen_t *addrlen);
	ssize_t (*recvmsg)(int fd, struct msghdr *msg, int flags);
	ssize_t (*read)(int fd, void *buf, size_t count);
	ssize_t (*readv)(int fd, const struct iovec *iov, int iovcnt);

	/* send side */
	ssize_t (*send)(int fd, const void *buf, size_t len, int flags);
	ssize_t (*sendto)(int fd, const void *buf, size_t len, int flags,
			  const struct sockaddr *dest_addr, socklen_t addrlen);
	ssize_t (*sendmsg)(int fd, const struct msghdr *msg, int flags);
	ssize_t (*write)(int fd, const void *buf, size_t count);
	ssize_t (*writev)(int fd, const struct iovec *iov, int iovcnt);

	/* readiness and tear-down */
	int (*poll)(struct pollfd *fds, nfds_t nfds, int timeout);
	int (*shutdown)(int fd, int how);
	int (*close)(int fd);

	/* addresses and options */
	int (*getpeername)(int fd, struct sockaddr *addr, socklen_t *addrlen);
	int (*getsockname)(int fd, struct sockaddr *addr, socklen_t *addrlen);
	int (*setsockopt)(int fd, int level, int optname,
			  const void *optval, socklen_t optlen);
	int (*getsockopt)(int fd, int level, int optname,
			  void *optval, socklen_t *optlen);

	/* generic descriptor operations */
	int (*fcntl)(int fd, int cmd, ... /* arg */);
	int (*dup2)(int oldfd, int newfd);
	ssize_t (*sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
	int (*fxstat)(int ver, int fd, struct stat *buf);
};
92321936Shselasky
93321936Shselaskystatic struct socket_calls real;
94321936Shselaskystatic struct socket_calls rs;
95321936Shselasky
96321936Shselaskystatic struct index_map idm;
97321936Shselaskystatic pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
98321936Shselasky
99321936Shselaskystatic int sq_size;
100321936Shselaskystatic int rq_size;
101321936Shselaskystatic int sq_inline;
102321936Shselaskystatic int fork_support;
103321936Shselasky
/* Which implementation currently backs a tracked descriptor. */
enum fd_type {
	fd_normal = 0,	/* served by the 'real' call table */
	fd_rsocket	/* served by the rsocket r*() calls */
};
108321936Shselasky
/*
 * Progress of the deferred (fork-safe) conversion of a descriptor from a
 * normal socket to an rsocket.
 */
enum fd_fork_state {
	fd_ready = 0,		/* no conversion pending */
	fd_fork,		/* created by socket() with fork support enabled */
	fd_fork_listen,		/* fd_fork socket on which listen() succeeded */
	fd_fork_active,		/* fd_fork socket on which connect() was called */
	fd_fork_passive		/* returned by accept() on an fd_fork_listen socket */
};
116321936Shselasky
117321936Shselaskystruct fd_info {
118321936Shselasky	enum fd_type type;
119321936Shselasky	enum fd_fork_state state;
120321936Shselasky	int fd;
121321936Shselasky	int dupfd;
122321936Shselasky	_Atomic(int) refcnt;
123321936Shselasky};
124321936Shselasky
/*
 * One rule parsed from the preload configuration file.  A NULL name or a
 * zero domain/type/protocol acts as a wildcard when rules are matched.
 */
struct config_entry {
	char *name;	/* heap copy of the program name, or NULL for '*' */
	int domain;	/* AF_* value, or 0 for '*' */
	int type;	/* SOCK_* value, or 0 for '*' */
	int protocol;	/* IPPROTO_* value, or 0 for '*' */
};

/* Rule table built by scan_config() and the number of valid entries in it. */
static struct config_entry *config;
static int config_cnt;
134321936Shselasky
135321936Shselaskystatic void free_config(void)
136321936Shselasky{
137321936Shselasky	while (config_cnt)
138321936Shselasky		free(config[--config_cnt].name);
139321936Shselasky
140321936Shselasky	free(config);
141321936Shselasky}
142321936Shselasky
143321936Shselasky/*
144321936Shselasky * Config file format:
145321936Shselasky * # Starting '#' indicates comment
146321936Shselasky * # wild card values are supported using '*'
147321936Shselasky * # domain - *, INET, INET6, IB
148321936Shselasky * # type - *, STREAM, DGRAM
149321936Shselasky * # protocol - *, TCP, UDP
150321936Shselasky * program_name domain type protocol
151321936Shselasky */
152321936Shselaskystatic void scan_config(void)
153321936Shselasky{
154321936Shselasky	struct config_entry *new_config;
155321936Shselasky	FILE *fp;
156321936Shselasky	char line[120], prog[64], dom[16], type[16], proto[16];
157321936Shselasky
158321936Shselasky	fp = fopen(RS_CONF_DIR "/preload_config", "r");
159321936Shselasky	if (!fp)
160321936Shselasky		return;
161321936Shselasky
162321936Shselasky	while (fgets(line, sizeof(line), fp)) {
163321936Shselasky		if (line[0] == '#')
164321936Shselasky			continue;
165321936Shselasky
166321936Shselasky		if (sscanf(line, "%64s%16s%16s%16s", prog, dom, type, proto) != 4)
167321936Shselasky			continue;
168321936Shselasky
169321936Shselasky		new_config = realloc(config, (config_cnt + 1) *
170321936Shselasky					     sizeof(struct config_entry));
171321936Shselasky		if (!new_config)
172321936Shselasky			break;
173321936Shselasky
174321936Shselasky		config = new_config;
175321936Shselasky		memset(&config[config_cnt], 0, sizeof(struct config_entry));
176321936Shselasky
177321936Shselasky		if (!strcasecmp(dom, "INET") ||
178321936Shselasky		    !strcasecmp(dom, "AF_INET") ||
179321936Shselasky		    !strcasecmp(dom, "PF_INET")) {
180321936Shselasky			config[config_cnt].domain = AF_INET;
181321936Shselasky		} else if (!strcasecmp(dom, "INET6") ||
182321936Shselasky			   !strcasecmp(dom, "AF_INET6") ||
183321936Shselasky			   !strcasecmp(dom, "PF_INET6")) {
184321936Shselasky			config[config_cnt].domain = AF_INET6;
185321936Shselasky		} else if (!strcasecmp(dom, "IB") ||
186321936Shselasky			   !strcasecmp(dom, "AF_IB") ||
187321936Shselasky			   !strcasecmp(dom, "PF_IB")) {
188321936Shselasky			config[config_cnt].domain = AF_IB;
189321936Shselasky		} else if (strcmp(dom, "*")) {
190321936Shselasky			continue;
191321936Shselasky		}
192321936Shselasky
193321936Shselasky		if (!strcasecmp(type, "STREAM") ||
194321936Shselasky		    !strcasecmp(type, "SOCK_STREAM")) {
195321936Shselasky			config[config_cnt].type = SOCK_STREAM;
196321936Shselasky		} else if (!strcasecmp(type, "DGRAM") ||
197321936Shselasky			   !strcasecmp(type, "SOCK_DGRAM")) {
198321936Shselasky			config[config_cnt].type = SOCK_DGRAM;
199321936Shselasky		} else if (strcmp(type, "*")) {
200321936Shselasky			continue;
201321936Shselasky		}
202321936Shselasky
203321936Shselasky		if (!strcasecmp(proto, "TCP") ||
204321936Shselasky		    !strcasecmp(proto, "IPPROTO_TCP")) {
205321936Shselasky			config[config_cnt].protocol = IPPROTO_TCP;
206321936Shselasky		} else if (!strcasecmp(proto, "UDP") ||
207321936Shselasky			   !strcasecmp(proto, "IPPROTO_UDP")) {
208321936Shselasky			config[config_cnt].protocol = IPPROTO_UDP;
209321936Shselasky		} else if (strcmp(proto, "*")) {
210321936Shselasky			continue;
211321936Shselasky		}
212321936Shselasky
213321936Shselasky		if (strcmp(prog, "*")) {
214321936Shselasky		    if (!(config[config_cnt].name = strdup(prog)))
215321936Shselasky			    continue;
216321936Shselasky		}
217321936Shselasky
218321936Shselasky		config_cnt++;
219321936Shselasky	}
220321936Shselasky
221321936Shselasky	fclose(fp);
222321936Shselasky	if (config_cnt)
223321936Shselasky		atexit(free_config);
224321936Shselasky}
225321936Shselasky
226321936Shselaskystatic int intercept_socket(int domain, int type, int protocol)
227321936Shselasky{
228321936Shselasky	int i;
229321936Shselasky
230321936Shselasky	if (!config_cnt)
231321936Shselasky		return 1;
232321936Shselasky
233321936Shselasky	if (!protocol) {
234321936Shselasky		if (type == SOCK_STREAM)
235321936Shselasky			protocol = IPPROTO_TCP;
236321936Shselasky		else if (type == SOCK_DGRAM)
237321936Shselasky			protocol = IPPROTO_UDP;
238321936Shselasky	}
239321936Shselasky
240321936Shselasky	for (i = 0; i < config_cnt; i++) {
241321936Shselasky		if ((!config[i].name ||
242321936Shselasky		     !strncasecmp(config[i].name, program_invocation_short_name,
243321936Shselasky				  strlen(config[i].name))) &&
244321936Shselasky		    (!config[i].domain || config[i].domain == domain) &&
245321936Shselasky		    (!config[i].type || config[i].type == type) &&
246321936Shselasky		    (!config[i].protocol || config[i].protocol == protocol))
247321936Shselasky			return 1;
248321936Shselasky	}
249321936Shselasky
250321936Shselasky	return 0;
251321936Shselasky}
252321936Shselasky
253321936Shselaskystatic int fd_open(void)
254321936Shselasky{
255321936Shselasky	struct fd_info *fdi;
256321936Shselasky	int ret, index;
257321936Shselasky
258321936Shselasky	fdi = calloc(1, sizeof(*fdi));
259321936Shselasky	if (!fdi)
260321936Shselasky		return ERR(ENOMEM);
261321936Shselasky
262321936Shselasky	index = open("/dev/null", O_RDONLY);
263321936Shselasky	if (index < 0) {
264321936Shselasky		ret = index;
265321936Shselasky		goto err1;
266321936Shselasky	}
267321936Shselasky
268321936Shselasky	fdi->dupfd = -1;
269321936Shselasky	atomic_store(&fdi->refcnt, 1);
270321936Shselasky	pthread_mutex_lock(&mut);
271321936Shselasky	ret = idm_set(&idm, index, fdi);
272321936Shselasky	pthread_mutex_unlock(&mut);
273321936Shselasky	if (ret < 0)
274321936Shselasky		goto err2;
275321936Shselasky
276321936Shselasky	return index;
277321936Shselasky
278321936Shselaskyerr2:
279321936Shselasky	real.close(index);
280321936Shselaskyerr1:
281321936Shselasky	free(fdi);
282321936Shselasky	return ret;
283321936Shselasky}
284321936Shselasky
285321936Shselaskystatic void fd_store(int index, int fd, enum fd_type type, enum fd_fork_state state)
286321936Shselasky{
287321936Shselasky	struct fd_info *fdi;
288321936Shselasky
289321936Shselasky	fdi = idm_at(&idm, index);
290321936Shselasky	fdi->fd = fd;
291321936Shselasky	fdi->type = type;
292321936Shselasky	fdi->state = state;
293321936Shselasky}
294321936Shselasky
295321936Shselaskystatic inline enum fd_type fd_get(int index, int *fd)
296321936Shselasky{
297321936Shselasky	struct fd_info *fdi;
298321936Shselasky
299321936Shselasky	fdi = idm_lookup(&idm, index);
300321936Shselasky	if (fdi) {
301321936Shselasky		*fd = fdi->fd;
302321936Shselasky		return fdi->type;
303321936Shselasky
304321936Shselasky	} else {
305321936Shselasky		*fd = index;
306321936Shselasky		return fd_normal;
307321936Shselasky	}
308321936Shselasky}
309321936Shselasky
310321936Shselaskystatic inline int fd_getd(int index)
311321936Shselasky{
312321936Shselasky	struct fd_info *fdi;
313321936Shselasky
314321936Shselasky	fdi = idm_lookup(&idm, index);
315321936Shselasky	return fdi ? fdi->fd : index;
316321936Shselasky}
317321936Shselasky
318321936Shselaskystatic inline enum fd_fork_state fd_gets(int index)
319321936Shselasky{
320321936Shselasky	struct fd_info *fdi;
321321936Shselasky
322321936Shselasky	fdi = idm_lookup(&idm, index);
323321936Shselasky	return fdi ? fdi->state : fd_ready;
324321936Shselasky}
325321936Shselasky
326321936Shselaskystatic inline enum fd_type fd_gett(int index)
327321936Shselasky{
328321936Shselasky	struct fd_info *fdi;
329321936Shselasky
330321936Shselasky	fdi = idm_lookup(&idm, index);
331321936Shselasky	return fdi ? fdi->type : fd_normal;
332321936Shselasky}
333321936Shselasky
334321936Shselaskystatic enum fd_type fd_close(int index, int *fd)
335321936Shselasky{
336321936Shselasky	struct fd_info *fdi;
337321936Shselasky	enum fd_type type;
338321936Shselasky
339321936Shselasky	fdi = idm_lookup(&idm, index);
340321936Shselasky	if (fdi) {
341321936Shselasky		idm_clear(&idm, index);
342321936Shselasky		*fd = fdi->fd;
343321936Shselasky		type = fdi->type;
344321936Shselasky		real.close(index);
345321936Shselasky		free(fdi);
346321936Shselasky	} else {
347321936Shselasky		*fd = index;
348321936Shselasky		type = fd_normal;
349321936Shselasky	}
350321936Shselasky	return type;
351321936Shselasky}
352321936Shselasky
353321936Shselaskystatic void getenv_options(void)
354321936Shselasky{
355321936Shselasky	char *var;
356321936Shselasky
357321936Shselasky	var = getenv("RS_SQ_SIZE");
358321936Shselasky	if (var)
359321936Shselasky		sq_size = atoi(var);
360321936Shselasky
361321936Shselasky	var = getenv("RS_RQ_SIZE");
362321936Shselasky	if (var)
363321936Shselasky		rq_size = atoi(var);
364321936Shselasky
365321936Shselasky	var = getenv("RS_INLINE");
366321936Shselasky	if (var)
367321936Shselasky		sq_inline = atoi(var);
368321936Shselasky
369321936Shselasky	var = getenv("RDMAV_FORK_SAFE");
370321936Shselasky	if (var)
371321936Shselasky		fork_support = atoi(var);
372321936Shselasky}
373321936Shselasky
374321936Shselaskystatic void init_preload(void)
375321936Shselasky{
376321936Shselasky	static int init;
377321936Shselasky
378321936Shselasky	/* Quick check without lock */
379321936Shselasky	if (init)
380321936Shselasky		return;
381321936Shselasky
382321936Shselasky	pthread_mutex_lock(&mut);
383321936Shselasky	if (init)
384321936Shselasky		goto out;
385321936Shselasky
386321936Shselasky	real.socket = dlsym(RTLD_NEXT, "socket");
387321936Shselasky	real.bind = dlsym(RTLD_NEXT, "bind");
388321936Shselasky	real.listen = dlsym(RTLD_NEXT, "listen");
389321936Shselasky	real.accept = dlsym(RTLD_NEXT, "accept");
390321936Shselasky	real.connect = dlsym(RTLD_NEXT, "connect");
391321936Shselasky	real.recv = dlsym(RTLD_NEXT, "recv");
392321936Shselasky	real.recvfrom = dlsym(RTLD_NEXT, "recvfrom");
393321936Shselasky	real.recvmsg = dlsym(RTLD_NEXT, "recvmsg");
394321936Shselasky	real.read = dlsym(RTLD_NEXT, "read");
395321936Shselasky	real.readv = dlsym(RTLD_NEXT, "readv");
396321936Shselasky	real.send = dlsym(RTLD_NEXT, "send");
397321936Shselasky	real.sendto = dlsym(RTLD_NEXT, "sendto");
398321936Shselasky	real.sendmsg = dlsym(RTLD_NEXT, "sendmsg");
399321936Shselasky	real.write = dlsym(RTLD_NEXT, "write");
400321936Shselasky	real.writev = dlsym(RTLD_NEXT, "writev");
401321936Shselasky	real.poll = dlsym(RTLD_NEXT, "poll");
402321936Shselasky	real.shutdown = dlsym(RTLD_NEXT, "shutdown");
403321936Shselasky	real.close = dlsym(RTLD_NEXT, "close");
404321936Shselasky	real.getpeername = dlsym(RTLD_NEXT, "getpeername");
405321936Shselasky	real.getsockname = dlsym(RTLD_NEXT, "getsockname");
406321936Shselasky	real.setsockopt = dlsym(RTLD_NEXT, "setsockopt");
407321936Shselasky	real.getsockopt = dlsym(RTLD_NEXT, "getsockopt");
408321936Shselasky	real.fcntl = dlsym(RTLD_NEXT, "fcntl");
409321936Shselasky	real.dup2 = dlsym(RTLD_NEXT, "dup2");
410321936Shselasky	real.sendfile = dlsym(RTLD_NEXT, "sendfile");
411321936Shselasky	real.fxstat = dlsym(RTLD_NEXT, "__fxstat");
412321936Shselasky
413321936Shselasky	rs.socket = dlsym(RTLD_DEFAULT, "rsocket");
414321936Shselasky	rs.bind = dlsym(RTLD_DEFAULT, "rbind");
415321936Shselasky	rs.listen = dlsym(RTLD_DEFAULT, "rlisten");
416321936Shselasky	rs.accept = dlsym(RTLD_DEFAULT, "raccept");
417321936Shselasky	rs.connect = dlsym(RTLD_DEFAULT, "rconnect");
418321936Shselasky	rs.recv = dlsym(RTLD_DEFAULT, "rrecv");
419321936Shselasky	rs.recvfrom = dlsym(RTLD_DEFAULT, "rrecvfrom");
420321936Shselasky	rs.recvmsg = dlsym(RTLD_DEFAULT, "rrecvmsg");
421321936Shselasky	rs.read = dlsym(RTLD_DEFAULT, "rread");
422321936Shselasky	rs.readv = dlsym(RTLD_DEFAULT, "rreadv");
423321936Shselasky	rs.send = dlsym(RTLD_DEFAULT, "rsend");
424321936Shselasky	rs.sendto = dlsym(RTLD_DEFAULT, "rsendto");
425321936Shselasky	rs.sendmsg = dlsym(RTLD_DEFAULT, "rsendmsg");
426321936Shselasky	rs.write = dlsym(RTLD_DEFAULT, "rwrite");
427321936Shselasky	rs.writev = dlsym(RTLD_DEFAULT, "rwritev");
428321936Shselasky	rs.poll = dlsym(RTLD_DEFAULT, "rpoll");
429321936Shselasky	rs.shutdown = dlsym(RTLD_DEFAULT, "rshutdown");
430321936Shselasky	rs.close = dlsym(RTLD_DEFAULT, "rclose");
431321936Shselasky	rs.getpeername = dlsym(RTLD_DEFAULT, "rgetpeername");
432321936Shselasky	rs.getsockname = dlsym(RTLD_DEFAULT, "rgetsockname");
433321936Shselasky	rs.setsockopt = dlsym(RTLD_DEFAULT, "rsetsockopt");
434321936Shselasky	rs.getsockopt = dlsym(RTLD_DEFAULT, "rgetsockopt");
435321936Shselasky	rs.fcntl = dlsym(RTLD_DEFAULT, "rfcntl");
436321936Shselasky
437321936Shselasky	getenv_options();
438321936Shselasky	scan_config();
439321936Shselasky	init = 1;
440321936Shselaskyout:
441321936Shselasky	pthread_mutex_unlock(&mut);
442321936Shselasky}
443321936Shselasky
444321936Shselasky/*
445321936Shselasky * We currently only handle copying a few common values.
446321936Shselasky */
447321936Shselaskystatic int copysockopts(int dfd, int sfd, struct socket_calls *dapi,
448321936Shselasky			struct socket_calls *sapi)
449321936Shselasky{
450321936Shselasky	socklen_t len;
451321936Shselasky	int param, ret;
452321936Shselasky
453321936Shselasky	ret = sapi->fcntl(sfd, F_GETFL);
454321936Shselasky	if (ret > 0)
455321936Shselasky		ret = dapi->fcntl(dfd, F_SETFL, ret);
456321936Shselasky	if (ret)
457321936Shselasky		return ret;
458321936Shselasky
459321936Shselasky	len = sizeof param;
460321936Shselasky	ret = sapi->getsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, &param, &len);
461321936Shselasky	if (param && !ret)
462321936Shselasky		ret = dapi->setsockopt(dfd, SOL_SOCKET, SO_REUSEADDR, &param, len);
463321936Shselasky	if (ret)
464321936Shselasky		return ret;
465321936Shselasky
466321936Shselasky	len = sizeof param;
467321936Shselasky	ret = sapi->getsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, &param, &len);
468321936Shselasky	if (param && !ret)
469321936Shselasky		ret = dapi->setsockopt(dfd, IPPROTO_TCP, TCP_NODELAY, &param, len);
470321936Shselasky	if (ret)
471321936Shselasky		return ret;
472321936Shselasky
473321936Shselasky	return 0;
474321936Shselasky}
475321936Shselasky
476321936Shselasky/*
477321936Shselasky * Convert between an rsocket and a normal socket.
478321936Shselasky */
479321936Shselaskystatic int transpose_socket(int socket, enum fd_type new_type)
480321936Shselasky{
481321936Shselasky	socklen_t len = 0;
482321936Shselasky	int sfd, dfd, param, ret;
483321936Shselasky	struct socket_calls *sapi, *dapi;
484321936Shselasky
485321936Shselasky	sfd = fd_getd(socket);
486321936Shselasky	if (new_type == fd_rsocket) {
487321936Shselasky		dapi = &rs;
488321936Shselasky		sapi = &real;
489321936Shselasky	} else {
490321936Shselasky		dapi = &real;
491321936Shselasky		sapi = &rs;
492321936Shselasky	}
493321936Shselasky
494321936Shselasky	ret = sapi->getsockname(sfd, NULL, &len);
495321936Shselasky	if (ret)
496321936Shselasky		return ret;
497321936Shselasky
498321936Shselasky	param = (len == sizeof(struct sockaddr_in6)) ? PF_INET6 : PF_INET;
499321936Shselasky	dfd = dapi->socket(param, SOCK_STREAM, 0);
500321936Shselasky	if (dfd < 0)
501321936Shselasky		return dfd;
502321936Shselasky
503321936Shselasky	ret = copysockopts(dfd, sfd, dapi, sapi);
504321936Shselasky	if (ret)
505321936Shselasky		goto err;
506321936Shselasky
507321936Shselasky	fd_store(socket, dfd, new_type, fd_ready);
508321936Shselasky	return dfd;
509321936Shselasky
510321936Shselaskyerr:
511321936Shselasky	dapi->close(dfd);
512321936Shselasky	return ret;
513321936Shselasky}
514321936Shselasky
515321936Shselasky/*
516321936Shselasky * Use defaults on failure.
517321936Shselasky */
518321936Shselaskystatic void set_rsocket_options(int rsocket)
519321936Shselasky{
520321936Shselasky	if (sq_size)
521321936Shselasky		rsetsockopt(rsocket, SOL_RDMA, RDMA_SQSIZE, &sq_size, sizeof sq_size);
522321936Shselasky
523321936Shselasky	if (rq_size)
524321936Shselasky		rsetsockopt(rsocket, SOL_RDMA, RDMA_RQSIZE, &rq_size, sizeof rq_size);
525321936Shselasky
526321936Shselasky	if (sq_inline)
527321936Shselasky		rsetsockopt(rsocket, SOL_RDMA, RDMA_INLINE, &sq_inline, sizeof sq_inline);
528321936Shselasky}
529321936Shselasky
530321936Shselaskyint socket(int domain, int type, int protocol)
531321936Shselasky{
532321936Shselasky	static __thread int recursive;
533321936Shselasky	int index, ret;
534321936Shselasky
535321936Shselasky	init_preload();
536321936Shselasky
537321936Shselasky	if (recursive || !intercept_socket(domain, type, protocol))
538321936Shselasky		goto real;
539321936Shselasky
540321936Shselasky	index = fd_open();
541321936Shselasky	if (index < 0)
542321936Shselasky		return index;
543321936Shselasky
544321936Shselasky	if (fork_support && (domain == PF_INET || domain == PF_INET6) &&
545321936Shselasky	    (type == SOCK_STREAM) && (!protocol || protocol == IPPROTO_TCP)) {
546321936Shselasky		ret = real.socket(domain, type, protocol);
547321936Shselasky		if (ret < 0)
548321936Shselasky			return ret;
549321936Shselasky		fd_store(index, ret, fd_normal, fd_fork);
550321936Shselasky		return index;
551321936Shselasky	}
552321936Shselasky
553321936Shselasky	recursive = 1;
554321936Shselasky	ret = rsocket(domain, type, protocol);
555321936Shselasky	recursive = 0;
556321936Shselasky	if (ret >= 0) {
557321936Shselasky		fd_store(index, ret, fd_rsocket, fd_ready);
558321936Shselasky		set_rsocket_options(ret);
559321936Shselasky		return index;
560321936Shselasky	}
561321936Shselasky	fd_close(index, &ret);
562321936Shselaskyreal:
563321936Shselasky	return real.socket(domain, type, protocol);
564321936Shselasky}
565321936Shselasky
566321936Shselaskyint bind(int socket, const struct sockaddr *addr, socklen_t addrlen)
567321936Shselasky{
568321936Shselasky	int fd;
569321936Shselasky	return (fd_get(socket, &fd) == fd_rsocket) ?
570321936Shselasky		rbind(fd, addr, addrlen) : real.bind(fd, addr, addrlen);
571321936Shselasky}
572321936Shselasky
573321936Shselaskyint listen(int socket, int backlog)
574321936Shselasky{
575321936Shselasky	int fd, ret;
576321936Shselasky	if (fd_get(socket, &fd) == fd_rsocket) {
577321936Shselasky		ret = rlisten(fd, backlog);
578321936Shselasky	} else {
579321936Shselasky		ret = real.listen(fd, backlog);
580321936Shselasky		if (!ret && fd_gets(socket) == fd_fork)
581321936Shselasky			fd_store(socket, fd, fd_normal, fd_fork_listen);
582321936Shselasky	}
583321936Shselasky	return ret;
584321936Shselasky}
585321936Shselasky
586321936Shselaskyint accept(int socket, struct sockaddr *addr, socklen_t *addrlen)
587321936Shselasky{
588321936Shselasky	int fd, index, ret;
589321936Shselasky
590321936Shselasky	if (fd_get(socket, &fd) == fd_rsocket) {
591321936Shselasky		index = fd_open();
592321936Shselasky		if (index < 0)
593321936Shselasky			return index;
594321936Shselasky
595321936Shselasky		ret = raccept(fd, addr, addrlen);
596321936Shselasky		if (ret < 0) {
597321936Shselasky			fd_close(index, &fd);
598321936Shselasky			return ret;
599321936Shselasky		}
600321936Shselasky
601321936Shselasky		fd_store(index, ret, fd_rsocket, fd_ready);
602321936Shselasky		return index;
603321936Shselasky	} else if (fd_gets(socket) == fd_fork_listen) {
604321936Shselasky		index = fd_open();
605321936Shselasky		if (index < 0)
606321936Shselasky			return index;
607321936Shselasky
608321936Shselasky		ret = real.accept(fd, addr, addrlen);
609321936Shselasky		if (ret < 0) {
610321936Shselasky			fd_close(index, &fd);
611321936Shselasky			return ret;
612321936Shselasky		}
613321936Shselasky
614321936Shselasky		fd_store(index, ret, fd_normal, fd_fork_passive);
615321936Shselasky		return index;
616321936Shselasky	} else {
617321936Shselasky		return real.accept(fd, addr, addrlen);
618321936Shselasky	}
619321936Shselasky}
620321936Shselasky
621321936Shselasky/*
622321936Shselasky * We can't fork RDMA connections and pass them from the parent to the child
623321936Shselasky * process.  Instead, we need to establish the RDMA connection after calling
624321936Shselasky * fork.  To do this, we delay establishing the RDMA connection until we try
625321936Shselasky * to send/receive on the server side.
626321936Shselasky */
627321936Shselaskystatic void fork_active(int socket)
628321936Shselasky{
629321936Shselasky	struct sockaddr_storage addr;
630321936Shselasky	int sfd, dfd, ret;
631321936Shselasky	socklen_t len;
632321936Shselasky	uint32_t msg;
633321936Shselasky	long flags;
634321936Shselasky
635321936Shselasky	sfd = fd_getd(socket);
636321936Shselasky
637321936Shselasky	flags = real.fcntl(sfd, F_GETFL);
638321936Shselasky	real.fcntl(sfd, F_SETFL, 0);
639321936Shselasky	ret = real.recv(sfd, &msg, sizeof msg, MSG_PEEK);
640321936Shselasky	real.fcntl(sfd, F_SETFL, flags);
641321936Shselasky	if ((ret != sizeof msg) || msg)
642321936Shselasky		goto err1;
643321936Shselasky
644321936Shselasky	len = sizeof addr;
645321936Shselasky	ret = real.getpeername(sfd, (struct sockaddr *) &addr, &len);
646321936Shselasky	if (ret)
647321936Shselasky		goto err1;
648321936Shselasky
649321936Shselasky	dfd = rsocket(addr.ss_family, SOCK_STREAM, 0);
650321936Shselasky	if (dfd < 0)
651321936Shselasky		goto err1;
652321936Shselasky
653321936Shselasky	ret = rconnect(dfd, (struct sockaddr *) &addr, len);
654321936Shselasky	if (ret)
655321936Shselasky		goto err2;
656321936Shselasky
657321936Shselasky	set_rsocket_options(dfd);
658321936Shselasky	copysockopts(dfd, sfd, &rs, &real);
659321936Shselasky	real.shutdown(sfd, SHUT_RDWR);
660321936Shselasky	real.close(sfd);
661321936Shselasky	fd_store(socket, dfd, fd_rsocket, fd_ready);
662321936Shselasky	return;
663321936Shselasky
664321936Shselaskyerr2:
665321936Shselasky	rclose(dfd);
666321936Shselaskyerr1:
667321936Shselasky	fd_store(socket, sfd, fd_normal, fd_ready);
668321936Shselasky}
669321936Shselasky
670321936Shselasky/*
671321936Shselasky * The server will start listening for the new connection, then send a
672321936Shselasky * message to the active side when the listen is ready.  This does leave
673321936Shselasky * fork unsupported in the following case: the server is nonblocking and
674321936Shselasky * calls select/poll waiting to receive data from the client.
675321936Shselasky */
676321936Shselaskystatic void fork_passive(int socket)
677321936Shselasky{
678321936Shselasky	struct sockaddr_in6 sin6;
679321936Shselasky	sem_t *sem;
680321936Shselasky	int lfd, sfd, dfd, ret, param;
681321936Shselasky	socklen_t len;
682321936Shselasky	uint32_t msg;
683321936Shselasky
684321936Shselasky	sfd = fd_getd(socket);
685321936Shselasky
686321936Shselasky	len = sizeof sin6;
687321936Shselasky	ret = real.getsockname(sfd, (struct sockaddr *) &sin6, &len);
688321936Shselasky	if (ret)
689321936Shselasky		goto out;
690321936Shselasky	sin6.sin6_flowinfo = 0;
691321936Shselasky	sin6.sin6_scope_id = 0;
692321936Shselasky	memset(&sin6.sin6_addr, 0, sizeof sin6.sin6_addr);
693321936Shselasky
694321936Shselasky	sem = sem_open("/rsocket_fork", O_CREAT | O_RDWR,
695321936Shselasky		       S_IRWXU | S_IRWXG, 1);
696321936Shselasky	if (sem == SEM_FAILED) {
697321936Shselasky		ret = -1;
698321936Shselasky		goto out;
699321936Shselasky	}
700321936Shselasky
701321936Shselasky	lfd = rsocket(sin6.sin6_family, SOCK_STREAM, 0);
702321936Shselasky	if (lfd < 0) {
703321936Shselasky		ret = lfd;
704321936Shselasky		goto sclose;
705321936Shselasky	}
706321936Shselasky
707321936Shselasky	param = 1;
708321936Shselasky	rsetsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &param, sizeof param);
709321936Shselasky
710321936Shselasky	sem_wait(sem);
711321936Shselasky	ret = rbind(lfd, (struct sockaddr *) &sin6, sizeof sin6);
712321936Shselasky	if (ret)
713321936Shselasky		goto lclose;
714321936Shselasky
715321936Shselasky	ret = rlisten(lfd, 1);
716321936Shselasky	if (ret)
717321936Shselasky		goto lclose;
718321936Shselasky
719321936Shselasky	msg = 0;
720321936Shselasky	len = real.write(sfd, &msg, sizeof msg);
721321936Shselasky	if (len != sizeof msg)
722321936Shselasky		goto lclose;
723321936Shselasky
724321936Shselasky	dfd = raccept(lfd, NULL, NULL);
725321936Shselasky	if (dfd < 0) {
726321936Shselasky		ret  = dfd;
727321936Shselasky		goto lclose;
728321936Shselasky	}
729321936Shselasky
730321936Shselasky	set_rsocket_options(dfd);
731321936Shselasky	copysockopts(dfd, sfd, &rs, &real);
732321936Shselasky	real.shutdown(sfd, SHUT_RDWR);
733321936Shselasky	real.close(sfd);
734321936Shselasky	fd_store(socket, dfd, fd_rsocket, fd_ready);
735321936Shselasky
736321936Shselaskylclose:
737321936Shselasky	rclose(lfd);
738321936Shselasky	sem_post(sem);
739321936Shselaskysclose:
740321936Shselasky	sem_close(sem);
741321936Shselaskyout:
742321936Shselasky	if (ret)
743321936Shselasky		fd_store(socket, sfd, fd_normal, fd_ready);
744321936Shselasky}
745321936Shselasky
746321936Shselaskystatic inline enum fd_type fd_fork_get(int index, int *fd)
747321936Shselasky{
748321936Shselasky	struct fd_info *fdi;
749321936Shselasky
750321936Shselasky	fdi = idm_lookup(&idm, index);
751321936Shselasky	if (fdi) {
752321936Shselasky		if (fdi->state == fd_fork_passive)
753321936Shselasky			fork_passive(index);
754321936Shselasky		else if (fdi->state == fd_fork_active)
755321936Shselasky			fork_active(index);
756321936Shselasky		*fd = fdi->fd;
757321936Shselasky		return fdi->type;
758321936Shselasky
759321936Shselasky	} else {
760321936Shselasky		*fd = index;
761321936Shselasky		return fd_normal;
762321936Shselasky	}
763321936Shselasky}
764321936Shselasky
765321936Shselaskyint connect(int socket, const struct sockaddr *addr, socklen_t addrlen)
766321936Shselasky{
767321936Shselasky	int fd, ret;
768321936Shselasky
769321936Shselasky	if (fd_get(socket, &fd) == fd_rsocket) {
770321936Shselasky		ret = rconnect(fd, addr, addrlen);
771321936Shselasky		if (!ret || errno == EINPROGRESS)
772321936Shselasky			return ret;
773321936Shselasky
774321936Shselasky		ret = transpose_socket(socket, fd_normal);
775321936Shselasky		if (ret < 0)
776321936Shselasky			return ret;
777321936Shselasky
778321936Shselasky		rclose(fd);
779321936Shselasky		fd = ret;
780321936Shselasky	} else if (fd_gets(socket) == fd_fork) {
781321936Shselasky		fd_store(socket, fd, fd_normal, fd_fork_active);
782321936Shselasky	}
783321936Shselasky
784321936Shselasky	return real.connect(fd, addr, addrlen);
785321936Shselasky}
786321936Shselasky
787321936Shselaskyssize_t recv(int socket, void *buf, size_t len, int flags)
788321936Shselasky{
789321936Shselasky	int fd;
790321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
791321936Shselasky		rrecv(fd, buf, len, flags) : real.recv(fd, buf, len, flags);
792321936Shselasky}
793321936Shselasky
794321936Shselaskyssize_t recvfrom(int socket, void *buf, size_t len, int flags,
795321936Shselasky		 struct sockaddr *src_addr, socklen_t *addrlen)
796321936Shselasky{
797321936Shselasky	int fd;
798321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
799321936Shselasky		rrecvfrom(fd, buf, len, flags, src_addr, addrlen) :
800321936Shselasky		real.recvfrom(fd, buf, len, flags, src_addr, addrlen);
801321936Shselasky}
802321936Shselasky
803321936Shselaskyssize_t recvmsg(int socket, struct msghdr *msg, int flags)
804321936Shselasky{
805321936Shselasky	int fd;
806321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
807321936Shselasky		rrecvmsg(fd, msg, flags) : real.recvmsg(fd, msg, flags);
808321936Shselasky}
809321936Shselasky
810321936Shselaskyssize_t read(int socket, void *buf, size_t count)
811321936Shselasky{
812321936Shselasky	int fd;
813321936Shselasky	init_preload();
814321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
815321936Shselasky		rread(fd, buf, count) : real.read(fd, buf, count);
816321936Shselasky}
817321936Shselasky
818321936Shselaskyssize_t readv(int socket, const struct iovec *iov, int iovcnt)
819321936Shselasky{
820321936Shselasky	int fd;
821321936Shselasky	init_preload();
822321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
823321936Shselasky		rreadv(fd, iov, iovcnt) : real.readv(fd, iov, iovcnt);
824321936Shselasky}
825321936Shselasky
826321936Shselaskyssize_t send(int socket, const void *buf, size_t len, int flags)
827321936Shselasky{
828321936Shselasky	int fd;
829321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
830321936Shselasky		rsend(fd, buf, len, flags) : real.send(fd, buf, len, flags);
831321936Shselasky}
832321936Shselasky
833321936Shselaskyssize_t sendto(int socket, const void *buf, size_t len, int flags,
834321936Shselasky		const struct sockaddr *dest_addr, socklen_t addrlen)
835321936Shselasky{
836321936Shselasky	int fd;
837321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
838321936Shselasky		rsendto(fd, buf, len, flags, dest_addr, addrlen) :
839321936Shselasky		real.sendto(fd, buf, len, flags, dest_addr, addrlen);
840321936Shselasky}
841321936Shselasky
842321936Shselaskyssize_t sendmsg(int socket, const struct msghdr *msg, int flags)
843321936Shselasky{
844321936Shselasky	int fd;
845321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
846321936Shselasky		rsendmsg(fd, msg, flags) : real.sendmsg(fd, msg, flags);
847321936Shselasky}
848321936Shselasky
849321936Shselaskyssize_t write(int socket, const void *buf, size_t count)
850321936Shselasky{
851321936Shselasky	int fd;
852321936Shselasky	init_preload();
853321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
854321936Shselasky		rwrite(fd, buf, count) : real.write(fd, buf, count);
855321936Shselasky}
856321936Shselasky
857321936Shselaskyssize_t writev(int socket, const struct iovec *iov, int iovcnt)
858321936Shselasky{
859321936Shselasky	int fd;
860321936Shselasky	init_preload();
861321936Shselasky	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
862321936Shselasky		rwritev(fd, iov, iovcnt) : real.writev(fd, iov, iovcnt);
863321936Shselasky}
864321936Shselasky
/*
 * Return a per-thread scratch array with room for at least nfds pollfd
 * entries.
 *
 * The array is cached in thread-local storage and only reallocated when
 * a larger size is requested; its previous contents are not preserved
 * when it grows.  The caller must not free the returned pointer.
 *
 * A request for zero entries is rounded up to one so that an empty set
 * still yields a valid buffer (callers treat NULL as out-of-memory).
 *
 * Returns NULL if the byte size would overflow or the allocation fails.
 */
static struct pollfd *fds_alloc(nfds_t nfds)
{
	static __thread struct pollfd *rfds;
	static __thread nfds_t rnfds;

	if (!nfds)
		nfds = 1;

	if (nfds <= rnfds)
		return rfds;

	/* Reject counts whose size in bytes cannot be represented. */
	if (nfds > (size_t) -1 / sizeof(*rfds))
		return NULL;

	free(rfds);
	rfds = malloc(sizeof(*rfds) * nfds);
	rnfds = rfds ? nfds : 0;

	return rfds;
}
880321936Shselasky
881321936Shselaskyint poll(struct pollfd *fds, nfds_t nfds, int timeout)
882321936Shselasky{
883321936Shselasky	struct pollfd *rfds;
884321936Shselasky	int i, ret;
885321936Shselasky
886321936Shselasky	init_preload();
887321936Shselasky	for (i = 0; i < nfds; i++) {
888321936Shselasky		if (fd_gett(fds[i].fd) == fd_rsocket)
889321936Shselasky			goto use_rpoll;
890321936Shselasky	}
891321936Shselasky
892321936Shselasky	return real.poll(fds, nfds, timeout);
893321936Shselasky
894321936Shselaskyuse_rpoll:
895321936Shselasky	rfds = fds_alloc(nfds);
896321936Shselasky	if (!rfds)
897321936Shselasky		return ERR(ENOMEM);
898321936Shselasky
899321936Shselasky	for (i = 0; i < nfds; i++) {
900321936Shselasky		rfds[i].fd = fd_getd(fds[i].fd);
901321936Shselasky		rfds[i].events = fds[i].events;
902321936Shselasky		rfds[i].revents = 0;
903321936Shselasky	}
904321936Shselasky
905321936Shselasky	ret = rpoll(rfds, nfds, timeout);
906321936Shselasky
907321936Shselasky	for (i = 0; i < nfds; i++)
908321936Shselasky		fds[i].revents = rfds[i].revents;
909321936Shselasky
910321936Shselasky	return ret;
911321936Shselasky}
912321936Shselasky
/*
 * Build a pollfd array from select()-style descriptor sets.
 *
 * Descriptors 0 .. *nfds-1 are scanned in ascending order.  Each one
 * that is a member of any of the three sets gets an entry whose fd is
 * fd_getd(fd) and whose events hold POLLIN and/or POLLOUT according to
 * its read/write set membership (membership in exceptfds alone produces
 * an entry with events == 0).  On return *nfds holds the number of
 * entries written to fds.
 */
static void select_to_rpoll(struct pollfd *fds, int *nfds,
			    fd_set *readfds, fd_set *writefds, fd_set *exceptfds)
{
	int fd, used = 0;

	for (fd = 0; fd < *nfds; fd++) {
		int wanted = 0;

		if (readfds && FD_ISSET(fd, readfds))
			wanted |= POLLIN;
		if (writefds && FD_ISSET(fd, writefds))
			wanted |= POLLOUT;

		if (!wanted && !(exceptfds && FD_ISSET(fd, exceptfds)))
			continue;

		fds[used].fd = fd_getd(fd);
		fds[used].events = wanted;
		used++;
	}

	*nfds = used;
}
931321936Shselasky
/*
 * Translate rpoll() results back into select()-style descriptor sets.
 *
 * Application descriptors are walked upwards from 0; each one whose
 * fd_getd() translation equals the fd of the next unconsumed pollfd
 * entry is matched with that entry.  POLLIN sets the descriptor in
 * readfds, POLLOUT in writefds, and any other revents bit in exceptfds
 * (each only if the corresponding set pointer is non-NULL).
 *
 * Returns the total number of bits set across the three sets.
 */
static int rpoll_to_select(struct pollfd *fds, int nfds,
			   fd_set *readfds, fd_set *writefds, fd_set *exceptfds)
{
	int fd, idx, hits = 0;

	for (fd = 0, idx = 0; idx < nfds; fd++) {
		if (fd_getd(fd) == fds[idx].fd) {
			if (readfds && (fds[idx].revents & POLLIN)) {
				FD_SET(fd, readfds);
				hits++;
			}

			if (writefds && (fds[idx].revents & POLLOUT)) {
				FD_SET(fd, writefds);
				hits++;
			}

			if (exceptfds &&
			    (fds[idx].revents & ~(POLLIN | POLLOUT))) {
				FD_SET(fd, exceptfds);
				hits++;
			}

			idx++;
		}
	}

	return hits;
}
961321936Shselasky
/*
 * Convert a select()-style timeout into a millisecond count.
 *
 * A NULL timeout yields -1.  Otherwise the result is
 * tv_sec * 1000 + tv_usec / 1000 (sub-millisecond remainders are
 * truncated), saturated at the largest positive int so that a very long
 * timeout cannot wrap into a negative value when narrowed to int.
 */
static int rs_convert_timeout(struct timeval *timeout)
{
	const int max_ms = (int) (~0U >> 1);	/* same value as INT_MAX */
	long long ms;

	if (!timeout)
		return -1;

	/* Beyond this point the product alone would exceed max_ms. */
	if (timeout->tv_sec > max_ms / 1000)
		return max_ms;

	ms = (long long) timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
	return (ms > max_ms) ? max_ms : (int) ms;
}
966321936Shselasky
/*
 * select() interposer, implemented on top of rpoll().
 *
 * The descriptor sets are converted to a pollfd array with
 * select_to_rpoll(), polled with the timeout converted by
 * rs_convert_timeout(), and then cleared.  When rpoll() reports ready
 * descriptors the sets are repopulated by rpoll_to_select() and the
 * number of bits it set is returned; otherwise rpoll()'s return value
 * (0 or negative) is returned with all sets left empty.
 *
 * A negative nfds is rejected with EINVAL; without this check it would
 * be converted to a huge unsigned count and reported as ENOMEM.
 */
int select(int nfds, fd_set *readfds, fd_set *writefds,
	   fd_set *exceptfds, struct timeval *timeout)
{
	struct pollfd *fds;
	int ret;

	if (nfds < 0)
		return ERR(EINVAL);

	fds = fds_alloc(nfds);
	if (!fds)
		return ERR(ENOMEM);

	/* nfds is rewritten to the number of pollfd entries produced. */
	select_to_rpoll(fds, &nfds, readfds, writefds, exceptfds);
	ret = rpoll(fds, nfds, rs_convert_timeout(timeout));

	if (readfds)
		FD_ZERO(readfds);
	if (writefds)
		FD_ZERO(writefds);
	if (exceptfds)
		FD_ZERO(exceptfds);

	if (ret > 0)
		ret = rpoll_to_select(fds, nfds, readfds, writefds, exceptfds);

	return ret;
}
992321936Shselasky
993321936Shselaskyint shutdown(int socket, int how)
994321936Shselasky{
995321936Shselasky	int fd;
996321936Shselasky	return (fd_get(socket, &fd) == fd_rsocket) ?
997321936Shselasky		rshutdown(fd, how) : real.shutdown(fd, how);
998321936Shselasky}
999321936Shselasky
1000321936Shselaskyint close(int socket)
1001321936Shselasky{
1002321936Shselasky	struct fd_info *fdi;
1003321936Shselasky	int ret;
1004321936Shselasky
1005321936Shselasky	init_preload();
1006321936Shselasky	fdi = idm_lookup(&idm, socket);
1007321936Shselasky	if (!fdi)
1008321936Shselasky		return real.close(socket);
1009321936Shselasky
1010321936Shselasky	if (fdi->dupfd != -1) {
1011321936Shselasky		ret = close(fdi->dupfd);
1012321936Shselasky		if (ret)
1013321936Shselasky			return ret;
1014321936Shselasky	}
1015321936Shselasky
1016321936Shselasky	if (atomic_fetch_sub(&fdi->refcnt, 1) != 1)
1017321936Shselasky		return 0;
1018321936Shselasky
1019321936Shselasky	idm_clear(&idm, socket);
1020321936Shselasky	real.close(socket);
1021321936Shselasky	ret = (fdi->type == fd_rsocket) ? rclose(fdi->fd) : real.close(fdi->fd);
1022321936Shselasky	free(fdi);
1023321936Shselasky	return ret;
1024321936Shselasky}
1025321936Shselasky
1026321936Shselaskyint getpeername(int socket, struct sockaddr *addr, socklen_t *addrlen)
1027321936Shselasky{
1028321936Shselasky	int fd;
1029321936Shselasky	return (fd_get(socket, &fd) == fd_rsocket) ?
1030321936Shselasky		rgetpeername(fd, addr, addrlen) :
1031321936Shselasky		real.getpeername(fd, addr, addrlen);
1032321936Shselasky}
1033321936Shselasky
1034321936Shselaskyint getsockname(int socket, struct sockaddr *addr, socklen_t *addrlen)
1035321936Shselasky{
1036321936Shselasky	int fd;
1037321936Shselasky	init_preload();
1038321936Shselasky	return (fd_get(socket, &fd) == fd_rsocket) ?
1039321936Shselasky		rgetsockname(fd, addr, addrlen) :
1040321936Shselasky		real.getsockname(fd, addr, addrlen);
1041321936Shselasky}
1042321936Shselasky
1043321936Shselaskyint setsockopt(int socket, int level, int optname,
1044321936Shselasky		const void *optval, socklen_t optlen)
1045321936Shselasky{
1046321936Shselasky	int fd;
1047321936Shselasky	return (fd_get(socket, &fd) == fd_rsocket) ?
1048321936Shselasky		rsetsockopt(fd, level, optname, optval, optlen) :
1049321936Shselasky		real.setsockopt(fd, level, optname, optval, optlen);
1050321936Shselasky}
1051321936Shselasky
1052321936Shselaskyint getsockopt(int socket, int level, int optname,
1053321936Shselasky		void *optval, socklen_t *optlen)
1054321936Shselasky{
1055321936Shselasky	int fd;
1056321936Shselasky	return (fd_get(socket, &fd) == fd_rsocket) ?
1057321936Shselasky		rgetsockopt(fd, level, optname, optval, optlen) :
1058321936Shselasky		real.getsockopt(fd, level, optname, optval, optlen);
1059321936Shselasky}
1060321936Shselasky
1061321936Shselaskyint fcntl(int socket, int cmd, ... /* arg */)
1062321936Shselasky{
1063321936Shselasky	va_list args;
1064321936Shselasky	long lparam;
1065321936Shselasky	void *pparam;
1066321936Shselasky	int fd, ret;
1067321936Shselasky
1068321936Shselasky	init_preload();
1069321936Shselasky	va_start(args, cmd);
1070321936Shselasky	switch (cmd) {
1071321936Shselasky	case F_GETFD:
1072321936Shselasky	case F_GETFL:
1073321936Shselasky	case F_GETOWN:
1074321936Shselasky	case F_GETSIG:
1075321936Shselasky	case F_GETLEASE:
1076321936Shselasky		ret = (fd_get(socket, &fd) == fd_rsocket) ?
1077321936Shselasky			rfcntl(fd, cmd) : real.fcntl(fd, cmd);
1078321936Shselasky		break;
1079321936Shselasky	case F_DUPFD:
1080321936Shselasky	/*case F_DUPFD_CLOEXEC:*/
1081321936Shselasky	case F_SETFD:
1082321936Shselasky	case F_SETFL:
1083321936Shselasky	case F_SETOWN:
1084321936Shselasky	case F_SETSIG:
1085321936Shselasky	case F_SETLEASE:
1086321936Shselasky	case F_NOTIFY:
1087321936Shselasky		lparam = va_arg(args, long);
1088321936Shselasky		ret = (fd_get(socket, &fd) == fd_rsocket) ?
1089321936Shselasky			rfcntl(fd, cmd, lparam) : real.fcntl(fd, cmd, lparam);
1090321936Shselasky		break;
1091321936Shselasky	default:
1092321936Shselasky		pparam = va_arg(args, void *);
1093321936Shselasky		ret = (fd_get(socket, &fd) == fd_rsocket) ?
1094321936Shselasky			rfcntl(fd, cmd, pparam) : real.fcntl(fd, cmd, pparam);
1095321936Shselasky		break;
1096321936Shselasky	}
1097321936Shselasky	va_end(args);
1098321936Shselasky	return ret;
1099321936Shselasky}
1100321936Shselasky
1101321936Shselasky/*
1102321936Shselasky * dup2 is not thread safe
1103321936Shselasky */
1104321936Shselaskyint dup2(int oldfd, int newfd)
1105321936Shselasky{
1106321936Shselasky	struct fd_info *oldfdi, *newfdi;
1107321936Shselasky	int ret;
1108321936Shselasky
1109321936Shselasky	init_preload();
1110321936Shselasky	oldfdi = idm_lookup(&idm, oldfd);
1111321936Shselasky	if (oldfdi) {
1112321936Shselasky		if (oldfdi->state == fd_fork_passive)
1113321936Shselasky			fork_passive(oldfd);
1114321936Shselasky		else if (oldfdi->state == fd_fork_active)
1115321936Shselasky			fork_active(oldfd);
1116321936Shselasky	}
1117321936Shselasky
1118321936Shselasky	newfdi = idm_lookup(&idm, newfd);
1119321936Shselasky	if (newfdi) {
1120321936Shselasky		 /* newfd cannot have been dup'ed directly */
1121321936Shselasky		if (atomic_load(&newfdi->refcnt) > 1)
1122321936Shselasky			return ERR(EBUSY);
1123321936Shselasky		close(newfd);
1124321936Shselasky	}
1125321936Shselasky
1126321936Shselasky	ret = real.dup2(oldfd, newfd);
1127321936Shselasky	if (!oldfdi || ret != newfd)
1128321936Shselasky		return ret;
1129321936Shselasky
1130321936Shselasky	newfdi = calloc(1, sizeof(*newfdi));
1131321936Shselasky	if (!newfdi) {
1132321936Shselasky		close(newfd);
1133321936Shselasky		return ERR(ENOMEM);
1134321936Shselasky	}
1135321936Shselasky
1136321936Shselasky	pthread_mutex_lock(&mut);
1137321936Shselasky	idm_set(&idm, newfd, newfdi);
1138321936Shselasky	pthread_mutex_unlock(&mut);
1139321936Shselasky
1140321936Shselasky	newfdi->fd = oldfdi->fd;
1141321936Shselasky	newfdi->type = oldfdi->type;
1142321936Shselasky	if (oldfdi->dupfd != -1) {
1143321936Shselasky		newfdi->dupfd = oldfdi->dupfd;
1144321936Shselasky		oldfdi = idm_lookup(&idm, oldfdi->dupfd);
1145321936Shselasky	} else {
1146321936Shselasky		newfdi->dupfd = oldfd;
1147321936Shselasky	}
1148321936Shselasky	atomic_store(&newfdi->refcnt, 1);
1149321936Shselasky	atomic_fetch_add(&oldfdi->refcnt, 1);
1150321936Shselasky	return newfd;
1151321936Shselasky}
1152321936Shselasky
1153321936Shselaskyssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1154321936Shselasky{
1155321936Shselasky	void *file_addr;
1156321936Shselasky	int fd;
1157321936Shselasky	size_t ret;
1158321936Shselasky
1159321936Shselasky	if (fd_get(out_fd, &fd) != fd_rsocket)
1160321936Shselasky		return real.sendfile(fd, in_fd, offset, count);
1161321936Shselasky
1162321936Shselasky	file_addr = mmap(NULL, count, PROT_READ, 0, in_fd, offset ? *offset : 0);
1163321936Shselasky	if (file_addr == (void *) -1)
1164321936Shselasky		return -1;
1165321936Shselasky
1166321936Shselasky	ret = rwrite(fd, file_addr, count);
1167321936Shselasky	if ((ret > 0) && offset)
1168321936Shselasky		lseek(in_fd, ret, SEEK_CUR);
1169321936Shselasky	munmap(file_addr, count);
1170321936Shselasky	return ret;
1171321936Shselasky}
1172321936Shselasky
1173321936Shselaskyint __fxstat(int ver, int socket, struct stat *buf)
1174321936Shselasky{
1175321936Shselasky	int fd, ret;
1176321936Shselasky
1177321936Shselasky	init_preload();
1178321936Shselasky	if (fd_get(socket, &fd) == fd_rsocket) {
1179321936Shselasky		ret = real.fxstat(ver, socket, buf);
1180321936Shselasky		if (!ret)
1181321936Shselasky			buf->st_mode = (buf->st_mode & ~S_IFMT) | __S_IFSOCK;
1182321936Shselasky	} else {
1183321936Shselasky		ret = real.fxstat(ver, fd, buf);
1184321936Shselasky	}
1185321936Shselasky	return ret;
1186321936Shselasky}
1187