1321936Shselasky/* 2321936Shselasky * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. 3321936Shselasky * 4321936Shselasky * This software is available to you under a choice of one of two 5321936Shselasky * licenses. You may choose to be licensed under the terms of the GNU 6321936Shselasky * General Public License (GPL) Version 2, available from the file 7321936Shselasky * COPYING in the main directory of this source tree, or the 8321936Shselasky * OpenIB.org BSD license below: 9321936Shselasky * 10321936Shselasky * Redistribution and use in source and binary forms, with or 11321936Shselasky * without modification, are permitted provided that the following 12321936Shselasky * conditions are met: 13321936Shselasky * 14321936Shselasky * - Redistributions of source code must retain the above 15321936Shselasky * copyright notice, this list of conditions and the following 16321936Shselasky * disclaimer. 17321936Shselasky * 18321936Shselasky * - Redistributions in binary form must reproduce the above 19321936Shselasky * copyright notice, this list of conditions and the following 20321936Shselasky * disclaimer in the documentation and/or other materials 21321936Shselasky * provided with the distribution. 22321936Shselasky * 23321936Shselasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24321936Shselasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25321936Shselasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26321936Shselasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27321936Shselasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28321936Shselasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29321936Shselasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30321936Shselasky * SOFTWARE. 31321936Shselasky * 32321936Shselasky */ 33321936Shselasky#define _GNU_SOURCE 34321936Shselasky#include <config.h> 35321936Shselasky 36321936Shselasky#include <sys/types.h> 37321936Shselasky#include <sys/socket.h> 38321936Shselasky#include <sys/uio.h> 39321936Shselasky#include <sys/stat.h> 40321936Shselasky#include <sys/mman.h> 41321936Shselasky#include <stdarg.h> 42321936Shselasky#include <dlfcn.h> 43321936Shselasky#include <netdb.h> 44321936Shselasky#include <unistd.h> 45321936Shselasky#include <fcntl.h> 46321936Shselasky#include <string.h> 47321936Shselasky#include <netinet/tcp.h> 48321936Shselasky#include <unistd.h> 49321936Shselasky#include <semaphore.h> 50321936Shselasky#include <ctype.h> 51321936Shselasky#include <stdlib.h> 52321936Shselasky#include <stdio.h> 53321936Shselasky 54321936Shselasky#include <rdma/rdma_cma.h> 55321936Shselasky#include <rdma/rdma_verbs.h> 56321936Shselasky#include <rdma/rsocket.h> 57321936Shselasky#include "cma.h" 58321936Shselasky#include "indexer.h" 59321936Shselasky 60321936Shselaskystruct socket_calls { 61321936Shselasky int (*socket)(int domain, int type, int protocol); 62321936Shselasky int (*bind)(int socket, const struct sockaddr *addr, socklen_t addrlen); 63321936Shselasky int (*listen)(int socket, int backlog); 64321936Shselasky int (*accept)(int socket, struct sockaddr *addr, socklen_t *addrlen); 65321936Shselasky int (*connect)(int socket, const struct sockaddr *addr, socklen_t addrlen); 66321936Shselasky ssize_t (*recv)(int socket, void *buf, size_t len, int flags); 67321936Shselasky ssize_t (*recvfrom)(int socket, void *buf, size_t len, int flags, 68321936Shselasky struct sockaddr *src_addr, socklen_t *addrlen); 69321936Shselasky ssize_t (*recvmsg)(int socket, struct msghdr *msg, int flags); 70321936Shselasky ssize_t (*read)(int socket, void *buf, size_t count); 71321936Shselasky ssize_t (*readv)(int socket, const struct iovec *iov, int iovcnt); 72321936Shselasky ssize_t (*send)(int socket, const void *buf, size_t len, int flags); 73321936Shselasky ssize_t (*sendto)(int socket, const void *buf, size_t len, int flags, 74321936Shselasky const struct sockaddr *dest_addr, socklen_t addrlen); 75321936Shselasky ssize_t (*sendmsg)(int socket, const struct msghdr *msg, int flags); 76321936Shselasky ssize_t (*write)(int socket, const void *buf, size_t count); 77321936Shselasky ssize_t (*writev)(int socket, const struct iovec *iov, int iovcnt); 78321936Shselasky int (*poll)(struct pollfd *fds, nfds_t nfds, int timeout); 79321936Shselasky int (*shutdown)(int socket, int how); 80321936Shselasky int (*close)(int socket); 81321936Shselasky int (*getpeername)(int socket, struct sockaddr *addr, socklen_t *addrlen); 82321936Shselasky int (*getsockname)(int socket, struct sockaddr *addr, socklen_t *addrlen); 83321936Shselasky int (*setsockopt)(int socket, int level, int optname, 84321936Shselasky const void *optval, socklen_t optlen); 85321936Shselasky int (*getsockopt)(int socket, int level, int optname, 86321936Shselasky void *optval, socklen_t *optlen); 87321936Shselasky int (*fcntl)(int socket, int cmd, ... /* arg */); 88321936Shselasky int (*dup2)(int oldfd, int newfd); 89321936Shselasky ssize_t (*sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); 90321936Shselasky int (*fxstat)(int ver, int fd, struct stat *buf); 91321936Shselasky}; 92321936Shselasky 93321936Shselaskystatic struct socket_calls real; 94321936Shselaskystatic struct socket_calls rs; 95321936Shselasky 96321936Shselaskystatic struct index_map idm; 97321936Shselaskystatic pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; 98321936Shselasky 99321936Shselaskystatic int sq_size; 100321936Shselaskystatic int rq_size; 101321936Shselaskystatic int sq_inline; 102321936Shselaskystatic int fork_support; 103321936Shselasky 104321936Shselaskyenum fd_type { 105321936Shselasky fd_normal, 106321936Shselasky fd_rsocket 107321936Shselasky}; 108321936Shselasky 109321936Shselaskyenum fd_fork_state { 110321936Shselasky fd_ready, 111321936Shselasky fd_fork, 112321936Shselasky fd_fork_listen, 113321936Shselasky fd_fork_active, 114321936Shselasky fd_fork_passive 115321936Shselasky}; 116321936Shselasky 117321936Shselaskystruct fd_info { 118321936Shselasky enum fd_type type; 119321936Shselasky enum fd_fork_state state; 120321936Shselasky int fd; 121321936Shselasky int dupfd; 122321936Shselasky _Atomic(int) refcnt; 123321936Shselasky}; 124321936Shselasky 125321936Shselaskystruct config_entry { 126321936Shselasky char *name; 127321936Shselasky int domain; 128321936Shselasky int type; 129321936Shselasky int protocol; 130321936Shselasky}; 131321936Shselasky 132321936Shselaskystatic struct config_entry *config; 133321936Shselaskystatic int config_cnt; 134321936Shselasky 135321936Shselaskystatic void free_config(void) 136321936Shselasky{ 137321936Shselasky while (config_cnt) 138321936Shselasky free(config[--config_cnt].name); 139321936Shselasky 140321936Shselasky free(config); 141321936Shselasky} 142321936Shselasky 143321936Shselasky/* 144321936Shselasky * Config file format: 145321936Shselasky * # Starting '#' indicates comment 146321936Shselasky * # wild card values are supported using '*' 147321936Shselasky * # domain - *, INET, INET6, IB 148321936Shselasky * # type - *, STREAM, DGRAM 149321936Shselasky * # protocol - *, TCP, UDP 150321936Shselasky * program_name domain type protocol 151321936Shselasky */ 152321936Shselaskystatic void scan_config(void) 153321936Shselasky{ 154321936Shselasky struct config_entry *new_config; 155321936Shselasky FILE *fp; 156321936Shselasky char line[120], prog[64], dom[16], type[16], proto[16]; 157321936Shselasky 158321936Shselasky fp = fopen(RS_CONF_DIR "/preload_config", "r"); 159321936Shselasky if (!fp) 160321936Shselasky return; 161321936Shselasky 162321936Shselasky while (fgets(line, sizeof(line), fp)) { 163321936Shselasky if (line[0] == '#') 164321936Shselasky continue; 165321936Shselasky 166321936Shselasky if (sscanf(line, "%64s%16s%16s%16s", prog, dom, type, proto) != 4) 167321936Shselasky continue; 168321936Shselasky 169321936Shselasky new_config = realloc(config, (config_cnt + 1) * 170321936Shselasky sizeof(struct config_entry)); 171321936Shselasky if (!new_config) 172321936Shselasky break; 173321936Shselasky 174321936Shselasky config = new_config; 175321936Shselasky memset(&config[config_cnt], 0, sizeof(struct config_entry)); 176321936Shselasky 177321936Shselasky if (!strcasecmp(dom, "INET") || 178321936Shselasky !strcasecmp(dom, "AF_INET") || 179321936Shselasky !strcasecmp(dom, "PF_INET")) { 180321936Shselasky config[config_cnt].domain = AF_INET; 181321936Shselasky } else if (!strcasecmp(dom, "INET6") || 182321936Shselasky !strcasecmp(dom, "AF_INET6") || 183321936Shselasky !strcasecmp(dom, "PF_INET6")) { 184321936Shselasky config[config_cnt].domain = AF_INET6; 185321936Shselasky } else if (!strcasecmp(dom, "IB") || 186321936Shselasky !strcasecmp(dom, "AF_IB") || 187321936Shselasky !strcasecmp(dom, "PF_IB")) { 188321936Shselasky config[config_cnt].domain = AF_IB; 189321936Shselasky } else if (strcmp(dom, "*")) { 190321936Shselasky continue; 191321936Shselasky } 192321936Shselasky 193321936Shselasky if (!strcasecmp(type, "STREAM") || 194321936Shselasky !strcasecmp(type, "SOCK_STREAM")) { 195321936Shselasky config[config_cnt].type = SOCK_STREAM; 196321936Shselasky } else if (!strcasecmp(type, "DGRAM") || 197321936Shselasky !strcasecmp(type, "SOCK_DGRAM")) { 198321936Shselasky config[config_cnt].type = SOCK_DGRAM; 199321936Shselasky } else if (strcmp(type, "*")) { 200321936Shselasky continue; 201321936Shselasky } 202321936Shselasky 203321936Shselasky if (!strcasecmp(proto, "TCP") || 204321936Shselasky !strcasecmp(proto, "IPPROTO_TCP")) { 205321936Shselasky config[config_cnt].protocol = IPPROTO_TCP; 206321936Shselasky } else if (!strcasecmp(proto, "UDP") || 207321936Shselasky !strcasecmp(proto, "IPPROTO_UDP")) { 208321936Shselasky config[config_cnt].protocol = IPPROTO_UDP; 209321936Shselasky } else if (strcmp(proto, "*")) { 210321936Shselasky continue; 211321936Shselasky } 212321936Shselasky 213321936Shselasky if (strcmp(prog, "*")) { 214321936Shselasky if (!(config[config_cnt].name = strdup(prog))) 215321936Shselasky continue; 216321936Shselasky } 217321936Shselasky 218321936Shselasky config_cnt++; 219321936Shselasky } 220321936Shselasky 221321936Shselasky fclose(fp); 222321936Shselasky if (config_cnt) 223321936Shselasky atexit(free_config); 224321936Shselasky} 225321936Shselasky 226321936Shselaskystatic int intercept_socket(int domain, int type, int protocol) 227321936Shselasky{ 228321936Shselasky int i; 229321936Shselasky 230321936Shselasky if (!config_cnt) 231321936Shselasky return 1; 232321936Shselasky 233321936Shselasky if (!protocol) { 234321936Shselasky if (type == SOCK_STREAM) 235321936Shselasky protocol = IPPROTO_TCP; 236321936Shselasky else if (type == SOCK_DGRAM) 237321936Shselasky protocol = IPPROTO_UDP; 238321936Shselasky } 239321936Shselasky 240321936Shselasky for (i = 0; i < config_cnt; i++) { 241321936Shselasky if ((!config[i].name || 242321936Shselasky !strncasecmp(config[i].name, program_invocation_short_name, 243321936Shselasky strlen(config[i].name))) && 244321936Shselasky (!config[i].domain || config[i].domain == domain) && 245321936Shselasky (!config[i].type || config[i].type == type) && 246321936Shselasky (!config[i].protocol || config[i].protocol == protocol)) 247321936Shselasky return 1; 248321936Shselasky } 249321936Shselasky 250321936Shselasky return 0; 251321936Shselasky} 252321936Shselasky 253321936Shselaskystatic int fd_open(void) 254321936Shselasky{ 255321936Shselasky struct fd_info *fdi; 256321936Shselasky int ret, index; 257321936Shselasky 258321936Shselasky fdi = calloc(1, sizeof(*fdi)); 259321936Shselasky if (!fdi) 260321936Shselasky return ERR(ENOMEM); 261321936Shselasky 262321936Shselasky index = open("/dev/null", O_RDONLY); 263321936Shselasky if (index < 0) { 264321936Shselasky ret = index; 265321936Shselasky goto err1; 266321936Shselasky } 267321936Shselasky 268321936Shselasky fdi->dupfd = -1; 269321936Shselasky atomic_store(&fdi->refcnt, 1); 270321936Shselasky pthread_mutex_lock(&mut); 271321936Shselasky ret = idm_set(&idm, index, fdi); 272321936Shselasky pthread_mutex_unlock(&mut); 273321936Shselasky if (ret < 0) 274321936Shselasky goto err2; 275321936Shselasky 276321936Shselasky return index; 277321936Shselasky 278321936Shselaskyerr2: 279321936Shselasky real.close(index); 280321936Shselaskyerr1: 281321936Shselasky free(fdi); 282321936Shselasky return ret; 283321936Shselasky} 284321936Shselasky 285321936Shselaskystatic void fd_store(int index, int fd, enum fd_type type, enum fd_fork_state state) 286321936Shselasky{ 287321936Shselasky struct fd_info *fdi; 288321936Shselasky 289321936Shselasky fdi = idm_at(&idm, index); 290321936Shselasky fdi->fd = fd; 291321936Shselasky fdi->type = type; 292321936Shselasky fdi->state = state; 293321936Shselasky} 294321936Shselasky 295321936Shselaskystatic inline enum fd_type fd_get(int index, int *fd) 296321936Shselasky{ 297321936Shselasky struct fd_info *fdi; 298321936Shselasky 299321936Shselasky fdi = idm_lookup(&idm, index); 300321936Shselasky if (fdi) { 301321936Shselasky *fd = fdi->fd; 302321936Shselasky return fdi->type; 303321936Shselasky 304321936Shselasky } else { 305321936Shselasky *fd = index; 306321936Shselasky return fd_normal; 307321936Shselasky } 308321936Shselasky} 309321936Shselasky 310321936Shselaskystatic inline int fd_getd(int index) 311321936Shselasky{ 312321936Shselasky struct fd_info *fdi; 313321936Shselasky 314321936Shselasky fdi = idm_lookup(&idm, index); 315321936Shselasky return fdi ? fdi->fd : index; 316321936Shselasky} 317321936Shselasky 318321936Shselaskystatic inline enum fd_fork_state fd_gets(int index) 319321936Shselasky{ 320321936Shselasky struct fd_info *fdi; 321321936Shselasky 322321936Shselasky fdi = idm_lookup(&idm, index); 323321936Shselasky return fdi ? fdi->state : fd_ready; 324321936Shselasky} 325321936Shselasky 326321936Shselaskystatic inline enum fd_type fd_gett(int index) 327321936Shselasky{ 328321936Shselasky struct fd_info *fdi; 329321936Shselasky 330321936Shselasky fdi = idm_lookup(&idm, index); 331321936Shselasky return fdi ? fdi->type : fd_normal; 332321936Shselasky} 333321936Shselasky 334321936Shselaskystatic enum fd_type fd_close(int index, int *fd) 335321936Shselasky{ 336321936Shselasky struct fd_info *fdi; 337321936Shselasky enum fd_type type; 338321936Shselasky 339321936Shselasky fdi = idm_lookup(&idm, index); 340321936Shselasky if (fdi) { 341321936Shselasky idm_clear(&idm, index); 342321936Shselasky *fd = fdi->fd; 343321936Shselasky type = fdi->type; 344321936Shselasky real.close(index); 345321936Shselasky free(fdi); 346321936Shselasky } else { 347321936Shselasky *fd = index; 348321936Shselasky type = fd_normal; 349321936Shselasky } 350321936Shselasky return type; 351321936Shselasky} 352321936Shselasky 353321936Shselaskystatic void getenv_options(void) 354321936Shselasky{ 355321936Shselasky char *var; 356321936Shselasky 357321936Shselasky var = getenv("RS_SQ_SIZE"); 358321936Shselasky if (var) 359321936Shselasky sq_size = atoi(var); 360321936Shselasky 361321936Shselasky var = getenv("RS_RQ_SIZE"); 362321936Shselasky if (var) 363321936Shselasky rq_size = atoi(var); 364321936Shselasky 365321936Shselasky var = getenv("RS_INLINE"); 366321936Shselasky if (var) 367321936Shselasky sq_inline = atoi(var); 368321936Shselasky 369321936Shselasky var = getenv("RDMAV_FORK_SAFE"); 370321936Shselasky if (var) 371321936Shselasky fork_support = atoi(var); 372321936Shselasky} 373321936Shselasky 374321936Shselaskystatic void init_preload(void) 375321936Shselasky{ 376321936Shselasky static int init; 377321936Shselasky 378321936Shselasky /* Quick check without lock */ 379321936Shselasky if (init) 380321936Shselasky return; 381321936Shselasky 382321936Shselasky pthread_mutex_lock(&mut); 383321936Shselasky if (init) 384321936Shselasky goto out; 385321936Shselasky 386321936Shselasky real.socket = dlsym(RTLD_NEXT, "socket"); 387321936Shselasky real.bind = dlsym(RTLD_NEXT, "bind"); 388321936Shselasky real.listen = dlsym(RTLD_NEXT, "listen"); 389321936Shselasky real.accept = dlsym(RTLD_NEXT, "accept"); 390321936Shselasky real.connect = dlsym(RTLD_NEXT, "connect"); 391321936Shselasky real.recv = dlsym(RTLD_NEXT, "recv"); 392321936Shselasky real.recvfrom = dlsym(RTLD_NEXT, "recvfrom"); 393321936Shselasky real.recvmsg = dlsym(RTLD_NEXT, "recvmsg"); 394321936Shselasky real.read = dlsym(RTLD_NEXT, "read"); 395321936Shselasky real.readv = dlsym(RTLD_NEXT, "readv"); 396321936Shselasky real.send = dlsym(RTLD_NEXT, "send"); 397321936Shselasky real.sendto = dlsym(RTLD_NEXT, "sendto"); 398321936Shselasky real.sendmsg = dlsym(RTLD_NEXT, "sendmsg"); 399321936Shselasky real.write = dlsym(RTLD_NEXT, "write"); 400321936Shselasky real.writev = dlsym(RTLD_NEXT, "writev"); 401321936Shselasky real.poll = dlsym(RTLD_NEXT, "poll"); 402321936Shselasky real.shutdown = dlsym(RTLD_NEXT, "shutdown"); 403321936Shselasky real.close = dlsym(RTLD_NEXT, "close"); 404321936Shselasky real.getpeername = dlsym(RTLD_NEXT, "getpeername"); 405321936Shselasky real.getsockname = dlsym(RTLD_NEXT, "getsockname"); 406321936Shselasky real.setsockopt = dlsym(RTLD_NEXT, "setsockopt"); 407321936Shselasky real.getsockopt = dlsym(RTLD_NEXT, "getsockopt"); 408321936Shselasky real.fcntl = dlsym(RTLD_NEXT, "fcntl"); 409321936Shselasky real.dup2 = dlsym(RTLD_NEXT, "dup2"); 410321936Shselasky real.sendfile = dlsym(RTLD_NEXT, "sendfile"); 411321936Shselasky real.fxstat = dlsym(RTLD_NEXT, "__fxstat"); 412321936Shselasky 413321936Shselasky rs.socket = dlsym(RTLD_DEFAULT, "rsocket"); 414321936Shselasky rs.bind = dlsym(RTLD_DEFAULT, "rbind"); 415321936Shselasky rs.listen = dlsym(RTLD_DEFAULT, "rlisten"); 416321936Shselasky rs.accept = dlsym(RTLD_DEFAULT, "raccept"); 417321936Shselasky rs.connect = dlsym(RTLD_DEFAULT, "rconnect"); 418321936Shselasky rs.recv = dlsym(RTLD_DEFAULT, "rrecv"); 419321936Shselasky rs.recvfrom = dlsym(RTLD_DEFAULT, "rrecvfrom"); 420321936Shselasky rs.recvmsg = dlsym(RTLD_DEFAULT, "rrecvmsg"); 421321936Shselasky rs.read = dlsym(RTLD_DEFAULT, "rread"); 422321936Shselasky rs.readv = dlsym(RTLD_DEFAULT, "rreadv"); 423321936Shselasky rs.send = dlsym(RTLD_DEFAULT, "rsend"); 424321936Shselasky rs.sendto = dlsym(RTLD_DEFAULT, "rsendto"); 425321936Shselasky rs.sendmsg = dlsym(RTLD_DEFAULT, "rsendmsg"); 426321936Shselasky rs.write = dlsym(RTLD_DEFAULT, "rwrite"); 427321936Shselasky rs.writev = dlsym(RTLD_DEFAULT, "rwritev"); 428321936Shselasky rs.poll = dlsym(RTLD_DEFAULT, "rpoll"); 429321936Shselasky rs.shutdown = dlsym(RTLD_DEFAULT, "rshutdown"); 430321936Shselasky rs.close = dlsym(RTLD_DEFAULT, "rclose"); 431321936Shselasky rs.getpeername = dlsym(RTLD_DEFAULT, "rgetpeername"); 432321936Shselasky rs.getsockname = dlsym(RTLD_DEFAULT, "rgetsockname"); 433321936Shselasky rs.setsockopt = dlsym(RTLD_DEFAULT, "rsetsockopt"); 434321936Shselasky rs.getsockopt = dlsym(RTLD_DEFAULT, "rgetsockopt"); 435321936Shselasky rs.fcntl = dlsym(RTLD_DEFAULT, "rfcntl"); 436321936Shselasky 437321936Shselasky getenv_options(); 438321936Shselasky scan_config(); 439321936Shselasky init = 1; 440321936Shselaskyout: 441321936Shselasky pthread_mutex_unlock(&mut); 442321936Shselasky} 443321936Shselasky 444321936Shselasky/* 445321936Shselasky * We currently only handle copying a few common values. 446321936Shselasky */ 447321936Shselaskystatic int copysockopts(int dfd, int sfd, struct socket_calls *dapi, 448321936Shselasky struct socket_calls *sapi) 449321936Shselasky{ 450321936Shselasky socklen_t len; 451321936Shselasky int param, ret; 452321936Shselasky 453321936Shselasky ret = sapi->fcntl(sfd, F_GETFL); 454321936Shselasky if (ret > 0) 455321936Shselasky ret = dapi->fcntl(dfd, F_SETFL, ret); 456321936Shselasky if (ret) 457321936Shselasky return ret; 458321936Shselasky 459321936Shselasky len = sizeof param; 460321936Shselasky ret = sapi->getsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, ¶m, &len); 461321936Shselasky if (param && !ret) 462321936Shselasky ret = dapi->setsockopt(dfd, SOL_SOCKET, SO_REUSEADDR, ¶m, len); 463321936Shselasky if (ret) 464321936Shselasky return ret; 465321936Shselasky 466321936Shselasky len = sizeof param; 467321936Shselasky ret = sapi->getsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, ¶m, &len); 468321936Shselasky if (param && !ret) 469321936Shselasky ret = dapi->setsockopt(dfd, IPPROTO_TCP, TCP_NODELAY, ¶m, len); 470321936Shselasky if (ret) 471321936Shselasky return ret; 472321936Shselasky 473321936Shselasky return 0; 474321936Shselasky} 475321936Shselasky 476321936Shselasky/* 477321936Shselasky * Convert between an rsocket and a normal socket. 478321936Shselasky */ 479321936Shselaskystatic int transpose_socket(int socket, enum fd_type new_type) 480321936Shselasky{ 481321936Shselasky socklen_t len = 0; 482321936Shselasky int sfd, dfd, param, ret; 483321936Shselasky struct socket_calls *sapi, *dapi; 484321936Shselasky 485321936Shselasky sfd = fd_getd(socket); 486321936Shselasky if (new_type == fd_rsocket) { 487321936Shselasky dapi = &rs; 488321936Shselasky sapi = ℜ 489321936Shselasky } else { 490321936Shselasky dapi = ℜ 491321936Shselasky sapi = &rs; 492321936Shselasky } 493321936Shselasky 494321936Shselasky ret = sapi->getsockname(sfd, NULL, &len); 495321936Shselasky if (ret) 496321936Shselasky return ret; 497321936Shselasky 498321936Shselasky param = (len == sizeof(struct sockaddr_in6)) ? PF_INET6 : PF_INET; 499321936Shselasky dfd = dapi->socket(param, SOCK_STREAM, 0); 500321936Shselasky if (dfd < 0) 501321936Shselasky return dfd; 502321936Shselasky 503321936Shselasky ret = copysockopts(dfd, sfd, dapi, sapi); 504321936Shselasky if (ret) 505321936Shselasky goto err; 506321936Shselasky 507321936Shselasky fd_store(socket, dfd, new_type, fd_ready); 508321936Shselasky return dfd; 509321936Shselasky 510321936Shselaskyerr: 511321936Shselasky dapi->close(dfd); 512321936Shselasky return ret; 513321936Shselasky} 514321936Shselasky 515321936Shselasky/* 516321936Shselasky * Use defaults on failure. 517321936Shselasky */ 518321936Shselaskystatic void set_rsocket_options(int rsocket) 519321936Shselasky{ 520321936Shselasky if (sq_size) 521321936Shselasky rsetsockopt(rsocket, SOL_RDMA, RDMA_SQSIZE, &sq_size, sizeof sq_size); 522321936Shselasky 523321936Shselasky if (rq_size) 524321936Shselasky rsetsockopt(rsocket, SOL_RDMA, RDMA_RQSIZE, &rq_size, sizeof rq_size); 525321936Shselasky 526321936Shselasky if (sq_inline) 527321936Shselasky rsetsockopt(rsocket, SOL_RDMA, RDMA_INLINE, &sq_inline, sizeof sq_inline); 528321936Shselasky} 529321936Shselasky 530321936Shselaskyint socket(int domain, int type, int protocol) 531321936Shselasky{ 532321936Shselasky static __thread int recursive; 533321936Shselasky int index, ret; 534321936Shselasky 535321936Shselasky init_preload(); 536321936Shselasky 537321936Shselasky if (recursive || !intercept_socket(domain, type, protocol)) 538321936Shselasky goto real; 539321936Shselasky 540321936Shselasky index = fd_open(); 541321936Shselasky if (index < 0) 542321936Shselasky return index; 543321936Shselasky 544321936Shselasky if (fork_support && (domain == PF_INET || domain == PF_INET6) && 545321936Shselasky (type == SOCK_STREAM) && (!protocol || protocol == IPPROTO_TCP)) { 546321936Shselasky ret = real.socket(domain, type, protocol); 547321936Shselasky if (ret < 0) 548321936Shselasky return ret; 549321936Shselasky fd_store(index, ret, fd_normal, fd_fork); 550321936Shselasky return index; 551321936Shselasky } 552321936Shselasky 553321936Shselasky recursive = 1; 554321936Shselasky ret = rsocket(domain, type, protocol); 555321936Shselasky recursive = 0; 556321936Shselasky if (ret >= 0) { 557321936Shselasky fd_store(index, ret, fd_rsocket, fd_ready); 558321936Shselasky set_rsocket_options(ret); 559321936Shselasky return index; 560321936Shselasky } 561321936Shselasky fd_close(index, &ret); 562321936Shselaskyreal: 563321936Shselasky return real.socket(domain, type, protocol); 564321936Shselasky} 565321936Shselasky 566321936Shselaskyint bind(int socket, const struct sockaddr *addr, socklen_t addrlen) 567321936Shselasky{ 568321936Shselasky int fd; 569321936Shselasky return (fd_get(socket, &fd) == fd_rsocket) ? 570321936Shselasky rbind(fd, addr, addrlen) : real.bind(fd, addr, addrlen); 571321936Shselasky} 572321936Shselasky 573321936Shselaskyint listen(int socket, int backlog) 574321936Shselasky{ 575321936Shselasky int fd, ret; 576321936Shselasky if (fd_get(socket, &fd) == fd_rsocket) { 577321936Shselasky ret = rlisten(fd, backlog); 578321936Shselasky } else { 579321936Shselasky ret = real.listen(fd, backlog); 580321936Shselasky if (!ret && fd_gets(socket) == fd_fork) 581321936Shselasky fd_store(socket, fd, fd_normal, fd_fork_listen); 582321936Shselasky } 583321936Shselasky return ret; 584321936Shselasky} 585321936Shselasky 586321936Shselaskyint accept(int socket, struct sockaddr *addr, socklen_t *addrlen) 587321936Shselasky{ 588321936Shselasky int fd, index, ret; 589321936Shselasky 590321936Shselasky if (fd_get(socket, &fd) == fd_rsocket) { 591321936Shselasky index = fd_open(); 592321936Shselasky if (index < 0) 593321936Shselasky return index; 594321936Shselasky 595321936Shselasky ret = raccept(fd, addr, addrlen); 596321936Shselasky if (ret < 0) { 597321936Shselasky fd_close(index, &fd); 598321936Shselasky return ret; 599321936Shselasky } 600321936Shselasky 601321936Shselasky fd_store(index, ret, fd_rsocket, fd_ready); 602321936Shselasky return index; 603321936Shselasky } else if (fd_gets(socket) == fd_fork_listen) { 604321936Shselasky index = fd_open(); 605321936Shselasky if (index < 0) 606321936Shselasky return index; 607321936Shselasky 608321936Shselasky ret = real.accept(fd, addr, addrlen); 609321936Shselasky if (ret < 0) { 610321936Shselasky fd_close(index, &fd); 611321936Shselasky return ret; 612321936Shselasky } 613321936Shselasky 614321936Shselasky fd_store(index, ret, fd_normal, fd_fork_passive); 615321936Shselasky return index; 616321936Shselasky } else { 617321936Shselasky return real.accept(fd, addr, addrlen); 618321936Shselasky } 619321936Shselasky} 620321936Shselasky 621321936Shselasky/* 622321936Shselasky * We can't fork RDMA connections and pass them from the parent to the child 623321936Shselasky * process. Instead, we need to establish the RDMA connection after calling 624321936Shselasky * fork. To do this, we delay establishing the RDMA connection until we try 625321936Shselasky * to send/receive on the server side. 626321936Shselasky */ 627321936Shselaskystatic void fork_active(int socket) 628321936Shselasky{ 629321936Shselasky struct sockaddr_storage addr; 630321936Shselasky int sfd, dfd, ret; 631321936Shselasky socklen_t len; 632321936Shselasky uint32_t msg; 633321936Shselasky long flags; 634321936Shselasky 635321936Shselasky sfd = fd_getd(socket); 636321936Shselasky 637321936Shselasky flags = real.fcntl(sfd, F_GETFL); 638321936Shselasky real.fcntl(sfd, F_SETFL, 0); 639321936Shselasky ret = real.recv(sfd, &msg, sizeof msg, MSG_PEEK); 640321936Shselasky real.fcntl(sfd, F_SETFL, flags); 641321936Shselasky if ((ret != sizeof msg) || msg) 642321936Shselasky goto err1; 643321936Shselasky 644321936Shselasky len = sizeof addr; 645321936Shselasky ret = real.getpeername(sfd, (struct sockaddr *) &addr, &len); 646321936Shselasky if (ret) 647321936Shselasky goto err1; 648321936Shselasky 649321936Shselasky dfd = rsocket(addr.ss_family, SOCK_STREAM, 0); 650321936Shselasky if (dfd < 0) 651321936Shselasky goto err1; 652321936Shselasky 653321936Shselasky ret = rconnect(dfd, (struct sockaddr *) &addr, len); 654321936Shselasky if (ret) 655321936Shselasky goto err2; 656321936Shselasky 657321936Shselasky set_rsocket_options(dfd); 658321936Shselasky copysockopts(dfd, sfd, &rs, &real); 659321936Shselasky real.shutdown(sfd, SHUT_RDWR); 660321936Shselasky real.close(sfd); 661321936Shselasky fd_store(socket, dfd, fd_rsocket, fd_ready); 662321936Shselasky return; 663321936Shselasky 664321936Shselaskyerr2: 665321936Shselasky rclose(dfd); 666321936Shselaskyerr1: 667321936Shselasky fd_store(socket, sfd, fd_normal, fd_ready); 668321936Shselasky} 669321936Shselasky 670321936Shselasky/* 671321936Shselasky * The server will start listening for the new connection, then send a 672321936Shselasky * message to the active side when the listen is ready. This does leave 673321936Shselasky * fork unsupported in the following case: the server is nonblocking and 674321936Shselasky * calls select/poll waiting to receive data from the client. 675321936Shselasky */ 676321936Shselaskystatic void fork_passive(int socket) 677321936Shselasky{ 678321936Shselasky struct sockaddr_in6 sin6; 679321936Shselasky sem_t *sem; 680321936Shselasky int lfd, sfd, dfd, ret, param; 681321936Shselasky socklen_t len; 682321936Shselasky uint32_t msg; 683321936Shselasky 684321936Shselasky sfd = fd_getd(socket); 685321936Shselasky 686321936Shselasky len = sizeof sin6; 687321936Shselasky ret = real.getsockname(sfd, (struct sockaddr *) &sin6, &len); 688321936Shselasky if (ret) 689321936Shselasky goto out; 690321936Shselasky sin6.sin6_flowinfo = 0; 691321936Shselasky sin6.sin6_scope_id = 0; 692321936Shselasky memset(&sin6.sin6_addr, 0, sizeof sin6.sin6_addr); 693321936Shselasky 694321936Shselasky sem = sem_open("/rsocket_fork", O_CREAT | O_RDWR, 695321936Shselasky S_IRWXU | S_IRWXG, 1); 696321936Shselasky if (sem == SEM_FAILED) { 697321936Shselasky ret = -1; 698321936Shselasky goto out; 699321936Shselasky } 700321936Shselasky 701321936Shselasky lfd = rsocket(sin6.sin6_family, SOCK_STREAM, 0); 702321936Shselasky if (lfd < 0) { 703321936Shselasky ret = lfd; 704321936Shselasky goto sclose; 705321936Shselasky } 706321936Shselasky 707321936Shselasky param = 1; 708321936Shselasky rsetsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, ¶m, sizeof param); 709321936Shselasky 710321936Shselasky sem_wait(sem); 711321936Shselasky ret = rbind(lfd, (struct sockaddr *) &sin6, sizeof sin6); 712321936Shselasky if (ret) 713321936Shselasky goto lclose; 714321936Shselasky 715321936Shselasky ret = rlisten(lfd, 1); 716321936Shselasky if (ret) 717321936Shselasky goto lclose; 718321936Shselasky 719321936Shselasky msg = 0; 720321936Shselasky len = real.write(sfd, &msg, sizeof msg); 721321936Shselasky if (len != sizeof msg) 722321936Shselasky goto lclose; 723321936Shselasky 724321936Shselasky dfd = raccept(lfd, NULL, NULL); 725321936Shselasky if (dfd < 0) { 726321936Shselasky ret = dfd; 727321936Shselasky goto lclose; 728321936Shselasky } 729321936Shselasky 730321936Shselasky set_rsocket_options(dfd); 731321936Shselasky copysockopts(dfd, sfd, &rs, &real); 732321936Shselasky real.shutdown(sfd, SHUT_RDWR); 733321936Shselasky real.close(sfd); 734321936Shselasky fd_store(socket, dfd, fd_rsocket, fd_ready); 735321936Shselasky 736321936Shselaskylclose: 737321936Shselasky rclose(lfd); 738321936Shselasky sem_post(sem); 739321936Shselaskysclose: 740321936Shselasky sem_close(sem); 741321936Shselaskyout: 742321936Shselasky if (ret) 743321936Shselasky fd_store(socket, sfd, fd_normal, fd_ready); 744321936Shselasky} 745321936Shselasky 746321936Shselaskystatic inline enum fd_type fd_fork_get(int index, int *fd) 747321936Shselasky{ 748321936Shselasky struct fd_info *fdi; 749321936Shselasky 750321936Shselasky fdi = idm_lookup(&idm, index); 751321936Shselasky if (fdi) { 752321936Shselasky if (fdi->state == fd_fork_passive) 753321936Shselasky fork_passive(index); 754321936Shselasky else if (fdi->state == fd_fork_active) 755321936Shselasky fork_active(index); 756321936Shselasky *fd = fdi->fd; 757321936Shselasky return fdi->type; 758321936Shselasky 759321936Shselasky } else { 760321936Shselasky *fd = index; 761321936Shselasky return fd_normal; 762321936Shselasky } 763321936Shselasky} 764321936Shselasky 765321936Shselaskyint connect(int socket, const struct sockaddr *addr, socklen_t addrlen) 766321936Shselasky{ 767321936Shselasky int fd, ret; 768321936Shselasky 769321936Shselasky if (fd_get(socket, &fd) == fd_rsocket) { 770321936Shselasky ret = rconnect(fd, addr, addrlen); 771321936Shselasky if (!ret || errno == EINPROGRESS) 772321936Shselasky return ret; 773321936Shselasky 774321936Shselasky ret = transpose_socket(socket, fd_normal); 775321936Shselasky if (ret < 0) 776321936Shselasky return ret; 777321936Shselasky 778321936Shselasky rclose(fd); 779321936Shselasky fd = ret; 780321936Shselasky } else if (fd_gets(socket) == fd_fork) { 781321936Shselasky fd_store(socket, fd, fd_normal, fd_fork_active); 782321936Shselasky } 783321936Shselasky 784321936Shselasky return real.connect(fd, addr, addrlen); 785321936Shselasky} 786321936Shselasky 787321936Shselaskyssize_t recv(int socket, void *buf, size_t len, int flags) 788321936Shselasky{ 789321936Shselasky int fd; 790321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 791321936Shselasky rrecv(fd, buf, len, flags) : real.recv(fd, buf, len, flags); 792321936Shselasky} 793321936Shselasky 794321936Shselaskyssize_t recvfrom(int socket, void *buf, size_t len, int flags, 795321936Shselasky struct sockaddr *src_addr, socklen_t *addrlen) 796321936Shselasky{ 797321936Shselasky int fd; 798321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 799321936Shselasky rrecvfrom(fd, buf, len, flags, src_addr, addrlen) : 800321936Shselasky real.recvfrom(fd, buf, len, flags, src_addr, addrlen); 801321936Shselasky} 802321936Shselasky 803321936Shselaskyssize_t recvmsg(int socket, struct msghdr *msg, int flags) 804321936Shselasky{ 805321936Shselasky int fd; 806321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 807321936Shselasky rrecvmsg(fd, msg, flags) : real.recvmsg(fd, msg, flags); 808321936Shselasky} 809321936Shselasky 810321936Shselaskyssize_t read(int socket, void *buf, size_t count) 811321936Shselasky{ 812321936Shselasky int fd; 813321936Shselasky init_preload(); 814321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 815321936Shselasky rread(fd, buf, count) : real.read(fd, buf, count); 816321936Shselasky} 817321936Shselasky 818321936Shselaskyssize_t readv(int socket, const struct iovec *iov, int iovcnt) 819321936Shselasky{ 820321936Shselasky int fd; 821321936Shselasky init_preload(); 822321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 823321936Shselasky rreadv(fd, iov, iovcnt) : real.readv(fd, iov, iovcnt); 824321936Shselasky} 825321936Shselasky 826321936Shselaskyssize_t send(int socket, const void *buf, size_t len, int flags) 827321936Shselasky{ 828321936Shselasky int fd; 829321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 830321936Shselasky rsend(fd, buf, len, flags) : real.send(fd, buf, len, flags); 831321936Shselasky} 832321936Shselasky 833321936Shselaskyssize_t sendto(int socket, const void *buf, size_t len, int flags, 834321936Shselasky const struct sockaddr *dest_addr, socklen_t addrlen) 835321936Shselasky{ 836321936Shselasky int fd; 837321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 838321936Shselasky rsendto(fd, buf, len, flags, dest_addr, addrlen) : 839321936Shselasky real.sendto(fd, buf, len, flags, dest_addr, addrlen); 840321936Shselasky} 841321936Shselasky 842321936Shselaskyssize_t sendmsg(int socket, const struct msghdr *msg, int flags) 843321936Shselasky{ 844321936Shselasky int fd; 845321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 846321936Shselasky rsendmsg(fd, msg, flags) : real.sendmsg(fd, msg, flags); 847321936Shselasky} 848321936Shselasky 849321936Shselaskyssize_t write(int socket, const void *buf, size_t count) 850321936Shselasky{ 851321936Shselasky int fd; 852321936Shselasky init_preload(); 853321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 854321936Shselasky rwrite(fd, buf, count) : real.write(fd, buf, count); 855321936Shselasky} 856321936Shselasky 857321936Shselaskyssize_t writev(int socket, const struct iovec *iov, int iovcnt) 858321936Shselasky{ 859321936Shselasky int fd; 860321936Shselasky init_preload(); 861321936Shselasky return (fd_fork_get(socket, &fd) == fd_rsocket) ? 862321936Shselasky rwritev(fd, iov, iovcnt) : real.writev(fd, iov, iovcnt); 863321936Shselasky} 864321936Shselasky 865321936Shselaskystatic struct pollfd *fds_alloc(nfds_t nfds) 866321936Shselasky{ 867321936Shselasky static __thread struct pollfd *rfds; 868321936Shselasky static __thread nfds_t rnfds; 869321936Shselasky 870321936Shselasky if (nfds > rnfds) { 871321936Shselasky if (rfds) 872321936Shselasky free(rfds); 873321936Shselasky 874321936Shselasky rfds = malloc(sizeof(*rfds) * nfds); 875321936Shselasky rnfds = rfds ? nfds : 0; 876321936Shselasky } 877321936Shselasky 878321936Shselasky return rfds; 879321936Shselasky} 880321936Shselasky 881321936Shselaskyint poll(struct pollfd *fds, nfds_t nfds, int timeout) 882321936Shselasky{ 883321936Shselasky struct pollfd *rfds; 884321936Shselasky int i, ret; 885321936Shselasky 886321936Shselasky init_preload(); 887321936Shselasky for (i = 0; i < nfds; i++) { 888321936Shselasky if (fd_gett(fds[i].fd) == fd_rsocket) 889321936Shselasky goto use_rpoll; 890321936Shselasky } 891321936Shselasky 892321936Shselasky return real.poll(fds, nfds, timeout); 893321936Shselasky 894321936Shselaskyuse_rpoll: 895321936Shselasky rfds = fds_alloc(nfds); 896321936Shselasky if (!rfds) 897321936Shselasky return ERR(ENOMEM); 898321936Shselasky 899321936Shselasky for (i = 0; i < nfds; i++) { 900321936Shselasky rfds[i].fd = fd_getd(fds[i].fd); 901321936Shselasky rfds[i].events = fds[i].events; 902321936Shselasky rfds[i].revents = 0; 903321936Shselasky } 904321936Shselasky 905321936Shselasky ret = rpoll(rfds, nfds, timeout); 906321936Shselasky 907321936Shselasky for (i = 0; i < nfds; i++) 908321936Shselasky fds[i].revents = rfds[i].revents; 909321936Shselasky 910321936Shselasky return ret; 911321936Shselasky} 912321936Shselasky 913321936Shselaskystatic void select_to_rpoll(struct pollfd *fds, int *nfds, 914321936Shselasky fd_set *readfds, fd_set *writefds, fd_set *exceptfds) 915321936Shselasky{ 916321936Shselasky int fd, events, i = 0; 917321936Shselasky 918321936Shselasky for (fd = 0; fd < *nfds; fd++) { 919321936Shselasky events = (readfds && FD_ISSET(fd, readfds)) ? POLLIN : 0; 920321936Shselasky if (writefds && FD_ISSET(fd, writefds)) 921321936Shselasky events |= POLLOUT; 922321936Shselasky 923321936Shselasky if (events || (exceptfds && FD_ISSET(fd, exceptfds))) { 924321936Shselasky fds[i].fd = fd_getd(fd); 925321936Shselasky fds[i++].events = events; 926321936Shselasky } 927321936Shselasky } 928321936Shselasky 929321936Shselasky *nfds = i; 930321936Shselasky} 931321936Shselasky 932321936Shselaskystatic int rpoll_to_select(struct pollfd *fds, int nfds, 933321936Shselasky fd_set *readfds, fd_set *writefds, fd_set *exceptfds) 934321936Shselasky{ 935321936Shselasky int fd, rfd, i, cnt = 0; 936321936Shselasky 937321936Shselasky for (i = 0, fd = 0; i < nfds; fd++) { 938321936Shselasky rfd = fd_getd(fd); 939321936Shselasky if (rfd != fds[i].fd) 940321936Shselasky continue; 941321936Shselasky 942321936Shselasky if (readfds && (fds[i].revents & POLLIN)) { 943321936Shselasky FD_SET(fd, readfds); 944321936Shselasky cnt++; 945321936Shselasky } 946321936Shselasky 947321936Shselasky if (writefds && (fds[i].revents & POLLOUT)) { 948321936Shselasky FD_SET(fd, writefds); 949321936Shselasky cnt++; 950321936Shselasky } 951321936Shselasky 952321936Shselasky if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) { 953321936Shselasky FD_SET(fd, exceptfds); 954321936Shselasky cnt++; 955321936Shselasky } 956321936Shselasky i++; 957321936Shselasky } 958321936Shselasky 959321936Shselasky return cnt; 960321936Shselasky} 961321936Shselasky 962321936Shselaskystatic int rs_convert_timeout(struct timeval *timeout) 963321936Shselasky{ 964321936Shselasky return !timeout ? -1 : timeout->tv_sec * 1000 + timeout->tv_usec / 1000; 965321936Shselasky} 966321936Shselasky 967321936Shselaskyint select(int nfds, fd_set *readfds, fd_set *writefds, 968321936Shselasky fd_set *exceptfds, struct timeval *timeout) 969321936Shselasky{ 970321936Shselasky struct pollfd *fds; 971321936Shselasky int ret; 972321936Shselasky 973321936Shselasky fds = fds_alloc(nfds); 974321936Shselasky if (!fds) 975321936Shselasky return ERR(ENOMEM); 976321936Shselasky 977321936Shselasky select_to_rpoll(fds, &nfds, readfds, writefds, exceptfds); 978321936Shselasky ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); 979321936Shselasky 980321936Shselasky if (readfds) 981321936Shselasky FD_ZERO(readfds); 982321936Shselasky if (writefds) 983321936Shselasky FD_ZERO(writefds); 984321936Shselasky if (exceptfds) 985321936Shselasky FD_ZERO(exceptfds); 986321936Shselasky 987321936Shselasky if (ret > 0) 988321936Shselasky ret = rpoll_to_select(fds, nfds, readfds, writefds, exceptfds); 989321936Shselasky 990321936Shselasky return ret; 991321936Shselasky} 992321936Shselasky 993321936Shselaskyint shutdown(int socket, int how) 994321936Shselasky{ 995321936Shselasky int fd; 996321936Shselasky return (fd_get(socket, &fd) == fd_rsocket) ? 997321936Shselasky rshutdown(fd, how) : real.shutdown(fd, how); 998321936Shselasky} 999321936Shselasky 1000321936Shselaskyint close(int socket) 1001321936Shselasky{ 1002321936Shselasky struct fd_info *fdi; 1003321936Shselasky int ret; 1004321936Shselasky 1005321936Shselasky init_preload(); 1006321936Shselasky fdi = idm_lookup(&idm, socket); 1007321936Shselasky if (!fdi) 1008321936Shselasky return real.close(socket); 1009321936Shselasky 1010321936Shselasky if (fdi->dupfd != -1) { 1011321936Shselasky ret = close(fdi->dupfd); 1012321936Shselasky if (ret) 1013321936Shselasky return ret; 1014321936Shselasky } 1015321936Shselasky 1016321936Shselasky if (atomic_fetch_sub(&fdi->refcnt, 1) != 1) 1017321936Shselasky return 0; 1018321936Shselasky 1019321936Shselasky idm_clear(&idm, socket); 1020321936Shselasky real.close(socket); 1021321936Shselasky ret = (fdi->type == fd_rsocket) ? rclose(fdi->fd) : real.close(fdi->fd); 1022321936Shselasky free(fdi); 1023321936Shselasky return ret; 1024321936Shselasky} 1025321936Shselasky 1026321936Shselaskyint getpeername(int socket, struct sockaddr *addr, socklen_t *addrlen) 1027321936Shselasky{ 1028321936Shselasky int fd; 1029321936Shselasky return (fd_get(socket, &fd) == fd_rsocket) ? 1030321936Shselasky rgetpeername(fd, addr, addrlen) : 1031321936Shselasky real.getpeername(fd, addr, addrlen); 1032321936Shselasky} 1033321936Shselasky 1034321936Shselaskyint getsockname(int socket, struct sockaddr *addr, socklen_t *addrlen) 1035321936Shselasky{ 1036321936Shselasky int fd; 1037321936Shselasky init_preload(); 1038321936Shselasky return (fd_get(socket, &fd) == fd_rsocket) ? 1039321936Shselasky rgetsockname(fd, addr, addrlen) : 1040321936Shselasky real.getsockname(fd, addr, addrlen); 1041321936Shselasky} 1042321936Shselasky 1043321936Shselaskyint setsockopt(int socket, int level, int optname, 1044321936Shselasky const void *optval, socklen_t optlen) 1045321936Shselasky{ 1046321936Shselasky int fd; 1047321936Shselasky return (fd_get(socket, &fd) == fd_rsocket) ? 1048321936Shselasky rsetsockopt(fd, level, optname, optval, optlen) : 1049321936Shselasky real.setsockopt(fd, level, optname, optval, optlen); 1050321936Shselasky} 1051321936Shselasky 1052321936Shselaskyint getsockopt(int socket, int level, int optname, 1053321936Shselasky void *optval, socklen_t *optlen) 1054321936Shselasky{ 1055321936Shselasky int fd; 1056321936Shselasky return (fd_get(socket, &fd) == fd_rsocket) ? 1057321936Shselasky rgetsockopt(fd, level, optname, optval, optlen) : 1058321936Shselasky real.getsockopt(fd, level, optname, optval, optlen); 1059321936Shselasky} 1060321936Shselasky 1061321936Shselaskyint fcntl(int socket, int cmd, ... /* arg */) 1062321936Shselasky{ 1063321936Shselasky va_list args; 1064321936Shselasky long lparam; 1065321936Shselasky void *pparam; 1066321936Shselasky int fd, ret; 1067321936Shselasky 1068321936Shselasky init_preload(); 1069321936Shselasky va_start(args, cmd); 1070321936Shselasky switch (cmd) { 1071321936Shselasky case F_GETFD: 1072321936Shselasky case F_GETFL: 1073321936Shselasky case F_GETOWN: 1074321936Shselasky case F_GETSIG: 1075321936Shselasky case F_GETLEASE: 1076321936Shselasky ret = (fd_get(socket, &fd) == fd_rsocket) ? 1077321936Shselasky rfcntl(fd, cmd) : real.fcntl(fd, cmd); 1078321936Shselasky break; 1079321936Shselasky case F_DUPFD: 1080321936Shselasky /*case F_DUPFD_CLOEXEC:*/ 1081321936Shselasky case F_SETFD: 1082321936Shselasky case F_SETFL: 1083321936Shselasky case F_SETOWN: 1084321936Shselasky case F_SETSIG: 1085321936Shselasky case F_SETLEASE: 1086321936Shselasky case F_NOTIFY: 1087321936Shselasky lparam = va_arg(args, long); 1088321936Shselasky ret = (fd_get(socket, &fd) == fd_rsocket) ? 1089321936Shselasky rfcntl(fd, cmd, lparam) : real.fcntl(fd, cmd, lparam); 1090321936Shselasky break; 1091321936Shselasky default: 1092321936Shselasky pparam = va_arg(args, void *); 1093321936Shselasky ret = (fd_get(socket, &fd) == fd_rsocket) ? 1094321936Shselasky rfcntl(fd, cmd, pparam) : real.fcntl(fd, cmd, pparam); 1095321936Shselasky break; 1096321936Shselasky } 1097321936Shselasky va_end(args); 1098321936Shselasky return ret; 1099321936Shselasky} 1100321936Shselasky 1101321936Shselasky/* 1102321936Shselasky * dup2 is not thread safe 1103321936Shselasky */ 1104321936Shselaskyint dup2(int oldfd, int newfd) 1105321936Shselasky{ 1106321936Shselasky struct fd_info *oldfdi, *newfdi; 1107321936Shselasky int ret; 1108321936Shselasky 1109321936Shselasky init_preload(); 1110321936Shselasky oldfdi = idm_lookup(&idm, oldfd); 1111321936Shselasky if (oldfdi) { 1112321936Shselasky if (oldfdi->state == fd_fork_passive) 1113321936Shselasky fork_passive(oldfd); 1114321936Shselasky else if (oldfdi->state == fd_fork_active) 1115321936Shselasky fork_active(oldfd); 1116321936Shselasky } 1117321936Shselasky 1118321936Shselasky newfdi = idm_lookup(&idm, newfd); 1119321936Shselasky if (newfdi) { 1120321936Shselasky /* newfd cannot have been dup'ed directly */ 1121321936Shselasky if (atomic_load(&newfdi->refcnt) > 1) 1122321936Shselasky return ERR(EBUSY); 1123321936Shselasky close(newfd); 1124321936Shselasky } 1125321936Shselasky 1126321936Shselasky ret = real.dup2(oldfd, newfd); 1127321936Shselasky if (!oldfdi || ret != newfd) 1128321936Shselasky return ret; 1129321936Shselasky 1130321936Shselasky newfdi = calloc(1, sizeof(*newfdi)); 1131321936Shselasky if (!newfdi) { 1132321936Shselasky close(newfd); 1133321936Shselasky return ERR(ENOMEM); 1134321936Shselasky } 1135321936Shselasky 1136321936Shselasky pthread_mutex_lock(&mut); 1137321936Shselasky idm_set(&idm, newfd, newfdi); 1138321936Shselasky pthread_mutex_unlock(&mut); 1139321936Shselasky 1140321936Shselasky newfdi->fd = oldfdi->fd; 1141321936Shselasky newfdi->type = oldfdi->type; 1142321936Shselasky if (oldfdi->dupfd != -1) { 1143321936Shselasky newfdi->dupfd = oldfdi->dupfd; 1144321936Shselasky oldfdi = idm_lookup(&idm, oldfdi->dupfd); 1145321936Shselasky } else { 1146321936Shselasky newfdi->dupfd = oldfd; 1147321936Shselasky } 1148321936Shselasky atomic_store(&newfdi->refcnt, 1); 1149321936Shselasky atomic_fetch_add(&oldfdi->refcnt, 1); 1150321936Shselasky return newfd; 1151321936Shselasky} 1152321936Shselasky 1153321936Shselaskyssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) 1154321936Shselasky{ 1155321936Shselasky void *file_addr; 1156321936Shselasky int fd; 1157321936Shselasky size_t ret; 1158321936Shselasky 1159321936Shselasky if (fd_get(out_fd, &fd) != fd_rsocket) 1160321936Shselasky return real.sendfile(fd, in_fd, offset, count); 1161321936Shselasky 1162321936Shselasky file_addr = mmap(NULL, count, PROT_READ, 0, in_fd, offset ? *offset : 0); 1163321936Shselasky if (file_addr == (void *) -1) 1164321936Shselasky return -1; 1165321936Shselasky 1166321936Shselasky ret = rwrite(fd, file_addr, count); 1167321936Shselasky if ((ret > 0) && offset) 1168321936Shselasky lseek(in_fd, ret, SEEK_CUR); 1169321936Shselasky munmap(file_addr, count); 1170321936Shselasky return ret; 1171321936Shselasky} 1172321936Shselasky 1173321936Shselaskyint __fxstat(int ver, int socket, struct stat *buf) 1174321936Shselasky{ 1175321936Shselasky int fd, ret; 1176321936Shselasky 1177321936Shselasky init_preload(); 1178321936Shselasky if (fd_get(socket, &fd) == fd_rsocket) { 1179321936Shselasky ret = real.fxstat(ver, socket, buf); 1180321936Shselasky if (!ret) 1181321936Shselasky buf->st_mode = (buf->st_mode & ~S_IFMT) | __S_IFSOCK; 1182321936Shselasky } else { 1183321936Shselasky ret = real.fxstat(ver, fd, buf); 1184321936Shselasky } 1185321936Shselasky return ret; 1186321936Shselasky} 1187