/* socksyscalls.c revision 408:11731380d322 */
1153761Swollman/* 2192886Sedwin * CDDL HEADER START 3192886Sedwin * 464499Swollman * The contents of this file are subject to the terms of the 52742Swollman * Common Development and Distribution License, Version 1.0 only 62742Swollman * (the "License"). You may not use this file except in compliance 7243003Sedwin * with the License. 82742Swollman * 9158421Swollman * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 102742Swollman * or http://www.opensolaris.org/os/licensing. 11158421Swollman * See the License for the specific language governing permissions 12158421Swollman * and limitations under the License. 132742Swollman * 14248307Sedwin * When distributing Covered Code, include this CDDL HEADER in each 15248307Sedwin * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16248307Sedwin * If applicable, add the following below this CDDL HEADER, with the 17248307Sedwin * fields enclosed by brackets "[]" replaced with your own identifying 1886222Swollman * information: Portions Copyright [yyyy] [name of copyright owner] 1920094Swollman * 2020094Swollman * CDDL HEADER END 2120094Swollman */ 2220094Swollman/* 2320094Swollman * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24158421Swollman * Use is subject to license terms. 
25158421Swollman */ 2620094Swollman 2719878Swollman#pragma ident "%Z%%M% %I% %E% SMI" 2819878Swollman 2919878Swollman#include <sys/types.h> 3019878Swollman#include <sys/t_lock.h> 3119878Swollman#include <sys/param.h> 3219878Swollman#include <sys/systm.h> 3319878Swollman#include <sys/buf.h> 3419878Swollman#include <sys/conf.h> 3558787Sru#include <sys/cred.h> 3658787Sru#include <sys/kmem.h> 3758787Sru#include <sys/sysmacros.h> 3858787Sru#include <sys/vfs.h> 3958787Sru#include <sys/vnode.h> 4058787Sru#include <sys/debug.h> 4158787Sru#include <sys/errno.h> 4258787Sru#include <sys/time.h> 4358787Sru#include <sys/file.h> 4458787Sru#include <sys/open.h> 4558787Sru#include <sys/user.h> 4658787Sru#include <sys/termios.h> 4758787Sru#include <sys/stream.h> 4858787Sru#include <sys/strsubr.h> 4958787Sru#include <sys/strsun.h> 5058787Sru#include <sys/esunddi.h> 5158787Sru#include <sys/flock.h> 5258787Sru#include <sys/modctl.h> 532742Swollman#include <sys/cmn_err.h> 542742Swollman#include <sys/vmsystm.h> 552742Swollman#include <sys/policy.h> 562742Swollman 572742Swollman#include <sys/socket.h> 582742Swollman#include <sys/socketvar.h> 592742Swollman#include <netinet/in.h> 6019878Swollman#include <sys/un.h> 612742Swollman#include <inet/nca/ncadoorhdr.h> 622742Swollman 632742Swollman#include <sys/isa_defs.h> 6419878Swollman#include <sys/inttypes.h> 652742Swollman#include <sys/systm.h> 662742Swollman#include <sys/cpuvar.h> 67149514Swollman#include <sys/atomic.h> 6821217Swollman#include <sys/filio.h> 699908Swollman#include <sys/sendfile.h> 709908Swollman#include <sys/ddi.h> 712742Swollman#include <vm/seg.h> 7219878Swollman#include <vm/seg_map.h> 7319878Swollman#include <vm/seg_kpm.h> 7419878Swollman#include <fs/sockfs/nl7c.h> 7519878Swollman 7619878Swollman#ifdef SOCK_TEST 7719878Swollmanint do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */ 7819878Swollman#else 7919878Swollman#define do_useracc 1 8019878Swollman#endif /* SOCK_TEST */ 8119878Swollman 8219878Swollmanextern int 
xnet_truncate_print; 8319878Swollman 8419878Swollman/* 8519878Swollman * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" 8619878Swollman * as there isn't a formal definition of IOV_MAX ??? 8719878Swollman */ 8893799Swollman#define MSG_MAXIOVLEN 16 8958787Sru 9058787Sru/* 9119878Swollman * Kernel component of socket creation. 9219878Swollman * 9319878Swollman * The socket library determines which version number to use. 949908Swollman * First the library calls this with a NULL devpath. If this fails 95149514Swollman * to find a transport (using solookup) the library will look in /etc/netconfig 969908Swollman * for the appropriate transport. If one is found it will pass in the 979908Swollman * devpath for the kernel to use. 989908Swollman */ 9921217Swollmanint 10019878Swollmanso_socket(int domain, int type, int protocol, char *devpath, int version) 10119878Swollman{ 1029908Swollman vnode_t *accessvp; 103149514Swollman struct sonode *so; 1049908Swollman vnode_t *vp; 1059908Swollman struct file *fp; 1069908Swollman int fd; 1079908Swollman int error; 10858787Sru boolean_t wildcard = B_FALSE; 10958787Sru int saved_error = 0; 11058787Sru int sdomain = domain; 11164499Swollman 11264499Swollman dprint(1, ("so_socket(%d,%d,%d,%p,%d)\n", 113175034Sedwin domain, type, protocol, devpath, version)); 114175034Sedwin 115175034Sedwin if (domain == AF_NCA) { 116175034Sedwin /* 117175034Sedwin * The request is for an NCA socket so for NL7C use the 11858787Sru * INET domain instead and mark NL7C_AF_NCA below. 11958787Sru */ 12067578Swollman domain = AF_INET; 12158787Sru /* 12258787Sru * NL7C is not supported in non-global zones, 12358787Sru * we enforce this restriction here. 
124149514Swollman */ 12564499Swollman if (getzoneid() != GLOBAL_ZONEID) { 12664499Swollman return (set_errno(ENOTSUP)); 12764499Swollman } 12864499Swollman } 12986222Swollman 13086222Swollman accessvp = solookup(domain, type, protocol, devpath, &error); 13186222Swollman if (accessvp == NULL) { 13286222Swollman /* 13386222Swollman * If there is either an EPROTONOSUPPORT or EPROTOTYPE error 13486222Swollman * it makes sense doing the wildcard lookup since the 13586222Swollman * protocol might not be in the table. 13686222Swollman */ 13786222Swollman if (devpath != NULL || protocol == 0 || 13886222Swollman !(error == EPROTONOSUPPORT || error == EPROTOTYPE)) 13986222Swollman return (set_errno(error)); 14086222Swollman 14186222Swollman saved_error = error; 14286222Swollman 14386222Swollman /* 14486222Swollman * Try wildcard lookup. Never use devpath for wildcards. 14586222Swollman */ 14686222Swollman accessvp = solookup(domain, type, 0, NULL, &error); 14786222Swollman if (accessvp == NULL) { 14886222Swollman /* 14986222Swollman * Can't find in kernel table - have library 15086222Swollman * fall back to /etc/netconfig and tell us 15186222Swollman * the devpath (The library will do this if it didn't 152175034Sedwin * already pass in a devpath). 
153175034Sedwin */ 154175034Sedwin if (saved_error != 0) 155175034Sedwin error = saved_error; 156175034Sedwin return (set_errno(error)); 157175034Sedwin } 158175034Sedwin wildcard = B_TRUE; 159175034Sedwin } 160175034Sedwin 161175034Sedwin /* Check the device policy */ 162175034Sedwin if ((error = secpolicy_spec_open(CRED(), 163175034Sedwin accessvp, FREAD|FWRITE)) != 0) { 164175034Sedwin return (set_errno(error)); 165175034Sedwin } 166175034Sedwin 167175034Sedwin if (domain == AF_NCA) { 168175034Sedwin so = sonca_create(accessvp, domain, type, protocol, version, 169175034Sedwin NULL, &error); 170175034Sedwin } else if (protocol == IPPROTO_SCTP) { 171175034Sedwin so = sosctp_create(accessvp, domain, type, protocol, version, 172183066Sedwin NULL, &error); 173183066Sedwin } else { 174183066Sedwin so = sotpi_create(accessvp, domain, type, protocol, version, 175183066Sedwin NULL, &error); 176183066Sedwin } 177183066Sedwin if (so == NULL) { 178183066Sedwin return (set_errno(error)); 179183066Sedwin } 180183066Sedwin if (sdomain == AF_NCA && domain == AF_INET) { 181183066Sedwin so->so_nl7c_flags = NL7C_AF_NCA; 182183066Sedwin } 183183066Sedwin vp = SOTOV(so); 184183066Sedwin 185183864Sedwin if (wildcard) { 186183864Sedwin /* 187183864Sedwin * Issue SO_PROTOTYPE setsockopt. 188183864Sedwin */ 189183864Sedwin error = SOP_SETSOCKOPT(so, SOL_SOCKET, SO_PROTOTYPE, 190183864Sedwin &protocol, 191183864Sedwin (t_uscalar_t)sizeof (protocol)); 192183864Sedwin if (error) { 193183864Sedwin (void) VOP_CLOSE(vp, 0, 1, 0, CRED()); 194183864Sedwin VN_RELE(vp); 195183864Sedwin /* 196183864Sedwin * Setsockopt often fails with ENOPROTOOPT but socket() 197183864Sedwin * should fail with EPROTONOSUPPORT/EPROTOTYPE. 
198183864Sedwin */ 199183864Sedwin if (saved_error != 0 && error == ENOPROTOOPT) 200183864Sedwin error = saved_error; 201184406Sedwin else 202184406Sedwin error = EPROTONOSUPPORT; 203184406Sedwin return (set_errno(error)); 204184406Sedwin } 205184406Sedwin } 206184406Sedwin if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) { 207184406Sedwin (void) VOP_CLOSE(vp, 0, 1, 0, CRED()); 208184406Sedwin VN_RELE(vp); 209184406Sedwin return (set_errno(error)); 210184406Sedwin } 211184406Sedwin 212184406Sedwin /* 213184406Sedwin * Now fill in the entries that falloc reserved 214184406Sedwin */ 215184406Sedwin mutex_exit(&fp->f_tlock); 216184406Sedwin setf(fd, fp); 217184406Sedwin 218184406Sedwin return (fd); 219184406Sedwin} 220184406Sedwin 221198515Sedwin/* 222198515Sedwin * Map from a file descriptor to a socket node. 223198515Sedwin * Returns with the file descriptor held i.e. the caller has to 224198515Sedwin * use releasef when done with the file descriptor. 225198515Sedwin */ 226198515Sedwinstatic struct sonode * 227198515Sedwingetsonode(int sock, int *errorp, file_t **fpp) 228198515Sedwin{ 229198515Sedwin file_t *fp; 230198515Sedwin vnode_t *vp; 231198515Sedwin struct sonode *so; 232198515Sedwin 233198515Sedwin if ((fp = getf(sock)) == NULL) { 234198515Sedwin *errorp = EBADF; 235175034Sedwin eprintline(*errorp); 236198515Sedwin return (NULL); 237198515Sedwin } 238240457Sedwin vp = fp->f_vnode; 239136638Swollman /* Check if it is a socket */ 240136638Swollman if (vp->v_type != VSOCK) { 241136638Swollman releasef(sock); 242136638Swollman *errorp = ENOTSOCK; 243136638Swollman eprintline(*errorp); 244136638Swollman return (NULL); 245136638Swollman } 24693799Swollman /* 247158421Swollman * Use the stream head to find the real socket vnode. 24893799Swollman * This is needed when namefs sits above sockfs. 
249158421Swollman */ 25093799Swollman if (vp->v_stream) { 25193799Swollman ASSERT(vp->v_stream->sd_vnode); 252158421Swollman vp = vp->v_stream->sd_vnode; 253136638Swollman 254136638Swollman so = VTOSO(vp); 255136638Swollman if (so->so_version == SOV_STREAM) { 256136638Swollman releasef(sock); 257136638Swollman *errorp = ENOTSOCK; 258136638Swollman eprintsoline(so, *errorp); 259136638Swollman return (NULL); 260136638Swollman } 261136638Swollman } else { 262136638Swollman so = VTOSO(vp); 263136638Swollman } 264136638Swollman if (fpp) 265136638Swollman *fpp = fp; 266136638Swollman return (so); 267136638Swollman} 268136638Swollman 269136638Swollman/* 270136638Swollman * Allocate and copyin a sockaddr. 271136638Swollman * Ensures NULL termination for AF_UNIX addresses by extending them 272136638Swollman * with one NULL byte if need be. Verifies that the length is not 273136638Swollman * excessive to prevent an application from consuming all of kernel 274136638Swollman * memory. Returns NULL when an error occurred. 275136638Swollman */ 276136638Swollmanstatic struct sockaddr * 277136638Swollmancopyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp, 278136638Swollman int *errorp) 279136638Swollman{ 280136638Swollman char *faddr; 281136638Swollman size_t namelen = (size_t)*namelenp; 282136638Swollman 283136638Swollman ASSERT(namelen != 0); 284136638Swollman if (namelen > SO_MAXARGSIZE) { 285136638Swollman *errorp = EINVAL; 286136638Swollman eprintsoline(so, *errorp); 287136638Swollman return (NULL); 288136638Swollman } 289136638Swollman 290136638Swollman faddr = (char *)kmem_alloc(namelen, KM_SLEEP); 291136638Swollman if (copyin(name, faddr, namelen)) { 292136638Swollman kmem_free(faddr, namelen); 293136638Swollman *errorp = EFAULT; 294136638Swollman eprintsoline(so, *errorp); 295136638Swollman return (NULL); 296136638Swollman } 297136638Swollman 29893799Swollman /* 299177591Sedwin * Add space for NULL termination if needed. 
300177591Sedwin * Do a quick check if the last byte is NUL. 301177591Sedwin */ 302177591Sedwin if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') { 303177591Sedwin /* Check if there is any NULL termination */ 304177591Sedwin size_t i; 305177591Sedwin int foundnull = 0; 306177591Sedwin 307177591Sedwin for (i = sizeof (name->sa_family); i < namelen; i++) { 308177591Sedwin if (faddr[i] == '\0') { 309177591Sedwin foundnull = 1; 310177591Sedwin break; 311177591Sedwin } 312177591Sedwin } 313177591Sedwin if (!foundnull) { 314177591Sedwin /* Add extra byte for NUL padding */ 315177591Sedwin char *nfaddr; 316177591Sedwin 317177591Sedwin nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP); 318177591Sedwin bcopy(faddr, nfaddr, namelen); 319177591Sedwin kmem_free(faddr, namelen); 320177591Sedwin 321177591Sedwin /* NUL terminate */ 322177591Sedwin nfaddr[namelen] = '\0'; 323177591Sedwin namelen++; 324177591Sedwin ASSERT((socklen_t)namelen == namelen); 325177591Sedwin *namelenp = (socklen_t)namelen; 326177591Sedwin faddr = nfaddr; 327177591Sedwin } 328240457Sedwin } 329240457Sedwin return ((struct sockaddr *)faddr); 330240457Sedwin} 331177591Sedwin 332177591Sedwin/* 333177591Sedwin * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL. 
334177591Sedwin */ 335177591Sedwinstatic int 336177591Sedwincopyout_arg(void *uaddr, socklen_t ulen, void *ulenp, 337177591Sedwin void *kaddr, socklen_t klen) 338177591Sedwin{ 339177591Sedwin if (uaddr != NULL) { 340177591Sedwin if (ulen > klen) 341177591Sedwin ulen = klen; 342177591Sedwin 343177591Sedwin if (ulen != 0) { 344177591Sedwin if (copyout(kaddr, uaddr, ulen)) 345177591Sedwin return (EFAULT); 346177591Sedwin } 347177591Sedwin } else 348177591Sedwin ulen = 0; 349177591Sedwin 350177591Sedwin if (ulenp != NULL) { 351177591Sedwin if (copyout(&ulen, ulenp, sizeof (ulen))) 352177591Sedwin return (EFAULT); 353177591Sedwin } 354177591Sedwin return (0); 355177591Sedwin} 356177591Sedwin 357177591Sedwin/* 358177591Sedwin * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL. 359177591Sedwin * If klen is greater than ulen it still uses the non-truncated 360177591Sedwin * klen to update ulenp. 361177591Sedwin */ 362177591Sedwinstatic int 363177591Sedwincopyout_name(void *uaddr, socklen_t ulen, void *ulenp, 364177591Sedwin void *kaddr, socklen_t klen) 365181421Sedwin{ 366158421Swollman if (uaddr != NULL) { 367158421Swollman if (ulen >= klen) 368181421Sedwin ulen = klen; 369181421Sedwin else if (ulen != 0 && xnet_truncate_print) { 370181421Sedwin printf("sockfs: truncating copyout of address using " 371181421Sedwin "XNET semantics for pid = %d. 
Lengths %d, %d\n", 372181421Sedwin curproc->p_pid, klen, ulen); 373190372Sedwin } 374190372Sedwin 375190372Sedwin if (ulen != 0) { 376190372Sedwin if (copyout(kaddr, uaddr, ulen)) 37793799Swollman return (EFAULT); 378190372Sedwin } else 379190372Sedwin klen = 0; 380190372Sedwin } else 381190372Sedwin klen = 0; 382190372Sedwin 383190372Sedwin if (ulenp != NULL) { 384190372Sedwin if (copyout(&klen, ulenp, sizeof (klen))) 385190372Sedwin return (EFAULT); 386190372Sedwin } 387190372Sedwin return (0); 388248307Sedwin} 389190372Sedwin 390240457Sedwin/* 391248307Sedwin * The socketpair() code in libsocket creates two sockets (using 392248307Sedwin * the /etc/netconfig fallback if needed) before calling this routine 393190372Sedwin * to connect the two sockets together. 394190372Sedwin * 395190372Sedwin * For a SOCK_STREAM socketpair a listener is needed - in that case this 396190372Sedwin * routine will create a new file descriptor as part of accepting the 397190372Sedwin * connection. The library socketpair() will check if svs[2] has changed 398190372Sedwin * in which case it will close the changed fd. 399198515Sedwin * 400198515Sedwin * Note that this code could use the TPI feature of accepting the connection 401190372Sedwin * on the listening endpoint. However, that would require significant changes 402198515Sedwin * to soaccept. 
403198515Sedwin */ 404198515Sedwinint 405197597Sedwinso_socketpair(int sv[2]) 406198515Sedwin{ 407198515Sedwin int svs[2]; 408198515Sedwin struct sonode *so1, *so2; 409198515Sedwin int error; 410197597Sedwin struct sockaddr_ux *name; 411198515Sedwin size_t namelen; 412197597Sedwin 413198515Sedwin dprint(1, ("so_socketpair(%p)\n", sv)); 414198515Sedwin 415198515Sedwin error = useracc(sv, sizeof (svs), B_WRITE); 416198515Sedwin if (error && do_useracc) 417198515Sedwin return (set_errno(EFAULT)); 418198515Sedwin 419198515Sedwin if (copyin(sv, svs, sizeof (svs))) 420198515Sedwin return (set_errno(EFAULT)); 421198515Sedwin 422198515Sedwin if ((so1 = getsonode(svs[0], &error, NULL)) == NULL) 423198515Sedwin return (set_errno(error)); 424198515Sedwin 425198515Sedwin if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) { 426198515Sedwin releasef(svs[0]); 427198515Sedwin return (set_errno(error)); 428198515Sedwin } 429198515Sedwin 430198515Sedwin if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) { 431198515Sedwin error = EOPNOTSUPP; 432197597Sedwin goto done; 433206868Sedwin } 434206868Sedwin 435206868Sedwin /* 436206868Sedwin * The code below makes assumptions about the "sockfs" implementation. 437206868Sedwin * So make sure that the correct implementation is really used. 438206868Sedwin */ 439206868Sedwin ASSERT(so1->so_ops == &sotpi_sonodeops); 440206868Sedwin ASSERT(so2->so_ops == &sotpi_sonodeops); 441206868Sedwin 442206868Sedwin if (so1->so_type == SOCK_DGRAM) { 443206868Sedwin /* 444206868Sedwin * Bind both sockets and connect them with each other. 445206868Sedwin * Need to allocate name/namelen for soconnect. 
446206868Sedwin */ 447206868Sedwin error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC); 448206868Sedwin if (error) { 449206868Sedwin eprintsoline(so1, error); 450206868Sedwin goto done; 451206868Sedwin } 452206868Sedwin error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC); 453206868Sedwin if (error) { 454248307Sedwin eprintsoline(so2, error); 455248307Sedwin goto done; 456248307Sedwin } 4572742Swollman namelen = sizeof (struct sockaddr_ux); 45820094Swollman name = kmem_alloc(namelen, KM_SLEEP); 459136638Swollman name->sou_family = AF_UNIX; 460136638Swollman name->sou_addr = so2->so_ux_laddr; 46193799Swollman error = SOP_CONNECT(so1, 46219878Swollman (struct sockaddr *)name, 46358787Sru (socklen_t)namelen, 46493799Swollman 0, _SOCONNECT_NOXLATE); 46593799Swollman if (error) { 466175034Sedwin kmem_free(name, namelen); 46720094Swollman eprintsoline(so1, error); 468184406Sedwin goto done; 469184406Sedwin } 47020094Swollman name->sou_addr = so1->so_ux_laddr; 471158421Swollman error = SOP_CONNECT(so2, 47293799Swollman (struct sockaddr *)name, 47393799Swollman (socklen_t)namelen, 47493799Swollman 0, _SOCONNECT_NOXLATE); 47593799Swollman kmem_free(name, namelen); 47693799Swollman if (error) { 47793799Swollman eprintsoline(so2, error); 478136638Swollman goto done; 47993799Swollman } 48020094Swollman releasef(svs[0]); 48158787Sru releasef(svs[1]); 48293799Swollman } else { 48393799Swollman /* 48493799Swollman * Bind both sockets, with so1 being a listener. 48593799Swollman * Connect so2 to so1 - nonblocking to avoid waiting for 486175034Sedwin * soaccept to complete. 48720094Swollman * Accept a connection on so1. Pass out the new fd as sv[0]. 488184406Sedwin * The library will detect the changed fd and close 489184406Sedwin * the original one. 
490184406Sedwin */ 491184406Sedwin struct sonode *nso; 492184406Sedwin struct vnode *nvp; 493184406Sedwin struct file *nfp; 494184406Sedwin int nfd; 495184406Sedwin 496184406Sedwin /* 497184406Sedwin * We could simply call SOP_LISTEN() here (which would do the 498184406Sedwin * binding automatically) if the code didn't rely on passing 499184406Sedwin * _SOBIND_NOXLATE to the TPI implementation of SOP_BIND(). 500136638Swollman */ 501136638Swollman error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC|_SOBIND_NOXLATE| 502136638Swollman _SOBIND_LISTEN|_SOBIND_SOCKETPAIR); 503136638Swollman if (error) { 504136638Swollman eprintsoline(so1, error); 505136638Swollman goto done; 506136638Swollman } 507136638Swollman error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC); 508136638Swollman if (error) { 509136638Swollman eprintsoline(so2, error); 510136638Swollman goto done; 511175034Sedwin } 512136638Swollman 513136638Swollman namelen = sizeof (struct sockaddr_ux); 514136638Swollman name = kmem_alloc(namelen, KM_SLEEP); 515136638Swollman name->sou_family = AF_UNIX; 516136638Swollman name->sou_addr = so1->so_ux_laddr; 517136638Swollman error = SOP_CONNECT(so2, 518136638Swollman (struct sockaddr *)name, 519136638Swollman (socklen_t)namelen, 520136638Swollman FNONBLOCK, _SOCONNECT_NOXLATE); 521136638Swollman kmem_free(name, namelen); 522136638Swollman if (error) { 523136638Swollman if (error != EINPROGRESS) { 524184406Sedwin eprintsoline(so2, error); 525184406Sedwin goto done; 526136638Swollman } 527136638Swollman } 528136638Swollman 529136638Swollman error = SOP_ACCEPT(so1, 0, &nso); 530136638Swollman if (error) { 531136638Swollman eprintsoline(so1, error); 532136638Swollman goto done; 533136638Swollman } 534136638Swollman 535136638Swollman /* wait for so2 being SS_CONNECTED ignoring signals */ 536136638Swollman mutex_enter(&so2->so_lock); 537136638Swollman error = sowaitconnected(so2, 0, 1); 538184406Sedwin mutex_exit(&so2->so_lock); 539184406Sedwin nvp = SOTOV(nso); 540136638Swollman if 
(error != 0) { 54120094Swollman (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 542136638Swollman VN_RELE(nvp); 54393799Swollman eprintsoline(so2, error); 54420094Swollman goto done; 54520094Swollman } 54693799Swollman 54793799Swollman if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) { 54893799Swollman (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 54920094Swollman VN_RELE(nvp); 55093799Swollman eprintsoline(nso, error); 55193799Swollman goto done; 55293799Swollman } 553184406Sedwin /* 554184406Sedwin * fill in the entries that falloc reserved 55520094Swollman */ 556149514Swollman mutex_exit(&nfp->f_tlock); 557136638Swollman setf(nfd, nfp); 55893799Swollman 55920094Swollman releasef(svs[0]); 56058787Sru releasef(svs[1]); 56193799Swollman svs[0] = nfd; 56293799Swollman 56393799Swollman /* 56493799Swollman * The socketpair library routine will close the original 565136638Swollman * svs[0] when this code passes out a different file 566136638Swollman * descriptor. 567184406Sedwin */ 568184406Sedwin if (copyout(svs, sv, sizeof (svs))) { 56920094Swollman (void) closeandsetf(nfd, NULL); 57020094Swollman eprintline(EFAULT); 571136638Swollman return (set_errno(EFAULT)); 57293799Swollman } 57320094Swollman } 57420094Swollman return (0); 57593799Swollman 57693799Swollmandone: 57793799Swollman releasef(svs[0]); 57820094Swollman releasef(svs[1]); 57920094Swollman return (set_errno(error)); 58020094Swollman} 58193799Swollman 58293799Swollmanint 583136638Swollmanbind(int sock, struct sockaddr *name, socklen_t namelen, int version) 584136638Swollman{ 585184406Sedwin struct sonode *so; 586184406Sedwin int error; 587136638Swollman 588177591Sedwin dprint(1, ("bind(%d, %p, %d)\n", 589198515Sedwin sock, name, namelen)); 590206868Sedwin 591206868Sedwin if ((so = getsonode(sock, &error, NULL)) == NULL) 592198515Sedwin return (set_errno(error)); 593177591Sedwin 594177591Sedwin /* Allocate and copyin name */ 595177591Sedwin /* 596177591Sedwin * X/Open test does not expect EFAULT with NULL name and 
non-zero 597181421Sedwin * namelen. 598181421Sedwin */ 599181421Sedwin if (name != NULL && namelen != 0) { 600181421Sedwin ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 601181421Sedwin name = copyin_name(so, name, &namelen, &error); 602181421Sedwin if (name == NULL) { 603181421Sedwin releasef(sock); 604177591Sedwin return (set_errno(error)); 605177591Sedwin } 606177591Sedwin } else { 607198515Sedwin name = NULL; 608177591Sedwin namelen = 0; 609136638Swollman } 610136638Swollman 611136638Swollman switch (version) { 612136638Swollman default: 613136638Swollman error = SOP_BIND(so, name, namelen, 0); 614136638Swollman break; 615136638Swollman case SOV_XPG4_2: 616136638Swollman error = SOP_BIND(so, name, namelen, _SOBIND_XPG4_2); 617136638Swollman break; 618184406Sedwin case SOV_SOCKBSD: 619184406Sedwin error = SOP_BIND(so, name, namelen, _SOBIND_SOCKBSD); 620136638Swollman break; 621136638Swollman } 622136638Swollmandone: 623136638Swollman releasef(sock); 624136638Swollman if (name != NULL) 625136638Swollman kmem_free(name, (size_t)namelen); 626136638Swollman 627136638Swollman if (error) 628136638Swollman return (set_errno(error)); 629136638Swollman return (0); 630184406Sedwin} 631184406Sedwin 6328029Swollman/* ARGSUSED2 */ 63314343Swollmanint 63414343Swollmanlisten(int sock, int backlog, int version) 63514343Swollman{ 63619878Swollman struct sonode *so; 63714343Swollman int error; 63814343Swollman 6392742Swollman dprint(1, ("listen(%d, %d)\n", 6402742Swollman sock, backlog)); 6412742Swollman 64286222Swollman if ((so = getsonode(sock, &error, NULL)) == NULL) 64319878Swollman return (set_errno(error)); 64419878Swollman 6452742Swollman error = SOP_LISTEN(so, backlog); 6462742Swollman 6472742Swollman releasef(sock); 648149514Swollman if (error) 6492742Swollman return (set_errno(error)); 6502742Swollman return (0); 6512742Swollman} 6522742Swollman 6532742Swollman/*ARGSUSED3*/ 6542742Swollmanint 65520094Swollmanaccept(int sock, struct sockaddr *name, socklen_t *namelenp, int 
version) 65620094Swollman{ 65720094Swollman struct sonode *so; 65820094Swollman file_t *fp; 65920094Swollman int error; 66020094Swollman socklen_t namelen; 66120094Swollman struct sonode *nso; 66220094Swollman struct vnode *nvp; 66320094Swollman struct file *nfp; 66420094Swollman int nfd; 66520094Swollman 66620094Swollman dprint(1, ("accept(%d, %p, %p)\n", 66720094Swollman sock, name, namelenp)); 66820094Swollman 66920094Swollman if ((so = getsonode(sock, &error, &fp)) == NULL) 67020094Swollman return (set_errno(error)); 67120094Swollman 67220094Swollman if (name != NULL) { 67320094Swollman ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 67420094Swollman if (copyin(namelenp, &namelen, sizeof (namelen))) { 67520094Swollman releasef(sock); 67620094Swollman return (set_errno(EFAULT)); 67720094Swollman } 67820094Swollman if (namelen != 0) { 67920094Swollman error = useracc(name, (size_t)namelen, B_WRITE); 68020094Swollman if (error && do_useracc) { 68143014Swollman releasef(sock); 68243014Swollman return (set_errno(EFAULT)); 68343014Swollman } 68443014Swollman } else 68575267Swollman name = NULL; 68675267Swollman } else { 68775267Swollman namelen = 0; 68875267Swollman } 68975267Swollman 69075267Swollman /* 691105196Swollman * Allocate the user fd before SOP_ACCEPT() in order to 692105196Swollman * catch EMFILE errors before calling SOP_ACCEPT(). 693105196Swollman */ 694105196Swollman if ((nfd = ufalloc(0)) == -1) { 695105196Swollman eprintsoline(so, EMFILE); 696105196Swollman releasef(sock); 697105196Swollman return (set_errno(EMFILE)); 698105196Swollman } 699105196Swollman error = SOP_ACCEPT(so, fp->f_flag, &nso); 700105196Swollman releasef(sock); 701105196Swollman if (error) { 702105196Swollman setf(nfd, NULL); 703105196Swollman return (set_errno(error)); 704105196Swollman } 705105196Swollman 706105196Swollman nvp = SOTOV(nso); 707105196Swollman 708136638Swollman /* 709136638Swollman * so_faddr_sa can not go away even though we are not holding so_lock. 
710136638Swollman * However, in theory its content could change from underneath us. 711136638Swollman * But this is not possible in practice since it can only 712136638Swollman * change due to either some socket system call 713172479Sedwin * or due to a T_CONN_CON being received from the stream head. 714172479Sedwin * Since the falloc/setf have not yet been done no thread 715172479Sedwin * can do any system call on nso and T_CONN_CON can not arrive 716172479Sedwin * on a socket that is already connected. 717181421Sedwin * Thus there is no reason to hold so_lock here. 718181421Sedwin * 719181421Sedwin * SOP_ACCEPT() is required to have set the valid bit for the faddr, 720181421Sedwin * but it could be instantly cleared by a disconnect from the transport. 721181421Sedwin * For that reason we ignore it here. 722181421Sedwin */ 723181421Sedwin ASSERT(MUTEX_NOT_HELD(&nso->so_lock)); 724181421Sedwin error = copyout_name(name, namelen, namelenp, 725181421Sedwin nso->so_faddr_sa, (socklen_t)nso->so_faddr_len); 726181421Sedwin if (error) { 727181421Sedwin setf(nfd, NULL); 728181421Sedwin (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 729181421Sedwin VN_RELE(nvp); 730181421Sedwin return (set_errno(error)); 731181421Sedwin } 732181421Sedwin if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) { 733181421Sedwin setf(nfd, NULL); 734181421Sedwin (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 735181421Sedwin VN_RELE(nvp); 736181421Sedwin eprintsoline(so, error); 737181421Sedwin return (set_errno(error)); 738192886Sedwin } 739192886Sedwin /* 740181421Sedwin * fill in the entries that falloc reserved 741181421Sedwin */ 742181421Sedwin nfp->f_vnode = nvp; 743181421Sedwin mutex_exit(&nfp->f_tlock); 744221092Sedwin setf(nfd, nfp); 745181421Sedwin 746181421Sedwin /* 747181421Sedwin * Copy FNDELAY and FNONBLOCK from listener to acceptor 748181421Sedwin */ 749181421Sedwin if (so->so_state & (SS_NDELAY|SS_NONBLOCK)) { 750181421Sedwin uint_t oflag = nfp->f_flag; 751181421Sedwin int arg = 0; 
752181421Sedwin 753181421Sedwin if (so->so_state & SS_NONBLOCK) 754181421Sedwin arg |= FNONBLOCK; 755181421Sedwin else if (so->so_state & SS_NDELAY) 756181421Sedwin arg |= FNDELAY; 757181421Sedwin 758181421Sedwin /* 759181421Sedwin * This code is a simplification of the F_SETFL code in fcntl() 760181421Sedwin * Ignore any errors from VOP_SETFL. 761105196Swollman */ 762105196Swollman if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred)) != 0) { 76343014Swollman eprintsoline(so, error); 76443014Swollman error = 0; 765163302Sru } else { 76643014Swollman mutex_enter(&nfp->f_tlock); 767183066Sedwin nfp->f_flag &= ~FMASK | (FREAD|FWRITE); 768183066Sedwin nfp->f_flag |= arg; 769183066Sedwin mutex_exit(&nfp->f_tlock); 770183066Sedwin } 771183066Sedwin } 772183066Sedwin return (nfd); 773183066Sedwin} 774183066Sedwin 775183066Sedwinint 776183066Sedwinconnect(int sock, struct sockaddr *name, socklen_t namelen, int version) 777183066Sedwin{ 778183066Sedwin struct sonode *so; 779183066Sedwin file_t *fp; 780183066Sedwin int error; 781183066Sedwin 782183536Sedwin dprint(1, ("connect(%d, %p, %d)\n", 783183066Sedwin sock, name, namelen)); 784183066Sedwin 785183066Sedwin if ((so = getsonode(sock, &error, &fp)) == NULL) 786183066Sedwin return (set_errno(error)); 787183066Sedwin 788183066Sedwin /* Allocate and copyin name */ 789183066Sedwin if (namelen != 0) { 790183066Sedwin ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 791183066Sedwin name = copyin_name(so, name, &namelen, &error); 792183066Sedwin if (name == NULL) { 793183066Sedwin releasef(sock); 794183066Sedwin return (set_errno(error)); 795183066Sedwin } 796226289Sedwin } else 797226289Sedwin name = NULL; 798226289Sedwin 799240457Sedwin error = SOP_CONNECT(so, name, namelen, fp->f_flag, 800240457Sedwin (version != SOV_XPG4_2) ? 
0 : _SOCONNECT_XPG4_2); 801183066Sedwin releasef(sock); 802226289Sedwin if (name) 803226289Sedwin kmem_free(name, (size_t)namelen); 804226289Sedwin if (error) 805226289Sedwin return (set_errno(error)); 806226289Sedwin return (0); 807226289Sedwin} 808226289Sedwin 809226289Sedwin/*ARGSUSED2*/ 810226289Sedwinint 811226289Sedwinshutdown(int sock, int how, int version) 812248307Sedwin{ 813226289Sedwin struct sonode *so; 814248307Sedwin int error; 815226289Sedwin 816226289Sedwin dprint(1, ("shutdown(%d, %d)\n", 817226752Sedwin sock, how)); 818226752Sedwin 819226752Sedwin if ((so = getsonode(sock, &error, NULL)) == NULL) 820226752Sedwin return (set_errno(error)); 821226752Sedwin 822226752Sedwin error = SOP_SHUTDOWN(so, how); 823226752Sedwin 824226752Sedwin releasef(sock); 825226752Sedwin if (error) 826226752Sedwin return (set_errno(error)); 827226752Sedwin return (0); 828226752Sedwin} 829242208Sedwin 830242208Sedwin/* 831242208Sedwin * Common receive routine. 832242208Sedwin */ 833242208Sedwinstatic ssize_t 834226752Sedwinrecvit(int sock, 835242208Sedwin struct nmsghdr *msg, 836242208Sedwin struct uio *uiop, 837242208Sedwin int flags, 838242208Sedwin socklen_t *namelenp, 8392742Swollman socklen_t *controllenp, 84043543Swollman int *flagsp) 84143543Swollman{ 84258787Sru struct sonode *so; 84358787Sru file_t *fp; 84458787Sru void *name; 84543543Swollman socklen_t namelen; 84643014Swollman void *control; 84743543Swollman socklen_t controllen; 84843543Swollman ssize_t len; 84958787Sru int error; 85058787Sru 85158787Sru if ((so = getsonode(sock, &error, &fp)) == NULL) 85243543Swollman return (set_errno(error)); 85358787Sru 85443543Swollman len = uiop->uio_resid; 85543014Swollman uiop->uio_fmode = fp->f_flag; 85643543Swollman uiop->uio_extflg = UIO_COPY_CACHED; 85743543Swollman 85843543Swollman name = msg->msg_name; 85943543Swollman namelen = msg->msg_namelen; 86043543Swollman control = msg->msg_control; 86158787Sru controllen = msg->msg_controllen; 86243543Swollman 
86343543Swollman msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | 86458787Sru MSG_DONTWAIT | MSG_XPG4_2); 86543543Swollman 86658787Sru error = SOP_RECVMSG(so, msg, uiop); 86758787Sru if (error) { 86843543Swollman releasef(sock); 86958787Sru return (set_errno(error)); 87043543Swollman } 87158787Sru lwp_stat_update(LWP_STAT_MSGRCV, 1); 87258787Sru so_update_attrs(so, SOACC); 87343543Swollman releasef(sock); 87443543Swollman 87543543Swollman error = copyout_name(name, namelen, namelenp, 87658787Sru msg->msg_name, msg->msg_namelen); 87743543Swollman if (error) 87843543Swollman goto err; 87958787Sru 88043543Swollman if (flagsp != NULL) { 88158787Sru /* 88258787Sru * Clear internal flag. 88343543Swollman */ 88458787Sru msg->msg_flags &= ~MSG_XPG4_2; 88558787Sru 88643543Swollman /* 88743543Swollman * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only 88858787Sru * when controllen is zero and there is control data to 88958787Sru * copy out. 89043543Swollman */ 89143543Swollman if (controllen != 0 && 89258787Sru (msg->msg_controllen > controllen || control == NULL)) { 89358787Sru dprint(1, ("recvit: CTRUNC %d %d %p\n", 89443543Swollman msg->msg_controllen, controllen, control)); 89543543Swollman 89643543Swollman msg->msg_flags |= MSG_CTRUNC; 89758787Sru } 89858787Sru if (copyout(&msg->msg_flags, flagsp, 89943543Swollman sizeof (msg->msg_flags))) { 90043543Swollman error = EFAULT; 90158787Sru goto err; 90258787Sru } 90343543Swollman } 90443543Swollman /* 90558787Sru * Note: This MUST be done last. There can be no "goto err" after this 90658787Sru * point since it could make so_closefds run twice on some part 90743014Swollman * of the file descriptor array. 90843543Swollman */ 909136638Swollman if (controllen != 0) { 910136638Swollman if (!(flags & MSG_XPG4_2)) { 91143543Swollman /* 912136638Swollman * Good old msg_accrights can only return a multiple 91343543Swollman * of 4 bytes. 
91443543Swollman */ 91558787Sru controllen &= ~((int)sizeof (uint32_t) - 1); 91658787Sru } 91758787Sru error = copyout_arg(control, controllen, controllenp, 91843543Swollman msg->msg_control, msg->msg_controllen); 91943543Swollman if (error) 92058787Sru goto err; 92158787Sru 922149514Swollman if (msg->msg_controllen > controllen || control == NULL) { 92343014Swollman if (control == NULL) 92443014Swollman controllen = 0; 92543014Swollman so_closefds(msg->msg_control, msg->msg_controllen, 92643014Swollman !(flags & MSG_XPG4_2), controllen); 92743014Swollman } 92843014Swollman } 92943543Swollman if (msg->msg_namelen != 0) 93058787Sru kmem_free(msg->msg_name, (size_t)msg->msg_namelen); 93143543Swollman if (msg->msg_controllen != 0) 93243014Swollman kmem_free(msg->msg_control, (size_t)msg->msg_controllen); 93358787Sru return (len - uiop->uio_resid); 93443543Swollman 93558787Sruerr: 93658787Sru /* 93758787Sru * If we fail and the control part contains file descriptors 93858787Sru * we have to close the fd's. 
93958787Sru */ 94058787Sru if (msg->msg_controllen != 0) 94167578Swollman so_closefds(msg->msg_control, msg->msg_controllen, 94267578Swollman !(flags & MSG_XPG4_2), 0); 94367578Swollman if (msg->msg_namelen != 0) 94467578Swollman kmem_free(msg->msg_name, (size_t)msg->msg_namelen); 94567578Swollman if (msg->msg_controllen != 0) 94675267Swollman kmem_free(msg->msg_control, (size_t)msg->msg_controllen); 94775267Swollman return (set_errno(error)); 94875267Swollman} 94975267Swollman 95086222Swollman/* 95186222Swollman * Native system call 952105196Swollman */ 953163302Srussize_t 954105196Swollmanrecv(int sock, void *buffer, size_t len, int flags) 955181421Sedwin{ 956105196Swollman struct nmsghdr lmsg; 957121098Swollman struct uio auio; 958181421Sedwin struct iovec aiov[1]; 959136638Swollman 960136638Swollman dprint(1, ("recv(%d, %p, %ld, %d)\n", 961181421Sedwin sock, buffer, len, flags)); 962136638Swollman 963153670Swollman if ((ssize_t)len < 0) { 964153670Swollman return (set_errno(EINVAL)); 965163302Sru } 966172479Sedwin 967172479Sedwin aiov[0].iov_base = buffer; 968172479Sedwin aiov[0].iov_len = len; 969172479Sedwin auio.uio_loffset = 0; 970181421Sedwin auio.uio_iov = aiov; 971181421Sedwin auio.uio_iovcnt = 1; 972183066Sedwin auio.uio_resid = len; 973183536Sedwin auio.uio_segflg = UIO_USERSPACE; 974183536Sedwin auio.uio_limit = 0; 975183536Sedwin 976183536Sedwin lmsg.msg_namelen = 0; 977183536Sedwin lmsg.msg_controllen = 0; 978183536Sedwin lmsg.msg_flags = 0; 979183536Sedwin return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL)); 980183536Sedwin} 981183536Sedwin 982183536Sedwinssize_t 983183536Sedwinrecvfrom(int sock, void *buffer, size_t len, int flags, 984183536Sedwin struct sockaddr *name, socklen_t *namelenp) 985183536Sedwin{ 986183536Sedwin struct nmsghdr lmsg; 987183536Sedwin struct uio auio; 988183536Sedwin struct iovec aiov[1]; 989183536Sedwin 990183536Sedwin dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n", 991183536Sedwin sock, buffer, len, flags, 
name, namelenp)); 992183536Sedwin 993183536Sedwin if ((ssize_t)len < 0) { 994183536Sedwin return (set_errno(EINVAL)); 995183536Sedwin } 996183536Sedwin 997183536Sedwin aiov[0].iov_base = buffer; 998121098Swollman aiov[0].iov_len = len; 999136638Swollman auio.uio_loffset = 0; 100020094Swollman auio.uio_iov = aiov; 10012742Swollman auio.uio_iovcnt = 1; 100220094Swollman auio.uio_resid = len; 1003136638Swollman auio.uio_segflg = UIO_USERSPACE; 10042742Swollman auio.uio_limit = 0; 100558787Sru 1006136638Swollman lmsg.msg_name = (char *)name; 1007136638Swollman if (namelenp != NULL) { 1008136638Swollman if (copyin(namelenp, &lmsg.msg_namelen, 1009136638Swollman sizeof (lmsg.msg_namelen))) 101058787Sru return (set_errno(EFAULT)); 1011136638Swollman } else { 1012136638Swollman lmsg.msg_namelen = 0; 1013136638Swollman } 1014136638Swollman lmsg.msg_controllen = 0; 1015136638Swollman lmsg.msg_flags = 0; 101620094Swollman 101758787Sru return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL)); 101875267Swollman} 1019121098Swollman 1020121098Swollman/* 1021121098Swollman * Uses the MSG_XPG4_2 flag to determine if the caller is using 102243543Swollman * struct omsghdr or struct nmsghdr. 
102358787Sru */ 102458787Srussize_t 102543543Swollmanrecvmsg(int sock, struct nmsghdr *msg, int flags) 1026181421Sedwin{ 1027181421Sedwin STRUCT_DECL(nmsghdr, u_lmsg); 1028181421Sedwin STRUCT_HANDLE(nmsghdr, umsgptr); 1029181421Sedwin struct nmsghdr lmsg; 1030181421Sedwin struct uio auio; 1031181421Sedwin struct iovec aiov[MSG_MAXIOVLEN]; 1032181421Sedwin int iovcnt; 103358787Sru ssize_t len; 103475267Swollman int i; 103520094Swollman int *flagsp; 103658787Sru model_t model; 103758787Sru 103875267Swollman dprint(1, ("recvmsg(%d, %p, %d)\n", 103986222Swollman sock, msg, flags)); 1040105196Swollman 1041105196Swollman model = get_udatamodel(); 104220094Swollman STRUCT_INIT(u_lmsg, model); 104375267Swollman STRUCT_SET_HANDLE(umsgptr, model, msg); 104475267Swollman 104575267Swollman if (flags & MSG_XPG4_2) { 104675267Swollman if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg))) 104775267Swollman return (set_errno(EFAULT)); 104886222Swollman flagsp = STRUCT_FADDR(umsgptr, msg_flags); 1049105196Swollman } else { 1050105196Swollman /* 105175267Swollman * Assumes that nmsghdr and omsghdr are identically shaped 105258787Sru * except for the added msg_flags field. 105343543Swollman */ 105458787Sru if (copyin(msg, STRUCT_BUF(u_lmsg), 105558787Sru SIZEOF_STRUCT(omsghdr, model))) 1056121098Swollman return (set_errno(EFAULT)); 1057242208Sedwin STRUCT_FSET(u_lmsg, msg_flags, 0); 1058242208Sedwin flagsp = NULL; 105920094Swollman } 106058787Sru 106143543Swollman /* 106258787Sru * Code below us will kmem_alloc memory and hang it 106358787Sru * off msg_control and msg_name fields. This forces 106458787Sru * us to copy the structure to its native form. 
106558787Sru */ 106675267Swollman lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name); 106786222Swollman lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen); 1068105196Swollman lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov); 1069105196Swollman lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen); 107043543Swollman lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control); 1071121098Swollman lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen); 1072121098Swollman lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags); 1073121098Swollman 1074121098Swollman iovcnt = lmsg.msg_iovlen; 1075121098Swollman 1076226752Sedwin if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { 1077242208Sedwin return (set_errno(EMSGSIZE)); 1078242208Sedwin } 1079121098Swollman 1080121098Swollman#ifdef _SYSCALL32_IMPL 108158787Sru /* 108258787Sru * 32-bit callers need to have their iovec expanded, while ensuring 10832742Swollman * that they can't move more than 2Gbytes of data in a single call. 108458787Sru */ 108558787Sru if (model == DATAMODEL_ILP32) { 108658787Sru struct iovec32 aiov32[MSG_MAXIOVLEN]; 108720094Swollman ssize32_t count32; 1088121098Swollman 1089121098Swollman if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, 109058787Sru iovcnt * sizeof (struct iovec32))) 109120094Swollman return (set_errno(EFAULT)); 1092121098Swollman 1093121098Swollman count32 = 0; 1094121098Swollman for (i = 0; i < iovcnt; i++) { 1095138323Swollman ssize32_t iovlen32; 1096136638Swollman 1097121098Swollman iovlen32 = aiov32[i].iov_len; 1098181421Sedwin count32 += iovlen32; 109943543Swollman if (iovlen32 < 0 || count32 < 0) 110058787Sru return (set_errno(EINVAL)); 110158787Sru aiov[i].iov_len = iovlen32; 110243543Swollman aiov[i].iov_base = 110358787Sru (caddr_t)(uintptr_t)aiov32[i].iov_base; 110458787Sru } 110558787Sru } else 110658787Sru#endif /* _SYSCALL32_IMPL */ 110775267Swollman if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) { 110875267Swollman return (set_errno(EFAULT)); 110958787Sru } 111075267Swollman 
len = 0; 1111121098Swollman for (i = 0; i < iovcnt; i++) { 1112121098Swollman ssize_t iovlen = aiov[i].iov_len; 11132742Swollman len += iovlen; 111458787Sru if (iovlen < 0 || len < 0) { 111558787Sru return (set_errno(EINVAL)); 111658787Sru } 111758787Sru } 111820094Swollman auio.uio_loffset = 0; 111975267Swollman auio.uio_iov = aiov; 112075267Swollman auio.uio_iovcnt = iovcnt; 112175267Swollman auio.uio_resid = len; 112275267Swollman auio.uio_segflg = UIO_USERSPACE; 112375267Swollman auio.uio_limit = 0; 112475267Swollman 1125181421Sedwin if (lmsg.msg_control != NULL && 1126181421Sedwin (do_useracc == 0 || 112775267Swollman useracc(lmsg.msg_control, lmsg.msg_controllen, 112858787Sru B_WRITE) != 0)) { 112975267Swollman return (set_errno(EFAULT)); 113058787Sru } 1131181421Sedwin 1132181421Sedwin return (recvit(sock, &lmsg, &auio, flags, 11332742Swollman STRUCT_FADDR(umsgptr, msg_namelen), 11342742Swollman STRUCT_FADDR(umsgptr, msg_controllen), flagsp)); 11352742Swollman} 113614343Swollman 113714343Swollman/* 113814343Swollman * Common send function. 
113943014Swollman */ 114043014Swollmanstatic ssize_t 114143014Swollmansendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags) 114243014Swollman{ 11432742Swollman struct sonode *so; 114458787Sru file_t *fp; 114558787Sru void *name; 114658787Sru socklen_t namelen; 114758787Sru void *control; 1148169811Swollman socklen_t controllen; 1149169811Swollman ssize_t len; 115075267Swollman int error; 1151169811Swollman 1152169811Swollman if ((so = getsonode(sock, &error, &fp)) == NULL) 1153169811Swollman return (set_errno(error)); 1154169811Swollman 1155169811Swollman uiop->uio_fmode = fp->f_flag; 1156169811Swollman 115775267Swollman if (so->so_family == AF_UNIX) 1158169811Swollman uiop->uio_extflg = UIO_COPY_CACHED; 1159169811Swollman else 1160169811Swollman uiop->uio_extflg = UIO_COPY_DEFAULT; 1161169811Swollman 1162169811Swollman /* Allocate and copyin name and control */ 1163169811Swollman name = msg->msg_name; 1164169811Swollman namelen = msg->msg_namelen; 1165114173Swollman if (name != NULL && namelen != 0) { 1166176974Sedwin ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1167176974Sedwin name = copyin_name(so, 1168176974Sedwin (struct sockaddr *)name, 1169176974Sedwin &namelen, &error); 1170240457Sedwin if (name == NULL) 1171176974Sedwin goto done3; 1172176974Sedwin /* copyin_name null terminates addresses for AF_UNIX */ 1173176974Sedwin msg->msg_namelen = namelen; 1174176974Sedwin msg->msg_name = name; 1175176974Sedwin } else { 1176176974Sedwin msg->msg_name = name = NULL; 1177176974Sedwin msg->msg_namelen = namelen = 0; 1178176974Sedwin } 1179181421Sedwin 1180176974Sedwin control = msg->msg_control; 1181240457Sedwin controllen = msg->msg_controllen; 1182176974Sedwin if ((control != NULL) && (controllen != 0)) { 1183176974Sedwin /* 1184176974Sedwin * Verify that the length is not excessive to prevent 1185176974Sedwin * an application from consuming all of kernel memory. 
1186204887Sedwin */ 1187204887Sedwin if (controllen > SO_MAXARGSIZE) { 1188204887Sedwin error = EINVAL; 1189204887Sedwin goto done2; 1190204887Sedwin } 1191204887Sedwin control = kmem_alloc(controllen, KM_SLEEP); 1192204887Sedwin 1193204887Sedwin ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1194204887Sedwin if (copyin(msg->msg_control, control, controllen)) { 1195204887Sedwin error = EFAULT; 1196204887Sedwin goto done1; 1197204887Sedwin } 1198219411Sedwin msg->msg_control = control; 1199219411Sedwin } else { 1200219411Sedwin msg->msg_control = control = NULL; 1201219411Sedwin msg->msg_controllen = controllen = 0; 1202219411Sedwin } 1203219411Sedwin 1204219411Sedwin len = uiop->uio_resid; 1205219411Sedwin msg->msg_flags = flags; 1206219411Sedwin 1207219411Sedwin error = SOP_SENDMSG(so, msg, uiop); 1208219411Sedwindone1: 1209219411Sedwin if (control != NULL) 1210219411Sedwin kmem_free(control, controllen); 1211219411Sedwindone2: 1212219411Sedwin if (name != NULL) 1213219411Sedwin kmem_free(name, namelen); 1214219411Sedwindone3: 1215220286Sedwin if (error != 0) { 1216220286Sedwin releasef(sock); 1217220286Sedwin return (set_errno(error)); 1218220286Sedwin } 1219220286Sedwin lwp_stat_update(LWP_STAT_MSGSND, 1); 1220220286Sedwin so_update_attrs(so, SOMOD); 1221220286Sedwin releasef(sock); 1222220286Sedwin return (len - uiop->uio_resid); 1223220286Sedwin} 1224220286Sedwin 1225220286Sedwin/* 1226220286Sedwin * Native system call 1227220286Sedwin */ 1228233445Sedwinssize_t 1229233445Sedwinsend(int sock, void *buffer, size_t len, int flags) 1230233445Sedwin{ 1231233445Sedwin struct nmsghdr lmsg; 1232233445Sedwin struct uio auio; 1233233445Sedwin struct iovec aiov[1]; 1234233445Sedwin 1235233445Sedwin dprint(1, ("send(%d, %p, %ld, %d)\n", 1236233445Sedwin sock, buffer, len, flags)); 1237233445Sedwin 1238233445Sedwin if ((ssize_t)len < 0) { 1239233445Sedwin return (set_errno(EINVAL)); 1240233445Sedwin } 1241233445Sedwin 1242233445Sedwin aiov[0].iov_base = buffer; 1243233445Sedwin 
aiov[0].iov_len = len; 1244248307Sedwin auio.uio_loffset = 0; 1245248307Sedwin auio.uio_iov = aiov; 1246248307Sedwin auio.uio_iovcnt = 1; 1247248307Sedwin auio.uio_resid = len; 1248248307Sedwin auio.uio_segflg = UIO_USERSPACE; 1249248307Sedwin auio.uio_limit = 0; 1250248307Sedwin 1251233445Sedwin lmsg.msg_name = NULL; 1252233445Sedwin lmsg.msg_control = NULL; 1253233445Sedwin if (!(flags & MSG_XPG4_2)) { 12542742Swollman /* 1255169811Swollman * In order to be compatible with the libsocket/sockmod 125658787Sru * implementation we set EOR for all send* calls. 1257169811Swollman */ 1258169811Swollman flags |= MSG_EOR; 1259169811Swollman } 1260169811Swollman return (sendit(sock, &lmsg, &auio, flags)); 1261169811Swollman} 1262169811Swollman 1263169811Swollman/* 1264169811Swollman * Uses the MSG_XPG4_2 flag to determine if the caller is using 1265169811Swollman * struct omsghdr or struct nmsghdr. 1266169811Swollman */ 1267169811Swollmanssize_t 1268169811Swollmansendmsg(int sock, struct nmsghdr *msg, int flags) 1269169811Swollman{ 1270169811Swollman struct nmsghdr lmsg; 1271169811Swollman STRUCT_DECL(nmsghdr, u_lmsg); 1272169811Swollman struct uio auio; 1273169811Swollman struct iovec aiov[MSG_MAXIOVLEN]; 1274169811Swollman int iovcnt; 1275169811Swollman ssize_t len; 1276169811Swollman int i; 1277169811Swollman model_t model; 1278169811Swollman 1279169811Swollman dprint(1, ("sendmsg(%d, %p, %d)\n", sock, msg, flags)); 1280169811Swollman 128175267Swollman model = get_udatamodel(); 128275267Swollman STRUCT_INIT(u_lmsg, model); 1283220286Sedwin 1284176974Sedwin if (flags & MSG_XPG4_2) { 1285176974Sedwin if (copyin(msg, (char *)STRUCT_BUF(u_lmsg), 1286176974Sedwin STRUCT_SIZE(u_lmsg))) 1287176974Sedwin return (set_errno(EFAULT)); 1288204887Sedwin } else { 1289220286Sedwin /* 1290220286Sedwin * Assumes that nmsghdr and omsghdr are identically shaped 1291233445Sedwin * except for the added msg_flags field. 
1292248307Sedwin */ 1293248307Sedwin if (copyin(msg, (char *)STRUCT_BUF(u_lmsg), 1294169811Swollman SIZEOF_STRUCT(omsghdr, model))) 129558787Sru return (set_errno(EFAULT)); 12962742Swollman /* 1297169811Swollman * In order to be compatible with the libsocket/sockmod 1298169811Swollman * implementation we set EOR for all send* calls. 1299169811Swollman */ 1300169811Swollman flags |= MSG_EOR; 1301169811Swollman } 1302169811Swollman 1303169811Swollman /* 130419878Swollman * Code below us will kmem_alloc memory and hang it 1305169811Swollman * off msg_control and msg_name fields. This forces 1306169811Swollman * us to copy the structure to its native form. 1307158421Swollman */ 130819878Swollman lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name); 13092742Swollman lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen); 131086222Swollman lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov); 131186222Swollman lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen); 131286222Swollman lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control); 13132742Swollman lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen); 13142742Swollman lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags); 1315248307Sedwin 1316248307Sedwin iovcnt = lmsg.msg_iovlen; 1317248307Sedwin 1318248307Sedwin if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { 131920094Swollman /* 1320158421Swollman * Unless this is XPG 4.2 we allow iovcnt == 0 to 1321158421Swollman * be compatible with SunOS 4.X and 4.4BSD. 13222742Swollman */ 1323248307Sedwin if (iovcnt != 0 || (flags & MSG_XPG4_2)) 1324248307Sedwin return (set_errno(EMSGSIZE)); 132520094Swollman } 13262742Swollman 13272742Swollman#ifdef _SYSCALL32_IMPL 13282742Swollman /* 13292742Swollman * 32-bit callers need to have their iovec expanded, while ensuring 1330248307Sedwin * that they can't move more than 2Gbytes of data in a single call. 
1331248307Sedwin */ 1332158421Swollman if (model == DATAMODEL_ILP32) { 1333169811Swollman struct iovec32 aiov32[MSG_MAXIOVLEN]; 1334158421Swollman ssize32_t count32; 1335158421Swollman 1336158421Swollman if (iovcnt != 0 && 1337158421Swollman copyin((struct iovec32 *)lmsg.msg_iov, aiov32, 1338158421Swollman iovcnt * sizeof (struct iovec32))) 1339169811Swollman return (set_errno(EFAULT)); 1340158421Swollman 1341158421Swollman count32 = 0; 1342158421Swollman for (i = 0; i < iovcnt; i++) { 1343158421Swollman ssize32_t iovlen32; 1344158421Swollman 1345158421Swollman iovlen32 = aiov32[i].iov_len; 1346169811Swollman count32 += iovlen32; 13472742Swollman if (iovlen32 < 0 || count32 < 0) 1348248307Sedwin return (set_errno(EINVAL)); 134919878Swollman aiov[i].iov_len = iovlen32; 13502742Swollman aiov[i].iov_base = 13512742Swollman (caddr_t)(uintptr_t)aiov32[i].iov_base; 1352223629Sedwin } 1353223629Sedwin } else 1354223629Sedwin#endif /* _SYSCALL32_IMPL */ 1355223629Sedwin if (iovcnt != 0 && 1356223629Sedwin copyin(lmsg.msg_iov, aiov, 1357223629Sedwin (unsigned)iovcnt * sizeof (struct iovec))) { 1358223629Sedwin return (set_errno(EFAULT)); 1359223629Sedwin } 13602742Swollman len = 0; 1361169811Swollman for (i = 0; i < iovcnt; i++) { 1362248307Sedwin ssize_t iovlen = aiov[i].iov_len; 1363248307Sedwin len += iovlen; 1364169811Swollman if (iovlen < 0 || len < 0) { 1365169811Swollman return (set_errno(EINVAL)); 1366169811Swollman } 1367169811Swollman } 1368169811Swollman auio.uio_loffset = 0; 1369169811Swollman auio.uio_iov = aiov; 13702742Swollman auio.uio_iovcnt = iovcnt; 13712742Swollman auio.uio_resid = len; 137219878Swollman auio.uio_segflg = UIO_USERSPACE; 137319878Swollman auio.uio_limit = 0; 13742742Swollman 137519878Swollman return (sendit(sock, &lmsg, &auio, flags)); 137619878Swollman} 13772742Swollman 13782742Swollmanssize_t 137975267Swollmansendto(int sock, void *buffer, size_t len, int flags, 1380158421Swollman struct sockaddr *name, socklen_t namelen) 
1381158421Swollman{ 1382158421Swollman struct nmsghdr lmsg; 138375267Swollman struct uio auio; 138475267Swollman struct iovec aiov[1]; 138575267Swollman 138675267Swollman dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n", 138775267Swollman sock, buffer, len, flags, name, namelen)); 138875267Swollman 138975267Swollman if ((ssize_t)len < 0) { 139075267Swollman return (set_errno(EINVAL)); 139175267Swollman } 139275267Swollman 139375267Swollman aiov[0].iov_base = buffer; 139475267Swollman aiov[0].iov_len = len; 139575267Swollman auio.uio_loffset = 0; 139675267Swollman auio.uio_iov = aiov; 139775267Swollman auio.uio_iovcnt = 1; 139875267Swollman auio.uio_resid = len; 139975267Swollman auio.uio_segflg = UIO_USERSPACE; 140075267Swollman auio.uio_limit = 0; 140175267Swollman 140275267Swollman lmsg.msg_name = (char *)name; 140375267Swollman lmsg.msg_namelen = namelen; 140475267Swollman lmsg.msg_control = NULL; 140575267Swollman if (!(flags & MSG_XPG4_2)) { 140675267Swollman /* 140775267Swollman * In order to be compatible with the libsocket/sockmod 140875267Swollman * implementation we set EOR for all send* calls. 
140975267Swollman */ 141075267Swollman flags |= MSG_EOR; 141175267Swollman } 141275267Swollman return (sendit(sock, &lmsg, &auio, flags)); 141375267Swollman} 141475267Swollman 141575267Swollman/*ARGSUSED3*/ 141675267Swollmanint 141775267Swollmangetpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version) 141875267Swollman{ 141975267Swollman struct sonode *so; 142075267Swollman int error; 142175267Swollman socklen_t namelen; 142275267Swollman union { 142375267Swollman struct sockaddr_in sin; 142475267Swollman struct sockaddr_in6 sin6; 142575267Swollman } sin; /* Temporary buffer, common case */ 142675267Swollman void *addr; /* Temporary buffer, uncommon case */ 142775267Swollman socklen_t addrlen, size; 142875267Swollman 1429220549Sedwin dprint(1, ("getpeername(%d, %p, %p)\n", 1430220549Sedwin sock, name, namelenp)); 1431220549Sedwin 1432220549Sedwin if ((so = getsonode(sock, &error, NULL)) == NULL) 1433220549Sedwin goto bad; 1434220549Sedwin 1435220549Sedwin ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1436220549Sedwin if (copyin(namelenp, &namelen, sizeof (namelen)) || 1437220549Sedwin (name == NULL && namelen != 0)) { 1438220549Sedwin error = EFAULT; 1439220549Sedwin goto rel_out; 1440220549Sedwin } 1441220549Sedwin /* 1442220549Sedwin * If a connect or accept has been done, unless we're an Xnet socket, 1443220549Sedwin * the remote address has already been updated in so_faddr_sa. 
1444220549Sedwin */ 1445220549Sedwin if (so->so_version != SOV_SOCKSTREAM && so->so_version != SOV_SOCKBSD || 1446220549Sedwin !(so->so_state & SS_FADDR_VALID)) { 1447233445Sedwin if ((error = SOP_GETPEERNAME(so)) != 0) 1448233445Sedwin goto rel_out; 1449233445Sedwin } 1450233445Sedwin 1451233445Sedwin if (so->so_faddr_maxlen <= sizeof (sin)) { 1452233445Sedwin size = 0; 1453233445Sedwin addr = &sin; 1454233445Sedwin } else { 1455233445Sedwin /* 1456233445Sedwin * Allocate temporary to avoid holding so_lock across 1457233445Sedwin * copyout 1458233445Sedwin */ 1459233445Sedwin size = so->so_faddr_maxlen; 1460233445Sedwin addr = kmem_alloc(size, KM_SLEEP); 1461233445Sedwin } 14622742Swollman /* Prevent so_faddr_sa/len from changing while accessed */ 146319878Swollman mutex_enter(&so->so_lock); 146419878Swollman if (!(so->so_state & SS_ISCONNECTED)) { 146519878Swollman mutex_exit(&so->so_lock); 146619878Swollman error = ENOTCONN; 146719878Swollman goto free_out; 146819878Swollman } 146919878Swollman addrlen = so->so_faddr_len; 147019878Swollman bcopy(so->so_faddr_sa, addr, addrlen); 147175267Swollman mutex_exit(&so->so_lock); 147275267Swollman 1473220549Sedwin ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1474233445Sedwin error = copyout_name(name, namelen, namelenp, addr, 14752742Swollman (so->so_state & SS_FADDR_NOXLATE) ? 0 : addrlen); 14762742Swollmanfree_out: 147719878Swollman if (size != 0) 147819878Swollman kmem_free(addr, size); 147919878Swollmanrel_out: 1480233445Sedwin releasef(sock); 1481233445Sedwinbad: return (error != 0 ? 
set_errno(error) : 0); 14822742Swollman} 14832742Swollman 14842742Swollman/*ARGSUSED3*/ 14852742Swollmanint 148619878Swollmangetsockname(int sock, struct sockaddr *name, 148719878Swollman socklen_t *namelenp, int version) 14882742Swollman{ 14892742Swollman struct sonode *so; 14902742Swollman int error; 14912742Swollman socklen_t namelen; 149219878Swollman union { 149319878Swollman struct sockaddr_in sin; 149420094Swollman struct sockaddr_in6 sin6; 149520094Swollman } sin; /* Temporary buffer, common case */ 149620094Swollman void *addr; /* Temporary buffer, uncommon case */ 14972742Swollman socklen_t addrlen, size; 14982742Swollman 1499158421Swollman dprint(1, ("getsockname(%d, %p, %p)\n", 1500158421Swollman sock, name, namelenp)); 1501158421Swollman 150258787Sru if ((so = getsonode(sock, &error, NULL)) == NULL) 15032742Swollman goto bad; 150458787Sru 150519878Swollman ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 150658787Sru if (copyin(namelenp, &namelen, sizeof (namelen)) || 150719878Swollman (name == NULL && namelen != 0)) { 150820094Swollman error = EFAULT; 150920094Swollman goto rel_out; 151020094Swollman } 151120094Swollman 151220094Swollman /* 151330711Swollman * If a bind or accept has been done, unless we're an Xnet endpoint, 151420094Swollman * the local address has already been updated in so_laddr_sa. 
151558787Sru */ 151675267Swollman if ((so->so_version != SOV_SOCKSTREAM && 151767578Swollman so->so_version != SOV_SOCKBSD) || 151867578Swollman !(so->so_state & SS_LADDR_VALID)) { 151967578Swollman if ((error = SOP_GETSOCKNAME(so)) != 0) 152075267Swollman goto rel_out; 152175267Swollman } 152275267Swollman 152375267Swollman if (so->so_laddr_maxlen <= sizeof (sin)) { 152475267Swollman size = 0; 152575267Swollman addr = &sin; 152675267Swollman } else { 152775267Swollman /* 152875267Swollman * Allocate temporary to avoid holding so_lock across 152975267Swollman * copyout 153075267Swollman */ 153175267Swollman size = so->so_laddr_maxlen; 153293799Swollman addr = kmem_alloc(size, KM_SLEEP); 1533158421Swollman } 153475267Swollman /* Prevent so_laddr_sa/len from changing while accessed */ 1535158421Swollman mutex_enter(&so->so_lock); 153675267Swollman addrlen = so->so_laddr_len; 153793799Swollman bcopy(so->so_laddr_sa, addr, addrlen); 153893799Swollman mutex_exit(&so->so_lock); 153993799Swollman 154093799Swollman ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 154193799Swollman error = copyout_name(name, namelen, namelenp, 1542149514Swollman addr, addrlen); 1543149514Swollman if (size != 0) 1544149514Swollman kmem_free(addr, size); 1545149514Swollmanrel_out: 1546149514Swollman releasef(sock); 1547149514Swollmanbad: return (error != 0 ? 
set_errno(error) : 0); 1548149514Swollman} 1549169811Swollman 1550169811Swollman/*ARGSUSED5*/ 1551169811Swollmanint 1552204566Sedwingetsockopt(int sock, 1553204566Sedwin int level, 1554204566Sedwin int option_name, 1555204566Sedwin void *option_value, 1556204566Sedwin socklen_t *option_lenp, 1557204566Sedwin int version) 1558204566Sedwin{ 1559204566Sedwin struct sonode *so; 1560204566Sedwin socklen_t optlen, optlen_res; 1561204566Sedwin void *optval; 1562204566Sedwin int error; 1563204566Sedwin 1564204566Sedwin dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n", 1565204566Sedwin sock, level, option_name, option_value, option_lenp)); 1566204566Sedwin 1567204566Sedwin if ((so = getsonode(sock, &error, NULL)) == NULL) 1568204887Sedwin return (set_errno(error)); 1569204566Sedwin 1570248307Sedwin ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1571248307Sedwin if (copyin(option_lenp, &optlen, sizeof (optlen))) { 1572248307Sedwin releasef(sock); 1573248307Sedwin return (set_errno(EFAULT)); 1574248307Sedwin } 1575248307Sedwin /* 1576248307Sedwin * Verify that the length is not excessive to prevent 1577248307Sedwin * an application from consuming all of kernel memory. 1578248307Sedwin */ 157975267Swollman if (optlen > SO_MAXARGSIZE) { 15802742Swollman error = EINVAL; 15812742Swollman releasef(sock); 158219878Swollman return (set_errno(error)); 158319878Swollman } 158419878Swollman optval = kmem_alloc(optlen, KM_SLEEP); 158519878Swollman optlen_res = optlen; 15862742Swollman error = SOP_GETSOCKOPT(so, level, option_name, optval, 15872742Swollman &optlen_res, (version != SOV_XPG4_2) ? 
	    0 : _SOGETSOCKOPT_XPG4_2);
	releasef(sock);
	if (error) {
		kmem_free(optval, optlen);
		return (set_errno(error));
	}
	error = copyout_arg(option_value, optlen, option_lenp,
	    optval, optlen_res);
	kmem_free(optval, optlen);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * setsockopt: copy an option value in from the caller and pass it to the
 * socket's SOP_SETSOCKOPT operation.
 *
 * option_value is a user address and option_len its length in bytes.
 * Lengths above SO_MAXARGSIZE are rejected with EINVAL.  Values that fit
 * in the small on-stack buffer are copied there; larger ones use a
 * temporary kmem_alloc() buffer that is freed before returning.  When
 * option_value is NULL the length is forced to 0 and a NULL value is
 * passed down.
 *
 * Returns 0 on success; otherwise returns set_errno(error), where error is
 * EINVAL, EFAULT (copyin failed) or whatever getsonode()/SOP_SETSOCKOPT
 * reported.  The version argument is not used here (hence ARGSUSED).
 */
/*ARGSUSED5*/
int
setsockopt(int sock,
	int level,
	int option_name,
	void *option_value,
	socklen_t option_len,
	int version)
{
	struct sonode *so;
	intptr_t buffer[2];	/* on-stack storage for small option values */
	void *optval = NULL;
	int error;

	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
		sock, level, option_name, option_value, option_len));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	if (option_value != NULL) {
		if (option_len != 0) {
			/*
			 * Verify that the length is not excessive to prevent
			 * an application from consuming all of kernel memory.
			 */
			if (option_len > SO_MAXARGSIZE) {
				error = EINVAL;
				/* Nothing allocated yet: skip the free. */
				goto done2;
			}
			/* Only allocate when the stack buffer is too small. */
			optval = option_len <= sizeof (buffer) ?
			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
			if (copyin(option_value, optval, (size_t)option_len)) {
				error = EFAULT;
				goto done1;
			}
		}
	} else
		option_len = 0;

	error = SOP_SETSOCKOPT(so, level, option_name, optval,
	    (t_uscalar_t)option_len);
done1:
	/*
	 * Free the value unless it lives in the stack buffer.
	 * NOTE(review): when no value was copied in, optval is NULL and
	 * option_len is 0 here, so this calls kmem_free(NULL, 0); that
	 * relies on kmem_free() accepting such a pair - confirm.
	 */
	if (optval != buffer)
		kmem_free(optval, (size_t)option_len);
done2:
	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * Add config info when devpath is non-NULL; delete info when devpath is NULL.
 * devpath is a user address.
 *
 * The caller must pass secpolicy_net_config(); otherwise EPERM is returned.
 * Returns 0 on success, otherwise set_errno() of the copyinstr() or
 * soconfig() error.
 */
int
sockconfig(int domain, int type, int protocol, char *devpath)
{
	char *kdevpath;		/* Copied in devpath string */
	size_t kdevpathlen;	/* Length reported by copyinstr() */
	int error = 0;

	dprint(1, ("sockconfig(%d, %d, %d, %p)\n",
		domain, type, protocol, devpath));

	if (secpolicy_net_config(CRED(), B_FALSE) != 0)
		return (set_errno(EPERM));

	if (devpath == NULL) {
		/* Deleting an entry */
		kdevpath = NULL;
		kdevpathlen = 0;
	} else {
		/*
		 * Adding an entry.
		 * Copyin the devpath.
		 * This also makes it possible to check for too long pathnames.
		 * Compress the space needed for the devpath before passing it
		 * to soconfig - soconfig will store the string until
		 * the configuration is removed.
		 */
		char *buf;

		/* Scratch buffer sized for the longest path we accept. */
		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		if ((error = copyinstr(devpath, buf, MAXPATHLEN,
		    &kdevpathlen)) != 0) {
			kmem_free(buf, MAXPATHLEN);
			goto done;
		}

		/* Right-sized copy that is handed to soconfig(). */
		kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP);
		bcopy(buf, kdevpath, kdevpathlen);
		kdevpath[kdevpathlen - 1] = '\0';

		kmem_free(buf, MAXPATHLEN);
	}
	/*
	 * NOTE(review): kdevpath is not freed here; whether soconfig()
	 * releases it when it fails is not visible from this file - confirm.
	 */
	error = soconfig(domain, type, protocol, kdevpath, (int)kdevpathlen);
done:
	if (error) {
		eprintline(error);
		return (set_errno(error));
	}
	return (0);
}


/*
 * Sendfile is implemented through two schemes, direct I/O or by
 * caching in the filesystem page cache. We cache the input file by
 * default and use direct I/O only if sendfile_max_size is set
 * appropriately as explained below. Note that this logic is consistent
 * with other filesystems where caching is turned on by default
 * unless explicitly turned off by using the DIRECTIO ioctl.
 *
 * We choose a slightly different scheme here. One can turn off
 * caching by setting sendfile_max_size to 0. One can also enable
 * caching of files <= sendfile_max_size by setting sendfile_max_size
 * to an appropriate value.
 * By default sendfile_max_size is set to the
 * maximum value so that all files are cached. In future, we may provide
 * better interfaces for caching the file.
 *
 * Sendfile through Direct I/O (Zero copy)
 * --------------------------------------
 *
 * As disks are normally slower than the network, we can't have a
 * single thread that reads the disk and writes to the network. We
 * need to have parallelism. This is done by having the sendfile
 * thread create another thread that reads from the filesystem
 * and queues it for network processing. In this scheme, the data
 * is never copied anywhere, i.e. it is zero copy unlike the other
 * scheme.
 *
 * We have a sendfile queue (snfq) where each sendfile
 * request (snf_req_t) is queued for processing by a thread. The number
 * of threads is dynamically allocated and they exit if they are idling
 * beyond a specified amount of time. When each request (snf_req_t) is
 * processed by a thread, it produces a number of mblk_t structures to
 * be consumed by the sendfile thread. snf_deque and snf_enque are
 * used for consuming and producing mblks. The size of the filesystem
 * read is determined by the tunable (sendfile_read_size). A single
 * mblk holds sendfile_read_size worth of data (except the last
 * read of the file) which is sent down as a whole to the network.
 * sendfile_read_size is set to 1 MB as this seems to be the optimal
 * value for the UFS filesystem backed by a striped storage array.
 *
 * Synchronisation between read (producer) and write (consumer) threads.
 * --------------------------------------------------------------------
 *
 * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
 * adding and deleting items in this list. An error can happen anytime
 * during read or write.
 * There could be unprocessed mblks in the
 * sr_mp_XXX list when a read or write error occurs. Whenever an error
 * is encountered, we need two things to happen :
 *
 * a) One of the threads needs to clean up the mblks.
 * b) When one thread encounters an error, the other should stop.
 *
 * For (a), we don't want to penalise the reader thread as it could do
 * some useful work processing other requests. For (b), the error can
 * be detected by examining sr_read_error or sr_write_error.
 * sr_lock protects sr_read_error and sr_write_error. If both reader and
 * writer encounter an error, we need to report the write error back to
 * the application as that's what would have happened if the operations
 * were done sequentially. With this in mind, the following should work :
 *
 *	- Check for errors before read or write.
 *	- If the reader encounters an error, set the error in sr_read_error.
 *	  Check sr_write_error; if it is set, send cv_signal as the writer
 *	  is waiting for the reader to complete. If it is not set, the
 *	  writer is either running, sinking data to the network, or blocked
 *	  because of flow control. For handling the latter case, we
 *	  always send a signal. In any case, it will examine sr_read_error
 *	  and return. sr_read_error is marked with SR_READ_DONE to tell
 *	  the writer that the reader is done in all the cases.
 *	- If the writer encounters an error, set the error in sr_write_error.
 *	  The reader thread is either blocked because of flow control or
 *	  running, reading data from the disk. For the former, we need to
 *	  wake up the thread. Again to keep it simple, we always wake up
 *	  the reader thread. Then, wait for the read thread to complete
 *	  if it is not done yet. Cleanup and return.
 *
 * High and low water marks for the read thread.
 * --------------------------------------------
 *
 * If sendfile() is used to send data over a slow network, we need to
 * make sure that the read thread does not produce data at a faster
 * rate than the network. This can happen if the disk is faster than
 * the network. In such a case, we don't want to build a very large queue.
 * But we would still like to get all of the network throughput possible.
 * This implies that the network should never block waiting for data.
 * As there are a lot of disk throughput/network throughput combinations
 * possible, it is difficult to come up with an accurate number.
 * A typical 10K RPM disk has a max seek latency of 17ms and rotational
 * latency of 3ms for reading a disk block. Thus, the total latency to
 * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
 * for network is 100MB/sec. If the thread is blocked because of flow
 * control, it would take 25ms to get new data ready for transmission.
 * We have to make sure that the network is not idling while we are
 * initiating new transfers. So, at 100MB/sec, to keep the network busy we
 * would need 2.5MB of data. Rounding off, we keep the low water mark to be
 * 3MB of data.
 * We need to pick a high water mark so that the woken up thread would
 * do considerable work before blocking again to prevent thrashing. Currently,
 * we pick this to be 10 times that of the low water mark.
 *
 * Sendfile with segmap caching (One copy from page cache to mblks).
 * ----------------------------------------------------------------
 *
 * We use the segmap cache for caching the file, if the size of file
 * is <= sendfile_max_size. In this case we don't use threads as VM
 * is reasonably fast enough to keep up with the network.
If the underlying 1814 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth 1815 * of data into segmap space, and use the virtual address from segmap 1816 * directly through desballoc() to avoid copy. Once the transport is done 1817 * with the data, the mapping will be released through segmap_release() 1818 * called by the call-back routine. 1819 * 1820 * If zero-copy is not allowed by the transport, we simply call VOP_READ() 1821 * to copy the data from the filesystem into our temporary network buffer. 1822 * 1823 * To disable caching, set sendfile_max_size to 0. 1824 */ 1825 1826uint_t sendfile_read_size = 1024 * 1024; 1827#define SENDFILE_REQ_LOWAT 3 * 1024 * 1024 1828uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT; 1829uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT; 1830struct sendfile_stats sf_stats; 1831struct sendfile_queue *snfq; 1832clock_t snfq_timeout; 1833off64_t sendfile_max_size; 1834 1835static void snf_enque(snf_req_t *, mblk_t *); 1836static mblk_t *snf_deque(snf_req_t *); 1837 1838void 1839sendfile_init(void) 1840{ 1841 snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP); 1842 1843 mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL); 1844 cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL); 1845 snfq->snfq_max_threads = max_ncpus; 1846 snfq_timeout = SNFQ_TIMEOUT; 1847 /* Cache all files by default. */ 1848 sendfile_max_size = MAXOFFSET_T; 1849} 1850 1851/* 1852 * Queues a mblk_t for network processing. 
1853 */ 1854static void 1855snf_enque(snf_req_t *sr, mblk_t *mp) 1856{ 1857 mp->b_next = NULL; 1858 mutex_enter(&sr->sr_lock); 1859 if (sr->sr_mp_head == NULL) { 1860 sr->sr_mp_head = sr->sr_mp_tail = mp; 1861 cv_signal(&sr->sr_cv); 1862 } else { 1863 sr->sr_mp_tail->b_next = mp; 1864 sr->sr_mp_tail = mp; 1865 } 1866 sr->sr_qlen += MBLKL(mp); 1867 while ((sr->sr_qlen > sr->sr_hiwat) && 1868 (sr->sr_write_error == 0)) { 1869 sf_stats.ss_full_waits++; 1870 cv_wait(&sr->sr_cv, &sr->sr_lock); 1871 } 1872 mutex_exit(&sr->sr_lock); 1873} 1874 1875/* 1876 * De-queues a mblk_t for network processing. 1877 */ 1878static mblk_t * 1879snf_deque(snf_req_t *sr) 1880{ 1881 mblk_t *mp; 1882 1883 mutex_enter(&sr->sr_lock); 1884 /* 1885 * If we have encountered an error on read or read is 1886 * completed and no more mblks, return NULL. 1887 * We need to check for NULL sr_mp_head also as 1888 * the reads could have completed and there is 1889 * nothing more to come. 1890 */ 1891 if (((sr->sr_read_error & ~SR_READ_DONE) != 0) || 1892 ((sr->sr_read_error & SR_READ_DONE) && 1893 sr->sr_mp_head == NULL)) { 1894 mutex_exit(&sr->sr_lock); 1895 return (NULL); 1896 } 1897 /* 1898 * To start with neither SR_READ_DONE is marked nor 1899 * the error is set. When we wake up from cv_wait, 1900 * following are the possibilities : 1901 * 1902 * a) sr_read_error is zero and mblks are queued. 1903 * b) sr_read_error is set to SR_READ_DONE 1904 * and mblks are queued. 1905 * c) sr_read_error is set to SR_READ_DONE 1906 * and no mblks. 1907 * d) sr_read_error is set to some error other 1908 * than SR_READ_DONE. 1909 */ 1910 1911 while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) { 1912 sf_stats.ss_empty_waits++; 1913 cv_wait(&sr->sr_cv, &sr->sr_lock); 1914 } 1915 /* Handle (a) and (b) first - the normal case. 
*/ 1916 if (((sr->sr_read_error & ~SR_READ_DONE) == 0) && 1917 (sr->sr_mp_head != NULL)) { 1918 mp = sr->sr_mp_head; 1919 sr->sr_mp_head = mp->b_next; 1920 sr->sr_qlen -= MBLKL(mp); 1921 if (sr->sr_qlen < sr->sr_lowat) 1922 cv_signal(&sr->sr_cv); 1923 mutex_exit(&sr->sr_lock); 1924 mp->b_next = NULL; 1925 return (mp); 1926 } 1927 /* Handle (c) and (d). */ 1928 mutex_exit(&sr->sr_lock); 1929 return (NULL); 1930} 1931 1932/* 1933 * Reads data from the filesystem and queues it for network processing. 1934 */ 1935void 1936snf_async_read(snf_req_t *sr) 1937{ 1938 size_t iosize; 1939 u_offset_t fileoff; 1940 u_offset_t size; 1941 int ret_size; 1942 int error; 1943 file_t *fp; 1944 mblk_t *mp; 1945 1946 fp = sr->sr_fp; 1947 size = sr->sr_file_size; 1948 fileoff = sr->sr_file_off; 1949 1950 /* 1951 * Ignore the error for filesystems that doesn't support DIRECTIO. 1952 */ 1953 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0, 1954 kcred, NULL); 1955 1956 while ((size != 0) && (sr->sr_write_error == 0)) { 1957 1958 iosize = (int)MIN(sr->sr_maxpsz, size); 1959 1960 if ((mp = allocb(iosize, BPRI_MED)) == NULL) { 1961 error = EAGAIN; 1962 break; 1963 } 1964 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize); 1965 1966 /* Error or Reached EOF ? 
*/ 1967 if ((error != 0) || (ret_size == 0)) { 1968 freeb(mp); 1969 break; 1970 } 1971 mp->b_wptr = mp->b_rptr + ret_size; 1972 1973 snf_enque(sr, mp); 1974 size -= ret_size; 1975 fileoff += ret_size; 1976 } 1977 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0, 1978 kcred, NULL); 1979 mutex_enter(&sr->sr_lock); 1980 sr->sr_read_error = error; 1981 sr->sr_read_error |= SR_READ_DONE; 1982 cv_signal(&sr->sr_cv); 1983 mutex_exit(&sr->sr_lock); 1984} 1985 1986void 1987snf_async_thread(void) 1988{ 1989 snf_req_t *sr; 1990 callb_cpr_t cprinfo; 1991 clock_t time_left = 1; 1992 clock_t now; 1993 1994 CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq"); 1995 1996 mutex_enter(&snfq->snfq_lock); 1997 for (;;) { 1998 /* 1999 * If we didn't find a entry, then block until woken up 2000 * again and then look through the queues again. 2001 */ 2002 while ((sr = snfq->snfq_req_head) == NULL) { 2003 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2004 if (time_left <= 0) { 2005 snfq->snfq_svc_threads--; 2006 CALLB_CPR_EXIT(&cprinfo); 2007 thread_exit(); 2008 /* NOTREACHED */ 2009 } 2010 snfq->snfq_idle_cnt++; 2011 2012 time_to_wait(&now, snfq_timeout); 2013 time_left = cv_timedwait(&snfq->snfq_cv, 2014 &snfq->snfq_lock, now); 2015 snfq->snfq_idle_cnt--; 2016 2017 CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock); 2018 } 2019 snfq->snfq_req_head = sr->sr_next; 2020 snfq->snfq_req_cnt--; 2021 mutex_exit(&snfq->snfq_lock); 2022 snf_async_read(sr); 2023 mutex_enter(&snfq->snfq_lock); 2024 } 2025} 2026 2027 2028snf_req_t * 2029create_thread(int operation, struct vnode *vp, file_t *fp, 2030 u_offset_t fileoff, u_offset_t size) 2031{ 2032 snf_req_t *sr; 2033 stdata_t *stp; 2034 2035 sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP); 2036 2037 sr->sr_vp = vp; 2038 sr->sr_fp = fp; 2039 stp = vp->v_stream; 2040 2041 /* 2042 * store sd_qn_maxpsz into sr_maxpsz while we have stream head. 2043 * stream might be closed before thread returns from snf_async_read. 
2044 */ 2045 if (stp->sd_qn_maxpsz > 0) { 2046 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz); 2047 } else { 2048 sr->sr_maxpsz = MAXBSIZE; 2049 } 2050 2051 sr->sr_operation = operation; 2052 sr->sr_file_off = fileoff; 2053 sr->sr_file_size = size; 2054 sr->sr_hiwat = sendfile_req_hiwat; 2055 sr->sr_lowat = sendfile_req_lowat; 2056 mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL); 2057 cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL); 2058 /* 2059 * See whether we need another thread for servicing this 2060 * request. If there are already enough requests queued 2061 * for the threads, create one if not exceeding 2062 * snfq_max_threads. 2063 */ 2064 mutex_enter(&snfq->snfq_lock); 2065 if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt && 2066 snfq->snfq_svc_threads < snfq->snfq_max_threads) { 2067 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0, 2068 TS_RUN, minclsyspri); 2069 snfq->snfq_svc_threads++; 2070 } 2071 if (snfq->snfq_req_head == NULL) { 2072 snfq->snfq_req_head = snfq->snfq_req_tail = sr; 2073 cv_signal(&snfq->snfq_cv); 2074 } else { 2075 snfq->snfq_req_tail->sr_next = sr; 2076 snfq->snfq_req_tail = sr; 2077 } 2078 snfq->snfq_req_cnt++; 2079 mutex_exit(&snfq->snfq_lock); 2080 return (sr); 2081} 2082 2083int 2084snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size, 2085 ssize_t *count) 2086{ 2087 snf_req_t *sr; 2088 mblk_t *mp; 2089 int iosize; 2090 int error = 0; 2091 short fflag; 2092 struct vnode *vp; 2093 int ksize; 2094 2095 ksize = 0; 2096 *count = 0; 2097 2098 vp = fp->f_vnode; 2099 fflag = fp->f_flag; 2100 if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL) 2101 return (EAGAIN); 2102 2103 /* 2104 * We check for read error in snf_deque. It has to check 2105 * for successful READ_DONE and return NULL, and we might 2106 * as well make an additional check there. 
2107 */ 2108 while ((mp = snf_deque(sr)) != NULL) { 2109 2110 if (ISSIG(curthread, JUSTLOOKING)) { 2111 freeb(mp); 2112 error = EINTR; 2113 break; 2114 } 2115 iosize = MBLKL(mp); 2116 2117 if ((error = kstrwritemp(vp, mp, fflag)) != 0) { 2118 freeb(mp); 2119 break; 2120 } 2121 ksize += iosize; 2122 } 2123 *count = ksize; 2124 2125 mutex_enter(&sr->sr_lock); 2126 sr->sr_write_error = error; 2127 /* Look at the big comments on why we cv_signal here. */ 2128 cv_signal(&sr->sr_cv); 2129 2130 /* Wait for the reader to complete always. */ 2131 while (!(sr->sr_read_error & SR_READ_DONE)) { 2132 cv_wait(&sr->sr_cv, &sr->sr_lock); 2133 } 2134 /* If there is no write error, check for read error. */ 2135 if (error == 0) 2136 error = (sr->sr_read_error & ~SR_READ_DONE); 2137 2138 if (error != 0) { 2139 mblk_t *next_mp; 2140 2141 mp = sr->sr_mp_head; 2142 while (mp != NULL) { 2143 next_mp = mp->b_next; 2144 mp->b_next = NULL; 2145 freeb(mp); 2146 mp = next_mp; 2147 } 2148 } 2149 mutex_exit(&sr->sr_lock); 2150 kmem_free(sr, sizeof (snf_req_t)); 2151 return (error); 2152} 2153 2154typedef struct { 2155 frtn_t snfi_frtn; 2156 caddr_t snfi_base; 2157 uint_t snfi_mapoff; 2158 size_t snfi_len; 2159 vnode_t *snfi_vp; 2160} snf_smap_desbinfo; 2161 2162/* 2163 * The callback function when the last ref of the mblk is dropped, 2164 * normally occurs when TCP receives the ack. But it can be the driver 2165 * too due to lazy reclaim. 2166 */ 2167void 2168snf_smap_desbfree(snf_smap_desbinfo *snfi) 2169{ 2170 if (!segmap_kpm) { 2171 /* 2172 * We don't need to call segmap_fault(F_SOFTUNLOCK) for 2173 * segmap_kpm as long as the latter never falls back to 2174 * "use_segmap_range". (See segmap_getmapflt().) 
2175 * 2176 * Using S_OTHER saves an redundant hat_setref() in 2177 * segmap_unlock() 2178 */ 2179 (void) segmap_fault(kas.a_hat, segkmap, 2180 (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base + 2181 snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len, 2182 F_SOFTUNLOCK, S_OTHER); 2183 } 2184 (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED); 2185 VN_RELE(snfi->snfi_vp); 2186 kmem_free(snfi, sizeof (*snfi)); 2187} 2188 2189/* 2190 * Use segmap instead of bcopy to send down a chain of desballoca'ed, mblks. 2191 * Each mblk contains a segmap slot of no more than MAXBSIZE. The total 2192 * length of a chain is no more than sd_qn_maxpsz. 2193 * 2194 * At the end of the whole sendfile() operation, we wait till the data from 2195 * the last mblk is ack'ed by the transport before returning so that the 2196 * caller of sendfile() can safely modify the file content. 2197 */ 2198int 2199snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, 2200 uint_t maxpsz, ssize_t *count, boolean_t nowait) 2201{ 2202 caddr_t base; 2203 int mapoff; 2204 vnode_t *vp; 2205 mblk_t *mp, *mp1; 2206 int iosize, iosize1; 2207 int error; 2208 short fflag; 2209 int ksize; 2210 snf_smap_desbinfo *snfi; 2211 struct vattr va; 2212 boolean_t dowait = B_FALSE; 2213 2214 vp = fp->f_vnode; 2215 fflag = fp->f_flag; 2216 ksize = 0; 2217 for (;;) { 2218 if (ISSIG(curthread, JUSTLOOKING)) { 2219 error = EINTR; 2220 break; 2221 } 2222 iosize = 0; 2223 mp = NULL; 2224 do { 2225 mapoff = fileoff & MAXBOFFSET; 2226 iosize1 = MAXBSIZE - mapoff; 2227 if (iosize1 > size) 2228 iosize1 = size; 2229 /* 2230 * we don't forcefault because we'll call 2231 * segmap_fault(F_SOFTLOCK) next. 2232 * 2233 * S_READ will get the ref bit set (by either 2234 * segmap_getmapflt() or segmap_fault()) and page 2235 * shared locked. 2236 */ 2237 base = segmap_getmapflt(segkmap, fvp, fileoff, iosize1, 2238 segmap_kpm ? 
SM_FAULT : 0, S_READ); 2239 2240 snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP); 2241 snfi->snfi_len = (size_t)roundup(mapoff+iosize1, 2242 PAGESIZE)- (mapoff & PAGEMASK); 2243 /* 2244 * We must call segmap_fault() even for segmap_kpm 2245 * because that's how error gets returned. 2246 * (segmap_getmapflt() never fails but segmap_fault() 2247 * does.) 2248 */ 2249 if (segmap_fault(kas.a_hat, segkmap, 2250 (caddr_t)(uintptr_t)(((uintptr_t)base + mapoff) & 2251 PAGEMASK), snfi->snfi_len, F_SOFTLOCK, 2252 S_READ) != 0) { 2253 (void) segmap_release(segkmap, base, 0); 2254 kmem_free(snfi, sizeof (*snfi)); 2255 freemsg(mp); 2256 error = EIO; 2257 goto out; 2258 } 2259 snfi->snfi_frtn.free_func = snf_smap_desbfree; 2260 snfi->snfi_frtn.free_arg = (caddr_t)snfi; 2261 snfi->snfi_base = base; 2262 snfi->snfi_mapoff = mapoff; 2263 mp1 = desballoca((uchar_t *)base + mapoff, 2264 iosize1, BPRI_HI, &snfi->snfi_frtn); 2265 2266 if (mp1 == NULL) { 2267 (void) segmap_fault(kas.a_hat, segkmap, 2268 (caddr_t)(uintptr_t)(((uintptr_t)base + 2269 mapoff) & PAGEMASK), snfi->snfi_len, 2270 F_SOFTUNLOCK, S_OTHER); 2271 (void) segmap_release(segkmap, base, 0); 2272 kmem_free(snfi, sizeof (*snfi)); 2273 freemsg(mp); 2274 error = EAGAIN; 2275 goto out; 2276 } 2277 VN_HOLD(fvp); 2278 snfi->snfi_vp = fvp; 2279 mp1->b_wptr += iosize1; 2280 2281 /* Mark this dblk with the zero-copy flag */ 2282 mp1->b_datap->db_struioflag |= STRUIO_ZC; 2283 if (mp == NULL) 2284 mp = mp1; 2285 else 2286 linkb(mp, mp1); 2287 iosize += iosize1; 2288 fileoff += iosize1; 2289 size -= iosize1; 2290 } while (iosize < maxpsz && size != 0); 2291 2292 if (size == 0 && !nowait) { 2293 ASSERT(!dowait); 2294 dowait = B_TRUE; 2295 mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 2296 } 2297 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2298 if ((error = kstrwritemp(vp, mp, fflag)) != 0) { 2299 *count = ksize; 2300 freemsg(mp); 2301 return (error); 2302 } 2303 ksize += iosize; 2304 if (size == 0) 2305 goto done; 2306 2307 (void) 
VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2308 va.va_mask = AT_SIZE; 2309 error = VOP_GETATTR(fvp, &va, 0, kcred); 2310 if (error) 2311 break; 2312 /* Read as much as possible. */ 2313 if (fileoff >= va.va_size) 2314 break; 2315 if (size + fileoff > va.va_size) 2316 size = va.va_size - fileoff; 2317 } 2318out: 2319 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2320done: 2321 *count = ksize; 2322 if (dowait) { 2323 stdata_t *stp; 2324 2325 stp = vp->v_stream; 2326 mutex_enter(&stp->sd_lock); 2327 while (!(stp->sd_flag & STZCNOTIFY)) { 2328 (void) cv_wait_sig(&stp->sd_zcopy_wait, 2329 &stp->sd_lock); 2330 } 2331 stp->sd_flag &= ~STZCNOTIFY; 2332 mutex_exit(&stp->sd_lock); 2333 } 2334 return (error); 2335} 2336 2337int 2338snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, 2339 uint_t maxpsz, ssize_t *count) 2340{ 2341 struct vnode *vp; 2342 mblk_t *mp; 2343 int iosize; 2344 int error; 2345 short fflag; 2346 int ksize; 2347 int ioflag; 2348 struct uio auio; 2349 struct iovec aiov; 2350 struct vattr va; 2351 2352 vp = fp->f_vnode; 2353 fflag = fp->f_flag; 2354 ksize = 0; 2355 auio.uio_iov = &aiov; 2356 auio.uio_iovcnt = 1; 2357 auio.uio_segflg = UIO_SYSSPACE; 2358 auio.uio_llimit = MAXOFFSET_T; 2359 auio.uio_fmode = fflag; 2360 auio.uio_extflg = UIO_COPY_CACHED; 2361 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC); 2362 /* If read sync is not asked for, filter sync flags */ 2363 if ((ioflag & FRSYNC) == 0) 2364 ioflag &= ~(FSYNC|FDSYNC); 2365 for (;;) { 2366 if (ISSIG(curthread, JUSTLOOKING)) { 2367 error = EINTR; 2368 break; 2369 } 2370 iosize = (int)MIN(maxpsz, size); 2371 if ((mp = allocb(iosize, BPRI_MED)) == NULL) { 2372 error = EAGAIN; 2373 break; 2374 } 2375 aiov.iov_base = (caddr_t)mp->b_rptr; 2376 aiov.iov_len = iosize; 2377 auio.uio_loffset = fileoff; 2378 auio.uio_resid = iosize; 2379 2380 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL); 2381 iosize -= auio.uio_resid; 2382 2383 if (error == EINTR && iosize != 0) 2384 error = 0; 
2385 2386 if (error != 0 || iosize == 0) { 2387 freeb(mp); 2388 break; 2389 } 2390 mp->b_wptr = mp->b_rptr + iosize; 2391 2392 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2393 if ((error = kstrwritemp(vp, mp, fflag)) != 0) { 2394 *count = ksize; 2395 freeb(mp); 2396 return (error); 2397 } 2398 ksize += iosize; 2399 size -= iosize; 2400 if (size == 0) 2401 goto done; 2402 2403 fileoff += iosize; 2404 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2405 va.va_mask = AT_SIZE; 2406 error = VOP_GETATTR(fvp, &va, 0, kcred); 2407 if (error) 2408 break; 2409 /* Read as much as possible. */ 2410 if (fileoff >= va.va_size) 2411 size = 0; 2412 else if (size + fileoff > va.va_size) 2413 size = va.va_size - fileoff; 2414 } 2415 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2416done: 2417 *count = ksize; 2418 return (error); 2419} 2420 2421#if defined(_SYSCALL32_IMPL) || defined(_ILP32) 2422/* 2423 * Largefile support for 32 bit applications only. 2424 */ 2425int 2426sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv, 2427 ssize32_t *count32) 2428{ 2429 ssize32_t sfv_len; 2430 u_offset_t sfv_off, va_size; 2431 struct vnode *vp, *fvp, *realvp; 2432 struct vattr va; 2433 stdata_t *stp; 2434 ssize_t count = 0; 2435 int error = 0; 2436 boolean_t dozcopy = B_FALSE; 2437 uint_t maxpsz; 2438 2439 sfv_len = (ssize32_t)sfv->sfv_len; 2440 if (sfv_len < 0) { 2441 error = EINVAL; 2442 goto out; 2443 } 2444 2445 if (sfv_len == 0) goto out; 2446 2447 sfv_off = (u_offset_t)sfv->sfv_off; 2448 2449 /* Same checks as in pread */ 2450 if (sfv_off > MAXOFFSET_T) { 2451 error = EINVAL; 2452 goto out; 2453 } 2454 if (sfv_off + sfv_len > MAXOFFSET_T) 2455 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 2456 2457 /* 2458 * There are no more checks on sfv_len. So, we cast it to 2459 * u_offset_t and share the snf_direct_io/snf_cache code between 2460 * 32 bit and 64 bit. 2461 * 2462 * TODO: should do nbl_need_check() like read()? 
2463 */ 2464 if (sfv_len > sendfile_max_size) { 2465 sf_stats.ss_file_not_cached++; 2466 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len, 2467 &count); 2468 goto out; 2469 } 2470 fvp = rfp->f_vnode; 2471 if (VOP_REALVP(fvp, &realvp) == 0) 2472 fvp = realvp; 2473 /* 2474 * Grab the lock as a reader to prevent the file size 2475 * from changing underneath. 2476 */ 2477 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2478 va.va_mask = AT_SIZE; 2479 error = VOP_GETATTR(fvp, &va, 0, kcred); 2480 va_size = va.va_size; 2481 if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) { 2482 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2483 goto out; 2484 } 2485 /* Read as much as possible. */ 2486 if (sfv_off + sfv_len > va_size) 2487 sfv_len = va_size - sfv_off; 2488 2489 vp = fp->f_vnode; 2490 stp = vp->v_stream; 2491 if (stp->sd_qn_maxpsz == INFPSZ) 2492 maxpsz = MAXOFF32_T; 2493 else 2494 maxpsz = roundup(stp->sd_qn_maxpsz, MAXBSIZE); 2495 /* 2496 * When the NOWAIT flag is not set, we enable zero-copy only if the 2497 * transfer size is large enough. This prevents performance loss 2498 * when the caller sends the file piece by piece. 
2499 */ 2500 if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) || 2501 (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) && 2502 !vn_has_flocks(fvp)) { 2503 if ((stp->sd_copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) { 2504 int on = 1; 2505 2506 if (SOP_SETSOCKOPT(VTOSO(vp), SOL_SOCKET, 2507 SO_SND_COPYAVOID, &on, sizeof (on)) == 0) 2508 dozcopy = B_TRUE; 2509 } else { 2510 dozcopy = (stp->sd_copyflag & STZCVMSAFE); 2511 } 2512 } 2513 if (dozcopy) { 2514 sf_stats.ss_file_segmap++; 2515 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len, 2516 maxpsz, &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0)); 2517 } else { 2518 sf_stats.ss_file_cached++; 2519 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len, 2520 maxpsz, &count); 2521 } 2522out: 2523 releasef(sfv->sfv_fd); 2524 *count32 = (ssize32_t)count; 2525 return (error); 2526} 2527#endif 2528 2529#ifdef _SYSCALL32_IMPL 2530/* 2531 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a 2532 * ssize_t rather than ssize32_t; see the comments above read32 for details. 
2533 */ 2534 2535ssize_t 2536recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags) 2537{ 2538 return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags)); 2539} 2540 2541ssize_t 2542recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags, 2543 caddr32_t name, caddr32_t namelenp) 2544{ 2545 return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags, 2546 (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp)); 2547} 2548 2549ssize_t 2550send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags) 2551{ 2552 return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags)); 2553} 2554 2555ssize_t 2556sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags, 2557 caddr32_t name, socklen_t namelen) 2558{ 2559 return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags, 2560 (void *)(uintptr_t)name, namelen)); 2561} 2562#endif /* _SYSCALL32_IMPL */ 2563 2564/* 2565 * Function wrappers (mostly arround the sonode switch) for 2566 * backward compatibility. 
2567 */ 2568 2569int 2570soaccept(struct sonode *so, int fflag, struct sonode **nsop) 2571{ 2572 return (SOP_ACCEPT(so, fflag, nsop)); 2573} 2574 2575int 2576sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen, 2577 int backlog, int flags) 2578{ 2579 int error; 2580 2581 error = SOP_BIND(so, name, namelen, flags); 2582 if (error == 0 && backlog != 0) 2583 return (SOP_LISTEN(so, backlog)); 2584 2585 return (error); 2586} 2587 2588int 2589solisten(struct sonode *so, int backlog) 2590{ 2591 return (SOP_LISTEN(so, backlog)); 2592} 2593 2594int 2595soconnect(struct sonode *so, const struct sockaddr *name, socklen_t namelen, 2596 int fflag, int flags) 2597{ 2598 return (SOP_CONNECT(so, name, namelen, fflag, flags)); 2599} 2600 2601int 2602sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) 2603{ 2604 return (SOP_RECVMSG(so, msg, uiop)); 2605} 2606 2607int 2608sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) 2609{ 2610 return (SOP_SENDMSG(so, msg, uiop)); 2611} 2612 2613int 2614sogetpeername(struct sonode *so) 2615{ 2616 return (SOP_GETPEERNAME(so)); 2617} 2618 2619int 2620sogetsockname(struct sonode *so) 2621{ 2622 return (SOP_GETSOCKNAME(so)); 2623} 2624 2625int 2626soshutdown(struct sonode *so, int how) 2627{ 2628 return (SOP_SHUTDOWN(so, how)); 2629} 2630 2631int 2632sogetsockopt(struct sonode *so, int level, int option_name, void *optval, 2633 socklen_t *optlenp, int flags) 2634{ 2635 return (SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, 2636 flags)); 2637} 2638 2639int 2640sosetsockopt(struct sonode *so, int level, int option_name, const void *optval, 2641 t_uscalar_t optlen) 2642{ 2643 return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen)); 2644} 2645 2646/* 2647 * Because this is backward compatibility interface it only needs to be 2648 * able to handle the creation of TPI sockfs sockets. 
2649 */ 2650struct sonode * 2651socreate(vnode_t *accessvp, int domain, int type, int protocol, int version, 2652 struct sonode *tso, int *errorp) 2653{ 2654 return (sotpi_create(accessvp, domain, type, protocol, version, tso, 2655 errorp)); 2656} 2657