socksyscalls.c revision 408:11731380d322
1153761Swollman/*
2192886Sedwin * CDDL HEADER START
3192886Sedwin *
464499Swollman * The contents of this file are subject to the terms of the
52742Swollman * Common Development and Distribution License, Version 1.0 only
62742Swollman * (the "License").  You may not use this file except in compliance
7243003Sedwin * with the License.
82742Swollman *
9158421Swollman * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
102742Swollman * or http://www.opensolaris.org/os/licensing.
11158421Swollman * See the License for the specific language governing permissions
12158421Swollman * and limitations under the License.
132742Swollman *
14248307Sedwin * When distributing Covered Code, include this CDDL HEADER in each
15248307Sedwin * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16248307Sedwin * If applicable, add the following below this CDDL HEADER, with the
17248307Sedwin * fields enclosed by brackets "[]" replaced with your own identifying
1886222Swollman * information: Portions Copyright [yyyy] [name of copyright owner]
1920094Swollman *
2020094Swollman * CDDL HEADER END
2120094Swollman */
2220094Swollman/*
2320094Swollman * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24158421Swollman * Use is subject to license terms.
25158421Swollman */
2620094Swollman
2719878Swollman#pragma ident	"%Z%%M%	%I%	%E% SMI"
2819878Swollman
2919878Swollman#include <sys/types.h>
3019878Swollman#include <sys/t_lock.h>
3119878Swollman#include <sys/param.h>
3219878Swollman#include <sys/systm.h>
3319878Swollman#include <sys/buf.h>
3419878Swollman#include <sys/conf.h>
3558787Sru#include <sys/cred.h>
3658787Sru#include <sys/kmem.h>
3758787Sru#include <sys/sysmacros.h>
3858787Sru#include <sys/vfs.h>
3958787Sru#include <sys/vnode.h>
4058787Sru#include <sys/debug.h>
4158787Sru#include <sys/errno.h>
4258787Sru#include <sys/time.h>
4358787Sru#include <sys/file.h>
4458787Sru#include <sys/open.h>
4558787Sru#include <sys/user.h>
4658787Sru#include <sys/termios.h>
4758787Sru#include <sys/stream.h>
4858787Sru#include <sys/strsubr.h>
4958787Sru#include <sys/strsun.h>
5058787Sru#include <sys/esunddi.h>
5158787Sru#include <sys/flock.h>
5258787Sru#include <sys/modctl.h>
532742Swollman#include <sys/cmn_err.h>
542742Swollman#include <sys/vmsystm.h>
552742Swollman#include <sys/policy.h>
562742Swollman
572742Swollman#include <sys/socket.h>
582742Swollman#include <sys/socketvar.h>
592742Swollman#include <netinet/in.h>
6019878Swollman#include <sys/un.h>
612742Swollman#include <inet/nca/ncadoorhdr.h>
622742Swollman
632742Swollman#include <sys/isa_defs.h>
6419878Swollman#include <sys/inttypes.h>
652742Swollman#include <sys/systm.h>
662742Swollman#include <sys/cpuvar.h>
67149514Swollman#include <sys/atomic.h>
6821217Swollman#include <sys/filio.h>
699908Swollman#include <sys/sendfile.h>
709908Swollman#include <sys/ddi.h>
712742Swollman#include <vm/seg.h>
7219878Swollman#include <vm/seg_map.h>
7319878Swollman#include <vm/seg_kpm.h>
7419878Swollman#include <fs/sockfs/nl7c.h>
7519878Swollman
7619878Swollman#ifdef SOCK_TEST
7719878Swollmanint do_useracc = 1;		/* Controlled by setting SO_DEBUG to 4 */
7819878Swollman#else
7919878Swollman#define	do_useracc	1
8019878Swollman#endif /* SOCK_TEST */
8119878Swollman
8219878Swollmanextern int xnet_truncate_print;
8319878Swollman
/*
 * Note: MSG_MAXIOVLEN is defined and used here in the same way that
 *	 DEF_IOV_MAX is in "fs/vncalls.c", as there isn't a formal
 *	 definition of IOV_MAX.
 */
8893799Swollman#define	MSG_MAXIOVLEN	16
8958787Sru
9058787Sru/*
9119878Swollman * Kernel component of socket creation.
9219878Swollman *
9319878Swollman * The socket library determines which version number to use.
949908Swollman * First the library calls this with a NULL devpath. If this fails
95149514Swollman * to find a transport (using solookup) the library will look in /etc/netconfig
969908Swollman * for the appropriate transport. If one is found it will pass in the
979908Swollman * devpath for the kernel to use.
989908Swollman */
9921217Swollmanint
10019878Swollmanso_socket(int domain, int type, int protocol, char *devpath, int version)
10119878Swollman{
1029908Swollman	vnode_t *accessvp;
103149514Swollman	struct sonode *so;
1049908Swollman	vnode_t *vp;
1059908Swollman	struct file *fp;
1069908Swollman	int fd;
1079908Swollman	int error;
10858787Sru	boolean_t wildcard = B_FALSE;
10958787Sru	int saved_error = 0;
11058787Sru	int sdomain = domain;
11164499Swollman
11264499Swollman	dprint(1, ("so_socket(%d,%d,%d,%p,%d)\n",
113175034Sedwin		domain, type, protocol, devpath, version));
114175034Sedwin
115175034Sedwin	if (domain == AF_NCA) {
116175034Sedwin		/*
117175034Sedwin		 * The request is for an NCA socket so for NL7C use the
11858787Sru		 * INET domain instead and mark NL7C_AF_NCA below.
11958787Sru		 */
12067578Swollman		domain = AF_INET;
12158787Sru		/*
12258787Sru		 * NL7C is not supported in non-global zones,
12358787Sru		 *  we enforce this restriction here.
124149514Swollman		 */
12564499Swollman		if (getzoneid() != GLOBAL_ZONEID) {
12664499Swollman			return (set_errno(ENOTSUP));
12764499Swollman		}
12864499Swollman	}
12986222Swollman
13086222Swollman	accessvp = solookup(domain, type, protocol, devpath, &error);
13186222Swollman	if (accessvp == NULL) {
13286222Swollman		/*
13386222Swollman		 * If there is either an EPROTONOSUPPORT or EPROTOTYPE error
13486222Swollman		 * it makes sense doing the wildcard lookup since the
13586222Swollman		 * protocol might not be in the table.
13686222Swollman		 */
13786222Swollman		if (devpath != NULL || protocol == 0 ||
13886222Swollman		    !(error == EPROTONOSUPPORT || error == EPROTOTYPE))
13986222Swollman			return (set_errno(error));
14086222Swollman
14186222Swollman		saved_error = error;
14286222Swollman
14386222Swollman		/*
14486222Swollman		 * Try wildcard lookup. Never use devpath for wildcards.
14586222Swollman		 */
14686222Swollman		accessvp = solookup(domain, type, 0, NULL, &error);
14786222Swollman		if (accessvp == NULL) {
14886222Swollman			/*
14986222Swollman			 * Can't find in kernel table - have library
15086222Swollman			 * fall back to /etc/netconfig and tell us
15186222Swollman			 * the devpath (The library will do this if it didn't
152175034Sedwin			 * already pass in a devpath).
153175034Sedwin			 */
154175034Sedwin			if (saved_error != 0)
155175034Sedwin				error = saved_error;
156175034Sedwin			return (set_errno(error));
157175034Sedwin		}
158175034Sedwin		wildcard = B_TRUE;
159175034Sedwin	}
160175034Sedwin
161175034Sedwin	/* Check the device policy */
162175034Sedwin	if ((error = secpolicy_spec_open(CRED(),
163175034Sedwin	    accessvp, FREAD|FWRITE)) != 0) {
164175034Sedwin		return (set_errno(error));
165175034Sedwin	}
166175034Sedwin
167175034Sedwin	if (domain == AF_NCA) {
168175034Sedwin		so = sonca_create(accessvp, domain, type, protocol, version,
169175034Sedwin		    NULL, &error);
170175034Sedwin	} else if (protocol == IPPROTO_SCTP) {
171175034Sedwin		so = sosctp_create(accessvp, domain, type, protocol, version,
172183066Sedwin		    NULL, &error);
173183066Sedwin	} else {
174183066Sedwin		so = sotpi_create(accessvp, domain, type, protocol, version,
175183066Sedwin		    NULL, &error);
176183066Sedwin	}
177183066Sedwin	if (so == NULL) {
178183066Sedwin		return (set_errno(error));
179183066Sedwin	}
180183066Sedwin	if (sdomain == AF_NCA && domain == AF_INET) {
181183066Sedwin		so->so_nl7c_flags = NL7C_AF_NCA;
182183066Sedwin	}
183183066Sedwin	vp = SOTOV(so);
184183066Sedwin
185183864Sedwin	if (wildcard) {
186183864Sedwin		/*
187183864Sedwin		 * Issue SO_PROTOTYPE setsockopt.
188183864Sedwin		 */
189183864Sedwin		error = SOP_SETSOCKOPT(so, SOL_SOCKET, SO_PROTOTYPE,
190183864Sedwin				&protocol,
191183864Sedwin				(t_uscalar_t)sizeof (protocol));
192183864Sedwin		if (error) {
193183864Sedwin			(void) VOP_CLOSE(vp, 0, 1, 0, CRED());
194183864Sedwin			VN_RELE(vp);
195183864Sedwin			/*
196183864Sedwin			 * Setsockopt often fails with ENOPROTOOPT but socket()
197183864Sedwin			 * should fail with EPROTONOSUPPORT/EPROTOTYPE.
198183864Sedwin			 */
199183864Sedwin			if (saved_error != 0 && error == ENOPROTOOPT)
200183864Sedwin				error = saved_error;
201184406Sedwin			else
202184406Sedwin				error = EPROTONOSUPPORT;
203184406Sedwin			return (set_errno(error));
204184406Sedwin		}
205184406Sedwin	}
206184406Sedwin	if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
207184406Sedwin		(void) VOP_CLOSE(vp, 0, 1, 0, CRED());
208184406Sedwin		VN_RELE(vp);
209184406Sedwin		return (set_errno(error));
210184406Sedwin	}
211184406Sedwin
212184406Sedwin	/*
213184406Sedwin	 * Now fill in the entries that falloc reserved
214184406Sedwin	 */
215184406Sedwin	mutex_exit(&fp->f_tlock);
216184406Sedwin	setf(fd, fp);
217184406Sedwin
218184406Sedwin	return (fd);
219184406Sedwin}
220184406Sedwin
221198515Sedwin/*
222198515Sedwin * Map from a file descriptor to a socket node.
223198515Sedwin * Returns with the file descriptor held i.e. the caller has to
224198515Sedwin * use releasef when done with the file descriptor.
225198515Sedwin */
226198515Sedwinstatic struct sonode *
227198515Sedwingetsonode(int sock, int *errorp, file_t **fpp)
228198515Sedwin{
229198515Sedwin	file_t *fp;
230198515Sedwin	vnode_t *vp;
231198515Sedwin	struct sonode *so;
232198515Sedwin
233198515Sedwin	if ((fp = getf(sock)) == NULL) {
234198515Sedwin		*errorp = EBADF;
235175034Sedwin		eprintline(*errorp);
236198515Sedwin		return (NULL);
237198515Sedwin	}
238240457Sedwin	vp = fp->f_vnode;
239136638Swollman	/* Check if it is a socket */
240136638Swollman	if (vp->v_type != VSOCK) {
241136638Swollman		releasef(sock);
242136638Swollman		*errorp = ENOTSOCK;
243136638Swollman		eprintline(*errorp);
244136638Swollman		return (NULL);
245136638Swollman	}
24693799Swollman	/*
247158421Swollman	 * Use the stream head to find the real socket vnode.
24893799Swollman	 * This is needed when namefs sits above sockfs.
249158421Swollman	 */
25093799Swollman	if (vp->v_stream) {
25193799Swollman		ASSERT(vp->v_stream->sd_vnode);
252158421Swollman		vp = vp->v_stream->sd_vnode;
253136638Swollman
254136638Swollman		so = VTOSO(vp);
255136638Swollman		if (so->so_version == SOV_STREAM) {
256136638Swollman			releasef(sock);
257136638Swollman			*errorp = ENOTSOCK;
258136638Swollman			eprintsoline(so, *errorp);
259136638Swollman			return (NULL);
260136638Swollman		}
261136638Swollman	} else {
262136638Swollman		so = VTOSO(vp);
263136638Swollman	}
264136638Swollman	if (fpp)
265136638Swollman		*fpp = fp;
266136638Swollman	return (so);
267136638Swollman}
268136638Swollman
269136638Swollman/*
270136638Swollman * Allocate and copyin a sockaddr.
271136638Swollman * Ensures NULL termination for AF_UNIX addresses by extending them
272136638Swollman * with one NULL byte if need be. Verifies that the length is not
273136638Swollman * excessive to prevent an application from consuming all of kernel
274136638Swollman * memory. Returns NULL when an error occurred.
275136638Swollman */
276136638Swollmanstatic struct sockaddr *
277136638Swollmancopyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
278136638Swollman	    int *errorp)
279136638Swollman{
280136638Swollman	char	*faddr;
281136638Swollman	size_t	namelen = (size_t)*namelenp;
282136638Swollman
283136638Swollman	ASSERT(namelen != 0);
284136638Swollman	if (namelen > SO_MAXARGSIZE) {
285136638Swollman		*errorp = EINVAL;
286136638Swollman		eprintsoline(so, *errorp);
287136638Swollman		return (NULL);
288136638Swollman	}
289136638Swollman
290136638Swollman	faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
291136638Swollman	if (copyin(name, faddr, namelen)) {
292136638Swollman		kmem_free(faddr, namelen);
293136638Swollman		*errorp = EFAULT;
294136638Swollman		eprintsoline(so, *errorp);
295136638Swollman		return (NULL);
296136638Swollman	}
297136638Swollman
29893799Swollman	/*
299177591Sedwin	 * Add space for NULL termination if needed.
300177591Sedwin	 * Do a quick check if the last byte is NUL.
301177591Sedwin	 */
302177591Sedwin	if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
303177591Sedwin		/* Check if there is any NULL termination */
304177591Sedwin		size_t	i;
305177591Sedwin		int foundnull = 0;
306177591Sedwin
307177591Sedwin		for (i = sizeof (name->sa_family); i < namelen; i++) {
308177591Sedwin			if (faddr[i] == '\0') {
309177591Sedwin				foundnull = 1;
310177591Sedwin				break;
311177591Sedwin			}
312177591Sedwin		}
313177591Sedwin		if (!foundnull) {
314177591Sedwin			/* Add extra byte for NUL padding */
315177591Sedwin			char *nfaddr;
316177591Sedwin
317177591Sedwin			nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
318177591Sedwin			bcopy(faddr, nfaddr, namelen);
319177591Sedwin			kmem_free(faddr, namelen);
320177591Sedwin
321177591Sedwin			/* NUL terminate */
322177591Sedwin			nfaddr[namelen] = '\0';
323177591Sedwin			namelen++;
324177591Sedwin			ASSERT((socklen_t)namelen == namelen);
325177591Sedwin			*namelenp = (socklen_t)namelen;
326177591Sedwin			faddr = nfaddr;
327177591Sedwin		}
328240457Sedwin	}
329240457Sedwin	return ((struct sockaddr *)faddr);
330240457Sedwin}
331177591Sedwin
332177591Sedwin/*
333177591Sedwin * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
334177591Sedwin */
335177591Sedwinstatic int
336177591Sedwincopyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
337177591Sedwin		void *kaddr, socklen_t klen)
338177591Sedwin{
339177591Sedwin	if (uaddr != NULL) {
340177591Sedwin		if (ulen > klen)
341177591Sedwin			ulen = klen;
342177591Sedwin
343177591Sedwin		if (ulen != 0) {
344177591Sedwin			if (copyout(kaddr, uaddr, ulen))
345177591Sedwin				return (EFAULT);
346177591Sedwin		}
347177591Sedwin	} else
348177591Sedwin		ulen = 0;
349177591Sedwin
350177591Sedwin	if (ulenp != NULL) {
351177591Sedwin		if (copyout(&ulen, ulenp, sizeof (ulen)))
352177591Sedwin			return (EFAULT);
353177591Sedwin	}
354177591Sedwin	return (0);
355177591Sedwin}
356177591Sedwin
357177591Sedwin/*
358177591Sedwin * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
359177591Sedwin * If klen is greater than ulen it still uses the non-truncated
360177591Sedwin * klen to update ulenp.
361177591Sedwin */
362177591Sedwinstatic int
363177591Sedwincopyout_name(void *uaddr, socklen_t ulen, void *ulenp,
364177591Sedwin		void *kaddr, socklen_t klen)
365181421Sedwin{
366158421Swollman	if (uaddr != NULL) {
367158421Swollman		if (ulen >= klen)
368181421Sedwin			ulen = klen;
369181421Sedwin		else if (ulen != 0 && xnet_truncate_print) {
370181421Sedwin			printf("sockfs: truncating copyout of address using "
371181421Sedwin			    "XNET semantics for pid = %d. Lengths %d, %d\n",
372181421Sedwin			    curproc->p_pid, klen, ulen);
373190372Sedwin		}
374190372Sedwin
375190372Sedwin		if (ulen != 0) {
376190372Sedwin			if (copyout(kaddr, uaddr, ulen))
37793799Swollman				return (EFAULT);
378190372Sedwin		} else
379190372Sedwin			klen = 0;
380190372Sedwin	} else
381190372Sedwin		klen = 0;
382190372Sedwin
383190372Sedwin	if (ulenp != NULL) {
384190372Sedwin		if (copyout(&klen, ulenp, sizeof (klen)))
385190372Sedwin			return (EFAULT);
386190372Sedwin	}
387190372Sedwin	return (0);
388248307Sedwin}
389190372Sedwin
390240457Sedwin/*
391248307Sedwin * The socketpair() code in libsocket creates two sockets (using
392248307Sedwin * the /etc/netconfig fallback if needed) before calling this routine
393190372Sedwin * to connect the two sockets together.
394190372Sedwin *
395190372Sedwin * For a SOCK_STREAM socketpair a listener is needed - in that case this
396190372Sedwin * routine will create a new file descriptor as part of accepting the
397190372Sedwin * connection. The library socketpair() will check if svs[2] has changed
398190372Sedwin * in which case it will close the changed fd.
399198515Sedwin *
400198515Sedwin * Note that this code could use the TPI feature of accepting the connection
401190372Sedwin * on the listening endpoint. However, that would require significant changes
402198515Sedwin * to soaccept.
403198515Sedwin */
404198515Sedwinint
405197597Sedwinso_socketpair(int sv[2])
406198515Sedwin{
407198515Sedwin	int svs[2];
408198515Sedwin	struct sonode *so1, *so2;
409198515Sedwin	int error;
410197597Sedwin	struct sockaddr_ux *name;
411198515Sedwin	size_t namelen;
412197597Sedwin
413198515Sedwin	dprint(1, ("so_socketpair(%p)\n", sv));
414198515Sedwin
415198515Sedwin	error = useracc(sv, sizeof (svs), B_WRITE);
416198515Sedwin	if (error && do_useracc)
417198515Sedwin		return (set_errno(EFAULT));
418198515Sedwin
419198515Sedwin	if (copyin(sv, svs, sizeof (svs)))
420198515Sedwin		return (set_errno(EFAULT));
421198515Sedwin
422198515Sedwin	if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
423198515Sedwin		return (set_errno(error));
424198515Sedwin
425198515Sedwin	if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
426198515Sedwin		releasef(svs[0]);
427198515Sedwin		return (set_errno(error));
428198515Sedwin	}
429198515Sedwin
430198515Sedwin	if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
431198515Sedwin		error = EOPNOTSUPP;
432197597Sedwin		goto done;
433206868Sedwin	}
434206868Sedwin
435206868Sedwin	/*
436206868Sedwin	 * The code below makes assumptions about the "sockfs" implementation.
437206868Sedwin	 * So make sure that the correct implementation is really used.
438206868Sedwin	 */
439206868Sedwin	ASSERT(so1->so_ops == &sotpi_sonodeops);
440206868Sedwin	ASSERT(so2->so_ops == &sotpi_sonodeops);
441206868Sedwin
442206868Sedwin	if (so1->so_type == SOCK_DGRAM) {
443206868Sedwin		/*
444206868Sedwin		 * Bind both sockets and connect them with each other.
445206868Sedwin		 * Need to allocate name/namelen for soconnect.
446206868Sedwin		 */
447206868Sedwin		error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC);
448206868Sedwin		if (error) {
449206868Sedwin			eprintsoline(so1, error);
450206868Sedwin			goto done;
451206868Sedwin		}
452206868Sedwin		error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC);
453206868Sedwin		if (error) {
454248307Sedwin			eprintsoline(so2, error);
455248307Sedwin			goto done;
456248307Sedwin		}
4572742Swollman		namelen = sizeof (struct sockaddr_ux);
45820094Swollman		name = kmem_alloc(namelen, KM_SLEEP);
459136638Swollman		name->sou_family = AF_UNIX;
460136638Swollman		name->sou_addr = so2->so_ux_laddr;
46193799Swollman		error = SOP_CONNECT(so1,
46219878Swollman				(struct sockaddr *)name,
46358787Sru				(socklen_t)namelen,
46493799Swollman				0, _SOCONNECT_NOXLATE);
46593799Swollman		if (error) {
466175034Sedwin			kmem_free(name, namelen);
46720094Swollman			eprintsoline(so1, error);
468184406Sedwin			goto done;
469184406Sedwin		}
47020094Swollman		name->sou_addr = so1->so_ux_laddr;
471158421Swollman		error = SOP_CONNECT(so2,
47293799Swollman				(struct sockaddr *)name,
47393799Swollman				(socklen_t)namelen,
47493799Swollman				0, _SOCONNECT_NOXLATE);
47593799Swollman		kmem_free(name, namelen);
47693799Swollman		if (error) {
47793799Swollman			eprintsoline(so2, error);
478136638Swollman			goto done;
47993799Swollman		}
48020094Swollman		releasef(svs[0]);
48158787Sru		releasef(svs[1]);
48293799Swollman	} else {
48393799Swollman		/*
48493799Swollman		 * Bind both sockets, with so1 being a listener.
48593799Swollman		 * Connect so2 to so1 - nonblocking to avoid waiting for
486175034Sedwin		 * soaccept to complete.
48720094Swollman		 * Accept a connection on so1. Pass out the new fd as sv[0].
488184406Sedwin		 * The library will detect the changed fd and close
489184406Sedwin		 * the original one.
490184406Sedwin		 */
491184406Sedwin		struct sonode *nso;
492184406Sedwin		struct vnode *nvp;
493184406Sedwin		struct file *nfp;
494184406Sedwin		int nfd;
495184406Sedwin
496184406Sedwin		/*
497184406Sedwin		 * We could simply call SOP_LISTEN() here (which would do the
498184406Sedwin		 * binding automatically) if the code didn't rely on passing
499184406Sedwin		 * _SOBIND_NOXLATE to the TPI implementation of SOP_BIND().
500136638Swollman		 */
501136638Swollman		error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC|_SOBIND_NOXLATE|
502136638Swollman		    _SOBIND_LISTEN|_SOBIND_SOCKETPAIR);
503136638Swollman		if (error) {
504136638Swollman			eprintsoline(so1, error);
505136638Swollman			goto done;
506136638Swollman		}
507136638Swollman		error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC);
508136638Swollman		if (error) {
509136638Swollman			eprintsoline(so2, error);
510136638Swollman			goto done;
511175034Sedwin		}
512136638Swollman
513136638Swollman		namelen = sizeof (struct sockaddr_ux);
514136638Swollman		name = kmem_alloc(namelen, KM_SLEEP);
515136638Swollman		name->sou_family = AF_UNIX;
516136638Swollman		name->sou_addr = so1->so_ux_laddr;
517136638Swollman		error = SOP_CONNECT(so2,
518136638Swollman				(struct sockaddr *)name,
519136638Swollman				(socklen_t)namelen,
520136638Swollman				FNONBLOCK, _SOCONNECT_NOXLATE);
521136638Swollman		kmem_free(name, namelen);
522136638Swollman		if (error) {
523136638Swollman			if (error != EINPROGRESS) {
524184406Sedwin				eprintsoline(so2, error);
525184406Sedwin				goto done;
526136638Swollman			}
527136638Swollman		}
528136638Swollman
529136638Swollman		error = SOP_ACCEPT(so1, 0, &nso);
530136638Swollman		if (error) {
531136638Swollman			eprintsoline(so1, error);
532136638Swollman			goto done;
533136638Swollman		}
534136638Swollman
535136638Swollman		/* wait for so2 being SS_CONNECTED ignoring signals */
536136638Swollman		mutex_enter(&so2->so_lock);
537136638Swollman		error = sowaitconnected(so2, 0, 1);
538184406Sedwin		mutex_exit(&so2->so_lock);
539184406Sedwin		nvp = SOTOV(nso);
540136638Swollman		if (error != 0) {
54120094Swollman			(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
542136638Swollman			VN_RELE(nvp);
54393799Swollman			eprintsoline(so2, error);
54420094Swollman			goto done;
54520094Swollman		}
54693799Swollman
54793799Swollman		if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
54893799Swollman			(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
54920094Swollman			VN_RELE(nvp);
55093799Swollman			eprintsoline(nso, error);
55193799Swollman			goto done;
55293799Swollman		}
553184406Sedwin		/*
554184406Sedwin		 * fill in the entries that falloc reserved
55520094Swollman		 */
556149514Swollman		mutex_exit(&nfp->f_tlock);
557136638Swollman		setf(nfd, nfp);
55893799Swollman
55920094Swollman		releasef(svs[0]);
56058787Sru		releasef(svs[1]);
56193799Swollman		svs[0] = nfd;
56293799Swollman
56393799Swollman		/*
56493799Swollman		 * The socketpair library routine will close the original
565136638Swollman		 * svs[0] when this code passes out a different file
566136638Swollman		 * descriptor.
567184406Sedwin		 */
568184406Sedwin		if (copyout(svs, sv, sizeof (svs))) {
56920094Swollman			(void) closeandsetf(nfd, NULL);
57020094Swollman			eprintline(EFAULT);
571136638Swollman			return (set_errno(EFAULT));
57293799Swollman		}
57320094Swollman	}
57420094Swollman	return (0);
57593799Swollman
57693799Swollmandone:
57793799Swollman	releasef(svs[0]);
57820094Swollman	releasef(svs[1]);
57920094Swollman	return (set_errno(error));
58020094Swollman}
58193799Swollman
58293799Swollmanint
583136638Swollmanbind(int sock, struct sockaddr *name, socklen_t namelen, int version)
584136638Swollman{
585184406Sedwin	struct sonode *so;
586184406Sedwin	int error;
587136638Swollman
588177591Sedwin	dprint(1, ("bind(%d, %p, %d)\n",
589198515Sedwin		sock, name, namelen));
590206868Sedwin
591206868Sedwin	if ((so = getsonode(sock, &error, NULL)) == NULL)
592198515Sedwin		return (set_errno(error));
593177591Sedwin
594177591Sedwin	/* Allocate and copyin name */
595177591Sedwin	/*
596177591Sedwin	 * X/Open test does not expect EFAULT with NULL name and non-zero
597181421Sedwin	 * namelen.
598181421Sedwin	 */
599181421Sedwin	if (name != NULL && namelen != 0) {
600181421Sedwin		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
601181421Sedwin		name = copyin_name(so, name, &namelen, &error);
602181421Sedwin		if (name == NULL) {
603181421Sedwin			releasef(sock);
604177591Sedwin			return (set_errno(error));
605177591Sedwin		}
606177591Sedwin	} else {
607198515Sedwin		name = NULL;
608177591Sedwin		namelen = 0;
609136638Swollman	}
610136638Swollman
611136638Swollman	switch (version) {
612136638Swollman	default:
613136638Swollman		error = SOP_BIND(so, name, namelen, 0);
614136638Swollman		break;
615136638Swollman	case SOV_XPG4_2:
616136638Swollman		error = SOP_BIND(so, name, namelen, _SOBIND_XPG4_2);
617136638Swollman		break;
618184406Sedwin	case SOV_SOCKBSD:
619184406Sedwin		error = SOP_BIND(so, name, namelen, _SOBIND_SOCKBSD);
620136638Swollman		break;
621136638Swollman	}
622136638Swollmandone:
623136638Swollman	releasef(sock);
624136638Swollman	if (name != NULL)
625136638Swollman		kmem_free(name, (size_t)namelen);
626136638Swollman
627136638Swollman	if (error)
628136638Swollman		return (set_errno(error));
629136638Swollman	return (0);
630184406Sedwin}
631184406Sedwin
6328029Swollman/* ARGSUSED2 */
63314343Swollmanint
63414343Swollmanlisten(int sock, int backlog, int version)
63514343Swollman{
63619878Swollman	struct sonode *so;
63714343Swollman	int error;
63814343Swollman
6392742Swollman	dprint(1, ("listen(%d, %d)\n",
6402742Swollman		sock, backlog));
6412742Swollman
64286222Swollman	if ((so = getsonode(sock, &error, NULL)) == NULL)
64319878Swollman		return (set_errno(error));
64419878Swollman
6452742Swollman	error = SOP_LISTEN(so, backlog);
6462742Swollman
6472742Swollman	releasef(sock);
648149514Swollman	if (error)
6492742Swollman		return (set_errno(error));
6502742Swollman	return (0);
6512742Swollman}
6522742Swollman
6532742Swollman/*ARGSUSED3*/
6542742Swollmanint
65520094Swollmanaccept(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
65620094Swollman{
65720094Swollman	struct sonode *so;
65820094Swollman	file_t *fp;
65920094Swollman	int error;
66020094Swollman	socklen_t namelen;
66120094Swollman	struct sonode *nso;
66220094Swollman	struct vnode *nvp;
66320094Swollman	struct file *nfp;
66420094Swollman	int nfd;
66520094Swollman
66620094Swollman	dprint(1, ("accept(%d, %p, %p)\n",
66720094Swollman		sock, name, namelenp));
66820094Swollman
66920094Swollman	if ((so = getsonode(sock, &error, &fp)) == NULL)
67020094Swollman		return (set_errno(error));
67120094Swollman
67220094Swollman	if (name != NULL) {
67320094Swollman		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
67420094Swollman		if (copyin(namelenp, &namelen, sizeof (namelen))) {
67520094Swollman			releasef(sock);
67620094Swollman			return (set_errno(EFAULT));
67720094Swollman		}
67820094Swollman		if (namelen != 0) {
67920094Swollman			error = useracc(name, (size_t)namelen, B_WRITE);
68020094Swollman			if (error && do_useracc) {
68143014Swollman				releasef(sock);
68243014Swollman				return (set_errno(EFAULT));
68343014Swollman			}
68443014Swollman		} else
68575267Swollman			name = NULL;
68675267Swollman	} else {
68775267Swollman		namelen = 0;
68875267Swollman	}
68975267Swollman
69075267Swollman	/*
691105196Swollman	 * Allocate the user fd before SOP_ACCEPT() in order to
692105196Swollman	 * catch EMFILE errors before calling SOP_ACCEPT().
693105196Swollman	 */
694105196Swollman	if ((nfd = ufalloc(0)) == -1) {
695105196Swollman		eprintsoline(so, EMFILE);
696105196Swollman		releasef(sock);
697105196Swollman		return (set_errno(EMFILE));
698105196Swollman	}
699105196Swollman	error = SOP_ACCEPT(so, fp->f_flag, &nso);
700105196Swollman	releasef(sock);
701105196Swollman	if (error) {
702105196Swollman		setf(nfd, NULL);
703105196Swollman		return (set_errno(error));
704105196Swollman	}
705105196Swollman
706105196Swollman	nvp = SOTOV(nso);
707105196Swollman
708136638Swollman	/*
709136638Swollman	 * so_faddr_sa can not go away even though we are not holding so_lock.
710136638Swollman	 * However, in theory its content could change from underneath us.
711136638Swollman	 * But this is not possible in practice since it can only
712136638Swollman	 * change due to either some socket system call
713172479Sedwin	 * or due to a T_CONN_CON being received from the stream head.
714172479Sedwin	 * Since the falloc/setf have not yet been done no thread
715172479Sedwin	 * can do any system call on nso and T_CONN_CON can not arrive
716172479Sedwin	 * on a socket that is already connected.
717181421Sedwin	 * Thus there is no reason to hold so_lock here.
718181421Sedwin	 *
719181421Sedwin	 * SOP_ACCEPT() is required to have set the valid bit for the faddr,
720181421Sedwin	 * but it could be instantly cleared by a disconnect from the transport.
721181421Sedwin	 * For that reason we ignore it here.
722181421Sedwin	 */
723181421Sedwin	ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
724181421Sedwin	error = copyout_name(name, namelen, namelenp,
725181421Sedwin	    nso->so_faddr_sa, (socklen_t)nso->so_faddr_len);
726181421Sedwin	if (error) {
727181421Sedwin		setf(nfd, NULL);
728181421Sedwin		(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
729181421Sedwin		VN_RELE(nvp);
730181421Sedwin		return (set_errno(error));
731181421Sedwin	}
732181421Sedwin	if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
733181421Sedwin		setf(nfd, NULL);
734181421Sedwin		(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
735181421Sedwin		VN_RELE(nvp);
736181421Sedwin		eprintsoline(so, error);
737181421Sedwin		return (set_errno(error));
738192886Sedwin	}
739192886Sedwin	/*
740181421Sedwin	 * fill in the entries that falloc reserved
741181421Sedwin	 */
742181421Sedwin	nfp->f_vnode = nvp;
743181421Sedwin	mutex_exit(&nfp->f_tlock);
744221092Sedwin	setf(nfd, nfp);
745181421Sedwin
746181421Sedwin	/*
747181421Sedwin	 * Copy FNDELAY and FNONBLOCK from listener to acceptor
748181421Sedwin	 */
749181421Sedwin	if (so->so_state & (SS_NDELAY|SS_NONBLOCK)) {
750181421Sedwin		uint_t oflag = nfp->f_flag;
751181421Sedwin		int arg = 0;
752181421Sedwin
753181421Sedwin		if (so->so_state & SS_NONBLOCK)
754181421Sedwin			arg |= FNONBLOCK;
755181421Sedwin		else if (so->so_state & SS_NDELAY)
756181421Sedwin			arg |= FNDELAY;
757181421Sedwin
758181421Sedwin		/*
759181421Sedwin		 * This code is a simplification of the F_SETFL code in fcntl()
760181421Sedwin		 * Ignore any errors from VOP_SETFL.
761105196Swollman		 */
762105196Swollman		if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred)) != 0) {
76343014Swollman			eprintsoline(so, error);
76443014Swollman			error = 0;
765163302Sru		} else {
76643014Swollman			mutex_enter(&nfp->f_tlock);
767183066Sedwin			nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
768183066Sedwin			nfp->f_flag |= arg;
769183066Sedwin			mutex_exit(&nfp->f_tlock);
770183066Sedwin		}
771183066Sedwin	}
772183066Sedwin	return (nfd);
773183066Sedwin}
774183066Sedwin
775183066Sedwinint
776183066Sedwinconnect(int sock, struct sockaddr *name, socklen_t namelen, int version)
777183066Sedwin{
778183066Sedwin	struct sonode *so;
779183066Sedwin	file_t *fp;
780183066Sedwin	int error;
781183066Sedwin
782183536Sedwin	dprint(1, ("connect(%d, %p, %d)\n",
783183066Sedwin		sock, name, namelen));
784183066Sedwin
785183066Sedwin	if ((so = getsonode(sock, &error, &fp)) == NULL)
786183066Sedwin		return (set_errno(error));
787183066Sedwin
788183066Sedwin	/* Allocate and copyin name */
789183066Sedwin	if (namelen != 0) {
790183066Sedwin		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
791183066Sedwin		name = copyin_name(so, name, &namelen, &error);
792183066Sedwin		if (name == NULL) {
793183066Sedwin			releasef(sock);
794183066Sedwin			return (set_errno(error));
795183066Sedwin		}
796226289Sedwin	} else
797226289Sedwin		name = NULL;
798226289Sedwin
799240457Sedwin	error = SOP_CONNECT(so, name, namelen, fp->f_flag,
800240457Sedwin	    (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2);
801183066Sedwin	releasef(sock);
802226289Sedwin	if (name)
803226289Sedwin		kmem_free(name, (size_t)namelen);
804226289Sedwin	if (error)
805226289Sedwin		return (set_errno(error));
806226289Sedwin	return (0);
807226289Sedwin}
808226289Sedwin
809226289Sedwin/*ARGSUSED2*/
810226289Sedwinint
811226289Sedwinshutdown(int sock, int how, int version)
812248307Sedwin{
813226289Sedwin	struct sonode *so;
814248307Sedwin	int error;
815226289Sedwin
816226289Sedwin	dprint(1, ("shutdown(%d, %d)\n",
817226752Sedwin		sock, how));
818226752Sedwin
819226752Sedwin	if ((so = getsonode(sock, &error, NULL)) == NULL)
820226752Sedwin		return (set_errno(error));
821226752Sedwin
822226752Sedwin	error = SOP_SHUTDOWN(so, how);
823226752Sedwin
824226752Sedwin	releasef(sock);
825226752Sedwin	if (error)
826226752Sedwin		return (set_errno(error));
827226752Sedwin	return (0);
828226752Sedwin}
829242208Sedwin
830242208Sedwin/*
831242208Sedwin * Common receive routine.
832242208Sedwin */
833242208Sedwinstatic ssize_t
834226752Sedwinrecvit(int sock,
835242208Sedwin	struct nmsghdr *msg,
836242208Sedwin	struct uio *uiop,
837242208Sedwin	int flags,
838242208Sedwin	socklen_t *namelenp,
8392742Swollman	socklen_t *controllenp,
84043543Swollman	int *flagsp)
84143543Swollman{
84258787Sru	struct sonode *so;
84358787Sru	file_t *fp;
84458787Sru	void *name;
84543543Swollman	socklen_t namelen;
84643014Swollman	void *control;
84743543Swollman	socklen_t controllen;
84843543Swollman	ssize_t len;
84958787Sru	int error;
85058787Sru
85158787Sru	if ((so = getsonode(sock, &error, &fp)) == NULL)
85243543Swollman		return (set_errno(error));
85358787Sru
85443543Swollman	len = uiop->uio_resid;
85543014Swollman	uiop->uio_fmode = fp->f_flag;
85643543Swollman	uiop->uio_extflg = UIO_COPY_CACHED;
85743543Swollman
85843543Swollman	name = msg->msg_name;
85943543Swollman	namelen = msg->msg_namelen;
86043543Swollman	control = msg->msg_control;
86158787Sru	controllen = msg->msg_controllen;
86243543Swollman
86343543Swollman	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
86458787Sru	    MSG_DONTWAIT | MSG_XPG4_2);
86543543Swollman
86658787Sru	error = SOP_RECVMSG(so, msg, uiop);
86758787Sru	if (error) {
86843543Swollman		releasef(sock);
86958787Sru		return (set_errno(error));
87043543Swollman	}
87158787Sru	lwp_stat_update(LWP_STAT_MSGRCV, 1);
87258787Sru	so_update_attrs(so, SOACC);
87343543Swollman	releasef(sock);
87443543Swollman
87543543Swollman	error = copyout_name(name, namelen, namelenp,
87658787Sru	    msg->msg_name, msg->msg_namelen);
87743543Swollman	if (error)
87843543Swollman		goto err;
87958787Sru
88043543Swollman	if (flagsp != NULL) {
88158787Sru		/*
88258787Sru		 * Clear internal flag.
88343543Swollman		 */
88458787Sru		msg->msg_flags &= ~MSG_XPG4_2;
88558787Sru
88643543Swollman		/*
88743543Swollman		 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
88858787Sru		 * when controllen is zero and there is control data to
88958787Sru		 * copy out.
89043543Swollman		 */
89143543Swollman		if (controllen != 0 &&
89258787Sru		    (msg->msg_controllen > controllen || control == NULL)) {
89358787Sru			dprint(1, ("recvit: CTRUNC %d %d %p\n",
89443543Swollman			    msg->msg_controllen, controllen, control));
89543543Swollman
89643543Swollman			msg->msg_flags |= MSG_CTRUNC;
89758787Sru		}
89858787Sru		if (copyout(&msg->msg_flags, flagsp,
89943543Swollman		    sizeof (msg->msg_flags))) {
90043543Swollman			error = EFAULT;
90158787Sru			goto err;
90258787Sru		}
90343543Swollman	}
90443543Swollman	/*
90558787Sru	 * Note: This MUST be done last. There can be no "goto err" after this
90658787Sru	 * point since it could make so_closefds run twice on some part
90743014Swollman	 * of the file descriptor array.
90843543Swollman	 */
909136638Swollman	if (controllen != 0) {
910136638Swollman		if (!(flags & MSG_XPG4_2)) {
91143543Swollman			/*
912136638Swollman			 * Good old msg_accrights can only return a multiple
91343543Swollman			 * of 4 bytes.
91443543Swollman			 */
91558787Sru			controllen &= ~((int)sizeof (uint32_t) - 1);
91658787Sru		}
91758787Sru		error = copyout_arg(control, controllen, controllenp,
91843543Swollman		    msg->msg_control, msg->msg_controllen);
91943543Swollman		if (error)
92058787Sru			goto err;
92158787Sru
922149514Swollman		if (msg->msg_controllen > controllen || control == NULL) {
92343014Swollman			if (control == NULL)
92443014Swollman				controllen = 0;
92543014Swollman			so_closefds(msg->msg_control, msg->msg_controllen,
92643014Swollman			    !(flags & MSG_XPG4_2), controllen);
92743014Swollman		}
92843014Swollman	}
92943543Swollman	if (msg->msg_namelen != 0)
93058787Sru		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
93143543Swollman	if (msg->msg_controllen != 0)
93243014Swollman		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
93358787Sru	return (len - uiop->uio_resid);
93443543Swollman
93558787Sruerr:
93658787Sru	/*
93758787Sru	 * If we fail and the control part contains file descriptors
93858787Sru	 * we have to close the fd's.
93958787Sru	 */
94058787Sru	if (msg->msg_controllen != 0)
94167578Swollman		so_closefds(msg->msg_control, msg->msg_controllen,
94267578Swollman		    !(flags & MSG_XPG4_2), 0);
94367578Swollman	if (msg->msg_namelen != 0)
94467578Swollman		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
94567578Swollman	if (msg->msg_controllen != 0)
94675267Swollman		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
94775267Swollman	return (set_errno(error));
94875267Swollman}
94975267Swollman
95086222Swollman/*
95186222Swollman * Native system call
952105196Swollman */
953163302Srussize_t
954105196Swollmanrecv(int sock, void *buffer, size_t len, int flags)
955181421Sedwin{
956105196Swollman	struct nmsghdr lmsg;
957121098Swollman	struct uio auio;
958181421Sedwin	struct iovec aiov[1];
959136638Swollman
960136638Swollman	dprint(1, ("recv(%d, %p, %ld, %d)\n",
961181421Sedwin		sock, buffer, len, flags));
962136638Swollman
963153670Swollman	if ((ssize_t)len < 0) {
964153670Swollman		return (set_errno(EINVAL));
965163302Sru	}
966172479Sedwin
967172479Sedwin	aiov[0].iov_base = buffer;
968172479Sedwin	aiov[0].iov_len = len;
969172479Sedwin	auio.uio_loffset = 0;
970181421Sedwin	auio.uio_iov = aiov;
971181421Sedwin	auio.uio_iovcnt = 1;
972183066Sedwin	auio.uio_resid = len;
973183536Sedwin	auio.uio_segflg = UIO_USERSPACE;
974183536Sedwin	auio.uio_limit = 0;
975183536Sedwin
976183536Sedwin	lmsg.msg_namelen = 0;
977183536Sedwin	lmsg.msg_controllen = 0;
978183536Sedwin	lmsg.msg_flags = 0;
979183536Sedwin	return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
980183536Sedwin}
981183536Sedwin
982183536Sedwinssize_t
983183536Sedwinrecvfrom(int sock, void *buffer, size_t len, int flags,
984183536Sedwin	struct sockaddr *name, socklen_t *namelenp)
985183536Sedwin{
986183536Sedwin	struct nmsghdr lmsg;
987183536Sedwin	struct uio auio;
988183536Sedwin	struct iovec aiov[1];
989183536Sedwin
990183536Sedwin	dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
991183536Sedwin		sock, buffer, len, flags, name, namelenp));
992183536Sedwin
993183536Sedwin	if ((ssize_t)len < 0) {
994183536Sedwin		return (set_errno(EINVAL));
995183536Sedwin	}
996183536Sedwin
997183536Sedwin	aiov[0].iov_base = buffer;
998121098Swollman	aiov[0].iov_len = len;
999136638Swollman	auio.uio_loffset = 0;
100020094Swollman	auio.uio_iov = aiov;
10012742Swollman	auio.uio_iovcnt = 1;
100220094Swollman	auio.uio_resid = len;
1003136638Swollman	auio.uio_segflg = UIO_USERSPACE;
10042742Swollman	auio.uio_limit = 0;
100558787Sru
1006136638Swollman	lmsg.msg_name = (char *)name;
1007136638Swollman	if (namelenp != NULL) {
1008136638Swollman		if (copyin(namelenp, &lmsg.msg_namelen,
1009136638Swollman		    sizeof (lmsg.msg_namelen)))
101058787Sru			return (set_errno(EFAULT));
1011136638Swollman	} else {
1012136638Swollman		lmsg.msg_namelen = 0;
1013136638Swollman	}
1014136638Swollman	lmsg.msg_controllen = 0;
1015136638Swollman	lmsg.msg_flags = 0;
101620094Swollman
101758787Sru	return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
101875267Swollman}
1019121098Swollman
1020121098Swollman/*
1021121098Swollman * Uses the MSG_XPG4_2 flag to determine if the caller is using
102243543Swollman * struct omsghdr or struct nmsghdr.
102358787Sru */
102458787Srussize_t
102543543Swollmanrecvmsg(int sock, struct nmsghdr *msg, int flags)
1026181421Sedwin{
1027181421Sedwin	STRUCT_DECL(nmsghdr, u_lmsg);
1028181421Sedwin	STRUCT_HANDLE(nmsghdr, umsgptr);
1029181421Sedwin	struct nmsghdr lmsg;
1030181421Sedwin	struct uio auio;
1031181421Sedwin	struct iovec aiov[MSG_MAXIOVLEN];
1032181421Sedwin	int iovcnt;
103358787Sru	ssize_t len;
103475267Swollman	int i;
103520094Swollman	int *flagsp;
103658787Sru	model_t	model;
103758787Sru
103875267Swollman	dprint(1, ("recvmsg(%d, %p, %d)\n",
103986222Swollman		sock, msg, flags));
1040105196Swollman
1041105196Swollman	model = get_udatamodel();
104220094Swollman	STRUCT_INIT(u_lmsg, model);
104375267Swollman	STRUCT_SET_HANDLE(umsgptr, model, msg);
104475267Swollman
104575267Swollman	if (flags & MSG_XPG4_2) {
104675267Swollman		if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
104775267Swollman			return (set_errno(EFAULT));
104886222Swollman		flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1049105196Swollman	} else {
1050105196Swollman		/*
105175267Swollman		 * Assumes that nmsghdr and omsghdr are identically shaped
105258787Sru		 * except for the added msg_flags field.
105343543Swollman		 */
105458787Sru		if (copyin(msg, STRUCT_BUF(u_lmsg),
105558787Sru		    SIZEOF_STRUCT(omsghdr, model)))
1056121098Swollman			return (set_errno(EFAULT));
1057242208Sedwin		STRUCT_FSET(u_lmsg, msg_flags, 0);
1058242208Sedwin		flagsp = NULL;
105920094Swollman	}
106058787Sru
106143543Swollman	/*
106258787Sru	 * Code below us will kmem_alloc memory and hang it
106358787Sru	 * off msg_control and msg_name fields. This forces
106458787Sru	 * us to copy the structure to its native form.
106558787Sru	 */
106675267Swollman	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
106786222Swollman	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1068105196Swollman	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1069105196Swollman	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
107043543Swollman	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1071121098Swollman	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1072121098Swollman	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1073121098Swollman
1074121098Swollman	iovcnt = lmsg.msg_iovlen;
1075121098Swollman
1076226752Sedwin	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1077242208Sedwin		return (set_errno(EMSGSIZE));
1078242208Sedwin	}
1079121098Swollman
1080121098Swollman#ifdef _SYSCALL32_IMPL
108158787Sru	/*
108258787Sru	 * 32-bit callers need to have their iovec expanded, while ensuring
10832742Swollman	 * that they can't move more than 2Gbytes of data in a single call.
108458787Sru	 */
108558787Sru	if (model == DATAMODEL_ILP32) {
108658787Sru		struct iovec32 aiov32[MSG_MAXIOVLEN];
108720094Swollman		ssize32_t count32;
1088121098Swollman
1089121098Swollman		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
109058787Sru		    iovcnt * sizeof (struct iovec32)))
109120094Swollman			return (set_errno(EFAULT));
1092121098Swollman
1093121098Swollman		count32 = 0;
1094121098Swollman		for (i = 0; i < iovcnt; i++) {
1095138323Swollman			ssize32_t iovlen32;
1096136638Swollman
1097121098Swollman			iovlen32 = aiov32[i].iov_len;
1098181421Sedwin			count32 += iovlen32;
109943543Swollman			if (iovlen32 < 0 || count32 < 0)
110058787Sru				return (set_errno(EINVAL));
110158787Sru			aiov[i].iov_len = iovlen32;
110243543Swollman			aiov[i].iov_base =
110358787Sru			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
110458787Sru		}
110558787Sru	} else
110658787Sru#endif /* _SYSCALL32_IMPL */
110775267Swollman	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
110875267Swollman		return (set_errno(EFAULT));
110958787Sru	}
111075267Swollman	len = 0;
1111121098Swollman	for (i = 0; i < iovcnt; i++) {
1112121098Swollman		ssize_t iovlen = aiov[i].iov_len;
11132742Swollman		len += iovlen;
111458787Sru		if (iovlen < 0 || len < 0) {
111558787Sru			return (set_errno(EINVAL));
111658787Sru		}
111758787Sru	}
111820094Swollman	auio.uio_loffset = 0;
111975267Swollman	auio.uio_iov = aiov;
112075267Swollman	auio.uio_iovcnt = iovcnt;
112175267Swollman	auio.uio_resid = len;
112275267Swollman	auio.uio_segflg = UIO_USERSPACE;
112375267Swollman	auio.uio_limit = 0;
112475267Swollman
1125181421Sedwin	if (lmsg.msg_control != NULL &&
1126181421Sedwin	    (do_useracc == 0 ||
112775267Swollman	    useracc(lmsg.msg_control, lmsg.msg_controllen,
112858787Sru			B_WRITE) != 0)) {
112975267Swollman		return (set_errno(EFAULT));
113058787Sru	}
1131181421Sedwin
1132181421Sedwin	return (recvit(sock, &lmsg, &auio, flags,
11332742Swollman		STRUCT_FADDR(umsgptr, msg_namelen),
11342742Swollman		STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
11352742Swollman}
113614343Swollman
113714343Swollman/*
113814343Swollman * Common send function.
113943014Swollman */
114043014Swollmanstatic ssize_t
114143014Swollmansendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
114243014Swollman{
11432742Swollman	struct sonode *so;
114458787Sru	file_t *fp;
114558787Sru	void *name;
114658787Sru	socklen_t namelen;
114758787Sru	void *control;
1148169811Swollman	socklen_t controllen;
1149169811Swollman	ssize_t len;
115075267Swollman	int error;
1151169811Swollman
1152169811Swollman	if ((so = getsonode(sock, &error, &fp)) == NULL)
1153169811Swollman		return (set_errno(error));
1154169811Swollman
1155169811Swollman	uiop->uio_fmode = fp->f_flag;
1156169811Swollman
115775267Swollman	if (so->so_family == AF_UNIX)
1158169811Swollman		uiop->uio_extflg = UIO_COPY_CACHED;
1159169811Swollman	else
1160169811Swollman		uiop->uio_extflg = UIO_COPY_DEFAULT;
1161169811Swollman
1162169811Swollman	/* Allocate and copyin name and control */
1163169811Swollman	name = msg->msg_name;
1164169811Swollman	namelen = msg->msg_namelen;
1165114173Swollman	if (name != NULL && namelen != 0) {
1166176974Sedwin		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1167176974Sedwin		name = copyin_name(so,
1168176974Sedwin				(struct sockaddr *)name,
1169176974Sedwin				&namelen, &error);
1170240457Sedwin		if (name == NULL)
1171176974Sedwin			goto done3;
1172176974Sedwin		/* copyin_name null terminates addresses for AF_UNIX */
1173176974Sedwin		msg->msg_namelen = namelen;
1174176974Sedwin		msg->msg_name = name;
1175176974Sedwin	} else {
1176176974Sedwin		msg->msg_name = name = NULL;
1177176974Sedwin		msg->msg_namelen = namelen = 0;
1178176974Sedwin	}
1179181421Sedwin
1180176974Sedwin	control = msg->msg_control;
1181240457Sedwin	controllen = msg->msg_controllen;
1182176974Sedwin	if ((control != NULL) && (controllen != 0)) {
1183176974Sedwin		/*
1184176974Sedwin		 * Verify that the length is not excessive to prevent
1185176974Sedwin		 * an application from consuming all of kernel memory.
1186204887Sedwin		 */
1187204887Sedwin		if (controllen > SO_MAXARGSIZE) {
1188204887Sedwin			error = EINVAL;
1189204887Sedwin			goto done2;
1190204887Sedwin		}
1191204887Sedwin		control = kmem_alloc(controllen, KM_SLEEP);
1192204887Sedwin
1193204887Sedwin		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1194204887Sedwin		if (copyin(msg->msg_control, control, controllen)) {
1195204887Sedwin			error = EFAULT;
1196204887Sedwin			goto done1;
1197204887Sedwin		}
1198219411Sedwin		msg->msg_control = control;
1199219411Sedwin	} else {
1200219411Sedwin		msg->msg_control = control = NULL;
1201219411Sedwin		msg->msg_controllen = controllen = 0;
1202219411Sedwin	}
1203219411Sedwin
1204219411Sedwin	len = uiop->uio_resid;
1205219411Sedwin	msg->msg_flags = flags;
1206219411Sedwin
1207219411Sedwin	error = SOP_SENDMSG(so, msg, uiop);
1208219411Sedwindone1:
1209219411Sedwin	if (control != NULL)
1210219411Sedwin		kmem_free(control, controllen);
1211219411Sedwindone2:
1212219411Sedwin	if (name != NULL)
1213219411Sedwin		kmem_free(name, namelen);
1214219411Sedwindone3:
1215220286Sedwin	if (error != 0) {
1216220286Sedwin		releasef(sock);
1217220286Sedwin		return (set_errno(error));
1218220286Sedwin	}
1219220286Sedwin	lwp_stat_update(LWP_STAT_MSGSND, 1);
1220220286Sedwin	so_update_attrs(so, SOMOD);
1221220286Sedwin	releasef(sock);
1222220286Sedwin	return (len - uiop->uio_resid);
1223220286Sedwin}
1224220286Sedwin
1225220286Sedwin/*
1226220286Sedwin * Native system call
1227220286Sedwin */
1228233445Sedwinssize_t
1229233445Sedwinsend(int sock, void *buffer, size_t len, int flags)
1230233445Sedwin{
1231233445Sedwin	struct nmsghdr lmsg;
1232233445Sedwin	struct uio auio;
1233233445Sedwin	struct iovec aiov[1];
1234233445Sedwin
1235233445Sedwin	dprint(1, ("send(%d, %p, %ld, %d)\n",
1236233445Sedwin		sock, buffer, len, flags));
1237233445Sedwin
1238233445Sedwin	if ((ssize_t)len < 0) {
1239233445Sedwin		return (set_errno(EINVAL));
1240233445Sedwin	}
1241233445Sedwin
1242233445Sedwin	aiov[0].iov_base = buffer;
1243233445Sedwin	aiov[0].iov_len = len;
1244248307Sedwin	auio.uio_loffset = 0;
1245248307Sedwin	auio.uio_iov = aiov;
1246248307Sedwin	auio.uio_iovcnt = 1;
1247248307Sedwin	auio.uio_resid = len;
1248248307Sedwin	auio.uio_segflg = UIO_USERSPACE;
1249248307Sedwin	auio.uio_limit = 0;
1250248307Sedwin
1251233445Sedwin	lmsg.msg_name = NULL;
1252233445Sedwin	lmsg.msg_control = NULL;
1253233445Sedwin	if (!(flags & MSG_XPG4_2)) {
12542742Swollman		/*
1255169811Swollman		 * In order to be compatible with the libsocket/sockmod
125658787Sru		 * implementation we set EOR for all send* calls.
1257169811Swollman		 */
1258169811Swollman		flags |= MSG_EOR;
1259169811Swollman	}
1260169811Swollman	return (sendit(sock, &lmsg, &auio, flags));
1261169811Swollman}
1262169811Swollman
1263169811Swollman/*
1264169811Swollman * Uses the MSG_XPG4_2 flag to determine if the caller is using
1265169811Swollman * struct omsghdr or struct nmsghdr.
1266169811Swollman */
1267169811Swollmanssize_t
1268169811Swollmansendmsg(int sock, struct nmsghdr *msg, int flags)
1269169811Swollman{
1270169811Swollman	struct nmsghdr lmsg;
1271169811Swollman	STRUCT_DECL(nmsghdr, u_lmsg);
1272169811Swollman	struct uio auio;
1273169811Swollman	struct iovec aiov[MSG_MAXIOVLEN];
1274169811Swollman	int iovcnt;
1275169811Swollman	ssize_t len;
1276169811Swollman	int i;
1277169811Swollman	model_t	model;
1278169811Swollman
1279169811Swollman	dprint(1, ("sendmsg(%d, %p, %d)\n", sock, msg, flags));
1280169811Swollman
128175267Swollman	model = get_udatamodel();
128275267Swollman	STRUCT_INIT(u_lmsg, model);
1283220286Sedwin
1284176974Sedwin	if (flags & MSG_XPG4_2) {
1285176974Sedwin		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1286176974Sedwin		    STRUCT_SIZE(u_lmsg)))
1287176974Sedwin			return (set_errno(EFAULT));
1288204887Sedwin	} else {
1289220286Sedwin		/*
1290220286Sedwin		 * Assumes that nmsghdr and omsghdr are identically shaped
1291233445Sedwin		 * except for the added msg_flags field.
1292248307Sedwin		 */
1293248307Sedwin		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1294169811Swollman		    SIZEOF_STRUCT(omsghdr, model)))
129558787Sru			return (set_errno(EFAULT));
12962742Swollman		/*
1297169811Swollman		 * In order to be compatible with the libsocket/sockmod
1298169811Swollman		 * implementation we set EOR for all send* calls.
1299169811Swollman		 */
1300169811Swollman		flags |= MSG_EOR;
1301169811Swollman	}
1302169811Swollman
1303169811Swollman	/*
130419878Swollman	 * Code below us will kmem_alloc memory and hang it
1305169811Swollman	 * off msg_control and msg_name fields. This forces
1306169811Swollman	 * us to copy the structure to its native form.
1307158421Swollman	 */
130819878Swollman	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
13092742Swollman	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
131086222Swollman	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
131186222Swollman	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
131286222Swollman	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
13132742Swollman	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
13142742Swollman	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1315248307Sedwin
1316248307Sedwin	iovcnt = lmsg.msg_iovlen;
1317248307Sedwin
1318248307Sedwin	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
131920094Swollman		/*
1320158421Swollman		 * Unless this is XPG 4.2 we allow iovcnt == 0 to
1321158421Swollman		 * be compatible with SunOS 4.X and 4.4BSD.
13222742Swollman		 */
1323248307Sedwin		if (iovcnt != 0 || (flags & MSG_XPG4_2))
1324248307Sedwin			return (set_errno(EMSGSIZE));
132520094Swollman	}
13262742Swollman
13272742Swollman#ifdef _SYSCALL32_IMPL
13282742Swollman	/*
13292742Swollman	 * 32-bit callers need to have their iovec expanded, while ensuring
1330248307Sedwin	 * that they can't move more than 2Gbytes of data in a single call.
1331248307Sedwin	 */
1332158421Swollman	if (model == DATAMODEL_ILP32) {
1333169811Swollman		struct iovec32 aiov32[MSG_MAXIOVLEN];
1334158421Swollman		ssize32_t count32;
1335158421Swollman
1336158421Swollman		if (iovcnt != 0 &&
1337158421Swollman		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1338158421Swollman		    iovcnt * sizeof (struct iovec32)))
1339169811Swollman			return (set_errno(EFAULT));
1340158421Swollman
1341158421Swollman		count32 = 0;
1342158421Swollman		for (i = 0; i < iovcnt; i++) {
1343158421Swollman			ssize32_t iovlen32;
1344158421Swollman
1345158421Swollman			iovlen32 = aiov32[i].iov_len;
1346169811Swollman			count32 += iovlen32;
13472742Swollman			if (iovlen32 < 0 || count32 < 0)
1348248307Sedwin				return (set_errno(EINVAL));
134919878Swollman			aiov[i].iov_len = iovlen32;
13502742Swollman			aiov[i].iov_base =
13512742Swollman			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1352223629Sedwin		}
1353223629Sedwin	} else
1354223629Sedwin#endif /* _SYSCALL32_IMPL */
1355223629Sedwin	if (iovcnt != 0 &&
1356223629Sedwin	    copyin(lmsg.msg_iov, aiov,
1357223629Sedwin	    (unsigned)iovcnt * sizeof (struct iovec))) {
1358223629Sedwin		return (set_errno(EFAULT));
1359223629Sedwin	}
13602742Swollman	len = 0;
1361169811Swollman	for (i = 0; i < iovcnt; i++) {
1362248307Sedwin		ssize_t iovlen = aiov[i].iov_len;
1363248307Sedwin		len += iovlen;
1364169811Swollman		if (iovlen < 0 || len < 0) {
1365169811Swollman			return (set_errno(EINVAL));
1366169811Swollman		}
1367169811Swollman	}
1368169811Swollman	auio.uio_loffset = 0;
1369169811Swollman	auio.uio_iov = aiov;
13702742Swollman	auio.uio_iovcnt = iovcnt;
13712742Swollman	auio.uio_resid = len;
137219878Swollman	auio.uio_segflg = UIO_USERSPACE;
137319878Swollman	auio.uio_limit = 0;
13742742Swollman
137519878Swollman	return (sendit(sock, &lmsg, &auio, flags));
137619878Swollman}
13772742Swollman
13782742Swollmanssize_t
137975267Swollmansendto(int sock, void *buffer, size_t len, int flags,
1380158421Swollman    struct sockaddr *name, socklen_t namelen)
1381158421Swollman{
1382158421Swollman	struct nmsghdr lmsg;
138375267Swollman	struct uio auio;
138475267Swollman	struct iovec aiov[1];
138575267Swollman
138675267Swollman	dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
138775267Swollman		sock, buffer, len, flags, name, namelen));
138875267Swollman
138975267Swollman	if ((ssize_t)len < 0) {
139075267Swollman		return (set_errno(EINVAL));
139175267Swollman	}
139275267Swollman
139375267Swollman	aiov[0].iov_base = buffer;
139475267Swollman	aiov[0].iov_len = len;
139575267Swollman	auio.uio_loffset = 0;
139675267Swollman	auio.uio_iov = aiov;
139775267Swollman	auio.uio_iovcnt = 1;
139875267Swollman	auio.uio_resid = len;
139975267Swollman	auio.uio_segflg = UIO_USERSPACE;
140075267Swollman	auio.uio_limit = 0;
140175267Swollman
140275267Swollman	lmsg.msg_name = (char *)name;
140375267Swollman	lmsg.msg_namelen = namelen;
140475267Swollman	lmsg.msg_control = NULL;
140575267Swollman	if (!(flags & MSG_XPG4_2)) {
140675267Swollman		/*
140775267Swollman		 * In order to be compatible with the libsocket/sockmod
140875267Swollman		 * implementation we set EOR for all send* calls.
140975267Swollman		 */
141075267Swollman		flags |= MSG_EOR;
141175267Swollman	}
141275267Swollman	return (sendit(sock, &lmsg, &auio, flags));
141375267Swollman}
141475267Swollman
141575267Swollman/*ARGSUSED3*/
141675267Swollmanint
141775267Swollmangetpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
141875267Swollman{
141975267Swollman	struct sonode *so;
142075267Swollman	int error;
142175267Swollman	socklen_t namelen;
142275267Swollman	union {
142375267Swollman		struct sockaddr_in sin;
142475267Swollman		struct sockaddr_in6 sin6;
142575267Swollman	} sin;			/* Temporary buffer, common case */
142675267Swollman	void *addr;		/* Temporary buffer, uncommon case */
142775267Swollman	socklen_t addrlen, size;
142875267Swollman
1429220549Sedwin	dprint(1, ("getpeername(%d, %p, %p)\n",
1430220549Sedwin		sock, name, namelenp));
1431220549Sedwin
1432220549Sedwin	if ((so = getsonode(sock, &error, NULL)) == NULL)
1433220549Sedwin		goto bad;
1434220549Sedwin
1435220549Sedwin	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1436220549Sedwin	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1437220549Sedwin	    (name == NULL && namelen != 0)) {
1438220549Sedwin		error = EFAULT;
1439220549Sedwin		goto rel_out;
1440220549Sedwin	}
1441220549Sedwin	/*
1442220549Sedwin	 * If a connect or accept has been done, unless we're an Xnet socket,
1443220549Sedwin	 * the remote address has already been updated in so_faddr_sa.
1444220549Sedwin	 */
1445220549Sedwin	if (so->so_version != SOV_SOCKSTREAM && so->so_version != SOV_SOCKBSD ||
1446220549Sedwin	    !(so->so_state & SS_FADDR_VALID)) {
1447233445Sedwin		if ((error = SOP_GETPEERNAME(so)) != 0)
1448233445Sedwin			goto rel_out;
1449233445Sedwin	}
1450233445Sedwin
1451233445Sedwin	if (so->so_faddr_maxlen <= sizeof (sin)) {
1452233445Sedwin		size = 0;
1453233445Sedwin		addr = &sin;
1454233445Sedwin	} else {
1455233445Sedwin		/*
1456233445Sedwin		 * Allocate temporary to avoid holding so_lock across
1457233445Sedwin		 * copyout
1458233445Sedwin		 */
1459233445Sedwin		size = so->so_faddr_maxlen;
1460233445Sedwin		addr = kmem_alloc(size, KM_SLEEP);
1461233445Sedwin	}
14622742Swollman	/* Prevent so_faddr_sa/len from changing while accessed */
146319878Swollman	mutex_enter(&so->so_lock);
146419878Swollman	if (!(so->so_state & SS_ISCONNECTED)) {
146519878Swollman		mutex_exit(&so->so_lock);
146619878Swollman		error = ENOTCONN;
146719878Swollman		goto free_out;
146819878Swollman	}
146919878Swollman	addrlen = so->so_faddr_len;
147019878Swollman	bcopy(so->so_faddr_sa, addr, addrlen);
147175267Swollman	mutex_exit(&so->so_lock);
147275267Swollman
1473220549Sedwin	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1474233445Sedwin	error = copyout_name(name, namelen, namelenp, addr,
14752742Swollman	    (so->so_state & SS_FADDR_NOXLATE) ? 0 : addrlen);
14762742Swollmanfree_out:
147719878Swollman	if (size != 0)
147819878Swollman		kmem_free(addr, size);
147919878Swollmanrel_out:
1480233445Sedwin	releasef(sock);
1481233445Sedwinbad:	return (error != 0 ? set_errno(error) : 0);
14822742Swollman}
14832742Swollman
14842742Swollman/*ARGSUSED3*/
14852742Swollmanint
148619878Swollmangetsockname(int sock, struct sockaddr *name,
148719878Swollman		socklen_t *namelenp, int version)
14882742Swollman{
14892742Swollman	struct sonode *so;
14902742Swollman	int error;
14912742Swollman	socklen_t namelen;
149219878Swollman	union {
149319878Swollman		struct sockaddr_in sin;
149420094Swollman		struct sockaddr_in6 sin6;
149520094Swollman	} sin;			/* Temporary buffer, common case */
149620094Swollman	void *addr;		/* Temporary buffer, uncommon case */
14972742Swollman	socklen_t addrlen, size;
14982742Swollman
1499158421Swollman	dprint(1, ("getsockname(%d, %p, %p)\n",
1500158421Swollman		sock, name, namelenp));
1501158421Swollman
150258787Sru	if ((so = getsonode(sock, &error, NULL)) == NULL)
15032742Swollman		goto bad;
150458787Sru
150519878Swollman	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
150658787Sru	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
150719878Swollman	    (name == NULL && namelen != 0)) {
150820094Swollman		error = EFAULT;
150920094Swollman		goto rel_out;
151020094Swollman	}
151120094Swollman
151220094Swollman	/*
151330711Swollman	 * If a bind or accept has been done, unless we're an Xnet endpoint,
151420094Swollman	 * the local address has already been updated in so_laddr_sa.
151558787Sru	 */
151675267Swollman	if ((so->so_version != SOV_SOCKSTREAM &&
151767578Swollman	    so->so_version != SOV_SOCKBSD) ||
151867578Swollman	    !(so->so_state & SS_LADDR_VALID)) {
151967578Swollman		if ((error = SOP_GETSOCKNAME(so)) != 0)
152075267Swollman			goto rel_out;
152175267Swollman	}
152275267Swollman
152375267Swollman	if (so->so_laddr_maxlen <= sizeof (sin)) {
152475267Swollman		size = 0;
152575267Swollman		addr = &sin;
152675267Swollman	} else {
152775267Swollman		/*
152875267Swollman		 * Allocate temporary to avoid holding so_lock across
152975267Swollman		 * copyout
153075267Swollman		 */
153175267Swollman		size = so->so_laddr_maxlen;
153293799Swollman		addr = kmem_alloc(size, KM_SLEEP);
1533158421Swollman	}
153475267Swollman	/* Prevent so_laddr_sa/len from changing while accessed */
1535158421Swollman	mutex_enter(&so->so_lock);
153675267Swollman	addrlen = so->so_laddr_len;
153793799Swollman	bcopy(so->so_laddr_sa, addr, addrlen);
153893799Swollman	mutex_exit(&so->so_lock);
153993799Swollman
154093799Swollman	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
154193799Swollman	error = copyout_name(name, namelen, namelenp,
1542149514Swollman	    addr, addrlen);
1543149514Swollman	if (size != 0)
1544149514Swollman		kmem_free(addr, size);
1545149514Swollmanrel_out:
1546149514Swollman	releasef(sock);
1547149514Swollmanbad:	return (error != 0 ? set_errno(error) : 0);
1548149514Swollman}
1549169811Swollman
1550169811Swollman/*ARGSUSED5*/
1551169811Swollmanint
1552204566Sedwingetsockopt(int sock,
1553204566Sedwin	int level,
1554204566Sedwin	int option_name,
1555204566Sedwin	void *option_value,
1556204566Sedwin	socklen_t *option_lenp,
1557204566Sedwin	int version)
1558204566Sedwin{
1559204566Sedwin	struct sonode *so;
1560204566Sedwin	socklen_t optlen, optlen_res;
1561204566Sedwin	void *optval;
1562204566Sedwin	int error;
1563204566Sedwin
1564204566Sedwin	dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1565204566Sedwin		sock, level, option_name, option_value, option_lenp));
1566204566Sedwin
1567204566Sedwin	if ((so = getsonode(sock, &error, NULL)) == NULL)
1568204887Sedwin		return (set_errno(error));
1569204566Sedwin
1570248307Sedwin	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1571248307Sedwin	if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1572248307Sedwin		releasef(sock);
1573248307Sedwin		return (set_errno(EFAULT));
1574248307Sedwin	}
1575248307Sedwin	/*
1576248307Sedwin	 * Verify that the length is not excessive to prevent
1577248307Sedwin	 * an application from consuming all of kernel memory.
1578248307Sedwin	 */
157975267Swollman	if (optlen > SO_MAXARGSIZE) {
15802742Swollman		error = EINVAL;
15812742Swollman		releasef(sock);
158219878Swollman		return (set_errno(error));
158319878Swollman	}
158419878Swollman	optval = kmem_alloc(optlen, KM_SLEEP);
158519878Swollman	optlen_res = optlen;
15862742Swollman	error = SOP_GETSOCKOPT(so, level, option_name, optval,
15872742Swollman	    &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2);
1588136638Swollman	releasef(sock);
1589136638Swollman	if (error) {
1590136638Swollman		kmem_free(optval, optlen);
1591149514Swollman		return (set_errno(error));
1592136638Swollman	}
1593136638Swollman	error = copyout_arg(option_value, optlen, option_lenp,
1594158421Swollman	    optval, optlen_res);
1595158421Swollman	kmem_free(optval, optlen);
1596136638Swollman	if (error)
15972742Swollman		return (set_errno(error));
159819878Swollman	return (0);
159919878Swollman}
160019878Swollman
160119878Swollman/*ARGSUSED5*/
1602136638Swollmanint
1603136638Swollmansetsockopt(int sock,
160419878Swollman	int level,
160519878Swollman	int option_name,
1606158421Swollman	void *option_value,
160758787Sru	socklen_t option_len,
160858787Sru	int version)
16092742Swollman{
16102742Swollman	struct sonode *so;
161186222Swollman	intptr_t buffer[2];
161219878Swollman	void *optval = NULL;
16132742Swollman	int error;
16142742Swollman
16152742Swollman	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
16162742Swollman		sock, level, option_name, option_value, option_len));
161719878Swollman
16182742Swollman	if ((so = getsonode(sock, &error, NULL)) == NULL)
16192742Swollman		return (set_errno(error));
162058787Sru
16212742Swollman	if (option_value != NULL) {
16222742Swollman		if (option_len != 0) {
16232742Swollman			/*
16242742Swollman			 * Verify that the length is not excessive to prevent
16252742Swollman			 * an application from consuming all of kernel memory.
16262742Swollman			 */
162719878Swollman			if (option_len > SO_MAXARGSIZE) {
162819878Swollman				error = EINVAL;
162919878Swollman				goto done2;
16302742Swollman			}
16312742Swollman			optval = option_len <= sizeof (buffer) ?
16322742Swollman			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
16332742Swollman			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
16342742Swollman			if (copyin(option_value, optval, (size_t)option_len)) {
16352742Swollman				error = EFAULT;
16362742Swollman				goto done1;
1637149514Swollman			}
16382742Swollman		}
1639158421Swollman	} else
16402742Swollman		option_len = 0;
1641158421Swollman
164219878Swollman	error = SOP_SETSOCKOPT(so, level, option_name, optval,
164319878Swollman	    (t_uscalar_t)option_len);
164419878Swollmandone1:
164519878Swollman	if (optval != buffer)
1646158421Swollman		kmem_free(optval, (size_t)option_len);
164719878Swollmandone2:
164819878Swollman	releasef(sock);
164919878Swollman	if (error)
1650158421Swollman		return (set_errno(error));
165119878Swollman	return (0);
16522742Swollman}
1653158421Swollman
1654158421Swollman/*
1655158421Swollman * Add config info when devpath is non-NULL; delete info when devpath is NULL.
165619878Swollman * devpath is a user address.
165719878Swollman */
165819878Swollmanint
165919878Swollmansockconfig(int domain, int type, int protocol, char *devpath)
166019878Swollman{
166119878Swollman	char *kdevpath;		/* Copied in devpath string */
166219878Swollman	size_t kdevpathlen;
166319878Swollman	int error = 0;
166419878Swollman
166519878Swollman	dprint(1, ("sockconfig(%d, %d, %d, %p)\n",
166619878Swollman		domain, type, protocol, devpath));
166719878Swollman
166819878Swollman	if (secpolicy_net_config(CRED(), B_FALSE) != 0)
166919878Swollman		return (set_errno(EPERM));
167019878Swollman
167119878Swollman	if (devpath == NULL) {
167219878Swollman		/* Deleting an entry */
167319878Swollman		kdevpath = NULL;
167419878Swollman		kdevpathlen = 0;
167519878Swollman	} else {
167619878Swollman		/*
167719878Swollman		 * Adding an entry.
167819878Swollman		 * Copyin the devpath.
167919878Swollman		 * This also makes it possible to check for too long pathnames.
168019878Swollman		 * Compress the space needed for the devpath before passing it
1681158421Swollman		 * to soconfig - soconfig will store the string until
168258787Sru		 * the configuration is removed.
168320094Swollman		 */
168420094Swollman		char *buf;
168530711Swollman
168620094Swollman		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1687136638Swollman		if ((error = copyinstr(devpath, buf, MAXPATHLEN,
1688136638Swollman		    &kdevpathlen)) != 0) {
1689136638Swollman			kmem_free(buf, MAXPATHLEN);
1690149514Swollman			goto done;
1691149514Swollman		}
1692149514Swollman
1693149514Swollman		kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP);
1694149514Swollman		bcopy(buf, kdevpath, kdevpathlen);
1695149514Swollman		kdevpath[kdevpathlen - 1] = '\0';
1696153670Swollman
1697153670Swollman		kmem_free(buf, MAXPATHLEN);
1698153670Swollman	}
1699153670Swollman	error = soconfig(domain, type, protocol, kdevpath, (int)kdevpathlen);
1700153670Swollmandone:
1701153670Swollman	if (error) {
1702163302Sru		eprintline(error);
1703163302Sru		return (set_errno(error));
1704163302Sru	}
1705163302Sru	return (0);
17062742Swollman}
17072742Swollman
170819878Swollman
170919878Swollman/*
171019878Swollman * Sendfile is implemented through two schemes, direct I/O or by
17112742Swollman * caching in the filesystem page cache. We cache the input file by
17122742Swollman * default and use direct I/O only if sendfile_max_size is set
1713172479Sedwin * appropriately as explained below. Note that this logic is consistent
1714174242Sedwin * with other filesystems where caching is turned on by default
1715174242Sedwin * unless explicitly turned off by using the DIRECTIO ioctl.
1716174242Sedwin *
1717174242Sedwin * We choose a slightly different scheme here. One can turn off
1718174242Sedwin * caching by setting sendfile_max_size to 0. One can also enable
1719174242Sedwin * caching of files <= sendfile_max_size by setting sendfile_max_size
1720174242Sedwin * to an appropriate value. By default sendfile_max_size is set to the
1721172479Sedwin * maximum value so that all files are cached. In future, we may provide
17222742Swollman * better interfaces for caching the file.
17232742Swollman *
172486222Swollman * Sendfile through Direct I/O (Zero copy)
172519878Swollman * --------------------------------------
1726174242Sedwin *
1727172479Sedwin * As disks are normally slower than the network, we can't have a
1728 * single thread that reads the disk and writes to the network. We
1729 * need to have parallelism. This is done by having the sendfile
1730 * thread create another thread that reads from the filesystem
1731 * and queues it for network processing. In this scheme, the data
 * is never copied anywhere, i.e. it is zero copy, unlike the other
1733 * scheme.
1734 *
1735 * We have a sendfile queue (snfq) where each sendfile
1736 * request (snf_req_t) is queued for processing by a thread. Number
1737 * of threads is dynamically allocated and they exit if they are idling
1738 * beyond a specified amount of time. When each request (snf_req_t) is
1739 * processed by a thread, it produces a number of mblk_t structures to
1740 * be consumed by the sendfile thread. snf_deque and snf_enque are
1741 * used for consuming and producing mblks. Size of the filesystem
1742 * read is determined by the tuneable (sendfile_read_size). A single
1743 * mblk holds sendfile_read_size worth of data (except the last
1744 * read of the file) which is sent down as a whole to the network.
1745 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1746 * value for the UFS filesystem backed by a striped storage array.
1747 *
1748 * Synchronisation between read (producer) and write (consumer) threads.
1749 * --------------------------------------------------------------------
1750 *
1751 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1752 * adding and deleting items in this list. Error can happen anytime
1753 * during read or write. There could be unprocessed mblks in the
1754 * sr_ib_XXX list when a read or write error occurs. Whenever error
1755 * is encountered, we need two things to happen :
1756 *
 * a) One of the threads needs to clean up the mblks.
1758 * b) When one thread encounters an error, the other should stop.
1759 *
1760 * For (a), we don't want to penalise the reader thread as it could do
1761 * some useful work processing other requests. For (b), the error can
1762 * be detected by examining sr_read_error or sr_write_error.
1763 * sr_lock protects sr_read_error and sr_write_error. If both reader and
 * writer encounter an error, we need to report the write error back to
1765 * the application as that's what would have happened if the operations
1766 * were done sequentially. With this in mind, following should work :
1767 *
1768 * 	- Check for errors before read or write.
1769 *	- If the reader encounters error, set the error in sr_read_error.
1770 *	  Check sr_write_error, if it is set, send cv_signal as it is
1771 *	  waiting for reader to complete. If it is not set, the writer
1772 *	  is either running sinking data to the network or blocked
1773 *        because of flow control. For handling the latter case, we
1774 *	  always send a signal. In any case, it will examine sr_read_error
1775 *	  and return. sr_read_error is marked with SR_READ_DONE to tell
1776 *	  the writer that the reader is done in all the cases.
1777 *	- If the writer encounters error, set the error in sr_write_error.
1778 *	  The reader thread is either blocked because of flow control or
1779 *	  running reading data from the disk. For the former, we need to
1780 *	  wakeup the thread. Again to keep it simple, we always wake up
1781 *	  the reader thread. Then, wait for the read thread to complete
1782 *	  if it is not done yet. Cleanup and return.
1783 *
1784 * High and low water marks for the read thread.
1785 * --------------------------------------------
1786 *
1787 * If sendfile() is used to send data over a slow network, we need to
1788 * make sure that the read thread does not produce data at a faster
1789 * rate than the network. This can happen if the disk is faster than
1790 * the network. In such a case, we don't want to build a very large queue.
1791 * But we would still like to get all of the network throughput possible.
1792 * This implies that network should never block waiting for data.
1793 * As there are lot of disk throughput/network throughput combinations
1794 * possible, it is difficult to come up with an accurate number.
1795 * A typical 10K RPM disk has a max seek latency 17ms and rotational
1796 * latency of 3ms for reading a disk block. Thus, the total latency to
1797 * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
1799 * for network is 100MB/sec. If the thread is blocked because of flow
1800 * control, it would take 25ms to get new data ready for transmission.
1801 * We have to make sure that network is not idling, while we are initiating
1802 * new transfers. So, at 100MB/sec, to keep network busy we would need
 * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1804 * We need to pick a high water mark so that the woken up thread would
1805 * do considerable work before blocking again to prevent thrashing. Currently,
1806 * we pick this to be 10 times that of the low water mark.
1807 *
1808 * Sendfile with segmap caching (One copy from page cache to mblks).
1809 * ----------------------------------------------------------------
1810 *
1811 * We use the segmap cache for caching the file, if the size of file
1812 * is <= sendfile_max_size. In this case we don't use threads as VM
1813 * is reasonably fast enough to keep up with the network. If the underlying
1814 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1815 * of data into segmap space, and use the virtual address from segmap
1816 * directly through desballoc() to avoid copy. Once the transport is done
1817 * with the data, the mapping will be released through segmap_release()
1818 * called by the call-back routine.
1819 *
1820 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1821 * to copy the data from the filesystem into our temporary network buffer.
1822 *
1823 * To disable caching, set sendfile_max_size to 0.
1824 */
1825
1826uint_t sendfile_read_size = 1024 * 1024;
1827#define	SENDFILE_REQ_LOWAT	3 * 1024 * 1024
1828uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
1829uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
1830struct sendfile_stats sf_stats;
1831struct sendfile_queue *snfq;
1832clock_t snfq_timeout;
1833off64_t sendfile_max_size;
1834
1835static void snf_enque(snf_req_t *, mblk_t *);
1836static mblk_t *snf_deque(snf_req_t *);
1837
1838void
1839sendfile_init(void)
1840{
1841	snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
1842
1843	mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
1844	cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
1845	snfq->snfq_max_threads = max_ncpus;
1846	snfq_timeout = SNFQ_TIMEOUT;
1847	/* Cache all files by default. */
1848	sendfile_max_size = MAXOFFSET_T;
1849}
1850
1851/*
1852 * Queues a mblk_t for network processing.
1853 */
1854static void
1855snf_enque(snf_req_t *sr, mblk_t *mp)
1856{
1857	mp->b_next = NULL;
1858	mutex_enter(&sr->sr_lock);
1859	if (sr->sr_mp_head == NULL) {
1860		sr->sr_mp_head = sr->sr_mp_tail = mp;
1861		cv_signal(&sr->sr_cv);
1862	} else {
1863		sr->sr_mp_tail->b_next = mp;
1864		sr->sr_mp_tail = mp;
1865	}
1866	sr->sr_qlen += MBLKL(mp);
1867	while ((sr->sr_qlen > sr->sr_hiwat) &&
1868	    (sr->sr_write_error == 0)) {
1869		sf_stats.ss_full_waits++;
1870		cv_wait(&sr->sr_cv, &sr->sr_lock);
1871	}
1872	mutex_exit(&sr->sr_lock);
1873}
1874
1875/*
1876 * De-queues a mblk_t for network processing.
1877 */
1878static mblk_t *
1879snf_deque(snf_req_t *sr)
1880{
1881	mblk_t *mp;
1882
1883	mutex_enter(&sr->sr_lock);
1884	/*
1885	 * If we have encountered an error on read or read is
1886	 * completed and no more mblks, return NULL.
1887	 * We need to check for NULL sr_mp_head also as
1888	 * the reads could have completed and there is
1889	 * nothing more to come.
1890	 */
1891	if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
1892	    ((sr->sr_read_error & SR_READ_DONE) &&
1893	    sr->sr_mp_head == NULL)) {
1894		mutex_exit(&sr->sr_lock);
1895		return (NULL);
1896	}
1897	/*
1898	 * To start with neither SR_READ_DONE is marked nor
1899	 * the error is set. When we wake up from cv_wait,
1900	 * following are the possibilities :
1901	 *
1902	 *	a) sr_read_error is zero and mblks are queued.
1903	 *	b) sr_read_error is set to SR_READ_DONE
1904	 *	   and mblks are queued.
1905	 *	c) sr_read_error is set to SR_READ_DONE
1906	 *	   and no mblks.
1907	 *	d) sr_read_error is set to some error other
1908	 *	   than SR_READ_DONE.
1909	 */
1910
1911	while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
1912		sf_stats.ss_empty_waits++;
1913		cv_wait(&sr->sr_cv, &sr->sr_lock);
1914	}
1915	/* Handle (a) and (b) first  - the normal case. */
1916	if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
1917	    (sr->sr_mp_head != NULL)) {
1918		mp = sr->sr_mp_head;
1919		sr->sr_mp_head = mp->b_next;
1920		sr->sr_qlen -= MBLKL(mp);
1921		if (sr->sr_qlen < sr->sr_lowat)
1922			cv_signal(&sr->sr_cv);
1923		mutex_exit(&sr->sr_lock);
1924		mp->b_next = NULL;
1925		return (mp);
1926	}
1927	/* Handle (c) and (d). */
1928	mutex_exit(&sr->sr_lock);
1929	return (NULL);
1930}
1931
1932/*
1933 * Reads data from the filesystem and queues it for network processing.
1934 */
1935void
1936snf_async_read(snf_req_t *sr)
1937{
1938	size_t iosize;
1939	u_offset_t fileoff;
1940	u_offset_t size;
1941	int ret_size;
1942	int error;
1943	file_t *fp;
1944	mblk_t *mp;
1945
1946	fp = sr->sr_fp;
1947	size = sr->sr_file_size;
1948	fileoff = sr->sr_file_off;
1949
1950	/*
1951	 * Ignore the error for filesystems that doesn't support DIRECTIO.
1952	 */
1953	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
1954	    kcred, NULL);
1955
1956	while ((size != 0) && (sr->sr_write_error == 0)) {
1957
1958		iosize = (int)MIN(sr->sr_maxpsz, size);
1959
1960		if ((mp = allocb(iosize, BPRI_MED)) == NULL) {
1961			error = EAGAIN;
1962			break;
1963		}
1964		ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
1965
1966		/* Error or Reached EOF ? */
1967		if ((error != 0) || (ret_size == 0)) {
1968			freeb(mp);
1969			break;
1970		}
1971		mp->b_wptr = mp->b_rptr + ret_size;
1972
1973		snf_enque(sr, mp);
1974		size -= ret_size;
1975		fileoff += ret_size;
1976	}
1977	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
1978	    kcred, NULL);
1979	mutex_enter(&sr->sr_lock);
1980	sr->sr_read_error = error;
1981	sr->sr_read_error |= SR_READ_DONE;
1982	cv_signal(&sr->sr_cv);
1983	mutex_exit(&sr->sr_lock);
1984}
1985
1986void
1987snf_async_thread(void)
1988{
1989	snf_req_t *sr;
1990	callb_cpr_t cprinfo;
1991	clock_t time_left = 1;
1992	clock_t now;
1993
1994	CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
1995
1996	mutex_enter(&snfq->snfq_lock);
1997	for (;;) {
1998		/*
1999		 * If we didn't find a entry, then block until woken up
2000		 * again and then look through the queues again.
2001		 */
2002		while ((sr = snfq->snfq_req_head) == NULL) {
2003			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2004			if (time_left <= 0) {
2005				snfq->snfq_svc_threads--;
2006				CALLB_CPR_EXIT(&cprinfo);
2007				thread_exit();
2008				/* NOTREACHED */
2009			}
2010			snfq->snfq_idle_cnt++;
2011
2012			time_to_wait(&now, snfq_timeout);
2013			time_left = cv_timedwait(&snfq->snfq_cv,
2014			    &snfq->snfq_lock, now);
2015			snfq->snfq_idle_cnt--;
2016
2017			CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2018		}
2019		snfq->snfq_req_head = sr->sr_next;
2020		snfq->snfq_req_cnt--;
2021		mutex_exit(&snfq->snfq_lock);
2022		snf_async_read(sr);
2023		mutex_enter(&snfq->snfq_lock);
2024	}
2025}
2026
2027
2028snf_req_t *
2029create_thread(int operation, struct vnode *vp, file_t *fp,
2030    u_offset_t fileoff, u_offset_t size)
2031{
2032	snf_req_t *sr;
2033	stdata_t *stp;
2034
2035	sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2036
2037	sr->sr_vp = vp;
2038	sr->sr_fp = fp;
2039	stp = vp->v_stream;
2040
2041	/*
2042	 * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
2043	 * stream might be closed before thread returns from snf_async_read.
2044	 */
2045	if (stp->sd_qn_maxpsz > 0) {
2046		sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2047	} else {
2048		sr->sr_maxpsz = MAXBSIZE;
2049	}
2050
2051	sr->sr_operation = operation;
2052	sr->sr_file_off = fileoff;
2053	sr->sr_file_size = size;
2054	sr->sr_hiwat = sendfile_req_hiwat;
2055	sr->sr_lowat = sendfile_req_lowat;
2056	mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2057	cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2058	/*
2059	 * See whether we need another thread for servicing this
2060	 * request. If there are already enough requests queued
2061	 * for the threads, create one if not exceeding
2062	 * snfq_max_threads.
2063	 */
2064	mutex_enter(&snfq->snfq_lock);
2065	if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2066	    snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2067		(void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2068		    TS_RUN, minclsyspri);
2069		snfq->snfq_svc_threads++;
2070	}
2071	if (snfq->snfq_req_head == NULL) {
2072		snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2073		cv_signal(&snfq->snfq_cv);
2074	} else {
2075		snfq->snfq_req_tail->sr_next = sr;
2076		snfq->snfq_req_tail = sr;
2077	}
2078	snfq->snfq_req_cnt++;
2079	mutex_exit(&snfq->snfq_lock);
2080	return (sr);
2081}
2082
2083int
2084snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2085    ssize_t *count)
2086{
2087	snf_req_t *sr;
2088	mblk_t *mp;
2089	int iosize;
2090	int error = 0;
2091	short fflag;
2092	struct vnode *vp;
2093	int ksize;
2094
2095	ksize = 0;
2096	*count = 0;
2097
2098	vp = fp->f_vnode;
2099	fflag = fp->f_flag;
2100	if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2101		return (EAGAIN);
2102
2103	/*
2104	 * We check for read error in snf_deque. It has to check
2105	 * for successful READ_DONE and return NULL, and we might
2106	 * as well make an additional check there.
2107	 */
2108	while ((mp = snf_deque(sr)) != NULL) {
2109
2110		if (ISSIG(curthread, JUSTLOOKING)) {
2111			freeb(mp);
2112			error = EINTR;
2113			break;
2114		}
2115		iosize = MBLKL(mp);
2116
2117		if ((error = kstrwritemp(vp, mp, fflag)) != 0) {
2118			freeb(mp);
2119			break;
2120		}
2121		ksize += iosize;
2122	}
2123	*count = ksize;
2124
2125	mutex_enter(&sr->sr_lock);
2126	sr->sr_write_error = error;
2127	/* Look at the big comments on why we cv_signal here. */
2128	cv_signal(&sr->sr_cv);
2129
2130	/* Wait for the reader to complete always. */
2131	while (!(sr->sr_read_error & SR_READ_DONE)) {
2132		cv_wait(&sr->sr_cv, &sr->sr_lock);
2133	}
2134	/* If there is no write error, check for read error. */
2135	if (error == 0)
2136		error = (sr->sr_read_error & ~SR_READ_DONE);
2137
2138	if (error != 0) {
2139		mblk_t *next_mp;
2140
2141		mp = sr->sr_mp_head;
2142		while (mp != NULL) {
2143			next_mp = mp->b_next;
2144			mp->b_next = NULL;
2145			freeb(mp);
2146			mp = next_mp;
2147		}
2148	}
2149	mutex_exit(&sr->sr_lock);
2150	kmem_free(sr, sizeof (snf_req_t));
2151	return (error);
2152}
2153
/*
 * Bookkeeping attached to each mblk that snf_segmap() builds on top of a
 * segmap mapping.  It is allocated in snf_segmap() and consumed and freed
 * by snf_smap_desbfree().
 */
typedef struct {
	/* free routine/argument pair handed to desballoca() */
	frtn_t		snfi_frtn;
	/* address returned by segmap_getmapflt() for this slot */
	caddr_t		snfi_base;
	/* offset of the data within the slot (fileoff & MAXBOFFSET) */
	uint_t		snfi_mapoff;
	/* length passed to segmap_fault() for F_SOFTLOCK/F_SOFTUNLOCK */
	size_t		snfi_len;
	/* file vnode; held in snf_segmap(), released in snf_smap_desbfree() */
	vnode_t		*snfi_vp;
} snf_smap_desbinfo;
2161
2162/*
2163 * The callback function when the last ref of the mblk is dropped,
2164 * normally occurs when TCP receives the ack. But it can be the driver
2165 * too due to lazy reclaim.
2166 */
2167void
2168snf_smap_desbfree(snf_smap_desbinfo *snfi)
2169{
2170	if (!segmap_kpm) {
2171		/*
2172		 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2173		 * segmap_kpm as long as the latter never falls back to
2174		 * "use_segmap_range". (See segmap_getmapflt().)
2175		 *
2176		 * Using S_OTHER saves an redundant hat_setref() in
2177		 * segmap_unlock()
2178		 */
2179		(void) segmap_fault(kas.a_hat, segkmap,
2180		    (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2181		    snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2182		    F_SOFTUNLOCK, S_OTHER);
2183	}
2184	(void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2185	VN_RELE(snfi->snfi_vp);
2186	kmem_free(snfi, sizeof (*snfi));
2187}
2188
2189/*
2190 * Use segmap instead of bcopy to send down a chain of desballoca'ed, mblks.
2191 * Each mblk contains a segmap slot of no more than MAXBSIZE. The total
2192 * length of a chain is no more than sd_qn_maxpsz.
2193 *
2194 * At the end of the whole sendfile() operation, we wait till the data from
2195 * the last mblk is ack'ed by the transport before returning so that the
2196 * caller of sendfile() can safely modify the file content.
2197 */
2198int
2199snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2200    uint_t maxpsz, ssize_t *count, boolean_t nowait)
2201{
2202	caddr_t base;
2203	int mapoff;
2204	vnode_t *vp;
2205	mblk_t *mp, *mp1;
2206	int iosize, iosize1;
2207	int error;
2208	short fflag;
2209	int ksize;
2210	snf_smap_desbinfo *snfi;
2211	struct vattr va;
2212	boolean_t dowait = B_FALSE;
2213
2214	vp = fp->f_vnode;
2215	fflag = fp->f_flag;
2216	ksize = 0;
2217	for (;;) {
2218		if (ISSIG(curthread, JUSTLOOKING)) {
2219			error = EINTR;
2220			break;
2221		}
2222		iosize = 0;
2223		mp = NULL;
2224		do {
2225			mapoff = fileoff & MAXBOFFSET;
2226			iosize1 = MAXBSIZE - mapoff;
2227			if (iosize1 > size)
2228				iosize1 = size;
2229			/*
2230			 * we don't forcefault because we'll call
2231			 * segmap_fault(F_SOFTLOCK) next.
2232			 *
2233			 * S_READ will get the ref bit set (by either
2234			 * segmap_getmapflt() or segmap_fault()) and page
2235			 * shared locked.
2236			 */
2237			base = segmap_getmapflt(segkmap, fvp, fileoff, iosize1,
2238			    segmap_kpm ? SM_FAULT : 0, S_READ);
2239
2240			snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2241			snfi->snfi_len = (size_t)roundup(mapoff+iosize1,
2242			    PAGESIZE)- (mapoff & PAGEMASK);
2243			/*
2244			 * We must call segmap_fault() even for segmap_kpm
2245			 * because that's how error gets returned.
2246			 * (segmap_getmapflt() never fails but segmap_fault()
2247			 * does.)
2248			 */
2249			if (segmap_fault(kas.a_hat, segkmap,
2250			    (caddr_t)(uintptr_t)(((uintptr_t)base + mapoff) &
2251			    PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2252			    S_READ) != 0) {
2253				(void) segmap_release(segkmap, base, 0);
2254				kmem_free(snfi, sizeof (*snfi));
2255				freemsg(mp);
2256				error = EIO;
2257				goto out;
2258			}
2259			snfi->snfi_frtn.free_func = snf_smap_desbfree;
2260			snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2261			snfi->snfi_base = base;
2262			snfi->snfi_mapoff = mapoff;
2263			mp1 = desballoca((uchar_t *)base + mapoff,
2264			    iosize1, BPRI_HI, &snfi->snfi_frtn);
2265
2266			if (mp1 == NULL) {
2267				(void) segmap_fault(kas.a_hat, segkmap,
2268				    (caddr_t)(uintptr_t)(((uintptr_t)base +
2269				    mapoff) & PAGEMASK), snfi->snfi_len,
2270				    F_SOFTUNLOCK, S_OTHER);
2271				(void) segmap_release(segkmap, base, 0);
2272				kmem_free(snfi, sizeof (*snfi));
2273				freemsg(mp);
2274				error = EAGAIN;
2275				goto out;
2276			}
2277			VN_HOLD(fvp);
2278			snfi->snfi_vp = fvp;
2279			mp1->b_wptr += iosize1;
2280
2281			/* Mark this dblk with the zero-copy flag */
2282			mp1->b_datap->db_struioflag |= STRUIO_ZC;
2283			if (mp == NULL)
2284				mp = mp1;
2285			else
2286				linkb(mp, mp1);
2287			iosize += iosize1;
2288			fileoff += iosize1;
2289			size -= iosize1;
2290		} while (iosize < maxpsz && size != 0);
2291
2292		if (size == 0 && !nowait) {
2293			ASSERT(!dowait);
2294			dowait = B_TRUE;
2295			mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2296		}
2297		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2298		if ((error = kstrwritemp(vp, mp, fflag)) != 0) {
2299			*count = ksize;
2300			freemsg(mp);
2301			return (error);
2302		}
2303		ksize += iosize;
2304		if (size == 0)
2305			goto done;
2306
2307		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2308		va.va_mask = AT_SIZE;
2309		error = VOP_GETATTR(fvp, &va, 0, kcred);
2310		if (error)
2311			break;
2312		/* Read as much as possible. */
2313		if (fileoff >= va.va_size)
2314			break;
2315		if (size + fileoff > va.va_size)
2316			size = va.va_size - fileoff;
2317	}
2318out:
2319	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2320done:
2321	*count = ksize;
2322	if (dowait) {
2323		stdata_t *stp;
2324
2325		stp = vp->v_stream;
2326		mutex_enter(&stp->sd_lock);
2327		while (!(stp->sd_flag & STZCNOTIFY)) {
2328			(void) cv_wait_sig(&stp->sd_zcopy_wait,
2329			    &stp->sd_lock);
2330		}
2331		stp->sd_flag &= ~STZCNOTIFY;
2332		mutex_exit(&stp->sd_lock);
2333	}
2334	return (error);
2335}
2336
2337int
2338snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2339    uint_t maxpsz, ssize_t *count)
2340{
2341	struct vnode *vp;
2342	mblk_t *mp;
2343	int iosize;
2344	int error;
2345	short fflag;
2346	int ksize;
2347	int ioflag;
2348	struct uio auio;
2349	struct iovec aiov;
2350	struct vattr va;
2351
2352	vp = fp->f_vnode;
2353	fflag = fp->f_flag;
2354	ksize = 0;
2355	auio.uio_iov = &aiov;
2356	auio.uio_iovcnt = 1;
2357	auio.uio_segflg = UIO_SYSSPACE;
2358	auio.uio_llimit = MAXOFFSET_T;
2359	auio.uio_fmode = fflag;
2360	auio.uio_extflg = UIO_COPY_CACHED;
2361	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2362	/* If read sync is not asked for, filter sync flags */
2363	if ((ioflag & FRSYNC) == 0)
2364		ioflag &= ~(FSYNC|FDSYNC);
2365	for (;;) {
2366		if (ISSIG(curthread, JUSTLOOKING)) {
2367			error = EINTR;
2368			break;
2369		}
2370		iosize = (int)MIN(maxpsz, size);
2371		if ((mp = allocb(iosize, BPRI_MED)) == NULL) {
2372			error = EAGAIN;
2373			break;
2374		}
2375		aiov.iov_base = (caddr_t)mp->b_rptr;
2376		aiov.iov_len = iosize;
2377		auio.uio_loffset = fileoff;
2378		auio.uio_resid = iosize;
2379
2380		error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2381		iosize -= auio.uio_resid;
2382
2383		if (error == EINTR && iosize != 0)
2384			error = 0;
2385
2386		if (error != 0 || iosize == 0) {
2387			freeb(mp);
2388			break;
2389		}
2390		mp->b_wptr = mp->b_rptr + iosize;
2391
2392		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2393		if ((error = kstrwritemp(vp, mp, fflag)) != 0) {
2394			*count = ksize;
2395			freeb(mp);
2396			return (error);
2397		}
2398		ksize += iosize;
2399		size -= iosize;
2400		if (size == 0)
2401			goto done;
2402
2403		fileoff += iosize;
2404		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2405		va.va_mask = AT_SIZE;
2406		error = VOP_GETATTR(fvp, &va, 0, kcred);
2407		if (error)
2408			break;
2409		/* Read as much as possible. */
2410		if (fileoff >= va.va_size)
2411			size = 0;
2412		else if (size + fileoff > va.va_size)
2413			size = va.va_size - fileoff;
2414	}
2415	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2416done:
2417	*count = ksize;
2418	return (error);
2419}
2420
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * Largefile support for 32 bit applications only.
 *
 * Sends the file range described by `sfv' from `rfp' to the socket `fp'.
 * After validating and clipping the range, one of three paths is taken:
 *
 *	- snf_direct_io() when the length exceeds sendfile_max_size;
 *	- snf_segmap() when zero-copy is judged worthwhile and is
 *	  available on the socket;
 *	- snf_cache() otherwise.
 *
 * *count32 receives the number of bytes sent.  The hold on sfv->sfv_fd is
 * released on every path.  Returns 0 or an errno value.
 */
int
sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
    ssize32_t *count32)
{
	ssize32_t sfv_len;
	u_offset_t sfv_off, va_size;
	struct vnode *vp, *fvp, *realvp;
	struct vattr va;
	stdata_t *stp;
	ssize_t count = 0;
	int error = 0;
	boolean_t dozcopy = B_FALSE;
	uint_t maxpsz;

	sfv_len = (ssize32_t)sfv->sfv_len;
	if (sfv_len <= 0) {
		/* A negative length is invalid; zero is an empty success. */
		if (sfv_len < 0)
			error = EINVAL;
		goto out;
	}

	/* Same checks as in pread */
	sfv_off = (u_offset_t)sfv->sfv_off;
	if (sfv_off > MAXOFFSET_T) {
		error = EINVAL;
		goto out;
	}
	if (sfv_off + sfv_len > MAXOFFSET_T)
		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

	/*
	 * There are no more checks on sfv_len. So, we cast it to
	 * u_offset_t and share the snf_direct_io/snf_cache code between
	 * 32 bit and 64 bit.
	 *
	 * TODO: should do nbl_need_check() like read()?
	 */
	if (sfv_len > sendfile_max_size) {
		sf_stats.ss_file_not_cached++;
		error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
		    &count);
		goto out;
	}

	fvp = rfp->f_vnode;
	if (VOP_REALVP(fvp, &realvp) == 0)
		fvp = realvp;

	/*
	 * Grab the lock as a reader to prevent the file size
	 * from changing underneath.
	 */
	(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
	va.va_mask = AT_SIZE;
	error = VOP_GETATTR(fvp, &va, 0, kcred);
	va_size = va.va_size;
	if (error != 0 || va_size == 0 || sfv_off >= va_size) {
		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		goto out;
	}

	/* Read as much as possible. */
	if (sfv_off + sfv_len > va_size)
		sfv_len = va_size - sfv_off;

	vp = fp->f_vnode;
	stp = vp->v_stream;
	if (stp->sd_qn_maxpsz != INFPSZ)
		maxpsz = roundup(stp->sd_qn_maxpsz, MAXBSIZE);
	else
		maxpsz = MAXOFF32_T;

	/*
	 * When the NOWAIT flag is not set, we enable zero-copy only if the
	 * transfer size is large enough. This prevents performance loss
	 * when the caller sends the file piece by piece.
	 */
	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
	    !vn_has_flocks(fvp)) {
		if ((stp->sd_copyflag & (STZCVMSAFE|STZCVMUNSAFE)) != 0) {
			/* Already decided for this stream; use it if safe. */
			dozcopy = (stp->sd_copyflag & STZCVMSAFE);
		} else {
			int on = 1;

			/* Not decided yet: try to turn copy avoidance on. */
			if (SOP_SETSOCKOPT(VTOSO(vp), SOL_SOCKET,
			    SO_SND_COPYAVOID, &on, sizeof (on)) == 0)
				dozcopy = B_TRUE;
		}
	}

	/* Both helpers drop the rwlock taken above before returning. */
	if (!dozcopy) {
		sf_stats.ss_file_cached++;
		error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    maxpsz, &count);
	} else {
		sf_stats.ss_file_segmap++;
		error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    maxpsz, &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
	}
out:
	releasef(sfv->sfv_fd);
	*count32 = (ssize32_t)count;
	return (error);
}
#endif
2528
2529#ifdef _SYSCALL32_IMPL
2530/*
2531 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2532 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2533 */
2534
2535ssize_t
2536recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2537{
2538	return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2539}
2540
2541ssize_t
2542recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2543	caddr32_t name, caddr32_t namelenp)
2544{
2545	return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2546	    (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2547}
2548
2549ssize_t
2550send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2551{
2552	return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2553}
2554
2555ssize_t
2556sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2557	caddr32_t name, socklen_t namelen)
2558{
2559	return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2560	    (void *)(uintptr_t)name, namelen));
2561}
2562#endif	/* _SYSCALL32_IMPL */
2563
2564/*
 * Function wrappers (mostly around the sonode switch) for
2566 * backward compatibility.
2567 */
2568
/*
 * Compatibility wrapper: hands the accept request for `so' to
 * SOP_ACCEPT() and returns its result unchanged.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
	int	rval;

	rval = SOP_ACCEPT(so, fflag, nsop);
	return (rval);
}
2574
/*
 * Compatibility wrapper: binds `so' via SOP_BIND() and, when the bind
 * succeeded and a non-zero backlog was supplied, follows up with
 * SOP_LISTEN().
 *
 * Returns the SOP_BIND() result if the bind failed or no backlog was
 * requested; otherwise returns the SOP_LISTEN() result.
 */
int
sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int backlog, int flags)
{
	int	rval = SOP_BIND(so, name, namelen, flags);

	/* Nothing more to do on a failed bind or when no listen is wanted. */
	if (rval != 0 || backlog == 0)
		return (rval);

	return (SOP_LISTEN(so, backlog));
}
2587
/*
 * Compatibility wrapper: passes the listen request for `so' with the
 * given backlog to SOP_LISTEN() and returns its result unchanged.
 */
int
solisten(struct sonode *so, int backlog)
{
	int	rval;

	rval = SOP_LISTEN(so, backlog);
	return (rval);
}
2593
/*
 * Compatibility wrapper: passes the connect request for `so' to
 * SOP_CONNECT() and returns its result unchanged.
 */
int
soconnect(struct sonode *so, const struct sockaddr *name, socklen_t namelen,
    int fflag, int flags)
{
	int	rval;

	rval = SOP_CONNECT(so, name, namelen, fflag, flags);
	return (rval);
}
2600
/*
 * Compatibility wrapper: passes the receive request (message header and
 * uio) for `so' to SOP_RECVMSG() and returns its result unchanged.
 */
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	int	rval;

	rval = SOP_RECVMSG(so, msg, uiop);
	return (rval);
}
2606
/*
 * Compatibility wrapper: passes the send request (message header and
 * uio) for `so' to SOP_SENDMSG() and returns its result unchanged.
 */
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	int	rval;

	rval = SOP_SENDMSG(so, msg, uiop);
	return (rval);
}
2612
/*
 * Compatibility wrapper: invokes SOP_GETPEERNAME() on `so' and returns
 * its result unchanged.
 */
int
sogetpeername(struct sonode *so)
{
	int	rval;

	rval = SOP_GETPEERNAME(so);
	return (rval);
}
2618
/*
 * Compatibility wrapper: invokes SOP_GETSOCKNAME() on `so' and returns
 * its result unchanged.
 */
int
sogetsockname(struct sonode *so)
{
	int	rval;

	rval = SOP_GETSOCKNAME(so);
	return (rval);
}
2624
/*
 * Compatibility wrapper: passes the shutdown request `how' for `so' to
 * SOP_SHUTDOWN() and returns its result unchanged.
 */
int
soshutdown(struct sonode *so, int how)
{
	int	rval;

	rval = SOP_SHUTDOWN(so, how);
	return (rval);
}
2630
/*
 * Compatibility wrapper: passes the option query for `so' to
 * SOP_GETSOCKOPT() and returns its result unchanged.  optval and
 * optlenp are handed through untouched.
 */
int
sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
    socklen_t *optlenp, int flags)
{
	int	rval;

	rval = SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags);
	return (rval);
}
2638
2639int
2640sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
2641    t_uscalar_t optlen)
2642{
2643	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen));
2644}
2645
2646/*
2647 * Because this is backward compatibility interface it only needs to be
2648 * able to handle the creation of TPI sockfs sockets.
2649 */
2650struct sonode *
2651socreate(vnode_t *accessvp, int domain, int type, int protocol, int version,
2652    struct sonode *tso, int *errorp)
2653{
2654	return (sotpi_create(accessvp, domain, type, protocol, version, tso,
2655	    errorp));
2656}
2657