1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/types.h>
27#include <sys/t_lock.h>
28#include <sys/param.h>
29#include <sys/systm.h>
30#include <sys/buf.h>
31#include <sys/conf.h>
32#include <sys/cred.h>
33#include <sys/kmem.h>
34#include <sys/sysmacros.h>
35#include <sys/vfs.h>
36#include <sys/vnode.h>
37#include <sys/debug.h>
38#include <sys/errno.h>
39#include <sys/time.h>
40#include <sys/file.h>
41#include <sys/user.h>
42#include <sys/stream.h>
43#include <sys/strsubr.h>
44#include <sys/strsun.h>
45#include <sys/sunddi.h>
46#include <sys/esunddi.h>
47#include <sys/flock.h>
48#include <sys/modctl.h>
49#include <sys/cmn_err.h>
50#include <sys/vmsystm.h>
51#include <sys/policy.h>
52
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55
56#include <sys/isa_defs.h>
57#include <sys/inttypes.h>
58#include <sys/systm.h>
59#include <sys/cpuvar.h>
60#include <sys/filio.h>
61#include <sys/sendfile.h>
62#include <sys/ddi.h>
63#include <vm/seg.h>
64#include <vm/seg_map.h>
65#include <vm/seg_kpm.h>
66
67#include <fs/sockfs/nl7c.h>
68#include <fs/sockfs/sockcommon.h>
69#include <fs/sockfs/sockfilter_impl.h>
70#include <fs/sockfs/socktpi.h>
71
72#ifdef SOCK_TEST
73int do_useracc = 1;		/* Controlled by setting SO_DEBUG to 4 */
74#else
75#define	do_useracc	1
76#endif /* SOCK_TEST */
77
78extern int 	xnet_truncate_print;
79
80extern void	nl7c_init(void);
81extern int	sockfs_defer_nl7c_init;
82
/*
 * Note: MSG_MAXIOVLEN is defined and used the same way DEF_IOV_MAX is in
 *	 "fs/vncalls.c", as there isn't a formal definition of IOV_MAX ???
 */
87#define	MSG_MAXIOVLEN	16
88
89/*
90 * Kernel component of socket creation.
91 *
92 * The socket library determines which version number to use.
93 * First the library calls this with a NULL devpath. If this fails
94 * to find a transport (using solookup) the library will look in /etc/netconfig
95 * for the appropriate transport. If one is found it will pass in the
96 * devpath for the kernel to use.
97 */
98int
99so_socket(int family, int type, int protocol, char *devpath, int version)
100{
101	struct sonode *so;
102	vnode_t *vp;
103	struct file *fp;
104	int fd;
105	int error;
106
107	if (devpath != NULL) {
108		char *buf;
109		size_t kdevpathlen = 0;
110
111		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
112		if ((error = copyinstr(devpath, buf,
113		    MAXPATHLEN, &kdevpathlen)) != 0) {
114			kmem_free(buf, MAXPATHLEN);
115			return (set_errno(error));
116		}
117		so = socket_create(family, type, protocol, buf, NULL,
118		    SOCKET_SLEEP, version, CRED(), &error);
119		kmem_free(buf, MAXPATHLEN);
120	} else {
121		so = socket_create(family, type, protocol, NULL, NULL,
122		    SOCKET_SLEEP, version, CRED(), &error);
123	}
124	if (so == NULL)
125		return (set_errno(error));
126
127	/* Allocate a file descriptor for the socket */
128	vp = SOTOV(so);
129	if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
130		(void) socket_close(so, 0, CRED());
131		socket_destroy(so);
132		return (set_errno(error));
133	}
134
135	/*
136	 * Now fill in the entries that falloc reserved
137	 */
138	mutex_exit(&fp->f_tlock);
139	setf(fd, fp);
140
141	return (fd);
142}
143
144/*
145 * Map from a file descriptor to a socket node.
146 * Returns with the file descriptor held i.e. the caller has to
147 * use releasef when done with the file descriptor.
148 */
149struct sonode *
150getsonode(int sock, int *errorp, file_t **fpp)
151{
152	file_t *fp;
153	vnode_t *vp;
154	struct sonode *so;
155
156	if ((fp = getf(sock)) == NULL) {
157		*errorp = EBADF;
158		eprintline(*errorp);
159		return (NULL);
160	}
161	vp = fp->f_vnode;
162	/* Check if it is a socket */
163	if (vp->v_type != VSOCK) {
164		releasef(sock);
165		*errorp = ENOTSOCK;
166		eprintline(*errorp);
167		return (NULL);
168	}
169	/*
170	 * Use the stream head to find the real socket vnode.
171	 * This is needed when namefs sits above sockfs.
172	 */
173	if (vp->v_stream) {
174		ASSERT(vp->v_stream->sd_vnode);
175		vp = vp->v_stream->sd_vnode;
176
177		so = VTOSO(vp);
178		if (so->so_version == SOV_STREAM) {
179			releasef(sock);
180			*errorp = ENOTSOCK;
181			eprintsoline(so, *errorp);
182			return (NULL);
183		}
184	} else {
185		so = VTOSO(vp);
186	}
187	if (fpp)
188		*fpp = fp;
189	return (so);
190}
191
192/*
193 * Allocate and copyin a sockaddr.
194 * Ensures NULL termination for AF_UNIX addresses by extending them
195 * with one NULL byte if need be. Verifies that the length is not
196 * excessive to prevent an application from consuming all of kernel
197 * memory. Returns NULL when an error occurred.
198 */
199static struct sockaddr *
200copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
201	    int *errorp)
202{
203	char	*faddr;
204	size_t	namelen = (size_t)*namelenp;
205
206	ASSERT(namelen != 0);
207	if (namelen > SO_MAXARGSIZE) {
208		*errorp = EINVAL;
209		eprintsoline(so, *errorp);
210		return (NULL);
211	}
212
213	faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
214	if (copyin(name, faddr, namelen)) {
215		kmem_free(faddr, namelen);
216		*errorp = EFAULT;
217		eprintsoline(so, *errorp);
218		return (NULL);
219	}
220
221	/*
222	 * Add space for NULL termination if needed.
223	 * Do a quick check if the last byte is NUL.
224	 */
225	if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
226		/* Check if there is any NULL termination */
227		size_t	i;
228		int foundnull = 0;
229
230		for (i = sizeof (name->sa_family); i < namelen; i++) {
231			if (faddr[i] == '\0') {
232				foundnull = 1;
233				break;
234			}
235		}
236		if (!foundnull) {
237			/* Add extra byte for NUL padding */
238			char *nfaddr;
239
240			nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
241			bcopy(faddr, nfaddr, namelen);
242			kmem_free(faddr, namelen);
243
244			/* NUL terminate */
245			nfaddr[namelen] = '\0';
246			namelen++;
247			ASSERT((socklen_t)namelen == namelen);
248			*namelenp = (socklen_t)namelen;
249			faddr = nfaddr;
250		}
251	}
252	return ((struct sockaddr *)faddr);
253}
254
255/*
256 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
257 */
258static int
259copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
260		void *kaddr, socklen_t klen)
261{
262	if (uaddr != NULL) {
263		if (ulen > klen)
264			ulen = klen;
265
266		if (ulen != 0) {
267			if (copyout(kaddr, uaddr, ulen))
268				return (EFAULT);
269		}
270	} else
271		ulen = 0;
272
273	if (ulenp != NULL) {
274		if (copyout(&ulen, ulenp, sizeof (ulen)))
275			return (EFAULT);
276	}
277	return (0);
278}
279
280/*
281 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
282 * If klen is greater than ulen it still uses the non-truncated
283 * klen to update ulenp.
284 */
285static int
286copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
287		void *kaddr, socklen_t klen)
288{
289	if (uaddr != NULL) {
290		if (ulen >= klen)
291			ulen = klen;
292		else if (ulen != 0 && xnet_truncate_print) {
293			printf("sockfs: truncating copyout of address using "
294			    "XNET semantics for pid = %d. Lengths %d, %d\n",
295			    curproc->p_pid, klen, ulen);
296		}
297
298		if (ulen != 0) {
299			if (copyout(kaddr, uaddr, ulen))
300				return (EFAULT);
301		} else
302			klen = 0;
303	} else
304		klen = 0;
305
306	if (ulenp != NULL) {
307		if (copyout(&klen, ulenp, sizeof (klen)))
308			return (EFAULT);
309	}
310	return (0);
311}
312
313/*
314 * The socketpair() code in libsocket creates two sockets (using
315 * the /etc/netconfig fallback if needed) before calling this routine
316 * to connect the two sockets together.
317 *
318 * For a SOCK_STREAM socketpair a listener is needed - in that case this
319 * routine will create a new file descriptor as part of accepting the
320 * connection. The library socketpair() will check if svs[2] has changed
321 * in which case it will close the changed fd.
322 *
323 * Note that this code could use the TPI feature of accepting the connection
324 * on the listening endpoint. However, that would require significant changes
325 * to soaccept.
326 */
327int
328so_socketpair(int sv[2])
329{
330	int svs[2];
331	struct sonode *so1, *so2;
332	int error;
333	struct sockaddr_ux *name;
334	size_t namelen;
335	sotpi_info_t *sti1;
336	sotpi_info_t *sti2;
337
338	dprint(1, ("so_socketpair(%p)\n", (void *)sv));
339
340	error = useracc(sv, sizeof (svs), B_WRITE);
341	if (error && do_useracc)
342		return (set_errno(EFAULT));
343
344	if (copyin(sv, svs, sizeof (svs)))
345		return (set_errno(EFAULT));
346
347	if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
348		return (set_errno(error));
349
350	if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
351		releasef(svs[0]);
352		return (set_errno(error));
353	}
354
355	if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
356		error = EOPNOTSUPP;
357		goto done;
358	}
359
360	sti1 = SOTOTPI(so1);
361	sti2 = SOTOTPI(so2);
362
363	/*
364	 * The code below makes assumptions about the "sockfs" implementation.
365	 * So make sure that the correct implementation is really used.
366	 */
367	ASSERT(so1->so_ops == &sotpi_sonodeops);
368	ASSERT(so2->so_ops == &sotpi_sonodeops);
369
370	if (so1->so_type == SOCK_DGRAM) {
371		/*
372		 * Bind both sockets and connect them with each other.
373		 * Need to allocate name/namelen for soconnect.
374		 */
375		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
376		if (error) {
377			eprintsoline(so1, error);
378			goto done;
379		}
380		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
381		if (error) {
382			eprintsoline(so2, error);
383			goto done;
384		}
385		namelen = sizeof (struct sockaddr_ux);
386		name = kmem_alloc(namelen, KM_SLEEP);
387		name->sou_family = AF_UNIX;
388		name->sou_addr = sti2->sti_ux_laddr;
389		error = socket_connect(so1,
390		    (struct sockaddr *)name,
391		    (socklen_t)namelen,
392		    0, _SOCONNECT_NOXLATE, CRED());
393		if (error) {
394			kmem_free(name, namelen);
395			eprintsoline(so1, error);
396			goto done;
397		}
398		name->sou_addr = sti1->sti_ux_laddr;
399		error = socket_connect(so2,
400		    (struct sockaddr *)name,
401		    (socklen_t)namelen,
402		    0, _SOCONNECT_NOXLATE, CRED());
403		kmem_free(name, namelen);
404		if (error) {
405			eprintsoline(so2, error);
406			goto done;
407		}
408		releasef(svs[0]);
409		releasef(svs[1]);
410	} else {
411		/*
412		 * Bind both sockets, with so1 being a listener.
413		 * Connect so2 to so1 - nonblocking to avoid waiting for
414		 * soaccept to complete.
415		 * Accept a connection on so1. Pass out the new fd as sv[0].
416		 * The library will detect the changed fd and close
417		 * the original one.
418		 */
419		struct sonode *nso;
420		struct vnode *nvp;
421		struct file *nfp;
422		int nfd;
423
424		/*
425		 * We could simply call socket_listen() here (which would do the
426		 * binding automatically) if the code didn't rely on passing
427		 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
428		 */
429		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
430		    _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
431		    CRED());
432		if (error) {
433			eprintsoline(so1, error);
434			goto done;
435		}
436		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
437		if (error) {
438			eprintsoline(so2, error);
439			goto done;
440		}
441
442		namelen = sizeof (struct sockaddr_ux);
443		name = kmem_alloc(namelen, KM_SLEEP);
444		name->sou_family = AF_UNIX;
445		name->sou_addr = sti1->sti_ux_laddr;
446		error = socket_connect(so2,
447		    (struct sockaddr *)name,
448		    (socklen_t)namelen,
449		    FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
450		kmem_free(name, namelen);
451		if (error) {
452			if (error != EINPROGRESS) {
453				eprintsoline(so2, error); goto done;
454			}
455		}
456
457		error = socket_accept(so1, 0, CRED(), &nso);
458		if (error) {
459			eprintsoline(so1, error);
460			goto done;
461		}
462
463		/* wait for so2 being SS_CONNECTED ignoring signals */
464		mutex_enter(&so2->so_lock);
465		error = sowaitconnected(so2, 0, 1);
466		mutex_exit(&so2->so_lock);
467		if (error != 0) {
468			(void) socket_close(nso, 0, CRED());
469			socket_destroy(nso);
470			eprintsoline(so2, error);
471			goto done;
472		}
473
474		nvp = SOTOV(nso);
475		if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
476			(void) socket_close(nso, 0, CRED());
477			socket_destroy(nso);
478			eprintsoline(nso, error);
479			goto done;
480		}
481		/*
482		 * fill in the entries that falloc reserved
483		 */
484		mutex_exit(&nfp->f_tlock);
485		setf(nfd, nfp);
486
487		releasef(svs[0]);
488		releasef(svs[1]);
489		svs[0] = nfd;
490
491		/*
492		 * The socketpair library routine will close the original
493		 * svs[0] when this code passes out a different file
494		 * descriptor.
495		 */
496		if (copyout(svs, sv, sizeof (svs))) {
497			(void) closeandsetf(nfd, NULL);
498			eprintline(EFAULT);
499			return (set_errno(EFAULT));
500		}
501	}
502	return (0);
503
504done:
505	releasef(svs[0]);
506	releasef(svs[1]);
507	return (set_errno(error));
508}
509
510int
511bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
512{
513	struct sonode *so;
514	int error;
515
516	dprint(1, ("bind(%d, %p, %d)\n",
517	    sock, (void *)name, namelen));
518
519	if ((so = getsonode(sock, &error, NULL)) == NULL)
520		return (set_errno(error));
521
522	/* Allocate and copyin name */
523	/*
524	 * X/Open test does not expect EFAULT with NULL name and non-zero
525	 * namelen.
526	 */
527	if (name != NULL && namelen != 0) {
528		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
529		name = copyin_name(so, name, &namelen, &error);
530		if (name == NULL) {
531			releasef(sock);
532			return (set_errno(error));
533		}
534	} else {
535		name = NULL;
536		namelen = 0;
537	}
538
539	switch (version) {
540	default:
541		error = socket_bind(so, name, namelen, 0, CRED());
542		break;
543	case SOV_XPG4_2:
544		error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
545		break;
546	case SOV_SOCKBSD:
547		error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
548		break;
549	}
550done:
551	releasef(sock);
552	if (name != NULL)
553		kmem_free(name, (size_t)namelen);
554
555	if (error)
556		return (set_errno(error));
557	return (0);
558}
559
560/* ARGSUSED2 */
561int
562listen(int sock, int backlog, int version)
563{
564	struct sonode *so;
565	int error;
566
567	dprint(1, ("listen(%d, %d)\n",
568	    sock, backlog));
569
570	if ((so = getsonode(sock, &error, NULL)) == NULL)
571		return (set_errno(error));
572
573	error = socket_listen(so, backlog, CRED());
574
575	releasef(sock);
576	if (error)
577		return (set_errno(error));
578	return (0);
579}
580
581/*ARGSUSED3*/
582int
583accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
584{
585	struct sonode *so;
586	file_t *fp;
587	int error;
588	socklen_t namelen;
589	struct sonode *nso;
590	struct vnode *nvp;
591	struct file *nfp;
592	int nfd;
593	struct sockaddr *addrp;
594	socklen_t addrlen;
595
596	dprint(1, ("accept(%d, %p, %p)\n",
597	    sock, (void *)name, (void *)namelenp));
598
599	if ((so = getsonode(sock, &error, &fp)) == NULL)
600		return (set_errno(error));
601
602	if (name != NULL) {
603		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
604		if (copyin(namelenp, &namelen, sizeof (namelen))) {
605			releasef(sock);
606			return (set_errno(EFAULT));
607		}
608		if (namelen != 0) {
609			error = useracc(name, (size_t)namelen, B_WRITE);
610			if (error && do_useracc) {
611				releasef(sock);
612				return (set_errno(EFAULT));
613			}
614		} else
615			name = NULL;
616	} else {
617		namelen = 0;
618	}
619
620	/*
621	 * Allocate the user fd before socket_accept() in order to
622	 * catch EMFILE errors before calling socket_accept().
623	 */
624	if ((nfd = ufalloc(0)) == -1) {
625		eprintsoline(so, EMFILE);
626		releasef(sock);
627		return (set_errno(EMFILE));
628	}
629	error = socket_accept(so, fp->f_flag, CRED(), &nso);
630	if (error) {
631		setf(nfd, NULL);
632		releasef(sock);
633		return (set_errno(error));
634	}
635
636	nvp = SOTOV(nso);
637
638	ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
639	if (namelen != 0) {
640		addrlen = so->so_max_addr_len;
641		addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
642
643		if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
644		    &addrlen, B_TRUE, CRED())) == 0) {
645			error = copyout_name(name, namelen, namelenp,
646			    addrp, addrlen);
647		} else {
648			ASSERT(error == EINVAL || error == ENOTCONN);
649			error = ECONNABORTED;
650		}
651		kmem_free(addrp, so->so_max_addr_len);
652	}
653
654	if (error) {
655		setf(nfd, NULL);
656		(void) socket_close(nso, 0, CRED());
657		socket_destroy(nso);
658		releasef(sock);
659		return (set_errno(error));
660	}
661	if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
662		setf(nfd, NULL);
663		(void) socket_close(nso, 0, CRED());
664		socket_destroy(nso);
665		eprintsoline(so, error);
666		releasef(sock);
667		return (set_errno(error));
668	}
669	/*
670	 * fill in the entries that falloc reserved
671	 */
672	nfp->f_vnode = nvp;
673	mutex_exit(&nfp->f_tlock);
674	setf(nfd, nfp);
675
676	/*
677	 * Copy FNDELAY and FNONBLOCK from listener to acceptor
678	 */
679	if (so->so_state & (SS_NDELAY|SS_NONBLOCK)) {
680		uint_t oflag = nfp->f_flag;
681		int arg = 0;
682
683		if (so->so_state & SS_NONBLOCK)
684			arg |= FNONBLOCK;
685		else if (so->so_state & SS_NDELAY)
686			arg |= FNDELAY;
687
688		/*
689		 * This code is a simplification of the F_SETFL code in fcntl()
690		 * Ignore any errors from VOP_SETFL.
691		 */
692		if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
693		    != 0) {
694			eprintsoline(so, error);
695			error = 0;
696		} else {
697			mutex_enter(&nfp->f_tlock);
698			nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
699			nfp->f_flag |= arg;
700			mutex_exit(&nfp->f_tlock);
701		}
702	}
703	releasef(sock);
704	return (nfd);
705}
706
707int
708connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
709{
710	struct sonode *so;
711	file_t *fp;
712	int error;
713
714	dprint(1, ("connect(%d, %p, %d)\n",
715	    sock, (void *)name, namelen));
716
717	if ((so = getsonode(sock, &error, &fp)) == NULL)
718		return (set_errno(error));
719
720	/* Allocate and copyin name */
721	if (namelen != 0) {
722		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
723		name = copyin_name(so, name, &namelen, &error);
724		if (name == NULL) {
725			releasef(sock);
726			return (set_errno(error));
727		}
728	} else
729		name = NULL;
730
731	error = socket_connect(so, name, namelen, fp->f_flag,
732	    (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
733	releasef(sock);
734	if (name)
735		kmem_free(name, (size_t)namelen);
736	if (error)
737		return (set_errno(error));
738	return (0);
739}
740
741/*ARGSUSED2*/
742int
743shutdown(int sock, int how, int version)
744{
745	struct sonode *so;
746	int error;
747
748	dprint(1, ("shutdown(%d, %d)\n",
749	    sock, how));
750
751	if ((so = getsonode(sock, &error, NULL)) == NULL)
752		return (set_errno(error));
753
754	error = socket_shutdown(so, how, CRED());
755
756	releasef(sock);
757	if (error)
758		return (set_errno(error));
759	return (0);
760}
761
762/*
763 * Common receive routine.
764 */
765static ssize_t
766recvit(int sock,
767	struct nmsghdr *msg,
768	struct uio *uiop,
769	int flags,
770	socklen_t *namelenp,
771	socklen_t *controllenp,
772	int *flagsp)
773{
774	struct sonode *so;
775	file_t *fp;
776	void *name;
777	socklen_t namelen;
778	void *control;
779	socklen_t controllen;
780	ssize_t len;
781	int error;
782
783	if ((so = getsonode(sock, &error, &fp)) == NULL)
784		return (set_errno(error));
785
786	len = uiop->uio_resid;
787	uiop->uio_fmode = fp->f_flag;
788	uiop->uio_extflg = UIO_COPY_CACHED;
789
790	name = msg->msg_name;
791	namelen = msg->msg_namelen;
792	control = msg->msg_control;
793	controllen = msg->msg_controllen;
794
795	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
796	    MSG_DONTWAIT | MSG_XPG4_2);
797
798	error = socket_recvmsg(so, msg, uiop, CRED());
799	if (error) {
800		releasef(sock);
801		return (set_errno(error));
802	}
803	lwp_stat_update(LWP_STAT_MSGRCV, 1);
804	releasef(sock);
805
806	error = copyout_name(name, namelen, namelenp,
807	    msg->msg_name, msg->msg_namelen);
808	if (error)
809		goto err;
810
811	if (flagsp != NULL) {
812		/*
813		 * Clear internal flag.
814		 */
815		msg->msg_flags &= ~MSG_XPG4_2;
816
817		/*
818		 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
819		 * when controllen is zero and there is control data to
820		 * copy out.
821		 */
822		if (controllen != 0 &&
823		    (msg->msg_controllen > controllen || control == NULL)) {
824			dprint(1, ("recvit: CTRUNC %d %d %p\n",
825			    msg->msg_controllen, controllen, control));
826
827			msg->msg_flags |= MSG_CTRUNC;
828		}
829		if (copyout(&msg->msg_flags, flagsp,
830		    sizeof (msg->msg_flags))) {
831			error = EFAULT;
832			goto err;
833		}
834	}
835	/*
836	 * Note: This MUST be done last. There can be no "goto err" after this
837	 * point since it could make so_closefds run twice on some part
838	 * of the file descriptor array.
839	 */
840	if (controllen != 0) {
841		if (!(flags & MSG_XPG4_2)) {
842			/*
843			 * Good old msg_accrights can only return a multiple
844			 * of 4 bytes.
845			 */
846			controllen &= ~((int)sizeof (uint32_t) - 1);
847		}
848		error = copyout_arg(control, controllen, controllenp,
849		    msg->msg_control, msg->msg_controllen);
850		if (error)
851			goto err;
852
853		if (msg->msg_controllen > controllen || control == NULL) {
854			if (control == NULL)
855				controllen = 0;
856			so_closefds(msg->msg_control, msg->msg_controllen,
857			    !(flags & MSG_XPG4_2), controllen);
858		}
859	}
860	if (msg->msg_namelen != 0)
861		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
862	if (msg->msg_controllen != 0)
863		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
864	return (len - uiop->uio_resid);
865
866err:
867	/*
868	 * If we fail and the control part contains file descriptors
869	 * we have to close the fd's.
870	 */
871	if (msg->msg_controllen != 0)
872		so_closefds(msg->msg_control, msg->msg_controllen,
873		    !(flags & MSG_XPG4_2), 0);
874	if (msg->msg_namelen != 0)
875		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
876	if (msg->msg_controllen != 0)
877		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
878	return (set_errno(error));
879}
880
881/*
882 * Native system call
883 */
884ssize_t
885recv(int sock, void *buffer, size_t len, int flags)
886{
887	struct nmsghdr lmsg;
888	struct uio auio;
889	struct iovec aiov[1];
890
891	dprint(1, ("recv(%d, %p, %ld, %d)\n",
892	    sock, buffer, len, flags));
893
894	if ((ssize_t)len < 0) {
895		return (set_errno(EINVAL));
896	}
897
898	aiov[0].iov_base = buffer;
899	aiov[0].iov_len = len;
900	auio.uio_loffset = 0;
901	auio.uio_iov = aiov;
902	auio.uio_iovcnt = 1;
903	auio.uio_resid = len;
904	auio.uio_segflg = UIO_USERSPACE;
905	auio.uio_limit = 0;
906
907	lmsg.msg_namelen = 0;
908	lmsg.msg_controllen = 0;
909	lmsg.msg_flags = 0;
910	return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
911}
912
913ssize_t
914recvfrom(int sock, void *buffer, size_t len, int flags,
915	struct sockaddr *name, socklen_t *namelenp)
916{
917	struct nmsghdr lmsg;
918	struct uio auio;
919	struct iovec aiov[1];
920
921	dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
922	    sock, buffer, len, flags, (void *)name, (void *)namelenp));
923
924	if ((ssize_t)len < 0) {
925		return (set_errno(EINVAL));
926	}
927
928	aiov[0].iov_base = buffer;
929	aiov[0].iov_len = len;
930	auio.uio_loffset = 0;
931	auio.uio_iov = aiov;
932	auio.uio_iovcnt = 1;
933	auio.uio_resid = len;
934	auio.uio_segflg = UIO_USERSPACE;
935	auio.uio_limit = 0;
936
937	lmsg.msg_name = (char *)name;
938	if (namelenp != NULL) {
939		if (copyin(namelenp, &lmsg.msg_namelen,
940		    sizeof (lmsg.msg_namelen)))
941			return (set_errno(EFAULT));
942	} else {
943		lmsg.msg_namelen = 0;
944	}
945	lmsg.msg_controllen = 0;
946	lmsg.msg_flags = 0;
947
948	return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
949}
950
951/*
952 * Uses the MSG_XPG4_2 flag to determine if the caller is using
953 * struct omsghdr or struct nmsghdr.
954 */
955ssize_t
956recvmsg(int sock, struct nmsghdr *msg, int flags)
957{
958	STRUCT_DECL(nmsghdr, u_lmsg);
959	STRUCT_HANDLE(nmsghdr, umsgptr);
960	struct nmsghdr lmsg;
961	struct uio auio;
962	struct iovec aiov[MSG_MAXIOVLEN];
963	int iovcnt;
964	ssize_t len;
965	int i;
966	int *flagsp;
967	model_t	model;
968
969	dprint(1, ("recvmsg(%d, %p, %d)\n",
970	    sock, (void *)msg, flags));
971
972	model = get_udatamodel();
973	STRUCT_INIT(u_lmsg, model);
974	STRUCT_SET_HANDLE(umsgptr, model, msg);
975
976	if (flags & MSG_XPG4_2) {
977		if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
978			return (set_errno(EFAULT));
979		flagsp = STRUCT_FADDR(umsgptr, msg_flags);
980	} else {
981		/*
982		 * Assumes that nmsghdr and omsghdr are identically shaped
983		 * except for the added msg_flags field.
984		 */
985		if (copyin(msg, STRUCT_BUF(u_lmsg),
986		    SIZEOF_STRUCT(omsghdr, model)))
987			return (set_errno(EFAULT));
988		STRUCT_FSET(u_lmsg, msg_flags, 0);
989		flagsp = NULL;
990	}
991
992	/*
993	 * Code below us will kmem_alloc memory and hang it
994	 * off msg_control and msg_name fields. This forces
995	 * us to copy the structure to its native form.
996	 */
997	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
998	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
999	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1000	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1001	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1002	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1003	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1004
1005	iovcnt = lmsg.msg_iovlen;
1006
1007	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1008		return (set_errno(EMSGSIZE));
1009	}
1010
1011#ifdef _SYSCALL32_IMPL
1012	/*
1013	 * 32-bit callers need to have their iovec expanded, while ensuring
1014	 * that they can't move more than 2Gbytes of data in a single call.
1015	 */
1016	if (model == DATAMODEL_ILP32) {
1017		struct iovec32 aiov32[MSG_MAXIOVLEN];
1018		ssize32_t count32;
1019
1020		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1021		    iovcnt * sizeof (struct iovec32)))
1022			return (set_errno(EFAULT));
1023
1024		count32 = 0;
1025		for (i = 0; i < iovcnt; i++) {
1026			ssize32_t iovlen32;
1027
1028			iovlen32 = aiov32[i].iov_len;
1029			count32 += iovlen32;
1030			if (iovlen32 < 0 || count32 < 0)
1031				return (set_errno(EINVAL));
1032			aiov[i].iov_len = iovlen32;
1033			aiov[i].iov_base =
1034			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1035		}
1036	} else
1037#endif /* _SYSCALL32_IMPL */
1038	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
1039		return (set_errno(EFAULT));
1040	}
1041	len = 0;
1042	for (i = 0; i < iovcnt; i++) {
1043		ssize_t iovlen = aiov[i].iov_len;
1044		len += iovlen;
1045		if (iovlen < 0 || len < 0) {
1046			return (set_errno(EINVAL));
1047		}
1048	}
1049	auio.uio_loffset = 0;
1050	auio.uio_iov = aiov;
1051	auio.uio_iovcnt = iovcnt;
1052	auio.uio_resid = len;
1053	auio.uio_segflg = UIO_USERSPACE;
1054	auio.uio_limit = 0;
1055
1056	if (lmsg.msg_control != NULL &&
1057	    (do_useracc == 0 ||
1058	    useracc(lmsg.msg_control, lmsg.msg_controllen,
1059	    B_WRITE) != 0)) {
1060		return (set_errno(EFAULT));
1061	}
1062
1063	return (recvit(sock, &lmsg, &auio, flags,
1064	    STRUCT_FADDR(umsgptr, msg_namelen),
1065	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
1066}
1067
1068/*
1069 * Common send function.
1070 */
1071static ssize_t
1072sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1073{
1074	struct sonode *so;
1075	file_t *fp;
1076	void *name;
1077	socklen_t namelen;
1078	void *control;
1079	socklen_t controllen;
1080	ssize_t len;
1081	int error;
1082
1083	if ((so = getsonode(sock, &error, &fp)) == NULL)
1084		return (set_errno(error));
1085
1086	uiop->uio_fmode = fp->f_flag;
1087
1088	if (so->so_family == AF_UNIX)
1089		uiop->uio_extflg = UIO_COPY_CACHED;
1090	else
1091		uiop->uio_extflg = UIO_COPY_DEFAULT;
1092
1093	/* Allocate and copyin name and control */
1094	name = msg->msg_name;
1095	namelen = msg->msg_namelen;
1096	if (name != NULL && namelen != 0) {
1097		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1098		name = copyin_name(so,
1099		    (struct sockaddr *)name,
1100		    &namelen, &error);
1101		if (name == NULL)
1102			goto done3;
1103		/* copyin_name null terminates addresses for AF_UNIX */
1104		msg->msg_namelen = namelen;
1105		msg->msg_name = name;
1106	} else {
1107		msg->msg_name = name = NULL;
1108		msg->msg_namelen = namelen = 0;
1109	}
1110
1111	control = msg->msg_control;
1112	controllen = msg->msg_controllen;
1113	if ((control != NULL) && (controllen != 0)) {
1114		/*
1115		 * Verify that the length is not excessive to prevent
1116		 * an application from consuming all of kernel memory.
1117		 */
1118		if (controllen > SO_MAXARGSIZE) {
1119			error = EINVAL;
1120			goto done2;
1121		}
1122		control = kmem_alloc(controllen, KM_SLEEP);
1123
1124		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1125		if (copyin(msg->msg_control, control, controllen)) {
1126			error = EFAULT;
1127			goto done1;
1128		}
1129		msg->msg_control = control;
1130	} else {
1131		msg->msg_control = control = NULL;
1132		msg->msg_controllen = controllen = 0;
1133	}
1134
1135	len = uiop->uio_resid;
1136	msg->msg_flags = flags;
1137
1138	error = socket_sendmsg(so, msg, uiop, CRED());
1139done1:
1140	if (control != NULL)
1141		kmem_free(control, controllen);
1142done2:
1143	if (name != NULL)
1144		kmem_free(name, namelen);
1145done3:
1146	if (error != 0) {
1147		releasef(sock);
1148		return (set_errno(error));
1149	}
1150	lwp_stat_update(LWP_STAT_MSGSND, 1);
1151	releasef(sock);
1152	return (len - uiop->uio_resid);
1153}
1154
1155/*
1156 * Native system call
1157 */
1158ssize_t
1159send(int sock, void *buffer, size_t len, int flags)
1160{
1161	struct nmsghdr lmsg;
1162	struct uio auio;
1163	struct iovec aiov[1];
1164
1165	dprint(1, ("send(%d, %p, %ld, %d)\n",
1166	    sock, buffer, len, flags));
1167
1168	if ((ssize_t)len < 0) {
1169		return (set_errno(EINVAL));
1170	}
1171
1172	aiov[0].iov_base = buffer;
1173	aiov[0].iov_len = len;
1174	auio.uio_loffset = 0;
1175	auio.uio_iov = aiov;
1176	auio.uio_iovcnt = 1;
1177	auio.uio_resid = len;
1178	auio.uio_segflg = UIO_USERSPACE;
1179	auio.uio_limit = 0;
1180
1181	lmsg.msg_name = NULL;
1182	lmsg.msg_control = NULL;
1183	if (!(flags & MSG_XPG4_2)) {
1184		/*
1185		 * In order to be compatible with the libsocket/sockmod
1186		 * implementation we set EOR for all send* calls.
1187		 */
1188		flags |= MSG_EOR;
1189	}
1190	return (sendit(sock, &lmsg, &auio, flags));
1191}
1192
1193/*
1194 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1195 * struct omsghdr or struct nmsghdr.
1196 */
1197ssize_t
1198sendmsg(int sock, struct nmsghdr *msg, int flags)
1199{
1200	struct nmsghdr lmsg;
1201	STRUCT_DECL(nmsghdr, u_lmsg);
1202	struct uio auio;
1203	struct iovec aiov[MSG_MAXIOVLEN];
1204	int iovcnt;
1205	ssize_t len;
1206	int i;
1207	model_t	model;
1208
1209	dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1210
1211	model = get_udatamodel();
1212	STRUCT_INIT(u_lmsg, model);
1213
1214	if (flags & MSG_XPG4_2) {
1215		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1216		    STRUCT_SIZE(u_lmsg)))
1217			return (set_errno(EFAULT));
1218	} else {
1219		/*
1220		 * Assumes that nmsghdr and omsghdr are identically shaped
1221		 * except for the added msg_flags field.
1222		 */
1223		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1224		    SIZEOF_STRUCT(omsghdr, model)))
1225			return (set_errno(EFAULT));
1226		/*
1227		 * In order to be compatible with the libsocket/sockmod
1228		 * implementation we set EOR for all send* calls.
1229		 */
1230		flags |= MSG_EOR;
1231	}
1232
1233	/*
1234	 * Code below us will kmem_alloc memory and hang it
1235	 * off msg_control and msg_name fields. This forces
1236	 * us to copy the structure to its native form.
1237	 */
1238	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1239	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1240	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1241	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1242	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1243	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1244	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1245
1246	iovcnt = lmsg.msg_iovlen;
1247
1248	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1249		/*
1250		 * Unless this is XPG 4.2 we allow iovcnt == 0 to
1251		 * be compatible with SunOS 4.X and 4.4BSD.
1252		 */
1253		if (iovcnt != 0 || (flags & MSG_XPG4_2))
1254			return (set_errno(EMSGSIZE));
1255	}
1256
1257#ifdef _SYSCALL32_IMPL
1258	/*
1259	 * 32-bit callers need to have their iovec expanded, while ensuring
1260	 * that they can't move more than 2Gbytes of data in a single call.
1261	 */
1262	if (model == DATAMODEL_ILP32) {
1263		struct iovec32 aiov32[MSG_MAXIOVLEN];
1264		ssize32_t count32;
1265
1266		if (iovcnt != 0 &&
1267		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1268		    iovcnt * sizeof (struct iovec32)))
1269			return (set_errno(EFAULT));
1270
1271		count32 = 0;
1272		for (i = 0; i < iovcnt; i++) {
1273			ssize32_t iovlen32;
1274
1275			iovlen32 = aiov32[i].iov_len;
1276			count32 += iovlen32;
1277			if (iovlen32 < 0 || count32 < 0)
1278				return (set_errno(EINVAL));
1279			aiov[i].iov_len = iovlen32;
1280			aiov[i].iov_base =
1281			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1282		}
1283	} else
1284#endif /* _SYSCALL32_IMPL */
1285	if (iovcnt != 0 &&
1286	    copyin(lmsg.msg_iov, aiov,
1287	    (unsigned)iovcnt * sizeof (struct iovec))) {
1288		return (set_errno(EFAULT));
1289	}
1290	len = 0;
1291	for (i = 0; i < iovcnt; i++) {
1292		ssize_t iovlen = aiov[i].iov_len;
1293		len += iovlen;
1294		if (iovlen < 0 || len < 0) {
1295			return (set_errno(EINVAL));
1296		}
1297	}
1298	auio.uio_loffset = 0;
1299	auio.uio_iov = aiov;
1300	auio.uio_iovcnt = iovcnt;
1301	auio.uio_resid = len;
1302	auio.uio_segflg = UIO_USERSPACE;
1303	auio.uio_limit = 0;
1304
1305	return (sendit(sock, &lmsg, &auio, flags));
1306}
1307
1308ssize_t
1309sendto(int sock, void *buffer, size_t len, int flags,
1310    struct sockaddr *name, socklen_t namelen)
1311{
1312	struct nmsghdr lmsg;
1313	struct uio auio;
1314	struct iovec aiov[1];
1315
1316	dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1317	    sock, buffer, len, flags, (void *)name, namelen));
1318
1319	if ((ssize_t)len < 0) {
1320		return (set_errno(EINVAL));
1321	}
1322
1323	aiov[0].iov_base = buffer;
1324	aiov[0].iov_len = len;
1325	auio.uio_loffset = 0;
1326	auio.uio_iov = aiov;
1327	auio.uio_iovcnt = 1;
1328	auio.uio_resid = len;
1329	auio.uio_segflg = UIO_USERSPACE;
1330	auio.uio_limit = 0;
1331
1332	lmsg.msg_name = (char *)name;
1333	lmsg.msg_namelen = namelen;
1334	lmsg.msg_control = NULL;
1335	if (!(flags & MSG_XPG4_2)) {
1336		/*
1337		 * In order to be compatible with the libsocket/sockmod
1338		 * implementation we set EOR for all send* calls.
1339		 */
1340		flags |= MSG_EOR;
1341	}
1342	return (sendit(sock, &lmsg, &auio, flags));
1343}
1344
1345/*ARGSUSED3*/
1346int
1347getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1348{
1349	struct sonode *so;
1350	int error;
1351	socklen_t namelen;
1352	socklen_t sock_addrlen;
1353	struct sockaddr *sock_addrp;
1354
1355	dprint(1, ("getpeername(%d, %p, %p)\n",
1356	    sock, (void *)name, (void *)namelenp));
1357
1358	if ((so = getsonode(sock, &error, NULL)) == NULL)
1359		goto bad;
1360
1361	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1362	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1363	    (name == NULL && namelen != 0)) {
1364		error = EFAULT;
1365		goto rel_out;
1366	}
1367	sock_addrlen = so->so_max_addr_len;
1368	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1369
1370	if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1371	    B_FALSE, CRED())) == 0) {
1372		ASSERT(sock_addrlen <= so->so_max_addr_len);
1373		error = copyout_name(name, namelen, namelenp,
1374		    (void *)sock_addrp, sock_addrlen);
1375	}
1376	kmem_free(sock_addrp, so->so_max_addr_len);
1377rel_out:
1378	releasef(sock);
1379bad:	return (error != 0 ? set_errno(error) : 0);
1380}
1381
1382/*ARGSUSED3*/
1383int
1384getsockname(int sock, struct sockaddr *name,
1385		socklen_t *namelenp, int version)
1386{
1387	struct sonode *so;
1388	int error;
1389	socklen_t namelen, sock_addrlen;
1390	struct sockaddr *sock_addrp;
1391
1392	dprint(1, ("getsockname(%d, %p, %p)\n",
1393	    sock, (void *)name, (void *)namelenp));
1394
1395	if ((so = getsonode(sock, &error, NULL)) == NULL)
1396		goto bad;
1397
1398	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1399	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1400	    (name == NULL && namelen != 0)) {
1401		error = EFAULT;
1402		goto rel_out;
1403	}
1404
1405	sock_addrlen = so->so_max_addr_len;
1406	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1407	if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1408	    CRED())) == 0) {
1409		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1410		ASSERT(sock_addrlen <= so->so_max_addr_len);
1411		error = copyout_name(name, namelen, namelenp,
1412		    (void *)sock_addrp, sock_addrlen);
1413	}
1414	kmem_free(sock_addrp, so->so_max_addr_len);
1415rel_out:
1416	releasef(sock);
1417bad:	return (error != 0 ? set_errno(error) : 0);
1418}
1419
1420/*ARGSUSED5*/
1421int
1422getsockopt(int sock,
1423	int level,
1424	int option_name,
1425	void *option_value,
1426	socklen_t *option_lenp,
1427	int version)
1428{
1429	struct sonode *so;
1430	socklen_t optlen, optlen_res;
1431	void *optval;
1432	int error;
1433
1434	dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1435	    sock, level, option_name, option_value, (void *)option_lenp));
1436
1437	if ((so = getsonode(sock, &error, NULL)) == NULL)
1438		return (set_errno(error));
1439
1440	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1441	if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1442		releasef(sock);
1443		return (set_errno(EFAULT));
1444	}
1445	/*
1446	 * Verify that the length is not excessive to prevent
1447	 * an application from consuming all of kernel memory.
1448	 */
1449	if (optlen > SO_MAXARGSIZE) {
1450		error = EINVAL;
1451		releasef(sock);
1452		return (set_errno(error));
1453	}
1454	optval = kmem_alloc(optlen, KM_SLEEP);
1455	optlen_res = optlen;
1456	error = socket_getsockopt(so, level, option_name, optval,
1457	    &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1458	    CRED());
1459	releasef(sock);
1460	if (error) {
1461		kmem_free(optval, optlen);
1462		return (set_errno(error));
1463	}
1464	error = copyout_arg(option_value, optlen, option_lenp,
1465	    optval, optlen_res);
1466	kmem_free(optval, optlen);
1467	if (error)
1468		return (set_errno(error));
1469	return (0);
1470}
1471
1472/*ARGSUSED5*/
1473int
1474setsockopt(int sock,
1475	int level,
1476	int option_name,
1477	void *option_value,
1478	socklen_t option_len,
1479	int version)
1480{
1481	struct sonode *so;
1482	intptr_t buffer[2];
1483	void *optval = NULL;
1484	int error;
1485
1486	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1487	    sock, level, option_name, option_value, option_len));
1488
1489	if ((so = getsonode(sock, &error, NULL)) == NULL)
1490		return (set_errno(error));
1491
1492	if (option_value != NULL) {
1493		if (option_len != 0) {
1494			/*
1495			 * Verify that the length is not excessive to prevent
1496			 * an application from consuming all of kernel memory.
1497			 */
1498			if (option_len > SO_MAXARGSIZE) {
1499				error = EINVAL;
1500				goto done2;
1501			}
1502			optval = option_len <= sizeof (buffer) ?
1503			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1504			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1505			if (copyin(option_value, optval, (size_t)option_len)) {
1506				error = EFAULT;
1507				goto done1;
1508			}
1509		}
1510	} else
1511		option_len = 0;
1512
1513	error = socket_setsockopt(so, level, option_name, optval,
1514	    (t_uscalar_t)option_len, CRED());
1515done1:
1516	if (optval != buffer)
1517		kmem_free(optval, (size_t)option_len);
1518done2:
1519	releasef(sock);
1520	if (error)
1521		return (set_errno(error));
1522	return (0);
1523}
1524
1525static int
1526sockconf_add_sock(int family, int type, int protocol, char *name)
1527{
1528	int error = 0;
1529	char *kdevpath = NULL;
1530	char *kmodule = NULL;
1531	char *buf = NULL;
1532	size_t pathlen = 0;
1533	struct sockparams *sp;
1534
1535	if (name == NULL)
1536		return (EINVAL);
1537	/*
1538	 * Copyin the name.
1539	 * This also makes it possible to check for too long pathnames.
1540	 * Compress the space needed for the name before passing it
1541	 * to soconfig - soconfig will store the string until
1542	 * the configuration is removed.
1543	 */
1544	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1545	if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1546		kmem_free(buf, MAXPATHLEN);
1547		return (error);
1548	}
1549	if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1550		/* For device */
1551
1552		/*
1553		 * Special handling for NCA:
1554		 *
1555		 * DEV_NCA is never opened even if an application
1556		 * requests for AF_NCA. The device opened is instead a
1557		 * predefined AF_INET transport (NCA_INET_DEV).
1558		 *
1559		 * Prior to Volo (PSARC/2007/587) NCA would determine
1560		 * the device using a lookup, which worked then because
1561		 * all protocols were based on TPI. Since TPI is no
1562		 * longer the default, we have to explicitly state
1563		 * which device to use.
1564		 */
1565		if (strcmp(buf, NCA_DEV) == 0) {
1566			/* only support entry <28, 2, 0> */
1567			if (family != AF_NCA || type != SOCK_STREAM ||
1568			    protocol != 0) {
1569				kmem_free(buf, MAXPATHLEN);
1570				return (EINVAL);
1571			}
1572
1573			pathlen = strlen(NCA_INET_DEV) + 1;
1574			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1575			bcopy(NCA_INET_DEV, kdevpath, pathlen);
1576			kdevpath[pathlen - 1] = '\0';
1577		} else {
1578			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1579			bcopy(buf, kdevpath, pathlen);
1580			kdevpath[pathlen - 1] = '\0';
1581		}
1582	} else {
1583		/* For socket module */
1584		kmodule = kmem_alloc(pathlen, KM_SLEEP);
1585		bcopy(buf, kmodule, pathlen);
1586		kmodule[pathlen - 1] = '\0';
1587		pathlen = 0;
1588	}
1589	kmem_free(buf, MAXPATHLEN);
1590
1591	/* sockparams_create frees mod name and devpath upon failure */
1592	sp = sockparams_create(family, type, protocol, kmodule,
1593	    kdevpath, pathlen, 0, KM_SLEEP, &error);
1594	if (sp != NULL) {
1595		error = sockparams_add(sp);
1596		if (error != 0)
1597			sockparams_destroy(sp);
1598	}
1599
1600	return (error);
1601}
1602
/*
 * Remove the socket configuration entry for <family, type, protocol>.
 * Returns whatever sockparams_delete() reports.
 */
static int
sockconf_remove_sock(int family, int type, int protocol)
{
	int error;

	error = sockparams_delete(family, type, protocol);
	return (error);
}
1608
1609static int
1610sockconfig_remove_filter(const char *uname)
1611{
1612	char kname[SOF_MAXNAMELEN];
1613	size_t len;
1614	int error;
1615	sof_entry_t *ent;
1616
1617	if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1618		return (error);
1619
1620	ent = sof_entry_remove_by_name(kname);
1621	if (ent == NULL)
1622		return (ENXIO);
1623
1624	mutex_enter(&ent->sofe_lock);
1625	ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1626	if (ent->sofe_refcnt == 0) {
1627		mutex_exit(&ent->sofe_lock);
1628		sof_entry_free(ent);
1629	} else {
1630		/* let the last socket free the filter */
1631		ent->sofe_flags |= SOFEF_CONDEMED;
1632		mutex_exit(&ent->sofe_lock);
1633	}
1634
1635	return (0);
1636}
1637
1638static int
1639sockconfig_add_filter(const char *uname, void *ufilpropp)
1640{
1641	struct sockconfig_filter_props filprop;
1642	sof_entry_t *ent;
1643	int error;
1644	size_t tuplesz, len;
1645	char hintbuf[SOF_MAXNAMELEN];
1646
1647	ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1648	mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1649
1650	if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1651	    &len)) != 0) {
1652		sof_entry_free(ent);
1653		return (error);
1654	}
1655
1656	if (get_udatamodel() == DATAMODEL_NATIVE) {
1657		if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1658			sof_entry_free(ent);
1659			return (EFAULT);
1660		}
1661	}
1662#ifdef	_SYSCALL32_IMPL
1663	else {
1664		struct sockconfig_filter_props32 filprop32;
1665
1666		if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1667			sof_entry_free(ent);
1668			return (EFAULT);
1669		}
1670		filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1671		filprop.sfp_autoattach = filprop32.sfp_autoattach;
1672		filprop.sfp_hint = filprop32.sfp_hint;
1673		filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1674		filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1675		filprop.sfp_socktuple =
1676		    (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1677	}
1678#endif	/* _SYSCALL32_IMPL */
1679
1680	if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1681	    sizeof (ent->sofe_modname), &len)) != 0) {
1682		sof_entry_free(ent);
1683		return (error);
1684	}
1685
1686	/*
1687	 * A filter must specify at least one socket tuple.
1688	 */
1689	if (filprop.sfp_socktuple_cnt == 0 ||
1690	    filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1691		sof_entry_free(ent);
1692		return (EINVAL);
1693	}
1694	ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1695	ent->sofe_hint = filprop.sfp_hint;
1696
1697	/*
1698	 * Verify the hint, and copy in the hint argument, if necessary.
1699	 */
1700	switch (ent->sofe_hint) {
1701	case SOF_HINT_BEFORE:
1702	case SOF_HINT_AFTER:
1703		if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1704		    sizeof (hintbuf), &len)) != 0) {
1705			sof_entry_free(ent);
1706			return (error);
1707		}
1708		ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1709		bcopy(hintbuf, ent->sofe_hintarg, len);
1710		/* FALLTHRU */
1711	case SOF_HINT_TOP:
1712	case SOF_HINT_BOTTOM:
1713		/* hints cannot be used with programmatic filters */
1714		if (ent->sofe_flags & SOFEF_PROG) {
1715			sof_entry_free(ent);
1716			return (EINVAL);
1717		}
1718		break;
1719	case SOF_HINT_NONE:
1720		break;
1721	default:
1722		/* bad hint value */
1723		sof_entry_free(ent);
1724		return (EINVAL);
1725	}
1726
1727	ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1728	tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1729	ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1730
1731	if (get_udatamodel() == DATAMODEL_NATIVE) {
1732		if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1733		    tuplesz)) {
1734			sof_entry_free(ent);
1735			return (EFAULT);
1736		}
1737	}
1738#ifdef	_SYSCALL32_IMPL
1739	else {
1740		int i;
1741		caddr_t data = (caddr_t)filprop.sfp_socktuple;
1742		sof_socktuple_t	*tup = ent->sofe_socktuple;
1743		sof_socktuple32_t tup32;
1744
1745		tup = ent->sofe_socktuple;
1746		for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1747			ASSERT(tup < ent->sofe_socktuple + tuplesz);
1748
1749			if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1750				sof_entry_free(ent);
1751				return (EFAULT);
1752			}
1753			tup->sofst_family = tup32.sofst_family;
1754			tup->sofst_type = tup32.sofst_type;
1755			tup->sofst_protocol = tup32.sofst_protocol;
1756
1757			data += sizeof (tup32);
1758		}
1759	}
1760#endif	/* _SYSCALL32_IMPL */
1761
1762	/* Sockets can start using the filter as soon as the filter is added */
1763	if ((error = sof_entry_add(ent)) != 0)
1764		sof_entry_free(ent);
1765
1766	return (error);
1767}
1768
1769/*
1770 * Socket configuration system call. It is used to add and remove
1771 * socket types.
1772 */
1773int
1774sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1775{
1776	int error = 0;
1777
1778	if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1779		return (set_errno(EPERM));
1780
1781	if (sockfs_defer_nl7c_init) {
1782		nl7c_init();
1783		sockfs_defer_nl7c_init = 0;
1784	}
1785
1786	switch (cmd) {
1787	case SOCKCONFIG_ADD_SOCK:
1788		error = sockconf_add_sock((int)(uintptr_t)arg1,
1789		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1790		break;
1791	case SOCKCONFIG_REMOVE_SOCK:
1792		error = sockconf_remove_sock((int)(uintptr_t)arg1,
1793		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1794		break;
1795	case SOCKCONFIG_ADD_FILTER:
1796		error = sockconfig_add_filter((const char *)arg1, arg2);
1797		break;
1798	case SOCKCONFIG_REMOVE_FILTER:
1799		error = sockconfig_remove_filter((const char *)arg1);
1800		break;
1801	default:
1802#ifdef	DEBUG
1803		cmn_err(CE_NOTE, "sockconfig: unkonwn subcommand %d", cmd);
1804#endif
1805		error = EINVAL;
1806		break;
1807	}
1808
1809	if (error != 0) {
1810		eprintline(error);
1811		return (set_errno(error));
1812	}
1813	return (0);
1814}
1815
1816
1817/*
1818 * Sendfile is implemented through two schemes, direct I/O or by
1819 * caching in the filesystem page cache. We cache the input file by
1820 * default and use direct I/O only if sendfile_max_size is set
1821 * appropriately as explained below. Note that this logic is consistent
1822 * with other filesystems where caching is turned on by default
1823 * unless explicitly turned off by using the DIRECTIO ioctl.
1824 *
1825 * We choose a slightly different scheme here. One can turn off
1826 * caching by setting sendfile_max_size to 0. One can also enable
1827 * caching of files <= sendfile_max_size by setting sendfile_max_size
1828 * to an appropriate value. By default sendfile_max_size is set to the
1829 * maximum value so that all files are cached. In future, we may provide
1830 * better interfaces for caching the file.
1831 *
1832 * Sendfile through Direct I/O (Zero copy)
1833 * --------------------------------------
1834 *
1835 * As disks are normally slower than the network, we can't have a
1836 * single thread that reads the disk and writes to the network. We
1837 * need to have parallelism. This is done by having the sendfile
1838 * thread create another thread that reads from the filesystem
1839 * and queues it for network processing. In this scheme, the data
 * is never copied anywhere, i.e. it is zero copy unlike the other
1841 * scheme.
1842 *
1843 * We have a sendfile queue (snfq) where each sendfile
1844 * request (snf_req_t) is queued for processing by a thread. Number
1845 * of threads is dynamically allocated and they exit if they are idling
1846 * beyond a specified amount of time. When each request (snf_req_t) is
1847 * processed by a thread, it produces a number of mblk_t structures to
1848 * be consumed by the sendfile thread. snf_deque and snf_enque are
1849 * used for consuming and producing mblks. Size of the filesystem
1850 * read is determined by the tunable (sendfile_read_size). A single
1851 * mblk holds sendfile_read_size worth of data (except the last
1852 * read of the file) which is sent down as a whole to the network.
1853 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1854 * value for the UFS filesystem backed by a striped storage array.
1855 *
1856 * Synchronisation between read (producer) and write (consumer) threads.
1857 * --------------------------------------------------------------------
1858 *
 * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
 * adding and deleting items in this list. An error can happen anytime
 * during read or write. There could be unprocessed mblks in the
 * sr_mp_XXX list when a read or write error occurs. Whenever an error
 * is encountered, we need two things to happen :
 *
 * a) One of the threads needs to clean the mblks.
1866 * b) When one thread encounters an error, the other should stop.
1867 *
1868 * For (a), we don't want to penalize the reader thread as it could do
1869 * some useful work processing other requests. For (b), the error can
1870 * be detected by examining sr_read_error or sr_write_error.
 * sr_lock protects sr_read_error and sr_write_error. If both reader and
 * writer encounter an error, we need to report the write error back to
1873 * the application as that's what would have happened if the operations
1874 * were done sequentially. With this in mind, following should work :
1875 *
1876 * 	- Check for errors before read or write.
1877 *	- If the reader encounters error, set the error in sr_read_error.
1878 *	  Check sr_write_error, if it is set, send cv_signal as it is
1879 *	  waiting for reader to complete. If it is not set, the writer
1880 *	  is either running sinking data to the network or blocked
1881 *        because of flow control. For handling the latter case, we
1882 *	  always send a signal. In any case, it will examine sr_read_error
1883 *	  and return. sr_read_error is marked with SR_READ_DONE to tell
1884 *	  the writer that the reader is done in all the cases.
1885 *	- If the writer encounters error, set the error in sr_write_error.
1886 *	  The reader thread is either blocked because of flow control or
1887 *	  running reading data from the disk. For the former, we need to
1888 *	  wakeup the thread. Again to keep it simple, we always wake up
1889 *	  the reader thread. Then, wait for the read thread to complete
1890 *	  if it is not done yet. Cleanup and return.
1891 *
1892 * High and low water marks for the read thread.
1893 * --------------------------------------------
1894 *
1895 * If sendfile() is used to send data over a slow network, we need to
1896 * make sure that the read thread does not produce data at a faster
1897 * rate than the network. This can happen if the disk is faster than
1898 * the network. In such a case, we don't want to build a very large queue.
1899 * But we would still like to get all of the network throughput possible.
1900 * This implies that network should never block waiting for data.
1901 * As there are lot of disk throughput/network throughput combinations
1902 * possible, it is difficult to come up with an accurate number.
1903 * A typical 10K RPM disk has a max seek latency 17ms and rotational
1904 * latency of 3ms for reading a disk block. Thus, the total latency to
1905 * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
1907 * for network is 100MB/sec. If the thread is blocked because of flow
1908 * control, it would take 25ms to get new data ready for transmission.
1909 * We have to make sure that network is not idling, while we are initiating
1910 * new transfers. So, at 100MB/sec, to keep network busy we would need
1911 * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1912 * We need to pick a high water mark so that the woken up thread would
1913 * do considerable work before blocking again to prevent thrashing. Currently,
1914 * we pick this to be 10 times that of the low water mark.
1915 *
1916 * Sendfile with segmap caching (One copy from page cache to mblks).
1917 * ----------------------------------------------------------------
1918 *
1919 * We use the segmap cache for caching the file, if the size of file
1920 * is <= sendfile_max_size. In this case we don't use threads as VM
1921 * is reasonably fast enough to keep up with the network. If the underlying
1922 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1923 * of data into segmap space, and use the virtual address from segmap
1924 * directly through desballoc() to avoid copy. Once the transport is done
1925 * with the data, the mapping will be released through segmap_release()
1926 * called by the call-back routine.
1927 *
1928 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1929 * to copy the data from the filesystem into our temporary network buffer.
1930 *
1931 * To disable caching, set sendfile_max_size to 0.
1932 */
1933
1934uint_t sendfile_read_size = 1024 * 1024;
1935#define	SENDFILE_REQ_LOWAT	3 * 1024 * 1024
1936uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
1937uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
1938struct sendfile_stats sf_stats;
1939struct sendfile_queue *snfq;
1940clock_t snfq_timeout;
1941off64_t sendfile_max_size;
1942
1943static void snf_enque(snf_req_t *, mblk_t *);
1944static mblk_t *snf_deque(snf_req_t *);
1945
1946void
1947sendfile_init(void)
1948{
1949	snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
1950
1951	mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
1952	cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
1953	snfq->snfq_max_threads = max_ncpus;
1954	snfq_timeout = SNFQ_TIMEOUT;
1955	/* Cache all files by default. */
1956	sendfile_max_size = MAXOFFSET_T;
1957}
1958
1959/*
1960 * Queues a mblk_t for network processing.
1961 */
1962static void
1963snf_enque(snf_req_t *sr, mblk_t *mp)
1964{
1965	mp->b_next = NULL;
1966	mutex_enter(&sr->sr_lock);
1967	if (sr->sr_mp_head == NULL) {
1968		sr->sr_mp_head = sr->sr_mp_tail = mp;
1969		cv_signal(&sr->sr_cv);
1970	} else {
1971		sr->sr_mp_tail->b_next = mp;
1972		sr->sr_mp_tail = mp;
1973	}
1974	sr->sr_qlen += MBLKL(mp);
1975	while ((sr->sr_qlen > sr->sr_hiwat) &&
1976	    (sr->sr_write_error == 0)) {
1977		sf_stats.ss_full_waits++;
1978		cv_wait(&sr->sr_cv, &sr->sr_lock);
1979	}
1980	mutex_exit(&sr->sr_lock);
1981}
1982
1983/*
1984 * De-queues a mblk_t for network processing.
1985 */
1986static mblk_t *
1987snf_deque(snf_req_t *sr)
1988{
1989	mblk_t *mp;
1990
1991	mutex_enter(&sr->sr_lock);
1992	/*
1993	 * If we have encountered an error on read or read is
1994	 * completed and no more mblks, return NULL.
1995	 * We need to check for NULL sr_mp_head also as
1996	 * the reads could have completed and there is
1997	 * nothing more to come.
1998	 */
1999	if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2000	    ((sr->sr_read_error & SR_READ_DONE) &&
2001	    sr->sr_mp_head == NULL)) {
2002		mutex_exit(&sr->sr_lock);
2003		return (NULL);
2004	}
2005	/*
2006	 * To start with neither SR_READ_DONE is marked nor
2007	 * the error is set. When we wake up from cv_wait,
2008	 * following are the possibilities :
2009	 *
2010	 *	a) sr_read_error is zero and mblks are queued.
2011	 *	b) sr_read_error is set to SR_READ_DONE
2012	 *	   and mblks are queued.
2013	 *	c) sr_read_error is set to SR_READ_DONE
2014	 *	   and no mblks.
2015	 *	d) sr_read_error is set to some error other
2016	 *	   than SR_READ_DONE.
2017	 */
2018
2019	while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2020		sf_stats.ss_empty_waits++;
2021		cv_wait(&sr->sr_cv, &sr->sr_lock);
2022	}
2023	/* Handle (a) and (b) first  - the normal case. */
2024	if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2025	    (sr->sr_mp_head != NULL)) {
2026		mp = sr->sr_mp_head;
2027		sr->sr_mp_head = mp->b_next;
2028		sr->sr_qlen -= MBLKL(mp);
2029		if (sr->sr_qlen < sr->sr_lowat)
2030			cv_signal(&sr->sr_cv);
2031		mutex_exit(&sr->sr_lock);
2032		mp->b_next = NULL;
2033		return (mp);
2034	}
2035	/* Handle (c) and (d). */
2036	mutex_exit(&sr->sr_lock);
2037	return (NULL);
2038}
2039
2040/*
2041 * Reads data from the filesystem and queues it for network processing.
2042 */
2043void
2044snf_async_read(snf_req_t *sr)
2045{
2046	size_t iosize;
2047	u_offset_t fileoff;
2048	u_offset_t size;
2049	int ret_size;
2050	int error;
2051	file_t *fp;
2052	mblk_t *mp;
2053	struct vnode *vp;
2054	int extra = 0;
2055	int maxblk = 0;
2056	int wroff = 0;
2057	struct sonode *so;
2058
2059	fp = sr->sr_fp;
2060	size = sr->sr_file_size;
2061	fileoff = sr->sr_file_off;
2062
2063	/*
2064	 * Ignore the error for filesystems that doesn't support DIRECTIO.
2065	 */
2066	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2067	    kcred, NULL, NULL);
2068
2069	vp = sr->sr_vp;
2070	if (vp->v_type == VSOCK) {
2071		stdata_t *stp;
2072
2073		/*
2074		 * Get the extra space to insert a header and a trailer.
2075		 */
2076		so = VTOSO(vp);
2077		stp = vp->v_stream;
2078		if (stp == NULL) {
2079			wroff = so->so_proto_props.sopp_wroff;
2080			maxblk = so->so_proto_props.sopp_maxblk;
2081			extra = wroff + so->so_proto_props.sopp_tail;
2082		} else {
2083			wroff = (int)(stp->sd_wroff);
2084			maxblk = (int)(stp->sd_maxblk);
2085			extra = wroff + (int)(stp->sd_tail);
2086		}
2087	}
2088
2089	while ((size != 0) && (sr->sr_write_error == 0)) {
2090
2091		iosize = (int)MIN(sr->sr_maxpsz, size);
2092
2093		/*
2094		 * Socket filters can limit the mblk size,
2095		 * so limit reads to maxblk if there are
2096		 * filters present.
2097		 */
2098		if (vp->v_type == VSOCK &&
2099		    so->so_filter_active > 0 && maxblk != INFPSZ)
2100			iosize = (int)MIN(iosize, maxblk);
2101
2102		if (is_system_labeled()) {
2103			mp = allocb_cred(iosize + extra, CRED(),
2104			    curproc->p_pid);
2105		} else {
2106			mp = allocb(iosize + extra, BPRI_MED);
2107		}
2108		if (mp == NULL) {
2109			error = EAGAIN;
2110			break;
2111		}
2112
2113		mp->b_rptr += wroff;
2114
2115		ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2116
2117		/* Error or Reached EOF ? */
2118		if ((error != 0) || (ret_size == 0)) {
2119			freeb(mp);
2120			break;
2121		}
2122		mp->b_wptr = mp->b_rptr + ret_size;
2123
2124		snf_enque(sr, mp);
2125		size -= ret_size;
2126		fileoff += ret_size;
2127	}
2128	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2129	    kcred, NULL, NULL);
2130	mutex_enter(&sr->sr_lock);
2131	sr->sr_read_error = error;
2132	sr->sr_read_error |= SR_READ_DONE;
2133	cv_signal(&sr->sr_cv);
2134	mutex_exit(&sr->sr_lock);
2135}
2136
/*
 * Body of a sendfile service thread.
 *
 * Loops forever taking requests off the head of the global snfq queue and
 * running snf_async_read() on each one with snfq_lock dropped.  While the
 * queue is empty the thread waits on snfq_cv for up to snfq_timeout; if
 * the queue is still empty after a wait that returned a value <= 0, the
 * thread decrements snfq_svc_threads and exits.
 */
void
snf_async_thread(void)
{
	snf_req_t *sr;
	callb_cpr_t cprinfo;
	/* Starts positive so that the first empty-queue pass always waits. */
	clock_t time_left = 1;

	CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");

	mutex_enter(&snfq->snfq_lock);
	for (;;) {
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		while ((sr = snfq->snfq_req_head) == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			if (time_left <= 0) {
				/*
				 * The previous wait returned <= 0 and there
				 * is still no work: retire this thread.
				 * NOTE(review): CALLB_CPR_EXIT presumably
				 * drops snfq_lock - confirm.
				 */
				snfq->snfq_svc_threads--;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
				/* NOTREACHED */
			}
			/* Counted as idle only for the duration of the wait. */
			snfq->snfq_idle_cnt++;

			time_left = cv_reltimedwait(&snfq->snfq_cv,
			    &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
			snfq->snfq_idle_cnt--;

			CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
		}
		/* Unlink the request and service it without the queue lock. */
		snfq->snfq_req_head = sr->sr_next;
		snfq->snfq_req_cnt--;
		mutex_exit(&snfq->snfq_lock);
		snf_async_read(sr);
		mutex_enter(&snfq->snfq_lock);
	}
}
2175
2176
2177snf_req_t *
2178create_thread(int operation, struct vnode *vp, file_t *fp,
2179    u_offset_t fileoff, u_offset_t size)
2180{
2181	snf_req_t *sr;
2182	stdata_t *stp;
2183
2184	sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2185
2186	sr->sr_vp = vp;
2187	sr->sr_fp = fp;
2188	stp = vp->v_stream;
2189
2190	/*
2191	 * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
2192	 * stream might be closed before thread returns from snf_async_read.
2193	 */
2194	if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2195		sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2196	} else {
2197		sr->sr_maxpsz = MAXBSIZE;
2198	}
2199
2200	sr->sr_operation = operation;
2201	sr->sr_file_off = fileoff;
2202	sr->sr_file_size = size;
2203	sr->sr_hiwat = sendfile_req_hiwat;
2204	sr->sr_lowat = sendfile_req_lowat;
2205	mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2206	cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2207	/*
2208	 * See whether we need another thread for servicing this
2209	 * request. If there are already enough requests queued
2210	 * for the threads, create one if not exceeding
2211	 * snfq_max_threads.
2212	 */
2213	mutex_enter(&snfq->snfq_lock);
2214	if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2215	    snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2216		(void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2217		    TS_RUN, minclsyspri);
2218		snfq->snfq_svc_threads++;
2219	}
2220	if (snfq->snfq_req_head == NULL) {
2221		snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2222		cv_signal(&snfq->snfq_cv);
2223	} else {
2224		snfq->snfq_req_tail->sr_next = sr;
2225		snfq->snfq_req_tail = sr;
2226	}
2227	snfq->snfq_req_cnt++;
2228	mutex_exit(&snfq->snfq_lock);
2229	return (sr);
2230}
2231
2232int
2233snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2234    ssize_t *count)
2235{
2236	snf_req_t *sr;
2237	mblk_t *mp;
2238	int iosize;
2239	int error = 0;
2240	short fflag;
2241	struct vnode *vp;
2242	int ksize;
2243	struct nmsghdr msg;
2244
2245	ksize = 0;
2246	*count = 0;
2247	bzero(&msg, sizeof (msg));
2248
2249	vp = fp->f_vnode;
2250	fflag = fp->f_flag;
2251	if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2252		return (EAGAIN);
2253
2254	/*
2255	 * We check for read error in snf_deque. It has to check
2256	 * for successful READ_DONE and return NULL, and we might
2257	 * as well make an additional check there.
2258	 */
2259	while ((mp = snf_deque(sr)) != NULL) {
2260
2261		if (ISSIG(curthread, JUSTLOOKING)) {
2262			freeb(mp);
2263			error = EINTR;
2264			break;
2265		}
2266		iosize = MBLKL(mp);
2267
2268		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2269
2270		if (error != 0) {
2271			if (mp != NULL)
2272				freeb(mp);
2273			break;
2274		}
2275		ksize += iosize;
2276	}
2277	*count = ksize;
2278
2279	mutex_enter(&sr->sr_lock);
2280	sr->sr_write_error = error;
2281	/* Look at the big comments on why we cv_signal here. */
2282	cv_signal(&sr->sr_cv);
2283
2284	/* Wait for the reader to complete always. */
2285	while (!(sr->sr_read_error & SR_READ_DONE)) {
2286		cv_wait(&sr->sr_cv, &sr->sr_lock);
2287	}
2288	/* If there is no write error, check for read error. */
2289	if (error == 0)
2290		error = (sr->sr_read_error & ~SR_READ_DONE);
2291
2292	if (error != 0) {
2293		mblk_t *next_mp;
2294
2295		mp = sr->sr_mp_head;
2296		while (mp != NULL) {
2297			next_mp = mp->b_next;
2298			mp->b_next = NULL;
2299			freeb(mp);
2300			mp = next_mp;
2301		}
2302	}
2303	mutex_exit(&sr->sr_lock);
2304	kmem_free(sr, sizeof (snf_req_t));
2305	return (error);
2306}
2307
/* Maximum number of pages allocated by vpm for sendfile at a time. */
#define	SNF_VPMMAXPGS	(VPMMAXPGS/2)

/*
 * Maximum number of elements in the list returned by vpm, including
 * the NULL last entry that terminates it.
 */
#define	SNF_MAXVMAPS	(SNF_VPMMAXPGS + 1)
2316
/*
 * State shared by all mblks built from one vpm_map_pages() call in
 * snf_segmap(). It is torn down by snf_vmap_desbfree() once snfv_ref
 * drops to zero.
 */
typedef struct {
	/* One reference per mblk esballoca'ed from the mappings. */
	unsigned int	snfv_ref;
	/* Free routine (snf_vmap_desbfree) handed to esballoca(). */
	frtn_t		snfv_frtn;
	/* File vnode, held with VN_HOLD until the pages are unmapped. */
	vnode_t		*snfv_vp;
	/* Mapping list filled in by vpm_map_pages(). */
	struct vmap	snfv_vml[SNF_MAXVMAPS];
} snf_vmap_desbinfo;
2323
/*
 * State attached to the single mblk built from one segmap slot in
 * snf_segmap(). It is consumed and freed by snf_smap_desbfree().
 */
typedef struct {
	/* Free routine (snf_smap_desbfree) handed to esballoca(). */
	frtn_t		snfi_frtn;
	/* Address returned by segmap_getmapflt(). */
	caddr_t		snfi_base;
	/* Offset of the data within the mapping at snfi_base. */
	uint_t		snfi_mapoff;
	/* Length passed to segmap_fault() when locking and unlocking. */
	size_t		snfi_len;
	/* File vnode, held with VN_HOLD until the slot is released. */
	vnode_t		*snfi_vp;
} snf_smap_desbinfo;
2331
2332/*
2333 * The callback function used for vpm mapped mblks called when the last ref of
2334 * the mblk is dropped which normally occurs when TCP receives the ack. But it
2335 * can be the driver too due to lazy reclaim.
2336 */
2337void
2338snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2339{
2340	ASSERT(snfv->snfv_ref != 0);
2341	if (atomic_add_32_nv(&snfv->snfv_ref, -1) == 0) {
2342		vpm_unmap_pages(snfv->snfv_vml, S_READ);
2343		VN_RELE(snfv->snfv_vp);
2344		kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2345	}
2346}
2347
2348/*
2349 * The callback function used for segmap'ped mblks called when the last ref of
2350 * the mblk is dropped which normally occurs when TCP receives the ack. But it
2351 * can be the driver too due to lazy reclaim.
2352 */
2353void
2354snf_smap_desbfree(snf_smap_desbinfo *snfi)
2355{
2356	if (! IS_KPM_ADDR(snfi->snfi_base)) {
2357		/*
2358		 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2359		 * segmap_kpm as long as the latter never falls back to
2360		 * "use_segmap_range". (See segmap_getmapflt().)
2361		 *
2362		 * Using S_OTHER saves an redundant hat_setref() in
2363		 * segmap_unlock()
2364		 */
2365		(void) segmap_fault(kas.a_hat, segkmap,
2366		    (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2367		    snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2368		    F_SOFTUNLOCK, S_OTHER);
2369	}
2370	(void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2371	VN_RELE(snfi->snfi_vp);
2372	kmem_free(snfi, sizeof (*snfi));
2373}
2374
2375/*
2376 * Use segmap or vpm instead of bcopy to send down a desballoca'ed, mblk.
2377 * When segmap is used, the mblk contains a segmap slot of no more
2378 * than MAXBSIZE.
2379 *
2380 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2381 * in each iteration and sent by socket_sendmblk until an error occurs or
2382 * the requested size has been transferred. An mblk is esballoca'ed from
2383 * each mapped page and a chain of these mblk is sent to the transport layer.
2384 * vpm will be called to unmap the pages when all mblks have been freed by
2385 * free_func.
2386 *
2387 * At the end of the whole sendfile() operation, we wait till the data from
2388 * the last mblk is ack'ed by the transport before returning so that the
2389 * caller of sendfile() can safely modify the file content.
2390 */
2391int
2392snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2393    ssize_t *count, boolean_t nowait)
2394{
2395	caddr_t base;
2396	int mapoff;
2397	vnode_t *vp;
2398	mblk_t *mp = NULL;
2399	int chain_size;
2400	int error;
2401	clock_t deadlk_wait;
2402	short fflag;
2403	int ksize;
2404	struct vattr va;
2405	boolean_t dowait = B_FALSE;
2406	struct nmsghdr msg;
2407
2408	vp = fp->f_vnode;
2409	fflag = fp->f_flag;
2410	ksize = 0;
2411	bzero(&msg, sizeof (msg));
2412
2413	for (;;) {
2414		if (ISSIG(curthread, JUSTLOOKING)) {
2415			error = EINTR;
2416			break;
2417		}
2418
2419		if (vpm_enable) {
2420			snf_vmap_desbinfo *snfv;
2421			mblk_t *nmp;
2422			int mblk_size;
2423			int maxsize;
2424			int i;
2425
2426			mapoff = fileoff & PAGEOFFSET;
2427			maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2428
2429			snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2430			    KM_SLEEP);
2431
2432			/*
2433			 * Get vpm mappings for maxsize with read access.
2434			 * If the pages aren't available yet, we get
2435			 * DEADLK, so wait and try again a little later using
2436			 * an increasing wait. We might be here a long time.
2437			 *
2438			 * If delay_sig returns EINTR, be sure to exit and
2439			 * pass it up to the caller.
2440			 */
2441			deadlk_wait = 0;
2442			while ((error = vpm_map_pages(fvp, fileoff,
2443			    (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2444			    SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2445				deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2446				if ((error = delay_sig(deadlk_wait)) != 0) {
2447					break;
2448				}
2449			}
2450			if (error != 0) {
2451				kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2452				error = (error == EINTR) ? EINTR : EIO;
2453				goto out;
2454			}
2455			snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2456			snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2457
2458			/* Construct the mblk chain from the page mappings */
2459			chain_size = 0;
2460			for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2461			    total_size > 0; i++) {
2462				ASSERT(chain_size < maxsize);
2463				mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2464				    mapoff, total_size);
2465				nmp = esballoca(
2466				    (uchar_t *)snfv->snfv_vml[i].vs_addr +
2467				    mapoff, mblk_size, BPRI_HI,
2468				    &snfv->snfv_frtn);
2469
2470				/*
2471				 * We return EAGAIN after unmapping the pages
2472				 * if we cannot allocate the the head of the
2473				 * chain. Otherwise, we continue sending the
2474				 * mblks constructed so far.
2475				 */
2476				if (nmp == NULL) {
2477					if (i == 0) {
2478						vpm_unmap_pages(snfv->snfv_vml,
2479						    S_READ);
2480						kmem_free(snfv,
2481						    sizeof (snf_vmap_desbinfo));
2482						error = EAGAIN;
2483						goto out;
2484					}
2485					break;
2486				}
2487				/* Mark this dblk with the zero-copy flag */
2488				nmp->b_datap->db_struioflag |= STRUIO_ZC;
2489				nmp->b_wptr += mblk_size;
2490				chain_size += mblk_size;
2491				fileoff += mblk_size;
2492				total_size -= mblk_size;
2493				snfv->snfv_ref++;
2494				mapoff = 0;
2495				if (i > 0)
2496					linkb(mp, nmp);
2497				else
2498					mp = nmp;
2499			}
2500			VN_HOLD(fvp);
2501			snfv->snfv_vp = fvp;
2502		} else {
2503			/* vpm not supported. fallback to segmap */
2504			snf_smap_desbinfo *snfi;
2505
2506			mapoff = fileoff & MAXBOFFSET;
2507			chain_size = MAXBSIZE - mapoff;
2508			if (chain_size > total_size)
2509				chain_size = total_size;
2510			/*
2511			 * we don't forcefault because we'll call
2512			 * segmap_fault(F_SOFTLOCK) next.
2513			 *
2514			 * S_READ will get the ref bit set (by either
2515			 * segmap_getmapflt() or segmap_fault()) and page
2516			 * shared locked.
2517			 */
2518			base = segmap_getmapflt(segkmap, fvp, fileoff,
2519			    chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2520
2521			snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2522			snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2523			    PAGESIZE)- (mapoff & PAGEMASK);
2524			/*
2525			 * We must call segmap_fault() even for segmap_kpm
2526			 * because that's how error gets returned.
2527			 * (segmap_getmapflt() never fails but segmap_fault()
2528			 * does.)
2529			 *
2530			 * If the pages aren't available yet, we get
2531			 * DEADLK, so wait and try again a little later using
2532			 * an increasing wait. We might be here a long time.
2533			 *
2534			 * If delay_sig returns EINTR, be sure to exit and
2535			 * pass it up to the caller.
2536			 */
2537			deadlk_wait = 0;
2538			while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2539			    segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2540			    mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2541			    S_READ))) == EDEADLK) {
2542				deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2543				if ((error = delay_sig(deadlk_wait)) != 0) {
2544					break;
2545				}
2546			}
2547			if (error != 0) {
2548				(void) segmap_release(segkmap, base, 0);
2549				kmem_free(snfi, sizeof (*snfi));
2550				error = (error == EINTR) ? EINTR : EIO;
2551				goto out;
2552			}
2553			snfi->snfi_frtn.free_func = snf_smap_desbfree;
2554			snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2555			snfi->snfi_base = base;
2556			snfi->snfi_mapoff = mapoff;
2557			mp = esballoca((uchar_t *)base + mapoff, chain_size,
2558			    BPRI_HI, &snfi->snfi_frtn);
2559
2560			if (mp == NULL) {
2561				(void) segmap_fault(kas.a_hat, segkmap,
2562				    (caddr_t)(uintptr_t)(((uintptr_t)base +
2563				    mapoff) & PAGEMASK), snfi->snfi_len,
2564				    F_SOFTUNLOCK, S_OTHER);
2565				(void) segmap_release(segkmap, base, 0);
2566				kmem_free(snfi, sizeof (*snfi));
2567				freemsg(mp);
2568				error = EAGAIN;
2569				goto out;
2570			}
2571			VN_HOLD(fvp);
2572			snfi->snfi_vp = fvp;
2573			mp->b_wptr += chain_size;
2574
2575			/* Mark this dblk with the zero-copy flag */
2576			mp->b_datap->db_struioflag |= STRUIO_ZC;
2577			fileoff += chain_size;
2578			total_size -= chain_size;
2579		}
2580
2581		if (total_size == 0 && !nowait) {
2582			ASSERT(!dowait);
2583			dowait = B_TRUE;
2584			mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2585		}
2586		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2587		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2588		if (error != 0) {
2589			/*
2590			 * mp contains the mblks that were not sent by
2591			 * socket_sendmblk. Use its size to update *count
2592			 */
2593			*count = ksize + (chain_size - msgdsize(mp));
2594			if (mp != NULL)
2595				freemsg(mp);
2596			return (error);
2597		}
2598		ksize += chain_size;
2599		if (total_size == 0)
2600			goto done;
2601
2602		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2603		va.va_mask = AT_SIZE;
2604		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2605		if (error)
2606			break;
2607		/* Read as much as possible. */
2608		if (fileoff >= va.va_size)
2609			break;
2610		if (total_size + fileoff > va.va_size)
2611			total_size = va.va_size - fileoff;
2612	}
2613out:
2614	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2615done:
2616	*count = ksize;
2617	if (dowait) {
2618		stdata_t *stp;
2619
2620		stp = vp->v_stream;
2621		if (stp == NULL) {
2622			struct sonode *so;
2623			so = VTOSO(vp);
2624			error = so_zcopy_wait(so);
2625		} else {
2626			mutex_enter(&stp->sd_lock);
2627			while (!(stp->sd_flag & STZCNOTIFY)) {
2628				if (cv_wait_sig(&stp->sd_zcopy_wait,
2629				    &stp->sd_lock) == 0) {
2630					error = EINTR;
2631					break;
2632				}
2633			}
2634			stp->sd_flag &= ~STZCNOTIFY;
2635			mutex_exit(&stp->sd_lock);
2636		}
2637	}
2638	return (error);
2639}
2640
2641int
2642snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2643    uint_t maxpsz, ssize_t *count)
2644{
2645	struct vnode *vp;
2646	mblk_t *mp;
2647	int iosize;
2648	int extra = 0;
2649	int error;
2650	short fflag;
2651	int ksize;
2652	int ioflag;
2653	struct uio auio;
2654	struct iovec aiov;
2655	struct vattr va;
2656	int maxblk = 0;
2657	int wroff = 0;
2658	struct sonode *so;
2659	struct nmsghdr msg;
2660
2661	vp = fp->f_vnode;
2662	if (vp->v_type == VSOCK) {
2663		stdata_t *stp;
2664
2665		/*
2666		 * Get the extra space to insert a header and a trailer.
2667		 */
2668		so = VTOSO(vp);
2669		stp = vp->v_stream;
2670		if (stp == NULL) {
2671			wroff = so->so_proto_props.sopp_wroff;
2672			maxblk = so->so_proto_props.sopp_maxblk;
2673			extra = wroff + so->so_proto_props.sopp_tail;
2674		} else {
2675			wroff = (int)(stp->sd_wroff);
2676			maxblk = (int)(stp->sd_maxblk);
2677			extra = wroff + (int)(stp->sd_tail);
2678		}
2679	}
2680	bzero(&msg, sizeof (msg));
2681	fflag = fp->f_flag;
2682	ksize = 0;
2683	auio.uio_iov = &aiov;
2684	auio.uio_iovcnt = 1;
2685	auio.uio_segflg = UIO_SYSSPACE;
2686	auio.uio_llimit = MAXOFFSET_T;
2687	auio.uio_fmode = fflag;
2688	auio.uio_extflg = UIO_COPY_CACHED;
2689	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2690	/* If read sync is not asked for, filter sync flags */
2691	if ((ioflag & FRSYNC) == 0)
2692		ioflag &= ~(FSYNC|FDSYNC);
2693	for (;;) {
2694		if (ISSIG(curthread, JUSTLOOKING)) {
2695			error = EINTR;
2696			break;
2697		}
2698		iosize = (int)MIN(maxpsz, size);
2699
2700		/*
2701		 * Socket filters can limit the mblk size,
2702		 * so limit reads to maxblk if there are
2703		 * filters present.
2704		 */
2705		if (vp->v_type == VSOCK &&
2706		    so->so_filter_active > 0 && maxblk != INFPSZ)
2707			iosize = (int)MIN(iosize, maxblk);
2708
2709		if (is_system_labeled()) {
2710			mp = allocb_cred(iosize + extra, CRED(),
2711			    curproc->p_pid);
2712		} else {
2713			mp = allocb(iosize + extra, BPRI_MED);
2714		}
2715		if (mp == NULL) {
2716			error = EAGAIN;
2717			break;
2718		}
2719
2720		mp->b_rptr += wroff;
2721
2722		aiov.iov_base = (caddr_t)mp->b_rptr;
2723		aiov.iov_len = iosize;
2724		auio.uio_loffset = fileoff;
2725		auio.uio_resid = iosize;
2726
2727		error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2728		iosize -= auio.uio_resid;
2729
2730		if (error == EINTR && iosize != 0)
2731			error = 0;
2732
2733		if (error != 0 || iosize == 0) {
2734			freeb(mp);
2735			break;
2736		}
2737		mp->b_wptr = mp->b_rptr + iosize;
2738
2739		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2740
2741		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2742
2743		if (error != 0) {
2744			*count = ksize;
2745			if (mp != NULL)
2746				freeb(mp);
2747			return (error);
2748		}
2749		ksize += iosize;
2750		size -= iosize;
2751		if (size == 0)
2752			goto done;
2753
2754		fileoff += iosize;
2755		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2756		va.va_mask = AT_SIZE;
2757		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2758		if (error)
2759			break;
2760		/* Read as much as possible. */
2761		if (fileoff >= va.va_size)
2762			size = 0;
2763		else if (size + fileoff > va.va_size)
2764			size = va.va_size - fileoff;
2765	}
2766	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2767done:
2768	*count = ksize;
2769	return (error);
2770}
2771
2772#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2773/*
2774 * Largefile support for 32 bit applications only.
2775 */
2776int
2777sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2778    ssize32_t *count32)
2779{
2780	ssize32_t sfv_len;
2781	u_offset_t sfv_off, va_size;
2782	struct vnode *vp, *fvp, *realvp;
2783	struct vattr va;
2784	stdata_t *stp;
2785	ssize_t count = 0;
2786	int error = 0;
2787	boolean_t dozcopy = B_FALSE;
2788	uint_t maxpsz;
2789
2790	sfv_len = (ssize32_t)sfv->sfv_len;
2791	if (sfv_len < 0) {
2792		error = EINVAL;
2793		goto out;
2794	}
2795
2796	if (sfv_len == 0) goto out;
2797
2798	sfv_off = (u_offset_t)sfv->sfv_off;
2799
2800	/* Same checks as in pread */
2801	if (sfv_off > MAXOFFSET_T) {
2802		error = EINVAL;
2803		goto out;
2804	}
2805	if (sfv_off + sfv_len > MAXOFFSET_T)
2806		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2807
2808	/*
2809	 * There are no more checks on sfv_len. So, we cast it to
2810	 * u_offset_t and share the snf_direct_io/snf_cache code between
2811	 * 32 bit and 64 bit.
2812	 *
2813	 * TODO: should do nbl_need_check() like read()?
2814	 */
2815	if (sfv_len > sendfile_max_size) {
2816		sf_stats.ss_file_not_cached++;
2817		error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2818		    &count);
2819		goto out;
2820	}
2821	fvp = rfp->f_vnode;
2822	if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2823		fvp = realvp;
2824	/*
2825	 * Grab the lock as a reader to prevent the file size
2826	 * from changing underneath.
2827	 */
2828	(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2829	va.va_mask = AT_SIZE;
2830	error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2831	va_size = va.va_size;
2832	if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2833		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2834		goto out;
2835	}
2836	/* Read as much as possible. */
2837	if (sfv_off + sfv_len > va_size)
2838		sfv_len = va_size - sfv_off;
2839
2840	vp = fp->f_vnode;
2841	stp = vp->v_stream;
2842	/*
2843	 * When the NOWAIT flag is not set, we enable zero-copy only if the
2844	 * transfer size is large enough. This prevents performance loss
2845	 * when the caller sends the file piece by piece.
2846	 */
2847	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2848	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2849	    !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2850		uint_t copyflag;
2851		copyflag = stp != NULL ? stp->sd_copyflag :
2852		    VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2853		if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2854			int on = 1;
2855
2856			if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2857			    SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2858				dozcopy = B_TRUE;
2859		} else {
2860			dozcopy = copyflag & STZCVMSAFE;
2861		}
2862	}
2863	if (dozcopy) {
2864		sf_stats.ss_file_segmap++;
2865		error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2866		    &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2867	} else {
2868		if (vp->v_type == VSOCK && stp == NULL) {
2869			sonode_t *so = VTOSO(vp);
2870			maxpsz = so->so_proto_props.sopp_maxpsz;
2871		} else if (stp != NULL) {
2872			maxpsz = stp->sd_qn_maxpsz;
2873		} else {
2874			maxpsz = maxphys;
2875		}
2876
2877		if (maxpsz == INFPSZ)
2878			maxpsz = maxphys;
2879		else
2880			maxpsz = roundup(maxpsz, MAXBSIZE);
2881		sf_stats.ss_file_cached++;
2882		error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2883		    maxpsz, &count);
2884	}
2885out:
2886	releasef(sfv->sfv_fd);
2887	*count32 = (ssize32_t)count;
2888	return (error);
2889}
2890#endif
2891
2892#ifdef _SYSCALL32_IMPL
2893/*
2894 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2895 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2896 */
2897
/*
 * 32-bit entry point for recv(): converts the 32-bit buffer address to a
 * pointer, passes len as a ssize32_t and forwards everything to recv().
 */
ssize_t
recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}
2903
/*
 * 32-bit entry point for recvfrom(): converts the 32-bit buffer, name and
 * namelenp addresses to pointers, passes len as a ssize32_t and forwards
 * everything to recvfrom().
 */
ssize_t
recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
	caddr32_t name, caddr32_t namelenp)
{
	return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
	    (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
}
2911
/*
 * 32-bit entry point for send(): converts the 32-bit buffer address to a
 * pointer, passes len as a ssize32_t and forwards everything to send().
 */
ssize_t
send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}
2917
/*
 * 32-bit entry point for sendto(): converts the 32-bit buffer and name
 * addresses to pointers, passes len as a ssize32_t and forwards everything
 * to sendto().
 */
ssize_t
sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
	caddr32_t name, socklen_t namelen)
{
	return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
	    (void *)(uintptr_t)name, namelen));
}
2925#endif	/* _SYSCALL32_IMPL */
2926
2927/*
2928 * Function wrappers (mostly around the sonode switch) for
2929 * backward compatibility.
2930 */
2931
/*
 * Compatibility wrapper: socket_accept() on so using the current
 * credentials (CRED()). Returns whatever socket_accept() returns.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
	return (socket_accept(so, fflag, CRED(), nsop));
}
2937
2938int
2939sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2940    int backlog, int flags)
2941{
2942	int	error;
2943
2944	error = socket_bind(so, name, namelen, flags, CRED());
2945	if (error == 0 && backlog != 0)
2946		return (socket_listen(so, backlog, CRED()));
2947
2948	return (error);
2949}
2950
/*
 * Compatibility wrapper: socket_listen() on so using the current
 * credentials (CRED()). Returns whatever socket_listen() returns.
 */
int
solisten(struct sonode *so, int backlog)
{
	return (socket_listen(so, backlog, CRED()));
}
2956
/*
 * Compatibility wrapper: socket_connect() on so using the current
 * credentials (CRED()). Returns whatever socket_connect() returns.
 */
int
soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int fflag, int flags)
{
	return (socket_connect(so, name, namelen, fflag, flags, CRED()));
}
2963
/*
 * Compatibility wrapper: socket_recvmsg() on so using the current
 * credentials (CRED()). Returns whatever socket_recvmsg() returns.
 */
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	return (socket_recvmsg(so, msg, uiop, CRED()));
}
2969
/*
 * Compatibility wrapper: socket_sendmsg() on so using the current
 * credentials (CRED()). Returns whatever socket_sendmsg() returns.
 */
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	return (socket_sendmsg(so, msg, uiop, CRED()));
}
2975
/*
 * Compatibility wrapper: socket_shutdown() on so using the current
 * credentials (CRED()). Returns whatever socket_shutdown() returns.
 */
int
soshutdown(struct sonode *so, int how)
{
	return (socket_shutdown(so, how, CRED()));
}
2981
/*
 * Compatibility wrapper: socket_getsockopt() on so using the current
 * credentials (CRED()). Returns whatever socket_getsockopt() returns.
 */
int
sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
    socklen_t *optlenp, int flags)
{
	return (socket_getsockopt(so, level, option_name, optval, optlenp,
	    flags, CRED()));
}
2989
/*
 * Compatibility wrapper: socket_setsockopt() on so using the current
 * credentials (CRED()). Returns whatever socket_setsockopt() returns.
 */
int
sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
    t_uscalar_t optlen)
{
	return (socket_setsockopt(so, level, option_name, optval, optlen,
	    CRED()));
}
2997
2998/*
2999 * Because this is backward compatibility interface it only needs to be
3000 * able to handle the creation of TPI sockfs sockets.
3001 */
3002struct sonode *
3003socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3004    int *errorp)
3005{
3006	struct sonode *so;
3007
3008	ASSERT(sp != NULL);
3009
3010	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3011	    version, SOCKET_SLEEP, errorp, CRED());
3012	if (so == NULL) {
3013		SOCKPARAMS_DEC_REF(sp);
3014	} else {
3015		if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3016			/* Cannot fail, only bumps so_count */
3017			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3018		} else {
3019			socket_destroy(so);
3020			so = NULL;
3021		}
3022	}
3023	return (so);
3024}
3025