kern_sendfile.c revision 205316
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 205316 2010-03-19 10:41:32Z kib $");
37
38#include "opt_inet.h"
39#include "opt_inet6.h"
40#include "opt_sctp.h"
41#include "opt_compat.h"
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/mutex.h>
49#include <sys/sysproto.h>
50#include <sys/malloc.h>
51#include <sys/filedesc.h>
52#include <sys/event.h>
53#include <sys/proc.h>
54#include <sys/fcntl.h>
55#include <sys/file.h>
56#include <sys/filio.h>
57#include <sys/jail.h>
58#include <sys/mount.h>
59#include <sys/mbuf.h>
60#include <sys/protosw.h>
61#include <sys/sf_buf.h>
62#include <sys/socket.h>
63#include <sys/socketvar.h>
64#include <sys/signalvar.h>
65#include <sys/syscallsubr.h>
66#include <sys/sysctl.h>
67#include <sys/uio.h>
68#include <sys/vnode.h>
69#ifdef KTRACE
70#include <sys/ktrace.h>
71#endif
72
73#include <net/vnet.h>
74
75#include <security/audit/audit.h>
76#include <security/mac/mac_framework.h>
77
78#include <vm/vm.h>
79#include <vm/vm_object.h>
80#include <vm/vm_page.h>
81#include <vm/vm_pageout.h>
82#include <vm/vm_kern.h>
83#include <vm/vm_extern.h>
84
85#if defined(INET) || defined(INET6)
86#ifdef SCTP
87#include <netinet/sctp.h>
88#include <netinet/sctp_peeloff.h>
89#endif /* SCTP */
90#endif /* INET || INET6 */
91
/* Prototypes for this file's static helpers. */
static int	accept1(struct thread *td, struct accept_args *uap, int compat);
static int	do_sendfile(struct thread *td, struct sendfile_args *uap,
		    int compat);
static int	getpeername1(struct thread *td, struct getpeername_args *uap,
		    int compat);
static int	getsockname1(struct thread *td, struct getsockname_args *uap,
		    int compat);
static int	recvit(struct thread *td, int s, struct msghdr *mp,
		    void *namelenp);
static int	sendit(struct thread *td, int s, struct msghdr *mp, int flags);
101
102/*
103 * NSFBUFS-related variables and associated sysctls
104 */
105int nsfbufs;
106int nsfbufspeak;
107int nsfbufsused;
108
109SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
110    "Maximum number of sendfile(2) sf_bufs available");
111SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
112    "Number of sendfile(2) sf_bufs at peak usage");
113SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
114    "Number of sendfile(2) sf_bufs in use");
115
116/*
117 * Convert a user file descriptor to a kernel file entry.  A reference on the
118 * file entry is held upon returning.  This is lighter weight than
119 * fgetsock(), which bumps the socket reference drops the file reference
120 * count instead, as this approach avoids several additional mutex operations
121 * associated with the additional reference count.  If requested, return the
122 * open file flags.
123 */
124static int
125getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
126{
127	struct file *fp;
128	int error;
129
130	fp = NULL;
131	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) {
132		error = EBADF;
133	} else if (fp->f_type != DTYPE_SOCKET) {
134		fdrop(fp, curthread);
135		fp = NULL;
136		error = ENOTSOCK;
137	} else {
138		if (fflagp != NULL)
139			*fflagp = fp->f_flag;
140		error = 0;
141	}
142	*fpp = fp;
143	return (error);
144}
145
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_43 kernels also get the old-style socket entry points, which are
 * guarded by COMPAT_OLDSOCK throughout this file.
 */
#ifdef COMPAT_43
#define COMPAT_OLDSOCK
#endif
152
153int
154socket(td, uap)
155	struct thread *td;
156	struct socket_args /* {
157		int	domain;
158		int	type;
159		int	protocol;
160	} */ *uap;
161{
162	struct filedesc *fdp;
163	struct socket *so;
164	struct file *fp;
165	int fd, error;
166
167	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
168#ifdef MAC
169	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
170	    uap->protocol);
171	if (error)
172		return (error);
173#endif
174	fdp = td->td_proc->p_fd;
175	error = falloc(td, &fp, &fd);
176	if (error)
177		return (error);
178	/* An extra reference on `fp' has been held for us by falloc(). */
179	error = socreate(uap->domain, &so, uap->type, uap->protocol,
180	    td->td_ucred, td);
181	if (error) {
182		fdclose(fdp, fp, fd, td);
183	} else {
184		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
185		td->td_retval[0] = fd;
186	}
187	fdrop(fp, td);
188	return (error);
189}
190
191/* ARGSUSED */
192int
193bind(td, uap)
194	struct thread *td;
195	struct bind_args /* {
196		int	s;
197		caddr_t	name;
198		int	namelen;
199	} */ *uap;
200{
201	struct sockaddr *sa;
202	int error;
203
204	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
205		return (error);
206
207	error = kern_bind(td, uap->s, sa);
208	free(sa, M_SONAME);
209	return (error);
210}
211
212int
213kern_bind(td, fd, sa)
214	struct thread *td;
215	int fd;
216	struct sockaddr *sa;
217{
218	struct socket *so;
219	struct file *fp;
220	int error;
221
222	AUDIT_ARG_FD(fd);
223	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
224	if (error)
225		return (error);
226	so = fp->f_data;
227#ifdef KTRACE
228	if (KTRPOINT(td, KTR_STRUCT))
229		ktrsockaddr(sa);
230#endif
231#ifdef MAC
232	error = mac_socket_check_bind(td->td_ucred, so, sa);
233	if (error == 0)
234#endif
235		error = sobind(so, sa, td);
236	fdrop(fp, td);
237	return (error);
238}
239
240/* ARGSUSED */
241int
242listen(td, uap)
243	struct thread *td;
244	struct listen_args /* {
245		int	s;
246		int	backlog;
247	} */ *uap;
248{
249	struct socket *so;
250	struct file *fp;
251	int error;
252
253	AUDIT_ARG_FD(uap->s);
254	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
255	if (error == 0) {
256		so = fp->f_data;
257#ifdef MAC
258		error = mac_socket_check_listen(td->td_ucred, so);
259		if (error == 0) {
260#endif
261			CURVNET_SET(so->so_vnet);
262			error = solisten(so, uap->backlog, td);
263			CURVNET_RESTORE();
264#ifdef MAC
265		}
266#endif
267		fdrop(fp, td);
268	}
269	return(error);
270}
271
272/*
273 * accept1()
274 */
275static int
276accept1(td, uap, compat)
277	struct thread *td;
278	struct accept_args /* {
279		int	s;
280		struct sockaddr	* __restrict name;
281		socklen_t	* __restrict anamelen;
282	} */ *uap;
283	int compat;
284{
285	struct sockaddr *name;
286	socklen_t namelen;
287	struct file *fp;
288	int error;
289
290	if (uap->name == NULL)
291		return (kern_accept(td, uap->s, NULL, NULL, NULL));
292
293	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
294	if (error)
295		return (error);
296
297	error = kern_accept(td, uap->s, &name, &namelen, &fp);
298
299	/*
300	 * return a namelen of zero for older code which might
301	 * ignore the return value from accept.
302	 */
303	if (error) {
304		(void) copyout(&namelen,
305		    uap->anamelen, sizeof(*uap->anamelen));
306		return (error);
307	}
308
309	if (error == 0 && name != NULL) {
310#ifdef COMPAT_OLDSOCK
311		if (compat)
312			((struct osockaddr *)name)->sa_family =
313			    name->sa_family;
314#endif
315		error = copyout(name, uap->name, namelen);
316	}
317	if (error == 0)
318		error = copyout(&namelen, uap->anamelen,
319		    sizeof(namelen));
320	if (error)
321		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
322	fdrop(fp, td);
323	free(name, M_SONAME);
324	return (error);
325}
326
327int
328kern_accept(struct thread *td, int s, struct sockaddr **name,
329    socklen_t *namelen, struct file **fp)
330{
331	struct filedesc *fdp;
332	struct file *headfp, *nfp = NULL;
333	struct sockaddr *sa = NULL;
334	int error;
335	struct socket *head, *so;
336	int fd;
337	u_int fflag;
338	pid_t pgid;
339	int tmp;
340
341	if (name) {
342		*name = NULL;
343		if (*namelen < 0)
344			return (EINVAL);
345	}
346
347	AUDIT_ARG_FD(s);
348	fdp = td->td_proc->p_fd;
349	error = getsock(fdp, s, &headfp, &fflag);
350	if (error)
351		return (error);
352	head = headfp->f_data;
353	if ((head->so_options & SO_ACCEPTCONN) == 0) {
354		error = EINVAL;
355		goto done;
356	}
357#ifdef MAC
358	error = mac_socket_check_accept(td->td_ucred, head);
359	if (error != 0)
360		goto done;
361#endif
362	error = falloc(td, &nfp, &fd);
363	if (error)
364		goto done;
365	ACCEPT_LOCK();
366	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
367		ACCEPT_UNLOCK();
368		error = EWOULDBLOCK;
369		goto noconnection;
370	}
371	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
372		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
373			head->so_error = ECONNABORTED;
374			break;
375		}
376		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
377		    "accept", 0);
378		if (error) {
379			ACCEPT_UNLOCK();
380			goto noconnection;
381		}
382	}
383	if (head->so_error) {
384		error = head->so_error;
385		head->so_error = 0;
386		ACCEPT_UNLOCK();
387		goto noconnection;
388	}
389	so = TAILQ_FIRST(&head->so_comp);
390	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
391	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
392
393	/*
394	 * Before changing the flags on the socket, we have to bump the
395	 * reference count.  Otherwise, if the protocol calls sofree(),
396	 * the socket will be released due to a zero refcount.
397	 */
398	SOCK_LOCK(so);			/* soref() and so_state update */
399	soref(so);			/* file descriptor reference */
400
401	TAILQ_REMOVE(&head->so_comp, so, so_list);
402	head->so_qlen--;
403	so->so_state |= (head->so_state & SS_NBIO);
404	so->so_qstate &= ~SQ_COMP;
405	so->so_head = NULL;
406
407	SOCK_UNLOCK(so);
408	ACCEPT_UNLOCK();
409
410	/* An extra reference on `nfp' has been held for us by falloc(). */
411	td->td_retval[0] = fd;
412
413	/* connection has been removed from the listen queue */
414	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
415
416	pgid = fgetown(&head->so_sigio);
417	if (pgid != 0)
418		fsetown(pgid, &so->so_sigio);
419
420	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
421	/* Sync socket nonblocking/async state with file flags */
422	tmp = fflag & FNONBLOCK;
423	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
424	tmp = fflag & FASYNC;
425	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
426	sa = 0;
427	CURVNET_SET(so->so_vnet);
428	error = soaccept(so, &sa);
429	CURVNET_RESTORE();
430	if (error) {
431		/*
432		 * return a namelen of zero for older code which might
433		 * ignore the return value from accept.
434		 */
435		if (name)
436			*namelen = 0;
437		goto noconnection;
438	}
439	if (sa == NULL) {
440		if (name)
441			*namelen = 0;
442		goto done;
443	}
444	if (name) {
445		/* check sa_len before it is destroyed */
446		if (*namelen > sa->sa_len)
447			*namelen = sa->sa_len;
448#ifdef KTRACE
449		if (KTRPOINT(td, KTR_STRUCT))
450			ktrsockaddr(sa);
451#endif
452		*name = sa;
453		sa = NULL;
454	}
455noconnection:
456	if (sa)
457		free(sa, M_SONAME);
458
459	/*
460	 * close the new descriptor, assuming someone hasn't ripped it
461	 * out from under us.
462	 */
463	if (error)
464		fdclose(fdp, nfp, fd, td);
465
466	/*
467	 * Release explicitly held references before returning.  We return
468	 * a reference on nfp to the caller on success if they request it.
469	 */
470done:
471	if (fp != NULL) {
472		if (error == 0) {
473			*fp = nfp;
474			nfp = NULL;
475		} else
476			*fp = NULL;
477	}
478	if (nfp != NULL)
479		fdrop(nfp, td);
480	fdrop(headfp, td);
481	return (error);
482}
483
/*
 * accept(2): accept1() without the old-sockaddr compatibility handling.
 */
int
accept(struct thread *td, struct accept_args *uap)
{
	const int compat = 0;

	return (accept1(td, uap, compat));
}
492
#ifdef COMPAT_OLDSOCK
/*
 * Old-style accept: accept1() with the compatibility flag set.
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{
	const int compat = 1;

	return (accept1(td, uap, compat));
}
#endif /* COMPAT_OLDSOCK */
503
504/* ARGSUSED */
505int
506connect(td, uap)
507	struct thread *td;
508	struct connect_args /* {
509		int	s;
510		caddr_t	name;
511		int	namelen;
512	} */ *uap;
513{
514	struct sockaddr *sa;
515	int error;
516
517	error = getsockaddr(&sa, uap->name, uap->namelen);
518	if (error)
519		return (error);
520
521	error = kern_connect(td, uap->s, sa);
522	free(sa, M_SONAME);
523	return (error);
524}
525
526
527int
528kern_connect(td, fd, sa)
529	struct thread *td;
530	int fd;
531	struct sockaddr *sa;
532{
533	struct socket *so;
534	struct file *fp;
535	int error;
536	int interrupted = 0;
537
538	AUDIT_ARG_FD(fd);
539	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
540	if (error)
541		return (error);
542	so = fp->f_data;
543	if (so->so_state & SS_ISCONNECTING) {
544		error = EALREADY;
545		goto done1;
546	}
547#ifdef KTRACE
548	if (KTRPOINT(td, KTR_STRUCT))
549		ktrsockaddr(sa);
550#endif
551#ifdef MAC
552	error = mac_socket_check_connect(td->td_ucred, so, sa);
553	if (error)
554		goto bad;
555#endif
556	error = soconnect(so, sa, td);
557	if (error)
558		goto bad;
559	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
560		error = EINPROGRESS;
561		goto done1;
562	}
563	SOCK_LOCK(so);
564	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
565		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
566		    "connec", 0);
567		if (error) {
568			if (error == EINTR || error == ERESTART)
569				interrupted = 1;
570			break;
571		}
572	}
573	if (error == 0) {
574		error = so->so_error;
575		so->so_error = 0;
576	}
577	SOCK_UNLOCK(so);
578bad:
579	if (!interrupted)
580		so->so_state &= ~SS_ISCONNECTING;
581	if (error == ERESTART)
582		error = EINTR;
583done1:
584	fdrop(fp, td);
585	return (error);
586}
587
588int
589kern_socketpair(struct thread *td, int domain, int type, int protocol,
590    int *rsv)
591{
592	struct filedesc *fdp = td->td_proc->p_fd;
593	struct file *fp1, *fp2;
594	struct socket *so1, *so2;
595	int fd, error;
596
597	AUDIT_ARG_SOCKET(domain, type, protocol);
598#ifdef MAC
599	/* We might want to have a separate check for socket pairs. */
600	error = mac_socket_check_create(td->td_ucred, domain, type,
601	    protocol);
602	if (error)
603		return (error);
604#endif
605	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
606	if (error)
607		return (error);
608	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
609	if (error)
610		goto free1;
611	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
612	error = falloc(td, &fp1, &fd);
613	if (error)
614		goto free2;
615	rsv[0] = fd;
616	fp1->f_data = so1;	/* so1 already has ref count */
617	error = falloc(td, &fp2, &fd);
618	if (error)
619		goto free3;
620	fp2->f_data = so2;	/* so2 already has ref count */
621	rsv[1] = fd;
622	error = soconnect2(so1, so2);
623	if (error)
624		goto free4;
625	if (type == SOCK_DGRAM) {
626		/*
627		 * Datagram socket connection is asymmetric.
628		 */
629		 error = soconnect2(so2, so1);
630		 if (error)
631			goto free4;
632	}
633	finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
634	finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
635	fdrop(fp1, td);
636	fdrop(fp2, td);
637	return (0);
638free4:
639	fdclose(fdp, fp2, rsv[1], td);
640	fdrop(fp2, td);
641free3:
642	fdclose(fdp, fp1, rsv[0], td);
643	fdrop(fp1, td);
644free2:
645	if (so2 != NULL)
646		(void)soclose(so2);
647free1:
648	if (so1 != NULL)
649		(void)soclose(so1);
650	return (error);
651}
652
653int
654socketpair(struct thread *td, struct socketpair_args *uap)
655{
656	int error, sv[2];
657
658	error = kern_socketpair(td, uap->domain, uap->type,
659	    uap->protocol, sv);
660	if (error)
661		return (error);
662	error = copyout(sv, uap->rsv, 2 * sizeof(int));
663	if (error) {
664		(void)kern_close(td, sv[0]);
665		(void)kern_close(td, sv[1]);
666	}
667	return (error);
668}
669
670static int
671sendit(td, s, mp, flags)
672	struct thread *td;
673	int s;
674	struct msghdr *mp;
675	int flags;
676{
677	struct mbuf *control;
678	struct sockaddr *to;
679	int error;
680
681	if (mp->msg_name != NULL) {
682		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
683		if (error) {
684			to = NULL;
685			goto bad;
686		}
687		mp->msg_name = to;
688	} else {
689		to = NULL;
690	}
691
692	if (mp->msg_control) {
693		if (mp->msg_controllen < sizeof(struct cmsghdr)
694#ifdef COMPAT_OLDSOCK
695		    && mp->msg_flags != MSG_COMPAT
696#endif
697		) {
698			error = EINVAL;
699			goto bad;
700		}
701		error = sockargs(&control, mp->msg_control,
702		    mp->msg_controllen, MT_CONTROL);
703		if (error)
704			goto bad;
705#ifdef COMPAT_OLDSOCK
706		if (mp->msg_flags == MSG_COMPAT) {
707			struct cmsghdr *cm;
708
709			M_PREPEND(control, sizeof(*cm), M_WAIT);
710			cm = mtod(control, struct cmsghdr *);
711			cm->cmsg_len = control->m_len;
712			cm->cmsg_level = SOL_SOCKET;
713			cm->cmsg_type = SCM_RIGHTS;
714		}
715#endif
716	} else {
717		control = NULL;
718	}
719
720	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
721
722bad:
723	if (to)
724		free(to, M_SONAME);
725	return (error);
726}
727
728int
729kern_sendit(td, s, mp, flags, control, segflg)
730	struct thread *td;
731	int s;
732	struct msghdr *mp;
733	int flags;
734	struct mbuf *control;
735	enum uio_seg segflg;
736{
737	struct file *fp;
738	struct uio auio;
739	struct iovec *iov;
740	struct socket *so;
741	int i;
742	int len, error;
743#ifdef KTRACE
744	struct uio *ktruio = NULL;
745#endif
746
747	AUDIT_ARG_FD(s);
748	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
749	if (error)
750		return (error);
751	so = (struct socket *)fp->f_data;
752
753#ifdef MAC
754	if (mp->msg_name != NULL) {
755		error = mac_socket_check_connect(td->td_ucred, so,
756		    mp->msg_name);
757		if (error)
758			goto bad;
759	}
760	error = mac_socket_check_send(td->td_ucred, so);
761	if (error)
762		goto bad;
763#endif
764
765	auio.uio_iov = mp->msg_iov;
766	auio.uio_iovcnt = mp->msg_iovlen;
767	auio.uio_segflg = segflg;
768	auio.uio_rw = UIO_WRITE;
769	auio.uio_td = td;
770	auio.uio_offset = 0;			/* XXX */
771	auio.uio_resid = 0;
772	iov = mp->msg_iov;
773	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
774		if ((auio.uio_resid += iov->iov_len) < 0) {
775			error = EINVAL;
776			goto bad;
777		}
778	}
779#ifdef KTRACE
780	if (KTRPOINT(td, KTR_GENIO))
781		ktruio = cloneuio(&auio);
782#endif
783	len = auio.uio_resid;
784	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
785	if (error) {
786		if (auio.uio_resid != len && (error == ERESTART ||
787		    error == EINTR || error == EWOULDBLOCK))
788			error = 0;
789		/* Generation of SIGPIPE can be controlled per socket */
790		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
791		    !(flags & MSG_NOSIGNAL)) {
792			PROC_LOCK(td->td_proc);
793			psignal(td->td_proc, SIGPIPE);
794			PROC_UNLOCK(td->td_proc);
795		}
796	}
797	if (error == 0)
798		td->td_retval[0] = len - auio.uio_resid;
799#ifdef KTRACE
800	if (ktruio != NULL) {
801		ktruio->uio_resid = td->td_retval[0];
802		ktrgenio(s, UIO_WRITE, ktruio, error);
803	}
804#endif
805bad:
806	fdrop(fp, td);
807	return (error);
808}
809
810int
811sendto(td, uap)
812	struct thread *td;
813	struct sendto_args /* {
814		int	s;
815		caddr_t	buf;
816		size_t	len;
817		int	flags;
818		caddr_t	to;
819		int	tolen;
820	} */ *uap;
821{
822	struct msghdr msg;
823	struct iovec aiov;
824	int error;
825
826	msg.msg_name = uap->to;
827	msg.msg_namelen = uap->tolen;
828	msg.msg_iov = &aiov;
829	msg.msg_iovlen = 1;
830	msg.msg_control = 0;
831#ifdef COMPAT_OLDSOCK
832	msg.msg_flags = 0;
833#endif
834	aiov.iov_base = uap->buf;
835	aiov.iov_len = uap->len;
836	error = sendit(td, uap->s, &msg, uap->flags);
837	return (error);
838}
839
#ifdef COMPAT_OLDSOCK
/*
 * Old send(): send the single buffer (`buf', `len') on descriptor `s' with
 * no destination address and no control data.
 */
int
osend(struct thread *td, struct osend_args *uap)
{
	struct iovec aiov;
	struct msghdr msg;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

/*
 * Old sendmsg(): copy in an omsghdr-sized header and its iovec array, mark
 * the message MSG_COMPAT so sendit() applies the old access-rights
 * handling, and send it on descriptor `s'.
 */
int
osendmsg(struct thread *td, struct osendmsg_args *uap)
{
	struct iovec *kiov;
	struct msghdr msg;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &kiov, EMSGSIZE);
	if (error != 0)
		return (error);
	/* Continue with the kernel copy of the iovec array. */
	msg.msg_iov = kiov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(kiov, M_IOV);
	return (error);
}
#endif
893
894int
895sendmsg(td, uap)
896	struct thread *td;
897	struct sendmsg_args /* {
898		int	s;
899		caddr_t	msg;
900		int	flags;
901	} */ *uap;
902{
903	struct msghdr msg;
904	struct iovec *iov;
905	int error;
906
907	error = copyin(uap->msg, &msg, sizeof (msg));
908	if (error)
909		return (error);
910	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
911	if (error)
912		return (error);
913	msg.msg_iov = iov;
914#ifdef COMPAT_OLDSOCK
915	msg.msg_flags = 0;
916#endif
917	error = sendit(td, uap->s, &msg, uap->flags);
918	free(iov, M_IOV);
919	return (error);
920}
921
922int
923kern_recvit(td, s, mp, fromseg, controlp)
924	struct thread *td;
925	int s;
926	struct msghdr *mp;
927	enum uio_seg fromseg;
928	struct mbuf **controlp;
929{
930	struct uio auio;
931	struct iovec *iov;
932	int i;
933	socklen_t len;
934	int error;
935	struct mbuf *m, *control = 0;
936	caddr_t ctlbuf;
937	struct file *fp;
938	struct socket *so;
939	struct sockaddr *fromsa = 0;
940#ifdef KTRACE
941	struct uio *ktruio = NULL;
942#endif
943
944	if (controlp != NULL)
945		*controlp = NULL;
946
947	AUDIT_ARG_FD(s);
948	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
949	if (error)
950		return (error);
951	so = fp->f_data;
952
953#ifdef MAC
954	error = mac_socket_check_receive(td->td_ucred, so);
955	if (error) {
956		fdrop(fp, td);
957		return (error);
958	}
959#endif
960
961	auio.uio_iov = mp->msg_iov;
962	auio.uio_iovcnt = mp->msg_iovlen;
963	auio.uio_segflg = UIO_USERSPACE;
964	auio.uio_rw = UIO_READ;
965	auio.uio_td = td;
966	auio.uio_offset = 0;			/* XXX */
967	auio.uio_resid = 0;
968	iov = mp->msg_iov;
969	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
970		if ((auio.uio_resid += iov->iov_len) < 0) {
971			fdrop(fp, td);
972			return (EINVAL);
973		}
974	}
975#ifdef KTRACE
976	if (KTRPOINT(td, KTR_GENIO))
977		ktruio = cloneuio(&auio);
978#endif
979	len = auio.uio_resid;
980	CURVNET_SET(so->so_vnet);
981	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
982	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
983	    &mp->msg_flags);
984	CURVNET_RESTORE();
985	if (error) {
986		if (auio.uio_resid != (int)len && (error == ERESTART ||
987		    error == EINTR || error == EWOULDBLOCK))
988			error = 0;
989	}
990#ifdef KTRACE
991	if (ktruio != NULL) {
992		ktruio->uio_resid = (int)len - auio.uio_resid;
993		ktrgenio(s, UIO_READ, ktruio, error);
994	}
995#endif
996	if (error)
997		goto out;
998	td->td_retval[0] = (int)len - auio.uio_resid;
999	if (mp->msg_name) {
1000		len = mp->msg_namelen;
1001		if (len <= 0 || fromsa == 0)
1002			len = 0;
1003		else {
1004			/* save sa_len before it is destroyed by MSG_COMPAT */
1005			len = MIN(len, fromsa->sa_len);
1006#ifdef COMPAT_OLDSOCK
1007			if (mp->msg_flags & MSG_COMPAT)
1008				((struct osockaddr *)fromsa)->sa_family =
1009				    fromsa->sa_family;
1010#endif
1011			if (fromseg == UIO_USERSPACE) {
1012				error = copyout(fromsa, mp->msg_name,
1013				    (unsigned)len);
1014				if (error)
1015					goto out;
1016			} else
1017				bcopy(fromsa, mp->msg_name, len);
1018		}
1019		mp->msg_namelen = len;
1020	}
1021	if (mp->msg_control && controlp == NULL) {
1022#ifdef COMPAT_OLDSOCK
1023		/*
1024		 * We assume that old recvmsg calls won't receive access
1025		 * rights and other control info, esp. as control info
1026		 * is always optional and those options didn't exist in 4.3.
1027		 * If we receive rights, trim the cmsghdr; anything else
1028		 * is tossed.
1029		 */
1030		if (control && mp->msg_flags & MSG_COMPAT) {
1031			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1032			    SOL_SOCKET ||
1033			    mtod(control, struct cmsghdr *)->cmsg_type !=
1034			    SCM_RIGHTS) {
1035				mp->msg_controllen = 0;
1036				goto out;
1037			}
1038			control->m_len -= sizeof (struct cmsghdr);
1039			control->m_data += sizeof (struct cmsghdr);
1040		}
1041#endif
1042		len = mp->msg_controllen;
1043		m = control;
1044		mp->msg_controllen = 0;
1045		ctlbuf = mp->msg_control;
1046
1047		while (m && len > 0) {
1048			unsigned int tocopy;
1049
1050			if (len >= m->m_len)
1051				tocopy = m->m_len;
1052			else {
1053				mp->msg_flags |= MSG_CTRUNC;
1054				tocopy = len;
1055			}
1056
1057			if ((error = copyout(mtod(m, caddr_t),
1058					ctlbuf, tocopy)) != 0)
1059				goto out;
1060
1061			ctlbuf += tocopy;
1062			len -= tocopy;
1063			m = m->m_next;
1064		}
1065		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1066	}
1067out:
1068	fdrop(fp, td);
1069#ifdef KTRACE
1070	if (fromsa && KTRPOINT(td, KTR_STRUCT))
1071		ktrsockaddr(fromsa);
1072#endif
1073	if (fromsa)
1074		free(fromsa, M_SONAME);
1075
1076	if (error == 0 && controlp != NULL)
1077		*controlp = control;
1078	else  if (control)
1079		m_freem(control);
1080
1081	return (error);
1082}
1083
1084static int
1085recvit(td, s, mp, namelenp)
1086	struct thread *td;
1087	int s;
1088	struct msghdr *mp;
1089	void *namelenp;
1090{
1091	int error;
1092
1093	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1094	if (error)
1095		return (error);
1096	if (namelenp) {
1097		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1098#ifdef COMPAT_OLDSOCK
1099		if (mp->msg_flags & MSG_COMPAT)
1100			error = 0;	/* old recvfrom didn't check */
1101#endif
1102	}
1103	return (error);
1104}
1105
1106int
1107recvfrom(td, uap)
1108	struct thread *td;
1109	struct recvfrom_args /* {
1110		int	s;
1111		caddr_t	buf;
1112		size_t	len;
1113		int	flags;
1114		struct sockaddr * __restrict	from;
1115		socklen_t * __restrict fromlenaddr;
1116	} */ *uap;
1117{
1118	struct msghdr msg;
1119	struct iovec aiov;
1120	int error;
1121
1122	if (uap->fromlenaddr) {
1123		error = copyin(uap->fromlenaddr,
1124		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1125		if (error)
1126			goto done2;
1127	} else {
1128		msg.msg_namelen = 0;
1129	}
1130	msg.msg_name = uap->from;
1131	msg.msg_iov = &aiov;
1132	msg.msg_iovlen = 1;
1133	aiov.iov_base = uap->buf;
1134	aiov.iov_len = uap->len;
1135	msg.msg_control = 0;
1136	msg.msg_flags = uap->flags;
1137	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1138done2:
1139	return(error);
1140}
1141
#ifdef COMPAT_OLDSOCK
/*
 * Old recvfrom(): recvfrom() with MSG_COMPAT forced into the flags.
 */
int
orecvfrom(struct thread *td, struct recvfrom_args *uap)
{
	uap->flags = uap->flags | MSG_COMPAT;
	return (recvfrom(td, uap));
}
#endif
1153
#ifdef COMPAT_OLDSOCK
/*
 * Old recv(): receive into the single buffer (`buf', `len') from descriptor
 * `s', with no source address and no control data.
 */
int
orecv(struct thread *td, struct orecv_args *uap)
{
	struct iovec aiov;
	struct msghdr msg;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	return (recvit(td, uap->s, &msg, NULL));
}

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.  The resulting address length
 * is written back into the user's header by recvit(); a non-zero
 * access-rights length is written back here.
 */
int
orecvmsg(struct thread *td, struct orecvmsg_args *uap)
{
	struct iovec *kiov;
	struct msghdr msg;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &kiov, EMSGSIZE);
	if (error != 0)
		return (error);
	/* Continue with the kernel copy of the iovec array. */
	msg.msg_iov = kiov;
	msg.msg_flags = uap->flags | MSG_COMPAT;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	if (error == 0 && msg.msg_controllen)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(kiov, M_IOV);
	return (error);
}
#endif
1215
1216int
1217recvmsg(td, uap)
1218	struct thread *td;
1219	struct recvmsg_args /* {
1220		int	s;
1221		struct	msghdr *msg;
1222		int	flags;
1223	} */ *uap;
1224{
1225	struct msghdr msg;
1226	struct iovec *uiov, *iov;
1227	int error;
1228
1229	error = copyin(uap->msg, &msg, sizeof (msg));
1230	if (error)
1231		return (error);
1232	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1233	if (error)
1234		return (error);
1235	msg.msg_flags = uap->flags;
1236#ifdef COMPAT_OLDSOCK
1237	msg.msg_flags &= ~MSG_COMPAT;
1238#endif
1239	uiov = msg.msg_iov;
1240	msg.msg_iov = iov;
1241	error = recvit(td, uap->s, &msg, NULL);
1242	if (error == 0) {
1243		msg.msg_iov = uiov;
1244		error = copyout(&msg, uap->msg, sizeof(msg));
1245	}
1246	free(iov, M_IOV);
1247	return (error);
1248}
1249
1250/* ARGSUSED */
1251int
1252shutdown(td, uap)
1253	struct thread *td;
1254	struct shutdown_args /* {
1255		int	s;
1256		int	how;
1257	} */ *uap;
1258{
1259	struct socket *so;
1260	struct file *fp;
1261	int error;
1262
1263	AUDIT_ARG_FD(uap->s);
1264	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
1265	if (error == 0) {
1266		so = fp->f_data;
1267		error = soshutdown(so, uap->how);
1268		fdrop(fp, td);
1269	}
1270	return (error);
1271}
1272
1273/* ARGSUSED */
1274int
1275setsockopt(td, uap)
1276	struct thread *td;
1277	struct setsockopt_args /* {
1278		int	s;
1279		int	level;
1280		int	name;
1281		caddr_t	val;
1282		int	valsize;
1283	} */ *uap;
1284{
1285
1286	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1287	    uap->val, UIO_USERSPACE, uap->valsize));
1288}
1289
1290int
1291kern_setsockopt(td, s, level, name, val, valseg, valsize)
1292	struct thread *td;
1293	int s;
1294	int level;
1295	int name;
1296	void *val;
1297	enum uio_seg valseg;
1298	socklen_t valsize;
1299{
1300	int error;
1301	struct socket *so;
1302	struct file *fp;
1303	struct sockopt sopt;
1304
1305	if (val == NULL && valsize != 0)
1306		return (EFAULT);
1307	if ((int)valsize < 0)
1308		return (EINVAL);
1309
1310	sopt.sopt_dir = SOPT_SET;
1311	sopt.sopt_level = level;
1312	sopt.sopt_name = name;
1313	sopt.sopt_val = val;
1314	sopt.sopt_valsize = valsize;
1315	switch (valseg) {
1316	case UIO_USERSPACE:
1317		sopt.sopt_td = td;
1318		break;
1319	case UIO_SYSSPACE:
1320		sopt.sopt_td = NULL;
1321		break;
1322	default:
1323		panic("kern_setsockopt called with bad valseg");
1324	}
1325
1326	AUDIT_ARG_FD(s);
1327	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1328	if (error == 0) {
1329		so = fp->f_data;
1330		CURVNET_SET(so->so_vnet);
1331		error = sosetopt(so, &sopt);
1332		CURVNET_RESTORE();
1333		fdrop(fp, td);
1334	}
1335	return(error);
1336}
1337
1338/* ARGSUSED */
1339int
1340getsockopt(td, uap)
1341	struct thread *td;
1342	struct getsockopt_args /* {
1343		int	s;
1344		int	level;
1345		int	name;
1346		void * __restrict	val;
1347		socklen_t * __restrict avalsize;
1348	} */ *uap;
1349{
1350	socklen_t valsize;
1351	int	error;
1352
1353	if (uap->val) {
1354		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1355		if (error)
1356			return (error);
1357	}
1358
1359	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1360	    uap->val, UIO_USERSPACE, &valsize);
1361
1362	if (error == 0)
1363		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1364	return (error);
1365}
1366
1367/*
1368 * Kernel version of getsockopt.
 * val may point into user space or kernel space (selected by valseg);
 * valsize is always a kernel pointer.
1370 */
1371int
1372kern_getsockopt(td, s, level, name, val, valseg, valsize)
1373	struct thread *td;
1374	int s;
1375	int level;
1376	int name;
1377	void *val;
1378	enum uio_seg valseg;
1379	socklen_t *valsize;
1380{
1381	int error;
1382	struct  socket *so;
1383	struct file *fp;
1384	struct	sockopt sopt;
1385
1386	if (val == NULL)
1387		*valsize = 0;
1388	if ((int)*valsize < 0)
1389		return (EINVAL);
1390
1391	sopt.sopt_dir = SOPT_GET;
1392	sopt.sopt_level = level;
1393	sopt.sopt_name = name;
1394	sopt.sopt_val = val;
1395	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1396	switch (valseg) {
1397	case UIO_USERSPACE:
1398		sopt.sopt_td = td;
1399		break;
1400	case UIO_SYSSPACE:
1401		sopt.sopt_td = NULL;
1402		break;
1403	default:
1404		panic("kern_getsockopt called with bad valseg");
1405	}
1406
1407	AUDIT_ARG_FD(s);
1408	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1409	if (error == 0) {
1410		so = fp->f_data;
1411		CURVNET_SET(so->so_vnet);
1412		error = sogetopt(so, &sopt);
1413		CURVNET_RESTORE();
1414		*valsize = sopt.sopt_valsize;
1415		fdrop(fp, td);
1416	}
1417	return (error);
1418}
1419
1420/*
1421 * getsockname1() - Get socket name.
1422 */
1423/* ARGSUSED */
1424static int
1425getsockname1(td, uap, compat)
1426	struct thread *td;
1427	struct getsockname_args /* {
1428		int	fdes;
1429		struct sockaddr * __restrict asa;
1430		socklen_t * __restrict alen;
1431	} */ *uap;
1432	int compat;
1433{
1434	struct sockaddr *sa;
1435	socklen_t len;
1436	int error;
1437
1438	error = copyin(uap->alen, &len, sizeof(len));
1439	if (error)
1440		return (error);
1441
1442	error = kern_getsockname(td, uap->fdes, &sa, &len);
1443	if (error)
1444		return (error);
1445
1446	if (len != 0) {
1447#ifdef COMPAT_OLDSOCK
1448		if (compat)
1449			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1450#endif
1451		error = copyout(sa, uap->asa, (u_int)len);
1452	}
1453	free(sa, M_SONAME);
1454	if (error == 0)
1455		error = copyout(&len, uap->alen, sizeof(len));
1456	return (error);
1457}
1458
1459int
1460kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1461    socklen_t *alen)
1462{
1463	struct socket *so;
1464	struct file *fp;
1465	socklen_t len;
1466	int error;
1467
1468	if (*alen < 0)
1469		return (EINVAL);
1470
1471	AUDIT_ARG_FD(fd);
1472	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1473	if (error)
1474		return (error);
1475	so = fp->f_data;
1476	*sa = NULL;
1477	CURVNET_SET(so->so_vnet);
1478	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1479	CURVNET_RESTORE();
1480	if (error)
1481		goto bad;
1482	if (*sa == NULL)
1483		len = 0;
1484	else
1485		len = MIN(*alen, (*sa)->sa_len);
1486	*alen = len;
1487#ifdef KTRACE
1488	if (KTRPOINT(td, KTR_STRUCT))
1489		ktrsockaddr(*sa);
1490#endif
1491bad:
1492	fdrop(fp, td);
1493	if (error && *sa) {
1494		free(*sa, M_SONAME);
1495		*sa = NULL;
1496	}
1497	return (error);
1498}
1499
/*
 * getsockname(2): syscall entry point; calls getsockname1() with the
 * compat flag clear.
 */
int
getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1508
1509#ifdef COMPAT_OLDSOCK
/*
 * Old-socket compatibility entry point; calls getsockname1() with the
 * compat flag set.
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
1518#endif /* COMPAT_OLDSOCK */
1519
1520/*
1521 * getpeername1() - Get name of peer for connected socket.
1522 */
1523/* ARGSUSED */
1524static int
1525getpeername1(td, uap, compat)
1526	struct thread *td;
1527	struct getpeername_args /* {
1528		int	fdes;
1529		struct sockaddr * __restrict	asa;
1530		socklen_t * __restrict	alen;
1531	} */ *uap;
1532	int compat;
1533{
1534	struct sockaddr *sa;
1535	socklen_t len;
1536	int error;
1537
1538	error = copyin(uap->alen, &len, sizeof (len));
1539	if (error)
1540		return (error);
1541
1542	error = kern_getpeername(td, uap->fdes, &sa, &len);
1543	if (error)
1544		return (error);
1545
1546	if (len != 0) {
1547#ifdef COMPAT_OLDSOCK
1548		if (compat)
1549			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1550#endif
1551		error = copyout(sa, uap->asa, (u_int)len);
1552	}
1553	free(sa, M_SONAME);
1554	if (error == 0)
1555		error = copyout(&len, uap->alen, sizeof(len));
1556	return (error);
1557}
1558
1559int
1560kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1561    socklen_t *alen)
1562{
1563	struct socket *so;
1564	struct file *fp;
1565	socklen_t len;
1566	int error;
1567
1568	if (*alen < 0)
1569		return (EINVAL);
1570
1571	AUDIT_ARG_FD(fd);
1572	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1573	if (error)
1574		return (error);
1575	so = fp->f_data;
1576	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1577		error = ENOTCONN;
1578		goto done;
1579	}
1580	*sa = NULL;
1581	CURVNET_SET(so->so_vnet);
1582	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1583	CURVNET_RESTORE();
1584	if (error)
1585		goto bad;
1586	if (*sa == NULL)
1587		len = 0;
1588	else
1589		len = MIN(*alen, (*sa)->sa_len);
1590	*alen = len;
1591#ifdef KTRACE
1592	if (KTRPOINT(td, KTR_STRUCT))
1593		ktrsockaddr(*sa);
1594#endif
1595bad:
1596	if (error && *sa) {
1597		free(*sa, M_SONAME);
1598		*sa = NULL;
1599	}
1600done:
1601	fdrop(fp, td);
1602	return (error);
1603}
1604
/*
 * getpeername(2): syscall entry point; calls getpeername1() with the
 * compat flag clear.
 */
int
getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1613
1614#ifdef COMPAT_OLDSOCK
/*
 * Old-socket compatibility entry point; calls getpeername1() with the
 * compat flag set.
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
1624#endif /* COMPAT_OLDSOCK */
1625
1626int
1627sockargs(mp, buf, buflen, type)
1628	struct mbuf **mp;
1629	caddr_t buf;
1630	int buflen, type;
1631{
1632	struct sockaddr *sa;
1633	struct mbuf *m;
1634	int error;
1635
1636	if ((u_int)buflen > MLEN) {
1637#ifdef COMPAT_OLDSOCK
1638		if (type == MT_SONAME && (u_int)buflen <= 112)
1639			buflen = MLEN;		/* unix domain compat. hack */
1640		else
1641#endif
1642			if ((u_int)buflen > MCLBYTES)
1643				return (EINVAL);
1644	}
1645	m = m_get(M_WAIT, type);
1646	if ((u_int)buflen > MLEN)
1647		MCLGET(m, M_WAIT);
1648	m->m_len = buflen;
1649	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1650	if (error)
1651		(void) m_free(m);
1652	else {
1653		*mp = m;
1654		if (type == MT_SONAME) {
1655			sa = mtod(m, struct sockaddr *);
1656
1657#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1658			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1659				sa->sa_family = sa->sa_len;
1660#endif
1661			sa->sa_len = buflen;
1662		}
1663	}
1664	return (error);
1665}
1666
1667int
1668getsockaddr(namp, uaddr, len)
1669	struct sockaddr **namp;
1670	caddr_t uaddr;
1671	size_t len;
1672{
1673	struct sockaddr *sa;
1674	int error;
1675
1676	if (len > SOCK_MAXADDRLEN)
1677		return (ENAMETOOLONG);
1678	if (len < offsetof(struct sockaddr, sa_data[0]))
1679		return (EINVAL);
1680	sa = malloc(len, M_SONAME, M_WAITOK);
1681	error = copyin(uaddr, sa, len);
1682	if (error) {
1683		free(sa, M_SONAME);
1684	} else {
1685#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1686		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1687			sa->sa_family = sa->sa_len;
1688#endif
1689		sa->sa_len = len;
1690		*namp = sa;
1691	}
1692	return (error);
1693}
1694
1695#include <sys/condvar.h>
1696
/*
 * Completion tracking for sendfile(2) with SF_SYNC.  kern_sendfile()
 * bumps 'count' for each sf_buf-backed mbuf it builds and sf_buf_mext()
 * drops it when the buffer is released; kern_sendfile() waits on 'cv'
 * while 'count' is non-zero before returning.
 */
struct sendfile_sync {
	struct mtx	mtx;	/* held around every access to count */
	struct cv	cv;	/* signalled when count reaches zero */
	unsigned 	count;	/* sf_bufs handed out and not yet released */
};
1702
1703/*
1704 * Detach mapped page and release resources back to the system.
1705 */
1706void
1707sf_buf_mext(void *addr, void *args)
1708{
1709	vm_page_t m;
1710	struct sendfile_sync *sfs;
1711
1712	m = sf_buf_page(args);
1713	sf_buf_free(args);
1714	vm_page_lock_queues();
1715	vm_page_unwire(m, 0);
1716	/*
1717	 * Check for the object going away on us. This can
1718	 * happen since we don't hold a reference to it.
1719	 * If so, we're responsible for freeing the page.
1720	 */
1721	if (m->wire_count == 0 && m->object == NULL)
1722		vm_page_free(m);
1723	vm_page_unlock_queues();
1724	if (addr == NULL)
1725		return;
1726	sfs = addr;
1727	mtx_lock(&sfs->mtx);
1728	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1729	if (--sfs->count == 0)
1730		cv_signal(&sfs->cv);
1731	mtx_unlock(&sfs->mtx);
1732}
1733
1734/*
1735 * sendfile(2)
1736 *
1737 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1738 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1739 *
1740 * Send a file specified by 'fd' and starting at 'offset' to a socket
1741 * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
1742 * 0.  Optionally add a header and/or trailer to the socket output.  If
1743 * specified, write the total number of bytes sent into *sbytes.
1744 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	/* Current ABI: the compat flag is clear. */
	error = do_sendfile(td, uap, 0);
	return (error);
}
1751
1752static int
1753do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1754{
1755	struct sf_hdtr hdtr;
1756	struct uio *hdr_uio, *trl_uio;
1757	int error;
1758
1759	hdr_uio = trl_uio = NULL;
1760
1761	if (uap->hdtr != NULL) {
1762		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1763		if (error)
1764			goto out;
1765		if (hdtr.headers != NULL) {
1766			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1767			if (error)
1768				goto out;
1769		}
1770		if (hdtr.trailers != NULL) {
1771			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1772			if (error)
1773				goto out;
1774
1775		}
1776	}
1777
1778	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1779out:
1780	if (hdr_uio)
1781		free(hdr_uio, M_IOV);
1782	if (trl_uio)
1783		free(trl_uio, M_IOV);
1784	return (error);
1785}
1786
1787#ifdef COMPAT_FREEBSD4
1788int
1789freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1790{
1791	struct sendfile_args args;
1792
1793	args.fd = uap->fd;
1794	args.s = uap->s;
1795	args.offset = uap->offset;
1796	args.nbytes = uap->nbytes;
1797	args.hdtr = uap->hdtr;
1798	args.sbytes = uap->sbytes;
1799	args.flags = uap->flags;
1800
1801	return (do_sendfile(td, &args, 1));
1802}
1803#endif /* COMPAT_FREEBSD4 */
1804
1805int
1806kern_sendfile(struct thread *td, struct sendfile_args *uap,
1807    struct uio *hdr_uio, struct uio *trl_uio, int compat)
1808{
1809	struct file *sock_fp;
1810	struct vnode *vp;
1811	struct vm_object *obj = NULL;
1812	struct socket *so = NULL;
1813	struct mbuf *m = NULL;
1814	struct sf_buf *sf;
1815	struct vm_page *pg;
1816	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1817	int error, hdrlen = 0, mnw = 0;
1818	int vfslocked;
1819	struct sendfile_sync *sfs = NULL;
1820
1821	/*
1822	 * The file descriptor must be a regular file and have a
1823	 * backing VM object.
1824	 * File offset must be positive.  If it goes beyond EOF
1825	 * we send only the header/trailer and no payload data.
1826	 */
1827	AUDIT_ARG_FD(uap->fd);
1828	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1829		goto out;
1830	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1831	vn_lock(vp, LK_SHARED | LK_RETRY);
1832	if (vp->v_type == VREG) {
1833		obj = vp->v_object;
1834		if (obj != NULL) {
1835			/*
1836			 * Temporarily increase the backing VM
1837			 * object's reference count so that a forced
1838			 * reclamation of its vnode does not
1839			 * immediately destroy it.
1840			 */
1841			VM_OBJECT_LOCK(obj);
1842			if ((obj->flags & OBJ_DEAD) == 0) {
1843				vm_object_reference_locked(obj);
1844				VM_OBJECT_UNLOCK(obj);
1845			} else {
1846				VM_OBJECT_UNLOCK(obj);
1847				obj = NULL;
1848			}
1849		}
1850	}
1851	VOP_UNLOCK(vp, 0);
1852	VFS_UNLOCK_GIANT(vfslocked);
1853	if (obj == NULL) {
1854		error = EINVAL;
1855		goto out;
1856	}
1857	if (uap->offset < 0) {
1858		error = EINVAL;
1859		goto out;
1860	}
1861
1862	/*
1863	 * The socket must be a stream socket and connected.
1864	 * Remember if it a blocking or non-blocking socket.
1865	 */
1866	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
1867	    NULL)) != 0)
1868		goto out;
1869	so = sock_fp->f_data;
1870	if (so->so_type != SOCK_STREAM) {
1871		error = EINVAL;
1872		goto out;
1873	}
1874	if ((so->so_state & SS_ISCONNECTED) == 0) {
1875		error = ENOTCONN;
1876		goto out;
1877	}
1878	/*
1879	 * Do not wait on memory allocations but return ENOMEM for
1880	 * caller to retry later.
1881	 * XXX: Experimental.
1882	 */
1883	if (uap->flags & SF_MNOWAIT)
1884		mnw = 1;
1885
1886	if (uap->flags & SF_SYNC) {
1887		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK);
1888		memset(sfs, 0, sizeof *sfs);
1889		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
1890		cv_init(&sfs->cv, "sendfile");
1891	}
1892
1893#ifdef MAC
1894	error = mac_socket_check_send(td->td_ucred, so);
1895	if (error)
1896		goto out;
1897#endif
1898
1899	/* If headers are specified copy them into mbufs. */
1900	if (hdr_uio != NULL) {
1901		hdr_uio->uio_td = td;
1902		hdr_uio->uio_rw = UIO_WRITE;
1903		if (hdr_uio->uio_resid > 0) {
1904			/*
1905			 * In FBSD < 5.0 the nbytes to send also included
1906			 * the header.  If compat is specified subtract the
1907			 * header size from nbytes.
1908			 */
1909			if (compat) {
1910				if (uap->nbytes > hdr_uio->uio_resid)
1911					uap->nbytes -= hdr_uio->uio_resid;
1912				else
1913					uap->nbytes = 0;
1914			}
1915			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
1916			    0, 0, 0);
1917			if (m == NULL) {
1918				error = mnw ? EAGAIN : ENOBUFS;
1919				goto out;
1920			}
1921			hdrlen = m_length(m, NULL);
1922		}
1923	}
1924
1925	/*
1926	 * Protect against multiple writers to the socket.
1927	 *
1928	 * XXXRW: Historically this has assumed non-interruptibility, so now
1929	 * we implement that, but possibly shouldn't.
1930	 */
1931	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
1932
1933	/*
1934	 * Loop through the pages of the file, starting with the requested
1935	 * offset. Get a file page (do I/O if necessary), map the file page
1936	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1937	 * it on the socket.
1938	 * This is done in two loops.  The inner loop turns as many pages
1939	 * as it can, up to available socket buffer space, without blocking
1940	 * into mbufs to have it bulk delivered into the socket send buffer.
1941	 * The outer loop checks the state and available space of the socket
1942	 * and takes care of the overall progress.
1943	 */
1944	for (off = uap->offset, rem = uap->nbytes; ; ) {
1945		int loopbytes = 0;
1946		int space = 0;
1947		int done = 0;
1948
1949		/*
1950		 * Check the socket state for ongoing connection,
1951		 * no errors and space in socket buffer.
1952		 * If space is low allow for the remainder of the
1953		 * file to be processed if it fits the socket buffer.
1954		 * Otherwise block in waiting for sufficient space
1955		 * to proceed, or if the socket is nonblocking, return
1956		 * to userland with EAGAIN while reporting how far
1957		 * we've come.
1958		 * We wait until the socket buffer has significant free
1959		 * space to do bulk sends.  This makes good use of file
1960		 * system read ahead and allows packet segmentation
1961		 * offloading hardware to take over lots of work.  If
1962		 * we were not careful here we would send off only one
1963		 * sfbuf at a time.
1964		 */
1965		SOCKBUF_LOCK(&so->so_snd);
1966		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
1967			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
1968retry_space:
1969		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1970			error = EPIPE;
1971			SOCKBUF_UNLOCK(&so->so_snd);
1972			goto done;
1973		} else if (so->so_error) {
1974			error = so->so_error;
1975			so->so_error = 0;
1976			SOCKBUF_UNLOCK(&so->so_snd);
1977			goto done;
1978		}
1979		space = sbspace(&so->so_snd);
1980		if (space < rem &&
1981		    (space <= 0 ||
1982		     space < so->so_snd.sb_lowat)) {
1983			if (so->so_state & SS_NBIO) {
1984				SOCKBUF_UNLOCK(&so->so_snd);
1985				error = EAGAIN;
1986				goto done;
1987			}
1988			/*
1989			 * sbwait drops the lock while sleeping.
1990			 * When we loop back to retry_space the
1991			 * state may have changed and we retest
1992			 * for it.
1993			 */
1994			error = sbwait(&so->so_snd);
1995			/*
1996			 * An error from sbwait usually indicates that we've
1997			 * been interrupted by a signal. If we've sent anything
1998			 * then return bytes sent, otherwise return the error.
1999			 */
2000			if (error) {
2001				SOCKBUF_UNLOCK(&so->so_snd);
2002				goto done;
2003			}
2004			goto retry_space;
2005		}
2006		SOCKBUF_UNLOCK(&so->so_snd);
2007
2008		/*
2009		 * Reduce space in the socket buffer by the size of
2010		 * the header mbuf chain.
2011		 * hdrlen is set to 0 after the first loop.
2012		 */
2013		space -= hdrlen;
2014
2015		/*
2016		 * Loop and construct maximum sized mbuf chain to be bulk
2017		 * dumped into socket buffer.
2018		 */
2019		while (space > loopbytes) {
2020			vm_pindex_t pindex;
2021			vm_offset_t pgoff;
2022			struct mbuf *m0;
2023
2024			VM_OBJECT_LOCK(obj);
2025			/*
2026			 * Calculate the amount to transfer.
2027			 * Not to exceed a page, the EOF,
2028			 * or the passed in nbytes.
2029			 */
2030			pgoff = (vm_offset_t)(off & PAGE_MASK);
2031			xfsize = omin(PAGE_SIZE - pgoff,
2032			    obj->un_pager.vnp.vnp_size - uap->offset -
2033			    fsbytes - loopbytes);
2034			if (uap->nbytes)
2035				rem = (uap->nbytes - fsbytes - loopbytes);
2036			else
2037				rem = obj->un_pager.vnp.vnp_size -
2038				    uap->offset - fsbytes - loopbytes;
2039			xfsize = omin(rem, xfsize);
2040			xfsize = omin(space - loopbytes, xfsize);
2041			if (xfsize <= 0) {
2042				VM_OBJECT_UNLOCK(obj);
2043				done = 1;		/* all data sent */
2044				break;
2045			}
2046
2047			/*
2048			 * Attempt to look up the page.  Allocate
2049			 * if not found or wait and loop if busy.
2050			 */
2051			pindex = OFF_TO_IDX(off);
2052			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2053			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
2054
2055			/*
2056			 * Check if page is valid for what we need,
2057			 * otherwise initiate I/O.
2058			 * If we already turned some pages into mbufs,
2059			 * send them off before we come here again and
2060			 * block.
2061			 */
2062			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2063				VM_OBJECT_UNLOCK(obj);
2064			else if (m != NULL)
2065				error = EAGAIN;	/* send what we already got */
2066			else if (uap->flags & SF_NODISKIO)
2067				error = EBUSY;
2068			else {
2069				int bsize, resid;
2070
2071				/*
2072				 * Ensure that our page is still around
2073				 * when the I/O completes.
2074				 */
2075				vm_page_io_start(pg);
2076				VM_OBJECT_UNLOCK(obj);
2077
2078				/*
2079				 * Get the page from backing store.
2080				 */
2081				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2082				error = vn_lock(vp, LK_SHARED);
2083				if (error != 0)
2084					goto after_read;
2085				bsize = vp->v_mount->mnt_stat.f_iosize;
2086
2087				/*
2088				 * XXXMAC: Because we don't have fp->f_cred
2089				 * here, we pass in NOCRED.  This is probably
2090				 * wrong, but is consistent with our original
2091				 * implementation.
2092				 */
2093				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2094				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2095				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2096				    td->td_ucred, NOCRED, &resid, td);
2097				VOP_UNLOCK(vp, 0);
2098			after_read:
2099				VFS_UNLOCK_GIANT(vfslocked);
2100				VM_OBJECT_LOCK(obj);
2101				vm_page_io_finish(pg);
2102				if (!error)
2103					VM_OBJECT_UNLOCK(obj);
2104				mbstat.sf_iocnt++;
2105			}
2106			if (error) {
2107				vm_page_lock_queues();
2108				vm_page_unwire(pg, 0);
2109				/*
2110				 * See if anyone else might know about
2111				 * this page.  If not and it is not valid,
2112				 * then free it.
2113				 */
2114				if (pg->wire_count == 0 && pg->valid == 0 &&
2115				    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
2116				    pg->hold_count == 0) {
2117					vm_page_free(pg);
2118				}
2119				vm_page_unlock_queues();
2120				VM_OBJECT_UNLOCK(obj);
2121				if (error == EAGAIN)
2122					error = 0;	/* not a real error */
2123				break;
2124			}
2125
2126			/*
2127			 * Get a sendfile buf.  We usually wait as long
2128			 * as necessary, but this wait can be interrupted.
2129			 */
2130			if ((sf = sf_buf_alloc(pg,
2131			    (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) {
2132				mbstat.sf_allocfail++;
2133				vm_page_lock_queues();
2134				vm_page_unwire(pg, 0);
2135				/*
2136				 * XXX: Not same check as above!?
2137				 */
2138				if (pg->wire_count == 0 && pg->object == NULL)
2139					vm_page_free(pg);
2140				vm_page_unlock_queues();
2141				error = (mnw ? EAGAIN : EINTR);
2142				break;
2143			}
2144
2145			/*
2146			 * Get an mbuf and set it up as having
2147			 * external storage.
2148			 */
2149			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2150			if (m0 == NULL) {
2151				error = (mnw ? EAGAIN : ENOBUFS);
2152				sf_buf_mext((void *)sf_buf_kva(sf), sf);
2153				break;
2154			}
2155			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
2156			    sfs, sf, M_RDONLY, EXT_SFBUF);
2157			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2158			m0->m_len = xfsize;
2159
2160			/* Append to mbuf chain. */
2161			if (m != NULL)
2162				m_cat(m, m0);
2163			else
2164				m = m0;
2165
2166			/* Keep track of bits processed. */
2167			loopbytes += xfsize;
2168			off += xfsize;
2169
2170			if (sfs != NULL) {
2171				mtx_lock(&sfs->mtx);
2172				sfs->count++;
2173				mtx_unlock(&sfs->mtx);
2174			}
2175		}
2176
2177		/* Add the buffer chain to the socket buffer. */
2178		if (m != NULL) {
2179			int mlen, err;
2180
2181			mlen = m_length(m, NULL);
2182			SOCKBUF_LOCK(&so->so_snd);
2183			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2184				error = EPIPE;
2185				SOCKBUF_UNLOCK(&so->so_snd);
2186				goto done;
2187			}
2188			SOCKBUF_UNLOCK(&so->so_snd);
2189			CURVNET_SET(so->so_vnet);
2190			/* Avoid error aliasing. */
2191			err = (*so->so_proto->pr_usrreqs->pru_send)
2192				    (so, 0, m, NULL, NULL, td);
2193			CURVNET_RESTORE();
2194			if (err == 0) {
2195				/*
2196				 * We need two counters to get the
2197				 * file offset and nbytes to send
2198				 * right:
2199				 * - sbytes contains the total amount
2200				 *   of bytes sent, including headers.
2201				 * - fsbytes contains the total amount
2202				 *   of bytes sent from the file.
2203				 */
2204				sbytes += mlen;
2205				fsbytes += mlen;
2206				if (hdrlen) {
2207					fsbytes -= hdrlen;
2208					hdrlen = 0;
2209				}
2210			} else if (error == 0)
2211				error = err;
2212			m = NULL;	/* pru_send always consumes */
2213		}
2214
2215		/* Quit outer loop on error or when we're done. */
2216		if (done)
2217			break;
2218		if (error)
2219			goto done;
2220	}
2221
2222	/*
2223	 * Send trailers. Wimp out and use writev(2).
2224	 */
2225	if (trl_uio != NULL) {
2226		sbunlock(&so->so_snd);
2227		error = kern_writev(td, uap->s, trl_uio);
2228		if (error == 0)
2229			sbytes += td->td_retval[0];
2230		goto out;
2231	}
2232
2233done:
2234	sbunlock(&so->so_snd);
2235out:
2236	/*
2237	 * If there was no error we have to clear td->td_retval[0]
2238	 * because it may have been set by writev.
2239	 */
2240	if (error == 0) {
2241		td->td_retval[0] = 0;
2242	}
2243	if (uap->sbytes != NULL) {
2244		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2245	}
2246	if (obj != NULL)
2247		vm_object_deallocate(obj);
2248	if (vp != NULL) {
2249		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2250		vrele(vp);
2251		VFS_UNLOCK_GIANT(vfslocked);
2252	}
2253	if (so)
2254		fdrop(sock_fp, td);
2255	if (m)
2256		m_freem(m);
2257
2258	if (sfs != NULL) {
2259		mtx_lock(&sfs->mtx);
2260		if (sfs->count != 0)
2261			cv_wait(&sfs->cv, &sfs->mtx);
2262		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2263		cv_destroy(&sfs->cv);
2264		mtx_destroy(&sfs->mtx);
2265		free(sfs, M_TEMP);
2266	}
2267
2268	if (error == ERESTART)
2269		error = EINTR;
2270
2271	return (error);
2272}
2273
2274/*
2275 * SCTP syscalls.
2276 * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2277 * otherwise all return EOPNOTSUPP.
2278 * XXX: We should make this loadable one day.
2279 */
2280int
2281sctp_peeloff(td, uap)
2282	struct thread *td;
2283	struct sctp_peeloff_args /* {
2284		int	sd;
2285		caddr_t	name;
2286	} */ *uap;
2287{
2288#if (defined(INET) || defined(INET6)) && defined(SCTP)
2289	struct filedesc *fdp;
2290	struct file *nfp = NULL;
2291	int error;
2292	struct socket *head, *so;
2293	int fd;
2294	u_int fflag;
2295
2296	fdp = td->td_proc->p_fd;
2297	AUDIT_ARG_FD(uap->sd);
2298	error = fgetsock(td, uap->sd, &head, &fflag);
2299	if (error)
2300		goto done2;
2301	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2302	if (error)
2303		goto done2;
2304	/*
2305	 * At this point we know we do have a assoc to pull
2306	 * we proceed to get the fd setup. This may block
2307	 * but that is ok.
2308	 */
2309
2310	error = falloc(td, &nfp, &fd);
2311	if (error)
2312		goto done;
2313	td->td_retval[0] = fd;
2314
2315	CURVNET_SET(head->so_vnet);
2316	so = sonewconn(head, SS_ISCONNECTED);
2317	if (so == NULL)
2318		goto noconnection;
2319	/*
2320	 * Before changing the flags on the socket, we have to bump the
2321	 * reference count.  Otherwise, if the protocol calls sofree(),
2322	 * the socket will be released due to a zero refcount.
2323	 */
2324        SOCK_LOCK(so);
2325        soref(so);                      /* file descriptor reference */
2326        SOCK_UNLOCK(so);
2327
2328	ACCEPT_LOCK();
2329
2330	TAILQ_REMOVE(&head->so_comp, so, so_list);
2331	head->so_qlen--;
2332	so->so_state |= (head->so_state & SS_NBIO);
2333	so->so_state &= ~SS_NOFDREF;
2334	so->so_qstate &= ~SQ_COMP;
2335	so->so_head = NULL;
2336	ACCEPT_UNLOCK();
2337	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
2338	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2339	if (error)
2340		goto noconnection;
2341	if (head->so_sigio != NULL)
2342		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2343
2344noconnection:
2345	/*
2346	 * close the new descriptor, assuming someone hasn't ripped it
2347	 * out from under us.
2348	 */
2349	if (error)
2350		fdclose(fdp, nfp, fd, td);
2351
2352	/*
2353	 * Release explicitly held references before returning.
2354	 */
2355	CURVNET_RESTORE();
2356done:
2357	if (nfp != NULL)
2358		fdrop(nfp, td);
2359	fputsock(head);
2360done2:
2361	return (error);
2362#else  /* SCTP */
2363	return (EOPNOTSUPP);
2364#endif /* SCTP */
2365}
2366
2367int
2368sctp_generic_sendmsg (td, uap)
2369	struct thread *td;
2370	struct sctp_generic_sendmsg_args /* {
2371		int sd,
2372		caddr_t msg,
2373		int mlen,
2374		caddr_t to,
2375		__socklen_t tolen,
2376		struct sctp_sndrcvinfo *sinfo,
2377		int flags
2378	} */ *uap;
2379{
2380#if (defined(INET) || defined(INET6)) && defined(SCTP)
2381	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2382	struct socket *so;
2383	struct file *fp = NULL;
2384	int use_rcvinfo = 1;
2385	int error = 0, len;
2386	struct sockaddr *to = NULL;
2387#ifdef KTRACE
2388	struct uio *ktruio = NULL;
2389#endif
2390	struct uio auio;
2391	struct iovec iov[1];
2392
2393	if (uap->sinfo) {
2394		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2395		if (error)
2396			return (error);
2397		u_sinfo = &sinfo;
2398	}
2399	if (uap->tolen) {
2400		error = getsockaddr(&to, uap->to, uap->tolen);
2401		if (error) {
2402			to = NULL;
2403			goto sctp_bad2;
2404		}
2405	}
2406
2407	AUDIT_ARG_FD(uap->sd);
2408	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2409	if (error)
2410		goto sctp_bad;
2411#ifdef KTRACE
2412	if (KTRPOINT(td, KTR_STRUCT))
2413		ktrsockaddr(to);
2414#endif
2415
2416	iov[0].iov_base = uap->msg;
2417	iov[0].iov_len = uap->mlen;
2418
2419	so = (struct socket *)fp->f_data;
2420#ifdef MAC
2421	error = mac_socket_check_send(td->td_ucred, so);
2422	if (error)
2423		goto sctp_bad;
2424#endif /* MAC */
2425
2426	auio.uio_iov =  iov;
2427	auio.uio_iovcnt = 1;
2428	auio.uio_segflg = UIO_USERSPACE;
2429	auio.uio_rw = UIO_WRITE;
2430	auio.uio_td = td;
2431	auio.uio_offset = 0;			/* XXX */
2432	auio.uio_resid = 0;
2433	len = auio.uio_resid = uap->mlen;
2434	CURVNET_SET(so->so_vnet);
2435	error = sctp_lower_sosend(so, to, &auio,
2436		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2437		    uap->flags, use_rcvinfo, u_sinfo, td);
2438	CURVNET_RESTORE();
2439	if (error) {
2440		if (auio.uio_resid != len && (error == ERESTART ||
2441		    error == EINTR || error == EWOULDBLOCK))
2442			error = 0;
2443		/* Generation of SIGPIPE can be controlled per socket. */
2444		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2445		    !(uap->flags & MSG_NOSIGNAL)) {
2446			PROC_LOCK(td->td_proc);
2447			psignal(td->td_proc, SIGPIPE);
2448			PROC_UNLOCK(td->td_proc);
2449		}
2450	}
2451	if (error == 0)
2452		td->td_retval[0] = len - auio.uio_resid;
2453#ifdef KTRACE
2454	if (ktruio != NULL) {
2455		ktruio->uio_resid = td->td_retval[0];
2456		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2457	}
2458#endif /* KTRACE */
2459sctp_bad:
2460	if (fp)
2461		fdrop(fp, td);
2462sctp_bad2:
2463	if (to)
2464		free(to, M_SONAME);
2465	return (error);
2466#else  /* SCTP */
2467	return (EOPNOTSUPP);
2468#endif /* SCTP */
2469}
2470
2471int
2472sctp_generic_sendmsg_iov(td, uap)
2473	struct thread *td;
2474	struct sctp_generic_sendmsg_iov_args /* {
2475		int sd,
2476		struct iovec *iov,
2477		int iovlen,
2478		caddr_t to,
2479		__socklen_t tolen,
2480		struct sctp_sndrcvinfo *sinfo,
2481		int flags
2482	} */ *uap;
2483{
2484#if (defined(INET) || defined(INET6)) && defined(SCTP)
2485	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2486	struct socket *so;
2487	struct file *fp = NULL;
2488	int use_rcvinfo = 1;
2489	int error=0, len, i;
2490	struct sockaddr *to = NULL;
2491#ifdef KTRACE
2492	struct uio *ktruio = NULL;
2493#endif
2494	struct uio auio;
2495	struct iovec *iov, *tiov;
2496
2497	if (uap->sinfo) {
2498		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2499		if (error)
2500			return (error);
2501		u_sinfo = &sinfo;
2502	}
2503	if (uap->tolen) {
2504		error = getsockaddr(&to, uap->to, uap->tolen);
2505		if (error) {
2506			to = NULL;
2507			goto sctp_bad2;
2508		}
2509	}
2510
2511	AUDIT_ARG_FD(uap->sd);
2512	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2513	if (error)
2514		goto sctp_bad1;
2515
2516	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2517	if (error)
2518		goto sctp_bad1;
2519#ifdef KTRACE
2520	if (KTRPOINT(td, KTR_STRUCT))
2521		ktrsockaddr(to);
2522#endif
2523
2524	so = (struct socket *)fp->f_data;
2525#ifdef MAC
2526	error = mac_socket_check_send(td->td_ucred, so);
2527	if (error)
2528		goto sctp_bad;
2529#endif /* MAC */
2530
2531	auio.uio_iov = iov;
2532	auio.uio_iovcnt = uap->iovlen;
2533	auio.uio_segflg = UIO_USERSPACE;
2534	auio.uio_rw = UIO_WRITE;
2535	auio.uio_td = td;
2536	auio.uio_offset = 0;			/* XXX */
2537	auio.uio_resid = 0;
2538	tiov = iov;
2539	for (i = 0; i <uap->iovlen; i++, tiov++) {
2540		if ((auio.uio_resid += tiov->iov_len) < 0) {
2541			error = EINVAL;
2542			goto sctp_bad;
2543		}
2544	}
2545	len = auio.uio_resid;
2546	CURVNET_SET(so->so_vnet);
2547	error = sctp_lower_sosend(so, to, &auio,
2548		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2549		    uap->flags, use_rcvinfo, u_sinfo, td);
2550	CURVNET_RESTORE();
2551	if (error) {
2552		if (auio.uio_resid != len && (error == ERESTART ||
2553		    error == EINTR || error == EWOULDBLOCK))
2554			error = 0;
2555		/* Generation of SIGPIPE can be controlled per socket */
2556		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2557		    !(uap->flags & MSG_NOSIGNAL)) {
2558			PROC_LOCK(td->td_proc);
2559			psignal(td->td_proc, SIGPIPE);
2560			PROC_UNLOCK(td->td_proc);
2561		}
2562	}
2563	if (error == 0)
2564		td->td_retval[0] = len - auio.uio_resid;
2565#ifdef KTRACE
2566	if (ktruio != NULL) {
2567		ktruio->uio_resid = td->td_retval[0];
2568		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2569	}
2570#endif /* KTRACE */
2571sctp_bad:
2572	free(iov, M_IOV);
2573sctp_bad1:
2574	if (fp)
2575		fdrop(fp, td);
2576sctp_bad2:
2577	if (to)
2578		free(to, M_SONAME);
2579	return (error);
2580#else  /* SCTP */
2581	return (EOPNOTSUPP);
2582#endif /* SCTP */
2583}
2584
2585int
2586sctp_generic_recvmsg(td, uap)
2587	struct thread *td;
2588	struct sctp_generic_recvmsg_args /* {
2589		int sd,
2590		struct iovec *iov,
2591		int iovlen,
2592		struct sockaddr *from,
2593		__socklen_t *fromlenaddr,
2594		struct sctp_sndrcvinfo *sinfo,
2595		int *msg_flags
2596	} */ *uap;
2597{
2598#if (defined(INET) || defined(INET6)) && defined(SCTP)
2599	u_int8_t sockbufstore[256];
2600	struct uio auio;
2601	struct iovec *iov, *tiov;
2602	struct sctp_sndrcvinfo sinfo;
2603	struct socket *so;
2604	struct file *fp = NULL;
2605	struct sockaddr *fromsa;
2606	int fromlen;
2607	int len, i, msg_flags;
2608	int error = 0;
2609#ifdef KTRACE
2610	struct uio *ktruio = NULL;
2611#endif
2612
2613	AUDIT_ARG_FD(uap->sd);
2614	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2615	if (error) {
2616		return (error);
2617	}
2618	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2619	if (error) {
2620		goto out1;
2621	}
2622
2623	so = fp->f_data;
2624#ifdef MAC
2625	error = mac_socket_check_receive(td->td_ucred, so);
2626	if (error) {
2627		goto out;
2628		return (error);
2629	}
2630#endif /* MAC */
2631
2632	if (uap->fromlenaddr) {
2633		error = copyin(uap->fromlenaddr,
2634		    &fromlen, sizeof (fromlen));
2635		if (error) {
2636			goto out;
2637		}
2638	} else {
2639		fromlen = 0;
2640	}
2641	if (uap->msg_flags) {
2642		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
2643		if (error) {
2644			goto out;
2645		}
2646	} else {
2647		msg_flags = 0;
2648	}
2649	auio.uio_iov = iov;
2650	auio.uio_iovcnt = uap->iovlen;
2651  	auio.uio_segflg = UIO_USERSPACE;
2652	auio.uio_rw = UIO_READ;
2653	auio.uio_td = td;
2654	auio.uio_offset = 0;			/* XXX */
2655	auio.uio_resid = 0;
2656	tiov = iov;
2657	for (i = 0; i <uap->iovlen; i++, tiov++) {
2658		if ((auio.uio_resid += tiov->iov_len) < 0) {
2659			error = EINVAL;
2660			goto out;
2661		}
2662	}
2663	len = auio.uio_resid;
2664	fromsa = (struct sockaddr *)sockbufstore;
2665
2666#ifdef KTRACE
2667	if (KTRPOINT(td, KTR_GENIO))
2668		ktruio = cloneuio(&auio);
2669#endif /* KTRACE */
2670	CURVNET_SET(so->so_vnet);
2671	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2672		    fromsa, fromlen, &msg_flags,
2673		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2674	CURVNET_RESTORE();
2675	if (error) {
2676		if (auio.uio_resid != (int)len && (error == ERESTART ||
2677		    error == EINTR || error == EWOULDBLOCK))
2678			error = 0;
2679	} else {
2680		if (uap->sinfo)
2681			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2682	}
2683#ifdef KTRACE
2684	if (ktruio != NULL) {
2685		ktruio->uio_resid = (int)len - auio.uio_resid;
2686		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2687	}
2688#endif /* KTRACE */
2689	if (error)
2690		goto out;
2691	td->td_retval[0] = (int)len - auio.uio_resid;
2692
2693	if (fromlen && uap->from) {
2694		len = fromlen;
2695		if (len <= 0 || fromsa == 0)
2696			len = 0;
2697		else {
2698			len = MIN(len, fromsa->sa_len);
2699			error = copyout(fromsa, uap->from, (unsigned)len);
2700			if (error)
2701				goto out;
2702		}
2703		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2704		if (error) {
2705			goto out;
2706		}
2707	}
2708#ifdef KTRACE
2709	if (KTRPOINT(td, KTR_STRUCT))
2710		ktrsockaddr(fromsa);
2711#endif
2712	if (uap->msg_flags) {
2713		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2714		if (error) {
2715			goto out;
2716		}
2717	}
2718out:
2719	free(iov, M_IOV);
2720out1:
2721	if (fp)
2722		fdrop(fp, td);
2723
2724	return (error);
2725#else  /* SCTP */
2726	return (EOPNOTSUPP);
2727#endif /* SCTP */
2728}
2729