kern_sendfile.c revision 247602
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 247602 2013-03-02 00:53:12Z pjd $");
37
38#include "opt_capsicum.h"
39#include "opt_inet.h"
40#include "opt_inet6.h"
41#include "opt_sctp.h"
42#include "opt_compat.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/capability.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/sysproto.h>
52#include <sys/malloc.h>
53#include <sys/filedesc.h>
54#include <sys/event.h>
55#include <sys/proc.h>
56#include <sys/fcntl.h>
57#include <sys/file.h>
58#include <sys/filio.h>
59#include <sys/jail.h>
60#include <sys/mount.h>
61#include <sys/mbuf.h>
62#include <sys/protosw.h>
63#include <sys/sf_buf.h>
64#include <sys/sysent.h>
65#include <sys/socket.h>
66#include <sys/socketvar.h>
67#include <sys/signalvar.h>
68#include <sys/syscallsubr.h>
69#include <sys/sysctl.h>
70#include <sys/uio.h>
71#include <sys/vnode.h>
72#ifdef KTRACE
73#include <sys/ktrace.h>
74#endif
75#ifdef COMPAT_FREEBSD32
76#include <compat/freebsd32/freebsd32_util.h>
77#endif
78
79#include <net/vnet.h>
80
81#include <security/audit/audit.h>
82#include <security/mac/mac_framework.h>
83
84#include <vm/vm.h>
85#include <vm/vm_param.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/vm_pageout.h>
89#include <vm/vm_kern.h>
90#include <vm/vm_extern.h>
91
92#if defined(INET) || defined(INET6)
93#ifdef SCTP
94#include <netinet/sctp.h>
95#include <netinet/sctp_peeloff.h>
96#endif /* SCTP */
97#endif /* INET || INET6 */
98
/*
 * Forward declarations of file-local helpers.
 */
static int	accept1(struct thread *td, struct accept_args *uap, int compat);
static int	getsockname1(struct thread *td, struct getsockname_args *uap,
		    int compat);
static int	getpeername1(struct thread *td, struct getpeername_args *uap,
		    int compat);

static int	sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int	recvit(struct thread *td, int s, struct msghdr *mp,
		    void *namelenp);

static int	do_sendfile(struct thread *td, struct sendfile_args *uap,
		    int compat);
108
109/*
110 * NSFBUFS-related variables and associated sysctls
111 */
112int nsfbufs;
113int nsfbufspeak;
114int nsfbufsused;
115
116SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
117    "Maximum number of sendfile(2) sf_bufs available");
118SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
119    "Number of sendfile(2) sf_bufs at peak usage");
120SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
121    "Number of sendfile(2) sf_bufs in use");
122
123/*
124 * Convert a user file descriptor to a kernel file entry and check if required
125 * capability rights are present.
126 * A reference on the file entry is held upon returning.
127 */
128static int
129getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights,
130    struct file **fpp, u_int *fflagp)
131{
132	struct file *fp;
133	int error;
134
135	error = fget_unlocked(fdp, fd, rights, 0, &fp, NULL);
136	if (error != 0)
137		return (error);
138	if (fp->f_type != DTYPE_SOCKET) {
139		fdrop(fp, curthread);
140		return (ENOTSOCK);
141	}
142	if (fflagp != NULL)
143		*fflagp = fp->f_flag;
144	*fpp = fp;
145	return (0);
146}
147
148/*
149 * System call interface to the socket abstraction.
150 */
/*
 * COMPAT_43 enables COMPAT_OLDSOCK, which guards the old-style (o*)
 * socket entry points and the osockaddr conversions in this file.
 */
#ifdef COMPAT_43
#define COMPAT_OLDSOCK
#endif
154
155int
156sys_socket(td, uap)
157	struct thread *td;
158	struct socket_args /* {
159		int	domain;
160		int	type;
161		int	protocol;
162	} */ *uap;
163{
164	struct socket *so;
165	struct file *fp;
166	int fd, error;
167
168	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
169#ifdef MAC
170	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
171	    uap->protocol);
172	if (error)
173		return (error);
174#endif
175	error = falloc(td, &fp, &fd, 0);
176	if (error)
177		return (error);
178	/* An extra reference on `fp' has been held for us by falloc(). */
179	error = socreate(uap->domain, &so, uap->type, uap->protocol,
180	    td->td_ucred, td);
181	if (error) {
182		fdclose(td->td_proc->p_fd, fp, fd, td);
183	} else {
184		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
185		td->td_retval[0] = fd;
186	}
187	fdrop(fp, td);
188	return (error);
189}
190
191/* ARGSUSED */
192int
193sys_bind(td, uap)
194	struct thread *td;
195	struct bind_args /* {
196		int	s;
197		caddr_t	name;
198		int	namelen;
199	} */ *uap;
200{
201	struct sockaddr *sa;
202	int error;
203
204	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
205		return (error);
206
207	error = kern_bind(td, uap->s, sa);
208	free(sa, M_SONAME);
209	return (error);
210}
211
212int
213kern_bind(td, fd, sa)
214	struct thread *td;
215	int fd;
216	struct sockaddr *sa;
217{
218	struct socket *so;
219	struct file *fp;
220	int error;
221
222	AUDIT_ARG_FD(fd);
223	AUDIT_ARG_SOCKADDR(td, sa);
224	error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL);
225	if (error)
226		return (error);
227	so = fp->f_data;
228#ifdef KTRACE
229	if (KTRPOINT(td, KTR_STRUCT))
230		ktrsockaddr(sa);
231#endif
232#ifdef MAC
233	error = mac_socket_check_bind(td->td_ucred, so, sa);
234	if (error == 0)
235#endif
236		error = sobind(so, sa, td);
237	fdrop(fp, td);
238	return (error);
239}
240
241/* ARGSUSED */
242int
243sys_listen(td, uap)
244	struct thread *td;
245	struct listen_args /* {
246		int	s;
247		int	backlog;
248	} */ *uap;
249{
250	struct socket *so;
251	struct file *fp;
252	int error;
253
254	AUDIT_ARG_FD(uap->s);
255	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, NULL);
256	if (error == 0) {
257		so = fp->f_data;
258#ifdef MAC
259		error = mac_socket_check_listen(td->td_ucred, so);
260		if (error == 0)
261#endif
262			error = solisten(so, uap->backlog, td);
263		fdrop(fp, td);
264	}
265	return(error);
266}
267
268/*
269 * accept1()
270 */
271static int
272accept1(td, uap, compat)
273	struct thread *td;
274	struct accept_args /* {
275		int	s;
276		struct sockaddr	* __restrict name;
277		socklen_t	* __restrict anamelen;
278	} */ *uap;
279	int compat;
280{
281	struct sockaddr *name;
282	socklen_t namelen;
283	struct file *fp;
284	int error;
285
286	if (uap->name == NULL)
287		return (kern_accept(td, uap->s, NULL, NULL, NULL));
288
289	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
290	if (error)
291		return (error);
292
293	error = kern_accept(td, uap->s, &name, &namelen, &fp);
294
295	/*
296	 * return a namelen of zero for older code which might
297	 * ignore the return value from accept.
298	 */
299	if (error) {
300		(void) copyout(&namelen,
301		    uap->anamelen, sizeof(*uap->anamelen));
302		return (error);
303	}
304
305	if (error == 0 && name != NULL) {
306#ifdef COMPAT_OLDSOCK
307		if (compat)
308			((struct osockaddr *)name)->sa_family =
309			    name->sa_family;
310#endif
311		error = copyout(name, uap->name, namelen);
312	}
313	if (error == 0)
314		error = copyout(&namelen, uap->anamelen,
315		    sizeof(namelen));
316	if (error)
317		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
318	fdrop(fp, td);
319	free(name, M_SONAME);
320	return (error);
321}
322
323int
324kern_accept(struct thread *td, int s, struct sockaddr **name,
325    socklen_t *namelen, struct file **fp)
326{
327	struct filedesc *fdp;
328	struct file *headfp, *nfp = NULL;
329	struct sockaddr *sa = NULL;
330	int error;
331	struct socket *head, *so;
332	int fd;
333	u_int fflag;
334	pid_t pgid;
335	int tmp;
336
337	if (name) {
338		*name = NULL;
339		if (*namelen < 0)
340			return (EINVAL);
341	}
342
343	AUDIT_ARG_FD(s);
344	fdp = td->td_proc->p_fd;
345	error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag);
346	if (error)
347		return (error);
348	head = headfp->f_data;
349	if ((head->so_options & SO_ACCEPTCONN) == 0) {
350		error = EINVAL;
351		goto done;
352	}
353#ifdef MAC
354	error = mac_socket_check_accept(td->td_ucred, head);
355	if (error != 0)
356		goto done;
357#endif
358	error = falloc(td, &nfp, &fd, 0);
359	if (error)
360		goto done;
361	ACCEPT_LOCK();
362	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
363		ACCEPT_UNLOCK();
364		error = EWOULDBLOCK;
365		goto noconnection;
366	}
367	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
368		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
369			head->so_error = ECONNABORTED;
370			break;
371		}
372		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
373		    "accept", 0);
374		if (error) {
375			ACCEPT_UNLOCK();
376			goto noconnection;
377		}
378	}
379	if (head->so_error) {
380		error = head->so_error;
381		head->so_error = 0;
382		ACCEPT_UNLOCK();
383		goto noconnection;
384	}
385	so = TAILQ_FIRST(&head->so_comp);
386	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
387	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
388
389	/*
390	 * Before changing the flags on the socket, we have to bump the
391	 * reference count.  Otherwise, if the protocol calls sofree(),
392	 * the socket will be released due to a zero refcount.
393	 */
394	SOCK_LOCK(so);			/* soref() and so_state update */
395	soref(so);			/* file descriptor reference */
396
397	TAILQ_REMOVE(&head->so_comp, so, so_list);
398	head->so_qlen--;
399	so->so_state |= (head->so_state & SS_NBIO);
400	so->so_qstate &= ~SQ_COMP;
401	so->so_head = NULL;
402
403	SOCK_UNLOCK(so);
404	ACCEPT_UNLOCK();
405
406	/* An extra reference on `nfp' has been held for us by falloc(). */
407	td->td_retval[0] = fd;
408
409	/* connection has been removed from the listen queue */
410	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
411
412	pgid = fgetown(&head->so_sigio);
413	if (pgid != 0)
414		fsetown(pgid, &so->so_sigio);
415
416	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
417	/* Sync socket nonblocking/async state with file flags */
418	tmp = fflag & FNONBLOCK;
419	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
420	tmp = fflag & FASYNC;
421	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
422	sa = 0;
423	error = soaccept(so, &sa);
424	if (error) {
425		/*
426		 * return a namelen of zero for older code which might
427		 * ignore the return value from accept.
428		 */
429		if (name)
430			*namelen = 0;
431		goto noconnection;
432	}
433	if (sa == NULL) {
434		if (name)
435			*namelen = 0;
436		goto done;
437	}
438	AUDIT_ARG_SOCKADDR(td, sa);
439	if (name) {
440		/* check sa_len before it is destroyed */
441		if (*namelen > sa->sa_len)
442			*namelen = sa->sa_len;
443#ifdef KTRACE
444		if (KTRPOINT(td, KTR_STRUCT))
445			ktrsockaddr(sa);
446#endif
447		*name = sa;
448		sa = NULL;
449	}
450noconnection:
451	if (sa)
452		free(sa, M_SONAME);
453
454	/*
455	 * close the new descriptor, assuming someone hasn't ripped it
456	 * out from under us.
457	 */
458	if (error)
459		fdclose(fdp, nfp, fd, td);
460
461	/*
462	 * Release explicitly held references before returning.  We return
463	 * a reference on nfp to the caller on success if they request it.
464	 */
465done:
466	if (fp != NULL) {
467		if (error == 0) {
468			*fp = nfp;
469			nfp = NULL;
470		} else
471			*fp = NULL;
472	}
473	if (nfp != NULL)
474		fdrop(nfp, td);
475	fdrop(headfp, td);
476	return (error);
477}
478
/*
 * accept(2): native entry point; calls accept1() with compat == 0.
 */
int
sys_accept(struct thread *td, struct accept_args *uap)
{

	return (accept1(td, uap, 0));
}
487
#ifdef COMPAT_OLDSOCK
/*
 * Old-style accept: calls accept1() with compat == 1, which makes it
 * store the address family through a struct osockaddr before copyout.
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{

	return (accept1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */
498
499/* ARGSUSED */
500int
501sys_connect(td, uap)
502	struct thread *td;
503	struct connect_args /* {
504		int	s;
505		caddr_t	name;
506		int	namelen;
507	} */ *uap;
508{
509	struct sockaddr *sa;
510	int error;
511
512	error = getsockaddr(&sa, uap->name, uap->namelen);
513	if (error)
514		return (error);
515
516	error = kern_connect(td, uap->s, sa);
517	free(sa, M_SONAME);
518	return (error);
519}
520
521
522int
523kern_connect(td, fd, sa)
524	struct thread *td;
525	int fd;
526	struct sockaddr *sa;
527{
528	struct socket *so;
529	struct file *fp;
530	int error;
531	int interrupted = 0;
532
533	AUDIT_ARG_FD(fd);
534	AUDIT_ARG_SOCKADDR(td, sa);
535	error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL);
536	if (error)
537		return (error);
538	so = fp->f_data;
539	if (so->so_state & SS_ISCONNECTING) {
540		error = EALREADY;
541		goto done1;
542	}
543#ifdef KTRACE
544	if (KTRPOINT(td, KTR_STRUCT))
545		ktrsockaddr(sa);
546#endif
547#ifdef MAC
548	error = mac_socket_check_connect(td->td_ucred, so, sa);
549	if (error)
550		goto bad;
551#endif
552	error = soconnect(so, sa, td);
553	if (error)
554		goto bad;
555	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
556		error = EINPROGRESS;
557		goto done1;
558	}
559	SOCK_LOCK(so);
560	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
561		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
562		    "connec", 0);
563		if (error) {
564			if (error == EINTR || error == ERESTART)
565				interrupted = 1;
566			break;
567		}
568	}
569	if (error == 0) {
570		error = so->so_error;
571		so->so_error = 0;
572	}
573	SOCK_UNLOCK(so);
574bad:
575	if (!interrupted)
576		so->so_state &= ~SS_ISCONNECTING;
577	if (error == ERESTART)
578		error = EINTR;
579done1:
580	fdrop(fp, td);
581	return (error);
582}
583
584int
585kern_socketpair(struct thread *td, int domain, int type, int protocol,
586    int *rsv)
587{
588	struct filedesc *fdp = td->td_proc->p_fd;
589	struct file *fp1, *fp2;
590	struct socket *so1, *so2;
591	int fd, error;
592
593	AUDIT_ARG_SOCKET(domain, type, protocol);
594#ifdef MAC
595	/* We might want to have a separate check for socket pairs. */
596	error = mac_socket_check_create(td->td_ucred, domain, type,
597	    protocol);
598	if (error)
599		return (error);
600#endif
601	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
602	if (error)
603		return (error);
604	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
605	if (error)
606		goto free1;
607	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
608	error = falloc(td, &fp1, &fd, 0);
609	if (error)
610		goto free2;
611	rsv[0] = fd;
612	fp1->f_data = so1;	/* so1 already has ref count */
613	error = falloc(td, &fp2, &fd, 0);
614	if (error)
615		goto free3;
616	fp2->f_data = so2;	/* so2 already has ref count */
617	rsv[1] = fd;
618	error = soconnect2(so1, so2);
619	if (error)
620		goto free4;
621	if (type == SOCK_DGRAM) {
622		/*
623		 * Datagram socket connection is asymmetric.
624		 */
625		 error = soconnect2(so2, so1);
626		 if (error)
627			goto free4;
628	}
629	finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
630	finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
631	fdrop(fp1, td);
632	fdrop(fp2, td);
633	return (0);
634free4:
635	fdclose(fdp, fp2, rsv[1], td);
636	fdrop(fp2, td);
637free3:
638	fdclose(fdp, fp1, rsv[0], td);
639	fdrop(fp1, td);
640free2:
641	if (so2 != NULL)
642		(void)soclose(so2);
643free1:
644	if (so1 != NULL)
645		(void)soclose(so1);
646	return (error);
647}
648
649int
650sys_socketpair(struct thread *td, struct socketpair_args *uap)
651{
652	int error, sv[2];
653
654	error = kern_socketpair(td, uap->domain, uap->type,
655	    uap->protocol, sv);
656	if (error)
657		return (error);
658	error = copyout(sv, uap->rsv, 2 * sizeof(int));
659	if (error) {
660		(void)kern_close(td, sv[0]);
661		(void)kern_close(td, sv[1]);
662	}
663	return (error);
664}
665
666static int
667sendit(td, s, mp, flags)
668	struct thread *td;
669	int s;
670	struct msghdr *mp;
671	int flags;
672{
673	struct mbuf *control;
674	struct sockaddr *to;
675	int error;
676
677#ifdef CAPABILITY_MODE
678	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
679		return (ECAPMODE);
680#endif
681
682	if (mp->msg_name != NULL) {
683		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
684		if (error) {
685			to = NULL;
686			goto bad;
687		}
688		mp->msg_name = to;
689	} else {
690		to = NULL;
691	}
692
693	if (mp->msg_control) {
694		if (mp->msg_controllen < sizeof(struct cmsghdr)
695#ifdef COMPAT_OLDSOCK
696		    && mp->msg_flags != MSG_COMPAT
697#endif
698		) {
699			error = EINVAL;
700			goto bad;
701		}
702		error = sockargs(&control, mp->msg_control,
703		    mp->msg_controllen, MT_CONTROL);
704		if (error)
705			goto bad;
706#ifdef COMPAT_OLDSOCK
707		if (mp->msg_flags == MSG_COMPAT) {
708			struct cmsghdr *cm;
709
710			M_PREPEND(control, sizeof(*cm), M_WAITOK);
711			cm = mtod(control, struct cmsghdr *);
712			cm->cmsg_len = control->m_len;
713			cm->cmsg_level = SOL_SOCKET;
714			cm->cmsg_type = SCM_RIGHTS;
715		}
716#endif
717	} else {
718		control = NULL;
719	}
720
721	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
722
723bad:
724	if (to)
725		free(to, M_SONAME);
726	return (error);
727}
728
729int
730kern_sendit(td, s, mp, flags, control, segflg)
731	struct thread *td;
732	int s;
733	struct msghdr *mp;
734	int flags;
735	struct mbuf *control;
736	enum uio_seg segflg;
737{
738	struct file *fp;
739	struct uio auio;
740	struct iovec *iov;
741	struct socket *so;
742	int i, error;
743	ssize_t len;
744	cap_rights_t rights;
745#ifdef KTRACE
746	struct uio *ktruio = NULL;
747#endif
748
749	AUDIT_ARG_FD(s);
750	rights = CAP_SEND;
751	if (mp->msg_name != NULL) {
752		AUDIT_ARG_SOCKADDR(td, mp->msg_name);
753		rights |= CAP_CONNECT;
754	}
755	error = getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL);
756	if (error)
757		return (error);
758	so = (struct socket *)fp->f_data;
759
760#ifdef KTRACE
761	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
762		ktrsockaddr(mp->msg_name);
763#endif
764#ifdef MAC
765	if (mp->msg_name != NULL) {
766		error = mac_socket_check_connect(td->td_ucred, so,
767		    mp->msg_name);
768		if (error)
769			goto bad;
770	}
771	error = mac_socket_check_send(td->td_ucred, so);
772	if (error)
773		goto bad;
774#endif
775
776	auio.uio_iov = mp->msg_iov;
777	auio.uio_iovcnt = mp->msg_iovlen;
778	auio.uio_segflg = segflg;
779	auio.uio_rw = UIO_WRITE;
780	auio.uio_td = td;
781	auio.uio_offset = 0;			/* XXX */
782	auio.uio_resid = 0;
783	iov = mp->msg_iov;
784	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
785		if ((auio.uio_resid += iov->iov_len) < 0) {
786			error = EINVAL;
787			goto bad;
788		}
789	}
790#ifdef KTRACE
791	if (KTRPOINT(td, KTR_GENIO))
792		ktruio = cloneuio(&auio);
793#endif
794	len = auio.uio_resid;
795	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
796	if (error) {
797		if (auio.uio_resid != len && (error == ERESTART ||
798		    error == EINTR || error == EWOULDBLOCK))
799			error = 0;
800		/* Generation of SIGPIPE can be controlled per socket */
801		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
802		    !(flags & MSG_NOSIGNAL)) {
803			PROC_LOCK(td->td_proc);
804			tdsignal(td, SIGPIPE);
805			PROC_UNLOCK(td->td_proc);
806		}
807	}
808	if (error == 0)
809		td->td_retval[0] = len - auio.uio_resid;
810#ifdef KTRACE
811	if (ktruio != NULL) {
812		ktruio->uio_resid = td->td_retval[0];
813		ktrgenio(s, UIO_WRITE, ktruio, error);
814	}
815#endif
816bad:
817	fdrop(fp, td);
818	return (error);
819}
820
821int
822sys_sendto(td, uap)
823	struct thread *td;
824	struct sendto_args /* {
825		int	s;
826		caddr_t	buf;
827		size_t	len;
828		int	flags;
829		caddr_t	to;
830		int	tolen;
831	} */ *uap;
832{
833	struct msghdr msg;
834	struct iovec aiov;
835	int error;
836
837	msg.msg_name = uap->to;
838	msg.msg_namelen = uap->tolen;
839	msg.msg_iov = &aiov;
840	msg.msg_iovlen = 1;
841	msg.msg_control = 0;
842#ifdef COMPAT_OLDSOCK
843	msg.msg_flags = 0;
844#endif
845	aiov.iov_base = uap->buf;
846	aiov.iov_len = uap->len;
847	error = sendit(td, uap->s, &msg, uap->flags);
848	return (error);
849}
850
#ifdef COMPAT_OLDSOCK
/*
 * Old-style send: one user buffer, no destination address and no
 * control data, passed to sendit().
 *
 * uap carries:
 *	int	s;
 *	caddr_t	buf;
 *	int	len;
 *	int	flags;
 */
int
osend(struct thread *td, struct osend_args *uap)
{
	struct msghdr msg;
	struct iovec aiov;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

/*
 * Old-style sendmsg: copy in a struct omsghdr and its iovec array, tag
 * the message with MSG_COMPAT and pass it to sendit().
 *
 * uap carries:
 *	int	s;
 *	caddr_t	msg;
 *	int	flags;
 */
int
osendmsg(struct thread *td, struct osendmsg_args *uap)
{
	struct msghdr msg;
	struct iovec *kiov;
	int error;

	/* Only the omsghdr-sized prefix of `msg' is filled from userland. */
	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &kiov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = kiov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(kiov, M_IOV);
	return (error);
}
#endif
904
905int
906sys_sendmsg(td, uap)
907	struct thread *td;
908	struct sendmsg_args /* {
909		int	s;
910		caddr_t	msg;
911		int	flags;
912	} */ *uap;
913{
914	struct msghdr msg;
915	struct iovec *iov;
916	int error;
917
918	error = copyin(uap->msg, &msg, sizeof (msg));
919	if (error)
920		return (error);
921	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
922	if (error)
923		return (error);
924	msg.msg_iov = iov;
925#ifdef COMPAT_OLDSOCK
926	msg.msg_flags = 0;
927#endif
928	error = sendit(td, uap->s, &msg, uap->flags);
929	free(iov, M_IOV);
930	return (error);
931}
932
933int
934kern_recvit(td, s, mp, fromseg, controlp)
935	struct thread *td;
936	int s;
937	struct msghdr *mp;
938	enum uio_seg fromseg;
939	struct mbuf **controlp;
940{
941	struct uio auio;
942	struct iovec *iov;
943	int i;
944	ssize_t len;
945	int error;
946	struct mbuf *m, *control = NULL;
947	caddr_t ctlbuf;
948	struct file *fp;
949	struct socket *so;
950	struct sockaddr *fromsa = NULL;
951#ifdef KTRACE
952	struct uio *ktruio = NULL;
953#endif
954
955	if (controlp != NULL)
956		*controlp = NULL;
957
958	AUDIT_ARG_FD(s);
959	error = getsock_cap(td->td_proc->p_fd, s, CAP_RECV, &fp, NULL);
960	if (error)
961		return (error);
962	so = fp->f_data;
963
964#ifdef MAC
965	error = mac_socket_check_receive(td->td_ucred, so);
966	if (error) {
967		fdrop(fp, td);
968		return (error);
969	}
970#endif
971
972	auio.uio_iov = mp->msg_iov;
973	auio.uio_iovcnt = mp->msg_iovlen;
974	auio.uio_segflg = UIO_USERSPACE;
975	auio.uio_rw = UIO_READ;
976	auio.uio_td = td;
977	auio.uio_offset = 0;			/* XXX */
978	auio.uio_resid = 0;
979	iov = mp->msg_iov;
980	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
981		if ((auio.uio_resid += iov->iov_len) < 0) {
982			fdrop(fp, td);
983			return (EINVAL);
984		}
985	}
986#ifdef KTRACE
987	if (KTRPOINT(td, KTR_GENIO))
988		ktruio = cloneuio(&auio);
989#endif
990	len = auio.uio_resid;
991	error = soreceive(so, &fromsa, &auio, NULL,
992	    (mp->msg_control || controlp) ? &control : NULL,
993	    &mp->msg_flags);
994	if (error) {
995		if (auio.uio_resid != len && (error == ERESTART ||
996		    error == EINTR || error == EWOULDBLOCK))
997			error = 0;
998	}
999	if (fromsa != NULL)
1000		AUDIT_ARG_SOCKADDR(td, fromsa);
1001#ifdef KTRACE
1002	if (ktruio != NULL) {
1003		ktruio->uio_resid = len - auio.uio_resid;
1004		ktrgenio(s, UIO_READ, ktruio, error);
1005	}
1006#endif
1007	if (error)
1008		goto out;
1009	td->td_retval[0] = len - auio.uio_resid;
1010	if (mp->msg_name) {
1011		len = mp->msg_namelen;
1012		if (len <= 0 || fromsa == NULL)
1013			len = 0;
1014		else {
1015			/* save sa_len before it is destroyed by MSG_COMPAT */
1016			len = MIN(len, fromsa->sa_len);
1017#ifdef COMPAT_OLDSOCK
1018			if (mp->msg_flags & MSG_COMPAT)
1019				((struct osockaddr *)fromsa)->sa_family =
1020				    fromsa->sa_family;
1021#endif
1022			if (fromseg == UIO_USERSPACE) {
1023				error = copyout(fromsa, mp->msg_name,
1024				    (unsigned)len);
1025				if (error)
1026					goto out;
1027			} else
1028				bcopy(fromsa, mp->msg_name, len);
1029		}
1030		mp->msg_namelen = len;
1031	}
1032	if (mp->msg_control && controlp == NULL) {
1033#ifdef COMPAT_OLDSOCK
1034		/*
1035		 * We assume that old recvmsg calls won't receive access
1036		 * rights and other control info, esp. as control info
1037		 * is always optional and those options didn't exist in 4.3.
1038		 * If we receive rights, trim the cmsghdr; anything else
1039		 * is tossed.
1040		 */
1041		if (control && mp->msg_flags & MSG_COMPAT) {
1042			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1043			    SOL_SOCKET ||
1044			    mtod(control, struct cmsghdr *)->cmsg_type !=
1045			    SCM_RIGHTS) {
1046				mp->msg_controllen = 0;
1047				goto out;
1048			}
1049			control->m_len -= sizeof (struct cmsghdr);
1050			control->m_data += sizeof (struct cmsghdr);
1051		}
1052#endif
1053		len = mp->msg_controllen;
1054		m = control;
1055		mp->msg_controllen = 0;
1056		ctlbuf = mp->msg_control;
1057
1058		while (m && len > 0) {
1059			unsigned int tocopy;
1060
1061			if (len >= m->m_len)
1062				tocopy = m->m_len;
1063			else {
1064				mp->msg_flags |= MSG_CTRUNC;
1065				tocopy = len;
1066			}
1067
1068			if ((error = copyout(mtod(m, caddr_t),
1069					ctlbuf, tocopy)) != 0)
1070				goto out;
1071
1072			ctlbuf += tocopy;
1073			len -= tocopy;
1074			m = m->m_next;
1075		}
1076		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1077	}
1078out:
1079	fdrop(fp, td);
1080#ifdef KTRACE
1081	if (fromsa && KTRPOINT(td, KTR_STRUCT))
1082		ktrsockaddr(fromsa);
1083#endif
1084	if (fromsa)
1085		free(fromsa, M_SONAME);
1086
1087	if (error == 0 && controlp != NULL)
1088		*controlp = control;
1089	else  if (control)
1090		m_freem(control);
1091
1092	return (error);
1093}
1094
1095static int
1096recvit(td, s, mp, namelenp)
1097	struct thread *td;
1098	int s;
1099	struct msghdr *mp;
1100	void *namelenp;
1101{
1102	int error;
1103
1104	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1105	if (error)
1106		return (error);
1107	if (namelenp) {
1108		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1109#ifdef COMPAT_OLDSOCK
1110		if (mp->msg_flags & MSG_COMPAT)
1111			error = 0;	/* old recvfrom didn't check */
1112#endif
1113	}
1114	return (error);
1115}
1116
1117int
1118sys_recvfrom(td, uap)
1119	struct thread *td;
1120	struct recvfrom_args /* {
1121		int	s;
1122		caddr_t	buf;
1123		size_t	len;
1124		int	flags;
1125		struct sockaddr * __restrict	from;
1126		socklen_t * __restrict fromlenaddr;
1127	} */ *uap;
1128{
1129	struct msghdr msg;
1130	struct iovec aiov;
1131	int error;
1132
1133	if (uap->fromlenaddr) {
1134		error = copyin(uap->fromlenaddr,
1135		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1136		if (error)
1137			goto done2;
1138	} else {
1139		msg.msg_namelen = 0;
1140	}
1141	msg.msg_name = uap->from;
1142	msg.msg_iov = &aiov;
1143	msg.msg_iovlen = 1;
1144	aiov.iov_base = uap->buf;
1145	aiov.iov_len = uap->len;
1146	msg.msg_control = 0;
1147	msg.msg_flags = uap->flags;
1148	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1149done2:
1150	return(error);
1151}
1152
#ifdef COMPAT_OLDSOCK
/*
 * Old-style recvfrom: sys_recvfrom() with MSG_COMPAT added to the
 * caller's flags.
 */
int
orecvfrom(struct thread *td, struct recvfrom_args *uap)
{

	uap->flags |= MSG_COMPAT;
	return (sys_recvfrom(td, uap));
}
#endif
1164
#ifdef COMPAT_OLDSOCK
/*
 * Old-style recv: one user buffer, no source address and no control
 * data, passed to recvit().
 *
 * uap carries:
 *	int	s;
 *	caddr_t	buf;
 *	int	len;
 *	int	flags;
 */
int
orecv(struct thread *td, struct orecv_args *uap)
{
	struct msghdr msg;
	struct iovec aiov;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_flags = uap->flags;
	return (recvit(td, uap->s, &msg, NULL));
}

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.
 *
 * uap carries:
 *	int	s;
 *	struct	omsghdr *msg;
 *	int	flags;
 *
 * The resulting name length is written to the user's msg_namelen by
 * recvit(); a non-zero control length is written to msg_accrightslen.
 */
int
orecvmsg(struct thread *td, struct orecvmsg_args *uap)
{
	struct msghdr msg;
	struct iovec *kiov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &kiov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags | MSG_COMPAT;
	msg.msg_iov = kiov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	if (error == 0 && msg.msg_controllen != 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(kiov, M_IOV);
	return (error);
}
#endif
1226
1227int
1228sys_recvmsg(td, uap)
1229	struct thread *td;
1230	struct recvmsg_args /* {
1231		int	s;
1232		struct	msghdr *msg;
1233		int	flags;
1234	} */ *uap;
1235{
1236	struct msghdr msg;
1237	struct iovec *uiov, *iov;
1238	int error;
1239
1240	error = copyin(uap->msg, &msg, sizeof (msg));
1241	if (error)
1242		return (error);
1243	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1244	if (error)
1245		return (error);
1246	msg.msg_flags = uap->flags;
1247#ifdef COMPAT_OLDSOCK
1248	msg.msg_flags &= ~MSG_COMPAT;
1249#endif
1250	uiov = msg.msg_iov;
1251	msg.msg_iov = iov;
1252	error = recvit(td, uap->s, &msg, NULL);
1253	if (error == 0) {
1254		msg.msg_iov = uiov;
1255		error = copyout(&msg, uap->msg, sizeof(msg));
1256	}
1257	free(iov, M_IOV);
1258	return (error);
1259}
1260
1261/* ARGSUSED */
1262int
1263sys_shutdown(td, uap)
1264	struct thread *td;
1265	struct shutdown_args /* {
1266		int	s;
1267		int	how;
1268	} */ *uap;
1269{
1270	struct socket *so;
1271	struct file *fp;
1272	int error;
1273
1274	AUDIT_ARG_FD(uap->s);
1275	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp,
1276	    NULL);
1277	if (error == 0) {
1278		so = fp->f_data;
1279		error = soshutdown(so, uap->how);
1280		fdrop(fp, td);
1281	}
1282	return (error);
1283}
1284
1285/* ARGSUSED */
1286int
1287sys_setsockopt(td, uap)
1288	struct thread *td;
1289	struct setsockopt_args /* {
1290		int	s;
1291		int	level;
1292		int	name;
1293		caddr_t	val;
1294		int	valsize;
1295	} */ *uap;
1296{
1297
1298	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1299	    uap->val, UIO_USERSPACE, uap->valsize));
1300}
1301
1302int
1303kern_setsockopt(td, s, level, name, val, valseg, valsize)
1304	struct thread *td;
1305	int s;
1306	int level;
1307	int name;
1308	void *val;
1309	enum uio_seg valseg;
1310	socklen_t valsize;
1311{
1312	int error;
1313	struct socket *so;
1314	struct file *fp;
1315	struct sockopt sopt;
1316
1317	if (val == NULL && valsize != 0)
1318		return (EFAULT);
1319	if ((int)valsize < 0)
1320		return (EINVAL);
1321
1322	sopt.sopt_dir = SOPT_SET;
1323	sopt.sopt_level = level;
1324	sopt.sopt_name = name;
1325	sopt.sopt_val = val;
1326	sopt.sopt_valsize = valsize;
1327	switch (valseg) {
1328	case UIO_USERSPACE:
1329		sopt.sopt_td = td;
1330		break;
1331	case UIO_SYSSPACE:
1332		sopt.sopt_td = NULL;
1333		break;
1334	default:
1335		panic("kern_setsockopt called with bad valseg");
1336	}
1337
1338	AUDIT_ARG_FD(s);
1339	error = getsock_cap(td->td_proc->p_fd, s, CAP_SETSOCKOPT, &fp, NULL);
1340	if (error == 0) {
1341		so = fp->f_data;
1342		error = sosetopt(so, &sopt);
1343		fdrop(fp, td);
1344	}
1345	return(error);
1346}
1347
1348/* ARGSUSED */
1349int
1350sys_getsockopt(td, uap)
1351	struct thread *td;
1352	struct getsockopt_args /* {
1353		int	s;
1354		int	level;
1355		int	name;
1356		void * __restrict	val;
1357		socklen_t * __restrict avalsize;
1358	} */ *uap;
1359{
1360	socklen_t valsize;
1361	int	error;
1362
1363	if (uap->val) {
1364		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1365		if (error)
1366			return (error);
1367	}
1368
1369	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1370	    uap->val, UIO_USERSPACE, &valsize);
1371
1372	if (error == 0)
1373		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1374	return (error);
1375}
1376
1377/*
1378 * Kernel version of getsockopt.
1379 * optval can be a userland or userspace. optlen is always a kernel pointer.
1380 */
1381int
1382kern_getsockopt(td, s, level, name, val, valseg, valsize)
1383	struct thread *td;
1384	int s;
1385	int level;
1386	int name;
1387	void *val;
1388	enum uio_seg valseg;
1389	socklen_t *valsize;
1390{
1391	int error;
1392	struct  socket *so;
1393	struct file *fp;
1394	struct	sockopt sopt;
1395
1396	if (val == NULL)
1397		*valsize = 0;
1398	if ((int)*valsize < 0)
1399		return (EINVAL);
1400
1401	sopt.sopt_dir = SOPT_GET;
1402	sopt.sopt_level = level;
1403	sopt.sopt_name = name;
1404	sopt.sopt_val = val;
1405	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1406	switch (valseg) {
1407	case UIO_USERSPACE:
1408		sopt.sopt_td = td;
1409		break;
1410	case UIO_SYSSPACE:
1411		sopt.sopt_td = NULL;
1412		break;
1413	default:
1414		panic("kern_getsockopt called with bad valseg");
1415	}
1416
1417	AUDIT_ARG_FD(s);
1418	error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL);
1419	if (error == 0) {
1420		so = fp->f_data;
1421		error = sogetopt(so, &sopt);
1422		*valsize = sopt.sopt_valsize;
1423		fdrop(fp, td);
1424	}
1425	return (error);
1426}
1427
1428/*
1429 * getsockname1() - Get socket name.
1430 */
1431/* ARGSUSED */
1432static int
1433getsockname1(td, uap, compat)
1434	struct thread *td;
1435	struct getsockname_args /* {
1436		int	fdes;
1437		struct sockaddr * __restrict asa;
1438		socklen_t * __restrict alen;
1439	} */ *uap;
1440	int compat;
1441{
1442	struct sockaddr *sa;
1443	socklen_t len;
1444	int error;
1445
1446	error = copyin(uap->alen, &len, sizeof(len));
1447	if (error)
1448		return (error);
1449
1450	error = kern_getsockname(td, uap->fdes, &sa, &len);
1451	if (error)
1452		return (error);
1453
1454	if (len != 0) {
1455#ifdef COMPAT_OLDSOCK
1456		if (compat)
1457			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1458#endif
1459		error = copyout(sa, uap->asa, (u_int)len);
1460	}
1461	free(sa, M_SONAME);
1462	if (error == 0)
1463		error = copyout(&len, uap->alen, sizeof(len));
1464	return (error);
1465}
1466
1467int
1468kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1469    socklen_t *alen)
1470{
1471	struct socket *so;
1472	struct file *fp;
1473	socklen_t len;
1474	int error;
1475
1476	if (*alen < 0)
1477		return (EINVAL);
1478
1479	AUDIT_ARG_FD(fd);
1480	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, NULL);
1481	if (error)
1482		return (error);
1483	so = fp->f_data;
1484	*sa = NULL;
1485	CURVNET_SET(so->so_vnet);
1486	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1487	CURVNET_RESTORE();
1488	if (error)
1489		goto bad;
1490	if (*sa == NULL)
1491		len = 0;
1492	else
1493		len = MIN(*alen, (*sa)->sa_len);
1494	*alen = len;
1495#ifdef KTRACE
1496	if (KTRPOINT(td, KTR_STRUCT))
1497		ktrsockaddr(*sa);
1498#endif
1499bad:
1500	fdrop(fp, td);
1501	if (error && *sa) {
1502		free(*sa, M_SONAME);
1503		*sa = NULL;
1504	}
1505	return (error);
1506}
1507
/*
 * getsockname(2): native entry point; runs getsockname1() without the
 * old-socket address conversion.
 */
int
sys_getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1516
1517#ifdef COMPAT_OLDSOCK
/*
 * Old-socket compat getsockname: same as sys_getsockname() but asks
 * getsockname1() to convert the address to the osockaddr layout.
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
1526#endif /* COMPAT_OLDSOCK */
1527
1528/*
1529 * getpeername1() - Get name of peer for connected socket.
1530 */
1531/* ARGSUSED */
1532static int
1533getpeername1(td, uap, compat)
1534	struct thread *td;
1535	struct getpeername_args /* {
1536		int	fdes;
1537		struct sockaddr * __restrict	asa;
1538		socklen_t * __restrict	alen;
1539	} */ *uap;
1540	int compat;
1541{
1542	struct sockaddr *sa;
1543	socklen_t len;
1544	int error;
1545
1546	error = copyin(uap->alen, &len, sizeof (len));
1547	if (error)
1548		return (error);
1549
1550	error = kern_getpeername(td, uap->fdes, &sa, &len);
1551	if (error)
1552		return (error);
1553
1554	if (len != 0) {
1555#ifdef COMPAT_OLDSOCK
1556		if (compat)
1557			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1558#endif
1559		error = copyout(sa, uap->asa, (u_int)len);
1560	}
1561	free(sa, M_SONAME);
1562	if (error == 0)
1563		error = copyout(&len, uap->alen, sizeof(len));
1564	return (error);
1565}
1566
1567int
1568kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1569    socklen_t *alen)
1570{
1571	struct socket *so;
1572	struct file *fp;
1573	socklen_t len;
1574	int error;
1575
1576	if (*alen < 0)
1577		return (EINVAL);
1578
1579	AUDIT_ARG_FD(fd);
1580	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, NULL);
1581	if (error)
1582		return (error);
1583	so = fp->f_data;
1584	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1585		error = ENOTCONN;
1586		goto done;
1587	}
1588	*sa = NULL;
1589	CURVNET_SET(so->so_vnet);
1590	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1591	CURVNET_RESTORE();
1592	if (error)
1593		goto bad;
1594	if (*sa == NULL)
1595		len = 0;
1596	else
1597		len = MIN(*alen, (*sa)->sa_len);
1598	*alen = len;
1599#ifdef KTRACE
1600	if (KTRPOINT(td, KTR_STRUCT))
1601		ktrsockaddr(*sa);
1602#endif
1603bad:
1604	if (error && *sa) {
1605		free(*sa, M_SONAME);
1606		*sa = NULL;
1607	}
1608done:
1609	fdrop(fp, td);
1610	return (error);
1611}
1612
/*
 * getpeername(2): native entry point; runs getpeername1() without the
 * old-socket address conversion.
 */
int
sys_getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1621
1622#ifdef COMPAT_OLDSOCK
/*
 * Old-socket compat getpeername: same as sys_getpeername() but asks
 * getpeername1() to convert the address to the osockaddr layout.
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
1632#endif /* COMPAT_OLDSOCK */
1633
1634int
1635sockargs(mp, buf, buflen, type)
1636	struct mbuf **mp;
1637	caddr_t buf;
1638	int buflen, type;
1639{
1640	struct sockaddr *sa;
1641	struct mbuf *m;
1642	int error;
1643
1644	if ((u_int)buflen > MLEN) {
1645#ifdef COMPAT_OLDSOCK
1646		if (type == MT_SONAME && (u_int)buflen <= 112)
1647			buflen = MLEN;		/* unix domain compat. hack */
1648		else
1649#endif
1650			if ((u_int)buflen > MCLBYTES)
1651				return (EINVAL);
1652	}
1653	m = m_get(M_WAITOK, type);
1654	if ((u_int)buflen > MLEN)
1655		MCLGET(m, M_WAITOK);
1656	m->m_len = buflen;
1657	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1658	if (error)
1659		(void) m_free(m);
1660	else {
1661		*mp = m;
1662		if (type == MT_SONAME) {
1663			sa = mtod(m, struct sockaddr *);
1664
1665#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1666			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1667				sa->sa_family = sa->sa_len;
1668#endif
1669			sa->sa_len = buflen;
1670		}
1671	}
1672	return (error);
1673}
1674
1675int
1676getsockaddr(namp, uaddr, len)
1677	struct sockaddr **namp;
1678	caddr_t uaddr;
1679	size_t len;
1680{
1681	struct sockaddr *sa;
1682	int error;
1683
1684	if (len > SOCK_MAXADDRLEN)
1685		return (ENAMETOOLONG);
1686	if (len < offsetof(struct sockaddr, sa_data[0]))
1687		return (EINVAL);
1688	sa = malloc(len, M_SONAME, M_WAITOK);
1689	error = copyin(uaddr, sa, len);
1690	if (error) {
1691		free(sa, M_SONAME);
1692	} else {
1693#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1694		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1695			sa->sa_family = sa->sa_len;
1696#endif
1697		sa->sa_len = len;
1698		*namp = sa;
1699	}
1700	return (error);
1701}
1702
1703#include <sys/condvar.h>
1704
1705struct sendfile_sync {
1706	struct mtx	mtx;
1707	struct cv	cv;
1708	unsigned	count;
1709};
1710
1711/*
1712 * Detach mapped page and release resources back to the system.
1713 */
1714void
1715sf_buf_mext(void *addr, void *args)
1716{
1717	vm_page_t m;
1718	struct sendfile_sync *sfs;
1719
1720	m = sf_buf_page(args);
1721	sf_buf_free(args);
1722	vm_page_lock(m);
1723	vm_page_unwire(m, 0);
1724	/*
1725	 * Check for the object going away on us. This can
1726	 * happen since we don't hold a reference to it.
1727	 * If so, we're responsible for freeing the page.
1728	 */
1729	if (m->wire_count == 0 && m->object == NULL)
1730		vm_page_free(m);
1731	vm_page_unlock(m);
1732	if (addr == NULL)
1733		return;
1734	sfs = addr;
1735	mtx_lock(&sfs->mtx);
1736	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1737	if (--sfs->count == 0)
1738		cv_signal(&sfs->cv);
1739	mtx_unlock(&sfs->mtx);
1740}
1741
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send the file referred to by 'fd', beginning at 'offset', to the
 * socket 's'.  At most 'nbytes' of file data are sent; nbytes == 0 means
 * send until EOF.  Headers and/or trailers may optionally be written
 * around the file data.  If 'sbytes' is given, the total number of bytes
 * sent is stored in *sbytes.
 *
 * This is the native entry point: do_sendfile() is run with compat == 0.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
1759
1760static int
1761do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1762{
1763	struct sf_hdtr hdtr;
1764	struct uio *hdr_uio, *trl_uio;
1765	int error;
1766
1767	hdr_uio = trl_uio = NULL;
1768
1769	if (uap->hdtr != NULL) {
1770		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1771		if (error)
1772			goto out;
1773		if (hdtr.headers != NULL) {
1774			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1775			if (error)
1776				goto out;
1777		}
1778		if (hdtr.trailers != NULL) {
1779			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1780			if (error)
1781				goto out;
1782
1783		}
1784	}
1785
1786	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1787out:
1788	if (hdr_uio)
1789		free(hdr_uio, M_IOV);
1790	if (trl_uio)
1791		free(trl_uio, M_IOV);
1792	return (error);
1793}
1794
1795#ifdef COMPAT_FREEBSD4
1796int
1797freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1798{
1799	struct sendfile_args args;
1800
1801	args.fd = uap->fd;
1802	args.s = uap->s;
1803	args.offset = uap->offset;
1804	args.nbytes = uap->nbytes;
1805	args.hdtr = uap->hdtr;
1806	args.sbytes = uap->sbytes;
1807	args.flags = uap->flags;
1808
1809	return (do_sendfile(td, &args, 1));
1810}
1811#endif /* COMPAT_FREEBSD4 */
1812
1813int
1814kern_sendfile(struct thread *td, struct sendfile_args *uap,
1815    struct uio *hdr_uio, struct uio *trl_uio, int compat)
1816{
1817	struct file *sock_fp;
1818	struct vnode *vp;
1819	struct vm_object *obj = NULL;
1820	struct socket *so = NULL;
1821	struct mbuf *m = NULL;
1822	struct sf_buf *sf;
1823	struct vm_page *pg;
1824	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1825	int error, hdrlen = 0, mnw = 0;
1826	struct sendfile_sync *sfs = NULL;
1827
1828	/*
1829	 * The file descriptor must be a regular file and have a
1830	 * backing VM object.
1831	 * File offset must be positive.  If it goes beyond EOF
1832	 * we send only the header/trailer and no payload data.
1833	 */
1834	AUDIT_ARG_FD(uap->fd);
1835	/*
1836	 * sendfile(2) can start at any offset within a file so we require
1837	 * CAP_READ+CAP_SEEK = CAP_PREAD.
1838	 */
1839	if ((error = fgetvp_read(td, uap->fd, CAP_PREAD, &vp)) != 0)
1840		goto out;
1841	vn_lock(vp, LK_SHARED | LK_RETRY);
1842	if (vp->v_type == VREG) {
1843		obj = vp->v_object;
1844		if (obj != NULL) {
1845			/*
1846			 * Temporarily increase the backing VM
1847			 * object's reference count so that a forced
1848			 * reclamation of its vnode does not
1849			 * immediately destroy it.
1850			 */
1851			VM_OBJECT_LOCK(obj);
1852			if ((obj->flags & OBJ_DEAD) == 0) {
1853				vm_object_reference_locked(obj);
1854				VM_OBJECT_UNLOCK(obj);
1855			} else {
1856				VM_OBJECT_UNLOCK(obj);
1857				obj = NULL;
1858			}
1859		}
1860	}
1861	VOP_UNLOCK(vp, 0);
1862	if (obj == NULL) {
1863		error = EINVAL;
1864		goto out;
1865	}
1866	if (uap->offset < 0) {
1867		error = EINVAL;
1868		goto out;
1869	}
1870
1871	/*
1872	 * The socket must be a stream socket and connected.
1873	 * Remember if it a blocking or non-blocking socket.
1874	 */
1875	if ((error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SEND,
1876	    &sock_fp, NULL)) != 0)
1877		goto out;
1878	so = sock_fp->f_data;
1879	if (so->so_type != SOCK_STREAM) {
1880		error = EINVAL;
1881		goto out;
1882	}
1883	if ((so->so_state & SS_ISCONNECTED) == 0) {
1884		error = ENOTCONN;
1885		goto out;
1886	}
1887	/*
1888	 * Do not wait on memory allocations but return ENOMEM for
1889	 * caller to retry later.
1890	 * XXX: Experimental.
1891	 */
1892	if (uap->flags & SF_MNOWAIT)
1893		mnw = 1;
1894
1895	if (uap->flags & SF_SYNC) {
1896		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
1897		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
1898		cv_init(&sfs->cv, "sendfile");
1899	}
1900
1901#ifdef MAC
1902	error = mac_socket_check_send(td->td_ucred, so);
1903	if (error)
1904		goto out;
1905#endif
1906
1907	/* If headers are specified copy them into mbufs. */
1908	if (hdr_uio != NULL) {
1909		hdr_uio->uio_td = td;
1910		hdr_uio->uio_rw = UIO_WRITE;
1911		if (hdr_uio->uio_resid > 0) {
1912			/*
1913			 * In FBSD < 5.0 the nbytes to send also included
1914			 * the header.  If compat is specified subtract the
1915			 * header size from nbytes.
1916			 */
1917			if (compat) {
1918				if (uap->nbytes > hdr_uio->uio_resid)
1919					uap->nbytes -= hdr_uio->uio_resid;
1920				else
1921					uap->nbytes = 0;
1922			}
1923			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
1924			    0, 0, 0);
1925			if (m == NULL) {
1926				error = mnw ? EAGAIN : ENOBUFS;
1927				goto out;
1928			}
1929			hdrlen = m_length(m, NULL);
1930		}
1931	}
1932
1933	/*
1934	 * Protect against multiple writers to the socket.
1935	 *
1936	 * XXXRW: Historically this has assumed non-interruptibility, so now
1937	 * we implement that, but possibly shouldn't.
1938	 */
1939	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
1940
1941	/*
1942	 * Loop through the pages of the file, starting with the requested
1943	 * offset. Get a file page (do I/O if necessary), map the file page
1944	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1945	 * it on the socket.
1946	 * This is done in two loops.  The inner loop turns as many pages
1947	 * as it can, up to available socket buffer space, without blocking
1948	 * into mbufs to have it bulk delivered into the socket send buffer.
1949	 * The outer loop checks the state and available space of the socket
1950	 * and takes care of the overall progress.
1951	 */
1952	for (off = uap->offset, rem = uap->nbytes; ; ) {
1953		struct mbuf *mtail = NULL;
1954		int loopbytes = 0;
1955		int space = 0;
1956		int done = 0;
1957
1958		/*
1959		 * Check the socket state for ongoing connection,
1960		 * no errors and space in socket buffer.
1961		 * If space is low allow for the remainder of the
1962		 * file to be processed if it fits the socket buffer.
1963		 * Otherwise block in waiting for sufficient space
1964		 * to proceed, or if the socket is nonblocking, return
1965		 * to userland with EAGAIN while reporting how far
1966		 * we've come.
1967		 * We wait until the socket buffer has significant free
1968		 * space to do bulk sends.  This makes good use of file
1969		 * system read ahead and allows packet segmentation
1970		 * offloading hardware to take over lots of work.  If
1971		 * we were not careful here we would send off only one
1972		 * sfbuf at a time.
1973		 */
1974		SOCKBUF_LOCK(&so->so_snd);
1975		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
1976			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
1977retry_space:
1978		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1979			error = EPIPE;
1980			SOCKBUF_UNLOCK(&so->so_snd);
1981			goto done;
1982		} else if (so->so_error) {
1983			error = so->so_error;
1984			so->so_error = 0;
1985			SOCKBUF_UNLOCK(&so->so_snd);
1986			goto done;
1987		}
1988		space = sbspace(&so->so_snd);
1989		if (space < rem &&
1990		    (space <= 0 ||
1991		     space < so->so_snd.sb_lowat)) {
1992			if (so->so_state & SS_NBIO) {
1993				SOCKBUF_UNLOCK(&so->so_snd);
1994				error = EAGAIN;
1995				goto done;
1996			}
1997			/*
1998			 * sbwait drops the lock while sleeping.
1999			 * When we loop back to retry_space the
2000			 * state may have changed and we retest
2001			 * for it.
2002			 */
2003			error = sbwait(&so->so_snd);
2004			/*
2005			 * An error from sbwait usually indicates that we've
2006			 * been interrupted by a signal. If we've sent anything
2007			 * then return bytes sent, otherwise return the error.
2008			 */
2009			if (error) {
2010				SOCKBUF_UNLOCK(&so->so_snd);
2011				goto done;
2012			}
2013			goto retry_space;
2014		}
2015		SOCKBUF_UNLOCK(&so->so_snd);
2016
2017		/*
2018		 * Reduce space in the socket buffer by the size of
2019		 * the header mbuf chain.
2020		 * hdrlen is set to 0 after the first loop.
2021		 */
2022		space -= hdrlen;
2023
2024		/*
2025		 * Loop and construct maximum sized mbuf chain to be bulk
2026		 * dumped into socket buffer.
2027		 */
2028		while (space > loopbytes) {
2029			vm_pindex_t pindex;
2030			vm_offset_t pgoff;
2031			struct mbuf *m0;
2032
2033			VM_OBJECT_LOCK(obj);
2034			/*
2035			 * Calculate the amount to transfer.
2036			 * Not to exceed a page, the EOF,
2037			 * or the passed in nbytes.
2038			 */
2039			pgoff = (vm_offset_t)(off & PAGE_MASK);
2040			xfsize = omin(PAGE_SIZE - pgoff,
2041			    obj->un_pager.vnp.vnp_size - uap->offset -
2042			    fsbytes - loopbytes);
2043			if (uap->nbytes)
2044				rem = (uap->nbytes - fsbytes - loopbytes);
2045			else
2046				rem = obj->un_pager.vnp.vnp_size -
2047				    uap->offset - fsbytes - loopbytes;
2048			xfsize = omin(rem, xfsize);
2049			xfsize = omin(space - loopbytes, xfsize);
2050			if (xfsize <= 0) {
2051				VM_OBJECT_UNLOCK(obj);
2052				done = 1;		/* all data sent */
2053				break;
2054			}
2055
2056			/*
2057			 * Attempt to look up the page.  Allocate
2058			 * if not found or wait and loop if busy.
2059			 */
2060			pindex = OFF_TO_IDX(off);
2061			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2062			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
2063
2064			/*
2065			 * Check if page is valid for what we need,
2066			 * otherwise initiate I/O.
2067			 * If we already turned some pages into mbufs,
2068			 * send them off before we come here again and
2069			 * block.
2070			 */
2071			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2072				VM_OBJECT_UNLOCK(obj);
2073			else if (m != NULL)
2074				error = EAGAIN;	/* send what we already got */
2075			else if (uap->flags & SF_NODISKIO)
2076				error = EBUSY;
2077			else {
2078				int bsize;
2079				ssize_t resid;
2080
2081				/*
2082				 * Ensure that our page is still around
2083				 * when the I/O completes.
2084				 */
2085				vm_page_io_start(pg);
2086				VM_OBJECT_UNLOCK(obj);
2087
2088				/*
2089				 * Get the page from backing store.
2090				 */
2091				error = vn_lock(vp, LK_SHARED);
2092				if (error != 0)
2093					goto after_read;
2094				bsize = vp->v_mount->mnt_stat.f_iosize;
2095
2096				/*
2097				 * XXXMAC: Because we don't have fp->f_cred
2098				 * here, we pass in NOCRED.  This is probably
2099				 * wrong, but is consistent with our original
2100				 * implementation.
2101				 */
2102				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2103				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2104				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2105				    td->td_ucred, NOCRED, &resid, td);
2106				VOP_UNLOCK(vp, 0);
2107			after_read:
2108				VM_OBJECT_LOCK(obj);
2109				vm_page_io_finish(pg);
2110				if (!error)
2111					VM_OBJECT_UNLOCK(obj);
2112				mbstat.sf_iocnt++;
2113			}
2114			if (error) {
2115				vm_page_lock(pg);
2116				vm_page_unwire(pg, 0);
2117				/*
2118				 * See if anyone else might know about
2119				 * this page.  If not and it is not valid,
2120				 * then free it.
2121				 */
2122				if (pg->wire_count == 0 && pg->valid == 0 &&
2123				    pg->busy == 0 && !(pg->oflags & VPO_BUSY))
2124					vm_page_free(pg);
2125				vm_page_unlock(pg);
2126				VM_OBJECT_UNLOCK(obj);
2127				if (error == EAGAIN)
2128					error = 0;	/* not a real error */
2129				break;
2130			}
2131
2132			/*
2133			 * Get a sendfile buf.  When allocating the
2134			 * first buffer for mbuf chain, we usually
2135			 * wait as long as necessary, but this wait
2136			 * can be interrupted.  For consequent
2137			 * buffers, do not sleep, since several
2138			 * threads might exhaust the buffers and then
2139			 * deadlock.
2140			 */
2141			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
2142			    SFB_CATCH);
2143			if (sf == NULL) {
2144				mbstat.sf_allocfail++;
2145				vm_page_lock(pg);
2146				vm_page_unwire(pg, 0);
2147				KASSERT(pg->object != NULL,
2148				    ("kern_sendfile: object disappeared"));
2149				vm_page_unlock(pg);
2150				if (m == NULL)
2151					error = (mnw ? EAGAIN : EINTR);
2152				break;
2153			}
2154
2155			/*
2156			 * Get an mbuf and set it up as having
2157			 * external storage.
2158			 */
2159			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2160			if (m0 == NULL) {
2161				error = (mnw ? EAGAIN : ENOBUFS);
2162				sf_buf_mext((void *)sf_buf_kva(sf), sf);
2163				break;
2164			}
2165			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
2166			    sfs, sf, M_RDONLY, EXT_SFBUF);
2167			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2168			m0->m_len = xfsize;
2169
2170			/* Append to mbuf chain. */
2171			if (mtail != NULL)
2172				mtail->m_next = m0;
2173			else if (m != NULL)
2174				m_last(m)->m_next = m0;
2175			else
2176				m = m0;
2177			mtail = m0;
2178
2179			/* Keep track of bits processed. */
2180			loopbytes += xfsize;
2181			off += xfsize;
2182
2183			if (sfs != NULL) {
2184				mtx_lock(&sfs->mtx);
2185				sfs->count++;
2186				mtx_unlock(&sfs->mtx);
2187			}
2188		}
2189
2190		/* Add the buffer chain to the socket buffer. */
2191		if (m != NULL) {
2192			int mlen, err;
2193
2194			mlen = m_length(m, NULL);
2195			SOCKBUF_LOCK(&so->so_snd);
2196			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2197				error = EPIPE;
2198				SOCKBUF_UNLOCK(&so->so_snd);
2199				goto done;
2200			}
2201			SOCKBUF_UNLOCK(&so->so_snd);
2202			CURVNET_SET(so->so_vnet);
2203			/* Avoid error aliasing. */
2204			err = (*so->so_proto->pr_usrreqs->pru_send)
2205				    (so, 0, m, NULL, NULL, td);
2206			CURVNET_RESTORE();
2207			if (err == 0) {
2208				/*
2209				 * We need two counters to get the
2210				 * file offset and nbytes to send
2211				 * right:
2212				 * - sbytes contains the total amount
2213				 *   of bytes sent, including headers.
2214				 * - fsbytes contains the total amount
2215				 *   of bytes sent from the file.
2216				 */
2217				sbytes += mlen;
2218				fsbytes += mlen;
2219				if (hdrlen) {
2220					fsbytes -= hdrlen;
2221					hdrlen = 0;
2222				}
2223			} else if (error == 0)
2224				error = err;
2225			m = NULL;	/* pru_send always consumes */
2226		}
2227
2228		/* Quit outer loop on error or when we're done. */
2229		if (done)
2230			break;
2231		if (error)
2232			goto done;
2233	}
2234
2235	/*
2236	 * Send trailers. Wimp out and use writev(2).
2237	 */
2238	if (trl_uio != NULL) {
2239		sbunlock(&so->so_snd);
2240		error = kern_writev(td, uap->s, trl_uio);
2241		if (error == 0)
2242			sbytes += td->td_retval[0];
2243		goto out;
2244	}
2245
2246done:
2247	sbunlock(&so->so_snd);
2248out:
2249	/*
2250	 * If there was no error we have to clear td->td_retval[0]
2251	 * because it may have been set by writev.
2252	 */
2253	if (error == 0) {
2254		td->td_retval[0] = 0;
2255	}
2256	if (uap->sbytes != NULL) {
2257		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2258	}
2259	if (obj != NULL)
2260		vm_object_deallocate(obj);
2261	if (vp != NULL)
2262		vrele(vp);
2263	if (so)
2264		fdrop(sock_fp, td);
2265	if (m)
2266		m_freem(m);
2267
2268	if (sfs != NULL) {
2269		mtx_lock(&sfs->mtx);
2270		if (sfs->count != 0)
2271			cv_wait(&sfs->cv, &sfs->mtx);
2272		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2273		cv_destroy(&sfs->cv);
2274		mtx_destroy(&sfs->mtx);
2275		free(sfs, M_TEMP);
2276	}
2277
2278	if (error == ERESTART)
2279		error = EINTR;
2280
2281	return (error);
2282}
2283
2284/*
2285 * SCTP syscalls.
2286 * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2287 * otherwise all return EOPNOTSUPP.
2288 * XXX: We should make this loadable one day.
2289 */
2290int
2291sys_sctp_peeloff(td, uap)
2292	struct thread *td;
2293	struct sctp_peeloff_args /* {
2294		int	sd;
2295		caddr_t	name;
2296	} */ *uap;
2297{
2298#if (defined(INET) || defined(INET6)) && defined(SCTP)
2299	struct file *nfp = NULL;
2300	int error;
2301	struct socket *head, *so;
2302	int fd;
2303	u_int fflag;
2304
2305	AUDIT_ARG_FD(uap->sd);
2306	error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
2307	if (error)
2308		goto done2;
2309	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
2310		error = EOPNOTSUPP;
2311		goto done;
2312	}
2313	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2314	if (error)
2315		goto done;
2316	/*
2317	 * At this point we know we do have a assoc to pull
2318	 * we proceed to get the fd setup. This may block
2319	 * but that is ok.
2320	 */
2321
2322	error = falloc(td, &nfp, &fd, 0);
2323	if (error)
2324		goto done;
2325	td->td_retval[0] = fd;
2326
2327	CURVNET_SET(head->so_vnet);
2328	so = sonewconn(head, SS_ISCONNECTED);
2329	if (so == NULL)
2330		goto noconnection;
2331	/*
2332	 * Before changing the flags on the socket, we have to bump the
2333	 * reference count.  Otherwise, if the protocol calls sofree(),
2334	 * the socket will be released due to a zero refcount.
2335	 */
2336        SOCK_LOCK(so);
2337        soref(so);                      /* file descriptor reference */
2338        SOCK_UNLOCK(so);
2339
2340	ACCEPT_LOCK();
2341
2342	TAILQ_REMOVE(&head->so_comp, so, so_list);
2343	head->so_qlen--;
2344	so->so_state |= (head->so_state & SS_NBIO);
2345	so->so_state &= ~SS_NOFDREF;
2346	so->so_qstate &= ~SQ_COMP;
2347	so->so_head = NULL;
2348	ACCEPT_UNLOCK();
2349	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
2350	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2351	if (error)
2352		goto noconnection;
2353	if (head->so_sigio != NULL)
2354		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2355
2356noconnection:
2357	/*
2358	 * close the new descriptor, assuming someone hasn't ripped it
2359	 * out from under us.
2360	 */
2361	if (error)
2362		fdclose(td->td_proc->p_fd, nfp, fd, td);
2363
2364	/*
2365	 * Release explicitly held references before returning.
2366	 */
2367	CURVNET_RESTORE();
2368done:
2369	if (nfp != NULL)
2370		fdrop(nfp, td);
2371	fputsock(head);
2372done2:
2373	return (error);
2374#else  /* SCTP */
2375	return (EOPNOTSUPP);
2376#endif /* SCTP */
2377}
2378
2379int
2380sys_sctp_generic_sendmsg (td, uap)
2381	struct thread *td;
2382	struct sctp_generic_sendmsg_args /* {
2383		int sd,
2384		caddr_t msg,
2385		int mlen,
2386		caddr_t to,
2387		__socklen_t tolen,
2388		struct sctp_sndrcvinfo *sinfo,
2389		int flags
2390	} */ *uap;
2391{
2392#if (defined(INET) || defined(INET6)) && defined(SCTP)
2393	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2394	struct socket *so;
2395	struct file *fp = NULL;
2396	int error = 0, len;
2397	struct sockaddr *to = NULL;
2398#ifdef KTRACE
2399	struct uio *ktruio = NULL;
2400#endif
2401	struct uio auio;
2402	struct iovec iov[1];
2403	cap_rights_t rights;
2404
2405	if (uap->sinfo) {
2406		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2407		if (error)
2408			return (error);
2409		u_sinfo = &sinfo;
2410	}
2411
2412	rights = CAP_SEND;
2413	if (uap->tolen) {
2414		error = getsockaddr(&to, uap->to, uap->tolen);
2415		if (error) {
2416			to = NULL;
2417			goto sctp_bad2;
2418		}
2419		rights |= CAP_CONNECT;
2420	}
2421
2422	AUDIT_ARG_FD(uap->sd);
2423	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
2424	if (error)
2425		goto sctp_bad;
2426#ifdef KTRACE
2427	if (to && (KTRPOINT(td, KTR_STRUCT)))
2428		ktrsockaddr(to);
2429#endif
2430
2431	iov[0].iov_base = uap->msg;
2432	iov[0].iov_len = uap->mlen;
2433
2434	so = (struct socket *)fp->f_data;
2435	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2436		error = EOPNOTSUPP;
2437		goto sctp_bad;
2438	}
2439#ifdef MAC
2440	error = mac_socket_check_send(td->td_ucred, so);
2441	if (error)
2442		goto sctp_bad;
2443#endif /* MAC */
2444
2445	auio.uio_iov =  iov;
2446	auio.uio_iovcnt = 1;
2447	auio.uio_segflg = UIO_USERSPACE;
2448	auio.uio_rw = UIO_WRITE;
2449	auio.uio_td = td;
2450	auio.uio_offset = 0;			/* XXX */
2451	auio.uio_resid = 0;
2452	len = auio.uio_resid = uap->mlen;
2453	CURVNET_SET(so->so_vnet);
2454	error = sctp_lower_sosend(so, to, &auio,
2455		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2456		    uap->flags, u_sinfo, td);
2457	CURVNET_RESTORE();
2458	if (error) {
2459		if (auio.uio_resid != len && (error == ERESTART ||
2460		    error == EINTR || error == EWOULDBLOCK))
2461			error = 0;
2462		/* Generation of SIGPIPE can be controlled per socket. */
2463		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2464		    !(uap->flags & MSG_NOSIGNAL)) {
2465			PROC_LOCK(td->td_proc);
2466			tdsignal(td, SIGPIPE);
2467			PROC_UNLOCK(td->td_proc);
2468		}
2469	}
2470	if (error == 0)
2471		td->td_retval[0] = len - auio.uio_resid;
2472#ifdef KTRACE
2473	if (ktruio != NULL) {
2474		ktruio->uio_resid = td->td_retval[0];
2475		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2476	}
2477#endif /* KTRACE */
2478sctp_bad:
2479	if (fp)
2480		fdrop(fp, td);
2481sctp_bad2:
2482	if (to)
2483		free(to, M_SONAME);
2484	return (error);
2485#else  /* SCTP */
2486	return (EOPNOTSUPP);
2487#endif /* SCTP */
2488}
2489
2490int
2491sys_sctp_generic_sendmsg_iov(td, uap)
2492	struct thread *td;
2493	struct sctp_generic_sendmsg_iov_args /* {
2494		int sd,
2495		struct iovec *iov,
2496		int iovlen,
2497		caddr_t to,
2498		__socklen_t tolen,
2499		struct sctp_sndrcvinfo *sinfo,
2500		int flags
2501	} */ *uap;
2502{
2503#if (defined(INET) || defined(INET6)) && defined(SCTP)
2504	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2505	struct socket *so;
2506	struct file *fp = NULL;
2507	int error=0, i;
2508	ssize_t len;
2509	struct sockaddr *to = NULL;
2510#ifdef KTRACE
2511	struct uio *ktruio = NULL;
2512#endif
2513	struct uio auio;
2514	struct iovec *iov, *tiov;
2515	cap_rights_t rights;
2516
2517	if (uap->sinfo) {
2518		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2519		if (error)
2520			return (error);
2521		u_sinfo = &sinfo;
2522	}
2523	rights = CAP_SEND;
2524	if (uap->tolen) {
2525		error = getsockaddr(&to, uap->to, uap->tolen);
2526		if (error) {
2527			to = NULL;
2528			goto sctp_bad2;
2529		}
2530		rights |= CAP_CONNECT;
2531	}
2532
2533	AUDIT_ARG_FD(uap->sd);
2534	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
2535	if (error)
2536		goto sctp_bad1;
2537
2538#ifdef COMPAT_FREEBSD32
2539	if (SV_CURPROC_FLAG(SV_ILP32))
2540		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2541		    uap->iovlen, &iov, EMSGSIZE);
2542	else
2543#endif
2544		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2545	if (error)
2546		goto sctp_bad1;
2547#ifdef KTRACE
2548	if (to && (KTRPOINT(td, KTR_STRUCT)))
2549		ktrsockaddr(to);
2550#endif
2551
2552	so = (struct socket *)fp->f_data;
2553	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2554		error = EOPNOTSUPP;
2555		goto sctp_bad;
2556	}
2557#ifdef MAC
2558	error = mac_socket_check_send(td->td_ucred, so);
2559	if (error)
2560		goto sctp_bad;
2561#endif /* MAC */
2562
2563	auio.uio_iov = iov;
2564	auio.uio_iovcnt = uap->iovlen;
2565	auio.uio_segflg = UIO_USERSPACE;
2566	auio.uio_rw = UIO_WRITE;
2567	auio.uio_td = td;
2568	auio.uio_offset = 0;			/* XXX */
2569	auio.uio_resid = 0;
2570	tiov = iov;
2571	for (i = 0; i <uap->iovlen; i++, tiov++) {
2572		if ((auio.uio_resid += tiov->iov_len) < 0) {
2573			error = EINVAL;
2574			goto sctp_bad;
2575		}
2576	}
2577	len = auio.uio_resid;
2578	CURVNET_SET(so->so_vnet);
2579	error = sctp_lower_sosend(so, to, &auio,
2580		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2581		    uap->flags, u_sinfo, td);
2582	CURVNET_RESTORE();
2583	if (error) {
2584		if (auio.uio_resid != len && (error == ERESTART ||
2585		    error == EINTR || error == EWOULDBLOCK))
2586			error = 0;
2587		/* Generation of SIGPIPE can be controlled per socket */
2588		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2589		    !(uap->flags & MSG_NOSIGNAL)) {
2590			PROC_LOCK(td->td_proc);
2591			tdsignal(td, SIGPIPE);
2592			PROC_UNLOCK(td->td_proc);
2593		}
2594	}
2595	if (error == 0)
2596		td->td_retval[0] = len - auio.uio_resid;
2597#ifdef KTRACE
2598	if (ktruio != NULL) {
2599		ktruio->uio_resid = td->td_retval[0];
2600		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2601	}
2602#endif /* KTRACE */
2603sctp_bad:
2604	free(iov, M_IOV);
2605sctp_bad1:
2606	if (fp)
2607		fdrop(fp, td);
2608sctp_bad2:
2609	if (to)
2610		free(to, M_SONAME);
2611	return (error);
2612#else  /* SCTP */
2613	return (EOPNOTSUPP);
2614#endif /* SCTP */
2615}
2616
2617int
2618sys_sctp_generic_recvmsg(td, uap)
2619	struct thread *td;
2620	struct sctp_generic_recvmsg_args /* {
2621		int sd,
2622		struct iovec *iov,
2623		int iovlen,
2624		struct sockaddr *from,
2625		__socklen_t *fromlenaddr,
2626		struct sctp_sndrcvinfo *sinfo,
2627		int *msg_flags
2628	} */ *uap;
2629{
2630#if (defined(INET) || defined(INET6)) && defined(SCTP)
2631	uint8_t sockbufstore[256];
2632	struct uio auio;
2633	struct iovec *iov, *tiov;
2634	struct sctp_sndrcvinfo sinfo;
2635	struct socket *so;
2636	struct file *fp = NULL;
2637	struct sockaddr *fromsa;
2638	int fromlen;
2639	ssize_t len;
2640	int i, msg_flags;
2641	int error = 0;
2642#ifdef KTRACE
2643	struct uio *ktruio = NULL;
2644#endif
2645
2646	AUDIT_ARG_FD(uap->sd);
2647	error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_RECV, &fp, NULL);
2648	if (error) {
2649		return (error);
2650	}
2651#ifdef COMPAT_FREEBSD32
2652	if (SV_CURPROC_FLAG(SV_ILP32))
2653		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2654		    uap->iovlen, &iov, EMSGSIZE);
2655	else
2656#endif
2657		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2658	if (error)
2659		goto out1;
2660
2661	so = fp->f_data;
2662	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2663		error = EOPNOTSUPP;
2664		goto out;
2665	}
2666#ifdef MAC
2667	error = mac_socket_check_receive(td->td_ucred, so);
2668	if (error) {
2669		goto out;
2670	}
2671#endif /* MAC */
2672
2673	if (uap->fromlenaddr) {
2674		error = copyin(uap->fromlenaddr,
2675		    &fromlen, sizeof (fromlen));
2676		if (error) {
2677			goto out;
2678		}
2679	} else {
2680		fromlen = 0;
2681	}
2682	if (uap->msg_flags) {
2683		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
2684		if (error) {
2685			goto out;
2686		}
2687	} else {
2688		msg_flags = 0;
2689	}
2690	auio.uio_iov = iov;
2691	auio.uio_iovcnt = uap->iovlen;
2692	auio.uio_segflg = UIO_USERSPACE;
2693	auio.uio_rw = UIO_READ;
2694	auio.uio_td = td;
2695	auio.uio_offset = 0;			/* XXX */
2696	auio.uio_resid = 0;
2697	tiov = iov;
2698	for (i = 0; i <uap->iovlen; i++, tiov++) {
2699		if ((auio.uio_resid += tiov->iov_len) < 0) {
2700			error = EINVAL;
2701			goto out;
2702		}
2703	}
2704	len = auio.uio_resid;
2705	fromsa = (struct sockaddr *)sockbufstore;
2706
2707#ifdef KTRACE
2708	if (KTRPOINT(td, KTR_GENIO))
2709		ktruio = cloneuio(&auio);
2710#endif /* KTRACE */
2711	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
2712	CURVNET_SET(so->so_vnet);
2713	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2714		    fromsa, fromlen, &msg_flags,
2715		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2716	CURVNET_RESTORE();
2717	if (error) {
2718		if (auio.uio_resid != len && (error == ERESTART ||
2719		    error == EINTR || error == EWOULDBLOCK))
2720			error = 0;
2721	} else {
2722		if (uap->sinfo)
2723			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2724	}
2725#ifdef KTRACE
2726	if (ktruio != NULL) {
2727		ktruio->uio_resid = len - auio.uio_resid;
2728		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2729	}
2730#endif /* KTRACE */
2731	if (error)
2732		goto out;
2733	td->td_retval[0] = len - auio.uio_resid;
2734
2735	if (fromlen && uap->from) {
2736		len = fromlen;
2737		if (len <= 0 || fromsa == 0)
2738			len = 0;
2739		else {
2740			len = MIN(len, fromsa->sa_len);
2741			error = copyout(fromsa, uap->from, (size_t)len);
2742			if (error)
2743				goto out;
2744		}
2745		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2746		if (error) {
2747			goto out;
2748		}
2749	}
2750#ifdef KTRACE
2751	if (KTRPOINT(td, KTR_STRUCT))
2752		ktrsockaddr(fromsa);
2753#endif
2754	if (uap->msg_flags) {
2755		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2756		if (error) {
2757			goto out;
2758		}
2759	}
2760out:
2761	free(iov, M_IOV);
2762out1:
2763	if (fp)
2764		fdrop(fp, td);
2765
2766	return (error);
2767#else  /* SCTP */
2768	return (EOPNOTSUPP);
2769#endif /* SCTP */
2770}
2771