kern_sendfile.c revision 250154
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 250154 2013-05-01 20:10:21Z jilles $");
37
38#include "opt_capsicum.h"
39#include "opt_inet.h"
40#include "opt_inet6.h"
41#include "opt_sctp.h"
42#include "opt_compat.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/capability.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/sysproto.h>
52#include <sys/malloc.h>
53#include <sys/filedesc.h>
54#include <sys/event.h>
55#include <sys/proc.h>
56#include <sys/fcntl.h>
57#include <sys/file.h>
58#include <sys/filio.h>
59#include <sys/jail.h>
60#include <sys/mount.h>
61#include <sys/mbuf.h>
62#include <sys/protosw.h>
63#include <sys/rwlock.h>
64#include <sys/sf_buf.h>
65#include <sys/sysent.h>
66#include <sys/socket.h>
67#include <sys/socketvar.h>
68#include <sys/signalvar.h>
69#include <sys/syscallsubr.h>
70#include <sys/sysctl.h>
71#include <sys/uio.h>
72#include <sys/vnode.h>
73#ifdef KTRACE
74#include <sys/ktrace.h>
75#endif
76#ifdef COMPAT_FREEBSD32
77#include <compat/freebsd32/freebsd32_util.h>
78#endif
79
80#include <net/vnet.h>
81
82#include <security/audit/audit.h>
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_param.h>
87#include <vm/vm_object.h>
88#include <vm/vm_page.h>
89#include <vm/vm_pageout.h>
90#include <vm/vm_kern.h>
91#include <vm/vm_extern.h>
92
93#if defined(INET) || defined(INET6)
94#ifdef SCTP
95#include <netinet/sctp.h>
96#include <netinet/sctp_peeloff.h>
97#endif /* SCTP */
98#endif /* INET || INET6 */
99
100/*
101 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
102 * and SOCK_NONBLOCK.
103 */
104#define	ACCEPT4_INHERIT	0x1
105#define	ACCEPT4_COMPAT	0x2
106
107static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
108static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
109
110static int accept1(struct thread *td, int s, struct sockaddr *uname,
111		   socklen_t *anamelen, int flags);
112static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
113static int getsockname1(struct thread *td, struct getsockname_args *uap,
114			int compat);
115static int getpeername1(struct thread *td, struct getpeername_args *uap,
116			int compat);
117
118/*
119 * NSFBUFS-related variables and associated sysctls
120 */
121int nsfbufs;
122int nsfbufspeak;
123int nsfbufsused;
124
125SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
126    "Maximum number of sendfile(2) sf_bufs available");
127SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
128    "Number of sendfile(2) sf_bufs at peak usage");
129SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
130    "Number of sendfile(2) sf_bufs in use");
131
132/*
133 * Convert a user file descriptor to a kernel file entry and check if required
134 * capability rights are present.
135 * A reference on the file entry is held upon returning.
136 */
137static int
138getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights,
139    struct file **fpp, u_int *fflagp)
140{
141	struct file *fp;
142	int error;
143
144	error = fget_unlocked(fdp, fd, rights, 0, &fp, NULL);
145	if (error != 0)
146		return (error);
147	if (fp->f_type != DTYPE_SOCKET) {
148		fdrop(fp, curthread);
149		return (ENOTSOCK);
150	}
151	if (fflagp != NULL)
152		*fflagp = fp->f_flag;
153	*fpp = fp;
154	return (0);
155}
156
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_OLDSOCK is switched on by COMPAT_43 and guards the old-style
 * entry points (oaccept(), osend(), orecv(), ...) later in this file.
 */
#ifdef COMPAT_43
#define COMPAT_OLDSOCK
#endif
163
164int
165sys_socket(td, uap)
166	struct thread *td;
167	struct socket_args /* {
168		int	domain;
169		int	type;
170		int	protocol;
171	} */ *uap;
172{
173	struct socket *so;
174	struct file *fp;
175	int fd, error, type, oflag, fflag;
176
177	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
178
179	type = uap->type;
180	oflag = 0;
181	fflag = 0;
182	if ((type & SOCK_CLOEXEC) != 0) {
183		type &= ~SOCK_CLOEXEC;
184		oflag |= O_CLOEXEC;
185	}
186	if ((type & SOCK_NONBLOCK) != 0) {
187		type &= ~SOCK_NONBLOCK;
188		fflag |= FNONBLOCK;
189	}
190
191#ifdef MAC
192	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
193	    uap->protocol);
194	if (error)
195		return (error);
196#endif
197	error = falloc(td, &fp, &fd, oflag);
198	if (error)
199		return (error);
200	/* An extra reference on `fp' has been held for us by falloc(). */
201	error = socreate(uap->domain, &so, type, uap->protocol,
202	    td->td_ucred, td);
203	if (error) {
204		fdclose(td->td_proc->p_fd, fp, fd, td);
205	} else {
206		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
207		if ((fflag & FNONBLOCK) != 0)
208			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
209		td->td_retval[0] = fd;
210	}
211	fdrop(fp, td);
212	return (error);
213}
214
215/* ARGSUSED */
216int
217sys_bind(td, uap)
218	struct thread *td;
219	struct bind_args /* {
220		int	s;
221		caddr_t	name;
222		int	namelen;
223	} */ *uap;
224{
225	struct sockaddr *sa;
226	int error;
227
228	error = getsockaddr(&sa, uap->name, uap->namelen);
229	if (error == 0) {
230		error = kern_bind(td, uap->s, sa);
231		free(sa, M_SONAME);
232	}
233	return (error);
234}
235
236static int
237kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
238{
239	struct socket *so;
240	struct file *fp;
241	int error;
242
243	AUDIT_ARG_FD(fd);
244	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
245	error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL);
246	if (error)
247		return (error);
248	so = fp->f_data;
249#ifdef KTRACE
250	if (KTRPOINT(td, KTR_STRUCT))
251		ktrsockaddr(sa);
252#endif
253#ifdef MAC
254	error = mac_socket_check_bind(td->td_ucred, so, sa);
255	if (error == 0) {
256#endif
257		if (dirfd == AT_FDCWD)
258			error = sobind(so, sa, td);
259		else
260			error = sobindat(dirfd, so, sa, td);
261#ifdef MAC
262	}
263#endif
264	fdrop(fp, td);
265	return (error);
266}
267
268int
269kern_bind(struct thread *td, int fd, struct sockaddr *sa)
270{
271
272	return (kern_bindat(td, AT_FDCWD, fd, sa));
273}
274
275/* ARGSUSED */
276int
277sys_bindat(td, uap)
278	struct thread *td;
279	struct bindat_args /* {
280		int	fd;
281		int	s;
282		caddr_t	name;
283		int	namelen;
284	} */ *uap;
285{
286	struct sockaddr *sa;
287	int error;
288
289	error = getsockaddr(&sa, uap->name, uap->namelen);
290	if (error == 0) {
291		error = kern_bindat(td, uap->fd, uap->s, sa);
292		free(sa, M_SONAME);
293	}
294	return (error);
295}
296
297/* ARGSUSED */
298int
299sys_listen(td, uap)
300	struct thread *td;
301	struct listen_args /* {
302		int	s;
303		int	backlog;
304	} */ *uap;
305{
306	struct socket *so;
307	struct file *fp;
308	int error;
309
310	AUDIT_ARG_FD(uap->s);
311	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, NULL);
312	if (error == 0) {
313		so = fp->f_data;
314#ifdef MAC
315		error = mac_socket_check_listen(td->td_ucred, so);
316		if (error == 0)
317#endif
318			error = solisten(so, uap->backlog, td);
319		fdrop(fp, td);
320	}
321	return(error);
322}
323
324/*
325 * accept1()
326 */
327static int
328accept1(td, s, uname, anamelen, flags)
329	struct thread *td;
330	int s;
331	struct sockaddr *uname;
332	socklen_t *anamelen;
333	int flags;
334{
335	struct sockaddr *name;
336	socklen_t namelen;
337	struct file *fp;
338	int error;
339
340	if (uname == NULL)
341		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
342
343	error = copyin(anamelen, &namelen, sizeof (namelen));
344	if (error)
345		return (error);
346
347	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
348
349	/*
350	 * return a namelen of zero for older code which might
351	 * ignore the return value from accept.
352	 */
353	if (error) {
354		(void) copyout(&namelen, anamelen, sizeof(*anamelen));
355		return (error);
356	}
357
358	if (error == 0 && uname != NULL) {
359#ifdef COMPAT_OLDSOCK
360		if (flags & ACCEPT4_COMPAT)
361			((struct osockaddr *)name)->sa_family =
362			    name->sa_family;
363#endif
364		error = copyout(name, uname, namelen);
365	}
366	if (error == 0)
367		error = copyout(&namelen, anamelen,
368		    sizeof(namelen));
369	if (error)
370		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
371	fdrop(fp, td);
372	free(name, M_SONAME);
373	return (error);
374}
375
376int
377kern_accept(struct thread *td, int s, struct sockaddr **name,
378    socklen_t *namelen, struct file **fp)
379{
380	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
381}
382
383int
384kern_accept4(struct thread *td, int s, struct sockaddr **name,
385    socklen_t *namelen, int flags, struct file **fp)
386{
387	struct filedesc *fdp;
388	struct file *headfp, *nfp = NULL;
389	struct sockaddr *sa = NULL;
390	int error;
391	struct socket *head, *so;
392	int fd;
393	u_int fflag;
394	pid_t pgid;
395	int tmp;
396
397	if (name)
398		*name = NULL;
399
400	AUDIT_ARG_FD(s);
401	fdp = td->td_proc->p_fd;
402	error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag);
403	if (error)
404		return (error);
405	head = headfp->f_data;
406	if ((head->so_options & SO_ACCEPTCONN) == 0) {
407		error = EINVAL;
408		goto done;
409	}
410#ifdef MAC
411	error = mac_socket_check_accept(td->td_ucred, head);
412	if (error != 0)
413		goto done;
414#endif
415	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
416	if (error)
417		goto done;
418	ACCEPT_LOCK();
419	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
420		ACCEPT_UNLOCK();
421		error = EWOULDBLOCK;
422		goto noconnection;
423	}
424	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
425		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
426			head->so_error = ECONNABORTED;
427			break;
428		}
429		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
430		    "accept", 0);
431		if (error) {
432			ACCEPT_UNLOCK();
433			goto noconnection;
434		}
435	}
436	if (head->so_error) {
437		error = head->so_error;
438		head->so_error = 0;
439		ACCEPT_UNLOCK();
440		goto noconnection;
441	}
442	so = TAILQ_FIRST(&head->so_comp);
443	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
444	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
445
446	/*
447	 * Before changing the flags on the socket, we have to bump the
448	 * reference count.  Otherwise, if the protocol calls sofree(),
449	 * the socket will be released due to a zero refcount.
450	 */
451	SOCK_LOCK(so);			/* soref() and so_state update */
452	soref(so);			/* file descriptor reference */
453
454	TAILQ_REMOVE(&head->so_comp, so, so_list);
455	head->so_qlen--;
456	if (flags & ACCEPT4_INHERIT)
457		so->so_state |= (head->so_state & SS_NBIO);
458	else
459		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
460	so->so_qstate &= ~SQ_COMP;
461	so->so_head = NULL;
462
463	SOCK_UNLOCK(so);
464	ACCEPT_UNLOCK();
465
466	/* An extra reference on `nfp' has been held for us by falloc(). */
467	td->td_retval[0] = fd;
468
469	/* connection has been removed from the listen queue */
470	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
471
472	if (flags & ACCEPT4_INHERIT) {
473		pgid = fgetown(&head->so_sigio);
474		if (pgid != 0)
475			fsetown(pgid, &so->so_sigio);
476	} else {
477		fflag &= ~(FNONBLOCK | FASYNC);
478		if (flags & SOCK_NONBLOCK)
479			fflag |= FNONBLOCK;
480	}
481
482	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
483	/* Sync socket nonblocking/async state with file flags */
484	tmp = fflag & FNONBLOCK;
485	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
486	tmp = fflag & FASYNC;
487	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
488	sa = 0;
489	error = soaccept(so, &sa);
490	if (error) {
491		/*
492		 * return a namelen of zero for older code which might
493		 * ignore the return value from accept.
494		 */
495		if (name)
496			*namelen = 0;
497		goto noconnection;
498	}
499	if (sa == NULL) {
500		if (name)
501			*namelen = 0;
502		goto done;
503	}
504	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
505	if (name) {
506		/* check sa_len before it is destroyed */
507		if (*namelen > sa->sa_len)
508			*namelen = sa->sa_len;
509#ifdef KTRACE
510		if (KTRPOINT(td, KTR_STRUCT))
511			ktrsockaddr(sa);
512#endif
513		*name = sa;
514		sa = NULL;
515	}
516noconnection:
517	if (sa)
518		free(sa, M_SONAME);
519
520	/*
521	 * close the new descriptor, assuming someone hasn't ripped it
522	 * out from under us.
523	 */
524	if (error)
525		fdclose(fdp, nfp, fd, td);
526
527	/*
528	 * Release explicitly held references before returning.  We return
529	 * a reference on nfp to the caller on success if they request it.
530	 */
531done:
532	if (fp != NULL) {
533		if (error == 0) {
534			*fp = nfp;
535			nfp = NULL;
536		} else
537			*fp = NULL;
538	}
539	if (nfp != NULL)
540		fdrop(nfp, td);
541	fdrop(headfp, td);
542	return (error);
543}
544
545int
546sys_accept(td, uap)
547	struct thread *td;
548	struct accept_args *uap;
549{
550
551	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
552}
553
554int
555sys_accept4(td, uap)
556	struct thread *td;
557	struct accept4_args *uap;
558{
559	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
560		return (EINVAL);
561
562	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
563}
564
565#ifdef COMPAT_OLDSOCK
566int
567oaccept(td, uap)
568	struct thread *td;
569	struct accept_args *uap;
570{
571
572	return (accept1(td, uap->s, uap->name, uap->anamelen,
573	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
574}
575#endif /* COMPAT_OLDSOCK */
576
577/* ARGSUSED */
578int
579sys_connect(td, uap)
580	struct thread *td;
581	struct connect_args /* {
582		int	s;
583		caddr_t	name;
584		int	namelen;
585	} */ *uap;
586{
587	struct sockaddr *sa;
588	int error;
589
590	error = getsockaddr(&sa, uap->name, uap->namelen);
591	if (error == 0) {
592		error = kern_connect(td, uap->s, sa);
593		free(sa, M_SONAME);
594	}
595	return (error);
596}
597
598static int
599kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
600{
601	struct socket *so;
602	struct file *fp;
603	int error;
604	int interrupted = 0;
605
606	AUDIT_ARG_FD(fd);
607	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
608	error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL);
609	if (error)
610		return (error);
611	so = fp->f_data;
612	if (so->so_state & SS_ISCONNECTING) {
613		error = EALREADY;
614		goto done1;
615	}
616#ifdef KTRACE
617	if (KTRPOINT(td, KTR_STRUCT))
618		ktrsockaddr(sa);
619#endif
620#ifdef MAC
621	error = mac_socket_check_connect(td->td_ucred, so, sa);
622	if (error)
623		goto bad;
624#endif
625	if (dirfd == AT_FDCWD)
626		error = soconnect(so, sa, td);
627	else
628		error = soconnectat(dirfd, so, sa, td);
629	if (error)
630		goto bad;
631	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
632		error = EINPROGRESS;
633		goto done1;
634	}
635	SOCK_LOCK(so);
636	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
637		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
638		    "connec", 0);
639		if (error) {
640			if (error == EINTR || error == ERESTART)
641				interrupted = 1;
642			break;
643		}
644	}
645	if (error == 0) {
646		error = so->so_error;
647		so->so_error = 0;
648	}
649	SOCK_UNLOCK(so);
650bad:
651	if (!interrupted)
652		so->so_state &= ~SS_ISCONNECTING;
653	if (error == ERESTART)
654		error = EINTR;
655done1:
656	fdrop(fp, td);
657	return (error);
658}
659
660int
661kern_connect(struct thread *td, int fd, struct sockaddr *sa)
662{
663
664	return (kern_connectat(td, AT_FDCWD, fd, sa));
665}
666
667/* ARGSUSED */
668int
669sys_connectat(td, uap)
670	struct thread *td;
671	struct connectat_args /* {
672		int	fd;
673		int	s;
674		caddr_t	name;
675		int	namelen;
676	} */ *uap;
677{
678	struct sockaddr *sa;
679	int error;
680
681	error = getsockaddr(&sa, uap->name, uap->namelen);
682	if (error == 0) {
683		error = kern_connectat(td, uap->fd, uap->s, sa);
684		free(sa, M_SONAME);
685	}
686	return (error);
687}
688
689int
690kern_socketpair(struct thread *td, int domain, int type, int protocol,
691    int *rsv)
692{
693	struct filedesc *fdp = td->td_proc->p_fd;
694	struct file *fp1, *fp2;
695	struct socket *so1, *so2;
696	int fd, error, oflag, fflag;
697
698	AUDIT_ARG_SOCKET(domain, type, protocol);
699
700	oflag = 0;
701	fflag = 0;
702	if ((type & SOCK_CLOEXEC) != 0) {
703		type &= ~SOCK_CLOEXEC;
704		oflag |= O_CLOEXEC;
705	}
706	if ((type & SOCK_NONBLOCK) != 0) {
707		type &= ~SOCK_NONBLOCK;
708		fflag |= FNONBLOCK;
709	}
710#ifdef MAC
711	/* We might want to have a separate check for socket pairs. */
712	error = mac_socket_check_create(td->td_ucred, domain, type,
713	    protocol);
714	if (error)
715		return (error);
716#endif
717	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
718	if (error)
719		return (error);
720	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
721	if (error)
722		goto free1;
723	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
724	error = falloc(td, &fp1, &fd, oflag);
725	if (error)
726		goto free2;
727	rsv[0] = fd;
728	fp1->f_data = so1;	/* so1 already has ref count */
729	error = falloc(td, &fp2, &fd, oflag);
730	if (error)
731		goto free3;
732	fp2->f_data = so2;	/* so2 already has ref count */
733	rsv[1] = fd;
734	error = soconnect2(so1, so2);
735	if (error)
736		goto free4;
737	if (type == SOCK_DGRAM) {
738		/*
739		 * Datagram socket connection is asymmetric.
740		 */
741		 error = soconnect2(so2, so1);
742		 if (error)
743			goto free4;
744	}
745	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
746	    &socketops);
747	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
748	    &socketops);
749	if ((fflag & FNONBLOCK) != 0) {
750		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
751		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
752	}
753	fdrop(fp1, td);
754	fdrop(fp2, td);
755	return (0);
756free4:
757	fdclose(fdp, fp2, rsv[1], td);
758	fdrop(fp2, td);
759free3:
760	fdclose(fdp, fp1, rsv[0], td);
761	fdrop(fp1, td);
762free2:
763	if (so2 != NULL)
764		(void)soclose(so2);
765free1:
766	if (so1 != NULL)
767		(void)soclose(so1);
768	return (error);
769}
770
771int
772sys_socketpair(struct thread *td, struct socketpair_args *uap)
773{
774	int error, sv[2];
775
776	error = kern_socketpair(td, uap->domain, uap->type,
777	    uap->protocol, sv);
778	if (error)
779		return (error);
780	error = copyout(sv, uap->rsv, 2 * sizeof(int));
781	if (error) {
782		(void)kern_close(td, sv[0]);
783		(void)kern_close(td, sv[1]);
784	}
785	return (error);
786}
787
788static int
789sendit(td, s, mp, flags)
790	struct thread *td;
791	int s;
792	struct msghdr *mp;
793	int flags;
794{
795	struct mbuf *control;
796	struct sockaddr *to;
797	int error;
798
799#ifdef CAPABILITY_MODE
800	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
801		return (ECAPMODE);
802#endif
803
804	if (mp->msg_name != NULL) {
805		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
806		if (error) {
807			to = NULL;
808			goto bad;
809		}
810		mp->msg_name = to;
811	} else {
812		to = NULL;
813	}
814
815	if (mp->msg_control) {
816		if (mp->msg_controllen < sizeof(struct cmsghdr)
817#ifdef COMPAT_OLDSOCK
818		    && mp->msg_flags != MSG_COMPAT
819#endif
820		) {
821			error = EINVAL;
822			goto bad;
823		}
824		error = sockargs(&control, mp->msg_control,
825		    mp->msg_controllen, MT_CONTROL);
826		if (error)
827			goto bad;
828#ifdef COMPAT_OLDSOCK
829		if (mp->msg_flags == MSG_COMPAT) {
830			struct cmsghdr *cm;
831
832			M_PREPEND(control, sizeof(*cm), M_WAITOK);
833			cm = mtod(control, struct cmsghdr *);
834			cm->cmsg_len = control->m_len;
835			cm->cmsg_level = SOL_SOCKET;
836			cm->cmsg_type = SCM_RIGHTS;
837		}
838#endif
839	} else {
840		control = NULL;
841	}
842
843	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
844
845bad:
846	if (to)
847		free(to, M_SONAME);
848	return (error);
849}
850
851int
852kern_sendit(td, s, mp, flags, control, segflg)
853	struct thread *td;
854	int s;
855	struct msghdr *mp;
856	int flags;
857	struct mbuf *control;
858	enum uio_seg segflg;
859{
860	struct file *fp;
861	struct uio auio;
862	struct iovec *iov;
863	struct socket *so;
864	int i, error;
865	ssize_t len;
866	cap_rights_t rights;
867#ifdef KTRACE
868	struct uio *ktruio = NULL;
869#endif
870
871	AUDIT_ARG_FD(s);
872	rights = CAP_SEND;
873	if (mp->msg_name != NULL) {
874		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
875		rights |= CAP_CONNECT;
876	}
877	error = getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL);
878	if (error)
879		return (error);
880	so = (struct socket *)fp->f_data;
881
882#ifdef KTRACE
883	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
884		ktrsockaddr(mp->msg_name);
885#endif
886#ifdef MAC
887	if (mp->msg_name != NULL) {
888		error = mac_socket_check_connect(td->td_ucred, so,
889		    mp->msg_name);
890		if (error)
891			goto bad;
892	}
893	error = mac_socket_check_send(td->td_ucred, so);
894	if (error)
895		goto bad;
896#endif
897
898	auio.uio_iov = mp->msg_iov;
899	auio.uio_iovcnt = mp->msg_iovlen;
900	auio.uio_segflg = segflg;
901	auio.uio_rw = UIO_WRITE;
902	auio.uio_td = td;
903	auio.uio_offset = 0;			/* XXX */
904	auio.uio_resid = 0;
905	iov = mp->msg_iov;
906	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
907		if ((auio.uio_resid += iov->iov_len) < 0) {
908			error = EINVAL;
909			goto bad;
910		}
911	}
912#ifdef KTRACE
913	if (KTRPOINT(td, KTR_GENIO))
914		ktruio = cloneuio(&auio);
915#endif
916	len = auio.uio_resid;
917	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
918	if (error) {
919		if (auio.uio_resid != len && (error == ERESTART ||
920		    error == EINTR || error == EWOULDBLOCK))
921			error = 0;
922		/* Generation of SIGPIPE can be controlled per socket */
923		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
924		    !(flags & MSG_NOSIGNAL)) {
925			PROC_LOCK(td->td_proc);
926			tdsignal(td, SIGPIPE);
927			PROC_UNLOCK(td->td_proc);
928		}
929	}
930	if (error == 0)
931		td->td_retval[0] = len - auio.uio_resid;
932#ifdef KTRACE
933	if (ktruio != NULL) {
934		ktruio->uio_resid = td->td_retval[0];
935		ktrgenio(s, UIO_WRITE, ktruio, error);
936	}
937#endif
938bad:
939	fdrop(fp, td);
940	return (error);
941}
942
943int
944sys_sendto(td, uap)
945	struct thread *td;
946	struct sendto_args /* {
947		int	s;
948		caddr_t	buf;
949		size_t	len;
950		int	flags;
951		caddr_t	to;
952		int	tolen;
953	} */ *uap;
954{
955	struct msghdr msg;
956	struct iovec aiov;
957	int error;
958
959	msg.msg_name = uap->to;
960	msg.msg_namelen = uap->tolen;
961	msg.msg_iov = &aiov;
962	msg.msg_iovlen = 1;
963	msg.msg_control = 0;
964#ifdef COMPAT_OLDSOCK
965	msg.msg_flags = 0;
966#endif
967	aiov.iov_base = uap->buf;
968	aiov.iov_len = uap->len;
969	error = sendit(td, uap->s, &msg, uap->flags);
970	return (error);
971}
972
973#ifdef COMPAT_OLDSOCK
974int
975osend(td, uap)
976	struct thread *td;
977	struct osend_args /* {
978		int	s;
979		caddr_t	buf;
980		int	len;
981		int	flags;
982	} */ *uap;
983{
984	struct msghdr msg;
985	struct iovec aiov;
986	int error;
987
988	msg.msg_name = 0;
989	msg.msg_namelen = 0;
990	msg.msg_iov = &aiov;
991	msg.msg_iovlen = 1;
992	aiov.iov_base = uap->buf;
993	aiov.iov_len = uap->len;
994	msg.msg_control = 0;
995	msg.msg_flags = 0;
996	error = sendit(td, uap->s, &msg, uap->flags);
997	return (error);
998}
999
1000int
1001osendmsg(td, uap)
1002	struct thread *td;
1003	struct osendmsg_args /* {
1004		int	s;
1005		caddr_t	msg;
1006		int	flags;
1007	} */ *uap;
1008{
1009	struct msghdr msg;
1010	struct iovec *iov;
1011	int error;
1012
1013	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1014	if (error)
1015		return (error);
1016	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1017	if (error)
1018		return (error);
1019	msg.msg_iov = iov;
1020	msg.msg_flags = MSG_COMPAT;
1021	error = sendit(td, uap->s, &msg, uap->flags);
1022	free(iov, M_IOV);
1023	return (error);
1024}
1025#endif
1026
1027int
1028sys_sendmsg(td, uap)
1029	struct thread *td;
1030	struct sendmsg_args /* {
1031		int	s;
1032		caddr_t	msg;
1033		int	flags;
1034	} */ *uap;
1035{
1036	struct msghdr msg;
1037	struct iovec *iov;
1038	int error;
1039
1040	error = copyin(uap->msg, &msg, sizeof (msg));
1041	if (error)
1042		return (error);
1043	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1044	if (error)
1045		return (error);
1046	msg.msg_iov = iov;
1047#ifdef COMPAT_OLDSOCK
1048	msg.msg_flags = 0;
1049#endif
1050	error = sendit(td, uap->s, &msg, uap->flags);
1051	free(iov, M_IOV);
1052	return (error);
1053}
1054
1055int
1056kern_recvit(td, s, mp, fromseg, controlp)
1057	struct thread *td;
1058	int s;
1059	struct msghdr *mp;
1060	enum uio_seg fromseg;
1061	struct mbuf **controlp;
1062{
1063	struct uio auio;
1064	struct iovec *iov;
1065	int i;
1066	ssize_t len;
1067	int error;
1068	struct mbuf *m, *control = NULL;
1069	caddr_t ctlbuf;
1070	struct file *fp;
1071	struct socket *so;
1072	struct sockaddr *fromsa = NULL;
1073#ifdef KTRACE
1074	struct uio *ktruio = NULL;
1075#endif
1076
1077	if (controlp != NULL)
1078		*controlp = NULL;
1079
1080	AUDIT_ARG_FD(s);
1081	error = getsock_cap(td->td_proc->p_fd, s, CAP_RECV, &fp, NULL);
1082	if (error)
1083		return (error);
1084	so = fp->f_data;
1085
1086#ifdef MAC
1087	error = mac_socket_check_receive(td->td_ucred, so);
1088	if (error) {
1089		fdrop(fp, td);
1090		return (error);
1091	}
1092#endif
1093
1094	auio.uio_iov = mp->msg_iov;
1095	auio.uio_iovcnt = mp->msg_iovlen;
1096	auio.uio_segflg = UIO_USERSPACE;
1097	auio.uio_rw = UIO_READ;
1098	auio.uio_td = td;
1099	auio.uio_offset = 0;			/* XXX */
1100	auio.uio_resid = 0;
1101	iov = mp->msg_iov;
1102	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
1103		if ((auio.uio_resid += iov->iov_len) < 0) {
1104			fdrop(fp, td);
1105			return (EINVAL);
1106		}
1107	}
1108#ifdef KTRACE
1109	if (KTRPOINT(td, KTR_GENIO))
1110		ktruio = cloneuio(&auio);
1111#endif
1112	len = auio.uio_resid;
1113	error = soreceive(so, &fromsa, &auio, NULL,
1114	    (mp->msg_control || controlp) ? &control : NULL,
1115	    &mp->msg_flags);
1116	if (error) {
1117		if (auio.uio_resid != len && (error == ERESTART ||
1118		    error == EINTR || error == EWOULDBLOCK))
1119			error = 0;
1120	}
1121	if (fromsa != NULL)
1122		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
1123#ifdef KTRACE
1124	if (ktruio != NULL) {
1125		ktruio->uio_resid = len - auio.uio_resid;
1126		ktrgenio(s, UIO_READ, ktruio, error);
1127	}
1128#endif
1129	if (error)
1130		goto out;
1131	td->td_retval[0] = len - auio.uio_resid;
1132	if (mp->msg_name) {
1133		len = mp->msg_namelen;
1134		if (len <= 0 || fromsa == NULL)
1135			len = 0;
1136		else {
1137			/* save sa_len before it is destroyed by MSG_COMPAT */
1138			len = MIN(len, fromsa->sa_len);
1139#ifdef COMPAT_OLDSOCK
1140			if (mp->msg_flags & MSG_COMPAT)
1141				((struct osockaddr *)fromsa)->sa_family =
1142				    fromsa->sa_family;
1143#endif
1144			if (fromseg == UIO_USERSPACE) {
1145				error = copyout(fromsa, mp->msg_name,
1146				    (unsigned)len);
1147				if (error)
1148					goto out;
1149			} else
1150				bcopy(fromsa, mp->msg_name, len);
1151		}
1152		mp->msg_namelen = len;
1153	}
1154	if (mp->msg_control && controlp == NULL) {
1155#ifdef COMPAT_OLDSOCK
1156		/*
1157		 * We assume that old recvmsg calls won't receive access
1158		 * rights and other control info, esp. as control info
1159		 * is always optional and those options didn't exist in 4.3.
1160		 * If we receive rights, trim the cmsghdr; anything else
1161		 * is tossed.
1162		 */
1163		if (control && mp->msg_flags & MSG_COMPAT) {
1164			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1165			    SOL_SOCKET ||
1166			    mtod(control, struct cmsghdr *)->cmsg_type !=
1167			    SCM_RIGHTS) {
1168				mp->msg_controllen = 0;
1169				goto out;
1170			}
1171			control->m_len -= sizeof (struct cmsghdr);
1172			control->m_data += sizeof (struct cmsghdr);
1173		}
1174#endif
1175		len = mp->msg_controllen;
1176		m = control;
1177		mp->msg_controllen = 0;
1178		ctlbuf = mp->msg_control;
1179
1180		while (m && len > 0) {
1181			unsigned int tocopy;
1182
1183			if (len >= m->m_len)
1184				tocopy = m->m_len;
1185			else {
1186				mp->msg_flags |= MSG_CTRUNC;
1187				tocopy = len;
1188			}
1189
1190			if ((error = copyout(mtod(m, caddr_t),
1191					ctlbuf, tocopy)) != 0)
1192				goto out;
1193
1194			ctlbuf += tocopy;
1195			len -= tocopy;
1196			m = m->m_next;
1197		}
1198		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1199	}
1200out:
1201	fdrop(fp, td);
1202#ifdef KTRACE
1203	if (fromsa && KTRPOINT(td, KTR_STRUCT))
1204		ktrsockaddr(fromsa);
1205#endif
1206	if (fromsa)
1207		free(fromsa, M_SONAME);
1208
1209	if (error == 0 && controlp != NULL)
1210		*controlp = control;
1211	else  if (control)
1212		m_freem(control);
1213
1214	return (error);
1215}
1216
1217static int
1218recvit(td, s, mp, namelenp)
1219	struct thread *td;
1220	int s;
1221	struct msghdr *mp;
1222	void *namelenp;
1223{
1224	int error;
1225
1226	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1227	if (error)
1228		return (error);
1229	if (namelenp) {
1230		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1231#ifdef COMPAT_OLDSOCK
1232		if (mp->msg_flags & MSG_COMPAT)
1233			error = 0;	/* old recvfrom didn't check */
1234#endif
1235	}
1236	return (error);
1237}
1238
1239int
1240sys_recvfrom(td, uap)
1241	struct thread *td;
1242	struct recvfrom_args /* {
1243		int	s;
1244		caddr_t	buf;
1245		size_t	len;
1246		int	flags;
1247		struct sockaddr * __restrict	from;
1248		socklen_t * __restrict fromlenaddr;
1249	} */ *uap;
1250{
1251	struct msghdr msg;
1252	struct iovec aiov;
1253	int error;
1254
1255	if (uap->fromlenaddr) {
1256		error = copyin(uap->fromlenaddr,
1257		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1258		if (error)
1259			goto done2;
1260	} else {
1261		msg.msg_namelen = 0;
1262	}
1263	msg.msg_name = uap->from;
1264	msg.msg_iov = &aiov;
1265	msg.msg_iovlen = 1;
1266	aiov.iov_base = uap->buf;
1267	aiov.iov_len = uap->len;
1268	msg.msg_control = 0;
1269	msg.msg_flags = uap->flags;
1270	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1271done2:
1272	return(error);
1273}
1274
1275#ifdef COMPAT_OLDSOCK
1276int
1277orecvfrom(td, uap)
1278	struct thread *td;
1279	struct recvfrom_args *uap;
1280{
1281
1282	uap->flags |= MSG_COMPAT;
1283	return (sys_recvfrom(td, uap));
1284}
1285#endif
1286
1287#ifdef COMPAT_OLDSOCK
1288int
1289orecv(td, uap)
1290	struct thread *td;
1291	struct orecv_args /* {
1292		int	s;
1293		caddr_t	buf;
1294		int	len;
1295		int	flags;
1296	} */ *uap;
1297{
1298	struct msghdr msg;
1299	struct iovec aiov;
1300	int error;
1301
1302	msg.msg_name = 0;
1303	msg.msg_namelen = 0;
1304	msg.msg_iov = &aiov;
1305	msg.msg_iovlen = 1;
1306	aiov.iov_base = uap->buf;
1307	aiov.iov_len = uap->len;
1308	msg.msg_control = 0;
1309	msg.msg_flags = uap->flags;
1310	error = recvit(td, uap->s, &msg, NULL);
1311	return (error);
1312}
1313
1314/*
1315 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1316 * overlays the new one, missing only the flags, and with the (old) access
1317 * rights where the control fields are now.
1318 */
1319int
1320orecvmsg(td, uap)
1321	struct thread *td;
1322	struct orecvmsg_args /* {
1323		int	s;
1324		struct	omsghdr *msg;
1325		int	flags;
1326	} */ *uap;
1327{
1328	struct msghdr msg;
1329	struct iovec *iov;
1330	int error;
1331
1332	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1333	if (error)
1334		return (error);
1335	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1336	if (error)
1337		return (error);
1338	msg.msg_flags = uap->flags | MSG_COMPAT;
1339	msg.msg_iov = iov;
1340	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1341	if (msg.msg_controllen && error == 0)
1342		error = copyout(&msg.msg_controllen,
1343		    &uap->msg->msg_accrightslen, sizeof (int));
1344	free(iov, M_IOV);
1345	return (error);
1346}
1347#endif
1348
1349int
1350sys_recvmsg(td, uap)
1351	struct thread *td;
1352	struct recvmsg_args /* {
1353		int	s;
1354		struct	msghdr *msg;
1355		int	flags;
1356	} */ *uap;
1357{
1358	struct msghdr msg;
1359	struct iovec *uiov, *iov;
1360	int error;
1361
1362	error = copyin(uap->msg, &msg, sizeof (msg));
1363	if (error)
1364		return (error);
1365	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1366	if (error)
1367		return (error);
1368	msg.msg_flags = uap->flags;
1369#ifdef COMPAT_OLDSOCK
1370	msg.msg_flags &= ~MSG_COMPAT;
1371#endif
1372	uiov = msg.msg_iov;
1373	msg.msg_iov = iov;
1374	error = recvit(td, uap->s, &msg, NULL);
1375	if (error == 0) {
1376		msg.msg_iov = uiov;
1377		error = copyout(&msg, uap->msg, sizeof(msg));
1378	}
1379	free(iov, M_IOV);
1380	return (error);
1381}
1382
1383/* ARGSUSED */
1384int
1385sys_shutdown(td, uap)
1386	struct thread *td;
1387	struct shutdown_args /* {
1388		int	s;
1389		int	how;
1390	} */ *uap;
1391{
1392	struct socket *so;
1393	struct file *fp;
1394	int error;
1395
1396	AUDIT_ARG_FD(uap->s);
1397	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp,
1398	    NULL);
1399	if (error == 0) {
1400		so = fp->f_data;
1401		error = soshutdown(so, uap->how);
1402		fdrop(fp, td);
1403	}
1404	return (error);
1405}
1406
1407/* ARGSUSED */
1408int
1409sys_setsockopt(td, uap)
1410	struct thread *td;
1411	struct setsockopt_args /* {
1412		int	s;
1413		int	level;
1414		int	name;
1415		caddr_t	val;
1416		int	valsize;
1417	} */ *uap;
1418{
1419
1420	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1421	    uap->val, UIO_USERSPACE, uap->valsize));
1422}
1423
1424int
1425kern_setsockopt(td, s, level, name, val, valseg, valsize)
1426	struct thread *td;
1427	int s;
1428	int level;
1429	int name;
1430	void *val;
1431	enum uio_seg valseg;
1432	socklen_t valsize;
1433{
1434	int error;
1435	struct socket *so;
1436	struct file *fp;
1437	struct sockopt sopt;
1438
1439	if (val == NULL && valsize != 0)
1440		return (EFAULT);
1441	if ((int)valsize < 0)
1442		return (EINVAL);
1443
1444	sopt.sopt_dir = SOPT_SET;
1445	sopt.sopt_level = level;
1446	sopt.sopt_name = name;
1447	sopt.sopt_val = val;
1448	sopt.sopt_valsize = valsize;
1449	switch (valseg) {
1450	case UIO_USERSPACE:
1451		sopt.sopt_td = td;
1452		break;
1453	case UIO_SYSSPACE:
1454		sopt.sopt_td = NULL;
1455		break;
1456	default:
1457		panic("kern_setsockopt called with bad valseg");
1458	}
1459
1460	AUDIT_ARG_FD(s);
1461	error = getsock_cap(td->td_proc->p_fd, s, CAP_SETSOCKOPT, &fp, NULL);
1462	if (error == 0) {
1463		so = fp->f_data;
1464		error = sosetopt(so, &sopt);
1465		fdrop(fp, td);
1466	}
1467	return(error);
1468}
1469
1470/* ARGSUSED */
1471int
1472sys_getsockopt(td, uap)
1473	struct thread *td;
1474	struct getsockopt_args /* {
1475		int	s;
1476		int	level;
1477		int	name;
1478		void * __restrict	val;
1479		socklen_t * __restrict avalsize;
1480	} */ *uap;
1481{
1482	socklen_t valsize;
1483	int	error;
1484
1485	if (uap->val) {
1486		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1487		if (error)
1488			return (error);
1489	}
1490
1491	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1492	    uap->val, UIO_USERSPACE, &valsize);
1493
1494	if (error == 0)
1495		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1496	return (error);
1497}
1498
1499/*
1500 * Kernel version of getsockopt.
1501 * optval can be a userland or userspace. optlen is always a kernel pointer.
1502 */
1503int
1504kern_getsockopt(td, s, level, name, val, valseg, valsize)
1505	struct thread *td;
1506	int s;
1507	int level;
1508	int name;
1509	void *val;
1510	enum uio_seg valseg;
1511	socklen_t *valsize;
1512{
1513	int error;
1514	struct  socket *so;
1515	struct file *fp;
1516	struct	sockopt sopt;
1517
1518	if (val == NULL)
1519		*valsize = 0;
1520	if ((int)*valsize < 0)
1521		return (EINVAL);
1522
1523	sopt.sopt_dir = SOPT_GET;
1524	sopt.sopt_level = level;
1525	sopt.sopt_name = name;
1526	sopt.sopt_val = val;
1527	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1528	switch (valseg) {
1529	case UIO_USERSPACE:
1530		sopt.sopt_td = td;
1531		break;
1532	case UIO_SYSSPACE:
1533		sopt.sopt_td = NULL;
1534		break;
1535	default:
1536		panic("kern_getsockopt called with bad valseg");
1537	}
1538
1539	AUDIT_ARG_FD(s);
1540	error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL);
1541	if (error == 0) {
1542		so = fp->f_data;
1543		error = sogetopt(so, &sopt);
1544		*valsize = sopt.sopt_valsize;
1545		fdrop(fp, td);
1546	}
1547	return (error);
1548}
1549
1550/*
1551 * getsockname1() - Get socket name.
1552 */
1553/* ARGSUSED */
1554static int
1555getsockname1(td, uap, compat)
1556	struct thread *td;
1557	struct getsockname_args /* {
1558		int	fdes;
1559		struct sockaddr * __restrict asa;
1560		socklen_t * __restrict alen;
1561	} */ *uap;
1562	int compat;
1563{
1564	struct sockaddr *sa;
1565	socklen_t len;
1566	int error;
1567
1568	error = copyin(uap->alen, &len, sizeof(len));
1569	if (error)
1570		return (error);
1571
1572	error = kern_getsockname(td, uap->fdes, &sa, &len);
1573	if (error)
1574		return (error);
1575
1576	if (len != 0) {
1577#ifdef COMPAT_OLDSOCK
1578		if (compat)
1579			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1580#endif
1581		error = copyout(sa, uap->asa, (u_int)len);
1582	}
1583	free(sa, M_SONAME);
1584	if (error == 0)
1585		error = copyout(&len, uap->alen, sizeof(len));
1586	return (error);
1587}
1588
1589int
1590kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1591    socklen_t *alen)
1592{
1593	struct socket *so;
1594	struct file *fp;
1595	socklen_t len;
1596	int error;
1597
1598	AUDIT_ARG_FD(fd);
1599	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, NULL);
1600	if (error)
1601		return (error);
1602	so = fp->f_data;
1603	*sa = NULL;
1604	CURVNET_SET(so->so_vnet);
1605	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1606	CURVNET_RESTORE();
1607	if (error)
1608		goto bad;
1609	if (*sa == NULL)
1610		len = 0;
1611	else
1612		len = MIN(*alen, (*sa)->sa_len);
1613	*alen = len;
1614#ifdef KTRACE
1615	if (KTRPOINT(td, KTR_STRUCT))
1616		ktrsockaddr(*sa);
1617#endif
1618bad:
1619	fdrop(fp, td);
1620	if (error && *sa) {
1621		free(*sa, M_SONAME);
1622		*sa = NULL;
1623	}
1624	return (error);
1625}
1626
/*
 * getsockname(2) system call: getsockname1() without the compat layout.
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1635
1636#ifdef COMPAT_OLDSOCK
/*
 * Old getsockname entry point: getsockname1() with the compat layout.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
1645#endif /* COMPAT_OLDSOCK */
1646
1647/*
1648 * getpeername1() - Get name of peer for connected socket.
1649 */
1650/* ARGSUSED */
1651static int
1652getpeername1(td, uap, compat)
1653	struct thread *td;
1654	struct getpeername_args /* {
1655		int	fdes;
1656		struct sockaddr * __restrict	asa;
1657		socklen_t * __restrict	alen;
1658	} */ *uap;
1659	int compat;
1660{
1661	struct sockaddr *sa;
1662	socklen_t len;
1663	int error;
1664
1665	error = copyin(uap->alen, &len, sizeof (len));
1666	if (error)
1667		return (error);
1668
1669	error = kern_getpeername(td, uap->fdes, &sa, &len);
1670	if (error)
1671		return (error);
1672
1673	if (len != 0) {
1674#ifdef COMPAT_OLDSOCK
1675		if (compat)
1676			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1677#endif
1678		error = copyout(sa, uap->asa, (u_int)len);
1679	}
1680	free(sa, M_SONAME);
1681	if (error == 0)
1682		error = copyout(&len, uap->alen, sizeof(len));
1683	return (error);
1684}
1685
1686int
1687kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1688    socklen_t *alen)
1689{
1690	struct socket *so;
1691	struct file *fp;
1692	socklen_t len;
1693	int error;
1694
1695	AUDIT_ARG_FD(fd);
1696	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, NULL);
1697	if (error)
1698		return (error);
1699	so = fp->f_data;
1700	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1701		error = ENOTCONN;
1702		goto done;
1703	}
1704	*sa = NULL;
1705	CURVNET_SET(so->so_vnet);
1706	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1707	CURVNET_RESTORE();
1708	if (error)
1709		goto bad;
1710	if (*sa == NULL)
1711		len = 0;
1712	else
1713		len = MIN(*alen, (*sa)->sa_len);
1714	*alen = len;
1715#ifdef KTRACE
1716	if (KTRPOINT(td, KTR_STRUCT))
1717		ktrsockaddr(*sa);
1718#endif
1719bad:
1720	if (error && *sa) {
1721		free(*sa, M_SONAME);
1722		*sa = NULL;
1723	}
1724done:
1725	fdrop(fp, td);
1726	return (error);
1727}
1728
/*
 * getpeername(2) system call: getpeername1() without the compat layout.
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1737
1738#ifdef COMPAT_OLDSOCK
/*
 * Old getpeername entry point: getpeername1() with the compat layout.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
1748#endif /* COMPAT_OLDSOCK */
1749
1750int
1751sockargs(mp, buf, buflen, type)
1752	struct mbuf **mp;
1753	caddr_t buf;
1754	int buflen, type;
1755{
1756	struct sockaddr *sa;
1757	struct mbuf *m;
1758	int error;
1759
1760	if (buflen > MLEN) {
1761#ifdef COMPAT_OLDSOCK
1762		if (type == MT_SONAME && buflen <= 112)
1763			buflen = MLEN;		/* unix domain compat. hack */
1764		else
1765#endif
1766			if (buflen > MCLBYTES)
1767				return (EINVAL);
1768	}
1769	m = m_get2(buflen, M_WAITOK, type, 0);
1770	m->m_len = buflen;
1771	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1772	if (error)
1773		(void) m_free(m);
1774	else {
1775		*mp = m;
1776		if (type == MT_SONAME) {
1777			sa = mtod(m, struct sockaddr *);
1778
1779#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1780			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1781				sa->sa_family = sa->sa_len;
1782#endif
1783			sa->sa_len = buflen;
1784		}
1785	}
1786	return (error);
1787}
1788
1789int
1790getsockaddr(namp, uaddr, len)
1791	struct sockaddr **namp;
1792	caddr_t uaddr;
1793	size_t len;
1794{
1795	struct sockaddr *sa;
1796	int error;
1797
1798	if (len > SOCK_MAXADDRLEN)
1799		return (ENAMETOOLONG);
1800	if (len < offsetof(struct sockaddr, sa_data[0]))
1801		return (EINVAL);
1802	sa = malloc(len, M_SONAME, M_WAITOK);
1803	error = copyin(uaddr, sa, len);
1804	if (error) {
1805		free(sa, M_SONAME);
1806	} else {
1807#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1808		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1809			sa->sa_family = sa->sa_len;
1810#endif
1811		sa->sa_len = len;
1812		*namp = sa;
1813	}
1814	return (error);
1815}
1816
1817#include <sys/condvar.h>
1818
1819struct sendfile_sync {
1820	struct mtx	mtx;
1821	struct cv	cv;
1822	unsigned	count;
1823};
1824
1825/*
1826 * Detach mapped page and release resources back to the system.
1827 */
1828void
1829sf_buf_mext(void *addr, void *args)
1830{
1831	vm_page_t m;
1832	struct sendfile_sync *sfs;
1833
1834	m = sf_buf_page(args);
1835	sf_buf_free(args);
1836	vm_page_lock(m);
1837	vm_page_unwire(m, 0);
1838	/*
1839	 * Check for the object going away on us. This can
1840	 * happen since we don't hold a reference to it.
1841	 * If so, we're responsible for freeing the page.
1842	 */
1843	if (m->wire_count == 0 && m->object == NULL)
1844		vm_page_free(m);
1845	vm_page_unlock(m);
1846	if (addr == NULL)
1847		return;
1848	sfs = addr;
1849	mtx_lock(&sfs->mtx);
1850	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1851	if (--sfs->count == 0)
1852		cv_signal(&sfs->cv);
1853	mtx_unlock(&sfs->mtx);
1854}
1855
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send the file referred to by 'fd', beginning at 'offset', to the socket
 * 's'.  At most 'nbytes' of the file are sent; nbytes == 0 means send until
 * EOF.  A header and/or trailer may optionally be added to the socket
 * output.  If 'sbytes' is given, the total number of bytes sent is written
 * into *sbytes.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
1873
1874static int
1875do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1876{
1877	struct sf_hdtr hdtr;
1878	struct uio *hdr_uio, *trl_uio;
1879	int error;
1880
1881	hdr_uio = trl_uio = NULL;
1882
1883	if (uap->hdtr != NULL) {
1884		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1885		if (error)
1886			goto out;
1887		if (hdtr.headers != NULL) {
1888			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1889			if (error)
1890				goto out;
1891		}
1892		if (hdtr.trailers != NULL) {
1893			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1894			if (error)
1895				goto out;
1896
1897		}
1898	}
1899
1900	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1901out:
1902	if (hdr_uio)
1903		free(hdr_uio, M_IOV);
1904	if (trl_uio)
1905		free(trl_uio, M_IOV);
1906	return (error);
1907}
1908
1909#ifdef COMPAT_FREEBSD4
1910int
1911freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1912{
1913	struct sendfile_args args;
1914
1915	args.fd = uap->fd;
1916	args.s = uap->s;
1917	args.offset = uap->offset;
1918	args.nbytes = uap->nbytes;
1919	args.hdtr = uap->hdtr;
1920	args.sbytes = uap->sbytes;
1921	args.flags = uap->flags;
1922
1923	return (do_sendfile(td, &args, 1));
1924}
1925#endif /* COMPAT_FREEBSD4 */
1926
/*
 * kern_sendfile() - kernel implementation of sendfile(2).
 *
 * Sends data from the regular file uap->fd, starting at uap->offset, to the
 * connected stream socket uap->s.  uap->nbytes bounds the amount of file
 * data; 0 means "up to the file size reported by VOP_GETATTR()".
 *
 * hdr_uio, if non-NULL, describes headers that are copied into an mbuf
 * chain and sent ahead of the file data.  trl_uio, if non-NULL, describes
 * trailers that are written with kern_writev() after the file data.
 * A non-zero compat makes uap->nbytes include the header length, so the
 * header size is subtracted from it first.
 *
 * Flags examined in uap->flags: SF_MNOWAIT (do not sleep for mbufs/sf_bufs),
 * SF_SYNC (wait at the end until every queued sf_buf has been released) and
 * SF_NODISKIO (fail with EBUSY rather than read a page from backing store).
 *
 * If uap->sbytes is non-NULL the total number of bytes sent, headers and
 * trailers included, is copied out to it even when an error is returned.
 * Returns 0 or an errno value; ERESTART is converted to EINTR.
 */
1927int
1928kern_sendfile(struct thread *td, struct sendfile_args *uap,
1929    struct uio *hdr_uio, struct uio *trl_uio, int compat)
1930{
1931	struct file *sock_fp;
1932	struct vnode *vp;
1933	struct vm_object *obj = NULL;
1934	struct socket *so = NULL;
1935	struct mbuf *m = NULL;
1936	struct sf_buf *sf;
1937	struct vm_page *pg;
1938	struct vattr va;
1939	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1940	int error, hdrlen = 0, mnw = 0;
1941	int bsize;
1942	struct sendfile_sync *sfs = NULL;
1943
1944	/*
1945	 * The file descriptor must be a regular file and have a
1946	 * backing VM object.
1947	 * File offset must be positive.  If it goes beyond EOF
1948	 * we send only the header/trailer and no payload data.
1949	 */
1950	AUDIT_ARG_FD(uap->fd);
1951	/*
1952	 * sendfile(2) can start at any offset within a file so we require
1953	 * CAP_READ+CAP_SEEK = CAP_PREAD.
1954	 */
	/*
	 * NOTE(review): vp has no initializer but is tested at the out:
	 * label even when this call fails; presumably fgetvp_read() stores
	 * NULL through &vp on failure -- confirm.
	 */
1955	if ((error = fgetvp_read(td, uap->fd, CAP_PREAD, &vp)) != 0)
1956		goto out;
1957	vn_lock(vp, LK_SHARED | LK_RETRY);
1958	if (vp->v_type == VREG) {
1959		obj = vp->v_object;
1960		if (obj != NULL) {
1961			/*
1962			 * Temporarily increase the backing VM
1963			 * object's reference count so that a forced
1964			 * reclamation of its vnode does not
1965			 * immediately destroy it.
1966			 */
1967			VM_OBJECT_WLOCK(obj);
1968			if ((obj->flags & OBJ_DEAD) == 0) {
1969				vm_object_reference_locked(obj);
1970				VM_OBJECT_WUNLOCK(obj);
1971			} else {
1972				VM_OBJECT_WUNLOCK(obj);
1973				obj = NULL;
1974			}
1975		}
1976	}
1977	VOP_UNLOCK(vp, 0);
1978	if (obj == NULL) {
1979		error = EINVAL;
1980		goto out;
1981	}
1982	if (uap->offset < 0) {
1983		error = EINVAL;
1984		goto out;
1985	}
1986
1987	/*
1988	 * The socket must be a stream socket and connected.
1989	 * Remember if it a blocking or non-blocking socket.
1990	 */
1991	if ((error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SEND,
1992	    &sock_fp, NULL)) != 0)
1993		goto out;
1994	so = sock_fp->f_data;
1995	if (so->so_type != SOCK_STREAM) {
1996		error = EINVAL;
1997		goto out;
1998	}
1999	if ((so->so_state & SS_ISCONNECTED) == 0) {
2000		error = ENOTCONN;
2001		goto out;
2002	}
2003	/*
2004	 * Do not wait on memory allocations but return ENOMEM for
2005	 * caller to retry later.
2006	 * XXX: Experimental.
2007	 */
2008	if (uap->flags & SF_MNOWAIT)
2009		mnw = 1;
2010
	/*
	 * SF_SYNC: allocate the tracker whose count follows the sf_buf-backed
	 * mbufs queued below; it is waited on and destroyed at out:.
	 */
2011	if (uap->flags & SF_SYNC) {
2012		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
2013		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2014		cv_init(&sfs->cv, "sendfile");
2015	}
2016
2017#ifdef MAC
2018	error = mac_socket_check_send(td->td_ucred, so);
2019	if (error)
2020		goto out;
2021#endif
2022
2023	/* If headers are specified copy them into mbufs. */
2024	if (hdr_uio != NULL) {
2025		hdr_uio->uio_td = td;
2026		hdr_uio->uio_rw = UIO_WRITE;
2027		if (hdr_uio->uio_resid > 0) {
2028			/*
2029			 * In FBSD < 5.0 the nbytes to send also included
2030			 * the header.  If compat is specified subtract the
2031			 * header size from nbytes.
2032			 */
2033			if (compat) {
2034				if (uap->nbytes > hdr_uio->uio_resid)
2035					uap->nbytes -= hdr_uio->uio_resid;
2036				else
2037					uap->nbytes = 0;
2038			}
2039			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
2040			    0, 0, 0);
2041			if (m == NULL) {
2042				error = mnw ? EAGAIN : ENOBUFS;
2043				goto out;
2044			}
2045			hdrlen = m_length(m, NULL);
2046		}
2047	}
2048
2049	/*
2050	 * Protect against multiple writers to the socket.
2051	 *
2052	 * XXXRW: Historically this has assumed non-interruptibility, so now
2053	 * we implement that, but possibly shouldn't.
2054	 */
	/*
	 * From here on failures leave through done:, which drops this lock;
	 * the earlier failures jump straight to out:.
	 */
2055	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
2056
2057	/*
2058	 * Loop through the pages of the file, starting with the requested
2059	 * offset. Get a file page (do I/O if necessary), map the file page
2060	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2061	 * it on the socket.
2062	 * This is done in two loops.  The inner loop turns as many pages
2063	 * as it can, up to available socket buffer space, without blocking
2064	 * into mbufs to have it bulk delivered into the socket send buffer.
2065	 * The outer loop checks the state and available space of the socket
2066	 * and takes care of the overall progress.
2067	 */
2068	for (off = uap->offset, rem = uap->nbytes; ; ) {
2069		struct mbuf *mtail = NULL;
2070		int loopbytes = 0;
2071		int space = 0;
2072		int done = 0;
2073
2074		/*
2075		 * Check the socket state for ongoing connection,
2076		 * no errors and space in socket buffer.
2077		 * If space is low allow for the remainder of the
2078		 * file to be processed if it fits the socket buffer.
2079		 * Otherwise block in waiting for sufficient space
2080		 * to proceed, or if the socket is nonblocking, return
2081		 * to userland with EAGAIN while reporting how far
2082		 * we've come.
2083		 * We wait until the socket buffer has significant free
2084		 * space to do bulk sends.  This makes good use of file
2085		 * system read ahead and allows packet segmentation
2086		 * offloading hardware to take over lots of work.  If
2087		 * we were not careful here we would send off only one
2088		 * sfbuf at a time.
2089		 */
2090		SOCKBUF_LOCK(&so->so_snd);
2091		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2092			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2093retry_space:
2094		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2095			error = EPIPE;
2096			SOCKBUF_UNLOCK(&so->so_snd);
2097			goto done;
2098		} else if (so->so_error) {
2099			error = so->so_error;
2100			so->so_error = 0;
2101			SOCKBUF_UNLOCK(&so->so_snd);
2102			goto done;
2103		}
2104		space = sbspace(&so->so_snd);
2105		if (space < rem &&
2106		    (space <= 0 ||
2107		     space < so->so_snd.sb_lowat)) {
2108			if (so->so_state & SS_NBIO) {
2109				SOCKBUF_UNLOCK(&so->so_snd);
2110				error = EAGAIN;
2111				goto done;
2112			}
2113			/*
2114			 * sbwait drops the lock while sleeping.
2115			 * When we loop back to retry_space the
2116			 * state may have changed and we retest
2117			 * for it.
2118			 */
2119			error = sbwait(&so->so_snd);
2120			/*
2121			 * An error from sbwait usually indicates that we've
2122			 * been interrupted by a signal. If we've sent anything
2123			 * then return bytes sent, otherwise return the error.
2124			 */
2125			if (error) {
2126				SOCKBUF_UNLOCK(&so->so_snd);
2127				goto done;
2128			}
2129			goto retry_space;
2130		}
2131		SOCKBUF_UNLOCK(&so->so_snd);
2132
2133		/*
2134		 * Reduce space in the socket buffer by the size of
2135		 * the header mbuf chain.
2136		 * hdrlen is set to 0 after the first loop.
2137		 */
2138		space -= hdrlen;
2139
2140		error = vn_lock(vp, LK_SHARED);
2141		if (error != 0)
2142			goto done;
2143		error = VOP_GETATTR(vp, &va, td->td_ucred);
2144		if (error != 0) {
2145			VOP_UNLOCK(vp, 0);
2146			goto done;
2147		}
2148		bsize = vp->v_mount->mnt_stat.f_iosize;
2149
2150		/*
2151		 * Loop and construct maximum sized mbuf chain to be bulk
2152		 * dumped into socket buffer.
2153		 */
2154		while (1) {
2155			vm_pindex_t pindex;
2156			vm_offset_t pgoff;
2157			struct mbuf *m0;
2158
2159			/*
2160			 * Calculate the amount to transfer.
2161			 * Not to exceed a page, the EOF,
2162			 * or the passed in nbytes.
2163			 */
2164			pgoff = (vm_offset_t)(off & PAGE_MASK);
2165			if (uap->nbytes)
2166				rem = (uap->nbytes - fsbytes - loopbytes);
2167			else
2168				rem = va.va_size -
2169				    uap->offset - fsbytes - loopbytes;
2170			xfsize = omin(PAGE_SIZE - pgoff, rem);
2171			xfsize = omin(space - loopbytes, xfsize);
2172			if (xfsize <= 0) {
2173				done = 1;		/* all data sent */
2174				break;
2175			}
2176
2177			/*
2178			 * We've already overfilled the socket.
2179			 * Let the outer loop figure out how to handle it.
2180			 */
2181			if (space <= loopbytes) {
2182				done = 0;
2183				break;
2184			}
2185
2186			/*
2187			 * Attempt to look up the page.  Allocate
2188			 * if not found or wait and loop if busy.
2189			 */
2190			pindex = OFF_TO_IDX(off);
2191			VM_OBJECT_WLOCK(obj);
2192			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2193			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
2194
2195			/*
2196			 * Check if page is valid for what we need,
2197			 * otherwise initiate I/O.
2198			 * If we already turned some pages into mbufs,
2199			 * send them off before we come here again and
2200			 * block.
2201			 */
2202			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2203				VM_OBJECT_WUNLOCK(obj);
2204			else if (m != NULL)
2205				error = EAGAIN;	/* send what we already got */
2206			else if (uap->flags & SF_NODISKIO)
2207				error = EBUSY;
2208			else {
2209				ssize_t resid;
2210
2211				/*
2212				 * Ensure that our page is still around
2213				 * when the I/O completes.
2214				 */
2215				vm_page_io_start(pg);
2216				VM_OBJECT_WUNLOCK(obj);
2217
2218				/*
2219				 * Get the page from backing store.
2220				 * XXXMAC: Because we don't have fp->f_cred
2221				 * here, we pass in NOCRED.  This is probably
2222				 * wrong, but is consistent with our original
2223				 * implementation.
2224				 */
2225				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2226				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2227				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2228				    td->td_ucred, NOCRED, &resid, td);
2229				VM_OBJECT_WLOCK(obj);
2230				vm_page_io_finish(pg);
2231				if (!error)
2232					VM_OBJECT_WUNLOCK(obj);
2233				mbstat.sf_iocnt++;
2234			}
			/*
			 * Every path that arrives here with error set still
			 * holds the object write lock; it is dropped below
			 * once the page has been unwired.
			 */
2235			if (error) {
2236				vm_page_lock(pg);
2237				vm_page_unwire(pg, 0);
2238				/*
2239				 * See if anyone else might know about
2240				 * this page.  If not and it is not valid,
2241				 * then free it.
2242				 */
2243				if (pg->wire_count == 0 && pg->valid == 0 &&
2244				    pg->busy == 0 && !(pg->oflags & VPO_BUSY))
2245					vm_page_free(pg);
2246				vm_page_unlock(pg);
2247				VM_OBJECT_WUNLOCK(obj);
2248				if (error == EAGAIN)
2249					error = 0;	/* not a real error */
2250				break;
2251			}
2252
2253			/*
2254			 * Get a sendfile buf.  When allocating the
2255			 * first buffer for mbuf chain, we usually
2256			 * wait as long as necessary, but this wait
2257			 * can be interrupted.  For consequent
2258			 * buffers, do not sleep, since several
2259			 * threads might exhaust the buffers and then
2260			 * deadlock.
2261			 */
2262			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
2263			    SFB_CATCH);
2264			if (sf == NULL) {
2265				mbstat.sf_allocfail++;
2266				vm_page_lock(pg);
2267				vm_page_unwire(pg, 0);
2268				KASSERT(pg->object != NULL,
2269				    ("kern_sendfile: object disappeared"));
2270				vm_page_unlock(pg);
2271				if (m == NULL)
2272					error = (mnw ? EAGAIN : EINTR);
2273				break;
2274			}
2275
2276			/*
2277			 * Get an mbuf and set it up as having
2278			 * external storage.
2279			 */
2280			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2281			if (m0 == NULL) {
2282				error = (mnw ? EAGAIN : ENOBUFS);
				/* NULL: this sf_buf was never counted in sfs. */
2283				sf_buf_mext(NULL, sf);
2284				break;
2285			}
2286			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
2287			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
2288			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
2289				error = (mnw ? EAGAIN : ENOBUFS);
2290				sf_buf_mext(NULL, sf);
2291				m_freem(m0);
2292				break;
2293			}
2294			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2295			m0->m_len = xfsize;
2296
2297			/* Append to mbuf chain. */
2298			if (mtail != NULL)
2299				mtail->m_next = m0;
2300			else if (m != NULL)
2301				m_last(m)->m_next = m0;
2302			else
2303				m = m0;
2304			mtail = m0;
2305
2306			/* Keep track of bits processed. */
2307			loopbytes += xfsize;
2308			off += xfsize;
2309
			/*
			 * Count this mbuf for SF_SYNC; sf_buf_mext(), handed
			 * to m_extadd() above together with sfs, decrements
			 * the count again.
			 */
2310			if (sfs != NULL) {
2311				mtx_lock(&sfs->mtx);
2312				sfs->count++;
2313				mtx_unlock(&sfs->mtx);
2314			}
2315		}
2316
2317		VOP_UNLOCK(vp, 0);
2318
2319		/* Add the buffer chain to the socket buffer. */
2320		if (m != NULL) {
2321			int mlen, err;
2322
2323			mlen = m_length(m, NULL);
2324			SOCKBUF_LOCK(&so->so_snd);
2325			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2326				error = EPIPE;
2327				SOCKBUF_UNLOCK(&so->so_snd);
2328				goto done;
2329			}
2330			SOCKBUF_UNLOCK(&so->so_snd);
2331			CURVNET_SET(so->so_vnet);
2332			/* Avoid error aliasing. */
2333			err = (*so->so_proto->pr_usrreqs->pru_send)
2334				    (so, 0, m, NULL, NULL, td);
2335			CURVNET_RESTORE();
2336			if (err == 0) {
2337				/*
2338				 * We need two counters to get the
2339				 * file offset and nbytes to send
2340				 * right:
2341				 * - sbytes contains the total amount
2342				 *   of bytes sent, including headers.
2343				 * - fsbytes contains the total amount
2344				 *   of bytes sent from the file.
2345				 */
2346				sbytes += mlen;
2347				fsbytes += mlen;
2348				if (hdrlen) {
2349					fsbytes -= hdrlen;
2350					hdrlen = 0;
2351				}
2352			} else if (error == 0)
2353				error = err;
2354			m = NULL;	/* pru_send always consumes */
2355		}
2356
2357		/* Quit outer loop on error or when we're done. */
2358		if (done)
2359			break;
2360		if (error)
2361			goto done;
2362	}
2363
2364	/*
2365	 * Send trailers. Wimp out and use writev(2).
2366	 */
2367	if (trl_uio != NULL) {
		/* The send buffer lock is released before kern_writev(). */
2368		sbunlock(&so->so_snd);
2369		error = kern_writev(td, uap->s, trl_uio);
2370		if (error == 0)
2371			sbytes += td->td_retval[0];
2372		goto out;
2373	}
2374
2375done:
2376	sbunlock(&so->so_snd);
2377out:
2378	/*
2379	 * If there was no error we have to clear td->td_retval[0]
2380	 * because it may have been set by writev.
2381	 */
2382	if (error == 0) {
2383		td->td_retval[0] = 0;
2384	}
	/* NOTE(review): the result of this copyout() is not checked. */
2385	if (uap->sbytes != NULL) {
2386		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2387	}
2388	if (obj != NULL)
2389		vm_object_deallocate(obj);
2390	if (vp != NULL)
2391		vrele(vp);
2392	if (so)
2393		fdrop(sock_fp, td);
2394	if (m)
2395		m_freem(m);
2396
	/*
	 * SF_SYNC: sleep until sf_buf_mext() has brought the count back to
	 * zero, then tear the tracker down.
	 */
2397	if (sfs != NULL) {
2398		mtx_lock(&sfs->mtx);
2399		if (sfs->count != 0)
2400			cv_wait(&sfs->cv, &sfs->mtx);
2401		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2402		cv_destroy(&sfs->cv);
2403		mtx_destroy(&sfs->mtx);
2404		free(sfs, M_TEMP);
2405	}
2406
2407	if (error == ERESTART)
2408		error = EINTR;
2409
2410	return (error);
2411}
2412
2413/*
2414 * SCTP syscalls.
2415 * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2416 * otherwise all return EOPNOTSUPP.
2417 * XXX: We should make this loadable one day.
2418 */
2419int
2420sys_sctp_peeloff(td, uap)
2421	struct thread *td;
2422	struct sctp_peeloff_args /* {
2423		int	sd;
2424		caddr_t	name;
2425	} */ *uap;
2426{
2427#if (defined(INET) || defined(INET6)) && defined(SCTP)
2428	struct file *nfp = NULL;
2429	int error;
2430	struct socket *head, *so;
2431	int fd;
2432	u_int fflag;
2433
2434	AUDIT_ARG_FD(uap->sd);
2435	error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
2436	if (error)
2437		goto done2;
2438	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
2439		error = EOPNOTSUPP;
2440		goto done;
2441	}
2442	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2443	if (error)
2444		goto done;
2445	/*
2446	 * At this point we know we do have a assoc to pull
2447	 * we proceed to get the fd setup. This may block
2448	 * but that is ok.
2449	 */
2450
2451	error = falloc(td, &nfp, &fd, 0);
2452	if (error)
2453		goto done;
2454	td->td_retval[0] = fd;
2455
2456	CURVNET_SET(head->so_vnet);
2457	so = sonewconn(head, SS_ISCONNECTED);
2458	if (so == NULL) {
2459		error = ENOMEM;
2460		goto noconnection;
2461	}
2462	/*
2463	 * Before changing the flags on the socket, we have to bump the
2464	 * reference count.  Otherwise, if the protocol calls sofree(),
2465	 * the socket will be released due to a zero refcount.
2466	 */
2467        SOCK_LOCK(so);
2468        soref(so);                      /* file descriptor reference */
2469        SOCK_UNLOCK(so);
2470
2471	ACCEPT_LOCK();
2472
2473	TAILQ_REMOVE(&head->so_comp, so, so_list);
2474	head->so_qlen--;
2475	so->so_state |= (head->so_state & SS_NBIO);
2476	so->so_state &= ~SS_NOFDREF;
2477	so->so_qstate &= ~SQ_COMP;
2478	so->so_head = NULL;
2479	ACCEPT_UNLOCK();
2480	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
2481	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2482	if (error)
2483		goto noconnection;
2484	if (head->so_sigio != NULL)
2485		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2486
2487noconnection:
2488	/*
2489	 * close the new descriptor, assuming someone hasn't ripped it
2490	 * out from under us.
2491	 */
2492	if (error)
2493		fdclose(td->td_proc->p_fd, nfp, fd, td);
2494
2495	/*
2496	 * Release explicitly held references before returning.
2497	 */
2498	CURVNET_RESTORE();
2499done:
2500	if (nfp != NULL)
2501		fdrop(nfp, td);
2502	fputsock(head);
2503done2:
2504	return (error);
2505#else  /* SCTP */
2506	return (EOPNOTSUPP);
2507#endif /* SCTP */
2508}
2509
2510int
2511sys_sctp_generic_sendmsg (td, uap)
2512	struct thread *td;
2513	struct sctp_generic_sendmsg_args /* {
2514		int sd,
2515		caddr_t msg,
2516		int mlen,
2517		caddr_t to,
2518		__socklen_t tolen,
2519		struct sctp_sndrcvinfo *sinfo,
2520		int flags
2521	} */ *uap;
2522{
2523#if (defined(INET) || defined(INET6)) && defined(SCTP)
2524	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2525	struct socket *so;
2526	struct file *fp = NULL;
2527	int error = 0, len;
2528	struct sockaddr *to = NULL;
2529#ifdef KTRACE
2530	struct uio *ktruio = NULL;
2531#endif
2532	struct uio auio;
2533	struct iovec iov[1];
2534	cap_rights_t rights;
2535
2536	if (uap->sinfo) {
2537		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2538		if (error)
2539			return (error);
2540		u_sinfo = &sinfo;
2541	}
2542
2543	rights = CAP_SEND;
2544	if (uap->tolen) {
2545		error = getsockaddr(&to, uap->to, uap->tolen);
2546		if (error) {
2547			to = NULL;
2548			goto sctp_bad2;
2549		}
2550		rights |= CAP_CONNECT;
2551	}
2552
2553	AUDIT_ARG_FD(uap->sd);
2554	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
2555	if (error)
2556		goto sctp_bad;
2557#ifdef KTRACE
2558	if (to && (KTRPOINT(td, KTR_STRUCT)))
2559		ktrsockaddr(to);
2560#endif
2561
2562	iov[0].iov_base = uap->msg;
2563	iov[0].iov_len = uap->mlen;
2564
2565	so = (struct socket *)fp->f_data;
2566	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2567		error = EOPNOTSUPP;
2568		goto sctp_bad;
2569	}
2570#ifdef MAC
2571	error = mac_socket_check_send(td->td_ucred, so);
2572	if (error)
2573		goto sctp_bad;
2574#endif /* MAC */
2575
2576	auio.uio_iov =  iov;
2577	auio.uio_iovcnt = 1;
2578	auio.uio_segflg = UIO_USERSPACE;
2579	auio.uio_rw = UIO_WRITE;
2580	auio.uio_td = td;
2581	auio.uio_offset = 0;			/* XXX */
2582	auio.uio_resid = 0;
2583	len = auio.uio_resid = uap->mlen;
2584	CURVNET_SET(so->so_vnet);
2585	error = sctp_lower_sosend(so, to, &auio,
2586		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2587		    uap->flags, u_sinfo, td);
2588	CURVNET_RESTORE();
2589	if (error) {
2590		if (auio.uio_resid != len && (error == ERESTART ||
2591		    error == EINTR || error == EWOULDBLOCK))
2592			error = 0;
2593		/* Generation of SIGPIPE can be controlled per socket. */
2594		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2595		    !(uap->flags & MSG_NOSIGNAL)) {
2596			PROC_LOCK(td->td_proc);
2597			tdsignal(td, SIGPIPE);
2598			PROC_UNLOCK(td->td_proc);
2599		}
2600	}
2601	if (error == 0)
2602		td->td_retval[0] = len - auio.uio_resid;
2603#ifdef KTRACE
2604	if (ktruio != NULL) {
2605		ktruio->uio_resid = td->td_retval[0];
2606		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2607	}
2608#endif /* KTRACE */
2609sctp_bad:
2610	if (fp)
2611		fdrop(fp, td);
2612sctp_bad2:
2613	if (to)
2614		free(to, M_SONAME);
2615	return (error);
2616#else  /* SCTP */
2617	return (EOPNOTSUPP);
2618#endif /* SCTP */
2619}
2620
2621int
2622sys_sctp_generic_sendmsg_iov(td, uap)
2623	struct thread *td;
2624	struct sctp_generic_sendmsg_iov_args /* {
2625		int sd,
2626		struct iovec *iov,
2627		int iovlen,
2628		caddr_t to,
2629		__socklen_t tolen,
2630		struct sctp_sndrcvinfo *sinfo,
2631		int flags
2632	} */ *uap;
2633{
2634#if (defined(INET) || defined(INET6)) && defined(SCTP)
2635	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2636	struct socket *so;
2637	struct file *fp = NULL;
2638	int error=0, i;
2639	ssize_t len;
2640	struct sockaddr *to = NULL;
2641#ifdef KTRACE
2642	struct uio *ktruio = NULL;
2643#endif
2644	struct uio auio;
2645	struct iovec *iov, *tiov;
2646	cap_rights_t rights;
2647
2648	if (uap->sinfo) {
2649		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2650		if (error)
2651			return (error);
2652		u_sinfo = &sinfo;
2653	}
2654	rights = CAP_SEND;
2655	if (uap->tolen) {
2656		error = getsockaddr(&to, uap->to, uap->tolen);
2657		if (error) {
2658			to = NULL;
2659			goto sctp_bad2;
2660		}
2661		rights |= CAP_CONNECT;
2662	}
2663
2664	AUDIT_ARG_FD(uap->sd);
2665	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
2666	if (error)
2667		goto sctp_bad1;
2668
2669#ifdef COMPAT_FREEBSD32
2670	if (SV_CURPROC_FLAG(SV_ILP32))
2671		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2672		    uap->iovlen, &iov, EMSGSIZE);
2673	else
2674#endif
2675		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2676	if (error)
2677		goto sctp_bad1;
2678#ifdef KTRACE
2679	if (to && (KTRPOINT(td, KTR_STRUCT)))
2680		ktrsockaddr(to);
2681#endif
2682
2683	so = (struct socket *)fp->f_data;
2684	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2685		error = EOPNOTSUPP;
2686		goto sctp_bad;
2687	}
2688#ifdef MAC
2689	error = mac_socket_check_send(td->td_ucred, so);
2690	if (error)
2691		goto sctp_bad;
2692#endif /* MAC */
2693
2694	auio.uio_iov = iov;
2695	auio.uio_iovcnt = uap->iovlen;
2696	auio.uio_segflg = UIO_USERSPACE;
2697	auio.uio_rw = UIO_WRITE;
2698	auio.uio_td = td;
2699	auio.uio_offset = 0;			/* XXX */
2700	auio.uio_resid = 0;
2701	tiov = iov;
2702	for (i = 0; i <uap->iovlen; i++, tiov++) {
2703		if ((auio.uio_resid += tiov->iov_len) < 0) {
2704			error = EINVAL;
2705			goto sctp_bad;
2706		}
2707	}
2708	len = auio.uio_resid;
2709	CURVNET_SET(so->so_vnet);
2710	error = sctp_lower_sosend(so, to, &auio,
2711		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2712		    uap->flags, u_sinfo, td);
2713	CURVNET_RESTORE();
2714	if (error) {
2715		if (auio.uio_resid != len && (error == ERESTART ||
2716		    error == EINTR || error == EWOULDBLOCK))
2717			error = 0;
2718		/* Generation of SIGPIPE can be controlled per socket */
2719		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2720		    !(uap->flags & MSG_NOSIGNAL)) {
2721			PROC_LOCK(td->td_proc);
2722			tdsignal(td, SIGPIPE);
2723			PROC_UNLOCK(td->td_proc);
2724		}
2725	}
2726	if (error == 0)
2727		td->td_retval[0] = len - auio.uio_resid;
2728#ifdef KTRACE
2729	if (ktruio != NULL) {
2730		ktruio->uio_resid = td->td_retval[0];
2731		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2732	}
2733#endif /* KTRACE */
2734sctp_bad:
2735	free(iov, M_IOV);
2736sctp_bad1:
2737	if (fp)
2738		fdrop(fp, td);
2739sctp_bad2:
2740	if (to)
2741		free(to, M_SONAME);
2742	return (error);
2743#else  /* SCTP */
2744	return (EOPNOTSUPP);
2745#endif /* SCTP */
2746}
2747
2748int
2749sys_sctp_generic_recvmsg(td, uap)
2750	struct thread *td;
2751	struct sctp_generic_recvmsg_args /* {
2752		int sd,
2753		struct iovec *iov,
2754		int iovlen,
2755		struct sockaddr *from,
2756		__socklen_t *fromlenaddr,
2757		struct sctp_sndrcvinfo *sinfo,
2758		int *msg_flags
2759	} */ *uap;
2760{
2761#if (defined(INET) || defined(INET6)) && defined(SCTP)
2762	uint8_t sockbufstore[256];
2763	struct uio auio;
2764	struct iovec *iov, *tiov;
2765	struct sctp_sndrcvinfo sinfo;
2766	struct socket *so;
2767	struct file *fp = NULL;
2768	struct sockaddr *fromsa;
2769	int fromlen;
2770	ssize_t len;
2771	int i, msg_flags;
2772	int error = 0;
2773#ifdef KTRACE
2774	struct uio *ktruio = NULL;
2775#endif
2776
2777	AUDIT_ARG_FD(uap->sd);
2778	error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_RECV, &fp, NULL);
2779	if (error) {
2780		return (error);
2781	}
2782#ifdef COMPAT_FREEBSD32
2783	if (SV_CURPROC_FLAG(SV_ILP32))
2784		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2785		    uap->iovlen, &iov, EMSGSIZE);
2786	else
2787#endif
2788		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2789	if (error)
2790		goto out1;
2791
2792	so = fp->f_data;
2793	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2794		error = EOPNOTSUPP;
2795		goto out;
2796	}
2797#ifdef MAC
2798	error = mac_socket_check_receive(td->td_ucred, so);
2799	if (error) {
2800		goto out;
2801	}
2802#endif /* MAC */
2803
2804	if (uap->fromlenaddr) {
2805		error = copyin(uap->fromlenaddr,
2806		    &fromlen, sizeof (fromlen));
2807		if (error) {
2808			goto out;
2809		}
2810	} else {
2811		fromlen = 0;
2812	}
2813	if (uap->msg_flags) {
2814		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
2815		if (error) {
2816			goto out;
2817		}
2818	} else {
2819		msg_flags = 0;
2820	}
2821	auio.uio_iov = iov;
2822	auio.uio_iovcnt = uap->iovlen;
2823	auio.uio_segflg = UIO_USERSPACE;
2824	auio.uio_rw = UIO_READ;
2825	auio.uio_td = td;
2826	auio.uio_offset = 0;			/* XXX */
2827	auio.uio_resid = 0;
2828	tiov = iov;
2829	for (i = 0; i <uap->iovlen; i++, tiov++) {
2830		if ((auio.uio_resid += tiov->iov_len) < 0) {
2831			error = EINVAL;
2832			goto out;
2833		}
2834	}
2835	len = auio.uio_resid;
2836	fromsa = (struct sockaddr *)sockbufstore;
2837
2838#ifdef KTRACE
2839	if (KTRPOINT(td, KTR_GENIO))
2840		ktruio = cloneuio(&auio);
2841#endif /* KTRACE */
2842	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
2843	CURVNET_SET(so->so_vnet);
2844	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2845		    fromsa, fromlen, &msg_flags,
2846		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2847	CURVNET_RESTORE();
2848	if (error) {
2849		if (auio.uio_resid != len && (error == ERESTART ||
2850		    error == EINTR || error == EWOULDBLOCK))
2851			error = 0;
2852	} else {
2853		if (uap->sinfo)
2854			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2855	}
2856#ifdef KTRACE
2857	if (ktruio != NULL) {
2858		ktruio->uio_resid = len - auio.uio_resid;
2859		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2860	}
2861#endif /* KTRACE */
2862	if (error)
2863		goto out;
2864	td->td_retval[0] = len - auio.uio_resid;
2865
2866	if (fromlen && uap->from) {
2867		len = fromlen;
2868		if (len <= 0 || fromsa == 0)
2869			len = 0;
2870		else {
2871			len = MIN(len, fromsa->sa_len);
2872			error = copyout(fromsa, uap->from, (size_t)len);
2873			if (error)
2874				goto out;
2875		}
2876		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2877		if (error) {
2878			goto out;
2879		}
2880	}
2881#ifdef KTRACE
2882	if (KTRPOINT(td, KTR_STRUCT))
2883		ktrsockaddr(fromsa);
2884#endif
2885	if (uap->msg_flags) {
2886		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2887		if (error) {
2888			goto out;
2889		}
2890	}
2891out:
2892	free(iov, M_IOV);
2893out1:
2894	if (fp)
2895		fdrop(fp, td);
2896
2897	return (error);
2898#else  /* SCTP */
2899	return (EOPNOTSUPP);
2900#endif /* SCTP */
2901}
2902