uipc_syscalls.c revision 284310
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 284310 2015-06-12 11:32:20Z glebius $");
37
38#include "opt_capsicum.h"
39#include "opt_inet.h"
40#include "opt_inet6.h"
41#include "opt_compat.h"
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/capsicum.h>
47#include <sys/condvar.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/sysproto.h>
52#include <sys/malloc.h>
53#include <sys/filedesc.h>
54#include <sys/event.h>
55#include <sys/proc.h>
56#include <sys/fcntl.h>
57#include <sys/file.h>
58#include <sys/filio.h>
59#include <sys/jail.h>
60#include <sys/mman.h>
61#include <sys/mount.h>
62#include <sys/mbuf.h>
63#include <sys/protosw.h>
64#include <sys/rwlock.h>
65#include <sys/sf_buf.h>
66#include <sys/sysent.h>
67#include <sys/socket.h>
68#include <sys/socketvar.h>
69#include <sys/signalvar.h>
70#include <sys/syscallsubr.h>
71#include <sys/sysctl.h>
72#include <sys/uio.h>
73#include <sys/vnode.h>
74#ifdef KTRACE
75#include <sys/ktrace.h>
76#endif
77#ifdef COMPAT_FREEBSD32
78#include <compat/freebsd32/freebsd32_util.h>
79#endif
80
81#include <net/vnet.h>
82
83#include <security/audit/audit.h>
84#include <security/mac/mac_framework.h>
85
86#include <vm/vm.h>
87#include <vm/vm_param.h>
88#include <vm/vm_object.h>
89#include <vm/vm_page.h>
90#include <vm/vm_pager.h>
91#include <vm/vm_kern.h>
92#include <vm/vm_extern.h>
93#include <vm/uma.h>
94
95/*
96 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
97 * and SOCK_NONBLOCK.
98 */
99#define	ACCEPT4_INHERIT	0x1
100#define	ACCEPT4_COMPAT	0x2
101
102static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
103static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
104
105static int accept1(struct thread *td, int s, struct sockaddr *uname,
106		   socklen_t *anamelen, int flags);
107static int do_sendfile(struct thread *td, struct sendfile_args *uap,
108		   int compat);
109static int getsockname1(struct thread *td, struct getsockname_args *uap,
110			int compat);
111static int getpeername1(struct thread *td, struct getpeername_args *uap,
112			int compat);
113
114counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
115
116/*
117 * sendfile(2)-related variables and associated sysctls
118 */
119static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
120    "sendfile(2) tunables");
121static int sfreadahead = 1;
122SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
123    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
124
125static void
126sfstat_init(const void *unused)
127{
128
129	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
130	    M_WAITOK);
131}
132SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
133
134static int
135sfstat_sysctl(SYSCTL_HANDLER_ARGS)
136{
137	struct sfstat s;
138
139	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
140	if (req->newptr)
141		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
142	return (SYSCTL_OUT(req, &s, sizeof(s)));
143}
144SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
145    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
146
147/*
148 * Convert a user file descriptor to a kernel file entry and check if required
149 * capability rights are present.
150 * A reference on the file entry is held upon returning.
151 */
152int
153getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
154    struct file **fpp, u_int *fflagp)
155{
156	struct file *fp;
157	int error;
158
159	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
160	if (error != 0)
161		return (error);
162	if (fp->f_type != DTYPE_SOCKET) {
163		fdrop(fp, td);
164		return (ENOTSOCK);
165	}
166	if (fflagp != NULL)
167		*fflagp = fp->f_flag;
168	*fpp = fp;
169	return (0);
170}
171
172/*
173 * System call interface to the socket abstraction.
174 */
175#if defined(COMPAT_43)
176#define COMPAT_OLDSOCK
177#endif
178
179int
180sys_socket(td, uap)
181	struct thread *td;
182	struct socket_args /* {
183		int	domain;
184		int	type;
185		int	protocol;
186	} */ *uap;
187{
188	struct socket *so;
189	struct file *fp;
190	int fd, error, type, oflag, fflag;
191
192	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
193
194	type = uap->type;
195	oflag = 0;
196	fflag = 0;
197	if ((type & SOCK_CLOEXEC) != 0) {
198		type &= ~SOCK_CLOEXEC;
199		oflag |= O_CLOEXEC;
200	}
201	if ((type & SOCK_NONBLOCK) != 0) {
202		type &= ~SOCK_NONBLOCK;
203		fflag |= FNONBLOCK;
204	}
205
206#ifdef MAC
207	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
208	    uap->protocol);
209	if (error != 0)
210		return (error);
211#endif
212	error = falloc(td, &fp, &fd, oflag);
213	if (error != 0)
214		return (error);
215	/* An extra reference on `fp' has been held for us by falloc(). */
216	error = socreate(uap->domain, &so, type, uap->protocol,
217	    td->td_ucred, td);
218	if (error != 0) {
219		fdclose(td, fp, fd);
220	} else {
221		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
222		if ((fflag & FNONBLOCK) != 0)
223			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
224		td->td_retval[0] = fd;
225	}
226	fdrop(fp, td);
227	return (error);
228}
229
230/* ARGSUSED */
231int
232sys_bind(td, uap)
233	struct thread *td;
234	struct bind_args /* {
235		int	s;
236		caddr_t	name;
237		int	namelen;
238	} */ *uap;
239{
240	struct sockaddr *sa;
241	int error;
242
243	error = getsockaddr(&sa, uap->name, uap->namelen);
244	if (error == 0) {
245		error = kern_bindat(td, AT_FDCWD, uap->s, sa);
246		free(sa, M_SONAME);
247	}
248	return (error);
249}
250
251int
252kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
253{
254	struct socket *so;
255	struct file *fp;
256	cap_rights_t rights;
257	int error;
258
259	AUDIT_ARG_FD(fd);
260	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
261	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND),
262	    &fp, NULL);
263	if (error != 0)
264		return (error);
265	so = fp->f_data;
266#ifdef KTRACE
267	if (KTRPOINT(td, KTR_STRUCT))
268		ktrsockaddr(sa);
269#endif
270#ifdef MAC
271	error = mac_socket_check_bind(td->td_ucred, so, sa);
272	if (error == 0) {
273#endif
274		if (dirfd == AT_FDCWD)
275			error = sobind(so, sa, td);
276		else
277			error = sobindat(dirfd, so, sa, td);
278#ifdef MAC
279	}
280#endif
281	fdrop(fp, td);
282	return (error);
283}
284
285/* ARGSUSED */
286int
287sys_bindat(td, uap)
288	struct thread *td;
289	struct bindat_args /* {
290		int	fd;
291		int	s;
292		caddr_t	name;
293		int	namelen;
294	} */ *uap;
295{
296	struct sockaddr *sa;
297	int error;
298
299	error = getsockaddr(&sa, uap->name, uap->namelen);
300	if (error == 0) {
301		error = kern_bindat(td, uap->fd, uap->s, sa);
302		free(sa, M_SONAME);
303	}
304	return (error);
305}
306
307/* ARGSUSED */
308int
309sys_listen(td, uap)
310	struct thread *td;
311	struct listen_args /* {
312		int	s;
313		int	backlog;
314	} */ *uap;
315{
316	struct socket *so;
317	struct file *fp;
318	cap_rights_t rights;
319	int error;
320
321	AUDIT_ARG_FD(uap->s);
322	error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN),
323	    &fp, NULL);
324	if (error == 0) {
325		so = fp->f_data;
326#ifdef MAC
327		error = mac_socket_check_listen(td->td_ucred, so);
328		if (error == 0)
329#endif
330			error = solisten(so, uap->backlog, td);
331		fdrop(fp, td);
332	}
333	return(error);
334}
335
336/*
337 * accept1()
338 */
339static int
340accept1(td, s, uname, anamelen, flags)
341	struct thread *td;
342	int s;
343	struct sockaddr *uname;
344	socklen_t *anamelen;
345	int flags;
346{
347	struct sockaddr *name;
348	socklen_t namelen;
349	struct file *fp;
350	int error;
351
352	if (uname == NULL)
353		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
354
355	error = copyin(anamelen, &namelen, sizeof (namelen));
356	if (error != 0)
357		return (error);
358
359	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
360
361	if (error != 0)
362		return (error);
363
364	if (error == 0 && uname != NULL) {
365#ifdef COMPAT_OLDSOCK
366		if (flags & ACCEPT4_COMPAT)
367			((struct osockaddr *)name)->sa_family =
368			    name->sa_family;
369#endif
370		error = copyout(name, uname, namelen);
371	}
372	if (error == 0)
373		error = copyout(&namelen, anamelen,
374		    sizeof(namelen));
375	if (error != 0)
376		fdclose(td, fp, td->td_retval[0]);
377	fdrop(fp, td);
378	free(name, M_SONAME);
379	return (error);
380}
381
382int
383kern_accept(struct thread *td, int s, struct sockaddr **name,
384    socklen_t *namelen, struct file **fp)
385{
386	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
387}
388
389int
390kern_accept4(struct thread *td, int s, struct sockaddr **name,
391    socklen_t *namelen, int flags, struct file **fp)
392{
393	struct file *headfp, *nfp = NULL;
394	struct sockaddr *sa = NULL;
395	struct socket *head, *so;
396	cap_rights_t rights;
397	u_int fflag;
398	pid_t pgid;
399	int error, fd, tmp;
400
401	if (name != NULL)
402		*name = NULL;
403
404	AUDIT_ARG_FD(s);
405	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT),
406	    &headfp, &fflag);
407	if (error != 0)
408		return (error);
409	head = headfp->f_data;
410	if ((head->so_options & SO_ACCEPTCONN) == 0) {
411		error = EINVAL;
412		goto done;
413	}
414#ifdef MAC
415	error = mac_socket_check_accept(td->td_ucred, head);
416	if (error != 0)
417		goto done;
418#endif
419	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
420	if (error != 0)
421		goto done;
422	ACCEPT_LOCK();
423	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
424		ACCEPT_UNLOCK();
425		error = EWOULDBLOCK;
426		goto noconnection;
427	}
428	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
429		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
430			head->so_error = ECONNABORTED;
431			break;
432		}
433		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
434		    "accept", 0);
435		if (error != 0) {
436			ACCEPT_UNLOCK();
437			goto noconnection;
438		}
439	}
440	if (head->so_error) {
441		error = head->so_error;
442		head->so_error = 0;
443		ACCEPT_UNLOCK();
444		goto noconnection;
445	}
446	so = TAILQ_FIRST(&head->so_comp);
447	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
448	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
449
450	/*
451	 * Before changing the flags on the socket, we have to bump the
452	 * reference count.  Otherwise, if the protocol calls sofree(),
453	 * the socket will be released due to a zero refcount.
454	 */
455	SOCK_LOCK(so);			/* soref() and so_state update */
456	soref(so);			/* file descriptor reference */
457
458	TAILQ_REMOVE(&head->so_comp, so, so_list);
459	head->so_qlen--;
460	if (flags & ACCEPT4_INHERIT)
461		so->so_state |= (head->so_state & SS_NBIO);
462	else
463		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
464	so->so_qstate &= ~SQ_COMP;
465	so->so_head = NULL;
466
467	SOCK_UNLOCK(so);
468	ACCEPT_UNLOCK();
469
470	/* An extra reference on `nfp' has been held for us by falloc(). */
471	td->td_retval[0] = fd;
472
473	/* connection has been removed from the listen queue */
474	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
475
476	if (flags & ACCEPT4_INHERIT) {
477		pgid = fgetown(&head->so_sigio);
478		if (pgid != 0)
479			fsetown(pgid, &so->so_sigio);
480	} else {
481		fflag &= ~(FNONBLOCK | FASYNC);
482		if (flags & SOCK_NONBLOCK)
483			fflag |= FNONBLOCK;
484	}
485
486	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
487	/* Sync socket nonblocking/async state with file flags */
488	tmp = fflag & FNONBLOCK;
489	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
490	tmp = fflag & FASYNC;
491	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
492	sa = 0;
493	error = soaccept(so, &sa);
494	if (error != 0)
495		goto noconnection;
496	if (sa == NULL) {
497		if (name)
498			*namelen = 0;
499		goto done;
500	}
501	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
502	if (name) {
503		/* check sa_len before it is destroyed */
504		if (*namelen > sa->sa_len)
505			*namelen = sa->sa_len;
506#ifdef KTRACE
507		if (KTRPOINT(td, KTR_STRUCT))
508			ktrsockaddr(sa);
509#endif
510		*name = sa;
511		sa = NULL;
512	}
513noconnection:
514	free(sa, M_SONAME);
515
516	/*
517	 * close the new descriptor, assuming someone hasn't ripped it
518	 * out from under us.
519	 */
520	if (error != 0)
521		fdclose(td, nfp, fd);
522
523	/*
524	 * Release explicitly held references before returning.  We return
525	 * a reference on nfp to the caller on success if they request it.
526	 */
527done:
528	if (fp != NULL) {
529		if (error == 0) {
530			*fp = nfp;
531			nfp = NULL;
532		} else
533			*fp = NULL;
534	}
535	if (nfp != NULL)
536		fdrop(nfp, td);
537	fdrop(headfp, td);
538	return (error);
539}
540
541int
542sys_accept(td, uap)
543	struct thread *td;
544	struct accept_args *uap;
545{
546
547	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
548}
549
550int
551sys_accept4(td, uap)
552	struct thread *td;
553	struct accept4_args *uap;
554{
555
556	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
557		return (EINVAL);
558
559	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
560}
561
562#ifdef COMPAT_OLDSOCK
563int
564oaccept(td, uap)
565	struct thread *td;
566	struct accept_args *uap;
567{
568
569	return (accept1(td, uap->s, uap->name, uap->anamelen,
570	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
571}
572#endif /* COMPAT_OLDSOCK */
573
574/* ARGSUSED */
575int
576sys_connect(td, uap)
577	struct thread *td;
578	struct connect_args /* {
579		int	s;
580		caddr_t	name;
581		int	namelen;
582	} */ *uap;
583{
584	struct sockaddr *sa;
585	int error;
586
587	error = getsockaddr(&sa, uap->name, uap->namelen);
588	if (error == 0) {
589		error = kern_connectat(td, AT_FDCWD, uap->s, sa);
590		free(sa, M_SONAME);
591	}
592	return (error);
593}
594
595int
596kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
597{
598	struct socket *so;
599	struct file *fp;
600	cap_rights_t rights;
601	int error, interrupted = 0;
602
603	AUDIT_ARG_FD(fd);
604	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
605	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT),
606	    &fp, NULL);
607	if (error != 0)
608		return (error);
609	so = fp->f_data;
610	if (so->so_state & SS_ISCONNECTING) {
611		error = EALREADY;
612		goto done1;
613	}
614#ifdef KTRACE
615	if (KTRPOINT(td, KTR_STRUCT))
616		ktrsockaddr(sa);
617#endif
618#ifdef MAC
619	error = mac_socket_check_connect(td->td_ucred, so, sa);
620	if (error != 0)
621		goto bad;
622#endif
623	if (dirfd == AT_FDCWD)
624		error = soconnect(so, sa, td);
625	else
626		error = soconnectat(dirfd, so, sa, td);
627	if (error != 0)
628		goto bad;
629	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
630		error = EINPROGRESS;
631		goto done1;
632	}
633	SOCK_LOCK(so);
634	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
635		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
636		    "connec", 0);
637		if (error != 0) {
638			if (error == EINTR || error == ERESTART)
639				interrupted = 1;
640			break;
641		}
642	}
643	if (error == 0) {
644		error = so->so_error;
645		so->so_error = 0;
646	}
647	SOCK_UNLOCK(so);
648bad:
649	if (!interrupted)
650		so->so_state &= ~SS_ISCONNECTING;
651	if (error == ERESTART)
652		error = EINTR;
653done1:
654	fdrop(fp, td);
655	return (error);
656}
657
658/* ARGSUSED */
659int
660sys_connectat(td, uap)
661	struct thread *td;
662	struct connectat_args /* {
663		int	fd;
664		int	s;
665		caddr_t	name;
666		int	namelen;
667	} */ *uap;
668{
669	struct sockaddr *sa;
670	int error;
671
672	error = getsockaddr(&sa, uap->name, uap->namelen);
673	if (error == 0) {
674		error = kern_connectat(td, uap->fd, uap->s, sa);
675		free(sa, M_SONAME);
676	}
677	return (error);
678}
679
680int
681kern_socketpair(struct thread *td, int domain, int type, int protocol,
682    int *rsv)
683{
684	struct file *fp1, *fp2;
685	struct socket *so1, *so2;
686	int fd, error, oflag, fflag;
687
688	AUDIT_ARG_SOCKET(domain, type, protocol);
689
690	oflag = 0;
691	fflag = 0;
692	if ((type & SOCK_CLOEXEC) != 0) {
693		type &= ~SOCK_CLOEXEC;
694		oflag |= O_CLOEXEC;
695	}
696	if ((type & SOCK_NONBLOCK) != 0) {
697		type &= ~SOCK_NONBLOCK;
698		fflag |= FNONBLOCK;
699	}
700#ifdef MAC
701	/* We might want to have a separate check for socket pairs. */
702	error = mac_socket_check_create(td->td_ucred, domain, type,
703	    protocol);
704	if (error != 0)
705		return (error);
706#endif
707	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
708	if (error != 0)
709		return (error);
710	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
711	if (error != 0)
712		goto free1;
713	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
714	error = falloc(td, &fp1, &fd, oflag);
715	if (error != 0)
716		goto free2;
717	rsv[0] = fd;
718	fp1->f_data = so1;	/* so1 already has ref count */
719	error = falloc(td, &fp2, &fd, oflag);
720	if (error != 0)
721		goto free3;
722	fp2->f_data = so2;	/* so2 already has ref count */
723	rsv[1] = fd;
724	error = soconnect2(so1, so2);
725	if (error != 0)
726		goto free4;
727	if (type == SOCK_DGRAM) {
728		/*
729		 * Datagram socket connection is asymmetric.
730		 */
731		 error = soconnect2(so2, so1);
732		 if (error != 0)
733			goto free4;
734	}
735	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
736	    &socketops);
737	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
738	    &socketops);
739	if ((fflag & FNONBLOCK) != 0) {
740		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
741		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
742	}
743	fdrop(fp1, td);
744	fdrop(fp2, td);
745	return (0);
746free4:
747	fdclose(td, fp2, rsv[1]);
748	fdrop(fp2, td);
749free3:
750	fdclose(td, fp1, rsv[0]);
751	fdrop(fp1, td);
752free2:
753	if (so2 != NULL)
754		(void)soclose(so2);
755free1:
756	if (so1 != NULL)
757		(void)soclose(so1);
758	return (error);
759}
760
761int
762sys_socketpair(struct thread *td, struct socketpair_args *uap)
763{
764	int error, sv[2];
765
766	error = kern_socketpair(td, uap->domain, uap->type,
767	    uap->protocol, sv);
768	if (error != 0)
769		return (error);
770	error = copyout(sv, uap->rsv, 2 * sizeof(int));
771	if (error != 0) {
772		(void)kern_close(td, sv[0]);
773		(void)kern_close(td, sv[1]);
774	}
775	return (error);
776}
777
778static int
779sendit(td, s, mp, flags)
780	struct thread *td;
781	int s;
782	struct msghdr *mp;
783	int flags;
784{
785	struct mbuf *control;
786	struct sockaddr *to;
787	int error;
788
789#ifdef CAPABILITY_MODE
790	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
791		return (ECAPMODE);
792#endif
793
794	if (mp->msg_name != NULL) {
795		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
796		if (error != 0) {
797			to = NULL;
798			goto bad;
799		}
800		mp->msg_name = to;
801	} else {
802		to = NULL;
803	}
804
805	if (mp->msg_control) {
806		if (mp->msg_controllen < sizeof(struct cmsghdr)
807#ifdef COMPAT_OLDSOCK
808		    && mp->msg_flags != MSG_COMPAT
809#endif
810		) {
811			error = EINVAL;
812			goto bad;
813		}
814		error = sockargs(&control, mp->msg_control,
815		    mp->msg_controllen, MT_CONTROL);
816		if (error != 0)
817			goto bad;
818#ifdef COMPAT_OLDSOCK
819		if (mp->msg_flags == MSG_COMPAT) {
820			struct cmsghdr *cm;
821
822			M_PREPEND(control, sizeof(*cm), M_WAITOK);
823			cm = mtod(control, struct cmsghdr *);
824			cm->cmsg_len = control->m_len;
825			cm->cmsg_level = SOL_SOCKET;
826			cm->cmsg_type = SCM_RIGHTS;
827		}
828#endif
829	} else {
830		control = NULL;
831	}
832
833	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
834
835bad:
836	free(to, M_SONAME);
837	return (error);
838}
839
840int
841kern_sendit(td, s, mp, flags, control, segflg)
842	struct thread *td;
843	int s;
844	struct msghdr *mp;
845	int flags;
846	struct mbuf *control;
847	enum uio_seg segflg;
848{
849	struct file *fp;
850	struct uio auio;
851	struct iovec *iov;
852	struct socket *so;
853	cap_rights_t rights;
854#ifdef KTRACE
855	struct uio *ktruio = NULL;
856#endif
857	ssize_t len;
858	int i, error;
859
860	AUDIT_ARG_FD(s);
861	cap_rights_init(&rights, CAP_SEND);
862	if (mp->msg_name != NULL) {
863		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
864		cap_rights_set(&rights, CAP_CONNECT);
865	}
866	error = getsock_cap(td, s, &rights, &fp, NULL);
867	if (error != 0)
868		return (error);
869	so = (struct socket *)fp->f_data;
870
871#ifdef KTRACE
872	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
873		ktrsockaddr(mp->msg_name);
874#endif
875#ifdef MAC
876	if (mp->msg_name != NULL) {
877		error = mac_socket_check_connect(td->td_ucred, so,
878		    mp->msg_name);
879		if (error != 0)
880			goto bad;
881	}
882	error = mac_socket_check_send(td->td_ucred, so);
883	if (error != 0)
884		goto bad;
885#endif
886
887	auio.uio_iov = mp->msg_iov;
888	auio.uio_iovcnt = mp->msg_iovlen;
889	auio.uio_segflg = segflg;
890	auio.uio_rw = UIO_WRITE;
891	auio.uio_td = td;
892	auio.uio_offset = 0;			/* XXX */
893	auio.uio_resid = 0;
894	iov = mp->msg_iov;
895	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
896		if ((auio.uio_resid += iov->iov_len) < 0) {
897			error = EINVAL;
898			goto bad;
899		}
900	}
901#ifdef KTRACE
902	if (KTRPOINT(td, KTR_GENIO))
903		ktruio = cloneuio(&auio);
904#endif
905	len = auio.uio_resid;
906	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
907	if (error != 0) {
908		if (auio.uio_resid != len && (error == ERESTART ||
909		    error == EINTR || error == EWOULDBLOCK))
910			error = 0;
911		/* Generation of SIGPIPE can be controlled per socket */
912		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
913		    !(flags & MSG_NOSIGNAL)) {
914			PROC_LOCK(td->td_proc);
915			tdsignal(td, SIGPIPE);
916			PROC_UNLOCK(td->td_proc);
917		}
918	}
919	if (error == 0)
920		td->td_retval[0] = len - auio.uio_resid;
921#ifdef KTRACE
922	if (ktruio != NULL) {
923		ktruio->uio_resid = td->td_retval[0];
924		ktrgenio(s, UIO_WRITE, ktruio, error);
925	}
926#endif
927bad:
928	fdrop(fp, td);
929	return (error);
930}
931
932int
933sys_sendto(td, uap)
934	struct thread *td;
935	struct sendto_args /* {
936		int	s;
937		caddr_t	buf;
938		size_t	len;
939		int	flags;
940		caddr_t	to;
941		int	tolen;
942	} */ *uap;
943{
944	struct msghdr msg;
945	struct iovec aiov;
946
947	msg.msg_name = uap->to;
948	msg.msg_namelen = uap->tolen;
949	msg.msg_iov = &aiov;
950	msg.msg_iovlen = 1;
951	msg.msg_control = 0;
952#ifdef COMPAT_OLDSOCK
953	msg.msg_flags = 0;
954#endif
955	aiov.iov_base = uap->buf;
956	aiov.iov_len = uap->len;
957	return (sendit(td, uap->s, &msg, uap->flags));
958}
959
960#ifdef COMPAT_OLDSOCK
961int
962osend(td, uap)
963	struct thread *td;
964	struct osend_args /* {
965		int	s;
966		caddr_t	buf;
967		int	len;
968		int	flags;
969	} */ *uap;
970{
971	struct msghdr msg;
972	struct iovec aiov;
973
974	msg.msg_name = 0;
975	msg.msg_namelen = 0;
976	msg.msg_iov = &aiov;
977	msg.msg_iovlen = 1;
978	aiov.iov_base = uap->buf;
979	aiov.iov_len = uap->len;
980	msg.msg_control = 0;
981	msg.msg_flags = 0;
982	return (sendit(td, uap->s, &msg, uap->flags));
983}
984
985int
986osendmsg(td, uap)
987	struct thread *td;
988	struct osendmsg_args /* {
989		int	s;
990		caddr_t	msg;
991		int	flags;
992	} */ *uap;
993{
994	struct msghdr msg;
995	struct iovec *iov;
996	int error;
997
998	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
999	if (error != 0)
1000		return (error);
1001	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1002	if (error != 0)
1003		return (error);
1004	msg.msg_iov = iov;
1005	msg.msg_flags = MSG_COMPAT;
1006	error = sendit(td, uap->s, &msg, uap->flags);
1007	free(iov, M_IOV);
1008	return (error);
1009}
1010#endif
1011
1012int
1013sys_sendmsg(td, uap)
1014	struct thread *td;
1015	struct sendmsg_args /* {
1016		int	s;
1017		caddr_t	msg;
1018		int	flags;
1019	} */ *uap;
1020{
1021	struct msghdr msg;
1022	struct iovec *iov;
1023	int error;
1024
1025	error = copyin(uap->msg, &msg, sizeof (msg));
1026	if (error != 0)
1027		return (error);
1028	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1029	if (error != 0)
1030		return (error);
1031	msg.msg_iov = iov;
1032#ifdef COMPAT_OLDSOCK
1033	msg.msg_flags = 0;
1034#endif
1035	error = sendit(td, uap->s, &msg, uap->flags);
1036	free(iov, M_IOV);
1037	return (error);
1038}
1039
1040int
1041kern_recvit(td, s, mp, fromseg, controlp)
1042	struct thread *td;
1043	int s;
1044	struct msghdr *mp;
1045	enum uio_seg fromseg;
1046	struct mbuf **controlp;
1047{
1048	struct uio auio;
1049	struct iovec *iov;
1050	struct mbuf *m, *control = NULL;
1051	caddr_t ctlbuf;
1052	struct file *fp;
1053	struct socket *so;
1054	struct sockaddr *fromsa = NULL;
1055	cap_rights_t rights;
1056#ifdef KTRACE
1057	struct uio *ktruio = NULL;
1058#endif
1059	ssize_t len;
1060	int error, i;
1061
1062	if (controlp != NULL)
1063		*controlp = NULL;
1064
1065	AUDIT_ARG_FD(s);
1066	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV),
1067	    &fp, NULL);
1068	if (error != 0)
1069		return (error);
1070	so = fp->f_data;
1071
1072#ifdef MAC
1073	error = mac_socket_check_receive(td->td_ucred, so);
1074	if (error != 0) {
1075		fdrop(fp, td);
1076		return (error);
1077	}
1078#endif
1079
1080	auio.uio_iov = mp->msg_iov;
1081	auio.uio_iovcnt = mp->msg_iovlen;
1082	auio.uio_segflg = UIO_USERSPACE;
1083	auio.uio_rw = UIO_READ;
1084	auio.uio_td = td;
1085	auio.uio_offset = 0;			/* XXX */
1086	auio.uio_resid = 0;
1087	iov = mp->msg_iov;
1088	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
1089		if ((auio.uio_resid += iov->iov_len) < 0) {
1090			fdrop(fp, td);
1091			return (EINVAL);
1092		}
1093	}
1094#ifdef KTRACE
1095	if (KTRPOINT(td, KTR_GENIO))
1096		ktruio = cloneuio(&auio);
1097#endif
1098	len = auio.uio_resid;
1099	error = soreceive(so, &fromsa, &auio, NULL,
1100	    (mp->msg_control || controlp) ? &control : NULL,
1101	    &mp->msg_flags);
1102	if (error != 0) {
1103		if (auio.uio_resid != len && (error == ERESTART ||
1104		    error == EINTR || error == EWOULDBLOCK))
1105			error = 0;
1106	}
1107	if (fromsa != NULL)
1108		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
1109#ifdef KTRACE
1110	if (ktruio != NULL) {
1111		ktruio->uio_resid = len - auio.uio_resid;
1112		ktrgenio(s, UIO_READ, ktruio, error);
1113	}
1114#endif
1115	if (error != 0)
1116		goto out;
1117	td->td_retval[0] = len - auio.uio_resid;
1118	if (mp->msg_name) {
1119		len = mp->msg_namelen;
1120		if (len <= 0 || fromsa == NULL)
1121			len = 0;
1122		else {
1123			/* save sa_len before it is destroyed by MSG_COMPAT */
1124			len = MIN(len, fromsa->sa_len);
1125#ifdef COMPAT_OLDSOCK
1126			if (mp->msg_flags & MSG_COMPAT)
1127				((struct osockaddr *)fromsa)->sa_family =
1128				    fromsa->sa_family;
1129#endif
1130			if (fromseg == UIO_USERSPACE) {
1131				error = copyout(fromsa, mp->msg_name,
1132				    (unsigned)len);
1133				if (error != 0)
1134					goto out;
1135			} else
1136				bcopy(fromsa, mp->msg_name, len);
1137		}
1138		mp->msg_namelen = len;
1139	}
1140	if (mp->msg_control && controlp == NULL) {
1141#ifdef COMPAT_OLDSOCK
1142		/*
1143		 * We assume that old recvmsg calls won't receive access
1144		 * rights and other control info, esp. as control info
1145		 * is always optional and those options didn't exist in 4.3.
1146		 * If we receive rights, trim the cmsghdr; anything else
1147		 * is tossed.
1148		 */
1149		if (control && mp->msg_flags & MSG_COMPAT) {
1150			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1151			    SOL_SOCKET ||
1152			    mtod(control, struct cmsghdr *)->cmsg_type !=
1153			    SCM_RIGHTS) {
1154				mp->msg_controllen = 0;
1155				goto out;
1156			}
1157			control->m_len -= sizeof (struct cmsghdr);
1158			control->m_data += sizeof (struct cmsghdr);
1159		}
1160#endif
1161		len = mp->msg_controllen;
1162		m = control;
1163		mp->msg_controllen = 0;
1164		ctlbuf = mp->msg_control;
1165
1166		while (m && len > 0) {
1167			unsigned int tocopy;
1168
1169			if (len >= m->m_len)
1170				tocopy = m->m_len;
1171			else {
1172				mp->msg_flags |= MSG_CTRUNC;
1173				tocopy = len;
1174			}
1175
1176			if ((error = copyout(mtod(m, caddr_t),
1177					ctlbuf, tocopy)) != 0)
1178				goto out;
1179
1180			ctlbuf += tocopy;
1181			len -= tocopy;
1182			m = m->m_next;
1183		}
1184		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1185	}
1186out:
1187	fdrop(fp, td);
1188#ifdef KTRACE
1189	if (fromsa && KTRPOINT(td, KTR_STRUCT))
1190		ktrsockaddr(fromsa);
1191#endif
1192	free(fromsa, M_SONAME);
1193
1194	if (error == 0 && controlp != NULL)
1195		*controlp = control;
1196	else  if (control)
1197		m_freem(control);
1198
1199	return (error);
1200}
1201
1202static int
1203recvit(td, s, mp, namelenp)
1204	struct thread *td;
1205	int s;
1206	struct msghdr *mp;
1207	void *namelenp;
1208{
1209	int error;
1210
1211	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1212	if (error != 0)
1213		return (error);
1214	if (namelenp != NULL) {
1215		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1216#ifdef COMPAT_OLDSOCK
1217		if (mp->msg_flags & MSG_COMPAT)
1218			error = 0;	/* old recvfrom didn't check */
1219#endif
1220	}
1221	return (error);
1222}
1223
1224int
1225sys_recvfrom(td, uap)
1226	struct thread *td;
1227	struct recvfrom_args /* {
1228		int	s;
1229		caddr_t	buf;
1230		size_t	len;
1231		int	flags;
1232		struct sockaddr * __restrict	from;
1233		socklen_t * __restrict fromlenaddr;
1234	} */ *uap;
1235{
1236	struct msghdr msg;
1237	struct iovec aiov;
1238	int error;
1239
1240	if (uap->fromlenaddr) {
1241		error = copyin(uap->fromlenaddr,
1242		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1243		if (error != 0)
1244			goto done2;
1245	} else {
1246		msg.msg_namelen = 0;
1247	}
1248	msg.msg_name = uap->from;
1249	msg.msg_iov = &aiov;
1250	msg.msg_iovlen = 1;
1251	aiov.iov_base = uap->buf;
1252	aiov.iov_len = uap->len;
1253	msg.msg_control = 0;
1254	msg.msg_flags = uap->flags;
1255	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1256done2:
1257	return (error);
1258}
1259
1260#ifdef COMPAT_OLDSOCK
1261int
1262orecvfrom(td, uap)
1263	struct thread *td;
1264	struct recvfrom_args *uap;
1265{
1266
1267	uap->flags |= MSG_COMPAT;
1268	return (sys_recvfrom(td, uap));
1269}
1270#endif
1271
1272#ifdef COMPAT_OLDSOCK
1273int
1274orecv(td, uap)
1275	struct thread *td;
1276	struct orecv_args /* {
1277		int	s;
1278		caddr_t	buf;
1279		int	len;
1280		int	flags;
1281	} */ *uap;
1282{
1283	struct msghdr msg;
1284	struct iovec aiov;
1285
1286	msg.msg_name = 0;
1287	msg.msg_namelen = 0;
1288	msg.msg_iov = &aiov;
1289	msg.msg_iovlen = 1;
1290	aiov.iov_base = uap->buf;
1291	aiov.iov_len = uap->len;
1292	msg.msg_control = 0;
1293	msg.msg_flags = uap->flags;
1294	return (recvit(td, uap->s, &msg, NULL));
1295}
1296
1297/*
1298 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1299 * overlays the new one, missing only the flags, and with the (old) access
1300 * rights where the control fields are now.
1301 */
1302int
1303orecvmsg(td, uap)
1304	struct thread *td;
1305	struct orecvmsg_args /* {
1306		int	s;
1307		struct	omsghdr *msg;
1308		int	flags;
1309	} */ *uap;
1310{
1311	struct msghdr msg;
1312	struct iovec *iov;
1313	int error;
1314
1315	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1316	if (error != 0)
1317		return (error);
1318	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1319	if (error != 0)
1320		return (error);
1321	msg.msg_flags = uap->flags | MSG_COMPAT;
1322	msg.msg_iov = iov;
1323	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1324	if (msg.msg_controllen && error == 0)
1325		error = copyout(&msg.msg_controllen,
1326		    &uap->msg->msg_accrightslen, sizeof (int));
1327	free(iov, M_IOV);
1328	return (error);
1329}
1330#endif
1331
1332int
1333sys_recvmsg(td, uap)
1334	struct thread *td;
1335	struct recvmsg_args /* {
1336		int	s;
1337		struct	msghdr *msg;
1338		int	flags;
1339	} */ *uap;
1340{
1341	struct msghdr msg;
1342	struct iovec *uiov, *iov;
1343	int error;
1344
1345	error = copyin(uap->msg, &msg, sizeof (msg));
1346	if (error != 0)
1347		return (error);
1348	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1349	if (error != 0)
1350		return (error);
1351	msg.msg_flags = uap->flags;
1352#ifdef COMPAT_OLDSOCK
1353	msg.msg_flags &= ~MSG_COMPAT;
1354#endif
1355	uiov = msg.msg_iov;
1356	msg.msg_iov = iov;
1357	error = recvit(td, uap->s, &msg, NULL);
1358	if (error == 0) {
1359		msg.msg_iov = uiov;
1360		error = copyout(&msg, uap->msg, sizeof(msg));
1361	}
1362	free(iov, M_IOV);
1363	return (error);
1364}
1365
1366/* ARGSUSED */
1367int
1368sys_shutdown(td, uap)
1369	struct thread *td;
1370	struct shutdown_args /* {
1371		int	s;
1372		int	how;
1373	} */ *uap;
1374{
1375	struct socket *so;
1376	struct file *fp;
1377	cap_rights_t rights;
1378	int error;
1379
1380	AUDIT_ARG_FD(uap->s);
1381	error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN),
1382	    &fp, NULL);
1383	if (error == 0) {
1384		so = fp->f_data;
1385		error = soshutdown(so, uap->how);
1386		fdrop(fp, td);
1387	}
1388	return (error);
1389}
1390
1391/* ARGSUSED */
1392int
1393sys_setsockopt(td, uap)
1394	struct thread *td;
1395	struct setsockopt_args /* {
1396		int	s;
1397		int	level;
1398		int	name;
1399		caddr_t	val;
1400		int	valsize;
1401	} */ *uap;
1402{
1403
1404	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1405	    uap->val, UIO_USERSPACE, uap->valsize));
1406}
1407
1408int
1409kern_setsockopt(td, s, level, name, val, valseg, valsize)
1410	struct thread *td;
1411	int s;
1412	int level;
1413	int name;
1414	void *val;
1415	enum uio_seg valseg;
1416	socklen_t valsize;
1417{
1418	struct socket *so;
1419	struct file *fp;
1420	struct sockopt sopt;
1421	cap_rights_t rights;
1422	int error;
1423
1424	if (val == NULL && valsize != 0)
1425		return (EFAULT);
1426	if ((int)valsize < 0)
1427		return (EINVAL);
1428
1429	sopt.sopt_dir = SOPT_SET;
1430	sopt.sopt_level = level;
1431	sopt.sopt_name = name;
1432	sopt.sopt_val = val;
1433	sopt.sopt_valsize = valsize;
1434	switch (valseg) {
1435	case UIO_USERSPACE:
1436		sopt.sopt_td = td;
1437		break;
1438	case UIO_SYSSPACE:
1439		sopt.sopt_td = NULL;
1440		break;
1441	default:
1442		panic("kern_setsockopt called with bad valseg");
1443	}
1444
1445	AUDIT_ARG_FD(s);
1446	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT),
1447	    &fp, NULL);
1448	if (error == 0) {
1449		so = fp->f_data;
1450		error = sosetopt(so, &sopt);
1451		fdrop(fp, td);
1452	}
1453	return(error);
1454}
1455
1456/* ARGSUSED */
1457int
1458sys_getsockopt(td, uap)
1459	struct thread *td;
1460	struct getsockopt_args /* {
1461		int	s;
1462		int	level;
1463		int	name;
1464		void * __restrict	val;
1465		socklen_t * __restrict avalsize;
1466	} */ *uap;
1467{
1468	socklen_t valsize;
1469	int error;
1470
1471	if (uap->val) {
1472		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1473		if (error != 0)
1474			return (error);
1475	}
1476
1477	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1478	    uap->val, UIO_USERSPACE, &valsize);
1479
1480	if (error == 0)
1481		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1482	return (error);
1483}
1484
1485/*
1486 * Kernel version of getsockopt.
1487 * optval can be a userland or userspace. optlen is always a kernel pointer.
1488 */
1489int
1490kern_getsockopt(td, s, level, name, val, valseg, valsize)
1491	struct thread *td;
1492	int s;
1493	int level;
1494	int name;
1495	void *val;
1496	enum uio_seg valseg;
1497	socklen_t *valsize;
1498{
1499	struct socket *so;
1500	struct file *fp;
1501	struct sockopt sopt;
1502	cap_rights_t rights;
1503	int error;
1504
1505	if (val == NULL)
1506		*valsize = 0;
1507	if ((int)*valsize < 0)
1508		return (EINVAL);
1509
1510	sopt.sopt_dir = SOPT_GET;
1511	sopt.sopt_level = level;
1512	sopt.sopt_name = name;
1513	sopt.sopt_val = val;
1514	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1515	switch (valseg) {
1516	case UIO_USERSPACE:
1517		sopt.sopt_td = td;
1518		break;
1519	case UIO_SYSSPACE:
1520		sopt.sopt_td = NULL;
1521		break;
1522	default:
1523		panic("kern_getsockopt called with bad valseg");
1524	}
1525
1526	AUDIT_ARG_FD(s);
1527	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT),
1528	    &fp, NULL);
1529	if (error == 0) {
1530		so = fp->f_data;
1531		error = sogetopt(so, &sopt);
1532		*valsize = sopt.sopt_valsize;
1533		fdrop(fp, td);
1534	}
1535	return (error);
1536}
1537
1538/*
1539 * getsockname1() - Get socket name.
1540 */
1541/* ARGSUSED */
1542static int
1543getsockname1(td, uap, compat)
1544	struct thread *td;
1545	struct getsockname_args /* {
1546		int	fdes;
1547		struct sockaddr * __restrict asa;
1548		socklen_t * __restrict alen;
1549	} */ *uap;
1550	int compat;
1551{
1552	struct sockaddr *sa;
1553	socklen_t len;
1554	int error;
1555
1556	error = copyin(uap->alen, &len, sizeof(len));
1557	if (error != 0)
1558		return (error);
1559
1560	error = kern_getsockname(td, uap->fdes, &sa, &len);
1561	if (error != 0)
1562		return (error);
1563
1564	if (len != 0) {
1565#ifdef COMPAT_OLDSOCK
1566		if (compat)
1567			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1568#endif
1569		error = copyout(sa, uap->asa, (u_int)len);
1570	}
1571	free(sa, M_SONAME);
1572	if (error == 0)
1573		error = copyout(&len, uap->alen, sizeof(len));
1574	return (error);
1575}
1576
1577int
1578kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1579    socklen_t *alen)
1580{
1581	struct socket *so;
1582	struct file *fp;
1583	cap_rights_t rights;
1584	socklen_t len;
1585	int error;
1586
1587	AUDIT_ARG_FD(fd);
1588	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME),
1589	    &fp, NULL);
1590	if (error != 0)
1591		return (error);
1592	so = fp->f_data;
1593	*sa = NULL;
1594	CURVNET_SET(so->so_vnet);
1595	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1596	CURVNET_RESTORE();
1597	if (error != 0)
1598		goto bad;
1599	if (*sa == NULL)
1600		len = 0;
1601	else
1602		len = MIN(*alen, (*sa)->sa_len);
1603	*alen = len;
1604#ifdef KTRACE
1605	if (KTRPOINT(td, KTR_STRUCT))
1606		ktrsockaddr(*sa);
1607#endif
1608bad:
1609	fdrop(fp, td);
1610	if (error != 0 && *sa != NULL) {
1611		free(*sa, M_SONAME);
1612		*sa = NULL;
1613	}
1614	return (error);
1615}
1616
/*
 * getsockname(2): the non-compat entry point into getsockname1().
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1625
1626#ifdef COMPAT_OLDSOCK
/*
 * Old getsockname: getsockname1() with the compat flag set.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
1635#endif /* COMPAT_OLDSOCK */
1636
1637/*
1638 * getpeername1() - Get name of peer for connected socket.
1639 */
1640/* ARGSUSED */
1641static int
1642getpeername1(td, uap, compat)
1643	struct thread *td;
1644	struct getpeername_args /* {
1645		int	fdes;
1646		struct sockaddr * __restrict	asa;
1647		socklen_t * __restrict	alen;
1648	} */ *uap;
1649	int compat;
1650{
1651	struct sockaddr *sa;
1652	socklen_t len;
1653	int error;
1654
1655	error = copyin(uap->alen, &len, sizeof (len));
1656	if (error != 0)
1657		return (error);
1658
1659	error = kern_getpeername(td, uap->fdes, &sa, &len);
1660	if (error != 0)
1661		return (error);
1662
1663	if (len != 0) {
1664#ifdef COMPAT_OLDSOCK
1665		if (compat)
1666			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1667#endif
1668		error = copyout(sa, uap->asa, (u_int)len);
1669	}
1670	free(sa, M_SONAME);
1671	if (error == 0)
1672		error = copyout(&len, uap->alen, sizeof(len));
1673	return (error);
1674}
1675
1676int
1677kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1678    socklen_t *alen)
1679{
1680	struct socket *so;
1681	struct file *fp;
1682	cap_rights_t rights;
1683	socklen_t len;
1684	int error;
1685
1686	AUDIT_ARG_FD(fd);
1687	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME),
1688	    &fp, NULL);
1689	if (error != 0)
1690		return (error);
1691	so = fp->f_data;
1692	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1693		error = ENOTCONN;
1694		goto done;
1695	}
1696	*sa = NULL;
1697	CURVNET_SET(so->so_vnet);
1698	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1699	CURVNET_RESTORE();
1700	if (error != 0)
1701		goto bad;
1702	if (*sa == NULL)
1703		len = 0;
1704	else
1705		len = MIN(*alen, (*sa)->sa_len);
1706	*alen = len;
1707#ifdef KTRACE
1708	if (KTRPOINT(td, KTR_STRUCT))
1709		ktrsockaddr(*sa);
1710#endif
1711bad:
1712	if (error != 0 && *sa != NULL) {
1713		free(*sa, M_SONAME);
1714		*sa = NULL;
1715	}
1716done:
1717	fdrop(fp, td);
1718	return (error);
1719}
1720
/*
 * getpeername(2): the non-compat entry point into getpeername1().
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1729
1730#ifdef COMPAT_OLDSOCK
/*
 * Old getpeername: getpeername1() with the compat flag set.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
1740#endif /* COMPAT_OLDSOCK */
1741
1742int
1743sockargs(mp, buf, buflen, type)
1744	struct mbuf **mp;
1745	caddr_t buf;
1746	int buflen, type;
1747{
1748	struct sockaddr *sa;
1749	struct mbuf *m;
1750	int error;
1751
1752	if (buflen > MLEN) {
1753#ifdef COMPAT_OLDSOCK
1754		if (type == MT_SONAME && buflen <= 112)
1755			buflen = MLEN;		/* unix domain compat. hack */
1756		else
1757#endif
1758			if (buflen > MCLBYTES)
1759				return (EINVAL);
1760	}
1761	m = m_get2(buflen, M_WAITOK, type, 0);
1762	m->m_len = buflen;
1763	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1764	if (error != 0)
1765		(void) m_free(m);
1766	else {
1767		*mp = m;
1768		if (type == MT_SONAME) {
1769			sa = mtod(m, struct sockaddr *);
1770
1771#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1772			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1773				sa->sa_family = sa->sa_len;
1774#endif
1775			sa->sa_len = buflen;
1776		}
1777	}
1778	return (error);
1779}
1780
1781int
1782getsockaddr(namp, uaddr, len)
1783	struct sockaddr **namp;
1784	caddr_t uaddr;
1785	size_t len;
1786{
1787	struct sockaddr *sa;
1788	int error;
1789
1790	if (len > SOCK_MAXADDRLEN)
1791		return (ENAMETOOLONG);
1792	if (len < offsetof(struct sockaddr, sa_data[0]))
1793		return (EINVAL);
1794	sa = malloc(len, M_SONAME, M_WAITOK);
1795	error = copyin(uaddr, sa, len);
1796	if (error != 0) {
1797		free(sa, M_SONAME);
1798	} else {
1799#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1800		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1801			sa->sa_family = sa->sa_len;
1802#endif
1803		sa->sa_len = len;
1804		*namp = sa;
1805	}
1806	return (error);
1807}
1808
1809struct sendfile_sync {
1810	struct mtx	mtx;
1811	struct cv	cv;
1812	unsigned	count;
1813};
1814
1815/*
1816 * Add more references to a vm_page + sf_buf + sendfile_sync.
1817 */
1818void
1819sf_ext_ref(void *arg1, void *arg2)
1820{
1821	struct sf_buf *sf = arg1;
1822	struct sendfile_sync *sfs = arg2;
1823	vm_page_t pg = sf_buf_page(sf);
1824
1825	sf_buf_ref(sf);
1826
1827	vm_page_lock(pg);
1828	vm_page_wire(pg);
1829	vm_page_unlock(pg);
1830
1831	if (sfs != NULL) {
1832		mtx_lock(&sfs->mtx);
1833		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
1834		sfs->count++;
1835		mtx_unlock(&sfs->mtx);
1836	}
1837}
1838
1839/*
1840 * Detach mapped page and release resources back to the system.
1841 */
1842void
1843sf_ext_free(void *arg1, void *arg2)
1844{
1845	struct sf_buf *sf = arg1;
1846	struct sendfile_sync *sfs = arg2;
1847	vm_page_t pg = sf_buf_page(sf);
1848
1849	sf_buf_free(sf);
1850
1851	vm_page_lock(pg);
1852	vm_page_unwire(pg, PQ_INACTIVE);
1853	/*
1854	 * Check for the object going away on us. This can
1855	 * happen since we don't hold a reference to it.
1856	 * If so, we're responsible for freeing the page.
1857	 */
1858	if (pg->wire_count == 0 && pg->object == NULL)
1859		vm_page_free(pg);
1860	vm_page_unlock(pg);
1861
1862	if (sfs != NULL) {
1863		mtx_lock(&sfs->mtx);
1864		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
1865		if (--sfs->count == 0)
1866			cv_signal(&sfs->cv);
1867		mtx_unlock(&sfs->mtx);
1868	}
1869}
1870
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file referred to by 'fd', beginning at 'offset', over
 * the socket 's'.  'nbytes' limits how much of the file is sent; a
 * value of 0 means "up to EOF".  A header and/or trailer may be added
 * around the file data through 'hdtr', and if 'sbytes' is given the
 * total byte count sent is stored there.
 *
 * This is the current (non-compat) entry point.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
1888
1889static int
1890do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1891{
1892	struct sf_hdtr hdtr;
1893	struct uio *hdr_uio, *trl_uio;
1894	struct file *fp;
1895	cap_rights_t rights;
1896	off_t sbytes;
1897	int error;
1898
1899	/*
1900	 * File offset must be positive.  If it goes beyond EOF
1901	 * we send only the header/trailer and no payload data.
1902	 */
1903	if (uap->offset < 0)
1904		return (EINVAL);
1905
1906	hdr_uio = trl_uio = NULL;
1907
1908	if (uap->hdtr != NULL) {
1909		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1910		if (error != 0)
1911			goto out;
1912		if (hdtr.headers != NULL) {
1913			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
1914			    &hdr_uio);
1915			if (error != 0)
1916				goto out;
1917		}
1918		if (hdtr.trailers != NULL) {
1919			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
1920			    &trl_uio);
1921			if (error != 0)
1922				goto out;
1923		}
1924	}
1925
1926	AUDIT_ARG_FD(uap->fd);
1927
1928	/*
1929	 * sendfile(2) can start at any offset within a file so we require
1930	 * CAP_READ+CAP_SEEK = CAP_PREAD.
1931	 */
1932	if ((error = fget_read(td, uap->fd,
1933	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
1934		goto out;
1935	}
1936
1937	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
1938	    uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
1939	fdrop(fp, td);
1940
1941	if (uap->sbytes != NULL)
1942		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1943
1944out:
1945	free(hdr_uio, M_IOV);
1946	free(trl_uio, M_IOV);
1947	return (error);
1948}
1949
1950#ifdef COMPAT_FREEBSD4
1951int
1952freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1953{
1954	struct sendfile_args args;
1955
1956	args.fd = uap->fd;
1957	args.s = uap->s;
1958	args.offset = uap->offset;
1959	args.nbytes = uap->nbytes;
1960	args.hdtr = uap->hdtr;
1961	args.sbytes = uap->sbytes;
1962	args.flags = uap->flags;
1963
1964	return (do_sendfile(td, &args, 1));
1965}
1966#endif /* COMPAT_FREEBSD4 */
1967
1968static int
1969sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
1970    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
1971{
1972	vm_page_t m;
1973	vm_pindex_t pindex;
1974	ssize_t resid;
1975	int error, readahead, rv;
1976
1977	pindex = OFF_TO_IDX(off);
1978	VM_OBJECT_WLOCK(obj);
1979	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
1980	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
1981
1982	/*
1983	 * Check if page is valid for what we need, otherwise initiate I/O.
1984	 *
1985	 * The non-zero nd argument prevents disk I/O, instead we
1986	 * return the caller what he specified in nd.  In particular,
1987	 * if we already turned some pages into mbufs, nd == EAGAIN
1988	 * and the main function send them the pages before we come
1989	 * here again and block.
1990	 */
1991	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
1992		if (vp == NULL)
1993			vm_page_xunbusy(m);
1994		VM_OBJECT_WUNLOCK(obj);
1995		*res = m;
1996		return (0);
1997	} else if (nd != 0) {
1998		if (vp == NULL)
1999			vm_page_xunbusy(m);
2000		error = nd;
2001		goto free_page;
2002	}
2003
2004	/*
2005	 * Get the page from backing store.
2006	 */
2007	error = 0;
2008	if (vp != NULL) {
2009		VM_OBJECT_WUNLOCK(obj);
2010		readahead = sfreadahead * MAXBSIZE;
2011
2012		/*
2013		 * Use vn_rdwr() instead of the pager interface for
2014		 * the vnode, to allow the read-ahead.
2015		 *
2016		 * XXXMAC: Because we don't have fp->f_cred here, we
2017		 * pass in NOCRED.  This is probably wrong, but is
2018		 * consistent with our original implementation.
2019		 */
2020		error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
2021		    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
2022		    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
2023		SFSTAT_INC(sf_iocnt);
2024		VM_OBJECT_WLOCK(obj);
2025	} else {
2026		if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
2027			rv = vm_pager_get_pages(obj, &m, 1, 0);
2028			SFSTAT_INC(sf_iocnt);
2029			if (rv != VM_PAGER_OK) {
2030				vm_page_lock(m);
2031				vm_page_free(m);
2032				vm_page_unlock(m);
2033				m = NULL;
2034				error = EIO;
2035			}
2036		} else {
2037			pmap_zero_page(m);
2038			m->valid = VM_PAGE_BITS_ALL;
2039			m->dirty = 0;
2040		}
2041		if (m != NULL)
2042			vm_page_xunbusy(m);
2043	}
2044	if (error == 0) {
2045		*res = m;
2046	} else if (m != NULL) {
2047free_page:
2048		vm_page_lock(m);
2049		vm_page_unwire(m, PQ_INACTIVE);
2050
2051		/*
2052		 * See if anyone else might know about this page.  If
2053		 * not and it is not valid, then free it.
2054		 */
2055		if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
2056			vm_page_free(m);
2057		vm_page_unlock(m);
2058	}
2059	KASSERT(error != 0 || (m->wire_count > 0 &&
2060	    vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
2061	    ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
2062	    xfsize));
2063	VM_OBJECT_WUNLOCK(obj);
2064	return (error);
2065}
2066
2067static int
2068sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
2069    struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
2070    int *bsize)
2071{
2072	struct vattr va;
2073	vm_object_t obj;
2074	struct vnode *vp;
2075	struct shmfd *shmfd;
2076	int error;
2077
2078	vp = *vp_res = NULL;
2079	obj = NULL;
2080	shmfd = *shmfd_res = NULL;
2081	*bsize = 0;
2082
2083	/*
2084	 * The file descriptor must be a regular file and have a
2085	 * backing VM object.
2086	 */
2087	if (fp->f_type == DTYPE_VNODE) {
2088		vp = fp->f_vnode;
2089		vn_lock(vp, LK_SHARED | LK_RETRY);
2090		if (vp->v_type != VREG) {
2091			error = EINVAL;
2092			goto out;
2093		}
2094		*bsize = vp->v_mount->mnt_stat.f_iosize;
2095		error = VOP_GETATTR(vp, &va, td->td_ucred);
2096		if (error != 0)
2097			goto out;
2098		*obj_size = va.va_size;
2099		obj = vp->v_object;
2100		if (obj == NULL) {
2101			error = EINVAL;
2102			goto out;
2103		}
2104	} else if (fp->f_type == DTYPE_SHM) {
2105		error = 0;
2106		shmfd = fp->f_data;
2107		obj = shmfd->shm_object;
2108		*obj_size = shmfd->shm_size;
2109	} else {
2110		error = EINVAL;
2111		goto out;
2112	}
2113
2114	VM_OBJECT_WLOCK(obj);
2115	if ((obj->flags & OBJ_DEAD) != 0) {
2116		VM_OBJECT_WUNLOCK(obj);
2117		error = EBADF;
2118		goto out;
2119	}
2120
2121	/*
2122	 * Temporarily increase the backing VM object's reference
2123	 * count so that a forced reclamation of its vnode does not
2124	 * immediately destroy it.
2125	 */
2126	vm_object_reference_locked(obj);
2127	VM_OBJECT_WUNLOCK(obj);
2128	*obj_res = obj;
2129	*vp_res = vp;
2130	*shmfd_res = shmfd;
2131
2132out:
2133	if (vp != NULL)
2134		VOP_UNLOCK(vp, 0);
2135	return (error);
2136}
2137
2138static int
2139kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
2140    struct socket **so)
2141{
2142	cap_rights_t rights;
2143	int error;
2144
2145	*sock_fp = NULL;
2146	*so = NULL;
2147
2148	/*
2149	 * The socket must be a stream socket and connected.
2150	 */
2151	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
2152	    sock_fp, NULL);
2153	if (error != 0)
2154		return (error);
2155	*so = (*sock_fp)->f_data;
2156	if ((*so)->so_type != SOCK_STREAM)
2157		return (EINVAL);
2158	if (((*so)->so_state & SS_ISCONNECTED) == 0)
2159		return (ENOTCONN);
2160	return (0);
2161}
2162
2163int
2164vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
2165    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
2166    int kflags, struct thread *td)
2167{
2168	struct file *sock_fp;
2169	struct vnode *vp;
2170	struct vm_object *obj;
2171	struct socket *so;
2172	struct mbuf *m;
2173	struct sf_buf *sf;
2174	struct vm_page *pg;
2175	struct shmfd *shmfd;
2176	struct sendfile_sync *sfs;
2177	struct vattr va;
2178	off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
2179	int error, bsize, nd, hdrlen, mnw;
2180
2181	pg = NULL;
2182	obj = NULL;
2183	so = NULL;
2184	m = NULL;
2185	sfs = NULL;
2186	fsbytes = sbytes = 0;
2187	hdrlen = mnw = 0;
2188	rem = nbytes;
2189	obj_size = 0;
2190
2191	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
2192	if (error != 0)
2193		return (error);
2194	if (rem == 0)
2195		rem = obj_size;
2196
2197	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
2198	if (error != 0)
2199		goto out;
2200
2201	/*
2202	 * Do not wait on memory allocations but return ENOMEM for
2203	 * caller to retry later.
2204	 * XXX: Experimental.
2205	 */
2206	if (flags & SF_MNOWAIT)
2207		mnw = 1;
2208
2209	if (flags & SF_SYNC) {
2210		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
2211		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2212		cv_init(&sfs->cv, "sendfile");
2213	}
2214
2215#ifdef MAC
2216	error = mac_socket_check_send(td->td_ucred, so);
2217	if (error != 0)
2218		goto out;
2219#endif
2220
2221	/* If headers are specified copy them into mbufs. */
2222	if (hdr_uio != NULL) {
2223		hdr_uio->uio_td = td;
2224		hdr_uio->uio_rw = UIO_WRITE;
2225		if (hdr_uio->uio_resid > 0) {
2226			/*
2227			 * In FBSD < 5.0 the nbytes to send also included
2228			 * the header.  If compat is specified subtract the
2229			 * header size from nbytes.
2230			 */
2231			if (kflags & SFK_COMPAT) {
2232				if (nbytes > hdr_uio->uio_resid)
2233					nbytes -= hdr_uio->uio_resid;
2234				else
2235					nbytes = 0;
2236			}
2237			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
2238			    0, 0, 0);
2239			if (m == NULL) {
2240				error = mnw ? EAGAIN : ENOBUFS;
2241				goto out;
2242			}
2243			hdrlen = m_length(m, NULL);
2244		}
2245	}
2246
2247	/*
2248	 * Protect against multiple writers to the socket.
2249	 *
2250	 * XXXRW: Historically this has assumed non-interruptibility, so now
2251	 * we implement that, but possibly shouldn't.
2252	 */
2253	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
2254
2255	/*
2256	 * Loop through the pages of the file, starting with the requested
2257	 * offset. Get a file page (do I/O if necessary), map the file page
2258	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2259	 * it on the socket.
2260	 * This is done in two loops.  The inner loop turns as many pages
2261	 * as it can, up to available socket buffer space, without blocking
2262	 * into mbufs to have it bulk delivered into the socket send buffer.
2263	 * The outer loop checks the state and available space of the socket
2264	 * and takes care of the overall progress.
2265	 */
2266	for (off = offset; ; ) {
2267		struct mbuf *mtail;
2268		int loopbytes;
2269		int space;
2270		int done;
2271
2272		if ((nbytes != 0 && nbytes == fsbytes) ||
2273		    (nbytes == 0 && obj_size == fsbytes))
2274			break;
2275
2276		mtail = NULL;
2277		loopbytes = 0;
2278		space = 0;
2279		done = 0;
2280
2281		/*
2282		 * Check the socket state for ongoing connection,
2283		 * no errors and space in socket buffer.
2284		 * If space is low allow for the remainder of the
2285		 * file to be processed if it fits the socket buffer.
2286		 * Otherwise block in waiting for sufficient space
2287		 * to proceed, or if the socket is nonblocking, return
2288		 * to userland with EAGAIN while reporting how far
2289		 * we've come.
2290		 * We wait until the socket buffer has significant free
2291		 * space to do bulk sends.  This makes good use of file
2292		 * system read ahead and allows packet segmentation
2293		 * offloading hardware to take over lots of work.  If
2294		 * we were not careful here we would send off only one
2295		 * sfbuf at a time.
2296		 */
2297		SOCKBUF_LOCK(&so->so_snd);
2298		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2299			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2300retry_space:
2301		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2302			error = EPIPE;
2303			SOCKBUF_UNLOCK(&so->so_snd);
2304			goto done;
2305		} else if (so->so_error) {
2306			error = so->so_error;
2307			so->so_error = 0;
2308			SOCKBUF_UNLOCK(&so->so_snd);
2309			goto done;
2310		}
2311		space = sbspace(&so->so_snd);
2312		if (space < rem &&
2313		    (space <= 0 ||
2314		     space < so->so_snd.sb_lowat)) {
2315			if (so->so_state & SS_NBIO) {
2316				SOCKBUF_UNLOCK(&so->so_snd);
2317				error = EAGAIN;
2318				goto done;
2319			}
2320			/*
2321			 * sbwait drops the lock while sleeping.
2322			 * When we loop back to retry_space the
2323			 * state may have changed and we retest
2324			 * for it.
2325			 */
2326			error = sbwait(&so->so_snd);
2327			/*
2328			 * An error from sbwait usually indicates that we've
2329			 * been interrupted by a signal. If we've sent anything
2330			 * then return bytes sent, otherwise return the error.
2331			 */
2332			if (error != 0) {
2333				SOCKBUF_UNLOCK(&so->so_snd);
2334				goto done;
2335			}
2336			goto retry_space;
2337		}
2338		SOCKBUF_UNLOCK(&so->so_snd);
2339
2340		/*
2341		 * Reduce space in the socket buffer by the size of
2342		 * the header mbuf chain.
2343		 * hdrlen is set to 0 after the first loop.
2344		 */
2345		space -= hdrlen;
2346
2347		if (vp != NULL) {
2348			error = vn_lock(vp, LK_SHARED);
2349			if (error != 0)
2350				goto done;
2351			error = VOP_GETATTR(vp, &va, td->td_ucred);
2352			if (error != 0 || off >= va.va_size) {
2353				VOP_UNLOCK(vp, 0);
2354				goto done;
2355			}
2356			obj_size = va.va_size;
2357		}
2358
2359		/*
2360		 * Loop and construct maximum sized mbuf chain to be bulk
2361		 * dumped into socket buffer.
2362		 */
2363		while (space > loopbytes) {
2364			vm_offset_t pgoff;
2365			struct mbuf *m0;
2366
2367			/*
2368			 * Calculate the amount to transfer.
2369			 * Not to exceed a page, the EOF,
2370			 * or the passed in nbytes.
2371			 */
2372			pgoff = (vm_offset_t)(off & PAGE_MASK);
2373			rem = obj_size - offset;
2374			if (nbytes != 0)
2375				rem = omin(rem, nbytes);
2376			rem -= fsbytes + loopbytes;
2377			xfsize = omin(PAGE_SIZE - pgoff, rem);
2378			xfsize = omin(space - loopbytes, xfsize);
2379			if (xfsize <= 0) {
2380				done = 1;		/* all data sent */
2381				break;
2382			}
2383
2384			/*
2385			 * Attempt to look up the page.  Allocate
2386			 * if not found or wait and loop if busy.
2387			 */
2388			if (m != NULL)
2389				nd = EAGAIN; /* send what we already got */
2390			else if ((flags & SF_NODISKIO) != 0)
2391				nd = EBUSY;
2392			else
2393				nd = 0;
2394			error = sendfile_readpage(obj, vp, nd, off,
2395			    xfsize, bsize, td, &pg);
2396			if (error != 0) {
2397				if (error == EAGAIN)
2398					error = 0;	/* not a real error */
2399				break;
2400			}
2401
2402			/*
2403			 * Get a sendfile buf.  When allocating the
2404			 * first buffer for mbuf chain, we usually
2405			 * wait as long as necessary, but this wait
2406			 * can be interrupted.  For consequent
2407			 * buffers, do not sleep, since several
2408			 * threads might exhaust the buffers and then
2409			 * deadlock.
2410			 */
2411			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
2412			    SFB_CATCH);
2413			if (sf == NULL) {
2414				SFSTAT_INC(sf_allocfail);
2415				vm_page_lock(pg);
2416				vm_page_unwire(pg, PQ_INACTIVE);
2417				KASSERT(pg->object != NULL,
2418				    ("%s: object disappeared", __func__));
2419				vm_page_unlock(pg);
2420				if (m == NULL)
2421					error = (mnw ? EAGAIN : EINTR);
2422				break;
2423			}
2424
2425			/*
2426			 * Get an mbuf and set it up as having
2427			 * external storage.
2428			 */
2429			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2430			if (m0 == NULL) {
2431				error = (mnw ? EAGAIN : ENOBUFS);
2432				sf_ext_free(sf, NULL);
2433				break;
2434			}
2435			/*
2436			 * Attach EXT_SFBUF external storage.
2437			 */
2438			m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf);
2439			m0->m_ext.ext_size = PAGE_SIZE;
2440			m0->m_ext.ext_arg1 = sf;
2441			m0->m_ext.ext_arg2 = sfs;
2442			m0->m_ext.ext_type = EXT_SFBUF;
2443			m0->m_ext.ext_flags = 0;
2444			m0->m_flags |= (M_EXT|M_RDONLY);
2445			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2446			m0->m_len = xfsize;
2447
2448			/* Append to mbuf chain. */
2449			if (mtail != NULL)
2450				mtail->m_next = m0;
2451			else if (m != NULL)
2452				m_last(m)->m_next = m0;
2453			else
2454				m = m0;
2455			mtail = m0;
2456
2457			/* Keep track of bits processed. */
2458			loopbytes += xfsize;
2459			off += xfsize;
2460
2461			if (sfs != NULL) {
2462				mtx_lock(&sfs->mtx);
2463				sfs->count++;
2464				mtx_unlock(&sfs->mtx);
2465			}
2466		}
2467
2468		if (vp != NULL)
2469			VOP_UNLOCK(vp, 0);
2470
2471		/* Add the buffer chain to the socket buffer. */
2472		if (m != NULL) {
2473			int mlen, err;
2474
2475			mlen = m_length(m, NULL);
2476			SOCKBUF_LOCK(&so->so_snd);
2477			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2478				error = EPIPE;
2479				SOCKBUF_UNLOCK(&so->so_snd);
2480				goto done;
2481			}
2482			SOCKBUF_UNLOCK(&so->so_snd);
2483			CURVNET_SET(so->so_vnet);
2484			/* Avoid error aliasing. */
2485			err = (*so->so_proto->pr_usrreqs->pru_send)
2486				    (so, 0, m, NULL, NULL, td);
2487			CURVNET_RESTORE();
2488			if (err == 0) {
2489				/*
2490				 * We need two counters to get the
2491				 * file offset and nbytes to send
2492				 * right:
2493				 * - sbytes contains the total amount
2494				 *   of bytes sent, including headers.
2495				 * - fsbytes contains the total amount
2496				 *   of bytes sent from the file.
2497				 */
2498				sbytes += mlen;
2499				fsbytes += mlen;
2500				if (hdrlen) {
2501					fsbytes -= hdrlen;
2502					hdrlen = 0;
2503				}
2504			} else if (error == 0)
2505				error = err;
2506			m = NULL;	/* pru_send always consumes */
2507		}
2508
2509		/* Quit outer loop on error or when we're done. */
2510		if (done)
2511			break;
2512		if (error != 0)
2513			goto done;
2514	}
2515
2516	/*
2517	 * Send trailers. Wimp out and use writev(2).
2518	 */
2519	if (trl_uio != NULL) {
2520		sbunlock(&so->so_snd);
2521		error = kern_writev(td, sockfd, trl_uio);
2522		if (error == 0)
2523			sbytes += td->td_retval[0];
2524		goto out;
2525	}
2526
2527done:
2528	sbunlock(&so->so_snd);
2529out:
2530	/*
2531	 * If there was no error we have to clear td->td_retval[0]
2532	 * because it may have been set by writev.
2533	 */
2534	if (error == 0) {
2535		td->td_retval[0] = 0;
2536	}
2537	if (sent != NULL) {
2538		(*sent) = sbytes;
2539	}
2540	if (obj != NULL)
2541		vm_object_deallocate(obj);
2542	if (so)
2543		fdrop(sock_fp, td);
2544	if (m)
2545		m_freem(m);
2546
2547	if (sfs != NULL) {
2548		mtx_lock(&sfs->mtx);
2549		if (sfs->count != 0)
2550			cv_wait(&sfs->cv, &sfs->mtx);
2551		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2552		cv_destroy(&sfs->cv);
2553		mtx_destroy(&sfs->mtx);
2554		free(sfs, M_TEMP);
2555	}
2556
2557	if (error == ERESTART)
2558		error = EINTR;
2559
2560	return (error);
2561}
2562