uipc_syscalls.c revision 147784
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 147784 2005-07-05 22:49:10Z rwatson $");
37
38#include "opt_compat.h"
39#include "opt_ktrace.h"
40#include "opt_mac.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mac.h>
47#include <sys/mutex.h>
48#include <sys/sysproto.h>
49#include <sys/malloc.h>
50#include <sys/filedesc.h>
51#include <sys/event.h>
52#include <sys/proc.h>
53#include <sys/fcntl.h>
54#include <sys/file.h>
55#include <sys/filio.h>
56#include <sys/mount.h>
57#include <sys/mbuf.h>
58#include <sys/protosw.h>
59#include <sys/sf_buf.h>
60#include <sys/socket.h>
61#include <sys/socketvar.h>
62#include <sys/signalvar.h>
63#include <sys/syscallsubr.h>
64#include <sys/sysctl.h>
65#include <sys/uio.h>
66#include <sys/vnode.h>
67#ifdef KTRACE
68#include <sys/ktrace.h>
69#endif
70
71#include <vm/vm.h>
72#include <vm/vm_object.h>
73#include <vm/vm_page.h>
74#include <vm/vm_pageout.h>
75#include <vm/vm_kern.h>
76#include <vm/vm_extern.h>
77
/*
 * Prototypes for the file-local (static) helpers, in alphabetical order.
 */
static int	accept1(struct thread *td, struct accept_args *uap, int compat);
static int	do_sendfile(struct thread *td, struct sendfile_args *uap,
		    int compat);
static int	getpeername1(struct thread *td, struct getpeername_args *uap,
		    int compat);
static int	getsockname1(struct thread *td, struct getsockname_args *uap,
		    int compat);
static int	recvit(struct thread *td, int s, struct msghdr *mp,
		    void *namelenp);
static int	sendit(struct thread *td, int s, struct msghdr *mp, int flags);
87
88/*
89 * NSFBUFS-related variables and associated sysctls
90 */
91int nsfbufs;
92int nsfbufspeak;
93int nsfbufsused;
94
95SYSCTL_DECL(_kern_ipc);
96SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
97    "Maximum number of sendfile(2) sf_bufs available");
98SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
99    "Number of sendfile(2) sf_bufs at peak usage");
100SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
101    "Number of sendfile(2) sf_bufs in use");
102
103/*
104 * Convert a user file descriptor to a kernel file entry.  A reference on the
105 * file entry is held upon returning.  This is lighter weight than
106 * fgetsock(), which bumps the socket reference drops the file reference
107 * count instead, as this approach avoids several additional mutex operations
108 * associated with the additional reference count.
109 */
110static int
111getsock(struct filedesc *fdp, int fd, struct file **fpp)
112{
113	struct file *fp;
114	int error;
115
116	fp = NULL;
117	if (fdp == NULL)
118		error = EBADF;
119	else {
120		FILEDESC_LOCK_FAST(fdp);
121		fp = fget_locked(fdp, fd);
122		if (fp == NULL)
123			error = EBADF;
124		else if (fp->f_type != DTYPE_SOCKET) {
125			fp = NULL;
126			error = ENOTSOCK;
127		} else {
128			fhold(fp);
129			error = 0;
130		}
131		FILEDESC_UNLOCK_FAST(fdp);
132	}
133	*fpp = fp;
134	return (error);
135}
136
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_OLDSOCK gates the old-socket entry points and sockaddr
 * translations in this file; it is enabled whenever COMPAT_43 is defined.
 */
#ifdef COMPAT_43
#define COMPAT_OLDSOCK
#endif
143
144/*
145 * MPSAFE
146 */
147int
148socket(td, uap)
149	struct thread *td;
150	register struct socket_args /* {
151		int	domain;
152		int	type;
153		int	protocol;
154	} */ *uap;
155{
156	struct filedesc *fdp;
157	struct socket *so;
158	struct file *fp;
159	int fd, error;
160
161#ifdef MAC
162	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
163	    uap->protocol);
164	if (error)
165		return (error);
166#endif
167	fdp = td->td_proc->p_fd;
168	error = falloc(td, &fp, &fd);
169	if (error)
170		return (error);
171	/* An extra reference on `fp' has been held for us by falloc(). */
172	NET_LOCK_GIANT();
173	error = socreate(uap->domain, &so, uap->type, uap->protocol,
174	    td->td_ucred, td);
175	NET_UNLOCK_GIANT();
176	if (error) {
177		fdclose(fdp, fp, fd, td);
178	} else {
179		FILEDESC_LOCK_FAST(fdp);
180		fp->f_data = so;	/* already has ref count */
181		fp->f_flag = FREAD|FWRITE;
182		fp->f_ops = &socketops;
183		fp->f_type = DTYPE_SOCKET;
184		FILEDESC_UNLOCK_FAST(fdp);
185		td->td_retval[0] = fd;
186	}
187	fdrop(fp, td);
188	return (error);
189}
190
191/*
192 * MPSAFE
193 */
194/* ARGSUSED */
195int
196bind(td, uap)
197	struct thread *td;
198	register struct bind_args /* {
199		int	s;
200		caddr_t	name;
201		int	namelen;
202	} */ *uap;
203{
204	struct sockaddr *sa;
205	int error;
206
207	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
208		return (error);
209
210	return (kern_bind(td, uap->s, sa));
211}
212
213int
214kern_bind(td, fd, sa)
215	struct thread *td;
216	int fd;
217	struct sockaddr *sa;
218{
219	struct socket *so;
220	struct file *fp;
221	int error;
222
223	NET_LOCK_GIANT();
224	error = getsock(td->td_proc->p_fd, fd, &fp);
225	if (error)
226		goto done2;
227	so = fp->f_data;
228#ifdef MAC
229	SOCK_LOCK(so);
230	error = mac_check_socket_bind(td->td_ucred, so, sa);
231	SOCK_UNLOCK(so);
232	if (error)
233		goto done1;
234#endif
235	error = sobind(so, sa, td);
236#ifdef MAC
237done1:
238#endif
239	fdrop(fp, td);
240done2:
241	NET_UNLOCK_GIANT();
242	FREE(sa, M_SONAME);
243	return (error);
244}
245
246/*
247 * MPSAFE
248 */
249/* ARGSUSED */
250int
251listen(td, uap)
252	struct thread *td;
253	register struct listen_args /* {
254		int	s;
255		int	backlog;
256	} */ *uap;
257{
258	struct socket *so;
259	struct file *fp;
260	int error;
261
262	NET_LOCK_GIANT();
263	error = getsock(td->td_proc->p_fd, uap->s, &fp);
264	if (error == 0) {
265		so = fp->f_data;
266#ifdef MAC
267		SOCK_LOCK(so);
268		error = mac_check_socket_listen(td->td_ucred, so);
269		SOCK_UNLOCK(so);
270		if (error)
271			goto done;
272#endif
273		error = solisten(so, uap->backlog, td);
274#ifdef MAC
275done:
276#endif
277		fdrop(fp, td);
278	}
279	NET_UNLOCK_GIANT();
280	return(error);
281}
282
283/*
284 * accept1()
285 * MPSAFE
286 */
287static int
288accept1(td, uap, compat)
289	struct thread *td;
290	register struct accept_args /* {
291		int	s;
292		struct sockaddr	* __restrict name;
293		socklen_t	* __restrict anamelen;
294	} */ *uap;
295	int compat;
296{
297	struct filedesc *fdp;
298	struct file *nfp = NULL;
299	struct sockaddr *sa = NULL;
300	socklen_t namelen;
301	int error;
302	struct socket *head, *so;
303	int fd;
304	u_int fflag;
305	pid_t pgid;
306	int tmp;
307
308	fdp = td->td_proc->p_fd;
309	if (uap->name) {
310		error = copyin(uap->anamelen, &namelen, sizeof (namelen));
311		if(error)
312			return (error);
313		if (namelen < 0)
314			return (EINVAL);
315	}
316	NET_LOCK_GIANT();
317	error = fgetsock(td, uap->s, &head, &fflag);
318	if (error)
319		goto done2;
320	if ((head->so_options & SO_ACCEPTCONN) == 0) {
321		error = EINVAL;
322		goto done;
323	}
324#ifdef MAC
325	SOCK_LOCK(head);
326	error = mac_check_socket_accept(td->td_ucred, head);
327	SOCK_UNLOCK(head);
328	if (error != 0)
329		goto done;
330#endif
331	error = falloc(td, &nfp, &fd);
332	if (error)
333		goto done;
334	ACCEPT_LOCK();
335	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
336		ACCEPT_UNLOCK();
337		error = EWOULDBLOCK;
338		goto noconnection;
339	}
340	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
341		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
342			head->so_error = ECONNABORTED;
343			break;
344		}
345		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
346		    "accept", 0);
347		if (error) {
348			ACCEPT_UNLOCK();
349			goto noconnection;
350		}
351	}
352	if (head->so_error) {
353		error = head->so_error;
354		head->so_error = 0;
355		ACCEPT_UNLOCK();
356		goto noconnection;
357	}
358	so = TAILQ_FIRST(&head->so_comp);
359	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
360	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
361
362	/*
363	 * Before changing the flags on the socket, we have to bump the
364	 * reference count.  Otherwise, if the protocol calls sofree(),
365	 * the socket will be released due to a zero refcount.
366	 */
367	SOCK_LOCK(so);			/* soref() and so_state update */
368	soref(so);			/* file descriptor reference */
369
370	TAILQ_REMOVE(&head->so_comp, so, so_list);
371	head->so_qlen--;
372	so->so_state |= (head->so_state & SS_NBIO);
373	so->so_qstate &= ~SQ_COMP;
374	so->so_head = NULL;
375
376	SOCK_UNLOCK(so);
377	ACCEPT_UNLOCK();
378
379	/* An extra reference on `nfp' has been held for us by falloc(). */
380	td->td_retval[0] = fd;
381
382	/* connection has been removed from the listen queue */
383	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
384
385	pgid = fgetown(&head->so_sigio);
386	if (pgid != 0)
387		fsetown(pgid, &so->so_sigio);
388
389	FILE_LOCK(nfp);
390	nfp->f_data = so;	/* nfp has ref count from falloc */
391	nfp->f_flag = fflag;
392	nfp->f_ops = &socketops;
393	nfp->f_type = DTYPE_SOCKET;
394	FILE_UNLOCK(nfp);
395	/* Sync socket nonblocking/async state with file flags */
396	tmp = fflag & FNONBLOCK;
397	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
398	tmp = fflag & FASYNC;
399	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
400	sa = 0;
401	error = soaccept(so, &sa);
402	if (error) {
403		/*
404		 * return a namelen of zero for older code which might
405		 * ignore the return value from accept.
406		 */
407		if (uap->name != NULL) {
408			namelen = 0;
409			(void) copyout(&namelen,
410			    uap->anamelen, sizeof(*uap->anamelen));
411		}
412		goto noconnection;
413	}
414	if (sa == NULL) {
415		namelen = 0;
416		if (uap->name)
417			goto gotnoname;
418		error = 0;
419		goto done;
420	}
421	if (uap->name) {
422		/* check sa_len before it is destroyed */
423		if (namelen > sa->sa_len)
424			namelen = sa->sa_len;
425#ifdef COMPAT_OLDSOCK
426		if (compat)
427			((struct osockaddr *)sa)->sa_family =
428			    sa->sa_family;
429#endif
430		error = copyout(sa, uap->name, (u_int)namelen);
431		if (!error)
432gotnoname:
433			error = copyout(&namelen,
434			    uap->anamelen, sizeof (*uap->anamelen));
435	}
436noconnection:
437	if (sa)
438		FREE(sa, M_SONAME);
439
440	/*
441	 * close the new descriptor, assuming someone hasn't ripped it
442	 * out from under us.
443	 */
444	if (error)
445		fdclose(fdp, nfp, fd, td);
446
447	/*
448	 * Release explicitly held references before returning.
449	 */
450done:
451	if (nfp != NULL)
452		fdrop(nfp, td);
453	fputsock(head);
454done2:
455	NET_UNLOCK_GIANT();
456	return (error);
457}
458
/*
 * accept() system call: accept1() without old-sockaddr translation.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(struct thread *td, struct accept_args *uap)
{
	int error;

	error = accept1(td, uap, 0);
	return (error);
}
470
#ifdef COMPAT_OLDSOCK
/*
 * oaccept(): old-socket variant of accept(); calls accept1() with the
 * compat flag set.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{
	int error;

	error = accept1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
484
485/*
486 * MPSAFE
487 */
488/* ARGSUSED */
489int
490connect(td, uap)
491	struct thread *td;
492	register struct connect_args /* {
493		int	s;
494		caddr_t	name;
495		int	namelen;
496	} */ *uap;
497{
498	struct sockaddr *sa;
499	int error;
500
501	error = getsockaddr(&sa, uap->name, uap->namelen);
502	if (error)
503		return (error);
504
505	return (kern_connect(td, uap->s, sa));
506}
507
508
509int
510kern_connect(td, fd, sa)
511	struct thread *td;
512	int fd;
513	struct sockaddr *sa;
514{
515	struct socket *so;
516	struct file *fp;
517	int error;
518	int interrupted = 0;
519
520	NET_LOCK_GIANT();
521	error = getsock(td->td_proc->p_fd, fd, &fp);
522	if (error)
523		goto done2;
524	so = fp->f_data;
525	if (so->so_state & SS_ISCONNECTING) {
526		error = EALREADY;
527		goto done1;
528	}
529#ifdef MAC
530	SOCK_LOCK(so);
531	error = mac_check_socket_connect(td->td_ucred, so, sa);
532	SOCK_UNLOCK(so);
533	if (error)
534		goto bad;
535#endif
536	error = soconnect(so, sa, td);
537	if (error)
538		goto bad;
539	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
540		error = EINPROGRESS;
541		goto done1;
542	}
543	SOCK_LOCK(so);
544	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
545		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
546		    "connec", 0);
547		if (error) {
548			if (error == EINTR || error == ERESTART)
549				interrupted = 1;
550			break;
551		}
552	}
553	if (error == 0) {
554		error = so->so_error;
555		so->so_error = 0;
556	}
557	SOCK_UNLOCK(so);
558bad:
559	if (!interrupted)
560		so->so_state &= ~SS_ISCONNECTING;
561	if (error == ERESTART)
562		error = EINTR;
563done1:
564	fdrop(fp, td);
565done2:
566	NET_UNLOCK_GIANT();
567	FREE(sa, M_SONAME);
568	return (error);
569}
570
571/*
572 * MPSAFE
573 */
574int
575socketpair(td, uap)
576	struct thread *td;
577	register struct socketpair_args /* {
578		int	domain;
579		int	type;
580		int	protocol;
581		int	*rsv;
582	} */ *uap;
583{
584	register struct filedesc *fdp = td->td_proc->p_fd;
585	struct file *fp1, *fp2;
586	struct socket *so1, *so2;
587	int fd, error, sv[2];
588
589#ifdef MAC
590	/* We might want to have a separate check for socket pairs. */
591	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
592	    uap->protocol);
593	if (error)
594		return (error);
595#endif
596
597	NET_LOCK_GIANT();
598	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
599	    td->td_ucred, td);
600	if (error)
601		goto done2;
602	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
603	    td->td_ucred, td);
604	if (error)
605		goto free1;
606	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
607	error = falloc(td, &fp1, &fd);
608	if (error)
609		goto free2;
610	sv[0] = fd;
611	fp1->f_data = so1;	/* so1 already has ref count */
612	error = falloc(td, &fp2, &fd);
613	if (error)
614		goto free3;
615	fp2->f_data = so2;	/* so2 already has ref count */
616	sv[1] = fd;
617	error = soconnect2(so1, so2);
618	if (error)
619		goto free4;
620	if (uap->type == SOCK_DGRAM) {
621		/*
622		 * Datagram socket connection is asymmetric.
623		 */
624		 error = soconnect2(so2, so1);
625		 if (error)
626			goto free4;
627	}
628	FILE_LOCK(fp1);
629	fp1->f_flag = FREAD|FWRITE;
630	fp1->f_ops = &socketops;
631	fp1->f_type = DTYPE_SOCKET;
632	FILE_UNLOCK(fp1);
633	FILE_LOCK(fp2);
634	fp2->f_flag = FREAD|FWRITE;
635	fp2->f_ops = &socketops;
636	fp2->f_type = DTYPE_SOCKET;
637	FILE_UNLOCK(fp2);
638	error = copyout(sv, uap->rsv, 2 * sizeof (int));
639	fdrop(fp1, td);
640	fdrop(fp2, td);
641	goto done2;
642free4:
643	fdclose(fdp, fp2, sv[1], td);
644	fdrop(fp2, td);
645free3:
646	fdclose(fdp, fp1, sv[0], td);
647	fdrop(fp1, td);
648free2:
649	(void)soclose(so2);
650free1:
651	(void)soclose(so1);
652done2:
653	NET_UNLOCK_GIANT();
654	return (error);
655}
656
657static int
658sendit(td, s, mp, flags)
659	register struct thread *td;
660	int s;
661	register struct msghdr *mp;
662	int flags;
663{
664	struct mbuf *control;
665	struct sockaddr *to;
666	int error;
667
668	if (mp->msg_name != NULL) {
669		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
670		if (error) {
671			to = NULL;
672			goto bad;
673		}
674		mp->msg_name = to;
675	} else {
676		to = NULL;
677	}
678
679	if (mp->msg_control) {
680		if (mp->msg_controllen < sizeof(struct cmsghdr)
681#ifdef COMPAT_OLDSOCK
682		    && mp->msg_flags != MSG_COMPAT
683#endif
684		) {
685			error = EINVAL;
686			goto bad;
687		}
688		error = sockargs(&control, mp->msg_control,
689		    mp->msg_controllen, MT_CONTROL);
690		if (error)
691			goto bad;
692#ifdef COMPAT_OLDSOCK
693		if (mp->msg_flags == MSG_COMPAT) {
694			register struct cmsghdr *cm;
695
696			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
697			if (control == 0) {
698				error = ENOBUFS;
699				goto bad;
700			} else {
701				cm = mtod(control, struct cmsghdr *);
702				cm->cmsg_len = control->m_len;
703				cm->cmsg_level = SOL_SOCKET;
704				cm->cmsg_type = SCM_RIGHTS;
705			}
706		}
707#endif
708	} else {
709		control = NULL;
710	}
711
712	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
713
714bad:
715	if (to)
716		FREE(to, M_SONAME);
717	return (error);
718}
719
720int
721kern_sendit(td, s, mp, flags, control, segflg)
722	struct thread *td;
723	int s;
724	struct msghdr *mp;
725	int flags;
726	struct mbuf *control;
727	enum uio_seg segflg;
728{
729	struct file *fp;
730	struct uio auio;
731	struct iovec *iov;
732	struct socket *so;
733	int i;
734	int len, error;
735#ifdef KTRACE
736	struct uio *ktruio = NULL;
737#endif
738
739	NET_LOCK_GIANT();
740	error = getsock(td->td_proc->p_fd, s, &fp);
741	if (error)
742		goto bad2;
743	so = (struct socket *)fp->f_data;
744
745#ifdef MAC
746	SOCK_LOCK(so);
747	error = mac_check_socket_send(td->td_ucred, so);
748	SOCK_UNLOCK(so);
749	if (error)
750		goto bad;
751#endif
752
753	auio.uio_iov = mp->msg_iov;
754	auio.uio_iovcnt = mp->msg_iovlen;
755	auio.uio_segflg = segflg;
756	auio.uio_rw = UIO_WRITE;
757	auio.uio_td = td;
758	auio.uio_offset = 0;			/* XXX */
759	auio.uio_resid = 0;
760	iov = mp->msg_iov;
761	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
762		if ((auio.uio_resid += iov->iov_len) < 0) {
763			error = EINVAL;
764			goto bad;
765		}
766	}
767#ifdef KTRACE
768	if (KTRPOINT(td, KTR_GENIO))
769		ktruio = cloneuio(&auio);
770#endif
771	len = auio.uio_resid;
772	error = so->so_proto->pr_usrreqs->pru_sosend(so, mp->msg_name, &auio,
773	    0, control, flags, td);
774	if (error) {
775		if (auio.uio_resid != len && (error == ERESTART ||
776		    error == EINTR || error == EWOULDBLOCK))
777			error = 0;
778		/* Generation of SIGPIPE can be controlled per socket */
779		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
780		    !(flags & MSG_NOSIGNAL)) {
781			PROC_LOCK(td->td_proc);
782			psignal(td->td_proc, SIGPIPE);
783			PROC_UNLOCK(td->td_proc);
784		}
785	}
786	if (error == 0)
787		td->td_retval[0] = len - auio.uio_resid;
788#ifdef KTRACE
789	if (ktruio != NULL) {
790		ktruio->uio_resid = td->td_retval[0];
791		ktrgenio(s, UIO_WRITE, ktruio, error);
792	}
793#endif
794bad:
795	fdrop(fp, td);
796bad2:
797	NET_UNLOCK_GIANT();
798	return (error);
799}
800
801/*
802 * MPSAFE
803 */
804int
805sendto(td, uap)
806	struct thread *td;
807	register struct sendto_args /* {
808		int	s;
809		caddr_t	buf;
810		size_t	len;
811		int	flags;
812		caddr_t	to;
813		int	tolen;
814	} */ *uap;
815{
816	struct msghdr msg;
817	struct iovec aiov;
818	int error;
819
820	msg.msg_name = uap->to;
821	msg.msg_namelen = uap->tolen;
822	msg.msg_iov = &aiov;
823	msg.msg_iovlen = 1;
824	msg.msg_control = 0;
825#ifdef COMPAT_OLDSOCK
826	msg.msg_flags = 0;
827#endif
828	aiov.iov_base = uap->buf;
829	aiov.iov_len = uap->len;
830	error = sendit(td, uap->s, &msg, uap->flags);
831	return (error);
832}
833
834#ifdef COMPAT_OLDSOCK
835/*
836 * MPSAFE
837 */
838int
839osend(td, uap)
840	struct thread *td;
841	register struct osend_args /* {
842		int	s;
843		caddr_t	buf;
844		int	len;
845		int	flags;
846	} */ *uap;
847{
848	struct msghdr msg;
849	struct iovec aiov;
850	int error;
851
852	msg.msg_name = 0;
853	msg.msg_namelen = 0;
854	msg.msg_iov = &aiov;
855	msg.msg_iovlen = 1;
856	aiov.iov_base = uap->buf;
857	aiov.iov_len = uap->len;
858	msg.msg_control = 0;
859	msg.msg_flags = 0;
860	error = sendit(td, uap->s, &msg, uap->flags);
861	return (error);
862}
863
864/*
865 * MPSAFE
866 */
867int
868osendmsg(td, uap)
869	struct thread *td;
870	struct osendmsg_args /* {
871		int	s;
872		caddr_t	msg;
873		int	flags;
874	} */ *uap;
875{
876	struct msghdr msg;
877	struct iovec *iov;
878	int error;
879
880	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
881	if (error)
882		return (error);
883	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
884	if (error)
885		return (error);
886	msg.msg_iov = iov;
887	msg.msg_flags = MSG_COMPAT;
888	error = sendit(td, uap->s, &msg, uap->flags);
889	free(iov, M_IOV);
890	return (error);
891}
892#endif
893
894/*
895 * MPSAFE
896 */
897int
898sendmsg(td, uap)
899	struct thread *td;
900	struct sendmsg_args /* {
901		int	s;
902		caddr_t	msg;
903		int	flags;
904	} */ *uap;
905{
906	struct msghdr msg;
907	struct iovec *iov;
908	int error;
909
910	error = copyin(uap->msg, &msg, sizeof (msg));
911	if (error)
912		return (error);
913	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
914	if (error)
915		return (error);
916	msg.msg_iov = iov;
917#ifdef COMPAT_OLDSOCK
918	msg.msg_flags = 0;
919#endif
920	error = sendit(td, uap->s, &msg, uap->flags);
921	free(iov, M_IOV);
922	return (error);
923}
924
925static int
926recvit(td, s, mp, namelenp)
927	struct thread *td;
928	int s;
929	struct msghdr *mp;
930	void *namelenp;
931{
932	struct uio auio;
933	struct iovec *iov;
934	int i;
935	socklen_t len;
936	int error;
937	struct mbuf *m, *control = 0;
938	caddr_t ctlbuf;
939	struct file *fp;
940	struct socket *so;
941	struct sockaddr *fromsa = 0;
942#ifdef KTRACE
943	struct uio *ktruio = NULL;
944#endif
945
946	NET_LOCK_GIANT();
947	error = getsock(td->td_proc->p_fd, s, &fp);
948	if (error) {
949		NET_UNLOCK_GIANT();
950		return (error);
951	}
952	so = fp->f_data;
953
954#ifdef MAC
955	SOCK_LOCK(so);
956	error = mac_check_socket_receive(td->td_ucred, so);
957	SOCK_UNLOCK(so);
958	if (error) {
959		fdrop(fp, td);
960		NET_UNLOCK_GIANT();
961		return (error);
962	}
963#endif
964
965	auio.uio_iov = mp->msg_iov;
966	auio.uio_iovcnt = mp->msg_iovlen;
967	auio.uio_segflg = UIO_USERSPACE;
968	auio.uio_rw = UIO_READ;
969	auio.uio_td = td;
970	auio.uio_offset = 0;			/* XXX */
971	auio.uio_resid = 0;
972	iov = mp->msg_iov;
973	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
974		if ((auio.uio_resid += iov->iov_len) < 0) {
975			fdrop(fp, td);
976			NET_UNLOCK_GIANT();
977			return (EINVAL);
978		}
979	}
980#ifdef KTRACE
981	if (KTRPOINT(td, KTR_GENIO))
982		ktruio = cloneuio(&auio);
983#endif
984	len = auio.uio_resid;
985	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
986	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
987	    &mp->msg_flags);
988	if (error) {
989		if (auio.uio_resid != (int)len && (error == ERESTART ||
990		    error == EINTR || error == EWOULDBLOCK))
991			error = 0;
992	}
993#ifdef KTRACE
994	if (ktruio != NULL) {
995		ktruio->uio_resid = (int)len - auio.uio_resid;
996		ktrgenio(s, UIO_READ, ktruio, error);
997	}
998#endif
999	if (error)
1000		goto out;
1001	td->td_retval[0] = (int)len - auio.uio_resid;
1002	if (mp->msg_name) {
1003		len = mp->msg_namelen;
1004		if (len <= 0 || fromsa == 0)
1005			len = 0;
1006		else {
1007			/* save sa_len before it is destroyed by MSG_COMPAT */
1008			len = MIN(len, fromsa->sa_len);
1009#ifdef COMPAT_OLDSOCK
1010			if (mp->msg_flags & MSG_COMPAT)
1011				((struct osockaddr *)fromsa)->sa_family =
1012				    fromsa->sa_family;
1013#endif
1014			error = copyout(fromsa, mp->msg_name, (unsigned)len);
1015			if (error)
1016				goto out;
1017		}
1018		mp->msg_namelen = len;
1019		if (namelenp &&
1020		    (error = copyout(&len, namelenp, sizeof (socklen_t)))) {
1021#ifdef COMPAT_OLDSOCK
1022			if (mp->msg_flags & MSG_COMPAT)
1023				error = 0;	/* old recvfrom didn't check */
1024			else
1025#endif
1026			goto out;
1027		}
1028	}
1029	if (mp->msg_control) {
1030#ifdef COMPAT_OLDSOCK
1031		/*
1032		 * We assume that old recvmsg calls won't receive access
1033		 * rights and other control info, esp. as control info
1034		 * is always optional and those options didn't exist in 4.3.
1035		 * If we receive rights, trim the cmsghdr; anything else
1036		 * is tossed.
1037		 */
1038		if (control && mp->msg_flags & MSG_COMPAT) {
1039			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1040			    SOL_SOCKET ||
1041			    mtod(control, struct cmsghdr *)->cmsg_type !=
1042			    SCM_RIGHTS) {
1043				mp->msg_controllen = 0;
1044				goto out;
1045			}
1046			control->m_len -= sizeof (struct cmsghdr);
1047			control->m_data += sizeof (struct cmsghdr);
1048		}
1049#endif
1050		len = mp->msg_controllen;
1051		m = control;
1052		mp->msg_controllen = 0;
1053		ctlbuf = mp->msg_control;
1054
1055		while (m && len > 0) {
1056			unsigned int tocopy;
1057
1058			if (len >= m->m_len)
1059				tocopy = m->m_len;
1060			else {
1061				mp->msg_flags |= MSG_CTRUNC;
1062				tocopy = len;
1063			}
1064
1065			if ((error = copyout(mtod(m, caddr_t),
1066					ctlbuf, tocopy)) != 0)
1067				goto out;
1068
1069			ctlbuf += tocopy;
1070			len -= tocopy;
1071			m = m->m_next;
1072		}
1073		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1074	}
1075out:
1076	fdrop(fp, td);
1077	NET_UNLOCK_GIANT();
1078	if (fromsa)
1079		FREE(fromsa, M_SONAME);
1080	if (control)
1081		m_freem(control);
1082	return (error);
1083}
1084
1085/*
1086 * MPSAFE
1087 */
1088int
1089recvfrom(td, uap)
1090	struct thread *td;
1091	register struct recvfrom_args /* {
1092		int	s;
1093		caddr_t	buf;
1094		size_t	len;
1095		int	flags;
1096		struct sockaddr * __restrict	from;
1097		socklen_t * __restrict fromlenaddr;
1098	} */ *uap;
1099{
1100	struct msghdr msg;
1101	struct iovec aiov;
1102	int error;
1103
1104	if (uap->fromlenaddr) {
1105		error = copyin(uap->fromlenaddr,
1106		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1107		if (error)
1108			goto done2;
1109	} else {
1110		msg.msg_namelen = 0;
1111	}
1112	msg.msg_name = uap->from;
1113	msg.msg_iov = &aiov;
1114	msg.msg_iovlen = 1;
1115	aiov.iov_base = uap->buf;
1116	aiov.iov_len = uap->len;
1117	msg.msg_control = 0;
1118	msg.msg_flags = uap->flags;
1119	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1120done2:
1121	return(error);
1122}
1123
#ifdef COMPAT_OLDSOCK
/*
 * orecvfrom(): old-socket recvfrom; sets MSG_COMPAT in the caller's flags
 * and then calls recvfrom().
 *
 * MPSAFE
 */
int
orecvfrom(struct thread *td, struct recvfrom_args *uap)
{
	int error;

	uap->flags |= MSG_COMPAT;
	error = recvfrom(td, uap);
	return (error);
}
#endif
1138
1139
1140#ifdef COMPAT_OLDSOCK
1141/*
1142 * MPSAFE
1143 */
1144int
1145orecv(td, uap)
1146	struct thread *td;
1147	register struct orecv_args /* {
1148		int	s;
1149		caddr_t	buf;
1150		int	len;
1151		int	flags;
1152	} */ *uap;
1153{
1154	struct msghdr msg;
1155	struct iovec aiov;
1156	int error;
1157
1158	msg.msg_name = 0;
1159	msg.msg_namelen = 0;
1160	msg.msg_iov = &aiov;
1161	msg.msg_iovlen = 1;
1162	aiov.iov_base = uap->buf;
1163	aiov.iov_len = uap->len;
1164	msg.msg_control = 0;
1165	msg.msg_flags = uap->flags;
1166	error = recvit(td, uap->s, &msg, NULL);
1167	return (error);
1168}
1169
1170/*
1171 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1172 * overlays the new one, missing only the flags, and with the (old) access
1173 * rights where the control fields are now.
1174 *
1175 * MPSAFE
1176 */
1177int
1178orecvmsg(td, uap)
1179	struct thread *td;
1180	struct orecvmsg_args /* {
1181		int	s;
1182		struct	omsghdr *msg;
1183		int	flags;
1184	} */ *uap;
1185{
1186	struct msghdr msg;
1187	struct iovec *iov;
1188	int error;
1189
1190	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1191	if (error)
1192		return (error);
1193	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1194	if (error)
1195		return (error);
1196	msg.msg_flags = uap->flags | MSG_COMPAT;
1197	msg.msg_iov = iov;
1198	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1199	if (msg.msg_controllen && error == 0)
1200		error = copyout(&msg.msg_controllen,
1201		    &uap->msg->msg_accrightslen, sizeof (int));
1202	free(iov, M_IOV);
1203	return (error);
1204}
1205#endif
1206
1207/*
1208 * MPSAFE
1209 */
1210int
1211recvmsg(td, uap)
1212	struct thread *td;
1213	struct recvmsg_args /* {
1214		int	s;
1215		struct	msghdr *msg;
1216		int	flags;
1217	} */ *uap;
1218{
1219	struct msghdr msg;
1220	struct iovec *uiov, *iov;
1221	int error;
1222
1223	error = copyin(uap->msg, &msg, sizeof (msg));
1224	if (error)
1225		return (error);
1226	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1227	if (error)
1228		return (error);
1229	msg.msg_flags = uap->flags;
1230#ifdef COMPAT_OLDSOCK
1231	msg.msg_flags &= ~MSG_COMPAT;
1232#endif
1233	uiov = msg.msg_iov;
1234	msg.msg_iov = iov;
1235	error = recvit(td, uap->s, &msg, NULL);
1236	if (error == 0) {
1237		msg.msg_iov = uiov;
1238		error = copyout(&msg, uap->msg, sizeof(msg));
1239	}
1240	free(iov, M_IOV);
1241	return (error);
1242}
1243
1244/*
1245 * MPSAFE
1246 */
1247/* ARGSUSED */
1248int
1249shutdown(td, uap)
1250	struct thread *td;
1251	register struct shutdown_args /* {
1252		int	s;
1253		int	how;
1254	} */ *uap;
1255{
1256	struct socket *so;
1257	struct file *fp;
1258	int error;
1259
1260	NET_LOCK_GIANT();
1261	error = getsock(td->td_proc->p_fd, uap->s, &fp);
1262	if (error == 0) {
1263		so = fp->f_data;
1264		error = soshutdown(so, uap->how);
1265		fdrop(fp, td);
1266	}
1267	NET_UNLOCK_GIANT();
1268	return (error);
1269}
1270
1271/*
1272 * MPSAFE
1273 */
1274/* ARGSUSED */
1275int
1276setsockopt(td, uap)
1277	struct thread *td;
1278	register struct setsockopt_args /* {
1279		int	s;
1280		int	level;
1281		int	name;
1282		caddr_t	val;
1283		int	valsize;
1284	} */ *uap;
1285{
1286
1287	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1288	    uap->val, UIO_USERSPACE, uap->valsize));
1289}
1290
1291int
1292kern_setsockopt(td, s, level, name, val, valseg, valsize)
1293	struct thread *td;
1294	int s;
1295	int level;
1296	int name;
1297	void *val;
1298	enum uio_seg valseg;
1299	socklen_t valsize;
1300{
1301	int error;
1302	struct socket *so;
1303	struct file *fp;
1304	struct sockopt sopt;
1305
1306	if (val == NULL && valsize != 0)
1307		return (EFAULT);
1308	if (valsize < 0)
1309		return (EINVAL);
1310
1311	sopt.sopt_dir = SOPT_SET;
1312	sopt.sopt_level = level;
1313	sopt.sopt_name = name;
1314	sopt.sopt_val = val;
1315	sopt.sopt_valsize = valsize;
1316	switch (valseg) {
1317	case UIO_USERSPACE:
1318		sopt.sopt_td = td;
1319		break;
1320	case UIO_SYSSPACE:
1321		sopt.sopt_td = NULL;
1322		break;
1323	default:
1324		panic("kern_setsockopt called with bad valseg");
1325	}
1326
1327	NET_LOCK_GIANT();
1328	error = getsock(td->td_proc->p_fd, s, &fp);
1329	if (error == 0) {
1330		so = fp->f_data;
1331		error = sosetopt(so, &sopt);
1332		fdrop(fp, td);
1333	}
1334	NET_UNLOCK_GIANT();
1335	return(error);
1336}
1337
1338/*
1339 * MPSAFE
1340 */
1341/* ARGSUSED */
1342int
1343getsockopt(td, uap)
1344	struct thread *td;
1345	register struct getsockopt_args /* {
1346		int	s;
1347		int	level;
1348		int	name;
1349		void * __restrict	val;
1350		socklen_t * __restrict avalsize;
1351	} */ *uap;
1352{
1353	socklen_t valsize;
1354	int	error;
1355
1356	if (uap->val) {
1357		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1358		if (error)
1359			return (error);
1360	}
1361
1362	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1363	    uap->val, UIO_USERSPACE, &valsize);
1364
1365	if (error == 0)
1366		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1367	return (error);
1368}
1369
1370/*
1371 * Kernel version of getsockopt.
1372 * optval can be a userland or userspace. optlen is always a kernel pointer.
1373 */
1374int
1375kern_getsockopt(td, s, level, name, val, valseg, valsize)
1376	struct thread *td;
1377	int s;
1378	int level;
1379	int name;
1380	void *val;
1381	enum uio_seg valseg;
1382	socklen_t *valsize;
1383{
1384	int error;
1385	struct  socket *so;
1386	struct file *fp;
1387	struct	sockopt sopt;
1388
1389	if (val == NULL)
1390		*valsize = 0;
1391	if (*valsize < 0)
1392		return (EINVAL);
1393
1394	sopt.sopt_dir = SOPT_GET;
1395	sopt.sopt_level = level;
1396	sopt.sopt_name = name;
1397	sopt.sopt_val = val;
1398	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1399	switch (valseg) {
1400	case UIO_USERSPACE:
1401		sopt.sopt_td = td;
1402		break;
1403	case UIO_SYSSPACE:
1404		sopt.sopt_td = NULL;
1405		break;
1406	default:
1407		panic("kern_getsockopt called with bad valseg");
1408	}
1409
1410	NET_LOCK_GIANT();
1411	error = getsock(td->td_proc->p_fd, s, &fp);
1412	if (error == 0) {
1413		so = fp->f_data;
1414		error = sogetopt(so, &sopt);
1415		*valsize = sopt.sopt_valsize;
1416		fdrop(fp, td);
1417	}
1418	NET_UNLOCK_GIANT();
1419	return (error);
1420}
1421
1422/*
1423 * getsockname1() - Get socket name.
1424 *
1425 * MPSAFE
1426 */
1427/* ARGSUSED */
1428static int
1429getsockname1(td, uap, compat)
1430	struct thread *td;
1431	register struct getsockname_args /* {
1432		int	fdes;
1433		struct sockaddr * __restrict asa;
1434		socklen_t * __restrict alen;
1435	} */ *uap;
1436	int compat;
1437{
1438	struct socket *so;
1439	struct sockaddr *sa;
1440	struct file *fp;
1441	socklen_t len;
1442	int error;
1443
1444	NET_LOCK_GIANT();
1445	error = getsock(td->td_proc->p_fd, uap->fdes, &fp);
1446	if (error)
1447		goto done2;
1448	so = fp->f_data;
1449	error = copyin(uap->alen, &len, sizeof (len));
1450	if (error)
1451		goto done1;
1452	if (len < 0) {
1453		error = EINVAL;
1454		goto done1;
1455	}
1456	sa = 0;
1457	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1458	if (error)
1459		goto bad;
1460	if (sa == 0) {
1461		len = 0;
1462		goto gotnothing;
1463	}
1464
1465	len = MIN(len, sa->sa_len);
1466#ifdef COMPAT_OLDSOCK
1467	if (compat)
1468		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1469#endif
1470	error = copyout(sa, uap->asa, (u_int)len);
1471	if (error == 0)
1472gotnothing:
1473		error = copyout(&len, uap->alen, sizeof (len));
1474bad:
1475	if (sa)
1476		FREE(sa, M_SONAME);
1477done1:
1478	fdrop(fp, td);
1479done2:
1480	NET_UNLOCK_GIANT();
1481	return (error);
1482}
1483
/*
 * getsockname(2) system call entry point: getsockname1() without the
 * old-sockaddr compatibility handling.
 *
 * MPSAFE
 */
int
getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1495
1496#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-API flavour of getsockname: getsockname1() with the
 * compatibility flag set.
 *
 * MPSAFE
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
1508#endif /* COMPAT_OLDSOCK */
1509
1510/*
1511 * getpeername1() - Get name of peer for connected socket.
1512 *
1513 * MPSAFE
1514 */
1515/* ARGSUSED */
1516static int
1517getpeername1(td, uap, compat)
1518	struct thread *td;
1519	register struct getpeername_args /* {
1520		int	fdes;
1521		struct sockaddr * __restrict	asa;
1522		socklen_t * __restrict	alen;
1523	} */ *uap;
1524	int compat;
1525{
1526	struct socket *so;
1527	struct sockaddr *sa;
1528	struct file *fp;
1529	socklen_t len;
1530	int error;
1531
1532	NET_LOCK_GIANT();
1533	error = getsock(td->td_proc->p_fd, uap->fdes, &fp);
1534	if (error)
1535		goto done2;
1536	so = fp->f_data;
1537	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1538		error = ENOTCONN;
1539		goto done1;
1540	}
1541	error = copyin(uap->alen, &len, sizeof (len));
1542	if (error)
1543		goto done1;
1544	if (len < 0) {
1545		error = EINVAL;
1546		goto done1;
1547	}
1548	sa = 0;
1549	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1550	if (error)
1551		goto bad;
1552	if (sa == 0) {
1553		len = 0;
1554		goto gotnothing;
1555	}
1556	len = MIN(len, sa->sa_len);
1557#ifdef COMPAT_OLDSOCK
1558	if (compat)
1559		((struct osockaddr *)sa)->sa_family =
1560		    sa->sa_family;
1561#endif
1562	error = copyout(sa, uap->asa, (u_int)len);
1563	if (error)
1564		goto bad;
1565gotnothing:
1566	error = copyout(&len, uap->alen, sizeof (len));
1567bad:
1568	if (sa)
1569		FREE(sa, M_SONAME);
1570done1:
1571	fdrop(fp, td);
1572done2:
1573	NET_UNLOCK_GIANT();
1574	return (error);
1575}
1576
/*
 * getpeername(2) system call entry point: getpeername1() without the
 * old-sockaddr compatibility handling.
 *
 * MPSAFE
 */
int
getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1588
1589#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-API flavour of getpeername: getpeername1() with the
 * compatibility flag set.
 *
 * MPSAFE
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *nuap;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	nuap = (struct getpeername_args *)uap;
	return (getpeername1(td, nuap, 1));
}
1602#endif /* COMPAT_OLDSOCK */
1603
1604int
1605sockargs(mp, buf, buflen, type)
1606	struct mbuf **mp;
1607	caddr_t buf;
1608	int buflen, type;
1609{
1610	register struct sockaddr *sa;
1611	register struct mbuf *m;
1612	int error;
1613
1614	if ((u_int)buflen > MLEN) {
1615#ifdef COMPAT_OLDSOCK
1616		if (type == MT_SONAME && (u_int)buflen <= 112)
1617			buflen = MLEN;		/* unix domain compat. hack */
1618		else
1619#endif
1620			if ((u_int)buflen > MCLBYTES)
1621				return (EINVAL);
1622	}
1623	m = m_get(M_TRYWAIT, type);
1624	if (m == NULL)
1625		return (ENOBUFS);
1626	if ((u_int)buflen > MLEN) {
1627		MCLGET(m, M_TRYWAIT);
1628		if ((m->m_flags & M_EXT) == 0) {
1629			m_free(m);
1630			return (ENOBUFS);
1631		}
1632	}
1633	m->m_len = buflen;
1634	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1635	if (error)
1636		(void) m_free(m);
1637	else {
1638		*mp = m;
1639		if (type == MT_SONAME) {
1640			sa = mtod(m, struct sockaddr *);
1641
1642#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1643			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1644				sa->sa_family = sa->sa_len;
1645#endif
1646			sa->sa_len = buflen;
1647		}
1648	}
1649	return (error);
1650}
1651
1652int
1653getsockaddr(namp, uaddr, len)
1654	struct sockaddr **namp;
1655	caddr_t uaddr;
1656	size_t len;
1657{
1658	struct sockaddr *sa;
1659	int error;
1660
1661	if (len > SOCK_MAXADDRLEN)
1662		return (ENAMETOOLONG);
1663	if (len < offsetof(struct sockaddr, sa_data[0]))
1664		return (EINVAL);
1665	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1666	error = copyin(uaddr, sa, len);
1667	if (error) {
1668		FREE(sa, M_SONAME);
1669	} else {
1670#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1671		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1672			sa->sa_family = sa->sa_len;
1673#endif
1674		sa->sa_len = len;
1675		*namp = sa;
1676	}
1677	return (error);
1678}
1679
1680/*
1681 * Detach mapped page and release resources back to the system.
1682 */
1683void
1684sf_buf_mext(void *addr, void *args)
1685{
1686	vm_page_t m;
1687
1688	m = sf_buf_page(args);
1689	sf_buf_free(args);
1690	vm_page_lock_queues();
1691	vm_page_unwire(m, 0);
1692	/*
1693	 * Check for the object going away on us. This can
1694	 * happen since we don't hold a reference to it.
1695	 * If so, we're responsible for freeing the page.
1696	 */
1697	if (m->wire_count == 0 && m->object == NULL)
1698		vm_page_free(m);
1699	vm_page_unlock_queues();
1700}
1701
/*
 * sendfile(2)
 *
 * MPSAFE
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send the file referenced by 'fd', beginning at 'offset', to the socket
 * referenced by 's'.  At most 'nbytes' of the file are sent; nbytes == 0
 * means send until EOF.  A header and/or trailer may optionally be added
 * to the socket output.  If 'sbytes' is given, the total number of bytes
 * sent is written into *sbytes.
 *
 * This entry point runs do_sendfile() with the compat flag clear.
 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
1722
1723#ifdef COMPAT_FREEBSD4
1724int
1725freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1726{
1727	struct sendfile_args args;
1728
1729	args.fd = uap->fd;
1730	args.s = uap->s;
1731	args.offset = uap->offset;
1732	args.nbytes = uap->nbytes;
1733	args.hdtr = uap->hdtr;
1734	args.sbytes = uap->sbytes;
1735	args.flags = uap->flags;
1736
1737	return (do_sendfile(td, &args, 1));
1738}
1739#endif /* COMPAT_FREEBSD4 */
1740
1741static int
1742do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1743{
1744	struct vnode *vp;
1745	struct vm_object *obj;
1746	struct socket *so = NULL;
1747	struct mbuf *m, *m_header = NULL;
1748	struct sf_buf *sf;
1749	struct vm_page *pg;
1750	struct writev_args nuap;
1751	struct sf_hdtr hdtr;
1752	struct uio *hdr_uio = NULL;
1753	off_t off, xfsize, hdtr_size, sbytes = 0;
1754	int error, headersize = 0, headersent = 0;
1755
1756	mtx_lock(&Giant);
1757
1758	hdtr_size = 0;
1759
1760	/*
1761	 * The descriptor must be a regular file and have a backing VM object.
1762	 */
1763	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1764		goto done;
1765	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1766	obj = vp->v_object;
1767	VOP_UNLOCK(vp, 0, td);
1768	if (obj == NULL) {
1769		error = EINVAL;
1770		goto done;
1771	}
1772	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1773		goto done;
1774	if (so->so_type != SOCK_STREAM) {
1775		error = EINVAL;
1776		goto done;
1777	}
1778	if ((so->so_state & SS_ISCONNECTED) == 0) {
1779		error = ENOTCONN;
1780		goto done;
1781	}
1782	if (uap->offset < 0) {
1783		error = EINVAL;
1784		goto done;
1785	}
1786
1787#ifdef MAC
1788	SOCK_LOCK(so);
1789	error = mac_check_socket_send(td->td_ucred, so);
1790	SOCK_UNLOCK(so);
1791	if (error)
1792		goto done;
1793#endif
1794
1795	/*
1796	 * If specified, get the pointer to the sf_hdtr struct for
1797	 * any headers/trailers.
1798	 */
1799	if (uap->hdtr != NULL) {
1800		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1801		if (error)
1802			goto done;
1803		/*
1804		 * Send any headers.
1805		 */
1806		if (hdtr.headers != NULL) {
1807			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1808			if (error)
1809				goto done;
1810			hdr_uio->uio_td = td;
1811			hdr_uio->uio_rw = UIO_WRITE;
1812			if (hdr_uio->uio_resid > 0) {
1813				m_header = m_uiotombuf(hdr_uio, M_DONTWAIT, 0, 0);
1814				if (m_header == NULL)
1815					goto done;
1816				headersize = m_header->m_pkthdr.len;
1817				if (compat)
1818					sbytes += headersize;
1819			}
1820		}
1821	}
1822
1823	/*
1824	 * Protect against multiple writers to the socket.
1825	 */
1826	SOCKBUF_LOCK(&so->so_snd);
1827	(void) sblock(&so->so_snd, M_WAITOK);
1828	SOCKBUF_UNLOCK(&so->so_snd);
1829
1830	/*
1831	 * Loop through the pages in the file, starting with the requested
1832	 * offset. Get a file page (do I/O if necessary), map the file page
1833	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1834	 * it on the socket.
1835	 */
1836	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1837		vm_pindex_t pindex;
1838		vm_offset_t pgoff;
1839
1840		pindex = OFF_TO_IDX(off);
1841		VM_OBJECT_LOCK(obj);
1842retry_lookup:
1843		/*
1844		 * Calculate the amount to transfer. Not to exceed a page,
1845		 * the EOF, or the passed in nbytes.
1846		 */
1847		xfsize = obj->un_pager.vnp.vnp_size - off;
1848		VM_OBJECT_UNLOCK(obj);
1849		if (xfsize > PAGE_SIZE)
1850			xfsize = PAGE_SIZE;
1851		pgoff = (vm_offset_t)(off & PAGE_MASK);
1852		if (PAGE_SIZE - pgoff < xfsize)
1853			xfsize = PAGE_SIZE - pgoff;
1854		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1855			xfsize = uap->nbytes - sbytes;
1856		if (xfsize <= 0) {
1857			if (m_header != NULL) {
1858				m = m_header;
1859				m_header = NULL;
1860				SOCKBUF_LOCK(&so->so_snd);
1861				goto retry_space;
1862			} else
1863				break;
1864		}
1865		/*
1866		 * Optimize the non-blocking case by looking at the socket space
1867		 * before going to the extra work of constituting the sf_buf.
1868		 */
1869		SOCKBUF_LOCK(&so->so_snd);
1870		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1871			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1872				error = EPIPE;
1873			else
1874				error = EAGAIN;
1875			sbunlock(&so->so_snd);
1876			SOCKBUF_UNLOCK(&so->so_snd);
1877			goto done;
1878		}
1879		SOCKBUF_UNLOCK(&so->so_snd);
1880		VM_OBJECT_LOCK(obj);
1881		/*
1882		 * Attempt to look up the page.
1883		 *
1884		 *	Allocate if not found
1885		 *
1886		 *	Wait and loop if busy.
1887		 */
1888		pg = vm_page_lookup(obj, pindex);
1889
1890		if (pg == NULL) {
1891			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY |
1892			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1893			if (pg == NULL) {
1894				VM_OBJECT_UNLOCK(obj);
1895				VM_WAIT;
1896				VM_OBJECT_LOCK(obj);
1897				goto retry_lookup;
1898			}
1899			vm_page_lock_queues();
1900		} else {
1901			vm_page_lock_queues();
1902			if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
1903				goto retry_lookup;
1904			/*
1905			 * Wire the page so it does not get ripped out from
1906			 * under us.
1907			 */
1908			vm_page_wire(pg);
1909		}
1910
1911		/*
1912		 * If page is not valid for what we need, initiate I/O
1913		 */
1914
1915		if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) {
1916			VM_OBJECT_UNLOCK(obj);
1917		} else if (uap->flags & SF_NODISKIO) {
1918			error = EBUSY;
1919		} else {
1920			int bsize, resid;
1921
1922			/*
1923			 * Ensure that our page is still around when the I/O
1924			 * completes.
1925			 */
1926			vm_page_io_start(pg);
1927			vm_page_unlock_queues();
1928			VM_OBJECT_UNLOCK(obj);
1929
1930			/*
1931			 * Get the page from backing store.
1932			 */
1933			bsize = vp->v_mount->mnt_stat.f_iosize;
1934			vn_lock(vp, LK_SHARED | LK_RETRY, td);
1935			/*
1936			 * XXXMAC: Because we don't have fp->f_cred here,
1937			 * we pass in NOCRED.  This is probably wrong, but
1938			 * is consistent with our original implementation.
1939			 */
1940			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
1941			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
1942			    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
1943			    td->td_ucred, NOCRED, &resid, td);
1944			VOP_UNLOCK(vp, 0, td);
1945			VM_OBJECT_LOCK(obj);
1946			vm_page_lock_queues();
1947			vm_page_io_finish(pg);
1948			if (!error)
1949				VM_OBJECT_UNLOCK(obj);
1950			mbstat.sf_iocnt++;
1951		}
1952
1953		if (error) {
1954			vm_page_unwire(pg, 0);
1955			/*
1956			 * See if anyone else might know about this page.
1957			 * If not and it is not valid, then free it.
1958			 */
1959			if (pg->wire_count == 0 && pg->valid == 0 &&
1960			    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1961			    pg->hold_count == 0) {
1962				vm_page_free(pg);
1963			}
1964			vm_page_unlock_queues();
1965			VM_OBJECT_UNLOCK(obj);
1966			SOCKBUF_LOCK(&so->so_snd);
1967			sbunlock(&so->so_snd);
1968			SOCKBUF_UNLOCK(&so->so_snd);
1969			goto done;
1970		}
1971		vm_page_unlock_queues();
1972
1973		/*
1974		 * Get a sendfile buf. We usually wait as long as necessary,
1975		 * but this wait can be interrupted.
1976		 */
1977		if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) {
1978			mbstat.sf_allocfail++;
1979			vm_page_lock_queues();
1980			vm_page_unwire(pg, 0);
1981			if (pg->wire_count == 0 && pg->object == NULL)
1982				vm_page_free(pg);
1983			vm_page_unlock_queues();
1984			SOCKBUF_LOCK(&so->so_snd);
1985			sbunlock(&so->so_snd);
1986			SOCKBUF_UNLOCK(&so->so_snd);
1987			error = EINTR;
1988			goto done;
1989		}
1990
1991		/*
1992		 * Get an mbuf header and set it up as having external storage.
1993		 */
1994		if (m_header)
1995			MGET(m, M_TRYWAIT, MT_DATA);
1996		else
1997			MGETHDR(m, M_TRYWAIT, MT_DATA);
1998		if (m == NULL) {
1999			error = ENOBUFS;
2000			sf_buf_mext((void *)sf_buf_kva(sf), sf);
2001			SOCKBUF_LOCK(&so->so_snd);
2002			sbunlock(&so->so_snd);
2003			SOCKBUF_UNLOCK(&so->so_snd);
2004			goto done;
2005		}
2006		/*
2007		 * Setup external storage for mbuf.
2008		 */
2009		MEXTADD(m, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY,
2010		    EXT_SFBUF);
2011		m->m_data = (char *)sf_buf_kva(sf) + pgoff;
2012		m->m_pkthdr.len = m->m_len = xfsize;
2013
2014		if (m_header) {
2015			m_cat(m_header, m);
2016			m = m_header;
2017			m_header = NULL;
2018			m_fixhdr(m);
2019		}
2020
2021		/*
2022		 * Add the buffer to the socket buffer chain.
2023		 */
2024		SOCKBUF_LOCK(&so->so_snd);
2025retry_space:
2026		/*
2027		 * Make sure that the socket is still able to take more data.
2028		 * CANTSENDMORE being true usually means that the connection
2029		 * was closed. so_error is true when an error was sensed after
2030		 * a previous send.
2031		 * The state is checked after the page mapping and buffer
2032		 * allocation above since those operations may block and make
2033		 * any socket checks stale. From this point forward, nothing
2034		 * blocks before the pru_send (or more accurately, any blocking
2035		 * results in a loop back to here to re-check).
2036		 */
2037		SOCKBUF_LOCK_ASSERT(&so->so_snd);
2038		if ((so->so_snd.sb_state & SBS_CANTSENDMORE) || so->so_error) {
2039			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2040				error = EPIPE;
2041			} else {
2042				error = so->so_error;
2043				so->so_error = 0;
2044			}
2045			m_freem(m);
2046			sbunlock(&so->so_snd);
2047			SOCKBUF_UNLOCK(&so->so_snd);
2048			goto done;
2049		}
2050		/*
2051		 * Wait for socket space to become available. We do this just
2052		 * after checking the connection state above in order to avoid
2053		 * a race condition with sbwait().
2054		 */
2055		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
2056			if (so->so_state & SS_NBIO) {
2057				m_freem(m);
2058				sbunlock(&so->so_snd);
2059				SOCKBUF_UNLOCK(&so->so_snd);
2060				error = EAGAIN;
2061				goto done;
2062			}
2063			error = sbwait(&so->so_snd);
2064			/*
2065			 * An error from sbwait usually indicates that we've
2066			 * been interrupted by a signal. If we've sent anything
2067			 * then return bytes sent, otherwise return the error.
2068			 */
2069			if (error) {
2070				m_freem(m);
2071				sbunlock(&so->so_snd);
2072				SOCKBUF_UNLOCK(&so->so_snd);
2073				goto done;
2074			}
2075			goto retry_space;
2076		}
2077		SOCKBUF_UNLOCK(&so->so_snd);
2078		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
2079		if (error) {
2080			SOCKBUF_LOCK(&so->so_snd);
2081			sbunlock(&so->so_snd);
2082			SOCKBUF_UNLOCK(&so->so_snd);
2083			goto done;
2084		}
2085		headersent = 1;
2086	}
2087	SOCKBUF_LOCK(&so->so_snd);
2088	sbunlock(&so->so_snd);
2089	SOCKBUF_UNLOCK(&so->so_snd);
2090
2091	/*
2092	 * Send trailers. Wimp out and use writev(2).
2093	 */
2094	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
2095			nuap.fd = uap->s;
2096			nuap.iovp = hdtr.trailers;
2097			nuap.iovcnt = hdtr.trl_cnt;
2098			error = writev(td, &nuap);
2099			if (error)
2100				goto done;
2101			if (compat)
2102				sbytes += td->td_retval[0];
2103			else
2104				hdtr_size += td->td_retval[0];
2105	}
2106
2107done:
2108	if (headersent) {
2109		if (!compat)
2110			hdtr_size += headersize;
2111	} else {
2112		if (compat)
2113			sbytes -= headersize;
2114	}
2115	/*
2116	 * If there was no error we have to clear td->td_retval[0]
2117	 * because it may have been set by writev.
2118	 */
2119	if (error == 0) {
2120		td->td_retval[0] = 0;
2121	}
2122	if (uap->sbytes != NULL) {
2123		if (!compat)
2124			sbytes += hdtr_size;
2125		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2126	}
2127	if (vp)
2128		vrele(vp);
2129	if (so)
2130		fputsock(so);
2131	if (hdr_uio != NULL)
2132		free(hdr_uio, M_IOV);
2133	if (m_header)
2134		m_freem(m_header);
2135
2136	mtx_unlock(&Giant);
2137
2138	if (error == ERESTART)
2139		error = EINTR;
2140
2141	return (error);
2142}
2143