kern_sendfile.c revision 124396
1/*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 124396 2004-01-11 19:56:42Z des $");
41
42#include "opt_compat.h"
43#include "opt_ktrace.h"
44#include "opt_mac.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/mac.h>
51#include <sys/mutex.h>
52#include <sys/sysproto.h>
53#include <sys/malloc.h>
54#include <sys/filedesc.h>
55#include <sys/event.h>
56#include <sys/proc.h>
57#include <sys/fcntl.h>
58#include <sys/file.h>
59#include <sys/filio.h>
60#include <sys/mount.h>
61#include <sys/mbuf.h>
62#include <sys/protosw.h>
63#include <sys/sf_buf.h>
64#include <sys/socket.h>
65#include <sys/socketvar.h>
66#include <sys/signalvar.h>
67#include <sys/syscallsubr.h>
68#include <sys/uio.h>
69#include <sys/vnode.h>
70#ifdef KTRACE
71#include <sys/ktrace.h>
72#endif
73
74#include <vm/vm.h>
75#include <vm/vm_object.h>
76#include <vm/vm_page.h>
77#include <vm/vm_pageout.h>
78#include <vm/vm_kern.h>
79#include <vm/vm_extern.h>
80
/*
 * Forward declarations for file-local (static) helpers.  sendit(),
 * recvit(), accept1() and getsockname1() are defined further down in
 * this file.
 */
81static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
82static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
83
/* Shared bodies for syscalls that also have an old-ABI ("compat") variant. */
84static int accept1(struct thread *td, struct accept_args *uap, int compat);
85static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
86static int getsockname1(struct thread *td, struct getsockname_args *uap,
87			int compat);
88static int getpeername1(struct thread *td, struct getpeername_args *uap,
89			int compat);
90
91/*
92 * System call interface to the socket abstraction.
93 */
94#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
95#define COMPAT_OLDSOCK
96#endif
97
98/*
99 * MPSAFE
100 */
101int
102socket(td, uap)
103	struct thread *td;
104	register struct socket_args /* {
105		int	domain;
106		int	type;
107		int	protocol;
108	} */ *uap;
109{
110	struct filedesc *fdp;
111	struct socket *so;
112	struct file *fp;
113	int fd, error;
114
115	fdp = td->td_proc->p_fd;
116	error = falloc(td, &fp, &fd);
117	if (error)
118		goto done2;
119	/* An extra reference on `fp' has been held for us by falloc(). */
120	mtx_lock(&Giant);
121	error = socreate(uap->domain, &so, uap->type, uap->protocol,
122	    td->td_ucred, td);
123	mtx_unlock(&Giant);
124	FILEDESC_LOCK(fdp);
125	if (error) {
126		if (fdp->fd_ofiles[fd] == fp) {
127			fdp->fd_ofiles[fd] = NULL;
128			fdunused(fdp, fd);
129			FILEDESC_UNLOCK(fdp);
130			fdrop(fp, td);
131		} else {
132			FILEDESC_UNLOCK(fdp);
133		}
134	} else {
135		fp->f_data = so;	/* already has ref count */
136		fp->f_flag = FREAD|FWRITE;
137		fp->f_ops = &socketops;
138		fp->f_type = DTYPE_SOCKET;
139		FILEDESC_UNLOCK(fdp);
140		td->td_retval[0] = fd;
141	}
142	fdrop(fp, td);
143done2:
144	return (error);
145}
146
147/*
148 * MPSAFE
149 */
150/* ARGSUSED */
151int
152bind(td, uap)
153	struct thread *td;
154	register struct bind_args /* {
155		int	s;
156		caddr_t	name;
157		int	namelen;
158	} */ *uap;
159{
160	struct sockaddr *sa;
161	int error;
162
163	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
164		return (error);
165
166	return (kern_bind(td, uap->s, sa));
167}
168
169int
170kern_bind(td, fd, sa)
171	struct thread *td;
172	int fd;
173	struct sockaddr *sa;
174{
175	struct socket *so;
176	int error;
177
178	mtx_lock(&Giant);
179	if ((error = fgetsock(td, fd, &so, NULL)) != 0)
180		goto done2;
181#ifdef MAC
182	error = mac_check_socket_bind(td->td_ucred, so, sa);
183	if (error)
184		goto done1;
185#endif
186	error = sobind(so, sa, td);
187#ifdef MAC
188done1:
189#endif
190	fputsock(so);
191done2:
192	mtx_unlock(&Giant);
193	FREE(sa, M_SONAME);
194	return (error);
195}
196
197/*
198 * MPSAFE
199 */
200/* ARGSUSED */
201int
202listen(td, uap)
203	struct thread *td;
204	register struct listen_args /* {
205		int	s;
206		int	backlog;
207	} */ *uap;
208{
209	struct socket *so;
210	int error;
211
212	mtx_lock(&Giant);
213	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
214#ifdef MAC
215		error = mac_check_socket_listen(td->td_ucred, so);
216		if (error)
217			goto done;
218#endif
219		error = solisten(so, uap->backlog, td);
220#ifdef MAC
221done:
222#endif
223		fputsock(so);
224	}
225	mtx_unlock(&Giant);
226	return(error);
227}
228
229/*
230 * accept1()
231 * MPSAFE
232 */
233static int
234accept1(td, uap, compat)
235	struct thread *td;
236	register struct accept_args /* {
237		int	s;
238		struct sockaddr	* __restrict name;
239		socklen_t	* __restrict anamelen;
240	} */ *uap;
241	int compat;
242{
243	struct filedesc *fdp;
244	struct file *nfp = NULL;
245	struct sockaddr *sa;
246	socklen_t namelen;
247	int error, s;
248	struct socket *head, *so;
249	int fd;
250	u_int fflag;
251	pid_t pgid;
252	int tmp;
253
254	fdp = td->td_proc->p_fd;
255	if (uap->name) {
256		error = copyin(uap->anamelen, &namelen, sizeof (namelen));
257		if(error)
258			goto done3;
259		if (namelen < 0) {
260			error = EINVAL;
261			goto done3;
262		}
263	}
264	mtx_lock(&Giant);
265	error = fgetsock(td, uap->s, &head, &fflag);
266	if (error)
267		goto done2;
268	s = splnet();
269	if ((head->so_options & SO_ACCEPTCONN) == 0) {
270		splx(s);
271		error = EINVAL;
272		goto done;
273	}
274	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
275		if (head->so_state & SS_CANTRCVMORE) {
276			head->so_error = ECONNABORTED;
277			break;
278		}
279		if ((head->so_state & SS_NBIO) != 0) {
280			head->so_error = EWOULDBLOCK;
281			break;
282		}
283		error = tsleep(&head->so_timeo, PSOCK | PCATCH,
284		    "accept", 0);
285		if (error) {
286			splx(s);
287			goto done;
288		}
289	}
290	if (head->so_error) {
291		error = head->so_error;
292		head->so_error = 0;
293		splx(s);
294		goto done;
295	}
296
297	/*
298	 * At this point we know that there is at least one connection
299	 * ready to be accepted. Remove it from the queue prior to
300	 * allocating the file descriptor for it since falloc() may
301	 * block allowing another process to accept the connection
302	 * instead.
303	 */
304	so = TAILQ_FIRST(&head->so_comp);
305	TAILQ_REMOVE(&head->so_comp, so, so_list);
306	head->so_qlen--;
307
308	error = falloc(td, &nfp, &fd);
309	if (error) {
310		/*
311		 * Probably ran out of file descriptors. Put the
312		 * unaccepted connection back onto the queue and
313		 * do another wakeup so some other process might
314		 * have a chance at it.
315		 */
316		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
317		head->so_qlen++;
318		wakeup_one(&head->so_timeo);
319		splx(s);
320		goto done;
321	}
322	/* An extra reference on `nfp' has been held for us by falloc(). */
323	td->td_retval[0] = fd;
324
325	/* connection has been removed from the listen queue */
326	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
327
328	so->so_state &= ~SS_COMP;
329	so->so_head = NULL;
330	pgid = fgetown(&head->so_sigio);
331	if (pgid != 0)
332		fsetown(pgid, &so->so_sigio);
333
334	FILE_LOCK(nfp);
335	soref(so);			/* file descriptor reference */
336	nfp->f_data = so;	/* nfp has ref count from falloc */
337	nfp->f_flag = fflag;
338	nfp->f_ops = &socketops;
339	nfp->f_type = DTYPE_SOCKET;
340	FILE_UNLOCK(nfp);
341	/* Sync socket nonblocking/async state with file flags */
342	tmp = fflag & FNONBLOCK;
343	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
344	tmp = fflag & FASYNC;
345	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
346	sa = 0;
347	error = soaccept(so, &sa);
348	if (error) {
349		/*
350		 * return a namelen of zero for older code which might
351		 * ignore the return value from accept.
352		 */
353		if (uap->name != NULL) {
354			namelen = 0;
355			(void) copyout(&namelen,
356			    uap->anamelen, sizeof(*uap->anamelen));
357		}
358		goto noconnection;
359	}
360	if (sa == NULL) {
361		namelen = 0;
362		if (uap->name)
363			goto gotnoname;
364		splx(s);
365		error = 0;
366		goto done;
367	}
368	if (uap->name) {
369		/* check sa_len before it is destroyed */
370		if (namelen > sa->sa_len)
371			namelen = sa->sa_len;
372#ifdef COMPAT_OLDSOCK
373		if (compat)
374			((struct osockaddr *)sa)->sa_family =
375			    sa->sa_family;
376#endif
377		error = copyout(sa, uap->name, (u_int)namelen);
378		if (!error)
379gotnoname:
380			error = copyout(&namelen,
381			    uap->anamelen, sizeof (*uap->anamelen));
382	}
383noconnection:
384	if (sa)
385		FREE(sa, M_SONAME);
386
387	/*
388	 * close the new descriptor, assuming someone hasn't ripped it
389	 * out from under us.
390	 */
391	if (error) {
392		FILEDESC_LOCK(fdp);
393		if (fdp->fd_ofiles[fd] == nfp) {
394			fdp->fd_ofiles[fd] = NULL;
395			fdunused(fdp, fd);
396			FILEDESC_UNLOCK(fdp);
397			fdrop(nfp, td);
398		} else {
399			FILEDESC_UNLOCK(fdp);
400		}
401	}
402	splx(s);
403
404	/*
405	 * Release explicitly held references before returning.
406	 */
407done:
408	if (nfp != NULL)
409		fdrop(nfp, td);
410	fputsock(head);
411done2:
412	mtx_unlock(&Giant);
413done3:
414	return (error);
415}
416
/*
 * accept(2) entry point; calls accept1() with compat == 0.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 0);
	return (error);
}
428
429#ifdef COMPAT_OLDSOCK
/*
 * Old-ABI accept entry point; calls accept1() with compat == 1.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 1);
	return (error);
}
441#endif /* COMPAT_OLDSOCK */
442
443/*
444 * MPSAFE
445 */
446/* ARGSUSED */
447int
448connect(td, uap)
449	struct thread *td;
450	register struct connect_args /* {
451		int	s;
452		caddr_t	name;
453		int	namelen;
454	} */ *uap;
455{
456	struct sockaddr *sa;
457	int error;
458
459	error = getsockaddr(&sa, uap->name, uap->namelen);
460	if (error)
461		return (error);
462
463	return (kern_connect(td, uap->s, sa));
464}
465
466
467int
468kern_connect(td, fd, sa)
469	struct thread *td;
470	int fd;
471	struct sockaddr *sa;
472{
473	struct socket *so;
474	int error, s;
475	int interrupted = 0;
476
477	mtx_lock(&Giant);
478	if ((error = fgetsock(td, fd, &so, NULL)) != 0)
479		goto done2;
480	if (so->so_state & SS_ISCONNECTING) {
481		error = EALREADY;
482		goto done1;
483	}
484#ifdef MAC
485	error = mac_check_socket_connect(td->td_ucred, so, sa);
486	if (error)
487		goto bad;
488#endif
489	error = soconnect(so, sa, td);
490	if (error)
491		goto bad;
492	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
493		error = EINPROGRESS;
494		goto done1;
495	}
496	s = splnet();
497	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
498		error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0);
499		if (error) {
500			if (error == EINTR || error == ERESTART)
501				interrupted = 1;
502			break;
503		}
504	}
505	if (error == 0) {
506		error = so->so_error;
507		so->so_error = 0;
508	}
509	splx(s);
510bad:
511	if (!interrupted)
512		so->so_state &= ~SS_ISCONNECTING;
513	if (error == ERESTART)
514		error = EINTR;
515done1:
516	fputsock(so);
517done2:
518	mtx_unlock(&Giant);
519	FREE(sa, M_SONAME);
520	return (error);
521}
522
523/*
524 * MPSAFE
525 */
526int
527socketpair(td, uap)
528	struct thread *td;
529	register struct socketpair_args /* {
530		int	domain;
531		int	type;
532		int	protocol;
533		int	*rsv;
534	} */ *uap;
535{
536	register struct filedesc *fdp = td->td_proc->p_fd;
537	struct file *fp1, *fp2;
538	struct socket *so1, *so2;
539	int fd, error, sv[2];
540
541	mtx_lock(&Giant);
542	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
543	    td->td_ucred, td);
544	if (error)
545		goto done2;
546	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
547	    td->td_ucred, td);
548	if (error)
549		goto free1;
550	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
551	error = falloc(td, &fp1, &fd);
552	if (error)
553		goto free2;
554	sv[0] = fd;
555	fp1->f_data = so1;	/* so1 already has ref count */
556	error = falloc(td, &fp2, &fd);
557	if (error)
558		goto free3;
559	fp2->f_data = so2;	/* so2 already has ref count */
560	sv[1] = fd;
561	error = soconnect2(so1, so2);
562	if (error)
563		goto free4;
564	if (uap->type == SOCK_DGRAM) {
565		/*
566		 * Datagram socket connection is asymmetric.
567		 */
568		 error = soconnect2(so2, so1);
569		 if (error)
570			goto free4;
571	}
572	FILE_LOCK(fp1);
573	fp1->f_flag = FREAD|FWRITE;
574	fp1->f_ops = &socketops;
575	fp1->f_type = DTYPE_SOCKET;
576	FILE_UNLOCK(fp1);
577	FILE_LOCK(fp2);
578	fp2->f_flag = FREAD|FWRITE;
579	fp2->f_ops = &socketops;
580	fp2->f_type = DTYPE_SOCKET;
581	FILE_UNLOCK(fp2);
582	error = copyout(sv, uap->rsv, 2 * sizeof (int));
583	fdrop(fp1, td);
584	fdrop(fp2, td);
585	goto done2;
586free4:
587	FILEDESC_LOCK(fdp);
588	if (fdp->fd_ofiles[sv[1]] == fp2) {
589		fdp->fd_ofiles[sv[1]] = NULL;
590		fdunused(fdp, sv[1]);
591		FILEDESC_UNLOCK(fdp);
592		fdrop(fp2, td);
593	} else {
594		FILEDESC_UNLOCK(fdp);
595	}
596	fdrop(fp2, td);
597free3:
598	FILEDESC_LOCK(fdp);
599	if (fdp->fd_ofiles[sv[0]] == fp1) {
600		fdp->fd_ofiles[sv[0]] = NULL;
601		fdunused(fdp, sv[0]);
602		FILEDESC_UNLOCK(fdp);
603		fdrop(fp1, td);
604	} else {
605		FILEDESC_UNLOCK(fdp);
606	}
607	fdrop(fp1, td);
608free2:
609	(void)soclose(so2);
610free1:
611	(void)soclose(so1);
612done2:
613	mtx_unlock(&Giant);
614	return (error);
615}
616
617static int
618sendit(td, s, mp, flags)
619	register struct thread *td;
620	int s;
621	register struct msghdr *mp;
622	int flags;
623{
624	struct mbuf *control;
625	struct sockaddr *to;
626	int error;
627
628	if (mp->msg_name != NULL) {
629		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
630		if (error) {
631			to = NULL;
632			goto bad;
633		}
634		mp->msg_name = to;
635	} else {
636		to = NULL;
637	}
638
639	if (mp->msg_control) {
640		if (mp->msg_controllen < sizeof(struct cmsghdr)
641#ifdef COMPAT_OLDSOCK
642		    && mp->msg_flags != MSG_COMPAT
643#endif
644		) {
645			error = EINVAL;
646			goto bad;
647		}
648		error = sockargs(&control, mp->msg_control,
649		    mp->msg_controllen, MT_CONTROL);
650		if (error)
651			goto bad;
652#ifdef COMPAT_OLDSOCK
653		if (mp->msg_flags == MSG_COMPAT) {
654			register struct cmsghdr *cm;
655
656			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
657			if (control == 0) {
658				error = ENOBUFS;
659				goto bad;
660			} else {
661				cm = mtod(control, struct cmsghdr *);
662				cm->cmsg_len = control->m_len;
663				cm->cmsg_level = SOL_SOCKET;
664				cm->cmsg_type = SCM_RIGHTS;
665			}
666		}
667#endif
668	} else {
669		control = NULL;
670	}
671
672	error = kern_sendit(td, s, mp, flags, control);
673
674bad:
675	if (to)
676		FREE(to, M_SONAME);
677	return (error);
678}
679
680int
681kern_sendit(td, s, mp, flags, control)
682	struct thread *td;
683	int s;
684	struct msghdr *mp;
685	int flags;
686	struct mbuf *control;
687{
688	struct uio auio;
689	struct iovec *iov;
690	struct socket *so;
691	int i;
692	int len, error;
693#ifdef KTRACE
694	struct iovec *ktriov = NULL;
695	struct uio ktruio;
696	int iovlen;
697#endif
698
699	mtx_lock(&Giant);
700	if ((error = fgetsock(td, s, &so, NULL)) != 0)
701		goto bad2;
702
703#ifdef MAC
704	error = mac_check_socket_send(td->td_ucred, so);
705	if (error)
706		goto bad;
707#endif
708
709	auio.uio_iov = mp->msg_iov;
710	auio.uio_iovcnt = mp->msg_iovlen;
711	auio.uio_segflg = UIO_USERSPACE;
712	auio.uio_rw = UIO_WRITE;
713	auio.uio_td = td;
714	auio.uio_offset = 0;			/* XXX */
715	auio.uio_resid = 0;
716	iov = mp->msg_iov;
717	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
718		if ((auio.uio_resid += iov->iov_len) < 0) {
719			error = EINVAL;
720			goto bad;
721		}
722	}
723#ifdef KTRACE
724	if (KTRPOINT(td, KTR_GENIO)) {
725		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
726		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
727		bcopy(auio.uio_iov, ktriov, iovlen);
728		ktruio = auio;
729	}
730#endif
731	len = auio.uio_resid;
732	error = so->so_proto->pr_usrreqs->pru_sosend(so, mp->msg_name, &auio,
733	    0, control, flags, td);
734	if (error) {
735		if (auio.uio_resid != len && (error == ERESTART ||
736		    error == EINTR || error == EWOULDBLOCK))
737			error = 0;
738		/* Generation of SIGPIPE can be controlled per socket */
739		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) {
740			PROC_LOCK(td->td_proc);
741			psignal(td->td_proc, SIGPIPE);
742			PROC_UNLOCK(td->td_proc);
743		}
744	}
745	if (error == 0)
746		td->td_retval[0] = len - auio.uio_resid;
747#ifdef KTRACE
748	if (ktriov != NULL) {
749		if (error == 0) {
750			ktruio.uio_iov = ktriov;
751			ktruio.uio_resid = td->td_retval[0];
752			ktrgenio(s, UIO_WRITE, &ktruio, error);
753		}
754		FREE(ktriov, M_TEMP);
755	}
756#endif
757bad:
758	fputsock(so);
759bad2:
760	mtx_unlock(&Giant);
761	return (error);
762}
763
764/*
765 * MPSAFE
766 */
767int
768sendto(td, uap)
769	struct thread *td;
770	register struct sendto_args /* {
771		int	s;
772		caddr_t	buf;
773		size_t	len;
774		int	flags;
775		caddr_t	to;
776		int	tolen;
777	} */ *uap;
778{
779	struct msghdr msg;
780	struct iovec aiov;
781	int error;
782
783	msg.msg_name = uap->to;
784	msg.msg_namelen = uap->tolen;
785	msg.msg_iov = &aiov;
786	msg.msg_iovlen = 1;
787	msg.msg_control = 0;
788#ifdef COMPAT_OLDSOCK
789	msg.msg_flags = 0;
790#endif
791	aiov.iov_base = uap->buf;
792	aiov.iov_len = uap->len;
793	error = sendit(td, uap->s, &msg, uap->flags);
794	return (error);
795}
796
797#ifdef COMPAT_OLDSOCK
798/*
799 * MPSAFE
800 */
801int
802osend(td, uap)
803	struct thread *td;
804	register struct osend_args /* {
805		int	s;
806		caddr_t	buf;
807		int	len;
808		int	flags;
809	} */ *uap;
810{
811	struct msghdr msg;
812	struct iovec aiov;
813	int error;
814
815	msg.msg_name = 0;
816	msg.msg_namelen = 0;
817	msg.msg_iov = &aiov;
818	msg.msg_iovlen = 1;
819	aiov.iov_base = uap->buf;
820	aiov.iov_len = uap->len;
821	msg.msg_control = 0;
822	msg.msg_flags = 0;
823	error = sendit(td, uap->s, &msg, uap->flags);
824	return (error);
825}
826
827/*
828 * MPSAFE
829 */
830int
831osendmsg(td, uap)
832	struct thread *td;
833	register struct osendmsg_args /* {
834		int	s;
835		caddr_t	msg;
836		int	flags;
837	} */ *uap;
838{
839	struct msghdr msg;
840	struct iovec aiov[UIO_SMALLIOV], *iov;
841	int error;
842
843	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
844	if (error)
845		goto done2;
846	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
847		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
848			error = EMSGSIZE;
849			goto done2;
850		}
851		MALLOC(iov, struct iovec *,
852		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
853		      M_WAITOK);
854	} else {
855		iov = aiov;
856	}
857	error = copyin(msg.msg_iov, iov,
858	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
859	if (error)
860		goto done;
861	msg.msg_flags = MSG_COMPAT;
862	msg.msg_iov = iov;
863	error = sendit(td, uap->s, &msg, uap->flags);
864done:
865	if (iov != aiov)
866		FREE(iov, M_IOV);
867done2:
868	return (error);
869}
870#endif
871
872/*
873 * MPSAFE
874 */
875int
876sendmsg(td, uap)
877	struct thread *td;
878	register struct sendmsg_args /* {
879		int	s;
880		caddr_t	msg;
881		int	flags;
882	} */ *uap;
883{
884	struct msghdr msg;
885	struct iovec aiov[UIO_SMALLIOV], *iov;
886	int error;
887
888	error = copyin(uap->msg, &msg, sizeof (msg));
889	if (error)
890		goto done2;
891	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
892		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
893			error = EMSGSIZE;
894			goto done2;
895		}
896		MALLOC(iov, struct iovec *,
897		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
898		       M_WAITOK);
899	} else {
900		iov = aiov;
901	}
902	if (msg.msg_iovlen &&
903	    (error = copyin(msg.msg_iov, iov,
904	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
905		goto done;
906	msg.msg_iov = iov;
907#ifdef COMPAT_OLDSOCK
908	msg.msg_flags = 0;
909#endif
910	error = sendit(td, uap->s, &msg, uap->flags);
911done:
912	if (iov != aiov)
913		FREE(iov, M_IOV);
914done2:
915	return (error);
916}
917
918static int
919recvit(td, s, mp, namelenp)
920	register struct thread *td;
921	int s;
922	register struct msghdr *mp;
923	void *namelenp;
924{
925	struct uio auio;
926	register struct iovec *iov;
927	register int i;
928	socklen_t len;
929	int error;
930	struct mbuf *m, *control = 0;
931	caddr_t ctlbuf;
932	struct socket *so;
933	struct sockaddr *fromsa = 0;
934#ifdef KTRACE
935	struct iovec *ktriov = NULL;
936	struct uio ktruio;
937	int iovlen;
938#endif
939
940	mtx_lock(&Giant);
941	if ((error = fgetsock(td, s, &so, NULL)) != 0) {
942		mtx_unlock(&Giant);
943		return (error);
944	}
945
946#ifdef MAC
947	error = mac_check_socket_receive(td->td_ucred, so);
948	if (error) {
949		fputsock(so);
950		mtx_unlock(&Giant);
951		return (error);
952	}
953#endif
954
955	auio.uio_iov = mp->msg_iov;
956	auio.uio_iovcnt = mp->msg_iovlen;
957	auio.uio_segflg = UIO_USERSPACE;
958	auio.uio_rw = UIO_READ;
959	auio.uio_td = td;
960	auio.uio_offset = 0;			/* XXX */
961	auio.uio_resid = 0;
962	iov = mp->msg_iov;
963	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
964		if ((auio.uio_resid += iov->iov_len) < 0) {
965			fputsock(so);
966			return (EINVAL);
967		}
968	}
969#ifdef KTRACE
970	if (KTRPOINT(td, KTR_GENIO)) {
971		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
972		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
973		bcopy(auio.uio_iov, ktriov, iovlen);
974		ktruio = auio;
975	}
976#endif
977	len = auio.uio_resid;
978	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
979	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
980	    &mp->msg_flags);
981	if (error) {
982		if (auio.uio_resid != (int)len && (error == ERESTART ||
983		    error == EINTR || error == EWOULDBLOCK))
984			error = 0;
985	}
986#ifdef KTRACE
987	if (ktriov != NULL) {
988		if (error == 0) {
989			ktruio.uio_iov = ktriov;
990			ktruio.uio_resid = (int)len - auio.uio_resid;
991			ktrgenio(s, UIO_READ, &ktruio, error);
992		}
993		FREE(ktriov, M_TEMP);
994	}
995#endif
996	if (error)
997		goto out;
998	td->td_retval[0] = (int)len - auio.uio_resid;
999	if (mp->msg_name) {
1000		len = mp->msg_namelen;
1001		if (len <= 0 || fromsa == 0)
1002			len = 0;
1003		else {
1004			/* save sa_len before it is destroyed by MSG_COMPAT */
1005			len = MIN(len, fromsa->sa_len);
1006#ifdef COMPAT_OLDSOCK
1007			if (mp->msg_flags & MSG_COMPAT)
1008				((struct osockaddr *)fromsa)->sa_family =
1009				    fromsa->sa_family;
1010#endif
1011			error = copyout(fromsa, mp->msg_name, (unsigned)len);
1012			if (error)
1013				goto out;
1014		}
1015		mp->msg_namelen = len;
1016		if (namelenp &&
1017		    (error = copyout(&len, namelenp, sizeof (socklen_t)))) {
1018#ifdef COMPAT_OLDSOCK
1019			if (mp->msg_flags & MSG_COMPAT)
1020				error = 0;	/* old recvfrom didn't check */
1021			else
1022#endif
1023			goto out;
1024		}
1025	}
1026	if (mp->msg_control) {
1027#ifdef COMPAT_OLDSOCK
1028		/*
1029		 * We assume that old recvmsg calls won't receive access
1030		 * rights and other control info, esp. as control info
1031		 * is always optional and those options didn't exist in 4.3.
1032		 * If we receive rights, trim the cmsghdr; anything else
1033		 * is tossed.
1034		 */
1035		if (control && mp->msg_flags & MSG_COMPAT) {
1036			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1037			    SOL_SOCKET ||
1038			    mtod(control, struct cmsghdr *)->cmsg_type !=
1039			    SCM_RIGHTS) {
1040				mp->msg_controllen = 0;
1041				goto out;
1042			}
1043			control->m_len -= sizeof (struct cmsghdr);
1044			control->m_data += sizeof (struct cmsghdr);
1045		}
1046#endif
1047		len = mp->msg_controllen;
1048		m = control;
1049		mp->msg_controllen = 0;
1050		ctlbuf = mp->msg_control;
1051
1052		while (m && len > 0) {
1053			unsigned int tocopy;
1054
1055			if (len >= m->m_len)
1056				tocopy = m->m_len;
1057			else {
1058				mp->msg_flags |= MSG_CTRUNC;
1059				tocopy = len;
1060			}
1061
1062			if ((error = copyout(mtod(m, caddr_t),
1063					ctlbuf, tocopy)) != 0)
1064				goto out;
1065
1066			ctlbuf += tocopy;
1067			len -= tocopy;
1068			m = m->m_next;
1069		}
1070		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1071	}
1072out:
1073	fputsock(so);
1074	mtx_unlock(&Giant);
1075	if (fromsa)
1076		FREE(fromsa, M_SONAME);
1077	if (control)
1078		m_freem(control);
1079	return (error);
1080}
1081
1082/*
1083 * MPSAFE
1084 */
1085int
1086recvfrom(td, uap)
1087	struct thread *td;
1088	register struct recvfrom_args /* {
1089		int	s;
1090		caddr_t	buf;
1091		size_t	len;
1092		int	flags;
1093		struct sockaddr * __restrict	from;
1094		socklen_t * __restrict fromlenaddr;
1095	} */ *uap;
1096{
1097	struct msghdr msg;
1098	struct iovec aiov;
1099	int error;
1100
1101	if (uap->fromlenaddr) {
1102		error = copyin(uap->fromlenaddr,
1103		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1104		if (error)
1105			goto done2;
1106	} else {
1107		msg.msg_namelen = 0;
1108	}
1109	msg.msg_name = uap->from;
1110	msg.msg_iov = &aiov;
1111	msg.msg_iovlen = 1;
1112	aiov.iov_base = uap->buf;
1113	aiov.iov_len = uap->len;
1114	msg.msg_control = 0;
1115	msg.msg_flags = uap->flags;
1116	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1117done2:
1118	return(error);
1119}
1120
1121#ifdef COMPAT_OLDSOCK
1122/*
1123 * MPSAFE
1124 */
1125int
1126orecvfrom(td, uap)
1127	struct thread *td;
1128	struct recvfrom_args *uap;
1129{
1130
1131	uap->flags |= MSG_COMPAT;
1132	return (recvfrom(td, uap));
1133}
1134#endif
1135
1136
1137#ifdef COMPAT_OLDSOCK
1138/*
1139 * MPSAFE
1140 */
1141int
1142orecv(td, uap)
1143	struct thread *td;
1144	register struct orecv_args /* {
1145		int	s;
1146		caddr_t	buf;
1147		int	len;
1148		int	flags;
1149	} */ *uap;
1150{
1151	struct msghdr msg;
1152	struct iovec aiov;
1153	int error;
1154
1155	msg.msg_name = 0;
1156	msg.msg_namelen = 0;
1157	msg.msg_iov = &aiov;
1158	msg.msg_iovlen = 1;
1159	aiov.iov_base = uap->buf;
1160	aiov.iov_len = uap->len;
1161	msg.msg_control = 0;
1162	msg.msg_flags = uap->flags;
1163	error = recvit(td, uap->s, &msg, NULL);
1164	return (error);
1165}
1166
1167/*
1168 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1169 * overlays the new one, missing only the flags, and with the (old) access
1170 * rights where the control fields are now.
1171 *
1172 * MPSAFE
1173 */
1174int
1175orecvmsg(td, uap)
1176	struct thread *td;
1177	register struct orecvmsg_args /* {
1178		int	s;
1179		struct	omsghdr *msg;
1180		int	flags;
1181	} */ *uap;
1182{
1183	struct msghdr msg;
1184	struct iovec aiov[UIO_SMALLIOV], *iov;
1185	int error;
1186
1187	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1188	if (error)
1189		return (error);
1190
1191	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1192		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1193			error = EMSGSIZE;
1194			goto done2;
1195		}
1196		MALLOC(iov, struct iovec *,
1197		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1198		      M_WAITOK);
1199	} else {
1200		iov = aiov;
1201	}
1202	msg.msg_flags = uap->flags | MSG_COMPAT;
1203	error = copyin(msg.msg_iov, iov,
1204	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1205	if (error)
1206		goto done;
1207	msg.msg_iov = iov;
1208	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1209
1210	if (msg.msg_controllen && error == 0)
1211		error = copyout(&msg.msg_controllen,
1212		    &uap->msg->msg_accrightslen, sizeof (int));
1213done:
1214	if (iov != aiov)
1215		FREE(iov, M_IOV);
1216done2:
1217	return (error);
1218}
1219#endif
1220
1221/*
1222 * MPSAFE
1223 */
1224int
1225recvmsg(td, uap)
1226	struct thread *td;
1227	register struct recvmsg_args /* {
1228		int	s;
1229		struct	msghdr *msg;
1230		int	flags;
1231	} */ *uap;
1232{
1233	struct msghdr msg;
1234	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1235	register int error;
1236
1237	error = copyin(uap->msg, &msg, sizeof (msg));
1238	if (error)
1239		goto done2;
1240	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1241		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1242			error = EMSGSIZE;
1243			goto done2;
1244		}
1245		MALLOC(iov, struct iovec *,
1246		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1247		       M_WAITOK);
1248	} else {
1249		iov = aiov;
1250	}
1251#ifdef COMPAT_OLDSOCK
1252	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1253#else
1254	msg.msg_flags = uap->flags;
1255#endif
1256	uiov = msg.msg_iov;
1257	msg.msg_iov = iov;
1258	error = copyin(uiov, iov,
1259	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1260	if (error)
1261		goto done;
1262	error = recvit(td, uap->s, &msg, NULL);
1263	if (!error) {
1264		msg.msg_iov = uiov;
1265		error = copyout(&msg, uap->msg, sizeof(msg));
1266	}
1267done:
1268	if (iov != aiov)
1269		FREE(iov, M_IOV);
1270done2:
1271	return (error);
1272}
1273
1274/*
1275 * MPSAFE
1276 */
1277/* ARGSUSED */
1278int
1279shutdown(td, uap)
1280	struct thread *td;
1281	register struct shutdown_args /* {
1282		int	s;
1283		int	how;
1284	} */ *uap;
1285{
1286	struct socket *so;
1287	int error;
1288
1289	mtx_lock(&Giant);
1290	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1291		error = soshutdown(so, uap->how);
1292		fputsock(so);
1293	}
1294	mtx_unlock(&Giant);
1295	return(error);
1296}
1297
1298/*
1299 * MPSAFE
1300 */
1301/* ARGSUSED */
1302int
1303setsockopt(td, uap)
1304	struct thread *td;
1305	register struct setsockopt_args /* {
1306		int	s;
1307		int	level;
1308		int	name;
1309		caddr_t	val;
1310		int	valsize;
1311	} */ *uap;
1312{
1313	struct socket *so;
1314	struct sockopt sopt;
1315	int error;
1316
1317	if (uap->val == 0 && uap->valsize != 0)
1318		return (EFAULT);
1319	if (uap->valsize < 0)
1320		return (EINVAL);
1321
1322	mtx_lock(&Giant);
1323	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1324		sopt.sopt_dir = SOPT_SET;
1325		sopt.sopt_level = uap->level;
1326		sopt.sopt_name = uap->name;
1327		sopt.sopt_val = uap->val;
1328		sopt.sopt_valsize = uap->valsize;
1329		sopt.sopt_td = td;
1330		error = sosetopt(so, &sopt);
1331		fputsock(so);
1332	}
1333	mtx_unlock(&Giant);
1334	return(error);
1335}
1336
1337/*
1338 * MPSAFE
1339 */
1340/* ARGSUSED */
1341int
1342getsockopt(td, uap)
1343	struct thread *td;
1344	register struct getsockopt_args /* {
1345		int	s;
1346		int	level;
1347		int	name;
1348		void * __restrict	val;
1349		socklen_t * __restrict avalsize;
1350	} */ *uap;
1351{
1352	socklen_t valsize;
1353	int	error;
1354	struct  socket *so;
1355	struct	sockopt sopt;
1356
1357	mtx_lock(&Giant);
1358	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1359		goto done2;
1360	if (uap->val) {
1361		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1362		if (error)
1363			goto done1;
1364		if (valsize < 0) {
1365			error = EINVAL;
1366			goto done1;
1367		}
1368	} else {
1369		valsize = 0;
1370	}
1371
1372	sopt.sopt_dir = SOPT_GET;
1373	sopt.sopt_level = uap->level;
1374	sopt.sopt_name = uap->name;
1375	sopt.sopt_val = uap->val;
1376	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1377	sopt.sopt_td = td;
1378
1379	error = sogetopt(so, &sopt);
1380	if (error == 0) {
1381		valsize = sopt.sopt_valsize;
1382		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1383	}
1384done1:
1385	fputsock(so);
1386done2:
1387	mtx_unlock(&Giant);
1388	return (error);
1389}
1390
1391/*
1392 * getsockname1() - Get socket name.
1393 *
1394 * MPSAFE
1395 */
1396/* ARGSUSED */
1397static int
1398getsockname1(td, uap, compat)
1399	struct thread *td;
1400	register struct getsockname_args /* {
1401		int	fdes;
1402		struct sockaddr * __restrict asa;
1403		socklen_t * __restrict alen;
1404	} */ *uap;
1405	int compat;
1406{
1407	struct socket *so;
1408	struct sockaddr *sa;
1409	socklen_t len;
1410	int error;
1411
1412	mtx_lock(&Giant);
1413	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1414		goto done2;
1415	error = copyin(uap->alen, &len, sizeof (len));
1416	if (error)
1417		goto done1;
1418	if (len < 0) {
1419		error = EINVAL;
1420		goto done1;
1421	}
1422	sa = 0;
1423	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1424	if (error)
1425		goto bad;
1426	if (sa == 0) {
1427		len = 0;
1428		goto gotnothing;
1429	}
1430
1431	len = MIN(len, sa->sa_len);
1432#ifdef COMPAT_OLDSOCK
1433	if (compat)
1434		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1435#endif
1436	error = copyout(sa, uap->asa, (u_int)len);
1437	if (error == 0)
1438gotnothing:
1439		error = copyout(&len, uap->alen, sizeof (len));
1440bad:
1441	if (sa)
1442		FREE(sa, M_SONAME);
1443done1:
1444	fputsock(so);
1445done2:
1446	mtx_unlock(&Giant);
1447	return (error);
1448}
1449
/*
 * getsockname(2) entry point: calls getsockname1() with compat == 0.
 *
 * MPSAFE
 */
int
getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int rc;

	rc = getsockname1(td, uap, 0);
	return (rc);
}
1461
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-API flavour of getsockname(2): calls getsockname1() with
 * compat == 1.
 *
 * MPSAFE
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int rc;

	rc = getsockname1(td, uap, 1);
	return (rc);
}
#endif /* COMPAT_OLDSOCK */
1475
1476/*
1477 * getpeername1() - Get name of peer for connected socket.
1478 *
1479 * MPSAFE
1480 */
1481/* ARGSUSED */
1482static int
1483getpeername1(td, uap, compat)
1484	struct thread *td;
1485	register struct getpeername_args /* {
1486		int	fdes;
1487		struct sockaddr * __restrict	asa;
1488		socklen_t * __restrict	alen;
1489	} */ *uap;
1490	int compat;
1491{
1492	struct socket *so;
1493	struct sockaddr *sa;
1494	socklen_t len;
1495	int error;
1496
1497	mtx_lock(&Giant);
1498	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1499		goto done2;
1500	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1501		error = ENOTCONN;
1502		goto done1;
1503	}
1504	error = copyin(uap->alen, &len, sizeof (len));
1505	if (error)
1506		goto done1;
1507	if (len < 0) {
1508		error = EINVAL;
1509		goto done1;
1510	}
1511	sa = 0;
1512	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1513	if (error)
1514		goto bad;
1515	if (sa == 0) {
1516		len = 0;
1517		goto gotnothing;
1518	}
1519	len = MIN(len, sa->sa_len);
1520#ifdef COMPAT_OLDSOCK
1521	if (compat)
1522		((struct osockaddr *)sa)->sa_family =
1523		    sa->sa_family;
1524#endif
1525	error = copyout(sa, uap->asa, (u_int)len);
1526	if (error)
1527		goto bad;
1528gotnothing:
1529	error = copyout(&len, uap->alen, sizeof (len));
1530bad:
1531	if (sa)
1532		FREE(sa, M_SONAME);
1533done1:
1534	fputsock(so);
1535done2:
1536	mtx_unlock(&Giant);
1537	return (error);
1538}
1539
/*
 * getpeername(2) entry point: calls getpeername1() with compat == 0.
 *
 * MPSAFE
 */
int
getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{
	int rc;

	rc = getpeername1(td, uap, 0);
	return (rc);
}
1551
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-API flavour of getpeername(2): calls getpeername1() with
 * compat == 1.
 *
 * MPSAFE
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *nuap;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	nuap = (struct getpeername_args *)uap;
	return (getpeername1(td, nuap, 1));
}
#endif /* COMPAT_OLDSOCK */
1566
1567int
1568sockargs(mp, buf, buflen, type)
1569	struct mbuf **mp;
1570	caddr_t buf;
1571	int buflen, type;
1572{
1573	register struct sockaddr *sa;
1574	register struct mbuf *m;
1575	int error;
1576
1577	if ((u_int)buflen > MLEN) {
1578#ifdef COMPAT_OLDSOCK
1579		if (type == MT_SONAME && (u_int)buflen <= 112)
1580			buflen = MLEN;		/* unix domain compat. hack */
1581		else
1582#endif
1583		return (EINVAL);
1584	}
1585	m = m_get(M_TRYWAIT, type);
1586	if (m == NULL)
1587		return (ENOBUFS);
1588	m->m_len = buflen;
1589	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1590	if (error)
1591		(void) m_free(m);
1592	else {
1593		*mp = m;
1594		if (type == MT_SONAME) {
1595			sa = mtod(m, struct sockaddr *);
1596
1597#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1598			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1599				sa->sa_family = sa->sa_len;
1600#endif
1601			sa->sa_len = buflen;
1602		}
1603	}
1604	return (error);
1605}
1606
1607int
1608getsockaddr(namp, uaddr, len)
1609	struct sockaddr **namp;
1610	caddr_t uaddr;
1611	size_t len;
1612{
1613	struct sockaddr *sa;
1614	int error;
1615
1616	if (len > SOCK_MAXADDRLEN)
1617		return (ENAMETOOLONG);
1618	if (len < offsetof(struct sockaddr, sa_data[0]))
1619		return (EINVAL);
1620	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1621	error = copyin(uaddr, sa, len);
1622	if (error) {
1623		FREE(sa, M_SONAME);
1624	} else {
1625#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1626		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1627			sa->sa_family = sa->sa_len;
1628#endif
1629		sa->sa_len = len;
1630		*namp = sa;
1631	}
1632	return (error);
1633}
1634
/*
 * sendfile(2)
 *
 * MPSAFE
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file referenced by 'fd', beginning at 'offset', over the
 * socket referenced by 's'.  'nbytes' limits how much of the file is
 * sent; a value of 0 means send through EOF.  A header and/or trailer
 * may optionally be added to the socket output via 'hdtr'.  When
 * 'sbytes' is given, the total number of bytes sent is written to
 * *sbytes.
 *
 * This entry point simply calls do_sendfile() with compat == 0.
 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	int rc;

	rc = do_sendfile(td, uap, 0);
	return (rc);
}
1655
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatible sendfile(2): repack the old argument structure
 * into a struct sendfile_args and call do_sendfile() with compat == 1.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args nargs;
	int rc;

	/* Descriptors. */
	nargs.fd = uap->fd;
	nargs.s = uap->s;
	/* Range of the file to send. */
	nargs.offset = uap->offset;
	nargs.nbytes = uap->nbytes;
	/* Optional header/trailer description, result pointer and flags. */
	nargs.hdtr = uap->hdtr;
	nargs.sbytes = uap->sbytes;
	nargs.flags = uap->flags;

	rc = do_sendfile(td, &nargs, 1);
	return (rc);
}
#endif /* COMPAT_FREEBSD4 */
1673
1674static int
1675do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1676{
1677	struct vnode *vp;
1678	struct vm_object *obj;
1679	struct socket *so = NULL;
1680	struct mbuf *m;
1681	struct sf_buf *sf;
1682	struct vm_page *pg;
1683	struct writev_args nuap;
1684	struct sf_hdtr hdtr;
1685	off_t off, xfsize, hdtr_size, sbytes = 0;
1686	int error, s;
1687
1688	mtx_lock(&Giant);
1689
1690	hdtr_size = 0;
1691
1692	/*
1693	 * The descriptor must be a regular file and have a backing VM object.
1694	 */
1695	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1696		goto done;
1697	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1698	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1699		error = EINVAL;
1700		VOP_UNLOCK(vp, 0, td);
1701		goto done;
1702	}
1703	VOP_UNLOCK(vp, 0, td);
1704	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1705		goto done;
1706	if (so->so_type != SOCK_STREAM) {
1707		error = EINVAL;
1708		goto done;
1709	}
1710	if ((so->so_state & SS_ISCONNECTED) == 0) {
1711		error = ENOTCONN;
1712		goto done;
1713	}
1714	if (uap->offset < 0) {
1715		error = EINVAL;
1716		goto done;
1717	}
1718
1719#ifdef MAC
1720	error = mac_check_socket_send(td->td_ucred, so);
1721	if (error)
1722		goto done;
1723#endif
1724
1725	/*
1726	 * If specified, get the pointer to the sf_hdtr struct for
1727	 * any headers/trailers.
1728	 */
1729	if (uap->hdtr != NULL) {
1730		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1731		if (error)
1732			goto done;
1733		/*
1734		 * Send any headers. Wimp out and use writev(2).
1735		 */
1736		if (hdtr.headers != NULL) {
1737			nuap.fd = uap->s;
1738			nuap.iovp = hdtr.headers;
1739			nuap.iovcnt = hdtr.hdr_cnt;
1740			error = writev(td, &nuap);
1741			if (error)
1742				goto done;
1743			if (compat)
1744				sbytes += td->td_retval[0];
1745			else
1746				hdtr_size += td->td_retval[0];
1747		}
1748	}
1749
1750	/*
1751	 * Protect against multiple writers to the socket.
1752	 */
1753	(void) sblock(&so->so_snd, M_WAITOK);
1754
1755	/*
1756	 * Loop through the pages in the file, starting with the requested
1757	 * offset. Get a file page (do I/O if necessary), map the file page
1758	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1759	 * it on the socket.
1760	 */
1761	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1762		vm_pindex_t pindex;
1763		vm_offset_t pgoff;
1764
1765		pindex = OFF_TO_IDX(off);
1766		VM_OBJECT_LOCK(obj);
1767retry_lookup:
1768		/*
1769		 * Calculate the amount to transfer. Not to exceed a page,
1770		 * the EOF, or the passed in nbytes.
1771		 */
1772		xfsize = obj->un_pager.vnp.vnp_size - off;
1773		VM_OBJECT_UNLOCK(obj);
1774		if (xfsize > PAGE_SIZE)
1775			xfsize = PAGE_SIZE;
1776		pgoff = (vm_offset_t)(off & PAGE_MASK);
1777		if (PAGE_SIZE - pgoff < xfsize)
1778			xfsize = PAGE_SIZE - pgoff;
1779		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1780			xfsize = uap->nbytes - sbytes;
1781		if (xfsize <= 0)
1782			break;
1783		/*
1784		 * Optimize the non-blocking case by looking at the socket space
1785		 * before going to the extra work of constituting the sf_buf.
1786		 */
1787		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1788			if (so->so_state & SS_CANTSENDMORE)
1789				error = EPIPE;
1790			else
1791				error = EAGAIN;
1792			sbunlock(&so->so_snd);
1793			goto done;
1794		}
1795		VM_OBJECT_LOCK(obj);
1796		/*
1797		 * Attempt to look up the page.
1798		 *
1799		 *	Allocate if not found
1800		 *
1801		 *	Wait and loop if busy.
1802		 */
1803		pg = vm_page_lookup(obj, pindex);
1804
1805		if (pg == NULL) {
1806			pg = vm_page_alloc(obj, pindex,
1807			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1808			if (pg == NULL) {
1809				VM_OBJECT_UNLOCK(obj);
1810				VM_WAIT;
1811				VM_OBJECT_LOCK(obj);
1812				goto retry_lookup;
1813			}
1814			vm_page_lock_queues();
1815			vm_page_wakeup(pg);
1816		} else {
1817			vm_page_lock_queues();
1818			if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
1819				goto retry_lookup;
1820			/*
1821			 * Wire the page so it does not get ripped out from
1822			 * under us.
1823			 */
1824			vm_page_wire(pg);
1825		}
1826
1827		/*
1828		 * If page is not valid for what we need, initiate I/O
1829		 */
1830
1831		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1832			int bsize, resid;
1833
1834			/*
1835			 * Ensure that our page is still around when the I/O
1836			 * completes.
1837			 */
1838			vm_page_io_start(pg);
1839			vm_page_unlock_queues();
1840			VM_OBJECT_UNLOCK(obj);
1841
1842			/*
1843			 * Get the page from backing store.
1844			 */
1845			bsize = vp->v_mount->mnt_stat.f_iosize;
1846			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1847			/*
1848			 * XXXMAC: Because we don't have fp->f_cred here,
1849			 * we pass in NOCRED.  This is probably wrong, but
1850			 * is consistent with our original implementation.
1851			 */
1852			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
1853			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
1854			    IO_VMIO | ((MAXBSIZE / bsize) << 16),
1855			    td->td_ucred, NOCRED, &resid, td);
1856			VOP_UNLOCK(vp, 0, td);
1857			if (error)
1858				VM_OBJECT_LOCK(obj);
1859			vm_page_lock_queues();
1860			vm_page_flag_clear(pg, PG_ZERO);
1861			vm_page_io_finish(pg);
1862			if (error) {
1863				vm_page_unwire(pg, 0);
1864				/*
1865				 * See if anyone else might know about this page.
1866				 * If not and it is not valid, then free it.
1867				 */
1868				if (pg->wire_count == 0 && pg->valid == 0 &&
1869				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1870				    pg->hold_count == 0) {
1871					vm_page_busy(pg);
1872					vm_page_free(pg);
1873				}
1874				vm_page_unlock_queues();
1875				VM_OBJECT_UNLOCK(obj);
1876				sbunlock(&so->so_snd);
1877				goto done;
1878			}
1879			mbstat.sf_iocnt++;
1880		} else {
1881			VM_OBJECT_UNLOCK(obj);
1882		}
1883		vm_page_unlock_queues();
1884
1885		/*
1886		 * Get a sendfile buf. We usually wait as long as necessary,
1887		 * but this wait can be interrupted.
1888		 */
1889		if ((sf = sf_buf_alloc(pg)) == NULL) {
1890			mbstat.sf_allocfail++;
1891			vm_page_lock_queues();
1892			vm_page_unwire(pg, 0);
1893			if (pg->wire_count == 0 && pg->object == NULL)
1894				vm_page_free(pg);
1895			vm_page_unlock_queues();
1896			sbunlock(&so->so_snd);
1897			error = EINTR;
1898			goto done;
1899		}
1900
1901		/*
1902		 * Get an mbuf header and set it up as having external storage.
1903		 */
1904		MGETHDR(m, M_TRYWAIT, MT_DATA);
1905		if (m == NULL) {
1906			error = ENOBUFS;
1907			sf_buf_free((void *)sf_buf_kva(sf), sf);
1908			sbunlock(&so->so_snd);
1909			goto done;
1910		}
1911		/*
1912		 * Setup external storage for mbuf.
1913		 */
1914		MEXTADD(m, sf_buf_kva(sf), PAGE_SIZE, sf_buf_free, sf, M_RDONLY,
1915		    EXT_SFBUF);
1916		m->m_data = (char *)sf_buf_kva(sf) + pgoff;
1917		m->m_pkthdr.len = m->m_len = xfsize;
1918		/*
1919		 * Add the buffer to the socket buffer chain.
1920		 */
1921		s = splnet();
1922retry_space:
1923		/*
1924		 * Make sure that the socket is still able to take more data.
1925		 * CANTSENDMORE being true usually means that the connection
1926		 * was closed. so_error is true when an error was sensed after
1927		 * a previous send.
1928		 * The state is checked after the page mapping and buffer
1929		 * allocation above since those operations may block and make
1930		 * any socket checks stale. From this point forward, nothing
1931		 * blocks before the pru_send (or more accurately, any blocking
1932		 * results in a loop back to here to re-check).
1933		 */
1934		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1935			if (so->so_state & SS_CANTSENDMORE) {
1936				error = EPIPE;
1937			} else {
1938				error = so->so_error;
1939				so->so_error = 0;
1940			}
1941			m_freem(m);
1942			sbunlock(&so->so_snd);
1943			splx(s);
1944			goto done;
1945		}
1946		/*
1947		 * Wait for socket space to become available. We do this just
1948		 * after checking the connection state above in order to avoid
1949		 * a race condition with sbwait().
1950		 */
1951		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1952			if (so->so_state & SS_NBIO) {
1953				m_freem(m);
1954				sbunlock(&so->so_snd);
1955				splx(s);
1956				error = EAGAIN;
1957				goto done;
1958			}
1959			error = sbwait(&so->so_snd);
1960			/*
1961			 * An error from sbwait usually indicates that we've
1962			 * been interrupted by a signal. If we've sent anything
1963			 * then return bytes sent, otherwise return the error.
1964			 */
1965			if (error) {
1966				m_freem(m);
1967				sbunlock(&so->so_snd);
1968				splx(s);
1969				goto done;
1970			}
1971			goto retry_space;
1972		}
1973		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
1974		splx(s);
1975		if (error) {
1976			sbunlock(&so->so_snd);
1977			goto done;
1978		}
1979	}
1980	sbunlock(&so->so_snd);
1981
1982	/*
1983	 * Send trailers. Wimp out and use writev(2).
1984	 */
1985	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1986			nuap.fd = uap->s;
1987			nuap.iovp = hdtr.trailers;
1988			nuap.iovcnt = hdtr.trl_cnt;
1989			error = writev(td, &nuap);
1990			if (error)
1991				goto done;
1992			if (compat)
1993				sbytes += td->td_retval[0];
1994			else
1995				hdtr_size += td->td_retval[0];
1996	}
1997
1998done:
1999	/*
2000	 * If there was no error we have to clear td->td_retval[0]
2001	 * because it may have been set by writev.
2002	 */
2003	if (error == 0) {
2004		td->td_retval[0] = 0;
2005	}
2006	if (uap->sbytes != NULL) {
2007		if (!compat)
2008			sbytes += hdtr_size;
2009		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2010	}
2011	if (vp)
2012		vrele(vp);
2013	if (so)
2014		fputsock(so);
2015
2016	mtx_unlock(&Giant);
2017
2018	if (error == ERESTART)
2019		error = EINTR;
2020
2021	return (error);
2022}
2023