kern_sendfile.c revision 123844
1/*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 123844 2003-12-25 23:44:38Z dwmalone $");
41
42#include "opt_compat.h"
43#include "opt_ktrace.h"
44#include "opt_mac.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/mac.h>
51#include <sys/mutex.h>
52#include <sys/sysproto.h>
53#include <sys/malloc.h>
54#include <sys/filedesc.h>
55#include <sys/event.h>
56#include <sys/proc.h>
57#include <sys/fcntl.h>
58#include <sys/file.h>
59#include <sys/filio.h>
60#include <sys/mount.h>
61#include <sys/mbuf.h>
62#include <sys/protosw.h>
63#include <sys/sf_buf.h>
64#include <sys/socket.h>
65#include <sys/socketvar.h>
66#include <sys/signalvar.h>
67#include <sys/syscallsubr.h>
68#include <sys/uio.h>
69#include <sys/vnode.h>
70#ifdef KTRACE
71#include <sys/ktrace.h>
72#endif
73
74#include <vm/vm.h>
75#include <vm/vm_object.h>
76#include <vm/vm_page.h>
77#include <vm/vm_pageout.h>
78#include <vm/vm_kern.h>
79#include <vm/vm_extern.h>
80
81static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
82static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
83
84static int accept1(struct thread *td, struct accept_args *uap, int compat);
85static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
86static int getsockname1(struct thread *td, struct getsockname_args *uap,
87			int compat);
88static int getpeername1(struct thread *td, struct getpeername_args *uap,
89			int compat);
90
91/*
92 * System call interface to the socket abstraction.
93 */
94#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
95#define COMPAT_OLDSOCK
96#endif
97
98/*
99 * MPSAFE
100 */
101int
102socket(td, uap)
103	struct thread *td;
104	register struct socket_args /* {
105		int	domain;
106		int	type;
107		int	protocol;
108	} */ *uap;
109{
110	struct filedesc *fdp;
111	struct socket *so;
112	struct file *fp;
113	int fd, error;
114
115	fdp = td->td_proc->p_fd;
116	error = falloc(td, &fp, &fd);
117	if (error)
118		goto done2;
119	/* An extra reference on `fp' has been held for us by falloc(). */
120	mtx_lock(&Giant);
121	error = socreate(uap->domain, &so, uap->type, uap->protocol,
122	    td->td_ucred, td);
123	mtx_unlock(&Giant);
124	FILEDESC_LOCK(fdp);
125	if (error) {
126		if (fdp->fd_ofiles[fd] == fp) {
127			fdp->fd_ofiles[fd] = NULL;
128			FILEDESC_UNLOCK(fdp);
129			fdrop(fp, td);
130		} else
131			FILEDESC_UNLOCK(fdp);
132	} else {
133		fp->f_data = so;	/* already has ref count */
134		fp->f_flag = FREAD|FWRITE;
135		fp->f_ops = &socketops;
136		fp->f_type = DTYPE_SOCKET;
137		FILEDESC_UNLOCK(fdp);
138		td->td_retval[0] = fd;
139	}
140	fdrop(fp, td);
141done2:
142	return (error);
143}
144
145/*
146 * MPSAFE
147 */
148/* ARGSUSED */
149int
150bind(td, uap)
151	struct thread *td;
152	register struct bind_args /* {
153		int	s;
154		caddr_t	name;
155		int	namelen;
156	} */ *uap;
157{
158	struct sockaddr *sa;
159	int error;
160
161	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
162		return (error);
163
164	return (kern_bind(td, uap->s, sa));
165}
166
167int
168kern_bind(td, fd, sa)
169	struct thread *td;
170	int fd;
171	struct sockaddr *sa;
172{
173	struct socket *so;
174	int error;
175
176	mtx_lock(&Giant);
177	if ((error = fgetsock(td, fd, &so, NULL)) != 0)
178		goto done2;
179#ifdef MAC
180	error = mac_check_socket_bind(td->td_ucred, so, sa);
181	if (error)
182		goto done1;
183#endif
184	error = sobind(so, sa, td);
185#ifdef MAC
186done1:
187#endif
188	fputsock(so);
189done2:
190	mtx_unlock(&Giant);
191	FREE(sa, M_SONAME);
192	return (error);
193}
194
195/*
196 * MPSAFE
197 */
198/* ARGSUSED */
199int
200listen(td, uap)
201	struct thread *td;
202	register struct listen_args /* {
203		int	s;
204		int	backlog;
205	} */ *uap;
206{
207	struct socket *so;
208	int error;
209
210	mtx_lock(&Giant);
211	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
212#ifdef MAC
213		error = mac_check_socket_listen(td->td_ucred, so);
214		if (error)
215			goto done;
216#endif
217		error = solisten(so, uap->backlog, td);
218#ifdef MAC
219done:
220#endif
221		fputsock(so);
222	}
223	mtx_unlock(&Giant);
224	return(error);
225}
226
227/*
228 * accept1()
229 * MPSAFE
230 */
231static int
232accept1(td, uap, compat)
233	struct thread *td;
234	register struct accept_args /* {
235		int	s;
236		struct sockaddr	* __restrict name;
237		socklen_t	* __restrict anamelen;
238	} */ *uap;
239	int compat;
240{
241	struct filedesc *fdp;
242	struct file *nfp = NULL;
243	struct sockaddr *sa;
244	socklen_t namelen;
245	int error, s;
246	struct socket *head, *so;
247	int fd;
248	u_int fflag;
249	pid_t pgid;
250	int tmp;
251
252	fdp = td->td_proc->p_fd;
253	if (uap->name) {
254		error = copyin(uap->anamelen, &namelen, sizeof (namelen));
255		if(error)
256			goto done3;
257		if (namelen < 0) {
258			error = EINVAL;
259			goto done3;
260		}
261	}
262	mtx_lock(&Giant);
263	error = fgetsock(td, uap->s, &head, &fflag);
264	if (error)
265		goto done2;
266	s = splnet();
267	if ((head->so_options & SO_ACCEPTCONN) == 0) {
268		splx(s);
269		error = EINVAL;
270		goto done;
271	}
272	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
273		if (head->so_state & SS_CANTRCVMORE) {
274			head->so_error = ECONNABORTED;
275			break;
276		}
277		if ((head->so_state & SS_NBIO) != 0) {
278			head->so_error = EWOULDBLOCK;
279			break;
280		}
281		error = tsleep(&head->so_timeo, PSOCK | PCATCH,
282		    "accept", 0);
283		if (error) {
284			splx(s);
285			goto done;
286		}
287	}
288	if (head->so_error) {
289		error = head->so_error;
290		head->so_error = 0;
291		splx(s);
292		goto done;
293	}
294
295	/*
296	 * At this point we know that there is at least one connection
297	 * ready to be accepted. Remove it from the queue prior to
298	 * allocating the file descriptor for it since falloc() may
299	 * block allowing another process to accept the connection
300	 * instead.
301	 */
302	so = TAILQ_FIRST(&head->so_comp);
303	TAILQ_REMOVE(&head->so_comp, so, so_list);
304	head->so_qlen--;
305
306	error = falloc(td, &nfp, &fd);
307	if (error) {
308		/*
309		 * Probably ran out of file descriptors. Put the
310		 * unaccepted connection back onto the queue and
311		 * do another wakeup so some other process might
312		 * have a chance at it.
313		 */
314		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
315		head->so_qlen++;
316		wakeup_one(&head->so_timeo);
317		splx(s);
318		goto done;
319	}
320	/* An extra reference on `nfp' has been held for us by falloc(). */
321	td->td_retval[0] = fd;
322
323	/* connection has been removed from the listen queue */
324	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
325
326	so->so_state &= ~SS_COMP;
327	so->so_head = NULL;
328	pgid = fgetown(&head->so_sigio);
329	if (pgid != 0)
330		fsetown(pgid, &so->so_sigio);
331
332	FILE_LOCK(nfp);
333	soref(so);			/* file descriptor reference */
334	nfp->f_data = so;	/* nfp has ref count from falloc */
335	nfp->f_flag = fflag;
336	nfp->f_ops = &socketops;
337	nfp->f_type = DTYPE_SOCKET;
338	FILE_UNLOCK(nfp);
339	/* Sync socket nonblocking/async state with file flags */
340	tmp = fflag & FNONBLOCK;
341	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
342	tmp = fflag & FASYNC;
343	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
344	sa = 0;
345	error = soaccept(so, &sa);
346	if (error) {
347		/*
348		 * return a namelen of zero for older code which might
349	 	 * ignore the return value from accept.
350		 */
351		if (uap->name != NULL) {
352			namelen = 0;
353			(void) copyout(&namelen,
354			    uap->anamelen, sizeof(*uap->anamelen));
355		}
356		goto noconnection;
357	}
358	if (sa == NULL) {
359		namelen = 0;
360		if (uap->name)
361			goto gotnoname;
362		splx(s);
363		error = 0;
364		goto done;
365	}
366	if (uap->name) {
367		/* check sa_len before it is destroyed */
368		if (namelen > sa->sa_len)
369			namelen = sa->sa_len;
370#ifdef COMPAT_OLDSOCK
371		if (compat)
372			((struct osockaddr *)sa)->sa_family =
373			    sa->sa_family;
374#endif
375		error = copyout(sa, uap->name, (u_int)namelen);
376		if (!error)
377gotnoname:
378			error = copyout(&namelen,
379			    uap->anamelen, sizeof (*uap->anamelen));
380	}
381noconnection:
382	if (sa)
383		FREE(sa, M_SONAME);
384
385	/*
386	 * close the new descriptor, assuming someone hasn't ripped it
387	 * out from under us.
388	 */
389	if (error) {
390		FILEDESC_LOCK(fdp);
391		if (fdp->fd_ofiles[fd] == nfp) {
392			fdp->fd_ofiles[fd] = NULL;
393			FILEDESC_UNLOCK(fdp);
394			fdrop(nfp, td);
395		} else {
396			FILEDESC_UNLOCK(fdp);
397		}
398	}
399	splx(s);
400
401	/*
402	 * Release explicitly held references before returning.
403	 */
404done:
405	if (nfp != NULL)
406		fdrop(nfp, td);
407	fputsock(head);
408done2:
409	mtx_unlock(&Giant);
410done3:
411	return (error);
412}
413
/*
 * accept(2): current-ABI entry point; accept1() with compat disabled.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 0);
	return (error);
}
425
426#ifdef COMPAT_OLDSOCK
/*
 * Old-ABI accept: accept1() with compat enabled.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 1);
	return (error);
}
438#endif /* COMPAT_OLDSOCK */
439
440/*
441 * MPSAFE
442 */
443/* ARGSUSED */
444int
445connect(td, uap)
446	struct thread *td;
447	register struct connect_args /* {
448		int	s;
449		caddr_t	name;
450		int	namelen;
451	} */ *uap;
452{
453	struct sockaddr *sa;
454	int error;
455
456	error = getsockaddr(&sa, uap->name, uap->namelen);
457	if (error)
458		return error;
459
460	return (kern_connect(td, uap->s, sa));
461}
462
463
464int
465kern_connect(td, fd, sa)
466	struct thread *td;
467	int fd;
468	struct sockaddr *sa;
469{
470	struct socket *so;
471	int error, s;
472	int interrupted = 0;
473
474	mtx_lock(&Giant);
475	if ((error = fgetsock(td, fd, &so, NULL)) != 0)
476		goto done2;
477	if (so->so_state & SS_ISCONNECTING) {
478		error = EALREADY;
479		goto done1;
480	}
481#ifdef MAC
482	error = mac_check_socket_connect(td->td_ucred, so, sa);
483	if (error)
484		goto bad;
485#endif
486	error = soconnect(so, sa, td);
487	if (error)
488		goto bad;
489	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
490		error = EINPROGRESS;
491		goto done1;
492	}
493	s = splnet();
494	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
495		error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0);
496		if (error) {
497			if (error == EINTR || error == ERESTART)
498				interrupted = 1;
499			break;
500		}
501	}
502	if (error == 0) {
503		error = so->so_error;
504		so->so_error = 0;
505	}
506	splx(s);
507bad:
508	if (!interrupted)
509		so->so_state &= ~SS_ISCONNECTING;
510	if (error == ERESTART)
511		error = EINTR;
512done1:
513	fputsock(so);
514done2:
515	mtx_unlock(&Giant);
516	FREE(sa, M_SONAME);
517	return (error);
518}
519
520/*
521 * MPSAFE
522 */
523int
524socketpair(td, uap)
525	struct thread *td;
526	register struct socketpair_args /* {
527		int	domain;
528		int	type;
529		int	protocol;
530		int	*rsv;
531	} */ *uap;
532{
533	register struct filedesc *fdp = td->td_proc->p_fd;
534	struct file *fp1, *fp2;
535	struct socket *so1, *so2;
536	int fd, error, sv[2];
537
538	mtx_lock(&Giant);
539	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
540	    td->td_ucred, td);
541	if (error)
542		goto done2;
543	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
544	    td->td_ucred, td);
545	if (error)
546		goto free1;
547	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
548	error = falloc(td, &fp1, &fd);
549	if (error)
550		goto free2;
551	sv[0] = fd;
552	fp1->f_data = so1;	/* so1 already has ref count */
553	error = falloc(td, &fp2, &fd);
554	if (error)
555		goto free3;
556	fp2->f_data = so2;	/* so2 already has ref count */
557	sv[1] = fd;
558	error = soconnect2(so1, so2);
559	if (error)
560		goto free4;
561	if (uap->type == SOCK_DGRAM) {
562		/*
563		 * Datagram socket connection is asymmetric.
564		 */
565		 error = soconnect2(so2, so1);
566		 if (error)
567			goto free4;
568	}
569	FILE_LOCK(fp1);
570	fp1->f_flag = FREAD|FWRITE;
571	fp1->f_ops = &socketops;
572	fp1->f_type = DTYPE_SOCKET;
573	FILE_UNLOCK(fp1);
574	FILE_LOCK(fp2);
575	fp2->f_flag = FREAD|FWRITE;
576	fp2->f_ops = &socketops;
577	fp2->f_type = DTYPE_SOCKET;
578	FILE_UNLOCK(fp2);
579	error = copyout(sv, uap->rsv, 2 * sizeof (int));
580	fdrop(fp1, td);
581	fdrop(fp2, td);
582	goto done2;
583free4:
584	FILEDESC_LOCK(fdp);
585	if (fdp->fd_ofiles[sv[1]] == fp2) {
586		fdp->fd_ofiles[sv[1]] = NULL;
587		FILEDESC_UNLOCK(fdp);
588		fdrop(fp2, td);
589	} else
590		FILEDESC_UNLOCK(fdp);
591	fdrop(fp2, td);
592free3:
593	FILEDESC_LOCK(fdp);
594	if (fdp->fd_ofiles[sv[0]] == fp1) {
595		fdp->fd_ofiles[sv[0]] = NULL;
596		FILEDESC_UNLOCK(fdp);
597		fdrop(fp1, td);
598	} else
599		FILEDESC_UNLOCK(fdp);
600	fdrop(fp1, td);
601free2:
602	(void)soclose(so2);
603free1:
604	(void)soclose(so1);
605done2:
606	mtx_unlock(&Giant);
607	return (error);
608}
609
610static int
611sendit(td, s, mp, flags)
612	register struct thread *td;
613	int s;
614	register struct msghdr *mp;
615	int flags;
616{
617	struct mbuf *control;
618	struct sockaddr *to;
619	int error;
620
621	if (mp->msg_name != NULL) {
622		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
623		if (error) {
624			to = NULL;
625			goto bad;
626		}
627		mp->msg_name = to;
628	} else
629		to = NULL;
630
631	if (mp->msg_control) {
632		if (mp->msg_controllen < sizeof(struct cmsghdr)
633#ifdef COMPAT_OLDSOCK
634		    && mp->msg_flags != MSG_COMPAT
635#endif
636		) {
637			error = EINVAL;
638			goto bad;
639		}
640		error = sockargs(&control, mp->msg_control,
641		    mp->msg_controllen, MT_CONTROL);
642		if (error)
643			goto bad;
644#ifdef COMPAT_OLDSOCK
645		if (mp->msg_flags == MSG_COMPAT) {
646			register struct cmsghdr *cm;
647
648			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
649			if (control == 0) {
650				error = ENOBUFS;
651				goto bad;
652			} else {
653				cm = mtod(control, struct cmsghdr *);
654				cm->cmsg_len = control->m_len;
655				cm->cmsg_level = SOL_SOCKET;
656				cm->cmsg_type = SCM_RIGHTS;
657			}
658		}
659#endif
660	} else {
661		control = NULL;
662	}
663
664	error = kern_sendit(td, s, mp, flags, control);
665
666bad:
667	if (to)
668		FREE(to, M_SONAME);
669	return (error);
670}
671
672int
673kern_sendit(td, s, mp, flags, control)
674	struct thread *td;
675	int s;
676	struct msghdr *mp;
677	int flags;
678	struct mbuf *control;
679{
680	struct uio auio;
681	struct iovec *iov;
682	struct socket *so;
683	int i;
684	int len, error;
685#ifdef KTRACE
686	struct iovec *ktriov = NULL;
687	struct uio ktruio;
688	int iovlen;
689#endif
690
691	mtx_lock(&Giant);
692	if ((error = fgetsock(td, s, &so, NULL)) != 0)
693		goto bad2;
694
695#ifdef MAC
696	error = mac_check_socket_send(td->td_ucred, so);
697	if (error)
698		goto bad;
699#endif
700
701	auio.uio_iov = mp->msg_iov;
702	auio.uio_iovcnt = mp->msg_iovlen;
703	auio.uio_segflg = UIO_USERSPACE;
704	auio.uio_rw = UIO_WRITE;
705	auio.uio_td = td;
706	auio.uio_offset = 0;			/* XXX */
707	auio.uio_resid = 0;
708	iov = mp->msg_iov;
709	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
710		if ((auio.uio_resid += iov->iov_len) < 0) {
711			error = EINVAL;
712			goto bad;
713		}
714	}
715#ifdef KTRACE
716	if (KTRPOINT(td, KTR_GENIO)) {
717		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
718		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
719		bcopy(auio.uio_iov, ktriov, iovlen);
720		ktruio = auio;
721	}
722#endif
723	len = auio.uio_resid;
724	error = so->so_proto->pr_usrreqs->pru_sosend(so, mp->msg_name, &auio,
725	    0, control, flags, td);
726	if (error) {
727		if (auio.uio_resid != len && (error == ERESTART ||
728		    error == EINTR || error == EWOULDBLOCK))
729			error = 0;
730		/* Generation of SIGPIPE can be controlled per socket */
731		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) {
732			PROC_LOCK(td->td_proc);
733			psignal(td->td_proc, SIGPIPE);
734			PROC_UNLOCK(td->td_proc);
735		}
736	}
737	if (error == 0)
738		td->td_retval[0] = len - auio.uio_resid;
739#ifdef KTRACE
740	if (ktriov != NULL) {
741		if (error == 0) {
742			ktruio.uio_iov = ktriov;
743			ktruio.uio_resid = td->td_retval[0];
744			ktrgenio(s, UIO_WRITE, &ktruio, error);
745		}
746		FREE(ktriov, M_TEMP);
747	}
748#endif
749bad:
750	fputsock(so);
751bad2:
752	mtx_unlock(&Giant);
753	return (error);
754}
755
756/*
757 * MPSAFE
758 */
759int
760sendto(td, uap)
761	struct thread *td;
762	register struct sendto_args /* {
763		int	s;
764		caddr_t	buf;
765		size_t	len;
766		int	flags;
767		caddr_t	to;
768		int	tolen;
769	} */ *uap;
770{
771	struct msghdr msg;
772	struct iovec aiov;
773	int error;
774
775	msg.msg_name = uap->to;
776	msg.msg_namelen = uap->tolen;
777	msg.msg_iov = &aiov;
778	msg.msg_iovlen = 1;
779	msg.msg_control = 0;
780#ifdef COMPAT_OLDSOCK
781	msg.msg_flags = 0;
782#endif
783	aiov.iov_base = uap->buf;
784	aiov.iov_len = uap->len;
785	error = sendit(td, uap->s, &msg, uap->flags);
786	return (error);
787}
788
789#ifdef COMPAT_OLDSOCK
790/*
791 * MPSAFE
792 */
793int
794osend(td, uap)
795	struct thread *td;
796	register struct osend_args /* {
797		int	s;
798		caddr_t	buf;
799		int	len;
800		int	flags;
801	} */ *uap;
802{
803	struct msghdr msg;
804	struct iovec aiov;
805	int error;
806
807	msg.msg_name = 0;
808	msg.msg_namelen = 0;
809	msg.msg_iov = &aiov;
810	msg.msg_iovlen = 1;
811	aiov.iov_base = uap->buf;
812	aiov.iov_len = uap->len;
813	msg.msg_control = 0;
814	msg.msg_flags = 0;
815	error = sendit(td, uap->s, &msg, uap->flags);
816	return (error);
817}
818
819/*
820 * MPSAFE
821 */
822int
823osendmsg(td, uap)
824	struct thread *td;
825	register struct osendmsg_args /* {
826		int	s;
827		caddr_t	msg;
828		int	flags;
829	} */ *uap;
830{
831	struct msghdr msg;
832	struct iovec aiov[UIO_SMALLIOV], *iov;
833	int error;
834
835	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
836	if (error)
837		goto done2;
838	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
839		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
840			error = EMSGSIZE;
841			goto done2;
842		}
843		MALLOC(iov, struct iovec *,
844		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
845		      M_WAITOK);
846	} else {
847		iov = aiov;
848	}
849	error = copyin(msg.msg_iov, iov,
850	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
851	if (error)
852		goto done;
853	msg.msg_flags = MSG_COMPAT;
854	msg.msg_iov = iov;
855	error = sendit(td, uap->s, &msg, uap->flags);
856done:
857	if (iov != aiov)
858		FREE(iov, M_IOV);
859done2:
860	return (error);
861}
862#endif
863
864/*
865 * MPSAFE
866 */
867int
868sendmsg(td, uap)
869	struct thread *td;
870	register struct sendmsg_args /* {
871		int	s;
872		caddr_t	msg;
873		int	flags;
874	} */ *uap;
875{
876	struct msghdr msg;
877	struct iovec aiov[UIO_SMALLIOV], *iov;
878	int error;
879
880	error = copyin(uap->msg, &msg, sizeof (msg));
881	if (error)
882		goto done2;
883	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
884		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
885			error = EMSGSIZE;
886			goto done2;
887		}
888		MALLOC(iov, struct iovec *,
889		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
890		       M_WAITOK);
891	} else {
892		iov = aiov;
893	}
894	if (msg.msg_iovlen &&
895	    (error = copyin(msg.msg_iov, iov,
896	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
897		goto done;
898	msg.msg_iov = iov;
899#ifdef COMPAT_OLDSOCK
900	msg.msg_flags = 0;
901#endif
902	error = sendit(td, uap->s, &msg, uap->flags);
903done:
904	if (iov != aiov)
905		FREE(iov, M_IOV);
906done2:
907	return (error);
908}
909
910static int
911recvit(td, s, mp, namelenp)
912	register struct thread *td;
913	int s;
914	register struct msghdr *mp;
915	void *namelenp;
916{
917	struct uio auio;
918	register struct iovec *iov;
919	register int i;
920	socklen_t len;
921	int error;
922	struct mbuf *m, *control = 0;
923	caddr_t ctlbuf;
924	struct socket *so;
925	struct sockaddr *fromsa = 0;
926#ifdef KTRACE
927	struct iovec *ktriov = NULL;
928	struct uio ktruio;
929	int iovlen;
930#endif
931
932	mtx_lock(&Giant);
933	if ((error = fgetsock(td, s, &so, NULL)) != 0) {
934		mtx_unlock(&Giant);
935		return (error);
936	}
937
938#ifdef MAC
939	error = mac_check_socket_receive(td->td_ucred, so);
940	if (error) {
941		fputsock(so);
942		mtx_unlock(&Giant);
943		return (error);
944	}
945#endif
946
947	auio.uio_iov = mp->msg_iov;
948	auio.uio_iovcnt = mp->msg_iovlen;
949	auio.uio_segflg = UIO_USERSPACE;
950	auio.uio_rw = UIO_READ;
951	auio.uio_td = td;
952	auio.uio_offset = 0;			/* XXX */
953	auio.uio_resid = 0;
954	iov = mp->msg_iov;
955	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
956		if ((auio.uio_resid += iov->iov_len) < 0) {
957			fputsock(so);
958			return (EINVAL);
959		}
960	}
961#ifdef KTRACE
962	if (KTRPOINT(td, KTR_GENIO)) {
963		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
964		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
965		bcopy(auio.uio_iov, ktriov, iovlen);
966		ktruio = auio;
967	}
968#endif
969	len = auio.uio_resid;
970	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
971	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
972	    &mp->msg_flags);
973	if (error) {
974		if (auio.uio_resid != (int)len && (error == ERESTART ||
975		    error == EINTR || error == EWOULDBLOCK))
976			error = 0;
977	}
978#ifdef KTRACE
979	if (ktriov != NULL) {
980		if (error == 0) {
981			ktruio.uio_iov = ktriov;
982			ktruio.uio_resid = (int)len - auio.uio_resid;
983			ktrgenio(s, UIO_READ, &ktruio, error);
984		}
985		FREE(ktriov, M_TEMP);
986	}
987#endif
988	if (error)
989		goto out;
990	td->td_retval[0] = (int)len - auio.uio_resid;
991	if (mp->msg_name) {
992		len = mp->msg_namelen;
993		if (len <= 0 || fromsa == 0)
994			len = 0;
995		else {
996			/* save sa_len before it is destroyed by MSG_COMPAT */
997			len = MIN(len, fromsa->sa_len);
998#ifdef COMPAT_OLDSOCK
999			if (mp->msg_flags & MSG_COMPAT)
1000				((struct osockaddr *)fromsa)->sa_family =
1001				    fromsa->sa_family;
1002#endif
1003			error = copyout(fromsa, mp->msg_name, (unsigned)len);
1004			if (error)
1005				goto out;
1006		}
1007		mp->msg_namelen = len;
1008		if (namelenp &&
1009		    (error = copyout(&len, namelenp, sizeof (socklen_t)))) {
1010#ifdef COMPAT_OLDSOCK
1011			if (mp->msg_flags & MSG_COMPAT)
1012				error = 0;	/* old recvfrom didn't check */
1013			else
1014#endif
1015			goto out;
1016		}
1017	}
1018	if (mp->msg_control) {
1019#ifdef COMPAT_OLDSOCK
1020		/*
1021		 * We assume that old recvmsg calls won't receive access
1022		 * rights and other control info, esp. as control info
1023		 * is always optional and those options didn't exist in 4.3.
1024		 * If we receive rights, trim the cmsghdr; anything else
1025		 * is tossed.
1026		 */
1027		if (control && mp->msg_flags & MSG_COMPAT) {
1028			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1029			    SOL_SOCKET ||
1030			    mtod(control, struct cmsghdr *)->cmsg_type !=
1031			    SCM_RIGHTS) {
1032				mp->msg_controllen = 0;
1033				goto out;
1034			}
1035			control->m_len -= sizeof (struct cmsghdr);
1036			control->m_data += sizeof (struct cmsghdr);
1037		}
1038#endif
1039		len = mp->msg_controllen;
1040		m = control;
1041		mp->msg_controllen = 0;
1042		ctlbuf = mp->msg_control;
1043
1044		while (m && len > 0) {
1045			unsigned int tocopy;
1046
1047			if (len >= m->m_len)
1048				tocopy = m->m_len;
1049			else {
1050				mp->msg_flags |= MSG_CTRUNC;
1051				tocopy = len;
1052			}
1053
1054			if ((error = copyout(mtod(m, caddr_t),
1055					ctlbuf, tocopy)) != 0)
1056				goto out;
1057
1058			ctlbuf += tocopy;
1059			len -= tocopy;
1060			m = m->m_next;
1061		}
1062		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1063	}
1064out:
1065	fputsock(so);
1066	mtx_unlock(&Giant);
1067	if (fromsa)
1068		FREE(fromsa, M_SONAME);
1069	if (control)
1070		m_freem(control);
1071	return (error);
1072}
1073
1074/*
1075 * MPSAFE
1076 */
1077int
1078recvfrom(td, uap)
1079	struct thread *td;
1080	register struct recvfrom_args /* {
1081		int	s;
1082		caddr_t	buf;
1083		size_t	len;
1084		int	flags;
1085		struct sockaddr * __restrict	from;
1086		socklen_t * __restrict fromlenaddr;
1087	} */ *uap;
1088{
1089	struct msghdr msg;
1090	struct iovec aiov;
1091	int error;
1092
1093	if (uap->fromlenaddr) {
1094		error = copyin(uap->fromlenaddr,
1095		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1096		if (error)
1097			goto done2;
1098	} else {
1099		msg.msg_namelen = 0;
1100	}
1101	msg.msg_name = uap->from;
1102	msg.msg_iov = &aiov;
1103	msg.msg_iovlen = 1;
1104	aiov.iov_base = uap->buf;
1105	aiov.iov_len = uap->len;
1106	msg.msg_control = 0;
1107	msg.msg_flags = uap->flags;
1108	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1109done2:
1110	return(error);
1111}
1112
1113#ifdef COMPAT_OLDSOCK
1114/*
1115 * MPSAFE
1116 */
1117int
1118orecvfrom(td, uap)
1119	struct thread *td;
1120	struct recvfrom_args *uap;
1121{
1122
1123	uap->flags |= MSG_COMPAT;
1124	return (recvfrom(td, uap));
1125}
1126#endif
1127
1128
1129#ifdef COMPAT_OLDSOCK
1130/*
1131 * MPSAFE
1132 */
1133int
1134orecv(td, uap)
1135	struct thread *td;
1136	register struct orecv_args /* {
1137		int	s;
1138		caddr_t	buf;
1139		int	len;
1140		int	flags;
1141	} */ *uap;
1142{
1143	struct msghdr msg;
1144	struct iovec aiov;
1145	int error;
1146
1147	msg.msg_name = 0;
1148	msg.msg_namelen = 0;
1149	msg.msg_iov = &aiov;
1150	msg.msg_iovlen = 1;
1151	aiov.iov_base = uap->buf;
1152	aiov.iov_len = uap->len;
1153	msg.msg_control = 0;
1154	msg.msg_flags = uap->flags;
1155	error = recvit(td, uap->s, &msg, NULL);
1156	return (error);
1157}
1158
1159/*
1160 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1161 * overlays the new one, missing only the flags, and with the (old) access
1162 * rights where the control fields are now.
1163 *
1164 * MPSAFE
1165 */
1166int
1167orecvmsg(td, uap)
1168	struct thread *td;
1169	register struct orecvmsg_args /* {
1170		int	s;
1171		struct	omsghdr *msg;
1172		int	flags;
1173	} */ *uap;
1174{
1175	struct msghdr msg;
1176	struct iovec aiov[UIO_SMALLIOV], *iov;
1177	int error;
1178
1179	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1180	if (error)
1181		return (error);
1182
1183	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1184		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1185			error = EMSGSIZE;
1186			goto done2;
1187		}
1188		MALLOC(iov, struct iovec *,
1189		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1190		      M_WAITOK);
1191	} else {
1192		iov = aiov;
1193	}
1194	msg.msg_flags = uap->flags | MSG_COMPAT;
1195	error = copyin(msg.msg_iov, iov,
1196	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1197	if (error)
1198		goto done;
1199	msg.msg_iov = iov;
1200	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1201
1202	if (msg.msg_controllen && error == 0)
1203		error = copyout(&msg.msg_controllen,
1204		    &uap->msg->msg_accrightslen, sizeof (int));
1205done:
1206	if (iov != aiov)
1207		FREE(iov, M_IOV);
1208done2:
1209	return (error);
1210}
1211#endif
1212
1213/*
1214 * MPSAFE
1215 */
1216int
1217recvmsg(td, uap)
1218	struct thread *td;
1219	register struct recvmsg_args /* {
1220		int	s;
1221		struct	msghdr *msg;
1222		int	flags;
1223	} */ *uap;
1224{
1225	struct msghdr msg;
1226	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1227	register int error;
1228
1229	error = copyin(uap->msg, &msg, sizeof (msg));
1230	if (error)
1231		goto done2;
1232	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1233		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1234			error = EMSGSIZE;
1235			goto done2;
1236		}
1237		MALLOC(iov, struct iovec *,
1238		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1239		       M_WAITOK);
1240	} else {
1241		iov = aiov;
1242	}
1243#ifdef COMPAT_OLDSOCK
1244	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1245#else
1246	msg.msg_flags = uap->flags;
1247#endif
1248	uiov = msg.msg_iov;
1249	msg.msg_iov = iov;
1250	error = copyin(uiov, iov,
1251	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1252	if (error)
1253		goto done;
1254	error = recvit(td, uap->s, &msg, NULL);
1255	if (!error) {
1256		msg.msg_iov = uiov;
1257		error = copyout(&msg, uap->msg, sizeof(msg));
1258	}
1259done:
1260	if (iov != aiov)
1261		FREE(iov, M_IOV);
1262done2:
1263	return (error);
1264}
1265
1266/*
1267 * MPSAFE
1268 */
1269/* ARGSUSED */
1270int
1271shutdown(td, uap)
1272	struct thread *td;
1273	register struct shutdown_args /* {
1274		int	s;
1275		int	how;
1276	} */ *uap;
1277{
1278	struct socket *so;
1279	int error;
1280
1281	mtx_lock(&Giant);
1282	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1283		error = soshutdown(so, uap->how);
1284		fputsock(so);
1285	}
1286	mtx_unlock(&Giant);
1287	return(error);
1288}
1289
1290/*
1291 * MPSAFE
1292 */
1293/* ARGSUSED */
1294int
1295setsockopt(td, uap)
1296	struct thread *td;
1297	register struct setsockopt_args /* {
1298		int	s;
1299		int	level;
1300		int	name;
1301		caddr_t	val;
1302		int	valsize;
1303	} */ *uap;
1304{
1305	struct socket *so;
1306	struct sockopt sopt;
1307	int error;
1308
1309	if (uap->val == 0 && uap->valsize != 0)
1310		return (EFAULT);
1311	if (uap->valsize < 0)
1312		return (EINVAL);
1313
1314	mtx_lock(&Giant);
1315	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1316		sopt.sopt_dir = SOPT_SET;
1317		sopt.sopt_level = uap->level;
1318		sopt.sopt_name = uap->name;
1319		sopt.sopt_val = uap->val;
1320		sopt.sopt_valsize = uap->valsize;
1321		sopt.sopt_td = td;
1322		error = sosetopt(so, &sopt);
1323		fputsock(so);
1324	}
1325	mtx_unlock(&Giant);
1326	return(error);
1327}
1328
1329/*
1330 * MPSAFE
1331 */
1332/* ARGSUSED */
1333int
1334getsockopt(td, uap)
1335	struct thread *td;
1336	register struct getsockopt_args /* {
1337		int	s;
1338		int	level;
1339		int	name;
1340		void * __restrict	val;
1341		socklen_t * __restrict avalsize;
1342	} */ *uap;
1343{
1344	socklen_t valsize;
1345	int	error;
1346	struct  socket *so;
1347	struct	sockopt sopt;
1348
1349	mtx_lock(&Giant);
1350	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1351		goto done2;
1352	if (uap->val) {
1353		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1354		if (error)
1355			goto done1;
1356		if (valsize < 0) {
1357			error = EINVAL;
1358			goto done1;
1359		}
1360	} else {
1361		valsize = 0;
1362	}
1363
1364	sopt.sopt_dir = SOPT_GET;
1365	sopt.sopt_level = uap->level;
1366	sopt.sopt_name = uap->name;
1367	sopt.sopt_val = uap->val;
1368	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1369	sopt.sopt_td = td;
1370
1371	error = sogetopt(so, &sopt);
1372	if (error == 0) {
1373		valsize = sopt.sopt_valsize;
1374		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1375	}
1376done1:
1377	fputsock(so);
1378done2:
1379	mtx_unlock(&Giant);
1380	return (error);
1381}
1382
1383/*
1384 * getsockname1() - Get socket name.
1385 *
1386 * MPSAFE
1387 */
1388/* ARGSUSED */
1389static int
1390getsockname1(td, uap, compat)
1391	struct thread *td;
1392	register struct getsockname_args /* {
1393		int	fdes;
1394		struct sockaddr * __restrict asa;
1395		socklen_t * __restrict alen;
1396	} */ *uap;
1397	int compat;
1398{
1399	struct socket *so;
1400	struct sockaddr *sa;
1401	socklen_t len;
1402	int error;
1403
1404	mtx_lock(&Giant);
1405	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1406		goto done2;
1407	error = copyin(uap->alen, &len, sizeof (len));
1408	if (error)
1409		goto done1;
1410	if (len < 0) {
1411		error = EINVAL;
1412		goto done1;
1413	}
1414	sa = 0;
1415	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1416	if (error)
1417		goto bad;
1418	if (sa == 0) {
1419		len = 0;
1420		goto gotnothing;
1421	}
1422
1423	len = MIN(len, sa->sa_len);
1424#ifdef COMPAT_OLDSOCK
1425	if (compat)
1426		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1427#endif
1428	error = copyout(sa, uap->asa, (u_int)len);
1429	if (error == 0)
1430gotnothing:
1431		error = copyout(&len, uap->alen, sizeof (len));
1432bad:
1433	if (sa)
1434		FREE(sa, M_SONAME);
1435done1:
1436	fputsock(so);
1437done2:
1438	mtx_unlock(&Giant);
1439	return (error);
1440}
1441
/*
 * getsockname() - system call handler; runs getsockname1() with the
 * compat flag cleared.
 *
 * MPSAFE
 */
int
getsockname(struct thread *td, struct getsockname_args *uap)
{
	int rv;

	rv = getsockname1(td, uap, 0);
	return (rv);
}
1453
#ifdef COMPAT_OLDSOCK
/*
 * ogetsockname() - old-socket-API variant of getsockname(); runs
 * getsockname1() with the compat flag set.
 *
 * MPSAFE
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int rv;

	rv = getsockname1(td, uap, 1);
	return (rv);
}
#endif /* COMPAT_OLDSOCK */
1467
1468/*
1469 * getpeername1() - Get name of peer for connected socket.
1470 *
1471 * MPSAFE
1472 */
1473/* ARGSUSED */
1474static int
1475getpeername1(td, uap, compat)
1476	struct thread *td;
1477	register struct getpeername_args /* {
1478		int	fdes;
1479		struct sockaddr * __restrict	asa;
1480		socklen_t * __restrict	alen;
1481	} */ *uap;
1482	int compat;
1483{
1484	struct socket *so;
1485	struct sockaddr *sa;
1486	socklen_t len;
1487	int error;
1488
1489	mtx_lock(&Giant);
1490	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1491		goto done2;
1492	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1493		error = ENOTCONN;
1494		goto done1;
1495	}
1496	error = copyin(uap->alen, &len, sizeof (len));
1497	if (error)
1498		goto done1;
1499	if (len < 0) {
1500		error = EINVAL;
1501		goto done1;
1502	}
1503	sa = 0;
1504	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1505	if (error)
1506		goto bad;
1507	if (sa == 0) {
1508		len = 0;
1509		goto gotnothing;
1510	}
1511	len = MIN(len, sa->sa_len);
1512#ifdef COMPAT_OLDSOCK
1513	if (compat)
1514		((struct osockaddr *)sa)->sa_family =
1515		    sa->sa_family;
1516#endif
1517	error = copyout(sa, uap->asa, (u_int)len);
1518	if (error)
1519		goto bad;
1520gotnothing:
1521	error = copyout(&len, uap->alen, sizeof (len));
1522bad:
1523	if (sa)
1524		FREE(sa, M_SONAME);
1525done1:
1526	fputsock(so);
1527done2:
1528	mtx_unlock(&Giant);
1529	return (error);
1530}
1531
/*
 * getpeername() - system call handler; runs getpeername1() with the
 * compat flag cleared.
 *
 * MPSAFE
 */
int
getpeername(struct thread *td, struct getpeername_args *uap)
{
	int rv;

	rv = getpeername1(td, uap, 0);
	return (rv);
}
1543
#ifdef COMPAT_OLDSOCK
/*
 * ogetpeername() - old-socket-API variant of getpeername(); runs
 * getpeername1() with the compat flag set.
 *
 * MPSAFE
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *gpa;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	gpa = (struct getpeername_args *)uap;
	return (getpeername1(td, gpa, 1));
}
#endif /* COMPAT_OLDSOCK */
1558
1559int
1560sockargs(mp, buf, buflen, type)
1561	struct mbuf **mp;
1562	caddr_t buf;
1563	int buflen, type;
1564{
1565	register struct sockaddr *sa;
1566	register struct mbuf *m;
1567	int error;
1568
1569	if ((u_int)buflen > MLEN) {
1570#ifdef COMPAT_OLDSOCK
1571		if (type == MT_SONAME && (u_int)buflen <= 112)
1572			buflen = MLEN;		/* unix domain compat. hack */
1573		else
1574#endif
1575		return (EINVAL);
1576	}
1577	m = m_get(M_TRYWAIT, type);
1578	if (m == NULL)
1579		return (ENOBUFS);
1580	m->m_len = buflen;
1581	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1582	if (error)
1583		(void) m_free(m);
1584	else {
1585		*mp = m;
1586		if (type == MT_SONAME) {
1587			sa = mtod(m, struct sockaddr *);
1588
1589#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1590			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1591				sa->sa_family = sa->sa_len;
1592#endif
1593			sa->sa_len = buflen;
1594		}
1595	}
1596	return (error);
1597}
1598
1599int
1600getsockaddr(namp, uaddr, len)
1601	struct sockaddr **namp;
1602	caddr_t uaddr;
1603	size_t len;
1604{
1605	struct sockaddr *sa;
1606	int error;
1607
1608	if (len > SOCK_MAXADDRLEN)
1609		return ENAMETOOLONG;
1610	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1611	error = copyin(uaddr, sa, len);
1612	if (error) {
1613		FREE(sa, M_SONAME);
1614	} else {
1615#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1616		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1617			sa->sa_family = sa->sa_len;
1618#endif
1619		sa->sa_len = len;
1620		*namp = sa;
1621	}
1622	return error;
1623}
1624
/*
 * sendfile(2) system call entry point.
 *
 * MPSAFE
 *
 * User-visible prototype:
 *
 *	int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	    struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmits the file open on 'fd', beginning at 'offset', over the
 * socket 's'.  At most 'nbytes' bytes of the file are sent; nbytes == 0
 * means "up to EOF".  'hdtr' may name header and/or trailer data to be
 * written around the file contents, and if 'sbytes' is given the total
 * number of bytes sent is stored through it.
 *
 * The work is done by do_sendfile(); this wrapper passes compat == 0.
 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
#ifdef COMPAT_FREEBSD4
/*
 * freebsd4_sendfile() - COMPAT_FREEBSD4 entry point for sendfile.
 *
 * Repackages the freebsd4_sendfile_args fields into a sendfile_args
 * structure and runs do_sendfile() with compat == 1.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args nargs;
	int error;

	nargs.fd = uap->fd;
	nargs.s = uap->s;
	nargs.hdtr = uap->hdtr;
	nargs.sbytes = uap->sbytes;
	nargs.offset = uap->offset;
	nargs.nbytes = uap->nbytes;
	nargs.flags = uap->flags;

	error = do_sendfile(td, &nargs, 1);
	return (error);
}
#endif /* COMPAT_FREEBSD4 */
1663
1664static int
1665do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1666{
1667	struct vnode *vp;
1668	struct vm_object *obj;
1669	struct socket *so = NULL;
1670	struct mbuf *m;
1671	struct sf_buf *sf;
1672	struct vm_page *pg;
1673	struct writev_args nuap;
1674	struct sf_hdtr hdtr;
1675	off_t off, xfsize, hdtr_size, sbytes = 0;
1676	int error, s;
1677
1678	mtx_lock(&Giant);
1679
1680	hdtr_size = 0;
1681
1682	/*
1683	 * The descriptor must be a regular file and have a backing VM object.
1684	 */
1685	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1686		goto done;
1687	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1688	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1689		error = EINVAL;
1690		VOP_UNLOCK(vp, 0, td);
1691		goto done;
1692	}
1693	VOP_UNLOCK(vp, 0, td);
1694	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1695		goto done;
1696	if (so->so_type != SOCK_STREAM) {
1697		error = EINVAL;
1698		goto done;
1699	}
1700	if ((so->so_state & SS_ISCONNECTED) == 0) {
1701		error = ENOTCONN;
1702		goto done;
1703	}
1704	if (uap->offset < 0) {
1705		error = EINVAL;
1706		goto done;
1707	}
1708
1709#ifdef MAC
1710	error = mac_check_socket_send(td->td_ucred, so);
1711	if (error)
1712		goto done;
1713#endif
1714
1715	/*
1716	 * If specified, get the pointer to the sf_hdtr struct for
1717	 * any headers/trailers.
1718	 */
1719	if (uap->hdtr != NULL) {
1720		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1721		if (error)
1722			goto done;
1723		/*
1724		 * Send any headers. Wimp out and use writev(2).
1725		 */
1726		if (hdtr.headers != NULL) {
1727			nuap.fd = uap->s;
1728			nuap.iovp = hdtr.headers;
1729			nuap.iovcnt = hdtr.hdr_cnt;
1730			error = writev(td, &nuap);
1731			if (error)
1732				goto done;
1733			if (compat)
1734				sbytes += td->td_retval[0];
1735			else
1736				hdtr_size += td->td_retval[0];
1737		}
1738	}
1739
1740	/*
1741	 * Protect against multiple writers to the socket.
1742	 */
1743	(void) sblock(&so->so_snd, M_WAITOK);
1744
1745	/*
1746	 * Loop through the pages in the file, starting with the requested
1747	 * offset. Get a file page (do I/O if necessary), map the file page
1748	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1749	 * it on the socket.
1750	 */
1751	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1752		vm_pindex_t pindex;
1753		vm_offset_t pgoff;
1754
1755		pindex = OFF_TO_IDX(off);
1756		VM_OBJECT_LOCK(obj);
1757retry_lookup:
1758		/*
1759		 * Calculate the amount to transfer. Not to exceed a page,
1760		 * the EOF, or the passed in nbytes.
1761		 */
1762		xfsize = obj->un_pager.vnp.vnp_size - off;
1763		VM_OBJECT_UNLOCK(obj);
1764		if (xfsize > PAGE_SIZE)
1765			xfsize = PAGE_SIZE;
1766		pgoff = (vm_offset_t)(off & PAGE_MASK);
1767		if (PAGE_SIZE - pgoff < xfsize)
1768			xfsize = PAGE_SIZE - pgoff;
1769		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1770			xfsize = uap->nbytes - sbytes;
1771		if (xfsize <= 0)
1772			break;
1773		/*
1774		 * Optimize the non-blocking case by looking at the socket space
1775		 * before going to the extra work of constituting the sf_buf.
1776		 */
1777		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1778			if (so->so_state & SS_CANTSENDMORE)
1779				error = EPIPE;
1780			else
1781				error = EAGAIN;
1782			sbunlock(&so->so_snd);
1783			goto done;
1784		}
1785		VM_OBJECT_LOCK(obj);
1786		/*
1787		 * Attempt to look up the page.
1788		 *
1789		 *	Allocate if not found
1790		 *
1791		 *	Wait and loop if busy.
1792		 */
1793		pg = vm_page_lookup(obj, pindex);
1794
1795		if (pg == NULL) {
1796			pg = vm_page_alloc(obj, pindex,
1797			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1798			if (pg == NULL) {
1799				VM_OBJECT_UNLOCK(obj);
1800				VM_WAIT;
1801				VM_OBJECT_LOCK(obj);
1802				goto retry_lookup;
1803			}
1804			vm_page_lock_queues();
1805			vm_page_wakeup(pg);
1806		} else {
1807			vm_page_lock_queues();
1808			if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
1809				goto retry_lookup;
1810			/*
1811		 	 * Wire the page so it does not get ripped out from
1812			 * under us.
1813			 */
1814			vm_page_wire(pg);
1815		}
1816
1817		/*
1818		 * If page is not valid for what we need, initiate I/O
1819		 */
1820
1821		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1822			int bsize, resid;
1823
1824			/*
1825			 * Ensure that our page is still around when the I/O
1826			 * completes.
1827			 */
1828			vm_page_io_start(pg);
1829			vm_page_unlock_queues();
1830			VM_OBJECT_UNLOCK(obj);
1831
1832			/*
1833			 * Get the page from backing store.
1834			 */
1835			bsize = vp->v_mount->mnt_stat.f_iosize;
1836			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1837			/*
1838			 * XXXMAC: Because we don't have fp->f_cred here,
1839			 * we pass in NOCRED.  This is probably wrong, but
1840			 * is consistent with our original implementation.
1841			 */
1842			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
1843			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
1844			    IO_VMIO | ((MAXBSIZE / bsize) << 16),
1845			    td->td_ucred, NOCRED, &resid, td);
1846			VOP_UNLOCK(vp, 0, td);
1847			if (error)
1848				VM_OBJECT_LOCK(obj);
1849			vm_page_lock_queues();
1850			vm_page_flag_clear(pg, PG_ZERO);
1851			vm_page_io_finish(pg);
1852			if (error) {
1853				vm_page_unwire(pg, 0);
1854				/*
1855				 * See if anyone else might know about this page.
1856				 * If not and it is not valid, then free it.
1857				 */
1858				if (pg->wire_count == 0 && pg->valid == 0 &&
1859				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1860				    pg->hold_count == 0) {
1861					vm_page_busy(pg);
1862					vm_page_free(pg);
1863				}
1864				vm_page_unlock_queues();
1865				VM_OBJECT_UNLOCK(obj);
1866				sbunlock(&so->so_snd);
1867				goto done;
1868			}
1869		} else
1870			VM_OBJECT_UNLOCK(obj);
1871		vm_page_unlock_queues();
1872
1873		/*
1874		 * Get a sendfile buf. We usually wait as long as necessary,
1875		 * but this wait can be interrupted.
1876		 */
1877		if ((sf = sf_buf_alloc(pg)) == NULL) {
1878			vm_page_lock_queues();
1879			vm_page_unwire(pg, 0);
1880			if (pg->wire_count == 0 && pg->object == NULL)
1881				vm_page_free(pg);
1882			vm_page_unlock_queues();
1883			sbunlock(&so->so_snd);
1884			error = EINTR;
1885			goto done;
1886		}
1887
1888		/*
1889		 * Get an mbuf header and set it up as having external storage.
1890		 */
1891		MGETHDR(m, M_TRYWAIT, MT_DATA);
1892		if (m == NULL) {
1893			error = ENOBUFS;
1894			sf_buf_free((void *)sf_buf_kva(sf), sf);
1895			sbunlock(&so->so_snd);
1896			goto done;
1897		}
1898		/*
1899		 * Setup external storage for mbuf.
1900		 */
1901		MEXTADD(m, sf_buf_kva(sf), PAGE_SIZE, sf_buf_free, sf, M_RDONLY,
1902		    EXT_SFBUF);
1903		m->m_data = (char *)sf_buf_kva(sf) + pgoff;
1904		m->m_pkthdr.len = m->m_len = xfsize;
1905		/*
1906		 * Add the buffer to the socket buffer chain.
1907		 */
1908		s = splnet();
1909retry_space:
1910		/*
1911		 * Make sure that the socket is still able to take more data.
1912		 * CANTSENDMORE being true usually means that the connection
1913		 * was closed. so_error is true when an error was sensed after
1914		 * a previous send.
1915		 * The state is checked after the page mapping and buffer
1916		 * allocation above since those operations may block and make
1917		 * any socket checks stale. From this point forward, nothing
1918		 * blocks before the pru_send (or more accurately, any blocking
1919		 * results in a loop back to here to re-check).
1920		 */
1921		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1922			if (so->so_state & SS_CANTSENDMORE) {
1923				error = EPIPE;
1924			} else {
1925				error = so->so_error;
1926				so->so_error = 0;
1927			}
1928			m_freem(m);
1929			sbunlock(&so->so_snd);
1930			splx(s);
1931			goto done;
1932		}
1933		/*
1934		 * Wait for socket space to become available. We do this just
1935		 * after checking the connection state above in order to avoid
1936		 * a race condition with sbwait().
1937		 */
1938		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1939			if (so->so_state & SS_NBIO) {
1940				m_freem(m);
1941				sbunlock(&so->so_snd);
1942				splx(s);
1943				error = EAGAIN;
1944				goto done;
1945			}
1946			error = sbwait(&so->so_snd);
1947			/*
1948			 * An error from sbwait usually indicates that we've
1949			 * been interrupted by a signal. If we've sent anything
1950			 * then return bytes sent, otherwise return the error.
1951			 */
1952			if (error) {
1953				m_freem(m);
1954				sbunlock(&so->so_snd);
1955				splx(s);
1956				goto done;
1957			}
1958			goto retry_space;
1959		}
1960		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
1961		splx(s);
1962		if (error) {
1963			sbunlock(&so->so_snd);
1964			goto done;
1965		}
1966	}
1967	sbunlock(&so->so_snd);
1968
1969	/*
1970	 * Send trailers. Wimp out and use writev(2).
1971	 */
1972	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1973			nuap.fd = uap->s;
1974			nuap.iovp = hdtr.trailers;
1975			nuap.iovcnt = hdtr.trl_cnt;
1976			error = writev(td, &nuap);
1977			if (error)
1978				goto done;
1979			if (compat)
1980				sbytes += td->td_retval[0];
1981			else
1982				hdtr_size += td->td_retval[0];
1983	}
1984
1985done:
1986	/*
1987	 * If there was no error we have to clear td->td_retval[0]
1988	 * because it may have been set by writev.
1989	 */
1990	if (error == 0) {
1991		td->td_retval[0] = 0;
1992	}
1993	if (uap->sbytes != NULL) {
1994		if (!compat)
1995			sbytes += hdtr_size;
1996		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1997	}
1998	if (vp)
1999		vrele(vp);
2000	if (so)
2001		fputsock(so);
2002
2003	mtx_unlock(&Giant);
2004
2005	if (error == ERESTART)
2006		error = EINTR;
2007
2008	return (error);
2009}
2010