kern_sendfile.c revision 118448
1/*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 118448 2003-08-04 21:28:57Z dwmalone $");
41
42#include "opt_compat.h"
43#include "opt_ktrace.h"
44#include "opt_mac.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/mac.h>
51#include <sys/mutex.h>
52#include <sys/sysproto.h>
53#include <sys/malloc.h>
54#include <sys/filedesc.h>
55#include <sys/event.h>
56#include <sys/proc.h>
57#include <sys/fcntl.h>
58#include <sys/file.h>
59#include <sys/filio.h>
60#include <sys/mount.h>
61#include <sys/mbuf.h>
62#include <sys/protosw.h>
63#include <sys/socket.h>
64#include <sys/socketvar.h>
65#include <sys/signalvar.h>
66#include <sys/syscallsubr.h>
67#include <sys/uio.h>
68#include <sys/vnode.h>
69#ifdef KTRACE
70#include <sys/ktrace.h>
71#endif
72
73#include <vm/vm.h>
74#include <vm/vm_object.h>
75#include <vm/vm_page.h>
76#include <vm/vm_pageout.h>
77#include <vm/vm_kern.h>
78#include <vm/vm_extern.h>
79
/*
 * sf_buf_init() is handed to SYSINIT below (subsystem SI_SUB_MBUF, order
 * SI_ORDER_ANY), presumably so that it runs once during system startup.
 * Its definition is not part of this section of the file.
 */
static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)

/* Shared back ends used by the send*() and recv*() system calls. */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

/*
 * File-local workers that take a "compat" flag so that a native entry
 * point and its old-style counterpart can share one implementation.
 */
static int accept1(struct thread *td, struct accept_args *uap, int compat);
static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
			int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
			int compat);
92
/*
 * Expanded sf_freelist head.  Really an SLIST_HEAD() in disguise: the
 * list head for sf_buf entries is bundled together with the sf_lock
 * mutex (presumably the lock that protects the list).
 */
static struct {
	SLIST_HEAD(, sf_buf) sf_head;
	struct mtx sf_lock;
} sf_freelist;

/* Counter related to sf_buf allocation; its users are outside this section. */
static u_int sf_buf_alloc_want;
103
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_OLDSOCK is defined whenever COMPAT_43 or COMPAT_SUNOS is
 * configured.  It gates the old-style entry points (oaccept, osend,
 * osendmsg, orecv, orecvfrom, orecvmsg) and the osockaddr / MSG_COMPAT
 * handling in the shared code below.
 */
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#define COMPAT_OLDSOCK
#endif
110
111/*
112 * MPSAFE
113 */
114int
115socket(td, uap)
116	struct thread *td;
117	register struct socket_args /* {
118		int	domain;
119		int	type;
120		int	protocol;
121	} */ *uap;
122{
123	struct filedesc *fdp;
124	struct socket *so;
125	struct file *fp;
126	int fd, error;
127
128	mtx_lock(&Giant);
129	fdp = td->td_proc->p_fd;
130	error = falloc(td, &fp, &fd);
131	if (error)
132		goto done2;
133	fhold(fp);
134	error = socreate(uap->domain, &so, uap->type, uap->protocol,
135	    td->td_ucred, td);
136	FILEDESC_LOCK(fdp);
137	if (error) {
138		if (fdp->fd_ofiles[fd] == fp) {
139			fdp->fd_ofiles[fd] = NULL;
140			FILEDESC_UNLOCK(fdp);
141			fdrop(fp, td);
142		} else
143			FILEDESC_UNLOCK(fdp);
144	} else {
145		fp->f_data = so;	/* already has ref count */
146		fp->f_flag = FREAD|FWRITE;
147		fp->f_ops = &socketops;
148		fp->f_type = DTYPE_SOCKET;
149		FILEDESC_UNLOCK(fdp);
150		td->td_retval[0] = fd;
151	}
152	fdrop(fp, td);
153done2:
154	mtx_unlock(&Giant);
155	return (error);
156}
157
158/*
159 * MPSAFE
160 */
161/* ARGSUSED */
162int
163bind(td, uap)
164	struct thread *td;
165	register struct bind_args /* {
166		int	s;
167		caddr_t	name;
168		int	namelen;
169	} */ *uap;
170{
171	struct sockaddr *sa;
172	int error;
173
174	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
175		return (error);
176
177	return (kern_bind(td, uap->s, sa));
178}
179
180int
181kern_bind(td, fd, sa)
182	struct thread *td;
183	int fd;
184	struct sockaddr *sa;
185{
186	struct socket *so;
187	int error;
188
189	mtx_lock(&Giant);
190	if ((error = fgetsock(td, fd, &so, NULL)) != 0)
191		goto done2;
192#ifdef MAC
193	error = mac_check_socket_bind(td->td_ucred, so, sa);
194	if (error)
195		goto done1;
196#endif
197	error = sobind(so, sa, td);
198#ifdef MAC
199done1:
200#endif
201	fputsock(so);
202done2:
203	mtx_unlock(&Giant);
204	FREE(sa, M_SONAME);
205	return (error);
206}
207
208/*
209 * MPSAFE
210 */
211/* ARGSUSED */
212int
213listen(td, uap)
214	struct thread *td;
215	register struct listen_args /* {
216		int	s;
217		int	backlog;
218	} */ *uap;
219{
220	struct socket *so;
221	int error;
222
223	mtx_lock(&Giant);
224	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
225#ifdef MAC
226		error = mac_check_socket_listen(td->td_ucred, so);
227		if (error)
228			goto done;
229#endif
230		error = solisten(so, uap->backlog, td);
231#ifdef MAC
232done:
233#endif
234		fputsock(so);
235	}
236	mtx_unlock(&Giant);
237	return(error);
238}
239
/*
 * accept1() - shared implementation of accept() and oaccept().
 *
 * Takes the first socket off the listening socket's completed-connection
 * queue, installs it in a newly allocated file descriptor and, when the
 * caller supplied an address buffer (uap->name), copies out the peer
 * address and its length.
 *
 * td     - calling thread; the new descriptor is returned in
 *          td->td_retval[0].
 * uap    - system call arguments (s, name, anamelen).
 * compat - non-zero for the old-style entry point; under COMPAT_OLDSOCK
 *          the returned address then has its family stored through a
 *          struct osockaddr view.
 *
 * Returns 0 on success or an errno value.
 *
 * MPSAFE
 */
static int
accept1(td, uap, compat)
	struct thread *td;
	register struct accept_args /* {
		int	s;
		caddr_t	name;
		int	*anamelen;
	} */ *uap;
	int compat;
{
	struct filedesc *fdp;
	struct file *nfp = NULL;
	struct sockaddr *sa;
	int namelen, error, s;
	struct socket *head, *so;
	int fd;
	u_int fflag;
	pid_t pgid;
	int tmp;

	fdp = td->td_proc->p_fd;
	/*
	 * The caller's buffer length is fetched and validated only when an
	 * address buffer was supplied; this happens before Giant is taken.
	 */
	if (uap->name) {
		error = copyin(uap->anamelen, &namelen, sizeof (namelen));
		if(error)
			goto done3;
		if (namelen < 0) {
			error = EINVAL;
			goto done3;
		}
	}
	mtx_lock(&Giant);
	error = fgetsock(td, uap->s, &head, &fflag);
	if (error)
		goto done2;
	s = splnet();
	/* Only a socket marked SO_ACCEPTCONN can be accepted on. */
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		splx(s);
		error = EINVAL;
		goto done;
	}
	/*
	 * Wait until a completed connection is queued or an error is
	 * posted on the listening socket.  A socket that can receive no
	 * more, or one in non-blocking mode, posts an error instead of
	 * sleeping.
	 */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_state & SS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		if ((head->so_state & SS_NBIO) != 0) {
			head->so_error = EWOULDBLOCK;
			break;
		}
		error = tsleep(&head->so_timeo, PSOCK | PCATCH,
		    "accept", 0);
		if (error) {
			splx(s);
			goto done;
		}
	}
	/* Report a pending socket error once and clear it. */
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		splx(s);
		goto done;
	}

	/*
	 * At this point we know that there is at least one connection
	 * ready to be accepted. Remove it from the queue prior to
	 * allocating the file descriptor for it since falloc() may
	 * block allowing another process to accept the connection
	 * instead.
	 */
	so = TAILQ_FIRST(&head->so_comp);
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;

	error = falloc(td, &nfp, &fd);
	if (error) {
		/*
		 * Probably ran out of file descriptors. Put the
		 * unaccepted connection back onto the queue and
		 * do another wakeup so some other process might
		 * have a chance at it.
		 */
		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
		head->so_qlen++;
		wakeup_one(&head->so_timeo);
		splx(s);
		goto done;
	}
	/* Extra hold on nfp for this function; dropped at "done". */
	fhold(nfp);
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE(&head->so_rcv.sb_sel.si_note, 0);

	/* Detach the new socket from the listening socket. */
	so->so_state &= ~SS_COMP;
	so->so_head = NULL;
	/* Carry over the listening socket's SIGIO owner, if it has one. */
	pgid = fgetown(&head->so_sigio);
	if (pgid != 0)
		fsetown(pgid, &so->so_sigio);

	FILE_LOCK(nfp);
	soref(so);			/* file descriptor reference */
	nfp->f_data = so;	/* nfp has ref count from falloc */
	nfp->f_flag = fflag;
	nfp->f_ops = &socketops;
	nfp->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(nfp);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error) {
		/*
		 * return a namelen of zero for older code which might
		 * ignore the return value from accept.
		 */
		if (uap->name != NULL) {
			namelen = 0;
			(void) copyout(&namelen,
			    uap->anamelen, sizeof(*uap->anamelen));
		}
		goto noconnection;
	}
	/*
	 * No address came back: report a zero length if the caller asked
	 * for one, otherwise finish successfully.
	 */
	if (sa == NULL) {
		namelen = 0;
		if (uap->name)
			goto gotnoname;
		splx(s);
		error = 0;
		goto done;
	}
	if (uap->name) {
		/* check sa_len before it is destroyed */
		if (namelen > sa->sa_len)
			namelen = sa->sa_len;
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family =
			    sa->sa_family;
#endif
		error = copyout(sa, uap->name, (u_int)namelen);
		/*
		 * "gotnoname" jumps into the body of this if statement so
		 * that only the length is copied out.
		 */
		if (!error)
gotnoname:
			error = copyout(&namelen,
			    uap->anamelen, sizeof (*uap->anamelen));
	}
noconnection:
	if (sa)
		FREE(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error) {
		FILEDESC_LOCK(fdp);
		if (fdp->fd_ofiles[fd] == nfp) {
			fdp->fd_ofiles[fd] = NULL;
			FILEDESC_UNLOCK(fdp);
			fdrop(nfp, td);
		} else {
			FILEDESC_UNLOCK(fdp);
		}
	}
	splx(s);

	/*
	 * Release explicitly held references before returning.
	 */
done:
	if (nfp != NULL)
		fdrop(nfp, td);
	fputsock(head);
done2:
	mtx_unlock(&Giant);
done3:
	return (error);
}
425
/*
 * accept() system call: native front end to accept1() (compat == 0).
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 0);
	return (error);
}
437
#ifdef COMPAT_OLDSOCK
/*
 * oaccept() system call: old-style front end to accept1() (compat == 1).
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
451
452/*
453 * MPSAFE
454 */
455/* ARGSUSED */
456int
457connect(td, uap)
458	struct thread *td;
459	register struct connect_args /* {
460		int	s;
461		caddr_t	name;
462		int	namelen;
463	} */ *uap;
464{
465	struct sockaddr *sa;
466	int error;
467
468	error = getsockaddr(&sa, uap->name, uap->namelen);
469	if (error)
470		return error;
471
472	return (kern_connect(td, uap->s, sa));
473}
474
475
476int
477kern_connect(td, fd, sa)
478	struct thread *td;
479	int fd;
480	struct sockaddr *sa;
481{
482	struct socket *so;
483	int error, s;
484
485	mtx_lock(&Giant);
486	if ((error = fgetsock(td, fd, &so, NULL)) != 0)
487		goto done2;
488	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
489		error = EALREADY;
490		goto done1;
491	}
492#ifdef MAC
493	error = mac_check_socket_connect(td->td_ucred, so, sa);
494	if (error)
495		goto bad;
496#endif
497	error = soconnect(so, sa, td);
498	if (error)
499		goto bad;
500	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
501		error = EINPROGRESS;
502		goto done1;
503	}
504	s = splnet();
505	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
506		error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0);
507		if (error)
508			break;
509	}
510	if (error == 0) {
511		error = so->so_error;
512		so->so_error = 0;
513	}
514	splx(s);
515bad:
516	so->so_state &= ~SS_ISCONNECTING;
517	if (error == ERESTART)
518		error = EINTR;
519done1:
520	fputsock(so);
521done2:
522	mtx_unlock(&Giant);
523	FREE(sa, M_SONAME);
524	return (error);
525}
526
/*
 * socketpair() system call.
 *
 * Creates two sockets of the requested domain/type/protocol, allocates a
 * file descriptor for each, connects them to each other with
 * soconnect2() and copies the two descriptor numbers out to uap->rsv.
 *
 * On failure before the copyout, the labels free4..free1 undo the work
 * in reverse order.  If the final copyout() fails, its error is returned
 * but both descriptors are left installed.
 *
 * MPSAFE
 */
int
socketpair(td, uap)
	struct thread *td;
	register struct socketpair_args /* {
		int	domain;
		int	type;
		int	protocol;
		int	*rsv;
	} */ *uap;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, sv[2];

	mtx_lock(&Giant);
	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
	    td->td_ucred, td);
	if (error)
		goto done2;
	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
	    td->td_ucred, td);
	if (error)
		goto free1;
	error = falloc(td, &fp1, &fd);
	if (error)
		goto free2;
	/* Extra hold on fp1 for this function; dropped on every exit path. */
	fhold(fp1);
	sv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd);
	if (error)
		goto free3;
	/* Extra hold on fp2, likewise. */
	fhold(fp2);
	fp2->f_data = so2;	/* so2 already has ref count */
	sv[1] = fd;
	error = soconnect2(so1, so2);
	if (error)
		goto free4;
	if (uap->type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		 error = soconnect2(so2, so1);
		 if (error)
			goto free4;
	}
	/* Both ends are connected: turn the files into socket files. */
	FILE_LOCK(fp1);
	fp1->f_flag = FREAD|FWRITE;
	fp1->f_ops = &socketops;
	fp1->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(fp1);
	FILE_LOCK(fp2);
	fp2->f_flag = FREAD|FWRITE;
	fp2->f_ops = &socketops;
	fp2->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(fp2);
	error = copyout(sv, uap->rsv, 2 * sizeof (int));
	fdrop(fp1, td);
	fdrop(fp2, td);
	goto done2;
free4:
	/*
	 * Remove fp2 from the descriptor table if it is still there, then
	 * drop the hold taken with fhold() above.
	 */
	FILEDESC_LOCK(fdp);
	if (fdp->fd_ofiles[sv[1]] == fp2) {
		fdp->fd_ofiles[sv[1]] = NULL;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp2, td);
	} else
		FILEDESC_UNLOCK(fdp);
	fdrop(fp2, td);
free3:
	/* Same unwinding for fp1. */
	FILEDESC_LOCK(fdp);
	if (fdp->fd_ofiles[sv[0]] == fp1) {
		fdp->fd_ofiles[sv[0]] = NULL;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp1, td);
	} else
		FILEDESC_UNLOCK(fdp);
	fdrop(fp1, td);
free2:
	/* The sockets are closed explicitly on the failure paths. */
	(void)soclose(so2);
free1:
	(void)soclose(so1);
done2:
	mtx_unlock(&Giant);
	return (error);
}
617
618static int
619sendit(td, s, mp, flags)
620	register struct thread *td;
621	int s;
622	register struct msghdr *mp;
623	int flags;
624{
625	struct mbuf *control;
626	struct sockaddr *to;
627	int error;
628
629	if (mp->msg_name != NULL) {
630		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
631		if (error) {
632			to = NULL;
633			goto bad;
634		}
635		mp->msg_name = to;
636	} else
637		to = NULL;
638
639	if (mp->msg_control) {
640		if (mp->msg_controllen < sizeof(struct cmsghdr)
641#ifdef COMPAT_OLDSOCK
642		    && mp->msg_flags != MSG_COMPAT
643#endif
644		) {
645			error = EINVAL;
646			goto bad;
647		}
648		error = sockargs(&control, mp->msg_control,
649		    mp->msg_controllen, MT_CONTROL);
650		if (error)
651			goto bad;
652#ifdef COMPAT_OLDSOCK
653		if (mp->msg_flags == MSG_COMPAT) {
654			register struct cmsghdr *cm;
655
656			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
657			if (control == 0) {
658				error = ENOBUFS;
659				goto bad;
660			} else {
661				cm = mtod(control, struct cmsghdr *);
662				cm->cmsg_len = control->m_len;
663				cm->cmsg_level = SOL_SOCKET;
664				cm->cmsg_type = SCM_RIGHTS;
665			}
666		}
667#endif
668	} else {
669		control = NULL;
670	}
671
672	error = kern_sendit(td, s, mp, flags, control);
673
674bad:
675	if (to)
676		FREE(to, M_SONAME);
677	return (error);
678}
679
680int
681kern_sendit(td, s, mp, flags, control)
682	struct thread *td;
683	int s;
684	struct msghdr *mp;
685	int flags;
686	struct mbuf *control;
687{
688	struct uio auio;
689	struct iovec *iov;
690	struct socket *so;
691	int i;
692	int len, error;
693#ifdef KTRACE
694	struct iovec *ktriov = NULL;
695	struct uio ktruio;
696	int iovlen;
697#endif
698
699	mtx_lock(&Giant);
700	if ((error = fgetsock(td, s, &so, NULL)) != 0)
701		goto bad2;
702
703#ifdef MAC
704	error = mac_check_socket_send(td->td_ucred, so);
705	if (error)
706		goto bad;
707#endif
708
709	auio.uio_iov = mp->msg_iov;
710	auio.uio_iovcnt = mp->msg_iovlen;
711	auio.uio_segflg = UIO_USERSPACE;
712	auio.uio_rw = UIO_WRITE;
713	auio.uio_td = td;
714	auio.uio_offset = 0;			/* XXX */
715	auio.uio_resid = 0;
716	iov = mp->msg_iov;
717	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
718		if ((auio.uio_resid += iov->iov_len) < 0) {
719			error = EINVAL;
720			goto bad;
721		}
722	}
723#ifdef KTRACE
724	if (KTRPOINT(td, KTR_GENIO)) {
725		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
726		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
727		bcopy(auio.uio_iov, ktriov, iovlen);
728		ktruio = auio;
729	}
730#endif
731	len = auio.uio_resid;
732	error = so->so_proto->pr_usrreqs->pru_sosend(so, mp->msg_name, &auio,
733	    0, control, flags, td);
734	if (error) {
735		if (auio.uio_resid != len && (error == ERESTART ||
736		    error == EINTR || error == EWOULDBLOCK))
737			error = 0;
738		/* Generation of SIGPIPE can be controlled per socket */
739		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) {
740			PROC_LOCK(td->td_proc);
741			psignal(td->td_proc, SIGPIPE);
742			PROC_UNLOCK(td->td_proc);
743		}
744	}
745	if (error == 0)
746		td->td_retval[0] = len - auio.uio_resid;
747#ifdef KTRACE
748	if (ktriov != NULL) {
749		if (error == 0) {
750			ktruio.uio_iov = ktriov;
751			ktruio.uio_resid = td->td_retval[0];
752			ktrgenio(s, UIO_WRITE, &ktruio, error);
753		}
754		FREE(ktriov, M_TEMP);
755	}
756#endif
757bad:
758	fputsock(so);
759bad2:
760	mtx_unlock(&Giant);
761	return (error);
762}
763
764/*
765 * MPSAFE
766 */
767int
768sendto(td, uap)
769	struct thread *td;
770	register struct sendto_args /* {
771		int	s;
772		caddr_t	buf;
773		size_t	len;
774		int	flags;
775		caddr_t	to;
776		int	tolen;
777	} */ *uap;
778{
779	struct msghdr msg;
780	struct iovec aiov;
781	int error;
782
783	msg.msg_name = uap->to;
784	msg.msg_namelen = uap->tolen;
785	msg.msg_iov = &aiov;
786	msg.msg_iovlen = 1;
787	msg.msg_control = 0;
788#ifdef COMPAT_OLDSOCK
789	msg.msg_flags = 0;
790#endif
791	aiov.iov_base = uap->buf;
792	aiov.iov_len = uap->len;
793	error = sendit(td, uap->s, &msg, uap->flags);
794	return (error);
795}
796
797#ifdef COMPAT_OLDSOCK
798/*
799 * MPSAFE
800 */
801int
802osend(td, uap)
803	struct thread *td;
804	register struct osend_args /* {
805		int	s;
806		caddr_t	buf;
807		int	len;
808		int	flags;
809	} */ *uap;
810{
811	struct msghdr msg;
812	struct iovec aiov;
813	int error;
814
815	msg.msg_name = 0;
816	msg.msg_namelen = 0;
817	msg.msg_iov = &aiov;
818	msg.msg_iovlen = 1;
819	aiov.iov_base = uap->buf;
820	aiov.iov_len = uap->len;
821	msg.msg_control = 0;
822	msg.msg_flags = 0;
823	error = sendit(td, uap->s, &msg, uap->flags);
824	return (error);
825}
826
827/*
828 * MPSAFE
829 */
830int
831osendmsg(td, uap)
832	struct thread *td;
833	register struct osendmsg_args /* {
834		int	s;
835		caddr_t	msg;
836		int	flags;
837	} */ *uap;
838{
839	struct msghdr msg;
840	struct iovec aiov[UIO_SMALLIOV], *iov;
841	int error;
842
843	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
844	if (error)
845		goto done2;
846	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
847		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
848			error = EMSGSIZE;
849			goto done2;
850		}
851		MALLOC(iov, struct iovec *,
852		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
853		      M_WAITOK);
854	} else {
855		iov = aiov;
856	}
857	error = copyin(msg.msg_iov, iov,
858	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
859	if (error)
860		goto done;
861	msg.msg_flags = MSG_COMPAT;
862	msg.msg_iov = iov;
863	error = sendit(td, uap->s, &msg, uap->flags);
864done:
865	if (iov != aiov)
866		FREE(iov, M_IOV);
867done2:
868	return (error);
869}
870#endif
871
872/*
873 * MPSAFE
874 */
875int
876sendmsg(td, uap)
877	struct thread *td;
878	register struct sendmsg_args /* {
879		int	s;
880		caddr_t	msg;
881		int	flags;
882	} */ *uap;
883{
884	struct msghdr msg;
885	struct iovec aiov[UIO_SMALLIOV], *iov;
886	int error;
887
888	error = copyin(uap->msg, &msg, sizeof (msg));
889	if (error)
890		goto done2;
891	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
892		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
893			error = EMSGSIZE;
894			goto done2;
895		}
896		MALLOC(iov, struct iovec *,
897		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
898		       M_WAITOK);
899	} else {
900		iov = aiov;
901	}
902	if (msg.msg_iovlen &&
903	    (error = copyin(msg.msg_iov, iov,
904	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
905		goto done;
906	msg.msg_iov = iov;
907#ifdef COMPAT_OLDSOCK
908	msg.msg_flags = 0;
909#endif
910	error = sendit(td, uap->s, &msg, uap->flags);
911done:
912	if (iov != aiov)
913		FREE(iov, M_IOV);
914done2:
915	return (error);
916}
917
918static int
919recvit(td, s, mp, namelenp)
920	register struct thread *td;
921	int s;
922	register struct msghdr *mp;
923	void *namelenp;
924{
925	struct uio auio;
926	register struct iovec *iov;
927	register int i;
928	int len, error;
929	struct mbuf *m, *control = 0;
930	caddr_t ctlbuf;
931	struct socket *so;
932	struct sockaddr *fromsa = 0;
933#ifdef KTRACE
934	struct iovec *ktriov = NULL;
935	struct uio ktruio;
936	int iovlen;
937#endif
938
939	mtx_lock(&Giant);
940	if ((error = fgetsock(td, s, &so, NULL)) != 0)
941		return (error);
942
943#ifdef MAC
944	error = mac_check_socket_receive(td->td_ucred, so);
945	if (error) {
946		fputsock(so);
947		return (error);
948	}
949#endif
950
951	auio.uio_iov = mp->msg_iov;
952	auio.uio_iovcnt = mp->msg_iovlen;
953	auio.uio_segflg = UIO_USERSPACE;
954	auio.uio_rw = UIO_READ;
955	auio.uio_td = td;
956	auio.uio_offset = 0;			/* XXX */
957	auio.uio_resid = 0;
958	iov = mp->msg_iov;
959	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
960		if ((auio.uio_resid += iov->iov_len) < 0) {
961			fputsock(so);
962			return (EINVAL);
963		}
964	}
965#ifdef KTRACE
966	if (KTRPOINT(td, KTR_GENIO)) {
967		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
968		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
969		bcopy(auio.uio_iov, ktriov, iovlen);
970		ktruio = auio;
971	}
972#endif
973	len = auio.uio_resid;
974	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
975	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
976	    &mp->msg_flags);
977	if (error) {
978		if (auio.uio_resid != len && (error == ERESTART ||
979		    error == EINTR || error == EWOULDBLOCK))
980			error = 0;
981	}
982#ifdef KTRACE
983	if (ktriov != NULL) {
984		if (error == 0) {
985			ktruio.uio_iov = ktriov;
986			ktruio.uio_resid = len - auio.uio_resid;
987			ktrgenio(s, UIO_READ, &ktruio, error);
988		}
989		FREE(ktriov, M_TEMP);
990	}
991#endif
992	if (error)
993		goto out;
994	td->td_retval[0] = len - auio.uio_resid;
995	if (mp->msg_name) {
996		len = mp->msg_namelen;
997		if (len <= 0 || fromsa == 0)
998			len = 0;
999		else {
1000			/* save sa_len before it is destroyed by MSG_COMPAT */
1001			len = MIN(len, fromsa->sa_len);
1002#ifdef COMPAT_OLDSOCK
1003			if (mp->msg_flags & MSG_COMPAT)
1004				((struct osockaddr *)fromsa)->sa_family =
1005				    fromsa->sa_family;
1006#endif
1007			error = copyout(fromsa, mp->msg_name, (unsigned)len);
1008			if (error)
1009				goto out;
1010		}
1011		mp->msg_namelen = len;
1012		if (namelenp &&
1013		    (error = copyout(&len, namelenp, sizeof (int)))) {
1014#ifdef COMPAT_OLDSOCK
1015			if (mp->msg_flags & MSG_COMPAT)
1016				error = 0;	/* old recvfrom didn't check */
1017			else
1018#endif
1019			goto out;
1020		}
1021	}
1022	if (mp->msg_control) {
1023#ifdef COMPAT_OLDSOCK
1024		/*
1025		 * We assume that old recvmsg calls won't receive access
1026		 * rights and other control info, esp. as control info
1027		 * is always optional and those options didn't exist in 4.3.
1028		 * If we receive rights, trim the cmsghdr; anything else
1029		 * is tossed.
1030		 */
1031		if (control && mp->msg_flags & MSG_COMPAT) {
1032			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1033			    SOL_SOCKET ||
1034			    mtod(control, struct cmsghdr *)->cmsg_type !=
1035			    SCM_RIGHTS) {
1036				mp->msg_controllen = 0;
1037				goto out;
1038			}
1039			control->m_len -= sizeof (struct cmsghdr);
1040			control->m_data += sizeof (struct cmsghdr);
1041		}
1042#endif
1043		len = mp->msg_controllen;
1044		m = control;
1045		mp->msg_controllen = 0;
1046		ctlbuf = mp->msg_control;
1047
1048		while (m && len > 0) {
1049			unsigned int tocopy;
1050
1051			if (len >= m->m_len)
1052				tocopy = m->m_len;
1053			else {
1054				mp->msg_flags |= MSG_CTRUNC;
1055				tocopy = len;
1056			}
1057
1058			if ((error = copyout(mtod(m, caddr_t),
1059					ctlbuf, tocopy)) != 0)
1060				goto out;
1061
1062			ctlbuf += tocopy;
1063			len -= tocopy;
1064			m = m->m_next;
1065		}
1066		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1067	}
1068out:
1069	fputsock(so);
1070	mtx_unlock(&Giant);
1071	if (fromsa)
1072		FREE(fromsa, M_SONAME);
1073	if (control)
1074		m_freem(control);
1075	return (error);
1076}
1077
1078/*
1079 * MPSAFE
1080 */
1081int
1082recvfrom(td, uap)
1083	struct thread *td;
1084	register struct recvfrom_args /* {
1085		int	s;
1086		caddr_t	buf;
1087		size_t	len;
1088		int	flags;
1089		caddr_t	from;
1090		int	*fromlenaddr;
1091	} */ *uap;
1092{
1093	struct msghdr msg;
1094	struct iovec aiov;
1095	int error;
1096
1097	if (uap->fromlenaddr) {
1098		error = copyin(uap->fromlenaddr,
1099		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1100		if (error)
1101			goto done2;
1102	} else {
1103		msg.msg_namelen = 0;
1104	}
1105	msg.msg_name = uap->from;
1106	msg.msg_iov = &aiov;
1107	msg.msg_iovlen = 1;
1108	aiov.iov_base = uap->buf;
1109	aiov.iov_len = uap->len;
1110	msg.msg_control = 0;
1111	msg.msg_flags = uap->flags;
1112	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1113done2:
1114	return(error);
1115}
1116
#ifdef COMPAT_OLDSOCK
/*
 * orecvfrom() system call (old-style recvfrom): set MSG_COMPAT in the
 * caller's flags and reuse recvfrom().
 *
 * MPSAFE
 */
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{
	int error;

	uap->flags |= MSG_COMPAT;
	error = recvfrom(td, uap);
	return (error);
}
#endif
1131
1132
1133#ifdef COMPAT_OLDSOCK
1134/*
1135 * MPSAFE
1136 */
1137int
1138orecv(td, uap)
1139	struct thread *td;
1140	register struct orecv_args /* {
1141		int	s;
1142		caddr_t	buf;
1143		int	len;
1144		int	flags;
1145	} */ *uap;
1146{
1147	struct msghdr msg;
1148	struct iovec aiov;
1149	int error;
1150
1151	msg.msg_name = 0;
1152	msg.msg_namelen = 0;
1153	msg.msg_iov = &aiov;
1154	msg.msg_iovlen = 1;
1155	aiov.iov_base = uap->buf;
1156	aiov.iov_len = uap->len;
1157	msg.msg_control = 0;
1158	msg.msg_flags = uap->flags;
1159	error = recvit(td, uap->s, &msg, NULL);
1160	return (error);
1161}
1162
1163/*
1164 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1165 * overlays the new one, missing only the flags, and with the (old) access
1166 * rights where the control fields are now.
1167 *
1168 * MPSAFE
1169 */
1170int
1171orecvmsg(td, uap)
1172	struct thread *td;
1173	register struct orecvmsg_args /* {
1174		int	s;
1175		struct	omsghdr *msg;
1176		int	flags;
1177	} */ *uap;
1178{
1179	struct msghdr msg;
1180	struct iovec aiov[UIO_SMALLIOV], *iov;
1181	int error;
1182
1183	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1184	if (error)
1185		return (error);
1186
1187	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1188		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1189			error = EMSGSIZE;
1190			goto done2;
1191		}
1192		MALLOC(iov, struct iovec *,
1193		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1194		      M_WAITOK);
1195	} else {
1196		iov = aiov;
1197	}
1198	msg.msg_flags = uap->flags | MSG_COMPAT;
1199	error = copyin(msg.msg_iov, iov,
1200	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1201	if (error)
1202		goto done;
1203	msg.msg_iov = iov;
1204	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1205
1206	if (msg.msg_controllen && error == 0)
1207		error = copyout(&msg.msg_controllen,
1208		    &uap->msg->msg_accrightslen, sizeof (int));
1209done:
1210	if (iov != aiov)
1211		FREE(iov, M_IOV);
1212done2:
1213	return (error);
1214}
1215#endif
1216
1217/*
1218 * MPSAFE
1219 */
1220int
1221recvmsg(td, uap)
1222	struct thread *td;
1223	register struct recvmsg_args /* {
1224		int	s;
1225		struct	msghdr *msg;
1226		int	flags;
1227	} */ *uap;
1228{
1229	struct msghdr msg;
1230	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1231	register int error;
1232
1233	error = copyin(uap->msg, &msg, sizeof (msg));
1234	if (error)
1235		goto done2;
1236	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1237		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1238			error = EMSGSIZE;
1239			goto done2;
1240		}
1241		MALLOC(iov, struct iovec *,
1242		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1243		       M_WAITOK);
1244	} else {
1245		iov = aiov;
1246	}
1247#ifdef COMPAT_OLDSOCK
1248	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1249#else
1250	msg.msg_flags = uap->flags;
1251#endif
1252	uiov = msg.msg_iov;
1253	msg.msg_iov = iov;
1254	error = copyin(uiov, iov,
1255	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1256	if (error)
1257		goto done;
1258	error = recvit(td, uap->s, &msg, NULL);
1259	if (!error) {
1260		msg.msg_iov = uiov;
1261		error = copyout(&msg, uap->msg, sizeof(msg));
1262	}
1263done:
1264	if (iov != aiov)
1265		FREE(iov, M_IOV);
1266done2:
1267	return (error);
1268}
1269
1270/*
1271 * MPSAFE
1272 */
1273/* ARGSUSED */
1274int
1275shutdown(td, uap)
1276	struct thread *td;
1277	register struct shutdown_args /* {
1278		int	s;
1279		int	how;
1280	} */ *uap;
1281{
1282	struct socket *so;
1283	int error;
1284
1285	mtx_lock(&Giant);
1286	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1287		error = soshutdown(so, uap->how);
1288		fputsock(so);
1289	}
1290	mtx_unlock(&Giant);
1291	return(error);
1292}
1293
1294/*
1295 * MPSAFE
1296 */
1297/* ARGSUSED */
1298int
1299setsockopt(td, uap)
1300	struct thread *td;
1301	register struct setsockopt_args /* {
1302		int	s;
1303		int	level;
1304		int	name;
1305		caddr_t	val;
1306		int	valsize;
1307	} */ *uap;
1308{
1309	struct socket *so;
1310	struct sockopt sopt;
1311	int error;
1312
1313	if (uap->val == 0 && uap->valsize != 0)
1314		return (EFAULT);
1315	if (uap->valsize < 0)
1316		return (EINVAL);
1317
1318	mtx_lock(&Giant);
1319	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1320		sopt.sopt_dir = SOPT_SET;
1321		sopt.sopt_level = uap->level;
1322		sopt.sopt_name = uap->name;
1323		sopt.sopt_val = uap->val;
1324		sopt.sopt_valsize = uap->valsize;
1325		sopt.sopt_td = td;
1326		error = sosetopt(so, &sopt);
1327		fputsock(so);
1328	}
1329	mtx_unlock(&Giant);
1330	return(error);
1331}
1332
1333/*
1334 * MPSAFE
1335 */
1336/* ARGSUSED */
1337int
1338getsockopt(td, uap)
1339	struct thread *td;
1340	register struct getsockopt_args /* {
1341		int	s;
1342		int	level;
1343		int	name;
1344		caddr_t	val;
1345		int	*avalsize;
1346	} */ *uap;
1347{
1348	int	valsize, error;
1349	struct  socket *so;
1350	struct	sockopt sopt;
1351
1352	mtx_lock(&Giant);
1353	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1354		goto done2;
1355	if (uap->val) {
1356		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1357		if (error)
1358			goto done1;
1359		if (valsize < 0) {
1360			error = EINVAL;
1361			goto done1;
1362		}
1363	} else {
1364		valsize = 0;
1365	}
1366
1367	sopt.sopt_dir = SOPT_GET;
1368	sopt.sopt_level = uap->level;
1369	sopt.sopt_name = uap->name;
1370	sopt.sopt_val = uap->val;
1371	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1372	sopt.sopt_td = td;
1373
1374	error = sogetopt(so, &sopt);
1375	if (error == 0) {
1376		valsize = sopt.sopt_valsize;
1377		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1378	}
1379done1:
1380	fputsock(so);
1381done2:
1382	mtx_unlock(&Giant);
1383	return (error);
1384}
1385
1386/*
1387 * getsockname1() - Get socket name.
1388 *
1389 * MPSAFE
1390 */
1391/* ARGSUSED */
1392static int
1393getsockname1(td, uap, compat)
1394	struct thread *td;
1395	register struct getsockname_args /* {
1396		int	fdes;
1397		caddr_t	asa;
1398		int	*alen;
1399	} */ *uap;
1400	int compat;
1401{
1402	struct socket *so;
1403	struct sockaddr *sa;
1404	int len, error;
1405
1406	mtx_lock(&Giant);
1407	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1408		goto done2;
1409	error = copyin(uap->alen, &len, sizeof (len));
1410	if (error)
1411		goto done1;
1412	if (len < 0) {
1413		error = EINVAL;
1414		goto done1;
1415	}
1416	sa = 0;
1417	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1418	if (error)
1419		goto bad;
1420	if (sa == 0) {
1421		len = 0;
1422		goto gotnothing;
1423	}
1424
1425	len = MIN(len, sa->sa_len);
1426#ifdef COMPAT_OLDSOCK
1427	if (compat)
1428		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1429#endif
1430	error = copyout(sa, uap->asa, (u_int)len);
1431	if (error == 0)
1432gotnothing:
1433		error = copyout(&len, uap->alen, sizeof (len));
1434bad:
1435	if (sa)
1436		FREE(sa, M_SONAME);
1437done1:
1438	fputsock(so);
1439done2:
1440	mtx_unlock(&Giant);
1441	return (error);
1442}
1443
/*
 * getsockname system call entry point: getsockname1() with compat
 * handling turned off.
 *
 * MPSAFE
 */
int
getsockname(struct thread *td, struct getsockname_args *uap)
{
	int rv;

	rv = getsockname1(td, uap, 0);
	return (rv);
}
1455
1456#ifdef COMPAT_OLDSOCK
/*
 * COMPAT_OLDSOCK variant of getsockname: getsockname1() with compat
 * handling turned on.
 *
 * MPSAFE
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int rv;

	rv = getsockname1(td, uap, 1);
	return (rv);
}
1468#endif /* COMPAT_OLDSOCK */
1469
1470/*
1471 * getpeername1() - Get name of peer for connected socket.
1472 *
1473 * MPSAFE
1474 */
1475/* ARGSUSED */
1476static int
1477getpeername1(td, uap, compat)
1478	struct thread *td;
1479	register struct getpeername_args /* {
1480		int	fdes;
1481		caddr_t	asa;
1482		int	*alen;
1483	} */ *uap;
1484	int compat;
1485{
1486	struct socket *so;
1487	struct sockaddr *sa;
1488	int len, error;
1489
1490	mtx_lock(&Giant);
1491	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1492		goto done2;
1493	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1494		error = ENOTCONN;
1495		goto done1;
1496	}
1497	error = copyin(uap->alen, &len, sizeof (len));
1498	if (error)
1499		goto done1;
1500	if (len < 0) {
1501		error = EINVAL;
1502		goto done1;
1503	}
1504	sa = 0;
1505	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1506	if (error)
1507		goto bad;
1508	if (sa == 0) {
1509		len = 0;
1510		goto gotnothing;
1511	}
1512	len = MIN(len, sa->sa_len);
1513#ifdef COMPAT_OLDSOCK
1514	if (compat)
1515		((struct osockaddr *)sa)->sa_family =
1516		    sa->sa_family;
1517#endif
1518	error = copyout(sa, uap->asa, (u_int)len);
1519	if (error)
1520		goto bad;
1521gotnothing:
1522	error = copyout(&len, uap->alen, sizeof (len));
1523bad:
1524	if (sa)
1525		FREE(sa, M_SONAME);
1526done1:
1527	fputsock(so);
1528done2:
1529	mtx_unlock(&Giant);
1530	return (error);
1531}
1532
/*
 * getpeername system call entry point: getpeername1() with compat
 * handling turned off.
 *
 * MPSAFE
 */
int
getpeername(struct thread *td, struct getpeername_args *uap)
{
	int rv;

	rv = getpeername1(td, uap, 0);
	return (rv);
}
1544
1545#ifdef COMPAT_OLDSOCK
/*
 * COMPAT_OLDSOCK variant of getpeername: getpeername1() with compat
 * handling turned on.
 *
 * MPSAFE
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *gap;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	gap = (struct getpeername_args *)uap;
	return (getpeername1(td, gap, 1));
}
1558#endif /* COMPAT_OLDSOCK */
1559
1560int
1561sockargs(mp, buf, buflen, type)
1562	struct mbuf **mp;
1563	caddr_t buf;
1564	int buflen, type;
1565{
1566	register struct sockaddr *sa;
1567	register struct mbuf *m;
1568	int error;
1569
1570	if ((u_int)buflen > MLEN) {
1571#ifdef COMPAT_OLDSOCK
1572		if (type == MT_SONAME && (u_int)buflen <= 112)
1573			buflen = MLEN;		/* unix domain compat. hack */
1574		else
1575#endif
1576		return (EINVAL);
1577	}
1578	m = m_get(M_TRYWAIT, type);
1579	if (m == NULL)
1580		return (ENOBUFS);
1581	m->m_len = buflen;
1582	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1583	if (error)
1584		(void) m_free(m);
1585	else {
1586		*mp = m;
1587		if (type == MT_SONAME) {
1588			sa = mtod(m, struct sockaddr *);
1589
1590#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1591			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1592				sa->sa_family = sa->sa_len;
1593#endif
1594			sa->sa_len = buflen;
1595		}
1596	}
1597	return (error);
1598}
1599
1600int
1601getsockaddr(namp, uaddr, len)
1602	struct sockaddr **namp;
1603	caddr_t uaddr;
1604	size_t len;
1605{
1606	struct sockaddr *sa;
1607	int error;
1608
1609	if (len > SOCK_MAXADDRLEN)
1610		return ENAMETOOLONG;
1611	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1612	error = copyin(uaddr, sa, len);
1613	if (error) {
1614		FREE(sa, M_SONAME);
1615	} else {
1616#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1617		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1618			sa->sa_family = sa->sa_len;
1619#endif
1620		sa->sa_len = len;
1621		*namp = sa;
1622	}
1623	return error;
1624}
1625
1626/*
1627 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1628 */
1629static void
1630sf_buf_init(void *arg)
1631{
1632	struct sf_buf *sf_bufs;
1633	vm_offset_t sf_base;
1634	int i;
1635
1636	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
1637	mtx_lock(&sf_freelist.sf_lock);
1638	SLIST_INIT(&sf_freelist.sf_head);
1639	sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
1640	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
1641	    M_NOWAIT | M_ZERO);
1642	for (i = 0; i < nsfbufs; i++) {
1643		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1644		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
1645	}
1646	sf_buf_alloc_want = 0;
1647	mtx_unlock(&sf_freelist.sf_lock);
1648}
1649
1650/*
1651 * Get an sf_buf from the freelist. Will block if none are available.
1652 */
1653struct sf_buf *
1654sf_buf_alloc(struct vm_page *m)
1655{
1656	struct sf_buf *sf;
1657	int error;
1658
1659	mtx_lock(&sf_freelist.sf_lock);
1660	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
1661		sf_buf_alloc_want++;
1662		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
1663		    "sfbufa", 0);
1664		sf_buf_alloc_want--;
1665
1666		/*
1667		 * If we got a signal, don't risk going back to sleep.
1668		 */
1669		if (error)
1670			break;
1671	}
1672	if (sf != NULL) {
1673		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
1674		sf->m = m;
1675		pmap_qenter(sf->kva, &sf->m, 1);
1676	}
1677	mtx_unlock(&sf_freelist.sf_lock);
1678	return (sf);
1679}
1680
1681/*
1682 * Detatch mapped page and release resources back to the system.
1683 */
1684void
1685sf_buf_free(void *addr, void *args)
1686{
1687	struct sf_buf *sf;
1688	struct vm_page *m;
1689
1690	sf = args;
1691	pmap_qremove((vm_offset_t)addr, 1);
1692	m = sf->m;
1693	vm_page_lock_queues();
1694	vm_page_unwire(m, 0);
1695	/*
1696	 * Check for the object going away on us. This can
1697	 * happen since we don't hold a reference to it.
1698	 * If so, we're responsible for freeing the page.
1699	 */
1700	if (m->wire_count == 0 && m->object == NULL)
1701		vm_page_free(m);
1702	vm_page_unlock_queues();
1703	sf->m = NULL;
1704	mtx_lock(&sf_freelist.sf_lock);
1705	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
1706	if (sf_buf_alloc_want > 0)
1707		wakeup_one(&sf_freelist);
1708	mtx_unlock(&sf_freelist.sf_lock);
1709}
1710
/*
 * sendfile(2)
 *
 * MPSAFE
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file referred to by 'fd', beginning at 'offset', over the
 * socket referred to by 's'.  At most 'nbytes' bytes of the file are
 * sent; nbytes == 0 means send until EOF.  An optional header and/or
 * trailer may be written to the socket around the file data.  If
 * 'sbytes' is given, the total number of bytes sent is stored there.
 *
 * All of the work is done by do_sendfile() with compat == 0.
 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	int rv;

	rv = do_sendfile(td, uap, 0);
	return (rv);
}
1731
1732#ifdef COMPAT_FREEBSD4
1733int
1734freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1735{
1736	struct sendfile_args args;
1737
1738	args.fd = uap->fd;
1739	args.s = uap->s;
1740	args.offset = uap->offset;
1741	args.nbytes = uap->nbytes;
1742	args.hdtr = uap->hdtr;
1743	args.sbytes = uap->sbytes;
1744	args.flags = uap->flags;
1745
1746	return (do_sendfile(td, &args, 1));
1747}
1748#endif /* COMPAT_FREEBSD4 */
1749
1750static int
1751do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1752{
1753	struct vnode *vp;
1754	struct vm_object *obj;
1755	struct socket *so = NULL;
1756	struct mbuf *m;
1757	struct sf_buf *sf;
1758	struct vm_page *pg;
1759	struct writev_args nuap;
1760	struct sf_hdtr hdtr;
1761	off_t off, xfsize, hdtr_size, sbytes = 0;
1762	int error, s;
1763
1764	mtx_lock(&Giant);
1765
1766	hdtr_size = 0;
1767
1768	/*
1769	 * The descriptor must be a regular file and have a backing VM object.
1770	 */
1771	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1772		goto done;
1773	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1774	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1775		error = EINVAL;
1776		VOP_UNLOCK(vp, 0, td);
1777		goto done;
1778	}
1779	VOP_UNLOCK(vp, 0, td);
1780	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1781		goto done;
1782	if (so->so_type != SOCK_STREAM) {
1783		error = EINVAL;
1784		goto done;
1785	}
1786	if ((so->so_state & SS_ISCONNECTED) == 0) {
1787		error = ENOTCONN;
1788		goto done;
1789	}
1790	if (uap->offset < 0) {
1791		error = EINVAL;
1792		goto done;
1793	}
1794
1795#ifdef MAC
1796	error = mac_check_socket_send(td->td_ucred, so);
1797	if (error)
1798		goto done;
1799#endif
1800
1801	/*
1802	 * If specified, get the pointer to the sf_hdtr struct for
1803	 * any headers/trailers.
1804	 */
1805	if (uap->hdtr != NULL) {
1806		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1807		if (error)
1808			goto done;
1809		/*
1810		 * Send any headers. Wimp out and use writev(2).
1811		 */
1812		if (hdtr.headers != NULL) {
1813			nuap.fd = uap->s;
1814			nuap.iovp = hdtr.headers;
1815			nuap.iovcnt = hdtr.hdr_cnt;
1816			error = writev(td, &nuap);
1817			if (error)
1818				goto done;
1819			if (compat)
1820				sbytes += td->td_retval[0];
1821			else
1822				hdtr_size += td->td_retval[0];
1823		}
1824	}
1825
1826	/*
1827	 * Protect against multiple writers to the socket.
1828	 */
1829	(void) sblock(&so->so_snd, M_WAITOK);
1830
1831	/*
1832	 * Loop through the pages in the file, starting with the requested
1833	 * offset. Get a file page (do I/O if necessary), map the file page
1834	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1835	 * it on the socket.
1836	 */
1837	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1838		vm_pindex_t pindex;
1839		vm_offset_t pgoff;
1840
1841		pindex = OFF_TO_IDX(off);
1842		VM_OBJECT_LOCK(obj);
1843retry_lookup:
1844		/*
1845		 * Calculate the amount to transfer. Not to exceed a page,
1846		 * the EOF, or the passed in nbytes.
1847		 */
1848		xfsize = obj->un_pager.vnp.vnp_size - off;
1849		VM_OBJECT_UNLOCK(obj);
1850		if (xfsize > PAGE_SIZE)
1851			xfsize = PAGE_SIZE;
1852		pgoff = (vm_offset_t)(off & PAGE_MASK);
1853		if (PAGE_SIZE - pgoff < xfsize)
1854			xfsize = PAGE_SIZE - pgoff;
1855		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1856			xfsize = uap->nbytes - sbytes;
1857		if (xfsize <= 0)
1858			break;
1859		/*
1860		 * Optimize the non-blocking case by looking at the socket space
1861		 * before going to the extra work of constituting the sf_buf.
1862		 */
1863		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1864			if (so->so_state & SS_CANTSENDMORE)
1865				error = EPIPE;
1866			else
1867				error = EAGAIN;
1868			sbunlock(&so->so_snd);
1869			goto done;
1870		}
1871		VM_OBJECT_LOCK(obj);
1872		/*
1873		 * Attempt to look up the page.
1874		 *
1875		 *	Allocate if not found
1876		 *
1877		 *	Wait and loop if busy.
1878		 */
1879		pg = vm_page_lookup(obj, pindex);
1880
1881		if (pg == NULL) {
1882			pg = vm_page_alloc(obj, pindex,
1883			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1884			if (pg == NULL) {
1885				VM_OBJECT_UNLOCK(obj);
1886				VM_WAIT;
1887				VM_OBJECT_LOCK(obj);
1888				goto retry_lookup;
1889			}
1890			vm_page_lock_queues();
1891			vm_page_wakeup(pg);
1892		} else {
1893			vm_page_lock_queues();
1894			if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
1895				goto retry_lookup;
1896			/*
1897		 	 * Wire the page so it does not get ripped out from
1898			 * under us.
1899			 */
1900			vm_page_wire(pg);
1901		}
1902
1903		/*
1904		 * If page is not valid for what we need, initiate I/O
1905		 */
1906
1907		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1908			int bsize, resid;
1909
1910			/*
1911			 * Ensure that our page is still around when the I/O
1912			 * completes.
1913			 */
1914			vm_page_io_start(pg);
1915			vm_page_unlock_queues();
1916			VM_OBJECT_UNLOCK(obj);
1917
1918			/*
1919			 * Get the page from backing store.
1920			 */
1921			bsize = vp->v_mount->mnt_stat.f_iosize;
1922			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1923			/*
1924			 * XXXMAC: Because we don't have fp->f_cred here,
1925			 * we pass in NOCRED.  This is probably wrong, but
1926			 * is consistent with our original implementation.
1927			 */
1928			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
1929			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
1930			    IO_VMIO | ((MAXBSIZE / bsize) << 16),
1931			    td->td_ucred, NOCRED, &resid, td);
1932			VOP_UNLOCK(vp, 0, td);
1933			if (error)
1934				VM_OBJECT_LOCK(obj);
1935			vm_page_lock_queues();
1936			vm_page_flag_clear(pg, PG_ZERO);
1937			vm_page_io_finish(pg);
1938			if (error) {
1939				vm_page_unwire(pg, 0);
1940				/*
1941				 * See if anyone else might know about this page.
1942				 * If not and it is not valid, then free it.
1943				 */
1944				if (pg->wire_count == 0 && pg->valid == 0 &&
1945				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1946				    pg->hold_count == 0) {
1947					vm_page_busy(pg);
1948					vm_page_free(pg);
1949				}
1950				vm_page_unlock_queues();
1951				VM_OBJECT_UNLOCK(obj);
1952				sbunlock(&so->so_snd);
1953				goto done;
1954			}
1955		} else
1956			VM_OBJECT_UNLOCK(obj);
1957		vm_page_unlock_queues();
1958
1959		/*
1960		 * Get a sendfile buf. We usually wait as long as necessary,
1961		 * but this wait can be interrupted.
1962		 */
1963		if ((sf = sf_buf_alloc(pg)) == NULL) {
1964			vm_page_lock_queues();
1965			vm_page_unwire(pg, 0);
1966			if (pg->wire_count == 0 && pg->object == NULL)
1967				vm_page_free(pg);
1968			vm_page_unlock_queues();
1969			sbunlock(&so->so_snd);
1970			error = EINTR;
1971			goto done;
1972		}
1973
1974		/*
1975		 * Get an mbuf header and set it up as having external storage.
1976		 */
1977		MGETHDR(m, M_TRYWAIT, MT_DATA);
1978		if (m == NULL) {
1979			error = ENOBUFS;
1980			sf_buf_free((void *)sf->kva, sf);
1981			sbunlock(&so->so_snd);
1982			goto done;
1983		}
1984		/*
1985		 * Setup external storage for mbuf.
1986		 */
1987		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, sf, M_RDONLY,
1988		    EXT_SFBUF);
1989		m->m_data = (char *) sf->kva + pgoff;
1990		m->m_pkthdr.len = m->m_len = xfsize;
1991		/*
1992		 * Add the buffer to the socket buffer chain.
1993		 */
1994		s = splnet();
1995retry_space:
1996		/*
1997		 * Make sure that the socket is still able to take more data.
1998		 * CANTSENDMORE being true usually means that the connection
1999		 * was closed. so_error is true when an error was sensed after
2000		 * a previous send.
2001		 * The state is checked after the page mapping and buffer
2002		 * allocation above since those operations may block and make
2003		 * any socket checks stale. From this point forward, nothing
2004		 * blocks before the pru_send (or more accurately, any blocking
2005		 * results in a loop back to here to re-check).
2006		 */
2007		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
2008			if (so->so_state & SS_CANTSENDMORE) {
2009				error = EPIPE;
2010			} else {
2011				error = so->so_error;
2012				so->so_error = 0;
2013			}
2014			m_freem(m);
2015			sbunlock(&so->so_snd);
2016			splx(s);
2017			goto done;
2018		}
2019		/*
2020		 * Wait for socket space to become available. We do this just
2021		 * after checking the connection state above in order to avoid
2022		 * a race condition with sbwait().
2023		 */
2024		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
2025			if (so->so_state & SS_NBIO) {
2026				m_freem(m);
2027				sbunlock(&so->so_snd);
2028				splx(s);
2029				error = EAGAIN;
2030				goto done;
2031			}
2032			error = sbwait(&so->so_snd);
2033			/*
2034			 * An error from sbwait usually indicates that we've
2035			 * been interrupted by a signal. If we've sent anything
2036			 * then return bytes sent, otherwise return the error.
2037			 */
2038			if (error) {
2039				m_freem(m);
2040				sbunlock(&so->so_snd);
2041				splx(s);
2042				goto done;
2043			}
2044			goto retry_space;
2045		}
2046		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
2047		splx(s);
2048		if (error) {
2049			sbunlock(&so->so_snd);
2050			goto done;
2051		}
2052	}
2053	sbunlock(&so->so_snd);
2054
2055	/*
2056	 * Send trailers. Wimp out and use writev(2).
2057	 */
2058	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
2059			nuap.fd = uap->s;
2060			nuap.iovp = hdtr.trailers;
2061			nuap.iovcnt = hdtr.trl_cnt;
2062			error = writev(td, &nuap);
2063			if (error)
2064				goto done;
2065			if (compat)
2066				sbytes += td->td_retval[0];
2067			else
2068				hdtr_size += td->td_retval[0];
2069	}
2070
2071done:
2072	/*
2073	 * If there was no error we have to clear td->td_retval[0]
2074	 * because it may have been set by writev.
2075	 */
2076	if (error == 0) {
2077		td->td_retval[0] = 0;
2078	}
2079	if (uap->sbytes != NULL) {
2080		if (!compat)
2081			sbytes += hdtr_size;
2082		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2083	}
2084	if (vp)
2085		vrele(vp);
2086	if (so)
2087		fputsock(so);
2088	mtx_unlock(&Giant);
2089	return (error);
2090}
2091