kern_sendfile.c revision 82610
1/*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37 * $FreeBSD: head/sys/kern/uipc_syscalls.c 82610 2001-08-31 00:37:34Z dillon $
38 */
39
40#include "opt_compat.h"
41#include "opt_ktrace.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/mutex.h>
48#include <sys/sysproto.h>
49#include <sys/malloc.h>
50#include <sys/filedesc.h>
51#include <sys/event.h>
52#include <sys/proc.h>
53#include <sys/fcntl.h>
54#include <sys/file.h>
55#include <sys/lock.h>
56#include <sys/mount.h>
57#include <sys/mbuf.h>
58#include <sys/protosw.h>
59#include <sys/socket.h>
60#include <sys/socketvar.h>
61#include <sys/signalvar.h>
62#include <sys/uio.h>
63#include <sys/vnode.h>
64#ifdef KTRACE
65#include <sys/ktrace.h>
66#endif
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_page.h>
71#include <vm/vm_pageout.h>
72#include <vm/vm_kern.h>
73#include <vm/vm_extern.h>
74
75static void sf_buf_init(void *arg);
76SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
77static struct sf_buf *sf_buf_alloc(void);
78static void sf_buf_free(caddr_t addr, void *args);
79
80static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
81static int recvit __P((struct proc *p, int s, struct msghdr *mp,
82		       caddr_t namelenp));
83
84static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
85static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
86			     int compat));
87static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
88			     int compat));
89
90/*
91 * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the
92 * sf_freelist head with the sf_lock mutex.
93 */
94static struct {
95	SLIST_HEAD(, sf_buf) sf_head;
96	struct mtx sf_lock;
97} sf_freelist;
98
99static vm_offset_t sf_base;
100static struct sf_buf *sf_bufs;
101static u_int sf_buf_alloc_want;
102
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_OLDSOCK is turned on whenever either COMPAT_43 or COMPAT_SUNOS
 * is configured; it guards the old-style ("o"-prefixed) entry points and
 * the old sockaddr/msghdr handling below.
 */
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#define COMPAT_OLDSOCK
#endif

/* File operations vector installed on every socket descriptor. */
extern struct fileops socketops;
111
112/*
113 * MPSAFE
114 */
115int
116socket(p, uap)
117	struct proc *p;
118	register struct socket_args /* {
119		int	domain;
120		int	type;
121		int	protocol;
122	} */ *uap;
123{
124	struct filedesc *fdp;
125	struct socket *so;
126	struct file *fp;
127	int fd, error;
128
129	mtx_lock(&Giant);
130
131	fdp = p->p_fd;
132	error = falloc(p, &fp, &fd);
133	if (error)
134		goto done2;
135	fhold(fp);
136	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
137	if (error) {
138		if (fdp->fd_ofiles[fd] == fp) {
139			fdp->fd_ofiles[fd] = NULL;
140			fdrop(fp, p);
141		}
142	} else {
143		fp->f_data = (caddr_t)so;
144		fp->f_flag = FREAD|FWRITE;
145		fp->f_ops = &socketops;
146		fp->f_type = DTYPE_SOCKET;
147		p->p_retval[0] = fd;
148	}
149	fdrop(fp, p);
150done2:
151	mtx_unlock(&Giant);
152	return (error);
153}
154
155/*
156 * MPSAFE
157 */
158/* ARGSUSED */
159int
160bind(p, uap)
161	struct proc *p;
162	register struct bind_args /* {
163		int	s;
164		caddr_t	name;
165		int	namelen;
166	} */ *uap;
167{
168	struct file *fp;
169	struct sockaddr *sa;
170	int error;
171
172	mtx_lock(&Giant);
173
174	error = holdsock(p->p_fd, uap->s, &fp);
175	if (error)
176		goto done2;
177	error = getsockaddr(&sa, uap->name, uap->namelen);
178	if (error) {
179		fdrop(fp, p);
180		goto done2;
181	}
182	error = sobind((struct socket *)fp->f_data, sa, p);
183	FREE(sa, M_SONAME);
184	fdrop(fp, p);
185done2:
186	mtx_unlock(&Giant);
187	return (error);
188}
189
190/*
191 * MPSAFE
192 */
193/* ARGSUSED */
194int
195listen(p, uap)
196	struct proc *p;
197	register struct listen_args /* {
198		int	s;
199		int	backlog;
200	} */ *uap;
201{
202	struct file *fp;
203	int error;
204
205	mtx_lock(&Giant);
206	error = holdsock(p->p_fd, uap->s, &fp);
207	if (error == 0) {
208		error = solisten((struct socket *)fp->f_data, uap->backlog, p);
209		fdrop(fp, p);
210	}
211	mtx_unlock(&Giant);
212	return(error);
213}
214
215/*
216 * accept1()
217 * MPSAFE
218 */
219static int
220accept1(p, uap, compat)
221	struct proc *p;
222	register struct accept_args /* {
223		int	s;
224		caddr_t	name;
225		int	*anamelen;
226	} */ *uap;
227	int compat;
228{
229	struct filedesc *fdp;
230	struct file *lfp = NULL;
231	struct file *nfp = NULL;
232	struct sockaddr *sa;
233	int namelen, error, s;
234	struct socket *head, *so;
235	int fd;
236	short fflag;		/* type must match fp->f_flag */
237
238	mtx_lock(&Giant);
239	fdp = p->p_fd;
240
241	if (uap->name) {
242		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
243			sizeof (namelen));
244		if(error)
245			goto done2;
246	}
247	error = holdsock(fdp, uap->s, &lfp);
248	if (error)
249		goto done2;
250	s = splnet();
251	head = (struct socket *)lfp->f_data;
252	if ((head->so_options & SO_ACCEPTCONN) == 0) {
253		splx(s);
254		error = EINVAL;
255		goto done;
256	}
257	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
258		splx(s);
259		error = EWOULDBLOCK;
260		goto done;
261	}
262	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
263		if (head->so_state & SS_CANTRCVMORE) {
264			head->so_error = ECONNABORTED;
265			break;
266		}
267		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
268		    "accept", 0);
269		if (error) {
270			splx(s);
271			goto done;
272		}
273	}
274	if (head->so_error) {
275		error = head->so_error;
276		head->so_error = 0;
277		splx(s);
278		goto done;
279	}
280
281	/*
282	 * At this point we know that there is at least one connection
283	 * ready to be accepted. Remove it from the queue prior to
284	 * allocating the file descriptor for it since falloc() may
285	 * block allowing another process to accept the connection
286	 * instead.
287	 */
288	so = TAILQ_FIRST(&head->so_comp);
289	TAILQ_REMOVE(&head->so_comp, so, so_list);
290	head->so_qlen--;
291
292	fflag = lfp->f_flag;
293	error = falloc(p, &nfp, &fd);
294	if (error) {
295		/*
296		 * Probably ran out of file descriptors. Put the
297		 * unaccepted connection back onto the queue and
298		 * do another wakeup so some other process might
299		 * have a chance at it.
300		 */
301		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
302		head->so_qlen++;
303		wakeup_one(&head->so_timeo);
304		splx(s);
305		goto done;
306	}
307	fhold(nfp);
308	p->p_retval[0] = fd;
309
310	/* connection has been removed from the listen queue */
311	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
312
313	so->so_state &= ~SS_COMP;
314	so->so_head = NULL;
315	if (head->so_sigio != NULL)
316		fsetown(fgetown(head->so_sigio), &so->so_sigio);
317
318	nfp->f_data = (caddr_t)so;
319	nfp->f_flag = fflag;
320	nfp->f_ops = &socketops;
321	nfp->f_type = DTYPE_SOCKET;
322	sa = 0;
323	error = soaccept(so, &sa);
324	if (error) {
325		/*
326		 * return a namelen of zero for older code which might
327	 	 * ignore the return value from accept.
328		 */
329		if (uap->name != NULL) {
330			namelen = 0;
331			(void) copyout((caddr_t)&namelen,
332			    (caddr_t)uap->anamelen, sizeof(*uap->anamelen));
333		}
334		goto noconnection;
335	}
336	if (sa == NULL) {
337		namelen = 0;
338		if (uap->name)
339			goto gotnoname;
340		splx(s);
341		error = 0;
342		goto done;
343	}
344	if (uap->name) {
345		/* check sa_len before it is destroyed */
346		if (namelen > sa->sa_len)
347			namelen = sa->sa_len;
348#ifdef COMPAT_OLDSOCK
349		if (compat)
350			((struct osockaddr *)sa)->sa_family =
351			    sa->sa_family;
352#endif
353		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
354		if (!error)
355gotnoname:
356			error = copyout((caddr_t)&namelen,
357			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
358	}
359noconnection:
360	if (sa)
361		FREE(sa, M_SONAME);
362
363	/*
364	 * close the new descriptor, assuming someone hasn't ripped it
365	 * out from under us.
366	 */
367	if (error) {
368		if (fdp->fd_ofiles[fd] == nfp) {
369			fdp->fd_ofiles[fd] = NULL;
370			fdrop(nfp, p);
371		}
372	}
373	splx(s);
374
375	/*
376	 * Release explicitly held references before returning.
377	 */
378done:
379	if (nfp != NULL)
380		fdrop(nfp, p);
381	fdrop(lfp, p);
382done2:
383	mtx_unlock(&Giant);
384	return (error);
385}
386
/*
 * accept() - native entry point; runs accept1() with compat disabled.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
397
#ifdef COMPAT_OLDSOCK
/*
 * oaccept() - old-style entry point; runs accept1() with compat enabled.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
410
411/*
412 * MPSAFE
413 */
414/* ARGSUSED */
415int
416connect(p, uap)
417	struct proc *p;
418	register struct connect_args /* {
419		int	s;
420		caddr_t	name;
421		int	namelen;
422	} */ *uap;
423{
424	struct file *fp;
425	register struct socket *so;
426	struct sockaddr *sa;
427	int error, s;
428
429	mtx_lock(&Giant);
430
431	error = holdsock(p->p_fd, uap->s, &fp);
432	if (error)
433		goto done2;
434	so = (struct socket *)fp->f_data;
435	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
436		error = EALREADY;
437		goto done;
438	}
439	error = getsockaddr(&sa, uap->name, uap->namelen);
440	if (error)
441		goto done;
442	error = soconnect(so, sa, p);
443	if (error)
444		goto bad;
445	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
446		FREE(sa, M_SONAME);
447		error = EINPROGRESS;
448		goto done;
449	}
450	s = splnet();
451	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
452		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
453		    "connec", 0);
454		if (error)
455			break;
456	}
457	if (error == 0) {
458		error = so->so_error;
459		so->so_error = 0;
460	}
461	splx(s);
462bad:
463	so->so_state &= ~SS_ISCONNECTING;
464	FREE(sa, M_SONAME);
465	if (error == ERESTART)
466		error = EINTR;
467done:
468	fdrop(fp, p);
469done2:
470	mtx_unlock(&Giant);
471	return (error);
472}
473
474/*
475 * MPSAFE
476 */
477int
478socketpair(p, uap)
479	struct proc *p;
480	register struct socketpair_args /* {
481		int	domain;
482		int	type;
483		int	protocol;
484		int	*rsv;
485	} */ *uap;
486{
487	register struct filedesc *fdp = p->p_fd;
488	struct file *fp1, *fp2;
489	struct socket *so1, *so2;
490	int fd, error, sv[2];
491
492	mtx_lock(&Giant);
493
494	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
495	if (error)
496		goto done2;
497	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
498	if (error)
499		goto free1;
500	error = falloc(p, &fp1, &fd);
501	if (error)
502		goto free2;
503	fhold(fp1);
504	sv[0] = fd;
505	fp1->f_data = (caddr_t)so1;
506	error = falloc(p, &fp2, &fd);
507	if (error)
508		goto free3;
509	fhold(fp2);
510	fp2->f_data = (caddr_t)so2;
511	sv[1] = fd;
512	error = soconnect2(so1, so2);
513	if (error)
514		goto free4;
515	if (uap->type == SOCK_DGRAM) {
516		/*
517		 * Datagram socket connection is asymmetric.
518		 */
519		 error = soconnect2(so2, so1);
520		 if (error)
521			goto free4;
522	}
523	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
524	fp1->f_ops = fp2->f_ops = &socketops;
525	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
526	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
527	fdrop(fp1, p);
528	fdrop(fp2, p);
529	goto done2;
530free4:
531	if (fdp->fd_ofiles[sv[1]] == fp2) {
532		fdp->fd_ofiles[sv[1]] = NULL;
533		fdrop(fp2, p);
534	}
535	fdrop(fp2, p);
536free3:
537	if (fdp->fd_ofiles[sv[0]] == fp1) {
538		fdp->fd_ofiles[sv[0]] = NULL;
539		fdrop(fp1, p);
540	}
541	fdrop(fp1, p);
542free2:
543	(void)soclose(so2);
544free1:
545	(void)soclose(so1);
546done2:
547	mtx_unlock(&Giant);
548	return (error);
549}
550
551static int
552sendit(p, s, mp, flags)
553	register struct proc *p;
554	int s;
555	register struct msghdr *mp;
556	int flags;
557{
558	struct file *fp;
559	struct uio auio;
560	register struct iovec *iov;
561	register int i;
562	struct mbuf *control;
563	struct sockaddr *to;
564	int len, error;
565	struct socket *so;
566#ifdef KTRACE
567	struct iovec *ktriov = NULL;
568	struct uio ktruio;
569#endif
570
571	error = holdsock(p->p_fd, s, &fp);
572	if (error)
573		return (error);
574	auio.uio_iov = mp->msg_iov;
575	auio.uio_iovcnt = mp->msg_iovlen;
576	auio.uio_segflg = UIO_USERSPACE;
577	auio.uio_rw = UIO_WRITE;
578	auio.uio_procp = p;
579	auio.uio_offset = 0;			/* XXX */
580	auio.uio_resid = 0;
581	iov = mp->msg_iov;
582	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
583		if ((auio.uio_resid += iov->iov_len) < 0) {
584			fdrop(fp, p);
585			return (EINVAL);
586		}
587	}
588	if (mp->msg_name) {
589		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
590		if (error) {
591			fdrop(fp, p);
592			return (error);
593		}
594	} else {
595		to = 0;
596	}
597	if (mp->msg_control) {
598		if (mp->msg_controllen < sizeof(struct cmsghdr)
599#ifdef COMPAT_OLDSOCK
600		    && mp->msg_flags != MSG_COMPAT
601#endif
602		) {
603			error = EINVAL;
604			goto bad;
605		}
606		error = sockargs(&control, mp->msg_control,
607		    mp->msg_controllen, MT_CONTROL);
608		if (error)
609			goto bad;
610#ifdef COMPAT_OLDSOCK
611		if (mp->msg_flags == MSG_COMPAT) {
612			register struct cmsghdr *cm;
613
614			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
615			if (control == 0) {
616				error = ENOBUFS;
617				goto bad;
618			} else {
619				cm = mtod(control, struct cmsghdr *);
620				cm->cmsg_len = control->m_len;
621				cm->cmsg_level = SOL_SOCKET;
622				cm->cmsg_type = SCM_RIGHTS;
623			}
624		}
625#endif
626	} else {
627		control = 0;
628	}
629#ifdef KTRACE
630	if (KTRPOINT(p, KTR_GENIO)) {
631		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
632
633		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
634		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
635		ktruio = auio;
636	}
637#endif
638	len = auio.uio_resid;
639	so = (struct socket *)fp->f_data;
640	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
641						     flags, p);
642	if (error) {
643		if (auio.uio_resid != len && (error == ERESTART ||
644		    error == EINTR || error == EWOULDBLOCK))
645			error = 0;
646		if (error == EPIPE) {
647			PROC_LOCK(p);
648			psignal(p, SIGPIPE);
649			PROC_UNLOCK(p);
650		}
651	}
652	if (error == 0)
653		p->p_retval[0] = len - auio.uio_resid;
654#ifdef KTRACE
655	if (ktriov != NULL) {
656		if (error == 0) {
657			ktruio.uio_iov = ktriov;
658			ktruio.uio_resid = p->p_retval[0];
659			ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
660		}
661		FREE(ktriov, M_TEMP);
662	}
663#endif
664bad:
665	fdrop(fp, p);
666	if (to)
667		FREE(to, M_SONAME);
668	return (error);
669}
670
671/*
672 * MPSAFE
673 */
674int
675sendto(p, uap)
676	struct proc *p;
677	register struct sendto_args /* {
678		int	s;
679		caddr_t	buf;
680		size_t	len;
681		int	flags;
682		caddr_t	to;
683		int	tolen;
684	} */ *uap;
685{
686	struct msghdr msg;
687	struct iovec aiov;
688	int error;
689
690	msg.msg_name = uap->to;
691	msg.msg_namelen = uap->tolen;
692	msg.msg_iov = &aiov;
693	msg.msg_iovlen = 1;
694	msg.msg_control = 0;
695#ifdef COMPAT_OLDSOCK
696	msg.msg_flags = 0;
697#endif
698	aiov.iov_base = uap->buf;
699	aiov.iov_len = uap->len;
700	mtx_lock(&Giant);
701	error = sendit(p, uap->s, &msg, uap->flags);
702	mtx_unlock(&Giant);
703	return (error);
704}
705
706#ifdef COMPAT_OLDSOCK
707/*
708 * MPSAFE
709 */
710int
711osend(p, uap)
712	struct proc *p;
713	register struct osend_args /* {
714		int	s;
715		caddr_t	buf;
716		int	len;
717		int	flags;
718	} */ *uap;
719{
720	struct msghdr msg;
721	struct iovec aiov;
722	int error;
723
724	msg.msg_name = 0;
725	msg.msg_namelen = 0;
726	msg.msg_iov = &aiov;
727	msg.msg_iovlen = 1;
728	aiov.iov_base = uap->buf;
729	aiov.iov_len = uap->len;
730	msg.msg_control = 0;
731	msg.msg_flags = 0;
732	mtx_lock(&Giant);
733	error = sendit(p, uap->s, &msg, uap->flags);
734	mtx_unlock(&Giant);
735	return (error);
736}
737
738/*
739 * MPSAFE
740 */
741int
742osendmsg(p, uap)
743	struct proc *p;
744	register struct osendmsg_args /* {
745		int	s;
746		caddr_t	msg;
747		int	flags;
748	} */ *uap;
749{
750	struct msghdr msg;
751	struct iovec aiov[UIO_SMALLIOV], *iov;
752	int error;
753
754	mtx_lock(&Giant);
755	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
756	if (error)
757		goto done2;
758	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
759		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
760			error = EMSGSIZE;
761			goto done2;
762		}
763		MALLOC(iov, struct iovec *,
764		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
765		      M_WAITOK);
766	} else {
767		iov = aiov;
768	}
769	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
770	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
771	if (error)
772		goto done;
773	msg.msg_flags = MSG_COMPAT;
774	msg.msg_iov = iov;
775	error = sendit(p, uap->s, &msg, uap->flags);
776done:
777	if (iov != aiov)
778		FREE(iov, M_IOV);
779done2:
780	mtx_unlock(&Giant);
781	return (error);
782}
783#endif
784
785/*
786 * MPSAFE
787 */
788int
789sendmsg(p, uap)
790	struct proc *p;
791	register struct sendmsg_args /* {
792		int	s;
793		caddr_t	msg;
794		int	flags;
795	} */ *uap;
796{
797	struct msghdr msg;
798	struct iovec aiov[UIO_SMALLIOV], *iov;
799	int error;
800
801	mtx_lock(&Giant);
802
803	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
804	if (error)
805		goto done2;
806	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
807		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
808			error = EMSGSIZE;
809			goto done2;
810		}
811		MALLOC(iov, struct iovec *,
812		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
813		       M_WAITOK);
814	} else {
815		iov = aiov;
816	}
817	if (msg.msg_iovlen &&
818	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
819	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
820		goto done;
821	msg.msg_iov = iov;
822#ifdef COMPAT_OLDSOCK
823	msg.msg_flags = 0;
824#endif
825	error = sendit(p, uap->s, &msg, uap->flags);
826done:
827	if (iov != aiov)
828		FREE(iov, M_IOV);
829done2:
830	mtx_unlock(&Giant);
831	return (error);
832}
833
834static int
835recvit(p, s, mp, namelenp)
836	register struct proc *p;
837	int s;
838	register struct msghdr *mp;
839	caddr_t namelenp;
840{
841	struct file *fp;
842	struct uio auio;
843	register struct iovec *iov;
844	register int i;
845	int len, error;
846	struct mbuf *m, *control = 0;
847	caddr_t ctlbuf;
848	struct socket *so;
849	struct sockaddr *fromsa = 0;
850#ifdef KTRACE
851	struct iovec *ktriov = NULL;
852	struct uio ktruio;
853#endif
854
855	error = holdsock(p->p_fd, s, &fp);
856	if (error)
857		return (error);
858	auio.uio_iov = mp->msg_iov;
859	auio.uio_iovcnt = mp->msg_iovlen;
860	auio.uio_segflg = UIO_USERSPACE;
861	auio.uio_rw = UIO_READ;
862	auio.uio_procp = p;
863	auio.uio_offset = 0;			/* XXX */
864	auio.uio_resid = 0;
865	iov = mp->msg_iov;
866	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
867		if ((auio.uio_resid += iov->iov_len) < 0) {
868			fdrop(fp, p);
869			return (EINVAL);
870		}
871	}
872#ifdef KTRACE
873	if (KTRPOINT(p, KTR_GENIO)) {
874		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
875
876		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
877		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
878		ktruio = auio;
879	}
880#endif
881	len = auio.uio_resid;
882	so = (struct socket *)fp->f_data;
883	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
884	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
885	    &mp->msg_flags);
886	if (error) {
887		if (auio.uio_resid != len && (error == ERESTART ||
888		    error == EINTR || error == EWOULDBLOCK))
889			error = 0;
890	}
891#ifdef KTRACE
892	if (ktriov != NULL) {
893		if (error == 0) {
894			ktruio.uio_iov = ktriov;
895			ktruio.uio_resid = len - auio.uio_resid;
896			ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
897		}
898		FREE(ktriov, M_TEMP);
899	}
900#endif
901	if (error)
902		goto out;
903	p->p_retval[0] = len - auio.uio_resid;
904	if (mp->msg_name) {
905		len = mp->msg_namelen;
906		if (len <= 0 || fromsa == 0)
907			len = 0;
908		else {
909#ifndef MIN
910#define MIN(a,b) ((a)>(b)?(b):(a))
911#endif
912			/* save sa_len before it is destroyed by MSG_COMPAT */
913			len = MIN(len, fromsa->sa_len);
914#ifdef COMPAT_OLDSOCK
915			if (mp->msg_flags & MSG_COMPAT)
916				((struct osockaddr *)fromsa)->sa_family =
917				    fromsa->sa_family;
918#endif
919			error = copyout(fromsa,
920			    (caddr_t)mp->msg_name, (unsigned)len);
921			if (error)
922				goto out;
923		}
924		mp->msg_namelen = len;
925		if (namelenp &&
926		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
927#ifdef COMPAT_OLDSOCK
928			if (mp->msg_flags & MSG_COMPAT)
929				error = 0;	/* old recvfrom didn't check */
930			else
931#endif
932			goto out;
933		}
934	}
935	if (mp->msg_control) {
936#ifdef COMPAT_OLDSOCK
937		/*
938		 * We assume that old recvmsg calls won't receive access
939		 * rights and other control info, esp. as control info
940		 * is always optional and those options didn't exist in 4.3.
941		 * If we receive rights, trim the cmsghdr; anything else
942		 * is tossed.
943		 */
944		if (control && mp->msg_flags & MSG_COMPAT) {
945			if (mtod(control, struct cmsghdr *)->cmsg_level !=
946			    SOL_SOCKET ||
947			    mtod(control, struct cmsghdr *)->cmsg_type !=
948			    SCM_RIGHTS) {
949				mp->msg_controllen = 0;
950				goto out;
951			}
952			control->m_len -= sizeof (struct cmsghdr);
953			control->m_data += sizeof (struct cmsghdr);
954		}
955#endif
956		len = mp->msg_controllen;
957		m = control;
958		mp->msg_controllen = 0;
959		ctlbuf = (caddr_t) mp->msg_control;
960
961		while (m && len > 0) {
962			unsigned int tocopy;
963
964			if (len >= m->m_len)
965				tocopy = m->m_len;
966			else {
967				mp->msg_flags |= MSG_CTRUNC;
968				tocopy = len;
969			}
970
971			if ((error = copyout((caddr_t)mtod(m, caddr_t),
972					ctlbuf, tocopy)) != 0)
973				goto out;
974
975			ctlbuf += tocopy;
976			len -= tocopy;
977			m = m->m_next;
978		}
979		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
980	}
981out:
982	fdrop(fp, p);
983	if (fromsa)
984		FREE(fromsa, M_SONAME);
985	if (control)
986		m_freem(control);
987	return (error);
988}
989
990/*
991 * MPSAFE
992 */
993int
994recvfrom(p, uap)
995	struct proc *p;
996	register struct recvfrom_args /* {
997		int	s;
998		caddr_t	buf;
999		size_t	len;
1000		int	flags;
1001		caddr_t	from;
1002		int	*fromlenaddr;
1003	} */ *uap;
1004{
1005	struct msghdr msg;
1006	struct iovec aiov;
1007	int error;
1008
1009	mtx_lock(&Giant);
1010
1011	if (uap->fromlenaddr) {
1012		error = copyin((caddr_t)uap->fromlenaddr,
1013		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
1014		if (error)
1015			goto done2;
1016	} else {
1017		msg.msg_namelen = 0;
1018	}
1019	msg.msg_name = uap->from;
1020	msg.msg_iov = &aiov;
1021	msg.msg_iovlen = 1;
1022	aiov.iov_base = uap->buf;
1023	aiov.iov_len = uap->len;
1024	msg.msg_control = 0;
1025	msg.msg_flags = uap->flags;
1026	error = recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr);
1027done2:
1028	mtx_unlock(&Giant);
1029	return(error);
1030}
1031
#ifdef COMPAT_OLDSOCK
/*
 * orecvfrom() - old recvfrom(): sets MSG_COMPAT in the caller's flags
 * and defers to recvfrom().
 *
 * MPSAFE
 */
int
orecvfrom(p, uap)
	struct proc *p;
	struct recvfrom_args *uap;
{
	int error;

	uap->flags |= MSG_COMPAT;
	error = recvfrom(p, uap);
	return (error);
}
#endif
1046
1047
1048#ifdef COMPAT_OLDSOCK
1049/*
1050 * MPSAFE
1051 */
1052int
1053orecv(p, uap)
1054	struct proc *p;
1055	register struct orecv_args /* {
1056		int	s;
1057		caddr_t	buf;
1058		int	len;
1059		int	flags;
1060	} */ *uap;
1061{
1062	struct msghdr msg;
1063	struct iovec aiov;
1064	int error;
1065
1066	mtx_lock(&Giant);
1067	msg.msg_name = 0;
1068	msg.msg_namelen = 0;
1069	msg.msg_iov = &aiov;
1070	msg.msg_iovlen = 1;
1071	aiov.iov_base = uap->buf;
1072	aiov.iov_len = uap->len;
1073	msg.msg_control = 0;
1074	msg.msg_flags = uap->flags;
1075	error = recvit(p, uap->s, &msg, (caddr_t)0);
1076	mtx_unlock(&Giant);
1077	return (error);
1078}
1079
1080/*
1081 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1082 * overlays the new one, missing only the flags, and with the (old) access
1083 * rights where the control fields are now.
1084 *
1085 * MPSAFE
1086 */
1087int
1088orecvmsg(p, uap)
1089	struct proc *p;
1090	register struct orecvmsg_args /* {
1091		int	s;
1092		struct	omsghdr *msg;
1093		int	flags;
1094	} */ *uap;
1095{
1096	struct msghdr msg;
1097	struct iovec aiov[UIO_SMALLIOV], *iov;
1098	int error;
1099
1100	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
1101	    sizeof (struct omsghdr));
1102	if (error)
1103		return (error);
1104
1105	mtx_lock(&Giant);
1106	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1107		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1108			error = EMSGSIZE;
1109			goto done2;
1110		}
1111		MALLOC(iov, struct iovec *,
1112		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1113		      M_WAITOK);
1114	} else {
1115		iov = aiov;
1116	}
1117	msg.msg_flags = uap->flags | MSG_COMPAT;
1118	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
1119	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1120	if (error)
1121		goto done;
1122	msg.msg_iov = iov;
1123	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
1124
1125	if (msg.msg_controllen && error == 0)
1126		error = copyout((caddr_t)&msg.msg_controllen,
1127		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
1128done:
1129	if (iov != aiov)
1130		FREE(iov, M_IOV);
1131done2:
1132	mtx_unlock(&Giant);
1133	return (error);
1134}
1135#endif
1136
1137/*
1138 * MPSAFE
1139 */
1140int
1141recvmsg(p, uap)
1142	struct proc *p;
1143	register struct recvmsg_args /* {
1144		int	s;
1145		struct	msghdr *msg;
1146		int	flags;
1147	} */ *uap;
1148{
1149	struct msghdr msg;
1150	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1151	register int error;
1152
1153	mtx_lock(&Giant);
1154
1155	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
1156	if (error)
1157		goto done2;
1158	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1159		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1160			error = EMSGSIZE;
1161			goto done2;
1162		}
1163		MALLOC(iov, struct iovec *,
1164		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1165		       M_WAITOK);
1166	} else {
1167		iov = aiov;
1168	}
1169#ifdef COMPAT_OLDSOCK
1170	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1171#else
1172	msg.msg_flags = uap->flags;
1173#endif
1174	uiov = msg.msg_iov;
1175	msg.msg_iov = iov;
1176	error = copyin((caddr_t)uiov, (caddr_t)iov,
1177	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1178	if (error)
1179		goto done;
1180	error = recvit(p, uap->s, &msg, (caddr_t)0);
1181	if (!error) {
1182		msg.msg_iov = uiov;
1183		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
1184	}
1185done:
1186	if (iov != aiov)
1187		FREE(iov, M_IOV);
1188done2:
1189	mtx_unlock(&Giant);
1190	return (error);
1191}
1192
1193/*
1194 * MPSAFE
1195 */
1196/* ARGSUSED */
1197int
1198shutdown(p, uap)
1199	struct proc *p;
1200	register struct shutdown_args /* {
1201		int	s;
1202		int	how;
1203	} */ *uap;
1204{
1205	struct file *fp;
1206	int error;
1207
1208	mtx_lock(&Giant);
1209
1210	error = holdsock(p->p_fd, uap->s, &fp);
1211	if (error == 0) {
1212		error = soshutdown((struct socket *)fp->f_data, uap->how);
1213		fdrop(fp, p);
1214	}
1215	mtx_unlock(&Giant);
1216	return(error);
1217}
1218
1219/*
1220 * MPSAFE
1221 */
1222/* ARGSUSED */
1223int
1224setsockopt(p, uap)
1225	struct proc *p;
1226	register struct setsockopt_args /* {
1227		int	s;
1228		int	level;
1229		int	name;
1230		caddr_t	val;
1231		int	valsize;
1232	} */ *uap;
1233{
1234	struct file *fp;
1235	struct sockopt sopt;
1236	int error;
1237
1238	if (uap->val == 0 && uap->valsize != 0)
1239		return (EFAULT);
1240	if (uap->valsize < 0)
1241		return (EINVAL);
1242
1243	mtx_lock(&Giant);
1244	error = holdsock(p->p_fd, uap->s, &fp);
1245	if (error == 0) {
1246		sopt.sopt_dir = SOPT_SET;
1247		sopt.sopt_level = uap->level;
1248		sopt.sopt_name = uap->name;
1249		sopt.sopt_val = uap->val;
1250		sopt.sopt_valsize = uap->valsize;
1251		sopt.sopt_p = p;
1252		error = sosetopt((struct socket *)fp->f_data, &sopt);
1253		fdrop(fp, p);
1254	}
1255	mtx_unlock(&Giant);
1256	return(error);
1257}
1258
1259/*
1260 * MPSAFE
1261 */
1262/* ARGSUSED */
1263int
1264getsockopt(p, uap)
1265	struct proc *p;
1266	register struct getsockopt_args /* {
1267		int	s;
1268		int	level;
1269		int	name;
1270		caddr_t	val;
1271		int	*avalsize;
1272	} */ *uap;
1273{
1274	int	valsize, error;
1275	struct	file *fp;
1276	struct	sockopt sopt;
1277
1278	mtx_lock(&Giant);
1279
1280	error = holdsock(p->p_fd, uap->s, &fp);
1281	if (error)
1282		goto done2;
1283	if (uap->val) {
1284		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1285		    sizeof (valsize));
1286		if (error) {
1287			fdrop(fp, p);
1288			goto done2;
1289		}
1290		if (valsize < 0) {
1291			fdrop(fp, p);
1292			error = EINVAL;
1293			goto done2;
1294		}
1295	} else {
1296		valsize = 0;
1297	}
1298
1299	sopt.sopt_dir = SOPT_GET;
1300	sopt.sopt_level = uap->level;
1301	sopt.sopt_name = uap->name;
1302	sopt.sopt_val = uap->val;
1303	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1304	sopt.sopt_p = p;
1305
1306	error = sogetopt((struct socket *)fp->f_data, &sopt);
1307	if (error == 0) {
1308		valsize = sopt.sopt_valsize;
1309		error = copyout((caddr_t)&valsize,
1310				(caddr_t)uap->avalsize, sizeof (valsize));
1311	}
1312	fdrop(fp, p);
1313done2:
1314	mtx_unlock(&Giant);
1315	return (error);
1316}
1317
1318/*
1319 * getsockname1() - Get socket name.
1320 *
1321 * MPSAFE
1322 */
1323/* ARGSUSED */
1324static int
1325getsockname1(p, uap, compat)
1326	struct proc *p;
1327	register struct getsockname_args /* {
1328		int	fdes;
1329		caddr_t	asa;
1330		int	*alen;
1331	} */ *uap;
1332	int compat;
1333{
1334	struct file *fp;
1335	register struct socket *so;
1336	struct sockaddr *sa;
1337	int len, error;
1338
1339	mtx_lock(&Giant);
1340
1341	error = holdsock(p->p_fd, uap->fdes, &fp);
1342	if (error)
1343		goto done2;
1344	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1345	if (error) {
1346		fdrop(fp, p);
1347		goto done2;
1348	}
1349	so = (struct socket *)fp->f_data;
1350	sa = 0;
1351	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1352	if (error)
1353		goto bad;
1354	if (sa == 0) {
1355		len = 0;
1356		goto gotnothing;
1357	}
1358
1359	len = MIN(len, sa->sa_len);
1360#ifdef COMPAT_OLDSOCK
1361	if (compat)
1362		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1363#endif
1364	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1365	if (error == 0)
1366gotnothing:
1367		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1368		    sizeof (len));
1369bad:
1370	if (sa)
1371		FREE(sa, M_SONAME);
1372	fdrop(fp, p);
1373done2:
1374	mtx_unlock(&Giant);
1375	return (error);
1376}
1377
/*
 * getsockname(2) entry point: hands the request to getsockname1()
 * with the compat flag cleared.
 *
 * MPSAFE
 */
int
getsockname(struct proc *p, struct getsockname_args *uap)
{

	return (getsockname1(p, uap, 0));
}
1388
#ifdef COMPAT_OLDSOCK
/*
 * Old-sockaddr flavour of getsockname(): hands the request to
 * getsockname1() with the compat flag set.
 *
 * MPSAFE
 */
int
ogetsockname(struct proc *p, struct getsockname_args *uap)
{

	return (getsockname1(p, uap, 1));
}
#endif /* COMPAT_OLDSOCK */
1401
1402/*
1403 * getpeername1() - Get name of peer for connected socket.
1404 *
1405 * MPSAFE
1406 */
1407/* ARGSUSED */
1408static int
1409getpeername1(p, uap, compat)
1410	struct proc *p;
1411	register struct getpeername_args /* {
1412		int	fdes;
1413		caddr_t	asa;
1414		int	*alen;
1415	} */ *uap;
1416	int compat;
1417{
1418	struct file *fp;
1419	register struct socket *so;
1420	struct sockaddr *sa;
1421	int len, error;
1422
1423	mtx_lock(&Giant);
1424
1425	error = holdsock(p->p_fd, uap->fdes, &fp);
1426	if (error)
1427		goto done2;
1428	so = (struct socket *)fp->f_data;
1429	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1430		fdrop(fp, p);
1431		error = ENOTCONN;
1432		goto done2;
1433	}
1434	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1435	if (error) {
1436		fdrop(fp, p);
1437		goto done2;
1438	}
1439	sa = 0;
1440	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1441	if (error)
1442		goto bad;
1443	if (sa == 0) {
1444		len = 0;
1445		goto gotnothing;
1446	}
1447	len = MIN(len, sa->sa_len);
1448#ifdef COMPAT_OLDSOCK
1449	if (compat)
1450		((struct osockaddr *)sa)->sa_family =
1451		    sa->sa_family;
1452#endif
1453	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1454	if (error)
1455		goto bad;
1456gotnothing:
1457	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1458bad:
1459	if (sa)
1460		FREE(sa, M_SONAME);
1461	fdrop(fp, p);
1462done2:
1463	mtx_unlock(&Giant);
1464	return (error);
1465}
1466
/*
 * getpeername(2) entry point: hands the request to getpeername1()
 * with the compat flag cleared.
 *
 * MPSAFE
 */
int
getpeername(struct proc *p, struct getpeername_args *uap)
{

	return (getpeername1(p, uap, 0));
}
1477
#ifdef COMPAT_OLDSOCK
/*
 * Old-sockaddr flavour of getpeername(): hands the request to
 * getpeername1() with the compat flag set.
 *
 * MPSAFE
 */
int
ogetpeername(struct proc *p, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1491
1492int
1493sockargs(mp, buf, buflen, type)
1494	struct mbuf **mp;
1495	caddr_t buf;
1496	int buflen, type;
1497{
1498	register struct sockaddr *sa;
1499	register struct mbuf *m;
1500	int error;
1501
1502	if ((u_int)buflen > MLEN) {
1503#ifdef COMPAT_OLDSOCK
1504		if (type == MT_SONAME && (u_int)buflen <= 112)
1505			buflen = MLEN;		/* unix domain compat. hack */
1506		else
1507#endif
1508		return (EINVAL);
1509	}
1510	m = m_get(M_TRYWAIT, type);
1511	if (m == NULL)
1512		return (ENOBUFS);
1513	m->m_len = buflen;
1514	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1515	if (error)
1516		(void) m_free(m);
1517	else {
1518		*mp = m;
1519		if (type == MT_SONAME) {
1520			sa = mtod(m, struct sockaddr *);
1521
1522#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1523			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1524				sa->sa_family = sa->sa_len;
1525#endif
1526			sa->sa_len = buflen;
1527		}
1528	}
1529	return (error);
1530}
1531
1532int
1533getsockaddr(namp, uaddr, len)
1534	struct sockaddr **namp;
1535	caddr_t uaddr;
1536	size_t len;
1537{
1538	struct sockaddr *sa;
1539	int error;
1540
1541	if (len > SOCK_MAXADDRLEN)
1542		return ENAMETOOLONG;
1543	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1544	error = copyin(uaddr, sa, len);
1545	if (error) {
1546		FREE(sa, M_SONAME);
1547	} else {
1548#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1549		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1550			sa->sa_family = sa->sa_len;
1551#endif
1552		sa->sa_len = len;
1553		*namp = sa;
1554	}
1555	return error;
1556}
1557
1558/*
1559 * holdsock() - load the struct file pointer associated
1560 * with a socket into *fpp.  If an error occurs, non-zero
1561 * will be returned and *fpp will be set to NULL.
1562 */
1563int
1564holdsock(fdp, fdes, fpp)
1565	struct filedesc *fdp;
1566	int fdes;
1567	struct file **fpp;
1568{
1569	register struct file *fp = NULL;
1570	int error = 0;
1571
1572	if ((unsigned)fdes >= fdp->fd_nfiles ||
1573	    (fp = fdp->fd_ofiles[fdes]) == NULL) {
1574		error = EBADF;
1575	} else if (fp->f_type != DTYPE_SOCKET) {
1576		error = ENOTSOCK;
1577		fp = NULL;
1578	} else {
1579		fhold(fp);
1580	}
1581	*fpp = fp;
1582	return(error);
1583}
1584
1585/*
1586 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1587 * XXX - The sf_buf functions are currently private to sendfile(2), so have
1588 * been made static, but may be useful in the future for doing zero-copy in
1589 * other parts of the networking code.
1590 */
1591static void
1592sf_buf_init(void *arg)
1593{
1594	int i;
1595
1596	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF);
1597	mtx_lock(&sf_freelist.sf_lock);
1598	SLIST_INIT(&sf_freelist.sf_head);
1599	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1600	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
1601	    M_NOWAIT | M_ZERO);
1602	for (i = 0; i < nsfbufs; i++) {
1603		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1604		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
1605	}
1606	sf_buf_alloc_want = 0;
1607	mtx_unlock(&sf_freelist.sf_lock);
1608}
1609
1610/*
1611 * Get an sf_buf from the freelist. Will block if none are available.
1612 */
1613static struct sf_buf *
1614sf_buf_alloc()
1615{
1616	struct sf_buf *sf;
1617	int error;
1618
1619	mtx_lock(&sf_freelist.sf_lock);
1620	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
1621		sf_buf_alloc_want++;
1622		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
1623		    "sfbufa", 0);
1624		sf_buf_alloc_want--;
1625
1626		/*
1627		 * If we got a signal, don't risk going back to sleep.
1628		 */
1629		if (error)
1630			break;
1631	}
1632	if (sf != NULL)
1633		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
1634	mtx_unlock(&sf_freelist.sf_lock);
1635	return (sf);
1636}
1637
/*
 * Map an address inside the sf_buf address window (which starts at
 * sf_base) to the sf_buf that owns that page: the page index within
 * the window is also the index into sf_bufs.
 */
#define dtosf(x) \
	(sf_bufs + (((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT))
1639
1640/*
1641 * Detatch mapped page and release resources back to the system.
1642 */
1643static void
1644sf_buf_free(caddr_t addr, void *args)
1645{
1646	struct sf_buf *sf;
1647	struct vm_page *m;
1648
1649	GIANT_REQUIRED;
1650
1651	sf = dtosf(addr);
1652	pmap_qremove((vm_offset_t)addr, 1);
1653	m = sf->m;
1654	vm_page_unwire(m, 0);
1655	/*
1656	 * Check for the object going away on us. This can
1657	 * happen since we don't hold a reference to it.
1658	 * If so, we're responsible for freeing the page.
1659	 */
1660	if (m->wire_count == 0 && m->object == NULL)
1661		vm_page_free(m);
1662	sf->m = NULL;
1663	mtx_lock(&sf_freelist.sf_lock);
1664	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
1665	if (sf_buf_alloc_want > 0)
1666		wakeup_one(&sf_freelist);
1667	mtx_unlock(&sf_freelist.sf_lock);
1668}
1669
1670/*
1671 * sendfile(2)
1672 *
1673 * MPSAFE
1674 *
1675 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1676 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1677 *
1678 * Send a file specified by 'fd' and starting at 'offset' to a socket
1679 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1680 * nbytes == 0. Optionally add a header and/or trailer to the socket
1681 * output. If specified, write the total number of bytes sent into *sbytes.
1682 *
1683 */
1684int
1685sendfile(struct proc *p, struct sendfile_args *uap)
1686{
1687	struct file *fp;
1688	struct filedesc *fdp = p->p_fd;
1689	struct vnode *vp;
1690	struct vm_object *obj;
1691	struct socket *so;
1692	struct mbuf *m;
1693	struct sf_buf *sf;
1694	struct vm_page *pg;
1695	struct writev_args nuap;
1696	struct sf_hdtr hdtr;
1697	off_t off, xfsize, sbytes = 0;
1698	int error = 0, s;
1699
1700	mtx_lock(&Giant);
1701
1702	vp = NULL;
1703	/*
1704	 * Do argument checking. Must be a regular file in, stream
1705	 * type and connected socket out, positive offset.
1706	 */
1707	fp = holdfp(fdp, uap->fd, FREAD);
1708	if (fp == NULL) {
1709		error = EBADF;
1710		goto done;
1711	}
1712	if (fp->f_type != DTYPE_VNODE) {
1713		error = EINVAL;
1714		goto done;
1715	}
1716	vp = (struct vnode *)fp->f_data;
1717	vref(vp);
1718	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1719		error = EINVAL;
1720		goto done;
1721	}
1722	fdrop(fp, p);
1723	error = holdsock(p->p_fd, uap->s, &fp);
1724	if (error)
1725		goto done;
1726	so = (struct socket *)fp->f_data;
1727	if (so->so_type != SOCK_STREAM) {
1728		error = EINVAL;
1729		goto done;
1730	}
1731	if ((so->so_state & SS_ISCONNECTED) == 0) {
1732		error = ENOTCONN;
1733		goto done;
1734	}
1735	if (uap->offset < 0) {
1736		error = EINVAL;
1737		goto done;
1738	}
1739
1740	/*
1741	 * If specified, get the pointer to the sf_hdtr struct for
1742	 * any headers/trailers.
1743	 */
1744	if (uap->hdtr != NULL) {
1745		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1746		if (error)
1747			goto done;
1748		/*
1749		 * Send any headers. Wimp out and use writev(2).
1750		 */
1751		if (hdtr.headers != NULL) {
1752			nuap.fd = uap->s;
1753			nuap.iovp = hdtr.headers;
1754			nuap.iovcnt = hdtr.hdr_cnt;
1755			error = writev(p, &nuap);
1756			if (error)
1757				goto done;
1758			sbytes += p->p_retval[0];
1759		}
1760	}
1761
1762	/*
1763	 * Protect against multiple writers to the socket.
1764	 */
1765	(void) sblock(&so->so_snd, M_WAITOK);
1766
1767	/*
1768	 * Loop through the pages in the file, starting with the requested
1769	 * offset. Get a file page (do I/O if necessary), map the file page
1770	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1771	 * it on the socket.
1772	 */
1773	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1774		vm_pindex_t pindex;
1775		vm_offset_t pgoff;
1776
1777		pindex = OFF_TO_IDX(off);
1778retry_lookup:
1779		/*
1780		 * Calculate the amount to transfer. Not to exceed a page,
1781		 * the EOF, or the passed in nbytes.
1782		 */
1783		xfsize = obj->un_pager.vnp.vnp_size - off;
1784		if (xfsize > PAGE_SIZE)
1785			xfsize = PAGE_SIZE;
1786		pgoff = (vm_offset_t)(off & PAGE_MASK);
1787		if (PAGE_SIZE - pgoff < xfsize)
1788			xfsize = PAGE_SIZE - pgoff;
1789		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1790			xfsize = uap->nbytes - sbytes;
1791		if (xfsize <= 0)
1792			break;
1793		/*
1794		 * Optimize the non-blocking case by looking at the socket space
1795		 * before going to the extra work of constituting the sf_buf.
1796		 */
1797		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1798			if (so->so_state & SS_CANTSENDMORE)
1799				error = EPIPE;
1800			else
1801				error = EAGAIN;
1802			sbunlock(&so->so_snd);
1803			goto done;
1804		}
1805		/*
1806		 * Attempt to look up the page.
1807		 *
1808		 *	Allocate if not found
1809		 *
1810		 *	Wait and loop if busy.
1811		 */
1812		pg = vm_page_lookup(obj, pindex);
1813
1814		if (pg == NULL) {
1815			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1816			if (pg == NULL) {
1817				VM_WAIT;
1818				goto retry_lookup;
1819			}
1820			vm_page_wakeup(pg);
1821		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1822			goto retry_lookup;
1823		}
1824
1825		/*
1826		 * Wire the page so it does not get ripped out from under
1827		 * us.
1828		 */
1829
1830		vm_page_wire(pg);
1831
1832		/*
1833		 * If page is not valid for what we need, initiate I/O
1834		 */
1835
1836		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1837			struct uio auio;
1838			struct iovec aiov;
1839			int bsize;
1840
1841			/*
1842			 * Ensure that our page is still around when the I/O
1843			 * completes.
1844			 */
1845			vm_page_io_start(pg);
1846
1847			/*
1848			 * Get the page from backing store.
1849			 */
1850			bsize = vp->v_mount->mnt_stat.f_iosize;
1851			auio.uio_iov = &aiov;
1852			auio.uio_iovcnt = 1;
1853			aiov.iov_base = 0;
1854			aiov.iov_len = MAXBSIZE;
1855			auio.uio_resid = MAXBSIZE;
1856			auio.uio_offset = trunc_page(off);
1857			auio.uio_segflg = UIO_NOCOPY;
1858			auio.uio_rw = UIO_READ;
1859			auio.uio_procp = p;
1860			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1861			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1862			        p->p_ucred);
1863			VOP_UNLOCK(vp, 0, p);
1864			vm_page_flag_clear(pg, PG_ZERO);
1865			vm_page_io_finish(pg);
1866			if (error) {
1867				vm_page_unwire(pg, 0);
1868				/*
1869				 * See if anyone else might know about this page.
1870				 * If not and it is not valid, then free it.
1871				 */
1872				if (pg->wire_count == 0 && pg->valid == 0 &&
1873				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1874				    pg->hold_count == 0) {
1875					vm_page_busy(pg);
1876					vm_page_free(pg);
1877				}
1878				sbunlock(&so->so_snd);
1879				goto done;
1880			}
1881		}
1882
1883
1884		/*
1885		 * Get a sendfile buf. We usually wait as long as necessary,
1886		 * but this wait can be interrupted.
1887		 */
1888		if ((sf = sf_buf_alloc()) == NULL) {
1889			vm_page_unwire(pg, 0);
1890			if (pg->wire_count == 0 && pg->object == NULL)
1891				vm_page_free(pg);
1892			sbunlock(&so->so_snd);
1893			error = EINTR;
1894			goto done;
1895		}
1896
1897		/*
1898		 * Allocate a kernel virtual page and insert the physical page
1899		 * into it.
1900		 */
1901		sf->m = pg;
1902		pmap_qenter(sf->kva, &pg, 1);
1903		/*
1904		 * Get an mbuf header and set it up as having external storage.
1905		 */
1906		MGETHDR(m, M_TRYWAIT, MT_DATA);
1907		if (m == NULL) {
1908			error = ENOBUFS;
1909			sf_buf_free((void *)sf->kva, NULL);
1910			sbunlock(&so->so_snd);
1911			goto done;
1912		}
1913		/*
1914		 * Setup external storage for mbuf.
1915		 */
1916		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
1917		    EXT_SFBUF);
1918		m->m_data = (char *) sf->kva + pgoff;
1919		m->m_pkthdr.len = m->m_len = xfsize;
1920		/*
1921		 * Add the buffer to the socket buffer chain.
1922		 */
1923		s = splnet();
1924retry_space:
1925		/*
1926		 * Make sure that the socket is still able to take more data.
1927		 * CANTSENDMORE being true usually means that the connection
1928		 * was closed. so_error is true when an error was sensed after
1929		 * a previous send.
1930		 * The state is checked after the page mapping and buffer
1931		 * allocation above since those operations may block and make
1932		 * any socket checks stale. From this point forward, nothing
1933		 * blocks before the pru_send (or more accurately, any blocking
1934		 * results in a loop back to here to re-check).
1935		 */
1936		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1937			if (so->so_state & SS_CANTSENDMORE) {
1938				error = EPIPE;
1939			} else {
1940				error = so->so_error;
1941				so->so_error = 0;
1942			}
1943			m_freem(m);
1944			sbunlock(&so->so_snd);
1945			splx(s);
1946			goto done;
1947		}
1948		/*
1949		 * Wait for socket space to become available. We do this just
1950		 * after checking the connection state above in order to avoid
1951		 * a race condition with sbwait().
1952		 */
1953		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1954			if (so->so_state & SS_NBIO) {
1955				m_freem(m);
1956				sbunlock(&so->so_snd);
1957				splx(s);
1958				error = EAGAIN;
1959				goto done;
1960			}
1961			error = sbwait(&so->so_snd);
1962			/*
1963			 * An error from sbwait usually indicates that we've
1964			 * been interrupted by a signal. If we've sent anything
1965			 * then return bytes sent, otherwise return the error.
1966			 */
1967			if (error) {
1968				m_freem(m);
1969				sbunlock(&so->so_snd);
1970				splx(s);
1971				goto done;
1972			}
1973			goto retry_space;
1974		}
1975		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1976		splx(s);
1977		if (error) {
1978			sbunlock(&so->so_snd);
1979			goto done;
1980		}
1981	}
1982	sbunlock(&so->so_snd);
1983
1984	/*
1985	 * Send trailers. Wimp out and use writev(2).
1986	 */
1987	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1988			nuap.fd = uap->s;
1989			nuap.iovp = hdtr.trailers;
1990			nuap.iovcnt = hdtr.trl_cnt;
1991			error = writev(p, &nuap);
1992			if (error)
1993				goto done;
1994			sbytes += p->p_retval[0];
1995	}
1996
1997done:
1998	/*
1999	 * If there was no error we have to clear p->p_retval[0]
2000	 * because it may have been set by writev.
2001	 */
2002	if (error == 0) {
2003		p->p_retval[0] = 0;
2004	}
2005	if (uap->sbytes != NULL) {
2006		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2007	}
2008	if (vp)
2009		vrele(vp);
2010	if (fp)
2011		fdrop(fp, p);
2012	mtx_unlock(&Giant);
2013	return (error);
2014}
2015
2016