kern_sendfile.c revision 78699
1/*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37 * $FreeBSD: head/sys/kern/uipc_syscalls.c 78699 2001-06-24 12:27:30Z dwmalone $
38 */
39
40#include "opt_compat.h"
41#include "opt_ktrace.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/mutex.h>
48#include <sys/sysproto.h>
49#include <sys/malloc.h>
50#include <sys/filedesc.h>
51#include <sys/event.h>
52#include <sys/proc.h>
53#include <sys/fcntl.h>
54#include <sys/file.h>
55#include <sys/lock.h>
56#include <sys/mount.h>
57#include <sys/mbuf.h>
58#include <sys/protosw.h>
59#include <sys/socket.h>
60#include <sys/socketvar.h>
61#include <sys/signalvar.h>
62#include <sys/uio.h>
63#include <sys/vnode.h>
64#ifdef KTRACE
65#include <sys/ktrace.h>
66#endif
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_page.h>
71#include <vm/vm_pageout.h>
72#include <vm/vm_kern.h>
73#include <vm/vm_extern.h>
74
75static void sf_buf_init(void *arg);
76SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
77static struct sf_buf *sf_buf_alloc(void);
78static void sf_buf_free(caddr_t addr, void *args);
79
80static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
81static int recvit __P((struct proc *p, int s, struct msghdr *mp,
82		       caddr_t namelenp));
83
84static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
85static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
86			     int compat));
87static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
88			     int compat));
89
90/*
91 * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the
92 * sf_freelist head with the sf_lock mutex.
93 */
94static struct {
95	SLIST_HEAD(, sf_buf) sf_head;
96	struct mtx sf_lock;
97} sf_freelist;
98
99static vm_offset_t sf_base;
100static struct sf_buf *sf_bufs;
101static u_int sf_buf_alloc_want;
102
103/*
104 * System call interface to the socket abstraction.
105 */
106#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
107#define COMPAT_OLDSOCK
108#endif
109
110extern	struct fileops socketops;
111
112int
113socket(p, uap)
114	struct proc *p;
115	register struct socket_args /* {
116		int	domain;
117		int	type;
118		int	protocol;
119	} */ *uap;
120{
121	struct filedesc *fdp = p->p_fd;
122	struct socket *so;
123	struct file *fp;
124	int fd, error;
125
126	error = falloc(p, &fp, &fd);
127	if (error)
128		return (error);
129	fhold(fp);
130	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
131	if (error) {
132		if (fdp->fd_ofiles[fd] == fp) {
133			fdp->fd_ofiles[fd] = NULL;
134			fdrop(fp, p);
135		}
136	} else {
137		fp->f_data = (caddr_t)so;
138		fp->f_flag = FREAD|FWRITE;
139		fp->f_ops = &socketops;
140		fp->f_type = DTYPE_SOCKET;
141		p->p_retval[0] = fd;
142	}
143	fdrop(fp, p);
144	return (error);
145}
146
147/* ARGSUSED */
148int
149bind(p, uap)
150	struct proc *p;
151	register struct bind_args /* {
152		int	s;
153		caddr_t	name;
154		int	namelen;
155	} */ *uap;
156{
157	struct file *fp;
158	struct sockaddr *sa;
159	int error;
160
161	error = holdsock(p->p_fd, uap->s, &fp);
162	if (error)
163		return (error);
164	error = getsockaddr(&sa, uap->name, uap->namelen);
165	if (error) {
166		fdrop(fp, p);
167		return (error);
168	}
169	error = sobind((struct socket *)fp->f_data, sa, p);
170	FREE(sa, M_SONAME);
171	fdrop(fp, p);
172	return (error);
173}
174
175/* ARGSUSED */
176int
177listen(p, uap)
178	struct proc *p;
179	register struct listen_args /* {
180		int	s;
181		int	backlog;
182	} */ *uap;
183{
184	struct file *fp;
185	int error;
186
187	error = holdsock(p->p_fd, uap->s, &fp);
188	if (error)
189		return (error);
190	error = solisten((struct socket *)fp->f_data, uap->backlog, p);
191	fdrop(fp, p);
192	return(error);
193}
194
195static int
196accept1(p, uap, compat)
197	struct proc *p;
198	register struct accept_args /* {
199		int	s;
200		caddr_t	name;
201		int	*anamelen;
202	} */ *uap;
203	int compat;
204{
205	struct filedesc *fdp = p->p_fd;
206	struct file *lfp = NULL;
207	struct file *nfp = NULL;
208	struct sockaddr *sa;
209	int namelen, error, s;
210	struct socket *head, *so;
211	int fd;
212	short fflag;		/* type must match fp->f_flag */
213
214	if (uap->name) {
215		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
216			sizeof (namelen));
217		if(error)
218			return (error);
219	}
220	error = holdsock(fdp, uap->s, &lfp);
221	if (error)
222		return (error);
223	s = splnet();
224	head = (struct socket *)lfp->f_data;
225	if ((head->so_options & SO_ACCEPTCONN) == 0) {
226		splx(s);
227		error = EINVAL;
228		goto done;
229	}
230	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
231		splx(s);
232		error = EWOULDBLOCK;
233		goto done;
234	}
235	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
236		if (head->so_state & SS_CANTRCVMORE) {
237			head->so_error = ECONNABORTED;
238			break;
239		}
240		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
241		    "accept", 0);
242		if (error) {
243			splx(s);
244			goto done;
245		}
246	}
247	if (head->so_error) {
248		error = head->so_error;
249		head->so_error = 0;
250		splx(s);
251		goto done;
252	}
253
254	/*
255	 * At this point we know that there is at least one connection
256	 * ready to be accepted. Remove it from the queue prior to
257	 * allocating the file descriptor for it since falloc() may
258	 * block allowing another process to accept the connection
259	 * instead.
260	 */
261	so = TAILQ_FIRST(&head->so_comp);
262	TAILQ_REMOVE(&head->so_comp, so, so_list);
263	head->so_qlen--;
264
265	fflag = lfp->f_flag;
266	error = falloc(p, &nfp, &fd);
267	if (error) {
268		/*
269		 * Probably ran out of file descriptors. Put the
270		 * unaccepted connection back onto the queue and
271		 * do another wakeup so some other process might
272		 * have a chance at it.
273		 */
274		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
275		head->so_qlen++;
276		wakeup_one(&head->so_timeo);
277		splx(s);
278		goto done;
279	}
280	fhold(nfp);
281	p->p_retval[0] = fd;
282
283	/* connection has been removed from the listen queue */
284	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
285
286	so->so_state &= ~SS_COMP;
287	so->so_head = NULL;
288	if (head->so_sigio != NULL)
289		fsetown(fgetown(head->so_sigio), &so->so_sigio);
290
291	nfp->f_data = (caddr_t)so;
292	nfp->f_flag = fflag;
293	nfp->f_ops = &socketops;
294	nfp->f_type = DTYPE_SOCKET;
295	sa = 0;
296	error = soaccept(so, &sa);
297	if (error) {
298		/*
299		 * return a namelen of zero for older code which might
300	 	 * ignore the return value from accept.
301		 */
302		if (uap->name != NULL) {
303			namelen = 0;
304			(void) copyout((caddr_t)&namelen,
305			    (caddr_t)uap->anamelen, sizeof(*uap->anamelen));
306		}
307		goto noconnection;
308	}
309	if (sa == NULL) {
310		namelen = 0;
311		if (uap->name)
312			goto gotnoname;
313		splx(s);
314		error = 0;
315		goto done;
316	}
317	if (uap->name) {
318		/* check sa_len before it is destroyed */
319		if (namelen > sa->sa_len)
320			namelen = sa->sa_len;
321#ifdef COMPAT_OLDSOCK
322		if (compat)
323			((struct osockaddr *)sa)->sa_family =
324			    sa->sa_family;
325#endif
326		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
327		if (!error)
328gotnoname:
329			error = copyout((caddr_t)&namelen,
330			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
331	}
332noconnection:
333	if (sa)
334		FREE(sa, M_SONAME);
335
336	/*
337	 * close the new descriptor, assuming someone hasn't ripped it
338	 * out from under us.
339	 */
340	if (error) {
341		if (fdp->fd_ofiles[fd] == nfp) {
342			fdp->fd_ofiles[fd] = NULL;
343			fdrop(nfp, p);
344		}
345	}
346	splx(s);
347
348	/*
349	 * Release explicitly held references before returning.
350	 */
351done:
352	if (nfp != NULL)
353		fdrop(nfp, p);
354	fdrop(lfp, p);
355	return (error);
356}
357
/*
 * accept(2): native entry point; runs accept1() with compat disabled.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
366
367#ifdef COMPAT_OLDSOCK
/*
 * Old-socket accept: runs accept1() with compat enabled.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
376#endif /* COMPAT_OLDSOCK */
377
378/* ARGSUSED */
379int
380connect(p, uap)
381	struct proc *p;
382	register struct connect_args /* {
383		int	s;
384		caddr_t	name;
385		int	namelen;
386	} */ *uap;
387{
388	struct file *fp;
389	register struct socket *so;
390	struct sockaddr *sa;
391	int error, s;
392
393	error = holdsock(p->p_fd, uap->s, &fp);
394	if (error)
395		return (error);
396	so = (struct socket *)fp->f_data;
397	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
398		error = EALREADY;
399		goto done;
400	}
401	error = getsockaddr(&sa, uap->name, uap->namelen);
402	if (error)
403		goto done;
404	error = soconnect(so, sa, p);
405	if (error)
406		goto bad;
407	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
408		FREE(sa, M_SONAME);
409		error = EINPROGRESS;
410		goto done;
411	}
412	s = splnet();
413	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
414		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
415		    "connec", 0);
416		if (error)
417			break;
418	}
419	if (error == 0) {
420		error = so->so_error;
421		so->so_error = 0;
422	}
423	splx(s);
424bad:
425	so->so_state &= ~SS_ISCONNECTING;
426	FREE(sa, M_SONAME);
427	if (error == ERESTART)
428		error = EINTR;
429done:
430	fdrop(fp, p);
431	return (error);
432}
433
434int
435socketpair(p, uap)
436	struct proc *p;
437	register struct socketpair_args /* {
438		int	domain;
439		int	type;
440		int	protocol;
441		int	*rsv;
442	} */ *uap;
443{
444	register struct filedesc *fdp = p->p_fd;
445	struct file *fp1, *fp2;
446	struct socket *so1, *so2;
447	int fd, error, sv[2];
448
449	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
450	if (error)
451		return (error);
452	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
453	if (error)
454		goto free1;
455	error = falloc(p, &fp1, &fd);
456	if (error)
457		goto free2;
458	fhold(fp1);
459	sv[0] = fd;
460	fp1->f_data = (caddr_t)so1;
461	error = falloc(p, &fp2, &fd);
462	if (error)
463		goto free3;
464	fhold(fp2);
465	fp2->f_data = (caddr_t)so2;
466	sv[1] = fd;
467	error = soconnect2(so1, so2);
468	if (error)
469		goto free4;
470	if (uap->type == SOCK_DGRAM) {
471		/*
472		 * Datagram socket connection is asymmetric.
473		 */
474		 error = soconnect2(so2, so1);
475		 if (error)
476			goto free4;
477	}
478	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
479	fp1->f_ops = fp2->f_ops = &socketops;
480	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
481	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
482	fdrop(fp1, p);
483	fdrop(fp2, p);
484	return (error);
485free4:
486	if (fdp->fd_ofiles[sv[1]] == fp2) {
487		fdp->fd_ofiles[sv[1]] = NULL;
488		fdrop(fp2, p);
489	}
490	fdrop(fp2, p);
491free3:
492	if (fdp->fd_ofiles[sv[0]] == fp1) {
493		fdp->fd_ofiles[sv[0]] = NULL;
494		fdrop(fp1, p);
495	}
496	fdrop(fp1, p);
497free2:
498	(void)soclose(so2);
499free1:
500	(void)soclose(so1);
501	return (error);
502}
503
504static int
505sendit(p, s, mp, flags)
506	register struct proc *p;
507	int s;
508	register struct msghdr *mp;
509	int flags;
510{
511	struct file *fp;
512	struct uio auio;
513	register struct iovec *iov;
514	register int i;
515	struct mbuf *control;
516	struct sockaddr *to;
517	int len, error;
518	struct socket *so;
519#ifdef KTRACE
520	struct iovec *ktriov = NULL;
521	struct uio ktruio;
522#endif
523
524	error = holdsock(p->p_fd, s, &fp);
525	if (error)
526		return (error);
527	auio.uio_iov = mp->msg_iov;
528	auio.uio_iovcnt = mp->msg_iovlen;
529	auio.uio_segflg = UIO_USERSPACE;
530	auio.uio_rw = UIO_WRITE;
531	auio.uio_procp = p;
532	auio.uio_offset = 0;			/* XXX */
533	auio.uio_resid = 0;
534	iov = mp->msg_iov;
535	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
536		if ((auio.uio_resid += iov->iov_len) < 0) {
537			fdrop(fp, p);
538			return (EINVAL);
539		}
540	}
541	if (mp->msg_name) {
542		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
543		if (error) {
544			fdrop(fp, p);
545			return (error);
546		}
547	} else {
548		to = 0;
549	}
550	if (mp->msg_control) {
551		if (mp->msg_controllen < sizeof(struct cmsghdr)
552#ifdef COMPAT_OLDSOCK
553		    && mp->msg_flags != MSG_COMPAT
554#endif
555		) {
556			error = EINVAL;
557			goto bad;
558		}
559		error = sockargs(&control, mp->msg_control,
560		    mp->msg_controllen, MT_CONTROL);
561		if (error)
562			goto bad;
563#ifdef COMPAT_OLDSOCK
564		if (mp->msg_flags == MSG_COMPAT) {
565			register struct cmsghdr *cm;
566
567			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
568			if (control == 0) {
569				error = ENOBUFS;
570				goto bad;
571			} else {
572				cm = mtod(control, struct cmsghdr *);
573				cm->cmsg_len = control->m_len;
574				cm->cmsg_level = SOL_SOCKET;
575				cm->cmsg_type = SCM_RIGHTS;
576			}
577		}
578#endif
579	} else {
580		control = 0;
581	}
582#ifdef KTRACE
583	if (KTRPOINT(p, KTR_GENIO)) {
584		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
585
586		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
587		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
588		ktruio = auio;
589	}
590#endif
591	len = auio.uio_resid;
592	so = (struct socket *)fp->f_data;
593	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
594						     flags, p);
595	if (error) {
596		if (auio.uio_resid != len && (error == ERESTART ||
597		    error == EINTR || error == EWOULDBLOCK))
598			error = 0;
599		if (error == EPIPE) {
600			PROC_LOCK(p);
601			psignal(p, SIGPIPE);
602			PROC_UNLOCK(p);
603		}
604	}
605	if (error == 0)
606		p->p_retval[0] = len - auio.uio_resid;
607#ifdef KTRACE
608	if (ktriov != NULL) {
609		if (error == 0) {
610			ktruio.uio_iov = ktriov;
611			ktruio.uio_resid = p->p_retval[0];
612			ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
613		}
614		FREE(ktriov, M_TEMP);
615	}
616#endif
617bad:
618	fdrop(fp, p);
619	if (to)
620		FREE(to, M_SONAME);
621	return (error);
622}
623
624int
625sendto(p, uap)
626	struct proc *p;
627	register struct sendto_args /* {
628		int	s;
629		caddr_t	buf;
630		size_t	len;
631		int	flags;
632		caddr_t	to;
633		int	tolen;
634	} */ *uap;
635{
636	struct msghdr msg;
637	struct iovec aiov;
638
639	msg.msg_name = uap->to;
640	msg.msg_namelen = uap->tolen;
641	msg.msg_iov = &aiov;
642	msg.msg_iovlen = 1;
643	msg.msg_control = 0;
644#ifdef COMPAT_OLDSOCK
645	msg.msg_flags = 0;
646#endif
647	aiov.iov_base = uap->buf;
648	aiov.iov_len = uap->len;
649	return (sendit(p, uap->s, &msg, uap->flags));
650}
651
652#ifdef COMPAT_OLDSOCK
653int
654osend(p, uap)
655	struct proc *p;
656	register struct osend_args /* {
657		int	s;
658		caddr_t	buf;
659		int	len;
660		int	flags;
661	} */ *uap;
662{
663	struct msghdr msg;
664	struct iovec aiov;
665
666	msg.msg_name = 0;
667	msg.msg_namelen = 0;
668	msg.msg_iov = &aiov;
669	msg.msg_iovlen = 1;
670	aiov.iov_base = uap->buf;
671	aiov.iov_len = uap->len;
672	msg.msg_control = 0;
673	msg.msg_flags = 0;
674	return (sendit(p, uap->s, &msg, uap->flags));
675}
676
677int
678osendmsg(p, uap)
679	struct proc *p;
680	register struct osendmsg_args /* {
681		int	s;
682		caddr_t	msg;
683		int	flags;
684	} */ *uap;
685{
686	struct msghdr msg;
687	struct iovec aiov[UIO_SMALLIOV], *iov;
688	int error;
689
690	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
691	if (error)
692		return (error);
693	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
694		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
695			return (EMSGSIZE);
696		MALLOC(iov, struct iovec *,
697		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
698		      M_WAITOK);
699	} else
700		iov = aiov;
701	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
702	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
703	if (error)
704		goto done;
705	msg.msg_flags = MSG_COMPAT;
706	msg.msg_iov = iov;
707	error = sendit(p, uap->s, &msg, uap->flags);
708done:
709	if (iov != aiov)
710		FREE(iov, M_IOV);
711	return (error);
712}
713#endif
714
715int
716sendmsg(p, uap)
717	struct proc *p;
718	register struct sendmsg_args /* {
719		int	s;
720		caddr_t	msg;
721		int	flags;
722	} */ *uap;
723{
724	struct msghdr msg;
725	struct iovec aiov[UIO_SMALLIOV], *iov;
726	int error;
727
728	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
729	if (error)
730		return (error);
731	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
732		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
733			return (EMSGSIZE);
734		MALLOC(iov, struct iovec *,
735		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
736		       M_WAITOK);
737	} else
738		iov = aiov;
739	if (msg.msg_iovlen &&
740	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
741	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
742		goto done;
743	msg.msg_iov = iov;
744#ifdef COMPAT_OLDSOCK
745	msg.msg_flags = 0;
746#endif
747	error = sendit(p, uap->s, &msg, uap->flags);
748done:
749	if (iov != aiov)
750		FREE(iov, M_IOV);
751	return (error);
752}
753
754static int
755recvit(p, s, mp, namelenp)
756	register struct proc *p;
757	int s;
758	register struct msghdr *mp;
759	caddr_t namelenp;
760{
761	struct file *fp;
762	struct uio auio;
763	register struct iovec *iov;
764	register int i;
765	int len, error;
766	struct mbuf *m, *control = 0;
767	caddr_t ctlbuf;
768	struct socket *so;
769	struct sockaddr *fromsa = 0;
770#ifdef KTRACE
771	struct iovec *ktriov = NULL;
772	struct uio ktruio;
773#endif
774
775	error = holdsock(p->p_fd, s, &fp);
776	if (error)
777		return (error);
778	auio.uio_iov = mp->msg_iov;
779	auio.uio_iovcnt = mp->msg_iovlen;
780	auio.uio_segflg = UIO_USERSPACE;
781	auio.uio_rw = UIO_READ;
782	auio.uio_procp = p;
783	auio.uio_offset = 0;			/* XXX */
784	auio.uio_resid = 0;
785	iov = mp->msg_iov;
786	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
787		if ((auio.uio_resid += iov->iov_len) < 0) {
788			fdrop(fp, p);
789			return (EINVAL);
790		}
791	}
792#ifdef KTRACE
793	if (KTRPOINT(p, KTR_GENIO)) {
794		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
795
796		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
797		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
798		ktruio = auio;
799	}
800#endif
801	len = auio.uio_resid;
802	so = (struct socket *)fp->f_data;
803	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
804	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
805	    &mp->msg_flags);
806	if (error) {
807		if (auio.uio_resid != len && (error == ERESTART ||
808		    error == EINTR || error == EWOULDBLOCK))
809			error = 0;
810	}
811#ifdef KTRACE
812	if (ktriov != NULL) {
813		if (error == 0) {
814			ktruio.uio_iov = ktriov;
815			ktruio.uio_resid = len - auio.uio_resid;
816			ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
817		}
818		FREE(ktriov, M_TEMP);
819	}
820#endif
821	if (error)
822		goto out;
823	p->p_retval[0] = len - auio.uio_resid;
824	if (mp->msg_name) {
825		len = mp->msg_namelen;
826		if (len <= 0 || fromsa == 0)
827			len = 0;
828		else {
829#ifndef MIN
830#define MIN(a,b) ((a)>(b)?(b):(a))
831#endif
832			/* save sa_len before it is destroyed by MSG_COMPAT */
833			len = MIN(len, fromsa->sa_len);
834#ifdef COMPAT_OLDSOCK
835			if (mp->msg_flags & MSG_COMPAT)
836				((struct osockaddr *)fromsa)->sa_family =
837				    fromsa->sa_family;
838#endif
839			error = copyout(fromsa,
840			    (caddr_t)mp->msg_name, (unsigned)len);
841			if (error)
842				goto out;
843		}
844		mp->msg_namelen = len;
845		if (namelenp &&
846		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
847#ifdef COMPAT_OLDSOCK
848			if (mp->msg_flags & MSG_COMPAT)
849				error = 0;	/* old recvfrom didn't check */
850			else
851#endif
852			goto out;
853		}
854	}
855	if (mp->msg_control) {
856#ifdef COMPAT_OLDSOCK
857		/*
858		 * We assume that old recvmsg calls won't receive access
859		 * rights and other control info, esp. as control info
860		 * is always optional and those options didn't exist in 4.3.
861		 * If we receive rights, trim the cmsghdr; anything else
862		 * is tossed.
863		 */
864		if (control && mp->msg_flags & MSG_COMPAT) {
865			if (mtod(control, struct cmsghdr *)->cmsg_level !=
866			    SOL_SOCKET ||
867			    mtod(control, struct cmsghdr *)->cmsg_type !=
868			    SCM_RIGHTS) {
869				mp->msg_controllen = 0;
870				goto out;
871			}
872			control->m_len -= sizeof (struct cmsghdr);
873			control->m_data += sizeof (struct cmsghdr);
874		}
875#endif
876		len = mp->msg_controllen;
877		m = control;
878		mp->msg_controllen = 0;
879		ctlbuf = (caddr_t) mp->msg_control;
880
881		while (m && len > 0) {
882			unsigned int tocopy;
883
884			if (len >= m->m_len)
885				tocopy = m->m_len;
886			else {
887				mp->msg_flags |= MSG_CTRUNC;
888				tocopy = len;
889			}
890
891			if ((error = copyout((caddr_t)mtod(m, caddr_t),
892					ctlbuf, tocopy)) != 0)
893				goto out;
894
895			ctlbuf += tocopy;
896			len -= tocopy;
897			m = m->m_next;
898		}
899		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
900	}
901out:
902	fdrop(fp, p);
903	if (fromsa)
904		FREE(fromsa, M_SONAME);
905	if (control)
906		m_freem(control);
907	return (error);
908}
909
910int
911recvfrom(p, uap)
912	struct proc *p;
913	register struct recvfrom_args /* {
914		int	s;
915		caddr_t	buf;
916		size_t	len;
917		int	flags;
918		caddr_t	from;
919		int	*fromlenaddr;
920	} */ *uap;
921{
922	struct msghdr msg;
923	struct iovec aiov;
924	int error;
925
926	if (uap->fromlenaddr) {
927		error = copyin((caddr_t)uap->fromlenaddr,
928		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
929		if (error)
930			return (error);
931	} else
932		msg.msg_namelen = 0;
933	msg.msg_name = uap->from;
934	msg.msg_iov = &aiov;
935	msg.msg_iovlen = 1;
936	aiov.iov_base = uap->buf;
937	aiov.iov_len = uap->len;
938	msg.msg_control = 0;
939	msg.msg_flags = uap->flags;
940	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
941}
942
943#ifdef COMPAT_OLDSOCK
944int
945orecvfrom(p, uap)
946	struct proc *p;
947	struct recvfrom_args *uap;
948{
949
950	uap->flags |= MSG_COMPAT;
951	return (recvfrom(p, uap));
952}
953#endif
954
955
956#ifdef COMPAT_OLDSOCK
957int
958orecv(p, uap)
959	struct proc *p;
960	register struct orecv_args /* {
961		int	s;
962		caddr_t	buf;
963		int	len;
964		int	flags;
965	} */ *uap;
966{
967	struct msghdr msg;
968	struct iovec aiov;
969
970	msg.msg_name = 0;
971	msg.msg_namelen = 0;
972	msg.msg_iov = &aiov;
973	msg.msg_iovlen = 1;
974	aiov.iov_base = uap->buf;
975	aiov.iov_len = uap->len;
976	msg.msg_control = 0;
977	msg.msg_flags = uap->flags;
978	return (recvit(p, uap->s, &msg, (caddr_t)0));
979}
980
981/*
982 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
983 * overlays the new one, missing only the flags, and with the (old) access
984 * rights where the control fields are now.
985 */
986int
987orecvmsg(p, uap)
988	struct proc *p;
989	register struct orecvmsg_args /* {
990		int	s;
991		struct	omsghdr *msg;
992		int	flags;
993	} */ *uap;
994{
995	struct msghdr msg;
996	struct iovec aiov[UIO_SMALLIOV], *iov;
997	int error;
998
999	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
1000	    sizeof (struct omsghdr));
1001	if (error)
1002		return (error);
1003	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1004		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
1005			return (EMSGSIZE);
1006		MALLOC(iov, struct iovec *,
1007		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1008		      M_WAITOK);
1009	} else
1010		iov = aiov;
1011	msg.msg_flags = uap->flags | MSG_COMPAT;
1012	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
1013	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1014	if (error)
1015		goto done;
1016	msg.msg_iov = iov;
1017	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
1018
1019	if (msg.msg_controllen && error == 0)
1020		error = copyout((caddr_t)&msg.msg_controllen,
1021		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
1022done:
1023	if (iov != aiov)
1024		FREE(iov, M_IOV);
1025	return (error);
1026}
1027#endif
1028
1029int
1030recvmsg(p, uap)
1031	struct proc *p;
1032	register struct recvmsg_args /* {
1033		int	s;
1034		struct	msghdr *msg;
1035		int	flags;
1036	} */ *uap;
1037{
1038	struct msghdr msg;
1039	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1040	register int error;
1041
1042	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
1043	if (error)
1044		return (error);
1045	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1046		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
1047			return (EMSGSIZE);
1048		MALLOC(iov, struct iovec *,
1049		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1050		       M_WAITOK);
1051	} else
1052		iov = aiov;
1053#ifdef COMPAT_OLDSOCK
1054	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1055#else
1056	msg.msg_flags = uap->flags;
1057#endif
1058	uiov = msg.msg_iov;
1059	msg.msg_iov = iov;
1060	error = copyin((caddr_t)uiov, (caddr_t)iov,
1061	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1062	if (error)
1063		goto done;
1064	error = recvit(p, uap->s, &msg, (caddr_t)0);
1065	if (!error) {
1066		msg.msg_iov = uiov;
1067		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
1068	}
1069done:
1070	if (iov != aiov)
1071		FREE(iov, M_IOV);
1072	return (error);
1073}
1074
1075/* ARGSUSED */
1076int
1077shutdown(p, uap)
1078	struct proc *p;
1079	register struct shutdown_args /* {
1080		int	s;
1081		int	how;
1082	} */ *uap;
1083{
1084	struct file *fp;
1085	int error;
1086
1087	error = holdsock(p->p_fd, uap->s, &fp);
1088	if (error)
1089		return (error);
1090	error = soshutdown((struct socket *)fp->f_data, uap->how);
1091	fdrop(fp, p);
1092	return(error);
1093}
1094
1095/* ARGSUSED */
1096int
1097setsockopt(p, uap)
1098	struct proc *p;
1099	register struct setsockopt_args /* {
1100		int	s;
1101		int	level;
1102		int	name;
1103		caddr_t	val;
1104		int	valsize;
1105	} */ *uap;
1106{
1107	struct file *fp;
1108	struct sockopt sopt;
1109	int error;
1110
1111	if (uap->val == 0 && uap->valsize != 0)
1112		return (EFAULT);
1113	if (uap->valsize < 0)
1114		return (EINVAL);
1115
1116	error = holdsock(p->p_fd, uap->s, &fp);
1117	if (error)
1118		return (error);
1119
1120	sopt.sopt_dir = SOPT_SET;
1121	sopt.sopt_level = uap->level;
1122	sopt.sopt_name = uap->name;
1123	sopt.sopt_val = uap->val;
1124	sopt.sopt_valsize = uap->valsize;
1125	sopt.sopt_p = p;
1126	error = sosetopt((struct socket *)fp->f_data, &sopt);
1127	fdrop(fp, p);
1128	return(error);
1129}
1130
1131/* ARGSUSED */
1132int
1133getsockopt(p, uap)
1134	struct proc *p;
1135	register struct getsockopt_args /* {
1136		int	s;
1137		int	level;
1138		int	name;
1139		caddr_t	val;
1140		int	*avalsize;
1141	} */ *uap;
1142{
1143	int	valsize, error;
1144	struct	file *fp;
1145	struct	sockopt sopt;
1146
1147	error = holdsock(p->p_fd, uap->s, &fp);
1148	if (error)
1149		return (error);
1150	if (uap->val) {
1151		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1152		    sizeof (valsize));
1153		if (error) {
1154			fdrop(fp, p);
1155			return (error);
1156		}
1157		if (valsize < 0) {
1158			fdrop(fp, p);
1159			return (EINVAL);
1160		}
1161	} else {
1162		valsize = 0;
1163	}
1164
1165	sopt.sopt_dir = SOPT_GET;
1166	sopt.sopt_level = uap->level;
1167	sopt.sopt_name = uap->name;
1168	sopt.sopt_val = uap->val;
1169	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1170	sopt.sopt_p = p;
1171
1172	error = sogetopt((struct socket *)fp->f_data, &sopt);
1173	if (error == 0) {
1174		valsize = sopt.sopt_valsize;
1175		error = copyout((caddr_t)&valsize,
1176				(caddr_t)uap->avalsize, sizeof (valsize));
1177	}
1178	fdrop(fp, p);
1179	return (error);
1180}
1181
1182/*
1183 * Get socket name.
1184 */
1185/* ARGSUSED */
1186static int
1187getsockname1(p, uap, compat)
1188	struct proc *p;
1189	register struct getsockname_args /* {
1190		int	fdes;
1191		caddr_t	asa;
1192		int	*alen;
1193	} */ *uap;
1194	int compat;
1195{
1196	struct file *fp;
1197	register struct socket *so;
1198	struct sockaddr *sa;
1199	int len, error;
1200
1201	error = holdsock(p->p_fd, uap->fdes, &fp);
1202	if (error)
1203		return (error);
1204	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1205	if (error) {
1206		fdrop(fp, p);
1207		return (error);
1208	}
1209	so = (struct socket *)fp->f_data;
1210	sa = 0;
1211	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1212	if (error)
1213		goto bad;
1214	if (sa == 0) {
1215		len = 0;
1216		goto gotnothing;
1217	}
1218
1219	len = MIN(len, sa->sa_len);
1220#ifdef COMPAT_OLDSOCK
1221	if (compat)
1222		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1223#endif
1224	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1225	if (error == 0)
1226gotnothing:
1227		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1228		    sizeof (len));
1229bad:
1230	if (sa)
1231		FREE(sa, M_SONAME);
1232	fdrop(fp, p);
1233	return (error);
1234}
1235
/*
 * getsockname(2): native entry point; runs getsockname1() with compat
 * disabled.
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1244
1245#ifdef COMPAT_OLDSOCK
/*
 * Old-socket getsockname: runs getsockname1() with compat enabled.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
1254#endif /* COMPAT_OLDSOCK */
1255
1256/*
1257 * Get name of peer for connected socket.
1258 */
1259/* ARGSUSED */
1260static int
1261getpeername1(p, uap, compat)
1262	struct proc *p;
1263	register struct getpeername_args /* {
1264		int	fdes;
1265		caddr_t	asa;
1266		int	*alen;
1267	} */ *uap;
1268	int compat;
1269{
1270	struct file *fp;
1271	register struct socket *so;
1272	struct sockaddr *sa;
1273	int len, error;
1274
1275	error = holdsock(p->p_fd, uap->fdes, &fp);
1276	if (error)
1277		return (error);
1278	so = (struct socket *)fp->f_data;
1279	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1280		fdrop(fp, p);
1281		return (ENOTCONN);
1282	}
1283	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1284	if (error) {
1285		fdrop(fp, p);
1286		return (error);
1287	}
1288	sa = 0;
1289	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1290	if (error)
1291		goto bad;
1292	if (sa == 0) {
1293		len = 0;
1294		goto gotnothing;
1295	}
1296	len = MIN(len, sa->sa_len);
1297#ifdef COMPAT_OLDSOCK
1298	if (compat)
1299		((struct osockaddr *)sa)->sa_family =
1300		    sa->sa_family;
1301#endif
1302	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1303	if (error)
1304		goto bad;
1305gotnothing:
1306	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1307bad:
1308	if (sa)
1309		FREE(sa, M_SONAME);
1310	fdrop(fp, p);
1311	return (error);
1312}
1313
/*
 * getpeername(2) entry point.  Delegates to getpeername1() with the
 * compat flag cleared (third argument 0).
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1322
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI flavour of getpeername(2), only built when
 * COMPAT_OLDSOCK is defined.  Delegates to getpeername1() with the
 * compat flag set to 1.
 */
int
ogetpeername(p, uap)
	struct proc *p;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1334
1335int
1336sockargs(mp, buf, buflen, type)
1337	struct mbuf **mp;
1338	caddr_t buf;
1339	int buflen, type;
1340{
1341	register struct sockaddr *sa;
1342	register struct mbuf *m;
1343	int error;
1344
1345	if ((u_int)buflen > MLEN) {
1346#ifdef COMPAT_OLDSOCK
1347		if (type == MT_SONAME && (u_int)buflen <= 112)
1348			buflen = MLEN;		/* unix domain compat. hack */
1349		else
1350#endif
1351		return (EINVAL);
1352	}
1353	m = m_get(M_TRYWAIT, type);
1354	if (m == NULL)
1355		return (ENOBUFS);
1356	m->m_len = buflen;
1357	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1358	if (error)
1359		(void) m_free(m);
1360	else {
1361		*mp = m;
1362		if (type == MT_SONAME) {
1363			sa = mtod(m, struct sockaddr *);
1364
1365#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1366			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1367				sa->sa_family = sa->sa_len;
1368#endif
1369			sa->sa_len = buflen;
1370		}
1371	}
1372	return (error);
1373}
1374
1375int
1376getsockaddr(namp, uaddr, len)
1377	struct sockaddr **namp;
1378	caddr_t uaddr;
1379	size_t len;
1380{
1381	struct sockaddr *sa;
1382	int error;
1383
1384	if (len > SOCK_MAXADDRLEN)
1385		return ENAMETOOLONG;
1386	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1387	error = copyin(uaddr, sa, len);
1388	if (error) {
1389		FREE(sa, M_SONAME);
1390	} else {
1391#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1392		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1393			sa->sa_family = sa->sa_len;
1394#endif
1395		sa->sa_len = len;
1396		*namp = sa;
1397	}
1398	return error;
1399}
1400
1401/*
1402 * holdsock() - load the struct file pointer associated
1403 * with a socket into *fpp.  If an error occurs, non-zero
1404 * will be returned and *fpp will be set to NULL.
1405 */
1406int
1407holdsock(fdp, fdes, fpp)
1408	struct filedesc *fdp;
1409	int fdes;
1410	struct file **fpp;
1411{
1412	register struct file *fp = NULL;
1413	int error = 0;
1414
1415	if ((unsigned)fdes >= fdp->fd_nfiles ||
1416	    (fp = fdp->fd_ofiles[fdes]) == NULL) {
1417		error = EBADF;
1418	} else if (fp->f_type != DTYPE_SOCKET) {
1419		error = ENOTSOCK;
1420		fp = NULL;
1421	} else {
1422		fhold(fp);
1423	}
1424	*fpp = fp;
1425	return(error);
1426}
1427
1428/*
1429 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1430 * XXX - The sf_buf functions are currently private to sendfile(2), so have
1431 * been made static, but may be useful in the future for doing zero-copy in
1432 * other parts of the networking code.
1433 */
1434static void
1435sf_buf_init(void *arg)
1436{
1437	int i;
1438
1439	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF);
1440	mtx_lock(&sf_freelist.sf_lock);
1441	SLIST_INIT(&sf_freelist.sf_head);
1442	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1443	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
1444	    M_NOWAIT | M_ZERO);
1445	for (i = 0; i < nsfbufs; i++) {
1446		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1447		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
1448	}
1449	sf_buf_alloc_want = 0;
1450	mtx_unlock(&sf_freelist.sf_lock);
1451}
1452
1453/*
1454 * Get an sf_buf from the freelist. Will block if none are available.
1455 */
1456static struct sf_buf *
1457sf_buf_alloc()
1458{
1459	struct sf_buf *sf;
1460	int error;
1461
1462	mtx_lock(&sf_freelist.sf_lock);
1463	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
1464		sf_buf_alloc_want++;
1465		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
1466		    "sfbufa", 0);
1467		sf_buf_alloc_want--;
1468
1469		/*
1470		 * If we got a signal, don't risk going back to sleep.
1471		 */
1472		if (error)
1473			break;
1474	}
1475	if (sf != NULL)
1476		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
1477	mtx_unlock(&sf_freelist.sf_lock);
1478	return (sf);
1479}
1480
1481#define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1482
1483/*
1484 * Detatch mapped page and release resources back to the system.
1485 */
1486static void
1487sf_buf_free(caddr_t addr, void *args)
1488{
1489	struct sf_buf *sf;
1490	struct vm_page *m;
1491
1492	sf = dtosf(addr);
1493	mtx_lock(&vm_mtx);
1494	pmap_qremove((vm_offset_t)addr, 1);
1495	m = sf->m;
1496	vm_page_unwire(m, 0);
1497	/*
1498	 * Check for the object going away on us. This can
1499	 * happen since we don't hold a reference to it.
1500	 * If so, we're responsible for freeing the page.
1501	 */
1502	if (m->wire_count == 0 && m->object == NULL)
1503		vm_page_free(m);
1504	mtx_unlock(&vm_mtx);
1505	sf->m = NULL;
1506	mtx_lock(&sf_freelist.sf_lock);
1507	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
1508	if (sf_buf_alloc_want > 0)
1509		wakeup_one(&sf_freelist);
1510	mtx_unlock(&sf_freelist.sf_lock);
1511}
1512
1513/*
1514 * sendfile(2)
1515 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1516 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1517 *
1518 * Send a file specified by 'fd' and starting at 'offset' to a socket
1519 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1520 * nbytes == 0. Optionally add a header and/or trailer to the socket
1521 * output. If specified, write the total number of bytes sent into *sbytes.
1522 */
1523int
1524sendfile(struct proc *p, struct sendfile_args *uap)
1525{
1526	struct file *fp;
1527	struct filedesc *fdp = p->p_fd;
1528	struct vnode *vp;
1529	struct vm_object *obj;
1530	struct socket *so;
1531	struct mbuf *m;
1532	struct sf_buf *sf;
1533	struct vm_page *pg;
1534	struct writev_args nuap;
1535	struct sf_hdtr hdtr;
1536	off_t off, xfsize, sbytes = 0;
1537	int error = 0, s;
1538
1539	vp = NULL;
1540	/*
1541	 * Do argument checking. Must be a regular file in, stream
1542	 * type and connected socket out, positive offset.
1543	 */
1544	fp = holdfp(fdp, uap->fd, FREAD);
1545	if (fp == NULL) {
1546		error = EBADF;
1547		goto done;
1548	}
1549	if (fp->f_type != DTYPE_VNODE) {
1550		error = EINVAL;
1551		goto done;
1552	}
1553	vp = (struct vnode *)fp->f_data;
1554	vref(vp);
1555	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1556		error = EINVAL;
1557		goto done;
1558	}
1559	fdrop(fp, p);
1560	error = holdsock(p->p_fd, uap->s, &fp);
1561	if (error)
1562		goto done;
1563	so = (struct socket *)fp->f_data;
1564	if (so->so_type != SOCK_STREAM) {
1565		error = EINVAL;
1566		goto done;
1567	}
1568	if ((so->so_state & SS_ISCONNECTED) == 0) {
1569		error = ENOTCONN;
1570		goto done;
1571	}
1572	if (uap->offset < 0) {
1573		error = EINVAL;
1574		goto done;
1575	}
1576
1577	/*
1578	 * If specified, get the pointer to the sf_hdtr struct for
1579	 * any headers/trailers.
1580	 */
1581	if (uap->hdtr != NULL) {
1582		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1583		if (error)
1584			goto done;
1585		/*
1586		 * Send any headers. Wimp out and use writev(2).
1587		 */
1588		if (hdtr.headers != NULL) {
1589			nuap.fd = uap->s;
1590			nuap.iovp = hdtr.headers;
1591			nuap.iovcnt = hdtr.hdr_cnt;
1592			error = writev(p, &nuap);
1593			if (error)
1594				goto done;
1595			sbytes += p->p_retval[0];
1596		}
1597	}
1598
1599	/*
1600	 * Protect against multiple writers to the socket.
1601	 */
1602	(void) sblock(&so->so_snd, M_WAITOK);
1603
1604	/*
1605	 * Loop through the pages in the file, starting with the requested
1606	 * offset. Get a file page (do I/O if necessary), map the file page
1607	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1608	 * it on the socket.
1609	 */
1610	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1611		vm_pindex_t pindex;
1612		vm_offset_t pgoff;
1613
1614		pindex = OFF_TO_IDX(off);
1615retry_lookup:
1616		/*
1617		 * Calculate the amount to transfer. Not to exceed a page,
1618		 * the EOF, or the passed in nbytes.
1619		 */
1620		xfsize = obj->un_pager.vnp.vnp_size - off;
1621		if (xfsize > PAGE_SIZE)
1622			xfsize = PAGE_SIZE;
1623		pgoff = (vm_offset_t)(off & PAGE_MASK);
1624		if (PAGE_SIZE - pgoff < xfsize)
1625			xfsize = PAGE_SIZE - pgoff;
1626		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1627			xfsize = uap->nbytes - sbytes;
1628		if (xfsize <= 0)
1629			break;
1630		/*
1631		 * Optimize the non-blocking case by looking at the socket space
1632		 * before going to the extra work of constituting the sf_buf.
1633		 */
1634		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1635			if (so->so_state & SS_CANTSENDMORE)
1636				error = EPIPE;
1637			else
1638				error = EAGAIN;
1639			sbunlock(&so->so_snd);
1640			goto done;
1641		}
1642		/*
1643		 * Attempt to look up the page.
1644		 *
1645		 *	Allocate if not found
1646		 *
1647		 *	Wait and loop if busy.
1648		 */
1649		mtx_lock(&vm_mtx);
1650		pg = vm_page_lookup(obj, pindex);
1651
1652		if (pg == NULL) {
1653			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1654			if (pg == NULL) {
1655				VM_WAIT;
1656				mtx_unlock(&vm_mtx);
1657				goto retry_lookup;
1658			}
1659			vm_page_wakeup(pg);
1660		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1661			mtx_unlock(&vm_mtx);
1662			goto retry_lookup;
1663		}
1664
1665		/*
1666		 * Wire the page so it does not get ripped out from under
1667		 * us.
1668		 */
1669
1670		vm_page_wire(pg);
1671
1672		/*
1673		 * If page is not valid for what we need, initiate I/O
1674		 */
1675
1676		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1677			struct uio auio;
1678			struct iovec aiov;
1679			int bsize;
1680
1681			/*
1682			 * Ensure that our page is still around when the I/O
1683			 * completes.
1684			 */
1685			vm_page_io_start(pg);
1686			mtx_unlock(&vm_mtx);
1687
1688			/*
1689			 * Get the page from backing store.
1690			 */
1691			bsize = vp->v_mount->mnt_stat.f_iosize;
1692			auio.uio_iov = &aiov;
1693			auio.uio_iovcnt = 1;
1694			aiov.iov_base = 0;
1695			aiov.iov_len = MAXBSIZE;
1696			auio.uio_resid = MAXBSIZE;
1697			auio.uio_offset = trunc_page(off);
1698			auio.uio_segflg = UIO_NOCOPY;
1699			auio.uio_rw = UIO_READ;
1700			auio.uio_procp = p;
1701			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1702			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1703			        p->p_ucred);
1704			VOP_UNLOCK(vp, 0, p);
1705			mtx_lock(&vm_mtx);
1706			vm_page_flag_clear(pg, PG_ZERO);
1707			vm_page_io_finish(pg);
1708			if (error) {
1709				vm_page_unwire(pg, 0);
1710				/*
1711				 * See if anyone else might know about this page.
1712				 * If not and it is not valid, then free it.
1713				 */
1714				if (pg->wire_count == 0 && pg->valid == 0 &&
1715				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1716				    pg->hold_count == 0) {
1717					vm_page_busy(pg);
1718					vm_page_free(pg);
1719				}
1720				mtx_unlock(&vm_mtx);
1721				sbunlock(&so->so_snd);
1722				goto done;
1723			}
1724		}
1725
1726
1727		/*
1728		 * Get a sendfile buf. We usually wait as long as necessary,
1729		 * but this wait can be interrupted.
1730		 */
1731		mtx_unlock(&vm_mtx);
1732		if ((sf = sf_buf_alloc()) == NULL) {
1733			mtx_lock(&vm_mtx);
1734			vm_page_unwire(pg, 0);
1735			if (pg->wire_count == 0 && pg->object == NULL)
1736				vm_page_free(pg);
1737			mtx_unlock(&vm_mtx);
1738			sbunlock(&so->so_snd);
1739			error = EINTR;
1740			goto done;
1741		}
1742
1743		/*
1744		 * Allocate a kernel virtual page and insert the physical page
1745		 * into it.
1746		 */
1747		mtx_lock(&vm_mtx);
1748		sf->m = pg;
1749		pmap_qenter(sf->kva, &pg, 1);
1750		mtx_unlock(&vm_mtx);
1751		/*
1752		 * Get an mbuf header and set it up as having external storage.
1753		 */
1754		MGETHDR(m, M_TRYWAIT, MT_DATA);
1755		if (m == NULL) {
1756			error = ENOBUFS;
1757			sf_buf_free((void *)sf->kva, NULL);
1758			sbunlock(&so->so_snd);
1759			goto done;
1760		}
1761		/*
1762		 * Setup external storage for mbuf.
1763		 */
1764		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
1765		    EXT_SFBUF);
1766		m->m_data = (char *) sf->kva + pgoff;
1767		m->m_pkthdr.len = m->m_len = xfsize;
1768		/*
1769		 * Add the buffer to the socket buffer chain.
1770		 */
1771		s = splnet();
1772retry_space:
1773		/*
1774		 * Make sure that the socket is still able to take more data.
1775		 * CANTSENDMORE being true usually means that the connection
1776		 * was closed. so_error is true when an error was sensed after
1777		 * a previous send.
1778		 * The state is checked after the page mapping and buffer
1779		 * allocation above since those operations may block and make
1780		 * any socket checks stale. From this point forward, nothing
1781		 * blocks before the pru_send (or more accurately, any blocking
1782		 * results in a loop back to here to re-check).
1783		 */
1784		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1785			if (so->so_state & SS_CANTSENDMORE) {
1786				error = EPIPE;
1787			} else {
1788				error = so->so_error;
1789				so->so_error = 0;
1790			}
1791			m_freem(m);
1792			sbunlock(&so->so_snd);
1793			splx(s);
1794			goto done;
1795		}
1796		/*
1797		 * Wait for socket space to become available. We do this just
1798		 * after checking the connection state above in order to avoid
1799		 * a race condition with sbwait().
1800		 */
1801		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1802			if (so->so_state & SS_NBIO) {
1803				m_freem(m);
1804				sbunlock(&so->so_snd);
1805				splx(s);
1806				error = EAGAIN;
1807				goto done;
1808			}
1809			error = sbwait(&so->so_snd);
1810			/*
1811			 * An error from sbwait usually indicates that we've
1812			 * been interrupted by a signal. If we've sent anything
1813			 * then return bytes sent, otherwise return the error.
1814			 */
1815			if (error) {
1816				m_freem(m);
1817				sbunlock(&so->so_snd);
1818				splx(s);
1819				goto done;
1820			}
1821			goto retry_space;
1822		}
1823		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1824		splx(s);
1825		if (error) {
1826			sbunlock(&so->so_snd);
1827			goto done;
1828		}
1829	}
1830	sbunlock(&so->so_snd);
1831
1832	/*
1833	 * Send trailers. Wimp out and use writev(2).
1834	 */
1835	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1836			nuap.fd = uap->s;
1837			nuap.iovp = hdtr.trailers;
1838			nuap.iovcnt = hdtr.trl_cnt;
1839			error = writev(p, &nuap);
1840			if (error)
1841				goto done;
1842			sbytes += p->p_retval[0];
1843	}
1844
1845done:
1846	/*
1847	 * If there was no error we have to clear p->p_retval[0]
1848	 * because it may have been set by writev.
1849	 */
1850	if (error == 0) {
1851		p->p_retval[0] = 0;
1852	}
1853	if (uap->sbytes != NULL) {
1854		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1855	}
1856	if (vp)
1857		vrele(vp);
1858	if (fp)
1859		fdrop(fp, p);
1860	return (error);
1861}
1862