kern_sendfile.c revision 62378
1/*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37 * $FreeBSD: head/sys/kern/uipc_syscalls.c 62378 2000-07-02 08:08:09Z green $
38 */
39
40#include "opt_compat.h"
41#include "opt_ktrace.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/sysproto.h>
47#include <sys/malloc.h>
48#include <sys/filedesc.h>
49#include <sys/event.h>
50#include <sys/proc.h>
51#include <sys/fcntl.h>
52#include <sys/file.h>
53#include <sys/mbuf.h>
54#include <sys/protosw.h>
55#include <sys/socket.h>
56#include <sys/socketvar.h>
57#include <sys/signalvar.h>
58#include <sys/uio.h>
59#include <sys/vnode.h>
60#include <sys/lock.h>
61#include <sys/mount.h>
62#ifdef KTRACE
63#include <sys/ktrace.h>
64#endif
65#include <vm/vm.h>
66#include <vm/vm_object.h>
67#include <vm/vm_page.h>
68#include <vm/vm_pageout.h>
69#include <vm/vm_kern.h>
70#include <vm/vm_extern.h>
71
72static void sf_buf_init(void *arg);
73SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
74static struct sf_buf *sf_buf_alloc(void);
75static void sf_buf_ref(caddr_t addr, u_int size);
76static void sf_buf_free(caddr_t addr, u_int size);
77
78static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
79static int recvit __P((struct proc *p, int s, struct msghdr *mp,
80		       caddr_t namelenp));
81
82static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
83static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
84			     int compat));
85static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
86			     int compat));
87
88static SLIST_HEAD(, sf_buf) sf_freelist;
89static vm_offset_t sf_base;
90static struct sf_buf *sf_bufs;
91static int sf_buf_alloc_want;
92
93/*
94 * System call interface to the socket abstraction.
95 */
96#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
97#define COMPAT_OLDSOCK
98#endif
99
100extern	struct fileops socketops;
101
102int
103socket(p, uap)
104	struct proc *p;
105	register struct socket_args /* {
106		int	domain;
107		int	type;
108		int	protocol;
109	} */ *uap;
110{
111	struct filedesc *fdp = p->p_fd;
112	struct socket *so;
113	struct file *fp;
114	int fd, error;
115
116	error = falloc(p, &fp, &fd);
117	if (error)
118		return (error);
119	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
120	if (error) {
121		fdp->fd_ofiles[fd] = 0;
122		ffree(fp);
123	} else {
124		fp->f_data = (caddr_t)so;
125		fp->f_flag = FREAD|FWRITE;
126		fp->f_ops = &socketops;
127		fp->f_type = DTYPE_SOCKET;
128		p->p_retval[0] = fd;
129	}
130	return (error);
131}
132
133/* ARGSUSED */
134int
135bind(p, uap)
136	struct proc *p;
137	register struct bind_args /* {
138		int	s;
139		caddr_t	name;
140		int	namelen;
141	} */ *uap;
142{
143	struct file *fp;
144	struct sockaddr *sa;
145	int error;
146
147	error = getsock(p->p_fd, uap->s, &fp);
148	if (error)
149		return (error);
150	error = getsockaddr(&sa, uap->name, uap->namelen);
151	if (error)
152		return (error);
153	error = sobind((struct socket *)fp->f_data, sa, p);
154	FREE(sa, M_SONAME);
155	return (error);
156}
157
158/* ARGSUSED */
159int
160listen(p, uap)
161	struct proc *p;
162	register struct listen_args /* {
163		int	s;
164		int	backlog;
165	} */ *uap;
166{
167	struct file *fp;
168	int error;
169
170	error = getsock(p->p_fd, uap->s, &fp);
171	if (error)
172		return (error);
173	return (solisten((struct socket *)fp->f_data, uap->backlog, p));
174}
175
176static int
177accept1(p, uap, compat)
178	struct proc *p;
179	register struct accept_args /* {
180		int	s;
181		caddr_t	name;
182		int	*anamelen;
183	} */ *uap;
184	int compat;
185{
186	struct filedesc *fdp = p->p_fd;
187	struct file *fp;
188	struct sockaddr *sa;
189	int namelen, error, s;
190	struct socket *head, *so;
191	int fd;
192	short fflag;		/* type must match fp->f_flag */
193
194	if (uap->name) {
195		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
196			sizeof (namelen));
197		if(error)
198			return (error);
199	}
200	error = getsock(fdp, uap->s, &fp);
201	if (error)
202		return (error);
203	s = splnet();
204	head = (struct socket *)fp->f_data;
205	if ((head->so_options & SO_ACCEPTCONN) == 0) {
206		splx(s);
207		return (EINVAL);
208	}
209	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
210		splx(s);
211		return (EWOULDBLOCK);
212	}
213	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
214		if (head->so_state & SS_CANTRCVMORE) {
215			head->so_error = ECONNABORTED;
216			break;
217		}
218		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
219		    "accept", 0);
220		if (error) {
221			splx(s);
222			return (error);
223		}
224	}
225	if (head->so_error) {
226		error = head->so_error;
227		head->so_error = 0;
228		splx(s);
229		return (error);
230	}
231
232	/*
233	 * At this point we know that there is at least one connection
234	 * ready to be accepted. Remove it from the queue prior to
235	 * allocating the file descriptor for it since falloc() may
236	 * block allowing another process to accept the connection
237	 * instead.
238	 */
239	so = TAILQ_FIRST(&head->so_comp);
240	TAILQ_REMOVE(&head->so_comp, so, so_list);
241	head->so_qlen--;
242
243	fflag = fp->f_flag;
244	error = falloc(p, &fp, &fd);
245	if (error) {
246		/*
247		 * Probably ran out of file descriptors. Put the
248		 * unaccepted connection back onto the queue and
249		 * do another wakeup so some other process might
250		 * have a chance at it.
251		 */
252		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
253		head->so_qlen++;
254		wakeup_one(&head->so_timeo);
255		splx(s);
256		return (error);
257	} else
258		p->p_retval[0] = fd;
259
260	/* connection has been removed from the listen queue */
261	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
262
263	so->so_state &= ~SS_COMP;
264	so->so_head = NULL;
265	if (head->so_sigio != NULL)
266		fsetown(fgetown(head->so_sigio), &so->so_sigio);
267
268	fp->f_data = (caddr_t)so;
269	fp->f_flag = fflag;
270	fp->f_ops = &socketops;
271	fp->f_type = DTYPE_SOCKET;
272	sa = 0;
273	(void) soaccept(so, &sa);
274	if (sa == 0) {
275		namelen = 0;
276		if (uap->name)
277			goto gotnoname;
278		splx(s);
279		return 0;
280	}
281	if (uap->name) {
282		/* check sa_len before it is destroyed */
283		if (namelen > sa->sa_len)
284			namelen = sa->sa_len;
285#ifdef COMPAT_OLDSOCK
286		if (compat)
287			((struct osockaddr *)sa)->sa_family =
288			    sa->sa_family;
289#endif
290		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
291		if (!error)
292gotnoname:
293			error = copyout((caddr_t)&namelen,
294			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
295	}
296	if (sa)
297		FREE(sa, M_SONAME);
298	if (error) {
299		fdp->fd_ofiles[fd] = 0;
300		ffree(fp);
301	}
302	splx(s);
303	return (error);
304}
305
/*
 * accept(2): accept1() with the current sockaddr layout.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
314
315#ifdef COMPAT_OLDSOCK
/*
 * Old accept: accept1() with the compat flag set.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
324#endif /* COMPAT_OLDSOCK */
325
326/* ARGSUSED */
327int
328connect(p, uap)
329	struct proc *p;
330	register struct connect_args /* {
331		int	s;
332		caddr_t	name;
333		int	namelen;
334	} */ *uap;
335{
336	struct file *fp;
337	register struct socket *so;
338	struct sockaddr *sa;
339	int error, s;
340
341	error = getsock(p->p_fd, uap->s, &fp);
342	if (error)
343		return (error);
344	so = (struct socket *)fp->f_data;
345	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
346		return (EALREADY);
347	error = getsockaddr(&sa, uap->name, uap->namelen);
348	if (error)
349		return (error);
350	error = soconnect(so, sa, p);
351	if (error)
352		goto bad;
353	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
354		FREE(sa, M_SONAME);
355		return (EINPROGRESS);
356	}
357	s = splnet();
358	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
359		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
360		    "connec", 0);
361		if (error)
362			break;
363	}
364	if (error == 0) {
365		error = so->so_error;
366		so->so_error = 0;
367	}
368	splx(s);
369bad:
370	so->so_state &= ~SS_ISCONNECTING;
371	FREE(sa, M_SONAME);
372	if (error == ERESTART)
373		error = EINTR;
374	return (error);
375}
376
377int
378socketpair(p, uap)
379	struct proc *p;
380	register struct socketpair_args /* {
381		int	domain;
382		int	type;
383		int	protocol;
384		int	*rsv;
385	} */ *uap;
386{
387	register struct filedesc *fdp = p->p_fd;
388	struct file *fp1, *fp2;
389	struct socket *so1, *so2;
390	int fd, error, sv[2];
391
392	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
393	if (error)
394		return (error);
395	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
396	if (error)
397		goto free1;
398	error = falloc(p, &fp1, &fd);
399	if (error)
400		goto free2;
401	sv[0] = fd;
402	fp1->f_data = (caddr_t)so1;
403	error = falloc(p, &fp2, &fd);
404	if (error)
405		goto free3;
406	fp2->f_data = (caddr_t)so2;
407	sv[1] = fd;
408	error = soconnect2(so1, so2);
409	if (error)
410		goto free4;
411	if (uap->type == SOCK_DGRAM) {
412		/*
413		 * Datagram socket connection is asymmetric.
414		 */
415		 error = soconnect2(so2, so1);
416		 if (error)
417			goto free4;
418	}
419	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
420	fp1->f_ops = fp2->f_ops = &socketops;
421	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
422	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
423	return (error);
424free4:
425	fdp->fd_ofiles[sv[1]] = 0;
426	ffree(fp2);
427free3:
428	fdp->fd_ofiles[sv[0]] = 0;
429	ffree(fp1);
430free2:
431	(void)soclose(so2);
432free1:
433	(void)soclose(so1);
434	return (error);
435}
436
437static int
438sendit(p, s, mp, flags)
439	register struct proc *p;
440	int s;
441	register struct msghdr *mp;
442	int flags;
443{
444	struct file *fp;
445	struct uio auio;
446	register struct iovec *iov;
447	register int i;
448	struct mbuf *control;
449	struct sockaddr *to;
450	int len, error;
451	struct socket *so;
452#ifdef KTRACE
453	struct iovec *ktriov = NULL;
454	struct uio ktruio;
455#endif
456
457	error = getsock(p->p_fd, s, &fp);
458	if (error)
459		return (error);
460	auio.uio_iov = mp->msg_iov;
461	auio.uio_iovcnt = mp->msg_iovlen;
462	auio.uio_segflg = UIO_USERSPACE;
463	auio.uio_rw = UIO_WRITE;
464	auio.uio_procp = p;
465	auio.uio_offset = 0;			/* XXX */
466	auio.uio_resid = 0;
467	iov = mp->msg_iov;
468	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
469		if ((auio.uio_resid += iov->iov_len) < 0)
470			return (EINVAL);
471	}
472	if (mp->msg_name) {
473		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
474		if (error)
475			return (error);
476	} else
477		to = 0;
478	if (mp->msg_control) {
479		if (mp->msg_controllen < sizeof(struct cmsghdr)
480#ifdef COMPAT_OLDSOCK
481		    && mp->msg_flags != MSG_COMPAT
482#endif
483		) {
484			error = EINVAL;
485			goto bad;
486		}
487		error = sockargs(&control, mp->msg_control,
488		    mp->msg_controllen, MT_CONTROL);
489		if (error)
490			goto bad;
491#ifdef COMPAT_OLDSOCK
492		if (mp->msg_flags == MSG_COMPAT) {
493			register struct cmsghdr *cm;
494
495			M_PREPEND(control, sizeof(*cm), M_WAIT);
496			if (control == 0) {
497				error = ENOBUFS;
498				goto bad;
499			} else {
500				cm = mtod(control, struct cmsghdr *);
501				cm->cmsg_len = control->m_len;
502				cm->cmsg_level = SOL_SOCKET;
503				cm->cmsg_type = SCM_RIGHTS;
504			}
505		}
506#endif
507	} else
508		control = 0;
509#ifdef KTRACE
510	if (KTRPOINT(p, KTR_GENIO)) {
511		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
512
513		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
514		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
515		ktruio = auio;
516	}
517#endif
518	len = auio.uio_resid;
519	so = (struct socket *)fp->f_data;
520	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
521						     flags, p);
522	if (error) {
523		if (auio.uio_resid != len && (error == ERESTART ||
524		    error == EINTR || error == EWOULDBLOCK))
525			error = 0;
526		if (error == EPIPE)
527			psignal(p, SIGPIPE);
528	}
529	if (error == 0)
530		p->p_retval[0] = len - auio.uio_resid;
531#ifdef KTRACE
532	if (ktriov != NULL) {
533		if (error == 0) {
534			ktruio.uio_iov = ktriov;
535			ktruio.uio_resid = p->p_retval[0];
536			ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
537		}
538		FREE(ktriov, M_TEMP);
539	}
540#endif
541bad:
542	if (to)
543		FREE(to, M_SONAME);
544	return (error);
545}
546
547int
548sendto(p, uap)
549	struct proc *p;
550	register struct sendto_args /* {
551		int	s;
552		caddr_t	buf;
553		size_t	len;
554		int	flags;
555		caddr_t	to;
556		int	tolen;
557	} */ *uap;
558{
559	struct msghdr msg;
560	struct iovec aiov;
561
562	msg.msg_name = uap->to;
563	msg.msg_namelen = uap->tolen;
564	msg.msg_iov = &aiov;
565	msg.msg_iovlen = 1;
566	msg.msg_control = 0;
567#ifdef COMPAT_OLDSOCK
568	msg.msg_flags = 0;
569#endif
570	aiov.iov_base = uap->buf;
571	aiov.iov_len = uap->len;
572	return (sendit(p, uap->s, &msg, uap->flags));
573}
574
575#ifdef COMPAT_OLDSOCK
576int
577osend(p, uap)
578	struct proc *p;
579	register struct osend_args /* {
580		int	s;
581		caddr_t	buf;
582		int	len;
583		int	flags;
584	} */ *uap;
585{
586	struct msghdr msg;
587	struct iovec aiov;
588
589	msg.msg_name = 0;
590	msg.msg_namelen = 0;
591	msg.msg_iov = &aiov;
592	msg.msg_iovlen = 1;
593	aiov.iov_base = uap->buf;
594	aiov.iov_len = uap->len;
595	msg.msg_control = 0;
596	msg.msg_flags = 0;
597	return (sendit(p, uap->s, &msg, uap->flags));
598}
599
600int
601osendmsg(p, uap)
602	struct proc *p;
603	register struct osendmsg_args /* {
604		int	s;
605		caddr_t	msg;
606		int	flags;
607	} */ *uap;
608{
609	struct msghdr msg;
610	struct iovec aiov[UIO_SMALLIOV], *iov;
611	int error;
612
613	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
614	if (error)
615		return (error);
616	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
617		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
618			return (EMSGSIZE);
619		MALLOC(iov, struct iovec *,
620		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
621		      M_WAITOK);
622	} else
623		iov = aiov;
624	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
625	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
626	if (error)
627		goto done;
628	msg.msg_flags = MSG_COMPAT;
629	msg.msg_iov = iov;
630	error = sendit(p, uap->s, &msg, uap->flags);
631done:
632	if (iov != aiov)
633		FREE(iov, M_IOV);
634	return (error);
635}
636#endif
637
638int
639sendmsg(p, uap)
640	struct proc *p;
641	register struct sendmsg_args /* {
642		int	s;
643		caddr_t	msg;
644		int	flags;
645	} */ *uap;
646{
647	struct msghdr msg;
648	struct iovec aiov[UIO_SMALLIOV], *iov;
649	int error;
650
651	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
652	if (error)
653		return (error);
654	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
655		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
656			return (EMSGSIZE);
657		MALLOC(iov, struct iovec *,
658		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
659		       M_WAITOK);
660	} else
661		iov = aiov;
662	if (msg.msg_iovlen &&
663	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
664	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
665		goto done;
666	msg.msg_iov = iov;
667#ifdef COMPAT_OLDSOCK
668	msg.msg_flags = 0;
669#endif
670	error = sendit(p, uap->s, &msg, uap->flags);
671done:
672	if (iov != aiov)
673		FREE(iov, M_IOV);
674	return (error);
675}
676
677static int
678recvit(p, s, mp, namelenp)
679	register struct proc *p;
680	int s;
681	register struct msghdr *mp;
682	caddr_t namelenp;
683{
684	struct file *fp;
685	struct uio auio;
686	register struct iovec *iov;
687	register int i;
688	int len, error;
689	struct mbuf *m, *control = 0;
690	caddr_t ctlbuf;
691	struct socket *so;
692	struct sockaddr *fromsa = 0;
693#ifdef KTRACE
694	struct iovec *ktriov = NULL;
695	struct uio ktruio;
696#endif
697
698	error = getsock(p->p_fd, s, &fp);
699	if (error)
700		return (error);
701	auio.uio_iov = mp->msg_iov;
702	auio.uio_iovcnt = mp->msg_iovlen;
703	auio.uio_segflg = UIO_USERSPACE;
704	auio.uio_rw = UIO_READ;
705	auio.uio_procp = p;
706	auio.uio_offset = 0;			/* XXX */
707	auio.uio_resid = 0;
708	iov = mp->msg_iov;
709	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
710		if ((auio.uio_resid += iov->iov_len) < 0)
711			return (EINVAL);
712	}
713#ifdef KTRACE
714	if (KTRPOINT(p, KTR_GENIO)) {
715		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
716
717		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
718		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
719		ktruio = auio;
720	}
721#endif
722	len = auio.uio_resid;
723	so = (struct socket *)fp->f_data;
724	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
725	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
726	    &mp->msg_flags);
727	if (error) {
728		if (auio.uio_resid != len && (error == ERESTART ||
729		    error == EINTR || error == EWOULDBLOCK))
730			error = 0;
731	}
732#ifdef KTRACE
733	if (ktriov != NULL) {
734		if (error == 0) {
735			ktruio.uio_iov = ktriov;
736			ktruio.uio_resid = len - auio.uio_resid;
737			ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
738		}
739		FREE(ktriov, M_TEMP);
740	}
741#endif
742	if (error)
743		goto out;
744	p->p_retval[0] = len - auio.uio_resid;
745	if (mp->msg_name) {
746		len = mp->msg_namelen;
747		if (len <= 0 || fromsa == 0)
748			len = 0;
749		else {
750#ifndef MIN
751#define MIN(a,b) ((a)>(b)?(b):(a))
752#endif
753			/* save sa_len before it is destroyed by MSG_COMPAT */
754			len = MIN(len, fromsa->sa_len);
755#ifdef COMPAT_OLDSOCK
756			if (mp->msg_flags & MSG_COMPAT)
757				((struct osockaddr *)fromsa)->sa_family =
758				    fromsa->sa_family;
759#endif
760			error = copyout(fromsa,
761			    (caddr_t)mp->msg_name, (unsigned)len);
762			if (error)
763				goto out;
764		}
765		mp->msg_namelen = len;
766		if (namelenp &&
767		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
768#ifdef COMPAT_OLDSOCK
769			if (mp->msg_flags & MSG_COMPAT)
770				error = 0;	/* old recvfrom didn't check */
771			else
772#endif
773			goto out;
774		}
775	}
776	if (mp->msg_control) {
777#ifdef COMPAT_OLDSOCK
778		/*
779		 * We assume that old recvmsg calls won't receive access
780		 * rights and other control info, esp. as control info
781		 * is always optional and those options didn't exist in 4.3.
782		 * If we receive rights, trim the cmsghdr; anything else
783		 * is tossed.
784		 */
785		if (control && mp->msg_flags & MSG_COMPAT) {
786			if (mtod(control, struct cmsghdr *)->cmsg_level !=
787			    SOL_SOCKET ||
788			    mtod(control, struct cmsghdr *)->cmsg_type !=
789			    SCM_RIGHTS) {
790				mp->msg_controllen = 0;
791				goto out;
792			}
793			control->m_len -= sizeof (struct cmsghdr);
794			control->m_data += sizeof (struct cmsghdr);
795		}
796#endif
797		len = mp->msg_controllen;
798		m = control;
799		mp->msg_controllen = 0;
800		ctlbuf = (caddr_t) mp->msg_control;
801
802		while (m && len > 0) {
803			unsigned int tocopy;
804
805			if (len >= m->m_len)
806				tocopy = m->m_len;
807			else {
808				mp->msg_flags |= MSG_CTRUNC;
809				tocopy = len;
810			}
811
812			if ((error = copyout((caddr_t)mtod(m, caddr_t),
813					ctlbuf, tocopy)) != 0)
814				goto out;
815
816			ctlbuf += tocopy;
817			len -= tocopy;
818			m = m->m_next;
819		}
820		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
821	}
822out:
823	if (fromsa)
824		FREE(fromsa, M_SONAME);
825	if (control)
826		m_freem(control);
827	return (error);
828}
829
830int
831recvfrom(p, uap)
832	struct proc *p;
833	register struct recvfrom_args /* {
834		int	s;
835		caddr_t	buf;
836		size_t	len;
837		int	flags;
838		caddr_t	from;
839		int	*fromlenaddr;
840	} */ *uap;
841{
842	struct msghdr msg;
843	struct iovec aiov;
844	int error;
845
846	if (uap->fromlenaddr) {
847		error = copyin((caddr_t)uap->fromlenaddr,
848		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
849		if (error)
850			return (error);
851	} else
852		msg.msg_namelen = 0;
853	msg.msg_name = uap->from;
854	msg.msg_iov = &aiov;
855	msg.msg_iovlen = 1;
856	aiov.iov_base = uap->buf;
857	aiov.iov_len = uap->len;
858	msg.msg_control = 0;
859	msg.msg_flags = uap->flags;
860	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
861}
862
863#ifdef COMPAT_OLDSOCK
864int
865orecvfrom(p, uap)
866	struct proc *p;
867	struct recvfrom_args *uap;
868{
869
870	uap->flags |= MSG_COMPAT;
871	return (recvfrom(p, uap));
872}
873#endif
874
875
876#ifdef COMPAT_OLDSOCK
877int
878orecv(p, uap)
879	struct proc *p;
880	register struct orecv_args /* {
881		int	s;
882		caddr_t	buf;
883		int	len;
884		int	flags;
885	} */ *uap;
886{
887	struct msghdr msg;
888	struct iovec aiov;
889
890	msg.msg_name = 0;
891	msg.msg_namelen = 0;
892	msg.msg_iov = &aiov;
893	msg.msg_iovlen = 1;
894	aiov.iov_base = uap->buf;
895	aiov.iov_len = uap->len;
896	msg.msg_control = 0;
897	msg.msg_flags = uap->flags;
898	return (recvit(p, uap->s, &msg, (caddr_t)0));
899}
900
901/*
902 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
903 * overlays the new one, missing only the flags, and with the (old) access
904 * rights where the control fields are now.
905 */
906int
907orecvmsg(p, uap)
908	struct proc *p;
909	register struct orecvmsg_args /* {
910		int	s;
911		struct	omsghdr *msg;
912		int	flags;
913	} */ *uap;
914{
915	struct msghdr msg;
916	struct iovec aiov[UIO_SMALLIOV], *iov;
917	int error;
918
919	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
920	    sizeof (struct omsghdr));
921	if (error)
922		return (error);
923	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
924		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
925			return (EMSGSIZE);
926		MALLOC(iov, struct iovec *,
927		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
928		      M_WAITOK);
929	} else
930		iov = aiov;
931	msg.msg_flags = uap->flags | MSG_COMPAT;
932	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
933	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
934	if (error)
935		goto done;
936	msg.msg_iov = iov;
937	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
938
939	if (msg.msg_controllen && error == 0)
940		error = copyout((caddr_t)&msg.msg_controllen,
941		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
942done:
943	if (iov != aiov)
944		FREE(iov, M_IOV);
945	return (error);
946}
947#endif
948
949int
950recvmsg(p, uap)
951	struct proc *p;
952	register struct recvmsg_args /* {
953		int	s;
954		struct	msghdr *msg;
955		int	flags;
956	} */ *uap;
957{
958	struct msghdr msg;
959	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
960	register int error;
961
962	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
963	if (error)
964		return (error);
965	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
966		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
967			return (EMSGSIZE);
968		MALLOC(iov, struct iovec *,
969		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
970		       M_WAITOK);
971	} else
972		iov = aiov;
973#ifdef COMPAT_OLDSOCK
974	msg.msg_flags = uap->flags &~ MSG_COMPAT;
975#else
976	msg.msg_flags = uap->flags;
977#endif
978	uiov = msg.msg_iov;
979	msg.msg_iov = iov;
980	error = copyin((caddr_t)uiov, (caddr_t)iov,
981	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
982	if (error)
983		goto done;
984	error = recvit(p, uap->s, &msg, (caddr_t)0);
985	if (!error) {
986		msg.msg_iov = uiov;
987		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
988	}
989done:
990	if (iov != aiov)
991		FREE(iov, M_IOV);
992	return (error);
993}
994
995/* ARGSUSED */
996int
997shutdown(p, uap)
998	struct proc *p;
999	register struct shutdown_args /* {
1000		int	s;
1001		int	how;
1002	} */ *uap;
1003{
1004	struct file *fp;
1005	int error;
1006
1007	error = getsock(p->p_fd, uap->s, &fp);
1008	if (error)
1009		return (error);
1010	return (soshutdown((struct socket *)fp->f_data, uap->how));
1011}
1012
1013/* ARGSUSED */
1014int
1015setsockopt(p, uap)
1016	struct proc *p;
1017	register struct setsockopt_args /* {
1018		int	s;
1019		int	level;
1020		int	name;
1021		caddr_t	val;
1022		int	valsize;
1023	} */ *uap;
1024{
1025	struct file *fp;
1026	struct sockopt sopt;
1027	int error;
1028
1029	if (uap->val == 0 && uap->valsize != 0)
1030		return (EFAULT);
1031	if (uap->valsize < 0)
1032		return (EINVAL);
1033
1034	error = getsock(p->p_fd, uap->s, &fp);
1035	if (error)
1036		return (error);
1037
1038	sopt.sopt_dir = SOPT_SET;
1039	sopt.sopt_level = uap->level;
1040	sopt.sopt_name = uap->name;
1041	sopt.sopt_val = uap->val;
1042	sopt.sopt_valsize = uap->valsize;
1043	sopt.sopt_p = p;
1044
1045	return (sosetopt((struct socket *)fp->f_data, &sopt));
1046}
1047
1048/* ARGSUSED */
1049int
1050getsockopt(p, uap)
1051	struct proc *p;
1052	register struct getsockopt_args /* {
1053		int	s;
1054		int	level;
1055		int	name;
1056		caddr_t	val;
1057		int	*avalsize;
1058	} */ *uap;
1059{
1060	int	valsize, error;
1061	struct	file *fp;
1062	struct	sockopt sopt;
1063
1064	error = getsock(p->p_fd, uap->s, &fp);
1065	if (error)
1066		return (error);
1067	if (uap->val) {
1068		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1069		    sizeof (valsize));
1070		if (error)
1071			return (error);
1072		if (valsize < 0)
1073			return (EINVAL);
1074	} else
1075		valsize = 0;
1076
1077	sopt.sopt_dir = SOPT_GET;
1078	sopt.sopt_level = uap->level;
1079	sopt.sopt_name = uap->name;
1080	sopt.sopt_val = uap->val;
1081	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1082	sopt.sopt_p = p;
1083
1084	error = sogetopt((struct socket *)fp->f_data, &sopt);
1085	if (error == 0) {
1086		valsize = sopt.sopt_valsize;
1087		error = copyout((caddr_t)&valsize,
1088				(caddr_t)uap->avalsize, sizeof (valsize));
1089	}
1090	return (error);
1091}
1092
1093/*
1094 * Get socket name.
1095 */
1096/* ARGSUSED */
1097static int
1098getsockname1(p, uap, compat)
1099	struct proc *p;
1100	register struct getsockname_args /* {
1101		int	fdes;
1102		caddr_t	asa;
1103		int	*alen;
1104	} */ *uap;
1105	int compat;
1106{
1107	struct file *fp;
1108	register struct socket *so;
1109	struct sockaddr *sa;
1110	int len, error;
1111
1112	error = getsock(p->p_fd, uap->fdes, &fp);
1113	if (error)
1114		return (error);
1115	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1116	if (error)
1117		return (error);
1118	so = (struct socket *)fp->f_data;
1119	sa = 0;
1120	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1121	if (error)
1122		goto bad;
1123	if (sa == 0) {
1124		len = 0;
1125		goto gotnothing;
1126	}
1127
1128	len = MIN(len, sa->sa_len);
1129#ifdef COMPAT_OLDSOCK
1130	if (compat)
1131		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1132#endif
1133	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1134	if (error == 0)
1135gotnothing:
1136		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1137		    sizeof (len));
1138bad:
1139	if (sa)
1140		FREE(sa, M_SONAME);
1141	return (error);
1142}
1143
/*
 * getsockname(2): getsockname1() with the current sockaddr layout.
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1152
1153#ifdef COMPAT_OLDSOCK
/*
 * Old getsockname: getsockname1() with the compat flag set.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
1162#endif /* COMPAT_OLDSOCK */
1163
1164/*
1165 * Get name of peer for connected socket.
1166 */
1167/* ARGSUSED */
1168static int
1169getpeername1(p, uap, compat)
1170	struct proc *p;
1171	register struct getpeername_args /* {
1172		int	fdes;
1173		caddr_t	asa;
1174		int	*alen;
1175	} */ *uap;
1176	int compat;
1177{
1178	struct file *fp;
1179	register struct socket *so;
1180	struct sockaddr *sa;
1181	int len, error;
1182
1183	error = getsock(p->p_fd, uap->fdes, &fp);
1184	if (error)
1185		return (error);
1186	so = (struct socket *)fp->f_data;
1187	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
1188		return (ENOTCONN);
1189	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1190	if (error)
1191		return (error);
1192	sa = 0;
1193	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1194	if (error)
1195		goto bad;
1196	if (sa == 0) {
1197		len = 0;
1198		goto gotnothing;
1199	}
1200	len = MIN(len, sa->sa_len);
1201#ifdef COMPAT_OLDSOCK
1202	if (compat)
1203		((struct osockaddr *)sa)->sa_family =
1204		    sa->sa_family;
1205#endif
1206	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1207	if (error)
1208		goto bad;
1209gotnothing:
1210	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1211bad:
1212	if (sa) FREE(sa, M_SONAME);
1213	return (error);
1214}
1215
/*
 * getpeername(2): getpeername1() with the current sockaddr layout.
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1224
1225#ifdef COMPAT_OLDSOCK
/*
 * Old getpeername: getpeername1() with the compat flag set.
 */
int
ogetpeername(p, uap)
	struct proc *p;
	struct ogetpeername_args *uap;
{
	int error;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	error = getpeername1(p, (struct getpeername_args *)uap, 1);
	return (error);
}
1235#endif /* COMPAT_OLDSOCK */
1236
1237int
1238sockargs(mp, buf, buflen, type)
1239	struct mbuf **mp;
1240	caddr_t buf;
1241	int buflen, type;
1242{
1243	register struct sockaddr *sa;
1244	register struct mbuf *m;
1245	int error;
1246
1247	if ((u_int)buflen > MLEN) {
1248#ifdef COMPAT_OLDSOCK
1249		if (type == MT_SONAME && (u_int)buflen <= 112)
1250			buflen = MLEN;		/* unix domain compat. hack */
1251		else
1252#endif
1253		return (EINVAL);
1254	}
1255	m = m_get(M_WAIT, type);
1256	if (m == NULL)
1257		return (ENOBUFS);
1258	m->m_len = buflen;
1259	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1260	if (error)
1261		(void) m_free(m);
1262	else {
1263		*mp = m;
1264		if (type == MT_SONAME) {
1265			sa = mtod(m, struct sockaddr *);
1266
1267#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1268			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1269				sa->sa_family = sa->sa_len;
1270#endif
1271			sa->sa_len = buflen;
1272		}
1273	}
1274	return (error);
1275}
1276
1277int
1278getsockaddr(namp, uaddr, len)
1279	struct sockaddr **namp;
1280	caddr_t uaddr;
1281	size_t len;
1282{
1283	struct sockaddr *sa;
1284	int error;
1285
1286	if (len > SOCK_MAXADDRLEN)
1287		return ENAMETOOLONG;
1288	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1289	error = copyin(uaddr, sa, len);
1290	if (error) {
1291		FREE(sa, M_SONAME);
1292	} else {
1293#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1294		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1295			sa->sa_family = sa->sa_len;
1296#endif
1297		sa->sa_len = len;
1298		*namp = sa;
1299	}
1300	return error;
1301}
1302
1303int
1304getsock(fdp, fdes, fpp)
1305	struct filedesc *fdp;
1306	int fdes;
1307	struct file **fpp;
1308{
1309	register struct file *fp;
1310
1311	if ((unsigned)fdes >= fdp->fd_nfiles ||
1312	    (fp = fdp->fd_ofiles[fdes]) == NULL)
1313		return (EBADF);
1314	if (fp->f_type != DTYPE_SOCKET)
1315		return (ENOTSOCK);
1316	*fpp = fp;
1317	return (0);
1318}
1319
1320/*
1321 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1322 * XXX - The sf_buf functions are currently private to sendfile(2), so have
1323 * been made static, but may be useful in the future for doing zero-copy in
1324 * other parts of the networking code.
1325 */
1326static void
1327sf_buf_init(void *arg)
1328{
1329	int i;
1330
1331	SLIST_INIT(&sf_freelist);
1332	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1333	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1334	bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1335	for (i = 0; i < nsfbufs; i++) {
1336		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1337		SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1338	}
1339}
1340
1341/*
1342 * Get an sf_buf from the freelist. Will block if none are available.
1343 */
1344static struct sf_buf *
1345sf_buf_alloc()
1346{
1347	struct sf_buf *sf;
1348	int s;
1349
1350	s = splimp();
1351	while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1352		sf_buf_alloc_want = 1;
1353		tsleep(&sf_freelist, PVM, "sfbufa", 0);
1354	}
1355	SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1356	splx(s);
1357	sf->refcnt = 1;
1358	return (sf);
1359}
1360
1361#define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1362static void
1363sf_buf_ref(caddr_t addr, u_int size)
1364{
1365	struct sf_buf *sf;
1366
1367	sf = dtosf(addr);
1368	if (sf->refcnt == 0)
1369		panic("sf_buf_ref: referencing a free sf_buf");
1370	sf->refcnt++;
1371}
1372
1373/*
1374 * Lose a reference to an sf_buf. When none left, detach mapped page
1375 * and release resources back to the system.
1376 *
1377 * Must be called at splimp.
1378 */
1379static void
1380sf_buf_free(caddr_t addr, u_int size)
1381{
1382	struct sf_buf *sf;
1383	struct vm_page *m;
1384	int s;
1385
1386	sf = dtosf(addr);
1387	if (sf->refcnt == 0)
1388		panic("sf_buf_free: freeing free sf_buf");
1389	sf->refcnt--;
1390	if (sf->refcnt == 0) {
1391		pmap_qremove((vm_offset_t)addr, 1);
1392		m = sf->m;
1393		s = splvm();
1394		vm_page_unwire(m, 0);
1395		/*
1396		 * Check for the object going away on us. This can
1397		 * happen since we don't hold a reference to it.
1398		 * If so, we're responsible for freeing the page.
1399		 */
1400		if (m->wire_count == 0 && m->object == NULL)
1401			vm_page_free(m);
1402		splx(s);
1403		sf->m = NULL;
1404		SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1405		if (sf_buf_alloc_want) {
1406			sf_buf_alloc_want = 0;
1407			wakeup(&sf_freelist);
1408		}
1409	}
1410}
1411
1412/*
1413 * sendfile(2).
1414 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1415 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1416 *
1417 * Send a file specified by 'fd' and starting at 'offset' to a socket
1418 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1419 * nbytes == 0. Optionally add a header and/or trailer to the socket
1420 * output. If specified, write the total number of bytes sent into *sbytes.
1421 */
1422int
1423sendfile(struct proc *p, struct sendfile_args *uap)
1424{
1425	struct file *fp;
1426	struct filedesc *fdp = p->p_fd;
1427	struct vnode *vp;
1428	struct vm_object *obj;
1429	struct socket *so;
1430	struct mbuf *m;
1431	struct sf_buf *sf;
1432	struct vm_page *pg;
1433	struct writev_args nuap;
1434	struct sf_hdtr hdtr;
1435	off_t off, xfsize, sbytes = 0;
1436	int error = 0, s;
1437
1438	vp = NULL;
1439	/*
1440	 * Do argument checking. Must be a regular file in, stream
1441	 * type and connected socket out, positive offset.
1442	 */
1443	fp = getfp(fdp, uap->fd, FREAD);
1444	if (fp == NULL) {
1445		error = EBADF;
1446		goto done;
1447	}
1448	if (fp->f_type != DTYPE_VNODE) {
1449		error = EINVAL;
1450		goto done;
1451	}
1452	vp = (struct vnode *)fp->f_data;
1453	vref(vp);
1454	obj = vp->v_object;
1455	if (vp->v_type != VREG || obj == NULL) {
1456		error = EINVAL;
1457		goto done;
1458	}
1459	error = getsock(p->p_fd, uap->s, &fp);
1460	if (error)
1461		goto done;
1462	so = (struct socket *)fp->f_data;
1463	if (so->so_type != SOCK_STREAM) {
1464		error = EINVAL;
1465		goto done;
1466	}
1467	if ((so->so_state & SS_ISCONNECTED) == 0) {
1468		error = ENOTCONN;
1469		goto done;
1470	}
1471	if (uap->offset < 0) {
1472		error = EINVAL;
1473		goto done;
1474	}
1475
1476	/*
1477	 * If specified, get the pointer to the sf_hdtr struct for
1478	 * any headers/trailers.
1479	 */
1480	if (uap->hdtr != NULL) {
1481		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1482		if (error)
1483			goto done;
1484		/*
1485		 * Send any headers. Wimp out and use writev(2).
1486		 */
1487		if (hdtr.headers != NULL) {
1488			nuap.fd = uap->s;
1489			nuap.iovp = hdtr.headers;
1490			nuap.iovcnt = hdtr.hdr_cnt;
1491			error = writev(p, &nuap);
1492			if (error)
1493				goto done;
1494			sbytes += p->p_retval[0];
1495		}
1496	}
1497
1498	/*
1499	 * Protect against multiple writers to the socket.
1500	 */
1501	(void) sblock(&so->so_snd, M_WAITOK);
1502
1503	/*
1504	 * Loop through the pages in the file, starting with the requested
1505	 * offset. Get a file page (do I/O if necessary), map the file page
1506	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1507	 * it on the socket.
1508	 */
1509	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1510		vm_pindex_t pindex;
1511		vm_offset_t pgoff;
1512
1513		pindex = OFF_TO_IDX(off);
1514retry_lookup:
1515		/*
1516		 * Calculate the amount to transfer. Not to exceed a page,
1517		 * the EOF, or the passed in nbytes.
1518		 */
1519		xfsize = obj->un_pager.vnp.vnp_size - off;
1520		if (xfsize > PAGE_SIZE)
1521			xfsize = PAGE_SIZE;
1522		pgoff = (vm_offset_t)(off & PAGE_MASK);
1523		if (PAGE_SIZE - pgoff < xfsize)
1524			xfsize = PAGE_SIZE - pgoff;
1525		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1526			xfsize = uap->nbytes - sbytes;
1527		if (xfsize <= 0)
1528			break;
1529		/*
1530		 * Optimize the non-blocking case by looking at the socket space
1531		 * before going to the extra work of constituting the sf_buf.
1532		 */
1533		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1534			if (so->so_state & SS_CANTSENDMORE)
1535				error = EPIPE;
1536			else
1537				error = EAGAIN;
1538			sbunlock(&so->so_snd);
1539			goto done;
1540		}
1541		/*
1542		 * Attempt to look up the page.
1543		 *
1544		 *	Allocate if not found
1545		 *
1546		 *	Wait and loop if busy.
1547		 */
1548		pg = vm_page_lookup(obj, pindex);
1549
1550		if (pg == NULL) {
1551			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1552			if (pg == NULL) {
1553				VM_WAIT;
1554				goto retry_lookup;
1555			}
1556			vm_page_wakeup(pg);
1557		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1558			goto retry_lookup;
1559		}
1560
1561		/*
1562		 * Wire the page so it does not get ripped out from under
1563		 * us.
1564		 */
1565
1566		vm_page_wire(pg);
1567
1568		/*
1569		 * If page is not valid for what we need, initiate I/O
1570		 */
1571
1572		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1573			struct uio auio;
1574			struct iovec aiov;
1575			int bsize;
1576
1577			/*
1578			 * Ensure that our page is still around when the I/O
1579			 * completes.
1580			 */
1581			vm_page_io_start(pg);
1582
1583			/*
1584			 * Get the page from backing store.
1585			 */
1586			bsize = vp->v_mount->mnt_stat.f_iosize;
1587			auio.uio_iov = &aiov;
1588			auio.uio_iovcnt = 1;
1589			aiov.iov_base = 0;
1590			aiov.iov_len = MAXBSIZE;
1591			auio.uio_resid = MAXBSIZE;
1592			auio.uio_offset = trunc_page(off);
1593			auio.uio_segflg = UIO_NOCOPY;
1594			auio.uio_rw = UIO_READ;
1595			auio.uio_procp = p;
1596			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1597			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1598			        p->p_ucred);
1599			VOP_UNLOCK(vp, 0, p);
1600			vm_page_flag_clear(pg, PG_ZERO);
1601			vm_page_io_finish(pg);
1602			if (error) {
1603				vm_page_unwire(pg, 0);
1604				/*
1605				 * See if anyone else might know about this page.
1606				 * If not and it is not valid, then free it.
1607				 */
1608				if (pg->wire_count == 0 && pg->valid == 0 &&
1609				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1610				    pg->hold_count == 0)
1611					vm_page_free(pg);
1612				sbunlock(&so->so_snd);
1613				goto done;
1614			}
1615		}
1616
1617		/*
1618		 * Allocate a kernel virtual page and insert the physical page
1619		 * into it.
1620		 */
1621
1622		sf = sf_buf_alloc();
1623		sf->m = pg;
1624		pmap_qenter(sf->kva, &pg, 1);
1625		/*
1626		 * Get an mbuf header and set it up as having external storage.
1627		 */
1628		MGETHDR(m, M_WAIT, MT_DATA);
1629		if (m == NULL) {
1630			error = ENOBUFS;
1631			goto done;
1632		}
1633		m->m_ext.ext_free = sf_buf_free;
1634		m->m_ext.ext_ref = sf_buf_ref;
1635		m->m_ext.ext_buf = (void *)sf->kva;
1636		m->m_ext.ext_size = PAGE_SIZE;
1637		m->m_data = (char *) sf->kva + pgoff;
1638		m->m_flags |= M_EXT;
1639		m->m_pkthdr.len = m->m_len = xfsize;
1640		/*
1641		 * Add the buffer to the socket buffer chain.
1642		 */
1643		s = splnet();
1644retry_space:
1645		/*
1646		 * Make sure that the socket is still able to take more data.
1647		 * CANTSENDMORE being true usually means that the connection
1648		 * was closed. so_error is true when an error was sensed after
1649		 * a previous send.
1650		 * The state is checked after the page mapping and buffer
1651		 * allocation above since those operations may block and make
1652		 * any socket checks stale. From this point forward, nothing
1653		 * blocks before the pru_send (or more accurately, any blocking
1654		 * results in a loop back to here to re-check).
1655		 */
1656		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1657			if (so->so_state & SS_CANTSENDMORE) {
1658				error = EPIPE;
1659			} else {
1660				error = so->so_error;
1661				so->so_error = 0;
1662			}
1663			m_freem(m);
1664			sbunlock(&so->so_snd);
1665			splx(s);
1666			goto done;
1667		}
1668		/*
1669		 * Wait for socket space to become available. We do this just
1670		 * after checking the connection state above in order to avoid
1671		 * a race condition with sbwait().
1672		 */
1673		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1674			if (so->so_state & SS_NBIO) {
1675				m_freem(m);
1676				sbunlock(&so->so_snd);
1677				splx(s);
1678				error = EAGAIN;
1679				goto done;
1680			}
1681			error = sbwait(&so->so_snd);
1682			/*
1683			 * An error from sbwait usually indicates that we've
1684			 * been interrupted by a signal. If we've sent anything
1685			 * then return bytes sent, otherwise return the error.
1686			 */
1687			if (error) {
1688				m_freem(m);
1689				sbunlock(&so->so_snd);
1690				splx(s);
1691				goto done;
1692			}
1693			goto retry_space;
1694		}
1695		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1696		splx(s);
1697		if (error) {
1698			sbunlock(&so->so_snd);
1699			goto done;
1700		}
1701	}
1702	sbunlock(&so->so_snd);
1703
1704	/*
1705	 * Send trailers. Wimp out and use writev(2).
1706	 */
1707	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1708			nuap.fd = uap->s;
1709			nuap.iovp = hdtr.trailers;
1710			nuap.iovcnt = hdtr.trl_cnt;
1711			error = writev(p, &nuap);
1712			if (error)
1713				goto done;
1714			sbytes += p->p_retval[0];
1715	}
1716
1717done:
1718	if (uap->sbytes != NULL) {
1719		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1720	}
1721	if (vp)
1722		vrele(vp);
1723	return (error);
1724}
1725