kern_sendfile.c revision 65770
1220497Smarkm/*
2220497Smarkm * Copyright (c) 1982, 1986, 1989, 1990, 1993
3220497Smarkm *	The Regents of the University of California.  All rights reserved.
4220497Smarkm *
5220497Smarkm * sendfile(2) and related extensions:
6220497Smarkm * Copyright (c) 1998, David Greenman. All rights reserved.
7220497Smarkm *
8220497Smarkm * Redistribution and use in source and binary forms, with or without
9220497Smarkm * modification, are permitted provided that the following conditions
10220497Smarkm * are met:
11220497Smarkm * 1. Redistributions of source code must retain the above copyright
12220497Smarkm *    notice, this list of conditions and the following disclaimer.
13220497Smarkm * 2. Redistributions in binary form must reproduce the above copyright
14220497Smarkm *    notice, this list of conditions and the following disclaimer in the
15220497Smarkm *    documentation and/or other materials provided with the distribution.
16220497Smarkm * 3. All advertising materials mentioning features or use of this software
17220497Smarkm *    must display the following acknowledgement:
18220497Smarkm *	This product includes software developed by the University of
19220497Smarkm *	California, Berkeley and its contributors.
20220497Smarkm * 4. Neither the name of the University nor the names of its contributors
21220497Smarkm *    may be used to endorse or promote products derived from this software
22220497Smarkm *    without specific prior written permission.
23220497Smarkm *
24220497Smarkm * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25220497Smarkm * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26220497Smarkm * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27220497Smarkm * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28220497Smarkm * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29220497Smarkm * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30220497Smarkm * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31220497Smarkm * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32220497Smarkm * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33220497Smarkm * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34220497Smarkm * SUCH DAMAGE.
35220497Smarkm *
36220497Smarkm *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37220497Smarkm * $FreeBSD: head/sys/kern/uipc_syscalls.c 65770 2000-09-12 09:49:08Z bp $
38220497Smarkm */
39220497Smarkm
40220497Smarkm#include "opt_compat.h"
41220497Smarkm#include "opt_ktrace.h"
42220497Smarkm
43220497Smarkm#include <sys/param.h>
44220497Smarkm#include <sys/systm.h>
45220497Smarkm#include <sys/kernel.h>
46220497Smarkm#include <sys/sysproto.h>
47220497Smarkm#include <sys/malloc.h>
48220497Smarkm#include <sys/filedesc.h>
49220497Smarkm#include <sys/event.h>
50220497Smarkm#include <sys/proc.h>
51220497Smarkm#include <sys/fcntl.h>
52220497Smarkm#include <sys/file.h>
53220497Smarkm#include <sys/mbuf.h>
54220497Smarkm#include <sys/protosw.h>
55220497Smarkm#include <sys/socket.h>
56220497Smarkm#include <sys/socketvar.h>
57220497Smarkm#include <sys/signalvar.h>
58220497Smarkm#include <sys/uio.h>
59220497Smarkm#include <sys/vnode.h>
60220497Smarkm#include <sys/lock.h>
61220497Smarkm#include <sys/mount.h>
62220497Smarkm#ifdef KTRACE
63220497Smarkm#include <sys/ktrace.h>
64220497Smarkm#endif
65220497Smarkm#include <vm/vm.h>
66220497Smarkm#include <vm/vm_object.h>
67220497Smarkm#include <vm/vm_page.h>
68220497Smarkm#include <vm/vm_pageout.h>
69220497Smarkm#include <vm/vm_kern.h>
70220497Smarkm#include <vm/vm_extern.h>
71220497Smarkm
/*
 * sf_buf support routines.  sf_buf_init() is registered below to run at
 * the SI_SUB_MBUF stage of system initialization.
 * NOTE(review): presumably these back the sendfile(2) code mentioned in
 * the file header; their definitions are not in this part of the file.
 */
static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
static struct sf_buf *sf_buf_alloc(void);
static void sf_buf_free(caddr_t addr, void *args);

/* Shared back ends for the send*() and recv*() system calls. */
static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
static int recvit __P((struct proc *p, int s, struct msghdr *mp,
		       caddr_t namelenp));

/*
 * Shared back ends taking a "compat" flag that selects the old
 * (osockaddr) address layout when COMPAT_OLDSOCK is defined.
 */
static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
			     int compat));
static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
			     int compat));

/*
 * sf_buf bookkeeping.
 * NOTE(review): judging by the names, a free list, the base address of
 * the mapping area, the backing array, and a waiter count -- confirm
 * against sf_buf_init()/sf_buf_alloc().
 */
static SLIST_HEAD(, sf_buf) sf_freelist;
static vm_offset_t sf_base;
static struct sf_buf *sf_bufs;
static int sf_buf_alloc_want;

/*
 * System call interface to the socket abstraction.
 */
/* Either compatibility option enables the old-style socket entry points. */
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#define COMPAT_OLDSOCK
#endif

/* File operations vector installed on every socket-backed struct file. */
extern	struct fileops socketops;
100220497Smarkm
101220497Smarkmint
102220497Smarkmsocket(p, uap)
103220497Smarkm	struct proc *p;
104220497Smarkm	register struct socket_args /* {
105220497Smarkm		int	domain;
106220497Smarkm		int	type;
107220497Smarkm		int	protocol;
108220497Smarkm	} */ *uap;
109220497Smarkm{
110220497Smarkm	struct filedesc *fdp = p->p_fd;
111220497Smarkm	struct socket *so;
112220497Smarkm	struct file *fp;
113220497Smarkm	int fd, error;
114220497Smarkm
115220497Smarkm	error = falloc(p, &fp, &fd);
116220497Smarkm	if (error)
117220497Smarkm		return (error);
118220497Smarkm	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
119220497Smarkm	if (error) {
120220497Smarkm		fdp->fd_ofiles[fd] = 0;
121220497Smarkm		ffree(fp);
122220497Smarkm	} else {
123220497Smarkm		fp->f_data = (caddr_t)so;
124220497Smarkm		fp->f_flag = FREAD|FWRITE;
125220497Smarkm		fp->f_ops = &socketops;
126220497Smarkm		fp->f_type = DTYPE_SOCKET;
127220497Smarkm		p->p_retval[0] = fd;
128220497Smarkm	}
129220497Smarkm	return (error);
130220497Smarkm}
131220497Smarkm
132220497Smarkm/* ARGSUSED */
133220497Smarkmint
134220497Smarkmbind(p, uap)
135220497Smarkm	struct proc *p;
136220497Smarkm	register struct bind_args /* {
137220497Smarkm		int	s;
138220497Smarkm		caddr_t	name;
139220497Smarkm		int	namelen;
140220497Smarkm	} */ *uap;
141220497Smarkm{
142220497Smarkm	struct file *fp;
143220497Smarkm	struct sockaddr *sa;
144220497Smarkm	int error;
145220497Smarkm
146220497Smarkm	error = getsock(p->p_fd, uap->s, &fp);
147220497Smarkm	if (error)
148220497Smarkm		return (error);
149220497Smarkm	error = getsockaddr(&sa, uap->name, uap->namelen);
150220497Smarkm	if (error)
151220497Smarkm		return (error);
152220497Smarkm	error = sobind((struct socket *)fp->f_data, sa, p);
153220497Smarkm	FREE(sa, M_SONAME);
154220497Smarkm	return (error);
155220497Smarkm}
156220497Smarkm
157220497Smarkm/* ARGSUSED */
158220497Smarkmint
159220497Smarkmlisten(p, uap)
160220497Smarkm	struct proc *p;
161220497Smarkm	register struct listen_args /* {
162220497Smarkm		int	s;
163220497Smarkm		int	backlog;
164220497Smarkm	} */ *uap;
165220497Smarkm{
166220497Smarkm	struct file *fp;
167220497Smarkm	int error;
168220497Smarkm
169220497Smarkm	error = getsock(p->p_fd, uap->s, &fp);
170220497Smarkm	if (error)
171220497Smarkm		return (error);
172220497Smarkm	return (solisten((struct socket *)fp->f_data, uap->backlog, p));
173220497Smarkm}
174220497Smarkm
175220497Smarkmstatic int
176220497Smarkmaccept1(p, uap, compat)
177220497Smarkm	struct proc *p;
178220497Smarkm	register struct accept_args /* {
179220497Smarkm		int	s;
180220497Smarkm		caddr_t	name;
181220497Smarkm		int	*anamelen;
182220497Smarkm	} */ *uap;
183220497Smarkm	int compat;
184220497Smarkm{
185220497Smarkm	struct filedesc *fdp = p->p_fd;
186220497Smarkm	struct file *fp;
187220497Smarkm	struct sockaddr *sa;
188220497Smarkm	int namelen, error, s;
189220497Smarkm	struct socket *head, *so;
190220497Smarkm	int fd;
191220497Smarkm	short fflag;		/* type must match fp->f_flag */
192220497Smarkm
193220497Smarkm	if (uap->name) {
194220497Smarkm		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
195220497Smarkm			sizeof (namelen));
196220497Smarkm		if(error)
197220497Smarkm			return (error);
198220497Smarkm	}
199220497Smarkm	error = getsock(fdp, uap->s, &fp);
200220497Smarkm	if (error)
201220497Smarkm		return (error);
202220497Smarkm	s = splnet();
203220497Smarkm	head = (struct socket *)fp->f_data;
204220497Smarkm	if ((head->so_options & SO_ACCEPTCONN) == 0) {
205220497Smarkm		splx(s);
206220497Smarkm		return (EINVAL);
207220497Smarkm	}
208220497Smarkm	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
209220497Smarkm		splx(s);
210220497Smarkm		return (EWOULDBLOCK);
211220497Smarkm	}
212220497Smarkm	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
213220497Smarkm		if (head->so_state & SS_CANTRCVMORE) {
214220497Smarkm			head->so_error = ECONNABORTED;
215220497Smarkm			break;
216220497Smarkm		}
217220497Smarkm		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
218220497Smarkm		    "accept", 0);
219220497Smarkm		if (error) {
220220497Smarkm			splx(s);
221220497Smarkm			return (error);
222220497Smarkm		}
223220497Smarkm	}
224220497Smarkm	if (head->so_error) {
225220497Smarkm		error = head->so_error;
226220497Smarkm		head->so_error = 0;
227220497Smarkm		splx(s);
228220497Smarkm		return (error);
229220497Smarkm	}
230220497Smarkm
231220497Smarkm	/*
232220497Smarkm	 * At this point we know that there is at least one connection
233220497Smarkm	 * ready to be accepted. Remove it from the queue prior to
234220497Smarkm	 * allocating the file descriptor for it since falloc() may
235220497Smarkm	 * block allowing another process to accept the connection
236220497Smarkm	 * instead.
237220497Smarkm	 */
238220497Smarkm	so = TAILQ_FIRST(&head->so_comp);
239220497Smarkm	TAILQ_REMOVE(&head->so_comp, so, so_list);
240220497Smarkm	head->so_qlen--;
241220497Smarkm
242220497Smarkm	fflag = fp->f_flag;
243220497Smarkm	error = falloc(p, &fp, &fd);
244220497Smarkm	if (error) {
245220497Smarkm		/*
246220497Smarkm		 * Probably ran out of file descriptors. Put the
247220497Smarkm		 * unaccepted connection back onto the queue and
248220497Smarkm		 * do another wakeup so some other process might
249220497Smarkm		 * have a chance at it.
250220497Smarkm		 */
251220497Smarkm		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
252220497Smarkm		head->so_qlen++;
253220497Smarkm		wakeup_one(&head->so_timeo);
254220497Smarkm		splx(s);
255220497Smarkm		return (error);
256220497Smarkm	} else
257220497Smarkm		p->p_retval[0] = fd;
258220497Smarkm
259220497Smarkm	/* connection has been removed from the listen queue */
260220497Smarkm	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
261220497Smarkm
262220497Smarkm	so->so_state &= ~SS_COMP;
263220497Smarkm	so->so_head = NULL;
264220497Smarkm	if (head->so_sigio != NULL)
265220497Smarkm		fsetown(fgetown(head->so_sigio), &so->so_sigio);
266220497Smarkm
267220497Smarkm	fp->f_data = (caddr_t)so;
268220497Smarkm	fp->f_flag = fflag;
269220497Smarkm	fp->f_ops = &socketops;
270220497Smarkm	fp->f_type = DTYPE_SOCKET;
271220497Smarkm	sa = 0;
272220497Smarkm	(void) soaccept(so, &sa);
273220497Smarkm	if (sa == 0) {
274220497Smarkm		namelen = 0;
275220497Smarkm		if (uap->name)
276220497Smarkm			goto gotnoname;
277220497Smarkm		splx(s);
278220497Smarkm		return 0;
279220497Smarkm	}
280220497Smarkm	if (uap->name) {
281220497Smarkm		/* check sa_len before it is destroyed */
282220497Smarkm		if (namelen > sa->sa_len)
283220497Smarkm			namelen = sa->sa_len;
284220497Smarkm#ifdef COMPAT_OLDSOCK
285220497Smarkm		if (compat)
286220497Smarkm			((struct osockaddr *)sa)->sa_family =
287220497Smarkm			    sa->sa_family;
288220497Smarkm#endif
289220497Smarkm		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
290220497Smarkm		if (!error)
291220497Smarkmgotnoname:
292220497Smarkm			error = copyout((caddr_t)&namelen,
293220497Smarkm			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
294220497Smarkm	}
295220497Smarkm	if (sa)
296220497Smarkm		FREE(sa, M_SONAME);
297220497Smarkm	if (error) {
298220497Smarkm		fdp->fd_ofiles[fd] = 0;
299220497Smarkm		ffree(fp);
300220497Smarkm	}
301220497Smarkm	splx(s);
302220497Smarkm	return (error);
303220497Smarkm}
304220497Smarkm
/*
 * accept(): current-format entry point; runs accept1() with the
 * compat flag clear.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int rv;

	rv = accept1(p, uap, 0);
	return (rv);
}
313220497Smarkm
314220497Smarkm#ifdef COMPAT_OLDSOCK
/*
 * oaccept(): old-format entry point; runs accept1() with the compat
 * flag set so the address is returned in osockaddr layout.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int rv;

	rv = accept1(p, uap, 1);
	return (rv);
}
323220497Smarkm#endif /* COMPAT_OLDSOCK */
324220497Smarkm
325220497Smarkm/* ARGSUSED */
326220497Smarkmint
327220497Smarkmconnect(p, uap)
328220497Smarkm	struct proc *p;
329220497Smarkm	register struct connect_args /* {
330220497Smarkm		int	s;
331220497Smarkm		caddr_t	name;
332220497Smarkm		int	namelen;
333220497Smarkm	} */ *uap;
334220497Smarkm{
335220497Smarkm	struct file *fp;
336220497Smarkm	register struct socket *so;
337220497Smarkm	struct sockaddr *sa;
338220497Smarkm	int error, s;
339220497Smarkm
340220497Smarkm	error = getsock(p->p_fd, uap->s, &fp);
341220497Smarkm	if (error)
342220497Smarkm		return (error);
343220497Smarkm	so = (struct socket *)fp->f_data;
344220497Smarkm	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
345220497Smarkm		return (EALREADY);
346220497Smarkm	error = getsockaddr(&sa, uap->name, uap->namelen);
347220497Smarkm	if (error)
348220497Smarkm		return (error);
349220497Smarkm	error = soconnect(so, sa, p);
350220497Smarkm	if (error)
351220497Smarkm		goto bad;
352220497Smarkm	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
353220497Smarkm		FREE(sa, M_SONAME);
354220497Smarkm		return (EINPROGRESS);
355220497Smarkm	}
356220497Smarkm	s = splnet();
357220497Smarkm	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
358220497Smarkm		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
359220497Smarkm		    "connec", 0);
360220497Smarkm		if (error)
361220497Smarkm			break;
362220497Smarkm	}
363220497Smarkm	if (error == 0) {
364220497Smarkm		error = so->so_error;
365220497Smarkm		so->so_error = 0;
366220497Smarkm	}
367220497Smarkm	splx(s);
368220497Smarkmbad:
369220497Smarkm	so->so_state &= ~SS_ISCONNECTING;
370220497Smarkm	FREE(sa, M_SONAME);
371220497Smarkm	if (error == ERESTART)
372220497Smarkm		error = EINTR;
373220497Smarkm	return (error);
374220497Smarkm}
375220497Smarkm
376220497Smarkmint
377220497Smarkmsocketpair(p, uap)
378220497Smarkm	struct proc *p;
379220497Smarkm	register struct socketpair_args /* {
380220497Smarkm		int	domain;
381220497Smarkm		int	type;
382220497Smarkm		int	protocol;
383220497Smarkm		int	*rsv;
384220497Smarkm	} */ *uap;
385220497Smarkm{
386220497Smarkm	register struct filedesc *fdp = p->p_fd;
387220497Smarkm	struct file *fp1, *fp2;
388220497Smarkm	struct socket *so1, *so2;
389220497Smarkm	int fd, error, sv[2];
390220497Smarkm
391220497Smarkm	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
392220497Smarkm	if (error)
393220497Smarkm		return (error);
394220497Smarkm	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
395220497Smarkm	if (error)
396220497Smarkm		goto free1;
397220497Smarkm	error = falloc(p, &fp1, &fd);
398220497Smarkm	if (error)
399220497Smarkm		goto free2;
400220497Smarkm	sv[0] = fd;
401220497Smarkm	fp1->f_data = (caddr_t)so1;
402220497Smarkm	error = falloc(p, &fp2, &fd);
403220497Smarkm	if (error)
404220497Smarkm		goto free3;
405220497Smarkm	fp2->f_data = (caddr_t)so2;
406220497Smarkm	sv[1] = fd;
407220497Smarkm	error = soconnect2(so1, so2);
408220497Smarkm	if (error)
409220497Smarkm		goto free4;
410220497Smarkm	if (uap->type == SOCK_DGRAM) {
411220497Smarkm		/*
412220497Smarkm		 * Datagram socket connection is asymmetric.
413220497Smarkm		 */
414220497Smarkm		 error = soconnect2(so2, so1);
415220497Smarkm		 if (error)
416220497Smarkm			goto free4;
417220497Smarkm	}
418220497Smarkm	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
419220497Smarkm	fp1->f_ops = fp2->f_ops = &socketops;
420220497Smarkm	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
421220497Smarkm	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
422220497Smarkm	return (error);
423220497Smarkmfree4:
424220497Smarkm	fdp->fd_ofiles[sv[1]] = 0;
425220497Smarkm	ffree(fp2);
426220497Smarkmfree3:
427220497Smarkm	fdp->fd_ofiles[sv[0]] = 0;
428220497Smarkm	ffree(fp1);
429220497Smarkmfree2:
430220497Smarkm	(void)soclose(so2);
431220497Smarkmfree1:
432220497Smarkm	(void)soclose(so1);
433220497Smarkm	return (error);
434220497Smarkm}
435220497Smarkm
/*
 * Common back end for the send-side system calls.
 *
 * p	 calling process
 * s	 socket descriptor
 * mp	 message header; msg_iov must already point at a kernel copy of
 *	 the iovec array (the callers in this file copy it in or build
 *	 it on their stack), while the buffers it describes are treated
 *	 as user-space addresses (UIO_USERSPACE)
 * flags flags passed through to the protocol's pru_sosend routine
 *
 * On success the number of bytes sent is stored in p->p_retval[0] and
 * 0 is returned; otherwise an errno value is returned.
 */
static int
sendit(p, s, mp, flags)
	register struct proc *p;
	int s;
	register struct msghdr *mp;
	int flags;
{
	struct file *fp;
	struct uio auio;
	register struct iovec *iov;
	register int i;
	struct mbuf *control;
	struct sockaddr *to;
	int len, error;
	struct socket *so;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	error = getsock(p->p_fd, s, &fp);
	if (error)
		return (error);
	/* Describe the caller's buffers as a user-space write request. */
	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_procp = p;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Total the segment lengths; a negative running sum is rejected. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0)
			return (EINVAL);
	}
	if (mp->msg_name) {
		/* "to" is released at the "bad" label below. */
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error)
			return (error);
	} else
		to = 0;
	if (mp->msg_control) {
		/*
		 * Control data must hold at least a cmsghdr, except for
		 * old-style (MSG_COMPAT) callers, whose data gets a
		 * cmsghdr prepended below.
		 */
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			register struct cmsghdr *cm;

			/* Wrap the old-style data in an SCM_RIGHTS cmsghdr. */
			M_PREPEND(control, sizeof(*cm), M_WAIT);
			if (control == 0) {
				error = ENOBUFS;
				goto bad;
			} else {
				cm = mtod(control, struct cmsghdr *);
				cm->cmsg_len = control->m_len;
				cm->cmsg_level = SOL_SOCKET;
				cm->cmsg_type = SCM_RIGHTS;
			}
		}
#endif
	} else
		control = 0;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO)) {
		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);

		/* Snapshot the iovec array and uio before the send. */
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = auio;
	}
#endif
	len = auio.uio_resid;
	so = (struct socket *)fp->f_data;
	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
						     flags, p);
	if (error) {
		/*
		 * If some data was transferred before an interruption or
		 * a would-block condition, report the partial count
		 * instead of the error.
		 */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	if (error == 0)
		p->p_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			/* Trace only the bytes actually sent. */
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = p->p_retval[0];
			ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
bad:
	if (to)
		FREE(to, M_SONAME);
	return (error);
}
545
546int
547sendto(p, uap)
548	struct proc *p;
549	register struct sendto_args /* {
550		int	s;
551		caddr_t	buf;
552		size_t	len;
553		int	flags;
554		caddr_t	to;
555		int	tolen;
556	} */ *uap;
557{
558	struct msghdr msg;
559	struct iovec aiov;
560
561	msg.msg_name = uap->to;
562	msg.msg_namelen = uap->tolen;
563	msg.msg_iov = &aiov;
564	msg.msg_iovlen = 1;
565	msg.msg_control = 0;
566#ifdef COMPAT_OLDSOCK
567	msg.msg_flags = 0;
568#endif
569	aiov.iov_base = uap->buf;
570	aiov.iov_len = uap->len;
571	return (sendit(p, uap->s, &msg, uap->flags));
572}
573
574#ifdef COMPAT_OLDSOCK
575int
576osend(p, uap)
577	struct proc *p;
578	register struct osend_args /* {
579		int	s;
580		caddr_t	buf;
581		int	len;
582		int	flags;
583	} */ *uap;
584{
585	struct msghdr msg;
586	struct iovec aiov;
587
588	msg.msg_name = 0;
589	msg.msg_namelen = 0;
590	msg.msg_iov = &aiov;
591	msg.msg_iovlen = 1;
592	aiov.iov_base = uap->buf;
593	aiov.iov_len = uap->len;
594	msg.msg_control = 0;
595	msg.msg_flags = 0;
596	return (sendit(p, uap->s, &msg, uap->flags));
597}
598
599int
600osendmsg(p, uap)
601	struct proc *p;
602	register struct osendmsg_args /* {
603		int	s;
604		caddr_t	msg;
605		int	flags;
606	} */ *uap;
607{
608	struct msghdr msg;
609	struct iovec aiov[UIO_SMALLIOV], *iov;
610	int error;
611
612	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
613	if (error)
614		return (error);
615	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
616		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
617			return (EMSGSIZE);
618		MALLOC(iov, struct iovec *,
619		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
620		      M_WAITOK);
621	} else
622		iov = aiov;
623	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
624	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
625	if (error)
626		goto done;
627	msg.msg_flags = MSG_COMPAT;
628	msg.msg_iov = iov;
629	error = sendit(p, uap->s, &msg, uap->flags);
630done:
631	if (iov != aiov)
632		FREE(iov, M_IOV);
633	return (error);
634}
635#endif
636
637int
638sendmsg(p, uap)
639	struct proc *p;
640	register struct sendmsg_args /* {
641		int	s;
642		caddr_t	msg;
643		int	flags;
644	} */ *uap;
645{
646	struct msghdr msg;
647	struct iovec aiov[UIO_SMALLIOV], *iov;
648	int error;
649
650	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
651	if (error)
652		return (error);
653	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
654		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
655			return (EMSGSIZE);
656		MALLOC(iov, struct iovec *,
657		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
658		       M_WAITOK);
659	} else
660		iov = aiov;
661	if (msg.msg_iovlen &&
662	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
663	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
664		goto done;
665	msg.msg_iov = iov;
666#ifdef COMPAT_OLDSOCK
667	msg.msg_flags = 0;
668#endif
669	error = sendit(p, uap->s, &msg, uap->flags);
670done:
671	if (iov != aiov)
672		FREE(iov, M_IOV);
673	return (error);
674}
675
/*
 * Common back end for the receive-side system calls.
 *
 * p	    calling process
 * s	    socket descriptor
 * mp	    message header; msg_iov must already point at a kernel copy
 *	    of the iovec array, while the buffers it describes are
 *	    treated as user-space addresses (UIO_USERSPACE).  msg_flags
 *	    is passed by reference to pru_soreceive, and msg_namelen and
 *	    msg_controllen are updated with the lengths copied out.
 * namelenp if non-NULL, user address that receives the length of the
 *	    source address as an int
 *
 * On success the number of bytes received is stored in p->p_retval[0]
 * and 0 is returned; otherwise an errno value is returned.
 */
static int
recvit(p, s, mp, namelenp)
	register struct proc *p;
	int s;
	register struct msghdr *mp;
	caddr_t namelenp;
{
	struct file *fp;
	struct uio auio;
	register struct iovec *iov;
	register int i;
	int len, error;
	struct mbuf *m, *control = 0;
	caddr_t ctlbuf;
	struct socket *so;
	struct sockaddr *fromsa = 0;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	error = getsock(p->p_fd, s, &fp);
	if (error)
		return (error);
	/* Describe the caller's buffers as a user-space read request. */
	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_procp = p;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Total the segment lengths; a negative running sum is rejected. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0)
			return (EINVAL);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO)) {
		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);

		/* Snapshot the iovec array and uio before the receive. */
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = auio;
	}
#endif
	len = auio.uio_resid;
	so = (struct socket *)fp->f_data;
	/* Control data is requested only if the caller gave a buffer. */
	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
	    &mp->msg_flags);
	if (error) {
		/*
		 * If some data was transferred before an interruption or
		 * a would-block condition, report the partial count
		 * instead of the error.
		 */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			/* Trace only the bytes actually received. */
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio.uio_resid;
			ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error)
		goto out;
	p->p_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		/* Copy out the source address, truncated to the caller's size. */
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == 0)
			len = 0;
		else {
#ifndef MIN
#define MIN(a,b) ((a)>(b)?(b):(a))
#endif
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			error = copyout(fromsa,
			    (caddr_t)mp->msg_name, (unsigned)len);
			if (error)
				goto out;
		}
		mp->msg_namelen = len;
		if (namelenp &&
		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				error = 0;	/* old recvfrom didn't check */
			else
#endif
			goto out;
		}
	}
	if (mp->msg_control) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = (caddr_t) mp->msg_control;

		/*
		 * Copy the control mbuf chain out one mbuf at a time,
		 * setting MSG_CTRUNC when the caller's buffer is too
		 * small for the current mbuf.
		 */
		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout((caddr_t)mtod(m, caddr_t),
					ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		/* Report how many control bytes were actually copied out. */
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	if (fromsa)
		FREE(fromsa, M_SONAME);
	if (control)
		m_freem(control);
	return (error);
}
828
829int
830recvfrom(p, uap)
831	struct proc *p;
832	register struct recvfrom_args /* {
833		int	s;
834		caddr_t	buf;
835		size_t	len;
836		int	flags;
837		caddr_t	from;
838		int	*fromlenaddr;
839	} */ *uap;
840{
841	struct msghdr msg;
842	struct iovec aiov;
843	int error;
844
845	if (uap->fromlenaddr) {
846		error = copyin((caddr_t)uap->fromlenaddr,
847		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
848		if (error)
849			return (error);
850	} else
851		msg.msg_namelen = 0;
852	msg.msg_name = uap->from;
853	msg.msg_iov = &aiov;
854	msg.msg_iovlen = 1;
855	aiov.iov_base = uap->buf;
856	aiov.iov_len = uap->len;
857	msg.msg_control = 0;
858	msg.msg_flags = uap->flags;
859	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
860}
861
862#ifdef COMPAT_OLDSOCK
863int
864orecvfrom(p, uap)
865	struct proc *p;
866	struct recvfrom_args *uap;
867{
868
869	uap->flags |= MSG_COMPAT;
870	return (recvfrom(p, uap));
871}
872#endif
873
874
875#ifdef COMPAT_OLDSOCK
876int
877orecv(p, uap)
878	struct proc *p;
879	register struct orecv_args /* {
880		int	s;
881		caddr_t	buf;
882		int	len;
883		int	flags;
884	} */ *uap;
885{
886	struct msghdr msg;
887	struct iovec aiov;
888
889	msg.msg_name = 0;
890	msg.msg_namelen = 0;
891	msg.msg_iov = &aiov;
892	msg.msg_iovlen = 1;
893	aiov.iov_base = uap->buf;
894	aiov.iov_len = uap->len;
895	msg.msg_control = 0;
896	msg.msg_flags = uap->flags;
897	return (recvit(p, uap->s, &msg, (caddr_t)0));
898}
899
900/*
901 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
902 * overlays the new one, missing only the flags, and with the (old) access
903 * rights where the control fields are now.
904 */
905int
906orecvmsg(p, uap)
907	struct proc *p;
908	register struct orecvmsg_args /* {
909		int	s;
910		struct	omsghdr *msg;
911		int	flags;
912	} */ *uap;
913{
914	struct msghdr msg;
915	struct iovec aiov[UIO_SMALLIOV], *iov;
916	int error;
917
918	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
919	    sizeof (struct omsghdr));
920	if (error)
921		return (error);
922	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
923		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
924			return (EMSGSIZE);
925		MALLOC(iov, struct iovec *,
926		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
927		      M_WAITOK);
928	} else
929		iov = aiov;
930	msg.msg_flags = uap->flags | MSG_COMPAT;
931	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
932	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
933	if (error)
934		goto done;
935	msg.msg_iov = iov;
936	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
937
938	if (msg.msg_controllen && error == 0)
939		error = copyout((caddr_t)&msg.msg_controllen,
940		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
941done:
942	if (iov != aiov)
943		FREE(iov, M_IOV);
944	return (error);
945}
946#endif
947
948int
949recvmsg(p, uap)
950	struct proc *p;
951	register struct recvmsg_args /* {
952		int	s;
953		struct	msghdr *msg;
954		int	flags;
955	} */ *uap;
956{
957	struct msghdr msg;
958	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
959	register int error;
960
961	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
962	if (error)
963		return (error);
964	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
965		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
966			return (EMSGSIZE);
967		MALLOC(iov, struct iovec *,
968		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
969		       M_WAITOK);
970	} else
971		iov = aiov;
972#ifdef COMPAT_OLDSOCK
973	msg.msg_flags = uap->flags &~ MSG_COMPAT;
974#else
975	msg.msg_flags = uap->flags;
976#endif
977	uiov = msg.msg_iov;
978	msg.msg_iov = iov;
979	error = copyin((caddr_t)uiov, (caddr_t)iov,
980	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
981	if (error)
982		goto done;
983	error = recvit(p, uap->s, &msg, (caddr_t)0);
984	if (!error) {
985		msg.msg_iov = uiov;
986		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
987	}
988done:
989	if (iov != aiov)
990		FREE(iov, M_IOV);
991	return (error);
992}
993
994/* ARGSUSED */
995int
996shutdown(p, uap)
997	struct proc *p;
998	register struct shutdown_args /* {
999		int	s;
1000		int	how;
1001	} */ *uap;
1002{
1003	struct file *fp;
1004	int error;
1005
1006	error = getsock(p->p_fd, uap->s, &fp);
1007	if (error)
1008		return (error);
1009	return (soshutdown((struct socket *)fp->f_data, uap->how));
1010}
1011
1012/* ARGSUSED */
1013int
1014setsockopt(p, uap)
1015	struct proc *p;
1016	register struct setsockopt_args /* {
1017		int	s;
1018		int	level;
1019		int	name;
1020		caddr_t	val;
1021		int	valsize;
1022	} */ *uap;
1023{
1024	struct file *fp;
1025	struct sockopt sopt;
1026	int error;
1027
1028	if (uap->val == 0 && uap->valsize != 0)
1029		return (EFAULT);
1030	if (uap->valsize < 0)
1031		return (EINVAL);
1032
1033	error = getsock(p->p_fd, uap->s, &fp);
1034	if (error)
1035		return (error);
1036
1037	sopt.sopt_dir = SOPT_SET;
1038	sopt.sopt_level = uap->level;
1039	sopt.sopt_name = uap->name;
1040	sopt.sopt_val = uap->val;
1041	sopt.sopt_valsize = uap->valsize;
1042	sopt.sopt_p = p;
1043
1044	return (sosetopt((struct socket *)fp->f_data, &sopt));
1045}
1046
1047/* ARGSUSED */
1048int
1049getsockopt(p, uap)
1050	struct proc *p;
1051	register struct getsockopt_args /* {
1052		int	s;
1053		int	level;
1054		int	name;
1055		caddr_t	val;
1056		int	*avalsize;
1057	} */ *uap;
1058{
1059	int	valsize, error;
1060	struct	file *fp;
1061	struct	sockopt sopt;
1062
1063	error = getsock(p->p_fd, uap->s, &fp);
1064	if (error)
1065		return (error);
1066	if (uap->val) {
1067		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1068		    sizeof (valsize));
1069		if (error)
1070			return (error);
1071		if (valsize < 0)
1072			return (EINVAL);
1073	} else
1074		valsize = 0;
1075
1076	sopt.sopt_dir = SOPT_GET;
1077	sopt.sopt_level = uap->level;
1078	sopt.sopt_name = uap->name;
1079	sopt.sopt_val = uap->val;
1080	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1081	sopt.sopt_p = p;
1082
1083	error = sogetopt((struct socket *)fp->f_data, &sopt);
1084	if (error == 0) {
1085		valsize = sopt.sopt_valsize;
1086		error = copyout((caddr_t)&valsize,
1087				(caddr_t)uap->avalsize, sizeof (valsize));
1088	}
1089	return (error);
1090}
1091
1092/*
1093 * Get socket name.
1094 */
1095/* ARGSUSED */
1096static int
1097getsockname1(p, uap, compat)
1098	struct proc *p;
1099	register struct getsockname_args /* {
1100		int	fdes;
1101		caddr_t	asa;
1102		int	*alen;
1103	} */ *uap;
1104	int compat;
1105{
1106	struct file *fp;
1107	register struct socket *so;
1108	struct sockaddr *sa;
1109	int len, error;
1110
1111	error = getsock(p->p_fd, uap->fdes, &fp);
1112	if (error)
1113		return (error);
1114	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1115	if (error)
1116		return (error);
1117	so = (struct socket *)fp->f_data;
1118	sa = 0;
1119	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1120	if (error)
1121		goto bad;
1122	if (sa == 0) {
1123		len = 0;
1124		goto gotnothing;
1125	}
1126
1127	len = MIN(len, sa->sa_len);
1128#ifdef COMPAT_OLDSOCK
1129	if (compat)
1130		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1131#endif
1132	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1133	if (error == 0)
1134gotnothing:
1135		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1136		    sizeof (len));
1137bad:
1138	if (sa)
1139		FREE(sa, M_SONAME);
1140	return (error);
1141}
1142
/*
 * getsockname(2) entry point: getsockname1() with the compat flag off.
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1151
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI variant of getsockname(): getsockname1() with the
 * compat flag on.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
1162
1163/*
1164 * Get name of peer for connected socket.
1165 */
1166/* ARGSUSED */
1167static int
1168getpeername1(p, uap, compat)
1169	struct proc *p;
1170	register struct getpeername_args /* {
1171		int	fdes;
1172		caddr_t	asa;
1173		int	*alen;
1174	} */ *uap;
1175	int compat;
1176{
1177	struct file *fp;
1178	register struct socket *so;
1179	struct sockaddr *sa;
1180	int len, error;
1181
1182	error = getsock(p->p_fd, uap->fdes, &fp);
1183	if (error)
1184		return (error);
1185	so = (struct socket *)fp->f_data;
1186	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
1187		return (ENOTCONN);
1188	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1189	if (error)
1190		return (error);
1191	sa = 0;
1192	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1193	if (error)
1194		goto bad;
1195	if (sa == 0) {
1196		len = 0;
1197		goto gotnothing;
1198	}
1199	len = MIN(len, sa->sa_len);
1200#ifdef COMPAT_OLDSOCK
1201	if (compat)
1202		((struct osockaddr *)sa)->sa_family =
1203		    sa->sa_family;
1204#endif
1205	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1206	if (error)
1207		goto bad;
1208gotnothing:
1209	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1210bad:
1211	if (sa) FREE(sa, M_SONAME);
1212	return (error);
1213}
1214
/*
 * getpeername(2) entry point: getpeername1() with the compat flag off.
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1223
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI variant of getpeername(): getpeername1() with the
 * compat flag on.
 */
int
ogetpeername(p, uap)
	struct proc *p;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1235
1236int
1237sockargs(mp, buf, buflen, type)
1238	struct mbuf **mp;
1239	caddr_t buf;
1240	int buflen, type;
1241{
1242	register struct sockaddr *sa;
1243	register struct mbuf *m;
1244	int error;
1245
1246	if ((u_int)buflen > MLEN) {
1247#ifdef COMPAT_OLDSOCK
1248		if (type == MT_SONAME && (u_int)buflen <= 112)
1249			buflen = MLEN;		/* unix domain compat. hack */
1250		else
1251#endif
1252		return (EINVAL);
1253	}
1254	m = m_get(M_WAIT, type);
1255	if (m == NULL)
1256		return (ENOBUFS);
1257	m->m_len = buflen;
1258	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1259	if (error)
1260		(void) m_free(m);
1261	else {
1262		*mp = m;
1263		if (type == MT_SONAME) {
1264			sa = mtod(m, struct sockaddr *);
1265
1266#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1267			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1268				sa->sa_family = sa->sa_len;
1269#endif
1270			sa->sa_len = buflen;
1271		}
1272	}
1273	return (error);
1274}
1275
1276int
1277getsockaddr(namp, uaddr, len)
1278	struct sockaddr **namp;
1279	caddr_t uaddr;
1280	size_t len;
1281{
1282	struct sockaddr *sa;
1283	int error;
1284
1285	if (len > SOCK_MAXADDRLEN)
1286		return ENAMETOOLONG;
1287	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1288	error = copyin(uaddr, sa, len);
1289	if (error) {
1290		FREE(sa, M_SONAME);
1291	} else {
1292#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1293		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1294			sa->sa_family = sa->sa_len;
1295#endif
1296		sa->sa_len = len;
1297		*namp = sa;
1298	}
1299	return error;
1300}
1301
1302int
1303getsock(fdp, fdes, fpp)
1304	struct filedesc *fdp;
1305	int fdes;
1306	struct file **fpp;
1307{
1308	register struct file *fp;
1309
1310	if ((unsigned)fdes >= fdp->fd_nfiles ||
1311	    (fp = fdp->fd_ofiles[fdes]) == NULL)
1312		return (EBADF);
1313	if (fp->f_type != DTYPE_SOCKET)
1314		return (ENOTSOCK);
1315	*fpp = fp;
1316	return (0);
1317}
1318
1319/*
1320 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1321 * XXX - The sf_buf functions are currently private to sendfile(2), so have
1322 * been made static, but may be useful in the future for doing zero-copy in
1323 * other parts of the networking code.
1324 */
1325static void
1326sf_buf_init(void *arg)
1327{
1328	int i;
1329
1330	SLIST_INIT(&sf_freelist);
1331	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1332	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1333	bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1334	for (i = 0; i < nsfbufs; i++) {
1335		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1336		SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1337	}
1338}
1339
1340/*
1341 * Get an sf_buf from the freelist. Will block if none are available.
1342 */
1343static struct sf_buf *
1344sf_buf_alloc()
1345{
1346	struct sf_buf *sf;
1347	int s;
1348
1349	s = splimp();
1350	while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1351		sf_buf_alloc_want = 1;
1352		tsleep(&sf_freelist, PVM, "sfbufa", 0);
1353	}
1354	SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1355	splx(s);
1356	return (sf);
1357}
1358
/*
 * Map an address inside the sf_buf KVA window to the sf_buf header
 * for that page: the page index relative to sf_base selects the entry
 * in sf_bufs[].
 */
#define dtosf(va)							\
	(&sf_bufs[((uintptr_t)(va) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1360
1361/*
1362 *
1363 * Detatch mapped page and release resources back to the system.
1364 *
1365 * Must be called at splimp.
1366 */
1367static void
1368sf_buf_free(caddr_t addr, void *args)
1369{
1370	struct sf_buf *sf;
1371	struct vm_page *m;
1372	int s;
1373
1374	sf = dtosf(addr);
1375	pmap_qremove((vm_offset_t)addr, 1);
1376	m = sf->m;
1377	s = splvm();
1378	vm_page_unwire(m, 0);
1379	/*
1380	 * Check for the object going away on us. This can
1381	 * happen since we don't hold a reference to it.
1382	 * If so, we're responsible for freeing the page.
1383	 */
1384	if (m->wire_count == 0 && m->object == NULL)
1385		vm_page_free(m);
1386	splx(s);
1387	sf->m = NULL;
1388	SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1389	if (sf_buf_alloc_want) {
1390		sf_buf_alloc_want = 0;
1391		wakeup(&sf_freelist);
1392	}
1393}
1394
1395/*
1396 * sendfile(2).
1397 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1398 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1399 *
1400 * Send a file specified by 'fd' and starting at 'offset' to a socket
1401 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1402 * nbytes == 0. Optionally add a header and/or trailer to the socket
1403 * output. If specified, write the total number of bytes sent into *sbytes.
1404 */
1405int
1406sendfile(struct proc *p, struct sendfile_args *uap)
1407{
1408	struct file *fp;
1409	struct filedesc *fdp = p->p_fd;
1410	struct vnode *vp;
1411	struct vm_object *obj;
1412	struct socket *so;
1413	struct mbuf *m;
1414	struct sf_buf *sf;
1415	struct vm_page *pg;
1416	struct writev_args nuap;
1417	struct sf_hdtr hdtr;
1418	off_t off, xfsize, sbytes = 0;
1419	int error = 0, s;
1420
1421	vp = NULL;
1422	/*
1423	 * Do argument checking. Must be a regular file in, stream
1424	 * type and connected socket out, positive offset.
1425	 */
1426	fp = getfp(fdp, uap->fd, FREAD);
1427	if (fp == NULL) {
1428		error = EBADF;
1429		goto done;
1430	}
1431	if (fp->f_type != DTYPE_VNODE) {
1432		error = EINVAL;
1433		goto done;
1434	}
1435	vp = (struct vnode *)fp->f_data;
1436	vref(vp);
1437	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1438		error = EINVAL;
1439		goto done;
1440	}
1441	error = getsock(p->p_fd, uap->s, &fp);
1442	if (error)
1443		goto done;
1444	so = (struct socket *)fp->f_data;
1445	if (so->so_type != SOCK_STREAM) {
1446		error = EINVAL;
1447		goto done;
1448	}
1449	if ((so->so_state & SS_ISCONNECTED) == 0) {
1450		error = ENOTCONN;
1451		goto done;
1452	}
1453	if (uap->offset < 0) {
1454		error = EINVAL;
1455		goto done;
1456	}
1457
1458	/*
1459	 * If specified, get the pointer to the sf_hdtr struct for
1460	 * any headers/trailers.
1461	 */
1462	if (uap->hdtr != NULL) {
1463		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1464		if (error)
1465			goto done;
1466		/*
1467		 * Send any headers. Wimp out and use writev(2).
1468		 */
1469		if (hdtr.headers != NULL) {
1470			nuap.fd = uap->s;
1471			nuap.iovp = hdtr.headers;
1472			nuap.iovcnt = hdtr.hdr_cnt;
1473			error = writev(p, &nuap);
1474			if (error)
1475				goto done;
1476			sbytes += p->p_retval[0];
1477		}
1478	}
1479
1480	/*
1481	 * Protect against multiple writers to the socket.
1482	 */
1483	(void) sblock(&so->so_snd, M_WAITOK);
1484
1485	/*
1486	 * Loop through the pages in the file, starting with the requested
1487	 * offset. Get a file page (do I/O if necessary), map the file page
1488	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1489	 * it on the socket.
1490	 */
1491	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1492		vm_pindex_t pindex;
1493		vm_offset_t pgoff;
1494
1495		pindex = OFF_TO_IDX(off);
1496retry_lookup:
1497		/*
1498		 * Calculate the amount to transfer. Not to exceed a page,
1499		 * the EOF, or the passed in nbytes.
1500		 */
1501		xfsize = obj->un_pager.vnp.vnp_size - off;
1502		if (xfsize > PAGE_SIZE)
1503			xfsize = PAGE_SIZE;
1504		pgoff = (vm_offset_t)(off & PAGE_MASK);
1505		if (PAGE_SIZE - pgoff < xfsize)
1506			xfsize = PAGE_SIZE - pgoff;
1507		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1508			xfsize = uap->nbytes - sbytes;
1509		if (xfsize <= 0)
1510			break;
1511		/*
1512		 * Optimize the non-blocking case by looking at the socket space
1513		 * before going to the extra work of constituting the sf_buf.
1514		 */
1515		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1516			if (so->so_state & SS_CANTSENDMORE)
1517				error = EPIPE;
1518			else
1519				error = EAGAIN;
1520			sbunlock(&so->so_snd);
1521			goto done;
1522		}
1523		/*
1524		 * Attempt to look up the page.
1525		 *
1526		 *	Allocate if not found
1527		 *
1528		 *	Wait and loop if busy.
1529		 */
1530		pg = vm_page_lookup(obj, pindex);
1531
1532		if (pg == NULL) {
1533			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1534			if (pg == NULL) {
1535				VM_WAIT;
1536				goto retry_lookup;
1537			}
1538			vm_page_wakeup(pg);
1539		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1540			goto retry_lookup;
1541		}
1542
1543		/*
1544		 * Wire the page so it does not get ripped out from under
1545		 * us.
1546		 */
1547
1548		vm_page_wire(pg);
1549
1550		/*
1551		 * If page is not valid for what we need, initiate I/O
1552		 */
1553
1554		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1555			struct uio auio;
1556			struct iovec aiov;
1557			int bsize;
1558
1559			/*
1560			 * Ensure that our page is still around when the I/O
1561			 * completes.
1562			 */
1563			vm_page_io_start(pg);
1564
1565			/*
1566			 * Get the page from backing store.
1567			 */
1568			bsize = vp->v_mount->mnt_stat.f_iosize;
1569			auio.uio_iov = &aiov;
1570			auio.uio_iovcnt = 1;
1571			aiov.iov_base = 0;
1572			aiov.iov_len = MAXBSIZE;
1573			auio.uio_resid = MAXBSIZE;
1574			auio.uio_offset = trunc_page(off);
1575			auio.uio_segflg = UIO_NOCOPY;
1576			auio.uio_rw = UIO_READ;
1577			auio.uio_procp = p;
1578			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1579			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1580			        p->p_ucred);
1581			VOP_UNLOCK(vp, 0, p);
1582			vm_page_flag_clear(pg, PG_ZERO);
1583			vm_page_io_finish(pg);
1584			if (error) {
1585				vm_page_unwire(pg, 0);
1586				/*
1587				 * See if anyone else might know about this page.
1588				 * If not and it is not valid, then free it.
1589				 */
1590				if (pg->wire_count == 0 && pg->valid == 0 &&
1591				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1592				    pg->hold_count == 0)
1593					vm_page_free(pg);
1594				sbunlock(&so->so_snd);
1595				goto done;
1596			}
1597		}
1598
1599		/*
1600		 * Allocate a kernel virtual page and insert the physical page
1601		 * into it.
1602		 */
1603
1604		sf = sf_buf_alloc();
1605		sf->m = pg;
1606		pmap_qenter(sf->kva, &pg, 1);
1607		/*
1608		 * Get an mbuf header and set it up as having external storage.
1609		 */
1610		MGETHDR(m, M_WAIT, MT_DATA);
1611		if (m == NULL) {
1612			error = ENOBUFS;
1613			goto done;
1614		}
1615		/*
1616		 * Setup external storage for mbuf.
1617		 */
1618		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL);
1619		m->m_data = (char *) sf->kva + pgoff;
1620		m->m_pkthdr.len = m->m_len = xfsize;
1621		/*
1622		 * Add the buffer to the socket buffer chain.
1623		 */
1624		s = splnet();
1625retry_space:
1626		/*
1627		 * Make sure that the socket is still able to take more data.
1628		 * CANTSENDMORE being true usually means that the connection
1629		 * was closed. so_error is true when an error was sensed after
1630		 * a previous send.
1631		 * The state is checked after the page mapping and buffer
1632		 * allocation above since those operations may block and make
1633		 * any socket checks stale. From this point forward, nothing
1634		 * blocks before the pru_send (or more accurately, any blocking
1635		 * results in a loop back to here to re-check).
1636		 */
1637		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1638			if (so->so_state & SS_CANTSENDMORE) {
1639				error = EPIPE;
1640			} else {
1641				error = so->so_error;
1642				so->so_error = 0;
1643			}
1644			m_freem(m);
1645			sbunlock(&so->so_snd);
1646			splx(s);
1647			goto done;
1648		}
1649		/*
1650		 * Wait for socket space to become available. We do this just
1651		 * after checking the connection state above in order to avoid
1652		 * a race condition with sbwait().
1653		 */
1654		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1655			if (so->so_state & SS_NBIO) {
1656				m_freem(m);
1657				sbunlock(&so->so_snd);
1658				splx(s);
1659				error = EAGAIN;
1660				goto done;
1661			}
1662			error = sbwait(&so->so_snd);
1663			/*
1664			 * An error from sbwait usually indicates that we've
1665			 * been interrupted by a signal. If we've sent anything
1666			 * then return bytes sent, otherwise return the error.
1667			 */
1668			if (error) {
1669				m_freem(m);
1670				sbunlock(&so->so_snd);
1671				splx(s);
1672				goto done;
1673			}
1674			goto retry_space;
1675		}
1676		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1677		splx(s);
1678		if (error) {
1679			sbunlock(&so->so_snd);
1680			goto done;
1681		}
1682	}
1683	sbunlock(&so->so_snd);
1684
1685	/*
1686	 * Send trailers. Wimp out and use writev(2).
1687	 */
1688	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1689			nuap.fd = uap->s;
1690			nuap.iovp = hdtr.trailers;
1691			nuap.iovcnt = hdtr.trl_cnt;
1692			error = writev(p, &nuap);
1693			if (error)
1694				goto done;
1695			sbytes += p->p_retval[0];
1696	}
1697
1698done:
1699	if (uap->sbytes != NULL) {
1700		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1701	}
1702	if (vp)
1703		vrele(vp);
1704	return (error);
1705}
1706