kern_sendfile.c revision 97658
1/*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37 * $FreeBSD: head/sys/kern/uipc_syscalls.c 97658 2002-05-31 11:52:35Z tanimura $
38 */
39
40#include "opt_compat.h"
41#include "opt_ktrace.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/mutex.h>
48#include <sys/sysproto.h>
49#include <sys/malloc.h>
50#include <sys/filedesc.h>
51#include <sys/event.h>
52#include <sys/proc.h>
53#include <sys/fcntl.h>
54#include <sys/file.h>
55#include <sys/lock.h>
56#include <sys/mount.h>
57#include <sys/mbuf.h>
58#include <sys/protosw.h>
59#include <sys/socket.h>
60#include <sys/socketvar.h>
61#include <sys/signalvar.h>
62#include <sys/uio.h>
63#include <sys/vnode.h>
64#ifdef KTRACE
65#include <sys/ktrace.h>
66#endif
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_page.h>
71#include <vm/vm_pageout.h>
72#include <vm/vm_kern.h>
73#include <vm/vm_extern.h>
74
/*
 * Forward declarations for the sf_buf helpers.  sf_buf_init() is hooked
 * into system start-up by the SYSINIT() line that follows it
 * (SI_SUB_MBUF, SI_ORDER_ANY).
 */
75static void sf_buf_init(void *arg);
76SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
77static struct sf_buf *sf_buf_alloc(void);
78static void sf_buf_free(caddr_t addr, void *args);
79
/* Shared back ends used by the send*() and recv*() entry points. */
80static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
81static int recvit(struct thread *td, int s, struct msghdr *mp,
82		  caddr_t namelenp);
83
/*
 * Shared implementations; the trailing "compat" argument selects the old
 * (COMPAT_OLDSOCK) sockaddr layout when non-zero.
 */
84static int accept1(struct thread *td, struct accept_args *uap, int compat);
85static int getsockname1(struct thread *td, struct getsockname_args *uap,
86			int compat);
87static int getpeername1(struct thread *td, struct getpeername_args *uap,
88			int compat);
89
90/*
91 * Expanded sf_freelist head.  Really an SLIST_HEAD() in disguise: the list
92 * head (sf_head) is bundled with the sf_lock mutex in a single object.
 * NOTE(review): sf_lock presumably guards sf_head -- confirm against the
 * sf_buf_alloc()/sf_buf_free() implementations.
93 */
94static struct {
95	SLIST_HEAD(, sf_buf) sf_head;
96	struct mtx sf_lock;
97} sf_freelist;
98
/*
 * sf_buf bookkeeping.  NOTE(review): presumably the base address of the
 * sf_buf mapping area, the backing array of sf_buf structures and a count
 * of threads waiting in sf_buf_alloc() -- confirm against sf_buf_init().
 */
99static vm_offset_t sf_base;
100static struct sf_buf *sf_bufs;
101static u_int sf_buf_alloc_want;
102
103/*
104 * System call interface to the socket abstraction.
 *
 * Either COMPAT_43 or COMPAT_SUNOS enables COMPAT_OLDSOCK, which gates the
 * old-style o*() entry points and the old sockaddr/msghdr handling.
105 */
106#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
107#define COMPAT_OLDSOCK
108#endif
109
/* File operations vector installed on the struct file of every socket. */
110extern	struct fileops socketops;
111
112/*
113 * MPSAFE
114 */
115int
116socket(td, uap)
117	struct thread *td;
118	register struct socket_args /* {
119		int	domain;
120		int	type;
121		int	protocol;
122	} */ *uap;
123{
124	struct filedesc *fdp;
125	struct socket *so;
126	struct file *fp;
127	int fd, error;
128
129	mtx_lock(&Giant);
130	fdp = td->td_proc->p_fd;
131	error = falloc(td, &fp, &fd);
132	if (error)
133		goto done2;
134	fhold(fp);
135	error = socreate(uap->domain, &so, uap->type, uap->protocol,
136	    td->td_ucred, td);
137	FILEDESC_LOCK(fdp);
138	if (error) {
139		if (fdp->fd_ofiles[fd] == fp) {
140			fdp->fd_ofiles[fd] = NULL;
141			FILEDESC_UNLOCK(fdp);
142			fdrop(fp, td);
143		} else
144			FILEDESC_UNLOCK(fdp);
145	} else {
146		fp->f_data = (caddr_t)so;	/* already has ref count */
147		fp->f_flag = FREAD|FWRITE;
148		fp->f_ops = &socketops;
149		fp->f_type = DTYPE_SOCKET;
150		FILEDESC_UNLOCK(fdp);
151		td->td_retval[0] = fd;
152	}
153	fdrop(fp, td);
154done2:
155	mtx_unlock(&Giant);
156	return (error);
157}
158
159/*
160 * MPSAFE
161 */
162/* ARGSUSED */
163int
164bind(td, uap)
165	struct thread *td;
166	register struct bind_args /* {
167		int	s;
168		caddr_t	name;
169		int	namelen;
170	} */ *uap;
171{
172	struct socket *so;
173	struct sockaddr *sa;
174	int error;
175
176	mtx_lock(&Giant);
177	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
178		goto done2;
179	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
180		goto done1;
181	error = sobind(so, sa, td);
182	FREE(sa, M_SONAME);
183done1:
184	fputsock(so);
185done2:
186	mtx_unlock(&Giant);
187	return (error);
188}
189
190/*
191 * MPSAFE
192 */
193/* ARGSUSED */
194int
195listen(td, uap)
196	struct thread *td;
197	register struct listen_args /* {
198		int	s;
199		int	backlog;
200	} */ *uap;
201{
202	struct socket *so;
203	int error;
204
205	mtx_lock(&Giant);
206	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
207		error = solisten(so, uap->backlog, td);
208		fputsock(so);
209	}
210	mtx_unlock(&Giant);
211	return(error);
212}
213
214/*
215 * accept1()
216 * MPSAFE
217 */
218static int
219accept1(td, uap, compat)
220	struct thread *td;
221	register struct accept_args /* {
222		int	s;
223		caddr_t	name;
224		int	*anamelen;
225	} */ *uap;
226	int compat;
227{
228	struct filedesc *fdp;
229	struct file *nfp = NULL;
230	struct sockaddr *sa;
231	int namelen, error, s;
232	struct socket *head, *so;
233	int fd;
234	u_int fflag;
235
236	mtx_lock(&Giant);
237	fdp = td->td_proc->p_fd;
238	if (uap->name) {
239		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
240			sizeof (namelen));
241		if(error)
242			goto done2;
243	}
244	error = fgetsock(td, uap->s, &head, &fflag);
245	if (error)
246		goto done2;
247	s = splnet();
248	if ((head->so_options & SO_ACCEPTCONN) == 0) {
249		splx(s);
250		error = EINVAL;
251		goto done;
252	}
253	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
254		splx(s);
255		error = EWOULDBLOCK;
256		goto done;
257	}
258	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
259		if (head->so_state & SS_CANTRCVMORE) {
260			head->so_error = ECONNABORTED;
261			break;
262		}
263		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
264		    "accept", 0);
265		if (error) {
266			splx(s);
267			goto done;
268		}
269	}
270	if (head->so_error) {
271		error = head->so_error;
272		head->so_error = 0;
273		splx(s);
274		goto done;
275	}
276
277	/*
278	 * At this point we know that there is at least one connection
279	 * ready to be accepted. Remove it from the queue prior to
280	 * allocating the file descriptor for it since falloc() may
281	 * block allowing another process to accept the connection
282	 * instead.
283	 */
284	so = TAILQ_FIRST(&head->so_comp);
285	TAILQ_REMOVE(&head->so_comp, so, so_list);
286	head->so_qlen--;
287
288	error = falloc(td, &nfp, &fd);
289	if (error) {
290		/*
291		 * Probably ran out of file descriptors. Put the
292		 * unaccepted connection back onto the queue and
293		 * do another wakeup so some other process might
294		 * have a chance at it.
295		 */
296		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
297		head->so_qlen++;
298		wakeup_one(&head->so_timeo);
299		splx(s);
300		goto done;
301	}
302	fhold(nfp);
303	td->td_retval[0] = fd;
304
305	/* connection has been removed from the listen queue */
306	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
307
308	so->so_state &= ~SS_COMP;
309	so->so_head = NULL;
310	if (head->so_sigio != NULL)
311		fsetown(fgetown(head->so_sigio), &so->so_sigio);
312
313	FILE_LOCK(nfp);
314	soref(so);			/* file descriptor reference */
315	nfp->f_data = (caddr_t)so;	/* nfp has ref count from falloc */
316	nfp->f_flag = fflag;
317	nfp->f_ops = &socketops;
318	nfp->f_type = DTYPE_SOCKET;
319	FILE_UNLOCK(nfp);
320	sa = 0;
321	error = soaccept(so, &sa);
322	if (error) {
323		/*
324		 * return a namelen of zero for older code which might
325	 	 * ignore the return value from accept.
326		 */
327		if (uap->name != NULL) {
328			namelen = 0;
329			(void) copyout((caddr_t)&namelen,
330			    (caddr_t)uap->anamelen, sizeof(*uap->anamelen));
331		}
332		goto noconnection;
333	}
334	if (sa == NULL) {
335		namelen = 0;
336		if (uap->name)
337			goto gotnoname;
338		splx(s);
339		error = 0;
340		goto done;
341	}
342	if (uap->name) {
343		/* check sa_len before it is destroyed */
344		if (namelen > sa->sa_len)
345			namelen = sa->sa_len;
346#ifdef COMPAT_OLDSOCK
347		if (compat)
348			((struct osockaddr *)sa)->sa_family =
349			    sa->sa_family;
350#endif
351		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
352		if (!error)
353gotnoname:
354			error = copyout((caddr_t)&namelen,
355			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
356	}
357noconnection:
358	if (sa)
359		FREE(sa, M_SONAME);
360
361	/*
362	 * close the new descriptor, assuming someone hasn't ripped it
363	 * out from under us.
364	 */
365	if (error) {
366		FILEDESC_LOCK(fdp);
367		if (fdp->fd_ofiles[fd] == nfp) {
368			fdp->fd_ofiles[fd] = NULL;
369			FILEDESC_UNLOCK(fdp);
370			fdrop(nfp, td);
371		} else {
372			FILEDESC_UNLOCK(fdp);
373		}
374	}
375	splx(s);
376
377	/*
378	 * Release explicitly held references before returning.
379	 */
380done:
381	if (nfp != NULL)
382		fdrop(nfp, td);
383	fputsock(head);
384done2:
385	mtx_unlock(&Giant);
386	return (error);
387}
388
/*
 * accept() - current entry point; forwards to accept1() with the compat
 * flag cleared.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	const int compat = 0;

	return (accept1(td, uap, compat));
}
400
#ifdef COMPAT_OLDSOCK
/*
 * oaccept() - old-style entry point; forwards to accept1() with the
 * compat flag set.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	const int compat = 1;

	return (accept1(td, uap, compat));
}
#endif /* COMPAT_OLDSOCK */
414
415/*
416 * MPSAFE
417 */
418/* ARGSUSED */
419int
420connect(td, uap)
421	struct thread *td;
422	register struct connect_args /* {
423		int	s;
424		caddr_t	name;
425		int	namelen;
426	} */ *uap;
427{
428	struct socket *so;
429	struct sockaddr *sa;
430	int error, s;
431
432	mtx_lock(&Giant);
433	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
434		goto done2;
435	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
436		error = EALREADY;
437		goto done1;
438	}
439	error = getsockaddr(&sa, uap->name, uap->namelen);
440	if (error)
441		goto done1;
442	error = soconnect(so, sa, td);
443	if (error)
444		goto bad;
445	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
446		FREE(sa, M_SONAME);
447		error = EINPROGRESS;
448		goto done1;
449	}
450	s = splnet();
451	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
452		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, "connec", 0);
453		if (error)
454			break;
455	}
456	if (error == 0) {
457		error = so->so_error;
458		so->so_error = 0;
459	}
460	splx(s);
461bad:
462	so->so_state &= ~SS_ISCONNECTING;
463	FREE(sa, M_SONAME);
464	if (error == ERESTART)
465		error = EINTR;
466done1:
467	fputsock(so);
468done2:
469	mtx_unlock(&Giant);
470	return (error);
471}
472
473/*
474 * MPSAFE
475 */
476int
477socketpair(td, uap)
478	struct thread *td;
479	register struct socketpair_args /* {
480		int	domain;
481		int	type;
482		int	protocol;
483		int	*rsv;
484	} */ *uap;
485{
486	register struct filedesc *fdp = td->td_proc->p_fd;
487	struct file *fp1, *fp2;
488	struct socket *so1, *so2;
489	int fd, error, sv[2];
490
491	mtx_lock(&Giant);
492	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
493	    td->td_ucred, td);
494	if (error)
495		goto done2;
496	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
497	    td->td_ucred, td);
498	if (error)
499		goto free1;
500	error = falloc(td, &fp1, &fd);
501	if (error)
502		goto free2;
503	fhold(fp1);
504	sv[0] = fd;
505	fp1->f_data = (caddr_t)so1;	/* so1 already has ref count */
506	error = falloc(td, &fp2, &fd);
507	if (error)
508		goto free3;
509	fhold(fp2);
510	fp2->f_data = (caddr_t)so2;	/* so2 already has ref count */
511	sv[1] = fd;
512	error = soconnect2(so1, so2);
513	if (error)
514		goto free4;
515	if (uap->type == SOCK_DGRAM) {
516		/*
517		 * Datagram socket connection is asymmetric.
518		 */
519		 error = soconnect2(so2, so1);
520		 if (error)
521			goto free4;
522	}
523	FILE_LOCK(fp1);
524	fp1->f_flag = FREAD|FWRITE;
525	fp1->f_ops = &socketops;
526	fp1->f_type = DTYPE_SOCKET;
527	FILE_UNLOCK(fp1);
528	FILE_LOCK(fp2);
529	fp2->f_flag = FREAD|FWRITE;
530	fp2->f_ops = &socketops;
531	fp2->f_type = DTYPE_SOCKET;
532	FILE_UNLOCK(fp2);
533	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
534	fdrop(fp1, td);
535	fdrop(fp2, td);
536	goto done2;
537free4:
538	FILEDESC_LOCK(fdp);
539	if (fdp->fd_ofiles[sv[1]] == fp2) {
540		fdp->fd_ofiles[sv[1]] = NULL;
541		FILEDESC_UNLOCK(fdp);
542		fdrop(fp2, td);
543	} else
544		FILEDESC_UNLOCK(fdp);
545	fdrop(fp2, td);
546free3:
547	FILEDESC_LOCK(fdp);
548	if (fdp->fd_ofiles[sv[0]] == fp1) {
549		fdp->fd_ofiles[sv[0]] = NULL;
550		FILEDESC_UNLOCK(fdp);
551		fdrop(fp1, td);
552	} else
553		FILEDESC_UNLOCK(fdp);
554	fdrop(fp1, td);
555free2:
556	(void)soclose(so2);
557free1:
558	(void)soclose(so1);
559done2:
560	mtx_unlock(&Giant);
561	return (error);
562}
563
564static int
565sendit(td, s, mp, flags)
566	register struct thread *td;
567	int s;
568	register struct msghdr *mp;
569	int flags;
570{
571	struct uio auio;
572	register struct iovec *iov;
573	register int i;
574	struct mbuf *control;
575	struct sockaddr *to = NULL;
576	int len, error;
577	struct socket *so;
578#ifdef KTRACE
579	struct iovec *ktriov = NULL;
580	struct uio ktruio;
581#endif
582
583	if ((error = fgetsock(td, s, &so, NULL)) != 0)
584		return (error);
585	auio.uio_iov = mp->msg_iov;
586	auio.uio_iovcnt = mp->msg_iovlen;
587	auio.uio_segflg = UIO_USERSPACE;
588	auio.uio_rw = UIO_WRITE;
589	auio.uio_td = td;
590	auio.uio_offset = 0;			/* XXX */
591	auio.uio_resid = 0;
592	iov = mp->msg_iov;
593	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
594		if ((auio.uio_resid += iov->iov_len) < 0) {
595			error = EINVAL;
596			goto bad;
597		}
598	}
599	if (mp->msg_name) {
600		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
601		if (error)
602			goto bad;
603	}
604	if (mp->msg_control) {
605		if (mp->msg_controllen < sizeof(struct cmsghdr)
606#ifdef COMPAT_OLDSOCK
607		    && mp->msg_flags != MSG_COMPAT
608#endif
609		) {
610			error = EINVAL;
611			goto bad;
612		}
613		error = sockargs(&control, mp->msg_control,
614		    mp->msg_controllen, MT_CONTROL);
615		if (error)
616			goto bad;
617#ifdef COMPAT_OLDSOCK
618		if (mp->msg_flags == MSG_COMPAT) {
619			register struct cmsghdr *cm;
620
621			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
622			if (control == 0) {
623				error = ENOBUFS;
624				goto bad;
625			} else {
626				cm = mtod(control, struct cmsghdr *);
627				cm->cmsg_len = control->m_len;
628				cm->cmsg_level = SOL_SOCKET;
629				cm->cmsg_type = SCM_RIGHTS;
630			}
631		}
632#endif
633	} else {
634		control = 0;
635	}
636#ifdef KTRACE
637	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
638		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
639
640		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
641		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
642		ktruio = auio;
643	}
644#endif
645	len = auio.uio_resid;
646	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
647						     flags, td);
648	if (error) {
649		if (auio.uio_resid != len && (error == ERESTART ||
650		    error == EINTR || error == EWOULDBLOCK))
651			error = 0;
652		if (error == EPIPE) {
653			PROC_LOCK(td->td_proc);
654			psignal(td->td_proc, SIGPIPE);
655			PROC_UNLOCK(td->td_proc);
656		}
657	}
658	if (error == 0)
659		td->td_retval[0] = len - auio.uio_resid;
660#ifdef KTRACE
661	if (ktriov != NULL) {
662		if (error == 0) {
663			ktruio.uio_iov = ktriov;
664			ktruio.uio_resid = td->td_retval[0];
665			ktrgenio(td->td_proc->p_tracep, s, UIO_WRITE, &ktruio, error);
666		}
667		FREE(ktriov, M_TEMP);
668	}
669#endif
670bad:
671	fputsock(so);
672	if (to)
673		FREE(to, M_SONAME);
674	return (error);
675}
676
677/*
678 * MPSAFE
679 */
680int
681sendto(td, uap)
682	struct thread *td;
683	register struct sendto_args /* {
684		int	s;
685		caddr_t	buf;
686		size_t	len;
687		int	flags;
688		caddr_t	to;
689		int	tolen;
690	} */ *uap;
691{
692	struct msghdr msg;
693	struct iovec aiov;
694	int error;
695
696	msg.msg_name = uap->to;
697	msg.msg_namelen = uap->tolen;
698	msg.msg_iov = &aiov;
699	msg.msg_iovlen = 1;
700	msg.msg_control = 0;
701#ifdef COMPAT_OLDSOCK
702	msg.msg_flags = 0;
703#endif
704	aiov.iov_base = uap->buf;
705	aiov.iov_len = uap->len;
706	mtx_lock(&Giant);
707	error = sendit(td, uap->s, &msg, uap->flags);
708	mtx_unlock(&Giant);
709	return (error);
710}
711
712#ifdef COMPAT_OLDSOCK
713/*
714 * MPSAFE
715 */
716int
717osend(td, uap)
718	struct thread *td;
719	register struct osend_args /* {
720		int	s;
721		caddr_t	buf;
722		int	len;
723		int	flags;
724	} */ *uap;
725{
726	struct msghdr msg;
727	struct iovec aiov;
728	int error;
729
730	msg.msg_name = 0;
731	msg.msg_namelen = 0;
732	msg.msg_iov = &aiov;
733	msg.msg_iovlen = 1;
734	aiov.iov_base = uap->buf;
735	aiov.iov_len = uap->len;
736	msg.msg_control = 0;
737	msg.msg_flags = 0;
738	mtx_lock(&Giant);
739	error = sendit(td, uap->s, &msg, uap->flags);
740	mtx_unlock(&Giant);
741	return (error);
742}
743
744/*
745 * MPSAFE
746 */
747int
748osendmsg(td, uap)
749	struct thread *td;
750	register struct osendmsg_args /* {
751		int	s;
752		caddr_t	msg;
753		int	flags;
754	} */ *uap;
755{
756	struct msghdr msg;
757	struct iovec aiov[UIO_SMALLIOV], *iov;
758	int error;
759
760	mtx_lock(&Giant);
761	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
762	if (error)
763		goto done2;
764	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
765		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
766			error = EMSGSIZE;
767			goto done2;
768		}
769		MALLOC(iov, struct iovec *,
770		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
771		      M_WAITOK);
772	} else {
773		iov = aiov;
774	}
775	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
776	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
777	if (error)
778		goto done;
779	msg.msg_flags = MSG_COMPAT;
780	msg.msg_iov = iov;
781	error = sendit(td, uap->s, &msg, uap->flags);
782done:
783	if (iov != aiov)
784		FREE(iov, M_IOV);
785done2:
786	mtx_unlock(&Giant);
787	return (error);
788}
789#endif
790
791/*
792 * MPSAFE
793 */
794int
795sendmsg(td, uap)
796	struct thread *td;
797	register struct sendmsg_args /* {
798		int	s;
799		caddr_t	msg;
800		int	flags;
801	} */ *uap;
802{
803	struct msghdr msg;
804	struct iovec aiov[UIO_SMALLIOV], *iov;
805	int error;
806
807	mtx_lock(&Giant);
808	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
809	if (error)
810		goto done2;
811	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
812		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
813			error = EMSGSIZE;
814			goto done2;
815		}
816		MALLOC(iov, struct iovec *,
817		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
818		       M_WAITOK);
819	} else {
820		iov = aiov;
821	}
822	if (msg.msg_iovlen &&
823	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
824	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
825		goto done;
826	msg.msg_iov = iov;
827#ifdef COMPAT_OLDSOCK
828	msg.msg_flags = 0;
829#endif
830	error = sendit(td, uap->s, &msg, uap->flags);
831done:
832	if (iov != aiov)
833		FREE(iov, M_IOV);
834done2:
835	mtx_unlock(&Giant);
836	return (error);
837}
838
839static int
840recvit(td, s, mp, namelenp)
841	register struct thread *td;
842	int s;
843	register struct msghdr *mp;
844	caddr_t namelenp;
845{
846	struct uio auio;
847	register struct iovec *iov;
848	register int i;
849	int len, error;
850	struct mbuf *m, *control = 0;
851	caddr_t ctlbuf;
852	struct socket *so;
853	struct sockaddr *fromsa = 0;
854#ifdef KTRACE
855	struct iovec *ktriov = NULL;
856	struct uio ktruio;
857#endif
858
859	if ((error = fgetsock(td, s, &so, NULL)) != 0)
860		return (error);
861	auio.uio_iov = mp->msg_iov;
862	auio.uio_iovcnt = mp->msg_iovlen;
863	auio.uio_segflg = UIO_USERSPACE;
864	auio.uio_rw = UIO_READ;
865	auio.uio_td = td;
866	auio.uio_offset = 0;			/* XXX */
867	auio.uio_resid = 0;
868	iov = mp->msg_iov;
869	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
870		if ((auio.uio_resid += iov->iov_len) < 0) {
871			fputsock(so);
872			return (EINVAL);
873		}
874	}
875#ifdef KTRACE
876	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
877		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
878
879		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
880		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
881		ktruio = auio;
882	}
883#endif
884	len = auio.uio_resid;
885	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
886	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
887	    &mp->msg_flags);
888	if (error) {
889		if (auio.uio_resid != len && (error == ERESTART ||
890		    error == EINTR || error == EWOULDBLOCK))
891			error = 0;
892	}
893#ifdef KTRACE
894	if (ktriov != NULL) {
895		if (error == 0) {
896			ktruio.uio_iov = ktriov;
897			ktruio.uio_resid = len - auio.uio_resid;
898			ktrgenio(td->td_proc->p_tracep, s, UIO_READ, &ktruio, error);
899		}
900		FREE(ktriov, M_TEMP);
901	}
902#endif
903	if (error)
904		goto out;
905	td->td_retval[0] = len - auio.uio_resid;
906	if (mp->msg_name) {
907		len = mp->msg_namelen;
908		if (len <= 0 || fromsa == 0)
909			len = 0;
910		else {
911#ifndef MIN
912#define MIN(a,b) ((a)>(b)?(b):(a))
913#endif
914			/* save sa_len before it is destroyed by MSG_COMPAT */
915			len = MIN(len, fromsa->sa_len);
916#ifdef COMPAT_OLDSOCK
917			if (mp->msg_flags & MSG_COMPAT)
918				((struct osockaddr *)fromsa)->sa_family =
919				    fromsa->sa_family;
920#endif
921			error = copyout(fromsa,
922			    (caddr_t)mp->msg_name, (unsigned)len);
923			if (error)
924				goto out;
925		}
926		mp->msg_namelen = len;
927		if (namelenp &&
928		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
929#ifdef COMPAT_OLDSOCK
930			if (mp->msg_flags & MSG_COMPAT)
931				error = 0;	/* old recvfrom didn't check */
932			else
933#endif
934			goto out;
935		}
936	}
937	if (mp->msg_control) {
938#ifdef COMPAT_OLDSOCK
939		/*
940		 * We assume that old recvmsg calls won't receive access
941		 * rights and other control info, esp. as control info
942		 * is always optional and those options didn't exist in 4.3.
943		 * If we receive rights, trim the cmsghdr; anything else
944		 * is tossed.
945		 */
946		if (control && mp->msg_flags & MSG_COMPAT) {
947			if (mtod(control, struct cmsghdr *)->cmsg_level !=
948			    SOL_SOCKET ||
949			    mtod(control, struct cmsghdr *)->cmsg_type !=
950			    SCM_RIGHTS) {
951				mp->msg_controllen = 0;
952				goto out;
953			}
954			control->m_len -= sizeof (struct cmsghdr);
955			control->m_data += sizeof (struct cmsghdr);
956		}
957#endif
958		len = mp->msg_controllen;
959		m = control;
960		mp->msg_controllen = 0;
961		ctlbuf = (caddr_t) mp->msg_control;
962
963		while (m && len > 0) {
964			unsigned int tocopy;
965
966			if (len >= m->m_len)
967				tocopy = m->m_len;
968			else {
969				mp->msg_flags |= MSG_CTRUNC;
970				tocopy = len;
971			}
972
973			if ((error = copyout((caddr_t)mtod(m, caddr_t),
974					ctlbuf, tocopy)) != 0)
975				goto out;
976
977			ctlbuf += tocopy;
978			len -= tocopy;
979			m = m->m_next;
980		}
981		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
982	}
983out:
984	fputsock(so);
985	if (fromsa)
986		FREE(fromsa, M_SONAME);
987	if (control)
988		m_freem(control);
989	return (error);
990}
991
992/*
993 * MPSAFE
994 */
995int
996recvfrom(td, uap)
997	struct thread *td;
998	register struct recvfrom_args /* {
999		int	s;
1000		caddr_t	buf;
1001		size_t	len;
1002		int	flags;
1003		caddr_t	from;
1004		int	*fromlenaddr;
1005	} */ *uap;
1006{
1007	struct msghdr msg;
1008	struct iovec aiov;
1009	int error;
1010
1011	mtx_lock(&Giant);
1012	if (uap->fromlenaddr) {
1013		error = copyin((caddr_t)uap->fromlenaddr,
1014		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
1015		if (error)
1016			goto done2;
1017	} else {
1018		msg.msg_namelen = 0;
1019	}
1020	msg.msg_name = uap->from;
1021	msg.msg_iov = &aiov;
1022	msg.msg_iovlen = 1;
1023	aiov.iov_base = uap->buf;
1024	aiov.iov_len = uap->len;
1025	msg.msg_control = 0;
1026	msg.msg_flags = uap->flags;
1027	error = recvit(td, uap->s, &msg, (caddr_t)uap->fromlenaddr);
1028done2:
1029	mtx_unlock(&Giant);
1030	return(error);
1031}
1032
#ifdef COMPAT_OLDSOCK
/*
 * orecvfrom() - old-style recvfrom: sets MSG_COMPAT in the caller's flags
 * and then behaves exactly like recvfrom().
 *
 * MPSAFE
 */
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{
	int error;

	uap->flags = uap->flags | MSG_COMPAT;
	error = recvfrom(td, uap);
	return (error);
}
#endif
1047
1048
1049#ifdef COMPAT_OLDSOCK
1050/*
1051 * MPSAFE
1052 */
1053int
1054orecv(td, uap)
1055	struct thread *td;
1056	register struct orecv_args /* {
1057		int	s;
1058		caddr_t	buf;
1059		int	len;
1060		int	flags;
1061	} */ *uap;
1062{
1063	struct msghdr msg;
1064	struct iovec aiov;
1065	int error;
1066
1067	mtx_lock(&Giant);
1068	msg.msg_name = 0;
1069	msg.msg_namelen = 0;
1070	msg.msg_iov = &aiov;
1071	msg.msg_iovlen = 1;
1072	aiov.iov_base = uap->buf;
1073	aiov.iov_len = uap->len;
1074	msg.msg_control = 0;
1075	msg.msg_flags = uap->flags;
1076	error = recvit(td, uap->s, &msg, (caddr_t)0);
1077	mtx_unlock(&Giant);
1078	return (error);
1079}
1080
1081/*
1082 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1083 * overlays the new one, missing only the flags, and with the (old) access
1084 * rights where the control fields are now.
1085 *
1086 * MPSAFE
1087 */
1088int
1089orecvmsg(td, uap)
1090	struct thread *td;
1091	register struct orecvmsg_args /* {
1092		int	s;
1093		struct	omsghdr *msg;
1094		int	flags;
1095	} */ *uap;
1096{
1097	struct msghdr msg;
1098	struct iovec aiov[UIO_SMALLIOV], *iov;
1099	int error;
1100
1101	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
1102	    sizeof (struct omsghdr));
1103	if (error)
1104		return (error);
1105
1106	mtx_lock(&Giant);
1107	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1108		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1109			error = EMSGSIZE;
1110			goto done2;
1111		}
1112		MALLOC(iov, struct iovec *,
1113		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1114		      M_WAITOK);
1115	} else {
1116		iov = aiov;
1117	}
1118	msg.msg_flags = uap->flags | MSG_COMPAT;
1119	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
1120	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1121	if (error)
1122		goto done;
1123	msg.msg_iov = iov;
1124	error = recvit(td, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
1125
1126	if (msg.msg_controllen && error == 0)
1127		error = copyout((caddr_t)&msg.msg_controllen,
1128		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
1129done:
1130	if (iov != aiov)
1131		FREE(iov, M_IOV);
1132done2:
1133	mtx_unlock(&Giant);
1134	return (error);
1135}
1136#endif
1137
1138/*
1139 * MPSAFE
1140 */
1141int
1142recvmsg(td, uap)
1143	struct thread *td;
1144	register struct recvmsg_args /* {
1145		int	s;
1146		struct	msghdr *msg;
1147		int	flags;
1148	} */ *uap;
1149{
1150	struct msghdr msg;
1151	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1152	register int error;
1153
1154	mtx_lock(&Giant);
1155	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
1156	if (error)
1157		goto done2;
1158	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1159		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1160			error = EMSGSIZE;
1161			goto done2;
1162		}
1163		MALLOC(iov, struct iovec *,
1164		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1165		       M_WAITOK);
1166	} else {
1167		iov = aiov;
1168	}
1169#ifdef COMPAT_OLDSOCK
1170	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1171#else
1172	msg.msg_flags = uap->flags;
1173#endif
1174	uiov = msg.msg_iov;
1175	msg.msg_iov = iov;
1176	error = copyin((caddr_t)uiov, (caddr_t)iov,
1177	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1178	if (error)
1179		goto done;
1180	error = recvit(td, uap->s, &msg, (caddr_t)0);
1181	if (!error) {
1182		msg.msg_iov = uiov;
1183		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
1184	}
1185done:
1186	if (iov != aiov)
1187		FREE(iov, M_IOV);
1188done2:
1189	mtx_unlock(&Giant);
1190	return (error);
1191}
1192
1193/*
1194 * MPSAFE
1195 */
1196/* ARGSUSED */
1197int
1198shutdown(td, uap)
1199	struct thread *td;
1200	register struct shutdown_args /* {
1201		int	s;
1202		int	how;
1203	} */ *uap;
1204{
1205	struct socket *so;
1206	int error;
1207
1208	mtx_lock(&Giant);
1209	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1210		error = soshutdown(so, uap->how);
1211		fputsock(so);
1212	}
1213	mtx_unlock(&Giant);
1214	return(error);
1215}
1216
1217/*
1218 * MPSAFE
1219 */
1220/* ARGSUSED */
1221int
1222setsockopt(td, uap)
1223	struct thread *td;
1224	register struct setsockopt_args /* {
1225		int	s;
1226		int	level;
1227		int	name;
1228		caddr_t	val;
1229		int	valsize;
1230	} */ *uap;
1231{
1232	struct socket *so;
1233	struct sockopt sopt;
1234	int error;
1235
1236	if (uap->val == 0 && uap->valsize != 0)
1237		return (EFAULT);
1238	if (uap->valsize < 0)
1239		return (EINVAL);
1240
1241	mtx_lock(&Giant);
1242	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1243		sopt.sopt_dir = SOPT_SET;
1244		sopt.sopt_level = uap->level;
1245		sopt.sopt_name = uap->name;
1246		sopt.sopt_val = uap->val;
1247		sopt.sopt_valsize = uap->valsize;
1248		sopt.sopt_td = td;
1249		error = sosetopt(so, &sopt);
1250		fputsock(so);
1251	}
1252	mtx_unlock(&Giant);
1253	return(error);
1254}
1255
1256/*
1257 * MPSAFE
1258 */
1259/* ARGSUSED */
1260int
1261getsockopt(td, uap)
1262	struct thread *td;
1263	register struct getsockopt_args /* {
1264		int	s;
1265		int	level;
1266		int	name;
1267		caddr_t	val;
1268		int	*avalsize;
1269	} */ *uap;
1270{
1271	int	valsize, error;
1272	struct  socket *so;
1273	struct	sockopt sopt;
1274
1275	mtx_lock(&Giant);
1276	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1277		goto done2;
1278	if (uap->val) {
1279		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1280		    sizeof (valsize));
1281		if (error)
1282			goto done1;
1283		if (valsize < 0) {
1284			error = EINVAL;
1285			goto done1;
1286		}
1287	} else {
1288		valsize = 0;
1289	}
1290
1291	sopt.sopt_dir = SOPT_GET;
1292	sopt.sopt_level = uap->level;
1293	sopt.sopt_name = uap->name;
1294	sopt.sopt_val = uap->val;
1295	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1296	sopt.sopt_td = td;
1297
1298	error = sogetopt(so, &sopt);
1299	if (error == 0) {
1300		valsize = sopt.sopt_valsize;
1301		error = copyout((caddr_t)&valsize,
1302				(caddr_t)uap->avalsize, sizeof (valsize));
1303	}
1304done1:
1305	fputsock(so);
1306done2:
1307	mtx_unlock(&Giant);
1308	return (error);
1309}
1310
1311/*
1312 * getsockname1() - Get socket name.
1313 *
1314 * MPSAFE
1315 */
1316/* ARGSUSED */
1317static int
1318getsockname1(td, uap, compat)
1319	struct thread *td;
1320	register struct getsockname_args /* {
1321		int	fdes;
1322		caddr_t	asa;
1323		int	*alen;
1324	} */ *uap;
1325	int compat;
1326{
1327	struct socket *so;
1328	struct sockaddr *sa;
1329	int len, error;
1330
1331	mtx_lock(&Giant);
1332	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1333		goto done2;
1334	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1335	if (error)
1336		goto done1;
1337	sa = 0;
1338	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1339	if (error)
1340		goto bad;
1341	if (sa == 0) {
1342		len = 0;
1343		goto gotnothing;
1344	}
1345
1346	len = MIN(len, sa->sa_len);
1347#ifdef COMPAT_OLDSOCK
1348	if (compat)
1349		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1350#endif
1351	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1352	if (error == 0)
1353gotnothing:
1354		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1355		    sizeof (len));
1356bad:
1357	if (sa)
1358		FREE(sa, M_SONAME);
1359done1:
1360	fputsock(so);
1361done2:
1362	mtx_unlock(&Giant);
1363	return (error);
1364}
1365
/*
 * getsockname(2) - current-ABI entry point.  Simply forwards to
 * getsockname1() with the compat flag cleared.
 *
 * MPSAFE
 */
int
getsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1377
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI flavour of getsockname(2).  Forwards to getsockname1()
 * with the compat flag set.
 *
 * MPSAFE
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
1391
1392/*
1393 * getpeername1() - Get name of peer for connected socket.
1394 *
1395 * MPSAFE
1396 */
1397/* ARGSUSED */
1398static int
1399getpeername1(td, uap, compat)
1400	struct thread *td;
1401	register struct getpeername_args /* {
1402		int	fdes;
1403		caddr_t	asa;
1404		int	*alen;
1405	} */ *uap;
1406	int compat;
1407{
1408	struct socket *so;
1409	struct sockaddr *sa;
1410	int len, error;
1411
1412	mtx_lock(&Giant);
1413	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1414		goto done2;
1415	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1416		error = ENOTCONN;
1417		goto done1;
1418	}
1419	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1420	if (error)
1421		goto done1;
1422	sa = 0;
1423	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1424	if (error)
1425		goto bad;
1426	if (sa == 0) {
1427		len = 0;
1428		goto gotnothing;
1429	}
1430	len = MIN(len, sa->sa_len);
1431#ifdef COMPAT_OLDSOCK
1432	if (compat)
1433		((struct osockaddr *)sa)->sa_family =
1434		    sa->sa_family;
1435#endif
1436	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1437	if (error)
1438		goto bad;
1439gotnothing:
1440	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1441bad:
1442	if (sa)
1443		FREE(sa, M_SONAME);
1444done1:
1445	fputsock(so);
1446done2:
1447	mtx_unlock(&Giant);
1448	return (error);
1449}
1450
/*
 * getpeername(2) - current-ABI entry point.  Simply forwards to
 * getpeername1() with the compat flag cleared.
 *
 * MPSAFE
 */
int
getpeername(struct thread *td, struct getpeername_args *uap)
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1462
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI flavour of getpeername(2).  Forwards to getpeername1()
 * with the compat flag set.
 *
 * MPSAFE
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1477
1478int
1479sockargs(mp, buf, buflen, type)
1480	struct mbuf **mp;
1481	caddr_t buf;
1482	int buflen, type;
1483{
1484	register struct sockaddr *sa;
1485	register struct mbuf *m;
1486	int error;
1487
1488	if ((u_int)buflen > MLEN) {
1489#ifdef COMPAT_OLDSOCK
1490		if (type == MT_SONAME && (u_int)buflen <= 112)
1491			buflen = MLEN;		/* unix domain compat. hack */
1492		else
1493#endif
1494		return (EINVAL);
1495	}
1496	m = m_get(M_TRYWAIT, type);
1497	if (m == NULL)
1498		return (ENOBUFS);
1499	m->m_len = buflen;
1500	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1501	if (error)
1502		(void) m_free(m);
1503	else {
1504		*mp = m;
1505		if (type == MT_SONAME) {
1506			sa = mtod(m, struct sockaddr *);
1507
1508#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1509			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1510				sa->sa_family = sa->sa_len;
1511#endif
1512			sa->sa_len = buflen;
1513		}
1514	}
1515	return (error);
1516}
1517
1518int
1519getsockaddr(namp, uaddr, len)
1520	struct sockaddr **namp;
1521	caddr_t uaddr;
1522	size_t len;
1523{
1524	struct sockaddr *sa;
1525	int error;
1526
1527	if (len > SOCK_MAXADDRLEN)
1528		return ENAMETOOLONG;
1529	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1530	error = copyin(uaddr, sa, len);
1531	if (error) {
1532		FREE(sa, M_SONAME);
1533	} else {
1534#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1535		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1536			sa->sa_family = sa->sa_len;
1537#endif
1538		sa->sa_len = len;
1539		*namp = sa;
1540	}
1541	return error;
1542}
1543
1544/*
1545 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1546 * XXX - The sf_buf functions are currently private to sendfile(2), so have
1547 * been made static, but may be useful in the future for doing zero-copy in
1548 * other parts of the networking code.
1549 */
1550static void
1551sf_buf_init(void *arg)
1552{
1553	int i;
1554
1555	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
1556	mtx_lock(&sf_freelist.sf_lock);
1557	SLIST_INIT(&sf_freelist.sf_head);
1558	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1559	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
1560	    M_NOWAIT | M_ZERO);
1561	for (i = 0; i < nsfbufs; i++) {
1562		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1563		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
1564	}
1565	sf_buf_alloc_want = 0;
1566	mtx_unlock(&sf_freelist.sf_lock);
1567}
1568
1569/*
1570 * Get an sf_buf from the freelist. Will block if none are available.
1571 */
1572static struct sf_buf *
1573sf_buf_alloc()
1574{
1575	struct sf_buf *sf;
1576	int error;
1577
1578	mtx_lock(&sf_freelist.sf_lock);
1579	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
1580		sf_buf_alloc_want++;
1581		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
1582		    "sfbufa", 0);
1583		sf_buf_alloc_want--;
1584
1585		/*
1586		 * If we got a signal, don't risk going back to sleep.
1587		 */
1588		if (error)
1589			break;
1590	}
1591	if (sf != NULL)
1592		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
1593	mtx_unlock(&sf_freelist.sf_lock);
1594	return (sf);
1595}
1596
/*
 * Translate a kernel virtual address inside the sf_buf mapping window
 * (which begins at sf_base, one page per sf_buf) into a pointer to the
 * sf_buf that owns that page.
 */
#define dtosf(x)							\
	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1598
1599/*
1600 * Detatch mapped page and release resources back to the system.
1601 */
1602static void
1603sf_buf_free(caddr_t addr, void *args)
1604{
1605	struct sf_buf *sf;
1606	struct vm_page *m;
1607
1608	GIANT_REQUIRED;
1609
1610	sf = dtosf(addr);
1611	pmap_qremove((vm_offset_t)addr, 1);
1612	m = sf->m;
1613	vm_page_unwire(m, 0);
1614	/*
1615	 * Check for the object going away on us. This can
1616	 * happen since we don't hold a reference to it.
1617	 * If so, we're responsible for freeing the page.
1618	 */
1619	if (m->wire_count == 0 && m->object == NULL)
1620		vm_page_free(m);
1621	sf->m = NULL;
1622	mtx_lock(&sf_freelist.sf_lock);
1623	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
1624	if (sf_buf_alloc_want > 0)
1625		wakeup_one(&sf_freelist);
1626	mtx_unlock(&sf_freelist.sf_lock);
1627}
1628
1629/*
1630 * sendfile(2)
1631 *
1632 * MPSAFE
1633 *
1634 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1635 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1636 *
1637 * Send a file specified by 'fd' and starting at 'offset' to a socket
1638 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1639 * nbytes == 0. Optionally add a header and/or trailer to the socket
1640 * output. If specified, write the total number of bytes sent into *sbytes.
1641 *
1642 */
1643int
1644sendfile(struct thread *td, struct sendfile_args *uap)
1645{
1646	struct vnode *vp;
1647	struct vm_object *obj;
1648	struct socket *so = NULL;
1649	struct mbuf *m;
1650	struct sf_buf *sf;
1651	struct vm_page *pg;
1652	struct writev_args nuap;
1653	struct sf_hdtr hdtr;
1654	off_t off, xfsize, hdtr_size, sbytes = 0;
1655	int error, s;
1656
1657	mtx_lock(&Giant);
1658
1659	hdtr_size = 0;
1660
1661	/*
1662	 * The descriptor must be a regular file and have a backing VM object.
1663	 */
1664	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1665		goto done;
1666	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1667		error = EINVAL;
1668		goto done;
1669	}
1670	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1671		goto done;
1672	if (so->so_type != SOCK_STREAM) {
1673		error = EINVAL;
1674		goto done;
1675	}
1676	if ((so->so_state & SS_ISCONNECTED) == 0) {
1677		error = ENOTCONN;
1678		goto done;
1679	}
1680	if (uap->offset < 0) {
1681		error = EINVAL;
1682		goto done;
1683	}
1684
1685	/*
1686	 * If specified, get the pointer to the sf_hdtr struct for
1687	 * any headers/trailers.
1688	 */
1689	if (uap->hdtr != NULL) {
1690		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1691		if (error)
1692			goto done;
1693		/*
1694		 * Send any headers. Wimp out and use writev(2).
1695		 */
1696		if (hdtr.headers != NULL) {
1697			nuap.fd = uap->s;
1698			nuap.iovp = hdtr.headers;
1699			nuap.iovcnt = hdtr.hdr_cnt;
1700			error = writev(td, &nuap);
1701			if (error)
1702				goto done;
1703			hdtr_size += td->td_retval[0];
1704		}
1705	}
1706
1707	/*
1708	 * Protect against multiple writers to the socket.
1709	 */
1710	(void) sblock(&so->so_snd, M_WAITOK);
1711
1712	/*
1713	 * Loop through the pages in the file, starting with the requested
1714	 * offset. Get a file page (do I/O if necessary), map the file page
1715	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1716	 * it on the socket.
1717	 */
1718	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1719		vm_pindex_t pindex;
1720		vm_offset_t pgoff;
1721
1722		pindex = OFF_TO_IDX(off);
1723retry_lookup:
1724		/*
1725		 * Calculate the amount to transfer. Not to exceed a page,
1726		 * the EOF, or the passed in nbytes.
1727		 */
1728		xfsize = obj->un_pager.vnp.vnp_size - off;
1729		if (xfsize > PAGE_SIZE)
1730			xfsize = PAGE_SIZE;
1731		pgoff = (vm_offset_t)(off & PAGE_MASK);
1732		if (PAGE_SIZE - pgoff < xfsize)
1733			xfsize = PAGE_SIZE - pgoff;
1734		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1735			xfsize = uap->nbytes - sbytes;
1736		if (xfsize <= 0)
1737			break;
1738		/*
1739		 * Optimize the non-blocking case by looking at the socket space
1740		 * before going to the extra work of constituting the sf_buf.
1741		 */
1742		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1743			if (so->so_state & SS_CANTSENDMORE)
1744				error = EPIPE;
1745			else
1746				error = EAGAIN;
1747			sbunlock(&so->so_snd);
1748			goto done;
1749		}
1750		/*
1751		 * Attempt to look up the page.
1752		 *
1753		 *	Allocate if not found
1754		 *
1755		 *	Wait and loop if busy.
1756		 */
1757		pg = vm_page_lookup(obj, pindex);
1758
1759		if (pg == NULL) {
1760			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1761			if (pg == NULL) {
1762				VM_WAIT;
1763				goto retry_lookup;
1764			}
1765			vm_page_wakeup(pg);
1766		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1767			goto retry_lookup;
1768		}
1769
1770		/*
1771		 * Wire the page so it does not get ripped out from under
1772		 * us.
1773		 */
1774
1775		vm_page_wire(pg);
1776
1777		/*
1778		 * If page is not valid for what we need, initiate I/O
1779		 */
1780
1781		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1782			int bsize;
1783
1784			/*
1785			 * Ensure that our page is still around when the I/O
1786			 * completes.
1787			 */
1788			vm_page_io_start(pg);
1789
1790			/*
1791			 * Get the page from backing store.
1792			 */
1793			bsize = vp->v_mount->mnt_stat.f_iosize;
1794			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1795			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
1796			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
1797			    IO_VMIO | ((MAXBSIZE / bsize) << 16),
1798			    td->td_ucred, NULL, td);
1799			VOP_UNLOCK(vp, 0, td);
1800			vm_page_flag_clear(pg, PG_ZERO);
1801			vm_page_io_finish(pg);
1802			if (error) {
1803				vm_page_unwire(pg, 0);
1804				/*
1805				 * See if anyone else might know about this page.
1806				 * If not and it is not valid, then free it.
1807				 */
1808				if (pg->wire_count == 0 && pg->valid == 0 &&
1809				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1810				    pg->hold_count == 0) {
1811					vm_page_busy(pg);
1812					vm_page_free(pg);
1813				}
1814				sbunlock(&so->so_snd);
1815				goto done;
1816			}
1817		}
1818
1819
1820		/*
1821		 * Get a sendfile buf. We usually wait as long as necessary,
1822		 * but this wait can be interrupted.
1823		 */
1824		if ((sf = sf_buf_alloc()) == NULL) {
1825			vm_page_unwire(pg, 0);
1826			if (pg->wire_count == 0 && pg->object == NULL)
1827				vm_page_free(pg);
1828			sbunlock(&so->so_snd);
1829			error = EINTR;
1830			goto done;
1831		}
1832
1833		/*
1834		 * Allocate a kernel virtual page and insert the physical page
1835		 * into it.
1836		 */
1837		sf->m = pg;
1838		pmap_qenter(sf->kva, &pg, 1);
1839		/*
1840		 * Get an mbuf header and set it up as having external storage.
1841		 */
1842		MGETHDR(m, M_TRYWAIT, MT_DATA);
1843		if (m == NULL) {
1844			error = ENOBUFS;
1845			sf_buf_free((void *)sf->kva, NULL);
1846			sbunlock(&so->so_snd);
1847			goto done;
1848		}
1849		/*
1850		 * Setup external storage for mbuf.
1851		 */
1852		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
1853		    EXT_SFBUF);
1854		m->m_data = (char *) sf->kva + pgoff;
1855		m->m_pkthdr.len = m->m_len = xfsize;
1856		/*
1857		 * Add the buffer to the socket buffer chain.
1858		 */
1859		s = splnet();
1860retry_space:
1861		/*
1862		 * Make sure that the socket is still able to take more data.
1863		 * CANTSENDMORE being true usually means that the connection
1864		 * was closed. so_error is true when an error was sensed after
1865		 * a previous send.
1866		 * The state is checked after the page mapping and buffer
1867		 * allocation above since those operations may block and make
1868		 * any socket checks stale. From this point forward, nothing
1869		 * blocks before the pru_send (or more accurately, any blocking
1870		 * results in a loop back to here to re-check).
1871		 */
1872		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1873			if (so->so_state & SS_CANTSENDMORE) {
1874				error = EPIPE;
1875			} else {
1876				error = so->so_error;
1877				so->so_error = 0;
1878			}
1879			m_freem(m);
1880			sbunlock(&so->so_snd);
1881			splx(s);
1882			goto done;
1883		}
1884		/*
1885		 * Wait for socket space to become available. We do this just
1886		 * after checking the connection state above in order to avoid
1887		 * a race condition with sbwait().
1888		 */
1889		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1890			if (so->so_state & SS_NBIO) {
1891				m_freem(m);
1892				sbunlock(&so->so_snd);
1893				splx(s);
1894				error = EAGAIN;
1895				goto done;
1896			}
1897			error = sbwait(&so->so_snd);
1898			/*
1899			 * An error from sbwait usually indicates that we've
1900			 * been interrupted by a signal. If we've sent anything
1901			 * then return bytes sent, otherwise return the error.
1902			 */
1903			if (error) {
1904				m_freem(m);
1905				sbunlock(&so->so_snd);
1906				splx(s);
1907				goto done;
1908			}
1909			goto retry_space;
1910		}
1911		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
1912		splx(s);
1913		if (error) {
1914			sbunlock(&so->so_snd);
1915			goto done;
1916		}
1917	}
1918	sbunlock(&so->so_snd);
1919
1920	/*
1921	 * Send trailers. Wimp out and use writev(2).
1922	 */
1923	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1924			nuap.fd = uap->s;
1925			nuap.iovp = hdtr.trailers;
1926			nuap.iovcnt = hdtr.trl_cnt;
1927			error = writev(td, &nuap);
1928			if (error)
1929				goto done;
1930			hdtr_size += td->td_retval[0];
1931	}
1932
1933done:
1934	/*
1935	 * If there was no error we have to clear td->td_retval[0]
1936	 * because it may have been set by writev.
1937	 */
1938	if (error == 0) {
1939		td->td_retval[0] = 0;
1940	}
1941	if (uap->sbytes != NULL) {
1942		sbytes += hdtr_size;
1943		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1944	}
1945	if (vp)
1946		vrele(vp);
1947	if (so)
1948		fputsock(so);
1949	mtx_unlock(&Giant);
1950	return (error);
1951}
1952