uipc_syscalls.c revision 161125
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 161125 2006-08-09 17:43:27Z alc $");
37
38#include "opt_compat.h"
39#include "opt_ktrace.h"
40#include "opt_mac.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mac.h>
47#include <sys/mutex.h>
48#include <sys/sysproto.h>
49#include <sys/malloc.h>
50#include <sys/filedesc.h>
51#include <sys/event.h>
52#include <sys/proc.h>
53#include <sys/fcntl.h>
54#include <sys/file.h>
55#include <sys/filio.h>
56#include <sys/mount.h>
57#include <sys/mbuf.h>
58#include <sys/protosw.h>
59#include <sys/sf_buf.h>
60#include <sys/socket.h>
61#include <sys/socketvar.h>
62#include <sys/signalvar.h>
63#include <sys/syscallsubr.h>
64#include <sys/sysctl.h>
65#include <sys/uio.h>
66#include <sys/vnode.h>
67#ifdef KTRACE
68#include <sys/ktrace.h>
69#endif
70
71#include <vm/vm.h>
72#include <vm/vm_object.h>
73#include <vm/vm_page.h>
74#include <vm/vm_pageout.h>
75#include <vm/vm_kern.h>
76#include <vm/vm_extern.h>
77
/* Internal helpers shared by the send*/recv* syscall entry points below. */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

/* Common bodies for the native and 4.3BSD-compat syscall variants. */
static int accept1(struct thread *td, struct accept_args *uap, int compat);
static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
			int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
			int compat);

/*
 * NSFBUFS-related variables and associated sysctls
 */
int nsfbufs;		/* read-only tunable: sf_bufs available for sendfile(2) */
int nsfbufspeak;	/* high-water mark of sf_bufs in use */
int nsfbufsused;	/* sf_bufs currently in use */

SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
    "Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
    "Number of sendfile(2) sf_bufs at peak usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
    "Number of sendfile(2) sf_bufs in use");
101
102/*
103 * Convert a user file descriptor to a kernel file entry.  A reference on the
104 * file entry is held upon returning.  This is lighter weight than
105 * fgetsock(), which bumps the socket reference drops the file reference
106 * count instead, as this approach avoids several additional mutex operations
107 * associated with the additional reference count.  If requested, return the
108 * open file flags.
109 */
110static int
111getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
112{
113	struct file *fp;
114	int error;
115
116	fp = NULL;
117	if (fdp == NULL)
118		error = EBADF;
119	else {
120		FILEDESC_LOCK_FAST(fdp);
121		fp = fget_locked(fdp, fd);
122		if (fp == NULL)
123			error = EBADF;
124		else if (fp->f_type != DTYPE_SOCKET) {
125			fp = NULL;
126			error = ENOTSOCK;
127		} else {
128			fhold(fp);
129			if (fflagp != NULL)
130				*fflagp = fp->f_flag;
131			error = 0;
132		}
133		FILEDESC_UNLOCK_FAST(fdp);
134	}
135	*fpp = fp;
136	return (error);
137}
138
139/*
140 * System call interface to the socket abstraction.
141 */
142#if defined(COMPAT_43)
143#define COMPAT_OLDSOCK
144#endif
145
146/*
147 * MPSAFE
148 */
/*
 * socket(2): create a new, unconnected socket and install it in the
 * process's descriptor table.  Returns the new descriptor via
 * td->td_retval[0].
 */
int
socket(td, uap)
	struct thread *td;
	register struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct filedesc *fdp;
	struct socket *so;
	struct file *fp;
	int fd, error;

#ifdef MAC
	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
	    uap->protocol);
	if (error)
		return (error);
#endif
	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd);
	if (error)
		return (error);
	/* An extra reference on `fp' has been held for us by falloc(). */
	NET_LOCK_GIANT();
	error = socreate(uap->domain, &so, uap->type, uap->protocol,
	    td->td_ucred, td);
	NET_UNLOCK_GIANT();
	if (error) {
		/* Undo the descriptor allocation; fdrop() below frees fp. */
		fdclose(fdp, fp, fd, td);
	} else {
		FILEDESC_LOCK_FAST(fdp);
		fp->f_data = so;	/* already has ref count */
		fp->f_flag = FREAD|FWRITE;
		fp->f_ops = &socketops;
		fp->f_type = DTYPE_SOCKET;
		FILEDESC_UNLOCK_FAST(fdp);
		td->td_retval[0] = fd;
	}
	/* Drop the extra falloc() reference on both paths. */
	fdrop(fp, td);
	return (error);
}
192
193/*
194 * MPSAFE
195 */
196/* ARGSUSED */
197int
198bind(td, uap)
199	struct thread *td;
200	register struct bind_args /* {
201		int	s;
202		caddr_t	name;
203		int	namelen;
204	} */ *uap;
205{
206	struct sockaddr *sa;
207	int error;
208
209	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
210		return (error);
211
212	error = kern_bind(td, uap->s, sa);
213	free(sa, M_SONAME);
214	return (error);
215}
216
/*
 * In-kernel implementation of bind(2): look up the socket for `fd' and
 * bind it to the already-copied-in address `sa'.  The caller retains
 * ownership of `sa'.
 */
int
kern_bind(td, fd, sa)
	struct thread *td;
	int fd;
	struct sockaddr *sa;
{
	struct socket *so;
	struct file *fp;
	int error;

	NET_LOCK_GIANT();
	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
	if (error)
		goto done2;
	so = fp->f_data;
#ifdef MAC
	SOCK_LOCK(so);
	error = mac_check_socket_bind(td->td_ucred, so, sa);
	SOCK_UNLOCK(so);
	if (error)
		goto done1;
#endif
	error = sobind(so, sa, td);
	/* done1 exists only when MAC may skip over the sobind() call. */
#ifdef MAC
done1:
#endif
	fdrop(fp, td);
done2:
	NET_UNLOCK_GIANT();
	return (error);
}
248
249/*
250 * MPSAFE
251 */
252/* ARGSUSED */
/*
 * listen(2): mark the socket referenced by uap->s as accepting
 * connections with the given backlog.
 */
int
listen(td, uap)
	struct thread *td;
	register struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	int error;

	NET_LOCK_GIANT();
	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
#ifdef MAC
		SOCK_LOCK(so);
		error = mac_check_socket_listen(td->td_ucred, so);
		SOCK_UNLOCK(so);
		if (error)
			goto done;
#endif
		error = solisten(so, uap->backlog, td);
		/* done: skips solisten() when the MAC check failed. */
#ifdef MAC
done:
#endif
		fdrop(fp, td);
	}
	NET_UNLOCK_GIANT();
	return(error);
}
285
286/*
287 * accept1()
288 * MPSAFE
289 */
/*
 * Common body of accept(2) and the 4.3BSD-compat oaccept(): copy the
 * user's namelen in, call kern_accept(), and copy the peer address and
 * its length back out.  `compat' selects the old osockaddr layout.
 */
static int
accept1(td, uap, compat)
	struct thread *td;
	register struct accept_args /* {
		int	s;
		struct sockaddr	* __restrict name;
		socklen_t	* __restrict anamelen;
	} */ *uap;
	int compat;
{
	struct sockaddr *name;
	socklen_t namelen;
	struct file *fp;
	int error;

	/* No address requested: nothing to copy out afterwards. */
	if (uap->name == NULL)
		return (kern_accept(td, uap->s, NULL, NULL, NULL));

	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
	if (error)
		return (error);

	/* On success fp holds a reference to the new socket's file. */
	error = kern_accept(td, uap->s, &name, &namelen, &fp);

	/*
	 * return a namelen of zero for older code which might
	 * ignore the return value from accept.
	 */
	if (error) {
		(void) copyout(&namelen,
		    uap->anamelen, sizeof(*uap->anamelen));
		return (error);
	}

	/* name may still be NULL when the protocol supplied no address. */
	if (error == 0 && name != NULL) {
#ifdef COMPAT_OLDSOCK
		/* Old struct osockaddr has a 16-bit family, no sa_len. */
		if (compat)
			((struct osockaddr *)name)->sa_family =
			    name->sa_family;
#endif
		error = copyout(name, uap->name, namelen);
	}
	if (error == 0)
		error = copyout(&namelen, uap->anamelen,
		    sizeof(namelen));
	/* A failed copyout closes the freshly installed descriptor. */
	if (error)
		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
	fdrop(fp, td);
	free(name, M_SONAME);
	return (error);
}
341
/*
 * In-kernel implementation of accept(2).  Sleeps on the listen socket's
 * completed-connection queue, dequeues the first connection, and wires
 * it into a new file descriptor (returned via td->td_retval[0]).
 *
 * If `name'/`namelen' are non-NULL the peer address is returned in
 * freshly allocated M_SONAME storage that the caller must free; if `fp'
 * is non-NULL a held reference on the new file is returned on success.
 */
int
kern_accept(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	int error;
	struct socket *head, *so;
	int fd;
	u_int fflag;
	pid_t pgid;
	int tmp;

	if (name) {
		*name = NULL;
		/* NOTE(review): socklen_t is unsigned, so this can never
		 * fire — presumably retained from when namelen was int. */
		if (*namelen < 0)
			return (EINVAL);
	}

	fdp = td->td_proc->p_fd;
	NET_LOCK_GIANT();
	error = getsock(fdp, s, &headfp, &fflag);
	if (error)
		goto done2;
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	SOCK_LOCK(head);
	error = mac_check_socket_accept(td->td_ucred, head);
	SOCK_UNLOCK(head);
	if (error != 0)
		goto done;
#endif
	/* Allocate the descriptor before sleeping so failure is cheap. */
	error = falloc(td, &nfp, &fd);
	if (error)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	/* Sleep until a connection completes or the socket errors out. */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	/* New socket inherits non-blocking mode from the listener. */
	so->so_state |= (head->so_state & SS_NBIO);
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	/* Propagate the listener's SIGIO ownership to the new socket. */
	pgid = fgetown(&head->so_sigio);
	if (pgid != 0)
		fsetown(pgid, &so->so_sigio);

	FILE_LOCK(nfp);
	nfp->f_data = so;	/* nfp has ref count from falloc */
	nfp->f_flag = fflag;
	nfp->f_ops = &socketops;
	nfp->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(nfp);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error) {
		/*
		 * return a namelen of zero for older code which might
		 * ignore the return value from accept.
		 */
		if (name)
			*namelen = 0;
		goto noconnection;
	}
	if (sa == NULL) {
		if (name)
			*namelen = 0;
		goto done;
	}
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
		/* Ownership of sa transfers to the caller. */
		*name = sa;
		sa = NULL;
	}
noconnection:
	if (sa)
		FREE(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
done2:
	NET_UNLOCK_GIANT();
	return (error);
}
501
502/*
503 * MPSAFE (accept1() is MPSAFE)
504 */
/*
 * accept(2) entry point: native (non-compat) variant of accept1().
 */
int
accept(struct thread *td, struct accept_args *uap)
{

	return (accept1(td, uap, 0));
}
513
514#ifdef COMPAT_OLDSOCK
515/*
516 * MPSAFE (accept1() is MPSAFE)
517 */
/*
 * 4.3BSD-compat accept: accept1() with the old osockaddr layout.
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{

	return (accept1(td, uap, 1));
}
526#endif /* COMPAT_OLDSOCK */
527
528/*
529 * MPSAFE
530 */
531/* ARGSUSED */
532int
533connect(td, uap)
534	struct thread *td;
535	register struct connect_args /* {
536		int	s;
537		caddr_t	name;
538		int	namelen;
539	} */ *uap;
540{
541	struct sockaddr *sa;
542	int error;
543
544	error = getsockaddr(&sa, uap->name, uap->namelen);
545	if (error)
546		return (error);
547
548	error = kern_connect(td, uap->s, sa);
549	free(sa, M_SONAME);
550	return (error);
551}
552
553
/*
 * In-kernel implementation of connect(2): start the connection and,
 * unless the socket is non-blocking, sleep until it completes or the
 * sleep is interrupted.  The caller retains ownership of `sa'.
 */
int
kern_connect(td, fd, sa)
	struct thread *td;
	int fd;
	struct sockaddr *sa;
{
	struct socket *so;
	struct file *fp;
	int error;
	int interrupted = 0;

	NET_LOCK_GIANT();
	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
	if (error)
		goto done2;
	so = fp->f_data;
	/* A connect is already in flight on this socket. */
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef MAC
	SOCK_LOCK(so);
	error = mac_check_socket_connect(td->td_ucred, so, sa);
	SOCK_UNLOCK(so);
	if (error)
		goto bad;
#endif
	error = soconnect(so, sa, td);
	if (error)
		goto bad;
	/* Non-blocking: report the in-progress connection immediately. */
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error) {
			/*
			 * Leave SS_ISCONNECTING set when interrupted so the
			 * in-flight connect keeps running; see `bad' below.
			 */
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
done2:
	NET_UNLOCK_GIANT();
	return (error);
}
614
615/*
616 * MPSAFE
617 */
/*
 * socketpair(2): create two connected sockets, install them as two new
 * descriptors, and copy the descriptor pair out to uap->rsv.  The error
 * path unwinds in strict reverse order of acquisition (free4..free1).
 */
int
socketpair(td, uap)
	struct thread *td;
	register struct socketpair_args /* {
		int	domain;
		int	type;
		int	protocol;
		int	*rsv;
	} */ *uap;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, sv[2];

#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
	    uap->protocol);
	if (error)
		return (error);
#endif

	NET_LOCK_GIANT();
	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
	    td->td_ucred, td);
	if (error)
		goto done2;
	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
	    td->td_ucred, td);
	if (error)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd);
	if (error)
		goto free2;
	sv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd);
	if (error)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	sv[1] = fd;
	error = soconnect2(so1, so2);
	if (error)
		goto free4;
	if (uap->type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		 error = soconnect2(so2, so1);
		 if (error)
			goto free4;
	}
	FILE_LOCK(fp1);
	fp1->f_flag = FREAD|FWRITE;
	fp1->f_ops = &socketops;
	fp1->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(fp1);
	FILE_LOCK(fp2);
	fp2->f_flag = FREAD|FWRITE;
	fp2->f_ops = &socketops;
	fp2->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(fp2);
	error = copyout(sv, uap->rsv, 2 * sizeof (int));
	/* Drop the extra falloc() references; the fd table keeps its own. */
	fdrop(fp1, td);
	fdrop(fp2, td);
	goto done2;
free4:
	fdclose(fdp, fp2, sv[1], td);
	fdrop(fp2, td);
free3:
	fdclose(fdp, fp1, sv[0], td);
	fdrop(fp1, td);
free2:
	/* Sockets not yet owned by a file must be closed directly. */
	(void)soclose(so2);
free1:
	(void)soclose(so1);
done2:
	NET_UNLOCK_GIANT();
	return (error);
}
700
/*
 * Common path for sendto(2)/sendmsg(2)/osend*(): copy in the destination
 * address and control data (if any), then call kern_sendit().  On entry
 * mp's name/control fields point at user memory; msg_name is replaced
 * with the kernel copy before the call.
 */
static int
sendit(td, s, mp, flags)
	register struct thread *td;
	int s;
	register struct msghdr *mp;
	int flags;
{
	struct mbuf *control;
	struct sockaddr *to;
	int error;

	if (mp->msg_name != NULL) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error) {
			to = NULL;
			goto bad;
		}
		mp->msg_name = to;
	} else {
		to = NULL;
	}

	if (mp->msg_control) {
		/* Control data must hold at least one cmsghdr, except in
		 * the old 4.3BSD access-rights format (MSG_COMPAT). */
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			register struct cmsghdr *cm;

			/* Old callers passed bare access rights; wrap them
			 * in a synthesized SCM_RIGHTS cmsghdr. */
			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
			if (control == 0) {
				error = ENOBUFS;
				goto bad;
			} else {
				cm = mtod(control, struct cmsghdr *);
				cm->cmsg_len = control->m_len;
				cm->cmsg_level = SOL_SOCKET;
				cm->cmsg_type = SCM_RIGHTS;
			}
		}
#endif
	} else {
		control = NULL;
	}

	/* kern_sendit() consumes `control' on all paths. */
	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);

bad:
	if (to)
		FREE(to, M_SONAME);
	return (error);
}
763
/*
 * In-kernel implementation of the send path: build a uio from mp's iovec
 * array and pass it to sosend().  `control' (may be NULL) is consumed.
 * On success the number of bytes sent is returned via td->td_retval[0].
 */
int
kern_sendit(td, s, mp, flags, control, segflg)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
	struct mbuf *control;
	enum uio_seg segflg;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct socket *so;
	int i;
	int len, error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	NET_LOCK_GIANT();
	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
	if (error)
		goto bad2;
	so = (struct socket *)fp->f_data;

#ifdef MAC
	SOCK_LOCK(so);
	error = mac_check_socket_send(td->td_ucred, so);
	SOCK_UNLOCK(so);
	if (error)
		goto bad;
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = segflg;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Sum iov lengths; a negative total means the count overflowed. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
	if (error) {
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(s, UIO_WRITE, ktruio, error);
	}
#endif
bad:
	fdrop(fp, td);
bad2:
	NET_UNLOCK_GIANT();
	return (error);
}
843
844/*
845 * MPSAFE
846 */
847int
848sendto(td, uap)
849	struct thread *td;
850	register struct sendto_args /* {
851		int	s;
852		caddr_t	buf;
853		size_t	len;
854		int	flags;
855		caddr_t	to;
856		int	tolen;
857	} */ *uap;
858{
859	struct msghdr msg;
860	struct iovec aiov;
861	int error;
862
863	msg.msg_name = uap->to;
864	msg.msg_namelen = uap->tolen;
865	msg.msg_iov = &aiov;
866	msg.msg_iovlen = 1;
867	msg.msg_control = 0;
868#ifdef COMPAT_OLDSOCK
869	msg.msg_flags = 0;
870#endif
871	aiov.iov_base = uap->buf;
872	aiov.iov_len = uap->len;
873	error = sendit(td, uap->s, &msg, uap->flags);
874	return (error);
875}
876
877#ifdef COMPAT_OLDSOCK
878/*
879 * MPSAFE
880 */
881int
882osend(td, uap)
883	struct thread *td;
884	register struct osend_args /* {
885		int	s;
886		caddr_t	buf;
887		int	len;
888		int	flags;
889	} */ *uap;
890{
891	struct msghdr msg;
892	struct iovec aiov;
893	int error;
894
895	msg.msg_name = 0;
896	msg.msg_namelen = 0;
897	msg.msg_iov = &aiov;
898	msg.msg_iovlen = 1;
899	aiov.iov_base = uap->buf;
900	aiov.iov_len = uap->len;
901	msg.msg_control = 0;
902	msg.msg_flags = 0;
903	error = sendit(td, uap->s, &msg, uap->flags);
904	return (error);
905}
906
907/*
908 * MPSAFE
909 */
910int
911osendmsg(td, uap)
912	struct thread *td;
913	struct osendmsg_args /* {
914		int	s;
915		caddr_t	msg;
916		int	flags;
917	} */ *uap;
918{
919	struct msghdr msg;
920	struct iovec *iov;
921	int error;
922
923	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
924	if (error)
925		return (error);
926	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
927	if (error)
928		return (error);
929	msg.msg_iov = iov;
930	msg.msg_flags = MSG_COMPAT;
931	error = sendit(td, uap->s, &msg, uap->flags);
932	free(iov, M_IOV);
933	return (error);
934}
935#endif
936
937/*
938 * MPSAFE
939 */
940int
941sendmsg(td, uap)
942	struct thread *td;
943	struct sendmsg_args /* {
944		int	s;
945		caddr_t	msg;
946		int	flags;
947	} */ *uap;
948{
949	struct msghdr msg;
950	struct iovec *iov;
951	int error;
952
953	error = copyin(uap->msg, &msg, sizeof (msg));
954	if (error)
955		return (error);
956	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
957	if (error)
958		return (error);
959	msg.msg_iov = iov;
960#ifdef COMPAT_OLDSOCK
961	msg.msg_flags = 0;
962#endif
963	error = sendit(td, uap->s, &msg, uap->flags);
964	free(iov, M_IOV);
965	return (error);
966}
967
/*
 * In-kernel implementation of the receive path: build a uio from mp's
 * iovec array, call soreceive(), and copy the source address and any
 * control data back to the caller.  If `controlp' is non-NULL the
 * control mbuf chain is handed to the caller instead of being copied
 * out to mp->msg_control.  The byte count received is returned via
 * td->td_retval[0].
 */
int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	int i;
	socklen_t len;
	int error;
	struct mbuf *m, *control = 0;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	if(controlp != NULL)
		*controlp = 0;

	NET_LOCK_GIANT();
	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
	if (error) {
		NET_UNLOCK_GIANT();
		return (error);
	}
	so = fp->f_data;

#ifdef MAC
	SOCK_LOCK(so);
	error = mac_check_socket_receive(td->td_ucred, so);
	SOCK_UNLOCK(so);
	if (error) {
		fdrop(fp, td);
		NET_UNLOCK_GIANT();
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Sum iov lengths; a negative total means the count overflowed. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			NET_UNLOCK_GIANT();
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
	    &mp->msg_flags);
	if (error) {
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != (int)len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = (int)len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error)
		goto out;
	td->td_retval[0] = (int)len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == 0)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		/* Copy the control mbuf chain out, truncating at the
		 * caller's buffer size and flagging MSG_CTRUNC if so. */
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
					ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
	NET_UNLOCK_GIANT();
	if (fromsa)
		FREE(fromsa, M_SONAME);

	/* Transfer the control chain to the caller only on success. */
	if (error == 0 && controlp != NULL)
		*controlp = control;
	else  if (control)
		m_freem(control);

	return (error);
}
1130
1131static int
1132recvit(td, s, mp, namelenp)
1133	struct thread *td;
1134	int s;
1135	struct msghdr *mp;
1136	void *namelenp;
1137{
1138	int error;
1139
1140	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1141	if (error)
1142		return (error);
1143	if (namelenp) {
1144		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1145#ifdef COMPAT_OLDSOCK
1146		if (mp->msg_flags & MSG_COMPAT)
1147			error = 0;	/* old recvfrom didn't check */
1148#endif
1149	}
1150	return (error);
1151}
1152
1153/*
1154 * MPSAFE
1155 */
1156int
1157recvfrom(td, uap)
1158	struct thread *td;
1159	register struct recvfrom_args /* {
1160		int	s;
1161		caddr_t	buf;
1162		size_t	len;
1163		int	flags;
1164		struct sockaddr * __restrict	from;
1165		socklen_t * __restrict fromlenaddr;
1166	} */ *uap;
1167{
1168	struct msghdr msg;
1169	struct iovec aiov;
1170	int error;
1171
1172	if (uap->fromlenaddr) {
1173		error = copyin(uap->fromlenaddr,
1174		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1175		if (error)
1176			goto done2;
1177	} else {
1178		msg.msg_namelen = 0;
1179	}
1180	msg.msg_name = uap->from;
1181	msg.msg_iov = &aiov;
1182	msg.msg_iovlen = 1;
1183	aiov.iov_base = uap->buf;
1184	aiov.iov_len = uap->len;
1185	msg.msg_control = 0;
1186	msg.msg_flags = uap->flags;
1187	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1188done2:
1189	return(error);
1190}
1191
1192#ifdef COMPAT_OLDSOCK
1193/*
1194 * MPSAFE
1195 */
1196int
1197orecvfrom(td, uap)
1198	struct thread *td;
1199	struct recvfrom_args *uap;
1200{
1201
1202	uap->flags |= MSG_COMPAT;
1203	return (recvfrom(td, uap));
1204}
1205#endif
1206
1207
1208#ifdef COMPAT_OLDSOCK
1209/*
1210 * MPSAFE
1211 */
1212int
1213orecv(td, uap)
1214	struct thread *td;
1215	register struct orecv_args /* {
1216		int	s;
1217		caddr_t	buf;
1218		int	len;
1219		int	flags;
1220	} */ *uap;
1221{
1222	struct msghdr msg;
1223	struct iovec aiov;
1224	int error;
1225
1226	msg.msg_name = 0;
1227	msg.msg_namelen = 0;
1228	msg.msg_iov = &aiov;
1229	msg.msg_iovlen = 1;
1230	aiov.iov_base = uap->buf;
1231	aiov.iov_len = uap->len;
1232	msg.msg_control = 0;
1233	msg.msg_flags = uap->flags;
1234	error = recvit(td, uap->s, &msg, NULL);
1235	return (error);
1236}
1237
1238/*
1239 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1240 * overlays the new one, missing only the flags, and with the (old) access
1241 * rights where the control fields are now.
1242 *
1243 * MPSAFE
1244 */
1245int
1246orecvmsg(td, uap)
1247	struct thread *td;
1248	struct orecvmsg_args /* {
1249		int	s;
1250		struct	omsghdr *msg;
1251		int	flags;
1252	} */ *uap;
1253{
1254	struct msghdr msg;
1255	struct iovec *iov;
1256	int error;
1257
1258	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1259	if (error)
1260		return (error);
1261	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1262	if (error)
1263		return (error);
1264	msg.msg_flags = uap->flags | MSG_COMPAT;
1265	msg.msg_iov = iov;
1266	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1267	if (msg.msg_controllen && error == 0)
1268		error = copyout(&msg.msg_controllen,
1269		    &uap->msg->msg_accrightslen, sizeof (int));
1270	free(iov, M_IOV);
1271	return (error);
1272}
1273#endif
1274
1275/*
1276 * MPSAFE
1277 */
1278int
1279recvmsg(td, uap)
1280	struct thread *td;
1281	struct recvmsg_args /* {
1282		int	s;
1283		struct	msghdr *msg;
1284		int	flags;
1285	} */ *uap;
1286{
1287	struct msghdr msg;
1288	struct iovec *uiov, *iov;
1289	int error;
1290
1291	error = copyin(uap->msg, &msg, sizeof (msg));
1292	if (error)
1293		return (error);
1294	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1295	if (error)
1296		return (error);
1297	msg.msg_flags = uap->flags;
1298#ifdef COMPAT_OLDSOCK
1299	msg.msg_flags &= ~MSG_COMPAT;
1300#endif
1301	uiov = msg.msg_iov;
1302	msg.msg_iov = iov;
1303	error = recvit(td, uap->s, &msg, NULL);
1304	if (error == 0) {
1305		msg.msg_iov = uiov;
1306		error = copyout(&msg, uap->msg, sizeof(msg));
1307	}
1308	free(iov, M_IOV);
1309	return (error);
1310}
1311
1312/*
1313 * MPSAFE
1314 */
1315/* ARGSUSED */
1316int
1317shutdown(td, uap)
1318	struct thread *td;
1319	register struct shutdown_args /* {
1320		int	s;
1321		int	how;
1322	} */ *uap;
1323{
1324	struct socket *so;
1325	struct file *fp;
1326	int error;
1327
1328	NET_LOCK_GIANT();
1329	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
1330	if (error == 0) {
1331		so = fp->f_data;
1332		error = soshutdown(so, uap->how);
1333		fdrop(fp, td);
1334	}
1335	NET_UNLOCK_GIANT();
1336	return (error);
1337}
1338
1339/*
1340 * MPSAFE
1341 */
1342/* ARGSUSED */
1343int
1344setsockopt(td, uap)
1345	struct thread *td;
1346	register struct setsockopt_args /* {
1347		int	s;
1348		int	level;
1349		int	name;
1350		caddr_t	val;
1351		int	valsize;
1352	} */ *uap;
1353{
1354
1355	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1356	    uap->val, UIO_USERSPACE, uap->valsize));
1357}
1358
1359int
1360kern_setsockopt(td, s, level, name, val, valseg, valsize)
1361	struct thread *td;
1362	int s;
1363	int level;
1364	int name;
1365	void *val;
1366	enum uio_seg valseg;
1367	socklen_t valsize;
1368{
1369	int error;
1370	struct socket *so;
1371	struct file *fp;
1372	struct sockopt sopt;
1373
1374	if (val == NULL && valsize != 0)
1375		return (EFAULT);
1376	if ((int)valsize < 0)
1377		return (EINVAL);
1378
1379	sopt.sopt_dir = SOPT_SET;
1380	sopt.sopt_level = level;
1381	sopt.sopt_name = name;
1382	sopt.sopt_val = val;
1383	sopt.sopt_valsize = valsize;
1384	switch (valseg) {
1385	case UIO_USERSPACE:
1386		sopt.sopt_td = td;
1387		break;
1388	case UIO_SYSSPACE:
1389		sopt.sopt_td = NULL;
1390		break;
1391	default:
1392		panic("kern_setsockopt called with bad valseg");
1393	}
1394
1395	NET_LOCK_GIANT();
1396	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1397	if (error == 0) {
1398		so = fp->f_data;
1399		error = sosetopt(so, &sopt);
1400		fdrop(fp, td);
1401	}
1402	NET_UNLOCK_GIANT();
1403	return(error);
1404}
1405
1406/*
1407 * MPSAFE
1408 */
1409/* ARGSUSED */
1410int
1411getsockopt(td, uap)
1412	struct thread *td;
1413	register struct getsockopt_args /* {
1414		int	s;
1415		int	level;
1416		int	name;
1417		void * __restrict	val;
1418		socklen_t * __restrict avalsize;
1419	} */ *uap;
1420{
1421	socklen_t valsize;
1422	int	error;
1423
1424	if (uap->val) {
1425		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1426		if (error)
1427			return (error);
1428	}
1429
1430	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1431	    uap->val, UIO_USERSPACE, &valsize);
1432
1433	if (error == 0)
1434		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1435	return (error);
1436}
1437
1438/*
1439 * Kernel version of getsockopt.
1440 * optval can be a userland or userspace. optlen is always a kernel pointer.
1441 */
1442int
1443kern_getsockopt(td, s, level, name, val, valseg, valsize)
1444	struct thread *td;
1445	int s;
1446	int level;
1447	int name;
1448	void *val;
1449	enum uio_seg valseg;
1450	socklen_t *valsize;
1451{
1452	int error;
1453	struct  socket *so;
1454	struct file *fp;
1455	struct	sockopt sopt;
1456
1457	if (val == NULL)
1458		*valsize = 0;
1459	if ((int)*valsize < 0)
1460		return (EINVAL);
1461
1462	sopt.sopt_dir = SOPT_GET;
1463	sopt.sopt_level = level;
1464	sopt.sopt_name = name;
1465	sopt.sopt_val = val;
1466	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1467	switch (valseg) {
1468	case UIO_USERSPACE:
1469		sopt.sopt_td = td;
1470		break;
1471	case UIO_SYSSPACE:
1472		sopt.sopt_td = NULL;
1473		break;
1474	default:
1475		panic("kern_getsockopt called with bad valseg");
1476	}
1477
1478	NET_LOCK_GIANT();
1479	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1480	if (error == 0) {
1481		so = fp->f_data;
1482		error = sogetopt(so, &sopt);
1483		*valsize = sopt.sopt_valsize;
1484		fdrop(fp, td);
1485	}
1486	NET_UNLOCK_GIANT();
1487	return (error);
1488}
1489
1490/*
1491 * getsockname1() - Get socket name.
1492 *
1493 * MPSAFE
1494 */
1495/* ARGSUSED */
1496static int
1497getsockname1(td, uap, compat)
1498	struct thread *td;
1499	register struct getsockname_args /* {
1500		int	fdes;
1501		struct sockaddr * __restrict asa;
1502		socklen_t * __restrict alen;
1503	} */ *uap;
1504	int compat;
1505{
1506	struct sockaddr *sa;
1507	socklen_t len;
1508	int error;
1509
1510	error = copyin(uap->alen, &len, sizeof(len));
1511	if (error)
1512		return (error);
1513
1514	error = kern_getsockname(td, uap->fdes, &sa, &len);
1515	if (error)
1516		return (error);
1517
1518	if (len != 0) {
1519#ifdef COMPAT_OLDSOCK
1520		if (compat)
1521			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1522#endif
1523		error = copyout(sa, uap->asa, (u_int)len);
1524	}
1525	free(sa, M_SONAME);
1526	if (error == 0)
1527		error = copyout(&len, uap->alen, sizeof(len));
1528	return (error);
1529}
1530
1531int
1532kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1533    socklen_t *alen)
1534{
1535	struct socket *so;
1536	struct file *fp;
1537	socklen_t len;
1538	int error;
1539
1540	if (*alen < 0)
1541		return (EINVAL);
1542
1543	NET_LOCK_GIANT();
1544	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1545	if (error)
1546		goto done;
1547	so = fp->f_data;
1548	*sa = NULL;
1549	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1550	if (error)
1551		goto bad;
1552	if (*sa == NULL)
1553		len = 0;
1554	else
1555		len = MIN(*alen, (*sa)->sa_len);
1556	*alen = len;
1557bad:
1558	fdrop(fp, td);
1559	if (error && *sa) {
1560		free(*sa, M_SONAME);
1561		*sa = NULL;
1562	}
1563done:
1564	NET_UNLOCK_GIANT();
1565	return (error);
1566}
1567
1568/*
1569 * MPSAFE
1570 */
1571int
1572getsockname(td, uap)
1573	struct thread *td;
1574	struct getsockname_args *uap;
1575{
1576
1577	return (getsockname1(td, uap, 0));
1578}
1579
1580#ifdef COMPAT_OLDSOCK
1581/*
1582 * MPSAFE
1583 */
1584int
1585ogetsockname(td, uap)
1586	struct thread *td;
1587	struct getsockname_args *uap;
1588{
1589
1590	return (getsockname1(td, uap, 1));
1591}
1592#endif /* COMPAT_OLDSOCK */
1593
1594/*
1595 * getpeername1() - Get name of peer for connected socket.
1596 *
1597 * MPSAFE
1598 */
1599/* ARGSUSED */
1600static int
1601getpeername1(td, uap, compat)
1602	struct thread *td;
1603	register struct getpeername_args /* {
1604		int	fdes;
1605		struct sockaddr * __restrict	asa;
1606		socklen_t * __restrict	alen;
1607	} */ *uap;
1608	int compat;
1609{
1610	struct sockaddr *sa;
1611	socklen_t len;
1612	int error;
1613
1614	error = copyin(uap->alen, &len, sizeof (len));
1615	if (error)
1616		return (error);
1617
1618	error = kern_getpeername(td, uap->fdes, &sa, &len);
1619	if (error)
1620		return (error);
1621
1622	if (len != 0) {
1623#ifdef COMPAT_OLDSOCK
1624		if (compat)
1625			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1626#endif
1627		error = copyout(sa, uap->asa, (u_int)len);
1628	}
1629	free(sa, M_SONAME);
1630	if (error == 0)
1631		error = copyout(&len, uap->alen, sizeof(len));
1632	return (error);
1633}
1634
1635int
1636kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1637    socklen_t *alen)
1638{
1639	struct socket *so;
1640	struct file *fp;
1641	socklen_t len;
1642	int error;
1643
1644	if (*alen < 0)
1645		return (EINVAL);
1646
1647	NET_LOCK_GIANT();
1648	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1649	if (error)
1650		goto done2;
1651	so = fp->f_data;
1652	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1653		error = ENOTCONN;
1654		goto done1;
1655	}
1656	*sa = NULL;
1657	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1658	if (error)
1659		goto bad;
1660	if (*sa == NULL)
1661		len = 0;
1662	else
1663		len = MIN(*alen, (*sa)->sa_len);
1664	*alen = len;
1665bad:
1666	if (error && *sa) {
1667		free(*sa, M_SONAME);
1668		*sa = NULL;
1669	}
1670done1:
1671	fdrop(fp, td);
1672done2:
1673	NET_UNLOCK_GIANT();
1674	return (error);
1675}
1676
1677/*
1678 * MPSAFE
1679 */
1680int
1681getpeername(td, uap)
1682	struct thread *td;
1683	struct getpeername_args *uap;
1684{
1685
1686	return (getpeername1(td, uap, 0));
1687}
1688
1689#ifdef COMPAT_OLDSOCK
1690/*
1691 * MPSAFE
1692 */
1693int
1694ogetpeername(td, uap)
1695	struct thread *td;
1696	struct ogetpeername_args *uap;
1697{
1698
1699	/* XXX uap should have type `getpeername_args *' to begin with. */
1700	return (getpeername1(td, (struct getpeername_args *)uap, 1));
1701}
1702#endif /* COMPAT_OLDSOCK */
1703
/*
 * Copy a user-supplied argument buffer (e.g. a sockaddr or accrights)
 * of "buflen" bytes into a freshly allocated mbuf of the given "type",
 * returned via *mp.  For MT_SONAME mbufs the sockaddr's sa_len is
 * forced to buflen (with an old-socket fixup for a zero sa_family).
 * Returns 0, or EINVAL/ENOBUFS/copyin error; on error no mbuf is
 * returned to the caller.
 */
int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	register struct sockaddr *sa;
	register struct mbuf *m;
	int error;

	if ((u_int)buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		/*
		 * Old-socket AF_UNIX binaries could pass a name up to
		 * 112 bytes; round such requests up to a full mbuf.
		 */
		if (type == MT_SONAME && (u_int)buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			if ((u_int)buflen > MCLBYTES)
				return (EINVAL);
	}
	m = m_get(M_TRYWAIT, type);
	if (m == NULL)
		return (ENOBUFS);
	if ((u_int)buflen > MLEN) {
		/* Doesn't fit in a plain mbuf; attach a cluster. */
		MCLGET(m, M_TRYWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
	}
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			/*
			 * Old sockaddrs had a 16-bit sa_family where the
			 * sa_len/sa_family pair now lives; recover the
			 * family from what lands in sa_len.
			 */
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			/* Kernel-internal sockaddrs carry their length. */
			sa->sa_len = buflen;
		}
	}
	return (error);
}
1751
1752int
1753getsockaddr(namp, uaddr, len)
1754	struct sockaddr **namp;
1755	caddr_t uaddr;
1756	size_t len;
1757{
1758	struct sockaddr *sa;
1759	int error;
1760
1761	if (len > SOCK_MAXADDRLEN)
1762		return (ENAMETOOLONG);
1763	if (len < offsetof(struct sockaddr, sa_data[0]))
1764		return (EINVAL);
1765	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1766	error = copyin(uaddr, sa, len);
1767	if (error) {
1768		FREE(sa, M_SONAME);
1769	} else {
1770#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1771		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1772			sa->sa_family = sa->sa_len;
1773#endif
1774		sa->sa_len = len;
1775		*namp = sa;
1776	}
1777	return (error);
1778}
1779
1780/*
1781 * Detach mapped page and release resources back to the system.
1782 */
1783void
1784sf_buf_mext(void *addr, void *args)
1785{
1786	vm_page_t m;
1787
1788	m = sf_buf_page(args);
1789	sf_buf_free(args);
1790	vm_page_lock_queues();
1791	vm_page_unwire(m, 0);
1792	/*
1793	 * Check for the object going away on us. This can
1794	 * happen since we don't hold a reference to it.
1795	 * If so, we're responsible for freeing the page.
1796	 */
1797	if (m->wire_count == 0 && m->object == NULL)
1798		vm_page_free(m);
1799	vm_page_unlock_queues();
1800}
1801
1802/*
1803 * sendfile(2)
1804 *
1805 * MPSAFE
1806 *
1807 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1808 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1809 *
1810 * Send a file specified by 'fd' and starting at 'offset' to a socket
1811 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1812 * nbytes == 0. Optionally add a header and/or trailer to the socket
1813 * output. If specified, write the total number of bytes sent into *sbytes.
1814 *
1815 */
1816int
1817sendfile(struct thread *td, struct sendfile_args *uap)
1818{
1819
1820	return (do_sendfile(td, uap, 0));
1821}
1822
1823static int
1824do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1825{
1826	struct sf_hdtr hdtr;
1827	struct uio *hdr_uio, *trl_uio;
1828	int error;
1829
1830	hdr_uio = trl_uio = NULL;
1831
1832	if (uap->hdtr != NULL) {
1833		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1834		if (error)
1835			goto out;
1836		if (hdtr.headers != NULL) {
1837			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1838			if (error)
1839				goto out;
1840		}
1841		if (hdtr.trailers != NULL) {
1842			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1843			if (error)
1844				goto out;
1845
1846		}
1847	}
1848
1849	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1850out:
1851	if (hdr_uio)
1852		free(hdr_uio, M_IOV);
1853	if (trl_uio)
1854		free(trl_uio, M_IOV);
1855	return (error);
1856}
1857
1858#ifdef COMPAT_FREEBSD4
1859int
1860freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1861{
1862	struct sendfile_args args;
1863
1864	args.fd = uap->fd;
1865	args.s = uap->s;
1866	args.offset = uap->offset;
1867	args.nbytes = uap->nbytes;
1868	args.hdtr = uap->hdtr;
1869	args.sbytes = uap->sbytes;
1870	args.flags = uap->flags;
1871
1872	return (do_sendfile(td, &args, 1));
1873}
1874#endif /* COMPAT_FREEBSD4 */
1875
1876int
1877kern_sendfile(struct thread *td, struct sendfile_args *uap,
1878    struct uio *hdr_uio, struct uio *trl_uio, int compat)
1879{
1880	struct file *sock_fp;
1881	struct vnode *vp;
1882	struct vm_object *obj = NULL;
1883	struct socket *so = NULL;
1884	struct mbuf *m, *m_header = NULL;
1885	struct sf_buf *sf;
1886	struct vm_page *pg;
1887	off_t off, xfsize, hdtr_size, sbytes = 0;
1888	int error, headersize = 0, headersent = 0;
1889	int vfslocked;
1890
1891	NET_LOCK_GIANT();
1892
1893	hdtr_size = 0;
1894
1895	/*
1896	 * The descriptor must be a regular file and have a backing VM object.
1897	 */
1898	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1899		goto done;
1900	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1901	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1902	obj = vp->v_object;
1903	if (obj != NULL) {
1904		/*
1905		 * Temporarily increase the backing VM object's reference
1906		 * count so that a forced reclamation of its vnode does not
1907		 * immediately destroy it.
1908		 */
1909		VM_OBJECT_LOCK(obj);
1910		if ((obj->flags & OBJ_DEAD) == 0) {
1911			vm_object_reference_locked(obj);
1912			VM_OBJECT_UNLOCK(obj);
1913		} else {
1914			VM_OBJECT_UNLOCK(obj);
1915			obj = NULL;
1916		}
1917	}
1918	VOP_UNLOCK(vp, 0, td);
1919	VFS_UNLOCK_GIANT(vfslocked);
1920	if (obj == NULL) {
1921		error = EINVAL;
1922		goto done;
1923	}
1924	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, NULL)) != 0)
1925		goto done;
1926	so = sock_fp->f_data;
1927	if (so->so_type != SOCK_STREAM) {
1928		error = EINVAL;
1929		goto done;
1930	}
1931	if ((so->so_state & SS_ISCONNECTED) == 0) {
1932		error = ENOTCONN;
1933		goto done;
1934	}
1935	if (uap->offset < 0) {
1936		error = EINVAL;
1937		goto done;
1938	}
1939
1940#ifdef MAC
1941	SOCK_LOCK(so);
1942	error = mac_check_socket_send(td->td_ucred, so);
1943	SOCK_UNLOCK(so);
1944	if (error)
1945		goto done;
1946#endif
1947
1948	/*
1949	 * If specified, get the pointer to the sf_hdtr struct for
1950	 * any headers/trailers.
1951	 */
1952	if (hdr_uio != NULL) {
1953		hdr_uio->uio_td = td;
1954		hdr_uio->uio_rw = UIO_WRITE;
1955		if (hdr_uio->uio_resid > 0) {
1956			m_header = m_uiotombuf(hdr_uio, M_DONTWAIT, 0, 0);
1957			if (m_header == NULL)
1958				goto done;
1959			headersize = m_header->m_pkthdr.len;
1960			if (compat)
1961				sbytes += headersize;
1962		}
1963	}
1964
1965	/*
1966	 * Protect against multiple writers to the socket.
1967	 */
1968	SOCKBUF_LOCK(&so->so_snd);
1969	(void) sblock(&so->so_snd, M_WAITOK);
1970	SOCKBUF_UNLOCK(&so->so_snd);
1971
1972	/*
1973	 * Loop through the pages in the file, starting with the requested
1974	 * offset. Get a file page (do I/O if necessary), map the file page
1975	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1976	 * it on the socket.
1977	 */
1978	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1979		vm_pindex_t pindex;
1980		vm_offset_t pgoff;
1981
1982		pindex = OFF_TO_IDX(off);
1983		VM_OBJECT_LOCK(obj);
1984retry_lookup:
1985		/*
1986		 * Calculate the amount to transfer. Not to exceed a page,
1987		 * the EOF, or the passed in nbytes.
1988		 */
1989		xfsize = obj->un_pager.vnp.vnp_size - off;
1990		VM_OBJECT_UNLOCK(obj);
1991		if (xfsize > PAGE_SIZE)
1992			xfsize = PAGE_SIZE;
1993		pgoff = (vm_offset_t)(off & PAGE_MASK);
1994		if (PAGE_SIZE - pgoff < xfsize)
1995			xfsize = PAGE_SIZE - pgoff;
1996		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1997			xfsize = uap->nbytes - sbytes;
1998		if (xfsize <= 0) {
1999			if (m_header != NULL) {
2000				m = m_header;
2001				m_header = NULL;
2002				SOCKBUF_LOCK(&so->so_snd);
2003				goto retry_space;
2004			} else
2005				break;
2006		}
2007		/*
2008		 * Optimize the non-blocking case by looking at the socket space
2009		 * before going to the extra work of constituting the sf_buf.
2010		 */
2011		SOCKBUF_LOCK(&so->so_snd);
2012		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
2013			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2014				error = EPIPE;
2015			else
2016				error = EAGAIN;
2017			sbunlock(&so->so_snd);
2018			SOCKBUF_UNLOCK(&so->so_snd);
2019			goto done;
2020		}
2021		SOCKBUF_UNLOCK(&so->so_snd);
2022		VM_OBJECT_LOCK(obj);
2023		/*
2024		 * Attempt to look up the page.
2025		 *
2026		 *	Allocate if not found
2027		 *
2028		 *	Wait and loop if busy.
2029		 */
2030		pg = vm_page_lookup(obj, pindex);
2031
2032		if (pg == NULL) {
2033			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY |
2034			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
2035			if (pg == NULL) {
2036				VM_OBJECT_UNLOCK(obj);
2037				VM_WAIT;
2038				VM_OBJECT_LOCK(obj);
2039				goto retry_lookup;
2040			}
2041		} else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
2042			goto retry_lookup;
2043		else {
2044			/*
2045			 * Wire the page so it does not get ripped out from
2046			 * under us.
2047			 */
2048			vm_page_lock_queues();
2049			vm_page_wire(pg);
2050			vm_page_unlock_queues();
2051		}
2052
2053		/*
2054		 * If page is not valid for what we need, initiate I/O
2055		 */
2056
2057		if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) {
2058			VM_OBJECT_UNLOCK(obj);
2059		} else if (uap->flags & SF_NODISKIO) {
2060			error = EBUSY;
2061		} else {
2062			int bsize, resid;
2063
2064			/*
2065			 * Ensure that our page is still around when the I/O
2066			 * completes.
2067			 */
2068			vm_page_io_start(pg);
2069			VM_OBJECT_UNLOCK(obj);
2070
2071			/*
2072			 * Get the page from backing store.
2073			 */
2074			bsize = vp->v_mount->mnt_stat.f_iosize;
2075			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2076			vn_lock(vp, LK_SHARED | LK_RETRY, td);
2077			/*
2078			 * XXXMAC: Because we don't have fp->f_cred here,
2079			 * we pass in NOCRED.  This is probably wrong, but
2080			 * is consistent with our original implementation.
2081			 */
2082			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2083			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2084			    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2085			    td->td_ucred, NOCRED, &resid, td);
2086			VOP_UNLOCK(vp, 0, td);
2087			VFS_UNLOCK_GIANT(vfslocked);
2088			VM_OBJECT_LOCK(obj);
2089			vm_page_io_finish(pg);
2090			if (!error)
2091				VM_OBJECT_UNLOCK(obj);
2092			mbstat.sf_iocnt++;
2093		}
2094
2095		if (error) {
2096			vm_page_lock_queues();
2097			vm_page_unwire(pg, 0);
2098			/*
2099			 * See if anyone else might know about this page.
2100			 * If not and it is not valid, then free it.
2101			 */
2102			if (pg->wire_count == 0 && pg->valid == 0 &&
2103			    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
2104			    pg->hold_count == 0) {
2105				vm_page_free(pg);
2106			}
2107			vm_page_unlock_queues();
2108			VM_OBJECT_UNLOCK(obj);
2109			SOCKBUF_LOCK(&so->so_snd);
2110			sbunlock(&so->so_snd);
2111			SOCKBUF_UNLOCK(&so->so_snd);
2112			goto done;
2113		}
2114
2115		/*
2116		 * Get a sendfile buf. We usually wait as long as necessary,
2117		 * but this wait can be interrupted.
2118		 */
2119		if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) {
2120			mbstat.sf_allocfail++;
2121			vm_page_lock_queues();
2122			vm_page_unwire(pg, 0);
2123			if (pg->wire_count == 0 && pg->object == NULL)
2124				vm_page_free(pg);
2125			vm_page_unlock_queues();
2126			SOCKBUF_LOCK(&so->so_snd);
2127			sbunlock(&so->so_snd);
2128			SOCKBUF_UNLOCK(&so->so_snd);
2129			error = EINTR;
2130			goto done;
2131		}
2132
2133		/*
2134		 * Get an mbuf header and set it up as having external storage.
2135		 */
2136		if (m_header)
2137			MGET(m, M_TRYWAIT, MT_DATA);
2138		else
2139			MGETHDR(m, M_TRYWAIT, MT_DATA);
2140		if (m == NULL) {
2141			error = ENOBUFS;
2142			sf_buf_mext((void *)sf_buf_kva(sf), sf);
2143			SOCKBUF_LOCK(&so->so_snd);
2144			sbunlock(&so->so_snd);
2145			SOCKBUF_UNLOCK(&so->so_snd);
2146			goto done;
2147		}
2148		/*
2149		 * Setup external storage for mbuf.
2150		 */
2151		MEXTADD(m, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY,
2152		    EXT_SFBUF);
2153		m->m_data = (char *)sf_buf_kva(sf) + pgoff;
2154		m->m_pkthdr.len = m->m_len = xfsize;
2155
2156		if (m_header) {
2157			m_cat(m_header, m);
2158			m = m_header;
2159			m_header = NULL;
2160			m_fixhdr(m);
2161		}
2162
2163		/*
2164		 * Add the buffer to the socket buffer chain.
2165		 */
2166		SOCKBUF_LOCK(&so->so_snd);
2167retry_space:
2168		/*
2169		 * Make sure that the socket is still able to take more data.
2170		 * CANTSENDMORE being true usually means that the connection
2171		 * was closed. so_error is true when an error was sensed after
2172		 * a previous send.
2173		 * The state is checked after the page mapping and buffer
2174		 * allocation above since those operations may block and make
2175		 * any socket checks stale. From this point forward, nothing
2176		 * blocks before the pru_send (or more accurately, any blocking
2177		 * results in a loop back to here to re-check).
2178		 */
2179		SOCKBUF_LOCK_ASSERT(&so->so_snd);
2180		if ((so->so_snd.sb_state & SBS_CANTSENDMORE) || so->so_error) {
2181			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2182				error = EPIPE;
2183			} else {
2184				error = so->so_error;
2185				so->so_error = 0;
2186			}
2187			m_freem(m);
2188			sbunlock(&so->so_snd);
2189			SOCKBUF_UNLOCK(&so->so_snd);
2190			goto done;
2191		}
2192		/*
2193		 * Wait for socket space to become available. We do this just
2194		 * after checking the connection state above in order to avoid
2195		 * a race condition with sbwait().
2196		 */
2197		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
2198			if (so->so_state & SS_NBIO) {
2199				m_freem(m);
2200				sbunlock(&so->so_snd);
2201				SOCKBUF_UNLOCK(&so->so_snd);
2202				error = EAGAIN;
2203				goto done;
2204			}
2205			error = sbwait(&so->so_snd);
2206			/*
2207			 * An error from sbwait usually indicates that we've
2208			 * been interrupted by a signal. If we've sent anything
2209			 * then return bytes sent, otherwise return the error.
2210			 */
2211			if (error) {
2212				m_freem(m);
2213				sbunlock(&so->so_snd);
2214				SOCKBUF_UNLOCK(&so->so_snd);
2215				goto done;
2216			}
2217			goto retry_space;
2218		}
2219		SOCKBUF_UNLOCK(&so->so_snd);
2220		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
2221		if (error) {
2222			SOCKBUF_LOCK(&so->so_snd);
2223			sbunlock(&so->so_snd);
2224			SOCKBUF_UNLOCK(&so->so_snd);
2225			goto done;
2226		}
2227		headersent = 1;
2228	}
2229	SOCKBUF_LOCK(&so->so_snd);
2230	sbunlock(&so->so_snd);
2231	SOCKBUF_UNLOCK(&so->so_snd);
2232
2233	/*
2234	 * Send trailers. Wimp out and use writev(2).
2235	 */
2236	if (trl_uio != NULL) {
2237		error = kern_writev(td, uap->s, trl_uio);
2238		if (error)
2239			goto done;
2240		if (compat)
2241			sbytes += td->td_retval[0];
2242		else
2243			hdtr_size += td->td_retval[0];
2244	}
2245
2246done:
2247	if (headersent) {
2248		if (!compat)
2249			hdtr_size += headersize;
2250	} else {
2251		if (compat)
2252			sbytes -= headersize;
2253	}
2254	/*
2255	 * If there was no error we have to clear td->td_retval[0]
2256	 * because it may have been set by writev.
2257	 */
2258	if (error == 0) {
2259		td->td_retval[0] = 0;
2260	}
2261	if (uap->sbytes != NULL) {
2262		if (!compat)
2263			sbytes += hdtr_size;
2264		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2265	}
2266	if (obj != NULL)
2267		vm_object_deallocate(obj);
2268	if (vp != NULL) {
2269		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2270		vrele(vp);
2271		VFS_UNLOCK_GIANT(vfslocked);
2272	}
2273	if (so)
2274		fdrop(sock_fp, td);
2275	if (m_header)
2276		m_freem(m_header);
2277
2278	NET_UNLOCK_GIANT();
2279
2280	if (error == ERESTART)
2281		error = EINTR;
2282
2283	return (error);
2284}
2285