1/*
2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1989, 1990, 1993
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * sendfile(2) and related extensions:
33 * Copyright (c) 1998, David Greenman. All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 *    notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 *    notice, this list of conditions and the following disclaimer in the
42 *    documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 *    must display the following acknowledgement:
45 *	This product includes software developed by the University of
46 *	California, Berkeley and its contributors.
47 * 4. Neither the name of the University nor the names of its contributors
48 *    may be used to endorse or promote products derived from this software
49 *    without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
64 */
65/*
66 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
67 * support for mandatory and extensible security protections.  This notice
68 * is included in support of clause 2.2 (b) of the Apple Public License,
69 * Version 2.0.
70 */
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/filedesc.h>
75#include <sys/proc_internal.h>
76#include <sys/file_internal.h>
77#include <sys/vnode_internal.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <kern/lock.h>
81#include <sys/domain.h>
82#include <sys/protosw.h>
83#include <sys/signalvar.h>
84#include <sys/socket.h>
85#include <sys/socketvar.h>
86#include <sys/kernel.h>
87#include <sys/uio_internal.h>
88#include <sys/kauth.h>
89#include <kern/task.h>
90
91#include <security/audit/audit.h>
92
93#include <sys/kdebug.h>
94#include <sys/sysproto.h>
95#include <netinet/in.h>
96#include <net/route.h>
97#include <netinet/in_pcb.h>
98
99#if CONFIG_MACF_SOCKET_SUBSET
100#include <security/mac_framework.h>
101#endif /* MAC_SOCKET_SUBSET */
102
/*
 * Shorthand accessors: each f_xxx name expands to the matching fg_xxx
 * member reached through the f_fglob pointer, so that code in this file
 * can write fp->f_flag, fp->f_type, fp->f_data, etc. on a fileproc.
 */
#define	f_flag f_fglob->fg_flag
#define	f_type f_fglob->fg_type
#define	f_msgcount f_fglob->fg_msgcount
#define	f_cred f_fglob->fg_cred
#define	f_ops f_fglob->fg_ops
#define	f_offset f_fglob->fg_offset
#define	f_data f_fglob->fg_data


/*
 * kdebug trace codes used by the KERNEL_DEBUG() calls in this file; all
 * are built with NETDBG_CODE() in the DBG_NETSOCK subclass.
 */
#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SENDMSG		NETDBG_CODE(DBG_NETSOCK, (1 << 8) | 1)
#define	DBG_FNC_SENDTO		NETDBG_CODE(DBG_NETSOCK, (2 << 8) | 1)
#define	DBG_FNC_SENDIT		NETDBG_CODE(DBG_NETSOCK, (3 << 8) | 1)
#define	DBG_FNC_RECVFROM	NETDBG_CODE(DBG_NETSOCK, (5 << 8))
#define	DBG_FNC_RECVMSG		NETDBG_CODE(DBG_NETSOCK, (6 << 8))
#define	DBG_FNC_RECVIT		NETDBG_CODE(DBG_NETSOCK, (7 << 8))
#define	DBG_FNC_SENDFILE	NETDBG_CODE(DBG_NETSOCK, (10 << 8))
#define	DBG_FNC_SENDFILE_WAIT	NETDBG_CODE(DBG_NETSOCK, ((10 << 8) | 1))
#define	DBG_FNC_SENDFILE_READ	NETDBG_CODE(DBG_NETSOCK, ((10 << 8) | 2))
#define	DBG_FNC_SENDFILE_SEND	NETDBG_CODE(DBG_NETSOCK, ((10 << 8) | 3))


/*
 * Radar 4056224 compatibility switch.  When enabled, sendto_nocancel()
 * falls back to write() for pipes; last_pid_4056224 remembers the highest
 * pid already reported so the console message is not repeated for it.
 */
#define	HACK_FOR_4056224 1
#if HACK_FOR_4056224
static pid_t last_pid_4056224 = 0;
#endif /* HACK_FOR_4056224 */
132
/* TODO: should be in header file */
int falloc_locked(proc_t, struct fileproc **, int *, vfs_context_t, int);

/*
 * Forward declarations of the file-local helpers shared by the send and
 * receive system calls and by the address copy-in paths.
 */
static int sendit(struct proc *, int, struct user_msghdr *, uio_t, int,
    int32_t *);
static int recvit(struct proc *, int, struct user_msghdr *, uio_t, user_addr_t,
    int32_t *);
static int getsockaddr(struct socket *, struct sockaddr **, user_addr_t,
    size_t, boolean_t);
static int getsockaddr_s(struct socket *, struct sockaddr_storage *,
    user_addr_t, size_t, boolean_t);
#if SENDFILE
static void alloc_sendpkt(int, size_t, unsigned int *, struct mbuf **,
    boolean_t);
#endif /* SENDFILE */

/*
 * System call interface to the socket abstraction.
 */

/*
 * File operations vector stored in f_ops for every socket descriptor
 * created in this file (socket, accept, socketpair); defined elsewhere.
 */
extern	struct fileops socketops;
154
155/*
156 * Returns:	0			Success
157 *		EACCES			Mandatory Access Control failure
158 *	falloc:ENFILE
159 *	falloc:EMFILE
160 *	falloc:ENOMEM
161 *	socreate:EAFNOSUPPORT
162 *	socreate:EPROTOTYPE
163 *	socreate:EPROTONOSUPPORT
164 *	socreate:ENOBUFS
165 *	socreate:ENOMEM
166 *	socreate:EISCONN
167 *	socreate:???			[other protocol families, IPSEC]
168 */
169int
170socket(struct proc *p, struct socket_args *uap, int32_t *retval)
171{
172	struct socket *so;
173	struct fileproc *fp;
174	int fd, error;
175
176	AUDIT_ARG(socket, uap->domain, uap->type, uap->protocol);
177#if CONFIG_MACF_SOCKET_SUBSET
178	if ((error = mac_socket_check_create(kauth_cred_get(), uap->domain,
179	    uap->type, uap->protocol)) != 0)
180		return (error);
181#endif /* MAC_SOCKET_SUBSET */
182
183	error = falloc(p, &fp, &fd, vfs_context_current());
184	if (error) {
185		return (error);
186	}
187	fp->f_flag = FREAD|FWRITE;
188	fp->f_type = DTYPE_SOCKET;
189	fp->f_ops = &socketops;
190
191	error = socreate(uap->domain, &so, uap->type, uap->protocol);
192	if (error) {
193		fp_free(p, fd, fp);
194	} else {
195		thread_t			thread;
196		struct uthread		*ut;
197
198		thread = current_thread();
199		ut = get_bsdthread_info(thread);
200
201		/* if this is a backgrounded thread then throttle all new sockets */
202		if (proc_get_selfthread_isbackground() != 0) {
203			so->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND;
204			so->so_background_thread = thread;
205		}
206		fp->f_data = (caddr_t)so;
207
208		proc_fdlock(p);
209		procfdtbl_releasefd(p, fd, NULL);
210
211		fp_drop(p, fd, fp, 1);
212		proc_fdunlock(p);
213
214		*retval = fd;
215	}
216	return (error);
217}
218
219/*
220 * Returns:	0			Success
221 *		EDESTADDRREQ		Destination address required
222 *		EBADF			Bad file descriptor
223 *		EACCES			Mandatory Access Control failure
224 *	file_socket:ENOTSOCK
225 *	file_socket:EBADF
226 *	getsockaddr:ENAMETOOLONG	Filename too long
227 *	getsockaddr:EINVAL		Invalid argument
228 *	getsockaddr:ENOMEM		Not enough space
229 *	getsockaddr:EFAULT		Bad address
230 *	sobind:???
231 */
232/* ARGSUSED */
233int
234bind(__unused proc_t p, struct bind_args *uap, __unused int32_t *retval)
235{
236	struct sockaddr_storage ss;
237	struct sockaddr *sa = NULL;
238	struct socket *so;
239	boolean_t want_free = TRUE;
240	int error;
241
242	AUDIT_ARG(fd, uap->s);
243	error = file_socket(uap->s, &so);
244	if (error != 0)
245		return (error);
246	if (so == NULL) {
247		error = EBADF;
248		goto out;
249	}
250	if (uap->name == USER_ADDR_NULL) {
251		error = EDESTADDRREQ;
252		goto out;
253	}
254	if (uap->namelen > sizeof (ss)) {
255		error = getsockaddr(so, &sa, uap->name, uap->namelen, TRUE);
256	} else {
257		error = getsockaddr_s(so, &ss, uap->name, uap->namelen, TRUE);
258		if (error == 0) {
259			sa = (struct sockaddr *)&ss;
260			want_free = FALSE;
261		}
262	}
263	if (error != 0)
264		goto out;
265	AUDIT_ARG(sockaddr, vfs_context_cwd(vfs_context_current()), sa);
266#if CONFIG_MACF_SOCKET_SUBSET
267	if ((error = mac_socket_check_bind(kauth_cred_get(), so, sa)) == 0)
268		error = sobind(so, sa);
269#else
270		error = sobind(so, sa);
271#endif /* MAC_SOCKET_SUBSET */
272	if (want_free)
273		FREE(sa, M_SONAME);
274out:
275	file_drop(uap->s);
276	return (error);
277}
278
279/*
280 * Returns:	0			Success
281 *		EBADF
282 *		EACCES			Mandatory Access Control failure
283 *	file_socket:ENOTSOCK
284 *	file_socket:EBADF
285 *	solisten:EINVAL
286 *	solisten:EOPNOTSUPP
287 *	solisten:???
288 */
289int
290listen(__unused struct proc *p, struct listen_args *uap,
291    __unused int32_t *retval)
292{
293	int error;
294	struct socket *so;
295
296	AUDIT_ARG(fd, uap->s);
297	error = file_socket(uap->s, &so);
298	if (error)
299		return (error);
300	if (so != NULL)
301#if CONFIG_MACF_SOCKET_SUBSET
302	{
303		error = mac_socket_check_listen(kauth_cred_get(), so);
304		if (error == 0)
305			error = solisten(so, uap->backlog);
306	}
307#else
308		error =  solisten(so, uap->backlog);
309#endif /* MAC_SOCKET_SUBSET */
310	else
311		error = EBADF;
312
313	file_drop(uap->s);
314	return (error);
315}
316
317/*
318 * Returns:	fp_getfsock:EBADF	Bad file descriptor
319 *		fp_getfsock:EOPNOTSUPP	...
320 *		xlate => :ENOTSOCK	Socket operation on non-socket
321 *		:EFAULT			Bad address on copyin/copyout
322 *		:EBADF			Bad file descriptor
323 *		:EOPNOTSUPP		Operation not supported on socket
324 *		:EINVAL			Invalid argument
325 *		:EWOULDBLOCK		Operation would block
326 *		:ECONNABORTED		Connection aborted
327 *		:EINTR			Interrupted function
328 *		:EACCES			Mandatory Access Control failure
329 *		falloc_locked:ENFILE	Too many files open in system
330 *		falloc_locked::EMFILE	Too many open files
331 *		falloc_locked::ENOMEM	Not enough space
332 *		0			Success
333 */
334int
335accept_nocancel(struct proc *p, struct accept_nocancel_args *uap,
336    int32_t *retval)
337{
338	struct fileproc *fp;
339	struct sockaddr *sa = NULL;
340	socklen_t namelen;
341	int error;
342	struct socket *head, *so = NULL;
343	lck_mtx_t *mutex_held;
344	int fd = uap->s;
345	int newfd;
346	short fflag;		/* type must match fp->f_flag */
347	int dosocklock = 0;
348
349	*retval = -1;
350
351	AUDIT_ARG(fd, uap->s);
352
353	if (uap->name) {
354		error = copyin(uap->anamelen, (caddr_t)&namelen,
355		    sizeof (socklen_t));
356		if (error)
357			return (error);
358	}
359	error = fp_getfsock(p, fd, &fp, &head);
360	if (error) {
361		if (error == EOPNOTSUPP)
362			error = ENOTSOCK;
363		return (error);
364	}
365	if (head == NULL) {
366		error = EBADF;
367		goto out;
368	}
369#if CONFIG_MACF_SOCKET_SUBSET
370	if ((error = mac_socket_check_accept(kauth_cred_get(), head)) != 0)
371		goto out;
372#endif /* MAC_SOCKET_SUBSET */
373
374	socket_lock(head, 1);
375
376	if (head->so_proto->pr_getlock != NULL)  {
377		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
378		dosocklock = 1;
379	} else {
380		mutex_held = head->so_proto->pr_domain->dom_mtx;
381		dosocklock = 0;
382	}
383
384	if ((head->so_options & SO_ACCEPTCONN) == 0) {
385		if ((head->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
386			error = EOPNOTSUPP;
387		} else {
388			/* POSIX: The socket is not accepting connections */
389			error = EINVAL;
390		}
391		socket_unlock(head, 1);
392		goto out;
393	}
394	if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) {
395		socket_unlock(head, 1);
396		error = EWOULDBLOCK;
397		goto out;
398	}
399	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
400		if (head->so_state & SS_CANTRCVMORE) {
401			head->so_error = ECONNABORTED;
402			break;
403		}
404		if (head->so_usecount < 1)
405			panic("accept: head=%p refcount=%d\n", head,
406			    head->so_usecount);
407		error = msleep((caddr_t)&head->so_timeo, mutex_held,
408		    PSOCK | PCATCH, "accept", 0);
409		if (head->so_usecount < 1)
410			panic("accept: 2 head=%p refcount=%d\n", head,
411			    head->so_usecount);
412		if ((head->so_state & SS_DRAINING)) {
413			error = ECONNABORTED;
414		}
415		if (error) {
416			socket_unlock(head, 1);
417			goto out;
418		}
419	}
420	if (head->so_error) {
421		error = head->so_error;
422		head->so_error = 0;
423		socket_unlock(head, 1);
424		goto out;
425	}
426
427
428	/*
429	 * At this point we know that there is at least one connection
430	 * ready to be accepted. Remove it from the queue prior to
431	 * allocating the file descriptor for it since falloc() may
432	 * block allowing another process to accept the connection
433	 * instead.
434	 */
435	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
436	so = TAILQ_FIRST(&head->so_comp);
437	TAILQ_REMOVE(&head->so_comp, so, so_list);
438	head->so_qlen--;
439	/* unlock head to avoid deadlock with select, keep a ref on head */
440	socket_unlock(head, 0);
441
442#if CONFIG_MACF_SOCKET_SUBSET
443	/*
444	 * Pass the pre-accepted socket to the MAC framework. This is
445	 * cheaper than allocating a file descriptor for the socket,
446	 * calling the protocol accept callback, and possibly freeing
447	 * the file descriptor should the MAC check fails.
448	 */
449	if ((error = mac_socket_check_accepted(kauth_cred_get(), so)) != 0) {
450		so->so_state &= ~(SS_NOFDREF | SS_COMP);
451		so->so_head = NULL;
452		soclose(so);
453		/* Drop reference on listening socket */
454		sodereference(head);
455		goto out;
456	}
457#endif /* MAC_SOCKET_SUBSET */
458
459	/*
460	 * Pass the pre-accepted socket to any interested socket filter(s).
461	 * Upon failure, the socket would have been closed by the callee.
462	 */
463	if (so->so_filt != NULL && (error = soacceptfilter(so)) != 0) {
464		/* Drop reference on listening socket */
465		sodereference(head);
466		/* Propagate socket filter's error code to the caller */
467		goto out;
468	}
469
470	fflag = fp->f_flag;
471	error = falloc(p, &fp, &newfd, vfs_context_current());
472	if (error) {
473		/*
474		 * Probably ran out of file descriptors.
475		 *
476		 * <rdar://problem/8554930>
477		 * Don't put this back on the socket like we used to, that
478		 * just causes the client to spin. Drop the socket.
479		 */
480		so->so_state &= ~(SS_NOFDREF | SS_COMP);
481		so->so_head = NULL;
482		soclose(so);
483		sodereference(head);
484		goto out;
485	}
486	*retval = newfd;
487	fp->f_type = DTYPE_SOCKET;
488	fp->f_flag = fflag;
489	fp->f_ops = &socketops;
490	fp->f_data = (caddr_t)so;
491	socket_lock(head, 0);
492	if (dosocklock)
493		socket_lock(so, 1);
494	so->so_state &= ~SS_COMP;
495	so->so_head = NULL;
496	(void) soacceptlock(so, &sa, 0);
497	socket_unlock(head, 1);
498	if (sa == NULL) {
499		namelen = 0;
500		if (uap->name)
501			goto gotnoname;
502		error = 0;
503		goto releasefd;
504	}
505	AUDIT_ARG(sockaddr, vfs_context_cwd(vfs_context_current()), sa);
506
507	if (uap->name) {
508		socklen_t	sa_len;
509
510		/* save sa_len before it is destroyed */
511		sa_len = sa->sa_len;
512		namelen = MIN(namelen, sa_len);
513		error = copyout(sa, uap->name, namelen);
514		if (!error)
515			/* return the actual, untruncated address length */
516			namelen = sa_len;
517gotnoname:
518		error = copyout((caddr_t)&namelen, uap->anamelen,
519		    sizeof (socklen_t));
520	}
521	FREE(sa, M_SONAME);
522
523releasefd:
524	/*
525	 * If the socket has been marked as inactive by sosetdefunct(),
526	 * disallow further operations on it.
527	 */
528	if (so->so_flags & SOF_DEFUNCT) {
529		sodefunct(current_proc(), so,
530		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
531	}
532
533	if (dosocklock)
534		socket_unlock(so, 1);
535
536	proc_fdlock(p);
537	procfdtbl_releasefd(p, newfd, NULL);
538	fp_drop(p, newfd, fp, 1);
539	proc_fdunlock(p);
540
541out:
542	file_drop(fd);
543	return (error);
544}
545
546int
547accept(struct proc *p, struct accept_args *uap, int32_t *retval)
548{
549	__pthread_testcancel(1);
550	return(accept_nocancel(p, (struct accept_nocancel_args *)uap, retval));
551}
552
553/*
554 * Returns:	0			Success
555 *		EBADF			Bad file descriptor
556 *		EALREADY		Connection already in progress
557 *		EINPROGRESS		Operation in progress
558 *		ECONNABORTED		Connection aborted
559 *		EINTR			Interrupted function
560 *		EACCES			Mandatory Access Control failure
561 *	file_socket:ENOTSOCK
562 *	file_socket:EBADF
563 *	getsockaddr:ENAMETOOLONG	Filename too long
564 *	getsockaddr:EINVAL		Invalid argument
565 *	getsockaddr:ENOMEM		Not enough space
566 *	getsockaddr:EFAULT		Bad address
567 *	soconnectlock:EOPNOTSUPP
568 *	soconnectlock:EISCONN
569 *	soconnectlock:???		[depends on protocol, filters]
570 *	msleep:EINTR
571 *
572 * Imputed:	so_error		error may be set from so_error, which
573 *					may have been set by soconnectlock.
574 */
575/* ARGSUSED */
576int
577connect(struct proc *p, struct connect_args *uap, int32_t *retval)
578{
579	__pthread_testcancel(1);
580	return(connect_nocancel(p, (struct connect_nocancel_args *)uap, retval));
581}
582
583int
584connect_nocancel(__unused proc_t p, struct connect_nocancel_args *uap, __unused int32_t *retval)
585{
586	struct socket *so;
587	struct sockaddr_storage ss;
588	struct sockaddr *sa = NULL;
589	lck_mtx_t *mutex_held;
590	boolean_t want_free = TRUE;
591	int error;
592	int fd = uap->s;
593	boolean_t dgram;
594
595	AUDIT_ARG(fd, uap->s);
596	error = file_socket(fd, &so);
597	if (error != 0)
598		return (error);
599	if (so == NULL) {
600		error = EBADF;
601		goto out;
602	}
603
604	/*
605	 * Ask getsockaddr{_s} to not translate AF_UNSPEC to AF_INET
606	 * if this is a datagram socket; translate for other types.
607	 */
608	dgram = (so->so_type == SOCK_DGRAM);
609
610	/* Get socket address now before we obtain socket lock */
611	if (uap->namelen > sizeof (ss)) {
612		error = getsockaddr(so, &sa, uap->name, uap->namelen, !dgram);
613	} else {
614		error = getsockaddr_s(so, &ss, uap->name, uap->namelen, !dgram);
615		if (error == 0) {
616			sa = (struct sockaddr *)&ss;
617			want_free = FALSE;
618		}
619	}
620	if (error != 0)
621		goto out;
622
623	AUDIT_ARG(sockaddr, vfs_context_cwd(vfs_context_current()), sa);
624#if CONFIG_MACF_SOCKET_SUBSET
625	if ((error = mac_socket_check_connect(kauth_cred_get(), so, sa)) != 0) {
626		if (want_free)
627			FREE(sa, M_SONAME);
628		goto out;
629	}
630#endif /* MAC_SOCKET_SUBSET */
631	socket_lock(so, 1);
632
633	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
634		if (want_free)
635			FREE(sa, M_SONAME);
636		socket_unlock(so, 1);
637		error = EALREADY;
638		goto out;
639	}
640	error = soconnectlock(so, sa, 0);
641	if (error)
642		goto bad;
643	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
644		if (want_free)
645			FREE(sa, M_SONAME);
646		socket_unlock(so, 1);
647		error = EINPROGRESS;
648		goto out;
649	}
650	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
651		if (so->so_proto->pr_getlock != NULL)
652			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
653		else
654			mutex_held = so->so_proto->pr_domain->dom_mtx;
655		error = msleep((caddr_t)&so->so_timeo, mutex_held,
656		    PSOCK | PCATCH, "connect", 0);
657		if ((so->so_state & SS_DRAINING)) {
658			error = ECONNABORTED;
659		}
660		if (error)
661			break;
662	}
663	if (error == 0) {
664		error = so->so_error;
665		so->so_error = 0;
666	}
667bad:
668	so->so_state &= ~SS_ISCONNECTING;
669	socket_unlock(so, 1);
670	if (want_free)
671		FREE(sa, M_SONAME);
672	if (error == ERESTART)
673		error = EINTR;
674out:
675	file_drop(fd);
676	return (error);
677}
678
679/*
680 * Returns:	0			Success
681 *	socreate:EAFNOSUPPORT
682 *	socreate:EPROTOTYPE
683 *	socreate:EPROTONOSUPPORT
684 *	socreate:ENOBUFS
685 *	socreate:ENOMEM
686 *	socreate:EISCONN
687 *	socreate:???			[other protocol families, IPSEC]
688 *	falloc:ENFILE
689 *	falloc:EMFILE
690 *	falloc:ENOMEM
691 *	copyout:EFAULT
692 *	soconnect2:EINVAL
693 *	soconnect2:EPROTOTYPE
694 *	soconnect2:???			[other protocol families[
695 */
696int
697socketpair(struct proc *p, struct socketpair_args *uap,
698    __unused int32_t *retval)
699{
700	struct fileproc *fp1, *fp2;
701	struct socket *so1, *so2;
702	int fd, error, sv[2];
703
704	AUDIT_ARG(socket, uap->domain, uap->type, uap->protocol);
705	error = socreate(uap->domain, &so1, uap->type, uap->protocol);
706	if (error)
707		return (error);
708	error = socreate(uap->domain, &so2, uap->type, uap->protocol);
709	if (error)
710		goto free1;
711
712	error = falloc(p, &fp1, &fd, vfs_context_current());
713	if (error) {
714		goto free2;
715	}
716	fp1->f_flag = FREAD|FWRITE;
717	fp1->f_type = DTYPE_SOCKET;
718	fp1->f_ops = &socketops;
719	fp1->f_data = (caddr_t)so1;
720	sv[0] = fd;
721
722	error = falloc(p, &fp2, &fd, vfs_context_current());
723	if (error) {
724		goto free3;
725	}
726	fp2->f_flag = FREAD|FWRITE;
727	fp2->f_type = DTYPE_SOCKET;
728	fp2->f_ops = &socketops;
729	fp2->f_data = (caddr_t)so2;
730	sv[1] = fd;
731
732	error = soconnect2(so1, so2);
733	if (error) {
734		goto free4;
735	}
736	if (uap->type == SOCK_DGRAM) {
737		/*
738		 * Datagram socket connection is asymmetric.
739		 */
740		error = soconnect2(so2, so1);
741		if (error) {
742			goto free4;
743		}
744	}
745
746	if ((error = copyout(sv, uap->rsv, 2 * sizeof (int))) != 0)
747		goto free4;
748
749	proc_fdlock(p);
750	procfdtbl_releasefd(p, sv[0], NULL);
751	procfdtbl_releasefd(p, sv[1], NULL);
752	fp_drop(p, sv[0], fp1, 1);
753	fp_drop(p, sv[1], fp2, 1);
754	proc_fdunlock(p);
755
756	return (0);
757free4:
758	fp_free(p, sv[1], fp2);
759free3:
760	fp_free(p, sv[0], fp1);
761free2:
762	(void) soclose(so2);
763free1:
764	(void) soclose(so1);
765	return (error);
766}
767
768/*
769 * Returns:	0			Success
770 *		EINVAL
771 *		ENOBUFS
772 *		EBADF
773 *		EPIPE
774 *		EACCES			Mandatory Access Control failure
775 *	file_socket:ENOTSOCK
776 *	file_socket:EBADF
777 *	getsockaddr:ENAMETOOLONG	Filename too long
778 *	getsockaddr:EINVAL		Invalid argument
779 *	getsockaddr:ENOMEM		Not enough space
780 *	getsockaddr:EFAULT		Bad address
781 *	<pru_sosend>:EACCES[TCP]
782 *	<pru_sosend>:EADDRINUSE[TCP]
783 *	<pru_sosend>:EADDRNOTAVAIL[TCP]
784 *	<pru_sosend>:EAFNOSUPPORT[TCP]
785 *	<pru_sosend>:EAGAIN[TCP]
786 *	<pru_sosend>:EBADF
787 *	<pru_sosend>:ECONNRESET[TCP]
788 *	<pru_sosend>:EFAULT
789 *	<pru_sosend>:EHOSTUNREACH[TCP]
790 *	<pru_sosend>:EINTR
791 *	<pru_sosend>:EINVAL
792 *	<pru_sosend>:EISCONN[AF_INET]
793 *	<pru_sosend>:EMSGSIZE[TCP]
794 *	<pru_sosend>:ENETDOWN[TCP]
795 *	<pru_sosend>:ENETUNREACH[TCP]
796 *	<pru_sosend>:ENOBUFS
797 *	<pru_sosend>:ENOMEM[TCP]
798 *	<pru_sosend>:ENOTCONN[AF_INET]
799 *	<pru_sosend>:EOPNOTSUPP
800 *	<pru_sosend>:EPERM[TCP]
801 *	<pru_sosend>:EPIPE
802 *	<pru_sosend>:EWOULDBLOCK
803 *	<pru_sosend>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
804 *	<pru_sosend>:???[AF_INET]	[whatever a filter author chooses]
805 *	<pru_sosend>:???		[value from so_error]
806 *	sockargs:???
807 */
808static int
809sendit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop,
810    int flags, int32_t *retval)
811{
812	struct mbuf *control = NULL;
813	struct sockaddr_storage ss;
814	struct sockaddr *to = NULL;
815	boolean_t want_free = TRUE;
816	int error;
817	struct socket *so;
818	user_ssize_t len;
819
820	KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_START, 0, 0, 0, 0, 0);
821
822	error = file_socket(s, &so);
823	if (error) {
824		KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error, 0, 0, 0, 0);
825		return (error);
826	}
827	if (so == NULL) {
828		error = EBADF;
829		goto out;
830	}
831	if (mp->msg_name != USER_ADDR_NULL) {
832		if (mp->msg_namelen > sizeof (ss)) {
833			error = getsockaddr(so, &to, mp->msg_name,
834			    mp->msg_namelen, TRUE);
835		} else {
836			error = getsockaddr_s(so, &ss, mp->msg_name,
837			    mp->msg_namelen, TRUE);
838			if (error == 0) {
839				to = (struct sockaddr *)&ss;
840				want_free = FALSE;
841			}
842		}
843		if (error != 0)
844			goto out;
845		AUDIT_ARG(sockaddr, vfs_context_cwd(vfs_context_current()), to);
846	}
847	if (mp->msg_control != USER_ADDR_NULL) {
848		if (mp->msg_controllen < sizeof (struct cmsghdr)) {
849			error = EINVAL;
850			goto bad;
851		}
852		error = sockargs(&control, mp->msg_control,
853		    mp->msg_controllen, MT_CONTROL);
854		if (error != 0)
855			goto bad;
856	}
857
858#if CONFIG_MACF_SOCKET_SUBSET
859	/*
860	 * We check the state without holding the socket lock;
861	 * if a race condition occurs, it would simply result
862	 * in an extra call to the MAC check function.
863	 */
864	if ( to != NULL &&
865	    !(so->so_state & SS_DEFUNCT) &&
866	    (error = mac_socket_check_send(kauth_cred_get(), so, to)) != 0)
867		goto bad;
868#endif /* MAC_SOCKET_SUBSET */
869
870	len = uio_resid(uiop);
871	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, uiop, 0, control,
872	    flags);
873	if (error != 0) {
874		if (uio_resid(uiop) != len && (error == ERESTART ||
875		    error == EINTR || error == EWOULDBLOCK))
876			error = 0;
877		/* Generation of SIGPIPE can be controlled per socket */
878		if (error == EPIPE && !(so->so_flags & SOF_NOSIGPIPE))
879			psignal(p, SIGPIPE);
880	}
881	if (error == 0)
882		*retval = (int)(len - uio_resid(uiop));
883bad:
884	if (to != NULL && want_free)
885		FREE(to, M_SONAME);
886out:
887	KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error, 0, 0, 0, 0);
888	file_drop(s);
889	return (error);
890}
891
892/*
893 * Returns:	0			Success
894 *		ENOMEM
895 *	sendit:???			[see sendit definition in this file]
896 *	write:???			[4056224: applicable for pipes]
897 */
898int
899sendto(struct proc *p, struct sendto_args *uap, int32_t *retval)
900{
901	__pthread_testcancel(1);
902	return(sendto_nocancel(p, (struct sendto_nocancel_args *)uap, retval));
903}
904
905int
906sendto_nocancel(struct proc *p, struct sendto_nocancel_args *uap, int32_t *retval)
907{
908	struct user_msghdr msg;
909	int error;
910	uio_t auio = NULL;
911
912	KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_START, 0, 0, 0, 0, 0);
913	AUDIT_ARG(fd, uap->s);
914
915	auio = uio_create(1, 0,
916	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
917	    UIO_WRITE);
918	if (auio == NULL) {
919		return (ENOMEM);
920	}
921	uio_addiov(auio, uap->buf, uap->len);
922
923	msg.msg_name = uap->to;
924	msg.msg_namelen = uap->tolen;
925	/* no need to set up msg_iov.  sendit uses uio_t we send it */
926	msg.msg_iov = 0;
927	msg.msg_iovlen = 0;
928	msg.msg_control = 0;
929	msg.msg_flags = 0;
930
931	error = sendit(p, uap->s, &msg, auio, uap->flags, retval);
932
933	if (auio != NULL) {
934		uio_free(auio);
935	}
936
937#if HACK_FOR_4056224
938	/*
939	 * Radar 4056224
940	 * Temporary workaround to let send() and recv() work over
941	 * a pipe for binary compatibility
942	 * This will be removed in the release following Tiger
943	 */
944	if (error == ENOTSOCK) {
945		struct fileproc *fp;
946
947		if (fp_lookup(p, uap->s, &fp, 0) == 0) {
948			(void) fp_drop(p, uap->s, fp, 0);
949
950			if (fp->f_type == DTYPE_PIPE) {
951				struct write_args write_uap;
952				user_ssize_t write_retval;
953
954				if (p->p_pid > last_pid_4056224) {
955					last_pid_4056224 = p->p_pid;
956
957					printf("%s[%d] uses send/recv "
958					    "on a pipe\n", p->p_comm, p->p_pid);
959				}
960
961				bzero(&write_uap, sizeof (struct write_args));
962				write_uap.fd = uap->s;
963				write_uap.cbuf = uap->buf;
964				write_uap.nbyte = uap->len;
965
966				error = write(p, &write_uap, &write_retval);
967				*retval = (int)write_retval;
968			}
969		}
970	}
971#endif /* HACK_FOR_4056224 */
972
973	KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_END, error, *retval, 0, 0, 0);
974
975	return (error);
976}
977
978/*
979 * Returns:	0			Success
980 *		ENOBUFS
981 *	copyin:EFAULT
982 *	sendit:???			[see sendit definition in this file]
983 */
984int
985sendmsg(struct proc *p, struct sendmsg_args *uap, int32_t *retval)
986{
987	__pthread_testcancel(1);
988	return(sendmsg_nocancel(p, (struct sendmsg_nocancel_args *)uap, retval));
989}
990
991int
992sendmsg_nocancel(struct proc *p, struct sendmsg_nocancel_args *uap, int32_t *retval)
993{
994	struct user32_msghdr msg32;
995	struct user64_msghdr msg64;
996	struct user_msghdr user_msg;
997	caddr_t msghdrp;
998	int	size_of_msghdr;
999	int error;
1000	uio_t auio = NULL;
1001	struct user_iovec *iovp;
1002
1003	KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_START, 0, 0, 0, 0, 0);
1004	AUDIT_ARG(fd, uap->s);
1005	if (IS_64BIT_PROCESS(p)) {
1006		msghdrp = (caddr_t)&msg64;
1007		size_of_msghdr = sizeof (msg64);
1008	} else {
1009		msghdrp = (caddr_t)&msg32;
1010		size_of_msghdr = sizeof (msg32);
1011	}
1012	error = copyin(uap->msg, msghdrp, size_of_msghdr);
1013	if (error) {
1014		KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, error, 0, 0, 0, 0);
1015		return (error);
1016	}
1017
1018	if (IS_64BIT_PROCESS(p)) {
1019		user_msg.msg_flags = msg64.msg_flags;
1020		user_msg.msg_controllen = msg64.msg_controllen;
1021		user_msg.msg_control = msg64.msg_control;
1022		user_msg.msg_iovlen = msg64.msg_iovlen;
1023		user_msg.msg_iov = msg64.msg_iov;
1024		user_msg.msg_namelen = msg64.msg_namelen;
1025		user_msg.msg_name = msg64.msg_name;
1026	} else {
1027		user_msg.msg_flags = msg32.msg_flags;
1028		user_msg.msg_controllen = msg32.msg_controllen;
1029		user_msg.msg_control = msg32.msg_control;
1030		user_msg.msg_iovlen = msg32.msg_iovlen;
1031		user_msg.msg_iov = msg32.msg_iov;
1032		user_msg.msg_namelen = msg32.msg_namelen;
1033		user_msg.msg_name = msg32.msg_name;
1034	}
1035
1036	if (user_msg.msg_iovlen <= 0 || user_msg.msg_iovlen > UIO_MAXIOV) {
1037		KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, EMSGSIZE,
1038		    0, 0, 0, 0);
1039		return (EMSGSIZE);
1040	}
1041
1042	/* allocate a uio large enough to hold the number of iovecs passed */
1043	auio = uio_create(user_msg.msg_iovlen, 0,
1044	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1045	    UIO_WRITE);
1046	if (auio == NULL) {
1047		error = ENOBUFS;
1048		goto done;
1049	}
1050
1051	if (user_msg.msg_iovlen) {
1052		/*
1053		 * get location of iovecs within the uio.
1054		 * then copyin the iovecs from user space.
1055		 */
1056		iovp = uio_iovsaddr(auio);
1057		if (iovp == NULL) {
1058			error = ENOBUFS;
1059			goto done;
1060		}
1061		error = copyin_user_iovec_array(user_msg.msg_iov,
1062			IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
1063			user_msg.msg_iovlen, iovp);
1064		if (error)
1065			goto done;
1066		user_msg.msg_iov = CAST_USER_ADDR_T(iovp);
1067
1068		/* finish setup of uio_t */
1069		uio_calculateresid(auio);
1070	} else {
1071		user_msg.msg_iov = 0;
1072	}
1073
1074	/* msg_flags is ignored for send */
1075	user_msg.msg_flags = 0;
1076
1077	error = sendit(p, uap->s, &user_msg, auio, uap->flags, retval);
1078done:
1079	if (auio != NULL) {
1080		uio_free(auio);
1081	}
1082	KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, error, 0, 0, 0, 0);
1083
1084	return (error);
1085}
1086
1087/*
1088 * Returns:	0			Success
1089 *		ENOTSOCK
1090 *		EINVAL
1091 *		EBADF
1092 *		EACCES			Mandatory Access Control failure
1093 *	copyout:EFAULT
1094 *	fp_lookup:EBADF
1095 *	<pru_soreceive>:ENOBUFS
1096 *	<pru_soreceive>:ENOTCONN
1097 *	<pru_soreceive>:EWOULDBLOCK
1098 *	<pru_soreceive>:EFAULT
1099 *	<pru_soreceive>:EINTR
1100 *	<pru_soreceive>:EBADF
1101 *	<pru_soreceive>:EINVAL
1102 *	<pru_soreceive>:EMSGSIZE
1103 *	<pru_soreceive>:???
1104 *
1105 * Notes:	Additional return values from calls through <pru_soreceive>
1106 *		depend on protocols other than TCP or AF_UNIX, which are
1107 *		documented above.
1108 */
1109static int
1110recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop,
1111    user_addr_t namelenp, int32_t *retval)
1112{
1113	int len, error;
1114	struct mbuf *m, *control = 0;
1115	user_addr_t ctlbuf;
1116	struct socket *so;
1117	struct sockaddr *fromsa = 0;
1118	struct fileproc *fp;
1119
1120	KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1121	proc_fdlock(p);
1122	if ((error = fp_lookup(p, s, &fp, 1))) {
1123		KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, error, 0, 0, 0, 0);
1124		proc_fdunlock(p);
1125		return (error);
1126	}
1127	if (fp->f_type != DTYPE_SOCKET) {
1128		fp_drop(p, s, fp, 1);
1129		proc_fdunlock(p);
1130		return (ENOTSOCK);
1131	}
1132
1133	so = (struct socket *)fp->f_data;
1134	if (so == NULL) {
1135		fp_drop(p, s, fp, 1);
1136		proc_fdunlock(p);
1137		return (EBADF);
1138	}
1139
1140	proc_fdunlock(p);
1141
1142#if CONFIG_MACF_SOCKET_SUBSET
1143	/*
1144	 * We check the state without holding the socket lock;
1145	 * if a race condition occurs, it would simply result
1146	 * in an extra call to the MAC check function.
1147	 */
1148	if (!(so->so_state & SS_DEFUNCT) &&
1149	    !(so->so_state & SS_ISCONNECTED) &&
1150	    (error = mac_socket_check_receive(kauth_cred_get(), so)) != 0)
1151		goto out1;
1152#endif /* MAC_SOCKET_SUBSET */
1153	if (uio_resid(uiop) < 0) {
1154		KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, EINVAL, 0, 0, 0, 0);
1155		error = EINVAL;
1156		goto out1;
1157	}
1158
1159	len = uio_resid(uiop);
1160	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, uiop,
1161	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
1162	    &mp->msg_flags);
1163	if (fromsa)
1164		AUDIT_ARG(sockaddr, vfs_context_cwd(vfs_context_current()),
1165		    fromsa);
1166	if (error) {
1167		if (uio_resid(uiop) != len && (error == ERESTART ||
1168		    error == EINTR || error == EWOULDBLOCK))
1169			error = 0;
1170	}
1171
1172	if (error)
1173		goto out;
1174
1175	*retval = len - uio_resid(uiop);
1176	if (mp->msg_name) {
1177		socklen_t sa_len = 0;
1178
1179		len = mp->msg_namelen;
1180		if (len <= 0 || fromsa == 0) {
1181			len = 0;
1182		} else {
1183#ifndef MIN
1184#define	MIN(a, b) ((a) > (b) ? (b) : (a))
1185#endif
1186			sa_len = fromsa->sa_len;
1187			len = MIN((unsigned int)len, sa_len);
1188			error = copyout(fromsa, mp->msg_name, (unsigned)len);
1189			if (error)
1190				goto out;
1191		}
1192		mp->msg_namelen = sa_len;
1193		/* return the actual, untruncated address length */
1194		if (namelenp &&
1195		    (error = copyout((caddr_t)&sa_len, namelenp,
1196		    sizeof (int)))) {
1197			goto out;
1198		}
1199	}
1200	if (mp->msg_control) {
1201		len = mp->msg_controllen;
1202		m = control;
1203		mp->msg_controllen = 0;
1204		ctlbuf = mp->msg_control;
1205
1206		while (m && len > 0) {
1207			unsigned int tocopy;
1208			struct cmsghdr *cp = mtod(m, struct cmsghdr *);
1209			int cp_size = CMSG_ALIGN(cp->cmsg_len);
1210			int buflen = m->m_len;
1211
1212			while (buflen > 0 && len > 0) {
1213
1214				/*
1215				 SCM_TIMESTAMP hack because  struct timeval has a
1216				 * different size for 32 bits and 64 bits processes
1217				 */
1218				if (cp->cmsg_level == SOL_SOCKET &&  cp->cmsg_type == SCM_TIMESTAMP) {
1219					unsigned char tmp_buffer[CMSG_SPACE(sizeof(struct user64_timeval))];
1220					struct cmsghdr *tmp_cp = (struct cmsghdr *)(void *)tmp_buffer;
1221					int tmp_space;
1222					struct timeval *tv = (struct timeval *)(void *)CMSG_DATA(cp);
1223
1224					tmp_cp->cmsg_level = SOL_SOCKET;
1225					tmp_cp->cmsg_type = SCM_TIMESTAMP;
1226
1227					if (proc_is64bit(p)) {
1228						struct user64_timeval *tv64 = (struct user64_timeval *)(void *)CMSG_DATA(tmp_cp);
1229
1230						tv64->tv_sec = tv->tv_sec;
1231						tv64->tv_usec = tv->tv_usec;
1232
1233						tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval));
1234						tmp_space = CMSG_SPACE(sizeof(struct user64_timeval));
1235					} else {
1236						struct user32_timeval *tv32 = (struct user32_timeval *)(void *)CMSG_DATA(tmp_cp);
1237
1238						tv32->tv_sec = tv->tv_sec;
1239						tv32->tv_usec = tv->tv_usec;
1240
1241						tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user32_timeval));
1242						tmp_space = CMSG_SPACE(sizeof(struct user32_timeval));
1243					}
1244					if (len >= tmp_space) {
1245						tocopy = tmp_space;
1246					} else {
1247						mp->msg_flags |= MSG_CTRUNC;
1248						tocopy = len;
1249					}
1250					error = copyout(tmp_buffer, ctlbuf, tocopy);
1251					if (error)
1252						goto out;
1253
1254				} else {
1255
1256					if (cp_size > buflen) {
1257						panic("cp_size > buflen, something wrong with alignment!");
1258					}
1259
1260					if (len >= cp_size) {
1261						tocopy = cp_size;
1262					} else {
1263						mp->msg_flags |= MSG_CTRUNC;
1264						tocopy = len;
1265					}
1266
1267					error = copyout((caddr_t) cp, ctlbuf,
1268									tocopy);
1269					if (error)
1270						goto out;
1271				}
1272
1273
1274				ctlbuf += tocopy;
1275				len -= tocopy;
1276
1277				buflen -= cp_size;
1278				cp = (struct cmsghdr *)(void *)((unsigned char *) cp + cp_size);
1279				cp_size = CMSG_ALIGN(cp->cmsg_len);
1280			}
1281
1282			m = m->m_next;
1283		}
1284		mp->msg_controllen = ctlbuf - mp->msg_control;
1285	}
1286out:
1287	if (fromsa)
1288		FREE(fromsa, M_SONAME);
1289	if (control)
1290		m_freem(control);
1291	KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, error, 0, 0, 0, 0);
1292out1:
1293	fp_drop(p, s, fp, 0);
1294	return (error);
1295}
1296
1297/*
1298 * Returns:	0			Success
1299 *		ENOMEM
1300 *	copyin:EFAULT
1301 *	recvit:???
1302 *	read:???			[4056224: applicable for pipes]
1303 *
1304 * Notes:	The read entry point is only called as part of support for
1305 *		binary backward compatability; new code should use read
1306 *		instead of recv or recvfrom when attempting to read data
1307 *		from pipes.
1308 *
1309 *		For full documentation of the return codes from recvit, see
1310 *		the block header for the recvit function.
1311 */
1312int
1313recvfrom(struct proc *p, struct recvfrom_args *uap, int32_t *retval)
1314{
1315	__pthread_testcancel(1);
1316	return(recvfrom_nocancel(p, (struct recvfrom_nocancel_args *)uap, retval));
1317}
1318
1319int
1320recvfrom_nocancel(struct proc *p, struct recvfrom_nocancel_args *uap, int32_t *retval)
1321{
1322	struct user_msghdr msg;
1323	int error;
1324	uio_t auio = NULL;
1325
1326	KERNEL_DEBUG(DBG_FNC_RECVFROM | DBG_FUNC_START, 0, 0, 0, 0, 0);
1327	AUDIT_ARG(fd, uap->s);
1328
1329	if (uap->fromlenaddr) {
1330		error = copyin(uap->fromlenaddr,
1331		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
1332		if (error)
1333			return (error);
1334	} else {
1335		msg.msg_namelen = 0;
1336	}
1337	msg.msg_name = uap->from;
1338	auio = uio_create(1, 0,
1339	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1340	    UIO_READ);
1341	if (auio == NULL) {
1342		return (ENOMEM);
1343	}
1344
1345	uio_addiov(auio, uap->buf, uap->len);
1346	/* no need to set up msg_iov.  recvit uses uio_t we send it */
1347	msg.msg_iov = 0;
1348	msg.msg_iovlen = 0;
1349	msg.msg_control = 0;
1350	msg.msg_controllen = 0;
1351	msg.msg_flags = uap->flags;
1352	error = recvit(p, uap->s, &msg, auio, uap->fromlenaddr, retval);
1353	if (auio != NULL) {
1354		uio_free(auio);
1355	}
1356
1357#if HACK_FOR_4056224
1358	/*
1359	 * Radar 4056224
1360	 * Temporary workaround to let send() and recv() work over
1361	 * a pipe for binary compatibility
1362	 * This will be removed in the release following Tiger
1363	 */
1364	if (error == ENOTSOCK && proc_is64bit(p) == 0) {
1365		struct fileproc *fp;
1366
1367		if (fp_lookup(p, uap->s, &fp, 0) == 0) {
1368			(void) fp_drop(p, uap->s, fp, 0);
1369
1370			if (fp->f_type == DTYPE_PIPE) {
1371				struct read_args read_uap;
1372				user_ssize_t read_retval;
1373
1374				if (p->p_pid > last_pid_4056224) {
1375					last_pid_4056224 = p->p_pid;
1376
1377					printf("%s[%d] uses send/recv on "
1378					    "a pipe\n", p->p_comm, p->p_pid);
1379				}
1380
1381				bzero(&read_uap, sizeof (struct read_args));
1382				read_uap.fd = uap->s;
1383				read_uap.cbuf = uap->buf;
1384				read_uap.nbyte = uap->len;
1385
1386				error = read(p, &read_uap, &read_retval);
1387				*retval = (int)read_retval;
1388			}
1389		}
1390	}
1391#endif /* HACK_FOR_4056224 */
1392
1393	KERNEL_DEBUG(DBG_FNC_RECVFROM | DBG_FUNC_END, error, 0, 0, 0, 0);
1394
1395	return (error);
1396}
1397
1398/*
1399 * Returns:	0			Success
1400 *		EMSGSIZE
1401 *		ENOMEM
1402 *	copyin:EFAULT
1403 *	copyout:EFAULT
1404 *	recvit:???
1405 *
1406 * Notes:	For full documentation of the return codes from recvit, see
1407 *		the block header for the recvit function.
1408 */
1409int
1410recvmsg(struct proc *p, struct recvmsg_args *uap, int32_t *retval)
1411{
1412	__pthread_testcancel(1);
1413	return(recvmsg_nocancel(p, (struct recvmsg_nocancel_args *)uap, retval));
1414}
1415
1416int
1417recvmsg_nocancel(struct proc *p, struct recvmsg_nocancel_args *uap, int32_t *retval)
1418{
1419	struct user32_msghdr msg32;
1420	struct user64_msghdr msg64;
1421	struct user_msghdr user_msg;
1422	caddr_t msghdrp;
1423	int	size_of_msghdr;
1424	user_addr_t uiov;
1425	int error;
1426	uio_t auio = NULL;
1427	struct user_iovec *iovp;
1428
1429	KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_START, 0, 0, 0, 0, 0);
1430	AUDIT_ARG(fd, uap->s);
1431	if (IS_64BIT_PROCESS(p)) {
1432		msghdrp = (caddr_t)&msg64;
1433		size_of_msghdr = sizeof (msg64);
1434	} else {
1435		msghdrp = (caddr_t)&msg32;
1436		size_of_msghdr = sizeof (msg32);
1437	}
1438	error = copyin(uap->msg, msghdrp, size_of_msghdr);
1439	if (error) {
1440		KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, error, 0, 0, 0, 0);
1441		return (error);
1442	}
1443
1444	/* only need to copy if user process is not 64-bit */
1445	if (IS_64BIT_PROCESS(p)) {
1446		user_msg.msg_flags = msg64.msg_flags;
1447		user_msg.msg_controllen = msg64.msg_controllen;
1448		user_msg.msg_control = msg64.msg_control;
1449		user_msg.msg_iovlen = msg64.msg_iovlen;
1450		user_msg.msg_iov = msg64.msg_iov;
1451		user_msg.msg_namelen = msg64.msg_namelen;
1452		user_msg.msg_name = msg64.msg_name;
1453	} else {
1454		user_msg.msg_flags = msg32.msg_flags;
1455		user_msg.msg_controllen = msg32.msg_controllen;
1456		user_msg.msg_control = msg32.msg_control;
1457		user_msg.msg_iovlen = msg32.msg_iovlen;
1458		user_msg.msg_iov = msg32.msg_iov;
1459		user_msg.msg_namelen = msg32.msg_namelen;
1460		user_msg.msg_name = msg32.msg_name;
1461	}
1462
1463	if (user_msg.msg_iovlen <= 0 || user_msg.msg_iovlen > UIO_MAXIOV) {
1464		KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, EMSGSIZE,
1465		    0, 0, 0, 0);
1466		return (EMSGSIZE);
1467	}
1468
1469	user_msg.msg_flags = uap->flags;
1470
1471	/* allocate a uio large enough to hold the number of iovecs passed */
1472	auio = uio_create(user_msg.msg_iovlen, 0,
1473	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1474	    UIO_READ);
1475	if (auio == NULL) {
1476		error = ENOMEM;
1477		goto done;
1478	}
1479
1480	/*
1481	 * get location of iovecs within the uio.  then copyin the iovecs from
1482	 * user space.
1483	 */
1484	iovp = uio_iovsaddr(auio);
1485	if (iovp == NULL) {
1486		error = ENOMEM;
1487		goto done;
1488	}
1489	uiov = user_msg.msg_iov;
1490	user_msg.msg_iov = CAST_USER_ADDR_T(iovp);
1491	error = copyin_user_iovec_array(uiov,
1492		IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
1493		user_msg.msg_iovlen, iovp);
1494	if (error)
1495		goto done;
1496
1497	/* finish setup of uio_t */
1498	uio_calculateresid(auio);
1499
1500	error = recvit(p, uap->s, &user_msg, auio, 0, retval);
1501	if (!error) {
1502		user_msg.msg_iov = uiov;
1503		if (IS_64BIT_PROCESS(p)) {
1504			msg64.msg_flags = user_msg.msg_flags;
1505			msg64.msg_controllen = user_msg.msg_controllen;
1506			msg64.msg_control = user_msg.msg_control;
1507			msg64.msg_iovlen = user_msg.msg_iovlen;
1508			msg64.msg_iov = user_msg.msg_iov;
1509			msg64.msg_namelen = user_msg.msg_namelen;
1510			msg64.msg_name = user_msg.msg_name;
1511		} else {
1512			msg32.msg_flags = user_msg.msg_flags;
1513			msg32.msg_controllen = user_msg.msg_controllen;
1514			msg32.msg_control = user_msg.msg_control;
1515			msg32.msg_iovlen = user_msg.msg_iovlen;
1516			msg32.msg_iov = user_msg.msg_iov;
1517			msg32.msg_namelen = user_msg.msg_namelen;
1518			msg32.msg_name = user_msg.msg_name;
1519		}
1520		error = copyout(msghdrp, uap->msg, size_of_msghdr);
1521	}
1522done:
1523	if (auio != NULL) {
1524		uio_free(auio);
1525	}
1526	KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, error, 0, 0, 0, 0);
1527	return (error);
1528}
1529
1530/*
1531 * Returns:	0			Success
1532 *		EBADF
1533 *	file_socket:ENOTSOCK
1534 *	file_socket:EBADF
1535 *	soshutdown:EINVAL
1536 *	soshutdown:ENOTCONN
1537 *	soshutdown:EADDRNOTAVAIL[TCP]
1538 *	soshutdown:ENOBUFS[TCP]
1539 *	soshutdown:EMSGSIZE[TCP]
1540 *	soshutdown:EHOSTUNREACH[TCP]
1541 *	soshutdown:ENETUNREACH[TCP]
1542 *	soshutdown:ENETDOWN[TCP]
1543 *	soshutdown:ENOMEM[TCP]
1544 *	soshutdown:EACCES[TCP]
1545 *	soshutdown:EMSGSIZE[TCP]
1546 *	soshutdown:ENOBUFS[TCP]
1547 *	soshutdown:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
1548 *	soshutdown:???			[other protocol families]
1549 */
1550/* ARGSUSED */
1551int
1552shutdown(__unused struct proc *p, struct shutdown_args *uap,
1553    __unused int32_t *retval)
1554{
1555	struct socket *so;
1556	int error;
1557
1558	AUDIT_ARG(fd, uap->s);
1559	error = file_socket(uap->s, &so);
1560	if (error)
1561		return (error);
1562	if (so == NULL) {
1563		error = EBADF;
1564		goto out;
1565	}
1566	error =  soshutdown((struct socket *)so, uap->how);
1567out:
1568	file_drop(uap->s);
1569	return (error);
1570}
1571
1572/*
1573 * Returns:	0			Success
1574 *		EFAULT
1575 *		EINVAL
1576 *		EACCES			Mandatory Access Control failure
1577 *	file_socket:ENOTSOCK
1578 *	file_socket:EBADF
1579 *	sosetopt:EINVAL
1580 *	sosetopt:ENOPROTOOPT
1581 *	sosetopt:ENOBUFS
1582 *	sosetopt:EDOM
1583 *	sosetopt:EFAULT
1584 *	sosetopt:EOPNOTSUPP[AF_UNIX]
1585 *	sosetopt:???
1586 */
1587/* ARGSUSED */
1588int
1589setsockopt(struct proc *p, struct setsockopt_args *uap,
1590    __unused int32_t *retval)
1591{
1592	struct socket *so;
1593	struct sockopt sopt;
1594	int error;
1595
1596	AUDIT_ARG(fd, uap->s);
1597	if (uap->val == 0 && uap->valsize != 0)
1598		return (EFAULT);
1599	/* No bounds checking on size (it's unsigned) */
1600
1601	error = file_socket(uap->s, &so);
1602	if (error)
1603		return (error);
1604
1605	sopt.sopt_dir = SOPT_SET;
1606	sopt.sopt_level = uap->level;
1607	sopt.sopt_name = uap->name;
1608	sopt.sopt_val = uap->val;
1609	sopt.sopt_valsize = uap->valsize;
1610	sopt.sopt_p = p;
1611
1612	if (so == NULL) {
1613		error = EINVAL;
1614		goto out;
1615	}
1616#if CONFIG_MACF_SOCKET_SUBSET
1617	if ((error = mac_socket_check_setsockopt(kauth_cred_get(), so,
1618	    &sopt)) != 0)
1619		goto out;
1620#endif /* MAC_SOCKET_SUBSET */
1621	error = sosetopt(so, &sopt);
1622out:
1623	file_drop(uap->s);
1624	return (error);
1625}
1626
1627
1628
1629/*
1630 * Returns:	0			Success
1631 *		EINVAL
1632 *		EBADF
1633 *		EACCES			Mandatory Access Control failure
1634 *	copyin:EFAULT
1635 *	copyout:EFAULT
1636 *	file_socket:ENOTSOCK
1637 *	file_socket:EBADF
1638 *	sogetopt:???
1639 */
1640int
1641getsockopt(struct proc *p, struct getsockopt_args  *uap,
1642    __unused int32_t *retval)
1643{
1644	int		error;
1645	socklen_t	valsize;
1646	struct sockopt	sopt;
1647	struct socket *so;
1648
1649	error = file_socket(uap->s, &so);
1650	if (error)
1651		return (error);
1652	if (uap->val) {
1653		error = copyin(uap->avalsize, (caddr_t)&valsize,
1654		    sizeof (valsize));
1655		if (error)
1656			goto out;
1657		/* No bounds checking on size (it's unsigned) */
1658	} else {
1659		valsize = 0;
1660	}
1661	sopt.sopt_dir = SOPT_GET;
1662	sopt.sopt_level = uap->level;
1663	sopt.sopt_name = uap->name;
1664	sopt.sopt_val = uap->val;
1665	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1666	sopt.sopt_p = p;
1667
1668	if (so == NULL) {
1669		error = EBADF;
1670		goto out;
1671	}
1672#if CONFIG_MACF_SOCKET_SUBSET
1673	if ((error = mac_socket_check_getsockopt(kauth_cred_get(), so,
1674	    &sopt)) != 0)
1675		goto out;
1676#endif /* MAC_SOCKET_SUBSET */
1677	error = sogetopt((struct socket *)so, &sopt);
1678	if (error == 0) {
1679		valsize = sopt.sopt_valsize;
1680		error = copyout((caddr_t)&valsize, uap->avalsize,
1681		    sizeof (valsize));
1682	}
1683out:
1684	file_drop(uap->s);
1685	return (error);
1686}
1687
1688
1689/*
1690 * Get socket name.
1691 *
1692 * Returns:	0			Success
1693 *		EBADF
1694 *	file_socket:ENOTSOCK
1695 *	file_socket:EBADF
1696 *	copyin:EFAULT
1697 *	copyout:EFAULT
1698 *	<pru_sockaddr>:ENOBUFS[TCP]
1699 *	<pru_sockaddr>:ECONNRESET[TCP]
1700 *	<pru_sockaddr>:EINVAL[AF_UNIX]
1701 *	<sf_getsockname>:???
1702 */
1703/* ARGSUSED */
1704int
1705getsockname(__unused struct proc *p, struct getsockname_args *uap,
1706    __unused int32_t *retval)
1707{
1708	struct socket *so;
1709	struct sockaddr *sa;
1710	socklen_t len;
1711	socklen_t sa_len;
1712	int error;
1713
1714	error = file_socket(uap->fdes, &so);
1715	if (error)
1716		return (error);
1717	error = copyin(uap->alen, (caddr_t)&len, sizeof (socklen_t));
1718	if (error)
1719		goto out;
1720	if (so == NULL) {
1721		error = EBADF;
1722		goto out;
1723	}
1724	sa = 0;
1725	socket_lock(so, 1);
1726	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1727	if (error == 0) {
1728		error = sflt_getsockname(so, &sa);
1729		if (error == EJUSTRETURN)
1730			error = 0;
1731	}
1732	socket_unlock(so, 1);
1733	if (error)
1734		goto bad;
1735	if (sa == 0) {
1736		len = 0;
1737		goto gotnothing;
1738	}
1739
1740	sa_len = sa->sa_len;
1741	len = MIN(len, sa_len);
1742	error = copyout((caddr_t)sa, uap->asa, len);
1743	if (error)
1744		goto bad;
1745	/* return the actual, untruncated address length */
1746	len = sa_len;
1747gotnothing:
1748		error = copyout((caddr_t)&len, uap->alen, sizeof (socklen_t));
1749bad:
1750	if (sa)
1751		FREE(sa, M_SONAME);
1752out:
1753	file_drop(uap->fdes);
1754	return (error);
1755}
1756
1757/*
1758 * Get name of peer for connected socket.
1759 *
1760 * Returns:	0			Success
1761 *		EBADF
1762 *		EINVAL
1763 *		ENOTCONN
1764 *	file_socket:ENOTSOCK
1765 *	file_socket:EBADF
1766 *	copyin:EFAULT
1767 *	copyout:EFAULT
1768 *	<pru_peeraddr>:???
1769 *	<sf_getpeername>:???
1770 */
1771/* ARGSUSED */
1772int
1773getpeername(__unused struct proc *p, struct getpeername_args *uap,
1774    __unused int32_t *retval)
1775{
1776	struct socket *so;
1777	struct sockaddr *sa;
1778	socklen_t len;
1779	socklen_t sa_len;
1780	int error;
1781
1782	error = file_socket(uap->fdes, &so);
1783	if (error)
1784		return (error);
1785	if (so == NULL) {
1786		error = EBADF;
1787		goto out;
1788	}
1789
1790	socket_lock(so, 1);
1791
1792	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
1793	    (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
1794		/* the socket has been shutdown, no more getpeername's */
1795		socket_unlock(so, 1);
1796		error = EINVAL;
1797		goto out;
1798	}
1799
1800	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1801		socket_unlock(so, 1);
1802		error = ENOTCONN;
1803		goto out;
1804	}
1805	error = copyin(uap->alen, (caddr_t)&len, sizeof (socklen_t));
1806	if (error) {
1807		socket_unlock(so, 1);
1808		goto out;
1809	}
1810	sa = 0;
1811	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1812	if (error == 0) {
1813		error = sflt_getpeername(so, &sa);
1814		if (error == EJUSTRETURN)
1815			error = 0;
1816	}
1817	socket_unlock(so, 1);
1818	if (error)
1819		goto bad;
1820	if (sa == 0) {
1821		len = 0;
1822		goto gotnothing;
1823	}
1824	sa_len = sa->sa_len;
1825	len = MIN(len, sa_len);
1826	error = copyout(sa, uap->asa, len);
1827	if (error)
1828		goto bad;
1829	/* return the actual, untruncated address length */
1830	len = sa_len;
1831gotnothing:
1832	error = copyout((caddr_t)&len, uap->alen, sizeof (socklen_t));
1833bad:
1834	if (sa) FREE(sa, M_SONAME);
1835out:
1836	file_drop(uap->fdes);
1837	return (error);
1838}
1839
1840int
1841sockargs(struct mbuf **mp, user_addr_t data, int buflen, int type)
1842{
1843	struct sockaddr *sa;
1844	struct mbuf *m;
1845	int error;
1846
1847	size_t alloc_buflen = (size_t)buflen;
1848
1849	if(alloc_buflen > INT_MAX/2)
1850		return (EINVAL);
1851#ifdef __LP64__
1852	/* The fd's in the buffer must expand to be pointers, thus we need twice as much space */
1853	if(type == MT_CONTROL)
1854		alloc_buflen = ((buflen - sizeof(struct cmsghdr))*2) + sizeof(struct cmsghdr);
1855#endif
1856	if (alloc_buflen > MLEN) {
1857		if (type == MT_SONAME && alloc_buflen <= 112)
1858			alloc_buflen = MLEN;		/* unix domain compat. hack */
1859		else if (alloc_buflen > MCLBYTES)
1860			return (EINVAL);
1861	}
1862	m = m_get(M_WAIT, type);
1863	if (m == NULL)
1864		return (ENOBUFS);
1865	if (alloc_buflen > MLEN) {
1866		MCLGET(m, M_WAIT);
1867		if ((m->m_flags & M_EXT) == 0) {
1868			m_free(m);
1869			return (ENOBUFS);
1870		}
1871	}
1872	/* K64: We still copyin the original buflen because it gets expanded later
1873	 * and we lie about the size of the mbuf because it only affects unp_* functions
1874	 */
1875	m->m_len = buflen;
1876	error = copyin(data, mtod(m, caddr_t), (u_int)buflen);
1877	if (error) {
1878		(void) m_free(m);
1879	} else {
1880		*mp = m;
1881		if (type == MT_SONAME) {
1882			sa = mtod(m, struct sockaddr *);
1883			sa->sa_len = buflen;
1884		}
1885	}
1886	return (error);
1887}
1888
1889/*
1890 * Given a user_addr_t of length len, allocate and fill out a *sa.
1891 *
1892 * Returns:	0			Success
1893 *		ENAMETOOLONG		Filename too long
1894 *		EINVAL			Invalid argument
1895 *		ENOMEM			Not enough space
1896 *		copyin:EFAULT		Bad address
1897 */
1898static int
1899getsockaddr(struct socket *so, struct sockaddr **namp, user_addr_t uaddr,
1900    size_t len, boolean_t translate_unspec)
1901{
1902	struct sockaddr *sa;
1903	int error;
1904
1905	if (len > SOCK_MAXADDRLEN)
1906		return (ENAMETOOLONG);
1907
1908	if (len < offsetof(struct sockaddr, sa_data[0]))
1909		return (EINVAL);
1910
1911	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK | M_ZERO);
1912	if (sa == NULL) {
1913		return (ENOMEM);
1914	}
1915	error = copyin(uaddr, (caddr_t)sa, len);
1916	if (error) {
1917		FREE(sa, M_SONAME);
1918	} else {
1919		/*
1920		 * Force sa_family to AF_INET on AF_INET sockets to handle
1921		 * legacy applications that use AF_UNSPEC (0).  On all other
1922		 * sockets we leave it unchanged and let the lower layer
1923		 * handle it.
1924		 */
1925		if (translate_unspec && sa->sa_family == AF_UNSPEC &&
1926		    INP_CHECK_SOCKAF(so, AF_INET) &&
1927		    len == sizeof (struct sockaddr_in))
1928			sa->sa_family = AF_INET;
1929
1930		sa->sa_len = len;
1931		*namp = sa;
1932	}
1933	return (error);
1934}
1935
1936static int
1937getsockaddr_s(struct socket *so, struct sockaddr_storage *ss,
1938    user_addr_t uaddr, size_t len, boolean_t translate_unspec)
1939{
1940	int error;
1941
1942	if (ss == NULL || uaddr == USER_ADDR_NULL ||
1943	    len < offsetof(struct sockaddr, sa_data[0]))
1944		return (EINVAL);
1945
1946	/*
1947	 * sockaddr_storage size is less than SOCK_MAXADDRLEN,
1948	 * so the check here is inclusive.
1949	 */
1950	if (len > sizeof (*ss))
1951		return (ENAMETOOLONG);
1952
1953	bzero(ss, sizeof (*ss));
1954	error = copyin(uaddr, (caddr_t)ss, len);
1955	if (error == 0) {
1956		/*
1957		 * Force sa_family to AF_INET on AF_INET sockets to handle
1958		 * legacy applications that use AF_UNSPEC (0).  On all other
1959		 * sockets we leave it unchanged and let the lower layer
1960		 * handle it.
1961		 */
1962		if (translate_unspec && ss->ss_family == AF_UNSPEC &&
1963		    INP_CHECK_SOCKAF(so, AF_INET) &&
1964		    len == sizeof (struct sockaddr_in))
1965			ss->ss_family = AF_INET;
1966
1967		ss->ss_len = len;
1968	}
1969	return (error);
1970}
1971
1972#if SENDFILE
1973
/* parent sysctl node for the sendfile tunable declared below */
SYSCTL_DECL(_kern_ipc);

/*
 * sendfileuiobufs: read-write integer tunable, initialised to SFUIOBUFS.
 * It is the page count from which SENDFILE_MAX_BYTES is derived.
 */
#define	SFUIOBUFS 64
static int sendfileuiobufs = SFUIOBUFS;
SYSCTL_INT(_kern_ipc, OID_AUTO, sendfileuiobufs, CTLFLAG_RW | CTLFLAG_LOCKED, &sendfileuiobufs,
    0, "");

/*
 * Macros to compute the number of mbufs needed depending on cluster size.
 * Each rounds n up to a whole number of units of (1 << shift) bytes, with
 * shift being PGSHIFT for the 4K variant and PGSHIFT + 2 for the 16K one
 * (the names presumably assume PGSHIFT == 12 -- confirm).
 * n must be non-zero: (unsigned int)(n) - 1 wraps around for n == 0.
 */
#define	HOWMANY_16K(n)	((((unsigned int)(n) - 1) >> (PGSHIFT + 2)) + 1)
#define	HOWMANY_4K(n)	((((unsigned int)(n) - 1) >> PGSHIFT) + 1)

/*
 * Upper send limit in bytes (sendfileuiobufs * PAGESIZE).
 * Re-evaluated at every use, so a change of the tunable takes effect on
 * the next expansion.
 */
#define SENDFILE_MAX_BYTES	(sendfileuiobufs << PGSHIFT)

/* Upper send limit in the number of mbuf clusters */
#define	SENDFILE_MAX_16K	HOWMANY_16K(SENDFILE_MAX_BYTES)
#define	SENDFILE_MAX_4K		HOWMANY_4K(SENDFILE_MAX_BYTES)
1992size_t mbuf_pkt_maxlen(mbuf_t m);
1993
1994__private_extern__ size_t
1995mbuf_pkt_maxlen(mbuf_t m)
1996{
1997	size_t maxlen = 0;
1998
1999	while (m) {
2000		maxlen += mbuf_maxlen(m);
2001		m = mbuf_next(m);
2002	}
2003	return (maxlen);
2004}
2005
2006static void
2007alloc_sendpkt(int how, size_t pktlen, unsigned int *maxchunks,
2008    struct mbuf **m, boolean_t jumbocl)
2009{
2010	unsigned int needed;
2011
2012	if (pktlen == 0)
2013		panic("%s: pktlen (%ld) must be non-zero\n", __func__, pktlen);
2014
2015	/*
2016	 * Try to allocate for the whole thing.  Since we want full control
2017	 * over the buffer size and be able to accept partial result, we can't
2018	 * use mbuf_allocpacket().  The logic below is similar to sosend().
2019	 */
2020	*m = NULL;
2021	if (pktlen > MBIGCLBYTES && jumbocl) {
2022		needed = MIN(SENDFILE_MAX_16K, HOWMANY_16K(pktlen));
2023		*m = m_getpackets_internal(&needed, 1, how, 0, M16KCLBYTES);
2024	}
2025	if (*m == NULL) {
2026		needed = MIN(SENDFILE_MAX_4K, HOWMANY_4K(pktlen));
2027		*m = m_getpackets_internal(&needed, 1, how, 0, MBIGCLBYTES);
2028	}
2029
2030	/*
2031	 * Our previous attempt(s) at allocation had failed; the system
2032	 * may be short on mbufs, and we want to block until they are
2033	 * available.  This time, ask just for 1 mbuf and don't return
2034	 * until we get it.
2035	 */
2036	if (*m == NULL) {
2037		needed = 1;
2038		*m = m_getpackets_internal(&needed, 1, M_WAIT, 1, MBIGCLBYTES);
2039	}
2040	if (*m == NULL)
2041		panic("%s: blocking allocation returned NULL\n", __func__);
2042
2043	*maxchunks = needed;
2044}
2045
2046/*
2047 * sendfile(2).
2048 * int sendfile(int fd, int s, off_t offset, off_t *nbytes,
2049 *	 struct sf_hdtr *hdtr, int flags)
2050 *
2051 * Send a file specified by 'fd' and starting at 'offset' to a socket
2052 * specified by 's'. Send only '*nbytes' of the file or until EOF if
2053 * *nbytes == 0. Optionally add a header and/or trailer to the socket
2054 * output. If specified, write the total number of bytes sent into *nbytes.
2055 */
2056int
2057sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval)
2058{
2059	struct fileproc *fp;
2060	struct vnode *vp;
2061	struct socket *so;
2062	struct writev_nocancel_args nuap;
2063	user_ssize_t writev_retval;
2064	struct user_sf_hdtr user_hdtr;
2065	struct user32_sf_hdtr user32_hdtr;
2066	struct user64_sf_hdtr user64_hdtr;
2067	off_t off, xfsize;
2068	off_t nbytes = 0, sbytes = 0;
2069	int error = 0;
2070	size_t sizeof_hdtr;
2071	off_t file_size;
2072	struct vfs_context context = *vfs_context_current();
2073#define ENXIO_10146739_DBG(err_str) {	\
2074	if (error == ENXIO) {		\
2075		printf(err_str,		\
2076		__func__,		\
2077		"File a radar related to rdar://10146739 \n");	\
2078	}				\
2079}
2080	KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE | DBG_FUNC_START), uap->s,
2081	    0, 0, 0, 0);
2082
2083	AUDIT_ARG(fd, uap->fd);
2084	AUDIT_ARG(value32, uap->s);
2085
2086	/*
2087	 * Do argument checking. Must be a regular file in, stream
2088	 * type and connected socket out, positive offset.
2089	 */
2090	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
2091		ENXIO_10146739_DBG("%s: fp_getfvp error. %s");
2092		goto done;
2093	}
2094	if ((fp->f_flag & FREAD) == 0) {
2095		error = EBADF;
2096		goto done1;
2097	}
2098	if (vnode_isreg(vp) == 0) {
2099		error = ENOTSUP;
2100		goto done1;
2101	}
2102	error = file_socket(uap->s, &so);
2103	if (error) {
2104		ENXIO_10146739_DBG("%s: file_socket error. %s");
2105		goto done1;
2106	}
2107	if (so == NULL) {
2108		error = EBADF;
2109		goto done2;
2110	}
2111	if (so->so_type != SOCK_STREAM) {
2112		error = EINVAL;
2113		goto done2;
2114	}
2115	if ((so->so_state & SS_ISCONNECTED) == 0) {
2116		error = ENOTCONN;
2117		goto done2;
2118	}
2119	if (uap->offset < 0) {
2120		error = EINVAL;
2121		goto done2;
2122	}
2123	if (uap->nbytes == USER_ADDR_NULL) {
2124		error = EINVAL;
2125		goto done2;
2126	}
2127	if (uap->flags != 0) {
2128		error = EINVAL;
2129		goto done2;
2130	}
2131
2132	context.vc_ucred = fp->f_fglob->fg_cred;
2133
2134#if CONFIG_MACF_SOCKET_SUBSET
2135	/* JMM - fetch connected sockaddr? */
2136	error = mac_socket_check_send(context.vc_ucred, so, NULL);
2137	if (error)
2138		goto done2;
2139#endif
2140
2141	/*
2142	 * Get number of bytes to send
2143	 * Should it applies to size of header and trailer?
2144	 * JMM - error handling?
2145	 */
2146	copyin(uap->nbytes, &nbytes, sizeof (off_t));
2147
2148	/*
2149	 * If specified, get the pointer to the sf_hdtr struct for
2150	 * any headers/trailers.
2151	 */
2152	if (uap->hdtr != USER_ADDR_NULL) {
2153		caddr_t hdtrp;
2154
2155		bzero(&user_hdtr, sizeof (user_hdtr));
2156		if (IS_64BIT_PROCESS(p)) {
2157			hdtrp = (caddr_t)&user64_hdtr;
2158			sizeof_hdtr = sizeof (user64_hdtr);
2159		} else {
2160			hdtrp = (caddr_t)&user32_hdtr;
2161			sizeof_hdtr = sizeof (user32_hdtr);
2162		}
2163		error = copyin(uap->hdtr, hdtrp, sizeof_hdtr);
2164		if (error)
2165			goto done2;
2166		if (IS_64BIT_PROCESS(p)) {
2167			user_hdtr.headers = user64_hdtr.headers;
2168			user_hdtr.hdr_cnt = user64_hdtr.hdr_cnt;
2169			user_hdtr.trailers = user64_hdtr.trailers;
2170			user_hdtr.trl_cnt = user64_hdtr.trl_cnt;
2171		} else {
2172			user_hdtr.headers = user32_hdtr.headers;
2173			user_hdtr.hdr_cnt = user32_hdtr.hdr_cnt;
2174			user_hdtr.trailers = user32_hdtr.trailers;
2175			user_hdtr.trl_cnt = user32_hdtr.trl_cnt;
2176		}
2177
2178		/*
2179		 * Send any headers. Wimp out and use writev(2).
2180		 */
2181		if (user_hdtr.headers != USER_ADDR_NULL) {
2182			bzero(&nuap, sizeof (struct writev_args));
2183			nuap.fd = uap->s;
2184			nuap.iovp = user_hdtr.headers;
2185			nuap.iovcnt = user_hdtr.hdr_cnt;
2186			error = writev_nocancel(p, &nuap, &writev_retval);
2187			if (error) {
2188				ENXIO_10146739_DBG("%s: writev_nocancel error. %s");
2189				goto done2;
2190			}
2191			sbytes += writev_retval;
2192		}
2193	}
2194
2195	/*
2196	 * Get the file size for 2 reasons:
2197	 *  1. We don't want to allocate more mbufs than necessary
2198	 *  2. We don't want to read past the end of file
2199	 */
2200	if ((error = vnode_size(vp, &file_size, vfs_context_current())) != 0) {
2201		ENXIO_10146739_DBG("%s: vnode_size error. %s");
2202		goto done2;
2203	}
2204
2205	/*
2206	 * Simply read file data into a chain of mbufs that used with scatter
2207	 * gather reads. We're not (yet?) setup to use zero copy external
2208	 * mbufs that point to the file pages.
2209	 */
2210	socket_lock(so, 1);
2211	error = sblock(&so->so_snd, M_WAIT);
2212	if (error) {
2213		socket_unlock(so, 1);
2214		goto done2;
2215	}
2216	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
2217		mbuf_t	m0 = NULL, m;
2218		unsigned int	nbufs = sendfileuiobufs, i;
2219		uio_t	auio;
2220		char	uio_buf[UIO_SIZEOF(sendfileuiobufs)]; /* 1 KB !!! */
2221		size_t	uiolen;
2222		user_ssize_t	rlen;
2223		off_t	pgoff;
2224		size_t	pktlen;
2225		boolean_t jumbocl;
2226
2227		/*
2228		 * Calculate the amount to transfer.
2229		 * Align to a whole number of pages.
2230		 * Do not exceed the send socket buffer space,
2231		 * the EOF, or the passed-in nbytes.
2232		 */
2233		xfsize = sbspace(&so->so_snd);
2234
2235		if (xfsize <= 0) {
2236			if (so->so_state & SS_CANTSENDMORE) {
2237				error = EPIPE;
2238				goto done3;
2239			} else if ((so->so_state & SS_NBIO)) {
2240				error = EAGAIN;
2241				goto done3;
2242			} else {
2243				xfsize = PAGE_SIZE;
2244			}
2245		}
2246
2247		if (xfsize > SENDFILE_MAX_BYTES)
2248			xfsize = SENDFILE_MAX_BYTES;
2249		else if (xfsize > PAGE_SIZE)
2250			xfsize = trunc_page(xfsize);
2251		pgoff = off & PAGE_MASK_64;
2252		if (pgoff > 0 && PAGE_SIZE - pgoff < xfsize)
2253			xfsize = PAGE_SIZE_64 - pgoff;
2254		if (nbytes && xfsize > (nbytes - sbytes))
2255			xfsize = nbytes - sbytes;
2256		if (xfsize <= 0)
2257			break;
2258		if (off + xfsize > file_size)
2259			xfsize = file_size - off;
2260		if (xfsize <= 0)
2261			break;
2262
2263		/*
2264		 * Attempt to use larger than system page-size clusters for
2265		 * large writes only if there is a jumbo cluster pool and
2266		 * if the socket is marked accordingly.
2267		 */
2268		jumbocl = sosendjcl && njcl > 0 &&
2269		    ((so->so_flags & SOF_MULTIPAGES) || sosendjcl_ignore_capab);
2270
2271		socket_unlock(so, 0);
2272		alloc_sendpkt(M_WAIT, xfsize, &nbufs, &m0, jumbocl);
2273		pktlen = mbuf_pkt_maxlen(m0);
2274		if (pktlen < (size_t)xfsize)
2275			xfsize = pktlen;
2276
2277		auio = uio_createwithbuffer(nbufs, off, UIO_SYSSPACE,
2278		    UIO_READ, &uio_buf[0], sizeof (uio_buf));
2279		if (auio == NULL) {
2280			printf("sendfile failed. nbufs = %d. %s", nbufs,
2281				"File a radar related to rdar://10146739.\n");
2282			mbuf_freem(m0);
2283			error = ENXIO;
2284			socket_lock(so, 0);
2285			goto done3;
2286		}
2287
2288		for (i = 0, m = m0, uiolen = 0;
2289		    i < nbufs && m != NULL && uiolen < (size_t)xfsize;
2290		    i++, m = mbuf_next(m)) {
2291			size_t mlen = mbuf_maxlen(m);
2292
2293			if (mlen + uiolen > (size_t)xfsize)
2294				mlen = xfsize - uiolen;
2295			mbuf_setlen(m, mlen);
2296			uio_addiov(auio, CAST_USER_ADDR_T(mbuf_datastart(m)),
2297			    mlen);
2298			uiolen += mlen;
2299		}
2300
2301		if (xfsize != uio_resid(auio))
2302			printf("sendfile: xfsize: %lld != uio_resid(auio): "
2303				"%lld\n", xfsize, (long long)uio_resid(auio));
2304
2305		KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_READ | DBG_FUNC_START),
2306		    uap->s, (unsigned int)((xfsize >> 32) & 0x0ffffffff),
2307		    (unsigned int)(xfsize & 0x0ffffffff), 0, 0);
2308		error = fo_read(fp, auio, FOF_OFFSET, &context);
2309		socket_lock(so, 0);
2310		if (error != 0) {
2311			if (uio_resid(auio) != xfsize && (error == ERESTART ||
2312			    error == EINTR || error == EWOULDBLOCK)) {
2313				error = 0;
2314			} else {
2315				ENXIO_10146739_DBG("%s: fo_read error. %s");
2316				mbuf_freem(m0);
2317				goto done3;
2318			}
2319		}
2320		xfsize -= uio_resid(auio);
2321		KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_READ | DBG_FUNC_END),
2322		    uap->s, (unsigned int)((xfsize >> 32) & 0x0ffffffff),
2323		    (unsigned int)(xfsize & 0x0ffffffff), 0, 0);
2324
2325		if (xfsize == 0) {
2326			//printf("sendfile: fo_read 0 bytes, EOF\n");
2327			break;
2328		}
2329		if (xfsize + off > file_size)
2330			printf("sendfile: xfsize: %lld + off: %lld > file_size:"
2331			    "%lld\n", xfsize, off, file_size);
2332		for (i = 0, m = m0, rlen = 0;
2333		    i < nbufs && m != NULL && rlen < xfsize;
2334		    i++, m = mbuf_next(m)) {
2335			size_t mlen = mbuf_maxlen(m);
2336
2337			if (rlen + mlen > (size_t)xfsize)
2338				mlen = xfsize - rlen;
2339			mbuf_setlen(m, mlen);
2340
2341			rlen += mlen;
2342		}
2343		mbuf_pkthdr_setlen(m0, xfsize);
2344
2345retry_space:
2346		/*
2347		 * Make sure that the socket is still able to take more data.
2348		 * CANTSENDMORE being true usually means that the connection
2349		 * was closed. so_error is true when an error was sensed after
2350		 * a previous send.
2351		 * The state is checked after the page mapping and buffer
2352		 * allocation above since those operations may block and make
2353		 * any socket checks stale. From this point forward, nothing
2354		 * blocks before the pru_send (or more accurately, any blocking
2355		 * results in a loop back to here to re-check).
2356		 */
2357		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
2358			if (so->so_state & SS_CANTSENDMORE) {
2359				error = EPIPE;
2360			} else {
2361				error = so->so_error;
2362				so->so_error = 0;
2363			}
2364			m_freem(m0);
2365			ENXIO_10146739_DBG("%s: Unexpected socket error. %s");
2366			goto done3;
2367		}
2368		/*
2369		 * Wait for socket space to become available. We do this just
2370		 * after checking the connection state above in order to avoid
2371		 * a race condition with sbwait().
2372		 */
2373		if (sbspace(&so->so_snd) < (long)so->so_snd.sb_lowat) {
2374			if (so->so_state & SS_NBIO) {
2375				m_freem(m0);
2376				error = EAGAIN;
2377				goto done3;
2378			}
2379			KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_WAIT |
2380			    DBG_FUNC_START), uap->s, 0, 0, 0, 0);
2381			error = sbwait(&so->so_snd);
2382			KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_WAIT|
2383			    DBG_FUNC_END), uap->s, 0, 0, 0, 0);
2384			/*
2385			 * An error from sbwait usually indicates that we've
2386			 * been interrupted by a signal. If we've sent anything
2387			 * then return bytes sent, otherwise return the error.
2388			 */
2389			if (error) {
2390				m_freem(m0);
2391				goto done3;
2392			}
2393			goto retry_space;
2394		}
2395
2396		struct mbuf *control = NULL;
2397		{
2398			/*
2399			 * Socket filter processing
2400			 */
2401
2402			error = sflt_data_out(so, NULL, &m0, &control, 0);
2403			if (error) {
2404				if (error == EJUSTRETURN) {
2405					error = 0;
2406					continue;
2407				}
2408				ENXIO_10146739_DBG("%s: sflt_data_out error. %s");
2409				goto done3;
2410			}
2411			/*
2412			 * End Socket filter processing
2413			 */
2414		}
2415		KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_SEND | DBG_FUNC_START),
2416		    uap->s, 0, 0, 0, 0);
2417		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m0,
2418		    0, control, p);
2419		KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_SEND | DBG_FUNC_START),
2420		    uap->s, 0, 0, 0, 0);
2421		if (error) {
2422			ENXIO_10146739_DBG("%s: pru_send error. %s");
2423			goto done3;
2424		}
2425	}
2426	sbunlock(&so->so_snd, 0);	/* will unlock socket */
2427	/*
2428	 * Send trailers. Wimp out and use writev(2).
2429	 */
2430	if (uap->hdtr != USER_ADDR_NULL &&
2431	    user_hdtr.trailers != USER_ADDR_NULL) {
2432		bzero(&nuap, sizeof (struct writev_args));
2433		nuap.fd = uap->s;
2434		nuap.iovp = user_hdtr.trailers;
2435		nuap.iovcnt = user_hdtr.trl_cnt;
2436		error = writev_nocancel(p, &nuap, &writev_retval);
2437		if (error) {
2438			ENXIO_10146739_DBG("%s: writev_nocancel error. %s");
2439			goto done2;
2440		}
2441		sbytes += writev_retval;
2442	}
2443done2:
2444	file_drop(uap->s);
2445done1:
2446	file_drop(uap->fd);
2447done:
2448	if (uap->nbytes != USER_ADDR_NULL) {
2449		/* XXX this appears bogus for some early failure conditions */
2450		copyout(&sbytes, uap->nbytes, sizeof (off_t));
2451	}
2452	KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE | DBG_FUNC_END), uap->s,
2453	    (unsigned int)((sbytes >> 32) & 0x0ffffffff),
2454	    (unsigned int)(sbytes & 0x0ffffffff), error, 0);
2455	return (error);
2456done3:
2457	sbunlock(&so->so_snd, 0);	/* will unlock socket */
2458	goto done2;
2459}
2460
2461
2462#endif /* SENDFILE */
2463