1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1993
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
61 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.9 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/kernel.h>
68#include <sys/sysctl.h>
69#include <sys/mbuf.h>
70#if INET6
71#include <sys/domain.h>
72#endif /* INET6 */
73#include <sys/kasl.h>
74#include <sys/socket.h>
75#include <sys/socketvar.h>
76#include <sys/protosw.h>
77#include <sys/syslog.h>
78
79#include <net/if.h>
80#include <net/route.h>
81#include <net/ntstat.h>
82
83#include <netinet/in.h>
84#include <netinet/in_systm.h>
85#if INET6
86#include <netinet/ip6.h>
87#endif
88#include <netinet/in_pcb.h>
89#if INET6
90#include <netinet6/in6_pcb.h>
91#endif
92#include <netinet/in_var.h>
93#include <netinet/ip_var.h>
94#if INET6
95#include <netinet6/ip6_var.h>
96#endif
97#include <netinet/tcp.h>
98#include <netinet/tcp_fsm.h>
99#include <netinet/tcp_seq.h>
100#include <netinet/tcp_timer.h>
101#include <netinet/tcp_var.h>
102#include <netinet/tcpip.h>
103#include <mach/sdt.h>
104#if TCPDEBUG
105#include <netinet/tcp_debug.h>
106#endif
107#if MPTCP
108#include <netinet/mptcp_var.h>
109#endif /* MPTCP */
110
111#if IPSEC
112#include <netinet6/ipsec.h>
113#endif /*IPSEC*/
114
115#if FLOW_DIVERT
116#include <netinet/flow_divert.h>
117#endif /* FLOW_DIVERT */
118
/*
 * Non-static entry points declared here; their definitions are not part
 * of the portion of the file that precedes the usrreq handlers.
 */
void	tcp_fill_info(struct tcpcb *, struct tcp_info *);
errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *);

/* Handler registered for the net.inet.tcp "info" sysctl node below. */
int tcp_sysctl_info(struct sysctl_oid *, void *, int , struct sysctl_req *);

/*
 * TCP protocol interface to socket abstraction.
 */
extern	char *tcpstates[];	/* XXX ??? */

/*
 * File-local helpers.  They are declared up front because the pru_*
 * handlers below call them before their definitions appear
 * (e.g. tcp_usr_connectx_common() calls tcp6_usr_connect()).
 */
static int	tcp_attach(struct socket *, struct proc *);
static int	tcp_connect(struct tcpcb *, struct sockaddr *, struct proc *);
#if INET6
static int	tcp6_connect(struct tcpcb *, struct sockaddr *, struct proc *);
static int	tcp6_usr_connect(struct socket *, struct sockaddr *,
		    struct proc *);
#endif /* INET6 */
static struct tcpcb *
		tcp_disconnect(struct tcpcb *);
static struct tcpcb *
		tcp_usrclosed(struct tcpcb *);

/* Defined outside this file. */
extern uint32_t tcp_autorcvbuf_max;

/* Defined outside this file; called from tcp_usr_rcvd() on so_rcv. */
extern void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
144
/*
 * Debug-trace helpers used by every pru_* handler in this file.
 *
 *   TCPDEBUG0      declares the local `ostate' (must sit among the
 *                  handler's declarations/statements).
 *   TCPDEBUG1()    snapshots tp->t_state into `ostate' (0 if tp is NULL).
 *   TCPDEBUG2(req) calls tcp_trace() for request `req' when tp is set
 *                  and the socket has SO_DEBUG enabled.
 *
 * The macros capture the caller's locals `tp' and `so' by name, so a
 * handler that uses them must have both in scope.  When TCPDEBUG is not
 * set they expand to nothing.
 */
#if TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	if (tp && (so->so_options & SO_DEBUG)) \
				tcp_trace(TA_USER, ostate, tp, 0, 0, req)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
155
/*
 * Register the "info" node under _net_inet_tcp; requests are serviced
 * by tcp_sysctl_info().  The node is flagged read/write and
 * CTLFLAG_ANYBODY, with format string "S".
 */
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, info,
    CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
    0 , 0, tcp_sysctl_info, "S", "TCP info per tuple");
159
160/*
161 * TCP attaches to socket via pru_attach(), reserving space,
162 * and an internet control block.
163 *
164 * Returns:	0			Success
165 *		EISCONN
166 *	tcp_attach:ENOBUFS
167 *	tcp_attach:ENOMEM
168 *	tcp_attach:???			[IPSEC specific]
169 */
170static int
171tcp_usr_attach(struct socket *so, __unused int proto, struct proc *p)
172{
173	int error;
174	struct inpcb *inp = sotoinpcb(so);
175	struct tcpcb *tp = 0;
176	TCPDEBUG0;
177
178	TCPDEBUG1();
179	if (inp) {
180		error = EISCONN;
181		goto out;
182	}
183
184	error = tcp_attach(so, p);
185	if (error)
186		goto out;
187
188	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
189		so->so_linger = TCP_LINGERTIME * hz;
190	tp = sototcpcb(so);
191out:
192	TCPDEBUG2(PRU_ATTACH);
193	return error;
194}
195
196/*
197 * pru_detach() detaches the TCP protocol from the socket.
198 * If the protocol state is non-embryonic, then can't
199 * do this directly: have to initiate a pru_disconnect(),
200 * which may finish later; embryonic TCB's can just
201 * be discarded here.
202 */
203static int
204tcp_usr_detach(struct socket *so)
205{
206	int error = 0;
207	struct inpcb *inp = sotoinpcb(so);
208	struct tcpcb *tp;
209	TCPDEBUG0;
210
211	if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
212		return EINVAL;	/* XXX */
213	}
214	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
215	tp = intotcpcb(inp);
216	/* In case we got disconnected from the peer */
217        if (tp == NULL)
218		goto out;
219	TCPDEBUG1();
220
221	calculate_tcp_clock();
222
223	tp = tcp_disconnect(tp);
224out:
225	TCPDEBUG2(PRU_DETACH);
226	return error;
227}
228
/*
 * Shared prologue/epilogue for the pru_* handlers.
 *
 * COMMON_START() relies on the caller's locals `inp' and `tp' (plus
 * whatever TCPDEBUG0/TCPDEBUG1 need).  It contains hidden returns:
 *   - EINVAL     when inp is NULL or the PCB is in INPCB_STATE_DEAD
 *   - EPROTOTYPE when the PCB is flagged INP2_WANT_FLOW_DIVERT
 * Otherwise it loads tp from inp, takes the debug snapshot and calls
 * calculate_tcp_clock().  Note that tp itself is NOT checked for NULL.
 *
 * COMMON_END(req) defines the `out' label, emits the debug trace and
 * returns the caller's local `error'.  The trailing `goto out' is
 * unreachable; presumably it exists so the label counts as used in
 * handlers that never jump to it -- NOTE(review): confirm intent.
 */
#define	COMMON_START()	TCPDEBUG0;					\
do {									\
	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)		\
		return (EINVAL);					\
	if (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT)			\
		return (EPROTOTYPE);					\
	tp = intotcpcb(inp);						\
	TCPDEBUG1();							\
	calculate_tcp_clock();						\
} while (0)

#define COMMON_END(req)	out: TCPDEBUG2(req); return error; goto out
241
242
243/*
244 * Give the socket an address.
245 *
246 * Returns:	0			Success
247 *		EINVAL			Invalid argument [COMMON_START]
248 *		EAFNOSUPPORT		Address family not supported
249 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
250 *	in_pcbbind:EINVAL		Invalid argument
251 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
252 *	in_pcbbind:EACCES		Permission denied
253 *	in_pcbbind:EADDRINUSE		Address in use
254 *	in_pcbbind:EAGAIN		Resource unavailable, try again
255 *	in_pcbbind:EPERM		Operation not permitted
256 */
257static int
258tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
259{
260	int error = 0;
261	struct inpcb *inp = sotoinpcb(so);
262	struct tcpcb *tp;
263	struct sockaddr_in *sinp;
264
265	COMMON_START();
266
267	if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
268		error = EAFNOSUPPORT;
269		goto out;
270	}
271
272	/*
273	 * Must check for multicast addresses and disallow binding
274	 * to them.
275	 */
276	sinp = (struct sockaddr_in *)(void *)nam;
277	if (sinp->sin_family == AF_INET &&
278	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
279		error = EAFNOSUPPORT;
280		goto out;
281	}
282	error = in_pcbbind(inp, nam, p);
283	if (error)
284		goto out;
285	COMMON_END(PRU_BIND);
286
287}
288
289#if INET6
290static int
291tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
292{
293	int error = 0;
294	struct inpcb *inp = sotoinpcb(so);
295	struct tcpcb *tp;
296	struct sockaddr_in6 *sin6p;
297
298	COMMON_START();
299
300	if (nam->sa_family != 0 && nam->sa_family != AF_INET6) {
301		error = EAFNOSUPPORT;
302		goto out;
303	}
304
305	/*
306	 * Must check for multicast addresses and disallow binding
307	 * to them.
308	 */
309	sin6p = (struct sockaddr_in6 *)(void *)nam;
310	if (sin6p->sin6_family == AF_INET6 &&
311	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
312		error = EAFNOSUPPORT;
313		goto out;
314	}
315	inp->inp_vflag &= ~INP_IPV4;
316	inp->inp_vflag |= INP_IPV6;
317	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
318		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
319			inp->inp_vflag |= INP_IPV4;
320		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
321			struct sockaddr_in sin;
322
323			in6_sin6_2_sin(&sin, sin6p);
324			inp->inp_vflag |= INP_IPV4;
325			inp->inp_vflag &= ~INP_IPV6;
326			error = in_pcbbind(inp, (struct sockaddr *)&sin, p);
327			goto out;
328		}
329	}
330	error = in6_pcbbind(inp, nam, p);
331	if (error)
332		goto out;
333	COMMON_END(PRU_BIND);
334}
335#endif /* INET6 */
336
337/*
338 * Prepare to accept connections.
339 *
340 * Returns:	0			Success
341 *		EINVAL [COMMON_START]
342 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
343 *	in_pcbbind:EINVAL		Invalid argument
344 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
345 *	in_pcbbind:EACCES		Permission denied
346 *	in_pcbbind:EADDRINUSE		Address in use
347 *	in_pcbbind:EAGAIN		Resource unavailable, try again
348 *	in_pcbbind:EPERM		Operation not permitted
349 */
350static int
351tcp_usr_listen(struct socket *so, struct proc *p)
352{
353	int error = 0;
354	struct inpcb *inp = sotoinpcb(so);
355	struct tcpcb *tp;
356
357	COMMON_START();
358	if (inp->inp_lport == 0)
359		error = in_pcbbind(inp, NULL, p);
360	if (error == 0)
361		tp->t_state = TCPS_LISTEN;
362	COMMON_END(PRU_LISTEN);
363}
364
365#if INET6
366static int
367tcp6_usr_listen(struct socket *so, struct proc *p)
368{
369	int error = 0;
370	struct inpcb *inp = sotoinpcb(so);
371	struct tcpcb *tp;
372
373	COMMON_START();
374	if (inp->inp_lport == 0) {
375		inp->inp_vflag &= ~INP_IPV4;
376		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
377			inp->inp_vflag |= INP_IPV4;
378		error = in6_pcbbind(inp, NULL, p);
379	}
380	if (error == 0)
381		tp->t_state = TCPS_LISTEN;
382	COMMON_END(PRU_LISTEN);
383}
384#endif /* INET6 */
385
386/*
387 * Initiate connection to peer.
388 * Create a template for use in transmissions on this connection.
389 * Enter SYN_SENT state, and mark socket as connecting.
390 * Start keep-alive timer, and seed output sequence space.
391 * Send initial segment on connection.
392 */
393static int
394tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
395{
396	int error = 0;
397	struct inpcb *inp = sotoinpcb(so);
398	struct tcpcb *tp;
399	struct sockaddr_in *sinp;
400
401	TCPDEBUG0;
402	if (inp == NULL) {
403		return EINVAL;
404	} else if (inp->inp_state == INPCB_STATE_DEAD) {
405		if (so->so_error) {
406			error = so->so_error;
407			so->so_error = 0;
408			return error;
409		} else
410			return EINVAL;
411	}
412#if FLOW_DIVERT
413   	else if (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT) {
414		uint32_t fd_ctl_unit = 0;
415		error = flow_divert_check_policy(so, p, FALSE, &fd_ctl_unit);
416		if (error == 0) {
417			if (fd_ctl_unit > 0) {
418				error = flow_divert_pcb_init(so, fd_ctl_unit);
419				if (error == 0) {
420					error = flow_divert_connect_out(so, nam, p);
421				}
422			} else {
423				error = ENETDOWN;
424			}
425		}
426		return error;
427	}
428#endif /* FLOW_DIVERT */
429	tp = intotcpcb(inp);
430	TCPDEBUG1();
431
432	calculate_tcp_clock();
433
434	if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
435		error = EAFNOSUPPORT;
436		goto out;
437	}
438	/*
439	 * Must disallow TCP ``connections'' to multicast addresses.
440	 */
441	sinp = (struct sockaddr_in *)(void *)nam;
442	if (sinp->sin_family == AF_INET
443	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
444		error = EAFNOSUPPORT;
445		goto out;
446	}
447
448	if ((error = tcp_connect(tp, nam, p)) != 0)
449		goto out;
450	error = tcp_output(tp);
451	COMMON_END(PRU_CONNECT);
452}
453
454static int
455tcp_usr_connectx_common(struct socket *so, int af,
456    struct sockaddr_list **src_sl, struct sockaddr_list **dst_sl,
457    struct proc *p, uint32_t ifscope, associd_t aid, connid_t *pcid,
458    uint32_t flags, void *arg, uint32_t arglen)
459{
460#pragma unused(aid)
461#if !MPTCP
462#pragma unused(flags, arg, arglen)
463#endif /* !MPTCP */
464	struct sockaddr_entry *src_se = NULL, *dst_se = NULL;
465	struct inpcb *inp = sotoinpcb(so);
466	int error;
467
468	if (inp == NULL)
469		return (EINVAL);
470
471	VERIFY(dst_sl != NULL);
472
473	/* select source (if specified) and destination addresses */
474	error = in_selectaddrs(af, src_sl, &src_se, dst_sl, &dst_se);
475	if (error != 0)
476		return (error);
477
478	VERIFY(*dst_sl != NULL && dst_se != NULL);
479	VERIFY(src_se == NULL || *src_sl != NULL);
480	VERIFY(dst_se->se_addr->sa_family == af);
481	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
482
483	/*
484	 * We get here for 2 cases:
485	 *
486	 *   a. From MPTCP, to connect a subflow.  There is no need to
487	 *	bind the socket to the source address and/or interface,
488	 *	since everything has been taken care of by MPTCP.  We
489	 *	simply check whether or not this is for the initial
490	 *	MPTCP connection attempt, or to join an existing one.
491	 *
492	 *   b.	From the socket layer, to connect a TCP.  Perform the
493	 *	bind to source address and/or interface as necessary.
494	 */
495#if MPTCP
496	if (flags & TCP_CONNREQF_MPTCP) {
497		struct mptsub_connreq *mpcr = arg;
498
499		/* Check to make sure this came down from MPTCP */
500		if (arg == NULL || arglen != sizeof (*mpcr))
501			return (EOPNOTSUPP);
502
503		switch (mpcr->mpcr_type) {
504		case MPTSUB_CONNREQ_MP_ENABLE:
505			break;
506		case MPTSUB_CONNREQ_MP_ADD:
507			break;
508		default:
509			return (EOPNOTSUPP);
510		}
511	} else
512#endif /* MPTCP */
513	{
514		/* bind socket to the specified interface, if requested */
515		if (ifscope != IFSCOPE_NONE &&
516		    (error = inp_bindif(inp, ifscope, NULL)) != 0)
517			return (error);
518
519		/* if source address and/or port is specified, bind to it */
520		if (src_se != NULL) {
521			struct sockaddr *sa = src_se->se_addr;
522			error = sobindlock(so, sa, 0);	/* already locked */
523			if (error != 0)
524				return (error);
525		}
526	}
527
528	switch (af) {
529	case AF_INET:
530		error = tcp_usr_connect(so, dst_se->se_addr, p);
531		break;
532#if INET6
533	case AF_INET6:
534		error = tcp6_usr_connect(so, dst_se->se_addr, p);
535		break;
536#endif /* INET6 */
537	default:
538		VERIFY(0);
539		/* NOTREACHED */
540	}
541
542	if (error == 0 && pcid != NULL)
543		*pcid = 1;	/* there is only 1 connection for a TCP */
544
545	return (error);
546}
547
548static int
549tcp_usr_connectx(struct socket *so, struct sockaddr_list **src_sl,
550    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
551    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
552    uint32_t arglen)
553{
554	return (tcp_usr_connectx_common(so, AF_INET, src_sl, dst_sl,
555	    p, ifscope, aid, pcid, flags, arg, arglen));
556}
557
558#if INET6
559static int
560tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
561{
562	int error = 0;
563	struct inpcb *inp = sotoinpcb(so);
564	struct tcpcb *tp;
565	struct sockaddr_in6 *sin6p;
566
567	TCPDEBUG0;
568	if (inp == NULL) {
569		return EINVAL;
570	} else if (inp->inp_state == INPCB_STATE_DEAD) {
571		if (so->so_error) {
572			error = so->so_error;
573			so->so_error = 0;
574			return error;
575		} else
576			return EINVAL;
577	}
578#if FLOW_DIVERT
579	else if (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT) {
580		uint32_t fd_ctl_unit = 0;
581		error = flow_divert_check_policy(so, p, FALSE, &fd_ctl_unit);
582		if (error == 0) {
583			if (fd_ctl_unit > 0) {
584				error = flow_divert_pcb_init(so, fd_ctl_unit);
585				if (error == 0) {
586					error = flow_divert_connect_out(so, nam, p);
587				}
588			} else {
589				error = ENETDOWN;
590			}
591		}
592		return error;
593	}
594#endif /* FLOW_DIVERT */
595	tp = intotcpcb(inp);
596	TCPDEBUG1();
597
598	calculate_tcp_clock();
599
600	if (nam->sa_family != 0 && nam->sa_family != AF_INET6) {
601		error = EAFNOSUPPORT;
602		goto out;
603	}
604
605	/*
606	 * Must disallow TCP ``connections'' to multicast addresses.
607	 */
608	sin6p = (struct sockaddr_in6 *)(void *)nam;
609	if (sin6p->sin6_family == AF_INET6
610	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
611		error = EAFNOSUPPORT;
612		goto out;
613	}
614
615	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
616		struct sockaddr_in sin;
617
618		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
619			return (EINVAL);
620
621		in6_sin6_2_sin(&sin, sin6p);
622		inp->inp_vflag |= INP_IPV4;
623		inp->inp_vflag &= ~INP_IPV6;
624		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0)
625			goto out;
626		error = tcp_output(tp);
627		goto out;
628	}
629	inp->inp_vflag &= ~INP_IPV4;
630	inp->inp_vflag |= INP_IPV6;
631	if ((error = tcp6_connect(tp, nam, p)) != 0)
632		goto out;
633	error = tcp_output(tp);
634	if (error)
635		goto out;
636	COMMON_END(PRU_CONNECT);
637}
638
639static int
640tcp6_usr_connectx(struct socket *so, struct sockaddr_list **src_sl,
641    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
642    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
643    uint32_t arglen)
644{
645	return (tcp_usr_connectx_common(so, AF_INET6, src_sl, dst_sl,
646	    p, ifscope, aid, pcid, flags, arg, arglen));
647}
648#endif /* INET6 */
649
650/*
651 * Initiate disconnect from peer.
652 * If connection never passed embryonic stage, just drop;
653 * else if don't need to let data drain, then can just drop anyways,
654 * else have to begin TCP shutdown process: mark socket disconnecting,
655 * drain unread data, state switch to reflect user close, and
656 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
657 * when peer sends FIN and acks ours.
658 *
659 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
660 */
661static int
662tcp_usr_disconnect(struct socket *so)
663{
664	int error = 0;
665	struct inpcb *inp = sotoinpcb(so);
666	struct tcpcb *tp;
667
668	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
669	    LCK_MTX_ASSERT_OWNED);
670	COMMON_START();
671        /* In case we got disconnected from the peer */
672        if (tp == NULL)
673		goto out;
674	tp = tcp_disconnect(tp);
675	COMMON_END(PRU_DISCONNECT);
676}
677
678/*
679 * User-protocol pru_disconnectx callback.
680 */
681static int
682tcp_usr_disconnectx(struct socket *so, associd_t aid, connid_t cid)
683{
684#pragma unused(cid)
685	if (aid != ASSOCID_ANY && aid != ASSOCID_ALL)
686		return (EINVAL);
687
688	return (tcp_usr_disconnect(so));
689}
690
691/*
692 * Accept a connection.  Essentially all the work is
693 * done at higher levels; just return the address
694 * of the peer, storing through addr.
695 */
696static int
697tcp_usr_accept(struct socket *so, struct sockaddr **nam)
698{
699	int error = 0;
700	struct inpcb *inp = sotoinpcb(so);
701	struct tcpcb *tp = NULL;
702	TCPDEBUG0;
703
704	in_getpeeraddr(so, nam);
705
706	if (so->so_state & SS_ISDISCONNECTED) {
707		error = ECONNABORTED;
708		goto out;
709	}
710	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
711		return (EINVAL);
712	else if (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT)
713		return (EPROTOTYPE);
714
715	tp = intotcpcb(inp);
716	TCPDEBUG1();
717
718	calculate_tcp_clock();
719
720	COMMON_END(PRU_ACCEPT);
721}
722
723#if INET6
724static int
725tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
726{
727	int error = 0;
728	struct inpcb *inp = sotoinpcb(so);
729	struct tcpcb *tp = NULL;
730	TCPDEBUG0;
731
732	if (so->so_state & SS_ISDISCONNECTED) {
733		error = ECONNABORTED;
734		goto out;
735	}
736	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
737		return (EINVAL);
738	else if (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT)
739		return (EPROTOTYPE);
740
741	tp = intotcpcb(inp);
742	TCPDEBUG1();
743
744	calculate_tcp_clock();
745
746	in6_mapped_peeraddr(so, nam);
747	COMMON_END(PRU_ACCEPT);
748}
749#endif /* INET6 */
750
751/*
752 * Mark the connection as being incapable of further output.
753 *
754 * Returns:	0			Success
755 *		EINVAL [COMMON_START]
756 *	tcp_output:EADDRNOTAVAIL
757 *	tcp_output:ENOBUFS
758 *	tcp_output:EMSGSIZE
759 *	tcp_output:EHOSTUNREACH
760 *	tcp_output:ENETUNREACH
761 *	tcp_output:ENETDOWN
762 *	tcp_output:ENOMEM
763 *	tcp_output:EACCES
764 *	tcp_output:EMSGSIZE
765 *	tcp_output:ENOBUFS
766 *	tcp_output:???			[ignorable: mostly IPSEC/firewall/DLIL]
767 */
768static int
769tcp_usr_shutdown(struct socket *so)
770{
771	int error = 0;
772	struct inpcb *inp = sotoinpcb(so);
773	struct tcpcb *tp;
774
775	TCPDEBUG0;
776	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
777		return (EINVAL);
778
779	socantsendmore(so);
780
781        /*
782	 * In case we got disconnected from the peer, or if this is
783	 * a socket that is to be flow-diverted (but not yet).
784	 */
785	tp = intotcpcb(inp);
786	TCPDEBUG1();
787        if (tp == NULL || (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT)) {
788		if (tp != NULL)
789			error = EPROTOTYPE;
790		goto out;
791	}
792
793	calculate_tcp_clock();
794
795	tp = tcp_usrclosed(tp);
796#if MPTCP
797	/* A reset has been sent but socket exists, do not send FIN */
798	if ((so->so_flags & SOF_MP_SUBFLOW) &&
799	    (tp) && (tp->t_mpflags & TMPF_RESET)) {
800		goto out;
801	}
802#endif
803	if (tp)
804		error = tcp_output(tp);
805	COMMON_END(PRU_SHUTDOWN);
806}
807
808/*
809 * After a receive, possibly send window update to peer.
810 */
811static int
812tcp_usr_rcvd(struct socket *so, __unused int flags)
813{
814	int error = 0;
815	struct inpcb *inp = sotoinpcb(so);
816	struct tcpcb *tp;
817
818	COMMON_START();
819        /* In case we got disconnected from the peer */
820        if (tp == NULL)
821		goto out;
822	tcp_sbrcv_trim(tp, &so->so_rcv);
823
824	tcp_output(tp);
825	COMMON_END(PRU_RCVD);
826}
827
828/*
829 * Do a send by putting data in output queue and updating urgent
830 * marker if URG set.  Possibly send more data.  Unlike the other
831 * pru_*() routines, the mbuf chains are our responsibility.  We
832 * must either enqueue them or free them.  The other pru_* routines
833 * generally are caller-frees.
834 *
835 * Returns:	0			Success
836 *		ECONNRESET
837 *		EINVAL
838 *		ENOBUFS
839 *	tcp_connect:EADDRINUSE		Address in use
840 *	tcp_connect:EADDRNOTAVAIL	Address not available.
841 *	tcp_connect:EINVAL		Invalid argument
842 *	tcp_connect:EAFNOSUPPORT	Address family not supported [notdef]
843 *	tcp_connect:EACCES		Permission denied
844 *	tcp_connect:EAGAIN		Resource unavailable, try again
845 *	tcp_connect:EPERM		Operation not permitted
846 *	tcp_output:EADDRNOTAVAIL
847 *	tcp_output:ENOBUFS
848 *	tcp_output:EMSGSIZE
849 *	tcp_output:EHOSTUNREACH
850 *	tcp_output:ENETUNREACH
851 *	tcp_output:ENETDOWN
852 *	tcp_output:ENOMEM
853 *	tcp_output:EACCES
854 *	tcp_output:EMSGSIZE
855 *	tcp_output:ENOBUFS
856 *	tcp_output:???			[ignorable: mostly IPSEC/firewall/DLIL]
857 *	tcp6_connect:???		[IPV6 only]
858 */
859static int
860tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
861     struct sockaddr *nam, struct mbuf *control, struct proc *p)
862{
863	int error = 0;
864	struct inpcb *inp = sotoinpcb(so);
865	struct tcpcb *tp;
866	uint32_t msgpri = MSG_PRI_DEFAULT;
867#if INET6
868	int isipv6;
869#endif
870	TCPDEBUG0;
871
872	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD ||
873	    (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT)) {
874		/*
875		 * OOPS! we lost a race, the TCP session got reset after
876		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
877		 * network interrupt in the non-splnet() section of sosend().
878		 */
879		if (m != NULL)
880			m_freem(m);
881		if (control != NULL) {
882			m_freem(control);
883			control = NULL;
884		}
885		if (inp != NULL && (inp->inp_flags2 & INP2_WANT_FLOW_DIVERT))
886			error = EPROTOTYPE;
887		else
888			error = ECONNRESET;	/* XXX EPIPE? */
889		tp = NULL;
890		TCPDEBUG1();
891		goto out;
892	}
893#if INET6
894	isipv6 = nam && nam->sa_family == AF_INET6;
895#endif /* INET6 */
896	tp = intotcpcb(inp);
897	TCPDEBUG1();
898
899	calculate_tcp_clock();
900
901	if (control != NULL) {
902		if (so->so_flags & SOF_ENABLE_MSGS) {
903			/* Get the msg priority from control mbufs */
904			error = tcp_get_msg_priority(control, &msgpri);
905			if (error) {
906				m_freem(control);
907				if (m != NULL)
908					m_freem(m);
909				control = NULL;
910				m = NULL;
911				goto out;
912			}
913			m_freem(control);
914			control = NULL;
915		} else if (control->m_len) {
916			/*
917			 * if not unordered, TCP should not have
918			 * control mbufs
919			 */
920			m_freem(control);
921			if (m != NULL)
922				m_freem(m);
923			control = NULL;
924			m = NULL;
925			error = EINVAL;
926			goto out;
927		}
928	}
929
930	if (so->so_flags & SOF_ENABLE_MSGS) {
931		VERIFY(m->m_flags & M_PKTHDR);
932		m->m_pkthdr.msg_pri = msgpri;
933	}
934
935	/* MPTCP sublow socket buffers must not be compressed */
936	VERIFY(!(so->so_flags & SOF_MP_SUBFLOW) ||
937	    (so->so_snd.sb_flags & SB_NOCOMPRESS));
938
939	if(!(flags & PRUS_OOB)) {
940		/* Call msg send if message delivery is enabled */
941		if (so->so_flags & SOF_ENABLE_MSGS)
942			sbappendmsg_snd(&so->so_snd, m);
943		else
944			sbappendstream(&so->so_snd, m);
945
946		if (nam && tp->t_state < TCPS_SYN_SENT) {
947			/*
948			 * Do implied connect if not yet connected,
949			 * initialize window to default value, and
950			 * initialize maxseg/maxopd using peer's cached
951			 * MSS.
952			 */
953#if INET6
954			if (isipv6)
955				error = tcp6_connect(tp, nam, p);
956			else
957#endif /* INET6 */
958				error = tcp_connect(tp, nam, p);
959			if (error)
960				goto out;
961			tp->snd_wnd = TTCP_CLIENT_SND_WND;
962			tcp_mss(tp, -1, IFSCOPE_NONE);
963		}
964
965		if (flags & PRUS_EOF) {
966			/*
967			 * Close the send side of the connection after
968			 * the data is sent.
969			 */
970			socantsendmore(so);
971			tp = tcp_usrclosed(tp);
972		}
973		if (tp != NULL) {
974			if (flags & PRUS_MORETOCOME)
975				tp->t_flags |= TF_MORETOCOME;
976			error = tcp_output(tp);
977			if (flags & PRUS_MORETOCOME)
978				tp->t_flags &= ~TF_MORETOCOME;
979		}
980	} else {
981		if (sbspace(&so->so_snd) == 0) {
982			/* if no space is left in sockbuf,
983			 * do not try to squeeze in OOB traffic */
984			m_freem(m);
985			error = ENOBUFS;
986			goto out;
987		}
988		/*
989		 * According to RFC961 (Assigned Protocols),
990		 * the urgent pointer points to the last octet
991		 * of urgent data.  We continue, however,
992		 * to consider it to indicate the first octet
993		 * of data past the urgent section.
994		 * Otherwise, snd_up should be one lower.
995		 */
996		sbappendstream(&so->so_snd, m);
997		if (nam && tp->t_state < TCPS_SYN_SENT) {
998			/*
999			 * Do implied connect if not yet connected,
1000			 * initialize window to default value, and
1001			 * initialize maxseg/maxopd using peer's cached
1002			 * MSS.
1003			 */
1004#if INET6
1005			if (isipv6)
1006				error = tcp6_connect(tp, nam, p);
1007			else
1008#endif /* INET6 */
1009			error = tcp_connect(tp, nam, p);
1010			if (error)
1011				goto out;
1012			tp->snd_wnd = TTCP_CLIENT_SND_WND;
1013			tcp_mss(tp, -1, IFSCOPE_NONE);
1014		}
1015		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
1016		tp->t_force = 1;
1017		error = tcp_output(tp);
1018		tp->t_force = 0;
1019	}
1020	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
1021		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
1022}
1023
1024/*
1025 * Abort the TCP.
1026 */
1027static int
1028tcp_usr_abort(struct socket *so)
1029{
1030	int error = 0;
1031	struct inpcb *inp = sotoinpcb(so);
1032	struct tcpcb *tp;
1033
1034	COMMON_START();
1035        /* In case we got disconnected from the peer */
1036        if (tp == NULL)
1037		goto out;
1038	tp = tcp_drop(tp, ECONNABORTED);
1039	so->so_usecount--;
1040	COMMON_END(PRU_ABORT);
1041}
1042
1043/*
1044 * Receive out-of-band data.
1045 *
1046 * Returns:	0			Success
1047 *		EINVAL [COMMON_START]
1048 *		EINVAL
1049 *		EWOULDBLOCK
1050 */
1051static int
1052tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1053{
1054	int error = 0;
1055	struct inpcb *inp = sotoinpcb(so);
1056	struct tcpcb *tp;
1057
1058	COMMON_START();
1059	if ((so->so_oobmark == 0 &&
1060	     (so->so_state & SS_RCVATMARK) == 0) ||
1061	    so->so_options & SO_OOBINLINE ||
1062	    tp->t_oobflags & TCPOOB_HADDATA) {
1063		error = EINVAL;
1064		goto out;
1065	}
1066	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1067		error = EWOULDBLOCK;
1068		goto out;
1069	}
1070	m->m_len = 1;
1071	*mtod(m, caddr_t) = tp->t_iobc;
1072	if ((flags & MSG_PEEK) == 0)
1073		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1074	COMMON_END(PRU_RCVOOB);
1075}
1076
1077/* xxx - should be const */
1078struct pr_usrreqs tcp_usrreqs = {
1079	.pru_abort =		tcp_usr_abort,
1080	.pru_accept =		tcp_usr_accept,
1081	.pru_attach =		tcp_usr_attach,
1082	.pru_bind =		tcp_usr_bind,
1083	.pru_connect =		tcp_usr_connect,
1084	.pru_connectx =		tcp_usr_connectx,
1085	.pru_control =		in_control,
1086	.pru_detach =		tcp_usr_detach,
1087	.pru_disconnect =	tcp_usr_disconnect,
1088	.pru_disconnectx =	tcp_usr_disconnectx,
1089	.pru_listen =		tcp_usr_listen,
1090	.pru_peeraddr =		in_getpeeraddr,
1091	.pru_rcvd =		tcp_usr_rcvd,
1092	.pru_rcvoob =		tcp_usr_rcvoob,
1093	.pru_send =		tcp_usr_send,
1094	.pru_shutdown =		tcp_usr_shutdown,
1095	.pru_sockaddr =		in_getsockaddr,
1096	.pru_sosend =		sosend,
1097	.pru_soreceive =	soreceive,
1098};
1099
#if INET6
/*
 * pr_usrreqs entry points used for TCP sockets in the IPv6 domain.
 * Handlers that are not address-family specific are the same ones
 * tcp_usrreqs uses.
 */
struct pr_usrreqs tcp6_usrreqs = {
	/* pcb lifecycle */
	.pru_attach =		tcp_usr_attach,
	.pru_detach =		tcp_usr_detach,
	.pru_abort =		tcp_usr_abort,
	.pru_shutdown =		tcp_usr_shutdown,

	/* connection setup and teardown */
	.pru_bind =		tcp6_usr_bind,
	.pru_listen =		tcp6_usr_listen,
	.pru_accept =		tcp6_usr_accept,
	.pru_connect =		tcp6_usr_connect,
	.pru_connectx =		tcp6_usr_connectx,
	.pru_disconnect =	tcp_usr_disconnect,
	.pru_disconnectx =	tcp_usr_disconnectx,

	/* data transfer */
	.pru_send =		tcp_usr_send,
	.pru_rcvd =		tcp_usr_rcvd,
	.pru_rcvoob =		tcp_usr_rcvoob,
	.pru_sosend =		sosend,
	.pru_soreceive =	soreceive,

	/* addressing and control */
	.pru_sockaddr =		in6_mapped_sockaddr,
	.pru_peeraddr =		in6_mapped_peeraddr,
	.pru_control =		in6_control,
};
#endif /* INET6 */
1123
/*
 * Common subroutine to open a TCP connection to the remote host specified
 * by the struct sockaddr_in that nam points to.  Call in_pcbbind to assign
 * a local port number if needed.  Call in_pcbladdr to do the routing and
 * to choose a local host address (interface).  If there is an existing
 * incarnation of the same connection in TIME-WAIT state and if the remote
 * host was sending CC options and if the connection duration was < MSL,
 * then truncate the previous TIME-WAIT state and proceed.
 * Initialize connection parameters and enter SYN-SENT state.
 *
 * Parameters:	tp			TCP control block being connected
 *		nam			Destination; read as a sockaddr_in
 *		p			Process passed through to in_pcbbind
 *
 * The socket lock on tp's socket is dropped and retaken inside this
 * function (around the pcb hash lookup, and possibly while waiting for
 * the pcbinfo lock).
 *
 * Returns:	0			Success
 *		EADDRINUSE
 *		EINVAL
 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
 *	in_pcbbind:EINVAL		Invalid argument
 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
 *	in_pcbbind:EACCES		Permission denied
 *	in_pcbbind:EADDRINUSE		Address in use
 *	in_pcbbind:EAGAIN		Resource unavailable, try again
 *	in_pcbbind:EPERM		Operation not permitted
 *	in_pcbladdr:EINVAL		Invalid argument
 *	in_pcbladdr:EAFNOSUPPORT	Address family not supported
 *	in_pcbladdr:EADDRNOTAVAIL	Address not available
 */
static int
tcp_connect(tp, nam, p)
	register struct tcpcb *tp;
	struct sockaddr *nam;
	struct proc *p;
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *otp;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
	struct in_addr laddr;
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
	int error = 0;
	struct ifnet *outif = NULL;

	/* No local port assigned yet: let in_pcbbind pick one. */
	if (inp->inp_lport == 0) {
		error = in_pcbbind(inp, NULL, p);
		if (error)
			goto done;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in_pcbladdr(inp, nam, &laddr, IFSCOPE_NONE, &outif);
	if (error)
		goto done;

	/*
	 * Look for an existing pcb with the same 4-tuple.  The socket lock
	 * is released for the duration of the hash lookup and retaken
	 * afterwards.  The local address used for the lookup is the bound
	 * one if there is one, otherwise the one in_pcbladdr just chose.
	 */
	tcp_unlock(inp->inp_socket, 0, 0);
	oinp = in_pcblookup_hash(inp->inp_pcbinfo,
	    sin->sin_addr, sin->sin_port,
	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr : laddr,
	    inp->inp_lport,  0, NULL);

	tcp_lock(inp->inp_socket, 0, 0);
	if (oinp) {
		if (oinp != inp) /* 4143933: avoid deadlock if inp == oinp */
			tcp_lock(oinp->inp_socket, 1, 0);
		/*
		 * NOTE(review): presumably this drops the reference taken by
		 * in_pcblookup_hash -- confirm.  A pcb reported as
		 * WNT_STOPUSING is treated as if nothing had been found.
		 */
		if (in_pcb_checkstate(oinp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			if (oinp != inp)
				tcp_unlock(oinp->inp_socket, 1, 0);
			goto skip_oinp;
		}

		/*
		 * Only a different pcb that is in TIME_WAIT, younger than
		 * tcp_msl and that received CC options may be closed early
		 * to make room; anything else is an address conflict.
		 */
		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
		    otp->t_state == TCPS_TIME_WAIT &&
		    ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
		    (otp->t_flags & TF_RCVD_CC)) {
			otp = tcp_close(otp);
		} else {
			printf("tcp_connect: inp=0x%llx err=EADDRINUSE\n",
			    (uint64_t)VM_KERNEL_ADDRPERM(inp));
			if (oinp != inp)
				tcp_unlock(oinp->inp_socket, 1, 0);
			error = EADDRINUSE;
			goto done;
		}
		if (oinp != inp)
			tcp_unlock(oinp->inp_socket, 1, 0);
	}
skip_oinp:
	/*
	 * Refuse a connection whose effective local address and port are
	 * identical to the destination address and port.
	 */
	if ((inp->inp_laddr.s_addr == INADDR_ANY ? laddr.s_addr :
	    inp->inp_laddr.s_addr) == sin->sin_addr.s_addr &&
	    inp->inp_lport == sin->sin_port) {
		error = EINVAL;
		goto done;
	}
	/*
	 * Take the pcbinfo lock exclusively for the rehash.  If it cannot
	 * be had immediately, drop the socket lock, block on the pcbinfo
	 * lock, then retake the socket lock.
	 */
	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) {
		/* lock inversion issue, mostly with udp multicast packets */
		socket_unlock(inp->inp_socket, 0);
		lck_rw_lock_exclusive(inp->inp_pcbinfo->ipi_lock);
		socket_lock(inp->inp_socket, 0);
	}
	/* Unbound local address: adopt the one in_pcbladdr selected. */
	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		inp->inp_laddr = laddr;
		/* no reference needed */
		inp->inp_last_outifp = outif;
		inp->inp_flags |= INP_INADDR_ANY;
	}
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->ipi_lock);

	/* Compute the flow hash once, the first time it is needed. */
	if (inp->inp_flowhash == 0)
		inp->inp_flowhash = inp_calc_flowhash(inp);

	tcp_set_max_rwinscale(tp, so);

	/* Enter SYN_SENT: arm the keep timer and pick the initial sequence. */
	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINIT(tp));
	tp->iss = tcp_new_isn(tp);
	tcp_sendseqinit(tp);
	if (nstat_collect)
		nstat_route_connect_attempt(inp->inp_route.ro_rt);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.
	 */
	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
		/* No cached entry: work on a zeroed scratch copy instead. */
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	tp->cc_send = CC_INC(tcp_ccgen);
	if (taop->tao_ccsent != 0 &&
	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
		taop->tao_ccsent = tp->cc_send;
	} else {
		taop->tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	}

done:
	/* Release the interface reference on both success and error paths. */
	if (outif != NULL)
		ifnet_release(outif);

	return (error);
}
1273
#if INET6
/*
 * IPv6 counterpart of tcp_connect: bind a local port if none is set,
 * choose a local address with in6_pcbladdr, check for a conflicting pcb
 * (closing an eligible TIME_WAIT one), record the foreign address/port
 * and flow information, and put the connection into SYN_SENT.
 *
 * Parameters:	tp	TCP control block being connected
 *		nam	Destination; read as a sockaddr_in6
 *		p	Process passed through to in6_pcbbind
 *
 * Returns 0 on success, EADDRINUSE on a 4-tuple conflict, or the error
 * returned by in6_pcbbind / in6_pcbladdr.
 */
static int
tcp6_connect(tp, nam, p)
	register struct tcpcb *tp;
	struct sockaddr *nam;
	struct proc *p;
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *otp;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)nam;
	struct in6_addr addr6;
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
	int error = 0;
	struct ifnet *outif = NULL;

	/* No local port assigned yet: let in6_pcbbind pick one. */
	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, NULL, p);
		if (error)
			goto done;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 *
	 * in6_pcbladdr() might return an ifp with its reference held
	 * even in the error case, so make sure that it's released
	 * whenever it's non-NULL.
	 */
	error = in6_pcbladdr(inp, nam, &addr6, &outif);
	if (error)
		goto done;
	/*
	 * The socket lock is released across the hash lookup and retaken
	 * afterwards.  The lookup uses the bound local address if there is
	 * one, otherwise the address in6_pcbladdr just chose.
	 */
	tcp_unlock(inp->inp_socket, 0, 0);
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
				  &sin6->sin6_addr, sin6->sin6_port,
				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
				  ? &addr6
				  : &inp->in6p_laddr,
				  inp->inp_lport,  0, NULL);
	tcp_lock(inp->inp_socket, 0, 0);
	/*
	 * NOTE(review): unlike tcp_connect, this path neither locks
	 * oinp's socket nor calls in_pcb_checkstate(oinp, WNT_RELEASE, ...)
	 * -- confirm whether that difference is intentional.
	 */
	if (oinp) {
		/*
		 * Only a different pcb that is in TIME_WAIT, younger than
		 * tcp_msl and that received CC options may be closed early;
		 * anything else is an address conflict.
		 */
		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
		    otp->t_state == TCPS_TIME_WAIT &&
		    ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
		    (otp->t_flags & TF_RCVD_CC)) {
			otp = tcp_close(otp);
		} else {
			error = EADDRINUSE;
			goto done;
		}
	}
	/*
	 * Take the pcbinfo lock exclusively for the rehash.  If it cannot
	 * be had immediately, drop the socket lock, block on the pcbinfo
	 * lock, then retake the socket lock.
	 */
	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) {
		/* lock inversion issue, mostly with udp multicast packets */
		socket_unlock(inp->inp_socket, 0);
		lck_rw_lock_exclusive(inp->inp_pcbinfo->ipi_lock);
		socket_lock(inp->inp_socket, 0);
	}
	/* Unbound local address: adopt the one in6_pcbladdr selected. */
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
		inp->in6p_laddr = addr6;
		inp->in6p_last_outifp = outif;	/* no reference needed */
		inp->in6p_flags |= INP_IN6ADDR_ANY;
	}
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	/* A caller-supplied flow info value replaces the stored one. */
	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
		inp->inp_flow = sin6->sin6_flowinfo;
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->ipi_lock);

	/* Compute the flow hash once, the first time it is needed. */
	if (inp->inp_flowhash == 0)
		inp->inp_flowhash = inp_calc_flowhash(inp);
	/* update flowinfo - RFC 6437 */
	if (inp->inp_flow == 0 && inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
		/* Derive the flow label bits from the flow hash. */
		inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
		inp->inp_flow |=
		    (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
	}

	tcp_set_max_rwinscale(tp, so);

	/* Enter SYN_SENT: arm the keep timer and pick the initial sequence. */
	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
		TCP_CONN_KEEPINIT(tp));
	tp->iss = tcp_new_isn(tp);
	tcp_sendseqinit(tp);
	if (nstat_collect)
		nstat_route_connect_attempt(inp->inp_route.ro_rt);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.
	 */
	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
		/* No cached entry: work on a zeroed scratch copy instead. */
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	tp->cc_send = CC_INC(tcp_ccgen);
	if (taop->tao_ccsent != 0 &&
	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
		taop->tao_ccsent = tp->cc_send;
	} else {
		taop->tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	}

done:
	/* Release the interface reference on both success and error paths. */
	if (outif != NULL)
		ifnet_release(outif);

	return (error);
}
#endif /* INET6 */
1392
1393/*
1394 * Export TCP internal state information via a struct tcp_info
1395 */
1396__private_extern__ void
1397tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
1398{
1399	struct inpcb *inp = tp->t_inpcb;
1400
1401	bzero(ti, sizeof(*ti));
1402
1403	ti->tcpi_state = tp->t_state;
1404
1405	if (tp->t_state > TCPS_LISTEN) {
1406		if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1407			ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1408		if (tp->t_flags & TF_SACK_PERMIT)
1409			ti->tcpi_options |= TCPI_OPT_SACK;
1410		if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1411			ti->tcpi_options |= TCPI_OPT_WSCALE;
1412			ti->tcpi_snd_wscale = tp->snd_scale;
1413			ti->tcpi_rcv_wscale = tp->rcv_scale;
1414		}
1415
1416		/* Are we in retranmission episode */
1417		if (tp->snd_max != tp->snd_nxt)
1418			ti->tcpi_flags |= TCPI_FLAG_LOSSRECOVERY;
1419		else
1420			ti->tcpi_flags &= ~TCPI_FLAG_LOSSRECOVERY;
1421
1422		ti->tcpi_rto = tp->t_timer[TCPT_REXMT] ? tp->t_rxtcur : 0;
1423		ti->tcpi_snd_mss = tp->t_maxseg;
1424		ti->tcpi_rcv_mss = tp->t_maxseg;
1425
1426		ti->tcpi_rttcur = tp->t_rttcur;
1427		ti->tcpi_srtt = tp->t_srtt >> TCP_RTT_SHIFT;
1428		ti->tcpi_rttvar = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
1429		ti->tcpi_rttbest = tp->t_rttbest >> TCP_RTT_SHIFT;
1430
1431		ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1432		ti->tcpi_snd_cwnd = tp->snd_cwnd;
1433		ti->tcpi_snd_sbbytes = tp->t_inpcb->inp_socket->so_snd.sb_cc;
1434
1435		ti->tcpi_rcv_space = tp->rcv_wnd;
1436
1437		ti->tcpi_snd_wnd = tp->snd_wnd;
1438		ti->tcpi_snd_nxt = tp->snd_nxt;
1439		ti->tcpi_rcv_nxt = tp->rcv_nxt;
1440
1441		/* convert bytes/msec to bits/sec */
1442		if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
1443			tp->t_bwmeas != NULL) {
1444			ti->tcpi_snd_bw	= (tp->t_bwmeas->bw_sndbw * 8000);
1445		}
1446
1447		ti->tcpi_last_outif = (tp->t_inpcb->inp_last_outifp == NULL) ? 0 :
1448		    tp->t_inpcb->inp_last_outifp->if_index;
1449
1450		//atomic_get_64(ti->tcpi_txbytes, &inp->inp_stat->txbytes);
1451		ti->tcpi_txpackets = inp->inp_stat->txpackets;
1452		ti->tcpi_txbytes = inp->inp_stat->txbytes;
1453		ti->tcpi_txretransmitbytes = tp->t_stat.txretransmitbytes;
1454		ti->tcpi_txunacked = tp->snd_max - tp->snd_una;
1455
1456		//atomic_get_64(ti->tcpi_rxbytes, &inp->inp_stat->rxbytes);
1457		ti->tcpi_rxpackets = inp->inp_stat->rxpackets;
1458		ti->tcpi_rxbytes = inp->inp_stat->rxbytes;
1459		ti->tcpi_rxduplicatebytes = tp->t_stat.rxduplicatebytes;
1460		ti->tcpi_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1461
1462		if (tp->t_state > TCPS_LISTEN) {
1463			ti->tcpi_synrexmits = tp->t_stat.synrxtshift;
1464		}
1465		ti->tcpi_cell_rxpackets = inp->inp_cstat->rxpackets;
1466		ti->tcpi_cell_rxbytes = inp->inp_cstat->rxbytes;
1467		ti->tcpi_cell_txpackets = inp->inp_cstat->txpackets;
1468		ti->tcpi_cell_txbytes = inp->inp_cstat->txbytes;
1469
1470		ti->tcpi_wifi_rxpackets = inp->inp_wstat->rxpackets;
1471		ti->tcpi_wifi_rxbytes = inp->inp_wstat->rxbytes;
1472		ti->tcpi_wifi_txpackets = inp->inp_wstat->txpackets;
1473		ti->tcpi_wifi_txbytes = inp->inp_wstat->txbytes;
1474	}
1475}
1476
1477__private_extern__ errno_t
1478tcp_fill_info_for_info_tuple(struct info_tuple *itpl, struct tcp_info *ti)
1479{
1480	struct inpcbinfo *pcbinfo = NULL;
1481	struct inpcb *inp = NULL;
1482	struct socket *so;
1483	struct tcpcb *tp;
1484
1485	if (itpl->itpl_proto == IPPROTO_TCP)
1486		pcbinfo = &tcbinfo;
1487	else
1488		return EINVAL;
1489
1490	if (itpl->itpl_local_sa.sa_family == AF_INET &&
1491		itpl->itpl_remote_sa.sa_family == AF_INET) {
1492		inp = in_pcblookup_hash(pcbinfo,
1493			itpl->itpl_remote_sin.sin_addr,
1494			itpl->itpl_remote_sin.sin_port,
1495			itpl->itpl_local_sin.sin_addr,
1496			itpl->itpl_local_sin.sin_port,
1497			0, NULL);
1498	} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
1499		itpl->itpl_remote_sa.sa_family == AF_INET6) {
1500		struct in6_addr ina6_local;
1501		struct in6_addr ina6_remote;
1502
1503		ina6_local = itpl->itpl_local_sin6.sin6_addr;
1504		if (IN6_IS_SCOPE_LINKLOCAL(&ina6_local) &&
1505			itpl->itpl_local_sin6.sin6_scope_id)
1506			ina6_local.s6_addr16[1] = htons(itpl->itpl_local_sin6.sin6_scope_id);
1507
1508		ina6_remote = itpl->itpl_remote_sin6.sin6_addr;
1509		if (IN6_IS_SCOPE_LINKLOCAL(&ina6_remote) &&
1510			itpl->itpl_remote_sin6.sin6_scope_id)
1511			ina6_remote.s6_addr16[1] = htons(itpl->itpl_remote_sin6.sin6_scope_id);
1512
1513		inp = in6_pcblookup_hash(pcbinfo,
1514			&ina6_remote,
1515			itpl->itpl_remote_sin6.sin6_port,
1516			&ina6_local,
1517			itpl->itpl_local_sin6.sin6_port,
1518			0, NULL);
1519	} else {
1520		return EINVAL;
1521	}
1522	if (inp == NULL || (so = inp->inp_socket) == NULL)
1523		return ENOENT;
1524
1525	socket_lock(so, 0);
1526	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1527		socket_unlock(so, 0);
1528		return ENOENT;
1529	}
1530	tp = intotcpcb(inp);
1531
1532	tcp_fill_info(tp, ti);
1533	socket_unlock(so, 0);
1534
1535	return 0;
1536}
1537
1538
1539__private_extern__ int
1540tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
1541{
1542	int error;
1543	struct tcp_info ti;
1544	struct info_tuple itpl;
1545	proc_t caller = PROC_NULL;
1546	proc_t caller_parent = PROC_NULL;
1547	char command_name[MAXCOMLEN + 1] = "";
1548	char parent_name[MAXCOMLEN + 1] = "";
1549
1550	if ((caller = proc_self()) != PROC_NULL) {
1551		/* get process name */
1552		strlcpy(command_name, caller->p_comm, sizeof(command_name));
1553
1554		/* get parent process name if possible */
1555		if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
1556			strlcpy(parent_name, caller_parent->p_comm,
1557			    sizeof(parent_name));
1558			proc_rele(caller_parent);
1559		}
1560
1561		if ((escape_str(command_name, strlen(command_name),
1562		    sizeof(command_name)) == 0) &&
1563		    (escape_str(parent_name, strlen(parent_name),
1564		    sizeof(parent_name)) == 0)) {
1565			kern_asl_msg(LOG_DEBUG, "messagetracer",
1566			    5,
1567			    "com.apple.message.domain",
1568			    "com.apple.kernel.tcpstat", /* 1 */
1569			    "com.apple.message.signature",
1570			    "tcpinfo", /* 2 */
1571			    "com.apple.message.signature2", command_name, /* 3 */
1572			    "com.apple.message.signature3", parent_name, /* 4 */
1573			    "com.apple.message.summarize", "YES", /* 5 */
1574			    NULL);
1575		}
1576	}
1577
1578	if (caller != PROC_NULL)
1579		proc_rele(caller);
1580
1581	if (req->newptr == USER_ADDR_NULL) {
1582		return EINVAL;
1583	}
1584	if (req->newlen < sizeof(struct info_tuple)) {
1585		return EINVAL;
1586	}
1587	error = SYSCTL_IN(req, &itpl, sizeof(struct info_tuple));
1588	if (error != 0) {
1589		return error;
1590	}
1591	error = tcp_fill_info_for_info_tuple(&itpl, &ti);
1592	if (error != 0) {
1593		return error;
1594	}
1595	error = SYSCTL_OUT(req, &ti, sizeof(struct tcp_info));
1596	if (error != 0) {
1597		return error;
1598	}
1599
1600	return 0;
1601}
1602
1603static int
1604tcp_lookup_peer_pid_locked(struct socket *so, pid_t *out_pid)
1605{
1606	int error = EHOSTUNREACH;
1607	*out_pid = -1;
1608	if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN;
1609
1610	struct inpcb	*inp = (struct inpcb*)so->so_pcb;
1611	uint16_t		lport = inp->inp_lport;
1612	uint16_t		fport = inp->inp_fport;
1613	struct inpcb	*finp = NULL;
1614
1615	if (inp->inp_vflag & INP_IPV6) {
1616		struct	in6_addr	laddr6 = inp->in6p_laddr;
1617		struct	in6_addr	faddr6 = inp->in6p_faddr;
1618		socket_unlock(so, 0);
1619		finp = in6_pcblookup_hash(&tcbinfo, &laddr6, lport, &faddr6, fport, 0, NULL);
1620		socket_lock(so, 0);
1621	} else if (inp->inp_vflag & INP_IPV4) {
1622		struct	in_addr	laddr4 = inp->inp_laddr;
1623		struct	in_addr	faddr4 = inp->inp_faddr;
1624		socket_unlock(so, 0);
1625		finp = in_pcblookup_hash(&tcbinfo, laddr4, lport, faddr4, fport, 0, NULL);
1626		socket_lock(so, 0);
1627	}
1628
1629	if (finp) {
1630		*out_pid = finp->inp_socket->last_pid;
1631		error = 0;
1632		in_pcb_checkstate(finp, WNT_RELEASE, 0);
1633	}
1634
1635	return error;
1636}
1637
1638void
1639tcp_getconninfo(struct socket *so, struct conninfo_tcp *tcp_ci)
1640{
1641	(void) tcp_lookup_peer_pid_locked(so, &tcp_ci->tcpci_peer_pid);
1642	tcp_fill_info(sototcpcb(so), &tcp_ci->tcpci_tcp_info);
1643}
1644
1645/*
1646 * The new sockopt interface makes it possible for us to block in the
1647 * copyin/out step (if we take a page fault).  Taking a page fault at
1648 * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
1649 * use TSM, there probably isn't any need for this function to run at
1650 * splnet() any more.  This needs more examination.)
1651 */
1652int
1653tcp_ctloutput(so, sopt)
1654	struct socket *so;
1655	struct sockopt *sopt;
1656{
1657	int	error, opt, optval;
1658	struct	inpcb *inp;
1659	struct	tcpcb *tp;
1660
1661	error = 0;
1662	inp = sotoinpcb(so);
1663	if (inp == NULL) {
1664		return (ECONNRESET);
1665	}
1666	/* Allow <SOL_SOCKET,SO_FLUSH/SO_TRAFFIC_MGT_BACKGROUND> at this level */
1667	if (sopt->sopt_level != IPPROTO_TCP &&
1668	    !(sopt->sopt_level == SOL_SOCKET && (sopt->sopt_name == SO_FLUSH ||
1669	    sopt->sopt_name == SO_TRAFFIC_MGT_BACKGROUND))) {
1670#if INET6
1671		if (SOCK_CHECK_DOM(so, PF_INET6))
1672			error = ip6_ctloutput(so, sopt);
1673		else
1674#endif /* INET6 */
1675		error = ip_ctloutput(so, sopt);
1676		return (error);
1677	}
1678	tp = intotcpcb(inp);
1679	if (tp == NULL) {
1680		return (ECONNRESET);
1681	}
1682
1683	calculate_tcp_clock();
1684
1685	switch (sopt->sopt_dir) {
1686	case SOPT_SET:
1687		switch (sopt->sopt_name) {
1688		case TCP_NODELAY:
1689		case TCP_NOOPT:
1690		case TCP_NOPUSH:
1691			error = sooptcopyin(sopt, &optval, sizeof optval,
1692					    sizeof optval);
1693			if (error)
1694				break;
1695
1696			switch (sopt->sopt_name) {
1697			case TCP_NODELAY:
1698				opt = TF_NODELAY;
1699				break;
1700			case TCP_NOOPT:
1701				opt = TF_NOOPT;
1702				break;
1703			case TCP_NOPUSH:
1704				opt = TF_NOPUSH;
1705				break;
1706			default:
1707				opt = 0; /* dead code to fool gcc */
1708				break;
1709			}
1710
1711			if (optval)
1712				tp->t_flags |= opt;
1713			else
1714				tp->t_flags &= ~opt;
1715			break;
1716		case TCP_RXT_FINDROP:
1717			error = sooptcopyin(sopt, &optval, sizeof optval,
1718				sizeof optval);
1719			if (error)
1720				break;
1721			opt = TF_RXTFINDROP;
1722			if (optval)
1723				tp->t_flagsext |= opt;
1724			else
1725				tp->t_flagsext &= ~opt;
1726			break;
1727		case TCP_MEASURE_SND_BW:
1728			error = sooptcopyin(sopt, &optval, sizeof optval,
1729				sizeof optval);
1730			if (error)
1731				break;
1732			opt = TF_MEASURESNDBW;
1733			if (optval) {
1734				if (tp->t_bwmeas == NULL) {
1735					tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1736					if (tp->t_bwmeas == NULL) {
1737						error = ENOMEM;
1738						break;
1739					}
1740				}
1741				tp->t_flagsext |= opt;
1742			} else {
1743				tp->t_flagsext &= ~opt;
1744				/* Reset snd bw measurement state */
1745				tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
1746				if (tp->t_bwmeas != NULL) {
1747					tcp_bwmeas_free(tp);
1748				}
1749			}
1750			break;
1751		case TCP_MEASURE_BW_BURST: {
1752			struct tcp_measure_bw_burst in;
1753			uint32_t minpkts, maxpkts;
1754			bzero(&in, sizeof(in));
1755
1756			error = sooptcopyin(sopt, &in, sizeof(in),
1757				sizeof(in));
1758			if (error)
1759				break;
1760			if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 ||
1761				tp->t_bwmeas == NULL) {
1762				error = EINVAL;
1763				break;
1764			}
1765			minpkts = (in.min_burst_size != 0) ? in.min_burst_size :
1766				tp->t_bwmeas->bw_minsizepkts;
1767			maxpkts = (in.max_burst_size != 0) ? in.max_burst_size :
1768				tp->t_bwmeas->bw_maxsizepkts;
1769			if (minpkts > maxpkts) {
1770				error = EINVAL;
1771				break;
1772			}
1773			tp->t_bwmeas->bw_minsizepkts = minpkts;
1774			tp->t_bwmeas->bw_maxsizepkts = maxpkts;
1775			tp->t_bwmeas->bw_minsize = (minpkts * tp->t_maxseg);
1776			tp->t_bwmeas->bw_maxsize = (maxpkts * tp->t_maxseg);
1777			break;
1778		}
1779		case TCP_MAXSEG:
1780			error = sooptcopyin(sopt, &optval, sizeof optval,
1781					    sizeof optval);
1782			if (error)
1783				break;
1784
1785			if (optval > 0 && optval <= tp->t_maxseg &&
1786			    optval + 40 >= tcp_minmss)
1787				tp->t_maxseg = optval;
1788			else
1789				error = EINVAL;
1790			break;
1791
1792		case TCP_KEEPALIVE:
1793			error = sooptcopyin(sopt, &optval, sizeof optval,
1794						sizeof optval);
1795			if (error)
1796				break;
1797			if (optval < 0 || optval > UINT32_MAX/TCP_RETRANSHZ) {
1798				error = EINVAL;
1799			} else {
1800				tp->t_keepidle = optval * TCP_RETRANSHZ;
1801				/* reset the timer to new value */
1802				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1803					TCP_CONN_KEEPIDLE(tp));
1804				tcp_check_timer_state(tp);
1805			}
1806                        break;
1807
1808		case TCP_CONNECTIONTIMEOUT:
1809			error = sooptcopyin(sopt, &optval, sizeof optval,
1810						sizeof optval);
1811			if (error)
1812				break;
1813			if (optval < 0 || optval > UINT32_MAX/TCP_RETRANSHZ) {
1814				error = EINVAL;
1815			} else {
1816				tp->t_keepinit = optval * TCP_RETRANSHZ;
1817				if (tp->t_state == TCPS_SYN_RECEIVED ||
1818					tp->t_state == TCPS_SYN_SENT) {
1819					tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1820						TCP_CONN_KEEPINIT(tp));
1821					tcp_check_timer_state(tp);
1822				}
1823			}
1824			break;
1825
1826		case TCP_KEEPINTVL:
1827			error = sooptcopyin(sopt, &optval, sizeof(optval),
1828				sizeof(optval));
1829			if (error)
1830				break;
1831			if (optval < 0 || optval > UINT32_MAX/TCP_RETRANSHZ) {
1832				error = EINVAL;
1833			} else {
1834				tp->t_keepintvl = optval * TCP_RETRANSHZ;
1835				if (tp->t_state == TCPS_FIN_WAIT_2 &&
1836					TCP_CONN_MAXIDLE(tp) > 0) {
1837					tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
1838						TCP_CONN_MAXIDLE(tp));
1839					tcp_check_timer_state(tp);
1840				}
1841			}
1842			break;
1843
1844		case TCP_KEEPCNT:
1845			error = sooptcopyin(sopt, &optval, sizeof(optval),
1846				sizeof(optval));
1847			if (error)
1848				break;
1849			if (optval < 0 || optval > INT32_MAX) {
1850				error = EINVAL;
1851			} else {
1852				tp->t_keepcnt = optval;
1853				if (tp->t_state == TCPS_FIN_WAIT_2 &&
1854					TCP_CONN_MAXIDLE(tp) > 0) {
1855					tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
1856						TCP_CONN_MAXIDLE(tp));
1857					tcp_check_timer_state(tp);
1858				}
1859			}
1860			break;
1861
1862		case PERSIST_TIMEOUT:
1863			error = sooptcopyin(sopt, &optval, sizeof optval,
1864						sizeof optval);
1865			if (error)
1866				break;
1867			if (optval < 0)
1868				error = EINVAL;
1869			else
1870				tp->t_persist_timeout = optval * TCP_RETRANSHZ;
1871			break;
1872		case TCP_RXT_CONNDROPTIME:
1873			error = sooptcopyin(sopt, &optval, sizeof(optval),
1874					sizeof(optval));
1875			if (error)
1876				break;
1877			if (optval < 0)
1878				error = EINVAL;
1879			else
1880				tp->t_rxt_conndroptime = optval * TCP_RETRANSHZ;
1881			break;
1882		case TCP_NOTSENT_LOWAT:
1883			error = sooptcopyin(sopt, &optval, sizeof(optval),
1884				sizeof(optval));
1885			if (error)
1886				break;
1887			if (optval < 0) {
1888				error = EINVAL;
1889				break;
1890			} else {
1891				if (optval == 0) {
1892					so->so_flags &= ~(SOF_NOTSENT_LOWAT);
1893					tp->t_notsent_lowat = 0;
1894				} else {
1895					so->so_flags |= SOF_NOTSENT_LOWAT;
1896					tp->t_notsent_lowat = optval;
1897				}
1898			}
1899			break;
1900		case TCP_ADAPTIVE_READ_TIMEOUT:
1901			error = sooptcopyin(sopt, &optval, sizeof (optval),
1902				sizeof(optval));
1903			if (error)
1904				break;
1905			if (optval < 0 ||
1906				optval > TCP_ADAPTIVE_TIMEOUT_MAX) {
1907				error = EINVAL;
1908				break;
1909			} else if (optval == 0) {
1910				tp->t_adaptive_rtimo = 0;
1911				tcp_keepalive_reset(tp);
1912			} else {
1913				tp->t_adaptive_rtimo = optval;
1914			}
1915			break;
1916		case TCP_ADAPTIVE_WRITE_TIMEOUT:
1917			error = sooptcopyin(sopt, &optval, sizeof (optval),
1918				sizeof (optval));
1919			if (error)
1920				break;
1921			if (optval < 0 ||
1922				optval > TCP_ADAPTIVE_TIMEOUT_MAX) {
1923				error = EINVAL;
1924				break;
1925			} else {
1926				tp->t_adaptive_wtimo = optval;
1927			}
1928			break;
1929		case TCP_ENABLE_MSGS:
1930			error = sooptcopyin(sopt, &optval, sizeof(optval),
1931				sizeof(optval));
1932			if (error)
1933				break;
1934			if (optval < 0 || optval > 1) {
1935				error = EINVAL;
1936			} else if (optval == 1) {
1937				/*
1938				 * Check if messages option is already
1939				 * enabled, if so return.
1940				 */
1941				if (so->so_flags & SOF_ENABLE_MSGS) {
1942					VERIFY(so->so_msg_state != NULL);
1943					break;
1944				}
1945
1946				/*
1947				 * allocate memory for storing message
1948				 * related state
1949				 */
1950				VERIFY(so->so_msg_state == NULL);
1951				MALLOC(so->so_msg_state,
1952					struct msg_state *,
1953					sizeof(struct msg_state),
1954					M_TEMP, M_WAITOK | M_ZERO);
1955				if (so->so_msg_state == NULL) {
1956					error = ENOMEM;
1957					break;
1958				}
1959
1960				/* Enable message delivery */
1961				so->so_flags |= SOF_ENABLE_MSGS;
1962			} else {
1963				/*
1964				 * Can't disable message delivery on socket
1965				 * because of restrictions imposed by
1966				 * encoding/decoding
1967				 */
1968				error = EINVAL;
1969			}
1970			break;
1971		case TCP_SENDMOREACKS:
1972			error = sooptcopyin(sopt, &optval, sizeof(optval),
1973				sizeof(optval));
1974			if (error)
1975				break;
1976			if (optval < 0 || optval > 1) {
1977				error = EINVAL;
1978			} else if (optval == 0) {
1979				tp->t_flagsext &= ~(TF_NOSTRETCHACK);
1980			} else {
1981				tp->t_flagsext |= TF_NOSTRETCHACK;
1982			}
1983			break;
1984		case SO_FLUSH:
1985			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
1986			    sizeof (optval))) != 0)
1987				break;
1988
1989			error = inp_flush(inp, optval);
1990			break;
1991
1992		case SO_TRAFFIC_MGT_BACKGROUND:
1993			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
1994			    sizeof (optval))) != 0)
1995				break;
1996
1997			if (optval) {
1998				socket_set_traffic_mgt_flags_locked(so,
1999				    TRAFFIC_MGT_SO_BACKGROUND);
2000			} else {
2001				socket_clear_traffic_mgt_flags_locked(so,
2002				    TRAFFIC_MGT_SO_BACKGROUND);
2003			}
2004			break;
2005
2006		default:
2007			error = ENOPROTOOPT;
2008			break;
2009		}
2010		break;
2011
2012	case SOPT_GET:
2013		switch (sopt->sopt_name) {
2014		case TCP_NODELAY:
2015			optval = tp->t_flags & TF_NODELAY;
2016			break;
2017		case TCP_MAXSEG:
2018			optval = tp->t_maxseg;
2019			break;
2020		case TCP_KEEPALIVE:
2021			optval = tp->t_keepidle / TCP_RETRANSHZ;
2022			break;
2023		case TCP_KEEPINTVL:
2024			optval = tp->t_keepintvl / TCP_RETRANSHZ;
2025			break;
2026		case TCP_KEEPCNT:
2027			optval = tp->t_keepcnt;
2028			break;
2029		case TCP_NOOPT:
2030			optval = tp->t_flags & TF_NOOPT;
2031			break;
2032		case TCP_NOPUSH:
2033			optval = tp->t_flags & TF_NOPUSH;
2034			break;
2035		case TCP_CONNECTIONTIMEOUT:
2036			optval = tp->t_keepinit / TCP_RETRANSHZ;
2037			break;
2038		case PERSIST_TIMEOUT:
2039			optval = tp->t_persist_timeout / TCP_RETRANSHZ;
2040			break;
2041		case TCP_RXT_CONNDROPTIME:
2042			optval = tp->t_rxt_conndroptime / TCP_RETRANSHZ;
2043			break;
2044		case TCP_RXT_FINDROP:
2045			optval = tp->t_flagsext & TF_RXTFINDROP;
2046			break;
2047		case TCP_MEASURE_SND_BW:
2048			optval = tp->t_flagsext & TF_MEASURESNDBW;
2049			break;
2050		case TCP_INFO: {
2051			struct tcp_info ti;
2052
2053			tcp_fill_info(tp, &ti);
2054			error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info));
2055			goto done;
2056			/* NOT REACHED */
2057		}
2058		case TCP_MEASURE_BW_BURST: {
2059			struct tcp_measure_bw_burst out;
2060			if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 ||
2061				tp->t_bwmeas == NULL) {
2062				error = EINVAL;
2063				break;
2064			}
2065			out.min_burst_size = tp->t_bwmeas->bw_minsizepkts;
2066			out.max_burst_size = tp->t_bwmeas->bw_maxsizepkts;
2067			error = sooptcopyout(sopt, &out, sizeof(out));
2068			goto done;
2069		}
2070		case TCP_NOTSENT_LOWAT:
2071			if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
2072				optval = tp->t_notsent_lowat;
2073			} else {
2074				optval = 0;
2075			}
2076			break;
2077
2078		case TCP_ENABLE_MSGS:
2079			if (so->so_flags & SOF_ENABLE_MSGS) {
2080				optval = 1;
2081			} else {
2082				optval = 0;
2083			}
2084			break;
2085		case TCP_SENDMOREACKS:
2086			if (tp->t_flagsext & TF_NOSTRETCHACK)
2087				optval = 1;
2088			else
2089				optval = 0;
2090			break;
2091		case TCP_PEER_PID: {
2092			pid_t	pid;
2093			error = tcp_lookup_peer_pid_locked(so, &pid);
2094			if (error == 0)
2095				error = sooptcopyout(sopt, &pid, sizeof(pid));
2096			goto done;
2097		}
2098		case TCP_ADAPTIVE_READ_TIMEOUT:
2099			optval = tp->t_adaptive_rtimo;
2100			break;
2101		case TCP_ADAPTIVE_WRITE_TIMEOUT:
2102			optval = tp->t_adaptive_wtimo;
2103			break;
2104		case SO_TRAFFIC_MGT_BACKGROUND:
2105			optval = (so->so_traffic_mgt_flags &
2106			    TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
2107			break;
2108		default:
2109			error = ENOPROTOOPT;
2110			break;
2111		}
2112		if (error == 0)
2113			error = sooptcopyout(sopt, &optval, sizeof optval);
2114		break;
2115	}
2116done:
2117	return (error);
2118}
2119
/*
 * tcp_sendspace and tcp_recvspace are the default send and receive window
 * sizes, respectively.  These are obsolescent (this information should
 * be set by the route).
 *
 * tcp_attach() passes both to soreserve() when a socket has no buffer
 * space reserved yet, and sysctl_tcp_sospace() lets them be changed at
 * run time.  The initial values are written as multiples of 1448.
 */
u_int32_t	tcp_sendspace = 1448*256;
u_int32_t	tcp_recvspace = 1448*384;
2127
2128/* During attach, the size of socket buffer allocated is limited to
2129 * sb_max in sbreserve. Disallow setting the tcp send and recv space
2130 * to be more than sb_max because that will cause tcp_attach to fail
2131 * (see radar 5713060)
2132 */
2133static int
2134sysctl_tcp_sospace(struct sysctl_oid *oidp, __unused void *arg1,
2135	__unused int arg2, struct sysctl_req *req) {
2136	u_int32_t new_value = 0, *space_p = NULL;
2137	int changed = 0, error = 0;
2138	u_quad_t sb_effective_max = (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES;
2139
2140	switch (oidp->oid_number) {
2141		case TCPCTL_SENDSPACE:
2142			space_p = &tcp_sendspace;
2143			break;
2144		case TCPCTL_RECVSPACE:
2145			space_p = &tcp_recvspace;
2146			break;
2147		default:
2148			return EINVAL;
2149	}
2150	error = sysctl_io_number(req, *space_p, sizeof(u_int32_t),
2151		&new_value, &changed);
2152	if (changed) {
2153		if (new_value > 0 && new_value <= sb_effective_max) {
2154			*space_p = new_value;
2155		} else {
2156			error = ERANGE;
2157		}
2158	}
2159	return error;
2160}
2161
/*
 * Register tcp_sendspace and tcp_recvspace under _net_inet_tcp as
 * "sendspace" and "recvspace".  Both go through sysctl_tcp_sospace(),
 * which range-checks new values.
 * NOTE(review): the description strings speak of a "datagram size",
 * while the variables hold default socket buffer reservations -- confirm
 * whether the wording should be updated.
 */
SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_sendspace , 0, &sysctl_tcp_sospace, "IU", "Maximum outgoing TCP datagram size");
SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_recvspace , 0, &sysctl_tcp_sospace, "IU", "Maximum incoming TCP datagram size");
2166
2167
2168/*
2169 * Attach TCP protocol to socket, allocating
2170 * internet protocol control block, tcp control block,
2171 * bufer space, and entering LISTEN state if to accept connections.
2172 *
2173 * Returns:	0			Success
2174 *	in_pcballoc:ENOBUFS
2175 *	in_pcballoc:ENOMEM
2176 *	in_pcballoc:???			[IPSEC specific]
2177 *	soreserve:ENOBUFS
2178 */
2179static int
2180tcp_attach(so, p)
2181	struct socket *so;
2182	struct proc *p;
2183{
2184	register struct tcpcb *tp;
2185	struct inpcb *inp;
2186	int error;
2187#if INET6
2188	int isipv6 = SOCK_CHECK_DOM(so, PF_INET6) != 0;
2189#endif
2190
2191	error = in_pcballoc(so, &tcbinfo, p);
2192	if (error)
2193		return (error);
2194
2195	inp = sotoinpcb(so);
2196
2197	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
2198		error = soreserve(so, tcp_sendspace, tcp_recvspace);
2199		if (error)
2200			return (error);
2201	}
2202	if ((so->so_rcv.sb_flags & SB_USRSIZE) == 0)
2203		so->so_rcv.sb_flags |= SB_AUTOSIZE;
2204	if ((so->so_snd.sb_flags & SB_USRSIZE) == 0)
2205		so->so_snd.sb_flags |= SB_AUTOSIZE;
2206
2207#if INET6
2208	if (isipv6) {
2209		inp->inp_vflag |= INP_IPV6;
2210		inp->in6p_hops = -1;	/* use kernel default */
2211	}
2212	else
2213#endif /* INET6 */
2214	inp->inp_vflag |= INP_IPV4;
2215	tp = tcp_newtcpcb(inp);
2216	if (tp == NULL) {
2217		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
2218
2219		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
2220#if INET6
2221		if (isipv6)
2222			in6_pcbdetach(inp);
2223		else
2224#endif /* INET6 */
2225		in_pcbdetach(inp);
2226		so->so_state |= nofd;
2227		return (ENOBUFS);
2228	}
2229	if (nstat_collect) {
2230		nstat_tcp_new_pcb(inp);
2231	}
2232	tp->t_state = TCPS_CLOSED;
2233	return (0);
2234}
2235
2236/*
2237 * Initiate (or continue) disconnect.
2238 * If embryonic state, just send reset (once).
2239 * If in ``let data drain'' option and linger null, just drop.
2240 * Otherwise (hard), mark socket disconnecting and drop
2241 * current input data; switch states based on user close, and
2242 * send segment to peer (with FIN).
2243 */
2244static struct tcpcb *
2245tcp_disconnect(tp)
2246	register struct tcpcb *tp;
2247{
2248	struct socket *so = tp->t_inpcb->inp_socket;
2249
2250	if (tp->t_state < TCPS_ESTABLISHED)
2251		tp = tcp_close(tp);
2252	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
2253		tp = tcp_drop(tp, 0);
2254	else {
2255		soisdisconnecting(so);
2256		sbflush(&so->so_rcv);
2257		tp = tcp_usrclosed(tp);
2258#if MPTCP
2259		/* A reset has been sent but socket exists, do not send FIN */
2260		if ((so->so_flags & SOF_MP_SUBFLOW) &&
2261		    (tp) && (tp->t_mpflags & TMPF_RESET))
2262			return (tp);
2263#endif
2264		if (tp)
2265			(void) tcp_output(tp);
2266	}
2267	return (tp);
2268}
2269
2270/*
2271 * User issued close, and wish to trail through shutdown states:
2272 * if never received SYN, just forget it.  If got a SYN from peer,
2273 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
2274 * If already got a FIN from peer, then almost done; go to LAST_ACK
2275 * state.  In all other cases, have already sent FIN to peer (e.g.
2276 * after PRU_SHUTDOWN), and just have to play tedious game waiting
2277 * for peer to send FIN or not respond to keep-alives, etc.
2278 * We can let the user exit from the close as soon as the FIN is acked.
2279 */
2280static struct tcpcb *
2281tcp_usrclosed(tp)
2282	register struct tcpcb *tp;
2283{
2284
2285	switch (tp->t_state) {
2286
2287	case TCPS_CLOSED:
2288	case TCPS_LISTEN:
2289		tp = tcp_close(tp);
2290		break;
2291
2292	case TCPS_SYN_SENT:
2293	case TCPS_SYN_RECEIVED:
2294		tp->t_flags |= TF_NEEDFIN;
2295		break;
2296
2297	case TCPS_ESTABLISHED:
2298		DTRACE_TCP4(state__change, void, NULL,
2299			struct inpcb *, tp->t_inpcb,
2300			struct tcpcb *, tp,
2301			int32_t, TCPS_FIN_WAIT_1);
2302		tp->t_state = TCPS_FIN_WAIT_1;
2303		break;
2304
2305	case TCPS_CLOSE_WAIT:
2306		DTRACE_TCP4(state__change, void, NULL,
2307			struct inpcb *, tp->t_inpcb,
2308			struct tcpcb *, tp,
2309			int32_t, TCPS_LAST_ACK);
2310		tp->t_state = TCPS_LAST_ACK;
2311		break;
2312	}
2313	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
2314		soisdisconnected(tp->t_inpcb->inp_socket);
2315		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
2316		if (tp->t_state == TCPS_FIN_WAIT_2)
2317			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
2318				TCP_CONN_MAXIDLE(tp));
2319	}
2320	return (tp);
2321}
2322
2323void
2324tcp_in_cksum_stats(u_int32_t len)
2325{
2326	tcpstat.tcps_rcv_swcsum++;
2327	tcpstat.tcps_rcv_swcsum_bytes += len;
2328}
2329
2330void
2331tcp_out_cksum_stats(u_int32_t len)
2332{
2333	tcpstat.tcps_snd_swcsum++;
2334	tcpstat.tcps_snd_swcsum_bytes += len;
2335}
2336
2337#if INET6
2338void
2339tcp_in6_cksum_stats(u_int32_t len)
2340{
2341	tcpstat.tcps_rcv6_swcsum++;
2342	tcpstat.tcps_rcv6_swcsum_bytes += len;
2343}
2344
2345void
2346tcp_out6_cksum_stats(u_int32_t len)
2347{
2348	tcpstat.tcps_snd6_swcsum++;
2349	tcpstat.tcps_snd6_swcsum_bytes += len;
2350}
2351
2352/*
2353 * When messages are enabled on a TCP socket, the message priority
2354 * is sent as a control message. This function will extract it.
2355 */
2356int
2357tcp_get_msg_priority(struct mbuf *control, uint32_t *msgpri)
2358{
2359	struct cmsghdr *cm;
2360	if (control == NULL)
2361		return(EINVAL);
2362
2363	for (cm = M_FIRST_CMSGHDR(control); cm;
2364		cm = M_NXT_CMSGHDR(control, cm)) {
2365		if (cm->cmsg_len < sizeof(struct cmsghdr) ||
2366			cm->cmsg_len > control->m_len) {
2367			return (EINVAL);
2368		}
2369		if (cm->cmsg_level == SOL_SOCKET &&
2370			cm->cmsg_type == SCM_MSG_PRIORITY) {
2371			*msgpri = *(unsigned int *)(void *)CMSG_DATA(cm);
2372			break;
2373		}
2374	}
2375
2376	VERIFY(*msgpri >= MSG_PRI_MIN && *msgpri <= MSG_PRI_MAX);
2377	return (0);
2378}
2379#endif /* INET6 */
2380