1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1993
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
61 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.9 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/kernel.h>
68#include <sys/sysctl.h>
69#include <sys/mbuf.h>
70#if INET6
71#include <sys/domain.h>
72#endif /* INET6 */
73#include <sys/kasl.h>
74#include <sys/socket.h>
75#include <sys/socketvar.h>
76#include <sys/protosw.h>
77#include <sys/syslog.h>
78
79#include <net/if.h>
80#include <net/route.h>
81#include <net/ntstat.h>
82#include <net/content_filter.h>
83
84#include <netinet/in.h>
85#include <netinet/in_systm.h>
86#if INET6
87#include <netinet/ip6.h>
88#endif
89#include <netinet/in_pcb.h>
90#if INET6
91#include <netinet6/in6_pcb.h>
92#endif
93#include <netinet/in_var.h>
94#include <netinet/ip_var.h>
95#if INET6
96#include <netinet6/ip6_var.h>
97#endif
98#include <netinet/tcp.h>
99#include <netinet/tcp_fsm.h>
100#include <netinet/tcp_seq.h>
101#include <netinet/tcp_timer.h>
102#include <netinet/tcp_var.h>
103#include <netinet/tcpip.h>
104#include <mach/sdt.h>
105#if TCPDEBUG
106#include <netinet/tcp_debug.h>
107#endif
108#if MPTCP
109#include <netinet/mptcp_var.h>
110#endif /* MPTCP */
111
112#if IPSEC
113#include <netinet6/ipsec.h>
114#endif /*IPSEC*/
115
116#if FLOW_DIVERT
117#include <netinet/flow_divert.h>
118#endif /* FLOW_DIVERT */
119
120void	tcp_fill_info(struct tcpcb *, struct tcp_info *);
121errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *);
122
123int tcp_sysctl_info(struct sysctl_oid *, void *, int , struct sysctl_req *);
124
125/*
126 * TCP protocol interface to socket abstraction.
127 */
128extern	char *tcpstates[];	/* XXX ??? */
129
130static int	tcp_attach(struct socket *, struct proc *);
131static int	tcp_connect(struct tcpcb *, struct sockaddr *, struct proc *);
132#if INET6
133static int	tcp6_connect(struct tcpcb *, struct sockaddr *, struct proc *);
134static int	tcp6_usr_connect(struct socket *, struct sockaddr *,
135		    struct proc *);
136#endif /* INET6 */
137static struct tcpcb *
138		tcp_disconnect(struct tcpcb *);
139static struct tcpcb *
140		tcp_usrclosed(struct tcpcb *);
141
142extern uint32_t tcp_autorcvbuf_max;
143
144extern void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
145
/*
 * Debug-trace hooks for the pru_*() handlers in this file.  They rely on
 * the caller having locals named `so' and `tp' in scope:
 *
 *	TCPDEBUG0	declares `ostate', the saved connection state
 *	TCPDEBUG1()	records tp->t_state (0 when tp is NULL) in `ostate'
 *	TCPDEBUG2(req)	calls tcp_trace() for request `req' when tp is
 *			non-NULL and SO_DEBUG is set on the socket
 *
 * All three expand to nothing when TCPDEBUG is off.  TCPDEBUG1() is fully
 * parenthesised and TCPDEBUG2() is wrapped in do { } while (0) so that each
 * behaves as a single statement; in particular TCPDEBUG2() cannot capture
 * an `else' that follows it.
 */
#if TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	(ostate = (tp != NULL) ? tp->t_state : 0)
#define	TCPDEBUG2(req)	do {						\
	if (tp != NULL && (so->so_options & SO_DEBUG))			\
		tcp_trace(TA_USER, ostate, tp, 0, 0, req);		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
156
/*
 * Register the "info" sysctl node beneath the _net_inet_tcp parent.
 * Requests on the node are handled by tcp_sysctl_info(); the node's
 * description string is "TCP info per tuple".
 */
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, info,
    CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
    0 , 0, tcp_sysctl_info, "S", "TCP info per tuple");
160
161/*
162 * TCP attaches to socket via pru_attach(), reserving space,
163 * and an internet control block.
164 *
165 * Returns:	0			Success
166 *		EISCONN
167 *	tcp_attach:ENOBUFS
168 *	tcp_attach:ENOMEM
169 *	tcp_attach:???			[IPSEC specific]
170 */
171static int
172tcp_usr_attach(struct socket *so, __unused int proto, struct proc *p)
173{
174	int error;
175	struct inpcb *inp = sotoinpcb(so);
176	struct tcpcb *tp = 0;
177	TCPDEBUG0;
178
179	TCPDEBUG1();
180	if (inp) {
181		error = EISCONN;
182		goto out;
183	}
184
185	error = tcp_attach(so, p);
186	if (error)
187		goto out;
188
189	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
190		so->so_linger = TCP_LINGERTIME * hz;
191	tp = sototcpcb(so);
192out:
193	TCPDEBUG2(PRU_ATTACH);
194	return error;
195}
196
197/*
198 * pru_detach() detaches the TCP protocol from the socket.
199 * If the protocol state is non-embryonic, then can't
200 * do this directly: have to initiate a pru_disconnect(),
201 * which may finish later; embryonic TCB's can just
202 * be discarded here.
203 */
204static int
205tcp_usr_detach(struct socket *so)
206{
207	int error = 0;
208	struct inpcb *inp = sotoinpcb(so);
209	struct tcpcb *tp;
210	TCPDEBUG0;
211
212	if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
213		return EINVAL;	/* XXX */
214	}
215	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
216	tp = intotcpcb(inp);
217	/* In case we got disconnected from the peer */
218        if (tp == NULL)
219		goto out;
220	TCPDEBUG1();
221
222	calculate_tcp_clock();
223
224	tp = tcp_disconnect(tp);
225out:
226	TCPDEBUG2(PRU_DETACH);
227	return error;
228}
229
/*
 * Shared prologue and epilogue for the pru_*() handlers.  Both expect the
 * enclosing function to have locals named `so', `inp', `tp' and `error'.
 *
 * COMMON_START() declares the debug state (TCPDEBUG0), returns EINVAL when
 * the PCB is missing or dead, returns EPROTOTYPE (NECP builds only) when
 * the socket should use flow divert, then loads `tp' from the PCB, records
 * the debug state and calls calculate_tcp_clock().
 *
 * COMMON_END(req) defines the `out' label, emits the debug trace for `req'
 * and returns `error'.  The trailing `goto out' is unreachable; it keeps
 * the label referenced in handlers that never jump to it.
 */
#if NECP
#define	COMMON_NECP_CHECK()	do {					\
	if (necp_socket_should_use_flow_divert(inp))			\
		return (EPROTOTYPE);					\
} while (0)
#else /* !NECP */
#define	COMMON_NECP_CHECK()	do { } while (0)
#endif /* !NECP */

#define	COMMON_START()	TCPDEBUG0;					\
do {									\
	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)		\
		return (EINVAL);					\
	COMMON_NECP_CHECK();						\
	tp = intotcpcb(inp);						\
	TCPDEBUG1();							\
	calculate_tcp_clock();						\
} while (0)

#define COMMON_END(req)	out: TCPDEBUG2(req); return error; goto out
253
254
255/*
256 * Give the socket an address.
257 *
258 * Returns:	0			Success
259 *		EINVAL			Invalid argument [COMMON_START]
260 *		EAFNOSUPPORT		Address family not supported
261 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
262 *	in_pcbbind:EINVAL		Invalid argument
263 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
264 *	in_pcbbind:EACCES		Permission denied
265 *	in_pcbbind:EADDRINUSE		Address in use
266 *	in_pcbbind:EAGAIN		Resource unavailable, try again
267 *	in_pcbbind:EPERM		Operation not permitted
268 */
269static int
270tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
271{
272	int error = 0;
273	struct inpcb *inp = sotoinpcb(so);
274	struct tcpcb *tp;
275	struct sockaddr_in *sinp;
276
277	COMMON_START();
278
279	if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
280		error = EAFNOSUPPORT;
281		goto out;
282	}
283
284	/*
285	 * Must check for multicast addresses and disallow binding
286	 * to them.
287	 */
288	sinp = (struct sockaddr_in *)(void *)nam;
289	if (sinp->sin_family == AF_INET &&
290	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
291		error = EAFNOSUPPORT;
292		goto out;
293	}
294	error = in_pcbbind(inp, nam, p);
295	if (error)
296		goto out;
297	COMMON_END(PRU_BIND);
298
299}
300
301#if INET6
302static int
303tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
304{
305	int error = 0;
306	struct inpcb *inp = sotoinpcb(so);
307	struct tcpcb *tp;
308	struct sockaddr_in6 *sin6p;
309
310	COMMON_START();
311
312	if (nam->sa_family != 0 && nam->sa_family != AF_INET6) {
313		error = EAFNOSUPPORT;
314		goto out;
315	}
316
317	/*
318	 * Must check for multicast addresses and disallow binding
319	 * to them.
320	 */
321	sin6p = (struct sockaddr_in6 *)(void *)nam;
322	if (sin6p->sin6_family == AF_INET6 &&
323	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
324		error = EAFNOSUPPORT;
325		goto out;
326	}
327	inp->inp_vflag &= ~INP_IPV4;
328	inp->inp_vflag |= INP_IPV6;
329	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
330		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
331			inp->inp_vflag |= INP_IPV4;
332		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
333			struct sockaddr_in sin;
334
335			in6_sin6_2_sin(&sin, sin6p);
336			inp->inp_vflag |= INP_IPV4;
337			inp->inp_vflag &= ~INP_IPV6;
338			error = in_pcbbind(inp, (struct sockaddr *)&sin, p);
339			goto out;
340		}
341	}
342	error = in6_pcbbind(inp, nam, p);
343	if (error)
344		goto out;
345	COMMON_END(PRU_BIND);
346}
347#endif /* INET6 */
348
349/*
350 * Prepare to accept connections.
351 *
352 * Returns:	0			Success
353 *		EINVAL [COMMON_START]
354 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
355 *	in_pcbbind:EINVAL		Invalid argument
356 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
357 *	in_pcbbind:EACCES		Permission denied
358 *	in_pcbbind:EADDRINUSE		Address in use
359 *	in_pcbbind:EAGAIN		Resource unavailable, try again
360 *	in_pcbbind:EPERM		Operation not permitted
361 */
362static int
363tcp_usr_listen(struct socket *so, struct proc *p)
364{
365	int error = 0;
366	struct inpcb *inp = sotoinpcb(so);
367	struct tcpcb *tp;
368
369	COMMON_START();
370	if (inp->inp_lport == 0)
371		error = in_pcbbind(inp, NULL, p);
372	if (error == 0)
373		tp->t_state = TCPS_LISTEN;
374	COMMON_END(PRU_LISTEN);
375}
376
377#if INET6
378static int
379tcp6_usr_listen(struct socket *so, struct proc *p)
380{
381	int error = 0;
382	struct inpcb *inp = sotoinpcb(so);
383	struct tcpcb *tp;
384
385	COMMON_START();
386	if (inp->inp_lport == 0) {
387		inp->inp_vflag &= ~INP_IPV4;
388		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
389			inp->inp_vflag |= INP_IPV4;
390		error = in6_pcbbind(inp, NULL, p);
391	}
392	if (error == 0)
393		tp->t_state = TCPS_LISTEN;
394	COMMON_END(PRU_LISTEN);
395}
396#endif /* INET6 */
397
398/*
399 * Initiate connection to peer.
400 * Create a template for use in transmissions on this connection.
401 * Enter SYN_SENT state, and mark socket as connecting.
402 * Start keep-alive timer, and seed output sequence space.
403 * Send initial segment on connection.
404 */
405static int
406tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
407{
408	int error = 0;
409	struct inpcb *inp = sotoinpcb(so);
410	struct tcpcb *tp;
411	struct sockaddr_in *sinp;
412
413	TCPDEBUG0;
414	if (inp == NULL) {
415		return EINVAL;
416	} else if (inp->inp_state == INPCB_STATE_DEAD) {
417		if (so->so_error) {
418			error = so->so_error;
419			so->so_error = 0;
420			return error;
421		} else
422			return EINVAL;
423	}
424#if NECP
425#if FLOW_DIVERT
426	else if (necp_socket_should_use_flow_divert(inp)) {
427		uint32_t fd_ctl_unit = necp_socket_get_flow_divert_control_unit(inp);
428		if (fd_ctl_unit > 0) {
429			error = flow_divert_pcb_init(so, fd_ctl_unit);
430			if (error == 0) {
431				error = flow_divert_connect_out(so, nam, p);
432			}
433		} else {
434			error = ENETDOWN;
435		}
436		return error;
437	}
438#endif /* FLOW_DIVERT */
439#if CONTENT_FILTER
440	error = cfil_sock_attach(so);
441	if (error != 0)
442		return error;
443#endif /* CONTENT_FILTER */
444#endif /* NECP */
445	tp = intotcpcb(inp);
446	TCPDEBUG1();
447
448	calculate_tcp_clock();
449
450	if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
451		error = EAFNOSUPPORT;
452		goto out;
453	}
454	/*
455	 * Must disallow TCP ``connections'' to multicast addresses.
456	 */
457	sinp = (struct sockaddr_in *)(void *)nam;
458	if (sinp->sin_family == AF_INET
459	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
460		error = EAFNOSUPPORT;
461		goto out;
462	}
463
464	if ((error = tcp_connect(tp, nam, p)) != 0)
465		goto out;
466	error = tcp_output(tp);
467	COMMON_END(PRU_CONNECT);
468}
469
470static int
471tcp_usr_connectx_common(struct socket *so, int af,
472    struct sockaddr_list **src_sl, struct sockaddr_list **dst_sl,
473    struct proc *p, uint32_t ifscope, associd_t aid, connid_t *pcid,
474    uint32_t flags, void *arg, uint32_t arglen)
475{
476#pragma unused(aid)
477#if !MPTCP
478#pragma unused(flags, arg, arglen)
479#endif /* !MPTCP */
480	struct sockaddr_entry *src_se = NULL, *dst_se = NULL;
481	struct inpcb *inp = sotoinpcb(so);
482	int error;
483
484	if (inp == NULL)
485		return (EINVAL);
486
487	VERIFY(dst_sl != NULL);
488
489	/* select source (if specified) and destination addresses */
490	error = in_selectaddrs(af, src_sl, &src_se, dst_sl, &dst_se);
491	if (error != 0)
492		return (error);
493
494	VERIFY(*dst_sl != NULL && dst_se != NULL);
495	VERIFY(src_se == NULL || *src_sl != NULL);
496	VERIFY(dst_se->se_addr->sa_family == af);
497	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
498
499#if NECP
500	inp_update_necp_policy(inp, src_se ? src_se->se_addr : NULL, dst_se ? dst_se->se_addr : NULL, ifscope);
501#endif /* NECP */
502
503	/*
504	 * We get here for 2 cases:
505	 *
506	 *   a. From MPTCP, to connect a subflow.  There is no need to
507	 *	bind the socket to the source address and/or interface,
508	 *	since everything has been taken care of by MPTCP.  We
509	 *	simply check whether or not this is for the initial
510	 *	MPTCP connection attempt, or to join an existing one.
511	 *
512	 *   b.	From the socket layer, to connect a TCP.  Perform the
513	 *	bind to source address and/or interface as necessary.
514	 */
515#if MPTCP
516	if (flags & TCP_CONNREQF_MPTCP) {
517		struct mptsub_connreq *mpcr = arg;
518
519		/* Check to make sure this came down from MPTCP */
520		if (arg == NULL || arglen != sizeof (*mpcr))
521			return (EOPNOTSUPP);
522
523		switch (mpcr->mpcr_type) {
524		case MPTSUB_CONNREQ_MP_ENABLE:
525			break;
526		case MPTSUB_CONNREQ_MP_ADD:
527			break;
528		default:
529			return (EOPNOTSUPP);
530		}
531	} else
532#endif /* MPTCP */
533	{
534		/* bind socket to the specified interface, if requested */
535		if (ifscope != IFSCOPE_NONE &&
536		    (error = inp_bindif(inp, ifscope, NULL)) != 0)
537			return (error);
538
539		/* if source address and/or port is specified, bind to it */
540		if (src_se != NULL) {
541			struct sockaddr *sa = src_se->se_addr;
542			error = sobindlock(so, sa, 0);	/* already locked */
543			if (error != 0)
544				return (error);
545		}
546	}
547
548	switch (af) {
549	case AF_INET:
550		error = tcp_usr_connect(so, dst_se->se_addr, p);
551		break;
552#if INET6
553	case AF_INET6:
554		error = tcp6_usr_connect(so, dst_se->se_addr, p);
555		break;
556#endif /* INET6 */
557	default:
558		VERIFY(0);
559		/* NOTREACHED */
560	}
561
562	if (error == 0 && pcid != NULL)
563		*pcid = 1;	/* there is only 1 connection for a TCP */
564
565	return (error);
566}
567
568static int
569tcp_usr_connectx(struct socket *so, struct sockaddr_list **src_sl,
570    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
571    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
572    uint32_t arglen)
573{
574	return (tcp_usr_connectx_common(so, AF_INET, src_sl, dst_sl,
575	    p, ifscope, aid, pcid, flags, arg, arglen));
576}
577
578#if INET6
579static int
580tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
581{
582	int error = 0;
583	struct inpcb *inp = sotoinpcb(so);
584	struct tcpcb *tp;
585	struct sockaddr_in6 *sin6p;
586
587	TCPDEBUG0;
588	if (inp == NULL) {
589		return EINVAL;
590	} else if (inp->inp_state == INPCB_STATE_DEAD) {
591		if (so->so_error) {
592			error = so->so_error;
593			so->so_error = 0;
594			return error;
595		} else
596			return EINVAL;
597	}
598#if NECP
599#if FLOW_DIVERT
600	else if (necp_socket_should_use_flow_divert(inp)) {
601		uint32_t fd_ctl_unit = necp_socket_get_flow_divert_control_unit(inp);
602		if (fd_ctl_unit > 0) {
603			error = flow_divert_pcb_init(so, fd_ctl_unit);
604			if (error == 0) {
605				error = flow_divert_connect_out(so, nam, p);
606			}
607		} else {
608			error = ENETDOWN;
609		}
610		return error;
611	}
612#endif /* FLOW_DIVERT */
613#if CONTENT_FILTER
614	error = cfil_sock_attach(so);
615	if (error != 0)
616		return error;
617#endif /* CONTENT_FILTER */
618#endif /* NECP */
619
620	tp = intotcpcb(inp);
621	TCPDEBUG1();
622
623	calculate_tcp_clock();
624
625	if (nam->sa_family != 0 && nam->sa_family != AF_INET6) {
626		error = EAFNOSUPPORT;
627		goto out;
628	}
629
630	/*
631	 * Must disallow TCP ``connections'' to multicast addresses.
632	 */
633	sin6p = (struct sockaddr_in6 *)(void *)nam;
634	if (sin6p->sin6_family == AF_INET6
635	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
636		error = EAFNOSUPPORT;
637		goto out;
638	}
639
640	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
641		struct sockaddr_in sin;
642
643		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
644			return (EINVAL);
645
646		in6_sin6_2_sin(&sin, sin6p);
647		inp->inp_vflag |= INP_IPV4;
648		inp->inp_vflag &= ~INP_IPV6;
649		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0)
650			goto out;
651		error = tcp_output(tp);
652		goto out;
653	}
654	inp->inp_vflag &= ~INP_IPV4;
655	inp->inp_vflag |= INP_IPV6;
656	if ((error = tcp6_connect(tp, nam, p)) != 0)
657		goto out;
658	error = tcp_output(tp);
659	if (error)
660		goto out;
661	COMMON_END(PRU_CONNECT);
662}
663
664static int
665tcp6_usr_connectx(struct socket *so, struct sockaddr_list **src_sl,
666    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
667    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
668    uint32_t arglen)
669{
670	return (tcp_usr_connectx_common(so, AF_INET6, src_sl, dst_sl,
671	    p, ifscope, aid, pcid, flags, arg, arglen));
672}
673#endif /* INET6 */
674
675/*
676 * Initiate disconnect from peer.
677 * If connection never passed embryonic stage, just drop;
678 * else if don't need to let data drain, then can just drop anyways,
679 * else have to begin TCP shutdown process: mark socket disconnecting,
680 * drain unread data, state switch to reflect user close, and
681 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
682 * when peer sends FIN and acks ours.
683 *
684 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
685 */
686static int
687tcp_usr_disconnect(struct socket *so)
688{
689	int error = 0;
690	struct inpcb *inp = sotoinpcb(so);
691	struct tcpcb *tp;
692
693	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
694	    LCK_MTX_ASSERT_OWNED);
695	COMMON_START();
696        /* In case we got disconnected from the peer */
697        if (tp == NULL)
698		goto out;
699	tp = tcp_disconnect(tp);
700	COMMON_END(PRU_DISCONNECT);
701}
702
703/*
704 * User-protocol pru_disconnectx callback.
705 */
706static int
707tcp_usr_disconnectx(struct socket *so, associd_t aid, connid_t cid)
708{
709#pragma unused(cid)
710	if (aid != ASSOCID_ANY && aid != ASSOCID_ALL)
711		return (EINVAL);
712
713	return (tcp_usr_disconnect(so));
714}
715
716/*
717 * Accept a connection.  Essentially all the work is
718 * done at higher levels; just return the address
719 * of the peer, storing through addr.
720 */
721static int
722tcp_usr_accept(struct socket *so, struct sockaddr **nam)
723{
724	int error = 0;
725	struct inpcb *inp = sotoinpcb(so);
726	struct tcpcb *tp = NULL;
727	TCPDEBUG0;
728
729	in_getpeeraddr(so, nam);
730
731	if (so->so_state & SS_ISDISCONNECTED) {
732		error = ECONNABORTED;
733		goto out;
734	}
735	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
736		return (EINVAL);
737#if NECP
738	else if (necp_socket_should_use_flow_divert(inp))
739		return (EPROTOTYPE);
740#if CONTENT_FILTER
741	error = cfil_sock_attach(so);
742	if (error != 0)
743		return (error);
744#endif /* CONTENT_FILTER */
745#endif /* NECP */
746
747	tp = intotcpcb(inp);
748	TCPDEBUG1();
749
750	calculate_tcp_clock();
751
752	COMMON_END(PRU_ACCEPT);
753}
754
755#if INET6
756static int
757tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
758{
759	int error = 0;
760	struct inpcb *inp = sotoinpcb(so);
761	struct tcpcb *tp = NULL;
762	TCPDEBUG0;
763
764	if (so->so_state & SS_ISDISCONNECTED) {
765		error = ECONNABORTED;
766		goto out;
767	}
768	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
769		return (EINVAL);
770#if NECP
771	else if (necp_socket_should_use_flow_divert(inp))
772		return (EPROTOTYPE);
773#if CONTENT_FILTER
774	error = cfil_sock_attach(so);
775	if (error != 0)
776		return (error);
777#endif /* CONTENT_FILTER */
778#endif /* NECP */
779
780	tp = intotcpcb(inp);
781	TCPDEBUG1();
782
783	calculate_tcp_clock();
784
785	in6_mapped_peeraddr(so, nam);
786	COMMON_END(PRU_ACCEPT);
787}
788#endif /* INET6 */
789
790/*
791 * Mark the connection as being incapable of further output.
792 *
793 * Returns:	0			Success
794 *		EINVAL [COMMON_START]
795 *	tcp_output:EADDRNOTAVAIL
796 *	tcp_output:ENOBUFS
797 *	tcp_output:EMSGSIZE
798 *	tcp_output:EHOSTUNREACH
799 *	tcp_output:ENETUNREACH
800 *	tcp_output:ENETDOWN
801 *	tcp_output:ENOMEM
802 *	tcp_output:EACCES
803 *	tcp_output:EMSGSIZE
804 *	tcp_output:ENOBUFS
805 *	tcp_output:???			[ignorable: mostly IPSEC/firewall/DLIL]
806 */
807static int
808tcp_usr_shutdown(struct socket *so)
809{
810	int error = 0;
811	struct inpcb *inp = sotoinpcb(so);
812	struct tcpcb *tp;
813
814	TCPDEBUG0;
815	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
816		return (EINVAL);
817
818	socantsendmore(so);
819
820        /*
821	 * In case we got disconnected from the peer, or if this is
822	 * a socket that is to be flow-diverted (but not yet).
823	 */
824	tp = intotcpcb(inp);
825	TCPDEBUG1();
826
827	if (tp == NULL
828#if NECP
829		|| (necp_socket_should_use_flow_divert(inp))
830#endif /* NECP */
831		) {
832		if (tp != NULL)
833			error = EPROTOTYPE;
834		goto out;
835	}
836
837	calculate_tcp_clock();
838
839	tp = tcp_usrclosed(tp);
840#if MPTCP
841	/* A reset has been sent but socket exists, do not send FIN */
842	if ((so->so_flags & SOF_MP_SUBFLOW) &&
843	    (tp) && (tp->t_mpflags & TMPF_RESET)) {
844		goto out;
845	}
846#endif
847#if CONTENT_FILTER
848	/* Don't send a FIN yet */
849	if (tp && !(so->so_state & SS_ISDISCONNECTED) &&
850		cfil_sock_data_pending(&so->so_snd))
851		goto out;
852#endif /* CONTENT_FILTER */
853	if (tp)
854		error = tcp_output(tp);
855	COMMON_END(PRU_SHUTDOWN);
856}
857
858/*
859 * After a receive, possibly send window update to peer.
860 */
861static int
862tcp_usr_rcvd(struct socket *so, __unused int flags)
863{
864	int error = 0;
865	struct inpcb *inp = sotoinpcb(so);
866	struct tcpcb *tp;
867
868	COMMON_START();
869        /* In case we got disconnected from the peer */
870        if (tp == NULL)
871		goto out;
872	tcp_sbrcv_trim(tp, &so->so_rcv);
873
874	tcp_output(tp);
875
876#if CONTENT_FILTER
877	cfil_sock_buf_update(&so->so_rcv);
878#endif /* CONTENT_FILTER */
879
880	COMMON_END(PRU_RCVD);
881}
882
883/*
884 * Do a send by putting data in output queue and updating urgent
885 * marker if URG set.  Possibly send more data.  Unlike the other
886 * pru_*() routines, the mbuf chains are our responsibility.  We
887 * must either enqueue them or free them.  The other pru_* routines
888 * generally are caller-frees.
889 *
890 * Returns:	0			Success
891 *		ECONNRESET
892 *		EINVAL
893 *		ENOBUFS
894 *	tcp_connect:EADDRINUSE		Address in use
895 *	tcp_connect:EADDRNOTAVAIL	Address not available.
896 *	tcp_connect:EINVAL		Invalid argument
897 *	tcp_connect:EAFNOSUPPORT	Address family not supported [notdef]
898 *	tcp_connect:EACCES		Permission denied
899 *	tcp_connect:EAGAIN		Resource unavailable, try again
900 *	tcp_connect:EPERM		Operation not permitted
901 *	tcp_output:EADDRNOTAVAIL
902 *	tcp_output:ENOBUFS
903 *	tcp_output:EMSGSIZE
904 *	tcp_output:EHOSTUNREACH
905 *	tcp_output:ENETUNREACH
906 *	tcp_output:ENETDOWN
907 *	tcp_output:ENOMEM
908 *	tcp_output:EACCES
909 *	tcp_output:EMSGSIZE
910 *	tcp_output:ENOBUFS
911 *	tcp_output:???			[ignorable: mostly IPSEC/firewall/DLIL]
912 *	tcp6_connect:???		[IPV6 only]
913 */
914static int
915tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
916     struct sockaddr *nam, struct mbuf *control, struct proc *p)
917{
918	int error = 0;
919	struct inpcb *inp = sotoinpcb(so);
920	struct tcpcb *tp;
921	uint32_t msgpri = MSG_PRI_DEFAULT;
922#if INET6
923	int isipv6;
924#endif
925	TCPDEBUG0;
926
927	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD
928#if NECP
929		|| (necp_socket_should_use_flow_divert(inp))
930#endif /* NECP */
931		) {
932		/*
933		 * OOPS! we lost a race, the TCP session got reset after
934		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
935		 * network interrupt in the non-splnet() section of sosend().
936		 */
937		if (m != NULL)
938			m_freem(m);
939		if (control != NULL) {
940			m_freem(control);
941			control = NULL;
942		}
943
944		if (inp == NULL)
945			error = ECONNRESET;	/* XXX EPIPE? */
946		else
947			error = EPROTOTYPE;
948		tp = NULL;
949		TCPDEBUG1();
950		goto out;
951	}
952#if INET6
953	isipv6 = nam && nam->sa_family == AF_INET6;
954#endif /* INET6 */
955	tp = intotcpcb(inp);
956	TCPDEBUG1();
957
958	calculate_tcp_clock();
959
960	if (control != NULL) {
961		if (so->so_flags & SOF_ENABLE_MSGS) {
962			/* Get the msg priority from control mbufs */
963			error = tcp_get_msg_priority(control, &msgpri);
964			if (error) {
965				m_freem(control);
966				if (m != NULL)
967					m_freem(m);
968				control = NULL;
969				m = NULL;
970				goto out;
971			}
972			m_freem(control);
973			control = NULL;
974		} else if (control->m_len) {
975			/*
976			 * if not unordered, TCP should not have
977			 * control mbufs
978			 */
979			m_freem(control);
980			if (m != NULL)
981				m_freem(m);
982			control = NULL;
983			m = NULL;
984			error = EINVAL;
985			goto out;
986		}
987	}
988
989	if (so->so_flags & SOF_ENABLE_MSGS) {
990		VERIFY(m->m_flags & M_PKTHDR);
991		m->m_pkthdr.msg_pri = msgpri;
992	}
993
994	/* MPTCP sublow socket buffers must not be compressed */
995	VERIFY(!(so->so_flags & SOF_MP_SUBFLOW) ||
996	    (so->so_snd.sb_flags & SB_NOCOMPRESS));
997
998	if(!(flags & PRUS_OOB)) {
999		/* Call msg send if message delivery is enabled */
1000		if (so->so_flags & SOF_ENABLE_MSGS)
1001			sbappendmsg_snd(&so->so_snd, m);
1002		else
1003			sbappendstream(&so->so_snd, m);
1004
1005		if (nam && tp->t_state < TCPS_SYN_SENT) {
1006			/*
1007			 * Do implied connect if not yet connected,
1008			 * initialize window to default value, and
1009			 * initialize maxseg/maxopd using peer's cached
1010			 * MSS.
1011			 */
1012#if INET6
1013			if (isipv6)
1014				error = tcp6_connect(tp, nam, p);
1015			else
1016#endif /* INET6 */
1017				error = tcp_connect(tp, nam, p);
1018			if (error)
1019				goto out;
1020			tp->snd_wnd = TTCP_CLIENT_SND_WND;
1021			tcp_mss(tp, -1, IFSCOPE_NONE);
1022		}
1023
1024		if (flags & PRUS_EOF) {
1025			/*
1026			 * Close the send side of the connection after
1027			 * the data is sent.
1028			 */
1029			socantsendmore(so);
1030			tp = tcp_usrclosed(tp);
1031		}
1032		if (tp != NULL) {
1033			if (flags & PRUS_MORETOCOME)
1034				tp->t_flags |= TF_MORETOCOME;
1035			error = tcp_output(tp);
1036			if (flags & PRUS_MORETOCOME)
1037				tp->t_flags &= ~TF_MORETOCOME;
1038		}
1039	} else {
1040		if (sbspace(&so->so_snd) == 0) {
1041			/* if no space is left in sockbuf,
1042			 * do not try to squeeze in OOB traffic */
1043			m_freem(m);
1044			error = ENOBUFS;
1045			goto out;
1046		}
1047		/*
1048		 * According to RFC961 (Assigned Protocols),
1049		 * the urgent pointer points to the last octet
1050		 * of urgent data.  We continue, however,
1051		 * to consider it to indicate the first octet
1052		 * of data past the urgent section.
1053		 * Otherwise, snd_up should be one lower.
1054		 */
1055		sbappendstream(&so->so_snd, m);
1056		if (nam && tp->t_state < TCPS_SYN_SENT) {
1057			/*
1058			 * Do implied connect if not yet connected,
1059			 * initialize window to default value, and
1060			 * initialize maxseg/maxopd using peer's cached
1061			 * MSS.
1062			 */
1063#if INET6
1064			if (isipv6)
1065				error = tcp6_connect(tp, nam, p);
1066			else
1067#endif /* INET6 */
1068			error = tcp_connect(tp, nam, p);
1069			if (error)
1070				goto out;
1071			tp->snd_wnd = TTCP_CLIENT_SND_WND;
1072			tcp_mss(tp, -1, IFSCOPE_NONE);
1073		}
1074		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
1075		tp->t_flagsext |= TF_FORCE;
1076		error = tcp_output(tp);
1077		tp->t_flagsext &= ~TF_FORCE;
1078	}
1079	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
1080		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
1081}
1082
1083/*
1084 * Abort the TCP.
1085 */
1086static int
1087tcp_usr_abort(struct socket *so)
1088{
1089	int error = 0;
1090	struct inpcb *inp = sotoinpcb(so);
1091	struct tcpcb *tp;
1092
1093	COMMON_START();
1094        /* In case we got disconnected from the peer */
1095        if (tp == NULL)
1096		goto out;
1097	tp = tcp_drop(tp, ECONNABORTED);
1098	so->so_usecount--;
1099	COMMON_END(PRU_ABORT);
1100}
1101
1102/*
1103 * Receive out-of-band data.
1104 *
1105 * Returns:	0			Success
1106 *		EINVAL [COMMON_START]
1107 *		EINVAL
1108 *		EWOULDBLOCK
1109 */
1110static int
1111tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1112{
1113	int error = 0;
1114	struct inpcb *inp = sotoinpcb(so);
1115	struct tcpcb *tp;
1116
1117	COMMON_START();
1118	if ((so->so_oobmark == 0 &&
1119	     (so->so_state & SS_RCVATMARK) == 0) ||
1120	    so->so_options & SO_OOBINLINE ||
1121	    tp->t_oobflags & TCPOOB_HADDATA) {
1122		error = EINVAL;
1123		goto out;
1124	}
1125	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1126		error = EWOULDBLOCK;
1127		goto out;
1128	}
1129	m->m_len = 1;
1130	*mtod(m, caddr_t) = tp->t_iobc;
1131	if ((flags & MSG_PEEK) == 0)
1132		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1133	COMMON_END(PRU_RCVOOB);
1134}
1135
1136/* xxx - should be const */
1137struct pr_usrreqs tcp_usrreqs = {
1138	.pru_abort =		tcp_usr_abort,
1139	.pru_accept =		tcp_usr_accept,
1140	.pru_attach =		tcp_usr_attach,
1141	.pru_bind =		tcp_usr_bind,
1142	.pru_connect =		tcp_usr_connect,
1143	.pru_connectx =		tcp_usr_connectx,
1144	.pru_control =		in_control,
1145	.pru_detach =		tcp_usr_detach,
1146	.pru_disconnect =	tcp_usr_disconnect,
1147	.pru_disconnectx =	tcp_usr_disconnectx,
1148	.pru_listen =		tcp_usr_listen,
1149	.pru_peeraddr =		in_getpeeraddr,
1150	.pru_rcvd =		tcp_usr_rcvd,
1151	.pru_rcvoob =		tcp_usr_rcvoob,
1152	.pru_send =		tcp_usr_send,
1153	.pru_shutdown =		tcp_usr_shutdown,
1154	.pru_sockaddr =		in_getsockaddr,
1155	.pru_sosend =		sosend,
1156	.pru_soreceive =	soreceive,
1157};
1158
1159#if INET6
1160struct pr_usrreqs tcp6_usrreqs = {
1161	.pru_abort =		tcp_usr_abort,
1162	.pru_accept =		tcp6_usr_accept,
1163	.pru_attach =		tcp_usr_attach,
1164	.pru_bind =		tcp6_usr_bind,
1165	.pru_connect =		tcp6_usr_connect,
1166	.pru_connectx =		tcp6_usr_connectx,
1167	.pru_control =		in6_control,
1168	.pru_detach =		tcp_usr_detach,
1169	.pru_disconnect =	tcp_usr_disconnect,
1170	.pru_disconnectx =	tcp_usr_disconnectx,
1171	.pru_listen =		tcp6_usr_listen,
1172	.pru_peeraddr =		in6_mapped_peeraddr,
1173	.pru_rcvd =		tcp_usr_rcvd,
1174	.pru_rcvoob =		tcp_usr_rcvoob,
1175	.pru_send =		tcp_usr_send,
1176	.pru_shutdown =		tcp_usr_shutdown,
1177	.pru_sockaddr =		in6_mapped_sockaddr,
1178	.pru_sosend =		sosend,
1179	.pru_soreceive =	soreceive,
1180};
1181#endif /* INET6 */
1182
1183/*
1184 * Common subroutine to open a TCP connection to remote host specified
1185 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
1186 * port number if needed.  Call in_pcbladdr to do the routing and to choose
1187 * a local host address (interface).  If there is an existing incarnation
1188 * of the same connection in TIME-WAIT state and if the remote host was
1189 * sending CC options and if the connection duration was < MSL, then
1190 * truncate the previous TIME-WAIT state and proceed.
1191 * Initialize connection parameters and enter SYN-SENT state.
1192 *
1193 * Returns:	0			Success
1194 *		EADDRINUSE
1195 *		EINVAL
1196 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
1197 *	in_pcbbind:EINVAL		Invalid argument
1198 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
1199 *	in_pcbbind:EACCES		Permission denied
1200 *	in_pcbbind:EADDRINUSE		Address in use
1201 *	in_pcbbind:EAGAIN		Resource unavailable, try again
1202 *	in_pcbbind:EPERM		Operation not permitted
1203 *	in_pcbladdr:EINVAL		Invalid argument
1204 *	in_pcbladdr:EAFNOSUPPORT	Address family not supported
1205 *	in_pcbladdr:EADDRNOTAVAIL	Address not available
1206 */
1207static int
1208tcp_connect(tp, nam, p)
1209	register struct tcpcb *tp;
1210	struct sockaddr *nam;
1211	struct proc *p;
1212{
1213	struct inpcb *inp = tp->t_inpcb, *oinp;
1214	struct socket *so = inp->inp_socket;
1215	struct tcpcb *otp;
1216	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
1217	struct in_addr laddr;
1218	struct rmxp_tao *taop;
1219	struct rmxp_tao tao_noncached;
1220	int error = 0;
1221	struct ifnet *outif = NULL;
1222
1223	if (inp->inp_lport == 0) {
1224		error = in_pcbbind(inp, NULL, p);
1225		if (error)
1226			goto done;
1227	}
1228
1229	/*
1230	 * Cannot simply call in_pcbconnect, because there might be an
1231	 * earlier incarnation of this same connection still in
1232	 * TIME_WAIT state, creating an ADDRINUSE error.
1233	 */
1234	error = in_pcbladdr(inp, nam, &laddr, IFSCOPE_NONE, &outif);
1235	if (error)
1236		goto done;
1237
1238	tcp_unlock(inp->inp_socket, 0, 0);
1239	oinp = in_pcblookup_hash(inp->inp_pcbinfo,
1240	    sin->sin_addr, sin->sin_port,
1241	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr : laddr,
1242	    inp->inp_lport,  0, NULL);
1243
1244	tcp_lock(inp->inp_socket, 0, 0);
1245	if (oinp) {
1246		if (oinp != inp) /* 4143933: avoid deadlock if inp == oinp */
1247			tcp_lock(oinp->inp_socket, 1, 0);
1248		if (in_pcb_checkstate(oinp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1249			if (oinp != inp)
1250				tcp_unlock(oinp->inp_socket, 1, 0);
1251			goto skip_oinp;
1252		}
1253
1254		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
1255		    otp->t_state == TCPS_TIME_WAIT &&
1256		    ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
1257		    (otp->t_flags & TF_RCVD_CC)) {
1258			otp = tcp_close(otp);
1259		} else {
1260			printf("tcp_connect: inp=0x%llx err=EADDRINUSE\n",
1261			    (uint64_t)VM_KERNEL_ADDRPERM(inp));
1262			if (oinp != inp)
1263				tcp_unlock(oinp->inp_socket, 1, 0);
1264			error = EADDRINUSE;
1265			goto done;
1266		}
1267		if (oinp != inp)
1268			tcp_unlock(oinp->inp_socket, 1, 0);
1269	}
1270skip_oinp:
1271	if ((inp->inp_laddr.s_addr == INADDR_ANY ? laddr.s_addr :
1272	    inp->inp_laddr.s_addr) == sin->sin_addr.s_addr &&
1273	    inp->inp_lport == sin->sin_port) {
1274		error = EINVAL;
1275		goto done;
1276	}
1277	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) {
1278		/*lock inversion issue, mostly with udp multicast packets */
1279		socket_unlock(inp->inp_socket, 0);
1280		lck_rw_lock_exclusive(inp->inp_pcbinfo->ipi_lock);
1281		socket_lock(inp->inp_socket, 0);
1282	}
1283	if (inp->inp_laddr.s_addr == INADDR_ANY) {
1284		inp->inp_laddr = laddr;
1285		/* no reference needed */
1286		inp->inp_last_outifp = outif;
1287		inp->inp_flags |= INP_INADDR_ANY;
1288	}
1289	inp->inp_faddr = sin->sin_addr;
1290	inp->inp_fport = sin->sin_port;
1291	in_pcbrehash(inp);
1292	lck_rw_done(inp->inp_pcbinfo->ipi_lock);
1293
1294	if (inp->inp_flowhash == 0)
1295		inp->inp_flowhash = inp_calc_flowhash(inp);
1296
1297	tcp_set_max_rwinscale(tp, so);
1298
1299	soisconnecting(so);
1300	tcpstat.tcps_connattempt++;
1301	tp->t_state = TCPS_SYN_SENT;
1302	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINIT(tp));
1303	tp->iss = tcp_new_isn(tp);
1304	tcp_sendseqinit(tp);
1305	if (nstat_collect)
1306		nstat_route_connect_attempt(inp->inp_route.ro_rt);
1307
1308	/*
1309	 * Generate a CC value for this connection and
1310	 * check whether CC or CCnew should be used.
1311	 */
1312	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
1313		taop = &tao_noncached;
1314		bzero(taop, sizeof(*taop));
1315	}
1316
1317	tp->cc_send = CC_INC(tcp_ccgen);
1318	if (taop->tao_ccsent != 0 &&
1319	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
1320		taop->tao_ccsent = tp->cc_send;
1321	} else {
1322		taop->tao_ccsent = 0;
1323		tp->t_flags |= TF_SENDCCNEW;
1324	}
1325
1326done:
1327	if (outif != NULL)
1328		ifnet_release(outif);
1329
1330	return (error);
1331}
1332
1333#if INET6
1334static int
1335tcp6_connect(tp, nam, p)
1336	register struct tcpcb *tp;
1337	struct sockaddr *nam;
1338	struct proc *p;
1339{
1340	struct inpcb *inp = tp->t_inpcb, *oinp;
1341	struct socket *so = inp->inp_socket;
1342	struct tcpcb *otp;
1343	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)nam;
1344	struct in6_addr addr6;
1345	struct rmxp_tao *taop;
1346	struct rmxp_tao tao_noncached;
1347	int error = 0;
1348	struct ifnet *outif = NULL;
1349
1350	if (inp->inp_lport == 0) {
1351		error = in6_pcbbind(inp, NULL, p);
1352		if (error)
1353			goto done;
1354	}
1355
1356	/*
1357	 * Cannot simply call in_pcbconnect, because there might be an
1358	 * earlier incarnation of this same connection still in
1359	 * TIME_WAIT state, creating an ADDRINUSE error.
1360	 *
1361	 * in6_pcbladdr() might return an ifp with its reference held
1362	 * even in the error case, so make sure that it's released
1363	 * whenever it's non-NULL.
1364	 */
1365	error = in6_pcbladdr(inp, nam, &addr6, &outif);
1366	if (error)
1367		goto done;
1368	tcp_unlock(inp->inp_socket, 0, 0);
1369	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
1370				  &sin6->sin6_addr, sin6->sin6_port,
1371				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
1372				  ? &addr6
1373				  : &inp->in6p_laddr,
1374				  inp->inp_lport,  0, NULL);
1375	tcp_lock(inp->inp_socket, 0, 0);
1376	if (oinp) {
1377		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
1378		    otp->t_state == TCPS_TIME_WAIT &&
1379		    ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
1380		    (otp->t_flags & TF_RCVD_CC)) {
1381			otp = tcp_close(otp);
1382		} else {
1383			error = EADDRINUSE;
1384			goto done;
1385		}
1386	}
1387	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) {
1388		/*lock inversion issue, mostly with udp multicast packets */
1389		socket_unlock(inp->inp_socket, 0);
1390		lck_rw_lock_exclusive(inp->inp_pcbinfo->ipi_lock);
1391		socket_lock(inp->inp_socket, 0);
1392	}
1393	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
1394		inp->in6p_laddr = addr6;
1395		inp->in6p_last_outifp = outif;	/* no reference needed */
1396		inp->in6p_flags |= INP_IN6ADDR_ANY;
1397	}
1398	inp->in6p_faddr = sin6->sin6_addr;
1399	inp->inp_fport = sin6->sin6_port;
1400	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
1401		inp->inp_flow = sin6->sin6_flowinfo;
1402	in_pcbrehash(inp);
1403	lck_rw_done(inp->inp_pcbinfo->ipi_lock);
1404
1405	if (inp->inp_flowhash == 0)
1406		inp->inp_flowhash = inp_calc_flowhash(inp);
1407	/* update flowinfo - RFC 6437 */
1408	if (inp->inp_flow == 0 && inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
1409		inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
1410		inp->inp_flow |=
1411		    (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
1412	}
1413
1414	tcp_set_max_rwinscale(tp, so);
1415
1416	soisconnecting(so);
1417	tcpstat.tcps_connattempt++;
1418	tp->t_state = TCPS_SYN_SENT;
1419	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1420		TCP_CONN_KEEPINIT(tp));
1421	tp->iss = tcp_new_isn(tp);
1422	tcp_sendseqinit(tp);
1423	if (nstat_collect)
1424		nstat_route_connect_attempt(inp->inp_route.ro_rt);
1425
1426	/*
1427	 * Generate a CC value for this connection and
1428	 * check whether CC or CCnew should be used.
1429	 */
1430	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
1431		taop = &tao_noncached;
1432		bzero(taop, sizeof(*taop));
1433	}
1434
1435	tp->cc_send = CC_INC(tcp_ccgen);
1436	if (taop->tao_ccsent != 0 &&
1437	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
1438		taop->tao_ccsent = tp->cc_send;
1439	} else {
1440		taop->tao_ccsent = 0;
1441		tp->t_flags |= TF_SENDCCNEW;
1442	}
1443
1444done:
1445	if (outif != NULL)
1446		ifnet_release(outif);
1447
1448	return (error);
1449}
1450#endif /* INET6 */
1451
1452/*
1453 * Export TCP internal state information via a struct tcp_info
1454 */
1455__private_extern__ void
1456tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
1457{
1458	struct inpcb *inp = tp->t_inpcb;
1459
1460	bzero(ti, sizeof(*ti));
1461
1462	ti->tcpi_state = tp->t_state;
1463
1464	if (tp->t_state > TCPS_LISTEN) {
1465		if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1466			ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1467		if (tp->t_flags & TF_SACK_PERMIT)
1468			ti->tcpi_options |= TCPI_OPT_SACK;
1469		if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1470			ti->tcpi_options |= TCPI_OPT_WSCALE;
1471			ti->tcpi_snd_wscale = tp->snd_scale;
1472			ti->tcpi_rcv_wscale = tp->rcv_scale;
1473		}
1474
1475		/* Are we in retranmission episode */
1476		if (tp->snd_max != tp->snd_nxt)
1477			ti->tcpi_flags |= TCPI_FLAG_LOSSRECOVERY;
1478		else
1479			ti->tcpi_flags &= ~TCPI_FLAG_LOSSRECOVERY;
1480
1481		ti->tcpi_rto = tp->t_timer[TCPT_REXMT] ? tp->t_rxtcur : 0;
1482		ti->tcpi_snd_mss = tp->t_maxseg;
1483		ti->tcpi_rcv_mss = tp->t_maxseg;
1484
1485		ti->tcpi_rttcur = tp->t_rttcur;
1486		ti->tcpi_srtt = tp->t_srtt >> TCP_RTT_SHIFT;
1487		ti->tcpi_rttvar = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
1488		ti->tcpi_rttbest = tp->t_rttbest >> TCP_RTT_SHIFT;
1489
1490		ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1491		ti->tcpi_snd_cwnd = tp->snd_cwnd;
1492		ti->tcpi_snd_sbbytes = tp->t_inpcb->inp_socket->so_snd.sb_cc;
1493
1494		ti->tcpi_rcv_space = tp->rcv_wnd;
1495
1496		ti->tcpi_snd_wnd = tp->snd_wnd;
1497		ti->tcpi_snd_nxt = tp->snd_nxt;
1498		ti->tcpi_rcv_nxt = tp->rcv_nxt;
1499
1500		/* convert bytes/msec to bits/sec */
1501		if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
1502			tp->t_bwmeas != NULL) {
1503			ti->tcpi_snd_bw	= (tp->t_bwmeas->bw_sndbw * 8000);
1504		}
1505
1506		ti->tcpi_last_outif = (tp->t_inpcb->inp_last_outifp == NULL) ? 0 :
1507		    tp->t_inpcb->inp_last_outifp->if_index;
1508
1509		//atomic_get_64(ti->tcpi_txbytes, &inp->inp_stat->txbytes);
1510		ti->tcpi_txpackets = inp->inp_stat->txpackets;
1511		ti->tcpi_txbytes = inp->inp_stat->txbytes;
1512		ti->tcpi_txretransmitbytes = tp->t_stat.txretransmitbytes;
1513		ti->tcpi_txunacked = tp->snd_max - tp->snd_una;
1514
1515		//atomic_get_64(ti->tcpi_rxbytes, &inp->inp_stat->rxbytes);
1516		ti->tcpi_rxpackets = inp->inp_stat->rxpackets;
1517		ti->tcpi_rxbytes = inp->inp_stat->rxbytes;
1518		ti->tcpi_rxduplicatebytes = tp->t_stat.rxduplicatebytes;
1519		ti->tcpi_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1520
1521		if (tp->t_state > TCPS_LISTEN) {
1522			ti->tcpi_synrexmits = tp->t_stat.synrxtshift;
1523		}
1524		ti->tcpi_cell_rxpackets = inp->inp_cstat->rxpackets;
1525		ti->tcpi_cell_rxbytes = inp->inp_cstat->rxbytes;
1526		ti->tcpi_cell_txpackets = inp->inp_cstat->txpackets;
1527		ti->tcpi_cell_txbytes = inp->inp_cstat->txbytes;
1528
1529		ti->tcpi_wifi_rxpackets = inp->inp_wstat->rxpackets;
1530		ti->tcpi_wifi_rxbytes = inp->inp_wstat->rxbytes;
1531		ti->tcpi_wifi_txpackets = inp->inp_wstat->txpackets;
1532		ti->tcpi_wifi_txbytes = inp->inp_wstat->txbytes;
1533
1534		ti->tcpi_wired_rxpackets = inp->inp_Wstat->rxpackets;
1535		ti->tcpi_wired_rxbytes = inp->inp_Wstat->rxbytes;
1536		ti->tcpi_wired_txpackets = inp->inp_Wstat->txpackets;
1537		ti->tcpi_wired_txbytes = inp->inp_Wstat->txbytes;
1538	}
1539}
1540
1541__private_extern__ errno_t
1542tcp_fill_info_for_info_tuple(struct info_tuple *itpl, struct tcp_info *ti)
1543{
1544	struct inpcbinfo *pcbinfo = NULL;
1545	struct inpcb *inp = NULL;
1546	struct socket *so;
1547	struct tcpcb *tp;
1548
1549	if (itpl->itpl_proto == IPPROTO_TCP)
1550		pcbinfo = &tcbinfo;
1551	else
1552		return EINVAL;
1553
1554	if (itpl->itpl_local_sa.sa_family == AF_INET &&
1555		itpl->itpl_remote_sa.sa_family == AF_INET) {
1556		inp = in_pcblookup_hash(pcbinfo,
1557			itpl->itpl_remote_sin.sin_addr,
1558			itpl->itpl_remote_sin.sin_port,
1559			itpl->itpl_local_sin.sin_addr,
1560			itpl->itpl_local_sin.sin_port,
1561			0, NULL);
1562	} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
1563		itpl->itpl_remote_sa.sa_family == AF_INET6) {
1564		struct in6_addr ina6_local;
1565		struct in6_addr ina6_remote;
1566
1567		ina6_local = itpl->itpl_local_sin6.sin6_addr;
1568		if (IN6_IS_SCOPE_LINKLOCAL(&ina6_local) &&
1569			itpl->itpl_local_sin6.sin6_scope_id)
1570			ina6_local.s6_addr16[1] = htons(itpl->itpl_local_sin6.sin6_scope_id);
1571
1572		ina6_remote = itpl->itpl_remote_sin6.sin6_addr;
1573		if (IN6_IS_SCOPE_LINKLOCAL(&ina6_remote) &&
1574			itpl->itpl_remote_sin6.sin6_scope_id)
1575			ina6_remote.s6_addr16[1] = htons(itpl->itpl_remote_sin6.sin6_scope_id);
1576
1577		inp = in6_pcblookup_hash(pcbinfo,
1578			&ina6_remote,
1579			itpl->itpl_remote_sin6.sin6_port,
1580			&ina6_local,
1581			itpl->itpl_local_sin6.sin6_port,
1582			0, NULL);
1583	} else {
1584		return EINVAL;
1585	}
1586	if (inp == NULL || (so = inp->inp_socket) == NULL)
1587		return ENOENT;
1588
1589	socket_lock(so, 0);
1590	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1591		socket_unlock(so, 0);
1592		return ENOENT;
1593	}
1594	tp = intotcpcb(inp);
1595
1596	tcp_fill_info(tp, ti);
1597	socket_unlock(so, 0);
1598
1599	return 0;
1600}
1601
1602
1603__private_extern__ int
1604tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
1605{
1606	int error;
1607	struct tcp_info ti;
1608	struct info_tuple itpl;
1609	proc_t caller = PROC_NULL;
1610	proc_t caller_parent = PROC_NULL;
1611	char command_name[MAXCOMLEN + 1] = "";
1612	char parent_name[MAXCOMLEN + 1] = "";
1613
1614	if ((caller = proc_self()) != PROC_NULL) {
1615		/* get process name */
1616		strlcpy(command_name, caller->p_comm, sizeof(command_name));
1617
1618		/* get parent process name if possible */
1619		if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
1620			strlcpy(parent_name, caller_parent->p_comm,
1621			    sizeof(parent_name));
1622			proc_rele(caller_parent);
1623		}
1624
1625		if ((escape_str(command_name, strlen(command_name),
1626		    sizeof(command_name)) == 0) &&
1627		    (escape_str(parent_name, strlen(parent_name),
1628		    sizeof(parent_name)) == 0)) {
1629			kern_asl_msg(LOG_DEBUG, "messagetracer",
1630			    5,
1631			    "com.apple.message.domain",
1632			    "com.apple.kernel.tcpstat", /* 1 */
1633			    "com.apple.message.signature",
1634			    "tcpinfo", /* 2 */
1635			    "com.apple.message.signature2", command_name, /* 3 */
1636			    "com.apple.message.signature3", parent_name, /* 4 */
1637			    "com.apple.message.summarize", "YES", /* 5 */
1638			    NULL);
1639		}
1640	}
1641
1642	if (caller != PROC_NULL)
1643		proc_rele(caller);
1644
1645	if (req->newptr == USER_ADDR_NULL) {
1646		return EINVAL;
1647	}
1648	if (req->newlen < sizeof(struct info_tuple)) {
1649		return EINVAL;
1650	}
1651	error = SYSCTL_IN(req, &itpl, sizeof(struct info_tuple));
1652	if (error != 0) {
1653		return error;
1654	}
1655	error = tcp_fill_info_for_info_tuple(&itpl, &ti);
1656	if (error != 0) {
1657		return error;
1658	}
1659	error = SYSCTL_OUT(req, &ti, sizeof(struct tcp_info));
1660	if (error != 0) {
1661		return error;
1662	}
1663
1664	return 0;
1665}
1666
1667static int
1668tcp_lookup_peer_pid_locked(struct socket *so, pid_t *out_pid)
1669{
1670	int error = EHOSTUNREACH;
1671	*out_pid = -1;
1672	if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN;
1673
1674	struct inpcb	*inp = (struct inpcb*)so->so_pcb;
1675	uint16_t		lport = inp->inp_lport;
1676	uint16_t		fport = inp->inp_fport;
1677	struct inpcb	*finp = NULL;
1678
1679	if (inp->inp_vflag & INP_IPV6) {
1680		struct	in6_addr	laddr6 = inp->in6p_laddr;
1681		struct	in6_addr	faddr6 = inp->in6p_faddr;
1682		socket_unlock(so, 0);
1683		finp = in6_pcblookup_hash(&tcbinfo, &laddr6, lport, &faddr6, fport, 0, NULL);
1684		socket_lock(so, 0);
1685	} else if (inp->inp_vflag & INP_IPV4) {
1686		struct	in_addr	laddr4 = inp->inp_laddr;
1687		struct	in_addr	faddr4 = inp->inp_faddr;
1688		socket_unlock(so, 0);
1689		finp = in_pcblookup_hash(&tcbinfo, laddr4, lport, faddr4, fport, 0, NULL);
1690		socket_lock(so, 0);
1691	}
1692
1693	if (finp) {
1694		*out_pid = finp->inp_socket->last_pid;
1695		error = 0;
1696		in_pcb_checkstate(finp, WNT_RELEASE, 0);
1697	}
1698
1699	return error;
1700}
1701
1702void
1703tcp_getconninfo(struct socket *so, struct conninfo_tcp *tcp_ci)
1704{
1705	(void) tcp_lookup_peer_pid_locked(so, &tcp_ci->tcpci_peer_pid);
1706	tcp_fill_info(sototcpcb(so), &tcp_ci->tcpci_tcp_info);
1707}
1708
1709/*
1710 * The new sockopt interface makes it possible for us to block in the
1711 * copyin/out step (if we take a page fault).  Taking a page fault at
1712 * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
1713 * use TSM, there probably isn't any need for this function to run at
1714 * splnet() any more.  This needs more examination.)
1715 */
1716int
1717tcp_ctloutput(so, sopt)
1718	struct socket *so;
1719	struct sockopt *sopt;
1720{
1721	int	error, opt, optval;
1722	struct	inpcb *inp;
1723	struct	tcpcb *tp;
1724
1725	error = 0;
1726	inp = sotoinpcb(so);
1727	if (inp == NULL) {
1728		return (ECONNRESET);
1729	}
1730	/* Allow <SOL_SOCKET,SO_FLUSH/SO_TRAFFIC_MGT_BACKGROUND> at this level */
1731	if (sopt->sopt_level != IPPROTO_TCP &&
1732	    !(sopt->sopt_level == SOL_SOCKET && (sopt->sopt_name == SO_FLUSH ||
1733	    sopt->sopt_name == SO_TRAFFIC_MGT_BACKGROUND))) {
1734#if INET6
1735		if (SOCK_CHECK_DOM(so, PF_INET6))
1736			error = ip6_ctloutput(so, sopt);
1737		else
1738#endif /* INET6 */
1739		error = ip_ctloutput(so, sopt);
1740		return (error);
1741	}
1742	tp = intotcpcb(inp);
1743	if (tp == NULL) {
1744		return (ECONNRESET);
1745	}
1746
1747	calculate_tcp_clock();
1748
1749	switch (sopt->sopt_dir) {
1750	case SOPT_SET:
1751		switch (sopt->sopt_name) {
1752		case TCP_NODELAY:
1753		case TCP_NOOPT:
1754		case TCP_NOPUSH:
1755		case TCP_ENABLE_ECN:
1756			error = sooptcopyin(sopt, &optval, sizeof optval,
1757					    sizeof optval);
1758			if (error)
1759				break;
1760
1761			switch (sopt->sopt_name) {
1762			case TCP_NODELAY:
1763				opt = TF_NODELAY;
1764				break;
1765			case TCP_NOOPT:
1766				opt = TF_NOOPT;
1767				break;
1768			case TCP_NOPUSH:
1769				opt = TF_NOPUSH;
1770				break;
1771			case TCP_ENABLE_ECN:
1772				opt = TF_ENABLE_ECN;
1773				break;
1774			default:
1775				opt = 0; /* dead code to fool gcc */
1776				break;
1777			}
1778
1779			if (optval)
1780				tp->t_flags |= opt;
1781			else
1782				tp->t_flags &= ~opt;
1783			break;
1784		case TCP_RXT_FINDROP:
1785		case TCP_NOTIMEWAIT:
1786			error = sooptcopyin(sopt, &optval, sizeof optval,
1787				sizeof optval);
1788			if (error)
1789				break;
1790			switch (sopt->sopt_name) {
1791			case TCP_RXT_FINDROP:
1792				opt = TF_RXTFINDROP;
1793				break;
1794			case TCP_NOTIMEWAIT:
1795				opt = TF_NOTIMEWAIT;
1796				break;
1797			default:
1798				opt = 0;
1799				break;
1800			}
1801			if (optval)
1802				tp->t_flagsext |= opt;
1803			else
1804				tp->t_flagsext &= ~opt;
1805			break;
1806		case TCP_MEASURE_SND_BW:
1807			error = sooptcopyin(sopt, &optval, sizeof optval,
1808				sizeof optval);
1809			if (error)
1810				break;
1811			opt = TF_MEASURESNDBW;
1812			if (optval) {
1813				if (tp->t_bwmeas == NULL) {
1814					tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1815					if (tp->t_bwmeas == NULL) {
1816						error = ENOMEM;
1817						break;
1818					}
1819				}
1820				tp->t_flagsext |= opt;
1821			} else {
1822				tp->t_flagsext &= ~opt;
1823				/* Reset snd bw measurement state */
1824				tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
1825				if (tp->t_bwmeas != NULL) {
1826					tcp_bwmeas_free(tp);
1827				}
1828			}
1829			break;
1830		case TCP_MEASURE_BW_BURST: {
1831			struct tcp_measure_bw_burst in;
1832			uint32_t minpkts, maxpkts;
1833			bzero(&in, sizeof(in));
1834
1835			error = sooptcopyin(sopt, &in, sizeof(in),
1836				sizeof(in));
1837			if (error)
1838				break;
1839			if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 ||
1840				tp->t_bwmeas == NULL) {
1841				error = EINVAL;
1842				break;
1843			}
1844			minpkts = (in.min_burst_size != 0) ? in.min_burst_size :
1845				tp->t_bwmeas->bw_minsizepkts;
1846			maxpkts = (in.max_burst_size != 0) ? in.max_burst_size :
1847				tp->t_bwmeas->bw_maxsizepkts;
1848			if (minpkts > maxpkts) {
1849				error = EINVAL;
1850				break;
1851			}
1852			tp->t_bwmeas->bw_minsizepkts = minpkts;
1853			tp->t_bwmeas->bw_maxsizepkts = maxpkts;
1854			tp->t_bwmeas->bw_minsize = (minpkts * tp->t_maxseg);
1855			tp->t_bwmeas->bw_maxsize = (maxpkts * tp->t_maxseg);
1856			break;
1857		}
1858		case TCP_MAXSEG:
1859			error = sooptcopyin(sopt, &optval, sizeof optval,
1860					    sizeof optval);
1861			if (error)
1862				break;
1863
1864			if (optval > 0 && optval <= tp->t_maxseg &&
1865			    optval + 40 >= tcp_minmss)
1866				tp->t_maxseg = optval;
1867			else
1868				error = EINVAL;
1869			break;
1870
1871		case TCP_KEEPALIVE:
1872			error = sooptcopyin(sopt, &optval, sizeof optval,
1873						sizeof optval);
1874			if (error)
1875				break;
1876			if (optval < 0 || optval > UINT32_MAX/TCP_RETRANSHZ) {
1877				error = EINVAL;
1878			} else {
1879				tp->t_keepidle = optval * TCP_RETRANSHZ;
1880				/* reset the timer to new value */
1881				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1882					TCP_CONN_KEEPIDLE(tp));
1883				tcp_check_timer_state(tp);
1884			}
1885                        break;
1886
1887		case TCP_CONNECTIONTIMEOUT:
1888			error = sooptcopyin(sopt, &optval, sizeof optval,
1889						sizeof optval);
1890			if (error)
1891				break;
1892			if (optval < 0 || optval > UINT32_MAX/TCP_RETRANSHZ) {
1893				error = EINVAL;
1894			} else {
1895				tp->t_keepinit = optval * TCP_RETRANSHZ;
1896				if (tp->t_state == TCPS_SYN_RECEIVED ||
1897					tp->t_state == TCPS_SYN_SENT) {
1898					tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1899						TCP_CONN_KEEPINIT(tp));
1900					tcp_check_timer_state(tp);
1901				}
1902			}
1903			break;
1904
1905		case TCP_KEEPINTVL:
1906			error = sooptcopyin(sopt, &optval, sizeof(optval),
1907				sizeof(optval));
1908			if (error)
1909				break;
1910			if (optval < 0 || optval > UINT32_MAX/TCP_RETRANSHZ) {
1911				error = EINVAL;
1912			} else {
1913				tp->t_keepintvl = optval * TCP_RETRANSHZ;
1914				if (tp->t_state == TCPS_FIN_WAIT_2 &&
1915					TCP_CONN_MAXIDLE(tp) > 0) {
1916					tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
1917						TCP_CONN_MAXIDLE(tp));
1918					tcp_check_timer_state(tp);
1919				}
1920			}
1921			break;
1922
1923		case TCP_KEEPCNT:
1924			error = sooptcopyin(sopt, &optval, sizeof(optval),
1925				sizeof(optval));
1926			if (error)
1927				break;
1928			if (optval < 0 || optval > INT32_MAX) {
1929				error = EINVAL;
1930			} else {
1931				tp->t_keepcnt = optval;
1932				if (tp->t_state == TCPS_FIN_WAIT_2 &&
1933					TCP_CONN_MAXIDLE(tp) > 0) {
1934					tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
1935						TCP_CONN_MAXIDLE(tp));
1936					tcp_check_timer_state(tp);
1937				}
1938			}
1939			break;
1940
1941		case PERSIST_TIMEOUT:
1942			error = sooptcopyin(sopt, &optval, sizeof optval,
1943						sizeof optval);
1944			if (error)
1945				break;
1946			if (optval < 0)
1947				error = EINVAL;
1948			else
1949				tp->t_persist_timeout = optval * TCP_RETRANSHZ;
1950			break;
1951		case TCP_RXT_CONNDROPTIME:
1952			error = sooptcopyin(sopt, &optval, sizeof(optval),
1953					sizeof(optval));
1954			if (error)
1955				break;
1956			if (optval < 0)
1957				error = EINVAL;
1958			else
1959				tp->t_rxt_conndroptime = optval * TCP_RETRANSHZ;
1960			break;
1961		case TCP_NOTSENT_LOWAT:
1962			error = sooptcopyin(sopt, &optval, sizeof(optval),
1963				sizeof(optval));
1964			if (error)
1965				break;
1966			if (optval < 0) {
1967				error = EINVAL;
1968				break;
1969			} else {
1970				if (optval == 0) {
1971					so->so_flags &= ~(SOF_NOTSENT_LOWAT);
1972					tp->t_notsent_lowat = 0;
1973				} else {
1974					so->so_flags |= SOF_NOTSENT_LOWAT;
1975					tp->t_notsent_lowat = optval;
1976				}
1977			}
1978			break;
1979		case TCP_ADAPTIVE_READ_TIMEOUT:
1980			error = sooptcopyin(sopt, &optval, sizeof (optval),
1981				sizeof(optval));
1982			if (error)
1983				break;
1984			if (optval < 0 ||
1985				optval > TCP_ADAPTIVE_TIMEOUT_MAX) {
1986				error = EINVAL;
1987				break;
1988			} else if (optval == 0) {
1989				tp->t_adaptive_rtimo = 0;
1990				tcp_keepalive_reset(tp);
1991			} else {
1992				tp->t_adaptive_rtimo = optval;
1993			}
1994			break;
1995		case TCP_ADAPTIVE_WRITE_TIMEOUT:
1996			error = sooptcopyin(sopt, &optval, sizeof (optval),
1997				sizeof (optval));
1998			if (error)
1999				break;
2000			if (optval < 0 ||
2001				optval > TCP_ADAPTIVE_TIMEOUT_MAX) {
2002				error = EINVAL;
2003				break;
2004			} else {
2005				tp->t_adaptive_wtimo = optval;
2006			}
2007			break;
2008		case TCP_ENABLE_MSGS:
2009			error = sooptcopyin(sopt, &optval, sizeof(optval),
2010				sizeof(optval));
2011			if (error)
2012				break;
2013			if (optval < 0 || optval > 1) {
2014				error = EINVAL;
2015			} else if (optval == 1) {
2016				/*
2017				 * Check if messages option is already
2018				 * enabled, if so return.
2019				 */
2020				if (so->so_flags & SOF_ENABLE_MSGS) {
2021					VERIFY(so->so_msg_state != NULL);
2022					break;
2023				}
2024
2025				/*
2026				 * allocate memory for storing message
2027				 * related state
2028				 */
2029				VERIFY(so->so_msg_state == NULL);
2030				MALLOC(so->so_msg_state,
2031					struct msg_state *,
2032					sizeof(struct msg_state),
2033					M_TEMP, M_WAITOK | M_ZERO);
2034				if (so->so_msg_state == NULL) {
2035					error = ENOMEM;
2036					break;
2037				}
2038
2039				/* Enable message delivery */
2040				so->so_flags |= SOF_ENABLE_MSGS;
2041			} else {
2042				/*
2043				 * Can't disable message delivery on socket
2044				 * because of restrictions imposed by
2045				 * encoding/decoding
2046				 */
2047				error = EINVAL;
2048			}
2049			break;
2050		case TCP_SENDMOREACKS:
2051			error = sooptcopyin(sopt, &optval, sizeof(optval),
2052				sizeof(optval));
2053			if (error)
2054				break;
2055			if (optval < 0 || optval > 1) {
2056				error = EINVAL;
2057			} else if (optval == 0) {
2058				tp->t_flagsext &= ~(TF_NOSTRETCHACK);
2059			} else {
2060				tp->t_flagsext |= TF_NOSTRETCHACK;
2061			}
2062			break;
2063		case TCP_DISABLE_BLACKHOLE_DETECTION:
2064			error = sooptcopyin(sopt, &optval, sizeof(optval),
2065				sizeof(optval));
2066			if (error)
2067				break;
2068			if (optval < 0 || optval > 1) {
2069				error = EINVAL;
2070			} else if (optval == 0) {
2071				tp->t_flagsext &= ~TF_NOBLACKHOLE_DETECTION;
2072			} else {
2073				tp->t_flagsext |= TF_NOBLACKHOLE_DETECTION;
2074				if ((tp->t_flags & TF_BLACKHOLE) &&
2075				    tp->t_pmtud_saved_maxopd > 0)
2076					tcp_pmtud_revert_segment_size(tp);
2077			}
2078			break;
2079		case SO_FLUSH:
2080			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
2081			    sizeof (optval))) != 0)
2082				break;
2083
2084			error = inp_flush(inp, optval);
2085			break;
2086
2087		case SO_TRAFFIC_MGT_BACKGROUND:
2088			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
2089			    sizeof (optval))) != 0)
2090				break;
2091
2092			if (optval) {
2093				socket_set_traffic_mgt_flags_locked(so,
2094				    TRAFFIC_MGT_SO_BACKGROUND);
2095			} else {
2096				socket_clear_traffic_mgt_flags_locked(so,
2097				    TRAFFIC_MGT_SO_BACKGROUND);
2098			}
2099			break;
2100
2101		default:
2102			error = ENOPROTOOPT;
2103			break;
2104		}
2105		break;
2106
2107	case SOPT_GET:
2108		switch (sopt->sopt_name) {
2109		case TCP_NODELAY:
2110			optval = tp->t_flags & TF_NODELAY;
2111			break;
2112		case TCP_MAXSEG:
2113			optval = tp->t_maxseg;
2114			break;
2115		case TCP_KEEPALIVE:
2116			optval = tp->t_keepidle / TCP_RETRANSHZ;
2117			break;
2118		case TCP_KEEPINTVL:
2119			optval = tp->t_keepintvl / TCP_RETRANSHZ;
2120			break;
2121		case TCP_KEEPCNT:
2122			optval = tp->t_keepcnt;
2123			break;
2124		case TCP_NOOPT:
2125			optval = tp->t_flags & TF_NOOPT;
2126			break;
2127		case TCP_NOPUSH:
2128			optval = tp->t_flags & TF_NOPUSH;
2129			break;
2130		case TCP_ENABLE_ECN:
2131			optval = (tp->t_flags & TF_ENABLE_ECN) ? 1 : 0;
2132			break;
2133		case TCP_CONNECTIONTIMEOUT:
2134			optval = tp->t_keepinit / TCP_RETRANSHZ;
2135			break;
2136		case PERSIST_TIMEOUT:
2137			optval = tp->t_persist_timeout / TCP_RETRANSHZ;
2138			break;
2139		case TCP_RXT_CONNDROPTIME:
2140			optval = tp->t_rxt_conndroptime / TCP_RETRANSHZ;
2141			break;
2142		case TCP_RXT_FINDROP:
2143			optval = tp->t_flagsext & TF_RXTFINDROP;
2144			break;
2145		case TCP_NOTIMEWAIT:
2146			optval = (tp->t_flagsext & TF_NOTIMEWAIT) ? 1 : 0;
2147			break;
2148		case TCP_MEASURE_SND_BW:
2149			optval = tp->t_flagsext & TF_MEASURESNDBW;
2150			break;
2151		case TCP_INFO: {
2152			struct tcp_info ti;
2153
2154			tcp_fill_info(tp, &ti);
2155			error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info));
2156			goto done;
2157			/* NOT REACHED */
2158		}
2159		case TCP_MEASURE_BW_BURST: {
2160			struct tcp_measure_bw_burst out;
2161			if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 ||
2162				tp->t_bwmeas == NULL) {
2163				error = EINVAL;
2164				break;
2165			}
2166			out.min_burst_size = tp->t_bwmeas->bw_minsizepkts;
2167			out.max_burst_size = tp->t_bwmeas->bw_maxsizepkts;
2168			error = sooptcopyout(sopt, &out, sizeof(out));
2169			goto done;
2170		}
2171		case TCP_NOTSENT_LOWAT:
2172			if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
2173				optval = tp->t_notsent_lowat;
2174			} else {
2175				optval = 0;
2176			}
2177			break;
2178
2179		case TCP_ENABLE_MSGS:
2180			if (so->so_flags & SOF_ENABLE_MSGS) {
2181				optval = 1;
2182			} else {
2183				optval = 0;
2184			}
2185			break;
2186		case TCP_SENDMOREACKS:
2187			if (tp->t_flagsext & TF_NOSTRETCHACK)
2188				optval = 1;
2189			else
2190				optval = 0;
2191			break;
2192		case TCP_DISABLE_BLACKHOLE_DETECTION:
2193			if (tp->t_flagsext & TF_NOBLACKHOLE_DETECTION)
2194				optval = 1;
2195			else
2196				optval = 0;
2197			break;
2198		case TCP_PEER_PID: {
2199			pid_t	pid;
2200			error = tcp_lookup_peer_pid_locked(so, &pid);
2201			if (error == 0)
2202				error = sooptcopyout(sopt, &pid, sizeof(pid));
2203			goto done;
2204		}
2205		case TCP_ADAPTIVE_READ_TIMEOUT:
2206			optval = tp->t_adaptive_rtimo;
2207			break;
2208		case TCP_ADAPTIVE_WRITE_TIMEOUT:
2209			optval = tp->t_adaptive_wtimo;
2210			break;
2211		case SO_TRAFFIC_MGT_BACKGROUND:
2212			optval = (so->so_traffic_mgt_flags &
2213			    TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
2214			break;
2215		default:
2216			error = ENOPROTOOPT;
2217			break;
2218		}
2219		if (error == 0)
2220			error = sooptcopyout(sopt, &optval, sizeof optval);
2221		break;
2222	}
2223done:
2224	return (error);
2225}
2226
/*
 * tcp_sendspace and tcp_recvspace are the default send and receive window
 * sizes, respectively, in bytes.  tcp_attach() passes them to soreserve()
 * when a socket does not yet have both high-water marks set, and both can
 * be changed at run time through sysctl_tcp_sospace().
 * These are obsolescent (this information should be set by the route).
 */
u_int32_t	tcp_sendspace = 256 * 1448;	/* 370688 bytes */
u_int32_t	tcp_recvspace = 384 * 1448;	/* 556032 bytes */
2234
2235/* During attach, the size of socket buffer allocated is limited to
2236 * sb_max in sbreserve. Disallow setting the tcp send and recv space
2237 * to be more than sb_max because that will cause tcp_attach to fail
2238 * (see radar 5713060)
2239 */
2240static int
2241sysctl_tcp_sospace(struct sysctl_oid *oidp, __unused void *arg1,
2242	__unused int arg2, struct sysctl_req *req) {
2243	u_int32_t new_value = 0, *space_p = NULL;
2244	int changed = 0, error = 0;
2245	u_quad_t sb_effective_max = (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES;
2246
2247	switch (oidp->oid_number) {
2248		case TCPCTL_SENDSPACE:
2249			space_p = &tcp_sendspace;
2250			break;
2251		case TCPCTL_RECVSPACE:
2252			space_p = &tcp_recvspace;
2253			break;
2254		default:
2255			return EINVAL;
2256	}
2257	error = sysctl_io_number(req, *space_p, sizeof(u_int32_t),
2258		&new_value, &changed);
2259	if (changed) {
2260		if (new_value > 0 && new_value <= sb_effective_max) {
2261			*space_p = new_value;
2262		} else {
2263			error = ERANGE;
2264		}
2265	}
2266	return error;
2267}
2268
2269SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2270    &tcp_sendspace , 0, &sysctl_tcp_sospace, "IU", "Maximum outgoing TCP datagram size");
2271SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2272    &tcp_recvspace , 0, &sysctl_tcp_sospace, "IU", "Maximum incoming TCP datagram size");
2273
2274
2275/*
2276 * Attach TCP protocol to socket, allocating
2277 * internet protocol control block, tcp control block,
2278 * bufer space, and entering LISTEN state if to accept connections.
2279 *
2280 * Returns:	0			Success
2281 *	in_pcballoc:ENOBUFS
2282 *	in_pcballoc:ENOMEM
2283 *	in_pcballoc:???			[IPSEC specific]
2284 *	soreserve:ENOBUFS
2285 */
2286static int
2287tcp_attach(so, p)
2288	struct socket *so;
2289	struct proc *p;
2290{
2291	register struct tcpcb *tp;
2292	struct inpcb *inp;
2293	int error;
2294#if INET6
2295	int isipv6 = SOCK_CHECK_DOM(so, PF_INET6) != 0;
2296#endif
2297
2298	error = in_pcballoc(so, &tcbinfo, p);
2299	if (error)
2300		return (error);
2301
2302	inp = sotoinpcb(so);
2303
2304	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
2305		error = soreserve(so, tcp_sendspace, tcp_recvspace);
2306		if (error)
2307			return (error);
2308	}
2309	if ((so->so_rcv.sb_flags & SB_USRSIZE) == 0)
2310		so->so_rcv.sb_flags |= SB_AUTOSIZE;
2311	if ((so->so_snd.sb_flags & SB_USRSIZE) == 0)
2312		so->so_snd.sb_flags |= SB_AUTOSIZE;
2313
2314#if INET6
2315	if (isipv6) {
2316		inp->inp_vflag |= INP_IPV6;
2317		inp->in6p_hops = -1;	/* use kernel default */
2318	}
2319	else
2320#endif /* INET6 */
2321	inp->inp_vflag |= INP_IPV4;
2322	tp = tcp_newtcpcb(inp);
2323	if (tp == NULL) {
2324		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
2325
2326		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
2327#if INET6
2328		if (isipv6)
2329			in6_pcbdetach(inp);
2330		else
2331#endif /* INET6 */
2332		in_pcbdetach(inp);
2333		so->so_state |= nofd;
2334		return (ENOBUFS);
2335	}
2336	if (nstat_collect)
2337		nstat_tcp_new_pcb(inp);
2338	tp->t_state = TCPS_CLOSED;
2339	return (0);
2340}
2341
2342/*
2343 * Initiate (or continue) disconnect.
2344 * If embryonic state, just send reset (once).
2345 * If in ``let data drain'' option and linger null, just drop.
2346 * Otherwise (hard), mark socket disconnecting and drop
2347 * current input data; switch states based on user close, and
2348 * send segment to peer (with FIN).
2349 */
2350static struct tcpcb *
2351tcp_disconnect(tp)
2352	register struct tcpcb *tp;
2353{
2354	struct socket *so = tp->t_inpcb->inp_socket;
2355
2356	if (tp->t_state < TCPS_ESTABLISHED)
2357		tp = tcp_close(tp);
2358	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
2359		tp = tcp_drop(tp, 0);
2360	else {
2361		soisdisconnecting(so);
2362		sbflush(&so->so_rcv);
2363		tp = tcp_usrclosed(tp);
2364#if MPTCP
2365		/* A reset has been sent but socket exists, do not send FIN */
2366		if ((so->so_flags & SOF_MP_SUBFLOW) &&
2367		    (tp) && (tp->t_mpflags & TMPF_RESET))
2368			return (tp);
2369#endif
2370		if (tp)
2371			(void) tcp_output(tp);
2372	}
2373	return (tp);
2374}
2375
2376/*
2377 * User issued close, and wish to trail through shutdown states:
2378 * if never received SYN, just forget it.  If got a SYN from peer,
2379 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
2380 * If already got a FIN from peer, then almost done; go to LAST_ACK
2381 * state.  In all other cases, have already sent FIN to peer (e.g.
2382 * after PRU_SHUTDOWN), and just have to play tedious game waiting
2383 * for peer to send FIN or not respond to keep-alives, etc.
2384 * We can let the user exit from the close as soon as the FIN is acked.
2385 */
2386static struct tcpcb *
2387tcp_usrclosed(tp)
2388	register struct tcpcb *tp;
2389{
2390
2391	switch (tp->t_state) {
2392
2393	case TCPS_CLOSED:
2394	case TCPS_LISTEN:
2395		tp = tcp_close(tp);
2396		break;
2397
2398	case TCPS_SYN_SENT:
2399	case TCPS_SYN_RECEIVED:
2400		tp->t_flags |= TF_NEEDFIN;
2401		break;
2402
2403	case TCPS_ESTABLISHED:
2404		DTRACE_TCP4(state__change, void, NULL,
2405			struct inpcb *, tp->t_inpcb,
2406			struct tcpcb *, tp,
2407			int32_t, TCPS_FIN_WAIT_1);
2408		tp->t_state = TCPS_FIN_WAIT_1;
2409		break;
2410
2411	case TCPS_CLOSE_WAIT:
2412		DTRACE_TCP4(state__change, void, NULL,
2413			struct inpcb *, tp->t_inpcb,
2414			struct tcpcb *, tp,
2415			int32_t, TCPS_LAST_ACK);
2416		tp->t_state = TCPS_LAST_ACK;
2417		break;
2418	}
2419	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
2420		soisdisconnected(tp->t_inpcb->inp_socket);
2421		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
2422		if (tp->t_state == TCPS_FIN_WAIT_2)
2423			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
2424				TCP_CONN_MAXIDLE(tp));
2425	}
2426	return (tp);
2427}
2428
2429void
2430tcp_in_cksum_stats(u_int32_t len)
2431{
2432	tcpstat.tcps_rcv_swcsum++;
2433	tcpstat.tcps_rcv_swcsum_bytes += len;
2434}
2435
2436void
2437tcp_out_cksum_stats(u_int32_t len)
2438{
2439	tcpstat.tcps_snd_swcsum++;
2440	tcpstat.tcps_snd_swcsum_bytes += len;
2441}
2442
2443#if INET6
2444void
2445tcp_in6_cksum_stats(u_int32_t len)
2446{
2447	tcpstat.tcps_rcv6_swcsum++;
2448	tcpstat.tcps_rcv6_swcsum_bytes += len;
2449}
2450
2451void
2452tcp_out6_cksum_stats(u_int32_t len)
2453{
2454	tcpstat.tcps_snd6_swcsum++;
2455	tcpstat.tcps_snd6_swcsum_bytes += len;
2456}
2457
2458/*
2459 * When messages are enabled on a TCP socket, the message priority
2460 * is sent as a control message. This function will extract it.
2461 */
2462int
2463tcp_get_msg_priority(struct mbuf *control, uint32_t *msgpri)
2464{
2465	struct cmsghdr *cm;
2466	if (control == NULL)
2467		return(EINVAL);
2468
2469	for (cm = M_FIRST_CMSGHDR(control); cm;
2470		cm = M_NXT_CMSGHDR(control, cm)) {
2471		if (cm->cmsg_len < sizeof(struct cmsghdr) ||
2472			cm->cmsg_len > control->m_len) {
2473			return (EINVAL);
2474		}
2475		if (cm->cmsg_level == SOL_SOCKET &&
2476			cm->cmsg_type == SCM_MSG_PRIORITY) {
2477			*msgpri = *(unsigned int *)(void *)CMSG_DATA(cm);
2478			break;
2479		}
2480	}
2481
2482	VERIFY(*msgpri >= MSG_PRI_MIN && *msgpri <= MSG_PRI_MAX);
2483	return (0);
2484}
2485#endif /* INET6 */
2486