tcp_usrreq.c revision 331722
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2006-2007 Robert N. M. Watson
5 * Copyright (c) 2010-2011 Juniper Networks, Inc.
6 * All rights reserved.
7 *
8 * Portions of this software were developed by Robert N. M. Watson under
9 * contract to Juniper Networks, Inc.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_usrreq.c 331722 2018-03-29 02:50:57Z eadler $");
40
41#include "opt_ddb.h"
42#include "opt_inet.h"
43#include "opt_inet6.h"
44#include "opt_ipsec.h"
45#include "opt_tcpdebug.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/limits.h>
50#include <sys/malloc.h>
51#include <sys/refcount.h>
52#include <sys/kernel.h>
53#include <sys/sysctl.h>
54#include <sys/mbuf.h>
55#ifdef INET6
56#include <sys/domain.h>
57#endif /* INET6 */
58#include <sys/socket.h>
59#include <sys/socketvar.h>
60#include <sys/protosw.h>
61#include <sys/proc.h>
62#include <sys/jail.h>
63#include <sys/syslog.h>
64
65#ifdef DDB
66#include <ddb/ddb.h>
67#endif
68
69#include <net/if.h>
70#include <net/if_var.h>
71#include <net/route.h>
72#include <net/vnet.h>
73
74#include <netinet/in.h>
75#include <netinet/in_kdtrace.h>
76#include <netinet/in_pcb.h>
77#include <netinet/in_systm.h>
78#include <netinet/in_var.h>
79#include <netinet/ip_var.h>
80#ifdef INET6
81#include <netinet/ip6.h>
82#include <netinet6/in6_pcb.h>
83#include <netinet6/ip6_var.h>
84#include <netinet6/scope6_var.h>
85#endif
86#ifdef TCP_RFC7413
87#include <netinet/tcp_fastopen.h>
88#endif
89#include <netinet/tcp.h>
90#include <netinet/tcp_fsm.h>
91#include <netinet/tcp_seq.h>
92#include <netinet/tcp_timer.h>
93#include <netinet/tcp_var.h>
94#include <netinet/tcpip.h>
95#include <netinet/cc/cc.h>
96#ifdef TCPPCAP
97#include <netinet/tcp_pcap.h>
98#endif
99#ifdef TCPDEBUG
100#include <netinet/tcp_debug.h>
101#endif
102#ifdef TCP_OFFLOAD
103#include <netinet/tcp_offload.h>
104#endif
105#include <netipsec/ipsec_support.h>
106
/*
 * TCP protocol interface to socket abstraction.
 *
 * Forward declarations of the file-local (static) helpers used by the
 * pru_*() handlers in this file.  tcp_connect() is only declared when the
 * kernel is built with INET, and tcp6_connect() only with INET6.
 */
static int	tcp_attach(struct socket *);
#ifdef INET
static int	tcp_connect(struct tcpcb *, struct sockaddr *,
		    struct thread *td);
#endif /* INET */
#ifdef INET6
static int	tcp6_connect(struct tcpcb *, struct sockaddr *,
		    struct thread *td);
#endif /* INET6 */
static void	tcp_disconnect(struct tcpcb *);
static void	tcp_usrclosed(struct tcpcb *);
static void	tcp_fill_info(struct tcpcb *, struct tcp_info *);
122
/*
 * User-request tracing helpers.  They rely on locals named `tp' and `so'
 * being in scope at the point of use:
 *
 *   TCPDEBUG0      declares `ostate' (the state before the request).
 *   TCPDEBUG1()    records tp->t_state in `ostate' (0 when tp is NULL).
 *   TCPDEBUG2(req) calls tcp_trace() for `req' when tp is non-NULL and
 *                  SO_DEBUG is set on the socket.
 *
 * TCPDEBUG2() is wrapped in do { } while (0) so that it behaves as a
 * single statement and cannot capture the `else' of an enclosing
 * unbraced `if'.  Without TCPDEBUG all three expand to nothing.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, (req));		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
133
134/*
135 * TCP attaches to socket via pru_attach(), reserving space,
136 * and an internet control block.
137 */
138static int
139tcp_usr_attach(struct socket *so, int proto, struct thread *td)
140{
141	struct inpcb *inp;
142	struct tcpcb *tp = NULL;
143	int error;
144	TCPDEBUG0;
145
146	inp = sotoinpcb(so);
147	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
148	TCPDEBUG1();
149
150	error = tcp_attach(so);
151	if (error)
152		goto out;
153
154	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
155		so->so_linger = TCP_LINGERTIME;
156
157	inp = sotoinpcb(so);
158	tp = intotcpcb(inp);
159out:
160	TCPDEBUG2(PRU_ATTACH);
161	TCP_PROBE2(debug__user, tp, PRU_ATTACH);
162	return error;
163}
164
165/*
166 * tcp_detach is called when the socket layer loses its final reference
167 * to the socket, be it a file descriptor reference, a reference from TCP,
168 * etc.  At this point, there is only one case in which we will keep around
169 * inpcb state: time wait.
170 *
171 * This function can probably be re-absorbed back into tcp_usr_detach() now
172 * that there is a single detach path.
173 */
174static void
175tcp_detach(struct socket *so, struct inpcb *inp)
176{
177	struct tcpcb *tp;
178
179	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
180	INP_WLOCK_ASSERT(inp);
181
182	KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
183	KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
184
185	tp = intotcpcb(inp);
186
187	if (inp->inp_flags & INP_TIMEWAIT) {
188		/*
189		 * There are two cases to handle: one in which the time wait
190		 * state is being discarded (INP_DROPPED), and one in which
191		 * this connection will remain in timewait.  In the former,
192		 * it is time to discard all state (except tcptw, which has
193		 * already been discarded by the timewait close code, which
194		 * should be further up the call stack somewhere).  In the
195		 * latter case, we detach from the socket, but leave the pcb
196		 * present until timewait ends.
197		 *
198		 * XXXRW: Would it be cleaner to free the tcptw here?
199		 *
200		 * Astute question indeed, from twtcp perspective there are
201		 * three cases to consider:
202		 *
203		 * #1 tcp_detach is called at tcptw creation time by
204		 *  tcp_twstart, then do not discard the newly created tcptw
205		 *  and leave inpcb present until timewait ends
206		 * #2 tcp_detach is called at timewait end (or reuse) by
207		 *  tcp_twclose, then the tcptw has already been discarded
208		 *  (or reused) and inpcb is freed here
209		 * #3 tcp_detach is called() after timewait ends (or reuse)
210		 *  (e.g. by soclose), then tcptw has already been discarded
211		 *  (or reused) and inpcb is freed here
212		 *
213		 *  In all three cases the tcptw should not be freed here.
214		 */
215		if (inp->inp_flags & INP_DROPPED) {
216			in_pcbdetach(inp);
217			if (__predict_true(tp == NULL)) {
218				in_pcbfree(inp);
219			} else {
220				/*
221				 * This case should not happen as in TIMEWAIT
222				 * state the inp should not be destroyed before
223				 * its tcptw.  If INVARIANTS is defined, panic.
224				 */
225#ifdef INVARIANTS
226				panic("%s: Panic before an inp double-free: "
227				    "INP_TIMEWAIT && INP_DROPPED && tp != NULL"
228				    , __func__);
229#else
230				log(LOG_ERR, "%s: Avoid an inp double-free: "
231				    "INP_TIMEWAIT && INP_DROPPED && tp != NULL"
232				    , __func__);
233#endif
234				INP_WUNLOCK(inp);
235			}
236		} else {
237			in_pcbdetach(inp);
238			INP_WUNLOCK(inp);
239		}
240	} else {
241		/*
242		 * If the connection is not in timewait, we consider two
243		 * two conditions: one in which no further processing is
244		 * necessary (dropped || embryonic), and one in which TCP is
245		 * not yet done, but no longer requires the socket, so the
246		 * pcb will persist for the time being.
247		 *
248		 * XXXRW: Does the second case still occur?
249		 */
250		if (inp->inp_flags & INP_DROPPED ||
251		    tp->t_state < TCPS_SYN_SENT) {
252			tcp_discardcb(tp);
253			in_pcbdetach(inp);
254			in_pcbfree(inp);
255		} else {
256			in_pcbdetach(inp);
257			INP_WUNLOCK(inp);
258		}
259	}
260}
261
262/*
263 * pru_detach() detaches the TCP protocol from the socket.
264 * If the protocol state is non-embryonic, then can't
265 * do this directly: have to initiate a pru_disconnect(),
266 * which may finish later; embryonic TCB's can just
267 * be discarded here.
268 */
269static void
270tcp_usr_detach(struct socket *so)
271{
272	struct inpcb *inp;
273	int rlock = 0;
274
275	inp = sotoinpcb(so);
276	KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
277	if (!INP_INFO_WLOCKED(&V_tcbinfo)) {
278		INP_INFO_RLOCK(&V_tcbinfo);
279		rlock = 1;
280	}
281	INP_WLOCK(inp);
282	KASSERT(inp->inp_socket != NULL,
283	    ("tcp_usr_detach: inp_socket == NULL"));
284	tcp_detach(so, inp);
285	if (rlock)
286		INP_INFO_RUNLOCK(&V_tcbinfo);
287}
288
289#ifdef INET
290/*
291 * Give the socket an address.
292 */
293static int
294tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
295{
296	int error = 0;
297	struct inpcb *inp;
298	struct tcpcb *tp = NULL;
299	struct sockaddr_in *sinp;
300
301	sinp = (struct sockaddr_in *)nam;
302	if (nam->sa_len != sizeof (*sinp))
303		return (EINVAL);
304	/*
305	 * Must check for multicast addresses and disallow binding
306	 * to them.
307	 */
308	if (sinp->sin_family == AF_INET &&
309	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
310		return (EAFNOSUPPORT);
311
312	TCPDEBUG0;
313	inp = sotoinpcb(so);
314	KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
315	INP_WLOCK(inp);
316	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
317		error = EINVAL;
318		goto out;
319	}
320	tp = intotcpcb(inp);
321	TCPDEBUG1();
322	INP_HASH_WLOCK(&V_tcbinfo);
323	error = in_pcbbind(inp, nam, td->td_ucred);
324	INP_HASH_WUNLOCK(&V_tcbinfo);
325out:
326	TCPDEBUG2(PRU_BIND);
327	TCP_PROBE2(debug__user, tp, PRU_BIND);
328	INP_WUNLOCK(inp);
329
330	return (error);
331}
332#endif /* INET */
333
334#ifdef INET6
335static int
336tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
337{
338	int error = 0;
339	struct inpcb *inp;
340	struct tcpcb *tp = NULL;
341	struct sockaddr_in6 *sin6p;
342
343	sin6p = (struct sockaddr_in6 *)nam;
344	if (nam->sa_len != sizeof (*sin6p))
345		return (EINVAL);
346	/*
347	 * Must check for multicast addresses and disallow binding
348	 * to them.
349	 */
350	if (sin6p->sin6_family == AF_INET6 &&
351	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
352		return (EAFNOSUPPORT);
353
354	TCPDEBUG0;
355	inp = sotoinpcb(so);
356	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
357	INP_WLOCK(inp);
358	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
359		error = EINVAL;
360		goto out;
361	}
362	tp = intotcpcb(inp);
363	TCPDEBUG1();
364	INP_HASH_WLOCK(&V_tcbinfo);
365	inp->inp_vflag &= ~INP_IPV4;
366	inp->inp_vflag |= INP_IPV6;
367#ifdef INET
368	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
369		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
370			inp->inp_vflag |= INP_IPV4;
371		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
372			struct sockaddr_in sin;
373
374			in6_sin6_2_sin(&sin, sin6p);
375			inp->inp_vflag |= INP_IPV4;
376			inp->inp_vflag &= ~INP_IPV6;
377			error = in_pcbbind(inp, (struct sockaddr *)&sin,
378			    td->td_ucred);
379			INP_HASH_WUNLOCK(&V_tcbinfo);
380			goto out;
381		}
382	}
383#endif
384	error = in6_pcbbind(inp, nam, td->td_ucred);
385	INP_HASH_WUNLOCK(&V_tcbinfo);
386out:
387	TCPDEBUG2(PRU_BIND);
388	TCP_PROBE2(debug__user, tp, PRU_BIND);
389	INP_WUNLOCK(inp);
390	return (error);
391}
392#endif /* INET6 */
393
394#ifdef INET
395/*
396 * Prepare to accept connections.
397 */
398static int
399tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
400{
401	int error = 0;
402	struct inpcb *inp;
403	struct tcpcb *tp = NULL;
404
405	TCPDEBUG0;
406	inp = sotoinpcb(so);
407	KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
408	INP_WLOCK(inp);
409	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
410		error = EINVAL;
411		goto out;
412	}
413	tp = intotcpcb(inp);
414	TCPDEBUG1();
415	SOCK_LOCK(so);
416	error = solisten_proto_check(so);
417	INP_HASH_WLOCK(&V_tcbinfo);
418	if (error == 0 && inp->inp_lport == 0)
419		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
420	INP_HASH_WUNLOCK(&V_tcbinfo);
421	if (error == 0) {
422		tcp_state_change(tp, TCPS_LISTEN);
423		solisten_proto(so, backlog);
424#ifdef TCP_OFFLOAD
425		if ((so->so_options & SO_NO_OFFLOAD) == 0)
426			tcp_offload_listen_start(tp);
427#endif
428	}
429	SOCK_UNLOCK(so);
430
431#ifdef TCP_RFC7413
432	if (tp->t_flags & TF_FASTOPEN)
433		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
434#endif
435out:
436	TCPDEBUG2(PRU_LISTEN);
437	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
438	INP_WUNLOCK(inp);
439	return (error);
440}
441#endif /* INET */
442
443#ifdef INET6
444static int
445tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
446{
447	int error = 0;
448	struct inpcb *inp;
449	struct tcpcb *tp = NULL;
450
451	TCPDEBUG0;
452	inp = sotoinpcb(so);
453	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
454	INP_WLOCK(inp);
455	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
456		error = EINVAL;
457		goto out;
458	}
459	tp = intotcpcb(inp);
460	TCPDEBUG1();
461	SOCK_LOCK(so);
462	error = solisten_proto_check(so);
463	INP_HASH_WLOCK(&V_tcbinfo);
464	if (error == 0 && inp->inp_lport == 0) {
465		inp->inp_vflag &= ~INP_IPV4;
466		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
467			inp->inp_vflag |= INP_IPV4;
468		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
469	}
470	INP_HASH_WUNLOCK(&V_tcbinfo);
471	if (error == 0) {
472		tcp_state_change(tp, TCPS_LISTEN);
473		solisten_proto(so, backlog);
474#ifdef TCP_OFFLOAD
475		if ((so->so_options & SO_NO_OFFLOAD) == 0)
476			tcp_offload_listen_start(tp);
477#endif
478	}
479	SOCK_UNLOCK(so);
480
481#ifdef TCP_RFC7413
482	if (tp->t_flags & TF_FASTOPEN)
483		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
484#endif
485out:
486	TCPDEBUG2(PRU_LISTEN);
487	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
488	INP_WUNLOCK(inp);
489	return (error);
490}
491#endif /* INET6 */
492
493#ifdef INET
/*
 * pru_connect() for IPv4: initiate a connection to a peer.
 *
 * Before the pcb is touched the sockaddr length is checked (EINVAL),
 * AF_INET multicast destinations are refused (EAFNOSUPPORT) and
 * prison_remote_ip4() is consulted.  A pcb in timewait yields EADDRINUSE
 * and a dropped pcb ECONNREFUSED.
 *
 * tcp_connect() then sets the connection up.  With TCP_OFFLOAD, a
 * successful tcp_offload_connect() finishes the request; otherwise the
 * keep timer is armed with TP_KEEPINIT() and the stack's output routine
 * is called to send the initial segment, whose result is returned.
 */
static int
tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct tcpcb *tp = NULL;
	struct inpcb *inp;
	int error;

	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);

	/* TCP ``connections'' to multicast addresses are not allowed. */
	if (sin->sin_family == AF_INET &&
	    IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
	if (error != 0)
		return (error);

	TCPDEBUG0;
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		error = EADDRINUSE;
		goto out;
	}
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		error = ECONNREFUSED;
		goto out;
	}
	tp = intotcpcb(inp);
	TCPDEBUG1();

	error = tcp_connect(tp, nam, td);
	if (error != 0)
		goto out;
#ifdef TCP_OFFLOAD
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0) {
		/* A successful offload connect needs no software output. */
		error = tcp_offload_connect(so, nam);
		if (error == 0)
			goto out;
	}
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tp->t_fb->tfb_tcp_output(tp);
out:
	TCPDEBUG2(PRU_CONNECT);
	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
551#endif /* INET */
552
553#ifdef INET6
/*
 * pru_connect() for IPv6 sockets: initiate a connection to a peer.
 *
 * The sockaddr length is checked (EINVAL) and AF_INET6 multicast
 * destinations are refused (EAFNOSUPPORT).  A pcb in timewait yields
 * EADDRINUSE and a dropped pcb ECONNREFUSED.
 *
 * With INET compiled in, a v4-mapped destination is converted and
 * connected through tcp_connect() as an IPv4 connection (EINVAL when
 * IPV6_V6ONLY is set, EAFNOSUPPORT when the pcb does not have INP_IPV4);
 * any other destination requires INP_IPV6 and goes through
 * tcp6_connect().
 *
 * Both paths rewrite inp_vflag (and the IPv6 path inc_flags) before the
 * jail check and the connect are attempted.  These are put back on
 * failure while the pcb is still unbound (inp_lport == 0); otherwise one
 * failed connect would make a later attempt to the other address family
 * fail the INP_IPV4/INP_IPV6 checks above with EAFNOSUPPORT.
 * NOTE(review): the flags are deliberately left alone once a local port
 * is set, on the assumption that the pcb may already be hashed using
 * them -- confirm against in_pcbbind()/in6_pcbbind().
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in6 *sin6p;
	int vflagsav, incflagsav;

	TCPDEBUG0;

	sin6p = (struct sockaddr_in6 *)nam;
	if (nam->sa_len != sizeof (*sin6p))
		return (EINVAL);
	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	if (sin6p->sin6_family == AF_INET6
	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
		return (EAFNOSUPPORT);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	/* Saved before any early exit so the restore at `out' is valid. */
	vflagsav = inp->inp_vflag;
	incflagsav = inp->inp_inc.inc_flags;
	if (inp->inp_flags & INP_TIMEWAIT) {
		error = EADDRINUSE;
		goto out;
	}
	if (inp->inp_flags & INP_DROPPED) {
		error = ECONNREFUSED;
		goto out;
	}
	tp = intotcpcb(inp);
	TCPDEBUG1();
#ifdef INET
	/*
	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
	 * therefore probably require the hash lock, which isn't held here.
	 * Is this a significant problem?
	 */
	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
		struct sockaddr_in sin;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}
		if ((inp->inp_vflag & INP_IPV4) == 0) {
			error = EAFNOSUPPORT;
			goto out;
		}

		in6_sin6_2_sin(&sin, sin6p);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		if ((error = prison_remote_ip4(td->td_ucred,
		    &sin.sin_addr)) != 0)
			goto out;
		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
			goto out;
#ifdef TCP_OFFLOAD
		if (registered_toedevs > 0 &&
		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
		    (error = tcp_offload_connect(so, nam)) == 0)
			goto out;
#endif
		/*
		 * NOTE(review): unlike tcp_usr_connect() and the IPv6 path
		 * below, the keep timer (TT_KEEP/TP_KEEPINIT) is not armed
		 * before output here -- confirm whether that is intended.
		 */
		error = tp->t_fb->tfb_tcp_output(tp);
		goto out;
	} else {
		if ((inp->inp_vflag & INP_IPV6) == 0) {
			error = EAFNOSUPPORT;
			goto out;
		}
	}
#endif
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_flags |= INC_ISIPV6;
	if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0)
		goto out;
	if ((error = tcp6_connect(tp, nam, td)) != 0)
		goto out;
#ifdef TCP_OFFLOAD
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
	    (error = tcp_offload_connect(so, nam)) == 0)
		goto out;
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tp->t_fb->tfb_tcp_output(tp);

out:
	/*
	 * If the connect failed and the pcb is still unbound, restore the
	 * address-family flags modified above.
	 */
	if (error != 0 && inp->inp_lport == 0) {
		inp->inp_vflag = vflagsav;
		inp->inp_inc.inc_flags = incflagsav;
	}
	TCPDEBUG2(PRU_CONNECT);
	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
650#endif /* INET6 */
651
652/*
653 * Initiate disconnect from peer.
654 * If connection never passed embryonic stage, just drop;
655 * else if don't need to let data drain, then can just drop anyways,
656 * else have to begin TCP shutdown process: mark socket disconnecting,
657 * drain unread data, state switch to reflect user close, and
658 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
659 * when peer sends FIN and acks ours.
660 *
661 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
662 */
663static int
664tcp_usr_disconnect(struct socket *so)
665{
666	struct inpcb *inp;
667	struct tcpcb *tp = NULL;
668	int error = 0;
669
670	TCPDEBUG0;
671	INP_INFO_RLOCK(&V_tcbinfo);
672	inp = sotoinpcb(so);
673	KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
674	INP_WLOCK(inp);
675	if (inp->inp_flags & INP_TIMEWAIT)
676		goto out;
677	if (inp->inp_flags & INP_DROPPED) {
678		error = ECONNRESET;
679		goto out;
680	}
681	tp = intotcpcb(inp);
682	TCPDEBUG1();
683	tcp_disconnect(tp);
684out:
685	TCPDEBUG2(PRU_DISCONNECT);
686	TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
687	INP_WUNLOCK(inp);
688	INP_INFO_RUNLOCK(&V_tcbinfo);
689	return (error);
690}
691
692#ifdef INET
693/*
694 * Accept a connection.  Essentially all the work is done at higher levels;
695 * just return the address of the peer, storing through addr.
696 */
697static int
698tcp_usr_accept(struct socket *so, struct sockaddr **nam)
699{
700	int error = 0;
701	struct inpcb *inp = NULL;
702	struct tcpcb *tp = NULL;
703	struct in_addr addr;
704	in_port_t port = 0;
705	TCPDEBUG0;
706
707	if (so->so_state & SS_ISDISCONNECTED)
708		return (ECONNABORTED);
709
710	inp = sotoinpcb(so);
711	KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
712	INP_WLOCK(inp);
713	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
714		error = ECONNABORTED;
715		goto out;
716	}
717	tp = intotcpcb(inp);
718	TCPDEBUG1();
719
720	/*
721	 * We inline in_getpeeraddr and COMMON_END here, so that we can
722	 * copy the data of interest and defer the malloc until after we
723	 * release the lock.
724	 */
725	port = inp->inp_fport;
726	addr = inp->inp_faddr;
727
728out:
729	TCPDEBUG2(PRU_ACCEPT);
730	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
731	INP_WUNLOCK(inp);
732	if (error == 0)
733		*nam = in_sockaddr(port, &addr);
734	return error;
735}
736#endif /* INET */
737
738#ifdef INET6
739static int
740tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
741{
742	struct inpcb *inp = NULL;
743	int error = 0;
744	struct tcpcb *tp = NULL;
745	struct in_addr addr;
746	struct in6_addr addr6;
747	in_port_t port = 0;
748	int v4 = 0;
749	TCPDEBUG0;
750
751	if (so->so_state & SS_ISDISCONNECTED)
752		return (ECONNABORTED);
753
754	inp = sotoinpcb(so);
755	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
756	INP_INFO_RLOCK(&V_tcbinfo);
757	INP_WLOCK(inp);
758	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
759		error = ECONNABORTED;
760		goto out;
761	}
762	tp = intotcpcb(inp);
763	TCPDEBUG1();
764
765	/*
766	 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
767	 * copy the data of interest and defer the malloc until after we
768	 * release the lock.
769	 */
770	if (inp->inp_vflag & INP_IPV4) {
771		v4 = 1;
772		port = inp->inp_fport;
773		addr = inp->inp_faddr;
774	} else {
775		port = inp->inp_fport;
776		addr6 = inp->in6p_faddr;
777	}
778
779out:
780	TCPDEBUG2(PRU_ACCEPT);
781	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
782	INP_WUNLOCK(inp);
783	INP_INFO_RUNLOCK(&V_tcbinfo);
784	if (error == 0) {
785		if (v4)
786			*nam = in6_v4mapsin6_sockaddr(port, &addr);
787		else
788			*nam = in6_sockaddr(port, &addr6);
789	}
790	return error;
791}
792#endif /* INET6 */
793
794/*
795 * Mark the connection as being incapable of further output.
796 */
797static int
798tcp_usr_shutdown(struct socket *so)
799{
800	int error = 0;
801	struct inpcb *inp;
802	struct tcpcb *tp = NULL;
803
804	TCPDEBUG0;
805	INP_INFO_RLOCK(&V_tcbinfo);
806	inp = sotoinpcb(so);
807	KASSERT(inp != NULL, ("inp == NULL"));
808	INP_WLOCK(inp);
809	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
810		error = ECONNRESET;
811		goto out;
812	}
813	tp = intotcpcb(inp);
814	TCPDEBUG1();
815	socantsendmore(so);
816	tcp_usrclosed(tp);
817	if (!(inp->inp_flags & INP_DROPPED))
818		error = tp->t_fb->tfb_tcp_output(tp);
819
820out:
821	TCPDEBUG2(PRU_SHUTDOWN);
822	TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
823	INP_WUNLOCK(inp);
824	INP_INFO_RUNLOCK(&V_tcbinfo);
825
826	return (error);
827}
828
829/*
830 * After a receive, possibly send window update to peer.
831 */
832static int
833tcp_usr_rcvd(struct socket *so, int flags)
834{
835	struct inpcb *inp;
836	struct tcpcb *tp = NULL;
837	int error = 0;
838
839	TCPDEBUG0;
840	inp = sotoinpcb(so);
841	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
842	INP_WLOCK(inp);
843	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
844		error = ECONNRESET;
845		goto out;
846	}
847	tp = intotcpcb(inp);
848	TCPDEBUG1();
849#ifdef TCP_RFC7413
850	/*
851	 * For passively-created TFO connections, don't attempt a window
852	 * update while still in SYN_RECEIVED as this may trigger an early
853	 * SYN|ACK.  It is preferable to have the SYN|ACK be sent along with
854	 * application response data, or failing that, when the DELACK timer
855	 * expires.
856	 */
857	if ((tp->t_flags & TF_FASTOPEN) &&
858	    (tp->t_state == TCPS_SYN_RECEIVED))
859		goto out;
860#endif
861#ifdef TCP_OFFLOAD
862	if (tp->t_flags & TF_TOE)
863		tcp_offload_rcvd(tp);
864	else
865#endif
866	tp->t_fb->tfb_tcp_output(tp);
867
868out:
869	TCPDEBUG2(PRU_RCVD);
870	TCP_PROBE2(debug__user, tp, PRU_RCVD);
871	INP_WUNLOCK(inp);
872	return (error);
873}
874
/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * Parameters:
 *   so       socket being written to.
 *   flags    PRUS_* request flags; PRUS_OOB, PRUS_EOF, PRUS_MORETOCOME
 *            and PRUS_NOTREADY are examined here.
 *   m        data mbuf chain, appended to so->so_snd.
 *   nam      optional destination; when non-NULL and the connection is
 *            in a state below TCPS_SYN_SENT an implied connect is done.
 *   control  optional control mbuf; always freed here, and a non-empty
 *            one makes the request fail.
 *   td       calling thread, passed on to tcp_connect()/tcp6_connect().
 *
 * Returns 0 on success, ECONNRESET when the pcb is in timewait or has
 * been dropped, EINVAL for non-empty control data, ENOBUFS when an
 * out-of-band send finds sbspace() below -512, or the error reported by
 * the implied connect or by the stack's output routine.
 */
static int
tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
#ifdef INET6
	int isipv6;
#endif
	TCPDEBUG0;

	/*
	 * We require the pcbinfo lock if we will close the socket as part of
	 * this call.
	 */
	if (flags & PRUS_EOF)
		INP_INFO_RLOCK(&V_tcbinfo);
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		if (control)
			m_freem(control);
		/*
		 * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible
		 * for freeing memory.
		 */
		if (m && (flags & PRUS_NOTREADY) == 0)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
#ifdef INET6
	/*
	 * The destination family selects which connect routine an implied
	 * connect uses below.
	 * NOTE(review): nam->sa_len is not checked anywhere in this
	 * function, unlike in tcp_usr_connect()/tcp6_usr_connect() --
	 * confirm that tcp_connect()/tcp6_connect() cope with a short
	 * sockaddr.
	 */
	isipv6 = nam && nam->sa_family == AF_INET6;
#endif /* INET6 */
	tp = intotcpcb(inp);
	TCPDEBUG1();
	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			m_freem(control);
			/*
			 * NOTE(review): m is freed here without the
			 * PRUS_NOTREADY test used in the timewait/dropped
			 * path above -- confirm this combination cannot
			 * occur.
			 */
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		/* From here on the data belongs to the send buffer. */
		sbappendstream(&so->so_snd, m, flags);
		if (nam && tp->t_state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected,
			 * initialize window to default value, and
			 * initialize maxseg using peer's cached MSS.
			 */
#ifdef INET6
			if (isipv6)
				error = tcp6_connect(tp, nam, td);
#endif /* INET6 */
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
				error = tcp_connect(tp, nam, td);
#endif
			/*
			 * NOTE(review): on failure the data appended above
			 * stays queued in so_snd -- confirm intended.
			 */
			if (error)
				goto out;
			tp->snd_wnd = TTCP_CLIENT_SND_WND;
			tcp_mss(tp, -1);
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
			socantsendmore(so);
			tcp_usrclosed(tp);
		}
		/*
		 * Output is skipped when the pcb has been dropped by now or
		 * when the data is flagged PRUS_NOTREADY.  TF_MORETOCOME is
		 * set only for the duration of the output call.
		 */
		if (!(inp->inp_flags & INP_DROPPED) &&
		    !(flags & PRUS_NOTREADY)) {
			if (flags & PRUS_MORETOCOME)
				tp->t_flags |= TF_MORETOCOME;
			error = tp->t_fb->tfb_tcp_output(tp);
			if (flags & PRUS_MORETOCOME)
				tp->t_flags &= ~TF_MORETOCOME;
		}
	} else {
		/*
		 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
		 */
		SOCKBUF_LOCK(&so->so_snd);
		/* Refuse OOB data once sbspace() has dropped below -512. */
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream_locked(&so->so_snd, m, flags);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && tp->t_state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected,
			 * initialize window to default value, and
			 * initialize maxseg using peer's cached MSS.
			 */
#ifdef INET6
			if (isipv6)
				error = tcp6_connect(tp, nam, td);
#endif /* INET6 */
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
				error = tcp_connect(tp, nam, td);
#endif
			/*
			 * NOTE(review): as in the non-OOB path, the data is
			 * already in so_snd when the connect fails.
			 */
			if (error)
				goto out;
			tp->snd_wnd = TTCP_CLIENT_SND_WND;
			tcp_mss(tp, -1);
		}
		/* The urgent pointer covers everything queued so far. */
		tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
		if (!(flags & PRUS_NOTREADY)) {
			/* TF_FORCEDATA is set only around the output call. */
			tp->t_flags |= TF_FORCEDATA;
			error = tp->t_fb->tfb_tcp_output(tp);
			tp->t_flags &= ~TF_FORCEDATA;
		}
	}
out:
	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
		  ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
	TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
	INP_WUNLOCK(inp);
	/* Matches the conditional INP_INFO_RLOCK() taken on entry. */
	if (flags & PRUS_EOF)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	return (error);
}
1030
1031static int
1032tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
1033{
1034	struct inpcb *inp;
1035	struct tcpcb *tp;
1036	int error;
1037
1038	inp = sotoinpcb(so);
1039	INP_WLOCK(inp);
1040	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1041		INP_WUNLOCK(inp);
1042		for (int i = 0; i < count; i++)
1043			m = m_free(m);
1044		return (ECONNRESET);
1045	}
1046	tp = intotcpcb(inp);
1047
1048	SOCKBUF_LOCK(&so->so_snd);
1049	error = sbready(&so->so_snd, m, count);
1050	SOCKBUF_UNLOCK(&so->so_snd);
1051	if (error == 0)
1052		error = tp->t_fb->tfb_tcp_output(tp);
1053	INP_WUNLOCK(inp);
1054
1055	return (error);
1056}
1057
1058/*
1059 * Abort the TCP.  Drop the connection abruptly.
1060 */
1061static void
1062tcp_usr_abort(struct socket *so)
1063{
1064	struct inpcb *inp;
1065	struct tcpcb *tp = NULL;
1066	TCPDEBUG0;
1067
1068	inp = sotoinpcb(so);
1069	KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
1070
1071	INP_INFO_RLOCK(&V_tcbinfo);
1072	INP_WLOCK(inp);
1073	KASSERT(inp->inp_socket != NULL,
1074	    ("tcp_usr_abort: inp_socket == NULL"));
1075
1076	/*
1077	 * If we still have full TCP state, and we're not dropped, drop.
1078	 */
1079	if (!(inp->inp_flags & INP_TIMEWAIT) &&
1080	    !(inp->inp_flags & INP_DROPPED)) {
1081		tp = intotcpcb(inp);
1082		TCPDEBUG1();
1083		tcp_drop(tp, ECONNABORTED);
1084		TCPDEBUG2(PRU_ABORT);
1085		TCP_PROBE2(debug__user, tp, PRU_ABORT);
1086	}
1087	if (!(inp->inp_flags & INP_DROPPED)) {
1088		SOCK_LOCK(so);
1089		so->so_state |= SS_PROTOREF;
1090		SOCK_UNLOCK(so);
1091		inp->inp_flags |= INP_SOCKREF;
1092	}
1093	INP_WUNLOCK(inp);
1094	INP_INFO_RUNLOCK(&V_tcbinfo);
1095}
1096
1097/*
1098 * TCP socket is closed.  Start friendly disconnect.
1099 */
1100static void
1101tcp_usr_close(struct socket *so)
1102{
1103	struct inpcb *inp;
1104	struct tcpcb *tp = NULL;
1105	TCPDEBUG0;
1106
1107	inp = sotoinpcb(so);
1108	KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
1109
1110	INP_INFO_RLOCK(&V_tcbinfo);
1111	INP_WLOCK(inp);
1112	KASSERT(inp->inp_socket != NULL,
1113	    ("tcp_usr_close: inp_socket == NULL"));
1114
1115	/*
1116	 * If we still have full TCP state, and we're not dropped, initiate
1117	 * a disconnect.
1118	 */
1119	if (!(inp->inp_flags & INP_TIMEWAIT) &&
1120	    !(inp->inp_flags & INP_DROPPED)) {
1121		tp = intotcpcb(inp);
1122		TCPDEBUG1();
1123		tcp_disconnect(tp);
1124		TCPDEBUG2(PRU_CLOSE);
1125		TCP_PROBE2(debug__user, tp, PRU_CLOSE);
1126	}
1127	if (!(inp->inp_flags & INP_DROPPED)) {
1128		SOCK_LOCK(so);
1129		so->so_state |= SS_PROTOREF;
1130		SOCK_UNLOCK(so);
1131		inp->inp_flags |= INP_SOCKREF;
1132	}
1133	INP_WUNLOCK(inp);
1134	INP_INFO_RUNLOCK(&V_tcbinfo);
1135}
1136
1137/*
1138 * Receive out-of-band data.
1139 */
1140static int
1141tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1142{
1143	int error = 0;
1144	struct inpcb *inp;
1145	struct tcpcb *tp = NULL;
1146
1147	TCPDEBUG0;
1148	inp = sotoinpcb(so);
1149	KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
1150	INP_WLOCK(inp);
1151	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1152		error = ECONNRESET;
1153		goto out;
1154	}
1155	tp = intotcpcb(inp);
1156	TCPDEBUG1();
1157	if ((so->so_oobmark == 0 &&
1158	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1159	    so->so_options & SO_OOBINLINE ||
1160	    tp->t_oobflags & TCPOOB_HADDATA) {
1161		error = EINVAL;
1162		goto out;
1163	}
1164	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1165		error = EWOULDBLOCK;
1166		goto out;
1167	}
1168	m->m_len = 1;
1169	*mtod(m, caddr_t) = tp->t_iobc;
1170	if ((flags & MSG_PEEK) == 0)
1171		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1172
1173out:
1174	TCPDEBUG2(PRU_RCVOOB);
1175	TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
1176	INP_WUNLOCK(inp);
1177	return (error);
1178}
1179
1180#ifdef INET
1181struct pr_usrreqs tcp_usrreqs = {
1182	.pru_abort =		tcp_usr_abort,
1183	.pru_accept =		tcp_usr_accept,
1184	.pru_attach =		tcp_usr_attach,
1185	.pru_bind =		tcp_usr_bind,
1186	.pru_connect =		tcp_usr_connect,
1187	.pru_control =		in_control,
1188	.pru_detach =		tcp_usr_detach,
1189	.pru_disconnect =	tcp_usr_disconnect,
1190	.pru_listen =		tcp_usr_listen,
1191	.pru_peeraddr =		in_getpeeraddr,
1192	.pru_rcvd =		tcp_usr_rcvd,
1193	.pru_rcvoob =		tcp_usr_rcvoob,
1194	.pru_send =		tcp_usr_send,
1195	.pru_ready =		tcp_usr_ready,
1196	.pru_shutdown =		tcp_usr_shutdown,
1197	.pru_sockaddr =		in_getsockaddr,
1198	.pru_sosetlabel =	in_pcbsosetlabel,
1199	.pru_close =		tcp_usr_close,
1200};
1201#endif /* INET */
1202
1203#ifdef INET6
1204struct pr_usrreqs tcp6_usrreqs = {
1205	.pru_abort =		tcp_usr_abort,
1206	.pru_accept =		tcp6_usr_accept,
1207	.pru_attach =		tcp_usr_attach,
1208	.pru_bind =		tcp6_usr_bind,
1209	.pru_connect =		tcp6_usr_connect,
1210	.pru_control =		in6_control,
1211	.pru_detach =		tcp_usr_detach,
1212	.pru_disconnect =	tcp_usr_disconnect,
1213	.pru_listen =		tcp6_usr_listen,
1214	.pru_peeraddr =		in6_mapped_peeraddr,
1215	.pru_rcvd =		tcp_usr_rcvd,
1216	.pru_rcvoob =		tcp_usr_rcvoob,
1217	.pru_send =		tcp_usr_send,
1218	.pru_ready =		tcp_usr_ready,
1219	.pru_shutdown =		tcp_usr_shutdown,
1220	.pru_sockaddr =		in6_mapped_sockaddr,
1221	.pru_sosetlabel =	in_pcbsosetlabel,
1222	.pru_close =		tcp_usr_close,
1223};
1224#endif /* INET6 */
1225
1226#ifdef INET
1227/*
1228 * Common subroutine to open a TCP connection to remote host specified
1229 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
1230 * port number if needed.  Call in_pcbconnect_setup to do the routing and
1231 * to choose a local host address (interface).  If there is an existing
1232 * incarnation of the same connection in TIME-WAIT state and if the remote
1233 * host was sending CC options and if the connection duration was < MSL, then
1234 * truncate the previous TIME-WAIT state and proceed.
1235 * Initialize connection parameters and enter SYN-SENT state.
1236 */
1237static int
1238tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
1239{
1240	struct inpcb *inp = tp->t_inpcb, *oinp;
1241	struct socket *so = inp->inp_socket;
1242	struct in_addr laddr;
1243	u_short lport;
1244	int error;
1245
1246	INP_WLOCK_ASSERT(inp);
1247	INP_HASH_WLOCK(&V_tcbinfo);
1248
1249	if (inp->inp_lport == 0) {
1250		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
1251		if (error)
1252			goto out;
1253	}
1254
1255	/*
1256	 * Cannot simply call in_pcbconnect, because there might be an
1257	 * earlier incarnation of this same connection still in
1258	 * TIME_WAIT state, creating an ADDRINUSE error.
1259	 */
1260	laddr = inp->inp_laddr;
1261	lport = inp->inp_lport;
1262	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
1263	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
1264	if (error && oinp == NULL)
1265		goto out;
1266	if (oinp) {
1267		error = EADDRINUSE;
1268		goto out;
1269	}
1270	inp->inp_laddr = laddr;
1271	in_pcbrehash(inp);
1272	INP_HASH_WUNLOCK(&V_tcbinfo);
1273
1274	/*
1275	 * Compute window scaling to request:
1276	 * Scale to fit into sweet spot.  See tcp_syncache.c.
1277	 * XXX: This should move to tcp_output().
1278	 */
1279	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1280	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
1281		tp->request_r_scale++;
1282
1283	soisconnecting(so);
1284	TCPSTAT_INC(tcps_connattempt);
1285	tcp_state_change(tp, TCPS_SYN_SENT);
1286	tp->iss = tcp_new_isn(tp);
1287	tcp_sendseqinit(tp);
1288
1289	return 0;
1290
1291out:
1292	INP_HASH_WUNLOCK(&V_tcbinfo);
1293	return (error);
1294}
1295#endif /* INET */
1296
1297#ifdef INET6
1298static int
1299tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
1300{
1301	struct inpcb *inp = tp->t_inpcb;
1302	int error;
1303
1304	INP_WLOCK_ASSERT(inp);
1305	INP_HASH_WLOCK(&V_tcbinfo);
1306
1307	if (inp->inp_lport == 0) {
1308		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
1309		if (error)
1310			goto out;
1311	}
1312	error = in6_pcbconnect(inp, nam, td->td_ucred);
1313	if (error != 0)
1314		goto out;
1315	INP_HASH_WUNLOCK(&V_tcbinfo);
1316
1317	/* Compute window scaling to request.  */
1318	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1319	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
1320		tp->request_r_scale++;
1321
1322	soisconnecting(inp->inp_socket);
1323	TCPSTAT_INC(tcps_connattempt);
1324	tcp_state_change(tp, TCPS_SYN_SENT);
1325	tp->iss = tcp_new_isn(tp);
1326	tcp_sendseqinit(tp);
1327
1328	return 0;
1329
1330out:
1331	INP_HASH_WUNLOCK(&V_tcbinfo);
1332	return error;
1333}
1334#endif /* INET6 */
1335
1336/*
1337 * Export TCP internal state information via a struct tcp_info, based on the
1338 * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
1339 * (TCP state machine, etc).  We export all information using FreeBSD-native
1340 * constants -- for example, the numeric values for tcpi_state will differ
1341 * from Linux.
1342 */
1343static void
1344tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
1345{
1346
1347	INP_WLOCK_ASSERT(tp->t_inpcb);
1348	bzero(ti, sizeof(*ti));
1349
1350	ti->tcpi_state = tp->t_state;
1351	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1352		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1353	if (tp->t_flags & TF_SACK_PERMIT)
1354		ti->tcpi_options |= TCPI_OPT_SACK;
1355	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1356		ti->tcpi_options |= TCPI_OPT_WSCALE;
1357		ti->tcpi_snd_wscale = tp->snd_scale;
1358		ti->tcpi_rcv_wscale = tp->rcv_scale;
1359	}
1360	if (tp->t_flags & TF_ECN_PERMIT)
1361		ti->tcpi_options |= TCPI_OPT_ECN;
1362
1363	ti->tcpi_rto = tp->t_rxtcur * tick;
1364	ti->tcpi_last_data_recv = (long)(ticks - (int)tp->t_rcvtime) * tick;
1365	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
1366	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
1367
1368	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1369	ti->tcpi_snd_cwnd = tp->snd_cwnd;
1370
1371	/*
1372	 * FreeBSD-specific extension fields for tcp_info.
1373	 */
1374	ti->tcpi_rcv_space = tp->rcv_wnd;
1375	ti->tcpi_rcv_nxt = tp->rcv_nxt;
1376	ti->tcpi_snd_wnd = tp->snd_wnd;
1377	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
1378	ti->tcpi_snd_nxt = tp->snd_nxt;
1379	ti->tcpi_snd_mss = tp->t_maxseg;
1380	ti->tcpi_rcv_mss = tp->t_maxseg;
1381	if (tp->t_flags & TF_TOE)
1382		ti->tcpi_options |= TCPI_OPT_TOE;
1383	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
1384	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
1385	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
1386}
1387
1388/*
1389 * tcp_ctloutput() must drop the inpcb lock before performing copyin on
1390 * socket option arguments.  When it re-acquires the lock after the copy, it
1391 * has to revalidate that the connection is still valid for the socket
1392 * option.
1393 */
/*
 * Both macros expand inside a function that has a local 'tp' and returns
 * int: on a dead connection they run 'cleanup', drop the lock and return
 * ECONNRESET from the enclosing function; otherwise they leave the inpcb
 * write-locked and reload 'tp' from it.
 */
#define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do {			\
	INP_WLOCK(inp);							\
	if (((inp)->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0) {	\
		INP_WUNLOCK(inp);					\
		cleanup;						\
		return (ECONNRESET);					\
	}								\
	tp = intotcpcb(inp);						\
} while (0)
#define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
1404
1405int
1406tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1407{
1408	int	error;
1409	struct	inpcb *inp;
1410	struct	tcpcb *tp;
1411	struct tcp_function_block *blk;
1412	struct tcp_function_set fsn;
1413
1414	error = 0;
1415	inp = sotoinpcb(so);
1416	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
1417	INP_WLOCK(inp);
1418	if (sopt->sopt_level != IPPROTO_TCP) {
1419#ifdef INET6
1420		if (inp->inp_vflag & INP_IPV6PROTO) {
1421			INP_WUNLOCK(inp);
1422			error = ip6_ctloutput(so, sopt);
1423		}
1424#endif /* INET6 */
1425#if defined(INET6) && defined(INET)
1426		else
1427#endif
1428#ifdef INET
1429		{
1430			INP_WUNLOCK(inp);
1431			error = ip_ctloutput(so, sopt);
1432		}
1433#endif
1434		return (error);
1435	}
1436	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1437		INP_WUNLOCK(inp);
1438		return (ECONNRESET);
1439	}
1440	tp = intotcpcb(inp);
1441	/*
1442	 * Protect the TCP option TCP_FUNCTION_BLK so
1443	 * that a sub-function can *never* overwrite this.
1444	 */
1445	if ((sopt->sopt_dir == SOPT_SET) &&
1446	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
1447		INP_WUNLOCK(inp);
1448		error = sooptcopyin(sopt, &fsn, sizeof fsn,
1449		    sizeof fsn);
1450		if (error)
1451			return (error);
1452		INP_WLOCK_RECHECK(inp);
1453		if (tp->t_state != TCPS_CLOSED) {
1454			/*
1455			 * The user has advanced the state
1456			 * past the initial point, we can't
1457			 * switch since we are down the road
1458			 * and a new set of functions may
1459			 * not be compatibile.
1460			 */
1461			INP_WUNLOCK(inp);
1462			return(EINVAL);
1463		}
1464		blk = find_and_ref_tcp_functions(&fsn);
1465		if (blk == NULL) {
1466			INP_WUNLOCK(inp);
1467			return (ENOENT);
1468		}
1469		if (tp->t_fb != blk) {
1470			if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
1471				refcount_release(&blk->tfb_refcnt);
1472				INP_WUNLOCK(inp);
1473				return (ENOENT);
1474			}
1475			/*
1476			 * Release the old refcnt, the
1477			 * lookup acquires a ref on the
1478			 * new one.
1479			 */
1480			if (tp->t_fb->tfb_tcp_fb_fini)
1481				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
1482			refcount_release(&tp->t_fb->tfb_refcnt);
1483			tp->t_fb = blk;
1484			if (tp->t_fb->tfb_tcp_fb_init) {
1485				(*tp->t_fb->tfb_tcp_fb_init)(tp);
1486			}
1487		}
1488#ifdef TCP_OFFLOAD
1489		if (tp->t_flags & TF_TOE) {
1490			tcp_offload_ctloutput(tp, sopt->sopt_dir,
1491			     sopt->sopt_name);
1492		}
1493#endif
1494		INP_WUNLOCK(inp);
1495		return (error);
1496	} else if ((sopt->sopt_dir == SOPT_GET) &&
1497	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
1498		strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name);
1499		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
1500		INP_WUNLOCK(inp);
1501		error = sooptcopyout(sopt, &fsn, sizeof fsn);
1502		return (error);
1503	}
1504	/* Pass in the INP locked, called must unlock it */
1505	return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
1506}
1507
1508int
1509tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
1510{
1511	int	error, opt, optval;
1512	u_int	ui;
1513	struct	tcp_info ti;
1514	struct cc_algo *algo;
1515	char	*pbuf, buf[TCP_CA_NAME_MAX];
1516	size_t	len;
1517
1518	/*
1519	 * For TCP_CCALGOOPT forward the control to CC module, for both
1520	 * SOPT_SET and SOPT_GET.
1521	 */
1522	switch (sopt->sopt_name) {
1523	case TCP_CCALGOOPT:
1524		INP_WUNLOCK(inp);
1525		pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
1526		error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
1527		    sopt->sopt_valsize);
1528		if (error) {
1529			free(pbuf, M_TEMP);
1530			return (error);
1531		}
1532		INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
1533		if (CC_ALGO(tp)->ctl_output != NULL)
1534			error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf);
1535		else
1536			error = ENOENT;
1537		INP_WUNLOCK(inp);
1538		if (error == 0 && sopt->sopt_dir == SOPT_GET)
1539			error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
1540		free(pbuf, M_TEMP);
1541		return (error);
1542	}
1543
1544	switch (sopt->sopt_dir) {
1545	case SOPT_SET:
1546		switch (sopt->sopt_name) {
1547#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
1548		case TCP_MD5SIG:
1549			if (!TCPMD5_ENABLED()) {
1550				INP_WUNLOCK(inp);
1551				return (ENOPROTOOPT);
1552			}
1553			error = TCPMD5_PCBCTL(inp, sopt);
1554			if (error)
1555				return (error);
1556			goto unlock_and_done;
1557#endif /* IPSEC */
1558
1559		case TCP_NODELAY:
1560		case TCP_NOOPT:
1561			INP_WUNLOCK(inp);
1562			error = sooptcopyin(sopt, &optval, sizeof optval,
1563			    sizeof optval);
1564			if (error)
1565				return (error);
1566
1567			INP_WLOCK_RECHECK(inp);
1568			switch (sopt->sopt_name) {
1569			case TCP_NODELAY:
1570				opt = TF_NODELAY;
1571				break;
1572			case TCP_NOOPT:
1573				opt = TF_NOOPT;
1574				break;
1575			default:
1576				opt = 0; /* dead code to fool gcc */
1577				break;
1578			}
1579
1580			if (optval)
1581				tp->t_flags |= opt;
1582			else
1583				tp->t_flags &= ~opt;
1584unlock_and_done:
1585#ifdef TCP_OFFLOAD
1586			if (tp->t_flags & TF_TOE) {
1587				tcp_offload_ctloutput(tp, sopt->sopt_dir,
1588				    sopt->sopt_name);
1589			}
1590#endif
1591			INP_WUNLOCK(inp);
1592			break;
1593
1594		case TCP_NOPUSH:
1595			INP_WUNLOCK(inp);
1596			error = sooptcopyin(sopt, &optval, sizeof optval,
1597			    sizeof optval);
1598			if (error)
1599				return (error);
1600
1601			INP_WLOCK_RECHECK(inp);
1602			if (optval)
1603				tp->t_flags |= TF_NOPUSH;
1604			else if (tp->t_flags & TF_NOPUSH) {
1605				tp->t_flags &= ~TF_NOPUSH;
1606				if (TCPS_HAVEESTABLISHED(tp->t_state))
1607					error = tp->t_fb->tfb_tcp_output(tp);
1608			}
1609			goto unlock_and_done;
1610
1611		case TCP_MAXSEG:
1612			INP_WUNLOCK(inp);
1613			error = sooptcopyin(sopt, &optval, sizeof optval,
1614			    sizeof optval);
1615			if (error)
1616				return (error);
1617
1618			INP_WLOCK_RECHECK(inp);
1619			if (optval > 0 && optval <= tp->t_maxseg &&
1620			    optval + 40 >= V_tcp_minmss)
1621				tp->t_maxseg = optval;
1622			else
1623				error = EINVAL;
1624			goto unlock_and_done;
1625
1626		case TCP_INFO:
1627			INP_WUNLOCK(inp);
1628			error = EINVAL;
1629			break;
1630
1631		case TCP_CONGESTION:
1632			INP_WUNLOCK(inp);
1633			error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
1634			if (error)
1635				break;
1636			buf[sopt->sopt_valsize] = '\0';
1637			INP_WLOCK_RECHECK(inp);
1638			CC_LIST_RLOCK();
1639			STAILQ_FOREACH(algo, &cc_list, entries)
1640				if (strncmp(buf, algo->name,
1641				    TCP_CA_NAME_MAX) == 0)
1642					break;
1643			CC_LIST_RUNLOCK();
1644			if (algo == NULL) {
1645				INP_WUNLOCK(inp);
1646				error = EINVAL;
1647				break;
1648			}
1649			/*
1650			 * We hold a write lock over the tcb so it's safe to
1651			 * do these things without ordering concerns.
1652			 */
1653			if (CC_ALGO(tp)->cb_destroy != NULL)
1654				CC_ALGO(tp)->cb_destroy(tp->ccv);
1655			CC_ALGO(tp) = algo;
1656			/*
1657			 * If something goes pear shaped initialising the new
1658			 * algo, fall back to newreno (which does not
1659			 * require initialisation).
1660			 */
1661			if (algo->cb_init != NULL &&
1662			    algo->cb_init(tp->ccv) != 0) {
1663				CC_ALGO(tp) = &newreno_cc_algo;
1664				/*
1665				 * The only reason init should fail is
1666				 * because of malloc.
1667				 */
1668				error = ENOMEM;
1669			}
1670			INP_WUNLOCK(inp);
1671			break;
1672
1673		case TCP_KEEPIDLE:
1674		case TCP_KEEPINTVL:
1675		case TCP_KEEPINIT:
1676			INP_WUNLOCK(inp);
1677			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
1678			if (error)
1679				return (error);
1680
1681			if (ui > (UINT_MAX / hz)) {
1682				error = EINVAL;
1683				break;
1684			}
1685			ui *= hz;
1686
1687			INP_WLOCK_RECHECK(inp);
1688			switch (sopt->sopt_name) {
1689			case TCP_KEEPIDLE:
1690				tp->t_keepidle = ui;
1691				/*
1692				 * XXX: better check current remaining
1693				 * timeout and "merge" it with new value.
1694				 */
1695				if ((tp->t_state > TCPS_LISTEN) &&
1696				    (tp->t_state <= TCPS_CLOSING))
1697					tcp_timer_activate(tp, TT_KEEP,
1698					    TP_KEEPIDLE(tp));
1699				break;
1700			case TCP_KEEPINTVL:
1701				tp->t_keepintvl = ui;
1702				if ((tp->t_state == TCPS_FIN_WAIT_2) &&
1703				    (TP_MAXIDLE(tp) > 0))
1704					tcp_timer_activate(tp, TT_2MSL,
1705					    TP_MAXIDLE(tp));
1706				break;
1707			case TCP_KEEPINIT:
1708				tp->t_keepinit = ui;
1709				if (tp->t_state == TCPS_SYN_RECEIVED ||
1710				    tp->t_state == TCPS_SYN_SENT)
1711					tcp_timer_activate(tp, TT_KEEP,
1712					    TP_KEEPINIT(tp));
1713				break;
1714			}
1715			goto unlock_and_done;
1716
1717		case TCP_KEEPCNT:
1718			INP_WUNLOCK(inp);
1719			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
1720			if (error)
1721				return (error);
1722
1723			INP_WLOCK_RECHECK(inp);
1724			tp->t_keepcnt = ui;
1725			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
1726			    (TP_MAXIDLE(tp) > 0))
1727				tcp_timer_activate(tp, TT_2MSL,
1728				    TP_MAXIDLE(tp));
1729			goto unlock_and_done;
1730
1731#ifdef TCPPCAP
1732		case TCP_PCAP_OUT:
1733		case TCP_PCAP_IN:
1734			INP_WUNLOCK(inp);
1735			error = sooptcopyin(sopt, &optval, sizeof optval,
1736			    sizeof optval);
1737			if (error)
1738				return (error);
1739
1740			INP_WLOCK_RECHECK(inp);
1741			if (optval >= 0)
1742				tcp_pcap_set_sock_max(TCP_PCAP_OUT ?
1743					&(tp->t_outpkts) : &(tp->t_inpkts),
1744					optval);
1745			else
1746				error = EINVAL;
1747			goto unlock_and_done;
1748#endif
1749
1750#ifdef TCP_RFC7413
1751		case TCP_FASTOPEN:
1752			INP_WUNLOCK(inp);
1753			if (!V_tcp_fastopen_enabled)
1754				return (EPERM);
1755
1756			error = sooptcopyin(sopt, &optval, sizeof optval,
1757			    sizeof optval);
1758			if (error)
1759				return (error);
1760
1761			INP_WLOCK_RECHECK(inp);
1762			if (optval) {
1763				tp->t_flags |= TF_FASTOPEN;
1764				if ((tp->t_state == TCPS_LISTEN) &&
1765				    (tp->t_tfo_pending == NULL))
1766					tp->t_tfo_pending =
1767					    tcp_fastopen_alloc_counter();
1768			} else
1769				tp->t_flags &= ~TF_FASTOPEN;
1770			goto unlock_and_done;
1771#endif
1772
1773		default:
1774			INP_WUNLOCK(inp);
1775			error = ENOPROTOOPT;
1776			break;
1777		}
1778		break;
1779
1780	case SOPT_GET:
1781		tp = intotcpcb(inp);
1782		switch (sopt->sopt_name) {
1783#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
1784		case TCP_MD5SIG:
1785			if (!TCPMD5_ENABLED()) {
1786				INP_WUNLOCK(inp);
1787				return (ENOPROTOOPT);
1788			}
1789			error = TCPMD5_PCBCTL(inp, sopt);
1790			break;
1791#endif
1792
1793		case TCP_NODELAY:
1794			optval = tp->t_flags & TF_NODELAY;
1795			INP_WUNLOCK(inp);
1796			error = sooptcopyout(sopt, &optval, sizeof optval);
1797			break;
1798		case TCP_MAXSEG:
1799			optval = tp->t_maxseg;
1800			INP_WUNLOCK(inp);
1801			error = sooptcopyout(sopt, &optval, sizeof optval);
1802			break;
1803		case TCP_NOOPT:
1804			optval = tp->t_flags & TF_NOOPT;
1805			INP_WUNLOCK(inp);
1806			error = sooptcopyout(sopt, &optval, sizeof optval);
1807			break;
1808		case TCP_NOPUSH:
1809			optval = tp->t_flags & TF_NOPUSH;
1810			INP_WUNLOCK(inp);
1811			error = sooptcopyout(sopt, &optval, sizeof optval);
1812			break;
1813		case TCP_INFO:
1814			tcp_fill_info(tp, &ti);
1815			INP_WUNLOCK(inp);
1816			error = sooptcopyout(sopt, &ti, sizeof ti);
1817			break;
1818		case TCP_CONGESTION:
1819			len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
1820			INP_WUNLOCK(inp);
1821			error = sooptcopyout(sopt, buf, len + 1);
1822			break;
1823		case TCP_KEEPIDLE:
1824		case TCP_KEEPINTVL:
1825		case TCP_KEEPINIT:
1826		case TCP_KEEPCNT:
1827			switch (sopt->sopt_name) {
1828			case TCP_KEEPIDLE:
1829				ui = TP_KEEPIDLE(tp) / hz;
1830				break;
1831			case TCP_KEEPINTVL:
1832				ui = TP_KEEPINTVL(tp) / hz;
1833				break;
1834			case TCP_KEEPINIT:
1835				ui = TP_KEEPINIT(tp) / hz;
1836				break;
1837			case TCP_KEEPCNT:
1838				ui = TP_KEEPCNT(tp);
1839				break;
1840			}
1841			INP_WUNLOCK(inp);
1842			error = sooptcopyout(sopt, &ui, sizeof(ui));
1843			break;
1844#ifdef TCPPCAP
1845		case TCP_PCAP_OUT:
1846		case TCP_PCAP_IN:
1847			optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ?
1848					&(tp->t_outpkts) : &(tp->t_inpkts));
1849			INP_WUNLOCK(inp);
1850			error = sooptcopyout(sopt, &optval, sizeof optval);
1851			break;
1852#endif
1853
1854#ifdef TCP_RFC7413
1855		case TCP_FASTOPEN:
1856			optval = tp->t_flags & TF_FASTOPEN;
1857			INP_WUNLOCK(inp);
1858			error = sooptcopyout(sopt, &optval, sizeof optval);
1859			break;
1860#endif
1861		default:
1862			INP_WUNLOCK(inp);
1863			error = ENOPROTOOPT;
1864			break;
1865		}
1866		break;
1867	}
1868	return (error);
1869}
1870#undef INP_WLOCK_RECHECK
1871#undef INP_WLOCK_RECHECK_CLEANUP
1872
1873/*
1874 * Attach TCP protocol to socket, allocating
1875 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state if to accept connections.
1877 */
1878static int
1879tcp_attach(struct socket *so)
1880{
1881	struct tcpcb *tp;
1882	struct inpcb *inp;
1883	int error;
1884
1885	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1886		error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
1887		if (error)
1888			return (error);
1889	}
1890	so->so_rcv.sb_flags |= SB_AUTOSIZE;
1891	so->so_snd.sb_flags |= SB_AUTOSIZE;
1892	INP_INFO_RLOCK(&V_tcbinfo);
1893	error = in_pcballoc(so, &V_tcbinfo);
1894	if (error) {
1895		INP_INFO_RUNLOCK(&V_tcbinfo);
1896		return (error);
1897	}
1898	inp = sotoinpcb(so);
1899#ifdef INET6
1900	if (inp->inp_vflag & INP_IPV6PROTO) {
1901		inp->inp_vflag |= INP_IPV6;
1902		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
1903			inp->inp_vflag |= INP_IPV4;
1904		inp->in6p_hops = -1;	/* use kernel default */
1905	}
1906	else
1907#endif
1908	inp->inp_vflag |= INP_IPV4;
1909	tp = tcp_newtcpcb(inp);
1910	if (tp == NULL) {
1911		in_pcbdetach(inp);
1912		in_pcbfree(inp);
1913		INP_INFO_RUNLOCK(&V_tcbinfo);
1914		return (ENOBUFS);
1915	}
1916	tp->t_state = TCPS_CLOSED;
1917	INP_WUNLOCK(inp);
1918	INP_INFO_RUNLOCK(&V_tcbinfo);
1919	TCPSTATES_INC(TCPS_CLOSED);
1920	return (0);
1921}
1922
1923/*
1924 * Initiate (or continue) disconnect.
1925 * If embryonic state, just send reset (once).
1926 * If in ``let data drain'' option and linger null, just drop.
1927 * Otherwise (hard), mark socket disconnecting and drop
1928 * current input data; switch states based on user close, and
1929 * send segment to peer (with FIN).
1930 */
1931static void
1932tcp_disconnect(struct tcpcb *tp)
1933{
1934	struct inpcb *inp = tp->t_inpcb;
1935	struct socket *so = inp->inp_socket;
1936
1937	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1938	INP_WLOCK_ASSERT(inp);
1939
1940	/*
1941	 * Neither tcp_close() nor tcp_drop() should return NULL, as the
1942	 * socket is still open.
1943	 */
1944	if (tp->t_state < TCPS_ESTABLISHED) {
1945		tp = tcp_close(tp);
1946		KASSERT(tp != NULL,
1947		    ("tcp_disconnect: tcp_close() returned NULL"));
1948	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
1949		tp = tcp_drop(tp, 0);
1950		KASSERT(tp != NULL,
1951		    ("tcp_disconnect: tcp_drop() returned NULL"));
1952	} else {
1953		soisdisconnecting(so);
1954		sbflush(&so->so_rcv);
1955		tcp_usrclosed(tp);
1956		if (!(inp->inp_flags & INP_DROPPED))
1957			tp->t_fb->tfb_tcp_output(tp);
1958	}
1959}
1960
1961/*
1962 * User issued close, and wish to trail through shutdown states:
1963 * if never received SYN, just forget it.  If got a SYN from peer,
1964 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1965 * If already got a FIN from peer, then almost done; go to LAST_ACK
1966 * state.  In all other cases, have already sent FIN to peer (e.g.
1967 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1968 * for peer to send FIN or not respond to keep-alives, etc.
1969 * We can let the user exit from the close as soon as the FIN is acked.
1970 */
1971static void
1972tcp_usrclosed(struct tcpcb *tp)
1973{
1974
1975	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1976	INP_WLOCK_ASSERT(tp->t_inpcb);
1977
1978	switch (tp->t_state) {
1979	case TCPS_LISTEN:
1980#ifdef TCP_OFFLOAD
1981		tcp_offload_listen_stop(tp);
1982#endif
1983		tcp_state_change(tp, TCPS_CLOSED);
1984		/* FALLTHROUGH */
1985	case TCPS_CLOSED:
1986		tp = tcp_close(tp);
1987		/*
1988		 * tcp_close() should never return NULL here as the socket is
1989		 * still open.
1990		 */
1991		KASSERT(tp != NULL,
1992		    ("tcp_usrclosed: tcp_close() returned NULL"));
1993		break;
1994
1995	case TCPS_SYN_SENT:
1996	case TCPS_SYN_RECEIVED:
1997		tp->t_flags |= TF_NEEDFIN;
1998		break;
1999
2000	case TCPS_ESTABLISHED:
2001		tcp_state_change(tp, TCPS_FIN_WAIT_1);
2002		break;
2003
2004	case TCPS_CLOSE_WAIT:
2005		tcp_state_change(tp, TCPS_LAST_ACK);
2006		break;
2007	}
2008	if (tp->t_state >= TCPS_FIN_WAIT_2) {
2009		soisdisconnected(tp->t_inpcb->inp_socket);
2010		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
2011		if (tp->t_state == TCPS_FIN_WAIT_2) {
2012			int timeout;
2013
2014			timeout = (tcp_fast_finwait2_recycle) ?
2015			    tcp_finwait2_timeout : TP_MAXIDLE(tp);
2016			tcp_timer_activate(tp, TT_2MSL, timeout);
2017		}
2018	}
2019}
2020
2021#ifdef DDB
/*
 * Emit 'indent' single spaces through db_printf(); nothing is printed when
 * indent is zero or negative.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
2030
2031static void
2032db_print_tstate(int t_state)
2033{
2034
2035	switch (t_state) {
2036	case TCPS_CLOSED:
2037		db_printf("TCPS_CLOSED");
2038		return;
2039
2040	case TCPS_LISTEN:
2041		db_printf("TCPS_LISTEN");
2042		return;
2043
2044	case TCPS_SYN_SENT:
2045		db_printf("TCPS_SYN_SENT");
2046		return;
2047
2048	case TCPS_SYN_RECEIVED:
2049		db_printf("TCPS_SYN_RECEIVED");
2050		return;
2051
2052	case TCPS_ESTABLISHED:
2053		db_printf("TCPS_ESTABLISHED");
2054		return;
2055
2056	case TCPS_CLOSE_WAIT:
2057		db_printf("TCPS_CLOSE_WAIT");
2058		return;
2059
2060	case TCPS_FIN_WAIT_1:
2061		db_printf("TCPS_FIN_WAIT_1");
2062		return;
2063
2064	case TCPS_CLOSING:
2065		db_printf("TCPS_CLOSING");
2066		return;
2067
2068	case TCPS_LAST_ACK:
2069		db_printf("TCPS_LAST_ACK");
2070		return;
2071
2072	case TCPS_FIN_WAIT_2:
2073		db_printf("TCPS_FIN_WAIT_2");
2074		return;
2075
2076	case TCPS_TIME_WAIT:
2077		db_printf("TCPS_TIME_WAIT");
2078		return;
2079
2080	default:
2081		db_printf("unknown");
2082		return;
2083	}
2084}
2085
2086static void
2087db_print_tflags(u_int t_flags)
2088{
2089	int comma;
2090
2091	comma = 0;
2092	if (t_flags & TF_ACKNOW) {
2093		db_printf("%sTF_ACKNOW", comma ? ", " : "");
2094		comma = 1;
2095	}
2096	if (t_flags & TF_DELACK) {
2097		db_printf("%sTF_DELACK", comma ? ", " : "");
2098		comma = 1;
2099	}
2100	if (t_flags & TF_NODELAY) {
2101		db_printf("%sTF_NODELAY", comma ? ", " : "");
2102		comma = 1;
2103	}
2104	if (t_flags & TF_NOOPT) {
2105		db_printf("%sTF_NOOPT", comma ? ", " : "");
2106		comma = 1;
2107	}
2108	if (t_flags & TF_SENTFIN) {
2109		db_printf("%sTF_SENTFIN", comma ? ", " : "");
2110		comma = 1;
2111	}
2112	if (t_flags & TF_REQ_SCALE) {
2113		db_printf("%sTF_REQ_SCALE", comma ? ", " : "");
2114		comma = 1;
2115	}
2116	if (t_flags & TF_RCVD_SCALE) {
2117		db_printf("%sTF_RECVD_SCALE", comma ? ", " : "");
2118		comma = 1;
2119	}
2120	if (t_flags & TF_REQ_TSTMP) {
2121		db_printf("%sTF_REQ_TSTMP", comma ? ", " : "");
2122		comma = 1;
2123	}
2124	if (t_flags & TF_RCVD_TSTMP) {
2125		db_printf("%sTF_RCVD_TSTMP", comma ? ", " : "");
2126		comma = 1;
2127	}
2128	if (t_flags & TF_SACK_PERMIT) {
2129		db_printf("%sTF_SACK_PERMIT", comma ? ", " : "");
2130		comma = 1;
2131	}
2132	if (t_flags & TF_NEEDSYN) {
2133		db_printf("%sTF_NEEDSYN", comma ? ", " : "");
2134		comma = 1;
2135	}
2136	if (t_flags & TF_NEEDFIN) {
2137		db_printf("%sTF_NEEDFIN", comma ? ", " : "");
2138		comma = 1;
2139	}
2140	if (t_flags & TF_NOPUSH) {
2141		db_printf("%sTF_NOPUSH", comma ? ", " : "");
2142		comma = 1;
2143	}
2144	if (t_flags & TF_MORETOCOME) {
2145		db_printf("%sTF_MORETOCOME", comma ? ", " : "");
2146		comma = 1;
2147	}
2148	if (t_flags & TF_LQ_OVERFLOW) {
2149		db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : "");
2150		comma = 1;
2151	}
2152	if (t_flags & TF_LASTIDLE) {
2153		db_printf("%sTF_LASTIDLE", comma ? ", " : "");
2154		comma = 1;
2155	}
2156	if (t_flags & TF_RXWIN0SENT) {
2157		db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
2158		comma = 1;
2159	}
2160	if (t_flags & TF_FASTRECOVERY) {
2161		db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
2162		comma = 1;
2163	}
2164	if (t_flags & TF_CONGRECOVERY) {
2165		db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
2166		comma = 1;
2167	}
2168	if (t_flags & TF_WASFRECOVERY) {
2169		db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
2170		comma = 1;
2171	}
2172	if (t_flags & TF_SIGNATURE) {
2173		db_printf("%sTF_SIGNATURE", comma ? ", " : "");
2174		comma = 1;
2175	}
2176	if (t_flags & TF_FORCEDATA) {
2177		db_printf("%sTF_FORCEDATA", comma ? ", " : "");
2178		comma = 1;
2179	}
2180	if (t_flags & TF_TSO) {
2181		db_printf("%sTF_TSO", comma ? ", " : "");
2182		comma = 1;
2183	}
2184	if (t_flags & TF_ECN_PERMIT) {
2185		db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
2186		comma = 1;
2187	}
2188	if (t_flags & TF_FASTOPEN) {
2189		db_printf("%sTF_FASTOPEN", comma ? ", " : "");
2190		comma = 1;
2191	}
2192}
2193
2194static void
2195db_print_toobflags(char t_oobflags)
2196{
2197	int comma;
2198
2199	comma = 0;
2200	if (t_oobflags & TCPOOB_HAVEDATA) {
2201		db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : "");
2202		comma = 1;
2203	}
2204	if (t_oobflags & TCPOOB_HADDATA) {
2205		db_printf("%sTCPOOB_HADDATA", comma ? ", " : "");
2206		comma = 1;
2207	}
2208}
2209
2210static void
2211db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
2212{
2213
2214	db_print_indent(indent);
2215	db_printf("%s at %p\n", name, tp);
2216
2217	indent += 2;
2218
2219	db_print_indent(indent);
2220	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
2221	   LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
2222
2223	db_print_indent(indent);
2224	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",
2225	    &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep);
2226
2227	db_print_indent(indent);
2228	db_printf("tt_2msl: %p   tt_delack: %p   t_inpcb: %p\n", &tp->t_timers->tt_2msl,
2229	    &tp->t_timers->tt_delack, tp->t_inpcb);
2230
2231	db_print_indent(indent);
2232	db_printf("t_state: %d (", tp->t_state);
2233	db_print_tstate(tp->t_state);
2234	db_printf(")\n");
2235
2236	db_print_indent(indent);
2237	db_printf("t_flags: 0x%x (", tp->t_flags);
2238	db_print_tflags(tp->t_flags);
2239	db_printf(")\n");
2240
2241	db_print_indent(indent);
2242	db_printf("snd_una: 0x%08x   snd_max: 0x%08x   snd_nxt: x0%08x\n",
2243	    tp->snd_una, tp->snd_max, tp->snd_nxt);
2244
2245	db_print_indent(indent);
2246	db_printf("snd_up: 0x%08x   snd_wl1: 0x%08x   snd_wl2: 0x%08x\n",
2247	   tp->snd_up, tp->snd_wl1, tp->snd_wl2);
2248
2249	db_print_indent(indent);
2250	db_printf("iss: 0x%08x   irs: 0x%08x   rcv_nxt: 0x%08x\n",
2251	    tp->iss, tp->irs, tp->rcv_nxt);
2252
2253	db_print_indent(indent);
2254	db_printf("rcv_adv: 0x%08x   rcv_wnd: %lu   rcv_up: 0x%08x\n",
2255	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
2256
2257	db_print_indent(indent);
2258	db_printf("snd_wnd: %lu   snd_cwnd: %lu\n",
2259	   tp->snd_wnd, tp->snd_cwnd);
2260
2261	db_print_indent(indent);
2262	db_printf("snd_ssthresh: %lu   snd_recover: "
2263	    "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
2264
2265	db_print_indent(indent);
2266	db_printf("t_rcvtime: %u   t_startime: %u\n",
2267	    tp->t_rcvtime, tp->t_starttime);
2268
2269	db_print_indent(indent);
2270	db_printf("t_rttime: %u   t_rtsq: 0x%08x\n",
2271	    tp->t_rtttime, tp->t_rtseq);
2272
2273	db_print_indent(indent);
2274	db_printf("t_rxtcur: %d   t_maxseg: %u   t_srtt: %d\n",
2275	    tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
2276
2277	db_print_indent(indent);
2278	db_printf("t_rttvar: %d   t_rxtshift: %d   t_rttmin: %u   "
2279	    "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
2280	    tp->t_rttbest);
2281
2282	db_print_indent(indent);
2283	db_printf("t_rttupdated: %lu   max_sndwnd: %lu   t_softerror: %d\n",
2284	    tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
2285
2286	db_print_indent(indent);
2287	db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
2288	db_print_toobflags(tp->t_oobflags);
2289	db_printf(")   t_iobc: 0x%02x\n", tp->t_iobc);
2290
2291	db_print_indent(indent);
2292	db_printf("snd_scale: %u   rcv_scale: %u   request_r_scale: %u\n",
2293	    tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
2294
2295	db_print_indent(indent);
2296	db_printf("ts_recent: %u   ts_recent_age: %u\n",
2297	    tp->ts_recent, tp->ts_recent_age);
2298
2299	db_print_indent(indent);
2300	db_printf("ts_offset: %u   last_ack_sent: 0x%08x   snd_cwnd_prev: "
2301	    "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
2302
2303	db_print_indent(indent);
2304	db_printf("snd_ssthresh_prev: %lu   snd_recover_prev: 0x%08x   "
2305	    "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
2306	    tp->snd_recover_prev, tp->t_badrxtwin);
2307
2308	db_print_indent(indent);
2309	db_printf("snd_numholes: %d  snd_holes first: %p\n",
2310	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
2311
2312	db_print_indent(indent);
2313	db_printf("snd_fack: 0x%08x   rcv_numsacks: %d   sack_newdata: "
2314	    "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata);
2315
2316	/* Skip sackblks, sackhint. */
2317
2318	db_print_indent(indent);
2319	db_printf("t_rttlow: %d   rfbuf_ts: %u   rfbuf_cnt: %d\n",
2320	    tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
2321}
2322
2323DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
2324{
2325	struct tcpcb *tp;
2326
2327	if (!have_addr) {
2328		db_printf("usage: show tcpcb <addr>\n");
2329		return;
2330	}
2331	tp = (struct tcpcb *)addr;
2332
2333	db_print_tcpcb(tp, "tcpcb", 0);
2334}
2335#endif
2336