1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1993
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
61 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.9 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/kernel.h>
68#include <sys/sysctl.h>
69#include <sys/mbuf.h>
70#if INET6
71#include <sys/domain.h>
72#endif /* INET6 */
73#include <sys/socket.h>
74#include <sys/socketvar.h>
75#include <sys/protosw.h>
76
77#include <net/if.h>
78#include <net/route.h>
79#include <net/ntstat.h>
80
81#include <netinet/in.h>
82#include <netinet/in_systm.h>
83#if INET6
84#include <netinet/ip6.h>
85#endif
86#include <netinet/in_pcb.h>
87#if INET6
88#include <netinet6/in6_pcb.h>
89#endif
90#include <netinet/in_var.h>
91#include <netinet/ip_var.h>
92#if INET6
93#include <netinet6/ip6_var.h>
94#endif
95#include <netinet/tcp.h>
96#include <netinet/tcp_fsm.h>
97#include <netinet/tcp_seq.h>
98#include <netinet/tcp_timer.h>
99#include <netinet/tcp_var.h>
100#include <netinet/tcpip.h>
101#if TCPDEBUG
102#include <netinet/tcp_debug.h>
103#endif
104
105#if IPSEC
106#include <netinet6/ipsec.h>
107#endif /*IPSEC*/
108
109void	tcp_fill_info(struct tcpcb *, struct tcp_info *);
110errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *);
111
112int tcp_sysctl_info(struct sysctl_oid *, void *, int , struct sysctl_req *);
113
/*
 * TCP protocol interface to socket abstraction.
 */
extern char	*tcpstates[];	/* XXX ??? */

/* File-local helpers, declared ahead of their use by the handlers below. */
static int	tcp_attach(struct socket *so, struct proc *p);
static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct proc *p);
#if INET6
static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct proc *p);
#endif /* INET6 */
static struct tcpcb *tcp_disconnect(struct tcpcb *tp);
static struct tcpcb *tcp_usrclosed(struct tcpcb *tp);
128
129static u_int32_t tcps_in_sw_cksum;
130SYSCTL_UINT(_net_inet_tcp, OID_AUTO, in_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
131    &tcps_in_sw_cksum, 0,
132    "Number of received packets checksummed in software");
133
134static u_int64_t tcps_in_sw_cksum_bytes;
135SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
136    &tcps_in_sw_cksum_bytes,
137    "Amount of received data checksummed in software");
138
139static u_int32_t tcps_out_sw_cksum;
140SYSCTL_UINT(_net_inet_tcp, OID_AUTO, out_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
141    &tcps_out_sw_cksum, 0,
142    "Number of transmitted packets checksummed in software");
143
144static u_int64_t tcps_out_sw_cksum_bytes;
145SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
146    &tcps_out_sw_cksum_bytes,
147    "Amount of transmitted data checksummed in software");
148
149extern uint32_t tcp_autorcvbuf_max;
150
151extern void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
152
/*
 * Tracing hooks used by every user-request handler in this file.
 *
 *   TCPDEBUG0       declares the local `ostate`.
 *   TCPDEBUG1()     records the connection state before the request runs
 *                   (0 when there is no tcpcb); expects a local `tp`.
 *   TCPDEBUG2(req)  calls tcp_trace() for the request when `tp` is non-NULL
 *                   and the socket has SO_DEBUG set; expects locals `tp`
 *                   and `so`.
 *
 * TCPDEBUG2 is wrapped in do { } while (0) so that it behaves as a single
 * statement; a bare `if` expansion could capture a following `else` at the
 * call site.  When TCPDEBUG is not enabled all three expand to nothing.
 */
#if TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do { \
				if (tp && (so->so_options & SO_DEBUG)) \
					tcp_trace(TA_USER, ostate, tp, 0, 0, (req)); \
			} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
163
164SYSCTL_PROC(_net_inet_tcp, OID_AUTO, info, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
165    0 , 0, tcp_sysctl_info, "S", "TCP info per tuple");
166
167/*
168 * TCP attaches to socket via pru_attach(), reserving space,
169 * and an internet control block.
170 *
171 * Returns:	0			Success
172 *		EISCONN
173 *	tcp_attach:ENOBUFS
174 *	tcp_attach:ENOMEM
175 *	tcp_attach:???			[IPSEC specific]
176 */
177static int
178tcp_usr_attach(struct socket *so, __unused int proto, struct proc *p)
179{
180	int error;
181	struct inpcb *inp = sotoinpcb(so);
182	struct tcpcb *tp = 0;
183	TCPDEBUG0;
184
185	TCPDEBUG1();
186	if (inp) {
187		error = EISCONN;
188		goto out;
189	}
190
191	error = tcp_attach(so, p);
192	if (error)
193		goto out;
194
195	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
196		so->so_linger = TCP_LINGERTIME * hz;
197	tp = sototcpcb(so);
198out:
199	TCPDEBUG2(PRU_ATTACH);
200	return error;
201}
202
203/*
204 * pru_detach() detaches the TCP protocol from the socket.
205 * If the protocol state is non-embryonic, then can't
206 * do this directly: have to initiate a pru_disconnect(),
207 * which may finish later; embryonic TCB's can just
208 * be discarded here.
209 */
210static int
211tcp_usr_detach(struct socket *so)
212{
213	int error = 0;
214	struct inpcb *inp = sotoinpcb(so);
215	struct tcpcb *tp;
216	TCPDEBUG0;
217
218	if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
219		return EINVAL;	/* XXX */
220	}
221	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
222	tp = intotcpcb(inp);
223	/* In case we got disconnected from the peer */
224        if (tp == 0)
225	    goto out;
226	TCPDEBUG1();
227
228	calculate_tcp_clock();
229
230	tp = tcp_disconnect(tp);
231out:
232	TCPDEBUG2(PRU_DETACH);
233	return error;
234}
235
/*
 * Shared prologue/epilogue for the handlers below.  Both macros rely on
 * the enclosing function having locals named `inp`, `tp` and `error`
 * (and `so` when tracing is compiled in).
 *
 * COMMON_START() declares the trace state, returns EINVAL straight away
 * when the inpcb is missing or dead, and otherwise loads `tp`, records
 * the pre-request state and refreshes the TCP clock.
 *
 * COMMON_END(req) provides the `out:` label, emits the trace record and
 * returns `error`.  The trailing `goto out` is unreachable; presumably it
 * is there so the label counts as used in handlers that never jump to it.
 */
#define	COMMON_START()							\
	TCPDEBUG0;							\
	do {								\
		if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {	\
			return EINVAL;					\
		}							\
		tp = intotcpcb(inp);					\
		TCPDEBUG1();						\
		calculate_tcp_clock();					\
	} while(0)

#define	COMMON_END(req)	out: TCPDEBUG2(req); return error; goto out
247
248
249/*
250 * Give the socket an address.
251 *
252 * Returns:	0			Success
253 *		EINVAL			Invalid argument [COMMON_START]
254 *		EAFNOSUPPORT		Address family not supported
255 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
256 *	in_pcbbind:EINVAL		Invalid argument
257 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
258 *	in_pcbbind:EACCES		Permission denied
259 *	in_pcbbind:EADDRINUSE		Address in use
260 *	in_pcbbind:EAGAIN		Resource unavailable, try again
261 *	in_pcbbind:EPERM		Operation not permitted
262 */
263static int
264tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
265{
266	int error = 0;
267	struct inpcb *inp = sotoinpcb(so);
268	struct tcpcb *tp;
269	struct sockaddr_in *sinp;
270
271	COMMON_START();
272
273	if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
274		error = EAFNOSUPPORT;
275		goto out;
276	}
277
278	/*
279	 * Must check for multicast addresses and disallow binding
280	 * to them.
281	 */
282	sinp = (struct sockaddr_in *)(void *)nam;
283	if (sinp->sin_family == AF_INET &&
284	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
285		error = EAFNOSUPPORT;
286		goto out;
287	}
288	error = in_pcbbind(inp, nam, p);
289	if (error)
290		goto out;
291	COMMON_END(PRU_BIND);
292
293}
294
#if INET6
/*
 * IPv6 flavour of tcp_usr_bind().
 *
 * The family must be 0 or AF_INET6 and AF_INET6 multicast addresses are
 * refused (EAFNOSUPPORT).  The pcb is first marked IPv6-only; unless the
 * socket has IN6P_IPV6_V6ONLY set, an unspecified address additionally
 * enables IPv4, and a v4-mapped address is converted to a sockaddr_in
 * and bound through in_pcbbind() as an IPv4-only pcb.
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	struct inpcb *inp = sotoinpcb(so);
	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)(void *)nam;
	struct tcpcb *tp;
	int error = 0;

	COMMON_START();

	if (nam->sa_family != 0 && nam->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out;
	}

	/* Binding to a multicast address is disallowed. */
	if (sin6p->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;

	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) {
			inp->inp_vflag |= INP_IPV4;
		} else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
			struct sockaddr_in sin4;

			in6_sin6_2_sin(&sin4, sin6p);
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			error = in_pcbbind(inp, (struct sockaddr *)&sin4, p);
			goto out;
		}
	}

	error = in6_pcbbind(inp, nam, p);

	COMMON_END(PRU_BIND);
}
#endif /* INET6 */
342
343/*
344 * Prepare to accept connections.
345 *
346 * Returns:	0			Success
347 *		EINVAL [COMMON_START]
348 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
349 *	in_pcbbind:EINVAL		Invalid argument
350 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
351 *	in_pcbbind:EACCES		Permission denied
352 *	in_pcbbind:EADDRINUSE		Address in use
353 *	in_pcbbind:EAGAIN		Resource unavailable, try again
354 *	in_pcbbind:EPERM		Operation not permitted
355 */
356static int
357tcp_usr_listen(struct socket *so, struct proc *p)
358{
359	int error = 0;
360	struct inpcb *inp = sotoinpcb(so);
361	struct tcpcb *tp;
362
363	COMMON_START();
364	if (inp->inp_lport == 0)
365		error = in_pcbbind(inp, (struct sockaddr *)0, p);
366	if (error == 0)
367		tp->t_state = TCPS_LISTEN;
368	COMMON_END(PRU_LISTEN);
369}
370
#if INET6
/*
 * IPv6 flavour of tcp_usr_listen().
 *
 * When no local port is bound yet, the INP_IPV4 flag is set according to
 * IN6P_IPV6_V6ONLY (cleared for v6-only sockets, set otherwise) before
 * in6_pcbbind() picks a port.  On success the connection enters
 * TCPS_LISTEN.
 */
static int
tcp6_usr_listen(struct socket *so, struct proc *p)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	int error = 0;

	COMMON_START();

	if (inp->inp_lport == 0) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			inp->inp_vflag &= ~INP_IPV4;
		else
			inp->inp_vflag |= INP_IPV4;
		error = in6_pcbbind(inp, (struct sockaddr *)0, p);
	}
	if (!error)
		tp->t_state = TCPS_LISTEN;

	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */
391
392/*
393 * Initiate connection to peer.
394 * Create a template for use in transmissions on this connection.
395 * Enter SYN_SENT state, and mark socket as connecting.
396 * Start keep-alive timer, and seed output sequence space.
397 * Send initial segment on connection.
398 */
399static int
400tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
401{
402	int error = 0;
403	struct inpcb *inp = sotoinpcb(so);
404	struct tcpcb *tp;
405	struct sockaddr_in *sinp;
406
407	TCPDEBUG0;
408	if (inp == 0)
409		return EINVAL;
410	else if (inp->inp_state == INPCB_STATE_DEAD) {
411		if (so->so_error) {
412			error = so->so_error;
413			so->so_error = 0;
414			return error;
415		} else
416			return EINVAL;
417	}
418	tp = intotcpcb(inp);
419	TCPDEBUG1();
420
421	calculate_tcp_clock();
422
423	if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
424		error = EAFNOSUPPORT;
425		goto out;
426	}
427	/*
428	 * Must disallow TCP ``connections'' to multicast addresses.
429	 */
430	sinp = (struct sockaddr_in *)(void *)nam;
431	if (sinp->sin_family == AF_INET
432	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
433		error = EAFNOSUPPORT;
434		goto out;
435	}
436
437
438	if ((error = tcp_connect(tp, nam, p)) != 0)
439		goto out;
440	error = tcp_output(tp);
441	COMMON_END(PRU_CONNECT);
442}
443
#if INET6
/*
 * IPv6 flavour of tcp_usr_connect().
 *
 * The destination must be family 0 or AF_INET6 and must not be an
 * AF_INET6 multicast address (EAFNOSUPPORT).  A v4-mapped destination is
 * converted to a sockaddr_in and connected through tcp_connect() with
 * the pcb switched to IPv4, unless the socket is IN6P_IPV6_V6ONLY, in
 * which case EINVAL is returned directly.  Any other destination goes
 * through tcp6_connect() with the pcb marked IPv6.  A successful connect
 * is followed by tcp_output().
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	struct inpcb *inp = sotoinpcb(so);
	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)(void *)nam;
	struct tcpcb *tp;
	int error = 0;

	COMMON_START();

	if (nam->sa_family != 0 && nam->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out;
	}

	/* Must disallow TCP ``connections'' to multicast addresses. */
	if (sin6p->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
		struct sockaddr_in sin4;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			return (EINVAL);

		in6_sin6_2_sin(&sin4, sin6p);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		error = tcp_connect(tp, (struct sockaddr *)&sin4, p);
	} else {
		inp->inp_vflag &= ~INP_IPV4;
		inp->inp_vflag |= INP_IPV6;
		error = tcp6_connect(tp, nam, p);
	}

	if (error == 0)
		error = tcp_output(tp);

	COMMON_END(PRU_CONNECT);
}
#endif /* INET6 */
494
495/*
496 * Initiate disconnect from peer.
497 * If connection never passed embryonic stage, just drop;
498 * else if don't need to let data drain, then can just drop anyways,
499 * else have to begin TCP shutdown process: mark socket disconnecting,
500 * drain unread data, state switch to reflect user close, and
501 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
502 * when peer sends FIN and acks ours.
503 *
504 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
505 */
506static int
507tcp_usr_disconnect(struct socket *so)
508{
509	int error = 0;
510	struct inpcb *inp = sotoinpcb(so);
511	struct tcpcb *tp;
512
513	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
514	COMMON_START();
515        /* In case we got disconnected from the peer */
516        if (tp == 0)
517            goto out;
518	tp = tcp_disconnect(tp);
519	COMMON_END(PRU_DISCONNECT);
520}
521
522/*
523 * Accept a connection.  Essentially all the work is
524 * done at higher levels; just return the address
525 * of the peer, storing through addr.
526 */
527static int
528tcp_usr_accept(struct socket *so, struct sockaddr **nam)
529{
530	int error = 0;
531	struct inpcb *inp = sotoinpcb(so);
532	struct tcpcb *tp = NULL;
533	TCPDEBUG0;
534
535	in_setpeeraddr(so, nam);
536
537	if (so->so_state & SS_ISDISCONNECTED) {
538		error = ECONNABORTED;
539		goto out;
540	}
541	if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
542		return (EINVAL);
543	}
544	tp = intotcpcb(inp);
545	TCPDEBUG1();
546
547	calculate_tcp_clock();
548
549	COMMON_END(PRU_ACCEPT);
550}
551
#if INET6
/*
 * IPv6 flavour of tcp_usr_accept().
 *
 * A socket already marked SS_ISDISCONNECTED yields ECONNABORTED and a
 * missing or dead inpcb yields EINVAL.  Only after those checks is the
 * peer address stored through `nam`, via in6_mapped_peeraddr().
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;
	int error = 0;
	TCPDEBUG0;

	if ((so->so_state & SS_ISDISCONNECTED) != 0) {
		error = ECONNABORTED;
	} else {
		if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
			return (EINVAL);

		tp = intotcpcb(inp);
		TCPDEBUG1();

		calculate_tcp_clock();

		in6_mapped_peeraddr(so, nam);
	}

	COMMON_END(PRU_ACCEPT);
}
#endif /* INET6 */
577
578/*
579 * Mark the connection as being incapable of further output.
580 *
581 * Returns:	0			Success
582 *		EINVAL [COMMON_START]
583 *	tcp_output:EADDRNOTAVAIL
584 *	tcp_output:ENOBUFS
585 *	tcp_output:EMSGSIZE
586 *	tcp_output:EHOSTUNREACH
587 *	tcp_output:ENETUNREACH
588 *	tcp_output:ENETDOWN
589 *	tcp_output:ENOMEM
590 *	tcp_output:EACCES
591 *	tcp_output:EMSGSIZE
592 *	tcp_output:ENOBUFS
593 *	tcp_output:???			[ignorable: mostly IPSEC/firewall/DLIL]
594 */
595static int
596tcp_usr_shutdown(struct socket *so)
597{
598	int error = 0;
599	struct inpcb *inp = sotoinpcb(so);
600	struct tcpcb *tp;
601
602	COMMON_START();
603	socantsendmore(so);
604        /* In case we got disconnected from the peer */
605        if (tp == 0)
606            goto out;
607	tp = tcp_usrclosed(tp);
608	if (tp)
609		error = tcp_output(tp);
610	COMMON_END(PRU_SHUTDOWN);
611}
612
613/*
614 * After a receive, possibly send window update to peer.
615 */
616static int
617tcp_usr_rcvd(struct socket *so, __unused int flags)
618{
619	int error = 0;
620	struct inpcb *inp = sotoinpcb(so);
621	struct tcpcb *tp;
622
623	COMMON_START();
624        /* In case we got disconnected from the peer */
625        if (tp == 0)
626            goto out;
627	tcp_sbrcv_trim(tp, &so->so_rcv);
628
629	tcp_output(tp);
630	COMMON_END(PRU_RCVD);
631}
632
633/*
634 * Do a send by putting data in output queue and updating urgent
635 * marker if URG set.  Possibly send more data.  Unlike the other
636 * pru_*() routines, the mbuf chains are our responsibility.  We
637 * must either enqueue them or free them.  The other pru_* routines
638 * generally are caller-frees.
639 *
640 * Returns:	0			Success
641 *		ECONNRESET
642 *		EINVAL
643 *		ENOBUFS
644 *	tcp_connect:EADDRINUSE		Address in use
645 *	tcp_connect:EADDRNOTAVAIL	Address not available.
646 *	tcp_connect:EINVAL		Invalid argument
647 *	tcp_connect:EAFNOSUPPORT	Address family not supported [notdef]
648 *	tcp_connect:EACCES		Permission denied
649 *	tcp_connect:EAGAIN		Resource unavailable, try again
650 *	tcp_connect:EPERM		Operation not permitted
651 *	tcp_output:EADDRNOTAVAIL
652 *	tcp_output:ENOBUFS
653 *	tcp_output:EMSGSIZE
654 *	tcp_output:EHOSTUNREACH
655 *	tcp_output:ENETUNREACH
656 *	tcp_output:ENETDOWN
657 *	tcp_output:ENOMEM
658 *	tcp_output:EACCES
659 *	tcp_output:EMSGSIZE
660 *	tcp_output:ENOBUFS
661 *	tcp_output:???			[ignorable: mostly IPSEC/firewall/DLIL]
662 *	tcp6_connect:???		[IPV6 only]
663 */
664static int
665tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
666	     struct sockaddr *nam, struct mbuf *control, struct proc *p)
667{
668	int error = 0;
669	struct inpcb *inp = sotoinpcb(so);
670	struct tcpcb *tp;
671#if INET6
672	int isipv6;
673#endif
674	TCPDEBUG0;
675
676	if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD) {
677		/*
678		 * OOPS! we lost a race, the TCP session got reset after
679		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
680		 * network interrupt in the non-splnet() section of sosend().
681		 */
682		if (m)
683			m_freem(m);
684		if (control)
685			m_freem(control);
686		error = ECONNRESET;	/* XXX EPIPE? */
687		tp = NULL;
688		TCPDEBUG1();
689		goto out;
690	}
691#if INET6
692	isipv6 = nam && nam->sa_family == AF_INET6;
693#endif /* INET6 */
694	tp = intotcpcb(inp);
695	TCPDEBUG1();
696
697	calculate_tcp_clock();
698
699	if (control) {
700		/* TCP doesn't do control messages (rights, creds, etc) */
701		if (control->m_len) {
702			m_freem(control);
703			if (m)
704				m_freem(m);
705			error = EINVAL;
706			goto out;
707		}
708		m_freem(control);	/* empty control, just free it */
709	}
710	if(!(flags & PRUS_OOB)) {
711		sbappendstream(&so->so_snd, m);
712		if (nam && tp->t_state < TCPS_SYN_SENT) {
713			/*
714			 * Do implied connect if not yet connected,
715			 * initialize window to default value, and
716			 * initialize maxseg/maxopd using peer's cached
717			 * MSS.
718			 */
719#if INET6
720			if (isipv6)
721				error = tcp6_connect(tp, nam, p);
722			else
723#endif /* INET6 */
724				error = tcp_connect(tp, nam, p);
725			if (error)
726				goto out;
727			tp->snd_wnd = TTCP_CLIENT_SND_WND;
728			tcp_mss(tp, -1, IFSCOPE_NONE);
729		}
730
731		if (flags & PRUS_EOF) {
732			/*
733			 * Close the send side of the connection after
734			 * the data is sent.
735			 */
736			socantsendmore(so);
737			tp = tcp_usrclosed(tp);
738		}
739		if (tp != NULL) {
740			if (flags & PRUS_MORETOCOME)
741				tp->t_flags |= TF_MORETOCOME;
742			error = tcp_output(tp);
743			if (flags & PRUS_MORETOCOME)
744				tp->t_flags &= ~TF_MORETOCOME;
745		}
746	} else {
747		if (sbspace(&so->so_snd) == 0) {
748			/* if no space is left in sockbuf,
749			 * do not try to squeeze in OOB traffic */
750			m_freem(m);
751			error = ENOBUFS;
752			goto out;
753		}
754		/*
755		 * According to RFC961 (Assigned Protocols),
756		 * the urgent pointer points to the last octet
757		 * of urgent data.  We continue, however,
758		 * to consider it to indicate the first octet
759		 * of data past the urgent section.
760		 * Otherwise, snd_up should be one lower.
761		 */
762		sbappendstream(&so->so_snd, m);
763		if (nam && tp->t_state < TCPS_SYN_SENT) {
764			/*
765			 * Do implied connect if not yet connected,
766			 * initialize window to default value, and
767			 * initialize maxseg/maxopd using peer's cached
768			 * MSS.
769			 */
770#if INET6
771			if (isipv6)
772				error = tcp6_connect(tp, nam, p);
773			else
774#endif /* INET6 */
775			error = tcp_connect(tp, nam, p);
776			if (error)
777				goto out;
778			tp->snd_wnd = TTCP_CLIENT_SND_WND;
779			tcp_mss(tp, -1, IFSCOPE_NONE);
780		}
781		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
782		tp->t_force = 1;
783		error = tcp_output(tp);
784		tp->t_force = 0;
785	}
786	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
787		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
788}
789
790/*
791 * Abort the TCP.
792 */
793static int
794tcp_usr_abort(struct socket *so)
795{
796	int error = 0;
797	struct inpcb *inp = sotoinpcb(so);
798	struct tcpcb *tp;
799
800	COMMON_START();
801        /* In case we got disconnected from the peer */
802        if (tp == 0)
803            goto out;
804	tp = tcp_drop(tp, ECONNABORTED);
805	so->so_usecount--;
806	COMMON_END(PRU_ABORT);
807}
808
809/*
810 * Receive out-of-band data.
811 *
812 * Returns:	0			Success
813 *		EINVAL [COMMON_START]
814 *		EINVAL
815 *		EWOULDBLOCK
816 */
817static int
818tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
819{
820	int error = 0;
821	struct inpcb *inp = sotoinpcb(so);
822	struct tcpcb *tp;
823
824	COMMON_START();
825	if ((so->so_oobmark == 0 &&
826	     (so->so_state & SS_RCVATMARK) == 0) ||
827	    so->so_options & SO_OOBINLINE ||
828	    tp->t_oobflags & TCPOOB_HADDATA) {
829		error = EINVAL;
830		goto out;
831	}
832	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
833		error = EWOULDBLOCK;
834		goto out;
835	}
836	m->m_len = 1;
837	*mtod(m, caddr_t) = tp->t_iobc;
838	if ((flags & MSG_PEEK) == 0)
839		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
840	COMMON_END(PRU_RCVOOB);
841}
842
843/* xxx - should be const */
844struct pr_usrreqs tcp_usrreqs = {
845	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
846	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
847	tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd,
848	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
849	in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp
850};
851
#if INET6
/*
 * User-request dispatch table for TCP over IPv6.  It has the same
 * positional layout as tcp_usrreqs, with the IPv6 handlers substituted
 * where one exists.
 */
struct pr_usrreqs tcp6_usrreqs = {
	tcp_usr_abort,
	tcp6_usr_accept,
	tcp_usr_attach,
	tcp6_usr_bind,
	tcp6_usr_connect,
	pru_connect2_notsupp,
	in6_control,
	tcp_usr_detach,
	tcp_usr_disconnect,
	tcp6_usr_listen,
	in6_mapped_peeraddr,
	tcp_usr_rcvd,
	tcp_usr_rcvoob,
	tcp_usr_send,
	pru_sense_null,
	tcp_usr_shutdown,
	in6_mapped_sockaddr,
	sosend,
	soreceive,
	pru_sopoll_notsupp
};
#endif /* INET6 */
861
862/*
863 * Common subroutine to open a TCP connection to remote host specified
864 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
865 * port number if needed.  Call in_pcbladdr to do the routing and to choose
866 * a local host address (interface).  If there is an existing incarnation
867 * of the same connection in TIME-WAIT state and if the remote host was
868 * sending CC options and if the connection duration was < MSL, then
869 * truncate the previous TIME-WAIT state and proceed.
870 * Initialize connection parameters and enter SYN-SENT state.
871 *
872 * Returns:	0			Success
873 *		EADDRINUSE
874 *		EINVAL
875 *	in_pcbbind:EADDRNOTAVAIL	Address not available.
876 *	in_pcbbind:EINVAL		Invalid argument
877 *	in_pcbbind:EAFNOSUPPORT		Address family not supported [notdef]
878 *	in_pcbbind:EACCES		Permission denied
879 *	in_pcbbind:EADDRINUSE		Address in use
880 *	in_pcbbind:EAGAIN		Resource unavailable, try again
881 *	in_pcbbind:EPERM		Operation not permitted
882 *	in_pcbladdr:EINVAL		Invalid argument
883 *	in_pcbladdr:EAFNOSUPPORT	Address family not supported
884 *	in_pcbladdr:EADDRNOTAVAIL	Address not available
885 */
886static int
887tcp_connect(tp, nam, p)
888	register struct tcpcb *tp;
889	struct sockaddr *nam;
890	struct proc *p;
891{
892	struct inpcb *inp = tp->t_inpcb, *oinp;
893	struct socket *so = inp->inp_socket;
894	struct tcpcb *otp;
895	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
896	struct sockaddr_in ifaddr;
897	struct rmxp_tao *taop;
898	struct rmxp_tao tao_noncached;
899	int error;
900	struct ifnet *outif = NULL;
901
902	if (inp->inp_lport == 0) {
903		error = in_pcbbind(inp, (struct sockaddr *)0, p);
904		if (error)
905			return error;
906	}
907
908	/*
909	 * Cannot simply call in_pcbconnect, because there might be an
910	 * earlier incarnation of this same connection still in
911	 * TIME_WAIT state, creating an ADDRINUSE error.
912	 */
913	error = in_pcbladdr(inp, nam, &ifaddr, &outif);
914	if (error)
915		return error;
916
917	tcp_unlock(inp->inp_socket, 0, 0);
918	oinp = in_pcblookup_hash(inp->inp_pcbinfo,
919	    sin->sin_addr, sin->sin_port,
920	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
921						: ifaddr.sin_addr,
922	    inp->inp_lport,  0, NULL);
923
924	tcp_lock(inp->inp_socket, 0, 0);
925	if (oinp) {
926		if (oinp != inp) /* 4143933: avoid deadlock if inp == oinp */
927			tcp_lock(oinp->inp_socket, 1, 0);
928		if (in_pcb_checkstate(oinp, WNT_RELEASE, 1) == WNT_STOPUSING) {
929			if (oinp != inp)
930				tcp_unlock(oinp->inp_socket, 1, 0);
931			goto skip_oinp;
932		}
933
934		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
935		otp->t_state == TCPS_TIME_WAIT &&
936		    ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
937		    (otp->t_flags & TF_RCVD_CC))
938			otp = tcp_close(otp);
939		else {
940			printf("tcp_connect: inp=%p err=EADDRINUSE\n", inp);
941			if (oinp != inp)
942				tcp_unlock(oinp->inp_socket, 1, 0);
943			return EADDRINUSE;
944		}
945		if (oinp != inp)
946			tcp_unlock(oinp->inp_socket, 1, 0);
947	}
948skip_oinp:
949	if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr.sin_addr.s_addr :
950		 inp->inp_laddr.s_addr) == sin->sin_addr.s_addr &&
951	    inp->inp_lport == sin->sin_port)
952			return EINVAL;
953	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
954		/*lock inversion issue, mostly with udp multicast packets */
955		socket_unlock(inp->inp_socket, 0);
956		lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
957		socket_lock(inp->inp_socket, 0);
958	}
959	if (inp->inp_laddr.s_addr == INADDR_ANY) {
960		inp->inp_laddr = ifaddr.sin_addr;
961		inp->inp_last_outifp = outif;
962	}
963	inp->inp_faddr = sin->sin_addr;
964	inp->inp_fport = sin->sin_port;
965	in_pcbrehash(inp);
966	lck_rw_done(inp->inp_pcbinfo->mtx);
967
968	if (inp->inp_flowhash == 0)
969		inp->inp_flowhash = inp_calc_flowhash(inp);
970
971	tcp_set_max_rwinscale(tp, so);
972
973	soisconnecting(so);
974	tcpstat.tcps_connattempt++;
975	tp->t_state = TCPS_SYN_SENT;
976	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
977		tp->t_keepinit ? tp->t_keepinit : tcp_keepinit);
978	tp->iss = tcp_new_isn(tp);
979	tcp_sendseqinit(tp);
980	if (nstat_collect)
981		nstat_route_connect_attempt(inp->inp_route.ro_rt);
982
983	/*
984	 * Generate a CC value for this connection and
985	 * check whether CC or CCnew should be used.
986	 */
987	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
988		taop = &tao_noncached;
989		bzero(taop, sizeof(*taop));
990	}
991
992	tp->cc_send = CC_INC(tcp_ccgen);
993	if (taop->tao_ccsent != 0 &&
994	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
995		taop->tao_ccsent = tp->cc_send;
996	} else {
997		taop->tao_ccsent = 0;
998		tp->t_flags |= TF_SENDCCNEW;
999	}
1000
1001	return 0;
1002}
1003
#if INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the remote
 * host specified by the struct sockaddr_in6 that `nam` points to.
 * in6_pcbbind() assigns a local port if none is set and in6_pcbladdr()
 * selects the local address.  An existing incarnation of the connection
 * in TIME-WAIT that meets the same CC/MSL conditions as in tcp_connect()
 * is closed; any other match yields EADDRINUSE.  On success the
 * connection enters SYN-SENT.
 *
 * Parameters:	tp	tcpcb being connected
 *		nam	destination address, read as a struct sockaddr_in6
 *		p	calling process, passed on to in6_pcbbind()
 *
 * Every exit goes through `done`, which releases `outif` when it is
 * non-NULL.
 *
 * Returns:	0			Success
 *		EADDRINUSE
 *	in6_pcbbind:???
 *	in6_pcbladdr:???
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *otp;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)nam;
	struct in6_addr addr6;
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
	int error = 0;
	struct ifnet *outif = NULL;

	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, (struct sockaddr *)0, p);
		if (error)
			goto done;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 *
	 * in6_pcbladdr() might return an ifp with its reference held
	 * even in the error case, so make sure that it's released
	 * whenever it's non-NULL.
	 */
	error = in6_pcbladdr(inp, nam, &addr6, &outif);
	if (error)
		goto done;
	/* The socket lock is dropped for the duration of the hash lookup. */
	tcp_unlock(inp->inp_socket, 0, 0);
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
				  &sin6->sin6_addr, sin6->sin6_port,
				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
				  ? &addr6
				  : &inp->in6p_laddr,
				  inp->inp_lport,  0, NULL);
	tcp_lock(inp->inp_socket, 0, 0);
	if (oinp) {
		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
		    otp->t_state == TCPS_TIME_WAIT &&
		    ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
		    (otp->t_flags & TF_RCVD_CC)) {
			otp = tcp_close(otp);
		} else {
			error = EADDRINUSE;
			goto done;
		}
	}
	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
		/*lock inversion issue, mostly with udp multicast packets */
		socket_unlock(inp->inp_socket, 0);
		lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
		socket_lock(inp->inp_socket, 0);
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
		inp->in6p_laddr = addr6;
		inp->in6p_last_outifp = outif;	/* no reference needed */
	}
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
		inp->in6p_flowinfo = sin6->sin6_flowinfo;
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);

	if (inp->inp_flowhash == 0)
		inp->inp_flowhash = inp_calc_flowhash(inp);

	tcp_set_max_rwinscale(tp, so);

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
		tp->t_keepinit ? tp->t_keepinit : tcp_keepinit);
	tp->iss = tcp_new_isn(tp);
	tcp_sendseqinit(tp);
	if (nstat_collect)
		nstat_route_connect_attempt(inp->inp_route.ro_rt);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.
	 */
	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
		/* No cache entry: work on a zeroed local scratch copy. */
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	tp->cc_send = CC_INC(tcp_ccgen);
	if (taop->tao_ccsent != 0 &&
	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
		taop->tao_ccsent = tp->cc_send;
	} else {
		taop->tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	}

done:
	if (outif != NULL)
		ifnet_release(outif);

	return (error);
}
#endif /* INET6 */
1115
1116/*
1117 * Export TCP internal state information via a struct tcp_info
1118 */
1119__private_extern__ void
1120tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
1121{
1122	struct inpcb *inp = tp->t_inpcb;
1123
1124	bzero(ti, sizeof(*ti));
1125
1126	ti->tcpi_state = tp->t_state;
1127
1128	if (tp->t_state > TCPS_LISTEN) {
1129		if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1130			ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1131		if (tp->t_flags & TF_SACK_PERMIT)
1132			ti->tcpi_options |= TCPI_OPT_SACK;
1133		if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1134			ti->tcpi_options |= TCPI_OPT_WSCALE;
1135			ti->tcpi_snd_wscale = tp->snd_scale;
1136			ti->tcpi_rcv_wscale = tp->rcv_scale;
1137		}
1138
1139		/* Are we in retranmission episode */
1140		if (tp->snd_max != tp->snd_nxt)
1141			ti->tcpi_flags |= TCPI_FLAG_LOSSRECOVERY;
1142		else
1143				ti->tcpi_flags &= ~TCPI_FLAG_LOSSRECOVERY;
1144
1145		ti->tcpi_rto = tp->t_timer[TCPT_REXMT] ? tp->t_rxtcur : 0;
1146		ti->tcpi_snd_mss = tp->t_maxseg;
1147		ti->tcpi_rcv_mss = tp->t_maxseg;
1148
1149		ti->tcpi_rttcur = tp->t_rttcur;
1150		ti->tcpi_srtt = tp->t_srtt >> TCP_RTT_SHIFT;
1151		ti->tcpi_rttvar = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
1152
1153		ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1154		ti->tcpi_snd_cwnd = tp->snd_cwnd;
1155		ti->tcpi_snd_sbbytes = tp->t_inpcb->inp_socket->so_snd.sb_cc;
1156
1157		ti->tcpi_rcv_space = tp->rcv_wnd;
1158
1159		ti->tcpi_snd_wnd = tp->snd_wnd;
1160		ti->tcpi_snd_nxt = tp->snd_nxt;
1161		ti->tcpi_rcv_nxt = tp->rcv_nxt;
1162
1163		/* convert bytes/msec to bits/sec */
1164		if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
1165			tp->t_bwmeas != NULL) {
1166			ti->tcpi_snd_bw	= (tp->t_bwmeas->bw_sndbw * 8000);
1167		}
1168
1169		ti->tcpi_last_outif = (tp->t_inpcb->inp_last_outifp == NULL) ? 0 :
1170		    tp->t_inpcb->inp_last_outifp->if_index;
1171
1172		//atomic_get_64(ti->tcpi_txbytes, &inp->inp_stat->txbytes);
1173		ti->tcpi_txbytes = inp->inp_stat->txbytes;
1174		ti->tcpi_txretransmitbytes = tp->t_stat.txretransmitbytes;
1175		ti->tcpi_txunacked = tp->snd_max - tp->snd_una;
1176
1177		//atomic_get_64(ti->tcpi_rxbytes, &inp->inp_stat->rxbytes);
1178		ti->tcpi_rxbytes = inp->inp_stat->rxbytes;
1179		ti->tcpi_rxduplicatebytes = tp->t_stat.rxduplicatebytes;
1180	}
1181}
1182
1183__private_extern__ errno_t
1184tcp_fill_info_for_info_tuple(struct info_tuple *itpl, struct tcp_info *ti)
1185{
1186	struct inpcbinfo *pcbinfo = NULL;
1187	struct inpcb *inp = NULL;
1188	struct socket *so;
1189	struct tcpcb *tp;
1190
1191	if (itpl->itpl_proto == IPPROTO_TCP)
1192		pcbinfo = &tcbinfo;
1193	else
1194		return EINVAL;
1195
1196	if (itpl->itpl_local_sa.sa_family == AF_INET &&
1197		itpl->itpl_remote_sa.sa_family == AF_INET) {
1198		inp = in_pcblookup_hash(pcbinfo,
1199								itpl->itpl_remote_sin.sin_addr,
1200								itpl->itpl_remote_sin.sin_port,
1201								itpl->itpl_local_sin.sin_addr,
1202								itpl->itpl_local_sin.sin_port,
1203								0, NULL);
1204	} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
1205		itpl->itpl_remote_sa.sa_family == AF_INET6) {
1206		struct in6_addr ina6_local;
1207		struct in6_addr ina6_remote;
1208
1209		ina6_local = itpl->itpl_local_sin6.sin6_addr;
1210		if (IN6_IS_SCOPE_LINKLOCAL(&ina6_local) && itpl->itpl_local_sin6.sin6_scope_id)
1211			ina6_local.s6_addr16[1] = htons(itpl->itpl_local_sin6.sin6_scope_id);
1212
1213		ina6_remote = itpl->itpl_remote_sin6.sin6_addr;
1214		if (IN6_IS_SCOPE_LINKLOCAL(&ina6_remote) && itpl->itpl_remote_sin6.sin6_scope_id)
1215			ina6_remote.s6_addr16[1] = htons(itpl->itpl_remote_sin6.sin6_scope_id);
1216
1217		inp = in6_pcblookup_hash(pcbinfo,
1218								&ina6_remote,
1219								itpl->itpl_remote_sin6.sin6_port,
1220								&ina6_local,
1221								itpl->itpl_local_sin6.sin6_port,
1222								0, NULL);
1223	} else
1224		return EINVAL;
1225	if (inp == NULL || (so = inp->inp_socket) == NULL)
1226		return ENOENT;
1227
1228	socket_lock(so, 0);
1229	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1230		socket_unlock(so, 0);
1231		return ENOENT;
1232	}
1233	tp = intotcpcb(inp);
1234
1235	tcp_fill_info(tp, ti);
1236	socket_unlock(so, 0);
1237
1238	return 0;
1239}
1240
1241
1242__private_extern__ int
1243tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
1244{
1245	int error;
1246	struct tcp_info ti;
1247	struct info_tuple itpl;
1248
1249	if (req->newptr == USER_ADDR_NULL) {
1250		return EINVAL;
1251	}
1252	if (req->newlen < sizeof(struct info_tuple)) {
1253		return EINVAL;
1254	}
1255	error = SYSCTL_IN(req, &itpl, sizeof(struct info_tuple));
1256	if (error != 0) {
1257		return error;
1258	}
1259	error = tcp_fill_info_for_info_tuple(&itpl, &ti);
1260	if (error != 0) {
1261		return error;
1262	}
1263	error = SYSCTL_OUT(req, &ti, sizeof(struct tcp_info));
1264	if (error != 0) {
1265		return error;
1266	}
1267
1268	return 0;
1269}
1270
1271static int
1272tcp_lookup_peer_pid_locked(struct socket *so, pid_t *out_pid)
1273{
1274	int error = EHOSTUNREACH;
1275	*out_pid = -1;
1276	if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN;
1277
1278	struct inpcb	*inp = (struct inpcb*)so->so_pcb;
1279	uint16_t		lport = inp->inp_lport;
1280	uint16_t		fport = inp->inp_fport;
1281	struct inpcb	*finp = NULL;
1282
1283	if (inp->inp_vflag & INP_IPV6) {
1284		struct	in6_addr	laddr6 = inp->in6p_laddr;
1285		struct	in6_addr	faddr6 = inp->in6p_faddr;
1286		socket_unlock(so, 0);
1287		finp = in6_pcblookup_hash(&tcbinfo, &laddr6, lport, &faddr6, fport, 0, NULL);
1288		socket_lock(so, 0);
1289	} else if (inp->inp_vflag & INP_IPV4) {
1290		struct	in_addr	laddr4 = inp->inp_laddr;
1291		struct	in_addr	faddr4 = inp->inp_faddr;
1292		socket_unlock(so, 0);
1293		finp = in_pcblookup_hash(&tcbinfo, laddr4, lport, faddr4, fport, 0, NULL);
1294		socket_lock(so, 0);
1295	}
1296
1297	if (finp) {
1298		*out_pid = finp->inp_socket->last_pid;
1299		error = 0;
1300		in_pcb_checkstate(finp, WNT_RELEASE, 0);
1301	}
1302
1303	return error;
1304}
1305
1306/*
1307 * The new sockopt interface makes it possible for us to block in the
1308 * copyin/out step (if we take a page fault).  Taking a page fault at
1309 * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
1310 * use TSM, there probably isn't any need for this function to run at
1311 * splnet() any more.  This needs more examination.)
1312 */
1313int
1314tcp_ctloutput(so, sopt)
1315	struct socket *so;
1316	struct sockopt *sopt;
1317{
1318	int	error, opt, optval;
1319	struct	inpcb *inp;
1320	struct	tcpcb *tp;
1321
1322	error = 0;
1323	inp = sotoinpcb(so);
1324	if (inp == NULL) {
1325		return (ECONNRESET);
1326	}
1327	/* Allow <SOL_SOCKET,SO_FLUSH> at this level */
1328	if (sopt->sopt_level != IPPROTO_TCP &&
1329	    !(sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_FLUSH)) {
1330#if INET6
1331		if (INP_CHECK_SOCKAF(so, AF_INET6))
1332			error = ip6_ctloutput(so, sopt);
1333		else
1334#endif /* INET6 */
1335		error = ip_ctloutput(so, sopt);
1336		return (error);
1337	}
1338	tp = intotcpcb(inp);
1339        if (tp == NULL) {
1340                return (ECONNRESET);
1341        }
1342
1343	calculate_tcp_clock();
1344
1345	switch (sopt->sopt_dir) {
1346	case SOPT_SET:
1347		switch (sopt->sopt_name) {
1348		case TCP_NODELAY:
1349		case TCP_NOOPT:
1350		case TCP_NOPUSH:
1351			error = sooptcopyin(sopt, &optval, sizeof optval,
1352					    sizeof optval);
1353			if (error)
1354				break;
1355
1356			switch (sopt->sopt_name) {
1357			case TCP_NODELAY:
1358				opt = TF_NODELAY;
1359				break;
1360			case TCP_NOOPT:
1361				opt = TF_NOOPT;
1362				break;
1363			case TCP_NOPUSH:
1364				opt = TF_NOPUSH;
1365				break;
1366			default:
1367				opt = 0; /* dead code to fool gcc */
1368				break;
1369			}
1370
1371			if (optval)
1372				tp->t_flags |= opt;
1373			else
1374				tp->t_flags &= ~opt;
1375			break;
1376		case TCP_RXT_FINDROP:
1377			error = sooptcopyin(sopt, &optval, sizeof optval,
1378				sizeof optval);
1379			if (error)
1380				break;
1381			opt = TF_RXTFINDROP;
1382			if (optval)
1383				tp->t_flagsext |= opt;
1384			else
1385				tp->t_flagsext &= ~opt;
1386			break;
1387		case TCP_MEASURE_SND_BW:
1388			error = sooptcopyin(sopt, &optval, sizeof optval,
1389				sizeof optval);
1390			if (error)
1391				break;
1392			opt = TF_MEASURESNDBW;
1393			if (optval) {
1394				if (tp->t_bwmeas == NULL) {
1395					tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1396					if (tp->t_bwmeas == NULL) {
1397						error = ENOMEM;
1398						break;
1399					}
1400				}
1401				tp->t_flagsext |= opt;
1402			} else {
1403				tp->t_flagsext &= ~opt;
1404				/* Reset snd bw measurement state */
1405				tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
1406				if (tp->t_bwmeas != NULL) {
1407					tcp_bwmeas_free(tp);
1408				}
1409			}
1410			break;
1411		case TCP_MEASURE_BW_BURST: {
1412			struct tcp_measure_bw_burst in;
1413			uint32_t minpkts, maxpkts;
1414			bzero(&in, sizeof(in));
1415
1416			error = sooptcopyin(sopt, &in, sizeof(in),
1417				sizeof(in));
1418			if (error)
1419				break;
1420			if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 ||
1421				tp->t_bwmeas == NULL) {
1422				error = EINVAL;
1423				break;
1424			}
1425			minpkts = (in.min_burst_size != 0) ? in.min_burst_size :
1426				tp->t_bwmeas->bw_minsizepkts;
1427			maxpkts = (in.max_burst_size != 0) ? in.max_burst_size :
1428				tp->t_bwmeas->bw_maxsizepkts;
1429			if (minpkts > maxpkts) {
1430				error = EINVAL;
1431				break;
1432			}
1433			tp->t_bwmeas->bw_minsizepkts = minpkts;
1434			tp->t_bwmeas->bw_maxsizepkts = maxpkts;
1435			tp->t_bwmeas->bw_minsize = (minpkts * tp->t_maxseg);
1436			tp->t_bwmeas->bw_maxsize = (maxpkts * tp->t_maxseg);
1437			break;
1438		}
1439		case TCP_MAXSEG:
1440			error = sooptcopyin(sopt, &optval, sizeof optval,
1441					    sizeof optval);
1442			if (error)
1443				break;
1444
1445			if (optval > 0 && optval <= tp->t_maxseg &&
1446			    optval + 40 >= tcp_minmss)
1447				tp->t_maxseg = optval;
1448			else
1449				error = EINVAL;
1450			break;
1451
1452		case TCP_KEEPALIVE:
1453			error = sooptcopyin(sopt, &optval, sizeof optval,
1454						sizeof optval);
1455			if (error)
1456				break;
1457			if (optval < 0)
1458				error = EINVAL;
1459			else {
1460				tp->t_keepidle = optval * TCP_RETRANSHZ;
1461				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1462					TCP_KEEPIDLE(tp)); /* reset the timer to new value */
1463				tcp_check_timer_state(tp);
1464			}
1465                        break;
1466
1467		case TCP_CONNECTIONTIMEOUT:
1468			error = sooptcopyin(sopt, &optval, sizeof optval,
1469						sizeof optval);
1470			if (error)
1471				break;
1472			if (optval < 0)
1473				error = EINVAL;
1474			else
1475				tp->t_keepinit = optval * TCP_RETRANSHZ;
1476			break;
1477
1478		case PERSIST_TIMEOUT:
1479			error = sooptcopyin(sopt, &optval, sizeof optval,
1480						sizeof optval);
1481			if (error)
1482				break;
1483			if (optval < 0)
1484				error = EINVAL;
1485			else
1486				tp->t_persist_timeout = optval * TCP_RETRANSHZ;
1487			break;
1488		case TCP_RXT_CONNDROPTIME:
1489			error = sooptcopyin(sopt, &optval, sizeof(optval),
1490					sizeof(optval));
1491			if (error)
1492				break;
1493			if (optval < 0)
1494				error = EINVAL;
1495			else
1496				tp->rxt_conndroptime = optval * TCP_RETRANSHZ;
1497			break;
1498		case TCP_NOTSENT_LOWAT:
1499			error = sooptcopyin(sopt, &optval, sizeof(optval),
1500				sizeof(optval));
1501			if (error)
1502				break;
1503			if (optval < 0) {
1504				error = EINVAL;
1505				break;
1506			} else {
1507				if (optval == 0) {
1508					so->so_flags &= ~(SOF_NOTSENT_LOWAT);
1509					tp->t_notsent_lowat = 0;
1510				} else {
1511					so->so_flags |= SOF_NOTSENT_LOWAT;
1512					tp->t_notsent_lowat = optval;
1513				}
1514			}
1515			break;
1516
1517		case SO_FLUSH:
1518			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
1519			    sizeof (optval))) != 0)
1520				break;
1521
1522			error = inp_flush(inp, optval);
1523			break;
1524
1525		default:
1526			error = ENOPROTOOPT;
1527			break;
1528		}
1529		break;
1530
1531	case SOPT_GET:
1532		switch (sopt->sopt_name) {
1533		case TCP_NODELAY:
1534			optval = tp->t_flags & TF_NODELAY;
1535			break;
1536		case TCP_MAXSEG:
1537			optval = tp->t_maxseg;
1538			break;
1539		case TCP_KEEPALIVE:
1540			optval = tp->t_keepidle / TCP_RETRANSHZ;
1541			break;
1542		case TCP_NOOPT:
1543			optval = tp->t_flags & TF_NOOPT;
1544			break;
1545		case TCP_NOPUSH:
1546			optval = tp->t_flags & TF_NOPUSH;
1547			break;
1548		case TCP_CONNECTIONTIMEOUT:
1549			optval = tp->t_keepinit / TCP_RETRANSHZ;
1550			break;
1551		case PERSIST_TIMEOUT:
1552			optval = tp->t_persist_timeout / TCP_RETRANSHZ;
1553			break;
1554		case TCP_RXT_CONNDROPTIME:
1555			optval = tp->rxt_conndroptime / TCP_RETRANSHZ;
1556			break;
1557		case TCP_RXT_FINDROP:
1558			optval = tp->t_flagsext & TF_RXTFINDROP;
1559			break;
1560		case TCP_MEASURE_SND_BW:
1561			optval = tp->t_flagsext & TF_MEASURESNDBW;
1562			break;
1563		case TCP_INFO: {
1564			struct tcp_info ti;
1565
1566			tcp_fill_info(tp, &ti);
1567			error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info));
1568			goto done;
1569			/* NOT REACHED */
1570		}
1571		case TCP_MEASURE_BW_BURST: {
1572			struct tcp_measure_bw_burst out;
1573			if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 ||
1574				tp->t_bwmeas == NULL) {
1575				error = EINVAL;
1576				break;
1577			}
1578			out.min_burst_size = tp->t_bwmeas->bw_minsizepkts;
1579			out.max_burst_size = tp->t_bwmeas->bw_maxsizepkts;
1580			error = sooptcopyout(sopt, &out, sizeof(out));
1581			goto done;
1582		}
1583		case TCP_NOTSENT_LOWAT:
1584			if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
1585				optval = tp->t_notsent_lowat;
1586			} else {
1587				optval = 0;
1588			}
1589			break;
1590		case TCP_PEER_PID: {
1591			pid_t	pid;
1592			error = tcp_lookup_peer_pid_locked(so, &pid);
1593			if (error == 0)
1594				error = sooptcopyout(sopt, &pid, sizeof(pid));
1595			goto done;
1596		}
1597		default:
1598			error = ENOPROTOOPT;
1599			break;
1600		}
1601		if (error == 0)
1602			error = sooptcopyout(sopt, &optval, sizeof optval);
1603		break;
1604	}
1605done:
1606	return (error);
1607}
1608
1609/*
1610 * tcp_sendspace and tcp_recvspace are the default send and receive window
1611 * sizes, respectively.  These are obsolescent (this information should
1612 * be set by the route).
1613 */
/* Send-side size handed to soreserve() by tcp_attach() */
u_int32_t	tcp_sendspace = 1448*256;
/* Receive-side size handed to soreserve() by tcp_attach() */
u_int32_t	tcp_recvspace = 1448*384;
1616
1617/* During attach, the size of socket buffer allocated is limited to
1618 * sb_max in sbreserve. Disallow setting the tcp send and recv space
1619 * to be more than sb_max because that will cause tcp_attach to fail
1620 * (see radar 5713060)
1621 */
1622static int
1623sysctl_tcp_sospace(struct sysctl_oid *oidp, __unused void *arg1,
1624	__unused int arg2, struct sysctl_req *req) {
1625	u_int32_t new_value = 0, *space_p = NULL;
1626	int changed = 0, error = 0;
1627	u_quad_t sb_effective_max = (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES;
1628
1629	switch (oidp->oid_number) {
1630		case TCPCTL_SENDSPACE:
1631			space_p = &tcp_sendspace;
1632			break;
1633		case TCPCTL_RECVSPACE:
1634			space_p = &tcp_recvspace;
1635			break;
1636		default:
1637			return EINVAL;
1638	}
1639	error = sysctl_io_number(req, *space_p, sizeof(u_int32_t),
1640		&new_value, &changed);
1641	if (changed) {
1642		if (new_value > 0 && new_value <= sb_effective_max) {
1643			*space_p = new_value;
1644		} else {
1645			error = ERANGE;
1646		}
1647	}
1648	return error;
1649}
1650
/*
 * The sendspace and recvspace OIDs under _net_inet_tcp both go through
 * sysctl_tcp_sospace(), which picks tcp_sendspace or tcp_recvspace from
 * the OID number and range-checks any new value.
 * NOTE(review): the description strings speak of a "datagram size" while
 * the variables are passed to soreserve() in tcp_attach() -- confirm the
 * intended wording.
 */
SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_sendspace , 0, &sysctl_tcp_sospace, "IU", "Maximum outgoing TCP datagram size");
SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_recvspace , 0, &sysctl_tcp_sospace, "IU", "Maximum incoming TCP datagram size");
1655
1656
1657/*
1658 * Attach TCP protocol to socket, allocating
1659 * internet protocol control block, tcp control block,
1660 * bufer space, and entering LISTEN state if to accept connections.
1661 *
1662 * Returns:	0			Success
1663 *	in_pcballoc:ENOBUFS
1664 *	in_pcballoc:ENOMEM
1665 *	in_pcballoc:???			[IPSEC specific]
1666 *	soreserve:ENOBUFS
1667 */
1668static int
1669tcp_attach(so, p)
1670	struct socket *so;
1671	struct proc *p;
1672{
1673	register struct tcpcb *tp;
1674	struct inpcb *inp;
1675	int error;
1676#if INET6
1677	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
1678#endif
1679
1680	error = in_pcballoc(so, &tcbinfo, p);
1681	if (error)
1682		return (error);
1683
1684	inp = sotoinpcb(so);
1685
1686	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1687		error = soreserve(so, tcp_sendspace, tcp_recvspace);
1688		if (error)
1689			return (error);
1690	}
1691	if ((so->so_rcv.sb_flags & SB_USRSIZE) == 0)
1692		so->so_rcv.sb_flags |= SB_AUTOSIZE;
1693	if ((so->so_snd.sb_flags & SB_USRSIZE) == 0)
1694		so->so_snd.sb_flags |= SB_AUTOSIZE;
1695
1696#if INET6
1697	if (isipv6) {
1698		inp->inp_vflag |= INP_IPV6;
1699		inp->in6p_hops = -1;	/* use kernel default */
1700	}
1701	else
1702#endif /* INET6 */
1703	inp->inp_vflag |= INP_IPV4;
1704	tp = tcp_newtcpcb(inp);
1705	if (tp == 0) {
1706		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
1707
1708		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
1709#if INET6
1710		if (isipv6)
1711			in6_pcbdetach(inp);
1712		else
1713#endif /* INET6 */
1714		in_pcbdetach(inp);
1715		so->so_state |= nofd;
1716		return (ENOBUFS);
1717	}
1718	if (nstat_collect) {
1719		nstat_tcp_new_pcb(inp);
1720	}
1721	tp->t_state = TCPS_CLOSED;
1722	return (0);
1723}
1724
1725/*
1726 * Initiate (or continue) disconnect.
1727 * If embryonic state, just send reset (once).
1728 * If in ``let data drain'' option and linger null, just drop.
1729 * Otherwise (hard), mark socket disconnecting and drop
1730 * current input data; switch states based on user close, and
1731 * send segment to peer (with FIN).
1732 */
1733static struct tcpcb *
1734tcp_disconnect(tp)
1735	register struct tcpcb *tp;
1736{
1737	struct socket *so = tp->t_inpcb->inp_socket;
1738
1739	if (tp->t_state < TCPS_ESTABLISHED)
1740		tp = tcp_close(tp);
1741	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1742		tp = tcp_drop(tp, 0);
1743	else {
1744		soisdisconnecting(so);
1745		sbflush(&so->so_rcv);
1746		tp = tcp_usrclosed(tp);
1747		if (tp)
1748			(void) tcp_output(tp);
1749	}
1750	return (tp);
1751}
1752
1753/*
1754 * User issued close, and wish to trail through shutdown states:
1755 * if never received SYN, just forget it.  If got a SYN from peer,
1756 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1757 * If already got a FIN from peer, then almost done; go to LAST_ACK
1758 * state.  In all other cases, have already sent FIN to peer (e.g.
1759 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1760 * for peer to send FIN or not respond to keep-alives, etc.
1761 * We can let the user exit from the close as soon as the FIN is acked.
1762 */
1763static struct tcpcb *
1764tcp_usrclosed(tp)
1765	register struct tcpcb *tp;
1766{
1767
1768	switch (tp->t_state) {
1769
1770	case TCPS_CLOSED:
1771	case TCPS_LISTEN:
1772		tp->t_state = TCPS_CLOSED;
1773		tp = tcp_close(tp);
1774		break;
1775
1776	case TCPS_SYN_SENT:
1777	case TCPS_SYN_RECEIVED:
1778		tp->t_flags |= TF_NEEDFIN;
1779		break;
1780
1781	case TCPS_ESTABLISHED:
1782		tp->t_state = TCPS_FIN_WAIT_1;
1783		break;
1784
1785	case TCPS_CLOSE_WAIT:
1786		tp->t_state = TCPS_LAST_ACK;
1787		break;
1788	}
1789	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1790		soisdisconnected(tp->t_inpcb->inp_socket);
1791		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
1792		if (tp->t_state == TCPS_FIN_WAIT_2)
1793			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, tcp_maxidle);
1794	}
1795	return (tp);
1796}
1797
1798void
1799tcp_in_cksum_stats(u_int32_t len)
1800{
1801	tcps_in_sw_cksum++;
1802	tcps_in_sw_cksum_bytes += len;
1803}
1804
1805void
1806tcp_out_cksum_stats(u_int32_t len)
1807{
1808	tcps_out_sw_cksum++;
1809	tcps_out_sw_cksum_bytes += len;
1810}
1811