1/* 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. 
Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $ 62 */ 63/* 64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 65 * support for mandatory and extensible security protections. This notice 66 * is included in support of clause 2.2 (b) of the Apple Public License, 67 * Version 2.0. 
68 */ 69 70#include <sys/param.h> 71#include <sys/systm.h> 72#include <sys/callout.h> 73#include <sys/kernel.h> 74#include <sys/sysctl.h> 75#include <sys/malloc.h> 76#include <sys/mbuf.h> 77#include <sys/domain.h> 78#include <sys/proc.h> 79#include <sys/kauth.h> 80#include <sys/socket.h> 81#include <sys/socketvar.h> 82#include <sys/protosw.h> 83#include <sys/random.h> 84#include <sys/syslog.h> 85#include <sys/mcache.h> 86#include <kern/locks.h> 87#include <kern/zalloc.h> 88 89#include <net/route.h> 90#include <net/if.h> 91 92#define tcp_minmssoverload fring 93#define _IP_VHL 94#include <netinet/in.h> 95#include <netinet/in_systm.h> 96#include <netinet/ip.h> 97#include <netinet/ip_icmp.h> 98#if INET6 99#include <netinet/ip6.h> 100#endif 101#include <netinet/in_pcb.h> 102#if INET6 103#include <netinet6/in6_pcb.h> 104#endif 105#include <netinet/in_var.h> 106#include <netinet/ip_var.h> 107#include <netinet/icmp_var.h> 108#if INET6 109#include <netinet6/ip6_var.h> 110#endif 111#include <netinet/tcp.h> 112#include <netinet/tcp_fsm.h> 113#include <netinet/tcp_seq.h> 114#include <netinet/tcp_timer.h> 115#include <netinet/tcp_var.h> 116#include <netinet/tcp_cc.h> 117#include <kern/thread_call.h> 118 119#if INET6 120#include <netinet6/tcp6_var.h> 121#endif 122#include <netinet/tcpip.h> 123#if TCPDEBUG 124#include <netinet/tcp_debug.h> 125#endif 126#include <netinet6/ip6protosw.h> 127 128#if IPSEC 129#include <netinet6/ipsec.h> 130#if INET6 131#include <netinet6/ipsec6.h> 132#endif 133#endif /*IPSEC*/ 134 135#undef tcp_minmssoverload 136 137#if CONFIG_MACF_NET 138#include <security/mac_framework.h> 139#endif /* MAC_NET */ 140 141#include <libkern/crypto/md5.h> 142#include <sys/kdebug.h> 143#include <mach/sdt.h> 144 145#include <netinet/lro_ext.h> 146 147#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) 148 149extern int tcp_lq_overflow; 150 151/* temporary: for testing */ 152#if IPSEC 153extern int ipsec_bypass; 154#endif 155 156int tcp_mssdflt = TCP_MSS; 
157SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED, 158 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 159 160#if INET6 161int tcp_v6mssdflt = TCP6_MSS; 162SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, 163 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt , 0, 164 "Default TCP Maximum Segment Size for IPv6"); 165#endif 166 167extern int tcp_do_autorcvbuf; 168 169/* 170 * Minimum MSS we accept and use. This prevents DoS attacks where 171 * we are forced to a ridiculous low MSS like 20 and send hundreds 172 * of packets instead of one. The effect scales with the available 173 * bandwidth and quickly saturates the CPU and network interface 174 * with packet generation and sending. Set to zero to disable MINMSS 175 * checking. This setting prevents us from sending too small packets. 176 */ 177int tcp_minmss = TCP_MINMSS; 178SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED, 179 &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); 180 181/* 182 * Number of TCP segments per second we accept from remote host 183 * before we start to calculate average segment size. If average 184 * segment size drops below the minimum TCP MSS we assume a DoS 185 * attack and reset+drop the connection. Care has to be taken not to 186 * set this value too small to not kill interactive type connections 187 * (telnet, SSH) which send many small packets. 
188 */ 189#ifdef FIX_WORKAROUND_FOR_3894301 190__private_extern__ int tcp_minmssoverload = TCP_MINMSSOVERLOAD; 191#else 192__private_extern__ int tcp_minmssoverload = 0; 193#endif 194SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW | CTLFLAG_LOCKED, 195 &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" 196 "be under the MINMSS Size"); 197 198static int tcp_do_rfc1323 = 1; 199SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW | CTLFLAG_LOCKED, 200 &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); 201 202// Not used 203static int tcp_do_rfc1644 = 0; 204SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW | CTLFLAG_LOCKED, 205 &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); 206 207static int do_tcpdrain = 0; 208SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, &do_tcpdrain, 0, 209 "Enable tcp_drain routine for extra help when low on mbufs"); 210 211SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, 212 &tcbinfo.ipi_count, 0, "Number of active PCBs"); 213 214static int icmp_may_rst = 1; 215SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp_may_rst, 0, 216 "Certain ICMP unreachable messages may abort connections in SYN_SENT"); 217 218static int tcp_strict_rfc1948 = 0; 219SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED, 220 &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly"); 221 222static int tcp_isn_reseed_interval = 0; 223SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW | CTLFLAG_LOCKED, 224 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); 225static int tcp_background_io_enabled = 1; 226SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, 227 &tcp_background_io_enabled, 0, "Background IO Enabled"); 228 229int tcp_TCPTV_MIN = 100; /* 100ms minimum RTT */ 
230SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED, 231 &tcp_TCPTV_MIN, 0, "min rtt value allowed"); 232 233int tcp_rexmt_slop = TCPTV_REXMTSLOP; 234SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW, 235 &tcp_rexmt_slop, 0, "Slop added to retransmit timeout"); 236 237__private_extern__ int tcp_use_randomport = 0; 238SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, 239 &tcp_use_randomport, 0, "Randomize TCP port numbers"); 240 241extern struct tcp_cc_algo tcp_cc_newreno; 242SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, 243 &tcp_cc_newreno.num_sockets, 0, "Number of sockets using newreno"); 244 245extern struct tcp_cc_algo tcp_cc_ledbat; 246SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, 247 &tcp_cc_ledbat.num_sockets, 0, "Number of sockets using background transport"); 248 249__private_extern__ int tcp_win_scale = 3; 250SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED, 251 &tcp_win_scale, 0, "Window scaling factor"); 252 253static void tcp_cleartaocache(void); 254static void tcp_notify(struct inpcb *, int); 255static void tcp_cc_init(void); 256 257struct zone *sack_hole_zone; 258struct zone *tcp_reass_zone; 259struct zone *tcp_bwmeas_zone; 260 261/* The array containing pointers to currently implemented TCP CC algorithms */ 262struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; 263 264extern int slowlink_wsize; /* window correction for slow links */ 265extern int path_mtu_discovery; 266 267extern u_int32_t tcp_autorcvbuf_max; 268extern u_int32_t tcp_autorcvbuf_inc_shift; 269static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb); 270 271#define TCP_BWMEAS_BURST_MINSIZE 6 272#define TCP_BWMEAS_BURST_MAXSIZE 25 273 274static uint32_t bwmeas_elm_size; 275 276/* 277 * Target size of TCP PCB hash tables. Must be a power of two. 
278 * 279 * Note that this can be overridden by the kernel environment 280 * variable net.inet.tcp.tcbhashsize 281 */ 282#ifndef TCBHASHSIZE 283#define TCBHASHSIZE CONFIG_TCBHASHSIZE 284#endif 285 286__private_extern__ int tcp_tcbhashsize = TCBHASHSIZE; 287SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED, 288 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 289 290/* 291 * This is the actual shape of what we allocate using the zone 292 * allocator. Doing it this way allows us to protect both structures 293 * using the same generation count, and also eliminates the overhead 294 * of allocating tcpcbs separately. By hiding the structure here, 295 * we avoid changing most of the rest of the code (although it needs 296 * to be changed, eventually, for greater efficiency). 297 */ 298#define ALIGNMENT 32 299struct inp_tp { 300 struct inpcb inp; 301 struct tcpcb tcb __attribute__((aligned(ALIGNMENT))); 302}; 303#undef ALIGNMENT 304 305extern struct inpcbhead time_wait_slots[]; 306extern struct tcptimerlist tcp_timer_list; 307 308int get_inpcb_str_size(void); 309int get_tcp_str_size(void); 310 311static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); 312 313static lck_attr_t *tcp_uptime_mtx_attr = NULL; /* mutex attributes */ 314static lck_grp_t *tcp_uptime_mtx_grp = NULL; /* mutex group definition */ 315static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL; /* mutex group attributes */ 316int tcp_notsent_lowat_check(struct socket *so); 317 318 319int get_inpcb_str_size(void) 320{ 321 return sizeof(struct inpcb); 322} 323 324 325int get_tcp_str_size(void) 326{ 327 return sizeof(struct tcpcb); 328} 329 330int tcp_freeq(struct tcpcb *tp); 331 332/* 333 * Initialize TCP congestion control algorithms. 
334 */ 335 336void 337tcp_cc_init(void) 338{ 339 bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list)); 340 tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; 341 tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; 342} 343 344/* 345 * Tcp initialization 346 */ 347void 348tcp_init() 349{ 350 vm_size_t str_size; 351 int i; 352 struct inpcbinfo *pcbinfo; 353 354 tcp_ccgen = 1; 355 tcp_cleartaocache(); 356 357 tcp_keepinit = TCPTV_KEEP_INIT; 358 tcp_keepidle = TCPTV_KEEP_IDLE; 359 tcp_keepintvl = TCPTV_KEEPINTVL; 360 tcp_maxpersistidle = TCPTV_KEEP_IDLE; 361 tcp_msl = TCPTV_MSL; 362 363 microuptime(&tcp_uptime); 364 read_random(&tcp_now, sizeof(tcp_now)); 365 tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */ 366 367 LIST_INIT(&tcb); 368 tcbinfo.listhead = &tcb; 369 pcbinfo = &tcbinfo; 370 if (!powerof2(tcp_tcbhashsize)) { 371 printf("WARNING: TCB hash size not a power of 2\n"); 372 tcp_tcbhashsize = 512; /* safe default */ 373 } 374 tcbinfo.hashsize = tcp_tcbhashsize; 375 tcbinfo.hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.hashmask); 376 tcbinfo.porthashbase = hashinit(tcp_tcbhashsize, M_PCB, 377 &tcbinfo.porthashmask); 378 str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t)); 379 tcbinfo.ipi_zone = (void *) zinit(str_size, 120000*str_size, 8192, "tcpcb"); 380 zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE); 381 zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE); 382 383 str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t)); 384 sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone"); 385 zone_change(sack_hole_zone, Z_CALLERACCT, FALSE); 386 zone_change(sack_hole_zone, Z_EXPAND, TRUE); 387 388 tcp_reass_maxseg = nmbclusters / 16; 389 str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t)); 390 tcp_reass_zone = zinit(str_size, (tcp_reass_maxseg + 1) * str_size, 391 0, "tcp_reass_zone"); 392 if (tcp_reass_zone == NULL) { 393 panic("%s: failed allocating 
tcp_reass_zone", __func__); 394 /* NOTREACHED */ 395 } 396 zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE); 397 zone_change(tcp_reass_zone, Z_EXPAND, TRUE); 398 399 bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t)); 400 tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0, "tcp_bwmeas_zone"); 401 if (tcp_bwmeas_zone == NULL) { 402 panic("%s: failed allocating tcp_bwmeas_zone", __func__); 403 /* NOTREACHED */ 404 } 405 zone_change(tcp_bwmeas_zone, Z_CALLERACCT, FALSE); 406 zone_change(tcp_bwmeas_zone, Z_EXPAND, TRUE); 407 408#if INET6 409#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) 410#else /* INET6 */ 411#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) 412#endif /* INET6 */ 413 if (max_protohdr < TCP_MINPROTOHDR) { 414 _max_protohdr = TCP_MINPROTOHDR; 415 _max_protohdr = max_protohdr; /* round it up */ 416 } 417 if (max_linkhdr + max_protohdr > MCLBYTES) 418 panic("tcp_init"); 419#undef TCP_MINPROTOHDR 420 421 /* 422 * allocate lock group attribute and group for tcp pcb mutexes 423 */ 424 pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); 425 pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr); 426 427 /* 428 * allocate the lock attribute for tcp pcb mutexes 429 */ 430 pcbinfo->mtx_attr = lck_attr_alloc_init(); 431 432 if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) { 433 printf("tcp_init: mutex not alloced!\n"); 434 return; /* pretty much dead if this fails... 
*/ 435 } 436 437 for (i=0; i < N_TIME_WAIT_SLOTS; i++) { 438 LIST_INIT(&time_wait_slots[i]); 439 } 440 441 bzero(&tcp_timer_list, sizeof(tcp_timer_list)); 442 LIST_INIT(&tcp_timer_list.lhead); 443 /* 444 * allocate lock group attribute, group and attribute for the tcp timer list 445 */ 446 tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init(); 447 tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr); 448 tcp_timer_list.mtx_attr = lck_attr_alloc_init(); 449 if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) { 450 panic("failed to allocate memory for tcp_timer_list.mtx\n"); 451 }; 452 tcp_timer_list.fast_quantum = TCP_FASTTIMER_QUANTUM; 453 tcp_timer_list.slow_quantum = TCP_SLOWTIMER_QUANTUM; 454 if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) { 455 panic("failed to allocate call entry 1 in tcp_init\n"); 456 } 457 458 /* 459 * allocate lock group attribute, group and attribute for tcp_uptime_lock 460 */ 461 tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init(); 462 tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr); 463 tcp_uptime_mtx_attr = lck_attr_alloc_init(); 464 tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr); 465 466 /* Initialize TCP congestion control algorithms list */ 467 tcp_cc_init(); 468 469 /* Initialize TCP LRO data structures */ 470 tcp_lro_init(); 471} 472 473/* 474 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. 475 * tcp_template used to store this data in mbufs, but we now recopy it out 476 * of the tcpcb each time to conserve mbufs. 
477 */ 478void 479tcp_fillheaders(tp, ip_ptr, tcp_ptr) 480 struct tcpcb *tp; 481 void *ip_ptr; 482 void *tcp_ptr; 483{ 484 struct inpcb *inp = tp->t_inpcb; 485 struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; 486 487#if INET6 488 if ((inp->inp_vflag & INP_IPV6) != 0) { 489 struct ip6_hdr *ip6; 490 491 ip6 = (struct ip6_hdr *)ip_ptr; 492 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 493 (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); 494 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | 495 (IPV6_VERSION & IPV6_VERSION_MASK); 496 ip6->ip6_nxt = IPPROTO_TCP; 497 ip6->ip6_plen = sizeof(struct tcphdr); 498 ip6->ip6_src = inp->in6p_laddr; 499 ip6->ip6_dst = inp->in6p_faddr; 500 tcp_hdr->th_sum = in6_cksum_phdr(&inp->in6p_laddr, 501 &inp->in6p_faddr, htonl(sizeof(struct tcphdr)), 502 htonl(IPPROTO_TCP)); 503 } else 504#endif 505 { 506 struct ip *ip = (struct ip *) ip_ptr; 507 508 ip->ip_vhl = IP_VHL_BORING; 509 ip->ip_tos = 0; 510 ip->ip_len = 0; 511 ip->ip_id = 0; 512 ip->ip_off = 0; 513 ip->ip_ttl = 0; 514 ip->ip_sum = 0; 515 ip->ip_p = IPPROTO_TCP; 516 ip->ip_src = inp->inp_laddr; 517 ip->ip_dst = inp->inp_faddr; 518 tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 519 htons(sizeof(struct tcphdr) + IPPROTO_TCP)); 520 } 521 522 tcp_hdr->th_sport = inp->inp_lport; 523 tcp_hdr->th_dport = inp->inp_fport; 524 tcp_hdr->th_seq = 0; 525 tcp_hdr->th_ack = 0; 526 tcp_hdr->th_x2 = 0; 527 tcp_hdr->th_off = 5; 528 tcp_hdr->th_flags = 0; 529 tcp_hdr->th_win = 0; 530 tcp_hdr->th_urp = 0; 531} 532 533/* 534 * Create template to be used to send tcp packets on a connection. 535 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only 536 * use for this function is in keepalives, which use tcp_respond. 
537 */ 538struct tcptemp * 539tcp_maketemplate(tp) 540 struct tcpcb *tp; 541{ 542 struct mbuf *m; 543 struct tcptemp *n; 544 545 m = m_get(M_DONTWAIT, MT_HEADER); 546 if (m == NULL) 547 return (0); 548 m->m_len = sizeof(struct tcptemp); 549 n = mtod(m, struct tcptemp *); 550 551 tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); 552 return (n); 553} 554 555/* 556 * Send a single message to the TCP at address specified by 557 * the given TCP/IP header. If m == 0, then we make a copy 558 * of the tcpiphdr at ti and send directly to the addressed host. 559 * This is used to force keep alive messages out using the TCP 560 * template for a connection. If flags are given then we send 561 * a message back to the TCP which originated the * segment ti, 562 * and discard the mbuf containing it and any other attached mbufs. 563 * 564 * In any case the ack and sequence number of the transmitted 565 * segment are as specified by the parameters. 566 * 567 * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
568 */ 569void 570tcp_respond( 571 struct tcpcb *tp, 572 void *ipgen, 573 register struct tcphdr *th, 574 register struct mbuf *m, 575 tcp_seq ack, 576 tcp_seq seq, 577 int flags, 578 unsigned int ifscope, 579 unsigned int nocell 580 ) 581{ 582 register int tlen; 583 int win = 0; 584 struct route *ro = 0; 585 struct route sro; 586 struct ip *ip; 587 struct tcphdr *nth; 588#if INET6 589 struct route_in6 *ro6 = 0; 590 struct route_in6 sro6; 591 struct ip6_hdr *ip6; 592 int isipv6; 593#endif /* INET6 */ 594 struct ifnet *outif; 595 596#if INET6 597 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; 598 ip6 = ipgen; 599#endif /* INET6 */ 600 ip = ipgen; 601 602 if (tp) { 603 if (!(flags & TH_RST)) { 604 win = tcp_sbspace(tp); 605 if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale) 606 win = (int32_t)TCP_MAXWIN << tp->rcv_scale; 607 } 608#if INET6 609 if (isipv6) 610 ro6 = &tp->t_inpcb->in6p_route; 611 else 612#endif /* INET6 */ 613 ro = &tp->t_inpcb->inp_route; 614 } else { 615#if INET6 616 if (isipv6) { 617 ro6 = &sro6; 618 bzero(ro6, sizeof *ro6); 619 } else 620#endif /* INET6 */ 621 { 622 ro = &sro; 623 bzero(ro, sizeof *ro); 624 } 625 } 626 if (m == 0) { 627 m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */ 628 if (m == NULL) 629 return; 630 tlen = 0; 631 m->m_data += max_linkhdr; 632#if INET6 633 if (isipv6) { 634 bcopy((caddr_t)ip6, mtod(m, caddr_t), 635 sizeof(struct ip6_hdr)); 636 ip6 = mtod(m, struct ip6_hdr *); 637 nth = (struct tcphdr *)(void *)(ip6 + 1); 638 } else 639#endif /* INET6 */ 640 { 641 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); 642 ip = mtod(m, struct ip *); 643 nth = (struct tcphdr *)(void *)(ip + 1); 644 } 645 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 646 flags = TH_ACK; 647 } else { 648 m_freem(m->m_next); 649 m->m_next = 0; 650 m->m_data = (caddr_t)ipgen; 651 /* m_len is set later */ 652 tlen = 0; 653#define xchg(a,b,type) { type t; t=a; a=b; b=t; } 654#if INET6 655 if (isipv6) { 656 /* Expect 32-bit aligned IP on 
strict-align platforms */ 657 IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6); 658 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 659 nth = (struct tcphdr *)(void *)(ip6 + 1); 660 } else 661#endif /* INET6 */ 662 { 663 /* Expect 32-bit aligned IP on strict-align platforms */ 664 IP_HDR_STRICT_ALIGNMENT_CHECK(ip); 665 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); 666 nth = (struct tcphdr *)(void *)(ip + 1); 667 } 668 if (th != nth) { 669 /* 670 * this is usually a case when an extension header 671 * exists between the IPv6 header and the 672 * TCP header. 673 */ 674 nth->th_sport = th->th_sport; 675 nth->th_dport = th->th_dport; 676 } 677 xchg(nth->th_dport, nth->th_sport, n_short); 678#undef xchg 679 } 680#if INET6 681 if (isipv6) { 682 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + 683 tlen)); 684 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 685 } else 686#endif 687 { 688 tlen += sizeof (struct tcpiphdr); 689 ip->ip_len = tlen; 690 ip->ip_ttl = ip_defttl; 691 } 692 m->m_len = tlen; 693 m->m_pkthdr.len = tlen; 694 m->m_pkthdr.rcvif = 0; 695#if CONFIG_MACF_NET 696 if (tp != NULL && tp->t_inpcb != NULL) { 697 /* 698 * Packet is associated with a socket, so allow the 699 * label of the response to reflect the socket label. 700 */ 701 mac_mbuf_label_associate_inpcb(tp->t_inpcb, m); 702 } else { 703 /* 704 * Packet is not associated with a socket, so possibly 705 * update the label in place. 
706 */ 707 mac_netinet_tcp_reply(m); 708 } 709#endif 710 711 nth->th_seq = htonl(seq); 712 nth->th_ack = htonl(ack); 713 nth->th_x2 = 0; 714 nth->th_off = sizeof (struct tcphdr) >> 2; 715 nth->th_flags = flags; 716 if (tp) 717 nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 718 else 719 nth->th_win = htons((u_short)win); 720 nth->th_urp = 0; 721#if INET6 722 if (isipv6) { 723 nth->th_sum = 0; 724 nth->th_sum = in6_cksum_phdr(&ip6->ip6_src, 725 &ip6->ip6_dst, htons((u_short)(tlen - sizeof(struct ip6_hdr))), 726 htonl(IPPROTO_TCP)); 727 m->m_pkthdr.csum_flags = CSUM_TCPIPV6; 728 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 729 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, 730 ro6 && ro6->ro_rt ? 731 ro6->ro_rt->rt_ifp : 732 NULL); 733 } else 734#endif /* INET6 */ 735 { 736 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 737 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 738 m->m_pkthdr.csum_flags = CSUM_TCP; 739 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 740 } 741#if TCPDEBUG 742 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 743 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); 744#endif 745#if IPSEC 746 if (ipsec_bypass == 0 && ipsec_setsocket(m, tp ? 
tp->t_inpcb->inp_socket : NULL) != 0) { 747 m_freem(m); 748 return; 749 } 750#endif 751 752 if (tp != NULL) { 753 u_int32_t svc_flags = 0; 754 if (isipv6) { 755 svc_flags |= PKT_SCF_IPV6; 756 } 757 set_packet_service_class(m, tp->t_inpcb->inp_socket, 758 MBUF_SC_UNSPEC, svc_flags); 759 760 /* Embed flowhash and flow control flags */ 761 m->m_pkthdr.m_flowhash = tp->t_inpcb->inp_flowhash; 762 m->m_pkthdr.m_fhflags |= 763 (PF_TAG_TCP | PF_TAG_FLOWHASH | PF_TAG_FLOWADV); 764 } 765 766#if INET6 767 if (isipv6) { 768 struct ip6_out_args ip6oa = { ifscope, { 0 }, 769 IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR }; 770 771 if (ifscope != IFSCOPE_NONE) 772 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; 773 if (nocell) 774 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; 775 776 (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL, 777 NULL, &ip6oa); 778 if (ro6->ro_rt != NULL) { 779 if (ro6 == &sro6) { 780 rtfree(ro6->ro_rt); 781 ro6->ro_rt = NULL; 782 } else if ((outif = ro6->ro_rt->rt_ifp) != 783 tp->t_inpcb->in6p_last_outifp) { 784 tp->t_inpcb->in6p_last_outifp = outif; 785 } 786 } 787 } else 788#endif /* INET6 */ 789 { 790 struct ip_out_args ipoa = { ifscope, { 0 }, 791 IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR }; 792 793 if (ifscope != IFSCOPE_NONE) 794 ipoa.ipoa_flags |= IPOAF_BOUND_IF; 795 if (nocell) 796 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; 797 798 if (ro != &sro) { 799 /* Copy the cached route and take an extra reference */ 800 inp_route_copyout(tp->t_inpcb, &sro); 801 } 802 /* 803 * For consistency, pass a local route copy. 
804 */ 805 (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa); 806 807 if (ro != &sro) { 808 if (sro.ro_rt != NULL && 809 (outif = sro.ro_rt->rt_ifp) != 810 tp->t_inpcb->inp_last_outifp) 811 tp->t_inpcb->inp_last_outifp = outif; 812 /* Synchronize cached PCB route */ 813 inp_route_copyin(tp->t_inpcb, &sro); 814 } else if (sro.ro_rt != NULL) { 815 rtfree(sro.ro_rt); 816 } 817 } 818} 819 820/* 821 * Create a new TCP control block, making an 822 * empty reassembly queue and hooking it to the argument 823 * protocol control block. The `inp' parameter must have 824 * come from the zone allocator set up in tcp_init(). 825 */ 826struct tcpcb * 827tcp_newtcpcb(inp) 828 struct inpcb *inp; 829{ 830 struct inp_tp *it; 831 register struct tcpcb *tp; 832 register struct socket *so = inp->inp_socket; 833#if INET6 834 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 835#endif /* INET6 */ 836 837 calculate_tcp_clock(); 838 839 if (so->cached_in_sock_layer == 0) { 840 it = (struct inp_tp *)(void *)inp; 841 tp = &it->tcb; 842 } 843 else 844 tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb; 845 846 bzero((char *) tp, sizeof(struct tcpcb)); 847 LIST_INIT(&tp->t_segq); 848 tp->t_maxseg = tp->t_maxopd = 849#if INET6 850 isipv6 ? tcp_v6mssdflt : 851#endif /* INET6 */ 852 tcp_mssdflt; 853 854 if (tcp_do_rfc1323) 855 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 856 tp->sack_enable = tcp_do_sack; 857 TAILQ_INIT(&tp->snd_holes); 858 tp->t_inpcb = inp; /* XXX */ 859 /* 860 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 861 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 862 * reasonable initial retransmit time. 
863 */ 864 tp->t_srtt = TCPTV_SRTTBASE; 865 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 866 tp->t_rttmin = tcp_TCPTV_MIN; 867 tp->t_rxtcur = TCPTV_RTOBASE; 868 869 /* Initialize congestion control algorithm for this connection 870 * to newreno by default 871 */ 872 tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX; 873 if (CC_ALGO(tp)->init != NULL) { 874 CC_ALGO(tp)->init(tp); 875 } 876 877 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 878 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 879 tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; 880 tp->t_rcvtime = tcp_now; 881 tp->tentry.timer_start = tcp_now; 882 tp->t_persist_timeout = tcp_max_persist_timeout; 883 tp->t_persist_stop = 0; 884 tp->t_flagsext |= TF_RCVUNACK_WAITSS; 885 /* 886 * IPv4 TTL initialization is necessary for an IPv6 socket as well, 887 * because the socket may be bound to an IPv6 wildcard address, 888 * which may match an IPv4-mapped IPv6 address. 889 */ 890 inp->inp_ip_ttl = ip_defttl; 891 inp->inp_ppcb = (caddr_t)tp; 892 return (tp); /* XXX */ 893} 894 895/* 896 * Drop a TCP connection, reporting 897 * the specified error. If connection is synchronized, 898 * then send a RST to peer. 
899 */ 900struct tcpcb * 901tcp_drop(tp, errno) 902 register struct tcpcb *tp; 903 int errno; 904{ 905 struct socket *so = tp->t_inpcb->inp_socket; 906#if CONFIG_DTRACE 907 struct inpcb *inp = tp->t_inpcb; 908#endif 909 910 if (TCPS_HAVERCVDSYN(tp->t_state)) { 911 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 912 struct tcpcb *, tp, int32_t, TCPS_CLOSED); 913 tp->t_state = TCPS_CLOSED; 914 (void) tcp_output(tp); 915 tcpstat.tcps_drops++; 916 } else 917 tcpstat.tcps_conndrops++; 918 if (errno == ETIMEDOUT && tp->t_softerror) 919 errno = tp->t_softerror; 920 so->so_error = errno; 921 return (tcp_close(tp)); 922} 923 924void 925tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) 926{ 927 u_int32_t rtt = rt->rt_rmx.rmx_rtt; 928 int isnetlocal = (tp->t_flags & TF_LOCAL); 929 930 if (rtt != 0) { 931 /* 932 * XXX the lock bit for RTT indicates that the value 933 * is also a minimum value; this is subject to time. 934 */ 935 if (rt->rt_rmx.rmx_locks & RTV_RTT) 936 tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ); 937 else 938 tp->t_rttmin = isnetlocal ? 
/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 *
 * Steps, in order:
 *	1. bail out if the inpcb no longer has a protocol control block;
 *	2. cancel all timers;
 *	3. defer (set TF_CLOSING and return) while output is in progress;
 *	4. run the congestion-control cleanup hook, if any;
 *	5. with the cached route locked, fold srtt/rttvar/ssthresh back
 *	   into the route metrics when enough RTT samples were collected;
 *	6. release the reassembly queue, SACK holes, bandwidth-measurement
 *	   record, pending packet list and LRO state;
 *	7. detach the inpcb and mark the socket disconnected.
 *
 * Every return path in this function returns NULL.
 */
struct tcpcb *
tcp_close(tp)
	register struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#if INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	struct rtentry *rt;
	int dosavessthresh;

	if ( inp->inp_ppcb == NULL) /* tcp_close was called previously, bail */
		return(NULL);

	tcp_canceltimers(tp);
	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0);

	/*
	 * If another thread for this tcp is currently in ip, defer the
	 * cleanup until after it returns back to tcp.  This is done to
	 * serialize the close until after all pending output is finished,
	 * in order to avoid having the PCB be detached and the cached route
	 * cleaned, only for ip to cache the route back into the PCB again.
	 * Note that we've cleared all the timers at this point.  Set
	 * TF_CLOSING to indicate to tcp_output() that it should call us
	 * again once it returns from ip; at that point the in-progress
	 * condition should be cleared and we can proceed further with the
	 * cleanup.
	 *
	 * NOTE(review): older wording of this comment referred to a
	 * TF_SENDINPROG flag; the code below tests inp_sndinprog_cnt
	 * instead.  It also returns while TF_CLOSING is still set, so
	 * whoever re-invokes tcp_close() is presumably expected to clear
	 * TF_CLOSING first -- confirm against tcp_output().
	 */
	if ((tp->t_flags & TF_CLOSING) ||
		inp->inp_sndinprog_cnt > 0) {
		tp->t_flags |= TF_CLOSING;
		return (NULL);
	}

	/* Give the congestion-control module a chance to release state. */
	if (CC_ALGO(tp)->cleanup != NULL) {
		CC_ALGO(tp)->cleanup(tp);
	}

	/* Pick the cached route matching the address family of the PCB. */
#if INET6
	rt = isipv6 ? inp->in6p_route.ro_rt : inp->inp_route.ro_rt;
#else
	rt = inp->inp_route.ro_rt;
#endif
	if (rt != NULL)
		RT_LOCK_SPIN(rt);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as the 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_int32_t i = 0;

#if INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			/*
			 * IPv6: only a missing route or an unspecified
			 * destination key disqualifies the route here.
			 */
			if (rt == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		/*
		 * IPv4: the route must exist, be up, not be keyed on
		 * INADDR_ANY and belong to the current route generation.
		 */
		if (rt == NULL || !(rt->rt_flags & RTF_UP) ||
		    ((struct sockaddr_in *)(void *)rt_key(rt))->sin_addr.s_addr ==
		    INADDR_ANY || rt->generation_id != route_generation) {
			/*
			 * NOTE(review): this moves the connection to
			 * TCPS_CLOSING as a side effect of the route being
			 * unusable; the reason is not evident from this
			 * function -- confirm it is intentional.
			 */
			if (tp->t_state >= TCPS_CLOSE_WAIT) {
				DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
					struct tcpcb *, tp, int32_t, TCPS_CLOSING);
				tp->t_state = TCPS_CLOSING;
			}
			goto no_valid_rt;
		}

		RT_LOCK_ASSERT_HELD(rt);
		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			/* Convert srtt from TCP scaled ticks to route units. */
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			/* Same conversion and averaging for the variance. */
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshhold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 *
			 * NOTE(review): divides by tp->t_maxseg; assumes it
			 * is non-zero here -- confirm.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_int32_t)(tp->t_maxseg +
#if INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#if INET6
				       )
#endif
				      );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}

	/*
	 * Mark route for deletion if no information is cached.
	 * (Skipped entirely when the route was rejected above, because
	 * those paths jump straight to no_valid_rt.)
	 */
	if (rt != NULL && (so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow) {
		if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
		    rt->rt_rmx.rmx_rtt == 0) {
			rt->rt_flags |= RTF_DELCLONE;
		}
	}

no_valid_rt:
	/* Drop the route lock taken before the metric update. */
	if (rt != NULL)
		RT_UNLOCK(rt);

	/* free the reassembly queue, if any */
	(void) tcp_freeq(tp);

	tcp_free_sackholes(tp);
	if (tp->t_bwmeas != NULL) {
		tcp_bwmeas_free(tp);
	}

	/* Free the packet list */
	if (tp->t_pktlist_head != NULL)
		m_freem_list(tp->t_pktlist_head);
	TCP_PKTLIST_CLEAR(tp);

#ifdef __APPLE__
	/* Remember the tcpcb on the inpcb when the socket layer caches it. */
	if (so->cached_in_sock_layer)
	    inp->inp_saved_ppcb = (caddr_t) tp;
#endif
	/* Issue a wakeup before detach so that we don't miss
	 * a wakeup
	 */
	sodisconnectwakeup(so);

	/*
	 * Clean up any LRO state
	 */
	if (tp->t_flagsext & TF_LRO_OFFLOADED) {
		tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
			inp->inp_lport,
			inp->inp_fport);
		tp->t_flagsext &= ~TF_LRO_OFFLOADED;
	}

#if INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif /* INET6 */
	in_pcbdetach(inp);

	/* Call soisdisconnected after detach because it might unlock the socket */
	soisdisconnected(so);
	tcpstat.tcps_closed++;
	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0);
	return(NULL);
}
LIST_FIRST(&tp->t_segq)) != NULL) { 1192 LIST_REMOVE(q, tqe_q); 1193 m_freem(q->tqe_m); 1194 zfree(tcp_reass_zone, q); 1195 tcp_reass_qsize--; 1196 rv = 1; 1197 } 1198 return (rv); 1199} 1200 1201void 1202tcp_drain() 1203{ 1204 if (do_tcpdrain) 1205 { 1206 struct inpcb *inpb; 1207 struct tcpcb *tcpb; 1208 struct tseg_qent *te; 1209 1210 /* 1211 * Walk the tcpbs, if existing, and flush the reassembly queue, 1212 * if there is one... 1213 * XXX: The "Net/3" implementation doesn't imply that the TCP 1214 * reassembly queue should be flushed, but in a situation 1215 * where we're really low on mbufs, this is potentially 1216 * usefull. 1217 */ 1218 if (!lck_rw_try_lock_exclusive(tcbinfo.mtx)) /* do it next time if the lock is in use */ 1219 return; 1220 1221 for (inpb = LIST_FIRST(tcbinfo.listhead); inpb; 1222 inpb = LIST_NEXT(inpb, inp_list)) { 1223 if ((tcpb = intotcpcb(inpb))) { 1224 while ((te = LIST_FIRST(&tcpb->t_segq)) 1225 != NULL) { 1226 LIST_REMOVE(te, tqe_q); 1227 m_freem(te->tqe_m); 1228 zfree(tcp_reass_zone, te); 1229 tcp_reass_qsize--; 1230 } 1231 } 1232 } 1233 lck_rw_done(tcbinfo.mtx); 1234 1235 } 1236} 1237 1238/* 1239 * Notify a tcp user of an asynchronous error; 1240 * store error as soft error, but wake up user 1241 * (for now, won't do anything until can select for soft error). 1242 * 1243 * Do not wake up user since there currently is no mechanism for 1244 * reporting soft errors (yet - a kqueue filter may be added). 1245 */ 1246static void 1247tcp_notify(inp, error) 1248 struct inpcb *inp; 1249 int error; 1250{ 1251 struct tcpcb *tp; 1252 1253 if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) 1254 return; /* pcb is gone already */ 1255 1256 tp = (struct tcpcb *)inp->inp_ppcb; 1257 1258 /* 1259 * Ignore some errors if we are hooked up. 1260 * If connection hasn't completed, has retransmitted several times, 1261 * and receives a second error, give up now. 
This is better 1262 * than waiting a long time to establish a connection that 1263 * can never complete. 1264 */ 1265 if (tp->t_state == TCPS_ESTABLISHED && 1266 (error == EHOSTUNREACH || error == ENETUNREACH || 1267 error == EHOSTDOWN)) { 1268 return; 1269 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 1270 tp->t_softerror) 1271 tcp_drop(tp, error); 1272 else 1273 tp->t_softerror = error; 1274#if 0 1275 wakeup((caddr_t) &so->so_timeo); 1276 sorwakeup(so); 1277 sowwakeup(so); 1278#endif 1279} 1280 1281struct bwmeas* 1282tcp_bwmeas_alloc(struct tcpcb *tp) 1283{ 1284 struct bwmeas *elm; 1285 elm = zalloc(tcp_bwmeas_zone); 1286 if (elm == NULL) 1287 return(elm); 1288 1289 bzero(elm, bwmeas_elm_size); 1290 elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE; 1291 elm->bw_maxsizepkts = TCP_BWMEAS_BURST_MAXSIZE; 1292 elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg; 1293 elm->bw_maxsize = elm->bw_maxsizepkts * tp->t_maxseg; 1294 return(elm); 1295} 1296 1297void 1298tcp_bwmeas_free(struct tcpcb* tp) 1299{ 1300 zfree(tcp_bwmeas_zone, tp->t_bwmeas); 1301 tp->t_bwmeas = NULL; 1302 tp->t_flagsext &= ~(TF_MEASURESNDBW); 1303} 1304 1305/* 1306 * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format. 1307 * The otcpcb data structure is passed to user space and must not change. 
1308 */ 1309static void 1310tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp) 1311{ 1312 int i; 1313 1314 otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; 1315 otp->t_dupacks = tp->t_dupacks; 1316 for (i = 0; i < TCPT_NTIMERS_EXT; i++) 1317 otp->t_timer[i] = tp->t_timer[i]; 1318 otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))(uintptr_t)tp->t_inpcb; 1319 otp->t_state = tp->t_state; 1320 otp->t_flags = tp->t_flags; 1321 otp->t_force = tp->t_force; 1322 otp->snd_una = tp->snd_una; 1323 otp->snd_max = tp->snd_max; 1324 otp->snd_nxt = tp->snd_nxt; 1325 otp->snd_up = tp->snd_up; 1326 otp->snd_wl1 = tp->snd_wl1; 1327 otp->snd_wl2 = tp->snd_wl2; 1328 otp->iss = tp->iss; 1329 otp->irs = tp->irs; 1330 otp->rcv_nxt = tp->rcv_nxt; 1331 otp->rcv_adv = tp->rcv_adv; 1332 otp->rcv_wnd = tp->rcv_wnd; 1333 otp->rcv_up = tp->rcv_up; 1334 otp->snd_wnd = tp->snd_wnd; 1335 otp->snd_cwnd = tp->snd_cwnd; 1336 otp->snd_ssthresh = tp->snd_ssthresh; 1337 otp->t_maxopd = tp->t_maxopd; 1338 otp->t_rcvtime = tp->t_rcvtime; 1339 otp->t_starttime = tp->t_starttime; 1340 otp->t_rtttime = tp->t_rtttime; 1341 otp->t_rtseq = tp->t_rtseq; 1342 otp->t_rxtcur = tp->t_rxtcur; 1343 otp->t_maxseg = tp->t_maxseg; 1344 otp->t_srtt = tp->t_srtt; 1345 otp->t_rttvar = tp->t_rttvar; 1346 otp->t_rxtshift = tp->t_rxtshift; 1347 otp->t_rttmin = tp->t_rttmin; 1348 otp->t_rttupdated = tp->t_rttupdated; 1349 otp->max_sndwnd = tp->max_sndwnd; 1350 otp->t_softerror = tp->t_softerror; 1351 otp->t_oobflags = tp->t_oobflags; 1352 otp->t_iobc = tp->t_iobc; 1353 otp->snd_scale = tp->snd_scale; 1354 otp->rcv_scale = tp->rcv_scale; 1355 otp->request_r_scale = tp->request_r_scale; 1356 otp->requested_s_scale = tp->requested_s_scale; 1357 otp->ts_recent = tp->ts_recent; 1358 otp->ts_recent_age = tp->ts_recent_age; 1359 otp->last_ack_sent = tp->last_ack_sent; 1360 otp->cc_send = tp->cc_send; 1361 otp->cc_recv = tp->cc_recv; 1362 otp->snd_recover = tp->snd_recover; 1363 otp->snd_cwnd_prev = tp->snd_cwnd_prev; 1364 
otp->snd_ssthresh_prev = tp->snd_ssthresh_prev; 1365 otp->t_badrxtwin = tp->t_badrxtwin; 1366} 1367 1368static int 1369tcp_pcblist SYSCTL_HANDLER_ARGS 1370{ 1371#pragma unused(oidp, arg1, arg2) 1372 int error, i, n; 1373 struct inpcb *inp, **inp_list; 1374 inp_gen_t gencnt; 1375 struct xinpgen xig; 1376 int slot; 1377 1378 /* 1379 * The process of preparing the TCB list is too time-consuming and 1380 * resource-intensive to repeat twice on every request. 1381 */ 1382 lck_rw_lock_shared(tcbinfo.mtx); 1383 if (req->oldptr == USER_ADDR_NULL) { 1384 n = tcbinfo.ipi_count; 1385 req->oldidx = 2 * (sizeof xig) 1386 + (n + n/8) * sizeof(struct xtcpcb); 1387 lck_rw_done(tcbinfo.mtx); 1388 return 0; 1389 } 1390 1391 if (req->newptr != USER_ADDR_NULL) { 1392 lck_rw_done(tcbinfo.mtx); 1393 return EPERM; 1394 } 1395 1396 /* 1397 * OK, now we're committed to doing something. 1398 */ 1399 gencnt = tcbinfo.ipi_gencnt; 1400 n = tcbinfo.ipi_count; 1401 1402 bzero(&xig, sizeof(xig)); 1403 xig.xig_len = sizeof xig; 1404 xig.xig_count = n; 1405 xig.xig_gen = gencnt; 1406 xig.xig_sogen = so_gencnt; 1407 error = SYSCTL_OUT(req, &xig, sizeof xig); 1408 if (error) { 1409 lck_rw_done(tcbinfo.mtx); 1410 return error; 1411 } 1412 /* 1413 * We are done if there is no pcb 1414 */ 1415 if (n == 0) { 1416 lck_rw_done(tcbinfo.mtx); 1417 return 0; 1418 } 1419 1420 inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1421 if (inp_list == 0) { 1422 lck_rw_done(tcbinfo.mtx); 1423 return ENOMEM; 1424 } 1425 1426 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; 1427 inp = LIST_NEXT(inp, inp_list)) { 1428#ifdef __APPLE__ 1429 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) 1430#else 1431 if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) 1432#endif 1433 inp_list[i++] = inp; 1434 } 1435 1436 for (slot = 0; slot < N_TIME_WAIT_SLOTS; slot++) { 1437 struct inpcb *inpnxt; 1438 1439 for (inp = time_wait_slots[slot].lh_first; inp && i < n; inp = inpnxt) { 1440 
inpnxt = inp->inp_list.le_next; 1441 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) 1442 inp_list[i++] = inp; 1443 } 1444 } 1445 1446 n = i; 1447 1448 error = 0; 1449 for (i = 0; i < n; i++) { 1450 inp = inp_list[i]; 1451 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { 1452 struct xtcpcb xt; 1453 caddr_t inp_ppcb; 1454 1455 bzero(&xt, sizeof(xt)); 1456 xt.xt_len = sizeof xt; 1457 /* XXX should avoid extra copy */ 1458 inpcb_to_compat(inp, &xt.xt_inp); 1459 inp_ppcb = inp->inp_ppcb; 1460 if (inp_ppcb != NULL) { 1461 tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb, 1462 &xt.xt_tp); 1463 } else { 1464 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 1465 } 1466 if (inp->inp_socket) 1467 sotoxsocket(inp->inp_socket, &xt.xt_socket); 1468 error = SYSCTL_OUT(req, &xt, sizeof xt); 1469 } 1470 } 1471 if (!error) { 1472 /* 1473 * Give the user an updated idea of our state. 1474 * If the generation differs from what we told 1475 * them before, they know that something happened 1476 * while we were processing this request, and it 1477 * might be necessary to retry. 
1478 */ 1479 bzero(&xig, sizeof(xig)); 1480 xig.xig_len = sizeof xig; 1481 xig.xig_gen = tcbinfo.ipi_gencnt; 1482 xig.xig_sogen = so_gencnt; 1483 xig.xig_count = tcbinfo.ipi_count; 1484 error = SYSCTL_OUT(req, &xig, sizeof xig); 1485 } 1486 FREE(inp_list, M_TEMP); 1487 lck_rw_done(tcbinfo.mtx); 1488 return error; 1489} 1490 1491SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 1492 tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 1493 1494#if !CONFIG_EMBEDDED 1495 1496static void 1497tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp) 1498{ 1499 int i; 1500 1501 otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; 1502 otp->t_dupacks = tp->t_dupacks; 1503 for (i = 0; i < TCPT_NTIMERS_EXT; i++) 1504 otp->t_timer[i] = tp->t_timer[i]; 1505 otp->t_state = tp->t_state; 1506 otp->t_flags = tp->t_flags; 1507 otp->t_force = tp->t_force; 1508 otp->snd_una = tp->snd_una; 1509 otp->snd_max = tp->snd_max; 1510 otp->snd_nxt = tp->snd_nxt; 1511 otp->snd_up = tp->snd_up; 1512 otp->snd_wl1 = tp->snd_wl1; 1513 otp->snd_wl2 = tp->snd_wl2; 1514 otp->iss = tp->iss; 1515 otp->irs = tp->irs; 1516 otp->rcv_nxt = tp->rcv_nxt; 1517 otp->rcv_adv = tp->rcv_adv; 1518 otp->rcv_wnd = tp->rcv_wnd; 1519 otp->rcv_up = tp->rcv_up; 1520 otp->snd_wnd = tp->snd_wnd; 1521 otp->snd_cwnd = tp->snd_cwnd; 1522 otp->snd_ssthresh = tp->snd_ssthresh; 1523 otp->t_maxopd = tp->t_maxopd; 1524 otp->t_rcvtime = tp->t_rcvtime; 1525 otp->t_starttime = tp->t_starttime; 1526 otp->t_rtttime = tp->t_rtttime; 1527 otp->t_rtseq = tp->t_rtseq; 1528 otp->t_rxtcur = tp->t_rxtcur; 1529 otp->t_maxseg = tp->t_maxseg; 1530 otp->t_srtt = tp->t_srtt; 1531 otp->t_rttvar = tp->t_rttvar; 1532 otp->t_rxtshift = tp->t_rxtshift; 1533 otp->t_rttmin = tp->t_rttmin; 1534 otp->t_rttupdated = tp->t_rttupdated; 1535 otp->max_sndwnd = tp->max_sndwnd; 1536 otp->t_softerror = tp->t_softerror; 1537 otp->t_oobflags = tp->t_oobflags; 1538 otp->t_iobc = tp->t_iobc; 1539 otp->snd_scale = 
tp->snd_scale; 1540 otp->rcv_scale = tp->rcv_scale; 1541 otp->request_r_scale = tp->request_r_scale; 1542 otp->requested_s_scale = tp->requested_s_scale; 1543 otp->ts_recent = tp->ts_recent; 1544 otp->ts_recent_age = tp->ts_recent_age; 1545 otp->last_ack_sent = tp->last_ack_sent; 1546 otp->cc_send = tp->cc_send; 1547 otp->cc_recv = tp->cc_recv; 1548 otp->snd_recover = tp->snd_recover; 1549 otp->snd_cwnd_prev = tp->snd_cwnd_prev; 1550 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev; 1551 otp->t_badrxtwin = tp->t_badrxtwin; 1552} 1553 1554 1555static int 1556tcp_pcblist64 SYSCTL_HANDLER_ARGS 1557{ 1558#pragma unused(oidp, arg1, arg2) 1559 int error, i, n; 1560 struct inpcb *inp, **inp_list; 1561 inp_gen_t gencnt; 1562 struct xinpgen xig; 1563 int slot; 1564 1565 /* 1566 * The process of preparing the TCB list is too time-consuming and 1567 * resource-intensive to repeat twice on every request. 1568 */ 1569 lck_rw_lock_shared(tcbinfo.mtx); 1570 if (req->oldptr == USER_ADDR_NULL) { 1571 n = tcbinfo.ipi_count; 1572 req->oldidx = 2 * (sizeof xig) 1573 + (n + n/8) * sizeof(struct xtcpcb64); 1574 lck_rw_done(tcbinfo.mtx); 1575 return 0; 1576 } 1577 1578 if (req->newptr != USER_ADDR_NULL) { 1579 lck_rw_done(tcbinfo.mtx); 1580 return EPERM; 1581 } 1582 1583 /* 1584 * OK, now we're committed to doing something. 
1585 */ 1586 gencnt = tcbinfo.ipi_gencnt; 1587 n = tcbinfo.ipi_count; 1588 1589 bzero(&xig, sizeof(xig)); 1590 xig.xig_len = sizeof xig; 1591 xig.xig_count = n; 1592 xig.xig_gen = gencnt; 1593 xig.xig_sogen = so_gencnt; 1594 error = SYSCTL_OUT(req, &xig, sizeof xig); 1595 if (error) { 1596 lck_rw_done(tcbinfo.mtx); 1597 return error; 1598 } 1599 /* 1600 * We are done if there is no pcb 1601 */ 1602 if (n == 0) { 1603 lck_rw_done(tcbinfo.mtx); 1604 return 0; 1605 } 1606 1607 inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1608 if (inp_list == 0) { 1609 lck_rw_done(tcbinfo.mtx); 1610 return ENOMEM; 1611 } 1612 1613 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; 1614 inp = LIST_NEXT(inp, inp_list)) { 1615#ifdef __APPLE__ 1616 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) 1617#else 1618 if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) 1619#endif 1620 inp_list[i++] = inp; 1621 } 1622 1623 for (slot = 0; slot < N_TIME_WAIT_SLOTS; slot++) { 1624 struct inpcb *inpnxt; 1625 1626 for (inp = time_wait_slots[slot].lh_first; inp && i < n; inp = inpnxt) { 1627 inpnxt = inp->inp_list.le_next; 1628 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) 1629 inp_list[i++] = inp; 1630 } 1631 } 1632 1633 n = i; 1634 1635 error = 0; 1636 for (i = 0; i < n; i++) { 1637 inp = inp_list[i]; 1638 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { 1639 struct xtcpcb64 xt; 1640 1641 bzero(&xt, sizeof(xt)); 1642 xt.xt_len = sizeof xt; 1643 inpcb_to_xinpcb64(inp, &xt.xt_inpcb); 1644 xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; 1645 if (inp->inp_ppcb != NULL) 1646 tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt); 1647 if (inp->inp_socket) 1648 sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket); 1649 error = SYSCTL_OUT(req, &xt, sizeof xt); 1650 } 1651 } 1652 if (!error) { 1653 /* 1654 * Give the user an updated idea of our state. 
1655 * If the generation differs from what we told 1656 * them before, they know that something happened 1657 * while we were processing this request, and it 1658 * might be necessary to retry. 1659 */ 1660 bzero(&xig, sizeof(xig)); 1661 xig.xig_len = sizeof xig; 1662 xig.xig_gen = tcbinfo.ipi_gencnt; 1663 xig.xig_sogen = so_gencnt; 1664 xig.xig_count = tcbinfo.ipi_count; 1665 error = SYSCTL_OUT(req, &xig, sizeof xig); 1666 } 1667 FREE(inp_list, M_TEMP); 1668 lck_rw_done(tcbinfo.mtx); 1669 return error; 1670} 1671 1672SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 1673 tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections"); 1674 1675#endif /* !CONFIG_EMBEDDED */ 1676 1677static int 1678tcp_pcblist_n SYSCTL_HANDLER_ARGS 1679{ 1680#pragma unused(oidp, arg1, arg2) 1681 int error = 0; 1682 1683 error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo); 1684 1685 return error; 1686} 1687 1688 1689SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 1690 tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); 1691 1692 1693__private_extern__ void 1694tcp_get_ports_used(unsigned int ifindex, uint8_t *bitfield) 1695{ 1696 inpcb_get_ports_used(ifindex, bitfield, &tcbinfo); 1697} 1698 1699__private_extern__ uint32_t 1700tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags) 1701{ 1702 return inpcb_count_opportunistic(ifindex, &tcbinfo, flags); 1703} 1704 1705void 1706tcp_ctlinput(cmd, sa, vip) 1707 int cmd; 1708 struct sockaddr *sa; 1709 void *vip; 1710{ 1711 tcp_seq icmp_tcp_seq; 1712 struct ip *ip = vip; 1713 struct in_addr faddr; 1714 struct inpcb *inp; 1715 struct tcpcb *tp; 1716 1717 void (*notify)(struct inpcb *, int) = tcp_notify; 1718 1719 faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr; 1720 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1721 return; 1722 1723 if (cmd == PRC_MSGSIZE) 1724 notify = tcp_mtudisc; 1725 else if (icmp_may_rst && (cmd == 
PRC_UNREACH_ADMIN_PROHIB || 1726 cmd == PRC_UNREACH_PORT) && ip) 1727 notify = tcp_drop_syn_sent; 1728 else if (PRC_IS_REDIRECT(cmd)) { 1729 ip = 0; 1730 notify = in_rtchange; 1731 } else if (cmd == PRC_HOSTDEAD) 1732 ip = 0; 1733 /* Source quench is deprecated */ 1734 else if (cmd == PRC_QUENCH) 1735 return; 1736 else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) 1737 return; 1738 if (ip) { 1739 struct tcphdr th; 1740 struct icmp *icp; 1741 1742 icp = (struct icmp *)(void *) 1743 ((caddr_t)ip - offsetof(struct icmp, icmp_ip)); 1744 bcopy(((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)), 1745 &th, sizeof (th)); 1746 inp = in_pcblookup_hash(&tcbinfo, faddr, th.th_dport, 1747 ip->ip_src, th.th_sport, 0, NULL); 1748 if (inp != NULL && inp->inp_socket != NULL) { 1749 tcp_lock(inp->inp_socket, 1, 0); 1750 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { 1751 tcp_unlock(inp->inp_socket, 1, 0); 1752 return; 1753 } 1754 icmp_tcp_seq = htonl(th.th_seq); 1755 tp = intotcpcb(inp); 1756 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && 1757 SEQ_LT(icmp_tcp_seq, tp->snd_max)) { 1758 if (cmd == PRC_MSGSIZE) { 1759 1760 /* 1761 * MTU discovery: 1762 * If we got a needfrag and there is a host route to the 1763 * original destination, and the MTU is not locked, then 1764 * set the MTU in the route to the suggested new value 1765 * (if given) and then notify as usual. The ULPs will 1766 * notice that the MTU has changed and adapt accordingly. 1767 * If no new MTU was suggested, then we guess a new one 1768 * less than the current value. If the new MTU is 1769 * unreasonably small (defined by sysctl tcp_minmss), then 1770 * we reset the MTU to the interface value and enable the 1771 * lock bit, indicating that we are no longer doing MTU 1772 * discovery. 
1773 */ 1774 struct rtentry *rt; 1775 int mtu; 1776 struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET, 1777 0 , { 0 }, { 0,0,0,0,0,0,0,0 } }; 1778 icmpsrc.sin_addr = icp->icmp_ip.ip_dst; 1779 1780 rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, 1781 RTF_CLONING | RTF_PRCLONING); 1782 if (rt != NULL) { 1783 RT_LOCK(rt); 1784 if ((rt->rt_flags & RTF_HOST) && 1785 !(rt->rt_rmx.rmx_locks & RTV_MTU)) { 1786 mtu = ntohs(icp->icmp_nextmtu); 1787 if (!mtu) 1788 mtu = ip_next_mtu(rt->rt_rmx. 1789 rmx_mtu, 1); 1790#if DEBUG_MTUDISC 1791 printf("MTU for %s reduced to %d\n", 1792 inet_ntop(AF_INET, 1793 &icmpsrc.sin_addr, ipv4str, 1794 sizeof (ipv4str)), mtu); 1795#endif 1796 if (mtu < max(296, (tcp_minmss + 1797 sizeof (struct tcpiphdr)))) { 1798 /* rt->rt_rmx.rmx_mtu = 1799 rt->rt_ifp->if_mtu; */ 1800 rt->rt_rmx.rmx_locks |= RTV_MTU; 1801 } else if (rt->rt_rmx.rmx_mtu > mtu) { 1802 rt->rt_rmx.rmx_mtu = mtu; 1803 } 1804 } 1805 RT_UNLOCK(rt); 1806 rtfree(rt); 1807 } 1808 } 1809 1810 (*notify)(inp, inetctlerrmap[cmd]); 1811 } 1812 tcp_unlock(inp->inp_socket, 1, 0); 1813 } 1814 } else 1815 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); 1816} 1817 1818#if INET6 1819void 1820tcp6_ctlinput(cmd, sa, d) 1821 int cmd; 1822 struct sockaddr *sa; 1823 void *d; 1824{ 1825 struct tcphdr th; 1826 void (*notify)(struct inpcb *, int) = tcp_notify; 1827 struct ip6_hdr *ip6; 1828 struct mbuf *m; 1829 struct ip6ctlparam *ip6cp = NULL; 1830 const struct sockaddr_in6 *sa6_src = NULL; 1831 int off; 1832 struct tcp_portonly { 1833 u_int16_t th_sport; 1834 u_int16_t th_dport; 1835 } *thp; 1836 1837 if (sa->sa_family != AF_INET6 || 1838 sa->sa_len != sizeof(struct sockaddr_in6)) 1839 return; 1840 1841 if (cmd == PRC_MSGSIZE) 1842 notify = tcp_mtudisc; 1843 else if (!PRC_IS_REDIRECT(cmd) && 1844 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) 1845 return; 1846 /* Source quench is deprecated */ 1847 else if (cmd == PRC_QUENCH) 1848 return; 1849 1850 /* if the 
parameter is from icmp6, decode it. */ 1851 if (d != NULL) { 1852 ip6cp = (struct ip6ctlparam *)d; 1853 m = ip6cp->ip6c_m; 1854 ip6 = ip6cp->ip6c_ip6; 1855 off = ip6cp->ip6c_off; 1856 sa6_src = ip6cp->ip6c_src; 1857 } else { 1858 m = NULL; 1859 ip6 = NULL; 1860 off = 0; /* fool gcc */ 1861 sa6_src = &sa6_any; 1862 } 1863 1864 if (ip6) { 1865 /* 1866 * XXX: We assume that when IPV6 is non NULL, 1867 * M and OFF are valid. 1868 */ 1869 1870 /* check if we can safely examine src and dst ports */ 1871 if (m->m_pkthdr.len < off + sizeof(*thp)) 1872 return; 1873 1874 bzero(&th, sizeof(th)); 1875 m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 1876 1877 in6_pcbnotify(&tcbinfo, sa, th.th_dport, 1878 (struct sockaddr *)ip6cp->ip6c_src, 1879 th.th_sport, cmd, NULL, notify); 1880 } else { 1881 in6_pcbnotify(&tcbinfo, sa, 0, 1882 (struct sockaddr *)(size_t)sa6_src, 0, cmd, NULL, notify); 1883 } 1884} 1885#endif /* INET6 */ 1886 1887 1888/* 1889 * Following is where TCP initial sequence number generation occurs. 1890 * 1891 * There are two places where we must use initial sequence numbers: 1892 * 1. In SYN-ACK packets. 1893 * 2. In SYN packets. 1894 * 1895 * The ISNs in SYN-ACK packets have no monotonicity requirement, 1896 * and should be as unpredictable as possible to avoid the possibility 1897 * of spoofing and/or connection hijacking. To satisfy this 1898 * requirement, SYN-ACK ISNs are generated via the arc4random() 1899 * function. If exact RFC 1948 compliance is requested via sysctl, 1900 * these ISNs will be generated just like those in SYN packets. 1901 * 1902 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 1903 * depends on this property. In addition, these ISNs should be 1904 * unguessable so as to prevent connection hijacking. To satisfy 1905 * the requirements of this situation, the algorithm outlined in 1906 * RFC 1948 is used to generate sequence numbers. 1907 * 1908 * For more information on the theory of operation, please see 1909 * RFC 1948. 
1910 * 1911 * Implementation details: 1912 * 1913 * Time is based off the system timer, and is corrected so that it 1914 * increases by one megabyte per second. This allows for proper 1915 * recycling on high speed LANs while still leaving over an hour 1916 * before rollover. 1917 * 1918 * Two sysctls control the generation of ISNs: 1919 * 1920 * net.inet.tcp.isn_reseed_interval controls the number of seconds 1921 * between seeding of isn_secret. This is normally set to zero, 1922 * as reseeding should not be necessary. 1923 * 1924 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed 1925 * strictly. When strict compliance is requested, reseeding is 1926 * disabled and SYN-ACKs will be generated in the same manner as 1927 * SYNs. Strict mode is disabled by default. 1928 * 1929 */ 1930 1931#define ISN_BYTES_PER_SECOND 1048576 1932 1933tcp_seq 1934tcp_new_isn(tp) 1935 struct tcpcb *tp; 1936{ 1937 u_int32_t md5_buffer[4]; 1938 tcp_seq new_isn; 1939 struct timeval timenow; 1940 u_char isn_secret[32]; 1941 int isn_last_reseed = 0; 1942 MD5_CTX isn_ctx; 1943 1944 /* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */ 1945 if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) 1946 && tcp_strict_rfc1948 == 0) 1947#ifdef __APPLE__ 1948 return random(); 1949#else 1950 return arc4random(); 1951#endif 1952 getmicrotime(&timenow); 1953 1954 /* Seed if this is the first use, reseed if requested. */ 1955 if ((isn_last_reseed == 0) || 1956 ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) && 1957 (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) 1958 < (u_int)timenow.tv_sec))) { 1959#ifdef __APPLE__ 1960 read_random(&isn_secret, sizeof(isn_secret)); 1961#else 1962 read_random_unlimited(&isn_secret, sizeof(isn_secret)); 1963#endif 1964 isn_last_reseed = timenow.tv_sec; 1965 } 1966 1967 /* Compute the md5 hash and return the ISN. 
*/ 1968 MD5Init(&isn_ctx); 1969 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); 1970 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); 1971#if INET6 1972 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { 1973 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, 1974 sizeof(struct in6_addr)); 1975 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, 1976 sizeof(struct in6_addr)); 1977 } else 1978#endif 1979 { 1980 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, 1981 sizeof(struct in_addr)); 1982 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, 1983 sizeof(struct in_addr)); 1984 } 1985 MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); 1986 MD5Final((u_char *) &md5_buffer, &isn_ctx); 1987 new_isn = (tcp_seq) md5_buffer[0]; 1988 new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz); 1989 return new_isn; 1990} 1991 1992 1993/* 1994 * When a specific ICMP unreachable message is received and the 1995 * connection state is SYN-SENT, drop the connection. This behavior 1996 * is controlled by the icmp_may_rst sysctl. 1997 */ 1998void 1999tcp_drop_syn_sent(inp, errno) 2000 struct inpcb *inp; 2001 int errno; 2002{ 2003 struct tcpcb *tp = intotcpcb(inp); 2004 2005 if (tp && tp->t_state == TCPS_SYN_SENT) 2006 tcp_drop(tp, errno); 2007} 2008 2009/* 2010 * When `need fragmentation' ICMP is received, update our idea of the MSS 2011 * based on the new value in the route. Also nudge TCP to send something, 2012 * since we know the packet we just sent was dropped. 2013 * This duplicates some code in the tcp_mss() function in tcp_input.c. 
2014 */ 2015void 2016tcp_mtudisc( 2017 struct inpcb *inp, 2018 __unused int errno 2019) 2020{ 2021 struct tcpcb *tp = intotcpcb(inp); 2022 struct rtentry *rt; 2023 struct rmxp_tao *taop; 2024 struct socket *so = inp->inp_socket; 2025 int offered; 2026 int mss; 2027#if INET6 2028 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 2029#endif /* INET6 */ 2030 2031 if (tp) { 2032#if INET6 2033 if (isipv6) 2034 rt = tcp_rtlookup6(inp, IFSCOPE_NONE); 2035 else 2036#endif /* INET6 */ 2037 rt = tcp_rtlookup(inp, IFSCOPE_NONE); 2038 if (!rt || !rt->rt_rmx.rmx_mtu) { 2039 tp->t_maxopd = tp->t_maxseg = 2040#if INET6 2041 isipv6 ? tcp_v6mssdflt : 2042#endif /* INET6 */ 2043 tcp_mssdflt; 2044 2045 /* Route locked during lookup above */ 2046 if (rt != NULL) 2047 RT_UNLOCK(rt); 2048 return; 2049 } 2050 taop = rmx_taop(rt->rt_rmx); 2051 offered = taop->tao_mssopt; 2052 mss = rt->rt_rmx.rmx_mtu - 2053#if INET6 2054 (isipv6 ? 2055 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : 2056#endif /* INET6 */ 2057 sizeof(struct tcpiphdr) 2058#if INET6 2059 ) 2060#endif /* INET6 */ 2061 ; 2062 2063 /* Route locked during lookup above */ 2064 RT_UNLOCK(rt); 2065 2066 if (offered) 2067 mss = min(mss, offered); 2068 /* 2069 * XXX - The above conditional probably violates the TCP 2070 * spec. The problem is that, since we don't know the 2071 * other end's MSS, we are supposed to use a conservative 2072 * default. But, if we do that, then MTU discovery will 2073 * never actually take place, because the conservative 2074 * default is much less than the MTUs typically seen 2075 * on the Internet today. For the moment, we'll sweep 2076 * this under the carpet. 2077 * 2078 * The conservative default might not actually be a problem 2079 * if the only case this occurs is when sending an initial 2080 * SYN with options and data to a host we've never talked 2081 * to before. Then, they will reply with an MSS value which 2082 * will get recorded and the new parameters should get 2083 * recomputed. 
For Further Study. 2084 */ 2085 if (tp->t_maxopd <= mss) 2086 return; 2087 tp->t_maxopd = mss; 2088 2089 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2090 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2091 mss -= TCPOLEN_TSTAMP_APPA; 2092 2093 if (so->so_snd.sb_hiwat < mss) 2094 mss = so->so_snd.sb_hiwat; 2095 2096 tp->t_maxseg = mss; 2097 2098 /* 2099 * Reset the slow-start flight size as it may depends on the new MSS 2100 */ 2101 if (CC_ALGO(tp)->cwnd_init != NULL) 2102 CC_ALGO(tp)->cwnd_init(tp); 2103 tcpstat.tcps_mturesent++; 2104 tp->t_rtttime = 0; 2105 tp->snd_nxt = tp->snd_una; 2106 tcp_output(tp); 2107 } 2108} 2109 2110/* 2111 * Look-up the routing entry to the peer of this inpcb. If no route 2112 * is found and it cannot be allocated the return NULL. This routine 2113 * is called by TCP routines that access the rmx structure and by tcp_mss 2114 * to get the interface MTU. If a route is found, this routine will 2115 * hold the rtentry lock; the caller is responsible for unlocking. 
2116 */ 2117struct rtentry * 2118tcp_rtlookup(inp, input_ifscope) 2119 struct inpcb *inp; 2120 unsigned int input_ifscope; 2121{ 2122 struct route *ro; 2123 struct rtentry *rt; 2124 struct tcpcb *tp; 2125 2126 lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); 2127 2128 ro = &inp->inp_route; 2129 if ((rt = ro->ro_rt) != NULL) 2130 RT_LOCK(rt); 2131 2132 if (rt == NULL || !(rt->rt_flags & RTF_UP) || 2133 rt->generation_id != route_generation) { 2134 /* No route yet, so try to acquire one */ 2135 if (inp->inp_faddr.s_addr != INADDR_ANY) { 2136 unsigned int ifscope; 2137 2138 ro->ro_dst.sa_family = AF_INET; 2139 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 2140 ((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr = 2141 inp->inp_faddr; 2142 2143 /* 2144 * If the socket was bound to an interface, then 2145 * the bound-to-interface takes precedence over 2146 * the inbound interface passed in by the caller 2147 * (if we get here as part of the output path then 2148 * input_ifscope is IFSCOPE_NONE). 2149 */ 2150 ifscope = (inp->inp_flags & INP_BOUND_IF) ? 2151 inp->inp_boundifp->if_index : input_ifscope; 2152 2153 if (rt != NULL) 2154 RT_UNLOCK(rt); 2155 rtalloc_scoped(ro, ifscope); 2156 if ((rt = ro->ro_rt) != NULL) 2157 RT_LOCK(rt); 2158 } 2159 } 2160 2161 /* 2162 * Update MTU discovery determination. 
Don't do it if: 2163 * 1) it is disabled via the sysctl 2164 * 2) the route isn't up 2165 * 3) the MTU is locked (if it is, then discovery has been 2166 * disabled) 2167 */ 2168 2169 tp = intotcpcb(inp); 2170 2171 if (!path_mtu_discovery || ((rt != NULL) && 2172 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) 2173 tp->t_flags &= ~TF_PMTUD; 2174 else 2175 tp->t_flags |= TF_PMTUD; 2176 2177#if CONFIG_IFEF_NOWINDOWSCALE 2178 if (tcp_obey_ifef_nowindowscale && 2179 tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && 2180 (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) { 2181 /* Window scaling is enabled on this interface */ 2182 tp->t_flags &= ~TF_REQ_SCALE; 2183 } 2184#endif 2185 2186 if (rt != NULL && rt->rt_ifp != NULL) { 2187 somultipages(inp->inp_socket, 2188 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); 2189 tcp_set_tso(tp, rt->rt_ifp); 2190 } 2191 2192 /* 2193 * Caller needs to call RT_UNLOCK(rt). 2194 */ 2195 return rt; 2196} 2197 2198#if INET6 2199struct rtentry * 2200tcp_rtlookup6(inp, input_ifscope) 2201 struct inpcb *inp; 2202 unsigned int input_ifscope; 2203{ 2204 struct route_in6 *ro6; 2205 struct rtentry *rt; 2206 struct tcpcb *tp; 2207 2208 lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); 2209 2210 ro6 = &inp->in6p_route; 2211 if ((rt = ro6->ro_rt) != NULL) 2212 RT_LOCK(rt); 2213 2214 if (rt == NULL || !(rt->rt_flags & RTF_UP) || 2215 rt->generation_id != route_generation) { 2216 /* No route yet, so try to acquire one */ 2217 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { 2218 struct sockaddr_in6 *dst6; 2219 unsigned int ifscope; 2220 2221 dst6 = (struct sockaddr_in6 *)&ro6->ro_dst; 2222 dst6->sin6_family = AF_INET6; 2223 dst6->sin6_len = sizeof(*dst6); 2224 dst6->sin6_addr = inp->in6p_faddr; 2225 2226 /* 2227 * If the socket was bound to an interface, then 2228 * the bound-to-interface takes precedence over 2229 * the inbound interface passed in by the caller 2230 * (if we get here as part of the output path then 
2231 * input_ifscope is IFSCOPE_NONE). 2232 */ 2233 ifscope = (inp->inp_flags & INP_BOUND_IF) ? 2234 inp->inp_boundifp->if_index : input_ifscope; 2235 2236 if (rt != NULL) 2237 RT_UNLOCK(rt); 2238 rtalloc_scoped((struct route *)ro6, ifscope); 2239 if ((rt = ro6->ro_rt) != NULL) 2240 RT_LOCK(rt); 2241 } 2242 } 2243 /* 2244 * Update path MTU Discovery determination 2245 * while looking up the route: 2246 * 1) we have a valid route to the destination 2247 * 2) the MTU is not locked (if it is, then discovery has been 2248 * disabled) 2249 */ 2250 2251 2252 tp = intotcpcb(inp); 2253 2254 /* 2255 * Update MTU discovery determination. Don't do it if: 2256 * 1) it is disabled via the sysctl 2257 * 2) the route isn't up 2258 * 3) the MTU is locked (if it is, then discovery has been 2259 * disabled) 2260 */ 2261 2262 if (!path_mtu_discovery || ((rt != NULL) && 2263 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) 2264 tp->t_flags &= ~TF_PMTUD; 2265 else 2266 tp->t_flags |= TF_PMTUD; 2267 2268#if CONFIG_IFEF_NOWINDOWSCALE 2269 if (tcp_obey_ifef_nowindowscale && 2270 tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && 2271 (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) { 2272 /* Window scaling is not enabled on this interface */ 2273 tp->t_flags &= ~TF_REQ_SCALE; 2274 } 2275#endif 2276 2277 if (rt != NULL && rt->rt_ifp != NULL) { 2278 somultipages(inp->inp_socket, 2279 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); 2280 tcp_set_tso(tp, rt->rt_ifp); 2281 } 2282 2283 /* 2284 * Caller needs to call RT_UNLOCK(rt). 2285 */ 2286 return rt; 2287} 2288#endif /* INET6 */ 2289 2290#if IPSEC 2291/* compute ESP/AH header size for TCP, including outer IP header. 
*/ 2292size_t 2293ipsec_hdrsiz_tcp(tp) 2294 struct tcpcb *tp; 2295{ 2296 struct inpcb *inp; 2297 struct mbuf *m; 2298 size_t hdrsiz; 2299 struct ip *ip; 2300#if INET6 2301 struct ip6_hdr *ip6 = NULL; 2302#endif /* INET6 */ 2303 struct tcphdr *th; 2304 2305 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) 2306 return 0; 2307 MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */ 2308 if (!m) 2309 return 0; 2310 2311#if INET6 2312 if ((inp->inp_vflag & INP_IPV6) != 0) { 2313 ip6 = mtod(m, struct ip6_hdr *); 2314 th = (struct tcphdr *)(void *)(ip6 + 1); 2315 m->m_pkthdr.len = m->m_len = 2316 sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 2317 tcp_fillheaders(tp, ip6, th); 2318 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 2319 } else 2320#endif /* INET6 */ 2321 { 2322 ip = mtod(m, struct ip *); 2323 th = (struct tcphdr *)(ip + 1); 2324 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 2325 tcp_fillheaders(tp, ip, th); 2326 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 2327 } 2328 m_free(m); 2329 return hdrsiz; 2330} 2331#endif /*IPSEC*/ 2332 2333/* 2334 * Return a pointer to the cached information about the remote host. 2335 * The cached information is stored in the protocol specific part of 2336 * the route metrics. 2337 */ 2338struct rmxp_tao * 2339tcp_gettaocache(inp) 2340 struct inpcb *inp; 2341{ 2342 struct rtentry *rt; 2343 struct rmxp_tao *taop; 2344 2345#if INET6 2346 if ((inp->inp_vflag & INP_IPV6) != 0) 2347 rt = tcp_rtlookup6(inp, IFSCOPE_NONE); 2348 else 2349#endif /* INET6 */ 2350 rt = tcp_rtlookup(inp, IFSCOPE_NONE); 2351 2352 /* Make sure this is a host route and is up. 
*/ 2353 if (rt == NULL || 2354 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) { 2355 /* Route locked during lookup above */ 2356 if (rt != NULL) 2357 RT_UNLOCK(rt); 2358 return NULL; 2359 } 2360 2361 taop = rmx_taop(rt->rt_rmx); 2362 /* Route locked during lookup above */ 2363 RT_UNLOCK(rt); 2364 return (taop); 2365} 2366 2367/* 2368 * Clear all the TAO cache entries, called from tcp_init. 2369 * 2370 * XXX 2371 * This routine is just an empty one, because we assume that the routing 2372 * routing tables are initialized at the same time when TCP, so there is 2373 * nothing in the cache left over. 2374 */ 2375static void 2376tcp_cleartaocache() 2377{ 2378} 2379 2380int 2381tcp_lock(struct socket *so, int refcount, void *lr) 2382{ 2383 void *lr_saved; 2384 2385 if (lr == NULL) 2386 lr_saved = __builtin_return_address(0); 2387 else 2388 lr_saved = lr; 2389 2390 if (so->so_pcb != NULL) { 2391 lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); 2392 } else { 2393 panic("tcp_lock: so=%p NO PCB! 
lr=%p lrh= %s\n", 2394 so, lr_saved, solockhistory_nr(so)); 2395 /* NOTREACHED */ 2396 } 2397 2398 if (so->so_usecount < 0) { 2399 panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", 2400 so, so->so_pcb, lr_saved, so->so_usecount, solockhistory_nr(so)); 2401 /* NOTREACHED */ 2402 } 2403 if (refcount) 2404 so->so_usecount++; 2405 so->lock_lr[so->next_lock_lr] = lr_saved; 2406 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX; 2407 return (0); 2408} 2409 2410int 2411tcp_unlock(struct socket *so, int refcount, void *lr) 2412{ 2413 void *lr_saved; 2414 2415 if (lr == NULL) 2416 lr_saved = __builtin_return_address(0); 2417 else 2418 lr_saved = lr; 2419 2420#ifdef MORE_TCPLOCK_DEBUG 2421 printf("tcp_unlock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n", 2422 so, so->so_pcb, &((struct inpcb *)so->so_pcb)->inpcb_mtx, 2423 so->so_usecount, lr_saved); 2424#endif 2425 if (refcount) 2426 so->so_usecount--; 2427 2428 if (so->so_usecount < 0) { 2429 panic("tcp_unlock: so=%p usecount=%x lrh= %s\n", 2430 so, so->so_usecount, solockhistory_nr(so)); 2431 /* NOTREACHED */ 2432 } 2433 if (so->so_pcb == NULL) { 2434 panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", 2435 so, so->so_usecount, lr_saved, solockhistory_nr(so)); 2436 /* NOTREACHED */ 2437 } else { 2438 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, 2439 LCK_MTX_ASSERT_OWNED); 2440 so->unlock_lr[so->next_unlock_lr] = lr_saved; 2441 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; 2442 lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); 2443 } 2444 return (0); 2445} 2446 2447lck_mtx_t * 2448tcp_getlock( 2449 struct socket *so, 2450 __unused int locktype) 2451{ 2452 struct inpcb *inp = sotoinpcb(so); 2453 2454 if (so->so_pcb) { 2455 if (so->so_usecount < 0) 2456 panic("tcp_getlock: so=%p usecount=%x lrh= %s\n", 2457 so, so->so_usecount, solockhistory_nr(so)); 2458 return(&inp->inpcb_mtx); 2459 } 2460 else { 2461 panic("tcp_getlock: so=%p NULL so_pcb %s\n", 2462 so, 
solockhistory_nr(so)); 2463 return (so->so_proto->pr_domain->dom_mtx); 2464 } 2465} 2466 2467/* Determine if we can grow the recieve socket buffer to avoid sending 2468 * a zero window update to the peer. We allow even socket buffers that 2469 * have fixed size (set by the application) to grow if the resource 2470 * constraints are met. They will also be trimmed after the application 2471 * reads data. 2472 */ 2473static void 2474tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb) { 2475 u_int32_t rcvbufinc = tp->t_maxseg << tcp_autorcvbuf_inc_shift; 2476 if (tcp_do_autorcvbuf == 1 && 2477 tcp_cansbgrow(sb) && 2478 (tp->t_flags & TF_SLOWLINK) == 0 && 2479 (sb->sb_hiwat - sb->sb_cc) < rcvbufinc && 2480 (sb->sb_hiwat < tcp_autorcvbuf_max)) { 2481 sbreserve(sb, (sb->sb_hiwat + rcvbufinc)); 2482 } 2483} 2484 2485int32_t 2486tcp_sbspace(struct tcpcb *tp) 2487{ 2488 struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv; 2489 int32_t space; 2490 2491 tcp_sbrcv_grow_rwin(tp, sb); 2492 2493 space = ((int32_t) imin((sb->sb_hiwat - sb->sb_cc), 2494 (sb->sb_mbmax - sb->sb_mbcnt))); 2495 if (space < 0) 2496 space = 0; 2497 2498 /* Avoid increasing window size if the current window 2499 * is already very low, we could be in "persist" mode and 2500 * we could break some apps (see rdar://5409343) 2501 */ 2502 2503 if (space < tp->t_maxseg) 2504 return space; 2505 2506 /* Clip window size for slower link */ 2507 2508 if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 ) 2509 return imin(space, slowlink_wsize); 2510 2511 return space; 2512} 2513/* 2514 * Checks TCP Segment Offloading capability for a given connection and interface pair. 
2515 */ 2516void 2517tcp_set_tso(tp, ifp) 2518 struct tcpcb *tp; 2519 struct ifnet *ifp; 2520{ 2521#if INET6 2522 struct inpcb *inp = tp->t_inpcb; 2523 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 2524 2525 if (isipv6) { 2526 if (ifp && ifp->if_hwassist & IFNET_TSO_IPV6) { 2527 tp->t_flags |= TF_TSO; 2528 if (ifp->if_tso_v6_mtu != 0) 2529 tp->tso_max_segment_size = ifp->if_tso_v6_mtu; 2530 else 2531 tp->tso_max_segment_size = TCP_MAXWIN; 2532 } else 2533 tp->t_flags &= ~TF_TSO; 2534 2535 } else 2536#endif /* INET6 */ 2537 2538 { 2539 if (ifp && ifp->if_hwassist & IFNET_TSO_IPV4) { 2540 tp->t_flags |= TF_TSO; 2541 if (ifp->if_tso_v4_mtu != 0) 2542 tp->tso_max_segment_size = ifp->if_tso_v4_mtu; 2543 else 2544 tp->tso_max_segment_size = TCP_MAXWIN; 2545 } else 2546 tp->t_flags &= ~TF_TSO; 2547 } 2548} 2549 2550#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC) 2551 2552/* Function to calculate the tcp clock. The tcp clock will get updated 2553 * at the boundaries of the tcp layer. This is done at 3 places: 2554 * 1. Right before processing an input tcp packet 2555 * 2. Whenever a connection wants to access the network using tcp_usrreqs 2556 * 3. 
When a tcp timer fires or before tcp slow timeout 2557 * 2558 */ 2559 2560void 2561calculate_tcp_clock() 2562{ 2563 struct timeval tv = tcp_uptime; 2564 struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC}; 2565 struct timeval now, hold_now; 2566 uint32_t incr = 0; 2567 2568 timevaladd(&tv, &interval); 2569 microuptime(&now); 2570 if (timevalcmp(&now, &tv, >)) { 2571 /* time to update the clock */ 2572 lck_spin_lock(tcp_uptime_lock); 2573 if (timevalcmp(&tcp_uptime, &now, >=)) { 2574 /* clock got updated while we were waiting for the lock */ 2575 lck_spin_unlock(tcp_uptime_lock); 2576 return; 2577 } 2578 2579 microuptime(&now); 2580 hold_now = now; 2581 tv = tcp_uptime; 2582 timevalsub(&now, &tv); 2583 2584 incr = TIMEVAL_TO_TCPHZ(now); 2585 if (incr > 0) { 2586 tcp_uptime = hold_now; 2587 tcp_now += incr; 2588 } 2589 2590 lck_spin_unlock(tcp_uptime_lock); 2591 } 2592 return; 2593} 2594 2595/* Compute receive window scaling that we are going to request 2596 * for this connection based on sb_hiwat. Try to leave some 2597 * room to potentially increase the window size upto a maximum 2598 * defined by the constant tcp_autorcvbuf_max. 2599 */ 2600void 2601tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) { 2602 u_int32_t maxsockbufsize; 2603 2604 tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); 2605 maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ? 
2606 so->so_rcv.sb_hiwat : tcp_autorcvbuf_max; 2607 2608 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 2609 (TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize) 2610 tp->request_r_scale++; 2611 tp->request_r_scale = min(tp->request_r_scale, TCP_MAX_WINSHIFT); 2612 2613} 2614 2615int 2616tcp_notsent_lowat_check(struct socket *so) { 2617 struct inpcb *inp = sotoinpcb(so); 2618 struct tcpcb *tp = NULL; 2619 int notsent = 0; 2620 if (inp != NULL) { 2621 tp = intotcpcb(inp); 2622 } 2623 2624 notsent = so->so_snd.sb_cc - 2625 (tp->snd_nxt - tp->snd_una); 2626 2627 /* When we send a FIN or SYN, not_sent can be negative. 2628 * In that case also we need to send a write event to the 2629 * process if it is waiting. In the FIN case, it will 2630 * get an error from send because cantsendmore will be set. 2631 */ 2632 if (notsent <= tp->t_notsent_lowat) { 2633 return(1); 2634 } 2635 2636 /* When Nagle's algorithm is not disabled, it is better 2637 * to wakeup the client until there is atleast one 2638 * maxseg of data to write. 2639 */ 2640 if ((tp->t_flags & TF_NODELAY) == 0 && 2641 notsent > 0 && notsent < tp->t_maxseg) { 2642 return(1); 2643 } 2644 return(0); 2645} 2646 2647 2648/* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */ 2649