1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $ 62 */ 63/* 64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 65 * support for mandatory and extensible security protections. This notice 66 * is included in support of clause 2.2 (b) of the Apple Public License, 67 * Version 2.0. 68 */ 69 70#include <sys/param.h> 71#include <sys/systm.h> 72#include <sys/callout.h> 73#include <sys/kernel.h> 74#include <sys/sysctl.h> 75#include <sys/malloc.h> 76#include <sys/mbuf.h> 77#include <sys/domain.h> 78#include <sys/proc.h> 79#include <sys/kauth.h> 80#include <sys/socket.h> 81#include <sys/socketvar.h> 82#include <sys/protosw.h> 83#include <sys/random.h> 84#include <sys/syslog.h> 85#include <sys/mcache.h> 86#include <kern/locks.h> 87#include <kern/zalloc.h> 88 89#include <dev/random/randomdev.h> 90 91#include <net/route.h> 92#include <net/if.h> 93 94#define tcp_minmssoverload fring 95#define _IP_VHL 96#include <netinet/in.h> 97#include <netinet/in_systm.h> 98#include <netinet/ip.h> 99#include <netinet/ip_icmp.h> 100#if INET6 101#include <netinet/ip6.h> 102#endif 103#include <netinet/in_pcb.h> 104#if INET6 105#include <netinet6/in6_pcb.h> 106#endif 107#include <netinet/in_var.h> 108#include <netinet/ip_var.h> 109#include <netinet/icmp_var.h> 110#if INET6 111#include <netinet6/ip6_var.h> 112#endif 113#include <netinet/tcp.h> 114#include <netinet/tcp_fsm.h> 115#include <netinet/tcp_seq.h> 116#include <netinet/tcp_timer.h> 117#include <netinet/tcp_var.h> 118#include <netinet/tcp_cc.h> 119#include <kern/thread_call.h> 120 121#if INET6 122#include <netinet6/tcp6_var.h> 123#endif 124#include <netinet/tcpip.h> 125#if TCPDEBUG 126#include <netinet/tcp_debug.h> 127#endif 128#include <netinet6/ip6protosw.h> 129 130#if IPSEC 131#include <netinet6/ipsec.h> 132#if INET6 133#include <netinet6/ipsec6.h> 134#endif 135#endif /*IPSEC*/ 136 137#undef tcp_minmssoverload 138 139#if CONFIG_MACF_NET 140#include <security/mac_framework.h> 141#endif /* MAC_NET */ 142 143#include <libkern/crypto/md5.h> 144#include <sys/kdebug.h> 145#include <mach/sdt.h> 146 147#include <netinet/lro_ext.h> 148 149#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) 150 151extern int tcp_lq_overflow; 152 153/* temporary: for testing */ 154#if IPSEC 155extern int ipsec_bypass; 156#endif 157extern struct tcptimerlist tcp_timer_list; 158extern struct tcptailq tcp_tw_tailq; 159 160int tcp_mssdflt = TCP_MSS; 161SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED, 162 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 163 164#if INET6 165int tcp_v6mssdflt = TCP6_MSS; 166SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, 167 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt , 0, 168 "Default TCP Maximum Segment Size for IPv6"); 169#endif 170 171extern int tcp_do_autorcvbuf; 172 173/* 174 * Minimum MSS we accept and use. This prevents DoS attacks where 175 * we are forced to a ridiculous low MSS like 20 and send hundreds 176 * of packets instead of one. The effect scales with the available 177 * bandwidth and quickly saturates the CPU and network interface 178 * with packet generation and sending. Set to zero to disable MINMSS 179 * checking. This setting prevents us from sending too small packets. 180 */ 181int tcp_minmss = TCP_MINMSS; 182SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED, 183 &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); 184 185static int tcp_do_rfc1323 = 1; 186SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW | CTLFLAG_LOCKED, 187 &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); 188 189// Not used 190static int tcp_do_rfc1644 = 0; 191SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW | CTLFLAG_LOCKED, 192 &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); 193 194static int do_tcpdrain = 0; 195SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, &do_tcpdrain, 0, 196 "Enable tcp_drain routine for extra help when low on mbufs"); 197 198SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, 199 &tcbinfo.ipi_count, 0, "Number of active PCBs"); 200 201SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, 202 CTLFLAG_RD | CTLFLAG_LOCKED, 203 &tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state"); 204 205static int icmp_may_rst = 1; 206SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp_may_rst, 0, 207 "Certain ICMP unreachable messages may abort connections in SYN_SENT"); 208 209static int tcp_strict_rfc1948 = 0; 210SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED, 211 &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly"); 212 213static int tcp_isn_reseed_interval = 0; 214SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW | CTLFLAG_LOCKED, 215 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); 216static int tcp_background_io_enabled = 1; 217SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, 218 &tcp_background_io_enabled, 0, "Background IO Enabled"); 219 220int tcp_TCPTV_MIN = 100; /* 100ms minimum RTT */ 221SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED, 222 &tcp_TCPTV_MIN, 0, "min rtt value allowed"); 223 224int tcp_rexmt_slop = TCPTV_REXMTSLOP; 225SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW, 226 &tcp_rexmt_slop, 0, "Slop added to retransmit timeout"); 227 228__private_extern__ int tcp_use_randomport = 0; 229SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, 230 &tcp_use_randomport, 0, "Randomize TCP port numbers"); 231 232extern struct tcp_cc_algo tcp_cc_newreno; 233SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, 234 &tcp_cc_newreno.num_sockets, 0, "Number of sockets using newreno"); 235 236extern struct tcp_cc_algo tcp_cc_ledbat; 237SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, 238 &tcp_cc_ledbat.num_sockets, 0, "Number of sockets using background transport"); 239 240__private_extern__ int tcp_win_scale = 3; 241SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED, 242 &tcp_win_scale, 0, "Window scaling factor"); 243 244static void tcp_cleartaocache(void); 245static void tcp_notify(struct inpcb *, int); 246static void tcp_cc_init(void); 247 248struct zone *sack_hole_zone; 249struct zone *tcp_reass_zone; 250struct zone *tcp_bwmeas_zone; 251#if 0 252static unsigned int tcp_mptcp_dsnm_sz; 253struct zone *tcp_mptcp_dsnm_zone; 254#endif 255/* The array containing pointers to currently implemented TCP CC algorithms */ 256struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; 257 258extern int slowlink_wsize; /* window correction for slow links */ 259extern int path_mtu_discovery; 260 261extern u_int32_t tcp_autorcvbuf_max; 262extern u_int32_t tcp_autorcvbuf_inc_shift; 263static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb); 264 265#define TCP_BWMEAS_BURST_MINSIZE 6 266#define TCP_BWMEAS_BURST_MAXSIZE 25 267 268static uint32_t bwmeas_elm_size; 269 270/* 271 * Target size of TCP PCB hash tables. Must be a power of two. 272 * 273 * Note that this can be overridden by the kernel environment 274 * variable net.inet.tcp.tcbhashsize 275 */ 276#ifndef TCBHASHSIZE 277#define TCBHASHSIZE CONFIG_TCBHASHSIZE 278#endif 279 280__private_extern__ int tcp_tcbhashsize = TCBHASHSIZE; 281SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED, 282 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 283 284/* 285 * This is the actual shape of what we allocate using the zone 286 * allocator. Doing it this way allows us to protect both structures 287 * using the same generation count, and also eliminates the overhead 288 * of allocating tcpcbs separately. By hiding the structure here, 289 * we avoid changing most of the rest of the code (although it needs 290 * to be changed, eventually, for greater efficiency). 291 */ 292#define ALIGNMENT 32 293struct inp_tp { 294 struct inpcb inp; 295 struct tcpcb tcb __attribute__((aligned(ALIGNMENT))); 296}; 297#undef ALIGNMENT 298 299int get_inpcb_str_size(void); 300int get_tcp_str_size(void); 301 302static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); 303 304static lck_attr_t *tcp_uptime_mtx_attr = NULL; /* mutex attributes */ 305static lck_grp_t *tcp_uptime_mtx_grp = NULL; /* mutex group definition */ 306static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL; /* mutex group attributes */ 307int tcp_notsent_lowat_check(struct socket *so); 308 309 310int get_inpcb_str_size(void) 311{ 312 return sizeof(struct inpcb); 313} 314 315 316int get_tcp_str_size(void) 317{ 318 return sizeof(struct tcpcb); 319} 320 321int tcp_freeq(struct tcpcb *tp); 322 323/* 324 * Initialize TCP congestion control algorithms. 325 */ 326 327void 328tcp_cc_init(void) 329{ 330 bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list)); 331 tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; 332 tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; 333} 334 335/* 336 * Tcp initialization 337 */ 338void 339tcp_init(struct protosw *pp, struct domain *dp) 340{ 341#pragma unused(dp) 342 static int tcp_initialized = 0; 343 vm_size_t str_size; 344 struct inpcbinfo *pcbinfo; 345 346 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED); 347 348 if (tcp_initialized) 349 return; 350 tcp_initialized = 1; 351 352 tcp_ccgen = 1; 353 tcp_cleartaocache(); 354 355 tcp_keepinit = TCPTV_KEEP_INIT; 356 tcp_keepidle = TCPTV_KEEP_IDLE; 357 tcp_keepintvl = TCPTV_KEEPINTVL; 358 tcp_keepcnt = TCPTV_KEEPCNT; 359 tcp_maxpersistidle = TCPTV_KEEP_IDLE; 360 tcp_msl = TCPTV_MSL; 361 362 microuptime(&tcp_uptime); 363 read_random(&tcp_now, sizeof(tcp_now)); 364 tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */ 365 366 LIST_INIT(&tcb); 367 tcbinfo.ipi_listhead = &tcb; 368 369 pcbinfo = &tcbinfo; 370 /* 371 * allocate lock group attribute and group for tcp pcb mutexes 372 */ 373 pcbinfo->ipi_lock_grp_attr = lck_grp_attr_alloc_init(); 374 pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb", pcbinfo->ipi_lock_grp_attr); 375 376 /* 377 * allocate the lock attribute for tcp pcb mutexes 378 */ 379 pcbinfo->ipi_lock_attr = lck_attr_alloc_init(); 380 381 if ((pcbinfo->ipi_lock = lck_rw_alloc_init(pcbinfo->ipi_lock_grp, 382 pcbinfo->ipi_lock_attr)) == NULL) { 383 panic("%s: unable to allocate PCB lock\n", __func__); 384 /* NOTREACHED */ 385 } 386 387 if (!powerof2(tcp_tcbhashsize)) { 388 printf("WARNING: TCB hash size not a power of 2\n"); 389 tcp_tcbhashsize = 512; /* safe default */ 390 } 391 tcbinfo.ipi_hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.ipi_hashmask); 392 tcbinfo.ipi_porthashbase = hashinit(tcp_tcbhashsize, M_PCB, 393 &tcbinfo.ipi_porthashmask); 394 str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t)); 395 tcbinfo.ipi_zone = zinit(str_size, 120000*str_size, 8192, "tcpcb"); 396 zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE); 397 zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE); 398 399 tcbinfo.ipi_gc = tcp_gc; 400 in_pcbinfo_attach(&tcbinfo); 401 402 str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t)); 403 sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone"); 404 zone_change(sack_hole_zone, Z_CALLERACCT, FALSE); 405 zone_change(sack_hole_zone, Z_EXPAND, TRUE); 406 407 tcp_reass_maxseg = nmbclusters / 16; 408 str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t)); 409 tcp_reass_zone = zinit(str_size, (tcp_reass_maxseg + 1) * str_size, 410 0, "tcp_reass_zone"); 411 if (tcp_reass_zone == NULL) { 412 panic("%s: failed allocating tcp_reass_zone", __func__); 413 /* NOTREACHED */ 414 } 415 zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE); 416 zone_change(tcp_reass_zone, Z_EXPAND, TRUE); 417 418 bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t)); 419 tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0, "tcp_bwmeas_zone"); 420 if (tcp_bwmeas_zone == NULL) { 421 panic("%s: failed allocating tcp_bwmeas_zone", __func__); 422 /* NOTREACHED */ 423 } 424 zone_change(tcp_bwmeas_zone, Z_CALLERACCT, FALSE); 425 zone_change(tcp_bwmeas_zone, Z_EXPAND, TRUE); 426 427#if INET6 428#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) 429#else /* INET6 */ 430#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) 431#endif /* INET6 */ 432 if (max_protohdr < TCP_MINPROTOHDR) { 433 _max_protohdr = TCP_MINPROTOHDR; 434 _max_protohdr = max_protohdr; /* round it up */ 435 } 436 if (max_linkhdr + max_protohdr > MCLBYTES) 437 panic("tcp_init"); 438#undef TCP_MINPROTOHDR 439 440 /* Initialize time wait and timer lists */ 441 TAILQ_INIT(&tcp_tw_tailq); 442 443 bzero(&tcp_timer_list, sizeof(tcp_timer_list)); 444 LIST_INIT(&tcp_timer_list.lhead); 445 /* 446 * allocate lock group attribute, group and attribute for the tcp timer list 447 */ 448 tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init(); 449 tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr); 450 tcp_timer_list.mtx_attr = lck_attr_alloc_init(); 451 if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) { 452 panic("failed to allocate memory for tcp_timer_list.mtx\n"); 453 }; 454 tcp_timer_list.fast_quantum = TCP_FASTTIMER_QUANTUM; 455 tcp_timer_list.slow_quantum = TCP_SLOWTIMER_QUANTUM; 456 if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) { 457 panic("failed to allocate call entry 1 in tcp_init\n"); 458 } 459 460 /* 461 * allocate lock group attribute, group and attribute for tcp_uptime_lock 462 */ 463 tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init(); 464 tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr); 465 tcp_uptime_mtx_attr = lck_attr_alloc_init(); 466 tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr); 467 468 /* Initialize TCP congestion control algorithms list */ 469 tcp_cc_init(); 470 471 /* Initialize TCP LRO data structures */ 472 tcp_lro_init(); 473} 474 475/* 476 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. 477 * tcp_template used to store this data in mbufs, but we now recopy it out 478 * of the tcpcb each time to conserve mbufs. 479 */ 480void 481tcp_fillheaders(tp, ip_ptr, tcp_ptr) 482 struct tcpcb *tp; 483 void *ip_ptr; 484 void *tcp_ptr; 485{ 486 struct inpcb *inp = tp->t_inpcb; 487 struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; 488 489#if INET6 490 if ((inp->inp_vflag & INP_IPV6) != 0) { 491 struct ip6_hdr *ip6; 492 493 ip6 = (struct ip6_hdr *)ip_ptr; 494 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 495 (inp->inp_flow & IPV6_FLOWINFO_MASK); 496 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | 497 (IPV6_VERSION & IPV6_VERSION_MASK); 498 ip6->ip6_nxt = IPPROTO_TCP; 499 ip6->ip6_plen = sizeof(struct tcphdr); 500 ip6->ip6_src = inp->in6p_laddr; 501 ip6->ip6_dst = inp->in6p_faddr; 502 tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr, 503 htonl(sizeof (struct tcphdr) + IPPROTO_TCP)); 504 } else 505#endif 506 { 507 struct ip *ip = (struct ip *) ip_ptr; 508 509 ip->ip_vhl = IP_VHL_BORING; 510 ip->ip_tos = 0; 511 ip->ip_len = 0; 512 ip->ip_id = 0; 513 ip->ip_off = 0; 514 ip->ip_ttl = 0; 515 ip->ip_sum = 0; 516 ip->ip_p = IPPROTO_TCP; 517 ip->ip_src = inp->inp_laddr; 518 ip->ip_dst = inp->inp_faddr; 519 tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 520 htons(sizeof(struct tcphdr) + IPPROTO_TCP)); 521 } 522 523 tcp_hdr->th_sport = inp->inp_lport; 524 tcp_hdr->th_dport = inp->inp_fport; 525 tcp_hdr->th_seq = 0; 526 tcp_hdr->th_ack = 0; 527 tcp_hdr->th_x2 = 0; 528 tcp_hdr->th_off = 5; 529 tcp_hdr->th_flags = 0; 530 tcp_hdr->th_win = 0; 531 tcp_hdr->th_urp = 0; 532} 533 534/* 535 * Create template to be used to send tcp packets on a connection. 536 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only 537 * use for this function is in keepalives, which use tcp_respond. 538 */ 539struct tcptemp * 540tcp_maketemplate(tp) 541 struct tcpcb *tp; 542{ 543 struct mbuf *m; 544 struct tcptemp *n; 545 546 m = m_get(M_DONTWAIT, MT_HEADER); 547 if (m == NULL) 548 return (0); 549 m->m_len = sizeof(struct tcptemp); 550 n = mtod(m, struct tcptemp *); 551 552 tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); 553 return (n); 554} 555 556/* 557 * Send a single message to the TCP at address specified by 558 * the given TCP/IP header. If m == 0, then we make a copy 559 * of the tcpiphdr at ti and send directly to the addressed host. 560 * This is used to force keep alive messages out using the TCP 561 * template for a connection. If flags are given then we send 562 * a message back to the TCP which originated the * segment ti, 563 * and discard the mbuf containing it and any other attached mbufs. 564 * 565 * In any case the ack and sequence number of the transmitted 566 * segment are as specified by the parameters. 567 * 568 * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 569 */ 570void 571tcp_respond( 572 struct tcpcb *tp, 573 void *ipgen, 574 register struct tcphdr *th, 575 register struct mbuf *m, 576 tcp_seq ack, 577 tcp_seq seq, 578 int flags, 579 unsigned int ifscope, 580 unsigned int nocell 581 ) 582{ 583 register int tlen; 584 int win = 0; 585 struct route *ro = 0; 586 struct route sro; 587 struct ip *ip; 588 struct tcphdr *nth; 589#if INET6 590 struct route_in6 *ro6 = 0; 591 struct route_in6 sro6; 592 struct ip6_hdr *ip6; 593 int isipv6; 594#endif /* INET6 */ 595 struct ifnet *outif; 596 597#if INET6 598 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; 599 ip6 = ipgen; 600#endif /* INET6 */ 601 ip = ipgen; 602 603 if (tp) { 604 if (!(flags & TH_RST)) { 605 win = tcp_sbspace(tp); 606 if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale) 607 win = (int32_t)TCP_MAXWIN << tp->rcv_scale; 608 } 609#if INET6 610 if (isipv6) 611 ro6 = &tp->t_inpcb->in6p_route; 612 else 613#endif /* INET6 */ 614 ro = &tp->t_inpcb->inp_route; 615 } else { 616#if INET6 617 if (isipv6) { 618 ro6 = &sro6; 619 bzero(ro6, sizeof *ro6); 620 } else 621#endif /* INET6 */ 622 { 623 ro = &sro; 624 bzero(ro, sizeof *ro); 625 } 626 } 627 if (m == 0) { 628 m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */ 629 if (m == NULL) 630 return; 631 tlen = 0; 632 m->m_data += max_linkhdr; 633#if INET6 634 if (isipv6) { 635 VERIFY((MHLEN - max_linkhdr) >= 636 (sizeof (*ip6) + sizeof (*nth))); 637 bcopy((caddr_t)ip6, mtod(m, caddr_t), 638 sizeof(struct ip6_hdr)); 639 ip6 = mtod(m, struct ip6_hdr *); 640 nth = (struct tcphdr *)(void *)(ip6 + 1); 641 } else 642#endif /* INET6 */ 643 { 644 VERIFY((MHLEN - max_linkhdr) >= 645 (sizeof (*ip) + sizeof (*nth))); 646 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); 647 ip = mtod(m, struct ip *); 648 nth = (struct tcphdr *)(void *)(ip + 1); 649 } 650 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 651#if MPTCP 652 if ((tp) && (tp->t_mpflags & TMPF_RESET)) 653 flags = (TH_RST | TH_ACK); 654 else 655#endif 656 flags = TH_ACK; 657 } else { 658 m_freem(m->m_next); 659 m->m_next = 0; 660 m->m_data = (caddr_t)ipgen; 661 /* m_len is set later */ 662 tlen = 0; 663#define xchg(a,b,type) { type t; t=a; a=b; b=t; } 664#if INET6 665 if (isipv6) { 666 /* Expect 32-bit aligned IP on strict-align platforms */ 667 IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6); 668 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 669 nth = (struct tcphdr *)(void *)(ip6 + 1); 670 } else 671#endif /* INET6 */ 672 { 673 /* Expect 32-bit aligned IP on strict-align platforms */ 674 IP_HDR_STRICT_ALIGNMENT_CHECK(ip); 675 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); 676 nth = (struct tcphdr *)(void *)(ip + 1); 677 } 678 if (th != nth) { 679 /* 680 * this is usually a case when an extension header 681 * exists between the IPv6 header and the 682 * TCP header. 683 */ 684 nth->th_sport = th->th_sport; 685 nth->th_dport = th->th_dport; 686 } 687 xchg(nth->th_dport, nth->th_sport, n_short); 688#undef xchg 689 } 690#if INET6 691 if (isipv6) { 692 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + 693 tlen)); 694 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 695 } else 696#endif 697 { 698 tlen += sizeof (struct tcpiphdr); 699 ip->ip_len = tlen; 700 ip->ip_ttl = ip_defttl; 701 } 702 m->m_len = tlen; 703 m->m_pkthdr.len = tlen; 704 m->m_pkthdr.rcvif = 0; 705#if CONFIG_MACF_NET 706 if (tp != NULL && tp->t_inpcb != NULL) { 707 /* 708 * Packet is associated with a socket, so allow the 709 * label of the response to reflect the socket label. 710 */ 711 mac_mbuf_label_associate_inpcb(tp->t_inpcb, m); 712 } else { 713 /* 714 * Packet is not associated with a socket, so possibly 715 * update the label in place. 716 */ 717 mac_netinet_tcp_reply(m); 718 } 719#endif 720 721 nth->th_seq = htonl(seq); 722 nth->th_ack = htonl(ack); 723 nth->th_x2 = 0; 724 nth->th_off = sizeof (struct tcphdr) >> 2; 725 nth->th_flags = flags; 726 if (tp) 727 nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 728 else 729 nth->th_win = htons((u_short)win); 730 nth->th_urp = 0; 731#if INET6 732 if (isipv6) { 733 nth->th_sum = 0; 734 nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst, 735 htonl((tlen - sizeof (struct ip6_hdr)) + IPPROTO_TCP)); 736 m->m_pkthdr.csum_flags = CSUM_TCPIPV6; 737 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 738 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, 739 ro6 && ro6->ro_rt ? 740 ro6->ro_rt->rt_ifp : 741 NULL); 742 } else 743#endif /* INET6 */ 744 { 745 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 746 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 747 m->m_pkthdr.csum_flags = CSUM_TCP; 748 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 749 } 750#if TCPDEBUG 751 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 752 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); 753#endif 754#if IPSEC 755 if (ipsec_bypass == 0 && ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { 756 m_freem(m); 757 return; 758 } 759#endif 760 761 if (tp != NULL) { 762 u_int32_t svc_flags = 0; 763 if (isipv6) { 764 svc_flags |= PKT_SCF_IPV6; 765 } 766 set_packet_service_class(m, tp->t_inpcb->inp_socket, 767 MBUF_SC_UNSPEC, svc_flags); 768 769 /* Embed flowhash and flow control flags */ 770 m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; 771 m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash; 772 m->m_pkthdr.pkt_flags |= PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC; 773#if MPTCP 774 /* Disable flow advisory when using MPTCP. */ 775 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) 776#endif /* MPTCP */ 777 m->m_pkthdr.pkt_flags |= PKTF_FLOW_ADV; 778 m->m_pkthdr.pkt_proto = IPPROTO_TCP; 779 } 780 781#if INET6 782 if (isipv6) { 783 struct ip6_out_args ip6oa = { ifscope, { 0 }, 784 IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0 }; 785 786 if (ifscope != IFSCOPE_NONE) 787 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; 788 if (nocell) 789 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; 790 791 (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL, 792 NULL, &ip6oa); 793 794 if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL && 795 (outif = ro6->ro_rt->rt_ifp) != 796 tp->t_inpcb->in6p_last_outifp) 797 tp->t_inpcb->in6p_last_outifp = outif; 798 799 if (ro6 == &sro6) 800 ROUTE_RELEASE(ro6); 801 } else 802#endif /* INET6 */ 803 { 804 struct ip_out_args ipoa = { ifscope, { 0 }, 805 IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR, 0 }; 806 807 if (ifscope != IFSCOPE_NONE) 808 ipoa.ipoa_flags |= IPOAF_BOUND_IF; 809 if (nocell) 810 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; 811 812 if (ro != &sro) { 813 /* Copy the cached route and take an extra reference */ 814 inp_route_copyout(tp->t_inpcb, &sro); 815 } 816 /* 817 * For consistency, pass a local route copy. 818 */ 819 (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa); 820 821 if (tp != NULL && sro.ro_rt != NULL && 822 (outif = sro.ro_rt->rt_ifp) != 823 tp->t_inpcb->inp_last_outifp) 824 tp->t_inpcb->inp_last_outifp = outif; 825 826 if (ro != &sro) { 827 /* Synchronize cached PCB route */ 828 inp_route_copyin(tp->t_inpcb, &sro); 829 } else { 830 ROUTE_RELEASE(&sro); 831 } 832 } 833} 834 835/* 836 * Create a new TCP control block, making an 837 * empty reassembly queue and hooking it to the argument 838 * protocol control block. The `inp' parameter must have 839 * come from the zone allocator set up in tcp_init(). 840 */ 841struct tcpcb * 842tcp_newtcpcb(inp) 843 struct inpcb *inp; 844{ 845 struct inp_tp *it; 846 register struct tcpcb *tp; 847 register struct socket *so = inp->inp_socket; 848#if INET6 849 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 850#endif /* INET6 */ 851 852 calculate_tcp_clock(); 853 854 if (!so->cached_in_sock_layer) { 855 it = (struct inp_tp *)(void *)inp; 856 tp = &it->tcb; 857 } else { 858 tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb; 859 } 860 861 bzero((char *) tp, sizeof(struct tcpcb)); 862 LIST_INIT(&tp->t_segq); 863 tp->t_maxseg = tp->t_maxopd = 864#if INET6 865 isipv6 ? tcp_v6mssdflt : 866#endif /* INET6 */ 867 tcp_mssdflt; 868 869 if (tcp_do_rfc1323) 870 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 871 if (tcp_do_sack) 872 tp->t_flagsext |= TF_SACK_ENABLE; 873 874 TAILQ_INIT(&tp->snd_holes); 875 tp->t_inpcb = inp; /* XXX */ 876 /* 877 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 878 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 879 * reasonable initial retransmit time. 880 */ 881 tp->t_srtt = TCPTV_SRTTBASE; 882 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 883 tp->t_rttmin = tcp_TCPTV_MIN; 884 tp->t_rxtcur = TCPTV_RTOBASE; 885 886 /* Initialize congestion control algorithm for this connection 887 * to newreno by default 888 */ 889 tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX; 890 if (CC_ALGO(tp)->init != NULL) { 891 CC_ALGO(tp)->init(tp); 892 } 893 894 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 895 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 896 tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; 897 tp->t_rcvtime = tcp_now; 898 tp->tentry.timer_start = tcp_now; 899 tp->t_persist_timeout = tcp_max_persist_timeout; 900 tp->t_persist_stop = 0; 901 tp->t_flagsext |= TF_RCVUNACK_WAITSS; 902 tp->t_rexmtthresh = tcprexmtthresh; 903 904 /* Clear time wait tailq entry */ 905 tp->t_twentry.tqe_next = NULL; 906 tp->t_twentry.tqe_prev = NULL; 907 908 /* 909 * IPv4 TTL initialization is necessary for an IPv6 socket as well, 910 * because the socket may be bound to an IPv6 wildcard address, 911 * which may match an IPv4-mapped IPv6 address. 912 */ 913 inp->inp_ip_ttl = ip_defttl; 914 inp->inp_ppcb = (caddr_t)tp; 915 return (tp); /* XXX */ 916} 917 918/* 919 * Drop a TCP connection, reporting 920 * the specified error. If connection is synchronized, 921 * then send a RST to peer. 922 */ 923struct tcpcb * 924tcp_drop(tp, errno) 925 register struct tcpcb *tp; 926 int errno; 927{ 928 struct socket *so = tp->t_inpcb->inp_socket; 929#if CONFIG_DTRACE 930 struct inpcb *inp = tp->t_inpcb; 931#endif 932 933 if (TCPS_HAVERCVDSYN(tp->t_state)) { 934 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 935 struct tcpcb *, tp, int32_t, TCPS_CLOSED); 936 tp->t_state = TCPS_CLOSED; 937 (void) tcp_output(tp); 938 tcpstat.tcps_drops++; 939 } else 940 tcpstat.tcps_conndrops++; 941 if (errno == ETIMEDOUT && tp->t_softerror) 942 errno = tp->t_softerror; 943 so->so_error = errno; 944 return (tcp_close(tp)); 945} 946 947void 948tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) 949{ 950 u_int32_t rtt = rt->rt_rmx.rmx_rtt; 951 int isnetlocal = (tp->t_flags & TF_LOCAL); 952 953 if (rtt != 0) { 954 /* 955 * XXX the lock bit for RTT indicates that the value 956 * is also a minimum value; this is subject to time. 957 */ 958 if (rt->rt_rmx.rmx_locks & RTV_RTT) 959 tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ); 960 else 961 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; 962 tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); 963 tcpstat.tcps_usedrtt++; 964 if (rt->rt_rmx.rmx_rttvar) { 965 tp->t_rttvar = rt->rt_rmx.rmx_rttvar / 966 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); 967 tcpstat.tcps_usedrttvar++; 968 } else { 969 /* default variation is +- 1 rtt */ 970 tp->t_rttvar = 971 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 972 } 973 TCPT_RANGESET(tp->t_rxtcur, 974 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 975 tp->t_rttmin, TCPTV_REXMTMAX, 976 TCP_ADD_REXMTSLOP(tp)); 977 } 978} 979 980/* 981 * Close a TCP control block: 982 * discard all space held by the tcp 983 * discard internet protocol block 984 * wake up any sleepers 985 */ 986struct tcpcb * 987tcp_close(tp) 988 register struct tcpcb *tp; 989{ 990 struct inpcb *inp = tp->t_inpcb; 991 struct socket *so = inp->inp_socket; 992#if INET6 993 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 994#endif /* INET6 */ 995 struct route *ro; 996 struct rtentry *rt; 997 int dosavessthresh; 998 999 /* tcp_close was called previously, bail */ 1000 if ( inp->inp_ppcb == NULL) 1001 return(NULL); 1002 1003 tcp_canceltimers(tp); 1004 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0); 1005 1006 /* 1007 * If another thread for this tcp is currently in ip (indicated by 1008 * the TF_SENDINPROG flag), defer the cleanup until after it returns 1009 * back to tcp. This is done to serialize the close until after all 1010 * pending output is finished, in order to avoid having the PCB be 1011 * detached and the cached route cleaned, only for ip to cache the 1012 * route back into the PCB again. Note that we've cleared all the 1013 * timers at this point. Set TF_CLOSING to indicate to tcp_output() 1014 * that is should call us again once it returns from ip; at that 1015 * point both flags should be cleared and we can proceed further 1016 * with the cleanup. 1017 */ 1018 if ((tp->t_flags & TF_CLOSING) || 1019 inp->inp_sndinprog_cnt > 0) { 1020 tp->t_flags |= TF_CLOSING; 1021 return (NULL); 1022 } 1023 1024 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 1025 struct tcpcb *, tp, int32_t, TCPS_CLOSED); 1026 1027 if (CC_ALGO(tp)->cleanup != NULL) { 1028 CC_ALGO(tp)->cleanup(tp); 1029 } 1030 1031#if INET6 1032 ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route); 1033#else 1034 ro = &inp->inp_route; 1035#endif 1036 rt = ro->ro_rt; 1037 if (rt != NULL) 1038 RT_LOCK_SPIN(rt); 1039 1040 /* 1041 * If we got enough samples through the srtt filter, 1042 * save the rtt and rttvar in the routing entry. 1043 * 'Enough' is arbitrarily defined as the 16 samples. 1044 * 16 samples is enough for the srtt filter to converge 1045 * to within 5% of the correct value; fewer samples and 1046 * we could save a very bogus rtt. 1047 * 1048 * Don't update the default route's characteristics and don't 1049 * update anything that the user "locked". 1050 */ 1051 if (tp->t_rttupdated >= 16) { 1052 register u_int32_t i = 0; 1053 1054#if INET6 1055 if (isipv6) { 1056 struct sockaddr_in6 *sin6; 1057 1058 if (rt == NULL) 1059 goto no_valid_rt; 1060 sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt); 1061 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 1062 goto no_valid_rt; 1063 } 1064 else 1065#endif /* INET6 */ 1066 if (ROUTE_UNUSABLE(ro) || 1067 SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) { 1068 if (tp->t_state >= TCPS_CLOSE_WAIT) { 1069 DTRACE_TCP4(state__change, 1070 void, NULL, struct inpcb *, inp, 1071 struct tcpcb *, tp, int32_t, 1072 TCPS_CLOSING); 1073 tp->t_state = TCPS_CLOSING; 1074 } 1075 goto no_valid_rt; 1076 } 1077 1078 RT_LOCK_ASSERT_HELD(rt); 1079 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { 1080 i = tp->t_srtt * 1081 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); 1082 if (rt->rt_rmx.rmx_rtt && i) 1083 /* 1084 * filter this update to half the old & half 1085 * the new values, converting scale. 1086 * See route.h and tcp_var.h for a 1087 * description of the scaling constants. 1088 */ 1089 rt->rt_rmx.rmx_rtt = 1090 (rt->rt_rmx.rmx_rtt + i) / 2; 1091 else 1092 rt->rt_rmx.rmx_rtt = i; 1093 tcpstat.tcps_cachedrtt++; 1094 } 1095 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { 1096 i = tp->t_rttvar * 1097 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); 1098 if (rt->rt_rmx.rmx_rttvar && i) 1099 rt->rt_rmx.rmx_rttvar = 1100 (rt->rt_rmx.rmx_rttvar + i) / 2; 1101 else 1102 rt->rt_rmx.rmx_rttvar = i; 1103 tcpstat.tcps_cachedrttvar++; 1104 } 1105 /* 1106 * The old comment here said: 1107 * update the pipelimit (ssthresh) if it has been updated 1108 * already or if a pipesize was specified & the threshhold 1109 * got below half the pipesize. I.e., wait for bad news 1110 * before we start updating, then update on both good 1111 * and bad news. 1112 * 1113 * But we want to save the ssthresh even if no pipesize is 1114 * specified explicitly in the route, because such 1115 * connections still have an implicit pipesize specified 1116 * by the global tcp_sendspace. In the absence of a reliable 1117 * way to calculate the pipesize, it will have to do. 1118 */ 1119 i = tp->snd_ssthresh; 1120 if (rt->rt_rmx.rmx_sendpipe != 0) 1121 dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); 1122 else 1123 dosavessthresh = (i < so->so_snd.sb_hiwat / 2); 1124 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && 1125 i != 0 && rt->rt_rmx.rmx_ssthresh != 0) 1126 || dosavessthresh) { 1127 /* 1128 * convert the limit from user data bytes to 1129 * packets then to packet data bytes. 1130 */ 1131 i = (i + tp->t_maxseg / 2) / tp->t_maxseg; 1132 if (i < 2) 1133 i = 2; 1134 i *= (u_int32_t)(tp->t_maxseg + 1135#if INET6 1136 (isipv6 ? sizeof (struct ip6_hdr) + 1137 sizeof (struct tcphdr) : 1138#endif 1139 sizeof (struct tcpiphdr) 1140#if INET6 1141 ) 1142#endif 1143 ); 1144 if (rt->rt_rmx.rmx_ssthresh) 1145 rt->rt_rmx.rmx_ssthresh = 1146 (rt->rt_rmx.rmx_ssthresh + i) / 2; 1147 else 1148 rt->rt_rmx.rmx_ssthresh = i; 1149 tcpstat.tcps_cachedssthresh++; 1150 } 1151 } 1152 1153 /* 1154 * Mark route for deletion if no information is cached. 1155 */ 1156 if (rt != NULL && (so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow) { 1157 if (!(rt->rt_rmx.rmx_locks & RTV_RTT) && 1158 rt->rt_rmx.rmx_rtt == 0) { 1159 rt->rt_flags |= RTF_DELCLONE; 1160 } 1161 } 1162 1163no_valid_rt: 1164 if (rt != NULL) 1165 RT_UNLOCK(rt); 1166 1167 /* free the reassembly queue, if any */ 1168 (void) tcp_freeq(tp); 1169 1170 tcp_free_sackholes(tp); 1171 if (tp->t_bwmeas != NULL) { 1172 tcp_bwmeas_free(tp); 1173 } 1174 1175 /* Free the packet list */ 1176 if (tp->t_pktlist_head != NULL) 1177 m_freem_list(tp->t_pktlist_head); 1178 TCP_PKTLIST_CLEAR(tp); 1179 1180#if MPTCP 1181 /* Clear MPTCP state */ 1182 tp->t_mpflags = 0; 1183#endif /* MPTCP */ 1184 1185 if (so->cached_in_sock_layer) 1186 inp->inp_saved_ppcb = (caddr_t) tp; 1187 1188 /* Issue a wakeup before detach so that we don't miss 1189 * a wakeup 1190 */ 1191 sodisconnectwakeup(so); 1192 1193 /* 1194 * Clean up any LRO state 1195 */ 1196 if (tp->t_flagsext & TF_LRO_OFFLOADED) { 1197 tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr, 1198 inp->inp_lport, 1199 inp->inp_fport); 1200 tp->t_flagsext &= ~TF_LRO_OFFLOADED; 1201 } 1202 tp->t_state = TCPS_CLOSED; 1203#if INET6 1204 if (SOCK_CHECK_DOM(so, PF_INET6)) 1205 in6_pcbdetach(inp); 1206 else 1207#endif /* INET6 */ 1208 in_pcbdetach(inp); 1209 1210 /* Call soisdisconnected after detach because it might unlock the socket */ 1211 soisdisconnected(so); 1212 tcpstat.tcps_closed++; 1213 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0); 1214 return(NULL); 1215} 1216 1217int 1218tcp_freeq(tp) 1219 struct tcpcb *tp; 1220{ 1221 1222 register struct tseg_qent *q; 1223 int rv = 0; 1224 1225 while((q = LIST_FIRST(&tp->t_segq)) != NULL) { 1226 LIST_REMOVE(q, tqe_q); 1227 m_freem(q->tqe_m); 1228 zfree(tcp_reass_zone, q); 1229 tcp_reass_qsize--; 1230 rv = 1; 1231 } 1232 return (rv); 1233} 1234 1235void 1236tcp_drain() 1237{ 1238 if (do_tcpdrain) 1239 { 1240 struct inpcb *inp; 1241 struct tcpcb *tp; 1242 /* 1243 * Walk the tcpbs, if existing, and flush the reassembly queue, 1244 * if there is one... 1245 * Do it next time if the pcbinfo lock is in use 1246 */ 1247 if (!lck_rw_try_lock_exclusive(tcbinfo.ipi_lock)) 1248 return; 1249 1250 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { 1251 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != 1252 WNT_STOPUSING) { 1253 tcp_lock(inp->inp_socket, 1, 0); 1254 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) 1255 == WNT_STOPUSING) { 1256 /* lost a race, try the next one */ 1257 tcp_unlock(inp->inp_socket, 1, 0); 1258 continue; 1259 } 1260 tp = intotcpcb(inp); 1261 tcp_freeq(tp); 1262 tcp_unlock(inp->inp_socket, 1, 0); 1263 } 1264 } 1265 lck_rw_done(tcbinfo.ipi_lock); 1266 1267 } 1268} 1269 1270/* 1271 * Notify a tcp user of an asynchronous error; 1272 * store error as soft error, but wake up user 1273 * (for now, won't do anything until can select for soft error). 1274 * 1275 * Do not wake up user since there currently is no mechanism for 1276 * reporting soft errors (yet - a kqueue filter may be added). 1277 */ 1278static void 1279tcp_notify(inp, error) 1280 struct inpcb *inp; 1281 int error; 1282{ 1283 struct tcpcb *tp; 1284 1285 if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) 1286 return; /* pcb is gone already */ 1287 1288 tp = (struct tcpcb *)inp->inp_ppcb; 1289 1290 /* 1291 * Ignore some errors if we are hooked up. 1292 * If connection hasn't completed, has retransmitted several times, 1293 * and receives a second error, give up now. This is better 1294 * than waiting a long time to establish a connection that 1295 * can never complete. 1296 */ 1297 if (tp->t_state == TCPS_ESTABLISHED && 1298 (error == EHOSTUNREACH || error == ENETUNREACH || 1299 error == EHOSTDOWN)) { 1300 return; 1301 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 1302 tp->t_softerror) 1303 tcp_drop(tp, error); 1304 else 1305 tp->t_softerror = error; 1306#if 0 1307 wakeup((caddr_t) &so->so_timeo); 1308 sorwakeup(so); 1309 sowwakeup(so); 1310#endif 1311} 1312 1313struct bwmeas* 1314tcp_bwmeas_alloc(struct tcpcb *tp) 1315{ 1316 struct bwmeas *elm; 1317 elm = zalloc(tcp_bwmeas_zone); 1318 if (elm == NULL) 1319 return(elm); 1320 1321 bzero(elm, bwmeas_elm_size); 1322 elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE; 1323 elm->bw_maxsizepkts = TCP_BWMEAS_BURST_MAXSIZE; 1324 elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg; 1325 elm->bw_maxsize = elm->bw_maxsizepkts * tp->t_maxseg; 1326 return(elm); 1327} 1328 1329void 1330tcp_bwmeas_free(struct tcpcb* tp) 1331{ 1332 zfree(tcp_bwmeas_zone, tp->t_bwmeas); 1333 tp->t_bwmeas = NULL; 1334 tp->t_flagsext &= ~(TF_MEASURESNDBW); 1335} 1336 1337/* 1338 * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format. 1339 * The otcpcb data structure is passed to user space and must not change. 1340 */ 1341static void 1342tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp) 1343{ 1344 int i; 1345 1346 otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; 1347 otp->t_dupacks = tp->t_dupacks; 1348 for (i = 0; i < TCPT_NTIMERS_EXT; i++) 1349 otp->t_timer[i] = tp->t_timer[i]; 1350 otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))(uintptr_t)tp->t_inpcb; 1351 otp->t_state = tp->t_state; 1352 otp->t_flags = tp->t_flags; 1353 otp->t_force = tp->t_force; 1354 otp->snd_una = tp->snd_una; 1355 otp->snd_max = tp->snd_max; 1356 otp->snd_nxt = tp->snd_nxt; 1357 otp->snd_up = tp->snd_up; 1358 otp->snd_wl1 = tp->snd_wl1; 1359 otp->snd_wl2 = tp->snd_wl2; 1360 otp->iss = tp->iss; 1361 otp->irs = tp->irs; 1362 otp->rcv_nxt = tp->rcv_nxt; 1363 otp->rcv_adv = tp->rcv_adv; 1364 otp->rcv_wnd = tp->rcv_wnd; 1365 otp->rcv_up = tp->rcv_up; 1366 otp->snd_wnd = tp->snd_wnd; 1367 otp->snd_cwnd = tp->snd_cwnd; 1368 otp->snd_ssthresh = tp->snd_ssthresh; 1369 otp->t_maxopd = tp->t_maxopd; 1370 otp->t_rcvtime = tp->t_rcvtime; 1371 otp->t_starttime = tp->t_starttime; 1372 otp->t_rtttime = tp->t_rtttime; 1373 otp->t_rtseq = tp->t_rtseq; 1374 otp->t_rxtcur = tp->t_rxtcur; 1375 otp->t_maxseg = tp->t_maxseg; 1376 otp->t_srtt = tp->t_srtt; 1377 otp->t_rttvar = tp->t_rttvar; 1378 otp->t_rxtshift = tp->t_rxtshift; 1379 otp->t_rttmin = tp->t_rttmin; 1380 otp->t_rttupdated = tp->t_rttupdated; 1381 otp->max_sndwnd = tp->max_sndwnd; 1382 otp->t_softerror = tp->t_softerror; 1383 otp->t_oobflags = tp->t_oobflags; 1384 otp->t_iobc = tp->t_iobc; 1385 otp->snd_scale = tp->snd_scale; 1386 otp->rcv_scale = tp->rcv_scale; 1387 otp->request_r_scale = tp->request_r_scale; 1388 otp->requested_s_scale = tp->requested_s_scale; 1389 otp->ts_recent = tp->ts_recent; 1390 otp->ts_recent_age = tp->ts_recent_age; 1391 otp->last_ack_sent = tp->last_ack_sent; 1392 otp->cc_send = tp->cc_send; 1393 otp->cc_recv = tp->cc_recv; 1394 otp->snd_recover = tp->snd_recover; 1395 otp->snd_cwnd_prev = tp->snd_cwnd_prev; 1396 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev; 1397 otp->t_badrxtwin = 0; 1398} 1399 1400static int 1401tcp_pcblist SYSCTL_HANDLER_ARGS 1402{ 1403#pragma unused(oidp, arg1, arg2) 1404 int error, i = 0, n; 1405 struct inpcb *inp, **inp_list; 1406 struct tcpcb *tp; 1407 inp_gen_t gencnt; 1408 struct xinpgen xig; 1409 1410 /* 1411 * The process of preparing the TCB list is too time-consuming and 1412 * resource-intensive to repeat twice on every request. 1413 */ 1414 lck_rw_lock_shared(tcbinfo.ipi_lock); 1415 if (req->oldptr == USER_ADDR_NULL) { 1416 n = tcbinfo.ipi_count; 1417 req->oldidx = 2 * (sizeof xig) 1418 + (n + n/8) * sizeof(struct xtcpcb); 1419 lck_rw_done(tcbinfo.ipi_lock); 1420 return 0; 1421 } 1422 1423 if (req->newptr != USER_ADDR_NULL) { 1424 lck_rw_done(tcbinfo.ipi_lock); 1425 return EPERM; 1426 } 1427 1428 /* 1429 * OK, now we're committed to doing something. 1430 */ 1431 gencnt = tcbinfo.ipi_gencnt; 1432 n = tcbinfo.ipi_count; 1433 1434 bzero(&xig, sizeof(xig)); 1435 xig.xig_len = sizeof xig; 1436 xig.xig_count = n; 1437 xig.xig_gen = gencnt; 1438 xig.xig_sogen = so_gencnt; 1439 error = SYSCTL_OUT(req, &xig, sizeof xig); 1440 if (error) { 1441 lck_rw_done(tcbinfo.ipi_lock); 1442 return error; 1443 } 1444 /* 1445 * We are done if there is no pcb 1446 */ 1447 if (n == 0) { 1448 lck_rw_done(tcbinfo.ipi_lock); 1449 return 0; 1450 } 1451 1452 inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1453 if (inp_list == 0) { 1454 lck_rw_done(tcbinfo.ipi_lock); 1455 return ENOMEM; 1456 } 1457 1458 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { 1459 if (inp->inp_gencnt <= gencnt && 1460 inp->inp_state != INPCB_STATE_DEAD) 1461 inp_list[i++] = inp; 1462 if (i >= n) break; 1463 } 1464 1465 TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) { 1466 inp = tp->t_inpcb; 1467 if (inp->inp_gencnt <= gencnt && 1468 inp->inp_state != INPCB_STATE_DEAD) 1469 inp_list[i++] = inp; 1470 if (i >= n) break; 1471 } 1472 1473 n = i; 1474 1475 error = 0; 1476 for (i = 0; i < n; i++) { 1477 inp = inp_list[i]; 1478 if (inp->inp_gencnt <= gencnt && 1479 inp->inp_state != INPCB_STATE_DEAD) { 1480 struct xtcpcb xt; 1481 caddr_t inp_ppcb; 1482 1483 bzero(&xt, sizeof(xt)); 1484 xt.xt_len = sizeof xt; 1485 /* XXX should avoid extra copy */ 1486 inpcb_to_compat(inp, &xt.xt_inp); 1487 inp_ppcb = inp->inp_ppcb; 1488 if (inp_ppcb != NULL) { 1489 tcpcb_to_otcpcb( 1490 (struct tcpcb *)(void *)inp_ppcb, 1491 &xt.xt_tp); 1492 } else { 1493 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 1494 } 1495 if (inp->inp_socket) 1496 sotoxsocket(inp->inp_socket, &xt.xt_socket); 1497 error = SYSCTL_OUT(req, &xt, sizeof xt); 1498 } 1499 } 1500 if (!error) { 1501 /* 1502 * Give the user an updated idea of our state. 1503 * If the generation differs from what we told 1504 * her before, she knows that something happened 1505 * while we were processing this request, and it 1506 * might be necessary to retry. 1507 */ 1508 bzero(&xig, sizeof(xig)); 1509 xig.xig_len = sizeof xig; 1510 xig.xig_gen = tcbinfo.ipi_gencnt; 1511 xig.xig_sogen = so_gencnt; 1512 xig.xig_count = tcbinfo.ipi_count; 1513 error = SYSCTL_OUT(req, &xig, sizeof xig); 1514 } 1515 FREE(inp_list, M_TEMP); 1516 lck_rw_done(tcbinfo.ipi_lock); 1517 return error; 1518} 1519 1520SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 1521 tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 1522 1523 1524static void 1525tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp) 1526{ 1527 int i; 1528 1529 otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; 1530 otp->t_dupacks = tp->t_dupacks; 1531 for (i = 0; i < TCPT_NTIMERS_EXT; i++) 1532 otp->t_timer[i] = tp->t_timer[i]; 1533 otp->t_state = tp->t_state; 1534 otp->t_flags = tp->t_flags; 1535 otp->t_force = tp->t_force; 1536 otp->snd_una = tp->snd_una; 1537 otp->snd_max = tp->snd_max; 1538 otp->snd_nxt = tp->snd_nxt; 1539 otp->snd_up = tp->snd_up; 1540 otp->snd_wl1 = tp->snd_wl1; 1541 otp->snd_wl2 = tp->snd_wl2; 1542 otp->iss = tp->iss; 1543 otp->irs = tp->irs; 1544 otp->rcv_nxt = tp->rcv_nxt; 1545 otp->rcv_adv = tp->rcv_adv; 1546 otp->rcv_wnd = tp->rcv_wnd; 1547 otp->rcv_up = tp->rcv_up; 1548 otp->snd_wnd = tp->snd_wnd; 1549 otp->snd_cwnd = tp->snd_cwnd; 1550 otp->snd_ssthresh = tp->snd_ssthresh; 1551 otp->t_maxopd = tp->t_maxopd; 1552 otp->t_rcvtime = tp->t_rcvtime; 1553 otp->t_starttime = tp->t_starttime; 1554 otp->t_rtttime = tp->t_rtttime; 1555 otp->t_rtseq = tp->t_rtseq; 1556 otp->t_rxtcur = tp->t_rxtcur; 1557 otp->t_maxseg = tp->t_maxseg; 1558 otp->t_srtt = tp->t_srtt; 1559 otp->t_rttvar = tp->t_rttvar; 1560 otp->t_rxtshift = tp->t_rxtshift; 1561 otp->t_rttmin = tp->t_rttmin; 1562 otp->t_rttupdated = tp->t_rttupdated; 1563 otp->max_sndwnd = tp->max_sndwnd; 1564 otp->t_softerror = tp->t_softerror; 1565 otp->t_oobflags = tp->t_oobflags; 1566 otp->t_iobc = tp->t_iobc; 1567 otp->snd_scale = tp->snd_scale; 1568 otp->rcv_scale = tp->rcv_scale; 1569 otp->request_r_scale = tp->request_r_scale; 1570 otp->requested_s_scale = tp->requested_s_scale; 1571 otp->ts_recent = tp->ts_recent; 1572 otp->ts_recent_age = tp->ts_recent_age; 1573 otp->last_ack_sent = tp->last_ack_sent; 1574 otp->cc_send = tp->cc_send; 1575 otp->cc_recv = tp->cc_recv; 1576 otp->snd_recover = tp->snd_recover; 1577 otp->snd_cwnd_prev = tp->snd_cwnd_prev; 1578 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev; 1579 otp->t_badrxtwin = 0; 1580} 1581 1582 1583static int 1584tcp_pcblist64 SYSCTL_HANDLER_ARGS 1585{ 1586#pragma unused(oidp, arg1, arg2) 1587 int error, i = 0, n; 1588 struct inpcb *inp, **inp_list; 1589 struct tcpcb *tp; 1590 inp_gen_t gencnt; 1591 struct xinpgen xig; 1592 1593 /* 1594 * The process of preparing the TCB list is too time-consuming and 1595 * resource-intensive to repeat twice on every request. 1596 */ 1597 lck_rw_lock_shared(tcbinfo.ipi_lock); 1598 if (req->oldptr == USER_ADDR_NULL) { 1599 n = tcbinfo.ipi_count; 1600 req->oldidx = 2 * (sizeof xig) 1601 + (n + n/8) * sizeof(struct xtcpcb64); 1602 lck_rw_done(tcbinfo.ipi_lock); 1603 return 0; 1604 } 1605 1606 if (req->newptr != USER_ADDR_NULL) { 1607 lck_rw_done(tcbinfo.ipi_lock); 1608 return EPERM; 1609 } 1610 1611 /* 1612 * OK, now we're committed to doing something. 1613 */ 1614 gencnt = tcbinfo.ipi_gencnt; 1615 n = tcbinfo.ipi_count; 1616 1617 bzero(&xig, sizeof(xig)); 1618 xig.xig_len = sizeof xig; 1619 xig.xig_count = n; 1620 xig.xig_gen = gencnt; 1621 xig.xig_sogen = so_gencnt; 1622 error = SYSCTL_OUT(req, &xig, sizeof xig); 1623 if (error) { 1624 lck_rw_done(tcbinfo.ipi_lock); 1625 return error; 1626 } 1627 /* 1628 * We are done if there is no pcb 1629 */ 1630 if (n == 0) { 1631 lck_rw_done(tcbinfo.ipi_lock); 1632 return 0; 1633 } 1634 1635 inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1636 if (inp_list == 0) { 1637 lck_rw_done(tcbinfo.ipi_lock); 1638 return ENOMEM; 1639 } 1640 1641 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { 1642 if (inp->inp_gencnt <= gencnt && 1643 inp->inp_state != INPCB_STATE_DEAD) 1644 inp_list[i++] = inp; 1645 if (i >= n) break; 1646 } 1647 1648 TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) { 1649 inp = tp->t_inpcb; 1650 if (inp->inp_gencnt <= gencnt && 1651 inp->inp_state != INPCB_STATE_DEAD) 1652 inp_list[i++] = inp; 1653 if (i >= n) break; 1654 } 1655 1656 n = i; 1657 1658 error = 0; 1659 for (i = 0; i < n; i++) { 1660 inp = inp_list[i]; 1661 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { 1662 struct xtcpcb64 xt; 1663 1664 bzero(&xt, sizeof(xt)); 1665 xt.xt_len = sizeof xt; 1666 inpcb_to_xinpcb64(inp, &xt.xt_inpcb); 1667 xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; 1668 if (inp->inp_ppcb != NULL) 1669 tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt); 1670 if (inp->inp_socket) 1671 sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket); 1672 error = SYSCTL_OUT(req, &xt, sizeof xt); 1673 } 1674 } 1675 if (!error) { 1676 /* 1677 * Give the user an updated idea of our state. 1678 * If the generation differs from what we told 1679 * her before, she knows that something happened 1680 * while we were processing this request, and it 1681 * might be necessary to retry. 1682 */ 1683 bzero(&xig, sizeof(xig)); 1684 xig.xig_len = sizeof xig; 1685 xig.xig_gen = tcbinfo.ipi_gencnt; 1686 xig.xig_sogen = so_gencnt; 1687 xig.xig_count = tcbinfo.ipi_count; 1688 error = SYSCTL_OUT(req, &xig, sizeof xig); 1689 } 1690 FREE(inp_list, M_TEMP); 1691 lck_rw_done(tcbinfo.ipi_lock); 1692 return error; 1693} 1694 1695SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 1696 tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections"); 1697 1698 1699static int 1700tcp_pcblist_n SYSCTL_HANDLER_ARGS 1701{ 1702#pragma unused(oidp, arg1, arg2) 1703 int error = 0; 1704 1705 error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo); 1706 1707 return error; 1708} 1709 1710 1711SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 1712 tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); 1713 1714 1715__private_extern__ void 1716tcp_get_ports_used(uint32_t ifindex, int protocol, uint32_t wildcardok, 1717 bitstr_t *bitfield) 1718{ 1719 inpcb_get_ports_used(ifindex, protocol, wildcardok, bitfield, &tcbinfo); 1720} 1721 1722__private_extern__ uint32_t 1723tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags) 1724{ 1725 return inpcb_count_opportunistic(ifindex, &tcbinfo, flags); 1726} 1727 1728__private_extern__ uint32_t 1729tcp_find_anypcb_byaddr(struct ifaddr *ifa) 1730{ 1731 return inpcb_find_anypcb_byaddr(ifa, &tcbinfo); 1732} 1733 1734void 1735tcp_ctlinput(cmd, sa, vip) 1736 int cmd; 1737 struct sockaddr *sa; 1738 void *vip; 1739{ 1740 tcp_seq icmp_tcp_seq; 1741 struct ip *ip = vip; 1742 struct in_addr faddr; 1743 struct inpcb *inp; 1744 struct tcpcb *tp; 1745 1746 void (*notify)(struct inpcb *, int) = tcp_notify; 1747 1748 faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr; 1749 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1750 return; 1751 1752 if (cmd == PRC_MSGSIZE) 1753 notify = tcp_mtudisc; 1754 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || 1755 cmd == PRC_UNREACH_PORT) && ip) 1756 notify = tcp_drop_syn_sent; 1757 else if (PRC_IS_REDIRECT(cmd)) { 1758 ip = 0; 1759 notify = in_rtchange; 1760 } else if (cmd == PRC_HOSTDEAD) 1761 ip = 0; 1762 /* Source quench is deprecated */ 1763 else if (cmd == PRC_QUENCH) 1764 return; 1765 else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) 1766 return; 1767 if (ip) { 1768 struct tcphdr th; 1769 struct icmp *icp; 1770 1771 icp = (struct icmp *)(void *) 1772 ((caddr_t)ip - offsetof(struct icmp, icmp_ip)); 1773 bcopy(((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)), 1774 &th, sizeof (th)); 1775 inp = in_pcblookup_hash(&tcbinfo, faddr, th.th_dport, 1776 ip->ip_src, th.th_sport, 0, NULL); 1777 if (inp != NULL && inp->inp_socket != NULL) { 1778 tcp_lock(inp->inp_socket, 1, 0); 1779 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { 1780 tcp_unlock(inp->inp_socket, 1, 0); 1781 return; 1782 } 1783 icmp_tcp_seq = htonl(th.th_seq); 1784 tp = intotcpcb(inp); 1785 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && 1786 SEQ_LT(icmp_tcp_seq, tp->snd_max)) { 1787 if (cmd == PRC_MSGSIZE) { 1788 1789 /* 1790 * MTU discovery: 1791 * If we got a needfrag and there is a host route to the 1792 * original destination, and the MTU is not locked, then 1793 * set the MTU in the route to the suggested new value 1794 * (if given) and then notify as usual. The ULPs will 1795 * notice that the MTU has changed and adapt accordingly. 1796 * If no new MTU was suggested, then we guess a new one 1797 * less than the current value. If the new MTU is 1798 * unreasonably small (defined by sysctl tcp_minmss), then 1799 * we reset the MTU to the interface value and enable the 1800 * lock bit, indicating that we are no longer doing MTU 1801 * discovery. 1802 */ 1803 struct rtentry *rt; 1804 int mtu; 1805 struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET, 1806 0 , { 0 }, { 0,0,0,0,0,0,0,0 } }; 1807 icmpsrc.sin_addr = icp->icmp_ip.ip_dst; 1808 1809 rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, 1810 RTF_CLONING | RTF_PRCLONING); 1811 if (rt != NULL) { 1812 RT_LOCK(rt); 1813 if ((rt->rt_flags & RTF_HOST) && 1814 !(rt->rt_rmx.rmx_locks & RTV_MTU)) { 1815 mtu = ntohs(icp->icmp_nextmtu); 1816 if (!mtu) 1817 mtu = ip_next_mtu(rt->rt_rmx. 1818 rmx_mtu, 1); 1819#if DEBUG_MTUDISC 1820 printf("MTU for %s reduced to %d\n", 1821 inet_ntop(AF_INET, 1822 &icmpsrc.sin_addr, ipv4str, 1823 sizeof (ipv4str)), mtu); 1824#endif 1825 if (mtu < max(296, (tcp_minmss + 1826 sizeof (struct tcpiphdr)))) { 1827 /* rt->rt_rmx.rmx_mtu = 1828 rt->rt_ifp->if_mtu; */ 1829 rt->rt_rmx.rmx_locks |= RTV_MTU; 1830 } else if (rt->rt_rmx.rmx_mtu > mtu) { 1831 rt->rt_rmx.rmx_mtu = mtu; 1832 } 1833 } 1834 RT_UNLOCK(rt); 1835 rtfree(rt); 1836 } 1837 } 1838 1839 (*notify)(inp, inetctlerrmap[cmd]); 1840 } 1841 tcp_unlock(inp->inp_socket, 1, 0); 1842 } 1843 } else 1844 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); 1845} 1846 1847#if INET6 1848void 1849tcp6_ctlinput(cmd, sa, d) 1850 int cmd; 1851 struct sockaddr *sa; 1852 void *d; 1853{ 1854 struct tcphdr th; 1855 void (*notify)(struct inpcb *, int) = tcp_notify; 1856 struct ip6_hdr *ip6; 1857 struct mbuf *m; 1858 struct ip6ctlparam *ip6cp = NULL; 1859 const struct sockaddr_in6 *sa6_src = NULL; 1860 int off; 1861 struct tcp_portonly { 1862 u_int16_t th_sport; 1863 u_int16_t th_dport; 1864 } *thp; 1865 1866 if (sa->sa_family != AF_INET6 || 1867 sa->sa_len != sizeof(struct sockaddr_in6)) 1868 return; 1869 1870 if (cmd == PRC_MSGSIZE) 1871 notify = tcp_mtudisc; 1872 else if (!PRC_IS_REDIRECT(cmd) && 1873 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) 1874 return; 1875 /* Source quench is deprecated */ 1876 else if (cmd == PRC_QUENCH) 1877 return; 1878 1879 /* if the parameter is from icmp6, decode it. */ 1880 if (d != NULL) { 1881 ip6cp = (struct ip6ctlparam *)d; 1882 m = ip6cp->ip6c_m; 1883 ip6 = ip6cp->ip6c_ip6; 1884 off = ip6cp->ip6c_off; 1885 sa6_src = ip6cp->ip6c_src; 1886 } else { 1887 m = NULL; 1888 ip6 = NULL; 1889 off = 0; /* fool gcc */ 1890 sa6_src = &sa6_any; 1891 } 1892 1893 if (ip6) { 1894 /* 1895 * XXX: We assume that when IPV6 is non NULL, 1896 * M and OFF are valid. 1897 */ 1898 1899 /* check if we can safely examine src and dst ports */ 1900 if (m->m_pkthdr.len < off + sizeof(*thp)) 1901 return; 1902 1903 bzero(&th, sizeof(th)); 1904 m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 1905 1906 in6_pcbnotify(&tcbinfo, sa, th.th_dport, 1907 (struct sockaddr *)ip6cp->ip6c_src, 1908 th.th_sport, cmd, NULL, notify); 1909 } else { 1910 in6_pcbnotify(&tcbinfo, sa, 0, 1911 (struct sockaddr *)(size_t)sa6_src, 0, cmd, NULL, notify); 1912 } 1913} 1914#endif /* INET6 */ 1915 1916 1917/* 1918 * Following is where TCP initial sequence number generation occurs. 1919 * 1920 * There are two places where we must use initial sequence numbers: 1921 * 1. In SYN-ACK packets. 1922 * 2. In SYN packets. 1923 * 1924 * The ISNs in SYN-ACK packets have no monotonicity requirement, 1925 * and should be as unpredictable as possible to avoid the possibility 1926 * of spoofing and/or connection hijacking. To satisfy this 1927 * requirement, SYN-ACK ISNs are generated via the arc4random() 1928 * function. If exact RFC 1948 compliance is requested via sysctl, 1929 * these ISNs will be generated just like those in SYN packets. 1930 * 1931 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 1932 * depends on this property. In addition, these ISNs should be 1933 * unguessable so as to prevent connection hijacking. To satisfy 1934 * the requirements of this situation, the algorithm outlined in 1935 * RFC 1948 is used to generate sequence numbers. 1936 * 1937 * For more information on the theory of operation, please see 1938 * RFC 1948. 1939 * 1940 * Implementation details: 1941 * 1942 * Time is based off the system timer, and is corrected so that it 1943 * increases by one megabyte per second. This allows for proper 1944 * recycling on high speed LANs while still leaving over an hour 1945 * before rollover. 1946 * 1947 * Two sysctls control the generation of ISNs: 1948 * 1949 * net.inet.tcp.isn_reseed_interval controls the number of seconds 1950 * between seeding of isn_secret. This is normally set to zero, 1951 * as reseeding should not be necessary. 1952 * 1953 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed 1954 * strictly. When strict compliance is requested, reseeding is 1955 * disabled and SYN-ACKs will be generated in the same manner as 1956 * SYNs. Strict mode is disabled by default. 1957 * 1958 */ 1959 1960#define ISN_BYTES_PER_SECOND 1048576 1961 1962tcp_seq 1963tcp_new_isn(tp) 1964 struct tcpcb *tp; 1965{ 1966 u_int32_t md5_buffer[4]; 1967 tcp_seq new_isn; 1968 struct timeval timenow; 1969 u_char isn_secret[32]; 1970 int isn_last_reseed = 0; 1971 MD5_CTX isn_ctx; 1972 1973 /* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */ 1974 if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) 1975 && tcp_strict_rfc1948 == 0) 1976#ifdef __APPLE__ 1977 return RandomULong(); 1978#else 1979 return arc4random(); 1980#endif 1981 getmicrotime(&timenow); 1982 1983 /* Seed if this is the first use, reseed if requested. */ 1984 if ((isn_last_reseed == 0) || 1985 ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) && 1986 (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) 1987 < (u_int)timenow.tv_sec))) { 1988#ifdef __APPLE__ 1989 read_random(&isn_secret, sizeof(isn_secret)); 1990#else 1991 read_random_unlimited(&isn_secret, sizeof(isn_secret)); 1992#endif 1993 isn_last_reseed = timenow.tv_sec; 1994 } 1995 1996 /* Compute the md5 hash and return the ISN. */ 1997 MD5Init(&isn_ctx); 1998 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); 1999 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); 2000#if INET6 2001 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { 2002 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, 2003 sizeof(struct in6_addr)); 2004 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, 2005 sizeof(struct in6_addr)); 2006 } else 2007#endif 2008 { 2009 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, 2010 sizeof(struct in_addr)); 2011 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, 2012 sizeof(struct in_addr)); 2013 } 2014 MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); 2015 MD5Final((u_char *) &md5_buffer, &isn_ctx); 2016 new_isn = (tcp_seq) md5_buffer[0]; 2017 new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz); 2018 return new_isn; 2019} 2020 2021 2022/* 2023 * When a specific ICMP unreachable message is received and the 2024 * connection state is SYN-SENT, drop the connection. This behavior 2025 * is controlled by the icmp_may_rst sysctl. 2026 */ 2027void 2028tcp_drop_syn_sent(inp, errno) 2029 struct inpcb *inp; 2030 int errno; 2031{ 2032 struct tcpcb *tp = intotcpcb(inp); 2033 2034 if (tp && tp->t_state == TCPS_SYN_SENT) 2035 tcp_drop(tp, errno); 2036} 2037 2038/* 2039 * When `need fragmentation' ICMP is received, update our idea of the MSS 2040 * based on the new value in the route. Also nudge TCP to send something, 2041 * since we know the packet we just sent was dropped. 2042 * This duplicates some code in the tcp_mss() function in tcp_input.c. 2043 */ 2044void 2045tcp_mtudisc( 2046 struct inpcb *inp, 2047 __unused int errno 2048) 2049{ 2050 struct tcpcb *tp = intotcpcb(inp); 2051 struct rtentry *rt; 2052 struct rmxp_tao *taop; 2053 struct socket *so = inp->inp_socket; 2054 int offered; 2055 int mss; 2056#if INET6 2057 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 2058#endif /* INET6 */ 2059 2060 if (tp) { 2061#if INET6 2062 if (isipv6) 2063 rt = tcp_rtlookup6(inp, IFSCOPE_NONE); 2064 else 2065#endif /* INET6 */ 2066 rt = tcp_rtlookup(inp, IFSCOPE_NONE); 2067 if (!rt || !rt->rt_rmx.rmx_mtu) { 2068 tp->t_maxopd = tp->t_maxseg = 2069#if INET6 2070 isipv6 ? tcp_v6mssdflt : 2071#endif /* INET6 */ 2072 tcp_mssdflt; 2073 2074 /* Route locked during lookup above */ 2075 if (rt != NULL) 2076 RT_UNLOCK(rt); 2077 return; 2078 } 2079 taop = rmx_taop(rt->rt_rmx); 2080 offered = taop->tao_mssopt; 2081 mss = rt->rt_rmx.rmx_mtu - 2082#if INET6 2083 (isipv6 ? 2084 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : 2085#endif /* INET6 */ 2086 sizeof(struct tcpiphdr) 2087#if INET6 2088 ) 2089#endif /* INET6 */ 2090 ; 2091 2092 /* Route locked during lookup above */ 2093 RT_UNLOCK(rt); 2094 2095 if (offered) 2096 mss = min(mss, offered); 2097 /* 2098 * XXX - The above conditional probably violates the TCP 2099 * spec. The problem is that, since we don't know the 2100 * other end's MSS, we are supposed to use a conservative 2101 * default. But, if we do that, then MTU discovery will 2102 * never actually take place, because the conservative 2103 * default is much less than the MTUs typically seen 2104 * on the Internet today. For the moment, we'll sweep 2105 * this under the carpet. 2106 * 2107 * The conservative default might not actually be a problem 2108 * if the only case this occurs is when sending an initial 2109 * SYN with options and data to a host we've never talked 2110 * to before. Then, they will reply with an MSS value which 2111 * will get recorded and the new parameters should get 2112 * recomputed. For Further Study. 2113 */ 2114 if (tp->t_maxopd <= mss) 2115 return; 2116 tp->t_maxopd = mss; 2117 2118 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2119 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2120 mss -= TCPOLEN_TSTAMP_APPA; 2121 2122#if MPTCP 2123 mss -= mptcp_adj_mss(tp, TRUE); 2124#endif 2125 if (so->so_snd.sb_hiwat < mss) 2126 mss = so->so_snd.sb_hiwat; 2127 2128 tp->t_maxseg = mss; 2129 2130 /* 2131 * Reset the slow-start flight size as it may depends on the new MSS 2132 */ 2133 if (CC_ALGO(tp)->cwnd_init != NULL) 2134 CC_ALGO(tp)->cwnd_init(tp); 2135 tcpstat.tcps_mturesent++; 2136 tp->t_rtttime = 0; 2137 tp->snd_nxt = tp->snd_una; 2138 tcp_output(tp); 2139 } 2140} 2141 2142/* 2143 * Look-up the routing entry to the peer of this inpcb. If no route 2144 * is found and it cannot be allocated the return NULL. This routine 2145 * is called by TCP routines that access the rmx structure and by tcp_mss 2146 * to get the interface MTU. If a route is found, this routine will 2147 * hold the rtentry lock; the caller is responsible for unlocking. 2148 */ 2149struct rtentry * 2150tcp_rtlookup(inp, input_ifscope) 2151 struct inpcb *inp; 2152 unsigned int input_ifscope; 2153{ 2154 struct route *ro; 2155 struct rtentry *rt; 2156 struct tcpcb *tp; 2157 2158 lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); 2159 2160 ro = &inp->inp_route; 2161 if ((rt = ro->ro_rt) != NULL) 2162 RT_LOCK(rt); 2163 2164 if (ROUTE_UNUSABLE(ro)) { 2165 if (rt != NULL) { 2166 RT_UNLOCK(rt); 2167 rt = NULL; 2168 } 2169 ROUTE_RELEASE(ro); 2170 /* No route yet, so try to acquire one */ 2171 if (inp->inp_faddr.s_addr != INADDR_ANY) { 2172 unsigned int ifscope; 2173 2174 ro->ro_dst.sa_family = AF_INET; 2175 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 2176 ((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr = 2177 inp->inp_faddr; 2178 2179 /* 2180 * If the socket was bound to an interface, then 2181 * the bound-to-interface takes precedence over 2182 * the inbound interface passed in by the caller 2183 * (if we get here as part of the output path then 2184 * input_ifscope is IFSCOPE_NONE). 2185 */ 2186 ifscope = (inp->inp_flags & INP_BOUND_IF) ? 2187 inp->inp_boundifp->if_index : input_ifscope; 2188 2189 rtalloc_scoped(ro, ifscope); 2190 if ((rt = ro->ro_rt) != NULL) 2191 RT_LOCK(rt); 2192 } 2193 } 2194 if (rt != NULL) 2195 RT_LOCK_ASSERT_HELD(rt); 2196 2197 /* 2198 * Update MTU discovery determination. Don't do it if: 2199 * 1) it is disabled via the sysctl 2200 * 2) the route isn't up 2201 * 3) the MTU is locked (if it is, then discovery has been 2202 * disabled) 2203 */ 2204 2205 tp = intotcpcb(inp); 2206 2207 if (!path_mtu_discovery || ((rt != NULL) && 2208 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) 2209 tp->t_flags &= ~TF_PMTUD; 2210 else 2211 tp->t_flags |= TF_PMTUD; 2212 2213#if CONFIG_IFEF_NOWINDOWSCALE 2214 if (tcp_obey_ifef_nowindowscale && 2215 tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && 2216 (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) { 2217 /* Window scaling is enabled on this interface */ 2218 tp->t_flags &= ~TF_REQ_SCALE; 2219 } 2220#endif 2221 2222 if (rt != NULL && rt->rt_ifp != NULL) { 2223 somultipages(inp->inp_socket, 2224 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); 2225 tcp_set_tso(tp, rt->rt_ifp); 2226 } 2227 2228 /* Note if the peer is local */ 2229 if (rt != NULL && 2230 (rt->rt_gateway->sa_family == AF_LINK || 2231 rt->rt_ifp->if_flags & IFF_LOOPBACK || 2232 in_localaddr(inp->inp_faddr))) { 2233 tp->t_flags |= TF_LOCAL; 2234 } 2235 2236 /* 2237 * Caller needs to call RT_UNLOCK(rt). 2238 */ 2239 return rt; 2240} 2241 2242#if INET6 2243struct rtentry * 2244tcp_rtlookup6(inp, input_ifscope) 2245 struct inpcb *inp; 2246 unsigned int input_ifscope; 2247{ 2248 struct route_in6 *ro6; 2249 struct rtentry *rt; 2250 struct tcpcb *tp; 2251 2252 lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); 2253 2254 ro6 = &inp->in6p_route; 2255 if ((rt = ro6->ro_rt) != NULL) 2256 RT_LOCK(rt); 2257 2258 if (ROUTE_UNUSABLE(ro6)) { 2259 if (rt != NULL) { 2260 RT_UNLOCK(rt); 2261 rt = NULL; 2262 } 2263 ROUTE_RELEASE(ro6); 2264 /* No route yet, so try to acquire one */ 2265 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { 2266 struct sockaddr_in6 *dst6; 2267 unsigned int ifscope; 2268 2269 dst6 = (struct sockaddr_in6 *)&ro6->ro_dst; 2270 dst6->sin6_family = AF_INET6; 2271 dst6->sin6_len = sizeof(*dst6); 2272 dst6->sin6_addr = inp->in6p_faddr; 2273 2274 /* 2275 * If the socket was bound to an interface, then 2276 * the bound-to-interface takes precedence over 2277 * the inbound interface passed in by the caller 2278 * (if we get here as part of the output path then 2279 * input_ifscope is IFSCOPE_NONE). 2280 */ 2281 ifscope = (inp->inp_flags & INP_BOUND_IF) ? 2282 inp->inp_boundifp->if_index : input_ifscope; 2283 2284 rtalloc_scoped((struct route *)ro6, ifscope); 2285 if ((rt = ro6->ro_rt) != NULL) 2286 RT_LOCK(rt); 2287 } 2288 } 2289 if (rt != NULL) 2290 RT_LOCK_ASSERT_HELD(rt); 2291 2292 /* 2293 * Update path MTU Discovery determination 2294 * while looking up the route: 2295 * 1) we have a valid route to the destination 2296 * 2) the MTU is not locked (if it is, then discovery has been 2297 * disabled) 2298 */ 2299 2300 2301 tp = intotcpcb(inp); 2302 2303 /* 2304 * Update MTU discovery determination. Don't do it if: 2305 * 1) it is disabled via the sysctl 2306 * 2) the route isn't up 2307 * 3) the MTU is locked (if it is, then discovery has been 2308 * disabled) 2309 */ 2310 2311 if (!path_mtu_discovery || ((rt != NULL) && 2312 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) 2313 tp->t_flags &= ~TF_PMTUD; 2314 else 2315 tp->t_flags |= TF_PMTUD; 2316 2317#if CONFIG_IFEF_NOWINDOWSCALE 2318 if (tcp_obey_ifef_nowindowscale && 2319 tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && 2320 (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) { 2321 /* Window scaling is not enabled on this interface */ 2322 tp->t_flags &= ~TF_REQ_SCALE; 2323 } 2324#endif 2325 2326 if (rt != NULL && rt->rt_ifp != NULL) { 2327 somultipages(inp->inp_socket, 2328 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); 2329 tcp_set_tso(tp, rt->rt_ifp); 2330 } 2331 2332 /* Note if the peer is local */ 2333 if (rt != NULL && 2334 (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || 2335 IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || 2336 rt->rt_gateway->sa_family == AF_LINK || 2337 in6_localaddr(&inp->in6p_faddr))) { 2338 tp->t_flags |= TF_LOCAL; 2339 } 2340 2341 /* 2342 * Caller needs to call RT_UNLOCK(rt). 2343 */ 2344 return rt; 2345} 2346#endif /* INET6 */ 2347 2348#if IPSEC 2349/* compute ESP/AH header size for TCP, including outer IP header. */ 2350size_t 2351ipsec_hdrsiz_tcp(tp) 2352 struct tcpcb *tp; 2353{ 2354 struct inpcb *inp; 2355 struct mbuf *m; 2356 size_t hdrsiz; 2357 struct ip *ip; 2358#if INET6 2359 struct ip6_hdr *ip6 = NULL; 2360#endif /* INET6 */ 2361 struct tcphdr *th; 2362 2363 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) 2364 return 0; 2365 MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */ 2366 if (!m) 2367 return 0; 2368 2369#if INET6 2370 if ((inp->inp_vflag & INP_IPV6) != 0) { 2371 ip6 = mtod(m, struct ip6_hdr *); 2372 th = (struct tcphdr *)(void *)(ip6 + 1); 2373 m->m_pkthdr.len = m->m_len = 2374 sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 2375 tcp_fillheaders(tp, ip6, th); 2376 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 2377 } else 2378#endif /* INET6 */ 2379 { 2380 ip = mtod(m, struct ip *); 2381 th = (struct tcphdr *)(ip + 1); 2382 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 2383 tcp_fillheaders(tp, ip, th); 2384 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 2385 } 2386 m_free(m); 2387 return hdrsiz; 2388} 2389#endif /*IPSEC*/ 2390 2391/* 2392 * Return a pointer to the cached information about the remote host. 2393 * The cached information is stored in the protocol specific part of 2394 * the route metrics. 2395 */ 2396struct rmxp_tao * 2397tcp_gettaocache(inp) 2398 struct inpcb *inp; 2399{ 2400 struct rtentry *rt; 2401 struct rmxp_tao *taop; 2402 2403#if INET6 2404 if ((inp->inp_vflag & INP_IPV6) != 0) 2405 rt = tcp_rtlookup6(inp, IFSCOPE_NONE); 2406 else 2407#endif /* INET6 */ 2408 rt = tcp_rtlookup(inp, IFSCOPE_NONE); 2409 2410 /* Make sure this is a host route and is up. */ 2411 if (rt == NULL || 2412 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) { 2413 /* Route locked during lookup above */ 2414 if (rt != NULL) 2415 RT_UNLOCK(rt); 2416 return NULL; 2417 } 2418 2419 taop = rmx_taop(rt->rt_rmx); 2420 /* Route locked during lookup above */ 2421 RT_UNLOCK(rt); 2422 return (taop); 2423} 2424 2425/* 2426 * Clear all the TAO cache entries, called from tcp_init. 2427 * 2428 * XXX 2429 * This routine is just an empty one, because we assume that the routing 2430 * routing tables are initialized at the same time when TCP, so there is 2431 * nothing in the cache left over. 2432 */ 2433static void 2434tcp_cleartaocache() 2435{ 2436} 2437 2438int 2439tcp_lock(struct socket *so, int refcount, void *lr) 2440{ 2441 void *lr_saved; 2442 2443 if (lr == NULL) 2444 lr_saved = __builtin_return_address(0); 2445 else 2446 lr_saved = lr; 2447 2448 if (so->so_pcb != NULL) { 2449 lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); 2450 } else { 2451 panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n", 2452 so, lr_saved, solockhistory_nr(so)); 2453 /* NOTREACHED */ 2454 } 2455 2456 if (so->so_usecount < 0) { 2457 panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", 2458 so, so->so_pcb, lr_saved, so->so_usecount, solockhistory_nr(so)); 2459 /* NOTREACHED */ 2460 } 2461 if (refcount) 2462 so->so_usecount++; 2463 so->lock_lr[so->next_lock_lr] = lr_saved; 2464 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX; 2465 return (0); 2466} 2467 2468int 2469tcp_unlock(struct socket *so, int refcount, void *lr) 2470{ 2471 void *lr_saved; 2472 2473 if (lr == NULL) 2474 lr_saved = __builtin_return_address(0); 2475 else 2476 lr_saved = lr; 2477 2478#ifdef MORE_TCPLOCK_DEBUG 2479 printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x " 2480 "lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so), 2481 (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb), 2482 (uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)), 2483 so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved)); 2484#endif 2485 if (refcount) 2486 so->so_usecount--; 2487 2488 if (so->so_usecount < 0) { 2489 panic("tcp_unlock: so=%p usecount=%x lrh= %s\n", 2490 so, so->so_usecount, solockhistory_nr(so)); 2491 /* NOTREACHED */ 2492 } 2493 if (so->so_pcb == NULL) { 2494 panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", 2495 so, so->so_usecount, lr_saved, solockhistory_nr(so)); 2496 /* NOTREACHED */ 2497 } else { 2498 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, 2499 LCK_MTX_ASSERT_OWNED); 2500 so->unlock_lr[so->next_unlock_lr] = lr_saved; 2501 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; 2502 lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); 2503 } 2504 return (0); 2505} 2506 2507lck_mtx_t * 2508tcp_getlock( 2509 struct socket *so, 2510 __unused int locktype) 2511{ 2512 struct inpcb *inp = sotoinpcb(so); 2513 2514 if (so->so_pcb) { 2515 if (so->so_usecount < 0) 2516 panic("tcp_getlock: so=%p usecount=%x lrh= %s\n", 2517 so, so->so_usecount, solockhistory_nr(so)); 2518 return(&inp->inpcb_mtx); 2519 } 2520 else { 2521 panic("tcp_getlock: so=%p NULL so_pcb %s\n", 2522 so, solockhistory_nr(so)); 2523 return (so->so_proto->pr_domain->dom_mtx); 2524 } 2525} 2526 2527/* Determine if we can grow the recieve socket buffer to avoid sending 2528 * a zero window update to the peer. We allow even socket buffers that 2529 * have fixed size (set by the application) to grow if the resource 2530 * constraints are met. They will also be trimmed after the application 2531 * reads data. 2532 */ 2533static void 2534tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb) { 2535 u_int32_t rcvbufinc = tp->t_maxseg << tcp_autorcvbuf_inc_shift; 2536 u_int32_t rcvbuf = sb->sb_hiwat; 2537 struct socket *so = tp->t_inpcb->inp_socket; 2538 2539 /* 2540 * If message delivery is enabled, do not count 2541 * unordered bytes in receive buffer towards hiwat 2542 */ 2543 if (so->so_flags & SOF_ENABLE_MSGS) 2544 rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes; 2545 2546 if (tcp_do_autorcvbuf == 1 && 2547 tcp_cansbgrow(sb) && 2548 (tp->t_flags & TF_SLOWLINK) == 0 && 2549 (rcvbuf - sb->sb_cc) < rcvbufinc && 2550 (rcvbuf < tcp_autorcvbuf_max)) { 2551 sbreserve(sb, (sb->sb_hiwat + rcvbufinc)); 2552 } 2553} 2554 2555int32_t 2556tcp_sbspace(struct tcpcb *tp) 2557{ 2558 struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv; 2559 u_int32_t rcvbuf = sb->sb_hiwat; 2560 int32_t space; 2561 struct socket *so = tp->t_inpcb->inp_socket; 2562 2563 /* 2564 * If message delivery is enabled, do not count 2565 * unordered bytes in receive buffer towards hiwat mark. 2566 * This value is used to return correct rwnd that does 2567 * not reflect the extra unordered bytes added to the 2568 * receive socket buffer. 2569 */ 2570 if (so->so_flags & SOF_ENABLE_MSGS) 2571 rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes; 2572 2573 tcp_sbrcv_grow_rwin(tp, sb); 2574 2575 space = ((int32_t) imin((rcvbuf - sb->sb_cc), 2576 (sb->sb_mbmax - sb->sb_mbcnt))); 2577 if (space < 0) 2578 space = 0; 2579 2580 /* Avoid increasing window size if the current window 2581 * is already very low, we could be in "persist" mode and 2582 * we could break some apps (see rdar://5409343) 2583 */ 2584 2585 if (space < tp->t_maxseg) 2586 return space; 2587 2588 /* Clip window size for slower link */ 2589 2590 if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 ) 2591 return imin(space, slowlink_wsize); 2592 2593 return space; 2594} 2595/* 2596 * Checks TCP Segment Offloading capability for a given connection and interface pair. 2597 */ 2598void 2599tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp) 2600{ 2601#if MPTCP 2602 /* 2603 * We can't use TSO if this tcpcb belongs to an MPTCP session. 2604 */ 2605 if (tp->t_mpflags & TMPF_MPTCP_TRUE) { 2606 tp->t_flags &= ~TF_TSO; 2607 return; 2608 } 2609#endif 2610#if INET6 2611 struct inpcb *inp = tp->t_inpcb; 2612 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 2613 2614 if (isipv6) { 2615 if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV6)) { 2616 tp->t_flags |= TF_TSO; 2617 if (ifp->if_tso_v6_mtu != 0) 2618 tp->tso_max_segment_size = ifp->if_tso_v6_mtu; 2619 else 2620 tp->tso_max_segment_size = TCP_MAXWIN; 2621 } else 2622 tp->t_flags &= ~TF_TSO; 2623 2624 } else 2625#endif /* INET6 */ 2626 2627 { 2628 if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV4)) { 2629 tp->t_flags |= TF_TSO; 2630 if (ifp->if_tso_v4_mtu != 0) 2631 tp->tso_max_segment_size = ifp->if_tso_v4_mtu; 2632 else 2633 tp->tso_max_segment_size = TCP_MAXWIN; 2634 } else 2635 tp->t_flags &= ~TF_TSO; 2636 } 2637} 2638 2639#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC) 2640 2641/* Function to calculate the tcp clock. The tcp clock will get updated 2642 * at the boundaries of the tcp layer. This is done at 3 places: 2643 * 1. Right before processing an input tcp packet 2644 * 2. Whenever a connection wants to access the network using tcp_usrreqs 2645 * 3. When a tcp timer fires or before tcp slow timeout 2646 * 2647 */ 2648 2649void 2650calculate_tcp_clock() 2651{ 2652 struct timeval tv = tcp_uptime; 2653 struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC}; 2654 struct timeval now, hold_now; 2655 uint32_t incr = 0; 2656 2657 microuptime(&now); 2658 2659 /* 2660 * Update coarse-grained networking timestamp (in sec.); the idea 2661 * is to update the counter returnable via net_uptime() when 2662 * we read time. 2663 */ 2664 net_update_uptime_secs(now.tv_sec); 2665 2666 timevaladd(&tv, &interval); 2667 if (timevalcmp(&now, &tv, >)) { 2668 /* time to update the clock */ 2669 lck_spin_lock(tcp_uptime_lock); 2670 if (timevalcmp(&tcp_uptime, &now, >=)) { 2671 /* clock got updated while we were waiting for the lock */ 2672 lck_spin_unlock(tcp_uptime_lock); 2673 return; 2674 } 2675 2676 microuptime(&now); 2677 hold_now = now; 2678 tv = tcp_uptime; 2679 timevalsub(&now, &tv); 2680 2681 incr = TIMEVAL_TO_TCPHZ(now); 2682 if (incr > 0) { 2683 tcp_uptime = hold_now; 2684 tcp_now += incr; 2685 } 2686 2687 lck_spin_unlock(tcp_uptime_lock); 2688 } 2689 return; 2690} 2691 2692/* Compute receive window scaling that we are going to request 2693 * for this connection based on sb_hiwat. Try to leave some 2694 * room to potentially increase the window size upto a maximum 2695 * defined by the constant tcp_autorcvbuf_max. 2696 */ 2697void 2698tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) { 2699 u_int32_t maxsockbufsize; 2700 2701 tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); 2702 maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ? 2703 so->so_rcv.sb_hiwat : tcp_autorcvbuf_max; 2704 2705 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 2706 (TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize) 2707 tp->request_r_scale++; 2708 tp->request_r_scale = min(tp->request_r_scale, TCP_MAX_WINSHIFT); 2709 2710} 2711 2712int 2713tcp_notsent_lowat_check(struct socket *so) { 2714 struct inpcb *inp = sotoinpcb(so); 2715 struct tcpcb *tp = NULL; 2716 int notsent = 0; 2717 if (inp != NULL) { 2718 tp = intotcpcb(inp); 2719 } 2720 2721 notsent = so->so_snd.sb_cc - 2722 (tp->snd_nxt - tp->snd_una); 2723 2724 /* When we send a FIN or SYN, not_sent can be negative. 2725 * In that case also we need to send a write event to the 2726 * process if it is waiting. In the FIN case, it will 2727 * get an error from send because cantsendmore will be set. 2728 */ 2729 if (notsent <= tp->t_notsent_lowat) { 2730 return(1); 2731 } 2732 2733 /* When Nagle's algorithm is not disabled, it is better 2734 * to wakeup the client until there is atleast one 2735 * maxseg of data to write. 2736 */ 2737 if ((tp->t_flags & TF_NODELAY) == 0 && 2738 notsent > 0 && notsent < tp->t_maxseg) { 2739 return(1); 2740 } 2741 return(0); 2742} 2743 2744 2745/* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */ 2746