1/* 2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. 
Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $ 62 */ 63/* 64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 65 * support for mandatory and extensible security protections. This notice 66 * is included in support of clause 2.2 (b) of the Apple Public License, 67 * Version 2.0. 
68 */ 69 70#include <sys/param.h> 71#include <sys/systm.h> 72#include <sys/callout.h> 73#include <sys/kernel.h> 74#include <sys/sysctl.h> 75#include <sys/malloc.h> 76#include <sys/mbuf.h> 77#include <sys/domain.h> 78#include <sys/proc.h> 79#include <sys/kauth.h> 80#include <sys/socket.h> 81#include <sys/socketvar.h> 82#include <sys/protosw.h> 83#include <sys/random.h> 84#include <sys/syslog.h> 85#include <kern/locks.h> 86#include <kern/zalloc.h> 87 88#include <net/route.h> 89#include <net/if.h> 90 91#define _IP_VHL 92#include <netinet/in.h> 93#include <netinet/in_systm.h> 94#include <netinet/ip.h> 95#if INET6 96#include <netinet/ip6.h> 97#endif 98#include <netinet/in_pcb.h> 99#if INET6 100#include <netinet6/in6_pcb.h> 101#endif 102#include <netinet/in_var.h> 103#include <netinet/ip_var.h> 104#if INET6 105#include <netinet6/ip6_var.h> 106#endif 107#include <netinet/tcp.h> 108#include <netinet/tcp_fsm.h> 109#include <netinet/tcp_seq.h> 110#include <netinet/tcp_timer.h> 111#include <netinet/tcp_var.h> 112#if INET6 113#include <netinet6/tcp6_var.h> 114#endif 115#include <netinet/tcpip.h> 116#if TCPDEBUG 117#include <netinet/tcp_debug.h> 118#endif 119#include <netinet6/ip6protosw.h> 120 121#if IPSEC 122#include <netinet6/ipsec.h> 123#if INET6 124#include <netinet6/ipsec6.h> 125#endif 126#endif /*IPSEC*/ 127 128#if CONFIG_MACF_NET 129#include <security/mac_framework.h> 130#endif /* MAC_NET */ 131 132#include <libkern/crypto/md5.h> 133#include <sys/kdebug.h> 134 135#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) 136 137extern int tcp_lq_overflow; 138 139/* temporary: for testing */ 140#if IPSEC 141extern int ipsec_bypass; 142#endif 143 144int tcp_mssdflt = TCP_MSS; 145SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, 146 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 147 148#if INET6 149int tcp_v6mssdflt = TCP6_MSS; 150SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, 151 CTLFLAG_RW, &tcp_v6mssdflt , 0, 152 "Default TCP Maximum 
Segment Size for IPv6"); 153#endif 154 155/* 156 * Minimum MSS we accept and use. This prevents DoS attacks where 157 * we are forced to a ridiculous low MSS like 20 and send hundreds 158 * of packets instead of one. The effect scales with the available 159 * bandwidth and quickly saturates the CPU and network interface 160 * with packet generation and sending. Set to zero to disable MINMSS 161 * checking. This setting prevents us from sending too small packets. 162 */ 163int tcp_minmss = TCP_MINMSS; 164SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, 165 &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); 166 167/* 168 * Number of TCP segments per second we accept from remote host 169 * before we start to calculate average segment size. If average 170 * segment size drops below the minimum TCP MSS we assume a DoS 171 * attack and reset+drop the connection. Care has to be taken not to 172 * set this value too small to not kill interactive type connections 173 * (telnet, SSH) which send many small packets. 
174 */ 175#ifdef FIX_WORKAROUND_FOR_3894301 176__private_extern__ int tcp_minmssoverload = TCP_MINMSSOVERLOAD; 177#else 178__private_extern__ int tcp_minmssoverload = 0; 179#endif 180SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW, 181 &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" 182 "be under the MINMSS Size"); 183 184static int tcp_do_rfc1323 = 1; 185SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, 186 &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); 187 188static int tcp_do_rfc1644 = 0; 189SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, 190 &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); 191 192static int tcp_tcbhashsize = 0; 193SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, 194 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 195 196static int do_tcpdrain = 0; 197SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, 198 "Enable tcp_drain routine for extra help when low on mbufs"); 199 200SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, 201 &tcbinfo.ipi_count, 0, "Number of active PCBs"); 202 203static int icmp_may_rst = 1; 204SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, 205 "Certain ICMP unreachable messages may abort connections in SYN_SENT"); 206 207static int tcp_strict_rfc1948 = 0; 208SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW, 209 &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly"); 210 211static int tcp_isn_reseed_interval = 0; 212SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, 213 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); 214static int tcp_background_io_enabled = 1; 215SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW, 216 &tcp_background_io_enabled, 0, "Background IO Enabled"); 217 218int tcp_TCPTV_MIN = 1; 219SYSCTL_INT(_net_inet_tcp, 
OID_AUTO, rtt_min, CTLFLAG_RW, 220 &tcp_TCPTV_MIN, 0, "min rtt value allowed"); 221 222static void tcp_cleartaocache(void); 223static void tcp_notify(struct inpcb *, int); 224struct zone *sack_hole_zone; 225 226extern unsigned int total_mb_cnt; 227extern unsigned int total_cl_cnt; 228extern int sbspace_factor; 229extern int tcp_sockthreshold; 230extern int slowlink_wsize; /* window correction for slow links */ 231extern int path_mtu_discovery; 232 233 234/* 235 * Target size of TCP PCB hash tables. Must be a power of two. 236 * 237 * Note that this can be overridden by the kernel environment 238 * variable net.inet.tcp.tcbhashsize 239 */ 240#ifndef TCBHASHSIZE 241#define TCBHASHSIZE CONFIG_TCBHASHSIZE 242#endif 243 244/* 245 * This is the actual shape of what we allocate using the zone 246 * allocator. Doing it this way allows us to protect both structures 247 * using the same generation count, and also eliminates the overhead 248 * of allocating tcpcbs separately. By hiding the structure here, 249 * we avoid changing most of the rest of the code (although it needs 250 * to be changed, eventually, for greater efficiency). 
251 */ 252#define ALIGNMENT 32 253#define ALIGNM1 (ALIGNMENT - 1) 254struct inp_tp { 255 union { 256 struct inpcb inp; 257 char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; 258 } inp_tp_u; 259 struct tcpcb tcb; 260}; 261#undef ALIGNMENT 262#undef ALIGNM1 263 264static struct tcpcb dummy_tcb; 265 266 267extern struct inpcbhead time_wait_slots[]; 268extern int cur_tw_slot; 269extern u_long *delack_bitmask; 270extern u_long route_generation; 271 272int get_inpcb_str_size(void); 273int get_tcp_str_size(void); 274 275 276int get_inpcb_str_size(void) 277{ 278 return sizeof(struct inpcb); 279} 280 281 282int get_tcp_str_size(void) 283{ 284 return sizeof(struct tcpcb); 285} 286 287int tcp_freeq(struct tcpcb *tp); 288 289 290/* 291 * Tcp initialization 292 */ 293void 294tcp_init() 295{ 296 int hashsize = TCBHASHSIZE; 297 vm_size_t str_size; 298 int i; 299 struct inpcbinfo *pcbinfo; 300 301 tcp_ccgen = 1; 302 tcp_cleartaocache(); 303 304 tcp_keepinit = TCPTV_KEEP_INIT; 305 tcp_keepidle = TCPTV_KEEP_IDLE; 306 tcp_keepintvl = TCPTV_KEEPINTVL; 307 tcp_maxpersistidle = TCPTV_KEEP_IDLE; 308 tcp_msl = TCPTV_MSL; 309 read_random(&tcp_now, sizeof(tcp_now)); 310 tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal 100ms clock at a random value */ 311 312 313 LIST_INIT(&tcb); 314 tcbinfo.listhead = &tcb; 315 pcbinfo = &tcbinfo; 316 if (!powerof2(hashsize)) { 317 printf("WARNING: TCB hash size not a power of 2\n"); 318 hashsize = 512; /* safe default */ 319 } 320 tcp_tcbhashsize = hashsize; 321 tcbinfo.hashsize = hashsize; 322 tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); 323 tcbinfo.porthashbase = hashinit(hashsize, M_PCB, 324 &tcbinfo.porthashmask); 325 str_size = (vm_size_t) sizeof(struct inp_tp); 326 tcbinfo.ipi_zone = (void *) zinit(str_size, 120000*str_size, 8192, "tcpcb"); 327 sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone"); 328 tcp_reass_maxseg = nmbclusters / 16; 329 330#if INET6 331#define TCP_MINPROTOHDR (sizeof(struct 
ip6_hdr) + sizeof(struct tcphdr)) 332#else /* INET6 */ 333#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) 334#endif /* INET6 */ 335 if (max_protohdr < TCP_MINPROTOHDR) 336 max_protohdr = TCP_MINPROTOHDR; 337 if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) 338 panic("tcp_init"); 339#undef TCP_MINPROTOHDR 340 dummy_tcb.t_state = TCP_NSTATES; 341 dummy_tcb.t_flags = 0; 342 tcbinfo.dummy_cb = (caddr_t) &dummy_tcb; 343 344 /* 345 * allocate lock group attribute and group for tcp pcb mutexes 346 */ 347 pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); 348 pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr); 349 350 /* 351 * allocate the lock attribute for tcp pcb mutexes 352 */ 353 pcbinfo->mtx_attr = lck_attr_alloc_init(); 354 355 if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) { 356 printf("tcp_init: mutex not alloced!\n"); 357 return; /* pretty much dead if this fails... */ 358 } 359 360 361 in_pcb_nat_init(&tcbinfo, AF_INET, IPPROTO_TCP, SOCK_STREAM); 362 363 delack_bitmask = _MALLOC((4 * hashsize)/32, M_PCB, M_WAITOK); 364 if (delack_bitmask == 0) 365 panic("Delack Memory"); 366 367 for (i=0; i < (tcbinfo.hashsize / 32); i++) 368 delack_bitmask[i] = 0; 369 370 for (i=0; i < N_TIME_WAIT_SLOTS; i++) { 371 LIST_INIT(&time_wait_slots[i]); 372 } 373 374 timeout(tcp_fasttimo, NULL, hz/TCP_RETRANSHZ); 375} 376 377/* 378 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. 379 * tcp_template used to store this data in mbufs, but we now recopy it out 380 * of the tcpcb each time to conserve mbufs. 
381 */ 382void 383tcp_fillheaders(tp, ip_ptr, tcp_ptr) 384 struct tcpcb *tp; 385 void *ip_ptr; 386 void *tcp_ptr; 387{ 388 struct inpcb *inp = tp->t_inpcb; 389 struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; 390 391#if INET6 392 if ((inp->inp_vflag & INP_IPV6) != 0) { 393 struct ip6_hdr *ip6; 394 395 ip6 = (struct ip6_hdr *)ip_ptr; 396 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 397 (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); 398 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | 399 (IPV6_VERSION & IPV6_VERSION_MASK); 400 ip6->ip6_nxt = IPPROTO_TCP; 401 ip6->ip6_plen = sizeof(struct tcphdr); 402 ip6->ip6_src = inp->in6p_laddr; 403 ip6->ip6_dst = inp->in6p_faddr; 404 tcp_hdr->th_sum = 0; 405 } else 406#endif 407 { 408 struct ip *ip = (struct ip *) ip_ptr; 409 410 ip->ip_vhl = IP_VHL_BORING; 411 ip->ip_tos = 0; 412 ip->ip_len = 0; 413 ip->ip_id = 0; 414 ip->ip_off = 0; 415 ip->ip_ttl = 0; 416 ip->ip_sum = 0; 417 ip->ip_p = IPPROTO_TCP; 418 ip->ip_src = inp->inp_laddr; 419 ip->ip_dst = inp->inp_faddr; 420 tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 421 htons(sizeof(struct tcphdr) + IPPROTO_TCP)); 422 } 423 424 tcp_hdr->th_sport = inp->inp_lport; 425 tcp_hdr->th_dport = inp->inp_fport; 426 tcp_hdr->th_seq = 0; 427 tcp_hdr->th_ack = 0; 428 tcp_hdr->th_x2 = 0; 429 tcp_hdr->th_off = 5; 430 tcp_hdr->th_flags = 0; 431 tcp_hdr->th_win = 0; 432 tcp_hdr->th_urp = 0; 433} 434 435/* 436 * Create template to be used to send tcp packets on a connection. 437 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only 438 * use for this function is in keepalives, which use tcp_respond. 
439 */ 440struct tcptemp * 441tcp_maketemplate(tp) 442 struct tcpcb *tp; 443{ 444 struct mbuf *m; 445 struct tcptemp *n; 446 447 m = m_get(M_DONTWAIT, MT_HEADER); 448 if (m == NULL) 449 return (0); 450 m->m_len = sizeof(struct tcptemp); 451 n = mtod(m, struct tcptemp *); 452 453 tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); 454 return (n); 455} 456 457/* 458 * Send a single message to the TCP at address specified by 459 * the given TCP/IP header. If m == 0, then we make a copy 460 * of the tcpiphdr at ti and send directly to the addressed host. 461 * This is used to force keep alive messages out using the TCP 462 * template for a connection. If flags are given then we send 463 * a message back to the TCP which originated the * segment ti, 464 * and discard the mbuf containing it and any other attached mbufs. 465 * 466 * In any case the ack and sequence number of the transmitted 467 * segment are as specified by the parameters. 468 * 469 * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
470 */ 471void 472tcp_respond( 473 struct tcpcb *tp, 474 void *ipgen, 475 register struct tcphdr *th, 476 register struct mbuf *m, 477 tcp_seq ack, 478 tcp_seq seq, 479 int flags, 480 unsigned int ifscope 481 ) 482{ 483 register int tlen; 484 int win = 0; 485 struct route *ro = 0; 486 struct route sro; 487 struct ip *ip; 488 struct tcphdr *nth; 489#if INET6 490 struct route_in6 *ro6 = 0; 491 struct route_in6 sro6; 492 struct ip6_hdr *ip6; 493 int isipv6; 494#endif /* INET6 */ 495 496#if INET6 497 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; 498 ip6 = ipgen; 499#endif /* INET6 */ 500 ip = ipgen; 501 502 if (tp) { 503 if (!(flags & TH_RST)) { 504 win = tcp_sbspace(tp); 505 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 506 win = (long)TCP_MAXWIN << tp->rcv_scale; 507 } 508#if INET6 509 if (isipv6) 510 ro6 = &tp->t_inpcb->in6p_route; 511 else 512#endif /* INET6 */ 513 ro = &tp->t_inpcb->inp_route; 514 } else { 515#if INET6 516 if (isipv6) { 517 ro6 = &sro6; 518 bzero(ro6, sizeof *ro6); 519 } else 520#endif /* INET6 */ 521 { 522 ro = &sro; 523 bzero(ro, sizeof *ro); 524 } 525 } 526 if (m == 0) { 527 m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */ 528 if (m == NULL) 529 return; 530 tlen = 0; 531 m->m_data += max_linkhdr; 532#if INET6 533 if (isipv6) { 534 bcopy((caddr_t)ip6, mtod(m, caddr_t), 535 sizeof(struct ip6_hdr)); 536 ip6 = mtod(m, struct ip6_hdr *); 537 nth = (struct tcphdr *)(ip6 + 1); 538 } else 539#endif /* INET6 */ 540 { 541 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); 542 ip = mtod(m, struct ip *); 543 nth = (struct tcphdr *)(ip + 1); 544 } 545 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 546 flags = TH_ACK; 547 } else { 548 m_freem(m->m_next); 549 m->m_next = 0; 550 m->m_data = (caddr_t)ipgen; 551 /* m_len is set later */ 552 tlen = 0; 553#define xchg(a,b,type) { type t; t=a; a=b; b=t; } 554#if INET6 555 if (isipv6) { 556 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 557 nth = (struct tcphdr *)(ip6 + 1); 558 } else 
559#endif /* INET6 */ 560 { 561 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); 562 nth = (struct tcphdr *)(ip + 1); 563 } 564 if (th != nth) { 565 /* 566 * this is usually a case when an extension header 567 * exists between the IPv6 header and the 568 * TCP header. 569 */ 570 nth->th_sport = th->th_sport; 571 nth->th_dport = th->th_dport; 572 } 573 xchg(nth->th_dport, nth->th_sport, n_short); 574#undef xchg 575 } 576#if INET6 577 if (isipv6) { 578 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + 579 tlen)); 580 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 581 } else 582#endif 583 { 584 tlen += sizeof (struct tcpiphdr); 585 ip->ip_len = tlen; 586 ip->ip_ttl = ip_defttl; 587 } 588 m->m_len = tlen; 589 m->m_pkthdr.len = tlen; 590 m->m_pkthdr.rcvif = 0; 591#if CONFIG_MACF_NET 592 if (tp != NULL && tp->t_inpcb != NULL) { 593 /* 594 * Packet is associated with a socket, so allow the 595 * label of the response to reflect the socket label. 596 */ 597 mac_mbuf_label_associate_inpcb(tp->t_inpcb, m); 598 } else { 599 /* 600 * Packet is not associated with a socket, so possibly 601 * update the label in place. 602 */ 603 mac_netinet_tcp_reply(m); 604 } 605#endif 606 607#if CONFIG_IP_EDGEHOLE 608 if (tp && tp->t_inpcb) 609 ip_edgehole_mbuf_tag(tp->t_inpcb, m); 610#endif 611 612 nth->th_seq = htonl(seq); 613 nth->th_ack = htonl(ack); 614 nth->th_x2 = 0; 615 nth->th_off = sizeof (struct tcphdr) >> 2; 616 nth->th_flags = flags; 617 if (tp) 618 nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 619 else 620 nth->th_win = htons((u_short)win); 621 nth->th_urp = 0; 622#if INET6 623 if (isipv6) { 624 nth->th_sum = 0; 625 nth->th_sum = in6_cksum(m, IPPROTO_TCP, 626 sizeof(struct ip6_hdr), 627 tlen - sizeof(struct ip6_hdr)); 628 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, 629 ro6 && ro6->ro_rt ? 
630 ro6->ro_rt->rt_ifp : 631 NULL); 632 } else 633#endif /* INET6 */ 634 { 635 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 636 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 637 m->m_pkthdr.csum_flags = CSUM_TCP; 638 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 639 } 640#if TCPDEBUG 641 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 642 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); 643#endif 644#if IPSEC 645 if (ipsec_bypass == 0 && ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { 646 m_freem(m); 647 return; 648 } 649#endif 650#if INET6 651 if (isipv6) { 652 (void)ip6_output(m, NULL, ro6, 0, NULL, NULL, 0); 653 if (ro6 == &sro6 && ro6->ro_rt) { 654 rtfree(ro6->ro_rt); 655 ro6->ro_rt = NULL; 656 } 657 } else 658#endif /* INET6 */ 659 { 660 struct ip_out_args ipoa = { ifscope }; 661 662 (void) ip_output(m, NULL, ro, IP_OUTARGS, NULL, &ipoa); 663 664 if (ro == &sro && ro->ro_rt) { 665 rtfree(ro->ro_rt); 666 ro->ro_rt = NULL; 667 } 668 } 669} 670 671/* 672 * Create a new TCP control block, making an 673 * empty reassembly queue and hooking it to the argument 674 * protocol control block. The `inp' parameter must have 675 * come from the zone allocator set up in tcp_init(). 676 */ 677struct tcpcb * 678tcp_newtcpcb(inp) 679 struct inpcb *inp; 680{ 681 struct inp_tp *it; 682 register struct tcpcb *tp; 683 register struct socket *so = inp->inp_socket; 684#if INET6 685 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 686#endif /* INET6 */ 687 688 if (so->cached_in_sock_layer == 0) { 689 it = (struct inp_tp *)inp; 690 tp = &it->tcb; 691 } 692 else 693 tp = (struct tcpcb *) inp->inp_saved_ppcb; 694 695 bzero((char *) tp, sizeof(struct tcpcb)); 696 LIST_INIT(&tp->t_segq); 697 tp->t_maxseg = tp->t_maxopd = 698#if INET6 699 isipv6 ? 
tcp_v6mssdflt : 700#endif /* INET6 */ 701 tcp_mssdflt; 702 703 if (tcp_do_rfc1323) 704 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 705 tp->sack_enable = tcp_do_sack; 706 TAILQ_INIT(&tp->snd_holes); 707 tp->t_inpcb = inp; /* XXX */ 708 /* 709 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 710 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 711 * reasonable initial retransmit time. 712 */ 713 tp->t_srtt = TCPTV_SRTTBASE; 714 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 715 tp->t_rttmin = tcp_TCPTV_MIN; 716 tp->t_rxtcur = TCPTV_RTOBASE; 717 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 718 tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 719 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 720 tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; 721 tp->t_rcvtime = 0; 722 tp->t_bw_rtttime = 0; 723 /* 724 * IPv4 TTL initialization is necessary for an IPv6 socket as well, 725 * because the socket may be bound to an IPv6 wildcard address, 726 * which may match an IPv4-mapped IPv6 address. 727 */ 728 inp->inp_ip_ttl = ip_defttl; 729 inp->inp_ppcb = (caddr_t)tp; 730 return (tp); /* XXX */ 731} 732 733/* 734 * Drop a TCP connection, reporting 735 * the specified error. If connection is synchronized, 736 * then send a RST to peer. 
737 */ 738struct tcpcb * 739tcp_drop(tp, errno) 740 register struct tcpcb *tp; 741 int errno; 742{ 743 struct socket *so = tp->t_inpcb->inp_socket; 744 745 if (TCPS_HAVERCVDSYN(tp->t_state)) { 746 tp->t_state = TCPS_CLOSED; 747 (void) tcp_output(tp); 748 tcpstat.tcps_drops++; 749 } else 750 tcpstat.tcps_conndrops++; 751 if (errno == ETIMEDOUT && tp->t_softerror) 752 errno = tp->t_softerror; 753 so->so_error = errno; 754 return (tcp_close(tp)); 755} 756 757/* 758 * Close a TCP control block: 759 * discard all space held by the tcp 760 * discard internet protocol block 761 * wake up any sleepers 762 */ 763struct tcpcb * 764tcp_close(tp) 765 register struct tcpcb *tp; 766{ 767 struct inpcb *inp = tp->t_inpcb; 768 struct socket *so = inp->inp_socket; 769#if INET6 770 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 771#endif /* INET6 */ 772 register struct rtentry *rt; 773 int dosavessthresh; 774 775 if ( inp->inp_ppcb == NULL) /* tcp_close was called previously, bail */ 776 return NULL; 777 778 /* Clear the timers before we delete the PCB. */ 779 { 780 int i; 781 for (i = 0; i < TCPT_NTIMERS; i++) { 782 tp->t_timer[i] = 0; 783 } 784 } 785 786 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0); 787 switch (tp->t_state) 788 { 789 case TCPS_ESTABLISHED: 790 case TCPS_FIN_WAIT_1: 791 case TCPS_CLOSING: 792 case TCPS_CLOSE_WAIT: 793 case TCPS_LAST_ACK: 794 break; 795 } 796 797 /* 798 * If another thread for this tcp is currently in ip (indicated by 799 * the TF_SENDINPROG flag), defer the cleanup until after it returns 800 * back to tcp. This is done to serialize the close until after all 801 * pending output is finished, in order to avoid having the PCB be 802 * detached and the cached route cleaned, only for ip to cache the 803 * route back into the PCB again. Note that we've cleared all the 804 * timers at this point. 
Set TF_CLOSING to indicate to tcp_output() 805 * that is should call us again once it returns from ip; at that 806 * point both flags should be cleared and we can proceed further 807 * with the cleanup. 808 */ 809 if (tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) { 810 tp->t_flags |= TF_CLOSING; 811 return (NULL); 812 } 813 814 lck_mtx_lock(rt_mtx); 815 /* 816 * If we got enough samples through the srtt filter, 817 * save the rtt and rttvar in the routing entry. 818 * 'Enough' is arbitrarily defined as the 16 samples. 819 * 16 samples is enough for the srtt filter to converge 820 * to within 5% of the correct value; fewer samples and 821 * we could save a very bogus rtt. 822 * 823 * Don't update the default route's characteristics and don't 824 * update anything that the user "locked". 825 */ 826 if (tp->t_rttupdated >= 16) { 827 register u_long i = 0; 828 829#if INET6 830 if (isipv6) { 831 struct sockaddr_in6 *sin6; 832 833 if ((rt = inp->in6p_route.ro_rt) == NULL) 834 goto no_valid_rt; 835 sin6 = (struct sockaddr_in6 *)rt_key(rt); 836 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 837 goto no_valid_rt; 838 } 839 else 840#endif /* INET6 */ 841 rt = inp->inp_route.ro_rt; 842 if (rt == NULL || 843 ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr 844 == INADDR_ANY || rt->generation_id != route_generation) { 845 if (tp->t_state >= TCPS_CLOSE_WAIT) 846 tp->t_state = TCPS_CLOSING; 847 848 goto no_valid_rt; 849 } 850 851 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { 852 i = tp->t_srtt * 853 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); 854 if (rt->rt_rmx.rmx_rtt && i) 855 /* 856 * filter this update to half the old & half 857 * the new values, converting scale. 858 * See route.h and tcp_var.h for a 859 * description of the scaling constants. 
860 */ 861 rt->rt_rmx.rmx_rtt = 862 (rt->rt_rmx.rmx_rtt + i) / 2; 863 else 864 rt->rt_rmx.rmx_rtt = i; 865 tcpstat.tcps_cachedrtt++; 866 } 867 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { 868 i = tp->t_rttvar * 869 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); 870 if (rt->rt_rmx.rmx_rttvar && i) 871 rt->rt_rmx.rmx_rttvar = 872 (rt->rt_rmx.rmx_rttvar + i) / 2; 873 else 874 rt->rt_rmx.rmx_rttvar = i; 875 tcpstat.tcps_cachedrttvar++; 876 } 877 /* 878 * The old comment here said: 879 * update the pipelimit (ssthresh) if it has been updated 880 * already or if a pipesize was specified & the threshhold 881 * got below half the pipesize. I.e., wait for bad news 882 * before we start updating, then update on both good 883 * and bad news. 884 * 885 * But we want to save the ssthresh even if no pipesize is 886 * specified explicitly in the route, because such 887 * connections still have an implicit pipesize specified 888 * by the global tcp_sendspace. In the absence of a reliable 889 * way to calculate the pipesize, it will have to do. 890 */ 891 i = tp->snd_ssthresh; 892 if (rt->rt_rmx.rmx_sendpipe != 0) 893 dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); 894 else 895 dosavessthresh = (i < so->so_snd.sb_hiwat / 2); 896 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && 897 i != 0 && rt->rt_rmx.rmx_ssthresh != 0) 898 || dosavessthresh) { 899 /* 900 * convert the limit from user data bytes to 901 * packets then to packet data bytes. 902 */ 903 i = (i + tp->t_maxseg / 2) / tp->t_maxseg; 904 if (i < 2) 905 i = 2; 906 i *= (u_long)(tp->t_maxseg + 907#if INET6 908 (isipv6 ? 
sizeof (struct ip6_hdr) + 909 sizeof (struct tcphdr) : 910#endif 911 sizeof (struct tcpiphdr) 912#if INET6 913 ) 914#endif 915 ); 916 if (rt->rt_rmx.rmx_ssthresh) 917 rt->rt_rmx.rmx_ssthresh = 918 (rt->rt_rmx.rmx_ssthresh + i) / 2; 919 else 920 rt->rt_rmx.rmx_ssthresh = i; 921 tcpstat.tcps_cachedssthresh++; 922 } 923 } 924 rt = inp->inp_route.ro_rt; 925 if (rt) { 926 /* 927 * mark route for deletion if no information is 928 * cached. 929 */ 930 if ((so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow && 931 ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){ 932 if (rt->rt_rmx.rmx_rtt == 0) 933 rt->rt_flags |= RTF_DELCLONE; 934 } 935 } 936 no_valid_rt: 937 /* free the reassembly queue, if any */ 938 lck_mtx_unlock(rt_mtx); 939 940 (void) tcp_freeq(tp); 941 942 tcp_free_sackholes(tp); 943 944 /* Free the packet list */ 945 if (tp->t_pktlist_head != NULL) 946 m_freem_list(tp->t_pktlist_head); 947 TCP_PKTLIST_CLEAR(tp); 948 949#ifdef __APPLE__ 950 if (so->cached_in_sock_layer) 951 inp->inp_saved_ppcb = (caddr_t) tp; 952#endif 953 954 soisdisconnected(so); 955#if INET6 956 if (INP_CHECK_SOCKAF(so, AF_INET6)) 957 in6_pcbdetach(inp); 958 else 959#endif /* INET6 */ 960 in_pcbdetach(inp); 961 tcpstat.tcps_closed++; 962 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0); 963 return ((struct tcpcb *)0); 964} 965 966int 967tcp_freeq(tp) 968 struct tcpcb *tp; 969{ 970 971 register struct tseg_qent *q; 972 int rv = 0; 973 974 while((q = LIST_FIRST(&tp->t_segq)) != NULL) { 975 LIST_REMOVE(q, tqe_q); 976 m_freem(q->tqe_m); 977 FREE(q, M_TSEGQ); 978 tcp_reass_qsize--; 979 rv = 1; 980 } 981 return (rv); 982} 983 984void 985tcp_drain() 986{ 987 if (do_tcpdrain) 988 { 989 struct inpcb *inpb; 990 struct tcpcb *tcpb; 991 struct tseg_qent *te; 992 993 /* 994 * Walk the tcpbs, if existing, and flush the reassembly queue, 995 * if there is one... 
996 * XXX: The "Net/3" implementation doesn't imply that the TCP 997 * reassembly queue should be flushed, but in a situation 998 * where we're really low on mbufs, this is potentially 999 * usefull. 1000 */ 1001 if (!lck_rw_try_lock_exclusive(tcbinfo.mtx)) /* do it next time if the lock is in use */ 1002 return; 1003 1004 for (inpb = LIST_FIRST(tcbinfo.listhead); inpb; 1005 inpb = LIST_NEXT(inpb, inp_list)) { 1006 if ((tcpb = intotcpcb(inpb))) { 1007 while ((te = LIST_FIRST(&tcpb->t_segq)) 1008 != NULL) { 1009 LIST_REMOVE(te, tqe_q); 1010 m_freem(te->tqe_m); 1011 FREE(te, M_TSEGQ); 1012 tcp_reass_qsize--; 1013 } 1014 } 1015 } 1016 lck_rw_done(tcbinfo.mtx); 1017 1018 } 1019} 1020 1021/* 1022 * Notify a tcp user of an asynchronous error; 1023 * store error as soft error, but wake up user 1024 * (for now, won't do anything until can select for soft error). 1025 * 1026 * Do not wake up user since there currently is no mechanism for 1027 * reporting soft errors (yet - a kqueue filter may be added). 1028 */ 1029static void 1030tcp_notify(inp, error) 1031 struct inpcb *inp; 1032 int error; 1033{ 1034 struct tcpcb *tp; 1035 1036 if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) 1037 return; /* pcb is gone already */ 1038 1039 tp = (struct tcpcb *)inp->inp_ppcb; 1040 1041 /* 1042 * Ignore some errors if we are hooked up. 1043 * If connection hasn't completed, has retransmitted several times, 1044 * and receives a second error, give up now. This is better 1045 * than waiting a long time to establish a connection that 1046 * can never complete. 
1047 */ 1048 if (tp->t_state == TCPS_ESTABLISHED && 1049 (error == EHOSTUNREACH || error == ENETUNREACH || 1050 error == EHOSTDOWN)) { 1051 return; 1052 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 1053 tp->t_softerror) 1054 tcp_drop(tp, error); 1055 else 1056 tp->t_softerror = error; 1057#if 0 1058 wakeup((caddr_t) &so->so_timeo); 1059 sorwakeup(so); 1060 sowwakeup(so); 1061#endif 1062} 1063 1064static int 1065tcp_pcblist SYSCTL_HANDLER_ARGS 1066{ 1067#pragma unused(oidp, arg1, arg2) 1068 int error, i, n; 1069 struct inpcb *inp, **inp_list; 1070 inp_gen_t gencnt; 1071 struct xinpgen xig; 1072 int slot; 1073 1074 /* 1075 * The process of preparing the TCB list is too time-consuming and 1076 * resource-intensive to repeat twice on every request. 1077 */ 1078 lck_rw_lock_shared(tcbinfo.mtx); 1079 if (req->oldptr == USER_ADDR_NULL) { 1080 n = tcbinfo.ipi_count; 1081 req->oldidx = 2 * (sizeof xig) 1082 + (n + n/8) * sizeof(struct xtcpcb); 1083 lck_rw_done(tcbinfo.mtx); 1084 return 0; 1085 } 1086 1087 if (req->newptr != USER_ADDR_NULL) { 1088 lck_rw_done(tcbinfo.mtx); 1089 return EPERM; 1090 } 1091 1092 /* 1093 * OK, now we're committed to doing something. 
1094 */ 1095 gencnt = tcbinfo.ipi_gencnt; 1096 n = tcbinfo.ipi_count; 1097 1098 bzero(&xig, sizeof(xig)); 1099 xig.xig_len = sizeof xig; 1100 xig.xig_count = n; 1101 xig.xig_gen = gencnt; 1102 xig.xig_sogen = so_gencnt; 1103 error = SYSCTL_OUT(req, &xig, sizeof xig); 1104 if (error) { 1105 lck_rw_done(tcbinfo.mtx); 1106 return error; 1107 } 1108 /* 1109 * We are done if there is no pcb 1110 */ 1111 if (n == 0) { 1112 lck_rw_done(tcbinfo.mtx); 1113 return 0; 1114 } 1115 1116 inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1117 if (inp_list == 0) { 1118 lck_rw_done(tcbinfo.mtx); 1119 return ENOMEM; 1120 } 1121 1122 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; 1123 inp = LIST_NEXT(inp, inp_list)) { 1124#ifdef __APPLE__ 1125 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) 1126#else 1127 if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) 1128#endif 1129 inp_list[i++] = inp; 1130 } 1131 1132 for (slot = 0; slot < N_TIME_WAIT_SLOTS; slot++) { 1133 struct inpcb *inpnxt; 1134 1135 for (inp = time_wait_slots[slot].lh_first; inp && i < n; inp = inpnxt) { 1136 inpnxt = inp->inp_list.le_next; 1137 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) 1138 inp_list[i++] = inp; 1139 } 1140 } 1141 1142 n = i; 1143 1144 error = 0; 1145 for (i = 0; i < n; i++) { 1146 inp = inp_list[i]; 1147 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { 1148 struct xtcpcb xt; 1149 caddr_t inp_ppcb; 1150 1151 bzero(&xt, sizeof(xt)); 1152 xt.xt_len = sizeof xt; 1153 /* XXX should avoid extra copy */ 1154 inpcb_to_compat(inp, &xt.xt_inp); 1155 inp_ppcb = inp->inp_ppcb; 1156 if (inp_ppcb != NULL) { 1157 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); 1158 } 1159 else 1160 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 1161 if (inp->inp_socket) 1162 sotoxsocket(inp->inp_socket, &xt.xt_socket); 1163 error = SYSCTL_OUT(req, &xt, sizeof xt); 1164 } 1165 } 1166 if (!error) { 1167 /* 1168 * Give the user an updated 
idea of our state. 1169 * If the generation differs from what we told 1170 * her before, she knows that something happened 1171 * while we were processing this request, and it 1172 * might be necessary to retry. 1173 */ 1174 bzero(&xig, sizeof(xig)); 1175 xig.xig_len = sizeof xig; 1176 xig.xig_gen = tcbinfo.ipi_gencnt; 1177 xig.xig_sogen = so_gencnt; 1178 xig.xig_count = tcbinfo.ipi_count; 1179 error = SYSCTL_OUT(req, &xig, sizeof xig); 1180 } 1181 FREE(inp_list, M_TEMP); 1182 lck_rw_done(tcbinfo.mtx); 1183 return error; 1184} 1185 1186SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, 1187 tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 1188 1189#ifndef __APPLE__ 1190static int 1191tcp_getcred(SYSCTL_HANDLER_ARGS) 1192{ 1193 struct sockaddr_in addrs[2]; 1194 struct inpcb *inp; 1195 int error, s; 1196 1197 error = suser(req->p); 1198 if (error) 1199 return (error); 1200 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 1201 if (error) 1202 return (error); 1203 s = splnet(); 1204 inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, 1205 addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); 1206 if (inp == NULL || inp->inp_socket == NULL) { 1207 error = ENOENT; 1208 goto out; 1209 } 1210 error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(*(kauth_cred_t)0); 1211out: 1212 splx(s); 1213 return (error); 1214} 1215 1216SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 1217 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection"); 1218 1219#if INET6 1220static int 1221tcp6_getcred(SYSCTL_HANDLER_ARGS) 1222{ 1223 struct sockaddr_in6 addrs[2]; 1224 struct inpcb *inp; 1225 int error, s, mapped = 0; 1226 1227 error = suser(req->p); 1228 if (error) 1229 return (error); 1230 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 1231 if (error) 1232 return (error); 1233 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { 1234 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) 1235 mapped = 1; 1236 else 1237 return 
(EINVAL); 1238 } 1239 s = splnet(); 1240 if (mapped == 1) 1241 inp = in_pcblookup_hash(&tcbinfo, 1242 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], 1243 addrs[1].sin6_port, 1244 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], 1245 addrs[0].sin6_port, 1246 0, NULL); 1247 else 1248 inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, 1249 addrs[1].sin6_port, 1250 &addrs[0].sin6_addr, addrs[0].sin6_port, 1251 0, NULL); 1252 if (inp == NULL || inp->inp_socket == NULL) { 1253 error = ENOENT; 1254 goto out; 1255 } 1256 error = SYSCTL_OUT(req, inp->inp_socket->so_cred, 1257 sizeof(*(kauth_cred_t)0); 1258out: 1259 splx(s); 1260 return (error); 1261} 1262 1263SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 1264 0, 0, 1265 tcp6_getcred, "S,ucred", "Get the ucred of a TCP6 connection"); 1266#endif 1267#endif /* __APPLE__*/ 1268 1269void 1270tcp_ctlinput(cmd, sa, vip) 1271 int cmd; 1272 struct sockaddr *sa; 1273 void *vip; 1274{ 1275 struct ip *ip = vip; 1276 struct tcphdr *th; 1277 struct in_addr faddr; 1278 struct inpcb *inp; 1279 struct tcpcb *tp; 1280 void (*notify)(struct inpcb *, int) = tcp_notify; 1281 tcp_seq icmp_seq; 1282 1283 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1284 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1285 return; 1286 1287 if (cmd == PRC_QUENCH) 1288 notify = tcp_quench; 1289 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || 1290 cmd == PRC_UNREACH_PORT) && ip) 1291 notify = tcp_drop_syn_sent; 1292 else if (cmd == PRC_MSGSIZE) 1293 notify = tcp_mtudisc; 1294 else if (PRC_IS_REDIRECT(cmd)) { 1295 ip = 0; 1296 notify = in_rtchange; 1297 } else if (cmd == PRC_HOSTDEAD) 1298 ip = 0; 1299 else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) 1300 return; 1301 if (ip) { 1302 th = (struct tcphdr *)((caddr_t)ip 1303 + (IP_VHL_HL(ip->ip_vhl) << 2)); 1304 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, 1305 ip->ip_src, th->th_sport, 0, NULL); 1306 if (inp != NULL && 
inp->inp_socket != NULL) { 1307 tcp_lock(inp->inp_socket, 1, 0); 1308 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { 1309 tcp_unlock(inp->inp_socket, 1, 0); 1310 return; 1311 } 1312 icmp_seq = htonl(th->th_seq); 1313 tp = intotcpcb(inp); 1314 if (SEQ_GEQ(icmp_seq, tp->snd_una) && 1315 SEQ_LT(icmp_seq, tp->snd_max)) 1316 (*notify)(inp, inetctlerrmap[cmd]); 1317 tcp_unlock(inp->inp_socket, 1, 0); 1318 } 1319 } else 1320 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); 1321} 1322 1323#if INET6 1324void 1325tcp6_ctlinput(cmd, sa, d) 1326 int cmd; 1327 struct sockaddr *sa; 1328 void *d; 1329{ 1330 struct tcphdr th; 1331 void (*notify)(struct inpcb *, int) = tcp_notify; 1332 struct ip6_hdr *ip6; 1333 struct mbuf *m; 1334 struct ip6ctlparam *ip6cp = NULL; 1335 const struct sockaddr_in6 *sa6_src = NULL; 1336 int off; 1337 struct tcp_portonly { 1338 u_int16_t th_sport; 1339 u_int16_t th_dport; 1340 } *thp; 1341 1342 if (sa->sa_family != AF_INET6 || 1343 sa->sa_len != sizeof(struct sockaddr_in6)) 1344 return; 1345 1346 if (cmd == PRC_QUENCH) 1347 notify = tcp_quench; 1348 else if (cmd == PRC_MSGSIZE) 1349 notify = tcp_mtudisc; 1350 else if (!PRC_IS_REDIRECT(cmd) && 1351 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) 1352 return; 1353 1354 /* if the parameter is from icmp6, decode it. */ 1355 if (d != NULL) { 1356 ip6cp = (struct ip6ctlparam *)d; 1357 m = ip6cp->ip6c_m; 1358 ip6 = ip6cp->ip6c_ip6; 1359 off = ip6cp->ip6c_off; 1360 sa6_src = ip6cp->ip6c_src; 1361 } else { 1362 m = NULL; 1363 ip6 = NULL; 1364 off = 0; /* fool gcc */ 1365 sa6_src = &sa6_any; 1366 } 1367 1368 if (ip6) { 1369 /* 1370 * XXX: We assume that when IPV6 is non NULL, 1371 * M and OFF are valid. 
1372 */ 1373 1374 /* check if we can safely examine src and dst ports */ 1375 if (m->m_pkthdr.len < off + sizeof(*thp)) 1376 return; 1377 1378 bzero(&th, sizeof(th)); 1379 m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 1380 1381 in6_pcbnotify(&tcbinfo, sa, th.th_dport, 1382 (struct sockaddr *)ip6cp->ip6c_src, 1383 th.th_sport, cmd, notify); 1384 } else 1385 in6_pcbnotify(&tcbinfo, sa, 0, (struct sockaddr *)sa6_src, 1386 0, cmd, notify); 1387} 1388#endif /* INET6 */ 1389 1390 1391/* 1392 * Following is where TCP initial sequence number generation occurs. 1393 * 1394 * There are two places where we must use initial sequence numbers: 1395 * 1. In SYN-ACK packets. 1396 * 2. In SYN packets. 1397 * 1398 * The ISNs in SYN-ACK packets have no monotonicity requirement, 1399 * and should be as unpredictable as possible to avoid the possibility 1400 * of spoofing and/or connection hijacking. To satisfy this 1401 * requirement, SYN-ACK ISNs are generated via the arc4random() 1402 * function. If exact RFC 1948 compliance is requested via sysctl, 1403 * these ISNs will be generated just like those in SYN packets. 1404 * 1405 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 1406 * depends on this property. In addition, these ISNs should be 1407 * unguessable so as to prevent connection hijacking. To satisfy 1408 * the requirements of this situation, the algorithm outlined in 1409 * RFC 1948 is used to generate sequence numbers. 1410 * 1411 * For more information on the theory of operation, please see 1412 * RFC 1948. 1413 * 1414 * Implementation details: 1415 * 1416 * Time is based off the system timer, and is corrected so that it 1417 * increases by one megabyte per second. This allows for proper 1418 * recycling on high speed LANs while still leaving over an hour 1419 * before rollover. 1420 * 1421 * Two sysctls control the generation of ISNs: 1422 * 1423 * net.inet.tcp.isn_reseed_interval controls the number of seconds 1424 * between seeding of isn_secret. 
This is normally set to zero,
 * as reseeding should not be necessary.
 *
 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
 * strictly.  When strict compliance is requested, reseeding is
 * disabled and SYN-ACKs will be generated in the same manner as
 * SYNs.  Strict mode is disabled by default.
 *
 */

/* ISN clock advance rate: 1 MB/s of sequence space (see RFC 1948 notes above). */
#define ISN_BYTES_PER_SECOND 1048576

/*
 * Generate a new initial sequence number for the connection described
 * by tp.  SYN-ACKs (LISTEN/TIME_WAIT state) get a pure random ISN
 * unless strict RFC 1948 mode is enabled; SYNs get an RFC 1948-style
 * MD5(ports, addresses, secret) value plus a time-based offset.
 *
 * NOTE(review): isn_secret and isn_last_reseed are automatic (stack)
 * locals here, so isn_last_reseed is 0 on every entry and a fresh
 * secret is read from the RNG on every call; the upstream FreeBSD
 * version declares these static so the secret persists between calls
 * (which is what gives SYN ISNs their monotonicity). Confirm whether
 * the per-call reseed is intentional.
 */
tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;
	struct timeval timenow;
	u_char isn_secret[32];
	int isn_last_reseed = 0;
	MD5_CTX isn_ctx;

	/* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
	if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT))
	   && tcp_strict_rfc1948 == 0)
#ifdef __APPLE__
		return random();
#else
		return arc4random();
#endif
	getmicrotime(&timenow);

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) ||
	    ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
		< (u_int)timenow.tv_sec))) {
#ifdef __APPLE__
		read_random(&isn_secret, sizeof(isn_secret));
#else
		read_random_unlimited(&isn_secret, sizeof(isn_secret));
#endif
		isn_last_reseed = timenow.tv_sec;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	/* Hash the 4-tuple (ports + addresses) plus the secret, per RFC 1948. */
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#if INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
			  sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
			  sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
			  sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
			  sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	/* Add the time-based component so ISNs advance monotonically. */
	new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}

/*
 * When a source quench is received, close congestion window
 * to one segment.  We will gradually open it again as we proceed.
 */
void
tcp_quench(
	struct inpcb *inp,
	__unused int errno
)
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp) {
		/* Collapse cwnd to a single segment and restart byte counting. */
		tp->snd_cwnd = tp->t_maxseg;
		tp->t_bytes_acked = 0;
	}
}

/*
 * When a specific ICMP unreachable message is received and the
 * connection state is SYN-SENT, drop the connection.  This behavior
 * is controlled by the icmp_may_rst sysctl.
 */
void
tcp_drop_syn_sent(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp && tp->t_state == TCPS_SYN_SENT)
		tcp_drop(tp, errno);
}

/*
 * When `need fragmentation' ICMP is received, update our idea of the MSS
 * based on the new value in the route.  Also nudge TCP to send something,
 * since we know the packet we just sent was dropped.
1533 * This duplicates some code in the tcp_mss() function in tcp_input.c. 1534 */ 1535void 1536tcp_mtudisc( 1537 struct inpcb *inp, 1538 __unused int errno 1539) 1540{ 1541 struct tcpcb *tp = intotcpcb(inp); 1542 struct rtentry *rt; 1543 struct rmxp_tao *taop; 1544 struct socket *so = inp->inp_socket; 1545 int offered; 1546 int mss; 1547#if INET6 1548 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 1549#endif /* INET6 */ 1550 1551 if (tp) { 1552 lck_mtx_lock(rt_mtx); 1553#if INET6 1554 if (isipv6) 1555 rt = tcp_rtlookup6(inp); 1556 else 1557#endif /* INET6 */ 1558 rt = tcp_rtlookup(inp, IFSCOPE_NONE); 1559 if (!rt || !rt->rt_rmx.rmx_mtu) { 1560 tp->t_maxopd = tp->t_maxseg = 1561#if INET6 1562 isipv6 ? tcp_v6mssdflt : 1563#endif /* INET6 */ 1564 tcp_mssdflt; 1565 lck_mtx_unlock(rt_mtx); 1566 return; 1567 } 1568 taop = rmx_taop(rt->rt_rmx); 1569 offered = taop->tao_mssopt; 1570 mss = rt->rt_rmx.rmx_mtu - 1571#if INET6 1572 (isipv6 ? 1573 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : 1574#endif /* INET6 */ 1575 sizeof(struct tcpiphdr) 1576#if INET6 1577 ) 1578#endif /* INET6 */ 1579 ; 1580 1581 lck_mtx_unlock(rt_mtx); 1582 if (offered) 1583 mss = min(mss, offered); 1584 /* 1585 * XXX - The above conditional probably violates the TCP 1586 * spec. The problem is that, since we don't know the 1587 * other end's MSS, we are supposed to use a conservative 1588 * default. But, if we do that, then MTU discovery will 1589 * never actually take place, because the conservative 1590 * default is much less than the MTUs typically seen 1591 * on the Internet today. For the moment, we'll sweep 1592 * this under the carpet. 1593 * 1594 * The conservative default might not actually be a problem 1595 * if the only case this occurs is when sending an initial 1596 * SYN with options and data to a host we've never talked 1597 * to before. Then, they will reply with an MSS value which 1598 * will get recorded and the new parameters should get 1599 * recomputed. 
For Further Study. 1600 */ 1601 if (tp->t_maxopd <= mss) 1602 return; 1603 tp->t_maxopd = mss; 1604 1605 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 1606 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 1607 mss -= TCPOLEN_TSTAMP_APPA; 1608 1609 if (so->so_snd.sb_hiwat < mss) 1610 mss = so->so_snd.sb_hiwat; 1611 1612 tp->t_maxseg = mss; 1613 1614 tcpstat.tcps_mturesent++; 1615 tp->t_rtttime = 0; 1616 tp->snd_nxt = tp->snd_una; 1617 tcp_output(tp); 1618 } 1619} 1620 1621/* 1622 * Look-up the routing entry to the peer of this inpcb. If no route 1623 * is found and it cannot be allocated then return NULL. This routine 1624 * is called by TCP routines that access the rmx structure and by tcp_mss 1625 * to get the interface MTU. 1626 */ 1627struct rtentry * 1628tcp_rtlookup(inp, input_ifscope) 1629 struct inpcb *inp; 1630 unsigned int input_ifscope; 1631{ 1632 struct route *ro; 1633 struct rtentry *rt; 1634 struct tcpcb *tp; 1635 1636 ro = &inp->inp_route; 1637 if (ro == NULL) 1638 return (NULL); 1639 rt = ro->ro_rt; 1640 1641 lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); 1642 1643 if (rt == NULL || !(rt->rt_flags & RTF_UP) || rt->generation_id != route_generation) { 1644 /* No route yet, so try to acquire one */ 1645 if (inp->inp_faddr.s_addr != INADDR_ANY) { 1646 unsigned int ifscope; 1647 1648 ro->ro_dst.sa_family = AF_INET; 1649 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 1650 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = 1651 inp->inp_faddr; 1652 1653 /* 1654 * If the socket was bound to an interface, then 1655 * the bound-to-interface takes precedence over 1656 * the inbound interface passed in by the caller 1657 * (if we get here as part of the output path then 1658 * input_ifscope is IFSCOPE_NONE). 1659 */ 1660 ifscope = (inp->inp_flags & INP_BOUND_IF) ? 
1661 inp->inp_boundif : input_ifscope; 1662 1663 rtalloc_scoped_ign_locked(ro, 0UL, ifscope); 1664 rt = ro->ro_rt; 1665 } 1666 } 1667 if (rt != NULL && rt->rt_ifp != NULL) 1668 somultipages(inp->inp_socket, 1669 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); 1670 1671 /* 1672 * Update MTU discovery determination. Don't do it if: 1673 * 1) it is disabled via the sysctl 1674 * 2) the route isn't up 1675 * 3) the MTU is locked (if it is, then discovery has been 1676 * disabled) 1677 */ 1678 1679 tp = intotcpcb(inp); 1680 1681 if (!path_mtu_discovery || ((rt != NULL) && 1682 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) 1683 tp->t_flags &= ~TF_PMTUD; 1684 else 1685 tp->t_flags |= TF_PMTUD; 1686 1687#ifdef IFEF_NOWINDOWSCALE 1688 if (tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && 1689 (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE) != 0) 1690 { 1691 // Timestamps are not enabled on this interface 1692 tp->t_flags &= ~(TF_REQ_SCALE); 1693 } 1694#endif 1695 1696 return rt; 1697} 1698 1699#if INET6 1700struct rtentry * 1701tcp_rtlookup6(inp) 1702 struct inpcb *inp; 1703{ 1704 struct route_in6 *ro6; 1705 struct rtentry *rt; 1706 struct tcpcb *tp; 1707 1708 lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); 1709 1710 ro6 = &inp->in6p_route; 1711 rt = ro6->ro_rt; 1712 if (rt == NULL || !(rt->rt_flags & RTF_UP)) { 1713 /* No route yet, so try to acquire one */ 1714 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { 1715 struct sockaddr_in6 *dst6; 1716 1717 dst6 = (struct sockaddr_in6 *)&ro6->ro_dst; 1718 dst6->sin6_family = AF_INET6; 1719 dst6->sin6_len = sizeof(*dst6); 1720 dst6->sin6_addr = inp->in6p_faddr; 1721 rtalloc_ign_locked((struct route *)ro6, 0UL); 1722 rt = ro6->ro_rt; 1723 } 1724 } 1725 if (rt != NULL && rt->rt_ifp != NULL) 1726 somultipages(inp->inp_socket, 1727 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); 1728 /* 1729 * Update path MTU Discovery determination 1730 * while looking up the route: 1731 * 1) we have a valid route to 
the destination 1732 * 2) the MTU is not locked (if it is, then discovery has been 1733 * disabled) 1734 */ 1735 1736 1737 tp = intotcpcb(inp); 1738 1739 /* 1740 * Update MTU discovery determination. Don't do it if: 1741 * 1) it is disabled via the sysctl 1742 * 2) the route isn't up 1743 * 3) the MTU is locked (if it is, then discovery has been 1744 * disabled) 1745 */ 1746 1747 if (!path_mtu_discovery || ((rt != NULL) && 1748 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) 1749 tp->t_flags &= ~TF_PMTUD; 1750 else 1751 tp->t_flags |= TF_PMTUD; 1752 1753 return rt; 1754} 1755#endif /* INET6 */ 1756 1757#if IPSEC 1758/* compute ESP/AH header size for TCP, including outer IP header. */ 1759size_t 1760ipsec_hdrsiz_tcp(tp) 1761 struct tcpcb *tp; 1762{ 1763 struct inpcb *inp; 1764 struct mbuf *m; 1765 size_t hdrsiz; 1766 struct ip *ip; 1767#if INET6 1768 struct ip6_hdr *ip6 = NULL; 1769#endif /* INET6 */ 1770 struct tcphdr *th; 1771 1772 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) 1773 return 0; 1774 MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */ 1775 if (!m) 1776 return 0; 1777 1778#if INET6 1779 if ((inp->inp_vflag & INP_IPV6) != 0) { 1780 ip6 = mtod(m, struct ip6_hdr *); 1781 th = (struct tcphdr *)(ip6 + 1); 1782 m->m_pkthdr.len = m->m_len = 1783 sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1784 tcp_fillheaders(tp, ip6, th); 1785 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 1786 } else 1787#endif /* INET6 */ 1788 { 1789 ip = mtod(m, struct ip *); 1790 th = (struct tcphdr *)(ip + 1); 1791 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 1792 tcp_fillheaders(tp, ip, th); 1793 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 1794 } 1795 m_free(m); 1796 return hdrsiz; 1797} 1798#endif /*IPSEC*/ 1799 1800/* 1801 * Return a pointer to the cached information about the remote host. 1802 * The cached information is stored in the protocol specific part of 1803 * the route metrics. 
 */
struct rmxp_tao *
tcp_gettaocache(inp)
	struct inpcb *inp;
{
	struct rtentry *rt;
	struct rmxp_tao *taop;

	/* Take rt_mtx for the route lookup; released on all exit paths. */
	lck_mtx_lock(rt_mtx);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0)
		rt = tcp_rtlookup6(inp);
	else
#endif /* INET6 */
	rt = tcp_rtlookup(inp, IFSCOPE_NONE);

	/* Make sure this is a host route and is up. */
	if (rt == NULL ||
	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) {
		lck_mtx_unlock(rt_mtx);
		return NULL;
	}

	/*
	 * NOTE(review): the taop pointer is returned after rt_mtx is
	 * dropped, so it points into route metrics no longer protected by
	 * the lock; presumably the route stays referenced via the inpcb's
	 * route cache — confirm against callers.
	 */
	taop = rmx_taop(rt->rt_rmx);
	lck_mtx_unlock(rt_mtx);
	return (taop);
}

/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the routing
 * routing tables are initialized at the same time when TCP, so there is
 * nothing in the cache left over.
 */
static void
tcp_cleartaocache()
{
}

/*
 * Lock a TCP socket: takes the per-inpcb mutex (or the protocol domain
 * mutex if the pcb is gone — though the preceding panic makes that path
 * unreachable in practice), optionally takes a use-count reference, and
 * records the caller's address in the lock-debugging ring buffer.
 */
int
tcp_lock(so, refcount, lr)
	struct socket *so;
	int refcount;
	int lr;
{
	int lr_saved;
	/*
	 * NOTE(review): storing __builtin_return_address(0) in an int
	 * truncates the pointer on 64-bit kernels; debug-only data, but
	 * worth confirming.
	 */
	if (lr == 0)
		lr_saved = (unsigned int) __builtin_return_address(0);
	else lr_saved = lr;

	if (so->so_pcb) {
		lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx);
	}
	else {
		panic("tcp_lock: so=%p NO PCB! lr=%x\n", so, lr_saved);
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
	}

	if (so->so_usecount < 0)
		panic("tcp_lock: so=%p so_pcb=%p lr=%x ref=%x\n",
		so, so->so_pcb, lr_saved, so->so_usecount);

	if (refcount)
		so->so_usecount++;
	/* Record the caller for lock debugging. */
	so->lock_lr[so->next_lock_lr] = (u_int32_t)lr_saved;
	so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
	return (0);
}

/*
 * Unlock a TCP socket: drops the optional use-count reference, records
 * the caller in the unlock-debugging ring buffer, and releases the
 * per-inpcb mutex.  Panics on inconsistent use counts or a missing pcb.
 */
int
tcp_unlock(so, refcount, lr)
	struct socket *so;
	int refcount;
	int lr;
{
	int lr_saved;
	if (lr == 0)
		lr_saved = (unsigned int) __builtin_return_address(0);
	else lr_saved = lr;

#ifdef MORE_TCPLOCK_DEBUG
	printf("tcp_unlock: so=%p sopcb=%x lock=%x ref=%x lr=%x\n",
	    so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount, lr_saved);
#endif
	if (refcount)
		so->so_usecount--;

	if (so->so_usecount < 0)
		panic("tcp_unlock: so=%p usecount=%x\n", so, so->so_usecount);
	if (so->so_pcb == NULL)
		panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%x\n", so, so->so_usecount, lr_saved);
	else {
		lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
		so->unlock_lr[so->next_unlock_lr] = (u_int32_t)lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
		lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx);
	}
	return (0);
}

/*
 * Return the mutex protecting this TCP socket: the per-inpcb mutex when
 * the pcb exists, otherwise (after panicking) the protocol domain mutex.
 */
lck_mtx_t *
tcp_getlock(
	struct socket *so,
	__unused int locktype)
{
	struct inpcb *inp = sotoinpcb(so);

	if (so->so_pcb)  {
		if (so->so_usecount < 0)
			panic("tcp_getlock: so=%p usecount=%x\n", so, so->so_usecount);
		return(inp->inpcb_mtx);
	}
	else {
		panic("tcp_getlock: so=%p NULL so_pcb\n", so);
		return (so->so_proto->pr_domain->dom_mtx);
	}
}

/*
 * Compute the receive-window space to advertise for tp, starting from
 * the free space in the receive socket buffer and then applying
 * background-traffic suppression, persist-mode protection, slow-link
 * clipping, and (when mbuf resources allow) window inflation up to
 * rcv_maxbyps.
 */
long
tcp_sbspace(struct tcpcb *tp)
{
	struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv;
	long space, newspace;

	/* Free space is the tighter of the byte and mbuf limits. */
	space =  ((long) lmin((sb->sb_hiwat - sb->sb_cc),
		 (sb->sb_mbmax - sb->sb_mbcnt)));

#if TRAFFIC_MGT
	if (tp->t_inpcb->inp_socket->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND) {
		if (tcp_background_io_enabled &&
			tp->t_inpcb->inp_socket->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_SUPPRESSED) {
			tp->t_flags |= TF_RXWIN0SENT;
			return 0; /* Triggers TCP window closing by responding there is no space */
		}
	}
#endif /* TRAFFIC_MGT */

	/* Avoid increasing window size if the current window
	 * is already very low; we could be in "persist" mode and
	 * we could break some apps (see rdar://5409343)
	 */

	if (space < tp->t_maxseg)
		return space;

	/* Clip window size for slower link */

	if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 )
		return lmin(space, slowlink_wsize);

	/*
	 * Check for resource constraints before over-adjusting the amount
	 * of space we can advertise in the TCP window size updates.
	 */

	if (sbspace_factor && (tp->t_inpcb->inp_pcbinfo->ipi_count < tcp_sockthreshold) &&
	    (total_mb_cnt / 8) < (mbstat.m_clusters / sbspace_factor)) {
		if (space < (long)(sb->sb_maxused - sb->sb_cc)) {/* make sure we don't constrain the window if we have enough resources */
			space = (long) lmax((sb->sb_maxused - sb->sb_cc), tp->rcv_maxbyps);
		}
		newspace = (long) lmax(((long)sb->sb_maxused - sb->sb_cc), (long)tp->rcv_maxbyps);

		if (newspace > space)
			space = newspace;
	}
	return space;
}
/* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */