tcp_input.c revision 221023
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (c) 2007-2008,2010 5 * Swinburne University of Technology, Melbourne, Australia. 6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7 * Copyright (c) 2010 The FreeBSD Foundation 8 * All rights reserved. 9 * 10 * Portions of this software were developed at the Centre for Advanced Internet 11 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 12 * James Healy and David Hayes, made possible in part by a grant from the Cisco 13 * University Research Program Fund at Community Foundation Silicon Valley. 14 * 15 * Portions of this software were developed at the Centre for Advanced 16 * Internet Architectures, Swinburne University of Technology, Melbourne, 17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 44 */ 45 46#include <sys/cdefs.h> 47__FBSDID("$FreeBSD: head/sys/netinet/tcp_input.c 221023 2011-04-25 17:13:40Z attilio $"); 48 49#include "opt_ipfw.h" /* for ipfw_fwd */ 50#include "opt_inet.h" 51#include "opt_inet6.h" 52#include "opt_ipsec.h" 53#include "opt_tcpdebug.h" 54 55#include <sys/param.h> 56#include <sys/kernel.h> 57#include <sys/hhook.h> 58#include <sys/malloc.h> 59#include <sys/mbuf.h> 60#include <sys/proc.h> /* for proc0 declaration */ 61#include <sys/protosw.h> 62#include <sys/signalvar.h> 63#include <sys/socket.h> 64#include <sys/socketvar.h> 65#include <sys/sysctl.h> 66#include <sys/syslog.h> 67#include <sys/systm.h> 68 69#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 70 71#include <vm/uma.h> 72 73#include <net/if.h> 74#include <net/route.h> 75#include <net/vnet.h> 76 77#define TCPSTATES /* for logging */ 78 79#include <netinet/cc.h> 80#include <netinet/in.h> 81#include <netinet/in_pcb.h> 82#include <netinet/in_systm.h> 83#include <netinet/in_var.h> 84#include <netinet/ip.h> 85#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 86#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 87#include <netinet/ip_var.h> 88#include <netinet/ip_options.h> 89#include <netinet/ip6.h> 90#include <netinet/icmp6.h> 91#include <netinet6/in6_pcb.h> 92#include <netinet6/ip6_var.h> 93#include <netinet6/nd6.h> 94#include <netinet/tcp_fsm.h> 95#include <netinet/tcp_seq.h> 96#include <netinet/tcp_timer.h> 97#include <netinet/tcp_var.h> 98#include <netinet6/tcp6_var.h> 99#include <netinet/tcpip.h> 100#include <netinet/tcp_syncache.h> 101#ifdef TCPDEBUG 102#include <netinet/tcp_debug.h> 103#endif /* TCPDEBUG */ 104 105#ifdef IPSEC 106#include <netipsec/ipsec.h> 107#include <netipsec/ipsec6.h> 108#endif /*IPSEC*/ 109 110#include <machine/in_cksum.h> 111 112#include <security/mac/mac_framework.h> 113 114const int tcprexmtthresh = 3; 115 116VNET_DEFINE(struct tcpstat, tcpstat); 117SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, 118 &VNET_NAME(tcpstat), tcpstat, 119 "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); 120 121int tcp_log_in_vain = 0; 122SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, 123 &tcp_log_in_vain, 0, 124 "Log all incoming TCP segments to closed ports"); 125 126VNET_DEFINE(int, blackhole) = 0; 127#define V_blackhole VNET(blackhole) 128SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, 129 &VNET_NAME(blackhole), 0, 130 "Do not send RST on segments to closed ports"); 131 132VNET_DEFINE(int, tcp_delack_enabled) = 1; 133SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, 134 &VNET_NAME(tcp_delack_enabled), 0, 135 "Delay ACK to try and piggyback it onto a data packet"); 136 137VNET_DEFINE(int, drop_synfin) = 0; 138#define V_drop_synfin VNET(drop_synfin) 139SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, 140 &VNET_NAME(drop_synfin), 0, 141 "Drop TCP packets with SYN+FIN set"); 142 143VNET_DEFINE(int, tcp_do_rfc3042) = 1; 144#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) 145SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, 146 &VNET_NAME(tcp_do_rfc3042), 0, 147 "Enable RFC 3042 (Limited Transmit)"); 148 149VNET_DEFINE(int, tcp_do_rfc3390) = 1; 150SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, 151 &VNET_NAME(tcp_do_rfc3390), 0, 152 "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); 153 154VNET_DEFINE(int, tcp_do_rfc3465) = 1; 155SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, 156 &VNET_NAME(tcp_do_rfc3465), 0, 157 "Enable RFC 3465 (Appropriate Byte Counting)"); 158 159VNET_DEFINE(int, tcp_abc_l_var) = 2; 160SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, 161 &VNET_NAME(tcp_abc_l_var), 2, 162 "Cap the max cwnd increment during slow-start to this number of segments"); 163 164SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); 165 166VNET_DEFINE(int, tcp_do_ecn) = 0; 167SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, 168 &VNET_NAME(tcp_do_ecn), 0, 169 "TCP ECN support"); 170 171VNET_DEFINE(int, tcp_ecn_maxretries) = 1; 172SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, 173 &VNET_NAME(tcp_ecn_maxretries), 0, 174 "Max retries before giving up on ECN"); 175 176VNET_DEFINE(int, tcp_insecure_rst) = 0; 177#define V_tcp_insecure_rst VNET(tcp_insecure_rst) 178SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, 179 &VNET_NAME(tcp_insecure_rst), 0, 180 "Follow the old (insecure) criteria for accepting RST packets"); 181 182VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; 183#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 184SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, 185 &VNET_NAME(tcp_do_autorcvbuf), 0, 186 "Enable automatic receive buffer sizing"); 187 188VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; 189#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 190SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, 191 &VNET_NAME(tcp_autorcvbuf_inc), 0, 192 "Incrementor step size of automatic receive buffer"); 193 194VNET_DEFINE(int, tcp_autorcvbuf_max) = 256*1024; 195#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 196SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, 197 &VNET_NAME(tcp_autorcvbuf_max), 0, 198 "Max size of automatic receive buffer"); 199 200int tcp_read_locking = 1; 201SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW, 202 &tcp_read_locking, 0, "Enable read locking strategy"); 203 204VNET_DEFINE(struct inpcbhead, tcb); 205#define tcb6 tcb /* for KAME src sync over BSD*'s */ 206VNET_DEFINE(struct inpcbinfo, tcbinfo); 207 208static void tcp_dooptions(struct tcpopt *, u_char *, int, int); 209static void tcp_do_segment(struct mbuf *, struct tcphdr *, 210 struct socket *, struct tcpcb *, int, int, uint8_t, 211 int); 212static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, 213 struct tcpcb *, int, int); 214static void tcp_pulloutofband(struct socket *, 215 struct tcphdr *, struct mbuf *, int); 216static void tcp_xmit_timer(struct tcpcb *, int); 217static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); 218static void inline tcp_fields_to_host(struct tcphdr *); 219#ifdef TCP_SIGNATURE 220static void inline tcp_fields_to_net(struct tcphdr *); 221static int inline tcp_signature_verify_input(struct mbuf *, int, int, 222 int, struct tcpopt *, struct tcphdr *, u_int); 223#endif 224static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, 225 uint16_t type); 226static void inline cc_conn_init(struct tcpcb *tp); 227static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); 228static void inline hhook_run_tcp_est_in(struct tcpcb *tp, 229 struct tcphdr *th, struct tcpopt *to); 230 231/* 232 * Kernel module interface for updating tcpstat. The argument is an index 233 * into tcpstat treated as an array of u_long. While this encodes the 234 * general layout of tcpstat into the caller, it doesn't encode its location, 235 * so that future changes to add, for example, per-CPU stats support won't 236 * cause binary compatibility problems for kernel modules. 237 */ 238void 239kmod_tcpstat_inc(int statnum) 240{ 241 242 (*((u_long *)&V_tcpstat + statnum))++; 243} 244 245/* 246 * Wrapper for the TCP established input helper hook. 247 */ 248static void inline 249hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) 250{ 251 struct tcp_hhook_data hhook_data; 252 253 if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { 254 hhook_data.tp = tp; 255 hhook_data.th = th; 256 hhook_data.to = to; 257 258 hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, 259 tp->osd); 260 } 261} 262 263/* 264 * CC wrapper hook functions 265 */ 266static void inline 267cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) 268{ 269 INP_WLOCK_ASSERT(tp->t_inpcb); 270 271 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 272 if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd)) 273 tp->ccv->flags |= CCF_CWND_LIMITED; 274 else 275 tp->ccv->flags &= ~CCF_CWND_LIMITED; 276 277 if (type == CC_ACK) { 278 if (tp->snd_cwnd > tp->snd_ssthresh) { 279 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 280 V_tcp_abc_l_var * tp->t_maxseg); 281 if (tp->t_bytes_acked >= tp->snd_cwnd) { 282 tp->t_bytes_acked -= tp->snd_cwnd; 283 tp->ccv->flags |= CCF_ABC_SENTAWND; 284 } 285 } else { 286 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 287 tp->t_bytes_acked = 0; 288 } 289 } 290 291 if (CC_ALGO(tp)->ack_received != NULL) { 292 /* XXXLAS: Find a way to live without this */ 293 tp->ccv->curack = th->th_ack; 294 CC_ALGO(tp)->ack_received(tp->ccv, type); 295 } 296} 297 298static void inline 299cc_conn_init(struct tcpcb *tp) 300{ 301 struct hc_metrics_lite metrics; 302 struct inpcb *inp = tp->t_inpcb; 303 int rtt; 304#ifdef INET6 305 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 306#endif 307 308 INP_WLOCK_ASSERT(tp->t_inpcb); 309 310 tcp_hc_get(&inp->inp_inc, &metrics); 311 312 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { 313 tp->t_srtt = rtt; 314 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; 315 TCPSTAT_INC(tcps_usedrtt); 316 if (metrics.rmx_rttvar) { 317 tp->t_rttvar = metrics.rmx_rttvar; 318 TCPSTAT_INC(tcps_usedrttvar); 319 } else { 320 /* default variation is +- 1 rtt */ 321 tp->t_rttvar = 322 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 323 } 324 TCPT_RANGESET(tp->t_rxtcur, 325 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 326 tp->t_rttmin, TCPTV_REXMTMAX); 327 } 328 if (metrics.rmx_ssthresh) { 329 /* 330 * There's some sort of gateway or interface 331 * buffer limit on the path. Use this to set 332 * the slow start threshhold, but set the 333 * threshold to no less than 2*mss. 334 */ 335 tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); 336 TCPSTAT_INC(tcps_usedssthresh); 337 } 338 339 /* 340 * Set the slow-start flight size depending on whether this 341 * is a local network or not. 342 * 343 * Extend this so we cache the cwnd too and retrieve it here. 344 * Make cwnd even bigger than RFC3390 suggests but only if we 345 * have previous experience with the remote host. Be careful 346 * not make cwnd bigger than remote receive window or our own 347 * send socket buffer. Maybe put some additional upper bound 348 * on the retrieved cwnd. Should do incremental updates to 349 * hostcache when cwnd collapses so next connection doesn't 350 * overloads the path again. 351 * 352 * XXXAO: Initializing the CWND from the hostcache is broken 353 * and in its current form not RFC conformant. It is disabled 354 * until fixed or removed entirely. 355 * 356 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. 357 * We currently check only in syncache_socket for that. 358 */ 359/* #define TCP_METRICS_CWND */ 360#ifdef TCP_METRICS_CWND 361 if (metrics.rmx_cwnd) 362 tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2, 363 min(tp->snd_wnd, so->so_snd.sb_hiwat))); 364 else 365#endif 366 if (V_tcp_do_rfc3390) 367 tp->snd_cwnd = min(4 * tp->t_maxseg, 368 max(2 * tp->t_maxseg, 4380)); 369#ifdef INET6 370 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || 371 (!isipv6 && in_localaddr(inp->inp_faddr))) 372#else 373 else if (in_localaddr(inp->inp_faddr)) 374#endif 375 tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; 376 else 377 tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz; 378 379 if (CC_ALGO(tp)->conn_init != NULL) 380 CC_ALGO(tp)->conn_init(tp->ccv); 381} 382 383void inline 384cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 385{ 386 INP_WLOCK_ASSERT(tp->t_inpcb); 387 388 switch(type) { 389 case CC_NDUPACK: 390 if (!IN_FASTRECOVERY(tp->t_flags)) { 391 tp->snd_recover = tp->snd_max; 392 if (tp->t_flags & TF_ECN_PERMIT) 393 tp->t_flags |= TF_ECN_SND_CWR; 394 } 395 break; 396 case CC_ECN: 397 if (!IN_CONGRECOVERY(tp->t_flags)) { 398 TCPSTAT_INC(tcps_ecn_rcwnd); 399 tp->snd_recover = tp->snd_max; 400 if (tp->t_flags & TF_ECN_PERMIT) 401 tp->t_flags |= TF_ECN_SND_CWR; 402 } 403 break; 404 case CC_RTO: 405 tp->t_dupacks = 0; 406 tp->t_bytes_acked = 0; 407 EXIT_RECOVERY(tp->t_flags); 408 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / 409 tp->t_maxseg) * tp->t_maxseg; 410 tp->snd_cwnd = tp->t_maxseg; 411 break; 412 case CC_RTO_ERR: 413 TCPSTAT_INC(tcps_sndrexmitbad); 414 /* RTO was unnecessary, so reset everything. */ 415 tp->snd_cwnd = tp->snd_cwnd_prev; 416 tp->snd_ssthresh = tp->snd_ssthresh_prev; 417 tp->snd_recover = tp->snd_recover_prev; 418 if (tp->t_flags & TF_WASFRECOVERY) 419 ENTER_FASTRECOVERY(tp->t_flags); 420 if (tp->t_flags & TF_WASCRECOVERY) 421 ENTER_CONGRECOVERY(tp->t_flags); 422 tp->snd_nxt = tp->snd_max; 423 tp->t_badrxtwin = 0; 424 break; 425 } 426 427 if (CC_ALGO(tp)->cong_signal != NULL) { 428 if (th != NULL) 429 tp->ccv->curack = th->th_ack; 430 CC_ALGO(tp)->cong_signal(tp->ccv, type); 431 } 432} 433 434static void inline 435cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) 436{ 437 INP_WLOCK_ASSERT(tp->t_inpcb); 438 439 /* XXXLAS: KASSERT that we're in recovery? */ 440 441 if (CC_ALGO(tp)->post_recovery != NULL) { 442 tp->ccv->curack = th->th_ack; 443 CC_ALGO(tp)->post_recovery(tp->ccv); 444 } 445 /* XXXLAS: EXIT_RECOVERY ? */ 446 tp->t_bytes_acked = 0; 447} 448 449static inline void 450tcp_fields_to_host(struct tcphdr *th) 451{ 452 453 th->th_seq = ntohl(th->th_seq); 454 th->th_ack = ntohl(th->th_ack); 455 th->th_win = ntohs(th->th_win); 456 th->th_urp = ntohs(th->th_urp); 457} 458 459#ifdef TCP_SIGNATURE 460static inline void 461tcp_fields_to_net(struct tcphdr *th) 462{ 463 464 th->th_seq = htonl(th->th_seq); 465 th->th_ack = htonl(th->th_ack); 466 th->th_win = htons(th->th_win); 467 th->th_urp = htons(th->th_urp); 468} 469 470static inline int 471tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, 472 struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) 473{ 474 int ret; 475 476 tcp_fields_to_net(th); 477 ret = tcp_signature_verify(m, off0, tlen, optlen, to, th, tcpbflag); 478 tcp_fields_to_host(th); 479 return (ret); 480} 481#endif 482 483/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ 484#ifdef INET6 485#define ND6_HINT(tp) \ 486do { \ 487 if ((tp) && (tp)->t_inpcb && \ 488 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ 489 nd6_nud_hint(NULL, NULL, 0); \ 490} while (0) 491#else 492#define ND6_HINT(tp) 493#endif 494 495/* 496 * Indicate whether this ack should be delayed. We can delay the ack if 497 * - there is no delayed ack timer in progress and 498 * - our last ack wasn't a 0-sized window. We never want to delay 499 * the ack that opens up a 0-sized window and 500 * - delayed acks are enabled or 501 * - this is a half-synchronized T/TCP connection. 502 */ 503#define DELAY_ACK(tp) \ 504 ((!tcp_timer_active(tp, TT_DELACK) && \ 505 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 506 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 507 508/* 509 * TCP input handling is split into multiple parts: 510 * tcp6_input is a thin wrapper around tcp_input for the extended 511 * ip6_protox[] call format in ip6_input 512 * tcp_input handles primary segment validation, inpcb lookup and 513 * SYN processing on listen sockets 514 * tcp_do_segment processes the ACK and text of the segment for 515 * establishing, established and closing connections 516 */ 517#ifdef INET6 518int 519tcp6_input(struct mbuf **mp, int *offp, int proto) 520{ 521 struct mbuf *m = *mp; 522 struct in6_ifaddr *ia6; 523 524 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); 525 526 /* 527 * draft-itojun-ipv6-tcp-to-anycast 528 * better place to put this in? 529 */ 530 ia6 = ip6_getdstifaddr(m); 531 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 532 struct ip6_hdr *ip6; 533 534 ifa_free(&ia6->ia_ifa); 535 ip6 = mtod(m, struct ip6_hdr *); 536 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 537 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 538 return IPPROTO_DONE; 539 } 540 541 tcp_input(m, *offp); 542 return IPPROTO_DONE; 543} 544#endif 545 546void 547tcp_input(struct mbuf *m, int off0) 548{ 549 struct tcphdr *th; 550 struct ip *ip = NULL; 551 struct ipovly *ipov; 552 struct inpcb *inp = NULL; 553 struct tcpcb *tp = NULL; 554 struct socket *so = NULL; 555 u_char *optp = NULL; 556 int optlen = 0; 557 int len, tlen, off; 558 int drop_hdrlen; 559 int thflags; 560 int rstreason = 0; /* For badport_bandlim accounting purposes */ 561 uint8_t iptos; 562#ifdef TCP_SIGNATURE 563 uint8_t sig_checked = 0; 564#endif 565#ifdef IPFIREWALL_FORWARD 566 struct m_tag *fwd_tag; 567#endif 568#ifdef INET6 569 struct ip6_hdr *ip6 = NULL; 570 int isipv6; 571#else 572 const void *ip6 = NULL; 573 const int isipv6 = 0; 574#endif 575 struct tcpopt to; /* options in this segment */ 576 char *s = NULL; /* address and port logging */ 577 int ti_locked; 578#define TI_UNLOCKED 1 579#define TI_RLOCKED 2 580#define TI_WLOCKED 3 581 582#ifdef TCPDEBUG 583 /* 584 * The size of tcp_saveipgen must be the size of the max ip header, 585 * now IPv6. 586 */ 587 u_char tcp_saveipgen[IP6_HDR_LEN]; 588 struct tcphdr tcp_savetcp; 589 short ostate = 0; 590#endif 591 592#ifdef INET6 593 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 594#endif 595 596 to.to_flags = 0; 597 TCPSTAT_INC(tcps_rcvtotal); 598 599 if (isipv6) { 600#ifdef INET6 601 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ 602 ip6 = mtod(m, struct ip6_hdr *); 603 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 604 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 605 TCPSTAT_INC(tcps_rcvbadsum); 606 goto drop; 607 } 608 th = (struct tcphdr *)((caddr_t)ip6 + off0); 609 610 /* 611 * Be proactive about unspecified IPv6 address in source. 612 * As we use all-zero to indicate unbounded/unconnected pcb, 613 * unspecified IPv6 address can be used to confuse us. 614 * 615 * Note that packets with unspecified IPv6 destination is 616 * already dropped in ip6_input. 617 */ 618 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 619 /* XXX stat */ 620 goto drop; 621 } 622#else 623 th = NULL; /* XXX: Avoid compiler warning. */ 624#endif 625 } else { 626 /* 627 * Get IP and TCP header together in first mbuf. 628 * Note: IP leaves IP header in first mbuf. 629 */ 630 if (off0 > sizeof (struct ip)) { 631 ip_stripoptions(m, (struct mbuf *)0); 632 off0 = sizeof(struct ip); 633 } 634 if (m->m_len < sizeof (struct tcpiphdr)) { 635 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 636 == NULL) { 637 TCPSTAT_INC(tcps_rcvshort); 638 return; 639 } 640 } 641 ip = mtod(m, struct ip *); 642 ipov = (struct ipovly *)ip; 643 th = (struct tcphdr *)((caddr_t)ip + off0); 644 tlen = ip->ip_len; 645 646 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 647 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 648 th->th_sum = m->m_pkthdr.csum_data; 649 else 650 th->th_sum = in_pseudo(ip->ip_src.s_addr, 651 ip->ip_dst.s_addr, 652 htonl(m->m_pkthdr.csum_data + 653 ip->ip_len + 654 IPPROTO_TCP)); 655 th->th_sum ^= 0xffff; 656#ifdef TCPDEBUG 657 ipov->ih_len = (u_short)tlen; 658 ipov->ih_len = htons(ipov->ih_len); 659#endif 660 } else { 661 /* 662 * Checksum extended TCP header and data. 663 */ 664 len = sizeof (struct ip) + tlen; 665 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 666 ipov->ih_len = (u_short)tlen; 667 ipov->ih_len = htons(ipov->ih_len); 668 th->th_sum = in_cksum(m, len); 669 } 670 if (th->th_sum) { 671 TCPSTAT_INC(tcps_rcvbadsum); 672 goto drop; 673 } 674 /* Re-initialization for later version check */ 675 ip->ip_v = IPVERSION; 676 } 677 678#ifdef INET6 679 if (isipv6) 680 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 681 else 682#endif 683 iptos = ip->ip_tos; 684 685 /* 686 * Check that TCP offset makes sense, 687 * pull out TCP options and adjust length. XXX 688 */ 689 off = th->th_off << 2; 690 if (off < sizeof (struct tcphdr) || off > tlen) { 691 TCPSTAT_INC(tcps_rcvbadoff); 692 goto drop; 693 } 694 tlen -= off; /* tlen is used instead of ti->ti_len */ 695 if (off > sizeof (struct tcphdr)) { 696 if (isipv6) { 697#ifdef INET6 698 IP6_EXTHDR_CHECK(m, off0, off, ); 699 ip6 = mtod(m, struct ip6_hdr *); 700 th = (struct tcphdr *)((caddr_t)ip6 + off0); 701#endif 702 } else { 703 if (m->m_len < sizeof(struct ip) + off) { 704 if ((m = m_pullup(m, sizeof (struct ip) + off)) 705 == NULL) { 706 TCPSTAT_INC(tcps_rcvshort); 707 return; 708 } 709 ip = mtod(m, struct ip *); 710 ipov = (struct ipovly *)ip; 711 th = (struct tcphdr *)((caddr_t)ip + off0); 712 } 713 } 714 optlen = off - sizeof (struct tcphdr); 715 optp = (u_char *)(th + 1); 716 } 717 thflags = th->th_flags; 718 719 /* 720 * Convert TCP protocol specific fields to host format. 721 */ 722 tcp_fields_to_host(th); 723 724 /* 725 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 726 */ 727 drop_hdrlen = off0 + off; 728 729 /* 730 * Locate pcb for segment, which requires a lock on tcbinfo. 731 * Optimisticaly acquire a global read lock rather than a write lock 732 * unless header flags necessarily imply a state change. There are 733 * two cases where we might discover later we need a write lock 734 * despite the flags: ACKs moving a connection out of the syncache, 735 * and ACKs for a connection in TIMEWAIT. 736 */ 737 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 738 tcp_read_locking == 0) { 739 INP_INFO_WLOCK(&V_tcbinfo); 740 ti_locked = TI_WLOCKED; 741 } else { 742 INP_INFO_RLOCK(&V_tcbinfo); 743 ti_locked = TI_RLOCKED; 744 } 745 746findpcb: 747#ifdef INVARIANTS 748 if (ti_locked == TI_RLOCKED) 749 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 750 else if (ti_locked == TI_WLOCKED) 751 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 752 else 753 panic("%s: findpcb ti_locked %d\n", __func__, ti_locked); 754#endif 755 756#ifdef IPFIREWALL_FORWARD 757 /* 758 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 759 */ 760 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 761 762 if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ 763 struct sockaddr_in *next_hop; 764 765 next_hop = (struct sockaddr_in *)(fwd_tag+1); 766 /* 767 * Transparently forwarded. Pretend to be the destination. 768 * already got one like this? 769 */ 770 inp = in_pcblookup_hash(&V_tcbinfo, 771 ip->ip_src, th->th_sport, 772 ip->ip_dst, th->th_dport, 773 0, m->m_pkthdr.rcvif); 774 if (!inp) { 775 /* It's new. Try to find the ambushing socket. */ 776 inp = in_pcblookup_hash(&V_tcbinfo, 777 ip->ip_src, th->th_sport, 778 next_hop->sin_addr, 779 next_hop->sin_port ? 780 ntohs(next_hop->sin_port) : 781 th->th_dport, 782 INPLOOKUP_WILDCARD, 783 m->m_pkthdr.rcvif); 784 } 785 /* Remove the tag from the packet. We don't need it anymore. */ 786 m_tag_delete(m, fwd_tag); 787 } else 788#endif /* IPFIREWALL_FORWARD */ 789 { 790 if (isipv6) { 791#ifdef INET6 792 inp = in6_pcblookup_hash(&V_tcbinfo, 793 &ip6->ip6_src, th->th_sport, 794 &ip6->ip6_dst, th->th_dport, 795 INPLOOKUP_WILDCARD, 796 m->m_pkthdr.rcvif); 797#endif 798 } else 799 inp = in_pcblookup_hash(&V_tcbinfo, 800 ip->ip_src, th->th_sport, 801 ip->ip_dst, th->th_dport, 802 INPLOOKUP_WILDCARD, 803 m->m_pkthdr.rcvif); 804 } 805 806 /* 807 * If the INPCB does not exist then all data in the incoming 808 * segment is discarded and an appropriate RST is sent back. 809 * XXX MRT Send RST using which routing table? 810 */ 811 if (inp == NULL) { 812 /* 813 * Log communication attempts to ports that are not 814 * in use. 815 */ 816 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 817 tcp_log_in_vain == 2) { 818 if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) 819 log(LOG_INFO, "%s; %s: Connection attempt " 820 "to closed port\n", s, __func__); 821 } 822 /* 823 * When blackholing do not respond with a RST but 824 * completely ignore the segment and drop it. 825 */ 826 if ((V_blackhole == 1 && (thflags & TH_SYN)) || 827 V_blackhole == 2) 828 goto dropunlock; 829 830 rstreason = BANDLIM_RST_CLOSEDPORT; 831 goto dropwithreset; 832 } 833 INP_WLOCK(inp); 834 if (!(inp->inp_flags & INP_HW_FLOWID) 835 && (m->m_flags & M_FLOWID) 836 && ((inp->inp_socket == NULL) 837 || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { 838 inp->inp_flags |= INP_HW_FLOWID; 839 inp->inp_flags &= ~INP_SW_FLOWID; 840 inp->inp_flowid = m->m_pkthdr.flowid; 841 } 842#ifdef IPSEC 843#ifdef INET6 844 if (isipv6 && ipsec6_in_reject(m, inp)) { 845 V_ipsec6stat.in_polvio++; 846 goto dropunlock; 847 } else 848#endif /* INET6 */ 849 if (ipsec4_in_reject(m, inp) != 0) { 850 V_ipsec4stat.in_polvio++; 851 goto dropunlock; 852 } 853#endif /* IPSEC */ 854 855 /* 856 * Check the minimum TTL for socket. 857 */ 858 if (inp->inp_ip_minttl != 0) { 859#ifdef INET6 860 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) 861 goto dropunlock; 862 else 863#endif 864 if (inp->inp_ip_minttl > ip->ip_ttl) 865 goto dropunlock; 866 } 867 868 /* 869 * A previous connection in TIMEWAIT state is supposed to catch stray 870 * or duplicate segments arriving late. If this segment was a 871 * legitimate new connection attempt the old INPCB gets removed and 872 * we can try again to find a listening socket. 873 * 874 * At this point, due to earlier optimism, we may hold a read lock on 875 * the inpcbinfo, rather than a write lock. If so, we need to 876 * upgrade, or if that fails, acquire a reference on the inpcb, drop 877 * all locks, acquire a global write lock, and then re-acquire the 878 * inpcb lock. We may at that point discover that another thread has 879 * tried to free the inpcb, in which case we need to loop back and 880 * try to find a new inpcb to deliver to. 881 */ 882relocked: 883 if (inp->inp_flags & INP_TIMEWAIT) { 884 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 885 ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked)); 886 887 if (ti_locked == TI_RLOCKED) { 888 if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { 889 in_pcbref(inp); 890 INP_WUNLOCK(inp); 891 INP_INFO_RUNLOCK(&V_tcbinfo); 892 INP_INFO_WLOCK(&V_tcbinfo); 893 ti_locked = TI_WLOCKED; 894 INP_WLOCK(inp); 895 if (in_pcbrele(inp)) { 896 inp = NULL; 897 goto findpcb; 898 } 899 } else 900 ti_locked = TI_WLOCKED; 901 } 902 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 903 904#ifdef TCP_SIGNATURE 905 tcp_dooptions(&to, optp, optlen, 906 (thflags & TH_SYN) ? TO_SYN : 0); 907 if (sig_checked == 0) { 908 tp = intotcpcb(inp); 909 if (tp == NULL || tp->t_state == TCPS_CLOSED) { 910 rstreason = BANDLIM_RST_CLOSEDPORT; 911 goto dropwithreset; 912 } 913 if (!tcp_signature_verify_input(m, off0, tlen, optlen, 914 &to, th, tp->t_flags)) 915 goto dropunlock; 916 sig_checked = 1; 917 } 918#else 919 if (thflags & TH_SYN) 920 tcp_dooptions(&to, optp, optlen, TO_SYN); 921#endif 922 /* 923 * NB: tcp_twcheck unlocks the INP and frees the mbuf. 924 */ 925 if (tcp_twcheck(inp, &to, th, m, tlen)) 926 goto findpcb; 927 INP_INFO_WUNLOCK(&V_tcbinfo); 928 return; 929 } 930 /* 931 * The TCPCB may no longer exist if the connection is winding 932 * down or it is in the CLOSED state. Either way we drop the 933 * segment and send an appropriate response. 934 */ 935 tp = intotcpcb(inp); 936 if (tp == NULL || tp->t_state == TCPS_CLOSED) { 937 rstreason = BANDLIM_RST_CLOSEDPORT; 938 goto dropwithreset; 939 } 940 941 /* 942 * We've identified a valid inpcb, but it could be that we need an 943 * inpcbinfo write lock and have only a read lock. In this case, 944 * attempt to upgrade/relock using the same strategy as the TIMEWAIT 945 * case above. If we relock, we have to jump back to 'relocked' as 946 * the connection might now be in TIMEWAIT. 947 */ 948 if (tp->t_state != TCPS_ESTABLISHED || 949 (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 950 tcp_read_locking == 0) { 951 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 952 ("%s: upgrade check ti_locked %d", __func__, ti_locked)); 953 954 if (ti_locked == TI_RLOCKED) { 955 if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { 956 in_pcbref(inp); 957 INP_WUNLOCK(inp); 958 INP_INFO_RUNLOCK(&V_tcbinfo); 959 INP_INFO_WLOCK(&V_tcbinfo); 960 ti_locked = TI_WLOCKED; 961 INP_WLOCK(inp); 962 if (in_pcbrele(inp)) { 963 inp = NULL; 964 goto findpcb; 965 } 966 goto relocked; 967 } else 968 ti_locked = TI_WLOCKED; 969 } 970 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 971 } 972 973#ifdef MAC 974 INP_WLOCK_ASSERT(inp); 975 if (mac_inpcb_check_deliver(inp, m)) 976 goto dropunlock; 977#endif 978 so = inp->inp_socket; 979 KASSERT(so != NULL, ("%s: so == NULL", __func__)); 980#ifdef TCPDEBUG 981 if (so->so_options & SO_DEBUG) { 982 ostate = tp->t_state; 983 if (isipv6) { 984#ifdef INET6 985 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); 986#endif 987 } else 988 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 989 tcp_savetcp = *th; 990 } 991#endif 992 /* 993 * When the socket is accepting connections (the INPCB is in LISTEN 994 * state) we look into the SYN cache if this is a new connection 995 * attempt or the completion of a previous one. 996 */ 997 if (so->so_options & SO_ACCEPTCONN) { 998 struct in_conninfo inc; 999 1000 KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " 1001 "tp not listening", __func__)); 1002 1003 bzero(&inc, sizeof(inc)); 1004#ifdef INET6 1005 if (isipv6) { 1006 inc.inc_flags |= INC_ISIPV6; 1007 inc.inc6_faddr = ip6->ip6_src; 1008 inc.inc6_laddr = ip6->ip6_dst; 1009 } else 1010#endif 1011 { 1012 inc.inc_faddr = ip->ip_src; 1013 inc.inc_laddr = ip->ip_dst; 1014 } 1015 inc.inc_fport = th->th_sport; 1016 inc.inc_lport = th->th_dport; 1017 inc.inc_fibnum = so->so_fibnum; 1018 1019 /* 1020 * Check for an existing connection attempt in syncache if 1021 * the flag is only ACK. A successful lookup creates a new 1022 * socket appended to the listen queue in SYN_RECEIVED state. 1023 */ 1024 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 1025 /* 1026 * Parse the TCP options here because 1027 * syncookies need access to the reflected 1028 * timestamp. 1029 */ 1030 tcp_dooptions(&to, optp, optlen, 0); 1031 /* 1032 * NB: syncache_expand() doesn't unlock 1033 * inp and tcpinfo locks. 1034 */ 1035 if (!syncache_expand(&inc, &to, th, &so, m)) { 1036 /* 1037 * No syncache entry or ACK was not 1038 * for our SYN/ACK. Send a RST. 1039 * NB: syncache did its own logging 1040 * of the failure cause. 1041 */ 1042 rstreason = BANDLIM_RST_OPENPORT; 1043 goto dropwithreset; 1044 } 1045 if (so == NULL) { 1046 /* 1047 * We completed the 3-way handshake 1048 * but could not allocate a socket 1049 * either due to memory shortage, 1050 * listen queue length limits or 1051 * global socket limits. Send RST 1052 * or wait and have the remote end 1053 * retransmit the ACK for another 1054 * try. 1055 */ 1056 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1057 log(LOG_DEBUG, "%s; %s: Listen socket: " 1058 "Socket allocation failed due to " 1059 "limits or memory shortage, %s\n", 1060 s, __func__, 1061 V_tcp_sc_rst_sock_fail ? 1062 "sending RST" : "try again"); 1063 if (V_tcp_sc_rst_sock_fail) { 1064 rstreason = BANDLIM_UNLIMITED; 1065 goto dropwithreset; 1066 } else 1067 goto dropunlock; 1068 } 1069 /* 1070 * Socket is created in state SYN_RECEIVED. 1071 * Unlock the listen socket, lock the newly 1072 * created socket and update the tp variable. 1073 */ 1074 INP_WUNLOCK(inp); /* listen socket */ 1075 inp = sotoinpcb(so); 1076 INP_WLOCK(inp); /* new connection */ 1077 tp = intotcpcb(inp); 1078 KASSERT(tp->t_state == TCPS_SYN_RECEIVED, 1079 ("%s: ", __func__)); 1080#ifdef TCP_SIGNATURE 1081 if (sig_checked == 0) { 1082 tcp_dooptions(&to, optp, optlen, 1083 (thflags & TH_SYN) ? TO_SYN : 0); 1084 if (!tcp_signature_verify_input(m, off0, tlen, 1085 optlen, &to, th, tp->t_flags)) { 1086 1087 /* 1088 * In SYN_SENT state if it receives an 1089 * RST, it is allowed for further 1090 * processing. 1091 */ 1092 if ((thflags & TH_RST) == 0 || 1093 (tp->t_state == TCPS_SYN_SENT) == 0) 1094 goto dropunlock; 1095 } 1096 sig_checked = 1; 1097 } 1098#endif 1099 1100 /* 1101 * Process the segment and the data it 1102 * contains. tcp_do_segment() consumes 1103 * the mbuf chain and unlocks the inpcb. 1104 */ 1105 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, 1106 iptos, ti_locked); 1107 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1108 return; 1109 } 1110 /* 1111 * Segment flag validation for new connection attempts: 1112 * 1113 * Our (SYN|ACK) response was rejected. 1114 * Check with syncache and remove entry to prevent 1115 * retransmits. 1116 * 1117 * NB: syncache_chkrst does its own logging of failure 1118 * causes. 1119 */ 1120 if (thflags & TH_RST) { 1121 syncache_chkrst(&inc, th); 1122 goto dropunlock; 1123 } 1124 /* 1125 * We can't do anything without SYN. 1126 */ 1127 if ((thflags & TH_SYN) == 0) { 1128 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1129 log(LOG_DEBUG, "%s; %s: Listen socket: " 1130 "SYN is missing, segment ignored\n", 1131 s, __func__); 1132 TCPSTAT_INC(tcps_badsyn); 1133 goto dropunlock; 1134 } 1135 /* 1136 * (SYN|ACK) is bogus on a listen socket. 1137 */ 1138 if (thflags & TH_ACK) { 1139 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1140 log(LOG_DEBUG, "%s; %s: Listen socket: " 1141 "SYN|ACK invalid, segment rejected\n", 1142 s, __func__); 1143 syncache_badack(&inc); /* XXX: Not needed! */ 1144 TCPSTAT_INC(tcps_badsyn); 1145 rstreason = BANDLIM_RST_OPENPORT; 1146 goto dropwithreset; 1147 } 1148 /* 1149 * If the drop_synfin option is enabled, drop all 1150 * segments with both the SYN and FIN bits set. 1151 * This prevents e.g. nmap from identifying the 1152 * TCP/IP stack. 1153 * XXX: Poor reasoning. nmap has other methods 1154 * and is constantly refining its stack detection 1155 * strategies. 1156 * XXX: This is a violation of the TCP specification 1157 * and was used by RFC1644. 1158 */ 1159 if ((thflags & TH_FIN) && V_drop_synfin) { 1160 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1161 log(LOG_DEBUG, "%s; %s: Listen socket: " 1162 "SYN|FIN segment ignored (based on " 1163 "sysctl setting)\n", s, __func__); 1164 TCPSTAT_INC(tcps_badsyn); 1165 goto dropunlock; 1166 } 1167 /* 1168 * Segment's flags are (SYN) or (SYN|FIN). 1169 * 1170 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored 1171 * as they do not affect the state of the TCP FSM. 1172 * The data pointed to by TH_URG and th_urp is ignored. 1173 */ 1174 KASSERT((thflags & (TH_RST|TH_ACK)) == 0, 1175 ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); 1176 KASSERT(thflags & (TH_SYN), 1177 ("%s: Listen socket: TH_SYN not set", __func__)); 1178#ifdef INET6 1179 /* 1180 * If deprecated address is forbidden, 1181 * we do not accept SYN to deprecated interface 1182 * address to prevent any new inbound connection from 1183 * getting established. 1184 * When we do not accept SYN, we send a TCP RST, 1185 * with deprecated source address (instead of dropping 1186 * it). We compromise it as it is much better for peer 1187 * to send a RST, and RST will be the final packet 1188 * for the exchange. 1189 * 1190 * If we do not forbid deprecated addresses, we accept 1191 * the SYN packet. RFC2462 does not suggest dropping 1192 * SYN in this case. 1193 * If we decipher RFC2462 5.5.4, it says like this: 1194 * 1. use of deprecated addr with existing 1195 * communication is okay - "SHOULD continue to be 1196 * used" 1197 * 2. use of it with new communication: 1198 * (2a) "SHOULD NOT be used if alternate address 1199 * with sufficient scope is available" 1200 * (2b) nothing mentioned otherwise. 1201 * Here we fall into (2b) case as we have no choice in 1202 * our source address selection - we must obey the peer. 1203 * 1204 * The wording in RFC2462 is confusing, and there are 1205 * multiple description text for deprecated address 1206 * handling - worse, they are not exactly the same. 1207 * I believe 5.5.4 is the best one, so we follow 5.5.4. 1208 */ 1209 if (isipv6 && !V_ip6_use_deprecated) { 1210 struct in6_ifaddr *ia6; 1211 1212 ia6 = ip6_getdstifaddr(m); 1213 if (ia6 != NULL && 1214 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 1215 ifa_free(&ia6->ia_ifa); 1216 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1217 log(LOG_DEBUG, "%s; %s: Listen socket: " 1218 "Connection attempt to deprecated " 1219 "IPv6 address rejected\n", 1220 s, __func__); 1221 rstreason = BANDLIM_RST_OPENPORT; 1222 goto dropwithreset; 1223 } 1224 ifa_free(&ia6->ia_ifa); 1225 } 1226#endif 1227 /* 1228 * Basic sanity checks on incoming SYN requests: 1229 * Don't respond if the destination is a link layer 1230 * broadcast according to RFC1122 4.2.3.10, p. 104. 1231 * If it is from this socket it must be forged. 1232 * Don't respond if the source or destination is a 1233 * global or subnet broad- or multicast address. 1234 * Note that it is quite possible to receive unicast 1235 * link-layer packets with a broadcast IP address. Use 1236 * in_broadcast() to find them. 1237 */ 1238 if (m->m_flags & (M_BCAST|M_MCAST)) { 1239 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1240 log(LOG_DEBUG, "%s; %s: Listen socket: " 1241 "Connection attempt from broad- or multicast " 1242 "link layer address ignored\n", s, __func__); 1243 goto dropunlock; 1244 } 1245 if (isipv6) { 1246#ifdef INET6 1247 if (th->th_dport == th->th_sport && 1248 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { 1249 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1250 log(LOG_DEBUG, "%s; %s: Listen socket: " 1251 "Connection attempt to/from self " 1252 "ignored\n", s, __func__); 1253 goto dropunlock; 1254 } 1255 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 1256 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { 1257 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1258 log(LOG_DEBUG, "%s; %s: Listen socket: " 1259 "Connection attempt from/to multicast " 1260 "address ignored\n", s, __func__); 1261 goto dropunlock; 1262 } 1263#endif 1264 } else { 1265 if (th->th_dport == th->th_sport && 1266 ip->ip_dst.s_addr == ip->ip_src.s_addr) { 1267 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1268 log(LOG_DEBUG, "%s; %s: Listen socket: " 1269 "Connection attempt from/to self " 1270 "ignored\n", s, __func__); 1271 goto dropunlock; 1272 } 1273 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 1274 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 1275 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 1276 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { 1277 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1278 log(LOG_DEBUG, "%s; %s: Listen socket: " 1279 "Connection attempt from/to broad- " 1280 "or multicast address ignored\n", 1281 s, __func__); 1282 goto dropunlock; 1283 } 1284 } 1285 /* 1286 * SYN appears to be valid. Create compressed TCP state 1287 * for syncache. 1288 */ 1289#ifdef TCPDEBUG 1290 if (so->so_options & SO_DEBUG) 1291 tcp_trace(TA_INPUT, ostate, tp, 1292 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1293#endif 1294 tcp_dooptions(&to, optp, optlen, TO_SYN); 1295 syncache_add(&inc, &to, th, inp, &so, m); 1296 /* 1297 * Entry added to syncache and mbuf consumed. 1298 * Everything already unlocked by syncache_add(). 1299 */ 1300 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1301 return; 1302 } 1303 1304#ifdef TCP_SIGNATURE 1305 if (sig_checked == 0) { 1306 tcp_dooptions(&to, optp, optlen, 1307 (thflags & TH_SYN) ? TO_SYN : 0); 1308 if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to, 1309 th, tp->t_flags)) { 1310 1311 /* 1312 * In SYN_SENT state if it receives an RST, it is 1313 * allowed for further processing. 1314 */ 1315 if ((thflags & TH_RST) == 0 || 1316 (tp->t_state == TCPS_SYN_SENT) == 0) 1317 goto dropunlock; 1318 } 1319 sig_checked = 1; 1320 } 1321#endif 1322 1323 /* 1324 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later 1325 * state. tcp_do_segment() always consumes the mbuf chain, unlocks 1326 * the inpcb, and unlocks pcbinfo. 1327 */ 1328 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); 1329 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1330 return; 1331 1332dropwithreset: 1333 if (ti_locked == TI_RLOCKED) 1334 INP_INFO_RUNLOCK(&V_tcbinfo); 1335 else if (ti_locked == TI_WLOCKED) 1336 INP_INFO_WUNLOCK(&V_tcbinfo); 1337 else 1338 panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); 1339 ti_locked = TI_UNLOCKED; 1340 1341 if (inp != NULL) { 1342 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1343 INP_WUNLOCK(inp); 1344 } else 1345 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1346 m = NULL; /* mbuf chain got consumed. */ 1347 goto drop; 1348 1349dropunlock: 1350 if (ti_locked == TI_RLOCKED) 1351 INP_INFO_RUNLOCK(&V_tcbinfo); 1352 else if (ti_locked == TI_WLOCKED) 1353 INP_INFO_WUNLOCK(&V_tcbinfo); 1354 else 1355 panic("%s: dropunlock ti_locked %d", __func__, ti_locked); 1356 ti_locked = TI_UNLOCKED; 1357 1358 if (inp != NULL) 1359 INP_WUNLOCK(inp); 1360 1361drop: 1362 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1363 if (s != NULL) 1364 free(s, M_TCPLOG); 1365 if (m != NULL) 1366 m_freem(m); 1367} 1368 1369static void 1370tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 1371 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 1372 int ti_locked) 1373{ 1374 int thflags, acked, ourfinisacked, needoutput = 0; 1375 int rstreason, todrop, win; 1376 u_long tiwin; 1377 struct tcpopt to; 1378 1379#ifdef TCPDEBUG 1380 /* 1381 * The size of tcp_saveipgen must be the size of the max ip header, 1382 * now IPv6. 1383 */ 1384 u_char tcp_saveipgen[IP6_HDR_LEN]; 1385 struct tcphdr tcp_savetcp; 1386 short ostate = 0; 1387#endif 1388 thflags = th->th_flags; 1389 tp->sackhint.last_sack_ack = 0; 1390 1391 /* 1392 * If this is either a state-changing packet or current state isn't 1393 * established, we require a write lock on tcbinfo. Otherwise, we 1394 * allow either a read lock or a write lock, as we may have acquired 1395 * a write lock due to a race. 1396 * 1397 * Require a global write lock for SYN/FIN/RST segments or 1398 * non-established connections; otherwise accept either a read or 1399 * write lock, as we may have conservatively acquired a write lock in 1400 * certain cases in tcp_input() (is this still true?). Currently we 1401 * will never enter with no lock, so we try to drop it quickly in the 1402 * common pure ack/pure data cases. 1403 */ 1404 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 1405 tp->t_state != TCPS_ESTABLISHED) { 1406 KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " 1407 "SYN/FIN/RST/!EST", __func__, ti_locked)); 1408 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1409 } else { 1410#ifdef INVARIANTS 1411 if (ti_locked == TI_RLOCKED) 1412 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1413 else if (ti_locked == TI_WLOCKED) 1414 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1415 else 1416 panic("%s: ti_locked %d for EST", __func__, 1417 ti_locked); 1418#endif 1419 } 1420 INP_WLOCK_ASSERT(tp->t_inpcb); 1421 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 1422 __func__)); 1423 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 1424 __func__)); 1425 1426 /* 1427 * Segment received on connection. 1428 * Reset idle time and keep-alive timer. 1429 * XXX: This should be done after segment 1430 * validation to ignore broken/spoofed segs. 1431 */ 1432 tp->t_rcvtime = ticks; 1433 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1434 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1435 1436 /* 1437 * Unscale the window into a 32-bit value. 1438 * For the SYN_SENT state the scale is zero. 1439 */ 1440 tiwin = th->th_win << tp->snd_scale; 1441 1442 /* 1443 * TCP ECN processing. 1444 */ 1445 if (tp->t_flags & TF_ECN_PERMIT) { 1446 if (thflags & TH_CWR) 1447 tp->t_flags &= ~TF_ECN_SND_ECE; 1448 switch (iptos & IPTOS_ECN_MASK) { 1449 case IPTOS_ECN_CE: 1450 tp->t_flags |= TF_ECN_SND_ECE; 1451 TCPSTAT_INC(tcps_ecn_ce); 1452 break; 1453 case IPTOS_ECN_ECT0: 1454 TCPSTAT_INC(tcps_ecn_ect0); 1455 break; 1456 case IPTOS_ECN_ECT1: 1457 TCPSTAT_INC(tcps_ecn_ect1); 1458 break; 1459 } 1460 /* Congestion experienced. */ 1461 if (thflags & TH_ECE) { 1462 cc_cong_signal(tp, th, CC_ECN); 1463 } 1464 } 1465 1466 /* 1467 * Parse options on any incoming segment. 1468 */ 1469 tcp_dooptions(&to, (u_char *)(th + 1), 1470 (th->th_off << 2) - sizeof(struct tcphdr), 1471 (thflags & TH_SYN) ? TO_SYN : 0); 1472 1473 /* 1474 * If echoed timestamp is later than the current time, 1475 * fall back to non RFC1323 RTT calculation. Normalize 1476 * timestamp if syncookies were used when this connection 1477 * was established. 1478 */ 1479 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1480 to.to_tsecr -= tp->ts_offset; 1481 if (TSTMP_GT(to.to_tsecr, ticks)) 1482 to.to_tsecr = 0; 1483 } 1484 1485 /* 1486 * Process options only when we get SYN/ACK back. The SYN case 1487 * for incoming connections is handled in tcp_syncache. 1488 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1489 * or <SYN,ACK>) segment itself is never scaled. 1490 * XXX this is traditional behavior, may need to be cleaned up. 1491 */ 1492 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1493 if ((to.to_flags & TOF_SCALE) && 1494 (tp->t_flags & TF_REQ_SCALE)) { 1495 tp->t_flags |= TF_RCVD_SCALE; 1496 tp->snd_scale = to.to_wscale; 1497 } 1498 /* 1499 * Initial send window. It will be updated with 1500 * the next incoming segment to the scaled value. 1501 */ 1502 tp->snd_wnd = th->th_win; 1503 if (to.to_flags & TOF_TS) { 1504 tp->t_flags |= TF_RCVD_TSTMP; 1505 tp->ts_recent = to.to_tsval; 1506 tp->ts_recent_age = ticks; 1507 } 1508 if (to.to_flags & TOF_MSS) 1509 tcp_mss(tp, to.to_mss); 1510 if ((tp->t_flags & TF_SACK_PERMIT) && 1511 (to.to_flags & TOF_SACKPERM) == 0) 1512 tp->t_flags &= ~TF_SACK_PERMIT; 1513 } 1514 1515 /* 1516 * Header prediction: check for the two common cases 1517 * of a uni-directional data xfer. If the packet has 1518 * no control flags, is in-sequence, the window didn't 1519 * change and we're not retransmitting, it's a 1520 * candidate. If the length is zero and the ack moved 1521 * forward, we're the sender side of the xfer. Just 1522 * free the data acked & wake any higher level process 1523 * that was blocked waiting for space. If the length 1524 * is non-zero and the ack didn't move, we're the 1525 * receiver side. If we're getting packets in-order 1526 * (the reassembly queue is empty), add the data to 1527 * the socket buffer and note that we need a delayed ack. 1528 * Make sure that the hidden state-flags are also off. 1529 * Since we check for TCPS_ESTABLISHED first, it can only 1530 * be TH_NEEDSYN. 1531 */ 1532 if (tp->t_state == TCPS_ESTABLISHED && 1533 th->th_seq == tp->rcv_nxt && 1534 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1535 tp->snd_nxt == tp->snd_max && 1536 tiwin && tiwin == tp->snd_wnd && 1537 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1538 LIST_EMPTY(&tp->t_segq) && 1539 ((to.to_flags & TOF_TS) == 0 || 1540 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { 1541 1542 /* 1543 * If last ACK falls within this segment's sequence numbers, 1544 * record the timestamp. 1545 * NOTE that the test is modified according to the latest 1546 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1547 */ 1548 if ((to.to_flags & TOF_TS) != 0 && 1549 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1550 tp->ts_recent_age = ticks; 1551 tp->ts_recent = to.to_tsval; 1552 } 1553 1554 if (tlen == 0) { 1555 if (SEQ_GT(th->th_ack, tp->snd_una) && 1556 SEQ_LEQ(th->th_ack, tp->snd_max) && 1557 !IN_RECOVERY(tp->t_flags) && 1558 (to.to_flags & TOF_SACK) == 0 && 1559 TAILQ_EMPTY(&tp->snd_holes)) { 1560 /* 1561 * This is a pure ack for outstanding data. 1562 */ 1563 if (ti_locked == TI_RLOCKED) 1564 INP_INFO_RUNLOCK(&V_tcbinfo); 1565 else if (ti_locked == TI_WLOCKED) 1566 INP_INFO_WUNLOCK(&V_tcbinfo); 1567 else 1568 panic("%s: ti_locked %d on pure ACK", 1569 __func__, ti_locked); 1570 ti_locked = TI_UNLOCKED; 1571 1572 TCPSTAT_INC(tcps_predack); 1573 1574 /* 1575 * "bad retransmit" recovery. 1576 */ 1577 if (tp->t_rxtshift == 1 && 1578 (int)(ticks - tp->t_badrxtwin) < 0) { 1579 cc_cong_signal(tp, th, CC_RTO_ERR); 1580 } 1581 1582 /* 1583 * Recalculate the transmit timer / rtt. 1584 * 1585 * Some boxes send broken timestamp replies 1586 * during the SYN+ACK phase, ignore 1587 * timestamps of 0 or we could calculate a 1588 * huge RTT and blow up the retransmit timer. 1589 */ 1590 if ((to.to_flags & TOF_TS) != 0 && 1591 to.to_tsecr) { 1592 if (!tp->t_rttlow || 1593 tp->t_rttlow > ticks - to.to_tsecr) 1594 tp->t_rttlow = ticks - to.to_tsecr; 1595 tcp_xmit_timer(tp, 1596 ticks - to.to_tsecr + 1); 1597 } else if (tp->t_rtttime && 1598 SEQ_GT(th->th_ack, tp->t_rtseq)) { 1599 if (!tp->t_rttlow || 1600 tp->t_rttlow > ticks - tp->t_rtttime) 1601 tp->t_rttlow = ticks - tp->t_rtttime; 1602 tcp_xmit_timer(tp, 1603 ticks - tp->t_rtttime); 1604 } 1605 acked = BYTES_THIS_ACK(tp, th); 1606 1607 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 1608 hhook_run_tcp_est_in(tp, th, &to); 1609 1610 TCPSTAT_INC(tcps_rcvackpack); 1611 TCPSTAT_ADD(tcps_rcvackbyte, acked); 1612 sbdrop(&so->so_snd, acked); 1613 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 1614 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1615 tp->snd_recover = th->th_ack - 1; 1616 1617 /* 1618 * Let the congestion control algorithm update 1619 * congestion control related information. This 1620 * typically means increasing the congestion 1621 * window. 1622 */ 1623 cc_ack_received(tp, th, CC_ACK); 1624 1625 tp->snd_una = th->th_ack; 1626 /* 1627 * Pull snd_wl2 up to prevent seq wrap relative 1628 * to th_ack. 1629 */ 1630 tp->snd_wl2 = th->th_ack; 1631 tp->t_dupacks = 0; 1632 m_freem(m); 1633 ND6_HINT(tp); /* Some progress has been made. */ 1634 1635 /* 1636 * If all outstanding data are acked, stop 1637 * retransmit timer, otherwise restart timer 1638 * using current (possibly backed-off) value. 1639 * If process is waiting for space, 1640 * wakeup/selwakeup/signal. If data 1641 * are ready to send, let tcp_output 1642 * decide between more output or persist. 1643 */ 1644#ifdef TCPDEBUG 1645 if (so->so_options & SO_DEBUG) 1646 tcp_trace(TA_INPUT, ostate, tp, 1647 (void *)tcp_saveipgen, 1648 &tcp_savetcp, 0); 1649#endif 1650 if (tp->snd_una == tp->snd_max) 1651 tcp_timer_activate(tp, TT_REXMT, 0); 1652 else if (!tcp_timer_active(tp, TT_PERSIST)) 1653 tcp_timer_activate(tp, TT_REXMT, 1654 tp->t_rxtcur); 1655 sowwakeup(so); 1656 if (so->so_snd.sb_cc) 1657 (void) tcp_output(tp); 1658 goto check_delack; 1659 } 1660 } else if (th->th_ack == tp->snd_una && 1661 tlen <= sbspace(&so->so_rcv)) { 1662 int newsize = 0; /* automatic sockbuf scaling */ 1663 1664 /* 1665 * This is a pure, in-sequence data packet with 1666 * nothing on the reassembly queue and we have enough 1667 * buffer space to take it. 1668 */ 1669 if (ti_locked == TI_RLOCKED) 1670 INP_INFO_RUNLOCK(&V_tcbinfo); 1671 else if (ti_locked == TI_WLOCKED) 1672 INP_INFO_WUNLOCK(&V_tcbinfo); 1673 else 1674 panic("%s: ti_locked %d on pure data " 1675 "segment", __func__, ti_locked); 1676 ti_locked = TI_UNLOCKED; 1677 1678 /* Clean receiver SACK report if present */ 1679 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1680 tcp_clean_sackreport(tp); 1681 TCPSTAT_INC(tcps_preddat); 1682 tp->rcv_nxt += tlen; 1683 /* 1684 * Pull snd_wl1 up to prevent seq wrap relative to 1685 * th_seq. 1686 */ 1687 tp->snd_wl1 = th->th_seq; 1688 /* 1689 * Pull rcv_up up to prevent seq wrap relative to 1690 * rcv_nxt. 1691 */ 1692 tp->rcv_up = tp->rcv_nxt; 1693 TCPSTAT_INC(tcps_rcvpack); 1694 TCPSTAT_ADD(tcps_rcvbyte, tlen); 1695 ND6_HINT(tp); /* Some progress has been made */ 1696#ifdef TCPDEBUG 1697 if (so->so_options & SO_DEBUG) 1698 tcp_trace(TA_INPUT, ostate, tp, 1699 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1700#endif 1701 /* 1702 * Automatic sizing of receive socket buffer. Often the send 1703 * buffer size is not optimally adjusted to the actual network 1704 * conditions at hand (delay bandwidth product). Setting the 1705 * buffer size too small limits throughput on links with high 1706 * bandwidth and high delay (eg. trans-continental/oceanic links). 1707 * 1708 * On the receive side the socket buffer memory is only rarely 1709 * used to any significant extent. This allows us to be much 1710 * more aggressive in scaling the receive socket buffer. For 1711 * the case that the buffer space is actually used to a large 1712 * extent and we run out of kernel memory we can simply drop 1713 * the new segments; TCP on the sender will just retransmit it 1714 * later. Setting the buffer size too big may only consume too 1715 * much kernel memory if the application doesn't read() from 1716 * the socket or packet loss or reordering makes use of the 1717 * reassembly queue. 1718 * 1719 * The criteria to step up the receive buffer one notch are: 1720 * 1. the number of bytes received during the time it takes 1721 * one timestamp to be reflected back to us (the RTT); 1722 * 2. received bytes per RTT is within seven eighth of the 1723 * current socket buffer size; 1724 * 3. receive buffer size has not hit maximal automatic size; 1725 * 1726 * This algorithm does one step per RTT at most and only if 1727 * we receive a bulk stream w/o packet losses or reorderings. 1728 * Shrinking the buffer during idle times is not necessary as 1729 * it doesn't consume any memory when idle. 1730 * 1731 * TODO: Only step up if the application is actually serving 1732 * the buffer to better manage the socket buffer resources. 1733 */ 1734 if (V_tcp_do_autorcvbuf && 1735 to.to_tsecr && 1736 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1737 if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && 1738 to.to_tsecr - tp->rfbuf_ts < hz) { 1739 if (tp->rfbuf_cnt > 1740 (so->so_rcv.sb_hiwat / 8 * 7) && 1741 so->so_rcv.sb_hiwat < 1742 V_tcp_autorcvbuf_max) { 1743 newsize = 1744 min(so->so_rcv.sb_hiwat + 1745 V_tcp_autorcvbuf_inc, 1746 V_tcp_autorcvbuf_max); 1747 } 1748 /* Start over with next RTT. */ 1749 tp->rfbuf_ts = 0; 1750 tp->rfbuf_cnt = 0; 1751 } else 1752 tp->rfbuf_cnt += tlen; /* add up */ 1753 } 1754 1755 /* Add data to socket buffer. */ 1756 SOCKBUF_LOCK(&so->so_rcv); 1757 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1758 m_freem(m); 1759 } else { 1760 /* 1761 * Set new socket buffer size. 1762 * Give up when limit is reached. 1763 */ 1764 if (newsize) 1765 if (!sbreserve_locked(&so->so_rcv, 1766 newsize, so, NULL)) 1767 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1768 m_adj(m, drop_hdrlen); /* delayed header drop */ 1769 sbappendstream_locked(&so->so_rcv, m); 1770 } 1771 /* NB: sorwakeup_locked() does an implicit unlock. */ 1772 sorwakeup_locked(so); 1773 if (DELAY_ACK(tp)) { 1774 tp->t_flags |= TF_DELACK; 1775 } else { 1776 tp->t_flags |= TF_ACKNOW; 1777 tcp_output(tp); 1778 } 1779 goto check_delack; 1780 } 1781 } 1782 1783 /* 1784 * Calculate amount of space in receive window, 1785 * and then do TCP input processing. 1786 * Receive window is amount of space in rcv queue, 1787 * but not less than advertised window. 1788 */ 1789 win = sbspace(&so->so_rcv); 1790 if (win < 0) 1791 win = 0; 1792 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1793 1794 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1795 tp->rfbuf_ts = 0; 1796 tp->rfbuf_cnt = 0; 1797 1798 switch (tp->t_state) { 1799 1800 /* 1801 * If the state is SYN_RECEIVED: 1802 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1803 */ 1804 case TCPS_SYN_RECEIVED: 1805 if ((thflags & TH_ACK) && 1806 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1807 SEQ_GT(th->th_ack, tp->snd_max))) { 1808 rstreason = BANDLIM_RST_OPENPORT; 1809 goto dropwithreset; 1810 } 1811 break; 1812 1813 /* 1814 * If the state is SYN_SENT: 1815 * if seg contains an ACK, but not for our SYN, drop the input. 1816 * if seg contains a RST, then drop the connection. 1817 * if seg does not contain SYN, then drop it. 1818 * Otherwise this is an acceptable SYN segment 1819 * initialize tp->rcv_nxt and tp->irs 1820 * if seg contains ack then advance tp->snd_una 1821 * if seg contains an ECE and ECN support is enabled, the stream 1822 * is ECN capable. 1823 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1824 * arrange for segment to be acked (eventually) 1825 * continue processing rest of data/controls, beginning with URG 1826 */ 1827 case TCPS_SYN_SENT: 1828 if ((thflags & TH_ACK) && 1829 (SEQ_LEQ(th->th_ack, tp->iss) || 1830 SEQ_GT(th->th_ack, tp->snd_max))) { 1831 rstreason = BANDLIM_UNLIMITED; 1832 goto dropwithreset; 1833 } 1834 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) 1835 tp = tcp_drop(tp, ECONNREFUSED); 1836 if (thflags & TH_RST) 1837 goto drop; 1838 if (!(thflags & TH_SYN)) 1839 goto drop; 1840 1841 tp->irs = th->th_seq; 1842 tcp_rcvseqinit(tp); 1843 if (thflags & TH_ACK) { 1844 TCPSTAT_INC(tcps_connects); 1845 soisconnected(so); 1846#ifdef MAC 1847 mac_socketpeer_set_from_mbuf(m, so); 1848#endif 1849 /* Do window scaling on this connection? */ 1850 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1851 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1852 tp->rcv_scale = tp->request_r_scale; 1853 } 1854 tp->rcv_adv += imin(tp->rcv_wnd, 1855 TCP_MAXWIN << tp->rcv_scale); 1856 tp->snd_una++; /* SYN is acked */ 1857 /* 1858 * If there's data, delay ACK; if there's also a FIN 1859 * ACKNOW will be turned on later. 1860 */ 1861 if (DELAY_ACK(tp) && tlen != 0) 1862 tcp_timer_activate(tp, TT_DELACK, 1863 tcp_delacktime); 1864 else 1865 tp->t_flags |= TF_ACKNOW; 1866 1867 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 1868 tp->t_flags |= TF_ECN_PERMIT; 1869 TCPSTAT_INC(tcps_ecn_shs); 1870 } 1871 1872 /* 1873 * Received <SYN,ACK> in SYN_SENT[*] state. 1874 * Transitions: 1875 * SYN_SENT --> ESTABLISHED 1876 * SYN_SENT* --> FIN_WAIT_1 1877 */ 1878 tp->t_starttime = ticks; 1879 if (tp->t_flags & TF_NEEDFIN) { 1880 tp->t_state = TCPS_FIN_WAIT_1; 1881 tp->t_flags &= ~TF_NEEDFIN; 1882 thflags &= ~TH_SYN; 1883 } else { 1884 tp->t_state = TCPS_ESTABLISHED; 1885 cc_conn_init(tp); 1886 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1887 } 1888 } else { 1889 /* 1890 * Received initial SYN in SYN-SENT[*] state => 1891 * simultaneous open. If segment contains CC option 1892 * and there is a cached CC, apply TAO test. 1893 * If it succeeds, connection is * half-synchronized. 1894 * Otherwise, do 3-way handshake: 1895 * SYN-SENT -> SYN-RECEIVED 1896 * SYN-SENT* -> SYN-RECEIVED* 1897 * If there was no CC option, clear cached CC value. 1898 */ 1899 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 1900 tcp_timer_activate(tp, TT_REXMT, 0); 1901 tp->t_state = TCPS_SYN_RECEIVED; 1902 } 1903 1904 KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " 1905 "ti_locked %d", __func__, ti_locked)); 1906 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1907 INP_WLOCK_ASSERT(tp->t_inpcb); 1908 1909 /* 1910 * Advance th->th_seq to correspond to first data byte. 1911 * If data, trim to stay within window, 1912 * dropping FIN if necessary. 1913 */ 1914 th->th_seq++; 1915 if (tlen > tp->rcv_wnd) { 1916 todrop = tlen - tp->rcv_wnd; 1917 m_adj(m, -todrop); 1918 tlen = tp->rcv_wnd; 1919 thflags &= ~TH_FIN; 1920 TCPSTAT_INC(tcps_rcvpackafterwin); 1921 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 1922 } 1923 tp->snd_wl1 = th->th_seq - 1; 1924 tp->rcv_up = th->th_seq; 1925 /* 1926 * Client side of transaction: already sent SYN and data. 1927 * If the remote host used T/TCP to validate the SYN, 1928 * our data will be ACK'd; if so, enter normal data segment 1929 * processing in the middle of step 5, ack processing. 1930 * Otherwise, goto step 6. 1931 */ 1932 if (thflags & TH_ACK) 1933 goto process_ACK; 1934 1935 goto step6; 1936 1937 /* 1938 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1939 * do normal processing. 1940 * 1941 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 1942 */ 1943 case TCPS_LAST_ACK: 1944 case TCPS_CLOSING: 1945 break; /* continue normal processing */ 1946 } 1947 1948 /* 1949 * States other than LISTEN or SYN_SENT. 1950 * First check the RST flag and sequence number since reset segments 1951 * are exempt from the timestamp and connection count tests. This 1952 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1953 * below which allowed reset segments in half the sequence space 1954 * to fall though and be processed (which gives forged reset 1955 * segments with a random sequence number a 50 percent chance of 1956 * killing a connection). 1957 * Then check timestamp, if present. 1958 * Then check the connection count, if present. 1959 * Then check that at least some bytes of segment are within 1960 * receive window. If segment begins before rcv_nxt, 1961 * drop leading data (and SYN); if nothing left, just ack. 1962 * 1963 * 1964 * If the RST bit is set, check the sequence number to see 1965 * if this is a valid reset segment. 1966 * RFC 793 page 37: 1967 * In all states except SYN-SENT, all reset (RST) segments 1968 * are validated by checking their SEQ-fields. A reset is 1969 * valid if its sequence number is in the window. 1970 * Note: this does not take into account delayed ACKs, so 1971 * we should test against last_ack_sent instead of rcv_nxt. 1972 * The sequence number in the reset segment is normally an 1973 * echo of our outgoing acknowlegement numbers, but some hosts 1974 * send a reset with the sequence number at the rightmost edge 1975 * of our receive window, and we have to handle this case. 1976 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 1977 * that brute force RST attacks are possible. To combat this, 1978 * we use a much stricter check while in the ESTABLISHED state, 1979 * only accepting RSTs where the sequence number is equal to 1980 * last_ack_sent. In all other states (the states in which a 1981 * RST is more likely), the more permissive check is used. 1982 * If we have multiple segments in flight, the initial reset 1983 * segment sequence numbers will be to the left of last_ack_sent, 1984 * but they will eventually catch up. 1985 * In any case, it never made sense to trim reset segments to 1986 * fit the receive window since RFC 1122 says: 1987 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 1988 * 1989 * A TCP SHOULD allow a received RST segment to include data. 1990 * 1991 * DISCUSSION 1992 * It has been suggested that a RST segment could contain 1993 * ASCII text that encoded and explained the cause of the 1994 * RST. No standard has yet been established for such 1995 * data. 1996 * 1997 * If the reset segment passes the sequence number test examine 1998 * the state: 1999 * SYN_RECEIVED STATE: 2000 * If passive open, return to LISTEN state. 2001 * If active open, inform user that connection was refused. 2002 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 2003 * Inform user that connection was reset, and close tcb. 2004 * CLOSING, LAST_ACK STATES: 2005 * Close the tcb. 2006 * TIME_WAIT STATE: 2007 * Drop the segment - see Stevens, vol. 2, p. 964 and 2008 * RFC 1337. 2009 */ 2010 if (thflags & TH_RST) { 2011 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 2012 SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 2013 switch (tp->t_state) { 2014 2015 case TCPS_SYN_RECEIVED: 2016 so->so_error = ECONNREFUSED; 2017 goto close; 2018 2019 case TCPS_ESTABLISHED: 2020 if (V_tcp_insecure_rst == 0 && 2021 !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && 2022 SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && 2023 !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 2024 SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { 2025 TCPSTAT_INC(tcps_badrst); 2026 goto drop; 2027 } 2028 /* FALLTHROUGH */ 2029 case TCPS_FIN_WAIT_1: 2030 case TCPS_FIN_WAIT_2: 2031 case TCPS_CLOSE_WAIT: 2032 so->so_error = ECONNRESET; 2033 close: 2034 KASSERT(ti_locked == TI_WLOCKED, 2035 ("tcp_do_segment: TH_RST 1 ti_locked %d", 2036 ti_locked)); 2037 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2038 2039 tp->t_state = TCPS_CLOSED; 2040 TCPSTAT_INC(tcps_drops); 2041 tp = tcp_close(tp); 2042 break; 2043 2044 case TCPS_CLOSING: 2045 case TCPS_LAST_ACK: 2046 KASSERT(ti_locked == TI_WLOCKED, 2047 ("tcp_do_segment: TH_RST 2 ti_locked %d", 2048 ti_locked)); 2049 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2050 2051 tp = tcp_close(tp); 2052 break; 2053 } 2054 } 2055 goto drop; 2056 } 2057 2058 /* 2059 * RFC 1323 PAWS: If we have a timestamp reply on this segment 2060 * and it's less than ts_recent, drop it. 2061 */ 2062 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 2063 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 2064 2065 /* Check to see if ts_recent is over 24 days old. */ 2066 if (ticks - tp->ts_recent_age > TCP_PAWS_IDLE) { 2067 /* 2068 * Invalidate ts_recent. If this segment updates 2069 * ts_recent, the age will be reset later and ts_recent 2070 * will get a valid value. If it does not, setting 2071 * ts_recent to zero will at least satisfy the 2072 * requirement that zero be placed in the timestamp 2073 * echo reply when ts_recent isn't valid. The 2074 * age isn't reset until we get a valid ts_recent 2075 * because we don't want out-of-order segments to be 2076 * dropped when ts_recent is old. 2077 */ 2078 tp->ts_recent = 0; 2079 } else { 2080 TCPSTAT_INC(tcps_rcvduppack); 2081 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 2082 TCPSTAT_INC(tcps_pawsdrop); 2083 if (tlen) 2084 goto dropafterack; 2085 goto drop; 2086 } 2087 } 2088 2089 /* 2090 * In the SYN-RECEIVED state, validate that the packet belongs to 2091 * this connection before trimming the data to fit the receive 2092 * window. Check the sequence number versus IRS since we know 2093 * the sequence numbers haven't wrapped. This is a partial fix 2094 * for the "LAND" DoS attack. 2095 */ 2096 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 2097 rstreason = BANDLIM_RST_OPENPORT; 2098 goto dropwithreset; 2099 } 2100 2101 todrop = tp->rcv_nxt - th->th_seq; 2102 if (todrop > 0) { 2103 /* 2104 * If this is a duplicate SYN for our current connection, 2105 * advance over it and pretend and it's not a SYN. 2106 */ 2107 if (thflags & TH_SYN && th->th_seq == tp->irs) { 2108 thflags &= ~TH_SYN; 2109 th->th_seq++; 2110 if (th->th_urp > 1) 2111 th->th_urp--; 2112 else 2113 thflags &= ~TH_URG; 2114 todrop--; 2115 } 2116 /* 2117 * Following if statement from Stevens, vol. 2, p. 960. 2118 */ 2119 if (todrop > tlen 2120 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 2121 /* 2122 * Any valid FIN must be to the left of the window. 2123 * At this point the FIN must be a duplicate or out 2124 * of sequence; drop it. 2125 */ 2126 thflags &= ~TH_FIN; 2127 2128 /* 2129 * Send an ACK to resynchronize and drop any data. 2130 * But keep on processing for RST or ACK. 2131 */ 2132 tp->t_flags |= TF_ACKNOW; 2133 todrop = tlen; 2134 TCPSTAT_INC(tcps_rcvduppack); 2135 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 2136 } else { 2137 TCPSTAT_INC(tcps_rcvpartduppack); 2138 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 2139 } 2140 drop_hdrlen += todrop; /* drop from the top afterwards */ 2141 th->th_seq += todrop; 2142 tlen -= todrop; 2143 if (th->th_urp > todrop) 2144 th->th_urp -= todrop; 2145 else { 2146 thflags &= ~TH_URG; 2147 th->th_urp = 0; 2148 } 2149 } 2150 2151 /* 2152 * If new data are received on a connection after the 2153 * user processes are gone, then RST the other end. 2154 */ 2155 if ((so->so_state & SS_NOFDREF) && 2156 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 2157 char *s; 2158 2159 KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " 2160 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); 2161 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2162 2163 if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { 2164 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " 2165 "was closed, sending RST and removing tcpcb\n", 2166 s, __func__, tcpstates[tp->t_state], tlen); 2167 free(s, M_TCPLOG); 2168 } 2169 tp = tcp_close(tp); 2170 TCPSTAT_INC(tcps_rcvafterclose); 2171 rstreason = BANDLIM_UNLIMITED; 2172 goto dropwithreset; 2173 } 2174 2175 /* 2176 * If segment ends after window, drop trailing data 2177 * (and PUSH and FIN); if nothing left, just ACK. 2178 */ 2179 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 2180 if (todrop > 0) { 2181 TCPSTAT_INC(tcps_rcvpackafterwin); 2182 if (todrop >= tlen) { 2183 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 2184 /* 2185 * If window is closed can only take segments at 2186 * window edge, and have to drop data and PUSH from 2187 * incoming segments. Continue processing, but 2188 * remember to ack. Otherwise, drop segment 2189 * and ack. 2190 */ 2191 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 2192 tp->t_flags |= TF_ACKNOW; 2193 TCPSTAT_INC(tcps_rcvwinprobe); 2194 } else 2195 goto dropafterack; 2196 } else 2197 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 2198 m_adj(m, -todrop); 2199 tlen -= todrop; 2200 thflags &= ~(TH_PUSH|TH_FIN); 2201 } 2202 2203 /* 2204 * If last ACK falls within this segment's sequence numbers, 2205 * record its timestamp. 2206 * NOTE: 2207 * 1) That the test incorporates suggestions from the latest 2208 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2209 * 2) That updating only on newer timestamps interferes with 2210 * our earlier PAWS tests, so this check should be solely 2211 * predicated on the sequence space of this segment. 2212 * 3) That we modify the segment boundary check to be 2213 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 2214 * instead of RFC1323's 2215 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 2216 * This modified check allows us to overcome RFC1323's 2217 * limitations as described in Stevens TCP/IP Illustrated 2218 * Vol. 2 p.869. In such cases, we can still calculate the 2219 * RTT correctly when RCV.NXT == Last.ACK.Sent. 2220 */ 2221 if ((to.to_flags & TOF_TS) != 0 && 2222 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 2223 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 2224 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 2225 tp->ts_recent_age = ticks; 2226 tp->ts_recent = to.to_tsval; 2227 } 2228 2229 /* 2230 * If a SYN is in the window, then this is an 2231 * error and we send an RST and drop the connection. 2232 */ 2233 if (thflags & TH_SYN) { 2234 KASSERT(ti_locked == TI_WLOCKED, 2235 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 2236 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2237 2238 tp = tcp_drop(tp, ECONNRESET); 2239 rstreason = BANDLIM_UNLIMITED; 2240 goto drop; 2241 } 2242 2243 /* 2244 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 2245 * flag is on (half-synchronized state), then queue data for 2246 * later processing; else drop segment and return. 2247 */ 2248 if ((thflags & TH_ACK) == 0) { 2249 if (tp->t_state == TCPS_SYN_RECEIVED || 2250 (tp->t_flags & TF_NEEDSYN)) 2251 goto step6; 2252 else if (tp->t_flags & TF_ACKNOW) 2253 goto dropafterack; 2254 else 2255 goto drop; 2256 } 2257 2258 /* 2259 * Ack processing. 2260 */ 2261 switch (tp->t_state) { 2262 2263 /* 2264 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 2265 * ESTABLISHED state and continue processing. 2266 * The ACK was checked above. 2267 */ 2268 case TCPS_SYN_RECEIVED: 2269 2270 TCPSTAT_INC(tcps_connects); 2271 soisconnected(so); 2272 /* Do window scaling? */ 2273 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2274 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2275 tp->rcv_scale = tp->request_r_scale; 2276 tp->snd_wnd = tiwin; 2277 } 2278 /* 2279 * Make transitions: 2280 * SYN-RECEIVED -> ESTABLISHED 2281 * SYN-RECEIVED* -> FIN-WAIT-1 2282 */ 2283 tp->t_starttime = ticks; 2284 if (tp->t_flags & TF_NEEDFIN) { 2285 tp->t_state = TCPS_FIN_WAIT_1; 2286 tp->t_flags &= ~TF_NEEDFIN; 2287 } else { 2288 tp->t_state = TCPS_ESTABLISHED; 2289 cc_conn_init(tp); 2290 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 2291 } 2292 /* 2293 * If segment contains data or ACK, will call tcp_reass() 2294 * later; if not, do so now to pass queued data to user. 2295 */ 2296 if (tlen == 0 && (thflags & TH_FIN) == 0) 2297 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 2298 (struct mbuf *)0); 2299 tp->snd_wl1 = th->th_seq - 1; 2300 /* FALLTHROUGH */ 2301 2302 /* 2303 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2304 * ACKs. If the ack is in the range 2305 * tp->snd_una < th->th_ack <= tp->snd_max 2306 * then advance tp->snd_una to th->th_ack and drop 2307 * data from the retransmission queue. If this ACK reflects 2308 * more up to date window information we update our window information. 2309 */ 2310 case TCPS_ESTABLISHED: 2311 case TCPS_FIN_WAIT_1: 2312 case TCPS_FIN_WAIT_2: 2313 case TCPS_CLOSE_WAIT: 2314 case TCPS_CLOSING: 2315 case TCPS_LAST_ACK: 2316 if (SEQ_GT(th->th_ack, tp->snd_max)) { 2317 TCPSTAT_INC(tcps_rcvacktoomuch); 2318 goto dropafterack; 2319 } 2320 if ((tp->t_flags & TF_SACK_PERMIT) && 2321 ((to.to_flags & TOF_SACK) || 2322 !TAILQ_EMPTY(&tp->snd_holes))) 2323 tcp_sack_doack(tp, &to, th->th_ack); 2324 2325 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2326 hhook_run_tcp_est_in(tp, th, &to); 2327 2328 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 2329 if (tlen == 0 && tiwin == tp->snd_wnd) { 2330 TCPSTAT_INC(tcps_rcvdupack); 2331 /* 2332 * If we have outstanding data (other than 2333 * a window probe), this is a completely 2334 * duplicate ack (ie, window info didn't 2335 * change), the ack is the biggest we've 2336 * seen and we've seen exactly our rexmt 2337 * threshhold of them, assume a packet 2338 * has been dropped and retransmit it. 2339 * Kludge snd_nxt & the congestion 2340 * window so we send only this one 2341 * packet. 2342 * 2343 * We know we're losing at the current 2344 * window size so do congestion avoidance 2345 * (set ssthresh to half the current window 2346 * and pull our congestion window back to 2347 * the new ssthresh). 2348 * 2349 * Dup acks mean that packets have left the 2350 * network (they're now cached at the receiver) 2351 * so bump cwnd by the amount in the receiver 2352 * to keep a constant cwnd packets in the 2353 * network. 2354 * 2355 * When using TCP ECN, notify the peer that 2356 * we reduced the cwnd. 2357 */ 2358 if (!tcp_timer_active(tp, TT_REXMT) || 2359 th->th_ack != tp->snd_una) 2360 tp->t_dupacks = 0; 2361 else if (++tp->t_dupacks > tcprexmtthresh || 2362 IN_FASTRECOVERY(tp->t_flags)) { 2363 cc_ack_received(tp, th, CC_DUPACK); 2364 if ((tp->t_flags & TF_SACK_PERMIT) && 2365 IN_FASTRECOVERY(tp->t_flags)) { 2366 int awnd; 2367 2368 /* 2369 * Compute the amount of data in flight first. 2370 * We can inject new data into the pipe iff 2371 * we have less than 1/2 the original window's 2372 * worth of data in flight. 2373 */ 2374 awnd = (tp->snd_nxt - tp->snd_fack) + 2375 tp->sackhint.sack_bytes_rexmit; 2376 if (awnd < tp->snd_ssthresh) { 2377 tp->snd_cwnd += tp->t_maxseg; 2378 if (tp->snd_cwnd > tp->snd_ssthresh) 2379 tp->snd_cwnd = tp->snd_ssthresh; 2380 } 2381 } else 2382 tp->snd_cwnd += tp->t_maxseg; 2383 (void) tcp_output(tp); 2384 goto drop; 2385 } else if (tp->t_dupacks == tcprexmtthresh) { 2386 tcp_seq onxt = tp->snd_nxt; 2387 2388 /* 2389 * If we're doing sack, check to 2390 * see if we're already in sack 2391 * recovery. If we're not doing sack, 2392 * check to see if we're in newreno 2393 * recovery. 2394 */ 2395 if (tp->t_flags & TF_SACK_PERMIT) { 2396 if (IN_FASTRECOVERY(tp->t_flags)) { 2397 tp->t_dupacks = 0; 2398 break; 2399 } 2400 } else { 2401 if (SEQ_LEQ(th->th_ack, 2402 tp->snd_recover)) { 2403 tp->t_dupacks = 0; 2404 break; 2405 } 2406 } 2407 /* Congestion signal before ack. */ 2408 cc_cong_signal(tp, th, CC_NDUPACK); 2409 cc_ack_received(tp, th, CC_DUPACK); 2410 tcp_timer_activate(tp, TT_REXMT, 0); 2411 tp->t_rtttime = 0; 2412 if (tp->t_flags & TF_SACK_PERMIT) { 2413 TCPSTAT_INC( 2414 tcps_sack_recovery_episode); 2415 tp->sack_newdata = tp->snd_nxt; 2416 tp->snd_cwnd = tp->t_maxseg; 2417 (void) tcp_output(tp); 2418 goto drop; 2419 } 2420 tp->snd_nxt = th->th_ack; 2421 tp->snd_cwnd = tp->t_maxseg; 2422 (void) tcp_output(tp); 2423 KASSERT(tp->snd_limited <= 2, 2424 ("%s: tp->snd_limited too big", 2425 __func__)); 2426 tp->snd_cwnd = tp->snd_ssthresh + 2427 tp->t_maxseg * 2428 (tp->t_dupacks - tp->snd_limited); 2429 if (SEQ_GT(onxt, tp->snd_nxt)) 2430 tp->snd_nxt = onxt; 2431 goto drop; 2432 } else if (V_tcp_do_rfc3042) { 2433 cc_ack_received(tp, th, CC_DUPACK); 2434 u_long oldcwnd = tp->snd_cwnd; 2435 tcp_seq oldsndmax = tp->snd_max; 2436 u_int sent; 2437 2438 KASSERT(tp->t_dupacks == 1 || 2439 tp->t_dupacks == 2, 2440 ("%s: dupacks not 1 or 2", 2441 __func__)); 2442 if (tp->t_dupacks == 1) 2443 tp->snd_limited = 0; 2444 tp->snd_cwnd = 2445 (tp->snd_nxt - tp->snd_una) + 2446 (tp->t_dupacks - tp->snd_limited) * 2447 tp->t_maxseg; 2448 (void) tcp_output(tp); 2449 sent = tp->snd_max - oldsndmax; 2450 if (sent > tp->t_maxseg) { 2451 KASSERT((tp->t_dupacks == 2 && 2452 tp->snd_limited == 0) || 2453 (sent == tp->t_maxseg + 1 && 2454 tp->t_flags & TF_SENTFIN), 2455 ("%s: sent too much", 2456 __func__)); 2457 tp->snd_limited = 2; 2458 } else if (sent > 0) 2459 ++tp->snd_limited; 2460 tp->snd_cwnd = oldcwnd; 2461 goto drop; 2462 } 2463 } else 2464 tp->t_dupacks = 0; 2465 break; 2466 } 2467 2468 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 2469 ("%s: th_ack <= snd_una", __func__)); 2470 2471 /* 2472 * If the congestion window was inflated to account 2473 * for the other side's cached packets, retract it. 2474 */ 2475 if (IN_FASTRECOVERY(tp->t_flags)) { 2476 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2477 if (tp->t_flags & TF_SACK_PERMIT) 2478 tcp_sack_partialack(tp, th); 2479 else 2480 tcp_newreno_partial_ack(tp, th); 2481 } else 2482 cc_post_recovery(tp, th); 2483 } 2484 tp->t_dupacks = 0; 2485 /* 2486 * If we reach this point, ACK is not a duplicate, 2487 * i.e., it ACKs something we sent. 2488 */ 2489 if (tp->t_flags & TF_NEEDSYN) { 2490 /* 2491 * T/TCP: Connection was half-synchronized, and our 2492 * SYN has been ACK'd (so connection is now fully 2493 * synchronized). Go to non-starred state, 2494 * increment snd_una for ACK of SYN, and check if 2495 * we can do window scaling. 2496 */ 2497 tp->t_flags &= ~TF_NEEDSYN; 2498 tp->snd_una++; 2499 /* Do window scaling? */ 2500 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2501 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2502 tp->rcv_scale = tp->request_r_scale; 2503 /* Send window already scaled. */ 2504 } 2505 } 2506 2507process_ACK: 2508 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 2509 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 2510 ("tcp_input: process_ACK ti_locked %d", ti_locked)); 2511 INP_WLOCK_ASSERT(tp->t_inpcb); 2512 2513 acked = BYTES_THIS_ACK(tp, th); 2514 TCPSTAT_INC(tcps_rcvackpack); 2515 TCPSTAT_ADD(tcps_rcvackbyte, acked); 2516 2517 /* 2518 * If we just performed our first retransmit, and the ACK 2519 * arrives within our recovery window, then it was a mistake 2520 * to do the retransmit in the first place. Recover our 2521 * original cwnd and ssthresh, and proceed to transmit where 2522 * we left off. 2523 */ 2524 if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) 2525 cc_cong_signal(tp, th, CC_RTO_ERR); 2526 2527 /* 2528 * If we have a timestamp reply, update smoothed 2529 * round trip time. If no timestamp is present but 2530 * transmit timer is running and timed sequence 2531 * number was acked, update smoothed round trip time. 2532 * Since we now have an rtt measurement, cancel the 2533 * timer backoff (cf., Phil Karn's retransmit alg.). 2534 * Recompute the initial retransmit timer. 2535 * 2536 * Some boxes send broken timestamp replies 2537 * during the SYN+ACK phase, ignore 2538 * timestamps of 0 or we could calculate a 2539 * huge RTT and blow up the retransmit timer. 2540 */ 2541 if ((to.to_flags & TOF_TS) != 0 && 2542 to.to_tsecr) { 2543 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 2544 tp->t_rttlow = ticks - to.to_tsecr; 2545 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 2546 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2547 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2548 tp->t_rttlow = ticks - tp->t_rtttime; 2549 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2550 } 2551 2552 /* 2553 * If all outstanding data is acked, stop retransmit 2554 * timer and remember to restart (more output or persist). 2555 * If there is more data to be acked, restart retransmit 2556 * timer, using current (possibly backed-off) value. 2557 */ 2558 if (th->th_ack == tp->snd_max) { 2559 tcp_timer_activate(tp, TT_REXMT, 0); 2560 needoutput = 1; 2561 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2562 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 2563 2564 /* 2565 * If no data (only SYN) was ACK'd, 2566 * skip rest of ACK processing. 2567 */ 2568 if (acked == 0) 2569 goto step6; 2570 2571 /* 2572 * Let the congestion control algorithm update congestion 2573 * control related information. This typically means increasing 2574 * the congestion window. 2575 */ 2576 cc_ack_received(tp, th, CC_ACK); 2577 2578 SOCKBUF_LOCK(&so->so_snd); 2579 if (acked > so->so_snd.sb_cc) { 2580 tp->snd_wnd -= so->so_snd.sb_cc; 2581 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 2582 ourfinisacked = 1; 2583 } else { 2584 sbdrop_locked(&so->so_snd, acked); 2585 tp->snd_wnd -= acked; 2586 ourfinisacked = 0; 2587 } 2588 /* NB: sowwakeup_locked() does an implicit unlock. */ 2589 sowwakeup_locked(so); 2590 /* Detect una wraparound. */ 2591 if (!IN_RECOVERY(tp->t_flags) && 2592 SEQ_GT(tp->snd_una, tp->snd_recover) && 2593 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2594 tp->snd_recover = th->th_ack - 1; 2595 /* XXXLAS: Can this be moved up into cc_post_recovery? */ 2596 if (IN_RECOVERY(tp->t_flags) && 2597 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 2598 EXIT_RECOVERY(tp->t_flags); 2599 } 2600 tp->snd_una = th->th_ack; 2601 if (tp->t_flags & TF_SACK_PERMIT) { 2602 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 2603 tp->snd_recover = tp->snd_una; 2604 } 2605 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2606 tp->snd_nxt = tp->snd_una; 2607 2608 switch (tp->t_state) { 2609 2610 /* 2611 * In FIN_WAIT_1 STATE in addition to the processing 2612 * for the ESTABLISHED state if our FIN is now acknowledged 2613 * then enter FIN_WAIT_2. 2614 */ 2615 case TCPS_FIN_WAIT_1: 2616 if (ourfinisacked) { 2617 /* 2618 * If we can't receive any more 2619 * data, then closing user can proceed. 2620 * Starting the timer is contrary to the 2621 * specification, but if we don't get a FIN 2622 * we'll hang forever. 2623 * 2624 * XXXjl: 2625 * we should release the tp also, and use a 2626 * compressed state. 2627 */ 2628 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2629 int timeout; 2630 2631 soisdisconnected(so); 2632 timeout = (tcp_fast_finwait2_recycle) ? 2633 tcp_finwait2_timeout : tcp_maxidle; 2634 tcp_timer_activate(tp, TT_2MSL, timeout); 2635 } 2636 tp->t_state = TCPS_FIN_WAIT_2; 2637 } 2638 break; 2639 2640 /* 2641 * In CLOSING STATE in addition to the processing for 2642 * the ESTABLISHED state if the ACK acknowledges our FIN 2643 * then enter the TIME-WAIT state, otherwise ignore 2644 * the segment. 2645 */ 2646 case TCPS_CLOSING: 2647 if (ourfinisacked) { 2648 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2649 tcp_twstart(tp); 2650 INP_INFO_WUNLOCK(&V_tcbinfo); 2651 m_freem(m); 2652 return; 2653 } 2654 break; 2655 2656 /* 2657 * In LAST_ACK, we may still be waiting for data to drain 2658 * and/or to be acked, as well as for the ack of our FIN. 2659 * If our FIN is now acknowledged, delete the TCB, 2660 * enter the closed state and return. 2661 */ 2662 case TCPS_LAST_ACK: 2663 if (ourfinisacked) { 2664 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2665 tp = tcp_close(tp); 2666 goto drop; 2667 } 2668 break; 2669 } 2670 } 2671 2672step6: 2673 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 2674 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 2675 ("tcp_do_segment: step6 ti_locked %d", ti_locked)); 2676 INP_WLOCK_ASSERT(tp->t_inpcb); 2677 2678 /* 2679 * Update window information. 2680 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2681 */ 2682 if ((thflags & TH_ACK) && 2683 (SEQ_LT(tp->snd_wl1, th->th_seq) || 2684 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2685 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2686 /* keep track of pure window updates */ 2687 if (tlen == 0 && 2688 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2689 TCPSTAT_INC(tcps_rcvwinupd); 2690 tp->snd_wnd = tiwin; 2691 tp->snd_wl1 = th->th_seq; 2692 tp->snd_wl2 = th->th_ack; 2693 if (tp->snd_wnd > tp->max_sndwnd) 2694 tp->max_sndwnd = tp->snd_wnd; 2695 needoutput = 1; 2696 } 2697 2698 /* 2699 * Process segments with URG. 2700 */ 2701 if ((thflags & TH_URG) && th->th_urp && 2702 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2703 /* 2704 * This is a kludge, but if we receive and accept 2705 * random urgent pointers, we'll crash in 2706 * soreceive. It's hard to imagine someone 2707 * actually wanting to send this much urgent data. 2708 */ 2709 SOCKBUF_LOCK(&so->so_rcv); 2710 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2711 th->th_urp = 0; /* XXX */ 2712 thflags &= ~TH_URG; /* XXX */ 2713 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 2714 goto dodata; /* XXX */ 2715 } 2716 /* 2717 * If this segment advances the known urgent pointer, 2718 * then mark the data stream. This should not happen 2719 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2720 * a FIN has been received from the remote side. 2721 * In these states we ignore the URG. 2722 * 2723 * According to RFC961 (Assigned Protocols), 2724 * the urgent pointer points to the last octet 2725 * of urgent data. We continue, however, 2726 * to consider it to indicate the first octet 2727 * of data past the urgent section as the original 2728 * spec states (in one of two places). 2729 */ 2730 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2731 tp->rcv_up = th->th_seq + th->th_urp; 2732 so->so_oobmark = so->so_rcv.sb_cc + 2733 (tp->rcv_up - tp->rcv_nxt) - 1; 2734 if (so->so_oobmark == 0) 2735 so->so_rcv.sb_state |= SBS_RCVATMARK; 2736 sohasoutofband(so); 2737 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2738 } 2739 SOCKBUF_UNLOCK(&so->so_rcv); 2740 /* 2741 * Remove out of band data so doesn't get presented to user. 2742 * This can happen independent of advancing the URG pointer, 2743 * but if two URG's are pending at once, some out-of-band 2744 * data may creep in... ick. 2745 */ 2746 if (th->th_urp <= (u_long)tlen && 2747 !(so->so_options & SO_OOBINLINE)) { 2748 /* hdr drop is delayed */ 2749 tcp_pulloutofband(so, th, m, drop_hdrlen); 2750 } 2751 } else { 2752 /* 2753 * If no out of band data is expected, 2754 * pull receive urgent pointer along 2755 * with the receive window. 2756 */ 2757 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2758 tp->rcv_up = tp->rcv_nxt; 2759 } 2760dodata: /* XXX */ 2761 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 2762 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 2763 ("tcp_do_segment: dodata ti_locked %d", ti_locked)); 2764 INP_WLOCK_ASSERT(tp->t_inpcb); 2765 2766 /* 2767 * Process the segment text, merging it into the TCP sequencing queue, 2768 * and arranging for acknowledgment of receipt if necessary. 2769 * This process logically involves adjusting tp->rcv_wnd as data 2770 * is presented to the user (this happens in tcp_usrreq.c, 2771 * case PRU_RCVD). If a FIN has already been received on this 2772 * connection then we just ignore the text. 2773 */ 2774 if ((tlen || (thflags & TH_FIN)) && 2775 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2776 tcp_seq save_start = th->th_seq; 2777 m_adj(m, drop_hdrlen); /* delayed header drop */ 2778 /* 2779 * Insert segment which includes th into TCP reassembly queue 2780 * with control block tp. Set thflags to whether reassembly now 2781 * includes a segment with FIN. This handles the common case 2782 * inline (segment is the next to be received on an established 2783 * connection, and the queue is empty), avoiding linkage into 2784 * and removal from the queue and repetition of various 2785 * conversions. 2786 * Set DELACK for segments received in order, but ack 2787 * immediately when segments are out of order (so 2788 * fast retransmit can work). 2789 */ 2790 if (th->th_seq == tp->rcv_nxt && 2791 LIST_EMPTY(&tp->t_segq) && 2792 TCPS_HAVEESTABLISHED(tp->t_state)) { 2793 if (DELAY_ACK(tp)) 2794 tp->t_flags |= TF_DELACK; 2795 else 2796 tp->t_flags |= TF_ACKNOW; 2797 tp->rcv_nxt += tlen; 2798 thflags = th->th_flags & TH_FIN; 2799 TCPSTAT_INC(tcps_rcvpack); 2800 TCPSTAT_ADD(tcps_rcvbyte, tlen); 2801 ND6_HINT(tp); 2802 SOCKBUF_LOCK(&so->so_rcv); 2803 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 2804 m_freem(m); 2805 else 2806 sbappendstream_locked(&so->so_rcv, m); 2807 /* NB: sorwakeup_locked() does an implicit unlock. */ 2808 sorwakeup_locked(so); 2809 } else { 2810 /* 2811 * XXX: Due to the header drop above "th" is 2812 * theoretically invalid by now. Fortunately 2813 * m_adj() doesn't actually frees any mbufs 2814 * when trimming from the head. 2815 */ 2816 thflags = tcp_reass(tp, th, &tlen, m); 2817 tp->t_flags |= TF_ACKNOW; 2818 } 2819 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 2820 tcp_update_sack_list(tp, save_start, save_start + tlen); 2821#if 0 2822 /* 2823 * Note the amount of data that peer has sent into 2824 * our window, in order to estimate the sender's 2825 * buffer size. 2826 * XXX: Unused. 2827 */ 2828 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2829#endif 2830 } else { 2831 m_freem(m); 2832 thflags &= ~TH_FIN; 2833 } 2834 2835 /* 2836 * If FIN is received ACK the FIN and let the user know 2837 * that the connection is closing. 2838 */ 2839 if (thflags & TH_FIN) { 2840 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2841 socantrcvmore(so); 2842 /* 2843 * If connection is half-synchronized 2844 * (ie NEEDSYN flag on) then delay ACK, 2845 * so it may be piggybacked when SYN is sent. 2846 * Otherwise, since we received a FIN then no 2847 * more input can be expected, send ACK now. 2848 */ 2849 if (tp->t_flags & TF_NEEDSYN) 2850 tp->t_flags |= TF_DELACK; 2851 else 2852 tp->t_flags |= TF_ACKNOW; 2853 tp->rcv_nxt++; 2854 } 2855 switch (tp->t_state) { 2856 2857 /* 2858 * In SYN_RECEIVED and ESTABLISHED STATES 2859 * enter the CLOSE_WAIT state. 2860 */ 2861 case TCPS_SYN_RECEIVED: 2862 tp->t_starttime = ticks; 2863 /* FALLTHROUGH */ 2864 case TCPS_ESTABLISHED: 2865 tp->t_state = TCPS_CLOSE_WAIT; 2866 break; 2867 2868 /* 2869 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2870 * enter the CLOSING state. 2871 */ 2872 case TCPS_FIN_WAIT_1: 2873 tp->t_state = TCPS_CLOSING; 2874 break; 2875 2876 /* 2877 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2878 * starting the time-wait timer, turning off the other 2879 * standard timers. 2880 */ 2881 case TCPS_FIN_WAIT_2: 2882 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2883 KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " 2884 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 2885 ti_locked)); 2886 2887 tcp_twstart(tp); 2888 INP_INFO_WUNLOCK(&V_tcbinfo); 2889 return; 2890 } 2891 } 2892 if (ti_locked == TI_RLOCKED) 2893 INP_INFO_RUNLOCK(&V_tcbinfo); 2894 else if (ti_locked == TI_WLOCKED) 2895 INP_INFO_WUNLOCK(&V_tcbinfo); 2896 else 2897 panic("%s: dodata epilogue ti_locked %d", __func__, 2898 ti_locked); 2899 ti_locked = TI_UNLOCKED; 2900 2901#ifdef TCPDEBUG 2902 if (so->so_options & SO_DEBUG) 2903 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2904 &tcp_savetcp, 0); 2905#endif 2906 2907 /* 2908 * Return any desired output. 2909 */ 2910 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2911 (void) tcp_output(tp); 2912 2913check_delack: 2914 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 2915 __func__, ti_locked)); 2916 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2917 INP_WLOCK_ASSERT(tp->t_inpcb); 2918 2919 if (tp->t_flags & TF_DELACK) { 2920 tp->t_flags &= ~TF_DELACK; 2921 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2922 } 2923 INP_WUNLOCK(tp->t_inpcb); 2924 return; 2925 2926dropafterack: 2927 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 2928 ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); 2929 2930 /* 2931 * Generate an ACK dropping incoming segment if it occupies 2932 * sequence space, where the ACK reflects our state. 2933 * 2934 * We can now skip the test for the RST flag since all 2935 * paths to this code happen after packets containing 2936 * RST have been dropped. 2937 * 2938 * In the SYN-RECEIVED state, don't send an ACK unless the 2939 * segment we received passes the SYN-RECEIVED ACK test. 2940 * If it fails send a RST. This breaks the loop in the 2941 * "LAND" DoS attack, and also prevents an ACK storm 2942 * between two listening ports that have been sent forged 2943 * SYN segments, each with the source address of the other. 2944 */ 2945 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2946 (SEQ_GT(tp->snd_una, th->th_ack) || 2947 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2948 rstreason = BANDLIM_RST_OPENPORT; 2949 goto dropwithreset; 2950 } 2951#ifdef TCPDEBUG 2952 if (so->so_options & SO_DEBUG) 2953 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2954 &tcp_savetcp, 0); 2955#endif 2956 if (ti_locked == TI_RLOCKED) 2957 INP_INFO_RUNLOCK(&V_tcbinfo); 2958 else if (ti_locked == TI_WLOCKED) 2959 INP_INFO_WUNLOCK(&V_tcbinfo); 2960 else 2961 panic("%s: dropafterack epilogue ti_locked %d", __func__, 2962 ti_locked); 2963 ti_locked = TI_UNLOCKED; 2964 2965 tp->t_flags |= TF_ACKNOW; 2966 (void) tcp_output(tp); 2967 INP_WUNLOCK(tp->t_inpcb); 2968 m_freem(m); 2969 return; 2970 2971dropwithreset: 2972 if (ti_locked == TI_RLOCKED) 2973 INP_INFO_RUNLOCK(&V_tcbinfo); 2974 else if (ti_locked == TI_WLOCKED) 2975 INP_INFO_WUNLOCK(&V_tcbinfo); 2976 else 2977 panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); 2978 ti_locked = TI_UNLOCKED; 2979 2980 if (tp != NULL) { 2981 tcp_dropwithreset(m, th, tp, tlen, rstreason); 2982 INP_WUNLOCK(tp->t_inpcb); 2983 } else 2984 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 2985 return; 2986 2987drop: 2988 if (ti_locked == TI_RLOCKED) 2989 INP_INFO_RUNLOCK(&V_tcbinfo); 2990 else if (ti_locked == TI_WLOCKED) 2991 INP_INFO_WUNLOCK(&V_tcbinfo); 2992#ifdef INVARIANTS 2993 else 2994 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2995#endif 2996 ti_locked = TI_UNLOCKED; 2997 2998 /* 2999 * Drop space held by incoming segment and return. 3000 */ 3001#ifdef TCPDEBUG 3002 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 3003 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 3004 &tcp_savetcp, 0); 3005#endif 3006 if (tp != NULL) 3007 INP_WUNLOCK(tp->t_inpcb); 3008 m_freem(m); 3009} 3010 3011/* 3012 * Issue RST and make ACK acceptable to originator of segment. 3013 * The mbuf must still include the original packet header. 3014 * tp may be NULL. 3015 */ 3016static void 3017tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 3018 int tlen, int rstreason) 3019{ 3020 struct ip *ip; 3021#ifdef INET6 3022 struct ip6_hdr *ip6; 3023#endif 3024 3025 if (tp != NULL) { 3026 INP_WLOCK_ASSERT(tp->t_inpcb); 3027 } 3028 3029 /* Don't bother if destination was broadcast/multicast. */ 3030 if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 3031 goto drop; 3032#ifdef INET6 3033 if (mtod(m, struct ip *)->ip_v == 6) { 3034 ip6 = mtod(m, struct ip6_hdr *); 3035 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 3036 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 3037 goto drop; 3038 /* IPv6 anycast check is done at tcp6_input() */ 3039 } else 3040#endif 3041 { 3042 ip = mtod(m, struct ip *); 3043 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 3044 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 3045 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 3046 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 3047 goto drop; 3048 } 3049 3050 /* Perform bandwidth limiting. */ 3051 if (badport_bandlim(rstreason) < 0) 3052 goto drop; 3053 3054 /* tcp_respond consumes the mbuf chain. */ 3055 if (th->th_flags & TH_ACK) { 3056 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 3057 th->th_ack, TH_RST); 3058 } else { 3059 if (th->th_flags & TH_SYN) 3060 tlen++; 3061 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 3062 (tcp_seq)0, TH_RST|TH_ACK); 3063 } 3064 return; 3065drop: 3066 m_freem(m); 3067} 3068 3069/* 3070 * Parse TCP options and place in tcpopt. 3071 */ 3072static void 3073tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 3074{ 3075 int opt, optlen; 3076 3077 to->to_flags = 0; 3078 for (; cnt > 0; cnt -= optlen, cp += optlen) { 3079 opt = cp[0]; 3080 if (opt == TCPOPT_EOL) 3081 break; 3082 if (opt == TCPOPT_NOP) 3083 optlen = 1; 3084 else { 3085 if (cnt < 2) 3086 break; 3087 optlen = cp[1]; 3088 if (optlen < 2 || optlen > cnt) 3089 break; 3090 } 3091 switch (opt) { 3092 case TCPOPT_MAXSEG: 3093 if (optlen != TCPOLEN_MAXSEG) 3094 continue; 3095 if (!(flags & TO_SYN)) 3096 continue; 3097 to->to_flags |= TOF_MSS; 3098 bcopy((char *)cp + 2, 3099 (char *)&to->to_mss, sizeof(to->to_mss)); 3100 to->to_mss = ntohs(to->to_mss); 3101 break; 3102 case TCPOPT_WINDOW: 3103 if (optlen != TCPOLEN_WINDOW) 3104 continue; 3105 if (!(flags & TO_SYN)) 3106 continue; 3107 to->to_flags |= TOF_SCALE; 3108 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 3109 break; 3110 case TCPOPT_TIMESTAMP: 3111 if (optlen != TCPOLEN_TIMESTAMP) 3112 continue; 3113 to->to_flags |= TOF_TS; 3114 bcopy((char *)cp + 2, 3115 (char *)&to->to_tsval, sizeof(to->to_tsval)); 3116 to->to_tsval = ntohl(to->to_tsval); 3117 bcopy((char *)cp + 6, 3118 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 3119 to->to_tsecr = ntohl(to->to_tsecr); 3120 break; 3121#ifdef TCP_SIGNATURE 3122 /* 3123 * XXX In order to reply to a host which has set the 3124 * TCP_SIGNATURE option in its initial SYN, we have to 3125 * record the fact that the option was observed here 3126 * for the syncache code to perform the correct response. 3127 */ 3128 case TCPOPT_SIGNATURE: 3129 if (optlen != TCPOLEN_SIGNATURE) 3130 continue; 3131 to->to_flags |= TOF_SIGNATURE; 3132 to->to_signature = cp + 2; 3133 break; 3134#endif 3135 case TCPOPT_SACK_PERMITTED: 3136 if (optlen != TCPOLEN_SACK_PERMITTED) 3137 continue; 3138 if (!(flags & TO_SYN)) 3139 continue; 3140 if (!V_tcp_do_sack) 3141 continue; 3142 to->to_flags |= TOF_SACKPERM; 3143 break; 3144 case TCPOPT_SACK: 3145 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 3146 continue; 3147 if (flags & TO_SYN) 3148 continue; 3149 to->to_flags |= TOF_SACK; 3150 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 3151 to->to_sacks = cp + 2; 3152 TCPSTAT_INC(tcps_sack_rcv_blocks); 3153 break; 3154 default: 3155 continue; 3156 } 3157 } 3158} 3159 3160/* 3161 * Pull out of band byte out of a segment so 3162 * it doesn't appear in the user's data queue. 3163 * It is still reflected in the segment length for 3164 * sequencing purposes. 3165 */ 3166static void 3167tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 3168 int off) 3169{ 3170 int cnt = off + th->th_urp - 1; 3171 3172 while (cnt >= 0) { 3173 if (m->m_len > cnt) { 3174 char *cp = mtod(m, caddr_t) + cnt; 3175 struct tcpcb *tp = sototcpcb(so); 3176 3177 INP_WLOCK_ASSERT(tp->t_inpcb); 3178 3179 tp->t_iobc = *cp; 3180 tp->t_oobflags |= TCPOOB_HAVEDATA; 3181 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 3182 m->m_len--; 3183 if (m->m_flags & M_PKTHDR) 3184 m->m_pkthdr.len--; 3185 return; 3186 } 3187 cnt -= m->m_len; 3188 m = m->m_next; 3189 if (m == NULL) 3190 break; 3191 } 3192 panic("tcp_pulloutofband"); 3193} 3194 3195/* 3196 * Collect new round-trip time estimate 3197 * and update averages and current timeout. 3198 */ 3199static void 3200tcp_xmit_timer(struct tcpcb *tp, int rtt) 3201{ 3202 int delta; 3203 3204 INP_WLOCK_ASSERT(tp->t_inpcb); 3205 3206 TCPSTAT_INC(tcps_rttupdated); 3207 tp->t_rttupdated++; 3208 if (tp->t_srtt != 0) { 3209 /* 3210 * srtt is stored as fixed point with 5 bits after the 3211 * binary point (i.e., scaled by 8). The following magic 3212 * is equivalent to the smoothing algorithm in rfc793 with 3213 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 3214 * point). Adjust rtt to origin 0. 3215 */ 3216 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3217 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3218 3219 if ((tp->t_srtt += delta) <= 0) 3220 tp->t_srtt = 1; 3221 3222 /* 3223 * We accumulate a smoothed rtt variance (actually, a 3224 * smoothed mean difference), then set the retransmit 3225 * timer to smoothed rtt + 4 times the smoothed variance. 3226 * rttvar is stored as fixed point with 4 bits after the 3227 * binary point (scaled by 16). The following is 3228 * equivalent to rfc793 smoothing with an alpha of .75 3229 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 3230 * rfc793's wired-in beta. 3231 */ 3232 if (delta < 0) 3233 delta = -delta; 3234 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3235 if ((tp->t_rttvar += delta) <= 0) 3236 tp->t_rttvar = 1; 3237 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3238 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3239 } else { 3240 /* 3241 * No rtt measurement yet - use the unsmoothed rtt. 3242 * Set the variance to half the rtt (so our first 3243 * retransmit happens at 3*rtt). 3244 */ 3245 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3246 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3247 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3248 } 3249 tp->t_rtttime = 0; 3250 tp->t_rxtshift = 0; 3251 3252 /* 3253 * the retransmit should happen at rtt + 4 * rttvar. 3254 * Because of the way we do the smoothing, srtt and rttvar 3255 * will each average +1/2 tick of bias. When we compute 3256 * the retransmit timer, we want 1/2 tick of rounding and 3257 * 1 extra tick because of +-1/2 tick uncertainty in the 3258 * firing of the timer. The bias will give us exactly the 3259 * 1.5 tick we need. But, because the bias is 3260 * statistical, we have to test that we don't drop below 3261 * the minimum feasible timer (which is 2 ticks). 3262 */ 3263 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3264 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 3265 3266 /* 3267 * We received an ack for a packet that wasn't retransmitted; 3268 * it is probably safe to discard any error indications we've 3269 * received recently. This isn't quite right, but close enough 3270 * for now (a route might have failed after we sent a segment, 3271 * and the return path might not be symmetrical). 3272 */ 3273 tp->t_softerror = 0; 3274} 3275 3276/* 3277 * Determine a reasonable value for maxseg size. 3278 * If the route is known, check route for mtu. 3279 * If none, use an mss that can be handled on the outgoing 3280 * interface without forcing IP to fragment; if bigger than 3281 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 3282 * to utilize large mbufs. If no route is found, route has no mtu, 3283 * or the destination isn't local, use a default, hopefully conservative 3284 * size (usually 512 or the default IP max size, but no more than the mtu 3285 * of the interface), as we can't discover anything about intervening 3286 * gateways or networks. We also initialize the congestion/slow start 3287 * window to be a single segment if the destination isn't local. 3288 * While looking at the routing entry, we also initialize other path-dependent 3289 * parameters from pre-set or cached values in the routing entry. 3290 * 3291 * Also take into account the space needed for options that we 3292 * send regularly. Make maxseg shorter by that amount to assure 3293 * that we can send maxseg amount of data even when the options 3294 * are present. Store the upper limit of the length of options plus 3295 * data in maxopd. 3296 * 3297 * In case of T/TCP, we call this routine during implicit connection 3298 * setup as well (offer = -1), to initialize maxseg from the cached 3299 * MSS of our peer. 3300 * 3301 * NOTE that this routine is only called when we process an incoming 3302 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). 3303 */ 3304void 3305tcp_mss_update(struct tcpcb *tp, int offer, 3306 struct hc_metrics_lite *metricptr, int *mtuflags) 3307{ 3308 int mss; 3309 u_long maxmtu; 3310 struct inpcb *inp = tp->t_inpcb; 3311 struct hc_metrics_lite metrics; 3312 int origoffer = offer; 3313#ifdef INET6 3314 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 3315 size_t min_protoh = isipv6 ? 3316 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : 3317 sizeof (struct tcpiphdr); 3318#else 3319 const size_t min_protoh = sizeof(struct tcpiphdr); 3320#endif 3321 3322 INP_WLOCK_ASSERT(tp->t_inpcb); 3323 3324 /* Initialize. */ 3325#ifdef INET6 3326 if (isipv6) { 3327 maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags); 3328 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; 3329 } else 3330#endif 3331 { 3332 maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags); 3333 tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; 3334 } 3335 3336 /* 3337 * No route to sender, stay with default mss and return. 3338 */ 3339 if (maxmtu == 0) { 3340 /* 3341 * In case we return early we need to initialize metrics 3342 * to a defined state as tcp_hc_get() would do for us 3343 * if there was no cache hit. 3344 */ 3345 if (metricptr != NULL) 3346 bzero(metricptr, sizeof(struct hc_metrics_lite)); 3347 return; 3348 } 3349 3350 /* What have we got? */ 3351 switch (offer) { 3352 case 0: 3353 /* 3354 * Offer == 0 means that there was no MSS on the SYN 3355 * segment, in this case we use tcp_mssdflt as 3356 * already assigned to t_maxopd above. 3357 */ 3358 offer = tp->t_maxopd; 3359 break; 3360 3361 case -1: 3362 /* 3363 * Offer == -1 means that we didn't receive SYN yet. 3364 */ 3365 /* FALLTHROUGH */ 3366 3367 default: 3368 /* 3369 * Prevent DoS attack with too small MSS. Round up 3370 * to at least minmss. 3371 */ 3372 offer = max(offer, V_tcp_minmss); 3373 } 3374 3375 /* 3376 * rmx information is now retrieved from tcp_hostcache. 3377 */ 3378 tcp_hc_get(&inp->inp_inc, &metrics); 3379 if (metricptr != NULL) 3380 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); 3381 3382 /* 3383 * If there's a discovered mtu int tcp hostcache, use it 3384 * else, use the link mtu. 3385 */ 3386 if (metrics.rmx_mtu) 3387 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; 3388 else { 3389#ifdef INET6 3390 if (isipv6) { 3391 mss = maxmtu - min_protoh; 3392 if (!V_path_mtu_discovery && 3393 !in6_localaddr(&inp->in6p_faddr)) 3394 mss = min(mss, V_tcp_v6mssdflt); 3395 } else 3396#endif 3397 { 3398 mss = maxmtu - min_protoh; 3399 if (!V_path_mtu_discovery && 3400 !in_localaddr(inp->inp_faddr)) 3401 mss = min(mss, V_tcp_mssdflt); 3402 } 3403 /* 3404 * XXX - The above conditional (mss = maxmtu - min_protoh) 3405 * probably violates the TCP spec. 3406 * The problem is that, since we don't know the 3407 * other end's MSS, we are supposed to use a conservative 3408 * default. But, if we do that, then MTU discovery will 3409 * never actually take place, because the conservative 3410 * default is much less than the MTUs typically seen 3411 * on the Internet today. For the moment, we'll sweep 3412 * this under the carpet. 3413 * 3414 * The conservative default might not actually be a problem 3415 * if the only case this occurs is when sending an initial 3416 * SYN with options and data to a host we've never talked 3417 * to before. Then, they will reply with an MSS value which 3418 * will get recorded and the new parameters should get 3419 * recomputed. For Further Study. 3420 */ 3421 } 3422 mss = min(mss, offer); 3423 3424 /* 3425 * Sanity check: make sure that maxopd will be large 3426 * enough to allow some data on segments even if the 3427 * all the option space is used (40bytes). Otherwise 3428 * funny things may happen in tcp_output. 3429 */ 3430 mss = max(mss, 64); 3431 3432 /* 3433 * maxopd stores the maximum length of data AND options 3434 * in a segment; maxseg is the amount of data in a normal 3435 * segment. We need to store this value (maxopd) apart 3436 * from maxseg, because now every segment carries options 3437 * and thus we normally have somewhat less data in segments. 3438 */ 3439 tp->t_maxopd = mss; 3440 3441 /* 3442 * origoffer==-1 indicates that no segments were received yet. 3443 * In this case we just guess. 3444 */ 3445 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3446 (origoffer == -1 || 3447 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 3448 mss -= TCPOLEN_TSTAMP_APPA; 3449 3450#if (MCLBYTES & (MCLBYTES - 1)) == 0 3451 if (mss > MCLBYTES) 3452 mss &= ~(MCLBYTES-1); 3453#else 3454 if (mss > MCLBYTES) 3455 mss = mss / MCLBYTES * MCLBYTES; 3456#endif 3457 tp->t_maxseg = mss; 3458} 3459 3460void 3461tcp_mss(struct tcpcb *tp, int offer) 3462{ 3463 int mss; 3464 u_long bufsize; 3465 struct inpcb *inp; 3466 struct socket *so; 3467 struct hc_metrics_lite metrics; 3468 int mtuflags = 0; 3469 3470 KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); 3471 3472 tcp_mss_update(tp, offer, &metrics, &mtuflags); 3473 3474 mss = tp->t_maxseg; 3475 inp = tp->t_inpcb; 3476 3477 /* 3478 * If there's a pipesize, change the socket buffer to that size, 3479 * don't change if sb_hiwat is different than default (then it 3480 * has been changed on purpose with setsockopt). 3481 * Make the socket buffers an integral number of mss units; 3482 * if the mss is larger than the socket buffer, decrease the mss. 3483 */ 3484 so = inp->inp_socket; 3485 SOCKBUF_LOCK(&so->so_snd); 3486 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) 3487 bufsize = metrics.rmx_sendpipe; 3488 else 3489 bufsize = so->so_snd.sb_hiwat; 3490 if (bufsize < mss) 3491 mss = bufsize; 3492 else { 3493 bufsize = roundup(bufsize, mss); 3494 if (bufsize > sb_max) 3495 bufsize = sb_max; 3496 if (bufsize > so->so_snd.sb_hiwat) 3497 (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); 3498 } 3499 SOCKBUF_UNLOCK(&so->so_snd); 3500 tp->t_maxseg = mss; 3501 3502 SOCKBUF_LOCK(&so->so_rcv); 3503 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) 3504 bufsize = metrics.rmx_recvpipe; 3505 else 3506 bufsize = so->so_rcv.sb_hiwat; 3507 if (bufsize > mss) { 3508 bufsize = roundup(bufsize, mss); 3509 if (bufsize > sb_max) 3510 bufsize = sb_max; 3511 if (bufsize > so->so_rcv.sb_hiwat) 3512 (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); 3513 } 3514 SOCKBUF_UNLOCK(&so->so_rcv); 3515 3516 /* Check the interface for TSO capabilities. */ 3517 if (mtuflags & CSUM_TSO) 3518 tp->t_flags |= TF_TSO; 3519} 3520 3521/* 3522 * Determine the MSS option to send on an outgoing SYN. 3523 */ 3524int 3525tcp_mssopt(struct in_conninfo *inc) 3526{ 3527 int mss = 0; 3528 u_long maxmtu = 0; 3529 u_long thcmtu = 0; 3530 size_t min_protoh; 3531 3532 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 3533 3534#ifdef INET6 3535 if (inc->inc_flags & INC_ISIPV6) { 3536 mss = V_tcp_v6mssdflt; 3537 maxmtu = tcp_maxmtu6(inc, NULL); 3538 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3539 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 3540 } else 3541#endif 3542 { 3543 mss = V_tcp_mssdflt; 3544 maxmtu = tcp_maxmtu(inc, NULL); 3545 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3546 min_protoh = sizeof(struct tcpiphdr); 3547 } 3548 if (maxmtu && thcmtu) 3549 mss = min(maxmtu, thcmtu) - min_protoh; 3550 else if (maxmtu || thcmtu) 3551 mss = max(maxmtu, thcmtu) - min_protoh; 3552 3553 return (mss); 3554} 3555 3556 3557/* 3558 * On a partial ack arrives, force the retransmission of the 3559 * next unacknowledged segment. Do not clear tp->t_dupacks. 3560 * By setting snd_nxt to ti_ack, this forces retransmission timer to 3561 * be started again. 3562 */ 3563static void 3564tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 3565{ 3566 tcp_seq onxt = tp->snd_nxt; 3567 u_long ocwnd = tp->snd_cwnd; 3568 3569 INP_WLOCK_ASSERT(tp->t_inpcb); 3570 3571 tcp_timer_activate(tp, TT_REXMT, 0); 3572 tp->t_rtttime = 0; 3573 tp->snd_nxt = th->th_ack; 3574 /* 3575 * Set snd_cwnd to one segment beyond acknowledged offset. 3576 * (tp->snd_una has not yet been updated when this function is called.) 3577 */ 3578 tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); 3579 tp->t_flags |= TF_ACKNOW; 3580 (void) tcp_output(tp); 3581 tp->snd_cwnd = ocwnd; 3582 if (SEQ_GT(onxt, tp->snd_nxt)) 3583 tp->snd_nxt = onxt; 3584 /* 3585 * Partial window deflation. Relies on fact that tp->snd_una 3586 * not updated yet. 3587 */ 3588 if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) 3589 tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); 3590 else 3591 tp->snd_cwnd = 0; 3592 tp->snd_cwnd += tp->t_maxseg; 3593} 3594