tcp_input.c revision 226060
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (c) 2007-2008,2010 5 * Swinburne University of Technology, Melbourne, Australia. 6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7 * Copyright (c) 2010 The FreeBSD Foundation 8 * Copyright (c) 2010-2011 Juniper Networks, Inc. 9 * All rights reserved. 10 * 11 * Portions of this software were developed at the Centre for Advanced Internet 12 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 13 * James Healy and David Hayes, made possible in part by a grant from the Cisco 14 * University Research Program Fund at Community Foundation Silicon Valley. 15 * 16 * Portions of this software were developed at the Centre for Advanced 17 * Internet Architectures, Swinburne University of Technology, Melbourne, 18 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 19 * 20 * Portions of this software were developed by Robert N. M. Watson under 21 * contract to Juniper Networks, Inc. 22 * 23 * Redistribution and use in source and binary forms, with or without 24 * modification, are permitted provided that the following conditions 25 * are met: 26 * 1. Redistributions of source code must retain the above copyright 27 * notice, this list of conditions and the following disclaimer. 28 * 2. Redistributions in binary form must reproduce the above copyright 29 * notice, this list of conditions and the following disclaimer in the 30 * documentation and/or other materials provided with the distribution. 31 * 4. Neither the name of the University nor the names of its contributors 32 * may be used to endorse or promote products derived from this software 33 * without specific prior written permission. 34 * 35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 45 * SUCH DAMAGE. 46 * 47 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 48 */ 49 50#include <sys/cdefs.h> 51__FBSDID("$FreeBSD: head/sys/netinet/tcp_input.c 226060 2011-10-06 14:29:38Z attilio $"); 52 53#include "opt_ipfw.h" /* for ipfw_fwd */ 54#include "opt_inet.h" 55#include "opt_inet6.h" 56#include "opt_ipsec.h" 57#include "opt_tcpdebug.h" 58 59#include <sys/param.h> 60#include <sys/kernel.h> 61#include <sys/hhook.h> 62#include <sys/malloc.h> 63#include <sys/mbuf.h> 64#include <sys/proc.h> /* for proc0 declaration */ 65#include <sys/protosw.h> 66#include <sys/signalvar.h> 67#include <sys/socket.h> 68#include <sys/socketvar.h> 69#include <sys/sysctl.h> 70#include <sys/syslog.h> 71#include <sys/systm.h> 72 73#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 74 75#include <vm/uma.h> 76 77#include <net/if.h> 78#include <net/route.h> 79#include <net/vnet.h> 80 81#define TCPSTATES /* for logging */ 82 83#include <netinet/cc.h> 84#include <netinet/in.h> 85#include <netinet/in_pcb.h> 86#include <netinet/in_systm.h> 87#include <netinet/in_var.h> 88#include <netinet/ip.h> 89#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 90#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 91#include <netinet/ip_var.h> 92#include <netinet/ip_options.h> 93#include <netinet/ip6.h> 94#include <netinet/icmp6.h> 95#include <netinet6/in6_pcb.h> 96#include <netinet6/ip6_var.h> 97#include <netinet6/nd6.h> 98#include <netinet/tcp_fsm.h> 99#include <netinet/tcp_seq.h> 100#include <netinet/tcp_timer.h> 101#include <netinet/tcp_var.h> 102#include <netinet6/tcp6_var.h> 103#include <netinet/tcpip.h> 104#include <netinet/tcp_syncache.h> 105#ifdef TCPDEBUG 106#include <netinet/tcp_debug.h> 107#endif /* TCPDEBUG */ 108 109#ifdef IPSEC 110#include <netipsec/ipsec.h> 111#include <netipsec/ipsec6.h> 112#endif /*IPSEC*/ 113 114#include <machine/in_cksum.h> 115 116#include <security/mac/mac_framework.h> 117 118const int tcprexmtthresh = 3; 119 120VNET_DEFINE(struct tcpstat, tcpstat); 121SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, 122 &VNET_NAME(tcpstat), tcpstat, 123 "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); 124 125int tcp_log_in_vain = 0; 126SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, 127 &tcp_log_in_vain, 0, 128 "Log all incoming TCP segments to closed ports"); 129 130VNET_DEFINE(int, blackhole) = 0; 131#define V_blackhole VNET(blackhole) 132SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, 133 &VNET_NAME(blackhole), 0, 134 "Do not send RST on segments to closed ports"); 135 136VNET_DEFINE(int, tcp_delack_enabled) = 1; 137SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, 138 &VNET_NAME(tcp_delack_enabled), 0, 139 "Delay ACK to try and piggyback it onto a data packet"); 140 141VNET_DEFINE(int, drop_synfin) = 0; 142#define V_drop_synfin VNET(drop_synfin) 143SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, 144 &VNET_NAME(drop_synfin), 0, 145 "Drop TCP packets with SYN+FIN set"); 146 147VNET_DEFINE(int, tcp_do_rfc3042) = 1; 148#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) 149SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, 150 &VNET_NAME(tcp_do_rfc3042), 0, 151 "Enable RFC 3042 (Limited Transmit)"); 152 153VNET_DEFINE(int, tcp_do_rfc3390) = 1; 154SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, 155 &VNET_NAME(tcp_do_rfc3390), 0, 156 "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); 157 158VNET_DEFINE(int, tcp_do_rfc3465) = 1; 159SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, 160 &VNET_NAME(tcp_do_rfc3465), 0, 161 "Enable RFC 3465 (Appropriate Byte Counting)"); 162 163VNET_DEFINE(int, tcp_abc_l_var) = 2; 164SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, 165 &VNET_NAME(tcp_abc_l_var), 2, 166 "Cap the max cwnd increment during slow-start to this number of segments"); 167 168SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); 169 170VNET_DEFINE(int, tcp_do_ecn) = 0; 171SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, 172 &VNET_NAME(tcp_do_ecn), 0, 173 "TCP ECN support"); 174 175VNET_DEFINE(int, tcp_ecn_maxretries) = 1; 176SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, 177 &VNET_NAME(tcp_ecn_maxretries), 0, 178 "Max retries before giving up on ECN"); 179 180VNET_DEFINE(int, tcp_insecure_rst) = 0; 181#define V_tcp_insecure_rst VNET(tcp_insecure_rst) 182SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, 183 &VNET_NAME(tcp_insecure_rst), 0, 184 "Follow the old (insecure) criteria for accepting RST packets"); 185 186VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; 187#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 188SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, 189 &VNET_NAME(tcp_do_autorcvbuf), 0, 190 "Enable automatic receive buffer sizing"); 191 192VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; 193#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 194SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, 195 &VNET_NAME(tcp_autorcvbuf_inc), 0, 196 "Incrementor step size of automatic receive buffer"); 197 198VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; 199#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 200SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, 201 &VNET_NAME(tcp_autorcvbuf_max), 0, 202 "Max size of automatic receive buffer"); 203 204VNET_DEFINE(struct inpcbhead, tcb); 205#define tcb6 tcb /* for KAME src sync over BSD*'s */ 206VNET_DEFINE(struct inpcbinfo, tcbinfo); 207 208static void tcp_dooptions(struct tcpopt *, u_char *, int, int); 209static void tcp_do_segment(struct mbuf *, struct tcphdr *, 210 struct socket *, struct tcpcb *, int, int, uint8_t, 211 int); 212static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, 213 struct tcpcb *, int, int); 214static void tcp_pulloutofband(struct socket *, 215 struct tcphdr *, struct mbuf *, int); 216static void tcp_xmit_timer(struct tcpcb *, int); 217static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); 218static void inline tcp_fields_to_host(struct tcphdr *); 219#ifdef TCP_SIGNATURE 220static void inline tcp_fields_to_net(struct tcphdr *); 221static int inline tcp_signature_verify_input(struct mbuf *, int, int, 222 int, struct tcpopt *, struct tcphdr *, u_int); 223#endif 224static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, 225 uint16_t type); 226static void inline cc_conn_init(struct tcpcb *tp); 227static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); 228static void inline hhook_run_tcp_est_in(struct tcpcb *tp, 229 struct tcphdr *th, struct tcpopt *to); 230 231/* 232 * Kernel module interface for updating tcpstat. The argument is an index 233 * into tcpstat treated as an array of u_long. While this encodes the 234 * general layout of tcpstat into the caller, it doesn't encode its location, 235 * so that future changes to add, for example, per-CPU stats support won't 236 * cause binary compatibility problems for kernel modules. 237 */ 238void 239kmod_tcpstat_inc(int statnum) 240{ 241 242 (*((u_long *)&V_tcpstat + statnum))++; 243} 244 245/* 246 * Wrapper for the TCP established input helper hook. 247 */ 248static void inline 249hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) 250{ 251 struct tcp_hhook_data hhook_data; 252 253 if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { 254 hhook_data.tp = tp; 255 hhook_data.th = th; 256 hhook_data.to = to; 257 258 hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, 259 tp->osd); 260 } 261} 262 263/* 264 * CC wrapper hook functions 265 */ 266static void inline 267cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) 268{ 269 INP_WLOCK_ASSERT(tp->t_inpcb); 270 271 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 272 if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd)) 273 tp->ccv->flags |= CCF_CWND_LIMITED; 274 else 275 tp->ccv->flags &= ~CCF_CWND_LIMITED; 276 277 if (type == CC_ACK) { 278 if (tp->snd_cwnd > tp->snd_ssthresh) { 279 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 280 V_tcp_abc_l_var * tp->t_maxseg); 281 if (tp->t_bytes_acked >= tp->snd_cwnd) { 282 tp->t_bytes_acked -= tp->snd_cwnd; 283 tp->ccv->flags |= CCF_ABC_SENTAWND; 284 } 285 } else { 286 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 287 tp->t_bytes_acked = 0; 288 } 289 } 290 291 if (CC_ALGO(tp)->ack_received != NULL) { 292 /* XXXLAS: Find a way to live without this */ 293 tp->ccv->curack = th->th_ack; 294 CC_ALGO(tp)->ack_received(tp->ccv, type); 295 } 296} 297 298static void inline 299cc_conn_init(struct tcpcb *tp) 300{ 301 struct hc_metrics_lite metrics; 302 struct inpcb *inp = tp->t_inpcb; 303 int rtt; 304#ifdef INET6 305 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 306#endif 307 308 INP_WLOCK_ASSERT(tp->t_inpcb); 309 310 tcp_hc_get(&inp->inp_inc, &metrics); 311 312 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { 313 tp->t_srtt = rtt; 314 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; 315 TCPSTAT_INC(tcps_usedrtt); 316 if (metrics.rmx_rttvar) { 317 tp->t_rttvar = metrics.rmx_rttvar; 318 TCPSTAT_INC(tcps_usedrttvar); 319 } else { 320 /* default variation is +- 1 rtt */ 321 tp->t_rttvar = 322 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 323 } 324 TCPT_RANGESET(tp->t_rxtcur, 325 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 326 tp->t_rttmin, TCPTV_REXMTMAX); 327 } 328 if (metrics.rmx_ssthresh) { 329 /* 330 * There's some sort of gateway or interface 331 * buffer limit on the path. Use this to set 332 * the slow start threshhold, but set the 333 * threshold to no less than 2*mss. 334 */ 335 tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); 336 TCPSTAT_INC(tcps_usedssthresh); 337 } 338 339 /* 340 * Set the slow-start flight size depending on whether this 341 * is a local network or not. 342 * 343 * Extend this so we cache the cwnd too and retrieve it here. 344 * Make cwnd even bigger than RFC3390 suggests but only if we 345 * have previous experience with the remote host. Be careful 346 * not make cwnd bigger than remote receive window or our own 347 * send socket buffer. Maybe put some additional upper bound 348 * on the retrieved cwnd. Should do incremental updates to 349 * hostcache when cwnd collapses so next connection doesn't 350 * overloads the path again. 351 * 352 * XXXAO: Initializing the CWND from the hostcache is broken 353 * and in its current form not RFC conformant. It is disabled 354 * until fixed or removed entirely. 355 * 356 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. 357 * We currently check only in syncache_socket for that. 358 */ 359/* #define TCP_METRICS_CWND */ 360#ifdef TCP_METRICS_CWND 361 if (metrics.rmx_cwnd) 362 tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2, 363 min(tp->snd_wnd, so->so_snd.sb_hiwat))); 364 else 365#endif 366 if (V_tcp_do_rfc3390) 367 tp->snd_cwnd = min(4 * tp->t_maxseg, 368 max(2 * tp->t_maxseg, 4380)); 369#ifdef INET6 370 else if (isipv6 && in6_localaddr(&inp->in6p_faddr)) 371 tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; 372#endif 373#if defined(INET) && defined(INET6) 374 else if (!isipv6 && in_localaddr(inp->inp_faddr)) 375 tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; 376#endif 377#ifdef INET 378 else if (in_localaddr(inp->inp_faddr)) 379 tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; 380#endif 381 else 382 tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz; 383 384 if (CC_ALGO(tp)->conn_init != NULL) 385 CC_ALGO(tp)->conn_init(tp->ccv); 386} 387 388void inline 389cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 390{ 391 INP_WLOCK_ASSERT(tp->t_inpcb); 392 393 switch(type) { 394 case CC_NDUPACK: 395 if (!IN_FASTRECOVERY(tp->t_flags)) { 396 tp->snd_recover = tp->snd_max; 397 if (tp->t_flags & TF_ECN_PERMIT) 398 tp->t_flags |= TF_ECN_SND_CWR; 399 } 400 break; 401 case CC_ECN: 402 if (!IN_CONGRECOVERY(tp->t_flags)) { 403 TCPSTAT_INC(tcps_ecn_rcwnd); 404 tp->snd_recover = tp->snd_max; 405 if (tp->t_flags & TF_ECN_PERMIT) 406 tp->t_flags |= TF_ECN_SND_CWR; 407 } 408 break; 409 case CC_RTO: 410 tp->t_dupacks = 0; 411 tp->t_bytes_acked = 0; 412 EXIT_RECOVERY(tp->t_flags); 413 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / 414 tp->t_maxseg) * tp->t_maxseg; 415 tp->snd_cwnd = tp->t_maxseg; 416 break; 417 case CC_RTO_ERR: 418 TCPSTAT_INC(tcps_sndrexmitbad); 419 /* RTO was unnecessary, so reset everything. */ 420 tp->snd_cwnd = tp->snd_cwnd_prev; 421 tp->snd_ssthresh = tp->snd_ssthresh_prev; 422 tp->snd_recover = tp->snd_recover_prev; 423 if (tp->t_flags & TF_WASFRECOVERY) 424 ENTER_FASTRECOVERY(tp->t_flags); 425 if (tp->t_flags & TF_WASCRECOVERY) 426 ENTER_CONGRECOVERY(tp->t_flags); 427 tp->snd_nxt = tp->snd_max; 428 tp->t_flags &= ~TF_PREVVALID; 429 tp->t_badrxtwin = 0; 430 break; 431 } 432 433 if (CC_ALGO(tp)->cong_signal != NULL) { 434 if (th != NULL) 435 tp->ccv->curack = th->th_ack; 436 CC_ALGO(tp)->cong_signal(tp->ccv, type); 437 } 438} 439 440static void inline 441cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) 442{ 443 INP_WLOCK_ASSERT(tp->t_inpcb); 444 445 /* XXXLAS: KASSERT that we're in recovery? */ 446 447 if (CC_ALGO(tp)->post_recovery != NULL) { 448 tp->ccv->curack = th->th_ack; 449 CC_ALGO(tp)->post_recovery(tp->ccv); 450 } 451 /* XXXLAS: EXIT_RECOVERY ? */ 452 tp->t_bytes_acked = 0; 453} 454 455static inline void 456tcp_fields_to_host(struct tcphdr *th) 457{ 458 459 th->th_seq = ntohl(th->th_seq); 460 th->th_ack = ntohl(th->th_ack); 461 th->th_win = ntohs(th->th_win); 462 th->th_urp = ntohs(th->th_urp); 463} 464 465#ifdef TCP_SIGNATURE 466static inline void 467tcp_fields_to_net(struct tcphdr *th) 468{ 469 470 th->th_seq = htonl(th->th_seq); 471 th->th_ack = htonl(th->th_ack); 472 th->th_win = htons(th->th_win); 473 th->th_urp = htons(th->th_urp); 474} 475 476static inline int 477tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, 478 struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) 479{ 480 int ret; 481 482 tcp_fields_to_net(th); 483 ret = tcp_signature_verify(m, off0, tlen, optlen, to, th, tcpbflag); 484 tcp_fields_to_host(th); 485 return (ret); 486} 487#endif 488 489/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ 490#ifdef INET6 491#define ND6_HINT(tp) \ 492do { \ 493 if ((tp) && (tp)->t_inpcb && \ 494 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ 495 nd6_nud_hint(NULL, NULL, 0); \ 496} while (0) 497#else 498#define ND6_HINT(tp) 499#endif 500 501/* 502 * Indicate whether this ack should be delayed. We can delay the ack if 503 * - there is no delayed ack timer in progress and 504 * - our last ack wasn't a 0-sized window. We never want to delay 505 * the ack that opens up a 0-sized window and 506 * - delayed acks are enabled or 507 * - this is a half-synchronized T/TCP connection. 508 */ 509#define DELAY_ACK(tp) \ 510 ((!tcp_timer_active(tp, TT_DELACK) && \ 511 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 512 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 513 514/* 515 * TCP input handling is split into multiple parts: 516 * tcp6_input is a thin wrapper around tcp_input for the extended 517 * ip6_protox[] call format in ip6_input 518 * tcp_input handles primary segment validation, inpcb lookup and 519 * SYN processing on listen sockets 520 * tcp_do_segment processes the ACK and text of the segment for 521 * establishing, established and closing connections 522 */ 523#ifdef INET6 524int 525tcp6_input(struct mbuf **mp, int *offp, int proto) 526{ 527 struct mbuf *m = *mp; 528 struct in6_ifaddr *ia6; 529 530 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); 531 532 /* 533 * draft-itojun-ipv6-tcp-to-anycast 534 * better place to put this in? 535 */ 536 ia6 = ip6_getdstifaddr(m); 537 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 538 struct ip6_hdr *ip6; 539 540 ifa_free(&ia6->ia_ifa); 541 ip6 = mtod(m, struct ip6_hdr *); 542 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 543 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 544 return IPPROTO_DONE; 545 } 546 547 tcp_input(m, *offp); 548 return IPPROTO_DONE; 549} 550#endif /* INET6 */ 551 552void 553tcp_input(struct mbuf *m, int off0) 554{ 555 struct tcphdr *th = NULL; 556 struct ip *ip = NULL; 557#ifdef INET 558 struct ipovly *ipov; 559#endif 560 struct inpcb *inp = NULL; 561 struct tcpcb *tp = NULL; 562 struct socket *so = NULL; 563 u_char *optp = NULL; 564 int optlen = 0; 565#ifdef INET 566 int len; 567#endif 568 int tlen = 0, off; 569 int drop_hdrlen; 570 int thflags; 571 int rstreason = 0; /* For badport_bandlim accounting purposes */ 572#ifdef TCP_SIGNATURE 573 uint8_t sig_checked = 0; 574#endif 575 uint8_t iptos = 0; 576#ifdef IPFIREWALL_FORWARD 577 struct m_tag *fwd_tag; 578#endif 579#ifdef INET6 580 struct ip6_hdr *ip6 = NULL; 581 int isipv6; 582#else 583 const void *ip6 = NULL; 584#endif /* INET6 */ 585 struct tcpopt to; /* options in this segment */ 586 char *s = NULL; /* address and port logging */ 587 int ti_locked; 588#define TI_UNLOCKED 1 589#define TI_WLOCKED 2 590 591#ifdef TCPDEBUG 592 /* 593 * The size of tcp_saveipgen must be the size of the max ip header, 594 * now IPv6. 595 */ 596 u_char tcp_saveipgen[IP6_HDR_LEN]; 597 struct tcphdr tcp_savetcp; 598 short ostate = 0; 599#endif 600 601#ifdef INET6 602 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 603#endif 604 605 to.to_flags = 0; 606 TCPSTAT_INC(tcps_rcvtotal); 607 608#ifdef INET6 609 if (isipv6) { 610 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ 611 ip6 = mtod(m, struct ip6_hdr *); 612 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 613 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 614 TCPSTAT_INC(tcps_rcvbadsum); 615 goto drop; 616 } 617 th = (struct tcphdr *)((caddr_t)ip6 + off0); 618 619 /* 620 * Be proactive about unspecified IPv6 address in source. 621 * As we use all-zero to indicate unbounded/unconnected pcb, 622 * unspecified IPv6 address can be used to confuse us. 623 * 624 * Note that packets with unspecified IPv6 destination is 625 * already dropped in ip6_input. 626 */ 627 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 628 /* XXX stat */ 629 goto drop; 630 } 631 } 632#endif 633#if defined(INET) && defined(INET6) 634 else 635#endif 636#ifdef INET 637 { 638 /* 639 * Get IP and TCP header together in first mbuf. 640 * Note: IP leaves IP header in first mbuf. 641 */ 642 if (off0 > sizeof (struct ip)) { 643 ip_stripoptions(m, (struct mbuf *)0); 644 off0 = sizeof(struct ip); 645 } 646 if (m->m_len < sizeof (struct tcpiphdr)) { 647 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 648 == NULL) { 649 TCPSTAT_INC(tcps_rcvshort); 650 return; 651 } 652 } 653 ip = mtod(m, struct ip *); 654 ipov = (struct ipovly *)ip; 655 th = (struct tcphdr *)((caddr_t)ip + off0); 656 tlen = ip->ip_len; 657 658 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 659 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 660 th->th_sum = m->m_pkthdr.csum_data; 661 else 662 th->th_sum = in_pseudo(ip->ip_src.s_addr, 663 ip->ip_dst.s_addr, 664 htonl(m->m_pkthdr.csum_data + 665 ip->ip_len + 666 IPPROTO_TCP)); 667 th->th_sum ^= 0xffff; 668#ifdef TCPDEBUG 669 ipov->ih_len = (u_short)tlen; 670 ipov->ih_len = htons(ipov->ih_len); 671#endif 672 } else { 673 /* 674 * Checksum extended TCP header and data. 675 */ 676 len = sizeof (struct ip) + tlen; 677 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 678 ipov->ih_len = (u_short)tlen; 679 ipov->ih_len = htons(ipov->ih_len); 680 th->th_sum = in_cksum(m, len); 681 } 682 if (th->th_sum) { 683 TCPSTAT_INC(tcps_rcvbadsum); 684 goto drop; 685 } 686 /* Re-initialization for later version check */ 687 ip->ip_v = IPVERSION; 688 } 689#endif /* INET */ 690 691#ifdef INET6 692 if (isipv6) 693 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 694#endif 695#if defined(INET) && defined(INET6) 696 else 697#endif 698#ifdef INET 699 iptos = ip->ip_tos; 700#endif 701 702 /* 703 * Check that TCP offset makes sense, 704 * pull out TCP options and adjust length. XXX 705 */ 706 off = th->th_off << 2; 707 if (off < sizeof (struct tcphdr) || off > tlen) { 708 TCPSTAT_INC(tcps_rcvbadoff); 709 goto drop; 710 } 711 tlen -= off; /* tlen is used instead of ti->ti_len */ 712 if (off > sizeof (struct tcphdr)) { 713#ifdef INET6 714 if (isipv6) { 715 IP6_EXTHDR_CHECK(m, off0, off, ); 716 ip6 = mtod(m, struct ip6_hdr *); 717 th = (struct tcphdr *)((caddr_t)ip6 + off0); 718 } 719#endif 720#if defined(INET) && defined(INET6) 721 else 722#endif 723#ifdef INET 724 { 725 if (m->m_len < sizeof(struct ip) + off) { 726 if ((m = m_pullup(m, sizeof (struct ip) + off)) 727 == NULL) { 728 TCPSTAT_INC(tcps_rcvshort); 729 return; 730 } 731 ip = mtod(m, struct ip *); 732 ipov = (struct ipovly *)ip; 733 th = (struct tcphdr *)((caddr_t)ip + off0); 734 } 735 } 736#endif 737 optlen = off - sizeof (struct tcphdr); 738 optp = (u_char *)(th + 1); 739 } 740 thflags = th->th_flags; 741 742 /* 743 * Convert TCP protocol specific fields to host format. 744 */ 745 tcp_fields_to_host(th); 746 747 /* 748 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 749 */ 750 drop_hdrlen = off0 + off; 751 752 /* 753 * Locate pcb for segment; if we're likely to add or remove a 754 * connection then first acquire pcbinfo lock. There are two cases 755 * where we might discover later we need a write lock despite the 756 * flags: ACKs moving a connection out of the syncache, and ACKs for 757 * a connection in TIMEWAIT. 758 */ 759 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { 760 INP_INFO_WLOCK(&V_tcbinfo); 761 ti_locked = TI_WLOCKED; 762 } else 763 ti_locked = TI_UNLOCKED; 764 765findpcb: 766#ifdef INVARIANTS 767 if (ti_locked == TI_WLOCKED) { 768 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 769 } else { 770 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 771 } 772#endif 773 774#ifdef IPFIREWALL_FORWARD 775 /* 776 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 777 */ 778 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 779#endif /* IPFIREWALL_FORWARD */ 780 781#ifdef INET6 782#ifdef IPFIREWALL_FORWARD 783 if (isipv6 && fwd_tag != NULL) { 784 struct sockaddr_in6 *next_hop6; 785 786 next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); 787 /* 788 * Transparently forwarded. Pretend to be the destination. 789 * Already got one like this? 790 */ 791 inp = in6_pcblookup_mbuf(&V_tcbinfo, 792 &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 793 INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); 794 if (!inp) { 795 /* 796 * It's new. Try to find the ambushing socket. 797 * Because we've rewritten the destination address, 798 * any hardware-generated hash is ignored. 799 */ 800 inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, 801 th->th_sport, &next_hop6->sin6_addr, 802 next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : 803 th->th_dport, INPLOOKUP_WILDCARD | 804 INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); 805 } 806 /* Remove the tag from the packet. We don't need it anymore. */ 807 m_tag_delete(m, fwd_tag); 808 } else 809#endif /* IPFIREWALL_FORWARD */ 810 if (isipv6) { 811 inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, 812 th->th_sport, &ip6->ip6_dst, th->th_dport, 813 INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, 814 m->m_pkthdr.rcvif, m); 815 } 816#endif /* INET6 */ 817#if defined(INET6) && defined(INET) 818 else 819#endif 820#ifdef INET 821#ifdef IPFIREWALL_FORWARD 822 if (fwd_tag != NULL) { 823 struct sockaddr_in *next_hop; 824 825 next_hop = (struct sockaddr_in *)(fwd_tag+1); 826 /* 827 * Transparently forwarded. Pretend to be the destination. 828 * already got one like this? 829 */ 830 inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, 831 ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, 832 m->m_pkthdr.rcvif, m); 833 if (!inp) { 834 /* 835 * It's new. Try to find the ambushing socket. 836 * Because we've rewritten the destination address, 837 * any hardware-generated hash is ignored. 838 */ 839 inp = in_pcblookup(&V_tcbinfo, ip->ip_src, 840 th->th_sport, next_hop->sin_addr, 841 next_hop->sin_port ? ntohs(next_hop->sin_port) : 842 th->th_dport, INPLOOKUP_WILDCARD | 843 INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); 844 } 845 /* Remove the tag from the packet. We don't need it anymore. */ 846 m_tag_delete(m, fwd_tag); 847 } else 848#endif /* IPFIREWALL_FORWARD */ 849 inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, 850 th->th_sport, ip->ip_dst, th->th_dport, 851 INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, 852 m->m_pkthdr.rcvif, m); 853#endif /* INET */ 854 855 /* 856 * If the INPCB does not exist then all data in the incoming 857 * segment is discarded and an appropriate RST is sent back. 858 * XXX MRT Send RST using which routing table? 859 */ 860 if (inp == NULL) { 861 /* 862 * Log communication attempts to ports that are not 863 * in use. 864 */ 865 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 866 tcp_log_in_vain == 2) { 867 if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) 868 log(LOG_INFO, "%s; %s: Connection attempt " 869 "to closed port\n", s, __func__); 870 } 871 /* 872 * When blackholing do not respond with a RST but 873 * completely ignore the segment and drop it. 874 */ 875 if ((V_blackhole == 1 && (thflags & TH_SYN)) || 876 V_blackhole == 2) 877 goto dropunlock; 878 879 rstreason = BANDLIM_RST_CLOSEDPORT; 880 goto dropwithreset; 881 } 882 INP_WLOCK_ASSERT(inp); 883 if (!(inp->inp_flags & INP_HW_FLOWID) 884 && (m->m_flags & M_FLOWID) 885 && ((inp->inp_socket == NULL) 886 || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { 887 inp->inp_flags |= INP_HW_FLOWID; 888 inp->inp_flags &= ~INP_SW_FLOWID; 889 inp->inp_flowid = m->m_pkthdr.flowid; 890 } 891#ifdef IPSEC 892#ifdef INET6 893 if (isipv6 && ipsec6_in_reject(m, inp)) { 894 V_ipsec6stat.in_polvio++; 895 goto dropunlock; 896 } else 897#endif /* INET6 */ 898 if (ipsec4_in_reject(m, inp) != 0) { 899 V_ipsec4stat.in_polvio++; 900 goto dropunlock; 901 } 902#endif /* IPSEC */ 903 904 /* 905 * Check the minimum TTL for socket. 906 */ 907 if (inp->inp_ip_minttl != 0) { 908#ifdef INET6 909 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) 910 goto dropunlock; 911 else 912#endif 913 if (inp->inp_ip_minttl > ip->ip_ttl) 914 goto dropunlock; 915 } 916 917 /* 918 * A previous connection in TIMEWAIT state is supposed to catch stray 919 * or duplicate segments arriving late. If this segment was a 920 * legitimate new connection attempt the old INPCB gets removed and 921 * we can try again to find a listening socket. 922 * 923 * At this point, due to earlier optimism, we may hold only an inpcb 924 * lock, and not the inpcbinfo write lock. If so, we need to try to 925 * acquire it, or if that fails, acquire a reference on the inpcb, 926 * drop all locks, acquire a global write lock, and then re-acquire 927 * the inpcb lock. We may at that point discover that another thread 928 * has tried to free the inpcb, in which case we need to loop back 929 * and try to find a new inpcb to deliver to. 930 * 931 * XXXRW: It may be time to rethink timewait locking. 932 */ 933relocked: 934 if (inp->inp_flags & INP_TIMEWAIT) { 935 if (ti_locked == TI_UNLOCKED) { 936 if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { 937 in_pcbref(inp); 938 INP_WUNLOCK(inp); 939 INP_INFO_WLOCK(&V_tcbinfo); 940 ti_locked = TI_WLOCKED; 941 INP_WLOCK(inp); 942 if (in_pcbrele_wlocked(inp)) { 943 inp = NULL; 944 goto findpcb; 945 } 946 } else 947 ti_locked = TI_WLOCKED; 948 } 949 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 950 951 if (thflags & TH_SYN) 952 tcp_dooptions(&to, optp, optlen, TO_SYN); 953 /* 954 * NB: tcp_twcheck unlocks the INP and frees the mbuf. 955 */ 956 if (tcp_twcheck(inp, &to, th, m, tlen)) 957 goto findpcb; 958 INP_INFO_WUNLOCK(&V_tcbinfo); 959 return; 960 } 961 /* 962 * The TCPCB may no longer exist if the connection is winding 963 * down or it is in the CLOSED state. Either way we drop the 964 * segment and send an appropriate response. 965 */ 966 tp = intotcpcb(inp); 967 if (tp == NULL || tp->t_state == TCPS_CLOSED) { 968 rstreason = BANDLIM_RST_CLOSEDPORT; 969 goto dropwithreset; 970 } 971 972 /* 973 * We've identified a valid inpcb, but it could be that we need an 974 * inpcbinfo write lock but don't hold it. In this case, attempt to 975 * acquire using the same strategy as the TIMEWAIT case above. If we 976 * relock, we have to jump back to 'relocked' as the connection might 977 * now be in TIMEWAIT. 978 */ 979#ifdef INVARIANTS 980 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) 981 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 982#endif 983 if (tp->t_state != TCPS_ESTABLISHED) { 984 if (ti_locked == TI_UNLOCKED) { 985 if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { 986 in_pcbref(inp); 987 INP_WUNLOCK(inp); 988 INP_INFO_WLOCK(&V_tcbinfo); 989 ti_locked = TI_WLOCKED; 990 INP_WLOCK(inp); 991 if (in_pcbrele_wlocked(inp)) { 992 inp = NULL; 993 goto findpcb; 994 } 995 goto relocked; 996 } else 997 ti_locked = TI_WLOCKED; 998 } 999 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1000 } 1001 1002#ifdef MAC 1003 INP_WLOCK_ASSERT(inp); 1004 if (mac_inpcb_check_deliver(inp, m)) 1005 goto dropunlock; 1006#endif 1007 so = inp->inp_socket; 1008 KASSERT(so != NULL, ("%s: so == NULL", __func__)); 1009#ifdef TCPDEBUG 1010 if (so->so_options & SO_DEBUG) { 1011 ostate = tp->t_state; 1012#ifdef INET6 1013 if (isipv6) { 1014 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); 1015 } else 1016#endif 1017 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 1018 tcp_savetcp = *th; 1019 } 1020#endif /* TCPDEBUG */ 1021 /* 1022 * When the socket is accepting connections (the INPCB is in LISTEN 1023 * state) we look into the SYN cache if this is a new connection 1024 * attempt or the completion of a previous one. Because listen 1025 * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be 1026 * held in this case. 1027 */ 1028 if (so->so_options & SO_ACCEPTCONN) { 1029 struct in_conninfo inc; 1030 1031 KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " 1032 "tp not listening", __func__)); 1033 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1034 1035 bzero(&inc, sizeof(inc)); 1036#ifdef INET6 1037 if (isipv6) { 1038 inc.inc_flags |= INC_ISIPV6; 1039 inc.inc6_faddr = ip6->ip6_src; 1040 inc.inc6_laddr = ip6->ip6_dst; 1041 } else 1042#endif 1043 { 1044 inc.inc_faddr = ip->ip_src; 1045 inc.inc_laddr = ip->ip_dst; 1046 } 1047 inc.inc_fport = th->th_sport; 1048 inc.inc_lport = th->th_dport; 1049 inc.inc_fibnum = so->so_fibnum; 1050 1051 /* 1052 * Check for an existing connection attempt in syncache if 1053 * the flag is only ACK. A successful lookup creates a new 1054 * socket appended to the listen queue in SYN_RECEIVED state. 1055 */ 1056 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 1057 /* 1058 * Parse the TCP options here because 1059 * syncookies need access to the reflected 1060 * timestamp. 1061 */ 1062 tcp_dooptions(&to, optp, optlen, 0); 1063 /* 1064 * NB: syncache_expand() doesn't unlock 1065 * inp and tcpinfo locks. 1066 */ 1067 if (!syncache_expand(&inc, &to, th, &so, m)) { 1068 /* 1069 * No syncache entry or ACK was not 1070 * for our SYN/ACK. Send a RST. 1071 * NB: syncache did its own logging 1072 * of the failure cause. 1073 */ 1074 rstreason = BANDLIM_RST_OPENPORT; 1075 goto dropwithreset; 1076 } 1077 if (so == NULL) { 1078 /* 1079 * We completed the 3-way handshake 1080 * but could not allocate a socket 1081 * either due to memory shortage, 1082 * listen queue length limits or 1083 * global socket limits. Send RST 1084 * or wait and have the remote end 1085 * retransmit the ACK for another 1086 * try. 1087 */ 1088 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1089 log(LOG_DEBUG, "%s; %s: Listen socket: " 1090 "Socket allocation failed due to " 1091 "limits or memory shortage, %s\n", 1092 s, __func__, 1093 V_tcp_sc_rst_sock_fail ? 1094 "sending RST" : "try again"); 1095 if (V_tcp_sc_rst_sock_fail) { 1096 rstreason = BANDLIM_UNLIMITED; 1097 goto dropwithreset; 1098 } else 1099 goto dropunlock; 1100 } 1101 /* 1102 * Socket is created in state SYN_RECEIVED. 1103 * Unlock the listen socket, lock the newly 1104 * created socket and update the tp variable. 1105 */ 1106 INP_WUNLOCK(inp); /* listen socket */ 1107 inp = sotoinpcb(so); 1108 INP_WLOCK(inp); /* new connection */ 1109 tp = intotcpcb(inp); 1110 KASSERT(tp->t_state == TCPS_SYN_RECEIVED, 1111 ("%s: ", __func__)); 1112#ifdef TCP_SIGNATURE 1113 if (sig_checked == 0) { 1114 tcp_dooptions(&to, optp, optlen, 1115 (thflags & TH_SYN) ? TO_SYN : 0); 1116 if (!tcp_signature_verify_input(m, off0, tlen, 1117 optlen, &to, th, tp->t_flags)) { 1118 1119 /* 1120 * In SYN_SENT state if it receives an 1121 * RST, it is allowed for further 1122 * processing. 1123 */ 1124 if ((thflags & TH_RST) == 0 || 1125 (tp->t_state == TCPS_SYN_SENT) == 0) 1126 goto dropunlock; 1127 } 1128 sig_checked = 1; 1129 } 1130#endif 1131 1132 /* 1133 * Process the segment and the data it 1134 * contains. tcp_do_segment() consumes 1135 * the mbuf chain and unlocks the inpcb. 1136 */ 1137 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, 1138 iptos, ti_locked); 1139 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1140 return; 1141 } 1142 /* 1143 * Segment flag validation for new connection attempts: 1144 * 1145 * Our (SYN|ACK) response was rejected. 1146 * Check with syncache and remove entry to prevent 1147 * retransmits. 1148 * 1149 * NB: syncache_chkrst does its own logging of failure 1150 * causes. 1151 */ 1152 if (thflags & TH_RST) { 1153 syncache_chkrst(&inc, th); 1154 goto dropunlock; 1155 } 1156 /* 1157 * We can't do anything without SYN. 1158 */ 1159 if ((thflags & TH_SYN) == 0) { 1160 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1161 log(LOG_DEBUG, "%s; %s: Listen socket: " 1162 "SYN is missing, segment ignored\n", 1163 s, __func__); 1164 TCPSTAT_INC(tcps_badsyn); 1165 goto dropunlock; 1166 } 1167 /* 1168 * (SYN|ACK) is bogus on a listen socket. 1169 */ 1170 if (thflags & TH_ACK) { 1171 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1172 log(LOG_DEBUG, "%s; %s: Listen socket: " 1173 "SYN|ACK invalid, segment rejected\n", 1174 s, __func__); 1175 syncache_badack(&inc); /* XXX: Not needed! */ 1176 TCPSTAT_INC(tcps_badsyn); 1177 rstreason = BANDLIM_RST_OPENPORT; 1178 goto dropwithreset; 1179 } 1180 /* 1181 * If the drop_synfin option is enabled, drop all 1182 * segments with both the SYN and FIN bits set. 1183 * This prevents e.g. nmap from identifying the 1184 * TCP/IP stack. 1185 * XXX: Poor reasoning. nmap has other methods 1186 * and is constantly refining its stack detection 1187 * strategies. 1188 * XXX: This is a violation of the TCP specification 1189 * and was used by RFC1644. 1190 */ 1191 if ((thflags & TH_FIN) && V_drop_synfin) { 1192 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1193 log(LOG_DEBUG, "%s; %s: Listen socket: " 1194 "SYN|FIN segment ignored (based on " 1195 "sysctl setting)\n", s, __func__); 1196 TCPSTAT_INC(tcps_badsyn); 1197 goto dropunlock; 1198 } 1199 /* 1200 * Segment's flags are (SYN) or (SYN|FIN). 1201 * 1202 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored 1203 * as they do not affect the state of the TCP FSM. 1204 * The data pointed to by TH_URG and th_urp is ignored. 1205 */ 1206 KASSERT((thflags & (TH_RST|TH_ACK)) == 0, 1207 ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); 1208 KASSERT(thflags & (TH_SYN), 1209 ("%s: Listen socket: TH_SYN not set", __func__)); 1210#ifdef INET6 1211 /* 1212 * If deprecated address is forbidden, 1213 * we do not accept SYN to deprecated interface 1214 * address to prevent any new inbound connection from 1215 * getting established. 1216 * When we do not accept SYN, we send a TCP RST, 1217 * with deprecated source address (instead of dropping 1218 * it). We compromise it as it is much better for peer 1219 * to send a RST, and RST will be the final packet 1220 * for the exchange. 1221 * 1222 * If we do not forbid deprecated addresses, we accept 1223 * the SYN packet. RFC2462 does not suggest dropping 1224 * SYN in this case. 1225 * If we decipher RFC2462 5.5.4, it says like this: 1226 * 1. use of deprecated addr with existing 1227 * communication is okay - "SHOULD continue to be 1228 * used" 1229 * 2. use of it with new communication: 1230 * (2a) "SHOULD NOT be used if alternate address 1231 * with sufficient scope is available" 1232 * (2b) nothing mentioned otherwise. 1233 * Here we fall into (2b) case as we have no choice in 1234 * our source address selection - we must obey the peer. 1235 * 1236 * The wording in RFC2462 is confusing, and there are 1237 * multiple description text for deprecated address 1238 * handling - worse, they are not exactly the same. 1239 * I believe 5.5.4 is the best one, so we follow 5.5.4. 1240 */ 1241 if (isipv6 && !V_ip6_use_deprecated) { 1242 struct in6_ifaddr *ia6; 1243 1244 ia6 = ip6_getdstifaddr(m); 1245 if (ia6 != NULL && 1246 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 1247 ifa_free(&ia6->ia_ifa); 1248 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1249 log(LOG_DEBUG, "%s; %s: Listen socket: " 1250 "Connection attempt to deprecated " 1251 "IPv6 address rejected\n", 1252 s, __func__); 1253 rstreason = BANDLIM_RST_OPENPORT; 1254 goto dropwithreset; 1255 } 1256 ifa_free(&ia6->ia_ifa); 1257 } 1258#endif /* INET6 */ 1259 /* 1260 * Basic sanity checks on incoming SYN requests: 1261 * Don't respond if the destination is a link layer 1262 * broadcast according to RFC1122 4.2.3.10, p. 104. 1263 * If it is from this socket it must be forged. 1264 * Don't respond if the source or destination is a 1265 * global or subnet broad- or multicast address. 1266 * Note that it is quite possible to receive unicast 1267 * link-layer packets with a broadcast IP address. Use 1268 * in_broadcast() to find them. 1269 */ 1270 if (m->m_flags & (M_BCAST|M_MCAST)) { 1271 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1272 log(LOG_DEBUG, "%s; %s: Listen socket: " 1273 "Connection attempt from broad- or multicast " 1274 "link layer address ignored\n", s, __func__); 1275 goto dropunlock; 1276 } 1277#ifdef INET6 1278 if (isipv6) { 1279 if (th->th_dport == th->th_sport && 1280 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { 1281 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1282 log(LOG_DEBUG, "%s; %s: Listen socket: " 1283 "Connection attempt to/from self " 1284 "ignored\n", s, __func__); 1285 goto dropunlock; 1286 } 1287 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 1288 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { 1289 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1290 log(LOG_DEBUG, "%s; %s: Listen socket: " 1291 "Connection attempt from/to multicast " 1292 "address ignored\n", s, __func__); 1293 goto dropunlock; 1294 } 1295 } 1296#endif 1297#if defined(INET) && defined(INET6) 1298 else 1299#endif 1300#ifdef INET 1301 { 1302 if (th->th_dport == th->th_sport && 1303 ip->ip_dst.s_addr == ip->ip_src.s_addr) { 1304 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1305 log(LOG_DEBUG, "%s; %s: Listen socket: " 1306 "Connection attempt from/to self " 1307 "ignored\n", s, __func__); 1308 goto dropunlock; 1309 } 1310 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 1311 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 1312 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 1313 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { 1314 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1315 log(LOG_DEBUG, "%s; %s: Listen socket: " 1316 "Connection attempt from/to broad- " 1317 "or multicast address ignored\n", 1318 s, __func__); 1319 goto dropunlock; 1320 } 1321 } 1322#endif 1323 /* 1324 * SYN appears to be valid. Create compressed TCP state 1325 * for syncache. 1326 */ 1327#ifdef TCPDEBUG 1328 if (so->so_options & SO_DEBUG) 1329 tcp_trace(TA_INPUT, ostate, tp, 1330 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1331#endif 1332 tcp_dooptions(&to, optp, optlen, TO_SYN); 1333 syncache_add(&inc, &to, th, inp, &so, m); 1334 /* 1335 * Entry added to syncache and mbuf consumed. 1336 * Everything already unlocked by syncache_add(). 1337 */ 1338 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1339 return; 1340 } 1341 1342#ifdef TCP_SIGNATURE 1343 if (sig_checked == 0) { 1344 tcp_dooptions(&to, optp, optlen, 1345 (thflags & TH_SYN) ? TO_SYN : 0); 1346 if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to, 1347 th, tp->t_flags)) { 1348 1349 /* 1350 * In SYN_SENT state if it receives an RST, it is 1351 * allowed for further processing. 1352 */ 1353 if ((thflags & TH_RST) == 0 || 1354 (tp->t_state == TCPS_SYN_SENT) == 0) 1355 goto dropunlock; 1356 } 1357 sig_checked = 1; 1358 } 1359#endif 1360 1361 /* 1362 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later 1363 * state. tcp_do_segment() always consumes the mbuf chain, unlocks 1364 * the inpcb, and unlocks pcbinfo. 1365 */ 1366 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); 1367 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1368 return; 1369 1370dropwithreset: 1371 if (ti_locked == TI_WLOCKED) { 1372 INP_INFO_WUNLOCK(&V_tcbinfo); 1373 ti_locked = TI_UNLOCKED; 1374 } 1375#ifdef INVARIANTS 1376 else { 1377 KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " 1378 "ti_locked: %d", __func__, ti_locked)); 1379 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1380 } 1381#endif 1382 1383 if (inp != NULL) { 1384 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1385 INP_WUNLOCK(inp); 1386 } else 1387 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1388 m = NULL; /* mbuf chain got consumed. */ 1389 goto drop; 1390 1391dropunlock: 1392 if (ti_locked == TI_WLOCKED) { 1393 INP_INFO_WUNLOCK(&V_tcbinfo); 1394 ti_locked = TI_UNLOCKED; 1395 } 1396#ifdef INVARIANTS 1397 else { 1398 KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " 1399 "ti_locked: %d", __func__, ti_locked)); 1400 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1401 } 1402#endif 1403 1404 if (inp != NULL) 1405 INP_WUNLOCK(inp); 1406 1407drop: 1408 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1409 if (s != NULL) 1410 free(s, M_TCPLOG); 1411 if (m != NULL) 1412 m_freem(m); 1413} 1414 1415static void 1416tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 1417 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 1418 int ti_locked) 1419{ 1420 int thflags, acked, ourfinisacked, needoutput = 0; 1421 int rstreason, todrop, win; 1422 u_long tiwin; 1423 struct tcpopt to; 1424 1425#ifdef TCPDEBUG 1426 /* 1427 * The size of tcp_saveipgen must be the size of the max ip header, 1428 * now IPv6. 1429 */ 1430 u_char tcp_saveipgen[IP6_HDR_LEN]; 1431 struct tcphdr tcp_savetcp; 1432 short ostate = 0; 1433#endif 1434 thflags = th->th_flags; 1435 tp->sackhint.last_sack_ack = 0; 1436 1437 /* 1438 * If this is either a state-changing packet or current state isn't 1439 * established, we require a write lock on tcbinfo. Otherwise, we 1440 * allow either a read lock or a write lock, as we may have acquired 1441 * a write lock due to a race. 1442 * 1443 * Require a global write lock for SYN/FIN/RST segments or 1444 * non-established connections; otherwise accept either a read or 1445 * write lock, as we may have conservatively acquired a write lock in 1446 * certain cases in tcp_input() (is this still true?). Currently we 1447 * will never enter with no lock, so we try to drop it quickly in the 1448 * common pure ack/pure data cases. 1449 */ 1450 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 1451 tp->t_state != TCPS_ESTABLISHED) { 1452 KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " 1453 "SYN/FIN/RST/!EST", __func__, ti_locked)); 1454 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1455 } else { 1456#ifdef INVARIANTS 1457 if (ti_locked == TI_WLOCKED) 1458 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1459 else { 1460 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 1461 "ti_locked: %d", __func__, ti_locked)); 1462 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1463 } 1464#endif 1465 } 1466 INP_WLOCK_ASSERT(tp->t_inpcb); 1467 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 1468 __func__)); 1469 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 1470 __func__)); 1471 1472 /* 1473 * Segment received on connection. 1474 * Reset idle time and keep-alive timer. 1475 * XXX: This should be done after segment 1476 * validation to ignore broken/spoofed segs. 1477 */ 1478 tp->t_rcvtime = ticks; 1479 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1480 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1481 1482 /* 1483 * Unscale the window into a 32-bit value. 1484 * For the SYN_SENT state the scale is zero. 1485 */ 1486 tiwin = th->th_win << tp->snd_scale; 1487 1488 /* 1489 * TCP ECN processing. 1490 */ 1491 if (tp->t_flags & TF_ECN_PERMIT) { 1492 if (thflags & TH_CWR) 1493 tp->t_flags &= ~TF_ECN_SND_ECE; 1494 switch (iptos & IPTOS_ECN_MASK) { 1495 case IPTOS_ECN_CE: 1496 tp->t_flags |= TF_ECN_SND_ECE; 1497 TCPSTAT_INC(tcps_ecn_ce); 1498 break; 1499 case IPTOS_ECN_ECT0: 1500 TCPSTAT_INC(tcps_ecn_ect0); 1501 break; 1502 case IPTOS_ECN_ECT1: 1503 TCPSTAT_INC(tcps_ecn_ect1); 1504 break; 1505 } 1506 /* Congestion experienced. */ 1507 if (thflags & TH_ECE) { 1508 cc_cong_signal(tp, th, CC_ECN); 1509 } 1510 } 1511 1512 /* 1513 * Parse options on any incoming segment. 1514 */ 1515 tcp_dooptions(&to, (u_char *)(th + 1), 1516 (th->th_off << 2) - sizeof(struct tcphdr), 1517 (thflags & TH_SYN) ? TO_SYN : 0); 1518 1519 /* 1520 * If echoed timestamp is later than the current time, 1521 * fall back to non RFC1323 RTT calculation. Normalize 1522 * timestamp if syncookies were used when this connection 1523 * was established. 1524 */ 1525 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1526 to.to_tsecr -= tp->ts_offset; 1527 if (TSTMP_GT(to.to_tsecr, ticks)) 1528 to.to_tsecr = 0; 1529 } 1530 1531 /* 1532 * Process options only when we get SYN/ACK back. The SYN case 1533 * for incoming connections is handled in tcp_syncache. 1534 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1535 * or <SYN,ACK>) segment itself is never scaled. 1536 * XXX this is traditional behavior, may need to be cleaned up. 1537 */ 1538 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1539 if ((to.to_flags & TOF_SCALE) && 1540 (tp->t_flags & TF_REQ_SCALE)) { 1541 tp->t_flags |= TF_RCVD_SCALE; 1542 tp->snd_scale = to.to_wscale; 1543 } 1544 /* 1545 * Initial send window. It will be updated with 1546 * the next incoming segment to the scaled value. 1547 */ 1548 tp->snd_wnd = th->th_win; 1549 if (to.to_flags & TOF_TS) { 1550 tp->t_flags |= TF_RCVD_TSTMP; 1551 tp->ts_recent = to.to_tsval; 1552 tp->ts_recent_age = ticks; 1553 } 1554 if (to.to_flags & TOF_MSS) 1555 tcp_mss(tp, to.to_mss); 1556 if ((tp->t_flags & TF_SACK_PERMIT) && 1557 (to.to_flags & TOF_SACKPERM) == 0) 1558 tp->t_flags &= ~TF_SACK_PERMIT; 1559 } 1560 1561 /* 1562 * Header prediction: check for the two common cases 1563 * of a uni-directional data xfer. If the packet has 1564 * no control flags, is in-sequence, the window didn't 1565 * change and we're not retransmitting, it's a 1566 * candidate. If the length is zero and the ack moved 1567 * forward, we're the sender side of the xfer. Just 1568 * free the data acked & wake any higher level process 1569 * that was blocked waiting for space. If the length 1570 * is non-zero and the ack didn't move, we're the 1571 * receiver side. If we're getting packets in-order 1572 * (the reassembly queue is empty), add the data to 1573 * the socket buffer and note that we need a delayed ack. 1574 * Make sure that the hidden state-flags are also off. 1575 * Since we check for TCPS_ESTABLISHED first, it can only 1576 * be TH_NEEDSYN. 1577 */ 1578 if (tp->t_state == TCPS_ESTABLISHED && 1579 th->th_seq == tp->rcv_nxt && 1580 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1581 tp->snd_nxt == tp->snd_max && 1582 tiwin && tiwin == tp->snd_wnd && 1583 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1584 LIST_EMPTY(&tp->t_segq) && 1585 ((to.to_flags & TOF_TS) == 0 || 1586 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { 1587 1588 /* 1589 * If last ACK falls within this segment's sequence numbers, 1590 * record the timestamp. 1591 * NOTE that the test is modified according to the latest 1592 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1593 */ 1594 if ((to.to_flags & TOF_TS) != 0 && 1595 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1596 tp->ts_recent_age = ticks; 1597 tp->ts_recent = to.to_tsval; 1598 } 1599 1600 if (tlen == 0) { 1601 if (SEQ_GT(th->th_ack, tp->snd_una) && 1602 SEQ_LEQ(th->th_ack, tp->snd_max) && 1603 !IN_RECOVERY(tp->t_flags) && 1604 (to.to_flags & TOF_SACK) == 0 && 1605 TAILQ_EMPTY(&tp->snd_holes)) { 1606 /* 1607 * This is a pure ack for outstanding data. 1608 */ 1609 if (ti_locked == TI_WLOCKED) 1610 INP_INFO_WUNLOCK(&V_tcbinfo); 1611 ti_locked = TI_UNLOCKED; 1612 1613 TCPSTAT_INC(tcps_predack); 1614 1615 /* 1616 * "bad retransmit" recovery. 1617 */ 1618 if (tp->t_rxtshift == 1 && 1619 tp->t_flags & TF_PREVVALID && 1620 (int)(ticks - tp->t_badrxtwin) < 0) { 1621 cc_cong_signal(tp, th, CC_RTO_ERR); 1622 } 1623 1624 /* 1625 * Recalculate the transmit timer / rtt. 1626 * 1627 * Some boxes send broken timestamp replies 1628 * during the SYN+ACK phase, ignore 1629 * timestamps of 0 or we could calculate a 1630 * huge RTT and blow up the retransmit timer. 1631 */ 1632 if ((to.to_flags & TOF_TS) != 0 && 1633 to.to_tsecr) { 1634 if (!tp->t_rttlow || 1635 tp->t_rttlow > ticks - to.to_tsecr) 1636 tp->t_rttlow = ticks - to.to_tsecr; 1637 tcp_xmit_timer(tp, 1638 ticks - to.to_tsecr + 1); 1639 } else if (tp->t_rtttime && 1640 SEQ_GT(th->th_ack, tp->t_rtseq)) { 1641 if (!tp->t_rttlow || 1642 tp->t_rttlow > ticks - tp->t_rtttime) 1643 tp->t_rttlow = ticks - tp->t_rtttime; 1644 tcp_xmit_timer(tp, 1645 ticks - tp->t_rtttime); 1646 } 1647 acked = BYTES_THIS_ACK(tp, th); 1648 1649 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 1650 hhook_run_tcp_est_in(tp, th, &to); 1651 1652 TCPSTAT_INC(tcps_rcvackpack); 1653 TCPSTAT_ADD(tcps_rcvackbyte, acked); 1654 sbdrop(&so->so_snd, acked); 1655 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 1656 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1657 tp->snd_recover = th->th_ack - 1; 1658 1659 /* 1660 * Let the congestion control algorithm update 1661 * congestion control related information. This 1662 * typically means increasing the congestion 1663 * window. 1664 */ 1665 cc_ack_received(tp, th, CC_ACK); 1666 1667 tp->snd_una = th->th_ack; 1668 /* 1669 * Pull snd_wl2 up to prevent seq wrap relative 1670 * to th_ack. 1671 */ 1672 tp->snd_wl2 = th->th_ack; 1673 tp->t_dupacks = 0; 1674 m_freem(m); 1675 ND6_HINT(tp); /* Some progress has been made. */ 1676 1677 /* 1678 * If all outstanding data are acked, stop 1679 * retransmit timer, otherwise restart timer 1680 * using current (possibly backed-off) value. 1681 * If process is waiting for space, 1682 * wakeup/selwakeup/signal. If data 1683 * are ready to send, let tcp_output 1684 * decide between more output or persist. 1685 */ 1686#ifdef TCPDEBUG 1687 if (so->so_options & SO_DEBUG) 1688 tcp_trace(TA_INPUT, ostate, tp, 1689 (void *)tcp_saveipgen, 1690 &tcp_savetcp, 0); 1691#endif 1692 if (tp->snd_una == tp->snd_max) 1693 tcp_timer_activate(tp, TT_REXMT, 0); 1694 else if (!tcp_timer_active(tp, TT_PERSIST)) 1695 tcp_timer_activate(tp, TT_REXMT, 1696 tp->t_rxtcur); 1697 sowwakeup(so); 1698 if (so->so_snd.sb_cc) 1699 (void) tcp_output(tp); 1700 goto check_delack; 1701 } 1702 } else if (th->th_ack == tp->snd_una && 1703 tlen <= sbspace(&so->so_rcv)) { 1704 int newsize = 0; /* automatic sockbuf scaling */ 1705 1706 /* 1707 * This is a pure, in-sequence data packet with 1708 * nothing on the reassembly queue and we have enough 1709 * buffer space to take it. 1710 */ 1711 if (ti_locked == TI_WLOCKED) 1712 INP_INFO_WUNLOCK(&V_tcbinfo); 1713 ti_locked = TI_UNLOCKED; 1714 1715 /* Clean receiver SACK report if present */ 1716 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1717 tcp_clean_sackreport(tp); 1718 TCPSTAT_INC(tcps_preddat); 1719 tp->rcv_nxt += tlen; 1720 /* 1721 * Pull snd_wl1 up to prevent seq wrap relative to 1722 * th_seq. 1723 */ 1724 tp->snd_wl1 = th->th_seq; 1725 /* 1726 * Pull rcv_up up to prevent seq wrap relative to 1727 * rcv_nxt. 1728 */ 1729 tp->rcv_up = tp->rcv_nxt; 1730 TCPSTAT_INC(tcps_rcvpack); 1731 TCPSTAT_ADD(tcps_rcvbyte, tlen); 1732 ND6_HINT(tp); /* Some progress has been made */ 1733#ifdef TCPDEBUG 1734 if (so->so_options & SO_DEBUG) 1735 tcp_trace(TA_INPUT, ostate, tp, 1736 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1737#endif 1738 /* 1739 * Automatic sizing of receive socket buffer. Often the send 1740 * buffer size is not optimally adjusted to the actual network 1741 * conditions at hand (delay bandwidth product). Setting the 1742 * buffer size too small limits throughput on links with high 1743 * bandwidth and high delay (eg. trans-continental/oceanic links). 1744 * 1745 * On the receive side the socket buffer memory is only rarely 1746 * used to any significant extent. This allows us to be much 1747 * more aggressive in scaling the receive socket buffer. For 1748 * the case that the buffer space is actually used to a large 1749 * extent and we run out of kernel memory we can simply drop 1750 * the new segments; TCP on the sender will just retransmit it 1751 * later. Setting the buffer size too big may only consume too 1752 * much kernel memory if the application doesn't read() from 1753 * the socket or packet loss or reordering makes use of the 1754 * reassembly queue. 1755 * 1756 * The criteria to step up the receive buffer one notch are: 1757 * 1. the number of bytes received during the time it takes 1758 * one timestamp to be reflected back to us (the RTT); 1759 * 2. received bytes per RTT is within seven eighth of the 1760 * current socket buffer size; 1761 * 3. receive buffer size has not hit maximal automatic size; 1762 * 1763 * This algorithm does one step per RTT at most and only if 1764 * we receive a bulk stream w/o packet losses or reorderings. 1765 * Shrinking the buffer during idle times is not necessary as 1766 * it doesn't consume any memory when idle. 1767 * 1768 * TODO: Only step up if the application is actually serving 1769 * the buffer to better manage the socket buffer resources. 1770 */ 1771 if (V_tcp_do_autorcvbuf && 1772 to.to_tsecr && 1773 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1774 if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && 1775 to.to_tsecr - tp->rfbuf_ts < hz) { 1776 if (tp->rfbuf_cnt > 1777 (so->so_rcv.sb_hiwat / 8 * 7) && 1778 so->so_rcv.sb_hiwat < 1779 V_tcp_autorcvbuf_max) { 1780 newsize = 1781 min(so->so_rcv.sb_hiwat + 1782 V_tcp_autorcvbuf_inc, 1783 V_tcp_autorcvbuf_max); 1784 } 1785 /* Start over with next RTT. */ 1786 tp->rfbuf_ts = 0; 1787 tp->rfbuf_cnt = 0; 1788 } else 1789 tp->rfbuf_cnt += tlen; /* add up */ 1790 } 1791 1792 /* Add data to socket buffer. */ 1793 SOCKBUF_LOCK(&so->so_rcv); 1794 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1795 m_freem(m); 1796 } else { 1797 /* 1798 * Set new socket buffer size. 1799 * Give up when limit is reached. 1800 */ 1801 if (newsize) 1802 if (!sbreserve_locked(&so->so_rcv, 1803 newsize, so, NULL)) 1804 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1805 m_adj(m, drop_hdrlen); /* delayed header drop */ 1806 sbappendstream_locked(&so->so_rcv, m); 1807 } 1808 /* NB: sorwakeup_locked() does an implicit unlock. */ 1809 sorwakeup_locked(so); 1810 if (DELAY_ACK(tp)) { 1811 tp->t_flags |= TF_DELACK; 1812 } else { 1813 tp->t_flags |= TF_ACKNOW; 1814 tcp_output(tp); 1815 } 1816 goto check_delack; 1817 } 1818 } 1819 1820 /* 1821 * Calculate amount of space in receive window, 1822 * and then do TCP input processing. 1823 * Receive window is amount of space in rcv queue, 1824 * but not less than advertised window. 1825 */ 1826 win = sbspace(&so->so_rcv); 1827 if (win < 0) 1828 win = 0; 1829 KASSERT(SEQ_GEQ(tp->rcv_adv, tp->rcv_nxt), 1830 ("tcp_input negative window: tp %p rcv_nxt %u rcv_adv %u", tp, 1831 tp->rcv_nxt, tp->rcv_adv)); 1832 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1833 1834 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1835 tp->rfbuf_ts = 0; 1836 tp->rfbuf_cnt = 0; 1837 1838 switch (tp->t_state) { 1839 1840 /* 1841 * If the state is SYN_RECEIVED: 1842 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1843 */ 1844 case TCPS_SYN_RECEIVED: 1845 if ((thflags & TH_ACK) && 1846 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1847 SEQ_GT(th->th_ack, tp->snd_max))) { 1848 rstreason = BANDLIM_RST_OPENPORT; 1849 goto dropwithreset; 1850 } 1851 break; 1852 1853 /* 1854 * If the state is SYN_SENT: 1855 * if seg contains an ACK, but not for our SYN, drop the input. 1856 * if seg contains a RST, then drop the connection. 1857 * if seg does not contain SYN, then drop it. 1858 * Otherwise this is an acceptable SYN segment 1859 * initialize tp->rcv_nxt and tp->irs 1860 * if seg contains ack then advance tp->snd_una 1861 * if seg contains an ECE and ECN support is enabled, the stream 1862 * is ECN capable. 1863 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1864 * arrange for segment to be acked (eventually) 1865 * continue processing rest of data/controls, beginning with URG 1866 */ 1867 case TCPS_SYN_SENT: 1868 if ((thflags & TH_ACK) && 1869 (SEQ_LEQ(th->th_ack, tp->iss) || 1870 SEQ_GT(th->th_ack, tp->snd_max))) { 1871 rstreason = BANDLIM_UNLIMITED; 1872 goto dropwithreset; 1873 } 1874 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) 1875 tp = tcp_drop(tp, ECONNREFUSED); 1876 if (thflags & TH_RST) 1877 goto drop; 1878 if (!(thflags & TH_SYN)) 1879 goto drop; 1880 1881 tp->irs = th->th_seq; 1882 tcp_rcvseqinit(tp); 1883 if (thflags & TH_ACK) { 1884 TCPSTAT_INC(tcps_connects); 1885 soisconnected(so); 1886#ifdef MAC 1887 mac_socketpeer_set_from_mbuf(m, so); 1888#endif 1889 /* Do window scaling on this connection? */ 1890 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1891 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1892 tp->rcv_scale = tp->request_r_scale; 1893 } 1894 tp->rcv_adv += imin(tp->rcv_wnd, 1895 TCP_MAXWIN << tp->rcv_scale); 1896 tp->snd_una++; /* SYN is acked */ 1897 /* 1898 * If there's data, delay ACK; if there's also a FIN 1899 * ACKNOW will be turned on later. 1900 */ 1901 if (DELAY_ACK(tp) && tlen != 0) 1902 tcp_timer_activate(tp, TT_DELACK, 1903 tcp_delacktime); 1904 else 1905 tp->t_flags |= TF_ACKNOW; 1906 1907 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 1908 tp->t_flags |= TF_ECN_PERMIT; 1909 TCPSTAT_INC(tcps_ecn_shs); 1910 } 1911 1912 /* 1913 * Received <SYN,ACK> in SYN_SENT[*] state. 1914 * Transitions: 1915 * SYN_SENT --> ESTABLISHED 1916 * SYN_SENT* --> FIN_WAIT_1 1917 */ 1918 tp->t_starttime = ticks; 1919 if (tp->t_flags & TF_NEEDFIN) { 1920 tp->t_state = TCPS_FIN_WAIT_1; 1921 tp->t_flags &= ~TF_NEEDFIN; 1922 thflags &= ~TH_SYN; 1923 } else { 1924 tp->t_state = TCPS_ESTABLISHED; 1925 cc_conn_init(tp); 1926 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1927 } 1928 } else { 1929 /* 1930 * Received initial SYN in SYN-SENT[*] state => 1931 * simultaneous open. If segment contains CC option 1932 * and there is a cached CC, apply TAO test. 1933 * If it succeeds, connection is * half-synchronized. 1934 * Otherwise, do 3-way handshake: 1935 * SYN-SENT -> SYN-RECEIVED 1936 * SYN-SENT* -> SYN-RECEIVED* 1937 * If there was no CC option, clear cached CC value. 1938 */ 1939 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 1940 tcp_timer_activate(tp, TT_REXMT, 0); 1941 tp->t_state = TCPS_SYN_RECEIVED; 1942 } 1943 1944 KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " 1945 "ti_locked %d", __func__, ti_locked)); 1946 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1947 INP_WLOCK_ASSERT(tp->t_inpcb); 1948 1949 /* 1950 * Advance th->th_seq to correspond to first data byte. 1951 * If data, trim to stay within window, 1952 * dropping FIN if necessary. 1953 */ 1954 th->th_seq++; 1955 if (tlen > tp->rcv_wnd) { 1956 todrop = tlen - tp->rcv_wnd; 1957 m_adj(m, -todrop); 1958 tlen = tp->rcv_wnd; 1959 thflags &= ~TH_FIN; 1960 TCPSTAT_INC(tcps_rcvpackafterwin); 1961 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 1962 } 1963 tp->snd_wl1 = th->th_seq - 1; 1964 tp->rcv_up = th->th_seq; 1965 /* 1966 * Client side of transaction: already sent SYN and data. 1967 * If the remote host used T/TCP to validate the SYN, 1968 * our data will be ACK'd; if so, enter normal data segment 1969 * processing in the middle of step 5, ack processing. 1970 * Otherwise, goto step 6. 1971 */ 1972 if (thflags & TH_ACK) 1973 goto process_ACK; 1974 1975 goto step6; 1976 1977 /* 1978 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1979 * do normal processing. 1980 * 1981 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 1982 */ 1983 case TCPS_LAST_ACK: 1984 case TCPS_CLOSING: 1985 break; /* continue normal processing */ 1986 } 1987 1988 /* 1989 * States other than LISTEN or SYN_SENT. 1990 * First check the RST flag and sequence number since reset segments 1991 * are exempt from the timestamp and connection count tests. This 1992 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1993 * below which allowed reset segments in half the sequence space 1994 * to fall though and be processed (which gives forged reset 1995 * segments with a random sequence number a 50 percent chance of 1996 * killing a connection). 1997 * Then check timestamp, if present. 1998 * Then check the connection count, if present. 1999 * Then check that at least some bytes of segment are within 2000 * receive window. If segment begins before rcv_nxt, 2001 * drop leading data (and SYN); if nothing left, just ack. 2002 * 2003 * 2004 * If the RST bit is set, check the sequence number to see 2005 * if this is a valid reset segment. 2006 * RFC 793 page 37: 2007 * In all states except SYN-SENT, all reset (RST) segments 2008 * are validated by checking their SEQ-fields. A reset is 2009 * valid if its sequence number is in the window. 2010 * Note: this does not take into account delayed ACKs, so 2011 * we should test against last_ack_sent instead of rcv_nxt. 2012 * The sequence number in the reset segment is normally an 2013 * echo of our outgoing acknowlegement numbers, but some hosts 2014 * send a reset with the sequence number at the rightmost edge 2015 * of our receive window, and we have to handle this case. 2016 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 2017 * that brute force RST attacks are possible. To combat this, 2018 * we use a much stricter check while in the ESTABLISHED state, 2019 * only accepting RSTs where the sequence number is equal to 2020 * last_ack_sent. In all other states (the states in which a 2021 * RST is more likely), the more permissive check is used. 2022 * If we have multiple segments in flight, the initial reset 2023 * segment sequence numbers will be to the left of last_ack_sent, 2024 * but they will eventually catch up. 2025 * In any case, it never made sense to trim reset segments to 2026 * fit the receive window since RFC 1122 says: 2027 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 2028 * 2029 * A TCP SHOULD allow a received RST segment to include data. 2030 * 2031 * DISCUSSION 2032 * It has been suggested that a RST segment could contain 2033 * ASCII text that encoded and explained the cause of the 2034 * RST. No standard has yet been established for such 2035 * data. 2036 * 2037 * If the reset segment passes the sequence number test examine 2038 * the state: 2039 * SYN_RECEIVED STATE: 2040 * If passive open, return to LISTEN state. 2041 * If active open, inform user that connection was refused. 2042 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 2043 * Inform user that connection was reset, and close tcb. 2044 * CLOSING, LAST_ACK STATES: 2045 * Close the tcb. 2046 * TIME_WAIT STATE: 2047 * Drop the segment - see Stevens, vol. 2, p. 964 and 2048 * RFC 1337. 2049 */ 2050 if (thflags & TH_RST) { 2051 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 2052 SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 2053 switch (tp->t_state) { 2054 2055 case TCPS_SYN_RECEIVED: 2056 so->so_error = ECONNREFUSED; 2057 goto close; 2058 2059 case TCPS_ESTABLISHED: 2060 if (V_tcp_insecure_rst == 0 && 2061 !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && 2062 SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && 2063 !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 2064 SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { 2065 TCPSTAT_INC(tcps_badrst); 2066 goto drop; 2067 } 2068 /* FALLTHROUGH */ 2069 case TCPS_FIN_WAIT_1: 2070 case TCPS_FIN_WAIT_2: 2071 case TCPS_CLOSE_WAIT: 2072 so->so_error = ECONNRESET; 2073 close: 2074 KASSERT(ti_locked == TI_WLOCKED, 2075 ("tcp_do_segment: TH_RST 1 ti_locked %d", 2076 ti_locked)); 2077 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2078 2079 tp->t_state = TCPS_CLOSED; 2080 TCPSTAT_INC(tcps_drops); 2081 tp = tcp_close(tp); 2082 break; 2083 2084 case TCPS_CLOSING: 2085 case TCPS_LAST_ACK: 2086 KASSERT(ti_locked == TI_WLOCKED, 2087 ("tcp_do_segment: TH_RST 2 ti_locked %d", 2088 ti_locked)); 2089 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2090 2091 tp = tcp_close(tp); 2092 break; 2093 } 2094 } 2095 goto drop; 2096 } 2097 2098 /* 2099 * RFC 1323 PAWS: If we have a timestamp reply on this segment 2100 * and it's less than ts_recent, drop it. 2101 */ 2102 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 2103 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 2104 2105 /* Check to see if ts_recent is over 24 days old. */ 2106 if (ticks - tp->ts_recent_age > TCP_PAWS_IDLE) { 2107 /* 2108 * Invalidate ts_recent. If this segment updates 2109 * ts_recent, the age will be reset later and ts_recent 2110 * will get a valid value. If it does not, setting 2111 * ts_recent to zero will at least satisfy the 2112 * requirement that zero be placed in the timestamp 2113 * echo reply when ts_recent isn't valid. The 2114 * age isn't reset until we get a valid ts_recent 2115 * because we don't want out-of-order segments to be 2116 * dropped when ts_recent is old. 2117 */ 2118 tp->ts_recent = 0; 2119 } else { 2120 TCPSTAT_INC(tcps_rcvduppack); 2121 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 2122 TCPSTAT_INC(tcps_pawsdrop); 2123 if (tlen) 2124 goto dropafterack; 2125 goto drop; 2126 } 2127 } 2128 2129 /* 2130 * In the SYN-RECEIVED state, validate that the packet belongs to 2131 * this connection before trimming the data to fit the receive 2132 * window. Check the sequence number versus IRS since we know 2133 * the sequence numbers haven't wrapped. This is a partial fix 2134 * for the "LAND" DoS attack. 2135 */ 2136 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 2137 rstreason = BANDLIM_RST_OPENPORT; 2138 goto dropwithreset; 2139 } 2140 2141 todrop = tp->rcv_nxt - th->th_seq; 2142 if (todrop > 0) { 2143 /* 2144 * If this is a duplicate SYN for our current connection, 2145 * advance over it and pretend and it's not a SYN. 2146 */ 2147 if (thflags & TH_SYN && th->th_seq == tp->irs) { 2148 thflags &= ~TH_SYN; 2149 th->th_seq++; 2150 if (th->th_urp > 1) 2151 th->th_urp--; 2152 else 2153 thflags &= ~TH_URG; 2154 todrop--; 2155 } 2156 /* 2157 * Following if statement from Stevens, vol. 2, p. 960. 2158 */ 2159 if (todrop > tlen 2160 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 2161 /* 2162 * Any valid FIN must be to the left of the window. 2163 * At this point the FIN must be a duplicate or out 2164 * of sequence; drop it. 2165 */ 2166 thflags &= ~TH_FIN; 2167 2168 /* 2169 * Send an ACK to resynchronize and drop any data. 2170 * But keep on processing for RST or ACK. 2171 */ 2172 tp->t_flags |= TF_ACKNOW; 2173 todrop = tlen; 2174 TCPSTAT_INC(tcps_rcvduppack); 2175 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 2176 } else { 2177 TCPSTAT_INC(tcps_rcvpartduppack); 2178 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 2179 } 2180 drop_hdrlen += todrop; /* drop from the top afterwards */ 2181 th->th_seq += todrop; 2182 tlen -= todrop; 2183 if (th->th_urp > todrop) 2184 th->th_urp -= todrop; 2185 else { 2186 thflags &= ~TH_URG; 2187 th->th_urp = 0; 2188 } 2189 } 2190 2191 /* 2192 * If new data are received on a connection after the 2193 * user processes are gone, then RST the other end. 2194 */ 2195 if ((so->so_state & SS_NOFDREF) && 2196 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 2197 char *s; 2198 2199 KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " 2200 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); 2201 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2202 2203 if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { 2204 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " 2205 "was closed, sending RST and removing tcpcb\n", 2206 s, __func__, tcpstates[tp->t_state], tlen); 2207 free(s, M_TCPLOG); 2208 } 2209 tp = tcp_close(tp); 2210 TCPSTAT_INC(tcps_rcvafterclose); 2211 rstreason = BANDLIM_UNLIMITED; 2212 goto dropwithreset; 2213 } 2214 2215 /* 2216 * If segment ends after window, drop trailing data 2217 * (and PUSH and FIN); if nothing left, just ACK. 2218 */ 2219 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 2220 if (todrop > 0) { 2221 TCPSTAT_INC(tcps_rcvpackafterwin); 2222 if (todrop >= tlen) { 2223 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 2224 /* 2225 * If window is closed can only take segments at 2226 * window edge, and have to drop data and PUSH from 2227 * incoming segments. Continue processing, but 2228 * remember to ack. Otherwise, drop segment 2229 * and ack. 2230 */ 2231 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 2232 tp->t_flags |= TF_ACKNOW; 2233 TCPSTAT_INC(tcps_rcvwinprobe); 2234 } else 2235 goto dropafterack; 2236 } else 2237 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 2238 m_adj(m, -todrop); 2239 tlen -= todrop; 2240 thflags &= ~(TH_PUSH|TH_FIN); 2241 } 2242 2243 /* 2244 * If last ACK falls within this segment's sequence numbers, 2245 * record its timestamp. 2246 * NOTE: 2247 * 1) That the test incorporates suggestions from the latest 2248 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2249 * 2) That updating only on newer timestamps interferes with 2250 * our earlier PAWS tests, so this check should be solely 2251 * predicated on the sequence space of this segment. 2252 * 3) That we modify the segment boundary check to be 2253 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 2254 * instead of RFC1323's 2255 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 2256 * This modified check allows us to overcome RFC1323's 2257 * limitations as described in Stevens TCP/IP Illustrated 2258 * Vol. 2 p.869. In such cases, we can still calculate the 2259 * RTT correctly when RCV.NXT == Last.ACK.Sent. 2260 */ 2261 if ((to.to_flags & TOF_TS) != 0 && 2262 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 2263 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 2264 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 2265 tp->ts_recent_age = ticks; 2266 tp->ts_recent = to.to_tsval; 2267 } 2268 2269 /* 2270 * If a SYN is in the window, then this is an 2271 * error and we send an RST and drop the connection. 2272 */ 2273 if (thflags & TH_SYN) { 2274 KASSERT(ti_locked == TI_WLOCKED, 2275 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 2276 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2277 2278 tp = tcp_drop(tp, ECONNRESET); 2279 rstreason = BANDLIM_UNLIMITED; 2280 goto drop; 2281 } 2282 2283 /* 2284 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 2285 * flag is on (half-synchronized state), then queue data for 2286 * later processing; else drop segment and return. 2287 */ 2288 if ((thflags & TH_ACK) == 0) { 2289 if (tp->t_state == TCPS_SYN_RECEIVED || 2290 (tp->t_flags & TF_NEEDSYN)) 2291 goto step6; 2292 else if (tp->t_flags & TF_ACKNOW) 2293 goto dropafterack; 2294 else 2295 goto drop; 2296 } 2297 2298 /* 2299 * Ack processing. 2300 */ 2301 switch (tp->t_state) { 2302 2303 /* 2304 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 2305 * ESTABLISHED state and continue processing. 2306 * The ACK was checked above. 2307 */ 2308 case TCPS_SYN_RECEIVED: 2309 2310 TCPSTAT_INC(tcps_connects); 2311 soisconnected(so); 2312 /* Do window scaling? */ 2313 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2314 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2315 tp->rcv_scale = tp->request_r_scale; 2316 tp->snd_wnd = tiwin; 2317 } 2318 /* 2319 * Make transitions: 2320 * SYN-RECEIVED -> ESTABLISHED 2321 * SYN-RECEIVED* -> FIN-WAIT-1 2322 */ 2323 tp->t_starttime = ticks; 2324 if (tp->t_flags & TF_NEEDFIN) { 2325 tp->t_state = TCPS_FIN_WAIT_1; 2326 tp->t_flags &= ~TF_NEEDFIN; 2327 } else { 2328 tp->t_state = TCPS_ESTABLISHED; 2329 cc_conn_init(tp); 2330 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 2331 } 2332 /* 2333 * If segment contains data or ACK, will call tcp_reass() 2334 * later; if not, do so now to pass queued data to user. 2335 */ 2336 if (tlen == 0 && (thflags & TH_FIN) == 0) 2337 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 2338 (struct mbuf *)0); 2339 tp->snd_wl1 = th->th_seq - 1; 2340 /* FALLTHROUGH */ 2341 2342 /* 2343 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2344 * ACKs. If the ack is in the range 2345 * tp->snd_una < th->th_ack <= tp->snd_max 2346 * then advance tp->snd_una to th->th_ack and drop 2347 * data from the retransmission queue. If this ACK reflects 2348 * more up to date window information we update our window information. 2349 */ 2350 case TCPS_ESTABLISHED: 2351 case TCPS_FIN_WAIT_1: 2352 case TCPS_FIN_WAIT_2: 2353 case TCPS_CLOSE_WAIT: 2354 case TCPS_CLOSING: 2355 case TCPS_LAST_ACK: 2356 if (SEQ_GT(th->th_ack, tp->snd_max)) { 2357 TCPSTAT_INC(tcps_rcvacktoomuch); 2358 goto dropafterack; 2359 } 2360 if ((tp->t_flags & TF_SACK_PERMIT) && 2361 ((to.to_flags & TOF_SACK) || 2362 !TAILQ_EMPTY(&tp->snd_holes))) 2363 tcp_sack_doack(tp, &to, th->th_ack); 2364 2365 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2366 hhook_run_tcp_est_in(tp, th, &to); 2367 2368 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 2369 if (tlen == 0 && tiwin == tp->snd_wnd) { 2370 TCPSTAT_INC(tcps_rcvdupack); 2371 /* 2372 * If we have outstanding data (other than 2373 * a window probe), this is a completely 2374 * duplicate ack (ie, window info didn't 2375 * change), the ack is the biggest we've 2376 * seen and we've seen exactly our rexmt 2377 * threshhold of them, assume a packet 2378 * has been dropped and retransmit it. 2379 * Kludge snd_nxt & the congestion 2380 * window so we send only this one 2381 * packet. 2382 * 2383 * We know we're losing at the current 2384 * window size so do congestion avoidance 2385 * (set ssthresh to half the current window 2386 * and pull our congestion window back to 2387 * the new ssthresh). 2388 * 2389 * Dup acks mean that packets have left the 2390 * network (they're now cached at the receiver) 2391 * so bump cwnd by the amount in the receiver 2392 * to keep a constant cwnd packets in the 2393 * network. 2394 * 2395 * When using TCP ECN, notify the peer that 2396 * we reduced the cwnd. 2397 */ 2398 if (!tcp_timer_active(tp, TT_REXMT) || 2399 th->th_ack != tp->snd_una) 2400 tp->t_dupacks = 0; 2401 else if (++tp->t_dupacks > tcprexmtthresh || 2402 IN_FASTRECOVERY(tp->t_flags)) { 2403 cc_ack_received(tp, th, CC_DUPACK); 2404 if ((tp->t_flags & TF_SACK_PERMIT) && 2405 IN_FASTRECOVERY(tp->t_flags)) { 2406 int awnd; 2407 2408 /* 2409 * Compute the amount of data in flight first. 2410 * We can inject new data into the pipe iff 2411 * we have less than 1/2 the original window's 2412 * worth of data in flight. 2413 */ 2414 awnd = (tp->snd_nxt - tp->snd_fack) + 2415 tp->sackhint.sack_bytes_rexmit; 2416 if (awnd < tp->snd_ssthresh) { 2417 tp->snd_cwnd += tp->t_maxseg; 2418 if (tp->snd_cwnd > tp->snd_ssthresh) 2419 tp->snd_cwnd = tp->snd_ssthresh; 2420 } 2421 } else 2422 tp->snd_cwnd += tp->t_maxseg; 2423 (void) tcp_output(tp); 2424 goto drop; 2425 } else if (tp->t_dupacks == tcprexmtthresh) { 2426 tcp_seq onxt = tp->snd_nxt; 2427 2428 /* 2429 * If we're doing sack, check to 2430 * see if we're already in sack 2431 * recovery. If we're not doing sack, 2432 * check to see if we're in newreno 2433 * recovery. 2434 */ 2435 if (tp->t_flags & TF_SACK_PERMIT) { 2436 if (IN_FASTRECOVERY(tp->t_flags)) { 2437 tp->t_dupacks = 0; 2438 break; 2439 } 2440 } else { 2441 if (SEQ_LEQ(th->th_ack, 2442 tp->snd_recover)) { 2443 tp->t_dupacks = 0; 2444 break; 2445 } 2446 } 2447 /* Congestion signal before ack. */ 2448 cc_cong_signal(tp, th, CC_NDUPACK); 2449 cc_ack_received(tp, th, CC_DUPACK); 2450 tcp_timer_activate(tp, TT_REXMT, 0); 2451 tp->t_rtttime = 0; 2452 if (tp->t_flags & TF_SACK_PERMIT) { 2453 TCPSTAT_INC( 2454 tcps_sack_recovery_episode); 2455 tp->sack_newdata = tp->snd_nxt; 2456 tp->snd_cwnd = tp->t_maxseg; 2457 (void) tcp_output(tp); 2458 goto drop; 2459 } 2460 tp->snd_nxt = th->th_ack; 2461 tp->snd_cwnd = tp->t_maxseg; 2462 (void) tcp_output(tp); 2463 KASSERT(tp->snd_limited <= 2, 2464 ("%s: tp->snd_limited too big", 2465 __func__)); 2466 tp->snd_cwnd = tp->snd_ssthresh + 2467 tp->t_maxseg * 2468 (tp->t_dupacks - tp->snd_limited); 2469 if (SEQ_GT(onxt, tp->snd_nxt)) 2470 tp->snd_nxt = onxt; 2471 goto drop; 2472 } else if (V_tcp_do_rfc3042) { 2473 cc_ack_received(tp, th, CC_DUPACK); 2474 u_long oldcwnd = tp->snd_cwnd; 2475 tcp_seq oldsndmax = tp->snd_max; 2476 u_int sent; 2477 2478 KASSERT(tp->t_dupacks == 1 || 2479 tp->t_dupacks == 2, 2480 ("%s: dupacks not 1 or 2", 2481 __func__)); 2482 if (tp->t_dupacks == 1) 2483 tp->snd_limited = 0; 2484 tp->snd_cwnd = 2485 (tp->snd_nxt - tp->snd_una) + 2486 (tp->t_dupacks - tp->snd_limited) * 2487 tp->t_maxseg; 2488 (void) tcp_output(tp); 2489 sent = tp->snd_max - oldsndmax; 2490 if (sent > tp->t_maxseg) { 2491 KASSERT((tp->t_dupacks == 2 && 2492 tp->snd_limited == 0) || 2493 (sent == tp->t_maxseg + 1 && 2494 tp->t_flags & TF_SENTFIN), 2495 ("%s: sent too much", 2496 __func__)); 2497 tp->snd_limited = 2; 2498 } else if (sent > 0) 2499 ++tp->snd_limited; 2500 tp->snd_cwnd = oldcwnd; 2501 goto drop; 2502 } 2503 } else 2504 tp->t_dupacks = 0; 2505 break; 2506 } 2507 2508 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 2509 ("%s: th_ack <= snd_una", __func__)); 2510 2511 /* 2512 * If the congestion window was inflated to account 2513 * for the other side's cached packets, retract it. 2514 */ 2515 if (IN_FASTRECOVERY(tp->t_flags)) { 2516 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2517 if (tp->t_flags & TF_SACK_PERMIT) 2518 tcp_sack_partialack(tp, th); 2519 else 2520 tcp_newreno_partial_ack(tp, th); 2521 } else 2522 cc_post_recovery(tp, th); 2523 } 2524 tp->t_dupacks = 0; 2525 /* 2526 * If we reach this point, ACK is not a duplicate, 2527 * i.e., it ACKs something we sent. 2528 */ 2529 if (tp->t_flags & TF_NEEDSYN) { 2530 /* 2531 * T/TCP: Connection was half-synchronized, and our 2532 * SYN has been ACK'd (so connection is now fully 2533 * synchronized). Go to non-starred state, 2534 * increment snd_una for ACK of SYN, and check if 2535 * we can do window scaling. 2536 */ 2537 tp->t_flags &= ~TF_NEEDSYN; 2538 tp->snd_una++; 2539 /* Do window scaling? */ 2540 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2541 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2542 tp->rcv_scale = tp->request_r_scale; 2543 /* Send window already scaled. */ 2544 } 2545 } 2546 2547process_ACK: 2548 INP_WLOCK_ASSERT(tp->t_inpcb); 2549 2550 acked = BYTES_THIS_ACK(tp, th); 2551 TCPSTAT_INC(tcps_rcvackpack); 2552 TCPSTAT_ADD(tcps_rcvackbyte, acked); 2553 2554 /* 2555 * If we just performed our first retransmit, and the ACK 2556 * arrives within our recovery window, then it was a mistake 2557 * to do the retransmit in the first place. Recover our 2558 * original cwnd and ssthresh, and proceed to transmit where 2559 * we left off. 2560 */ 2561 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && 2562 (int)(ticks - tp->t_badrxtwin) < 0) 2563 cc_cong_signal(tp, th, CC_RTO_ERR); 2564 2565 /* 2566 * If we have a timestamp reply, update smoothed 2567 * round trip time. If no timestamp is present but 2568 * transmit timer is running and timed sequence 2569 * number was acked, update smoothed round trip time. 2570 * Since we now have an rtt measurement, cancel the 2571 * timer backoff (cf., Phil Karn's retransmit alg.). 2572 * Recompute the initial retransmit timer. 2573 * 2574 * Some boxes send broken timestamp replies 2575 * during the SYN+ACK phase, ignore 2576 * timestamps of 0 or we could calculate a 2577 * huge RTT and blow up the retransmit timer. 2578 */ 2579 if ((to.to_flags & TOF_TS) != 0 && 2580 to.to_tsecr) { 2581 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 2582 tp->t_rttlow = ticks - to.to_tsecr; 2583 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 2584 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2585 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2586 tp->t_rttlow = ticks - tp->t_rtttime; 2587 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2588 } 2589 2590 /* 2591 * If all outstanding data is acked, stop retransmit 2592 * timer and remember to restart (more output or persist). 2593 * If there is more data to be acked, restart retransmit 2594 * timer, using current (possibly backed-off) value. 2595 */ 2596 if (th->th_ack == tp->snd_max) { 2597 tcp_timer_activate(tp, TT_REXMT, 0); 2598 needoutput = 1; 2599 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2600 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 2601 2602 /* 2603 * If no data (only SYN) was ACK'd, 2604 * skip rest of ACK processing. 2605 */ 2606 if (acked == 0) 2607 goto step6; 2608 2609 /* 2610 * Let the congestion control algorithm update congestion 2611 * control related information. This typically means increasing 2612 * the congestion window. 2613 */ 2614 cc_ack_received(tp, th, CC_ACK); 2615 2616 SOCKBUF_LOCK(&so->so_snd); 2617 if (acked > so->so_snd.sb_cc) { 2618 tp->snd_wnd -= so->so_snd.sb_cc; 2619 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 2620 ourfinisacked = 1; 2621 } else { 2622 sbdrop_locked(&so->so_snd, acked); 2623 tp->snd_wnd -= acked; 2624 ourfinisacked = 0; 2625 } 2626 /* NB: sowwakeup_locked() does an implicit unlock. */ 2627 sowwakeup_locked(so); 2628 /* Detect una wraparound. */ 2629 if (!IN_RECOVERY(tp->t_flags) && 2630 SEQ_GT(tp->snd_una, tp->snd_recover) && 2631 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2632 tp->snd_recover = th->th_ack - 1; 2633 /* XXXLAS: Can this be moved up into cc_post_recovery? */ 2634 if (IN_RECOVERY(tp->t_flags) && 2635 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 2636 EXIT_RECOVERY(tp->t_flags); 2637 } 2638 tp->snd_una = th->th_ack; 2639 if (tp->t_flags & TF_SACK_PERMIT) { 2640 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 2641 tp->snd_recover = tp->snd_una; 2642 } 2643 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2644 tp->snd_nxt = tp->snd_una; 2645 2646 switch (tp->t_state) { 2647 2648 /* 2649 * In FIN_WAIT_1 STATE in addition to the processing 2650 * for the ESTABLISHED state if our FIN is now acknowledged 2651 * then enter FIN_WAIT_2. 2652 */ 2653 case TCPS_FIN_WAIT_1: 2654 if (ourfinisacked) { 2655 /* 2656 * If we can't receive any more 2657 * data, then closing user can proceed. 2658 * Starting the timer is contrary to the 2659 * specification, but if we don't get a FIN 2660 * we'll hang forever. 2661 * 2662 * XXXjl: 2663 * we should release the tp also, and use a 2664 * compressed state. 2665 */ 2666 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2667 int timeout; 2668 2669 soisdisconnected(so); 2670 timeout = (tcp_fast_finwait2_recycle) ? 2671 tcp_finwait2_timeout : tcp_maxidle; 2672 tcp_timer_activate(tp, TT_2MSL, timeout); 2673 } 2674 tp->t_state = TCPS_FIN_WAIT_2; 2675 } 2676 break; 2677 2678 /* 2679 * In CLOSING STATE in addition to the processing for 2680 * the ESTABLISHED state if the ACK acknowledges our FIN 2681 * then enter the TIME-WAIT state, otherwise ignore 2682 * the segment. 2683 */ 2684 case TCPS_CLOSING: 2685 if (ourfinisacked) { 2686 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2687 tcp_twstart(tp); 2688 INP_INFO_WUNLOCK(&V_tcbinfo); 2689 m_freem(m); 2690 return; 2691 } 2692 break; 2693 2694 /* 2695 * In LAST_ACK, we may still be waiting for data to drain 2696 * and/or to be acked, as well as for the ack of our FIN. 2697 * If our FIN is now acknowledged, delete the TCB, 2698 * enter the closed state and return. 2699 */ 2700 case TCPS_LAST_ACK: 2701 if (ourfinisacked) { 2702 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2703 tp = tcp_close(tp); 2704 goto drop; 2705 } 2706 break; 2707 } 2708 } 2709 2710step6: 2711 INP_WLOCK_ASSERT(tp->t_inpcb); 2712 2713 /* 2714 * Update window information. 2715 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2716 */ 2717 if ((thflags & TH_ACK) && 2718 (SEQ_LT(tp->snd_wl1, th->th_seq) || 2719 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2720 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2721 /* keep track of pure window updates */ 2722 if (tlen == 0 && 2723 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2724 TCPSTAT_INC(tcps_rcvwinupd); 2725 tp->snd_wnd = tiwin; 2726 tp->snd_wl1 = th->th_seq; 2727 tp->snd_wl2 = th->th_ack; 2728 if (tp->snd_wnd > tp->max_sndwnd) 2729 tp->max_sndwnd = tp->snd_wnd; 2730 needoutput = 1; 2731 } 2732 2733 /* 2734 * Process segments with URG. 2735 */ 2736 if ((thflags & TH_URG) && th->th_urp && 2737 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2738 /* 2739 * This is a kludge, but if we receive and accept 2740 * random urgent pointers, we'll crash in 2741 * soreceive. It's hard to imagine someone 2742 * actually wanting to send this much urgent data. 2743 */ 2744 SOCKBUF_LOCK(&so->so_rcv); 2745 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2746 th->th_urp = 0; /* XXX */ 2747 thflags &= ~TH_URG; /* XXX */ 2748 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 2749 goto dodata; /* XXX */ 2750 } 2751 /* 2752 * If this segment advances the known urgent pointer, 2753 * then mark the data stream. This should not happen 2754 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2755 * a FIN has been received from the remote side. 2756 * In these states we ignore the URG. 2757 * 2758 * According to RFC961 (Assigned Protocols), 2759 * the urgent pointer points to the last octet 2760 * of urgent data. We continue, however, 2761 * to consider it to indicate the first octet 2762 * of data past the urgent section as the original 2763 * spec states (in one of two places). 2764 */ 2765 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2766 tp->rcv_up = th->th_seq + th->th_urp; 2767 so->so_oobmark = so->so_rcv.sb_cc + 2768 (tp->rcv_up - tp->rcv_nxt) - 1; 2769 if (so->so_oobmark == 0) 2770 so->so_rcv.sb_state |= SBS_RCVATMARK; 2771 sohasoutofband(so); 2772 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2773 } 2774 SOCKBUF_UNLOCK(&so->so_rcv); 2775 /* 2776 * Remove out of band data so doesn't get presented to user. 2777 * This can happen independent of advancing the URG pointer, 2778 * but if two URG's are pending at once, some out-of-band 2779 * data may creep in... ick. 2780 */ 2781 if (th->th_urp <= (u_long)tlen && 2782 !(so->so_options & SO_OOBINLINE)) { 2783 /* hdr drop is delayed */ 2784 tcp_pulloutofband(so, th, m, drop_hdrlen); 2785 } 2786 } else { 2787 /* 2788 * If no out of band data is expected, 2789 * pull receive urgent pointer along 2790 * with the receive window. 2791 */ 2792 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2793 tp->rcv_up = tp->rcv_nxt; 2794 } 2795dodata: /* XXX */ 2796 INP_WLOCK_ASSERT(tp->t_inpcb); 2797 2798 /* 2799 * Process the segment text, merging it into the TCP sequencing queue, 2800 * and arranging for acknowledgment of receipt if necessary. 2801 * This process logically involves adjusting tp->rcv_wnd as data 2802 * is presented to the user (this happens in tcp_usrreq.c, 2803 * case PRU_RCVD). If a FIN has already been received on this 2804 * connection then we just ignore the text. 2805 */ 2806 if ((tlen || (thflags & TH_FIN)) && 2807 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2808 tcp_seq save_start = th->th_seq; 2809 m_adj(m, drop_hdrlen); /* delayed header drop */ 2810 /* 2811 * Insert segment which includes th into TCP reassembly queue 2812 * with control block tp. Set thflags to whether reassembly now 2813 * includes a segment with FIN. This handles the common case 2814 * inline (segment is the next to be received on an established 2815 * connection, and the queue is empty), avoiding linkage into 2816 * and removal from the queue and repetition of various 2817 * conversions. 2818 * Set DELACK for segments received in order, but ack 2819 * immediately when segments are out of order (so 2820 * fast retransmit can work). 2821 */ 2822 if (th->th_seq == tp->rcv_nxt && 2823 LIST_EMPTY(&tp->t_segq) && 2824 TCPS_HAVEESTABLISHED(tp->t_state)) { 2825 if (DELAY_ACK(tp)) 2826 tp->t_flags |= TF_DELACK; 2827 else 2828 tp->t_flags |= TF_ACKNOW; 2829 tp->rcv_nxt += tlen; 2830 thflags = th->th_flags & TH_FIN; 2831 TCPSTAT_INC(tcps_rcvpack); 2832 TCPSTAT_ADD(tcps_rcvbyte, tlen); 2833 ND6_HINT(tp); 2834 SOCKBUF_LOCK(&so->so_rcv); 2835 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 2836 m_freem(m); 2837 else 2838 sbappendstream_locked(&so->so_rcv, m); 2839 /* NB: sorwakeup_locked() does an implicit unlock. */ 2840 sorwakeup_locked(so); 2841 } else { 2842 /* 2843 * XXX: Due to the header drop above "th" is 2844 * theoretically invalid by now. Fortunately 2845 * m_adj() doesn't actually frees any mbufs 2846 * when trimming from the head. 2847 */ 2848 thflags = tcp_reass(tp, th, &tlen, m); 2849 tp->t_flags |= TF_ACKNOW; 2850 } 2851 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 2852 tcp_update_sack_list(tp, save_start, save_start + tlen); 2853#if 0 2854 /* 2855 * Note the amount of data that peer has sent into 2856 * our window, in order to estimate the sender's 2857 * buffer size. 2858 * XXX: Unused. 2859 */ 2860 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 2861 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2862 else 2863 len = so->so_rcv.sb_hiwat; 2864#endif 2865 } else { 2866 m_freem(m); 2867 thflags &= ~TH_FIN; 2868 } 2869 2870 /* 2871 * If FIN is received ACK the FIN and let the user know 2872 * that the connection is closing. 2873 */ 2874 if (thflags & TH_FIN) { 2875 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2876 socantrcvmore(so); 2877 /* 2878 * If connection is half-synchronized 2879 * (ie NEEDSYN flag on) then delay ACK, 2880 * so it may be piggybacked when SYN is sent. 2881 * Otherwise, since we received a FIN then no 2882 * more input can be expected, send ACK now. 2883 */ 2884 if (tp->t_flags & TF_NEEDSYN) 2885 tp->t_flags |= TF_DELACK; 2886 else 2887 tp->t_flags |= TF_ACKNOW; 2888 tp->rcv_nxt++; 2889 } 2890 switch (tp->t_state) { 2891 2892 /* 2893 * In SYN_RECEIVED and ESTABLISHED STATES 2894 * enter the CLOSE_WAIT state. 2895 */ 2896 case TCPS_SYN_RECEIVED: 2897 tp->t_starttime = ticks; 2898 /* FALLTHROUGH */ 2899 case TCPS_ESTABLISHED: 2900 tp->t_state = TCPS_CLOSE_WAIT; 2901 break; 2902 2903 /* 2904 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2905 * enter the CLOSING state. 2906 */ 2907 case TCPS_FIN_WAIT_1: 2908 tp->t_state = TCPS_CLOSING; 2909 break; 2910 2911 /* 2912 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2913 * starting the time-wait timer, turning off the other 2914 * standard timers. 2915 */ 2916 case TCPS_FIN_WAIT_2: 2917 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2918 KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " 2919 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 2920 ti_locked)); 2921 2922 tcp_twstart(tp); 2923 INP_INFO_WUNLOCK(&V_tcbinfo); 2924 return; 2925 } 2926 } 2927 if (ti_locked == TI_WLOCKED) 2928 INP_INFO_WUNLOCK(&V_tcbinfo); 2929 ti_locked = TI_UNLOCKED; 2930 2931#ifdef TCPDEBUG 2932 if (so->so_options & SO_DEBUG) 2933 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2934 &tcp_savetcp, 0); 2935#endif 2936 2937 /* 2938 * Return any desired output. 2939 */ 2940 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2941 (void) tcp_output(tp); 2942 2943check_delack: 2944 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 2945 __func__, ti_locked)); 2946 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2947 INP_WLOCK_ASSERT(tp->t_inpcb); 2948 2949 if (tp->t_flags & TF_DELACK) { 2950 tp->t_flags &= ~TF_DELACK; 2951 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2952 } 2953 INP_WUNLOCK(tp->t_inpcb); 2954 return; 2955 2956dropafterack: 2957 /* 2958 * Generate an ACK dropping incoming segment if it occupies 2959 * sequence space, where the ACK reflects our state. 2960 * 2961 * We can now skip the test for the RST flag since all 2962 * paths to this code happen after packets containing 2963 * RST have been dropped. 2964 * 2965 * In the SYN-RECEIVED state, don't send an ACK unless the 2966 * segment we received passes the SYN-RECEIVED ACK test. 2967 * If it fails send a RST. This breaks the loop in the 2968 * "LAND" DoS attack, and also prevents an ACK storm 2969 * between two listening ports that have been sent forged 2970 * SYN segments, each with the source address of the other. 2971 */ 2972 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2973 (SEQ_GT(tp->snd_una, th->th_ack) || 2974 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2975 rstreason = BANDLIM_RST_OPENPORT; 2976 goto dropwithreset; 2977 } 2978#ifdef TCPDEBUG 2979 if (so->so_options & SO_DEBUG) 2980 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2981 &tcp_savetcp, 0); 2982#endif 2983 if (ti_locked == TI_WLOCKED) 2984 INP_INFO_WUNLOCK(&V_tcbinfo); 2985 ti_locked = TI_UNLOCKED; 2986 2987 tp->t_flags |= TF_ACKNOW; 2988 (void) tcp_output(tp); 2989 INP_WUNLOCK(tp->t_inpcb); 2990 m_freem(m); 2991 return; 2992 2993dropwithreset: 2994 if (ti_locked == TI_WLOCKED) 2995 INP_INFO_WUNLOCK(&V_tcbinfo); 2996 ti_locked = TI_UNLOCKED; 2997 2998 if (tp != NULL) { 2999 tcp_dropwithreset(m, th, tp, tlen, rstreason); 3000 INP_WUNLOCK(tp->t_inpcb); 3001 } else 3002 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 3003 return; 3004 3005drop: 3006 if (ti_locked == TI_WLOCKED) { 3007 INP_INFO_WUNLOCK(&V_tcbinfo); 3008 ti_locked = TI_UNLOCKED; 3009 } 3010#ifdef INVARIANTS 3011 else 3012 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 3013#endif 3014 3015 /* 3016 * Drop space held by incoming segment and return. 3017 */ 3018#ifdef TCPDEBUG 3019 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 3020 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 3021 &tcp_savetcp, 0); 3022#endif 3023 if (tp != NULL) 3024 INP_WUNLOCK(tp->t_inpcb); 3025 m_freem(m); 3026} 3027 3028/* 3029 * Issue RST and make ACK acceptable to originator of segment. 3030 * The mbuf must still include the original packet header. 3031 * tp may be NULL. 3032 */ 3033static void 3034tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 3035 int tlen, int rstreason) 3036{ 3037#ifdef INET 3038 struct ip *ip; 3039#endif 3040#ifdef INET6 3041 struct ip6_hdr *ip6; 3042#endif 3043 3044 if (tp != NULL) { 3045 INP_WLOCK_ASSERT(tp->t_inpcb); 3046 } 3047 3048 /* Don't bother if destination was broadcast/multicast. */ 3049 if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 3050 goto drop; 3051#ifdef INET6 3052 if (mtod(m, struct ip *)->ip_v == 6) { 3053 ip6 = mtod(m, struct ip6_hdr *); 3054 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 3055 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 3056 goto drop; 3057 /* IPv6 anycast check is done at tcp6_input() */ 3058 } 3059#endif 3060#if defined(INET) && defined(INET6) 3061 else 3062#endif 3063#ifdef INET 3064 { 3065 ip = mtod(m, struct ip *); 3066 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 3067 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 3068 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 3069 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 3070 goto drop; 3071 } 3072#endif 3073 3074 /* Perform bandwidth limiting. */ 3075 if (badport_bandlim(rstreason) < 0) 3076 goto drop; 3077 3078 /* tcp_respond consumes the mbuf chain. */ 3079 if (th->th_flags & TH_ACK) { 3080 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 3081 th->th_ack, TH_RST); 3082 } else { 3083 if (th->th_flags & TH_SYN) 3084 tlen++; 3085 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 3086 (tcp_seq)0, TH_RST|TH_ACK); 3087 } 3088 return; 3089drop: 3090 m_freem(m); 3091} 3092 3093/* 3094 * Parse TCP options and place in tcpopt. 3095 */ 3096static void 3097tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 3098{ 3099 int opt, optlen; 3100 3101 to->to_flags = 0; 3102 for (; cnt > 0; cnt -= optlen, cp += optlen) { 3103 opt = cp[0]; 3104 if (opt == TCPOPT_EOL) 3105 break; 3106 if (opt == TCPOPT_NOP) 3107 optlen = 1; 3108 else { 3109 if (cnt < 2) 3110 break; 3111 optlen = cp[1]; 3112 if (optlen < 2 || optlen > cnt) 3113 break; 3114 } 3115 switch (opt) { 3116 case TCPOPT_MAXSEG: 3117 if (optlen != TCPOLEN_MAXSEG) 3118 continue; 3119 if (!(flags & TO_SYN)) 3120 continue; 3121 to->to_flags |= TOF_MSS; 3122 bcopy((char *)cp + 2, 3123 (char *)&to->to_mss, sizeof(to->to_mss)); 3124 to->to_mss = ntohs(to->to_mss); 3125 break; 3126 case TCPOPT_WINDOW: 3127 if (optlen != TCPOLEN_WINDOW) 3128 continue; 3129 if (!(flags & TO_SYN)) 3130 continue; 3131 to->to_flags |= TOF_SCALE; 3132 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 3133 break; 3134 case TCPOPT_TIMESTAMP: 3135 if (optlen != TCPOLEN_TIMESTAMP) 3136 continue; 3137 to->to_flags |= TOF_TS; 3138 bcopy((char *)cp + 2, 3139 (char *)&to->to_tsval, sizeof(to->to_tsval)); 3140 to->to_tsval = ntohl(to->to_tsval); 3141 bcopy((char *)cp + 6, 3142 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 3143 to->to_tsecr = ntohl(to->to_tsecr); 3144 break; 3145#ifdef TCP_SIGNATURE 3146 /* 3147 * XXX In order to reply to a host which has set the 3148 * TCP_SIGNATURE option in its initial SYN, we have to 3149 * record the fact that the option was observed here 3150 * for the syncache code to perform the correct response. 3151 */ 3152 case TCPOPT_SIGNATURE: 3153 if (optlen != TCPOLEN_SIGNATURE) 3154 continue; 3155 to->to_flags |= TOF_SIGNATURE; 3156 to->to_signature = cp + 2; 3157 break; 3158#endif 3159 case TCPOPT_SACK_PERMITTED: 3160 if (optlen != TCPOLEN_SACK_PERMITTED) 3161 continue; 3162 if (!(flags & TO_SYN)) 3163 continue; 3164 if (!V_tcp_do_sack) 3165 continue; 3166 to->to_flags |= TOF_SACKPERM; 3167 break; 3168 case TCPOPT_SACK: 3169 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 3170 continue; 3171 if (flags & TO_SYN) 3172 continue; 3173 to->to_flags |= TOF_SACK; 3174 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 3175 to->to_sacks = cp + 2; 3176 TCPSTAT_INC(tcps_sack_rcv_blocks); 3177 break; 3178 default: 3179 continue; 3180 } 3181 } 3182} 3183 3184/* 3185 * Pull out of band byte out of a segment so 3186 * it doesn't appear in the user's data queue. 3187 * It is still reflected in the segment length for 3188 * sequencing purposes. 3189 */ 3190static void 3191tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 3192 int off) 3193{ 3194 int cnt = off + th->th_urp - 1; 3195 3196 while (cnt >= 0) { 3197 if (m->m_len > cnt) { 3198 char *cp = mtod(m, caddr_t) + cnt; 3199 struct tcpcb *tp = sototcpcb(so); 3200 3201 INP_WLOCK_ASSERT(tp->t_inpcb); 3202 3203 tp->t_iobc = *cp; 3204 tp->t_oobflags |= TCPOOB_HAVEDATA; 3205 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 3206 m->m_len--; 3207 if (m->m_flags & M_PKTHDR) 3208 m->m_pkthdr.len--; 3209 return; 3210 } 3211 cnt -= m->m_len; 3212 m = m->m_next; 3213 if (m == NULL) 3214 break; 3215 } 3216 panic("tcp_pulloutofband"); 3217} 3218 3219/* 3220 * Collect new round-trip time estimate 3221 * and update averages and current timeout. 3222 */ 3223static void 3224tcp_xmit_timer(struct tcpcb *tp, int rtt) 3225{ 3226 int delta; 3227 3228 INP_WLOCK_ASSERT(tp->t_inpcb); 3229 3230 TCPSTAT_INC(tcps_rttupdated); 3231 tp->t_rttupdated++; 3232 if (tp->t_srtt != 0) { 3233 /* 3234 * srtt is stored as fixed point with 5 bits after the 3235 * binary point (i.e., scaled by 8). The following magic 3236 * is equivalent to the smoothing algorithm in rfc793 with 3237 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 3238 * point). Adjust rtt to origin 0. 3239 */ 3240 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3241 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3242 3243 if ((tp->t_srtt += delta) <= 0) 3244 tp->t_srtt = 1; 3245 3246 /* 3247 * We accumulate a smoothed rtt variance (actually, a 3248 * smoothed mean difference), then set the retransmit 3249 * timer to smoothed rtt + 4 times the smoothed variance. 3250 * rttvar is stored as fixed point with 4 bits after the 3251 * binary point (scaled by 16). The following is 3252 * equivalent to rfc793 smoothing with an alpha of .75 3253 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 3254 * rfc793's wired-in beta. 3255 */ 3256 if (delta < 0) 3257 delta = -delta; 3258 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3259 if ((tp->t_rttvar += delta) <= 0) 3260 tp->t_rttvar = 1; 3261 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3262 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3263 } else { 3264 /* 3265 * No rtt measurement yet - use the unsmoothed rtt. 3266 * Set the variance to half the rtt (so our first 3267 * retransmit happens at 3*rtt). 3268 */ 3269 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3270 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3271 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3272 } 3273 tp->t_rtttime = 0; 3274 tp->t_rxtshift = 0; 3275 3276 /* 3277 * the retransmit should happen at rtt + 4 * rttvar. 3278 * Because of the way we do the smoothing, srtt and rttvar 3279 * will each average +1/2 tick of bias. When we compute 3280 * the retransmit timer, we want 1/2 tick of rounding and 3281 * 1 extra tick because of +-1/2 tick uncertainty in the 3282 * firing of the timer. The bias will give us exactly the 3283 * 1.5 tick we need. But, because the bias is 3284 * statistical, we have to test that we don't drop below 3285 * the minimum feasible timer (which is 2 ticks). 3286 */ 3287 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3288 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 3289 3290 /* 3291 * We received an ack for a packet that wasn't retransmitted; 3292 * it is probably safe to discard any error indications we've 3293 * received recently. This isn't quite right, but close enough 3294 * for now (a route might have failed after we sent a segment, 3295 * and the return path might not be symmetrical). 3296 */ 3297 tp->t_softerror = 0; 3298} 3299 3300/* 3301 * Determine a reasonable value for maxseg size. 3302 * If the route is known, check route for mtu. 3303 * If none, use an mss that can be handled on the outgoing 3304 * interface without forcing IP to fragment; if bigger than 3305 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 3306 * to utilize large mbufs. If no route is found, route has no mtu, 3307 * or the destination isn't local, use a default, hopefully conservative 3308 * size (usually 512 or the default IP max size, but no more than the mtu 3309 * of the interface), as we can't discover anything about intervening 3310 * gateways or networks. We also initialize the congestion/slow start 3311 * window to be a single segment if the destination isn't local. 3312 * While looking at the routing entry, we also initialize other path-dependent 3313 * parameters from pre-set or cached values in the routing entry. 3314 * 3315 * Also take into account the space needed for options that we 3316 * send regularly. Make maxseg shorter by that amount to assure 3317 * that we can send maxseg amount of data even when the options 3318 * are present. Store the upper limit of the length of options plus 3319 * data in maxopd. 3320 * 3321 * In case of T/TCP, we call this routine during implicit connection 3322 * setup as well (offer = -1), to initialize maxseg from the cached 3323 * MSS of our peer. 3324 * 3325 * NOTE that this routine is only called when we process an incoming 3326 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). 3327 */ 3328void 3329tcp_mss_update(struct tcpcb *tp, int offer, 3330 struct hc_metrics_lite *metricptr, int *mtuflags) 3331{ 3332 int mss = 0; 3333 u_long maxmtu = 0; 3334 struct inpcb *inp = tp->t_inpcb; 3335 struct hc_metrics_lite metrics; 3336 int origoffer = offer; 3337#ifdef INET6 3338 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 3339 size_t min_protoh = isipv6 ? 3340 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : 3341 sizeof (struct tcpiphdr); 3342#else 3343 const size_t min_protoh = sizeof(struct tcpiphdr); 3344#endif 3345 3346 INP_WLOCK_ASSERT(tp->t_inpcb); 3347 3348 /* Initialize. */ 3349#ifdef INET6 3350 if (isipv6) { 3351 maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags); 3352 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; 3353 } 3354#endif 3355#if defined(INET) && defined(INET6) 3356 else 3357#endif 3358#ifdef INET 3359 { 3360 maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags); 3361 tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; 3362 } 3363#endif 3364 3365 /* 3366 * No route to sender, stay with default mss and return. 3367 */ 3368 if (maxmtu == 0) { 3369 /* 3370 * In case we return early we need to initialize metrics 3371 * to a defined state as tcp_hc_get() would do for us 3372 * if there was no cache hit. 3373 */ 3374 if (metricptr != NULL) 3375 bzero(metricptr, sizeof(struct hc_metrics_lite)); 3376 return; 3377 } 3378 3379 /* What have we got? */ 3380 switch (offer) { 3381 case 0: 3382 /* 3383 * Offer == 0 means that there was no MSS on the SYN 3384 * segment, in this case we use tcp_mssdflt as 3385 * already assigned to t_maxopd above. 3386 */ 3387 offer = tp->t_maxopd; 3388 break; 3389 3390 case -1: 3391 /* 3392 * Offer == -1 means that we didn't receive SYN yet. 3393 */ 3394 /* FALLTHROUGH */ 3395 3396 default: 3397 /* 3398 * Prevent DoS attack with too small MSS. Round up 3399 * to at least minmss. 3400 */ 3401 offer = max(offer, V_tcp_minmss); 3402 } 3403 3404 /* 3405 * rmx information is now retrieved from tcp_hostcache. 3406 */ 3407 tcp_hc_get(&inp->inp_inc, &metrics); 3408 if (metricptr != NULL) 3409 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); 3410 3411 /* 3412 * If there's a discovered mtu int tcp hostcache, use it 3413 * else, use the link mtu. 3414 */ 3415 if (metrics.rmx_mtu) 3416 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; 3417 else { 3418#ifdef INET6 3419 if (isipv6) { 3420 mss = maxmtu - min_protoh; 3421 if (!V_path_mtu_discovery && 3422 !in6_localaddr(&inp->in6p_faddr)) 3423 mss = min(mss, V_tcp_v6mssdflt); 3424 } 3425#endif 3426#if defined(INET) && defined(INET6) 3427 else 3428#endif 3429#ifdef INET 3430 { 3431 mss = maxmtu - min_protoh; 3432 if (!V_path_mtu_discovery && 3433 !in_localaddr(inp->inp_faddr)) 3434 mss = min(mss, V_tcp_mssdflt); 3435 } 3436#endif 3437 /* 3438 * XXX - The above conditional (mss = maxmtu - min_protoh) 3439 * probably violates the TCP spec. 3440 * The problem is that, since we don't know the 3441 * other end's MSS, we are supposed to use a conservative 3442 * default. But, if we do that, then MTU discovery will 3443 * never actually take place, because the conservative 3444 * default is much less than the MTUs typically seen 3445 * on the Internet today. For the moment, we'll sweep 3446 * this under the carpet. 3447 * 3448 * The conservative default might not actually be a problem 3449 * if the only case this occurs is when sending an initial 3450 * SYN with options and data to a host we've never talked 3451 * to before. Then, they will reply with an MSS value which 3452 * will get recorded and the new parameters should get 3453 * recomputed. For Further Study. 3454 */ 3455 } 3456 mss = min(mss, offer); 3457 3458 /* 3459 * Sanity check: make sure that maxopd will be large 3460 * enough to allow some data on segments even if the 3461 * all the option space is used (40bytes). Otherwise 3462 * funny things may happen in tcp_output. 3463 */ 3464 mss = max(mss, 64); 3465 3466 /* 3467 * maxopd stores the maximum length of data AND options 3468 * in a segment; maxseg is the amount of data in a normal 3469 * segment. We need to store this value (maxopd) apart 3470 * from maxseg, because now every segment carries options 3471 * and thus we normally have somewhat less data in segments. 3472 */ 3473 tp->t_maxopd = mss; 3474 3475 /* 3476 * origoffer==-1 indicates that no segments were received yet. 3477 * In this case we just guess. 3478 */ 3479 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3480 (origoffer == -1 || 3481 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 3482 mss -= TCPOLEN_TSTAMP_APPA; 3483 3484#if (MCLBYTES & (MCLBYTES - 1)) == 0 3485 if (mss > MCLBYTES) 3486 mss &= ~(MCLBYTES-1); 3487#else 3488 if (mss > MCLBYTES) 3489 mss = mss / MCLBYTES * MCLBYTES; 3490#endif 3491 tp->t_maxseg = mss; 3492} 3493 3494void 3495tcp_mss(struct tcpcb *tp, int offer) 3496{ 3497 int mss; 3498 u_long bufsize; 3499 struct inpcb *inp; 3500 struct socket *so; 3501 struct hc_metrics_lite metrics; 3502 int mtuflags = 0; 3503 3504 KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); 3505 3506 tcp_mss_update(tp, offer, &metrics, &mtuflags); 3507 3508 mss = tp->t_maxseg; 3509 inp = tp->t_inpcb; 3510 3511 /* 3512 * If there's a pipesize, change the socket buffer to that size, 3513 * don't change if sb_hiwat is different than default (then it 3514 * has been changed on purpose with setsockopt). 3515 * Make the socket buffers an integral number of mss units; 3516 * if the mss is larger than the socket buffer, decrease the mss. 3517 */ 3518 so = inp->inp_socket; 3519 SOCKBUF_LOCK(&so->so_snd); 3520 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) 3521 bufsize = metrics.rmx_sendpipe; 3522 else 3523 bufsize = so->so_snd.sb_hiwat; 3524 if (bufsize < mss) 3525 mss = bufsize; 3526 else { 3527 bufsize = roundup(bufsize, mss); 3528 if (bufsize > sb_max) 3529 bufsize = sb_max; 3530 if (bufsize > so->so_snd.sb_hiwat) 3531 (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); 3532 } 3533 SOCKBUF_UNLOCK(&so->so_snd); 3534 tp->t_maxseg = mss; 3535 3536 SOCKBUF_LOCK(&so->so_rcv); 3537 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) 3538 bufsize = metrics.rmx_recvpipe; 3539 else 3540 bufsize = so->so_rcv.sb_hiwat; 3541 if (bufsize > mss) { 3542 bufsize = roundup(bufsize, mss); 3543 if (bufsize > sb_max) 3544 bufsize = sb_max; 3545 if (bufsize > so->so_rcv.sb_hiwat) 3546 (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); 3547 } 3548 SOCKBUF_UNLOCK(&so->so_rcv); 3549 3550 /* Check the interface for TSO capabilities. */ 3551 if (mtuflags & CSUM_TSO) 3552 tp->t_flags |= TF_TSO; 3553} 3554 3555/* 3556 * Determine the MSS option to send on an outgoing SYN. 3557 */ 3558int 3559tcp_mssopt(struct in_conninfo *inc) 3560{ 3561 int mss = 0; 3562 u_long maxmtu = 0; 3563 u_long thcmtu = 0; 3564 size_t min_protoh; 3565 3566 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 3567 3568#ifdef INET6 3569 if (inc->inc_flags & INC_ISIPV6) { 3570 mss = V_tcp_v6mssdflt; 3571 maxmtu = tcp_maxmtu6(inc, NULL); 3572 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3573 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 3574 } 3575#endif 3576#if defined(INET) && defined(INET6) 3577 else 3578#endif 3579#ifdef INET 3580 { 3581 mss = V_tcp_mssdflt; 3582 maxmtu = tcp_maxmtu(inc, NULL); 3583 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3584 min_protoh = sizeof(struct tcpiphdr); 3585 } 3586#endif 3587 if (maxmtu && thcmtu) 3588 mss = min(maxmtu, thcmtu) - min_protoh; 3589 else if (maxmtu || thcmtu) 3590 mss = max(maxmtu, thcmtu) - min_protoh; 3591 3592 return (mss); 3593} 3594 3595 3596/* 3597 * On a partial ack arrives, force the retransmission of the 3598 * next unacknowledged segment. Do not clear tp->t_dupacks. 3599 * By setting snd_nxt to ti_ack, this forces retransmission timer to 3600 * be started again. 3601 */ 3602static void 3603tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 3604{ 3605 tcp_seq onxt = tp->snd_nxt; 3606 u_long ocwnd = tp->snd_cwnd; 3607 3608 INP_WLOCK_ASSERT(tp->t_inpcb); 3609 3610 tcp_timer_activate(tp, TT_REXMT, 0); 3611 tp->t_rtttime = 0; 3612 tp->snd_nxt = th->th_ack; 3613 /* 3614 * Set snd_cwnd to one segment beyond acknowledged offset. 3615 * (tp->snd_una has not yet been updated when this function is called.) 3616 */ 3617 tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); 3618 tp->t_flags |= TF_ACKNOW; 3619 (void) tcp_output(tp); 3620 tp->snd_cwnd = ocwnd; 3621 if (SEQ_GT(onxt, tp->snd_nxt)) 3622 tp->snd_nxt = onxt; 3623 /* 3624 * Partial window deflation. Relies on fact that tp->snd_una 3625 * not updated yet. 3626 */ 3627 if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) 3628 tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); 3629 else 3630 tp->snd_cwnd = 0; 3631 tp->snd_cwnd += tp->t_maxseg; 3632} 3633