1/* 2 * Copyright (c) 2011 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/param.h> 30#include <sys/systm.h> 31#include <sys/sysctl.h> 32#include <sys/mbuf.h> 33#include <sys/mcache.h> 34#include <sys/socket.h> 35#include <sys/socketvar.h> 36#include <net/if_types.h> 37#include <net/route.h> 38#include <netinet/in.h> 39#include <netinet/in_systm.h> 40#include <net/if.h> 41#include <netinet/ip.h> 42#include <netinet/ip_var.h> 43#include <netinet/in_var.h> 44#include <netinet/tcp.h> 45#include <netinet/tcp_seq.h> 46#include <netinet/tcpip.h> 47#include <netinet/tcp_var.h> 48#include <netinet/tcp_lro.h> 49#include <netinet/lro_ext.h> 50#include <kern/locks.h> 51 52unsigned int lrocount = 0; /* A counter used for debugging only */ 53unsigned int lro_seq_outoforder = 0; /* Counter for debugging */ 54unsigned int lro_seq_mismatch = 0; /* Counter for debugging */ 55unsigned int lro_eject_req = 0; /* Counter for tracking flow ejections */ 56unsigned int lro_flushes = 0; /* Counter for tracking number of flushes */ 57unsigned int lro_single_flushes = 0; 58unsigned int lro_double_flushes = 0; 59unsigned int lro_good_flushes = 0; 60 61unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS; 62SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED, 63 &coalesc_sz, 0, "Max coalescing size"); 64 65unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER; 66SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED, 67 &coalesc_time, 0, "Max coalescing time"); 68 69struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS]; 70 71char lro_flow_map[TCP_LRO_FLOW_MAP]; 72 73static lck_attr_t *tcp_lro_mtx_attr = NULL; /* mutex attributes */ 74static lck_grp_t *tcp_lro_mtx_grp = NULL; /* mutex group */ 75static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL; /* mutex group attrs */ 76decl_lck_mtx_data( ,tcp_lro_lock); /* Used to synchronize updates */ 77 78unsigned int lro_byte_count = 0; 79 80uint64_t lro_deadline = 0; /* LRO's sense of time - protected by tcp_lro_lock */ 81uint32_t lro_timer_set = 0; 82 83/* Some LRO stats */ 84u_int32_t lro_pkt_count = 0; /* Number of packets encountered in an LRO period */ 85thread_call_t tcp_lro_timer; 86 87extern u_int32_t kipf_count; 88 89static void tcp_lro_timer_proc(void*, void*); 90static void lro_update_stats(struct mbuf*); 91static void lro_update_flush_stats(struct mbuf *); 92static void tcp_lro_flush_flows(void); 93static void tcp_lro_sched_timer(uint64_t); 94static void lro_proto_input(struct mbuf *); 95 96static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ipovly *, 97 struct tcphdr*); 98static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*, 99 int); 100 101void 102tcp_lro_init(void) 103{ 104 int i; 105 106 bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS); 107 for (i = 0; i < TCP_LRO_FLOW_MAP; i++) { 108 lro_flow_map[i] = TCP_LRO_FLOW_UNINIT; 109 } 110 111 /* 112 * allocate lock group attribute, group and attribute for tcp_lro_lock 113 */ 114 tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init(); 115 tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr); 116 tcp_lro_mtx_attr = lck_attr_alloc_init(); 117 lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr); 118 119 tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL); 120 if (tcp_lro_timer == NULL) { 121 panic_plain("%s: unable to allocate lro timer", __func__); 122 } 123 124 return; 125} 126 127static int 128tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash, 129 int *flow_id ) 130{ 131 struct lro_flow *flow; 132 tcp_seq seqnum; 133 unsigned int off = 0; 134 int payload_len = 0; 135 136 *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, 137 tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1)); 138 139 *flow_id = lro_flow_map[*hash]; 140 if (*flow_id == TCP_LRO_FLOW_NOTFOUND) { 141 return TCP_LRO_NAN; 142 } 143 144 seqnum = tcp_hdr->th_seq; 145 off = tcp_hdr->th_off << 2; 146 payload_len = ip_hdr->ip_len - off; 147 148 flow = &lro_flow_list[*flow_id]; 149 150 if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) && 151 (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) && 152 (flow->lr_fport == tcp_hdr->th_sport) && 153 (flow->lr_lport == tcp_hdr->th_dport)) { 154 if (flow->lr_tcphdr == NULL) { 155 if (ntohl(seqnum) == flow->lr_seq) { 156 return TCP_LRO_COALESCE; 157 } 158 if (lrodebug >= 4) { 159 printf("%s: seqnum = %x, lr_seq = %x\n", 160 __func__, ntohl(seqnum), flow->lr_seq); 161 } 162 lro_seq_mismatch++; 163 if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) { 164 lro_seq_outoforder++; 165 /* 166 * Whenever we receive out of order packets it 167 * signals loss and recovery and LRO doesn't 168 * let flows recover quickly. So eject. 169 */ 170 flow->lr_flags |= LRO_EJECT_REQ; 171 172 } 173 return TCP_LRO_NAN; 174 } 175 176 if (flow->lr_flags & LRO_EJECT_REQ) { 177 if (lrodebug) 178 printf("%s: eject. \n", __func__); 179 return TCP_LRO_EJECT_FLOW; 180 } 181 if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) { 182 if (lrodebug) { 183 printf("%s: th_ack = %x flow_ack = %x \n", 184 __func__, tcp_hdr->th_ack, 185 flow->lr_tcphdr->th_ack); 186 } 187 return TCP_LRO_EJECT_FLOW; 188 } 189 190 if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) { 191 return TCP_LRO_COALESCE; 192 } else { 193 /* LRO does not handle loss recovery well, eject */ 194 flow->lr_flags |= LRO_EJECT_REQ; 195 return TCP_LRO_EJECT_FLOW; 196 } 197 } 198 if (lrodebug) printf("tcp_lro_matching_tuple: collision \n"); 199 return TCP_LRO_COLLISION; 200} 201 202static void 203tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr, 204 int hash, u_int32_t timestamp, int payload_len) 205{ 206 struct lro_flow *flow = NULL; 207 208 flow = &lro_flow_list[flow_id]; 209 210 flow->lr_hash_map = hash; 211 flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr; 212 flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr; 213 flow->lr_fport = tcp_hdr->th_sport; 214 flow->lr_lport = tcp_hdr->th_dport; 215 lro_flow_map[hash] = flow_id; 216 flow->lr_timestamp = timestamp; 217 flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len; 218 flow->lr_flags = 0; 219 return; 220} 221 222static void 223tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr, 224 int payload_len, int drop_hdrlen, struct tcpopt *topt, 225 u_int32_t* tsval, u_int32_t* tsecr, int thflags) 226{ 227 struct lro_flow *flow = NULL; 228 struct mbuf *last; 229 struct ip *ip = NULL; 230 231 flow = &lro_flow_list[flow_id]; 232 if (flow->lr_mhead) { 233 if (lrodebug) 234 printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq, 235 payload_len); 236 m_adj(lro_mb, drop_hdrlen); 237 238 last = flow->lr_mtail; 239 while (last->m_next != NULL) { 240 last = last->m_next; 241 } 242 last->m_next = lro_mb; 243 244 flow->lr_mtail = lro_mb; 245 246 ip = mtod(flow->lr_mhead, struct ip *); 247 ip->ip_len += lro_mb->m_pkthdr.len; 248 flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len; 249 250 if (flow->lr_len == 0) { 251 panic_plain("%s: Inconsistent LRO flow state", __func__); 252 } 253 flow->lr_len += payload_len; 254 flow->lr_seq += payload_len; 255 /* 256 * This bit is re-OR'd each time a packet is added to the 257 * large coalesced packet. 258 */ 259 flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT; 260 flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */ 261 if (flow->lr_mhead->m_pkthdr.lro_pktlen < 262 lro_mb->m_pkthdr.lro_pktlen) { 263 /* 264 * For TCP Inter Arrival Jitter calculation, return max 265 * size encountered while coalescing a stream of pkts. 266 */ 267 flow->lr_mhead->m_pkthdr.lro_pktlen = 268 lro_mb->m_pkthdr.lro_pktlen; 269 } 270 /* Update the timestamp value */ 271 if (topt->to_flags & TOF_TS) { 272 if ((flow->lr_tsval) && 273 (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) { 274 *(flow->lr_tsval) = htonl(topt->to_tsval); 275 } 276 if ((flow->lr_tsecr) && 277 (topt->to_tsecr != 0) && 278 (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) { 279 if (lrodebug >= 2) { 280 printf("%s: instantaneous RTT = %d \n", __func__, 281 topt->to_tsecr - ntohl(*(flow->lr_tsecr))); 282 } 283 *(flow->lr_tsecr) = htonl(topt->to_tsecr); 284 } 285 } 286 /* Coalesce the flags */ 287 if (thflags) { 288 flow->lr_tcphdr->th_flags |= thflags; 289 } 290 /* Update receive window */ 291 flow->lr_tcphdr->th_win = tcphdr->th_win; 292 } else { 293 if (lro_mb) { 294 flow->lr_mhead = flow->lr_mtail = lro_mb; 295 flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT; 296 flow->lr_tcphdr = tcphdr; 297 if ((topt) && (topt->to_flags & TOF_TS)) { 298 ASSERT(tsval != NULL); 299 ASSERT(tsecr != NULL); 300 flow->lr_tsval = tsval; 301 flow->lr_tsecr = tsecr; 302 } 303 flow->lr_len = payload_len; 304 flow->lr_timestamp = tcp_now; 305 tcp_lro_sched_timer(0); 306 } 307 flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len; 308 } 309 if (lro_mb) { 310 tcpstat.tcps_coalesced_pack++; 311 } 312 return; 313} 314 315static struct mbuf * 316tcp_lro_eject_flow(int flow_id) 317{ 318 struct mbuf *mb = NULL; 319 320 mb = lro_flow_list[flow_id].lr_mhead; 321 ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id); 322 lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT; 323 bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow)); 324 325 return mb; 326} 327 328static struct mbuf* 329tcp_lro_eject_coalesced_pkt(int flow_id) 330{ 331 struct mbuf *mb = NULL; 332 mb = lro_flow_list[flow_id].lr_mhead; 333 lro_flow_list[flow_id].lr_mhead = 334 lro_flow_list[flow_id].lr_mtail = NULL; 335 lro_flow_list[flow_id].lr_tcphdr = NULL; 336 return mb; 337} 338 339static struct mbuf* 340tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr, 341 struct tcphdr *tcp_hdr, int payload_len, 342 int drop_hdrlen, int hash, struct tcpopt *topt, 343 u_int32_t *tsval, u_int32_t *tsecr) 344{ 345 int i; 346 int slot_available = 0; 347 int candidate_flow = 0; 348 u_int32_t oldest_timestamp; 349 struct mbuf *mb = NULL; 350 int collision = 0; 351 352 oldest_timestamp = tcp_now; 353 354 /* handle collision */ 355 if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) { 356 if (lrodebug) { 357 collision = 1; 358 } 359 candidate_flow = lro_flow_map[hash]; 360 tcpstat.tcps_flowtbl_collision++; 361 goto kick_flow; 362 } 363 364 for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) { 365 if (lro_flow_list[i].lr_mhead == NULL) { 366 candidate_flow = i; 367 slot_available = 1; 368 break; 369 } 370 if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) { 371 candidate_flow = i; 372 oldest_timestamp = lro_flow_list[i].lr_timestamp; 373 } 374 } 375 376 if (!slot_available) { 377 tcpstat.tcps_flowtbl_full++; 378kick_flow: 379 /* kick the oldest flow */ 380 mb = tcp_lro_eject_flow(candidate_flow); 381 382 if (lrodebug) { 383 if (!slot_available) { 384 printf("%s: slot unavailable.\n",__func__); 385 } 386 if (collision) { 387 printf("%s: collision.\n",__func__); 388 } 389 } 390 } else { 391 candidate_flow = i; /* this is now the flow to be used */ 392 393 } 394 395 tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash, 396 tcp_now, payload_len); 397 tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len, 398 drop_hdrlen, topt, tsval, tsecr, 0); 399 return mb; 400} 401 402struct mbuf* 403tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr, 404 struct tcphdr *tcp_hdr, int drop_hdrlen) 405{ 406 int flow_id = TCP_LRO_FLOW_UNINIT; 407 int hash; 408 unsigned int off = 0; 409 int eject_flow = 0; 410 int optlen; 411 int retval = 0; 412 struct mbuf *mb = NULL; 413 int payload_len = 0; 414 u_char *optp = NULL; 415 int thflags = 0; 416 struct tcpopt to; 417 int ret_response = TCP_LRO_CONSUMED; 418 int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0; 419 u_int8_t ecn; 420 421 if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) { 422 if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) { 423 tcpstat.tcps_rcvshort++; 424 m_freem(lro_mb); 425 if (lrodebug) { 426 printf("tcp_lro_process_pkt:mbuf too short.\n"); 427 } 428 return NULL; 429 } 430 } 431 432 if ((lro_mb = lro_tcp_xsum_validate(lro_mb, 433 (struct ipovly*)ip_hdr, tcp_hdr)) == NULL) { 434 if (lrodebug) { 435 printf("tcp_lro_process_pkt: TCP xsum failed.\n"); 436 } 437 return NULL; 438 } 439 440 /* Update stats */ 441 lro_pkt_count++; 442 443 /* Avoids checksumming in tcp_input */ 444 lro_mb->m_pkthdr.aux_flags |= MAUXF_SW_LRO_DID_CSUM; 445 446 off = tcp_hdr->th_off << 2; 447 optlen = off - sizeof (struct tcphdr); 448 payload_len = ip_hdr->ip_len - off; 449 optp = (u_char *)(tcp_hdr + 1); 450 /* 451 * Do quick retrieval of timestamp options ("options 452 * prediction?"). If timestamp is the only option and it's 453 * formatted as recommended in RFC 1323 appendix A, we 454 * quickly get the values now and not bother calling 455 * tcp_dooptions(), etc. 456 */ 457 if ((optlen == TCPOLEN_TSTAMP_APPA || 458 (optlen > TCPOLEN_TSTAMP_APPA && 459 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 460 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 461 (tcp_hdr->th_flags & TH_SYN) == 0) { 462 to.to_flags |= TOF_TS; 463 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4)); 464 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8)); 465 } else { 466 /* 467 * If TCP timestamps are not in use, or not the first option, 468 * skip LRO path since timestamps are used to avoid LRO 469 * from introducing additional latencies for retransmissions 470 * and other slow-paced transmissions. 471 */ 472 to.to_flags = to.to_tsecr = 0; 473 eject_flow = 1; 474 } 475 476 /* list all the conditions that can trigger a flow ejection here */ 477 478 thflags = tcp_hdr->th_flags; 479 if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) { 480 eject_flow = tcpflags = 1; 481 } 482 483 if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) && 484 (to.to_flags & TOF_TS))) { 485 eject_flow = unknown_tcpopts = 1; 486 } 487 488 if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */ 489 eject_flow = 1; 490 } 491 492 /* Can't coalesce ECN marked packets. */ 493 ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK; 494 if (ecn == IPTOS_ECN_CE) { 495 /* 496 * ECN needs quick notification 497 */ 498 if (lrodebug) { 499 printf("%s: ECE bits set.\n", __func__); 500 } 501 eject_flow = 1; 502 } 503 504 lck_mtx_lock_spin(&tcp_lro_lock); 505 506 retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id); 507 508 switch (retval) { 509 case TCP_LRO_NAN: 510 lck_mtx_unlock(&tcp_lro_lock); 511 ret_response = TCP_LRO_FLOW_NOTFOUND; 512 break; 513 514 case TCP_LRO_COALESCE: 515 if ((payload_len != 0) && (unknown_tcpopts == 0) && 516 (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) { 517 tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len, 518 drop_hdrlen, &to, 519 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL, 520 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL, 521 thflags); 522 if (lrodebug >= 2) { 523 printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n", 524 lro_flow_list[flow_id].lr_len, flow_id, 525 payload_len, drop_hdrlen, optlen, 526 ntohs(lro_flow_list[flow_id].lr_lport), 527 ntohl(tcp_hdr->th_seq)); 528 } 529 if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) { 530 eject_flow = 1; 531 } 532 coalesced = 1; 533 } 534 if (eject_flow) { 535 mb = tcp_lro_eject_coalesced_pkt(flow_id); 536 lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) + 537 payload_len; 538 lck_mtx_unlock(&tcp_lro_lock); 539 if (mb) { 540 lro_proto_input(mb); 541 } 542 if (!coalesced) { 543 if (lrodebug >= 2) { 544 printf("%s: pkt payload_len = %d \n", __func__, payload_len); 545 } 546 lro_proto_input(lro_mb); 547 } 548 } else { 549 lck_mtx_unlock(&tcp_lro_lock); 550 } 551 break; 552 553 case TCP_LRO_EJECT_FLOW: 554 mb = tcp_lro_eject_coalesced_pkt(flow_id); 555 lck_mtx_unlock(&tcp_lro_lock); 556 if (mb) { 557 if (lrodebug) 558 printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len); 559 lro_proto_input(mb); 560 } 561 562 lro_proto_input(lro_mb); 563 break; 564 565 case TCP_LRO_COLLISION: 566 lck_mtx_unlock(&tcp_lro_lock); 567 ret_response = TCP_LRO_FLOW_NOTFOUND; 568 break; 569 570 default: 571 lck_mtx_unlock(&tcp_lro_lock); 572 panic_plain("%s: unrecognized type %d", __func__, retval); 573 break; 574 } 575 576 if (ret_response == TCP_LRO_FLOW_NOTFOUND) { 577 lro_proto_input(lro_mb); 578 } 579 return NULL; 580} 581 582static void 583tcp_lro_timer_proc(void *arg1, void *arg2) 584{ 585#pragma unused(arg1, arg2) 586 587 lck_mtx_lock_spin(&tcp_lro_lock); 588 lro_timer_set = 0; 589 lck_mtx_unlock(&tcp_lro_lock); 590 tcp_lro_flush_flows(); 591} 592 593static void 594tcp_lro_flush_flows(void) 595{ 596 int i = 0; 597 struct mbuf *mb; 598 struct lro_flow *flow; 599 int active_flows = 0; 600 int outstanding_flows = 0; 601 int tcpclock_updated = 0; 602 603 lck_mtx_lock(&tcp_lro_lock); 604 605 while (i < TCP_LRO_NUM_FLOWS) { 606 flow = &lro_flow_list[i]; 607 if (flow->lr_mhead != NULL) { 608 active_flows++; 609 if (!tcpclock_updated) { 610 calculate_tcp_clock(); 611 tcpclock_updated = 1; 612 } 613 if (((tcp_now - flow->lr_timestamp) >= coalesc_time) || 614 (flow->lr_mhead->m_pkthdr.lro_npkts >= 615 coalesc_sz)) { 616 617 if (lrodebug >= 2) 618 printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n", 619 flow->lr_len, 620 flow->lr_mhead->m_pkthdr.lro_npkts, 621 flow->lr_timestamp, tcp_now); 622 623 mb = tcp_lro_eject_flow(i); 624 625 if (mb) { 626 lck_mtx_unlock(&tcp_lro_lock); 627 lro_update_flush_stats(mb); 628 lro_proto_input(mb); 629 lck_mtx_lock(&tcp_lro_lock); 630 } 631 632 } else { 633 tcp_lro_sched_timer(0); 634 outstanding_flows++; 635 if (lrodebug >= 2) { 636 printf("tcp_lro_flush_flows: did not flush flow of len =%d deadline = %x timestamp = %x \n", 637 flow->lr_len, tcp_now, flow->lr_timestamp); 638 } 639 } 640 } 641 if (flow->lr_flags & LRO_EJECT_REQ) { 642 mb = tcp_lro_eject_flow(i); 643 if (mb) { 644 lck_mtx_unlock(&tcp_lro_lock); 645 lro_proto_input(mb); 646 lro_eject_req++; 647 lck_mtx_lock(&tcp_lro_lock); 648 } 649 } 650 i++; 651 } 652 lck_mtx_unlock(&tcp_lro_lock); 653#if 0 654 if (lrocount == 900) { 655 printf("%s: %d %d %d %d oo: %d mismatch: %d ej_req: %d coll: %d \n", 656 __func__, 657 tcpstat.tcps_coalesced_pack, 658 tcpstat.tcps_lro_twopack, 659 tcpstat.tcps_lro_multpack, 660 tcpstat.tcps_lro_largepack, 661 lro_seq_outoforder, 662 lro_seq_mismatch, 663 lro_eject_req, 664 tcpstat.tcps_flowtbl_collision); 665 printf("%s: all: %d single: %d double: %d good: %d \n", 666 __func__, lro_flushes, lro_single_flushes, 667 lro_double_flushes, lro_good_flushes); 668 lrocount = 0; 669 } else { 670 lrocount++; 671 } 672 if ((lrodebug >= 2) && (active_flows > 1)) { 673 printf("lro_flush_flows: active_flows = %d \n", active_flows); 674 } 675#endif 676} 677 678/* 679 * Must be called with tcp_lro_lock held. 680 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time 681 * takes precedence, so lro_timer_set is not set for the hint case 682 */ 683static void 684tcp_lro_sched_timer(uint64_t hint) 685{ 686 if (lro_timer_set) { 687 return; 688 } 689 690 lro_timer_set = 1; 691 if (!hint) { 692 /* the intent is to wake up every coalesc_time msecs */ 693 clock_interval_to_deadline(coalesc_time, 694 (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline); 695 } else { 696 clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ, 697 &lro_deadline); 698 } 699 thread_call_enter_delayed(tcp_lro_timer, lro_deadline); 700} 701 702struct mbuf* 703tcp_lro(struct mbuf *m, unsigned int hlen) 704{ 705 struct ip *ip_hdr; 706 unsigned int tlen; 707 struct tcphdr * tcp_hdr = NULL; 708 unsigned int off = 0; 709 710 if (kipf_count != 0) 711 return m; 712 713 /* 714 * Experiments on cellular show that the RTT is much higher 715 * than the coalescing time of 5 msecs, causing lro to flush 716 * 80% of the time on a single packet. Increasing 717 * coalescing time for cellular does not show marked 718 * improvement to throughput either. Loopback perf is hurt 719 * by the 5 msec latency and it already sends large packets. 720 */ 721 if ((m->m_pkthdr.rcvif->if_type == IFT_CELLULAR) || 722 (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) { 723 return m; 724 } 725 726 ip_hdr = mtod(m, struct ip*); 727 728 /* only TCP is coalesced */ 729 if (ip_hdr->ip_p != IPPROTO_TCP) { 730 return m; 731 } 732 733 if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) { 734 if (lrodebug) printf("tcp_lro m_pullup \n"); 735 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { 736 tcpstat.tcps_rcvshort++; 737 if (lrodebug) { 738 printf("ip_lro: rcvshort.\n"); 739 } 740 return NULL; 741 } 742 } 743 744 tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen); 745 tlen = ip_hdr->ip_len ; //ignore IP header bytes len 746 m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */ 747 m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */ 748 off = tcp_hdr->th_off << 2; 749 if (off < sizeof (struct tcphdr) || off > tlen) { 750 tcpstat.tcps_rcvbadoff++; 751 if (lrodebug) { 752 printf("ip_lro: TCP off greater than TCP header.\n"); 753 } 754 return m; 755 } 756 757 return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off)); 758} 759 760static void 761lro_proto_input(struct mbuf *m) 762{ 763 struct ip* ip_hdr = mtod(m, struct ip*); 764 765 if (lrodebug >= 3) { 766 printf("lro_proto_input: ip_len = %d \n", 767 ip_hdr->ip_len); 768 } 769 lro_update_stats(m); 770 ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p); 771} 772 773static struct mbuf * 774lro_tcp_xsum_validate(struct mbuf *m, struct ipovly *ipov, struct tcphdr * th) 775{ 776 777 struct ip* ip = (struct ip*)ipov; 778 int tlen = ip->ip_len; 779 int len; 780 struct ifnet *ifp = ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) ? 781 m->m_pkthdr.rcvif: NULL; 782 783 /* Expect 32-bit aligned data pointer on strict-align platforms */ 784 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); 785 786 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 787 if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) { 788 u_short pseudo; 789 char b[9]; 790 791 bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1)); 792 bzero(ipov->ih_x1, sizeof (ipov->ih_x1)); 793 ipov->ih_len = (u_short)tlen; 794#if BYTE_ORDER != BIG_ENDIAN 795 HTONS(ipov->ih_len); 796#endif 797 pseudo = in_cksum(m, sizeof (struct ip)); 798 bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1)); 799 800 th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF)); 801 } else { 802 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 803 th->th_sum = m->m_pkthdr.csum_data; 804 else 805 th->th_sum = in_pseudo(ip->ip_src.s_addr, 806 ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + 807 ip->ip_len + IPPROTO_TCP)); 808 } 809 th->th_sum ^= 0xffff; 810 } else { 811 char b[9]; 812 /* 813 * Checksum extended TCP header and data. 814 */ 815 bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1)); 816 bzero(ipov->ih_x1, sizeof (ipov->ih_x1)); 817 ipov->ih_len = (u_short)tlen; 818#if BYTE_ORDER != BIG_ENDIAN 819 HTONS(ipov->ih_len); 820#endif 821 len = sizeof (struct ip) + tlen; 822 th->th_sum = in_cksum(m, len); 823 bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1)); 824 825 tcp_in_cksum_stats(len); 826 } 827 if (th->th_sum) { 828 tcpstat.tcps_rcvbadsum++; 829 if (ifp != NULL && ifp->if_tcp_stat != NULL) { 830 atomic_add_64(&ifp->if_tcp_stat->badformat, 1); 831 } 832 if (lrodebug) 833 printf("lro_tcp_xsum_validate: bad xsum and drop m = %p.\n",m); 834 m_freem(m); 835 return NULL; 836 } 837 /* revert back the order as IP will look into this again. */ 838#if BYTE_ORDER != BIG_ENDIAN 839 NTOHS(ipov->ih_len); 840#endif 841 return m; 842} 843 844/* 845 * When TCP detects a stable, steady flow without out of ordering, 846 * with a sufficiently high cwnd, it invokes LRO. 847 */ 848int 849tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen) 850{ 851 int hash; 852 int flow_id; 853 struct mbuf *eject_mb; 854 struct lro_flow *lf; 855 856 hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, 857 tcp_hdr->th_sport, tcp_hdr->th_dport, 858 (TCP_LRO_FLOW_MAP - 1)); 859 860 861 lck_mtx_lock_spin(&tcp_lro_lock); 862 flow_id = lro_flow_map[hash]; 863 if (flow_id != TCP_LRO_FLOW_NOTFOUND) { 864 lf = &lro_flow_list[flow_id]; 865 if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) && 866 (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) && 867 (lf->lr_fport == tcp_hdr->th_sport) && 868 (lf->lr_lport == tcp_hdr->th_dport)) { 869 if ((lf->lr_tcphdr == NULL) && 870 (lf->lr_seq != (tcp_hdr->th_seq + tlen))) { 871 lf->lr_seq = tcp_hdr->th_seq + tlen; 872 } 873 lf->lr_flags &= ~LRO_EJECT_REQ; 874 } 875 lck_mtx_unlock(&tcp_lro_lock); 876 return 0; 877 } 878 879 HTONL(tcp_hdr->th_seq); 880 HTONL(tcp_hdr->th_ack); 881 eject_mb = 882 tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash, 883 NULL, NULL, NULL); 884 885 lck_mtx_unlock(&tcp_lro_lock); 886 887 NTOHL(tcp_hdr->th_seq); 888 NTOHL(tcp_hdr->th_ack); 889 if (lrodebug >= 3) { 890 printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n", 891 __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, 892 tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq); 893 } 894 ASSERT(eject_mb == NULL); 895 return 0; 896} 897 898/* 899 * When TCP detects loss or idle condition, it stops offloading 900 * to LRO. 901 */ 902int 903tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr, 904 unsigned short sport, unsigned short dport) 905{ 906 int hash, flow_id; 907 struct lro_flow *lf; 908 909 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport, 910 (TCP_LRO_FLOW_MAP - 1)); 911 lck_mtx_lock_spin(&tcp_lro_lock); 912 flow_id = lro_flow_map[hash]; 913 if (flow_id == TCP_LRO_FLOW_UNINIT) { 914 lck_mtx_unlock(&tcp_lro_lock); 915 return 0; 916 } 917 lf = &lro_flow_list[flow_id]; 918 if ((lf->lr_faddr.s_addr == daddr.s_addr) && 919 (lf->lr_laddr.s_addr == saddr.s_addr) && 920 (lf->lr_fport == dport) && 921 (lf->lr_lport == sport)) { 922 if (lrodebug) { 923 printf("%s: %x %x\n", __func__, 924 lf->lr_flags, lf->lr_seq); 925 } 926 lf->lr_flags |= LRO_EJECT_REQ; 927 } 928 lck_mtx_unlock(&tcp_lro_lock); 929 return 0; 930} 931 932void 933tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr, 934 unsigned short sport, unsigned short dport) 935{ 936 int hash, flow_id; 937 struct lro_flow *lf; 938 939 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport, 940 (TCP_LRO_FLOW_MAP - 1)); 941 lck_mtx_lock_spin(&tcp_lro_lock); 942 flow_id = lro_flow_map[hash]; 943 if (flow_id == TCP_LRO_FLOW_UNINIT) { 944 lck_mtx_unlock(&tcp_lro_lock); 945 return; 946 } 947 lf = &lro_flow_list[flow_id]; 948 if ((lf->lr_faddr.s_addr == daddr.s_addr) && 949 (lf->lr_laddr.s_addr == saddr.s_addr) && 950 (lf->lr_fport == dport) && 951 (lf->lr_lport == sport) && 952 (lf->lr_tcphdr == NULL)) { 953 lf->lr_seq = (tcp_seq)rcv_nxt; 954 } 955 lck_mtx_unlock(&tcp_lro_lock); 956 return; 957} 958 959static void 960lro_update_stats(struct mbuf *m) 961{ 962 switch(m->m_pkthdr.lro_npkts) { 963 case 0: /* fall through */ 964 case 1: 965 break; 966 967 case 2: 968 tcpstat.tcps_lro_twopack++; 969 break; 970 971 case 3: /* fall through */ 972 case 4: 973 tcpstat.tcps_lro_multpack++; 974 break; 975 976 default: 977 tcpstat.tcps_lro_largepack++; 978 break; 979 } 980 return; 981} 982 983static void 984lro_update_flush_stats(struct mbuf *m) 985{ 986 lro_flushes++; 987 switch(m->m_pkthdr.lro_npkts) { 988 case 0: ASSERT(0); 989 case 1: lro_single_flushes++; 990 break; 991 case 2: lro_double_flushes++; 992 break; 993 default: lro_good_flushes++; 994 break; 995 } 996 return; 997} 998