1/* 2 * Copyright (c) 2011-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/param.h> 30#include <sys/systm.h> 31#include <sys/sysctl.h> 32#include <sys/mbuf.h> 33#include <sys/mcache.h> 34#include <sys/socket.h> 35#include <sys/socketvar.h> 36#include <net/if_types.h> 37#include <net/route.h> 38#include <netinet/in.h> 39#include <netinet/in_systm.h> 40#include <net/if.h> 41#include <net/dlil.h> 42#include <netinet/ip.h> 43#include <netinet/ip_var.h> 44#include <netinet/in_var.h> 45#include <netinet/tcp.h> 46#include <netinet/tcp_seq.h> 47#include <netinet/tcpip.h> 48#include <netinet/tcp_var.h> 49#include <netinet/tcp_lro.h> 50#include <netinet/lro_ext.h> 51#include <kern/locks.h> 52 53unsigned int lrocount = 0; /* A counter used for debugging only */ 54unsigned int lro_seq_outoforder = 0; /* Counter for debugging */ 55unsigned int lro_seq_mismatch = 0; /* Counter for debugging */ 56unsigned int lro_flushes = 0; /* Counter for tracking number of flushes */ 57unsigned int lro_single_flushes = 0; 58unsigned int lro_double_flushes = 0; 59unsigned int lro_good_flushes = 0; 60 61unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS; 62SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED, 63 &coalesc_sz, 0, "Max coalescing size"); 64 65unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER; 66SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED, 67 &coalesc_time, 0, "Max coalescing time"); 68 69struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS]; 70 71char lro_flow_map[TCP_LRO_FLOW_MAP]; 72 73static lck_attr_t *tcp_lro_mtx_attr = NULL; /* mutex attributes */ 74static lck_grp_t *tcp_lro_mtx_grp = NULL; /* mutex group */ 75static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL; /* mutex group attrs */ 76decl_lck_mtx_data( ,tcp_lro_lock); /* Used to synchronize updates */ 77 78unsigned int lro_byte_count = 0; 79 80uint64_t lro_deadline = 0; /* LRO's sense of time - protected by tcp_lro_lock */ 81uint32_t lro_timer_set = 0; 82 83/* Some LRO stats */ 84u_int32_t lro_pkt_count = 0; /* Number of packets encountered in an LRO period */ 85thread_call_t tcp_lro_timer; 86 87extern u_int32_t kipf_count; 88 89static void tcp_lro_timer_proc(void*, void*); 90static void lro_update_stats(struct mbuf*); 91static void lro_update_flush_stats(struct mbuf *); 92static void tcp_lro_flush_flows(void); 93static void tcp_lro_sched_timer(uint64_t); 94static void lro_proto_input(struct mbuf *); 95 96static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ip *, 97 struct tcphdr*); 98static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*, 99 int); 100 101void 102tcp_lro_init(void) 103{ 104 int i; 105 106 bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS); 107 for (i = 0; i < TCP_LRO_FLOW_MAP; i++) { 108 lro_flow_map[i] = TCP_LRO_FLOW_UNINIT; 109 } 110 111 /* 112 * allocate lock group attribute, group and attribute for tcp_lro_lock 113 */ 114 tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init(); 115 tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr); 116 tcp_lro_mtx_attr = lck_attr_alloc_init(); 117 lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr); 118 119 tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL); 120 if (tcp_lro_timer == NULL) { 121 panic_plain("%s: unable to allocate lro timer", __func__); 122 } 123 124 return; 125} 126 127static int 128tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash, 129 int *flow_id ) 130{ 131 struct lro_flow *flow; 132 tcp_seq seqnum; 133 unsigned int off = 0; 134 int payload_len = 0; 135 136 *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, 137 tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1)); 138 139 *flow_id = lro_flow_map[*hash]; 140 if (*flow_id == TCP_LRO_FLOW_NOTFOUND) { 141 return TCP_LRO_NAN; 142 } 143 144 seqnum = tcp_hdr->th_seq; 145 off = tcp_hdr->th_off << 2; 146 payload_len = ip_hdr->ip_len - off; 147 148 flow = &lro_flow_list[*flow_id]; 149 150 if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) && 151 (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) && 152 (flow->lr_fport == tcp_hdr->th_sport) && 153 (flow->lr_lport == tcp_hdr->th_dport)) { 154 if (flow->lr_tcphdr == NULL) { 155 if (ntohl(seqnum) == flow->lr_seq) { 156 return TCP_LRO_COALESCE; 157 } 158 if (lrodebug >= 4) { 159 printf("%s: seqnum = %x, lr_seq = %x\n", 160 __func__, ntohl(seqnum), flow->lr_seq); 161 } 162 lro_seq_mismatch++; 163 if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) { 164 lro_seq_outoforder++; 165 /* 166 * Whenever we receive out of order packets it 167 * signals loss and recovery and LRO doesn't 168 * let flows recover quickly. So eject. 169 */ 170 flow->lr_flags |= LRO_EJECT_REQ; 171 172 } 173 return TCP_LRO_NAN; 174 } 175 176 if (flow->lr_flags & LRO_EJECT_REQ) { 177 if (lrodebug) 178 printf("%s: eject. \n", __func__); 179 return TCP_LRO_EJECT_FLOW; 180 } 181 if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) { 182 if (lrodebug) { 183 printf("%s: th_ack = %x flow_ack = %x \n", 184 __func__, tcp_hdr->th_ack, 185 flow->lr_tcphdr->th_ack); 186 } 187 return TCP_LRO_EJECT_FLOW; 188 } 189 190 if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) { 191 return TCP_LRO_COALESCE; 192 } else { 193 /* LRO does not handle loss recovery well, eject */ 194 flow->lr_flags |= LRO_EJECT_REQ; 195 return TCP_LRO_EJECT_FLOW; 196 } 197 } 198 if (lrodebug) printf("tcp_lro_matching_tuple: collision \n"); 199 return TCP_LRO_COLLISION; 200} 201 202static void 203tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr, 204 int hash, u_int32_t timestamp, int payload_len) 205{ 206 struct lro_flow *flow = NULL; 207 208 flow = &lro_flow_list[flow_id]; 209 210 flow->lr_hash_map = hash; 211 flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr; 212 flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr; 213 flow->lr_fport = tcp_hdr->th_sport; 214 flow->lr_lport = tcp_hdr->th_dport; 215 lro_flow_map[hash] = flow_id; 216 flow->lr_timestamp = timestamp; 217 flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len; 218 flow->lr_flags = 0; 219 return; 220} 221 222static void 223tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr, 224 int payload_len, int drop_hdrlen, struct tcpopt *topt, 225 u_int32_t* tsval, u_int32_t* tsecr, int thflags) 226{ 227 struct lro_flow *flow = NULL; 228 struct mbuf *last; 229 struct ip *ip = NULL; 230 231 flow = &lro_flow_list[flow_id]; 232 if (flow->lr_mhead) { 233 if (lrodebug) 234 printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq, 235 payload_len); 236 m_adj(lro_mb, drop_hdrlen); 237 238 last = flow->lr_mtail; 239 while (last->m_next != NULL) { 240 last = last->m_next; 241 } 242 last->m_next = lro_mb; 243 244 flow->lr_mtail = lro_mb; 245 246 ip = mtod(flow->lr_mhead, struct ip *); 247 ip->ip_len += lro_mb->m_pkthdr.len; 248 flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len; 249 250 if (flow->lr_len == 0) { 251 panic_plain("%s: Inconsistent LRO flow state", __func__); 252 } 253 flow->lr_len += payload_len; 254 flow->lr_seq += payload_len; 255 /* 256 * This bit is re-OR'd each time a packet is added to the 257 * large coalesced packet. 258 */ 259 flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT; 260 flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */ 261 if (flow->lr_mhead->m_pkthdr.lro_pktlen < 262 lro_mb->m_pkthdr.lro_pktlen) { 263 /* 264 * For TCP Inter Arrival Jitter calculation, return max 265 * size encountered while coalescing a stream of pkts. 266 */ 267 flow->lr_mhead->m_pkthdr.lro_pktlen = 268 lro_mb->m_pkthdr.lro_pktlen; 269 } 270 /* Update the timestamp value */ 271 if (topt->to_flags & TOF_TS) { 272 if ((flow->lr_tsval) && 273 (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) { 274 *(flow->lr_tsval) = htonl(topt->to_tsval); 275 } 276 if ((flow->lr_tsecr) && 277 (topt->to_tsecr != 0) && 278 (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) { 279 if (lrodebug >= 2) { 280 printf("%s: instantaneous RTT = %d \n", __func__, 281 topt->to_tsecr - ntohl(*(flow->lr_tsecr))); 282 } 283 *(flow->lr_tsecr) = htonl(topt->to_tsecr); 284 } 285 } 286 /* Coalesce the flags */ 287 if (thflags) { 288 flow->lr_tcphdr->th_flags |= thflags; 289 } 290 /* Update receive window */ 291 flow->lr_tcphdr->th_win = tcphdr->th_win; 292 } else { 293 if (lro_mb) { 294 flow->lr_mhead = flow->lr_mtail = lro_mb; 295 flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT; 296 flow->lr_tcphdr = tcphdr; 297 if ((topt) && (topt->to_flags & TOF_TS)) { 298 ASSERT(tsval != NULL); 299 ASSERT(tsecr != NULL); 300 flow->lr_tsval = tsval; 301 flow->lr_tsecr = tsecr; 302 } 303 flow->lr_len = payload_len; 304 calculate_tcp_clock(); 305 flow->lr_timestamp = tcp_now; 306 tcp_lro_sched_timer(0); 307 } 308 flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len; 309 } 310 if (lro_mb) { 311 tcpstat.tcps_coalesced_pack++; 312 } 313 return; 314} 315 316static struct mbuf * 317tcp_lro_eject_flow(int flow_id) 318{ 319 struct mbuf *mb = NULL; 320 321 mb = lro_flow_list[flow_id].lr_mhead; 322 ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id); 323 lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT; 324 bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow)); 325 326 return mb; 327} 328 329static struct mbuf* 330tcp_lro_eject_coalesced_pkt(int flow_id) 331{ 332 struct mbuf *mb = NULL; 333 mb = lro_flow_list[flow_id].lr_mhead; 334 lro_flow_list[flow_id].lr_mhead = 335 lro_flow_list[flow_id].lr_mtail = NULL; 336 lro_flow_list[flow_id].lr_tcphdr = NULL; 337 return mb; 338} 339 340static struct mbuf* 341tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr, 342 struct tcphdr *tcp_hdr, int payload_len, 343 int drop_hdrlen, int hash, struct tcpopt *topt, 344 u_int32_t *tsval, u_int32_t *tsecr) 345{ 346 int i; 347 int slot_available = 0; 348 int candidate_flow = 0; 349 u_int32_t oldest_timestamp; 350 struct mbuf *mb = NULL; 351 int collision = 0; 352 353 oldest_timestamp = tcp_now; 354 355 /* handle collision */ 356 if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) { 357 if (lrodebug) { 358 collision = 1; 359 } 360 candidate_flow = lro_flow_map[hash]; 361 tcpstat.tcps_flowtbl_collision++; 362 goto kick_flow; 363 } 364 365 for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) { 366 if (lro_flow_list[i].lr_mhead == NULL) { 367 candidate_flow = i; 368 slot_available = 1; 369 break; 370 } 371 if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) { 372 candidate_flow = i; 373 oldest_timestamp = lro_flow_list[i].lr_timestamp; 374 } 375 } 376 377 if (!slot_available) { 378 tcpstat.tcps_flowtbl_full++; 379kick_flow: 380 /* kick the oldest flow */ 381 mb = tcp_lro_eject_flow(candidate_flow); 382 383 if (lrodebug) { 384 if (!slot_available) { 385 printf("%s: slot unavailable.\n",__func__); 386 } 387 if (collision) { 388 printf("%s: collision.\n",__func__); 389 } 390 } 391 } else { 392 candidate_flow = i; /* this is now the flow to be used */ 393 394 } 395 396 tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash, 397 tcp_now, payload_len); 398 tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len, 399 drop_hdrlen, topt, tsval, tsecr, 0); 400 return mb; 401} 402 403struct mbuf* 404tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr, 405 struct tcphdr *tcp_hdr, int drop_hdrlen) 406{ 407 int flow_id = TCP_LRO_FLOW_UNINIT; 408 int hash; 409 unsigned int off = 0; 410 int eject_flow = 0; 411 int optlen; 412 int retval = 0; 413 struct mbuf *mb = NULL; 414 int payload_len = 0; 415 u_char *optp = NULL; 416 int thflags = 0; 417 struct tcpopt to; 418 int ret_response = TCP_LRO_CONSUMED; 419 int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0; 420 u_int8_t ecn; 421 422 if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) { 423 if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) { 424 tcpstat.tcps_rcvshort++; 425 m_freem(lro_mb); 426 if (lrodebug) { 427 printf("tcp_lro_process_pkt:mbuf too short.\n"); 428 } 429 return NULL; 430 } 431 } 432 433 /* Just in case */ 434 lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM; 435 436 if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) { 437 if (lrodebug) { 438 printf("tcp_lro_process_pkt: TCP xsum failed.\n"); 439 } 440 return NULL; 441 } 442 443 /* Update stats */ 444 lro_pkt_count++; 445 446 /* Avoids checksumming in tcp_input */ 447 lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM; 448 449 off = tcp_hdr->th_off << 2; 450 optlen = off - sizeof (struct tcphdr); 451 payload_len = ip_hdr->ip_len - off; 452 optp = (u_char *)(tcp_hdr + 1); 453 /* 454 * Do quick retrieval of timestamp options ("options 455 * prediction?"). If timestamp is the only option and it's 456 * formatted as recommended in RFC 1323 appendix A, we 457 * quickly get the values now and not bother calling 458 * tcp_dooptions(), etc. 459 */ 460 if ((optlen == TCPOLEN_TSTAMP_APPA || 461 (optlen > TCPOLEN_TSTAMP_APPA && 462 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 463 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 464 (tcp_hdr->th_flags & TH_SYN) == 0) { 465 to.to_flags |= TOF_TS; 466 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4)); 467 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8)); 468 } else { 469 /* 470 * If TCP timestamps are not in use, or not the first option, 471 * skip LRO path since timestamps are used to avoid LRO 472 * from introducing additional latencies for retransmissions 473 * and other slow-paced transmissions. 474 */ 475 to.to_flags = to.to_tsecr = 0; 476 eject_flow = 1; 477 } 478 479 /* list all the conditions that can trigger a flow ejection here */ 480 481 thflags = tcp_hdr->th_flags; 482 if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) { 483 eject_flow = tcpflags = 1; 484 } 485 486 if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) && 487 (to.to_flags & TOF_TS))) { 488 eject_flow = unknown_tcpopts = 1; 489 } 490 491 if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */ 492 eject_flow = 1; 493 } 494 495 /* Can't coalesce ECN marked packets. */ 496 ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK; 497 if (ecn == IPTOS_ECN_CE) { 498 /* 499 * ECN needs quick notification 500 */ 501 if (lrodebug) { 502 printf("%s: ECE bits set.\n", __func__); 503 } 504 eject_flow = 1; 505 } 506 507 lck_mtx_lock_spin(&tcp_lro_lock); 508 509 retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id); 510 511 switch (retval) { 512 case TCP_LRO_NAN: 513 lck_mtx_unlock(&tcp_lro_lock); 514 ret_response = TCP_LRO_FLOW_NOTFOUND; 515 break; 516 517 case TCP_LRO_COALESCE: 518 if ((payload_len != 0) && (unknown_tcpopts == 0) && 519 (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) { 520 tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len, 521 drop_hdrlen, &to, 522 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL, 523 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL, 524 thflags); 525 if (lrodebug >= 2) { 526 printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n", 527 lro_flow_list[flow_id].lr_len, flow_id, 528 payload_len, drop_hdrlen, optlen, 529 ntohs(lro_flow_list[flow_id].lr_lport), 530 ntohl(tcp_hdr->th_seq)); 531 } 532 if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) { 533 eject_flow = 1; 534 } 535 coalesced = 1; 536 } 537 if (eject_flow) { 538 mb = tcp_lro_eject_coalesced_pkt(flow_id); 539 lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) + 540 payload_len; 541 calculate_tcp_clock(); 542 u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp; 543 lck_mtx_unlock(&tcp_lro_lock); 544 if (mb) { 545 mb->m_pkthdr.lro_elapsed = timestamp; 546 lro_proto_input(mb); 547 } 548 if (!coalesced) { 549 if (lrodebug >= 2) { 550 printf("%s: pkt payload_len = %d \n", __func__, payload_len); 551 } 552 lro_proto_input(lro_mb); 553 } 554 } else { 555 lck_mtx_unlock(&tcp_lro_lock); 556 } 557 break; 558 559 case TCP_LRO_EJECT_FLOW: 560 mb = tcp_lro_eject_coalesced_pkt(flow_id); 561 calculate_tcp_clock(); 562 u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp; 563 lck_mtx_unlock(&tcp_lro_lock); 564 if (mb) { 565 if (lrodebug) 566 printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len); 567 mb->m_pkthdr.lro_elapsed = timestamp; 568 lro_proto_input(mb); 569 } 570 571 lro_proto_input(lro_mb); 572 break; 573 574 case TCP_LRO_COLLISION: 575 lck_mtx_unlock(&tcp_lro_lock); 576 ret_response = TCP_LRO_FLOW_NOTFOUND; 577 break; 578 579 default: 580 lck_mtx_unlock(&tcp_lro_lock); 581 panic_plain("%s: unrecognized type %d", __func__, retval); 582 break; 583 } 584 585 if (ret_response == TCP_LRO_FLOW_NOTFOUND) { 586 lro_proto_input(lro_mb); 587 } 588 return NULL; 589} 590 591static void 592tcp_lro_timer_proc(void *arg1, void *arg2) 593{ 594#pragma unused(arg1, arg2) 595 596 lck_mtx_lock_spin(&tcp_lro_lock); 597 lro_timer_set = 0; 598 lck_mtx_unlock(&tcp_lro_lock); 599 tcp_lro_flush_flows(); 600} 601 602static void 603tcp_lro_flush_flows(void) 604{ 605 int i = 0; 606 struct mbuf *mb; 607 struct lro_flow *flow; 608 int tcpclock_updated = 0; 609 610 lck_mtx_lock(&tcp_lro_lock); 611 612 while (i < TCP_LRO_NUM_FLOWS) { 613 flow = &lro_flow_list[i]; 614 if (flow->lr_mhead != NULL) { 615 616 if (!tcpclock_updated) { 617 calculate_tcp_clock(); 618 tcpclock_updated = 1; 619 } 620 621 if (lrodebug >= 2) 622 printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n", 623 flow->lr_len, 624 flow->lr_mhead->m_pkthdr.lro_npkts, 625 flow->lr_timestamp, tcp_now); 626 627 u_int8_t timestamp = tcp_now - flow->lr_timestamp; 628 629 mb = tcp_lro_eject_flow(i); 630 631 if (mb) { 632 mb->m_pkthdr.lro_elapsed = timestamp; 633 lck_mtx_unlock(&tcp_lro_lock); 634 lro_update_flush_stats(mb); 635 lro_proto_input(mb); 636 lck_mtx_lock(&tcp_lro_lock); 637 } 638 } 639 i++; 640 } 641 lck_mtx_unlock(&tcp_lro_lock); 642} 643 644/* 645 * Must be called with tcp_lro_lock held. 646 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time 647 * takes precedence, so lro_timer_set is not set for the hint case 648 */ 649static void 650tcp_lro_sched_timer(uint64_t hint) 651{ 652 if (lro_timer_set) { 653 return; 654 } 655 656 lro_timer_set = 1; 657 if (!hint) { 658 /* the intent is to wake up every coalesc_time msecs */ 659 clock_interval_to_deadline(coalesc_time, 660 (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline); 661 } else { 662 clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ, 663 &lro_deadline); 664 } 665 thread_call_enter_delayed(tcp_lro_timer, lro_deadline); 666} 667 668struct mbuf* 669tcp_lro(struct mbuf *m, unsigned int hlen) 670{ 671 struct ip *ip_hdr; 672 unsigned int tlen; 673 struct tcphdr * tcp_hdr = NULL; 674 unsigned int off = 0; 675 676 if (kipf_count != 0) 677 return m; 678 679 /* 680 * Experiments on cellular show that the RTT is much higher 681 * than the coalescing time of 5 msecs, causing lro to flush 682 * 80% of the time on a single packet. Increasing 683 * coalescing time for cellular does not show marked 684 * improvement to throughput either. Loopback perf is hurt 685 * by the 5 msec latency and it already sends large packets. 686 */ 687 if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) || 688 (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) { 689 return m; 690 } 691 692 ip_hdr = mtod(m, struct ip*); 693 694 /* don't deal with IP options */ 695 if (hlen > sizeof (struct ip)) 696 return (m); 697 698 /* only TCP is coalesced */ 699 if (ip_hdr->ip_p != IPPROTO_TCP) { 700 return m; 701 } 702 703 if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) { 704 if (lrodebug) printf("tcp_lro m_pullup \n"); 705 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { 706 tcpstat.tcps_rcvshort++; 707 if (lrodebug) { 708 printf("ip_lro: rcvshort.\n"); 709 } 710 return NULL; 711 } 712 } 713 714 tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen); 715 tlen = ip_hdr->ip_len ; //ignore IP header bytes len 716 m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */ 717 m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */ 718 m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */ 719 off = tcp_hdr->th_off << 2; 720 if (off < sizeof (struct tcphdr) || off > tlen) { 721 tcpstat.tcps_rcvbadoff++; 722 if (lrodebug) { 723 printf("ip_lro: TCP off greater than TCP header.\n"); 724 } 725 return m; 726 } 727 728 return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off)); 729} 730 731static void 732lro_proto_input(struct mbuf *m) 733{ 734 struct ip* ip_hdr = mtod(m, struct ip*); 735 736 if (lrodebug >= 3) { 737 printf("lro_proto_input: ip_len = %d \n", 738 ip_hdr->ip_len); 739 } 740 lro_update_stats(m); 741 ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p); 742} 743 744static struct mbuf * 745lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr * th) 746{ 747 /* Expect 32-bit aligned data pointer on strict-align platforms */ 748 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); 749 750 /* we shouldn't get here for IP with options; hence sizeof (ip) */ 751 if (tcp_input_checksum(AF_INET, m, th, sizeof (*ip), ip->ip_len)) { 752 if (lrodebug) 753 printf("%s: bad xsum and drop m = 0x%llx.\n", __func__, 754 (uint64_t)VM_KERNEL_ADDRPERM(m)); 755 m_freem(m); 756 return (NULL); 757 } 758 759 return (m); 760} 761 762/* 763 * When TCP detects a stable, steady flow without out of ordering, 764 * with a sufficiently high cwnd, it invokes LRO. 765 */ 766int 767tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen) 768{ 769 int hash; 770 int flow_id; 771 struct mbuf *eject_mb; 772 struct lro_flow *lf; 773 774 hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, 775 tcp_hdr->th_sport, tcp_hdr->th_dport, 776 (TCP_LRO_FLOW_MAP - 1)); 777 778 779 lck_mtx_lock_spin(&tcp_lro_lock); 780 flow_id = lro_flow_map[hash]; 781 if (flow_id != TCP_LRO_FLOW_NOTFOUND) { 782 lf = &lro_flow_list[flow_id]; 783 if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) && 784 (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) && 785 (lf->lr_fport == tcp_hdr->th_sport) && 786 (lf->lr_lport == tcp_hdr->th_dport)) { 787 if ((lf->lr_tcphdr == NULL) && 788 (lf->lr_seq != (tcp_hdr->th_seq + tlen))) { 789 lf->lr_seq = tcp_hdr->th_seq + tlen; 790 } 791 lf->lr_flags &= ~LRO_EJECT_REQ; 792 } 793 lck_mtx_unlock(&tcp_lro_lock); 794 return 0; 795 } 796 797 HTONL(tcp_hdr->th_seq); 798 HTONL(tcp_hdr->th_ack); 799 eject_mb = 800 tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash, 801 NULL, NULL, NULL); 802 803 lck_mtx_unlock(&tcp_lro_lock); 804 805 NTOHL(tcp_hdr->th_seq); 806 NTOHL(tcp_hdr->th_ack); 807 if (lrodebug >= 3) { 808 printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n", 809 __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, 810 tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq); 811 } 812 ASSERT(eject_mb == NULL); 813 return 0; 814} 815 816/* 817 * When TCP detects loss or idle condition, it stops offloading 818 * to LRO. 819 */ 820int 821tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr, 822 unsigned short sport, unsigned short dport) 823{ 824 int hash, flow_id; 825 struct lro_flow *lf; 826 827 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport, 828 (TCP_LRO_FLOW_MAP - 1)); 829 lck_mtx_lock_spin(&tcp_lro_lock); 830 flow_id = lro_flow_map[hash]; 831 if (flow_id == TCP_LRO_FLOW_UNINIT) { 832 lck_mtx_unlock(&tcp_lro_lock); 833 return 0; 834 } 835 lf = &lro_flow_list[flow_id]; 836 if ((lf->lr_faddr.s_addr == daddr.s_addr) && 837 (lf->lr_laddr.s_addr == saddr.s_addr) && 838 (lf->lr_fport == dport) && 839 (lf->lr_lport == sport)) { 840 if (lrodebug) { 841 printf("%s: %x %x\n", __func__, 842 lf->lr_flags, lf->lr_seq); 843 } 844 lf->lr_flags |= LRO_EJECT_REQ; 845 } 846 lck_mtx_unlock(&tcp_lro_lock); 847 return 0; 848} 849 850void 851tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr, 852 unsigned short sport, unsigned short dport) 853{ 854 int hash, flow_id; 855 struct lro_flow *lf; 856 857 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport, 858 (TCP_LRO_FLOW_MAP - 1)); 859 lck_mtx_lock_spin(&tcp_lro_lock); 860 flow_id = lro_flow_map[hash]; 861 if (flow_id == TCP_LRO_FLOW_UNINIT) { 862 lck_mtx_unlock(&tcp_lro_lock); 863 return; 864 } 865 lf = &lro_flow_list[flow_id]; 866 if ((lf->lr_faddr.s_addr == daddr.s_addr) && 867 (lf->lr_laddr.s_addr == saddr.s_addr) && 868 (lf->lr_fport == dport) && 869 (lf->lr_lport == sport) && 870 (lf->lr_tcphdr == NULL)) { 871 lf->lr_seq = (tcp_seq)rcv_nxt; 872 } 873 lck_mtx_unlock(&tcp_lro_lock); 874 return; 875} 876 877static void 878lro_update_stats(struct mbuf *m) 879{ 880 switch(m->m_pkthdr.lro_npkts) { 881 case 0: /* fall through */ 882 case 1: 883 break; 884 885 case 2: 886 tcpstat.tcps_lro_twopack++; 887 break; 888 889 case 3: /* fall through */ 890 case 4: 891 tcpstat.tcps_lro_multpack++; 892 break; 893 894 default: 895 tcpstat.tcps_lro_largepack++; 896 break; 897 } 898 return; 899} 900 901static void 902lro_update_flush_stats(struct mbuf *m) 903{ 904 lro_flushes++; 905 switch(m->m_pkthdr.lro_npkts) { 906 case 0: ASSERT(0); 907 case 1: lro_single_flushes++; 908 break; 909 case 2: lro_double_flushes++; 910 break; 911 default: lro_good_flushes++; 912 break; 913 } 914 return; 915} 916