1235474Sbz/*- 2235474Sbz * Copyright (c) 2007, Myricom Inc. 3235474Sbz * Copyright (c) 2008, Intel Corporation. 4235944Sbz * Copyright (c) 2012 The FreeBSD Foundation 5294327Shselasky * Copyright (c) 2016 Mellanox Technologies. 6235474Sbz * All rights reserved. 7235474Sbz * 8235944Sbz * Portions of this software were developed by Bjoern Zeeb 9235944Sbz * under sponsorship from the FreeBSD Foundation. 10235944Sbz * 11235474Sbz * Redistribution and use in source and binary forms, with or without 12235474Sbz * modification, are permitted provided that the following conditions 13235474Sbz * are met: 14235474Sbz * 1. Redistributions of source code must retain the above copyright 15235474Sbz * notice, this list of conditions and the following disclaimer. 16235474Sbz * 2. Redistributions in binary form must reproduce the above copyright 17235474Sbz * notice, this list of conditions and the following disclaimer in the 18235474Sbz * documentation and/or other materials provided with the distribution. 19235474Sbz * 20235474Sbz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21235474Sbz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22235474Sbz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23235474Sbz * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24235474Sbz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25235474Sbz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26235474Sbz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27235474Sbz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28235474Sbz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29235474Sbz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30235474Sbz * SUCH DAMAGE. 31235474Sbz */ 32179737Sjfv 33235944Sbz#include <sys/cdefs.h> 34235944Sbz__FBSDID("$FreeBSD: releng/11.0/sys/netinet/tcp_lro.c 301249 2016-06-03 08:35:07Z hselasky $"); 35235944Sbz 36235944Sbz#include "opt_inet.h" 37235944Sbz#include "opt_inet6.h" 38235944Sbz 39179737Sjfv#include <sys/param.h> 40179737Sjfv#include <sys/systm.h> 41295126Sglebius#include <sys/kernel.h> 42295126Sglebius#include <sys/malloc.h> 43179737Sjfv#include <sys/mbuf.h> 44179737Sjfv#include <sys/socket.h> 45179737Sjfv 46179737Sjfv#include <net/if.h> 47235944Sbz#include <net/if_var.h> 48179737Sjfv#include <net/ethernet.h> 49236394Sbz#include <net/vnet.h> 50179737Sjfv 51179737Sjfv#include <netinet/in_systm.h> 52179737Sjfv#include <netinet/in.h> 53235944Sbz#include <netinet/ip6.h> 54179737Sjfv#include <netinet/ip.h> 55235981Sbz#include <netinet/ip_var.h> 56179737Sjfv#include <netinet/tcp.h> 57179737Sjfv#include <netinet/tcp_lro.h> 58179737Sjfv 59235981Sbz#include <netinet6/ip6_var.h> 60235981Sbz 61179737Sjfv#include <machine/in_cksum.h> 62179737Sjfv 63294327Shselaskystatic MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); 64179737Sjfv 65235944Sbz#define TCP_LRO_UPDATE_CSUM 1 66235944Sbz#ifndef TCP_LRO_UPDATE_CSUM 67235944Sbz#define TCP_LRO_INVALID_CSUM 0x0000 68235944Sbz#endif 69179737Sjfv 70297482Ssephestatic void tcp_lro_rx_done(struct lro_ctrl *lc); 71297482Ssephe 72298974Ssephestatic __inline void 73298974Ssephetcp_lro_active_insert(struct lro_ctrl *lc, struct lro_entry *le) 74298974Ssephe{ 75298974Ssephe 76298974Ssephe LIST_INSERT_HEAD(&lc->lro_active, le, next); 77298974Ssephe} 78298974Ssephe 79298974Ssephestatic __inline void 80298974Ssephetcp_lro_active_remove(struct lro_entry *le) 81298974Ssephe{ 82298974Ssephe 83298974Ssephe LIST_REMOVE(le, next); 84298974Ssephe} 85298974Ssephe 86179737Sjfvint 87235944Sbztcp_lro_init(struct lro_ctrl *lc) 88179737Sjfv{ 89294327Shselasky return (tcp_lro_init_args(lc, NULL, TCP_LRO_ENTRIES, 0)); 90294327Shselasky} 91294327Shselasky 92294327Shselaskyint 93294327Shselaskytcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, 94294327Shselasky unsigned lro_entries, unsigned lro_mbufs) 95294327Shselasky{ 96235944Sbz struct lro_entry *le; 97294327Shselasky size_t size; 98294327Shselasky unsigned i; 99179737Sjfv 100235944Sbz lc->lro_bad_csum = 0; 101235944Sbz lc->lro_queued = 0; 102235944Sbz lc->lro_flushed = 0; 103235944Sbz lc->lro_cnt = 0; 104294327Shselasky lc->lro_mbuf_count = 0; 105294327Shselasky lc->lro_mbuf_max = lro_mbufs; 106294327Shselasky lc->lro_cnt = lro_entries; 107295739Ssephe lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; 108295739Ssephe lc->lro_length_lim = TCP_LRO_LENGTH_MAX; 109294327Shselasky lc->ifp = ifp; 110297483Ssephe LIST_INIT(&lc->lro_free); 111297483Ssephe LIST_INIT(&lc->lro_active); 112179737Sjfv 113294327Shselasky /* compute size to allocate */ 114300731Shselasky size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + 115294327Shselasky (lro_entries * sizeof(*le)); 116300731Shselasky lc->lro_mbuf_data = (struct lro_mbuf_sort *) 117294327Shselasky malloc(size, M_LRO, M_NOWAIT | M_ZERO); 118179737Sjfv 119294327Shselasky /* check for out of memory */ 120294327Shselasky if (lc->lro_mbuf_data == NULL) { 121294327Shselasky memset(lc, 0, sizeof(*lc)); 122294327Shselasky return (ENOMEM); 123294327Shselasky } 124294327Shselasky /* compute offset for LRO entries */ 125294327Shselasky le = (struct lro_entry *) 126294327Shselasky (lc->lro_mbuf_data + lro_mbufs); 127294327Shselasky 128294327Shselasky /* setup linked list */ 129294327Shselasky for (i = 0; i != lro_entries; i++) 130297483Ssephe LIST_INSERT_HEAD(&lc->lro_free, le + i, next); 131294327Shselasky 132294327Shselasky return (0); 133179737Sjfv} 134179737Sjfv 135179737Sjfvvoid 136235944Sbztcp_lro_free(struct lro_ctrl *lc) 137179737Sjfv{ 138235944Sbz struct lro_entry *le; 139294327Shselasky unsigned x; 140179737Sjfv 141294327Shselasky /* reset LRO free list */ 142297483Ssephe LIST_INIT(&lc->lro_free); 143294327Shselasky 144294327Shselasky /* free active mbufs, if any */ 145297483Ssephe while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { 146298974Ssephe tcp_lro_active_remove(le); 147294327Shselasky m_freem(le->m_head); 148179737Sjfv } 149294327Shselasky 150294327Shselasky /* free mbuf array, if any */ 151294327Shselasky for (x = 0; x != lc->lro_mbuf_count; x++) 152300731Shselasky m_freem(lc->lro_mbuf_data[x].mb); 153294327Shselasky lc->lro_mbuf_count = 0; 154294327Shselasky 155294327Shselasky /* free allocated memory, if any */ 156294327Shselasky free(lc->lro_mbuf_data, M_LRO); 157294327Shselasky lc->lro_mbuf_data = NULL; 158179737Sjfv} 159179737Sjfv 160235944Sbz#ifdef TCP_LRO_UPDATE_CSUM 161235944Sbzstatic uint16_t 162235944Sbztcp_lro_csum_th(struct tcphdr *th) 163235944Sbz{ 164235944Sbz uint32_t ch; 165235944Sbz uint16_t *p, l; 166235944Sbz 167235944Sbz ch = th->th_sum = 0x0000; 168235944Sbz l = th->th_off; 169235944Sbz p = (uint16_t *)th; 170235944Sbz while (l > 0) { 171235944Sbz ch += *p; 172235944Sbz p++; 173235944Sbz ch += *p; 174235944Sbz p++; 175235944Sbz l--; 176235944Sbz } 177235944Sbz while (ch > 0xffff) 178235944Sbz ch = (ch >> 16) + (ch & 0xffff); 179235944Sbz 180235944Sbz return (ch & 0xffff); 181235944Sbz} 182235944Sbz 183235944Sbzstatic uint16_t 184235944Sbztcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, 185235944Sbz uint16_t tcp_data_len, uint16_t csum) 186235944Sbz{ 187235944Sbz uint32_t c; 188235944Sbz uint16_t cs; 189235944Sbz 190235944Sbz c = csum; 191235944Sbz 192235944Sbz /* Remove length from checksum. */ 193235944Sbz switch (le->eh_type) { 194235944Sbz#ifdef INET6 195235944Sbz case ETHERTYPE_IPV6: 196235944Sbz { 197235944Sbz struct ip6_hdr *ip6; 198235944Sbz 199235944Sbz ip6 = (struct ip6_hdr *)l3hdr; 200235944Sbz if (le->append_cnt == 0) 201235944Sbz cs = ip6->ip6_plen; 202235944Sbz else { 203235944Sbz uint32_t cx; 204235944Sbz 205235944Sbz cx = ntohs(ip6->ip6_plen); 206235944Sbz cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); 207235944Sbz } 208235944Sbz break; 209235944Sbz } 210235944Sbz#endif 211235944Sbz#ifdef INET 212235944Sbz case ETHERTYPE_IP: 213235944Sbz { 214235944Sbz struct ip *ip4; 215235944Sbz 216235944Sbz ip4 = (struct ip *)l3hdr; 217235944Sbz if (le->append_cnt == 0) 218235944Sbz cs = ip4->ip_len; 219235944Sbz else { 220235944Sbz cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), 221235944Sbz IPPROTO_TCP); 222235944Sbz cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, 223235944Sbz htons(cs)); 224235944Sbz } 225235944Sbz break; 226235944Sbz } 227235944Sbz#endif 228235944Sbz default: 229235944Sbz cs = 0; /* Keep compiler happy. */ 230235944Sbz } 231235944Sbz 232235944Sbz cs = ~cs; 233235944Sbz c += cs; 234235944Sbz 235235944Sbz /* Remove TCP header csum. */ 236235944Sbz cs = ~tcp_lro_csum_th(th); 237235944Sbz c += cs; 238235944Sbz while (c > 0xffff) 239235944Sbz c = (c >> 16) + (c & 0xffff); 240235944Sbz 241235944Sbz return (c & 0xffff); 242235944Sbz} 243235944Sbz#endif 244235944Sbz 245297482Ssephestatic void 246297482Ssephetcp_lro_rx_done(struct lro_ctrl *lc) 247297482Ssephe{ 248297482Ssephe struct lro_entry *le; 249297482Ssephe 250297483Ssephe while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { 251298974Ssephe tcp_lro_active_remove(le); 252297482Ssephe tcp_lro_flush(lc, le); 253297482Ssephe } 254297482Ssephe} 255297482Ssephe 256179737Sjfvvoid 257255010Snptcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) 258255010Snp{ 259255010Snp struct lro_entry *le, *le_tmp; 260255010Snp struct timeval tv; 261255010Snp 262297483Ssephe if (LIST_EMPTY(&lc->lro_active)) 263255010Snp return; 264255010Snp 265255010Snp getmicrotime(&tv); 266255010Snp timevalsub(&tv, timeout); 267297483Ssephe LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { 268255010Snp if (timevalcmp(&tv, &le->mtime, >=)) { 269298974Ssephe tcp_lro_active_remove(le); 270255010Snp tcp_lro_flush(lc, le); 271255010Snp } 272255010Snp } 273255010Snp} 274255010Snp 275255010Snpvoid 276235944Sbztcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) 277179737Sjfv{ 278179737Sjfv 279235944Sbz if (le->append_cnt > 0) { 280235944Sbz struct tcphdr *th; 281235944Sbz uint16_t p_len; 282179737Sjfv 283235944Sbz p_len = htons(le->p_len); 284235944Sbz switch (le->eh_type) { 285235944Sbz#ifdef INET6 286235944Sbz case ETHERTYPE_IPV6: 287235944Sbz { 288235944Sbz struct ip6_hdr *ip6; 289179737Sjfv 290235944Sbz ip6 = le->le_ip6; 291235944Sbz ip6->ip6_plen = p_len; 292235944Sbz th = (struct tcphdr *)(ip6 + 1); 293235944Sbz le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 294235944Sbz CSUM_PSEUDO_HDR; 295235944Sbz le->p_len += ETHER_HDR_LEN + sizeof(*ip6); 296235944Sbz break; 297235944Sbz } 298235944Sbz#endif 299235944Sbz#ifdef INET 300235944Sbz case ETHERTYPE_IP: 301235944Sbz { 302235944Sbz struct ip *ip4; 303235944Sbz#ifdef TCP_LRO_UPDATE_CSUM 304235944Sbz uint32_t cl; 305235944Sbz uint16_t c; 306235944Sbz#endif 307179737Sjfv 308235944Sbz ip4 = le->le_ip4; 309235944Sbz#ifdef TCP_LRO_UPDATE_CSUM 310235944Sbz /* Fix IP header checksum for new length. */ 311235944Sbz c = ~ip4->ip_sum; 312235944Sbz cl = c; 313235944Sbz c = ~ip4->ip_len; 314235944Sbz cl += c + p_len; 315235944Sbz while (cl > 0xffff) 316235944Sbz cl = (cl >> 16) + (cl & 0xffff); 317235944Sbz c = cl; 318235944Sbz ip4->ip_sum = ~c; 319235944Sbz#else 320235944Sbz ip4->ip_sum = TCP_LRO_INVALID_CSUM; 321235944Sbz#endif 322235944Sbz ip4->ip_len = p_len; 323235944Sbz th = (struct tcphdr *)(ip4 + 1); 324235944Sbz le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 325235944Sbz CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; 326235944Sbz le->p_len += ETHER_HDR_LEN; 327235944Sbz break; 328179737Sjfv } 329235944Sbz#endif 330235944Sbz default: 331235944Sbz th = NULL; /* Keep compiler happy. */ 332235944Sbz } 333235944Sbz le->m_head->m_pkthdr.csum_data = 0xffff; 334235944Sbz le->m_head->m_pkthdr.len = le->p_len; 335235944Sbz 336235944Sbz /* Incorporate the latest ACK into the TCP header. */ 337235944Sbz th->th_ack = le->ack_seq; 338235944Sbz th->th_win = le->window; 339235944Sbz /* Incorporate latest timestamp into the TCP header. */ 340235944Sbz if (le->timestamp != 0) { 341235944Sbz uint32_t *ts_ptr; 342235944Sbz 343235944Sbz ts_ptr = (uint32_t *)(th + 1); 344235944Sbz ts_ptr[1] = htonl(le->tsval); 345235944Sbz ts_ptr[2] = le->tsecr; 346235944Sbz } 347235944Sbz#ifdef TCP_LRO_UPDATE_CSUM 348235944Sbz /* Update the TCP header checksum. */ 349235944Sbz le->ulp_csum += p_len; 350235944Sbz le->ulp_csum += tcp_lro_csum_th(th); 351235944Sbz while (le->ulp_csum > 0xffff) 352235944Sbz le->ulp_csum = (le->ulp_csum >> 16) + 353235944Sbz (le->ulp_csum & 0xffff); 354235944Sbz th->th_sum = (le->ulp_csum & 0xffff); 355235944Sbz th->th_sum = ~th->th_sum; 356235944Sbz#else 357235944Sbz th->th_sum = TCP_LRO_INVALID_CSUM; 358235944Sbz#endif 359179737Sjfv } 360235944Sbz 361235944Sbz (*lc->ifp->if_input)(lc->ifp, le->m_head); 362235944Sbz lc->lro_queued += le->append_cnt + 1; 363235944Sbz lc->lro_flushed++; 364235944Sbz bzero(le, sizeof(*le)); 365297483Ssephe LIST_INSERT_HEAD(&lc->lro_free, le, next); 366179737Sjfv} 367179737Sjfv 368300731Shselasky#ifdef HAVE_INLINE_FLSLL 369300731Shselasky#define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) 370300731Shselasky#else 371300731Shselaskystatic inline uint64_t 372300731Shselaskytcp_lro_msb_64(uint64_t x) 373294327Shselasky{ 374300731Shselasky x |= (x >> 1); 375300731Shselasky x |= (x >> 2); 376300731Shselasky x |= (x >> 4); 377300731Shselasky x |= (x >> 8); 378300731Shselasky x |= (x >> 16); 379300731Shselasky x |= (x >> 32); 380300731Shselasky return (x & ~(x >> 1)); 381300731Shselasky} 382300731Shselasky#endif 383294327Shselasky 384300731Shselasky/* 385300731Shselasky * The tcp_lro_sort() routine is comparable to qsort(), except it has 386300731Shselasky * a worst case complexity limit of O(MIN(N,64)*N), where N is the 387300731Shselasky * number of elements to sort and 64 is the number of sequence bits 388300731Shselasky * available. The algorithm is bit-slicing the 64-bit sequence number, 389300731Shselasky * sorting one bit at a time from the most significant bit until the 390301249Shselasky * least significant one, skipping the constant bits. This is 391301249Shselasky * typically called a radix sort. 392300731Shselasky */ 393300731Shselaskystatic void 394300731Shselaskytcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) 395300731Shselasky{ 396300731Shselasky struct lro_mbuf_sort temp; 397300731Shselasky uint64_t ones; 398300731Shselasky uint64_t zeros; 399300731Shselasky uint32_t x; 400300731Shselasky uint32_t y; 401294327Shselasky 402300731Shselaskyrepeat: 403301249Shselasky /* for small arrays insertion sort is faster */ 404300731Shselasky if (size <= 12) { 405301249Shselasky for (x = 1; x < size; x++) { 406301249Shselasky temp = parray[x]; 407301249Shselasky for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) 408301249Shselasky parray[y] = parray[y - 1]; 409301249Shselasky parray[y] = temp; 410300731Shselasky } 411300731Shselasky return; 412300731Shselasky } 413294327Shselasky 414300731Shselasky /* compute sequence bits which are constant */ 415300731Shselasky ones = 0; 416300731Shselasky zeros = 0; 417300731Shselasky for (x = 0; x != size; x++) { 418300731Shselasky ones |= parray[x].seq; 419300731Shselasky zeros |= ~parray[x].seq; 420300731Shselasky } 421300731Shselasky 422300731Shselasky /* compute bits which are not constant into "ones" */ 423300731Shselasky ones &= zeros; 424300731Shselasky if (ones == 0) 425300731Shselasky return; 426300731Shselasky 427300731Shselasky /* pick the most significant bit which is not constant */ 428300731Shselasky ones = tcp_lro_msb_64(ones); 429300731Shselasky 430300731Shselasky /* 431300731Shselasky * Move entries having cleared sequence bits to the beginning 432300731Shselasky * of the array: 433300731Shselasky */ 434300731Shselasky for (x = y = 0; y != size; y++) { 435300731Shselasky /* skip set bits */ 436300731Shselasky if (parray[y].seq & ones) 437300731Shselasky continue; 438300731Shselasky /* swap entries */ 439300731Shselasky temp = parray[x]; 440300731Shselasky parray[x] = parray[y]; 441300731Shselasky parray[y] = temp; 442300731Shselasky x++; 443300731Shselasky } 444300731Shselasky 445300731Shselasky KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); 446300731Shselasky 447300731Shselasky /* sort zeros */ 448300731Shselasky tcp_lro_sort(parray, x); 449300731Shselasky 450300731Shselasky /* sort ones */ 451300731Shselasky parray += x; 452300731Shselasky size -= x; 453300731Shselasky goto repeat; 454294327Shselasky} 455294327Shselasky 456294327Shselaskyvoid 457294327Shselaskytcp_lro_flush_all(struct lro_ctrl *lc) 458294327Shselasky{ 459300731Shselasky uint64_t seq; 460300731Shselasky uint64_t nseq; 461294327Shselasky unsigned x; 462294327Shselasky 463294327Shselasky /* check if no mbufs to flush */ 464297482Ssephe if (lc->lro_mbuf_count == 0) 465294327Shselasky goto done; 466294327Shselasky 467294327Shselasky /* sort all mbufs according to stream */ 468300731Shselasky tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); 469294327Shselasky 470294327Shselasky /* input data into LRO engine, stream by stream */ 471300731Shselasky seq = 0; 472294327Shselasky for (x = 0; x != lc->lro_mbuf_count; x++) { 473294327Shselasky struct mbuf *mb; 474294327Shselasky 475300731Shselasky /* get mbuf */ 476300731Shselasky mb = lc->lro_mbuf_data[x].mb; 477294327Shselasky 478300731Shselasky /* get sequence number, masking away the packet index */ 479300731Shselasky nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); 480300731Shselasky 481294327Shselasky /* check for new stream */ 482300731Shselasky if (seq != nseq) { 483300731Shselasky seq = nseq; 484294327Shselasky 485294327Shselasky /* flush active streams */ 486297482Ssephe tcp_lro_rx_done(lc); 487294327Shselasky } 488300731Shselasky 489294327Shselasky /* add packet to LRO engine */ 490294327Shselasky if (tcp_lro_rx(lc, mb, 0) != 0) { 491294327Shselasky /* input packet to network layer */ 492294327Shselasky (*lc->ifp->if_input)(lc->ifp, mb); 493294327Shselasky lc->lro_queued++; 494294327Shselasky lc->lro_flushed++; 495294327Shselasky } 496294327Shselasky } 497294327Shselaskydone: 498294327Shselasky /* flush active streams */ 499297482Ssephe tcp_lro_rx_done(lc); 500297482Ssephe 501294327Shselasky lc->lro_mbuf_count = 0; 502294327Shselasky} 503294327Shselasky 504235944Sbz#ifdef INET6 505235944Sbzstatic int 506235944Sbztcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, 507235944Sbz struct tcphdr **th) 508179737Sjfv{ 509179737Sjfv 510235944Sbz /* XXX-BZ we should check the flow-label. */ 511179737Sjfv 512235944Sbz /* XXX-BZ We do not yet support ext. hdrs. */ 513235944Sbz if (ip6->ip6_nxt != IPPROTO_TCP) 514235944Sbz return (TCP_LRO_NOT_SUPPORTED); 515179737Sjfv 516235944Sbz /* Find the TCP header. */ 517235944Sbz *th = (struct tcphdr *)(ip6 + 1); 518179737Sjfv 519235944Sbz return (0); 520235944Sbz} 521235944Sbz#endif 522235944Sbz 523235944Sbz#ifdef INET 524235944Sbzstatic int 525235944Sbztcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, 526235944Sbz struct tcphdr **th) 527235944Sbz{ 528235944Sbz int csum_flags; 529235944Sbz uint16_t csum; 530235944Sbz 531235944Sbz if (ip4->ip_p != IPPROTO_TCP) 532235944Sbz return (TCP_LRO_NOT_SUPPORTED); 533235944Sbz 534235944Sbz /* Ensure there are no options. */ 535235944Sbz if ((ip4->ip_hl << 2) != sizeof (*ip4)) 536235944Sbz return (TCP_LRO_CANNOT); 537235944Sbz 538235944Sbz /* .. and the packet is not fragmented. */ 539235944Sbz if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) 540235944Sbz return (TCP_LRO_CANNOT); 541235944Sbz 542235944Sbz /* Legacy IP has a header checksum that needs to be correct. */ 543235944Sbz csum_flags = m->m_pkthdr.csum_flags; 544182089Skmacy if (csum_flags & CSUM_IP_CHECKED) { 545182089Skmacy if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { 546235944Sbz lc->lro_bad_csum++; 547235944Sbz return (TCP_LRO_CANNOT); 548182089Skmacy } 549182089Skmacy } else { 550235944Sbz csum = in_cksum_hdr(ip4); 551247104Sgallatin if (__predict_false((csum) != 0)) { 552235944Sbz lc->lro_bad_csum++; 553235944Sbz return (TCP_LRO_CANNOT); 554182089Skmacy } 555179737Sjfv } 556179737Sjfv 557235944Sbz /* Find the TCP header (we assured there are no IP options). */ 558235944Sbz *th = (struct tcphdr *)(ip4 + 1); 559179737Sjfv 560235944Sbz return (0); 561235944Sbz} 562235944Sbz#endif 563179737Sjfv 564235944Sbzint 565235944Sbztcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) 566235944Sbz{ 567235944Sbz struct lro_entry *le; 568235944Sbz struct ether_header *eh; 569235944Sbz#ifdef INET6 570235944Sbz struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ 571235944Sbz#endif 572235944Sbz#ifdef INET 573235944Sbz struct ip *ip4 = NULL; /* Keep compiler happy. */ 574235944Sbz#endif 575235944Sbz struct tcphdr *th; 576235944Sbz void *l3hdr = NULL; /* Keep compiler happy. */ 577235944Sbz uint32_t *ts_ptr; 578235944Sbz tcp_seq seq; 579235944Sbz int error, ip_len, l; 580235944Sbz uint16_t eh_type, tcp_data_len; 581179737Sjfv 582235944Sbz /* We expect a contiguous header [eh, ip, tcp]. */ 583235944Sbz 584235944Sbz eh = mtod(m, struct ether_header *); 585235944Sbz eh_type = ntohs(eh->ether_type); 586235944Sbz switch (eh_type) { 587235944Sbz#ifdef INET6 588235944Sbz case ETHERTYPE_IPV6: 589236394Sbz { 590236394Sbz CURVNET_SET(lc->ifp->if_vnet); 591235981Sbz if (V_ip6_forwarding != 0) { 592235981Sbz /* XXX-BZ stats but changing lro_ctrl is a problem. */ 593236394Sbz CURVNET_RESTORE(); 594235981Sbz return (TCP_LRO_CANNOT); 595235981Sbz } 596236394Sbz CURVNET_RESTORE(); 597235944Sbz l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); 598235944Sbz error = tcp_lro_rx_ipv6(lc, m, ip6, &th); 599235944Sbz if (error != 0) 600235944Sbz return (error); 601235944Sbz tcp_data_len = ntohs(ip6->ip6_plen); 602235944Sbz ip_len = sizeof(*ip6) + tcp_data_len; 603235944Sbz break; 604236394Sbz } 605235944Sbz#endif 606235944Sbz#ifdef INET 607235944Sbz case ETHERTYPE_IP: 608236394Sbz { 609236394Sbz CURVNET_SET(lc->ifp->if_vnet); 610235981Sbz if (V_ipforwarding != 0) { 611235981Sbz /* XXX-BZ stats but changing lro_ctrl is a problem. */ 612236394Sbz CURVNET_RESTORE(); 613235981Sbz return (TCP_LRO_CANNOT); 614235981Sbz } 615236394Sbz CURVNET_RESTORE(); 616235944Sbz l3hdr = ip4 = (struct ip *)(eh + 1); 617235944Sbz error = tcp_lro_rx_ipv4(lc, m, ip4, &th); 618235944Sbz if (error != 0) 619235944Sbz return (error); 620235944Sbz ip_len = ntohs(ip4->ip_len); 621235944Sbz tcp_data_len = ip_len - sizeof(*ip4); 622235944Sbz break; 623236394Sbz } 624235944Sbz#endif 625235944Sbz /* XXX-BZ what happens in case of VLAN(s)? */ 626235944Sbz default: 627235944Sbz return (TCP_LRO_NOT_SUPPORTED); 628179737Sjfv } 629179737Sjfv 630235944Sbz /* 631235944Sbz * If the frame is padded beyond the end of the IP packet, then we must 632235944Sbz * trim the extra bytes off. 633235944Sbz */ 634235944Sbz l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); 635235944Sbz if (l != 0) { 636235944Sbz if (l < 0) 637235944Sbz /* Truncated packet. */ 638235944Sbz return (TCP_LRO_CANNOT); 639179737Sjfv 640235944Sbz m_adj(m, -l); 641235944Sbz } 642235944Sbz 643235944Sbz /* 644235944Sbz * Check TCP header constraints. 645179737Sjfv */ 646235944Sbz /* Ensure no bits set besides ACK or PSH. */ 647235944Sbz if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) 648235944Sbz return (TCP_LRO_CANNOT); 649235944Sbz 650298730Ssephe /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */ 651235944Sbz /* XXX-BZ Ideally we'd flush on PUSH? */ 652235944Sbz 653235944Sbz /* 654235944Sbz * Check for timestamps. 655235944Sbz * Since the only option we handle are timestamps, we only have to 656235944Sbz * handle the simple case of aligned timestamps. 657235944Sbz */ 658235944Sbz l = (th->th_off << 2); 659235944Sbz tcp_data_len -= l; 660235944Sbz l -= sizeof(*th); 661235944Sbz ts_ptr = (uint32_t *)(th + 1); 662235944Sbz if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || 663235944Sbz (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 664235944Sbz TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) 665235944Sbz return (TCP_LRO_CANNOT); 666235944Sbz 667235944Sbz /* If the driver did not pass in the checksum, set it now. */ 668235944Sbz if (csum == 0x0000) 669235944Sbz csum = th->th_sum; 670235944Sbz 671235944Sbz seq = ntohl(th->th_seq); 672235944Sbz 673235944Sbz /* Try to find a matching previous segment. */ 674297483Ssephe LIST_FOREACH(le, &lc->lro_active, next) { 675235944Sbz if (le->eh_type != eh_type) 676235944Sbz continue; 677235944Sbz if (le->source_port != th->th_sport || 678235944Sbz le->dest_port != th->th_dport) 679235944Sbz continue; 680235944Sbz switch (eh_type) { 681235944Sbz#ifdef INET6 682235944Sbz case ETHERTYPE_IPV6: 683235944Sbz if (bcmp(&le->source_ip6, &ip6->ip6_src, 684235944Sbz sizeof(struct in6_addr)) != 0 || 685235944Sbz bcmp(&le->dest_ip6, &ip6->ip6_dst, 686235944Sbz sizeof(struct in6_addr)) != 0) 687235944Sbz continue; 688235944Sbz break; 689235944Sbz#endif 690235944Sbz#ifdef INET 691235944Sbz case ETHERTYPE_IP: 692235944Sbz if (le->source_ip4 != ip4->ip_src.s_addr || 693235944Sbz le->dest_ip4 != ip4->ip_dst.s_addr) 694235944Sbz continue; 695235944Sbz break; 696235944Sbz#endif 697179737Sjfv } 698179737Sjfv 699235944Sbz /* Flush now if appending will result in overflow. */ 700295739Ssephe if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { 701298974Ssephe tcp_lro_active_remove(le); 702235944Sbz tcp_lro_flush(lc, le); 703235944Sbz break; 704235944Sbz } 705179737Sjfv 706235944Sbz /* Try to append the new segment. */ 707235944Sbz if (__predict_false(seq != le->next_seq || 708235944Sbz (tcp_data_len == 0 && le->ack_seq == th->th_ack))) { 709235944Sbz /* Out of order packet or duplicate ACK. */ 710298974Ssephe tcp_lro_active_remove(le); 711235944Sbz tcp_lro_flush(lc, le); 712235944Sbz return (TCP_LRO_CANNOT); 713235944Sbz } 714179737Sjfv 715235944Sbz if (l != 0) { 716235944Sbz uint32_t tsval = ntohl(*(ts_ptr + 1)); 717235944Sbz /* Make sure timestamp values are increasing. */ 718235944Sbz /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ 719235944Sbz if (__predict_false(le->tsval > tsval || 720235944Sbz *(ts_ptr + 2) == 0)) 721235944Sbz return (TCP_LRO_CANNOT); 722235944Sbz le->tsval = tsval; 723235944Sbz le->tsecr = *(ts_ptr + 2); 724235944Sbz } 725223797Scperciva 726235944Sbz le->next_seq += tcp_data_len; 727235944Sbz le->ack_seq = th->th_ack; 728235944Sbz le->window = th->th_win; 729235944Sbz le->append_cnt++; 730179737Sjfv 731235944Sbz#ifdef TCP_LRO_UPDATE_CSUM 732235944Sbz le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th, 733235944Sbz tcp_data_len, ~csum); 734235944Sbz#endif 735179737Sjfv 736235944Sbz if (tcp_data_len == 0) { 737235944Sbz m_freem(m); 738295739Ssephe /* 739295739Ssephe * Flush this LRO entry, if this ACK should not 740295739Ssephe * be further delayed. 741295739Ssephe */ 742295739Ssephe if (le->append_cnt >= lc->lro_ackcnt_lim) { 743298974Ssephe tcp_lro_active_remove(le); 744295739Ssephe tcp_lro_flush(lc, le); 745295739Ssephe } 746235944Sbz return (0); 747235944Sbz } 748179737Sjfv 749235944Sbz le->p_len += tcp_data_len; 750179737Sjfv 751235944Sbz /* 752235944Sbz * Adjust the mbuf so that m_data points to the first byte of 753235944Sbz * the ULP payload. Adjust the mbuf to avoid complications and 754235944Sbz * append new segment to existing mbuf chain. 755235944Sbz */ 756235944Sbz m_adj(m, m->m_pkthdr.len - tcp_data_len); 757284961Snp m_demote_pkthdr(m); 758179737Sjfv 759235944Sbz le->m_tail->m_next = m; 760235944Sbz le->m_tail = m_last(m); 761235944Sbz 762235944Sbz /* 763235944Sbz * If a possible next full length packet would cause an 764235944Sbz * overflow, pro-actively flush now. 765235944Sbz */ 766295739Ssephe if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) { 767298974Ssephe tcp_lro_active_remove(le); 768235944Sbz tcp_lro_flush(lc, le); 769255010Snp } else 770255010Snp getmicrotime(&le->mtime); 771235944Sbz 772235944Sbz return (0); 773179737Sjfv } 774179737Sjfv 775235944Sbz /* Try to find an empty slot. */ 776297483Ssephe if (LIST_EMPTY(&lc->lro_free)) 777297265Ssephe return (TCP_LRO_NO_ENTRIES); 778179737Sjfv 779235944Sbz /* Start a new segment chain. */ 780297483Ssephe le = LIST_FIRST(&lc->lro_free); 781297483Ssephe LIST_REMOVE(le, next); 782298974Ssephe tcp_lro_active_insert(lc, le); 783255010Snp getmicrotime(&le->mtime); 784179737Sjfv 785235944Sbz /* Start filling in details. */ 786235944Sbz switch (eh_type) { 787235944Sbz#ifdef INET6 788235944Sbz case ETHERTYPE_IPV6: 789235944Sbz le->le_ip6 = ip6; 790235944Sbz le->source_ip6 = ip6->ip6_src; 791235944Sbz le->dest_ip6 = ip6->ip6_dst; 792235944Sbz le->eh_type = eh_type; 793235944Sbz le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); 794235944Sbz break; 795235944Sbz#endif 796235944Sbz#ifdef INET 797235944Sbz case ETHERTYPE_IP: 798235944Sbz le->le_ip4 = ip4; 799235944Sbz le->source_ip4 = ip4->ip_src.s_addr; 800235944Sbz le->dest_ip4 = ip4->ip_dst.s_addr; 801235944Sbz le->eh_type = eh_type; 802235944Sbz le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; 803235944Sbz break; 804235944Sbz#endif 805235944Sbz } 806235944Sbz le->source_port = th->th_sport; 807235944Sbz le->dest_port = th->th_dport; 808235944Sbz 809235944Sbz le->next_seq = seq + tcp_data_len; 810235944Sbz le->ack_seq = th->th_ack; 811235944Sbz le->window = th->th_win; 812235944Sbz if (l != 0) { 813235944Sbz le->timestamp = 1; 814235944Sbz le->tsval = ntohl(*(ts_ptr + 1)); 815235944Sbz le->tsecr = *(ts_ptr + 2); 816235944Sbz } 817235944Sbz 818235944Sbz#ifdef TCP_LRO_UPDATE_CSUM 819235944Sbz /* 820235944Sbz * Do not touch the csum of the first packet. However save the 821235944Sbz * "adjusted" checksum of just the source and destination addresses, 822235944Sbz * the next header and the TCP payload. The length and TCP header 823235944Sbz * parts may change, so we remove those from the saved checksum and 824235944Sbz * re-add with final values on tcp_lro_flush() if needed. 825179737Sjfv */ 826235944Sbz KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", 827235944Sbz __func__, le, le->ulp_csum)); 828235944Sbz 829235944Sbz le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, 830235944Sbz ~csum); 831235944Sbz th->th_sum = csum; /* Restore checksum on first packet. */ 832235944Sbz#endif 833235944Sbz 834235944Sbz le->m_head = m; 835235944Sbz le->m_tail = m_last(m); 836235944Sbz 837235944Sbz return (0); 838179737Sjfv} 839235944Sbz 840294327Shselaskyvoid 841294327Shselaskytcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) 842294327Shselasky{ 843294327Shselasky /* sanity checks */ 844294327Shselasky if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || 845294327Shselasky lc->lro_mbuf_max == 0)) { 846294327Shselasky /* packet drop */ 847294327Shselasky m_freem(mb); 848294327Shselasky return; 849294327Shselasky } 850294327Shselasky 851294327Shselasky /* check if packet is not LRO capable */ 852294327Shselasky if (__predict_false(mb->m_pkthdr.csum_flags == 0 || 853294327Shselasky (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { 854294327Shselasky lc->lro_flushed++; 855294327Shselasky lc->lro_queued++; 856294327Shselasky 857294327Shselasky /* input packet to network layer */ 858294327Shselasky (*lc->ifp->if_input) (lc->ifp, mb); 859294327Shselasky return; 860294327Shselasky } 861294327Shselasky 862294327Shselasky /* check if array is full */ 863294327Shselasky if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max)) 864294327Shselasky tcp_lro_flush_all(lc); 865294327Shselasky 866300731Shselasky /* create sequence number */ 867300731Shselasky lc->lro_mbuf_data[lc->lro_mbuf_count].seq = 868300731Shselasky (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | 869300731Shselasky (((uint64_t)mb->m_pkthdr.flowid) << 24) | 870300731Shselasky ((uint64_t)lc->lro_mbuf_count); 871294327Shselasky 872294327Shselasky /* enter mbuf */ 873300731Shselasky lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb; 874294327Shselasky} 875294327Shselasky 876235944Sbz/* end */ 877