tcp_lro.c revision 235474
1235474Sbz/*- 2235474Sbz * Copyright (c) 2007, Myricom Inc. 3235474Sbz * Copyright (c) 2008, Intel Corporation. 4235474Sbz * All rights reserved. 5235474Sbz * 6235474Sbz * Redistribution and use in source and binary forms, with or without 7235474Sbz * modification, are permitted provided that the following conditions 8235474Sbz * are met: 9235474Sbz * 1. Redistributions of source code must retain the above copyright 10235474Sbz * notice, this list of conditions and the following disclaimer. 11235474Sbz * 2. Redistributions in binary form must reproduce the above copyright 12235474Sbz * notice, this list of conditions and the following disclaimer in the 13235474Sbz * documentation and/or other materials provided with the distribution. 14235474Sbz * 15235474Sbz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16235474Sbz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17235474Sbz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18235474Sbz * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19235474Sbz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20235474Sbz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21235474Sbz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22235474Sbz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23235474Sbz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24235474Sbz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25235474Sbz * SUCH DAMAGE. 26235474Sbz * 27235474Sbz * $FreeBSD: head/sys/netinet/tcp_lro.c 235474 2012-05-15 13:23:44Z bz $ 28235474Sbz */ 29179737Sjfv 30179737Sjfv#include <sys/param.h> 31179737Sjfv#include <sys/systm.h> 32179737Sjfv#include <sys/endian.h> 33179737Sjfv#include <sys/mbuf.h> 34179737Sjfv#include <sys/kernel.h> 35179737Sjfv#include <sys/socket.h> 36179737Sjfv 37179737Sjfv#include <net/if.h> 38179737Sjfv#include <net/ethernet.h> 39179737Sjfv#include <net/if_media.h> 40179737Sjfv 41179737Sjfv#include <netinet/in_systm.h> 42179737Sjfv#include <netinet/in.h> 43179737Sjfv#include <netinet/ip.h> 44179737Sjfv#include <netinet/tcp.h> 45179737Sjfv#include <netinet/tcp_lro.h> 46179737Sjfv 47179737Sjfv#include <machine/bus.h> 48179737Sjfv#include <machine/in_cksum.h> 49179737Sjfv 50179737Sjfv 51179737Sjfvstatic uint16_t do_csum_data(uint16_t *raw, int len) 52179737Sjfv{ 53179737Sjfv uint32_t csum; 54179737Sjfv csum = 0; 55179737Sjfv while (len > 0) { 56179737Sjfv csum += *raw; 57179737Sjfv raw++; 58179737Sjfv csum += *raw; 59179737Sjfv raw++; 60179737Sjfv len -= 4; 61179737Sjfv } 62179737Sjfv csum = (csum >> 16) + (csum & 0xffff); 63179737Sjfv csum = (csum >> 16) + (csum & 0xffff); 64179737Sjfv return (uint16_t)csum; 65179737Sjfv} 66179737Sjfv 67179737Sjfv/* 68179737Sjfv * Allocate and init the LRO data structures 69179737Sjfv */ 70179737Sjfvint 71179737Sjfvtcp_lro_init(struct lro_ctrl *cntl) 72179737Sjfv{ 73179737Sjfv struct lro_entry *lro; 74179737Sjfv int i, error = 0; 75179737Sjfv 76179737Sjfv SLIST_INIT(&cntl->lro_free); 77179737Sjfv SLIST_INIT(&cntl->lro_active); 78179737Sjfv 79179737Sjfv cntl->lro_bad_csum = 0; 80179737Sjfv cntl->lro_queued = 0; 81179737Sjfv cntl->lro_flushed = 0; 82179737Sjfv 83179737Sjfv for (i = 0; i < LRO_ENTRIES; i++) { 84179737Sjfv lro = (struct lro_entry *) malloc(sizeof (struct lro_entry), 85179737Sjfv M_DEVBUF, M_NOWAIT | M_ZERO); 86179737Sjfv if (lro == NULL) { 87179737Sjfv if (i == 0) 88179737Sjfv error = ENOMEM; 89179737Sjfv break; 90179737Sjfv } 91179737Sjfv cntl->lro_cnt = i; 92179737Sjfv SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 93179737Sjfv } 94179737Sjfv 95179737Sjfv return (error); 96179737Sjfv} 97179737Sjfv 98179737Sjfvvoid 99179737Sjfvtcp_lro_free(struct lro_ctrl *cntl) 100179737Sjfv{ 101179737Sjfv struct lro_entry *entry; 102179737Sjfv 103179737Sjfv while (!SLIST_EMPTY(&cntl->lro_free)) { 104179737Sjfv entry = SLIST_FIRST(&cntl->lro_free); 105217126Sjhb SLIST_REMOVE_HEAD(&cntl->lro_free, next); 106179737Sjfv free(entry, M_DEVBUF); 107179737Sjfv } 108179737Sjfv} 109179737Sjfv 110179737Sjfvvoid 111179737Sjfvtcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro) 112179737Sjfv{ 113179737Sjfv struct ifnet *ifp; 114179737Sjfv struct ip *ip; 115179737Sjfv struct tcphdr *tcp; 116179737Sjfv uint32_t *ts_ptr; 117179737Sjfv uint32_t tcplen, tcp_csum; 118179737Sjfv 119179737Sjfv 120179737Sjfv if (lro->append_cnt) { 121179737Sjfv /* incorporate the new len into the ip header and 122179737Sjfv * re-calculate the checksum */ 123179737Sjfv ip = lro->ip; 124179737Sjfv ip->ip_len = htons(lro->len - ETHER_HDR_LEN); 125179737Sjfv ip->ip_sum = 0; 126179737Sjfv ip->ip_sum = 0xffff ^ 127179737Sjfv do_csum_data((uint16_t*)ip, 128179737Sjfv sizeof (*ip)); 129179737Sjfv 130179737Sjfv lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED | 131179737Sjfv CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 132179737Sjfv lro->m_head->m_pkthdr.csum_data = 0xffff; 133179737Sjfv lro->m_head->m_pkthdr.len = lro->len; 134179737Sjfv 135179737Sjfv /* incorporate the latest ack into the tcp header */ 136179737Sjfv tcp = (struct tcphdr *) (ip + 1); 137179737Sjfv tcp->th_ack = lro->ack_seq; 138179737Sjfv tcp->th_win = lro->window; 139179737Sjfv /* incorporate latest timestamp into the tcp header */ 140179737Sjfv if (lro->timestamp) { 141179737Sjfv ts_ptr = (uint32_t *)(tcp + 1); 142179737Sjfv ts_ptr[1] = htonl(lro->tsval); 143179737Sjfv ts_ptr[2] = lro->tsecr; 144179737Sjfv } 145179737Sjfv /* 146179737Sjfv * update checksum in tcp header by re-calculating the 147179737Sjfv * tcp pseudoheader checksum, and adding it to the checksum 148179737Sjfv * of the tcp payload data 149179737Sjfv */ 150179737Sjfv tcp->th_sum = 0; 151179737Sjfv tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN; 152179737Sjfv tcp_csum = lro->data_csum; 153179737Sjfv tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 154179737Sjfv htons(tcplen + IPPROTO_TCP)); 155179737Sjfv tcp_csum += do_csum_data((uint16_t*)tcp, 156179737Sjfv tcp->th_off << 2); 157179737Sjfv tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 158179737Sjfv tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 159179737Sjfv tcp->th_sum = 0xffff ^ tcp_csum; 160179737Sjfv } 161179737Sjfv ifp = cntl->ifp; 162179737Sjfv (*ifp->if_input)(cntl->ifp, lro->m_head); 163179737Sjfv cntl->lro_queued += lro->append_cnt + 1; 164179737Sjfv cntl->lro_flushed++; 165179737Sjfv lro->m_head = NULL; 166179737Sjfv lro->timestamp = 0; 167179737Sjfv lro->append_cnt = 0; 168179737Sjfv SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 169179737Sjfv} 170179737Sjfv 171179737Sjfvint 172179737Sjfvtcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum) 173179737Sjfv{ 174179737Sjfv struct ether_header *eh; 175179737Sjfv struct ip *ip; 176179737Sjfv struct tcphdr *tcp; 177179737Sjfv uint32_t *ts_ptr; 178179737Sjfv struct mbuf *m_nxt, *m_tail; 179179737Sjfv struct lro_entry *lro; 180179737Sjfv int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len; 181182089Skmacy int opt_bytes, trim, csum_flags; 182179737Sjfv uint32_t seq, tmp_csum, device_mtu; 183179737Sjfv 184179737Sjfv 185179737Sjfv eh = mtod(m_head, struct ether_header *); 186179737Sjfv if (eh->ether_type != htons(ETHERTYPE_IP)) 187179737Sjfv return 1; 188179737Sjfv ip = (struct ip *) (eh + 1); 189179737Sjfv if (ip->ip_p != IPPROTO_TCP) 190179737Sjfv return 1; 191179737Sjfv 192179737Sjfv /* ensure there are no options */ 193179737Sjfv if ((ip->ip_hl << 2) != sizeof (*ip)) 194179737Sjfv return -1; 195179737Sjfv 196179737Sjfv /* .. and the packet is not fragmented */ 197179737Sjfv if (ip->ip_off & htons(IP_MF|IP_OFFMASK)) 198179737Sjfv return -1; 199179737Sjfv 200179737Sjfv /* verify that the IP header checksum is correct */ 201182089Skmacy csum_flags = m_head->m_pkthdr.csum_flags; 202182089Skmacy if (csum_flags & CSUM_IP_CHECKED) { 203182089Skmacy if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { 204182089Skmacy cntl->lro_bad_csum++; 205182089Skmacy return -1; 206182089Skmacy } 207182089Skmacy } else { 208182089Skmacy tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip)); 209182089Skmacy if (__predict_false((tmp_csum ^ 0xffff) != 0)) { 210182089Skmacy cntl->lro_bad_csum++; 211182089Skmacy return -1; 212182089Skmacy } 213179737Sjfv } 214182089Skmacy 215179737Sjfv /* find the TCP header */ 216179737Sjfv tcp = (struct tcphdr *) (ip + 1); 217179737Sjfv 218179737Sjfv /* Get the TCP checksum if we dont have it */ 219179737Sjfv if (!csum) 220179737Sjfv csum = tcp->th_sum; 221179737Sjfv 222179737Sjfv /* ensure no bits set besides ack or psh */ 223179737Sjfv if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0) 224179737Sjfv return -1; 225179737Sjfv 226179737Sjfv /* check for timestamps. Since the only option we handle are 227179737Sjfv timestamps, we only have to handle the simple case of 228179737Sjfv aligned timestamps */ 229179737Sjfv 230179737Sjfv opt_bytes = (tcp->th_off << 2) - sizeof (*tcp); 231179737Sjfv tcp_hdr_len = sizeof (*tcp) + opt_bytes; 232179737Sjfv ts_ptr = (uint32_t *)(tcp + 1); 233179737Sjfv if (opt_bytes != 0) { 234179737Sjfv if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) || 235179737Sjfv (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 236179737Sjfv TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))) 237179737Sjfv return -1; 238179737Sjfv } 239179737Sjfv 240179737Sjfv ip_len = ntohs(ip->ip_len); 241179737Sjfv tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip); 242179737Sjfv 243179737Sjfv 244179737Sjfv /* 245179737Sjfv * If frame is padded beyond the end of the IP packet, 246179737Sjfv * then we must trim the extra bytes off the end. 247179737Sjfv */ 248179737Sjfv tot_len = m_head->m_pkthdr.len; 249179737Sjfv trim = tot_len - (ip_len + ETHER_HDR_LEN); 250179737Sjfv if (trim != 0) { 251179737Sjfv if (trim < 0) { 252179737Sjfv /* truncated packet */ 253179737Sjfv return -1; 254179737Sjfv } 255179737Sjfv m_adj(m_head, -trim); 256179737Sjfv tot_len = m_head->m_pkthdr.len; 257179737Sjfv } 258179737Sjfv 259179737Sjfv m_nxt = m_head; 260179737Sjfv m_tail = NULL; /* -Wuninitialized */ 261179737Sjfv while (m_nxt != NULL) { 262179737Sjfv m_tail = m_nxt; 263179737Sjfv m_nxt = m_tail->m_next; 264179737Sjfv } 265179737Sjfv 266179737Sjfv hlen = ip_len + ETHER_HDR_LEN - tcp_data_len; 267179737Sjfv seq = ntohl(tcp->th_seq); 268179737Sjfv 269179737Sjfv SLIST_FOREACH(lro, &cntl->lro_active, next) { 270179737Sjfv if (lro->source_port == tcp->th_sport && 271179737Sjfv lro->dest_port == tcp->th_dport && 272179737Sjfv lro->source_ip == ip->ip_src.s_addr && 273179737Sjfv lro->dest_ip == ip->ip_dst.s_addr) { 274223797Scperciva /* Flush now if appending will result in overflow. */ 275223797Scperciva if (lro->len > (65535 - tcp_data_len)) { 276223797Scperciva SLIST_REMOVE(&cntl->lro_active, lro, 277223797Scperciva lro_entry, next); 278223797Scperciva tcp_lro_flush(cntl, lro); 279223797Scperciva break; 280223797Scperciva } 281223797Scperciva 282179737Sjfv /* Try to append it */ 283179737Sjfv 284220428Sjfv if (__predict_false(seq != lro->next_seq || 285220428Sjfv (tcp_data_len == 0 && 286220428Sjfv lro->ack_seq == tcp->th_ack))) { 287220428Sjfv /* out of order packet or dup ack */ 288179737Sjfv SLIST_REMOVE(&cntl->lro_active, lro, 289179737Sjfv lro_entry, next); 290179737Sjfv tcp_lro_flush(cntl, lro); 291179737Sjfv return -1; 292179737Sjfv } 293179737Sjfv 294179737Sjfv if (opt_bytes) { 295179737Sjfv uint32_t tsval = ntohl(*(ts_ptr + 1)); 296179737Sjfv /* make sure timestamp values are increasing */ 297179737Sjfv if (__predict_false(lro->tsval > tsval || 298179737Sjfv *(ts_ptr + 2) == 0)) { 299179737Sjfv return -1; 300179737Sjfv } 301179737Sjfv lro->tsval = tsval; 302179737Sjfv lro->tsecr = *(ts_ptr + 2); 303179737Sjfv } 304179737Sjfv 305179737Sjfv lro->next_seq += tcp_data_len; 306179737Sjfv lro->ack_seq = tcp->th_ack; 307179737Sjfv lro->window = tcp->th_win; 308179737Sjfv lro->append_cnt++; 309179737Sjfv if (tcp_data_len == 0) { 310179737Sjfv m_freem(m_head); 311179737Sjfv return 0; 312179737Sjfv } 313179737Sjfv /* subtract off the checksum of the tcp header 314179737Sjfv * from the hardware checksum, and add it to the 315179737Sjfv * stored tcp data checksum. Byteswap the checksum 316179737Sjfv * if the total length so far is odd 317179737Sjfv */ 318179737Sjfv tmp_csum = do_csum_data((uint16_t*)tcp, 319179737Sjfv tcp_hdr_len); 320179737Sjfv csum = csum + (tmp_csum ^ 0xffff); 321179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 322179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 323179737Sjfv if (lro->len & 0x1) { 324179737Sjfv /* Odd number of bytes so far, flip bytes */ 325179737Sjfv csum = ((csum << 8) | (csum >> 8)) & 0xffff; 326179737Sjfv } 327179737Sjfv csum = csum + lro->data_csum; 328179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 329179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 330179737Sjfv lro->data_csum = csum; 331179737Sjfv 332179737Sjfv lro->len += tcp_data_len; 333179737Sjfv 334179737Sjfv /* adjust mbuf so that m->m_data points to 335179737Sjfv the first byte of the payload */ 336179737Sjfv m_adj(m_head, hlen); 337179737Sjfv /* append mbuf chain */ 338179737Sjfv lro->m_tail->m_next = m_head; 339179737Sjfv /* advance the last pointer */ 340179737Sjfv lro->m_tail = m_tail; 341179737Sjfv /* flush packet if required */ 342179737Sjfv device_mtu = cntl->ifp->if_mtu; 343179737Sjfv if (lro->len > (65535 - device_mtu)) { 344179737Sjfv SLIST_REMOVE(&cntl->lro_active, lro, 345179737Sjfv lro_entry, next); 346179737Sjfv tcp_lro_flush(cntl, lro); 347179737Sjfv } 348179737Sjfv return 0; 349179737Sjfv } 350179737Sjfv } 351179737Sjfv 352179737Sjfv if (SLIST_EMPTY(&cntl->lro_free)) 353179737Sjfv return -1; 354179737Sjfv 355179737Sjfv /* start a new chain */ 356179737Sjfv lro = SLIST_FIRST(&cntl->lro_free); 357179737Sjfv SLIST_REMOVE_HEAD(&cntl->lro_free, next); 358179737Sjfv SLIST_INSERT_HEAD(&cntl->lro_active, lro, next); 359179737Sjfv lro->source_port = tcp->th_sport; 360179737Sjfv lro->dest_port = tcp->th_dport; 361179737Sjfv lro->source_ip = ip->ip_src.s_addr; 362179737Sjfv lro->dest_ip = ip->ip_dst.s_addr; 363179737Sjfv lro->next_seq = seq + tcp_data_len; 364179737Sjfv lro->mss = tcp_data_len; 365179737Sjfv lro->ack_seq = tcp->th_ack; 366179737Sjfv lro->window = tcp->th_win; 367179737Sjfv 368179737Sjfv /* save the checksum of just the TCP payload by 369179737Sjfv * subtracting off the checksum of the TCP header from 370179737Sjfv * the entire hardware checksum 371179737Sjfv * Since IP header checksum is correct, checksum over 372179737Sjfv * the IP header is -0. Substracting -0 is unnecessary. 373179737Sjfv */ 374179737Sjfv tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len); 375179737Sjfv csum = csum + (tmp_csum ^ 0xffff); 376179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 377179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 378179737Sjfv lro->data_csum = csum; 379179737Sjfv 380179737Sjfv lro->ip = ip; 381179737Sjfv /* record timestamp if it is present */ 382179737Sjfv if (opt_bytes) { 383179737Sjfv lro->timestamp = 1; 384179737Sjfv lro->tsval = ntohl(*(ts_ptr + 1)); 385179737Sjfv lro->tsecr = *(ts_ptr + 2); 386179737Sjfv } 387179737Sjfv lro->len = tot_len; 388179737Sjfv lro->m_head = m_head; 389179737Sjfv lro->m_tail = m_tail; 390179737Sjfv return 0; 391179737Sjfv} 392