tcp_lro.c revision 182089
1179737Sjfv/****************************************************************************** 2179737Sjfv 3179737SjfvCopyright (c) 2007, Myricom Inc. 4179737SjfvCopyright (c) 2008, Intel Corporation. 5179737SjfvAll rights reserved. 6179737Sjfv 7179737SjfvRedistribution and use in source and binary forms, with or without 8179737Sjfvmodification, are permitted provided that the following conditions are met: 9179737Sjfv 10179737Sjfv 1. Redistributions of source code must retain the above copyright notice, 11179737Sjfv this list of conditions and the following disclaimer. 12179737Sjfv 13179737Sjfv 2. Neither the name of the Myricom Inc, nor the names of its 14179737Sjfv contributors may be used to endorse or promote products derived from 15179737Sjfv this software without specific prior written permission. 16179737Sjfv 17179737Sjfv 3. Neither the name of the Intel Corporation, nor the names of its 18179737Sjfv contributors may be used to endorse or promote products derived from 19179737Sjfv this software without specific prior written permission. 20179737Sjfv 21179737SjfvTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22179737SjfvAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23179737SjfvIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24179737SjfvARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 25179737SjfvLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26179737SjfvCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27179737SjfvSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28179737SjfvINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29179737SjfvCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30179737SjfvARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31179737SjfvPOSSIBILITY OF SUCH DAMAGE. 32179737Sjfv 33179737Sjfv$FreeBSD: head/sys/netinet/tcp_lro.c 182089 2008-08-24 02:31:09Z kmacy $ 34179737Sjfv***************************************************************************/ 35179737Sjfv 36179737Sjfv#include <sys/param.h> 37179737Sjfv#include <sys/systm.h> 38179737Sjfv#include <sys/endian.h> 39179737Sjfv#include <sys/mbuf.h> 40179737Sjfv#include <sys/kernel.h> 41179737Sjfv#include <sys/socket.h> 42179737Sjfv 43179737Sjfv#include <net/if.h> 44179737Sjfv#include <net/ethernet.h> 45179737Sjfv#include <net/if_media.h> 46179737Sjfv 47179737Sjfv#include <netinet/in_systm.h> 48179737Sjfv#include <netinet/in.h> 49179737Sjfv#include <netinet/ip.h> 50179737Sjfv#include <netinet/tcp.h> 51179737Sjfv#include <netinet/tcp_lro.h> 52179737Sjfv 53179737Sjfv#include <machine/bus.h> 54179737Sjfv#include <machine/in_cksum.h> 55179737Sjfv 56179737Sjfv 57179737Sjfvstatic uint16_t do_csum_data(uint16_t *raw, int len) 58179737Sjfv{ 59179737Sjfv uint32_t csum; 60179737Sjfv csum = 0; 61179737Sjfv while (len > 0) { 62179737Sjfv csum += *raw; 63179737Sjfv raw++; 64179737Sjfv csum += *raw; 65179737Sjfv raw++; 66179737Sjfv len -= 4; 67179737Sjfv } 68179737Sjfv csum = (csum >> 16) + (csum & 0xffff); 69179737Sjfv csum = (csum >> 16) + (csum & 0xffff); 70179737Sjfv return (uint16_t)csum; 71179737Sjfv} 72179737Sjfv 73179737Sjfv/* 74179737Sjfv * Allocate and init the LRO data structures 75179737Sjfv */ 76179737Sjfvint 77179737Sjfvtcp_lro_init(struct lro_ctrl *cntl) 78179737Sjfv{ 79179737Sjfv struct lro_entry *lro; 80179737Sjfv int i, error = 0; 81179737Sjfv 82179737Sjfv SLIST_INIT(&cntl->lro_free); 83179737Sjfv SLIST_INIT(&cntl->lro_active); 84179737Sjfv 85179737Sjfv cntl->lro_bad_csum = 0; 86179737Sjfv cntl->lro_queued = 0; 87179737Sjfv cntl->lro_flushed = 0; 88179737Sjfv 89179737Sjfv for (i = 0; i < LRO_ENTRIES; i++) { 90179737Sjfv lro = (struct lro_entry *) malloc(sizeof (struct lro_entry), 91179737Sjfv M_DEVBUF, M_NOWAIT | M_ZERO); 92179737Sjfv if (lro == NULL) { 93179737Sjfv if (i == 0) 94179737Sjfv error = ENOMEM; 95179737Sjfv break; 96179737Sjfv } 97179737Sjfv cntl->lro_cnt = i; 98179737Sjfv SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 99179737Sjfv } 100179737Sjfv 101179737Sjfv return (error); 102179737Sjfv} 103179737Sjfv 104179737Sjfvvoid 105179737Sjfvtcp_lro_free(struct lro_ctrl *cntl) 106179737Sjfv{ 107179737Sjfv struct lro_entry *entry; 108179737Sjfv 109179737Sjfv while (!SLIST_EMPTY(&cntl->lro_free)) { 110179737Sjfv entry = SLIST_FIRST(&cntl->lro_free); 111179737Sjfv SLIST_REMOVE_HEAD(&cntl->lro_free, next); 112179737Sjfv free(entry, M_DEVBUF); 113179737Sjfv } 114179737Sjfv} 115179737Sjfv 116179737Sjfvvoid 117179737Sjfvtcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro) 118179737Sjfv{ 119179737Sjfv struct ifnet *ifp; 120179737Sjfv struct ip *ip; 121179737Sjfv struct tcphdr *tcp; 122179737Sjfv uint32_t *ts_ptr; 123179737Sjfv uint32_t tcplen, tcp_csum; 124179737Sjfv 125179737Sjfv 126179737Sjfv if (lro->append_cnt) { 127179737Sjfv /* incorporate the new len into the ip header and 128179737Sjfv * re-calculate the checksum */ 129179737Sjfv ip = lro->ip; 130179737Sjfv ip->ip_len = htons(lro->len - ETHER_HDR_LEN); 131179737Sjfv ip->ip_sum = 0; 132179737Sjfv ip->ip_sum = 0xffff ^ 133179737Sjfv do_csum_data((uint16_t*)ip, 134179737Sjfv sizeof (*ip)); 135179737Sjfv 136179737Sjfv lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED | 137179737Sjfv CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 138179737Sjfv lro->m_head->m_pkthdr.csum_data = 0xffff; 139179737Sjfv lro->m_head->m_pkthdr.len = lro->len; 140179737Sjfv 141179737Sjfv /* incorporate the latest ack into the tcp header */ 142179737Sjfv tcp = (struct tcphdr *) (ip + 1); 143179737Sjfv tcp->th_ack = lro->ack_seq; 144179737Sjfv tcp->th_win = lro->window; 145179737Sjfv /* incorporate latest timestamp into the tcp header */ 146179737Sjfv if (lro->timestamp) { 147179737Sjfv ts_ptr = (uint32_t *)(tcp + 1); 148179737Sjfv ts_ptr[1] = htonl(lro->tsval); 149179737Sjfv ts_ptr[2] = lro->tsecr; 150179737Sjfv } 151179737Sjfv /* 152179737Sjfv * update checksum in tcp header by re-calculating the 153179737Sjfv * tcp pseudoheader checksum, and adding it to the checksum 154179737Sjfv * of the tcp payload data 155179737Sjfv */ 156179737Sjfv tcp->th_sum = 0; 157179737Sjfv tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN; 158179737Sjfv tcp_csum = lro->data_csum; 159179737Sjfv tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 160179737Sjfv htons(tcplen + IPPROTO_TCP)); 161179737Sjfv tcp_csum += do_csum_data((uint16_t*)tcp, 162179737Sjfv tcp->th_off << 2); 163179737Sjfv tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 164179737Sjfv tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 165179737Sjfv tcp->th_sum = 0xffff ^ tcp_csum; 166179737Sjfv } 167179737Sjfv ifp = cntl->ifp; 168179737Sjfv (*ifp->if_input)(cntl->ifp, lro->m_head); 169179737Sjfv cntl->lro_queued += lro->append_cnt + 1; 170179737Sjfv cntl->lro_flushed++; 171179737Sjfv lro->m_head = NULL; 172179737Sjfv lro->timestamp = 0; 173179737Sjfv lro->append_cnt = 0; 174179737Sjfv SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 175179737Sjfv} 176179737Sjfv 177179737Sjfvint 178179737Sjfvtcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum) 179179737Sjfv{ 180179737Sjfv struct ether_header *eh; 181179737Sjfv struct ip *ip; 182179737Sjfv struct tcphdr *tcp; 183179737Sjfv uint32_t *ts_ptr; 184179737Sjfv struct mbuf *m_nxt, *m_tail; 185179737Sjfv struct lro_entry *lro; 186179737Sjfv int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len; 187182089Skmacy int opt_bytes, trim, csum_flags; 188179737Sjfv uint32_t seq, tmp_csum, device_mtu; 189179737Sjfv 190179737Sjfv 191179737Sjfv eh = mtod(m_head, struct ether_header *); 192179737Sjfv if (eh->ether_type != htons(ETHERTYPE_IP)) 193179737Sjfv return 1; 194179737Sjfv ip = (struct ip *) (eh + 1); 195179737Sjfv if (ip->ip_p != IPPROTO_TCP) 196179737Sjfv return 1; 197179737Sjfv 198179737Sjfv /* ensure there are no options */ 199179737Sjfv if ((ip->ip_hl << 2) != sizeof (*ip)) 200179737Sjfv return -1; 201179737Sjfv 202179737Sjfv /* .. and the packet is not fragmented */ 203179737Sjfv if (ip->ip_off & htons(IP_MF|IP_OFFMASK)) 204179737Sjfv return -1; 205179737Sjfv 206179737Sjfv /* verify that the IP header checksum is correct */ 207182089Skmacy csum_flags = m_head->m_pkthdr.csum_flags; 208182089Skmacy if (csum_flags & CSUM_IP_CHECKED) { 209182089Skmacy if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { 210182089Skmacy cntl->lro_bad_csum++; 211182089Skmacy return -1; 212182089Skmacy } 213182089Skmacy } else { 214182089Skmacy tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip)); 215182089Skmacy if (__predict_false((tmp_csum ^ 0xffff) != 0)) { 216182089Skmacy cntl->lro_bad_csum++; 217182089Skmacy return -1; 218182089Skmacy } 219179737Sjfv } 220182089Skmacy 221179737Sjfv /* find the TCP header */ 222179737Sjfv tcp = (struct tcphdr *) (ip + 1); 223179737Sjfv 224179737Sjfv /* Get the TCP checksum if we dont have it */ 225179737Sjfv if (!csum) 226179737Sjfv csum = tcp->th_sum; 227179737Sjfv 228179737Sjfv /* ensure no bits set besides ack or psh */ 229179737Sjfv if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0) 230179737Sjfv return -1; 231179737Sjfv 232179737Sjfv /* check for timestamps. Since the only option we handle are 233179737Sjfv timestamps, we only have to handle the simple case of 234179737Sjfv aligned timestamps */ 235179737Sjfv 236179737Sjfv opt_bytes = (tcp->th_off << 2) - sizeof (*tcp); 237179737Sjfv tcp_hdr_len = sizeof (*tcp) + opt_bytes; 238179737Sjfv ts_ptr = (uint32_t *)(tcp + 1); 239179737Sjfv if (opt_bytes != 0) { 240179737Sjfv if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) || 241179737Sjfv (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 242179737Sjfv TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))) 243179737Sjfv return -1; 244179737Sjfv } 245179737Sjfv 246179737Sjfv ip_len = ntohs(ip->ip_len); 247179737Sjfv tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip); 248179737Sjfv 249179737Sjfv 250179737Sjfv /* 251179737Sjfv * If frame is padded beyond the end of the IP packet, 252179737Sjfv * then we must trim the extra bytes off the end. 253179737Sjfv */ 254179737Sjfv tot_len = m_head->m_pkthdr.len; 255179737Sjfv trim = tot_len - (ip_len + ETHER_HDR_LEN); 256179737Sjfv if (trim != 0) { 257179737Sjfv if (trim < 0) { 258179737Sjfv /* truncated packet */ 259179737Sjfv return -1; 260179737Sjfv } 261179737Sjfv m_adj(m_head, -trim); 262179737Sjfv tot_len = m_head->m_pkthdr.len; 263179737Sjfv } 264179737Sjfv 265179737Sjfv m_nxt = m_head; 266179737Sjfv m_tail = NULL; /* -Wuninitialized */ 267179737Sjfv while (m_nxt != NULL) { 268179737Sjfv m_tail = m_nxt; 269179737Sjfv m_nxt = m_tail->m_next; 270179737Sjfv } 271179737Sjfv 272179737Sjfv hlen = ip_len + ETHER_HDR_LEN - tcp_data_len; 273179737Sjfv seq = ntohl(tcp->th_seq); 274179737Sjfv 275179737Sjfv SLIST_FOREACH(lro, &cntl->lro_active, next) { 276179737Sjfv if (lro->source_port == tcp->th_sport && 277179737Sjfv lro->dest_port == tcp->th_dport && 278179737Sjfv lro->source_ip == ip->ip_src.s_addr && 279179737Sjfv lro->dest_ip == ip->ip_dst.s_addr) { 280179737Sjfv /* Try to append it */ 281179737Sjfv 282179737Sjfv if (__predict_false(seq != lro->next_seq)) { 283179737Sjfv /* out of order packet */ 284179737Sjfv SLIST_REMOVE(&cntl->lro_active, lro, 285179737Sjfv lro_entry, next); 286179737Sjfv tcp_lro_flush(cntl, lro); 287179737Sjfv return -1; 288179737Sjfv } 289179737Sjfv 290179737Sjfv if (opt_bytes) { 291179737Sjfv uint32_t tsval = ntohl(*(ts_ptr + 1)); 292179737Sjfv /* make sure timestamp values are increasing */ 293179737Sjfv if (__predict_false(lro->tsval > tsval || 294179737Sjfv *(ts_ptr + 2) == 0)) { 295179737Sjfv return -1; 296179737Sjfv } 297179737Sjfv lro->tsval = tsval; 298179737Sjfv lro->tsecr = *(ts_ptr + 2); 299179737Sjfv } 300179737Sjfv 301179737Sjfv lro->next_seq += tcp_data_len; 302179737Sjfv lro->ack_seq = tcp->th_ack; 303179737Sjfv lro->window = tcp->th_win; 304179737Sjfv lro->append_cnt++; 305179737Sjfv if (tcp_data_len == 0) { 306179737Sjfv m_freem(m_head); 307179737Sjfv return 0; 308179737Sjfv } 309179737Sjfv /* subtract off the checksum of the tcp header 310179737Sjfv * from the hardware checksum, and add it to the 311179737Sjfv * stored tcp data checksum. Byteswap the checksum 312179737Sjfv * if the total length so far is odd 313179737Sjfv */ 314179737Sjfv tmp_csum = do_csum_data((uint16_t*)tcp, 315179737Sjfv tcp_hdr_len); 316179737Sjfv csum = csum + (tmp_csum ^ 0xffff); 317179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 318179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 319179737Sjfv if (lro->len & 0x1) { 320179737Sjfv /* Odd number of bytes so far, flip bytes */ 321179737Sjfv csum = ((csum << 8) | (csum >> 8)) & 0xffff; 322179737Sjfv } 323179737Sjfv csum = csum + lro->data_csum; 324179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 325179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 326179737Sjfv lro->data_csum = csum; 327179737Sjfv 328179737Sjfv lro->len += tcp_data_len; 329179737Sjfv 330179737Sjfv /* adjust mbuf so that m->m_data points to 331179737Sjfv the first byte of the payload */ 332179737Sjfv m_adj(m_head, hlen); 333179737Sjfv /* append mbuf chain */ 334179737Sjfv lro->m_tail->m_next = m_head; 335179737Sjfv /* advance the last pointer */ 336179737Sjfv lro->m_tail = m_tail; 337179737Sjfv /* flush packet if required */ 338179737Sjfv device_mtu = cntl->ifp->if_mtu; 339179737Sjfv if (lro->len > (65535 - device_mtu)) { 340179737Sjfv SLIST_REMOVE(&cntl->lro_active, lro, 341179737Sjfv lro_entry, next); 342179737Sjfv tcp_lro_flush(cntl, lro); 343179737Sjfv } 344179737Sjfv return 0; 345179737Sjfv } 346179737Sjfv } 347179737Sjfv 348179737Sjfv if (SLIST_EMPTY(&cntl->lro_free)) 349179737Sjfv return -1; 350179737Sjfv 351179737Sjfv /* start a new chain */ 352179737Sjfv lro = SLIST_FIRST(&cntl->lro_free); 353179737Sjfv SLIST_REMOVE_HEAD(&cntl->lro_free, next); 354179737Sjfv SLIST_INSERT_HEAD(&cntl->lro_active, lro, next); 355179737Sjfv lro->source_port = tcp->th_sport; 356179737Sjfv lro->dest_port = tcp->th_dport; 357179737Sjfv lro->source_ip = ip->ip_src.s_addr; 358179737Sjfv lro->dest_ip = ip->ip_dst.s_addr; 359179737Sjfv lro->next_seq = seq + tcp_data_len; 360179737Sjfv lro->mss = tcp_data_len; 361179737Sjfv lro->ack_seq = tcp->th_ack; 362179737Sjfv lro->window = tcp->th_win; 363179737Sjfv 364179737Sjfv /* save the checksum of just the TCP payload by 365179737Sjfv * subtracting off the checksum of the TCP header from 366179737Sjfv * the entire hardware checksum 367179737Sjfv * Since IP header checksum is correct, checksum over 368179737Sjfv * the IP header is -0. Substracting -0 is unnecessary. 369179737Sjfv */ 370179737Sjfv tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len); 371179737Sjfv csum = csum + (tmp_csum ^ 0xffff); 372179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 373179737Sjfv csum = (csum & 0xffff) + (csum >> 16); 374179737Sjfv lro->data_csum = csum; 375179737Sjfv 376179737Sjfv lro->ip = ip; 377179737Sjfv /* record timestamp if it is present */ 378179737Sjfv if (opt_bytes) { 379179737Sjfv lro->timestamp = 1; 380179737Sjfv lro->tsval = ntohl(*(ts_ptr + 1)); 381179737Sjfv lro->tsecr = *(ts_ptr + 2); 382179737Sjfv } 383179737Sjfv lro->len = tot_len; 384179737Sjfv lro->m_head = m_head; 385179737Sjfv lro->m_tail = m_tail; 386179737Sjfv return 0; 387179737Sjfv} 388