tcp_lro.c revision 179737
1/****************************************************************************** 2 3Copyright (c) 2007, Myricom Inc. 4Copyright (c) 2008, Intel Corporation. 5All rights reserved. 6 7Redistribution and use in source and binary forms, with or without 8modification, are permitted provided that the following conditions are met: 9 10 1. Redistributions of source code must retain the above copyright notice, 11 this list of conditions and the following disclaimer. 12 13 2. Neither the name of the Myricom Inc, nor the names of its 14 contributors may be used to endorse or promote products derived from 15 this software without specific prior written permission. 16 17 3. Neither the name of the Intel Corporation, nor the names of its 18 contributors may be used to endorse or promote products derived from 19 this software without specific prior written permission. 20 21THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 25LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31POSSIBILITY OF SUCH DAMAGE. 32 33$FreeBSD: head/sys/netinet/tcp_lro.c 179737 2008-06-11 22:12:50Z jfv $ 34***************************************************************************/ 35 36#include <sys/param.h> 37#include <sys/systm.h> 38#include <sys/endian.h> 39#include <sys/mbuf.h> 40#include <sys/kernel.h> 41#include <sys/socket.h> 42 43#include <net/if.h> 44#include <net/ethernet.h> 45#include <net/if_media.h> 46 47#include <netinet/in_systm.h> 48#include <netinet/in.h> 49#include <netinet/ip.h> 50#include <netinet/tcp.h> 51#include <netinet/tcp_lro.h> 52 53#include <machine/bus.h> 54#include <machine/in_cksum.h> 55 56 57static uint16_t do_csum_data(uint16_t *raw, int len) 58{ 59 uint32_t csum; 60 csum = 0; 61 while (len > 0) { 62 csum += *raw; 63 raw++; 64 csum += *raw; 65 raw++; 66 len -= 4; 67 } 68 csum = (csum >> 16) + (csum & 0xffff); 69 csum = (csum >> 16) + (csum & 0xffff); 70 return (uint16_t)csum; 71} 72 73/* 74 * Allocate and init the LRO data structures 75 */ 76int 77tcp_lro_init(struct lro_ctrl *cntl) 78{ 79 struct lro_entry *lro; 80 int i, error = 0; 81 82 SLIST_INIT(&cntl->lro_free); 83 SLIST_INIT(&cntl->lro_active); 84 85 cntl->lro_bad_csum = 0; 86 cntl->lro_queued = 0; 87 cntl->lro_flushed = 0; 88 89 for (i = 0; i < LRO_ENTRIES; i++) { 90 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry), 91 M_DEVBUF, M_NOWAIT | M_ZERO); 92 if (lro == NULL) { 93 if (i == 0) 94 error = ENOMEM; 95 break; 96 } 97 cntl->lro_cnt = i; 98 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 99 } 100 101 return (error); 102} 103 104void 105tcp_lro_free(struct lro_ctrl *cntl) 106{ 107 struct lro_entry *entry; 108 109 while (!SLIST_EMPTY(&cntl->lro_free)) { 110 entry = SLIST_FIRST(&cntl->lro_free); 111 SLIST_REMOVE_HEAD(&cntl->lro_free, next); 112 free(entry, M_DEVBUF); 113 } 114} 115 116void 117tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro) 118{ 119 struct ifnet *ifp; 120 struct ip *ip; 121 struct tcphdr *tcp; 122 uint32_t *ts_ptr; 123 uint32_t tcplen, tcp_csum; 124 125 126 if (lro->append_cnt) { 127 /* incorporate the new len into the ip header and 128 * re-calculate the checksum */ 129 ip = lro->ip; 130 ip->ip_len = htons(lro->len - ETHER_HDR_LEN); 131 ip->ip_sum = 0; 132 ip->ip_sum = 0xffff ^ 133 do_csum_data((uint16_t*)ip, 134 sizeof (*ip)); 135 136 lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED | 137 CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 138 lro->m_head->m_pkthdr.csum_data = 0xffff; 139 lro->m_head->m_pkthdr.len = lro->len; 140 141 /* incorporate the latest ack into the tcp header */ 142 tcp = (struct tcphdr *) (ip + 1); 143 tcp->th_ack = lro->ack_seq; 144 tcp->th_win = lro->window; 145 /* incorporate latest timestamp into the tcp header */ 146 if (lro->timestamp) { 147 ts_ptr = (uint32_t *)(tcp + 1); 148 ts_ptr[1] = htonl(lro->tsval); 149 ts_ptr[2] = lro->tsecr; 150 } 151 /* 152 * update checksum in tcp header by re-calculating the 153 * tcp pseudoheader checksum, and adding it to the checksum 154 * of the tcp payload data 155 */ 156 tcp->th_sum = 0; 157 tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN; 158 tcp_csum = lro->data_csum; 159 tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 160 htons(tcplen + IPPROTO_TCP)); 161 tcp_csum += do_csum_data((uint16_t*)tcp, 162 tcp->th_off << 2); 163 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 164 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 165 tcp->th_sum = 0xffff ^ tcp_csum; 166 } 167 ifp = cntl->ifp; 168 (*ifp->if_input)(cntl->ifp, lro->m_head); 169 cntl->lro_queued += lro->append_cnt + 1; 170 cntl->lro_flushed++; 171 lro->m_head = NULL; 172 lro->timestamp = 0; 173 lro->append_cnt = 0; 174 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 175} 176 177int 178tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum) 179{ 180 struct ether_header *eh; 181 struct ip *ip; 182 struct tcphdr *tcp; 183 uint32_t *ts_ptr; 184 struct mbuf *m_nxt, *m_tail; 185 struct lro_entry *lro; 186 int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len; 187 int opt_bytes, trim; 188 uint32_t seq, tmp_csum, device_mtu; 189 190 191 eh = mtod(m_head, struct ether_header *); 192 if (eh->ether_type != htons(ETHERTYPE_IP)) 193 return 1; 194 ip = (struct ip *) (eh + 1); 195 if (ip->ip_p != IPPROTO_TCP) 196 return 1; 197 198 /* ensure there are no options */ 199 if ((ip->ip_hl << 2) != sizeof (*ip)) 200 return -1; 201 202 /* .. and the packet is not fragmented */ 203 if (ip->ip_off & htons(IP_MF|IP_OFFMASK)) 204 return -1; 205 206 /* verify that the IP header checksum is correct */ 207 tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip)); 208 if (__predict_false((tmp_csum ^ 0xffff) != 0)) { 209 cntl->lro_bad_csum++; 210 return -1; 211 } 212 213 /* find the TCP header */ 214 tcp = (struct tcphdr *) (ip + 1); 215 216 /* Get the TCP checksum if we dont have it */ 217 if (!csum) 218 csum = tcp->th_sum; 219 220 /* ensure no bits set besides ack or psh */ 221 if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0) 222 return -1; 223 224 /* check for timestamps. Since the only option we handle are 225 timestamps, we only have to handle the simple case of 226 aligned timestamps */ 227 228 opt_bytes = (tcp->th_off << 2) - sizeof (*tcp); 229 tcp_hdr_len = sizeof (*tcp) + opt_bytes; 230 ts_ptr = (uint32_t *)(tcp + 1); 231 if (opt_bytes != 0) { 232 if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) || 233 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 234 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))) 235 return -1; 236 } 237 238 ip_len = ntohs(ip->ip_len); 239 tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip); 240 241 242 /* 243 * If frame is padded beyond the end of the IP packet, 244 * then we must trim the extra bytes off the end. 245 */ 246 tot_len = m_head->m_pkthdr.len; 247 trim = tot_len - (ip_len + ETHER_HDR_LEN); 248 if (trim != 0) { 249 if (trim < 0) { 250 /* truncated packet */ 251 return -1; 252 } 253 m_adj(m_head, -trim); 254 tot_len = m_head->m_pkthdr.len; 255 } 256 257 m_nxt = m_head; 258 m_tail = NULL; /* -Wuninitialized */ 259 while (m_nxt != NULL) { 260 m_tail = m_nxt; 261 m_nxt = m_tail->m_next; 262 } 263 264 hlen = ip_len + ETHER_HDR_LEN - tcp_data_len; 265 seq = ntohl(tcp->th_seq); 266 267 SLIST_FOREACH(lro, &cntl->lro_active, next) { 268 if (lro->source_port == tcp->th_sport && 269 lro->dest_port == tcp->th_dport && 270 lro->source_ip == ip->ip_src.s_addr && 271 lro->dest_ip == ip->ip_dst.s_addr) { 272 /* Try to append it */ 273 274 if (__predict_false(seq != lro->next_seq)) { 275 /* out of order packet */ 276 SLIST_REMOVE(&cntl->lro_active, lro, 277 lro_entry, next); 278 tcp_lro_flush(cntl, lro); 279 return -1; 280 } 281 282 if (opt_bytes) { 283 uint32_t tsval = ntohl(*(ts_ptr + 1)); 284 /* make sure timestamp values are increasing */ 285 if (__predict_false(lro->tsval > tsval || 286 *(ts_ptr + 2) == 0)) { 287 return -1; 288 } 289 lro->tsval = tsval; 290 lro->tsecr = *(ts_ptr + 2); 291 } 292 293 lro->next_seq += tcp_data_len; 294 lro->ack_seq = tcp->th_ack; 295 lro->window = tcp->th_win; 296 lro->append_cnt++; 297 if (tcp_data_len == 0) { 298 m_freem(m_head); 299 return 0; 300 } 301 /* subtract off the checksum of the tcp header 302 * from the hardware checksum, and add it to the 303 * stored tcp data checksum. Byteswap the checksum 304 * if the total length so far is odd 305 */ 306 tmp_csum = do_csum_data((uint16_t*)tcp, 307 tcp_hdr_len); 308 csum = csum + (tmp_csum ^ 0xffff); 309 csum = (csum & 0xffff) + (csum >> 16); 310 csum = (csum & 0xffff) + (csum >> 16); 311 if (lro->len & 0x1) { 312 /* Odd number of bytes so far, flip bytes */ 313 csum = ((csum << 8) | (csum >> 8)) & 0xffff; 314 } 315 csum = csum + lro->data_csum; 316 csum = (csum & 0xffff) + (csum >> 16); 317 csum = (csum & 0xffff) + (csum >> 16); 318 lro->data_csum = csum; 319 320 lro->len += tcp_data_len; 321 322 /* adjust mbuf so that m->m_data points to 323 the first byte of the payload */ 324 m_adj(m_head, hlen); 325 /* append mbuf chain */ 326 lro->m_tail->m_next = m_head; 327 /* advance the last pointer */ 328 lro->m_tail = m_tail; 329 /* flush packet if required */ 330 device_mtu = cntl->ifp->if_mtu; 331 if (lro->len > (65535 - device_mtu)) { 332 SLIST_REMOVE(&cntl->lro_active, lro, 333 lro_entry, next); 334 tcp_lro_flush(cntl, lro); 335 } 336 return 0; 337 } 338 } 339 340 if (SLIST_EMPTY(&cntl->lro_free)) 341 return -1; 342 343 /* start a new chain */ 344 lro = SLIST_FIRST(&cntl->lro_free); 345 SLIST_REMOVE_HEAD(&cntl->lro_free, next); 346 SLIST_INSERT_HEAD(&cntl->lro_active, lro, next); 347 lro->source_port = tcp->th_sport; 348 lro->dest_port = tcp->th_dport; 349 lro->source_ip = ip->ip_src.s_addr; 350 lro->dest_ip = ip->ip_dst.s_addr; 351 lro->next_seq = seq + tcp_data_len; 352 lro->mss = tcp_data_len; 353 lro->ack_seq = tcp->th_ack; 354 lro->window = tcp->th_win; 355 356 /* save the checksum of just the TCP payload by 357 * subtracting off the checksum of the TCP header from 358 * the entire hardware checksum 359 * Since IP header checksum is correct, checksum over 360 * the IP header is -0. Substracting -0 is unnecessary. 361 */ 362 tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len); 363 csum = csum + (tmp_csum ^ 0xffff); 364 csum = (csum & 0xffff) + (csum >> 16); 365 csum = (csum & 0xffff) + (csum >> 16); 366 lro->data_csum = csum; 367 368 lro->ip = ip; 369 /* record timestamp if it is present */ 370 if (opt_bytes) { 371 lro->timestamp = 1; 372 lro->tsval = ntohl(*(ts_ptr + 1)); 373 lro->tsecr = *(ts_ptr + 2); 374 } 375 lro->len = tot_len; 376 lro->m_head = m_head; 377 lro->m_tail = m_tail; 378 return 0; 379} 380