/* tcp_lro.c revision 235474 */
1/*- 2 * Copyright (c) 2007, Myricom Inc. 3 * Copyright (c) 2008, Intel Corporation. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
26 * 27 * $FreeBSD: head/sys/netinet/tcp_lro.c 235474 2012-05-15 13:23:44Z bz $ 28 */ 29 30#include <sys/param.h> 31#include <sys/systm.h> 32#include <sys/endian.h> 33#include <sys/mbuf.h> 34#include <sys/kernel.h> 35#include <sys/socket.h> 36 37#include <net/if.h> 38#include <net/ethernet.h> 39#include <net/if_media.h> 40 41#include <netinet/in_systm.h> 42#include <netinet/in.h> 43#include <netinet/ip.h> 44#include <netinet/tcp.h> 45#include <netinet/tcp_lro.h> 46 47#include <machine/bus.h> 48#include <machine/in_cksum.h> 49 50 51static uint16_t do_csum_data(uint16_t *raw, int len) 52{ 53 uint32_t csum; 54 csum = 0; 55 while (len > 0) { 56 csum += *raw; 57 raw++; 58 csum += *raw; 59 raw++; 60 len -= 4; 61 } 62 csum = (csum >> 16) + (csum & 0xffff); 63 csum = (csum >> 16) + (csum & 0xffff); 64 return (uint16_t)csum; 65} 66 67/* 68 * Allocate and init the LRO data structures 69 */ 70int 71tcp_lro_init(struct lro_ctrl *cntl) 72{ 73 struct lro_entry *lro; 74 int i, error = 0; 75 76 SLIST_INIT(&cntl->lro_free); 77 SLIST_INIT(&cntl->lro_active); 78 79 cntl->lro_bad_csum = 0; 80 cntl->lro_queued = 0; 81 cntl->lro_flushed = 0; 82 83 for (i = 0; i < LRO_ENTRIES; i++) { 84 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry), 85 M_DEVBUF, M_NOWAIT | M_ZERO); 86 if (lro == NULL) { 87 if (i == 0) 88 error = ENOMEM; 89 break; 90 } 91 cntl->lro_cnt = i; 92 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 93 } 94 95 return (error); 96} 97 98void 99tcp_lro_free(struct lro_ctrl *cntl) 100{ 101 struct lro_entry *entry; 102 103 while (!SLIST_EMPTY(&cntl->lro_free)) { 104 entry = SLIST_FIRST(&cntl->lro_free); 105 SLIST_REMOVE_HEAD(&cntl->lro_free, next); 106 free(entry, M_DEVBUF); 107 } 108} 109 110void 111tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro) 112{ 113 struct ifnet *ifp; 114 struct ip *ip; 115 struct tcphdr *tcp; 116 uint32_t *ts_ptr; 117 uint32_t tcplen, tcp_csum; 118 119 120 if (lro->append_cnt) { 121 /* incorporate the new len into the ip header and 122 
* re-calculate the checksum */ 123 ip = lro->ip; 124 ip->ip_len = htons(lro->len - ETHER_HDR_LEN); 125 ip->ip_sum = 0; 126 ip->ip_sum = 0xffff ^ 127 do_csum_data((uint16_t*)ip, 128 sizeof (*ip)); 129 130 lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED | 131 CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 132 lro->m_head->m_pkthdr.csum_data = 0xffff; 133 lro->m_head->m_pkthdr.len = lro->len; 134 135 /* incorporate the latest ack into the tcp header */ 136 tcp = (struct tcphdr *) (ip + 1); 137 tcp->th_ack = lro->ack_seq; 138 tcp->th_win = lro->window; 139 /* incorporate latest timestamp into the tcp header */ 140 if (lro->timestamp) { 141 ts_ptr = (uint32_t *)(tcp + 1); 142 ts_ptr[1] = htonl(lro->tsval); 143 ts_ptr[2] = lro->tsecr; 144 } 145 /* 146 * update checksum in tcp header by re-calculating the 147 * tcp pseudoheader checksum, and adding it to the checksum 148 * of the tcp payload data 149 */ 150 tcp->th_sum = 0; 151 tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN; 152 tcp_csum = lro->data_csum; 153 tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 154 htons(tcplen + IPPROTO_TCP)); 155 tcp_csum += do_csum_data((uint16_t*)tcp, 156 tcp->th_off << 2); 157 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 158 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); 159 tcp->th_sum = 0xffff ^ tcp_csum; 160 } 161 ifp = cntl->ifp; 162 (*ifp->if_input)(cntl->ifp, lro->m_head); 163 cntl->lro_queued += lro->append_cnt + 1; 164 cntl->lro_flushed++; 165 lro->m_head = NULL; 166 lro->timestamp = 0; 167 lro->append_cnt = 0; 168 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); 169} 170 171int 172tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum) 173{ 174 struct ether_header *eh; 175 struct ip *ip; 176 struct tcphdr *tcp; 177 uint32_t *ts_ptr; 178 struct mbuf *m_nxt, *m_tail; 179 struct lro_entry *lro; 180 int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len; 181 int opt_bytes, trim, csum_flags; 182 uint32_t seq, tmp_csum, device_mtu; 183 184 185 eh 
= mtod(m_head, struct ether_header *); 186 if (eh->ether_type != htons(ETHERTYPE_IP)) 187 return 1; 188 ip = (struct ip *) (eh + 1); 189 if (ip->ip_p != IPPROTO_TCP) 190 return 1; 191 192 /* ensure there are no options */ 193 if ((ip->ip_hl << 2) != sizeof (*ip)) 194 return -1; 195 196 /* .. and the packet is not fragmented */ 197 if (ip->ip_off & htons(IP_MF|IP_OFFMASK)) 198 return -1; 199 200 /* verify that the IP header checksum is correct */ 201 csum_flags = m_head->m_pkthdr.csum_flags; 202 if (csum_flags & CSUM_IP_CHECKED) { 203 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { 204 cntl->lro_bad_csum++; 205 return -1; 206 } 207 } else { 208 tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip)); 209 if (__predict_false((tmp_csum ^ 0xffff) != 0)) { 210 cntl->lro_bad_csum++; 211 return -1; 212 } 213 } 214 215 /* find the TCP header */ 216 tcp = (struct tcphdr *) (ip + 1); 217 218 /* Get the TCP checksum if we dont have it */ 219 if (!csum) 220 csum = tcp->th_sum; 221 222 /* ensure no bits set besides ack or psh */ 223 if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0) 224 return -1; 225 226 /* check for timestamps. Since the only option we handle are 227 timestamps, we only have to handle the simple case of 228 aligned timestamps */ 229 230 opt_bytes = (tcp->th_off << 2) - sizeof (*tcp); 231 tcp_hdr_len = sizeof (*tcp) + opt_bytes; 232 ts_ptr = (uint32_t *)(tcp + 1); 233 if (opt_bytes != 0) { 234 if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) || 235 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 236 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))) 237 return -1; 238 } 239 240 ip_len = ntohs(ip->ip_len); 241 tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip); 242 243 244 /* 245 * If frame is padded beyond the end of the IP packet, 246 * then we must trim the extra bytes off the end. 
247 */ 248 tot_len = m_head->m_pkthdr.len; 249 trim = tot_len - (ip_len + ETHER_HDR_LEN); 250 if (trim != 0) { 251 if (trim < 0) { 252 /* truncated packet */ 253 return -1; 254 } 255 m_adj(m_head, -trim); 256 tot_len = m_head->m_pkthdr.len; 257 } 258 259 m_nxt = m_head; 260 m_tail = NULL; /* -Wuninitialized */ 261 while (m_nxt != NULL) { 262 m_tail = m_nxt; 263 m_nxt = m_tail->m_next; 264 } 265 266 hlen = ip_len + ETHER_HDR_LEN - tcp_data_len; 267 seq = ntohl(tcp->th_seq); 268 269 SLIST_FOREACH(lro, &cntl->lro_active, next) { 270 if (lro->source_port == tcp->th_sport && 271 lro->dest_port == tcp->th_dport && 272 lro->source_ip == ip->ip_src.s_addr && 273 lro->dest_ip == ip->ip_dst.s_addr) { 274 /* Flush now if appending will result in overflow. */ 275 if (lro->len > (65535 - tcp_data_len)) { 276 SLIST_REMOVE(&cntl->lro_active, lro, 277 lro_entry, next); 278 tcp_lro_flush(cntl, lro); 279 break; 280 } 281 282 /* Try to append it */ 283 284 if (__predict_false(seq != lro->next_seq || 285 (tcp_data_len == 0 && 286 lro->ack_seq == tcp->th_ack))) { 287 /* out of order packet or dup ack */ 288 SLIST_REMOVE(&cntl->lro_active, lro, 289 lro_entry, next); 290 tcp_lro_flush(cntl, lro); 291 return -1; 292 } 293 294 if (opt_bytes) { 295 uint32_t tsval = ntohl(*(ts_ptr + 1)); 296 /* make sure timestamp values are increasing */ 297 if (__predict_false(lro->tsval > tsval || 298 *(ts_ptr + 2) == 0)) { 299 return -1; 300 } 301 lro->tsval = tsval; 302 lro->tsecr = *(ts_ptr + 2); 303 } 304 305 lro->next_seq += tcp_data_len; 306 lro->ack_seq = tcp->th_ack; 307 lro->window = tcp->th_win; 308 lro->append_cnt++; 309 if (tcp_data_len == 0) { 310 m_freem(m_head); 311 return 0; 312 } 313 /* subtract off the checksum of the tcp header 314 * from the hardware checksum, and add it to the 315 * stored tcp data checksum. 
Byteswap the checksum 316 * if the total length so far is odd 317 */ 318 tmp_csum = do_csum_data((uint16_t*)tcp, 319 tcp_hdr_len); 320 csum = csum + (tmp_csum ^ 0xffff); 321 csum = (csum & 0xffff) + (csum >> 16); 322 csum = (csum & 0xffff) + (csum >> 16); 323 if (lro->len & 0x1) { 324 /* Odd number of bytes so far, flip bytes */ 325 csum = ((csum << 8) | (csum >> 8)) & 0xffff; 326 } 327 csum = csum + lro->data_csum; 328 csum = (csum & 0xffff) + (csum >> 16); 329 csum = (csum & 0xffff) + (csum >> 16); 330 lro->data_csum = csum; 331 332 lro->len += tcp_data_len; 333 334 /* adjust mbuf so that m->m_data points to 335 the first byte of the payload */ 336 m_adj(m_head, hlen); 337 /* append mbuf chain */ 338 lro->m_tail->m_next = m_head; 339 /* advance the last pointer */ 340 lro->m_tail = m_tail; 341 /* flush packet if required */ 342 device_mtu = cntl->ifp->if_mtu; 343 if (lro->len > (65535 - device_mtu)) { 344 SLIST_REMOVE(&cntl->lro_active, lro, 345 lro_entry, next); 346 tcp_lro_flush(cntl, lro); 347 } 348 return 0; 349 } 350 } 351 352 if (SLIST_EMPTY(&cntl->lro_free)) 353 return -1; 354 355 /* start a new chain */ 356 lro = SLIST_FIRST(&cntl->lro_free); 357 SLIST_REMOVE_HEAD(&cntl->lro_free, next); 358 SLIST_INSERT_HEAD(&cntl->lro_active, lro, next); 359 lro->source_port = tcp->th_sport; 360 lro->dest_port = tcp->th_dport; 361 lro->source_ip = ip->ip_src.s_addr; 362 lro->dest_ip = ip->ip_dst.s_addr; 363 lro->next_seq = seq + tcp_data_len; 364 lro->mss = tcp_data_len; 365 lro->ack_seq = tcp->th_ack; 366 lro->window = tcp->th_win; 367 368 /* save the checksum of just the TCP payload by 369 * subtracting off the checksum of the TCP header from 370 * the entire hardware checksum 371 * Since IP header checksum is correct, checksum over 372 * the IP header is -0. Substracting -0 is unnecessary. 
373 */ 374 tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len); 375 csum = csum + (tmp_csum ^ 0xffff); 376 csum = (csum & 0xffff) + (csum >> 16); 377 csum = (csum & 0xffff) + (csum >> 16); 378 lro->data_csum = csum; 379 380 lro->ip = ip; 381 /* record timestamp if it is present */ 382 if (opt_bytes) { 383 lro->timestamp = 1; 384 lro->tsval = ntohl(*(ts_ptr + 1)); 385 lro->tsecr = *(ts_ptr + 2); 386 } 387 lro->len = tot_len; 388 lro->m_head = m_head; 389 lro->m_tail = m_tail; 390 return 0; 391} 392