Deleted Added
sdiff udiff text old ( 255010 ) new ( 284961 )
full compact
1/*-
2 * Copyright (c) 2007, Myricom Inc.
3 * Copyright (c) 2008, Intel Corporation.
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Bjoern Zeeb
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_lro.c 284961 2015-06-30 17:19:58Z np $");
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/mbuf.h>
41#include <sys/kernel.h>
42#include <sys/socket.h>
43
44#include <net/if.h>
45#include <net/if_var.h>
46#include <net/ethernet.h>
47#include <net/vnet.h>
48
49#include <netinet/in_systm.h>
50#include <netinet/in.h>
51#include <netinet/ip6.h>
52#include <netinet/ip.h>
53#include <netinet/ip_var.h>
54#include <netinet/tcp.h>
55#include <netinet/tcp_lro.h>
56
57#include <netinet6/ip6_var.h>
58
59#include <machine/in_cksum.h>
60
/* Number of LRO entries allocated per RX queue unless set by the build. */
#if !defined(LRO_ENTRIES)
#define	LRO_ENTRIES	8
#endif

/*
 * Compile-time switch.  While TCP_LRO_UPDATE_CSUM is defined the IP and TCP
 * checksums of a merged packet are recomputed; when it is not defined,
 * TCP_LRO_INVALID_CSUM is the placeholder written into those fields instead.
 */
#define	TCP_LRO_UPDATE_CSUM	1
#if !defined(TCP_LRO_UPDATE_CSUM)
#define	TCP_LRO_INVALID_CSUM	0x0000
#endif
69
70int
71tcp_lro_init(struct lro_ctrl *lc)
72{
73 struct lro_entry *le;
74 int error, i;
75
76 lc->lro_bad_csum = 0;
77 lc->lro_queued = 0;
78 lc->lro_flushed = 0;
79 lc->lro_cnt = 0;
80 SLIST_INIT(&lc->lro_free);
81 SLIST_INIT(&lc->lro_active);
82
83 error = 0;
84 for (i = 0; i < LRO_ENTRIES; i++) {
85 le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
86 M_NOWAIT | M_ZERO);
87 if (le == NULL) {
88 if (i == 0)
89 error = ENOMEM;
90 break;
91 }
92 lc->lro_cnt = i + 1;
93 SLIST_INSERT_HEAD(&lc->lro_free, le, next);
94 }
95
96 return (error);
97}
98
99void
100tcp_lro_free(struct lro_ctrl *lc)
101{
102 struct lro_entry *le;
103
104 while (!SLIST_EMPTY(&lc->lro_free)) {
105 le = SLIST_FIRST(&lc->lro_free);
106 SLIST_REMOVE_HEAD(&lc->lro_free, next);
107 free(le, M_DEVBUF);
108 }
109}
110
111#ifdef TCP_LRO_UPDATE_CSUM
112static uint16_t
113tcp_lro_csum_th(struct tcphdr *th)
114{
115 uint32_t ch;
116 uint16_t *p, l;
117
118 ch = th->th_sum = 0x0000;
119 l = th->th_off;
120 p = (uint16_t *)th;
121 while (l > 0) {
122 ch += *p;
123 p++;
124 ch += *p;
125 p++;
126 l--;
127 }
128 while (ch > 0xffff)
129 ch = (ch >> 16) + (ch & 0xffff);
130
131 return (ch & 0xffff);
132}
133
134static uint16_t
135tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
136 uint16_t tcp_data_len, uint16_t csum)
137{
138 uint32_t c;
139 uint16_t cs;
140
141 c = csum;
142
143 /* Remove length from checksum. */
144 switch (le->eh_type) {
145#ifdef INET6
146 case ETHERTYPE_IPV6:
147 {
148 struct ip6_hdr *ip6;
149
150 ip6 = (struct ip6_hdr *)l3hdr;
151 if (le->append_cnt == 0)
152 cs = ip6->ip6_plen;
153 else {
154 uint32_t cx;
155
156 cx = ntohs(ip6->ip6_plen);
157 cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
158 }
159 break;
160 }
161#endif
162#ifdef INET
163 case ETHERTYPE_IP:
164 {
165 struct ip *ip4;
166
167 ip4 = (struct ip *)l3hdr;
168 if (le->append_cnt == 0)
169 cs = ip4->ip_len;
170 else {
171 cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
172 IPPROTO_TCP);
173 cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
174 htons(cs));
175 }
176 break;
177 }
178#endif
179 default:
180 cs = 0; /* Keep compiler happy. */
181 }
182
183 cs = ~cs;
184 c += cs;
185
186 /* Remove TCP header csum. */
187 cs = ~tcp_lro_csum_th(th);
188 c += cs;
189 while (c > 0xffff)
190 c = (c >> 16) + (c & 0xffff);
191
192 return (c & 0xffff);
193}
194#endif
195
196void
197tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
198{
199 struct lro_entry *le, *le_tmp;
200 struct timeval tv;
201
202 if (SLIST_EMPTY(&lc->lro_active))
203 return;
204
205 getmicrotime(&tv);
206 timevalsub(&tv, timeout);
207 SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
208 if (timevalcmp(&tv, &le->mtime, >=)) {
209 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
210 tcp_lro_flush(lc, le);
211 }
212 }
213}
214
215void
216tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
217{
218
219 if (le->append_cnt > 0) {
220 struct tcphdr *th;
221 uint16_t p_len;
222
223 p_len = htons(le->p_len);
224 switch (le->eh_type) {
225#ifdef INET6
226 case ETHERTYPE_IPV6:
227 {
228 struct ip6_hdr *ip6;
229
230 ip6 = le->le_ip6;
231 ip6->ip6_plen = p_len;
232 th = (struct tcphdr *)(ip6 + 1);
233 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
234 CSUM_PSEUDO_HDR;
235 le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
236 break;
237 }
238#endif
239#ifdef INET
240 case ETHERTYPE_IP:
241 {
242 struct ip *ip4;
243#ifdef TCP_LRO_UPDATE_CSUM
244 uint32_t cl;
245 uint16_t c;
246#endif
247
248 ip4 = le->le_ip4;
249#ifdef TCP_LRO_UPDATE_CSUM
250 /* Fix IP header checksum for new length. */
251 c = ~ip4->ip_sum;
252 cl = c;
253 c = ~ip4->ip_len;
254 cl += c + p_len;
255 while (cl > 0xffff)
256 cl = (cl >> 16) + (cl & 0xffff);
257 c = cl;
258 ip4->ip_sum = ~c;
259#else
260 ip4->ip_sum = TCP_LRO_INVALID_CSUM;
261#endif
262 ip4->ip_len = p_len;
263 th = (struct tcphdr *)(ip4 + 1);
264 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
265 CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
266 le->p_len += ETHER_HDR_LEN;
267 break;
268 }
269#endif
270 default:
271 th = NULL; /* Keep compiler happy. */
272 }
273 le->m_head->m_pkthdr.csum_data = 0xffff;
274 le->m_head->m_pkthdr.len = le->p_len;
275
276 /* Incorporate the latest ACK into the TCP header. */
277 th->th_ack = le->ack_seq;
278 th->th_win = le->window;
279 /* Incorporate latest timestamp into the TCP header. */
280 if (le->timestamp != 0) {
281 uint32_t *ts_ptr;
282
283 ts_ptr = (uint32_t *)(th + 1);
284 ts_ptr[1] = htonl(le->tsval);
285 ts_ptr[2] = le->tsecr;
286 }
287#ifdef TCP_LRO_UPDATE_CSUM
288 /* Update the TCP header checksum. */
289 le->ulp_csum += p_len;
290 le->ulp_csum += tcp_lro_csum_th(th);
291 while (le->ulp_csum > 0xffff)
292 le->ulp_csum = (le->ulp_csum >> 16) +
293 (le->ulp_csum & 0xffff);
294 th->th_sum = (le->ulp_csum & 0xffff);
295 th->th_sum = ~th->th_sum;
296#else
297 th->th_sum = TCP_LRO_INVALID_CSUM;
298#endif
299 }
300
301 (*lc->ifp->if_input)(lc->ifp, le->m_head);
302 lc->lro_queued += le->append_cnt + 1;
303 lc->lro_flushed++;
304 bzero(le, sizeof(*le));
305 SLIST_INSERT_HEAD(&lc->lro_free, le, next);
306}
307
308#ifdef INET6
309static int
310tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
311 struct tcphdr **th)
312{
313
314 /* XXX-BZ we should check the flow-label. */
315
316 /* XXX-BZ We do not yet support ext. hdrs. */
317 if (ip6->ip6_nxt != IPPROTO_TCP)
318 return (TCP_LRO_NOT_SUPPORTED);
319
320 /* Find the TCP header. */
321 *th = (struct tcphdr *)(ip6 + 1);
322
323 return (0);
324}
325#endif
326
327#ifdef INET
328static int
329tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
330 struct tcphdr **th)
331{
332 int csum_flags;
333 uint16_t csum;
334
335 if (ip4->ip_p != IPPROTO_TCP)
336 return (TCP_LRO_NOT_SUPPORTED);
337
338 /* Ensure there are no options. */
339 if ((ip4->ip_hl << 2) != sizeof (*ip4))
340 return (TCP_LRO_CANNOT);
341
342 /* .. and the packet is not fragmented. */
343 if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
344 return (TCP_LRO_CANNOT);
345
346 /* Legacy IP has a header checksum that needs to be correct. */
347 csum_flags = m->m_pkthdr.csum_flags;
348 if (csum_flags & CSUM_IP_CHECKED) {
349 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
350 lc->lro_bad_csum++;
351 return (TCP_LRO_CANNOT);
352 }
353 } else {
354 csum = in_cksum_hdr(ip4);
355 if (__predict_false((csum) != 0)) {
356 lc->lro_bad_csum++;
357 return (TCP_LRO_CANNOT);
358 }
359 }
360
361 /* Find the TCP header (we assured there are no IP options). */
362 *th = (struct tcphdr *)(ip4 + 1);
363
364 return (0);
365}
366#endif
367
368int
369tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
370{
371 struct lro_entry *le;
372 struct ether_header *eh;
373#ifdef INET6
374 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
375#endif
376#ifdef INET
377 struct ip *ip4 = NULL; /* Keep compiler happy. */
378#endif
379 struct tcphdr *th;
380 void *l3hdr = NULL; /* Keep compiler happy. */
381 uint32_t *ts_ptr;
382 tcp_seq seq;
383 int error, ip_len, l;
384 uint16_t eh_type, tcp_data_len;
385
386 /* We expect a contiguous header [eh, ip, tcp]. */
387
388 eh = mtod(m, struct ether_header *);
389 eh_type = ntohs(eh->ether_type);
390 switch (eh_type) {
391#ifdef INET6
392 case ETHERTYPE_IPV6:
393 {
394 CURVNET_SET(lc->ifp->if_vnet);
395 if (V_ip6_forwarding != 0) {
396 /* XXX-BZ stats but changing lro_ctrl is a problem. */
397 CURVNET_RESTORE();
398 return (TCP_LRO_CANNOT);
399 }
400 CURVNET_RESTORE();
401 l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
402 error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
403 if (error != 0)
404 return (error);
405 tcp_data_len = ntohs(ip6->ip6_plen);
406 ip_len = sizeof(*ip6) + tcp_data_len;
407 break;
408 }
409#endif
410#ifdef INET
411 case ETHERTYPE_IP:
412 {
413 CURVNET_SET(lc->ifp->if_vnet);
414 if (V_ipforwarding != 0) {
415 /* XXX-BZ stats but changing lro_ctrl is a problem. */
416 CURVNET_RESTORE();
417 return (TCP_LRO_CANNOT);
418 }
419 CURVNET_RESTORE();
420 l3hdr = ip4 = (struct ip *)(eh + 1);
421 error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
422 if (error != 0)
423 return (error);
424 ip_len = ntohs(ip4->ip_len);
425 tcp_data_len = ip_len - sizeof(*ip4);
426 break;
427 }
428#endif
429 /* XXX-BZ what happens in case of VLAN(s)? */
430 default:
431 return (TCP_LRO_NOT_SUPPORTED);
432 }
433
434 /*
435 * If the frame is padded beyond the end of the IP packet, then we must
436 * trim the extra bytes off.
437 */
438 l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
439 if (l != 0) {
440 if (l < 0)
441 /* Truncated packet. */
442 return (TCP_LRO_CANNOT);
443
444 m_adj(m, -l);
445 }
446
447 /*
448 * Check TCP header constraints.
449 */
450 /* Ensure no bits set besides ACK or PSH. */
451 if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
452 return (TCP_LRO_CANNOT);
453
454 /* XXX-BZ We lose a AKC|PUSH flag concatinating multiple segments. */
455 /* XXX-BZ Ideally we'd flush on PUSH? */
456
457 /*
458 * Check for timestamps.
459 * Since the only option we handle are timestamps, we only have to
460 * handle the simple case of aligned timestamps.
461 */
462 l = (th->th_off << 2);
463 tcp_data_len -= l;
464 l -= sizeof(*th);
465 ts_ptr = (uint32_t *)(th + 1);
466 if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
467 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
468 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
469 return (TCP_LRO_CANNOT);
470
471 /* If the driver did not pass in the checksum, set it now. */
472 if (csum == 0x0000)
473 csum = th->th_sum;
474
475 seq = ntohl(th->th_seq);
476
477 /* Try to find a matching previous segment. */
478 SLIST_FOREACH(le, &lc->lro_active, next) {
479 if (le->eh_type != eh_type)
480 continue;
481 if (le->source_port != th->th_sport ||
482 le->dest_port != th->th_dport)
483 continue;
484 switch (eh_type) {
485#ifdef INET6
486 case ETHERTYPE_IPV6:
487 if (bcmp(&le->source_ip6, &ip6->ip6_src,
488 sizeof(struct in6_addr)) != 0 ||
489 bcmp(&le->dest_ip6, &ip6->ip6_dst,
490 sizeof(struct in6_addr)) != 0)
491 continue;
492 break;
493#endif
494#ifdef INET
495 case ETHERTYPE_IP:
496 if (le->source_ip4 != ip4->ip_src.s_addr ||
497 le->dest_ip4 != ip4->ip_dst.s_addr)
498 continue;
499 break;
500#endif
501 }
502
503 /* Flush now if appending will result in overflow. */
504 if (le->p_len > (65535 - tcp_data_len)) {
505 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
506 tcp_lro_flush(lc, le);
507 break;
508 }
509
510 /* Try to append the new segment. */
511 if (__predict_false(seq != le->next_seq ||
512 (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
513 /* Out of order packet or duplicate ACK. */
514 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
515 tcp_lro_flush(lc, le);
516 return (TCP_LRO_CANNOT);
517 }
518
519 if (l != 0) {
520 uint32_t tsval = ntohl(*(ts_ptr + 1));
521 /* Make sure timestamp values are increasing. */
522 /* XXX-BZ flip and use TSTMP_GEQ macro for this? */
523 if (__predict_false(le->tsval > tsval ||
524 *(ts_ptr + 2) == 0))
525 return (TCP_LRO_CANNOT);
526 le->tsval = tsval;
527 le->tsecr = *(ts_ptr + 2);
528 }
529
530 le->next_seq += tcp_data_len;
531 le->ack_seq = th->th_ack;
532 le->window = th->th_win;
533 le->append_cnt++;
534
535#ifdef TCP_LRO_UPDATE_CSUM
536 le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
537 tcp_data_len, ~csum);
538#endif
539
540 if (tcp_data_len == 0) {
541 m_freem(m);
542 return (0);
543 }
544
545 le->p_len += tcp_data_len;
546
547 /*
548 * Adjust the mbuf so that m_data points to the first byte of
549 * the ULP payload. Adjust the mbuf to avoid complications and
550 * append new segment to existing mbuf chain.
551 */
552 m_adj(m, m->m_pkthdr.len - tcp_data_len);
553 m_demote_pkthdr(m);
554
555 le->m_tail->m_next = m;
556 le->m_tail = m_last(m);
557
558 /*
559 * If a possible next full length packet would cause an
560 * overflow, pro-actively flush now.
561 */
562 if (le->p_len > (65535 - lc->ifp->if_mtu)) {
563 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
564 tcp_lro_flush(lc, le);
565 } else
566 getmicrotime(&le->mtime);
567
568 return (0);
569 }
570
571 /* Try to find an empty slot. */
572 if (SLIST_EMPTY(&lc->lro_free))
573 return (TCP_LRO_CANNOT);
574
575 /* Start a new segment chain. */
576 le = SLIST_FIRST(&lc->lro_free);
577 SLIST_REMOVE_HEAD(&lc->lro_free, next);
578 SLIST_INSERT_HEAD(&lc->lro_active, le, next);
579 getmicrotime(&le->mtime);
580
581 /* Start filling in details. */
582 switch (eh_type) {
583#ifdef INET6
584 case ETHERTYPE_IPV6:
585 le->le_ip6 = ip6;
586 le->source_ip6 = ip6->ip6_src;
587 le->dest_ip6 = ip6->ip6_dst;
588 le->eh_type = eh_type;
589 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
590 break;
591#endif
592#ifdef INET
593 case ETHERTYPE_IP:
594 le->le_ip4 = ip4;
595 le->source_ip4 = ip4->ip_src.s_addr;
596 le->dest_ip4 = ip4->ip_dst.s_addr;
597 le->eh_type = eh_type;
598 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
599 break;
600#endif
601 }
602 le->source_port = th->th_sport;
603 le->dest_port = th->th_dport;
604
605 le->next_seq = seq + tcp_data_len;
606 le->ack_seq = th->th_ack;
607 le->window = th->th_win;
608 if (l != 0) {
609 le->timestamp = 1;
610 le->tsval = ntohl(*(ts_ptr + 1));
611 le->tsecr = *(ts_ptr + 2);
612 }
613
614#ifdef TCP_LRO_UPDATE_CSUM
615 /*
616 * Do not touch the csum of the first packet. However save the
617 * "adjusted" checksum of just the source and destination addresses,
618 * the next header and the TCP payload. The length and TCP header
619 * parts may change, so we remove those from the saved checksum and
620 * re-add with final values on tcp_lro_flush() if needed.
621 */
622 KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
623 __func__, le, le->ulp_csum));
624
625 le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
626 ~csum);
627 th->th_sum = csum; /* Restore checksum on first packet. */
628#endif
629
630 le->m_head = m;
631 le->m_tail = m_last(m);
632
633 return (0);
634}
635
636/* end */