tcp_lro.c revision 304836
1235474Sbz/*-
2235474Sbz * Copyright (c) 2007, Myricom Inc.
3235474Sbz * Copyright (c) 2008, Intel Corporation.
4235944Sbz * Copyright (c) 2012 The FreeBSD Foundation
5235474Sbz * All rights reserved.
6235474Sbz *
7235944Sbz * Portions of this software were developed by Bjoern Zeeb
8235944Sbz * under sponsorship from the FreeBSD Foundation.
9235944Sbz *
10235474Sbz * Redistribution and use in source and binary forms, with or without
11235474Sbz * modification, are permitted provided that the following conditions
12235474Sbz * are met:
13235474Sbz * 1. Redistributions of source code must retain the above copyright
14235474Sbz *    notice, this list of conditions and the following disclaimer.
15235474Sbz * 2. Redistributions in binary form must reproduce the above copyright
16235474Sbz *    notice, this list of conditions and the following disclaimer in the
17235474Sbz *    documentation and/or other materials provided with the distribution.
18235474Sbz *
19235474Sbz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20235474Sbz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21235474Sbz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22235474Sbz * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23235474Sbz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24235474Sbz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25235474Sbz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26235474Sbz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27235474Sbz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28235474Sbz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29235474Sbz * SUCH DAMAGE.
30235474Sbz */
31179737Sjfv
32235944Sbz#include <sys/cdefs.h>
33235944Sbz__FBSDID("$FreeBSD: stable/10/sys/netinet/tcp_lro.c 304836 2016-08-26 06:19:12Z sephe $");
34235944Sbz
35235944Sbz#include "opt_inet.h"
36235944Sbz#include "opt_inet6.h"
37235944Sbz
38179737Sjfv#include <sys/param.h>
39179737Sjfv#include <sys/systm.h>
40179737Sjfv#include <sys/mbuf.h>
41179737Sjfv#include <sys/kernel.h>
42179737Sjfv#include <sys/socket.h>
43179737Sjfv
44179737Sjfv#include <net/if.h>
45235944Sbz#include <net/if_var.h>
46179737Sjfv#include <net/ethernet.h>
47236394Sbz#include <net/vnet.h>
48179737Sjfv
49179737Sjfv#include <netinet/in_systm.h>
50179737Sjfv#include <netinet/in.h>
51235944Sbz#include <netinet/ip6.h>
52179737Sjfv#include <netinet/ip.h>
53235981Sbz#include <netinet/ip_var.h>
54179737Sjfv#include <netinet/tcp.h>
55179737Sjfv#include <netinet/tcp_lro.h>
56179737Sjfv
57235981Sbz#include <netinet6/ip6_var.h>
58235981Sbz
59179737Sjfv#include <machine/in_cksum.h>
60179737Sjfv
/* Number of LRO entries per RX queue; may be overridden at build time. */
#if !defined(LRO_ENTRIES)
#define	LRO_ENTRIES	8
#endif

/*
 * TCP_LRO_UPDATE_CSUM is defined unconditionally in this file, so the
 * TCP_LRO_INVALID_CSUM fallback below only takes effect if the define
 * above it is removed.
 */
#define	TCP_LRO_UPDATE_CSUM	1
#if !defined(TCP_LRO_UPDATE_CSUM)
#define	TCP_LRO_INVALID_CSUM	0x0000
#endif
69179737Sjfv
70179737Sjfvint
71235944Sbztcp_lro_init(struct lro_ctrl *lc)
72179737Sjfv{
73235944Sbz	struct lro_entry *le;
74235944Sbz	int error, i;
75179737Sjfv
76235944Sbz	lc->lro_bad_csum = 0;
77235944Sbz	lc->lro_queued = 0;
78235944Sbz	lc->lro_flushed = 0;
79235944Sbz	lc->lro_cnt = 0;
80235944Sbz	SLIST_INIT(&lc->lro_free);
81235944Sbz	SLIST_INIT(&lc->lro_active);
82179737Sjfv
83235944Sbz	error = 0;
84179737Sjfv	for (i = 0; i < LRO_ENTRIES; i++) {
85235944Sbz		le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
86235944Sbz		    M_NOWAIT | M_ZERO);
87235944Sbz                if (le == NULL) {
88179737Sjfv			if (i == 0)
89179737Sjfv				error = ENOMEM;
90179737Sjfv                        break;
91179737Sjfv                }
92235944Sbz		lc->lro_cnt = i + 1;
93235944Sbz		SLIST_INSERT_HEAD(&lc->lro_free, le, next);
94179737Sjfv        }
95179737Sjfv
96179737Sjfv	return (error);
97179737Sjfv}
98179737Sjfv
99179737Sjfvvoid
100235944Sbztcp_lro_free(struct lro_ctrl *lc)
101179737Sjfv{
102235944Sbz	struct lro_entry *le;
103179737Sjfv
104235944Sbz	while (!SLIST_EMPTY(&lc->lro_free)) {
105235944Sbz		le = SLIST_FIRST(&lc->lro_free);
106235944Sbz		SLIST_REMOVE_HEAD(&lc->lro_free, next);
107235944Sbz		free(le, M_DEVBUF);
108179737Sjfv	}
109179737Sjfv}
110179737Sjfv
111235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
112235944Sbzstatic uint16_t
113235944Sbztcp_lro_csum_th(struct tcphdr *th)
114235944Sbz{
115235944Sbz	uint32_t ch;
116235944Sbz	uint16_t *p, l;
117235944Sbz
118235944Sbz	ch = th->th_sum = 0x0000;
119235944Sbz	l = th->th_off;
120235944Sbz	p = (uint16_t *)th;
121235944Sbz	while (l > 0) {
122235944Sbz		ch += *p;
123235944Sbz		p++;
124235944Sbz		ch += *p;
125235944Sbz		p++;
126235944Sbz		l--;
127235944Sbz	}
128235944Sbz	while (ch > 0xffff)
129235944Sbz		ch = (ch >> 16) + (ch & 0xffff);
130235944Sbz
131235944Sbz	return (ch & 0xffff);
132235944Sbz}
133235944Sbz
134235944Sbzstatic uint16_t
135235944Sbztcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
136235944Sbz    uint16_t tcp_data_len, uint16_t csum)
137235944Sbz{
138235944Sbz	uint32_t c;
139235944Sbz	uint16_t cs;
140235944Sbz
141235944Sbz	c = csum;
142235944Sbz
143235944Sbz	/* Remove length from checksum. */
144235944Sbz	switch (le->eh_type) {
145235944Sbz#ifdef INET6
146235944Sbz	case ETHERTYPE_IPV6:
147235944Sbz	{
148235944Sbz		struct ip6_hdr *ip6;
149235944Sbz
150235944Sbz		ip6 = (struct ip6_hdr *)l3hdr;
151235944Sbz		if (le->append_cnt == 0)
152235944Sbz			cs = ip6->ip6_plen;
153235944Sbz		else {
154235944Sbz			uint32_t cx;
155235944Sbz
156235944Sbz			cx = ntohs(ip6->ip6_plen);
157235944Sbz			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
158235944Sbz		}
159235944Sbz		break;
160235944Sbz	}
161235944Sbz#endif
162235944Sbz#ifdef INET
163235944Sbz	case ETHERTYPE_IP:
164235944Sbz	{
165235944Sbz		struct ip *ip4;
166235944Sbz
167235944Sbz		ip4 = (struct ip *)l3hdr;
168235944Sbz		if (le->append_cnt == 0)
169235944Sbz			cs = ip4->ip_len;
170235944Sbz		else {
171235944Sbz			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
172235944Sbz			    IPPROTO_TCP);
173235944Sbz			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
174235944Sbz			    htons(cs));
175235944Sbz		}
176235944Sbz		break;
177235944Sbz	}
178235944Sbz#endif
179235944Sbz	default:
180235944Sbz		cs = 0;		/* Keep compiler happy. */
181235944Sbz	}
182235944Sbz
183235944Sbz	cs = ~cs;
184235944Sbz	c += cs;
185235944Sbz
186235944Sbz	/* Remove TCP header csum. */
187235944Sbz	cs = ~tcp_lro_csum_th(th);
188235944Sbz	c += cs;
189235944Sbz	while (c > 0xffff)
190235944Sbz		c = (c >> 16) + (c & 0xffff);
191235944Sbz
192235944Sbz	return (c & 0xffff);
193235944Sbz}
194235944Sbz#endif
195235944Sbz
196179737Sjfvvoid
197255010Snptcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
198255010Snp{
199255010Snp	struct lro_entry *le, *le_tmp;
200255010Snp	struct timeval tv;
201255010Snp
202255010Snp	if (SLIST_EMPTY(&lc->lro_active))
203255010Snp		return;
204255010Snp
205255010Snp	getmicrotime(&tv);
206255010Snp	timevalsub(&tv, timeout);
207255010Snp	SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
208255010Snp		if (timevalcmp(&tv, &le->mtime, >=)) {
209255010Snp			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
210255010Snp			tcp_lro_flush(lc, le);
211255010Snp		}
212255010Snp	}
213255010Snp}
214255010Snp
215255010Snpvoid
216235944Sbztcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
217179737Sjfv{
218179737Sjfv
219235944Sbz	if (le->append_cnt > 0) {
220235944Sbz		struct tcphdr *th;
221235944Sbz		uint16_t p_len;
222179737Sjfv
223235944Sbz		p_len = htons(le->p_len);
224235944Sbz		switch (le->eh_type) {
225235944Sbz#ifdef INET6
226235944Sbz		case ETHERTYPE_IPV6:
227235944Sbz		{
228235944Sbz			struct ip6_hdr *ip6;
229179737Sjfv
230235944Sbz			ip6 = le->le_ip6;
231235944Sbz			ip6->ip6_plen = p_len;
232235944Sbz			th = (struct tcphdr *)(ip6 + 1);
233235944Sbz			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
234235944Sbz			    CSUM_PSEUDO_HDR;
235235944Sbz			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
236235944Sbz			break;
237235944Sbz		}
238235944Sbz#endif
239235944Sbz#ifdef INET
240235944Sbz		case ETHERTYPE_IP:
241235944Sbz		{
242235944Sbz			struct ip *ip4;
243235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
244235944Sbz			uint32_t cl;
245235944Sbz			uint16_t c;
246235944Sbz#endif
247179737Sjfv
248235944Sbz			ip4 = le->le_ip4;
249235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
250235944Sbz			/* Fix IP header checksum for new length. */
251235944Sbz			c = ~ip4->ip_sum;
252235944Sbz			cl = c;
253235944Sbz			c = ~ip4->ip_len;
254235944Sbz			cl += c + p_len;
255235944Sbz			while (cl > 0xffff)
256235944Sbz				cl = (cl >> 16) + (cl & 0xffff);
257235944Sbz			c = cl;
258235944Sbz			ip4->ip_sum = ~c;
259235944Sbz#else
260235944Sbz			ip4->ip_sum = TCP_LRO_INVALID_CSUM;
261235944Sbz#endif
262235944Sbz			ip4->ip_len = p_len;
263235944Sbz			th = (struct tcphdr *)(ip4 + 1);
264235944Sbz			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
265235944Sbz			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
266235944Sbz			le->p_len += ETHER_HDR_LEN;
267235944Sbz			break;
268179737Sjfv		}
269235944Sbz#endif
270235944Sbz		default:
271235944Sbz			th = NULL;	/* Keep compiler happy. */
272235944Sbz		}
273235944Sbz		le->m_head->m_pkthdr.csum_data = 0xffff;
274235944Sbz		le->m_head->m_pkthdr.len = le->p_len;
275235944Sbz
276235944Sbz		/* Incorporate the latest ACK into the TCP header. */
277235944Sbz		th->th_ack = le->ack_seq;
278235944Sbz		th->th_win = le->window;
279235944Sbz		/* Incorporate latest timestamp into the TCP header. */
280235944Sbz		if (le->timestamp != 0) {
281235944Sbz			uint32_t *ts_ptr;
282235944Sbz
283235944Sbz			ts_ptr = (uint32_t *)(th + 1);
284235944Sbz			ts_ptr[1] = htonl(le->tsval);
285235944Sbz			ts_ptr[2] = le->tsecr;
286235944Sbz		}
287235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
288235944Sbz		/* Update the TCP header checksum. */
289235944Sbz		le->ulp_csum += p_len;
290235944Sbz		le->ulp_csum += tcp_lro_csum_th(th);
291235944Sbz		while (le->ulp_csum > 0xffff)
292235944Sbz			le->ulp_csum = (le->ulp_csum >> 16) +
293235944Sbz			    (le->ulp_csum & 0xffff);
294235944Sbz		th->th_sum = (le->ulp_csum & 0xffff);
295235944Sbz		th->th_sum = ~th->th_sum;
296235944Sbz#else
297235944Sbz		th->th_sum = TCP_LRO_INVALID_CSUM;
298235944Sbz#endif
299179737Sjfv	}
300235944Sbz
301235944Sbz	(*lc->ifp->if_input)(lc->ifp, le->m_head);
302235944Sbz	lc->lro_queued += le->append_cnt + 1;
303235944Sbz	lc->lro_flushed++;
304235944Sbz	bzero(le, sizeof(*le));
305235944Sbz	SLIST_INSERT_HEAD(&lc->lro_free, le, next);
306179737Sjfv}
307179737Sjfv
308235944Sbz#ifdef INET6
309235944Sbzstatic int
310235944Sbztcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
311235944Sbz    struct tcphdr **th)
312179737Sjfv{
313179737Sjfv
314235944Sbz	/* XXX-BZ we should check the flow-label. */
315179737Sjfv
316235944Sbz	/* XXX-BZ We do not yet support ext. hdrs. */
317235944Sbz	if (ip6->ip6_nxt != IPPROTO_TCP)
318235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
319179737Sjfv
320235944Sbz	/* Find the TCP header. */
321235944Sbz	*th = (struct tcphdr *)(ip6 + 1);
322179737Sjfv
323235944Sbz	return (0);
324235944Sbz}
325235944Sbz#endif
326235944Sbz
327235944Sbz#ifdef INET
328235944Sbzstatic int
329235944Sbztcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
330235944Sbz    struct tcphdr **th)
331235944Sbz{
332235944Sbz	int csum_flags;
333235944Sbz	uint16_t csum;
334235944Sbz
335235944Sbz	if (ip4->ip_p != IPPROTO_TCP)
336235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
337235944Sbz
338235944Sbz	/* Ensure there are no options. */
339235944Sbz	if ((ip4->ip_hl << 2) != sizeof (*ip4))
340235944Sbz		return (TCP_LRO_CANNOT);
341235944Sbz
342235944Sbz	/* .. and the packet is not fragmented. */
343235944Sbz	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
344235944Sbz		return (TCP_LRO_CANNOT);
345235944Sbz
346235944Sbz	/* Legacy IP has a header checksum that needs to be correct. */
347235944Sbz	csum_flags = m->m_pkthdr.csum_flags;
348182089Skmacy	if (csum_flags & CSUM_IP_CHECKED) {
349182089Skmacy		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
350235944Sbz			lc->lro_bad_csum++;
351235944Sbz			return (TCP_LRO_CANNOT);
352182089Skmacy		}
353182089Skmacy	} else {
354235944Sbz		csum = in_cksum_hdr(ip4);
355247104Sgallatin		if (__predict_false((csum) != 0)) {
356235944Sbz			lc->lro_bad_csum++;
357235944Sbz			return (TCP_LRO_CANNOT);
358182089Skmacy		}
359179737Sjfv	}
360179737Sjfv
361235944Sbz	/* Find the TCP header (we assured there are no IP options). */
362235944Sbz	*th = (struct tcphdr *)(ip4 + 1);
363179737Sjfv
364235944Sbz	return (0);
365235944Sbz}
366235944Sbz#endif
367179737Sjfv
368235944Sbzint
369235944Sbztcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
370235944Sbz{
371235944Sbz	struct lro_entry *le;
372235944Sbz	struct ether_header *eh;
373235944Sbz#ifdef INET6
374235944Sbz	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
375235944Sbz#endif
376235944Sbz#ifdef INET
377235944Sbz	struct ip *ip4 = NULL;		/* Keep compiler happy. */
378235944Sbz#endif
379235944Sbz	struct tcphdr *th;
380235944Sbz	void *l3hdr = NULL;		/* Keep compiler happy. */
381235944Sbz	uint32_t *ts_ptr;
382235944Sbz	tcp_seq seq;
383235944Sbz	int error, ip_len, l;
384235944Sbz	uint16_t eh_type, tcp_data_len;
385304836Ssephe	int force_flush = 0;
386179737Sjfv
387235944Sbz	/* We expect a contiguous header [eh, ip, tcp]. */
388235944Sbz
389235944Sbz	eh = mtod(m, struct ether_header *);
390235944Sbz	eh_type = ntohs(eh->ether_type);
391235944Sbz	switch (eh_type) {
392235944Sbz#ifdef INET6
393235944Sbz	case ETHERTYPE_IPV6:
394236394Sbz	{
395236394Sbz		CURVNET_SET(lc->ifp->if_vnet);
396235981Sbz		if (V_ip6_forwarding != 0) {
397235981Sbz			/* XXX-BZ stats but changing lro_ctrl is a problem. */
398236394Sbz			CURVNET_RESTORE();
399235981Sbz			return (TCP_LRO_CANNOT);
400235981Sbz		}
401236394Sbz		CURVNET_RESTORE();
402235944Sbz		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
403235944Sbz		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
404235944Sbz		if (error != 0)
405235944Sbz			return (error);
406235944Sbz		tcp_data_len = ntohs(ip6->ip6_plen);
407235944Sbz		ip_len = sizeof(*ip6) + tcp_data_len;
408235944Sbz		break;
409236394Sbz	}
410235944Sbz#endif
411235944Sbz#ifdef INET
412235944Sbz	case ETHERTYPE_IP:
413236394Sbz	{
414236394Sbz		CURVNET_SET(lc->ifp->if_vnet);
415235981Sbz		if (V_ipforwarding != 0) {
416235981Sbz			/* XXX-BZ stats but changing lro_ctrl is a problem. */
417236394Sbz			CURVNET_RESTORE();
418235981Sbz			return (TCP_LRO_CANNOT);
419235981Sbz		}
420236394Sbz		CURVNET_RESTORE();
421235944Sbz		l3hdr = ip4 = (struct ip *)(eh + 1);
422235944Sbz		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
423235944Sbz		if (error != 0)
424235944Sbz			return (error);
425235944Sbz		ip_len = ntohs(ip4->ip_len);
426235944Sbz		tcp_data_len = ip_len - sizeof(*ip4);
427235944Sbz		break;
428236394Sbz	}
429235944Sbz#endif
430235944Sbz	/* XXX-BZ what happens in case of VLAN(s)? */
431235944Sbz	default:
432235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
433179737Sjfv	}
434179737Sjfv
435235944Sbz	/*
436235944Sbz	 * If the frame is padded beyond the end of the IP packet, then we must
437235944Sbz	 * trim the extra bytes off.
438235944Sbz	 */
439235944Sbz	l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
440235944Sbz	if (l != 0) {
441235944Sbz		if (l < 0)
442235944Sbz			/* Truncated packet. */
443235944Sbz			return (TCP_LRO_CANNOT);
444179737Sjfv
445235944Sbz		m_adj(m, -l);
446235944Sbz	}
447235944Sbz
448235944Sbz	/*
449235944Sbz	 * Check TCP header constraints.
450179737Sjfv	 */
451235944Sbz	/* Ensure no bits set besides ACK or PSH. */
452304836Ssephe	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
453304836Ssephe		if (th->th_flags & TH_SYN)
454304836Ssephe			return (TCP_LRO_CANNOT);
455304836Ssephe		/*
456304836Ssephe		 * Make sure that previously seen segements/ACKs are delivered
457304836Ssephe		 * before this segement, e.g. FIN.
458304836Ssephe		 */
459304836Ssephe		force_flush = 1;
460304836Ssephe	}
461235944Sbz
462302051Ssephe	/* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
463235944Sbz	/* XXX-BZ Ideally we'd flush on PUSH? */
464235944Sbz
465235944Sbz	/*
466235944Sbz	 * Check for timestamps.
467235944Sbz	 * Since the only option we handle are timestamps, we only have to
468235944Sbz	 * handle the simple case of aligned timestamps.
469235944Sbz	 */
470235944Sbz	l = (th->th_off << 2);
471235944Sbz	tcp_data_len -= l;
472235944Sbz	l -= sizeof(*th);
473235944Sbz	ts_ptr = (uint32_t *)(th + 1);
474235944Sbz	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
475235944Sbz	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
476304836Ssephe	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
477304836Ssephe		/*
478304836Ssephe		 * Make sure that previously seen segements/ACKs are delivered
479304836Ssephe		 * before this segement.
480304836Ssephe		 */
481304836Ssephe		force_flush = 1;
482304836Ssephe	}
483235944Sbz
484235944Sbz	/* If the driver did not pass in the checksum, set it now. */
485235944Sbz	if (csum == 0x0000)
486235944Sbz		csum = th->th_sum;
487235944Sbz
488235944Sbz	seq = ntohl(th->th_seq);
489235944Sbz
490235944Sbz	/* Try to find a matching previous segment. */
491235944Sbz	SLIST_FOREACH(le, &lc->lro_active, next) {
492235944Sbz		if (le->eh_type != eh_type)
493235944Sbz			continue;
494235944Sbz		if (le->source_port != th->th_sport ||
495235944Sbz		    le->dest_port != th->th_dport)
496235944Sbz			continue;
497235944Sbz		switch (eh_type) {
498235944Sbz#ifdef INET6
499235944Sbz		case ETHERTYPE_IPV6:
500235944Sbz			if (bcmp(&le->source_ip6, &ip6->ip6_src,
501235944Sbz			    sizeof(struct in6_addr)) != 0 ||
502235944Sbz			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
503235944Sbz			    sizeof(struct in6_addr)) != 0)
504235944Sbz				continue;
505235944Sbz			break;
506235944Sbz#endif
507235944Sbz#ifdef INET
508235944Sbz		case ETHERTYPE_IP:
509235944Sbz			if (le->source_ip4 != ip4->ip_src.s_addr ||
510235944Sbz			    le->dest_ip4 != ip4->ip_dst.s_addr)
511235944Sbz				continue;
512235944Sbz			break;
513235944Sbz#endif
514179737Sjfv		}
515179737Sjfv
516304836Ssephe		if (force_flush) {
517304836Ssephe			/* Timestamps mismatch; this is a FIN, etc */
518304836Ssephe			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
519304836Ssephe			tcp_lro_flush(lc, le);
520304836Ssephe			return (TCP_LRO_CANNOT);
521304836Ssephe		}
522304836Ssephe
523235944Sbz		/* Flush now if appending will result in overflow. */
524235944Sbz		if (le->p_len > (65535 - tcp_data_len)) {
525235944Sbz			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
526235944Sbz			tcp_lro_flush(lc, le);
527235944Sbz			break;
528235944Sbz		}
529179737Sjfv
530235944Sbz		/* Try to append the new segment. */
531235944Sbz		if (__predict_false(seq != le->next_seq ||
532235944Sbz		    (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
533235944Sbz			/* Out of order packet or duplicate ACK. */
534235944Sbz			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
535235944Sbz			tcp_lro_flush(lc, le);
536235944Sbz			return (TCP_LRO_CANNOT);
537235944Sbz		}
538179737Sjfv
539235944Sbz		if (l != 0) {
540235944Sbz			uint32_t tsval = ntohl(*(ts_ptr + 1));
541235944Sbz			/* Make sure timestamp values are increasing. */
542235944Sbz			/* XXX-BZ flip and use TSTMP_GEQ macro for this? */
543235944Sbz			if (__predict_false(le->tsval > tsval ||
544235944Sbz			    *(ts_ptr + 2) == 0))
545235944Sbz				return (TCP_LRO_CANNOT);
546235944Sbz			le->tsval = tsval;
547235944Sbz			le->tsecr = *(ts_ptr + 2);
548235944Sbz		}
549223797Scperciva
550235944Sbz		le->next_seq += tcp_data_len;
551235944Sbz		le->ack_seq = th->th_ack;
552235944Sbz		le->window = th->th_win;
553235944Sbz		le->append_cnt++;
554179737Sjfv
555235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
556235944Sbz		le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
557235944Sbz		    tcp_data_len, ~csum);
558235944Sbz#endif
559179737Sjfv
560235944Sbz		if (tcp_data_len == 0) {
561235944Sbz			m_freem(m);
562235944Sbz			return (0);
563235944Sbz		}
564179737Sjfv
565235944Sbz		le->p_len += tcp_data_len;
566179737Sjfv
567235944Sbz		/*
568235944Sbz		 * Adjust the mbuf so that m_data points to the first byte of
569235944Sbz		 * the ULP payload.  Adjust the mbuf to avoid complications and
570235944Sbz		 * append new segment to existing mbuf chain.
571235944Sbz		 */
572235944Sbz		m_adj(m, m->m_pkthdr.len - tcp_data_len);
573235944Sbz		m->m_flags &= ~M_PKTHDR;
574179737Sjfv
575235944Sbz		le->m_tail->m_next = m;
576235944Sbz		le->m_tail = m_last(m);
577235944Sbz
578235944Sbz		/*
579235944Sbz		 * If a possible next full length packet would cause an
580235944Sbz		 * overflow, pro-actively flush now.
581235944Sbz		 */
582235944Sbz		if (le->p_len > (65535 - lc->ifp->if_mtu)) {
583235944Sbz			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
584235944Sbz			tcp_lro_flush(lc, le);
585255010Snp		} else
586255010Snp			getmicrotime(&le->mtime);
587235944Sbz
588235944Sbz		return (0);
589179737Sjfv	}
590179737Sjfv
591304836Ssephe	if (force_flush) {
592304836Ssephe		/*
593304836Ssephe		 * Nothing to flush, but this segment can not be further
594304836Ssephe		 * aggregated/delayed.
595304836Ssephe		 */
596304836Ssephe		return (TCP_LRO_CANNOT);
597304836Ssephe	}
598304836Ssephe
599235944Sbz	/* Try to find an empty slot. */
600235944Sbz	if (SLIST_EMPTY(&lc->lro_free))
601301949Ssephe		return (TCP_LRO_NO_ENTRIES);
602179737Sjfv
603235944Sbz	/* Start a new segment chain. */
604235944Sbz	le = SLIST_FIRST(&lc->lro_free);
605235944Sbz	SLIST_REMOVE_HEAD(&lc->lro_free, next);
606235944Sbz	SLIST_INSERT_HEAD(&lc->lro_active, le, next);
607255010Snp	getmicrotime(&le->mtime);
608179737Sjfv
609235944Sbz	/* Start filling in details. */
610235944Sbz	switch (eh_type) {
611235944Sbz#ifdef INET6
612235944Sbz	case ETHERTYPE_IPV6:
613235944Sbz		le->le_ip6 = ip6;
614235944Sbz		le->source_ip6 = ip6->ip6_src;
615235944Sbz		le->dest_ip6 = ip6->ip6_dst;
616235944Sbz		le->eh_type = eh_type;
617235944Sbz		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
618235944Sbz		break;
619235944Sbz#endif
620235944Sbz#ifdef INET
621235944Sbz	case ETHERTYPE_IP:
622235944Sbz		le->le_ip4 = ip4;
623235944Sbz		le->source_ip4 = ip4->ip_src.s_addr;
624235944Sbz		le->dest_ip4 = ip4->ip_dst.s_addr;
625235944Sbz		le->eh_type = eh_type;
626235944Sbz		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
627235944Sbz		break;
628235944Sbz#endif
629235944Sbz	}
630235944Sbz	le->source_port = th->th_sport;
631235944Sbz	le->dest_port = th->th_dport;
632235944Sbz
633235944Sbz	le->next_seq = seq + tcp_data_len;
634235944Sbz	le->ack_seq = th->th_ack;
635235944Sbz	le->window = th->th_win;
636235944Sbz	if (l != 0) {
637235944Sbz		le->timestamp = 1;
638235944Sbz		le->tsval = ntohl(*(ts_ptr + 1));
639235944Sbz		le->tsecr = *(ts_ptr + 2);
640235944Sbz	}
641235944Sbz
642235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
643235944Sbz	/*
644235944Sbz	 * Do not touch the csum of the first packet.  However save the
645235944Sbz	 * "adjusted" checksum of just the source and destination addresses,
646235944Sbz	 * the next header and the TCP payload.  The length and TCP header
647235944Sbz	 * parts may change, so we remove those from the saved checksum and
648235944Sbz	 * re-add with final values on tcp_lro_flush() if needed.
649179737Sjfv	 */
650235944Sbz	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
651235944Sbz	    __func__, le, le->ulp_csum));
652235944Sbz
653235944Sbz	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
654235944Sbz	    ~csum);
655235944Sbz	th->th_sum = csum;	/* Restore checksum on first packet. */
656235944Sbz#endif
657235944Sbz
658235944Sbz	le->m_head = m;
659235944Sbz	le->m_tail = m_last(m);
660235944Sbz
661235944Sbz	return (0);
662179737Sjfv}
663235944Sbz
664235944Sbz/* end */
665