1235474Sbz/*-
2235474Sbz * Copyright (c) 2007, Myricom Inc.
3235474Sbz * Copyright (c) 2008, Intel Corporation.
4235944Sbz * Copyright (c) 2012 The FreeBSD Foundation
5235474Sbz * All rights reserved.
6235474Sbz *
7235944Sbz * Portions of this software were developed by Bjoern Zeeb
8235944Sbz * under sponsorship from the FreeBSD Foundation.
9235944Sbz *
10235474Sbz * Redistribution and use in source and binary forms, with or without
11235474Sbz * modification, are permitted provided that the following conditions
12235474Sbz * are met:
13235474Sbz * 1. Redistributions of source code must retain the above copyright
14235474Sbz *    notice, this list of conditions and the following disclaimer.
15235474Sbz * 2. Redistributions in binary form must reproduce the above copyright
16235474Sbz *    notice, this list of conditions and the following disclaimer in the
17235474Sbz *    documentation and/or other materials provided with the distribution.
18235474Sbz *
19235474Sbz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20235474Sbz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21235474Sbz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22235474Sbz * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23235474Sbz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24235474Sbz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25235474Sbz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26235474Sbz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27235474Sbz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28235474Sbz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29235474Sbz * SUCH DAMAGE.
30235474Sbz */
31179737Sjfv
32235944Sbz#include <sys/cdefs.h>
33235944Sbz__FBSDID("$FreeBSD$");
34235944Sbz
35235944Sbz#include "opt_inet.h"
36235944Sbz#include "opt_inet6.h"
37235944Sbz
38179737Sjfv#include <sys/param.h>
39179737Sjfv#include <sys/systm.h>
40179737Sjfv#include <sys/mbuf.h>
41179737Sjfv#include <sys/kernel.h>
42179737Sjfv#include <sys/socket.h>
43305189Ssephe#include <sys/sysctl.h>
44179737Sjfv
45179737Sjfv#include <net/if.h>
46235944Sbz#include <net/if_var.h>
47179737Sjfv#include <net/ethernet.h>
48236394Sbz#include <net/vnet.h>
49179737Sjfv
50179737Sjfv#include <netinet/in_systm.h>
51179737Sjfv#include <netinet/in.h>
52235944Sbz#include <netinet/ip6.h>
53179737Sjfv#include <netinet/ip.h>
54235981Sbz#include <netinet/ip_var.h>
55179737Sjfv#include <netinet/tcp.h>
56179737Sjfv#include <netinet/tcp_lro.h>
57305189Ssephe#include <netinet/tcp_var.h>
58179737Sjfv
59235981Sbz#include <netinet6/ip6_var.h>
60235981Sbz
61179737Sjfv#include <machine/in_cksum.h>
62179737Sjfv
63235944Sbz#ifndef LRO_ENTRIES
64235944Sbz#define	LRO_ENTRIES	8	/* # of LRO entries per RX queue. */
65235944Sbz#endif
66179737Sjfv
67235944Sbz#define	TCP_LRO_UPDATE_CSUM	1
68235944Sbz#ifndef	TCP_LRO_UPDATE_CSUM
69235944Sbz#define	TCP_LRO_INVALID_CSUM	0x0000
70235944Sbz#endif
71179737Sjfv
72305189SsepheSYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
73305189Ssephe    "TCP LRO");
74305189Ssephe
75305189Ssephestatic unsigned	tcp_lro_entries = LRO_ENTRIES;
76305189SsepheSYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
77305189Ssephe    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
78305189Ssephe    "default number of LRO entries");
79305189Ssephe
80179737Sjfvint
81235944Sbztcp_lro_init(struct lro_ctrl *lc)
82179737Sjfv{
83235944Sbz	struct lro_entry *le;
84235944Sbz	int error, i;
85179737Sjfv
86235944Sbz	lc->lro_bad_csum = 0;
87235944Sbz	lc->lro_queued = 0;
88235944Sbz	lc->lro_flushed = 0;
89235944Sbz	lc->lro_cnt = 0;
90235944Sbz	SLIST_INIT(&lc->lro_free);
91235944Sbz	SLIST_INIT(&lc->lro_active);
92179737Sjfv
93235944Sbz	error = 0;
94305189Ssephe	for (i = 0; i < tcp_lro_entries; i++) {
95235944Sbz		le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
96235944Sbz		    M_NOWAIT | M_ZERO);
97235944Sbz                if (le == NULL) {
98179737Sjfv			if (i == 0)
99179737Sjfv				error = ENOMEM;
100179737Sjfv                        break;
101179737Sjfv                }
102235944Sbz		lc->lro_cnt = i + 1;
103235944Sbz		SLIST_INSERT_HEAD(&lc->lro_free, le, next);
104179737Sjfv        }
105179737Sjfv
106179737Sjfv	return (error);
107179737Sjfv}
108179737Sjfv
109179737Sjfvvoid
110235944Sbztcp_lro_free(struct lro_ctrl *lc)
111179737Sjfv{
112235944Sbz	struct lro_entry *le;
113179737Sjfv
114235944Sbz	while (!SLIST_EMPTY(&lc->lro_free)) {
115235944Sbz		le = SLIST_FIRST(&lc->lro_free);
116235944Sbz		SLIST_REMOVE_HEAD(&lc->lro_free, next);
117235944Sbz		free(le, M_DEVBUF);
118179737Sjfv	}
119179737Sjfv}
120179737Sjfv
121235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
122235944Sbzstatic uint16_t
123235944Sbztcp_lro_csum_th(struct tcphdr *th)
124235944Sbz{
125235944Sbz	uint32_t ch;
126235944Sbz	uint16_t *p, l;
127235944Sbz
128235944Sbz	ch = th->th_sum = 0x0000;
129235944Sbz	l = th->th_off;
130235944Sbz	p = (uint16_t *)th;
131235944Sbz	while (l > 0) {
132235944Sbz		ch += *p;
133235944Sbz		p++;
134235944Sbz		ch += *p;
135235944Sbz		p++;
136235944Sbz		l--;
137235944Sbz	}
138235944Sbz	while (ch > 0xffff)
139235944Sbz		ch = (ch >> 16) + (ch & 0xffff);
140235944Sbz
141235944Sbz	return (ch & 0xffff);
142235944Sbz}
143235944Sbz
144235944Sbzstatic uint16_t
145235944Sbztcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
146235944Sbz    uint16_t tcp_data_len, uint16_t csum)
147235944Sbz{
148235944Sbz	uint32_t c;
149235944Sbz	uint16_t cs;
150235944Sbz
151235944Sbz	c = csum;
152235944Sbz
153235944Sbz	/* Remove length from checksum. */
154235944Sbz	switch (le->eh_type) {
155235944Sbz#ifdef INET6
156235944Sbz	case ETHERTYPE_IPV6:
157235944Sbz	{
158235944Sbz		struct ip6_hdr *ip6;
159235944Sbz
160235944Sbz		ip6 = (struct ip6_hdr *)l3hdr;
161235944Sbz		if (le->append_cnt == 0)
162235944Sbz			cs = ip6->ip6_plen;
163235944Sbz		else {
164235944Sbz			uint32_t cx;
165235944Sbz
166235944Sbz			cx = ntohs(ip6->ip6_plen);
167235944Sbz			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
168235944Sbz		}
169235944Sbz		break;
170235944Sbz	}
171235944Sbz#endif
172235944Sbz#ifdef INET
173235944Sbz	case ETHERTYPE_IP:
174235944Sbz	{
175235944Sbz		struct ip *ip4;
176235944Sbz
177235944Sbz		ip4 = (struct ip *)l3hdr;
178235944Sbz		if (le->append_cnt == 0)
179235944Sbz			cs = ip4->ip_len;
180235944Sbz		else {
181235944Sbz			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
182235944Sbz			    IPPROTO_TCP);
183235944Sbz			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
184235944Sbz			    htons(cs));
185235944Sbz		}
186235944Sbz		break;
187235944Sbz	}
188235944Sbz#endif
189235944Sbz	default:
190235944Sbz		cs = 0;		/* Keep compiler happy. */
191235944Sbz	}
192235944Sbz
193235944Sbz	cs = ~cs;
194235944Sbz	c += cs;
195235944Sbz
196235944Sbz	/* Remove TCP header csum. */
197235944Sbz	cs = ~tcp_lro_csum_th(th);
198235944Sbz	c += cs;
199235944Sbz	while (c > 0xffff)
200235944Sbz		c = (c >> 16) + (c & 0xffff);
201235944Sbz
202235944Sbz	return (c & 0xffff);
203235944Sbz}
204235944Sbz#endif
205235944Sbz
206179737Sjfvvoid
207255010Snptcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
208255010Snp{
209255010Snp	struct lro_entry *le, *le_tmp;
210255010Snp	struct timeval tv;
211255010Snp
212255010Snp	if (SLIST_EMPTY(&lc->lro_active))
213255010Snp		return;
214255010Snp
215255010Snp	getmicrotime(&tv);
216255010Snp	timevalsub(&tv, timeout);
217255010Snp	SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
218255010Snp		if (timevalcmp(&tv, &le->mtime, >=)) {
219255010Snp			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
220255010Snp			tcp_lro_flush(lc, le);
221255010Snp		}
222255010Snp	}
223255010Snp}
224255010Snp
225255010Snpvoid
226235944Sbztcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
227179737Sjfv{
228179737Sjfv
229235944Sbz	if (le->append_cnt > 0) {
230235944Sbz		struct tcphdr *th;
231235944Sbz		uint16_t p_len;
232179737Sjfv
233235944Sbz		p_len = htons(le->p_len);
234235944Sbz		switch (le->eh_type) {
235235944Sbz#ifdef INET6
236235944Sbz		case ETHERTYPE_IPV6:
237235944Sbz		{
238235944Sbz			struct ip6_hdr *ip6;
239179737Sjfv
240235944Sbz			ip6 = le->le_ip6;
241235944Sbz			ip6->ip6_plen = p_len;
242235944Sbz			th = (struct tcphdr *)(ip6 + 1);
243235944Sbz			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
244235944Sbz			    CSUM_PSEUDO_HDR;
245235944Sbz			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
246235944Sbz			break;
247235944Sbz		}
248235944Sbz#endif
249235944Sbz#ifdef INET
250235944Sbz		case ETHERTYPE_IP:
251235944Sbz		{
252235944Sbz			struct ip *ip4;
253235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
254235944Sbz			uint32_t cl;
255235944Sbz			uint16_t c;
256235944Sbz#endif
257179737Sjfv
258235944Sbz			ip4 = le->le_ip4;
259235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
260235944Sbz			/* Fix IP header checksum for new length. */
261235944Sbz			c = ~ip4->ip_sum;
262235944Sbz			cl = c;
263235944Sbz			c = ~ip4->ip_len;
264235944Sbz			cl += c + p_len;
265235944Sbz			while (cl > 0xffff)
266235944Sbz				cl = (cl >> 16) + (cl & 0xffff);
267235944Sbz			c = cl;
268235944Sbz			ip4->ip_sum = ~c;
269235944Sbz#else
270235944Sbz			ip4->ip_sum = TCP_LRO_INVALID_CSUM;
271235944Sbz#endif
272235944Sbz			ip4->ip_len = p_len;
273235944Sbz			th = (struct tcphdr *)(ip4 + 1);
274235944Sbz			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
275235944Sbz			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
276235944Sbz			le->p_len += ETHER_HDR_LEN;
277235944Sbz			break;
278179737Sjfv		}
279235944Sbz#endif
280235944Sbz		default:
281235944Sbz			th = NULL;	/* Keep compiler happy. */
282235944Sbz		}
283235944Sbz		le->m_head->m_pkthdr.csum_data = 0xffff;
284235944Sbz		le->m_head->m_pkthdr.len = le->p_len;
285235944Sbz
286235944Sbz		/* Incorporate the latest ACK into the TCP header. */
287235944Sbz		th->th_ack = le->ack_seq;
288235944Sbz		th->th_win = le->window;
289235944Sbz		/* Incorporate latest timestamp into the TCP header. */
290235944Sbz		if (le->timestamp != 0) {
291235944Sbz			uint32_t *ts_ptr;
292235944Sbz
293235944Sbz			ts_ptr = (uint32_t *)(th + 1);
294235944Sbz			ts_ptr[1] = htonl(le->tsval);
295235944Sbz			ts_ptr[2] = le->tsecr;
296235944Sbz		}
297235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
298235944Sbz		/* Update the TCP header checksum. */
299235944Sbz		le->ulp_csum += p_len;
300235944Sbz		le->ulp_csum += tcp_lro_csum_th(th);
301235944Sbz		while (le->ulp_csum > 0xffff)
302235944Sbz			le->ulp_csum = (le->ulp_csum >> 16) +
303235944Sbz			    (le->ulp_csum & 0xffff);
304235944Sbz		th->th_sum = (le->ulp_csum & 0xffff);
305235944Sbz		th->th_sum = ~th->th_sum;
306235944Sbz#else
307235944Sbz		th->th_sum = TCP_LRO_INVALID_CSUM;
308235944Sbz#endif
309179737Sjfv	}
310235944Sbz
311235944Sbz	(*lc->ifp->if_input)(lc->ifp, le->m_head);
312235944Sbz	lc->lro_queued += le->append_cnt + 1;
313235944Sbz	lc->lro_flushed++;
314235944Sbz	bzero(le, sizeof(*le));
315235944Sbz	SLIST_INSERT_HEAD(&lc->lro_free, le, next);
316179737Sjfv}
317179737Sjfv
318235944Sbz#ifdef INET6
319235944Sbzstatic int
320235944Sbztcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
321235944Sbz    struct tcphdr **th)
322179737Sjfv{
323179737Sjfv
324235944Sbz	/* XXX-BZ we should check the flow-label. */
325179737Sjfv
326235944Sbz	/* XXX-BZ We do not yet support ext. hdrs. */
327235944Sbz	if (ip6->ip6_nxt != IPPROTO_TCP)
328235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
329179737Sjfv
330235944Sbz	/* Find the TCP header. */
331235944Sbz	*th = (struct tcphdr *)(ip6 + 1);
332179737Sjfv
333235944Sbz	return (0);
334235944Sbz}
335235944Sbz#endif
336235944Sbz
337235944Sbz#ifdef INET
338235944Sbzstatic int
339235944Sbztcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
340235944Sbz    struct tcphdr **th)
341235944Sbz{
342235944Sbz	int csum_flags;
343235944Sbz	uint16_t csum;
344235944Sbz
345235944Sbz	if (ip4->ip_p != IPPROTO_TCP)
346235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
347235944Sbz
348235944Sbz	/* Ensure there are no options. */
349235944Sbz	if ((ip4->ip_hl << 2) != sizeof (*ip4))
350235944Sbz		return (TCP_LRO_CANNOT);
351235944Sbz
352235944Sbz	/* .. and the packet is not fragmented. */
353235944Sbz	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
354235944Sbz		return (TCP_LRO_CANNOT);
355235944Sbz
356235944Sbz	/* Legacy IP has a header checksum that needs to be correct. */
357235944Sbz	csum_flags = m->m_pkthdr.csum_flags;
358182089Skmacy	if (csum_flags & CSUM_IP_CHECKED) {
359182089Skmacy		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
360235944Sbz			lc->lro_bad_csum++;
361235944Sbz			return (TCP_LRO_CANNOT);
362182089Skmacy		}
363182089Skmacy	} else {
364235944Sbz		csum = in_cksum_hdr(ip4);
365247104Sgallatin		if (__predict_false((csum) != 0)) {
366235944Sbz			lc->lro_bad_csum++;
367235944Sbz			return (TCP_LRO_CANNOT);
368182089Skmacy		}
369179737Sjfv	}
370179737Sjfv
371235944Sbz	/* Find the TCP header (we assured there are no IP options). */
372235944Sbz	*th = (struct tcphdr *)(ip4 + 1);
373179737Sjfv
374235944Sbz	return (0);
375235944Sbz}
376235944Sbz#endif
377179737Sjfv
378235944Sbzint
379235944Sbztcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
380235944Sbz{
381235944Sbz	struct lro_entry *le;
382235944Sbz	struct ether_header *eh;
383235944Sbz#ifdef INET6
384235944Sbz	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
385235944Sbz#endif
386235944Sbz#ifdef INET
387235944Sbz	struct ip *ip4 = NULL;		/* Keep compiler happy. */
388235944Sbz#endif
389235944Sbz	struct tcphdr *th;
390235944Sbz	void *l3hdr = NULL;		/* Keep compiler happy. */
391235944Sbz	uint32_t *ts_ptr;
392235944Sbz	tcp_seq seq;
393235944Sbz	int error, ip_len, l;
394235944Sbz	uint16_t eh_type, tcp_data_len;
395304836Ssephe	int force_flush = 0;
396179737Sjfv
397235944Sbz	/* We expect a contiguous header [eh, ip, tcp]. */
398235944Sbz
399235944Sbz	eh = mtod(m, struct ether_header *);
400235944Sbz	eh_type = ntohs(eh->ether_type);
401235944Sbz	switch (eh_type) {
402235944Sbz#ifdef INET6
403235944Sbz	case ETHERTYPE_IPV6:
404236394Sbz	{
405236394Sbz		CURVNET_SET(lc->ifp->if_vnet);
406235981Sbz		if (V_ip6_forwarding != 0) {
407235981Sbz			/* XXX-BZ stats but changing lro_ctrl is a problem. */
408236394Sbz			CURVNET_RESTORE();
409235981Sbz			return (TCP_LRO_CANNOT);
410235981Sbz		}
411236394Sbz		CURVNET_RESTORE();
412235944Sbz		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
413235944Sbz		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
414235944Sbz		if (error != 0)
415235944Sbz			return (error);
416235944Sbz		tcp_data_len = ntohs(ip6->ip6_plen);
417235944Sbz		ip_len = sizeof(*ip6) + tcp_data_len;
418235944Sbz		break;
419236394Sbz	}
420235944Sbz#endif
421235944Sbz#ifdef INET
422235944Sbz	case ETHERTYPE_IP:
423236394Sbz	{
424236394Sbz		CURVNET_SET(lc->ifp->if_vnet);
425235981Sbz		if (V_ipforwarding != 0) {
426235981Sbz			/* XXX-BZ stats but changing lro_ctrl is a problem. */
427236394Sbz			CURVNET_RESTORE();
428235981Sbz			return (TCP_LRO_CANNOT);
429235981Sbz		}
430236394Sbz		CURVNET_RESTORE();
431235944Sbz		l3hdr = ip4 = (struct ip *)(eh + 1);
432235944Sbz		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
433235944Sbz		if (error != 0)
434235944Sbz			return (error);
435235944Sbz		ip_len = ntohs(ip4->ip_len);
436235944Sbz		tcp_data_len = ip_len - sizeof(*ip4);
437235944Sbz		break;
438236394Sbz	}
439235944Sbz#endif
440235944Sbz	/* XXX-BZ what happens in case of VLAN(s)? */
441235944Sbz	default:
442235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
443179737Sjfv	}
444179737Sjfv
445235944Sbz	/*
446235944Sbz	 * If the frame is padded beyond the end of the IP packet, then we must
447235944Sbz	 * trim the extra bytes off.
448235944Sbz	 */
449235944Sbz	l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
450235944Sbz	if (l != 0) {
451235944Sbz		if (l < 0)
452235944Sbz			/* Truncated packet. */
453235944Sbz			return (TCP_LRO_CANNOT);
454179737Sjfv
455235944Sbz		m_adj(m, -l);
456235944Sbz	}
457235944Sbz
458235944Sbz	/*
459235944Sbz	 * Check TCP header constraints.
460179737Sjfv	 */
461235944Sbz	/* Ensure no bits set besides ACK or PSH. */
462304836Ssephe	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
463304836Ssephe		if (th->th_flags & TH_SYN)
464304836Ssephe			return (TCP_LRO_CANNOT);
465304836Ssephe		/*
466304836Ssephe		 * Make sure that previously seen segements/ACKs are delivered
467304836Ssephe		 * before this segement, e.g. FIN.
468304836Ssephe		 */
469304836Ssephe		force_flush = 1;
470304836Ssephe	}
471235944Sbz
472302051Ssephe	/* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
473235944Sbz	/* XXX-BZ Ideally we'd flush on PUSH? */
474235944Sbz
475235944Sbz	/*
476235944Sbz	 * Check for timestamps.
477235944Sbz	 * Since the only option we handle are timestamps, we only have to
478235944Sbz	 * handle the simple case of aligned timestamps.
479235944Sbz	 */
480235944Sbz	l = (th->th_off << 2);
481235944Sbz	tcp_data_len -= l;
482235944Sbz	l -= sizeof(*th);
483235944Sbz	ts_ptr = (uint32_t *)(th + 1);
484235944Sbz	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
485235944Sbz	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
486304836Ssephe	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
487304836Ssephe		/*
488304836Ssephe		 * Make sure that previously seen segements/ACKs are delivered
489304836Ssephe		 * before this segement.
490304836Ssephe		 */
491304836Ssephe		force_flush = 1;
492304836Ssephe	}
493235944Sbz
494235944Sbz	/* If the driver did not pass in the checksum, set it now. */
495235944Sbz	if (csum == 0x0000)
496235944Sbz		csum = th->th_sum;
497235944Sbz
498235944Sbz	seq = ntohl(th->th_seq);
499235944Sbz
500235944Sbz	/* Try to find a matching previous segment. */
501235944Sbz	SLIST_FOREACH(le, &lc->lro_active, next) {
502235944Sbz		if (le->eh_type != eh_type)
503235944Sbz			continue;
504235944Sbz		if (le->source_port != th->th_sport ||
505235944Sbz		    le->dest_port != th->th_dport)
506235944Sbz			continue;
507235944Sbz		switch (eh_type) {
508235944Sbz#ifdef INET6
509235944Sbz		case ETHERTYPE_IPV6:
510235944Sbz			if (bcmp(&le->source_ip6, &ip6->ip6_src,
511235944Sbz			    sizeof(struct in6_addr)) != 0 ||
512235944Sbz			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
513235944Sbz			    sizeof(struct in6_addr)) != 0)
514235944Sbz				continue;
515235944Sbz			break;
516235944Sbz#endif
517235944Sbz#ifdef INET
518235944Sbz		case ETHERTYPE_IP:
519235944Sbz			if (le->source_ip4 != ip4->ip_src.s_addr ||
520235944Sbz			    le->dest_ip4 != ip4->ip_dst.s_addr)
521235944Sbz				continue;
522235944Sbz			break;
523235944Sbz#endif
524179737Sjfv		}
525179737Sjfv
526304836Ssephe		if (force_flush) {
527304836Ssephe			/* Timestamps mismatch; this is a FIN, etc */
528304836Ssephe			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
529304836Ssephe			tcp_lro_flush(lc, le);
530304836Ssephe			return (TCP_LRO_CANNOT);
531304836Ssephe		}
532304836Ssephe
533235944Sbz		/* Flush now if appending will result in overflow. */
534235944Sbz		if (le->p_len > (65535 - tcp_data_len)) {
535235944Sbz			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
536235944Sbz			tcp_lro_flush(lc, le);
537235944Sbz			break;
538235944Sbz		}
539179737Sjfv
540235944Sbz		/* Try to append the new segment. */
541235944Sbz		if (__predict_false(seq != le->next_seq ||
542235944Sbz		    (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
543235944Sbz			/* Out of order packet or duplicate ACK. */
544235944Sbz			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
545235944Sbz			tcp_lro_flush(lc, le);
546235944Sbz			return (TCP_LRO_CANNOT);
547235944Sbz		}
548179737Sjfv
549235944Sbz		if (l != 0) {
550235944Sbz			uint32_t tsval = ntohl(*(ts_ptr + 1));
551235944Sbz			/* Make sure timestamp values are increasing. */
552235944Sbz			/* XXX-BZ flip and use TSTMP_GEQ macro for this? */
553235944Sbz			if (__predict_false(le->tsval > tsval ||
554235944Sbz			    *(ts_ptr + 2) == 0))
555235944Sbz				return (TCP_LRO_CANNOT);
556235944Sbz			le->tsval = tsval;
557235944Sbz			le->tsecr = *(ts_ptr + 2);
558235944Sbz		}
559223797Scperciva
560235944Sbz		le->next_seq += tcp_data_len;
561235944Sbz		le->ack_seq = th->th_ack;
562235944Sbz		le->window = th->th_win;
563235944Sbz		le->append_cnt++;
564179737Sjfv
565235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
566235944Sbz		le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
567235944Sbz		    tcp_data_len, ~csum);
568235944Sbz#endif
569179737Sjfv
570235944Sbz		if (tcp_data_len == 0) {
571235944Sbz			m_freem(m);
572235944Sbz			return (0);
573235944Sbz		}
574179737Sjfv
575235944Sbz		le->p_len += tcp_data_len;
576179737Sjfv
577235944Sbz		/*
578235944Sbz		 * Adjust the mbuf so that m_data points to the first byte of
579235944Sbz		 * the ULP payload.  Adjust the mbuf to avoid complications and
580235944Sbz		 * append new segment to existing mbuf chain.
581235944Sbz		 */
582235944Sbz		m_adj(m, m->m_pkthdr.len - tcp_data_len);
583235944Sbz		m->m_flags &= ~M_PKTHDR;
584179737Sjfv
585235944Sbz		le->m_tail->m_next = m;
586235944Sbz		le->m_tail = m_last(m);
587235944Sbz
588235944Sbz		/*
589235944Sbz		 * If a possible next full length packet would cause an
590235944Sbz		 * overflow, pro-actively flush now.
591235944Sbz		 */
592235944Sbz		if (le->p_len > (65535 - lc->ifp->if_mtu)) {
593235944Sbz			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
594235944Sbz			tcp_lro_flush(lc, le);
595255010Snp		} else
596255010Snp			getmicrotime(&le->mtime);
597235944Sbz
598235944Sbz		return (0);
599179737Sjfv	}
600179737Sjfv
601304836Ssephe	if (force_flush) {
602304836Ssephe		/*
603304836Ssephe		 * Nothing to flush, but this segment can not be further
604304836Ssephe		 * aggregated/delayed.
605304836Ssephe		 */
606304836Ssephe		return (TCP_LRO_CANNOT);
607304836Ssephe	}
608304836Ssephe
609235944Sbz	/* Try to find an empty slot. */
610235944Sbz	if (SLIST_EMPTY(&lc->lro_free))
611301949Ssephe		return (TCP_LRO_NO_ENTRIES);
612179737Sjfv
613235944Sbz	/* Start a new segment chain. */
614235944Sbz	le = SLIST_FIRST(&lc->lro_free);
615235944Sbz	SLIST_REMOVE_HEAD(&lc->lro_free, next);
616235944Sbz	SLIST_INSERT_HEAD(&lc->lro_active, le, next);
617255010Snp	getmicrotime(&le->mtime);
618179737Sjfv
619235944Sbz	/* Start filling in details. */
620235944Sbz	switch (eh_type) {
621235944Sbz#ifdef INET6
622235944Sbz	case ETHERTYPE_IPV6:
623235944Sbz		le->le_ip6 = ip6;
624235944Sbz		le->source_ip6 = ip6->ip6_src;
625235944Sbz		le->dest_ip6 = ip6->ip6_dst;
626235944Sbz		le->eh_type = eh_type;
627235944Sbz		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
628235944Sbz		break;
629235944Sbz#endif
630235944Sbz#ifdef INET
631235944Sbz	case ETHERTYPE_IP:
632235944Sbz		le->le_ip4 = ip4;
633235944Sbz		le->source_ip4 = ip4->ip_src.s_addr;
634235944Sbz		le->dest_ip4 = ip4->ip_dst.s_addr;
635235944Sbz		le->eh_type = eh_type;
636235944Sbz		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
637235944Sbz		break;
638235944Sbz#endif
639235944Sbz	}
640235944Sbz	le->source_port = th->th_sport;
641235944Sbz	le->dest_port = th->th_dport;
642235944Sbz
643235944Sbz	le->next_seq = seq + tcp_data_len;
644235944Sbz	le->ack_seq = th->th_ack;
645235944Sbz	le->window = th->th_win;
646235944Sbz	if (l != 0) {
647235944Sbz		le->timestamp = 1;
648235944Sbz		le->tsval = ntohl(*(ts_ptr + 1));
649235944Sbz		le->tsecr = *(ts_ptr + 2);
650235944Sbz	}
651235944Sbz
652235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
653235944Sbz	/*
654235944Sbz	 * Do not touch the csum of the first packet.  However save the
655235944Sbz	 * "adjusted" checksum of just the source and destination addresses,
656235944Sbz	 * the next header and the TCP payload.  The length and TCP header
657235944Sbz	 * parts may change, so we remove those from the saved checksum and
658235944Sbz	 * re-add with final values on tcp_lro_flush() if needed.
659179737Sjfv	 */
660235944Sbz	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
661235944Sbz	    __func__, le, le->ulp_csum));
662235944Sbz
663235944Sbz	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
664235944Sbz	    ~csum);
665235944Sbz	th->th_sum = csum;	/* Restore checksum on first packet. */
666235944Sbz#endif
667235944Sbz
668235944Sbz	le->m_head = m;
669235944Sbz	le->m_tail = m_last(m);
670235944Sbz
671235944Sbz	return (0);
672179737Sjfv}
673235944Sbz
674235944Sbz/* end */
675