/*-
 * Copyright (c) 2007, Myricom Inc.
 * Copyright (c) 2008, Intel Corporation.
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2016 Mellanox Technologies.
 * All rights reserved.
 *
 * Portions of this software were developed by Bjoern Zeeb
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
32179737Sjfv
33235944Sbz#include <sys/cdefs.h>
34235944Sbz__FBSDID("$FreeBSD: releng/11.0/sys/netinet/tcp_lro.c 301249 2016-06-03 08:35:07Z hselasky $");
35235944Sbz
36235944Sbz#include "opt_inet.h"
37235944Sbz#include "opt_inet6.h"
38235944Sbz
39179737Sjfv#include <sys/param.h>
40179737Sjfv#include <sys/systm.h>
41295126Sglebius#include <sys/kernel.h>
42295126Sglebius#include <sys/malloc.h>
43179737Sjfv#include <sys/mbuf.h>
44179737Sjfv#include <sys/socket.h>
45179737Sjfv
46179737Sjfv#include <net/if.h>
47235944Sbz#include <net/if_var.h>
48179737Sjfv#include <net/ethernet.h>
49236394Sbz#include <net/vnet.h>
50179737Sjfv
51179737Sjfv#include <netinet/in_systm.h>
52179737Sjfv#include <netinet/in.h>
53235944Sbz#include <netinet/ip6.h>
54179737Sjfv#include <netinet/ip.h>
55235981Sbz#include <netinet/ip_var.h>
56179737Sjfv#include <netinet/tcp.h>
57179737Sjfv#include <netinet/tcp_lro.h>
58179737Sjfv
59235981Sbz#include <netinet6/ip6_var.h>
60235981Sbz
61179737Sjfv#include <machine/in_cksum.h>
62179737Sjfv
63294327Shselaskystatic MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
64179737Sjfv
65235944Sbz#define	TCP_LRO_UPDATE_CSUM	1
66235944Sbz#ifndef	TCP_LRO_UPDATE_CSUM
67235944Sbz#define	TCP_LRO_INVALID_CSUM	0x0000
68235944Sbz#endif
69179737Sjfv
70297482Ssephestatic void	tcp_lro_rx_done(struct lro_ctrl *lc);
71297482Ssephe
72298974Ssephestatic __inline void
73298974Ssephetcp_lro_active_insert(struct lro_ctrl *lc, struct lro_entry *le)
74298974Ssephe{
75298974Ssephe
76298974Ssephe	LIST_INSERT_HEAD(&lc->lro_active, le, next);
77298974Ssephe}
78298974Ssephe
79298974Ssephestatic __inline void
80298974Ssephetcp_lro_active_remove(struct lro_entry *le)
81298974Ssephe{
82298974Ssephe
83298974Ssephe	LIST_REMOVE(le, next);
84298974Ssephe}
85298974Ssephe
86179737Sjfvint
87235944Sbztcp_lro_init(struct lro_ctrl *lc)
88179737Sjfv{
89294327Shselasky	return (tcp_lro_init_args(lc, NULL, TCP_LRO_ENTRIES, 0));
90294327Shselasky}
91294327Shselasky
92294327Shselaskyint
93294327Shselaskytcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
94294327Shselasky    unsigned lro_entries, unsigned lro_mbufs)
95294327Shselasky{
96235944Sbz	struct lro_entry *le;
97294327Shselasky	size_t size;
98294327Shselasky	unsigned i;
99179737Sjfv
100235944Sbz	lc->lro_bad_csum = 0;
101235944Sbz	lc->lro_queued = 0;
102235944Sbz	lc->lro_flushed = 0;
103235944Sbz	lc->lro_cnt = 0;
104294327Shselasky	lc->lro_mbuf_count = 0;
105294327Shselasky	lc->lro_mbuf_max = lro_mbufs;
106294327Shselasky	lc->lro_cnt = lro_entries;
107295739Ssephe	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
108295739Ssephe	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
109294327Shselasky	lc->ifp = ifp;
110297483Ssephe	LIST_INIT(&lc->lro_free);
111297483Ssephe	LIST_INIT(&lc->lro_active);
112179737Sjfv
113294327Shselasky	/* compute size to allocate */
114300731Shselasky	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
115294327Shselasky	    (lro_entries * sizeof(*le));
116300731Shselasky	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
117294327Shselasky	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);
118179737Sjfv
119294327Shselasky	/* check for out of memory */
120294327Shselasky	if (lc->lro_mbuf_data == NULL) {
121294327Shselasky		memset(lc, 0, sizeof(*lc));
122294327Shselasky		return (ENOMEM);
123294327Shselasky	}
124294327Shselasky	/* compute offset for LRO entries */
125294327Shselasky	le = (struct lro_entry *)
126294327Shselasky	    (lc->lro_mbuf_data + lro_mbufs);
127294327Shselasky
128294327Shselasky	/* setup linked list */
129294327Shselasky	for (i = 0; i != lro_entries; i++)
130297483Ssephe		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
131294327Shselasky
132294327Shselasky	return (0);
133179737Sjfv}
134179737Sjfv
135179737Sjfvvoid
136235944Sbztcp_lro_free(struct lro_ctrl *lc)
137179737Sjfv{
138235944Sbz	struct lro_entry *le;
139294327Shselasky	unsigned x;
140179737Sjfv
141294327Shselasky	/* reset LRO free list */
142297483Ssephe	LIST_INIT(&lc->lro_free);
143294327Shselasky
144294327Shselasky	/* free active mbufs, if any */
145297483Ssephe	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
146298974Ssephe		tcp_lro_active_remove(le);
147294327Shselasky		m_freem(le->m_head);
148179737Sjfv	}
149294327Shselasky
150294327Shselasky	/* free mbuf array, if any */
151294327Shselasky	for (x = 0; x != lc->lro_mbuf_count; x++)
152300731Shselasky		m_freem(lc->lro_mbuf_data[x].mb);
153294327Shselasky	lc->lro_mbuf_count = 0;
154294327Shselasky
155294327Shselasky	/* free allocated memory, if any */
156294327Shselasky	free(lc->lro_mbuf_data, M_LRO);
157294327Shselasky	lc->lro_mbuf_data = NULL;
158179737Sjfv}
159179737Sjfv
160235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
161235944Sbzstatic uint16_t
162235944Sbztcp_lro_csum_th(struct tcphdr *th)
163235944Sbz{
164235944Sbz	uint32_t ch;
165235944Sbz	uint16_t *p, l;
166235944Sbz
167235944Sbz	ch = th->th_sum = 0x0000;
168235944Sbz	l = th->th_off;
169235944Sbz	p = (uint16_t *)th;
170235944Sbz	while (l > 0) {
171235944Sbz		ch += *p;
172235944Sbz		p++;
173235944Sbz		ch += *p;
174235944Sbz		p++;
175235944Sbz		l--;
176235944Sbz	}
177235944Sbz	while (ch > 0xffff)
178235944Sbz		ch = (ch >> 16) + (ch & 0xffff);
179235944Sbz
180235944Sbz	return (ch & 0xffff);
181235944Sbz}
182235944Sbz
183235944Sbzstatic uint16_t
184235944Sbztcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
185235944Sbz    uint16_t tcp_data_len, uint16_t csum)
186235944Sbz{
187235944Sbz	uint32_t c;
188235944Sbz	uint16_t cs;
189235944Sbz
190235944Sbz	c = csum;
191235944Sbz
192235944Sbz	/* Remove length from checksum. */
193235944Sbz	switch (le->eh_type) {
194235944Sbz#ifdef INET6
195235944Sbz	case ETHERTYPE_IPV6:
196235944Sbz	{
197235944Sbz		struct ip6_hdr *ip6;
198235944Sbz
199235944Sbz		ip6 = (struct ip6_hdr *)l3hdr;
200235944Sbz		if (le->append_cnt == 0)
201235944Sbz			cs = ip6->ip6_plen;
202235944Sbz		else {
203235944Sbz			uint32_t cx;
204235944Sbz
205235944Sbz			cx = ntohs(ip6->ip6_plen);
206235944Sbz			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
207235944Sbz		}
208235944Sbz		break;
209235944Sbz	}
210235944Sbz#endif
211235944Sbz#ifdef INET
212235944Sbz	case ETHERTYPE_IP:
213235944Sbz	{
214235944Sbz		struct ip *ip4;
215235944Sbz
216235944Sbz		ip4 = (struct ip *)l3hdr;
217235944Sbz		if (le->append_cnt == 0)
218235944Sbz			cs = ip4->ip_len;
219235944Sbz		else {
220235944Sbz			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
221235944Sbz			    IPPROTO_TCP);
222235944Sbz			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
223235944Sbz			    htons(cs));
224235944Sbz		}
225235944Sbz		break;
226235944Sbz	}
227235944Sbz#endif
228235944Sbz	default:
229235944Sbz		cs = 0;		/* Keep compiler happy. */
230235944Sbz	}
231235944Sbz
232235944Sbz	cs = ~cs;
233235944Sbz	c += cs;
234235944Sbz
235235944Sbz	/* Remove TCP header csum. */
236235944Sbz	cs = ~tcp_lro_csum_th(th);
237235944Sbz	c += cs;
238235944Sbz	while (c > 0xffff)
239235944Sbz		c = (c >> 16) + (c & 0xffff);
240235944Sbz
241235944Sbz	return (c & 0xffff);
242235944Sbz}
243235944Sbz#endif
244235944Sbz
245297482Ssephestatic void
246297482Ssephetcp_lro_rx_done(struct lro_ctrl *lc)
247297482Ssephe{
248297482Ssephe	struct lro_entry *le;
249297482Ssephe
250297483Ssephe	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
251298974Ssephe		tcp_lro_active_remove(le);
252297482Ssephe		tcp_lro_flush(lc, le);
253297482Ssephe	}
254297482Ssephe}
255297482Ssephe
256179737Sjfvvoid
257255010Snptcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
258255010Snp{
259255010Snp	struct lro_entry *le, *le_tmp;
260255010Snp	struct timeval tv;
261255010Snp
262297483Ssephe	if (LIST_EMPTY(&lc->lro_active))
263255010Snp		return;
264255010Snp
265255010Snp	getmicrotime(&tv);
266255010Snp	timevalsub(&tv, timeout);
267297483Ssephe	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
268255010Snp		if (timevalcmp(&tv, &le->mtime, >=)) {
269298974Ssephe			tcp_lro_active_remove(le);
270255010Snp			tcp_lro_flush(lc, le);
271255010Snp		}
272255010Snp	}
273255010Snp}
274255010Snp
275255010Snpvoid
276235944Sbztcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
277179737Sjfv{
278179737Sjfv
279235944Sbz	if (le->append_cnt > 0) {
280235944Sbz		struct tcphdr *th;
281235944Sbz		uint16_t p_len;
282179737Sjfv
283235944Sbz		p_len = htons(le->p_len);
284235944Sbz		switch (le->eh_type) {
285235944Sbz#ifdef INET6
286235944Sbz		case ETHERTYPE_IPV6:
287235944Sbz		{
288235944Sbz			struct ip6_hdr *ip6;
289179737Sjfv
290235944Sbz			ip6 = le->le_ip6;
291235944Sbz			ip6->ip6_plen = p_len;
292235944Sbz			th = (struct tcphdr *)(ip6 + 1);
293235944Sbz			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
294235944Sbz			    CSUM_PSEUDO_HDR;
295235944Sbz			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
296235944Sbz			break;
297235944Sbz		}
298235944Sbz#endif
299235944Sbz#ifdef INET
300235944Sbz		case ETHERTYPE_IP:
301235944Sbz		{
302235944Sbz			struct ip *ip4;
303235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
304235944Sbz			uint32_t cl;
305235944Sbz			uint16_t c;
306235944Sbz#endif
307179737Sjfv
308235944Sbz			ip4 = le->le_ip4;
309235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
310235944Sbz			/* Fix IP header checksum for new length. */
311235944Sbz			c = ~ip4->ip_sum;
312235944Sbz			cl = c;
313235944Sbz			c = ~ip4->ip_len;
314235944Sbz			cl += c + p_len;
315235944Sbz			while (cl > 0xffff)
316235944Sbz				cl = (cl >> 16) + (cl & 0xffff);
317235944Sbz			c = cl;
318235944Sbz			ip4->ip_sum = ~c;
319235944Sbz#else
320235944Sbz			ip4->ip_sum = TCP_LRO_INVALID_CSUM;
321235944Sbz#endif
322235944Sbz			ip4->ip_len = p_len;
323235944Sbz			th = (struct tcphdr *)(ip4 + 1);
324235944Sbz			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
325235944Sbz			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
326235944Sbz			le->p_len += ETHER_HDR_LEN;
327235944Sbz			break;
328179737Sjfv		}
329235944Sbz#endif
330235944Sbz		default:
331235944Sbz			th = NULL;	/* Keep compiler happy. */
332235944Sbz		}
333235944Sbz		le->m_head->m_pkthdr.csum_data = 0xffff;
334235944Sbz		le->m_head->m_pkthdr.len = le->p_len;
335235944Sbz
336235944Sbz		/* Incorporate the latest ACK into the TCP header. */
337235944Sbz		th->th_ack = le->ack_seq;
338235944Sbz		th->th_win = le->window;
339235944Sbz		/* Incorporate latest timestamp into the TCP header. */
340235944Sbz		if (le->timestamp != 0) {
341235944Sbz			uint32_t *ts_ptr;
342235944Sbz
343235944Sbz			ts_ptr = (uint32_t *)(th + 1);
344235944Sbz			ts_ptr[1] = htonl(le->tsval);
345235944Sbz			ts_ptr[2] = le->tsecr;
346235944Sbz		}
347235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
348235944Sbz		/* Update the TCP header checksum. */
349235944Sbz		le->ulp_csum += p_len;
350235944Sbz		le->ulp_csum += tcp_lro_csum_th(th);
351235944Sbz		while (le->ulp_csum > 0xffff)
352235944Sbz			le->ulp_csum = (le->ulp_csum >> 16) +
353235944Sbz			    (le->ulp_csum & 0xffff);
354235944Sbz		th->th_sum = (le->ulp_csum & 0xffff);
355235944Sbz		th->th_sum = ~th->th_sum;
356235944Sbz#else
357235944Sbz		th->th_sum = TCP_LRO_INVALID_CSUM;
358235944Sbz#endif
359179737Sjfv	}
360235944Sbz
361235944Sbz	(*lc->ifp->if_input)(lc->ifp, le->m_head);
362235944Sbz	lc->lro_queued += le->append_cnt + 1;
363235944Sbz	lc->lro_flushed++;
364235944Sbz	bzero(le, sizeof(*le));
365297483Ssephe	LIST_INSERT_HEAD(&lc->lro_free, le, next);
366179737Sjfv}
367179737Sjfv
/*
 * tcp_lro_msb_64(x) yields a value in which only the most significant
 * set bit of "x" is kept.  The generic version below returns 0 for
 * x == 0.
 */
#ifdef HAVE_INLINE_FLSLL
#define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
#else
static inline uint64_t
tcp_lro_msb_64(uint64_t x)
{
	unsigned shift;

	/* smear the top set bit into every lower bit position */
	for (shift = 1; shift < 64; shift <<= 1)
		x |= (x >> shift);

	/* x is now a solid run of ones; drop all but the highest */
	return (x ^ (x >> 1));
}
#endif
383294327Shselasky
384300731Shselasky/*
385300731Shselasky * The tcp_lro_sort() routine is comparable to qsort(), except it has
386300731Shselasky * a worst case complexity limit of O(MIN(N,64)*N), where N is the
387300731Shselasky * number of elements to sort and 64 is the number of sequence bits
388300731Shselasky * available. The algorithm is bit-slicing the 64-bit sequence number,
389300731Shselasky * sorting one bit at a time from the most significant bit until the
390301249Shselasky * least significant one, skipping the constant bits. This is
391301249Shselasky * typically called a radix sort.
392300731Shselasky */
393300731Shselaskystatic void
394300731Shselaskytcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
395300731Shselasky{
396300731Shselasky	struct lro_mbuf_sort temp;
397300731Shselasky	uint64_t ones;
398300731Shselasky	uint64_t zeros;
399300731Shselasky	uint32_t x;
400300731Shselasky	uint32_t y;
401294327Shselasky
402300731Shselaskyrepeat:
403301249Shselasky	/* for small arrays insertion sort is faster */
404300731Shselasky	if (size <= 12) {
405301249Shselasky		for (x = 1; x < size; x++) {
406301249Shselasky			temp = parray[x];
407301249Shselasky			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
408301249Shselasky				parray[y] = parray[y - 1];
409301249Shselasky			parray[y] = temp;
410300731Shselasky		}
411300731Shselasky		return;
412300731Shselasky	}
413294327Shselasky
414300731Shselasky	/* compute sequence bits which are constant */
415300731Shselasky	ones = 0;
416300731Shselasky	zeros = 0;
417300731Shselasky	for (x = 0; x != size; x++) {
418300731Shselasky		ones |= parray[x].seq;
419300731Shselasky		zeros |= ~parray[x].seq;
420300731Shselasky	}
421300731Shselasky
422300731Shselasky	/* compute bits which are not constant into "ones" */
423300731Shselasky	ones &= zeros;
424300731Shselasky	if (ones == 0)
425300731Shselasky		return;
426300731Shselasky
427300731Shselasky	/* pick the most significant bit which is not constant */
428300731Shselasky	ones = tcp_lro_msb_64(ones);
429300731Shselasky
430300731Shselasky	/*
431300731Shselasky	 * Move entries having cleared sequence bits to the beginning
432300731Shselasky	 * of the array:
433300731Shselasky	 */
434300731Shselasky	for (x = y = 0; y != size; y++) {
435300731Shselasky		/* skip set bits */
436300731Shselasky		if (parray[y].seq & ones)
437300731Shselasky			continue;
438300731Shselasky		/* swap entries */
439300731Shselasky		temp = parray[x];
440300731Shselasky		parray[x] = parray[y];
441300731Shselasky		parray[y] = temp;
442300731Shselasky		x++;
443300731Shselasky	}
444300731Shselasky
445300731Shselasky	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
446300731Shselasky
447300731Shselasky	/* sort zeros */
448300731Shselasky	tcp_lro_sort(parray, x);
449300731Shselasky
450300731Shselasky	/* sort ones */
451300731Shselasky	parray += x;
452300731Shselasky	size -= x;
453300731Shselasky	goto repeat;
454294327Shselasky}
455294327Shselasky
456294327Shselaskyvoid
457294327Shselaskytcp_lro_flush_all(struct lro_ctrl *lc)
458294327Shselasky{
459300731Shselasky	uint64_t seq;
460300731Shselasky	uint64_t nseq;
461294327Shselasky	unsigned x;
462294327Shselasky
463294327Shselasky	/* check if no mbufs to flush */
464297482Ssephe	if (lc->lro_mbuf_count == 0)
465294327Shselasky		goto done;
466294327Shselasky
467294327Shselasky	/* sort all mbufs according to stream */
468300731Shselasky	tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
469294327Shselasky
470294327Shselasky	/* input data into LRO engine, stream by stream */
471300731Shselasky	seq = 0;
472294327Shselasky	for (x = 0; x != lc->lro_mbuf_count; x++) {
473294327Shselasky		struct mbuf *mb;
474294327Shselasky
475300731Shselasky		/* get mbuf */
476300731Shselasky		mb = lc->lro_mbuf_data[x].mb;
477294327Shselasky
478300731Shselasky		/* get sequence number, masking away the packet index */
479300731Shselasky		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
480300731Shselasky
481294327Shselasky		/* check for new stream */
482300731Shselasky		if (seq != nseq) {
483300731Shselasky			seq = nseq;
484294327Shselasky
485294327Shselasky			/* flush active streams */
486297482Ssephe			tcp_lro_rx_done(lc);
487294327Shselasky		}
488300731Shselasky
489294327Shselasky		/* add packet to LRO engine */
490294327Shselasky		if (tcp_lro_rx(lc, mb, 0) != 0) {
491294327Shselasky			/* input packet to network layer */
492294327Shselasky			(*lc->ifp->if_input)(lc->ifp, mb);
493294327Shselasky			lc->lro_queued++;
494294327Shselasky			lc->lro_flushed++;
495294327Shselasky		}
496294327Shselasky	}
497294327Shselaskydone:
498294327Shselasky	/* flush active streams */
499297482Ssephe	tcp_lro_rx_done(lc);
500297482Ssephe
501294327Shselasky	lc->lro_mbuf_count = 0;
502294327Shselasky}
503294327Shselasky
504235944Sbz#ifdef INET6
505235944Sbzstatic int
506235944Sbztcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
507235944Sbz    struct tcphdr **th)
508179737Sjfv{
509179737Sjfv
510235944Sbz	/* XXX-BZ we should check the flow-label. */
511179737Sjfv
512235944Sbz	/* XXX-BZ We do not yet support ext. hdrs. */
513235944Sbz	if (ip6->ip6_nxt != IPPROTO_TCP)
514235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
515179737Sjfv
516235944Sbz	/* Find the TCP header. */
517235944Sbz	*th = (struct tcphdr *)(ip6 + 1);
518179737Sjfv
519235944Sbz	return (0);
520235944Sbz}
521235944Sbz#endif
522235944Sbz
523235944Sbz#ifdef INET
524235944Sbzstatic int
525235944Sbztcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
526235944Sbz    struct tcphdr **th)
527235944Sbz{
528235944Sbz	int csum_flags;
529235944Sbz	uint16_t csum;
530235944Sbz
531235944Sbz	if (ip4->ip_p != IPPROTO_TCP)
532235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
533235944Sbz
534235944Sbz	/* Ensure there are no options. */
535235944Sbz	if ((ip4->ip_hl << 2) != sizeof (*ip4))
536235944Sbz		return (TCP_LRO_CANNOT);
537235944Sbz
538235944Sbz	/* .. and the packet is not fragmented. */
539235944Sbz	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
540235944Sbz		return (TCP_LRO_CANNOT);
541235944Sbz
542235944Sbz	/* Legacy IP has a header checksum that needs to be correct. */
543235944Sbz	csum_flags = m->m_pkthdr.csum_flags;
544182089Skmacy	if (csum_flags & CSUM_IP_CHECKED) {
545182089Skmacy		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
546235944Sbz			lc->lro_bad_csum++;
547235944Sbz			return (TCP_LRO_CANNOT);
548182089Skmacy		}
549182089Skmacy	} else {
550235944Sbz		csum = in_cksum_hdr(ip4);
551247104Sgallatin		if (__predict_false((csum) != 0)) {
552235944Sbz			lc->lro_bad_csum++;
553235944Sbz			return (TCP_LRO_CANNOT);
554182089Skmacy		}
555179737Sjfv	}
556179737Sjfv
557235944Sbz	/* Find the TCP header (we assured there are no IP options). */
558235944Sbz	*th = (struct tcphdr *)(ip4 + 1);
559179737Sjfv
560235944Sbz	return (0);
561235944Sbz}
562235944Sbz#endif
563179737Sjfv
564235944Sbzint
565235944Sbztcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
566235944Sbz{
567235944Sbz	struct lro_entry *le;
568235944Sbz	struct ether_header *eh;
569235944Sbz#ifdef INET6
570235944Sbz	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
571235944Sbz#endif
572235944Sbz#ifdef INET
573235944Sbz	struct ip *ip4 = NULL;		/* Keep compiler happy. */
574235944Sbz#endif
575235944Sbz	struct tcphdr *th;
576235944Sbz	void *l3hdr = NULL;		/* Keep compiler happy. */
577235944Sbz	uint32_t *ts_ptr;
578235944Sbz	tcp_seq seq;
579235944Sbz	int error, ip_len, l;
580235944Sbz	uint16_t eh_type, tcp_data_len;
581179737Sjfv
582235944Sbz	/* We expect a contiguous header [eh, ip, tcp]. */
583235944Sbz
584235944Sbz	eh = mtod(m, struct ether_header *);
585235944Sbz	eh_type = ntohs(eh->ether_type);
586235944Sbz	switch (eh_type) {
587235944Sbz#ifdef INET6
588235944Sbz	case ETHERTYPE_IPV6:
589236394Sbz	{
590236394Sbz		CURVNET_SET(lc->ifp->if_vnet);
591235981Sbz		if (V_ip6_forwarding != 0) {
592235981Sbz			/* XXX-BZ stats but changing lro_ctrl is a problem. */
593236394Sbz			CURVNET_RESTORE();
594235981Sbz			return (TCP_LRO_CANNOT);
595235981Sbz		}
596236394Sbz		CURVNET_RESTORE();
597235944Sbz		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
598235944Sbz		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
599235944Sbz		if (error != 0)
600235944Sbz			return (error);
601235944Sbz		tcp_data_len = ntohs(ip6->ip6_plen);
602235944Sbz		ip_len = sizeof(*ip6) + tcp_data_len;
603235944Sbz		break;
604236394Sbz	}
605235944Sbz#endif
606235944Sbz#ifdef INET
607235944Sbz	case ETHERTYPE_IP:
608236394Sbz	{
609236394Sbz		CURVNET_SET(lc->ifp->if_vnet);
610235981Sbz		if (V_ipforwarding != 0) {
611235981Sbz			/* XXX-BZ stats but changing lro_ctrl is a problem. */
612236394Sbz			CURVNET_RESTORE();
613235981Sbz			return (TCP_LRO_CANNOT);
614235981Sbz		}
615236394Sbz		CURVNET_RESTORE();
616235944Sbz		l3hdr = ip4 = (struct ip *)(eh + 1);
617235944Sbz		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
618235944Sbz		if (error != 0)
619235944Sbz			return (error);
620235944Sbz		ip_len = ntohs(ip4->ip_len);
621235944Sbz		tcp_data_len = ip_len - sizeof(*ip4);
622235944Sbz		break;
623236394Sbz	}
624235944Sbz#endif
625235944Sbz	/* XXX-BZ what happens in case of VLAN(s)? */
626235944Sbz	default:
627235944Sbz		return (TCP_LRO_NOT_SUPPORTED);
628179737Sjfv	}
629179737Sjfv
630235944Sbz	/*
631235944Sbz	 * If the frame is padded beyond the end of the IP packet, then we must
632235944Sbz	 * trim the extra bytes off.
633235944Sbz	 */
634235944Sbz	l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
635235944Sbz	if (l != 0) {
636235944Sbz		if (l < 0)
637235944Sbz			/* Truncated packet. */
638235944Sbz			return (TCP_LRO_CANNOT);
639179737Sjfv
640235944Sbz		m_adj(m, -l);
641235944Sbz	}
642235944Sbz
643235944Sbz	/*
644235944Sbz	 * Check TCP header constraints.
645179737Sjfv	 */
646235944Sbz	/* Ensure no bits set besides ACK or PSH. */
647235944Sbz	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
648235944Sbz		return (TCP_LRO_CANNOT);
649235944Sbz
650298730Ssephe	/* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
651235944Sbz	/* XXX-BZ Ideally we'd flush on PUSH? */
652235944Sbz
653235944Sbz	/*
654235944Sbz	 * Check for timestamps.
655235944Sbz	 * Since the only option we handle are timestamps, we only have to
656235944Sbz	 * handle the simple case of aligned timestamps.
657235944Sbz	 */
658235944Sbz	l = (th->th_off << 2);
659235944Sbz	tcp_data_len -= l;
660235944Sbz	l -= sizeof(*th);
661235944Sbz	ts_ptr = (uint32_t *)(th + 1);
662235944Sbz	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
663235944Sbz	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
664235944Sbz	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
665235944Sbz		return (TCP_LRO_CANNOT);
666235944Sbz
667235944Sbz	/* If the driver did not pass in the checksum, set it now. */
668235944Sbz	if (csum == 0x0000)
669235944Sbz		csum = th->th_sum;
670235944Sbz
671235944Sbz	seq = ntohl(th->th_seq);
672235944Sbz
673235944Sbz	/* Try to find a matching previous segment. */
674297483Ssephe	LIST_FOREACH(le, &lc->lro_active, next) {
675235944Sbz		if (le->eh_type != eh_type)
676235944Sbz			continue;
677235944Sbz		if (le->source_port != th->th_sport ||
678235944Sbz		    le->dest_port != th->th_dport)
679235944Sbz			continue;
680235944Sbz		switch (eh_type) {
681235944Sbz#ifdef INET6
682235944Sbz		case ETHERTYPE_IPV6:
683235944Sbz			if (bcmp(&le->source_ip6, &ip6->ip6_src,
684235944Sbz			    sizeof(struct in6_addr)) != 0 ||
685235944Sbz			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
686235944Sbz			    sizeof(struct in6_addr)) != 0)
687235944Sbz				continue;
688235944Sbz			break;
689235944Sbz#endif
690235944Sbz#ifdef INET
691235944Sbz		case ETHERTYPE_IP:
692235944Sbz			if (le->source_ip4 != ip4->ip_src.s_addr ||
693235944Sbz			    le->dest_ip4 != ip4->ip_dst.s_addr)
694235944Sbz				continue;
695235944Sbz			break;
696235944Sbz#endif
697179737Sjfv		}
698179737Sjfv
699235944Sbz		/* Flush now if appending will result in overflow. */
700295739Ssephe		if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
701298974Ssephe			tcp_lro_active_remove(le);
702235944Sbz			tcp_lro_flush(lc, le);
703235944Sbz			break;
704235944Sbz		}
705179737Sjfv
706235944Sbz		/* Try to append the new segment. */
707235944Sbz		if (__predict_false(seq != le->next_seq ||
708235944Sbz		    (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
709235944Sbz			/* Out of order packet or duplicate ACK. */
710298974Ssephe			tcp_lro_active_remove(le);
711235944Sbz			tcp_lro_flush(lc, le);
712235944Sbz			return (TCP_LRO_CANNOT);
713235944Sbz		}
714179737Sjfv
715235944Sbz		if (l != 0) {
716235944Sbz			uint32_t tsval = ntohl(*(ts_ptr + 1));
717235944Sbz			/* Make sure timestamp values are increasing. */
718235944Sbz			/* XXX-BZ flip and use TSTMP_GEQ macro for this? */
719235944Sbz			if (__predict_false(le->tsval > tsval ||
720235944Sbz			    *(ts_ptr + 2) == 0))
721235944Sbz				return (TCP_LRO_CANNOT);
722235944Sbz			le->tsval = tsval;
723235944Sbz			le->tsecr = *(ts_ptr + 2);
724235944Sbz		}
725223797Scperciva
726235944Sbz		le->next_seq += tcp_data_len;
727235944Sbz		le->ack_seq = th->th_ack;
728235944Sbz		le->window = th->th_win;
729235944Sbz		le->append_cnt++;
730179737Sjfv
731235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
732235944Sbz		le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
733235944Sbz		    tcp_data_len, ~csum);
734235944Sbz#endif
735179737Sjfv
736235944Sbz		if (tcp_data_len == 0) {
737235944Sbz			m_freem(m);
738295739Ssephe			/*
739295739Ssephe			 * Flush this LRO entry, if this ACK should not
740295739Ssephe			 * be further delayed.
741295739Ssephe			 */
742295739Ssephe			if (le->append_cnt >= lc->lro_ackcnt_lim) {
743298974Ssephe				tcp_lro_active_remove(le);
744295739Ssephe				tcp_lro_flush(lc, le);
745295739Ssephe			}
746235944Sbz			return (0);
747235944Sbz		}
748179737Sjfv
749235944Sbz		le->p_len += tcp_data_len;
750179737Sjfv
751235944Sbz		/*
752235944Sbz		 * Adjust the mbuf so that m_data points to the first byte of
753235944Sbz		 * the ULP payload.  Adjust the mbuf to avoid complications and
754235944Sbz		 * append new segment to existing mbuf chain.
755235944Sbz		 */
756235944Sbz		m_adj(m, m->m_pkthdr.len - tcp_data_len);
757284961Snp		m_demote_pkthdr(m);
758179737Sjfv
759235944Sbz		le->m_tail->m_next = m;
760235944Sbz		le->m_tail = m_last(m);
761235944Sbz
762235944Sbz		/*
763235944Sbz		 * If a possible next full length packet would cause an
764235944Sbz		 * overflow, pro-actively flush now.
765235944Sbz		 */
766295739Ssephe		if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
767298974Ssephe			tcp_lro_active_remove(le);
768235944Sbz			tcp_lro_flush(lc, le);
769255010Snp		} else
770255010Snp			getmicrotime(&le->mtime);
771235944Sbz
772235944Sbz		return (0);
773179737Sjfv	}
774179737Sjfv
775235944Sbz	/* Try to find an empty slot. */
776297483Ssephe	if (LIST_EMPTY(&lc->lro_free))
777297265Ssephe		return (TCP_LRO_NO_ENTRIES);
778179737Sjfv
779235944Sbz	/* Start a new segment chain. */
780297483Ssephe	le = LIST_FIRST(&lc->lro_free);
781297483Ssephe	LIST_REMOVE(le, next);
782298974Ssephe	tcp_lro_active_insert(lc, le);
783255010Snp	getmicrotime(&le->mtime);
784179737Sjfv
785235944Sbz	/* Start filling in details. */
786235944Sbz	switch (eh_type) {
787235944Sbz#ifdef INET6
788235944Sbz	case ETHERTYPE_IPV6:
789235944Sbz		le->le_ip6 = ip6;
790235944Sbz		le->source_ip6 = ip6->ip6_src;
791235944Sbz		le->dest_ip6 = ip6->ip6_dst;
792235944Sbz		le->eh_type = eh_type;
793235944Sbz		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
794235944Sbz		break;
795235944Sbz#endif
796235944Sbz#ifdef INET
797235944Sbz	case ETHERTYPE_IP:
798235944Sbz		le->le_ip4 = ip4;
799235944Sbz		le->source_ip4 = ip4->ip_src.s_addr;
800235944Sbz		le->dest_ip4 = ip4->ip_dst.s_addr;
801235944Sbz		le->eh_type = eh_type;
802235944Sbz		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
803235944Sbz		break;
804235944Sbz#endif
805235944Sbz	}
806235944Sbz	le->source_port = th->th_sport;
807235944Sbz	le->dest_port = th->th_dport;
808235944Sbz
809235944Sbz	le->next_seq = seq + tcp_data_len;
810235944Sbz	le->ack_seq = th->th_ack;
811235944Sbz	le->window = th->th_win;
812235944Sbz	if (l != 0) {
813235944Sbz		le->timestamp = 1;
814235944Sbz		le->tsval = ntohl(*(ts_ptr + 1));
815235944Sbz		le->tsecr = *(ts_ptr + 2);
816235944Sbz	}
817235944Sbz
818235944Sbz#ifdef TCP_LRO_UPDATE_CSUM
819235944Sbz	/*
820235944Sbz	 * Do not touch the csum of the first packet.  However save the
821235944Sbz	 * "adjusted" checksum of just the source and destination addresses,
822235944Sbz	 * the next header and the TCP payload.  The length and TCP header
823235944Sbz	 * parts may change, so we remove those from the saved checksum and
824235944Sbz	 * re-add with final values on tcp_lro_flush() if needed.
825179737Sjfv	 */
826235944Sbz	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
827235944Sbz	    __func__, le, le->ulp_csum));
828235944Sbz
829235944Sbz	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
830235944Sbz	    ~csum);
831235944Sbz	th->th_sum = csum;	/* Restore checksum on first packet. */
832235944Sbz#endif
833235944Sbz
834235944Sbz	le->m_head = m;
835235944Sbz	le->m_tail = m_last(m);
836235944Sbz
837235944Sbz	return (0);
838179737Sjfv}
839235944Sbz
840294327Shselaskyvoid
841294327Shselaskytcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
842294327Shselasky{
843294327Shselasky	/* sanity checks */
844294327Shselasky	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
845294327Shselasky	    lc->lro_mbuf_max == 0)) {
846294327Shselasky		/* packet drop */
847294327Shselasky		m_freem(mb);
848294327Shselasky		return;
849294327Shselasky	}
850294327Shselasky
851294327Shselasky	/* check if packet is not LRO capable */
852294327Shselasky	if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
853294327Shselasky	    (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
854294327Shselasky		lc->lro_flushed++;
855294327Shselasky		lc->lro_queued++;
856294327Shselasky
857294327Shselasky		/* input packet to network layer */
858294327Shselasky		(*lc->ifp->if_input) (lc->ifp, mb);
859294327Shselasky		return;
860294327Shselasky	}
861294327Shselasky
862294327Shselasky	/* check if array is full */
863294327Shselasky	if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
864294327Shselasky		tcp_lro_flush_all(lc);
865294327Shselasky
866300731Shselasky	/* create sequence number */
867300731Shselasky	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
868300731Shselasky	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
869300731Shselasky	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
870300731Shselasky	    ((uint64_t)lc->lro_mbuf_count);
871294327Shselasky
872294327Shselasky	/* enter mbuf */
873300731Shselasky	lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb;
874294327Shselasky}
875294327Shselasky
876235944Sbz/* end */
877