tcp_lro.c revision 182089
/******************************************************************************

Copyright (c) 2007, Myricom Inc.
Copyright (c) 2008, Intel Corporation.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

 3. Neither the name of the Intel Corporation, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

$FreeBSD: head/sys/netinet/tcp_lro.c 182089 2008-08-24 02:31:09Z kmacy $
***************************************************************************/
35179737Sjfv
36179737Sjfv#include <sys/param.h>
37179737Sjfv#include <sys/systm.h>
38179737Sjfv#include <sys/endian.h>
39179737Sjfv#include <sys/mbuf.h>
40179737Sjfv#include <sys/kernel.h>
41179737Sjfv#include <sys/socket.h>
42179737Sjfv
43179737Sjfv#include <net/if.h>
44179737Sjfv#include <net/ethernet.h>
45179737Sjfv#include <net/if_media.h>
46179737Sjfv
47179737Sjfv#include <netinet/in_systm.h>
48179737Sjfv#include <netinet/in.h>
49179737Sjfv#include <netinet/ip.h>
50179737Sjfv#include <netinet/tcp.h>
51179737Sjfv#include <netinet/tcp_lro.h>
52179737Sjfv
53179737Sjfv#include <machine/bus.h>
54179737Sjfv#include <machine/in_cksum.h>
55179737Sjfv
56179737Sjfv
/*
 * Compute the folded 16-bit one's-complement sum over 'len' bytes at 'raw'.
 *
 * The 16-bit words are added exactly as they sit in memory and the result
 * is NOT inverted; callers XOR with 0xffff when they need the checksum
 * field value.
 *
 * Only 'len' bytes are ever read: a trailing 16-bit word is summed on its
 * own and a trailing odd byte is zero-padded on the right, so lengths that
 * are not a multiple of 4 no longer read past the end of the buffer.  For
 * the header lengths used in this file (multiples of 4) the result is
 * unchanged.  A 'len' <= 0 yields 0.
 */
static uint16_t do_csum_data(uint16_t *raw, int len)
{
	uint32_t csum = 0;

	/* sum whole 16-bit words */
	while (len > 1) {
		csum += *raw++;
		len -= 2;
	}

	/* odd trailing byte: pad with a zero byte after it, in memory order */
	if (len == 1) {
		union {
			uint8_t b[2];
			uint16_t w;
		} pad;

		pad.b[0] = *(uint8_t *)raw;
		pad.b[1] = 0;
		csum += pad.w;
	}

	/* fold the carries back in; twice, since the first fold can carry */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return (uint16_t)csum;
}
72179737Sjfv
73179737Sjfv/*
74179737Sjfv * Allocate and init the LRO data structures
75179737Sjfv */
76179737Sjfvint
77179737Sjfvtcp_lro_init(struct lro_ctrl *cntl)
78179737Sjfv{
79179737Sjfv	struct lro_entry *lro;
80179737Sjfv	int i, error = 0;
81179737Sjfv
82179737Sjfv	SLIST_INIT(&cntl->lro_free);
83179737Sjfv	SLIST_INIT(&cntl->lro_active);
84179737Sjfv
85179737Sjfv	cntl->lro_bad_csum = 0;
86179737Sjfv	cntl->lro_queued = 0;
87179737Sjfv	cntl->lro_flushed = 0;
88179737Sjfv
89179737Sjfv	for (i = 0; i < LRO_ENTRIES; i++) {
90179737Sjfv                lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91179737Sjfv		    M_DEVBUF, M_NOWAIT | M_ZERO);
92179737Sjfv                if (lro == NULL) {
93179737Sjfv			if (i == 0)
94179737Sjfv				error = ENOMEM;
95179737Sjfv                        break;
96179737Sjfv                }
97179737Sjfv		cntl->lro_cnt = i;
98179737Sjfv                SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99179737Sjfv        }
100179737Sjfv
101179737Sjfv	return (error);
102179737Sjfv}
103179737Sjfv
104179737Sjfvvoid
105179737Sjfvtcp_lro_free(struct lro_ctrl *cntl)
106179737Sjfv{
107179737Sjfv	struct lro_entry *entry;
108179737Sjfv
109179737Sjfv	while (!SLIST_EMPTY(&cntl->lro_free)) {
110179737Sjfv		entry = SLIST_FIRST(&cntl->lro_free);
111179737Sjfv               	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112179737Sjfv		free(entry, M_DEVBUF);
113179737Sjfv	}
114179737Sjfv}
115179737Sjfv
116179737Sjfvvoid
117179737Sjfvtcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118179737Sjfv{
119179737Sjfv	struct ifnet *ifp;
120179737Sjfv	struct ip *ip;
121179737Sjfv	struct tcphdr *tcp;
122179737Sjfv	uint32_t *ts_ptr;
123179737Sjfv	uint32_t tcplen, tcp_csum;
124179737Sjfv
125179737Sjfv
126179737Sjfv	if (lro->append_cnt) {
127179737Sjfv		/* incorporate the new len into the ip header and
128179737Sjfv		 * re-calculate the checksum */
129179737Sjfv		ip = lro->ip;
130179737Sjfv		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131179737Sjfv		ip->ip_sum = 0;
132179737Sjfv		ip->ip_sum = 0xffff ^
133179737Sjfv			do_csum_data((uint16_t*)ip,
134179737Sjfv					      sizeof (*ip));
135179737Sjfv
136179737Sjfv		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137179737Sjfv			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138179737Sjfv		lro->m_head->m_pkthdr.csum_data = 0xffff;
139179737Sjfv		lro->m_head->m_pkthdr.len = lro->len;
140179737Sjfv
141179737Sjfv		/* incorporate the latest ack into the tcp header */
142179737Sjfv		tcp = (struct tcphdr *) (ip + 1);
143179737Sjfv		tcp->th_ack = lro->ack_seq;
144179737Sjfv		tcp->th_win = lro->window;
145179737Sjfv		/* incorporate latest timestamp into the tcp header */
146179737Sjfv		if (lro->timestamp) {
147179737Sjfv			ts_ptr = (uint32_t *)(tcp + 1);
148179737Sjfv			ts_ptr[1] = htonl(lro->tsval);
149179737Sjfv			ts_ptr[2] = lro->tsecr;
150179737Sjfv		}
151179737Sjfv		/*
152179737Sjfv		 * update checksum in tcp header by re-calculating the
153179737Sjfv		 * tcp pseudoheader checksum, and adding it to the checksum
154179737Sjfv		 * of the tcp payload data
155179737Sjfv		 */
156179737Sjfv		tcp->th_sum = 0;
157179737Sjfv		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158179737Sjfv		tcp_csum = lro->data_csum;
159179737Sjfv		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160179737Sjfv				      htons(tcplen + IPPROTO_TCP));
161179737Sjfv		tcp_csum += do_csum_data((uint16_t*)tcp,
162179737Sjfv						  tcp->th_off << 2);
163179737Sjfv		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164179737Sjfv		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165179737Sjfv		tcp->th_sum = 0xffff ^ tcp_csum;
166179737Sjfv	}
167179737Sjfv	ifp = cntl->ifp;
168179737Sjfv	(*ifp->if_input)(cntl->ifp, lro->m_head);
169179737Sjfv	cntl->lro_queued += lro->append_cnt + 1;
170179737Sjfv	cntl->lro_flushed++;
171179737Sjfv	lro->m_head = NULL;
172179737Sjfv	lro->timestamp = 0;
173179737Sjfv	lro->append_cnt = 0;
174179737Sjfv	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175179737Sjfv}
176179737Sjfv
177179737Sjfvint
178179737Sjfvtcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179179737Sjfv{
180179737Sjfv	struct ether_header *eh;
181179737Sjfv	struct ip *ip;
182179737Sjfv	struct tcphdr *tcp;
183179737Sjfv	uint32_t *ts_ptr;
184179737Sjfv	struct mbuf *m_nxt, *m_tail;
185179737Sjfv	struct lro_entry *lro;
186179737Sjfv	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187182089Skmacy	int opt_bytes, trim, csum_flags;
188179737Sjfv	uint32_t seq, tmp_csum, device_mtu;
189179737Sjfv
190179737Sjfv
191179737Sjfv	eh = mtod(m_head, struct ether_header *);
192179737Sjfv	if (eh->ether_type != htons(ETHERTYPE_IP))
193179737Sjfv		return 1;
194179737Sjfv	ip = (struct ip *) (eh + 1);
195179737Sjfv	if (ip->ip_p != IPPROTO_TCP)
196179737Sjfv		return 1;
197179737Sjfv
198179737Sjfv	/* ensure there are no options */
199179737Sjfv	if ((ip->ip_hl << 2) != sizeof (*ip))
200179737Sjfv		return -1;
201179737Sjfv
202179737Sjfv	/* .. and the packet is not fragmented */
203179737Sjfv	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204179737Sjfv		return -1;
205179737Sjfv
206179737Sjfv	/* verify that the IP header checksum is correct */
207182089Skmacy	csum_flags = m_head->m_pkthdr.csum_flags;
208182089Skmacy	if (csum_flags & CSUM_IP_CHECKED) {
209182089Skmacy		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210182089Skmacy			cntl->lro_bad_csum++;
211182089Skmacy			return -1;
212182089Skmacy		}
213182089Skmacy	} else {
214182089Skmacy		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215182089Skmacy		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216182089Skmacy			cntl->lro_bad_csum++;
217182089Skmacy			return -1;
218182089Skmacy		}
219179737Sjfv	}
220182089Skmacy
221179737Sjfv	/* find the TCP header */
222179737Sjfv	tcp = (struct tcphdr *) (ip + 1);
223179737Sjfv
224179737Sjfv	/* Get the TCP checksum if we dont have it */
225179737Sjfv	if (!csum)
226179737Sjfv		csum = tcp->th_sum;
227179737Sjfv
228179737Sjfv	/* ensure no bits set besides ack or psh */
229179737Sjfv	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230179737Sjfv		return -1;
231179737Sjfv
232179737Sjfv	/* check for timestamps. Since the only option we handle are
233179737Sjfv	   timestamps, we only have to handle the simple case of
234179737Sjfv	   aligned timestamps */
235179737Sjfv
236179737Sjfv	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237179737Sjfv	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238179737Sjfv	ts_ptr = (uint32_t *)(tcp + 1);
239179737Sjfv	if (opt_bytes != 0) {
240179737Sjfv		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241179737Sjfv		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242179737Sjfv		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243179737Sjfv			return -1;
244179737Sjfv	}
245179737Sjfv
246179737Sjfv	ip_len = ntohs(ip->ip_len);
247179737Sjfv	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248179737Sjfv
249179737Sjfv
250179737Sjfv	/*
251179737Sjfv	 * If frame is padded beyond the end of the IP packet,
252179737Sjfv	 * then we must trim the extra bytes off the end.
253179737Sjfv	 */
254179737Sjfv	tot_len = m_head->m_pkthdr.len;
255179737Sjfv	trim = tot_len - (ip_len + ETHER_HDR_LEN);
256179737Sjfv	if (trim != 0) {
257179737Sjfv		if (trim < 0) {
258179737Sjfv			/* truncated packet */
259179737Sjfv			return -1;
260179737Sjfv		}
261179737Sjfv		m_adj(m_head, -trim);
262179737Sjfv		tot_len = m_head->m_pkthdr.len;
263179737Sjfv	}
264179737Sjfv
265179737Sjfv	m_nxt = m_head;
266179737Sjfv	m_tail = NULL; /* -Wuninitialized */
267179737Sjfv	while (m_nxt != NULL) {
268179737Sjfv		m_tail = m_nxt;
269179737Sjfv		m_nxt = m_tail->m_next;
270179737Sjfv	}
271179737Sjfv
272179737Sjfv	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273179737Sjfv	seq = ntohl(tcp->th_seq);
274179737Sjfv
275179737Sjfv	SLIST_FOREACH(lro, &cntl->lro_active, next) {
276179737Sjfv		if (lro->source_port == tcp->th_sport &&
277179737Sjfv		    lro->dest_port == tcp->th_dport &&
278179737Sjfv		    lro->source_ip == ip->ip_src.s_addr &&
279179737Sjfv		    lro->dest_ip == ip->ip_dst.s_addr) {
280179737Sjfv			/* Try to append it */
281179737Sjfv
282179737Sjfv			if (__predict_false(seq != lro->next_seq)) {
283179737Sjfv				/* out of order packet */
284179737Sjfv				SLIST_REMOVE(&cntl->lro_active, lro,
285179737Sjfv					     lro_entry, next);
286179737Sjfv				tcp_lro_flush(cntl, lro);
287179737Sjfv				return -1;
288179737Sjfv			}
289179737Sjfv
290179737Sjfv			if (opt_bytes) {
291179737Sjfv				uint32_t tsval = ntohl(*(ts_ptr + 1));
292179737Sjfv				/* make sure timestamp values are increasing */
293179737Sjfv				if (__predict_false(lro->tsval > tsval ||
294179737Sjfv					     *(ts_ptr + 2) == 0)) {
295179737Sjfv					return -1;
296179737Sjfv				}
297179737Sjfv				lro->tsval = tsval;
298179737Sjfv				lro->tsecr = *(ts_ptr + 2);
299179737Sjfv			}
300179737Sjfv
301179737Sjfv			lro->next_seq += tcp_data_len;
302179737Sjfv			lro->ack_seq = tcp->th_ack;
303179737Sjfv			lro->window = tcp->th_win;
304179737Sjfv			lro->append_cnt++;
305179737Sjfv			if (tcp_data_len == 0) {
306179737Sjfv				m_freem(m_head);
307179737Sjfv				return 0;
308179737Sjfv			}
309179737Sjfv			/* subtract off the checksum of the tcp header
310179737Sjfv                         * from the hardware checksum, and add it to the
311179737Sjfv                         * stored tcp data checksum.  Byteswap the checksum
312179737Sjfv			 * if the total length so far is odd
313179737Sjfv                         */
314179737Sjfv			tmp_csum = do_csum_data((uint16_t*)tcp,
315179737Sjfv							 tcp_hdr_len);
316179737Sjfv			csum = csum + (tmp_csum ^ 0xffff);
317179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
318179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
319179737Sjfv			if (lro->len & 0x1) {
320179737Sjfv				/* Odd number of bytes so far, flip bytes */
321179737Sjfv				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
322179737Sjfv			}
323179737Sjfv			csum = csum + lro->data_csum;
324179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
325179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
326179737Sjfv			lro->data_csum = csum;
327179737Sjfv
328179737Sjfv			lro->len += tcp_data_len;
329179737Sjfv
330179737Sjfv			/* adjust mbuf so that m->m_data points to
331179737Sjfv			   the first byte of the payload */
332179737Sjfv			m_adj(m_head, hlen);
333179737Sjfv			/* append mbuf chain */
334179737Sjfv			lro->m_tail->m_next = m_head;
335179737Sjfv			/* advance the last pointer */
336179737Sjfv			lro->m_tail = m_tail;
337179737Sjfv			/* flush packet if required */
338179737Sjfv			device_mtu = cntl->ifp->if_mtu;
339179737Sjfv			if (lro->len > (65535 - device_mtu)) {
340179737Sjfv				SLIST_REMOVE(&cntl->lro_active, lro,
341179737Sjfv					     lro_entry, next);
342179737Sjfv				tcp_lro_flush(cntl, lro);
343179737Sjfv			}
344179737Sjfv			return 0;
345179737Sjfv		}
346179737Sjfv	}
347179737Sjfv
348179737Sjfv	if (SLIST_EMPTY(&cntl->lro_free))
349179737Sjfv	    return -1;
350179737Sjfv
351179737Sjfv	/* start a new chain */
352179737Sjfv	lro = SLIST_FIRST(&cntl->lro_free);
353179737Sjfv	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
354179737Sjfv	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
355179737Sjfv	lro->source_port = tcp->th_sport;
356179737Sjfv	lro->dest_port = tcp->th_dport;
357179737Sjfv	lro->source_ip = ip->ip_src.s_addr;
358179737Sjfv	lro->dest_ip = ip->ip_dst.s_addr;
359179737Sjfv	lro->next_seq = seq + tcp_data_len;
360179737Sjfv	lro->mss = tcp_data_len;
361179737Sjfv	lro->ack_seq = tcp->th_ack;
362179737Sjfv	lro->window = tcp->th_win;
363179737Sjfv
364179737Sjfv	/* save the checksum of just the TCP payload by
365179737Sjfv	 * subtracting off the checksum of the TCP header from
366179737Sjfv	 * the entire hardware checksum
367179737Sjfv	 * Since IP header checksum is correct, checksum over
368179737Sjfv	 * the IP header is -0.  Substracting -0 is unnecessary.
369179737Sjfv	 */
370179737Sjfv	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
371179737Sjfv	csum = csum + (tmp_csum ^ 0xffff);
372179737Sjfv	csum = (csum & 0xffff) + (csum >> 16);
373179737Sjfv	csum = (csum & 0xffff) + (csum >> 16);
374179737Sjfv	lro->data_csum = csum;
375179737Sjfv
376179737Sjfv	lro->ip = ip;
377179737Sjfv	/* record timestamp if it is present */
378179737Sjfv	if (opt_bytes) {
379179737Sjfv		lro->timestamp = 1;
380179737Sjfv		lro->tsval = ntohl(*(ts_ptr + 1));
381179737Sjfv		lro->tsecr = *(ts_ptr + 2);
382179737Sjfv	}
383179737Sjfv	lro->len = tot_len;
384179737Sjfv	lro->m_head = m_head;
385179737Sjfv	lro->m_tail = m_tail;
386179737Sjfv	return 0;
387179737Sjfv}
388