tcp_lro.c revision 235474
1235474Sbz/*-
2235474Sbz * Copyright (c) 2007, Myricom Inc.
3235474Sbz * Copyright (c) 2008, Intel Corporation.
4235474Sbz * All rights reserved.
5235474Sbz *
6235474Sbz * Redistribution and use in source and binary forms, with or without
7235474Sbz * modification, are permitted provided that the following conditions
8235474Sbz * are met:
9235474Sbz * 1. Redistributions of source code must retain the above copyright
10235474Sbz *    notice, this list of conditions and the following disclaimer.
11235474Sbz * 2. Redistributions in binary form must reproduce the above copyright
12235474Sbz *    notice, this list of conditions and the following disclaimer in the
13235474Sbz *    documentation and/or other materials provided with the distribution.
14235474Sbz *
15235474Sbz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16235474Sbz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17235474Sbz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18235474Sbz * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19235474Sbz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20235474Sbz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21235474Sbz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22235474Sbz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23235474Sbz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24235474Sbz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25235474Sbz * SUCH DAMAGE.
26235474Sbz *
27235474Sbz * $FreeBSD: head/sys/netinet/tcp_lro.c 235474 2012-05-15 13:23:44Z bz $
28235474Sbz */
29179737Sjfv
30179737Sjfv#include <sys/param.h>
31179737Sjfv#include <sys/systm.h>
32179737Sjfv#include <sys/endian.h>
33179737Sjfv#include <sys/mbuf.h>
34179737Sjfv#include <sys/kernel.h>
35179737Sjfv#include <sys/socket.h>
36179737Sjfv
37179737Sjfv#include <net/if.h>
38179737Sjfv#include <net/ethernet.h>
39179737Sjfv#include <net/if_media.h>
40179737Sjfv
41179737Sjfv#include <netinet/in_systm.h>
42179737Sjfv#include <netinet/in.h>
43179737Sjfv#include <netinet/ip.h>
44179737Sjfv#include <netinet/tcp.h>
45179737Sjfv#include <netinet/tcp_lro.h>
46179737Sjfv
47179737Sjfv#include <machine/bus.h>
48179737Sjfv#include <machine/in_cksum.h>
49179737Sjfv
50179737Sjfv
/*
 * Return the folded 16-bit one's-complement sum of the `len' bytes that
 * start at `raw'.
 *
 * The 16-bit words are added exactly as they are loaded from memory and
 * the carries are folded back into the low 16 bits.  The result is NOT
 * inverted: callers XOR it with 0xffff themselves when they need the
 * final checksum value.
 *
 * The buffer is never read beyond `len' bytes.  The callers in this file
 * pass header lengths, but any length is accepted: a trailing odd byte is
 * summed as if it were followed by a zero pad byte.
 */
static uint16_t
do_csum_data(uint16_t *raw, int len)
{
	uint32_t csum = 0;

	/* One 16-bit word per iteration so a 2-byte tail is not over-read. */
	while (len >= 2) {
		csum += *raw++;
		len -= 2;
	}

	if (len == 1) {
		uint16_t tail = 0;

		/* Keep the byte at its in-memory position, pad with zero. */
		*(uint8_t *)&tail = *(const uint8_t *)raw;
		csum += tail;
	}

	/* Two folds are enough to absorb every carry into 16 bits. */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return ((uint16_t)csum);
}
66179737Sjfv
67179737Sjfv/*
68179737Sjfv * Allocate and init the LRO data structures
69179737Sjfv */
70179737Sjfvint
71179737Sjfvtcp_lro_init(struct lro_ctrl *cntl)
72179737Sjfv{
73179737Sjfv	struct lro_entry *lro;
74179737Sjfv	int i, error = 0;
75179737Sjfv
76179737Sjfv	SLIST_INIT(&cntl->lro_free);
77179737Sjfv	SLIST_INIT(&cntl->lro_active);
78179737Sjfv
79179737Sjfv	cntl->lro_bad_csum = 0;
80179737Sjfv	cntl->lro_queued = 0;
81179737Sjfv	cntl->lro_flushed = 0;
82179737Sjfv
83179737Sjfv	for (i = 0; i < LRO_ENTRIES; i++) {
84179737Sjfv                lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
85179737Sjfv		    M_DEVBUF, M_NOWAIT | M_ZERO);
86179737Sjfv                if (lro == NULL) {
87179737Sjfv			if (i == 0)
88179737Sjfv				error = ENOMEM;
89179737Sjfv                        break;
90179737Sjfv                }
91179737Sjfv		cntl->lro_cnt = i;
92179737Sjfv                SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
93179737Sjfv        }
94179737Sjfv
95179737Sjfv	return (error);
96179737Sjfv}
97179737Sjfv
98179737Sjfvvoid
99179737Sjfvtcp_lro_free(struct lro_ctrl *cntl)
100179737Sjfv{
101179737Sjfv	struct lro_entry *entry;
102179737Sjfv
103179737Sjfv	while (!SLIST_EMPTY(&cntl->lro_free)) {
104179737Sjfv		entry = SLIST_FIRST(&cntl->lro_free);
105217126Sjhb		SLIST_REMOVE_HEAD(&cntl->lro_free, next);
106179737Sjfv		free(entry, M_DEVBUF);
107179737Sjfv	}
108179737Sjfv}
109179737Sjfv
110179737Sjfvvoid
111179737Sjfvtcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
112179737Sjfv{
113179737Sjfv	struct ifnet *ifp;
114179737Sjfv	struct ip *ip;
115179737Sjfv	struct tcphdr *tcp;
116179737Sjfv	uint32_t *ts_ptr;
117179737Sjfv	uint32_t tcplen, tcp_csum;
118179737Sjfv
119179737Sjfv
120179737Sjfv	if (lro->append_cnt) {
121179737Sjfv		/* incorporate the new len into the ip header and
122179737Sjfv		 * re-calculate the checksum */
123179737Sjfv		ip = lro->ip;
124179737Sjfv		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
125179737Sjfv		ip->ip_sum = 0;
126179737Sjfv		ip->ip_sum = 0xffff ^
127179737Sjfv			do_csum_data((uint16_t*)ip,
128179737Sjfv					      sizeof (*ip));
129179737Sjfv
130179737Sjfv		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
131179737Sjfv			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
132179737Sjfv		lro->m_head->m_pkthdr.csum_data = 0xffff;
133179737Sjfv		lro->m_head->m_pkthdr.len = lro->len;
134179737Sjfv
135179737Sjfv		/* incorporate the latest ack into the tcp header */
136179737Sjfv		tcp = (struct tcphdr *) (ip + 1);
137179737Sjfv		tcp->th_ack = lro->ack_seq;
138179737Sjfv		tcp->th_win = lro->window;
139179737Sjfv		/* incorporate latest timestamp into the tcp header */
140179737Sjfv		if (lro->timestamp) {
141179737Sjfv			ts_ptr = (uint32_t *)(tcp + 1);
142179737Sjfv			ts_ptr[1] = htonl(lro->tsval);
143179737Sjfv			ts_ptr[2] = lro->tsecr;
144179737Sjfv		}
145179737Sjfv		/*
146179737Sjfv		 * update checksum in tcp header by re-calculating the
147179737Sjfv		 * tcp pseudoheader checksum, and adding it to the checksum
148179737Sjfv		 * of the tcp payload data
149179737Sjfv		 */
150179737Sjfv		tcp->th_sum = 0;
151179737Sjfv		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
152179737Sjfv		tcp_csum = lro->data_csum;
153179737Sjfv		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
154179737Sjfv				      htons(tcplen + IPPROTO_TCP));
155179737Sjfv		tcp_csum += do_csum_data((uint16_t*)tcp,
156179737Sjfv						  tcp->th_off << 2);
157179737Sjfv		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
158179737Sjfv		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
159179737Sjfv		tcp->th_sum = 0xffff ^ tcp_csum;
160179737Sjfv	}
161179737Sjfv	ifp = cntl->ifp;
162179737Sjfv	(*ifp->if_input)(cntl->ifp, lro->m_head);
163179737Sjfv	cntl->lro_queued += lro->append_cnt + 1;
164179737Sjfv	cntl->lro_flushed++;
165179737Sjfv	lro->m_head = NULL;
166179737Sjfv	lro->timestamp = 0;
167179737Sjfv	lro->append_cnt = 0;
168179737Sjfv	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
169179737Sjfv}
170179737Sjfv
171179737Sjfvint
172179737Sjfvtcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
173179737Sjfv{
174179737Sjfv	struct ether_header *eh;
175179737Sjfv	struct ip *ip;
176179737Sjfv	struct tcphdr *tcp;
177179737Sjfv	uint32_t *ts_ptr;
178179737Sjfv	struct mbuf *m_nxt, *m_tail;
179179737Sjfv	struct lro_entry *lro;
180179737Sjfv	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
181182089Skmacy	int opt_bytes, trim, csum_flags;
182179737Sjfv	uint32_t seq, tmp_csum, device_mtu;
183179737Sjfv
184179737Sjfv
185179737Sjfv	eh = mtod(m_head, struct ether_header *);
186179737Sjfv	if (eh->ether_type != htons(ETHERTYPE_IP))
187179737Sjfv		return 1;
188179737Sjfv	ip = (struct ip *) (eh + 1);
189179737Sjfv	if (ip->ip_p != IPPROTO_TCP)
190179737Sjfv		return 1;
191179737Sjfv
192179737Sjfv	/* ensure there are no options */
193179737Sjfv	if ((ip->ip_hl << 2) != sizeof (*ip))
194179737Sjfv		return -1;
195179737Sjfv
196179737Sjfv	/* .. and the packet is not fragmented */
197179737Sjfv	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
198179737Sjfv		return -1;
199179737Sjfv
200179737Sjfv	/* verify that the IP header checksum is correct */
201182089Skmacy	csum_flags = m_head->m_pkthdr.csum_flags;
202182089Skmacy	if (csum_flags & CSUM_IP_CHECKED) {
203182089Skmacy		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
204182089Skmacy			cntl->lro_bad_csum++;
205182089Skmacy			return -1;
206182089Skmacy		}
207182089Skmacy	} else {
208182089Skmacy		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
209182089Skmacy		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
210182089Skmacy			cntl->lro_bad_csum++;
211182089Skmacy			return -1;
212182089Skmacy		}
213179737Sjfv	}
214182089Skmacy
215179737Sjfv	/* find the TCP header */
216179737Sjfv	tcp = (struct tcphdr *) (ip + 1);
217179737Sjfv
218179737Sjfv	/* Get the TCP checksum if we dont have it */
219179737Sjfv	if (!csum)
220179737Sjfv		csum = tcp->th_sum;
221179737Sjfv
222179737Sjfv	/* ensure no bits set besides ack or psh */
223179737Sjfv	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
224179737Sjfv		return -1;
225179737Sjfv
226179737Sjfv	/* check for timestamps. Since the only option we handle are
227179737Sjfv	   timestamps, we only have to handle the simple case of
228179737Sjfv	   aligned timestamps */
229179737Sjfv
230179737Sjfv	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
231179737Sjfv	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
232179737Sjfv	ts_ptr = (uint32_t *)(tcp + 1);
233179737Sjfv	if (opt_bytes != 0) {
234179737Sjfv		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
235179737Sjfv		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
236179737Sjfv		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
237179737Sjfv			return -1;
238179737Sjfv	}
239179737Sjfv
240179737Sjfv	ip_len = ntohs(ip->ip_len);
241179737Sjfv	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
242179737Sjfv
243179737Sjfv
244179737Sjfv	/*
245179737Sjfv	 * If frame is padded beyond the end of the IP packet,
246179737Sjfv	 * then we must trim the extra bytes off the end.
247179737Sjfv	 */
248179737Sjfv	tot_len = m_head->m_pkthdr.len;
249179737Sjfv	trim = tot_len - (ip_len + ETHER_HDR_LEN);
250179737Sjfv	if (trim != 0) {
251179737Sjfv		if (trim < 0) {
252179737Sjfv			/* truncated packet */
253179737Sjfv			return -1;
254179737Sjfv		}
255179737Sjfv		m_adj(m_head, -trim);
256179737Sjfv		tot_len = m_head->m_pkthdr.len;
257179737Sjfv	}
258179737Sjfv
259179737Sjfv	m_nxt = m_head;
260179737Sjfv	m_tail = NULL; /* -Wuninitialized */
261179737Sjfv	while (m_nxt != NULL) {
262179737Sjfv		m_tail = m_nxt;
263179737Sjfv		m_nxt = m_tail->m_next;
264179737Sjfv	}
265179737Sjfv
266179737Sjfv	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
267179737Sjfv	seq = ntohl(tcp->th_seq);
268179737Sjfv
269179737Sjfv	SLIST_FOREACH(lro, &cntl->lro_active, next) {
270179737Sjfv		if (lro->source_port == tcp->th_sport &&
271179737Sjfv		    lro->dest_port == tcp->th_dport &&
272179737Sjfv		    lro->source_ip == ip->ip_src.s_addr &&
273179737Sjfv		    lro->dest_ip == ip->ip_dst.s_addr) {
274223797Scperciva			/* Flush now if appending will result in overflow. */
275223797Scperciva			if (lro->len > (65535 - tcp_data_len)) {
276223797Scperciva				SLIST_REMOVE(&cntl->lro_active, lro,
277223797Scperciva					     lro_entry, next);
278223797Scperciva				tcp_lro_flush(cntl, lro);
279223797Scperciva				break;
280223797Scperciva			}
281223797Scperciva
282179737Sjfv			/* Try to append it */
283179737Sjfv
284220428Sjfv			if (__predict_false(seq != lro->next_seq ||
285220428Sjfv				    (tcp_data_len == 0 &&
286220428Sjfv				    lro->ack_seq == tcp->th_ack))) {
287220428Sjfv				/* out of order packet or dup ack */
288179737Sjfv				SLIST_REMOVE(&cntl->lro_active, lro,
289179737Sjfv					     lro_entry, next);
290179737Sjfv				tcp_lro_flush(cntl, lro);
291179737Sjfv				return -1;
292179737Sjfv			}
293179737Sjfv
294179737Sjfv			if (opt_bytes) {
295179737Sjfv				uint32_t tsval = ntohl(*(ts_ptr + 1));
296179737Sjfv				/* make sure timestamp values are increasing */
297179737Sjfv				if (__predict_false(lro->tsval > tsval ||
298179737Sjfv					     *(ts_ptr + 2) == 0)) {
299179737Sjfv					return -1;
300179737Sjfv				}
301179737Sjfv				lro->tsval = tsval;
302179737Sjfv				lro->tsecr = *(ts_ptr + 2);
303179737Sjfv			}
304179737Sjfv
305179737Sjfv			lro->next_seq += tcp_data_len;
306179737Sjfv			lro->ack_seq = tcp->th_ack;
307179737Sjfv			lro->window = tcp->th_win;
308179737Sjfv			lro->append_cnt++;
309179737Sjfv			if (tcp_data_len == 0) {
310179737Sjfv				m_freem(m_head);
311179737Sjfv				return 0;
312179737Sjfv			}
313179737Sjfv			/* subtract off the checksum of the tcp header
314179737Sjfv                         * from the hardware checksum, and add it to the
315179737Sjfv                         * stored tcp data checksum.  Byteswap the checksum
316179737Sjfv			 * if the total length so far is odd
317179737Sjfv                         */
318179737Sjfv			tmp_csum = do_csum_data((uint16_t*)tcp,
319179737Sjfv							 tcp_hdr_len);
320179737Sjfv			csum = csum + (tmp_csum ^ 0xffff);
321179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
322179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
323179737Sjfv			if (lro->len & 0x1) {
324179737Sjfv				/* Odd number of bytes so far, flip bytes */
325179737Sjfv				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
326179737Sjfv			}
327179737Sjfv			csum = csum + lro->data_csum;
328179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
329179737Sjfv			csum = (csum & 0xffff) + (csum >> 16);
330179737Sjfv			lro->data_csum = csum;
331179737Sjfv
332179737Sjfv			lro->len += tcp_data_len;
333179737Sjfv
334179737Sjfv			/* adjust mbuf so that m->m_data points to
335179737Sjfv			   the first byte of the payload */
336179737Sjfv			m_adj(m_head, hlen);
337179737Sjfv			/* append mbuf chain */
338179737Sjfv			lro->m_tail->m_next = m_head;
339179737Sjfv			/* advance the last pointer */
340179737Sjfv			lro->m_tail = m_tail;
341179737Sjfv			/* flush packet if required */
342179737Sjfv			device_mtu = cntl->ifp->if_mtu;
343179737Sjfv			if (lro->len > (65535 - device_mtu)) {
344179737Sjfv				SLIST_REMOVE(&cntl->lro_active, lro,
345179737Sjfv					     lro_entry, next);
346179737Sjfv				tcp_lro_flush(cntl, lro);
347179737Sjfv			}
348179737Sjfv			return 0;
349179737Sjfv		}
350179737Sjfv	}
351179737Sjfv
352179737Sjfv	if (SLIST_EMPTY(&cntl->lro_free))
353179737Sjfv	    return -1;
354179737Sjfv
355179737Sjfv	/* start a new chain */
356179737Sjfv	lro = SLIST_FIRST(&cntl->lro_free);
357179737Sjfv	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
358179737Sjfv	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
359179737Sjfv	lro->source_port = tcp->th_sport;
360179737Sjfv	lro->dest_port = tcp->th_dport;
361179737Sjfv	lro->source_ip = ip->ip_src.s_addr;
362179737Sjfv	lro->dest_ip = ip->ip_dst.s_addr;
363179737Sjfv	lro->next_seq = seq + tcp_data_len;
364179737Sjfv	lro->mss = tcp_data_len;
365179737Sjfv	lro->ack_seq = tcp->th_ack;
366179737Sjfv	lro->window = tcp->th_win;
367179737Sjfv
368179737Sjfv	/* save the checksum of just the TCP payload by
369179737Sjfv	 * subtracting off the checksum of the TCP header from
370179737Sjfv	 * the entire hardware checksum
371179737Sjfv	 * Since IP header checksum is correct, checksum over
372179737Sjfv	 * the IP header is -0.  Substracting -0 is unnecessary.
373179737Sjfv	 */
374179737Sjfv	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
375179737Sjfv	csum = csum + (tmp_csum ^ 0xffff);
376179737Sjfv	csum = (csum & 0xffff) + (csum >> 16);
377179737Sjfv	csum = (csum & 0xffff) + (csum >> 16);
378179737Sjfv	lro->data_csum = csum;
379179737Sjfv
380179737Sjfv	lro->ip = ip;
381179737Sjfv	/* record timestamp if it is present */
382179737Sjfv	if (opt_bytes) {
383179737Sjfv		lro->timestamp = 1;
384179737Sjfv		lro->tsval = ntohl(*(ts_ptr + 1));
385179737Sjfv		lro->tsecr = *(ts_ptr + 2);
386179737Sjfv	}
387179737Sjfv	lro->len = tot_len;
388179737Sjfv	lro->m_head = m_head;
389179737Sjfv	lro->m_tail = m_tail;
390179737Sjfv	return 0;
391179737Sjfv}
392