tcp_lro.c revision 179737
1/******************************************************************************
2
3Copyright (c) 2007, Myricom Inc.
4Copyright (c) 2008, Intel Corporation.
5All rights reserved.
6
7Redistribution and use in source and binary forms, with or without
8modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice,
11    this list of conditions and the following disclaimer.
12
13 2. Neither the name of the Myricom Inc, nor the names of its
14    contributors may be used to endorse or promote products derived from
15    this software without specific prior written permission.
16
17 3. Neither the name of the Intel Corporation, nor the names of its
18    contributors may be used to endorse or promote products derived from
19    this software without specific prior written permission.
20
21THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31POSSIBILITY OF SUCH DAMAGE.
32
33$FreeBSD: head/sys/netinet/tcp_lro.c 179737 2008-06-11 22:12:50Z jfv $
34***************************************************************************/
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/endian.h>
39#include <sys/mbuf.h>
40#include <sys/kernel.h>
41#include <sys/socket.h>
42
43#include <net/if.h>
44#include <net/ethernet.h>
45#include <net/if_media.h>
46
47#include <netinet/in_systm.h>
48#include <netinet/in.h>
49#include <netinet/ip.h>
50#include <netinet/tcp.h>
51#include <netinet/tcp_lro.h>
52
53#include <machine/bus.h>
54#include <machine/in_cksum.h>
55
56
/*
 * Sum 16-bit words starting at 'raw' and fold the result into 16 bits
 * (one's-complement style addition, result is NOT inverted).
 *
 * 'len' is a byte count.  The loop consumes two 16-bit words (4 bytes) per
 * iteration and keeps going while the remaining count is positive, so 'len'
 * must be a multiple of 4; any other positive value reads past 'len' bytes.
 * A 'len' of zero or less yields 0.
 */
static uint16_t
do_csum_data(uint16_t *raw, int len)
{
	uint32_t acc = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 4) {
		acc += raw[0];
		acc += raw[1];
		raw += 2;
	}

	/* Fold the carries back in; twice, as the first fold may carry again. */
	acc = (acc & 0xffff) + (acc >> 16);
	acc = (acc & 0xffff) + (acc >> 16);

	return ((uint16_t)acc);
}
72
73/*
74 * Allocate and init the LRO data structures
75 */
76int
77tcp_lro_init(struct lro_ctrl *cntl)
78{
79	struct lro_entry *lro;
80	int i, error = 0;
81
82	SLIST_INIT(&cntl->lro_free);
83	SLIST_INIT(&cntl->lro_active);
84
85	cntl->lro_bad_csum = 0;
86	cntl->lro_queued = 0;
87	cntl->lro_flushed = 0;
88
89	for (i = 0; i < LRO_ENTRIES; i++) {
90                lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91		    M_DEVBUF, M_NOWAIT | M_ZERO);
92                if (lro == NULL) {
93			if (i == 0)
94				error = ENOMEM;
95                        break;
96                }
97		cntl->lro_cnt = i;
98                SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99        }
100
101	return (error);
102}
103
104void
105tcp_lro_free(struct lro_ctrl *cntl)
106{
107	struct lro_entry *entry;
108
109	while (!SLIST_EMPTY(&cntl->lro_free)) {
110		entry = SLIST_FIRST(&cntl->lro_free);
111               	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112		free(entry, M_DEVBUF);
113	}
114}
115
116void
117tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118{
119	struct ifnet *ifp;
120	struct ip *ip;
121	struct tcphdr *tcp;
122	uint32_t *ts_ptr;
123	uint32_t tcplen, tcp_csum;
124
125
126	if (lro->append_cnt) {
127		/* incorporate the new len into the ip header and
128		 * re-calculate the checksum */
129		ip = lro->ip;
130		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131		ip->ip_sum = 0;
132		ip->ip_sum = 0xffff ^
133			do_csum_data((uint16_t*)ip,
134					      sizeof (*ip));
135
136		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138		lro->m_head->m_pkthdr.csum_data = 0xffff;
139		lro->m_head->m_pkthdr.len = lro->len;
140
141		/* incorporate the latest ack into the tcp header */
142		tcp = (struct tcphdr *) (ip + 1);
143		tcp->th_ack = lro->ack_seq;
144		tcp->th_win = lro->window;
145		/* incorporate latest timestamp into the tcp header */
146		if (lro->timestamp) {
147			ts_ptr = (uint32_t *)(tcp + 1);
148			ts_ptr[1] = htonl(lro->tsval);
149			ts_ptr[2] = lro->tsecr;
150		}
151		/*
152		 * update checksum in tcp header by re-calculating the
153		 * tcp pseudoheader checksum, and adding it to the checksum
154		 * of the tcp payload data
155		 */
156		tcp->th_sum = 0;
157		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158		tcp_csum = lro->data_csum;
159		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160				      htons(tcplen + IPPROTO_TCP));
161		tcp_csum += do_csum_data((uint16_t*)tcp,
162						  tcp->th_off << 2);
163		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165		tcp->th_sum = 0xffff ^ tcp_csum;
166	}
167	ifp = cntl->ifp;
168	(*ifp->if_input)(cntl->ifp, lro->m_head);
169	cntl->lro_queued += lro->append_cnt + 1;
170	cntl->lro_flushed++;
171	lro->m_head = NULL;
172	lro->timestamp = 0;
173	lro->append_cnt = 0;
174	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175}
176
177int
178tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179{
180	struct ether_header *eh;
181	struct ip *ip;
182	struct tcphdr *tcp;
183	uint32_t *ts_ptr;
184	struct mbuf *m_nxt, *m_tail;
185	struct lro_entry *lro;
186	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187	int opt_bytes, trim;
188	uint32_t seq, tmp_csum, device_mtu;
189
190
191	eh = mtod(m_head, struct ether_header *);
192	if (eh->ether_type != htons(ETHERTYPE_IP))
193		return 1;
194	ip = (struct ip *) (eh + 1);
195	if (ip->ip_p != IPPROTO_TCP)
196		return 1;
197
198	/* ensure there are no options */
199	if ((ip->ip_hl << 2) != sizeof (*ip))
200		return -1;
201
202	/* .. and the packet is not fragmented */
203	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204		return -1;
205
206	/* verify that the IP header checksum is correct */
207	tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
208	if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
209		cntl->lro_bad_csum++;
210		return -1;
211	}
212
213	/* find the TCP header */
214	tcp = (struct tcphdr *) (ip + 1);
215
216	/* Get the TCP checksum if we dont have it */
217	if (!csum)
218		csum = tcp->th_sum;
219
220	/* ensure no bits set besides ack or psh */
221	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
222		return -1;
223
224	/* check for timestamps. Since the only option we handle are
225	   timestamps, we only have to handle the simple case of
226	   aligned timestamps */
227
228	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
229	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
230	ts_ptr = (uint32_t *)(tcp + 1);
231	if (opt_bytes != 0) {
232		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
233		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
234		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
235			return -1;
236	}
237
238	ip_len = ntohs(ip->ip_len);
239	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
240
241
242	/*
243	 * If frame is padded beyond the end of the IP packet,
244	 * then we must trim the extra bytes off the end.
245	 */
246	tot_len = m_head->m_pkthdr.len;
247	trim = tot_len - (ip_len + ETHER_HDR_LEN);
248	if (trim != 0) {
249		if (trim < 0) {
250			/* truncated packet */
251			return -1;
252		}
253		m_adj(m_head, -trim);
254		tot_len = m_head->m_pkthdr.len;
255	}
256
257	m_nxt = m_head;
258	m_tail = NULL; /* -Wuninitialized */
259	while (m_nxt != NULL) {
260		m_tail = m_nxt;
261		m_nxt = m_tail->m_next;
262	}
263
264	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
265	seq = ntohl(tcp->th_seq);
266
267	SLIST_FOREACH(lro, &cntl->lro_active, next) {
268		if (lro->source_port == tcp->th_sport &&
269		    lro->dest_port == tcp->th_dport &&
270		    lro->source_ip == ip->ip_src.s_addr &&
271		    lro->dest_ip == ip->ip_dst.s_addr) {
272			/* Try to append it */
273
274			if (__predict_false(seq != lro->next_seq)) {
275				/* out of order packet */
276				SLIST_REMOVE(&cntl->lro_active, lro,
277					     lro_entry, next);
278				tcp_lro_flush(cntl, lro);
279				return -1;
280			}
281
282			if (opt_bytes) {
283				uint32_t tsval = ntohl(*(ts_ptr + 1));
284				/* make sure timestamp values are increasing */
285				if (__predict_false(lro->tsval > tsval ||
286					     *(ts_ptr + 2) == 0)) {
287					return -1;
288				}
289				lro->tsval = tsval;
290				lro->tsecr = *(ts_ptr + 2);
291			}
292
293			lro->next_seq += tcp_data_len;
294			lro->ack_seq = tcp->th_ack;
295			lro->window = tcp->th_win;
296			lro->append_cnt++;
297			if (tcp_data_len == 0) {
298				m_freem(m_head);
299				return 0;
300			}
301			/* subtract off the checksum of the tcp header
302                         * from the hardware checksum, and add it to the
303                         * stored tcp data checksum.  Byteswap the checksum
304			 * if the total length so far is odd
305                         */
306			tmp_csum = do_csum_data((uint16_t*)tcp,
307							 tcp_hdr_len);
308			csum = csum + (tmp_csum ^ 0xffff);
309			csum = (csum & 0xffff) + (csum >> 16);
310			csum = (csum & 0xffff) + (csum >> 16);
311			if (lro->len & 0x1) {
312				/* Odd number of bytes so far, flip bytes */
313				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
314			}
315			csum = csum + lro->data_csum;
316			csum = (csum & 0xffff) + (csum >> 16);
317			csum = (csum & 0xffff) + (csum >> 16);
318			lro->data_csum = csum;
319
320			lro->len += tcp_data_len;
321
322			/* adjust mbuf so that m->m_data points to
323			   the first byte of the payload */
324			m_adj(m_head, hlen);
325			/* append mbuf chain */
326			lro->m_tail->m_next = m_head;
327			/* advance the last pointer */
328			lro->m_tail = m_tail;
329			/* flush packet if required */
330			device_mtu = cntl->ifp->if_mtu;
331			if (lro->len > (65535 - device_mtu)) {
332				SLIST_REMOVE(&cntl->lro_active, lro,
333					     lro_entry, next);
334				tcp_lro_flush(cntl, lro);
335			}
336			return 0;
337		}
338	}
339
340	if (SLIST_EMPTY(&cntl->lro_free))
341	    return -1;
342
343	/* start a new chain */
344	lro = SLIST_FIRST(&cntl->lro_free);
345	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
346	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
347	lro->source_port = tcp->th_sport;
348	lro->dest_port = tcp->th_dport;
349	lro->source_ip = ip->ip_src.s_addr;
350	lro->dest_ip = ip->ip_dst.s_addr;
351	lro->next_seq = seq + tcp_data_len;
352	lro->mss = tcp_data_len;
353	lro->ack_seq = tcp->th_ack;
354	lro->window = tcp->th_win;
355
356	/* save the checksum of just the TCP payload by
357	 * subtracting off the checksum of the TCP header from
358	 * the entire hardware checksum
359	 * Since IP header checksum is correct, checksum over
360	 * the IP header is -0.  Substracting -0 is unnecessary.
361	 */
362	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
363	csum = csum + (tmp_csum ^ 0xffff);
364	csum = (csum & 0xffff) + (csum >> 16);
365	csum = (csum & 0xffff) + (csum >> 16);
366	lro->data_csum = csum;
367
368	lro->ip = ip;
369	/* record timestamp if it is present */
370	if (opt_bytes) {
371		lro->timestamp = 1;
372		lro->tsval = ntohl(*(ts_ptr + 1));
373		lro->tsecr = *(ts_ptr + 2);
374	}
375	lro->len = tot_len;
376	lro->m_head = m_head;
377	lro->m_tail = m_tail;
378	return 0;
379}
380