tcp_lro.c revision 182089
1/******************************************************************************
2
3Copyright (c) 2007, Myricom Inc.
4Copyright (c) 2008, Intel Corporation.
5All rights reserved.
6
7Redistribution and use in source and binary forms, with or without
8modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice,
11    this list of conditions and the following disclaimer.
12
13 2. Neither the name of the Myricom Inc, nor the names of its
14    contributors may be used to endorse or promote products derived from
15    this software without specific prior written permission.
16
17 3. Neither the name of the Intel Corporation, nor the names of its
18    contributors may be used to endorse or promote products derived from
19    this software without specific prior written permission.
20
21THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31POSSIBILITY OF SUCH DAMAGE.
32
33$FreeBSD: head/sys/netinet/tcp_lro.c 182089 2008-08-24 02:31:09Z kmacy $
34***************************************************************************/
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/endian.h>
39#include <sys/mbuf.h>
40#include <sys/kernel.h>
41#include <sys/socket.h>
42
43#include <net/if.h>
44#include <net/ethernet.h>
45#include <net/if_media.h>
46
47#include <netinet/in_systm.h>
48#include <netinet/in.h>
49#include <netinet/ip.h>
50#include <netinet/tcp.h>
51#include <netinet/tcp_lro.h>
52
53#include <machine/bus.h>
54#include <machine/in_cksum.h>
55
56
/*
 * Sum a buffer as 16-bit words using one's-complement arithmetic.
 *
 * raw: start of the buffer; it is read as host-order 16-bit words.
 * len: number of bytes to sum.  A value <= 0 yields 0.
 *
 * Returns the 32-bit accumulator folded down to 16 bits.  The result is
 * NOT inverted; callers apply the final complement themselves.
 *
 * The main loop consumes two words (4 bytes) per iteration.  A trailing
 * 16-bit word and a trailing odd byte (treated as if followed by a zero
 * byte in memory) are summed separately, so the function never reads
 * past raw + len when len is not a multiple of 4.
 */
static uint16_t do_csum_data(const uint16_t *raw, int len)
{
	uint32_t csum = 0;

	while (len >= 4) {
		csum += raw[0];
		csum += raw[1];
		raw += 2;
		len -= 4;
	}
	if (len >= 2) {
		csum += *raw++;
		len -= 2;
	}
	if (len == 1) {
		uint16_t last = 0;

		/* place the byte where it would sit in a zero-padded word */
		*(uint8_t *)&last = *(const uint8_t *)raw;
		csum += last;
	}
	/* fold the carries back in; two rounds are enough for 32 bits */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return (uint16_t)csum;
}
72
73/*
74 * Allocate and init the LRO data structures
75 */
76int
77tcp_lro_init(struct lro_ctrl *cntl)
78{
79	struct lro_entry *lro;
80	int i, error = 0;
81
82	SLIST_INIT(&cntl->lro_free);
83	SLIST_INIT(&cntl->lro_active);
84
85	cntl->lro_bad_csum = 0;
86	cntl->lro_queued = 0;
87	cntl->lro_flushed = 0;
88
89	for (i = 0; i < LRO_ENTRIES; i++) {
90                lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91		    M_DEVBUF, M_NOWAIT | M_ZERO);
92                if (lro == NULL) {
93			if (i == 0)
94				error = ENOMEM;
95                        break;
96                }
97		cntl->lro_cnt = i;
98                SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99        }
100
101	return (error);
102}
103
104void
105tcp_lro_free(struct lro_ctrl *cntl)
106{
107	struct lro_entry *entry;
108
109	while (!SLIST_EMPTY(&cntl->lro_free)) {
110		entry = SLIST_FIRST(&cntl->lro_free);
111               	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112		free(entry, M_DEVBUF);
113	}
114}
115
116void
117tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118{
119	struct ifnet *ifp;
120	struct ip *ip;
121	struct tcphdr *tcp;
122	uint32_t *ts_ptr;
123	uint32_t tcplen, tcp_csum;
124
125
126	if (lro->append_cnt) {
127		/* incorporate the new len into the ip header and
128		 * re-calculate the checksum */
129		ip = lro->ip;
130		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131		ip->ip_sum = 0;
132		ip->ip_sum = 0xffff ^
133			do_csum_data((uint16_t*)ip,
134					      sizeof (*ip));
135
136		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138		lro->m_head->m_pkthdr.csum_data = 0xffff;
139		lro->m_head->m_pkthdr.len = lro->len;
140
141		/* incorporate the latest ack into the tcp header */
142		tcp = (struct tcphdr *) (ip + 1);
143		tcp->th_ack = lro->ack_seq;
144		tcp->th_win = lro->window;
145		/* incorporate latest timestamp into the tcp header */
146		if (lro->timestamp) {
147			ts_ptr = (uint32_t *)(tcp + 1);
148			ts_ptr[1] = htonl(lro->tsval);
149			ts_ptr[2] = lro->tsecr;
150		}
151		/*
152		 * update checksum in tcp header by re-calculating the
153		 * tcp pseudoheader checksum, and adding it to the checksum
154		 * of the tcp payload data
155		 */
156		tcp->th_sum = 0;
157		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158		tcp_csum = lro->data_csum;
159		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160				      htons(tcplen + IPPROTO_TCP));
161		tcp_csum += do_csum_data((uint16_t*)tcp,
162						  tcp->th_off << 2);
163		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165		tcp->th_sum = 0xffff ^ tcp_csum;
166	}
167	ifp = cntl->ifp;
168	(*ifp->if_input)(cntl->ifp, lro->m_head);
169	cntl->lro_queued += lro->append_cnt + 1;
170	cntl->lro_flushed++;
171	lro->m_head = NULL;
172	lro->timestamp = 0;
173	lro->append_cnt = 0;
174	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175}
176
177int
178tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179{
180	struct ether_header *eh;
181	struct ip *ip;
182	struct tcphdr *tcp;
183	uint32_t *ts_ptr;
184	struct mbuf *m_nxt, *m_tail;
185	struct lro_entry *lro;
186	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187	int opt_bytes, trim, csum_flags;
188	uint32_t seq, tmp_csum, device_mtu;
189
190
191	eh = mtod(m_head, struct ether_header *);
192	if (eh->ether_type != htons(ETHERTYPE_IP))
193		return 1;
194	ip = (struct ip *) (eh + 1);
195	if (ip->ip_p != IPPROTO_TCP)
196		return 1;
197
198	/* ensure there are no options */
199	if ((ip->ip_hl << 2) != sizeof (*ip))
200		return -1;
201
202	/* .. and the packet is not fragmented */
203	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204		return -1;
205
206	/* verify that the IP header checksum is correct */
207	csum_flags = m_head->m_pkthdr.csum_flags;
208	if (csum_flags & CSUM_IP_CHECKED) {
209		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210			cntl->lro_bad_csum++;
211			return -1;
212		}
213	} else {
214		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216			cntl->lro_bad_csum++;
217			return -1;
218		}
219	}
220
221	/* find the TCP header */
222	tcp = (struct tcphdr *) (ip + 1);
223
224	/* Get the TCP checksum if we dont have it */
225	if (!csum)
226		csum = tcp->th_sum;
227
228	/* ensure no bits set besides ack or psh */
229	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230		return -1;
231
232	/* check for timestamps. Since the only option we handle are
233	   timestamps, we only have to handle the simple case of
234	   aligned timestamps */
235
236	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238	ts_ptr = (uint32_t *)(tcp + 1);
239	if (opt_bytes != 0) {
240		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243			return -1;
244	}
245
246	ip_len = ntohs(ip->ip_len);
247	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248
249
250	/*
251	 * If frame is padded beyond the end of the IP packet,
252	 * then we must trim the extra bytes off the end.
253	 */
254	tot_len = m_head->m_pkthdr.len;
255	trim = tot_len - (ip_len + ETHER_HDR_LEN);
256	if (trim != 0) {
257		if (trim < 0) {
258			/* truncated packet */
259			return -1;
260		}
261		m_adj(m_head, -trim);
262		tot_len = m_head->m_pkthdr.len;
263	}
264
265	m_nxt = m_head;
266	m_tail = NULL; /* -Wuninitialized */
267	while (m_nxt != NULL) {
268		m_tail = m_nxt;
269		m_nxt = m_tail->m_next;
270	}
271
272	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273	seq = ntohl(tcp->th_seq);
274
275	SLIST_FOREACH(lro, &cntl->lro_active, next) {
276		if (lro->source_port == tcp->th_sport &&
277		    lro->dest_port == tcp->th_dport &&
278		    lro->source_ip == ip->ip_src.s_addr &&
279		    lro->dest_ip == ip->ip_dst.s_addr) {
280			/* Try to append it */
281
282			if (__predict_false(seq != lro->next_seq)) {
283				/* out of order packet */
284				SLIST_REMOVE(&cntl->lro_active, lro,
285					     lro_entry, next);
286				tcp_lro_flush(cntl, lro);
287				return -1;
288			}
289
290			if (opt_bytes) {
291				uint32_t tsval = ntohl(*(ts_ptr + 1));
292				/* make sure timestamp values are increasing */
293				if (__predict_false(lro->tsval > tsval ||
294					     *(ts_ptr + 2) == 0)) {
295					return -1;
296				}
297				lro->tsval = tsval;
298				lro->tsecr = *(ts_ptr + 2);
299			}
300
301			lro->next_seq += tcp_data_len;
302			lro->ack_seq = tcp->th_ack;
303			lro->window = tcp->th_win;
304			lro->append_cnt++;
305			if (tcp_data_len == 0) {
306				m_freem(m_head);
307				return 0;
308			}
309			/* subtract off the checksum of the tcp header
310                         * from the hardware checksum, and add it to the
311                         * stored tcp data checksum.  Byteswap the checksum
312			 * if the total length so far is odd
313                         */
314			tmp_csum = do_csum_data((uint16_t*)tcp,
315							 tcp_hdr_len);
316			csum = csum + (tmp_csum ^ 0xffff);
317			csum = (csum & 0xffff) + (csum >> 16);
318			csum = (csum & 0xffff) + (csum >> 16);
319			if (lro->len & 0x1) {
320				/* Odd number of bytes so far, flip bytes */
321				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
322			}
323			csum = csum + lro->data_csum;
324			csum = (csum & 0xffff) + (csum >> 16);
325			csum = (csum & 0xffff) + (csum >> 16);
326			lro->data_csum = csum;
327
328			lro->len += tcp_data_len;
329
330			/* adjust mbuf so that m->m_data points to
331			   the first byte of the payload */
332			m_adj(m_head, hlen);
333			/* append mbuf chain */
334			lro->m_tail->m_next = m_head;
335			/* advance the last pointer */
336			lro->m_tail = m_tail;
337			/* flush packet if required */
338			device_mtu = cntl->ifp->if_mtu;
339			if (lro->len > (65535 - device_mtu)) {
340				SLIST_REMOVE(&cntl->lro_active, lro,
341					     lro_entry, next);
342				tcp_lro_flush(cntl, lro);
343			}
344			return 0;
345		}
346	}
347
348	if (SLIST_EMPTY(&cntl->lro_free))
349	    return -1;
350
351	/* start a new chain */
352	lro = SLIST_FIRST(&cntl->lro_free);
353	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
354	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
355	lro->source_port = tcp->th_sport;
356	lro->dest_port = tcp->th_dport;
357	lro->source_ip = ip->ip_src.s_addr;
358	lro->dest_ip = ip->ip_dst.s_addr;
359	lro->next_seq = seq + tcp_data_len;
360	lro->mss = tcp_data_len;
361	lro->ack_seq = tcp->th_ack;
362	lro->window = tcp->th_win;
363
364	/* save the checksum of just the TCP payload by
365	 * subtracting off the checksum of the TCP header from
366	 * the entire hardware checksum
367	 * Since IP header checksum is correct, checksum over
368	 * the IP header is -0.  Substracting -0 is unnecessary.
369	 */
370	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
371	csum = csum + (tmp_csum ^ 0xffff);
372	csum = (csum & 0xffff) + (csum >> 16);
373	csum = (csum & 0xffff) + (csum >> 16);
374	lro->data_csum = csum;
375
376	lro->ip = ip;
377	/* record timestamp if it is present */
378	if (opt_bytes) {
379		lro->timestamp = 1;
380		lro->tsval = ntohl(*(ts_ptr + 1));
381		lro->tsecr = *(ts_ptr + 2);
382	}
383	lro->len = tot_len;
384	lro->m_head = m_head;
385	lro->m_tail = m_tail;
386	return 0;
387}
388