tcp_lro.c revision 223797
1/******************************************************************************
2
3Copyright (c) 2007, Myricom Inc.
4Copyright (c) 2008, Intel Corporation.
5All rights reserved.
6
7Redistribution and use in source and binary forms, with or without
8modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice,
11    this list of conditions and the following disclaimer.
12
13 2. Neither the name of the Myricom Inc, nor the names of its
14    contributors may be used to endorse or promote products derived from
15    this software without specific prior written permission.
16
17 3. Neither the name of the Intel Corporation, nor the names of its
18    contributors may be used to endorse or promote products derived from
19    this software without specific prior written permission.
20
21THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31POSSIBILITY OF SUCH DAMAGE.
32
33$FreeBSD: head/sys/netinet/tcp_lro.c 223797 2011-07-05 18:43:54Z cperciva $
34***************************************************************************/
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/endian.h>
39#include <sys/mbuf.h>
40#include <sys/kernel.h>
41#include <sys/socket.h>
42
43#include <net/if.h>
44#include <net/ethernet.h>
45#include <net/if_media.h>
46
47#include <netinet/in_systm.h>
48#include <netinet/in.h>
49#include <netinet/ip.h>
50#include <netinet/tcp.h>
51#include <netinet/tcp_lro.h>
52
53#include <machine/bus.h>
54#include <machine/in_cksum.h>
55
56
/*
 * Sum a buffer as a sequence of 16-bit words, exactly as they are stored in
 * memory, and fold the carries back in to produce a 16-bit ones'-complement
 * sum.  The result is NOT inverted; callers XOR it with 0xffff when they
 * need the final checksum value.
 *
 * raw: start of the data, read through a uint16_t pointer.
 * len: number of bytes to sum; values <= 0 yield 0.
 *
 * The main loop consumes four bytes per iteration.  A trailing 16-bit word
 * and a trailing odd byte (treated as if followed by a zero pad byte) are
 * handled separately, so nothing past raw + len is ever read even when len
 * is not a multiple of four.
 */
static uint16_t do_csum_data(uint16_t *raw, int len)
{
	uint32_t csum = 0;

	while (len >= 4) {
		csum += raw[0];
		csum += raw[1];
		raw += 2;
		len -= 4;
	}
	if (len >= 2) {
		csum += *raw++;
		len -= 2;
	}
	if (len == 1) {
		uint16_t last = 0;

		/* keep the byte at its in-memory position, zero padded */
		*(uint8_t *)&last = *(const uint8_t *)raw;
		csum += last;
	}
	/* two folds absorb every carry out of the low 16 bits */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return (uint16_t)csum;
}
72
73/*
74 * Allocate and init the LRO data structures
75 */
76int
77tcp_lro_init(struct lro_ctrl *cntl)
78{
79	struct lro_entry *lro;
80	int i, error = 0;
81
82	SLIST_INIT(&cntl->lro_free);
83	SLIST_INIT(&cntl->lro_active);
84
85	cntl->lro_bad_csum = 0;
86	cntl->lro_queued = 0;
87	cntl->lro_flushed = 0;
88
89	for (i = 0; i < LRO_ENTRIES; i++) {
90                lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91		    M_DEVBUF, M_NOWAIT | M_ZERO);
92                if (lro == NULL) {
93			if (i == 0)
94				error = ENOMEM;
95                        break;
96                }
97		cntl->lro_cnt = i;
98                SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99        }
100
101	return (error);
102}
103
104void
105tcp_lro_free(struct lro_ctrl *cntl)
106{
107	struct lro_entry *entry;
108
109	while (!SLIST_EMPTY(&cntl->lro_free)) {
110		entry = SLIST_FIRST(&cntl->lro_free);
111		SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112		free(entry, M_DEVBUF);
113	}
114}
115
116void
117tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118{
119	struct ifnet *ifp;
120	struct ip *ip;
121	struct tcphdr *tcp;
122	uint32_t *ts_ptr;
123	uint32_t tcplen, tcp_csum;
124
125
126	if (lro->append_cnt) {
127		/* incorporate the new len into the ip header and
128		 * re-calculate the checksum */
129		ip = lro->ip;
130		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131		ip->ip_sum = 0;
132		ip->ip_sum = 0xffff ^
133			do_csum_data((uint16_t*)ip,
134					      sizeof (*ip));
135
136		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138		lro->m_head->m_pkthdr.csum_data = 0xffff;
139		lro->m_head->m_pkthdr.len = lro->len;
140
141		/* incorporate the latest ack into the tcp header */
142		tcp = (struct tcphdr *) (ip + 1);
143		tcp->th_ack = lro->ack_seq;
144		tcp->th_win = lro->window;
145		/* incorporate latest timestamp into the tcp header */
146		if (lro->timestamp) {
147			ts_ptr = (uint32_t *)(tcp + 1);
148			ts_ptr[1] = htonl(lro->tsval);
149			ts_ptr[2] = lro->tsecr;
150		}
151		/*
152		 * update checksum in tcp header by re-calculating the
153		 * tcp pseudoheader checksum, and adding it to the checksum
154		 * of the tcp payload data
155		 */
156		tcp->th_sum = 0;
157		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158		tcp_csum = lro->data_csum;
159		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160				      htons(tcplen + IPPROTO_TCP));
161		tcp_csum += do_csum_data((uint16_t*)tcp,
162						  tcp->th_off << 2);
163		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165		tcp->th_sum = 0xffff ^ tcp_csum;
166	}
167	ifp = cntl->ifp;
168	(*ifp->if_input)(cntl->ifp, lro->m_head);
169	cntl->lro_queued += lro->append_cnt + 1;
170	cntl->lro_flushed++;
171	lro->m_head = NULL;
172	lro->timestamp = 0;
173	lro->append_cnt = 0;
174	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175}
176
177int
178tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179{
180	struct ether_header *eh;
181	struct ip *ip;
182	struct tcphdr *tcp;
183	uint32_t *ts_ptr;
184	struct mbuf *m_nxt, *m_tail;
185	struct lro_entry *lro;
186	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187	int opt_bytes, trim, csum_flags;
188	uint32_t seq, tmp_csum, device_mtu;
189
190
191	eh = mtod(m_head, struct ether_header *);
192	if (eh->ether_type != htons(ETHERTYPE_IP))
193		return 1;
194	ip = (struct ip *) (eh + 1);
195	if (ip->ip_p != IPPROTO_TCP)
196		return 1;
197
198	/* ensure there are no options */
199	if ((ip->ip_hl << 2) != sizeof (*ip))
200		return -1;
201
202	/* .. and the packet is not fragmented */
203	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204		return -1;
205
206	/* verify that the IP header checksum is correct */
207	csum_flags = m_head->m_pkthdr.csum_flags;
208	if (csum_flags & CSUM_IP_CHECKED) {
209		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210			cntl->lro_bad_csum++;
211			return -1;
212		}
213	} else {
214		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216			cntl->lro_bad_csum++;
217			return -1;
218		}
219	}
220
221	/* find the TCP header */
222	tcp = (struct tcphdr *) (ip + 1);
223
224	/* Get the TCP checksum if we dont have it */
225	if (!csum)
226		csum = tcp->th_sum;
227
228	/* ensure no bits set besides ack or psh */
229	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230		return -1;
231
232	/* check for timestamps. Since the only option we handle are
233	   timestamps, we only have to handle the simple case of
234	   aligned timestamps */
235
236	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238	ts_ptr = (uint32_t *)(tcp + 1);
239	if (opt_bytes != 0) {
240		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243			return -1;
244	}
245
246	ip_len = ntohs(ip->ip_len);
247	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248
249
250	/*
251	 * If frame is padded beyond the end of the IP packet,
252	 * then we must trim the extra bytes off the end.
253	 */
254	tot_len = m_head->m_pkthdr.len;
255	trim = tot_len - (ip_len + ETHER_HDR_LEN);
256	if (trim != 0) {
257		if (trim < 0) {
258			/* truncated packet */
259			return -1;
260		}
261		m_adj(m_head, -trim);
262		tot_len = m_head->m_pkthdr.len;
263	}
264
265	m_nxt = m_head;
266	m_tail = NULL; /* -Wuninitialized */
267	while (m_nxt != NULL) {
268		m_tail = m_nxt;
269		m_nxt = m_tail->m_next;
270	}
271
272	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273	seq = ntohl(tcp->th_seq);
274
275	SLIST_FOREACH(lro, &cntl->lro_active, next) {
276		if (lro->source_port == tcp->th_sport &&
277		    lro->dest_port == tcp->th_dport &&
278		    lro->source_ip == ip->ip_src.s_addr &&
279		    lro->dest_ip == ip->ip_dst.s_addr) {
280			/* Flush now if appending will result in overflow. */
281			if (lro->len > (65535 - tcp_data_len)) {
282				SLIST_REMOVE(&cntl->lro_active, lro,
283					     lro_entry, next);
284				tcp_lro_flush(cntl, lro);
285				break;
286			}
287
288			/* Try to append it */
289
290			if (__predict_false(seq != lro->next_seq ||
291				    (tcp_data_len == 0 &&
292				    lro->ack_seq == tcp->th_ack))) {
293				/* out of order packet or dup ack */
294				SLIST_REMOVE(&cntl->lro_active, lro,
295					     lro_entry, next);
296				tcp_lro_flush(cntl, lro);
297				return -1;
298			}
299
300			if (opt_bytes) {
301				uint32_t tsval = ntohl(*(ts_ptr + 1));
302				/* make sure timestamp values are increasing */
303				if (__predict_false(lro->tsval > tsval ||
304					     *(ts_ptr + 2) == 0)) {
305					return -1;
306				}
307				lro->tsval = tsval;
308				lro->tsecr = *(ts_ptr + 2);
309			}
310
311			lro->next_seq += tcp_data_len;
312			lro->ack_seq = tcp->th_ack;
313			lro->window = tcp->th_win;
314			lro->append_cnt++;
315			if (tcp_data_len == 0) {
316				m_freem(m_head);
317				return 0;
318			}
319			/* subtract off the checksum of the tcp header
320                         * from the hardware checksum, and add it to the
321                         * stored tcp data checksum.  Byteswap the checksum
322			 * if the total length so far is odd
323                         */
324			tmp_csum = do_csum_data((uint16_t*)tcp,
325							 tcp_hdr_len);
326			csum = csum + (tmp_csum ^ 0xffff);
327			csum = (csum & 0xffff) + (csum >> 16);
328			csum = (csum & 0xffff) + (csum >> 16);
329			if (lro->len & 0x1) {
330				/* Odd number of bytes so far, flip bytes */
331				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
332			}
333			csum = csum + lro->data_csum;
334			csum = (csum & 0xffff) + (csum >> 16);
335			csum = (csum & 0xffff) + (csum >> 16);
336			lro->data_csum = csum;
337
338			lro->len += tcp_data_len;
339
340			/* adjust mbuf so that m->m_data points to
341			   the first byte of the payload */
342			m_adj(m_head, hlen);
343			/* append mbuf chain */
344			lro->m_tail->m_next = m_head;
345			/* advance the last pointer */
346			lro->m_tail = m_tail;
347			/* flush packet if required */
348			device_mtu = cntl->ifp->if_mtu;
349			if (lro->len > (65535 - device_mtu)) {
350				SLIST_REMOVE(&cntl->lro_active, lro,
351					     lro_entry, next);
352				tcp_lro_flush(cntl, lro);
353			}
354			return 0;
355		}
356	}
357
358	if (SLIST_EMPTY(&cntl->lro_free))
359	    return -1;
360
361	/* start a new chain */
362	lro = SLIST_FIRST(&cntl->lro_free);
363	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
364	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
365	lro->source_port = tcp->th_sport;
366	lro->dest_port = tcp->th_dport;
367	lro->source_ip = ip->ip_src.s_addr;
368	lro->dest_ip = ip->ip_dst.s_addr;
369	lro->next_seq = seq + tcp_data_len;
370	lro->mss = tcp_data_len;
371	lro->ack_seq = tcp->th_ack;
372	lro->window = tcp->th_win;
373
374	/* save the checksum of just the TCP payload by
375	 * subtracting off the checksum of the TCP header from
376	 * the entire hardware checksum
377	 * Since IP header checksum is correct, checksum over
378	 * the IP header is -0.  Substracting -0 is unnecessary.
379	 */
380	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
381	csum = csum + (tmp_csum ^ 0xffff);
382	csum = (csum & 0xffff) + (csum >> 16);
383	csum = (csum & 0xffff) + (csum >> 16);
384	lro->data_csum = csum;
385
386	lro->ip = ip;
387	/* record timestamp if it is present */
388	if (opt_bytes) {
389		lro->timestamp = 1;
390		lro->tsval = ntohl(*(ts_ptr + 1));
391		lro->tsecr = *(ts_ptr + 2);
392	}
393	lro->len = tot_len;
394	lro->m_head = m_head;
395	lro->m_tail = m_tail;
396	return 0;
397}
398