tcp_lro.c revision 235474
1/*-
2 * Copyright (c) 2007, Myricom Inc.
3 * Copyright (c) 2008, Intel Corporation.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: head/sys/netinet/tcp_lro.c 235474 2012-05-15 13:23:44Z bz $
28 */
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/endian.h>
33#include <sys/mbuf.h>
34#include <sys/kernel.h>
35#include <sys/socket.h>
36
37#include <net/if.h>
38#include <net/ethernet.h>
39#include <net/if_media.h>
40
41#include <netinet/in_systm.h>
42#include <netinet/in.h>
43#include <netinet/ip.h>
44#include <netinet/tcp.h>
45#include <netinet/tcp_lro.h>
46
47#include <machine/bus.h>
48#include <machine/in_cksum.h>
49
50
/*
 * Sum a buffer as 16-bit words using one's-complement addition and return
 * the folded 16-bit result.  The result is NOT inverted; callers XOR it
 * with 0xffff when they need the final checksum value.
 *
 * raw - first 16-bit word to sum; words are added exactly as they sit in
 *       memory, without byte swapping.
 * len - number of bytes to cover.  A trailing odd byte is ignored and a
 *       non-positive length yields 0.
 */
static uint16_t
do_csum_data(uint16_t *raw, int len)
{
	uint32_t csum = 0;

	/*
	 * Consume one 16-bit word per step.  Stepping four bytes at a time
	 * would sum an extra word (and read past the buffer) whenever len
	 * is not a multiple of four.
	 */
	for (; len > 1; len -= 2)
		csum += *raw++;

	/* Fold carries back in; the second fold absorbs a carry from the first. */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return ((uint16_t)csum);
}
66
67/*
68 * Allocate and init the LRO data structures
69 */
70int
71tcp_lro_init(struct lro_ctrl *cntl)
72{
73	struct lro_entry *lro;
74	int i, error = 0;
75
76	SLIST_INIT(&cntl->lro_free);
77	SLIST_INIT(&cntl->lro_active);
78
79	cntl->lro_bad_csum = 0;
80	cntl->lro_queued = 0;
81	cntl->lro_flushed = 0;
82
83	for (i = 0; i < LRO_ENTRIES; i++) {
84                lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
85		    M_DEVBUF, M_NOWAIT | M_ZERO);
86                if (lro == NULL) {
87			if (i == 0)
88				error = ENOMEM;
89                        break;
90                }
91		cntl->lro_cnt = i;
92                SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
93        }
94
95	return (error);
96}
97
98void
99tcp_lro_free(struct lro_ctrl *cntl)
100{
101	struct lro_entry *entry;
102
103	while (!SLIST_EMPTY(&cntl->lro_free)) {
104		entry = SLIST_FIRST(&cntl->lro_free);
105		SLIST_REMOVE_HEAD(&cntl->lro_free, next);
106		free(entry, M_DEVBUF);
107	}
108}
109
110void
111tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
112{
113	struct ifnet *ifp;
114	struct ip *ip;
115	struct tcphdr *tcp;
116	uint32_t *ts_ptr;
117	uint32_t tcplen, tcp_csum;
118
119
120	if (lro->append_cnt) {
121		/* incorporate the new len into the ip header and
122		 * re-calculate the checksum */
123		ip = lro->ip;
124		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
125		ip->ip_sum = 0;
126		ip->ip_sum = 0xffff ^
127			do_csum_data((uint16_t*)ip,
128					      sizeof (*ip));
129
130		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
131			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
132		lro->m_head->m_pkthdr.csum_data = 0xffff;
133		lro->m_head->m_pkthdr.len = lro->len;
134
135		/* incorporate the latest ack into the tcp header */
136		tcp = (struct tcphdr *) (ip + 1);
137		tcp->th_ack = lro->ack_seq;
138		tcp->th_win = lro->window;
139		/* incorporate latest timestamp into the tcp header */
140		if (lro->timestamp) {
141			ts_ptr = (uint32_t *)(tcp + 1);
142			ts_ptr[1] = htonl(lro->tsval);
143			ts_ptr[2] = lro->tsecr;
144		}
145		/*
146		 * update checksum in tcp header by re-calculating the
147		 * tcp pseudoheader checksum, and adding it to the checksum
148		 * of the tcp payload data
149		 */
150		tcp->th_sum = 0;
151		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
152		tcp_csum = lro->data_csum;
153		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
154				      htons(tcplen + IPPROTO_TCP));
155		tcp_csum += do_csum_data((uint16_t*)tcp,
156						  tcp->th_off << 2);
157		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
158		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
159		tcp->th_sum = 0xffff ^ tcp_csum;
160	}
161	ifp = cntl->ifp;
162	(*ifp->if_input)(cntl->ifp, lro->m_head);
163	cntl->lro_queued += lro->append_cnt + 1;
164	cntl->lro_flushed++;
165	lro->m_head = NULL;
166	lro->timestamp = 0;
167	lro->append_cnt = 0;
168	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
169}
170
171int
172tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
173{
174	struct ether_header *eh;
175	struct ip *ip;
176	struct tcphdr *tcp;
177	uint32_t *ts_ptr;
178	struct mbuf *m_nxt, *m_tail;
179	struct lro_entry *lro;
180	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
181	int opt_bytes, trim, csum_flags;
182	uint32_t seq, tmp_csum, device_mtu;
183
184
185	eh = mtod(m_head, struct ether_header *);
186	if (eh->ether_type != htons(ETHERTYPE_IP))
187		return 1;
188	ip = (struct ip *) (eh + 1);
189	if (ip->ip_p != IPPROTO_TCP)
190		return 1;
191
192	/* ensure there are no options */
193	if ((ip->ip_hl << 2) != sizeof (*ip))
194		return -1;
195
196	/* .. and the packet is not fragmented */
197	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
198		return -1;
199
200	/* verify that the IP header checksum is correct */
201	csum_flags = m_head->m_pkthdr.csum_flags;
202	if (csum_flags & CSUM_IP_CHECKED) {
203		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
204			cntl->lro_bad_csum++;
205			return -1;
206		}
207	} else {
208		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
209		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
210			cntl->lro_bad_csum++;
211			return -1;
212		}
213	}
214
215	/* find the TCP header */
216	tcp = (struct tcphdr *) (ip + 1);
217
218	/* Get the TCP checksum if we dont have it */
219	if (!csum)
220		csum = tcp->th_sum;
221
222	/* ensure no bits set besides ack or psh */
223	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
224		return -1;
225
226	/* check for timestamps. Since the only option we handle are
227	   timestamps, we only have to handle the simple case of
228	   aligned timestamps */
229
230	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
231	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
232	ts_ptr = (uint32_t *)(tcp + 1);
233	if (opt_bytes != 0) {
234		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
235		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
236		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
237			return -1;
238	}
239
240	ip_len = ntohs(ip->ip_len);
241	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
242
243
244	/*
245	 * If frame is padded beyond the end of the IP packet,
246	 * then we must trim the extra bytes off the end.
247	 */
248	tot_len = m_head->m_pkthdr.len;
249	trim = tot_len - (ip_len + ETHER_HDR_LEN);
250	if (trim != 0) {
251		if (trim < 0) {
252			/* truncated packet */
253			return -1;
254		}
255		m_adj(m_head, -trim);
256		tot_len = m_head->m_pkthdr.len;
257	}
258
259	m_nxt = m_head;
260	m_tail = NULL; /* -Wuninitialized */
261	while (m_nxt != NULL) {
262		m_tail = m_nxt;
263		m_nxt = m_tail->m_next;
264	}
265
266	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
267	seq = ntohl(tcp->th_seq);
268
269	SLIST_FOREACH(lro, &cntl->lro_active, next) {
270		if (lro->source_port == tcp->th_sport &&
271		    lro->dest_port == tcp->th_dport &&
272		    lro->source_ip == ip->ip_src.s_addr &&
273		    lro->dest_ip == ip->ip_dst.s_addr) {
274			/* Flush now if appending will result in overflow. */
275			if (lro->len > (65535 - tcp_data_len)) {
276				SLIST_REMOVE(&cntl->lro_active, lro,
277					     lro_entry, next);
278				tcp_lro_flush(cntl, lro);
279				break;
280			}
281
282			/* Try to append it */
283
284			if (__predict_false(seq != lro->next_seq ||
285				    (tcp_data_len == 0 &&
286				    lro->ack_seq == tcp->th_ack))) {
287				/* out of order packet or dup ack */
288				SLIST_REMOVE(&cntl->lro_active, lro,
289					     lro_entry, next);
290				tcp_lro_flush(cntl, lro);
291				return -1;
292			}
293
294			if (opt_bytes) {
295				uint32_t tsval = ntohl(*(ts_ptr + 1));
296				/* make sure timestamp values are increasing */
297				if (__predict_false(lro->tsval > tsval ||
298					     *(ts_ptr + 2) == 0)) {
299					return -1;
300				}
301				lro->tsval = tsval;
302				lro->tsecr = *(ts_ptr + 2);
303			}
304
305			lro->next_seq += tcp_data_len;
306			lro->ack_seq = tcp->th_ack;
307			lro->window = tcp->th_win;
308			lro->append_cnt++;
309			if (tcp_data_len == 0) {
310				m_freem(m_head);
311				return 0;
312			}
313			/* subtract off the checksum of the tcp header
314                         * from the hardware checksum, and add it to the
315                         * stored tcp data checksum.  Byteswap the checksum
316			 * if the total length so far is odd
317                         */
318			tmp_csum = do_csum_data((uint16_t*)tcp,
319							 tcp_hdr_len);
320			csum = csum + (tmp_csum ^ 0xffff);
321			csum = (csum & 0xffff) + (csum >> 16);
322			csum = (csum & 0xffff) + (csum >> 16);
323			if (lro->len & 0x1) {
324				/* Odd number of bytes so far, flip bytes */
325				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
326			}
327			csum = csum + lro->data_csum;
328			csum = (csum & 0xffff) + (csum >> 16);
329			csum = (csum & 0xffff) + (csum >> 16);
330			lro->data_csum = csum;
331
332			lro->len += tcp_data_len;
333
334			/* adjust mbuf so that m->m_data points to
335			   the first byte of the payload */
336			m_adj(m_head, hlen);
337			/* append mbuf chain */
338			lro->m_tail->m_next = m_head;
339			/* advance the last pointer */
340			lro->m_tail = m_tail;
341			/* flush packet if required */
342			device_mtu = cntl->ifp->if_mtu;
343			if (lro->len > (65535 - device_mtu)) {
344				SLIST_REMOVE(&cntl->lro_active, lro,
345					     lro_entry, next);
346				tcp_lro_flush(cntl, lro);
347			}
348			return 0;
349		}
350	}
351
352	if (SLIST_EMPTY(&cntl->lro_free))
353	    return -1;
354
355	/* start a new chain */
356	lro = SLIST_FIRST(&cntl->lro_free);
357	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
358	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
359	lro->source_port = tcp->th_sport;
360	lro->dest_port = tcp->th_dport;
361	lro->source_ip = ip->ip_src.s_addr;
362	lro->dest_ip = ip->ip_dst.s_addr;
363	lro->next_seq = seq + tcp_data_len;
364	lro->mss = tcp_data_len;
365	lro->ack_seq = tcp->th_ack;
366	lro->window = tcp->th_win;
367
368	/* save the checksum of just the TCP payload by
369	 * subtracting off the checksum of the TCP header from
370	 * the entire hardware checksum
371	 * Since IP header checksum is correct, checksum over
372	 * the IP header is -0.  Substracting -0 is unnecessary.
373	 */
374	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
375	csum = csum + (tmp_csum ^ 0xffff);
376	csum = (csum & 0xffff) + (csum >> 16);
377	csum = (csum & 0xffff) + (csum >> 16);
378	lro->data_csum = csum;
379
380	lro->ip = ip;
381	/* record timestamp if it is present */
382	if (opt_bytes) {
383		lro->timestamp = 1;
384		lro->tsval = ntohl(*(ts_ptr + 1));
385		lro->tsecr = *(ts_ptr + 2);
386	}
387	lro->len = tot_len;
388	lro->m_head = m_head;
389	lro->m_tail = m_tail;
390	return 0;
391}
392