1/*-
2 * Copyright (c) 2007, Myricom Inc.
3 * Copyright (c) 2008, Intel Corporation.
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Bjoern Zeeb
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD$");
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/mbuf.h>
41#include <sys/kernel.h>
42#include <sys/socket.h>
43
44#include <net/if.h>
45#include <net/if_var.h>
46#include <net/ethernet.h>
47#include <net/vnet.h>
48
49#include <netinet/in_systm.h>
50#include <netinet/in.h>
51#include <netinet/ip6.h>
52#include <netinet/ip.h>
53#include <netinet/ip_var.h>
54#include <netinet/tcp.h>
55#include <netinet/tcp_lro.h>
56
57#include <netinet6/ip6_var.h>
58
59#include <machine/in_cksum.h>
60
/*
 * Number of LRO entries tcp_lro_init() tries to pre-allocate.  May be
 * overridden by defining LRO_ENTRIES before this point.
 */
#ifndef LRO_ENTRIES
#define	LRO_ENTRIES	8	/* # of LRO entries per RX queue. */
#endif

/*
 * TCP_LRO_UPDATE_CSUM selects whether the IP header checksum and the TCP
 * checksum of a merged packet are recomputed in tcp_lro_flush().  It is
 * defined unconditionally on the next line, so TCP_LRO_INVALID_CSUM (the
 * placeholder written into the checksum fields otherwise) only comes into
 * existence if that #define is removed.
 */
#define	TCP_LRO_UPDATE_CSUM	1
#ifndef	TCP_LRO_UPDATE_CSUM
#define	TCP_LRO_INVALID_CSUM	0x0000
#endif
69
70int
71tcp_lro_init(struct lro_ctrl *lc)
72{
73	struct lro_entry *le;
74	int error, i;
75
76	lc->lro_bad_csum = 0;
77	lc->lro_queued = 0;
78	lc->lro_flushed = 0;
79	lc->lro_cnt = 0;
80	SLIST_INIT(&lc->lro_free);
81	SLIST_INIT(&lc->lro_active);
82
83	error = 0;
84	for (i = 0; i < LRO_ENTRIES; i++) {
85		le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
86		    M_NOWAIT | M_ZERO);
87                if (le == NULL) {
88			if (i == 0)
89				error = ENOMEM;
90                        break;
91                }
92		lc->lro_cnt = i + 1;
93		SLIST_INSERT_HEAD(&lc->lro_free, le, next);
94        }
95
96	return (error);
97}
98
99void
100tcp_lro_free(struct lro_ctrl *lc)
101{
102	struct lro_entry *le;
103
104	while (!SLIST_EMPTY(&lc->lro_free)) {
105		le = SLIST_FIRST(&lc->lro_free);
106		SLIST_REMOVE_HEAD(&lc->lro_free, next);
107		free(le, M_DEVBUF);
108	}
109}
110
111#ifdef TCP_LRO_UPDATE_CSUM
112static uint16_t
113tcp_lro_csum_th(struct tcphdr *th)
114{
115	uint32_t ch;
116	uint16_t *p, l;
117
118	ch = th->th_sum = 0x0000;
119	l = th->th_off;
120	p = (uint16_t *)th;
121	while (l > 0) {
122		ch += *p;
123		p++;
124		ch += *p;
125		p++;
126		l--;
127	}
128	while (ch > 0xffff)
129		ch = (ch >> 16) + (ch & 0xffff);
130
131	return (ch & 0xffff);
132}
133
134static uint16_t
135tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
136    uint16_t tcp_data_len, uint16_t csum)
137{
138	uint32_t c;
139	uint16_t cs;
140
141	c = csum;
142
143	/* Remove length from checksum. */
144	switch (le->eh_type) {
145#ifdef INET6
146	case ETHERTYPE_IPV6:
147	{
148		struct ip6_hdr *ip6;
149
150		ip6 = (struct ip6_hdr *)l3hdr;
151		if (le->append_cnt == 0)
152			cs = ip6->ip6_plen;
153		else {
154			uint32_t cx;
155
156			cx = ntohs(ip6->ip6_plen);
157			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
158		}
159		break;
160	}
161#endif
162#ifdef INET
163	case ETHERTYPE_IP:
164	{
165		struct ip *ip4;
166
167		ip4 = (struct ip *)l3hdr;
168		if (le->append_cnt == 0)
169			cs = ip4->ip_len;
170		else {
171			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
172			    IPPROTO_TCP);
173			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
174			    htons(cs));
175		}
176		break;
177	}
178#endif
179	default:
180		cs = 0;		/* Keep compiler happy. */
181	}
182
183	cs = ~cs;
184	c += cs;
185
186	/* Remove TCP header csum. */
187	cs = ~tcp_lro_csum_th(th);
188	c += cs;
189	while (c > 0xffff)
190		c = (c >> 16) + (c & 0xffff);
191
192	return (c & 0xffff);
193}
194#endif
195
/*
 * Hand the packet accumulated in an LRO entry to the interface input routine
 * and recycle the entry.
 *
 * If at least one segment was appended (le->append_cnt > 0) the headers of
 * the first packet are patched first: the IP length field, for IPv4 the IP
 * header checksum, the mbuf checksum flags and packet length, the TCP
 * ack/window fields, the timestamp option (if the entry tracks one) and the
 * TCP checksum.  A single, unmerged packet is passed on untouched.
 *
 * Afterwards the queued/flushed counters in lc are updated, the entry is
 * zeroed and put on lc->lro_free.  This function does not unlink le from
 * lc->lro_active; the callers in this file do that before calling.
 */
void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{

	if (le->append_cnt > 0) {
		struct tcphdr *th;
		uint16_t p_len;

		/* Accumulated length, converted for storing into the header. */
		p_len = htons(le->p_len);
		switch (le->eh_type) {
#ifdef INET6
		case ETHERTYPE_IPV6:
		{
			struct ip6_hdr *ip6;

			ip6 = le->le_ip6;
			ip6->ip6_plen = p_len;
			th = (struct tcphdr *)(ip6 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR;
			/*
			 * p_len excluded the IPv6 header so far; turn it into
			 * the full frame length for m_pkthdr.len below.
			 */
			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
			break;
		}
#endif
#ifdef INET
		case ETHERTYPE_IP:
		{
			struct ip *ip4;
#ifdef TCP_LRO_UPDATE_CSUM
			uint32_t cl;
			uint16_t c;
#endif

			ip4 = le->le_ip4;
#ifdef TCP_LRO_UPDATE_CSUM
			/*
			 * Fix IP header checksum for new length: start from
			 * the complement of the stored checksum, add the
			 * complement of the old length and the new length,
			 * fold the carries and complement again.
			 */
			c = ~ip4->ip_sum;
			cl = c;
			c = ~ip4->ip_len;
			cl += c + p_len;
			while (cl > 0xffff)
				cl = (cl >> 16) + (cl & 0xffff);
			c = cl;
			ip4->ip_sum = ~c;
#else
			ip4->ip_sum = TCP_LRO_INVALID_CSUM;
#endif
			ip4->ip_len = p_len;
			th = (struct tcphdr *)(ip4 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
			/* For IPv4 p_len already includes the IP header. */
			le->p_len += ETHER_HDR_LEN;
			break;
		}
#endif
		default:
			/*
			 * NOTE(review): th is dereferenced below; this branch
			 * relies on eh_type always being one of the cases
			 * above.
			 */
			th = NULL;	/* Keep compiler happy. */
		}
		le->m_head->m_pkthdr.csum_data = 0xffff;
		le->m_head->m_pkthdr.len = le->p_len;

		/* Incorporate the latest ACK into the TCP header. */
		th->th_ack = le->ack_seq;
		th->th_win = le->window;
		/* Incorporate latest timestamp into the TCP header. */
		if (le->timestamp != 0) {
			uint32_t *ts_ptr;

			/*
			 * ts_ptr[0] is the option header word, [1] and [2]
			 * are the two timestamp values.  tsval is kept in
			 * host order in the entry, tsecr as received.
			 */
			ts_ptr = (uint32_t *)(th + 1);
			ts_ptr[1] = htonl(le->tsval);
			ts_ptr[2] = le->tsecr;
		}
#ifdef TCP_LRO_UPDATE_CSUM
		/*
		 * Update the TCP header checksum: re-add the new length and
		 * the sum of the updated TCP header to the saved partial sum,
		 * fold it and store its complement.  tcp_lro_csum_th() zeroes
		 * th_sum before summing.
		 */
		le->ulp_csum += p_len;
		le->ulp_csum += tcp_lro_csum_th(th);
		while (le->ulp_csum > 0xffff)
			le->ulp_csum = (le->ulp_csum >> 16) +
			    (le->ulp_csum & 0xffff);
		th->th_sum = (le->ulp_csum & 0xffff);
		th->th_sum = ~th->th_sum;
#else
		th->th_sum = TCP_LRO_INVALID_CSUM;
#endif
	}

	/* Pass the packet on; le->m_head is not referenced afterwards. */
	(*lc->ifp->if_input)(lc->ifp, le->m_head);
	lc->lro_queued += le->append_cnt + 1;
	lc->lro_flushed++;
	/* Zero all state (including ulp_csum) before recycling the entry. */
	bzero(le, sizeof(*le));
	SLIST_INSERT_HEAD(&lc->lro_free, le, next);
}
288
289#ifdef INET6
290static int
291tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
292    struct tcphdr **th)
293{
294
295	/* XXX-BZ we should check the flow-label. */
296
297	/* XXX-BZ We do not yet support ext. hdrs. */
298	if (ip6->ip6_nxt != IPPROTO_TCP)
299		return (TCP_LRO_NOT_SUPPORTED);
300
301	/* Find the TCP header. */
302	*th = (struct tcphdr *)(ip6 + 1);
303
304	return (0);
305}
306#endif
307
308#ifdef INET
309static int
310tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
311    struct tcphdr **th)
312{
313	int csum_flags;
314	uint16_t csum;
315
316	if (ip4->ip_p != IPPROTO_TCP)
317		return (TCP_LRO_NOT_SUPPORTED);
318
319	/* Ensure there are no options. */
320	if ((ip4->ip_hl << 2) != sizeof (*ip4))
321		return (TCP_LRO_CANNOT);
322
323	/* .. and the packet is not fragmented. */
324	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
325		return (TCP_LRO_CANNOT);
326
327	/* Legacy IP has a header checksum that needs to be correct. */
328	csum_flags = m->m_pkthdr.csum_flags;
329	if (csum_flags & CSUM_IP_CHECKED) {
330		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
331			lc->lro_bad_csum++;
332			return (TCP_LRO_CANNOT);
333		}
334	} else {
335		csum = in_cksum_hdr(ip4);
336		if (__predict_false((csum) != 0)) {
337			lc->lro_bad_csum++;
338			return (TCP_LRO_CANNOT);
339		}
340	}
341
342	/* Find the TCP header (we assured there are no IP options). */
343	*th = (struct tcphdr *)(ip4 + 1);
344
345	return (0);
346}
347#endif
348
/*
 * Try to merge a received Ethernet frame into an LRO entry.
 *
 * lc	LRO control block of the receive queue; lc->ifp must be set.
 * m	the received frame, starting at the Ethernet header; the Ethernet,
 *	IP and TCP headers are accessed through direct pointer arithmetic
 *	and are therefore expected to be contiguous in the first mbuf.
 * csum	TCP checksum value supplied by the driver; if 0, th->th_sum from
 *	the packet is used instead.
 *
 * Returns 0 when m has been taken over: appended to an existing entry,
 * freed (a pure ACK that only updated an entry), or made the head of a new
 * entry.  Returns TCP_LRO_NOT_SUPPORTED or TCP_LRO_CANNOT when the packet
 * was not taken; on those paths m is not freed here (it may have had
 * trailing padding trimmed), so it presumably remains the caller's to
 * deliver -- confirm against the driver contract.
 */
int
tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
{
	struct lro_entry *le;
	struct ether_header *eh;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip4 = NULL;		/* Keep compiler happy. */
#endif
	struct tcphdr *th;
	void *l3hdr = NULL;		/* Keep compiler happy. */
	uint32_t *ts_ptr;
	tcp_seq seq;
	int error, ip_len, l;
	uint16_t eh_type, tcp_data_len;

	/* We expect a contiguous header [eh, ip, tcp]. */

	eh = mtod(m, struct ether_header *);
	eh_type = ntohs(eh->ether_type);
	switch (eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
	{
		/* No LRO while IPv6 forwarding is enabled in this vnet. */
		CURVNET_SET(lc->ifp->if_vnet);
		if (V_ip6_forwarding != 0) {
			/* XXX-BZ stats but changing lro_ctrl is a problem. */
			CURVNET_RESTORE();
			return (TCP_LRO_CANNOT);
		}
		CURVNET_RESTORE();
		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
		if (error != 0)
			return (error);
		/* Here tcp_data_len still includes the TCP header. */
		tcp_data_len = ntohs(ip6->ip6_plen);
		ip_len = sizeof(*ip6) + tcp_data_len;
		break;
	}
#endif
#ifdef INET
	case ETHERTYPE_IP:
	{
		/* No LRO while IPv4 forwarding is enabled in this vnet. */
		CURVNET_SET(lc->ifp->if_vnet);
		if (V_ipforwarding != 0) {
			/* XXX-BZ stats but changing lro_ctrl is a problem. */
			CURVNET_RESTORE();
			return (TCP_LRO_CANNOT);
		}
		CURVNET_RESTORE();
		l3hdr = ip4 = (struct ip *)(eh + 1);
		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
		if (error != 0)
			return (error);
		ip_len = ntohs(ip4->ip_len);
		/*
		 * Here tcp_data_len still includes the TCP header.
		 * NOTE(review): ip_len is not checked against the header
		 * sizes, so a bogus small value would wrap this uint16_t.
		 */
		tcp_data_len = ip_len - sizeof(*ip4);
		break;
	}
#endif
	/* XXX-BZ what happens in case of VLAN(s)? */
	default:
		return (TCP_LRO_NOT_SUPPORTED);
	}

	/*
	 * If the frame is padded beyond the end of the IP packet, then we must
	 * trim the extra bytes off.
	 */
	l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
	if (l != 0) {
		if (l < 0)
			/* Truncated packet. */
			return (TCP_LRO_CANNOT);

		/* A negative count trims from the tail of the chain. */
		m_adj(m, -l);
	}

	/*
	 * Check TCP header constraints.
	 */
	/* Ensure no bits set besides ACK or PSH. */
	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
		return (TCP_LRO_CANNOT);

	/* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
	/* XXX-BZ Ideally we'd flush on PUSH? */

	/*
	 * Check for timestamps.
	 * Since the only option we handle are timestamps, we only have to
	 * handle the simple case of aligned timestamps.
	 *
	 * After this block l is the length of the TCP options (0 or
	 * TCPOLEN_TSTAMP_APPA) and tcp_data_len is the TCP payload length.
	 */
	l = (th->th_off << 2);
	tcp_data_len -= l;
	l -= sizeof(*th);
	ts_ptr = (uint32_t *)(th + 1);
	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
		return (TCP_LRO_CANNOT);

	/* If the driver did not pass in the checksum, set it now. */
	if (csum == 0x0000)
		csum = th->th_sum;

	seq = ntohl(th->th_seq);

	/* Try to find a matching previous segment. */
	SLIST_FOREACH(le, &lc->lro_active, next) {
		/* Match on ethertype, ports and addresses. */
		if (le->eh_type != eh_type)
			continue;
		if (le->source_port != th->th_sport ||
		    le->dest_port != th->th_dport)
			continue;
		switch (eh_type) {
#ifdef INET6
		case ETHERTYPE_IPV6:
			if (bcmp(&le->source_ip6, &ip6->ip6_src,
			    sizeof(struct in6_addr)) != 0 ||
			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
			    sizeof(struct in6_addr)) != 0)
				continue;
			break;
#endif
#ifdef INET
		case ETHERTYPE_IP:
			if (le->source_ip4 != ip4->ip_src.s_addr ||
			    le->dest_ip4 != ip4->ip_dst.s_addr)
				continue;
			break;
#endif
		}

		/*
		 * Flush now if appending will result in overflow.  Leaving
		 * the loop afterwards makes this packet start a new chain
		 * below.
		 */
		if (le->p_len > (65535 - tcp_data_len)) {
			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
			tcp_lro_flush(lc, le);
			break;
		}

		/* Try to append the new segment. */
		if (__predict_false(seq != le->next_seq ||
		    (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
			/* Out of order packet or duplicate ACK. */
			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
			tcp_lro_flush(lc, le);
			return (TCP_LRO_CANNOT);
		}

		if (l != 0) {
			uint32_t tsval = ntohl(*(ts_ptr + 1));
			/* Make sure timestamp values are increasing. */
			/* XXX-BZ flip and use TSTMP_GEQ macro for this? */
			/*
			 * NOTE(review): on this rejection the entry stays on
			 * the active list unflushed, unlike the out-of-order
			 * case above.
			 */
			if (__predict_false(le->tsval > tsval ||
			    *(ts_ptr + 2) == 0))
				return (TCP_LRO_CANNOT);
			le->tsval = tsval;
			le->tsecr = *(ts_ptr + 2);
		}

		/* Record the latest values for patching in tcp_lro_flush(). */
		le->next_seq += tcp_data_len;
		le->ack_seq = th->th_ack;
		le->window = th->th_win;
		le->append_cnt++;

#ifdef TCP_LRO_UPDATE_CSUM
		/* Accumulate this segment's adjusted checksum. */
		le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
		    tcp_data_len, ~csum);
#endif

		/* A pure ACK only updates the entry; the mbuf is done. */
		if (tcp_data_len == 0) {
			m_freem(m);
			return (0);
		}

		le->p_len += tcp_data_len;

		/*
		 * Adjust the mbuf so that m_data points to the first byte of
		 * the ULP payload.  Adjust the mbuf to avoid complications and
		 * append new segment to existing mbuf chain.
		 */
		m_adj(m, m->m_pkthdr.len - tcp_data_len);
		m->m_flags &= ~M_PKTHDR;

		le->m_tail->m_next = m;
		le->m_tail = m_last(m);

		/*
		 * If a possible next full length packet would cause an
		 * overflow, pro-actively flush now.
		 */
		if (le->p_len > (65535 - lc->ifp->if_mtu)) {
			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
			tcp_lro_flush(lc, le);
		}

		return (0);
	}

	/* Try to find an empty slot. */
	if (SLIST_EMPTY(&lc->lro_free))
		return (TCP_LRO_CANNOT);

	/* Start a new segment chain. */
	le = SLIST_FIRST(&lc->lro_free);
	SLIST_REMOVE_HEAD(&lc->lro_free, next);
	SLIST_INSERT_HEAD(&lc->lro_active, le, next);

	/*
	 * Start filling in details.  Note that p_len excludes the IPv6
	 * header but includes the IPv4 header; tcp_lro_flush() accounts for
	 * the difference.
	 */
	switch (eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		le->le_ip6 = ip6;
		le->source_ip6 = ip6->ip6_src;
		le->dest_ip6 = ip6->ip6_dst;
		le->eh_type = eh_type;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		le->le_ip4 = ip4;
		le->source_ip4 = ip4->ip_src.s_addr;
		le->dest_ip4 = ip4->ip_dst.s_addr;
		le->eh_type = eh_type;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
		break;
#endif
	}
	le->source_port = th->th_sport;
	le->dest_port = th->th_dport;

	le->next_seq = seq + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	if (l != 0) {
		/* tsval is kept in host order, tsecr as received. */
		le->timestamp = 1;
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
	}

#ifdef TCP_LRO_UPDATE_CSUM
	/*
	 * Do not touch the csum of the first packet.  However save the
	 * "adjusted" checksum of just the source and destination addresses,
	 * the next header and the TCP payload.  The length and TCP header
	 * parts may change, so we remove those from the saved checksum and
	 * re-add with final values on tcp_lro_flush() if needed.
	 */
	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
	    __func__, le, le->ulp_csum));

	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
	    ~csum);
	/* The fixup zeroed th_sum while summing the header. */
	th->th_sum = csum;	/* Restore checksum on first packet. */
#endif

	le->m_head = m;
	le->m_tail = m_last(m);

	return (0);
}
614
615/* end */
616