/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 279179 2015-02-22 18:56:03Z arybchik $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

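/* Refill threshold: sfxge_rx_qcomplete() tops the queue up again once its
 * fill level drops below 90% of the queue limit.
 */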
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

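/* Toeplitz hash key for receive-side scaling, programmed into the NIC by
 * sfxge_rx_start() via efx_rx_scale_toeplitz_ipv4_key_set().  This appears
 * to be the widely used default RSS key rather than a driver-specific one.
 */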
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

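/* Callout handler for a deferred refill: post a software "magic" event to
 * the RX queue's event queue so that the refill itself runs from the event
 * processing path (sfxge_rx_qfill() asserts that the EVQ lock is held).
 */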
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

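/* Allocate an mbuf with a packet header and attach a receive buffer from
 * the cluster zone selected in sfxge_rx_start().  Returns NULL if either
 * allocation fails.
 */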
static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH  64

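/* Fill the RX queue up to 'target' descriptors with freshly allocated
 * buffers.  DMA addresses are posted to the common code in batches of
 * SFXGE_REFILL_BATCH; if allocation runs dry before the target is reached,
 * a refill callout is scheduled.  Caller must hold the EVQ lock.
 */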
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

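/* Hand a single received packet to the stack: translate hardware checksum
 * flags to mbuf csum_flags, set the flow ID from the hardware hash (TCP
 * only), strip the hardware prefix and pass the mbuf to if_input().
 */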
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
#endif
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

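/* Deliver a coalesced LRO burst to the stack.  Restores the IP length to
 * network byte order, recomputes the IPv4 header checksum, copies the
 * window, ACK and (same-sized) options from the most recent segment, then
 * hands the mbuf chain to __sfxge_rx_deliver().
 */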
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

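/* Append a segment's payload to an existing LRO chain and update the
 * coalesced IP length and TCP state.  The burst is delivered early if
 * another segment could overflow the 16-bit IP length field.
 */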
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

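/* Start tracking a new connection for LRO, either by recycling an entry
 * from the free list or by allocating a new one.  Tracking is skipped if
 * the hash bucket already holds lro_chain_max connections.
 */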
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

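/* Process completed RX descriptors up to rxq->pending.  Delivery is
 * pipelined: each descriptor's data is prefetched while the previous one
 * is passed to LRO or directly to the stack.  Loopback and discarded
 * packets are returned to the pool.  At end of poll (eop) any held LRO
 * state is flushed, and the queue is topped up if it has drained below
 * the refill threshold.
 */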
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

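/* Stop a started RX queue: flush it, poll (in 100 ms steps) for the flush
 * to complete, drain any outstanding descriptors, then destroy the common
 * code queue and release its buffer table entries.  A failed flush is
 * retried from the top.
 */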
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

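/* Start an initialized RX queue: program the buffer table, create and
 * enable the common-code queue, then attempt an initial fill from the
 * mbuf pool.
 */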
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

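/* Allocate and initialise the per-queue LRO hash table (lro_table_size
 * buckets) and the associated connection lists.
 */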
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
	return (rc);
}