/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 282940 2015-05-15 06:48:36Z arybchik $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/syslog.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

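/*
 * Flush state callbacks, invoked from event processing when the hardware
 * reports that an RX queue flush has completed or failed.
 */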
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

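/* Toeplitz hash key programmed into the controller for RSS */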
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

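/*
 * Request an RX queue refill from event queue context by posting a
 * software (magic) event to the queue's event queue.
 */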
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

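/* Allocate an mbuf and attach a receive buffer from the selected zone */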
static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH  64

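/*
 * Fill the RX queue towards the given target level, posting descriptors
 * to the hardware in batches of up to SFXGE_REFILL_BATCH buffers.  If
 * mbuf allocation fails, schedule a delayed retry.
 */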
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

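/* Hand a completed packet to the network stack */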
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

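/*
 * Convert hardware receive flags to mbuf checksum and RSS hash metadata,
 * strip the prefix and deliver the packet to the stack.
 */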
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

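/*
 * Deliver a coalesced packet for an LRO connection: finalise the IP and
 * TCP headers, set checksum and RSS metadata, and pass the chain up the
 * stack.
 */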
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

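/* Append a segment to the coalesced chain of an existing LRO connection */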
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

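/* Start a new coalesced chain with this packet as its head */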
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

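/* Allocate (or recycle) and initialise tracking state for a new connection */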
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

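/*
 * At the end of a polling burst, process the packet held on each active
 * connection and purge idle connection state (at most once per tick).
 */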
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

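/*
 * Process received packets between the completed and pending indices:
 * discard bad or loopback packets, pass the rest to LRO or directly to
 * the stack, then top up the RX queue if it has drained below the refill
 * threshold.
 */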
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

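/* Stop an RX queue: flush it in hardware, complete outstanding buffers
 * and destroy the common code queue state.
 */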
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

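/* Start an RX queue: program the buffer table, create and enable the
 * common code queue and perform the initial fill.
 */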
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

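/* Allocate and initialise the software state for an RX queue */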
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

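/* Sysctl handler that reports a statistic summed across all RX queues */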
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}