/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 280376 2015-03-23 15:47:37Z arybchik $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/syslog.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

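/*
 * Toeplitz key used to seed receive-side scaling (RSS) hashing;
 * programmed into the controller by sfxge_rx_start().
 */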
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

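/*
 * Refill callout handler: post a software event tagged with the queue
 * index so that the refill is retried from the event queue context.
 */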
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH  64

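/*
 * Post new receive buffers to the hardware, adding up to 'target' more
 * descriptors (capped at the queue limit) in batches of
 * SFXGE_REFILL_BATCH addresses.  If buffer allocation fails part-way,
 * a delayed refill is scheduled for the remainder.
 */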
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

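/*
 * Hand a single received packet to the stack: convert the hardware
 * checksum and hash flags into mbuf metadata, strip the RX prefix and
 * clear the software descriptor.
 */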
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

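/*
 * Deliver the coalesced super-packet held for connection 'c': restore
 * the IP and TCP header fields that were adjusted while merging, then
 * pass the mbuf chain to the stack.
 */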
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

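/*
 * Start tracking a new connection: reuse a free connection descriptor
 * if one is available, otherwise allocate one, unless the hash bucket
 * has already reached lro_chain_max entries.
 */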
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

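/*
 * Process newly completed receive descriptors: drop errored, discarded
 * and loopback packets, pass the rest to LRO or directly to the stack
 * (pipelined one descriptor behind so payloads can be prefetched), and
 * refill the queue if it has drained below the refill threshold.
 */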
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

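/*
 * Stop a receive queue: request a flush through the common code and
 * poll for up to two seconds for it to complete (retrying if it fails),
 * then reclaim all outstanding buffers and destroy the hardware queue.
 */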
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

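/*
 * Start a receive queue: program its buffer table entries, create and
 * enable the common code receive queue, then fill it with buffers.
 */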
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

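/*
 * Start the receive path: size the packet buffers from the MTU and pick
 * a matching mbuf zone, program the RSS indirection table and Toeplitz
 * key, then start each receive queue.
 */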
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

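/*
 * Allocate and initialise the per-queue LRO hash table and connection
 * lists; the table is sized by the lro_table_size tunable.
 */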
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

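/*
 * Allocate and initialise the software state for one receive queue:
 * DMA memory for the descriptor ring, buffer table entries, the
 * software descriptor array, the LRO state and the refill callout.
 */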
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

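/*
 * Sysctl handler that reports the sum of one sfxge_rx_stats[] entry
 * across all receive queues.
 */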
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}