sfxge_rx.c revision 282899
1/*-
2 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 282899 2015-05-14 13:28:29Z arybchik $");
32
33#include <sys/types.h>
34#include <sys/mbuf.h>
35#include <sys/smp.h>
36#include <sys/socket.h>
37#include <sys/sysctl.h>
38#include <sys/limits.h>
39#include <sys/syslog.h>
40
41#include <net/ethernet.h>
42#include <net/if.h>
43#include <net/if_vlan_var.h>
44
45#include <netinet/in.h>
46#include <netinet/ip.h>
47#include <netinet/ip6.h>
48#include <netinet/tcp.h>
49
50#include <machine/in_cksum.h>
51
52#include "common/efx.h"
53
54
55#include "sfxge.h"
56#include "sfxge_rx.h"
57
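/* Refill threshold: the queue is topped back up in sfxge_rx_qcomplete()
 * once the number of outstanding descriptors drops below roughly 90% of
 * the hardware queue limit.
 */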
58#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
59
60#ifdef SFXGE_LRO
61
62SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63	    "Large receive offload (LRO) parameters");
64
65#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
66
67/* Size of the LRO hash table.  Must be a power of 2.  A larger table
68 * means we can accelerate a larger number of streams.
69 */
70static unsigned lro_table_size = 128;
71TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73	    &lro_table_size, 0,
74	    "Size of the LRO hash table (must be a power of 2)");
75
76/* Maximum length of a hash chain.  If chains get too long then the lookup
77 * time increases and may exceed the benefit of LRO.
78 */
79static unsigned lro_chain_max = 20;
80TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82	    &lro_chain_max, 0,
83	    "The maximum length of a hash chain");
84
85/* Maximum time (in ticks) that a connection can be idle before it's LRO
86 * state is discarded.
87 */
88static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91	    &lro_idle_ticks, 0,
92	    "The maximum time (in ticks) that a connection can be idle "
93	    "before its LRO state is discarded");
94
95/* Number of packets with payload that must arrive in-order before a
96 * connection is eligible for LRO.  The idea is we should avoid coalescing
97 * segments when the sender is in slow-start because reducing the ACK rate
98 * can damage performance.
99 */
100static int lro_slow_start_packets = 2000;
101TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103	    &lro_slow_start_packets, 0,
104	    "Number of packets with payload that must arrive in-order before "
105	    "a connection is eligible for LRO");
106
107/* Number of packets with payload that must arrive in-order following loss
108 * before a connection is eligible for LRO.  The idea is we should avoid
109 * coalescing segments when the sender is recovering from loss, because
110 * reducing the ACK rate can damage performance.
111 */
112static int lro_loss_packets = 20;
113TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115	    &lro_loss_packets, 0,
116	    "Number of packets with payload that must arrive in-order "
117	    "following loss before a connection is eligible for LRO");
118
119/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120#define	SFXGE_LRO_L2_ID_VLAN 0x4000
121#define	SFXGE_LRO_L2_ID_IPV6 0x8000
122#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124
125/* Compare IPv6 addresses, avoiding conditional branches */
126static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127				   const struct in6_addr *right)
128{
129#if LONG_BIT == 64
130	const uint64_t *left64 = (const uint64_t *)left;
131	const uint64_t *right64 = (const uint64_t *)right;
132	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133#else
134	return (left->s6_addr32[0] - right->s6_addr32[0]) |
135	       (left->s6_addr32[1] - right->s6_addr32[1]) |
136	       (left->s6_addr32[2] - right->s6_addr32[2]) |
137	       (left->s6_addr32[3] - right->s6_addr32[3]);
138#endif
139}
140
141#endif	/* SFXGE_LRO */
142
143void
144sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145{
146
147	rxq->flush_state = SFXGE_FLUSH_DONE;
148}
149
150void
151sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152{
153
154	rxq->flush_state = SFXGE_FLUSH_FAILED;
155}
156
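/* Toeplitz key used for receive-side scaling (RSS); programmed into the
 * controller by sfxge_rx_start().
 */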
157static uint8_t toep_key[] = {
158	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163};
164
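/* Callout handler for a deferred refill: post a software event
 * (SFXGE_MAGIC_RX_QREFILL) to the queue's event queue so that the refill
 * itself runs from the event processing context, where the EVQ lock is
 * held.
 */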
165static void
166sfxge_rx_post_refill(void *arg)
167{
168	struct sfxge_rxq *rxq = arg;
169	struct sfxge_softc *sc;
170	unsigned int index;
171	struct sfxge_evq *evq;
172	uint16_t magic;
173
174	sc = rxq->sc;
175	index = rxq->index;
176	evq = sc->evq[index];
177
178	magic = SFXGE_MAGIC_RX_QREFILL | index;
179
180	/* This is guaranteed due to the start/stop order of rx and ev */
181	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182	    ("evq not started"));
183	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184	    ("rxq not started"));
185	efx_ev_qpost(evq->common, magic);
186}
187
188static void
189sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190{
191	/* Initially retry after 100 ms, but back off in case of
192	 * repeated failures as we probably have to wait for the
193	 * administrator to raise the pool limit. */
194	if (retrying)
195		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196	else
197		rxq->refill_delay = hz / 10;
198
199	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200			     sfxge_rx_post_refill, rxq);
201}
202
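/* Allocate an mbuf and attach a packet buffer from the cluster zone
 * selected in sfxge_rx_start().  Both allocations use M_NOWAIT since
 * this is called from the event processing path.
 */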
203static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204{
205	struct mb_args args;
206	struct mbuf *m;
207
208	/* Allocate mbuf structure */
209	args.flags = M_PKTHDR;
210	args.type = MT_DATA;
211	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212
213	/* Allocate (and attach) packet buffer */
214	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215		uma_zfree(zone_mbuf, m);
216		m = NULL;
217	}
218
219	return (m);
220}
221
222#define	SFXGE_REFILL_BATCH  64
223
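/* Try to bring the receive queue up to the given fill target.  Buffers
 * are allocated and posted in batches of SFXGE_REFILL_BATCH; if mbuf
 * allocation fails part way through, whatever was posted is still pushed
 * to the hardware and a delayed refill is scheduled.
 */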
224static void
225sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226{
227	struct sfxge_softc *sc;
228	unsigned int index;
229	struct sfxge_evq *evq;
230	unsigned int batch;
231	unsigned int rxfill;
232	unsigned int mblksize;
233	int ntodo;
234	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235
236	sc = rxq->sc;
237	index = rxq->index;
238	evq = sc->evq[index];
239
240	prefetch_read_many(sc->enp);
241	prefetch_read_many(rxq->common);
242
243	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244
245	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246		return;
247
248	rxfill = rxq->added - rxq->completed;
249	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254
255	if (ntodo == 0)
256		return;
257
258	batch = 0;
259	mblksize = sc->rx_buffer_size;
260	while (ntodo-- > 0) {
261		unsigned int id;
262		struct sfxge_rx_sw_desc *rx_desc;
263		bus_dma_segment_t seg;
264		struct mbuf *m;
265
266		id = (rxq->added + batch) & rxq->ptr_mask;
267		rx_desc = &rxq->queue[id];
268		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269
270		rx_desc->flags = EFX_DISCARD;
271		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272		if (m == NULL)
273			break;
274		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275		addr[batch++] = seg.ds_addr;
276
277		if (batch == SFXGE_REFILL_BATCH) {
278			efx_rx_qpost(rxq->common, addr, mblksize, batch,
279			    rxq->completed, rxq->added);
280			rxq->added += batch;
281			batch = 0;
282		}
283	}
284
285	if (ntodo != 0)
286		sfxge_rx_schedule_refill(rxq, retrying);
287
288	if (batch != 0) {
289		efx_rx_qpost(rxq->common, addr, mblksize, batch,
290		    rxq->completed, rxq->added);
291		rxq->added += batch;
292	}
293
294	/* Make the descriptors visible to the hardware */
295	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296			BUS_DMASYNC_PREWRITE);
297
298	efx_rx_qpush(rxq->common, rxq->added);
299}
300
301void
302sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303{
304
305	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306		return;
307
308	/* Make sure the queue is full */
309	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310}
311
312static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313{
314	struct ifnet *ifp = sc->ifnet;
315
316	m->m_pkthdr.rcvif = ifp;
317	m->m_pkthdr.csum_data = 0xffff;
318	ifp->if_input(ifp, m);
319}
320
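/* Deliver a single received packet to the stack: convert the hardware
 * checksum and RSS hash flags into mbuf metadata, strip the RX prefix
 * that precedes the packet data, and hand the mbuf to if_input().
 */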
321static void
322sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323{
324	struct mbuf *m = rx_desc->mbuf;
325	int flags = rx_desc->flags;
326	int csum_flags;
327
328	/* Convert checksum flags */
329	csum_flags = (flags & EFX_CKSUM_IPV4) ?
330		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
331	if (flags & EFX_CKSUM_TCPUDP)
332		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
333
334	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
335		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
336						       mtod(m, uint8_t *));
337		/* The hash covers a 4-tuple for TCP only */
338		M_HASHTYPE_SET(m,
339		    (flags & EFX_PKT_IPV4) ?
340			((flags & EFX_PKT_TCP) ?
341			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
342			((flags & EFX_PKT_TCP) ?
343			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
344	}
345	m->m_data += sc->rx_prefix_size;
346	m->m_len = rx_desc->size - sc->rx_prefix_size;
347	m->m_pkthdr.len = m->m_len;
348	m->m_pkthdr.csum_flags = csum_flags;
349	__sfxge_rx_deliver(sc, rx_desc->mbuf);
350
351	rx_desc->flags = EFX_DISCARD;
352	rx_desc->mbuf = NULL;
353}
354
355#ifdef SFXGE_LRO
356
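/* Deliver a coalesced packet up the stack.  The IP length field is kept
 * in host byte order while segments are merged (see sfxge_lro_start()),
 * so restore network byte order, recompute the IPv4 header checksum
 * where applicable, and take the receive window, ACK number and TCP
 * options from the most recently merged segment.
 */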
357static void
358sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
359{
360	struct sfxge_softc *sc = st->sc;
361	struct mbuf *m = c->mbuf;
362	struct tcphdr *c_th;
363	int csum_flags;
364
365	KASSERT(m, ("no mbuf to deliver"));
366
367	++st->n_bursts;
368
369	/* Finish off packet munging and recalculate IP header checksum. */
370	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
371		struct ip *iph = c->nh;
372		iph->ip_len = htons(iph->ip_len);
373		iph->ip_sum = 0;
374		iph->ip_sum = in_cksum_hdr(iph);
375		c_th = (struct tcphdr *)(iph + 1);
376		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
377			      CSUM_IP_CHECKED | CSUM_IP_VALID);
378	} else {
379		struct ip6_hdr *iph = c->nh;
380		iph->ip6_plen = htons(iph->ip6_plen);
381		c_th = (struct tcphdr *)(iph + 1);
382		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
383	}
384
385	c_th->th_win = c->th_last->th_win;
386	c_th->th_ack = c->th_last->th_ack;
387	if (c_th->th_off == c->th_last->th_off) {
388		/* Copy TCP options (take care to avoid going negative). */
389		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
390		memcpy(c_th + 1, c->th_last + 1, optlen);
391	}
392
393	m->m_pkthdr.flowid = c->conn_hash;
394	M_HASHTYPE_SET(m,
395	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
396		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
397
398	m->m_pkthdr.csum_flags = csum_flags;
399	__sfxge_rx_deliver(sc, m);
400
401	c->mbuf = NULL;
402	c->delivered = 1;
403}
404
405/* Drop the given connection, and add it to the free list. */
406static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
407{
408	unsigned bucket;
409
410	KASSERT(!c->mbuf, ("found orphaned mbuf"));
411
412	if (c->next_buf.mbuf != NULL) {
413		sfxge_rx_deliver(rxq->sc, &c->next_buf);
414		LIST_REMOVE(c, active_link);
415	}
416
417	bucket = c->conn_hash & rxq->lro.conns_mask;
418	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
419	--rxq->lro.conns_n[bucket];
420	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
421	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
422}
423
424/* Stop tracking connections that have gone idle in order to keep hash
425 * chains short.
426 */
427static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
428{
429	struct sfxge_lro_conn *c;
430	unsigned i;
431
432	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
433		("found active connections"));
434
435	rxq->lro.last_purge_ticks = now;
436	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
437		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
438			continue;
439
440		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
441		if (now - c->last_pkt_ticks > lro_idle_ticks) {
442			++rxq->lro.n_drop_idle;
443			sfxge_lro_drop(rxq, c);
444		}
445	}
446}
447
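/* Append an in-order segment to an existing LRO chain: link the mbuf on
 * at the tail, extend the packet and (host byte order) IP lengths, and
 * remember the latest TCP header for use by sfxge_lro_deliver().
 */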
448static void
449sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
450		struct mbuf *mbuf, struct tcphdr *th)
451{
452	struct tcphdr *c_th;
453
454	/* Tack the new mbuf onto the chain. */
455	KASSERT(!mbuf->m_next, ("mbuf already chained"));
456	c->mbuf_tail->m_next = mbuf;
457	c->mbuf_tail = mbuf;
458
459	/* Increase length appropriately */
460	c->mbuf->m_pkthdr.len += mbuf->m_len;
461
462	/* Update the connection state flags */
463	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
464		struct ip *iph = c->nh;
465		iph->ip_len += mbuf->m_len;
466		c_th = (struct tcphdr *)(iph + 1);
467	} else {
468		struct ip6_hdr *iph = c->nh;
469		iph->ip6_plen += mbuf->m_len;
470		c_th = (struct tcphdr *)(iph + 1);
471	}
472	c_th->th_flags |= (th->th_flags & TH_PUSH);
473	c->th_last = th;
474	++st->n_merges;
475
476	/* Pass packet up now if another segment could overflow the IP
477	 * length.
478	 */
479	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
480		sfxge_lro_deliver(st, c);
481}
482
483static void
484sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
485		struct mbuf *mbuf, void *nh, struct tcphdr *th)
486{
487	/* Start the chain */
488	c->mbuf = mbuf;
489	c->mbuf_tail = c->mbuf;
490	c->nh = nh;
491	c->th_last = th;
492
493	mbuf->m_pkthdr.len = mbuf->m_len;
494
495	/* Mangle header fields for later processing */
496	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
497		struct ip *iph = nh;
498		iph->ip_len = ntohs(iph->ip_len);
499	} else {
500		struct ip6_hdr *iph = nh;
501		iph->ip6_plen = ntohs(iph->ip6_plen);
502	}
503}
504
505/* Try to merge or otherwise hold or deliver (as appropriate) the
506 * packet buffered for this connection (c->next_buf).  Return a flag
507 * indicating whether the connection is still active for LRO purposes.
508 */
509static int
510sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
511{
512	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
513	char *eh = c->next_eh;
514	int data_length, hdr_length, dont_merge;
515	unsigned th_seq, pkt_length;
516	struct tcphdr *th;
517	unsigned now;
518
519	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
520		struct ip *iph = c->next_nh;
521		th = (struct tcphdr *)(iph + 1);
522		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
523	} else {
524		struct ip6_hdr *iph = c->next_nh;
525		th = (struct tcphdr *)(iph + 1);
526		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
527	}
528
529	hdr_length = (char *) th + th->th_off * 4 - eh;
530	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
531		       hdr_length);
532	th_seq = ntohl(th->th_seq);
533	dont_merge = ((data_length <= 0)
534		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
535
536	/* Check for options other than aligned timestamp. */
537	if (th->th_off != 5) {
538		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
539		if (th->th_off == 8 &&
540		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
541					(TCPOPT_NOP << 16) |
542					(TCPOPT_TIMESTAMP << 8) |
543					TCPOLEN_TIMESTAMP)) {
544			/* timestamp option -- okay */
545		} else {
546			dont_merge = 1;
547		}
548	}
549
550	if (__predict_false(th_seq != c->next_seq)) {
551		/* Out-of-order, so start counting again. */
552		if (c->mbuf != NULL)
553			sfxge_lro_deliver(&rxq->lro, c);
554		c->n_in_order_pkts -= lro_loss_packets;
555		c->next_seq = th_seq + data_length;
556		++rxq->lro.n_misorder;
557		goto deliver_buf_out;
558	}
559	c->next_seq = th_seq + data_length;
560
561	now = ticks;
562	if (now - c->last_pkt_ticks > lro_idle_ticks) {
563		++rxq->lro.n_drop_idle;
564		if (c->mbuf != NULL)
565			sfxge_lro_deliver(&rxq->lro, c);
566		sfxge_lro_drop(rxq, c);
567		return (0);
568	}
569	c->last_pkt_ticks = ticks;
570
571	if (c->n_in_order_pkts < lro_slow_start_packets) {
572		/* May be in slow-start, so don't merge. */
573		++rxq->lro.n_slow_start;
574		++c->n_in_order_pkts;
575		goto deliver_buf_out;
576	}
577
578	if (__predict_false(dont_merge)) {
579		if (c->mbuf != NULL)
580			sfxge_lro_deliver(&rxq->lro, c);
581		if (th->th_flags & (TH_FIN | TH_RST)) {
582			++rxq->lro.n_drop_closed;
583			sfxge_lro_drop(rxq, c);
584			return (0);
585		}
586		goto deliver_buf_out;
587	}
588
589	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
590
591	if (__predict_true(c->mbuf != NULL)) {
592		/* Remove headers and any padding */
593		rx_buf->mbuf->m_data += hdr_length;
594		rx_buf->mbuf->m_len = data_length;
595
596		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
597	} else {
598		/* Remove any padding */
599		rx_buf->mbuf->m_len = pkt_length;
600
601		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
602	}
603
604	rx_buf->mbuf = NULL;
605	return (1);
606
607 deliver_buf_out:
608	sfxge_rx_deliver(rxq->sc, rx_buf);
609	return (1);
610}
611
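/* Start tracking a new connection, reusing an entry from the free list
 * when possible.  Failure here is harmless: the packet that triggered
 * the call is delivered normally and LRO simply isn't attempted yet.
 */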
612static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
613			       uint16_t l2_id, void *nh, struct tcphdr *th)
614{
615	unsigned bucket = conn_hash & st->conns_mask;
616	struct sfxge_lro_conn *c;
617
618	if (st->conns_n[bucket] >= lro_chain_max) {
619		++st->n_too_many;
620		return;
621	}
622
623	if (!TAILQ_EMPTY(&st->free_conns)) {
624		c = TAILQ_FIRST(&st->free_conns);
625		TAILQ_REMOVE(&st->free_conns, c, link);
626	} else {
627		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
628		if (c == NULL)
629			return;
630		c->mbuf = NULL;
631		c->next_buf.mbuf = NULL;
632	}
633
634	/* Create the connection tracking data */
635	++st->conns_n[bucket];
636	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
637	c->l2_id = l2_id;
638	c->conn_hash = conn_hash;
639	c->source = th->th_sport;
640	c->dest = th->th_dport;
641	c->n_in_order_pkts = 0;
642	c->last_pkt_ticks = *(volatile int *)&ticks;
643	c->delivered = 0;
644	++st->n_new_stream;
645	/* NB. We don't initialise c->next_seq, and it doesn't matter what
646	 * value it has.  Most likely the next packet received for this
647	 * connection will not match -- no harm done.
648	 */
649}
650
651/* Process mbuf and decide whether to dispatch it to the stack now or
652 * later.
653 */
654static void
655sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
656{
657	struct sfxge_softc *sc = rxq->sc;
658	struct mbuf *m = rx_buf->mbuf;
659	struct ether_header *eh;
660	struct sfxge_lro_conn *c;
661	uint16_t l2_id;
662	uint16_t l3_proto;
663	void *nh;
664	struct tcphdr *th;
665	uint32_t conn_hash;
666	unsigned bucket;
667
668	/* Get the hardware hash */
669	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
670				      mtod(m, uint8_t *));
671
672	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
673	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
674		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
675		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
676			SFXGE_LRO_L2_ID_VLAN;
677		l3_proto = veh->evl_proto;
678		nh = veh + 1;
679	} else {
680		l2_id = 0;
681		l3_proto = eh->ether_type;
682		nh = eh + 1;
683	}
684
685	/* Check whether this is a suitable packet (unfragmented
686	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
687	 * length, and compute a hash if necessary.  If not, return.
688	 */
689	if (l3_proto == htons(ETHERTYPE_IP)) {
690		struct ip *iph = nh;
691		if ((iph->ip_p - IPPROTO_TCP) |
692		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
693		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
694			goto deliver_now;
695		th = (struct tcphdr *)(iph + 1);
696	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
697		struct ip6_hdr *iph = nh;
698		if (iph->ip6_nxt != IPPROTO_TCP)
699			goto deliver_now;
700		l2_id |= SFXGE_LRO_L2_ID_IPV6;
701		th = (struct tcphdr *)(iph + 1);
702	} else {
703		goto deliver_now;
704	}
705
706	bucket = conn_hash & rxq->lro.conns_mask;
707
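	/* Look up the connection.  The subtract-and-OR comparisons below
	 * check several fields with a single conditional, in the same
	 * branch-avoiding style as ipv6_addr_cmp() above.
	 */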
708	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
709		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
710			continue;
711		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
712			continue;
713		if (c->mbuf != NULL) {
714			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
715				struct ip *c_iph, *iph = nh;
716				c_iph = c->nh;
717				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
718				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
719					continue;
720			} else {
721				struct ip6_hdr *c_iph, *iph = nh;
722				c_iph = c->nh;
723				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
724				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
725					continue;
726			}
727		}
728
729		/* Re-insert at head of list to reduce lookup time. */
730		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
731		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
732
733		if (c->next_buf.mbuf != NULL) {
734			if (!sfxge_lro_try_merge(rxq, c))
735				goto deliver_now;
736		} else {
737			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
738			    active_link);
739		}
740		c->next_buf = *rx_buf;
741		c->next_eh = eh;
742		c->next_nh = nh;
743
744		rx_buf->mbuf = NULL;
745		rx_buf->flags = EFX_DISCARD;
746		return;
747	}
748
749	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
750 deliver_now:
751	sfxge_rx_deliver(sc, rx_buf);
752}
753
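/* Called at the end of an event queue poll: deliver or merge any packets
 * still held for connections seen in this burst, and purge idle
 * connections at most once per tick.
 */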
754static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
755{
756	struct sfxge_lro_state *st = &rxq->lro;
757	struct sfxge_lro_conn *c;
758	unsigned t;
759
760	while (!LIST_EMPTY(&st->active_conns)) {
761		c = LIST_FIRST(&st->active_conns);
762		if (!c->delivered && c->mbuf != NULL)
763			sfxge_lro_deliver(st, c);
764		if (sfxge_lro_try_merge(rxq, c)) {
765			if (c->mbuf != NULL)
766				sfxge_lro_deliver(st, c);
767			LIST_REMOVE(c, active_link);
768		}
769		c->delivered = 0;
770	}
771
772	t = *(volatile int *)&ticks;
773	if (__predict_false(t != st->last_purge_ticks))
774		sfxge_lro_purge_idle(rxq, t);
775}
776
777#else	/* !SFXGE_LRO */
778
779static void
780sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
781{
782}
783
784static void
785sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
786{
787}
788
789#endif	/* SFXGE_LRO */
790
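/* Process descriptors completed by the hardware, up to rxq->pending.
 * Delivery runs one descriptor behind the scan (via 'prev') so that the
 * next packet's data can be prefetched while the previous one is handed
 * to LRO or the stack.
 */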
791void
792sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
793{
794	struct sfxge_softc *sc = rxq->sc;
795	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
796	unsigned int index;
797	struct sfxge_evq *evq;
798	unsigned int completed;
799	unsigned int level;
800	struct mbuf *m;
801	struct sfxge_rx_sw_desc *prev = NULL;
802
803	index = rxq->index;
804	evq = sc->evq[index];
805
806	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
807
808	completed = rxq->completed;
809	while (completed != rxq->pending) {
810		unsigned int id;
811		struct sfxge_rx_sw_desc *rx_desc;
812
813		id = completed++ & rxq->ptr_mask;
814		rx_desc = &rxq->queue[id];
815		m = rx_desc->mbuf;
816
817		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
818			goto discard;
819
820		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
821			goto discard;
822
823		prefetch_read_many(mtod(m, caddr_t));
824
825		/* Check for loopback packets */
826		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
827		    !(rx_desc->flags & EFX_PKT_IPV6)) {
828			struct ether_header *etherhp;
829
830			/*LINTED*/
831			etherhp = mtod(m, struct ether_header *);
832
833			if (etherhp->ether_type ==
834			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
835				EFSYS_PROBE(loopback);
836
837				rxq->loopback++;
838				goto discard;
839			}
840		}
841
842		/* Pass packet up the stack or into LRO (pipelined) */
843		if (prev != NULL) {
844			if (lro_enabled)
845				sfxge_lro(rxq, prev);
846			else
847				sfxge_rx_deliver(sc, prev);
848		}
849		prev = rx_desc;
850		continue;
851
852discard:
853		/* Return the packet to the pool */
854		m_free(m);
855		rx_desc->mbuf = NULL;
856	}
857	rxq->completed = completed;
858
859	level = rxq->added - rxq->completed;
860
861	/* Pass last packet up the stack or into LRO */
862	if (prev != NULL) {
863		if (lro_enabled)
864			sfxge_lro(rxq, prev);
865		else
866			sfxge_rx_deliver(sc, prev);
867	}
868
869	/*
870	 * If there are any pending flows and this is the end of the
871	 * poll then they must be completed.
872	 */
873	if (eop)
874		sfxge_lro_end_of_burst(rxq);
875
876	/* Top up the queue if necessary */
877	if (level < rxq->refill_threshold)
878		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
879}
880
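/* Stop a receive queue: request a hardware flush, wait up to roughly two
 * seconds for the flush-done event (retrying if the flush is reported as
 * failed), then complete all outstanding descriptors and release the
 * queue's buffer table entries.
 */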
881static void
882sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
883{
884	struct sfxge_rxq *rxq;
885	struct sfxge_evq *evq;
886	unsigned int count;
887
888	rxq = sc->rxq[index];
889	evq = sc->evq[index];
890
891	SFXGE_EVQ_LOCK(evq);
892
893	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
894	    ("rxq not started"));
895
896	rxq->init_state = SFXGE_RXQ_INITIALIZED;
897
898	callout_stop(&rxq->refill_callout);
899
900again:
901	rxq->flush_state = SFXGE_FLUSH_PENDING;
902
903	/* Flush the receive queue */
904	efx_rx_qflush(rxq->common);
905
906	SFXGE_EVQ_UNLOCK(evq);
907
908	count = 0;
909	do {
910		/* Spin for 100 ms */
911		DELAY(100000);
912
913		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
914			break;
915
916	} while (++count < 20);
917
918	SFXGE_EVQ_LOCK(evq);
919
920	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
921		goto again;
922
923	rxq->flush_state = SFXGE_FLUSH_DONE;
924
925	rxq->pending = rxq->added;
926	sfxge_rx_qcomplete(rxq, B_TRUE);
927
928	KASSERT(rxq->completed == rxq->pending,
929	    ("rxq->completed != rxq->pending"));
930
931	rxq->added = 0;
932	rxq->pending = 0;
933	rxq->completed = 0;
934	rxq->loopback = 0;
935
936	/* Destroy the common code receive queue. */
937	efx_rx_qdestroy(rxq->common);
938
939	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
940	    EFX_RXQ_NBUFS(sc->rxq_entries));
941
942	SFXGE_EVQ_UNLOCK(evq);
943}
944
945static int
946sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
947{
948	struct sfxge_rxq *rxq;
949	efsys_mem_t *esmp;
950	struct sfxge_evq *evq;
951	int rc;
952
953	rxq = sc->rxq[index];
954	esmp = &rxq->mem;
955	evq = sc->evq[index];
956
957	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
958	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
959	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
960	    ("evq->init_state != SFXGE_EVQ_STARTED"));
961
962	/* Program the buffer table. */
963	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
964	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
965		return (rc);
966
967	/* Create the common code receive queue. */
968	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
969	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
970	    &rxq->common)) != 0)
971		goto fail;
972
973	SFXGE_EVQ_LOCK(evq);
974
975	/* Enable the receive queue. */
976	efx_rx_qenable(rxq->common);
977
978	rxq->init_state = SFXGE_RXQ_STARTED;
979
980	/* Try to fill the queue from the pool. */
981	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
982
983	SFXGE_EVQ_UNLOCK(evq);
984
985	return (0);
986
987fail:
988	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
989	    EFX_RXQ_NBUFS(sc->rxq_entries));
990	return (rc);
991}
992
993void
994sfxge_rx_stop(struct sfxge_softc *sc)
995{
996	int index;
997
998	/* Stop the receive queue(s) */
999	index = sc->rxq_count;
1000	while (--index >= 0)
1001		sfxge_rx_qstop(sc, index);
1002
1003	sc->rx_prefix_size = 0;
1004	sc->rx_buffer_size = 0;
1005
1006	efx_rx_fini(sc->enp);
1007}
1008
1009int
1010sfxge_rx_start(struct sfxge_softc *sc)
1011{
1012	struct sfxge_intr *intr;
1013	int index;
1014	int rc;
1015
1016	intr = &sc->intr;
1017
1018	/* Initialize the common code receive module. */
1019	if ((rc = efx_rx_init(sc->enp)) != 0)
1020		return (rc);
1021
1022	/* Calculate the receive packet buffer size. */
1023	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1024	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1025			      sc->rx_prefix_size);
1026
1027	/* Select zone for packet buffers */
1028	if (sc->rx_buffer_size <= MCLBYTES)
1029		sc->rx_buffer_zone = zone_clust;
1030	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1031		sc->rx_buffer_zone = zone_jumbop;
1032	else if (sc->rx_buffer_size <= MJUM9BYTES)
1033		sc->rx_buffer_zone = zone_jumbo9;
1034	else
1035		sc->rx_buffer_zone = zone_jumbo16;
1036
1037	/*
1038	 * Set up the scale table.  Enable all hash types and hash insertion.
1039	 */
1040	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1041		sc->rx_indir_table[index] = index % sc->rxq_count;
1042	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1043				       SFXGE_RX_SCALE_MAX)) != 0)
1044		goto fail;
1045	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1046	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1047	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1048
1049	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1050	    sizeof(toep_key))) != 0)
1051		goto fail;
1052
1053	/* Start the receive queue(s). */
1054	for (index = 0; index < sc->rxq_count; index++) {
1055		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1056			goto fail2;
1057	}
1058
1059	return (0);
1060
1061fail2:
1062	while (--index >= 0)
1063		sfxge_rx_qstop(sc, index);
1064
1065fail:
1066	efx_rx_fini(sc->enp);
1067
1068	return (rc);
1069}
1070
1071#ifdef SFXGE_LRO
1072
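/* Allocate the per-queue LRO connection hash table (lro_table_size
 * buckets) together with the per-bucket occupancy counts.
 */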
1073static void sfxge_lro_init(struct sfxge_rxq *rxq)
1074{
1075	struct sfxge_lro_state *st = &rxq->lro;
1076	unsigned i;
1077
1078	st->conns_mask = lro_table_size - 1;
1079	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1080		("lro_table_size must be a power of 2"));
1081	st->sc = rxq->sc;
1082	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1083			   M_SFXGE, M_WAITOK);
1084	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1085			     M_SFXGE, M_WAITOK);
1086	for (i = 0; i <= st->conns_mask; ++i) {
1087		TAILQ_INIT(&st->conns[i]);
1088		st->conns_n[i] = 0;
1089	}
1090	LIST_INIT(&st->active_conns);
1091	TAILQ_INIT(&st->free_conns);
1092}
1093
1094static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1095{
1096	struct sfxge_lro_state *st = &rxq->lro;
1097	struct sfxge_lro_conn *c;
1098	unsigned i;
1099
1100	/* Return cleanly if sfxge_lro_init() has not been called. */
1101	if (st->conns == NULL)
1102		return;
1103
1104	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1105
1106	for (i = 0; i <= st->conns_mask; ++i) {
1107		while (!TAILQ_EMPTY(&st->conns[i])) {
1108			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1109			sfxge_lro_drop(rxq, c);
1110		}
1111	}
1112
1113	while (!TAILQ_EMPTY(&st->free_conns)) {
1114		c = TAILQ_FIRST(&st->free_conns);
1115		TAILQ_REMOVE(&st->free_conns, c, link);
1116		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1117		free(c, M_SFXGE);
1118	}
1119
1120	free(st->conns_n, M_SFXGE);
1121	free(st->conns, M_SFXGE);
1122	st->conns = NULL;
1123}
1124
1125#else
1126
1127static void
1128sfxge_lro_init(struct sfxge_rxq *rxq)
1129{
1130}
1131
1132static void
1133sfxge_lro_fini(struct sfxge_rxq *rxq)
1134{
1135}
1136
1137#endif	/* SFXGE_LRO */
1138
1139static void
1140sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1141{
1142	struct sfxge_rxq *rxq;
1143
1144	rxq = sc->rxq[index];
1145
1146	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1147	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1148
1149	/* Free the context array and the flow table. */
1150	free(rxq->queue, M_SFXGE);
1151	sfxge_lro_fini(rxq);
1152
1153	/* Release DMA memory. */
1154	sfxge_dma_free(&rxq->mem);
1155
1156	sc->rxq[index] = NULL;
1157
1158	free(rxq, M_SFXGE);
1159}
1160
1161static int
1162sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1163{
1164	struct sfxge_rxq *rxq;
1165	struct sfxge_evq *evq;
1166	efsys_mem_t *esmp;
1167	int rc;
1168
1169	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1170
1171	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1172	rxq->sc = sc;
1173	rxq->index = index;
1174	rxq->entries = sc->rxq_entries;
1175	rxq->ptr_mask = rxq->entries - 1;
1176	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1177
1178	sc->rxq[index] = rxq;
1179	esmp = &rxq->mem;
1180
1181	evq = sc->evq[index];
1182
1183	/* Allocate and zero DMA space. */
1184	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1185		return (rc);
1186
1187	/* Allocate buffer table entries. */
1188	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1189				 &rxq->buf_base_id);
1190
1191	/* Allocate the context array and the flow table. */
1192	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1193	    M_SFXGE, M_WAITOK | M_ZERO);
1194	sfxge_lro_init(rxq);
1195
1196	callout_init(&rxq->refill_callout, B_TRUE);
1197
1198	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1199
1200	return (0);
1201}
1202
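/* Per-queue statistics exported via sysctl.  Each entry maps a name to
 * the offset of a counter within struct sfxge_rxq, and
 * sfxge_rx_stat_handler() reports the sum of that counter across all
 * receive queues.
 */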
1203static const struct {
1204	const char *name;
1205	size_t offset;
1206} sfxge_rx_stats[] = {
1207#define	SFXGE_RX_STAT(name, member) \
1208	{ #name, offsetof(struct sfxge_rxq, member) }
1209#ifdef SFXGE_LRO
1210	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1211	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1212	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1213	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1214	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1215	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1216	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1217	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1218#endif
1219};
1220
1221static int
1222sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1223{
1224	struct sfxge_softc *sc = arg1;
1225	unsigned int id = arg2;
1226	unsigned int sum, index;
1227
1228	/* Sum across all RX queues */
1229	sum = 0;
1230	for (index = 0; index < sc->rxq_count; index++)
1231		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1232					 sfxge_rx_stats[id].offset);
1233
1234	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1235}
1236
1237static void
1238sfxge_rx_stat_init(struct sfxge_softc *sc)
1239{
1240	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1241	struct sysctl_oid_list *stat_list;
1242	unsigned int id;
1243
1244	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1245
1246	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1247		SYSCTL_ADD_PROC(
1248			ctx, stat_list,
1249			OID_AUTO, sfxge_rx_stats[id].name,
1250			CTLTYPE_UINT|CTLFLAG_RD,
1251			sc, id, sfxge_rx_stat_handler, "IU",
1252			"");
1253	}
1254}
1255
1256void
1257sfxge_rx_fini(struct sfxge_softc *sc)
1258{
1259	int index;
1260
1261	index = sc->rxq_count;
1262	while (--index >= 0)
1263		sfxge_rx_qfini(sc, index);
1264
1265	sc->rxq_count = 0;
1266}
1267
1268int
1269sfxge_rx_init(struct sfxge_softc *sc)
1270{
1271	struct sfxge_intr *intr;
1272	int index;
1273	int rc;
1274
1275#ifdef SFXGE_LRO
1276	if (!ISP2(lro_table_size)) {
1277		log(LOG_ERR, "%s=%u must be power of 2",
1278		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1279		rc = EINVAL;
1280		goto fail_lro_table_size;
1281	}
1282
1283	if (lro_idle_ticks == 0)
1284		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1285#endif
1286
1287	intr = &sc->intr;
1288
1289	sc->rxq_count = intr->n_alloc;
1290
1291	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1292	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1293
1294	/* Initialize the receive queue(s) - one per interrupt. */
1295	for (index = 0; index < sc->rxq_count; index++) {
1296		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1297			goto fail;
1298	}
1299
1300	sfxge_rx_stat_init(sc);
1301
1302	return (0);
1303
1304fail:
1305	/* Tear down the receive queue(s). */
1306	while (--index >= 0)
1307		sfxge_rx_qfini(sc, index);
1308
1309	sc->rxq_count = 0;
1310
1311#ifdef SFXGE_LRO
1312fail_lro_table_size:
1313#endif
1314	return (rc);
1315}
1316