/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 280783 2015-03-28 10:20:20Z arybchik $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/syslog.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

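/* Flush state callbacks: note completion or failure of the receive queue
 * flush started in sfxge_rx_qstop().
 */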
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

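/* Toeplitz hash key programmed into the controller for RSS
 * (see sfxge_rx_start()).
 */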
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

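/* Callout handler: ask for the receive queue to be refilled by posting a
 * software (magic) event to the corresponding event queue.
 */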
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH  64

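/* Allocate mbufs and post up to 'target' receive descriptors to the hardware
 * ring in batches of SFXGE_REFILL_BATCH.  If allocation fails, schedule a
 * delayed retry via the refill callout.
 */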
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

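/* Refill a started receive queue up to its limit. */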
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

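/* Hand a completed mbuf (chain) to the network stack. */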
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

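/* Deliver a single received packet to the stack, translating the hardware
 * receive flags into mbuf checksum and RSS hash metadata and stripping the
 * hardware prefix.
 */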
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (rx_desc->flags & EFX_PKT_IPV4) ?
			((rx_desc->flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((rx_desc->flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

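/* Deliver the coalesced packet held by an LRO connection: restore the IP
 * length and checksum, copy the latest TCP window, ACK and options, and pass
 * the chain to the stack.
 */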
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

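/* Append a received segment to the connection's coalesced packet, updating
 * the headers in the head mbuf; deliver early if another segment could
 * overflow the IP length field.
 */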
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

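/* Start a new coalesced packet for this connection, using the given segment
 * as its head.
 */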
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

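/* Begin tracking a newly seen TCP flow, reusing a free connection structure
 * if one is available; give up if the hash bucket is already full.
 */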
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

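/* At the end of an event burst, merge or deliver the packet buffered for
 * each active connection and periodically purge idle connections.
 */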
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

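/* Process completed receive descriptors up to rxq->pending: discard errored
 * and loopback packets, pass the rest to LRO or directly to the stack, and
 * top up the ring if it has drained below the refill threshold.
 */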
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

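/* Stop a receive queue: flush it (retrying if the flush fails), complete any
 * outstanding descriptors and tear down the common code queue and its buffer
 * table entries.
 */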
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

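/* Start a receive queue: program the buffer table, create and enable the
 * common code queue and perform the initial fill.
 */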
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

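/* Bring up the receive path: initialize the common code receive module, size
 * the packet buffers, program the RSS indirection table and Toeplitz key and
 * start each receive queue.
 */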
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

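/* Allocate and initialize the software state for one receive queue. */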
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

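/* Sysctl handler reporting an RX statistic summed across all receive queues. */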
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

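/* Validate the LRO tunables, then create one receive queue per interrupt and
 * register the RX statistics sysctls.
 */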
int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be a power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}