sfxge_rx.c revision 283291
1/*-
2 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 283291 2015-05-22 17:05:21Z jkim $");
32
33#include <sys/types.h>
34#include <sys/mbuf.h>
35#include <sys/smp.h>
36#include <sys/socket.h>
37#include <sys/sysctl.h>
38#include <sys/limits.h>
39#include <sys/syslog.h>
40
41#include <net/ethernet.h>
42#include <net/if.h>
43#include <net/if_vlan_var.h>
44
45#include <netinet/in.h>
46#include <netinet/ip.h>
47#include <netinet/ip6.h>
48#include <netinet/tcp.h>
49
50#include <machine/in_cksum.h>
51
52#include "common/efx.h"
53
54
55#include "sfxge.h"
56#include "sfxge_rx.h"
57
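/* Refill the receive queue when its fill level drops below 90% of the
 * queue limit; see the check at the end of sfxge_rx_qcomplete().
 */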
58#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
59
60#ifdef SFXGE_LRO
61
62SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63	    "Large receive offload (LRO) parameters");
64
65#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
66
67/* Size of the LRO hash table.  Must be a power of 2.  A larger table
68 * means we can accelerate a larger number of streams.
69 */
70static unsigned lro_table_size = 128;
71TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73	    &lro_table_size, 0,
74	    "Size of the LRO hash table (must be a power of 2)");
75
76/* Maximum length of a hash chain.  If chains get too long then the cost of
77 * the lookup increases and may outweigh the benefit of LRO.
78 */
79static unsigned lro_chain_max = 20;
80TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82	    &lro_chain_max, 0,
83	    "The maximum length of a hash chain");
84
85/* Maximum time (in ticks) that a connection can be idle before its LRO
86 * state is discarded.
87 */
88static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91	    &lro_idle_ticks, 0,
92	    "The maximum time (in ticks) that a connection can be idle "
93	    "before its LRO state is discarded");
94
95/* Number of packets with payload that must arrive in-order before a
96 * connection is eligible for LRO.  The idea is we should avoid coalescing
97 * segments when the sender is in slow-start because reducing the ACK rate
98 * can damage performance.
99 */
100static int lro_slow_start_packets = 2000;
101TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103	    &lro_slow_start_packets, 0,
104	    "Number of packets with payload that must arrive in-order before "
105	    "a connection is eligible for LRO");
106
107/* Number of packets with payload that must arrive in-order following loss
108 * before a connection is eligible for LRO.  The idea is we should avoid
109 * coalescing segments when the sender is recovering from loss, because
110 * reducing the ACK rate can damage performance.
111 */
112static int lro_loss_packets = 20;
113TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115	    &lro_loss_packets, 0,
116	    "Number of packets with payload that must arrive in-order "
117	    "following loss before a connection is eligible for LRO");
118
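/* Note: the parameters above are read-only sysctls marked CTLFLAG_RDTUN
 * under hw.sfxge.lro.*, so they are set as loader tunables, e.g.
 * hw.sfxge.lro.table_size=256 in loader.conf (value is illustrative only).
 */
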
119/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120#define	SFXGE_LRO_L2_ID_VLAN 0x4000
121#define	SFXGE_LRO_L2_ID_IPV6 0x8000
122#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124
125/* Compare IPv6 addresses, avoiding conditional branches */
126static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127				   const struct in6_addr *right)
128{
129#if LONG_BIT == 64
130	const uint64_t *left64 = (const uint64_t *)left;
131	const uint64_t *right64 = (const uint64_t *)right;
132	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133#else
134	return (left->s6_addr32[0] - right->s6_addr32[0]) |
135	       (left->s6_addr32[1] - right->s6_addr32[1]) |
136	       (left->s6_addr32[2] - right->s6_addr32[2]) |
137	       (left->s6_addr32[3] - right->s6_addr32[3]);
138#endif
139}
140
141#endif	/* SFXGE_LRO */
142
143void
144sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145{
146
147	rxq->flush_state = SFXGE_FLUSH_DONE;
148}
149
150void
151sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152{
153
154	rxq->flush_state = SFXGE_FLUSH_FAILED;
155}
156
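/* RSS hash key, installed by sfxge_rx_start() via
 * efx_rx_scale_toeplitz_ipv4_key_set() so that the hardware spreads flows
 * across the receive queues using Toeplitz hashing.
 */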
157static uint8_t toep_key[] = {
158	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163};
164
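/* Callout handler armed by sfxge_rx_schedule_refill(): request a refill by
 * posting a software ("magic") event to the queue's event queue so that the
 * refill can be performed from event-processing context.
 */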
165static void
166sfxge_rx_post_refill(void *arg)
167{
168	struct sfxge_rxq *rxq = arg;
169	struct sfxge_softc *sc;
170	unsigned int index;
171	struct sfxge_evq *evq;
172	uint16_t magic;
173
174	sc = rxq->sc;
175	index = rxq->index;
176	evq = sc->evq[index];
177
178	magic = SFXGE_MAGIC_RX_QREFILL | index;
179
180	/* This is guaranteed due to the start/stop order of rx and ev */
181	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182	    ("evq not started"));
183	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184	    ("rxq not started"));
185	efx_ev_qpost(evq->common, magic);
186}
187
188static void
189sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190{
191	/* Initially retry after 100 ms, but back off in case of
192	 * repeated failures as we probably have to wait for the
193	 * administrator to raise the pool limit. */
194	if (retrying)
195		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196	else
197		rxq->refill_delay = hz / 10;
198
199	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200			     sfxge_rx_post_refill, rxq);
201}
202
203static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204{
205	struct mb_args args;
206	struct mbuf *m;
207
208	/* Allocate mbuf structure */
209	args.flags = M_PKTHDR;
210	args.type = MT_DATA;
211	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212
213	/* Allocate (and attach) packet buffer */
214	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215		uma_zfree(zone_mbuf, m);
216		m = NULL;
217	}
218
219	return (m);
220}
221
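/* Maximum number of DMA addresses accumulated before they are posted to
 * the hardware in one efx_rx_qpost() call by sfxge_rx_qfill().
 */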
222#define	SFXGE_REFILL_BATCH  64
223
224static void
225sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226{
227	struct sfxge_softc *sc;
228	unsigned int index;
229	struct sfxge_evq *evq;
230	unsigned int batch;
231	unsigned int rxfill;
232	unsigned int mblksize;
233	int ntodo;
234	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235
236	sc = rxq->sc;
237	index = rxq->index;
238	evq = sc->evq[index];
239
240	prefetch_read_many(sc->enp);
241	prefetch_read_many(rxq->common);
242
243	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244
245	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246		return;
247
248	rxfill = rxq->added - rxq->completed;
249	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254
255	if (ntodo == 0)
256		return;
257
258	batch = 0;
259	mblksize = sc->rx_buffer_size;
260	while (ntodo-- > 0) {
261		unsigned int id;
262		struct sfxge_rx_sw_desc *rx_desc;
263		bus_dma_segment_t seg;
264		struct mbuf *m;
265
266		id = (rxq->added + batch) & rxq->ptr_mask;
267		rx_desc = &rxq->queue[id];
268		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269
270		rx_desc->flags = EFX_DISCARD;
271		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272		if (m == NULL)
273			break;
274		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275		addr[batch++] = seg.ds_addr;
276
277		if (batch == SFXGE_REFILL_BATCH) {
278			efx_rx_qpost(rxq->common, addr, mblksize, batch,
279			    rxq->completed, rxq->added);
280			rxq->added += batch;
281			batch = 0;
282		}
283	}
284
285	if (ntodo != 0)
286		sfxge_rx_schedule_refill(rxq, retrying);
287
288	if (batch != 0) {
289		efx_rx_qpost(rxq->common, addr, mblksize, batch,
290		    rxq->completed, rxq->added);
291		rxq->added += batch;
292	}
293
294	/* Make the descriptors visible to the hardware */
295	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296			BUS_DMASYNC_PREWRITE);
297
298	efx_rx_qpush(rxq->common, rxq->added);
299}
300
301void
302sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303{
304
305	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306		return;
307
308	/* Make sure the queue is full */
309	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310}
311
312static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313{
314	struct ifnet *ifp = sc->ifnet;
315
316	m->m_pkthdr.rcvif = ifp;
317	m->m_pkthdr.csum_data = 0xffff;
318	ifp->if_input(ifp, m);
319}
320
321static void
322sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323{
324	struct mbuf *m = rx_desc->mbuf;
325	int flags = rx_desc->flags;
326	int csum_flags;
327
328	/* Convert checksum flags */
329	csum_flags = (flags & EFX_CKSUM_IPV4) ?
330		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
331	if (flags & EFX_CKSUM_TCPUDP)
332		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
333
334	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
335		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
336						       mtod(m, uint8_t *));
337		/* The hash covers a 4-tuple for TCP only */
338		M_HASHTYPE_SET(m,
339		    (flags & EFX_PKT_IPV4) ?
340			((flags & EFX_PKT_TCP) ?
341			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
342			((flags & EFX_PKT_TCP) ?
343			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
344	}
345	m->m_data += sc->rx_prefix_size;
346	m->m_len = rx_desc->size - sc->rx_prefix_size;
347	m->m_pkthdr.len = m->m_len;
348	m->m_pkthdr.csum_flags = csum_flags;
349	__sfxge_rx_deliver(sc, rx_desc->mbuf);
350
351	rx_desc->flags = EFX_DISCARD;
352	rx_desc->mbuf = NULL;
353}
354
355#ifdef SFXGE_LRO
356
357static void
358sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
359{
360	struct sfxge_softc *sc = st->sc;
361	struct mbuf *m = c->mbuf;
362	struct tcphdr *c_th;
363	int csum_flags;
364
365	KASSERT(m, ("no mbuf to deliver"));
366
367	++st->n_bursts;
368
369	/* Finish off packet munging and recalculate IP header checksum. */
370	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
371		struct ip *iph = c->nh;
372		iph->ip_len = htons(iph->ip_len);
373		iph->ip_sum = 0;
374		iph->ip_sum = in_cksum_hdr(iph);
375		c_th = (struct tcphdr *)(iph + 1);
376		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
377			      CSUM_IP_CHECKED | CSUM_IP_VALID);
378	} else {
379		struct ip6_hdr *iph = c->nh;
380		iph->ip6_plen = htons(iph->ip6_plen);
381		c_th = (struct tcphdr *)(iph + 1);
382		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
383	}
384
385	c_th->th_win = c->th_last->th_win;
386	c_th->th_ack = c->th_last->th_ack;
387	if (c_th->th_off == c->th_last->th_off) {
388		/* Copy TCP options (take care to avoid going negative). */
389		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
390		memcpy(c_th + 1, c->th_last + 1, optlen);
391	}
392
393	m->m_pkthdr.flowid = c->conn_hash;
394	M_HASHTYPE_SET(m,
395	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
396		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
397
398	m->m_pkthdr.csum_flags = csum_flags;
399	__sfxge_rx_deliver(sc, m);
400
401	c->mbuf = NULL;
402	c->delivered = 1;
403}
404
405/* Drop the given connection, and add it to the free list. */
406static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
407{
408	unsigned bucket;
409
410	KASSERT(!c->mbuf, ("found orphaned mbuf"));
411
412	if (c->next_buf.mbuf != NULL) {
413		sfxge_rx_deliver(rxq->sc, &c->next_buf);
414		LIST_REMOVE(c, active_link);
415	}
416
417	bucket = c->conn_hash & rxq->lro.conns_mask;
418	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
419	--rxq->lro.conns_n[bucket];
420	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
421	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
422}
423
424/* Stop tracking connections that have gone idle in order to keep hash
425 * chains short.
426 */
427static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
428{
429	struct sfxge_lro_conn *c;
430	unsigned i;
431
432	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
433		("found active connections"));
434
435	rxq->lro.last_purge_ticks = now;
436	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
437		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
438			continue;
439
440		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
441		if (now - c->last_pkt_ticks > lro_idle_ticks) {
442			++rxq->lro.n_drop_idle;
443			sfxge_lro_drop(rxq, c);
444		}
445	}
446}
447
448static void
449sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
450		struct mbuf *mbuf, struct tcphdr *th)
451{
452	struct tcphdr *c_th;
453
454	/* Tack the new mbuf onto the chain. */
455	KASSERT(!mbuf->m_next, ("mbuf already chained"));
456	c->mbuf_tail->m_next = mbuf;
457	c->mbuf_tail = mbuf;
458
459	/* Increase length appropriately */
460	c->mbuf->m_pkthdr.len += mbuf->m_len;
461
462	/* Update the connection state flags */
463	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
464		struct ip *iph = c->nh;
465		iph->ip_len += mbuf->m_len;
466		c_th = (struct tcphdr *)(iph + 1);
467	} else {
468		struct ip6_hdr *iph = c->nh;
469		iph->ip6_plen += mbuf->m_len;
470		c_th = (struct tcphdr *)(iph + 1);
471	}
472	c_th->th_flags |= (th->th_flags & TH_PUSH);
473	c->th_last = th;
474	++st->n_merges;
475
476	/* Pass packet up now if another segment could overflow the IP
477	 * length.
478	 */
479	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
480		sfxge_lro_deliver(st, c);
481}
482
483static void
484sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
485		struct mbuf *mbuf, void *nh, struct tcphdr *th)
486{
487	/* Start the chain */
488	c->mbuf = mbuf;
489	c->mbuf_tail = c->mbuf;
490	c->nh = nh;
491	c->th_last = th;
492
493	mbuf->m_pkthdr.len = mbuf->m_len;
494
495	/* Mangle header fields for later processing */
496	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
497		struct ip *iph = nh;
498		iph->ip_len = ntohs(iph->ip_len);
499	} else {
500		struct ip6_hdr *iph = nh;
501		iph->ip6_plen = ntohs(iph->ip6_plen);
502	}
503}
504
505/* Try to merge or otherwise hold or deliver (as appropriate) the
506 * packet buffered for this connection (c->next_buf).  Return a flag
507 * indicating whether the connection is still active for LRO purposes.
508 */
509static int
510sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
511{
512	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
513	char *eh = c->next_eh;
514	int data_length, hdr_length, dont_merge;
515	unsigned th_seq, pkt_length;
516	struct tcphdr *th;
517	unsigned now;
518
519	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
520		struct ip *iph = c->next_nh;
521		th = (struct tcphdr *)(iph + 1);
522		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
523	} else {
524		struct ip6_hdr *iph = c->next_nh;
525		th = (struct tcphdr *)(iph + 1);
526		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
527	}
528
529	hdr_length = (char *) th + th->th_off * 4 - eh;
530	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
531		       hdr_length);
532	th_seq = ntohl(th->th_seq);
533	dont_merge = ((data_length <= 0)
534		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
535
536	/* Check for options other than aligned timestamp. */
537	if (th->th_off != 5) {
538		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
539		if (th->th_off == 8 &&
540		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
541					(TCPOPT_NOP << 16) |
542					(TCPOPT_TIMESTAMP << 8) |
543					TCPOLEN_TIMESTAMP)) {
544			/* timestamp option -- okay */
545		} else {
546			dont_merge = 1;
547		}
548	}
549
550	if (__predict_false(th_seq != c->next_seq)) {
551		/* Out-of-order, so start counting again. */
552		if (c->mbuf != NULL)
553			sfxge_lro_deliver(&rxq->lro, c);
554		c->n_in_order_pkts -= lro_loss_packets;
555		c->next_seq = th_seq + data_length;
556		++rxq->lro.n_misorder;
557		goto deliver_buf_out;
558	}
559	c->next_seq = th_seq + data_length;
560
561	now = ticks;
562	if (now - c->last_pkt_ticks > lro_idle_ticks) {
563		++rxq->lro.n_drop_idle;
564		if (c->mbuf != NULL)
565			sfxge_lro_deliver(&rxq->lro, c);
566		sfxge_lro_drop(rxq, c);
567		return (0);
568	}
569	c->last_pkt_ticks = ticks;
570
571	if (c->n_in_order_pkts < lro_slow_start_packets) {
572		/* May be in slow-start, so don't merge. */
573		++rxq->lro.n_slow_start;
574		++c->n_in_order_pkts;
575		goto deliver_buf_out;
576	}
577
578	if (__predict_false(dont_merge)) {
579		if (c->mbuf != NULL)
580			sfxge_lro_deliver(&rxq->lro, c);
581		if (th->th_flags & (TH_FIN | TH_RST)) {
582			++rxq->lro.n_drop_closed;
583			sfxge_lro_drop(rxq, c);
584			return (0);
585		}
586		goto deliver_buf_out;
587	}
588
589	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
590
591	if (__predict_true(c->mbuf != NULL)) {
592		/* Remove headers and any padding */
593		rx_buf->mbuf->m_data += hdr_length;
594		rx_buf->mbuf->m_len = data_length;
595
596		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
597	} else {
598		/* Remove any padding */
599		rx_buf->mbuf->m_len = pkt_length;
600
601		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
602	}
603
604	rx_buf->mbuf = NULL;
605	return (1);
606
607 deliver_buf_out:
608	sfxge_rx_deliver(rxq->sc, rx_buf);
609	return (1);
610}
611
612static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
613			       uint16_t l2_id, void *nh, struct tcphdr *th)
614{
615	unsigned bucket = conn_hash & st->conns_mask;
616	struct sfxge_lro_conn *c;
617
618	if (st->conns_n[bucket] >= lro_chain_max) {
619		++st->n_too_many;
620		return;
621	}
622
623	if (!TAILQ_EMPTY(&st->free_conns)) {
624		c = TAILQ_FIRST(&st->free_conns);
625		TAILQ_REMOVE(&st->free_conns, c, link);
626	} else {
627		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
628		if (c == NULL)
629			return;
630		c->mbuf = NULL;
631		c->next_buf.mbuf = NULL;
632	}
633
634	/* Create the connection tracking data */
635	++st->conns_n[bucket];
636	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
637	c->l2_id = l2_id;
638	c->conn_hash = conn_hash;
639	c->source = th->th_sport;
640	c->dest = th->th_dport;
641	c->n_in_order_pkts = 0;
642	c->last_pkt_ticks = *(volatile int *)&ticks;
643	c->delivered = 0;
644	++st->n_new_stream;
645	/* NB. We don't initialise c->next_seq, and it doesn't matter what
646	 * value it has.  Most likely the next packet received for this
647	 * connection will not match -- no harm done.
648	 */
649}
650
651/* Process mbuf and decide whether to dispatch it to the stack now or
652 * later.
653 */
654static void
655sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
656{
657	struct sfxge_softc *sc = rxq->sc;
658	struct mbuf *m = rx_buf->mbuf;
659	struct ether_header *eh;
660	struct sfxge_lro_conn *c;
661	uint16_t l2_id;
662	uint16_t l3_proto;
663	void *nh;
664	struct tcphdr *th;
665	uint32_t conn_hash;
666	unsigned bucket;
667
668	/* Get the hardware hash */
669	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
670				      mtod(m, uint8_t *));
671
672	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
673	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
674		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
675		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
676			SFXGE_LRO_L2_ID_VLAN;
677		l3_proto = veh->evl_proto;
678		nh = veh + 1;
679	} else {
680		l2_id = 0;
681		l3_proto = eh->ether_type;
682		nh = eh + 1;
683	}
684
685	/* Check whether this is a suitable packet (unfragmented
686	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
687	 * length, and compute a hash if necessary.  If not, return.
688	 */
689	if (l3_proto == htons(ETHERTYPE_IP)) {
690		struct ip *iph = nh;
691
692		KASSERT(iph->ip_p == IPPROTO_TCP,
693		    ("IPv4 protocol is not TCP, but packet marker is set"));
694		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
695		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
696			goto deliver_now;
697		th = (struct tcphdr *)(iph + 1);
698	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
699		struct ip6_hdr *iph = nh;
700
701		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
702		    ("IPv6 next header is not TCP, but packet marker is set"));
703		l2_id |= SFXGE_LRO_L2_ID_IPV6;
704		th = (struct tcphdr *)(iph + 1);
705	} else {
706		goto deliver_now;
707	}
708
709	bucket = conn_hash & rxq->lro.conns_mask;
710
711	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
712		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
713			continue;
714		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
715			continue;
716		if (c->mbuf != NULL) {
717			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
718				struct ip *c_iph, *iph = nh;
719				c_iph = c->nh;
720				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
721				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
722					continue;
723			} else {
724				struct ip6_hdr *c_iph, *iph = nh;
725				c_iph = c->nh;
726				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
727				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
728					continue;
729			}
730		}
731
732		/* Re-insert at head of list to reduce lookup time. */
733		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
734		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
735
736		if (c->next_buf.mbuf != NULL) {
737			if (!sfxge_lro_try_merge(rxq, c))
738				goto deliver_now;
739		} else {
740			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
741			    active_link);
742		}
743		c->next_buf = *rx_buf;
744		c->next_eh = eh;
745		c->next_nh = nh;
746
747		rx_buf->mbuf = NULL;
748		rx_buf->flags = EFX_DISCARD;
749		return;
750	}
751
752	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
753 deliver_now:
754	sfxge_rx_deliver(sc, rx_buf);
755}
756
757static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
758{
759	struct sfxge_lro_state *st = &rxq->lro;
760	struct sfxge_lro_conn *c;
761	unsigned t;
762
763	while (!LIST_EMPTY(&st->active_conns)) {
764		c = LIST_FIRST(&st->active_conns);
765		if (!c->delivered && c->mbuf != NULL)
766			sfxge_lro_deliver(st, c);
767		if (sfxge_lro_try_merge(rxq, c)) {
768			if (c->mbuf != NULL)
769				sfxge_lro_deliver(st, c);
770			LIST_REMOVE(c, active_link);
771		}
772		c->delivered = 0;
773	}
774
775	t = *(volatile int *)&ticks;
776	if (__predict_false(t != st->last_purge_ticks))
777		sfxge_lro_purge_idle(rxq, t);
778}
779
780#else	/* !SFXGE_LRO */
781
782static void
783sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
784{
785}
786
787static void
788sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
789{
790}
791
792#endif	/* SFXGE_LRO */
793
794void
795sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
796{
797	struct sfxge_softc *sc = rxq->sc;
798	int if_capenable = sc->ifnet->if_capenable;
799	int lro_enabled = if_capenable & IFCAP_LRO;
800	unsigned int index;
801	struct sfxge_evq *evq;
802	unsigned int completed;
803	unsigned int level;
804	struct mbuf *m;
805	struct sfxge_rx_sw_desc *prev = NULL;
806
807	index = rxq->index;
808	evq = sc->evq[index];
809
810	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
811
812	completed = rxq->completed;
813	while (completed != rxq->pending) {
814		unsigned int id;
815		struct sfxge_rx_sw_desc *rx_desc;
816
817		id = completed++ & rxq->ptr_mask;
818		rx_desc = &rxq->queue[id];
819		m = rx_desc->mbuf;
820
821		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
822			goto discard;
823
824		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
825			goto discard;
826
827		prefetch_read_many(mtod(m, caddr_t));
828
829		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
830		case EFX_PKT_IPV4:
831			if (~if_capenable & IFCAP_RXCSUM)
832				rx_desc->flags &=
833				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
834			break;
835		case EFX_PKT_IPV6:
836			if (~if_capenable & IFCAP_RXCSUM_IPV6)
837				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
838			break;
839		case 0:
840			/* Check for loopback packets */
841			{
842				struct ether_header *etherhp;
843
844				/*LINTED*/
845				etherhp = mtod(m, struct ether_header *);
846
847				if (etherhp->ether_type ==
848				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
849					EFSYS_PROBE(loopback);
850
851					rxq->loopback++;
852					goto discard;
853				}
854			}
855			break;
856		default:
857			KASSERT(B_FALSE,
858			    ("Rx descriptor with both IPv4 and IPv6 flags"));
859			goto discard;
860		}
861
862		/* Pass packet up the stack or into LRO (pipelined) */
863		if (prev != NULL) {
864			if (lro_enabled &&
865			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
866			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
867				sfxge_lro(rxq, prev);
868			else
869				sfxge_rx_deliver(sc, prev);
870		}
871		prev = rx_desc;
872		continue;
873
874discard:
875		/* Return the packet to the pool */
876		m_free(m);
877		rx_desc->mbuf = NULL;
878	}
879	rxq->completed = completed;
880
881	level = rxq->added - rxq->completed;
882
883	/* Pass last packet up the stack or into LRO */
884	if (prev != NULL) {
885		if (lro_enabled &&
886		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
887		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
888			sfxge_lro(rxq, prev);
889		else
890			sfxge_rx_deliver(sc, prev);
891	}
892
893	/*
894	 * If there are any pending flows and this is the end of the
895	 * poll then they must be completed.
896	 */
897	if (eop)
898		sfxge_lro_end_of_burst(rxq);
899
900	/* Top up the queue if necessary */
901	if (level < rxq->refill_threshold)
902		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
903}
904
905static void
906sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
907{
908	struct sfxge_rxq *rxq;
909	struct sfxge_evq *evq;
910	unsigned int count;
911
912	rxq = sc->rxq[index];
913	evq = sc->evq[index];
914
915	SFXGE_EVQ_LOCK(evq);
916
917	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
918	    ("rxq not started"));
919
920	rxq->init_state = SFXGE_RXQ_INITIALIZED;
921
922	callout_stop(&rxq->refill_callout);
923
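	/* Flush the queue and wait up to 2 seconds (20 x 100 ms) for
	 * completion; restart the flush if the hardware reports failure.
	 */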
924again:
925	rxq->flush_state = SFXGE_FLUSH_PENDING;
926
927	/* Flush the receive queue */
928	efx_rx_qflush(rxq->common);
929
930	SFXGE_EVQ_UNLOCK(evq);
931
932	count = 0;
933	do {
934		/* Spin for 100 ms */
935		DELAY(100000);
936
937		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
938			break;
939
940	} while (++count < 20);
941
942	SFXGE_EVQ_LOCK(evq);
943
944	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
945		goto again;
946
947	rxq->flush_state = SFXGE_FLUSH_DONE;
948
949	rxq->pending = rxq->added;
950	sfxge_rx_qcomplete(rxq, B_TRUE);
951
952	KASSERT(rxq->completed == rxq->pending,
953	    ("rxq->completed != rxq->pending"));
954
955	rxq->added = 0;
956	rxq->pending = 0;
957	rxq->completed = 0;
958	rxq->loopback = 0;
959
960	/* Destroy the common code receive queue. */
961	efx_rx_qdestroy(rxq->common);
962
963	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
964	    EFX_RXQ_NBUFS(sc->rxq_entries));
965
966	SFXGE_EVQ_UNLOCK(evq);
967}
968
969static int
970sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
971{
972	struct sfxge_rxq *rxq;
973	efsys_mem_t *esmp;
974	struct sfxge_evq *evq;
975	int rc;
976
977	rxq = sc->rxq[index];
978	esmp = &rxq->mem;
979	evq = sc->evq[index];
980
981	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
982	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
983	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
984	    ("evq->init_state != SFXGE_EVQ_STARTED"));
985
986	/* Program the buffer table. */
987	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
988	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
989		return (rc);
990
991	/* Create the common code receive queue. */
992	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
993	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
994	    &rxq->common)) != 0)
995		goto fail;
996
997	SFXGE_EVQ_LOCK(evq);
998
999	/* Enable the receive queue. */
1000	efx_rx_qenable(rxq->common);
1001
1002	rxq->init_state = SFXGE_RXQ_STARTED;
1003
1004	/* Try to fill the queue from the pool. */
1005	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1006
1007	SFXGE_EVQ_UNLOCK(evq);
1008
1009	return (0);
1010
1011fail:
1012	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1013	    EFX_RXQ_NBUFS(sc->rxq_entries));
1014	return (rc);
1015}
1016
1017void
1018sfxge_rx_stop(struct sfxge_softc *sc)
1019{
1020	int index;
1021
1022	/* Stop the receive queue(s) */
1023	index = sc->rxq_count;
1024	while (--index >= 0)
1025		sfxge_rx_qstop(sc, index);
1026
1027	sc->rx_prefix_size = 0;
1028	sc->rx_buffer_size = 0;
1029
1030	efx_rx_fini(sc->enp);
1031}
1032
1033int
1034sfxge_rx_start(struct sfxge_softc *sc)
1035{
1036	struct sfxge_intr *intr;
1037	int index;
1038	int rc;
1039
1040	intr = &sc->intr;
1041
1042	/* Initialize the common code receive module. */
1043	if ((rc = efx_rx_init(sc->enp)) != 0)
1044		return (rc);
1045
1046	/* Calculate the receive packet buffer size. */
1047	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1048	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1049			      sc->rx_prefix_size);
1050
1051	/* Select zone for packet buffers */
1052	if (sc->rx_buffer_size <= MCLBYTES)
1053		sc->rx_buffer_zone = zone_clust;
1054	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1055		sc->rx_buffer_zone = zone_jumbop;
1056	else if (sc->rx_buffer_size <= MJUM9BYTES)
1057		sc->rx_buffer_zone = zone_jumbo9;
1058	else
1059		sc->rx_buffer_zone = zone_jumbo16;
1060
1061	/*
1062	 * Set up the scale table.  Enable all hash types and hash insertion.
1063	 */
1064	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1065		sc->rx_indir_table[index] = index % sc->rxq_count;
1066	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1067				       SFXGE_RX_SCALE_MAX)) != 0)
1068		goto fail;
1069	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1070	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1071	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1072
1073	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1074	    sizeof(toep_key))) != 0)
1075		goto fail;
1076
1077	/* Start the receive queue(s). */
1078	for (index = 0; index < sc->rxq_count; index++) {
1079		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1080			goto fail2;
1081	}
1082
1083	return (0);
1084
1085fail2:
1086	while (--index >= 0)
1087		sfxge_rx_qstop(sc, index);
1088
1089fail:
1090	efx_rx_fini(sc->enp);
1091
1092	return (rc);
1093}
1094
1095#ifdef SFXGE_LRO
1096
1097static void sfxge_lro_init(struct sfxge_rxq *rxq)
1098{
1099	struct sfxge_lro_state *st = &rxq->lro;
1100	unsigned i;
1101
1102	st->conns_mask = lro_table_size - 1;
1103	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1104		("lro_table_size must be a power of 2"));
1105	st->sc = rxq->sc;
1106	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1107			   M_SFXGE, M_WAITOK);
1108	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1109			     M_SFXGE, M_WAITOK);
1110	for (i = 0; i <= st->conns_mask; ++i) {
1111		TAILQ_INIT(&st->conns[i]);
1112		st->conns_n[i] = 0;
1113	}
1114	LIST_INIT(&st->active_conns);
1115	TAILQ_INIT(&st->free_conns);
1116}
1117
1118static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1119{
1120	struct sfxge_lro_state *st = &rxq->lro;
1121	struct sfxge_lro_conn *c;
1122	unsigned i;
1123
1124	/* Return cleanly if sfxge_lro_init() has not been called. */
1125	if (st->conns == NULL)
1126		return;
1127
1128	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1129
1130	for (i = 0; i <= st->conns_mask; ++i) {
1131		while (!TAILQ_EMPTY(&st->conns[i])) {
1132			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1133			sfxge_lro_drop(rxq, c);
1134		}
1135	}
1136
1137	while (!TAILQ_EMPTY(&st->free_conns)) {
1138		c = TAILQ_FIRST(&st->free_conns);
1139		TAILQ_REMOVE(&st->free_conns, c, link);
1140		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1141		free(c, M_SFXGE);
1142	}
1143
1144	free(st->conns_n, M_SFXGE);
1145	free(st->conns, M_SFXGE);
1146	st->conns = NULL;
1147}
1148
1149#else
1150
1151static void
1152sfxge_lro_init(struct sfxge_rxq *rxq)
1153{
1154}
1155
1156static void
1157sfxge_lro_fini(struct sfxge_rxq *rxq)
1158{
1159}
1160
1161#endif	/* SFXGE_LRO */
1162
1163static void
1164sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1165{
1166	struct sfxge_rxq *rxq;
1167
1168	rxq = sc->rxq[index];
1169
1170	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1171	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1172
1173	/* Free the context array and the flow table. */
1174	free(rxq->queue, M_SFXGE);
1175	sfxge_lro_fini(rxq);
1176
1177	/* Release DMA memory. */
1178	sfxge_dma_free(&rxq->mem);
1179
1180	sc->rxq[index] = NULL;
1181
1182	free(rxq, M_SFXGE);
1183}
1184
1185static int
1186sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1187{
1188	struct sfxge_rxq *rxq;
1189	struct sfxge_evq *evq;
1190	efsys_mem_t *esmp;
1191	int rc;
1192
1193	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1194
1195	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1196	rxq->sc = sc;
1197	rxq->index = index;
1198	rxq->entries = sc->rxq_entries;
1199	rxq->ptr_mask = rxq->entries - 1;
1200	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1201
1202	sc->rxq[index] = rxq;
1203	esmp = &rxq->mem;
1204
1205	evq = sc->evq[index];
1206
1207	/* Allocate and zero DMA space. */
1208	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1209		return (rc);
1210
1211	/* Allocate buffer table entries. */
1212	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1213				 &rxq->buf_base_id);
1214
1215	/* Allocate the context array and the flow table. */
1216	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1217	    M_SFXGE, M_WAITOK | M_ZERO);
1218	sfxge_lro_init(rxq);
1219
1220	callout_init(&rxq->refill_callout, 1);
1221
1222	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1223
1224	return (0);
1225}
1226
1227static const struct {
1228	const char *name;
1229	size_t offset;
1230} sfxge_rx_stats[] = {
1231#define	SFXGE_RX_STAT(name, member) \
1232	{ #name, offsetof(struct sfxge_rxq, member) }
1233#ifdef SFXGE_LRO
1234	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1235	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1236	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1237	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1238	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1239	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1240	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1241	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1242#endif
1243};
1244
1245static int
1246sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1247{
1248	struct sfxge_softc *sc = arg1;
1249	unsigned int id = arg2;
1250	unsigned int sum, index;
1251
1252	/* Sum across all RX queues */
1253	sum = 0;
1254	for (index = 0; index < sc->rxq_count; index++)
1255		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1256					 sfxge_rx_stats[id].offset);
1257
1258	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1259}
1260
1261static void
1262sfxge_rx_stat_init(struct sfxge_softc *sc)
1263{
1264	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1265	struct sysctl_oid_list *stat_list;
1266	unsigned int id;
1267
1268	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1269
1270	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1271		SYSCTL_ADD_PROC(
1272			ctx, stat_list,
1273			OID_AUTO, sfxge_rx_stats[id].name,
1274			CTLTYPE_UINT|CTLFLAG_RD,
1275			sc, id, sfxge_rx_stat_handler, "IU",
1276			"");
1277	}
1278}
1279
1280void
1281sfxge_rx_fini(struct sfxge_softc *sc)
1282{
1283	int index;
1284
1285	index = sc->rxq_count;
1286	while (--index >= 0)
1287		sfxge_rx_qfini(sc, index);
1288
1289	sc->rxq_count = 0;
1290}
1291
1292int
1293sfxge_rx_init(struct sfxge_softc *sc)
1294{
1295	struct sfxge_intr *intr;
1296	int index;
1297	int rc;
1298
1299#ifdef SFXGE_LRO
1300	if (!ISP2(lro_table_size)) {
1301		log(LOG_ERR, "%s=%u must be a power of 2",
1302		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1303		rc = EINVAL;
1304		goto fail_lro_table_size;
1305	}
1306
1307	if (lro_idle_ticks == 0)
1308		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1309#endif
1310
1311	intr = &sc->intr;
1312
1313	sc->rxq_count = intr->n_alloc;
1314
1315	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1316	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1317
1318	/* Initialize the receive queue(s) - one per interrupt. */
1319	for (index = 0; index < sc->rxq_count; index++) {
1320		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1321			goto fail;
1322	}
1323
1324	sfxge_rx_stat_init(sc);
1325
1326	return (0);
1327
1328fail:
1329	/* Tear down the receive queue(s). */
1330	while (--index >= 0)
1331		sfxge_rx_qfini(sc, index);
1332
1333	sc->rxq_count = 0;
1334
1335#ifdef SFXGE_LRO
1336fail_lro_table_size:
1337#endif
1338	return (rc);
1339}
1340