sfxge_rx.c revision 295126
1/*-
2 * Copyright (c) 2010-2015 Solarflare Communications Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright notice,
12 *    this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * The views and conclusions contained in the software and documentation are
30 * those of the authors and should not be interpreted as representing official
31 * policies, either expressed or implied, of the FreeBSD Project.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 295126 2016-02-01 17:41:21Z glebius $");
36
37#include <sys/param.h>
38#include <sys/malloc.h>
39#include <sys/mbuf.h>
40#include <sys/smp.h>
41#include <sys/socket.h>
42#include <sys/sysctl.h>
43#include <sys/syslog.h>
44#include <sys/limits.h>
46
47#include <net/ethernet.h>
48#include <net/if.h>
49#include <net/if_vlan_var.h>
50
51#include <netinet/in.h>
52#include <netinet/ip.h>
53#include <netinet/ip6.h>
54#include <netinet/tcp.h>
55
56#include <machine/in_cksum.h>
57
58#include "common/efx.h"
59
60
61#include "sfxge.h"
62#include "sfxge_rx.h"
63
64#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
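
/*
 * Example (values are illustrative; EFX_RXQ_LIMIT() is defined by the
 * common code and reserves a small headroom below the ring size): for a
 * 1024-entry ring with, say, EFX_RXQ_LIMIT(1024) == 1008, the refill
 * threshold works out to 1008 * 9 / 10 = 907 outstanding descriptors.
 */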
65
66#ifdef SFXGE_LRO
67
68SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
69	    "Large receive offload (LRO) parameters");
70
71#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
72
73/* Size of the LRO hash table.  Must be a power of 2.  A larger table
74 * means we can accelerate a larger number of streams.
75 */
76static unsigned lro_table_size = 128;
77TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
78SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
79	    &lro_table_size, 0,
80	    "Size of the LRO hash table (must be a power of 2)");
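
/*
 * Because the table size is a power of 2, a bucket can be selected with a
 * simple mask rather than a modulo, which is how it is used throughout
 * this file, e.g.:
 *
 *	bucket = conn_hash & (lro_table_size - 1);
 *
 * (see conns_mask in sfxge_lro_init() below).
 */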
81
82/* Maximum length of a hash chain.  If chains get too long then the lookup
83 * time increases and may exceed the benefit of LRO.
84 */
85static unsigned lro_chain_max = 20;
86TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
87SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
88	    &lro_chain_max, 0,
89	    "The maximum length of a hash chain");
90
91/* Maximum time (in ticks) that a connection can be idle before its LRO
92 * state is discarded.
93 */
94static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
95TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
96SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
97	    &lro_idle_ticks, 0,
98	    "The maximum time (in ticks) that a connection can be idle "
99	    "before its LRO state is discarded");
100
101/* Number of packets with payload that must arrive in-order before a
102 * connection is eligible for LRO.  The idea is we should avoid coalescing
103 * segments when the sender is in slow-start because reducing the ACK rate
104 * can damage performance.
105 */
106static int lro_slow_start_packets = 2000;
107TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
108SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
109	    &lro_slow_start_packets, 0,
110	    "Number of packets with payload that must arrive in-order before "
111	    "a connection is eligible for LRO");
112
113/* Number of packets with payload that must arrive in-order following loss
114 * before a connection is eligible for LRO.  The idea is we should avoid
115 * coalescing segments when the sender is recovering from loss, because
116 * reducing the ACK rate can damage performance.
117 */
118static int lro_loss_packets = 20;
119TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
120SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
121	    &lro_loss_packets, 0,
122	    "Number of packets with payload that must arrive in-order "
123	    "following loss before a connection is eligible for LRO");
124
125/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
126#define	SFXGE_LRO_L2_ID_VLAN 0x4000
127#define	SFXGE_LRO_L2_ID_IPV6 0x8000
128#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
129#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
130
131/* Compare IPv6 addresses, avoiding conditional branches */
132static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
133				   const struct in6_addr *right)
134{
135#if LONG_BIT == 64
136	const uint64_t *left64 = (const uint64_t *)left;
137	const uint64_t *right64 = (const uint64_t *)right;
138	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
139#else
140	return (left->s6_addr32[0] - right->s6_addr32[0]) |
141	       (left->s6_addr32[1] - right->s6_addr32[1]) |
142	       (left->s6_addr32[2] - right->s6_addr32[2]) |
143	       (left->s6_addr32[3] - right->s6_addr32[3]);
144#endif
145}
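
/*
 * The return value is zero if and only if the addresses are equal, so
 * callers can OR several comparisons together and test the combined
 * result once, keeping the connection-lookup loop free of data-dependent
 * branches.  A sketch of the intended usage (mirroring sfxge_lro() below):
 *
 *	if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
 *	    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
 *		continue;	(addresses differ, not this connection)
 */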
146
147#endif	/* SFXGE_LRO */
148
149void
150sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
151{
152
153	rxq->flush_state = SFXGE_FLUSH_DONE;
154}
155
156void
157sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
158{
159
160	rxq->flush_state = SFXGE_FLUSH_FAILED;
161}
162
163static uint8_t toep_key[] = {
164	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
165	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
166	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
167	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
168	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
169};
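
/*
 * For reference only: the NIC computes the receive hash in hardware,
 * parameterised by toep_key above (passed to efx_rx_scale_key_set() in
 * sfxge_rx_start()).  A minimal software sketch of the standard Toeplitz
 * hash, assuming the usual Microsoft RSS definition, is shown below; it
 * is not used by the driver and exists purely to document what the key
 * parameterises.
 */
static __unused uint32_t
sfxge_toeplitz_hash_ref(const uint8_t *key, size_t key_len,
			const uint8_t *data, size_t data_len)
{
	uint32_t hash = 0;
	uint32_t window;
	size_t i, bit;

	/* Start with the 32 most significant bits of the key. */
	window = ((uint32_t)key[0] << 24) | ((uint32_t)key[1] << 16) |
	    ((uint32_t)key[2] << 8) | key[3];

	for (i = 0; i < data_len; i++) {
		for (bit = 0; bit < 8; bit++) {
			size_t kbit = 32 + i * 8 + bit;

			/* XOR in the key window for each set input bit. */
			if (data[i] & (0x80 >> bit))
				hash ^= window;

			/* Slide the 32-bit key window left by one bit. */
			window <<= 1;
			if (kbit < key_len * 8 &&
			    (key[kbit / 8] & (0x80 >> (kbit % 8))))
				window |= 1;
		}
	}
	return (hash);
}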
170
171static void
172sfxge_rx_post_refill(void *arg)
173{
174	struct sfxge_rxq *rxq = arg;
175	struct sfxge_softc *sc;
176	unsigned int index;
177	struct sfxge_evq *evq;
178	uint16_t magic;
179
180	sc = rxq->sc;
181	index = rxq->index;
182	evq = sc->evq[index];
183
184	magic = SFXGE_MAGIC_RX_QREFILL | index;
185
186	/* This is guaranteed due to the start/stop order of rx and ev */
187	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
188	    ("evq not started"));
189	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
190	    ("rxq not started"));
191	efx_ev_qpost(evq->common, magic);
192}
193
194static void
195sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
196{
197	/* Initially retry after 100 ms, but back off in case of
198	 * repeated failures as we probably have to wait for the
199	 * administrator to raise the pool limit. */
200	if (retrying)
201		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
202	else
203		rxq->refill_delay = hz / 10;
204
205	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
206			     sfxge_rx_post_refill, rxq);
207}
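
/*
 * Worked example of the backoff above (assuming hz = 1000): the retry
 * delay starts at hz / 10 = 100 ticks (100 ms) and doubles on each
 * failed attempt, giving 200 ms, 400 ms, 800 ms and so on, until it is
 * capped at 10 * hz (10 seconds).
 */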
208
209#define	SFXGE_REFILL_BATCH  64
210
211static void
212sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
213{
214	struct sfxge_softc *sc;
215	unsigned int index;
216	struct sfxge_evq *evq;
217	unsigned int batch;
218	unsigned int rxfill;
219	unsigned int mblksize;
220	int ntodo;
221	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
222
223	sc = rxq->sc;
224	index = rxq->index;
225	evq = sc->evq[index];
226
227	prefetch_read_many(sc->enp);
228	prefetch_read_many(rxq->common);
229
230	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
231
232	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
233		return;
234
235	rxfill = rxq->added - rxq->completed;
236	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
237	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
238	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
239	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
240	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
241
242	if (ntodo == 0)
243		return;
244
245	batch = 0;
246	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
247	while (ntodo-- > 0) {
248		unsigned int id;
249		struct sfxge_rx_sw_desc *rx_desc;
250		bus_dma_segment_t seg;
251		struct mbuf *m;
252
253		id = (rxq->added + batch) & rxq->ptr_mask;
254		rx_desc = &rxq->queue[id];
255		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
256
257		rx_desc->flags = EFX_DISCARD;
258		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
259		    sc->rx_cluster_size);
260		if (m == NULL)
261			break;
262
263		/* m_len specifies length of area to be mapped for DMA */
264		m->m_len  = mblksize;
265		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
266		m->m_data += sc->rx_buffer_align;
267
268		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
269		addr[batch++] = seg.ds_addr;
270
271		if (batch == SFXGE_REFILL_BATCH) {
272			efx_rx_qpost(rxq->common, addr, mblksize, batch,
273			    rxq->completed, rxq->added);
274			rxq->added += batch;
275			batch = 0;
276		}
277	}
278
279	if (ntodo != 0)
280		sfxge_rx_schedule_refill(rxq, retrying);
281
282	if (batch != 0) {
283		efx_rx_qpost(rxq->common, addr, mblksize, batch,
284		    rxq->completed, rxq->added);
285		rxq->added += batch;
286	}
287
288	/* Make the descriptors visible to the hardware */
289	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
290			BUS_DMASYNC_PREWRITE);
291
292	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
293
294	/* The queue could still be empty if no descriptors were actually
295	 * pushed, in which case there will be no event to cause the next
296	 * refill, so we must schedule a refill ourselves.
297	 */
298	if (rxq->pushed == rxq->completed) {
299		sfxge_rx_schedule_refill(rxq, retrying);
300	}
301}
302
303void
304sfxge_rx_qrefill(struct sfxge_rxq *rxq)
305{
306
307	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
308		return;
309
310	/* Make sure the queue is full */
311	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
312}
313
314static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
315{
316	struct ifnet *ifp = sc->ifnet;
317
318	m->m_pkthdr.rcvif = ifp;
319	m->m_pkthdr.csum_data = 0xffff;
320	ifp->if_input(ifp, m);
321}
322
323static void
324sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
325{
326	struct mbuf *m = rx_desc->mbuf;
327	int flags = rx_desc->flags;
328	int csum_flags;
329
330	/* Convert checksum flags */
331	csum_flags = (flags & EFX_CKSUM_IPV4) ?
332		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
333	if (flags & EFX_CKSUM_TCPUDP)
334		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
335
336	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
337		m->m_pkthdr.flowid =
338			efx_psuedo_hdr_hash_get(sc->enp,
339						EFX_RX_HASHALG_TOEPLITZ,
340						mtod(m, uint8_t *));
341		/* The hash covers a 4-tuple for TCP only */
342		M_HASHTYPE_SET(m,
343		    (flags & EFX_PKT_IPV4) ?
344			((flags & EFX_PKT_TCP) ?
345			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
346			((flags & EFX_PKT_TCP) ?
347			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
348	}
349	m->m_data += sc->rx_prefix_size;
350	m->m_len = rx_desc->size - sc->rx_prefix_size;
351	m->m_pkthdr.len = m->m_len;
352	m->m_pkthdr.csum_flags = csum_flags;
353	__sfxge_rx_deliver(sc, rx_desc->mbuf);
354
355	rx_desc->flags = EFX_DISCARD;
356	rx_desc->mbuf = NULL;
357}
358
359#ifdef SFXGE_LRO
360
361static void
362sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
363{
364	struct sfxge_softc *sc = st->sc;
365	struct mbuf *m = c->mbuf;
366	struct tcphdr *c_th;
367	int csum_flags;
368
369	KASSERT(m, ("no mbuf to deliver"));
370
371	++st->n_bursts;
372
373	/* Finish off packet munging and recalculate IP header checksum. */
374	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
375		struct ip *iph = c->nh;
376		iph->ip_len = htons(iph->ip_len);
377		iph->ip_sum = 0;
378		iph->ip_sum = in_cksum_hdr(iph);
379		c_th = (struct tcphdr *)(iph + 1);
380		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
381			      CSUM_IP_CHECKED | CSUM_IP_VALID);
382	} else {
383		struct ip6_hdr *iph = c->nh;
384		iph->ip6_plen = htons(iph->ip6_plen);
385		c_th = (struct tcphdr *)(iph + 1);
386		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
387	}
388
389	c_th->th_win = c->th_last->th_win;
390	c_th->th_ack = c->th_last->th_ack;
391	if (c_th->th_off == c->th_last->th_off) {
392		/* Copy TCP options (take care to avoid going negative). */
393		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
394		memcpy(c_th + 1, c->th_last + 1, optlen);
395	}
396
397	m->m_pkthdr.flowid = c->conn_hash;
398	M_HASHTYPE_SET(m,
399	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
400		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
401
402	m->m_pkthdr.csum_flags = csum_flags;
403	__sfxge_rx_deliver(sc, m);
404
405	c->mbuf = NULL;
406	c->delivered = 1;
407}
408
409/* Drop the given connection, and add it to the free list. */
410static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
411{
412	unsigned bucket;
413
414	KASSERT(!c->mbuf, ("found orphaned mbuf"));
415
416	if (c->next_buf.mbuf != NULL) {
417		sfxge_rx_deliver(rxq->sc, &c->next_buf);
418		LIST_REMOVE(c, active_link);
419	}
420
421	bucket = c->conn_hash & rxq->lro.conns_mask;
422	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
423	--rxq->lro.conns_n[bucket];
424	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
425	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
426}
427
428/* Stop tracking connections that have gone idle in order to keep hash
429 * chains short.
430 */
431static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
432{
433	struct sfxge_lro_conn *c;
434	unsigned i;
435
436	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
437		("found active connections"));
438
439	rxq->lro.last_purge_ticks = now;
440	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
441		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
442			continue;
443
444		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
445		if (now - c->last_pkt_ticks > lro_idle_ticks) {
446			++rxq->lro.n_drop_idle;
447			sfxge_lro_drop(rxq, c);
448		}
449	}
450}
451
452static void
453sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
454		struct mbuf *mbuf, struct tcphdr *th)
455{
456	struct tcphdr *c_th;
457
458	/* Tack the new mbuf onto the chain. */
459	KASSERT(!mbuf->m_next, ("mbuf already chained"));
460	c->mbuf_tail->m_next = mbuf;
461	c->mbuf_tail = mbuf;
462
463	/* Increase length appropriately */
464	c->mbuf->m_pkthdr.len += mbuf->m_len;
465
466	/* Update the connection state flags */
467	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
468		struct ip *iph = c->nh;
469		iph->ip_len += mbuf->m_len;
470		c_th = (struct tcphdr *)(iph + 1);
471	} else {
472		struct ip6_hdr *iph = c->nh;
473		iph->ip6_plen += mbuf->m_len;
474		c_th = (struct tcphdr *)(iph + 1);
475	}
476	c_th->th_flags |= (th->th_flags & TH_PUSH);
477	c->th_last = th;
478	++st->n_merges;
479
480	/* Pass packet up now if another segment could overflow the IP
481	 * length.
482	 */
483	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
484		sfxge_lro_deliver(st, c);
485}
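
/*
 * The 65536 - 9200 threshold above keeps the coalesced packet's IP length
 * within its 16-bit field: 65535 is the largest representable IP total
 * length, and 9200 bytes is comfortably more than one jumbo-frame sized
 * segment, so delivering at this point ensures the next merge cannot
 * overflow the length field.
 */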
486
487static void
488sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
489		struct mbuf *mbuf, void *nh, struct tcphdr *th)
490{
491	/* Start the chain */
492	c->mbuf = mbuf;
493	c->mbuf_tail = c->mbuf;
494	c->nh = nh;
495	c->th_last = th;
496
497	mbuf->m_pkthdr.len = mbuf->m_len;
498
499	/* Mangle header fields for later processing */
500	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
501		struct ip *iph = nh;
502		iph->ip_len = ntohs(iph->ip_len);
503	} else {
504		struct ip6_hdr *iph = nh;
505		iph->ip6_plen = ntohs(iph->ip6_plen);
506	}
507}
508
509/* Try to merge or otherwise hold or deliver (as appropriate) the
510 * packet buffered for this connection (c->next_buf).  Return a flag
511 * indicating whether the connection is still active for LRO purposes.
512 */
513static int
514sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
515{
516	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
517	char *eh = c->next_eh;
518	int data_length, hdr_length, dont_merge;
519	unsigned th_seq, pkt_length;
520	struct tcphdr *th;
521	unsigned now;
522
523	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
524		struct ip *iph = c->next_nh;
525		th = (struct tcphdr *)(iph + 1);
526		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
527	} else {
528		struct ip6_hdr *iph = c->next_nh;
529		th = (struct tcphdr *)(iph + 1);
530		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
531	}
532
533	hdr_length = (char *) th + th->th_off * 4 - eh;
534	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
535		       hdr_length);
536	th_seq = ntohl(th->th_seq);
537	dont_merge = ((data_length <= 0)
538		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
539
540	/* Check for options other than aligned timestamp. */
541	if (th->th_off != 5) {
542		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
543		if (th->th_off == 8 &&
544		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
545					(TCPOPT_NOP << 16) |
546					(TCPOPT_TIMESTAMP << 8) |
547					TCPOLEN_TIMESTAMP)) {
548			/* timestamp option -- okay */
549		} else {
550			dont_merge = 1;
551		}
552	}
553
554	if (__predict_false(th_seq != c->next_seq)) {
555		/* Out-of-order, so start counting again. */
556		if (c->mbuf != NULL)
557			sfxge_lro_deliver(&rxq->lro, c);
558		c->n_in_order_pkts -= lro_loss_packets;
559		c->next_seq = th_seq + data_length;
560		++rxq->lro.n_misorder;
561		goto deliver_buf_out;
562	}
563	c->next_seq = th_seq + data_length;
564
565	now = ticks;
566	if (now - c->last_pkt_ticks > lro_idle_ticks) {
567		++rxq->lro.n_drop_idle;
568		if (c->mbuf != NULL)
569			sfxge_lro_deliver(&rxq->lro, c);
570		sfxge_lro_drop(rxq, c);
571		return (0);
572	}
573	c->last_pkt_ticks = ticks;
574
575	if (c->n_in_order_pkts < lro_slow_start_packets) {
576		/* May be in slow-start, so don't merge. */
577		++rxq->lro.n_slow_start;
578		++c->n_in_order_pkts;
579		goto deliver_buf_out;
580	}
581
582	if (__predict_false(dont_merge)) {
583		if (c->mbuf != NULL)
584			sfxge_lro_deliver(&rxq->lro, c);
585		if (th->th_flags & (TH_FIN | TH_RST)) {
586			++rxq->lro.n_drop_closed;
587			sfxge_lro_drop(rxq, c);
588			return (0);
589		}
590		goto deliver_buf_out;
591	}
592
593	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
594
595	if (__predict_true(c->mbuf != NULL)) {
596		/* Remove headers and any padding */
597		rx_buf->mbuf->m_data += hdr_length;
598		rx_buf->mbuf->m_len = data_length;
599
600		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
601	} else {
602		/* Remove any padding */
603		rx_buf->mbuf->m_len = pkt_length;
604
605		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
606	}
607
608	rx_buf->mbuf = NULL;
609	return (1);
610
611 deliver_buf_out:
612	sfxge_rx_deliver(rxq->sc, rx_buf);
613	return (1);
614}
615
616static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
617			       uint16_t l2_id, void *nh, struct tcphdr *th)
618{
619	unsigned bucket = conn_hash & st->conns_mask;
620	struct sfxge_lro_conn *c;
621
622	if (st->conns_n[bucket] >= lro_chain_max) {
623		++st->n_too_many;
624		return;
625	}
626
627	if (!TAILQ_EMPTY(&st->free_conns)) {
628		c = TAILQ_FIRST(&st->free_conns);
629		TAILQ_REMOVE(&st->free_conns, c, link);
630	} else {
631		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
632		if (c == NULL)
633			return;
634		c->mbuf = NULL;
635		c->next_buf.mbuf = NULL;
636	}
637
638	/* Create the connection tracking data */
639	++st->conns_n[bucket];
640	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
641	c->l2_id = l2_id;
642	c->conn_hash = conn_hash;
643	c->source = th->th_sport;
644	c->dest = th->th_dport;
645	c->n_in_order_pkts = 0;
646	c->last_pkt_ticks = *(volatile int *)&ticks;
647	c->delivered = 0;
648	++st->n_new_stream;
649	/* NB. We don't initialise c->next_seq, and it doesn't matter what
650	 * value it has.  Most likely the next packet received for this
651	 * connection will not match -- no harm done.
652	 */
653}
654
655/* Process mbuf and decide whether to dispatch it to the stack now or
656 * later.
657 */
658static void
659sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
660{
661	struct sfxge_softc *sc = rxq->sc;
662	struct mbuf *m = rx_buf->mbuf;
663	struct ether_header *eh;
664	struct sfxge_lro_conn *c;
665	uint16_t l2_id;
666	uint16_t l3_proto;
667	void *nh;
668	struct tcphdr *th;
669	uint32_t conn_hash;
670	unsigned bucket;
671
672	/* Get the hardware hash */
673	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
674					    EFX_RX_HASHALG_TOEPLITZ,
675					    mtod(m, uint8_t *));
676
677	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
678	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
679		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
680		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
681			SFXGE_LRO_L2_ID_VLAN;
682		l3_proto = veh->evl_proto;
683		nh = veh + 1;
684	} else {
685		l2_id = 0;
686		l3_proto = eh->ether_type;
687		nh = eh + 1;
688	}
689
690	/* Check whether this is a suitable packet (unfragmented
691	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
692	 * length, and compute a hash if necessary.  If not, return.
693	 */
694	if (l3_proto == htons(ETHERTYPE_IP)) {
695		struct ip *iph = nh;
696
697		KASSERT(iph->ip_p == IPPROTO_TCP,
698		    ("IPv4 protocol is not TCP, but packet marker is set"));
699		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
700		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
701			goto deliver_now;
702		th = (struct tcphdr *)(iph + 1);
703	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
704		struct ip6_hdr *iph = nh;
705
706		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
707		    ("IPv6 next header is not TCP, but packet marker is set"));
708		l2_id |= SFXGE_LRO_L2_ID_IPV6;
709		th = (struct tcphdr *)(iph + 1);
710	} else {
711		goto deliver_now;
712	}
713
714	bucket = conn_hash & rxq->lro.conns_mask;
715
716	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
717		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
718			continue;
719		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
720			continue;
721		if (c->mbuf != NULL) {
722			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
723				struct ip *c_iph, *iph = nh;
724				c_iph = c->nh;
725				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
726				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
727					continue;
728			} else {
729				struct ip6_hdr *c_iph, *iph = nh;
730				c_iph = c->nh;
731				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
732				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
733					continue;
734			}
735		}
736
737		/* Re-insert at head of list to reduce lookup time. */
738		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
739		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
740
741		if (c->next_buf.mbuf != NULL) {
742			if (!sfxge_lro_try_merge(rxq, c))
743				goto deliver_now;
744		} else {
745			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
746			    active_link);
747		}
748		c->next_buf = *rx_buf;
749		c->next_eh = eh;
750		c->next_nh = nh;
751
752		rx_buf->mbuf = NULL;
753		rx_buf->flags = EFX_DISCARD;
754		return;
755	}
756
757	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
758 deliver_now:
759	sfxge_rx_deliver(sc, rx_buf);
760}
761
762static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
763{
764	struct sfxge_lro_state *st = &rxq->lro;
765	struct sfxge_lro_conn *c;
766	unsigned t;
767
768	while (!LIST_EMPTY(&st->active_conns)) {
769		c = LIST_FIRST(&st->active_conns);
770		if (!c->delivered && c->mbuf != NULL)
771			sfxge_lro_deliver(st, c);
772		if (sfxge_lro_try_merge(rxq, c)) {
773			if (c->mbuf != NULL)
774				sfxge_lro_deliver(st, c);
775			LIST_REMOVE(c, active_link);
776		}
777		c->delivered = 0;
778	}
779
780	t = *(volatile int *)&ticks;
781	if (__predict_false(t != st->last_purge_ticks))
782		sfxge_lro_purge_idle(rxq, t);
783}
784
785#else	/* !SFXGE_LRO */
786
787static void
788sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
789{
790}
791
792static void
793sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
794{
795}
796
797#endif	/* SFXGE_LRO */
798
799void
800sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
801{
802	struct sfxge_softc *sc = rxq->sc;
803	int if_capenable = sc->ifnet->if_capenable;
804	int lro_enabled = if_capenable & IFCAP_LRO;
805	unsigned int index;
806	struct sfxge_evq *evq;
807	unsigned int completed;
808	unsigned int level;
809	struct mbuf *m;
810	struct sfxge_rx_sw_desc *prev = NULL;
811
812	index = rxq->index;
813	evq = sc->evq[index];
814
815	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
816
817	completed = rxq->completed;
818	while (completed != rxq->pending) {
819		unsigned int id;
820		struct sfxge_rx_sw_desc *rx_desc;
821
822		id = completed++ & rxq->ptr_mask;
823		rx_desc = &rxq->queue[id];
824		m = rx_desc->mbuf;
825
826		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
827			goto discard;
828
829		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
830			goto discard;
831
832		/* Read the length from the pseudo header if required */
833		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
834			uint16_t tmp_size;
835			int rc;
836			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
837							   mtod(m, uint8_t *),
838							   &tmp_size);
839			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
840			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
841		}
842
843		prefetch_read_many(mtod(m, caddr_t));
844
845		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
846		case EFX_PKT_IPV4:
847			if (~if_capenable & IFCAP_RXCSUM)
848				rx_desc->flags &=
849				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
850			break;
851		case EFX_PKT_IPV6:
852			if (~if_capenable & IFCAP_RXCSUM_IPV6)
853				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
854			break;
855		case 0:
856			/* Check for loopback packets */
857			{
858				struct ether_header *etherhp;
859
860				/*LINTED*/
861				etherhp = mtod(m, struct ether_header *);
862
863				if (etherhp->ether_type ==
864				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
865					EFSYS_PROBE(loopback);
866
867					rxq->loopback++;
868					goto discard;
869				}
870			}
871			break;
872		default:
873			KASSERT(B_FALSE,
874			    ("Rx descriptor with both IPv4 and IPv6 flags"));
875			goto discard;
876		}
877
878		/* Pass packet up the stack or into LRO (pipelined) */
879		if (prev != NULL) {
880			if (lro_enabled &&
881			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
882			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
883				sfxge_lro(rxq, prev);
884			else
885				sfxge_rx_deliver(sc, prev);
886		}
887		prev = rx_desc;
888		continue;
889
890discard:
891		/* Return the packet to the pool */
892		m_free(m);
893		rx_desc->mbuf = NULL;
894	}
895	rxq->completed = completed;
896
897	level = rxq->added - rxq->completed;
898
899	/* Pass last packet up the stack or into LRO */
900	if (prev != NULL) {
901		if (lro_enabled &&
902		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
903		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
904			sfxge_lro(rxq, prev);
905		else
906			sfxge_rx_deliver(sc, prev);
907	}
908
909	/*
910	 * If there are any pending flows and this is the end of the
911	 * poll then they must be completed.
912	 */
913	if (eop)
914		sfxge_lro_end_of_burst(rxq);
915
916	/* Top up the queue if necessary */
917	if (level < rxq->refill_threshold)
918		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
919}
920
921static void
922sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
923{
924	struct sfxge_rxq *rxq;
925	struct sfxge_evq *evq;
926	unsigned int count;
927	unsigned int retry = 3;
928
929	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
930
931	rxq = sc->rxq[index];
932	evq = sc->evq[index];
933
934	SFXGE_EVQ_LOCK(evq);
935
936	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
937	    ("rxq not started"));
938
939	rxq->init_state = SFXGE_RXQ_INITIALIZED;
940
941	callout_stop(&rxq->refill_callout);
942
943	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
944		rxq->flush_state = SFXGE_FLUSH_PENDING;
945
946		SFXGE_EVQ_UNLOCK(evq);
947
948		/* Flush the receive queue */
949		if (efx_rx_qflush(rxq->common) != 0) {
950			SFXGE_EVQ_LOCK(evq);
951			rxq->flush_state = SFXGE_FLUSH_FAILED;
952			break;
953		}
954
955		count = 0;
956		do {
957			/* Spin for 100 ms */
958			DELAY(100000);
959
960			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
961				break;
962
963		} while (++count < 20);
964
965		SFXGE_EVQ_LOCK(evq);
966
967		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
968			/* Flush timeout - neither done nor failed */
969			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
970			    device_get_nameunit(sc->dev), index);
971			rxq->flush_state = SFXGE_FLUSH_DONE;
972		}
973		retry--;
974	}
975	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
976		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
977		    device_get_nameunit(sc->dev), index);
978		rxq->flush_state = SFXGE_FLUSH_DONE;
979	}
980
981	rxq->pending = rxq->added;
982	sfxge_rx_qcomplete(rxq, B_TRUE);
983
984	KASSERT(rxq->completed == rxq->pending,
985	    ("rxq->completed != rxq->pending"));
986
987	rxq->added = 0;
988	rxq->pushed = 0;
989	rxq->pending = 0;
990	rxq->completed = 0;
991	rxq->loopback = 0;
992
993	/* Destroy the common code receive queue. */
994	efx_rx_qdestroy(rxq->common);
995
996	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
997	    EFX_RXQ_NBUFS(sc->rxq_entries));
998
999	SFXGE_EVQ_UNLOCK(evq);
1000}
1001
1002static int
1003sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1004{
1005	struct sfxge_rxq *rxq;
1006	efsys_mem_t *esmp;
1007	struct sfxge_evq *evq;
1008	int rc;
1009
1010	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1011
1012	rxq = sc->rxq[index];
1013	esmp = &rxq->mem;
1014	evq = sc->evq[index];
1015
1016	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1017	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1018	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1019	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1020
1021	/* Program the buffer table. */
1022	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1023	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1024		return (rc);
1025
1026	/* Create the common code receive queue. */
1027	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1028	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1029	    &rxq->common)) != 0)
1030		goto fail;
1031
1032	SFXGE_EVQ_LOCK(evq);
1033
1034	/* Enable the receive queue. */
1035	efx_rx_qenable(rxq->common);
1036
1037	rxq->init_state = SFXGE_RXQ_STARTED;
1038	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1039
1040	/* Try to fill the queue from the pool. */
1041	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1042
1043	SFXGE_EVQ_UNLOCK(evq);
1044
1045	return (0);
1046
1047fail:
1048	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1049	    EFX_RXQ_NBUFS(sc->rxq_entries));
1050	return (rc);
1051}
1052
1053void
1054sfxge_rx_stop(struct sfxge_softc *sc)
1055{
1056	int index;
1057
1058	efx_mac_filter_default_rxq_clear(sc->enp);
1059
1060	/* Stop the receive queue(s) */
1061	index = sc->rxq_count;
1062	while (--index >= 0)
1063		sfxge_rx_qstop(sc, index);
1064
1065	sc->rx_prefix_size = 0;
1066	sc->rx_buffer_size = 0;
1067
1068	efx_rx_fini(sc->enp);
1069}
1070
1071int
1072sfxge_rx_start(struct sfxge_softc *sc)
1073{
1074	struct sfxge_intr *intr;
1075	const efx_nic_cfg_t *encp;
1076	size_t hdrlen, align, reserved;
1077	int index;
1078	int rc;
1079
1080	intr = &sc->intr;
1081
1082	/* Initialize the common code receive module. */
1083	if ((rc = efx_rx_init(sc->enp)) != 0)
1084		return (rc);
1085
1086	encp = efx_nic_cfg_get(sc->enp);
1087	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1088
1089	/* Calculate the receive packet buffer size. */
1090	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1091
1092	/* Ensure IP headers are 32-bit aligned */
1093	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1094	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1095
1096	sc->rx_buffer_size += sc->rx_buffer_align;
1097
1098	/* Align end of packet buffer for RX DMA end padding */
1099	align = MAX(1, encp->enc_rx_buf_align_end);
1100	EFSYS_ASSERT(ISP2(align));
1101	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1102
1103	/*
1104	 * Standard mbuf zones only guarantee pointer-size alignment;
1105	 * we need extra space to align to the cache line
1106	 */
1107	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1108
1109	/* Select zone for packet buffers */
1110	if (reserved <= MCLBYTES)
1111		sc->rx_cluster_size = MCLBYTES;
1112	else if (reserved <= MJUMPAGESIZE)
1113		sc->rx_cluster_size = MJUMPAGESIZE;
1114	else if (reserved <= MJUM9BYTES)
1115		sc->rx_cluster_size = MJUM9BYTES;
1116	else
1117		sc->rx_cluster_size = MJUM16BYTES;
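
	/*
	 * Illustrative sizing arithmetic (assuming, for example, a 16-byte
	 * hardware prefix; the real value comes from enc_rx_prefix_size
	 * above): hdrlen = 16 + 14 = 30, so rx_buffer_align =
	 * P2ROUNDUP(30, 4) - 30 = 2.  Shifting each mbuf's data pointer by
	 * those 2 bytes in sfxge_rx_qfill() places the IP header, which
	 * follows the prefix and the 14-byte Ethernet header, on a 4-byte
	 * boundary.
	 */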
1118
1119	/*
1120	 * Set up the scale table.  Enable all hash types and hash insertion.
1121	 */
1122	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1123		sc->rx_indir_table[index] = index % sc->rxq_count;
1124	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1125				       SFXGE_RX_SCALE_MAX)) != 0)
1126		goto fail;
1127	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1128	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1129	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1130
1131	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1132				       sizeof(toep_key))) != 0)
1133		goto fail;
1134
1135	/* Start the receive queue(s). */
1136	for (index = 0; index < sc->rxq_count; index++) {
1137		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1138			goto fail2;
1139	}
1140
1141	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1142					    sc->intr.n_alloc > 1);
1143	if (rc != 0)
1144		goto fail3;
1145
1146	return (0);
1147
1148fail3:
1149fail2:
1150	while (--index >= 0)
1151		sfxge_rx_qstop(sc, index);
1152
1153fail:
1154	efx_rx_fini(sc->enp);
1155
1156	return (rc);
1157}
1158
1159#ifdef SFXGE_LRO
1160
1161static void sfxge_lro_init(struct sfxge_rxq *rxq)
1162{
1163	struct sfxge_lro_state *st = &rxq->lro;
1164	unsigned i;
1165
1166	st->conns_mask = lro_table_size - 1;
1167	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1168		("lro_table_size must be a power of 2"));
1169	st->sc = rxq->sc;
1170	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1171			   M_SFXGE, M_WAITOK);
1172	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1173			     M_SFXGE, M_WAITOK);
1174	for (i = 0; i <= st->conns_mask; ++i) {
1175		TAILQ_INIT(&st->conns[i]);
1176		st->conns_n[i] = 0;
1177	}
1178	LIST_INIT(&st->active_conns);
1179	TAILQ_INIT(&st->free_conns);
1180}
1181
1182static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1183{
1184	struct sfxge_lro_state *st = &rxq->lro;
1185	struct sfxge_lro_conn *c;
1186	unsigned i;
1187
1188	/* Return cleanly if sfxge_lro_init() has not been called. */
1189	if (st->conns == NULL)
1190		return;
1191
1192	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1193
1194	for (i = 0; i <= st->conns_mask; ++i) {
1195		while (!TAILQ_EMPTY(&st->conns[i])) {
1196			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1197			sfxge_lro_drop(rxq, c);
1198		}
1199	}
1200
1201	while (!TAILQ_EMPTY(&st->free_conns)) {
1202		c = TAILQ_FIRST(&st->free_conns);
1203		TAILQ_REMOVE(&st->free_conns, c, link);
1204		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1205		free(c, M_SFXGE);
1206	}
1207
1208	free(st->conns_n, M_SFXGE);
1209	free(st->conns, M_SFXGE);
1210	st->conns = NULL;
1211}
1212
1213#else
1214
1215static void
1216sfxge_lro_init(struct sfxge_rxq *rxq)
1217{
1218}
1219
1220static void
1221sfxge_lro_fini(struct sfxge_rxq *rxq)
1222{
1223}
1224
1225#endif	/* SFXGE_LRO */
1226
1227static void
1228sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1229{
1230	struct sfxge_rxq *rxq;
1231
1232	rxq = sc->rxq[index];
1233
1234	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1235	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1236
1237	/* Free the context array and the flow table. */
1238	free(rxq->queue, M_SFXGE);
1239	sfxge_lro_fini(rxq);
1240
1241	/* Release DMA memory. */
1242	sfxge_dma_free(&rxq->mem);
1243
1244	sc->rxq[index] = NULL;
1245
1246	free(rxq, M_SFXGE);
1247}
1248
1249static int
1250sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1251{
1252	struct sfxge_rxq *rxq;
1253	struct sfxge_evq *evq;
1254	efsys_mem_t *esmp;
1255	int rc;
1256
1257	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1258
1259	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1260	rxq->sc = sc;
1261	rxq->index = index;
1262	rxq->entries = sc->rxq_entries;
1263	rxq->ptr_mask = rxq->entries - 1;
1264	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1265
1266	sc->rxq[index] = rxq;
1267	esmp = &rxq->mem;
1268
1269	evq = sc->evq[index];
1270
1271	/* Allocate and zero DMA space. */
1272	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1273		return (rc);
1274
1275	/* Allocate buffer table entries. */
1276	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1277				 &rxq->buf_base_id);
1278
1279	/* Allocate the context array and the flow table. */
1280	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1281	    M_SFXGE, M_WAITOK | M_ZERO);
1282	sfxge_lro_init(rxq);
1283
1284	callout_init(&rxq->refill_callout, 1);
1285
1286	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1287
1288	return (0);
1289}
1290
1291static const struct {
1292	const char *name;
1293	size_t offset;
1294} sfxge_rx_stats[] = {
1295#define	SFXGE_RX_STAT(name, member) \
1296	{ #name, offsetof(struct sfxge_rxq, member) }
1297#ifdef SFXGE_LRO
1298	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1299	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1300	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1301	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1302	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1303	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1304	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1305	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1306#endif
1307};
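
/*
 * Each entry pairs a sysctl name with the offset of an unsigned int
 * counter inside struct sfxge_rxq, so the single handler below can sum
 * any of them across all receive queues.  For instance, the per-queue
 * loopback counter maintained in sfxge_rx_qcomplete() could be exported
 * the same way by adding an entry such as:
 *
 *	SFXGE_RX_STAT(loopback, loopback),
 *
 * (not done here; shown only to illustrate the pattern).
 */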
1308
1309static int
1310sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1311{
1312	struct sfxge_softc *sc = arg1;
1313	unsigned int id = arg2;
1314	unsigned int sum, index;
1315
1316	/* Sum across all RX queues */
1317	sum = 0;
1318	for (index = 0; index < sc->rxq_count; index++)
1319		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1320					 sfxge_rx_stats[id].offset);
1321
1322	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1323}
1324
1325static void
1326sfxge_rx_stat_init(struct sfxge_softc *sc)
1327{
1328	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1329	struct sysctl_oid_list *stat_list;
1330	unsigned int id;
1331
1332	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1333
1334	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1335		SYSCTL_ADD_PROC(
1336			ctx, stat_list,
1337			OID_AUTO, sfxge_rx_stats[id].name,
1338			CTLTYPE_UINT|CTLFLAG_RD,
1339			sc, id, sfxge_rx_stat_handler, "IU",
1340			"");
1341	}
1342}
1343
1344void
1345sfxge_rx_fini(struct sfxge_softc *sc)
1346{
1347	int index;
1348
1349	index = sc->rxq_count;
1350	while (--index >= 0)
1351		sfxge_rx_qfini(sc, index);
1352
1353	sc->rxq_count = 0;
1354}
1355
1356int
1357sfxge_rx_init(struct sfxge_softc *sc)
1358{
1359	struct sfxge_intr *intr;
1360	int index;
1361	int rc;
1362
1363#ifdef SFXGE_LRO
1364	if (!ISP2(lro_table_size)) {
1365		log(LOG_ERR, "%s=%u must be a power of 2\n",
1366		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1367		rc = EINVAL;
1368		goto fail_lro_table_size;
1369	}
1370
1371	if (lro_idle_ticks == 0)
1372		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1373#endif
1374
1375	intr = &sc->intr;
1376
1377	sc->rxq_count = intr->n_alloc;
1378
1379	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1380	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1381
1382	/* Initialize the receive queue(s) - one per interrupt. */
1383	for (index = 0; index < sc->rxq_count; index++) {
1384		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1385			goto fail;
1386	}
1387
1388	sfxge_rx_stat_init(sc);
1389
1390	return (0);
1391
1392fail:
1393	/* Tear down the receive queue(s). */
1394	while (--index >= 0)
1395		sfxge_rx_qfini(sc, index);
1396
1397	sc->rxq_count = 0;
1398
1399#ifdef SFXGE_LRO
1400fail_lro_table_size:
1401#endif
1402	return (rc);
1403}
1404