sfxge_rx.c revision 301065
1/*-
2 * Copyright (c) 2010-2016 Solarflare Communications Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright notice,
12 *    this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * The views and conclusions contained in the software and documentation are
30 * those of the authors and should not be interpreted as representing official
31 * policies, either expressed or implied, of the FreeBSD Project.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 301065 2016-05-31 18:31:17Z arybchik $");
36
37#include <sys/param.h>
38#include <sys/malloc.h>
39#include <sys/mbuf.h>
40#include <sys/smp.h>
41#include <sys/socket.h>
42#include <sys/sysctl.h>
43#include <sys/syslog.h>
44#include <sys/limits.h>
46
47#include <net/ethernet.h>
48#include <net/if.h>
49#include <net/if_vlan_var.h>
50
51#include <netinet/in.h>
52#include <netinet/ip.h>
53#include <netinet/ip6.h>
54#include <netinet/tcp.h>
55
56#include <machine/in_cksum.h>
57
58#include "common/efx.h"
59
60
61#include "sfxge.h"
62#include "sfxge_rx.h"
63
64#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
65
66#ifdef SFXGE_LRO
67
68SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
69	    "Large receive offload (LRO) parameters");
70
71#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
72
73/* Size of the LRO hash table.  Must be a power of 2.  A larger table
74 * means we can accelerate a larger number of streams.
75 */
76static unsigned lro_table_size = 128;
77TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
78SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
79	    &lro_table_size, 0,
80	    "Size of the LRO hash table (must be a power of 2)");
81
82/* Maximum length of a hash chain.  If chains get too long then the lookup
83 * time increases and may exceed the benefit of LRO.
84 */
85static unsigned lro_chain_max = 20;
86TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
87SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
88	    &lro_chain_max, 0,
89	    "The maximum length of a hash chain");
90
91/* Maximum time (in ticks) that a connection can be idle before its LRO
92 * state is discarded.
93 */
94static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
95TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
96SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
97	    &lro_idle_ticks, 0,
98	    "The maximum time (in ticks) that a connection can be idle "
99	    "before its LRO state is discarded");
100
101/* Number of packets with payload that must arrive in-order before a
102 * connection is eligible for LRO.  The idea is we should avoid coalescing
103 * segments when the sender is in slow-start because reducing the ACK rate
104 * can damage performance.
105 */
106static int lro_slow_start_packets = 2000;
107TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
108SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
109	    &lro_slow_start_packets, 0,
110	    "Number of packets with payload that must arrive in-order before "
111	    "a connection is eligible for LRO");
112
113/* Number of packets with payload that must arrive in-order following loss
114 * before a connection is eligible for LRO.  The idea is we should avoid
115 * coalescing segments when the sender is recovering from loss, because
116 * reducing the ACK rate can damage performance.
117 */
118static int lro_loss_packets = 20;
119TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
120SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
121	    &lro_loss_packets, 0,
122	    "Number of packets with payload that must arrive in-order "
123	    "following loss before a connection is eligible for LRO");
124
125/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
126#define	SFXGE_LRO_L2_ID_VLAN 0x4000
127#define	SFXGE_LRO_L2_ID_IPV6 0x8000
128#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
129#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
130
131/* Compare IPv6 addresses, avoiding conditional branches */
132static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
133				   const struct in6_addr *right)
134{
135#if LONG_BIT == 64
136	const uint64_t *left64 = (const uint64_t *)left;
137	const uint64_t *right64 = (const uint64_t *)right;
138	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
139#else
140	return (left->s6_addr32[0] - right->s6_addr32[0]) |
141	       (left->s6_addr32[1] - right->s6_addr32[1]) |
142	       (left->s6_addr32[2] - right->s6_addr32[2]) |
143	       (left->s6_addr32[3] - right->s6_addr32[3]);
144#endif
145}
146
147#endif	/* SFXGE_LRO */
148
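/* Record the outcome of an RX queue flush (expected to be called from event handling) */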
149void
150sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
151{
152
153	rxq->flush_state = SFXGE_FLUSH_DONE;
154}
155
156void
157sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
158{
159
160	rxq->flush_state = SFXGE_FLUSH_FAILED;
161}
162
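/* RSS hash key used for Toeplitz hashing; programmed via efx_rx_scale_key_set() in sfxge_rx_start() */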
163static uint8_t toep_key[] = {
164	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
165	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
166	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
167	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
168	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
169};
170
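/* Callout handler: post a software (magic) event so the queue is refilled from the event queue context */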
171static void
172sfxge_rx_post_refill(void *arg)
173{
174	struct sfxge_rxq *rxq = arg;
175	struct sfxge_softc *sc;
176	unsigned int index;
177	struct sfxge_evq *evq;
178	unsigned int label;
179	uint16_t magic;
180
181	sc = rxq->sc;
182	index = rxq->index;
183	evq = sc->evq[index];
184
185	label = 0;
186	KASSERT((label & SFXGE_MAGIC_DMAQ_LABEL_MASK) == label,
187	    ("(label & SFXGE_MAGIC_DMAQ_LABEL_MASK) != level"));
188	magic = SFXGE_MAGIC_RX_QREFILL | label;
189
190	/* This is guaranteed due to the start/stop order of rx and ev */
191	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
192	    ("evq not started"));
193	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
194	    ("rxq not started"));
195	efx_ev_qpost(evq->common, magic);
196}
197
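/* Arm the refill callout, backing off if a previous refill attempt failed */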
198static void
199sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
200{
201	/* Initially retry after 100 ms, but back off in case of
202	 * repeated failures as we probably have to wait for the
203	 * administrator to raise the pool limit. */
204	if (retrying)
205		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
206	else
207		rxq->refill_delay = hz / 10;
208
209	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
210			     sfxge_rx_post_refill, rxq);
211}
212
213#define	SFXGE_REFILL_BATCH  64
214
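/* Allocate, DMA-map and post receive buffers until 'target' descriptors are outstanding */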
215static void
216sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
217{
218	struct sfxge_softc *sc;
219	unsigned int index;
220	struct sfxge_evq *evq;
221	unsigned int batch;
222	unsigned int rxfill;
223	unsigned int mblksize;
224	int ntodo;
225	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
226
227	sc = rxq->sc;
228	index = rxq->index;
229	evq = sc->evq[index];
230
231	prefetch_read_many(sc->enp);
232	prefetch_read_many(rxq->common);
233
234	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
235
236	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
237		return;
238
239	rxfill = rxq->added - rxq->completed;
240	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
241	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
242	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
243	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
244	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
245
246	if (ntodo == 0)
247		return;
248
249	batch = 0;
250	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
251	while (ntodo-- > 0) {
252		unsigned int id;
253		struct sfxge_rx_sw_desc *rx_desc;
254		bus_dma_segment_t seg;
255		struct mbuf *m;
256
257		id = (rxq->added + batch) & rxq->ptr_mask;
258		rx_desc = &rxq->queue[id];
259		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
260
261		rx_desc->flags = EFX_DISCARD;
262		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
263		    sc->rx_cluster_size);
264		if (m == NULL)
265			break;
266
267		/* m_len specifies length of area to be mapped for DMA */
268		m->m_len  = mblksize;
269		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
270		m->m_data += sc->rx_buffer_align;
271
272		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
273		addr[batch++] = seg.ds_addr;
274
275		if (batch == SFXGE_REFILL_BATCH) {
276			efx_rx_qpost(rxq->common, addr, mblksize, batch,
277			    rxq->completed, rxq->added);
278			rxq->added += batch;
279			batch = 0;
280		}
281	}
282
283	if (ntodo != 0)
284		sfxge_rx_schedule_refill(rxq, retrying);
285
286	if (batch != 0) {
287		efx_rx_qpost(rxq->common, addr, mblksize, batch,
288		    rxq->completed, rxq->added);
289		rxq->added += batch;
290	}
291
292	/* Make the descriptors visible to the hardware */
293	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
294			BUS_DMASYNC_PREWRITE);
295
296	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
297
298	/* The queue could still be empty if no descriptors were actually
299	 * pushed, in which case there will be no event to cause the next
300	 * refill, so we must schedule a refill ourselves.
301	 */
302	if (rxq->pushed == rxq->completed) {
303		sfxge_rx_schedule_refill(rxq, retrying);
304	}
305}
306
307void
308sfxge_rx_qrefill(struct sfxge_rxq *rxq)
309{
310
311	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
312		return;
313
314	/* Make sure the queue is full */
315	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
316}
317
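/* Hand a completed packet to the network stack */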
318static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
319{
320	struct ifnet *ifp = sc->ifnet;
321
322	m->m_pkthdr.rcvif = ifp;
323	m->m_pkthdr.csum_data = 0xffff;
324	ifp->if_input(ifp, m);
325}
326
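/* Attach checksum and RSS hash metadata to a received packet and deliver it */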
327static void
328sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
329{
330	struct mbuf *m = rx_desc->mbuf;
331	int flags = rx_desc->flags;
332	int csum_flags;
333
334	/* Convert checksum flags */
335	csum_flags = (flags & EFX_CKSUM_IPV4) ?
336		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
337	if (flags & EFX_CKSUM_TCPUDP)
338		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
339
340	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
341		m->m_pkthdr.flowid =
342			efx_psuedo_hdr_hash_get(sc->enp,
343						EFX_RX_HASHALG_TOEPLITZ,
344						mtod(m, uint8_t *));
345		/* The hash covers a 4-tuple for TCP only */
346		M_HASHTYPE_SET(m,
347		    (flags & EFX_PKT_IPV4) ?
348			((flags & EFX_PKT_TCP) ?
349			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
350			((flags & EFX_PKT_TCP) ?
351			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
352	}
353	m->m_data += sc->rx_prefix_size;
354	m->m_len = rx_desc->size - sc->rx_prefix_size;
355	m->m_pkthdr.len = m->m_len;
356	m->m_pkthdr.csum_flags = csum_flags;
357	__sfxge_rx_deliver(sc, rx_desc->mbuf);
358
359	rx_desc->flags = EFX_DISCARD;
360	rx_desc->mbuf = NULL;
361}
362
363#ifdef SFXGE_LRO
364
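/* Deliver the coalesced packet held by a connection, restoring the header fields mangled during merging */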
365static void
366sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
367{
368	struct sfxge_softc *sc = st->sc;
369	struct mbuf *m = c->mbuf;
370	struct tcphdr *c_th;
371	int csum_flags;
372
373	KASSERT(m, ("no mbuf to deliver"));
374
375	++st->n_bursts;
376
377	/* Finish off packet munging and recalculate IP header checksum. */
378	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
379		struct ip *iph = c->nh;
380		iph->ip_len = htons(iph->ip_len);
381		iph->ip_sum = 0;
382		iph->ip_sum = in_cksum_hdr(iph);
383		c_th = (struct tcphdr *)(iph + 1);
384		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
385			      CSUM_IP_CHECKED | CSUM_IP_VALID);
386	} else {
387		struct ip6_hdr *iph = c->nh;
388		iph->ip6_plen = htons(iph->ip6_plen);
389		c_th = (struct tcphdr *)(iph + 1);
390		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
391	}
392
393	c_th->th_win = c->th_last->th_win;
394	c_th->th_ack = c->th_last->th_ack;
395	if (c_th->th_off == c->th_last->th_off) {
396		/* Copy TCP options (take care to avoid going negative). */
397		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
398		memcpy(c_th + 1, c->th_last + 1, optlen);
399	}
400
401	m->m_pkthdr.flowid = c->conn_hash;
402	M_HASHTYPE_SET(m,
403	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
404		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
405
406	m->m_pkthdr.csum_flags = csum_flags;
407	__sfxge_rx_deliver(sc, m);
408
409	c->mbuf = NULL;
410	c->delivered = 1;
411}
412
413/* Drop the given connection, and add it to the free list. */
414static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
415{
416	unsigned bucket;
417
418	KASSERT(!c->mbuf, ("found orphaned mbuf"));
419
420	if (c->next_buf.mbuf != NULL) {
421		sfxge_rx_deliver(rxq->sc, &c->next_buf);
422		LIST_REMOVE(c, active_link);
423	}
424
425	bucket = c->conn_hash & rxq->lro.conns_mask;
426	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
427	--rxq->lro.conns_n[bucket];
428	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
429	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
430}
431
432/* Stop tracking connections that have gone idle in order to keep hash
433 * chains short.
434 */
435static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
436{
437	struct sfxge_lro_conn *c;
438	unsigned i;
439
440	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
441		("found active connections"));
442
443	rxq->lro.last_purge_ticks = now;
444	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
445		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
446			continue;
447
448		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
449		if (now - c->last_pkt_ticks > lro_idle_ticks) {
450			++rxq->lro.n_drop_idle;
451			sfxge_lro_drop(rxq, c);
452		}
453	}
454}
455
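/* Append a newly received segment's payload to the connection's coalesced packet */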
456static void
457sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
458		struct mbuf *mbuf, struct tcphdr *th)
459{
460	struct tcphdr *c_th;
461
462	/* Tack the new mbuf onto the chain. */
463	KASSERT(!mbuf->m_next, ("mbuf already chained"));
464	c->mbuf_tail->m_next = mbuf;
465	c->mbuf_tail = mbuf;
466
467	/* Increase length appropriately */
468	c->mbuf->m_pkthdr.len += mbuf->m_len;
469
470	/* Update the connection state flags */
471	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
472		struct ip *iph = c->nh;
473		iph->ip_len += mbuf->m_len;
474		c_th = (struct tcphdr *)(iph + 1);
475	} else {
476		struct ip6_hdr *iph = c->nh;
477		iph->ip6_plen += mbuf->m_len;
478		c_th = (struct tcphdr *)(iph + 1);
479	}
480	c_th->th_flags |= (th->th_flags & TH_PUSH);
481	c->th_last = th;
482	++st->n_merges;
483
484	/* Pass packet up now if another segment could overflow the IP
485	 * length.
486	 */
487	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
488		sfxge_lro_deliver(st, c);
489}
490
491static void
492sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
493		struct mbuf *mbuf, void *nh, struct tcphdr *th)
494{
495	/* Start the chain */
496	c->mbuf = mbuf;
497	c->mbuf_tail = c->mbuf;
498	c->nh = nh;
499	c->th_last = th;
500
501	mbuf->m_pkthdr.len = mbuf->m_len;
502
503	/* Mangle header fields for later processing */
504	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
505		struct ip *iph = nh;
506		iph->ip_len = ntohs(iph->ip_len);
507	} else {
508		struct ip6_hdr *iph = nh;
509		iph->ip6_plen = ntohs(iph->ip6_plen);
510	}
511}
512
513/* Try to merge or otherwise hold or deliver (as appropriate) the
514 * packet buffered for this connection (c->next_buf).  Return a flag
515 * indicating whether the connection is still active for LRO purposes.
516 */
517static int
518sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
519{
520	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
521	char *eh = c->next_eh;
522	int data_length, hdr_length, dont_merge;
523	unsigned th_seq, pkt_length;
524	struct tcphdr *th;
525	unsigned now;
526
527	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
528		struct ip *iph = c->next_nh;
529		th = (struct tcphdr *)(iph + 1);
530		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
531	} else {
532		struct ip6_hdr *iph = c->next_nh;
533		th = (struct tcphdr *)(iph + 1);
534		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
535	}
536
537	hdr_length = (char *) th + th->th_off * 4 - eh;
538	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
539		       hdr_length);
540	th_seq = ntohl(th->th_seq);
541	dont_merge = ((data_length <= 0)
542		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
543
544	/* Check for options other than aligned timestamp. */
545	if (th->th_off != 5) {
546		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
547		if (th->th_off == 8 &&
548		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
549					(TCPOPT_NOP << 16) |
550					(TCPOPT_TIMESTAMP << 8) |
551					TCPOLEN_TIMESTAMP)) {
552			/* timestamp option -- okay */
553		} else {
554			dont_merge = 1;
555		}
556	}
557
558	if (__predict_false(th_seq != c->next_seq)) {
559		/* Out-of-order, so start counting again. */
560		if (c->mbuf != NULL)
561			sfxge_lro_deliver(&rxq->lro, c);
562		c->n_in_order_pkts -= lro_loss_packets;
563		c->next_seq = th_seq + data_length;
564		++rxq->lro.n_misorder;
565		goto deliver_buf_out;
566	}
567	c->next_seq = th_seq + data_length;
568
569	now = ticks;
570	if (now - c->last_pkt_ticks > lro_idle_ticks) {
571		++rxq->lro.n_drop_idle;
572		if (c->mbuf != NULL)
573			sfxge_lro_deliver(&rxq->lro, c);
574		sfxge_lro_drop(rxq, c);
575		return (0);
576	}
577	c->last_pkt_ticks = ticks;
578
579	if (c->n_in_order_pkts < lro_slow_start_packets) {
580		/* May be in slow-start, so don't merge. */
581		++rxq->lro.n_slow_start;
582		++c->n_in_order_pkts;
583		goto deliver_buf_out;
584	}
585
586	if (__predict_false(dont_merge)) {
587		if (c->mbuf != NULL)
588			sfxge_lro_deliver(&rxq->lro, c);
589		if (th->th_flags & (TH_FIN | TH_RST)) {
590			++rxq->lro.n_drop_closed;
591			sfxge_lro_drop(rxq, c);
592			return (0);
593		}
594		goto deliver_buf_out;
595	}
596
597	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
598
599	if (__predict_true(c->mbuf != NULL)) {
600		/* Remove headers and any padding */
601		rx_buf->mbuf->m_data += hdr_length;
602		rx_buf->mbuf->m_len = data_length;
603
604		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
605	} else {
606		/* Remove any padding */
607		rx_buf->mbuf->m_len = pkt_length;
608
609		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
610	}
611
612	rx_buf->mbuf = NULL;
613	return (1);
614
615 deliver_buf_out:
616	sfxge_rx_deliver(rxq->sc, rx_buf);
617	return (1);
618}
619
620static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
621			       uint16_t l2_id, void *nh, struct tcphdr *th)
622{
623	unsigned bucket = conn_hash & st->conns_mask;
624	struct sfxge_lro_conn *c;
625
626	if (st->conns_n[bucket] >= lro_chain_max) {
627		++st->n_too_many;
628		return;
629	}
630
631	if (!TAILQ_EMPTY(&st->free_conns)) {
632		c = TAILQ_FIRST(&st->free_conns);
633		TAILQ_REMOVE(&st->free_conns, c, link);
634	} else {
635		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
636		if (c == NULL)
637			return;
638		c->mbuf = NULL;
639		c->next_buf.mbuf = NULL;
640	}
641
642	/* Create the connection tracking data */
643	++st->conns_n[bucket];
644	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
645	c->l2_id = l2_id;
646	c->conn_hash = conn_hash;
647	c->source = th->th_sport;
648	c->dest = th->th_dport;
649	c->n_in_order_pkts = 0;
650	c->last_pkt_ticks = *(volatile int *)&ticks;
651	c->delivered = 0;
652	++st->n_new_stream;
653	/* NB. We don't initialise c->next_seq, and it doesn't matter what
654	 * value it has.  Most likely the next packet received for this
655	 * connection will not match -- no harm done.
656	 */
657}
658
659/* Process mbuf and decide whether to dispatch it to the stack now or
660 * later.
661 */
662static void
663sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
664{
665	struct sfxge_softc *sc = rxq->sc;
666	struct mbuf *m = rx_buf->mbuf;
667	struct ether_header *eh;
668	struct sfxge_lro_conn *c;
669	uint16_t l2_id;
670	uint16_t l3_proto;
671	void *nh;
672	struct tcphdr *th;
673	uint32_t conn_hash;
674	unsigned bucket;
675
676	/* Get the hardware hash */
677	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
678					    EFX_RX_HASHALG_TOEPLITZ,
679					    mtod(m, uint8_t *));
680
681	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
682	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
683		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
684		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
685			SFXGE_LRO_L2_ID_VLAN;
686		l3_proto = veh->evl_proto;
687		nh = veh + 1;
688	} else {
689		l2_id = 0;
690		l3_proto = eh->ether_type;
691		nh = eh + 1;
692	}
693
694	/* Check whether this is a suitable packet (unfragmented
695	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
696	 * length, and compute a hash if necessary.  If not, return.
697	 */
698	if (l3_proto == htons(ETHERTYPE_IP)) {
699		struct ip *iph = nh;
700
701		KASSERT(iph->ip_p == IPPROTO_TCP,
702		    ("IPv4 protocol is not TCP, but packet marker is set"));
703		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
704		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
705			goto deliver_now;
706		th = (struct tcphdr *)(iph + 1);
707	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
708		struct ip6_hdr *iph = nh;
709
710		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
711		    ("IPv6 next header is not TCP, but packet marker is set"));
712		l2_id |= SFXGE_LRO_L2_ID_IPV6;
713		th = (struct tcphdr *)(iph + 1);
714	} else {
715		goto deliver_now;
716	}
717
718	bucket = conn_hash & rxq->lro.conns_mask;
719
720	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
721		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
722			continue;
723		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
724			continue;
725		if (c->mbuf != NULL) {
726			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
727				struct ip *c_iph, *iph = nh;
728				c_iph = c->nh;
729				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
730				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
731					continue;
732			} else {
733				struct ip6_hdr *c_iph, *iph = nh;
734				c_iph = c->nh;
735				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
736				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
737					continue;
738			}
739		}
740
741		/* Re-insert at head of list to reduce lookup time. */
742		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
743		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
744
745		if (c->next_buf.mbuf != NULL) {
746			if (!sfxge_lro_try_merge(rxq, c))
747				goto deliver_now;
748		} else {
749			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
750			    active_link);
751		}
752		c->next_buf = *rx_buf;
753		c->next_eh = eh;
754		c->next_nh = nh;
755
756		rx_buf->mbuf = NULL;
757		rx_buf->flags = EFX_DISCARD;
758		return;
759	}
760
761	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
762 deliver_now:
763	sfxge_rx_deliver(sc, rx_buf);
764}
765
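/* Flush held packets at the end of an event burst and periodically purge idle connections */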
766static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
767{
768	struct sfxge_lro_state *st = &rxq->lro;
769	struct sfxge_lro_conn *c;
770	unsigned t;
771
772	while (!LIST_EMPTY(&st->active_conns)) {
773		c = LIST_FIRST(&st->active_conns);
774		if (!c->delivered && c->mbuf != NULL)
775			sfxge_lro_deliver(st, c);
776		if (sfxge_lro_try_merge(rxq, c)) {
777			if (c->mbuf != NULL)
778				sfxge_lro_deliver(st, c);
779			LIST_REMOVE(c, active_link);
780		}
781		c->delivered = 0;
782	}
783
784	t = *(volatile int *)&ticks;
785	if (__predict_false(t != st->last_purge_ticks))
786		sfxge_lro_purge_idle(rxq, t);
787}
788
789#else	/* !SFXGE_LRO */
790
791static void
792sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
793{
794}
795
796static void
797sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
798{
799}
800
801#endif	/* SFXGE_LRO */
802
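/* Process completed receive descriptors, dispatching packets to LRO or the stack, then top up the queue */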
803void
804sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
805{
806	struct sfxge_softc *sc = rxq->sc;
807	int if_capenable = sc->ifnet->if_capenable;
808	int lro_enabled = if_capenable & IFCAP_LRO;
809	unsigned int index;
810	struct sfxge_evq *evq;
811	unsigned int completed;
812	unsigned int level;
813	struct mbuf *m;
814	struct sfxge_rx_sw_desc *prev = NULL;
815
816	index = rxq->index;
817	evq = sc->evq[index];
818
819	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
820
821	completed = rxq->completed;
822	while (completed != rxq->pending) {
823		unsigned int id;
824		struct sfxge_rx_sw_desc *rx_desc;
825
826		id = completed++ & rxq->ptr_mask;
827		rx_desc = &rxq->queue[id];
828		m = rx_desc->mbuf;
829
830		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
831			goto discard;
832
833		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
834			goto discard;
835
836		/* Read the length from the pseudo header if required */
837		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
838			uint16_t tmp_size;
839			int rc;
840			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
841							   mtod(m, uint8_t *),
842							   &tmp_size);
843			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
844			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
845		}
846
847		prefetch_read_many(mtod(m, caddr_t));
848
849		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
850		case EFX_PKT_IPV4:
851			if (~if_capenable & IFCAP_RXCSUM)
852				rx_desc->flags &=
853				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
854			break;
855		case EFX_PKT_IPV6:
856			if (~if_capenable & IFCAP_RXCSUM_IPV6)
857				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
858			break;
859		case 0:
860			/* Check for loopback packets */
861			{
862				struct ether_header *etherhp;
863
864				/*LINTED*/
865				etherhp = mtod(m, struct ether_header *);
866
867				if (etherhp->ether_type ==
868				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
869					EFSYS_PROBE(loopback);
870
871					rxq->loopback++;
872					goto discard;
873				}
874			}
875			break;
876		default:
877			KASSERT(B_FALSE,
878			    ("Rx descriptor with both IPv4 and IPv6 flags"));
879			goto discard;
880		}
881
882		/* Pass packet up the stack or into LRO (pipelined) */
883		if (prev != NULL) {
884			if (lro_enabled &&
885			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
886			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
887				sfxge_lro(rxq, prev);
888			else
889				sfxge_rx_deliver(sc, prev);
890		}
891		prev = rx_desc;
892		continue;
893
894discard:
895		/* Return the packet to the pool */
896		m_free(m);
897		rx_desc->mbuf = NULL;
898	}
899	rxq->completed = completed;
900
901	level = rxq->added - rxq->completed;
902
903	/* Pass last packet up the stack or into LRO */
904	if (prev != NULL) {
905		if (lro_enabled &&
906		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
907		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
908			sfxge_lro(rxq, prev);
909		else
910			sfxge_rx_deliver(sc, prev);
911	}
912
913	/*
914	 * If there are any pending flows and this is the end of the
915	 * poll then they must be completed.
916	 */
917	if (eop)
918		sfxge_lro_end_of_burst(rxq);
919
920	/* Top up the queue if necessary */
921	if (level < rxq->refill_threshold)
922		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
923}
924
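/* Stop an RX queue: flush it (with retries), complete outstanding descriptors and tear down common code state */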
925static void
926sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
927{
928	struct sfxge_rxq *rxq;
929	struct sfxge_evq *evq;
930	unsigned int count;
931	unsigned int retry = 3;
932
933	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
934
935	rxq = sc->rxq[index];
936	evq = sc->evq[index];
937
938	SFXGE_EVQ_LOCK(evq);
939
940	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
941	    ("rxq not started"));
942
943	rxq->init_state = SFXGE_RXQ_INITIALIZED;
944
945	callout_stop(&rxq->refill_callout);
946
947	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
948		rxq->flush_state = SFXGE_FLUSH_PENDING;
949
950		SFXGE_EVQ_UNLOCK(evq);
951
952		/* Flush the receive queue */
953		if (efx_rx_qflush(rxq->common) != 0) {
954			SFXGE_EVQ_LOCK(evq);
955			rxq->flush_state = SFXGE_FLUSH_FAILED;
956			break;
957		}
958
959		count = 0;
960		do {
961			/* Spin for 100 ms */
962			DELAY(100000);
963
964			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
965				break;
966
967		} while (++count < 20);
968
969		SFXGE_EVQ_LOCK(evq);
970
971		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
972			/* Flush timeout - neither done nor failed */
973			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
974			    device_get_nameunit(sc->dev), index);
975			rxq->flush_state = SFXGE_FLUSH_DONE;
976		}
977		retry--;
978	}
979	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
980		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
981		    device_get_nameunit(sc->dev), index);
982		rxq->flush_state = SFXGE_FLUSH_DONE;
983	}
984
985	rxq->pending = rxq->added;
986	sfxge_rx_qcomplete(rxq, B_TRUE);
987
988	KASSERT(rxq->completed == rxq->pending,
989	    ("rxq->completed != rxq->pending"));
990
991	rxq->added = 0;
992	rxq->pushed = 0;
993	rxq->pending = 0;
994	rxq->completed = 0;
995	rxq->loopback = 0;
996
997	/* Destroy the common code receive queue. */
998	efx_rx_qdestroy(rxq->common);
999
1000	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1001	    EFX_RXQ_NBUFS(sc->rxq_entries));
1002
1003	SFXGE_EVQ_UNLOCK(evq);
1004}
1005
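/* Start an RX queue: program the buffer table, create and enable the common code queue, then fill it */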
1006static int
1007sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1008{
1009	struct sfxge_rxq *rxq;
1010	efsys_mem_t *esmp;
1011	struct sfxge_evq *evq;
1012	int rc;
1013
1014	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1015
1016	rxq = sc->rxq[index];
1017	esmp = &rxq->mem;
1018	evq = sc->evq[index];
1019
1020	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1021	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1022	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1023	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1024
1025	/* Program the buffer table. */
1026	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1027	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1028		return (rc);
1029
1030	/* Create the common code receive queue. */
1031	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1032	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1033	    &rxq->common)) != 0)
1034		goto fail;
1035
1036	SFXGE_EVQ_LOCK(evq);
1037
1038	/* Enable the receive queue. */
1039	efx_rx_qenable(rxq->common);
1040
1041	rxq->init_state = SFXGE_RXQ_STARTED;
1042	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1043
1044	/* Try to fill the queue from the pool. */
1045	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1046
1047	SFXGE_EVQ_UNLOCK(evq);
1048
1049	return (0);
1050
1051fail:
1052	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1053	    EFX_RXQ_NBUFS(sc->rxq_entries));
1054	return (rc);
1055}
1056
1057void
1058sfxge_rx_stop(struct sfxge_softc *sc)
1059{
1060	int index;
1061
1062	efx_mac_filter_default_rxq_clear(sc->enp);
1063
1064	/* Stop the receive queue(s) */
1065	index = sc->rxq_count;
1066	while (--index >= 0)
1067		sfxge_rx_qstop(sc, index);
1068
1069	sc->rx_prefix_size = 0;
1070	sc->rx_buffer_size = 0;
1071
1072	efx_rx_fini(sc->enp);
1073}
1074
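/* Compute the receive buffer layout, configure RSS and start all RX queues */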
1075int
1076sfxge_rx_start(struct sfxge_softc *sc)
1077{
1078	struct sfxge_intr *intr;
1079	const efx_nic_cfg_t *encp;
1080	size_t hdrlen, align, reserved;
1081	int index;
1082	int rc;
1083
1084	intr = &sc->intr;
1085
1086	/* Initialize the common code receive module. */
1087	if ((rc = efx_rx_init(sc->enp)) != 0)
1088		return (rc);
1089
1090	encp = efx_nic_cfg_get(sc->enp);
1091	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1092
1093	/* Calculate the receive packet buffer size. */
1094	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1095
1096	/* Ensure IP headers are 32bit aligned */
1097	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1098	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1099
1100	sc->rx_buffer_size += sc->rx_buffer_align;
1101
1102	/* Align end of packet buffer for RX DMA end padding */
1103	align = MAX(1, encp->enc_rx_buf_align_end);
1104	EFSYS_ASSERT(ISP2(align));
1105	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1106
1107	/*
1108	 * Standard mbuf zones only guarantee pointer-size alignment;
1109	 * we need extra space to align to the cache line
1110	 */
1111	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1112
1113	/* Select zone for packet buffers */
1114	if (reserved <= MCLBYTES)
1115		sc->rx_cluster_size = MCLBYTES;
1116	else if (reserved <= MJUMPAGESIZE)
1117		sc->rx_cluster_size = MJUMPAGESIZE;
1118	else if (reserved <= MJUM9BYTES)
1119		sc->rx_cluster_size = MJUM9BYTES;
1120	else
1121		sc->rx_cluster_size = MJUM16BYTES;
1122
1123	/*
1124	 * Set up the scale table.  Enable all hash types and hash insertion.
1125	 */
1126	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1127		sc->rx_indir_table[index] = index % sc->rxq_count;
1128	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1129				       SFXGE_RX_SCALE_MAX)) != 0)
1130		goto fail;
1131	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1132	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1133	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1134
1135	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1136				       sizeof(toep_key))) != 0)
1137		goto fail;
1138
1139	/* Start the receive queue(s). */
1140	for (index = 0; index < sc->rxq_count; index++) {
1141		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1142			goto fail2;
1143	}
1144
1145	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1146					    sc->intr.n_alloc > 1);
1147	if (rc != 0)
1148		goto fail3;
1149
1150	return (0);
1151
1152fail3:
1153fail2:
1154	while (--index >= 0)
1155		sfxge_rx_qstop(sc, index);
1156
1157fail:
1158	efx_rx_fini(sc->enp);
1159
1160	return (rc);
1161}
1162
1163#ifdef SFXGE_LRO
1164
1165static void sfxge_lro_init(struct sfxge_rxq *rxq)
1166{
1167	struct sfxge_lro_state *st = &rxq->lro;
1168	unsigned i;
1169
1170	st->conns_mask = lro_table_size - 1;
1171	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1172		("lro_table_size must be a power of 2"));
1173	st->sc = rxq->sc;
1174	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1175			   M_SFXGE, M_WAITOK);
1176	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1177			     M_SFXGE, M_WAITOK);
1178	for (i = 0; i <= st->conns_mask; ++i) {
1179		TAILQ_INIT(&st->conns[i]);
1180		st->conns_n[i] = 0;
1181	}
1182	LIST_INIT(&st->active_conns);
1183	TAILQ_INIT(&st->free_conns);
1184}
1185
1186static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1187{
1188	struct sfxge_lro_state *st = &rxq->lro;
1189	struct sfxge_lro_conn *c;
1190	unsigned i;
1191
1192	/* Return cleanly if sfxge_lro_init() has not been called. */
1193	if (st->conns == NULL)
1194		return;
1195
1196	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1197
1198	for (i = 0; i <= st->conns_mask; ++i) {
1199		while (!TAILQ_EMPTY(&st->conns[i])) {
1200			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1201			sfxge_lro_drop(rxq, c);
1202		}
1203	}
1204
1205	while (!TAILQ_EMPTY(&st->free_conns)) {
1206		c = TAILQ_FIRST(&st->free_conns);
1207		TAILQ_REMOVE(&st->free_conns, c, link);
1208		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1209		free(c, M_SFXGE);
1210	}
1211
1212	free(st->conns_n, M_SFXGE);
1213	free(st->conns, M_SFXGE);
1214	st->conns = NULL;
1215}
1216
1217#else
1218
1219static void
1220sfxge_lro_init(struct sfxge_rxq *rxq)
1221{
1222}
1223
1224static void
1225sfxge_lro_fini(struct sfxge_rxq *rxq)
1226{
1227}
1228
1229#endif	/* SFXGE_LRO */
1230
1231static void
1232sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1233{
1234	struct sfxge_rxq *rxq;
1235
1236	rxq = sc->rxq[index];
1237
1238	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1239	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1240
1241	/* Free the context array and the flow table. */
1242	free(rxq->queue, M_SFXGE);
1243	sfxge_lro_fini(rxq);
1244
1245	/* Release DMA memory. */
1246	sfxge_dma_free(&rxq->mem);
1247
1248	sc->rxq[index] = NULL;
1249
1250	free(rxq, M_SFXGE);
1251}
1252
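/* Allocate and initialise the software state for one RX queue */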
1253static int
1254sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1255{
1256	struct sfxge_rxq *rxq;
1257	struct sfxge_evq *evq;
1258	efsys_mem_t *esmp;
1259	int rc;
1260
1261	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1262
1263	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1264	rxq->sc = sc;
1265	rxq->index = index;
1266	rxq->entries = sc->rxq_entries;
1267	rxq->ptr_mask = rxq->entries - 1;
1268	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1269
1270	sc->rxq[index] = rxq;
1271	esmp = &rxq->mem;
1272
1273	evq = sc->evq[index];
1274
1275	/* Allocate and zero DMA space. */
1276	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1277		return (rc);
1278
1279	/* Allocate buffer table entries. */
1280	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1281				 &rxq->buf_base_id);
1282
1283	/* Allocate the context array and the flow table. */
1284	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1285	    M_SFXGE, M_WAITOK | M_ZERO);
1286	sfxge_lro_init(rxq);
1287
1288	callout_init(&rxq->refill_callout, 1);
1289
1290	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1291
1292	return (0);
1293}
1294
1295static const struct {
1296	const char *name;
1297	size_t offset;
1298} sfxge_rx_stats[] = {
1299#define	SFXGE_RX_STAT(name, member) \
1300	{ #name, offsetof(struct sfxge_rxq, member) }
1301#ifdef SFXGE_LRO
1302	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1303	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1304	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1305	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1306	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1307	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1308	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1309	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1310#endif
1311};
1312
1313static int
1314sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1315{
1316	struct sfxge_softc *sc = arg1;
1317	unsigned int id = arg2;
1318	unsigned int sum, index;
1319
1320	/* Sum across all RX queues */
1321	sum = 0;
1322	for (index = 0; index < sc->rxq_count; index++)
1323		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1324					 sfxge_rx_stats[id].offset);
1325
1326	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1327}
1328
1329static void
1330sfxge_rx_stat_init(struct sfxge_softc *sc)
1331{
1332	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1333	struct sysctl_oid_list *stat_list;
1334	unsigned int id;
1335
1336	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1337
1338	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1339		SYSCTL_ADD_PROC(
1340			ctx, stat_list,
1341			OID_AUTO, sfxge_rx_stats[id].name,
1342			CTLTYPE_UINT|CTLFLAG_RD,
1343			sc, id, sfxge_rx_stat_handler, "IU",
1344			"");
1345	}
1346}
1347
1348void
1349sfxge_rx_fini(struct sfxge_softc *sc)
1350{
1351	int index;
1352
1353	index = sc->rxq_count;
1354	while (--index >= 0)
1355		sfxge_rx_qfini(sc, index);
1356
1357	sc->rxq_count = 0;
1358}
1359
1360int
1361sfxge_rx_init(struct sfxge_softc *sc)
1362{
1363	struct sfxge_intr *intr;
1364	int index;
1365	int rc;
1366
1367#ifdef SFXGE_LRO
1368	if (!ISP2(lro_table_size)) {
1369		log(LOG_ERR, "%s=%u must be a power of 2",
1370		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1371		rc = EINVAL;
1372		goto fail_lro_table_size;
1373	}
1374
1375	if (lro_idle_ticks == 0)
1376		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1377#endif
1378
1379	intr = &sc->intr;
1380
1381	sc->rxq_count = intr->n_alloc;
1382
1383	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1384	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1385
1386	/* Initialize the receive queue(s) - one per interrupt. */
1387	for (index = 0; index < sc->rxq_count; index++) {
1388		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1389			goto fail;
1390	}
1391
1392	sfxge_rx_stat_init(sc);
1393
1394	return (0);
1395
1396fail:
1397	/* Tear down the receive queue(s). */
1398	while (--index >= 0)
1399		sfxge_rx_qfini(sc, index);
1400
1401	sc->rxq_count = 0;
1402
1403#ifdef SFXGE_LRO
1404fail_lro_table_size:
1405#endif
1406	return (rc);
1407}
1408