sfxge_rx.c revision 350409
1/*-
2 * Copyright (c) 2010-2016 Solarflare Communications Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright notice,
12 *    this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * The views and conclusions contained in the software and documentation are
30 * those of the authors and should not be interpreted as representing official
31 * policies, either expressed or implied, of the FreeBSD Project.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/11/sys/dev/sfxge/sfxge_rx.c 350409 2019-07-29 10:41:21Z arybchik $");
36
37#include "opt_rss.h"
38
39#include <sys/param.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/smp.h>
43#include <sys/socket.h>
44#include <sys/sysctl.h>
45#include <sys/syslog.h>
46#include <sys/limits.h>
48
49#include <net/ethernet.h>
50#include <net/if.h>
51#include <net/if_vlan_var.h>
52
53#include <netinet/in.h>
54#include <netinet/ip.h>
55#include <netinet/ip6.h>
56#include <netinet/tcp.h>
57
58#include <machine/in_cksum.h>
59
60#ifdef RSS
61#include <net/rss_config.h>
62#endif
63
64#include "common/efx.h"
65
67#include "sfxge.h"
68#include "sfxge_rx.h"
69
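/*
 * Refill threshold: top up the RX queue once its fill level (descriptors
 * posted but not yet completed) drops below 90% of the queue limit; see
 * sfxge_rx_qcomplete().
 */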
70#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
71
72#ifdef SFXGE_LRO
73
74SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
75	    "Large receive offload (LRO) parameters");
76
77#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
78
79/* Size of the LRO hash table.  Must be a power of 2.  A larger table
80 * means we can accelerate a larger number of streams.
81 */
82static unsigned lro_table_size = 128;
83TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
84SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
85	    &lro_table_size, 0,
86	    "Size of the LRO hash table (must be a power of 2)");
87
88/* Maximum length of a hash chain.  If chains get too long then the lookup
89 * time increases and may exceed the benefit of LRO.
90 */
91static unsigned lro_chain_max = 20;
92TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
93SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
94	    &lro_chain_max, 0,
95	    "The maximum length of a hash chain");
96
97/* Maximum time (in ticks) that a connection can be idle before its LRO
98 * state is discarded.
99 */
100static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
101TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
102SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
103	    &lro_idle_ticks, 0,
104	    "The maximum time (in ticks) that a connection can be idle "
105	    "before its LRO state is discarded");
106
107/* Number of packets with payload that must arrive in-order before a
108 * connection is eligible for LRO.  The idea is we should avoid coalescing
109 * segments when the sender is in slow-start because reducing the ACK rate
110 * can damage performance.
111 */
112static int lro_slow_start_packets = 2000;
113TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
114SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
115	    &lro_slow_start_packets, 0,
116	    "Number of packets with payload that must arrive in-order before "
117	    "a connection is eligible for LRO");
118
119/* Number of packets with payload that must arrive in-order following loss
120 * before a connection is eligible for LRO.  The idea is we should avoid
121 * coalescing segments when the sender is recovering from loss, because
122 * reducing the ACK rate can damage performance.
123 */
124static int lro_loss_packets = 20;
125TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
126SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
127	    &lro_loss_packets, 0,
128	    "Number of packets with payload that must arrive in-order "
129	    "following loss before a connection is eligible for LRO");
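/*
 * The LRO parameters above are boot-time tunables (CTLFLAG_RDTUN).  As a
 * usage sketch -- assuming SFXGE_PARAM() expands to the "hw.sfxge." prefix
 * and using example values -- they could be set in /boot/loader.conf:
 *
 *   hw.sfxge.lro.table_size=256
 *   hw.sfxge.lro.chain_max=20
 *   hw.sfxge.lro.idle_ticks=200
 *
 * and read back at run time with "sysctl hw.sfxge.lro".
 */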
130
131/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
132#define	SFXGE_LRO_L2_ID_VLAN 0x4000
133#define	SFXGE_LRO_L2_ID_IPV6 0x8000
134#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
135#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
136
137/* Compare IPv6 addresses, avoiding conditional branches */
138static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
139				   const struct in6_addr *right)
140{
141#if LONG_BIT == 64
142	const uint64_t *left64 = (const uint64_t *)left;
143	const uint64_t *right64 = (const uint64_t *)right;
144	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
145#else
146	return (left->s6_addr32[0] - right->s6_addr32[0]) |
147	       (left->s6_addr32[1] - right->s6_addr32[1]) |
148	       (left->s6_addr32[2] - right->s6_addr32[2]) |
149	       (left->s6_addr32[3] - right->s6_addr32[3]);
150#endif
151}
152
153#endif	/* SFXGE_LRO */
154
155void
156sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
157{
158
159	rxq->flush_state = SFXGE_FLUSH_DONE;
160}
161
162void
163sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
164{
165
166	rxq->flush_state = SFXGE_FLUSH_FAILED;
167}
168
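/*
 * Toeplitz hash key used for receive-side scaling: taken from the kernel
 * RSS configuration when built with "options RSS", otherwise a fixed
 * 40-byte default key.  It is passed to efx_rx_scale_key_set() in
 * sfxge_rx_start().
 */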
169#ifdef RSS
170static uint8_t toep_key[RSS_KEYSIZE];
171#else
172static uint8_t toep_key[] = {
173	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
174	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
175	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
176	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
177	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
178};
179#endif
180
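/*
 * Callout handler armed by sfxge_rx_schedule_refill().  It does not refill
 * the queue directly; it posts a software event to the RX queue's event
 * queue so that the refill is performed from event processing context.
 */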
181static void
182sfxge_rx_post_refill(void *arg)
183{
184	struct sfxge_rxq *rxq = arg;
185	struct sfxge_softc *sc;
186	unsigned int index;
187	struct sfxge_evq *evq;
188	uint16_t magic;
189
190	sc = rxq->sc;
191	index = rxq->index;
192	evq = sc->evq[index];
193	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
194
195	/* This is guaranteed due to the start/stop order of rx and ev */
196	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
197	    ("evq not started"));
198	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
199	    ("rxq not started"));
200	efx_ev_qpost(evq->common, magic);
201}
202
203static void
204sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
205{
206	/* Initially retry after 100 ms, but back off in case of
207	 * repeated failures as we probably have to wait for the
208	 * administrator to raise the pool limit. */
209	if (retrying)
210		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
211	else
212		rxq->refill_delay = hz / 10;
213
214	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
215			     sfxge_rx_post_refill, rxq);
216}
217
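/* Maximum number of RX descriptors posted per efx_rx_qpost() call */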
218#define	SFXGE_REFILL_BATCH  64
219
220static void
221sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
222{
223	struct sfxge_softc *sc;
224	unsigned int index;
225	struct sfxge_evq *evq;
226	unsigned int batch;
227	unsigned int rxfill;
228	unsigned int mblksize;
229	int ntodo;
230	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
231
232	sc = rxq->sc;
233	index = rxq->index;
234	evq = sc->evq[index];
235
236	prefetch_read_many(sc->enp);
237	prefetch_read_many(rxq->common);
238
239	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
240
241	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
242		return;
243
244	rxfill = rxq->added - rxq->completed;
245	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
246	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
247	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
248	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
249	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
250
251	if (ntodo == 0)
252		return;
253
254	batch = 0;
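	/* Usable DMA length excludes the space reserved for buffer alignment */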
255	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
256	while (ntodo-- > 0) {
257		unsigned int id;
258		struct sfxge_rx_sw_desc *rx_desc;
259		bus_dma_segment_t seg;
260		struct mbuf *m;
261
262		id = (rxq->added + batch) & rxq->ptr_mask;
263		rx_desc = &rxq->queue[id];
264		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
265
266		rx_desc->flags = EFX_DISCARD;
267		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
268		    sc->rx_cluster_size);
269		if (m == NULL)
270			break;
271
272		/* m_len specifies length of area to be mapped for DMA */
273		m->m_len  = mblksize;
274		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
275						   CACHE_LINE_SIZE);
276		m->m_data += sc->rx_buffer_align;
277
278		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
279		addr[batch++] = seg.ds_addr;
280
281		if (batch == SFXGE_REFILL_BATCH) {
282			efx_rx_qpost(rxq->common, addr, mblksize, batch,
283			    rxq->completed, rxq->added);
284			rxq->added += batch;
285			batch = 0;
286		}
287	}
288
289	if (ntodo != 0)
290		sfxge_rx_schedule_refill(rxq, retrying);
291
292	if (batch != 0) {
293		efx_rx_qpost(rxq->common, addr, mblksize, batch,
294		    rxq->completed, rxq->added);
295		rxq->added += batch;
296	}
297
298	/* Make the descriptors visible to the hardware */
299	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
300			BUS_DMASYNC_PREWRITE);
301
302	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
303
304	/* The queue could still be empty if no descriptors were actually
305	 * pushed, in which case there will be no event to cause the next
306	 * refill, so we must schedule a refill ourselves.
307	 */
308	if (rxq->pushed == rxq->completed) {
309		sfxge_rx_schedule_refill(rxq, retrying);
310	}
311}
312
313void
314sfxge_rx_qrefill(struct sfxge_rxq *rxq)
315{
316
317	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
318		return;
319
320	/* Make sure the queue is full */
321	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
322}
323
324static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
325{
326	struct ifnet *ifp = sc->ifnet;
327
328	m->m_pkthdr.rcvif = ifp;
329	m->m_pkthdr.csum_data = 0xffff;
330	ifp->if_input(ifp, m);
331}
332
333static void
334sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
335{
336	struct sfxge_softc *sc = rxq->sc;
337	struct mbuf *m = rx_desc->mbuf;
338	int flags = rx_desc->flags;
339	int csum_flags;
340
341	/* Convert checksum flags */
342	csum_flags = (flags & EFX_CKSUM_IPV4) ?
343		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
344	if (flags & EFX_CKSUM_TCPUDP)
345		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
346
347	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
348		m->m_pkthdr.flowid =
349			efx_pseudo_hdr_hash_get(rxq->common,
350						EFX_RX_HASHALG_TOEPLITZ,
351						mtod(m, uint8_t *));
352		/* The hash covers a 4-tuple for TCP only */
353		M_HASHTYPE_SET(m,
354		    (flags & EFX_PKT_IPV4) ?
355			((flags & EFX_PKT_TCP) ?
356			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
357			((flags & EFX_PKT_TCP) ?
358			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
359	}
360	m->m_data += sc->rx_prefix_size;
361	m->m_len = rx_desc->size - sc->rx_prefix_size;
362	m->m_pkthdr.len = m->m_len;
363	m->m_pkthdr.csum_flags = csum_flags;
364	__sfxge_rx_deliver(sc, rx_desc->mbuf);
365
366	rx_desc->flags = EFX_DISCARD;
367	rx_desc->mbuf = NULL;
368}
369
370#ifdef SFXGE_LRO
371
372static void
373sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
374{
375	struct sfxge_softc *sc = st->sc;
376	struct mbuf *m = c->mbuf;
377	struct tcphdr *c_th;
378	int csum_flags;
379
380	KASSERT(m, ("no mbuf to deliver"));
381
382	++st->n_bursts;
383
384	/* Finish off packet munging and recalculate IP header checksum. */
385	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
386		struct ip *iph = c->nh;
387		iph->ip_len = htons(iph->ip_len);
388		iph->ip_sum = 0;
389		iph->ip_sum = in_cksum_hdr(iph);
390		c_th = (struct tcphdr *)(iph + 1);
391		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
392			      CSUM_IP_CHECKED | CSUM_IP_VALID);
393	} else {
394		struct ip6_hdr *iph = c->nh;
395		iph->ip6_plen = htons(iph->ip6_plen);
396		c_th = (struct tcphdr *)(iph + 1);
397		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
398	}
399
400	c_th->th_win = c->th_last->th_win;
401	c_th->th_ack = c->th_last->th_ack;
402	if (c_th->th_off == c->th_last->th_off) {
403		/* Copy TCP options (take care to avoid going negative). */
404		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
405		memcpy(c_th + 1, c->th_last + 1, optlen);
406	}
407
408	m->m_pkthdr.flowid = c->conn_hash;
409	M_HASHTYPE_SET(m,
410	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
411		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
412
413	m->m_pkthdr.csum_flags = csum_flags;
414	__sfxge_rx_deliver(sc, m);
415
416	c->mbuf = NULL;
417	c->delivered = 1;
418}
419
420/* Drop the given connection, and add it to the free list. */
421static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
422{
423	unsigned bucket;
424
425	KASSERT(!c->mbuf, ("found orphaned mbuf"));
426
427	if (c->next_buf.mbuf != NULL) {
428		sfxge_rx_deliver(rxq, &c->next_buf);
429		LIST_REMOVE(c, active_link);
430	}
431
432	bucket = c->conn_hash & rxq->lro.conns_mask;
433	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
434	--rxq->lro.conns_n[bucket];
435	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
436	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
437}
438
439/* Stop tracking connections that have gone idle in order to keep hash
440 * chains short.
441 */
442static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
443{
444	struct sfxge_lro_conn *c;
445	unsigned i;
446
447	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
448		("found active connections"));
449
450	rxq->lro.last_purge_ticks = now;
451	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
452		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
453			continue;
454
455		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
456		if (now - c->last_pkt_ticks > lro_idle_ticks) {
457			++rxq->lro.n_drop_idle;
458			sfxge_lro_drop(rxq, c);
459		}
460	}
461}
462
463static void
464sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
465		struct mbuf *mbuf, struct tcphdr *th)
466{
467	struct tcphdr *c_th;
468
469	/* Tack the new mbuf onto the chain. */
470	KASSERT(!mbuf->m_next, ("mbuf already chained"));
471	c->mbuf_tail->m_next = mbuf;
472	c->mbuf_tail = mbuf;
473
474	/* Increase length appropriately */
475	c->mbuf->m_pkthdr.len += mbuf->m_len;
476
477	/* Update the connection state flags */
478	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
479		struct ip *iph = c->nh;
480		iph->ip_len += mbuf->m_len;
481		c_th = (struct tcphdr *)(iph + 1);
482	} else {
483		struct ip6_hdr *iph = c->nh;
484		iph->ip6_plen += mbuf->m_len;
485		c_th = (struct tcphdr *)(iph + 1);
486	}
487	c_th->th_flags |= (th->th_flags & TH_PUSH);
488	c->th_last = th;
489	++st->n_merges;
490
491	/* Pass packet up now if another segment could overflow the IP
492	 * length.
493	 */
494	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
495		sfxge_lro_deliver(st, c);
496}
497
498static void
499sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
500		struct mbuf *mbuf, void *nh, struct tcphdr *th)
501{
502	/* Start the chain */
503	c->mbuf = mbuf;
504	c->mbuf_tail = c->mbuf;
505	c->nh = nh;
506	c->th_last = th;
507
508	mbuf->m_pkthdr.len = mbuf->m_len;
509
510	/* Mangle header fields for later processing */
511	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
512		struct ip *iph = nh;
513		iph->ip_len = ntohs(iph->ip_len);
514	} else {
515		struct ip6_hdr *iph = nh;
516		iph->ip6_plen = ntohs(iph->ip6_plen);
517	}
518}
519
520/* Try to merge or otherwise hold or deliver (as appropriate) the
521 * packet buffered for this connection (c->next_buf).  Return a flag
522 * indicating whether the connection is still active for LRO purposes.
523 */
524static int
525sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
526{
527	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
528	char *eh = c->next_eh;
529	int data_length, hdr_length, dont_merge;
530	unsigned th_seq, pkt_length;
531	struct tcphdr *th;
532	unsigned now;
533
534	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
535		struct ip *iph = c->next_nh;
536		th = (struct tcphdr *)(iph + 1);
537		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
538	} else {
539		struct ip6_hdr *iph = c->next_nh;
540		th = (struct tcphdr *)(iph + 1);
541		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
542	}
543
544	hdr_length = (char *) th + th->th_off * 4 - eh;
545	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
546		       hdr_length);
547	th_seq = ntohl(th->th_seq);
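	/* Bitwise OR avoids conditional branches: do not merge segments with
	 * no payload or with URG/SYN/RST/FIN set.
	 */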
548	dont_merge = ((data_length <= 0)
549		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
550
551	/* Check for options other than aligned timestamp. */
552	if (th->th_off != 5) {
553		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
554		if (th->th_off == 8 &&
555		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
556					(TCPOPT_NOP << 16) |
557					(TCPOPT_TIMESTAMP << 8) |
558					TCPOLEN_TIMESTAMP)) {
559			/* timestamp option -- okay */
560		} else {
561			dont_merge = 1;
562		}
563	}
564
565	if (__predict_false(th_seq != c->next_seq)) {
566		/* Out-of-order, so start counting again. */
567		if (c->mbuf != NULL)
568			sfxge_lro_deliver(&rxq->lro, c);
569		c->n_in_order_pkts -= lro_loss_packets;
570		c->next_seq = th_seq + data_length;
571		++rxq->lro.n_misorder;
572		goto deliver_buf_out;
573	}
574	c->next_seq = th_seq + data_length;
575
576	now = ticks;
577	if (now - c->last_pkt_ticks > lro_idle_ticks) {
578		++rxq->lro.n_drop_idle;
579		if (c->mbuf != NULL)
580			sfxge_lro_deliver(&rxq->lro, c);
581		sfxge_lro_drop(rxq, c);
582		return (0);
583	}
584	c->last_pkt_ticks = ticks;
585
586	if (c->n_in_order_pkts < lro_slow_start_packets) {
587		/* May be in slow-start, so don't merge. */
588		++rxq->lro.n_slow_start;
589		++c->n_in_order_pkts;
590		goto deliver_buf_out;
591	}
592
593	if (__predict_false(dont_merge)) {
594		if (c->mbuf != NULL)
595			sfxge_lro_deliver(&rxq->lro, c);
596		if (th->th_flags & (TH_FIN | TH_RST)) {
597			++rxq->lro.n_drop_closed;
598			sfxge_lro_drop(rxq, c);
599			return (0);
600		}
601		goto deliver_buf_out;
602	}
603
604	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
605
606	if (__predict_true(c->mbuf != NULL)) {
607		/* Remove headers and any padding */
608		rx_buf->mbuf->m_data += hdr_length;
609		rx_buf->mbuf->m_len = data_length;
610
611		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
612	} else {
613		/* Remove any padding */
614		rx_buf->mbuf->m_len = pkt_length;
615
616		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
617	}
618
619	rx_buf->mbuf = NULL;
620	return (1);
621
622 deliver_buf_out:
623	sfxge_rx_deliver(rxq, rx_buf);
624	return (1);
625}
626
627static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
628			       uint16_t l2_id, void *nh, struct tcphdr *th)
629{
630	unsigned bucket = conn_hash & st->conns_mask;
631	struct sfxge_lro_conn *c;
632
633	if (st->conns_n[bucket] >= lro_chain_max) {
634		++st->n_too_many;
635		return;
636	}
637
638	if (!TAILQ_EMPTY(&st->free_conns)) {
639		c = TAILQ_FIRST(&st->free_conns);
640		TAILQ_REMOVE(&st->free_conns, c, link);
641	} else {
642		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
643		if (c == NULL)
644			return;
645		c->mbuf = NULL;
646		c->next_buf.mbuf = NULL;
647	}
648
649	/* Create the connection tracking data */
650	++st->conns_n[bucket];
651	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
652	c->l2_id = l2_id;
653	c->conn_hash = conn_hash;
654	c->source = th->th_sport;
655	c->dest = th->th_dport;
656	c->n_in_order_pkts = 0;
657	c->last_pkt_ticks = *(volatile int *)&ticks;
658	c->delivered = 0;
659	++st->n_new_stream;
660	/* NB. We don't initialise c->next_seq, and it doesn't matter what
661	 * value it has.  Most likely the next packet received for this
662	 * connection will not match -- no harm done.
663	 */
664}
665
666/* Process mbuf and decide whether to dispatch it to the stack now or
667 * later.
668 */
669static void
670sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
671{
672	struct sfxge_softc *sc = rxq->sc;
673	struct mbuf *m = rx_buf->mbuf;
674	struct ether_header *eh;
675	struct sfxge_lro_conn *c;
676	uint16_t l2_id;
677	uint16_t l3_proto;
678	void *nh;
679	struct tcphdr *th;
680	uint32_t conn_hash;
681	unsigned bucket;
682
683	/* Get the hardware hash */
684	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
685					    EFX_RX_HASHALG_TOEPLITZ,
686					    mtod(m, uint8_t *));
687
688	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
689	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
690		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
691		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
692			SFXGE_LRO_L2_ID_VLAN;
693		l3_proto = veh->evl_proto;
694		nh = veh + 1;
695	} else {
696		l2_id = 0;
697		l3_proto = eh->ether_type;
698		nh = eh + 1;
699	}
700
701	/* Check whether this is a suitable packet (unfragmented
702	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
703	 * length, and compute a hash if necessary.  If not, return.
704	 */
705	if (l3_proto == htons(ETHERTYPE_IP)) {
706		struct ip *iph = nh;
707
708		KASSERT(iph->ip_p == IPPROTO_TCP,
709		    ("IPv4 protocol is not TCP, but packet marker is set"));
710		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
711		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
712			goto deliver_now;
713		th = (struct tcphdr *)(iph + 1);
714	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
715		struct ip6_hdr *iph = nh;
716
717		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
718		    ("IPv6 next header is not TCP, but packet marker is set"));
719		l2_id |= SFXGE_LRO_L2_ID_IPV6;
720		th = (struct tcphdr *)(iph + 1);
721	} else {
722		goto deliver_now;
723	}
724
725	bucket = conn_hash & rxq->lro.conns_mask;
726
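	/*
	 * Look up the connection in this hash bucket.  Comparisons use
	 * subtraction and bitwise OR so that a mismatch is detected without
	 * conditional branches.
	 */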
727	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
728		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
729			continue;
730		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
731			continue;
732		if (c->mbuf != NULL) {
733			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
734				struct ip *c_iph, *iph = nh;
735				c_iph = c->nh;
736				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
737				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
738					continue;
739			} else {
740				struct ip6_hdr *c_iph, *iph = nh;
741				c_iph = c->nh;
742				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
743				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
744					continue;
745			}
746		}
747
748		/* Re-insert at head of list to reduce lookup time. */
749		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
750		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
751
752		if (c->next_buf.mbuf != NULL) {
753			if (!sfxge_lro_try_merge(rxq, c))
754				goto deliver_now;
755		} else {
756			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
757			    active_link);
758		}
759		c->next_buf = *rx_buf;
760		c->next_eh = eh;
761		c->next_nh = nh;
762
763		rx_buf->mbuf = NULL;
764		rx_buf->flags = EFX_DISCARD;
765		return;
766	}
767
768	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
769 deliver_now:
770	sfxge_rx_deliver(rxq, rx_buf);
771}
772
773static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
774{
775	struct sfxge_lro_state *st = &rxq->lro;
776	struct sfxge_lro_conn *c;
777	unsigned t;
778
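	/*
	 * Flush state for every connection that received data in this event
	 * burst: deliver anything already coalesced, then try to merge or
	 * deliver the held buffer.  Idle connections are purged at most once
	 * per tick.
	 */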
779	while (!LIST_EMPTY(&st->active_conns)) {
780		c = LIST_FIRST(&st->active_conns);
781		if (!c->delivered && c->mbuf != NULL)
782			sfxge_lro_deliver(st, c);
783		if (sfxge_lro_try_merge(rxq, c)) {
784			if (c->mbuf != NULL)
785				sfxge_lro_deliver(st, c);
786			LIST_REMOVE(c, active_link);
787		}
788		c->delivered = 0;
789	}
790
791	t = *(volatile int *)&ticks;
792	if (__predict_false(t != st->last_purge_ticks))
793		sfxge_lro_purge_idle(rxq, t);
794}
795
796#else	/* !SFXGE_LRO */
797
798static void
799sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
800{
801}
802
803static void
804sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
805{
806}
807
808#endif	/* SFXGE_LRO */
809
810void
811sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
812{
813	struct sfxge_softc *sc = rxq->sc;
814	int if_capenable = sc->ifnet->if_capenable;
815	int lro_enabled = if_capenable & IFCAP_LRO;
816	unsigned int index;
817	struct sfxge_evq *evq;
818	unsigned int completed;
819	unsigned int level;
820	struct mbuf *m;
821	struct sfxge_rx_sw_desc *prev = NULL;
822
823	index = rxq->index;
824	evq = sc->evq[index];
825
826	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
827
828	completed = rxq->completed;
829	while (completed != rxq->pending) {
830		unsigned int id;
831		struct sfxge_rx_sw_desc *rx_desc;
832
833		id = completed++ & rxq->ptr_mask;
834		rx_desc = &rxq->queue[id];
835		m = rx_desc->mbuf;
836
837		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
838			goto discard;
839
840		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
841			goto discard;
842
843		/* Read the length from the pseudo header if required */
844		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
845			uint16_t tmp_size;
846			int rc;
847			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
848							   mtod(m, uint8_t *),
849							   &tmp_size);
850			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
851			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
852		}
853
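		/*
		 * Prefetch the packet data now; the packet is handed to LRO
		 * or the stack on the next loop iteration (or after the
		 * loop), by which time the prefetch should have completed.
		 */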
854		prefetch_read_many(mtod(m, caddr_t));
855
856		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
857		case EFX_PKT_IPV4:
858			if (~if_capenable & IFCAP_RXCSUM)
859				rx_desc->flags &=
860				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
861			break;
862		case EFX_PKT_IPV6:
863			if (~if_capenable & IFCAP_RXCSUM_IPV6)
864				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
865			break;
866		case 0:
867			/* Check for loopback packets */
868			{
869				struct ether_header *etherhp;
870
871				/*LINTED*/
872				etherhp = mtod(m, struct ether_header *);
873
874				if (etherhp->ether_type ==
875				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
876					EFSYS_PROBE(loopback);
877
878					rxq->loopback++;
879					goto discard;
880				}
881			}
882			break;
883		default:
884			KASSERT(B_FALSE,
885			    ("Rx descriptor with both IPv4 and IPv6 flags"));
886			goto discard;
887		}
888
889		/* Pass packet up the stack or into LRO (pipelined) */
890		if (prev != NULL) {
891			if (lro_enabled &&
892			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
893			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
894				sfxge_lro(rxq, prev);
895			else
896				sfxge_rx_deliver(rxq, prev);
897		}
898		prev = rx_desc;
899		continue;
900
901discard:
902		/* Return the packet to the pool */
903		m_free(m);
904		rx_desc->mbuf = NULL;
905	}
906	rxq->completed = completed;
907
908	level = rxq->added - rxq->completed;
909
910	/* Pass last packet up the stack or into LRO */
911	if (prev != NULL) {
912		if (lro_enabled &&
913		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
914		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
915			sfxge_lro(rxq, prev);
916		else
917			sfxge_rx_deliver(rxq, prev);
918	}
919
920	/*
921	 * If there are any pending flows and this is the end of the
922	 * poll then they must be completed.
923	 */
924	if (eop)
925		sfxge_lro_end_of_burst(rxq);
926
927	/* Top up the queue if necessary */
928	if (level < rxq->refill_threshold)
929		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
930}
931
932static void
933sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
934{
935	struct sfxge_rxq *rxq;
936	struct sfxge_evq *evq;
937	unsigned int count;
938	unsigned int retry = 3;
939
940	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
941
942	rxq = sc->rxq[index];
943	evq = sc->evq[index];
944
945	SFXGE_EVQ_LOCK(evq);
946
947	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
948	    ("rxq not started"));
949
950	rxq->init_state = SFXGE_RXQ_INITIALIZED;
951
952	callout_stop(&rxq->refill_callout);
953
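	/*
	 * Attempt to flush the queue, retrying up to three times.  Each
	 * attempt polls for up to 2 seconds (20 x 100 ms) for the flush to
	 * complete or fail.
	 */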
954	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
955		rxq->flush_state = SFXGE_FLUSH_PENDING;
956
957		SFXGE_EVQ_UNLOCK(evq);
958
959		/* Flush the receive queue */
960		if (efx_rx_qflush(rxq->common) != 0) {
961			SFXGE_EVQ_LOCK(evq);
962			rxq->flush_state = SFXGE_FLUSH_FAILED;
963			break;
964		}
965
966		count = 0;
967		do {
968			/* Spin for 100 ms */
969			DELAY(100000);
970
971			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
972				break;
973
974		} while (++count < 20);
975
976		SFXGE_EVQ_LOCK(evq);
977
978		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
979			/* Flush timeout - neither done nor failed */
980			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
981			    device_get_nameunit(sc->dev), index);
982			rxq->flush_state = SFXGE_FLUSH_DONE;
983		}
984		retry--;
985	}
986	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
987		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
988		    device_get_nameunit(sc->dev), index);
989		rxq->flush_state = SFXGE_FLUSH_DONE;
990	}
991
992	rxq->pending = rxq->added;
993	sfxge_rx_qcomplete(rxq, B_TRUE);
994
995	KASSERT(rxq->completed == rxq->pending,
996	    ("rxq->completed != rxq->pending"));
997
998	rxq->added = 0;
999	rxq->pushed = 0;
1000	rxq->pending = 0;
1001	rxq->completed = 0;
1002	rxq->loopback = 0;
1003
1004	/* Destroy the common code receive queue. */
1005	efx_rx_qdestroy(rxq->common);
1006
1007	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1008	    EFX_RXQ_NBUFS(sc->rxq_entries));
1009
1010	SFXGE_EVQ_UNLOCK(evq);
1011}
1012
1013static int
1014sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1015{
1016	struct sfxge_rxq *rxq;
1017	efsys_mem_t *esmp;
1018	struct sfxge_evq *evq;
1019	int rc;
1020
1021	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1022
1023	rxq = sc->rxq[index];
1024	esmp = &rxq->mem;
1025	evq = sc->evq[index];
1026
1027	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1028	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1029	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1030	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1031
1032	/* Program the buffer table. */
1033	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1034	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1035		return (rc);
1036
1037	/* Create the common code receive queue. */
1038	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1039	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1040	    &rxq->common)) != 0)
1041		goto fail;
1042
1043	SFXGE_EVQ_LOCK(evq);
1044
1045	/* Enable the receive queue. */
1046	efx_rx_qenable(rxq->common);
1047
1048	rxq->init_state = SFXGE_RXQ_STARTED;
1049	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1050
1051	/* Try to fill the queue from the pool. */
1052	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1053
1054	SFXGE_EVQ_UNLOCK(evq);
1055
1056	return (0);
1057
1058fail:
1059	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1060	    EFX_RXQ_NBUFS(sc->rxq_entries));
1061	return (rc);
1062}
1063
1064void
1065sfxge_rx_stop(struct sfxge_softc *sc)
1066{
1067	int index;
1068
1069	efx_mac_filter_default_rxq_clear(sc->enp);
1070
1071	/* Stop the receive queue(s) */
1072	index = sc->rxq_count;
1073	while (--index >= 0)
1074		sfxge_rx_qstop(sc, index);
1075
1076	sc->rx_prefix_size = 0;
1077	sc->rx_buffer_size = 0;
1078
1079	efx_rx_fini(sc->enp);
1080}
1081
1082int
1083sfxge_rx_start(struct sfxge_softc *sc)
1084{
1085	struct sfxge_intr *intr;
1086	const efx_nic_cfg_t *encp;
1087	size_t hdrlen, align, reserved;
1088	int index;
1089	int rc;
1090
1091	intr = &sc->intr;
1092
1093	/* Initialize the common code receive module. */
1094	if ((rc = efx_rx_init(sc->enp)) != 0)
1095		return (rc);
1096
1097	encp = efx_nic_cfg_get(sc->enp);
1098	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1099
1100	/* Calculate the receive packet buffer size. */
1101	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1102
1103	/* Ensure IP headers are 32-bit aligned */
1104	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1105	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1106
1107	sc->rx_buffer_size += sc->rx_buffer_align;
1108
1109	/* Align end of packet buffer for RX DMA end padding */
1110	align = MAX(1, encp->enc_rx_buf_align_end);
1111	EFSYS_ASSERT(ISP2(align));
1112	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1113
1114	/*
1115	 * Standard mbuf zones only guarantee pointer-size alignment;
1116	 * we need extra space to align to the cache line
1117	 */
1118	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1119
1120	/* Select zone for packet buffers */
1121	if (reserved <= MCLBYTES)
1122		sc->rx_cluster_size = MCLBYTES;
1123	else if (reserved <= MJUMPAGESIZE)
1124		sc->rx_cluster_size = MJUMPAGESIZE;
1125	else if (reserved <= MJUM9BYTES)
1126		sc->rx_cluster_size = MJUM9BYTES;
1127	else
1128		sc->rx_cluster_size = MJUM16BYTES;
1129
1130	/*
1131	 * Set up the scale table.  Enable all hash types and hash insertion.
1132	 */
1133	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1134#ifdef RSS
1135		sc->rx_indir_table[index] =
1136			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1137#else
1138		sc->rx_indir_table[index] = index % sc->rxq_count;
1139#endif
1140	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1141				       nitems(sc->rx_indir_table))) != 0)
1142		goto fail;
1143	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1144	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1145	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1146
1147#ifdef RSS
1148	rss_getkey(toep_key);
1149#endif
1150	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1151				       sizeof(toep_key))) != 0)
1152		goto fail;
1153
1154	/* Start the receive queue(s). */
1155	for (index = 0; index < sc->rxq_count; index++) {
1156		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1157			goto fail2;
1158	}
1159
1160	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1161					    sc->intr.n_alloc > 1);
1162	if (rc != 0)
1163		goto fail3;
1164
1165	return (0);
1166
1167fail3:
1168fail2:
1169	while (--index >= 0)
1170		sfxge_rx_qstop(sc, index);
1171
1172fail:
1173	efx_rx_fini(sc->enp);
1174
1175	return (rc);
1176}
1177
1178#ifdef SFXGE_LRO
1179
1180static void sfxge_lro_init(struct sfxge_rxq *rxq)
1181{
1182	struct sfxge_lro_state *st = &rxq->lro;
1183	unsigned i;
1184
1185	st->conns_mask = lro_table_size - 1;
1186	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1187		("lro_table_size must be a power of 2"));
1188	st->sc = rxq->sc;
1189	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1190			   M_SFXGE, M_WAITOK);
1191	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1192			     M_SFXGE, M_WAITOK);
1193	for (i = 0; i <= st->conns_mask; ++i) {
1194		TAILQ_INIT(&st->conns[i]);
1195		st->conns_n[i] = 0;
1196	}
1197	LIST_INIT(&st->active_conns);
1198	TAILQ_INIT(&st->free_conns);
1199}
1200
1201static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1202{
1203	struct sfxge_lro_state *st = &rxq->lro;
1204	struct sfxge_lro_conn *c;
1205	unsigned i;
1206
1207	/* Return cleanly if sfxge_lro_init() has not been called. */
1208	if (st->conns == NULL)
1209		return;
1210
1211	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1212
1213	for (i = 0; i <= st->conns_mask; ++i) {
1214		while (!TAILQ_EMPTY(&st->conns[i])) {
1215			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1216			sfxge_lro_drop(rxq, c);
1217		}
1218	}
1219
1220	while (!TAILQ_EMPTY(&st->free_conns)) {
1221		c = TAILQ_FIRST(&st->free_conns);
1222		TAILQ_REMOVE(&st->free_conns, c, link);
1223		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1224		free(c, M_SFXGE);
1225	}
1226
1227	free(st->conns_n, M_SFXGE);
1228	free(st->conns, M_SFXGE);
1229	st->conns = NULL;
1230}
1231
1232#else
1233
1234static void
1235sfxge_lro_init(struct sfxge_rxq *rxq)
1236{
1237}
1238
1239static void
1240sfxge_lro_fini(struct sfxge_rxq *rxq)
1241{
1242}
1243
1244#endif	/* SFXGE_LRO */
1245
1246static void
1247sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1248{
1249	struct sfxge_rxq *rxq;
1250
1251	rxq = sc->rxq[index];
1252
1253	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1254	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1255
1256	/* Free the context array and the flow table. */
1257	free(rxq->queue, M_SFXGE);
1258	sfxge_lro_fini(rxq);
1259
1260	/* Release DMA memory. */
1261	sfxge_dma_free(&rxq->mem);
1262
1263	sc->rxq[index] = NULL;
1264
1265	free(rxq, M_SFXGE);
1266}
1267
1268static int
1269sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1270{
1271	struct sfxge_rxq *rxq;
1272	struct sfxge_evq *evq;
1273	efsys_mem_t *esmp;
1274	int rc;
1275
1276	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1277
1278	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1279	rxq->sc = sc;
1280	rxq->index = index;
1281	rxq->entries = sc->rxq_entries;
1282	rxq->ptr_mask = rxq->entries - 1;
1283	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1284
1285	sc->rxq[index] = rxq;
1286	esmp = &rxq->mem;
1287
1288	evq = sc->evq[index];
1289
1290	/* Allocate and zero DMA space. */
1291	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1292		return (rc);
1293
1294	/* Allocate buffer table entries. */
1295	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1296				 &rxq->buf_base_id);
1297
1298	/* Allocate the context array and the flow table. */
1299	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1300	    M_SFXGE, M_WAITOK | M_ZERO);
1301	sfxge_lro_init(rxq);
1302
1303	callout_init(&rxq->refill_callout, 1);
1304
1305	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1306
1307	return (0);
1308}
1309
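/*
 * Per-RX-queue statistics exported via sysctl under the adapter's stats
 * node; sfxge_rx_stat_handler() sums each counter over all RX queues.
 */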
1310static const struct {
1311	const char *name;
1312	size_t offset;
1313} sfxge_rx_stats[] = {
1314#define	SFXGE_RX_STAT(name, member) \
1315	{ #name, offsetof(struct sfxge_rxq, member) }
1316#ifdef SFXGE_LRO
1317	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1318	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1319	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1320	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1321	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1322	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1323	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1324	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1325#endif
1326};
1327
1328static int
1329sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1330{
1331	struct sfxge_softc *sc = arg1;
1332	unsigned int id = arg2;
1333	unsigned int sum, index;
1334
1335	/* Sum across all RX queues */
1336	sum = 0;
1337	for (index = 0; index < sc->rxq_count; index++)
1338		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1339					 sfxge_rx_stats[id].offset);
1340
1341	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1342}
1343
1344static void
1345sfxge_rx_stat_init(struct sfxge_softc *sc)
1346{
1347	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1348	struct sysctl_oid_list *stat_list;
1349	unsigned int id;
1350
1351	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1352
1353	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1354		SYSCTL_ADD_PROC(
1355			ctx, stat_list,
1356			OID_AUTO, sfxge_rx_stats[id].name,
1357			CTLTYPE_UINT|CTLFLAG_RD,
1358			sc, id, sfxge_rx_stat_handler, "IU",
1359			"");
1360	}
1361}
1362
1363void
1364sfxge_rx_fini(struct sfxge_softc *sc)
1365{
1366	int index;
1367
1368	index = sc->rxq_count;
1369	while (--index >= 0)
1370		sfxge_rx_qfini(sc, index);
1371
1372	sc->rxq_count = 0;
1373}
1374
1375int
1376sfxge_rx_init(struct sfxge_softc *sc)
1377{
1378	struct sfxge_intr *intr;
1379	int index;
1380	int rc;
1381
1382#ifdef SFXGE_LRO
1383	if (!ISP2(lro_table_size)) {
1384		log(LOG_ERR, "%s=%u must be a power of 2",
1385		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1386		rc = EINVAL;
1387		goto fail_lro_table_size;
1388	}
1389
1390	if (lro_idle_ticks == 0)
1391		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1392#endif
1393
1394	intr = &sc->intr;
1395
1396	sc->rxq_count = intr->n_alloc;
1397
1398	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1399	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1400
1401	/* Initialize the receive queue(s) - one per interrupt. */
1402	for (index = 0; index < sc->rxq_count; index++) {
1403		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1404			goto fail;
1405	}
1406
1407	sfxge_rx_stat_init(sc);
1408
1409	return (0);
1410
1411fail:
1412	/* Tear down the receive queue(s). */
1413	while (--index >= 0)
1414		sfxge_rx_qfini(sc, index);
1415
1416	sc->rxq_count = 0;
1417
1418#ifdef SFXGE_LRO
1419fail_lro_table_size:
1420#endif
1421	return (rc);
1422}
1423