sfxge_rx.c revision 301493
1/*-
2 * Copyright (c) 2010-2016 Solarflare Communications Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright notice,
12 *    this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * The views and conclusions contained in the software and documentation are
30 * those of the authors and should not be interpreted as representing official
31 * policies, either expressed or implied, of the FreeBSD Project.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 301493 2016-06-06 09:07:26Z arybchik $");
36
37#include "opt_rss.h"
38
39#include <sys/param.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/smp.h>
43#include <sys/socket.h>
44#include <sys/sysctl.h>
45#include <sys/syslog.h>
46#include <sys/limits.h>
48
49#include <net/ethernet.h>
50#include <net/if.h>
51#include <net/if_vlan_var.h>
52
53#include <netinet/in.h>
54#include <netinet/ip.h>
55#include <netinet/ip6.h>
56#include <netinet/tcp.h>
57
58#include <machine/in_cksum.h>
59
60#ifdef RSS
61#include <net/rss_config.h>
62#endif
63
64#include "common/efx.h"
65
66
67#include "sfxge.h"
68#include "sfxge_rx.h"
69
70#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
71
72#ifdef SFXGE_LRO
73
74SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
75	    "Large receive offload (LRO) parameters");
76
77#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
78
79/* Size of the LRO hash table.  Must be a power of 2.  A larger table
80 * means we can accelerate a larger number of streams.
81 */
82static unsigned lro_table_size = 128;
83TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
84SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
85	    &lro_table_size, 0,
86	    "Size of the LRO hash table (must be a power of 2)");
87
88/* Maximum length of a hash chain.  If chains get too long then the lookup
89 * time increases and may exceed the benefit of LRO.
90 */
91static unsigned lro_chain_max = 20;
92TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
93SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
94	    &lro_chain_max, 0,
95	    "The maximum length of a hash chain");
96
97/* Maximum time (in ticks) that a connection can be idle before its LRO
98 * state is discarded.
99 */
100static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
101TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
102SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
103	    &lro_idle_ticks, 0,
104	    "The maximum time (in ticks) that a connection can be idle "
105	    "before its LRO state is discarded");
106
107/* Number of packets with payload that must arrive in-order before a
108 * connection is eligible for LRO.  The idea is we should avoid coalescing
109 * segments when the sender is in slow-start because reducing the ACK rate
110 * can damage performance.
111 */
112static int lro_slow_start_packets = 2000;
113TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
114SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
115	    &lro_slow_start_packets, 0,
116	    "Number of packets with payload that must arrive in-order before "
117	    "a connection is eligible for LRO");
118
119/* Number of packets with payload that must arrive in-order following loss
120 * before a connection is eligible for LRO.  The idea is we should avoid
121 * coalescing segments when the sender is recovering from loss, because
122 * reducing the ACK rate can damage performance.
123 */
124static int lro_loss_packets = 20;
125TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
126SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
127	    &lro_loss_packets, 0,
128	    "Number of packets with payload that must arrive in-order "
129	    "following loss before a connection is eligible for LRO");
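
/*
 * Example loader.conf(5) settings for the tunables above (illustrative
 * only; the names assume the usual "hw.sfxge." prefix added by
 * SFXGE_PARAM()):
 *
 *	hw.sfxge.lro.table_size=256
 *	hw.sfxge.lro.chain_max=20
 *	hw.sfxge.lro.idle_ticks=200
 *	hw.sfxge.lro.slow_start_packets=2000
 *	hw.sfxge.lro.loss_packets=20
 */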
130
131/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
132#define	SFXGE_LRO_L2_ID_VLAN 0x4000
133#define	SFXGE_LRO_L2_ID_IPV6 0x8000
134#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
135#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
136
137/* Compare IPv6 addresses, avoiding conditional branches */
138static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
139				   const struct in6_addr *right)
140{
141#if LONG_BIT == 64
142	const uint64_t *left64 = (const uint64_t *)left;
143	const uint64_t *right64 = (const uint64_t *)right;
144	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
145#else
146	return (left->s6_addr32[0] - right->s6_addr32[0]) |
147	       (left->s6_addr32[1] - right->s6_addr32[1]) |
148	       (left->s6_addr32[2] - right->s6_addr32[2]) |
149	       (left->s6_addr32[3] - right->s6_addr32[3]);
150#endif
151}
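
/*
 * Note that ipv6_addr_cmp() is an equality test rather than an ordering:
 * it returns zero if and only if the two addresses are identical.  Callers
 * OR several such differences together (see sfxge_lro()) so that a whole
 * connection match can be evaluated without conditional branches.
 */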
152
153#endif	/* SFXGE_LRO */
154
155void
156sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
157{
158
159	rxq->flush_state = SFXGE_FLUSH_DONE;
160}
161
162void
163sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
164{
165
166	rxq->flush_state = SFXGE_FLUSH_FAILED;
167}
168
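/*
 * Toeplitz hash key used for receive-side scaling.  When the kernel is
 * built with RSS, the key is fetched from the kernel's RSS configuration
 * in sfxge_rx_start(); otherwise the static 40-byte key below (the widely
 * used Microsoft sample RSS key) is programmed into the controller.
 */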
169#ifdef RSS
170static uint8_t toep_key[RSS_KEYSIZE];
171#else
172static uint8_t toep_key[] = {
173	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
174	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
175	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
176	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
177	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
178};
179#endif
180
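/*
 * Callout handler used when an RX queue could not be completely refilled.
 * Rather than touching the queue from callout context, it posts a software
 * "refill" event to the queue's event queue, so that the refill itself is
 * performed from the event-processing path, which already holds the EVQ
 * lock required by sfxge_rx_qfill().
 */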
181static void
182sfxge_rx_post_refill(void *arg)
183{
184	struct sfxge_rxq *rxq = arg;
185	struct sfxge_softc *sc;
186	unsigned int index;
187	struct sfxge_evq *evq;
188	uint16_t magic;
189
190	sc = rxq->sc;
191	index = rxq->index;
192	evq = sc->evq[index];
193	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
194
195	/* This is guaranteed due to the start/stop order of rx and ev */
196	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
197	    ("evq not started"));
198	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
199	    ("rxq not started"));
200	efx_ev_qpost(evq->common, magic);
201}
202
203static void
204sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
205{
206	/* Initially retry after 100 ms, but back off in case of
207	 * repeated failures as we probably have to wait for the
208	 * administrator to raise the pool limit. */
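	/*
	 * Illustrative schedule: 100 ms on the first attempt, then 200 ms,
	 * 400 ms, ..., doubling on each retry and capped at 10 * hz ticks
	 * (ten seconds).
	 */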
209	if (retrying)
210		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
211	else
212		rxq->refill_delay = hz / 10;
213
214	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
215			     sfxge_rx_post_refill, rxq);
216}
217
218#define	SFXGE_REFILL_BATCH  64
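
/*
 * Mbuf DMA addresses are accumulated in a small on-stack array and posted
 * to the hardware with one efx_rx_qpost() call per SFXGE_REFILL_BATCH
 * descriptors (plus one more for any remainder), so the cost of a refill
 * is amortised over many buffers.
 */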
219
220static void
221sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
222{
223	struct sfxge_softc *sc;
224	unsigned int index;
225	struct sfxge_evq *evq;
226	unsigned int batch;
227	unsigned int rxfill;
228	unsigned int mblksize;
229	int ntodo;
230	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
231
232	sc = rxq->sc;
233	index = rxq->index;
234	evq = sc->evq[index];
235
236	prefetch_read_many(sc->enp);
237	prefetch_read_many(rxq->common);
238
239	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
240
241	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
242		return;
243
244	rxfill = rxq->added - rxq->completed;
245	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
246	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
247	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
248	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
249	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
250
251	if (ntodo == 0)
252		return;
253
254	batch = 0;
255	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
256	while (ntodo-- > 0) {
257		unsigned int id;
258		struct sfxge_rx_sw_desc *rx_desc;
259		bus_dma_segment_t seg;
260		struct mbuf *m;
261
262		id = (rxq->added + batch) & rxq->ptr_mask;
263		rx_desc = &rxq->queue[id];
264		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
265
266		rx_desc->flags = EFX_DISCARD;
267		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
268		    sc->rx_cluster_size);
269		if (m == NULL)
270			break;
271
272		/* m_len specifies length of area to be mapped for DMA */
273		m->m_len  = mblksize;
274		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
275		m->m_data += sc->rx_buffer_align;
276
277		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
278		addr[batch++] = seg.ds_addr;
279
280		if (batch == SFXGE_REFILL_BATCH) {
281			efx_rx_qpost(rxq->common, addr, mblksize, batch,
282			    rxq->completed, rxq->added);
283			rxq->added += batch;
284			batch = 0;
285		}
286	}
287
288	if (ntodo != 0)
289		sfxge_rx_schedule_refill(rxq, retrying);
290
291	if (batch != 0) {
292		efx_rx_qpost(rxq->common, addr, mblksize, batch,
293		    rxq->completed, rxq->added);
294		rxq->added += batch;
295	}
296
297	/* Make the descriptors visible to the hardware */
298	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
299			BUS_DMASYNC_PREWRITE);
300
301	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
302
303	/* The queue could still be empty if no descriptors were actually
304	 * pushed, in which case there will be no event to cause the next
305	 * refill, so we must schedule a refill ourselves.
306	 */
307	if (rxq->pushed == rxq->completed) {
308		sfxge_rx_schedule_refill(rxq, retrying);
309	}
310}
311
312void
313sfxge_rx_qrefill(struct sfxge_rxq *rxq)
314{
315
316	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
317		return;
318
319	/* Make sure the queue is full */
320	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
321}
322
323static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
324{
325	struct ifnet *ifp = sc->ifnet;
326
327	m->m_pkthdr.rcvif = ifp;
328	m->m_pkthdr.csum_data = 0xffff;
329	ifp->if_input(ifp, m);
330}
331
332static void
333sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
334{
335	struct mbuf *m = rx_desc->mbuf;
336	int flags = rx_desc->flags;
337	int csum_flags;
338
339	/* Convert checksum flags */
340	csum_flags = (flags & EFX_CKSUM_IPV4) ?
341		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
342	if (flags & EFX_CKSUM_TCPUDP)
343		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
344
345	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
346		m->m_pkthdr.flowid =
347			efx_psuedo_hdr_hash_get(sc->enp,
348						EFX_RX_HASHALG_TOEPLITZ,
349						mtod(m, uint8_t *));
350		/* The hash covers a 4-tuple for TCP only */
351		M_HASHTYPE_SET(m,
352		    (flags & EFX_PKT_IPV4) ?
353			((flags & EFX_PKT_TCP) ?
354			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
355			((flags & EFX_PKT_TCP) ?
356			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
357	}
358	m->m_data += sc->rx_prefix_size;
359	m->m_len = rx_desc->size - sc->rx_prefix_size;
360	m->m_pkthdr.len = m->m_len;
361	m->m_pkthdr.csum_flags = csum_flags;
362	__sfxge_rx_deliver(sc, rx_desc->mbuf);
363
364	rx_desc->flags = EFX_DISCARD;
365	rx_desc->mbuf = NULL;
366}
367
368#ifdef SFXGE_LRO
369
370static void
371sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
372{
373	struct sfxge_softc *sc = st->sc;
374	struct mbuf *m = c->mbuf;
375	struct tcphdr *c_th;
376	int csum_flags;
377
378	KASSERT(m, ("no mbuf to deliver"));
379
380	++st->n_bursts;
381
382	/* Finish off packet munging and recalculate IP header checksum. */
383	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
384		struct ip *iph = c->nh;
385		iph->ip_len = htons(iph->ip_len);
386		iph->ip_sum = 0;
387		iph->ip_sum = in_cksum_hdr(iph);
388		c_th = (struct tcphdr *)(iph + 1);
389		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
390			      CSUM_IP_CHECKED | CSUM_IP_VALID);
391	} else {
392		struct ip6_hdr *iph = c->nh;
393		iph->ip6_plen = htons(iph->ip6_plen);
394		c_th = (struct tcphdr *)(iph + 1);
395		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
396	}
397
398	c_th->th_win = c->th_last->th_win;
399	c_th->th_ack = c->th_last->th_ack;
400	if (c_th->th_off == c->th_last->th_off) {
401		/* Copy TCP options (take care to avoid going negative). */
402		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
403		memcpy(c_th + 1, c->th_last + 1, optlen);
404	}
405
406	m->m_pkthdr.flowid = c->conn_hash;
407	M_HASHTYPE_SET(m,
408	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
409		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
410
411	m->m_pkthdr.csum_flags = csum_flags;
412	__sfxge_rx_deliver(sc, m);
413
414	c->mbuf = NULL;
415	c->delivered = 1;
416}
417
418/* Drop the given connection, and add it to the free list. */
419static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
420{
421	unsigned bucket;
422
423	KASSERT(!c->mbuf, ("found orphaned mbuf"));
424
425	if (c->next_buf.mbuf != NULL) {
426		sfxge_rx_deliver(rxq->sc, &c->next_buf);
427		LIST_REMOVE(c, active_link);
428	}
429
430	bucket = c->conn_hash & rxq->lro.conns_mask;
431	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
432	--rxq->lro.conns_n[bucket];
433	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
434	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
435}
436
437/* Stop tracking connections that have gone idle in order to keep hash
438 * chains short.
439 */
440static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
441{
442	struct sfxge_lro_conn *c;
443	unsigned i;
444
445	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
446		("found active connections"));
447
448	rxq->lro.last_purge_ticks = now;
449	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
450		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
451			continue;
452
453		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
454		if (now - c->last_pkt_ticks > lro_idle_ticks) {
455			++rxq->lro.n_drop_idle;
456			sfxge_lro_drop(rxq, c);
457		}
458	}
459}
460
461static void
462sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
463		struct mbuf *mbuf, struct tcphdr *th)
464{
465	struct tcphdr *c_th;
466
467	/* Tack the new mbuf onto the chain. */
468	KASSERT(!mbuf->m_next, ("mbuf already chained"));
469	c->mbuf_tail->m_next = mbuf;
470	c->mbuf_tail = mbuf;
471
472	/* Increase length appropriately */
473	c->mbuf->m_pkthdr.len += mbuf->m_len;
474
475	/* Update the connection state flags */
476	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
477		struct ip *iph = c->nh;
478		iph->ip_len += mbuf->m_len;
479		c_th = (struct tcphdr *)(iph + 1);
480	} else {
481		struct ip6_hdr *iph = c->nh;
482		iph->ip6_plen += mbuf->m_len;
483		c_th = (struct tcphdr *)(iph + 1);
484	}
485	c_th->th_flags |= (th->th_flags & TH_PUSH);
486	c->th_last = th;
487	++st->n_merges;
488
489	/* Pass packet up now if another segment could overflow the IP
490	 * length.
491	 */
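	/*
	 * (ip_len/ip6_plen are 16-bit fields; the 9200-byte margin is
	 * presumably sized to leave room for one more jumbo-MTU segment.)
	 */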
492	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
493		sfxge_lro_deliver(st, c);
494}
495
496static void
497sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
498		struct mbuf *mbuf, void *nh, struct tcphdr *th)
499{
500	/* Start the chain */
501	c->mbuf = mbuf;
502	c->mbuf_tail = c->mbuf;
503	c->nh = nh;
504	c->th_last = th;
505
506	mbuf->m_pkthdr.len = mbuf->m_len;
507
508	/* Mangle header fields for later processing */
509	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
510		struct ip *iph = nh;
511		iph->ip_len = ntohs(iph->ip_len);
512	} else {
513		struct ip6_hdr *iph = nh;
514		iph->ip6_plen = ntohs(iph->ip6_plen);
515	}
516}
517
518/* Try to merge or otherwise hold or deliver (as appropriate) the
519 * packet buffered for this connection (c->next_buf).  Return a flag
520 * indicating whether the connection is still active for LRO purposes.
521 */
522static int
523sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
524{
525	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
526	char *eh = c->next_eh;
527	int data_length, hdr_length, dont_merge;
528	unsigned th_seq, pkt_length;
529	struct tcphdr *th;
530	unsigned now;
531
532	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
533		struct ip *iph = c->next_nh;
534		th = (struct tcphdr *)(iph + 1);
535		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
536	} else {
537		struct ip6_hdr *iph = c->next_nh;
538		th = (struct tcphdr *)(iph + 1);
539		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
540	}
541
542	hdr_length = (char *) th + th->th_off * 4 - eh;
543	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
544		       hdr_length);
545	th_seq = ntohl(th->th_seq);
546	dont_merge = ((data_length <= 0)
547		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
548
549	/* Check for options other than aligned timestamp. */
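	/*
	 * A header with th_off == 8 carries exactly 12 bytes of options;
	 * the only layout accepted here is the common padded timestamp
	 * option (NOP, NOP, TIMESTAMP kind, TIMESTAMP length) in the first
	 * option word.  Anything else disables merging for this packet.
	 */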
550	if (th->th_off != 5) {
551		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
552		if (th->th_off == 8 &&
553		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
554					(TCPOPT_NOP << 16) |
555					(TCPOPT_TIMESTAMP << 8) |
556					TCPOLEN_TIMESTAMP)) {
557			/* timestamp option -- okay */
558		} else {
559			dont_merge = 1;
560		}
561	}
562
563	if (__predict_false(th_seq != c->next_seq)) {
564		/* Out-of-order, so start counting again. */
565		if (c->mbuf != NULL)
566			sfxge_lro_deliver(&rxq->lro, c);
567		c->n_in_order_pkts -= lro_loss_packets;
568		c->next_seq = th_seq + data_length;
569		++rxq->lro.n_misorder;
570		goto deliver_buf_out;
571	}
572	c->next_seq = th_seq + data_length;
573
574	now = ticks;
575	if (now - c->last_pkt_ticks > lro_idle_ticks) {
576		++rxq->lro.n_drop_idle;
577		if (c->mbuf != NULL)
578			sfxge_lro_deliver(&rxq->lro, c);
579		sfxge_lro_drop(rxq, c);
580		return (0);
581	}
582	c->last_pkt_ticks = ticks;
583
584	if (c->n_in_order_pkts < lro_slow_start_packets) {
585		/* May be in slow-start, so don't merge. */
586		++rxq->lro.n_slow_start;
587		++c->n_in_order_pkts;
588		goto deliver_buf_out;
589	}
590
591	if (__predict_false(dont_merge)) {
592		if (c->mbuf != NULL)
593			sfxge_lro_deliver(&rxq->lro, c);
594		if (th->th_flags & (TH_FIN | TH_RST)) {
595			++rxq->lro.n_drop_closed;
596			sfxge_lro_drop(rxq, c);
597			return (0);
598		}
599		goto deliver_buf_out;
600	}
601
602	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
603
604	if (__predict_true(c->mbuf != NULL)) {
605		/* Remove headers and any padding */
606		rx_buf->mbuf->m_data += hdr_length;
607		rx_buf->mbuf->m_len = data_length;
608
609		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
610	} else {
611		/* Remove any padding */
612		rx_buf->mbuf->m_len = pkt_length;
613
614		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
615	}
616
617	rx_buf->mbuf = NULL;
618	return (1);
619
620 deliver_buf_out:
621	sfxge_rx_deliver(rxq->sc, rx_buf);
622	return (1);
623}
624
625static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
626			       uint16_t l2_id, void *nh, struct tcphdr *th)
627{
628	unsigned bucket = conn_hash & st->conns_mask;
629	struct sfxge_lro_conn *c;
630
631	if (st->conns_n[bucket] >= lro_chain_max) {
632		++st->n_too_many;
633		return;
634	}
635
636	if (!TAILQ_EMPTY(&st->free_conns)) {
637		c = TAILQ_FIRST(&st->free_conns);
638		TAILQ_REMOVE(&st->free_conns, c, link);
639	} else {
640		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
641		if (c == NULL)
642			return;
643		c->mbuf = NULL;
644		c->next_buf.mbuf = NULL;
645	}
646
647	/* Create the connection tracking data */
648	++st->conns_n[bucket];
649	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
650	c->l2_id = l2_id;
651	c->conn_hash = conn_hash;
652	c->source = th->th_sport;
653	c->dest = th->th_dport;
654	c->n_in_order_pkts = 0;
655	c->last_pkt_ticks = *(volatile int *)&ticks;
656	c->delivered = 0;
657	++st->n_new_stream;
658	/* NB. We don't initialise c->next_seq, and it doesn't matter what
659	 * value it has.  Most likely the next packet received for this
660	 * connection will not match -- no harm done.
661	 */
662}
663
664/* Process mbuf and decide whether to dispatch it to the stack now or
665 * later.
666 */
667static void
668sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
669{
670	struct sfxge_softc *sc = rxq->sc;
671	struct mbuf *m = rx_buf->mbuf;
672	struct ether_header *eh;
673	struct sfxge_lro_conn *c;
674	uint16_t l2_id;
675	uint16_t l3_proto;
676	void *nh;
677	struct tcphdr *th;
678	uint32_t conn_hash;
679	unsigned bucket;
680
681	/* Get the hardware hash */
682	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
683					    EFX_RX_HASHALG_TOEPLITZ,
684					    mtod(m, uint8_t *));
685
686	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
687	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
688		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
689		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
690			SFXGE_LRO_L2_ID_VLAN;
691		l3_proto = veh->evl_proto;
692		nh = veh + 1;
693	} else {
694		l2_id = 0;
695		l3_proto = eh->ether_type;
696		nh = eh + 1;
697	}
698
699	/* Check whether this is a suitable packet (unfragmented
700	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
701	 * length, and compute a hash if necessary.  If not, return.
702	 */
703	if (l3_proto == htons(ETHERTYPE_IP)) {
704		struct ip *iph = nh;
705
706		KASSERT(iph->ip_p == IPPROTO_TCP,
707		    ("IPv4 protocol is not TCP, but packet marker is set"));
708		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
709		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
710			goto deliver_now;
711		th = (struct tcphdr *)(iph + 1);
712	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
713		struct ip6_hdr *iph = nh;
714
715		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
716		    ("IPv6 next header is not TCP, but packet marker is set"));
717		l2_id |= SFXGE_LRO_L2_ID_IPV6;
718		th = (struct tcphdr *)(iph + 1);
719	} else {
720		goto deliver_now;
721	}
722
723	bucket = conn_hash & rxq->lro.conns_mask;
724
725	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
726		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
727			continue;
728		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
729			continue;
730		if (c->mbuf != NULL) {
731			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
732				struct ip *c_iph, *iph = nh;
733				c_iph = c->nh;
734				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
735				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
736					continue;
737			} else {
738				struct ip6_hdr *c_iph, *iph = nh;
739				c_iph = c->nh;
740				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
741				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
742					continue;
743			}
744		}
745
746		/* Re-insert at head of list to reduce lookup time. */
747		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
748		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
749
750		if (c->next_buf.mbuf != NULL) {
751			if (!sfxge_lro_try_merge(rxq, c))
752				goto deliver_now;
753		} else {
754			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
755			    active_link);
756		}
757		c->next_buf = *rx_buf;
758		c->next_eh = eh;
759		c->next_nh = nh;
760
761		rx_buf->mbuf = NULL;
762		rx_buf->flags = EFX_DISCARD;
763		return;
764	}
765
766	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
767 deliver_now:
768	sfxge_rx_deliver(sc, rx_buf);
769}
770
771static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
772{
773	struct sfxge_lro_state *st = &rxq->lro;
774	struct sfxge_lro_conn *c;
775	unsigned t;
776
777	while (!LIST_EMPTY(&st->active_conns)) {
778		c = LIST_FIRST(&st->active_conns);
779		if (!c->delivered && c->mbuf != NULL)
780			sfxge_lro_deliver(st, c);
781		if (sfxge_lro_try_merge(rxq, c)) {
782			if (c->mbuf != NULL)
783				sfxge_lro_deliver(st, c);
784			LIST_REMOVE(c, active_link);
785		}
786		c->delivered = 0;
787	}
788
789	t = *(volatile int *)&ticks;
790	if (__predict_false(t != st->last_purge_ticks))
791		sfxge_lro_purge_idle(rxq, t);
792}
793
794#else	/* !SFXGE_LRO */
795
796static void
797sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
798{
799}
800
801static void
802sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
803{
804}
805
806#endif	/* SFXGE_LRO */
807
808void
809sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
810{
811	struct sfxge_softc *sc = rxq->sc;
812	int if_capenable = sc->ifnet->if_capenable;
813	int lro_enabled = if_capenable & IFCAP_LRO;
814	unsigned int index;
815	struct sfxge_evq *evq;
816	unsigned int completed;
817	unsigned int level;
818	struct mbuf *m;
819	struct sfxge_rx_sw_desc *prev = NULL;
820
821	index = rxq->index;
822	evq = sc->evq[index];
823
824	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
825
826	completed = rxq->completed;
827	while (completed != rxq->pending) {
828		unsigned int id;
829		struct sfxge_rx_sw_desc *rx_desc;
830
831		id = completed++ & rxq->ptr_mask;
832		rx_desc = &rxq->queue[id];
833		m = rx_desc->mbuf;
834
835		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
836			goto discard;
837
838		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
839			goto discard;
840
841		/* Read the length from the pseudo header if required */
842		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
843			uint16_t tmp_size;
844			int rc;
845			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
846							   mtod(m, uint8_t *),
847							   &tmp_size);
848			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
849			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
850		}
851
852		prefetch_read_many(mtod(m, caddr_t));
853
854		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
855		case EFX_PKT_IPV4:
856			if (~if_capenable & IFCAP_RXCSUM)
857				rx_desc->flags &=
858				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
859			break;
860		case EFX_PKT_IPV6:
861			if (~if_capenable & IFCAP_RXCSUM_IPV6)
862				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
863			break;
864		case 0:
865			/* Check for loopback packets */
866			{
867				struct ether_header *etherhp;
868
869				/*LINTED*/
870				etherhp = mtod(m, struct ether_header *);
871
872				if (etherhp->ether_type ==
873				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
874					EFSYS_PROBE(loopback);
875
876					rxq->loopback++;
877					goto discard;
878				}
879			}
880			break;
881		default:
882			KASSERT(B_FALSE,
883			    ("Rx descriptor with both IPv4 and IPv6 flags"));
884			goto discard;
885		}
886
887		/* Pass packet up the stack or into LRO (pipelined) */
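		/*
		 * Delivery is pipelined by one descriptor: the previous
		 * packet is passed up while the prefetch issued above for
		 * the current mbuf is still in flight, so its headers are
		 * likely to be in cache by the time they are needed.
		 */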
888		if (prev != NULL) {
889			if (lro_enabled &&
890			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
891			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
892				sfxge_lro(rxq, prev);
893			else
894				sfxge_rx_deliver(sc, prev);
895		}
896		prev = rx_desc;
897		continue;
898
899discard:
900		/* Return the packet to the pool */
901		m_free(m);
902		rx_desc->mbuf = NULL;
903	}
904	rxq->completed = completed;
905
906	level = rxq->added - rxq->completed;
907
908	/* Pass last packet up the stack or into LRO */
909	if (prev != NULL) {
910		if (lro_enabled &&
911		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
912		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
913			sfxge_lro(rxq, prev);
914		else
915			sfxge_rx_deliver(sc, prev);
916	}
917
918	/*
919	 * If there are any pending flows and this is the end of the
920	 * poll then they must be completed.
921	 */
922	if (eop)
923		sfxge_lro_end_of_burst(rxq);
924
925	/* Top up the queue if necessary */
926	if (level < rxq->refill_threshold)
927		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
928}
929
930static void
931sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
932{
933	struct sfxge_rxq *rxq;
934	struct sfxge_evq *evq;
935	unsigned int count;
936	unsigned int retry = 3;
937
938	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
939
940	rxq = sc->rxq[index];
941	evq = sc->evq[index];
942
943	SFXGE_EVQ_LOCK(evq);
944
945	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
946	    ("rxq not started"));
947
948	rxq->init_state = SFXGE_RXQ_INITIALIZED;
949
950	callout_stop(&rxq->refill_callout);
951
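	/*
	 * Request a flush and poll for completion, waiting up to
	 * 20 * 100 ms = 2 seconds per attempt and retrying up to three
	 * times before treating the flush as done anyway.
	 */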
952	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
953		rxq->flush_state = SFXGE_FLUSH_PENDING;
954
955		SFXGE_EVQ_UNLOCK(evq);
956
957		/* Flush the receive queue */
958		if (efx_rx_qflush(rxq->common) != 0) {
959			SFXGE_EVQ_LOCK(evq);
960			rxq->flush_state = SFXGE_FLUSH_FAILED;
961			break;
962		}
963
964		count = 0;
965		do {
966			/* Spin for 100 ms */
967			DELAY(100000);
968
969			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
970				break;
971
972		} while (++count < 20);
973
974		SFXGE_EVQ_LOCK(evq);
975
976		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
977			/* Flush timeout - neither done nor failed */
978			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
979			    device_get_nameunit(sc->dev), index);
980			rxq->flush_state = SFXGE_FLUSH_DONE;
981		}
982		retry--;
983	}
984	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
985		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
986		    device_get_nameunit(sc->dev), index);
987		rxq->flush_state = SFXGE_FLUSH_DONE;
988	}
989
990	rxq->pending = rxq->added;
991	sfxge_rx_qcomplete(rxq, B_TRUE);
992
993	KASSERT(rxq->completed == rxq->pending,
994	    ("rxq->completed != rxq->pending"));
995
996	rxq->added = 0;
997	rxq->pushed = 0;
998	rxq->pending = 0;
999	rxq->completed = 0;
1000	rxq->loopback = 0;
1001
1002	/* Destroy the common code receive queue. */
1003	efx_rx_qdestroy(rxq->common);
1004
1005	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1006	    EFX_RXQ_NBUFS(sc->rxq_entries));
1007
1008	SFXGE_EVQ_UNLOCK(evq);
1009}
1010
1011static int
1012sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1013{
1014	struct sfxge_rxq *rxq;
1015	efsys_mem_t *esmp;
1016	struct sfxge_evq *evq;
1017	int rc;
1018
1019	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1020
1021	rxq = sc->rxq[index];
1022	esmp = &rxq->mem;
1023	evq = sc->evq[index];
1024
1025	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1026	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1027	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1028	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1029
1030	/* Program the buffer table. */
1031	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1032	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1033		return (rc);
1034
1035	/* Create the common code receive queue. */
1036	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1037	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1038	    &rxq->common)) != 0)
1039		goto fail;
1040
1041	SFXGE_EVQ_LOCK(evq);
1042
1043	/* Enable the receive queue. */
1044	efx_rx_qenable(rxq->common);
1045
1046	rxq->init_state = SFXGE_RXQ_STARTED;
1047	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1048
1049	/* Try to fill the queue from the pool. */
1050	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1051
1052	SFXGE_EVQ_UNLOCK(evq);
1053
1054	return (0);
1055
1056fail:
1057	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1058	    EFX_RXQ_NBUFS(sc->rxq_entries));
1059	return (rc);
1060}
1061
1062void
1063sfxge_rx_stop(struct sfxge_softc *sc)
1064{
1065	int index;
1066
1067	efx_mac_filter_default_rxq_clear(sc->enp);
1068
1069	/* Stop the receive queue(s) */
1070	index = sc->rxq_count;
1071	while (--index >= 0)
1072		sfxge_rx_qstop(sc, index);
1073
1074	sc->rx_prefix_size = 0;
1075	sc->rx_buffer_size = 0;
1076
1077	efx_rx_fini(sc->enp);
1078}
1079
1080int
1081sfxge_rx_start(struct sfxge_softc *sc)
1082{
1083	struct sfxge_intr *intr;
1084	const efx_nic_cfg_t *encp;
1085	size_t hdrlen, align, reserved;
1086	int index;
1087	int rc;
1088
1089	intr = &sc->intr;
1090
1091	/* Initialize the common code receive module. */
1092	if ((rc = efx_rx_init(sc->enp)) != 0)
1093		return (rc);
1094
1095	encp = efx_nic_cfg_get(sc->enp);
1096	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1097
1098	/* Calculate the receive packet buffer size. */
1099	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1100
1101	/* Ensure IP headers are 32-bit aligned */
1102	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1103	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1104
1105	sc->rx_buffer_size += sc->rx_buffer_align;
1106
1107	/* Align end of packet buffer for RX DMA end padding */
1108	align = MAX(1, encp->enc_rx_buf_align_end);
1109	EFSYS_ASSERT(ISP2(align));
1110	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1111
1112	/*
1113	 * Standard mbuf zones only guarantee pointer-size alignment;
1114	 * we need extra space to align to the cache line
1115	 */
1116	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1117
1118	/* Select zone for packet buffers */
1119	if (reserved <= MCLBYTES)
1120		sc->rx_cluster_size = MCLBYTES;
1121	else if (reserved <= MJUMPAGESIZE)
1122		sc->rx_cluster_size = MJUMPAGESIZE;
1123	else if (reserved <= MJUM9BYTES)
1124		sc->rx_cluster_size = MJUM9BYTES;
1125	else
1126		sc->rx_cluster_size = MJUM16BYTES;
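
	/*
	 * Worked example (figures illustrative only): with a 14-byte
	 * Ethernet header and an assumed 16-byte RX prefix, hdrlen is 30,
	 * so rx_buffer_align is P2ROUNDUP(30, 4) - 30 = 2 and the buffer
	 * grows by two bytes to keep IP headers 32-bit aligned.  The
	 * CACHE_LINE_SIZE of slack reserved above covers the later
	 * cache-line alignment of m_data in sfxge_rx_qfill().
	 */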
1127
1128	/*
1129	 * Set up the scale table.  Enable all hash types and hash insertion.
1130	 */
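	/*
	 * Each of the SFXGE_RX_SCALE_MAX indirection-table entries names an
	 * RX queue; the controller uses the packet's Toeplitz hash to index
	 * this table when choosing a delivery queue.  Without kernel RSS the
	 * queues are simply assigned round-robin.
	 */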
1131	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1132#ifdef RSS
1133		sc->rx_indir_table[index] =
1134			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1135#else
1136		sc->rx_indir_table[index] = index % sc->rxq_count;
1137#endif
1138	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1139				       SFXGE_RX_SCALE_MAX)) != 0)
1140		goto fail;
1141	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1142	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1143	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1144
1145#ifdef RSS
1146	rss_getkey(toep_key);
1147#endif
1148	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1149				       sizeof(toep_key))) != 0)
1150		goto fail;
1151
1152	/* Start the receive queue(s). */
1153	for (index = 0; index < sc->rxq_count; index++) {
1154		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1155			goto fail2;
1156	}
1157
1158	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1159					    sc->intr.n_alloc > 1);
1160	if (rc != 0)
1161		goto fail3;
1162
1163	return (0);
1164
1165fail3:
1166fail2:
1167	while (--index >= 0)
1168		sfxge_rx_qstop(sc, index);
1169
1170fail:
1171	efx_rx_fini(sc->enp);
1172
1173	return (rc);
1174}
1175
1176#ifdef SFXGE_LRO
1177
1178static void sfxge_lro_init(struct sfxge_rxq *rxq)
1179{
1180	struct sfxge_lro_state *st = &rxq->lro;
1181	unsigned i;
1182
1183	st->conns_mask = lro_table_size - 1;
1184	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1185		("lro_table_size must be a power of 2"));
1186	st->sc = rxq->sc;
1187	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1188			   M_SFXGE, M_WAITOK);
1189	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1190			     M_SFXGE, M_WAITOK);
1191	for (i = 0; i <= st->conns_mask; ++i) {
1192		TAILQ_INIT(&st->conns[i]);
1193		st->conns_n[i] = 0;
1194	}
1195	LIST_INIT(&st->active_conns);
1196	TAILQ_INIT(&st->free_conns);
1197}
1198
1199static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1200{
1201	struct sfxge_lro_state *st = &rxq->lro;
1202	struct sfxge_lro_conn *c;
1203	unsigned i;
1204
1205	/* Return cleanly if sfxge_lro_init() has not been called. */
1206	if (st->conns == NULL)
1207		return;
1208
1209	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1210
1211	for (i = 0; i <= st->conns_mask; ++i) {
1212		while (!TAILQ_EMPTY(&st->conns[i])) {
1213			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1214			sfxge_lro_drop(rxq, c);
1215		}
1216	}
1217
1218	while (!TAILQ_EMPTY(&st->free_conns)) {
1219		c = TAILQ_FIRST(&st->free_conns);
1220		TAILQ_REMOVE(&st->free_conns, c, link);
1221		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1222		free(c, M_SFXGE);
1223	}
1224
1225	free(st->conns_n, M_SFXGE);
1226	free(st->conns, M_SFXGE);
1227	st->conns = NULL;
1228}
1229
1230#else
1231
1232static void
1233sfxge_lro_init(struct sfxge_rxq *rxq)
1234{
1235}
1236
1237static void
1238sfxge_lro_fini(struct sfxge_rxq *rxq)
1239{
1240}
1241
1242#endif	/* SFXGE_LRO */
1243
1244static void
1245sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1246{
1247	struct sfxge_rxq *rxq;
1248
1249	rxq = sc->rxq[index];
1250
1251	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1252	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1253
1254	/* Free the context array and the flow table. */
1255	free(rxq->queue, M_SFXGE);
1256	sfxge_lro_fini(rxq);
1257
1258	/* Release DMA memory. */
1259	sfxge_dma_free(&rxq->mem);
1260
1261	sc->rxq[index] = NULL;
1262
1263	free(rxq, M_SFXGE);
1264}
1265
1266static int
1267sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1268{
1269	struct sfxge_rxq *rxq;
1270	struct sfxge_evq *evq;
1271	efsys_mem_t *esmp;
1272	int rc;
1273
1274	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1275
1276	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1277	rxq->sc = sc;
1278	rxq->index = index;
1279	rxq->entries = sc->rxq_entries;
1280	rxq->ptr_mask = rxq->entries - 1;
1281	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1282
1283	sc->rxq[index] = rxq;
1284	esmp = &rxq->mem;
1285
1286	evq = sc->evq[index];
1287
1288	/* Allocate and zero DMA space. */
1289	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1290		return (rc);
1291
1292	/* Allocate buffer table entries. */
1293	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1294				 &rxq->buf_base_id);
1295
1296	/* Allocate the context array and the flow table. */
1297	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1298	    M_SFXGE, M_WAITOK | M_ZERO);
1299	sfxge_lro_init(rxq);
1300
1301	callout_init(&rxq->refill_callout, 1);
1302
1303	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1304
1305	return (0);
1306}
1307
1308static const struct {
1309	const char *name;
1310	size_t offset;
1311} sfxge_rx_stats[] = {
1312#define	SFXGE_RX_STAT(name, member) \
1313	{ #name, offsetof(struct sfxge_rxq, member) }
1314#ifdef SFXGE_LRO
1315	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1316	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1317	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1318	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1319	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1320	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1321	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1322	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1323#endif
1324};
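
/*
 * Each statistic above is exported as a read-only unsigned sysctl (see
 * sfxge_rx_stat_init() below) whose handler sums the per-queue counters
 * across all RX queues.  With the usual device sysctl naming this would
 * typically be read as, for example, "sysctl dev.sfxge.0.stats.lro_merges";
 * the exact path depends on how sc->stats_node is created elsewhere in the
 * driver.
 */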
1325
1326static int
1327sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1328{
1329	struct sfxge_softc *sc = arg1;
1330	unsigned int id = arg2;
1331	unsigned int sum, index;
1332
1333	/* Sum across all RX queues */
1334	sum = 0;
1335	for (index = 0; index < sc->rxq_count; index++)
1336		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1337					 sfxge_rx_stats[id].offset);
1338
1339	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1340}
1341
1342static void
1343sfxge_rx_stat_init(struct sfxge_softc *sc)
1344{
1345	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1346	struct sysctl_oid_list *stat_list;
1347	unsigned int id;
1348
1349	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1350
1351	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1352		SYSCTL_ADD_PROC(
1353			ctx, stat_list,
1354			OID_AUTO, sfxge_rx_stats[id].name,
1355			CTLTYPE_UINT|CTLFLAG_RD,
1356			sc, id, sfxge_rx_stat_handler, "IU",
1357			"");
1358	}
1359}
1360
1361void
1362sfxge_rx_fini(struct sfxge_softc *sc)
1363{
1364	int index;
1365
1366	index = sc->rxq_count;
1367	while (--index >= 0)
1368		sfxge_rx_qfini(sc, index);
1369
1370	sc->rxq_count = 0;
1371}
1372
1373int
1374sfxge_rx_init(struct sfxge_softc *sc)
1375{
1376	struct sfxge_intr *intr;
1377	int index;
1378	int rc;
1379
1380#ifdef SFXGE_LRO
1381	if (!ISP2(lro_table_size)) {
1382		log(LOG_ERR, "%s=%u must be power of 2",
1383		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1384		rc = EINVAL;
1385		goto fail_lro_table_size;
1386	}
1387
1388	if (lro_idle_ticks == 0)
1389		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1390#endif
1391
1392	intr = &sc->intr;
1393
1394	sc->rxq_count = intr->n_alloc;
1395
1396	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1397	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1398
1399	/* Initialize the receive queue(s) - one per interrupt. */
1400	for (index = 0; index < sc->rxq_count; index++) {
1401		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1402			goto fail;
1403	}
1404
1405	sfxge_rx_stat_init(sc);
1406
1407	return (0);
1408
1409fail:
1410	/* Tear down the receive queue(s). */
1411	while (--index >= 0)
1412		sfxge_rx_qfini(sc, index);
1413
1414	sc->rxq_count = 0;
1415
1416#ifdef SFXGE_LRO
1417fail_lro_table_size:
1418#endif
1419	return (rc);
1420}
1421