sfxge_rx.c revision 331722
1/*-
2 * Copyright (c) 2010-2016 Solarflare Communications Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright notice,
12 *    this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * The views and conclusions contained in the software and documentation are
30 * those of the authors and should not be interpreted as representing official
31 * policies, either expressed or implied, of the FreeBSD Project.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/11/sys/dev/sfxge/sfxge_rx.c 331722 2018-03-29 02:50:57Z eadler $");
36
37#include "opt_rss.h"
38
39#include <sys/param.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/smp.h>
43#include <sys/socket.h>
44#include <sys/sysctl.h>
45#include <sys/syslog.h>
46#include <sys/limits.h>
48
49#include <net/ethernet.h>
50#include <net/if.h>
51#include <net/if_vlan_var.h>
52
53#include <netinet/in.h>
54#include <netinet/ip.h>
55#include <netinet/ip6.h>
56#include <netinet/tcp.h>
57
58#include <machine/in_cksum.h>
59
60#ifdef RSS
61#include <net/rss_config.h>
62#endif
63
64#include "common/efx.h"
65
66
67#include "sfxge.h"
68#include "sfxge_rx.h"
69
70#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
71
72#ifdef SFXGE_LRO
73
74SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
75	    "Large receive offload (LRO) parameters");
76
77#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
78
79/* Size of the LRO hash table.  Must be a power of 2.  A larger table
80 * means we can accelerate a larger number of streams.
81 */
82static unsigned lro_table_size = 128;
83TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
84SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
85	    &lro_table_size, 0,
86	    "Size of the LRO hash table (must be a power of 2)");
87
88/* Maximum length of a hash chain.  If chains get too long then the lookup
89 * time increases and may exceed the benefit of LRO.
90 */
91static unsigned lro_chain_max = 20;
92TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
93SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
94	    &lro_chain_max, 0,
95	    "The maximum length of a hash chain");
96
97/* Maximum time (in ticks) that a connection can be idle before its LRO
98 * state is discarded.
99 */
100static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
101TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
102SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
103	    &lro_idle_ticks, 0,
104	    "The maximum time (in ticks) that a connection can be idle "
105	    "before its LRO state is discarded");
106
107/* Number of packets with payload that must arrive in-order before a
108 * connection is eligible for LRO.  The idea is we should avoid coalescing
109 * segments when the sender is in slow-start because reducing the ACK rate
110 * can damage performance.
111 */
112static int lro_slow_start_packets = 2000;
113TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
114SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
115	    &lro_slow_start_packets, 0,
116	    "Number of packets with payload that must arrive in-order before "
117	    "a connection is eligible for LRO");
118
119/* Number of packets with payload that must arrive in-order following loss
120 * before a connection is eligible for LRO.  The idea is we should avoid
121 * coalescing segments when the sender is recovering from loss, because
122 * reducing the ACK rate can damage performance.
123 */
124static int lro_loss_packets = 20;
125TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
126SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
127	    &lro_loss_packets, 0,
128	    "Number of packets with payload that must arrive in-order "
129	    "following loss before a connection is eligible for LRO");
130
131/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
132#define	SFXGE_LRO_L2_ID_VLAN 0x4000
133#define	SFXGE_LRO_L2_ID_IPV6 0x8000
134#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
135#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
136
137/* Compare IPv6 addresses, avoiding conditional branches */
138static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
139				   const struct in6_addr *right)
140{
141#if LONG_BIT == 64
142	const uint64_t *left64 = (const uint64_t *)left;
143	const uint64_t *right64 = (const uint64_t *)right;
144	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
145#else
146	return (left->s6_addr32[0] - right->s6_addr32[0]) |
147	       (left->s6_addr32[1] - right->s6_addr32[1]) |
148	       (left->s6_addr32[2] - right->s6_addr32[2]) |
149	       (left->s6_addr32[3] - right->s6_addr32[3]);
150#endif
151}
152
153#endif	/* SFXGE_LRO */
154
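/*
 * RX queue flush notifications: record that the flush of the queue
 * has completed or failed.
 */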
155void
156sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
157{
158
159	rxq->flush_state = SFXGE_FLUSH_DONE;
160}
161
162void
163sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
164{
165
166	rxq->flush_state = SFXGE_FLUSH_FAILED;
167}
168
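/*
 * Toeplitz hash key used for receive-side scaling.  When the RSS
 * option is enabled the key is read from the kernel RSS configuration
 * in sfxge_rx_start(); otherwise a built-in default key is used.
 */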
169#ifdef RSS
170static uint8_t toep_key[RSS_KEYSIZE];
171#else
172static uint8_t toep_key[] = {
173	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
174	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
175	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
176	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
177	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
178};
179#endif
180
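/*
 * Post a software event to the corresponding event queue so that the
 * RX queue is refilled from the event processing context.
 */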
181static void
182sfxge_rx_post_refill(void *arg)
183{
184	struct sfxge_rxq *rxq = arg;
185	struct sfxge_softc *sc;
186	unsigned int index;
187	struct sfxge_evq *evq;
188	uint16_t magic;
189
190	sc = rxq->sc;
191	index = rxq->index;
192	evq = sc->evq[index];
193	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
194
195	/* This is guaranteed due to the start/stop order of rx and ev */
196	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
197	    ("evq not started"));
198	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
199	    ("rxq not started"));
200	efx_ev_qpost(evq->common, magic);
201}
202
203static void
204sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
205{
206	/* Initially retry after 100 ms, but back off in case of
207	 * repeated failures as we probably have to wait for the
208	 * administrator to raise the pool limit. */
209	if (retrying)
210		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
211	else
212		rxq->refill_delay = hz / 10;
213
214	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
215			     sfxge_rx_post_refill, rxq);
216}
217
218#define	SFXGE_REFILL_BATCH  64
219
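/*
 * Fill the RX queue towards the given target: allocate mbuf clusters,
 * map them for DMA and post them to the hardware in batches of
 * SFXGE_REFILL_BATCH.  Called with the event queue lock held.
 */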
220static void
221sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
222{
223	struct sfxge_softc *sc;
224	unsigned int index;
225	struct sfxge_evq *evq;
226	unsigned int batch;
227	unsigned int rxfill;
228	unsigned int mblksize;
229	int ntodo;
230	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
231
232	sc = rxq->sc;
233	index = rxq->index;
234	evq = sc->evq[index];
235
236	prefetch_read_many(sc->enp);
237	prefetch_read_many(rxq->common);
238
239	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
240
241	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
242		return;
243
244	rxfill = rxq->added - rxq->completed;
245	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
246	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
247	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
248	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
249	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
250
251	if (ntodo == 0)
252		return;
253
254	batch = 0;
255	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
256	while (ntodo-- > 0) {
257		unsigned int id;
258		struct sfxge_rx_sw_desc *rx_desc;
259		bus_dma_segment_t seg;
260		struct mbuf *m;
261
262		id = (rxq->added + batch) & rxq->ptr_mask;
263		rx_desc = &rxq->queue[id];
264		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
265
266		rx_desc->flags = EFX_DISCARD;
267		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
268		    sc->rx_cluster_size);
269		if (m == NULL)
270			break;
271
272		/* m_len specifies length of area to be mapped for DMA */
273		m->m_len  = mblksize;
274		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
275		m->m_data += sc->rx_buffer_align;
276
277		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
278		addr[batch++] = seg.ds_addr;
279
280		if (batch == SFXGE_REFILL_BATCH) {
281			efx_rx_qpost(rxq->common, addr, mblksize, batch,
282			    rxq->completed, rxq->added);
283			rxq->added += batch;
284			batch = 0;
285		}
286	}
287
288	if (ntodo != 0)
289		sfxge_rx_schedule_refill(rxq, retrying);
290
291	if (batch != 0) {
292		efx_rx_qpost(rxq->common, addr, mblksize, batch,
293		    rxq->completed, rxq->added);
294		rxq->added += batch;
295	}
296
297	/* Make the descriptors visible to the hardware */
298	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
299			BUS_DMASYNC_PREWRITE);
300
301	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
302
303	/* The queue could still be empty if no descriptors were actually
304	 * pushed, in which case there will be no event to cause the next
305	 * refill, so we must schedule a refill ourselves.
306	 */
307	if (rxq->pushed == rxq->completed) {
308		sfxge_rx_schedule_refill(rxq, retrying);
309	}
310}
311
312void
313sfxge_rx_qrefill(struct sfxge_rxq *rxq)
314{
315
316	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
317		return;
318
319	/* Make sure the queue is full */
320	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
321}
322
323static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
324{
325	struct ifnet *ifp = sc->ifnet;
326
327	m->m_pkthdr.rcvif = ifp;
328	m->m_pkthdr.csum_data = 0xffff;
329	ifp->if_input(ifp, m);
330}
331
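/*
 * Fill in checksum and RSS hash information for a received packet and
 * pass it up to the network stack.
 */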
332static void
333sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
334{
335	struct sfxge_softc *sc = rxq->sc;
336	struct mbuf *m = rx_desc->mbuf;
337	int flags = rx_desc->flags;
338	int csum_flags;
339
340	/* Convert checksum flags */
341	csum_flags = (flags & EFX_CKSUM_IPV4) ?
342		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
343	if (flags & EFX_CKSUM_TCPUDP)
344		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
345
346	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
347		m->m_pkthdr.flowid =
348			efx_pseudo_hdr_hash_get(rxq->common,
349						EFX_RX_HASHALG_TOEPLITZ,
350						mtod(m, uint8_t *));
351		/* The hash covers a 4-tuple for TCP only */
352		M_HASHTYPE_SET(m,
353		    (flags & EFX_PKT_IPV4) ?
354			((flags & EFX_PKT_TCP) ?
355			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
356			((flags & EFX_PKT_TCP) ?
357			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
358	}
359	m->m_data += sc->rx_prefix_size;
360	m->m_len = rx_desc->size - sc->rx_prefix_size;
361	m->m_pkthdr.len = m->m_len;
362	m->m_pkthdr.csum_flags = csum_flags;
363	__sfxge_rx_deliver(sc, rx_desc->mbuf);
364
365	rx_desc->flags = EFX_DISCARD;
366	rx_desc->mbuf = NULL;
367}
368
369#ifdef SFXGE_LRO
370
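/*
 * Deliver a coalesced LRO packet: convert the IP length field back to
 * network order, recompute the IPv4 header checksum, copy the latest
 * TCP window and ACK into the stored header and hand the mbuf chain
 * to the stack.
 */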
371static void
372sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
373{
374	struct sfxge_softc *sc = st->sc;
375	struct mbuf *m = c->mbuf;
376	struct tcphdr *c_th;
377	int csum_flags;
378
379	KASSERT(m, ("no mbuf to deliver"));
380
381	++st->n_bursts;
382
383	/* Finish off packet munging and recalculate IP header checksum. */
384	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
385		struct ip *iph = c->nh;
386		iph->ip_len = htons(iph->ip_len);
387		iph->ip_sum = 0;
388		iph->ip_sum = in_cksum_hdr(iph);
389		c_th = (struct tcphdr *)(iph + 1);
390		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
391			      CSUM_IP_CHECKED | CSUM_IP_VALID);
392	} else {
393		struct ip6_hdr *iph = c->nh;
394		iph->ip6_plen = htons(iph->ip6_plen);
395		c_th = (struct tcphdr *)(iph + 1);
396		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
397	}
398
399	c_th->th_win = c->th_last->th_win;
400	c_th->th_ack = c->th_last->th_ack;
401	if (c_th->th_off == c->th_last->th_off) {
402		/* Copy TCP options (take care to avoid going negative). */
403		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
404		memcpy(c_th + 1, c->th_last + 1, optlen);
405	}
406
407	m->m_pkthdr.flowid = c->conn_hash;
408	M_HASHTYPE_SET(m,
409	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
410		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
411
412	m->m_pkthdr.csum_flags = csum_flags;
413	__sfxge_rx_deliver(sc, m);
414
415	c->mbuf = NULL;
416	c->delivered = 1;
417}
418
419/* Drop the given connection, and add it to the free list. */
420static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
421{
422	unsigned bucket;
423
424	KASSERT(!c->mbuf, ("found orphaned mbuf"));
425
426	if (c->next_buf.mbuf != NULL) {
427		sfxge_rx_deliver(rxq, &c->next_buf);
428		LIST_REMOVE(c, active_link);
429	}
430
431	bucket = c->conn_hash & rxq->lro.conns_mask;
432	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
433	--rxq->lro.conns_n[bucket];
434	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
435	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
436}
437
438/* Stop tracking connections that have gone idle in order to keep hash
439 * chains short.
440 */
441static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
442{
443	struct sfxge_lro_conn *c;
444	unsigned i;
445
446	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
447		("found active connections"));
448
449	rxq->lro.last_purge_ticks = now;
450	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
451		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
452			continue;
453
454		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
455		if (now - c->last_pkt_ticks > lro_idle_ticks) {
456			++rxq->lro.n_drop_idle;
457			sfxge_lro_drop(rxq, c);
458		}
459	}
460}
461
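/*
 * Append a segment to an existing LRO chain and update the stored IP
 * and TCP headers to cover the additional payload.
 */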
462static void
463sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
464		struct mbuf *mbuf, struct tcphdr *th)
465{
466	struct tcphdr *c_th;
467
468	/* Tack the new mbuf onto the chain. */
469	KASSERT(!mbuf->m_next, ("mbuf already chained"));
470	c->mbuf_tail->m_next = mbuf;
471	c->mbuf_tail = mbuf;
472
473	/* Increase length appropriately */
474	c->mbuf->m_pkthdr.len += mbuf->m_len;
475
476	/* Update the connection state flags */
477	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
478		struct ip *iph = c->nh;
479		iph->ip_len += mbuf->m_len;
480		c_th = (struct tcphdr *)(iph + 1);
481	} else {
482		struct ip6_hdr *iph = c->nh;
483		iph->ip6_plen += mbuf->m_len;
484		c_th = (struct tcphdr *)(iph + 1);
485	}
486	c_th->th_flags |= (th->th_flags & TH_PUSH);
487	c->th_last = th;
488	++st->n_merges;
489
490	/* Pass packet up now if another segment could overflow the IP
491	 * length.
492	 */
493	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
494		sfxge_lro_deliver(st, c);
495}
496
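/* Start a new LRO chain with this packet at its head. */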
497static void
498sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
499		struct mbuf *mbuf, void *nh, struct tcphdr *th)
500{
501	/* Start the chain */
502	c->mbuf = mbuf;
503	c->mbuf_tail = c->mbuf;
504	c->nh = nh;
505	c->th_last = th;
506
507	mbuf->m_pkthdr.len = mbuf->m_len;
508
509	/* Mangle header fields for later processing */
510	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
511		struct ip *iph = nh;
512		iph->ip_len = ntohs(iph->ip_len);
513	} else {
514		struct ip6_hdr *iph = nh;
515		iph->ip6_plen = ntohs(iph->ip6_plen);
516	}
517}
518
519/* Try to merge or otherwise hold or deliver (as appropriate) the
520 * packet buffered for this connection (c->next_buf).  Return a flag
521 * indicating whether the connection is still active for LRO purposes.
522 */
523static int
524sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
525{
526	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
527	char *eh = c->next_eh;
528	int data_length, hdr_length, dont_merge;
529	unsigned th_seq, pkt_length;
530	struct tcphdr *th;
531	unsigned now;
532
533	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
534		struct ip *iph = c->next_nh;
535		th = (struct tcphdr *)(iph + 1);
536		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
537	} else {
538		struct ip6_hdr *iph = c->next_nh;
539		th = (struct tcphdr *)(iph + 1);
540		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
541	}
542
543	hdr_length = (char *) th + th->th_off * 4 - eh;
544	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
545		       hdr_length);
546	th_seq = ntohl(th->th_seq);
547	dont_merge = ((data_length <= 0)
548		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
549
550	/* Check for options other than aligned timestamp. */
551	if (th->th_off != 5) {
552		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
553		if (th->th_off == 8 &&
554		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
555					(TCPOPT_NOP << 16) |
556					(TCPOPT_TIMESTAMP << 8) |
557					TCPOLEN_TIMESTAMP)) {
558			/* timestamp option -- okay */
559		} else {
560			dont_merge = 1;
561		}
562	}
563
564	if (__predict_false(th_seq != c->next_seq)) {
565		/* Out-of-order, so start counting again. */
566		if (c->mbuf != NULL)
567			sfxge_lro_deliver(&rxq->lro, c);
568		c->n_in_order_pkts -= lro_loss_packets;
569		c->next_seq = th_seq + data_length;
570		++rxq->lro.n_misorder;
571		goto deliver_buf_out;
572	}
573	c->next_seq = th_seq + data_length;
574
575	now = ticks;
576	if (now - c->last_pkt_ticks > lro_idle_ticks) {
577		++rxq->lro.n_drop_idle;
578		if (c->mbuf != NULL)
579			sfxge_lro_deliver(&rxq->lro, c);
580		sfxge_lro_drop(rxq, c);
581		return (0);
582	}
583	c->last_pkt_ticks = ticks;
584
585	if (c->n_in_order_pkts < lro_slow_start_packets) {
586		/* May be in slow-start, so don't merge. */
587		++rxq->lro.n_slow_start;
588		++c->n_in_order_pkts;
589		goto deliver_buf_out;
590	}
591
592	if (__predict_false(dont_merge)) {
593		if (c->mbuf != NULL)
594			sfxge_lro_deliver(&rxq->lro, c);
595		if (th->th_flags & (TH_FIN | TH_RST)) {
596			++rxq->lro.n_drop_closed;
597			sfxge_lro_drop(rxq, c);
598			return (0);
599		}
600		goto deliver_buf_out;
601	}
602
603	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
604
605	if (__predict_true(c->mbuf != NULL)) {
606		/* Remove headers and any padding */
607		rx_buf->mbuf->m_data += hdr_length;
608		rx_buf->mbuf->m_len = data_length;
609
610		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
611	} else {
612		/* Remove any padding */
613		rx_buf->mbuf->m_len = pkt_length;
614
615		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
616	}
617
618	rx_buf->mbuf = NULL;
619	return (1);
620
621 deliver_buf_out:
622	sfxge_rx_deliver(rxq, rx_buf);
623	return (1);
624}
625
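/*
 * Begin tracking a new connection: reuse an entry from the free list
 * if possible, otherwise allocate one, and insert it at the head of
 * its hash bucket.
 */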
626static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
627			       uint16_t l2_id, void *nh, struct tcphdr *th)
628{
629	unsigned bucket = conn_hash & st->conns_mask;
630	struct sfxge_lro_conn *c;
631
632	if (st->conns_n[bucket] >= lro_chain_max) {
633		++st->n_too_many;
634		return;
635	}
636
637	if (!TAILQ_EMPTY(&st->free_conns)) {
638		c = TAILQ_FIRST(&st->free_conns);
639		TAILQ_REMOVE(&st->free_conns, c, link);
640	} else {
641		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
642		if (c == NULL)
643			return;
644		c->mbuf = NULL;
645		c->next_buf.mbuf = NULL;
646	}
647
648	/* Create the connection tracking data */
649	++st->conns_n[bucket];
650	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
651	c->l2_id = l2_id;
652	c->conn_hash = conn_hash;
653	c->source = th->th_sport;
654	c->dest = th->th_dport;
655	c->n_in_order_pkts = 0;
656	c->last_pkt_ticks = *(volatile int *)&ticks;
657	c->delivered = 0;
658	++st->n_new_stream;
659	/* NB. We don't initialise c->next_seq, and it doesn't matter what
660	 * value it has.  Most likely the next packet received for this
661	 * connection will not match -- no harm done.
662	 */
663}
664
665/* Process mbuf and decide whether to dispatch it to the stack now or
666 * later.
667 */
668static void
669sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
670{
671	struct sfxge_softc *sc = rxq->sc;
672	struct mbuf *m = rx_buf->mbuf;
673	struct ether_header *eh;
674	struct sfxge_lro_conn *c;
675	uint16_t l2_id;
676	uint16_t l3_proto;
677	void *nh;
678	struct tcphdr *th;
679	uint32_t conn_hash;
680	unsigned bucket;
681
682	/* Get the hardware hash */
683	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
684					    EFX_RX_HASHALG_TOEPLITZ,
685					    mtod(m, uint8_t *));
686
687	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
688	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
689		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
690		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
691			SFXGE_LRO_L2_ID_VLAN;
692		l3_proto = veh->evl_proto;
693		nh = veh + 1;
694	} else {
695		l2_id = 0;
696		l3_proto = eh->ether_type;
697		nh = eh + 1;
698	}
699
700	/* Check whether this is a suitable packet (unfragmented
701	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
702	 * length, and compute a hash if necessary.  If not, return.
703	 */
704	if (l3_proto == htons(ETHERTYPE_IP)) {
705		struct ip *iph = nh;
706
707		KASSERT(iph->ip_p == IPPROTO_TCP,
708		    ("IPv4 protocol is not TCP, but packet marker is set"));
709		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
710		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
711			goto deliver_now;
712		th = (struct tcphdr *)(iph + 1);
713	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
714		struct ip6_hdr *iph = nh;
715
716		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
717		    ("IPv6 next header is not TCP, but packet marker is set"));
718		l2_id |= SFXGE_LRO_L2_ID_IPV6;
719		th = (struct tcphdr *)(iph + 1);
720	} else {
721		goto deliver_now;
722	}
723
724	bucket = conn_hash & rxq->lro.conns_mask;
725
726	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
727		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
728			continue;
729		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
730			continue;
731		if (c->mbuf != NULL) {
732			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
733				struct ip *c_iph, *iph = nh;
734				c_iph = c->nh;
735				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
736				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
737					continue;
738			} else {
739				struct ip6_hdr *c_iph, *iph = nh;
740				c_iph = c->nh;
741				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
742				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
743					continue;
744			}
745		}
746
747		/* Re-insert at head of list to reduce lookup time. */
748		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
749		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
750
751		if (c->next_buf.mbuf != NULL) {
752			if (!sfxge_lro_try_merge(rxq, c))
753				goto deliver_now;
754		} else {
755			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
756			    active_link);
757		}
758		c->next_buf = *rx_buf;
759		c->next_eh = eh;
760		c->next_nh = nh;
761
762		rx_buf->mbuf = NULL;
763		rx_buf->flags = EFX_DISCARD;
764		return;
765	}
766
767	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
768 deliver_now:
769	sfxge_rx_deliver(rxq, rx_buf);
770}
771
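/*
 * At the end of an event burst, merge or deliver the packets still
 * held by active connections and periodically purge idle connection
 * state.
 */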
772static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
773{
774	struct sfxge_lro_state *st = &rxq->lro;
775	struct sfxge_lro_conn *c;
776	unsigned t;
777
778	while (!LIST_EMPTY(&st->active_conns)) {
779		c = LIST_FIRST(&st->active_conns);
780		if (!c->delivered && c->mbuf != NULL)
781			sfxge_lro_deliver(st, c);
782		if (sfxge_lro_try_merge(rxq, c)) {
783			if (c->mbuf != NULL)
784				sfxge_lro_deliver(st, c);
785			LIST_REMOVE(c, active_link);
786		}
787		c->delivered = 0;
788	}
789
790	t = *(volatile int *)&ticks;
791	if (__predict_false(t != st->last_purge_ticks))
792		sfxge_lro_purge_idle(rxq, t);
793}
794
795#else	/* !SFXGE_LRO */
796
797static void
798sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
799{
800}
801
802static void
803sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
804{
805}
806
807#endif	/* SFXGE_LRO */
808
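/*
 * Process completed RX descriptors: validate the descriptor flags,
 * honour the interface checksum capabilities, drop loopback packets
 * and pass packets to LRO or directly to the stack, refilling the
 * queue if it has drained below the refill threshold.
 */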
809void
810sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
811{
812	struct sfxge_softc *sc = rxq->sc;
813	int if_capenable = sc->ifnet->if_capenable;
814	int lro_enabled = if_capenable & IFCAP_LRO;
815	unsigned int index;
816	struct sfxge_evq *evq;
817	unsigned int completed;
818	unsigned int level;
819	struct mbuf *m;
820	struct sfxge_rx_sw_desc *prev = NULL;
821
822	index = rxq->index;
823	evq = sc->evq[index];
824
825	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
826
827	completed = rxq->completed;
828	while (completed != rxq->pending) {
829		unsigned int id;
830		struct sfxge_rx_sw_desc *rx_desc;
831
832		id = completed++ & rxq->ptr_mask;
833		rx_desc = &rxq->queue[id];
834		m = rx_desc->mbuf;
835
836		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
837			goto discard;
838
839		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
840			goto discard;
841
842		/* Read the length from the pseudo header if required */
843		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
844			uint16_t tmp_size;
845			int rc;
846			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
847							   mtod(m, uint8_t *),
848							   &tmp_size);
849			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
850			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
851		}
852
853		prefetch_read_many(mtod(m, caddr_t));
854
855		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
856		case EFX_PKT_IPV4:
857			if (~if_capenable & IFCAP_RXCSUM)
858				rx_desc->flags &=
859				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
860			break;
861		case EFX_PKT_IPV6:
862			if (~if_capenable & IFCAP_RXCSUM_IPV6)
863				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
864			break;
865		case 0:
866			/* Check for loopback packets */
867			{
868				struct ether_header *etherhp;
869
870				/*LINTED*/
871				etherhp = mtod(m, struct ether_header *);
872
873				if (etherhp->ether_type ==
874				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
875					EFSYS_PROBE(loopback);
876
877					rxq->loopback++;
878					goto discard;
879				}
880			}
881			break;
882		default:
883			KASSERT(B_FALSE,
884			    ("Rx descriptor with both IPv4 and IPv6 flags"));
885			goto discard;
886		}
887
888		/* Pass packet up the stack or into LRO (pipelined) */
889		if (prev != NULL) {
890			if (lro_enabled &&
891			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
892			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
893				sfxge_lro(rxq, prev);
894			else
895				sfxge_rx_deliver(rxq, prev);
896		}
897		prev = rx_desc;
898		continue;
899
900discard:
901		/* Return the packet to the pool */
902		m_free(m);
903		rx_desc->mbuf = NULL;
904	}
905	rxq->completed = completed;
906
907	level = rxq->added - rxq->completed;
908
909	/* Pass last packet up the stack or into LRO */
910	if (prev != NULL) {
911		if (lro_enabled &&
912		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
913		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
914			sfxge_lro(rxq, prev);
915		else
916			sfxge_rx_deliver(rxq, prev);
917	}
918
919	/*
920	 * If there are any pending flows and this is the end of the
921	 * poll then they must be completed.
922	 */
923	if (eop)
924		sfxge_lro_end_of_burst(rxq);
925
926	/* Top up the queue if necessary */
927	if (level < rxq->refill_threshold)
928		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
929}
930
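/*
 * Stop an RX queue: flush it (retrying on failure), complete any
 * outstanding descriptors and release the common code queue and its
 * buffer table entries.
 */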
931static void
932sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
933{
934	struct sfxge_rxq *rxq;
935	struct sfxge_evq *evq;
936	unsigned int count;
937	unsigned int retry = 3;
938
939	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
940
941	rxq = sc->rxq[index];
942	evq = sc->evq[index];
943
944	SFXGE_EVQ_LOCK(evq);
945
946	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
947	    ("rxq not started"));
948
949	rxq->init_state = SFXGE_RXQ_INITIALIZED;
950
951	callout_stop(&rxq->refill_callout);
952
953	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
954		rxq->flush_state = SFXGE_FLUSH_PENDING;
955
956		SFXGE_EVQ_UNLOCK(evq);
957
958		/* Flush the receive queue */
959		if (efx_rx_qflush(rxq->common) != 0) {
960			SFXGE_EVQ_LOCK(evq);
961			rxq->flush_state = SFXGE_FLUSH_FAILED;
962			break;
963		}
964
965		count = 0;
966		do {
967			/* Spin for 100 ms */
968			DELAY(100000);
969
970			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
971				break;
972
973		} while (++count < 20);
974
975		SFXGE_EVQ_LOCK(evq);
976
977		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
978			/* Flush timeout - neither done nor failed */
979			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
980			    device_get_nameunit(sc->dev), index);
981			rxq->flush_state = SFXGE_FLUSH_DONE;
982		}
983		retry--;
984	}
985	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
986		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
987		    device_get_nameunit(sc->dev), index);
988		rxq->flush_state = SFXGE_FLUSH_DONE;
989	}
990
991	rxq->pending = rxq->added;
992	sfxge_rx_qcomplete(rxq, B_TRUE);
993
994	KASSERT(rxq->completed == rxq->pending,
995	    ("rxq->completed != rxq->pending"));
996
997	rxq->added = 0;
998	rxq->pushed = 0;
999	rxq->pending = 0;
1000	rxq->completed = 0;
1001	rxq->loopback = 0;
1002
1003	/* Destroy the common code receive queue. */
1004	efx_rx_qdestroy(rxq->common);
1005
1006	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1007	    EFX_RXQ_NBUFS(sc->rxq_entries));
1008
1009	SFXGE_EVQ_UNLOCK(evq);
1010}
1011
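/*
 * Start an RX queue: program the buffer table, create and enable the
 * common code queue and perform the initial fill.
 */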
1012static int
1013sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1014{
1015	struct sfxge_rxq *rxq;
1016	efsys_mem_t *esmp;
1017	struct sfxge_evq *evq;
1018	int rc;
1019
1020	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1021
1022	rxq = sc->rxq[index];
1023	esmp = &rxq->mem;
1024	evq = sc->evq[index];
1025
1026	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1027	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1028	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1029	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1030
1031	/* Program the buffer table. */
1032	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1033	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1034		return (rc);
1035
1036	/* Create the common code receive queue. */
1037	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1038	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1039	    &rxq->common)) != 0)
1040		goto fail;
1041
1042	SFXGE_EVQ_LOCK(evq);
1043
1044	/* Enable the receive queue. */
1045	efx_rx_qenable(rxq->common);
1046
1047	rxq->init_state = SFXGE_RXQ_STARTED;
1048	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1049
1050	/* Try to fill the queue from the pool. */
1051	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1052
1053	SFXGE_EVQ_UNLOCK(evq);
1054
1055	return (0);
1056
1057fail:
1058	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1059	    EFX_RXQ_NBUFS(sc->rxq_entries));
1060	return (rc);
1061}
1062
1063void
1064sfxge_rx_stop(struct sfxge_softc *sc)
1065{
1066	int index;
1067
1068	efx_mac_filter_default_rxq_clear(sc->enp);
1069
1070	/* Stop the receive queue(s) */
1071	index = sc->rxq_count;
1072	while (--index >= 0)
1073		sfxge_rx_qstop(sc, index);
1074
1075	sc->rx_prefix_size = 0;
1076	sc->rx_buffer_size = 0;
1077
1078	efx_rx_fini(sc->enp);
1079}
1080
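/*
 * Start the receive path: initialise the common code RX module, size
 * the packet buffers, set up RSS scaling and start every RX queue.
 */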
1081int
1082sfxge_rx_start(struct sfxge_softc *sc)
1083{
1084	struct sfxge_intr *intr;
1085	const efx_nic_cfg_t *encp;
1086	size_t hdrlen, align, reserved;
1087	int index;
1088	int rc;
1089
1090	intr = &sc->intr;
1091
1092	/* Initialize the common code receive module. */
1093	if ((rc = efx_rx_init(sc->enp)) != 0)
1094		return (rc);
1095
1096	encp = efx_nic_cfg_get(sc->enp);
1097	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1098
1099	/* Calculate the receive packet buffer size. */
1100	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1101
1102	/* Ensure IP headers are 32-bit aligned */
1103	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1104	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1105
1106	sc->rx_buffer_size += sc->rx_buffer_align;
1107
1108	/* Align end of packet buffer for RX DMA end padding */
1109	align = MAX(1, encp->enc_rx_buf_align_end);
1110	EFSYS_ASSERT(ISP2(align));
1111	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1112
1113	/*
1114	 * Standard mbuf zones only guarantee pointer-size alignment;
1115	 * we need extra space to align to the cache line
1116	 */
1117	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1118
1119	/* Select zone for packet buffers */
1120	if (reserved <= MCLBYTES)
1121		sc->rx_cluster_size = MCLBYTES;
1122	else if (reserved <= MJUMPAGESIZE)
1123		sc->rx_cluster_size = MJUMPAGESIZE;
1124	else if (reserved <= MJUM9BYTES)
1125		sc->rx_cluster_size = MJUM9BYTES;
1126	else
1127		sc->rx_cluster_size = MJUM16BYTES;
1128
1129	/*
1130	 * Set up the scale table.  Enable all hash types and hash insertion.
1131	 */
1132	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1133#ifdef RSS
1134		sc->rx_indir_table[index] =
1135			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1136#else
1137		sc->rx_indir_table[index] = index % sc->rxq_count;
1138#endif
1139	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1140				       nitems(sc->rx_indir_table))) != 0)
1141		goto fail;
1142	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1143	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1144	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1145
1146#ifdef RSS
1147	rss_getkey(toep_key);
1148#endif
1149	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1150				       sizeof(toep_key))) != 0)
1151		goto fail;
1152
1153	/* Start the receive queue(s). */
1154	for (index = 0; index < sc->rxq_count; index++) {
1155		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1156			goto fail2;
1157	}
1158
1159	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1160					    sc->intr.n_alloc > 1);
1161	if (rc != 0)
1162		goto fail3;
1163
1164	return (0);
1165
1166fail3:
1167fail2:
1168	while (--index >= 0)
1169		sfxge_rx_qstop(sc, index);
1170
1171fail:
1172	efx_rx_fini(sc->enp);
1173
1174	return (rc);
1175}
1176
1177#ifdef SFXGE_LRO
1178
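/*
 * Allocate and initialise the per-queue LRO connection hash table and
 * free list.
 */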
1179static void sfxge_lro_init(struct sfxge_rxq *rxq)
1180{
1181	struct sfxge_lro_state *st = &rxq->lro;
1182	unsigned i;
1183
1184	st->conns_mask = lro_table_size - 1;
1185	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1186		("lro_table_size must be a power of 2"));
1187	st->sc = rxq->sc;
1188	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1189			   M_SFXGE, M_WAITOK);
1190	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1191			     M_SFXGE, M_WAITOK);
1192	for (i = 0; i <= st->conns_mask; ++i) {
1193		TAILQ_INIT(&st->conns[i]);
1194		st->conns_n[i] = 0;
1195	}
1196	LIST_INIT(&st->active_conns);
1197	TAILQ_INIT(&st->free_conns);
1198}
1199
1200static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1201{
1202	struct sfxge_lro_state *st = &rxq->lro;
1203	struct sfxge_lro_conn *c;
1204	unsigned i;
1205
1206	/* Return cleanly if sfxge_lro_init() has not been called. */
1207	if (st->conns == NULL)
1208		return;
1209
1210	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1211
1212	for (i = 0; i <= st->conns_mask; ++i) {
1213		while (!TAILQ_EMPTY(&st->conns[i])) {
1214			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1215			sfxge_lro_drop(rxq, c);
1216		}
1217	}
1218
1219	while (!TAILQ_EMPTY(&st->free_conns)) {
1220		c = TAILQ_FIRST(&st->free_conns);
1221		TAILQ_REMOVE(&st->free_conns, c, link);
1222		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1223		free(c, M_SFXGE);
1224	}
1225
1226	free(st->conns_n, M_SFXGE);
1227	free(st->conns, M_SFXGE);
1228	st->conns = NULL;
1229}
1230
1231#else
1232
1233static void
1234sfxge_lro_init(struct sfxge_rxq *rxq)
1235{
1236}
1237
1238static void
1239sfxge_lro_fini(struct sfxge_rxq *rxq)
1240{
1241}
1242
1243#endif	/* SFXGE_LRO */
1244
1245static void
1246sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1247{
1248	struct sfxge_rxq *rxq;
1249
1250	rxq = sc->rxq[index];
1251
1252	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1253	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1254
1255	/* Free the context array and the flow table. */
1256	free(rxq->queue, M_SFXGE);
1257	sfxge_lro_fini(rxq);
1258
1259	/* Release DMA memory. */
1260	sfxge_dma_free(&rxq->mem);
1261
1262	sc->rxq[index] = NULL;
1263
1264	free(rxq, M_SFXGE);
1265}
1266
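/*
 * Allocate and initialise the software state for one RX queue:
 * descriptor ring DMA memory, buffer table entries, the descriptor
 * context array and the LRO state.
 */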
1267static int
1268sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1269{
1270	struct sfxge_rxq *rxq;
1271	struct sfxge_evq *evq;
1272	efsys_mem_t *esmp;
1273	int rc;
1274
1275	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1276
1277	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1278	rxq->sc = sc;
1279	rxq->index = index;
1280	rxq->entries = sc->rxq_entries;
1281	rxq->ptr_mask = rxq->entries - 1;
1282	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1283
1284	sc->rxq[index] = rxq;
1285	esmp = &rxq->mem;
1286
1287	evq = sc->evq[index];
1288
1289	/* Allocate and zero DMA space. */
1290	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1291		return (rc);
1292
1293	/* Allocate buffer table entries. */
1294	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1295				 &rxq->buf_base_id);
1296
1297	/* Allocate the context array and the flow table. */
1298	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1299	    M_SFXGE, M_WAITOK | M_ZERO);
1300	sfxge_lro_init(rxq);
1301
1302	callout_init(&rxq->refill_callout, 1);
1303
1304	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1305
1306	return (0);
1307}
1308
1309static const struct {
1310	const char *name;
1311	size_t offset;
1312} sfxge_rx_stats[] = {
1313#define	SFXGE_RX_STAT(name, member) \
1314	{ #name, offsetof(struct sfxge_rxq, member) }
1315#ifdef SFXGE_LRO
1316	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1317	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1318	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1319	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1320	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1321	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1322	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1323	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1324#endif
1325};
1326
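/* Sysctl handler reporting an RX statistic summed across all queues. */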
1327static int
1328sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1329{
1330	struct sfxge_softc *sc = arg1;
1331	unsigned int id = arg2;
1332	unsigned int sum, index;
1333
1334	/* Sum across all RX queues */
1335	sum = 0;
1336	for (index = 0; index < sc->rxq_count; index++)
1337		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1338					 sfxge_rx_stats[id].offset);
1339
1340	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1341}
1342
1343static void
1344sfxge_rx_stat_init(struct sfxge_softc *sc)
1345{
1346	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1347	struct sysctl_oid_list *stat_list;
1348	unsigned int id;
1349
1350	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1351
1352	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1353		SYSCTL_ADD_PROC(
1354			ctx, stat_list,
1355			OID_AUTO, sfxge_rx_stats[id].name,
1356			CTLTYPE_UINT|CTLFLAG_RD,
1357			sc, id, sfxge_rx_stat_handler, "IU",
1358			"");
1359	}
1360}
1361
1362void
1363sfxge_rx_fini(struct sfxge_softc *sc)
1364{
1365	int index;
1366
1367	index = sc->rxq_count;
1368	while (--index >= 0)
1369		sfxge_rx_qfini(sc, index);
1370
1371	sc->rxq_count = 0;
1372}
1373
1374int
1375sfxge_rx_init(struct sfxge_softc *sc)
1376{
1377	struct sfxge_intr *intr;
1378	int index;
1379	int rc;
1380
1381#ifdef SFXGE_LRO
1382	if (!ISP2(lro_table_size)) {
1383		log(LOG_ERR, "%s=%u must be a power of 2\n",
1384		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1385		rc = EINVAL;
1386		goto fail_lro_table_size;
1387	}
1388
1389	if (lro_idle_ticks == 0)
1390		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1391#endif
1392
1393	intr = &sc->intr;
1394
1395	sc->rxq_count = intr->n_alloc;
1396
1397	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1398	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1399
1400	/* Initialize the receive queue(s) - one per interrupt. */
1401	for (index = 0; index < sc->rxq_count; index++) {
1402		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1403			goto fail;
1404	}
1405
1406	sfxge_rx_stat_init(sc);
1407
1408	return (0);
1409
1410fail:
1411	/* Tear down the receive queue(s). */
1412	while (--index >= 0)
1413		sfxge_rx_qfini(sc, index);
1414
1415	sc->rxq_count = 0;
1416
1417#ifdef SFXGE_LRO
1418fail_lro_table_size:
1419#endif
1420	return (rc);
1421}
1422