1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2010-2016 Solarflare Communications Inc.
5 * All rights reserved.
6 *
7 * This software was developed in part by Philip Paeps under contract for
8 * Solarflare Communications, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright notice,
14 *    this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright notice,
16 *    this list of conditions and the following disclaimer in the documentation
17 *    and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * The views and conclusions contained in the software and documentation are
32 * those of the authors and should not be interpreted as representing official
33 * policies, either expressed or implied, of the FreeBSD Project.
34 */
35
36#include <sys/cdefs.h>
37__FBSDID("$FreeBSD$");
38
39#include "opt_rss.h"
40
41#include <sys/param.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/smp.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/limits.h>
50
51#include <net/ethernet.h>
52#include <net/if.h>
53#include <net/if_vlan_var.h>
54
55#include <netinet/in.h>
56#include <netinet/ip.h>
57#include <netinet/ip6.h>
58#include <netinet/tcp.h>
59
60#include <machine/in_cksum.h>
61
62#ifdef RSS
63#include <net/rss_config.h>
64#endif
65
66#include "common/efx.h"
67
68
69#include "sfxge.h"
70#include "sfxge_rx.h"
71
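/* Refill is triggered from sfxge_rx_qcomplete() once the number of
 * outstanding descriptors drops below nine tenths of the usable queue
 * size given by EFX_RXQ_LIMIT().
 */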
72#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
73
74#ifdef SFXGE_LRO
75
76SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
77	    "Large receive offload (LRO) parameters");
78
79#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
80
81/* Size of the LRO hash table.  Must be a power of 2.  A larger table
82 * means we can accelerate a larger number of streams.
83 */
84static unsigned lro_table_size = 128;
85TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
86SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
87	    &lro_table_size, 0,
88	    "Size of the LRO hash table (must be a power of 2)");
89
90/* Maximum length of a hash chain.  If chains get too long then the lookup
91 * time increases and may exceed the benefit of LRO.
92 */
93static unsigned lro_chain_max = 20;
94TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
95SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
96	    &lro_chain_max, 0,
97	    "The maximum length of a hash chain");
98
/* Maximum time (in ticks) that a connection can be idle before its LRO
100 * state is discarded.
101 */
102static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
103TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
104SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
105	    &lro_idle_ticks, 0,
106	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");
108
109/* Number of packets with payload that must arrive in-order before a
110 * connection is eligible for LRO.  The idea is we should avoid coalescing
111 * segments when the sender is in slow-start because reducing the ACK rate
112 * can damage performance.
113 */
114static int lro_slow_start_packets = 2000;
115TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
116SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
117	    &lro_slow_start_packets, 0,
118	    "Number of packets with payload that must arrive in-order before "
119	    "a connection is eligible for LRO");
120
121/* Number of packets with payload that must arrive in-order following loss
122 * before a connection is eligible for LRO.  The idea is we should avoid
123 * coalescing segments when the sender is recovering from loss, because
124 * reducing the ACK rate can damage performance.
125 */
126static int lro_loss_packets = 20;
127TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
128SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
129	    &lro_loss_packets, 0,
130	    "Number of packets with payload that must arrive in-order "
131	    "following loss before a connection is eligible for LRO");
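
/* All of the LRO parameters above are boot-time tunables (CTLFLAG_RDTUN).
 * A sketch of how they might be overridden from /boot/loader.conf,
 * assuming SFXGE_PARAM() yields the usual "hw.sfxge." prefix:
 *
 *	hw.sfxge.lro.table_size="256"
 *	hw.sfxge.lro.chain_max="40"
 *	hw.sfxge.lro.idle_ticks="200"
 */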
132
133/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
134#define	SFXGE_LRO_L2_ID_VLAN 0x4000
135#define	SFXGE_LRO_L2_ID_IPV6 0x8000
136#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
137#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
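
/* Example: a TCP/IPv6 packet received with VLAN tag 5 is tracked with
 * l2_id = 5 | SFXGE_LRO_L2_ID_VLAN | SFXGE_LRO_L2_ID_IPV6 = 0xc005;
 * the low 12 bits (EVL_VLID_MASK) carry the VLAN ID, if any.
 */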
138
139/* Compare IPv6 addresses, avoiding conditional branches */
140static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
141				   const struct in6_addr *right)
142{
143#if LONG_BIT == 64
144	const uint64_t *left64 = (const uint64_t *)left;
145	const uint64_t *right64 = (const uint64_t *)right;
146	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
147#else
148	return (left->s6_addr32[0] - right->s6_addr32[0]) |
149	       (left->s6_addr32[1] - right->s6_addr32[1]) |
150	       (left->s6_addr32[2] - right->s6_addr32[2]) |
151	       (left->s6_addr32[3] - right->s6_addr32[3]);
152#endif
153}
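
/* Note that ipv6_addr_cmp() is an equality test rather than an ordering:
 * it returns zero if and only if the addresses are identical, and callers
 * only ever OR the result with other differences and test for non-zero.
 */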
154
155#endif	/* SFXGE_LRO */
156
157void
158sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
159{
160
161	rxq->flush_state = SFXGE_FLUSH_DONE;
162}
163
164void
165sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
166{
167
168	rxq->flush_state = SFXGE_FLUSH_FAILED;
169}
170
171#ifdef RSS
172static uint8_t toep_key[RSS_KEYSIZE];
173#else
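/* Built-in 40-byte Toeplitz hash key, used when the kernel RSS option is
 * not configured.  These bytes appear to be the widely published default
 * RSS key shared by many drivers.
 */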
174static uint8_t toep_key[] = {
175	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
176	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
177	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
178	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
179	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
180};
181#endif
182
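/* Callout handler armed by sfxge_rx_schedule_refill().  Rather than
 * refilling the queue directly, post a software "refill" event to the
 * RX queue's event queue so that the refill itself runs in the event
 * handler with the EVQ lock held.
 */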
183static void
184sfxge_rx_post_refill(void *arg)
185{
186	struct sfxge_rxq *rxq = arg;
187	struct sfxge_softc *sc;
188	unsigned int index;
189	struct sfxge_evq *evq;
190	uint16_t magic;
191
192	sc = rxq->sc;
193	index = rxq->index;
194	evq = sc->evq[index];
195	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
196
197	/* This is guaranteed due to the start/stop order of rx and ev */
198	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
199	    ("evq not started"));
200	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
201	    ("rxq not started"));
202	efx_ev_qpost(evq->common, magic);
203}
204
205static void
206sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
207{
208	/* Initially retry after 100 ms, but back off in case of
209	 * repeated failures as we probably have to wait for the
210	 * administrator to raise the pool limit. */
211	if (retrying)
212		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
213	else
214		rxq->refill_delay = hz / 10;
215
216	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
217			     sfxge_rx_post_refill, rxq);
218}
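
/* The backoff above starts at roughly 100 ms (hz / 10) and doubles on
 * each failed attempt, capped at ten seconds.
 */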
219
220#define	SFXGE_REFILL_BATCH  64
221
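/* Fill the RX queue towards "target" with freshly allocated mbuf clusters.
 * Buffer DMA addresses are handed to the common code in batches of
 * SFXGE_REFILL_BATCH via efx_rx_qpost(), and a single efx_rx_qpush() at
 * the end makes the new descriptors visible to the hardware.  If cluster
 * allocation runs dry, or nothing could be pushed at all, a delayed
 * refill is scheduled instead.
 */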
222static void
223sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
224{
225	struct sfxge_softc *sc;
226	unsigned int index;
227	struct sfxge_evq *evq;
228	unsigned int batch;
229	unsigned int rxfill;
230	unsigned int mblksize;
231	int ntodo;
232	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
233
234	sc = rxq->sc;
235	index = rxq->index;
236	evq = sc->evq[index];
237
238	prefetch_read_many(sc->enp);
239	prefetch_read_many(rxq->common);
240
241	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
242
243	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
244		return;
245
246	rxfill = rxq->added - rxq->completed;
247	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
248	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
249	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
250	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
252
253	if (ntodo == 0)
254		return;
255
256	batch = 0;
257	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
258	while (ntodo-- > 0) {
259		unsigned int id;
260		struct sfxge_rx_sw_desc *rx_desc;
261		bus_dma_segment_t seg;
262		struct mbuf *m;
263
264		id = (rxq->added + batch) & rxq->ptr_mask;
265		rx_desc = &rxq->queue[id];
266		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
267
268		rx_desc->flags = EFX_DISCARD;
269		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
270		    sc->rx_cluster_size);
271		if (m == NULL)
272			break;
273
274		/* m_len specifies length of area to be mapped for DMA */
275		m->m_len  = mblksize;
276		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
277						   CACHE_LINE_SIZE);
278		m->m_data += sc->rx_buffer_align;
279
280		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
281		addr[batch++] = seg.ds_addr;
282
283		if (batch == SFXGE_REFILL_BATCH) {
284			efx_rx_qpost(rxq->common, addr, mblksize, batch,
285			    rxq->completed, rxq->added);
286			rxq->added += batch;
287			batch = 0;
288		}
289	}
290
291	if (ntodo != 0)
292		sfxge_rx_schedule_refill(rxq, retrying);
293
294	if (batch != 0) {
295		efx_rx_qpost(rxq->common, addr, mblksize, batch,
296		    rxq->completed, rxq->added);
297		rxq->added += batch;
298	}
299
300	/* Make the descriptors visible to the hardware */
301	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
302			BUS_DMASYNC_PREWRITE);
303
304	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
305
306	/* The queue could still be empty if no descriptors were actually
307	 * pushed, in which case there will be no event to cause the next
308	 * refill, so we must schedule a refill ourselves.
309	 */
	if (rxq->pushed == rxq->completed) {
311		sfxge_rx_schedule_refill(rxq, retrying);
312	}
313}
314
315void
316sfxge_rx_qrefill(struct sfxge_rxq *rxq)
317{
318
319	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
320		return;
321
322	/* Make sure the queue is full */
323	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
324}
325
326static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
327{
328	struct ifnet *ifp = sc->ifnet;
329
330	m->m_pkthdr.rcvif = ifp;
331	m->m_pkthdr.csum_data = 0xffff;
332	ifp->if_input(ifp, m);
333}
334
335static void
336sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
337{
338	struct sfxge_softc *sc = rxq->sc;
339	struct mbuf *m = rx_desc->mbuf;
340	int flags = rx_desc->flags;
341	int csum_flags;
342
343	/* Convert checksum flags */
344	csum_flags = (flags & EFX_CKSUM_IPV4) ?
345		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
346	if (flags & EFX_CKSUM_TCPUDP)
347		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
348
349	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
350		m->m_pkthdr.flowid =
351			efx_pseudo_hdr_hash_get(rxq->common,
352						EFX_RX_HASHALG_TOEPLITZ,
353						mtod(m, uint8_t *));
354		/* The hash covers a 4-tuple for TCP only */
355		M_HASHTYPE_SET(m,
356		    (flags & EFX_PKT_IPV4) ?
357			((flags & EFX_PKT_TCP) ?
358			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
359			((flags & EFX_PKT_TCP) ?
360			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
361	}
362	m->m_data += sc->rx_prefix_size;
363	m->m_len = rx_desc->size - sc->rx_prefix_size;
364	m->m_pkthdr.len = m->m_len;
365	m->m_pkthdr.csum_flags = csum_flags;
366	__sfxge_rx_deliver(sc, rx_desc->mbuf);
367
368	rx_desc->flags = EFX_DISCARD;
369	rx_desc->mbuf = NULL;
370}
371
372#ifdef SFXGE_LRO
373
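/* While segments are being coalesced the IP payload/total length field
 * (ip_len or ip6_plen) is kept in host byte order and grown as segments
 * are merged; it is converted back to network byte order here, and for
 * IPv4 the header checksum is recomputed, before the packet is passed up.
 */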
374static void
375sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
376{
377	struct sfxge_softc *sc = st->sc;
378	struct mbuf *m = c->mbuf;
379	struct tcphdr *c_th;
380	int csum_flags;
381
382	KASSERT(m, ("no mbuf to deliver"));
383
384	++st->n_bursts;
385
386	/* Finish off packet munging and recalculate IP header checksum. */
387	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
388		struct ip *iph = c->nh;
389		iph->ip_len = htons(iph->ip_len);
390		iph->ip_sum = 0;
391		iph->ip_sum = in_cksum_hdr(iph);
392		c_th = (struct tcphdr *)(iph + 1);
393		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
394			      CSUM_IP_CHECKED | CSUM_IP_VALID);
395	} else {
396		struct ip6_hdr *iph = c->nh;
397		iph->ip6_plen = htons(iph->ip6_plen);
398		c_th = (struct tcphdr *)(iph + 1);
399		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
400	}
401
402	c_th->th_win = c->th_last->th_win;
403	c_th->th_ack = c->th_last->th_ack;
404	if (c_th->th_off == c->th_last->th_off) {
405		/* Copy TCP options (take care to avoid going negative). */
406		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
407		memcpy(c_th + 1, c->th_last + 1, optlen);
408	}
409
410	m->m_pkthdr.flowid = c->conn_hash;
411	M_HASHTYPE_SET(m,
412	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
413		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
414
415	m->m_pkthdr.csum_flags = csum_flags;
416	__sfxge_rx_deliver(sc, m);
417
418	c->mbuf = NULL;
419	c->delivered = 1;
420}
421
422/* Drop the given connection, and add it to the free list. */
423static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
424{
425	unsigned bucket;
426
427	KASSERT(!c->mbuf, ("found orphaned mbuf"));
428
429	if (c->next_buf.mbuf != NULL) {
430		sfxge_rx_deliver(rxq, &c->next_buf);
431		LIST_REMOVE(c, active_link);
432	}
433
434	bucket = c->conn_hash & rxq->lro.conns_mask;
435	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
436	--rxq->lro.conns_n[bucket];
437	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
438	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
439}
440
441/* Stop tracking connections that have gone idle in order to keep hash
442 * chains short.
443 */
444static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
445{
446	struct sfxge_lro_conn *c;
447	unsigned i;
448
449	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
450		("found active connections"));
451
452	rxq->lro.last_purge_ticks = now;
453	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
454		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
455			continue;
456
457		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
458		if (now - c->last_pkt_ticks > lro_idle_ticks) {
459			++rxq->lro.n_drop_idle;
460			sfxge_lro_drop(rxq, c);
461		}
462	}
463}
464
465static void
466sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
467		struct mbuf *mbuf, struct tcphdr *th)
468{
469	struct tcphdr *c_th;
470
471	/* Tack the new mbuf onto the chain. */
472	KASSERT(!mbuf->m_next, ("mbuf already chained"));
473	c->mbuf_tail->m_next = mbuf;
474	c->mbuf_tail = mbuf;
475
476	/* Increase length appropriately */
477	c->mbuf->m_pkthdr.len += mbuf->m_len;
478
479	/* Update the connection state flags */
480	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
481		struct ip *iph = c->nh;
482		iph->ip_len += mbuf->m_len;
483		c_th = (struct tcphdr *)(iph + 1);
484	} else {
485		struct ip6_hdr *iph = c->nh;
486		iph->ip6_plen += mbuf->m_len;
487		c_th = (struct tcphdr *)(iph + 1);
488	}
489	c_th->th_flags |= (th->th_flags & TH_PUSH);
490	c->th_last = th;
491	++st->n_merges;
492
	/* Deliver the merged packet now if appending another segment of up
	 * to 9200 bytes could overflow the 16-bit IP total length.
	 */
496	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
497		sfxge_lro_deliver(st, c);
498}
499
500static void
501sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
502		struct mbuf *mbuf, void *nh, struct tcphdr *th)
503{
504	/* Start the chain */
505	c->mbuf = mbuf;
506	c->mbuf_tail = c->mbuf;
507	c->nh = nh;
508	c->th_last = th;
509
510	mbuf->m_pkthdr.len = mbuf->m_len;
511
512	/* Mangle header fields for later processing */
513	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
514		struct ip *iph = nh;
515		iph->ip_len = ntohs(iph->ip_len);
516	} else {
517		struct ip6_hdr *iph = nh;
518		iph->ip6_plen = ntohs(iph->ip6_plen);
519	}
520}
521
522/* Try to merge or otherwise hold or deliver (as appropriate) the
523 * packet buffered for this connection (c->next_buf).  Return a flag
524 * indicating whether the connection is still active for LRO purposes.
525 */
526static int
527sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
528{
529	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
530	char *eh = c->next_eh;
531	int data_length, hdr_length, dont_merge;
532	unsigned th_seq, pkt_length;
533	struct tcphdr *th;
534	unsigned now;
535
536	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
537		struct ip *iph = c->next_nh;
538		th = (struct tcphdr *)(iph + 1);
539		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
540	} else {
541		struct ip6_hdr *iph = c->next_nh;
542		th = (struct tcphdr *)(iph + 1);
543		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
544	}
545
546	hdr_length = (char *) th + th->th_off * 4 - eh;
547	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
548		       hdr_length);
549	th_seq = ntohl(th->th_seq);
550	dont_merge = ((data_length <= 0)
551		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
552
	/* Check for TCP options other than an aligned timestamp: th_off == 8
	 * means 20 bytes of base header plus a 12-byte NOP/NOP/timestamp
	 * block.  Any other options inhibit merging.
	 */
554	if (th->th_off != 5) {
555		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
556		if (th->th_off == 8 &&
557		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
558					(TCPOPT_NOP << 16) |
559					(TCPOPT_TIMESTAMP << 8) |
560					TCPOLEN_TIMESTAMP)) {
561			/* timestamp option -- okay */
562		} else {
563			dont_merge = 1;
564		}
565	}
566
567	if (__predict_false(th_seq != c->next_seq)) {
568		/* Out-of-order, so start counting again. */
569		if (c->mbuf != NULL)
570			sfxge_lro_deliver(&rxq->lro, c);
571		c->n_in_order_pkts -= lro_loss_packets;
572		c->next_seq = th_seq + data_length;
573		++rxq->lro.n_misorder;
574		goto deliver_buf_out;
575	}
576	c->next_seq = th_seq + data_length;
577
578	now = ticks;
579	if (now - c->last_pkt_ticks > lro_idle_ticks) {
580		++rxq->lro.n_drop_idle;
581		if (c->mbuf != NULL)
582			sfxge_lro_deliver(&rxq->lro, c);
583		sfxge_lro_drop(rxq, c);
584		return (0);
585	}
586	c->last_pkt_ticks = ticks;
587
588	if (c->n_in_order_pkts < lro_slow_start_packets) {
589		/* May be in slow-start, so don't merge. */
590		++rxq->lro.n_slow_start;
591		++c->n_in_order_pkts;
592		goto deliver_buf_out;
593	}
594
595	if (__predict_false(dont_merge)) {
596		if (c->mbuf != NULL)
597			sfxge_lro_deliver(&rxq->lro, c);
598		if (th->th_flags & (TH_FIN | TH_RST)) {
599			++rxq->lro.n_drop_closed;
600			sfxge_lro_drop(rxq, c);
601			return (0);
602		}
603		goto deliver_buf_out;
604	}
605
606	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
607
608	if (__predict_true(c->mbuf != NULL)) {
609		/* Remove headers and any padding */
610		rx_buf->mbuf->m_data += hdr_length;
611		rx_buf->mbuf->m_len = data_length;
612
613		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
614	} else {
615		/* Remove any padding */
616		rx_buf->mbuf->m_len = pkt_length;
617
618		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
619	}
620
621	rx_buf->mbuf = NULL;
622	return (1);
623
624 deliver_buf_out:
625	sfxge_rx_deliver(rxq, rx_buf);
626	return (1);
627}
628
629static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
630			       uint16_t l2_id, void *nh, struct tcphdr *th)
631{
632	unsigned bucket = conn_hash & st->conns_mask;
633	struct sfxge_lro_conn *c;
634
635	if (st->conns_n[bucket] >= lro_chain_max) {
636		++st->n_too_many;
637		return;
638	}
639
640	if (!TAILQ_EMPTY(&st->free_conns)) {
641		c = TAILQ_FIRST(&st->free_conns);
642		TAILQ_REMOVE(&st->free_conns, c, link);
643	} else {
644		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
645		if (c == NULL)
646			return;
647		c->mbuf = NULL;
648		c->next_buf.mbuf = NULL;
649	}
650
651	/* Create the connection tracking data */
652	++st->conns_n[bucket];
653	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
654	c->l2_id = l2_id;
655	c->conn_hash = conn_hash;
656	c->source = th->th_sport;
657	c->dest = th->th_dport;
658	c->n_in_order_pkts = 0;
659	c->last_pkt_ticks = *(volatile int *)&ticks;
660	c->delivered = 0;
661	++st->n_new_stream;
662	/* NB. We don't initialise c->next_seq, and it doesn't matter what
663	 * value it has.  Most likely the next packet received for this
664	 * connection will not match -- no harm done.
665	 */
666}
667
668/* Process mbuf and decide whether to dispatch it to the stack now or
669 * later.
670 */
671static void
672sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
673{
674	struct sfxge_softc *sc = rxq->sc;
675	struct mbuf *m = rx_buf->mbuf;
676	struct ether_header *eh;
677	struct sfxge_lro_conn *c;
678	uint16_t l2_id;
679	uint16_t l3_proto;
680	void *nh;
681	struct tcphdr *th;
682	uint32_t conn_hash;
683	unsigned bucket;
684
685	/* Get the hardware hash */
686	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
687					    EFX_RX_HASHALG_TOEPLITZ,
688					    mtod(m, uint8_t *));
689
690	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
691	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
692		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
693		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
694			SFXGE_LRO_L2_ID_VLAN;
695		l3_proto = veh->evl_proto;
696		nh = veh + 1;
697	} else {
698		l2_id = 0;
699		l3_proto = eh->ether_type;
700		nh = eh + 1;
701	}
702
703	/* Check whether this is a suitable packet (unfragmented
704	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
705	 * length, and compute a hash if necessary.  If not, return.
706	 */
707	if (l3_proto == htons(ETHERTYPE_IP)) {
708		struct ip *iph = nh;
709
710		KASSERT(iph->ip_p == IPPROTO_TCP,
711		    ("IPv4 protocol is not TCP, but packet marker is set"));
712		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
713		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
714			goto deliver_now;
715		th = (struct tcphdr *)(iph + 1);
716	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
717		struct ip6_hdr *iph = nh;
718
719		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
720		    ("IPv6 next header is not TCP, but packet marker is set"));
721		l2_id |= SFXGE_LRO_L2_ID_IPV6;
722		th = (struct tcphdr *)(iph + 1);
723	} else {
724		goto deliver_now;
725	}
726
727	bucket = conn_hash & rxq->lro.conns_mask;
728
729	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
730		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
731			continue;
732		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
733			continue;
734		if (c->mbuf != NULL) {
735			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
736				struct ip *c_iph, *iph = nh;
737				c_iph = c->nh;
738				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
739				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
740					continue;
741			} else {
742				struct ip6_hdr *c_iph, *iph = nh;
743				c_iph = c->nh;
744				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
745				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
746					continue;
747			}
748		}
749
750		/* Re-insert at head of list to reduce lookup time. */
751		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
752		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
753
754		if (c->next_buf.mbuf != NULL) {
755			if (!sfxge_lro_try_merge(rxq, c))
756				goto deliver_now;
757		} else {
758			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
759			    active_link);
760		}
761		c->next_buf = *rx_buf;
762		c->next_eh = eh;
763		c->next_nh = nh;
764
765		rx_buf->mbuf = NULL;
766		rx_buf->flags = EFX_DISCARD;
767		return;
768	}
769
770	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
771 deliver_now:
772	sfxge_rx_deliver(rxq, rx_buf);
773}
774
775static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
776{
777	struct sfxge_lro_state *st = &rxq->lro;
778	struct sfxge_lro_conn *c;
779	unsigned t;
780
781	while (!LIST_EMPTY(&st->active_conns)) {
782		c = LIST_FIRST(&st->active_conns);
783		if (!c->delivered && c->mbuf != NULL)
784			sfxge_lro_deliver(st, c);
785		if (sfxge_lro_try_merge(rxq, c)) {
786			if (c->mbuf != NULL)
787				sfxge_lro_deliver(st, c);
788			LIST_REMOVE(c, active_link);
789		}
790		c->delivered = 0;
791	}
792
793	t = *(volatile int *)&ticks;
794	if (__predict_false(t != st->last_purge_ticks))
795		sfxge_lro_purge_idle(rxq, t);
796}
797
798#else	/* !SFXGE_LRO */
799
800static void
801sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
802{
803}
804
805static void
806sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
807{
808}
809
810#endif	/* SFXGE_LRO */
811
812void
813sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
814{
815	struct sfxge_softc *sc = rxq->sc;
816	int if_capenable = sc->ifnet->if_capenable;
817	int lro_enabled = if_capenable & IFCAP_LRO;
818	unsigned int index;
819	struct sfxge_evq *evq;
820	unsigned int completed;
821	unsigned int level;
822	struct mbuf *m;
823	struct sfxge_rx_sw_desc *prev = NULL;
824
825	index = rxq->index;
826	evq = sc->evq[index];
827
828	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
829
830	completed = rxq->completed;
831	while (completed != rxq->pending) {
832		unsigned int id;
833		struct sfxge_rx_sw_desc *rx_desc;
834
835		id = completed++ & rxq->ptr_mask;
836		rx_desc = &rxq->queue[id];
837		m = rx_desc->mbuf;
838
839		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
840			goto discard;
841
842		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
843			goto discard;
844
845		/* Read the length from the pseudo header if required */
846		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
847			uint16_t tmp_size;
848			int rc;
849			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
850							   mtod(m, uint8_t *),
851							   &tmp_size);
852			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
853			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
854		}
855
856		prefetch_read_many(mtod(m, caddr_t));
857
858		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
859		case EFX_PKT_IPV4:
860			if (~if_capenable & IFCAP_RXCSUM)
861				rx_desc->flags &=
862				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
863			break;
864		case EFX_PKT_IPV6:
865			if (~if_capenable & IFCAP_RXCSUM_IPV6)
866				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
867			break;
868		case 0:
869			/* Check for loopback packets */
870			{
871				struct ether_header *etherhp;
872
873				/*LINTED*/
874				etherhp = mtod(m, struct ether_header *);
875
876				if (etherhp->ether_type ==
877				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
878					EFSYS_PROBE(loopback);
879
880					rxq->loopback++;
881					goto discard;
882				}
883			}
884			break;
885		default:
886			KASSERT(B_FALSE,
887			    ("Rx descriptor with both IPv4 and IPv6 flags"));
888			goto discard;
889		}
890
891		/* Pass packet up the stack or into LRO (pipelined) */
892		if (prev != NULL) {
893			if (lro_enabled &&
894			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
895			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
896				sfxge_lro(rxq, prev);
897			else
898				sfxge_rx_deliver(rxq, prev);
899		}
900		prev = rx_desc;
901		continue;
902
903discard:
904		/* Return the packet to the pool */
905		m_free(m);
906		rx_desc->mbuf = NULL;
907	}
908	rxq->completed = completed;
909
910	level = rxq->added - rxq->completed;
911
912	/* Pass last packet up the stack or into LRO */
913	if (prev != NULL) {
914		if (lro_enabled &&
915		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
916		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
917			sfxge_lro(rxq, prev);
918		else
919			sfxge_rx_deliver(rxq, prev);
920	}
921
922	/*
923	 * If there are any pending flows and this is the end of the
924	 * poll then they must be completed.
925	 */
926	if (eop)
927		sfxge_lro_end_of_burst(rxq);
928
929	/* Top up the queue if necessary */
930	if (level < rxq->refill_threshold)
931		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
932}
933
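/* Flush and stop a started RX queue.  The hardware flush is polled for up
 * to two seconds (20 x 100 ms) per attempt and retried up to three times;
 * a persistent timeout or failure is logged and the queue is then treated
 * as flushed so that the remaining buffers can be reclaimed through
 * sfxge_rx_qcomplete().
 */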
934static void
935sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
936{
937	struct sfxge_rxq *rxq;
938	struct sfxge_evq *evq;
939	unsigned int count;
940	unsigned int retry = 3;
941
942	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
943
944	rxq = sc->rxq[index];
945	evq = sc->evq[index];
946
947	SFXGE_EVQ_LOCK(evq);
948
949	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
950	    ("rxq not started"));
951
952	rxq->init_state = SFXGE_RXQ_INITIALIZED;
953
954	callout_stop(&rxq->refill_callout);
955
956	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
957		rxq->flush_state = SFXGE_FLUSH_PENDING;
958
959		SFXGE_EVQ_UNLOCK(evq);
960
961		/* Flush the receive queue */
962		if (efx_rx_qflush(rxq->common) != 0) {
963			SFXGE_EVQ_LOCK(evq);
964			rxq->flush_state = SFXGE_FLUSH_FAILED;
965			break;
966		}
967
968		count = 0;
969		do {
970			/* Spin for 100 ms */
971			DELAY(100000);
972
973			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
974				break;
975
976		} while (++count < 20);
977
978		SFXGE_EVQ_LOCK(evq);
979
980		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
981			/* Flush timeout - neither done nor failed */
982			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
983			    device_get_nameunit(sc->dev), index);
984			rxq->flush_state = SFXGE_FLUSH_DONE;
985		}
986		retry--;
987	}
988	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
989		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
990		    device_get_nameunit(sc->dev), index);
991		rxq->flush_state = SFXGE_FLUSH_DONE;
992	}
993
994	rxq->pending = rxq->added;
995	sfxge_rx_qcomplete(rxq, B_TRUE);
996
997	KASSERT(rxq->completed == rxq->pending,
998	    ("rxq->completed != rxq->pending"));
999
1000	rxq->added = 0;
1001	rxq->pushed = 0;
1002	rxq->pending = 0;
1003	rxq->completed = 0;
1004	rxq->loopback = 0;
1005
1006	/* Destroy the common code receive queue. */
1007	efx_rx_qdestroy(rxq->common);
1008
1009	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1010	    EFX_RXQ_NBUFS(sc->rxq_entries));
1011
1012	SFXGE_EVQ_UNLOCK(evq);
1013}
1014
1015static int
1016sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1017{
1018	struct sfxge_rxq *rxq;
1019	efsys_mem_t *esmp;
1020	struct sfxge_evq *evq;
1021	int rc;
1022
1023	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1024
1025	rxq = sc->rxq[index];
1026	esmp = &rxq->mem;
1027	evq = sc->evq[index];
1028
1029	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1030	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1031	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1032	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1033
1034	/* Program the buffer table. */
1035	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1036	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1037		return (rc);
1038
1039	/* Create the common code receive queue. */
1040	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1041	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1042	    &rxq->common)) != 0)
1043		goto fail;
1044
1045	SFXGE_EVQ_LOCK(evq);
1046
1047	/* Enable the receive queue. */
1048	efx_rx_qenable(rxq->common);
1049
1050	rxq->init_state = SFXGE_RXQ_STARTED;
1051	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1052
1053	/* Try to fill the queue from the pool. */
1054	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1055
1056	SFXGE_EVQ_UNLOCK(evq);
1057
1058	return (0);
1059
1060fail:
1061	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1062	    EFX_RXQ_NBUFS(sc->rxq_entries));
1063	return (rc);
1064}
1065
1066void
1067sfxge_rx_stop(struct sfxge_softc *sc)
1068{
1069	int index;
1070
1071	efx_mac_filter_default_rxq_clear(sc->enp);
1072
1073	/* Stop the receive queue(s) */
1074	index = sc->rxq_count;
1075	while (--index >= 0)
1076		sfxge_rx_qstop(sc, index);
1077
1078	sc->rx_prefix_size = 0;
1079	sc->rx_buffer_size = 0;
1080
1081	efx_rx_fini(sc->enp);
1082}
1083
1084int
1085sfxge_rx_start(struct sfxge_softc *sc)
1086{
1087	struct sfxge_intr *intr;
1088	const efx_nic_cfg_t *encp;
1089	size_t hdrlen, align, reserved;
1090	int index;
1091	int rc;
1092
1093	intr = &sc->intr;
1094
1095	/* Initialize the common code receive module. */
1096	if ((rc = efx_rx_init(sc->enp)) != 0)
1097		return (rc);
1098
1099	encp = efx_nic_cfg_get(sc->enp);
1100	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1101
1102	/* Calculate the receive packet buffer size. */
1103	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1104
	/* Ensure IP headers are 32-bit aligned */
1106	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1107	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1108
1109	sc->rx_buffer_size += sc->rx_buffer_align;
1110
1111	/* Align end of packet buffer for RX DMA end padding */
1112	align = MAX(1, encp->enc_rx_buf_align_end);
1113	EFSYS_ASSERT(ISP2(align));
1114	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1115
	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line.
	 */
1120	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1121
1122	/* Select zone for packet buffers */
1123	if (reserved <= MCLBYTES)
1124		sc->rx_cluster_size = MCLBYTES;
1125	else if (reserved <= MJUMPAGESIZE)
1126		sc->rx_cluster_size = MJUMPAGESIZE;
1127	else if (reserved <= MJUM9BYTES)
1128		sc->rx_cluster_size = MJUM9BYTES;
1129	else
1130		sc->rx_cluster_size = MJUM16BYTES;
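
	/*
	 * To summarise the calculation above: each receive buffer holds
	 * [padding to 32-bit align the IP header][hardware prefix][frame]
	 * [end padding required by the NIC], plus up to a cache line of
	 * slack so the buffer start can be aligned at run time, and the
	 * smallest mbuf cluster zone that fits this reserved size is used.
	 */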
1131
1132	/*
1133	 * Set up the scale table.  Enable all hash types and hash insertion.
1134	 */
1135	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1136#ifdef RSS
1137		sc->rx_indir_table[index] =
1138			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1139#else
1140		sc->rx_indir_table[index] = index % sc->rxq_count;
1141#endif
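	/*
	 * Without the kernel RSS option the indirection table maps entries
	 * to receive queues round-robin, e.g. with four queues it reads
	 * 0, 1, 2, 3, 0, 1, 2, 3, ...
	 */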
1142	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1143				       nitems(sc->rx_indir_table))) != 0)
1144		goto fail;
1145	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1146	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1147	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1148
1149#ifdef RSS
1150	rss_getkey(toep_key);
1151#endif
1152	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1153				       sizeof(toep_key))) != 0)
1154		goto fail;
1155
1156	/* Start the receive queue(s). */
1157	for (index = 0; index < sc->rxq_count; index++) {
1158		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1159			goto fail2;
1160	}
1161
1162	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1163					    sc->intr.n_alloc > 1);
1164	if (rc != 0)
1165		goto fail3;
1166
1167	return (0);
1168
1169fail3:
1170fail2:
1171	while (--index >= 0)
1172		sfxge_rx_qstop(sc, index);
1173
1174fail:
1175	efx_rx_fini(sc->enp);
1176
1177	return (rc);
1178}
1179
1180#ifdef SFXGE_LRO
1181
1182static void sfxge_lro_init(struct sfxge_rxq *rxq)
1183{
1184	struct sfxge_lro_state *st = &rxq->lro;
1185	unsigned i;
1186
1187	st->conns_mask = lro_table_size - 1;
1188	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1189		("lro_table_size must be a power of 2"));
1190	st->sc = rxq->sc;
1191	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1192			   M_SFXGE, M_WAITOK);
1193	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1194			     M_SFXGE, M_WAITOK);
1195	for (i = 0; i <= st->conns_mask; ++i) {
1196		TAILQ_INIT(&st->conns[i]);
1197		st->conns_n[i] = 0;
1198	}
1199	LIST_INIT(&st->active_conns);
1200	TAILQ_INIT(&st->free_conns);
1201}
1202
1203static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1204{
1205	struct sfxge_lro_state *st = &rxq->lro;
1206	struct sfxge_lro_conn *c;
1207	unsigned i;
1208
1209	/* Return cleanly if sfxge_lro_init() has not been called. */
1210	if (st->conns == NULL)
1211		return;
1212
1213	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1214
1215	for (i = 0; i <= st->conns_mask; ++i) {
1216		while (!TAILQ_EMPTY(&st->conns[i])) {
1217			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1218			sfxge_lro_drop(rxq, c);
1219		}
1220	}
1221
1222	while (!TAILQ_EMPTY(&st->free_conns)) {
1223		c = TAILQ_FIRST(&st->free_conns);
1224		TAILQ_REMOVE(&st->free_conns, c, link);
1225		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1226		free(c, M_SFXGE);
1227	}
1228
1229	free(st->conns_n, M_SFXGE);
1230	free(st->conns, M_SFXGE);
1231	st->conns = NULL;
1232}
1233
1234#else
1235
1236static void
1237sfxge_lro_init(struct sfxge_rxq *rxq)
1238{
1239}
1240
1241static void
1242sfxge_lro_fini(struct sfxge_rxq *rxq)
1243{
1244}
1245
1246#endif	/* SFXGE_LRO */
1247
1248static void
1249sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1250{
1251	struct sfxge_rxq *rxq;
1252
1253	rxq = sc->rxq[index];
1254
1255	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1256	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1257
1258	/* Free the context array and the flow table. */
1259	free(rxq->queue, M_SFXGE);
1260	sfxge_lro_fini(rxq);
1261
1262	/* Release DMA memory. */
1263	sfxge_dma_free(&rxq->mem);
1264
1265	sc->rxq[index] = NULL;
1266
1267	free(rxq, M_SFXGE);
1268}
1269
1270static int
1271sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1272{
1273	struct sfxge_rxq *rxq;
1274	struct sfxge_evq *evq;
1275	efsys_mem_t *esmp;
1276	int rc;
1277
1278	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1279
1280	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1281	rxq->sc = sc;
1282	rxq->index = index;
1283	rxq->entries = sc->rxq_entries;
1284	rxq->ptr_mask = rxq->entries - 1;
1285	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1286
1287	sc->rxq[index] = rxq;
1288	esmp = &rxq->mem;
1289
1290	evq = sc->evq[index];
1291
1292	/* Allocate and zero DMA space. */
1293	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1294		return (rc);
1295
1296	/* Allocate buffer table entries. */
1297	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1298				 &rxq->buf_base_id);
1299
1300	/* Allocate the context array and the flow table. */
1301	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1302	    M_SFXGE, M_WAITOK | M_ZERO);
1303	sfxge_lro_init(rxq);
1304
1305	callout_init(&rxq->refill_callout, 1);
1306
1307	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1308
1309	return (0);
1310}
1311
1312static const struct {
1313	const char *name;
1314	size_t offset;
1315} sfxge_rx_stats[] = {
1316#define	SFXGE_RX_STAT(name, member) \
1317	{ #name, offsetof(struct sfxge_rxq, member) }
1318#ifdef SFXGE_LRO
1319	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1320	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1321	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1322	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1323	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1324	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1325	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1326	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1327#endif
1328};
1329
1330static int
1331sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1332{
1333	struct sfxge_softc *sc = arg1;
1334	unsigned int id = arg2;
1335	unsigned int sum, index;
1336
1337	/* Sum across all RX queues */
1338	sum = 0;
1339	for (index = 0; index < sc->rxq_count; index++)
1340		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1341					 sfxge_rx_stats[id].offset);
1342
1343	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1344}
1345
1346static void
1347sfxge_rx_stat_init(struct sfxge_softc *sc)
1348{
1349	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1350	struct sysctl_oid_list *stat_list;
1351	unsigned int id;
1352
1353	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1354
1355	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1356		SYSCTL_ADD_PROC(
1357			ctx, stat_list,
1358			OID_AUTO, sfxge_rx_stats[id].name,
1359			CTLTYPE_UINT|CTLFLAG_RD,
1360			sc, id, sfxge_rx_stat_handler, "IU",
1361			"");
1362	}
1363}
1364
1365void
1366sfxge_rx_fini(struct sfxge_softc *sc)
1367{
1368	int index;
1369
1370	index = sc->rxq_count;
1371	while (--index >= 0)
1372		sfxge_rx_qfini(sc, index);
1373
1374	sc->rxq_count = 0;
1375}
1376
1377int
1378sfxge_rx_init(struct sfxge_softc *sc)
1379{
1380	struct sfxge_intr *intr;
1381	int index;
1382	int rc;
1383
1384#ifdef SFXGE_LRO
1385	if (!ISP2(lro_table_size)) {
1386		log(LOG_ERR, "%s=%u must be power of 2",
1387		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1388		rc = EINVAL;
1389		goto fail_lro_table_size;
1390	}
1391
1392	if (lro_idle_ticks == 0)
1393		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1394#endif
1395
1396	intr = &sc->intr;
1397
1398	sc->rxq_count = intr->n_alloc;
1399
1400	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1401	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1402
1403	/* Initialize the receive queue(s) - one per interrupt. */
1404	for (index = 0; index < sc->rxq_count; index++) {
1405		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1406			goto fail;
1407	}
1408
1409	sfxge_rx_stat_init(sc);
1410
1411	return (0);
1412
1413fail:
1414	/* Tear down the receive queue(s). */
1415	while (--index >= 0)
1416		sfxge_rx_qfini(sc, index);
1417
1418	sc->rxq_count = 0;
1419
1420#ifdef SFXGE_LRO
1421fail_lro_table_size:
1422#endif
1423	return (rc);
1424}
1425