/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 234086 2012-04-10 06:52:21Z glebius $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define RX_REFILL_THRESHOLD (EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
#define RX_REFILL_THRESHOLD_2 (RX_REFILL_THRESHOLD / 2)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

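/* Flush-state callbacks for the event handling code: record that a hardware
 * RX queue flush has completed or failed (polled by sfxge_rx_qstop()).
 */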
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

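/* 40-byte Toeplitz hash key used to configure receive-side scaling (RSS). */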
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

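/* Callout handler: request a refill by posting a software (magic) event to
 * the queue's event queue, so the refill itself runs in the event
 * processing context.
 */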
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return m;
}

#define	SFXGE_REFILL_BATCH  64

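/* Refill the RX ring towards the requested fill target, posting buffers to
 * the hardware in batches of SFXGE_REFILL_BATCH.  If mbuf allocation fails,
 * schedule a retry via the refill callout.
 */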
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
}

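/* Hand a received packet to the network stack via the interface input routine. */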
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.header = m->m_data;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}
#endif
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

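/* Deliver a coalesced LRO chain to the stack: restore the IP length fields
 * mangled in sfxge_lro_start(), recompute the IPv4 header checksum and copy
 * the window, ACK and any options from the last merged segment's TCP header.
 */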
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

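/* Append an in-order segment to an existing LRO chain, extending the IP
 * payload length and propagating TH_PUSH; deliver early if a further
 * segment could overflow the 16-bit IP length.
 */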
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

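/* Begin a new LRO chain with this packet.  The IP length fields are
 * converted to host order here and restored in sfxge_lro_deliver().
 */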
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return 0;
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return 0;
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return 1;

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return 1;
}

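/* Allocate (or recycle from the free list) tracking state for a new
 * candidate LRO stream and insert it at the head of its hash bucket.
 */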
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

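/* At the end of an event queue poll, deliver or merge the packet buffered
 * for each active connection, then periodically purge idle connections.
 */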
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

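/* Process completed RX descriptors between the completed and pending
 * indices: drop discards and loopback packets, pass the rest to LRO or
 * straight to the stack, and top up the ring if it has drained below the
 * refill threshold.
 */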
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
}

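/* Stop a started RX queue: initiate a hardware flush and wait (retrying on
 * failure) for it to complete, then drain any outstanding descriptors and
 * destroy the common code queue.
 */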
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	mtx_unlock(&evq->lock);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));

	mtx_unlock(&evq->lock);
}

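/* Start an initialized RX queue: program the buffer table, create and
 * enable the common code queue, then fill it with receive buffers.
 */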
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS))) != 0)
		return rc;

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
	return rc;
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

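/* Allocate and initialise the LRO hash table and connection lists for one
 * RX queue.
 */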
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

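/* Allocate and initialise the software state, DMA memory and buffer table
 * entries for one RX queue.
 */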
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
		return (rc);
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

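/* Sysctl handler reporting an LRO statistic summed across all RX queues. */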
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->intr.n_alloc; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return SYSCTL_OUT(req, &sum, sizeof(sum));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0;
	     id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
	     id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	intr = &sc->intr;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	return (rc);
}