sfxge_rx.c revision 283206
/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/sfxge/sfxge_rx.c 283206 2015-05-21 09:05:13Z arybchik $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/syslog.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

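/* Refill threshold: top up the queue once the fill level drops below 90%
 * of the RX queue limit.
 */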
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is that we should avoid
 * coalescing segments when the sender is in slow-start because reducing
 * the ACK rate can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is that we should
 * avoid coalescing segments when the sender is recovering from loss,
 * because reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

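/* Toeplitz hash key used for receive-side scaling (RSS); programmed into
 * the controller in sfxge_rx_start().
 */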
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

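	/* Post a software (magic) event so that the refill is performed
	 * from the event queue context rather than from this callout.
	 */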
	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH  64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	/* The hash covers a 4-tuple for TCP only */
	if (flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
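	/* Strip off the prefix added by the hardware before handing the
	 * packet to the stack.
	 */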
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

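	/* Reuse a connection from the free list if possible; otherwise
	 * allocate a new one.
	 */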
	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

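	/* Walk the hash chain looking for a match.  As in ipv6_addr_cmp(),
	 * the subtraction/OR idiom yields a non-zero value on any mismatch
	 * without using conditional branches.
	 */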
	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
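	/* Wait for up to 2 seconds (20 x 100 ms) for the flush to complete. */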
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be a power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}