t4_sge.c revision 279092
1185377Ssam/*-
2187831Ssam * Copyright (c) 2011 Chelsio Communications, Inc.
3185377Ssam * All rights reserved.
4185377Ssam * Written by: Navdeep Parhar <np@FreeBSD.org>
5185377Ssam *
6185377Ssam * Redistribution and use in source and binary forms, with or without
7185377Ssam * modification, are permitted provided that the following conditions
8185377Ssam * are met:
9185377Ssam * 1. Redistributions of source code must retain the above copyright
10185377Ssam *    notice, this list of conditions and the following disclaimer.
11185377Ssam * 2. Redistributions in binary form must reproduce the above copyright
12185377Ssam *    notice, this list of conditions and the following disclaimer in the
13185377Ssam *    documentation and/or other materials provided with the distribution.
14185377Ssam *
15185377Ssam * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16185377Ssam * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17187831Ssam * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18185377Ssam * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19185377Ssam * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20185377Ssam * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21185377Ssam * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22185377Ssam * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23185377Ssam * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24185377Ssam * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25185377Ssam * SUCH DAMAGE.
26185377Ssam */
27185377Ssam
28185377Ssam#include <sys/cdefs.h>
29185377Ssam__FBSDID("$FreeBSD: head/sys/dev/cxgbe/t4_sge.c 279092 2015-02-20 22:57:54Z np $");
30185377Ssam
31185377Ssam#include "opt_inet.h"
32235972Sadrian#include "opt_inet6.h"
33235972Sadrian
34188979Ssam#include <sys/types.h>
35188979Ssam#include <sys/eventhandler.h>
36185377Ssam#include <sys/mbuf.h>
37185377Ssam#include <sys/socket.h>
38185377Ssam#include <sys/kernel.h>
39185377Ssam#include <sys/malloc.h>
40185377Ssam#include <sys/queue.h>
41185377Ssam#include <sys/sbuf.h>
42185377Ssam#include <sys/taskqueue.h>
43185377Ssam#include <sys/time.h>
44185377Ssam#include <sys/sglist.h>
45185377Ssam#include <sys/sysctl.h>
46188979Ssam#include <sys/smp.h>
47188979Ssam#include <sys/counter.h>
48185377Ssam#include <net/bpf.h>
49185377Ssam#include <net/ethernet.h>
50185380Ssam#include <net/if.h>
51185380Ssam#include <net/if_vlan_var.h>
52185377Ssam#include <netinet/in.h>
53185377Ssam#include <netinet/ip.h>
54185377Ssam#include <netinet/ip6.h>
55185377Ssam#include <netinet/tcp.h>
56185377Ssam#include <machine/md_var.h>
57185377Ssam#include <vm/vm.h>
58185377Ssam#include <vm/pmap.h>
59185377Ssam#ifdef DEV_NETMAP
60185377Ssam#include <machine/bus.h>
61185377Ssam#include <sys/selinfo.h>
62185377Ssam#include <net/if_var.h>
63185377Ssam#include <net/netmap.h>
64185377Ssam#include <dev/netmap/netmap_kern.h>
65185377Ssam#endif
66185377Ssam
67185377Ssam#include "common/common.h"
68185377Ssam#include "common/t4_regs.h"
69185377Ssam#include "common/t4_regs_values.h"
70185377Ssam#include "common/t4_msg.h"
71185377Ssam#include "t4_mp_ring.h"
72185377Ssam
73217621Sadrian#ifdef T4_PKT_TIMESTAMP
74238607Sadrian#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
75238607Sadrian#else
76238607Sadrian#define RX_COPY_THRESHOLD MINCLSIZE
77185377Ssam#endif
78185377Ssam
79185377Ssam/*
80185377Ssam * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
81185377Ssam * 0-7 are valid values.
82185377Ssam */
83185377Ssamint fl_pktshift = 2;
84185377SsamTUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
85185377Ssam
86185377Ssam/*
87185377Ssam * Pad ethernet payload up to this boundary.
88185377Ssam * -1: driver should figure out a good value.
89185377Ssam *  0: disable padding.
90185377Ssam *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
91185377Ssam */
92217684Sadrianint fl_pad = -1;
93217684SadrianTUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
94185377Ssam
95185377Ssam/*
96185377Ssam * Status page length.
97185377Ssam * -1: driver should figure out a good value.
98185377Ssam *  64 or 128 are the only other valid values.
99185377Ssam */
100185377Ssamint spg_len = -1;
101185377SsamTUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
102185377Ssam
103185377Ssam/*
104185380Ssam * Congestion drops.
105185377Ssam * -1: no congestion feedback (not recommended).
106185377Ssam *  0: backpressure the channel instead of dropping packets right away.
107185377Ssam *  1: no backpressure, drop packets for the congested queue immediately.
108185377Ssam */
109185377Ssamstatic int cong_drop = 0;
110185377SsamTUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
111185377Ssam
112185377Ssam/*
113185377Ssam * Deliver multiple frames in the same free list buffer if they fit.
114243424Sadrian * -1: let the driver decide whether to enable buffer packing or not.
115185377Ssam *  0: disable buffer packing.
116185377Ssam *  1: enable buffer packing.
117185377Ssam */
118185377Ssamstatic int buffer_packing = -1;
119185377SsamTUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
120185377Ssam
121185377Ssam/*
122185377Ssam * Start next frame in a packed buffer at this boundary.
123185377Ssam * -1: driver should figure out a good value.
124185377Ssam * T4: driver will ignore this and use the same value as fl_pad above.
125185377Ssam * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
126185377Ssam */
127185377Ssamstatic int fl_pack = -1;
128185377SsamTUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
129185377Ssam
130185377Ssam/*
131185377Ssam * Allow the driver to create mbuf(s) in a cluster allocated for rx.
132185377Ssam * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
133234873Sadrian * 1: ok to create mbuf(s) within a cluster if there is room.
134234873Sadrian */
135222644Sadrianstatic int allow_mbufs_in_cluster = 1;
136234873SadrianTUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
137247286Sadrian
138185377Ssam/*
139222584Sadrian * Largest rx cluster size that the driver is allowed to allocate.
140222584Sadrian */
141222584Sadrianstatic int largest_rx_cluster = MJUM16BYTES;
142239642SadrianTUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
143222815Sadrian
144224709Sadrian/*
145230791Sadrian * Size of cluster allocation that's most likely to succeed.  The driver will
146222584Sadrian * fall back to this size if it fails to allocate clusters larger than this.
147185377Ssam */
148185377Ssamstatic int safest_rx_cluster = PAGE_SIZE;
149185377SsamTUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
150185377Ssam
151185377Ssamstruct txpkts {
152185377Ssam	u_int wr_type;		/* type 0 or type 1 */
153185377Ssam	u_int npkt;		/* # of packets in this work request */
154185377Ssam	u_int plen;		/* total payload (sum of all packets) */
155185377Ssam	u_int len16;		/* # of 16B pieces used by this work request */
156185377Ssam};
157185377Ssam
158185377Ssam/* A packet's SGL.  This + m_pkthdr has all info needed for tx */
159185377Ssamstruct sgl {
160185377Ssam	struct sglist sg;
161185377Ssam	struct sglist_seg seg[TX_SGL_SEGS];
162185377Ssam};
163225444Sadrian
164185377Ssamstatic int service_iq(struct sge_iq *, int);
165185377Ssamstatic struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
166185377Ssamstatic int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
167185377Ssamstatic inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
168185377Ssamstatic inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
169185377Ssamstatic inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t,
170185377Ssam    char *);
171185377Ssamstatic int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
172185377Ssam    bus_addr_t *, void **);
173185377Ssamstatic int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
174185377Ssam    void *);
175185377Ssamstatic int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *,
176185377Ssam    int, int);
177185377Ssamstatic int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *);
178185377Ssamstatic void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
179185377Ssam    struct sge_fl *);
180185377Ssamstatic int alloc_fwq(struct adapter *);
181185377Ssamstatic int free_fwq(struct adapter *);
182185377Ssamstatic int alloc_mgmtq(struct adapter *);
183185377Ssamstatic int free_mgmtq(struct adapter *);
184185377Ssamstatic int alloc_rxq(struct port_info *, struct sge_rxq *, int, int,
185185377Ssam    struct sysctl_oid *);
186185377Ssamstatic int free_rxq(struct port_info *, struct sge_rxq *);
187185377Ssam#ifdef TCP_OFFLOAD
188185377Ssamstatic int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int,
189185377Ssam    struct sysctl_oid *);
190185377Ssamstatic int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *);
191185377Ssam#endif
192185377Ssam#ifdef DEV_NETMAP
193185377Ssamstatic int alloc_nm_rxq(struct port_info *, struct sge_nm_rxq *, int, int,
194185377Ssam    struct sysctl_oid *);
195185377Ssamstatic int free_nm_rxq(struct port_info *, struct sge_nm_rxq *);
196185377Ssamstatic int alloc_nm_txq(struct port_info *, struct sge_nm_txq *, int, int,
197185377Ssam    struct sysctl_oid *);
198185377Ssamstatic int free_nm_txq(struct port_info *, struct sge_nm_txq *);
199185377Ssam#endif
200185377Ssamstatic int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
201185377Ssamstatic int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
202185377Ssam#ifdef TCP_OFFLOAD
203185377Ssamstatic int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
204185377Ssam#endif
205185377Ssamstatic int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *);
206185377Ssamstatic int free_eq(struct adapter *, struct sge_eq *);
207185377Ssamstatic int alloc_wrq(struct adapter *, struct port_info *, struct sge_wrq *,
208185377Ssam    struct sysctl_oid *);
209185377Ssamstatic int free_wrq(struct adapter *, struct sge_wrq *);
210185377Ssamstatic int alloc_txq(struct port_info *, struct sge_txq *, int,
211185377Ssam    struct sysctl_oid *);
212185377Ssamstatic int free_txq(struct port_info *, struct sge_txq *);
213185377Ssamstatic void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
214185377Ssamstatic inline void ring_fl_db(struct adapter *, struct sge_fl *);
215185377Ssamstatic int refill_fl(struct adapter *, struct sge_fl *, int);
216185377Ssamstatic void refill_sfl(void *);
217185377Ssamstatic int alloc_fl_sdesc(struct sge_fl *);
218185377Ssamstatic void free_fl_sdesc(struct adapter *, struct sge_fl *);
219185377Ssamstatic void find_best_refill_source(struct adapter *, struct sge_fl *, int);
220185377Ssamstatic void find_safe_refill_source(struct adapter *, struct sge_fl *);
221185377Ssamstatic void add_fl_to_sfl(struct adapter *, struct sge_fl *);
222185377Ssam
223222265Sadrianstatic inline void get_pkt_gl(struct mbuf *, struct sglist *);
224222265Sadrianstatic inline u_int txpkt_len16(u_int, u_int);
225222265Sadrianstatic inline u_int txpkts0_len16(u_int);
226185377Ssamstatic inline u_int txpkts1_len16(void);
227185377Ssamstatic u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
228185377Ssam    struct mbuf *, u_int);
229185377Ssamstatic int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
230185377Ssamstatic int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
231185377Ssamstatic u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
232185377Ssam    struct mbuf *, const struct txpkts *, u_int);
233185377Ssamstatic void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
234185377Ssamstatic inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
235185377Ssamstatic inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
236185377Ssamstatic inline uint16_t read_hw_cidx(struct sge_eq *);
237185377Ssamstatic inline u_int reclaimable_tx_desc(struct sge_eq *);
238185377Ssamstatic inline u_int total_available_tx_desc(struct sge_eq *);
239185377Ssamstatic u_int reclaim_tx_descs(struct sge_txq *, u_int);
240185377Ssamstatic void tx_reclaim(void *, int);
241185377Ssamstatic __be64 get_flit(struct sglist_seg *, int, int);
242185377Ssamstatic int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
243185377Ssam    struct mbuf *);
244185377Ssamstatic int handle_fw_msg(struct sge_iq *, const struct rss_header *,
245185377Ssam    struct mbuf *);
246185377Ssamstatic void wrq_tx_drain(void *, int);
247185377Ssamstatic void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
248185377Ssam
249185377Ssamstatic int sysctl_uint16(SYSCTL_HANDLER_ARGS);
250185377Ssamstatic int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
251185377Ssam
252185377Ssamstatic counter_u64_t extfree_refs;
253185377Ssamstatic counter_u64_t extfree_rels;
254185380Ssam
255185380Ssam/*
256185377Ssam * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
257185377Ssam */
258185377Ssamvoid
259185377Ssamt4_sge_modload(void)
260185377Ssam{
261185377Ssam
262185377Ssam	if (fl_pktshift < 0 || fl_pktshift > 7) {
263185380Ssam		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
264185377Ssam		    " using 2 instead.\n", fl_pktshift);
265185377Ssam		fl_pktshift = 2;
266185377Ssam	}
267185377Ssam
268185377Ssam	if (spg_len != 64 && spg_len != 128) {
269185377Ssam		int len;
270185377Ssam
271185377Ssam#if defined(__i386__) || defined(__amd64__)
272185377Ssam		len = cpu_clflush_line_size > 64 ? 128 : 64;
273204579Srpaulo#else
274204579Srpaulo		len = 64;
275204579Srpaulo#endif
276185377Ssam		if (spg_len != -1) {
277185377Ssam			printf("Invalid hw.cxgbe.spg_len value (%d),"
278185377Ssam			    " using %d instead.\n", spg_len, len);
279185377Ssam		}
280185377Ssam		spg_len = len;
281185377Ssam	}
282185377Ssam
283185377Ssam	if (cong_drop < -1 || cong_drop > 1) {
284185377Ssam		printf("Invalid hw.cxgbe.cong_drop value (%d),"
285185377Ssam		    " using 0 instead.\n", cong_drop);
286185377Ssam		cong_drop = 0;
287185377Ssam	}
288185377Ssam
289185377Ssam	extfree_refs = counter_u64_alloc(M_WAITOK);
290185377Ssam	extfree_rels = counter_u64_alloc(M_WAITOK);
291185377Ssam	counter_u64_zero(extfree_refs);
292185377Ssam	counter_u64_zero(extfree_rels);
293185377Ssam}
294185377Ssam
295185377Ssamvoid
296185377Ssamt4_sge_modunload(void)
297185377Ssam{
298185377Ssam
299185377Ssam	counter_u64_free(extfree_refs);
300185377Ssam	counter_u64_free(extfree_rels);
301185377Ssam}
302185377Ssam
303185377Ssamuint64_t
304185377Ssamt4_sge_extfree_refs(void)
305185377Ssam{
306185377Ssam	uint64_t refs, rels;
307185377Ssam
308185377Ssam	rels = counter_u64_fetch(extfree_rels);
309185377Ssam	refs = counter_u64_fetch(extfree_refs);
310185377Ssam
311185377Ssam	return (refs - rels);
312185377Ssam}
313185377Ssam
314185377Ssamvoid
315185377Ssamt4_init_sge_cpl_handlers(struct adapter *sc)
316185377Ssam{
317185406Ssam
318185377Ssam	t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_msg);
319217624Sadrian	t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_msg);
320217624Sadrian	t4_register_cpl_handler(sc, CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
321185377Ssam	t4_register_cpl_handler(sc, CPL_RX_PKT, t4_eth_rx);
322185377Ssam	t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
323188979Ssam}
324185377Ssam
325185377Ssamstatic inline void
326185406Ssamsetup_pad_and_pack_boundaries(struct adapter *sc)
327185377Ssam{
328185377Ssam	uint32_t v, m;
329185377Ssam	int pad, pack;
330185377Ssam
331225883Sadrian	pad = fl_pad;
332185377Ssam	if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) {
333185377Ssam		/*
334185377Ssam		 * If there is any chance that we might use buffer packing and
335185377Ssam		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
336185377Ssam		 * it to 32 in all other cases.
337225883Sadrian		 */
338185377Ssam		pad = is_t4(sc) && buffer_packing ? 64 : 32;
339185377Ssam
340185377Ssam		/*
341185377Ssam		 * For fl_pad = 0 we'll still write a reasonable value to the
342185377Ssam		 * register but all the freelists will opt out of padding.
343185377Ssam		 * We'll complain here only if the user tried to set it to a
344185377Ssam		 * value greater than 0 that was invalid.
345185377Ssam		 */
346185377Ssam		if (fl_pad > 0) {
347185377Ssam			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
348185377Ssam			    " (%d), using %d instead.\n", fl_pad, pad);
349185377Ssam		}
350185377Ssam	}
351185377Ssam	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
352185377Ssam	v = V_INGPADBOUNDARY(ilog2(pad) - 5);
353185377Ssam	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
354185377Ssam
355188979Ssam	if (is_t4(sc)) {
356185377Ssam		if (fl_pack != -1 && fl_pack != pad) {
357185377Ssam			/* Complain but carry on. */
358185377Ssam			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
359185377Ssam			    " using %d instead.\n", fl_pack, pad);
360185377Ssam		}
361185377Ssam		return;
362185377Ssam	}
363185377Ssam
364185377Ssam	pack = fl_pack;
365185377Ssam	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
366185377Ssam	    !powerof2(fl_pack)) {
367185377Ssam		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
368185377Ssam		MPASS(powerof2(pack));
369185377Ssam		if (pack < 16)
370185377Ssam			pack = 16;
371185377Ssam		if (pack == 32)
372185377Ssam			pack = 64;
373185377Ssam		if (pack > 4096)
374185377Ssam			pack = 4096;
375185377Ssam		if (fl_pack != -1) {
376185377Ssam			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
377185377Ssam			    " (%d), using %d instead.\n", fl_pack, pack);
378188979Ssam		}
379185377Ssam	}
380235972Sadrian	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
381185377Ssam	if (pack == 16)
382185377Ssam		v = V_INGPACKBOUNDARY(0);
383185377Ssam	else
384185377Ssam		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
385185377Ssam
386185377Ssam	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
387185377Ssam	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
388185377Ssam}
389185377Ssam
390185377Ssam/*
391185377Ssam * adap->params.vpd.cclk must be set up before this is called.
392185377Ssam */
393185377Ssamvoid
394185377Ssamt4_tweak_chip_settings(struct adapter *sc)
395185377Ssam{
396185377Ssam	int i;
397185377Ssam	uint32_t v, m;
398185377Ssam	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
399185377Ssam	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
400185377Ssam	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
401185377Ssam	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
402185406Ssam	static int sge_flbuf_sizes[] = {
403185406Ssam		MCLBYTES,
404185406Ssam#if MJUMPAGESIZE != MCLBYTES
405185406Ssam		MJUMPAGESIZE,
406185406Ssam		MJUMPAGESIZE - CL_METADATA_SIZE,
407185377Ssam		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
408185377Ssam#endif
409185377Ssam		MJUM9BYTES,
410185377Ssam		MJUM16BYTES,
411185377Ssam		MCLBYTES - MSIZE - CL_METADATA_SIZE,
412185377Ssam		MJUM9BYTES - CL_METADATA_SIZE,
413185377Ssam		MJUM16BYTES - CL_METADATA_SIZE,
414185377Ssam	};
415185377Ssam
416185377Ssam	KASSERT(sc->flags & MASTER_PF,
417185377Ssam	    ("%s: trying to change chip settings when not master.", __func__));
418185377Ssam
419185377Ssam	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
420185377Ssam	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
421185377Ssam	    V_EGRSTATUSPAGESIZE(spg_len == 128);
422185377Ssam	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
423185377Ssam
424185377Ssam	setup_pad_and_pack_boundaries(sc);
425185377Ssam
426185377Ssam	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
427185377Ssam	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
428185377Ssam	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
429185377Ssam	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
430185377Ssam	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
431185377Ssam	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
432185377Ssam	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
433185377Ssam	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
434185380Ssam	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
435185380Ssam
436185377Ssam	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
437185377Ssam	    ("%s: hw buffer size table too big", __func__));
438185377Ssam	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
439185380Ssam		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
440185380Ssam		    sge_flbuf_sizes[i]);
441185377Ssam	}
442185377Ssam
443185377Ssam	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
444185380Ssam	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
445185380Ssam	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
446185377Ssam
447185377Ssam	KASSERT(intr_timer[0] <= timer_max,
448185377Ssam	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
449185377Ssam	    timer_max));
450185377Ssam	for (i = 1; i < nitems(intr_timer); i++) {
451185377Ssam		KASSERT(intr_timer[i] >= intr_timer[i - 1],
452185377Ssam		    ("%s: timers not listed in increasing order (%d)",
453185377Ssam		    __func__, i));
454185377Ssam
455185377Ssam		while (intr_timer[i] > timer_max) {
456185377Ssam			if (i == nitems(intr_timer) - 1) {
457185377Ssam				intr_timer[i] = timer_max;
458185380Ssam				break;
459185377Ssam			}
460185377Ssam			intr_timer[i] += intr_timer[i - 1];
461185377Ssam			intr_timer[i] /= 2;
462185377Ssam		}
463185377Ssam	}
464185377Ssam
465185377Ssam	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
466185377Ssam	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
467185377Ssam	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
468185377Ssam	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
469185377Ssam	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
470188979Ssam	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
471185377Ssam	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
472185377Ssam	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
473185377Ssam	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
474185377Ssam
475185377Ssam	if (cong_drop == 0) {
476185377Ssam		m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 |
477185377Ssam		    F_TUNNELCNGDROP3;
478185377Ssam		t4_set_reg_field(sc, A_TP_PARA_REG3, m, 0);
479185377Ssam	}
480185377Ssam
481185377Ssam	/* 4K, 16K, 64K, 256K DDP "page sizes" */
482185377Ssam	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
483185377Ssam	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
484185377Ssam
485185377Ssam	m = v = F_TDDPTAGTCB;
486185377Ssam	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
487185377Ssam
488185377Ssam	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
489185377Ssam	    F_RESETDDPOFFSET;
490185377Ssam	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
491185377Ssam	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
492185377Ssam}
493185377Ssam
494185377Ssam/*
495185377Ssam * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
496185377Ssam * padding is is use the buffer's start and end need to be aligned to the pad
497185377Ssam * boundary as well.  We'll just make sure that the size is a multiple of the
498185377Ssam * boundary here, it is up to the buffer allocation code to make sure the start
499185377Ssam * of the buffer is aligned as well.
500185377Ssam */
501185377Ssamstatic inline int
502185377Ssamhwsz_ok(struct adapter *sc, int hwsz)
503185377Ssam{
504185377Ssam	int mask = fl_pad ? sc->sge.pad_boundary - 1 : 16 - 1;
505185377Ssam
506185377Ssam	return (hwsz >= 64 && (hwsz & mask) == 0);
507185377Ssam}
508185377Ssam
509185377Ssam/*
510185377Ssam * XXX: driver really should be able to deal with unexpected settings.
511185377Ssam */
512185377Ssamint
513185377Ssamt4_read_chip_settings(struct adapter *sc)
514185377Ssam{
515185377Ssam	struct sge *s = &sc->sge;
516185377Ssam	int i, j, n, rc = 0;
517185377Ssam	uint32_t m, v, r;
518185377Ssam	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
519185377Ssam	static int sw_buf_sizes[] = {	/* Sorted by size */
520185377Ssam		MCLBYTES,
521185377Ssam#if MJUMPAGESIZE != MCLBYTES
522185377Ssam		MJUMPAGESIZE,
523185377Ssam#endif
524185377Ssam		MJUM9BYTES,
525185377Ssam		MJUM16BYTES
526185377Ssam	};
527185377Ssam	struct sw_zone_info *swz, *safe_swz;
528185377Ssam	struct hw_buf_info *hwb;
529185377Ssam
530185377Ssam	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
531185377Ssam	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
532185377Ssam	    V_EGRSTATUSPAGESIZE(spg_len == 128);
533185377Ssam	r = t4_read_reg(sc, A_SGE_CONTROL);
534185377Ssam	if ((r & m) != v) {
535185377Ssam		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
536185377Ssam		rc = EINVAL;
537185377Ssam	}
538185377Ssam	s->pad_boundary = 1 << (G_INGPADBOUNDARY(r) + 5);
539185377Ssam
540185377Ssam	if (is_t4(sc))
541185377Ssam		s->pack_boundary = s->pad_boundary;
542185377Ssam	else {
543185377Ssam		r = t4_read_reg(sc, A_SGE_CONTROL2);
544185406Ssam		if (G_INGPACKBOUNDARY(r) == 0)
545185377Ssam			s->pack_boundary = 16;
546185377Ssam		else
547185377Ssam			s->pack_boundary = 1 << (G_INGPACKBOUNDARY(r) + 5);
548185377Ssam	}
549185377Ssam
550185377Ssam	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
551185377Ssam	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
552185377Ssam	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
553185377Ssam	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
554185377Ssam	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
555185377Ssam	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
556185377Ssam	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
557185377Ssam	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
558185377Ssam	r = t4_read_reg(sc, A_SGE_HOST_PAGE_SIZE);
559185377Ssam	if (r != v) {
560185377Ssam		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
561185377Ssam		rc = EINVAL;
562185377Ssam	}
563185377Ssam
564185377Ssam	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
565185377Ssam	hwb = &s->hw_buf_info[0];
566185377Ssam	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
567185377Ssam		r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
568185377Ssam		hwb->size = r;
569185377Ssam		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
570185377Ssam		hwb->next = -1;
571185377Ssam	}
572185377Ssam
573185377Ssam	/*
574185377Ssam	 * Create a sorted list in decreasing order of hw buffer sizes (and so
575185377Ssam	 * increasing order of spare area) for each software zone.
576185377Ssam	 *
577185377Ssam	 * If padding is enabled then the start and end of the buffer must align
578185377Ssam	 * to the pad boundary; if packing is enabled then they must align with
579185377Ssam	 * the pack boundary as well.  Allocations from the cluster zones are
580185377Ssam	 * aligned to min(size, 4K), so the buffer starts at that alignment and
581185377Ssam	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
582185377Ssam	 * starting alignment will be reduced to MSIZE and the driver will
583185377Ssam	 * exercise appropriate caution when deciding on the best buffer layout
584185377Ssam	 * to use.
585185377Ssam	 */
586185377Ssam	n = 0;	/* no usable buffer size to begin with */
587185377Ssam	swz = &s->sw_zone_info[0];
588185377Ssam	safe_swz = NULL;
589185377Ssam	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
590185377Ssam		int8_t head = -1, tail = -1;
591185377Ssam
592185377Ssam		swz->size = sw_buf_sizes[i];
593185377Ssam		swz->zone = m_getzone(swz->size);
594185377Ssam		swz->type = m_gettype(swz->size);
595185377Ssam
596185377Ssam		if (swz->size < PAGE_SIZE) {
597185377Ssam			MPASS(powerof2(swz->size));
598185377Ssam			if (fl_pad && (swz->size % sc->sge.pad_boundary != 0))
599185377Ssam				continue;
600185377Ssam		}
601185377Ssam
602185377Ssam		if (swz->size == safest_rx_cluster)
603185377Ssam			safe_swz = swz;
604185377Ssam
605185377Ssam		hwb = &s->hw_buf_info[0];
606185377Ssam		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
607185377Ssam			if (hwb->zidx != -1 || hwb->size > swz->size)
608185377Ssam				continue;
609185377Ssam#ifdef INVARIANTS
610185377Ssam			if (fl_pad)
611185377Ssam				MPASS(hwb->size % sc->sge.pad_boundary == 0);
612185377Ssam#endif
613185377Ssam			hwb->zidx = i;
614185377Ssam			if (head == -1)
615185377Ssam				head = tail = j;
616185377Ssam			else if (hwb->size < s->hw_buf_info[tail].size) {
617185377Ssam				s->hw_buf_info[tail].next = j;
618185377Ssam				tail = j;
619185377Ssam			} else {
620185377Ssam				int8_t *cur;
621185377Ssam				struct hw_buf_info *t;
622185377Ssam
623185377Ssam				for (cur = &head; *cur != -1; cur = &t->next) {
624185377Ssam					t = &s->hw_buf_info[*cur];
625185377Ssam					if (hwb->size == t->size) {
626185377Ssam						hwb->zidx = -2;
627185377Ssam						break;
628185377Ssam					}
629185377Ssam					if (hwb->size > t->size) {
630185377Ssam						hwb->next = *cur;
631185377Ssam						*cur = j;
632185377Ssam						break;
633185377Ssam					}
634185377Ssam				}
635185377Ssam			}
636185377Ssam		}
637185377Ssam		swz->head_hwidx = head;
638185377Ssam		swz->tail_hwidx = tail;
639185377Ssam
640185377Ssam		if (tail != -1) {
641185377Ssam			n++;
642185377Ssam			if (swz->size - s->hw_buf_info[tail].size >=
643185377Ssam			    CL_METADATA_SIZE)
644185377Ssam				sc->flags |= BUF_PACKING_OK;
645185377Ssam		}
646185377Ssam	}
647185377Ssam	if (n == 0) {
648185377Ssam		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
649185377Ssam		rc = EINVAL;
650185377Ssam	}
651187831Ssam
652185377Ssam	s->safe_hwidx1 = -1;
653185377Ssam	s->safe_hwidx2 = -1;
654185377Ssam	if (safe_swz != NULL) {
655185377Ssam		s->safe_hwidx1 = safe_swz->head_hwidx;
656187831Ssam		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
657185377Ssam			int spare;
658185377Ssam
659185377Ssam			hwb = &s->hw_buf_info[i];
660185377Ssam#ifdef INVARIANTS
661185377Ssam			if (fl_pad)
662185377Ssam				MPASS(hwb->size % sc->sge.pad_boundary == 0);
663185377Ssam#endif
664185377Ssam			spare = safe_swz->size - hwb->size;
665185377Ssam			if (spare >= CL_METADATA_SIZE) {
666185377Ssam				s->safe_hwidx2 = i;
667188979Ssam				break;
668188979Ssam			}
669188979Ssam		}
670188979Ssam	}
671188979Ssam
672188979Ssam	r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
673188979Ssam	s->counter_val[0] = G_THRESHOLD_0(r);
674188979Ssam	s->counter_val[1] = G_THRESHOLD_1(r);
675188979Ssam	s->counter_val[2] = G_THRESHOLD_2(r);
676235972Sadrian	s->counter_val[3] = G_THRESHOLD_3(r);
677188979Ssam
678188979Ssam	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_0_AND_1);
679188979Ssam	s->timer_val[0] = G_TIMERVALUE0(r) / core_ticks_per_usec(sc);
680188979Ssam	s->timer_val[1] = G_TIMERVALUE1(r) / core_ticks_per_usec(sc);
681188979Ssam	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_2_AND_3);
682188979Ssam	s->timer_val[2] = G_TIMERVALUE2(r) / core_ticks_per_usec(sc);
683188979Ssam	s->timer_val[3] = G_TIMERVALUE3(r) / core_ticks_per_usec(sc);
684188979Ssam	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_4_AND_5);
685188979Ssam	s->timer_val[4] = G_TIMERVALUE4(r) / core_ticks_per_usec(sc);
686188979Ssam	s->timer_val[5] = G_TIMERVALUE5(r) / core_ticks_per_usec(sc);
687188979Ssam
688188979Ssam	if (cong_drop == 0) {
689188979Ssam		m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 |
690188979Ssam		    F_TUNNELCNGDROP3;
691188979Ssam		r = t4_read_reg(sc, A_TP_PARA_REG3);
692188979Ssam		if (r & m) {
693188979Ssam			device_printf(sc->dev,
694188979Ssam			    "invalid TP_PARA_REG3(0x%x)\n", r);
695188979Ssam			rc = EINVAL;
696188979Ssam		}
697188979Ssam	}
698188979Ssam
699188979Ssam	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
700188979Ssam	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
701188979Ssam	if (r != v) {
702188979Ssam		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
703185377Ssam		rc = EINVAL;
704185377Ssam	}
705185377Ssam
706185377Ssam	m = v = F_TDDPTAGTCB;
707185377Ssam	r = t4_read_reg(sc, A_ULP_RX_CTL);
708185377Ssam	if ((r & m) != v) {
709185377Ssam		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
710185377Ssam		rc = EINVAL;
711185377Ssam	}
712185377Ssam
713185377Ssam	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
714185377Ssam	    F_RESETDDPOFFSET;
715185377Ssam	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
716185377Ssam	r = t4_read_reg(sc, A_TP_PARA_REG5);
717185377Ssam	if ((r & m) != v) {
718185377Ssam		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
719185377Ssam		rc = EINVAL;
720185377Ssam	}
721185377Ssam
722185377Ssam	r = t4_read_reg(sc, A_SGE_CONM_CTRL);
723185377Ssam	s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1;
724185377Ssam	if (is_t4(sc))
725185377Ssam		s->fl_starve_threshold2 = s->fl_starve_threshold;
726185377Ssam	else
727185377Ssam		s->fl_starve_threshold2 = G_EGRTHRESHOLDPACKING(r) * 2 + 1;
728185377Ssam
729185377Ssam	/* egress queues: log2 of # of doorbells per BAR2 page */
730185377Ssam	r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF);
731185377Ssam	r >>= S_QUEUESPERPAGEPF0 +
732185377Ssam	    (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf;
733185377Ssam	s->eq_s_qpp = r & M_QUEUESPERPAGEPF0;
734185377Ssam
735185377Ssam	/* ingress queues: log2 of # of doorbells per BAR2 page */
736185377Ssam	r = t4_read_reg(sc, A_SGE_INGRESS_QUEUES_PER_PAGE_PF);
737185377Ssam	r >>= S_QUEUESPERPAGEPF0 +
738185377Ssam	    (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf;
739185377Ssam	s->iq_s_qpp = r & M_QUEUESPERPAGEPF0;
740185377Ssam
741185377Ssam	t4_init_tp_params(sc);
742185377Ssam
743185377Ssam	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
744185377Ssam	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
745185377Ssam
746185377Ssam	return (rc);
747185377Ssam}
748185377Ssam
749185377Ssamint
750185377Ssamt4_create_dma_tag(struct adapter *sc)
751185377Ssam{
752185377Ssam	int rc;
753185377Ssam
754185377Ssam	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
755185377Ssam	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
756185377Ssam	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
757185377Ssam	    NULL, &sc->dmat);
758185377Ssam	if (rc != 0) {
759185377Ssam		device_printf(sc->dev,
760185377Ssam		    "failed to create main DMA tag: %d\n", rc);
761185377Ssam	}
762185377Ssam
763185377Ssam	return (rc);
764185377Ssam}
765185377Ssam
766185377Ssamvoid
767185377Ssamt4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
768185377Ssam    struct sysctl_oid_list *children)
769185377Ssam{
770185377Ssam
771185377Ssam	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
772185377Ssam	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
773185377Ssam	    "freelist buffer sizes");
774185377Ssam
775185377Ssam	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
776185377Ssam	    NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)");
777185377Ssam
778185377Ssam	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
779185377Ssam	    NULL, sc->sge.pad_boundary, "payload pad boundary (bytes)");
780185377Ssam
781185377Ssam	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
782185377Ssam	    NULL, spg_len, "status page size (bytes)");
783185377Ssam
784185377Ssam	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
785185380Ssam	    NULL, cong_drop, "congestion drop setting");
786185380Ssam
787185377Ssam	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
788185377Ssam	    NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)");
789185377Ssam}
790185377Ssam
791242408Sadrianint
792242408Sadriant4_destroy_dma_tag(struct adapter *sc)
793242408Sadrian{
794242408Sadrian	if (sc->dmat)
795242408Sadrian		bus_dma_tag_destroy(sc->dmat);
796242408Sadrian
797242408Sadrian	return (0);
798242408Sadrian}
799242408Sadrian
800242408Sadrian/*
801242408Sadrian * Allocate and initialize the firmware event queue and the management queue.
802242408Sadrian *
803242408Sadrian * Returns errno on failure.  Resources allocated up to that point may still be
804242408Sadrian * allocated.  Caller is responsible for cleanup in case this function fails.
805242408Sadrian */
806242408Sadrianint
807242408Sadriant4_setup_adapter_queues(struct adapter *sc)
808242408Sadrian{
809242408Sadrian	int rc;
810242408Sadrian
811242408Sadrian	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
812242408Sadrian
813242408Sadrian	sysctl_ctx_init(&sc->ctx);
814185377Ssam	sc->flags |= ADAP_SYSCTL_CTX;
815185377Ssam
816185377Ssam	/*
817185377Ssam	 * Firmware event queue
818185377Ssam	 */
819185377Ssam	rc = alloc_fwq(sc);
820185377Ssam	if (rc != 0)
821185377Ssam		return (rc);
822185377Ssam
823185377Ssam	/*
824185377Ssam	 * Management queue.  This is just a control queue that uses the fwq as
825185377Ssam	 * its associated iq.
826185377Ssam	 */
827185377Ssam	rc = alloc_mgmtq(sc);
828185377Ssam
829185377Ssam	return (rc);
830185377Ssam}
831185377Ssam
832185377Ssam/*
833185377Ssam * Idempotent
834185377Ssam */
835185377Ssamint
836185377Ssamt4_teardown_adapter_queues(struct adapter *sc)
837185377Ssam{
838185377Ssam
839185377Ssam	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
840185377Ssam
841185377Ssam	/* Do this before freeing the queue */
842185377Ssam	if (sc->flags & ADAP_SYSCTL_CTX) {
843185377Ssam		sysctl_ctx_free(&sc->ctx);
844185377Ssam		sc->flags &= ~ADAP_SYSCTL_CTX;
845185377Ssam	}
846185377Ssam
847185377Ssam	free_mgmtq(sc);
848185377Ssam	free_fwq(sc);
849185377Ssam
850185377Ssam	return (0);
851185377Ssam}
852238858Sadrian
853239643Sadrianstatic inline int
854185377Ssamport_intr_count(struct port_info *pi)
855185377Ssam{
856185377Ssam	int rc = 0;
857185377Ssam
858185377Ssam	if (pi->flags & INTR_RXQ)
859185377Ssam		rc += pi->nrxq;
860185377Ssam#ifdef TCP_OFFLOAD
861185377Ssam	if (pi->flags & INTR_OFLD_RXQ)
862185377Ssam		rc += pi->nofldrxq;
863185377Ssam#endif
864185377Ssam#ifdef DEV_NETMAP
865185377Ssam	if (pi->flags & INTR_NM_RXQ)
866185377Ssam		rc += pi->nnmrxq;
867185377Ssam#endif
868185377Ssam	return (rc);
869185377Ssam}
870185377Ssam
871185377Ssamstatic inline int
872185380Ssamfirst_vector(struct port_info *pi)
873185380Ssam{
874185377Ssam	struct adapter *sc = pi->adapter;
875230791Sadrian	int rc = T4_EXTRA_INTR, i;
876230791Sadrian
877230791Sadrian	if (sc->intr_count == 1)
878230791Sadrian		return (0);
879230791Sadrian
880230791Sadrian	for_each_port(sc, i) {
881185377Ssam		if (i == pi->port_id)
882185377Ssam			break;
883185377Ssam
884185377Ssam		rc += port_intr_count(sc->port[i]);
885185377Ssam	}
886185377Ssam
887185377Ssam	return (rc);
888185377Ssam}
889185377Ssam
890185377Ssam/*
891185377Ssam * Given an arbitrary "index," come up with an iq that can be used by other
892195114Ssam * queues (of this port) for interrupt forwarding, SGE egress updates, etc.
893195114Ssam * The iq returned is guaranteed to be something that takes direct interrupts.
894185377Ssam */
895195114Ssamstatic struct sge_iq *
896195114Ssamport_intr_iq(struct port_info *pi, int idx)
897195114Ssam{
898195114Ssam	struct adapter *sc = pi->adapter;
899195114Ssam	struct sge *s = &sc->sge;
900195114Ssam	struct sge_iq *iq = NULL;
901185377Ssam	int nintr, i;
902185377Ssam
903192396Ssam	if (sc->intr_count == 1)
904192396Ssam		return (&sc->sge.fwq);
905192396Ssam
906192396Ssam	nintr = port_intr_count(pi);
907192396Ssam	KASSERT(nintr != 0,
908192396Ssam	    ("%s: pi %p has no exclusive interrupts, total interrupts = %d",
909192396Ssam	    __func__, pi, sc->intr_count));
910192400Ssam#ifdef DEV_NETMAP
911192400Ssam	/* Exclude netmap queues as they can't take anyone else's interrupts */
912185377Ssam	if (pi->flags & INTR_NM_RXQ)
913218436Sadrian		nintr -= pi->nnmrxq;
914220324Sadrian	KASSERT(nintr > 0,
915218436Sadrian	    ("%s: pi %p has nintr %d after netmap adjustment of %d", __func__,
916185377Ssam	    pi, nintr, pi->nnmrxq));
917185377Ssam#endif
918185377Ssam	i = idx % nintr;
919185377Ssam
920185377Ssam	if (pi->flags & INTR_RXQ) {
921185406Ssam	       	if (i < pi->nrxq) {
922185406Ssam			iq = &s->rxq[pi->first_rxq + i].iq;
923185406Ssam			goto done;
924185406Ssam		}
925185406Ssam		i -= pi->nrxq;
926185406Ssam	}
927185406Ssam#ifdef TCP_OFFLOAD
928185406Ssam	if (pi->flags & INTR_OFLD_RXQ) {
929185406Ssam	       	if (i < pi->nofldrxq) {
930185406Ssam			iq = &s->ofld_rxq[pi->first_ofld_rxq + i].iq;
931185406Ssam			goto done;
932185406Ssam		}
933185406Ssam		i -= pi->nofldrxq;
934185406Ssam	}
935185406Ssam#endif
936185406Ssam	panic("%s: pi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__,
937185406Ssam	    pi, pi->flags & INTR_ALL, idx, nintr);
938185406Ssamdone:
939185406Ssam	MPASS(iq != NULL);
940185406Ssam	KASSERT(iq->flags & IQ_INTR,
941185406Ssam	    ("%s: iq %p (port %p, intr_flags 0x%lx, idx %d)", __func__, iq, pi,
942185406Ssam	    pi->flags & INTR_ALL, idx));
943185406Ssam	return (iq);
944185406Ssam}
945185406Ssam
946185406Ssam/* Maximum payload that can be delivered with a single iq descriptor */
947185418Ssamstatic inline int
948mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
949{
950	int payload;
951
952#ifdef TCP_OFFLOAD
953	if (toe) {
954		payload = sc->tt.rx_coalesce ?
955		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
956	} else {
957#endif
958		/* large enough even when hw VLAN extraction is disabled */
959		payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN +
960		    mtu;
961#ifdef TCP_OFFLOAD
962	}
963#endif
964
965	return (payload);
966}
967
968int
969t4_setup_port_queues(struct port_info *pi)
970{
971	int rc = 0, i, j, intr_idx, iqid;
972	struct sge_rxq *rxq;
973	struct sge_txq *txq;
974	struct sge_wrq *ctrlq;
975#ifdef TCP_OFFLOAD
976	struct sge_ofld_rxq *ofld_rxq;
977	struct sge_wrq *ofld_txq;
978#endif
979#ifdef DEV_NETMAP
980	struct sge_nm_rxq *nm_rxq;
981	struct sge_nm_txq *nm_txq;
982#endif
983	char name[16];
984	struct adapter *sc = pi->adapter;
985	struct ifnet *ifp = pi->ifp;
986	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
987	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
988	int maxp, mtu = ifp->if_mtu;
989
990	/* Interrupt vector to start from (when using multiple vectors) */
991	intr_idx = first_vector(pi);
992
993	/*
994	 * First pass over all NIC and TOE rx queues:
995	 * a) initialize iq and fl
996	 * b) allocate queue iff it will take direct interrupts.
997	 */
998	maxp = mtu_to_max_payload(sc, mtu, 0);
999	if (pi->flags & INTR_RXQ) {
1000		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq",
1001		    CTLFLAG_RD, NULL, "rx queues");
1002	}
1003	for_each_rxq(pi, i, rxq) {
1004
1005		init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq);
1006
1007		snprintf(name, sizeof(name), "%s rxq%d-fl",
1008		    device_get_nameunit(pi->dev), i);
1009		init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, name);
1010
1011		if (pi->flags & INTR_RXQ) {
1012			rxq->iq.flags |= IQ_INTR;
1013			rc = alloc_rxq(pi, rxq, intr_idx, i, oid);
1014			if (rc != 0)
1015				goto done;
1016			intr_idx++;
1017		}
1018	}
1019#ifdef TCP_OFFLOAD
1020	maxp = mtu_to_max_payload(sc, mtu, 1);
1021	if (is_offload(sc) && pi->flags & INTR_OFLD_RXQ) {
1022		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq",
1023		    CTLFLAG_RD, NULL,
1024		    "rx queues for offloaded TCP connections");
1025	}
1026	for_each_ofld_rxq(pi, i, ofld_rxq) {
1027
1028		init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
1029		    pi->qsize_rxq);
1030
1031		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
1032		    device_get_nameunit(pi->dev), i);
1033		init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, name);
1034
1035		if (pi->flags & INTR_OFLD_RXQ) {
1036			ofld_rxq->iq.flags |= IQ_INTR;
1037			rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid);
1038			if (rc != 0)
1039				goto done;
1040			intr_idx++;
1041		}
1042	}
1043#endif
1044#ifdef DEV_NETMAP
1045	/*
1046	 * We don't have buffers to back the netmap rx queues right now so we
1047	 * create the queues in a way that doesn't set off any congestion signal
1048	 * in the chip.
1049	 */
1050	if (pi->flags & INTR_NM_RXQ) {
1051		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_rxq",
1052		    CTLFLAG_RD, NULL, "rx queues for netmap");
1053		for_each_nm_rxq(pi, i, nm_rxq) {
1054			rc = alloc_nm_rxq(pi, nm_rxq, intr_idx, i, oid);
1055			if (rc != 0)
1056				goto done;
1057			intr_idx++;
1058		}
1059	}
1060#endif
1061
1062	/*
1063	 * Second pass over all NIC and TOE rx queues.  The queues forwarding
1064	 * their interrupts are allocated now.
1065	 */
1066	j = 0;
1067	if (!(pi->flags & INTR_RXQ)) {
1068		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq",
1069		    CTLFLAG_RD, NULL, "rx queues");
1070		for_each_rxq(pi, i, rxq) {
1071			MPASS(!(rxq->iq.flags & IQ_INTR));
1072
1073			intr_idx = port_intr_iq(pi, j)->abs_id;
1074
1075			rc = alloc_rxq(pi, rxq, intr_idx, i, oid);
1076			if (rc != 0)
1077				goto done;
1078			j++;
1079		}
1080	}
1081#ifdef TCP_OFFLOAD
1082	if (is_offload(sc) && !(pi->flags & INTR_OFLD_RXQ)) {
1083		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq",
1084		    CTLFLAG_RD, NULL,
1085		    "rx queues for offloaded TCP connections");
1086		for_each_ofld_rxq(pi, i, ofld_rxq) {
1087			MPASS(!(ofld_rxq->iq.flags & IQ_INTR));
1088
1089			intr_idx = port_intr_iq(pi, j)->abs_id;
1090
1091			rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid);
1092			if (rc != 0)
1093				goto done;
1094			j++;
1095		}
1096	}
1097#endif
1098#ifdef DEV_NETMAP
1099	if (!(pi->flags & INTR_NM_RXQ))
1100		CXGBE_UNIMPLEMENTED(__func__);
1101#endif
1102
1103	/*
1104	 * Now the tx queues.  Only one pass needed.
1105	 */
1106	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
1107	    NULL, "tx queues");
1108	j = 0;
1109	for_each_txq(pi, i, txq) {
1110		iqid = port_intr_iq(pi, j)->cntxt_id;
1111		snprintf(name, sizeof(name), "%s txq%d",
1112		    device_get_nameunit(pi->dev), i);
1113		init_eq(&txq->eq, EQ_ETH, pi->qsize_txq, pi->tx_chan, iqid,
1114		    name);
1115
1116		rc = alloc_txq(pi, txq, i, oid);
1117		if (rc != 0)
1118			goto done;
1119		j++;
1120	}
1121#ifdef TCP_OFFLOAD
1122	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq",
1123	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
1124	for_each_ofld_txq(pi, i, ofld_txq) {
1125		struct sysctl_oid *oid2;
1126
1127		iqid = port_intr_iq(pi, j)->cntxt_id;
1128		snprintf(name, sizeof(name), "%s ofld_txq%d",
1129		    device_get_nameunit(pi->dev), i);
1130		init_eq(&ofld_txq->eq, EQ_OFLD, pi->qsize_txq, pi->tx_chan,
1131		    iqid, name);
1132
1133		snprintf(name, sizeof(name), "%d", i);
1134		oid2 = SYSCTL_ADD_NODE(&pi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
1135		    name, CTLFLAG_RD, NULL, "offload tx queue");
1136
1137		rc = alloc_wrq(sc, pi, ofld_txq, oid2);
1138		if (rc != 0)
1139			goto done;
1140		j++;
1141	}
1142#endif
1143#ifdef DEV_NETMAP
1144	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_txq",
1145	    CTLFLAG_RD, NULL, "tx queues for netmap use");
1146	for_each_nm_txq(pi, i, nm_txq) {
1147		iqid = pi->first_nm_rxq + (j % pi->nnmrxq);
1148		rc = alloc_nm_txq(pi, nm_txq, iqid, i, oid);
1149		if (rc != 0)
1150			goto done;
1151		j++;
1152	}
1153#endif
1154
1155	/*
1156	 * Finally, the control queue.
1157	 */
1158	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
1159	    NULL, "ctrl queue");
1160	ctrlq = &sc->sge.ctrlq[pi->port_id];
1161	iqid = port_intr_iq(pi, 0)->cntxt_id;
1162	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(pi->dev));
1163	init_eq(&ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid, name);
1164	rc = alloc_wrq(sc, pi, ctrlq, oid);
1165
1166done:
1167	if (rc)
1168		t4_teardown_port_queues(pi);
1169
1170	return (rc);
1171}
1172
1173/*
1174 * Idempotent
1175 */
1176int
1177t4_teardown_port_queues(struct port_info *pi)
1178{
1179	int i;
1180	struct adapter *sc = pi->adapter;
1181	struct sge_rxq *rxq;
1182	struct sge_txq *txq;
1183#ifdef TCP_OFFLOAD
1184	struct sge_ofld_rxq *ofld_rxq;
1185	struct sge_wrq *ofld_txq;
1186#endif
1187#ifdef DEV_NETMAP
1188	struct sge_nm_rxq *nm_rxq;
1189	struct sge_nm_txq *nm_txq;
1190#endif
1191
1192	/* Do this before freeing the queues */
1193	if (pi->flags & PORT_SYSCTL_CTX) {
1194		sysctl_ctx_free(&pi->ctx);
1195		pi->flags &= ~PORT_SYSCTL_CTX;
1196	}
1197
1198	/*
1199	 * Take down all the tx queues first, as they reference the rx queues
1200	 * (for egress updates, etc.).
1201	 */
1202
1203	free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
1204
1205	for_each_txq(pi, i, txq) {
1206		free_txq(pi, txq);
1207	}
1208#ifdef TCP_OFFLOAD
1209	for_each_ofld_txq(pi, i, ofld_txq) {
1210		free_wrq(sc, ofld_txq);
1211	}
1212#endif
1213#ifdef DEV_NETMAP
1214	for_each_nm_txq(pi, i, nm_txq)
1215	    free_nm_txq(pi, nm_txq);
1216#endif
1217
1218	/*
1219	 * Then take down the rx queues that forward their interrupts, as they
1220	 * reference other rx queues.
1221	 */
1222
1223	for_each_rxq(pi, i, rxq) {
1224		if ((rxq->iq.flags & IQ_INTR) == 0)
1225			free_rxq(pi, rxq);
1226	}
1227#ifdef TCP_OFFLOAD
1228	for_each_ofld_rxq(pi, i, ofld_rxq) {
1229		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
1230			free_ofld_rxq(pi, ofld_rxq);
1231	}
1232#endif
1233#ifdef DEV_NETMAP
1234	for_each_nm_rxq(pi, i, nm_rxq)
1235	    free_nm_rxq(pi, nm_rxq);
1236#endif
1237
1238	/*
1239	 * Then take down the rx queues that take direct interrupts.
1240	 */
1241
1242	for_each_rxq(pi, i, rxq) {
1243		if (rxq->iq.flags & IQ_INTR)
1244			free_rxq(pi, rxq);
1245	}
1246#ifdef TCP_OFFLOAD
1247	for_each_ofld_rxq(pi, i, ofld_rxq) {
1248		if (ofld_rxq->iq.flags & IQ_INTR)
1249			free_ofld_rxq(pi, ofld_rxq);
1250	}
1251#endif
1252
1253	return (0);
1254}
1255
1256/*
1257 * Deals with errors and the firmware event queue.  All data rx queues forward
1258 * their interrupt to the firmware event queue.
1259 */
1260void
1261t4_intr_all(void *arg)
1262{
1263	struct adapter *sc = arg;
1264	struct sge_iq *fwq = &sc->sge.fwq;
1265
1266	t4_intr_err(arg);
1267	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
1268		service_iq(fwq, 0);
1269		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
1270	}
1271}
1272
1273/* Deals with error interrupts */
1274void
1275t4_intr_err(void *arg)
1276{
1277	struct adapter *sc = arg;
1278
1279	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
1280	t4_slow_intr_handler(sc);
1281}
1282
1283void
1284t4_intr_evt(void *arg)
1285{
1286	struct sge_iq *iq = arg;
1287
1288	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1289		service_iq(iq, 0);
1290		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1291	}
1292}
1293
1294void
1295t4_intr(void *arg)
1296{
1297	struct sge_iq *iq = arg;
1298
1299	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1300		service_iq(iq, 0);
1301		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1302	}
1303}
1304
1305/*
1306 * Deals with anything and everything on the given ingress queue.
1307 */
1308static int
1309service_iq(struct sge_iq *iq, int budget)
1310{
1311	struct sge_iq *q;
1312	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
1313	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
1314	struct adapter *sc = iq->adapter;
1315	struct iq_desc *d = &iq->desc[iq->cidx];
1316	int ndescs = 0, limit;
1317	int rsp_type, refill;
1318	uint32_t lq;
1319	uint16_t fl_hw_cidx;
1320	struct mbuf *m0;
1321	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
1322#if defined(INET) || defined(INET6)
1323	const struct timeval lro_timeout = {0, sc->lro_timeout};
1324#endif
1325
1326	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1327
1328	limit = budget ? budget : iq->qsize / 16;
1329
1330	if (iq->flags & IQ_HAS_FL) {
1331		fl = &rxq->fl;
1332		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
1333	} else {
1334		fl = NULL;
1335		fl_hw_cidx = 0;			/* to silence gcc warning */
1336	}
1337
1338	/*
1339	 * We always come back and check the descriptor ring for new indirect
1340	 * interrupts and other responses after running a single handler.
1341	 */
1342	for (;;) {
1343		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1344
1345			rmb();
1346
1347			refill = 0;
1348			m0 = NULL;
1349			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1350			lq = be32toh(d->rsp.pldbuflen_qid);
1351
1352			switch (rsp_type) {
1353			case X_RSPD_TYPE_FLBUF:
1354
1355				KASSERT(iq->flags & IQ_HAS_FL,
1356				    ("%s: data for an iq (%p) with no freelist",
1357				    __func__, iq));
1358
1359				m0 = get_fl_payload(sc, fl, lq);
1360				if (__predict_false(m0 == NULL))
1361					goto process_iql;
1362				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
1363#ifdef T4_PKT_TIMESTAMP
1364				/*
1365				 * 60 bit timestamp for the payload is
1366				 * *(uint64_t *)m0->m_pktdat.  Note that it is
1367				 * in the leading free-space in the mbuf.  The
1368				 * kernel can clobber it during a pullup,
1369				 * m_copymdata, etc.  You need to make sure that
1370				 * the mbuf reaches you unmolested if you care
1371				 * about the timestamp.
1372				 */
1373				*(uint64_t *)m0->m_pktdat =
1374				    be64toh(ctrl->u.last_flit) &
1375				    0xfffffffffffffff;
1376#endif
1377
1378				/* fall through */
1379
1380			case X_RSPD_TYPE_CPL:
1381				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1382				    ("%s: bad opcode %02x.", __func__,
1383				    d->rss.opcode));
1384				sc->cpl_handler[d->rss.opcode](iq, &d->rss, m0);
1385				break;
1386
1387			case X_RSPD_TYPE_INTR:
1388
1389				/*
1390				 * Interrupts should be forwarded only to queues
1391				 * that are not forwarding their interrupts.
1392				 * This means service_iq can recurse but only 1
1393				 * level deep.
1394				 */
1395				KASSERT(budget == 0,
1396				    ("%s: budget %u, rsp_type %u", __func__,
1397				    budget, rsp_type));
1398
1399				/*
1400				 * There are 1K interrupt-capable queues (qids 0
1401				 * through 1023).  A response type indicating a
1402				 * forwarded interrupt with a qid >= 1K is an
1403				 * iWARP async notification.
1404				 */
1405				if (lq >= 1024) {
1406                                        sc->an_handler(iq, &d->rsp);
1407                                        break;
1408                                }
1409
1410				q = sc->sge.iqmap[lq - sc->sge.iq_start];
1411				if (atomic_cmpset_int(&q->state, IQS_IDLE,
1412				    IQS_BUSY)) {
1413					if (service_iq(q, q->qsize / 16) == 0) {
1414						atomic_cmpset_int(&q->state,
1415						    IQS_BUSY, IQS_IDLE);
1416					} else {
1417						STAILQ_INSERT_TAIL(&iql, q,
1418						    link);
1419					}
1420				}
1421				break;
1422
1423			default:
1424				KASSERT(0,
1425				    ("%s: illegal response type %d on iq %p",
1426				    __func__, rsp_type, iq));
1427				log(LOG_ERR,
1428				    "%s: illegal response type %d on iq %p",
1429				    device_get_nameunit(sc->dev), rsp_type, iq);
1430				break;
1431			}
1432
1433			d++;
1434			if (__predict_false(++iq->cidx == iq->sidx)) {
1435				iq->cidx = 0;
1436				iq->gen ^= F_RSPD_GEN;
1437				d = &iq->desc[0];
1438			}
1439			if (__predict_false(++ndescs == limit)) {
1440				t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
1441				    V_CIDXINC(ndescs) |
1442				    V_INGRESSQID(iq->cntxt_id) |
1443				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1444				ndescs = 0;
1445
1446#if defined(INET) || defined(INET6)
1447				if (iq->flags & IQ_LRO_ENABLED &&
1448				    sc->lro_timeout != 0) {
1449					tcp_lro_flush_inactive(&rxq->lro,
1450					    &lro_timeout);
1451				}
1452#endif
1453
1454				if (budget) {
1455					if (iq->flags & IQ_HAS_FL) {
1456						FL_LOCK(fl);
1457						refill_fl(sc, fl, 32);
1458						FL_UNLOCK(fl);
1459					}
1460					return (EINPROGRESS);
1461				}
1462			}
1463			if (refill) {
1464				FL_LOCK(fl);
1465				refill_fl(sc, fl, 32);
1466				FL_UNLOCK(fl);
1467				fl_hw_cidx = fl->hw_cidx;
1468			}
1469		}
1470
1471process_iql:
1472		if (STAILQ_EMPTY(&iql))
1473			break;
1474
1475		/*
1476		 * Process the head only, and send it to the back of the list if
1477		 * it's still not done.
1478		 */
1479		q = STAILQ_FIRST(&iql);
1480		STAILQ_REMOVE_HEAD(&iql, link);
1481		if (service_iq(q, q->qsize / 8) == 0)
1482			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1483		else
1484			STAILQ_INSERT_TAIL(&iql, q, link);
1485	}
1486
1487#if defined(INET) || defined(INET6)
1488	if (iq->flags & IQ_LRO_ENABLED) {
1489		struct lro_ctrl *lro = &rxq->lro;
1490		struct lro_entry *l;
1491
1492		while (!SLIST_EMPTY(&lro->lro_active)) {
1493			l = SLIST_FIRST(&lro->lro_active);
1494			SLIST_REMOVE_HEAD(&lro->lro_active, next);
1495			tcp_lro_flush(lro, l);
1496		}
1497	}
1498#endif
1499
1500	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) |
1501	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1502
1503	if (iq->flags & IQ_HAS_FL) {
1504		int starved;
1505
1506		FL_LOCK(fl);
1507		starved = refill_fl(sc, fl, 64);
1508		FL_UNLOCK(fl);
1509		if (__predict_false(starved != 0))
1510			add_fl_to_sfl(sc, fl);
1511	}
1512
1513	return (0);
1514}
1515
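/*
 * A cluster carries metadata (struct cluster_metadata, which holds the
 * refcount used by m_extaddref) if it is used with buffer packing or if mbufs
 * are carved out of its spare room (region1 > 0).  The metadata occupies the
 * tail end of the cluster, within region3.
 */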
1516static inline int
1517cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
1518{
1519	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
1520
1521	if (rc)
1522		MPASS(cll->region3 >= CL_METADATA_SIZE);
1523
1524	return (rc);
1525}
1526
1527static inline struct cluster_metadata *
1528cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
1529    caddr_t cl)
1530{
1531
1532	if (cl_has_metadata(fl, cll)) {
1533		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1534
1535		return ((struct cluster_metadata *)(cl + swz->size) - 1);
1536	}
1537	return (NULL);
1538}
1539
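/*
 * External-storage free routine for clusters handed up with m_extaddref:
 * return the cluster to its UMA zone and count the release.
 */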
1540static void
1541rxb_free(struct mbuf *m, void *arg1, void *arg2)
1542{
1543	uma_zone_t zone = arg1;
1544	caddr_t cl = arg2;
1545
1546	uma_zfree(zone, cl);
1547	counter_u64_add(extfree_rels, 1);
1548}
1549
1550/*
1551 * The mbuf returned by this function could be allocated from zone_mbuf or
1552 * constructed in spare room in the cluster.
1553 *
1554 * The mbuf carries the payload in one of these ways:
1555 * a) frame inside the mbuf (mbuf allocated from zone_mbuf)
1556 * b) m_cljset on the cluster (no metadata), mbuf from zone_mbuf
1557 * c) m_extaddref on the cluster (with metadata), mbuf inlined in the cluster
1558 * d) m_extaddref on the cluster (with metadata), mbuf from zone_mbuf
1559 */
1560static struct mbuf *
1561get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1562    int remaining)
1563{
1564	struct mbuf *m;
1565	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1566	struct cluster_layout *cll = &sd->cll;
1567	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1568	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
1569	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
1570	int len, blen;
1571	caddr_t payload;
1572
1573	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
1574	len = min(remaining, blen);
1575	payload = sd->cl + cll->region1 + fl->rx_offset;
1576	if (fl->flags & FL_BUF_PACKING) {
1577		const u_int l = fr_offset + len;
1578		const u_int pad = roundup2(l, fl->buf_boundary) - l;
1579
1580		if (fl->rx_offset + len + pad < hwb->size)
1581			blen = len + pad;
1582		MPASS(fl->rx_offset + blen <= hwb->size);
1583	} else {
1584		MPASS(fl->rx_offset == 0);	/* not packing */
1585	}
1586
1587
1588	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1589
1590		/*
1591		 * Copy payload into a freshly allocated mbuf.
1592		 */
1593
1594		m = fr_offset == 0 ?
1595		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1596		if (m == NULL)
1597			return (NULL);
1598		fl->mbuf_allocated++;
1599#ifdef T4_PKT_TIMESTAMP
1600		/* Leave room for a timestamp */
1601		m->m_data += 8;
1602#endif
1603		/* copy data to mbuf */
1604		bcopy(payload, mtod(m, caddr_t), len);
1605
1606	} else if (sd->nmbuf * MSIZE < cll->region1) {
1607
1608		/*
1609		 * There's spare room in the cluster for an mbuf.  Create one
1610		 * and associate it with the payload that's in the cluster.
1611		 */
1612
1613		MPASS(clm != NULL);
1614		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
1615		/* No bzero required */
1616		if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA,
1617		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
1618			return (NULL);
1619		fl->mbuf_inlined++;
1620		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
1621		    swz->zone, sd->cl);
1622		if (sd->nmbuf++ == 0)
1623			counter_u64_add(extfree_refs, 1);
1624
1625	} else {
1626
1627		/*
1628		 * Grab an mbuf from zone_mbuf and associate it with the
1629		 * payload in the cluster.
1630		 */
1631
1632		m = fr_offset == 0 ?
1633		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1634		if (m == NULL)
1635			return (NULL);
1636		fl->mbuf_allocated++;
1637		if (clm != NULL) {
1638			m_extaddref(m, payload, blen, &clm->refcount,
1639			    rxb_free, swz->zone, sd->cl);
1640			if (sd->nmbuf++ == 0)
1641				counter_u64_add(extfree_refs, 1);
1642		} else {
1643			m_cljset(m, sd->cl, swz->type);
1644			sd->cl = NULL;	/* consumed, not a recycle candidate */
1645		}
1646	}
1647	if (fr_offset == 0)
1648		m->m_pkthdr.len = remaining;
1649	m->m_len = len;
1650
1651	if (fl->flags & FL_BUF_PACKING) {
1652		fl->rx_offset += blen;
1653		MPASS(fl->rx_offset <= hwb->size);
1654		if (fl->rx_offset < hwb->size)
1655			return (m);	/* without advancing the cidx */
1656	}
1657
1658	if (__predict_false(++fl->cidx % 8 == 0)) {
1659		uint16_t cidx = fl->cidx / 8;
1660
1661		if (__predict_false(cidx == fl->sidx))
1662			fl->cidx = cidx = 0;
1663		fl->hw_cidx = cidx;
1664	}
1665	fl->rx_offset = 0;
1666
1667	return (m);
1668}
1669
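/*
 * Assemble one received frame from the freelist.  The frame may span several
 * hardware buffers; if an mbuf allocation fails partway through, the partial
 * chain is parked in the fl (FL_BUF_RESUME) and the next call resumes where
 * this one left off.
 */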
1670static struct mbuf *
1671get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
1672{
1673	struct mbuf *m0, *m, **pnext;
1674	u_int remaining;
1675	const u_int total = G_RSPD_LEN(len_newbuf);
1676
1677	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1678		M_ASSERTPKTHDR(fl->m0);
1679		MPASS(fl->m0->m_pkthdr.len == total);
1680		MPASS(fl->remaining < total);
1681
1682		m0 = fl->m0;
1683		pnext = fl->pnext;
1684		remaining = fl->remaining;
1685		fl->flags &= ~FL_BUF_RESUME;
1686		goto get_segment;
1687	}
1688
1689	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
1690		fl->rx_offset = 0;
1691		if (__predict_false(++fl->cidx % 8 == 0)) {
1692			uint16_t cidx = fl->cidx / 8;
1693
1694			if (__predict_false(cidx == fl->sidx))
1695				fl->cidx = cidx = 0;
1696			fl->hw_cidx = cidx;
1697		}
1698	}
1699
1700	/*
1701	 * Payload starts at rx_offset in the current hw buffer.  Its length is
1702	 * 'total' and it may span multiple hw buffers.
1703	 */
1704
1705	m0 = get_scatter_segment(sc, fl, 0, total);
1706	if (m0 == NULL)
1707		return (NULL);
1708	remaining = total - m0->m_len;
1709	pnext = &m0->m_next;
1710	while (remaining > 0) {
1711get_segment:
1712		MPASS(fl->rx_offset == 0);
1713		m = get_scatter_segment(sc, fl, total - remaining, remaining);
1714		if (__predict_false(m == NULL)) {
1715			fl->m0 = m0;
1716			fl->pnext = pnext;
1717			fl->remaining = remaining;
1718			fl->flags |= FL_BUF_RESUME;
1719			return (NULL);
1720		}
1721		*pnext = m;
1722		pnext = &m->m_next;
1723		remaining -= m->m_len;
1724	}
1725	*pnext = NULL;
1726
1727	return (m0);
1728}
1729
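/*
 * CPL_RX_PKT handler: strip the fl_pktshift padding, fill in rcvif, flowid,
 * hardware checksum, and VLAN information, then hand the frame to LRO (if
 * eligible) or directly to the ifnet.
 */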
1730static int
1731t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
1732{
1733	struct sge_rxq *rxq = iq_to_rxq(iq);
1734	struct ifnet *ifp = rxq->ifp;
1735	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
1736#if defined(INET) || defined(INET6)
1737	struct lro_ctrl *lro = &rxq->lro;
1738#endif
1739
1740	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
1741	    rss->opcode));
1742
1743	m0->m_pkthdr.len -= fl_pktshift;
1744	m0->m_len -= fl_pktshift;
1745	m0->m_data += fl_pktshift;
1746
1747	m0->m_pkthdr.rcvif = ifp;
1748	M_HASHTYPE_SET(m0, M_HASHTYPE_OPAQUE);
1749	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
1750
1751	if (cpl->csum_calc && !cpl->err_vec) {
1752		if (ifp->if_capenable & IFCAP_RXCSUM &&
1753		    cpl->l2info & htobe32(F_RXF_IP)) {
1754			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
1755			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1756			rxq->rxcsum++;
1757		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
1758		    cpl->l2info & htobe32(F_RXF_IP6)) {
1759			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
1760			    CSUM_PSEUDO_HDR);
1761			rxq->rxcsum++;
1762		}
1763
1764		if (__predict_false(cpl->ip_frag))
1765			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
1766		else
1767			m0->m_pkthdr.csum_data = 0xffff;
1768	}
1769
1770	if (cpl->vlan_ex) {
1771		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
1772		m0->m_flags |= M_VLANTAG;
1773		rxq->vlan_extraction++;
1774	}
1775
1776#if defined(INET) || defined(INET6)
1777	if (cpl->l2info & htobe32(F_RXF_LRO) &&
1778	    iq->flags & IQ_LRO_ENABLED &&
1779	    tcp_lro_rx(lro, m0, 0) == 0) {
1780		/* queued for LRO */
1781	} else
1782#endif
1783	ifp->if_input(ifp, m0);
1784
1785	return (0);
1786}
1787
1788/*
1789 * Must drain the wrq or make sure that someone else will.
1790 */
1791static void
1792wrq_tx_drain(void *arg, int n)
1793{
1794	struct sge_wrq *wrq = arg;
1795	struct sge_eq *eq = &wrq->eq;
1796
1797	EQ_LOCK(eq);
1798	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
1799		drain_wrq_wr_list(wrq->adapter, wrq);
1800	EQ_UNLOCK(eq);
1801}
1802
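/*
 * Copy as many of the work requests queued on wr_list as will fit into the
 * hardware descriptor ring, ringing the doorbell every 16 descriptors or so.
 * Called with the EQ lock held and no incomplete WRs outstanding; stops as
 * soon as a WR doesn't fit.
 */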
1803static void
1804drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
1805{
1806	struct sge_eq *eq = &wrq->eq;
1807	u_int available, dbdiff;	/* # of hardware descriptors */
1808	u_int n;
1809	struct wrqe *wr;
1810	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
1811
1812	EQ_LOCK_ASSERT_OWNED(eq);
1813	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
1814	wr = STAILQ_FIRST(&wrq->wr_list);
1815	MPASS(wr != NULL);	/* Must be called with something useful to do */
1816	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
1817
1818	do {
1819		eq->cidx = read_hw_cidx(eq);
1820		if (eq->pidx == eq->cidx)
1821			available = eq->sidx - 1;
1822		else
1823			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
1824
1825		MPASS(wr->wrq == wrq);
1826		n = howmany(wr->wr_len, EQ_ESIZE);
1827		if (available < n)
1828			return;
1829
1830		dst = (void *)&eq->desc[eq->pidx];
1831		if (__predict_true(eq->sidx - eq->pidx > n)) {
1832			/* Won't wrap, won't end exactly at the status page. */
1833			bcopy(&wr->wr[0], dst, wr->wr_len);
1834			eq->pidx += n;
1835		} else {
1836			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
1837
1838			bcopy(&wr->wr[0], dst, first_portion);
1839			if (wr->wr_len > first_portion) {
1840				bcopy(&wr->wr[first_portion], &eq->desc[0],
1841				    wr->wr_len - first_portion);
1842			}
1843			eq->pidx = n - (eq->sidx - eq->pidx);
1844		}
1845
1846		if (available < eq->sidx / 4 &&
1847		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
1848			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
1849			    F_FW_WR_EQUEQ);
1850			eq->equeqidx = eq->pidx;
1851		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
1852			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
1853			eq->equeqidx = eq->pidx;
1854		}
1855
1856		dbdiff += n;
1857		if (dbdiff >= 16) {
1858			ring_eq_db(sc, eq, dbdiff);
1859			dbdiff = 0;
1860		}
1861
1862		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1863		free_wrqe(wr);
1864		MPASS(wrq->nwr_pending > 0);
1865		wrq->nwr_pending--;
1866		MPASS(wrq->ndesc_needed >= n);
1867		wrq->ndesc_needed -= n;
1868	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
1869
1870	if (dbdiff)
1871		ring_eq_db(sc, eq, dbdiff);
1872}
1873
1874/*
1875 * Doesn't fail.  Holds on to work requests it can't send right away.
1876 */
1877void
1878t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
1879{
1880#ifdef INVARIANTS
1881	struct sge_eq *eq = &wrq->eq;
1882#endif
1883
1884	EQ_LOCK_ASSERT_OWNED(eq);
1885	MPASS(wr != NULL);
1886	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
1887	MPASS((wr->wr_len & 0x7) == 0);
1888
1889	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
1890	wrq->nwr_pending++;
1891	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
1892
1893	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
1894		return;	/* commit_wrq_wr will drain wr_list as well. */
1895
1896	drain_wrq_wr_list(sc, wrq);
1897
1898	/* Doorbell must have caught up to the pidx. */
1899	MPASS(eq->pidx == eq->dbidx);
1900}
1901
1902void
1903t4_update_fl_bufsize(struct ifnet *ifp)
1904{
1905	struct port_info *pi = ifp->if_softc;
1906	struct adapter *sc = pi->adapter;
1907	struct sge_rxq *rxq;
1908#ifdef TCP_OFFLOAD
1909	struct sge_ofld_rxq *ofld_rxq;
1910#endif
1911	struct sge_fl *fl;
1912	int i, maxp, mtu = ifp->if_mtu;
1913
1914	maxp = mtu_to_max_payload(sc, mtu, 0);
1915	for_each_rxq(pi, i, rxq) {
1916		fl = &rxq->fl;
1917
1918		FL_LOCK(fl);
1919		find_best_refill_source(sc, fl, maxp);
1920		FL_UNLOCK(fl);
1921	}
1922#ifdef TCP_OFFLOAD
1923	maxp = mtu_to_max_payload(sc, mtu, 1);
1924	for_each_ofld_rxq(pi, i, ofld_rxq) {
1925		fl = &ofld_rxq->fl;
1926
1927		FL_LOCK(fl);
1928		find_best_refill_source(sc, fl, maxp);
1929		FL_UNLOCK(fl);
1930	}
1931#endif
1932}
1933
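/*
 * parse_pkt stashes its results in spare pkthdr fields so that they don't have
 * to be recomputed on the tx path: the number of DMA segments is kept in
 * l5hlen and the WR length (in 16 byte units) in PH_loc.eight[0].
 */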
1934static inline int
1935mbuf_nsegs(struct mbuf *m)
1936{
1937
1938	M_ASSERTPKTHDR(m);
1939	KASSERT(m->m_pkthdr.l5hlen > 0,
1940	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
1941
1942	return (m->m_pkthdr.l5hlen);
1943}
1944
1945static inline void
1946set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
1947{
1948
1949	M_ASSERTPKTHDR(m);
1950	m->m_pkthdr.l5hlen = nsegs;
1951}
1952
1953static inline int
1954mbuf_len16(struct mbuf *m)
1955{
1956	int n;
1957
1958	M_ASSERTPKTHDR(m);
1959	n = m->m_pkthdr.PH_loc.eight[0];
1960	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
1961
1962	return (n);
1963}
1964
1965static inline void
1966set_mbuf_len16(struct mbuf *m, uint8_t len16)
1967{
1968
1969	M_ASSERTPKTHDR(m);
1970	m->m_pkthdr.PH_loc.eight[0] = len16;
1971}
1972
1973static inline int
1974needs_tso(struct mbuf *m)
1975{
1976
1977	M_ASSERTPKTHDR(m);
1978
1979	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1980		KASSERT(m->m_pkthdr.tso_segsz > 0,
1981		    ("%s: TSO requested in mbuf %p but MSS not provided",
1982		    __func__, m));
1983		return (1);
1984	}
1985
1986	return (0);
1987}
1988
1989static inline int
1990needs_l3_csum(struct mbuf *m)
1991{
1992
1993	M_ASSERTPKTHDR(m);
1994
1995	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
1996		return (1);
1997	return (0);
1998}
1999
2000static inline int
2001needs_l4_csum(struct mbuf *m)
2002{
2003
2004	M_ASSERTPKTHDR(m);
2005
2006	if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
2007	    CSUM_TCP_IPV6 | CSUM_TSO))
2008		return (1);
2009	return (0);
2010}
2011
2012static inline int
2013needs_vlan_insertion(struct mbuf *m)
2014{
2015
2016	M_ASSERTPKTHDR(m);
2017
2018	if (m->m_flags & M_VLANTAG) {
2019		KASSERT(m->m_pkthdr.ether_vtag != 0,
2020		    ("%s: HWVLAN requested in mbuf %p but tag not provided",
2021		    __func__, m));
2022		return (1);
2023	}
2024	return (0);
2025}
2026
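/*
 * Advance the (mbuf, offset) cursor by 'len' bytes within a chain and return a
 * pointer to the data at the new position.
 */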
2027static void *
2028m_advance(struct mbuf **pm, int *poffset, int len)
2029{
2030	struct mbuf *m = *pm;
2031	int offset = *poffset;
2032	uintptr_t p = 0;
2033
2034	MPASS(len > 0);
2035
2036	while (len) {
2037		if (offset + len < m->m_len) {
2038			offset += len;
2039			p = mtod(m, uintptr_t) + offset;
2040			break;
2041		}
2042		len -= m->m_len - offset;
2043		m = m->m_next;
2044		offset = 0;
2045		MPASS(m != NULL);
2046	}
2047	*poffset = offset;
2048	*pm = m;
2049	return ((void *)p);
2050}
2051
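/*
 * Returns true if the two kernel virtual addresses resolve to the same
 * physical address.  count_mbuf_nsegs uses this to avoid counting an extra
 * segment when one mbuf's data ends exactly where the next one's begins.
 */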
2052static inline int
2053same_paddr(char *a, char *b)
2054{
2055
2056	if (a == b)
2057		return (1);
2058	else if (a != NULL && b != NULL) {
2059		vm_offset_t x = (vm_offset_t)a;
2060		vm_offset_t y = (vm_offset_t)b;
2061
2062		if ((x & PAGE_MASK) == (y & PAGE_MASK) &&
2063		    pmap_kextract(x) == pmap_kextract(y))
2064			return (1);
2065	}
2066
2067	return (0);
2068}
2069
2070/*
2071 * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
2072 * must have at least one mbuf that's not empty.
2073 */
2074static inline int
2075count_mbuf_nsegs(struct mbuf *m)
2076{
2077	char *prev_end, *start;
2078	int len, nsegs;
2079
2080	MPASS(m != NULL);
2081
2082	nsegs = 0;
2083	prev_end = NULL;
2084	for (; m; m = m->m_next) {
2085
2086		len = m->m_len;
2087		if (__predict_false(len == 0))
2088			continue;
2089		start = mtod(m, char *);
2090
2091		nsegs += sglist_count(start, len);
2092		if (same_paddr(prev_end, start))
2093			nsegs--;
2094		prev_end = start + len;
2095	}
2096
2097	MPASS(nsegs > 0);
2098	return (nsegs);
2099}
2100
2101/*
2102 * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
2103 * a) caller can assume it's been freed if this function returns with an error.
2104 * b) it may get defragged if the gather list is too long for the hardware.
2105 */
2106int
2107parse_pkt(struct mbuf **mp)
2108{
2109	struct mbuf *m0 = *mp, *m;
2110	int rc, nsegs, defragged = 0, offset;
2111	struct ether_header *eh;
2112	void *l3hdr;
2113#if defined(INET) || defined(INET6)
2114	struct tcphdr *tcp;
2115#endif
2116	uint16_t eh_type;
2117
2118	M_ASSERTPKTHDR(m0);
2119	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
2120		rc = EINVAL;
2121fail:
2122		m_freem(m0);
2123		*mp = NULL;
2124		return (rc);
2125	}
2126restart:
2127	/*
2128	 * First count the number of gather list segments in the payload.
2129	 * Defrag the mbuf if nsegs exceeds the hardware limit.
2130	 */
2131	M_ASSERTPKTHDR(m0);
2132	MPASS(m0->m_pkthdr.len > 0);
2133	nsegs = count_mbuf_nsegs(m0);
2134	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
2135		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
2136			rc = EFBIG;
2137			goto fail;
2138		}
2139		*mp = m0 = m;	/* update caller's copy after defrag */
2140		goto restart;
2141	}
2142
2143	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
2144		m0 = m_pullup(m0, m0->m_pkthdr.len);
2145		if (m0 == NULL) {
2146			/* Should have left well enough alone. */
2147			rc = EFBIG;
2148			goto fail;
2149		}
2150		*mp = m0;	/* update caller's copy after pullup */
2151		goto restart;
2152	}
2153	set_mbuf_nsegs(m0, nsegs);
2154	set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
2155
2156	if (!needs_tso(m0))
2157		return (0);
2158
2159	m = m0;
2160	eh = mtod(m, struct ether_header *);
2161	eh_type = ntohs(eh->ether_type);
2162	if (eh_type == ETHERTYPE_VLAN) {
2163		struct ether_vlan_header *evh = (void *)eh;
2164
2165		eh_type = ntohs(evh->evl_proto);
2166		m0->m_pkthdr.l2hlen = sizeof(*evh);
2167	} else
2168		m0->m_pkthdr.l2hlen = sizeof(*eh);
2169
2170	offset = 0;
2171	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2172
2173	switch (eh_type) {
2174#ifdef INET6
2175	case ETHERTYPE_IPV6:
2176	{
2177		struct ip6_hdr *ip6 = l3hdr;
2178
2179		MPASS(ip6->ip6_nxt == IPPROTO_TCP);
2180
2181		m0->m_pkthdr.l3hlen = sizeof(*ip6);
2182		break;
2183	}
2184#endif
2185#ifdef INET
2186	case ETHERTYPE_IP:
2187	{
2188		struct ip *ip = l3hdr;
2189
2190		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
2191		break;
2192	}
2193#endif
2194	default:
2195		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
2196		    " with the same INET/INET6 options as the kernel.",
2197		    __func__, eh_type);
2198	}
2199
2200#if defined(INET) || defined(INET6)
2201	tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
2202	m0->m_pkthdr.l4hlen = tcp->th_off * 4;
2203#endif
2204	MPASS(m0 == *mp);
2205	return (0);
2206}
2207
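/*
 * Reserve room for a work request to be written directly into the descriptor
 * ring.  Returns a pointer into the ring (or into the spill buffer wrq->ss if
 * the WR would wrap past the end of the ring), or falls back to an allocated
 * wrqe (marked by cookie->pidx == -1) when the ring is backlogged or full.
 * The caller writes the WR and then hands it over with commit_wrq_wr.
 */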
2208void *
2209start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
2210{
2211	struct sge_eq *eq = &wrq->eq;
2212	struct adapter *sc = wrq->adapter;
2213	int ndesc, available;
2214	struct wrqe *wr;
2215	void *w;
2216
2217	MPASS(len16 > 0);
2218	ndesc = howmany(len16, EQ_ESIZE / 16);
2219	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
2220
2221	EQ_LOCK(eq);
2222
2223	if (!STAILQ_EMPTY(&wrq->wr_list))
2224		drain_wrq_wr_list(sc, wrq);
2225
2226	if (!STAILQ_EMPTY(&wrq->wr_list)) {
2227slowpath:
2228		EQ_UNLOCK(eq);
2229		wr = alloc_wrqe(len16 * 16, wrq);
2230		if (__predict_false(wr == NULL))
2231			return (NULL);
2232		cookie->pidx = -1;
2233		cookie->ndesc = ndesc;
2234		return (&wr->wr);
2235	}
2236
2237	eq->cidx = read_hw_cidx(eq);
2238	if (eq->pidx == eq->cidx)
2239		available = eq->sidx - 1;
2240	else
2241		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2242	if (available < ndesc)
2243		goto slowpath;
2244
2245	cookie->pidx = eq->pidx;
2246	cookie->ndesc = ndesc;
2247	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
2248
2249	w = &eq->desc[eq->pidx];
2250	IDXINCR(eq->pidx, ndesc, eq->sidx);
2251	if (__predict_false(eq->pidx < ndesc - 1)) {
2252		w = &wrq->ss[0];
2253		wrq->ss_pidx = cookie->pidx;
2254		wrq->ss_len = len16 * 16;
2255	}
2256
2257	EQ_UNLOCK(eq);
2258
2259	return (w);
2260}
2261
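/*
 * Hand a WR obtained from start_wrq_wr to the hardware.  A WR staged in the
 * spill buffer is copied into the ring here (in two pieces, around the wrap),
 * and one that fell back to an allocated wrqe goes through t4_wrq_tx.  The
 * doorbell is rung only when all earlier incomplete WRs have been committed;
 * otherwise this WR's descriptors are folded into a neighboring cookie so a
 * later commit covers them.
 */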
2262void
2263commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
2264{
2265	struct sge_eq *eq = &wrq->eq;
2266	struct adapter *sc = wrq->adapter;
2267	int ndesc, pidx;
2268	struct wrq_cookie *prev, *next;
2269
2270	if (cookie->pidx == -1) {
2271		struct wrqe *wr = __containerof(w, struct wrqe, wr);
2272
2273		t4_wrq_tx(sc, wr);
2274		return;
2275	}
2276
2277	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
2278	pidx = cookie->pidx;
2279	MPASS(pidx >= 0 && pidx < eq->sidx);
2280	if (__predict_false(w == &wrq->ss[0])) {
2281		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
2282
2283		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
2284		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
2285		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
2286		wrq->tx_wrs_ss++;
2287	} else
2288		wrq->tx_wrs_direct++;
2289
2290	EQ_LOCK(eq);
2291	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
2292	next = TAILQ_NEXT(cookie, link);
2293	if (prev == NULL) {
2294		MPASS(pidx == eq->dbidx);
2295		if (next == NULL || ndesc >= 16)
2296			ring_eq_db(wrq->adapter, eq, ndesc);
2297		else {
2298			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
2299			next->pidx = pidx;
2300			next->ndesc += ndesc;
2301		}
2302	} else {
2303		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
2304		prev->ndesc += ndesc;
2305	}
2306	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
2307
2308	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2309		drain_wrq_wr_list(sc, wrq);
2310
2311#ifdef INVARIANTS
2312	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
2313		/* Doorbell must have caught up to the pidx. */
2314		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
2315	}
2316#endif
2317	EQ_UNLOCK(eq);
2318}
2319
2320static u_int
2321can_resume_eth_tx(struct mp_ring *r)
2322{
2323	struct sge_eq *eq = r->cookie;
2324
2325	return (total_available_tx_desc(eq) > eq->sidx / 8);
2326}
2327
2328static inline int
2329cannot_use_txpkts(struct mbuf *m)
2330{
2331	/* maybe put a GL limit too, to avoid silliness? */
2332
2333	return (needs_tso(m));
2334}
2335
2336/*
2337 * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
2338 * be consumed.  Return the actual number consumed.  0 indicates a stall.
2339 */
2340static u_int
2341eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
2342{
2343	struct sge_txq *txq = r->cookie;
2344	struct sge_eq *eq = &txq->eq;
2345	struct ifnet *ifp = txq->ifp;
2346	struct port_info *pi = (void *)ifp->if_softc;
2347	struct adapter *sc = pi->adapter;
2348	u_int total, remaining;		/* # of packets */
2349	u_int available, dbdiff;	/* # of hardware descriptors */
2350	u_int n, next_cidx;
2351	struct mbuf *m0, *tail;
2352	struct txpkts txp;
2353	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
2354
2355	remaining = IDXDIFF(pidx, cidx, r->size);
2356	MPASS(remaining > 0);	/* Must not be called without work to do. */
2357	total = 0;
2358
2359	TXQ_LOCK(txq);
2360	if (__predict_false((eq->flags & EQ_ENABLED) == 0)) {
2361		while (cidx != pidx) {
2362			m0 = r->items[cidx];
2363			m_freem(m0);
2364			if (++cidx == r->size)
2365				cidx = 0;
2366		}
2367		reclaim_tx_descs(txq, 2048);
2368		total = remaining;
2369		goto done;
2370	}
2371
2372	/* How many hardware descriptors do we have readily available. */
2373	/* How many hardware descriptors do we have readily available? */
2374		available = eq->sidx - 1;
2375	else
2376		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2377	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
2378
2379	while (remaining > 0) {
2380
2381		m0 = r->items[cidx];
2382		M_ASSERTPKTHDR(m0);
2383		MPASS(m0->m_nextpkt == NULL);
2384
2385		if (available < SGE_MAX_WR_NDESC) {
2386			available += reclaim_tx_descs(txq, 64);
2387			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
2388				break;	/* out of descriptors */
2389		}
2390
2391		next_cidx = cidx + 1;
2392		if (__predict_false(next_cidx == r->size))
2393			next_cidx = 0;
2394
2395		wr = (void *)&eq->desc[eq->pidx];
2396		if (remaining > 1 &&
2397		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
2398
2399			/* pkts at cidx, next_cidx should both be in txp. */
2400			MPASS(txp.npkt == 2);
2401			tail = r->items[next_cidx];
2402			MPASS(tail->m_nextpkt == NULL);
2403			ETHER_BPF_MTAP(ifp, m0);
2404			ETHER_BPF_MTAP(ifp, tail);
2405			m0->m_nextpkt = tail;
2406
2407			if (__predict_false(++next_cidx == r->size))
2408				next_cidx = 0;
2409
2410			while (next_cidx != pidx) {
2411				if (add_to_txpkts(r->items[next_cidx], &txp,
2412				    available) != 0)
2413					break;
2414				tail->m_nextpkt = r->items[next_cidx];
2415				tail = tail->m_nextpkt;
2416				ETHER_BPF_MTAP(ifp, tail);
2417				if (__predict_false(++next_cidx == r->size))
2418					next_cidx = 0;
2419			}
2420
2421			n = write_txpkts_wr(txq, wr, m0, &txp, available);
2422			total += txp.npkt;
2423			remaining -= txp.npkt;
2424		} else {
2425			total++;
2426			remaining--;
2427			n = write_txpkt_wr(txq, (void *)wr, m0, available);
2428			ETHER_BPF_MTAP(ifp, m0);
2429		}
2430		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
2431
2432		available -= n;
2433		dbdiff += n;
2434		IDXINCR(eq->pidx, n, eq->sidx);
2435
2436		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
2437		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
2438			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2439			    F_FW_WR_EQUEQ);
2440			eq->equeqidx = eq->pidx;
2441		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
2442			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
2443			eq->equeqidx = eq->pidx;
2444		}
2445
2446		if (dbdiff >= 16 && remaining >= 4) {
2447			ring_eq_db(sc, eq, dbdiff);
2448			available += reclaim_tx_descs(txq, 4 * dbdiff);
2449			dbdiff = 0;
2450		}
2451
2452		cidx = next_cidx;
2453	}
2454	if (dbdiff != 0) {
2455		ring_eq_db(sc, eq, dbdiff);
2456		reclaim_tx_descs(txq, 32);
2457	}
2458done:
2459	TXQ_UNLOCK(txq);
2460
2461	return (total);
2462}
2463
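/*
 * init_iq, init_fl, and init_eq only set up software state; the hardware
 * contexts are created later, by alloc_iq_fl and alloc_eq.
 */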
2464static inline void
2465init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
2466    int qsize)
2467{
2468
2469	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
2470	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
2471	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
2472	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
2473
2474	iq->flags = 0;
2475	iq->adapter = sc;
2476	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
2477	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
2478	if (pktc_idx >= 0) {
2479		iq->intr_params |= F_QINTR_CNT_EN;
2480		iq->intr_pktc_idx = pktc_idx;
2481	}
2482	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
2483	iq->sidx = iq->qsize - spg_len / IQ_ESIZE;
2484}
2485
2486static inline void
2487init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
2488{
2489
2490	fl->qsize = qsize;
2491	fl->sidx = qsize - spg_len / EQ_ESIZE;
2492	strlcpy(fl->lockname, name, sizeof(fl->lockname));
2493	if (sc->flags & BUF_PACKING_OK &&
2494	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
2495	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
2496		fl->flags |= FL_BUF_PACKING;
2497	find_best_refill_source(sc, fl, maxp);
2498	find_safe_refill_source(sc, fl);
2499}
2500
2501static inline void
2502init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan,
2503    uint16_t iqid, char *name)
2504{
2505	KASSERT(tx_chan < NCHAN, ("%s: bad tx channel %d", __func__, tx_chan));
2506	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
2507
2508	eq->flags = eqtype & EQ_TYPEMASK;
2509	eq->tx_chan = tx_chan;
2510	eq->iqid = iqid;
2511	eq->sidx = qsize - spg_len / EQ_ESIZE;
2512	strlcpy(eq->lockname, name, sizeof(eq->lockname));
2513}
2514
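/*
 * Allocate a physically contiguous, zeroed, DMA-coherent ring of 'len' bytes
 * and return its tag, map, bus address, and kernel virtual address.  Undoes
 * its own work on failure.
 */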
2515static int
2516alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
2517    bus_dmamap_t *map, bus_addr_t *pa, void **va)
2518{
2519	int rc;
2520
2521	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
2522	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
2523	if (rc != 0) {
2524		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
2525		goto done;
2526	}
2527
2528	rc = bus_dmamem_alloc(*tag, va,
2529	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
2530	if (rc != 0) {
2531		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
2532		goto done;
2533	}
2534
2535	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
2536	if (rc != 0) {
2537		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
2538		goto done;
2539	}
2540done:
2541	if (rc)
2542		free_ring(sc, *tag, *map, *pa, *va);
2543
2544	return (rc);
2545}
2546
2547static int
2548free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
2549    bus_addr_t pa, void *va)
2550{
2551	if (pa)
2552		bus_dmamap_unload(tag, map);
2553	if (va)
2554		bus_dmamem_free(tag, va, map);
2555	if (tag)
2556		bus_dma_tag_destroy(tag);
2557
2558	return (0);
2559}
2560
2561/*
2562 * Allocates the ring for an ingress queue and an optional freelist.  If the
2563 * freelist is specified it will be allocated and then associated with the
2564 * ingress queue.
2565 *
2566 * Returns errno on failure.  Resources allocated up to that point may still be
2567 * allocated.  Caller is responsible for cleanup in case this function fails.
2568 *
2569 * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
2570 * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
2571 * the abs_id of the ingress queue to which its interrupts should be forwarded.
2572 */
2573static int
2574alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
2575    int intr_idx, int cong)
2576{
2577	int rc, i, cntxt_id;
2578	size_t len;
2579	struct fw_iq_cmd c;
2580	struct adapter *sc = iq->adapter;
2581	__be32 v = 0;
2582
2583	len = iq->qsize * IQ_ESIZE;
2584	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
2585	    (void **)&iq->desc);
2586	if (rc != 0)
2587		return (rc);
2588
2589	bzero(&c, sizeof(c));
2590	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
2591	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
2592	    V_FW_IQ_CMD_VFN(0));
2593
2594	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
2595	    FW_LEN16(c));
2596
2597	/* Special handling for firmware event queue */
2598	if (iq == &sc->sge.fwq)
2599		v |= F_FW_IQ_CMD_IQASYNCH;
2600
2601	if (iq->flags & IQ_INTR) {
2602		KASSERT(intr_idx < sc->intr_count,
2603		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
2604	} else
2605		v |= F_FW_IQ_CMD_IQANDST;
2606	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
2607
2608	c.type_to_iqandstindex = htobe32(v |
2609	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
2610	    V_FW_IQ_CMD_VIID(pi->viid) |
2611	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
2612	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
2613	    F_FW_IQ_CMD_IQGTSMODE |
2614	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
2615	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
2616	c.iqsize = htobe16(iq->qsize);
2617	c.iqaddr = htobe64(iq->ba);
2618	if (cong >= 0)
2619		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
2620
2621	if (fl) {
2622		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
2623
2624		len = fl->qsize * EQ_ESIZE;
2625		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
2626		    &fl->ba, (void **)&fl->desc);
2627		if (rc)
2628			return (rc);
2629
2630		/* Allocate space for one software descriptor per buffer. */
2631		rc = alloc_fl_sdesc(fl);
2632		if (rc != 0) {
2633			device_printf(sc->dev,
2634			    "failed to setup fl software descriptors: %d\n",
2635			    rc);
2636			return (rc);
2637		}
2638
2639		if (fl->flags & FL_BUF_PACKING) {
2640			fl->lowat = roundup2(sc->sge.fl_starve_threshold2, 8);
2641			fl->buf_boundary = sc->sge.pack_boundary;
2642		} else {
2643			fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8);
2644			fl->buf_boundary = 16;
2645		}
2646		if (fl_pad && fl->buf_boundary < sc->sge.pad_boundary)
2647			fl->buf_boundary = sc->sge.pad_boundary;
2648
2649		c.iqns_to_fl0congen |=
2650		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
2651			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
2652			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
2653			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
2654			    0));
2655		if (cong >= 0) {
2656			c.iqns_to_fl0congen |=
2657				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
2658				    F_FW_IQ_CMD_FL0CONGCIF |
2659				    F_FW_IQ_CMD_FL0CONGEN);
2660		}
2661		c.fl0dcaen_to_fl0cidxfthresh =
2662		    htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_64B) |
2663			V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B));
2664		c.fl0size = htobe16(fl->qsize);
2665		c.fl0addr = htobe64(fl->ba);
2666	}
2667
2668	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2669	if (rc != 0) {
2670		device_printf(sc->dev,
2671		    "failed to create ingress queue: %d\n", rc);
2672		return (rc);
2673	}
2674
2675	iq->cidx = 0;
2676	iq->gen = F_RSPD_GEN;
2677	iq->intr_next = iq->intr_params;
2678	iq->cntxt_id = be16toh(c.iqid);
2679	iq->abs_id = be16toh(c.physiqid);
2680	iq->flags |= IQ_ALLOCATED;
2681
2682	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
2683	if (cntxt_id >= sc->sge.niq) {
2684		panic("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
2685		    cntxt_id, sc->sge.niq - 1);
2686	}
2687	sc->sge.iqmap[cntxt_id] = iq;
2688
2689	if (fl) {
2690		u_int qid;
2691
2692		iq->flags |= IQ_HAS_FL;
2693		fl->cntxt_id = be16toh(c.fl0id);
2694		fl->pidx = fl->cidx = 0;
2695
2696		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
2697		if (cntxt_id >= sc->sge.neq) {
2698			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
2699			    __func__, cntxt_id, sc->sge.neq - 1);
2700		}
2701		sc->sge.eqmap[cntxt_id] = (void *)fl;
2702
2703		qid = fl->cntxt_id;
2704		if (isset(&sc->doorbells, DOORBELL_UDB)) {
2705			uint32_t s_qpp = sc->sge.eq_s_qpp;
2706			uint32_t mask = (1 << s_qpp) - 1;
2707			volatile uint8_t *udb;
2708
2709			udb = sc->udbs_base + UDBS_DB_OFFSET;
2710			udb += (qid >> s_qpp) << PAGE_SHIFT;
2711			qid &= mask;
2712			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
2713				udb += qid << UDBS_SEG_SHIFT;
2714				qid = 0;
2715			}
2716			fl->udb = (volatile void *)udb;
2717		}
2718		fl->dbval = F_DBPRIO | V_QID(qid);
2719		if (is_t5(sc))
2720			fl->dbval |= F_DBTYPE;
2721
2722		FL_LOCK(fl);
2723		/* Enough to make sure the SGE doesn't think it's starved */
2724		refill_fl(sc, fl, fl->lowat);
2725		FL_UNLOCK(fl);
2726	}
2727
2728	if (is_t5(sc) && cong >= 0) {
2729		uint32_t param, val;
2730
2731		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
2732		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
2733		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
2734		if (cong == 0)
2735			val = 1 << 19;
2736		else {
2737			val = 2 << 19;
2738			for (i = 0; i < 4; i++) {
2739				if (cong & (1 << i))
2740					val |= 1 << (i << 2);
2741			}
2742		}
2743
2744		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
2745		if (rc != 0) {
2746			/* report error but carry on */
2747			device_printf(sc->dev,
2748			    "failed to set congestion manager context for "
2749			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
2750		}
2751	}
2752
2753	/* Enable IQ interrupts */
2754	atomic_store_rel_int(&iq->state, IQS_IDLE);
2755	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) |
2756	    V_INGRESSQID(iq->cntxt_id));
2757
2758	return (0);
2759}
2760
2761static int
2762free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl)
2763{
2764	int rc;
2765	struct adapter *sc = iq->adapter;
2766	device_t dev;
2767
2768	if (sc == NULL)
2769		return (0);	/* nothing to do */
2770
2771	dev = pi ? pi->dev : sc->dev;
2772
2773	if (iq->flags & IQ_ALLOCATED) {
2774		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
2775		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
2776		    fl ? fl->cntxt_id : 0xffff, 0xffff);
2777		if (rc != 0) {
2778			device_printf(dev,
2779			    "failed to free queue %p: %d\n", iq, rc);
2780			return (rc);
2781		}
2782		iq->flags &= ~IQ_ALLOCATED;
2783	}
2784
2785	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
2786
2787	bzero(iq, sizeof(*iq));
2788
2789	if (fl) {
2790		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
2791		    fl->desc);
2792
2793		if (fl->sdesc)
2794			free_fl_sdesc(sc, fl);
2795
2796		if (mtx_initialized(&fl->fl_lock))
2797			mtx_destroy(&fl->fl_lock);
2798
2799		bzero(fl, sizeof(*fl));
2800	}
2801
2802	return (0);
2803}
2804
2805static void
2806add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
2807    struct sge_fl *fl)
2808{
2809	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2810
2811	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
2812	    "freelist");
2813	children = SYSCTL_CHILDREN(oid);
2814
2815	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
2816	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
2817	    "SGE context id of the freelist");
2818	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
2819	    fl_pad ? 1 : 0, "padding enabled");
2820	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
2821	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
2822	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
2823	    0, "consumer index");
2824	if (fl->flags & FL_BUF_PACKING) {
2825		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
2826		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
2827	}
2828	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
2829	    0, "producer index");
2830	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
2831	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
2832	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
2833	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
2834	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
2835	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
2836	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
2837	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
2838	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
2839	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
2840}
2841
2842static int
2843alloc_fwq(struct adapter *sc)
2844{
2845	int rc, intr_idx;
2846	struct sge_iq *fwq = &sc->sge.fwq;
2847	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2848	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2849
2850	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
2851	fwq->flags |= IQ_INTR;	/* always */
2852	intr_idx = sc->intr_count > 1 ? 1 : 0;
2853	rc = alloc_iq_fl(sc->port[0], fwq, NULL, intr_idx, -1);
2854	if (rc != 0) {
2855		device_printf(sc->dev,
2856		    "failed to create firmware event queue: %d\n", rc);
2857		return (rc);
2858	}
2859
2860	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
2861	    NULL, "firmware event queue");
2862	children = SYSCTL_CHILDREN(oid);
2863
2864	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
2865	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
2866	    "absolute id of the queue");
2867	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
2868	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
2869	    "SGE context id of the queue");
2870	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
2871	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
2872	    "consumer index");
2873
2874	return (0);
2875}
2876
2877static int
2878free_fwq(struct adapter *sc)
2879{
2880	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
2881}
2882
2883static int
2884alloc_mgmtq(struct adapter *sc)
2885{
2886	int rc;
2887	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
2888	char name[16];
2889	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2890	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2891
2892	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
2893	    NULL, "management queue");
2894
2895	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
2896	init_eq(&mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
2897	    sc->sge.fwq.cntxt_id, name);
2898	rc = alloc_wrq(sc, NULL, mgmtq, oid);
2899	if (rc != 0) {
2900		device_printf(sc->dev,
2901		    "failed to create management queue: %d\n", rc);
2902		return (rc);
2903	}
2904
2905	return (0);
2906}
2907
2908static int
2909free_mgmtq(struct adapter *sc)
2910{
2911
2912	return free_wrq(sc, &sc->sge.mgmtq);
2913}
2914
2915static inline int
2916tnl_cong(struct port_info *pi)
2917{
2918
2919	if (cong_drop == -1)
2920		return (-1);
2921	else if (cong_drop == 1)
2922		return (0);
2923	else
2924		return (pi->rx_chan_map);
2925}
2926
2927static int
2928alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx,
2929    struct sysctl_oid *oid)
2930{
2931	int rc;
2932	struct sysctl_oid_list *children;
2933	char name[16];
2934
2935	rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(pi));
2936	if (rc != 0)
2937		return (rc);
2938
2939	/*
2940	 * The freelist is just barely above the starvation threshold right now;
2941	 * fill it up a bit more.
2942	 */
2943	FL_LOCK(&rxq->fl);
2944	refill_fl(pi->adapter, &rxq->fl, 128);
2945	FL_UNLOCK(&rxq->fl);
2946
2947#if defined(INET) || defined(INET6)
2948	rc = tcp_lro_init(&rxq->lro);
2949	if (rc != 0)
2950		return (rc);
2951	rxq->lro.ifp = pi->ifp; /* also indicates LRO init'ed */
2952
2953	if (pi->ifp->if_capenable & IFCAP_LRO)
2954		rxq->iq.flags |= IQ_LRO_ENABLED;
2955#endif
2956	rxq->ifp = pi->ifp;
2957
2958	children = SYSCTL_CHILDREN(oid);
2959
2960	snprintf(name, sizeof(name), "%d", idx);
2961	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2962	    NULL, "rx queue");
2963	children = SYSCTL_CHILDREN(oid);
2964
2965	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id",
2966	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
2967	    "absolute id of the queue");
2968	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
2969	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
2970	    "SGE context id of the queue");
2971	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
2972	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
2973	    "consumer index");
2974#if defined(INET) || defined(INET6)
2975	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
2976	    &rxq->lro.lro_queued, 0, NULL);
2977	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
2978	    &rxq->lro.lro_flushed, 0, NULL);
2979#endif
2980	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
2981	    &rxq->rxcsum, "# of times hardware assisted with checksum");
2982	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_extraction",
2983	    CTLFLAG_RD, &rxq->vlan_extraction,
2984	    "# of times hardware extracted 802.1Q tag");
2985
2986	add_fl_sysctls(&pi->ctx, oid, &rxq->fl);
2987
2988	return (rc);
2989}
2990
2991static int
2992free_rxq(struct port_info *pi, struct sge_rxq *rxq)
2993{
2994	int rc;
2995
2996#if defined(INET) || defined(INET6)
2997	if (rxq->lro.ifp) {
2998		tcp_lro_free(&rxq->lro);
2999		rxq->lro.ifp = NULL;
3000	}
3001#endif
3002
3003	rc = free_iq_fl(pi, &rxq->iq, &rxq->fl);
3004	if (rc == 0)
3005		bzero(rxq, sizeof(*rxq));
3006
3007	return (rc);
3008}
3009
3010#ifdef TCP_OFFLOAD
3011static int
3012alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq,
3013    int intr_idx, int idx, struct sysctl_oid *oid)
3014{
3015	int rc;
3016	struct sysctl_oid_list *children;
3017	char name[16];
3018
3019	rc = alloc_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx,
3020	    pi->rx_chan_map);
3021	if (rc != 0)
3022		return (rc);
3023
3024	children = SYSCTL_CHILDREN(oid);
3025
3026	snprintf(name, sizeof(name), "%d", idx);
3027	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3028	    NULL, "rx queue");
3029	children = SYSCTL_CHILDREN(oid);
3030
3031	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id",
3032	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
3033	    "I", "absolute id of the queue");
3034	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
3035	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
3036	    "I", "SGE context id of the queue");
3037	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
3038	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
3039	    "consumer index");
3040
3041	add_fl_sysctls(&pi->ctx, oid, &ofld_rxq->fl);
3042
3043	return (rc);
3044}
3045
3046static int
3047free_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq)
3048{
3049	int rc;
3050
3051	rc = free_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl);
3052	if (rc == 0)
3053		bzero(ofld_rxq, sizeof(*ofld_rxq));
3054
3055	return (rc);
3056}
3057#endif
3058
3059#ifdef DEV_NETMAP
3060static int
3061alloc_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq, int intr_idx,
3062    int idx, struct sysctl_oid *oid)
3063{
3064	int rc;
3065	struct sysctl_oid_list *children;
3066	struct sysctl_ctx_list *ctx;
3067	char name[16];
3068	size_t len;
3069	struct adapter *sc = pi->adapter;
3070	struct netmap_adapter *na = NA(pi->nm_ifp);
3071
3072	MPASS(na != NULL);
3073
3074	len = pi->qsize_rxq * IQ_ESIZE;
3075	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
3076	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
3077	if (rc != 0)
3078		return (rc);
3079
3080	len = na->num_rx_desc * EQ_ESIZE + spg_len;
3081	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
3082	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
3083	if (rc != 0)
3084		return (rc);
3085
3086	nm_rxq->pi = pi;
3087	nm_rxq->nid = idx;
3088	nm_rxq->iq_cidx = 0;
3089	nm_rxq->iq_sidx = pi->qsize_rxq - spg_len / IQ_ESIZE;
3090	nm_rxq->iq_gen = F_RSPD_GEN;
3091	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
3092	nm_rxq->fl_sidx = na->num_rx_desc;
3093	nm_rxq->intr_idx = intr_idx;
3094
3095	ctx = &pi->ctx;
3096	children = SYSCTL_CHILDREN(oid);
3097
3098	snprintf(name, sizeof(name), "%d", idx);
3099	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
3100	    "rx queue");
3101	children = SYSCTL_CHILDREN(oid);
3102
3103	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
3104	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
3105	    "I", "absolute id of the queue");
3106	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3107	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
3108	    "I", "SGE context id of the queue");
3109	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3110	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
3111	    "consumer index");
3112
3113	children = SYSCTL_CHILDREN(oid);
3114	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
3115	    "freelist");
3116	children = SYSCTL_CHILDREN(oid);
3117
3118	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3119	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
3120	    "I", "SGE context id of the freelist");
3121	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
3122	    &nm_rxq->fl_cidx, 0, "consumer index");
3123	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
3124	    &nm_rxq->fl_pidx, 0, "producer index");
3125
3126	return (rc);
3127}
3128
3129
3130static int
3131free_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq)
3132{
3133	struct adapter *sc = pi->adapter;
3134
3135	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
3136	    nm_rxq->iq_desc);
3137	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
3138	    nm_rxq->fl_desc);
3139
3140	return (0);
3141}
3142
3143static int
3144alloc_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
3145    struct sysctl_oid *oid)
3146{
3147	int rc;
3148	size_t len;
3149	struct adapter *sc = pi->adapter;
3150	struct netmap_adapter *na = NA(pi->nm_ifp);
3151	char name[16];
3152	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3153
3154	len = na->num_tx_desc * EQ_ESIZE + spg_len;
3155	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
3156	    &nm_txq->ba, (void **)&nm_txq->desc);
3157	if (rc)
3158		return (rc);
3159
3160	nm_txq->pidx = nm_txq->cidx = 0;
3161	nm_txq->sidx = na->num_tx_desc;
3162	nm_txq->nid = idx;
3163	nm_txq->iqidx = iqidx;
3164	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3165	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf));
3166
3167	snprintf(name, sizeof(name), "%d", idx);
3168	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3169	    NULL, "netmap tx queue");
3170	children = SYSCTL_CHILDREN(oid);
3171
3172	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3173	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
3174	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
3175	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
3176	    "consumer index");
3177	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx",
3178	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
3179	    "producer index");
3180
3181	return (rc);
3182}
3183
3184static int
3185free_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq)
3186{
3187	struct adapter *sc = pi->adapter;
3188
3189	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
3190	    nm_txq->desc);
3191
3192	return (0);
3193}
3194#endif
3195
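/*
 * The three *_eq_alloc routines differ only in the firmware command used to
 * create the egress queue's hardware context (FW_EQ_CTRL_CMD, FW_EQ_ETH_CMD,
 * or FW_EQ_OFLD_CMD); each records the new context id in the eqmap.
 */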
3196static int
3197ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
3198{
3199	int rc, cntxt_id;
3200	struct fw_eq_ctrl_cmd c;
3201	int qsize = eq->sidx + spg_len / EQ_ESIZE;
3202
3203	bzero(&c, sizeof(c));
3204
3205	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
3206	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
3207	    V_FW_EQ_CTRL_CMD_VFN(0));
3208	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
3209	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
3210	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
3211	c.physeqid_pkd = htobe32(0);
3212	c.fetchszm_to_iqid =
3213	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3214		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
3215		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
3216	c.dcaen_to_eqsize =
3217	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3218		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3219		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
3220	c.eqaddr = htobe64(eq->ba);
3221
3222	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3223	if (rc != 0) {
3224		device_printf(sc->dev,
3225		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
3226		return (rc);
3227	}
3228	eq->flags |= EQ_ALLOCATED;
3229
3230	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
3231	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3232	if (cntxt_id >= sc->sge.neq)
3233	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3234		cntxt_id, sc->sge.neq - 1);
3235	sc->sge.eqmap[cntxt_id] = eq;
3236
3237	return (rc);
3238}
3239
3240static int
3241eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
3242{
3243	int rc, cntxt_id;
3244	struct fw_eq_eth_cmd c;
3245	int qsize = eq->sidx + spg_len / EQ_ESIZE;
3246
3247	bzero(&c, sizeof(c));
3248
3249	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
3250	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
3251	    V_FW_EQ_ETH_CMD_VFN(0));
3252	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
3253	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
3254	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
3255	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(pi->viid));
3256	c.fetchszm_to_iqid =
3257	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3258		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
3259		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
3260	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3261	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3262	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
3263	c.eqaddr = htobe64(eq->ba);
3264
3265	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3266	if (rc != 0) {
3267		device_printf(pi->dev,
3268		    "failed to create Ethernet egress queue: %d\n", rc);
3269		return (rc);
3270	}
3271	eq->flags |= EQ_ALLOCATED;
3272
3273	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
3274	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3275	if (cntxt_id >= sc->sge.neq)
3276	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3277		cntxt_id, sc->sge.neq - 1);
3278	sc->sge.eqmap[cntxt_id] = eq;
3279
3280	return (rc);
3281}
3282
3283#ifdef TCP_OFFLOAD
3284static int
3285ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
3286{
3287	int rc, cntxt_id;
3288	struct fw_eq_ofld_cmd c;
3289	int qsize = eq->sidx + spg_len / EQ_ESIZE;
3290
3291	bzero(&c, sizeof(c));
3292
3293	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
3294	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
3295	    V_FW_EQ_OFLD_CMD_VFN(0));
3296	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
3297	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
3298	c.fetchszm_to_iqid =
3299		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3300		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
3301		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
3302	c.dcaen_to_eqsize =
3303	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3304		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3305		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
3306	c.eqaddr = htobe64(eq->ba);
3307
3308	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3309	if (rc != 0) {
3310		device_printf(pi->dev,
3311		    "failed to create egress queue for TCP offload: %d\n", rc);
3312		return (rc);
3313	}
3314	eq->flags |= EQ_ALLOCATED;
3315
3316	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
3317	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3318	if (cntxt_id >= sc->sge.neq)
3319	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3320		cntxt_id, sc->sge.neq - 1);
3321	sc->sge.eqmap[cntxt_id] = eq;
3322
3323	return (rc);
3324}
3325#endif
3326
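/*
 * Common egress queue setup: allocate the descriptor ring, create the
 * type-specific hardware context, and work out which user doorbell (if any)
 * the queue can use.
 */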
3327static int
3328alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
3329{
3330	int rc, qsize;
3331	size_t len;
3332
3333	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
3334
3335	qsize = eq->sidx + spg_len / EQ_ESIZE;
3336	len = qsize * EQ_ESIZE;
3337	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
3338	    &eq->ba, (void **)&eq->desc);
3339	if (rc)
3340		return (rc);
3341
3342	eq->pidx = eq->cidx = 0;
3343	eq->equeqidx = eq->dbidx = 0;
3344	eq->doorbells = sc->doorbells;
3345
3346	switch (eq->flags & EQ_TYPEMASK) {
3347	case EQ_CTRL:
3348		rc = ctrl_eq_alloc(sc, eq);
3349		break;
3350
3351	case EQ_ETH:
3352		rc = eth_eq_alloc(sc, pi, eq);
3353		break;
3354
3355#ifdef TCP_OFFLOAD
3356	case EQ_OFLD:
3357		rc = ofld_eq_alloc(sc, pi, eq);
3358		break;
3359#endif
3360
3361	default:
3362		panic("%s: invalid eq type %d.", __func__,
3363		    eq->flags & EQ_TYPEMASK);
3364	}
3365	if (rc != 0) {
3366		device_printf(sc->dev,
3367		    "failed to allocate egress queue(%d): %d\n",
3368		    eq->flags & EQ_TYPEMASK, rc);
3369	}
3370
3371	if (isset(&eq->doorbells, DOORBELL_UDB) ||
3372	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
3373	    isset(&eq->doorbells, DOORBELL_WCWR)) {
3374		uint32_t s_qpp = sc->sge.eq_s_qpp;
3375		uint32_t mask = (1 << s_qpp) - 1;
3376		volatile uint8_t *udb;
3377
3378		udb = sc->udbs_base + UDBS_DB_OFFSET;
3379		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
3380		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
3381		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
3382			clrbit(&eq->doorbells, DOORBELL_WCWR);
3383		else {
3384			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
3385			eq->udb_qid = 0;
3386		}
3387		eq->udb = (volatile void *)udb;
3388	}
3389
3390	return (rc);
3391}
3392
3393static int
3394free_eq(struct adapter *sc, struct sge_eq *eq)
3395{
3396	int rc;
3397
3398	if (eq->flags & EQ_ALLOCATED) {
3399		switch (eq->flags & EQ_TYPEMASK) {
3400		case EQ_CTRL:
3401			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
3402			    eq->cntxt_id);
3403			break;
3404
3405		case EQ_ETH:
3406			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
3407			    eq->cntxt_id);
3408			break;
3409
3410#ifdef TCP_OFFLOAD
3411		case EQ_OFLD:
3412			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
3413			    eq->cntxt_id);
3414			break;
3415#endif
3416
3417		default:
3418			panic("%s: invalid eq type %d.", __func__,
3419			    eq->flags & EQ_TYPEMASK);
3420		}
3421		if (rc != 0) {
3422			device_printf(sc->dev,
3423			    "failed to free egress queue (%d): %d\n",
3424			    eq->flags & EQ_TYPEMASK, rc);
3425			return (rc);
3426		}
3427		eq->flags &= ~EQ_ALLOCATED;
3428	}
3429
3430	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
3431
3432	if (mtx_initialized(&eq->eq_lock))
3433		mtx_destroy(&eq->eq_lock);
3434
3435	bzero(eq, sizeof(*eq));
3436	return (0);
3437}
3438
3439static int
3440alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq,
3441    struct sysctl_oid *oid)
3442{
3443	int rc;
3444	struct sysctl_ctx_list *ctx = pi ? &pi->ctx : &sc->ctx;
3445	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3446
3447	rc = alloc_eq(sc, pi, &wrq->eq);
3448	if (rc)
3449		return (rc);
3450
3451	wrq->adapter = sc;
3452	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
3453	TAILQ_INIT(&wrq->incomplete_wrs);
3454	STAILQ_INIT(&wrq->wr_list);
3455	wrq->nwr_pending = 0;
3456	wrq->ndesc_needed = 0;
3457
3458	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3459	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
3460	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3461	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
3462	    "consumer index");
3463	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
3464	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
3465	    "producer index");
3466	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
3467	    &wrq->tx_wrs_direct, "# of work requests (direct)");
3468	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
3469	    &wrq->tx_wrs_copied, "# of work requests (copied)");
3470
3471	return (rc);
3472}
3473
3474static int
3475free_wrq(struct adapter *sc, struct sge_wrq *wrq)
3476{
3477	int rc;
3478
3479	rc = free_eq(sc, &wrq->eq);
3480	if (rc)
3481		return (rc);
3482
3483	bzero(wrq, sizeof(*wrq));
3484	return (0);
3485}
3486
3487static int
3488alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx,
3489    struct sysctl_oid *oid)
3490{
3491	int rc;
3492	struct adapter *sc = pi->adapter;
3493	struct sge_eq *eq = &txq->eq;
3494	char name[16];
3495	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3496
3497	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
3498	    M_CXGBE, M_WAITOK);
3499	if (rc != 0) {
3500		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
3501		return (rc);
3502	}
3503
3504	rc = alloc_eq(sc, pi, eq);
3505	if (rc != 0) {
3506		mp_ring_free(txq->r);
3507		txq->r = NULL;
3508		return (rc);
3509	}
3510
3511	/* Can't fail after this point. */
3512
3513	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
3514	txq->ifp = pi->ifp;
3515	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
3516	txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3517	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf));
3518	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
3519	    M_ZERO | M_WAITOK);
3520
3521	snprintf(name, sizeof(name), "%d", idx);
3522	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3523	    NULL, "tx queue");
3524	children = SYSCTL_CHILDREN(oid);
3525
3526	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3527	    &eq->cntxt_id, 0, "SGE context id of the queue");
3528	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
3529	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
3530	    "consumer index");
3531	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx",
3532	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
3533	    "producer index");
3534
3535	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
3536	    &txq->txcsum, "# of times hardware assisted with checksum");
3537	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_insertion",
3538	    CTLFLAG_RD, &txq->vlan_insertion,
3539	    "# of times hardware inserted 802.1Q tag");
3540	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
3541	    &txq->tso_wrs, "# of TSO work requests");
3542	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
3543	    &txq->imm_wrs, "# of work requests with immediate data");
3544	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
3545	    &txq->sgl_wrs, "# of work requests with direct SGL");
3546	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
3547	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
3548	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_wrs",
3549	    CTLFLAG_RD, &txq->txpkts0_wrs,
3550	    "# of txpkts (type 0) work requests");
3551	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_wrs",
3552	    CTLFLAG_RD, &txq->txpkts1_wrs,
3553	    "# of txpkts (type 1) work requests");
3554	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_pkts",
3555	    CTLFLAG_RD, &txq->txpkts0_pkts,
3556	    "# of frames tx'd using type0 txpkts work requests");
3557	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_pkts",
3558	    CTLFLAG_RD, &txq->txpkts1_pkts,
3559	    "# of frames tx'd using type1 txpkts work requests");
3560
3561	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_enqueues",
3562	    CTLFLAG_RD, &txq->r->enqueues,
3563	    "# of enqueues to the mp_ring for this queue");
3564	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_drops",
3565	    CTLFLAG_RD, &txq->r->drops,
3566	    "# of drops in the mp_ring for this queue");
3567	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_starts",
3568	    CTLFLAG_RD, &txq->r->starts,
3569	    "# of normal consumer starts in the mp_ring for this queue");
3570	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_stalls",
3571	    CTLFLAG_RD, &txq->r->stalls,
3572	    "# of consumer stalls in the mp_ring for this queue");
3573	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_restarts",
3574	    CTLFLAG_RD, &txq->r->restarts,
3575	    "# of consumer restarts in the mp_ring for this queue");
3576	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_abdications",
3577	    CTLFLAG_RD, &txq->r->abdications,
3578	    "# of consumer abdications in the mp_ring for this queue");
3579
3580	return (0);
3581}
3582
3583static int
3584free_txq(struct port_info *pi, struct sge_txq *txq)
3585{
3586	int rc;
3587	struct adapter *sc = pi->adapter;
3588	struct sge_eq *eq = &txq->eq;
3589
3590	rc = free_eq(sc, eq);
3591	if (rc)
3592		return (rc);
3593
3594	sglist_free(txq->gl);
3595	free(txq->sdesc, M_CXGBE);
3596	mp_ring_free(txq->r);
3597
3598	bzero(txq, sizeof(*txq));
3599	return (0);
3600}
3601
3602static void
3603oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
3604{
3605	bus_addr_t *ba = arg;
3606
3607	KASSERT(nseg == 1,
3608	    ("%s meant for single segment mappings only.", __func__));
3609
3610	*ba = error ? 0 : segs->ds_addr;
3611}
3612
3613static inline void
3614ring_fl_db(struct adapter *sc, struct sge_fl *fl)
3615{
3616	uint32_t n, v;
3617
3618	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
3619	MPASS(n > 0);
3620
3621	wmb();
3622	v = fl->dbval | V_PIDX(n);
3623	if (fl->udb)
3624		*fl->udb = htole32(v);
3625	else
3626		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v);
3627	IDXINCR(fl->dbidx, n, fl->sidx);
3628}
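
/*
 * Illustrative note on the index math above (assuming IDXDIFF() returns the
 * forward distance between two indices on a ring of the given size, and that
 * freelist PIDX updates are made in units of 8 buffers, i.e. fl->pidx / 8):
 *
 *	fl->sidx = 512, fl->dbidx = 510, fl->pidx = 16 (wrapped)
 *	n = IDXDIFF(16 / 8, 510, 512) = 512 - 510 + 2 = 4
 *
 * so the doorbell advertises 4 new 8-buffer chunks and dbidx moves from 510
 * to 2.
 */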
3629
3630/*
3631 * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
3632 * recycled do not count towards this allocation budget.
3633 *
3634 * Returns non-zero to indicate that this freelist should be added to the list
3635 * of starving freelists.
3636 */
3637static int
3638refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
3639{
3640	__be64 *d;
3641	struct fl_sdesc *sd;
3642	uintptr_t pa;
3643	caddr_t cl;
3644	struct cluster_layout *cll;
3645	struct sw_zone_info *swz;
3646	struct cluster_metadata *clm;
3647	uint16_t max_pidx;
3648	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
3649
3650	FL_LOCK_ASSERT_OWNED(fl);
3651
3652	/*
3653	 * We always stop at the beginning of the hardware descriptor that's just
3654	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
3655	 * which would mean an empty freelist to the chip.
3656	 */
3657	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
3658	if (fl->pidx == max_pidx * 8)
3659		return (0);
3660
3661	d = &fl->desc[fl->pidx];
3662	sd = &fl->sdesc[fl->pidx];
3663	cll = &fl->cll_def;	/* default layout */
3664	swz = &sc->sge.sw_zone_info[cll->zidx];
3665
3666	while (n > 0) {
3667
3668		if (sd->cl != NULL) {
3669
3670			if (sd->nmbuf == 0) {
3671				/*
3672				 * Fast recycle without involving any atomics on
3673				 * the cluster's metadata (if the cluster has
3674				 * metadata).  This happens when all frames
3675				 * received in the cluster were small enough to
3676				 * fit within a single mbuf each.
3677				 */
3678				fl->cl_fast_recycled++;
3679#ifdef INVARIANTS
3680				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3681				if (clm != NULL)
3682					MPASS(clm->refcount == 1);
3683#endif
3684				goto recycled_fast;
3685			}
3686
3687			/*
3688			 * Cluster is guaranteed to have metadata.  Clusters
3689			 * without metadata always take the fast recycle path
3690			 * when they're recycled.
3691			 */
3692			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3693			MPASS(clm != NULL);
3694
3695			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3696				fl->cl_recycled++;
3697				counter_u64_add(extfree_rels, 1);
3698				goto recycled;
3699			}
3700			sd->cl = NULL;	/* gave up my reference */
3701		}
3702		MPASS(sd->cl == NULL);
3703alloc:
3704		cl = uma_zalloc(swz->zone, M_NOWAIT);
3705		if (__predict_false(cl == NULL)) {
3706			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
3707			    fl->cll_def.zidx == fl->cll_alt.zidx)
3708				break;
3709
3710			/* fall back to the safe zone */
3711			cll = &fl->cll_alt;
3712			swz = &sc->sge.sw_zone_info[cll->zidx];
3713			goto alloc;
3714		}
3715		fl->cl_allocated++;
3716		n--;
3717
3718		pa = pmap_kextract((vm_offset_t)cl);
3719		pa += cll->region1;
3720		sd->cl = cl;
3721		sd->cll = *cll;
3722		*d = htobe64(pa | cll->hwidx);
3723		clm = cl_metadata(sc, fl, cll, cl);
3724		if (clm != NULL) {
3725recycled:
3726#ifdef INVARIANTS
3727			clm->sd = sd;
3728#endif
3729			clm->refcount = 1;
3730		}
3731		sd->nmbuf = 0;
3732recycled_fast:
3733		d++;
3734		sd++;
3735		if (__predict_false(++fl->pidx % 8 == 0)) {
3736			uint16_t pidx = fl->pidx / 8;
3737
3738			if (__predict_false(pidx == fl->sidx)) {
3739				fl->pidx = 0;
3740				pidx = 0;
3741				sd = fl->sdesc;
3742				d = fl->desc;
3743			}
3744			if (pidx == max_pidx)
3745				break;
3746
3747			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
3748				ring_fl_db(sc, fl);
3749		}
3750	}
3751
3752	if (fl->pidx / 8 != fl->dbidx)
3753		ring_fl_db(sc, fl);
3754
3755	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
3756}
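
/*
 * Worked example for the stop condition in refill_fl() (a sketch; it assumes
 * 8 buffer addresses per 64-byte freelist descriptor, which is what the
 * pidx / 8 arithmetic above implies):
 *
 *	fl->sidx = 512 hardware descriptors, fl->hw_cidx = 0
 *	max_pidx = 512 - 1 = 511, so refilling stops at buffer index
 *	511 * 8 = 4088 and the descriptor just before hw_cidx stays unfilled.
 *
 * This keeps the hardware pidx from ever catching up to the hardware cidx,
 * which the chip would interpret as an empty freelist.  The doorbell is rung
 * every 4 descriptors (32 buffers) while filling, and once more at the end
 * for any remainder.
 */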
3757
3758/*
3759 * Attempt to refill all starving freelists.
3760 */
3761static void
3762refill_sfl(void *arg)
3763{
3764	struct adapter *sc = arg;
3765	struct sge_fl *fl, *fl_temp;
3766
3767	mtx_lock(&sc->sfl_lock);
3768	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
3769		FL_LOCK(fl);
3770		refill_fl(sc, fl, 64);
3771		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
3772			TAILQ_REMOVE(&sc->sfl, fl, link);
3773			fl->flags &= ~FL_STARVING;
3774		}
3775		FL_UNLOCK(fl);
3776	}
3777
3778	if (!TAILQ_EMPTY(&sc->sfl))
3779		callout_schedule(&sc->sfl_callout, hz / 5);
3780	mtx_unlock(&sc->sfl_lock);
3781}
3782
3783static int
3784alloc_fl_sdesc(struct sge_fl *fl)
3785{
3786
3787	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
3788	    M_ZERO | M_WAITOK);
3789
3790	return (0);
3791}
3792
3793static void
3794free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
3795{
3796	struct fl_sdesc *sd;
3797	struct cluster_metadata *clm;
3798	struct cluster_layout *cll;
3799	int i;
3800
3801	sd = fl->sdesc;
3802	for (i = 0; i < fl->sidx * 8; i++, sd++) {
3803		if (sd->cl == NULL)
3804			continue;
3805
3806		cll = &sd->cll;
3807		clm = cl_metadata(sc, fl, cll, sd->cl);
3808		if (sd->nmbuf == 0)
3809			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3810		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3811			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3812			counter_u64_add(extfree_rels, 1);
3813		}
3814		sd->cl = NULL;
3815	}
3816
3817	free(fl->sdesc, M_CXGBE);
3818	fl->sdesc = NULL;
3819}
3820
3821static inline void
3822get_pkt_gl(struct mbuf *m, struct sglist *gl)
3823{
3824	int rc;
3825
3826	M_ASSERTPKTHDR(m);
3827
3828	sglist_reset(gl);
3829	rc = sglist_append_mbuf(gl, m);
3830	if (__predict_false(rc != 0)) {
3831		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
3832		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
3833	}
3834
3835	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
3836	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
3837	    mbuf_nsegs(m), gl->sg_nseg));
3838	KASSERT(gl->sg_nseg > 0 &&
3839	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
3840	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
3841		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
3842}
3843
3844/*
3845 * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
3846 */
3847static inline u_int
3848txpkt_len16(u_int nsegs, u_int tso)
3849{
3850	u_int n;
3851
3852	MPASS(nsegs > 0);
3853
3854	nsegs--; /* first segment is part of ulptx_sgl */
3855	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
3856	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
3857	if (tso)
3858		n += sizeof(struct cpl_tx_pkt_lso_core);
3859
3860	return (howmany(n, 16));
3861}
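
/*
 * Worked example for txpkt_len16() (a sketch assuming the firmware work
 * request header, cpl_tx_pkt_core, cpl_tx_pkt_lso_core and ulptx_sgl are
 * 16 bytes each, as in the usual shared headers, and EQ_ESIZE is 64).
 * For nsegs = 4 and no TSO:
 *
 *	SGL bytes past the first segment: 8 * ((3 * 3) / 2 + (3 & 1)) = 40
 *	n = 16 (WR) + 16 (CPL) + 16 (ulptx_sgl) + 40 = 88
 *	len16 = howmany(88, 16) = 6
 *
 * which write_txpkt_wr() turns into howmany(6, EQ_ESIZE / 16) = 2 hardware
 * descriptors.  With TSO the LSO CPL adds 16 bytes: n = 104, len16 = 7,
 * still 2 descriptors.
 */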
3862
3863/*
3864 * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
3865 * request header.
3866 */
3867static inline u_int
3868txpkts0_len16(u_int nsegs)
3869{
3870	u_int n;
3871
3872	MPASS(nsegs > 0);
3873
3874	nsegs--; /* first segment is part of ulptx_sgl */
3875	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
3876	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
3877	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
3878
3879	return (howmany(n, 16));
3880}
3881
3882/*
3883 * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
3884 * request header.
3885 */
3886static inline u_int
3887txpkts1_len16(void)
3888{
3889	u_int n;
3890
3891	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
3892
3893	return (howmany(n, 16));
3894}
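
/*
 * For comparison (same sizing assumptions as the txpkt_len16() example): a
 * type 1 txpkts member is always howmany(16 + 16, 16) = 2 len16 units, while
 * a type 0 member with a single segment is howmany(8 + 8 + 16 + 16, 16) = 3,
 * assuming ulp_txpkt and ulptx_idata are 8 bytes each.  That is why the
 * type 1 format is preferred when every coalesced packet has exactly one
 * segment.
 */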
3895
3896static inline u_int
3897imm_payload(u_int ndesc)
3898{
3899	u_int n;
3900
3901	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
3902	    sizeof(struct cpl_tx_pkt_core);
3903
3904	return (n);
3905}
3906
3907/*
3908 * Write a txpkt WR for this packet to the hardware descriptors, update the
3909 * software descriptor, and advance the pidx.  It is guaranteed that enough
3910 * descriptors are available.
3911 *
3912 * The return value is the # of hardware descriptors used.
3913 */
3914static u_int
3915write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
3916    struct mbuf *m0, u_int available)
3917{
3918	struct sge_eq *eq = &txq->eq;
3919	struct tx_sdesc *txsd;
3920	struct cpl_tx_pkt_core *cpl;
3921	uint32_t ctrl;	/* used in many unrelated places */
3922	uint64_t ctrl1;
3923	int len16, ndesc, pktlen, nsegs;
3924	caddr_t dst;
3925
3926	TXQ_LOCK_ASSERT_OWNED(txq);
3927	M_ASSERTPKTHDR(m0);
3928	MPASS(available > 0 && available < eq->sidx);
3929
3930	len16 = mbuf_len16(m0);
3931	nsegs = mbuf_nsegs(m0);
3932	pktlen = m0->m_pkthdr.len;
3933	ctrl = sizeof(struct cpl_tx_pkt_core);
3934	if (needs_tso(m0))
3935		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
3936	else if (pktlen <= imm_payload(2) && available >= 2) {
3937		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
3938		ctrl += pktlen;
3939		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
3940		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
3941		nsegs = 0;
3942	}
3943	ndesc = howmany(len16, EQ_ESIZE / 16);
3944	MPASS(ndesc <= available);
3945
3946	/* Firmware work request header */
3947	MPASS(wr == (void *)&eq->desc[eq->pidx]);
3948	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
3949	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
3950
3951	ctrl = V_FW_WR_LEN16(len16);
3952	wr->equiq_to_len16 = htobe32(ctrl);
3953	wr->r3 = 0;
3954
3955	if (needs_tso(m0)) {
3956		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
3957
3958		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
3959		    m0->m_pkthdr.l4hlen > 0,
3960		    ("%s: mbuf %p needs TSO but missing header lengths",
3961			__func__, m0));
3962
3963		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
3964		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
3965		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
3966		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
3967			ctrl |= V_LSO_ETHHDR_LEN(1);
3968		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
3969			ctrl |= F_LSO_IPV6;
3970
3971		lso->lso_ctrl = htobe32(ctrl);
3972		lso->ipid_ofst = htobe16(0);
3973		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
3974		lso->seqno_offset = htobe32(0);
3975		lso->len = htobe32(pktlen);
3976
3977		cpl = (void *)(lso + 1);
3978
3979		txq->tso_wrs++;
3980	} else
3981		cpl = (void *)(wr + 1);
3982
3983	/* Checksum offload */
3984	ctrl1 = 0;
3985	if (needs_l3_csum(m0) == 0)
3986		ctrl1 |= F_TXPKT_IPCSUM_DIS;
3987	if (needs_l4_csum(m0) == 0)
3988		ctrl1 |= F_TXPKT_L4CSUM_DIS;
3989	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
3990	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
3991		txq->txcsum++;	/* some hardware assistance provided */
3992
3993	/* VLAN tag insertion */
3994	if (needs_vlan_insertion(m0)) {
3995		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
3996		txq->vlan_insertion++;
3997	}
3998
3999	/* CPL header */
4000	cpl->ctrl0 = txq->cpl_ctrl0;
4001	cpl->pack = 0;
4002	cpl->len = htobe16(pktlen);
4003	cpl->ctrl1 = htobe64(ctrl1);
4004
4005	/* SGL */
4006	dst = (void *)(cpl + 1);
4007	if (nsegs > 0) {
4008
4009		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
4010		txq->sgl_wrs++;
4011	} else {
4012		struct mbuf *m;
4013
4014		for (m = m0; m != NULL; m = m->m_next) {
4015			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
4016#ifdef INVARIANTS
4017			pktlen -= m->m_len;
4018#endif
4019		}
4020#ifdef INVARIANTS
4021		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
4022#endif
4023		txq->imm_wrs++;
4024	}
4025
4026	txq->txpkt_wrs++;
4027
4028	txsd = &txq->sdesc[eq->pidx];
4029	txsd->m = m0;
4030	txsd->desc_used = ndesc;
4031
4032	return (ndesc);
4033}
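
/*
 * Worked example for the immediate-data path above (a sketch assuming
 * EQ_ESIZE of 64 and 16-byte WR/CPL headers): imm_payload(2) =
 * 2 * 64 - 16 - 16 = 96, so a frame of at most 96 bytes that does not need
 * TSO, with at least 2 descriptors available, is copied straight into the
 * descriptor ring.  For a 64-byte frame:
 *
 *	len16 = howmany(16 + 16 + 64, 16) = 6  ->  2 descriptors
 *
 * and nsegs is forced to 0 so the mbuf chain is byte-copied by copy_to_txd()
 * instead of being described by an SGL.
 */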
4034
4035static int
4036try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
4037{
4038	u_int needed, nsegs1, nsegs2, l1, l2;
4039
4040	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
4041		return (1);
4042
4043	nsegs1 = mbuf_nsegs(m);
4044	nsegs2 = mbuf_nsegs(n);
4045	if (nsegs1 + nsegs2 == 2) {
4046		txp->wr_type = 1;
4047		l1 = l2 = txpkts1_len16();
4048	} else {
4049		txp->wr_type = 0;
4050		l1 = txpkts0_len16(nsegs1);
4051		l2 = txpkts0_len16(nsegs2);
4052	}
4053	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
4054	needed = howmany(txp->len16, EQ_ESIZE / 16);
4055	if (needed > SGE_MAX_WR_NDESC || needed > available)
4056		return (1);
4057
4058	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
4059	if (txp->plen > 65535)
4060		return (1);
4061
4062	txp->npkt = 2;
4063	set_mbuf_len16(m, l1);
4064	set_mbuf_len16(n, l2);
4065
4066	return (0);
4067}
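
/*
 * Worked example for try_txpkts(), under the same sizing assumptions as the
 * txpkt_len16() example plus a 16-byte fw_eth_tx_pkts_wr header.  Coalescing
 * two single-segment packets selects wr_type 1 and:
 *
 *	txp->len16 = howmany(16, 16) + 2 + 2 = 5
 *	needed     = howmany(5, EQ_ESIZE / 16) = 2 descriptors
 *
 * which is accepted as long as at least 2 descriptors are available and the
 * combined payload is at most 65535 bytes.
 */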
4068
4069static int
4070add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
4071{
4072	u_int plen, len16, needed, nsegs;
4073
4074	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
4075
4076	nsegs = mbuf_nsegs(m);
4077	if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1))
4078		return (1);
4079
4080	plen = txp->plen + m->m_pkthdr.len;
4081	if (plen > 65535)
4082		return (1);
4083
4084	if (txp->wr_type == 0)
4085		len16 = txpkts0_len16(nsegs);
4086	else
4087		len16 = txpkts1_len16();
4088	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
4089	if (needed > SGE_MAX_WR_NDESC || needed > available)
4090		return (1);
4091
4092	txp->npkt++;
4093	txp->plen = plen;
4094	txp->len16 += len16;
4095	set_mbuf_len16(m, len16);
4096
4097	return (0);
4098}
4099
4100/*
4101 * Write a txpkts WR for the packets in txp to the hardware descriptors, update
4102 * the software descriptor, and advance the pidx.  It is guaranteed that enough
4103 * descriptors are available.
4104 *
4105 * The return value is the # of hardware descriptors used.
4106 */
4107static u_int
4108write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
4109    struct mbuf *m0, const struct txpkts *txp, u_int available)
4110{
4111	struct sge_eq *eq = &txq->eq;
4112	struct tx_sdesc *txsd;
4113	struct cpl_tx_pkt_core *cpl;
4114	uint32_t ctrl;
4115	uint64_t ctrl1;
4116	int ndesc, checkwrap;
4117	struct mbuf *m;
4118	void *flitp;
4119
4120	TXQ_LOCK_ASSERT_OWNED(txq);
4121	MPASS(txp->npkt > 0);
4122	MPASS(txp->plen < 65536);
4123	MPASS(m0 != NULL);
4124	MPASS(m0->m_nextpkt != NULL);
4125	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
4126	MPASS(available > 0 && available < eq->sidx);
4127
4128	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
4129	MPASS(ndesc <= available);
4130
4131	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4132	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
4133	ctrl = V_FW_WR_LEN16(txp->len16);
4134	wr->equiq_to_len16 = htobe32(ctrl);
4135	wr->plen = htobe16(txp->plen);
4136	wr->npkt = txp->npkt;
4137	wr->r3 = 0;
4138	wr->type = txp->wr_type;
4139	flitp = wr + 1;
4140
4141	/*
4142	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
4143	 * set then we know the WR is going to wrap around somewhere.  We'll
4144	 * check for that at appropriate points.
4145	 */
4146	checkwrap = eq->sidx - ndesc < eq->pidx;
4147	for (m = m0; m != NULL; m = m->m_nextpkt) {
4148		if (txp->wr_type == 0) {
4149			struct ulp_txpkt *ulpmc;
4150			struct ulptx_idata *ulpsc;
4151
4152			/* ULP master command */
4153			ulpmc = flitp;
4154			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
4155			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
4156			ulpmc->len = htobe32(mbuf_len16(m));
4157
4158			/* ULP subcommand */
4159			ulpsc = (void *)(ulpmc + 1);
4160			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
4161			    F_ULP_TX_SC_MORE);
4162			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
4163
4164			cpl = (void *)(ulpsc + 1);
4165			if (checkwrap &&
4166			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
4167				cpl = (void *)&eq->desc[0];
4168			txq->txpkts0_pkts += txp->npkt;
4169			txq->txpkts0_wrs++;
4170		} else {
4171			cpl = flitp;
4172			txq->txpkts1_pkts += txp->npkt;
4173			txq->txpkts1_wrs++;
4174		}
4175
4176		/* Checksum offload */
4177		ctrl1 = 0;
4178		if (needs_l3_csum(m) == 0)
4179			ctrl1 |= F_TXPKT_IPCSUM_DIS;
4180		if (needs_l4_csum(m) == 0)
4181			ctrl1 |= F_TXPKT_L4CSUM_DIS;
4182		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4183		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4184			txq->txcsum++;	/* some hardware assistance provided */
4185
4186		/* VLAN tag insertion */
4187		if (needs_vlan_insertion(m)) {
4188			ctrl1 |= F_TXPKT_VLAN_VLD |
4189			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
4190			txq->vlan_insertion++;
4191		}
4192
4193		/* CPL header */
4194		cpl->ctrl0 = txq->cpl_ctrl0;
4195		cpl->pack = 0;
4196		cpl->len = htobe16(m->m_pkthdr.len);
4197		cpl->ctrl1 = htobe64(ctrl1);
4198
4199		flitp = cpl + 1;
4200		if (checkwrap &&
4201		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
4202			flitp = (void *)&eq->desc[0];
4203
4204		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
4205
4206	}
4207
4208	txsd = &txq->sdesc[eq->pidx];
4209	txsd->m = m0;
4210	txsd->desc_used = ndesc;
4211
4212	return (ndesc);
4213}
4214
4215/*
4216 * If the SGL ends on an address that is not 16-byte aligned, this function
4217 * adds a zero-filled pad flit at the end.
4218 */
4219static void
4220write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
4221{
4222	struct sge_eq *eq = &txq->eq;
4223	struct sglist *gl = txq->gl;
4224	struct sglist_seg *seg;
4225	__be64 *flitp, *wrap;
4226	struct ulptx_sgl *usgl;
4227	int i, nflits, nsegs;
4228
4229	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
4230	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
4231	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4232	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4233
4234	get_pkt_gl(m, gl);
4235	nsegs = gl->sg_nseg;
4236	MPASS(nsegs > 0);
4237
4238	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
4239	flitp = (__be64 *)(*to);
4240	wrap = (__be64 *)(&eq->desc[eq->sidx]);
4241	seg = &gl->sg_segs[0];
4242	usgl = (void *)flitp;
4243
4244	/*
4245	 * We start at a 16 byte boundary somewhere inside the tx descriptor
4246	 * ring, so we're at least 16 bytes away from the status page.  There is
4247	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
4248	 */
4249
4250	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
4251	    V_ULPTX_NSGE(nsegs));
4252	usgl->len0 = htobe32(seg->ss_len);
4253	usgl->addr0 = htobe64(seg->ss_paddr);
4254	seg++;
4255
4256	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
4257
4258		/* Won't wrap around at all */
4259
4260		for (i = 0; i < nsegs - 1; i++, seg++) {
4261			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
4262			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
4263		}
4264		if (i & 1)
4265			usgl->sge[i / 2].len[1] = htobe32(0);
4266		flitp += nflits;
4267	} else {
4268
4269		/* Will wrap somewhere in the rest of the SGL */
4270
4271		/* 2 flits already written, write the rest flit by flit */
4272		flitp = (void *)(usgl + 1);
4273		for (i = 0; i < nflits - 2; i++) {
4274			if (flitp == wrap)
4275				flitp = (void *)eq->desc;
4276			*flitp++ = get_flit(seg, nsegs - 1, i);
4277		}
4278	}
4279
4280	if (nflits & 1) {
4281		MPASS(((uintptr_t)flitp) & 0xf);
4282		*flitp++ = 0;
4283	}
4284
4285	MPASS((((uintptr_t)flitp) & 0xf) == 0);
4286	if (__predict_false(flitp == wrap))
4287		*to = (void *)eq->desc;
4288	else
4289		*to = (void *)flitp;
4290}
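
/*
 * Worked example for the flit count above: the ulptx_sgl header carries the
 * first segment (cmd/nsge, len0, addr0 = 2 flits) and every later pair of
 * segments takes 3 flits (one flit holding both 32-bit lengths plus two
 * 64-bit addresses); an odd trailing segment takes 2 flits with its unused
 * length half zeroed.  For nsegs = 5:
 *
 *	nflits = (3 * 4) / 2 + (4 & 1) + 2 = 6 + 0 + 2 = 8 flits = 64 bytes
 *
 * and because nflits is even no zero pad flit is appended.
 */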
4291
4292static inline void
4293copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
4294{
4295
4296	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4297	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4298
4299	if (__predict_true((uintptr_t)(*to) + len <=
4300	    (uintptr_t)&eq->desc[eq->sidx])) {
4301		bcopy(from, *to, len);
4302		(*to) += len;
4303	} else {
4304		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
4305
4306		bcopy(from, *to, portion);
4307		from += portion;
4308		portion = len - portion;	/* remaining */
4309		bcopy(from, (void *)eq->desc, portion);
4310		(*to) = (caddr_t)eq->desc + portion;
4311	}
4312}
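
/*
 * Example of the wrap-around copy above: with 40 bytes left between *to and
 * &eq->desc[eq->sidx], copying a 100-byte chunk places the first 40 bytes at
 * the end of the ring, the remaining 60 at eq->desc[0], and leaves *to at
 * (caddr_t)eq->desc + 60.
 */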
4313
4314static inline void
4315ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
4316{
4317	u_int db;
4318
4319	MPASS(n > 0);
4320
4321	db = eq->doorbells;
4322	if (n > 1)
4323		clrbit(&db, DOORBELL_WCWR);
4324	wmb();
4325
4326	switch (ffs(db) - 1) {
4327	case DOORBELL_UDB:
4328		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4329		break;
4330
4331	case DOORBELL_WCWR: {
4332		volatile uint64_t *dst, *src;
4333		int i;
4334
4335		/*
4336		 * Queues whose 128B doorbell segment fits in the page do not
4337		 * use relative qid (udb_qid is always 0).  Only queues with
4338		 * doorbell segments can do WCWR.
4339		 */
4340		KASSERT(eq->udb_qid == 0 && n == 1,
4341		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
4342		    __func__, eq->doorbells, n, eq->dbidx, eq));
4343
4344		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
4345		    UDBS_DB_OFFSET);
4346		i = eq->dbidx;
4347		src = (void *)&eq->desc[i];
4348		while (src != (void *)&eq->desc[i + 1])
4349			*dst++ = *src++;
4350		wmb();
4351		break;
4352	}
4353
4354	case DOORBELL_UDBWC:
4355		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4356		wmb();
4357		break;
4358
4359	case DOORBELL_KDB:
4360		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
4361		    V_QID(eq->cntxt_id) | V_PIDX(n));
4362		break;
4363	}
4364
4365	IDXINCR(eq->dbidx, n, eq->sidx);
4366}
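
/*
 * Note on the doorbell selection above: ffs(db) - 1 picks the lowest-numbered
 * mechanism still set in the local copy of eq->doorbells.  For example, a
 * queue that supports WCWR uses the write-combined descriptor write only for
 * a single-descriptor update (n == 1); for anything larger WCWR is cleared
 * up front and the queue falls back to one of its other doorbell styles.
 */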
4367
4368static inline u_int
4369reclaimable_tx_desc(struct sge_eq *eq)
4370{
4371	uint16_t hw_cidx;
4372
4373	hw_cidx = read_hw_cidx(eq);
4374	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
4375}
4376
4377static inline u_int
4378total_available_tx_desc(struct sge_eq *eq)
4379{
4380	uint16_t hw_cidx, pidx;
4381
4382	hw_cidx = read_hw_cidx(eq);
4383	pidx = eq->pidx;
4384
4385	if (pidx == hw_cidx)
4386		return (eq->sidx - 1);
4387	else
4388		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
4389}
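
/*
 * A minimal sketch of the ring arithmetic used above; ring_dist() below is a
 * hypothetical stand-in for IDXDIFF(), assumed to return the forward distance
 * between two indices on a ring of the given size.  With eq->sidx = 1024,
 * eq->cidx = 20, eq->pidx = 100 and a hardware cidx of 40: reclaimable =
 * ring_dist(40, 20, 1024) = 20 descriptors whose mbufs can now be freed, and
 * total available = ring_dist(40, 100, 1024) - 1 = 963.  When pidx == hw cidx
 * the ring is idle and sidx - 1 descriptors are available; one slot always
 * stays unused so that full and empty remain distinguishable.
 */
#if 0	/* illustration only, not compiled */
static u_int
ring_dist(u_int head, u_int tail, u_int size)
{

	return (head >= tail ? head - tail : size - tail + head);
}
#endif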
4390
4391static inline uint16_t
4392read_hw_cidx(struct sge_eq *eq)
4393{
4394	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
4395	uint16_t cidx = spg->cidx;	/* stable snapshot */
4396
4397	return (be16toh(cidx));
4398}
4399
4400/*
4401 * Reclaim approximately 'n' descriptors.
4402 */
4403static u_int
4404reclaim_tx_descs(struct sge_txq *txq, u_int n)
4405{
4406	struct tx_sdesc *txsd;
4407	struct sge_eq *eq = &txq->eq;
4408	u_int can_reclaim, reclaimed;
4409
4410	TXQ_LOCK_ASSERT_OWNED(txq);
4411	MPASS(n > 0);
4412
4413	reclaimed = 0;
4414	can_reclaim = reclaimable_tx_desc(eq);
4415	while (can_reclaim && reclaimed < n) {
4416		int ndesc;
4417		struct mbuf *m, *nextpkt;
4418
4419		txsd = &txq->sdesc[eq->cidx];
4420		ndesc = txsd->desc_used;
4421
4422		/* Firmware doesn't return "partial" credits. */
4423		KASSERT(can_reclaim >= ndesc,
4424		    ("%s: unexpected number of credits: %d, %d",
4425		    __func__, can_reclaim, ndesc));
4426
4427		for (m = txsd->m; m != NULL; m = nextpkt) {
4428			nextpkt = m->m_nextpkt;
4429			m->m_nextpkt = NULL;
4430			m_freem(m);
4431		}
4432		reclaimed += ndesc;
4433		can_reclaim -= ndesc;
4434		IDXINCR(eq->cidx, ndesc, eq->sidx);
4435	}
4436
4437	return (reclaimed);
4438}
4439
4440static void
4441tx_reclaim(void *arg, int n)
4442{
4443	struct sge_txq *txq = arg;
4444	struct sge_eq *eq = &txq->eq;
4445
4446	do {
4447		if (TXQ_TRYLOCK(txq) == 0)
4448			break;
4449		n = reclaim_tx_descs(txq, 32);
4450		if (eq->cidx == eq->pidx)
4451			eq->equeqidx = eq->pidx;
4452		TXQ_UNLOCK(txq);
4453	} while (n > 0);
4454}
4455
4456static __be64
4457get_flit(struct sglist_seg *segs, int nsegs, int idx)
4458{
4459	int i = (idx / 3) * 2;
4460
4461	switch (idx % 3) {
4462	case 0: {
4463		__be64 rc;
4464
4465		rc = htobe32(segs[i].ss_len);
4466		if (i + 1 < nsegs)
4467			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;
4468
4469		return (rc);
4470	}
4471	case 1:
4472		return (htobe64(segs[i].ss_paddr));
4473	case 2:
4474		return (htobe64(segs[i + 1].ss_paddr));
4475	}
4476
4477	return (0);
4478}
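
/*
 * Example of the flit layout get_flit() reproduces (the segments here are the
 * ones after the first, which already lives in the ulptx_sgl header).  For 4
 * remaining segments the flits come out as:
 *
 *	idx 0: len(seg0) | len(seg1)	idx 3: len(seg2) | len(seg3)
 *	idx 1: addr(seg0)		idx 4: addr(seg2)
 *	idx 2: addr(seg1)		idx 5: addr(seg3)
 *
 * i.e. 3 flits per pair of segments, matching the nflits computation in
 * write_gl_to_txd().
 */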
4479
4480static void
4481find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
4482{
4483	int8_t zidx, hwidx, idx;
4484	uint16_t region1, region3;
4485	int spare, spare_needed, n;
4486	struct sw_zone_info *swz;
4487	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
4488
4489	/*
4490	 * Buffer Packing: Look for a PAGE_SIZE or larger zone which has a bufsize
4491	 * large enough for the max payload and cluster metadata.  Otherwise
4492	 * settle for the largest bufsize that leaves enough room in the cluster
4493	 * for metadata.
4494	 *
4495	 * Without buffer packing: Look for the smallest zone which has a
4496	 * bufsize large enough for the max payload.  Settle for the largest
4497	 * bufsize available if there's nothing big enough for max payload.
4498	 */
4499	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
4500	swz = &sc->sge.sw_zone_info[0];
4501	hwidx = -1;
4502	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
4503		if (swz->size > largest_rx_cluster) {
4504			if (__predict_true(hwidx != -1))
4505				break;
4506
4507			/*
4508			 * This is a misconfiguration.  largest_rx_cluster is
4509			 * preventing us from finding a refill source.  See
4510			 * dev.t5nex.<n>.buffer_sizes to figure out why.
4511			 */
4512			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
4513			    " refill source for fl %p (dma %u).  Ignored.\n",
4514			    largest_rx_cluster, fl, maxp);
4515		}
4516		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
4517			hwb = &hwb_list[idx];
4518			spare = swz->size - hwb->size;
4519			if (spare < spare_needed)
4520				continue;
4521
4522			hwidx = idx;		/* best option so far */
4523			if (hwb->size >= maxp) {
4524
4525				if ((fl->flags & FL_BUF_PACKING) == 0)
4526					goto done; /* stop looking (not packing) */
4527
4528				if (swz->size >= safest_rx_cluster)
4529					goto done; /* stop looking (packing) */
4530			}
4531			break;		/* keep looking, next zone */
4532		}
4533	}
4534done:
4535	/* A usable hwidx has been located. */
4536	MPASS(hwidx != -1);
4537	hwb = &hwb_list[hwidx];
4538	zidx = hwb->zidx;
4539	swz = &sc->sge.sw_zone_info[zidx];
4540	region1 = 0;
4541	region3 = swz->size - hwb->size;
4542
4543	/*
4544	 * Stay within this zone and see if there is a better match when mbuf
4545	 * inlining is allowed.  Remember that the hwidx's are sorted in
4546	 * decreasing order of size (so in increasing order of spare area).
4547	 */
4548	for (idx = hwidx; idx != -1; idx = hwb->next) {
4549		hwb = &hwb_list[idx];
4550		spare = swz->size - hwb->size;
4551
4552		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
4553			break;
4554
4555		/*
4556		 * Do not inline mbufs if doing so would violate the pad/pack
4557		 * boundary alignment requirement.
4558		 */
4559		if (fl_pad && (MSIZE % sc->sge.pad_boundary) != 0)
4560			continue;
4561		if (fl->flags & FL_BUF_PACKING &&
4562		    (MSIZE % sc->sge.pack_boundary) != 0)
4563			continue;
4564
4565		if (spare < CL_METADATA_SIZE + MSIZE)
4566			continue;
4567		n = (spare - CL_METADATA_SIZE) / MSIZE;
4568		if (n > howmany(hwb->size, maxp))
4569			break;
4570
4571		hwidx = idx;
4572		if (fl->flags & FL_BUF_PACKING) {
4573			region1 = n * MSIZE;
4574			region3 = spare - region1;
4575		} else {
4576			region1 = MSIZE;
4577			region3 = spare - region1;
4578			break;
4579		}
4580	}
4581
4582	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
4583	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
4584	KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES,
4585	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
4586	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
4587	    sc->sge.sw_zone_info[zidx].size,
4588	    ("%s: bad buffer layout for fl %p, maxp %d. "
4589		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4590		sc->sge.sw_zone_info[zidx].size, region1,
4591		sc->sge.hw_buf_info[hwidx].size, region3));
4592	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
4593		KASSERT(region3 >= CL_METADATA_SIZE,
4594		    ("%s: no room for metadata.  fl %p, maxp %d; "
4595		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4596		    sc->sge.sw_zone_info[zidx].size, region1,
4597		    sc->sge.hw_buf_info[hwidx].size, region3));
4598		KASSERT(region1 % MSIZE == 0,
4599		    ("%s: bad mbuf region for fl %p, maxp %d. "
4600		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4601		    sc->sge.sw_zone_info[zidx].size, region1,
4602		    sc->sge.hw_buf_info[hwidx].size, region3));
4603	}
4604
4605	fl->cll_def.zidx = zidx;
4606	fl->cll_def.hwidx = hwidx;
4607	fl->cll_def.region1 = region1;
4608	fl->cll_def.region3 = region3;
4609}
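
/*
 * The resulting default cluster layout can be pictured as follows (an
 * informal sketch of the invariants asserted above):
 *
 *	|<---------------- sw_zone_info[zidx].size ---------------->|
 *	| region1: inline mbufs | payload (hw buffer size) | region3 |
 *
 * region1 is always a multiple of MSIZE (mbufs are carved out of the cluster
 * itself when inlining is allowed, and refill_fl() adds region1 to the DMA
 * address handed to the chip), while region3 is the spare tail that must be
 * able to hold the cluster metadata whenever packing or inlining is in use.
 * In the simplest case, where the chosen hardware buffer size equals the
 * zone size, region1 and region3 are both 0 and the whole cluster is payload.
 */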
4610
4611static void
4612find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
4613{
4614	struct sge *s = &sc->sge;
4615	struct hw_buf_info *hwb;
4616	struct sw_zone_info *swz;
4617	int spare;
4618	int8_t hwidx;
4619
4620	if (fl->flags & FL_BUF_PACKING)
4621		hwidx = s->safe_hwidx2;	/* with room for metadata */
4622	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
4623		hwidx = s->safe_hwidx2;
4624		hwb = &s->hw_buf_info[hwidx];
4625		swz = &s->sw_zone_info[hwb->zidx];
4626		spare = swz->size - hwb->size;
4627
4628		/* no good if there isn't room for an mbuf as well */
4629		if (spare < CL_METADATA_SIZE + MSIZE)
4630			hwidx = s->safe_hwidx1;
4631	} else
4632		hwidx = s->safe_hwidx1;
4633
4634	if (hwidx == -1) {
4635		/* No fallback source */
4636		fl->cll_alt.hwidx = -1;
4637		fl->cll_alt.zidx = -1;
4638
4639		return;
4640	}
4641
4642	hwb = &s->hw_buf_info[hwidx];
4643	swz = &s->sw_zone_info[hwb->zidx];
4644	spare = swz->size - hwb->size;
4645	fl->cll_alt.hwidx = hwidx;
4646	fl->cll_alt.zidx = hwb->zidx;
4647	if (allow_mbufs_in_cluster &&
4648	    (fl_pad == 0 || (MSIZE % sc->sge.pad_boundary) == 0))
4649		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
4650	else
4651		fl->cll_alt.region1 = 0;
4652	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
4653}
4654
4655static void
4656add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
4657{
4658	mtx_lock(&sc->sfl_lock);
4659	FL_LOCK(fl);
4660	if ((fl->flags & FL_DOOMED) == 0) {
4661		fl->flags |= FL_STARVING;
4662		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
4663		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
4664	}
4665	FL_UNLOCK(fl);
4666	mtx_unlock(&sc->sfl_lock);
4667}
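
/*
 * Note on the starvation machinery (a summary of the code above, with hz
 * assumed to be 1000): when refill_fl() cannot allocate and the freelist is
 * running low, the list is marked FL_STARVING and queued on sc->sfl here;
 * refill_sfl() then retries every hz / 5 ticks (200 ms) until the freelist
 * is no longer running low or is being torn down (FL_DOOMED).
 */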
4668
4669static void
4670handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
4671{
4672	struct sge_wrq *wrq = (void *)eq;
4673
4674	atomic_readandclear_int(&eq->equiq);
4675	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
4676}
4677
4678static void
4679handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
4680{
4681	struct sge_txq *txq = (void *)eq;
4682
4683	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
4684
4685	atomic_readandclear_int(&eq->equiq);
4686	mp_ring_check_drainage(txq->r, 0);
4687	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
4688}
4689
4690static int
4691handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
4692    struct mbuf *m)
4693{
4694	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
4695	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
4696	struct adapter *sc = iq->adapter;
4697	struct sge *s = &sc->sge;
4698	struct sge_eq *eq;
4699	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
4700		&handle_wrq_egr_update, &handle_eth_egr_update,
4701		&handle_wrq_egr_update};
4702
4703	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
4704	    rss->opcode));
4705
4706	eq = s->eqmap[qid - s->eq_start];
4707	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
4708
4709	return (0);
4710}
4711
4712/* handle_fw_msg works for both fw4_msg and fw6_msg because this assertion holds: */
4713CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
4714    offsetof(struct cpl_fw6_msg, data));
4715
4716static int
4717handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
4718{
4719	struct adapter *sc = iq->adapter;
4720	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
4721
4722	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
4723	    rss->opcode));
4724
4725	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
4726		const struct rss_header *rss2;
4727
4728		rss2 = (const struct rss_header *)&cpl->data[0];
4729		return (sc->cpl_handler[rss2->opcode](iq, rss2, m));
4730	}
4731
4732	return (sc->fw_msg_handler[cpl->type](sc, &cpl->data[0]));
4733}
4734
4735static int
4736sysctl_uint16(SYSCTL_HANDLER_ARGS)
4737{
4738	uint16_t *id = arg1;
4739	int i = *id;
4740
4741	return (sysctl_handle_int(oidp, &i, 0, req));
4742}
4743
4744static int
4745sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
4746{
4747	struct sge *s = arg1;
4748	struct hw_buf_info *hwb = &s->hw_buf_info[0];
4749	struct sw_zone_info *swz = &s->sw_zone_info[0];
4750	int i, rc;
4751	struct sbuf sb;
4752	char c;
4753
4754	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
4755	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
4756		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
4757			c = '*';
4758		else
4759			c = '\0';
4760
4761		sbuf_printf(&sb, "%u%c ", hwb->size, c);
4762	}
4763	sbuf_trim(&sb);
4764	sbuf_finish(&sb);
4765	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
4766	sbuf_delete(&sb);
4767	return (rc);
4768}
4769