t4_sge.c revision 309560
1/*-
2 * Copyright (c) 2011 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/t4_sge.c 309560 2016-12-05 20:43:25Z jhb $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#include <sys/types.h>
35#include <sys/mbuf.h>
36#include <sys/socket.h>
37#include <sys/kernel.h>
38#include <sys/malloc.h>
39#include <sys/queue.h>
40#include <sys/sbuf.h>
41#include <sys/taskqueue.h>
42#include <sys/time.h>
43#include <sys/sglist.h>
44#include <sys/sysctl.h>
45#include <sys/smp.h>
46#include <sys/counter.h>
47#include <net/bpf.h>
48#include <net/ethernet.h>
49#include <net/if.h>
50#include <net/if_vlan_var.h>
51#include <netinet/in.h>
52#include <netinet/ip.h>
53#include <netinet/ip6.h>
54#include <netinet/tcp.h>
55#include <machine/in_cksum.h>
56#include <machine/md_var.h>
57#include <vm/vm.h>
58#include <vm/pmap.h>
59#ifdef DEV_NETMAP
60#include <machine/bus.h>
61#include <sys/selinfo.h>
62#include <net/if_var.h>
63#include <net/netmap.h>
64#include <dev/netmap/netmap_kern.h>
65#endif
66
67#include "common/common.h"
68#include "common/t4_regs.h"
69#include "common/t4_regs_values.h"
70#include "common/t4_msg.h"
71#include "t4_l2t.h"
72#include "t4_mp_ring.h"
73
74#ifdef T4_PKT_TIMESTAMP
75#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
76#else
77#define RX_COPY_THRESHOLD MINCLSIZE
78#endif
79
80/*
81 * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
82 * 0-7 are valid values.
83 */
84static int fl_pktshift = 2;
85TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
86
87/*
88 * Pad ethernet payload up to this boundary.
89 * -1: driver should figure out a good value.
90 *  0: disable padding.
91 *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
92 */
93int fl_pad = -1;
94TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
95
96/*
97 * Status page length.
98 * -1: driver should figure out a good value.
99 *  64 or 128 are the only other valid values.
100 */
101static int spg_len = -1;
102TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
103
104/*
105 * Congestion drops.
106 * -1: no congestion feedback (not recommended).
107 *  0: backpressure the channel instead of dropping packets right away.
108 *  1: no backpressure, drop packets for the congested queue immediately.
109 */
110static int cong_drop = 0;
111TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
112
113/*
114 * Deliver multiple frames in the same free list buffer if they fit.
115 * -1: let the driver decide whether to enable buffer packing or not.
116 *  0: disable buffer packing.
117 *  1: enable buffer packing.
118 */
119static int buffer_packing = -1;
120TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
121
122/*
123 * Start next frame in a packed buffer at this boundary.
124 * -1: driver should figure out a good value.
125 * T4: driver will ignore this and use the same value as fl_pad above.
126 * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
127 */
128static int fl_pack = -1;
129TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
130
131/*
132 * Allow the driver to create mbuf(s) in a cluster allocated for rx.
133 * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
134 * 1: ok to create mbuf(s) within a cluster if there is room.
135 */
136static int allow_mbufs_in_cluster = 1;
137TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
138
139/*
140 * Largest rx cluster size that the driver is allowed to allocate.
141 */
142static int largest_rx_cluster = MJUM16BYTES;
143TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
144
145/*
146 * Size of cluster allocation that's most likely to succeed.  The driver will
147 * fall back to this size if it fails to allocate clusters larger than this.
148 */
149static int safest_rx_cluster = PAGE_SIZE;
150TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
151
152struct txpkts {
153	u_int wr_type;		/* type 0 or type 1 */
154	u_int npkt;		/* # of packets in this work request */
155	u_int plen;		/* total payload (sum of all packets) */
156	u_int len16;		/* # of 16B pieces used by this work request */
157};
158
159/* A packet's SGL.  This + m_pkthdr has all info needed for tx */
160struct sgl {
161	struct sglist sg;
162	struct sglist_seg seg[TX_SGL_SEGS];
163};
164
165static int service_iq(struct sge_iq *, int);
166static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
167static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
168static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
169static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
170static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
171    uint16_t, char *);
172static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
173    bus_addr_t *, void **);
174static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
175    void *);
176static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
177    int, int);
178static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
179static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
180    struct sge_fl *);
181static int alloc_fwq(struct adapter *);
182static int free_fwq(struct adapter *);
183static int alloc_mgmtq(struct adapter *);
184static int free_mgmtq(struct adapter *);
185static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
186    struct sysctl_oid *);
187static int free_rxq(struct vi_info *, struct sge_rxq *);
188#ifdef TCP_OFFLOAD
189static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
190    struct sysctl_oid *);
191static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
192#endif
193#ifdef DEV_NETMAP
194static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
195    struct sysctl_oid *);
196static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
197static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
198    struct sysctl_oid *);
199static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
200#endif
201static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
202static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
203#ifdef TCP_OFFLOAD
204static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
205#endif
206static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
207static int free_eq(struct adapter *, struct sge_eq *);
208static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
209    struct sysctl_oid *);
210static int free_wrq(struct adapter *, struct sge_wrq *);
211static int alloc_txq(struct vi_info *, struct sge_txq *, int,
212    struct sysctl_oid *);
213static int free_txq(struct vi_info *, struct sge_txq *);
214static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
215static inline void ring_fl_db(struct adapter *, struct sge_fl *);
216static int refill_fl(struct adapter *, struct sge_fl *, int);
217static void refill_sfl(void *);
218static int alloc_fl_sdesc(struct sge_fl *);
219static void free_fl_sdesc(struct adapter *, struct sge_fl *);
220static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
221static void find_safe_refill_source(struct adapter *, struct sge_fl *);
222static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
223
224static inline void get_pkt_gl(struct mbuf *, struct sglist *);
225static inline u_int txpkt_len16(u_int, u_int);
226static inline u_int txpkt_vm_len16(u_int, u_int);
227static inline u_int txpkts0_len16(u_int);
228static inline u_int txpkts1_len16(void);
229static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
230    struct mbuf *, u_int);
231static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
232    struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
233static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
234static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
235static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
236    struct mbuf *, const struct txpkts *, u_int);
237static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
238static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
239static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
240static inline uint16_t read_hw_cidx(struct sge_eq *);
241static inline u_int reclaimable_tx_desc(struct sge_eq *);
242static inline u_int total_available_tx_desc(struct sge_eq *);
243static u_int reclaim_tx_descs(struct sge_txq *, u_int);
244static void tx_reclaim(void *, int);
245static __be64 get_flit(struct sglist_seg *, int, int);
246static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
247    struct mbuf *);
248static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
249    struct mbuf *);
250static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
251static void wrq_tx_drain(void *, int);
252static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
253
254static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
255static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
256static int sysctl_tc(SYSCTL_HANDLER_ARGS);
257
258static counter_u64_t extfree_refs;
259static counter_u64_t extfree_rels;
260
261an_handler_t t4_an_handler;
262fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
263cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
264
265
266static int
267an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl)
268{
269
270#ifdef INVARIANTS
271	panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
272#else
273	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n",
274	    __func__, iq, ctrl);
275#endif
276	return (EDOOFUS);
277}
278
279int
280t4_register_an_handler(an_handler_t h)
281{
282	uintptr_t *loc, new;
283
284	new = h ? (uintptr_t)h : (uintptr_t)an_not_handled;
285	loc = (uintptr_t *) &t4_an_handler;
286	atomic_store_rel_ptr(loc, new);
287
288	return (0);
289}
290
291static int
292fw_msg_not_handled(struct adapter *sc, const __be64 *rpl)
293{
294	const struct cpl_fw6_msg *cpl =
295	    __containerof(rpl, struct cpl_fw6_msg, data[0]);
296
297#ifdef INVARIANTS
298	panic("%s: fw_msg type %d", __func__, cpl->type);
299#else
300	log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type);
301#endif
302	return (EDOOFUS);
303}
304
305int
306t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
307{
308	uintptr_t *loc, new;
309
310	if (type >= nitems(t4_fw_msg_handler))
311		return (EINVAL);
312
313	/*
314	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
315	 * handler dispatch table.  Reject any attempt to install a handler for
316	 * this subtype.
317	 */
318	if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL)
319		return (EINVAL);
320
321	new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled;
322	loc = (uintptr_t *) &t4_fw_msg_handler[type];
323	atomic_store_rel_ptr(loc, new);
324
325	return (0);
326}
327
328static int
329cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
330{
331
332#ifdef INVARIANTS
333	panic("%s: opcode 0x%02x on iq %p with payload %p",
334	    __func__, rss->opcode, iq, m);
335#else
336	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n",
337	    __func__, rss->opcode, iq, m);
338	m_freem(m);
339#endif
340	return (EDOOFUS);
341}
342
343int
344t4_register_cpl_handler(int opcode, cpl_handler_t h)
345{
346	uintptr_t *loc, new;
347
348	if (opcode >= nitems(t4_cpl_handler))
349		return (EINVAL);
350
351	new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled;
352	loc = (uintptr_t *) &t4_cpl_handler[opcode];
353	atomic_store_rel_ptr(loc, new);
354
355	return (0);
356}
357
358/*
359 * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
360 */
361void
362t4_sge_modload(void)
363{
364	int i;
365
366	if (fl_pktshift < 0 || fl_pktshift > 7) {
367		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
368		    " using 2 instead.\n", fl_pktshift);
369		fl_pktshift = 2;
370	}
371
372	if (spg_len != 64 && spg_len != 128) {
373		int len;
374
375#if defined(__i386__) || defined(__amd64__)
376		len = cpu_clflush_line_size > 64 ? 128 : 64;
377#else
378		len = 64;
379#endif
380		if (spg_len != -1) {
381			printf("Invalid hw.cxgbe.spg_len value (%d),"
382			    " using %d instead.\n", spg_len, len);
383		}
384		spg_len = len;
385	}
386
387	if (cong_drop < -1 || cong_drop > 1) {
388		printf("Invalid hw.cxgbe.cong_drop value (%d),"
389		    " using 0 instead.\n", cong_drop);
390		cong_drop = 0;
391	}
392
393	extfree_refs = counter_u64_alloc(M_WAITOK);
394	extfree_rels = counter_u64_alloc(M_WAITOK);
395	counter_u64_zero(extfree_refs);
396	counter_u64_zero(extfree_rels);
397
398	t4_an_handler = an_not_handled;
399	for (i = 0; i < nitems(t4_fw_msg_handler); i++)
400		t4_fw_msg_handler[i] = fw_msg_not_handled;
401	for (i = 0; i < nitems(t4_cpl_handler); i++)
402		t4_cpl_handler[i] = cpl_not_handled;
403
404	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
405	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
406	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
407	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
408	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
409	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
410}
411
412void
413t4_sge_modunload(void)
414{
415
416	counter_u64_free(extfree_refs);
417	counter_u64_free(extfree_rels);
418}
419
420uint64_t
421t4_sge_extfree_refs(void)
422{
423	uint64_t refs, rels;
424
425	rels = counter_u64_fetch(extfree_rels);
426	refs = counter_u64_fetch(extfree_refs);
427
428	return (refs - rels);
429}
430
431static inline void
432setup_pad_and_pack_boundaries(struct adapter *sc)
433{
434	uint32_t v, m;
435	int pad, pack, pad_shift;
436
437	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
438	    X_INGPADBOUNDARY_SHIFT;
439	pad = fl_pad;
440	if (fl_pad < (1 << pad_shift) ||
441	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
442	    !powerof2(fl_pad)) {
443		/*
444		 * If there is any chance that we might use buffer packing and
445		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
446		 * it to the minimum allowed in all other cases.
447		 */
448		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
449
450		/*
451		 * For fl_pad = 0 we'll still write a reasonable value to the
452		 * register but all the freelists will opt out of padding.
453		 * We'll complain here only if the user tried to set it to a
454		 * value greater than 0 that was invalid.
455		 */
456		if (fl_pad > 0) {
457			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
458			    " (%d), using %d instead.\n", fl_pad, pad);
459		}
460	}
461	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
462	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
463	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
464
465	if (is_t4(sc)) {
466		if (fl_pack != -1 && fl_pack != pad) {
467			/* Complain but carry on. */
468			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
469			    " using %d instead.\n", fl_pack, pad);
470		}
471		return;
472	}
473
474	pack = fl_pack;
475	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
476	    !powerof2(fl_pack)) {
477		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
478		MPASS(powerof2(pack));
479		if (pack < 16)
480			pack = 16;
481		if (pack == 32)
482			pack = 64;
483		if (pack > 4096)
484			pack = 4096;
485		if (fl_pack != -1) {
486			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
487			    " (%d), using %d instead.\n", fl_pack, pack);
488		}
489	}
490	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
491	if (pack == 16)
492		v = V_INGPACKBOUNDARY(0);
493	else
494		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
495
496	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
497	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
498}
499
500/*
501 * adap->params.vpd.cclk must be set up before this is called.
502 */
503void
504t4_tweak_chip_settings(struct adapter *sc)
505{
506	int i;
507	uint32_t v, m;
508	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
509	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
510	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
511	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
512	static int sge_flbuf_sizes[] = {
513		MCLBYTES,
514#if MJUMPAGESIZE != MCLBYTES
515		MJUMPAGESIZE,
516		MJUMPAGESIZE - CL_METADATA_SIZE,
517		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
518#endif
519		MJUM9BYTES,
520		MJUM16BYTES,
521		MCLBYTES - MSIZE - CL_METADATA_SIZE,
522		MJUM9BYTES - CL_METADATA_SIZE,
523		MJUM16BYTES - CL_METADATA_SIZE,
524	};
525
526	KASSERT(sc->flags & MASTER_PF,
527	    ("%s: trying to change chip settings when not master.", __func__));
528
529	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
530	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
531	    V_EGRSTATUSPAGESIZE(spg_len == 128);
532	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
533
534	setup_pad_and_pack_boundaries(sc);
535
536	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
537	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
538	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
539	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
540	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
541	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
542	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
543	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
544	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
545
546	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
547	    ("%s: hw buffer size table too big", __func__));
548	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
549		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
550		    sge_flbuf_sizes[i]);
551	}
552
553	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
554	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
555	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
556
557	KASSERT(intr_timer[0] <= timer_max,
558	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
559	    timer_max));
560	for (i = 1; i < nitems(intr_timer); i++) {
561		KASSERT(intr_timer[i] >= intr_timer[i - 1],
562		    ("%s: timers not listed in increasing order (%d)",
563		    __func__, i));
564
565		while (intr_timer[i] > timer_max) {
566			if (i == nitems(intr_timer) - 1) {
567				intr_timer[i] = timer_max;
568				break;
569			}
570			intr_timer[i] += intr_timer[i - 1];
571			intr_timer[i] /= 2;
572		}
573	}
574
575	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
576	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
577	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
578	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
579	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
580	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
581	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
582	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
583	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
584
585	/* 4K, 16K, 64K, 256K DDP "page sizes" */
586	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
587	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
588
589	m = v = F_TDDPTAGTCB;
590	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
591
592	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
593	    F_RESETDDPOFFSET;
594	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
595	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
596}
597
598/*
599 * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
600 * padding is in use, the buffer's start and end need to be aligned to the pad
601 * boundary as well.  We'll just make sure that the size is a multiple of the
602 * boundary here, it is up to the buffer allocation code to make sure the start
603 * of the buffer is aligned as well.
604 */
605static inline int
606hwsz_ok(struct adapter *sc, int hwsz)
607{
608	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
609
610	return (hwsz >= 64 && (hwsz & mask) == 0);
611}
612
613/*
614 * XXX: driver really should be able to deal with unexpected settings.
615 */
616int
617t4_read_chip_settings(struct adapter *sc)
618{
619	struct sge *s = &sc->sge;
620	struct sge_params *sp = &sc->params.sge;
621	int i, j, n, rc = 0;
622	uint32_t m, v, r;
623	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
624	static int sw_buf_sizes[] = {	/* Sorted by size */
625		MCLBYTES,
626#if MJUMPAGESIZE != MCLBYTES
627		MJUMPAGESIZE,
628#endif
629		MJUM9BYTES,
630		MJUM16BYTES
631	};
632	struct sw_zone_info *swz, *safe_swz;
633	struct hw_buf_info *hwb;
634
635	m = F_RXPKTCPLMODE;
636	v = F_RXPKTCPLMODE;
637	r = sc->params.sge.sge_control;
638	if ((r & m) != v) {
639		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
640		rc = EINVAL;
641	}
642
643	/*
644	 * If this changes then every single use of PAGE_SHIFT in the driver
645	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
646	 */
647	if (sp->page_shift != PAGE_SHIFT) {
648		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
649		rc = EINVAL;
650	}
651
652	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
653	hwb = &s->hw_buf_info[0];
654	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
655		r = sc->params.sge.sge_fl_buffer_size[i];
656		hwb->size = r;
657		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
658		hwb->next = -1;
659	}
660
661	/*
662	 * Create a sorted list in decreasing order of hw buffer sizes (and so
663	 * increasing order of spare area) for each software zone.
664	 *
665	 * If padding is enabled then the start and end of the buffer must align
666	 * to the pad boundary; if packing is enabled then they must align with
667	 * the pack boundary as well.  Allocations from the cluster zones are
668	 * aligned to min(size, 4K), so the buffer starts at that alignment and
669	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
670	 * starting alignment will be reduced to MSIZE and the driver will
671	 * exercise appropriate caution when deciding on the best buffer layout
672	 * to use.
673	 */
674	n = 0;	/* no usable buffer size to begin with */
675	swz = &s->sw_zone_info[0];
676	safe_swz = NULL;
677	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
678		int8_t head = -1, tail = -1;
679
680		swz->size = sw_buf_sizes[i];
681		swz->zone = m_getzone(swz->size);
682		swz->type = m_gettype(swz->size);
683
684		if (swz->size < PAGE_SIZE) {
685			MPASS(powerof2(swz->size));
686			if (fl_pad && (swz->size % sp->pad_boundary != 0))
687				continue;
688		}
689
690		if (swz->size == safest_rx_cluster)
691			safe_swz = swz;
692
693		hwb = &s->hw_buf_info[0];
694		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
695			if (hwb->zidx != -1 || hwb->size > swz->size)
696				continue;
697#ifdef INVARIANTS
698			if (fl_pad)
699				MPASS(hwb->size % sp->pad_boundary == 0);
700#endif
701			hwb->zidx = i;
702			if (head == -1)
703				head = tail = j;
704			else if (hwb->size < s->hw_buf_info[tail].size) {
705				s->hw_buf_info[tail].next = j;
706				tail = j;
707			} else {
708				int8_t *cur;
709				struct hw_buf_info *t;
710
711				for (cur = &head; *cur != -1; cur = &t->next) {
712					t = &s->hw_buf_info[*cur];
713					if (hwb->size == t->size) {
714						hwb->zidx = -2;
715						break;
716					}
717					if (hwb->size > t->size) {
718						hwb->next = *cur;
719						*cur = j;
720						break;
721					}
722				}
723			}
724		}
725		swz->head_hwidx = head;
726		swz->tail_hwidx = tail;
727
728		if (tail != -1) {
729			n++;
730			if (swz->size - s->hw_buf_info[tail].size >=
731			    CL_METADATA_SIZE)
732				sc->flags |= BUF_PACKING_OK;
733		}
734	}
735	if (n == 0) {
736		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
737		rc = EINVAL;
738	}
739
740	s->safe_hwidx1 = -1;
741	s->safe_hwidx2 = -1;
742	if (safe_swz != NULL) {
743		s->safe_hwidx1 = safe_swz->head_hwidx;
744		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
745			int spare;
746
747			hwb = &s->hw_buf_info[i];
748#ifdef INVARIANTS
749			if (fl_pad)
750				MPASS(hwb->size % sp->pad_boundary == 0);
751#endif
752			spare = safe_swz->size - hwb->size;
753			if (spare >= CL_METADATA_SIZE) {
754				s->safe_hwidx2 = i;
755				break;
756			}
757		}
758	}
759
760	if (sc->flags & IS_VF)
761		return (0);
762
763	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
764	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
765	if (r != v) {
766		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
767		rc = EINVAL;
768	}
769
770	m = v = F_TDDPTAGTCB;
771	r = t4_read_reg(sc, A_ULP_RX_CTL);
772	if ((r & m) != v) {
773		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
774		rc = EINVAL;
775	}
776
777	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
778	    F_RESETDDPOFFSET;
779	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
780	r = t4_read_reg(sc, A_TP_PARA_REG5);
781	if ((r & m) != v) {
782		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
783		rc = EINVAL;
784	}
785
786	t4_init_tp_params(sc);
787
788	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
789	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
790
791	return (rc);
792}
793
794int
795t4_create_dma_tag(struct adapter *sc)
796{
797	int rc;
798
799	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
800	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
801	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
802	    NULL, &sc->dmat);
803	if (rc != 0) {
804		device_printf(sc->dev,
805		    "failed to create main DMA tag: %d\n", rc);
806	}
807
808	return (rc);
809}
810
811void
812t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
813    struct sysctl_oid_list *children)
814{
815	struct sge_params *sp = &sc->params.sge;
816
817	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
818	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
819	    "freelist buffer sizes");
820
821	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
822	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
823
824	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
825	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");
826
827	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
828	    NULL, sp->spg_len, "status page size (bytes)");
829
830	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
831	    NULL, cong_drop, "congestion drop setting");
832
833	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
834	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
835}
836
837int
838t4_destroy_dma_tag(struct adapter *sc)
839{
840	if (sc->dmat)
841		bus_dma_tag_destroy(sc->dmat);
842
843	return (0);
844}
845
846/*
847 * Allocate and initialize the firmware event queue and the management queue.
848 *
849 * Returns errno on failure.  Resources allocated up to that point may still be
850 * allocated.  Caller is responsible for cleanup in case this function fails.
851 */
852int
853t4_setup_adapter_queues(struct adapter *sc)
854{
855	int rc;
856
857	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
858
859	sysctl_ctx_init(&sc->ctx);
860	sc->flags |= ADAP_SYSCTL_CTX;
861
862	/*
863	 * Firmware event queue
864	 */
865	rc = alloc_fwq(sc);
866	if (rc != 0)
867		return (rc);
868
869	/*
870	 * Management queue.  This is just a control queue that uses the fwq as
871	 * its associated iq.
872	 */
873	if (!(sc->flags & IS_VF))
874		rc = alloc_mgmtq(sc);
875
876	return (rc);
877}
878
879/*
880 * Idempotent
881 */
882int
883t4_teardown_adapter_queues(struct adapter *sc)
884{
885
886	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
887
888	/* Do this before freeing the queue */
889	if (sc->flags & ADAP_SYSCTL_CTX) {
890		sysctl_ctx_free(&sc->ctx);
891		sc->flags &= ~ADAP_SYSCTL_CTX;
892	}
893
894	free_mgmtq(sc);
895	free_fwq(sc);
896
897	return (0);
898}
899
900static inline int
901first_vector(struct vi_info *vi)
902{
903	struct adapter *sc = vi->pi->adapter;
904
905	if (sc->intr_count == 1)
906		return (0);
907
908	return (vi->first_intr);
909}
910
911/*
912 * Given an arbitrary "index," come up with an iq that can be used by other
913 * queues (of this VI) for interrupt forwarding, SGE egress updates, etc.
914 * The iq returned is guaranteed to be something that takes direct interrupts.
915 */
916static struct sge_iq *
917vi_intr_iq(struct vi_info *vi, int idx)
918{
919	struct adapter *sc = vi->pi->adapter;
920	struct sge *s = &sc->sge;
921	struct sge_iq *iq = NULL;
922	int nintr, i;
923
924	if (sc->intr_count == 1)
925		return (&sc->sge.fwq);
926
927	nintr = vi->nintr;
928	KASSERT(nintr != 0,
929	    ("%s: vi %p has no exclusive interrupts, total interrupts = %d",
930	    __func__, vi, sc->intr_count));
931	i = idx % nintr;
932
933	if (vi->flags & INTR_RXQ) {
934	       	if (i < vi->nrxq) {
935			iq = &s->rxq[vi->first_rxq + i].iq;
936			goto done;
937		}
938		i -= vi->nrxq;
939	}
940#ifdef TCP_OFFLOAD
941	if (vi->flags & INTR_OFLD_RXQ) {
942	       	if (i < vi->nofldrxq) {
943			iq = &s->ofld_rxq[vi->first_ofld_rxq + i].iq;
944			goto done;
945		}
946		i -= vi->nofldrxq;
947	}
948#endif
949	panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__,
950	    vi, vi->flags & INTR_ALL, idx, nintr);
951done:
952	MPASS(iq != NULL);
953	KASSERT(iq->flags & IQ_INTR,
954	    ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi,
955	    vi->flags & INTR_ALL, idx));
956	return (iq);
957}
958
959/* Maximum payload that can be delivered with a single iq descriptor */
960static inline int
961mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
962{
963	int payload;
964
965#ifdef TCP_OFFLOAD
966	if (toe) {
967		payload = sc->tt.rx_coalesce ?
968		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
969	} else {
970#endif
971		/* large enough even when hw VLAN extraction is disabled */
972		payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
973		    ETHER_VLAN_ENCAP_LEN + mtu;
974#ifdef TCP_OFFLOAD
975	}
976#endif
977
978	return (payload);
979}
980
981int
982t4_setup_vi_queues(struct vi_info *vi)
983{
984	int rc = 0, i, j, intr_idx, iqid;
985	struct sge_rxq *rxq;
986	struct sge_txq *txq;
987	struct sge_wrq *ctrlq;
988#ifdef TCP_OFFLOAD
989	struct sge_ofld_rxq *ofld_rxq;
990	struct sge_wrq *ofld_txq;
991#endif
992#ifdef DEV_NETMAP
993	int saved_idx;
994	struct sge_nm_rxq *nm_rxq;
995	struct sge_nm_txq *nm_txq;
996#endif
997	char name[16];
998	struct port_info *pi = vi->pi;
999	struct adapter *sc = pi->adapter;
1000	struct ifnet *ifp = vi->ifp;
1001	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
1002	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
1003	int maxp, mtu = ifp->if_mtu;
1004
1005	/* Interrupt vector to start from (when using multiple vectors) */
1006	intr_idx = first_vector(vi);
1007
1008#ifdef DEV_NETMAP
1009	saved_idx = intr_idx;
1010	if (ifp->if_capabilities & IFCAP_NETMAP) {
1011
1012		/* netmap is supported with direct interrupts only. */
1013		MPASS(vi->flags & INTR_RXQ);
1014
1015		/*
1016		 * We don't have buffers to back the netmap rx queues
1017		 * right now so we create the queues in a way that
1018		 * doesn't set off any congestion signal in the chip.
1019		 */
1020		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
1021		    CTLFLAG_RD, NULL, "rx queues");
1022		for_each_nm_rxq(vi, i, nm_rxq) {
1023			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
1024			if (rc != 0)
1025				goto done;
1026			intr_idx++;
1027		}
1028
1029		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
1030		    CTLFLAG_RD, NULL, "tx queues");
1031		for_each_nm_txq(vi, i, nm_txq) {
1032			iqid = vi->first_nm_rxq + (i % vi->nnmrxq);
1033			rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid);
1034			if (rc != 0)
1035				goto done;
1036		}
1037	}
1038
1039	/* Normal rx queues and netmap rx queues share the same interrupts. */
1040	intr_idx = saved_idx;
1041#endif
1042
1043	/*
1044	 * First pass over all NIC and TOE rx queues:
1045	 * a) initialize iq and fl
1046	 * b) allocate queue iff it will take direct interrupts.
1047	 */
1048	maxp = mtu_to_max_payload(sc, mtu, 0);
1049	if (vi->flags & INTR_RXQ) {
1050		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
1051		    CTLFLAG_RD, NULL, "rx queues");
1052	}
1053	for_each_rxq(vi, i, rxq) {
1054
1055		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
1056
1057		snprintf(name, sizeof(name), "%s rxq%d-fl",
1058		    device_get_nameunit(vi->dev), i);
1059		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
1060
1061		if (vi->flags & INTR_RXQ) {
1062			rxq->iq.flags |= IQ_INTR;
1063			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
1064			if (rc != 0)
1065				goto done;
1066			intr_idx++;
1067		}
1068	}
1069#ifdef DEV_NETMAP
1070	if (ifp->if_capabilities & IFCAP_NETMAP)
1071		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
1072#endif
1073#ifdef TCP_OFFLOAD
1074	maxp = mtu_to_max_payload(sc, mtu, 1);
1075	if (vi->flags & INTR_OFLD_RXQ) {
1076		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
1077		    CTLFLAG_RD, NULL,
1078		    "rx queues for offloaded TCP connections");
1079	}
1080	for_each_ofld_rxq(vi, i, ofld_rxq) {
1081
1082		init_iq(&ofld_rxq->iq, sc, vi->tmr_idx, vi->pktc_idx,
1083		    vi->qsize_rxq);
1084
1085		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
1086		    device_get_nameunit(vi->dev), i);
1087		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
1088
1089		if (vi->flags & INTR_OFLD_RXQ) {
1090			ofld_rxq->iq.flags |= IQ_INTR;
1091			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
1092			if (rc != 0)
1093				goto done;
1094			intr_idx++;
1095		}
1096	}
1097#endif
1098
1099	/*
1100	 * Second pass over all NIC and TOE rx queues.  The queues forwarding
1101	 * their interrupts are allocated now.
1102	 */
1103	j = 0;
1104	if (!(vi->flags & INTR_RXQ)) {
1105		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
1106		    CTLFLAG_RD, NULL, "rx queues");
1107		for_each_rxq(vi, i, rxq) {
1108			MPASS(!(rxq->iq.flags & IQ_INTR));
1109
1110			intr_idx = vi_intr_iq(vi, j)->abs_id;
1111
1112			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
1113			if (rc != 0)
1114				goto done;
1115			j++;
1116		}
1117	}
1118#ifdef TCP_OFFLOAD
1119	if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) {
1120		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
1121		    CTLFLAG_RD, NULL,
1122		    "rx queues for offloaded TCP connections");
1123		for_each_ofld_rxq(vi, i, ofld_rxq) {
1124			MPASS(!(ofld_rxq->iq.flags & IQ_INTR));
1125
1126			intr_idx = vi_intr_iq(vi, j)->abs_id;
1127
1128			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
1129			if (rc != 0)
1130				goto done;
1131			j++;
1132		}
1133	}
1134#endif
1135
1136	/*
1137	 * Now the tx queues.  Only one pass needed.
1138	 */
1139	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
1140	    NULL, "tx queues");
1141	j = 0;
1142	for_each_txq(vi, i, txq) {
1143		iqid = vi_intr_iq(vi, j)->cntxt_id;
1144		snprintf(name, sizeof(name), "%s txq%d",
1145		    device_get_nameunit(vi->dev), i);
1146		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid,
1147		    name);
1148
1149		rc = alloc_txq(vi, txq, i, oid);
1150		if (rc != 0)
1151			goto done;
1152		j++;
1153	}
1154#ifdef TCP_OFFLOAD
1155	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
1156	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
1157	for_each_ofld_txq(vi, i, ofld_txq) {
1158		struct sysctl_oid *oid2;
1159
1160		iqid = vi_intr_iq(vi, j)->cntxt_id;
1161		snprintf(name, sizeof(name), "%s ofld_txq%d",
1162		    device_get_nameunit(vi->dev), i);
1163		init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
1164		    iqid, name);
1165
1166		snprintf(name, sizeof(name), "%d", i);
1167		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
1168		    name, CTLFLAG_RD, NULL, "offload tx queue");
1169
1170		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
1171		if (rc != 0)
1172			goto done;
1173		j++;
1174	}
1175#endif
1176
1177	/*
1178	 * Finally, the control queue.
1179	 */
1180	if (!IS_MAIN_VI(vi) || sc->flags & IS_VF)
1181		goto done;
1182	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
1183	    NULL, "ctrl queue");
1184	ctrlq = &sc->sge.ctrlq[pi->port_id];
1185	iqid = vi_intr_iq(vi, 0)->cntxt_id;
1186	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev));
1187	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid,
1188	    name);
1189	rc = alloc_wrq(sc, vi, ctrlq, oid);
1190
1191done:
1192	if (rc)
1193		t4_teardown_vi_queues(vi);
1194
1195	return (rc);
1196}
1197
1198/*
1199 * Idempotent
1200 */
1201int
1202t4_teardown_vi_queues(struct vi_info *vi)
1203{
1204	int i;
1205	struct port_info *pi = vi->pi;
1206	struct adapter *sc = pi->adapter;
1207	struct sge_rxq *rxq;
1208	struct sge_txq *txq;
1209#ifdef TCP_OFFLOAD
1210	struct sge_ofld_rxq *ofld_rxq;
1211	struct sge_wrq *ofld_txq;
1212#endif
1213#ifdef DEV_NETMAP
1214	struct sge_nm_rxq *nm_rxq;
1215	struct sge_nm_txq *nm_txq;
1216#endif
1217
1218	/* Do this before freeing the queues */
1219	if (vi->flags & VI_SYSCTL_CTX) {
1220		sysctl_ctx_free(&vi->ctx);
1221		vi->flags &= ~VI_SYSCTL_CTX;
1222	}
1223
1224#ifdef DEV_NETMAP
1225	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
1226		for_each_nm_txq(vi, i, nm_txq) {
1227			free_nm_txq(vi, nm_txq);
1228		}
1229
1230		for_each_nm_rxq(vi, i, nm_rxq) {
1231			free_nm_rxq(vi, nm_rxq);
1232		}
1233	}
1234#endif
1235
1236	/*
1237	 * Take down all the tx queues first, as they reference the rx queues
1238	 * (for egress updates, etc.).
1239	 */
1240
1241	if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF))
1242		free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
1243
1244	for_each_txq(vi, i, txq) {
1245		free_txq(vi, txq);
1246	}
1247#ifdef TCP_OFFLOAD
1248	for_each_ofld_txq(vi, i, ofld_txq) {
1249		free_wrq(sc, ofld_txq);
1250	}
1251#endif
1252
1253	/*
1254	 * Then take down the rx queues that forward their interrupts, as they
1255	 * reference other rx queues.
1256	 */
1257
1258	for_each_rxq(vi, i, rxq) {
1259		if ((rxq->iq.flags & IQ_INTR) == 0)
1260			free_rxq(vi, rxq);
1261	}
1262#ifdef TCP_OFFLOAD
1263	for_each_ofld_rxq(vi, i, ofld_rxq) {
1264		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
1265			free_ofld_rxq(vi, ofld_rxq);
1266	}
1267#endif
1268
1269	/*
1270	 * Then take down the rx queues that take direct interrupts.
1271	 */
1272
1273	for_each_rxq(vi, i, rxq) {
1274		if (rxq->iq.flags & IQ_INTR)
1275			free_rxq(vi, rxq);
1276	}
1277#ifdef TCP_OFFLOAD
1278	for_each_ofld_rxq(vi, i, ofld_rxq) {
1279		if (ofld_rxq->iq.flags & IQ_INTR)
1280			free_ofld_rxq(vi, ofld_rxq);
1281	}
1282#endif
1283
1284	return (0);
1285}
1286
1287/*
1288 * Deals with errors and the firmware event queue.  All data rx queues forward
1289 * their interrupt to the firmware event queue.
1290 */
1291void
1292t4_intr_all(void *arg)
1293{
1294	struct adapter *sc = arg;
1295	struct sge_iq *fwq = &sc->sge.fwq;
1296
1297	t4_intr_err(arg);
1298	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
1299		service_iq(fwq, 0);
1300		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
1301	}
1302}
1303
1304/* Deals with error interrupts */
1305void
1306t4_intr_err(void *arg)
1307{
1308	struct adapter *sc = arg;
1309
1310	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
1311	t4_slow_intr_handler(sc);
1312}
1313
1314void
1315t4_intr_evt(void *arg)
1316{
1317	struct sge_iq *iq = arg;
1318
1319	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1320		service_iq(iq, 0);
1321		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1322	}
1323}
1324
1325void
1326t4_intr(void *arg)
1327{
1328	struct sge_iq *iq = arg;
1329
1330	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1331		service_iq(iq, 0);
1332		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1333	}
1334}
1335
1336void
1337t4_vi_intr(void *arg)
1338{
1339	struct irq *irq = arg;
1340
1341#ifdef DEV_NETMAP
1342	if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) {
1343		t4_nm_intr(irq->nm_rxq);
1344		atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON);
1345	}
1346#endif
1347	if (irq->rxq != NULL)
1348		t4_intr(irq->rxq);
1349}
1350
1351/*
1352 * Deals with anything and everything on the given ingress queue.
1353 */
1354static int
1355service_iq(struct sge_iq *iq, int budget)
1356{
1357	struct sge_iq *q;
1358	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
1359	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
1360	struct adapter *sc = iq->adapter;
1361	struct iq_desc *d = &iq->desc[iq->cidx];
1362	int ndescs = 0, limit;
1363	int rsp_type, refill;
1364	uint32_t lq;
1365	uint16_t fl_hw_cidx;
1366	struct mbuf *m0;
1367	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
1368#if defined(INET) || defined(INET6)
1369	const struct timeval lro_timeout = {0, sc->lro_timeout};
1370#endif
1371
1372	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1373
1374	limit = budget ? budget : iq->qsize / 16;
1375
1376	if (iq->flags & IQ_HAS_FL) {
1377		fl = &rxq->fl;
1378		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
1379	} else {
1380		fl = NULL;
1381		fl_hw_cidx = 0;			/* to silence gcc warning */
1382	}
1383
1384	/*
1385	 * We always come back and check the descriptor ring for new indirect
1386	 * interrupts and other responses after running a single handler.
1387	 */
1388	for (;;) {
1389		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1390
1391			rmb();
1392
1393			refill = 0;
1394			m0 = NULL;
1395			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1396			lq = be32toh(d->rsp.pldbuflen_qid);
1397
1398			switch (rsp_type) {
1399			case X_RSPD_TYPE_FLBUF:
1400
1401				KASSERT(iq->flags & IQ_HAS_FL,
1402				    ("%s: data for an iq (%p) with no freelist",
1403				    __func__, iq));
1404
1405				m0 = get_fl_payload(sc, fl, lq);
1406				if (__predict_false(m0 == NULL))
1407					goto process_iql;
1408				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
1409#ifdef T4_PKT_TIMESTAMP
1410				/*
1411				 * 60 bit timestamp for the payload is
1412				 * *(uint64_t *)m0->m_pktdat.  Note that it is
1413				 * in the leading free-space in the mbuf.  The
1414				 * kernel can clobber it during a pullup,
1415				 * m_copymdata, etc.  You need to make sure that
1416				 * the mbuf reaches you unmolested if you care
1417				 * about the timestamp.
1418				 */
1419				*(uint64_t *)m0->m_pktdat =
1420				    be64toh(ctrl->u.last_flit) &
1421				    0xfffffffffffffff;
1422#endif
1423
1424				/* fall through */
1425
1426			case X_RSPD_TYPE_CPL:
1427				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1428				    ("%s: bad opcode %02x.", __func__,
1429				    d->rss.opcode));
1430				t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
1431				break;
1432
1433			case X_RSPD_TYPE_INTR:
1434
1435				/*
1436				 * Interrupts should be forwarded only to queues
1437				 * that are not forwarding their interrupts.
1438				 * This means service_iq can recurse but only 1
1439				 * level deep.
1440				 */
1441				KASSERT(budget == 0,
1442				    ("%s: budget %u, rsp_type %u", __func__,
1443				    budget, rsp_type));
1444
1445				/*
1446				 * There are 1K interrupt-capable queues (qids 0
1447				 * through 1023).  A response type indicating a
1448				 * forwarded interrupt with a qid >= 1K is an
1449				 * iWARP async notification.
1450				 */
1451				if (lq >= 1024) {
1452                                        t4_an_handler(iq, &d->rsp);
1453                                        break;
1454                                }
1455
1456				q = sc->sge.iqmap[lq - sc->sge.iq_start -
1457				    sc->sge.iq_base];
1458				if (atomic_cmpset_int(&q->state, IQS_IDLE,
1459				    IQS_BUSY)) {
1460					if (service_iq(q, q->qsize / 16) == 0) {
1461						atomic_cmpset_int(&q->state,
1462						    IQS_BUSY, IQS_IDLE);
1463					} else {
1464						STAILQ_INSERT_TAIL(&iql, q,
1465						    link);
1466					}
1467				}
1468				break;
1469
1470			default:
1471				KASSERT(0,
1472				    ("%s: illegal response type %d on iq %p",
1473				    __func__, rsp_type, iq));
1474				log(LOG_ERR,
1475				    "%s: illegal response type %d on iq %p",
1476				    device_get_nameunit(sc->dev), rsp_type, iq);
1477				break;
1478			}
1479
1480			d++;
1481			if (__predict_false(++iq->cidx == iq->sidx)) {
1482				iq->cidx = 0;
1483				iq->gen ^= F_RSPD_GEN;
1484				d = &iq->desc[0];
1485			}
1486			if (__predict_false(++ndescs == limit)) {
1487				t4_write_reg(sc, sc->sge_gts_reg,
1488				    V_CIDXINC(ndescs) |
1489				    V_INGRESSQID(iq->cntxt_id) |
1490				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1491				ndescs = 0;
1492
1493#if defined(INET) || defined(INET6)
1494				if (iq->flags & IQ_LRO_ENABLED &&
1495				    sc->lro_timeout != 0) {
1496					tcp_lro_flush_inactive(&rxq->lro,
1497					    &lro_timeout);
1498				}
1499#endif
1500
1501				if (budget) {
1502					if (iq->flags & IQ_HAS_FL) {
1503						FL_LOCK(fl);
1504						refill_fl(sc, fl, 32);
1505						FL_UNLOCK(fl);
1506					}
1507					return (EINPROGRESS);
1508				}
1509			}
1510			if (refill) {
1511				FL_LOCK(fl);
1512				refill_fl(sc, fl, 32);
1513				FL_UNLOCK(fl);
1514				fl_hw_cidx = fl->hw_cidx;
1515			}
1516		}
1517
1518process_iql:
1519		if (STAILQ_EMPTY(&iql))
1520			break;
1521
1522		/*
1523		 * Process the head only, and send it to the back of the list if
1524		 * it's still not done.
1525		 */
1526		q = STAILQ_FIRST(&iql);
1527		STAILQ_REMOVE_HEAD(&iql, link);
1528		if (service_iq(q, q->qsize / 8) == 0)
1529			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1530		else
1531			STAILQ_INSERT_TAIL(&iql, q, link);
1532	}
1533
1534#if defined(INET) || defined(INET6)
1535	if (iq->flags & IQ_LRO_ENABLED) {
1536		struct lro_ctrl *lro = &rxq->lro;
1537		struct lro_entry *l;
1538
1539		while (!SLIST_EMPTY(&lro->lro_active)) {
1540			l = SLIST_FIRST(&lro->lro_active);
1541			SLIST_REMOVE_HEAD(&lro->lro_active, next);
1542			tcp_lro_flush(lro, l);
1543		}
1544	}
1545#endif
1546
1547	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1548	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1549
1550	if (iq->flags & IQ_HAS_FL) {
1551		int starved;
1552
1553		FL_LOCK(fl);
1554		starved = refill_fl(sc, fl, 64);
1555		FL_UNLOCK(fl);
1556		if (__predict_false(starved != 0))
1557			add_fl_to_sfl(sc, fl);
1558	}
1559
1560	return (0);
1561}
1562
1563static inline int
1564cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
1565{
1566	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
1567
1568	if (rc)
1569		MPASS(cll->region3 >= CL_METADATA_SIZE);
1570
1571	return (rc);
1572}
1573
1574static inline struct cluster_metadata *
1575cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
1576    caddr_t cl)
1577{
1578
1579	if (cl_has_metadata(fl, cll)) {
1580		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1581
1582		return ((struct cluster_metadata *)(cl + swz->size) - 1);
1583	}
1584	return (NULL);
1585}
1586
1587static int
1588rxb_free(struct mbuf *m, void *arg1, void *arg2)
1589{
1590	uma_zone_t zone = arg1;
1591	caddr_t cl = arg2;
1592
1593	uma_zfree(zone, cl);
1594	counter_u64_add(extfree_rels, 1);
1595
1596	return (EXT_FREE_OK);
1597}
1598
1599/*
1600 * The mbuf returned by this function could be allocated from zone_mbuf or
1601 * constructed in spare room in the cluster.
1602 *
1603 * The mbuf carries the payload in one of these ways
1604 * a) frame inside the mbuf (mbuf from zone_mbuf)
1605 * b) m_cljset (for clusters without metadata) zone_mbuf
1606 * c) m_extaddref (cluster with metadata) inline mbuf
1607 * d) m_extaddref (cluster with metadata) zone_mbuf
1608 */
1609static struct mbuf *
1610get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1611    int remaining)
1612{
1613	struct mbuf *m;
1614	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1615	struct cluster_layout *cll = &sd->cll;
1616	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1617	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
1618	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
1619	int len, blen;
1620	caddr_t payload;
1621
1622	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
1623	len = min(remaining, blen);
1624	payload = sd->cl + cll->region1 + fl->rx_offset;
1625	if (fl->flags & FL_BUF_PACKING) {
1626		const u_int l = fr_offset + len;
1627		const u_int pad = roundup2(l, fl->buf_boundary) - l;
1628
1629		if (fl->rx_offset + len + pad < hwb->size)
1630			blen = len + pad;
1631		MPASS(fl->rx_offset + blen <= hwb->size);
1632	} else {
1633		MPASS(fl->rx_offset == 0);	/* not packing */
1634	}
1635
1636
1637	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1638
1639		/*
1640		 * Copy payload into a freshly allocated mbuf.
1641		 */
1642
1643		m = fr_offset == 0 ?
1644		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1645		if (m == NULL)
1646			return (NULL);
1647		fl->mbuf_allocated++;
1648#ifdef T4_PKT_TIMESTAMP
1649		/* Leave room for a timestamp */
1650		m->m_data += 8;
1651#endif
1652		/* copy data to mbuf */
1653		bcopy(payload, mtod(m, caddr_t), len);
1654
1655	} else if (sd->nmbuf * MSIZE < cll->region1) {
1656
1657		/*
1658		 * There's spare room in the cluster for an mbuf.  Create one
1659		 * and associate it with the payload that's in the cluster.
1660		 */
1661
1662		MPASS(clm != NULL);
1663		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
1664		/* No bzero required */
1665		if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA,
1666		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
1667			return (NULL);
1668		fl->mbuf_inlined++;
1669		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
1670		    swz->zone, sd->cl);
1671		if (sd->nmbuf++ == 0)
1672			counter_u64_add(extfree_refs, 1);
1673
1674	} else {
1675
1676		/*
1677		 * Grab an mbuf from zone_mbuf and associate it with the
1678		 * payload in the cluster.
1679		 */
1680
1681		m = fr_offset == 0 ?
1682		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1683		if (m == NULL)
1684			return (NULL);
1685		fl->mbuf_allocated++;
1686		if (clm != NULL) {
1687			m_extaddref(m, payload, blen, &clm->refcount,
1688			    rxb_free, swz->zone, sd->cl);
1689			if (sd->nmbuf++ == 0)
1690				counter_u64_add(extfree_refs, 1);
1691		} else {
1692			m_cljset(m, sd->cl, swz->type);
1693			sd->cl = NULL;	/* consumed, not a recycle candidate */
1694		}
1695	}
1696	if (fr_offset == 0)
1697		m->m_pkthdr.len = remaining;
1698	m->m_len = len;
1699
1700	if (fl->flags & FL_BUF_PACKING) {
1701		fl->rx_offset += blen;
1702		MPASS(fl->rx_offset <= hwb->size);
1703		if (fl->rx_offset < hwb->size)
1704			return (m);	/* without advancing the cidx */
1705	}
1706
1707	if (__predict_false(++fl->cidx % 8 == 0)) {
1708		uint16_t cidx = fl->cidx / 8;
1709
1710		if (__predict_false(cidx == fl->sidx))
1711			fl->cidx = cidx = 0;
1712		fl->hw_cidx = cidx;
1713	}
1714	fl->rx_offset = 0;
1715
1716	return (m);
1717}
1718
1719static struct mbuf *
1720get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
1721{
1722	struct mbuf *m0, *m, **pnext;
1723	u_int remaining;
1724	const u_int total = G_RSPD_LEN(len_newbuf);
1725
1726	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1727		M_ASSERTPKTHDR(fl->m0);
1728		MPASS(fl->m0->m_pkthdr.len == total);
1729		MPASS(fl->remaining < total);
1730
1731		m0 = fl->m0;
1732		pnext = fl->pnext;
1733		remaining = fl->remaining;
1734		fl->flags &= ~FL_BUF_RESUME;
1735		goto get_segment;
1736	}
1737
1738	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
1739		fl->rx_offset = 0;
1740		if (__predict_false(++fl->cidx % 8 == 0)) {
1741			uint16_t cidx = fl->cidx / 8;
1742
1743			if (__predict_false(cidx == fl->sidx))
1744				fl->cidx = cidx = 0;
1745			fl->hw_cidx = cidx;
1746		}
1747	}
1748
1749	/*
1750	 * Payload starts at rx_offset in the current hw buffer.  Its length is
1751	 * 'len' and it may span multiple hw buffers.
1752	 */
1753
1754	m0 = get_scatter_segment(sc, fl, 0, total);
1755	if (m0 == NULL)
1756		return (NULL);
1757	remaining = total - m0->m_len;
1758	pnext = &m0->m_next;
1759	while (remaining > 0) {
1760get_segment:
1761		MPASS(fl->rx_offset == 0);
1762		m = get_scatter_segment(sc, fl, total - remaining, remaining);
1763		if (__predict_false(m == NULL)) {
1764			fl->m0 = m0;
1765			fl->pnext = pnext;
1766			fl->remaining = remaining;
1767			fl->flags |= FL_BUF_RESUME;
1768			return (NULL);
1769		}
1770		*pnext = m;
1771		pnext = &m->m_next;
1772		remaining -= m->m_len;
1773	}
1774	*pnext = NULL;
1775
1776	M_ASSERTPKTHDR(m0);
1777	return (m0);
1778}
1779
1780static int
1781t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
1782{
1783	struct sge_rxq *rxq = iq_to_rxq(iq);
1784	struct ifnet *ifp = rxq->ifp;
1785	struct adapter *sc = iq->adapter;
1786	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
1787#if defined(INET) || defined(INET6)
1788	struct lro_ctrl *lro = &rxq->lro;
1789#endif
1790
1791	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
1792	    rss->opcode));
1793
1794	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
1795	m0->m_len -= sc->params.sge.fl_pktshift;
1796	m0->m_data += sc->params.sge.fl_pktshift;
1797
1798	m0->m_pkthdr.rcvif = ifp;
1799	M_HASHTYPE_SET(m0, M_HASHTYPE_OPAQUE);
1800	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
1801
1802	if (cpl->csum_calc && !cpl->err_vec) {
1803		if (ifp->if_capenable & IFCAP_RXCSUM &&
1804		    cpl->l2info & htobe32(F_RXF_IP)) {
1805			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
1806			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1807			rxq->rxcsum++;
1808		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
1809		    cpl->l2info & htobe32(F_RXF_IP6)) {
1810			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
1811			    CSUM_PSEUDO_HDR);
1812			rxq->rxcsum++;
1813		}
1814
1815		if (__predict_false(cpl->ip_frag))
1816			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
1817		else
1818			m0->m_pkthdr.csum_data = 0xffff;
1819	}
1820
1821	if (cpl->vlan_ex) {
1822		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
1823		m0->m_flags |= M_VLANTAG;
1824		rxq->vlan_extraction++;
1825	}
1826
1827#if defined(INET) || defined(INET6)
1828	if (iq->flags & IQ_LRO_ENABLED &&
1829	    tcp_lro_rx(lro, m0, 0) == 0) {
1830		/* queued for LRO */
1831	} else
1832#endif
1833	ifp->if_input(ifp, m0);
1834
1835	return (0);
1836}
1837
1838/*
1839 * Must drain the wrq or make sure that someone else will.
1840 */
1841static void
1842wrq_tx_drain(void *arg, int n)
1843{
1844	struct sge_wrq *wrq = arg;
1845	struct sge_eq *eq = &wrq->eq;
1846
1847	EQ_LOCK(eq);
1848	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
1849		drain_wrq_wr_list(wrq->adapter, wrq);
1850	EQ_UNLOCK(eq);
1851}
1852
1853static void
1854drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
1855{
1856	struct sge_eq *eq = &wrq->eq;
1857	u_int available, dbdiff;	/* # of hardware descriptors */
1858	u_int n;
1859	struct wrqe *wr;
1860	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
1861
1862	EQ_LOCK_ASSERT_OWNED(eq);
1863	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
1864	wr = STAILQ_FIRST(&wrq->wr_list);
1865	MPASS(wr != NULL);	/* Must be called with something useful to do */
1866	MPASS(eq->pidx == eq->dbidx);
1867	dbdiff = 0;
1868
1869	do {
1870		eq->cidx = read_hw_cidx(eq);
1871		if (eq->pidx == eq->cidx)
1872			available = eq->sidx - 1;
1873		else
1874			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
1875
1876		MPASS(wr->wrq == wrq);
1877		n = howmany(wr->wr_len, EQ_ESIZE);
1878		if (available < n)
1879			break;
1880
1881		dst = (void *)&eq->desc[eq->pidx];
1882		if (__predict_true(eq->sidx - eq->pidx > n)) {
1883			/* Won't wrap, won't end exactly at the status page. */
1884			bcopy(&wr->wr[0], dst, wr->wr_len);
1885			eq->pidx += n;
1886		} else {
1887			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
1888
1889			bcopy(&wr->wr[0], dst, first_portion);
1890			if (wr->wr_len > first_portion) {
1891				bcopy(&wr->wr[first_portion], &eq->desc[0],
1892				    wr->wr_len - first_portion);
1893			}
1894			eq->pidx = n - (eq->sidx - eq->pidx);
1895		}
1896		wrq->tx_wrs_copied++;
1897
1898		if (available < eq->sidx / 4 &&
1899		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
1900			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
1901			    F_FW_WR_EQUEQ);
1902			eq->equeqidx = eq->pidx;
1903		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
1904			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
1905			eq->equeqidx = eq->pidx;
1906		}
1907
1908		dbdiff += n;
1909		if (dbdiff >= 16) {
1910			ring_eq_db(sc, eq, dbdiff);
1911			dbdiff = 0;
1912		}
1913
1914		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1915		free_wrqe(wr);
1916		MPASS(wrq->nwr_pending > 0);
1917		wrq->nwr_pending--;
1918		MPASS(wrq->ndesc_needed >= n);
1919		wrq->ndesc_needed -= n;
1920	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
1921
1922	if (dbdiff)
1923		ring_eq_db(sc, eq, dbdiff);
1924}
1925
1926/*
1927 * Doesn't fail.  Holds on to work requests it can't send right away.
1928 */
1929void
1930t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
1931{
1932#ifdef INVARIANTS
1933	struct sge_eq *eq = &wrq->eq;
1934#endif
1935
1936	EQ_LOCK_ASSERT_OWNED(eq);
1937	MPASS(wr != NULL);
1938	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
1939	MPASS((wr->wr_len & 0x7) == 0);
1940
1941	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
1942	wrq->nwr_pending++;
1943	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
1944
1945	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
1946		return;	/* commit_wrq_wr will drain wr_list as well. */
1947
1948	drain_wrq_wr_list(sc, wrq);
1949
1950	/* Doorbell must have caught up to the pidx. */
1951	MPASS(eq->pidx == eq->dbidx);
1952}
1953
1954void
1955t4_update_fl_bufsize(struct ifnet *ifp)
1956{
1957	struct vi_info *vi = ifp->if_softc;
1958	struct adapter *sc = vi->pi->adapter;
1959	struct sge_rxq *rxq;
1960#ifdef TCP_OFFLOAD
1961	struct sge_ofld_rxq *ofld_rxq;
1962#endif
1963	struct sge_fl *fl;
1964	int i, maxp, mtu = ifp->if_mtu;
1965
1966	maxp = mtu_to_max_payload(sc, mtu, 0);
1967	for_each_rxq(vi, i, rxq) {
1968		fl = &rxq->fl;
1969
1970		FL_LOCK(fl);
1971		find_best_refill_source(sc, fl, maxp);
1972		FL_UNLOCK(fl);
1973	}
1974#ifdef TCP_OFFLOAD
1975	maxp = mtu_to_max_payload(sc, mtu, 1);
1976	for_each_ofld_rxq(vi, i, ofld_rxq) {
1977		fl = &ofld_rxq->fl;
1978
1979		FL_LOCK(fl);
1980		find_best_refill_source(sc, fl, maxp);
1981		FL_UNLOCK(fl);
1982	}
1983#endif
1984}
1985
1986static inline int
1987mbuf_nsegs(struct mbuf *m)
1988{
1989
1990	M_ASSERTPKTHDR(m);
1991	KASSERT(m->m_pkthdr.l5hlen > 0,
1992	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
1993
1994	return (m->m_pkthdr.l5hlen);
1995}
1996
1997static inline void
1998set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
1999{
2000
2001	M_ASSERTPKTHDR(m);
2002	m->m_pkthdr.l5hlen = nsegs;
2003}
2004
2005static inline int
2006mbuf_len16(struct mbuf *m)
2007{
2008	int n;
2009
2010	M_ASSERTPKTHDR(m);
2011	n = m->m_pkthdr.PH_loc.eigth[0];
2012	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
2013
2014	return (n);
2015}
2016
2017static inline void
2018set_mbuf_len16(struct mbuf *m, uint8_t len16)
2019{
2020
2021	M_ASSERTPKTHDR(m);
2022	m->m_pkthdr.PH_loc.eigth[0] = len16;
2023}
2024
2025static inline int
2026needs_tso(struct mbuf *m)
2027{
2028
2029	M_ASSERTPKTHDR(m);
2030
2031	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
2032		KASSERT(m->m_pkthdr.tso_segsz > 0,
2033		    ("%s: TSO requested in mbuf %p but MSS not provided",
2034		    __func__, m));
2035		return (1);
2036	}
2037
2038	return (0);
2039}
2040
2041static inline int
2042needs_l3_csum(struct mbuf *m)
2043{
2044
2045	M_ASSERTPKTHDR(m);
2046
2047	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
2048		return (1);
2049	return (0);
2050}
2051
2052static inline int
2053needs_l4_csum(struct mbuf *m)
2054{
2055
2056	M_ASSERTPKTHDR(m);
2057
2058	if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
2059	    CSUM_TCP_IPV6 | CSUM_TSO))
2060		return (1);
2061	return (0);
2062}
2063
2064static inline int
2065needs_vlan_insertion(struct mbuf *m)
2066{
2067
2068	M_ASSERTPKTHDR(m);
2069
2070	if (m->m_flags & M_VLANTAG) {
2071		KASSERT(m->m_pkthdr.ether_vtag != 0,
2072		    ("%s: HWVLAN requested in mbuf %p but tag not provided",
2073		    __func__, m));
2074		return (1);
2075	}
2076	return (0);
2077}
2078
2079static void *
2080m_advance(struct mbuf **pm, int *poffset, int len)
2081{
2082	struct mbuf *m = *pm;
2083	int offset = *poffset;
2084	uintptr_t p = 0;
2085
2086	MPASS(len > 0);
2087
2088	for (;;) {
2089		if (offset + len < m->m_len) {
2090			offset += len;
2091			p = mtod(m, uintptr_t) + offset;
2092			break;
2093		}
2094		len -= m->m_len - offset;
2095		m = m->m_next;
2096		offset = 0;
2097		MPASS(m != NULL);
2098	}
2099	*poffset = offset;
2100	*pm = m;
2101	return ((void *)p);
2102}
2103
2104static inline int
2105same_paddr(char *a, char *b)
2106{
2107
2108	if (a == b)
2109		return (1);
2110	else if (a != NULL && b != NULL) {
2111		vm_offset_t x = (vm_offset_t)a;
2112		vm_offset_t y = (vm_offset_t)b;
2113
2114		if ((x & PAGE_MASK) == (y & PAGE_MASK) &&
2115		    pmap_kextract(x) == pmap_kextract(y))
2116			return (1);
2117	}
2118
2119	return (0);
2120}
2121
2122/*
2123 * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
2124 * must have at least one mbuf that's not empty.
2125 */
2126static inline int
2127count_mbuf_nsegs(struct mbuf *m)
2128{
2129	char *prev_end, *start;
2130	int len, nsegs;
2131
2132	MPASS(m != NULL);
2133
2134	nsegs = 0;
2135	prev_end = NULL;
2136	for (; m; m = m->m_next) {
2137
2138		len = m->m_len;
2139		if (__predict_false(len == 0))
2140			continue;
2141		start = mtod(m, char *);
2142
2143		nsegs += sglist_count(start, len);
2144		if (same_paddr(prev_end, start))
2145			nsegs--;
2146		prev_end = start + len;
2147	}
2148
2149	MPASS(nsegs > 0);
2150	return (nsegs);
2151}
2152
2153/*
2154 * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
2155 * a) caller can assume it's been freed if this function returns with an error.
2156 * b) it may get defragged up if the gather list is too long for the hardware.
2157 */
2158int
2159parse_pkt(struct adapter *sc, struct mbuf **mp)
2160{
2161	struct mbuf *m0 = *mp, *m;
2162	int rc, nsegs, defragged = 0, offset;
2163	struct ether_header *eh;
2164	void *l3hdr;
2165#if defined(INET) || defined(INET6)
2166	struct tcphdr *tcp;
2167#endif
2168	uint16_t eh_type;
2169
2170	M_ASSERTPKTHDR(m0);
2171	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
2172		rc = EINVAL;
2173fail:
2174		m_freem(m0);
2175		*mp = NULL;
2176		return (rc);
2177	}
2178restart:
2179	/*
2180	 * First count the number of gather list segments in the payload.
2181	 * Defrag the mbuf if nsegs exceeds the hardware limit.
2182	 */
2183	M_ASSERTPKTHDR(m0);
2184	MPASS(m0->m_pkthdr.len > 0);
2185	nsegs = count_mbuf_nsegs(m0);
2186	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
2187		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
2188			rc = EFBIG;
2189			goto fail;
2190		}
2191		*mp = m0 = m;	/* update caller's copy after defrag */
2192		goto restart;
2193	}
2194
2195	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
2196		m0 = m_pullup(m0, m0->m_pkthdr.len);
2197		if (m0 == NULL) {
2198			/* Should have left well enough alone. */
2199			rc = EFBIG;
2200			goto fail;
2201		}
2202		*mp = m0;	/* update caller's copy after pullup */
2203		goto restart;
2204	}
2205	set_mbuf_nsegs(m0, nsegs);
2206	if (sc->flags & IS_VF)
2207		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
2208	else
2209		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
2210
2211	if (!needs_tso(m0) &&
2212	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
2213		return (0);
2214
2215	m = m0;
2216	eh = mtod(m, struct ether_header *);
2217	eh_type = ntohs(eh->ether_type);
2218	if (eh_type == ETHERTYPE_VLAN) {
2219		struct ether_vlan_header *evh = (void *)eh;
2220
2221		eh_type = ntohs(evh->evl_proto);
2222		m0->m_pkthdr.l2hlen = sizeof(*evh);
2223	} else
2224		m0->m_pkthdr.l2hlen = sizeof(*eh);
2225
2226	offset = 0;
2227	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2228
2229	switch (eh_type) {
2230#ifdef INET6
2231	case ETHERTYPE_IPV6:
2232	{
2233		struct ip6_hdr *ip6 = l3hdr;
2234
2235		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
2236
2237		m0->m_pkthdr.l3hlen = sizeof(*ip6);
2238		break;
2239	}
2240#endif
2241#ifdef INET
2242	case ETHERTYPE_IP:
2243	{
2244		struct ip *ip = l3hdr;
2245
2246		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
2247		break;
2248	}
2249#endif
2250	default:
2251		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
2252		    " with the same INET/INET6 options as the kernel.",
2253		    __func__, eh_type);
2254	}
2255
2256#if defined(INET) || defined(INET6)
2257	if (needs_tso(m0)) {
2258		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
2259		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
2260	}
2261#endif
2262	MPASS(m0 == *mp);
2263	return (0);
2264}
2265
2266void *
2267start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
2268{
2269	struct sge_eq *eq = &wrq->eq;
2270	struct adapter *sc = wrq->adapter;
2271	int ndesc, available;
2272	struct wrqe *wr;
2273	void *w;
2274
2275	MPASS(len16 > 0);
2276	ndesc = howmany(len16, EQ_ESIZE / 16);
2277	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
2278
2279	EQ_LOCK(eq);
2280
2281	if (!STAILQ_EMPTY(&wrq->wr_list))
2282		drain_wrq_wr_list(sc, wrq);
2283
2284	if (!STAILQ_EMPTY(&wrq->wr_list)) {
2285slowpath:
2286		EQ_UNLOCK(eq);
2287		wr = alloc_wrqe(len16 * 16, wrq);
2288		if (__predict_false(wr == NULL))
2289			return (NULL);
2290		cookie->pidx = -1;
2291		cookie->ndesc = ndesc;
2292		return (&wr->wr);
2293	}
2294
2295	eq->cidx = read_hw_cidx(eq);
2296	if (eq->pidx == eq->cidx)
2297		available = eq->sidx - 1;
2298	else
2299		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2300	if (available < ndesc)
2301		goto slowpath;
2302
2303	cookie->pidx = eq->pidx;
2304	cookie->ndesc = ndesc;
2305	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
2306
2307	w = &eq->desc[eq->pidx];
2308	IDXINCR(eq->pidx, ndesc, eq->sidx);
2309	if (__predict_false(eq->pidx < ndesc - 1)) {
2310		w = &wrq->ss[0];
2311		wrq->ss_pidx = cookie->pidx;
2312		wrq->ss_len = len16 * 16;
2313	}
2314
2315	EQ_UNLOCK(eq);
2316
2317	return (w);
2318}
2319
2320void
2321commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
2322{
2323	struct sge_eq *eq = &wrq->eq;
2324	struct adapter *sc = wrq->adapter;
2325	int ndesc, pidx;
2326	struct wrq_cookie *prev, *next;
2327
2328	if (cookie->pidx == -1) {
2329		struct wrqe *wr = __containerof(w, struct wrqe, wr);
2330
2331		t4_wrq_tx(sc, wr);
2332		return;
2333	}
2334
2335	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
2336	pidx = cookie->pidx;
2337	MPASS(pidx >= 0 && pidx < eq->sidx);
2338	if (__predict_false(w == &wrq->ss[0])) {
2339		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
2340
2341		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
2342		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
2343		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
2344		wrq->tx_wrs_ss++;
2345	} else
2346		wrq->tx_wrs_direct++;
2347
2348	EQ_LOCK(eq);
2349	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
2350	next = TAILQ_NEXT(cookie, link);
2351	if (prev == NULL) {
2352		MPASS(pidx == eq->dbidx);
2353		if (next == NULL || ndesc >= 16)
2354			ring_eq_db(wrq->adapter, eq, ndesc);
2355		else {
2356			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
2357			next->pidx = pidx;
2358			next->ndesc += ndesc;
2359		}
2360	} else {
2361		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
2362		prev->ndesc += ndesc;
2363	}
2364	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
2365
2366	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2367		drain_wrq_wr_list(sc, wrq);
2368
2369#ifdef INVARIANTS
2370	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
2371		/* Doorbell must have caught up to the pidx. */
2372		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
2373	}
2374#endif
2375	EQ_UNLOCK(eq);
2376}
2377
2378static u_int
2379can_resume_eth_tx(struct mp_ring *r)
2380{
2381	struct sge_eq *eq = r->cookie;
2382
2383	return (total_available_tx_desc(eq) > eq->sidx / 8);
2384}
2385
2386static inline int
2387cannot_use_txpkts(struct mbuf *m)
2388{
2389	/* maybe put a GL limit too, to avoid silliness? */
2390
2391	return (needs_tso(m));
2392}
2393
2394/*
2395 * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
2396 * be consumed.  Return the actual number consumed.  0 indicates a stall.
2397 */
2398static u_int
2399eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
2400{
2401	struct sge_txq *txq = r->cookie;
2402	struct sge_eq *eq = &txq->eq;
2403	struct ifnet *ifp = txq->ifp;
2404	struct vi_info *vi = ifp->if_softc;
2405	struct port_info *pi = vi->pi;
2406	struct adapter *sc = pi->adapter;
2407	u_int total, remaining;		/* # of packets */
2408	u_int available, dbdiff;	/* # of hardware descriptors */
2409	u_int n, next_cidx;
2410	struct mbuf *m0, *tail;
2411	struct txpkts txp;
2412	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
2413
2414	remaining = IDXDIFF(pidx, cidx, r->size);
2415	MPASS(remaining > 0);	/* Must not be called without work to do. */
2416	total = 0;
2417
2418	TXQ_LOCK(txq);
2419	if (__predict_false((eq->flags & EQ_ENABLED) == 0)) {
2420		while (cidx != pidx) {
2421			m0 = r->items[cidx];
2422			m_freem(m0);
2423			if (++cidx == r->size)
2424				cidx = 0;
2425		}
2426		reclaim_tx_descs(txq, 2048);
2427		total = remaining;
2428		goto done;
2429	}
2430
2431	/* How many hardware descriptors do we have readily available. */
2432	if (eq->pidx == eq->cidx)
2433		available = eq->sidx - 1;
2434	else
2435		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2436	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
2437
2438	while (remaining > 0) {
2439
2440		m0 = r->items[cidx];
2441		M_ASSERTPKTHDR(m0);
2442		MPASS(m0->m_nextpkt == NULL);
2443
2444		if (available < SGE_MAX_WR_NDESC) {
2445			available += reclaim_tx_descs(txq, 64);
2446			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
2447				break;	/* out of descriptors */
2448		}
2449
2450		next_cidx = cidx + 1;
2451		if (__predict_false(next_cidx == r->size))
2452			next_cidx = 0;
2453
2454		wr = (void *)&eq->desc[eq->pidx];
2455		if (sc->flags & IS_VF) {
2456			total++;
2457			remaining--;
2458			ETHER_BPF_MTAP(ifp, m0);
2459			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
2460			    available);
2461		} else if (remaining > 1 &&
2462		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
2463
2464			/* pkts at cidx, next_cidx should both be in txp. */
2465			MPASS(txp.npkt == 2);
2466			tail = r->items[next_cidx];
2467			MPASS(tail->m_nextpkt == NULL);
2468			ETHER_BPF_MTAP(ifp, m0);
2469			ETHER_BPF_MTAP(ifp, tail);
2470			m0->m_nextpkt = tail;
2471
2472			if (__predict_false(++next_cidx == r->size))
2473				next_cidx = 0;
2474
2475			while (next_cidx != pidx) {
2476				if (add_to_txpkts(r->items[next_cidx], &txp,
2477				    available) != 0)
2478					break;
2479				tail->m_nextpkt = r->items[next_cidx];
2480				tail = tail->m_nextpkt;
2481				ETHER_BPF_MTAP(ifp, tail);
2482				if (__predict_false(++next_cidx == r->size))
2483					next_cidx = 0;
2484			}
2485
2486			n = write_txpkts_wr(txq, wr, m0, &txp, available);
2487			total += txp.npkt;
2488			remaining -= txp.npkt;
2489		} else {
2490			total++;
2491			remaining--;
2492			ETHER_BPF_MTAP(ifp, m0);
2493			n = write_txpkt_wr(txq, (void *)wr, m0, available);
2494		}
2495		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
2496
2497		available -= n;
2498		dbdiff += n;
2499		IDXINCR(eq->pidx, n, eq->sidx);
2500
2501		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
2502		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
2503			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2504			    F_FW_WR_EQUEQ);
2505			eq->equeqidx = eq->pidx;
2506		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
2507			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
2508			eq->equeqidx = eq->pidx;
2509		}
2510
2511		if (dbdiff >= 16 && remaining >= 4) {
2512			ring_eq_db(sc, eq, dbdiff);
2513			available += reclaim_tx_descs(txq, 4 * dbdiff);
2514			dbdiff = 0;
2515		}
2516
2517		cidx = next_cidx;
2518	}
2519	if (dbdiff != 0) {
2520		ring_eq_db(sc, eq, dbdiff);
2521		reclaim_tx_descs(txq, 32);
2522	}
2523done:
2524	TXQ_UNLOCK(txq);
2525
2526	return (total);
2527}
2528
2529static inline void
2530init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
2531    int qsize)
2532{
2533
2534	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
2535	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
2536	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
2537	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
2538
2539	iq->flags = 0;
2540	iq->adapter = sc;
2541	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
2542	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
2543	if (pktc_idx >= 0) {
2544		iq->intr_params |= F_QINTR_CNT_EN;
2545		iq->intr_pktc_idx = pktc_idx;
2546	}
2547	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
2548	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
2549}
2550
2551static inline void
2552init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
2553{
2554
2555	fl->qsize = qsize;
2556	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2557	strlcpy(fl->lockname, name, sizeof(fl->lockname));
2558	if (sc->flags & BUF_PACKING_OK &&
2559	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
2560	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
2561		fl->flags |= FL_BUF_PACKING;
2562	find_best_refill_source(sc, fl, maxp);
2563	find_safe_refill_source(sc, fl);
2564}
2565
2566static inline void
2567init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
2568    uint8_t tx_chan, uint16_t iqid, char *name)
2569{
2570	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
2571
2572	eq->flags = eqtype & EQ_TYPEMASK;
2573	eq->tx_chan = tx_chan;
2574	eq->iqid = iqid;
2575	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2576	strlcpy(eq->lockname, name, sizeof(eq->lockname));
2577}
2578
2579static int
2580alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
2581    bus_dmamap_t *map, bus_addr_t *pa, void **va)
2582{
2583	int rc;
2584
2585	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
2586	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
2587	if (rc != 0) {
2588		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
2589		goto done;
2590	}
2591
2592	rc = bus_dmamem_alloc(*tag, va,
2593	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
2594	if (rc != 0) {
2595		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
2596		goto done;
2597	}
2598
2599	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
2600	if (rc != 0) {
2601		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
2602		goto done;
2603	}
2604done:
2605	if (rc)
2606		free_ring(sc, *tag, *map, *pa, *va);
2607
2608	return (rc);
2609}
2610
2611static int
2612free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
2613    bus_addr_t pa, void *va)
2614{
2615	if (pa)
2616		bus_dmamap_unload(tag, map);
2617	if (va)
2618		bus_dmamem_free(tag, va, map);
2619	if (tag)
2620		bus_dma_tag_destroy(tag);
2621
2622	return (0);
2623}
2624
2625/*
2626 * Allocates the ring for an ingress queue and an optional freelist.  If the
2627 * freelist is specified it will be allocated and then associated with the
2628 * ingress queue.
2629 *
2630 * Returns errno on failure.  Resources allocated up to that point may still be
2631 * allocated.  Caller is responsible for cleanup in case this function fails.
2632 *
2633 * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
2634 * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
2635 * the abs_id of the ingress queue to which its interrupts should be forwarded.
2636 */
2637static int
2638alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
2639    int intr_idx, int cong)
2640{
2641	int rc, i, cntxt_id;
2642	size_t len;
2643	struct fw_iq_cmd c;
2644	struct port_info *pi = vi->pi;
2645	struct adapter *sc = iq->adapter;
2646	struct sge_params *sp = &sc->params.sge;
2647	__be32 v = 0;
2648
2649	len = iq->qsize * IQ_ESIZE;
2650	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
2651	    (void **)&iq->desc);
2652	if (rc != 0)
2653		return (rc);
2654
2655	bzero(&c, sizeof(c));
2656	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
2657	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
2658	    V_FW_IQ_CMD_VFN(0));
2659
2660	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
2661	    FW_LEN16(c));
2662
2663	/* Special handling for firmware event queue */
2664	if (iq == &sc->sge.fwq)
2665		v |= F_FW_IQ_CMD_IQASYNCH;
2666
2667	if (iq->flags & IQ_INTR) {
2668		KASSERT(intr_idx < sc->intr_count,
2669		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
2670	} else
2671		v |= F_FW_IQ_CMD_IQANDST;
2672	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
2673
2674	c.type_to_iqandstindex = htobe32(v |
2675	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
2676	    V_FW_IQ_CMD_VIID(vi->viid) |
2677	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
2678	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
2679	    F_FW_IQ_CMD_IQGTSMODE |
2680	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
2681	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
2682	c.iqsize = htobe16(iq->qsize);
2683	c.iqaddr = htobe64(iq->ba);
2684	if (cong >= 0)
2685		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
2686
2687	if (fl) {
2688		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
2689
2690		len = fl->qsize * EQ_ESIZE;
2691		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
2692		    &fl->ba, (void **)&fl->desc);
2693		if (rc)
2694			return (rc);
2695
2696		/* Allocate space for one software descriptor per buffer. */
2697		rc = alloc_fl_sdesc(fl);
2698		if (rc != 0) {
2699			device_printf(sc->dev,
2700			    "failed to setup fl software descriptors: %d\n",
2701			    rc);
2702			return (rc);
2703		}
2704
2705		if (fl->flags & FL_BUF_PACKING) {
2706			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
2707			fl->buf_boundary = sp->pack_boundary;
2708		} else {
2709			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
2710			fl->buf_boundary = 16;
2711		}
2712		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
2713			fl->buf_boundary = sp->pad_boundary;
2714
2715		c.iqns_to_fl0congen |=
2716		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
2717			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
2718			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
2719			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
2720			    0));
2721		if (cong >= 0) {
2722			c.iqns_to_fl0congen |=
2723				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
2724				    F_FW_IQ_CMD_FL0CONGCIF |
2725				    F_FW_IQ_CMD_FL0CONGEN);
2726		}
2727		c.fl0dcaen_to_fl0cidxfthresh =
2728		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
2729			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
2730			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
2731			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
2732		c.fl0size = htobe16(fl->qsize);
2733		c.fl0addr = htobe64(fl->ba);
2734	}
2735
2736	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2737	if (rc != 0) {
2738		device_printf(sc->dev,
2739		    "failed to create ingress queue: %d\n", rc);
2740		return (rc);
2741	}
2742
2743	iq->cidx = 0;
2744	iq->gen = F_RSPD_GEN;
2745	iq->intr_next = iq->intr_params;
2746	iq->cntxt_id = be16toh(c.iqid);
2747	iq->abs_id = be16toh(c.physiqid);
2748	iq->flags |= IQ_ALLOCATED;
2749
2750	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
2751	if (cntxt_id >= sc->sge.niq) {
2752		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
2753		    cntxt_id, sc->sge.niq - 1);
2754	}
2755	sc->sge.iqmap[cntxt_id] = iq;
2756
2757	if (fl) {
2758		u_int qid;
2759
2760		iq->flags |= IQ_HAS_FL;
2761		fl->cntxt_id = be16toh(c.fl0id);
2762		fl->pidx = fl->cidx = 0;
2763
2764		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
2765		if (cntxt_id >= sc->sge.neq) {
2766			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
2767			    __func__, cntxt_id, sc->sge.neq - 1);
2768		}
2769		sc->sge.eqmap[cntxt_id] = (void *)fl;
2770
2771		qid = fl->cntxt_id;
2772		if (isset(&sc->doorbells, DOORBELL_UDB)) {
2773			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
2774			uint32_t mask = (1 << s_qpp) - 1;
2775			volatile uint8_t *udb;
2776
2777			udb = sc->udbs_base + UDBS_DB_OFFSET;
2778			udb += (qid >> s_qpp) << PAGE_SHIFT;
2779			qid &= mask;
2780			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
2781				udb += qid << UDBS_SEG_SHIFT;
2782				qid = 0;
2783			}
2784			fl->udb = (volatile void *)udb;
2785		}
2786		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
2787
2788		FL_LOCK(fl);
2789		/* Enough to make sure the SGE doesn't think it's starved */
2790		refill_fl(sc, fl, fl->lowat);
2791		FL_UNLOCK(fl);
2792	}
2793
2794	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
2795		uint32_t param, val;
2796
2797		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
2798		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
2799		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
2800		if (cong == 0)
2801			val = 1 << 19;
2802		else {
2803			val = 2 << 19;
2804			for (i = 0; i < 4; i++) {
2805				if (cong & (1 << i))
2806					val |= 1 << (i << 2);
2807			}
2808		}
2809
2810		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
2811		if (rc != 0) {
2812			/* report error but carry on */
2813			device_printf(sc->dev,
2814			    "failed to set congestion manager context for "
2815			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
2816		}
2817	}
2818
2819	/* Enable IQ interrupts */
2820	atomic_store_rel_int(&iq->state, IQS_IDLE);
2821	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
2822	    V_INGRESSQID(iq->cntxt_id));
2823
2824	return (0);
2825}
2826
2827static int
2828free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
2829{
2830	int rc;
2831	struct adapter *sc = iq->adapter;
2832	device_t dev;
2833
2834	if (sc == NULL)
2835		return (0);	/* nothing to do */
2836
2837	dev = vi ? vi->dev : sc->dev;
2838
2839	if (iq->flags & IQ_ALLOCATED) {
2840		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
2841		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
2842		    fl ? fl->cntxt_id : 0xffff, 0xffff);
2843		if (rc != 0) {
2844			device_printf(dev,
2845			    "failed to free queue %p: %d\n", iq, rc);
2846			return (rc);
2847		}
2848		iq->flags &= ~IQ_ALLOCATED;
2849	}
2850
2851	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
2852
2853	bzero(iq, sizeof(*iq));
2854
2855	if (fl) {
2856		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
2857		    fl->desc);
2858
2859		if (fl->sdesc)
2860			free_fl_sdesc(sc, fl);
2861
2862		if (mtx_initialized(&fl->fl_lock))
2863			mtx_destroy(&fl->fl_lock);
2864
2865		bzero(fl, sizeof(*fl));
2866	}
2867
2868	return (0);
2869}
2870
2871static void
2872add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
2873    struct sge_fl *fl)
2874{
2875	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2876
2877	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
2878	    "freelist");
2879	children = SYSCTL_CHILDREN(oid);
2880
2881	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
2882	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
2883	    "SGE context id of the freelist");
2884	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
2885	    fl_pad ? 1 : 0, "padding enabled");
2886	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
2887	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
2888	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
2889	    0, "consumer index");
2890	if (fl->flags & FL_BUF_PACKING) {
2891		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
2892		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
2893	}
2894	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
2895	    0, "producer index");
2896	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
2897	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
2898	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
2899	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
2900	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
2901	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
2902	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
2903	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
2904	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
2905	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
2906}
2907
2908static int
2909alloc_fwq(struct adapter *sc)
2910{
2911	int rc, intr_idx;
2912	struct sge_iq *fwq = &sc->sge.fwq;
2913	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2914	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2915
2916	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
2917	fwq->flags |= IQ_INTR;	/* always */
2918	if (sc->flags & IS_VF)
2919		intr_idx = 0;
2920	else {
2921		intr_idx = sc->intr_count > 1 ? 1 : 0;
2922		fwq->set_tcb_rpl = t4_filter_rpl;
2923		fwq->l2t_write_rpl = do_l2t_write_rpl;
2924	}
2925	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
2926	if (rc != 0) {
2927		device_printf(sc->dev,
2928		    "failed to create firmware event queue: %d\n", rc);
2929		return (rc);
2930	}
2931
2932	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
2933	    NULL, "firmware event queue");
2934	children = SYSCTL_CHILDREN(oid);
2935
2936	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
2937	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
2938	    "absolute id of the queue");
2939	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
2940	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
2941	    "SGE context id of the queue");
2942	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
2943	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
2944	    "consumer index");
2945
2946	return (0);
2947}
2948
2949static int
2950free_fwq(struct adapter *sc)
2951{
2952	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
2953}
2954
2955static int
2956alloc_mgmtq(struct adapter *sc)
2957{
2958	int rc;
2959	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
2960	char name[16];
2961	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2962	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2963
2964	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
2965	    NULL, "management queue");
2966
2967	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
2968	init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
2969	    sc->sge.fwq.cntxt_id, name);
2970	rc = alloc_wrq(sc, NULL, mgmtq, oid);
2971	if (rc != 0) {
2972		device_printf(sc->dev,
2973		    "failed to create management queue: %d\n", rc);
2974		return (rc);
2975	}
2976
2977	return (0);
2978}
2979
2980static int
2981free_mgmtq(struct adapter *sc)
2982{
2983
2984	return free_wrq(sc, &sc->sge.mgmtq);
2985}
2986
2987int
2988tnl_cong(struct port_info *pi, int drop)
2989{
2990
2991	if (drop == -1)
2992		return (-1);
2993	else if (drop == 1)
2994		return (0);
2995	else
2996		return (pi->rx_chan_map);
2997}
2998
2999static int
3000alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
3001    struct sysctl_oid *oid)
3002{
3003	int rc;
3004	struct adapter *sc = vi->pi->adapter;
3005	struct sysctl_oid_list *children;
3006	char name[16];
3007
3008	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
3009	    tnl_cong(vi->pi, cong_drop));
3010	if (rc != 0)
3011		return (rc);
3012
3013	if (idx == 0)
3014		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
3015	else
3016		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
3017		    ("iq_base mismatch"));
3018	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
3019	    ("PF with non-zero iq_base"));
3020
3021	/*
3022	 * The freelist is just barely above the starvation threshold right now,
3023	 * fill it up a bit more.
3024	 */
3025	FL_LOCK(&rxq->fl);
3026	refill_fl(sc, &rxq->fl, 128);
3027	FL_UNLOCK(&rxq->fl);
3028
3029#if defined(INET) || defined(INET6)
3030	rc = tcp_lro_init(&rxq->lro);
3031	if (rc != 0)
3032		return (rc);
3033	rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */
3034
3035	if (vi->ifp->if_capenable & IFCAP_LRO)
3036		rxq->iq.flags |= IQ_LRO_ENABLED;
3037#endif
3038	rxq->ifp = vi->ifp;
3039
3040	children = SYSCTL_CHILDREN(oid);
3041
3042	snprintf(name, sizeof(name), "%d", idx);
3043	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3044	    NULL, "rx queue");
3045	children = SYSCTL_CHILDREN(oid);
3046
3047	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
3048	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
3049	    "absolute id of the queue");
3050	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
3051	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
3052	    "SGE context id of the queue");
3053	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3054	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
3055	    "consumer index");
3056#if defined(INET) || defined(INET6)
3057	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
3058	    &rxq->lro.lro_queued, 0, NULL);
3059	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
3060	    &rxq->lro.lro_flushed, 0, NULL);
3061#endif
3062	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
3063	    &rxq->rxcsum, "# of times hardware assisted with checksum");
3064	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
3065	    CTLFLAG_RD, &rxq->vlan_extraction,
3066	    "# of times hardware extracted 802.1Q tag");
3067
3068	add_fl_sysctls(&vi->ctx, oid, &rxq->fl);
3069
3070	return (rc);
3071}
3072
3073static int
3074free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
3075{
3076	int rc;
3077
3078#if defined(INET) || defined(INET6)
3079	if (rxq->lro.ifp) {
3080		tcp_lro_free(&rxq->lro);
3081		rxq->lro.ifp = NULL;
3082	}
3083#endif
3084
3085	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
3086	if (rc == 0)
3087		bzero(rxq, sizeof(*rxq));
3088
3089	return (rc);
3090}
3091
3092#ifdef TCP_OFFLOAD
3093static int
3094alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
3095    int intr_idx, int idx, struct sysctl_oid *oid)
3096{
3097	int rc;
3098	struct sysctl_oid_list *children;
3099	char name[16];
3100
3101	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx,
3102	    vi->pi->rx_chan_map);
3103	if (rc != 0)
3104		return (rc);
3105
3106	children = SYSCTL_CHILDREN(oid);
3107
3108	snprintf(name, sizeof(name), "%d", idx);
3109	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3110	    NULL, "rx queue");
3111	children = SYSCTL_CHILDREN(oid);
3112
3113	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
3114	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
3115	    "I", "absolute id of the queue");
3116	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
3117	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
3118	    "I", "SGE context id of the queue");
3119	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3120	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
3121	    "consumer index");
3122
3123	add_fl_sysctls(&vi->ctx, oid, &ofld_rxq->fl);
3124
3125	return (rc);
3126}
3127
3128static int
3129free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
3130{
3131	int rc;
3132
3133	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
3134	if (rc == 0)
3135		bzero(ofld_rxq, sizeof(*ofld_rxq));
3136
3137	return (rc);
3138}
3139#endif
3140
3141#ifdef DEV_NETMAP
3142static int
3143alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
3144    int idx, struct sysctl_oid *oid)
3145{
3146	int rc;
3147	struct sysctl_oid_list *children;
3148	struct sysctl_ctx_list *ctx;
3149	char name[16];
3150	size_t len;
3151	struct adapter *sc = vi->pi->adapter;
3152	struct netmap_adapter *na = NA(vi->ifp);
3153
3154	MPASS(na != NULL);
3155
3156	len = vi->qsize_rxq * IQ_ESIZE;
3157	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
3158	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
3159	if (rc != 0)
3160		return (rc);
3161
3162	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3163	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
3164	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
3165	if (rc != 0)
3166		return (rc);
3167
3168	nm_rxq->vi = vi;
3169	nm_rxq->nid = idx;
3170	nm_rxq->iq_cidx = 0;
3171	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
3172	nm_rxq->iq_gen = F_RSPD_GEN;
3173	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
3174	nm_rxq->fl_sidx = na->num_rx_desc;
3175	nm_rxq->intr_idx = intr_idx;
3176
3177	ctx = &vi->ctx;
3178	children = SYSCTL_CHILDREN(oid);
3179
3180	snprintf(name, sizeof(name), "%d", idx);
3181	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
3182	    "rx queue");
3183	children = SYSCTL_CHILDREN(oid);
3184
3185	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
3186	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
3187	    "I", "absolute id of the queue");
3188	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3189	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
3190	    "I", "SGE context id of the queue");
3191	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3192	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
3193	    "consumer index");
3194
3195	children = SYSCTL_CHILDREN(oid);
3196	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
3197	    "freelist");
3198	children = SYSCTL_CHILDREN(oid);
3199
3200	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3201	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
3202	    "I", "SGE context id of the freelist");
3203	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
3204	    &nm_rxq->fl_cidx, 0, "consumer index");
3205	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
3206	    &nm_rxq->fl_pidx, 0, "producer index");
3207
3208	return (rc);
3209}
3210
3211
3212static int
3213free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
3214{
3215	struct adapter *sc = vi->pi->adapter;
3216
3217	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
3218	    nm_rxq->iq_desc);
3219	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
3220	    nm_rxq->fl_desc);
3221
3222	return (0);
3223}
3224
3225static int
3226alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
3227    struct sysctl_oid *oid)
3228{
3229	int rc;
3230	size_t len;
3231	struct port_info *pi = vi->pi;
3232	struct adapter *sc = pi->adapter;
3233	struct netmap_adapter *na = NA(vi->ifp);
3234	char name[16];
3235	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3236
3237	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3238	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
3239	    &nm_txq->ba, (void **)&nm_txq->desc);
3240	if (rc)
3241		return (rc);
3242
3243	nm_txq->pidx = nm_txq->cidx = 0;
3244	nm_txq->sidx = na->num_tx_desc;
3245	nm_txq->nid = idx;
3246	nm_txq->iqidx = iqidx;
3247	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3248	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
3249	    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
3250	    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
3251
3252	snprintf(name, sizeof(name), "%d", idx);
3253	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3254	    NULL, "netmap tx queue");
3255	children = SYSCTL_CHILDREN(oid);
3256
3257	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3258	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
3259	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3260	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
3261	    "consumer index");
3262	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3263	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
3264	    "producer index");
3265
3266	return (rc);
3267}
3268
3269static int
3270free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
3271{
3272	struct adapter *sc = vi->pi->adapter;
3273
3274	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
3275	    nm_txq->desc);
3276
3277	return (0);
3278}
3279#endif
3280
3281static int
3282ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
3283{
3284	int rc, cntxt_id;
3285	struct fw_eq_ctrl_cmd c;
3286	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3287
3288	bzero(&c, sizeof(c));
3289
3290	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
3291	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
3292	    V_FW_EQ_CTRL_CMD_VFN(0));
3293	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
3294	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
3295	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
3296	c.physeqid_pkd = htobe32(0);
3297	c.fetchszm_to_iqid =
3298	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3299		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
3300		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
3301	c.dcaen_to_eqsize =
3302	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3303		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3304		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
3305	c.eqaddr = htobe64(eq->ba);
3306
3307	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3308	if (rc != 0) {
3309		device_printf(sc->dev,
3310		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
3311		return (rc);
3312	}
3313	eq->flags |= EQ_ALLOCATED;
3314
3315	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
3316	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3317	if (cntxt_id >= sc->sge.neq)
3318	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3319		cntxt_id, sc->sge.neq - 1);
3320	sc->sge.eqmap[cntxt_id] = eq;
3321
3322	return (rc);
3323}
3324
3325static int
3326eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3327{
3328	int rc, cntxt_id;
3329	struct fw_eq_eth_cmd c;
3330	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3331
3332	bzero(&c, sizeof(c));
3333
3334	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
3335	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
3336	    V_FW_EQ_ETH_CMD_VFN(0));
3337	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
3338	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
3339	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
3340	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
3341	c.fetchszm_to_iqid =
3342	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3343		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
3344		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
3345	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3346	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3347	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
3348	c.eqaddr = htobe64(eq->ba);
3349
3350	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3351	if (rc != 0) {
3352		device_printf(vi->dev,
3353		    "failed to create Ethernet egress queue: %d\n", rc);
3354		return (rc);
3355	}
3356	eq->flags |= EQ_ALLOCATED;
3357
3358	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
3359	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
3360	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3361	if (cntxt_id >= sc->sge.neq)
3362	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3363		cntxt_id, sc->sge.neq - 1);
3364	sc->sge.eqmap[cntxt_id] = eq;
3365
3366	return (rc);
3367}
3368
3369#ifdef TCP_OFFLOAD
3370static int
3371ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3372{
3373	int rc, cntxt_id;
3374	struct fw_eq_ofld_cmd c;
3375	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3376
3377	bzero(&c, sizeof(c));
3378
3379	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
3380	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
3381	    V_FW_EQ_OFLD_CMD_VFN(0));
3382	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
3383	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
3384	c.fetchszm_to_iqid =
3385		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3386		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
3387		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
3388	c.dcaen_to_eqsize =
3389	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3390		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3391		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
3392	c.eqaddr = htobe64(eq->ba);
3393
3394	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3395	if (rc != 0) {
3396		device_printf(vi->dev,
3397		    "failed to create egress queue for TCP offload: %d\n", rc);
3398		return (rc);
3399	}
3400	eq->flags |= EQ_ALLOCATED;
3401
3402	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
3403	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3404	if (cntxt_id >= sc->sge.neq)
3405	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3406		cntxt_id, sc->sge.neq - 1);
3407	sc->sge.eqmap[cntxt_id] = eq;
3408
3409	return (rc);
3410}
3411#endif
3412
3413static int
3414alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3415{
3416	int rc, qsize;
3417	size_t len;
3418
3419	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
3420
3421	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3422	len = qsize * EQ_ESIZE;
3423	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
3424	    &eq->ba, (void **)&eq->desc);
3425	if (rc)
3426		return (rc);
3427
3428	eq->pidx = eq->cidx = 0;
3429	eq->equeqidx = eq->dbidx = 0;
3430	eq->doorbells = sc->doorbells;
3431
3432	switch (eq->flags & EQ_TYPEMASK) {
3433	case EQ_CTRL:
3434		rc = ctrl_eq_alloc(sc, eq);
3435		break;
3436
3437	case EQ_ETH:
3438		rc = eth_eq_alloc(sc, vi, eq);
3439		break;
3440
3441#ifdef TCP_OFFLOAD
3442	case EQ_OFLD:
3443		rc = ofld_eq_alloc(sc, vi, eq);
3444		break;
3445#endif
3446
3447	default:
3448		panic("%s: invalid eq type %d.", __func__,
3449		    eq->flags & EQ_TYPEMASK);
3450	}
3451	if (rc != 0) {
3452		device_printf(sc->dev,
3453		    "failed to allocate egress queue(%d): %d\n",
3454		    eq->flags & EQ_TYPEMASK, rc);
3455	}
3456
3457	if (isset(&eq->doorbells, DOORBELL_UDB) ||
3458	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
3459	    isset(&eq->doorbells, DOORBELL_WCWR)) {
3460		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
3461		uint32_t mask = (1 << s_qpp) - 1;
3462		volatile uint8_t *udb;
3463
3464		udb = sc->udbs_base + UDBS_DB_OFFSET;
3465		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
3466		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
3467		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
3468	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
3469		else {
3470			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
3471			eq->udb_qid = 0;
3472		}
3473		eq->udb = (volatile void *)udb;
3474	}
3475
3476	return (rc);
3477}
3478
3479static int
3480free_eq(struct adapter *sc, struct sge_eq *eq)
3481{
3482	int rc;
3483
3484	if (eq->flags & EQ_ALLOCATED) {
3485		switch (eq->flags & EQ_TYPEMASK) {
3486		case EQ_CTRL:
3487			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
3488			    eq->cntxt_id);
3489			break;
3490
3491		case EQ_ETH:
3492			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
3493			    eq->cntxt_id);
3494			break;
3495
3496#ifdef TCP_OFFLOAD
3497		case EQ_OFLD:
3498			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
3499			    eq->cntxt_id);
3500			break;
3501#endif
3502
3503		default:
3504			panic("%s: invalid eq type %d.", __func__,
3505			    eq->flags & EQ_TYPEMASK);
3506		}
3507		if (rc != 0) {
3508			device_printf(sc->dev,
3509			    "failed to free egress queue (%d): %d\n",
3510			    eq->flags & EQ_TYPEMASK, rc);
3511			return (rc);
3512		}
3513		eq->flags &= ~EQ_ALLOCATED;
3514	}
3515
3516	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
3517
3518	if (mtx_initialized(&eq->eq_lock))
3519		mtx_destroy(&eq->eq_lock);
3520
3521	bzero(eq, sizeof(*eq));
3522	return (0);
3523}
3524
3525static int
3526alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
3527    struct sysctl_oid *oid)
3528{
3529	int rc;
3530	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
3531	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3532
3533	rc = alloc_eq(sc, vi, &wrq->eq);
3534	if (rc)
3535		return (rc);
3536
3537	wrq->adapter = sc;
3538	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
3539	TAILQ_INIT(&wrq->incomplete_wrs);
3540	STAILQ_INIT(&wrq->wr_list);
3541	wrq->nwr_pending = 0;
3542	wrq->ndesc_needed = 0;
3543
3544	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3545	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
3546	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3547	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
3548	    "consumer index");
3549	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
3550	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
3551	    "producer index");
3552	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
3553	    &wrq->tx_wrs_direct, "# of work requests (direct)");
3554	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
3555	    &wrq->tx_wrs_copied, "# of work requests (copied)");
3556	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
3557	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
3558
3559	return (rc);
3560}
3561
3562static int
3563free_wrq(struct adapter *sc, struct sge_wrq *wrq)
3564{
3565	int rc;
3566
3567	rc = free_eq(sc, &wrq->eq);
3568	if (rc)
3569		return (rc);
3570
3571	bzero(wrq, sizeof(*wrq));
3572	return (0);
3573}
3574
3575static int
3576alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
3577    struct sysctl_oid *oid)
3578{
3579	int rc;
3580	struct port_info *pi = vi->pi;
3581	struct adapter *sc = pi->adapter;
3582	struct sge_eq *eq = &txq->eq;
3583	char name[16];
3584	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3585
3586	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
3587	    M_CXGBE, M_WAITOK);
3588	if (rc != 0) {
3589		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
3590		return (rc);
3591	}
3592
3593	rc = alloc_eq(sc, vi, eq);
3594	if (rc != 0) {
3595		mp_ring_free(txq->r);
3596		txq->r = NULL;
3597		return (rc);
3598	}
3599
3600	/* Can't fail after this point. */
3601
3602	if (idx == 0)
3603		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
3604	else
3605		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
3606		    ("eq_base mismatch"));
3607	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
3608	    ("PF with non-zero eq_base"));
3609
3610	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
3611	txq->ifp = vi->ifp;
3612	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
3613	if (sc->flags & IS_VF)
3614		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
3615		    V_TXPKT_INTF(pi->tx_chan));
3616	else
3617		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3618		    V_TXPKT_INTF(pi->tx_chan) |
3619		    V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
3620		    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
3621		    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
3622	txq->tc_idx = -1;
3623	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
3624	    M_ZERO | M_WAITOK);
3625
3626	snprintf(name, sizeof(name), "%d", idx);
3627	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3628	    NULL, "tx queue");
3629	children = SYSCTL_CHILDREN(oid);
3630
3631	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
3632	    &eq->abs_id, 0, "absolute id of the queue");
3633	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3634	    &eq->cntxt_id, 0, "SGE context id of the queue");
3635	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3636	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
3637	    "consumer index");
3638	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3639	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
3640	    "producer index");
3641
3642	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
3643	    CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I",
3644	    "traffic class (-1 means none)");
3645
3646	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
3647	    &txq->txcsum, "# of times hardware assisted with checksum");
3648	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
3649	    CTLFLAG_RD, &txq->vlan_insertion,
3650	    "# of times hardware inserted 802.1Q tag");
3651	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
3652	    &txq->tso_wrs, "# of TSO work requests");
3653	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
3654	    &txq->imm_wrs, "# of work requests with immediate data");
3655	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
3656	    &txq->sgl_wrs, "# of work requests with direct SGL");
3657	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
3658	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
3659	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
3660	    CTLFLAG_RD, &txq->txpkts0_wrs,
3661	    "# of txpkts (type 0) work requests");
3662	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
3663	    CTLFLAG_RD, &txq->txpkts1_wrs,
3664	    "# of txpkts (type 1) work requests");
3665	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
3666	    CTLFLAG_RD, &txq->txpkts0_pkts,
3667	    "# of frames tx'd using type0 txpkts work requests");
3668	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
3669	    CTLFLAG_RD, &txq->txpkts1_pkts,
3670	    "# of frames tx'd using type1 txpkts work requests");
3671
3672	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
3673	    CTLFLAG_RD, &txq->r->enqueues,
3674	    "# of enqueues to the mp_ring for this queue");
3675	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
3676	    CTLFLAG_RD, &txq->r->drops,
3677	    "# of drops in the mp_ring for this queue");
3678	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
3679	    CTLFLAG_RD, &txq->r->starts,
3680	    "# of normal consumer starts in the mp_ring for this queue");
3681	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
3682	    CTLFLAG_RD, &txq->r->stalls,
3683	    "# of consumer stalls in the mp_ring for this queue");
3684	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
3685	    CTLFLAG_RD, &txq->r->restarts,
3686	    "# of consumer restarts in the mp_ring for this queue");
3687	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
3688	    CTLFLAG_RD, &txq->r->abdications,
3689	    "# of consumer abdications in the mp_ring for this queue");
3690
3691	return (0);
3692}
3693
3694static int
3695free_txq(struct vi_info *vi, struct sge_txq *txq)
3696{
3697	int rc;
3698	struct adapter *sc = vi->pi->adapter;
3699	struct sge_eq *eq = &txq->eq;
3700
3701	rc = free_eq(sc, eq);
3702	if (rc)
3703		return (rc);
3704
3705	sglist_free(txq->gl);
3706	free(txq->sdesc, M_CXGBE);
3707	mp_ring_free(txq->r);
3708
3709	bzero(txq, sizeof(*txq));
3710	return (0);
3711}
3712
3713static void
3714oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
3715{
3716	bus_addr_t *ba = arg;
3717
3718	KASSERT(nseg == 1,
3719	    ("%s meant for single segment mappings only.", __func__));
3720
3721	*ba = error ? 0 : segs->ds_addr;
3722}
3723
3724static inline void
3725ring_fl_db(struct adapter *sc, struct sge_fl *fl)
3726{
3727	uint32_t n, v;
3728
3729	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
3730	MPASS(n > 0);
3731
3732	wmb();
3733	v = fl->dbval | V_PIDX(n);
3734	if (fl->udb)
3735		*fl->udb = htole32(v);
3736	else
3737		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
3738	IDXINCR(fl->dbidx, n, fl->sidx);
3739}
3740
3741/*
3742 * Fills up the freelist by allocating upto 'n' buffers.  Buffers that are
3743 * recycled do not count towards this allocation budget.
3744 *
3745 * Returns non-zero to indicate that this freelist should be added to the list
3746 * of starving freelists.
3747 */
3748static int
3749refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
3750{
3751	__be64 *d;
3752	struct fl_sdesc *sd;
3753	uintptr_t pa;
3754	caddr_t cl;
3755	struct cluster_layout *cll;
3756	struct sw_zone_info *swz;
3757	struct cluster_metadata *clm;
3758	uint16_t max_pidx;
3759	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
3760
3761	FL_LOCK_ASSERT_OWNED(fl);
3762
3763	/*
3764	 * We always stop at the begining of the hardware descriptor that's just
3765	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
3766	 * which would mean an empty freelist to the chip.
3767	 */
3768	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
3769	if (fl->pidx == max_pidx * 8)
3770		return (0);
3771
3772	d = &fl->desc[fl->pidx];
3773	sd = &fl->sdesc[fl->pidx];
3774	cll = &fl->cll_def;	/* default layout */
3775	swz = &sc->sge.sw_zone_info[cll->zidx];
3776
3777	while (n > 0) {
3778
3779		if (sd->cl != NULL) {
3780
3781			if (sd->nmbuf == 0) {
3782				/*
3783				 * Fast recycle without involving any atomics on
3784				 * the cluster's metadata (if the cluster has
3785				 * metadata).  This happens when all frames
3786				 * received in the cluster were small enough to
3787				 * fit within a single mbuf each.
3788				 */
3789				fl->cl_fast_recycled++;
3790#ifdef INVARIANTS
3791				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3792				if (clm != NULL)
3793					MPASS(clm->refcount == 1);
3794#endif
3795				goto recycled_fast;
3796			}
3797
3798			/*
3799			 * Cluster is guaranteed to have metadata.  Clusters
3800			 * without metadata always take the fast recycle path
3801			 * when they're recycled.
3802			 */
3803			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3804			MPASS(clm != NULL);
3805
3806			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3807				fl->cl_recycled++;
3808				counter_u64_add(extfree_rels, 1);
3809				goto recycled;
3810			}
3811			sd->cl = NULL;	/* gave up my reference */
3812		}
3813		MPASS(sd->cl == NULL);
3814alloc:
3815		cl = uma_zalloc(swz->zone, M_NOWAIT);
3816		if (__predict_false(cl == NULL)) {
3817			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
3818			    fl->cll_def.zidx == fl->cll_alt.zidx)
3819				break;
3820
3821			/* fall back to the safe zone */
3822			cll = &fl->cll_alt;
3823			swz = &sc->sge.sw_zone_info[cll->zidx];
3824			goto alloc;
3825		}
3826		fl->cl_allocated++;
3827		n--;
3828
3829		pa = pmap_kextract((vm_offset_t)cl);
3830		pa += cll->region1;
3831		sd->cl = cl;
3832		sd->cll = *cll;
3833		*d = htobe64(pa | cll->hwidx);
3834		clm = cl_metadata(sc, fl, cll, cl);
3835		if (clm != NULL) {
3836recycled:
3837#ifdef INVARIANTS
3838			clm->sd = sd;
3839#endif
3840			clm->refcount = 1;
3841		}
3842		sd->nmbuf = 0;
3843recycled_fast:
3844		d++;
3845		sd++;
3846		if (__predict_false(++fl->pidx % 8 == 0)) {
3847			uint16_t pidx = fl->pidx / 8;
3848
3849			if (__predict_false(pidx == fl->sidx)) {
3850				fl->pidx = 0;
3851				pidx = 0;
3852				sd = fl->sdesc;
3853				d = fl->desc;
3854			}
3855			if (pidx == max_pidx)
3856				break;
3857
3858			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
3859				ring_fl_db(sc, fl);
3860		}
3861	}
3862
3863	if (fl->pidx / 8 != fl->dbidx)
3864		ring_fl_db(sc, fl);
3865
3866	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
3867}
3868
3869/*
3870 * Attempt to refill all starving freelists.
3871 */
3872static void
3873refill_sfl(void *arg)
3874{
3875	struct adapter *sc = arg;
3876	struct sge_fl *fl, *fl_temp;
3877
3878	mtx_assert(&sc->sfl_lock, MA_OWNED);
3879	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
3880		FL_LOCK(fl);
3881		refill_fl(sc, fl, 64);
3882		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
3883			TAILQ_REMOVE(&sc->sfl, fl, link);
3884			fl->flags &= ~FL_STARVING;
3885		}
3886		FL_UNLOCK(fl);
3887	}
3888
3889	if (!TAILQ_EMPTY(&sc->sfl))
3890		callout_schedule(&sc->sfl_callout, hz / 5);
3891}
3892
3893static int
3894alloc_fl_sdesc(struct sge_fl *fl)
3895{
3896
3897	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
3898	    M_ZERO | M_WAITOK);
3899
3900	return (0);
3901}
3902
3903static void
3904free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
3905{
3906	struct fl_sdesc *sd;
3907	struct cluster_metadata *clm;
3908	struct cluster_layout *cll;
3909	int i;
3910
3911	sd = fl->sdesc;
3912	for (i = 0; i < fl->sidx * 8; i++, sd++) {
3913		if (sd->cl == NULL)
3914			continue;
3915
3916		cll = &sd->cll;
3917		clm = cl_metadata(sc, fl, cll, sd->cl);
3918		if (sd->nmbuf == 0)
3919			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3920		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3921			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3922			counter_u64_add(extfree_rels, 1);
3923		}
3924		sd->cl = NULL;
3925	}
3926
3927	free(fl->sdesc, M_CXGBE);
3928	fl->sdesc = NULL;
3929}
3930
3931static inline void
3932get_pkt_gl(struct mbuf *m, struct sglist *gl)
3933{
3934	int rc;
3935
3936	M_ASSERTPKTHDR(m);
3937
3938	sglist_reset(gl);
3939	rc = sglist_append_mbuf(gl, m);
3940	if (__predict_false(rc != 0)) {
3941		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
3942		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
3943	}
3944
3945	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
3946	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
3947	    mbuf_nsegs(m), gl->sg_nseg));
3948	KASSERT(gl->sg_nseg > 0 &&
3949	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
3950	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
3951		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
3952}
3953
3954/*
3955 * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
3956 */
3957static inline u_int
3958txpkt_len16(u_int nsegs, u_int tso)
3959{
3960	u_int n;
3961
3962	MPASS(nsegs > 0);
3963
3964	nsegs--; /* first segment is part of ulptx_sgl */
3965	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
3966	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
3967	if (tso)
3968		n += sizeof(struct cpl_tx_pkt_lso_core);
3969
3970	return (howmany(n, 16));
3971}
3972
3973/*
3974 * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
3975 * request header.
3976 */
3977static inline u_int
3978txpkt_vm_len16(u_int nsegs, u_int tso)
3979{
3980	u_int n;
3981
3982	MPASS(nsegs > 0);
3983
3984	nsegs--; /* first segment is part of ulptx_sgl */
3985	n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
3986	    sizeof(struct cpl_tx_pkt_core) +
3987	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
3988	if (tso)
3989		n += sizeof(struct cpl_tx_pkt_lso_core);
3990
3991	return (howmany(n, 16));
3992}
3993
3994/*
3995 * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
3996 * request header.
3997 */
3998static inline u_int
3999txpkts0_len16(u_int nsegs)
4000{
4001	u_int n;
4002
4003	MPASS(nsegs > 0);
4004
4005	nsegs--; /* first segment is part of ulptx_sgl */
4006	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
4007	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
4008	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
4009
4010	return (howmany(n, 16));
4011}
4012
4013/*
4014 * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
4015 * request header.
4016 */
4017static inline u_int
4018txpkts1_len16(void)
4019{
4020	u_int n;
4021
4022	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
4023
4024	return (howmany(n, 16));
4025}
4026
4027static inline u_int
4028imm_payload(u_int ndesc)
4029{
4030	u_int n;
4031
4032	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
4033	    sizeof(struct cpl_tx_pkt_core);
4034
4035	return (n);
4036}
4037
4038/*
4039 * Write a VM txpkt WR for this packet to the hardware descriptors, update the
4040 * software descriptor, and advance the pidx.  It is guaranteed that enough
4041 * descriptors are available.
4042 *
4043 * The return value is the # of hardware descriptors used.
4044 */
4045static u_int
4046write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
4047    struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
4048{
4049	struct sge_eq *eq = &txq->eq;
4050	struct tx_sdesc *txsd;
4051	struct cpl_tx_pkt_core *cpl;
4052	uint32_t ctrl;	/* used in many unrelated places */
4053	uint64_t ctrl1;
4054	int csum_type, len16, ndesc, pktlen, nsegs;
4055	caddr_t dst;
4056
4057	TXQ_LOCK_ASSERT_OWNED(txq);
4058	M_ASSERTPKTHDR(m0);
4059	MPASS(available > 0 && available < eq->sidx);
4060
4061	len16 = mbuf_len16(m0);
4062	nsegs = mbuf_nsegs(m0);
4063	pktlen = m0->m_pkthdr.len;
4064	ctrl = sizeof(struct cpl_tx_pkt_core);
4065	if (needs_tso(m0))
4066		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
4067	ndesc = howmany(len16, EQ_ESIZE / 16);
4068	MPASS(ndesc <= available);
4069
4070	/* Firmware work request header */
4071	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4072	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
4073	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
4074
4075	ctrl = V_FW_WR_LEN16(len16);
4076	wr->equiq_to_len16 = htobe32(ctrl);
4077	wr->r3[0] = 0;
4078	wr->r3[1] = 0;
4079
4080	/*
4081	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
4082	 * vlantci is ignored unless the ethtype is 0x8100, so it's
4083	 * simpler to always copy it rather than making it
4084	 * conditional.  Also, it seems that we do not have to set
4085	 * vlantci or fake the ethtype when doing VLAN tag insertion.
4086	 */
4087	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
4088
4089	csum_type = -1;
4090	if (needs_tso(m0)) {
4091		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
4092
4093		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4094		    m0->m_pkthdr.l4hlen > 0,
4095		    ("%s: mbuf %p needs TSO but missing header lengths",
4096			__func__, m0));
4097
4098		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
4099		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
4100		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
4101		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
4102			ctrl |= V_LSO_ETHHDR_LEN(1);
4103		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4104			ctrl |= F_LSO_IPV6;
4105
4106		lso->lso_ctrl = htobe32(ctrl);
4107		lso->ipid_ofst = htobe16(0);
4108		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
4109		lso->seqno_offset = htobe32(0);
4110		lso->len = htobe32(pktlen);
4111
4112		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4113			csum_type = TX_CSUM_TCPIP6;
4114		else
4115			csum_type = TX_CSUM_TCPIP;
4116
4117		cpl = (void *)(lso + 1);
4118
4119		txq->tso_wrs++;
4120	} else {
4121		if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP)
4122			csum_type = TX_CSUM_TCPIP;
4123		else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP)
4124			csum_type = TX_CSUM_UDPIP;
4125		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP)
4126			csum_type = TX_CSUM_TCPIP6;
4127		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP)
4128			csum_type = TX_CSUM_UDPIP6;
4129#if defined(INET)
4130		else if (m0->m_pkthdr.csum_flags & CSUM_IP) {
4131			/*
4132			 * XXX: The firmware appears to stomp on the
4133			 * fragment/flags field of the IP header when
4134			 * using TX_CSUM_IP.  Fall back to doing
4135			 * software checksums.
4136			 */
4137			u_short *sump;
4138			struct mbuf *m;
4139			int offset;
4140
4141			m = m0;
4142			offset = 0;
4143			sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen +
4144			    offsetof(struct ip, ip_sum));
4145			*sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen +
4146			    m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen);
4147			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
4148		}
4149#endif
4150
4151		cpl = (void *)(wr + 1);
4152	}
4153
4154	/* Checksum offload */
4155	ctrl1 = 0;
4156	if (needs_l3_csum(m0) == 0)
4157		ctrl1 |= F_TXPKT_IPCSUM_DIS;
4158	if (csum_type >= 0) {
4159		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0,
4160	    ("%s: mbuf %p needs checksum offload but missing header lengths",
4161			__func__, m0));
4162
4163		if (chip_id(sc) <= CHELSIO_T5) {
4164			ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
4165			    ETHER_HDR_LEN);
4166		} else {
4167			ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
4168			    ETHER_HDR_LEN);
4169		}
4170		ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen);
4171		ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type);
4172	} else
4173		ctrl1 |= F_TXPKT_L4CSUM_DIS;
4174	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4175	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4176		txq->txcsum++;	/* some hardware assistance provided */
4177
4178	/* VLAN tag insertion */
4179	if (needs_vlan_insertion(m0)) {
4180		ctrl1 |= F_TXPKT_VLAN_VLD |
4181		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
4182		txq->vlan_insertion++;
4183	}
4184
4185	/* CPL header */
4186	cpl->ctrl0 = txq->cpl_ctrl0;
4187	cpl->pack = 0;
4188	cpl->len = htobe16(pktlen);
4189	cpl->ctrl1 = htobe64(ctrl1);
4190
4191	/* SGL */
4192	dst = (void *)(cpl + 1);
4193
4194	/*
4195	 * A packet using TSO will use up an entire descriptor for the
4196	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
4197	 * If this descriptor is the last descriptor in the ring, wrap
4198	 * around to the front of the ring explicitly for the start of
4199	 * the sgl.
4200	 */
4201	if (dst == (void *)&eq->desc[eq->sidx]) {
4202		dst = (void *)&eq->desc[0];
4203		write_gl_to_txd(txq, m0, &dst, 0);
4204	} else
4205		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
4206	txq->sgl_wrs++;
4207
4208	txq->txpkt_wrs++;
4209
4210	txsd = &txq->sdesc[eq->pidx];
4211	txsd->m = m0;
4212	txsd->desc_used = ndesc;
4213
4214	return (ndesc);
4215}
4216
4217/*
4218 * Write a txpkt WR for this packet to the hardware descriptors, update the
4219 * software descriptor, and advance the pidx.  It is guaranteed that enough
4220 * descriptors are available.
4221 *
4222 * The return value is the # of hardware descriptors used.
4223 */
4224static u_int
4225write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
4226    struct mbuf *m0, u_int available)
4227{
4228	struct sge_eq *eq = &txq->eq;
4229	struct tx_sdesc *txsd;
4230	struct cpl_tx_pkt_core *cpl;
4231	uint32_t ctrl;	/* used in many unrelated places */
4232	uint64_t ctrl1;
4233	int len16, ndesc, pktlen, nsegs;
4234	caddr_t dst;
4235
4236	TXQ_LOCK_ASSERT_OWNED(txq);
4237	M_ASSERTPKTHDR(m0);
4238	MPASS(available > 0 && available < eq->sidx);
4239
4240	len16 = mbuf_len16(m0);
4241	nsegs = mbuf_nsegs(m0);
4242	pktlen = m0->m_pkthdr.len;
4243	ctrl = sizeof(struct cpl_tx_pkt_core);
4244	if (needs_tso(m0))
4245		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
4246	else if (pktlen <= imm_payload(2) && available >= 2) {
4247		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
4248		ctrl += pktlen;
4249		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
4250		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
4251		nsegs = 0;
4252	}
4253	ndesc = howmany(len16, EQ_ESIZE / 16);
4254	MPASS(ndesc <= available);
4255
4256	/* Firmware work request header */
4257	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4258	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
4259	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
4260
4261	ctrl = V_FW_WR_LEN16(len16);
4262	wr->equiq_to_len16 = htobe32(ctrl);
4263	wr->r3 = 0;
4264
4265	if (needs_tso(m0)) {
4266		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
4267
4268		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4269		    m0->m_pkthdr.l4hlen > 0,
4270		    ("%s: mbuf %p needs TSO but missing header lengths",
4271			__func__, m0));
4272
4273		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
4274		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
4275		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
4276		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
4277			ctrl |= V_LSO_ETHHDR_LEN(1);
4278		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4279			ctrl |= F_LSO_IPV6;
4280
4281		lso->lso_ctrl = htobe32(ctrl);
4282		lso->ipid_ofst = htobe16(0);
4283		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
4284		lso->seqno_offset = htobe32(0);
4285		lso->len = htobe32(pktlen);
4286
4287		cpl = (void *)(lso + 1);
4288
4289		txq->tso_wrs++;
4290	} else
4291		cpl = (void *)(wr + 1);
4292
4293	/* Checksum offload */
4294	ctrl1 = 0;
4295	if (needs_l3_csum(m0) == 0)
4296		ctrl1 |= F_TXPKT_IPCSUM_DIS;
4297	if (needs_l4_csum(m0) == 0)
4298		ctrl1 |= F_TXPKT_L4CSUM_DIS;
4299	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4300	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4301		txq->txcsum++;	/* some hardware assistance provided */
4302
4303	/* VLAN tag insertion */
4304	if (needs_vlan_insertion(m0)) {
4305		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
4306		txq->vlan_insertion++;
4307	}
4308
4309	/* CPL header */
4310	cpl->ctrl0 = txq->cpl_ctrl0;
4311	cpl->pack = 0;
4312	cpl->len = htobe16(pktlen);
4313	cpl->ctrl1 = htobe64(ctrl1);
4314
4315	/* SGL */
4316	dst = (void *)(cpl + 1);
4317	if (nsegs > 0) {
4318
4319		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
4320		txq->sgl_wrs++;
4321	} else {
4322		struct mbuf *m;
4323
4324		for (m = m0; m != NULL; m = m->m_next) {
4325			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
4326#ifdef INVARIANTS
4327			pktlen -= m->m_len;
4328#endif
4329		}
4330#ifdef INVARIANTS
4331		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
4332#endif
4333		txq->imm_wrs++;
4334	}
4335
4336	txq->txpkt_wrs++;
4337
4338	txsd = &txq->sdesc[eq->pidx];
4339	txsd->m = m0;
4340	txsd->desc_used = ndesc;
4341
4342	return (ndesc);
4343}
4344
4345static int
4346try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
4347{
4348	u_int needed, nsegs1, nsegs2, l1, l2;
4349
4350	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
4351		return (1);
4352
4353	nsegs1 = mbuf_nsegs(m);
4354	nsegs2 = mbuf_nsegs(n);
4355	if (nsegs1 + nsegs2 == 2) {
4356		txp->wr_type = 1;
4357		l1 = l2 = txpkts1_len16();
4358	} else {
4359		txp->wr_type = 0;
4360		l1 = txpkts0_len16(nsegs1);
4361		l2 = txpkts0_len16(nsegs2);
4362	}
4363	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
4364	needed = howmany(txp->len16, EQ_ESIZE / 16);
4365	if (needed > SGE_MAX_WR_NDESC || needed > available)
4366		return (1);
4367
4368	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
4369	if (txp->plen > 65535)
4370		return (1);
4371
4372	txp->npkt = 2;
4373	set_mbuf_len16(m, l1);
4374	set_mbuf_len16(n, l2);
4375
4376	return (0);
4377}
4378
4379static int
4380add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
4381{
4382	u_int plen, len16, needed, nsegs;
4383
4384	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
4385
4386	nsegs = mbuf_nsegs(m);
4387	if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1))
4388		return (1);
4389
4390	plen = txp->plen + m->m_pkthdr.len;
4391	if (plen > 65535)
4392		return (1);
4393
4394	if (txp->wr_type == 0)
4395		len16 = txpkts0_len16(nsegs);
4396	else
4397		len16 = txpkts1_len16();
4398	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
4399	if (needed > SGE_MAX_WR_NDESC || needed > available)
4400		return (1);
4401
4402	txp->npkt++;
4403	txp->plen = plen;
4404	txp->len16 += len16;
4405	set_mbuf_len16(m, len16);
4406
4407	return (0);
4408}
4409
4410/*
4411 * Write a txpkts WR for the packets in txp to the hardware descriptors, update
4412 * the software descriptor, and advance the pidx.  It is guaranteed that enough
4413 * descriptors are available.
4414 *
4415 * The return value is the # of hardware descriptors used.
4416 */
4417static u_int
4418write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
4419    struct mbuf *m0, const struct txpkts *txp, u_int available)
4420{
4421	struct sge_eq *eq = &txq->eq;
4422	struct tx_sdesc *txsd;
4423	struct cpl_tx_pkt_core *cpl;
4424	uint32_t ctrl;
4425	uint64_t ctrl1;
4426	int ndesc, checkwrap;
4427	struct mbuf *m;
4428	void *flitp;
4429
4430	TXQ_LOCK_ASSERT_OWNED(txq);
4431	MPASS(txp->npkt > 0);
4432	MPASS(txp->plen < 65536);
4433	MPASS(m0 != NULL);
4434	MPASS(m0->m_nextpkt != NULL);
4435	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
4436	MPASS(available > 0 && available < eq->sidx);
4437
4438	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
4439	MPASS(ndesc <= available);
4440
4441	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4442	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
4443	ctrl = V_FW_WR_LEN16(txp->len16);
4444	wr->equiq_to_len16 = htobe32(ctrl);
4445	wr->plen = htobe16(txp->plen);
4446	wr->npkt = txp->npkt;
4447	wr->r3 = 0;
4448	wr->type = txp->wr_type;
4449	flitp = wr + 1;
4450
4451	/*
4452	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
4453	 * set then we know the WR is going to wrap around somewhere.  We'll
4454	 * check for that at appropriate points.
4455	 */
4456	checkwrap = eq->sidx - ndesc < eq->pidx;
4457	for (m = m0; m != NULL; m = m->m_nextpkt) {
4458		if (txp->wr_type == 0) {
4459			struct ulp_txpkt *ulpmc;
4460			struct ulptx_idata *ulpsc;
4461
4462			/* ULP master command */
4463			ulpmc = flitp;
4464			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
4465			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
4466			ulpmc->len = htobe32(mbuf_len16(m));
4467
4468			/* ULP subcommand */
4469			ulpsc = (void *)(ulpmc + 1);
4470			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
4471			    F_ULP_TX_SC_MORE);
4472			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
4473
4474			cpl = (void *)(ulpsc + 1);
4475			if (checkwrap &&
4476			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
4477				cpl = (void *)&eq->desc[0];
4478			txq->txpkts0_pkts += txp->npkt;
4479			txq->txpkts0_wrs++;
4480		} else {
4481			cpl = flitp;
4482			txq->txpkts1_pkts += txp->npkt;
4483			txq->txpkts1_wrs++;
4484		}
4485
4486		/* Checksum offload */
4487		ctrl1 = 0;
4488		if (needs_l3_csum(m) == 0)
4489			ctrl1 |= F_TXPKT_IPCSUM_DIS;
4490		if (needs_l4_csum(m) == 0)
4491			ctrl1 |= F_TXPKT_L4CSUM_DIS;
4492		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4493		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4494			txq->txcsum++;	/* some hardware assistance provided */
4495
4496		/* VLAN tag insertion */
4497		if (needs_vlan_insertion(m)) {
4498			ctrl1 |= F_TXPKT_VLAN_VLD |
4499			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
4500			txq->vlan_insertion++;
4501		}
4502
4503		/* CPL header */
4504		cpl->ctrl0 = txq->cpl_ctrl0;
4505		cpl->pack = 0;
4506		cpl->len = htobe16(m->m_pkthdr.len);
4507		cpl->ctrl1 = htobe64(ctrl1);
4508
4509		flitp = cpl + 1;
4510		if (checkwrap &&
4511		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
4512			flitp = (void *)&eq->desc[0];
4513
4514		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
4515
4516	}
4517
4518	txsd = &txq->sdesc[eq->pidx];
4519	txsd->m = m0;
4520	txsd->desc_used = ndesc;
4521
4522	return (ndesc);
4523}
4524
4525/*
4526 * If the SGL ends on an address that is not 16 byte aligned, this function will
4527 * add a 0 filled flit at the end.
4528 */
4529static void
4530write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
4531{
4532	struct sge_eq *eq = &txq->eq;
4533	struct sglist *gl = txq->gl;
4534	struct sglist_seg *seg;
4535	__be64 *flitp, *wrap;
4536	struct ulptx_sgl *usgl;
4537	int i, nflits, nsegs;
4538
4539	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
4540	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
4541	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4542	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4543
4544	get_pkt_gl(m, gl);
4545	nsegs = gl->sg_nseg;
4546	MPASS(nsegs > 0);
4547
4548	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
4549	flitp = (__be64 *)(*to);
4550	wrap = (__be64 *)(&eq->desc[eq->sidx]);
4551	seg = &gl->sg_segs[0];
4552	usgl = (void *)flitp;
4553
4554	/*
4555	 * We start at a 16 byte boundary somewhere inside the tx descriptor
4556	 * ring, so we're at least 16 bytes away from the status page.  There is
4557	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
4558	 */
4559
4560	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
4561	    V_ULPTX_NSGE(nsegs));
4562	usgl->len0 = htobe32(seg->ss_len);
4563	usgl->addr0 = htobe64(seg->ss_paddr);
4564	seg++;
4565
4566	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
4567
4568		/* Won't wrap around at all */
4569
4570		for (i = 0; i < nsegs - 1; i++, seg++) {
4571			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
4572			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
4573		}
4574		if (i & 1)
4575			usgl->sge[i / 2].len[1] = htobe32(0);
4576		flitp += nflits;
4577	} else {
4578
4579		/* Will wrap somewhere in the rest of the SGL */
4580
4581		/* 2 flits already written, write the rest flit by flit */
4582		flitp = (void *)(usgl + 1);
4583		for (i = 0; i < nflits - 2; i++) {
4584			if (flitp == wrap)
4585				flitp = (void *)eq->desc;
4586			*flitp++ = get_flit(seg, nsegs - 1, i);
4587		}
4588	}
4589
4590	if (nflits & 1) {
4591		MPASS(((uintptr_t)flitp) & 0xf);
4592		*flitp++ = 0;
4593	}
4594
4595	MPASS((((uintptr_t)flitp) & 0xf) == 0);
4596	if (__predict_false(flitp == wrap))
4597		*to = (void *)eq->desc;
4598	else
4599		*to = (void *)flitp;
4600}
4601
4602static inline void
4603copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
4604{
4605
4606	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4607	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4608
4609	if (__predict_true((uintptr_t)(*to) + len <=
4610	    (uintptr_t)&eq->desc[eq->sidx])) {
4611		bcopy(from, *to, len);
4612		(*to) += len;
4613	} else {
4614		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
4615
4616		bcopy(from, *to, portion);
4617		from += portion;
4618		portion = len - portion;	/* remaining */
4619		bcopy(from, (void *)eq->desc, portion);
4620		(*to) = (caddr_t)eq->desc + portion;
4621	}
4622}
4623
4624static inline void
4625ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
4626{
4627	u_int db;
4628
4629	MPASS(n > 0);
4630
4631	db = eq->doorbells;
4632	if (n > 1)
4633		clrbit(&db, DOORBELL_WCWR);
4634	wmb();
4635
4636	switch (ffs(db) - 1) {
4637	case DOORBELL_UDB:
4638		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4639		break;
4640
4641	case DOORBELL_WCWR: {
4642		volatile uint64_t *dst, *src;
4643		int i;
4644
4645		/*
4646		 * Queues whose 128B doorbell segment fits in the page do not
4647		 * use relative qid (udb_qid is always 0).  Only queues with
4648		 * doorbell segments can do WCWR.
4649		 */
4650		KASSERT(eq->udb_qid == 0 && n == 1,
4651		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
4652		    __func__, eq->doorbells, n, eq->dbidx, eq));
4653
4654		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
4655		    UDBS_DB_OFFSET);
4656		i = eq->dbidx;
4657		src = (void *)&eq->desc[i];
4658		while (src != (void *)&eq->desc[i + 1])
4659			*dst++ = *src++;
4660		wmb();
4661		break;
4662	}
4663
4664	case DOORBELL_UDBWC:
4665		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4666		wmb();
4667		break;
4668
4669	case DOORBELL_KDB:
4670		t4_write_reg(sc, sc->sge_kdoorbell_reg,
4671		    V_QID(eq->cntxt_id) | V_PIDX(n));
4672		break;
4673	}
4674
4675	IDXINCR(eq->dbidx, n, eq->sidx);
4676}
4677
4678static inline u_int
4679reclaimable_tx_desc(struct sge_eq *eq)
4680{
4681	uint16_t hw_cidx;
4682
4683	hw_cidx = read_hw_cidx(eq);
4684	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
4685}
4686
4687static inline u_int
4688total_available_tx_desc(struct sge_eq *eq)
4689{
4690	uint16_t hw_cidx, pidx;
4691
4692	hw_cidx = read_hw_cidx(eq);
4693	pidx = eq->pidx;
4694
4695	if (pidx == hw_cidx)
4696		return (eq->sidx - 1);
4697	else
4698		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
4699}
4700
4701static inline uint16_t
4702read_hw_cidx(struct sge_eq *eq)
4703{
4704	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
4705	uint16_t cidx = spg->cidx;	/* stable snapshot */
4706
4707	return (be16toh(cidx));
4708}
4709
4710/*
4711 * Reclaim 'n' descriptors approximately.
4712 */
4713static u_int
4714reclaim_tx_descs(struct sge_txq *txq, u_int n)
4715{
4716	struct tx_sdesc *txsd;
4717	struct sge_eq *eq = &txq->eq;
4718	u_int can_reclaim, reclaimed;
4719
4720	TXQ_LOCK_ASSERT_OWNED(txq);
4721	MPASS(n > 0);
4722
4723	reclaimed = 0;
4724	can_reclaim = reclaimable_tx_desc(eq);
4725	while (can_reclaim && reclaimed < n) {
4726		int ndesc;
4727		struct mbuf *m, *nextpkt;
4728
4729		txsd = &txq->sdesc[eq->cidx];
4730		ndesc = txsd->desc_used;
4731
4732		/* Firmware doesn't return "partial" credits. */
4733		KASSERT(can_reclaim >= ndesc,
4734		    ("%s: unexpected number of credits: %d, %d",
4735		    __func__, can_reclaim, ndesc));
4736
4737		for (m = txsd->m; m != NULL; m = nextpkt) {
4738			nextpkt = m->m_nextpkt;
4739			m->m_nextpkt = NULL;
4740			m_freem(m);
4741		}
4742		reclaimed += ndesc;
4743		can_reclaim -= ndesc;
4744		IDXINCR(eq->cidx, ndesc, eq->sidx);
4745	}
4746
4747	return (reclaimed);
4748}
4749
4750static void
4751tx_reclaim(void *arg, int n)
4752{
4753	struct sge_txq *txq = arg;
4754	struct sge_eq *eq = &txq->eq;
4755
4756	do {
4757		if (TXQ_TRYLOCK(txq) == 0)
4758			break;
4759		n = reclaim_tx_descs(txq, 32);
4760		if (eq->cidx == eq->pidx)
4761			eq->equeqidx = eq->pidx;
4762		TXQ_UNLOCK(txq);
4763	} while (n > 0);
4764}
4765
4766static __be64
4767get_flit(struct sglist_seg *segs, int nsegs, int idx)
4768{
4769	int i = (idx / 3) * 2;
4770
4771	switch (idx % 3) {
4772	case 0: {
4773		__be64 rc;
4774
4775		rc = htobe32(segs[i].ss_len);
4776		if (i + 1 < nsegs)
4777			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;
4778
4779		return (rc);
4780	}
4781	case 1:
4782		return (htobe64(segs[i].ss_paddr));
4783	case 2:
4784		return (htobe64(segs[i + 1].ss_paddr));
4785	}
4786
4787	return (0);
4788}
4789
4790static void
4791find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
4792{
4793	int8_t zidx, hwidx, idx;
4794	uint16_t region1, region3;
4795	int spare, spare_needed, n;
4796	struct sw_zone_info *swz;
4797	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
4798
4799	/*
4800	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
4801	 * large enough for the max payload and cluster metadata.  Otherwise
4802	 * settle for the largest bufsize that leaves enough room in the cluster
4803	 * for metadata.
4804	 *
4805	 * Without buffer packing: Look for the smallest zone which has a
4806	 * bufsize large enough for the max payload.  Settle for the largest
4807	 * bufsize available if there's nothing big enough for max payload.
4808	 */
4809	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
4810	swz = &sc->sge.sw_zone_info[0];
4811	hwidx = -1;
4812	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
4813		if (swz->size > largest_rx_cluster) {
4814			if (__predict_true(hwidx != -1))
4815				break;
4816
4817			/*
4818			 * This is a misconfiguration.  largest_rx_cluster is
4819			 * preventing us from finding a refill source.  See
4820			 * dev.t5nex.<n>.buffer_sizes to figure out why.
4821			 */
4822			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
4823			    " refill source for fl %p (dma %u).  Ignored.\n",
4824			    largest_rx_cluster, fl, maxp);
4825		}
4826		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
4827			hwb = &hwb_list[idx];
4828			spare = swz->size - hwb->size;
4829			if (spare < spare_needed)
4830				continue;
4831
4832			hwidx = idx;		/* best option so far */
4833			if (hwb->size >= maxp) {
4834
4835				if ((fl->flags & FL_BUF_PACKING) == 0)
4836					goto done; /* stop looking (not packing) */
4837
4838				if (swz->size >= safest_rx_cluster)
4839					goto done; /* stop looking (packing) */
4840			}
4841			break;		/* keep looking, next zone */
4842		}
4843	}
4844done:
4845	/* A usable hwidx has been located. */
4846	MPASS(hwidx != -1);
4847	hwb = &hwb_list[hwidx];
4848	zidx = hwb->zidx;
4849	swz = &sc->sge.sw_zone_info[zidx];
4850	region1 = 0;
4851	region3 = swz->size - hwb->size;
4852
4853	/*
4854	 * Stay within this zone and see if there is a better match when mbuf
4855	 * inlining is allowed.  Remember that the hwidx's are sorted in
4856	 * decreasing order of size (so in increasing order of spare area).
4857	 */
4858	for (idx = hwidx; idx != -1; idx = hwb->next) {
4859		hwb = &hwb_list[idx];
4860		spare = swz->size - hwb->size;
4861
4862		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
4863			break;
4864
4865		/*
4866		 * Do not inline mbufs if doing so would violate the pad/pack
4867		 * boundary alignment requirement.
4868		 */
4869		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
4870			continue;
4871		if (fl->flags & FL_BUF_PACKING &&
4872		    (MSIZE % sc->params.sge.pack_boundary) != 0)
4873			continue;
4874
4875		if (spare < CL_METADATA_SIZE + MSIZE)
4876			continue;
4877		n = (spare - CL_METADATA_SIZE) / MSIZE;
4878		if (n > howmany(hwb->size, maxp))
4879			break;
4880
4881		hwidx = idx;
4882		if (fl->flags & FL_BUF_PACKING) {
4883			region1 = n * MSIZE;
4884			region3 = spare - region1;
4885		} else {
4886			region1 = MSIZE;
4887			region3 = spare - region1;
4888			break;
4889		}
4890	}
4891
4892	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
4893	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
4894	KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES,
4895	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
4896	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
4897	    sc->sge.sw_zone_info[zidx].size,
4898	    ("%s: bad buffer layout for fl %p, maxp %d. "
4899		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4900		sc->sge.sw_zone_info[zidx].size, region1,
4901		sc->sge.hw_buf_info[hwidx].size, region3));
4902	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
4903		KASSERT(region3 >= CL_METADATA_SIZE,
4904		    ("%s: no room for metadata.  fl %p, maxp %d; "
4905		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4906		    sc->sge.sw_zone_info[zidx].size, region1,
4907		    sc->sge.hw_buf_info[hwidx].size, region3));
4908		KASSERT(region1 % MSIZE == 0,
4909		    ("%s: bad mbuf region for fl %p, maxp %d. "
4910		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4911		    sc->sge.sw_zone_info[zidx].size, region1,
4912		    sc->sge.hw_buf_info[hwidx].size, region3));
4913	}
4914
4915	fl->cll_def.zidx = zidx;
4916	fl->cll_def.hwidx = hwidx;
4917	fl->cll_def.region1 = region1;
4918	fl->cll_def.region3 = region3;
4919}
4920
4921static void
4922find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
4923{
4924	struct sge *s = &sc->sge;
4925	struct hw_buf_info *hwb;
4926	struct sw_zone_info *swz;
4927	int spare;
4928	int8_t hwidx;
4929
4930	if (fl->flags & FL_BUF_PACKING)
4931		hwidx = s->safe_hwidx2;	/* with room for metadata */
4932	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
4933		hwidx = s->safe_hwidx2;
4934		hwb = &s->hw_buf_info[hwidx];
4935		swz = &s->sw_zone_info[hwb->zidx];
4936		spare = swz->size - hwb->size;
4937
4938		/* no good if there isn't room for an mbuf as well */
4939		if (spare < CL_METADATA_SIZE + MSIZE)
4940			hwidx = s->safe_hwidx1;
4941	} else
4942		hwidx = s->safe_hwidx1;
4943
4944	if (hwidx == -1) {
4945		/* No fallback source */
4946		fl->cll_alt.hwidx = -1;
4947		fl->cll_alt.zidx = -1;
4948
4949		return;
4950	}
4951
4952	hwb = &s->hw_buf_info[hwidx];
4953	swz = &s->sw_zone_info[hwb->zidx];
4954	spare = swz->size - hwb->size;
4955	fl->cll_alt.hwidx = hwidx;
4956	fl->cll_alt.zidx = hwb->zidx;
4957	if (allow_mbufs_in_cluster &&
4958	    (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0))
4959		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
4960	else
4961		fl->cll_alt.region1 = 0;
4962	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
4963}
4964
4965static void
4966add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
4967{
4968	mtx_lock(&sc->sfl_lock);
4969	FL_LOCK(fl);
4970	if ((fl->flags & FL_DOOMED) == 0) {
4971		fl->flags |= FL_STARVING;
4972		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
4973		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
4974	}
4975	FL_UNLOCK(fl);
4976	mtx_unlock(&sc->sfl_lock);
4977}
4978
4979static void
4980handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
4981{
4982	struct sge_wrq *wrq = (void *)eq;
4983
4984	atomic_readandclear_int(&eq->equiq);
4985	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
4986}
4987
4988static void
4989handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
4990{
4991	struct sge_txq *txq = (void *)eq;
4992
4993	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
4994
4995	atomic_readandclear_int(&eq->equiq);
4996	mp_ring_check_drainage(txq->r, 0);
4997	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
4998}
4999
5000static int
5001handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
5002    struct mbuf *m)
5003{
5004	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
5005	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
5006	struct adapter *sc = iq->adapter;
5007	struct sge *s = &sc->sge;
5008	struct sge_eq *eq;
5009	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
5010		&handle_wrq_egr_update, &handle_eth_egr_update,
5011		&handle_wrq_egr_update};
5012
5013	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
5014	    rss->opcode));
5015
5016	eq = s->eqmap[qid - s->eq_start - s->eq_base];
5017	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
5018
5019	return (0);
5020}
5021
5022/* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
5023CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
5024    offsetof(struct cpl_fw6_msg, data));
5025
5026static int
5027handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
5028{
5029	struct adapter *sc = iq->adapter;
5030	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
5031
5032	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
5033	    rss->opcode));
5034
5035	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
5036		const struct rss_header *rss2;
5037
5038		rss2 = (const struct rss_header *)&cpl->data[0];
5039		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
5040	}
5041
5042	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
5043}
5044
5045/**
5046 *	t4_handle_wrerr_rpl - process a FW work request error message
5047 *	@adap: the adapter
5048 *	@rpl: start of the FW message
5049 */
5050static int
5051t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
5052{
5053	u8 opcode = *(const u8 *)rpl;
5054	const struct fw_error_cmd *e = (const void *)rpl;
5055	unsigned int i;
5056
5057	if (opcode != FW_ERROR_CMD) {
5058		log(LOG_ERR,
5059		    "%s: Received WRERR_RPL message with opcode %#x\n",
5060		    device_get_nameunit(adap->dev), opcode);
5061		return (EINVAL);
5062	}
5063	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
5064	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
5065	    "non-fatal");
5066	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
5067	case FW_ERROR_TYPE_EXCEPTION:
5068		log(LOG_ERR, "exception info:\n");
5069		for (i = 0; i < nitems(e->u.exception.info); i++)
5070			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
5071			    be32toh(e->u.exception.info[i]));
5072		log(LOG_ERR, "\n");
5073		break;
5074	case FW_ERROR_TYPE_HWMODULE:
5075		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
5076		    be32toh(e->u.hwmodule.regaddr),
5077		    be32toh(e->u.hwmodule.regval));
5078		break;
5079	case FW_ERROR_TYPE_WR:
5080		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
5081		    be16toh(e->u.wr.cidx),
5082		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
5083		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
5084		    be32toh(e->u.wr.eqid));
5085		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
5086			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
5087			    e->u.wr.wrhdr[i]);
5088		log(LOG_ERR, "\n");
5089		break;
5090	case FW_ERROR_TYPE_ACL:
5091		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
5092		    be16toh(e->u.acl.cidx),
5093		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
5094		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
5095		    be32toh(e->u.acl.eqid),
5096		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
5097		    "MAC");
5098		for (i = 0; i < nitems(e->u.acl.val); i++)
5099			log(LOG_ERR, " %02x", e->u.acl.val[i]);
5100		log(LOG_ERR, "\n");
5101		break;
5102	default:
5103		log(LOG_ERR, "type %#x\n",
5104		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
5105		return (EINVAL);
5106	}
5107	return (0);
5108}
5109
5110static int
5111sysctl_uint16(SYSCTL_HANDLER_ARGS)
5112{
5113	uint16_t *id = arg1;
5114	int i = *id;
5115
5116	return sysctl_handle_int(oidp, &i, 0, req);
5117}
5118
5119static int
5120sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
5121{
5122	struct sge *s = arg1;
5123	struct hw_buf_info *hwb = &s->hw_buf_info[0];
5124	struct sw_zone_info *swz = &s->sw_zone_info[0];
5125	int i, rc;
5126	struct sbuf sb;
5127	char c;
5128
5129	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
5130	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
5131		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
5132			c = '*';
5133		else
5134			c = '\0';
5135
5136		sbuf_printf(&sb, "%u%c ", hwb->size, c);
5137	}
5138	sbuf_trim(&sb);
5139	sbuf_finish(&sb);
5140	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
5141	sbuf_delete(&sb);
5142	return (rc);
5143}
5144
5145static int
5146sysctl_tc(SYSCTL_HANDLER_ARGS)
5147{
5148	struct vi_info *vi = arg1;
5149	struct port_info *pi;
5150	struct adapter *sc;
5151	struct sge_txq *txq;
5152	struct tx_sched_class *tc;
5153	int qidx = arg2, rc, tc_idx;
5154	uint32_t fw_queue, fw_class;
5155
5156	MPASS(qidx >= 0 && qidx < vi->ntxq);
5157	pi = vi->pi;
5158	sc = pi->adapter;
5159	txq = &sc->sge.txq[vi->first_txq + qidx];
5160
5161	tc_idx = txq->tc_idx;
5162	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
5163	if (rc != 0 || req->newptr == NULL)
5164		return (rc);
5165
5166	/* Note that -1 is legitimate input (it means unbind). */
5167	if (tc_idx < -1 || tc_idx >= sc->chip_params->nsched_cls)
5168		return (EINVAL);
5169
5170	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4stc");
5171	if (rc)
5172		return (rc);
5173
5174	if (tc_idx == txq->tc_idx) {
5175		rc = 0;		/* No change, nothing to do. */
5176		goto done;
5177	}
5178
5179	fw_queue = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
5180	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
5181	    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id);
5182
5183	if (tc_idx == -1)
5184		fw_class = 0xffffffff;	/* Unbind. */
5185	else {
5186		/*
5187		 * Bind to a different class.  Ethernet txq's are only allowed
5188		 * to bind to cl-rl mode-class for now.  XXX: too restrictive.
5189		 */
5190		tc = &pi->tc[tc_idx];
5191		if (tc->flags & TX_SC_OK &&
5192		    tc->params.level == SCHED_CLASS_LEVEL_CL_RL &&
5193		    tc->params.mode == SCHED_CLASS_MODE_CLASS) {
5194			/* Ok to proceed. */
5195			fw_class = tc_idx;
5196		} else {
5197			rc = tc->flags & TX_SC_OK ? EBUSY : ENXIO;
5198			goto done;
5199		}
5200	}
5201
5202	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class);
5203	if (rc == 0) {
5204		if (txq->tc_idx != -1) {
5205			tc = &pi->tc[txq->tc_idx];
5206			MPASS(tc->refcount > 0);
5207			tc->refcount--;
5208		}
5209		if (tc_idx != -1) {
5210			tc = &pi->tc[tc_idx];
5211			tc->refcount++;
5212		}
5213		txq->tc_idx = tc_idx;
5214	}
5215done:
5216	end_synchronized_op(sc, 0);
5217	return (rc);
5218}
5219