t4_sge.c revision 252715
1/*-
2 * Copyright (c) 2011 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/dev/cxgbe/t4_sge.c 252715 2013-07-04 19:15:41Z np $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#include <sys/types.h>
35#include <sys/mbuf.h>
36#include <sys/socket.h>
37#include <sys/kernel.h>
38#include <sys/kdb.h>
39#include <sys/malloc.h>
40#include <sys/queue.h>
41#include <sys/taskqueue.h>
42#include <sys/sysctl.h>
43#include <sys/smp.h>
44#include <net/bpf.h>
45#include <net/ethernet.h>
46#include <net/if.h>
47#include <net/if_vlan_var.h>
48#include <netinet/in.h>
49#include <netinet/ip.h>
50#include <netinet/ip6.h>
51#include <netinet/tcp.h>
52
53#include "common/common.h"
54#include "common/t4_regs.h"
55#include "common/t4_regs_values.h"
56#include "common/t4_msg.h"
57
58struct fl_buf_info {
59	int size;
60	int type;
61	uma_zone_t zone;
62};
63
64/* Filled up by t4_sge_modload */
65static struct fl_buf_info fl_buf_info[FL_BUF_SIZES];
66
67#define FL_BUF_SIZE(x)	(fl_buf_info[x].size)
68#define FL_BUF_TYPE(x)	(fl_buf_info[x].type)
69#define FL_BUF_ZONE(x)	(fl_buf_info[x].zone)
70
71#ifdef T4_PKT_TIMESTAMP
72#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
73#else
74#define RX_COPY_THRESHOLD MINCLSIZE
75#endif
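
/*
 * Note (added for clarity): frames shorter than RX_COPY_THRESHOLD are copied
 * out of the freelist buffer into an ordinary mbuf so the cluster can be
 * recycled (see get_fl_payload).  With T4_PKT_TIMESTAMP the first 8 bytes of
 * the mbuf are reserved for the timestamp, so the threshold shrinks by 8.
 */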
76
77/*
78 * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
79 * 0-7 are valid values.
80 */
81static int fl_pktshift = 2;
82TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
83
84/*
85 * Pad ethernet payload up to this boundary.
86 * -1: driver should figure out a good value.
87 *  Any power of 2, from 32 to 4096 (both inclusive) is a valid value.
88 */
89static int fl_pad = -1;
90TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
91
92/*
93 * Status page length.
94 * -1: driver should figure out a good value.
95 *  64 or 128 are the only other valid values.
96 */
97static int spg_len = -1;
98TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
99
100/*
101 * Congestion drops.
102 * -1: no congestion feedback (not recommended).
103 *  0: backpressure the channel instead of dropping packets right away.
104 *  1: no backpressure, drop packets for the congested queue immediately.
105 */
106static int cong_drop = 0;
107TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
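
/*
 * Example only: the tunables above are read from the kernel environment, so
 * they are typically set in /boot/loader.conf, e.g.:
 *
 *	hw.cxgbe.fl_pktshift="2"
 *	hw.cxgbe.fl_pad="64"
 *	hw.cxgbe.spg_len="64"
 *	hw.cxgbe.cong_drop="0"
 *
 * Out of range values are replaced with sane defaults in t4_sge_modload().
 */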
108
109/* Used to track a coalesced tx work request as it is being built */
110struct txpkts {
111	uint64_t *flitp;	/* ptr to flit where next pkt should start */
112	uint8_t npkt;		/* # of packets in this work request */
113	uint8_t nflits;		/* # of flits used by this work request */
114	uint16_t plen;		/* total payload (sum of all packets) */
115};
116
117/* A packet's SGL.  This + m_pkthdr has all info needed for tx */
118struct sgl {
119	int nsegs;		/* # of segments in the SGL, 0 means imm. tx */
120	int nflits;		/* # of flits needed for the SGL */
121	bus_dma_segment_t seg[TX_SGL_SEGS];
122};
123
124static int service_iq(struct sge_iq *, int);
125static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t,
126    int *);
127static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
128static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
129    int);
130static inline void init_fl(struct sge_fl *, int, int, char *);
131static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t,
132    char *);
133static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
134    bus_addr_t *, void **);
135static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
136    void *);
137static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *,
138    int, int);
139static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *);
140static int alloc_fwq(struct adapter *);
141static int free_fwq(struct adapter *);
142static int alloc_mgmtq(struct adapter *);
143static int free_mgmtq(struct adapter *);
144static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int,
145    struct sysctl_oid *);
146static int free_rxq(struct port_info *, struct sge_rxq *);
147#ifdef TCP_OFFLOAD
148static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int,
149    struct sysctl_oid *);
150static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *);
151#endif
152static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
153static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
154#ifdef TCP_OFFLOAD
155static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
156#endif
157static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *);
158static int free_eq(struct adapter *, struct sge_eq *);
159static int alloc_wrq(struct adapter *, struct port_info *, struct sge_wrq *,
160    struct sysctl_oid *);
161static int free_wrq(struct adapter *, struct sge_wrq *);
162static int alloc_txq(struct port_info *, struct sge_txq *, int,
163    struct sysctl_oid *);
164static int free_txq(struct port_info *, struct sge_txq *);
165static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
166static inline bool is_new_response(const struct sge_iq *, struct rsp_ctrl **);
167static inline void iq_next(struct sge_iq *);
168static inline void ring_fl_db(struct adapter *, struct sge_fl *);
169static int refill_fl(struct adapter *, struct sge_fl *, int);
170static void refill_sfl(void *);
171static int alloc_fl_sdesc(struct sge_fl *);
172static void free_fl_sdesc(struct sge_fl *);
173static void set_fl_tag_idx(struct sge_fl *, int);
174static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
175
176static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
177static int free_pkt_sgl(struct sge_txq *, struct sgl *);
178static int write_txpkt_wr(struct port_info *, struct sge_txq *, struct mbuf *,
179    struct sgl *);
180static int add_to_txpkts(struct port_info *, struct sge_txq *, struct txpkts *,
181    struct mbuf *, struct sgl *);
182static void write_txpkts_wr(struct sge_txq *, struct txpkts *);
183static inline void write_ulp_cpl_sgl(struct port_info *, struct sge_txq *,
184    struct txpkts *, struct mbuf *, struct sgl *);
185static int write_sgl_to_txd(struct sge_eq *, struct sgl *, caddr_t *);
186static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
187static inline void ring_eq_db(struct adapter *, struct sge_eq *);
188static inline int reclaimable(struct sge_eq *);
189static int reclaim_tx_descs(struct sge_txq *, int, int);
190static void write_eqflush_wr(struct sge_eq *);
191static __be64 get_flit(bus_dma_segment_t *, int, int);
192static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
193    struct mbuf *);
194static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
195    struct mbuf *);
196
197static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
198
199#if defined(__i386__) || defined(__amd64__)
200extern u_int cpu_clflush_line_size;
201#endif
202
203/*
204 * Called on MOD_LOAD.  Fills up fl_buf_info[] and validates/calculates the SGE
205 * tunables.
206 */
207void
208t4_sge_modload(void)
209{
210	int i;
211	int bufsize[FL_BUF_SIZES] = {
212		MCLBYTES,
213#if MJUMPAGESIZE != MCLBYTES
214		MJUMPAGESIZE,
215#endif
216		MJUM9BYTES,
217		MJUM16BYTES
218	};
219
220	for (i = 0; i < FL_BUF_SIZES; i++) {
221		FL_BUF_SIZE(i) = bufsize[i];
222		FL_BUF_TYPE(i) = m_gettype(bufsize[i]);
223		FL_BUF_ZONE(i) = m_getzone(bufsize[i]);
224	}
225
226	if (fl_pktshift < 0 || fl_pktshift > 7) {
227		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
228		    " using 2 instead.\n", fl_pktshift);
229		fl_pktshift = 2;
230	}
231
232	if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) {
233		int pad;
234
235#if defined(__i386__) || defined(__amd64__)
236		pad = max(cpu_clflush_line_size, 32);
237#else
238		pad = max(CACHE_LINE_SIZE, 32);
239#endif
240		pad = min(pad, 4096);
241
242		if (fl_pad != -1) {
243			printf("Invalid hw.cxgbe.fl_pad value (%d),"
244			    " using %d instead.\n", fl_pad, pad);
245		}
246		fl_pad = pad;
247	}
248
249	if (spg_len != 64 && spg_len != 128) {
250		int len;
251
252#if defined(__i386__) || defined(__amd64__)
253		len = cpu_clflush_line_size > 64 ? 128 : 64;
254#else
255		len = 64;
256#endif
257		if (spg_len != -1) {
258			printf("Invalid hw.cxgbe.spg_len value (%d),"
259			    " using %d instead.\n", spg_len, len);
260		}
261		spg_len = len;
262	}
263
264	if (cong_drop < -1 || cong_drop > 1) {
265		printf("Invalid hw.cxgbe.cong_drop value (%d),"
266		    " using 0 instead.\n", cong_drop);
267		cong_drop = 0;
268	}
269}
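
/*
 * Example of the defaults computed above: on a typical amd64 machine with a
 * 64 byte cache line, and with fl_pad and spg_len left at -1, both end up at
 * 64, fl_pktshift stays at 2, and cong_drop stays at 0.
 */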
270
271void
272t4_init_sge_cpl_handlers(struct adapter *sc)
273{
274
275	t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_msg);
276	t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_msg);
277	t4_register_cpl_handler(sc, CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
278	t4_register_cpl_handler(sc, CPL_RX_PKT, t4_eth_rx);
279
280	t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
281}
282
283/*
284 * adap->params.vpd.cclk must be set up before this is called.
285 */
286void
287t4_tweak_chip_settings(struct adapter *sc)
288{
289	int i;
290	uint32_t v, m;
291	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
292	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
293	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
294	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
295
296	KASSERT(sc->flags & MASTER_PF,
297	    ("%s: trying to change chip settings when not master.", __func__));
298
299	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE |
300	    V_INGPADBOUNDARY(M_INGPADBOUNDARY) | F_EGRSTATUSPAGESIZE;
301	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
302	    V_INGPADBOUNDARY(ilog2(fl_pad) - 5) |
303	    V_EGRSTATUSPAGESIZE(spg_len == 128);
304	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
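	/*
	 * Example with the default tunables: fl_pad = 64 yields an
	 * INGPADBOUNDARY field of ilog2(64) - 5 = 1 (the boundary is
	 * 2^(value + 5) bytes, which is why ilog2(fl_pad) - 5 is written
	 * above), and spg_len = 64 leaves EGRSTATUSPAGESIZE at 0.
	 */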
305
306	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
307	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
308	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
309	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
310	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
311	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
312	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
313	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
314	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
315
316	for (i = 0; i < FL_BUF_SIZES; i++) {
317		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
318		    FL_BUF_SIZE(i));
319	}
320
321	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
322	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
323	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
324
325	KASSERT(intr_timer[0] <= timer_max,
326	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
327	    timer_max));
328	for (i = 1; i < nitems(intr_timer); i++) {
329		KASSERT(intr_timer[i] >= intr_timer[i - 1],
330		    ("%s: timers not listed in increasing order (%d)",
331		    __func__, i));
332
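		/*
		 * A timer that exceeds the largest value the hardware can hold
		 * (timer_max) is blended with the previous, smaller timer until
		 * it fits; the very last entry is simply clamped to timer_max.
		 */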
333		while (intr_timer[i] > timer_max) {
334			if (i == nitems(intr_timer) - 1) {
335				intr_timer[i] = timer_max;
336				break;
337			}
338			intr_timer[i] += intr_timer[i - 1];
339			intr_timer[i] /= 2;
340		}
341	}
342
343	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
344	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
345	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
346	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
347	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
348	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
349	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
350	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
351	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
352
353	if (cong_drop == 0) {
354		m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 |
355		    F_TUNNELCNGDROP3;
356		t4_set_reg_field(sc, A_TP_PARA_REG3, m, 0);
357	}
358
359	/* 4K, 16K, 64K, 256K DDP "page sizes" */
360	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
361	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
362
363	m = v = F_TDDPTAGTCB;
364	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
365
366	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
367	    F_RESETDDPOFFSET;
368	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
369	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
370}
371
372/*
373 * XXX: driver really should be able to deal with unexpected settings.
374 */
375int
376t4_read_chip_settings(struct adapter *sc)
377{
378	struct sge *s = &sc->sge;
379	int i, rc = 0;
380	uint32_t m, v, r;
381	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
382
383	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE |
384	    V_INGPADBOUNDARY(M_INGPADBOUNDARY) | F_EGRSTATUSPAGESIZE;
385	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
386	    V_INGPADBOUNDARY(ilog2(fl_pad) - 5) |
387	    V_EGRSTATUSPAGESIZE(spg_len == 128);
388	r = t4_read_reg(sc, A_SGE_CONTROL);
389	if ((r & m) != v) {
390		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
391		rc = EINVAL;
392	}
393
394	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
395	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
396	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
397	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
398	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
399	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
400	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
401	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
402	r = t4_read_reg(sc, A_SGE_HOST_PAGE_SIZE);
403	if (r != v) {
404		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
405		rc = EINVAL;
406	}
407
408	for (i = 0; i < FL_BUF_SIZES; i++) {
409		v = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
410		if (v != FL_BUF_SIZE(i)) {
411			device_printf(sc->dev,
412			    "invalid SGE_FL_BUFFER_SIZE[%d](0x%x)\n", i, v);
413			rc = EINVAL;
414		}
415	}
416
417	r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
418	s->counter_val[0] = G_THRESHOLD_0(r);
419	s->counter_val[1] = G_THRESHOLD_1(r);
420	s->counter_val[2] = G_THRESHOLD_2(r);
421	s->counter_val[3] = G_THRESHOLD_3(r);
422
423	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_0_AND_1);
424	s->timer_val[0] = G_TIMERVALUE0(r) / core_ticks_per_usec(sc);
425	s->timer_val[1] = G_TIMERVALUE1(r) / core_ticks_per_usec(sc);
426	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_2_AND_3);
427	s->timer_val[2] = G_TIMERVALUE2(r) / core_ticks_per_usec(sc);
428	s->timer_val[3] = G_TIMERVALUE3(r) / core_ticks_per_usec(sc);
429	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_4_AND_5);
430	s->timer_val[4] = G_TIMERVALUE4(r) / core_ticks_per_usec(sc);
431	s->timer_val[5] = G_TIMERVALUE5(r) / core_ticks_per_usec(sc);
432
433	if (cong_drop == 0) {
434		m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 |
435		    F_TUNNELCNGDROP3;
436		r = t4_read_reg(sc, A_TP_PARA_REG3);
437		if (r & m) {
438			device_printf(sc->dev,
439			    "invalid TP_PARA_REG3(0x%x)\n", r);
440			rc = EINVAL;
441		}
442	}
443
444	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
445	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
446	if (r != v) {
447		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
448		rc = EINVAL;
449	}
450
451	m = v = F_TDDPTAGTCB;
452	r = t4_read_reg(sc, A_ULP_RX_CTL);
453	if ((r & m) != v) {
454		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
455		rc = EINVAL;
456	}
457
458	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
459	    F_RESETDDPOFFSET;
460	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
461	r = t4_read_reg(sc, A_TP_PARA_REG5);
462	if ((r & m) != v) {
463		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
464		rc = EINVAL;
465	}
466
467	r = t4_read_reg(sc, A_SGE_CONM_CTRL);
468	s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1;
469
470	if (is_t5(sc)) {
471		r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF);
472		r >>= S_QUEUESPERPAGEPF0 +
473		    (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf;
474		s->s_qpp = r & M_QUEUESPERPAGEPF0;
475	}
476
477	t4_init_tp_params(sc);
478
479	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
480	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
481
482	return (rc);
483}
484
485int
486t4_create_dma_tag(struct adapter *sc)
487{
488	int rc;
489
490	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
491	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
492	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
493	    NULL, &sc->dmat);
494	if (rc != 0) {
495		device_printf(sc->dev,
496		    "failed to create main DMA tag: %d\n", rc);
497	}
498
499	return (rc);
500}
501
502int
503t4_destroy_dma_tag(struct adapter *sc)
504{
505	if (sc->dmat)
506		bus_dma_tag_destroy(sc->dmat);
507
508	return (0);
509}
510
511/*
512 * Allocate and initialize the firmware event queue and the management queue.
513 *
514 * Returns errno on failure.  Resources allocated up to that point may still be
515 * allocated.  Caller is responsible for cleanup in case this function fails.
516 */
517int
518t4_setup_adapter_queues(struct adapter *sc)
519{
520	int rc;
521
522	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
523
524	sysctl_ctx_init(&sc->ctx);
525	sc->flags |= ADAP_SYSCTL_CTX;
526
527	/*
528	 * Firmware event queue
529	 */
530	rc = alloc_fwq(sc);
531	if (rc != 0)
532		return (rc);
533
534	/*
535	 * Management queue.  This is just a control queue that uses the fwq as
536	 * its associated iq.
537	 */
538	rc = alloc_mgmtq(sc);
539
540	return (rc);
541}
542
543/*
544 * Idempotent
545 */
546int
547t4_teardown_adapter_queues(struct adapter *sc)
548{
549
550	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
551
552	/* Do this before freeing the queues */
553	if (sc->flags & ADAP_SYSCTL_CTX) {
554		sysctl_ctx_free(&sc->ctx);
555		sc->flags &= ~ADAP_SYSCTL_CTX;
556	}
557
558	free_mgmtq(sc);
559	free_fwq(sc);
560
561	return (0);
562}
563
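/*
 * First interrupt vector available to this port's queues.  T4_EXTRA_INTR
 * vectors are reserved ahead of all port vectors, and every port that precedes
 * this one consumes one vector per rx queue that takes direct interrupts.
 */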
564static inline int
565first_vector(struct port_info *pi)
566{
567	struct adapter *sc = pi->adapter;
568	int rc = T4_EXTRA_INTR, i;
569
570	if (sc->intr_count == 1)
571		return (0);
572
573	for_each_port(sc, i) {
574		struct port_info *p = sc->port[i];
575
576		if (i == pi->port_id)
577			break;
578
579#ifdef TCP_OFFLOAD
580		if (sc->flags & INTR_DIRECT)
581			rc += p->nrxq + p->nofldrxq;
582		else
583			rc += max(p->nrxq, p->nofldrxq);
584#else
585		/*
586		 * Not compiled with offload support and intr_count > 1.  Only
587		 * NIC queues exist and they'd better be taking direct
588		 * interrupts.
589		 */
590		KASSERT(sc->flags & INTR_DIRECT,
591		    ("%s: intr_count %d, !INTR_DIRECT", __func__,
592		    sc->intr_count));
593
594		rc += p->nrxq;
595#endif
596	}
597
598	return (rc);
599}
600
601/*
602 * Given an arbitrary "index," come up with an iq that can be used by other
603 * queues (of this port) for interrupt forwarding, SGE egress updates, etc.
604 * The iq returned is guaranteed to be something that takes direct interrupts.
605 */
606static struct sge_iq *
607port_intr_iq(struct port_info *pi, int idx)
608{
609	struct adapter *sc = pi->adapter;
610	struct sge *s = &sc->sge;
611	struct sge_iq *iq = NULL;
612
613	if (sc->intr_count == 1)
614		return (&sc->sge.fwq);
615
616#ifdef TCP_OFFLOAD
617	if (sc->flags & INTR_DIRECT) {
618		idx %= pi->nrxq + pi->nofldrxq;
619
620		if (idx >= pi->nrxq) {
621			idx -= pi->nrxq;
622			iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq;
623		} else
624			iq = &s->rxq[pi->first_rxq + idx].iq;
625
626	} else {
627		idx %= max(pi->nrxq, pi->nofldrxq);
628
629		if (pi->nrxq >= pi->nofldrxq)
630			iq = &s->rxq[pi->first_rxq + idx].iq;
631		else
632			iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq;
633	}
634#else
635	/*
636	 * Not compiled with offload support and intr_count > 1.  Only NIC
637	 * queues exist and they'd better be taking direct interrupts.
638	 */
639	KASSERT(sc->flags & INTR_DIRECT,
640	    ("%s: intr_count %d, !INTR_DIRECT", __func__, sc->intr_count));
641
642	idx %= pi->nrxq;
643	iq = &s->rxq[pi->first_rxq + idx].iq;
644#endif
645
646	KASSERT(iq->flags & IQ_INTR, ("%s: EDOOFUS", __func__));
647	return (iq);
648}
649
650static inline int
651mtu_to_bufsize(int mtu)
652{
653	int bufsize;
654
655	/* large enough for a frame even when VLAN extraction is disabled */
656	bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu;
657	bufsize = roundup2(bufsize + fl_pktshift, fl_pad);
658
659	return (bufsize);
660}
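
/*
 * Example: with the standard 1500 byte MTU, the default fl_pktshift (2), and a
 * 64 byte fl_pad, the buffer size is ETHER_HDR_LEN (14) + ETHER_VLAN_ENCAP_LEN
 * (4) + 1500 = 1518 bytes, plus the 2 byte shift, rounded up to the 64 byte
 * boundary: 1536 bytes.
 */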
661
662int
663t4_setup_port_queues(struct port_info *pi)
664{
665	int rc = 0, i, j, intr_idx, iqid;
666	struct sge_rxq *rxq;
667	struct sge_txq *txq;
668	struct sge_wrq *ctrlq;
669#ifdef TCP_OFFLOAD
670	struct sge_ofld_rxq *ofld_rxq;
671	struct sge_wrq *ofld_txq;
672	struct sysctl_oid *oid2 = NULL;
673#endif
674	char name[16];
675	struct adapter *sc = pi->adapter;
676	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
677	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
678	int bufsize = mtu_to_bufsize(pi->ifp->if_mtu);
679
680	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
681	    NULL, "rx queues");
682
683#ifdef TCP_OFFLOAD
684	if (is_offload(sc)) {
685		oid2 = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq",
686		    CTLFLAG_RD, NULL,
687		    "rx queues for offloaded TCP connections");
688	}
689#endif
690
691	/* Interrupt vector to start from (when using multiple vectors) */
692	intr_idx = first_vector(pi);
693
694	/*
695	 * First pass over all rx queues (NIC and TOE):
696	 * a) initialize iq and fl
697	 * b) allocate queue iff it will take direct interrupts.
698	 */
699	for_each_rxq(pi, i, rxq) {
700
701		init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq,
702		    RX_IQ_ESIZE);
703
704		snprintf(name, sizeof(name), "%s rxq%d-fl",
705		    device_get_nameunit(pi->dev), i);
706		init_fl(&rxq->fl, pi->qsize_rxq / 8, bufsize, name);
707
708		if (sc->flags & INTR_DIRECT
709#ifdef TCP_OFFLOAD
710		    || (sc->intr_count > 1 && pi->nrxq >= pi->nofldrxq)
711#endif
712		   ) {
713			rxq->iq.flags |= IQ_INTR;
714			rc = alloc_rxq(pi, rxq, intr_idx, i, oid);
715			if (rc != 0)
716				goto done;
717			intr_idx++;
718		}
719	}
720
721#ifdef TCP_OFFLOAD
722	for_each_ofld_rxq(pi, i, ofld_rxq) {
723
724		init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
725		    pi->qsize_rxq, RX_IQ_ESIZE);
726
727		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
728		    device_get_nameunit(pi->dev), i);
729		init_fl(&ofld_rxq->fl, pi->qsize_rxq / 8, OFLD_BUF_SIZE, name);
730
731		if (sc->flags & INTR_DIRECT ||
732		    (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) {
733			ofld_rxq->iq.flags |= IQ_INTR;
734			rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid2);
735			if (rc != 0)
736				goto done;
737			intr_idx++;
738		}
739	}
740#endif
741
742	/*
743	 * Second pass over all rx queues (NIC and TOE).  The queues forwarding
744	 * their interrupts are allocated now.
745	 */
746	j = 0;
747	for_each_rxq(pi, i, rxq) {
748		if (rxq->iq.flags & IQ_INTR)
749			continue;
750
751		intr_idx = port_intr_iq(pi, j)->abs_id;
752
753		rc = alloc_rxq(pi, rxq, intr_idx, i, oid);
754		if (rc != 0)
755			goto done;
756		j++;
757	}
758
759#ifdef TCP_OFFLOAD
760	for_each_ofld_rxq(pi, i, ofld_rxq) {
761		if (ofld_rxq->iq.flags & IQ_INTR)
762			continue;
763
764		intr_idx = port_intr_iq(pi, j)->abs_id;
765
766		rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid2);
767		if (rc != 0)
768			goto done;
769		j++;
770	}
771#endif
772
773	/*
774	 * Now the tx queues.  Only one pass needed.
775	 */
776	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
777	    NULL, "tx queues");
778	j = 0;
779	for_each_txq(pi, i, txq) {
780		uint16_t iqid;
781
782		iqid = port_intr_iq(pi, j)->cntxt_id;
783
784		snprintf(name, sizeof(name), "%s txq%d",
785		    device_get_nameunit(pi->dev), i);
786		init_eq(&txq->eq, EQ_ETH, pi->qsize_txq, pi->tx_chan, iqid,
787		    name);
788
789		rc = alloc_txq(pi, txq, i, oid);
790		if (rc != 0)
791			goto done;
792		j++;
793	}
794
795#ifdef TCP_OFFLOAD
796	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq",
797	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
798	for_each_ofld_txq(pi, i, ofld_txq) {
799		uint16_t iqid;
800
801		iqid = port_intr_iq(pi, j)->cntxt_id;
802
803		snprintf(name, sizeof(name), "%s ofld_txq%d",
804		    device_get_nameunit(pi->dev), i);
805		init_eq(&ofld_txq->eq, EQ_OFLD, pi->qsize_txq, pi->tx_chan,
806		    iqid, name);
807
808		snprintf(name, sizeof(name), "%d", i);
809		oid2 = SYSCTL_ADD_NODE(&pi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
810		    name, CTLFLAG_RD, NULL, "offload tx queue");
811
812		rc = alloc_wrq(sc, pi, ofld_txq, oid2);
813		if (rc != 0)
814			goto done;
815		j++;
816	}
817#endif
818
819	/*
820	 * Finally, the control queue.
821	 */
822	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
823	    NULL, "ctrl queue");
824	ctrlq = &sc->sge.ctrlq[pi->port_id];
825	iqid = port_intr_iq(pi, 0)->cntxt_id;
826	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(pi->dev));
827	init_eq(&ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid, name);
828	rc = alloc_wrq(sc, pi, ctrlq, oid);
829
830done:
831	if (rc)
832		t4_teardown_port_queues(pi);
833
834	return (rc);
835}
836
837/*
838 * Idempotent
839 */
840int
841t4_teardown_port_queues(struct port_info *pi)
842{
843	int i;
844	struct adapter *sc = pi->adapter;
845	struct sge_rxq *rxq;
846	struct sge_txq *txq;
847#ifdef TCP_OFFLOAD
848	struct sge_ofld_rxq *ofld_rxq;
849	struct sge_wrq *ofld_txq;
850#endif
851
852	/* Do this before freeing the queues */
853	if (pi->flags & PORT_SYSCTL_CTX) {
854		sysctl_ctx_free(&pi->ctx);
855		pi->flags &= ~PORT_SYSCTL_CTX;
856	}
857
858	/*
859	 * Take down all the tx queues first, as they reference the rx queues
860	 * (for egress updates, etc.).
861	 */
862
863	free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
864
865	for_each_txq(pi, i, txq) {
866		free_txq(pi, txq);
867	}
868
869#ifdef TCP_OFFLOAD
870	for_each_ofld_txq(pi, i, ofld_txq) {
871		free_wrq(sc, ofld_txq);
872	}
873#endif
874
875	/*
876	 * Then take down the rx queues that forward their interrupts, as they
877	 * reference other rx queues.
878	 */
879
880	for_each_rxq(pi, i, rxq) {
881		if ((rxq->iq.flags & IQ_INTR) == 0)
882			free_rxq(pi, rxq);
883	}
884
885#ifdef TCP_OFFLOAD
886	for_each_ofld_rxq(pi, i, ofld_rxq) {
887		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
888			free_ofld_rxq(pi, ofld_rxq);
889	}
890#endif
891
892	/*
893	 * Then take down the rx queues that take direct interrupts.
894	 */
895
896	for_each_rxq(pi, i, rxq) {
897		if (rxq->iq.flags & IQ_INTR)
898			free_rxq(pi, rxq);
899	}
900
901#ifdef TCP_OFFLOAD
902	for_each_ofld_rxq(pi, i, ofld_rxq) {
903		if (ofld_rxq->iq.flags & IQ_INTR)
904			free_ofld_rxq(pi, ofld_rxq);
905	}
906#endif
907
908	return (0);
909}
910
911/*
912 * Deals with errors and the firmware event queue.  All data rx queues forward
913 * their interrupt to the firmware event queue.
914 */
915void
916t4_intr_all(void *arg)
917{
918	struct adapter *sc = arg;
919	struct sge_iq *fwq = &sc->sge.fwq;
920
921	t4_intr_err(arg);
922	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
923		service_iq(fwq, 0);
924		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
925	}
926}
927
928/* Deals with error interrupts */
929void
930t4_intr_err(void *arg)
931{
932	struct adapter *sc = arg;
933
934	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
935	t4_slow_intr_handler(sc);
936}
937
938void
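/*
 * The IQS_IDLE -> IQS_BUSY transition in the handlers below ensures that only
 * one thread services a given ingress queue at a time; if the queue is already
 * BUSY the new entries are left to the thread that is already in service_iq().
 */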
939t4_intr_evt(void *arg)
940{
941	struct sge_iq *iq = arg;
942
943	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
944		service_iq(iq, 0);
945		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
946	}
947}
948
949void
950t4_intr(void *arg)
951{
952	struct sge_iq *iq = arg;
953
954	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
955		service_iq(iq, 0);
956		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
957	}
958}
959
960/*
961 * Deals with anything and everything on the given ingress queue.
962 */
963static int
964service_iq(struct sge_iq *iq, int budget)
965{
966	struct sge_iq *q;
967	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
968	struct sge_fl *fl = &rxq->fl;		/* Use iff IQ_HAS_FL */
969	struct adapter *sc = iq->adapter;
970	struct rsp_ctrl *ctrl;
971	const struct rss_header *rss;
972	int ndescs = 0, limit, fl_bufs_used = 0;
973	int rsp_type;
974	uint32_t lq;
975	struct mbuf *m0;
976	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
977
978	limit = budget ? budget : iq->qsize / 8;
979
980	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
981
982	/*
983	 * We always come back and check the descriptor ring for new indirect
984	 * interrupts and other responses after running a single handler.
985	 */
986	for (;;) {
987		while (is_new_response(iq, &ctrl)) {
988
989			rmb();
990
991			m0 = NULL;
992			rsp_type = G_RSPD_TYPE(ctrl->u.type_gen);
993			lq = be32toh(ctrl->pldbuflen_qid);
994			rss = (const void *)iq->cdesc;
995
996			switch (rsp_type) {
997			case X_RSPD_TYPE_FLBUF:
998
999				KASSERT(iq->flags & IQ_HAS_FL,
1000				    ("%s: data for an iq (%p) with no freelist",
1001				    __func__, iq));
1002
1003				m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used);
1004#ifdef T4_PKT_TIMESTAMP
1005				/*
1006				 * 60 bit timestamp for the payload is
1007				 * *(uint64_t *)m0->m_pktdat.  Note that it is
1008				 * in the leading free-space in the mbuf.  The
1009				 * kernel can clobber it during a pullup,
1010				 * m_copymdata, etc.  You need to make sure that
1011				 * the mbuf reaches you unmolested if you care
1012				 * about the timestamp.
1013				 */
1014				*(uint64_t *)m0->m_pktdat =
1015				    be64toh(ctrl->u.last_flit) &
1016				    0xfffffffffffffff;
1017#endif
1018
1019				/* fall through */
1020
1021			case X_RSPD_TYPE_CPL:
1022				KASSERT(rss->opcode < NUM_CPL_CMDS,
1023				    ("%s: bad opcode %02x.", __func__,
1024				    rss->opcode));
1025				sc->cpl_handler[rss->opcode](iq, rss, m0);
1026				break;
1027
1028			case X_RSPD_TYPE_INTR:
1029
1030				/*
1031				 * Interrupts should be forwarded only to queues
1032				 * that are not forwarding their interrupts.
1033				 * This means service_iq can recurse but only 1
1034				 * level deep.
1035				 */
1036				KASSERT(budget == 0,
1037				    ("%s: budget %u, rsp_type %u", __func__,
1038				    budget, rsp_type));
1039
1040				q = sc->sge.iqmap[lq - sc->sge.iq_start];
1041				if (atomic_cmpset_int(&q->state, IQS_IDLE,
1042				    IQS_BUSY)) {
1043					if (service_iq(q, q->qsize / 8) == 0) {
1044						atomic_cmpset_int(&q->state,
1045						    IQS_BUSY, IQS_IDLE);
1046					} else {
1047						STAILQ_INSERT_TAIL(&iql, q,
1048						    link);
1049					}
1050				}
1051				break;
1052
1053			default:
1054				sc->an_handler(iq, ctrl);
1055				break;
1056			}
1057
1058			iq_next(iq);
1059			if (++ndescs == limit) {
1060				t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
1061				    V_CIDXINC(ndescs) |
1062				    V_INGRESSQID(iq->cntxt_id) |
1063				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1064				ndescs = 0;
1065
1066				if (fl_bufs_used > 0) {
1067					FL_LOCK(fl);
1068					fl->needed += fl_bufs_used;
1069					refill_fl(sc, fl, fl->cap / 8);
1070					FL_UNLOCK(fl);
1071					fl_bufs_used = 0;
1072				}
1073
1074				if (budget)
1075					return (EINPROGRESS);
1076			}
1077		}
1078
1079		if (STAILQ_EMPTY(&iql))
1080			break;
1081
1082		/*
1083		 * Process the head only, and send it to the back of the list if
1084		 * it's still not done.
1085		 */
1086		q = STAILQ_FIRST(&iql);
1087		STAILQ_REMOVE_HEAD(&iql, link);
1088		if (service_iq(q, q->qsize / 8) == 0)
1089			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1090		else
1091			STAILQ_INSERT_TAIL(&iql, q, link);
1092	}
1093
1094#if defined(INET) || defined(INET6)
1095	if (iq->flags & IQ_LRO_ENABLED) {
1096		struct lro_ctrl *lro = &rxq->lro;
1097		struct lro_entry *l;
1098
1099		while (!SLIST_EMPTY(&lro->lro_active)) {
1100			l = SLIST_FIRST(&lro->lro_active);
1101			SLIST_REMOVE_HEAD(&lro->lro_active, next);
1102			tcp_lro_flush(lro, l);
1103		}
1104	}
1105#endif
1106
1107	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) |
1108	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1109
1110	if (iq->flags & IQ_HAS_FL) {
1111		int starved;
1112
1113		FL_LOCK(fl);
1114		fl->needed += fl_bufs_used;
1115		starved = refill_fl(sc, fl, fl->cap / 4);
1116		FL_UNLOCK(fl);
1117		if (__predict_false(starved != 0))
1118			add_fl_to_sfl(sc, fl);
1119	}
1120
1121	return (0);
1122}
1123
1124static struct mbuf *
1125get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
1126    int *fl_bufs_used)
1127{
1128	struct mbuf *m0, *m;
1129	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1130	unsigned int nbuf, len;
1131
1132	/*
1133	 * No assertion for the fl lock because we don't need it.  This routine
1134	 * is called only from the rx interrupt handler and it only updates
1135	 * fl->cidx.  (Contrast that with fl->pidx/fl->needed which could be
1136	 * updated in the rx interrupt handler or the starvation helper routine.
1137	 * That's why code that manipulates fl->pidx/fl->needed needs the fl
1138	 * lock but this routine does not).
1139	 */
1140
1141	if (__predict_false((len_newbuf & F_RSPD_NEWBUF) == 0))
1142		panic("%s: cannot handle packed frames", __func__);
1143	len = G_RSPD_LEN(len_newbuf);
1144
1145	m0 = sd->m;
1146	sd->m = NULL;	/* consumed */
1147
1148	bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD);
1149	m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR);
1150#ifdef T4_PKT_TIMESTAMP
1151	/* Leave room for a timestamp */
1152	m0->m_data += 8;
1153#endif
1154
1155	if (len < RX_COPY_THRESHOLD) {
1156		/* copy data to mbuf, buffer will be recycled */
1157		bcopy(sd->cl, mtod(m0, caddr_t), len);
1158		m0->m_len = len;
1159	} else {
1160		bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
1161		m_cljset(m0, sd->cl, FL_BUF_TYPE(sd->tag_idx));
1162		sd->cl = NULL;	/* consumed */
1163		m0->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
1164	}
1165	m0->m_pkthdr.len = len;
1166
1167	sd++;
1168	if (__predict_false(++fl->cidx == fl->cap)) {
1169		sd = fl->sdesc;
1170		fl->cidx = 0;
1171	}
1172
1173	m = m0;
1174	len -= m->m_len;
1175	nbuf = 1;	/* # of fl buffers used */
1176
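	/*
	 * A frame larger than a single freelist buffer spans several buffers;
	 * chain the remaining ones onto m0 here.
	 */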
1177	while (len > 0) {
1178		m->m_next = sd->m;
1179		sd->m = NULL;	/* consumed */
1180		m = m->m_next;
1181
1182		bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
1183		    BUS_DMASYNC_POSTREAD);
1184
1185		m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
1186		if (len <= MLEN) {
1187			bcopy(sd->cl, mtod(m, caddr_t), len);
1188			m->m_len = len;
1189		} else {
1190			bus_dmamap_unload(fl->tag[sd->tag_idx],
1191			    sd->map);
1192			m_cljset(m, sd->cl, FL_BUF_TYPE(sd->tag_idx));
1193			sd->cl = NULL;	/* consumed */
1194			m->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
1195		}
1196
1197		sd++;
1198		if (__predict_false(++fl->cidx == fl->cap)) {
1199			sd = fl->sdesc;
1200			fl->cidx = 0;
1201		}
1202
1203		len -= m->m_len;
1204		nbuf++;
1205	}
1206
1207	(*fl_bufs_used) += nbuf;
1208
1209	return (m0);
1210}
1211
1212static int
1213t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
1214{
1215	struct sge_rxq *rxq = iq_to_rxq(iq);
1216	struct ifnet *ifp = rxq->ifp;
1217	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
1218#if defined(INET) || defined(INET6)
1219	struct lro_ctrl *lro = &rxq->lro;
1220#endif
1221
1222	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
1223	    rss->opcode));
1224
1225	m0->m_pkthdr.len -= fl_pktshift;
1226	m0->m_len -= fl_pktshift;
1227	m0->m_data += fl_pktshift;
1228
1229	m0->m_pkthdr.rcvif = ifp;
1230	m0->m_flags |= M_FLOWID;
1231	m0->m_pkthdr.flowid = rss->hash_val;
1232
1233	if (cpl->csum_calc && !cpl->err_vec) {
1234		if (ifp->if_capenable & IFCAP_RXCSUM &&
1235		    cpl->l2info & htobe32(F_RXF_IP)) {
1236			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
1237			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1238			rxq->rxcsum++;
1239		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
1240		    cpl->l2info & htobe32(F_RXF_IP6)) {
1241			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
1242			    CSUM_PSEUDO_HDR);
1243			rxq->rxcsum++;
1244		}
1245
1246		if (__predict_false(cpl->ip_frag))
1247			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
1248		else
1249			m0->m_pkthdr.csum_data = 0xffff;
1250	}
1251
1252	if (cpl->vlan_ex) {
1253		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
1254		m0->m_flags |= M_VLANTAG;
1255		rxq->vlan_extraction++;
1256	}
1257
1258#if defined(INET) || defined(INET6)
1259	if (cpl->l2info & htobe32(F_RXF_LRO) &&
1260	    iq->flags & IQ_LRO_ENABLED &&
1261	    tcp_lro_rx(lro, m0, 0) == 0) {
1262		/* queued for LRO */
1263	} else
1264#endif
1265	ifp->if_input(ifp, m0);
1266
1267	return (0);
1268}
1269
1270/*
1271 * Doesn't fail.  Holds on to work requests it can't send right away.
1272 */
1273void
1274t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
1275{
1276	struct sge_eq *eq = &wrq->eq;
1277	int can_reclaim;
1278	caddr_t dst;
1279
1280	TXQ_LOCK_ASSERT_OWNED(wrq);
1281#ifdef TCP_OFFLOAD
1282	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD ||
1283	    (eq->flags & EQ_TYPEMASK) == EQ_CTRL,
1284	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
1285#else
1286	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL,
1287	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
1288#endif
1289
1290	if (__predict_true(wr != NULL))
1291		STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
1292
1293	can_reclaim = reclaimable(eq);
1294	if (__predict_false(eq->flags & EQ_STALLED)) {
1295		if (can_reclaim < tx_resume_threshold(eq))
1296			return;
1297		eq->flags &= ~EQ_STALLED;
1298		eq->unstalled++;
1299	}
1300	eq->cidx += can_reclaim;
1301	eq->avail += can_reclaim;
1302	if (__predict_false(eq->cidx >= eq->cap))
1303		eq->cidx -= eq->cap;
1304
1305	while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) {
1306		int ndesc;
1307
1308		if (__predict_false(wr->wr_len < 0 ||
1309		    wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) {
1310
1311#ifdef INVARIANTS
1312			panic("%s: work request with length %d", __func__,
1313			    wr->wr_len);
1314#endif
1315#ifdef KDB
1316			kdb_backtrace();
1317#endif
1318			log(LOG_ERR, "%s: %s work request with length %d",
1319			    device_get_nameunit(sc->dev), __func__, wr->wr_len);
1320			STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1321			free_wrqe(wr);
1322			continue;
1323		}
1324
1325		ndesc = howmany(wr->wr_len, EQ_ESIZE);
1326		if (eq->avail < ndesc) {
1327			wrq->no_desc++;
1328			break;
1329		}
1330
1331		dst = (void *)&eq->desc[eq->pidx];
1332		copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len);
1333
1334		eq->pidx += ndesc;
1335		eq->avail -= ndesc;
1336		if (__predict_false(eq->pidx >= eq->cap))
1337			eq->pidx -= eq->cap;
1338
1339		eq->pending += ndesc;
1340		if (eq->pending >= 8)
1341			ring_eq_db(sc, eq);
1342
1343		wrq->tx_wrs++;
1344		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1345		free_wrqe(wr);
1346
1347		if (eq->avail < 8) {
1348			can_reclaim = reclaimable(eq);
1349			eq->cidx += can_reclaim;
1350			eq->avail += can_reclaim;
1351			if (__predict_false(eq->cidx >= eq->cap))
1352				eq->cidx -= eq->cap;
1353		}
1354	}
1355
1356	if (eq->pending)
1357		ring_eq_db(sc, eq);
1358
1359	if (wr != NULL) {
1360		eq->flags |= EQ_STALLED;
1361		if (callout_pending(&eq->tx_callout) == 0)
1362			callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
1363	}
1364}
1365
1366/* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */
1367#define TXPKTS_PKT_HDR ((\
1368    sizeof(struct ulp_txpkt) + \
1369    sizeof(struct ulptx_idata) + \
1370    sizeof(struct cpl_tx_pkt_core) \
1371    ) / 8)
1372
1373/* Header of a coalesced tx WR, before SGL of first packet (in flits) */
1374#define TXPKTS_WR_HDR (\
1375    sizeof(struct fw_eth_tx_pkts_wr) / 8 + \
1376    TXPKTS_PKT_HDR)
1377
1378/* Header of a tx WR, before SGL of first packet (in flits) */
1379#define TXPKT_WR_HDR ((\
1380    sizeof(struct fw_eth_tx_pkt_wr) + \
1381    sizeof(struct cpl_tx_pkt_core) \
1382    ) / 8 )
1383
1384/* Header of a tx LSO WR, before SGL of first packet (in flits) */
1385#define TXPKT_LSO_WR_HDR ((\
1386    sizeof(struct fw_eth_tx_pkt_wr) + \
1387    sizeof(struct cpl_tx_pkt_lso_core) + \
1388    sizeof(struct cpl_tx_pkt_core) \
1389    ) / 8 )
1390
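
/*
 * All of the header sizes above are in flits: the 8 byte (64-bit) units in
 * which the SGE consumes work requests, hence the divisions by 8.
 */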
1391int
1392t4_eth_tx(struct ifnet *ifp, struct sge_txq *txq, struct mbuf *m)
1393{
1394	struct port_info *pi = (void *)ifp->if_softc;
1395	struct adapter *sc = pi->adapter;
1396	struct sge_eq *eq = &txq->eq;
1397	struct buf_ring *br = txq->br;
1398	struct mbuf *next;
1399	int rc, coalescing, can_reclaim;
1400	struct txpkts txpkts;
1401	struct sgl sgl;
1402
1403	TXQ_LOCK_ASSERT_OWNED(txq);
1404	KASSERT(m, ("%s: called with nothing to do.", __func__));
1405	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_ETH,
1406	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
1407
1408	prefetch(&eq->desc[eq->pidx]);
1409	prefetch(&txq->sdesc[eq->pidx]);
1410
1411	txpkts.npkt = 0;/* indicates there's nothing in txpkts */
1412	coalescing = 0;
1413
1414	can_reclaim = reclaimable(eq);
1415	if (__predict_false(eq->flags & EQ_STALLED)) {
1416		if (can_reclaim < tx_resume_threshold(eq)) {
1417			txq->m = m;
1418			return (0);
1419		}
1420		eq->flags &= ~EQ_STALLED;
1421		eq->unstalled++;
1422	}
1423
1424	if (__predict_false(eq->flags & EQ_DOOMED)) {
1425		m_freem(m);
1426		while ((m = buf_ring_dequeue_sc(txq->br)) != NULL)
1427			m_freem(m);
1428		return (ENETDOWN);
1429	}
1430
1431	if (eq->avail < 8 && can_reclaim)
1432		reclaim_tx_descs(txq, can_reclaim, 32);
1433
1434	for (; m; m = next ? next : drbr_dequeue(ifp, br)) {
1435
1436		if (eq->avail < 8)
1437			break;
1438
1439		next = m->m_nextpkt;
1440		m->m_nextpkt = NULL;
1441
1442		if (next || buf_ring_peek(br))
1443			coalescing = 1;
1444
1445		rc = get_pkt_sgl(txq, &m, &sgl, coalescing);
1446		if (rc != 0) {
1447			if (rc == ENOMEM) {
1448
1449				/* Short of resources, suspend tx */
1450
1451				m->m_nextpkt = next;
1452				break;
1453			}
1454
1455			/*
1456			 * Unrecoverable error for this packet, throw it away
1457			 * and move on to the next.  get_pkt_sgl may already
1458			 * have freed m (it will be NULL in that case and the
1459			 * m_freem here is still safe).
1460			 */
1461
1462			m_freem(m);
1463			continue;
1464		}
1465
1466		if (coalescing &&
1467		    add_to_txpkts(pi, txq, &txpkts, m, &sgl) == 0) {
1468
1469			/* Successfully absorbed into txpkts */
1470
1471			write_ulp_cpl_sgl(pi, txq, &txpkts, m, &sgl);
1472			goto doorbell;
1473		}
1474
1475		/*
1476		 * We weren't coalescing to begin with, or current frame could
1477		 * not be coalesced (add_to_txpkts flushes txpkts if a frame
1478		 * given to it can't be coalesced).  Either way there should be
1479		 * nothing in txpkts.
1480		 */
1481		KASSERT(txpkts.npkt == 0,
1482		    ("%s: txpkts not empty: %d", __func__, txpkts.npkt));
1483
1484		/* We're sending out individual packets now */
1485		coalescing = 0;
1486
1487		if (eq->avail < 8)
1488			reclaim_tx_descs(txq, 0, 8);
1489		rc = write_txpkt_wr(pi, txq, m, &sgl);
1490		if (rc != 0) {
1491
1492			/* Short of hardware descriptors, suspend tx */
1493
1494			/*
1495			 * This is an unlikely but expensive failure.  We've
1496			 * done all the hard work (DMA mappings etc.) and now we
1497			 * can't send out the packet.  What's worse, we have to
1498			 * spend even more time freeing up everything in sgl.
1499			 */
1500			txq->no_desc++;
1501			free_pkt_sgl(txq, &sgl);
1502
1503			m->m_nextpkt = next;
1504			break;
1505		}
1506
1507		ETHER_BPF_MTAP(ifp, m);
1508		if (sgl.nsegs == 0)
1509			m_freem(m);
1510doorbell:
1511		if (eq->pending >= 8)
1512			ring_eq_db(sc, eq);
1513
1514		can_reclaim = reclaimable(eq);
1515		if (can_reclaim >= 32)
1516			reclaim_tx_descs(txq, can_reclaim, 64);
1517	}
1518
1519	if (txpkts.npkt > 0)
1520		write_txpkts_wr(txq, &txpkts);
1521
1522	/*
1523	 * m not NULL means there was an error but we haven't thrown it away.
1524	 * This can happen when we're short of tx descriptors (no_desc) or maybe
1525	 * even DMA maps (no_dmamap).  Either way, a credit flush and reclaim
1526	 * will get things going again.
1527	 */
1528	if (m && !(eq->flags & EQ_CRFLUSHED)) {
1529		struct tx_sdesc *txsd = &txq->sdesc[eq->pidx];
1530
1531		/*
1532		 * If EQ_CRFLUSHED is not set then we know we have at least one
1533		 * available descriptor because any WR that reduces eq->avail to
1534		 * 0 also sets EQ_CRFLUSHED.
1535		 */
1536		KASSERT(eq->avail > 0, ("%s: no space for eqflush.", __func__));
1537
1538		txsd->desc_used = 1;
1539		txsd->credits = 0;
1540		write_eqflush_wr(eq);
1541	}
1542	txq->m = m;
1543
1544	if (eq->pending)
1545		ring_eq_db(sc, eq);
1546
1547	reclaim_tx_descs(txq, 0, 128);
1548
1549	if (eq->flags & EQ_STALLED && callout_pending(&eq->tx_callout) == 0)
1550		callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
1551
1552	return (0);
1553}
1554
1555void
1556t4_update_fl_bufsize(struct ifnet *ifp)
1557{
1558	struct port_info *pi = ifp->if_softc;
1559	struct sge_rxq *rxq;
1560	struct sge_fl *fl;
1561	int i, bufsize = mtu_to_bufsize(ifp->if_mtu);
1562
1563	for_each_rxq(pi, i, rxq) {
1564		fl = &rxq->fl;
1565
1566		FL_LOCK(fl);
1567		set_fl_tag_idx(fl, bufsize);
1568		FL_UNLOCK(fl);
1569	}
1570}
1571
1572int
1573can_resume_tx(struct sge_eq *eq)
1574{
1575	return (reclaimable(eq) >= tx_resume_threshold(eq));
1576}
1577
1578static inline void
1579init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
1580    int qsize, int esize)
1581{
1582	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
1583	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
1584	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
1585	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
1586
1587	iq->flags = 0;
1588	iq->adapter = sc;
1589	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
1590	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
1591	if (pktc_idx >= 0) {
1592		iq->intr_params |= F_QINTR_CNT_EN;
1593		iq->intr_pktc_idx = pktc_idx;
1594	}
1595	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
1596	iq->esize = max(esize, 16);		/* See FW_IQ_CMD/iqesize */
1597}
1598
1599static inline void
1600init_fl(struct sge_fl *fl, int qsize, int bufsize, char *name)
1601{
1602	fl->qsize = qsize;
1603	strlcpy(fl->lockname, name, sizeof(fl->lockname));
1604	set_fl_tag_idx(fl, bufsize);
1605}
1606
1607static inline void
1608init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan,
1609    uint16_t iqid, char *name)
1610{
1611	KASSERT(tx_chan < NCHAN, ("%s: bad tx channel %d", __func__, tx_chan));
1612	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
1613
1614	eq->flags = eqtype & EQ_TYPEMASK;
1615	eq->tx_chan = tx_chan;
1616	eq->iqid = iqid;
1617	eq->qsize = qsize;
1618	strlcpy(eq->lockname, name, sizeof(eq->lockname));
1619
1620	TASK_INIT(&eq->tx_task, 0, t4_tx_task, eq);
1621	callout_init(&eq->tx_callout, CALLOUT_MPSAFE);
1622}
1623
1624static int
1625alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
1626    bus_dmamap_t *map, bus_addr_t *pa, void **va)
1627{
1628	int rc;
1629
1630	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
1631	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
1632	if (rc != 0) {
1633		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
1634		goto done;
1635	}
1636
1637	rc = bus_dmamem_alloc(*tag, va,
1638	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
1639	if (rc != 0) {
1640		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
1641		goto done;
1642	}
1643
1644	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
1645	if (rc != 0) {
1646		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
1647		goto done;
1648	}
1649done:
1650	if (rc)
1651		free_ring(sc, *tag, *map, *pa, *va);
1652
1653	return (rc);
1654}
1655
1656static int
1657free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
1658    bus_addr_t pa, void *va)
1659{
1660	if (pa)
1661		bus_dmamap_unload(tag, map);
1662	if (va)
1663		bus_dmamem_free(tag, va, map);
1664	if (tag)
1665		bus_dma_tag_destroy(tag);
1666
1667	return (0);
1668}
1669
1670/*
1671 * Allocates the ring for an ingress queue and an optional freelist.  If the
1672 * freelist is specified it will be allocated and then associated with the
1673 * ingress queue.
1674 *
1675 * Returns errno on failure.  Resources allocated up to that point may still be
1676 * allocated.  Caller is responsible for cleanup in case this function fails.
1677 *
1678 * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
1679 * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
1680 * the abs_id of the ingress queue to which its interrupts should be forwarded.
1681 */
1682static int
1683alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
1684    int intr_idx, int cong)
1685{
1686	int rc, i, cntxt_id;
1687	size_t len;
1688	struct fw_iq_cmd c;
1689	struct adapter *sc = iq->adapter;
1690	__be32 v = 0;
1691
1692	len = iq->qsize * iq->esize;
1693	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
1694	    (void **)&iq->desc);
1695	if (rc != 0)
1696		return (rc);
1697
1698	bzero(&c, sizeof(c));
1699	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
1700	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
1701	    V_FW_IQ_CMD_VFN(0));
1702
1703	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
1704	    FW_LEN16(c));
1705
1706	/* Special handling for firmware event queue */
1707	if (iq == &sc->sge.fwq)
1708		v |= F_FW_IQ_CMD_IQASYNCH;
1709
1710	if (iq->flags & IQ_INTR) {
1711		KASSERT(intr_idx < sc->intr_count,
1712		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
1713	} else
1714		v |= F_FW_IQ_CMD_IQANDST;
1715	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
1716
1717	c.type_to_iqandstindex = htobe32(v |
1718	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
1719	    V_FW_IQ_CMD_VIID(pi->viid) |
1720	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
1721	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
1722	    F_FW_IQ_CMD_IQGTSMODE |
1723	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
1724	    V_FW_IQ_CMD_IQESIZE(ilog2(iq->esize) - 4));
1725	c.iqsize = htobe16(iq->qsize);
1726	c.iqaddr = htobe64(iq->ba);
1727	if (cong >= 0)
1728		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
1729
1730	if (fl) {
1731		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
1732
1733		for (i = 0; i < FL_BUF_SIZES; i++) {
1734
1735			/*
1736			 * A freelist buffer must be 16 byte aligned as the SGE
1737			 * uses the low 4 bits of the bus addr to figure out the
1738			 * buffer size.
1739			 */
1740			rc = bus_dma_tag_create(sc->dmat, 16, 0,
1741			    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
1742			    FL_BUF_SIZE(i), 1, FL_BUF_SIZE(i), BUS_DMA_ALLOCNOW,
1743			    NULL, NULL, &fl->tag[i]);
1744			if (rc != 0) {
1745				device_printf(sc->dev,
1746				    "failed to create fl DMA tag[%d]: %d\n",
1747				    i, rc);
1748				return (rc);
1749			}
1750		}
1751		len = fl->qsize * RX_FL_ESIZE;
1752		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
1753		    &fl->ba, (void **)&fl->desc);
1754		if (rc)
1755			return (rc);
1756
1757		/* Allocate space for one software descriptor per buffer. */
1758		fl->cap = (fl->qsize - spg_len / RX_FL_ESIZE) * 8;
1759		FL_LOCK(fl);
1760		rc = alloc_fl_sdesc(fl);
1761		FL_UNLOCK(fl);
1762		if (rc != 0) {
1763			device_printf(sc->dev,
1764			    "failed to setup fl software descriptors: %d\n",
1765			    rc);
1766			return (rc);
1767		}
1768		fl->needed = fl->cap;
1769		fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8);
1770
1771		c.iqns_to_fl0congen |=
1772		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
1773			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
1774			F_FW_IQ_CMD_FL0PADEN);
1775		if (cong >= 0) {
1776			c.iqns_to_fl0congen |=
1777				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
1778				    F_FW_IQ_CMD_FL0CONGCIF |
1779				    F_FW_IQ_CMD_FL0CONGEN);
1780		}
1781		c.fl0dcaen_to_fl0cidxfthresh =
1782		    htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_64B) |
1783			V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B));
1784		c.fl0size = htobe16(fl->qsize);
1785		c.fl0addr = htobe64(fl->ba);
1786	}
1787
1788	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
1789	if (rc != 0) {
1790		device_printf(sc->dev,
1791		    "failed to create ingress queue: %d\n", rc);
1792		return (rc);
1793	}
1794
1795	iq->cdesc = iq->desc;
1796	iq->cidx = 0;
1797	iq->gen = 1;
1798	iq->intr_next = iq->intr_params;
1799	iq->cntxt_id = be16toh(c.iqid);
1800	iq->abs_id = be16toh(c.physiqid);
1801	iq->flags |= IQ_ALLOCATED;
1802
1803	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
1804	if (cntxt_id >= sc->sge.niq) {
1805		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
1806		    cntxt_id, sc->sge.niq - 1);
1807	}
1808	sc->sge.iqmap[cntxt_id] = iq;
1809
1810	if (fl) {
1811		fl->cntxt_id = be16toh(c.fl0id);
1812		fl->pidx = fl->cidx = 0;
1813
1814		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
1815		if (cntxt_id >= sc->sge.neq) {
1816			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
1817			    __func__, cntxt_id, sc->sge.neq - 1);
1818		}
1819		sc->sge.eqmap[cntxt_id] = (void *)fl;
1820
1821		FL_LOCK(fl);
1822		/* Enough to make sure the SGE doesn't think it's starved */
1823		refill_fl(sc, fl, fl->lowat);
1824		FL_UNLOCK(fl);
1825
1826		iq->flags |= IQ_HAS_FL;
1827	}
1828
1829	/* Enable IQ interrupts */
1830	atomic_store_rel_int(&iq->state, IQS_IDLE);
1831	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) |
1832	    V_INGRESSQID(iq->cntxt_id));
1833
1834	return (0);
1835}
1836
1837static int
1838free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl)
1839{
1840	int i, rc;
1841	struct adapter *sc = iq->adapter;
1842	device_t dev;
1843
1844	if (sc == NULL)
1845		return (0);	/* nothing to do */
1846
1847	dev = pi ? pi->dev : sc->dev;
1848
1849	if (iq->flags & IQ_ALLOCATED) {
1850		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
1851		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
1852		    fl ? fl->cntxt_id : 0xffff, 0xffff);
1853		if (rc != 0) {
1854			device_printf(dev,
1855			    "failed to free queue %p: %d\n", iq, rc);
1856			return (rc);
1857		}
1858		iq->flags &= ~IQ_ALLOCATED;
1859	}
1860
1861	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
1862
1863	bzero(iq, sizeof(*iq));
1864
1865	if (fl) {
1866		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
1867		    fl->desc);
1868
1869		if (fl->sdesc) {
1870			FL_LOCK(fl);
1871			free_fl_sdesc(fl);
1872			FL_UNLOCK(fl);
1873		}
1874
1875		if (mtx_initialized(&fl->fl_lock))
1876			mtx_destroy(&fl->fl_lock);
1877
1878		for (i = 0; i < FL_BUF_SIZES; i++) {
1879			if (fl->tag[i])
1880				bus_dma_tag_destroy(fl->tag[i]);
1881		}
1882
1883		bzero(fl, sizeof(*fl));
1884	}
1885
1886	return (0);
1887}
1888
1889static int
1890alloc_fwq(struct adapter *sc)
1891{
1892	int rc, intr_idx;
1893	struct sge_iq *fwq = &sc->sge.fwq;
1894	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
1895	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
1896
1897	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, FW_IQ_ESIZE);
1898	fwq->flags |= IQ_INTR;	/* always */
1899	intr_idx = sc->intr_count > 1 ? 1 : 0;
1900	rc = alloc_iq_fl(sc->port[0], fwq, NULL, intr_idx, -1);
1901	if (rc != 0) {
1902		device_printf(sc->dev,
1903		    "failed to create firmware event queue: %d\n", rc);
1904		return (rc);
1905	}
1906
1907	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
1908	    NULL, "firmware event queue");
1909	children = SYSCTL_CHILDREN(oid);
1910
1911	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
1912	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
1913	    "absolute id of the queue");
1914	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
1915	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
1916	    "SGE context id of the queue");
1917	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
1918	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
1919	    "consumer index");
1920
1921	return (0);
1922}
1923
1924static int
1925free_fwq(struct adapter *sc)
1926{
1927	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
1928}
1929
1930static int
1931alloc_mgmtq(struct adapter *sc)
1932{
1933	int rc;
1934	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
1935	char name[16];
1936	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
1937	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
1938
1939	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
1940	    NULL, "management queue");
1941
1942	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
1943	init_eq(&mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
1944	    sc->sge.fwq.cntxt_id, name);
1945	rc = alloc_wrq(sc, NULL, mgmtq, oid);
1946	if (rc != 0) {
1947		device_printf(sc->dev,
1948		    "failed to create management queue: %d\n", rc);
1949		return (rc);
1950	}
1951
1952	return (0);
1953}
1954
1955static int
1956free_mgmtq(struct adapter *sc)
1957{
1958
1959	return free_wrq(sc, &sc->sge.mgmtq);
1960}
1961
1962static inline int
1963tnl_cong(struct port_info *pi)
1964{
1965
1966	if (cong_drop == -1)
1967		return (-1);
1968	else if (cong_drop == 1)
1969		return (0);
1970	else
1971		return (1 << pi->tx_chan);
1972}
1973
1974static int
1975alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx,
1976    struct sysctl_oid *oid)
1977{
1978	int rc;
1979	struct sysctl_oid_list *children;
1980	char name[16];
1981
1982	rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(pi));
1983	if (rc != 0)
1984		return (rc);
1985
1986	FL_LOCK(&rxq->fl);
1987	refill_fl(pi->adapter, &rxq->fl, rxq->fl.needed / 8);
1988	FL_UNLOCK(&rxq->fl);
1989
1990#if defined(INET) || defined(INET6)
1991	rc = tcp_lro_init(&rxq->lro);
1992	if (rc != 0)
1993		return (rc);
1994	rxq->lro.ifp = pi->ifp; /* also indicates LRO init'ed */
1995
1996	if (pi->ifp->if_capenable & IFCAP_LRO)
1997		rxq->iq.flags |= IQ_LRO_ENABLED;
1998#endif
1999	rxq->ifp = pi->ifp;
2000
2001	children = SYSCTL_CHILDREN(oid);
2002
2003	snprintf(name, sizeof(name), "%d", idx);
2004	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2005	    NULL, "rx queue");
2006	children = SYSCTL_CHILDREN(oid);
2007
2008	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id",
2009	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
2010	    "absolute id of the queue");
2011	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
2012	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
2013	    "SGE context id of the queue");
2014	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
2015	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
2016	    "consumer index");
2017#if defined(INET) || defined(INET6)
2018	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
2019	    &rxq->lro.lro_queued, 0, NULL);
2020	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
2021	    &rxq->lro.lro_flushed, 0, NULL);
2022#endif
2023	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
2024	    &rxq->rxcsum, "# of times hardware assisted with checksum");
2025	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_extraction",
2026	    CTLFLAG_RD, &rxq->vlan_extraction,
2027	    "# of times hardware extracted 802.1Q tag");
2028
2029	children = SYSCTL_CHILDREN(oid);
2030	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "fl", CTLFLAG_RD,
2031	    NULL, "freelist");
2032	children = SYSCTL_CHILDREN(oid);
2033
2034	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
2035	    CTLTYPE_INT | CTLFLAG_RD, &rxq->fl.cntxt_id, 0, sysctl_uint16, "I",
2036	    "SGE context id of the queue");
2037	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
2038	    &rxq->fl.cidx, 0, "consumer index");
2039	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
2040	    &rxq->fl.pidx, 0, "producer index");
2041
2042	return (rc);
2043}
2044
2045static int
2046free_rxq(struct port_info *pi, struct sge_rxq *rxq)
2047{
2048	int rc;
2049
2050#if defined(INET) || defined(INET6)
2051	if (rxq->lro.ifp) {
2052		tcp_lro_free(&rxq->lro);
2053		rxq->lro.ifp = NULL;
2054	}
2055#endif
2056
2057	rc = free_iq_fl(pi, &rxq->iq, &rxq->fl);
2058	if (rc == 0)
2059		bzero(rxq, sizeof(*rxq));
2060
2061	return (rc);
2062}
2063
2064#ifdef TCP_OFFLOAD
2065static int
2066alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq,
2067    int intr_idx, int idx, struct sysctl_oid *oid)
2068{
2069	int rc;
2070	struct sysctl_oid_list *children;
2071	char name[16];
2072
2073	rc = alloc_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx,
2074	    1 << pi->tx_chan);
2075	if (rc != 0)
2076		return (rc);
2077
2078	children = SYSCTL_CHILDREN(oid);
2079
2080	snprintf(name, sizeof(name), "%d", idx);
2081	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2082	    NULL, "rx queue");
2083	children = SYSCTL_CHILDREN(oid);
2084
2085	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id",
2086	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
2087	    "I", "absolute id of the queue");
2088	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
2089	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
2090	    "I", "SGE context id of the queue");
2091	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
2092	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
2093	    "consumer index");
2094
2095	children = SYSCTL_CHILDREN(oid);
2096	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "fl", CTLFLAG_RD,
2097	    NULL, "freelist");
2098	children = SYSCTL_CHILDREN(oid);
2099
2100	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
2101	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->fl.cntxt_id, 0, sysctl_uint16,
2102	    "I", "SGE context id of the queue");
2103	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
2104	    &ofld_rxq->fl.cidx, 0, "consumer index");
2105	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
2106	    &ofld_rxq->fl.pidx, 0, "producer index");
2107
2108	return (rc);
2109}
2110
2111static int
2112free_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq)
2113{
2114	int rc;
2115
2116	rc = free_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl);
2117	if (rc == 0)
2118		bzero(ofld_rxq, sizeof(*ofld_rxq));
2119
2120	return (rc);
2121}
2122#endif
2123
2124static int
2125ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
2126{
2127	int rc, cntxt_id;
2128	struct fw_eq_ctrl_cmd c;
2129
2130	bzero(&c, sizeof(c));
2131
2132	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
2133	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
2134	    V_FW_EQ_CTRL_CMD_VFN(0));
2135	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
2136	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
2137	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); /* XXX */
2138	c.physeqid_pkd = htobe32(0);
2139	c.fetchszm_to_iqid =
2140	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
2141		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
2142		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
2143	c.dcaen_to_eqsize =
2144	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
2145		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
2146		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
2147		V_FW_EQ_CTRL_CMD_EQSIZE(eq->qsize));
2148	c.eqaddr = htobe64(eq->ba);
2149
2150	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2151	if (rc != 0) {
2152		device_printf(sc->dev,
2153		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
2154		return (rc);
2155	}
2156	eq->flags |= EQ_ALLOCATED;
2157
2158	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
2159	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
2160	if (cntxt_id >= sc->sge.neq)
2161	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
2162		cntxt_id, sc->sge.neq - 1);
2163	sc->sge.eqmap[cntxt_id] = eq;
2164
2165	return (rc);
2166}
2167
2168static int
2169eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
2170{
2171	int rc, cntxt_id;
2172	struct fw_eq_eth_cmd c;
2173
2174	bzero(&c, sizeof(c));
2175
2176	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
2177	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
2178	    V_FW_EQ_ETH_CMD_VFN(0));
2179	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
2180	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
2181	c.viid_pkd = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->viid));
2182	c.fetchszm_to_iqid =
2183	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
2184		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
2185		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
2186	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
2187		      V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
2188		      V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
2189		      V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize));
2190	c.eqaddr = htobe64(eq->ba);
2191
2192	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2193	if (rc != 0) {
2194		device_printf(pi->dev,
2195		    "failed to create Ethernet egress queue: %d\n", rc);
2196		return (rc);
2197	}
2198	eq->flags |= EQ_ALLOCATED;
2199
2200	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
2201	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
2202	if (cntxt_id >= sc->sge.neq)
2203	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
2204		cntxt_id, sc->sge.neq - 1);
2205	sc->sge.eqmap[cntxt_id] = eq;
2206
2207	return (rc);
2208}
2209
2210#ifdef TCP_OFFLOAD
2211static int
2212ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
2213{
2214	int rc, cntxt_id;
2215	struct fw_eq_ofld_cmd c;
2216
2217	bzero(&c, sizeof(c));
2218
2219	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
2220	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
2221	    V_FW_EQ_OFLD_CMD_VFN(0));
2222	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
2223	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
2224	c.fetchszm_to_iqid =
2225		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
2226		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
2227		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
2228	c.dcaen_to_eqsize =
2229	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
2230		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
2231		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
2232		V_FW_EQ_OFLD_CMD_EQSIZE(eq->qsize));
2233	c.eqaddr = htobe64(eq->ba);
2234
2235	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2236	if (rc != 0) {
2237		device_printf(pi->dev,
2238		    "failed to create egress queue for TCP offload: %d\n", rc);
2239		return (rc);
2240	}
2241	eq->flags |= EQ_ALLOCATED;
2242
2243	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
2244	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
2245	if (cntxt_id >= sc->sge.neq)
2246	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
2247		cntxt_id, sc->sge.neq - 1);
2248	sc->sge.eqmap[cntxt_id] = eq;
2249
2250	return (rc);
2251}
2252#endif
2253
2254static int
2255alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
2256{
2257	int rc;
2258	size_t len;
2259
2260	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
2261
2262	len = eq->qsize * EQ_ESIZE;
2263	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
2264	    &eq->ba, (void **)&eq->desc);
2265	if (rc)
2266		return (rc);
2267
2268	eq->cap = eq->qsize - spg_len / EQ_ESIZE;
2269	eq->spg = (void *)&eq->desc[eq->cap];
2270	eq->avail = eq->cap - 1;	/* one less to avoid cidx = pidx */
2271	eq->pidx = eq->cidx = 0;
2272	eq->doorbells = sc->doorbells;
2273
2274	switch (eq->flags & EQ_TYPEMASK) {
2275	case EQ_CTRL:
2276		rc = ctrl_eq_alloc(sc, eq);
2277		break;
2278
2279	case EQ_ETH:
2280		rc = eth_eq_alloc(sc, pi, eq);
2281		break;
2282
2283#ifdef TCP_OFFLOAD
2284	case EQ_OFLD:
2285		rc = ofld_eq_alloc(sc, pi, eq);
2286		break;
2287#endif
2288
2289	default:
2290		panic("%s: invalid eq type %d.", __func__,
2291		    eq->flags & EQ_TYPEMASK);
2292	}
2293	if (rc != 0) {
2294		device_printf(sc->dev,
2295		    "failed to allocate egress queue (%d): %d\n",
2296		    eq->flags & EQ_TYPEMASK, rc);
2297	}
2298
2299	eq->tx_callout.c_cpu = eq->cntxt_id % mp_ncpus;
2300
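	/*
	 * Locate this queue's doorbell in the user doorbell (BAR2) region:
	 * cntxt_id >> s_qpp selects the doorbell page and the low s_qpp bits
	 * give the queue's slot within it.  WCWR is usable only if the
	 * queue's 128B doorbell segment lies within that page; otherwise the
	 * relative qid is written along with the doorbell instead.
	 */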
2301	if (isset(&eq->doorbells, DOORBELL_UDB) ||
2302	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
2303	    isset(&eq->doorbells, DOORBELL_WCWR)) {
2304		uint32_t s_qpp = sc->sge.s_qpp;
2305		uint32_t mask = (1 << s_qpp) - 1;
2306		volatile uint8_t *udb;
2307
2308		udb = sc->udbs_base + UDBS_DB_OFFSET;
2309		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
2310		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
2311		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
2312			clrbit(&eq->doorbells, DOORBELL_WCWR);
2313		else {
2314			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
2315			eq->udb_qid = 0;
2316		}
2317		eq->udb = (volatile void *)udb;
2318	}
2319
2320	return (rc);
2321}
2322
2323static int
2324free_eq(struct adapter *sc, struct sge_eq *eq)
2325{
2326	int rc;
2327
2328	if (eq->flags & EQ_ALLOCATED) {
2329		switch (eq->flags & EQ_TYPEMASK) {
2330		case EQ_CTRL:
2331			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
2332			    eq->cntxt_id);
2333			break;
2334
2335		case EQ_ETH:
2336			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
2337			    eq->cntxt_id);
2338			break;
2339
2340#ifdef TCP_OFFLOAD
2341		case EQ_OFLD:
2342			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
2343			    eq->cntxt_id);
2344			break;
2345#endif
2346
2347		default:
2348			panic("%s: invalid eq type %d.", __func__,
2349			    eq->flags & EQ_TYPEMASK);
2350		}
2351		if (rc != 0) {
2352			device_printf(sc->dev,
2353			    "failed to free egress queue (%d): %d\n",
2354			    eq->flags & EQ_TYPEMASK, rc);
2355			return (rc);
2356		}
2357		eq->flags &= ~EQ_ALLOCATED;
2358	}
2359
2360	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
2361
2362	if (mtx_initialized(&eq->eq_lock))
2363		mtx_destroy(&eq->eq_lock);
2364
2365	bzero(eq, sizeof(*eq));
2366	return (0);
2367}
2368
2369static int
2370alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq,
2371    struct sysctl_oid *oid)
2372{
2373	int rc;
2374	struct sysctl_ctx_list *ctx = pi ? &pi->ctx : &sc->ctx;
2375	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2376
2377	rc = alloc_eq(sc, pi, &wrq->eq);
2378	if (rc)
2379		return (rc);
2380
2381	wrq->adapter = sc;
2382	STAILQ_INIT(&wrq->wr_list);
2383
2384	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
2385	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
2386	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
2387	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
2388	    "consumer index");
2389	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
2390	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
2391	    "producer index");
2392	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs", CTLFLAG_RD,
2393	    &wrq->tx_wrs, "# of work requests");
2394	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD,
2395	    &wrq->no_desc, 0,
2396	    "# of times queue ran out of hardware descriptors");
2397	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD,
2398	    &wrq->eq.unstalled, 0, "# of times queue recovered after stall");
2399
2400
2401	return (rc);
2402}
2403
2404static int
2405free_wrq(struct adapter *sc, struct sge_wrq *wrq)
2406{
2407	int rc;
2408
2409	rc = free_eq(sc, &wrq->eq);
2410	if (rc)
2411		return (rc);
2412
2413	bzero(wrq, sizeof(*wrq));
2414	return (0);
2415}
2416
2417static int
2418alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx,
2419    struct sysctl_oid *oid)
2420{
2421	int rc;
2422	struct adapter *sc = pi->adapter;
2423	struct sge_eq *eq = &txq->eq;
2424	char name[16];
2425	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2426
2427	rc = alloc_eq(sc, pi, eq);
2428	if (rc)
2429		return (rc);
2430
2431	txq->ifp = pi->ifp;
2432
2433	txq->sdesc = malloc(eq->cap * sizeof(struct tx_sdesc), M_CXGBE,
2434	    M_ZERO | M_WAITOK);
2435	txq->br = buf_ring_alloc(eq->qsize, M_CXGBE, M_WAITOK, &eq->eq_lock);
2436
2437	rc = bus_dma_tag_create(sc->dmat, 1, 0, BUS_SPACE_MAXADDR,
2438	    BUS_SPACE_MAXADDR, NULL, NULL, 64 * 1024, TX_SGL_SEGS,
2439	    BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &txq->tx_tag);
2440	if (rc != 0) {
2441		device_printf(sc->dev,
2442		    "failed to create tx DMA tag: %d\n", rc);
2443		return (rc);
2444	}
2445
2446	/*
2447	 * We can stuff ~10 frames in an 8-descriptor txpkts WR (8 is the SGE
2448	 * limit for any WR).  txq->no_dmamap events shouldn't occur if the maps
2449	 * are sized for the worst case.
2450	 */
2451	rc = t4_alloc_tx_maps(&txq->txmaps, txq->tx_tag, eq->qsize * 10 / 8,
2452	    M_WAITOK);
2453	if (rc != 0) {
2454		device_printf(sc->dev, "failed to setup tx DMA maps: %d\n", rc);
2455		return (rc);
2456	}
2457
2458	snprintf(name, sizeof(name), "%d", idx);
2459	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2460	    NULL, "tx queue");
2461	children = SYSCTL_CHILDREN(oid);
2462
2463	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
2464	    &eq->cntxt_id, 0, "SGE context id of the queue");
2465	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
2466	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
2467	    "consumer index");
2468	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx",
2469	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
2470	    "producer index");
2471
2472	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
2473	    &txq->txcsum, "# of times hardware assisted with checksum");
2474	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_insertion",
2475	    CTLFLAG_RD, &txq->vlan_insertion,
2476	    "# of times hardware inserted 802.1Q tag");
2477	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
2478	    &txq->tso_wrs, "# of TSO work requests");
2479	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
2480	    &txq->imm_wrs, "# of work requests with immediate data");
2481	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
2482	    &txq->sgl_wrs, "# of work requests with direct SGL");
2483	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
2484	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
2485	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_wrs", CTLFLAG_RD,
2486	    &txq->txpkts_wrs, "# of txpkts work requests (multiple pkts/WR)");
2487	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_pkts", CTLFLAG_RD,
2488	    &txq->txpkts_pkts, "# of frames tx'd using txpkts work requests");
2489
2490	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "br_drops", CTLFLAG_RD,
2491	    &txq->br->br_drops, "# of drops in the buf_ring for this queue");
2492	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_dmamap", CTLFLAG_RD,
2493	    &txq->no_dmamap, 0, "# of times txq ran out of DMA maps");
2494	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD,
2495	    &txq->no_desc, 0, "# of times txq ran out of hardware descriptors");
2496	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "egr_update", CTLFLAG_RD,
2497	    &eq->egr_update, 0, "egress update notifications from the SGE");
2498	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD,
2499	    &eq->unstalled, 0, "# of times txq recovered after stall");
2500
2501	return (rc);
2502}
2503
2504static int
2505free_txq(struct port_info *pi, struct sge_txq *txq)
2506{
2507	int rc;
2508	struct adapter *sc = pi->adapter;
2509	struct sge_eq *eq = &txq->eq;
2510
2511	rc = free_eq(sc, eq);
2512	if (rc)
2513		return (rc);
2514
2515	free(txq->sdesc, M_CXGBE);
2516
2517	if (txq->txmaps.maps)
2518		t4_free_tx_maps(&txq->txmaps, txq->tx_tag);
2519
2520	buf_ring_free(txq->br, M_CXGBE);
2521
2522	if (txq->tx_tag)
2523		bus_dma_tag_destroy(txq->tx_tag);
2524
2525	bzero(txq, sizeof(*txq));
2526	return (0);
2527}
2528
2529static void
2530oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
2531{
2532	bus_addr_t *ba = arg;
2533
2534	KASSERT(nseg == 1,
2535	    ("%s meant for single segment mappings only.", __func__));
2536
2537	*ba = error ? 0 : segs->ds_addr;
2538}
2539
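/*
 * A response is new if its generation bit matches the queue's current
 * generation.  The bit is read from the rsp_ctrl trailer at the end of the
 * current descriptor (entries are iq->esize bytes each).
 */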
2540static inline bool
2541is_new_response(const struct sge_iq *iq, struct rsp_ctrl **ctrl)
2542{
2543	*ctrl = (void *)((uintptr_t)iq->cdesc +
2544	    (iq->esize - sizeof(struct rsp_ctrl)));
2545
2546	return (((*ctrl)->u.type_gen >> S_RSPD_GEN) == iq->gen);
2547}
2548
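/*
 * Advance the ingress queue to the next descriptor, wrapping at the end of
 * the ring and flipping the expected generation bit on each wrap.
 */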
2549static inline void
2550iq_next(struct sge_iq *iq)
2551{
2552	iq->cdesc = (void *) ((uintptr_t)iq->cdesc + iq->esize);
2553	if (__predict_false(++iq->cidx == iq->qsize - 1)) {
2554		iq->cidx = 0;
2555		iq->gen ^= 1;
2556		iq->cdesc = iq->desc;
2557	}
2558}
2559
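/*
 * The SGE consumes freelist credits in units of 8 descriptors, so only whole
 * groups of 8 are handed to the hardware when the doorbell is rung.
 */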
2560#define FL_HW_IDX(x) ((x) >> 3)
2561static inline void
2562ring_fl_db(struct adapter *sc, struct sge_fl *fl)
2563{
2564	int ndesc = fl->pending / 8;
2565	uint32_t v;
2566
2567	if (FL_HW_IDX(fl->pidx) == FL_HW_IDX(fl->cidx))
2568		ndesc--;	/* hold back one credit */
2569
2570	if (ndesc <= 0)
2571		return;		/* nothing to do */
2572
2573	v = F_DBPRIO | V_QID(fl->cntxt_id) | V_PIDX(ndesc);
2574	if (is_t5(sc))
2575		v |= F_DBTYPE;
2576
2577	wmb();
2578
2579	t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v);
2580	fl->pending -= ndesc * 8;
2581}
2582
2583/*
2584 * Fill up the freelist with up to nbufs buffers and maybe ring its doorbell.
2585 *
2586 * Returns non-zero to indicate that it should be added to the list of starving
2587 * freelists.
2588 */
2589static int
2590refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs)
2591{
2592	__be64 *d = &fl->desc[fl->pidx];
2593	struct fl_sdesc *sd = &fl->sdesc[fl->pidx];
2594	bus_dma_tag_t tag;
2595	bus_addr_t pa;
2596	caddr_t cl;
2597	int rc;
2598
2599	FL_LOCK_ASSERT_OWNED(fl);
2600
2601	if (nbufs > fl->needed)
2602		nbufs = fl->needed;
2603
2604	while (nbufs--) {
2605
2606		if (sd->cl != NULL) {
2607
2608			/*
2609			 * This happens when a frame small enough to fit
2610			 * entirely in an mbuf was received in cl last time.
2611			 * We'd held on to cl and can reuse it now.  Note that
2612			 * we reuse a cluster of the old size if fl->tag_idx is
2613			 * no longer the same as sd->tag_idx.
2614			 */
2615
2616			KASSERT(*d == sd->ba_tag,
2617			    ("%s: recycling problem at pidx %d",
2618			    __func__, fl->pidx));
2619
2620			d++;
2621			goto recycled;
2622		}
2623
2624
2625		if (fl->tag_idx != sd->tag_idx) {
2626			bus_dmamap_t map;
2627			bus_dma_tag_t newtag = fl->tag[fl->tag_idx];
2628			bus_dma_tag_t oldtag = fl->tag[sd->tag_idx];
2629
2630			/*
2631			 * An MTU change can get us here.  Discard the old map
2632			 * which was created with the old tag, but only if
2633			 * we're able to get a new one.
2634			 */
2635			rc = bus_dmamap_create(newtag, 0, &map);
2636			if (rc == 0) {
2637				bus_dmamap_destroy(oldtag, sd->map);
2638				sd->map = map;
2639				sd->tag_idx = fl->tag_idx;
2640			}
2641		}
2642
2643		tag = fl->tag[sd->tag_idx];
2644
2645		cl = m_cljget(NULL, M_NOWAIT, FL_BUF_SIZE(sd->tag_idx));
2646		if (cl == NULL)
2647			break;
2648
2649		rc = bus_dmamap_load(tag, sd->map, cl, FL_BUF_SIZE(sd->tag_idx),
2650		    oneseg_dma_callback, &pa, 0);
2651		if (rc != 0 || pa == 0) {
2652			fl->dmamap_failed++;
2653			uma_zfree(FL_BUF_ZONE(sd->tag_idx), cl);
2654			break;
2655		}
2656
2657		sd->cl = cl;
2658		*d++ = htobe64(pa | sd->tag_idx);
2659
2660#ifdef INVARIANTS
2661		sd->ba_tag = htobe64(pa | sd->tag_idx);
2662#endif
2663
2664recycled:
2665		/* sd->m is never recycled, should always be NULL */
2666		KASSERT(sd->m == NULL, ("%s: stray mbuf", __func__));
2667
2668		sd->m = m_gethdr(M_NOWAIT, MT_NOINIT);
2669		if (sd->m == NULL)
2670			break;
2671
2672		fl->pending++;
2673		fl->needed--;
2674		sd++;
2675		if (++fl->pidx == fl->cap) {
2676			fl->pidx = 0;
2677			sd = fl->sdesc;
2678			d = fl->desc;
2679		}
2680	}
2681
2682	if (fl->pending >= 8)
2683		ring_fl_db(sc, fl);
2684
2685	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
2686}
2687
2688/*
2689 * Attempt to refill all starving freelists.
2690 */
2691static void
2692refill_sfl(void *arg)
2693{
2694	struct adapter *sc = arg;
2695	struct sge_fl *fl, *fl_temp;
2696
2697	mtx_lock(&sc->sfl_lock);
2698	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
2699		FL_LOCK(fl);
2700		refill_fl(sc, fl, 64);
2701		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
2702			TAILQ_REMOVE(&sc->sfl, fl, link);
2703			fl->flags &= ~FL_STARVING;
2704		}
2705		FL_UNLOCK(fl);
2706	}
2707
2708	if (!TAILQ_EMPTY(&sc->sfl))
2709		callout_schedule(&sc->sfl_callout, hz / 5);
2710	mtx_unlock(&sc->sfl_lock);
2711}
2712
2713static int
2714alloc_fl_sdesc(struct sge_fl *fl)
2715{
2716	struct fl_sdesc *sd;
2717	bus_dma_tag_t tag;
2718	int i, rc;
2719
2720	FL_LOCK_ASSERT_OWNED(fl);
2721
2722	fl->sdesc = malloc(fl->cap * sizeof(struct fl_sdesc), M_CXGBE,
2723	    M_ZERO | M_WAITOK);
2724
2725	tag = fl->tag[fl->tag_idx];
2726	sd = fl->sdesc;
2727	for (i = 0; i < fl->cap; i++, sd++) {
2728
2729		sd->tag_idx = fl->tag_idx;
2730		rc = bus_dmamap_create(tag, 0, &sd->map);
2731		if (rc != 0)
2732			goto failed;
2733	}
2734
2735	return (0);
2736failed:
2737	while (--i >= 0) {
2738		sd--;
2739		bus_dmamap_destroy(tag, sd->map);
2740		if (sd->m) {
2741			m_init(sd->m, NULL, 0, M_NOWAIT, MT_DATA, 0);
2742			m_free(sd->m);
2743			sd->m = NULL;
2744		}
2745	}
2746	KASSERT(sd == fl->sdesc, ("%s: EDOOFUS", __func__));
2747
2748	free(fl->sdesc, M_CXGBE);
2749	fl->sdesc = NULL;
2750
2751	return (rc);
2752}
2753
2754static void
2755free_fl_sdesc(struct sge_fl *fl)
2756{
2757	struct fl_sdesc *sd;
2758	int i;
2759
2760	FL_LOCK_ASSERT_OWNED(fl);
2761
2762	sd = fl->sdesc;
2763	for (i = 0; i < fl->cap; i++, sd++) {
2764
2765		if (sd->m) {
2766			m_init(sd->m, NULL, 0, M_NOWAIT, MT_DATA, 0);
2767			m_free(sd->m);
2768			sd->m = NULL;
2769		}
2770
2771		if (sd->cl) {
2772			bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
2773			uma_zfree(FL_BUF_ZONE(sd->tag_idx), sd->cl);
2774			sd->cl = NULL;
2775		}
2776
2777		bus_dmamap_destroy(fl->tag[sd->tag_idx], sd->map);
2778	}
2779
2780	free(fl->sdesc, M_CXGBE);
2781	fl->sdesc = NULL;
2782}
2783
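/*
 * Preallocate "count" DMA maps for tx mbuf chains.  The maps live in a
 * circular array; they are handed out at map_pidx and reclaimed at map_cidx.
 */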
2784int
2785t4_alloc_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag, int count,
2786    int flags)
2787{
2788	struct tx_map *txm;
2789	int i, rc;
2790
2791	txmaps->map_total = txmaps->map_avail = count;
2792	txmaps->map_cidx = txmaps->map_pidx = 0;
2793
2794	txmaps->maps = malloc(count * sizeof(struct tx_map), M_CXGBE,
2795	    M_ZERO | flags);
2796
2797	txm = txmaps->maps;
2798	for (i = 0; i < count; i++, txm++) {
2799		rc = bus_dmamap_create(tx_tag, 0, &txm->map);
2800		if (rc != 0)
2801			goto failed;
2802	}
2803
2804	return (0);
2805failed:
2806	while (--i >= 0) {
2807		txm--;
2808		bus_dmamap_destroy(tx_tag, txm->map);
2809	}
2810	KASSERT(txm == txmaps->maps, ("%s: EDOOFUS", __func__));
2811
2812	free(txmaps->maps, M_CXGBE);
2813	txmaps->maps = NULL;
2814
2815	return (rc);
2816}
2817
2818void
2819t4_free_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag)
2820{
2821	struct tx_map *txm;
2822	int i;
2823
2824	txm = txmaps->maps;
2825	for (i = 0; i < txmaps->map_total; i++, txm++) {
2826
2827		if (txm->m) {
2828			bus_dmamap_unload(tx_tag, txm->map);
2829			m_freem(txm->m);
2830			txm->m = NULL;
2831		}
2832
2833		bus_dmamap_destroy(tx_tag, txm->map);
2834	}
2835
2836	free(txmaps->maps, M_CXGBE);
2837	txmaps->maps = NULL;
2838}
2839
2840/*
2841 * We'll do immediate data tx for non-TSO, but only when not coalescing.  We're
2842 * willing to use up to 2 hardware descriptors, which means a maximum of 96 bytes
2843 * of immediate data.
2844 */
2845#define IMM_LEN ( \
2846      2 * EQ_ESIZE \
2847    - sizeof(struct fw_eth_tx_pkt_wr) \
2848    - sizeof(struct cpl_tx_pkt_core))
2849
2850/*
2851 * Returns non-zero on failure, no need to cleanup anything in that case.
2852 *
2853 * Note 1: We always try to defrag the mbuf if required and return EFBIG only
2854 * if the resulting chain still won't fit in a tx descriptor.
2855 *
2856 * Note 2: We'll pullup the mbuf chain if TSO is requested and the first mbuf
2857 * does not have the TCP header in it.
2858 */
2859static int
2860get_pkt_sgl(struct sge_txq *txq, struct mbuf **fp, struct sgl *sgl,
2861    int sgl_only)
2862{
2863	struct mbuf *m = *fp;
2864	struct tx_maps *txmaps;
2865	struct tx_map *txm;
2866	int rc, defragged = 0, n;
2867
2868	TXQ_LOCK_ASSERT_OWNED(txq);
2869
2870	if (m->m_pkthdr.tso_segsz)
2871		sgl_only = 1;	/* Do not allow immediate data with LSO */
2872
2873start:	sgl->nsegs = 0;
2874
2875	if (m->m_pkthdr.len <= IMM_LEN && !sgl_only)
2876		return (0);	/* nsegs = 0 tells caller to use imm. tx */
2877
2878	txmaps = &txq->txmaps;
2879	if (txmaps->map_avail == 0) {
2880		txq->no_dmamap++;
2881		return (ENOMEM);
2882	}
2883	txm = &txmaps->maps[txmaps->map_pidx];
2884
2885	if (m->m_pkthdr.tso_segsz && m->m_len < 50) {
2886		*fp = m_pullup(m, 50);
2887		m = *fp;
2888		if (m == NULL)
2889			return (ENOBUFS);
2890	}
2891
2892	rc = bus_dmamap_load_mbuf_sg(txq->tx_tag, txm->map, m, sgl->seg,
2893	    &sgl->nsegs, BUS_DMA_NOWAIT);
2894	if (rc == EFBIG && defragged == 0) {
2895		m = m_defrag(m, M_NOWAIT);
2896		if (m == NULL)
2897			return (EFBIG);
2898
2899		defragged = 1;
2900		*fp = m;
2901		goto start;
2902	}
2903	if (rc != 0)
2904		return (rc);
2905
2906	txm->m = m;
2907	txmaps->map_avail--;
2908	if (++txmaps->map_pidx == txmaps->map_total)
2909		txmaps->map_pidx = 0;
2910
2911	KASSERT(sgl->nsegs > 0 && sgl->nsegs <= TX_SGL_SEGS,
2912	    ("%s: bad DMA mapping (%d segments)", __func__, sgl->nsegs));
2913
2914	/*
2915	 * Store the # of flits required to hold this frame's SGL in nflits.  An
2916	 * SGL has a (ULPTX header + len0, addr0) tuple optionally followed by
2917	 * multiple (len0 + len1, addr0, addr1) tuples.  If addr1 is not used
2918	 * then len1 must be set to 0.
2919	 */
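	/*
	 * 2 flits cover the ULPTX header and the first segment; every
	 * additional pair of segments takes 3 flits and a leftover odd
	 * segment takes 2.
	 */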
2920	n = sgl->nsegs - 1;
2921	sgl->nflits = (3 * n) / 2 + (n & 1) + 2;
2922
2923	return (0);
2924}
2925
2926
2927/*
2928 * Releases all the txq resources used up in the specified sgl.
2929 */
2930static int
2931free_pkt_sgl(struct sge_txq *txq, struct sgl *sgl)
2932{
2933	struct tx_maps *txmaps;
2934	struct tx_map *txm;
2935
2936	TXQ_LOCK_ASSERT_OWNED(txq);
2937
2938	if (sgl->nsegs == 0)
2939		return (0);	/* didn't use any map */
2940
2941	txmaps = &txq->txmaps;
2942
2943	/* 1 pkt uses exactly 1 map, back it out */
2944
2945	txmaps->map_avail++;
2946	if (txmaps->map_pidx > 0)
2947		txmaps->map_pidx--;
2948	else
2949		txmaps->map_pidx = txmaps->map_total - 1;
2950
2951	txm = &txmaps->maps[txmaps->map_pidx];
2952	bus_dmamap_unload(txq->tx_tag, txm->map);
2953	txm->m = NULL;
2954
2955	return (0);
2956}
2957
2958static int
2959write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m,
2960    struct sgl *sgl)
2961{
2962	struct sge_eq *eq = &txq->eq;
2963	struct fw_eth_tx_pkt_wr *wr;
2964	struct cpl_tx_pkt_core *cpl;
2965	uint32_t ctrl;	/* used in many unrelated places */
2966	uint64_t ctrl1;
2967	int nflits, ndesc, pktlen;
2968	struct tx_sdesc *txsd;
2969	caddr_t dst;
2970
2971	TXQ_LOCK_ASSERT_OWNED(txq);
2972
2973	pktlen = m->m_pkthdr.len;
2974
2975	/*
2976	 * Do we have enough flits to send this frame out?
2977	 */
2978	ctrl = sizeof(struct cpl_tx_pkt_core);
2979	if (m->m_pkthdr.tso_segsz) {
2980		nflits = TXPKT_LSO_WR_HDR;
2981		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
2982	} else
2983		nflits = TXPKT_WR_HDR;
2984	if (sgl->nsegs > 0)
2985		nflits += sgl->nflits;
2986	else {
2987		nflits += howmany(pktlen, 8);
2988		ctrl += pktlen;
2989	}
2990	ndesc = howmany(nflits, 8);
2991	if (ndesc > eq->avail)
2992		return (ENOMEM);
2993
2994	/* Firmware work request header */
2995	wr = (void *)&eq->desc[eq->pidx];
2996	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
2997	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
2998	ctrl = V_FW_WR_LEN16(howmany(nflits, 2));
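	/*
	 * If this WR consumes the last available descriptors, ask the SGE for
	 * an egress update (EQUEQ/EQUIQ) so we find out when credits are
	 * returned, and mark the queue stalled until then.
	 */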
2999	if (eq->avail == ndesc) {
3000		if (!(eq->flags & EQ_CRFLUSHED)) {
3001			ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
3002			eq->flags |= EQ_CRFLUSHED;
3003		}
3004		eq->flags |= EQ_STALLED;
3005	}
3006
3007	wr->equiq_to_len16 = htobe32(ctrl);
3008	wr->r3 = 0;
3009
3010	if (m->m_pkthdr.tso_segsz) {
3011		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
3012		struct ether_header *eh;
3013		void *l3hdr;
3014#if defined(INET) || defined(INET6)
3015		struct tcphdr *tcp;
3016#endif
3017		uint16_t eh_type;
3018
3019		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
3020		    F_LSO_LAST_SLICE;
3021
3022		eh = mtod(m, struct ether_header *);
3023		eh_type = ntohs(eh->ether_type);
3024		if (eh_type == ETHERTYPE_VLAN) {
3025			struct ether_vlan_header *evh = (void *)eh;
3026
3027			ctrl |= V_LSO_ETHHDR_LEN(1);
3028			l3hdr = evh + 1;
3029			eh_type = ntohs(evh->evl_proto);
3030		} else
3031			l3hdr = eh + 1;
3032
3033		switch (eh_type) {
3034#ifdef INET6
3035		case ETHERTYPE_IPV6:
3036		{
3037			struct ip6_hdr *ip6 = l3hdr;
3038
3039			/*
3040			 * XXX-BZ For now we do not pretend to support
3041			 * IPv6 extension headers.
3042			 */
3043			KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO "
3044			    "with ip6_nxt != TCP: %u", __func__, ip6->ip6_nxt));
3045			tcp = (struct tcphdr *)(ip6 + 1);
3046			ctrl |= F_LSO_IPV6;
3047			ctrl |= V_LSO_IPHDR_LEN(sizeof(*ip6) >> 2) |
3048			    V_LSO_TCPHDR_LEN(tcp->th_off);
3049			break;
3050		}
3051#endif
3052#ifdef INET
3053		case ETHERTYPE_IP:
3054		{
3055			struct ip *ip = l3hdr;
3056
3057			tcp = (void *)((uintptr_t)ip + ip->ip_hl * 4);
3058			ctrl |= V_LSO_IPHDR_LEN(ip->ip_hl) |
3059			    V_LSO_TCPHDR_LEN(tcp->th_off);
3060			break;
3061		}
3062#endif
3063		default:
3064			panic("%s: CSUM_TSO but no supported IP version "
3065			    "(0x%04x)", __func__, eh_type);
3066		}
3067
3068		lso->lso_ctrl = htobe32(ctrl);
3069		lso->ipid_ofst = htobe16(0);
3070		lso->mss = htobe16(m->m_pkthdr.tso_segsz);
3071		lso->seqno_offset = htobe32(0);
3072		lso->len = htobe32(pktlen);
3073
3074		cpl = (void *)(lso + 1);
3075
3076		txq->tso_wrs++;
3077	} else
3078		cpl = (void *)(wr + 1);
3079
3080	/* Checksum offload */
3081	ctrl1 = 0;
3082	if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)))
3083		ctrl1 |= F_TXPKT_IPCSUM_DIS;
3084	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
3085	    CSUM_TCP_IPV6 | CSUM_TSO)))
3086		ctrl1 |= F_TXPKT_L4CSUM_DIS;
3087	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
3088	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
3089		txq->txcsum++;	/* some hardware assistance provided */
3090
3091	/* VLAN tag insertion */
3092	if (m->m_flags & M_VLANTAG) {
3093		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
3094		txq->vlan_insertion++;
3095	}
3096
3097	/* CPL header */
3098	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3099	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
3100	cpl->pack = 0;
3101	cpl->len = htobe16(pktlen);
3102	cpl->ctrl1 = htobe64(ctrl1);
3103
3104	/* Software descriptor */
3105	txsd = &txq->sdesc[eq->pidx];
3106	txsd->desc_used = ndesc;
3107
3108	eq->pending += ndesc;
3109	eq->avail -= ndesc;
3110	eq->pidx += ndesc;
3111	if (eq->pidx >= eq->cap)
3112		eq->pidx -= eq->cap;
3113
3114	/* SGL */
3115	dst = (void *)(cpl + 1);
3116	if (sgl->nsegs > 0) {
3117		txsd->credits = 1;
3118		txq->sgl_wrs++;
3119		write_sgl_to_txd(eq, sgl, &dst);
3120	} else {
3121		txsd->credits = 0;
3122		txq->imm_wrs++;
3123		for (; m; m = m->m_next) {
3124			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
3125#ifdef INVARIANTS
3126			pktlen -= m->m_len;
3127#endif
3128		}
3129#ifdef INVARIANTS
3130		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
3131#endif
3132
3133	}
3134
3135	txq->txpkt_wrs++;
3136	return (0);
3137}
3138
3139/*
3140 * Returns 0 to indicate that m has been accepted into a coalesced tx work
3141 * request.  It has either been folded into txpkts or txpkts was flushed and m
3142 * has started a new coalesced work request (as the first frame in a fresh
3143 * txpkts).
3144 *
3145 * Returns non-zero to indicate a failure; the caller is responsible for
3146 * transmitting m.  If there was anything in txpkts it has been flushed.
3147 */
3148static int
3149add_to_txpkts(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts,
3150    struct mbuf *m, struct sgl *sgl)
3151{
3152	struct sge_eq *eq = &txq->eq;
3153	int can_coalesce;
3154	struct tx_sdesc *txsd;
3155	int flits;
3156
3157	TXQ_LOCK_ASSERT_OWNED(txq);
3158
3159	KASSERT(sgl->nsegs, ("%s: can't coalesce imm data", __func__));
3160
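	/*
	 * To fold m into the txpkts WR being built it must not be a TSO
	 * frame, the combined WR must fit in TX_WR_FLITS and in the
	 * descriptors still available (8 flits each), and the total payload
	 * must stay under 64KB.
	 */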
3161	if (txpkts->npkt > 0) {
3162		flits = TXPKTS_PKT_HDR + sgl->nflits;
3163		can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
3164		    txpkts->nflits + flits <= TX_WR_FLITS &&
3165		    txpkts->nflits + flits <= eq->avail * 8 &&
3166		    txpkts->plen + m->m_pkthdr.len < 65536;
3167
3168		if (can_coalesce) {
3169			txpkts->npkt++;
3170			txpkts->nflits += flits;
3171			txpkts->plen += m->m_pkthdr.len;
3172
3173			txsd = &txq->sdesc[eq->pidx];
3174			txsd->credits++;
3175
3176			return (0);
3177		}
3178
3179		/*
3180		 * Couldn't coalesce m into txpkts.  The first order of business
3181		 * is to send txpkts on its way.  Then we'll revisit m.
3182		 */
3183		write_txpkts_wr(txq, txpkts);
3184	}
3185
3186	/*
3187	 * Check if we can start a new coalesced tx work request with m as
3188	 * the first packet in it.
3189	 */
3190
3191	KASSERT(txpkts->npkt == 0, ("%s: txpkts not empty", __func__));
3192
3193	flits = TXPKTS_WR_HDR + sgl->nflits;
3194	can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
3195	    flits <= eq->avail * 8 && flits <= TX_WR_FLITS;
3196
3197	if (can_coalesce == 0)
3198		return (EINVAL);
3199
3200	/*
3201	 * Start a fresh coalesced tx WR with m as the first frame in it.
3202	 */
3203	txpkts->npkt = 1;
3204	txpkts->nflits = flits;
3205	txpkts->flitp = &eq->desc[eq->pidx].flit[2];
3206	txpkts->plen = m->m_pkthdr.len;
3207
3208	txsd = &txq->sdesc[eq->pidx];
3209	txsd->credits = 1;
3210
3211	return (0);
3212}
3213
3214/*
3215 * Note that write_txpkts_wr can never run out of hardware descriptors (but
3216 * write_txpkt_wr can).  add_to_txpkts ensures that a frame is accepted for
3217 * coalescing only if sufficient hardware descriptors are available.
3218 */
3219static void
3220write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts)
3221{
3222	struct sge_eq *eq = &txq->eq;
3223	struct fw_eth_tx_pkts_wr *wr;
3224	struct tx_sdesc *txsd;
3225	uint32_t ctrl;
3226	int ndesc;
3227
3228	TXQ_LOCK_ASSERT_OWNED(txq);
3229
3230	ndesc = howmany(txpkts->nflits, 8);
3231
3232	wr = (void *)&eq->desc[eq->pidx];
3233	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
3234	ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2));
3235	if (eq->avail == ndesc) {
3236		if (!(eq->flags & EQ_CRFLUSHED)) {
3237			ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
3238			eq->flags |= EQ_CRFLUSHED;
3239		}
3240		eq->flags |= EQ_STALLED;
3241	}
3242	wr->equiq_to_len16 = htobe32(ctrl);
3243	wr->plen = htobe16(txpkts->plen);
3244	wr->npkt = txpkts->npkt;
3245	wr->r3 = wr->type = 0;
3246
3247	/* Everything else already written */
3248
3249	txsd = &txq->sdesc[eq->pidx];
3250	txsd->desc_used = ndesc;
3251
3252	KASSERT(eq->avail >= ndesc, ("%s: out of descriptors", __func__));
3253
3254	eq->pending += ndesc;
3255	eq->avail -= ndesc;
3256	eq->pidx += ndesc;
3257	if (eq->pidx >= eq->cap)
3258		eq->pidx -= eq->cap;
3259
3260	txq->txpkts_pkts += txpkts->npkt;
3261	txq->txpkts_wrs++;
3262	txpkts->npkt = 0;	/* emptied */
3263}
3264
3265static inline void
3266write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
3267    struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl)
3268{
3269	struct ulp_txpkt *ulpmc;
3270	struct ulptx_idata *ulpsc;
3271	struct cpl_tx_pkt_core *cpl;
3272	struct sge_eq *eq = &txq->eq;
3273	uintptr_t flitp, start, end;
3274	uint64_t ctrl;
3275	caddr_t dst;
3276
3277	KASSERT(txpkts->npkt > 0, ("%s: txpkts is empty", __func__));
3278
3279	start = (uintptr_t)eq->desc;
3280	end = (uintptr_t)eq->spg;
3281
3282	/* Checksum offload */
3283	ctrl = 0;
3284	if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)))
3285		ctrl |= F_TXPKT_IPCSUM_DIS;
3286	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
3287	    CSUM_TCP_IPV6 | CSUM_TSO)))
3288		ctrl |= F_TXPKT_L4CSUM_DIS;
3289	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
3290	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
3291		txq->txcsum++;	/* some hardware assistance provided */
3292
3293	/* VLAN tag insertion */
3294	if (m->m_flags & M_VLANTAG) {
3295		ctrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
3296		txq->vlan_insertion++;
3297	}
3298
3299	/*
3300	 * The previous packet's SGL must have ended at a 16 byte boundary (this
3301	 * is required by the firmware/hardware).  It follows that flitp cannot
3302	 * wrap around between the ULPTX master command and ULPTX subcommand (8
3303	 * bytes each), and that it cannot wrap around in the middle of the
3304	 * cpl_tx_pkt_core either.
3305	 */
3306	flitp = (uintptr_t)txpkts->flitp;
3307	KASSERT((flitp & 0xf) == 0,
3308	    ("%s: last SGL did not end at 16 byte boundary: %p",
3309	    __func__, txpkts->flitp));
3310
3311	/* ULP master command */
3312	ulpmc = (void *)flitp;
3313	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) |
3314	    V_ULP_TXPKT_FID(eq->iqid));
3315	ulpmc->len = htonl(howmany(sizeof(*ulpmc) + sizeof(*ulpsc) +
3316	    sizeof(*cpl) + 8 * sgl->nflits, 16));
3317
3318	/* ULP subcommand */
3319	ulpsc = (void *)(ulpmc + 1);
3320	ulpsc->cmd_more = htobe32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) |
3321	    F_ULP_TX_SC_MORE);
3322	ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
3323
3324	flitp += sizeof(*ulpmc) + sizeof(*ulpsc);
3325	if (flitp == end)
3326		flitp = start;
3327
3328	/* CPL_TX_PKT */
3329	cpl = (void *)flitp;
3330	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3331	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
3332	cpl->pack = 0;
3333	cpl->len = htobe16(m->m_pkthdr.len);
3334	cpl->ctrl1 = htobe64(ctrl);
3335
3336	flitp += sizeof(*cpl);
3337	if (flitp == end)
3338		flitp = start;
3339
3340	/* SGL for this frame */
3341	dst = (caddr_t)flitp;
3342	txpkts->nflits += write_sgl_to_txd(eq, sgl, &dst);
3343	txpkts->flitp = (void *)dst;
3344
3345	KASSERT(((uintptr_t)dst & 0xf) == 0,
3346	    ("%s: SGL ends at %p (not a 16 byte boundary)", __func__, dst));
3347}
3348
3349/*
3350 * If the SGL ends at an address that is not 16 byte aligned, this function
3351 * adds a 0-filled flit at the end.  It returns 1 in that case.
3352 */
3353static int
3354write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to)
3355{
3356	__be64 *flitp, *end;
3357	struct ulptx_sgl *usgl;
3358	bus_dma_segment_t *seg;
3359	int i, padded;
3360
3361	KASSERT(sgl->nsegs > 0 && sgl->nflits > 0,
3362	    ("%s: bad SGL - nsegs=%d, nflits=%d",
3363	    __func__, sgl->nsegs, sgl->nflits));
3364
3365	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
3366	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
3367
3368	flitp = (__be64 *)(*to);
3369	end = flitp + sgl->nflits;
3370	seg = &sgl->seg[0];
3371	usgl = (void *)flitp;
3372
3373	/*
3374	 * We start at a 16 byte boundary somewhere inside the tx descriptor
3375	 * ring, so we're at least 16 bytes away from the status page.  There is
3376	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
3377	 */
3378
3379	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
3380	    V_ULPTX_NSGE(sgl->nsegs));
3381	usgl->len0 = htobe32(seg->ds_len);
3382	usgl->addr0 = htobe64(seg->ds_addr);
3383	seg++;
3384
3385	if ((uintptr_t)end <= (uintptr_t)eq->spg) {
3386
3387		/* Won't wrap around at all */
3388
3389		for (i = 0; i < sgl->nsegs - 1; i++, seg++) {
3390			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ds_len);
3391			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ds_addr);
3392		}
3393		if (i & 1)
3394			usgl->sge[i / 2].len[1] = htobe32(0);
3395	} else {
3396
3397		/* Will wrap somewhere in the rest of the SGL */
3398
3399		/* 2 flits already written, write the rest flit by flit */
3400		flitp = (void *)(usgl + 1);
3401		for (i = 0; i < sgl->nflits - 2; i++) {
3402			if ((uintptr_t)flitp == (uintptr_t)eq->spg)
3403				flitp = (void *)eq->desc;
3404			*flitp++ = get_flit(seg, sgl->nsegs - 1, i);
3405		}
3406		end = flitp;
3407	}
3408
3409	if ((uintptr_t)end & 0xf) {
3410		*(uint64_t *)end = 0;
3411		end++;
3412		padded = 1;
3413	} else
3414		padded = 0;
3415
3416	if ((uintptr_t)end == (uintptr_t)eq->spg)
3417		*to = (void *)eq->desc;
3418	else
3419		*to = (void *)end;
3420
3421	return (padded);
3422}
3423
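/*
 * Copy immediate data into the descriptor ring, wrapping around past the
 * status page to the start of the ring if necessary.
 */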
3424static inline void
3425copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
3426{
3427	if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) {
3428		bcopy(from, *to, len);
3429		(*to) += len;
3430	} else {
3431		int portion = (uintptr_t)eq->spg - (uintptr_t)(*to);
3432
3433		bcopy(from, *to, portion);
3434		from += portion;
3435		portion = len - portion;	/* remaining */
3436		bcopy(from, (void *)eq->desc, portion);
3437		(*to) = (caddr_t)eq->desc + portion;
3438	}
3439}
3440
3441static inline void
3442ring_eq_db(struct adapter *sc, struct sge_eq *eq)
3443{
3444	u_int db, pending;
3445
3446	db = eq->doorbells;
3447	pending = eq->pending;
3448	if (pending > 1)
3449		clrbit(&db, DOORBELL_WCWR);
3450	eq->pending = 0;
3451	wmb();
3452
3453	switch (ffs(db) - 1) {
3454	case DOORBELL_UDB:
3455		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending));
3456		return;
3457
3458	case DOORBELL_WCWR: {
3459		volatile uint64_t *dst, *src;
3460		int i;
3461
3462		/*
3463		 * Queues whose 128B doorbell segment fits in the page do not
3464		 * use relative qid (udb_qid is always 0).  Only such queues
3465		 * can do WCWR.
3466		 */
3467		KASSERT(eq->udb_qid == 0 && pending == 1,
3468		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
3469		    __func__, eq->doorbells, pending, eq->pidx, eq));
3470
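		/*
		 * Copy the just-written descriptor into the WR slot of the
		 * doorbell segment so the hardware receives the work request
		 * along with the doorbell write itself.
		 */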
3471		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
3472		    UDBS_DB_OFFSET);
3473		i = eq->pidx ? eq->pidx - 1 : eq->cap - 1;
3474		src = (void *)&eq->desc[i];
3475		while (src != (void *)&eq->desc[i + 1])
3476			*dst++ = *src++;
3477		wmb();
3478		return;
3479	}
3480
3481	case DOORBELL_UDBWC:
3482		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending));
3483		wmb();
3484		return;
3485
3486	case DOORBELL_KDB:
3487		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
3488		    V_QID(eq->cntxt_id) | V_PIDX(pending));
3489		return;
3490	}
3491}
3492
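/*
 * Number of tx descriptors the hardware has finished with, based on the
 * consumer index it reports in the queue's status page.
 */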
3493static inline int
3494reclaimable(struct sge_eq *eq)
3495{
3496	unsigned int cidx;
3497
3498	cidx = eq->spg->cidx;	/* stable snapshot */
3499	cidx = be16toh(cidx);
3500
3501	if (cidx >= eq->cidx)
3502		return (cidx - eq->cidx);
3503	else
3504		return (cidx + eq->cap - eq->cidx);
3505}
3506
3507/*
3508 * There are "can_reclaim" tx descriptors ready to be reclaimed.  Reclaim as
3509 * many as possible but stop when there are around "n" mbufs to free.
3510 *
3511 * The actual number reclaimed is provided as the return value.
3512 */
3513static int
3514reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n)
3515{
3516	struct tx_sdesc *txsd;
3517	struct tx_maps *txmaps;
3518	struct tx_map *txm;
3519	unsigned int reclaimed, maps;
3520	struct sge_eq *eq = &txq->eq;
3521
3522	TXQ_LOCK_ASSERT_OWNED(txq);
3523
3524	if (can_reclaim == 0)
3525		can_reclaim = reclaimable(eq);
3526
3527	maps = reclaimed = 0;
3528	while (can_reclaim && maps < n) {
3529		int ndesc;
3530
3531		txsd = &txq->sdesc[eq->cidx];
3532		ndesc = txsd->desc_used;
3533
3534		/* Firmware doesn't return "partial" credits. */
3535		KASSERT(can_reclaim >= ndesc,
3536		    ("%s: unexpected number of credits: %d, %d",
3537		    __func__, can_reclaim, ndesc));
3538
3539		maps += txsd->credits;
3540
3541		reclaimed += ndesc;
3542		can_reclaim -= ndesc;
3543
3544		eq->cidx += ndesc;
3545		if (__predict_false(eq->cidx >= eq->cap))
3546			eq->cidx -= eq->cap;
3547	}
3548
3549	txmaps = &txq->txmaps;
3550	txm = &txmaps->maps[txmaps->map_cidx];
3551	if (maps)
3552		prefetch(txm->m);
3553
3554	eq->avail += reclaimed;
3555	KASSERT(eq->avail < eq->cap,	/* avail tops out at (cap - 1) */
3556	    ("%s: too many descriptors available", __func__));
3557
3558	txmaps->map_avail += maps;
3559	KASSERT(txmaps->map_avail <= txmaps->map_total,
3560	    ("%s: too many maps available", __func__));
3561
3562	while (maps--) {
3563		struct tx_map *next;
3564
3565		next = txm + 1;
3566		if (__predict_false(txmaps->map_cidx + 1 == txmaps->map_total))
3567			next = txmaps->maps;
3568		prefetch(next->m);
3569
3570		bus_dmamap_unload(txq->tx_tag, txm->map);
3571		m_freem(txm->m);
3572		txm->m = NULL;
3573
3574		txm = next;
3575		if (__predict_false(++txmaps->map_cidx == txmaps->map_total))
3576			txmaps->map_cidx = 0;
3577	}
3578
3579	return (reclaimed);
3580}
3581
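/*
 * Write a FLUSH work request that does nothing but request an egress update
 * (EQUEQ/EQUIQ), so a stalled queue hears back from the SGE when credits free
 * up.  It consumes one descriptor.
 */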
3582static void
3583write_eqflush_wr(struct sge_eq *eq)
3584{
3585	struct fw_eq_flush_wr *wr;
3586
3587	EQ_LOCK_ASSERT_OWNED(eq);
3588	KASSERT(eq->avail > 0, ("%s: no descriptors left.", __func__));
3589	KASSERT(!(eq->flags & EQ_CRFLUSHED), ("%s: flushed already", __func__));
3590
3591	wr = (void *)&eq->desc[eq->pidx];
3592	bzero(wr, sizeof(*wr));
3593	wr->opcode = FW_EQ_FLUSH_WR;
3594	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(sizeof(*wr) / 16) |
3595	    F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
3596
3597	eq->flags |= (EQ_CRFLUSHED | EQ_STALLED);
3598	eq->pending++;
3599	eq->avail--;
3600	if (++eq->pidx == eq->cap)
3601		eq->pidx = 0;
3602}
3603
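/*
 * Returns flit "idx" of the SGL tail that follows the first segment.  Flits
 * come in groups of three: a packed pair of 32-bit lengths followed by the
 * two corresponding 64-bit addresses.
 */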
3604static __be64
3605get_flit(bus_dma_segment_t *sgl, int nsegs, int idx)
3606{
3607	int i = (idx / 3) * 2;
3608
3609	switch (idx % 3) {
3610	case 0: {
3611		__be64 rc;
3612
3613		rc = htobe32(sgl[i].ds_len);
3614		if (i + 1 < nsegs)
3615			rc |= (uint64_t)htobe32(sgl[i + 1].ds_len) << 32;
3616
3617		return (rc);
3618	}
3619	case 1:
3620		return htobe64(sgl[i].ds_addr);
3621	case 2:
3622		return htobe64(sgl[i + 1].ds_addr);
3623	}
3624
3625	return (0);
3626}
3627
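/*
 * Pick the smallest freelist buffer size that can hold bufsize; fall back to
 * the largest size if none of them can.
 */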
3628static void
3629set_fl_tag_idx(struct sge_fl *fl, int bufsize)
3630{
3631	int i;
3632
3633	for (i = 0; i < FL_BUF_SIZES - 1; i++) {
3634		if (FL_BUF_SIZE(i) >= bufsize)
3635			break;
3636	}
3637
3638	fl->tag_idx = i;
3639}
3640
3641static void
3642add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
3643{
3644	mtx_lock(&sc->sfl_lock);
3645	FL_LOCK(fl);
3646	if ((fl->flags & FL_DOOMED) == 0) {
3647		fl->flags |= FL_STARVING;
3648		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
3649		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
3650	}
3651	FL_UNLOCK(fl);
3652	mtx_unlock(&sc->sfl_lock);
3653}
3654
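/*
 * An egress update from the SGE means credits have been returned for an eq
 * that had requested one (EQUEQ/EQUIQ).  Restart the stalled tx queue if it
 * can make progress again, or wake up anyone waiting to tear the eq down.
 */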
3655static int
3656handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
3657    struct mbuf *m)
3658{
3659	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
3660	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
3661	struct adapter *sc = iq->adapter;
3662	struct sge *s = &sc->sge;
3663	struct sge_eq *eq;
3664
3665	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
3666	    rss->opcode));
3667
3668	eq = s->eqmap[qid - s->eq_start];
3669	EQ_LOCK(eq);
3670	KASSERT(eq->flags & EQ_CRFLUSHED,
3671	    ("%s: unsolicited egress update", __func__));
3672	eq->flags &= ~EQ_CRFLUSHED;
3673	eq->egr_update++;
3674
3675	if (__predict_false(eq->flags & EQ_DOOMED))
3676		wakeup_one(eq);
3677	else if (eq->flags & EQ_STALLED && can_resume_tx(eq))
3678		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
3679	EQ_UNLOCK(eq);
3680
3681	return (0);
3682}
3683
3684/* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
3685CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
3686    offsetof(struct cpl_fw6_msg, data));
3687
3688static int
3689handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
3690{
3691	struct adapter *sc = iq->adapter;
3692	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
3693
3694	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
3695	    rss->opcode));
3696
3697	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
3698		const struct rss_header *rss2;
3699
3700		rss2 = (const struct rss_header *)&cpl->data[0];
3701		return (sc->cpl_handler[rss2->opcode](iq, rss2, m));
3702	}
3703
3704	return (sc->fw_msg_handler[cpl->type](sc, &cpl->data[0]));
3705}
3706
3707static int
3708sysctl_uint16(SYSCTL_HANDLER_ARGS)
3709{
3710	uint16_t *id = arg1;
3711	int i = *id;
3712
3713	return sysctl_handle_int(oidp, &i, 0, req);
3714}
3715