cxgb_sge.c revision 195006
1226584Sdim/**************************************************************************
2226584Sdim
3226584SdimCopyright (c) 2007-2009, Chelsio Inc.
4226584SdimAll rights reserved.
5226584Sdim
6226584SdimRedistribution and use in source and binary forms, with or without
7226584Sdimmodification, are permitted provided that the following conditions are met:
8226584Sdim
9226584Sdim 1. Redistributions of source code must retain the above copyright notice,
10226584Sdim    this list of conditions and the following disclaimer.
11226584Sdim
12226584Sdim 2. Neither the name of the Chelsio Corporation nor the names of its
13226584Sdim    contributors may be used to endorse or promote products derived from
14226584Sdim    this software without specific prior written permission.
15226584Sdim
16226584SdimTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17226584SdimAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18226584SdimIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19226584SdimARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20226584SdimLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21226584SdimCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22226584SdimSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23226584SdimINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24226584SdimCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25226584SdimARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26226584SdimPOSSIBILITY OF SUCH DAMAGE.
27226584Sdim
28226584Sdim***************************************************************************/
29226584Sdim
30226584Sdim#include <sys/cdefs.h>
31226584Sdim__FBSDID("$FreeBSD: head/sys/dev/cxgb/cxgb_sge.c 195006 2009-06-25 21:50:15Z np $");
32226584Sdim
33226584Sdim#include <sys/param.h>
34226584Sdim#include <sys/systm.h>
35226584Sdim#include <sys/kernel.h>
36226584Sdim#include <sys/module.h>
37249423Sdim#include <sys/bus.h>
38239462Sdim#include <sys/conf.h>
39239462Sdim#include <machine/bus.h>
40239462Sdim#include <machine/resource.h>
41239462Sdim#include <sys/bus_dma.h>
42239462Sdim#include <sys/rman.h>
43226584Sdim#include <sys/queue.h>
44226584Sdim#include <sys/sysctl.h>
45226584Sdim#include <sys/taskqueue.h>
46226584Sdim
47226584Sdim#include <sys/proc.h>
48226584Sdim#include <sys/sbuf.h>
49226584Sdim#include <sys/sched.h>
50226584Sdim#include <sys/smp.h>
51226584Sdim#include <sys/systm.h>
52226584Sdim#include <sys/syslog.h>
53226584Sdim
54226584Sdim#include <net/bpf.h>
55226584Sdim
56226584Sdim#include <netinet/in_systm.h>
57226584Sdim#include <netinet/in.h>
58226584Sdim#include <netinet/ip.h>
59226584Sdim#include <netinet/tcp.h>
60226584Sdim
61226584Sdim#include <dev/pci/pcireg.h>
62226584Sdim#include <dev/pci/pcivar.h>
63226584Sdim
64226584Sdim#include <vm/vm.h>
65226584Sdim#include <vm/pmap.h>
66226584Sdim
67226584Sdim#include <cxgb_include.h>
68226584Sdim#include <sys/mvec.h>
69226584Sdim
70226584Sdimint	txq_fills = 0;
71226584Sdimint	multiq_tx_enable = 1;
72226584Sdim
73226584Sdimextern struct sysctl_oid_list sysctl__hw_cxgb_children;
74226584Sdimint cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
75226584SdimTUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
76226584SdimSYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
77226584Sdim    "size of per-queue mbuf ring");
78263508Sdim
79226584Sdimstatic int cxgb_tx_coalesce_force = 0;
80263508SdimTUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
81226584SdimSYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
82226584Sdim    &cxgb_tx_coalesce_force, 0,
83226584Sdim    "coalesce small packets into a single work request regardless of ring state");
84226584Sdim
85226584Sdim#define	COALESCE_START_DEFAULT		TX_ETH_Q_SIZE>>1
86226584Sdim#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
87226584Sdim#define	COALESCE_STOP_DEFAULT		TX_ETH_Q_SIZE>>2
88226584Sdim#define	COALESCE_STOP_MIN		TX_ETH_Q_SIZE>>5
89226584Sdim#define	TX_RECLAIM_DEFAULT		TX_ETH_Q_SIZE>>5
90226584Sdim#define	TX_RECLAIM_MAX			TX_ETH_Q_SIZE>>2
91226584Sdim#define	TX_RECLAIM_MIN			TX_ETH_Q_SIZE>>6
92226584Sdim
93226584Sdim
94263508Sdimstatic int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
95263508SdimTUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
96226584Sdim    &cxgb_tx_coalesce_enable_start);
97226584SdimSYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
98226584Sdim    &cxgb_tx_coalesce_enable_start, 0,
99226584Sdim    "coalesce enable threshold");
100226584Sdimstatic int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
101226584SdimTUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
102226584SdimSYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
103226584Sdim    &cxgb_tx_coalesce_enable_stop, 0,
104249423Sdim    "coalesce disable threshold");
105249423Sdimstatic int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
106239462SdimTUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
107249423SdimSYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
108249423Sdim    &cxgb_tx_reclaim_threshold, 0,
109249423Sdim    "tx cleaning minimum threshold");
110249423Sdim
111249423Sdim/*
112249423Sdim * XXX don't re-enable this until TOE stops assuming
113239462Sdim * we have an m_ext
114263508Sdim */
115263508Sdimstatic int recycle_enable = 0;
116226584Sdimint cxgb_ext_freed = 0;
117226584Sdimint cxgb_ext_inited = 0;
118226584Sdimint fl_q_size = 0;
119226584Sdimint jumbo_q_size = 0;
120226584Sdim
121226584Sdimextern int cxgb_use_16k_clusters;
122239462Sdimextern int nmbjumbo4;
123226584Sdimextern int nmbjumbo9;
124249423Sdimextern int nmbjumbo16;
125249423Sdim
126226584Sdim#define USE_GTS 0
127226584Sdim
128249423Sdim#define SGE_RX_SM_BUF_SIZE	1536
129239462Sdim#define SGE_RX_DROP_THRES	16
130226584Sdim#define SGE_RX_COPY_THRES	128
131226584Sdim
132226584Sdim/*
133226584Sdim * Period of the Tx buffer reclaim timer.  This timer does not need to run
134226584Sdim * frequently as Tx buffers are usually reclaimed by new Tx packets.
135226584Sdim */
136226584Sdim#define TX_RECLAIM_PERIOD       (hz >> 1)
137226584Sdim
138226584Sdim/*
139226584Sdim * Values for sge_txq.flags
140226584Sdim */
141226584Sdimenum {
142239462Sdim	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
143239462Sdim	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
144239462Sdim};
145239462Sdim
146226584Sdimstruct tx_desc {
147226584Sdim	uint64_t	flit[TX_DESC_FLITS];
148226584Sdim} __packed;
149226584Sdim
150226584Sdimstruct rx_desc {
151226584Sdim	uint32_t	addr_lo;
152226584Sdim	uint32_t	len_gen;
153226584Sdim	uint32_t	gen2;
154226584Sdim	uint32_t	addr_hi;
155226584Sdim} __packed;;
156226584Sdim
157226584Sdimstruct rsp_desc {               /* response queue descriptor */
158226584Sdim	struct rss_header	rss_hdr;
159226584Sdim	uint32_t		flags;
160239462Sdim	uint32_t		len_cq;
161239462Sdim	uint8_t			imm_data[47];
162263508Sdim	uint8_t			intr_gen;
163226584Sdim} __packed;
164239462Sdim
165239462Sdim#define RX_SW_DESC_MAP_CREATED	(1 << 0)
166239462Sdim#define TX_SW_DESC_MAP_CREATED	(1 << 1)
167263508Sdim#define RX_SW_DESC_INUSE        (1 << 3)
168239462Sdim#define TX_SW_DESC_MAPPED       (1 << 4)
169239462Sdim
170263508Sdim#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
171263508Sdim#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
172239462Sdim#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
173239462Sdim#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
174239462Sdim
175226584Sdimstruct tx_sw_desc {                /* SW state per Tx descriptor */
176226584Sdim	struct mbuf	*m;
177226584Sdim	bus_dmamap_t	map;
178263508Sdim	int		flags;
179226584Sdim};
180239462Sdim
181263508Sdimstruct rx_sw_desc {                /* SW state per Rx descriptor */
182263508Sdim	caddr_t		rxsd_cl;
183239462Sdim	struct mbuf	*m;
184239462Sdim	bus_dmamap_t	map;
185226584Sdim	int		flags;
186226584Sdim};
187226584Sdim
188226584Sdimstruct txq_state {
189226584Sdim	unsigned int	compl;
190226584Sdim	unsigned int	gen;
191226584Sdim	unsigned int	pidx;
192226584Sdim};
193226584Sdim
194226584Sdimstruct refill_fl_cb_arg {
195226584Sdim	int               error;
196226584Sdim	bus_dma_segment_t seg;
197226584Sdim	int               nseg;
198226584Sdim};
199226584Sdim
200226584Sdim
201226584Sdim/*
202226584Sdim * Maps a number of flits to the number of Tx descriptors that can hold them.
203226584Sdim * The formula is
204226584Sdim *
205226584Sdim * desc = 1 + (flits - 2) / (WR_FLITS - 1).
206226584Sdim *
207226584Sdim * HW allows up to 4 descriptors to be combined into a WR.
208226584Sdim */
209226584Sdimstatic uint8_t flit_desc_map[] = {
210226584Sdim	0,
211226584Sdim#if SGE_NUM_GENBITS == 1
212226584Sdim	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213263508Sdim	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
214226584Sdim	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215226584Sdim	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
216226584Sdim#elif SGE_NUM_GENBITS == 2
217226584Sdim	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
218263508Sdim	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
219226584Sdim	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
220226584Sdim	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
221263508Sdim#else
222226584Sdim# error "SGE_NUM_GENBITS must be 1 or 2"
223226584Sdim#endif
224226584Sdim};
225226584Sdim
226226584Sdim#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
227226584Sdim#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
228226584Sdim#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
229226584Sdim#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
230226584Sdim#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
231239462Sdim#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
232226584Sdim#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
233226584Sdim	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
234226584Sdim#define	TXQ_RING_DEQUEUE(qs) \
235226584Sdim	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236226584Sdim
237int cxgb_debug = 0;
238
239static void sge_timer_cb(void *arg);
240static void sge_timer_reclaim(void *arg, int ncount);
241static void sge_txq_reclaim_handler(void *arg, int ncount);
242static void cxgb_start_locked(struct sge_qset *qs);
243
244/*
245 * XXX need to cope with bursty scheduling by looking at a wider
246 * window than we are now for determining the need for coalescing
247 *
248 */
249static __inline uint64_t
250check_pkt_coalesce(struct sge_qset *qs)
251{
252        struct adapter *sc;
253        struct sge_txq *txq;
254	uint8_t *fill;
255
256	if (__predict_false(cxgb_tx_coalesce_force))
257		return (1);
258	txq = &qs->txq[TXQ_ETH];
259        sc = qs->port->adapter;
260	fill = &sc->tunq_fill[qs->idx];
261
262	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
263		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
264	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
265		cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN;
266	/*
267	 * if the hardware transmit queue is more than 1/8 full
268	 * we mark it as coalescing - we drop back from coalescing
269	 * when we go below 1/32 full and there are no packets enqueued,
270	 * this provides us with some degree of hysteresis
271	 */
272        if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
273	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
274                *fill = 0;
275        else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
276                *fill = 1;
277
278	return (sc->tunq_coalesce);
279}
280
281#ifdef __LP64__
282static void
283set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
284{
285	uint64_t wr_hilo;
286#if _BYTE_ORDER == _LITTLE_ENDIAN
287	wr_hilo = wr_hi;
288	wr_hilo |= (((uint64_t)wr_lo)<<32);
289#else
290	wr_hilo = wr_lo;
291	wr_hilo |= (((uint64_t)wr_hi)<<32);
292#endif
293	wrp->wrh_hilo = wr_hilo;
294}
295#else
296static void
297set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
298{
299
300	wrp->wrh_hi = wr_hi;
301	wmb();
302	wrp->wrh_lo = wr_lo;
303}
304#endif
305
306struct coalesce_info {
307	int count;
308	int nbytes;
309};
310
311static int
312coalesce_check(struct mbuf *m, void *arg)
313{
314	struct coalesce_info *ci = arg;
315	int *count = &ci->count;
316	int *nbytes = &ci->nbytes;
317
318	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
319		(*count < 7) && (m->m_next == NULL))) {
320		*count += 1;
321		*nbytes += m->m_len;
322		return (1);
323	}
324	return (0);
325}
326
327static struct mbuf *
328cxgb_dequeue(struct sge_qset *qs)
329{
330	struct mbuf *m, *m_head, *m_tail;
331	struct coalesce_info ci;
332
333
334	if (check_pkt_coalesce(qs) == 0)
335		return TXQ_RING_DEQUEUE(qs);
336
337	m_head = m_tail = NULL;
338	ci.count = ci.nbytes = 0;
339	do {
340		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
341		if (m_head == NULL) {
342			m_tail = m_head = m;
343		} else if (m != NULL) {
344			m_tail->m_nextpkt = m;
345			m_tail = m;
346		}
347	} while (m != NULL);
348	if (ci.count > 7)
349		panic("trying to coalesce %d packets in to one WR", ci.count);
350	return (m_head);
351}
352
353/**
354 *	reclaim_completed_tx - reclaims completed Tx descriptors
355 *	@adapter: the adapter
356 *	@q: the Tx queue to reclaim completed descriptors from
357 *
358 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
359 *	and frees the associated buffers if possible.  Called with the Tx
360 *	queue's lock held.
361 */
362static __inline int
363reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
364{
365	struct sge_txq *q = &qs->txq[queue];
366	int reclaim = desc_reclaimable(q);
367
368	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
369	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
370		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
371
372	if (reclaim < reclaim_min)
373		return (0);
374
375	mtx_assert(&qs->lock, MA_OWNED);
376	if (reclaim > 0) {
377		t3_free_tx_desc(qs, reclaim, queue);
378		q->cleaned += reclaim;
379		q->in_use -= reclaim;
380	}
381	if (isset(&qs->txq_stopped, TXQ_ETH))
382                clrbit(&qs->txq_stopped, TXQ_ETH);
383
384	return (reclaim);
385}
386
387/**
388 *	should_restart_tx - are there enough resources to restart a Tx queue?
389 *	@q: the Tx queue
390 *
391 *	Checks if there are enough descriptors to restart a suspended Tx queue.
392 */
393static __inline int
394should_restart_tx(const struct sge_txq *q)
395{
396	unsigned int r = q->processed - q->cleaned;
397
398	return q->in_use - r < (q->size >> 1);
399}
400
401/**
402 *	t3_sge_init - initialize SGE
403 *	@adap: the adapter
404 *	@p: the SGE parameters
405 *
406 *	Performs SGE initialization needed every time after a chip reset.
407 *	We do not initialize any of the queue sets here, instead the driver
408 *	top-level must request those individually.  We also do not enable DMA
409 *	here, that should be done after the queues have been set up.
410 */
411void
412t3_sge_init(adapter_t *adap, struct sge_params *p)
413{
414	u_int ctrl, ups;
415
416	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
417
418	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
419	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
420	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
421	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
422#if SGE_NUM_GENBITS == 1
423	ctrl |= F_EGRGENCTRL;
424#endif
425	if (adap->params.rev > 0) {
426		if (!(adap->flags & (USING_MSIX | USING_MSI)))
427			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
428	}
429	t3_write_reg(adap, A_SG_CONTROL, ctrl);
430	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
431		     V_LORCQDRBTHRSH(512));
432	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
433	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
434		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
435	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
436		     adap->params.rev < T3_REV_C ? 1000 : 500);
437	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
438	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
439	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
440	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
441	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
442}
443
444
445/**
446 *	sgl_len - calculates the size of an SGL of the given capacity
447 *	@n: the number of SGL entries
448 *
449 *	Calculates the number of flits needed for a scatter/gather list that
450 *	can hold the given number of entries.
451 */
452static __inline unsigned int
453sgl_len(unsigned int n)
454{
455	return ((3 * n) / 2 + (n & 1));
456}
457
458/**
459 *	get_imm_packet - return the next ingress packet buffer from a response
460 *	@resp: the response descriptor containing the packet data
461 *
462 *	Return a packet containing the immediate data of the given response.
463 */
464static int
465get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
466{
467
468	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
469	m->m_ext.ext_buf = NULL;
470	m->m_ext.ext_type = 0;
471	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
472	return (0);
473}
474
475static __inline u_int
476flits_to_desc(u_int n)
477{
478	return (flit_desc_map[n]);
479}
480
481#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
482		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
483		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
484		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
485		    F_HIRCQPARITYERROR)
486#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
487#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
488		      F_RSPQDISABLED)
489
490/**
491 *	t3_sge_err_intr_handler - SGE async event interrupt handler
492 *	@adapter: the adapter
493 *
494 *	Interrupt handler for SGE asynchronous (non-data) events.
495 */
496void
497t3_sge_err_intr_handler(adapter_t *adapter)
498{
499	unsigned int v, status;
500
501	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
502	if (status & SGE_PARERR)
503		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
504			 status & SGE_PARERR);
505	if (status & SGE_FRAMINGERR)
506		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
507			 status & SGE_FRAMINGERR);
508	if (status & F_RSPQCREDITOVERFOW)
509		CH_ALERT(adapter, "SGE response queue credit overflow\n");
510
511	if (status & F_RSPQDISABLED) {
512		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
513
514		CH_ALERT(adapter,
515			 "packet delivered to disabled response queue (0x%x)\n",
516			 (v >> S_RSPQ0DISABLED) & 0xff);
517	}
518
519	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
520	if (status & SGE_FATALERR)
521		t3_fatal_err(adapter);
522}
523
524void
525t3_sge_prep(adapter_t *adap, struct sge_params *p)
526{
527	int i, nqsets;
528
529	nqsets = min(SGE_QSETS, mp_ncpus*4);
530
531	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
532
533	while (!powerof2(fl_q_size))
534		fl_q_size--;
535#if __FreeBSD_version >= 700111
536	if (cxgb_use_16k_clusters)
537		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
538	else
539		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
540#else
541	jumbo_q_size = min(nmbjumbo4/(3*nqsets), JUMBO_Q_SIZE);
542#endif
543	while (!powerof2(jumbo_q_size))
544		jumbo_q_size--;
545
546	/* XXX Does ETHER_ALIGN need to be accounted for here? */
547	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
548
549	for (i = 0; i < SGE_QSETS; ++i) {
550		struct qset_params *q = p->qset + i;
551
552		if (adap->params.nports > 2) {
553			q->coalesce_usecs = 50;
554		} else {
555#ifdef INVARIANTS
556			q->coalesce_usecs = 10;
557#else
558			q->coalesce_usecs = 5;
559#endif
560		}
561		q->polling = 0;
562		q->rspq_size = RSPQ_Q_SIZE;
563		q->fl_size = fl_q_size;
564		q->jumbo_size = jumbo_q_size;
565		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
566		q->txq_size[TXQ_OFLD] = 1024;
567		q->txq_size[TXQ_CTRL] = 256;
568		q->cong_thres = 0;
569	}
570}
571
572int
573t3_sge_alloc(adapter_t *sc)
574{
575
576	/* The parent tag. */
577	if (bus_dma_tag_create( NULL,			/* parent */
578				1, 0,			/* algnmnt, boundary */
579				BUS_SPACE_MAXADDR,	/* lowaddr */
580				BUS_SPACE_MAXADDR,	/* highaddr */
581				NULL, NULL,		/* filter, filterarg */
582				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
583				BUS_SPACE_UNRESTRICTED, /* nsegments */
584				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
585				0,			/* flags */
586				NULL, NULL,		/* lock, lockarg */
587				&sc->parent_dmat)) {
588		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
589		return (ENOMEM);
590	}
591
592	/*
593	 * DMA tag for normal sized RX frames
594	 */
595	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
596		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
597		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
598		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
599		return (ENOMEM);
600	}
601
602	/*
603	 * DMA tag for jumbo sized RX frames.
604	 */
605	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
606		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
607		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
608		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
609		return (ENOMEM);
610	}
611
612	/*
613	 * DMA tag for TX frames.
614	 */
615	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
616		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
617		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
618		NULL, NULL, &sc->tx_dmat)) {
619		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
620		return (ENOMEM);
621	}
622
623	return (0);
624}
625
626int
627t3_sge_free(struct adapter * sc)
628{
629
630	if (sc->tx_dmat != NULL)
631		bus_dma_tag_destroy(sc->tx_dmat);
632
633	if (sc->rx_jumbo_dmat != NULL)
634		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
635
636	if (sc->rx_dmat != NULL)
637		bus_dma_tag_destroy(sc->rx_dmat);
638
639	if (sc->parent_dmat != NULL)
640		bus_dma_tag_destroy(sc->parent_dmat);
641
642	return (0);
643}
644
645void
646t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
647{
648
649	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
650	qs->rspq.polling = 0 /* p->polling */;
651}
652
653#if !defined(__i386__) && !defined(__amd64__)
654static void
655refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
656{
657	struct refill_fl_cb_arg *cb_arg = arg;
658
659	cb_arg->error = error;
660	cb_arg->seg = segs[0];
661	cb_arg->nseg = nseg;
662
663}
664#endif
665/**
666 *	refill_fl - refill an SGE free-buffer list
667 *	@sc: the controller softc
668 *	@q: the free-list to refill
669 *	@n: the number of new buffers to allocate
670 *
671 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
672 *	The caller must assure that @n does not exceed the queue's capacity.
673 */
674static void
675refill_fl(adapter_t *sc, struct sge_fl *q, int n)
676{
677	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
678	struct rx_desc *d = &q->desc[q->pidx];
679	struct refill_fl_cb_arg cb_arg;
680	struct mbuf *m;
681	caddr_t cl;
682	int err, count = 0;
683
684	cb_arg.error = 0;
685	while (n--) {
686		/*
687		 * We only allocate a cluster, mbuf allocation happens after rx
688		 */
689		if (q->zone == zone_pack) {
690			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
691				break;
692			cl = m->m_ext.ext_buf;
693		} else {
694			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
695				break;
696			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
697				uma_zfree(q->zone, cl);
698				break;
699			}
700		}
701		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
702			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
703				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
704				uma_zfree(q->zone, cl);
705				goto done;
706			}
707			sd->flags |= RX_SW_DESC_MAP_CREATED;
708		}
709#if !defined(__i386__) && !defined(__amd64__)
710		err = bus_dmamap_load(q->entry_tag, sd->map,
711		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
712
713		if (err != 0 || cb_arg.error) {
714			if (q->zone == zone_pack)
715				uma_zfree(q->zone, cl);
716			m_free(m);
717			goto done;
718		}
719#else
720		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
721#endif
722		sd->flags |= RX_SW_DESC_INUSE;
723		sd->rxsd_cl = cl;
724		sd->m = m;
725		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
726		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
727		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
728		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
729
730		d++;
731		sd++;
732
733		if (++q->pidx == q->size) {
734			q->pidx = 0;
735			q->gen ^= 1;
736			sd = q->sdesc;
737			d = q->desc;
738		}
739		q->credits++;
740		count++;
741	}
742
743done:
744	if (count)
745		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
746}
747
748
749/**
750 *	free_rx_bufs - free the Rx buffers on an SGE free list
751 *	@sc: the controle softc
752 *	@q: the SGE free list to clean up
753 *
754 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
755 *	this queue should be stopped before calling this function.
756 */
757static void
758free_rx_bufs(adapter_t *sc, struct sge_fl *q)
759{
760	u_int cidx = q->cidx;
761
762	while (q->credits--) {
763		struct rx_sw_desc *d = &q->sdesc[cidx];
764
765		if (d->flags & RX_SW_DESC_INUSE) {
766			bus_dmamap_unload(q->entry_tag, d->map);
767			bus_dmamap_destroy(q->entry_tag, d->map);
768			if (q->zone == zone_pack) {
769				m_init(d->m, zone_pack, MCLBYTES,
770				    M_NOWAIT, MT_DATA, M_EXT);
771				uma_zfree(zone_pack, d->m);
772			} else {
773				m_init(d->m, zone_mbuf, MLEN,
774				    M_NOWAIT, MT_DATA, 0);
775				uma_zfree(zone_mbuf, d->m);
776				uma_zfree(q->zone, d->rxsd_cl);
777			}
778		}
779
780		d->rxsd_cl = NULL;
781		d->m = NULL;
782		if (++cidx == q->size)
783			cidx = 0;
784	}
785}
786
787static __inline void
788__refill_fl(adapter_t *adap, struct sge_fl *fl)
789{
790	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
791}
792
793static __inline void
794__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
795{
796	if ((fl->size - fl->credits) < max)
797		refill_fl(adap, fl, min(max, fl->size - fl->credits));
798}
799
800/**
801 *	recycle_rx_buf - recycle a receive buffer
802 *	@adapter: the adapter
803 *	@q: the SGE free list
804 *	@idx: index of buffer to recycle
805 *
806 *	Recycles the specified buffer on the given free list by adding it at
807 *	the next available slot on the list.
808 */
809static void
810recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
811{
812	struct rx_desc *from = &q->desc[idx];
813	struct rx_desc *to   = &q->desc[q->pidx];
814
815	q->sdesc[q->pidx] = q->sdesc[idx];
816	to->addr_lo = from->addr_lo;        // already big endian
817	to->addr_hi = from->addr_hi;        // likewise
818	wmb();	/* necessary ? */
819	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
820	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
821	q->credits++;
822
823	if (++q->pidx == q->size) {
824		q->pidx = 0;
825		q->gen ^= 1;
826	}
827	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
828}
829
830static void
831alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
832{
833	uint32_t *addr;
834
835	addr = arg;
836	*addr = segs[0].ds_addr;
837}
838
839static int
840alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
841    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
842    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
843{
844	size_t len = nelem * elem_size;
845	void *s = NULL;
846	void *p = NULL;
847	int err;
848
849	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
850				      BUS_SPACE_MAXADDR_32BIT,
851				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
852				      len, 0, NULL, NULL, tag)) != 0) {
853		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
854		return (ENOMEM);
855	}
856
857	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
858				    map)) != 0) {
859		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
860		return (ENOMEM);
861	}
862
863	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
864	bzero(p, len);
865	*(void **)desc = p;
866
867	if (sw_size) {
868		len = nelem * sw_size;
869		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
870		*(void **)sdesc = s;
871	}
872	if (parent_entry_tag == NULL)
873		return (0);
874
875	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
876				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
877		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
878				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
879		                      NULL, NULL, entry_tag)) != 0) {
880		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
881		return (ENOMEM);
882	}
883	return (0);
884}
885
886static void
887sge_slow_intr_handler(void *arg, int ncount)
888{
889	adapter_t *sc = arg;
890
891	t3_slow_intr_handler(sc);
892}
893
894/**
895 *	sge_timer_cb - perform periodic maintenance of an SGE qset
896 *	@data: the SGE queue set to maintain
897 *
898 *	Runs periodically from a timer to perform maintenance of an SGE queue
899 *	set.  It performs two tasks:
900 *
901 *	a) Cleans up any completed Tx descriptors that may still be pending.
902 *	Normal descriptor cleanup happens when new packets are added to a Tx
903 *	queue so this timer is relatively infrequent and does any cleanup only
904 *	if the Tx queue has not seen any new packets in a while.  We make a
905 *	best effort attempt to reclaim descriptors, in that we don't wait
906 *	around if we cannot get a queue's lock (which most likely is because
907 *	someone else is queueing new packets and so will also handle the clean
908 *	up).  Since control queues use immediate data exclusively we don't
909 *	bother cleaning them up here.
910 *
911 *	b) Replenishes Rx queues that have run out due to memory shortage.
912 *	Normally new Rx buffers are added when existing ones are consumed but
913 *	when out of memory a queue can become empty.  We try to add only a few
914 *	buffers here, the queue will be replenished fully as these new buffers
915 *	are used up if memory shortage has subsided.
916 *
917 *	c) Return coalesced response queue credits in case a response queue is
918 *	starved.
919 *
920 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
921 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
922 */
923static void
924sge_timer_cb(void *arg)
925{
926	adapter_t *sc = arg;
927	if ((sc->flags & USING_MSIX) == 0) {
928
929		struct port_info *pi;
930		struct sge_qset *qs;
931		struct sge_txq  *txq;
932		int i, j;
933		int reclaim_ofl, refill_rx;
934
935		if (sc->open_device_map == 0)
936			return;
937
938		for (i = 0; i < sc->params.nports; i++) {
939			pi = &sc->port[i];
940			for (j = 0; j < pi->nqsets; j++) {
941				qs = &sc->sge.qs[pi->first_qset + j];
942				txq = &qs->txq[0];
943				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
944				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
945				    (qs->fl[1].credits < qs->fl[1].size));
946				if (reclaim_ofl || refill_rx) {
947					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
948					break;
949				}
950			}
951		}
952	}
953
954	if (sc->params.nports > 2) {
955		int i;
956
957		for_each_port(sc, i) {
958			struct port_info *pi = &sc->port[i];
959
960			t3_write_reg(sc, A_SG_KDOORBELL,
961				     F_SELEGRCNTX |
962				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
963		}
964	}
965	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
966	    sc->open_device_map != 0)
967		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
968}
969
970/*
971 * This is meant to be a catch-all function to keep sge state private
972 * to sge.c
973 *
974 */
975int
976t3_sge_init_adapter(adapter_t *sc)
977{
978	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
979	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
980	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
981	return (0);
982}
983
984int
985t3_sge_reset_adapter(adapter_t *sc)
986{
987	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
988	return (0);
989}
990
991int
992t3_sge_init_port(struct port_info *pi)
993{
994	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
995	return (0);
996}
997
998/**
999 *	refill_rspq - replenish an SGE response queue
1000 *	@adapter: the adapter
1001 *	@q: the response queue to replenish
1002 *	@credits: how many new responses to make available
1003 *
1004 *	Replenishes a response queue by making the supplied number of responses
1005 *	available to HW.
1006 */
1007static __inline void
1008refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1009{
1010
1011	/* mbufs are allocated on demand when a rspq entry is processed. */
1012	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1013		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1014}
1015
1016static void
1017sge_txq_reclaim_handler(void *arg, int ncount)
1018{
1019	struct sge_qset *qs = arg;
1020	int i;
1021
1022	for (i = 0; i < 3; i++)
1023		reclaim_completed_tx(qs, 16, i);
1024}
1025
1026static void
1027sge_timer_reclaim(void *arg, int ncount)
1028{
1029	struct port_info *pi = arg;
1030	int i, nqsets = pi->nqsets;
1031	adapter_t *sc = pi->adapter;
1032	struct sge_qset *qs;
1033	struct mtx *lock;
1034
1035	KASSERT((sc->flags & USING_MSIX) == 0,
1036	    ("can't call timer reclaim for msi-x"));
1037
1038	for (i = 0; i < nqsets; i++) {
1039		qs = &sc->sge.qs[pi->first_qset + i];
1040
1041		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1042		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1043			    &sc->sge.qs[0].rspq.lock;
1044
1045		if (mtx_trylock(lock)) {
1046			/* XXX currently assume that we are *NOT* polling */
1047			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1048
1049			if (qs->fl[0].credits < qs->fl[0].size - 16)
1050				__refill_fl(sc, &qs->fl[0]);
1051			if (qs->fl[1].credits < qs->fl[1].size - 16)
1052				__refill_fl(sc, &qs->fl[1]);
1053
1054			if (status & (1 << qs->rspq.cntxt_id)) {
1055				if (qs->rspq.credits) {
1056					refill_rspq(sc, &qs->rspq, 1);
1057					qs->rspq.credits--;
1058					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1059					    1 << qs->rspq.cntxt_id);
1060				}
1061			}
1062			mtx_unlock(lock);
1063		}
1064	}
1065}
1066
1067/**
1068 *	init_qset_cntxt - initialize an SGE queue set context info
1069 *	@qs: the queue set
1070 *	@id: the queue set id
1071 *
1072 *	Initializes the TIDs and context ids for the queues of a queue set.
1073 */
1074static void
1075init_qset_cntxt(struct sge_qset *qs, u_int id)
1076{
1077
1078	qs->rspq.cntxt_id = id;
1079	qs->fl[0].cntxt_id = 2 * id;
1080	qs->fl[1].cntxt_id = 2 * id + 1;
1081	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1082	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1083	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1084	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1085	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1086
1087	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1088	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1089	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1090}
1091
1092
1093static void
1094txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1095{
1096	txq->in_use += ndesc;
1097	/*
1098	 * XXX we don't handle stopping of queue
1099	 * presumably start handles this when we bump against the end
1100	 */
1101	txqs->gen = txq->gen;
1102	txq->unacked += ndesc;
1103	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1104	txq->unacked &= 31;
1105	txqs->pidx = txq->pidx;
1106	txq->pidx += ndesc;
1107#ifdef INVARIANTS
1108	if (((txqs->pidx > txq->cidx) &&
1109		(txq->pidx < txqs->pidx) &&
1110		(txq->pidx >= txq->cidx)) ||
1111	    ((txqs->pidx < txq->cidx) &&
1112		(txq->pidx >= txq-> cidx)) ||
1113	    ((txqs->pidx < txq->cidx) &&
1114		(txq->cidx < txqs->pidx)))
1115		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1116		    txqs->pidx, txq->pidx, txq->cidx);
1117#endif
1118	if (txq->pidx >= txq->size) {
1119		txq->pidx -= txq->size;
1120		txq->gen ^= 1;
1121	}
1122
1123}
1124
1125/**
1126 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1127 *	@m: the packet mbufs
1128 *      @nsegs: the number of segments
1129 *
1130 * 	Returns the number of Tx descriptors needed for the given Ethernet
1131 * 	packet.  Ethernet packets require addition of WR and CPL headers.
1132 */
1133static __inline unsigned int
1134calc_tx_descs(const struct mbuf *m, int nsegs)
1135{
1136	unsigned int flits;
1137
1138	if (m->m_pkthdr.len <= PIO_LEN)
1139		return 1;
1140
1141	flits = sgl_len(nsegs) + 2;
1142#ifdef TSO_SUPPORTED
1143	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1144		flits++;
1145#endif
1146	return flits_to_desc(flits);
1147}
1148
1149static unsigned int
1150busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1151    struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1152{
1153	struct mbuf *m0;
1154	int err, pktlen, pass = 0;
1155	bus_dma_tag_t tag = txq->entry_tag;
1156
1157retry:
1158	err = 0;
1159	m0 = *m;
1160	pktlen = m0->m_pkthdr.len;
1161#if defined(__i386__) || defined(__amd64__)
1162	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1163		goto done;
1164	} else
1165#endif
1166		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1167
1168	if (err == 0) {
1169		goto done;
1170	}
1171	if (err == EFBIG && pass == 0) {
1172		pass = 1;
1173		/* Too many segments, try to defrag */
1174		m0 = m_defrag(m0, M_DONTWAIT);
1175		if (m0 == NULL) {
1176			m_freem(*m);
1177			*m = NULL;
1178			return (ENOBUFS);
1179		}
1180		*m = m0;
1181		goto retry;
1182	} else if (err == ENOMEM) {
1183		return (err);
1184	} if (err) {
1185		if (cxgb_debug)
1186			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1187		m_freem(m0);
1188		*m = NULL;
1189		return (err);
1190	}
1191done:
1192#if !defined(__i386__) && !defined(__amd64__)
1193	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1194#endif
1195	txsd->flags |= TX_SW_DESC_MAPPED;
1196
1197	return (0);
1198}
1199
1200/**
1201 *	make_sgl - populate a scatter/gather list for a packet
1202 *	@sgp: the SGL to populate
1203 *	@segs: the packet dma segments
1204 *	@nsegs: the number of segments
1205 *
1206 *	Generates a scatter/gather list for the buffers that make up a packet
1207 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1208 *	appropriately.
1209 */
1210static __inline void
1211make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1212{
1213	int i, idx;
1214
1215	for (idx = 0, i = 0; i < nsegs; i++) {
1216		/*
1217		 * firmware doesn't like empty segments
1218		 */
1219		if (segs[i].ds_len == 0)
1220			continue;
1221		if (i && idx == 0)
1222			++sgp;
1223
1224		sgp->len[idx] = htobe32(segs[i].ds_len);
1225		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1226		idx ^= 1;
1227	}
1228
1229	if (idx) {
1230		sgp->len[idx] = 0;
1231		sgp->addr[idx] = 0;
1232	}
1233}
1234
1235/**
1236 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1237 *	@adap: the adapter
1238 *	@q: the Tx queue
1239 *
1240 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1241 *	where the HW is going to sleep just after we checked, however,
1242 *	then the interrupt handler will detect the outstanding TX packet
1243 *	and ring the doorbell for us.
1244 *
1245 *	When GTS is disabled we unconditionally ring the doorbell.
1246 */
1247static __inline void
1248check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1249{
1250#if USE_GTS
1251	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1252	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1253		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1254#ifdef T3_TRACE
1255		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1256			  q->cntxt_id);
1257#endif
1258		t3_write_reg(adap, A_SG_KDOORBELL,
1259			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1260	}
1261#else
1262	wmb();            /* write descriptors before telling HW */
1263	t3_write_reg(adap, A_SG_KDOORBELL,
1264		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1265#endif
1266}
1267
1268static __inline void
1269wr_gen2(struct tx_desc *d, unsigned int gen)
1270{
1271#if SGE_NUM_GENBITS == 2
1272	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1273#endif
1274}
1275
1276/**
1277 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1278 *	@ndesc: number of Tx descriptors spanned by the SGL
1279 *	@txd: first Tx descriptor to be written
1280 *	@txqs: txq state (generation and producer index)
1281 *	@txq: the SGE Tx queue
1282 *	@sgl: the SGL
1283 *	@flits: number of flits to the start of the SGL in the first descriptor
1284 *	@sgl_flits: the SGL size in flits
1285 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1286 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1287 *
1288 *	Write a work request header and an associated SGL.  If the SGL is
1289 *	small enough to fit into one Tx descriptor it has already been written
1290 *	and we just need to write the WR header.  Otherwise we distribute the
1291 *	SGL across the number of descriptors it spans.
1292 */
1293static void
1294write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1295    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1296    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1297{
1298
1299	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1300	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1301
1302	if (__predict_true(ndesc == 1)) {
1303		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1304			V_WR_SGLSFLT(flits)) | wr_hi,
1305		    htonl(V_WR_LEN(flits + sgl_flits) |
1306			V_WR_GEN(txqs->gen)) | wr_lo);
1307		/* XXX gen? */
1308		wr_gen2(txd, txqs->gen);
1309
1310	} else {
1311		unsigned int ogen = txqs->gen;
1312		const uint64_t *fp = (const uint64_t *)sgl;
1313		struct work_request_hdr *wp = wrp;
1314
1315		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1316		    V_WR_SGLSFLT(flits)) | wr_hi;
1317
1318		while (sgl_flits) {
1319			unsigned int avail = WR_FLITS - flits;
1320
1321			if (avail > sgl_flits)
1322				avail = sgl_flits;
1323			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1324			sgl_flits -= avail;
1325			ndesc--;
1326			if (!sgl_flits)
1327				break;
1328
1329			fp += avail;
1330			txd++;
1331			txsd++;
1332			if (++txqs->pidx == txq->size) {
1333				txqs->pidx = 0;
1334				txqs->gen ^= 1;
1335				txd = txq->desc;
1336				txsd = txq->sdesc;
1337			}
1338
1339			/*
1340			 * when the head of the mbuf chain
1341			 * is freed all clusters will be freed
1342			 * with it
1343			 */
1344			wrp = (struct work_request_hdr *)txd;
1345			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1346			    V_WR_SGLSFLT(1)) | wr_hi;
1347			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1348				    sgl_flits + 1)) |
1349			    V_WR_GEN(txqs->gen)) | wr_lo;
1350			wr_gen2(txd, txqs->gen);
1351			flits = 1;
1352		}
1353		wrp->wrh_hi |= htonl(F_WR_EOP);
1354		wmb();
1355		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1356		wr_gen2((struct tx_desc *)wp, ogen);
1357	}
1358}
1359
1360/* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1361#define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1362
1363#ifdef VLAN_SUPPORTED
1364#define GET_VTAG(cntrl, m) \
1365do { \
1366	if ((m)->m_flags & M_VLANTAG)					            \
1367		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1368} while (0)
1369
1370#else
1371#define GET_VTAG(cntrl, m)
1372#endif
1373
1374static int
1375t3_encap(struct sge_qset *qs, struct mbuf **m)
1376{
1377	adapter_t *sc;
1378	struct mbuf *m0;
1379	struct sge_txq *txq;
1380	struct txq_state txqs;
1381	struct port_info *pi;
1382	unsigned int ndesc, flits, cntrl, mlen;
1383	int err, nsegs, tso_info = 0;
1384
1385	struct work_request_hdr *wrp;
1386	struct tx_sw_desc *txsd;
1387	struct sg_ent *sgp, *sgl;
1388	uint32_t wr_hi, wr_lo, sgl_flits;
1389	bus_dma_segment_t segs[TX_MAX_SEGS];
1390
1391	struct tx_desc *txd;
1392
1393	pi = qs->port;
1394	sc = pi->adapter;
1395	txq = &qs->txq[TXQ_ETH];
1396	txd = &txq->desc[txq->pidx];
1397	txsd = &txq->sdesc[txq->pidx];
1398	sgl = txq->txq_sgl;
1399
1400	prefetch(txd);
1401	m0 = *m;
1402
1403	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1404	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1405
1406	mtx_assert(&qs->lock, MA_OWNED);
1407	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1408	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1409
1410#ifdef VLAN_SUPPORTED
1411	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1412	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1413		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1414#endif
1415	if (m0->m_nextpkt != NULL) {
1416		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1417		ndesc = 1;
1418		mlen = 0;
1419	} else {
1420		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1421		    &m0, segs, &nsegs))) {
1422			if (cxgb_debug)
1423				printf("failed ... err=%d\n", err);
1424			return (err);
1425		}
1426		mlen = m0->m_pkthdr.len;
1427		ndesc = calc_tx_descs(m0, nsegs);
1428	}
1429	txq_prod(txq, ndesc, &txqs);
1430
1431	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1432	txsd->m = m0;
1433
1434	if (m0->m_nextpkt != NULL) {
1435		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1436		int i, fidx;
1437
1438		if (nsegs > 7)
1439			panic("trying to coalesce %d packets in to one WR", nsegs);
1440		txq->txq_coalesced += nsegs;
1441		wrp = (struct work_request_hdr *)txd;
1442		flits = nsegs*2 + 1;
1443
1444		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1445			struct cpl_tx_pkt_batch_entry *cbe;
1446			uint64_t flit;
1447			uint32_t *hflit = (uint32_t *)&flit;
1448			int cflags = m0->m_pkthdr.csum_flags;
1449
1450			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1451			GET_VTAG(cntrl, m0);
1452			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1453			if (__predict_false(!(cflags & CSUM_IP)))
1454				cntrl |= F_TXPKT_IPCSUM_DIS;
1455			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1456				cntrl |= F_TXPKT_L4CSUM_DIS;
1457
1458			hflit[0] = htonl(cntrl);
1459			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1460			flit |= htobe64(1 << 24);
1461			cbe = &cpl_batch->pkt_entry[i];
1462			cbe->cntrl = hflit[0];
1463			cbe->len = hflit[1];
1464			cbe->addr = htobe64(segs[i].ds_addr);
1465		}
1466
1467		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1468		    V_WR_SGLSFLT(flits)) |
1469		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1470		wr_lo = htonl(V_WR_LEN(flits) |
1471		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1472		set_wr_hdr(wrp, wr_hi, wr_lo);
1473		wmb();
1474		wr_gen2(txd, txqs.gen);
1475		check_ring_tx_db(sc, txq);
1476		return (0);
1477	} else if (tso_info) {
1478		int min_size = TCPPKTHDRSIZE, eth_type, tagged;
1479		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1480		struct ip *ip;
1481		struct tcphdr *tcp;
1482		char *pkthdr;
1483
1484		txd->flit[2] = 0;
1485		GET_VTAG(cntrl, m0);
1486		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1487		hdr->cntrl = htonl(cntrl);
1488		hdr->len = htonl(mlen | 0x80000000);
1489
1490		DPRINTF("tso buf len=%d\n", mlen);
1491
1492		tagged = m0->m_flags & M_VLANTAG;
1493		if (!tagged)
1494			min_size -= ETHER_VLAN_ENCAP_LEN;
1495
1496		if (__predict_false(mlen < min_size)) {
1497			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1498			    m0, mlen, m0->m_pkthdr.tso_segsz,
1499			    m0->m_pkthdr.csum_flags, m0->m_flags);
1500			panic("tx tso packet too small");
1501		}
1502
1503		/* Make sure that ether, ip, tcp headers are all in m0 */
1504		if (__predict_false(m0->m_len < min_size)) {
1505			m0 = m_pullup(m0, min_size);
1506			if (__predict_false(m0 == NULL)) {
1507				/* XXX panic probably an overreaction */
1508				panic("couldn't fit header into mbuf");
1509			}
1510		}
1511		pkthdr = m0->m_data;
1512
1513		if (tagged) {
1514			eth_type = CPL_ETH_II_VLAN;
1515			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1516			    ETHER_VLAN_ENCAP_LEN);
1517		} else {
1518			eth_type = CPL_ETH_II;
1519			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1520		}
1521		tcp = (struct tcphdr *)((uint8_t *)ip +
1522		    sizeof(*ip));
1523
1524		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1525			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1526			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1527		hdr->lso_info = htonl(tso_info);
1528
1529		if (__predict_false(mlen <= PIO_LEN)) {
1530			/* pkt not undersized but fits in PIO_LEN
1531			 * Indicates a TSO bug at the higher levels.
1532			 *
1533			 */
1534			DPRINTF("**5592 Fix** mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1535			    m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags);
1536			txsd->m = NULL;
1537			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1538			flits = (mlen + 7) / 8 + 3;
1539			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1540					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1541					  F_WR_SOP | F_WR_EOP | txqs.compl);
1542			wr_lo = htonl(V_WR_LEN(flits) |
1543			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1544			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1545			wmb();
1546			wr_gen2(txd, txqs.gen);
1547			check_ring_tx_db(sc, txq);
1548			return (0);
1549		}
1550		flits = 3;
1551	} else {
1552		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1553
1554		GET_VTAG(cntrl, m0);
1555		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1556		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1557			cntrl |= F_TXPKT_IPCSUM_DIS;
1558		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1559			cntrl |= F_TXPKT_L4CSUM_DIS;
1560		cpl->cntrl = htonl(cntrl);
1561		cpl->len = htonl(mlen | 0x80000000);
1562
1563		if (mlen <= PIO_LEN) {
1564			txsd->m = NULL;
1565			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1566			flits = (mlen + 7) / 8 + 2;
1567
1568			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1569			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1570					  F_WR_SOP | F_WR_EOP | txqs.compl);
1571			wr_lo = htonl(V_WR_LEN(flits) |
1572			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1573			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1574			wmb();
1575			wr_gen2(txd, txqs.gen);
1576			check_ring_tx_db(sc, txq);
1577			return (0);
1578		}
1579		flits = 2;
1580	}
1581	wrp = (struct work_request_hdr *)txd;
1582	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1583	make_sgl(sgp, segs, nsegs);
1584
1585	sgl_flits = sgl_len(nsegs);
1586
1587	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1588	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1589	wr_lo = htonl(V_WR_TID(txq->token));
1590	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1591	    sgl_flits, wr_hi, wr_lo);
1592	check_ring_tx_db(pi->adapter, txq);
1593
1594	return (0);
1595}
1596
1597void
1598cxgb_tx_watchdog(void *arg)
1599{
1600	struct sge_qset *qs = arg;
1601	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1602
1603        if (qs->coalescing != 0 &&
1604	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1605	    TXQ_RING_EMPTY(qs))
1606                qs->coalescing = 0;
1607        else if (qs->coalescing == 0 &&
1608	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1609                qs->coalescing = 1;
1610	if (TXQ_TRYLOCK(qs)) {
1611		qs->qs_flags |= QS_FLUSHING;
1612		cxgb_start_locked(qs);
1613		qs->qs_flags &= ~QS_FLUSHING;
1614		TXQ_UNLOCK(qs);
1615	}
1616	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1617		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1618		    qs, txq->txq_watchdog.c_cpu);
1619}
1620
1621static void
1622cxgb_tx_timeout(void *arg)
1623{
1624	struct sge_qset *qs = arg;
1625	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1626
1627	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1628                qs->coalescing = 1;
1629	if (TXQ_TRYLOCK(qs)) {
1630		qs->qs_flags |= QS_TIMEOUT;
1631		cxgb_start_locked(qs);
1632		qs->qs_flags &= ~QS_TIMEOUT;
1633		TXQ_UNLOCK(qs);
1634	}
1635}
1636
1637static void
1638cxgb_start_locked(struct sge_qset *qs)
1639{
1640	struct mbuf *m_head = NULL;
1641	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1642	int avail, txmax;
1643	int in_use_init = txq->in_use;
1644	struct port_info *pi = qs->port;
1645	struct ifnet *ifp = pi->ifp;
1646	avail = txq->size - txq->in_use - 4;
1647	txmax = min(TX_START_MAX_DESC, avail);
1648
1649	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1650		reclaim_completed_tx(qs, 0, TXQ_ETH);
1651
1652	if (!pi->link_config.link_ok) {
1653		TXQ_RING_FLUSH(qs);
1654		return;
1655	}
1656	TXQ_LOCK_ASSERT(qs);
1657	while ((txq->in_use - in_use_init < txmax) &&
1658	    !TXQ_RING_EMPTY(qs) &&
1659	    (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1660	    pi->link_config.link_ok) {
1661		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1662
1663		if ((m_head = cxgb_dequeue(qs)) == NULL)
1664			break;
1665		/*
1666		 *  Encapsulation can modify our pointer, and or make it
1667		 *  NULL on failure.  In that event, we can't requeue.
1668		 */
1669		if (t3_encap(qs, &m_head) || m_head == NULL)
1670			break;
1671
1672		/* Send a copy of the frame to the BPF listener */
1673		ETHER_BPF_MTAP(ifp, m_head);
1674
1675		/*
1676		 * We sent via PIO, no longer need a copy
1677		 */
1678		if (m_head->m_nextpkt == NULL &&
1679		    m_head->m_pkthdr.len <= PIO_LEN)
1680			m_freem(m_head);
1681
1682		m_head = NULL;
1683	}
1684	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1685	    pi->link_config.link_ok)
1686		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1687		    qs, txq->txq_timer.c_cpu);
1688	if (m_head != NULL)
1689		m_freem(m_head);
1690}
1691
1692static int
1693cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1694{
1695	struct port_info *pi = qs->port;
1696	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1697	struct buf_ring *br = txq->txq_mr;
1698	int error, avail;
1699
1700	avail = txq->size - txq->in_use;
1701	TXQ_LOCK_ASSERT(qs);
1702
1703	/*
1704	 * We can only do a direct transmit if the following are true:
1705	 * - we aren't coalescing (ring < 3/4 full)
1706	 * - the link is up -- checked in caller
1707	 * - there are no packets enqueued already
1708	 * - there is space in hardware transmit queue
1709	 */
1710	if (check_pkt_coalesce(qs) == 0 &&
1711	    TXQ_RING_EMPTY(qs) && avail > 4) {
1712		if (t3_encap(qs, &m)) {
1713			if (m != NULL &&
1714			    (error = drbr_enqueue(ifp, br, m)) != 0)
1715				return (error);
1716		} else {
1717			/*
1718			 * We've bypassed the buf ring so we need to update
1719			 * the stats directly
1720			 */
1721			txq->txq_direct_packets++;
1722			txq->txq_direct_bytes += m->m_pkthdr.len;
1723			/*
1724			** Send a copy of the frame to the BPF
1725			** listener and set the watchdog on.
1726			*/
1727			ETHER_BPF_MTAP(ifp, m);
1728			/*
1729			 * We sent via PIO, no longer need a copy
1730			 */
1731			if (m->m_pkthdr.len <= PIO_LEN)
1732				m_freem(m);
1733
1734		}
1735	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1736		return (error);
1737
1738	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1739	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1740	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1741		cxgb_start_locked(qs);
1742	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1743		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1744		    qs, txq->txq_timer.c_cpu);
1745	return (0);
1746}
1747
1748int
1749cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1750{
1751	struct sge_qset *qs;
1752	struct port_info *pi = ifp->if_softc;
1753	int error, qidx = pi->first_qset;
1754
1755	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1756	    ||(!pi->link_config.link_ok)) {
1757		m_freem(m);
1758		return (0);
1759	}
1760
1761	if (m->m_flags & M_FLOWID)
1762		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1763
1764	qs = &pi->adapter->sge.qs[qidx];
1765
1766	if (TXQ_TRYLOCK(qs)) {
1767		/* XXX running */
1768		error = cxgb_transmit_locked(ifp, qs, m);
1769		TXQ_UNLOCK(qs);
1770	} else
1771		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1772	return (error);
1773}
1774void
1775cxgb_start(struct ifnet *ifp)
1776{
1777	struct port_info *pi = ifp->if_softc;
1778	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1779
1780	if (!pi->link_config.link_ok)
1781		return;
1782
1783	TXQ_LOCK(qs);
1784	cxgb_start_locked(qs);
1785	TXQ_UNLOCK(qs);
1786}
1787
1788void
1789cxgb_qflush(struct ifnet *ifp)
1790{
1791	/*
1792	 * flush any enqueued mbufs in the buf_rings
1793	 * and in the transmit queues
1794	 * no-op for now
1795	 */
1796	return;
1797}
1798
1799/**
1800 *	write_imm - write a packet into a Tx descriptor as immediate data
1801 *	@d: the Tx descriptor to write
1802 *	@m: the packet
1803 *	@len: the length of packet data to write as immediate data
1804 *	@gen: the generation bit value to write
1805 *
1806 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1807 *	contains a work request at its beginning.  We must write the packet
1808 *	carefully so the SGE doesn't read accidentally before it's written in
1809 *	its entirety.
1810 */
1811static __inline void
1812write_imm(struct tx_desc *d, struct mbuf *m,
1813	  unsigned int len, unsigned int gen)
1814{
1815	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1816	struct work_request_hdr *to = (struct work_request_hdr *)d;
1817	uint32_t wr_hi, wr_lo;
1818
1819	if (len > WR_LEN)
1820		panic("len too big %d\n", len);
1821	if (len < sizeof(*from))
1822		panic("len too small %d", len);
1823
1824	memcpy(&to[1], &from[1], len - sizeof(*from));
1825	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1826					V_WR_BCNTLFLT(len & 7));
1827	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1828					V_WR_LEN((len + 7) / 8));
1829	set_wr_hdr(to, wr_hi, wr_lo);
1830	wmb();
1831	wr_gen2(d, gen);
1832
1833	/*
1834	 * This check is a hack we should really fix the logic so
1835	 * that this can't happen
1836	 */
1837	if (m->m_type != MT_DONTFREE)
1838		m_freem(m);
1839
1840}
1841
1842/**
1843 *	check_desc_avail - check descriptor availability on a send queue
1844 *	@adap: the adapter
1845 *	@q: the TX queue
1846 *	@m: the packet needing the descriptors
1847 *	@ndesc: the number of Tx descriptors needed
1848 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1849 *
1850 *	Checks if the requested number of Tx descriptors is available on an
1851 *	SGE send queue.  If the queue is already suspended or not enough
1852 *	descriptors are available the packet is queued for later transmission.
1853 *	Must be called with the Tx queue locked.
1854 *
1855 *	Returns 0 if enough descriptors are available, 1 if there aren't
1856 *	enough descriptors and the packet has been queued, and 2 if the caller
1857 *	needs to retry because there weren't enough descriptors at the
1858 *	beginning of the call but some freed up in the mean time.
1859 */
1860static __inline int
1861check_desc_avail(adapter_t *adap, struct sge_txq *q,
1862		 struct mbuf *m, unsigned int ndesc,
1863		 unsigned int qid)
1864{
1865	/*
1866	 * XXX We currently only use this for checking the control queue
1867	 * the control queue is only used for binding qsets which happens
1868	 * at init time so we are guaranteed enough descriptors
1869	 */
1870	if (__predict_false(!mbufq_empty(&q->sendq))) {
1871addq_exit:	mbufq_tail(&q->sendq, m);
1872		return 1;
1873	}
1874	if (__predict_false(q->size - q->in_use < ndesc)) {
1875
1876		struct sge_qset *qs = txq_to_qset(q, qid);
1877
1878		setbit(&qs->txq_stopped, qid);
1879		if (should_restart_tx(q) &&
1880		    test_and_clear_bit(qid, &qs->txq_stopped))
1881			return 2;
1882
1883		q->stops++;
1884		goto addq_exit;
1885	}
1886	return 0;
1887}
1888
1889
1890/**
1891 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1892 *	@q: the SGE control Tx queue
1893 *
1894 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1895 *	that send only immediate data (presently just the control queues) and
1896 *	thus do not have any mbufs
1897 */
1898static __inline void
1899reclaim_completed_tx_imm(struct sge_txq *q)
1900{
1901	unsigned int reclaim = q->processed - q->cleaned;
1902
1903	q->in_use -= reclaim;
1904	q->cleaned += reclaim;
1905}
1906
1907static __inline int
1908immediate(const struct mbuf *m)
1909{
1910	return m->m_len <= WR_LEN  && m->m_pkthdr.len <= WR_LEN ;
1911}
1912
1913/**
1914 *	ctrl_xmit - send a packet through an SGE control Tx queue
1915 *	@adap: the adapter
1916 *	@q: the control queue
1917 *	@m: the packet
1918 *
1919 *	Send a packet through an SGE control Tx queue.  Packets sent through
1920 *	a control queue must fit entirely as immediate data in a single Tx
1921 *	descriptor and have no page fragments.
1922 */
1923static int
1924ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1925{
1926	int ret;
1927	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1928	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1929
1930	if (__predict_false(!immediate(m))) {
1931		m_freem(m);
1932		return 0;
1933	}
1934
1935	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1936	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1937
1938	TXQ_LOCK(qs);
1939again:	reclaim_completed_tx_imm(q);
1940
1941	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1942	if (__predict_false(ret)) {
1943		if (ret == 1) {
1944			TXQ_UNLOCK(qs);
1945			log(LOG_ERR, "no desc available\n");
1946			return (ENOSPC);
1947		}
1948		goto again;
1949	}
1950	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1951
1952	q->in_use++;
1953	if (++q->pidx >= q->size) {
1954		q->pidx = 0;
1955		q->gen ^= 1;
1956	}
1957	TXQ_UNLOCK(qs);
1958	t3_write_reg(adap, A_SG_KDOORBELL,
1959		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1960	return (0);
1961}
1962
1963
1964/**
1965 *	restart_ctrlq - restart a suspended control queue
1966 *	@qs: the queue set cotaining the control queue
1967 *
1968 *	Resumes transmission on a suspended Tx control queue.
1969 */
1970static void
1971restart_ctrlq(void *data, int npending)
1972{
1973	struct mbuf *m;
1974	struct sge_qset *qs = (struct sge_qset *)data;
1975	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1976	adapter_t *adap = qs->port->adapter;
1977
1978	log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
1979
1980	TXQ_LOCK(qs);
1981again:	reclaim_completed_tx_imm(q);
1982
1983	while (q->in_use < q->size &&
1984	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1985
1986		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1987
1988		if (++q->pidx >= q->size) {
1989			q->pidx = 0;
1990			q->gen ^= 1;
1991		}
1992		q->in_use++;
1993	}
1994	if (!mbufq_empty(&q->sendq)) {
1995		setbit(&qs->txq_stopped, TXQ_CTRL);
1996
1997		if (should_restart_tx(q) &&
1998		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1999			goto again;
2000		q->stops++;
2001	}
2002	TXQ_UNLOCK(qs);
2003	t3_write_reg(adap, A_SG_KDOORBELL,
2004		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2005}
2006
2007
2008/*
2009 * Send a management message through control queue 0
2010 */
2011int
2012t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2013{
2014	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2015}
2016
2017/**
2018 *	free_qset - free the resources of an SGE queue set
2019 *	@sc: the controller owning the queue set
2020 *	@q: the queue set
2021 *
2022 *	Release the HW and SW resources associated with an SGE queue set, such
2023 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2024 *	queue set must be quiesced prior to calling this.
2025 */
2026static void
2027t3_free_qset(adapter_t *sc, struct sge_qset *q)
2028{
2029	int i;
2030
2031	reclaim_completed_tx(q, 0, TXQ_ETH);
2032	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2033		if (q->txq[i].txq_mr != NULL)
2034			buf_ring_free(q->txq[i].txq_mr, M_DEVBUF);
2035		if (q->txq[i].txq_ifq != NULL) {
2036			ifq_delete(q->txq[i].txq_ifq);
2037			free(q->txq[i].txq_ifq, M_DEVBUF);
2038		}
2039	}
2040
2041	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2042		if (q->fl[i].desc) {
2043			mtx_lock_spin(&sc->sge.reg_lock);
2044			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2045			mtx_unlock_spin(&sc->sge.reg_lock);
2046			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2047			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2048					q->fl[i].desc_map);
2049			bus_dma_tag_destroy(q->fl[i].desc_tag);
2050			bus_dma_tag_destroy(q->fl[i].entry_tag);
2051		}
2052		if (q->fl[i].sdesc) {
2053			free_rx_bufs(sc, &q->fl[i]);
2054			free(q->fl[i].sdesc, M_DEVBUF);
2055		}
2056	}
2057
2058	mtx_unlock(&q->lock);
2059	MTX_DESTROY(&q->lock);
2060	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2061		if (q->txq[i].desc) {
2062			mtx_lock_spin(&sc->sge.reg_lock);
2063			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2064			mtx_unlock_spin(&sc->sge.reg_lock);
2065			bus_dmamap_unload(q->txq[i].desc_tag,
2066					q->txq[i].desc_map);
2067			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2068					q->txq[i].desc_map);
2069			bus_dma_tag_destroy(q->txq[i].desc_tag);
2070			bus_dma_tag_destroy(q->txq[i].entry_tag);
2071		}
2072		if (q->txq[i].sdesc) {
2073			free(q->txq[i].sdesc, M_DEVBUF);
2074		}
2075	}
2076
2077	if (q->rspq.desc) {
2078		mtx_lock_spin(&sc->sge.reg_lock);
2079		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2080		mtx_unlock_spin(&sc->sge.reg_lock);
2081
2082		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2083		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2084			        q->rspq.desc_map);
2085		bus_dma_tag_destroy(q->rspq.desc_tag);
2086		MTX_DESTROY(&q->rspq.lock);
2087	}
2088
2089#ifdef LRO_SUPPORTED
2090	tcp_lro_free(&q->lro.ctrl);
2091#endif
2092
2093	bzero(q, sizeof(*q));
2094}
2095
2096/**
2097 *	t3_free_sge_resources - free SGE resources
2098 *	@sc: the adapter softc
2099 *
2100 *	Frees resources used by the SGE queue sets.
2101 */
2102void
2103t3_free_sge_resources(adapter_t *sc)
2104{
2105	int i, nqsets;
2106
2107	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2108		nqsets += sc->port[i].nqsets;
2109
2110	for (i = 0; i < nqsets; ++i) {
2111		TXQ_LOCK(&sc->sge.qs[i]);
2112		t3_free_qset(sc, &sc->sge.qs[i]);
2113	}
2114
2115}
2116
2117/**
2118 *	t3_sge_start - enable SGE
2119 *	@sc: the controller softc
2120 *
2121 *	Enables the SGE for DMAs.  This is the last step in starting packet
2122 *	transfers.
2123 */
2124void
2125t3_sge_start(adapter_t *sc)
2126{
2127	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2128}
2129
2130/**
2131 *	t3_sge_stop - disable SGE operation
2132 *	@sc: the adapter
2133 *
2134 *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
2135 *	from error interrupts) or from normal process context.  In the latter
2136 *	case it also disables any pending queue restart tasklets.  Note that
2137 *	if it is called in interrupt context it cannot disable the restart
2138 *	tasklets as it cannot wait, however the tasklets will have no effect
2139 *	since the doorbells are disabled and the driver will call this again
2140 *	later from process context, at which time the tasklets will be stopped
2141 *	if they are still running.
2142 */
2143void
2144t3_sge_stop(adapter_t *sc)
2145{
2146	int i, nqsets;
2147
2148	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2149
2150	if (sc->tq == NULL)
2151		return;
2152
2153	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2154		nqsets += sc->port[i].nqsets;
2155#ifdef notyet
2156	/*
2157	 *
2158	 * XXX
2159	 */
2160	for (i = 0; i < nqsets; ++i) {
2161		struct sge_qset *qs = &sc->sge.qs[i];
2162
2163		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2164		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2165	}
2166#endif
2167}
2168
2169/**
2170 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2171 *	@adapter: the adapter
2172 *	@q: the Tx queue to reclaim descriptors from
2173 *	@reclaimable: the number of descriptors to reclaim
2174 *      @m_vec_size: maximum number of buffers to reclaim
2175 *      @desc_reclaimed: returns the number of descriptors reclaimed
2176 *
2177 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2178 *	Tx buffers.  Called with the Tx queue lock held.
2179 *
2180 *      Returns number of buffers of reclaimed
2181 */
2182void
2183t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2184{
2185	struct tx_sw_desc *txsd;
2186	unsigned int cidx, mask;
2187	struct sge_txq *q = &qs->txq[queue];
2188
2189#ifdef T3_TRACE
2190	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2191		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2192#endif
2193	cidx = q->cidx;
2194	mask = q->size - 1;
2195	txsd = &q->sdesc[cidx];
2196
2197	mtx_assert(&qs->lock, MA_OWNED);
2198	while (reclaimable--) {
2199		prefetch(q->sdesc[(cidx + 1) & mask].m);
2200		prefetch(q->sdesc[(cidx + 2) & mask].m);
2201
2202		if (txsd->m != NULL) {
2203			if (txsd->flags & TX_SW_DESC_MAPPED) {
2204				bus_dmamap_unload(q->entry_tag, txsd->map);
2205				txsd->flags &= ~TX_SW_DESC_MAPPED;
2206			}
2207			m_freem_list(txsd->m);
2208			txsd->m = NULL;
2209		} else
2210			q->txq_skipped++;
2211
2212		++txsd;
2213		if (++cidx == q->size) {
2214			cidx = 0;
2215			txsd = q->sdesc;
2216		}
2217	}
2218	q->cidx = cidx;
2219
2220}
2221
2222/**
2223 *	is_new_response - check if a response is newly written
2224 *	@r: the response descriptor
2225 *	@q: the response queue
2226 *
2227 *	Returns true if a response descriptor contains a yet unprocessed
2228 *	response.
2229 */
2230static __inline int
2231is_new_response(const struct rsp_desc *r,
2232    const struct sge_rspq *q)
2233{
2234	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2235}
2236
2237#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2238#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2239			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2240			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2241			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2242
2243/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2244#define NOMEM_INTR_DELAY 2500
2245
2246/**
2247 *	write_ofld_wr - write an offload work request
2248 *	@adap: the adapter
2249 *	@m: the packet to send
2250 *	@q: the Tx queue
2251 *	@pidx: index of the first Tx descriptor to write
2252 *	@gen: the generation value to use
2253 *	@ndesc: number of descriptors the packet will occupy
2254 *
2255 *	Write an offload work request to send the supplied packet.  The packet
2256 *	data already carry the work request with most fields populated.
2257 */
2258static void
2259write_ofld_wr(adapter_t *adap, struct mbuf *m,
2260    struct sge_txq *q, unsigned int pidx,
2261    unsigned int gen, unsigned int ndesc,
2262    bus_dma_segment_t *segs, unsigned int nsegs)
2263{
2264	unsigned int sgl_flits, flits;
2265	struct work_request_hdr *from;
2266	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2267	struct tx_desc *d = &q->desc[pidx];
2268	struct txq_state txqs;
2269
2270	if (immediate(m) && nsegs == 0) {
2271		write_imm(d, m, m->m_len, gen);
2272		return;
2273	}
2274
2275	/* Only TX_DATA builds SGLs */
2276	from = mtod(m, struct work_request_hdr *);
2277	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2278
2279	flits = m->m_len / 8;
2280	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2281
2282	make_sgl(sgp, segs, nsegs);
2283	sgl_flits = sgl_len(nsegs);
2284
2285	txqs.gen = gen;
2286	txqs.pidx = pidx;
2287	txqs.compl = 0;
2288
2289	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2290	    from->wrh_hi, from->wrh_lo);
2291}
2292
2293/**
2294 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2295 *	@m: the packet
2296 *
2297 * 	Returns the number of Tx descriptors needed for the given offload
2298 * 	packet.  These packets are already fully constructed.
2299 */
2300static __inline unsigned int
2301calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2302{
2303	unsigned int flits, cnt = 0;
2304	int ndescs;
2305
2306	if (m->m_len <= WR_LEN && nsegs == 0)
2307		return (1);                 /* packet fits as immediate data */
2308
2309	/*
2310	 * This needs to be re-visited for TOE
2311	 */
2312
2313	cnt = nsegs;
2314
2315	/* headers */
2316	flits = m->m_len / 8;
2317
2318	ndescs = flits_to_desc(flits + sgl_len(cnt));
2319
2320	return (ndescs);
2321}
2322
2323/**
2324 *	ofld_xmit - send a packet through an offload queue
2325 *	@adap: the adapter
2326 *	@q: the Tx offload queue
2327 *	@m: the packet
2328 *
2329 *	Send an offload packet through an SGE offload queue.
2330 */
2331static int
2332ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2333{
2334	int ret, nsegs;
2335	unsigned int ndesc;
2336	unsigned int pidx, gen;
2337	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2338	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2339	struct tx_sw_desc *stx;
2340
2341	nsegs = m_get_sgllen(m);
2342	vsegs = m_get_sgl(m);
2343	ndesc = calc_tx_descs_ofld(m, nsegs);
2344	busdma_map_sgl(vsegs, segs, nsegs);
2345
2346	stx = &q->sdesc[q->pidx];
2347
2348	TXQ_LOCK(qs);
2349again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2350	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2351	if (__predict_false(ret)) {
2352		if (ret == 1) {
2353			printf("no ofld desc avail\n");
2354
2355			m_set_priority(m, ndesc);     /* save for restart */
2356			TXQ_UNLOCK(qs);
2357			return (EINTR);
2358		}
2359		goto again;
2360	}
2361
2362	gen = q->gen;
2363	q->in_use += ndesc;
2364	pidx = q->pidx;
2365	q->pidx += ndesc;
2366	if (q->pidx >= q->size) {
2367		q->pidx -= q->size;
2368		q->gen ^= 1;
2369	}
2370#ifdef T3_TRACE
2371	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2372		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2373		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2374		  skb_shinfo(skb)->nr_frags);
2375#endif
2376	TXQ_UNLOCK(qs);
2377
2378	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2379	check_ring_tx_db(adap, q);
2380	return (0);
2381}
2382
2383/**
2384 *	restart_offloadq - restart a suspended offload queue
2385 *	@qs: the queue set cotaining the offload queue
2386 *
2387 *	Resumes transmission on a suspended Tx offload queue.
2388 */
2389static void
2390restart_offloadq(void *data, int npending)
2391{
2392	struct mbuf *m;
2393	struct sge_qset *qs = data;
2394	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2395	adapter_t *adap = qs->port->adapter;
2396	bus_dma_segment_t segs[TX_MAX_SEGS];
2397	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2398	int nsegs, cleaned;
2399
2400	TXQ_LOCK(qs);
2401again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2402
2403	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2404		unsigned int gen, pidx;
2405		unsigned int ndesc = m_get_priority(m);
2406
2407		if (__predict_false(q->size - q->in_use < ndesc)) {
2408			setbit(&qs->txq_stopped, TXQ_OFLD);
2409			if (should_restart_tx(q) &&
2410			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2411				goto again;
2412			q->stops++;
2413			break;
2414		}
2415
2416		gen = q->gen;
2417		q->in_use += ndesc;
2418		pidx = q->pidx;
2419		q->pidx += ndesc;
2420		if (q->pidx >= q->size) {
2421			q->pidx -= q->size;
2422			q->gen ^= 1;
2423		}
2424
2425		(void)mbufq_dequeue(&q->sendq);
2426		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2427		TXQ_UNLOCK(qs);
2428		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2429		TXQ_LOCK(qs);
2430	}
2431#if USE_GTS
2432	set_bit(TXQ_RUNNING, &q->flags);
2433	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2434#endif
2435	TXQ_UNLOCK(qs);
2436	wmb();
2437	t3_write_reg(adap, A_SG_KDOORBELL,
2438		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2439}
2440
2441/**
2442 *	queue_set - return the queue set a packet should use
2443 *	@m: the packet
2444 *
2445 *	Maps a packet to the SGE queue set it should use.  The desired queue
2446 *	set is carried in bits 1-3 in the packet's priority.
2447 */
2448static __inline int
2449queue_set(const struct mbuf *m)
2450{
2451	return m_get_priority(m) >> 1;
2452}
2453
2454/**
2455 *	is_ctrl_pkt - return whether an offload packet is a control packet
2456 *	@m: the packet
2457 *
2458 *	Determines whether an offload packet should use an OFLD or a CTRL
2459 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2460 */
2461static __inline int
2462is_ctrl_pkt(const struct mbuf *m)
2463{
2464	return m_get_priority(m) & 1;
2465}
2466
2467/**
2468 *	t3_offload_tx - send an offload packet
2469 *	@tdev: the offload device to send to
2470 *	@m: the packet
2471 *
2472 *	Sends an offload packet.  We use the packet priority to select the
2473 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2474 *	should be sent as regular or control, bits 1-3 select the queue set.
2475 */
2476int
2477t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2478{
2479	adapter_t *adap = tdev2adap(tdev);
2480	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2481
2482	if (__predict_false(is_ctrl_pkt(m)))
2483		return ctrl_xmit(adap, qs, m);
2484
2485	return ofld_xmit(adap, qs, m);
2486}
2487
2488/**
2489 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2490 *	@tdev: the offload device that will be receiving the packets
2491 *	@q: the SGE response queue that assembled the bundle
2492 *	@m: the partial bundle
2493 *	@n: the number of packets in the bundle
2494 *
2495 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2496 */
2497static __inline void
2498deliver_partial_bundle(struct t3cdev *tdev,
2499			struct sge_rspq *q,
2500			struct mbuf *mbufs[], int n)
2501{
2502	if (n) {
2503		q->offload_bundles++;
2504		cxgb_ofld_recv(tdev, mbufs, n);
2505	}
2506}
2507
2508static __inline int
2509rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2510    struct mbuf *m, struct mbuf *rx_gather[],
2511    unsigned int gather_idx)
2512{
2513
2514	rq->offload_pkts++;
2515	m->m_pkthdr.header = mtod(m, void *);
2516	rx_gather[gather_idx++] = m;
2517	if (gather_idx == RX_BUNDLE_SIZE) {
2518		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2519		gather_idx = 0;
2520		rq->offload_bundles++;
2521	}
2522	return (gather_idx);
2523}
2524
2525static void
2526restart_tx(struct sge_qset *qs)
2527{
2528	struct adapter *sc = qs->port->adapter;
2529
2530
2531	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2532	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2533	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2534		qs->txq[TXQ_OFLD].restarts++;
2535		DPRINTF("restarting TXQ_OFLD\n");
2536		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2537	}
2538	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2539	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2540	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2541	    qs->txq[TXQ_CTRL].in_use);
2542
2543	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2544	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2545	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2546		qs->txq[TXQ_CTRL].restarts++;
2547		DPRINTF("restarting TXQ_CTRL\n");
2548		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2549	}
2550}
2551
2552/**
2553 *	t3_sge_alloc_qset - initialize an SGE queue set
2554 *	@sc: the controller softc
2555 *	@id: the queue set id
2556 *	@nports: how many Ethernet ports will be using this queue set
2557 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2558 *	@p: configuration parameters for this queue set
2559 *	@ntxq: number of Tx queues for the queue set
2560 *	@pi: port info for queue set
2561 *
2562 *	Allocate resources and initialize an SGE queue set.  A queue set
2563 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2564 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2565 *	queue, offload queue, and control queue.
2566 */
2567int
2568t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2569		  const struct qset_params *p, int ntxq, struct port_info *pi)
2570{
2571	struct sge_qset *q = &sc->sge.qs[id];
2572	int i, ret = 0;
2573
2574	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2575	q->port = pi;
2576
2577	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2578
2579		if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2580			    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2581			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2582			goto err;
2583		}
2584		if ((q->txq[i].txq_ifq =
2585			malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT|M_ZERO))
2586		    == NULL) {
2587			device_printf(sc->dev, "failed to allocate ifq\n");
2588			goto err;
2589		}
2590		ifq_init(q->txq[i].txq_ifq, pi->ifp);
2591		callout_init(&q->txq[i].txq_timer, 1);
2592		callout_init(&q->txq[i].txq_watchdog, 1);
2593		q->txq[i].txq_timer.c_cpu = id % mp_ncpus;
2594		q->txq[i].txq_watchdog.c_cpu = id % mp_ncpus;
2595	}
2596	init_qset_cntxt(q, id);
2597	q->idx = id;
2598	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2599		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2600		    &q->fl[0].desc, &q->fl[0].sdesc,
2601		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2602		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2603		printf("error %d from alloc ring fl0\n", ret);
2604		goto err;
2605	}
2606
2607	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2608		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2609		    &q->fl[1].desc, &q->fl[1].sdesc,
2610		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2611		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2612		printf("error %d from alloc ring fl1\n", ret);
2613		goto err;
2614	}
2615
2616	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2617		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2618		    &q->rspq.desc_tag, &q->rspq.desc_map,
2619		    NULL, NULL)) != 0) {
2620		printf("error %d from alloc ring rspq\n", ret);
2621		goto err;
2622	}
2623
2624	for (i = 0; i < ntxq; ++i) {
2625		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2626
2627		if ((ret = alloc_ring(sc, p->txq_size[i],
2628			    sizeof(struct tx_desc), sz,
2629			    &q->txq[i].phys_addr, &q->txq[i].desc,
2630			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2631			    &q->txq[i].desc_map,
2632			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2633			printf("error %d from alloc ring tx %i\n", ret, i);
2634			goto err;
2635		}
2636		mbufq_init(&q->txq[i].sendq);
2637		q->txq[i].gen = 1;
2638		q->txq[i].size = p->txq_size[i];
2639	}
2640
2641	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2642	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2643	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2644	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2645
2646	q->fl[0].gen = q->fl[1].gen = 1;
2647	q->fl[0].size = p->fl_size;
2648	q->fl[1].size = p->jumbo_size;
2649
2650	q->rspq.gen = 1;
2651	q->rspq.cidx = 0;
2652	q->rspq.size = p->rspq_size;
2653
2654	q->txq[TXQ_ETH].stop_thres = nports *
2655	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2656
2657	q->fl[0].buf_size = MCLBYTES;
2658	q->fl[0].zone = zone_pack;
2659	q->fl[0].type = EXT_PACKET;
2660#if __FreeBSD_version > 800000
2661	if (cxgb_use_16k_clusters) {
2662		q->fl[1].buf_size = MJUM16BYTES;
2663		q->fl[1].zone = zone_jumbo16;
2664		q->fl[1].type = EXT_JUMBO16;
2665	} else {
2666		q->fl[1].buf_size = MJUM9BYTES;
2667		q->fl[1].zone = zone_jumbo9;
2668		q->fl[1].type = EXT_JUMBO9;
2669	}
2670#else
2671	q->fl[1].buf_size = MJUMPAGESIZE;
2672	q->fl[1].zone = zone_jumbop;
2673	q->fl[1].type = EXT_JUMBOP;
2674#endif
2675
2676#ifdef LRO_SUPPORTED
2677	/* Allocate and setup the lro_ctrl structure */
2678	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2679	ret = tcp_lro_init(&q->lro.ctrl);
2680	if (ret) {
2681		printf("error %d from tcp_lro_init\n", ret);
2682		goto err;
2683	}
2684	q->lro.ctrl.ifp = pi->ifp;
2685#endif
2686
2687	mtx_lock_spin(&sc->sge.reg_lock);
2688	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2689				   q->rspq.phys_addr, q->rspq.size,
2690				   q->fl[0].buf_size, 1, 0);
2691	if (ret) {
2692		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2693		goto err_unlock;
2694	}
2695
2696	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2697		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2698					  q->fl[i].phys_addr, q->fl[i].size,
2699					  q->fl[i].buf_size, p->cong_thres, 1,
2700					  0);
2701		if (ret) {
2702			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2703			goto err_unlock;
2704		}
2705	}
2706
2707	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2708				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2709				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2710				 1, 0);
2711	if (ret) {
2712		printf("error %d from t3_sge_init_ecntxt\n", ret);
2713		goto err_unlock;
2714	}
2715
2716	if (ntxq > 1) {
2717		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2718					 USE_GTS, SGE_CNTXT_OFLD, id,
2719					 q->txq[TXQ_OFLD].phys_addr,
2720					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2721		if (ret) {
2722			printf("error %d from t3_sge_init_ecntxt\n", ret);
2723			goto err_unlock;
2724		}
2725	}
2726
2727	if (ntxq > 2) {
2728		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2729					 SGE_CNTXT_CTRL, id,
2730					 q->txq[TXQ_CTRL].phys_addr,
2731					 q->txq[TXQ_CTRL].size,
2732					 q->txq[TXQ_CTRL].token, 1, 0);
2733		if (ret) {
2734			printf("error %d from t3_sge_init_ecntxt\n", ret);
2735			goto err_unlock;
2736		}
2737	}
2738
2739	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2740	    device_get_unit(sc->dev), irq_vec_idx);
2741	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2742
2743	mtx_unlock_spin(&sc->sge.reg_lock);
2744	t3_update_qset_coalesce(q, p);
2745	q->port = pi;
2746
2747	refill_fl(sc, &q->fl[0], q->fl[0].size);
2748	refill_fl(sc, &q->fl[1], q->fl[1].size);
2749	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2750
2751	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2752		     V_NEWTIMER(q->rspq.holdoff_tmr));
2753
2754	return (0);
2755
2756err_unlock:
2757	mtx_unlock_spin(&sc->sge.reg_lock);
2758err:
2759	TXQ_LOCK(q);
2760	t3_free_qset(sc, q);
2761
2762	return (ret);
2763}
2764
2765/*
2766 * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2767 * ethernet data.  Hardware assistance with various checksums and any vlan tag
2768 * will also be taken into account here.
2769 */
2770void
2771t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2772{
2773	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2774	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2775	struct ifnet *ifp = pi->ifp;
2776
2777	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2778
2779	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2780	    cpl->csum_valid && cpl->csum == 0xffff) {
2781		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2782		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2783		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2784		m->m_pkthdr.csum_data = 0xffff;
2785	}
2786	/*
2787	 * XXX need to add VLAN support for 6.x
2788	 */
2789#ifdef VLAN_SUPPORTED
2790	if (__predict_false(cpl->vlan_valid)) {
2791		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2792		m->m_flags |= M_VLANTAG;
2793	}
2794#endif
2795
2796	m->m_pkthdr.rcvif = ifp;
2797	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2798	/*
2799	 * adjust after conversion to mbuf chain
2800	 */
2801	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2802	m->m_len -= (sizeof(*cpl) + ethpad);
2803	m->m_data += (sizeof(*cpl) + ethpad);
2804}
2805
2806/**
2807 *	get_packet - return the next ingress packet buffer from a free list
2808 *	@adap: the adapter that received the packet
2809 *	@drop_thres: # of remaining buffers before we start dropping packets
2810 *	@qs: the qset that the SGE free list holding the packet belongs to
2811 *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2812 *      @r: response descriptor
2813 *
2814 *	Get the next packet from a free list and complete setup of the
2815 *	sk_buff.  If the packet is small we make a copy and recycle the
2816 *	original buffer, otherwise we use the original buffer itself.  If a
2817 *	positive drop threshold is supplied packets are dropped and their
2818 *	buffers recycled if (a) the number of remaining buffers is under the
2819 *	threshold and the packet is too big to copy, or (b) the packet should
2820 *	be copied but there is no memory for the copy.
2821 */
2822static int
2823get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2824    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2825{
2826
2827	unsigned int len_cq =  ntohl(r->len_cq);
2828	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2829	int mask, cidx = fl->cidx;
2830	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2831	uint32_t len = G_RSPD_LEN(len_cq);
2832	uint32_t flags = M_EXT;
2833	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2834	caddr_t cl;
2835	struct mbuf *m;
2836	int ret = 0;
2837
2838	mask = fl->size - 1;
2839	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2840	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2841	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2842	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2843
2844	fl->credits--;
2845	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2846
2847	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2848	    sopeop == RSPQ_SOP_EOP) {
2849		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2850			goto skip_recycle;
2851		cl = mtod(m, void *);
2852		memcpy(cl, sd->rxsd_cl, len);
2853		recycle_rx_buf(adap, fl, fl->cidx);
2854		m->m_pkthdr.len = m->m_len = len;
2855		m->m_flags = 0;
2856		mh->mh_head = mh->mh_tail = m;
2857		ret = 1;
2858		goto done;
2859	} else {
2860	skip_recycle:
2861		bus_dmamap_unload(fl->entry_tag, sd->map);
2862		cl = sd->rxsd_cl;
2863		m = sd->m;
2864
2865		if ((sopeop == RSPQ_SOP_EOP) ||
2866		    (sopeop == RSPQ_SOP))
2867			flags |= M_PKTHDR;
2868		if (fl->zone == zone_pack) {
2869			m_init(m, zone_pack, MCLBYTES, M_NOWAIT, MT_DATA, flags);
2870			/*
2871			 * restore clobbered data pointer
2872			 */
2873			m->m_data = m->m_ext.ext_buf;
2874		} else {
2875			m_cljset(m, cl, fl->type);
2876			m->m_flags = flags;
2877		}
2878		m->m_len = len;
2879	}
2880	switch(sopeop) {
2881	case RSPQ_SOP_EOP:
2882		ret = 1;
2883		/* FALLTHROUGH */
2884	case RSPQ_SOP:
2885		mh->mh_head = mh->mh_tail = m;
2886		m->m_pkthdr.len = len;
2887		break;
2888	case RSPQ_EOP:
2889		ret = 1;
2890		/* FALLTHROUGH */
2891	case RSPQ_NSOP_NEOP:
2892		if (mh->mh_tail == NULL) {
2893			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2894			m_freem(m);
2895			break;
2896		}
2897		mh->mh_tail->m_next = m;
2898		mh->mh_tail = m;
2899		mh->mh_head->m_pkthdr.len += len;
2900		break;
2901	}
2902	if (cxgb_debug)
2903		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2904done:
2905	if (++fl->cidx == fl->size)
2906		fl->cidx = 0;
2907
2908	return (ret);
2909}
2910
2911/**
2912 *	handle_rsp_cntrl_info - handles control information in a response
2913 *	@qs: the queue set corresponding to the response
2914 *	@flags: the response control flags
2915 *
2916 *	Handles the control information of an SGE response, such as GTS
2917 *	indications and completion credits for the queue set's Tx queues.
2918 *	HW coalesces credits, we don't do any extra SW coalescing.
2919 */
2920static __inline void
2921handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2922{
2923	unsigned int credits;
2924
2925#if USE_GTS
2926	if (flags & F_RSPD_TXQ0_GTS)
2927		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2928#endif
2929	credits = G_RSPD_TXQ0_CR(flags);
2930	if (credits)
2931		qs->txq[TXQ_ETH].processed += credits;
2932
2933	credits = G_RSPD_TXQ2_CR(flags);
2934	if (credits)
2935		qs->txq[TXQ_CTRL].processed += credits;
2936
2937# if USE_GTS
2938	if (flags & F_RSPD_TXQ1_GTS)
2939		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2940# endif
2941	credits = G_RSPD_TXQ1_CR(flags);
2942	if (credits)
2943		qs->txq[TXQ_OFLD].processed += credits;
2944
2945}
2946
2947static void
2948check_ring_db(adapter_t *adap, struct sge_qset *qs,
2949    unsigned int sleeping)
2950{
2951	;
2952}
2953
2954/**
2955 *	process_responses - process responses from an SGE response queue
2956 *	@adap: the adapter
2957 *	@qs: the queue set to which the response queue belongs
2958 *	@budget: how many responses can be processed in this round
2959 *
2960 *	Process responses from an SGE response queue up to the supplied budget.
2961 *	Responses include received packets as well as credits and other events
2962 *	for the queues that belong to the response queue's queue set.
2963 *	A negative budget is effectively unlimited.
2964 *
2965 *	Additionally choose the interrupt holdoff time for the next interrupt
2966 *	on this queue.  If the system is under memory shortage use a fairly
2967 *	long delay to help recovery.
2968 */
2969static int
2970process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2971{
2972	struct sge_rspq *rspq = &qs->rspq;
2973	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2974	int budget_left = budget;
2975	unsigned int sleeping = 0;
2976#ifdef LRO_SUPPORTED
2977	int lro_enabled = qs->lro.enabled;
2978	int skip_lro;
2979	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2980#endif
2981	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2982	int ngathered = 0;
2983#ifdef DEBUG
2984	static int last_holdoff = 0;
2985	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2986		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2987		last_holdoff = rspq->holdoff_tmr;
2988	}
2989#endif
2990	rspq->next_holdoff = rspq->holdoff_tmr;
2991
2992	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2993		int eth, eop = 0, ethpad = 0;
2994		uint32_t flags = ntohl(r->flags);
2995		uint32_t rss_csum = *(const uint32_t *)r;
2996		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2997
2998		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2999
3000		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
3001			struct mbuf *m;
3002
3003			if (cxgb_debug)
3004				printf("async notification\n");
3005
3006			if (rspq->rspq_mh.mh_head == NULL) {
3007				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3008				m = rspq->rspq_mh.mh_head;
3009			} else {
3010				m = m_gethdr(M_DONTWAIT, MT_DATA);
3011			}
3012			if (m == NULL)
3013				goto no_mem;
3014
3015                        memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3016			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3017                        *mtod(m, char *) = CPL_ASYNC_NOTIF;
3018			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3019			eop = 1;
3020                        rspq->async_notif++;
3021			goto skip;
3022		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3023			struct mbuf *m = NULL;
3024
3025			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3026			    r->rss_hdr.opcode, rspq->cidx);
3027			if (rspq->rspq_mh.mh_head == NULL)
3028				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3029                        else
3030				m = m_gethdr(M_DONTWAIT, MT_DATA);
3031
3032			if (rspq->rspq_mh.mh_head == NULL &&  m == NULL) {
3033		no_mem:
3034				rspq->next_holdoff = NOMEM_INTR_DELAY;
3035				budget_left--;
3036				break;
3037			}
3038			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
3039			eop = 1;
3040			rspq->imm_data++;
3041		} else if (r->len_cq) {
3042			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3043
3044			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
3045			if (eop) {
3046				rspq->rspq_mh.mh_head->m_flags |= M_FLOWID;
3047				rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash;
3048			}
3049
3050			ethpad = 2;
3051		} else {
3052			rspq->pure_rsps++;
3053		}
3054	skip:
3055		if (flags & RSPD_CTRL_MASK) {
3056			sleeping |= flags & RSPD_GTS_MASK;
3057			handle_rsp_cntrl_info(qs, flags);
3058		}
3059
3060		r++;
3061		if (__predict_false(++rspq->cidx == rspq->size)) {
3062			rspq->cidx = 0;
3063			rspq->gen ^= 1;
3064			r = rspq->desc;
3065		}
3066
3067		if (++rspq->credits >= (rspq->size / 4)) {
3068			refill_rspq(adap, rspq, rspq->credits);
3069			rspq->credits = 0;
3070		}
3071		if (!eth && eop) {
3072			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
3073			/*
3074			 * XXX size mismatch
3075			 */
3076			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
3077
3078
3079			ngathered = rx_offload(&adap->tdev, rspq,
3080			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
3081			rspq->rspq_mh.mh_head = NULL;
3082			DPRINTF("received offload packet\n");
3083
3084		} else if (eth && eop) {
3085			struct mbuf *m = rspq->rspq_mh.mh_head;
3086
3087			t3_rx_eth(adap, rspq, m, ethpad);
3088
3089#ifdef LRO_SUPPORTED
3090			/*
3091			 * The T304 sends incoming packets on any qset.  If LRO
3092			 * is also enabled, we could end up sending packet up
3093			 * lro_ctrl->ifp's input.  That is incorrect.
3094			 *
3095			 * The mbuf's rcvif was derived from the cpl header and
3096			 * is accurate.  Skip LRO and just use that.
3097			 */
3098			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3099
3100			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro &&
3101			    (tcp_lro_rx(lro_ctrl, m, 0) == 0)) {
3102				/* successfully queue'd for LRO */
3103			} else
3104#endif
3105			{
3106				/*
3107				 * LRO not enabled, packet unsuitable for LRO,
3108				 * or unable to queue.  Pass it up right now in
3109				 * either case.
3110				 */
3111				struct ifnet *ifp = m->m_pkthdr.rcvif;
3112				(*ifp->if_input)(ifp, m);
3113			}
3114			rspq->rspq_mh.mh_head = NULL;
3115
3116		}
3117		__refill_fl_lt(adap, &qs->fl[0], 32);
3118		__refill_fl_lt(adap, &qs->fl[1], 32);
3119		--budget_left;
3120	}
3121
3122	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3123
3124#ifdef LRO_SUPPORTED
3125	/* Flush LRO */
3126	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3127		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3128		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3129		tcp_lro_flush(lro_ctrl, queued);
3130	}
3131#endif
3132
3133	if (sleeping)
3134		check_ring_db(adap, qs, sleeping);
3135
3136	mb();  /* commit Tx queue processed updates */
3137	if (__predict_false(qs->txq_stopped > 1)) {
3138		printf("restarting tx on %p\n", qs);
3139
3140		restart_tx(qs);
3141	}
3142
3143	__refill_fl_lt(adap, &qs->fl[0], 512);
3144	__refill_fl_lt(adap, &qs->fl[1], 512);
3145	budget -= budget_left;
3146	return (budget);
3147}
3148
3149/*
3150 * A helper function that processes responses and issues GTS.
3151 */
3152static __inline int
3153process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3154{
3155	int work;
3156	static int last_holdoff = 0;
3157
3158	work = process_responses(adap, rspq_to_qset(rq), -1);
3159
3160	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3161		printf("next_holdoff=%d\n", rq->next_holdoff);
3162		last_holdoff = rq->next_holdoff;
3163	}
3164	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3165	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3166
3167	return (work);
3168}
3169
3170
3171/*
3172 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3173 * Handles data events from SGE response queues as well as error and other
3174 * async events as they all use the same interrupt pin.  We use one SGE
3175 * response queue per port in this mode and protect all response queues with
3176 * queue 0's lock.
3177 */
3178void
3179t3b_intr(void *data)
3180{
3181	uint32_t i, map;
3182	adapter_t *adap = data;
3183	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3184
3185	t3_write_reg(adap, A_PL_CLI, 0);
3186	map = t3_read_reg(adap, A_SG_DATA_INTR);
3187
3188	if (!map)
3189		return;
3190
3191	if (__predict_false(map & F_ERRINTR))
3192		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3193
3194	mtx_lock(&q0->lock);
3195	for_each_port(adap, i)
3196	    if (map & (1 << i))
3197			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3198	mtx_unlock(&q0->lock);
3199}
3200
3201/*
3202 * The MSI interrupt handler.  This needs to handle data events from SGE
3203 * response queues as well as error and other async events as they all use
3204 * the same MSI vector.  We use one SGE response queue per port in this mode
3205 * and protect all response queues with queue 0's lock.
3206 */
3207void
3208t3_intr_msi(void *data)
3209{
3210	adapter_t *adap = data;
3211	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3212	int i, new_packets = 0;
3213
3214	mtx_lock(&q0->lock);
3215
3216	for_each_port(adap, i)
3217	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3218		    new_packets = 1;
3219	mtx_unlock(&q0->lock);
3220	if (new_packets == 0)
3221		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3222}
3223
3224void
3225t3_intr_msix(void *data)
3226{
3227	struct sge_qset *qs = data;
3228	adapter_t *adap = qs->port->adapter;
3229	struct sge_rspq *rspq = &qs->rspq;
3230
3231	if (process_responses_gts(adap, rspq) == 0)
3232		rspq->unhandled_irqs++;
3233}
3234
3235#define QDUMP_SBUF_SIZE		32 * 400
3236static int
3237t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3238{
3239	struct sge_rspq *rspq;
3240	struct sge_qset *qs;
3241	int i, err, dump_end, idx;
3242	static int multiplier = 1;
3243	struct sbuf *sb;
3244	struct rsp_desc *rspd;
3245	uint32_t data[4];
3246
3247	rspq = arg1;
3248	qs = rspq_to_qset(rspq);
3249	if (rspq->rspq_dump_count == 0)
3250		return (0);
3251	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3252		log(LOG_WARNING,
3253		    "dump count is too large %d\n", rspq->rspq_dump_count);
3254		rspq->rspq_dump_count = 0;
3255		return (EINVAL);
3256	}
3257	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3258		log(LOG_WARNING,
3259		    "dump start of %d is greater than queue size\n",
3260		    rspq->rspq_dump_start);
3261		rspq->rspq_dump_start = 0;
3262		return (EINVAL);
3263	}
3264	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3265	if (err)
3266		return (err);
3267retry_sbufops:
3268	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3269
3270	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3271	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3272	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3273	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3274	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3275
3276	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3277	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3278
3279	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3280	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3281		idx = i & (RSPQ_Q_SIZE-1);
3282
3283		rspd = &rspq->desc[idx];
3284		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3285		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3286		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3287		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3288		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3289		    be32toh(rspd->len_cq), rspd->intr_gen);
3290	}
3291	if (sbuf_overflowed(sb)) {
3292		sbuf_delete(sb);
3293		multiplier++;
3294		goto retry_sbufops;
3295	}
3296	sbuf_finish(sb);
3297	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3298	sbuf_delete(sb);
3299	return (err);
3300}
3301
3302static int
3303t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3304{
3305	struct sge_txq *txq;
3306	struct sge_qset *qs;
3307	int i, j, err, dump_end;
3308	static int multiplier = 1;
3309	struct sbuf *sb;
3310	struct tx_desc *txd;
3311	uint32_t *WR, wr_hi, wr_lo, gen;
3312	uint32_t data[4];
3313
3314	txq = arg1;
3315	qs = txq_to_qset(txq, TXQ_ETH);
3316	if (txq->txq_dump_count == 0) {
3317		return (0);
3318	}
3319	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3320		log(LOG_WARNING,
3321		    "dump count is too large %d\n", txq->txq_dump_count);
3322		txq->txq_dump_count = 1;
3323		return (EINVAL);
3324	}
3325	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3326		log(LOG_WARNING,
3327		    "dump start of %d is greater than queue size\n",
3328		    txq->txq_dump_start);
3329		txq->txq_dump_start = 0;
3330		return (EINVAL);
3331	}
3332	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3333	if (err)
3334		return (err);
3335
3336
3337retry_sbufops:
3338	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3339
3340	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3341	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3342	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3343	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
3344	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3345	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3346	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3347	    txq->txq_dump_start,
3348	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3349
3350	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3351	for (i = txq->txq_dump_start; i < dump_end; i++) {
3352		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3353		WR = (uint32_t *)txd->flit;
3354		wr_hi = ntohl(WR[0]);
3355		wr_lo = ntohl(WR[1]);
3356		gen = G_WR_GEN(wr_lo);
3357
3358		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3359		    wr_hi, wr_lo, gen);
3360		for (j = 2; j < 30; j += 4)
3361			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3362			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3363
3364	}
3365	if (sbuf_overflowed(sb)) {
3366		sbuf_delete(sb);
3367		multiplier++;
3368		goto retry_sbufops;
3369	}
3370	sbuf_finish(sb);
3371	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3372	sbuf_delete(sb);
3373	return (err);
3374}
3375
3376static int
3377t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3378{
3379	struct sge_txq *txq;
3380	struct sge_qset *qs;
3381	int i, j, err, dump_end;
3382	static int multiplier = 1;
3383	struct sbuf *sb;
3384	struct tx_desc *txd;
3385	uint32_t *WR, wr_hi, wr_lo, gen;
3386
3387	txq = arg1;
3388	qs = txq_to_qset(txq, TXQ_CTRL);
3389	if (txq->txq_dump_count == 0) {
3390		return (0);
3391	}
3392	if (txq->txq_dump_count > 256) {
3393		log(LOG_WARNING,
3394		    "dump count is too large %d\n", txq->txq_dump_count);
3395		txq->txq_dump_count = 1;
3396		return (EINVAL);
3397	}
3398	if (txq->txq_dump_start > 255) {
3399		log(LOG_WARNING,
3400		    "dump start of %d is greater than queue size\n",
3401		    txq->txq_dump_start);
3402		txq->txq_dump_start = 0;
3403		return (EINVAL);
3404	}
3405
3406retry_sbufops:
3407	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3408	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3409	    txq->txq_dump_start,
3410	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3411
3412	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3413	for (i = txq->txq_dump_start; i < dump_end; i++) {
3414		txd = &txq->desc[i & (255)];
3415		WR = (uint32_t *)txd->flit;
3416		wr_hi = ntohl(WR[0]);
3417		wr_lo = ntohl(WR[1]);
3418		gen = G_WR_GEN(wr_lo);
3419
3420		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3421		    wr_hi, wr_lo, gen);
3422		for (j = 2; j < 30; j += 4)
3423			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3424			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3425
3426	}
3427	if (sbuf_overflowed(sb)) {
3428		sbuf_delete(sb);
3429		multiplier++;
3430		goto retry_sbufops;
3431	}
3432	sbuf_finish(sb);
3433	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3434	sbuf_delete(sb);
3435	return (err);
3436}
3437
3438static int
3439t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3440{
3441	adapter_t *sc = arg1;
3442	struct qset_params *qsp = &sc->params.sge.qset[0];
3443	int coalesce_usecs;
3444	struct sge_qset *qs;
3445	int i, j, err, nqsets = 0;
3446	struct mtx *lock;
3447
3448	if ((sc->flags & FULL_INIT_DONE) == 0)
3449		return (ENXIO);
3450
3451	coalesce_usecs = qsp->coalesce_usecs;
3452        err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3453
3454	if (err != 0) {
3455		return (err);
3456	}
3457	if (coalesce_usecs == qsp->coalesce_usecs)
3458		return (0);
3459
3460	for (i = 0; i < sc->params.nports; i++)
3461		for (j = 0; j < sc->port[i].nqsets; j++)
3462			nqsets++;
3463
3464	coalesce_usecs = max(1, coalesce_usecs);
3465
3466	for (i = 0; i < nqsets; i++) {
3467		qs = &sc->sge.qs[i];
3468		qsp = &sc->params.sge.qset[i];
3469		qsp->coalesce_usecs = coalesce_usecs;
3470
3471		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3472			    &sc->sge.qs[0].rspq.lock;
3473
3474		mtx_lock(lock);
3475		t3_update_qset_coalesce(qs, qsp);
3476		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3477		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3478		mtx_unlock(lock);
3479	}
3480
3481	return (0);
3482}
3483
3484
3485void
3486t3_add_attach_sysctls(adapter_t *sc)
3487{
3488	struct sysctl_ctx_list *ctx;
3489	struct sysctl_oid_list *children;
3490
3491	ctx = device_get_sysctl_ctx(sc->dev);
3492	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3493
3494	/* random information */
3495	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3496	    "firmware_version",
3497	    CTLFLAG_RD, &sc->fw_version,
3498	    0, "firmware version");
3499	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3500	    "hw_revision",
3501	    CTLFLAG_RD, &sc->params.rev,
3502	    0, "chip model");
3503	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3504	    "port_types",
3505	    CTLFLAG_RD, &sc->port_types,
3506	    0, "type of ports");
3507	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3508	    "enable_debug",
3509	    CTLFLAG_RW, &cxgb_debug,
3510	    0, "enable verbose debugging output");
3511	SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3512	    CTLFLAG_RD, &sc->tunq_coalesce,
3513	    "#tunneled packets freed");
3514	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3515	    "txq_overrun",
3516	    CTLFLAG_RD, &txq_fills,
3517	    0, "#times txq overrun");
3518}
3519
3520
3521static const char *rspq_name = "rspq";
3522static const char *txq_names[] =
3523{
3524	"txq_eth",
3525	"txq_ofld",
3526	"txq_ctrl"
3527};
3528
3529static int
3530sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3531{
3532	struct port_info *p = arg1;
3533	uint64_t *parg;
3534
3535	if (!p)
3536		return (EINVAL);
3537
3538	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3539	PORT_LOCK(p);
3540	t3_mac_update_stats(&p->mac);
3541	PORT_UNLOCK(p);
3542
3543	return (sysctl_handle_quad(oidp, parg, 0, req));
3544}
3545
3546void
3547t3_add_configured_sysctls(adapter_t *sc)
3548{
3549	struct sysctl_ctx_list *ctx;
3550	struct sysctl_oid_list *children;
3551	int i, j;
3552
3553	ctx = device_get_sysctl_ctx(sc->dev);
3554	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3555
3556	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3557	    "intr_coal",
3558	    CTLTYPE_INT|CTLFLAG_RW, sc,
3559	    0, t3_set_coalesce_usecs,
3560	    "I", "interrupt coalescing timer (us)");
3561
3562	for (i = 0; i < sc->params.nports; i++) {
3563		struct port_info *pi = &sc->port[i];
3564		struct sysctl_oid *poid;
3565		struct sysctl_oid_list *poidlist;
3566		struct mac_stats *mstats = &pi->mac.stats;
3567
3568		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3569		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3570		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3571		poidlist = SYSCTL_CHILDREN(poid);
3572		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3573		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3574		    0, "#queue sets");
3575
3576		for (j = 0; j < pi->nqsets; j++) {
3577			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3578			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3579					  *ctrlqpoid, *lropoid;
3580			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3581					       *txqpoidlist, *ctrlqpoidlist,
3582					       *lropoidlist;
3583			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3584
3585			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3586
3587			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3588			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3589			qspoidlist = SYSCTL_CHILDREN(qspoid);
3590
3591			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3592					CTLFLAG_RD, &qs->fl[0].empty, 0,
3593					"freelist #0 empty");
3594			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3595					CTLFLAG_RD, &qs->fl[1].empty, 0,
3596					"freelist #1 empty");
3597
3598			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3599			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3600			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3601
3602			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3603			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3604			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3605
3606			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3607			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3608			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3609
3610			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3611			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3612			lropoidlist = SYSCTL_CHILDREN(lropoid);
3613
3614			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3615			    CTLFLAG_RD, &qs->rspq.size,
3616			    0, "#entries in response queue");
3617			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3618			    CTLFLAG_RD, &qs->rspq.cidx,
3619			    0, "consumer index");
3620			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3621			    CTLFLAG_RD, &qs->rspq.credits,
3622			    0, "#credits");
3623			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3624			    CTLFLAG_RD, &qs->rspq.phys_addr,
3625			    "physical_address_of the queue");
3626			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3627			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3628			    0, "start rspq dump entry");
3629			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3630			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3631			    0, "#rspq entries to dump");
3632			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3633			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3634			    0, t3_dump_rspq, "A", "dump of the response queue");
3635
3636
3637			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3638			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3639			    0, "#tunneled packets dropped");
3640			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3641			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3642			    0, "#tunneled packets waiting to be sent");
3643#if 0
3644			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3645			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3646			    0, "#tunneled packets queue producer index");
3647			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3648			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3649			    0, "#tunneled packets queue consumer index");
3650#endif
3651			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3652			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3653			    0, "#tunneled packets processed by the card");
3654			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3655			    CTLFLAG_RD, &txq->cleaned,
3656			    0, "#tunneled packets cleaned");
3657			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3658			    CTLFLAG_RD, &txq->in_use,
3659			    0, "#tunneled packet slots in use");
3660			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3661			    CTLFLAG_RD, &txq->txq_frees,
3662			    "#tunneled packets freed");
3663			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3664			    CTLFLAG_RD, &txq->txq_skipped,
3665			    0, "#tunneled packet descriptors skipped");
3666			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3667			    CTLFLAG_RD, &txq->txq_coalesced,
3668			    "#tunneled packets coalesced");
3669			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3670			    CTLFLAG_RD, &txq->txq_enqueued,
3671			    0, "#tunneled packets enqueued to hardware");
3672			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3673			    CTLFLAG_RD, &qs->txq_stopped,
3674			    0, "tx queues stopped");
3675			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3676			    CTLFLAG_RD, &txq->phys_addr,
3677			    "physical_address_of the queue");
3678			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3679			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3680			    0, "txq generation");
3681			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3682			    CTLFLAG_RD, &txq->cidx,
3683			    0, "hardware queue cidx");
3684			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3685			    CTLFLAG_RD, &txq->pidx,
3686			    0, "hardware queue pidx");
3687			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3688			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3689			    0, "txq start idx for dump");
3690			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3691			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3692			    0, "txq #entries to dump");
3693			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3694			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3695			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3696
3697			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3698			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3699			    0, "ctrlq start idx for dump");
3700			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3701			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3702			    0, "ctrl #entries to dump");
3703			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3704			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3705			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3706
3707#ifdef LRO_SUPPORTED
3708			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3709			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3710			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3711			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3712			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3713			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3714			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3715			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3716#endif
3717		}
3718
3719		/* Now add a node for mac stats. */
3720		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3721		    CTLFLAG_RD, NULL, "MAC statistics");
3722		poidlist = SYSCTL_CHILDREN(poid);
3723
3724		/*
3725		 * We (ab)use the length argument (arg2) to pass on the offset
3726		 * of the data that we are interested in.  This is only required
3727		 * for the quad counters that are updated from the hardware (we
3728		 * make sure that we return the latest value).
3729		 * sysctl_handle_macstat first updates *all* the counters from
3730		 * the hardware, and then returns the latest value of the
3731		 * requested counter.  Best would be to update only the
3732		 * requested counter from hardware, but t3_mac_update_stats()
3733		 * hides all the register details and we don't want to dive into
3734		 * all that here.
3735		 */
3736#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3737    (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3738    sysctl_handle_macstat, "QU", 0)
3739		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3740		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3741		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3742		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3743		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3744		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3745		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3746		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3747		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3748		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3749		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3750		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3751		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3752		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3753		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3754		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3755		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3756		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3757		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3758		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3759		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3760		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3761		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3762		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3763		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3764		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3765		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3766		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3767		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3768		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3769		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3770		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3771		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3772		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3773		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3774		CXGB_SYSCTL_ADD_QUAD(rx_short);
3775		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3776		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3777		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3778		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3779		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3780		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3781		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3782		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3783		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3784		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3785#undef CXGB_SYSCTL_ADD_QUAD
3786
3787#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3788    CTLFLAG_RD, &mstats->a, 0)
3789		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3790		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3791		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3792		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3793		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3794		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3795		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3796		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3797		CXGB_SYSCTL_ADD_ULONG(num_resets);
3798		CXGB_SYSCTL_ADD_ULONG(link_faults);
3799#undef CXGB_SYSCTL_ADD_ULONG
3800	}
3801}
3802
3803/**
3804 *	t3_get_desc - dump an SGE descriptor for debugging purposes
3805 *	@qs: the queue set
3806 *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3807 *	@idx: the descriptor index in the queue
3808 *	@data: where to dump the descriptor contents
3809 *
3810 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3811 *	size of the descriptor.
3812 */
3813int
3814t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3815		unsigned char *data)
3816{
3817	if (qnum >= 6)
3818		return (EINVAL);
3819
3820	if (qnum < 3) {
3821		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3822			return -EINVAL;
3823		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3824		return sizeof(struct tx_desc);
3825	}
3826
3827	if (qnum == 3) {
3828		if (!qs->rspq.desc || idx >= qs->rspq.size)
3829			return (EINVAL);
3830		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3831		return sizeof(struct rsp_desc);
3832	}
3833
3834	qnum -= 4;
3835	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3836		return (EINVAL);
3837	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3838	return sizeof(struct rx_desc);
3839}
3840