1/**************************************************************************
2
3Copyright (c) 2007-2009, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/cxgb_sge.c 195512 2009-07-09 19:27:58Z np $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/kernel.h>
36#include <sys/module.h>
37#include <sys/bus.h>
38#include <sys/conf.h>
39#include <machine/bus.h>
40#include <machine/resource.h>
41#include <sys/bus_dma.h>
42#include <sys/rman.h>
43#include <sys/queue.h>
44#include <sys/sysctl.h>
45#include <sys/taskqueue.h>
46
47#include <sys/proc.h>
48#include <sys/sbuf.h>
49#include <sys/sched.h>
50#include <sys/smp.h>
51#include <sys/systm.h>
52#include <sys/syslog.h>
53
54#include <net/bpf.h>
55
56#include <netinet/in_systm.h>
57#include <netinet/in.h>
58#include <netinet/ip.h>
59#include <netinet/tcp.h>
60
61#include <dev/pci/pcireg.h>
62#include <dev/pci/pcivar.h>
63
64#include <vm/vm.h>
65#include <vm/pmap.h>
66
67#include <cxgb_include.h>
68#include <sys/mvec.h>
69
70int	txq_fills = 0;
71int	multiq_tx_enable = 1;
72
73extern struct sysctl_oid_list sysctl__hw_cxgb_children;
74int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
75TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
76SYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
77    "size of per-queue mbuf ring");
78
79static int cxgb_tx_coalesce_force = 0;
80TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
81SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
82    &cxgb_tx_coalesce_force, 0,
83    "coalesce small packets into a single work request regardless of ring state");
84
85 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
86 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
87 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
88 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
89 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
90 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
91 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
92
93
94static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
95TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
96    &cxgb_tx_coalesce_enable_start);
97SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
98    &cxgb_tx_coalesce_enable_start, 0,
99    "coalesce enable threshold");
100static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
101TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
102SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
103    &cxgb_tx_coalesce_enable_stop, 0,
104    "coalesce disable threshold");
105static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
106TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
107SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
108    &cxgb_tx_reclaim_threshold, 0,
109    "tx cleaning minimum threshold");
110
111/*
112 * XXX don't re-enable this until TOE stops assuming
113 * we have an m_ext
114 */
115static int recycle_enable = 0;
116int cxgb_ext_freed = 0;
117int cxgb_ext_inited = 0;
118int fl_q_size = 0;
119int jumbo_q_size = 0;
120
121extern int cxgb_use_16k_clusters;
122extern int nmbjumbo4;
123extern int nmbjumbo9;
124extern int nmbjumbo16;
125
126#define USE_GTS 0
127
128#define SGE_RX_SM_BUF_SIZE	1536
129#define SGE_RX_DROP_THRES	16
130#define SGE_RX_COPY_THRES	128
131
132/*
133 * Period of the Tx buffer reclaim timer.  This timer does not need to run
134 * frequently as Tx buffers are usually reclaimed by new Tx packets.
135 */
136#define TX_RECLAIM_PERIOD       (hz >> 1)
137
138/*
139 * Values for sge_txq.flags
140 */
141enum {
142	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
143	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
144};
145
146struct tx_desc {
147	uint64_t	flit[TX_DESC_FLITS];
148} __packed;
149
150struct rx_desc {
151	uint32_t	addr_lo;
152	uint32_t	len_gen;
153	uint32_t	gen2;
154	uint32_t	addr_hi;
155 } __packed;
156
157struct rsp_desc {               /* response queue descriptor */
158	struct rss_header	rss_hdr;
159	uint32_t		flags;
160	uint32_t		len_cq;
161	uint8_t			imm_data[47];
162	uint8_t			intr_gen;
163} __packed;
164
165#define RX_SW_DESC_MAP_CREATED	(1 << 0)
166#define TX_SW_DESC_MAP_CREATED	(1 << 1)
167#define RX_SW_DESC_INUSE        (1 << 3)
168#define TX_SW_DESC_MAPPED       (1 << 4)
169
170#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
171#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
172#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
173#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
174
175struct tx_sw_desc {                /* SW state per Tx descriptor */
176	struct mbuf	*m;
177	bus_dmamap_t	map;
178	int		flags;
179};
180
181struct rx_sw_desc {                /* SW state per Rx descriptor */
182	caddr_t		rxsd_cl;
183	struct mbuf	*m;
184	bus_dmamap_t	map;
185	int		flags;
186};
187
188struct txq_state {
189	unsigned int	compl;
190	unsigned int	gen;
191	unsigned int	pidx;
192};
193
194struct refill_fl_cb_arg {
195	int               error;
196	bus_dma_segment_t seg;
197	int               nseg;
198};
199
200
201/*
202 * Maps a number of flits to the number of Tx descriptors that can hold them.
203 * The formula is
204 *
205 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
206 *
207 * HW allows up to 4 descriptors to be combined into a WR.
208 */
209static uint8_t flit_desc_map[] = {
210	0,
211#if SGE_NUM_GENBITS == 1
212	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
214	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
216#elif SGE_NUM_GENBITS == 2
217	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
218	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
219	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
220	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
221#else
222# error "SGE_NUM_GENBITS must be 1 or 2"
223#endif
224};
225
226#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
227#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
228#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
229#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
230#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
231#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
232#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
233	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
234#define	TXQ_RING_DEQUEUE(qs) \
235	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236
237int cxgb_debug = 0;
238
239static void sge_timer_cb(void *arg);
240static void sge_timer_reclaim(void *arg, int ncount);
241static void sge_txq_reclaim_handler(void *arg, int ncount);
242static void cxgb_start_locked(struct sge_qset *qs);
243
244/*
245 * XXX need to cope with bursty scheduling by looking at a wider
246  * window than we do now when determining the need for coalescing
247 *
248 */
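/**
 *	check_pkt_coalesce - check whether Tx coalescing should be used
 *	@qs: the queue set to check
 *
 *	Clamps the coalescing tunables to their limits and updates the
 *	per-queue-set fill indicator, with some hysteresis, based on how full
 *	the hardware transmit queue is.  Returns non-zero when callers should
 *	coalesce small packets into a single work request.
 */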
249static __inline uint64_t
250check_pkt_coalesce(struct sge_qset *qs)
251{
252	struct adapter *sc;
253	struct sge_txq *txq;
254	uint8_t *fill;
255
256	if (__predict_false(cxgb_tx_coalesce_force))
257		return (1);
258	txq = &qs->txq[TXQ_ETH];
259	sc = qs->port->adapter;
260	fill = &sc->tunq_fill[qs->idx];
261
262	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
263		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
264	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
265		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
266	/*
267	 * Once the hardware transmit queue fills past the enable threshold
268	 * we mark the queue set as coalescing; we drop back out of coalescing
269	 * when it drains below the disable threshold and no packets are
270	 * enqueued, which gives us some degree of hysteresis.
271	 */
272	if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
273	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
274		*fill = 0;
275	else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
276		*fill = 1;
277
278	return (sc->tunq_coalesce);
279}
280
281#ifdef __LP64__
282static void
283set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
284{
285	uint64_t wr_hilo;
286#if _BYTE_ORDER == _LITTLE_ENDIAN
287	wr_hilo = wr_hi;
288	wr_hilo |= (((uint64_t)wr_lo)<<32);
289#else
290	wr_hilo = wr_lo;
291	wr_hilo |= (((uint64_t)wr_hi)<<32);
292#endif
293	wrp->wrh_hilo = wr_hilo;
294}
295#else
296static void
297set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
298{
299
300	wrp->wrh_hi = wr_hi;
301	wmb();
302	wrp->wrh_lo = wr_lo;
303}
304#endif
305
306struct coalesce_info {
307	int count;
308	int nbytes;
309};
310
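/*
 * Dequeue predicate used with TXQ_RING_DEQUEUE_COND: the first packet is
 * always accepted; further packets join the coalescing batch only while
 * the batch stays within 7 packets and 10500 bytes and the candidate is a
 * single contiguous mbuf.
 */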
311static int
312coalesce_check(struct mbuf *m, void *arg)
313{
314	struct coalesce_info *ci = arg;
315	int *count = &ci->count;
316	int *nbytes = &ci->nbytes;
317
318	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
319		(*count < 7) && (m->m_next == NULL))) {
320		*count += 1;
321		*nbytes += m->m_len;
322		return (1);
323	}
324	return (0);
325}
326
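/*
 * Dequeue the next packet(s) to send from the queue set's buf_ring.  When
 * coalescing is off a single packet is returned; otherwise up to 7 small
 * packets are pulled off the ring and linked through m_nextpkt so that
 * t3_encap can emit them as one batched work request.
 */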
327static struct mbuf *
328cxgb_dequeue(struct sge_qset *qs)
329{
330	struct mbuf *m, *m_head, *m_tail;
331	struct coalesce_info ci;
332
333
334	if (check_pkt_coalesce(qs) == 0)
335		return TXQ_RING_DEQUEUE(qs);
336
337	m_head = m_tail = NULL;
338	ci.count = ci.nbytes = 0;
339	do {
340		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
341		if (m_head == NULL) {
342			m_tail = m_head = m;
343		} else if (m != NULL) {
344			m_tail->m_nextpkt = m;
345			m_tail = m;
346		}
347	} while (m != NULL);
348	if (ci.count > 7)
349		panic("trying to coalesce %d packets in to one WR", ci.count);
350	return (m_head);
351}
352
353/**
354 *	reclaim_completed_tx - reclaims completed Tx descriptors
355 *	@qs: the queue set that owns the Tx queue
356 *	@reclaim_min: don't reclaim unless at least this many descriptors are completed
 *	@queue: the Tx queue index within the set (TXQ_ETH, TXQ_OFLD or TXQ_CTRL)
357 *
358 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
359 *	and frees the associated buffers if possible.  Called with the Tx
360 *	queue's lock held.
361 */
362static __inline int
363reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
364{
365	struct sge_txq *q = &qs->txq[queue];
366	int reclaim = desc_reclaimable(q);
367
368	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
369	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
370		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
371
372	if (reclaim < reclaim_min)
373		return (0);
374
375	mtx_assert(&qs->lock, MA_OWNED);
376	if (reclaim > 0) {
377		t3_free_tx_desc(qs, reclaim, queue);
378		q->cleaned += reclaim;
379		q->in_use -= reclaim;
380	}
381	if (isset(&qs->txq_stopped, TXQ_ETH))
382                clrbit(&qs->txq_stopped, TXQ_ETH);
383
384	return (reclaim);
385}
386
387/**
388 *	should_restart_tx - are there enough resources to restart a Tx queue?
389 *	@q: the Tx queue
390 *
391 *	Checks if there are enough descriptors to restart a suspended Tx queue.
392 */
393static __inline int
394should_restart_tx(const struct sge_txq *q)
395{
396	unsigned int r = q->processed - q->cleaned;
397
398	return q->in_use - r < (q->size >> 1);
399}
400
401/**
402 *	t3_sge_init - initialize SGE
403 *	@adap: the adapter
404 *	@p: the SGE parameters
405 *
406 *	Performs SGE initialization needed every time after a chip reset.
407 *	We do not initialize any of the queue sets here, instead the driver
408 *	top-level must request those individually.  We also do not enable DMA
409 *	here, that should be done after the queues have been set up.
410 */
411void
412t3_sge_init(adapter_t *adap, struct sge_params *p)
413{
414	u_int ctrl, ups;
415
416	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
417
418	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
419	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
420	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
421	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
422#if SGE_NUM_GENBITS == 1
423	ctrl |= F_EGRGENCTRL;
424#endif
425	if (adap->params.rev > 0) {
426		if (!(adap->flags & (USING_MSIX | USING_MSI)))
427			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
428	}
429	t3_write_reg(adap, A_SG_CONTROL, ctrl);
430	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
431		     V_LORCQDRBTHRSH(512));
432	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
433	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
434		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
435	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
436		     adap->params.rev < T3_REV_C ? 1000 : 500);
437	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
438	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
439	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
440	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
441	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
442}
443
444
445/**
446 *	sgl_len - calculates the size of an SGL of the given capacity
447 *	@n: the number of SGL entries
448 *
449 *	Calculates the number of flits needed for a scatter/gather list that
450 *	can hold the given number of entries.
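 *	Each pair of entries occupies 3 flits (two 64-bit addresses plus two
 *	32-bit lengths), so, for example, n = 3 requires 5 flits.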
451 */
452static __inline unsigned int
453sgl_len(unsigned int n)
454{
455	return ((3 * n) / 2 + (n & 1));
456}
457
458/**
459 *	get_imm_packet - return the next ingress packet buffer from a response
460 *	@resp: the response descriptor containing the packet data
461 *
462 *	Return a packet containing the immediate data of the given response.
463 */
464static int
465get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
466{
467
468	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
469	m->m_ext.ext_buf = NULL;
470	m->m_ext.ext_type = 0;
471	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
472	return (0);
473}
474
475static __inline u_int
476flits_to_desc(u_int n)
477{
478	return (flit_desc_map[n]);
479}
480
481#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
482		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
483		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
484		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
485		    F_HIRCQPARITYERROR)
486#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
487#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
488		      F_RSPQDISABLED)
489
490/**
491 *	t3_sge_err_intr_handler - SGE async event interrupt handler
492 *	@adapter: the adapter
493 *
494 *	Interrupt handler for SGE asynchronous (non-data) events.
495 */
496void
497t3_sge_err_intr_handler(adapter_t *adapter)
498{
499	unsigned int v, status;
500
501	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
502	if (status & SGE_PARERR)
503		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
504			 status & SGE_PARERR);
505	if (status & SGE_FRAMINGERR)
506		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
507			 status & SGE_FRAMINGERR);
508	if (status & F_RSPQCREDITOVERFOW)
509		CH_ALERT(adapter, "SGE response queue credit overflow\n");
510
511	if (status & F_RSPQDISABLED) {
512		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
513
514		CH_ALERT(adapter,
515			 "packet delivered to disabled response queue (0x%x)\n",
516			 (v >> S_RSPQ0DISABLED) & 0xff);
517	}
518
519	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
520	if (status & SGE_FATALERR)
521		t3_fatal_err(adapter);
522}
523
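/*
 * Size the free lists according to the number of queue sets and the mbuf
 * clusters available, and fill in default parameters (interrupt holdoff,
 * ring sizes) for every queue set.
 */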
524void
525t3_sge_prep(adapter_t *adap, struct sge_params *p)
526{
527	int i, nqsets;
528
529	nqsets = min(SGE_QSETS, mp_ncpus*4);
530
531	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
532
533	while (!powerof2(fl_q_size))
534		fl_q_size--;
535#if __FreeBSD_version >= 700111
536	if (cxgb_use_16k_clusters)
537		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
538	else
539		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
540#else
541	jumbo_q_size = min(nmbjumbo4/(3*nqsets), JUMBO_Q_SIZE);
542#endif
543	while (!powerof2(jumbo_q_size))
544		jumbo_q_size--;
545
546	/* XXX Does ETHER_ALIGN need to be accounted for here? */
547	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
548
549	for (i = 0; i < SGE_QSETS; ++i) {
550		struct qset_params *q = p->qset + i;
551
552		if (adap->params.nports > 2) {
553			q->coalesce_usecs = 50;
554		} else {
555#ifdef INVARIANTS
556			q->coalesce_usecs = 10;
557#else
558			q->coalesce_usecs = 5;
559#endif
560		}
561		q->polling = 0;
562		q->rspq_size = RSPQ_Q_SIZE;
563		q->fl_size = fl_q_size;
564		q->jumbo_size = jumbo_q_size;
565		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
566		q->txq_size[TXQ_OFLD] = 1024;
567		q->txq_size[TXQ_CTRL] = 256;
568		q->cong_thres = 0;
569	}
570}
571
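/*
 * Create the parent bus_dma tag and the child tags used for normal and
 * jumbo sized receive buffers and for transmit frames.
 */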
572int
573t3_sge_alloc(adapter_t *sc)
574{
575
576	/* The parent tag. */
577	if (bus_dma_tag_create( NULL,			/* parent */
578				1, 0,			/* algnmnt, boundary */
579				BUS_SPACE_MAXADDR,	/* lowaddr */
580				BUS_SPACE_MAXADDR,	/* highaddr */
581				NULL, NULL,		/* filter, filterarg */
582				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
583				BUS_SPACE_UNRESTRICTED, /* nsegments */
584				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
585				0,			/* flags */
586				NULL, NULL,		/* lock, lockarg */
587				&sc->parent_dmat)) {
588		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
589		return (ENOMEM);
590	}
591
592	/*
593	 * DMA tag for normal sized RX frames
594	 */
595	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
596		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
597		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
598		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
599		return (ENOMEM);
600	}
601
602	/*
603	 * DMA tag for jumbo sized RX frames.
604	 */
605	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
606		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
607		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
608		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
609		return (ENOMEM);
610	}
611
612	/*
613	 * DMA tag for TX frames.
614	 */
615	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
616		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
617		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
618		NULL, NULL, &sc->tx_dmat)) {
619		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
620		return (ENOMEM);
621	}
622
623	return (0);
624}
625
626int
627t3_sge_free(struct adapter * sc)
628{
629
630	if (sc->tx_dmat != NULL)
631		bus_dma_tag_destroy(sc->tx_dmat);
632
633	if (sc->rx_jumbo_dmat != NULL)
634		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
635
636	if (sc->rx_dmat != NULL)
637		bus_dma_tag_destroy(sc->rx_dmat);
638
639	if (sc->parent_dmat != NULL)
640		bus_dma_tag_destroy(sc->parent_dmat);
641
642	return (0);
643}
644
645void
646t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
647{
648
649	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
650	qs->rspq.polling = 0 /* p->polling */;
651}
652
653#if !defined(__i386__) && !defined(__amd64__)
654static void
655refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
656{
657	struct refill_fl_cb_arg *cb_arg = arg;
658
659	cb_arg->error = error;
660	cb_arg->seg = segs[0];
661	cb_arg->nseg = nseg;
662
663}
664#endif
665/**
666 *	refill_fl - refill an SGE free-buffer list
667 *	@sc: the controller softc
668 *	@q: the free-list to refill
669 *	@n: the number of new buffers to allocate
670 *
671 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
672 *	The caller must assure that @n does not exceed the queue's capacity.
673 */
674static void
675refill_fl(adapter_t *sc, struct sge_fl *q, int n)
676{
677	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
678	struct rx_desc *d = &q->desc[q->pidx];
679	struct refill_fl_cb_arg cb_arg;
680	struct mbuf *m;
681	caddr_t cl;
682	int err, count = 0;
683
684	cb_arg.error = 0;
685	while (n--) {
686		/*
687		 * We only allocate a cluster, mbuf allocation happens after rx
688		 */
689		if (q->zone == zone_pack) {
690			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
691				break;
692			cl = m->m_ext.ext_buf;
693		} else {
694			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
695				break;
696			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
697				uma_zfree(q->zone, cl);
698				break;
699			}
700		}
701		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
702			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
703				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
704				uma_zfree(q->zone, cl);
705				goto done;
706			}
707			sd->flags |= RX_SW_DESC_MAP_CREATED;
708		}
709#if !defined(__i386__) && !defined(__amd64__)
710		err = bus_dmamap_load(q->entry_tag, sd->map,
711		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
712
713		if (err != 0 || cb_arg.error) {
714			if (q->zone == zone_pack)
715				uma_zfree(q->zone, cl);
716			m_free(m);
717			goto done;
718		}
719#else
720		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
721#endif
722		sd->flags |= RX_SW_DESC_INUSE;
723		sd->rxsd_cl = cl;
724		sd->m = m;
725		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
726		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
727		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
728		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
729
730		d++;
731		sd++;
732
733		if (++q->pidx == q->size) {
734			q->pidx = 0;
735			q->gen ^= 1;
736			sd = q->sdesc;
737			d = q->desc;
738		}
739		q->credits++;
740		count++;
741	}
742
743done:
744	if (count)
745		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
746}
747
748
749/**
750 *	free_rx_bufs - free the Rx buffers on an SGE free list
751 *	@sc: the controller softc
752 *	@q: the SGE free list to clean up
753 *
754 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
755 *	this queue should be stopped before calling this function.
756 */
757static void
758free_rx_bufs(adapter_t *sc, struct sge_fl *q)
759{
760	u_int cidx = q->cidx;
761
762	while (q->credits--) {
763		struct rx_sw_desc *d = &q->sdesc[cidx];
764
765		if (d->flags & RX_SW_DESC_INUSE) {
766			bus_dmamap_unload(q->entry_tag, d->map);
767			bus_dmamap_destroy(q->entry_tag, d->map);
768			if (q->zone == zone_pack) {
769				m_init(d->m, zone_pack, MCLBYTES,
770				    M_NOWAIT, MT_DATA, M_EXT);
771				uma_zfree(zone_pack, d->m);
772			} else {
773				m_init(d->m, zone_mbuf, MLEN,
774				    M_NOWAIT, MT_DATA, 0);
775				uma_zfree(zone_mbuf, d->m);
776				uma_zfree(q->zone, d->rxsd_cl);
777			}
778		}
779
780		d->rxsd_cl = NULL;
781		d->m = NULL;
782		if (++cidx == q->size)
783			cidx = 0;
784	}
785}
786
787static __inline void
788__refill_fl(adapter_t *adap, struct sge_fl *fl)
789{
790	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
791}
792
793static __inline void
794__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
795{
796	if ((fl->size - fl->credits) < max)
797		refill_fl(adap, fl, min(max, fl->size - fl->credits));
798}
799
800/**
801 *	recycle_rx_buf - recycle a receive buffer
802 *	@adapter: the adapter
803 *	@q: the SGE free list
804 *	@idx: index of buffer to recycle
805 *
806 *	Recycles the specified buffer on the given free list by adding it at
807 *	the next available slot on the list.
808 */
809static void
810recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
811{
812	struct rx_desc *from = &q->desc[idx];
813	struct rx_desc *to   = &q->desc[q->pidx];
814
815	q->sdesc[q->pidx] = q->sdesc[idx];
816	to->addr_lo = from->addr_lo;        // already big endian
817	to->addr_hi = from->addr_hi;        // likewise
818	wmb();	/* necessary ? */
819	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
820	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
821	q->credits++;
822
823	if (++q->pidx == q->size) {
824		q->pidx = 0;
825		q->gen ^= 1;
826	}
827	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
828}
829
830static void
831alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
832{
833	uint32_t *addr;
834
835	addr = arg;
836	*addr = segs[0].ds_addr;
837}
838
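/*
 * Allocate the DMA-able memory for a hardware descriptor ring and,
 * optionally, the matching software descriptor array and a bus_dma tag for
 * the ring's entries.  Returns 0 on success or ENOMEM on failure.
 */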
839static int
840alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
841    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
842    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
843{
844	size_t len = nelem * elem_size;
845	void *s = NULL;
846	void *p = NULL;
847	int err;
848
849	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
850				      BUS_SPACE_MAXADDR_32BIT,
851				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
852				      len, 0, NULL, NULL, tag)) != 0) {
853		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
854		return (ENOMEM);
855	}
856
857	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
858				    map)) != 0) {
859		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
860		return (ENOMEM);
861	}
862
863	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
864	bzero(p, len);
865	*(void **)desc = p;
866
867	if (sw_size) {
868		len = nelem * sw_size;
869		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
870		*(void **)sdesc = s;
871	}
872	if (parent_entry_tag == NULL)
873		return (0);
874
875	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
876				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
877		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
878				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
879		                      NULL, NULL, entry_tag)) != 0) {
880		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
881		return (ENOMEM);
882	}
883	return (0);
884}
885
886static void
887sge_slow_intr_handler(void *arg, int ncount)
888{
889	adapter_t *sc = arg;
890
891	t3_slow_intr_handler(sc);
892}
893
894/**
895 *	sge_timer_cb - perform periodic maintenance of the SGE queue sets
896 *	@arg: the adapter
897 *
898 *	Runs periodically from a timer to perform maintenance of the SGE queue
899 *	sets.  It performs the following tasks:
900 *
901 *	a) Cleans up any completed Tx descriptors that may still be pending.
902 *	Normal descriptor cleanup happens when new packets are added to a Tx
903 *	queue so this timer is relatively infrequent and does any cleanup only
904 *	if the Tx queue has not seen any new packets in a while.  We make a
905 *	best effort attempt to reclaim descriptors, in that we don't wait
906 *	around if we cannot get a queue's lock (which most likely is because
907 *	someone else is queueing new packets and so will also handle the clean
908 *	up).  Since control queues use immediate data exclusively we don't
909 *	bother cleaning them up here.
910 *
911 *	b) Replenishes Rx queues that have run out due to memory shortage.
912 *	Normally new Rx buffers are added when existing ones are consumed but
913 *	when out of memory a queue can become empty.  We try to add only a few
914 *	buffers here, the queue will be replenished fully as these new buffers
915 *	are used up if memory shortage has subsided.
916 *
917 *	c) Return coalesced response queue credits in case a response queue is
918 *	starved.
919 *
920 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
921 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
922 */
923static void
924sge_timer_cb(void *arg)
925{
926	adapter_t *sc = arg;
927	if ((sc->flags & USING_MSIX) == 0) {
928
929		struct port_info *pi;
930		struct sge_qset *qs;
931		struct sge_txq  *txq;
932		int i, j;
933		int reclaim_ofl, refill_rx;
934
935		if (sc->open_device_map == 0)
936			return;
937
938		for (i = 0; i < sc->params.nports; i++) {
939			pi = &sc->port[i];
940			for (j = 0; j < pi->nqsets; j++) {
941				qs = &sc->sge.qs[pi->first_qset + j];
942				txq = &qs->txq[0];
943				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
944				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
945				    (qs->fl[1].credits < qs->fl[1].size));
946				if (reclaim_ofl || refill_rx) {
947					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
948					break;
949				}
950			}
951		}
952	}
953
954	if (sc->params.nports > 2) {
955		int i;
956
957		for_each_port(sc, i) {
958			struct port_info *pi = &sc->port[i];
959
960			t3_write_reg(sc, A_SG_KDOORBELL,
961				     F_SELEGRCNTX |
962				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
963		}
964	}
965	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
966	    sc->open_device_map != 0)
967		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
968}
969
970/*
971 * This is meant to be a catch-all function to keep sge state private
972 * to sge.c
973 *
974 */
975int
976t3_sge_init_adapter(adapter_t *sc)
977{
978	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
979	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
980	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
981	return (0);
982}
983
984int
985t3_sge_reset_adapter(adapter_t *sc)
986{
987	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
988	return (0);
989}
990
991int
992t3_sge_init_port(struct port_info *pi)
993{
994	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
995	return (0);
996}
997
998/**
999 *	refill_rspq - replenish an SGE response queue
1000 *	@adapter: the adapter
1001 *	@q: the response queue to replenish
1002 *	@credits: how many new responses to make available
1003 *
1004 *	Replenishes a response queue by making the supplied number of responses
1005 *	available to HW.
1006 */
1007static __inline void
1008refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1009{
1010
1011	/* mbufs are allocated on demand when a rspq entry is processed. */
1012	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1013		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1014}
1015
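/*
 * Taskqueue handler that reclaims completed descriptors from all three Tx
 * queues of a queue set.
 */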
1016static void
1017sge_txq_reclaim_handler(void *arg, int ncount)
1018{
1019	struct sge_qset *qs = arg;
1020	int i;
1021
1022	for (i = 0; i < 3; i++)
1023		reclaim_completed_tx(qs, 16, i);
1024}
1025
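/*
 * Taskqueue handler scheduled from sge_timer_cb when MSI-X is not in use:
 * reclaims completed offload Tx descriptors and tops up free lists and
 * response queues that have run low.
 */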
1026static void
1027sge_timer_reclaim(void *arg, int ncount)
1028{
1029	struct port_info *pi = arg;
1030	int i, nqsets = pi->nqsets;
1031	adapter_t *sc = pi->adapter;
1032	struct sge_qset *qs;
1033	struct mtx *lock;
1034
1035	KASSERT((sc->flags & USING_MSIX) == 0,
1036	    ("can't call timer reclaim for msi-x"));
1037
1038	for (i = 0; i < nqsets; i++) {
1039		qs = &sc->sge.qs[pi->first_qset + i];
1040
1041		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1042		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1043			    &sc->sge.qs[0].rspq.lock;
1044
1045		if (mtx_trylock(lock)) {
1046			/* XXX currently assume that we are *NOT* polling */
1047			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1048
1049			if (qs->fl[0].credits < qs->fl[0].size - 16)
1050				__refill_fl(sc, &qs->fl[0]);
1051			if (qs->fl[1].credits < qs->fl[1].size - 16)
1052				__refill_fl(sc, &qs->fl[1]);
1053
1054			if (status & (1 << qs->rspq.cntxt_id)) {
1055				if (qs->rspq.credits) {
1056					refill_rspq(sc, &qs->rspq, 1);
1057					qs->rspq.credits--;
1058					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1059					    1 << qs->rspq.cntxt_id);
1060				}
1061			}
1062			mtx_unlock(lock);
1063		}
1064	}
1065}
1066
1067/**
1068 *	init_qset_cntxt - initialize an SGE queue set context info
1069 *	@qs: the queue set
1070 *	@id: the queue set id
1071 *
1072 *	Initializes the TIDs and context ids for the queues of a queue set.
1073 */
1074static void
1075init_qset_cntxt(struct sge_qset *qs, u_int id)
1076{
1077
1078	qs->rspq.cntxt_id = id;
1079	qs->fl[0].cntxt_id = 2 * id;
1080	qs->fl[1].cntxt_id = 2 * id + 1;
1081	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1082	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1083	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1084	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1085	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1086
1087	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1088	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1089	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1090}
1091
1092
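/**
 *	txq_prod - reserve descriptors on a Tx queue
 *	@txq: the Tx queue
 *	@ndesc: the number of descriptors to reserve
 *	@txqs: returns the generation, completion flag, and producer index
 *
 *	Records the current producer state for the caller and then advances
 *	the producer index by @ndesc, requesting a completion about once
 *	every 32 descriptors and flipping the generation bit when the ring
 *	wraps.
 */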
1093static void
1094txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1095{
1096	txq->in_use += ndesc;
1097	/*
1098	 * XXX we don't handle stopping of queue
1099	 * presumably start handles this when we bump against the end
1100	 */
1101	txqs->gen = txq->gen;
1102	txq->unacked += ndesc;
1103	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1104	txq->unacked &= 31;
1105	txqs->pidx = txq->pidx;
1106	txq->pidx += ndesc;
1107#ifdef INVARIANTS
1108	if (((txqs->pidx > txq->cidx) &&
1109		(txq->pidx < txqs->pidx) &&
1110		(txq->pidx >= txq->cidx)) ||
1111	    ((txqs->pidx < txq->cidx) &&
1112		(txq->pidx >= txq->cidx)) ||
1113	    ((txqs->pidx < txq->cidx) &&
1114		(txq->cidx < txqs->pidx)))
1115		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1116		    txqs->pidx, txq->pidx, txq->cidx);
1117#endif
1118	if (txq->pidx >= txq->size) {
1119		txq->pidx -= txq->size;
1120		txq->gen ^= 1;
1121	}
1122
1123}
1124
1125/**
1126 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1127 *	@m: the packet mbufs
1128 *      @nsegs: the number of segments
1129 *
1130 * 	Returns the number of Tx descriptors needed for the given Ethernet
1131 * 	packet.  Ethernet packets require addition of WR and CPL headers.
1132 */
1133static __inline unsigned int
1134calc_tx_descs(const struct mbuf *m, int nsegs)
1135{
1136	unsigned int flits;
1137
1138	if (m->m_pkthdr.len <= PIO_LEN)
1139		return 1;
1140
1141	flits = sgl_len(nsegs) + 2;
1142#ifdef TSO_SUPPORTED
1143	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1144		flits++;
1145#endif
1146	return flits_to_desc(flits);
1147}
1148
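/*
 * DMA-map a packet for transmission, first trying the optimized collapse
 * path on i386/amd64 and falling back to bus_dmamap_load_mbuf_sg().  An
 * EFBIG result is retried once after defragmenting the chain.  Returns 0
 * with the segment array filled in on success; on failure an error is
 * returned and, except for ENOMEM, the mbuf chain is freed and *m set to
 * NULL.
 */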
1149static unsigned int
1150busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1151    struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1152{
1153	struct mbuf *m0;
1154	int err, pktlen, pass = 0;
1155	bus_dma_tag_t tag = txq->entry_tag;
1156
1157retry:
1158	err = 0;
1159	m0 = *m;
1160	pktlen = m0->m_pkthdr.len;
1161#if defined(__i386__) || defined(__amd64__)
1162	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1163		goto done;
1164	} else
1165#endif
1166		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1167
1168	if (err == 0) {
1169		goto done;
1170	}
1171	if (err == EFBIG && pass == 0) {
1172		pass = 1;
1173		/* Too many segments, try to defrag */
1174		m0 = m_defrag(m0, M_DONTWAIT);
1175		if (m0 == NULL) {
1176			m_freem(*m);
1177			*m = NULL;
1178			return (ENOBUFS);
1179		}
1180		*m = m0;
1181		goto retry;
1182	} else if (err == ENOMEM) {
1183		return (err);
1184	} else if (err) {
1185		if (cxgb_debug)
1186			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1187		m_freem(m0);
1188		*m = NULL;
1189		return (err);
1190	}
1191done:
1192#if !defined(__i386__) && !defined(__amd64__)
1193	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1194#endif
1195	txsd->flags |= TX_SW_DESC_MAPPED;
1196
1197	return (0);
1198}
1199
1200/**
1201 *	make_sgl - populate a scatter/gather list for a packet
1202 *	@sgp: the SGL to populate
1203 *	@segs: the packet dma segments
1204 *	@nsegs: the number of segments
1205 *
1206 *	Generates a scatter/gather list for the buffers that make up a packet
1207 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1208 *	appropriately.
1209 */
1210static __inline void
1211make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1212{
1213	int i, idx;
1214
1215	for (idx = 0, i = 0; i < nsegs; i++) {
1216		/*
1217		 * firmware doesn't like empty segments
1218		 */
1219		if (segs[i].ds_len == 0)
1220			continue;
1221		if (i && idx == 0)
1222			++sgp;
1223
1224		sgp->len[idx] = htobe32(segs[i].ds_len);
1225		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1226		idx ^= 1;
1227	}
1228
1229	if (idx) {
1230		sgp->len[idx] = 0;
1231		sgp->addr[idx] = 0;
1232	}
1233}
1234
1235/**
1236 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1237 *	@adap: the adapter
1238 *	@q: the Tx queue
1239 *
1240 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1241 *	where the HW may go to sleep just after we check; in that case the
1242 *	interrupt handler will detect the outstanding TX packet and ring the
1243 *	doorbell for us.
1244 *
1245 *	When GTS is disabled we unconditionally ring the doorbell.
1246 */
1247static __inline void
1248check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1249{
1250#if USE_GTS
1251	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1252	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1253		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1254#ifdef T3_TRACE
1255		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1256			  q->cntxt_id);
1257#endif
1258		t3_write_reg(adap, A_SG_KDOORBELL,
1259			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1260	}
1261#else
1262	wmb();            /* write descriptors before telling HW */
1263	t3_write_reg(adap, A_SG_KDOORBELL,
1264		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1265#endif
1266}
1267
1268static __inline void
1269wr_gen2(struct tx_desc *d, unsigned int gen)
1270{
1271#if SGE_NUM_GENBITS == 2
1272	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1273#endif
1274}
1275
1276/**
1277 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1278 *	@ndesc: number of Tx descriptors spanned by the SGL
1279 *	@txd: first Tx descriptor to be written
1280 *	@txqs: txq state (generation and producer index)
1281 *	@txq: the SGE Tx queue
1282 *	@sgl: the SGL
1283 *	@flits: number of flits to the start of the SGL in the first descriptor
1284 *	@sgl_flits: the SGL size in flits
1285 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1286 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1287 *
1288 *	Write a work request header and an associated SGL.  If the SGL is
1289 *	small enough to fit into one Tx descriptor it has already been written
1290 *	and we just need to write the WR header.  Otherwise we distribute the
1291 *	SGL across the number of descriptors it spans.
1292 */
1293static void
1294write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1295    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1296    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1297{
1298
1299	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1300	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1301
1302	if (__predict_true(ndesc == 1)) {
1303		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1304			V_WR_SGLSFLT(flits)) | wr_hi,
1305		    htonl(V_WR_LEN(flits + sgl_flits) |
1306			V_WR_GEN(txqs->gen)) | wr_lo);
1307		/* XXX gen? */
1308		wr_gen2(txd, txqs->gen);
1309
1310	} else {
1311		unsigned int ogen = txqs->gen;
1312		const uint64_t *fp = (const uint64_t *)sgl;
1313		struct work_request_hdr *wp = wrp;
1314
1315		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1316		    V_WR_SGLSFLT(flits)) | wr_hi;
1317
1318		while (sgl_flits) {
1319			unsigned int avail = WR_FLITS - flits;
1320
1321			if (avail > sgl_flits)
1322				avail = sgl_flits;
1323			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1324			sgl_flits -= avail;
1325			ndesc--;
1326			if (!sgl_flits)
1327				break;
1328
1329			fp += avail;
1330			txd++;
1331			txsd++;
1332			if (++txqs->pidx == txq->size) {
1333				txqs->pidx = 0;
1334				txqs->gen ^= 1;
1335				txd = txq->desc;
1336				txsd = txq->sdesc;
1337			}
1338
1339			/*
1340			 * when the head of the mbuf chain
1341			 * is freed all clusters will be freed
1342			 * with it
1343			 */
1344			wrp = (struct work_request_hdr *)txd;
1345			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1346			    V_WR_SGLSFLT(1)) | wr_hi;
1347			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1348				    sgl_flits + 1)) |
1349			    V_WR_GEN(txqs->gen)) | wr_lo;
1350			wr_gen2(txd, txqs->gen);
1351			flits = 1;
1352		}
1353		wrp->wrh_hi |= htonl(F_WR_EOP);
1354		wmb();
1355		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1356		wr_gen2((struct tx_desc *)wp, ogen);
1357	}
1358}
1359
1360/* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1361#define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1362
1363#ifdef VLAN_SUPPORTED
1364#define GET_VTAG(cntrl, m) \
1365do { \
1366	if ((m)->m_flags & M_VLANTAG)					            \
1367		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1368} while (0)
1369
1370#else
1371#define GET_VTAG(cntrl, m)
1372#endif
1373
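/**
 *	t3_encap - map and transmit a packet on the Ethernet Tx queue
 *	@qs: the queue set to transmit on
 *	@m: the packet, or a chain of packets linked via m_nextpkt when
 *	    coalescing
 *
 *	Small packets are copied directly into the descriptor (PIO), TSO
 *	packets get a CPL_TX_PKT_LSO header, coalesced batches are emitted as
 *	a single batched work request, and everything else is sent through a
 *	work request with a scatter/gather list built from the DMA segments.
 *	Called with the queue set lock held.
 */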
1374static int
1375t3_encap(struct sge_qset *qs, struct mbuf **m)
1376{
1377	adapter_t *sc;
1378	struct mbuf *m0;
1379	struct sge_txq *txq;
1380	struct txq_state txqs;
1381	struct port_info *pi;
1382	unsigned int ndesc, flits, cntrl, mlen;
1383	int err, nsegs, tso_info = 0;
1384
1385	struct work_request_hdr *wrp;
1386	struct tx_sw_desc *txsd;
1387	struct sg_ent *sgp, *sgl;
1388	uint32_t wr_hi, wr_lo, sgl_flits;
1389	bus_dma_segment_t segs[TX_MAX_SEGS];
1390
1391	struct tx_desc *txd;
1392
1393	pi = qs->port;
1394	sc = pi->adapter;
1395	txq = &qs->txq[TXQ_ETH];
1396	txd = &txq->desc[txq->pidx];
1397	txsd = &txq->sdesc[txq->pidx];
1398	sgl = txq->txq_sgl;
1399
1400	prefetch(txd);
1401	m0 = *m;
1402
1403	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1404	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1405
1406	mtx_assert(&qs->lock, MA_OWNED);
1407	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1408	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1409
1410#ifdef VLAN_SUPPORTED
1411	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1412	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1413		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1414#endif
1415	if (m0->m_nextpkt != NULL) {
1416		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1417		ndesc = 1;
1418		mlen = 0;
1419	} else {
1420		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1421		    &m0, segs, &nsegs))) {
1422			if (cxgb_debug)
1423				printf("failed ... err=%d\n", err);
1424			return (err);
1425		}
1426		mlen = m0->m_pkthdr.len;
1427		ndesc = calc_tx_descs(m0, nsegs);
1428	}
1429	txq_prod(txq, ndesc, &txqs);
1430
1431	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1432	txsd->m = m0;
1433
1434	if (m0->m_nextpkt != NULL) {
1435		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1436		int i, fidx;
1437
1438		if (nsegs > 7)
1439			panic("trying to coalesce %d packets in to one WR", nsegs);
1440		txq->txq_coalesced += nsegs;
1441		wrp = (struct work_request_hdr *)txd;
1442		flits = nsegs*2 + 1;
1443
1444		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1445			struct cpl_tx_pkt_batch_entry *cbe;
1446			uint64_t flit;
1447			uint32_t *hflit = (uint32_t *)&flit;
1448			int cflags = m0->m_pkthdr.csum_flags;
1449
1450			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1451			GET_VTAG(cntrl, m0);
1452			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1453			if (__predict_false(!(cflags & CSUM_IP)))
1454				cntrl |= F_TXPKT_IPCSUM_DIS;
1455			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1456				cntrl |= F_TXPKT_L4CSUM_DIS;
1457
1458			hflit[0] = htonl(cntrl);
1459			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1460			flit |= htobe64(1 << 24);
1461			cbe = &cpl_batch->pkt_entry[i];
1462			cbe->cntrl = hflit[0];
1463			cbe->len = hflit[1];
1464			cbe->addr = htobe64(segs[i].ds_addr);
1465		}
1466
1467		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1468		    V_WR_SGLSFLT(flits)) |
1469		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1470		wr_lo = htonl(V_WR_LEN(flits) |
1471		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1472		set_wr_hdr(wrp, wr_hi, wr_lo);
1473		wmb();
1474		wr_gen2(txd, txqs.gen);
1475		check_ring_tx_db(sc, txq);
1476		return (0);
1477	} else if (tso_info) {
1478		int min_size = TCPPKTHDRSIZE, eth_type, tagged;
1479		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1480		struct ip *ip;
1481		struct tcphdr *tcp;
1482		char *pkthdr;
1483
1484		txd->flit[2] = 0;
1485		GET_VTAG(cntrl, m0);
1486		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1487		hdr->cntrl = htonl(cntrl);
1488		hdr->len = htonl(mlen | 0x80000000);
1489
1490		DPRINTF("tso buf len=%d\n", mlen);
1491
1492		tagged = m0->m_flags & M_VLANTAG;
1493		if (!tagged)
1494			min_size -= ETHER_VLAN_ENCAP_LEN;
1495
1496		if (__predict_false(mlen < min_size)) {
1497			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1498			    m0, mlen, m0->m_pkthdr.tso_segsz,
1499			    m0->m_pkthdr.csum_flags, m0->m_flags);
1500			panic("tx tso packet too small");
1501		}
1502
1503		/* Make sure that ether, ip, tcp headers are all in m0 */
1504		if (__predict_false(m0->m_len < min_size)) {
1505			m0 = m_pullup(m0, min_size);
1506			if (__predict_false(m0 == NULL)) {
1507				/* XXX panic probably an overreaction */
1508				panic("couldn't fit header into mbuf");
1509			}
1510		}
1511		pkthdr = m0->m_data;
1512
1513		if (tagged) {
1514			eth_type = CPL_ETH_II_VLAN;
1515			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1516			    ETHER_VLAN_ENCAP_LEN);
1517		} else {
1518			eth_type = CPL_ETH_II;
1519			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1520		}
1521		tcp = (struct tcphdr *)((uint8_t *)ip +
1522		    sizeof(*ip));
1523
1524		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1525			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1526			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1527		hdr->lso_info = htonl(tso_info);
1528
1529		if (__predict_false(mlen <= PIO_LEN)) {
1530			/*
1531			 * The packet is not undersized but still fits in PIO_LEN,
1532			 * which indicates a TSO bug at the higher levels.
1533			 */
1534			DPRINTF("**5592 Fix** mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1535			    m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags);
1536			txsd->m = NULL;
1537			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1538			flits = (mlen + 7) / 8 + 3;
1539			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1540					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1541					  F_WR_SOP | F_WR_EOP | txqs.compl);
1542			wr_lo = htonl(V_WR_LEN(flits) |
1543			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1544			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1545			wmb();
1546			wr_gen2(txd, txqs.gen);
1547			check_ring_tx_db(sc, txq);
1548			return (0);
1549		}
1550		flits = 3;
1551	} else {
1552		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1553
1554		GET_VTAG(cntrl, m0);
1555		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1556		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1557			cntrl |= F_TXPKT_IPCSUM_DIS;
1558		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1559			cntrl |= F_TXPKT_L4CSUM_DIS;
1560		cpl->cntrl = htonl(cntrl);
1561		cpl->len = htonl(mlen | 0x80000000);
1562
1563		if (mlen <= PIO_LEN) {
1564			txsd->m = NULL;
1565			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1566			flits = (mlen + 7) / 8 + 2;
1567
1568			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1569			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1570					  F_WR_SOP | F_WR_EOP | txqs.compl);
1571			wr_lo = htonl(V_WR_LEN(flits) |
1572			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1573			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1574			wmb();
1575			wr_gen2(txd, txqs.gen);
1576			check_ring_tx_db(sc, txq);
1577			return (0);
1578		}
1579		flits = 2;
1580	}
1581	wrp = (struct work_request_hdr *)txd;
1582	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1583	make_sgl(sgp, segs, nsegs);
1584
1585	sgl_flits = sgl_len(nsegs);
1586
1587	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1588	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1589	wr_lo = htonl(V_WR_TID(txq->token));
1590	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1591	    sgl_flits, wr_hi, wr_lo);
1592	check_ring_tx_db(pi->adapter, txq);
1593
1594	return (0);
1595}
1596
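/*
 * Periodic Tx watchdog: updates the queue set's coalescing state from the
 * current ring occupancy, opportunistically flushes any packets waiting in
 * the buf_ring, and reschedules itself while the interface is running.
 */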
1597void
1598cxgb_tx_watchdog(void *arg)
1599{
1600	struct sge_qset *qs = arg;
1601	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1602
1603        if (qs->coalescing != 0 &&
1604	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1605	    TXQ_RING_EMPTY(qs))
1606                qs->coalescing = 0;
1607        else if (qs->coalescing == 0 &&
1608	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1609                qs->coalescing = 1;
1610	if (TXQ_TRYLOCK(qs)) {
1611		qs->qs_flags |= QS_FLUSHING;
1612		cxgb_start_locked(qs);
1613		qs->qs_flags &= ~QS_FLUSHING;
1614		TXQ_UNLOCK(qs);
1615	}
1616	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1617		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1618		    qs, txq->txq_watchdog.c_cpu);
1619}
1620
1621static void
1622cxgb_tx_timeout(void *arg)
1623{
1624	struct sge_qset *qs = arg;
1625	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1626
1627	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1628                qs->coalescing = 1;
1629	if (TXQ_TRYLOCK(qs)) {
1630		qs->qs_flags |= QS_TIMEOUT;
1631		cxgb_start_locked(qs);
1632		qs->qs_flags &= ~QS_TIMEOUT;
1633		TXQ_UNLOCK(qs);
1634	}
1635}
1636
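/*
 * Drain packets from the queue set's buf_ring into the hardware Tx queue,
 * reclaiming completed descriptors along the way.  Transmission stops
 * after TX_START_MAX_DESC descriptors, when the ring is empty, or when the
 * link goes down.  Called with the Tx queue lock held.
 */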
1637static void
1638cxgb_start_locked(struct sge_qset *qs)
1639{
1640	struct mbuf *m_head = NULL;
1641	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1642	int avail, txmax;
1643	int in_use_init = txq->in_use;
1644	struct port_info *pi = qs->port;
1645	struct ifnet *ifp = pi->ifp;
1646	avail = txq->size - txq->in_use - 4;
1647	txmax = min(TX_START_MAX_DESC, avail);
1648
1649	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1650		reclaim_completed_tx(qs, 0, TXQ_ETH);
1651
1652	if (!pi->link_config.link_ok) {
1653		TXQ_RING_FLUSH(qs);
1654		return;
1655	}
1656	TXQ_LOCK_ASSERT(qs);
1657	while ((txq->in_use - in_use_init < txmax) &&
1658	    !TXQ_RING_EMPTY(qs) &&
1659	    (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1660	    pi->link_config.link_ok) {
1661		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1662
1663		if ((m_head = cxgb_dequeue(qs)) == NULL)
1664			break;
1665		/*
1666		 *  Encapsulation can modify our pointer, and/or make it
1667		 *  NULL on failure.  In that event, we can't requeue.
1668		 */
1669		if (t3_encap(qs, &m_head) || m_head == NULL)
1670			break;
1671
1672		/* Send a copy of the frame to the BPF listener */
1673		ETHER_BPF_MTAP(ifp, m_head);
1674
1675		/*
1676		 * We sent via PIO, no longer need a copy
1677		 */
1678		if (m_head->m_nextpkt == NULL &&
1679		    m_head->m_pkthdr.len <= PIO_LEN)
1680			m_freem(m_head);
1681
1682		m_head = NULL;
1683	}
1684	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1685	    pi->link_config.link_ok)
1686		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1687		    qs, txq->txq_timer.c_cpu);
1688	if (m_head != NULL)
1689		m_freem(m_head);
1690}
1691
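/*
 * Transmit a single packet with the queue set lock already held.  When the
 * buf_ring is empty, coalescing is off, and the hardware queue has room,
 * the packet is handed to t3_encap directly; otherwise it is enqueued on
 * the buf_ring and either the queue is started or the timeout callout is
 * armed.
 */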
1692static int
1693cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1694{
1695	struct port_info *pi = qs->port;
1696	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1697	struct buf_ring *br = txq->txq_mr;
1698	int error, avail;
1699
1700	avail = txq->size - txq->in_use;
1701	TXQ_LOCK_ASSERT(qs);
1702
1703	/*
1704	 * We can only do a direct transmit if the following are true:
1705	 * - we aren't coalescing (ring < 3/4 full)
1706	 * - the link is up -- checked in caller
1707	 * - there are no packets enqueued already
1708	 * - there is space in the hardware transmit queue
1709	 */
1710	if (check_pkt_coalesce(qs) == 0 &&
1711	    TXQ_RING_EMPTY(qs) && avail > 4) {
1712		if (t3_encap(qs, &m)) {
1713			if (m != NULL &&
1714			    (error = drbr_enqueue(ifp, br, m)) != 0)
1715				return (error);
1716		} else {
1717			/*
1718			 * We've bypassed the buf ring so we need to update
1719			 * the stats directly
1720			 */
1721			txq->txq_direct_packets++;
1722			txq->txq_direct_bytes += m->m_pkthdr.len;
1723			/*
1724			 * Send a copy of the frame to the BPF listener.
1725			 */
1727			ETHER_BPF_MTAP(ifp, m);
1728			/*
1729			 * We sent via PIO, no longer need a copy
1730			 */
1731			if (m->m_pkthdr.len <= PIO_LEN)
1732				m_freem(m);
1733
1734		}
1735	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1736		return (error);
1737
1738	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1739	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1740	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1741		cxgb_start_locked(qs);
1742	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1743		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1744		    qs, txq->txq_timer.c_cpu);
1745	return (0);
1746}
1747
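/*
 * if_transmit handler: drop the packet if the interface or link is down,
 * select a queue set from the mbuf's flowid, and either transmit under the
 * queue lock or, if the lock is contended, enqueue the packet on that
 * queue set's buf_ring.
 */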
1748int
1749cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1750{
1751	struct sge_qset *qs;
1752	struct port_info *pi = ifp->if_softc;
1753	int error, qidx = pi->first_qset;
1754
1755	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
1756	    (!pi->link_config.link_ok)) {
1757		m_freem(m);
1758		return (0);
1759	}
1760
1761	if (m->m_flags & M_FLOWID)
1762		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1763
1764	qs = &pi->adapter->sge.qs[qidx];
1765
1766	if (TXQ_TRYLOCK(qs)) {
1767		/* XXX running */
1768		error = cxgb_transmit_locked(ifp, qs, m);
1769		TXQ_UNLOCK(qs);
1770	} else
1771		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1772	return (error);
1773 }

1774void
1775cxgb_start(struct ifnet *ifp)
1776{
1777	struct port_info *pi = ifp->if_softc;
1778	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1779
1780	if (!pi->link_config.link_ok)
1781		return;
1782
1783	TXQ_LOCK(qs);
1784	cxgb_start_locked(qs);
1785	TXQ_UNLOCK(qs);
1786}
1787
1788void
1789cxgb_qflush(struct ifnet *ifp)
1790{
1791	/*
1792	 * Flushing any mbufs enqueued on the buf_rings and in the
1793	 * transmit queues is not implemented; this is a no-op for now.
1794	 */
1796	return;
1797}
1798
1799/**
1800 *	write_imm - write a packet into a Tx descriptor as immediate data
1801 *	@d: the Tx descriptor to write
1802 *	@m: the packet
1803 *	@len: the length of packet data to write as immediate data
1804 *	@gen: the generation bit value to write
1805 *
1806 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1807 *	contains a work request at its beginning.  We must write the packet
1808 *	carefully so the SGE doesn't read accidentally before it's written in
1809 *	its entirety.
1810 */
1811static __inline void
1812write_imm(struct tx_desc *d, struct mbuf *m,
1813	  unsigned int len, unsigned int gen)
1814{
1815	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1816	struct work_request_hdr *to = (struct work_request_hdr *)d;
1817	uint32_t wr_hi, wr_lo;
1818
1819	if (len > WR_LEN)
1820		panic("len too big %d\n", len);
1821	if (len < sizeof(*from))
1822		panic("len too small %d", len);
1823
1824	memcpy(&to[1], &from[1], len - sizeof(*from));
1825	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1826					V_WR_BCNTLFLT(len & 7));
1827	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1828					V_WR_LEN((len + 7) / 8));
1829	set_wr_hdr(to, wr_hi, wr_lo);
1830	wmb();
1831	wr_gen2(d, gen);
1832
1833	/*
1834	 * This check is a hack; we should really fix the logic so
1835	 * that this can't happen.
1836	 */
1837	if (m->m_type != MT_DONTFREE)
1838		m_freem(m);
1839
1840}
1841
1842/**
1843 *	check_desc_avail - check descriptor availability on a send queue
1844 *	@adap: the adapter
1845 *	@q: the TX queue
1846 *	@m: the packet needing the descriptors
1847 *	@ndesc: the number of Tx descriptors needed
1848 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1849 *
1850 *	Checks if the requested number of Tx descriptors is available on an
1851 *	SGE send queue.  If the queue is already suspended or not enough
1852 *	descriptors are available the packet is queued for later transmission.
1853 *	Must be called with the Tx queue locked.
1854 *
1855 *	Returns 0 if enough descriptors are available, 1 if there aren't
1856 *	enough descriptors and the packet has been queued, and 2 if the caller
1857 *	needs to retry because there weren't enough descriptors at the
1858 *	beginning of the call but some freed up in the meantime.
1859 */
1860static __inline int
1861check_desc_avail(adapter_t *adap, struct sge_txq *q,
1862		 struct mbuf *m, unsigned int ndesc,
1863		 unsigned int qid)
1864{
1865	/*
1866	 * XXX We currently only use this for checking the control queue.
1867	 * The control queue is only used for binding qsets, which happens
1868	 * at init time, so we are guaranteed enough descriptors.
1869	 */
1870	if (__predict_false(!mbufq_empty(&q->sendq))) {
1871addq_exit:	mbufq_tail(&q->sendq, m);
1872		return 1;
1873	}
1874	if (__predict_false(q->size - q->in_use < ndesc)) {
1875
1876		struct sge_qset *qs = txq_to_qset(q, qid);
1877
1878		setbit(&qs->txq_stopped, qid);
1879		if (should_restart_tx(q) &&
1880		    test_and_clear_bit(qid, &qs->txq_stopped))
1881			return 2;
1882
1883		q->stops++;
1884		goto addq_exit;
1885	}
1886	return 0;
1887}
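/*
 * Illustrative sketch of how the Tx paths below consume the 0/1/2
 * return value of check_desc_avail(); see ctrl_xmit() and ofld_xmit()
 * for the real code.
 */
#if 0
	TXQ_LOCK(qs);
again:	reclaim_completed_tx_imm(q);
	ret = check_desc_avail(adap, q, m, ndesc, qid);
	if (ret == 2)		/* descriptors freed up in the meantime */
		goto again;
	if (ret == 1) {		/* m is now queued on q->sendq */
		TXQ_UNLOCK(qs);
		return (ENOSPC);
	}
	/* ret == 0: write the work request and ring the doorbell */
#endif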
1888
1889
1890/**
1891 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1892 *	@q: the SGE control Tx queue
1893 *
1894 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1895 *	that send only immediate data (presently just the control queues) and
1896 *	thus do not have any mbufs.
1897 */
1898static __inline void
1899reclaim_completed_tx_imm(struct sge_txq *q)
1900{
1901	unsigned int reclaim = q->processed - q->cleaned;
1902
1903	q->in_use -= reclaim;
1904	q->cleaned += reclaim;
1905}
1906
1907static __inline int
1908immediate(const struct mbuf *m)
1909{
1910	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1911}
1912
1913/**
1914 *	ctrl_xmit - send a packet through an SGE control Tx queue
1915 *	@adap: the adapter
1916 *	@q: the control queue
1917 *	@m: the packet
1918 *
1919 *	Send a packet through an SGE control Tx queue.  Packets sent through
1920 *	a control queue must fit entirely as immediate data in a single Tx
1921 *	descriptor and have no page fragments.
1922 */
1923static int
1924ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1925{
1926	int ret;
1927	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1928	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1929
1930	if (__predict_false(!immediate(m))) {
1931		m_freem(m);
1932		return 0;
1933	}
1934
1935	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1936	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1937
1938	TXQ_LOCK(qs);
1939again:	reclaim_completed_tx_imm(q);
1940
1941	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1942	if (__predict_false(ret)) {
1943		if (ret == 1) {
1944			TXQ_UNLOCK(qs);
1945			log(LOG_ERR, "no desc available\n");
1946			return (ENOSPC);
1947		}
1948		goto again;
1949	}
1950	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1951
1952	q->in_use++;
1953	if (++q->pidx >= q->size) {
1954		q->pidx = 0;
1955		q->gen ^= 1;
1956	}
1957	TXQ_UNLOCK(qs);
1958	t3_write_reg(adap, A_SG_KDOORBELL,
1959		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1960	return (0);
1961}
1962
1963
1964/**
1965 *	restart_ctrlq - restart a suspended control queue
1966 *	@qs: the queue set containing the control queue
1967 *
1968 *	Resumes transmission on a suspended Tx control queue.
1969 */
1970static void
1971restart_ctrlq(void *data, int npending)
1972{
1973	struct mbuf *m;
1974	struct sge_qset *qs = (struct sge_qset *)data;
1975	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1976	adapter_t *adap = qs->port->adapter;
1977
1978	log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
1979
1980	TXQ_LOCK(qs);
1981again:	reclaim_completed_tx_imm(q);
1982
1983	while (q->in_use < q->size &&
1984	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1985
1986		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1987
1988		if (++q->pidx >= q->size) {
1989			q->pidx = 0;
1990			q->gen ^= 1;
1991		}
1992		q->in_use++;
1993	}
1994	if (!mbufq_empty(&q->sendq)) {
1995		setbit(&qs->txq_stopped, TXQ_CTRL);
1996
1997		if (should_restart_tx(q) &&
1998		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1999			goto again;
2000		q->stops++;
2001	}
2002	TXQ_UNLOCK(qs);
2003	t3_write_reg(adap, A_SG_KDOORBELL,
2004		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2005}
2006
2007
2008/*
2009 * Send a management message through control queue 0
2010 */
2011int
2012t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2013{
2014	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2015}
2016
2017/**
2018 *	free_qset - free the resources of an SGE queue set
2019 *	@sc: the controller owning the queue set
2020 *	@q: the queue set
2021 *
2022 *	Release the HW and SW resources associated with an SGE queue set, such
2023 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2024 *	queue set must be quiesced prior to calling this.
2025 */
2026static void
2027t3_free_qset(adapter_t *sc, struct sge_qset *q)
2028{
2029	int i;
2030
2031	reclaim_completed_tx(q, 0, TXQ_ETH);
2032	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2033		if (q->txq[i].txq_mr != NULL)
2034			buf_ring_free(q->txq[i].txq_mr, M_DEVBUF);
2035		if (q->txq[i].txq_ifq != NULL) {
2036			ifq_delete(q->txq[i].txq_ifq);
2037			free(q->txq[i].txq_ifq, M_DEVBUF);
2038		}
2039	}
2040
2041	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2042		if (q->fl[i].desc) {
2043			mtx_lock_spin(&sc->sge.reg_lock);
2044			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2045			mtx_unlock_spin(&sc->sge.reg_lock);
2046			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2047			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2048					q->fl[i].desc_map);
2049			bus_dma_tag_destroy(q->fl[i].desc_tag);
2050			bus_dma_tag_destroy(q->fl[i].entry_tag);
2051		}
2052		if (q->fl[i].sdesc) {
2053			free_rx_bufs(sc, &q->fl[i]);
2054			free(q->fl[i].sdesc, M_DEVBUF);
2055		}
2056	}
2057
2058	mtx_unlock(&q->lock);
2059	MTX_DESTROY(&q->lock);
2060	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2061		if (q->txq[i].desc) {
2062			mtx_lock_spin(&sc->sge.reg_lock);
2063			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2064			mtx_unlock_spin(&sc->sge.reg_lock);
2065			bus_dmamap_unload(q->txq[i].desc_tag,
2066					q->txq[i].desc_map);
2067			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2068					q->txq[i].desc_map);
2069			bus_dma_tag_destroy(q->txq[i].desc_tag);
2070			bus_dma_tag_destroy(q->txq[i].entry_tag);
2071		}
2072		if (q->txq[i].sdesc) {
2073			free(q->txq[i].sdesc, M_DEVBUF);
2074		}
2075	}
2076
2077	if (q->rspq.desc) {
2078		mtx_lock_spin(&sc->sge.reg_lock);
2079		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2080		mtx_unlock_spin(&sc->sge.reg_lock);
2081
2082		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2083		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2084			        q->rspq.desc_map);
2085		bus_dma_tag_destroy(q->rspq.desc_tag);
2086		MTX_DESTROY(&q->rspq.lock);
2087	}
2088
2089#ifdef LRO_SUPPORTED
2090	tcp_lro_free(&q->lro.ctrl);
2091#endif
2092
2093	bzero(q, sizeof(*q));
2094}
2095
2096/**
2097 *	t3_free_sge_resources - free SGE resources
2098 *	@sc: the adapter softc
2099 *
2100 *	Frees resources used by the SGE queue sets.
2101 */
2102void
2103t3_free_sge_resources(adapter_t *sc)
2104{
2105	int i, nqsets;
2106
2107	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2108		nqsets += sc->port[i].nqsets;
2109
2110	for (i = 0; i < nqsets; ++i) {
2111		TXQ_LOCK(&sc->sge.qs[i]);
2112		t3_free_qset(sc, &sc->sge.qs[i]);
2113	}
2114
2115}
2116
2117/**
2118 *	t3_sge_start - enable SGE
2119 *	@sc: the controller softc
2120 *
2121 *	Enables the SGE for DMAs.  This is the last step in starting packet
2122 *	transfers.
2123 */
2124void
2125t3_sge_start(adapter_t *sc)
2126{
2127	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2128}
2129
2130/**
2131 *	t3_sge_stop - disable SGE operation
2132 *	@sc: the adapter
2133 *
2134 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2135 *	from error interrupts) or from normal process context.  In the latter
2136 *	case it also disables any pending queue restart tasks.  Note that
2137 *	if it is called in interrupt context it cannot disable the restart
2138 *	tasks, as it cannot wait; however, the tasks will have no effect
2139 *	since the doorbells are disabled and the driver will call this again
2140 *	later from process context, at which time the tasks will be stopped
2141 *	if they are still running.
2142 */
2143void
2144t3_sge_stop(adapter_t *sc)
2145{
2146	int i, nqsets;
2147
2148	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2149
2150	if (sc->tq == NULL)
2151		return;
2152
2153	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2154		nqsets += sc->port[i].nqsets;
2155#ifdef notyet
2156	/*
2157	 *
2158	 * XXX
2159	 */
2160	for (i = 0; i < nqsets; ++i) {
2161		struct sge_qset *qs = &sc->sge.qs[i];
2162
2163		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2164		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2165	}
2166#endif
2167}
2168
2169/**
2170 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2171 *	@qs: the queue set that owns the Tx queue
2172 *	@reclaimable: the number of descriptors to reclaim
2173 *	@queue: the Tx queue within the queue set to reclaim from
2174 *	        (TXQ_ETH, TXQ_OFLD or TXQ_CTRL)
2175 *
2176 *	Reclaims Tx descriptors from an SGE Tx queue and frees the
2177 *	associated Tx buffers.  Descriptors that no longer have an mbuf
2178 *	attached are counted in txq_skipped.
2179 *
2180 *	Called with the Tx queue lock held.
2181 */
2182void
2183t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2184{
2185	struct tx_sw_desc *txsd;
2186	unsigned int cidx, mask;
2187	struct sge_txq *q = &qs->txq[queue];
2188
2189#ifdef T3_TRACE
2190	T3_TRACE2(qs->port->adapter->tb[q->cntxt_id & 7],
2191		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, q->cidx);
2192#endif
2193	cidx = q->cidx;
2194	mask = q->size - 1;
2195	txsd = &q->sdesc[cidx];
2196
2197	mtx_assert(&qs->lock, MA_OWNED);
2198	while (reclaimable--) {
2199		prefetch(q->sdesc[(cidx + 1) & mask].m);
2200		prefetch(q->sdesc[(cidx + 2) & mask].m);
2201
2202		if (txsd->m != NULL) {
2203			if (txsd->flags & TX_SW_DESC_MAPPED) {
2204				bus_dmamap_unload(q->entry_tag, txsd->map);
2205				txsd->flags &= ~TX_SW_DESC_MAPPED;
2206			}
2207			m_freem_list(txsd->m);
2208			txsd->m = NULL;
2209		} else
2210			q->txq_skipped++;
2211
2212		++txsd;
2213		if (++cidx == q->size) {
2214			cidx = 0;
2215			txsd = q->sdesc;
2216		}
2217	}
2218	q->cidx = cidx;
2219
2220}
2221
2222/**
2223 *	is_new_response - check if a response is newly written
2224 *	@r: the response descriptor
2225 *	@q: the response queue
2226 *
2227 *	Returns true if a response descriptor contains a yet unprocessed
2228 *	response.
2229 */
2230static __inline int
2231is_new_response(const struct rsp_desc *r,
2232    const struct sge_rspq *q)
2233{
2234	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2235}
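/*
 * Illustrative note on the generation check above: the SGE writes the
 * opposite generation value into F_RSPD_GEN2 after each wrap of the
 * response ring, and the driver flips q->gen whenever q->cidx wraps
 * (see process_responses() below), so a descriptor whose generation
 * bit matches q->gen has been written by hardware but not yet
 * processed.
 */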
2236
2237#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2238#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2239			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2240			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2241			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2242
2243/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2244#define NOMEM_INTR_DELAY 2500
2245
2246/**
2247 *	write_ofld_wr - write an offload work request
2248 *	@adap: the adapter
2249 *	@m: the packet to send
2250 *	@q: the Tx queue
2251 *	@pidx: index of the first Tx descriptor to write
2252 *	@gen: the generation value to use
2253 *	@ndesc: number of descriptors the packet will occupy
2254 *
2255 *	Write an offload work request to send the supplied packet.  The packet
2256 *	data already carry the work request with most fields populated.
2257 */
2258static void
2259write_ofld_wr(adapter_t *adap, struct mbuf *m,
2260    struct sge_txq *q, unsigned int pidx,
2261    unsigned int gen, unsigned int ndesc,
2262    bus_dma_segment_t *segs, unsigned int nsegs)
2263{
2264	unsigned int sgl_flits, flits;
2265	struct work_request_hdr *from;
2266	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2267	struct tx_desc *d = &q->desc[pidx];
2268	struct txq_state txqs;
2269
2270	if (immediate(m) && nsegs == 0) {
2271		write_imm(d, m, m->m_len, gen);
2272		return;
2273	}
2274
2275	/* Only TX_DATA builds SGLs */
2276	from = mtod(m, struct work_request_hdr *);
2277	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2278
2279	flits = m->m_len / 8;
2280	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2281
2282	make_sgl(sgp, segs, nsegs);
2283	sgl_flits = sgl_len(nsegs);
2284
2285	txqs.gen = gen;
2286	txqs.pidx = pidx;
2287	txqs.compl = 0;
2288
2289	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2290	    from->wrh_hi, from->wrh_lo);
2291}
2292
2293/**
2294 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2295 *	@m: the packet
2296 *
2297 * 	Returns the number of Tx descriptors needed for the given offload
2298 * 	packet.  These packets are already fully constructed.
2299 */
2300static __inline unsigned int
2301calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2302{
2303	unsigned int flits, cnt = 0;
2304	int ndescs;
2305
2306	if (m->m_len <= WR_LEN && nsegs == 0)
2307		return (1);                 /* packet fits as immediate data */
2308
2309	/*
2310	 * This needs to be re-visited for TOE
2311	 */
2312
2313	cnt = nsegs;
2314
2315	/* headers */
2316	flits = m->m_len / 8;
2317
2318	ndescs = flits_to_desc(flits + sgl_len(cnt));
2319
2320	return (ndescs);
2321}
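/*
 * Worked example for calc_tx_descs_ofld() (illustrative only, assuming
 * the usual SGL packing of three flits per two gather entries): an
 * offload packet with a 40-byte work request header (flits = 5) and
 * nsegs = 4 needs sgl_len(4) = 6 additional flits, and
 * flits_to_desc(5 + 6) rounds the 11-flit total up to whole Tx
 * descriptors.
 */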
2322
2323/**
2324 *	ofld_xmit - send a packet through an offload queue
2325 *	@adap: the adapter
2326 *	@q: the Tx offload queue
2327 *	@m: the packet
2328 *
2329 *	Send an offload packet through an SGE offload queue.
2330 */
2331static int
2332ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2333{
2334	int ret, nsegs;
2335	unsigned int ndesc;
2336	unsigned int pidx, gen;
2337	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2338	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2339	struct tx_sw_desc *stx;
2340
2341	nsegs = m_get_sgllen(m);
2342	vsegs = m_get_sgl(m);
2343	ndesc = calc_tx_descs_ofld(m, nsegs);
2344	busdma_map_sgl(vsegs, segs, nsegs);
2345
2346	stx = &q->sdesc[q->pidx];
2347
2348	TXQ_LOCK(qs);
2349again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2350	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2351	if (__predict_false(ret)) {
2352		if (ret == 1) {
2353			printf("no ofld desc avail\n");
2354
2355			m_set_priority(m, ndesc);     /* save for restart */
2356			TXQ_UNLOCK(qs);
2357			return (EINTR);
2358		}
2359		goto again;
2360	}
2361
2362	gen = q->gen;
2363	q->in_use += ndesc;
2364	pidx = q->pidx;
2365	q->pidx += ndesc;
2366	if (q->pidx >= q->size) {
2367		q->pidx -= q->size;
2368		q->gen ^= 1;
2369	}
2370#ifdef T3_TRACE
2371	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2372		  "ofld_xmit: ndesc %u, pidx %u, len %u, headlen %u, nsegs %u",
2373		  ndesc, pidx, m->m_pkthdr.len, m->m_len,
2374		  nsegs);
2375#endif
2376	TXQ_UNLOCK(qs);
2377
2378	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2379	check_ring_tx_db(adap, q);
2380	return (0);
2381}
2382
2383/**
2384 *	restart_offloadq - restart a suspended offload queue
2385 *	@qs: the queue set containing the offload queue
2386 *
2387 *	Resumes transmission on a suspended Tx offload queue.
2388 */
2389static void
2390restart_offloadq(void *data, int npending)
2391{
2392	struct mbuf *m;
2393	struct sge_qset *qs = data;
2394	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2395	adapter_t *adap = qs->port->adapter;
2396	bus_dma_segment_t segs[TX_MAX_SEGS];
2397	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2398	int nsegs, cleaned;
2399
2400	TXQ_LOCK(qs);
2401again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2402
2403	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2404		unsigned int gen, pidx;
2405		unsigned int ndesc = m_get_priority(m);
2406
2407		if (__predict_false(q->size - q->in_use < ndesc)) {
2408			setbit(&qs->txq_stopped, TXQ_OFLD);
2409			if (should_restart_tx(q) &&
2410			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2411				goto again;
2412			q->stops++;
2413			break;
2414		}
2415
2416		gen = q->gen;
2417		q->in_use += ndesc;
2418		pidx = q->pidx;
2419		q->pidx += ndesc;
2420		if (q->pidx >= q->size) {
2421			q->pidx -= q->size;
2422			q->gen ^= 1;
2423		}
2424
2425		(void)mbufq_dequeue(&q->sendq);
2426		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2427		TXQ_UNLOCK(qs);
2428		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2429		TXQ_LOCK(qs);
2430	}
2431#if USE_GTS
2432	set_bit(TXQ_RUNNING, &q->flags);
2433	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2434#endif
2435	TXQ_UNLOCK(qs);
2436	wmb();
2437	t3_write_reg(adap, A_SG_KDOORBELL,
2438		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2439}
2440
2441/**
2442 *	queue_set - return the queue set a packet should use
2443 *	@m: the packet
2444 *
2445 *	Maps a packet to the SGE queue set it should use.  The desired queue
2446 *	set is carried in bits 1-3 in the packet's priority.
2447 */
2448static __inline int
2449queue_set(const struct mbuf *m)
2450{
2451	return m_get_priority(m) >> 1;
2452}
2453
2454/**
2455 *	is_ctrl_pkt - return whether an offload packet is a control packet
2456 *	@m: the packet
2457 *
2458 *	Determines whether an offload packet should use an OFLD or a CTRL
2459 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2460 */
2461static __inline int
2462is_ctrl_pkt(const struct mbuf *m)
2463{
2464	return m_get_priority(m) & 1;
2465}
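/*
 * Example of the priority encoding consumed by queue_set() and
 * is_ctrl_pkt() above (illustrative only): a priority of 5 (binary
 * 101) selects queue set 2 and routes the packet to that set's CTRL
 * queue, while a priority of 4 (binary 100) selects queue set 2 and
 * routes the packet to its OFLD queue.
 */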
2466
2467/**
2468 *	t3_offload_tx - send an offload packet
2469 *	@tdev: the offload device to send to
2470 *	@m: the packet
2471 *
2472 *	Sends an offload packet.  We use the packet priority to select the
2473 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2474 *	should be sent as regular or control, bits 1-3 select the queue set.
2475 */
2476int
2477t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2478{
2479	adapter_t *adap = tdev2adap(tdev);
2480	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2481
2482	if (__predict_false(is_ctrl_pkt(m)))
2483		return ctrl_xmit(adap, qs, m);
2484
2485	return ofld_xmit(adap, qs, m);
2486}
2487
2488/**
2489 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2490 *	@tdev: the offload device that will be receiving the packets
2491 *	@q: the SGE response queue that assembled the bundle
2492 *	@mbufs: the partial bundle
2493 *	@n: the number of packets in the bundle
2494 *
2495 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2496 */
2497static __inline void
2498deliver_partial_bundle(struct t3cdev *tdev,
2499			struct sge_rspq *q,
2500			struct mbuf *mbufs[], int n)
2501{
2502	if (n) {
2503		q->offload_bundles++;
2504		cxgb_ofld_recv(tdev, mbufs, n);
2505	}
2506}
2507
2508static __inline int
2509rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2510    struct mbuf *m, struct mbuf *rx_gather[],
2511    unsigned int gather_idx)
2512{
2513
2514	rq->offload_pkts++;
2515	m->m_pkthdr.header = mtod(m, void *);
2516	rx_gather[gather_idx++] = m;
2517	if (gather_idx == RX_BUNDLE_SIZE) {
2518		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2519		gather_idx = 0;
2520		rq->offload_bundles++;
2521	}
2522	return (gather_idx);
2523}
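/*
 * Illustrative note: rx_offload() batches offload packets into
 * rx_gather[] and hands a full bundle of RX_BUNDLE_SIZE mbufs to
 * cxgb_ofld_recv() at once; deliver_partial_bundle() above flushes
 * whatever remains when response processing finishes.
 */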
2524
2525static void
2526restart_tx(struct sge_qset *qs)
2527{
2528	struct adapter *sc = qs->port->adapter;
2529
2530
2531	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2532	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2533	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2534		qs->txq[TXQ_OFLD].restarts++;
2535		DPRINTF("restarting TXQ_OFLD\n");
2536		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2537	}
2538	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2539	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2540	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2541	    qs->txq[TXQ_CTRL].in_use);
2542
2543	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2544	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2545	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2546		qs->txq[TXQ_CTRL].restarts++;
2547		DPRINTF("restarting TXQ_CTRL\n");
2548		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2549	}
2550}
2551
2552/**
2553 *	t3_sge_alloc_qset - initialize an SGE queue set
2554 *	@sc: the controller softc
2555 *	@id: the queue set id
2556 *	@nports: how many Ethernet ports will be using this queue set
2557 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2558 *	@p: configuration parameters for this queue set
2559 *	@ntxq: number of Tx queues for the queue set
2560 *	@pi: port info for queue set
2561 *
2562 *	Allocate resources and initialize an SGE queue set.  A queue set
2563 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2564 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2565 *	queue, offload queue, and control queue.
2566 */
2567int
2568t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2569		  const struct qset_params *p, int ntxq, struct port_info *pi)
2570{
2571	struct sge_qset *q = &sc->sge.qs[id];
2572	int i, ret = 0;
2573
2574	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2575	q->port = pi;
2576
2577	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2578
2579		if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2580			    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2581			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2582			goto err;
2583		}
2584		if ((q->txq[i].txq_ifq =
2585			malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT|M_ZERO))
2586		    == NULL) {
2587			device_printf(sc->dev, "failed to allocate ifq\n");
2588			goto err;
2589		}
2590		ifq_init(q->txq[i].txq_ifq, pi->ifp);
2591		callout_init(&q->txq[i].txq_timer, 1);
2592		callout_init(&q->txq[i].txq_watchdog, 1);
2593		q->txq[i].txq_timer.c_cpu = id % mp_ncpus;
2594		q->txq[i].txq_watchdog.c_cpu = id % mp_ncpus;
2595	}
2596	init_qset_cntxt(q, id);
2597	q->idx = id;
2598	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2599		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2600		    &q->fl[0].desc, &q->fl[0].sdesc,
2601		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2602		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2603		printf("error %d from alloc ring fl0\n", ret);
2604		goto err;
2605	}
2606
2607	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2608		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2609		    &q->fl[1].desc, &q->fl[1].sdesc,
2610		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2611		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2612		printf("error %d from alloc ring fl1\n", ret);
2613		goto err;
2614	}
2615
2616	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2617		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2618		    &q->rspq.desc_tag, &q->rspq.desc_map,
2619		    NULL, NULL)) != 0) {
2620		printf("error %d from alloc ring rspq\n", ret);
2621		goto err;
2622	}
2623
2624	for (i = 0; i < ntxq; ++i) {
2625		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2626
2627		if ((ret = alloc_ring(sc, p->txq_size[i],
2628			    sizeof(struct tx_desc), sz,
2629			    &q->txq[i].phys_addr, &q->txq[i].desc,
2630			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2631			    &q->txq[i].desc_map,
2632			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2633			printf("error %d from alloc ring tx %i\n", ret, i);
2634			goto err;
2635		}
2636		mbufq_init(&q->txq[i].sendq);
2637		q->txq[i].gen = 1;
2638		q->txq[i].size = p->txq_size[i];
2639	}
2640
2641	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2642	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2643	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2644	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2645
2646	q->fl[0].gen = q->fl[1].gen = 1;
2647	q->fl[0].size = p->fl_size;
2648	q->fl[1].size = p->jumbo_size;
2649
2650	q->rspq.gen = 1;
2651	q->rspq.cidx = 0;
2652	q->rspq.size = p->rspq_size;
2653
2654	q->txq[TXQ_ETH].stop_thres = nports *
2655	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2656
2657	q->fl[0].buf_size = MCLBYTES;
2658	q->fl[0].zone = zone_pack;
2659	q->fl[0].type = EXT_PACKET;
2660#if __FreeBSD_version > 800000
2661	if (cxgb_use_16k_clusters) {
2662		q->fl[1].buf_size = MJUM16BYTES;
2663		q->fl[1].zone = zone_jumbo16;
2664		q->fl[1].type = EXT_JUMBO16;
2665	} else {
2666		q->fl[1].buf_size = MJUM9BYTES;
2667		q->fl[1].zone = zone_jumbo9;
2668		q->fl[1].type = EXT_JUMBO9;
2669	}
2670#else
2671	q->fl[1].buf_size = MJUMPAGESIZE;
2672	q->fl[1].zone = zone_jumbop;
2673	q->fl[1].type = EXT_JUMBOP;
2674#endif
2675
2676#ifdef LRO_SUPPORTED
2677	/* Allocate and setup the lro_ctrl structure */
2678	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2679	ret = tcp_lro_init(&q->lro.ctrl);
2680	if (ret) {
2681		printf("error %d from tcp_lro_init\n", ret);
2682		goto err;
2683	}
2684	q->lro.ctrl.ifp = pi->ifp;
2685#endif
2686
2687	mtx_lock_spin(&sc->sge.reg_lock);
2688	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2689				   q->rspq.phys_addr, q->rspq.size,
2690				   q->fl[0].buf_size, 1, 0);
2691	if (ret) {
2692		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2693		goto err_unlock;
2694	}
2695
2696	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2697		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2698					  q->fl[i].phys_addr, q->fl[i].size,
2699					  q->fl[i].buf_size, p->cong_thres, 1,
2700					  0);
2701		if (ret) {
2702			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2703			goto err_unlock;
2704		}
2705	}
2706
2707	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2708				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2709				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2710				 1, 0);
2711	if (ret) {
2712		printf("error %d from t3_sge_init_ecntxt\n", ret);
2713		goto err_unlock;
2714	}
2715
2716	if (ntxq > 1) {
2717		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2718					 USE_GTS, SGE_CNTXT_OFLD, id,
2719					 q->txq[TXQ_OFLD].phys_addr,
2720					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2721		if (ret) {
2722			printf("error %d from t3_sge_init_ecntxt\n", ret);
2723			goto err_unlock;
2724		}
2725	}
2726
2727	if (ntxq > 2) {
2728		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2729					 SGE_CNTXT_CTRL, id,
2730					 q->txq[TXQ_CTRL].phys_addr,
2731					 q->txq[TXQ_CTRL].size,
2732					 q->txq[TXQ_CTRL].token, 1, 0);
2733		if (ret) {
2734			printf("error %d from t3_sge_init_ecntxt\n", ret);
2735			goto err_unlock;
2736		}
2737	}
2738
2739	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2740	    device_get_unit(sc->dev), irq_vec_idx);
2741	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2742
2743	mtx_unlock_spin(&sc->sge.reg_lock);
2744	t3_update_qset_coalesce(q, p);
2745	q->port = pi;
2746
2747	refill_fl(sc, &q->fl[0], q->fl[0].size);
2748	refill_fl(sc, &q->fl[1], q->fl[1].size);
2749	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2750
2751	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2752		     V_NEWTIMER(q->rspq.holdoff_tmr));
2753
2754	return (0);
2755
2756err_unlock:
2757	mtx_unlock_spin(&sc->sge.reg_lock);
2758err:
2759	TXQ_LOCK(q);
2760	t3_free_qset(sc, q);
2761
2762	return (ret);
2763}
2764
2765/*
2766 * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2767 * ethernet data.  Hardware checksum results and any VLAN tag supplied by
2768 * the hardware are also recorded in the mbuf here.
2769 */
2770void
2771t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2772{
2773	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2774	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2775	struct ifnet *ifp = pi->ifp;
2776
2777	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2778
2779	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2780	    cpl->csum_valid && cpl->csum == 0xffff) {
2781		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2782		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID |
2783		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2784		m->m_pkthdr.csum_data = 0xffff;
2785	}
2786	/*
2787	 * XXX need to add VLAN support for 6.x
2788	 */
2789#ifdef VLAN_SUPPORTED
2790	if (__predict_false(cpl->vlan_valid)) {
2791		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2792		m->m_flags |= M_VLANTAG;
2793	}
2794#endif
2795
2796	m->m_pkthdr.rcvif = ifp;
2797	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2798	/*
2799	 * adjust after conversion to mbuf chain
2800	 */
2801	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2802	m->m_len -= (sizeof(*cpl) + ethpad);
2803	m->m_data += (sizeof(*cpl) + ethpad);
2804}
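/*
 * Illustrative note: process_responses() below passes ethpad = 2 for
 * tunneled packets, so the adjustment above skips the two pad bytes
 * plus the CPL_RX_PKT header and leaves m_data pointing at the start
 * of the Ethernet frame (the familiar ETHER_ALIGN-style padding that
 * keeps the IP header 32-bit aligned).
 */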
2805
2806/**
2807 *	get_packet - return the next ingress packet buffer from a free list
2808 *	@adap: the adapter that received the packet
2809 *	@drop_thres: # of remaining buffers before we start dropping packets
2810 *	@qs: the qset that the SGE free list holding the packet belongs to
2811 *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2812 *      @r: response descriptor
2813 *
2814 *	Get the next packet from a free list and complete setup of the
2815 *	mbuf.  If the packet is small we make a copy and recycle the
2816 *	original buffer, otherwise we use the original buffer itself.  If a
2817 *	positive drop threshold is supplied packets are dropped and their
2818 *	buffers recycled if (a) the number of remaining buffers is under the
2819 *	threshold and the packet is too big to copy, or (b) the packet should
2820 *	be copied but there is no memory for the copy.
2821 */
2822static int
2823get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2824    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2825{
2826
2827	unsigned int len_cq =  ntohl(r->len_cq);
2828	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2829	int mask, cidx = fl->cidx;
2830	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2831	uint32_t len = G_RSPD_LEN(len_cq);
2832	uint32_t flags = M_EXT;
2833	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2834	caddr_t cl;
2835	struct mbuf *m;
2836	int ret = 0;
2837
2838	mask = fl->size - 1;
2839	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2840	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2841	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2842	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2843
2844	fl->credits--;
2845	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2846
2847	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2848	    sopeop == RSPQ_SOP_EOP) {
2849		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2850			goto skip_recycle;
2851		cl = mtod(m, void *);
2852		memcpy(cl, sd->rxsd_cl, len);
2853		recycle_rx_buf(adap, fl, fl->cidx);
2854		m->m_pkthdr.len = m->m_len = len;
2855		m->m_flags = 0;
2856		mh->mh_head = mh->mh_tail = m;
2857		ret = 1;
2858		goto done;
2859	} else {
2860	skip_recycle:
2861		bus_dmamap_unload(fl->entry_tag, sd->map);
2862		cl = sd->rxsd_cl;
2863		m = sd->m;
2864
2865		if ((sopeop == RSPQ_SOP_EOP) ||
2866		    (sopeop == RSPQ_SOP))
2867			flags |= M_PKTHDR;
2868		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2869		if (fl->zone == zone_pack) {
2870			/*
2871			 * restore clobbered data pointer
2872			 */
2873			m->m_data = m->m_ext.ext_buf;
2874		} else {
2875			m_cljset(m, cl, fl->type);
2876		}
2877		m->m_len = len;
2878	}
2879	switch(sopeop) {
2880	case RSPQ_SOP_EOP:
2881		ret = 1;
2882		/* FALLTHROUGH */
2883	case RSPQ_SOP:
2884		mh->mh_head = mh->mh_tail = m;
2885		m->m_pkthdr.len = len;
2886		break;
2887	case RSPQ_EOP:
2888		ret = 1;
2889		/* FALLTHROUGH */
2890	case RSPQ_NSOP_NEOP:
2891		if (mh->mh_tail == NULL) {
2892			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2893			m_freem(m);
2894			break;
2895		}
2896		mh->mh_tail->m_next = m;
2897		mh->mh_tail = m;
2898		mh->mh_head->m_pkthdr.len += len;
2899		break;
2900	}
2901	if (cxgb_debug)
2902		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2903done:
2904	if (++fl->cidx == fl->size)
2905		fl->cidx = 0;
2906
2907	return (ret);
2908}
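/*
 * Illustrative example of the SOP/EOP handling above: a frame spanning
 * three free-list buffers arrives as RSPQ_SOP, RSPQ_NSOP_NEOP and
 * RSPQ_EOP descriptors.  The first buffer starts a new chain in
 * mh->mh_head, the middle buffer is appended at mh->mh_tail, and only
 * the final EOP buffer makes get_packet() return 1 so that the caller
 * passes the completed chain up the stack.
 */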
2909
2910/**
2911 *	handle_rsp_cntrl_info - handles control information in a response
2912 *	@qs: the queue set corresponding to the response
2913 *	@flags: the response control flags
2914 *
2915 *	Handles the control information of an SGE response, such as GTS
2916 *	indications and completion credits for the queue set's Tx queues.
2917 *	The HW coalesces credits; we don't do any extra SW coalescing.
2918 */
2919static __inline void
2920handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2921{
2922	unsigned int credits;
2923
2924#if USE_GTS
2925	if (flags & F_RSPD_TXQ0_GTS)
2926		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2927#endif
2928	credits = G_RSPD_TXQ0_CR(flags);
2929	if (credits)
2930		qs->txq[TXQ_ETH].processed += credits;
2931
2932	credits = G_RSPD_TXQ2_CR(flags);
2933	if (credits)
2934		qs->txq[TXQ_CTRL].processed += credits;
2935
2936# if USE_GTS
2937	if (flags & F_RSPD_TXQ1_GTS)
2938		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2939# endif
2940	credits = G_RSPD_TXQ1_CR(flags);
2941	if (credits)
2942		qs->txq[TXQ_OFLD].processed += credits;
2943
2944}
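/*
 * Illustrative note: the completion credits above map hardware queue
 * TXQ0 to the Ethernet Tx queue, TXQ1 to the offload queue and TXQ2 to
 * the control queue; the credits accumulate in each queue's
 * 'processed' counter and are turned into freed descriptors later by
 * the reclaim paths.
 */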
2945
2946static void
2947check_ring_db(adapter_t *adap, struct sge_qset *qs,
2948    unsigned int sleeping)
2949{
2950	;
2951}
2952
2953/**
2954 *	process_responses - process responses from an SGE response queue
2955 *	@adap: the adapter
2956 *	@qs: the queue set to which the response queue belongs
2957 *	@budget: how many responses can be processed in this round
2958 *
2959 *	Process responses from an SGE response queue up to the supplied budget.
2960 *	Responses include received packets as well as credits and other events
2961 *	for the queues that belong to the response queue's queue set.
2962 *	A negative budget is effectively unlimited.
2963 *
2964 *	Additionally choose the interrupt holdoff time for the next interrupt
2965 *	on this queue.  If the system is under memory shortage, use a fairly
2966 *	long delay to help recovery.
2967 */
2968static int
2969process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2970{
2971	struct sge_rspq *rspq = &qs->rspq;
2972	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2973	int budget_left = budget;
2974	unsigned int sleeping = 0;
2975#ifdef LRO_SUPPORTED
2976	int lro_enabled = qs->lro.enabled;
2977	int skip_lro;
2978	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2979#endif
2980	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2981	int ngathered = 0;
2982#ifdef DEBUG
2983	static int last_holdoff = 0;
2984	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2985		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2986		last_holdoff = rspq->holdoff_tmr;
2987	}
2988#endif
2989	rspq->next_holdoff = rspq->holdoff_tmr;
2990
2991	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2992		int eth, eop = 0, ethpad = 0;
2993		uint32_t flags = ntohl(r->flags);
2994		uint32_t rss_csum = *(const uint32_t *)r;
2995		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2996
2997		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2998
2999		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
3000			struct mbuf *m;
3001
3002			if (cxgb_debug)
3003				printf("async notification\n");
3004
3005			if (rspq->rspq_mh.mh_head == NULL) {
3006				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3007				m = rspq->rspq_mh.mh_head;
3008			} else {
3009				m = m_gethdr(M_DONTWAIT, MT_DATA);
3010			}
3011			if (m == NULL)
3012				goto no_mem;
3013
3014			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3015			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3016			*mtod(m, char *) = CPL_ASYNC_NOTIF;
3017			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3018			eop = 1;
3019			rspq->async_notif++;
3020			goto skip;
3021		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3022			struct mbuf *m = NULL;
3023
3024			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3025			    r->rss_hdr.opcode, rspq->cidx);
3026			if (rspq->rspq_mh.mh_head == NULL)
3027				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3028			else
3029				m = m_gethdr(M_DONTWAIT, MT_DATA);
3030
3031			if (rspq->rspq_mh.mh_head == NULL &&  m == NULL) {
3032		no_mem:
3033				rspq->next_holdoff = NOMEM_INTR_DELAY;
3034				budget_left--;
3035				break;
3036			}
3037			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
3038			eop = 1;
3039			rspq->imm_data++;
3040		} else if (r->len_cq) {
3041			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3042
3043			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
3044			if (eop) {
3045				rspq->rspq_mh.mh_head->m_flags |= M_FLOWID;
3046				rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash;
3047			}
3048
3049			ethpad = 2;
3050		} else {
3051			rspq->pure_rsps++;
3052		}
3053	skip:
3054		if (flags & RSPD_CTRL_MASK) {
3055			sleeping |= flags & RSPD_GTS_MASK;
3056			handle_rsp_cntrl_info(qs, flags);
3057		}
3058
3059		r++;
3060		if (__predict_false(++rspq->cidx == rspq->size)) {
3061			rspq->cidx = 0;
3062			rspq->gen ^= 1;
3063			r = rspq->desc;
3064		}
3065
3066		if (++rspq->credits >= (rspq->size / 4)) {
3067			refill_rspq(adap, rspq, rspq->credits);
3068			rspq->credits = 0;
3069		}
3070		if (!eth && eop) {
3071			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
3072			/*
3073			 * XXX size mismatch
3074			 */
3075			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
3076
3077
3078			ngathered = rx_offload(&adap->tdev, rspq,
3079			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
3080			rspq->rspq_mh.mh_head = NULL;
3081			DPRINTF("received offload packet\n");
3082
3083		} else if (eth && eop) {
3084			struct mbuf *m = rspq->rspq_mh.mh_head;
3085
3086			t3_rx_eth(adap, rspq, m, ethpad);
3087
3088#ifdef LRO_SUPPORTED
3089			/*
3090			 * The T304 sends incoming packets on any qset.  If LRO
3091			 * is also enabled, we could end up sending the packet up
3092			 * lro_ctrl->ifp's input.  That is incorrect.
3093			 *
3094			 * The mbuf's rcvif was derived from the cpl header and
3095			 * is accurate.  Skip LRO and just use that.
3096			 */
3097			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3098
3099			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro &&
3100			    (tcp_lro_rx(lro_ctrl, m, 0) == 0)) {
3101				/* successfully queued for LRO */
3102			} else
3103#endif
3104			{
3105				/*
3106				 * LRO not enabled, packet unsuitable for LRO,
3107				 * or unable to queue.  Pass it up right now in
3108				 * either case.
3109				 */
3110				struct ifnet *ifp = m->m_pkthdr.rcvif;
3111				(*ifp->if_input)(ifp, m);
3112			}
3113			rspq->rspq_mh.mh_head = NULL;
3114
3115		}
3116		__refill_fl_lt(adap, &qs->fl[0], 32);
3117		__refill_fl_lt(adap, &qs->fl[1], 32);
3118		--budget_left;
3119	}
3120
3121	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3122
3123#ifdef LRO_SUPPORTED
3124	/* Flush LRO */
3125	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3126		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3127		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3128		tcp_lro_flush(lro_ctrl, queued);
3129	}
3130#endif
3131
3132	if (sleeping)
3133		check_ring_db(adap, qs, sleeping);
3134
3135	mb();  /* commit Tx queue processed updates */
3136	if (__predict_false(qs->txq_stopped > 1)) {
3137		printf("restarting tx on %p\n", qs);
3138
3139		restart_tx(qs);
3140	}
3141
3142	__refill_fl_lt(adap, &qs->fl[0], 512);
3143	__refill_fl_lt(adap, &qs->fl[1], 512);
3144	budget -= budget_left;
3145	return (budget);
3146}
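/*
 * Illustrative note: process_responses() returns the number of entries
 * it actually consumed (the budget minus what was left over), so a
 * return value of 0 from process_responses_gts() below means the
 * interrupt found no new work; t3_intr_msix() counts such cases in
 * unhandled_irqs.
 */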
3147
3148/*
3149 * A helper function that processes responses and issues GTS.
3150 */
3151static __inline int
3152process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3153{
3154	int work;
3155	static int last_holdoff = 0;
3156
3157	work = process_responses(adap, rspq_to_qset(rq), -1);
3158
3159	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3160		printf("next_holdoff=%d\n", rq->next_holdoff);
3161		last_holdoff = rq->next_holdoff;
3162	}
3163	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3164	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3165
3166	return (work);
3167}
3168
3169
3170/*
3171 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3172 * Handles data events from SGE response queues as well as error and other
3173 * async events as they all use the same interrupt pin.  We use one SGE
3174 * response queue per port in this mode and protect all response queues with
3175 * queue 0's lock.
3176 */
3177void
3178t3b_intr(void *data)
3179{
3180	uint32_t i, map;
3181	adapter_t *adap = data;
3182	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3183
3184	t3_write_reg(adap, A_PL_CLI, 0);
3185	map = t3_read_reg(adap, A_SG_DATA_INTR);
3186
3187	if (!map)
3188		return;
3189
3190	if (__predict_false(map & F_ERRINTR))
3191		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3192
3193	mtx_lock(&q0->lock);
3194	for_each_port(adap, i)
3195	    if (map & (1 << i))
3196			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3197	mtx_unlock(&q0->lock);
3198}
3199
3200/*
3201 * The MSI interrupt handler.  This needs to handle data events from SGE
3202 * response queues as well as error and other async events as they all use
3203 * the same MSI vector.  We use one SGE response queue per port in this mode
3204 * and protect all response queues with queue 0's lock.
3205 */
3206void
3207t3_intr_msi(void *data)
3208{
3209	adapter_t *adap = data;
3210	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3211	int i, new_packets = 0;
3212
3213	mtx_lock(&q0->lock);
3214
3215	for_each_port(adap, i)
3216	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3217		    new_packets = 1;
3218	mtx_unlock(&q0->lock);
3219	if (new_packets == 0)
3220		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3221}
3222
3223void
3224t3_intr_msix(void *data)
3225{
3226	struct sge_qset *qs = data;
3227	adapter_t *adap = qs->port->adapter;
3228	struct sge_rspq *rspq = &qs->rspq;
3229
3230	if (process_responses_gts(adap, rspq) == 0)
3231		rspq->unhandled_irqs++;
3232}
3233
3234#define QDUMP_SBUF_SIZE		(32 * 400)
3235static int
3236t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3237{
3238	struct sge_rspq *rspq;
3239	struct sge_qset *qs;
3240	int i, err, dump_end, idx;
3241	static int multiplier = 1;
3242	struct sbuf *sb;
3243	struct rsp_desc *rspd;
3244	uint32_t data[4];
3245
3246	rspq = arg1;
3247	qs = rspq_to_qset(rspq);
3248	if (rspq->rspq_dump_count == 0)
3249		return (0);
3250	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3251		log(LOG_WARNING,
3252		    "dump count is too large %d\n", rspq->rspq_dump_count);
3253		rspq->rspq_dump_count = 0;
3254		return (EINVAL);
3255	}
3256	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3257		log(LOG_WARNING,
3258		    "dump start of %d is greater than queue size\n",
3259		    rspq->rspq_dump_start);
3260		rspq->rspq_dump_start = 0;
3261		return (EINVAL);
3262	}
3263	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3264	if (err)
3265		return (err);
3266retry_sbufops:
3267	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3268
3269	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3270	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3271	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3272	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3273	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3274
3275	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3276	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3277
3278	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3279	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3280		idx = i & (RSPQ_Q_SIZE-1);
3281
3282		rspd = &rspq->desc[idx];
3283		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3284		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3285		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3286		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3287		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3288		    be32toh(rspd->len_cq), rspd->intr_gen);
3289	}
3290	if (sbuf_overflowed(sb)) {
3291		sbuf_delete(sb);
3292		multiplier++;
3293		goto retry_sbufops;
3294	}
3295	sbuf_finish(sb);
3296	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3297	sbuf_delete(sb);
3298	return (err);
3299}
3300
3301static int
3302t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3303{
3304	struct sge_txq *txq;
3305	struct sge_qset *qs;
3306	int i, j, err, dump_end;
3307	static int multiplier = 1;
3308	struct sbuf *sb;
3309	struct tx_desc *txd;
3310	uint32_t *WR, wr_hi, wr_lo, gen;
3311	uint32_t data[4];
3312
3313	txq = arg1;
3314	qs = txq_to_qset(txq, TXQ_ETH);
3315	if (txq->txq_dump_count == 0) {
3316		return (0);
3317	}
3318	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3319		log(LOG_WARNING,
3320		    "dump count is too large %d\n", txq->txq_dump_count);
3321		txq->txq_dump_count = 1;
3322		return (EINVAL);
3323	}
3324	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3325		log(LOG_WARNING,
3326		    "dump start of %d is greater than queue size\n",
3327		    txq->txq_dump_start);
3328		txq->txq_dump_start = 0;
3329		return (EINVAL);
3330	}
3331	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3332	if (err)
3333		return (err);
3334
3335
3336retry_sbufops:
3337	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3338
3339	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3340	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3341	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3342	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3343	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3344	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3345	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3346	    txq->txq_dump_start,
3347	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3348
3349	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3350	for (i = txq->txq_dump_start; i < dump_end; i++) {
3351		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3352		WR = (uint32_t *)txd->flit;
3353		wr_hi = ntohl(WR[0]);
3354		wr_lo = ntohl(WR[1]);
3355		gen = G_WR_GEN(wr_lo);
3356
3357		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3358		    wr_hi, wr_lo, gen);
3359		for (j = 2; j < 30; j += 4)
3360			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3361			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3362
3363	}
3364	if (sbuf_overflowed(sb)) {
3365		sbuf_delete(sb);
3366		multiplier++;
3367		goto retry_sbufops;
3368	}
3369	sbuf_finish(sb);
3370	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3371	sbuf_delete(sb);
3372	return (err);
3373}
3374
3375static int
3376t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3377{
3378	struct sge_txq *txq;
3379	struct sge_qset *qs;
3380	int i, j, err, dump_end;
3381	static int multiplier = 1;
3382	struct sbuf *sb;
3383	struct tx_desc *txd;
3384	uint32_t *WR, wr_hi, wr_lo, gen;
3385
3386	txq = arg1;
3387	qs = txq_to_qset(txq, TXQ_CTRL);
3388	if (txq->txq_dump_count == 0) {
3389		return (0);
3390	}
3391	if (txq->txq_dump_count > 256) {
3392		log(LOG_WARNING,
3393		    "dump count is too large %d\n", txq->txq_dump_count);
3394		txq->txq_dump_count = 1;
3395		return (EINVAL);
3396	}
3397	if (txq->txq_dump_start > 255) {
3398		log(LOG_WARNING,
3399		    "dump start of %d is greater than queue size\n",
3400		    txq->txq_dump_start);
3401		txq->txq_dump_start = 0;
3402		return (EINVAL);
3403	}
3404
3405retry_sbufops:
3406	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3407	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3408	    txq->txq_dump_start,
3409	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3410
3411	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3412	for (i = txq->txq_dump_start; i < dump_end; i++) {
3413		txd = &txq->desc[i & (255)];
3414		WR = (uint32_t *)txd->flit;
3415		wr_hi = ntohl(WR[0]);
3416		wr_lo = ntohl(WR[1]);
3417		gen = G_WR_GEN(wr_lo);
3418
3419		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3420		    wr_hi, wr_lo, gen);
3421		for (j = 2; j < 30; j += 4)
3422			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3423			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3424
3425	}
3426	if (sbuf_overflowed(sb)) {
3427		sbuf_delete(sb);
3428		multiplier++;
3429		goto retry_sbufops;
3430	}
3431	sbuf_finish(sb);
3432	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3433	sbuf_delete(sb);
3434	return (err);
3435}
3436
3437static int
3438t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3439{
3440	adapter_t *sc = arg1;
3441	struct qset_params *qsp = &sc->params.sge.qset[0];
3442	int coalesce_usecs;
3443	struct sge_qset *qs;
3444	int i, j, err, nqsets = 0;
3445	struct mtx *lock;
3446
3447	if ((sc->flags & FULL_INIT_DONE) == 0)
3448		return (ENXIO);
3449
3450	coalesce_usecs = qsp->coalesce_usecs;
3451	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3452
3453	if (err != 0) {
3454		return (err);
3455	}
3456	if (coalesce_usecs == qsp->coalesce_usecs)
3457		return (0);
3458
3459	for (i = 0; i < sc->params.nports; i++)
3460		for (j = 0; j < sc->port[i].nqsets; j++)
3461			nqsets++;
3462
3463	coalesce_usecs = max(1, coalesce_usecs);
3464
3465	for (i = 0; i < nqsets; i++) {
3466		qs = &sc->sge.qs[i];
3467		qsp = &sc->params.sge.qset[i];
3468		qsp->coalesce_usecs = coalesce_usecs;
3469
3470		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3471			    &sc->sge.qs[0].rspq.lock;
3472
3473		mtx_lock(lock);
3474		t3_update_qset_coalesce(qs, qsp);
3475		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3476		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3477		mtx_unlock(lock);
3478	}
3479
3480	return (0);
3481}
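/*
 * Usage sketch for the handler above (illustrative; assumes the
 * controller attaches as device "cxgbc" unit 0): setting something
 * like `sysctl dev.cxgbc.0.intr_coal=50` applies a 50us interrupt
 * holdoff to every queue set at once, with values below 1us clamped
 * to 1us.
 */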
3482
3483
3484void
3485t3_add_attach_sysctls(adapter_t *sc)
3486{
3487	struct sysctl_ctx_list *ctx;
3488	struct sysctl_oid_list *children;
3489
3490	ctx = device_get_sysctl_ctx(sc->dev);
3491	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3492
3493	/* random information */
3494	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3495	    "firmware_version",
3496	    CTLFLAG_RD, &sc->fw_version,
3497	    0, "firmware version");
3498	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3499	    "hw_revision",
3500	    CTLFLAG_RD, &sc->params.rev,
3501	    0, "chip revision");
3502	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3503	    "port_types",
3504	    CTLFLAG_RD, &sc->port_types,
3505	    0, "type of ports");
3506	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3507	    "enable_debug",
3508	    CTLFLAG_RW, &cxgb_debug,
3509	    0, "enable verbose debugging output");
3510	SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3511	    CTLFLAG_RD, &sc->tunq_coalesce,
3512	    "#tunneled packets freed");
3513	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3514	    "txq_overrun",
3515	    CTLFLAG_RD, &txq_fills,
3516	    0, "#times txq overrun");
3517}
3518
3519
3520static const char *rspq_name = "rspq";
3521static const char *txq_names[] =
3522{
3523	"txq_eth",
3524	"txq_ofld",
3525	"txq_ctrl"
3526};
3527
3528static int
3529sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3530{
3531	struct port_info *p = arg1;
3532	uint64_t *parg;
3533
3534	if (!p)
3535		return (EINVAL);
3536
3537	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3538	PORT_LOCK(p);
3539	t3_mac_update_stats(&p->mac);
3540	PORT_UNLOCK(p);
3541
3542	return (sysctl_handle_quad(oidp, parg, 0, req));
3543}
3544
3545void
3546t3_add_configured_sysctls(adapter_t *sc)
3547{
3548	struct sysctl_ctx_list *ctx;
3549	struct sysctl_oid_list *children;
3550	int i, j;
3551
3552	ctx = device_get_sysctl_ctx(sc->dev);
3553	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3554
3555	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3556	    "intr_coal",
3557	    CTLTYPE_INT|CTLFLAG_RW, sc,
3558	    0, t3_set_coalesce_usecs,
3559	    "I", "interrupt coalescing timer (us)");
3560
3561	for (i = 0; i < sc->params.nports; i++) {
3562		struct port_info *pi = &sc->port[i];
3563		struct sysctl_oid *poid;
3564		struct sysctl_oid_list *poidlist;
3565		struct mac_stats *mstats = &pi->mac.stats;
3566
3567		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3568		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3569		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3570		poidlist = SYSCTL_CHILDREN(poid);
3571		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3572		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3573		    0, "#queue sets");
3574
3575		for (j = 0; j < pi->nqsets; j++) {
3576			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3577			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3578					  *ctrlqpoid, *lropoid;
3579			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3580					       *txqpoidlist, *ctrlqpoidlist,
3581					       *lropoidlist;
3582			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3583
3584			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3585
3586			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3587			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3588			qspoidlist = SYSCTL_CHILDREN(qspoid);
3589
3590			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3591					CTLFLAG_RD, &qs->fl[0].empty, 0,
3592					"freelist #0 empty");
3593			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3594					CTLFLAG_RD, &qs->fl[1].empty, 0,
3595					"freelist #1 empty");
3596
3597			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3598			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3599			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3600
3601			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3602			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3603			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3604
3605			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3606			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3607			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3608
3609			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3610			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3611			lropoidlist = SYSCTL_CHILDREN(lropoid);
3612
3613			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3614			    CTLFLAG_RD, &qs->rspq.size,
3615			    0, "#entries in response queue");
3616			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3617			    CTLFLAG_RD, &qs->rspq.cidx,
3618			    0, "consumer index");
3619			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3620			    CTLFLAG_RD, &qs->rspq.credits,
3621			    0, "#credits");
3622			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3623			    CTLFLAG_RD, &qs->rspq.phys_addr,
3624	    "physical address of the queue");
			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
			    0, "start rspq dump entry");
			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
			    0, "#rspq entries to dump");
			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
			    0, t3_dump_rspq, "A", "dump of the response queue");


			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
			    0, "#tunneled packets dropped");
			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
			    0, "#tunneled packets waiting to be sent");
#if 0
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
			    0, "#tunneled packets queue producer index");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
			    0, "#tunneled packets queue consumer index");
#endif
			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
			    0, "#tunneled packets processed by the card");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
			    CTLFLAG_RD, &txq->cleaned,
			    0, "#tunneled packets cleaned");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
			    CTLFLAG_RD, &txq->in_use,
			    0, "#tunneled packet slots in use");
			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
			    CTLFLAG_RD, &txq->txq_frees,
			    "#tunneled packets freed");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
			    CTLFLAG_RD, &txq->txq_skipped,
			    0, "#tunneled packet descriptors skipped");
			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
			    CTLFLAG_RD, &txq->txq_coalesced,
			    "#tunneled packets coalesced");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
			    CTLFLAG_RD, &txq->txq_enqueued,
			    0, "#tunneled packets enqueued to hardware");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
			    CTLFLAG_RD, &qs->txq_stopped,
			    0, "tx queues stopped");
			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
			    CTLFLAG_RD, &txq->phys_addr,
			    "physical address of the queue");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
			    0, "txq generation");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
			    CTLFLAG_RD, &txq->cidx,
			    0, "hardware queue cidx");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
			    CTLFLAG_RD, &txq->pidx,
			    0, "hardware queue pidx");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
			    0, "txq start idx for dump");
			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
			    0, "txq #entries to dump");
			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");

			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
			    0, "ctrlq start idx for dump");
			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
			    0, "ctrl #entries to dump");
			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");

#ifdef LRO_SUPPORTED
			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
#endif
		}
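
		/*
		 * Usage note (illustrative only): the writable dump_start and
		 * dump_count knobs above pair with the read-only "qdump"
		 * nodes.  From userland, set the starting index and the
		 * number of entries, then read qdump to get a formatted dump
		 * of that slice of the ring.  The exact OID prefix depends on
		 * the parent nodes created at attach time, so the path below
		 * is only a placeholder:
		 *
		 *	sysctl <port-oid>.<qset-node>.<txq-node>.dump_start=0
		 *	sysctl <port-oid>.<qset-node>.<txq-node>.dump_count=32
		 *	sysctl <port-oid>.<qset-node>.<txq-node>.qdump
		 */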

		/* Now add a node for mac stats. */
		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
		    CTLFLAG_RD, NULL, "MAC statistics");
		poidlist = SYSCTL_CHILDREN(poid);

		/*
		 * We (ab)use the length argument (arg2) to pass on the offset
		 * of the data that we are interested in.  This is only required
		 * for the quad counters that are updated from the hardware (we
		 * make sure that we return the latest value).
		 * sysctl_handle_macstat first updates *all* the counters from
		 * the hardware, and then returns the latest value of the
		 * requested counter.  Best would be to update only the
		 * requested counter from hardware, but t3_mac_update_stats()
		 * hides all the register details and we don't want to dive into
		 * all that here.
		 */
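		/*
		 * Illustrative sketch only (the real handler is defined
		 * earlier in this file and may differ): a handler wired up
		 * through the macro below sees the port_info pointer in arg1
		 * and the field offset in arg2, refreshes the MAC counters,
		 * and hands the selected 64-bit value to sysctl_handle_quad().
		 * The locking macros and the mac/stats member names here are
		 * assumptions made for the sake of the example:
		 *
		 *	static int
		 *	example_macstat_handler(SYSCTL_HANDLER_ARGS)
		 *	{
		 *		struct port_info *p = arg1;
		 *		uint64_t *stat;
		 *
		 *		PORT_LOCK(p);
		 *		t3_mac_update_stats(&p->mac);
		 *		PORT_UNLOCK(p);
		 *		stat = (uint64_t *)((uintptr_t)&p->mac.stats + arg2);
		 *		return (sysctl_handle_quad(oidp, stat, 0, req));
		 *	}
		 */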
#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
    (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
    sysctl_handle_macstat, "QU", 0)
		CXGB_SYSCTL_ADD_QUAD(tx_octets);
		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
		CXGB_SYSCTL_ADD_QUAD(tx_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_pause);
		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
		CXGB_SYSCTL_ADD_QUAD(rx_octets);
		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
		CXGB_SYSCTL_ADD_QUAD(rx_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_pause);
		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_runt);
		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
		CXGB_SYSCTL_ADD_QUAD(rx_short);
		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
#undef CXGB_SYSCTL_ADD_QUAD

#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
    CTLFLAG_RD, &mstats->a, 0)
		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
		CXGB_SYSCTL_ADD_ULONG(num_toggled);
		CXGB_SYSCTL_ADD_ULONG(num_resets);
		CXGB_SYSCTL_ADD_ULONG(link_faults);
#undef CXGB_SYSCTL_ADD_ULONG
	}
}

/**
 *	t3_get_desc - dump an SGE descriptor for debugging purposes
 *	@qs: the queue set
 *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
 *	@idx: the descriptor index in the queue
 *	@data: where to dump the descriptor contents
 *
 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
 *	size of the descriptor.
 */
int
t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
		unsigned char *data)
{
	if (qnum >= 6)
		return (EINVAL);

	if (qnum < 3) {
		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
			return (EINVAL);
		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
		return (sizeof(struct tx_desc));
	}

	if (qnum == 3) {
		if (!qs->rspq.desc || idx >= qs->rspq.size)
			return (EINVAL);
		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
		return (sizeof(struct rsp_desc));
	}

	qnum -= 4;
	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
		return (EINVAL);
	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
	return (sizeof(struct rx_desc));
}

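/*
 * Illustrative (and compiled-out) example of calling t3_get_desc(): copy one
 * Ethernet Tx descriptor out of a queue set and log it as 32-bit words.  The
 * helper name and the log format below are assumptions, not part of the
 * driver.
 */
#if 0
static void
example_dump_tx_desc(const struct sge_qset *qs, unsigned int idx)
{
	unsigned char buf[sizeof(struct tx_desc)];
	uint32_t word;
	int i, len;

	/* qnum 0 selects the first Tx queue; see t3_get_desc() above. */
	len = t3_get_desc(qs, 0, idx, buf);
	if (len != (int)sizeof(struct tx_desc)) {
		log(LOG_DEBUG, "no descriptor at index %u\n", idx);
		return;
	}
	for (i = 0; i < len; i += sizeof(word)) {
		memcpy(&word, buf + i, sizeof(word));
		log(LOG_DEBUG, "txd[%u] +0x%02x: 0x%08x\n", idx, i, word);
	}
}
#endif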