cxgb_sge.c revision 204348
1/**************************************************************************
2
3Copyright (c) 2007-2009, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/cxgb_sge.c 204348 2010-02-26 07:08:44Z np $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/kernel.h>
36#include <sys/module.h>
37#include <sys/bus.h>
38#include <sys/conf.h>
39#include <machine/bus.h>
40#include <machine/resource.h>
41#include <sys/bus_dma.h>
42#include <sys/rman.h>
43#include <sys/queue.h>
44#include <sys/sysctl.h>
45#include <sys/taskqueue.h>
46
47#include <sys/proc.h>
48#include <sys/sbuf.h>
49#include <sys/sched.h>
50#include <sys/smp.h>
51#include <sys/systm.h>
52#include <sys/syslog.h>
53#include <sys/socket.h>
54
55#include <net/bpf.h>
56#include <net/ethernet.h>
57#include <net/if.h>
58#include <net/if_vlan_var.h>
59
60#include <netinet/in_systm.h>
61#include <netinet/in.h>
62#include <netinet/ip.h>
63#include <netinet/tcp.h>
64
65#include <dev/pci/pcireg.h>
66#include <dev/pci/pcivar.h>
67
68#include <vm/vm.h>
69#include <vm/pmap.h>
70
71#include <cxgb_include.h>
72#include <sys/mvec.h>
73
74int	txq_fills = 0;
75int	multiq_tx_enable = 1;
76
77extern struct sysctl_oid_list sysctl__hw_cxgb_children;
78int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
79TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
80SYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
81    "size of per-queue mbuf ring");
82
83static int cxgb_tx_coalesce_force = 0;
84TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
85SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
86    &cxgb_tx_coalesce_force, 0,
87    "coalesce small packets into a single work request regardless of ring state");
88
89#define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
90#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
91#define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
92#define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
93#define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
94#define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
95#define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
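/*
 * Illustrative values only, assuming TX_ETH_Q_SIZE is 1024 (the actual value
 * comes from the adapter headers and may differ):
 *	COALESCE_START_DEFAULT	512	(start coalescing at half full)
 *	COALESCE_START_MAX	896
 *	COALESCE_STOP_DEFAULT	256	(stop coalescing at a quarter full)
 *	COALESCE_STOP_MIN	32
 *	TX_RECLAIM_DEFAULT	32
 *	TX_RECLAIM_MAX		256
 *	TX_RECLAIM_MIN		16
 */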
96
97
98static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
99TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
100    &cxgb_tx_coalesce_enable_start);
101SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
102    &cxgb_tx_coalesce_enable_start, 0,
103    "coalesce enable threshold");
104static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
105TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
106SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
107    &cxgb_tx_coalesce_enable_stop, 0,
108    "coalesce disable threshold");
109static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
110TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
111SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
112    &cxgb_tx_reclaim_threshold, 0,
113    "tx cleaning minimum threshold");
114
115/*
116 * XXX don't re-enable this until TOE stops assuming
117 * we have an m_ext
118 */
119static int recycle_enable = 0;
120int cxgb_ext_freed = 0;
121int cxgb_ext_inited = 0;
122int fl_q_size = 0;
123int jumbo_q_size = 0;
124
125extern int cxgb_use_16k_clusters;
126extern int nmbjumbo4;
127extern int nmbjumbo9;
128extern int nmbjumbo16;
129
130#define USE_GTS 0
131
132#define SGE_RX_SM_BUF_SIZE	1536
133#define SGE_RX_DROP_THRES	16
134#define SGE_RX_COPY_THRES	128
135
136/*
137 * Period of the Tx buffer reclaim timer.  This timer does not need to run
138 * frequently as Tx buffers are usually reclaimed by new Tx packets.
139 */
140#define TX_RECLAIM_PERIOD       (hz >> 1)
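/*
 * For reference: TX_RECLAIM_PERIOD is hz/2 ticks, i.e. the reclaim timer
 * fires roughly every half second regardless of the configured hz.
 */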
141
142/*
143 * Values for sge_txq.flags
144 */
145enum {
146	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
147	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
148};
149
150struct tx_desc {
151	uint64_t	flit[TX_DESC_FLITS];
152} __packed;
153
154struct rx_desc {
155	uint32_t	addr_lo;
156	uint32_t	len_gen;
157	uint32_t	gen2;
158	uint32_t	addr_hi;
159} __packed;
160
161struct rsp_desc {               /* response queue descriptor */
162	struct rss_header	rss_hdr;
163	uint32_t		flags;
164	uint32_t		len_cq;
165	uint8_t			imm_data[47];
166	uint8_t			intr_gen;
167} __packed;
168
169#define RX_SW_DESC_MAP_CREATED	(1 << 0)
170#define TX_SW_DESC_MAP_CREATED	(1 << 1)
171#define RX_SW_DESC_INUSE        (1 << 3)
172#define TX_SW_DESC_MAPPED       (1 << 4)
173
174#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
175#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
176#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
177#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
178
179struct tx_sw_desc {                /* SW state per Tx descriptor */
180	struct mbuf	*m;
181	bus_dmamap_t	map;
182	int		flags;
183};
184
185struct rx_sw_desc {                /* SW state per Rx descriptor */
186	caddr_t		rxsd_cl;
187	struct mbuf	*m;
188	bus_dmamap_t	map;
189	int		flags;
190};
191
192struct txq_state {
193	unsigned int	compl;
194	unsigned int	gen;
195	unsigned int	pidx;
196};
197
198struct refill_fl_cb_arg {
199	int               error;
200	bus_dma_segment_t seg;
201	int               nseg;
202};
203
204
205/*
206 * Maps a number of flits to the number of Tx descriptors that can hold them.
207 * The formula is
208 *
209 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
210 *
211 * HW allows up to 4 descriptors to be combined into a WR.
212 */
213static uint8_t flit_desc_map[] = {
214	0,
215#if SGE_NUM_GENBITS == 1
216	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
218	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
219	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
220#elif SGE_NUM_GENBITS == 2
221	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
225#else
226# error "SGE_NUM_GENBITS must be 1 or 2"
227#endif
228};
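/*
 * Worked example (illustrative only): with SGE_NUM_GENBITS == 2 the table
 * above implies WR_FLITS == 15 (one flit of each descriptor is reserved for
 * the generation bits).  A work request of 16 flits then needs
 * 1 + (16 - 2) / (15 - 1) = 2 descriptors, matching flit_desc_map[16] == 2.
 */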
229
230#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
231#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
232#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
233#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
234#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
235#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
236	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
237#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
238#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
239	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
240#define	TXQ_RING_DEQUEUE(qs) \
241	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242
243int cxgb_debug = 0;
244
245static void sge_timer_cb(void *arg);
246static void sge_timer_reclaim(void *arg, int ncount);
247static void sge_txq_reclaim_handler(void *arg, int ncount);
248static void cxgb_start_locked(struct sge_qset *qs);
249
250/*
251 * XXX need to cope with bursty scheduling by looking at a wider
252 * window than we do now when determining the need for coalescing
253 *
254 */
255static __inline uint64_t
256check_pkt_coalesce(struct sge_qset *qs)
257{
258        struct adapter *sc;
259        struct sge_txq *txq;
260	uint8_t *fill;
261
262	if (__predict_false(cxgb_tx_coalesce_force))
263		return (1);
264	txq = &qs->txq[TXQ_ETH];
265        sc = qs->port->adapter;
266	fill = &sc->tunq_fill[qs->idx];
267
268	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
269		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
270	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
271		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
272	/*
273	 * If the hardware transmit queue fills past the coalesce enable-start
274	 * threshold we mark it as coalescing; we drop back out of coalescing
275	 * once the queue drains below the enable-stop threshold and there are
276	 * no packets left enqueued.  This provides some degree of hysteresis.
277	 */
278        if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
279	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
280                *fill = 0;
281        else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
282                *fill = 1;
283
284	return (sc->tunq_coalesce);
285}
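/*
 * Example of the hysteresis above (illustrative): once the hardware queue
 * fills past tx_coalesce_enable_start descriptors, tunq_fill[] is set for
 * this queue set; it is cleared again only after the queue drains below
 * tx_coalesce_enable_stop with nothing left in the mbuf ring.  Whether
 * dequeues actually coalesce is governed by sc->tunq_coalesce, which is
 * maintained elsewhere from this state.
 */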
286
287#ifdef __LP64__
288static void
289set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
290{
291	uint64_t wr_hilo;
292#if _BYTE_ORDER == _LITTLE_ENDIAN
293	wr_hilo = wr_hi;
294	wr_hilo |= (((uint64_t)wr_lo)<<32);
295#else
296	wr_hilo = wr_lo;
297	wr_hilo |= (((uint64_t)wr_hi)<<32);
298#endif
299	wrp->wrh_hilo = wr_hilo;
300}
301#else
302static void
303set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
304{
305
306	wrp->wrh_hi = wr_hi;
307	wmb();
308	wrp->wrh_lo = wr_lo;
309}
310#endif
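/*
 * Note on set_wr_hdr(): the two 32-bit halves of the WR header must not
 * become visible to the SGE out of order (the generation bit travels in the
 * low word).  On LP64 a single aligned 64-bit store suffices; on 32-bit
 * platforms the explicit wmb() orders the hi/lo stores instead.
 */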
311
312struct coalesce_info {
313	int count;
314	int nbytes;
315};
316
317static int
318coalesce_check(struct mbuf *m, void *arg)
319{
320	struct coalesce_info *ci = arg;
321	int *count = &ci->count;
322	int *nbytes = &ci->nbytes;
323
324	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
325		(*count < 7) && (m->m_next == NULL))) {
326		*count += 1;
327		*nbytes += m->m_len;
328		return (1);
329	}
330	return (0);
331}
332
333static struct mbuf *
334cxgb_dequeue(struct sge_qset *qs)
335{
336	struct mbuf *m, *m_head, *m_tail;
337	struct coalesce_info ci;
338
339
340	if (check_pkt_coalesce(qs) == 0)
341		return TXQ_RING_DEQUEUE(qs);
342
343	m_head = m_tail = NULL;
344	ci.count = ci.nbytes = 0;
345	do {
346		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
347		if (m_head == NULL) {
348			m_tail = m_head = m;
349		} else if (m != NULL) {
350			m_tail->m_nextpkt = m;
351			m_tail = m;
352		}
353	} while (m != NULL);
354	if (ci.count > 7)
355		panic("trying to coalesce %d packets into one WR", ci.count);
356	return (m_head);
357}
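/*
 * Sketch of the coalescing path (for reference): when check_pkt_coalesce()
 * reports coalescing, cxgb_dequeue() chains up to 7 single-mbuf packets
 * (at most ~10500 bytes total, per coalesce_check()) through m_nextpkt.
 * t3_encap() later emits the whole chain as one cpl_tx_pkt_batch work
 * request instead of one WR per packet.
 */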
358
359/**
360 *	reclaim_completed_tx - reclaims completed Tx descriptors
361 *	@qs: the queue set owning the Tx queue selected by @queue
362 *	@reclaim_min: don't reclaim unless at least this many descriptors are done
363 *
364 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
365 *	and frees the associated buffers if possible.  Called with the Tx
366 *	queue's lock held.
367 */
368static __inline int
369reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
370{
371	struct sge_txq *q = &qs->txq[queue];
372	int reclaim = desc_reclaimable(q);
373
374	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
375	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
376		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
377
378	if (reclaim < reclaim_min)
379		return (0);
380
381	mtx_assert(&qs->lock, MA_OWNED);
382	if (reclaim > 0) {
383		t3_free_tx_desc(qs, reclaim, queue);
384		q->cleaned += reclaim;
385		q->in_use -= reclaim;
386	}
387	if (isset(&qs->txq_stopped, TXQ_ETH))
388                clrbit(&qs->txq_stopped, TXQ_ETH);
389
390	return (reclaim);
391}
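/*
 * Usage note (for reference): the hot transmit paths pass
 * cxgb_tx_reclaim_threshold as reclaim_min, the timer/task handlers pass a
 * small constant (16), and the flush/timeout path passes 0 to force an
 * attempt regardless of how few descriptors are pending.
 */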
392
393/**
394 *	should_restart_tx - are there enough resources to restart a Tx queue?
395 *	@q: the Tx queue
396 *
397 *	Checks if there are enough descriptors to restart a suspended Tx queue.
398 */
399static __inline int
400should_restart_tx(const struct sge_txq *q)
401{
402	unsigned int r = q->processed - q->cleaned;
403
404	return q->in_use - r < (q->size >> 1);
405}
406
407/**
408 *	t3_sge_init - initialize SGE
409 *	@adap: the adapter
410 *	@p: the SGE parameters
411 *
412 *	Performs SGE initialization needed every time after a chip reset.
413 *	We do not initialize any of the queue sets here; instead the driver
414 *	top-level must request those individually.  We also do not enable DMA
415 *	here, that should be done after the queues have been set up.
416 */
417void
418t3_sge_init(adapter_t *adap, struct sge_params *p)
419{
420	u_int ctrl, ups;
421
422	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
423
424	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
425	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
426	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
427	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
428#if SGE_NUM_GENBITS == 1
429	ctrl |= F_EGRGENCTRL;
430#endif
431	if (adap->params.rev > 0) {
432		if (!(adap->flags & (USING_MSIX | USING_MSI)))
433			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
434	}
435	t3_write_reg(adap, A_SG_CONTROL, ctrl);
436	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
437		     V_LORCQDRBTHRSH(512));
438	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
439	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
440		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
441	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
442		     adap->params.rev < T3_REV_C ? 1000 : 500);
443	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
444	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
445	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
446	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
447	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
448}
449
450
451/**
452 *	sgl_len - calculates the size of an SGL of the given capacity
453 *	@n: the number of SGL entries
454 *
455 *	Calculates the number of flits needed for a scatter/gather list that
456 *	can hold the given number of entries.
457 */
458static __inline unsigned int
459sgl_len(unsigned int n)
460{
461	return ((3 * n) / 2 + (n & 1));
462}
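/*
 * Worked example (illustrative): each SGL entry is an 8-byte address plus a
 * 4-byte length, and entries are packed in pairs of 3 flits (24 bytes).
 * Hence sgl_len(1) == 2, sgl_len(2) == 3, sgl_len(3) == 5, and in general
 * sgl_len(n) is 3n/2 rounded up.
 */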
463
464/**
465 *	get_imm_packet - return the next ingress packet buffer from a response
466 *	@resp: the response descriptor containing the packet data
467 *
468 *	Return a packet containing the immediate data of the given response.
469 */
470static int
471get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
472{
473
474	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
475	m->m_ext.ext_buf = NULL;
476	m->m_ext.ext_type = 0;
477	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
478	return (0);
479}
480
481static __inline u_int
482flits_to_desc(u_int n)
483{
484	return (flit_desc_map[n]);
485}
486
487#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
488		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
489		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
490		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
491		    F_HIRCQPARITYERROR)
492#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
493#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
494		      F_RSPQDISABLED)
495
496/**
497 *	t3_sge_err_intr_handler - SGE async event interrupt handler
498 *	@adapter: the adapter
499 *
500 *	Interrupt handler for SGE asynchronous (non-data) events.
501 */
502void
503t3_sge_err_intr_handler(adapter_t *adapter)
504{
505	unsigned int v, status;
506
507	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
508	if (status & SGE_PARERR)
509		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
510			 status & SGE_PARERR);
511	if (status & SGE_FRAMINGERR)
512		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
513			 status & SGE_FRAMINGERR);
514	if (status & F_RSPQCREDITOVERFOW)
515		CH_ALERT(adapter, "SGE response queue credit overflow\n");
516
517	if (status & F_RSPQDISABLED) {
518		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
519
520		CH_ALERT(adapter,
521			 "packet delivered to disabled response queue (0x%x)\n",
522			 (v >> S_RSPQ0DISABLED) & 0xff);
523	}
524
525	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
526	if (status & SGE_FATALERR)
527		t3_fatal_err(adapter);
528}
529
530void
531t3_sge_prep(adapter_t *adap, struct sge_params *p)
532{
533	int i, nqsets;
534
535	nqsets = min(SGE_QSETS, mp_ncpus*4);
536
537	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
538
539	while (!powerof2(fl_q_size))
540		fl_q_size--;
541#if __FreeBSD_version >= 700111
542	if (cxgb_use_16k_clusters)
543		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
544	else
545		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
546#else
547	jumbo_q_size = min(nmbjumbo4/(3*nqsets), JUMBO_Q_SIZE);
548#endif
549	while (!powerof2(jumbo_q_size))
550		jumbo_q_size--;
551
552	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
553		device_printf(adap->dev,
554		    "Insufficient clusters and/or jumbo buffers.\n");
555
556	/* XXX Does ETHER_ALIGN need to be accounted for here? */
557	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
558
559	for (i = 0; i < SGE_QSETS; ++i) {
560		struct qset_params *q = p->qset + i;
561
562		if (adap->params.nports > 2) {
563			q->coalesce_usecs = 50;
564		} else {
565#ifdef INVARIANTS
566			q->coalesce_usecs = 10;
567#else
568			q->coalesce_usecs = 5;
569#endif
570		}
571		q->polling = 0;
572		q->rspq_size = RSPQ_Q_SIZE;
573		q->fl_size = fl_q_size;
574		q->jumbo_size = jumbo_q_size;
575		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
576		q->txq_size[TXQ_OFLD] = 1024;
577		q->txq_size[TXQ_CTRL] = 256;
578		q->cong_thres = 0;
579	}
580}
581
582int
583t3_sge_alloc(adapter_t *sc)
584{
585
586	/* The parent tag. */
587	if (bus_dma_tag_create( NULL,			/* parent */
588				1, 0,			/* algnmnt, boundary */
589				BUS_SPACE_MAXADDR,	/* lowaddr */
590				BUS_SPACE_MAXADDR,	/* highaddr */
591				NULL, NULL,		/* filter, filterarg */
592				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
593				BUS_SPACE_UNRESTRICTED, /* nsegments */
594				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
595				0,			/* flags */
596				NULL, NULL,		/* lock, lockarg */
597				&sc->parent_dmat)) {
598		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
599		return (ENOMEM);
600	}
601
602	/*
603	 * DMA tag for normal sized RX frames
604	 */
605	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
606		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
607		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
608		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
609		return (ENOMEM);
610	}
611
612	/*
613	 * DMA tag for jumbo sized RX frames.
614	 */
615	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
616		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
617		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
618		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
619		return (ENOMEM);
620	}
621
622	/*
623	 * DMA tag for TX frames.
624	 */
625	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
626		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
627		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
628		NULL, NULL, &sc->tx_dmat)) {
629		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
630		return (ENOMEM);
631	}
632
633	return (0);
634}
635
636int
637t3_sge_free(struct adapter * sc)
638{
639
640	if (sc->tx_dmat != NULL)
641		bus_dma_tag_destroy(sc->tx_dmat);
642
643	if (sc->rx_jumbo_dmat != NULL)
644		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
645
646	if (sc->rx_dmat != NULL)
647		bus_dma_tag_destroy(sc->rx_dmat);
648
649	if (sc->parent_dmat != NULL)
650		bus_dma_tag_destroy(sc->parent_dmat);
651
652	return (0);
653}
654
655void
656t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
657{
658
659	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
660	qs->rspq.polling = 0 /* p->polling */;
661}
662
663#if !defined(__i386__) && !defined(__amd64__)
664static void
665refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
666{
667	struct refill_fl_cb_arg *cb_arg = arg;
668
669	cb_arg->error = error;
670	cb_arg->seg = segs[0];
671	cb_arg->nseg = nseg;
672
673}
674#endif
675/**
676 *	refill_fl - refill an SGE free-buffer list
677 *	@sc: the controller softc
678 *	@q: the free-list to refill
679 *	@n: the number of new buffers to allocate
680 *
681 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
682 *	The caller must ensure that @n does not exceed the queue's capacity.
683 */
684static void
685refill_fl(adapter_t *sc, struct sge_fl *q, int n)
686{
687	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
688	struct rx_desc *d = &q->desc[q->pidx];
689	struct refill_fl_cb_arg cb_arg;
690	struct mbuf *m;
691	caddr_t cl;
692	int err, count = 0;
693
694	cb_arg.error = 0;
695	while (n--) {
696		/*
697		 * We only allocate a cluster here; mbuf allocation happens after rx
698		 */
699		if (q->zone == zone_pack) {
700			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
701				break;
702			cl = m->m_ext.ext_buf;
703		} else {
704			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
705				break;
706			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
707				uma_zfree(q->zone, cl);
708				break;
709			}
710		}
711		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
712			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
713				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
714				uma_zfree(q->zone, cl);
715				goto done;
716			}
717			sd->flags |= RX_SW_DESC_MAP_CREATED;
718		}
719#if !defined(__i386__) && !defined(__amd64__)
720		err = bus_dmamap_load(q->entry_tag, sd->map,
721		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
722
723		if (err != 0 || cb_arg.error) {
724			if (q->zone == zone_pack)
725				uma_zfree(q->zone, cl);
726			m_free(m);
727			goto done;
728		}
729#else
730		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
731#endif
732		sd->flags |= RX_SW_DESC_INUSE;
733		sd->rxsd_cl = cl;
734		sd->m = m;
735		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
736		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
737		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
738		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
739
740		d++;
741		sd++;
742
743		if (++q->pidx == q->size) {
744			q->pidx = 0;
745			q->gen ^= 1;
746			sd = q->sdesc;
747			d = q->desc;
748		}
749		q->credits++;
750		count++;
751	}
752
753done:
754	if (count)
755		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
756}
757
758
759/**
760 *	free_rx_bufs - free the Rx buffers on an SGE free list
761 *	@sc: the controller softc
762 *	@q: the SGE free list to clean up
763 *
764 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
765 *	this queue should be stopped before calling this function.
766 */
767static void
768free_rx_bufs(adapter_t *sc, struct sge_fl *q)
769{
770	u_int cidx = q->cidx;
771
772	while (q->credits--) {
773		struct rx_sw_desc *d = &q->sdesc[cidx];
774
775		if (d->flags & RX_SW_DESC_INUSE) {
776			bus_dmamap_unload(q->entry_tag, d->map);
777			bus_dmamap_destroy(q->entry_tag, d->map);
778			if (q->zone == zone_pack) {
779				m_init(d->m, zone_pack, MCLBYTES,
780				    M_NOWAIT, MT_DATA, M_EXT);
781				uma_zfree(zone_pack, d->m);
782			} else {
783				m_init(d->m, zone_mbuf, MLEN,
784				    M_NOWAIT, MT_DATA, 0);
785				uma_zfree(zone_mbuf, d->m);
786				uma_zfree(q->zone, d->rxsd_cl);
787			}
788		}
789
790		d->rxsd_cl = NULL;
791		d->m = NULL;
792		if (++cidx == q->size)
793			cidx = 0;
794	}
795}
796
797static __inline void
798__refill_fl(adapter_t *adap, struct sge_fl *fl)
799{
800	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
801}
802
803static __inline void
804__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
805{
806	if ((fl->size - fl->credits) < max)
807		refill_fl(adap, fl, min(max, fl->size - fl->credits));
808}
809
810/**
811 *	recycle_rx_buf - recycle a receive buffer
812 *	@adapter: the adapter
813 *	@q: the SGE free list
814 *	@idx: index of buffer to recycle
815 *
816 *	Recycles the specified buffer on the given free list by adding it at
817 *	the next available slot on the list.
818 */
819static void
820recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
821{
822	struct rx_desc *from = &q->desc[idx];
823	struct rx_desc *to   = &q->desc[q->pidx];
824
825	q->sdesc[q->pidx] = q->sdesc[idx];
826	to->addr_lo = from->addr_lo;        // already big endian
827	to->addr_hi = from->addr_hi;        // likewise
828	wmb();	/* necessary ? */
829	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
830	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
831	q->credits++;
832
833	if (++q->pidx == q->size) {
834		q->pidx = 0;
835		q->gen ^= 1;
836	}
837	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
838}
839
840static void
841alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
842{
843	uint32_t *addr;
844
845	addr = arg;
846	*addr = segs[0].ds_addr;
847}
848
849static int
850alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
851    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
852    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
853{
854	size_t len = nelem * elem_size;
855	void *s = NULL;
856	void *p = NULL;
857	int err;
858
859	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
860				      BUS_SPACE_MAXADDR_32BIT,
861				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
862				      len, 0, NULL, NULL, tag)) != 0) {
863		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
864		return (ENOMEM);
865	}
866
867	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
868				    map)) != 0) {
869		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
870		return (ENOMEM);
871	}
872
873	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
874	bzero(p, len);
875	*(void **)desc = p;
876
877	if (sw_size) {
878		len = nelem * sw_size;
879		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
880		*(void **)sdesc = s;
881	}
882	if (parent_entry_tag == NULL)
883		return (0);
884
885	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
886				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
887		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
888				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
889		                      NULL, NULL, entry_tag)) != 0) {
890		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
891		return (ENOMEM);
892	}
893	return (0);
894}
895
896static void
897sge_slow_intr_handler(void *arg, int ncount)
898{
899	adapter_t *sc = arg;
900
901	t3_slow_intr_handler(sc);
902}
903
904/**
905 *	sge_timer_cb - perform periodic maintenance of the adapter's SGE queue sets
906 *	@arg: the adapter to maintain
907 *
908 *	Runs periodically from a timer to perform maintenance of the SGE queue
909 *	sets.  It performs the following tasks:
910 *
911 *	a) Cleans up any completed Tx descriptors that may still be pending.
912 *	Normal descriptor cleanup happens when new packets are added to a Tx
913 *	queue so this timer is relatively infrequent and does any cleanup only
914 *	if the Tx queue has not seen any new packets in a while.  We make a
915 *	best effort attempt to reclaim descriptors, in that we don't wait
916 *	around if we cannot get a queue's lock (which most likely is because
917 *	someone else is queueing new packets and so will also handle the clean
918 *	up).  Since control queues use immediate data exclusively we don't
919 *	bother cleaning them up here.
920 *
921 *	b) Replenishes Rx queues that have run out due to memory shortage.
922 *	Normally new Rx buffers are added when existing ones are consumed but
923 *	when out of memory a queue can become empty.  We try to add only a few
924 *	buffers here, the queue will be replenished fully as these new buffers
925 *	are used up if memory shortage has subsided.
926 *
927 *	c) Return coalesced response queue credits in case a response queue is
928 *	starved.
929 *
930 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
931 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
932 */
933static void
934sge_timer_cb(void *arg)
935{
936	adapter_t *sc = arg;
937	if ((sc->flags & USING_MSIX) == 0) {
938
939		struct port_info *pi;
940		struct sge_qset *qs;
941		struct sge_txq  *txq;
942		int i, j;
943		int reclaim_ofl, refill_rx;
944
945		if (sc->open_device_map == 0)
946			return;
947
948		for (i = 0; i < sc->params.nports; i++) {
949			pi = &sc->port[i];
950			for (j = 0; j < pi->nqsets; j++) {
951				qs = &sc->sge.qs[pi->first_qset + j];
952				txq = &qs->txq[0];
953				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
954				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
955				    (qs->fl[1].credits < qs->fl[1].size));
956				if (reclaim_ofl || refill_rx) {
957					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
958					break;
959				}
960			}
961		}
962	}
963
964	if (sc->params.nports > 2) {
965		int i;
966
967		for_each_port(sc, i) {
968			struct port_info *pi = &sc->port[i];
969
970			t3_write_reg(sc, A_SG_KDOORBELL,
971				     F_SELEGRCNTX |
972				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
973		}
974	}
975	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
976	    sc->open_device_map != 0)
977		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
978}
979
980/*
981 * This is meant to be a catch-all function to keep sge state private
982 * to sge.c
983 *
984 */
985int
986t3_sge_init_adapter(adapter_t *sc)
987{
988	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
989	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
990	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
991	return (0);
992}
993
994int
995t3_sge_reset_adapter(adapter_t *sc)
996{
997	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
998	return (0);
999}
1000
1001int
1002t3_sge_init_port(struct port_info *pi)
1003{
1004	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1005	return (0);
1006}
1007
1008/**
1009 *	refill_rspq - replenish an SGE response queue
1010 *	@adapter: the adapter
1011 *	@q: the response queue to replenish
1012 *	@credits: how many new responses to make available
1013 *
1014 *	Replenishes a response queue by making the supplied number of responses
1015 *	available to HW.
1016 */
1017static __inline void
1018refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1019{
1020
1021	/* mbufs are allocated on demand when a rspq entry is processed. */
1022	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1023		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1024}
1025
1026static void
1027sge_txq_reclaim_handler(void *arg, int ncount)
1028{
1029	struct sge_qset *qs = arg;
1030	int i;
1031
1032	for (i = 0; i < 3; i++)
1033		reclaim_completed_tx(qs, 16, i);
1034}
1035
1036static void
1037sge_timer_reclaim(void *arg, int ncount)
1038{
1039	struct port_info *pi = arg;
1040	int i, nqsets = pi->nqsets;
1041	adapter_t *sc = pi->adapter;
1042	struct sge_qset *qs;
1043	struct mtx *lock;
1044
1045	KASSERT((sc->flags & USING_MSIX) == 0,
1046	    ("can't call timer reclaim for msi-x"));
1047
1048	for (i = 0; i < nqsets; i++) {
1049		qs = &sc->sge.qs[pi->first_qset + i];
1050
1051		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1052		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1053			    &sc->sge.qs[0].rspq.lock;
1054
1055		if (mtx_trylock(lock)) {
1056			/* XXX currently assume that we are *NOT* polling */
1057			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1058
1059			if (qs->fl[0].credits < qs->fl[0].size - 16)
1060				__refill_fl(sc, &qs->fl[0]);
1061			if (qs->fl[1].credits < qs->fl[1].size - 16)
1062				__refill_fl(sc, &qs->fl[1]);
1063
1064			if (status & (1 << qs->rspq.cntxt_id)) {
1065				if (qs->rspq.credits) {
1066					refill_rspq(sc, &qs->rspq, 1);
1067					qs->rspq.credits--;
1068					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1069					    1 << qs->rspq.cntxt_id);
1070				}
1071			}
1072			mtx_unlock(lock);
1073		}
1074	}
1075}
1076
1077/**
1078 *	init_qset_cntxt - initialize an SGE queue set context info
1079 *	@qs: the queue set
1080 *	@id: the queue set id
1081 *
1082 *	Initializes the TIDs and context ids for the queues of a queue set.
1083 */
1084static void
1085init_qset_cntxt(struct sge_qset *qs, u_int id)
1086{
1087
1088	qs->rspq.cntxt_id = id;
1089	qs->fl[0].cntxt_id = 2 * id;
1090	qs->fl[1].cntxt_id = 2 * id + 1;
1091	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1092	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1093	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1094	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1095	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1096
1097	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1098	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1099	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1100}
1101
1102
1103static void
1104txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1105{
1106	txq->in_use += ndesc;
1107	/*
1108	 * XXX we don't handle stopping of the queue here;
1109	 * presumably the start routine handles this when we bump against the end
1110	 */
1111	txqs->gen = txq->gen;
1112	txq->unacked += ndesc;
1113	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1114	txq->unacked &= 31;
1115	txqs->pidx = txq->pidx;
1116	txq->pidx += ndesc;
1117#ifdef INVARIANTS
1118	if (((txqs->pidx > txq->cidx) &&
1119		(txq->pidx < txqs->pidx) &&
1120		(txq->pidx >= txq->cidx)) ||
1121	    ((txqs->pidx < txq->cidx) &&
1122		(txq->pidx >= txq-> cidx)) ||
1123	    ((txqs->pidx < txq->cidx) &&
1124		(txq->cidx < txqs->pidx)))
1125		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1126		    txqs->pidx, txq->pidx, txq->cidx);
1127#endif
1128	if (txq->pidx >= txq->size) {
1129		txq->pidx -= txq->size;
1130		txq->gen ^= 1;
1131	}
1132
1133}
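/*
 * Note (for reference): the unacked/compl logic above requests a WR
 * completion roughly once every 32 descriptors; txqs->compl is non-zero
 * whenever the running descriptor count crosses a multiple of 32, and the
 * counter is then folded back into the 0-31 range.
 */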
1134
1135/**
1136 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1137 *	@m: the packet mbufs
1138 *      @nsegs: the number of segments
1139 *
1140 * 	Returns the number of Tx descriptors needed for the given Ethernet
1141 * 	packet.  Ethernet packets require addition of WR and CPL headers.
1142 */
1143static __inline unsigned int
1144calc_tx_descs(const struct mbuf *m, int nsegs)
1145{
1146	unsigned int flits;
1147
1148	if (m->m_pkthdr.len <= PIO_LEN)
1149		return 1;
1150
1151	flits = sgl_len(nsegs) + 2;
1152	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1153		flits++;
1154
1155	return flits_to_desc(flits);
1156}
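/*
 * Worked example (illustrative): a TSO packet of 4 DMA segments needs
 * sgl_len(4) + 2 + 1 = 9 flits, and flits_to_desc(9) == 1, so it fits in a
 * single Tx descriptor; packets no larger than PIO_LEN skip the SGL
 * entirely and are written as immediate data.
 */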
1157
1158static unsigned int
1159busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1160    struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1161{
1162	struct mbuf *m0;
1163	int err, pktlen, pass = 0;
1164	bus_dma_tag_t tag = txq->entry_tag;
1165
1166retry:
1167	err = 0;
1168	m0 = *m;
1169	pktlen = m0->m_pkthdr.len;
1170#if defined(__i386__) || defined(__amd64__)
1171	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1172		goto done;
1173	} else
1174#endif
1175		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1176
1177	if (err == 0) {
1178		goto done;
1179	}
1180	if (err == EFBIG && pass == 0) {
1181		pass = 1;
1182		/* Too many segments, try to defrag */
1183		m0 = m_defrag(m0, M_DONTWAIT);
1184		if (m0 == NULL) {
1185			m_freem(*m);
1186			*m = NULL;
1187			return (ENOBUFS);
1188		}
1189		*m = m0;
1190		goto retry;
1191	} else if (err == ENOMEM) {
1192		return (err);
1193	} else if (err) {
1194		if (cxgb_debug)
1195			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1196		m_freem(m0);
1197		*m = NULL;
1198		return (err);
1199	}
1200done:
1201#if !defined(__i386__) && !defined(__amd64__)
1202	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1203#endif
1204	txsd->flags |= TX_SW_DESC_MAPPED;
1205
1206	return (0);
1207}
1208
1209/**
1210 *	make_sgl - populate a scatter/gather list for a packet
1211 *	@sgp: the SGL to populate
1212 *	@segs: the packet dma segments
1213 *	@nsegs: the number of segments
1214 *
1215 *	Generates a scatter/gather list for the buffers that make up a packet.
1216 *	The SGL is written in 8-byte words (flits); the caller must size it
1217 *	appropriately (see sgl_len()).
1218 */
1219static __inline void
1220make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1221{
1222	int i, idx;
1223
1224	for (idx = 0, i = 0; i < nsegs; i++) {
1225		/*
1226		 * firmware doesn't like empty segments
1227		 */
1228		if (segs[i].ds_len == 0)
1229			continue;
1230		if (i && idx == 0)
1231			++sgp;
1232
1233		sgp->len[idx] = htobe32(segs[i].ds_len);
1234		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1235		idx ^= 1;
1236	}
1237
1238	if (idx) {
1239		sgp->len[idx] = 0;
1240		sgp->addr[idx] = 0;
1241	}
1242}
1243
1244/**
1245 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1246 *	@adap: the adapter
1247 *	@q: the Tx queue
1248 *
1249 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1250 *	in which the HW goes to sleep just after we check; in that case the
1251 *	interrupt handler will detect the outstanding TX packet and ring the
1252 *	doorbell for us.
1253 *
1254 *	When GTS is disabled we unconditionally ring the doorbell.
1255 */
1256static __inline void
1257check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1258{
1259#if USE_GTS
1260	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1261	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1262		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1263#ifdef T3_TRACE
1264		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1265			  q->cntxt_id);
1266#endif
1267		t3_write_reg(adap, A_SG_KDOORBELL,
1268			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1269	}
1270#else
1271	wmb();            /* write descriptors before telling HW */
1272	t3_write_reg(adap, A_SG_KDOORBELL,
1273		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1274#endif
1275}
1276
1277static __inline void
1278wr_gen2(struct tx_desc *d, unsigned int gen)
1279{
1280#if SGE_NUM_GENBITS == 2
1281	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1282#endif
1283}
1284
1285/**
1286 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1287 *	@ndesc: number of Tx descriptors spanned by the SGL
1288 *	@txd: first Tx descriptor to be written
1289 *	@txqs: txq state (generation and producer index)
1290 *	@txq: the SGE Tx queue
1291 *	@sgl: the SGL
1292 *	@flits: number of flits to the start of the SGL in the first descriptor
1293 *	@sgl_flits: the SGL size in flits
1294 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1295 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1296 *
1297 *	Write a work request header and an associated SGL.  If the SGL is
1298 *	small enough to fit into one Tx descriptor it has already been written
1299 *	and we just need to write the WR header.  Otherwise we distribute the
1300 *	SGL across the number of descriptors it spans.
1301 */
1302static void
1303write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1304    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1305    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1306{
1307
1308	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1309	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1310
1311	if (__predict_true(ndesc == 1)) {
1312		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1313			V_WR_SGLSFLT(flits)) | wr_hi,
1314		    htonl(V_WR_LEN(flits + sgl_flits) |
1315			V_WR_GEN(txqs->gen)) | wr_lo);
1316		/* XXX gen? */
1317		wr_gen2(txd, txqs->gen);
1318
1319	} else {
1320		unsigned int ogen = txqs->gen;
1321		const uint64_t *fp = (const uint64_t *)sgl;
1322		struct work_request_hdr *wp = wrp;
1323
1324		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1325		    V_WR_SGLSFLT(flits)) | wr_hi;
1326
1327		while (sgl_flits) {
1328			unsigned int avail = WR_FLITS - flits;
1329
1330			if (avail > sgl_flits)
1331				avail = sgl_flits;
1332			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1333			sgl_flits -= avail;
1334			ndesc--;
1335			if (!sgl_flits)
1336				break;
1337
1338			fp += avail;
1339			txd++;
1340			txsd++;
1341			if (++txqs->pidx == txq->size) {
1342				txqs->pidx = 0;
1343				txqs->gen ^= 1;
1344				txd = txq->desc;
1345				txsd = txq->sdesc;
1346			}
1347
1348			/*
1349			 * when the head of the mbuf chain
1350			 * is freed all clusters will be freed
1351			 * with it
1352			 */
1353			wrp = (struct work_request_hdr *)txd;
1354			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1355			    V_WR_SGLSFLT(1)) | wr_hi;
1356			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1357				    sgl_flits + 1)) |
1358			    V_WR_GEN(txqs->gen)) | wr_lo;
1359			wr_gen2(txd, txqs->gen);
1360			flits = 1;
1361		}
1362		wrp->wrh_hi |= htonl(F_WR_EOP);
1363		wmb();
1364		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1365		wr_gen2((struct tx_desc *)wp, ogen);
1366	}
1367}
1368
1369/* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1370#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1371
1372#define GET_VTAG(cntrl, m) \
1373do { \
1374	if ((m)->m_flags & M_VLANTAG)					            \
1375		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1376} while (0)
1377
1378static int
1379t3_encap(struct sge_qset *qs, struct mbuf **m)
1380{
1381	adapter_t *sc;
1382	struct mbuf *m0;
1383	struct sge_txq *txq;
1384	struct txq_state txqs;
1385	struct port_info *pi;
1386	unsigned int ndesc, flits, cntrl, mlen;
1387	int err, nsegs, tso_info = 0;
1388
1389	struct work_request_hdr *wrp;
1390	struct tx_sw_desc *txsd;
1391	struct sg_ent *sgp, *sgl;
1392	uint32_t wr_hi, wr_lo, sgl_flits;
1393	bus_dma_segment_t segs[TX_MAX_SEGS];
1394
1395	struct tx_desc *txd;
1396
1397	pi = qs->port;
1398	sc = pi->adapter;
1399	txq = &qs->txq[TXQ_ETH];
1400	txd = &txq->desc[txq->pidx];
1401	txsd = &txq->sdesc[txq->pidx];
1402	sgl = txq->txq_sgl;
1403
1404	prefetch(txd);
1405	m0 = *m;
1406
1407	mtx_assert(&qs->lock, MA_OWNED);
1408	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1409	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1410
1411	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1412	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1413		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1414
1415	if (m0->m_nextpkt != NULL) {
1416		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1417		ndesc = 1;
1418		mlen = 0;
1419	} else {
1420		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1421		    &m0, segs, &nsegs))) {
1422			if (cxgb_debug)
1423				printf("failed ... err=%d\n", err);
1424			return (err);
1425		}
1426		mlen = m0->m_pkthdr.len;
1427		ndesc = calc_tx_descs(m0, nsegs);
1428	}
1429	txq_prod(txq, ndesc, &txqs);
1430
1431	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1432	txsd->m = m0;
1433
1434	if (m0->m_nextpkt != NULL) {
1435		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1436		int i, fidx;
1437
1438		if (nsegs > 7)
1439			panic("trying to coalesce %d packets into one WR", nsegs);
1440		txq->txq_coalesced += nsegs;
1441		wrp = (struct work_request_hdr *)txd;
1442		flits = nsegs*2 + 1;
1443
1444		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1445			struct cpl_tx_pkt_batch_entry *cbe;
1446			uint64_t flit;
1447			uint32_t *hflit = (uint32_t *)&flit;
1448			int cflags = m0->m_pkthdr.csum_flags;
1449
1450			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1451			GET_VTAG(cntrl, m0);
1452			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1453			if (__predict_false(!(cflags & CSUM_IP)))
1454				cntrl |= F_TXPKT_IPCSUM_DIS;
1455			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1456				cntrl |= F_TXPKT_L4CSUM_DIS;
1457
1458			hflit[0] = htonl(cntrl);
1459			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1460			flit |= htobe64(1 << 24);
1461			cbe = &cpl_batch->pkt_entry[i];
1462			cbe->cntrl = hflit[0];
1463			cbe->len = hflit[1];
1464			cbe->addr = htobe64(segs[i].ds_addr);
1465		}
1466
1467		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1468		    V_WR_SGLSFLT(flits)) |
1469		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1470		wr_lo = htonl(V_WR_LEN(flits) |
1471		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1472		set_wr_hdr(wrp, wr_hi, wr_lo);
1473		wmb();
1474		ETHER_BPF_MTAP(pi->ifp, m0);
1475		wr_gen2(txd, txqs.gen);
1476		check_ring_tx_db(sc, txq);
1477		return (0);
1478	} else if (tso_info) {
1479		int eth_type;
1480		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1481		struct ether_header *eh;
1482		struct ip *ip;
1483		struct tcphdr *tcp;
1484
1485		txd->flit[2] = 0;
1486		GET_VTAG(cntrl, m0);
1487		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1488		hdr->cntrl = htonl(cntrl);
1489		hdr->len = htonl(mlen | 0x80000000);
1490
1491		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1492			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1493			    m0, mlen, m0->m_pkthdr.tso_segsz,
1494			    m0->m_pkthdr.csum_flags, m0->m_flags);
1495			panic("tx tso packet too small");
1496		}
1497
1498		/* Make sure that ether, ip, tcp headers are all in m0 */
1499		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1500			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1501			if (__predict_false(m0 == NULL)) {
1502				/* XXX panic probably an overreaction */
1503				panic("couldn't fit header into mbuf");
1504			}
1505		}
1506
1507		eh = mtod(m0, struct ether_header *);
1508		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
1509			eth_type = CPL_ETH_II_VLAN;
1510			ip = (struct ip *)((struct ether_vlan_header *)eh + 1);
1511		} else {
1512			eth_type = CPL_ETH_II;
1513			ip = (struct ip *)(eh + 1);
1514		}
1515		tcp = (struct tcphdr *)(ip + 1);
1516
1517		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1518			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1519			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1520		hdr->lso_info = htonl(tso_info);
1521
1522		if (__predict_false(mlen <= PIO_LEN)) {
1523			/*
1524			 * The packet is not undersized, but it still fits in PIO_LEN;
1525			 * this indicates a TSO bug at the higher levels.
1526			 */
1527			txsd->m = NULL;
1528			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1529			flits = (mlen + 7) / 8 + 3;
1530			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1531					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1532					  F_WR_SOP | F_WR_EOP | txqs.compl);
1533			wr_lo = htonl(V_WR_LEN(flits) |
1534			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1535			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1536			wmb();
1537			ETHER_BPF_MTAP(pi->ifp, m0);
1538			wr_gen2(txd, txqs.gen);
1539			check_ring_tx_db(sc, txq);
1540			m_freem(m0);
1541			return (0);
1542		}
1543		flits = 3;
1544	} else {
1545		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1546
1547		GET_VTAG(cntrl, m0);
1548		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1549		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1550			cntrl |= F_TXPKT_IPCSUM_DIS;
1551		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1552			cntrl |= F_TXPKT_L4CSUM_DIS;
1553		cpl->cntrl = htonl(cntrl);
1554		cpl->len = htonl(mlen | 0x80000000);
1555
1556		if (mlen <= PIO_LEN) {
1557			txsd->m = NULL;
1558			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1559			flits = (mlen + 7) / 8 + 2;
1560
1561			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1562			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1563					  F_WR_SOP | F_WR_EOP | txqs.compl);
1564			wr_lo = htonl(V_WR_LEN(flits) |
1565			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1566			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1567			wmb();
1568			ETHER_BPF_MTAP(pi->ifp, m0);
1569			wr_gen2(txd, txqs.gen);
1570			check_ring_tx_db(sc, txq);
1571			m_freem(m0);
1572			return (0);
1573		}
1574		flits = 2;
1575	}
1576	wrp = (struct work_request_hdr *)txd;
1577	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1578	make_sgl(sgp, segs, nsegs);
1579
1580	sgl_flits = sgl_len(nsegs);
1581
1582	ETHER_BPF_MTAP(pi->ifp, m0);
1583
1584	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1585	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1586	wr_lo = htonl(V_WR_TID(txq->token));
1587	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1588	    sgl_flits, wr_hi, wr_lo);
1589	check_ring_tx_db(sc, txq);
1590
1591	return (0);
1592}
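/*
 * For reference, t3_encap() above chooses between three transmit formats:
 * a chain linked through m_nextpkt becomes a single CPL_TX_PKT_BATCH work
 * request (up to 7 packets); a TSO packet uses a CPL_TX_PKT_LSO header
 * (3 header flits); everything else uses a plain CPL_TX_PKT header
 * (2 header flits).  In the last two cases payloads of at most PIO_LEN
 * bytes are copied into the descriptor as immediate data; larger ones are
 * described by an SGL written via write_wr_hdr_sgl().
 */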
1593
1594void
1595cxgb_tx_watchdog(void *arg)
1596{
1597	struct sge_qset *qs = arg;
1598	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1599
1600        if (qs->coalescing != 0 &&
1601	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1602	    TXQ_RING_EMPTY(qs))
1603                qs->coalescing = 0;
1604        else if (qs->coalescing == 0 &&
1605	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1606                qs->coalescing = 1;
1607	if (TXQ_TRYLOCK(qs)) {
1608		qs->qs_flags |= QS_FLUSHING;
1609		cxgb_start_locked(qs);
1610		qs->qs_flags &= ~QS_FLUSHING;
1611		TXQ_UNLOCK(qs);
1612	}
1613	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1614		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1615		    qs, txq->txq_watchdog.c_cpu);
1616}
1617
1618static void
1619cxgb_tx_timeout(void *arg)
1620{
1621	struct sge_qset *qs = arg;
1622	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1623
1624	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1625                qs->coalescing = 1;
1626	if (TXQ_TRYLOCK(qs)) {
1627		qs->qs_flags |= QS_TIMEOUT;
1628		cxgb_start_locked(qs);
1629		qs->qs_flags &= ~QS_TIMEOUT;
1630		TXQ_UNLOCK(qs);
1631	}
1632}
1633
1634static void
1635cxgb_start_locked(struct sge_qset *qs)
1636{
1637	struct mbuf *m_head = NULL;
1638	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1639	int avail, txmax;
1640	int in_use_init = txq->in_use;
1641	struct port_info *pi = qs->port;
1642	struct ifnet *ifp = pi->ifp;
1643	avail = txq->size - txq->in_use - 4;
1644	txmax = min(TX_START_MAX_DESC, avail);
1645
1646	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1647		reclaim_completed_tx(qs, 0, TXQ_ETH);
1648
1649	if (!pi->link_config.link_ok) {
1650		TXQ_RING_FLUSH(qs);
1651		return;
1652	}
1653	TXQ_LOCK_ASSERT(qs);
1654	while ((txq->in_use - in_use_init < txmax) &&
1655	    !TXQ_RING_EMPTY(qs) &&
1656	    (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1657	    pi->link_config.link_ok) {
1658		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1659
1660		if ((m_head = cxgb_dequeue(qs)) == NULL)
1661			break;
1662		/*
1663		 *  Encapsulation can modify our pointer, and/or make it
1664		 *  NULL on failure.  In that event, we can't requeue.
1665		 */
1666		if (t3_encap(qs, &m_head) || m_head == NULL)
1667			break;
1668
1669		m_head = NULL;
1670	}
1671	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1672	    pi->link_config.link_ok)
1673		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1674		    qs, txq->txq_timer.c_cpu);
1675	if (m_head != NULL)
1676		m_freem(m_head);
1677}
1678
1679static int
1680cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1681{
1682	struct port_info *pi = qs->port;
1683	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1684	struct buf_ring *br = txq->txq_mr;
1685	int error, avail;
1686
1687	avail = txq->size - txq->in_use;
1688	TXQ_LOCK_ASSERT(qs);
1689
1690	/*
1691	 * We can only do a direct transmit if the following are true:
1692	 * - we aren't coalescing (ring < 3/4 full)
1693	 * - the link is up -- checked in caller
1694	 * - there are no packets enqueued already
1695	 * - there is space in hardware transmit queue
1696	 */
1697	if (check_pkt_coalesce(qs) == 0 &&
1698	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > 4) {
1699		if (t3_encap(qs, &m)) {
1700			if (m != NULL &&
1701			    (error = drbr_enqueue(ifp, br, m)) != 0)
1702				return (error);
1703		} else {
1704			/*
1705			 * We've bypassed the buf ring so we need to update
1706			 * the stats directly
1707			 */
1708			txq->txq_direct_packets++;
1709			txq->txq_direct_bytes += m->m_pkthdr.len;
1710		}
1711	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1712		return (error);
1713
1714	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1715	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1716	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1717		cxgb_start_locked(qs);
1718	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1719		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1720		    qs, txq->txq_timer.c_cpu);
1721	return (0);
1722}
1723
1724int
1725cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1726{
1727	struct sge_qset *qs;
1728	struct port_info *pi = ifp->if_softc;
1729	int error, qidx = pi->first_qset;
1730
1731	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1732	    ||(!pi->link_config.link_ok)) {
1733		m_freem(m);
1734		return (0);
1735	}
1736
1737	if (m->m_flags & M_FLOWID)
1738		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1739
1740	qs = &pi->adapter->sge.qs[qidx];
1741
1742	if (TXQ_TRYLOCK(qs)) {
1743		/* XXX running */
1744		error = cxgb_transmit_locked(ifp, qs, m);
1745		TXQ_UNLOCK(qs);
1746	} else
1747		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1748	return (error);
1749}
1750void
1751cxgb_start(struct ifnet *ifp)
1752{
1753	struct port_info *pi = ifp->if_softc;
1754	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1755
1756	if (!pi->link_config.link_ok)
1757		return;
1758
1759	TXQ_LOCK(qs);
1760	cxgb_start_locked(qs);
1761	TXQ_UNLOCK(qs);
1762}
1763
1764void
1765cxgb_qflush(struct ifnet *ifp)
1766{
1767	/*
1768	 * Flush any enqueued mbufs in the buf_rings
1769	 * and in the transmit queues.
1770	 * This is a no-op for now.
1771	 */
1772	return;
1773}
1774
1775/**
1776 *	write_imm - write a packet into a Tx descriptor as immediate data
1777 *	@d: the Tx descriptor to write
1778 *	@m: the packet
1779 *	@len: the length of packet data to write as immediate data
1780 *	@gen: the generation bit value to write
1781 *
1782 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1783 *	contains a work request at its beginning.  We must write the packet
1784 *	carefully so the SGE doesn't read accidentally before it's written in
1785 *	its entirety.
1786 */
1787static __inline void
1788write_imm(struct tx_desc *d, struct mbuf *m,
1789	  unsigned int len, unsigned int gen)
1790{
1791	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1792	struct work_request_hdr *to = (struct work_request_hdr *)d;
1793	uint32_t wr_hi, wr_lo;
1794
1795	if (len > WR_LEN)
1796		panic("len too big %d\n", len);
1797	if (len < sizeof(*from))
1798		panic("len too small %d", len);
1799
1800	memcpy(&to[1], &from[1], len - sizeof(*from));
1801	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1802					V_WR_BCNTLFLT(len & 7));
1803	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1804					V_WR_LEN((len + 7) / 8));
1805	set_wr_hdr(to, wr_hi, wr_lo);
1806	wmb();
1807	wr_gen2(d, gen);
1808
1809	/*
1810	 * This check is a hack; we should really fix the logic so
1811	 * that this can't happen.
1812	 */
1813	if (m->m_type != MT_DONTFREE)
1814		m_freem(m);
1815
1816}
1817
1818/**
1819 *	check_desc_avail - check descriptor availability on a send queue
1820 *	@adap: the adapter
1821 *	@q: the TX queue
1822 *	@m: the packet needing the descriptors
1823 *	@ndesc: the number of Tx descriptors needed
1824 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1825 *
1826 *	Checks if the requested number of Tx descriptors is available on an
1827 *	SGE send queue.  If the queue is already suspended or not enough
1828 *	descriptors are available the packet is queued for later transmission.
1829 *	Must be called with the Tx queue locked.
1830 *
1831 *	Returns 0 if enough descriptors are available, 1 if there aren't
1832 *	enough descriptors and the packet has been queued, and 2 if the caller
1833 *	needs to retry because there weren't enough descriptors at the
1834 *	beginning of the call but some freed up in the mean time.
1835 */
1836static __inline int
1837check_desc_avail(adapter_t *adap, struct sge_txq *q,
1838		 struct mbuf *m, unsigned int ndesc,
1839		 unsigned int qid)
1840{
1841	/*
1842	 * XXX We currently only use this for checking the control queue;
1843	 * the control queue is only used for binding qsets, which happens
1844	 * at init time, so we are guaranteed enough descriptors.
1845	 */
1846	if (__predict_false(!mbufq_empty(&q->sendq))) {
1847addq_exit:	mbufq_tail(&q->sendq, m);
1848		return 1;
1849	}
1850	if (__predict_false(q->size - q->in_use < ndesc)) {
1851
1852		struct sge_qset *qs = txq_to_qset(q, qid);
1853
1854		setbit(&qs->txq_stopped, qid);
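		/*
		 * Re-check after marking the queue stopped: if enough
		 * descriptors were freed in the meantime, clear the stop
		 * bit and ask the caller to retry.
		 */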
1855		if (should_restart_tx(q) &&
1856		    test_and_clear_bit(qid, &qs->txq_stopped))
1857			return 2;
1858
1859		q->stops++;
1860		goto addq_exit;
1861	}
1862	return 0;
1863}
1864
1865
1866/**
1867 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1868 *	@q: the SGE control Tx queue
1869 *
1870 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1871 *	that send only immediate data (presently just the control queues) and
1872 *	thus do not have any mbufs.
1873 */
1874static __inline void
1875reclaim_completed_tx_imm(struct sge_txq *q)
1876{
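	/*
	 * processed is advanced by completion credits returned by the SGE;
	 * everything up to it has been consumed by the hardware and can be
	 * reclaimed (immediate-data queues hold no mbufs to free).
	 */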
1877	unsigned int reclaim = q->processed - q->cleaned;
1878
1879	q->in_use -= reclaim;
1880	q->cleaned += reclaim;
1881}
1882
1883static __inline int
1884immediate(const struct mbuf *m)
1885{
1886	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1887}
1888
1889/**
1890 *	ctrl_xmit - send a packet through an SGE control Tx queue
1891 *	@adap: the adapter
1892 *	@q: the control queue
1893 *	@m: the packet
1894 *
1895 *	Send a packet through an SGE control Tx queue.  Packets sent through
1896 *	a control queue must fit entirely as immediate data in a single Tx
1897 *	descriptor and have no page fragments.
1898 */
1899static int
1900ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1901{
1902	int ret;
1903	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1904	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1905
1906	if (__predict_false(!immediate(m))) {
1907		m_freem(m);
1908		return 0;
1909	}
1910
1911	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1912	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1913
1914	TXQ_LOCK(qs);
1915again:	reclaim_completed_tx_imm(q);
1916
1917	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1918	if (__predict_false(ret)) {
1919		if (ret == 1) {
1920			TXQ_UNLOCK(qs);
1921			return (ENOSPC);
1922		}
1923		goto again;
1924	}
1925	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1926
1927	q->in_use++;
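	/*
	 * Advance the producer index; wrap and flip the generation bit
	 * when the end of the ring is reached.
	 */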
1928	if (++q->pidx >= q->size) {
1929		q->pidx = 0;
1930		q->gen ^= 1;
1931	}
1932	TXQ_UNLOCK(qs);
1933	wmb();
1934	t3_write_reg(adap, A_SG_KDOORBELL,
1935		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1936	return (0);
1937}
1938
1939
1940/**
1941 *	restart_ctrlq - restart a suspended control queue
1942 *	@qs: the queue set containing the control queue
1943 *
1944 *	Resumes transmission on a suspended Tx control queue.
1945 */
1946static void
1947restart_ctrlq(void *data, int npending)
1948{
1949	struct mbuf *m;
1950	struct sge_qset *qs = (struct sge_qset *)data;
1951	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1952	adapter_t *adap = qs->port->adapter;
1953
1954	TXQ_LOCK(qs);
1955again:	reclaim_completed_tx_imm(q);
1956
1957	while (q->in_use < q->size &&
1958	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1959
1960		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1961
1962		if (++q->pidx >= q->size) {
1963			q->pidx = 0;
1964			q->gen ^= 1;
1965		}
1966		q->in_use++;
1967	}
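	/*
	 * Still backlogged: mark the queue stopped again, but retry
	 * immediately if descriptors freed up while we were doing so.
	 */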
1968	if (!mbufq_empty(&q->sendq)) {
1969		setbit(&qs->txq_stopped, TXQ_CTRL);
1970
1971		if (should_restart_tx(q) &&
1972		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1973			goto again;
1974		q->stops++;
1975	}
1976	TXQ_UNLOCK(qs);
1977	t3_write_reg(adap, A_SG_KDOORBELL,
1978		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1979}
1980
1981
1982/*
1983 * Send a management message through control queue 0
1984 */
1985int
1986t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1987{
1988	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1989}
1990
1991/**
1992 *	free_qset - free the resources of an SGE queue set
1993 *	@sc: the controller owning the queue set
1994 *	@q: the queue set
1995 *
1996 *	Release the HW and SW resources associated with an SGE queue set, such
1997 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1998 *	queue set must be quiesced prior to calling this.
1999 */
2000static void
2001t3_free_qset(adapter_t *sc, struct sge_qset *q)
2002{
2003	int i;
2004
2005	reclaim_completed_tx(q, 0, TXQ_ETH);
2006	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2007		if (q->txq[i].txq_mr != NULL)
2008			buf_ring_free(q->txq[i].txq_mr, M_DEVBUF);
2009		if (q->txq[i].txq_ifq != NULL) {
2010			ifq_delete(q->txq[i].txq_ifq);
2011			free(q->txq[i].txq_ifq, M_DEVBUF);
2012		}
2013	}
2014
2015	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2016		if (q->fl[i].desc) {
2017			mtx_lock_spin(&sc->sge.reg_lock);
2018			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2019			mtx_unlock_spin(&sc->sge.reg_lock);
2020			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2021			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2022					q->fl[i].desc_map);
2023			bus_dma_tag_destroy(q->fl[i].desc_tag);
2024			bus_dma_tag_destroy(q->fl[i].entry_tag);
2025		}
2026		if (q->fl[i].sdesc) {
2027			free_rx_bufs(sc, &q->fl[i]);
2028			free(q->fl[i].sdesc, M_DEVBUF);
2029		}
2030	}
2031
2032	mtx_unlock(&q->lock);
2033	MTX_DESTROY(&q->lock);
2034	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2035		if (q->txq[i].desc) {
2036			mtx_lock_spin(&sc->sge.reg_lock);
2037			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2038			mtx_unlock_spin(&sc->sge.reg_lock);
2039			bus_dmamap_unload(q->txq[i].desc_tag,
2040					q->txq[i].desc_map);
2041			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2042					q->txq[i].desc_map);
2043			bus_dma_tag_destroy(q->txq[i].desc_tag);
2044			bus_dma_tag_destroy(q->txq[i].entry_tag);
2045		}
2046		if (q->txq[i].sdesc) {
2047			free(q->txq[i].sdesc, M_DEVBUF);
2048		}
2049	}
2050
2051	if (q->rspq.desc) {
2052		mtx_lock_spin(&sc->sge.reg_lock);
2053		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2054		mtx_unlock_spin(&sc->sge.reg_lock);
2055
2056		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2057		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2058			        q->rspq.desc_map);
2059		bus_dma_tag_destroy(q->rspq.desc_tag);
2060		MTX_DESTROY(&q->rspq.lock);
2061	}
2062
2063	tcp_lro_free(&q->lro.ctrl);
2064
2065	bzero(q, sizeof(*q));
2066}
2067
2068/**
2069 *	t3_free_sge_resources - free SGE resources
2070 *	@sc: the adapter softc
2071 *
2072 *	Frees resources used by the SGE queue sets.
2073 */
2074void
2075t3_free_sge_resources(adapter_t *sc)
2076{
2077	int i, nqsets;
2078
2079	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2080		nqsets += sc->port[i].nqsets;
2081
2082	for (i = 0; i < nqsets; ++i) {
2083		TXQ_LOCK(&sc->sge.qs[i]);
2084		t3_free_qset(sc, &sc->sge.qs[i]);
2085	}
2086
2087}
2088
2089/**
2090 *	t3_sge_start - enable SGE
2091 *	@sc: the controller softc
2092 *
2093 *	Enables the SGE for DMAs.  This is the last step in starting packet
2094 *	transfers.
2095 */
2096void
2097t3_sge_start(adapter_t *sc)
2098{
2099	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2100}
2101
2102/**
2103 *	t3_sge_stop - disable SGE operation
2104 *	@sc: the adapter
2105 *
2106 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2107 *	from error interrupts) or from normal process context.  In the latter
2108 *	case it also disables any pending queue restart tasklets.  Note that
2109 *	if it is called in interrupt context it cannot disable the restart
2110 *	tasklets as it cannot wait; however, the tasklets will have no effect
2111 *	since the doorbells are disabled and the driver will call this again
2112 *	later from process context, at which time the tasklets will be stopped
2113 *	if they are still running.
2114 */
2115void
2116t3_sge_stop(adapter_t *sc)
2117{
2118	int i, nqsets;
2119
2120	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2121
2122	if (sc->tq == NULL)
2123		return;
2124
2125	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2126		nqsets += sc->port[i].nqsets;
2127#ifdef notyet
2128	/*
2129	 *
2130	 * XXX
2131	 */
2132	for (i = 0; i < nqsets; ++i) {
2133		struct sge_qset *qs = &sc->sge.qs[i];
2134
2135		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2136		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2137	}
2138#endif
2139}
2140
2141/**
2142 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2143 *	@qs: the queue set that owns the Tx queue
2144 *	@reclaimable: the number of descriptors to reclaim
2145 *	@queue: the index of the Tx queue within the queue set
2146 *	(TXQ_ETH, TXQ_OFLD, or TXQ_CTRL)
2147 *
2148 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2149 *	Tx buffers.  Called with the Tx queue lock held.
2150 *
2151 *	The queue's consumer index is advanced past the reclaimed
2152 *	descriptors.
2153 */
2154void
2155t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2156{
2157	struct tx_sw_desc *txsd;
2158	unsigned int cidx, mask;
2159	struct sge_txq *q = &qs->txq[queue];
2160
2161#ifdef T3_TRACE
2162	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2163		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2164#endif
2165	cidx = q->cidx;
2166	mask = q->size - 1;
2167	txsd = &q->sdesc[cidx];
2168
2169	mtx_assert(&qs->lock, MA_OWNED);
2170	while (reclaimable--) {
2171		prefetch(q->sdesc[(cidx + 1) & mask].m);
2172		prefetch(q->sdesc[(cidx + 2) & mask].m);
2173
2174		if (txsd->m != NULL) {
2175			if (txsd->flags & TX_SW_DESC_MAPPED) {
2176				bus_dmamap_unload(q->entry_tag, txsd->map);
2177				txsd->flags &= ~TX_SW_DESC_MAPPED;
2178			}
2179			m_freem_list(txsd->m);
2180			txsd->m = NULL;
2181		} else
2182			q->txq_skipped++;
2183
2184		++txsd;
2185		if (++cidx == q->size) {
2186			cidx = 0;
2187			txsd = q->sdesc;
2188		}
2189	}
2190	q->cidx = cidx;
2191
2192}
2193
2194/**
2195 *	is_new_response - check if a response is newly written
2196 *	@r: the response descriptor
2197 *	@q: the response queue
2198 *
2199 *	Returns true if a response descriptor contains a yet unprocessed
2200 *	response.
2201 */
2202static __inline int
2203is_new_response(const struct rsp_desc *r,
2204    const struct sge_rspq *q)
2205{
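	/*
	 * The generation bit written by the SGE alternates on each pass
	 * through the ring; a response is new while its generation bit
	 * still matches the value the driver expects for the current pass.
	 */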
2206	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2207}
2208
2209#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2210#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2211			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2212			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2213			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2214
2215/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2216#define NOMEM_INTR_DELAY 2500
2217
2218/**
2219 *	write_ofld_wr - write an offload work request
2220 *	@adap: the adapter
2221 *	@m: the packet to send
2222 *	@q: the Tx queue
2223 *	@pidx: index of the first Tx descriptor to write
2224 *	@gen: the generation value to use
2225 *	@ndesc: number of descriptors the packet will occupy
2226 *
2227 *	Write an offload work request to send the supplied packet.  The packet
2228 *	data already carry the work request with most fields populated.
2229 */
2230static void
2231write_ofld_wr(adapter_t *adap, struct mbuf *m,
2232    struct sge_txq *q, unsigned int pidx,
2233    unsigned int gen, unsigned int ndesc,
2234    bus_dma_segment_t *segs, unsigned int nsegs)
2235{
2236	unsigned int sgl_flits, flits;
2237	struct work_request_hdr *from;
2238	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2239	struct tx_desc *d = &q->desc[pidx];
2240	struct txq_state txqs;
2241
2242	if (immediate(m) && nsegs == 0) {
2243		write_imm(d, m, m->m_len, gen);
2244		return;
2245	}
2246
2247	/* Only TX_DATA builds SGLs */
2248	from = mtod(m, struct work_request_hdr *);
2249	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2250
2251	flits = m->m_len / 8;
2252	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2253
2254	make_sgl(sgp, segs, nsegs);
2255	sgl_flits = sgl_len(nsegs);
2256
2257	txqs.gen = gen;
2258	txqs.pidx = pidx;
2259	txqs.compl = 0;
2260
2261	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2262	    from->wrh_hi, from->wrh_lo);
2263}
2264
2265/**
2266 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2267 *	@m: the packet
2268 *
2269 * 	Returns the number of Tx descriptors needed for the given offload
2270 * 	packet.  These packets are already fully constructed.
2271 */
2272static __inline unsigned int
2273calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2274{
2275	unsigned int flits, cnt = 0;
2276	int ndescs;
2277
2278	if (m->m_len <= WR_LEN && nsegs == 0)
2279		return (1);                 /* packet fits as immediate data */
2280
2281	/*
2282	 * This needs to be re-visited for TOE
2283	 */
2284
2285	cnt = nsegs;
2286
2287	/* headers */
2288	flits = m->m_len / 8;
2289
2290	ndescs = flits_to_desc(flits + sgl_len(cnt));
2291
2292	return (ndescs);
2293}
2294
2295/**
2296 *	ofld_xmit - send a packet through an offload queue
2297 *	@adap: the adapter
2298 *	@q: the Tx offload queue
2299 *	@m: the packet
2300 *
2301 *	Send an offload packet through an SGE offload queue.
2302 */
2303static int
2304ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2305{
2306	int ret, nsegs;
2307	unsigned int ndesc;
2308	unsigned int pidx, gen;
2309	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2310	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2311	struct tx_sw_desc *stx;
2312
2313	nsegs = m_get_sgllen(m);
2314	vsegs = m_get_sgl(m);
2315	ndesc = calc_tx_descs_ofld(m, nsegs);
2316	busdma_map_sgl(vsegs, segs, nsegs);
2317
2318	stx = &q->sdesc[q->pidx];
2319
2320	TXQ_LOCK(qs);
2321again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2322	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2323	if (__predict_false(ret)) {
2324		if (ret == 1) {
2325			printf("no ofld desc avail\n");
2326
2327			m_set_priority(m, ndesc);     /* save for restart */
2328			TXQ_UNLOCK(qs);
2329			return (EINTR);
2330		}
2331		goto again;
2332	}
2333
2334	gen = q->gen;
2335	q->in_use += ndesc;
2336	pidx = q->pidx;
2337	q->pidx += ndesc;
2338	if (q->pidx >= q->size) {
2339		q->pidx -= q->size;
2340		q->gen ^= 1;
2341	}
2342#ifdef T3_TRACE
2343	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2344		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2345		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2346		  skb_shinfo(skb)->nr_frags);
2347#endif
2348	TXQ_UNLOCK(qs);
2349
2350	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2351	check_ring_tx_db(adap, q);
2352	return (0);
2353}
2354
2355/**
2356 *	restart_offloadq - restart a suspended offload queue
2357 *	@qs: the queue set containing the offload queue
2358 *
2359 *	Resumes transmission on a suspended Tx offload queue.
2360 */
2361static void
2362restart_offloadq(void *data, int npending)
2363{
2364	struct mbuf *m;
2365	struct sge_qset *qs = data;
2366	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2367	adapter_t *adap = qs->port->adapter;
2368	bus_dma_segment_t segs[TX_MAX_SEGS];
2369	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2370	int nsegs, cleaned;
2371
2372	TXQ_LOCK(qs);
2373again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2374
2375	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2376		unsigned int gen, pidx;
2377		unsigned int ndesc = m_get_priority(m);
2378
2379		if (__predict_false(q->size - q->in_use < ndesc)) {
2380			setbit(&qs->txq_stopped, TXQ_OFLD);
2381			if (should_restart_tx(q) &&
2382			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2383				goto again;
2384			q->stops++;
2385			break;
2386		}
2387
2388		gen = q->gen;
2389		q->in_use += ndesc;
2390		pidx = q->pidx;
2391		q->pidx += ndesc;
2392		if (q->pidx >= q->size) {
2393			q->pidx -= q->size;
2394			q->gen ^= 1;
2395		}
2396
2397		(void)mbufq_dequeue(&q->sendq);
2398		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
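		/*
		 * The descriptors were reserved above, so the work request
		 * can be written without holding the queue lock.
		 */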
2399		TXQ_UNLOCK(qs);
2400		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2401		TXQ_LOCK(qs);
2402	}
2403#if USE_GTS
2404	set_bit(TXQ_RUNNING, &q->flags);
2405	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2406#endif
2407	TXQ_UNLOCK(qs);
2408	wmb();
2409	t3_write_reg(adap, A_SG_KDOORBELL,
2410		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2411}
2412
2413/**
2414 *	queue_set - return the queue set a packet should use
2415 *	@m: the packet
2416 *
2417 *	Maps a packet to the SGE queue set it should use.  The desired queue
2418 *	set is carried in bits 1-3 in the packet's priority.
2419 */
2420static __inline int
2421queue_set(const struct mbuf *m)
2422{
2423	return m_get_priority(m) >> 1;
2424}
2425
2426/**
2427 *	is_ctrl_pkt - return whether an offload packet is a control packet
2428 *	@m: the packet
2429 *
2430 *	Determines whether an offload packet should use an OFLD or a CTRL
2431 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2432 */
2433static __inline int
2434is_ctrl_pkt(const struct mbuf *m)
2435{
2436	return m_get_priority(m) & 1;
2437}
2438
2439/**
2440 *	t3_offload_tx - send an offload packet
2441 *	@tdev: the offload device to send to
2442 *	@m: the packet
2443 *
2444 *	Sends an offload packet.  We use the packet priority to select the
2445 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2446 *	should be sent as regular or control, bits 1-3 select the queue set.
2447 */
2448int
2449t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2450{
2451	adapter_t *adap = tdev2adap(tdev);
2452	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2453
2454	if (__predict_false(is_ctrl_pkt(m)))
2455		return ctrl_xmit(adap, qs, m);
2456
2457	return ofld_xmit(adap, qs, m);
2458}
2459
2460/**
2461 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2462 *	@tdev: the offload device that will be receiving the packets
2463 *	@q: the SGE response queue that assembled the bundle
2464 *	@m: the partial bundle
2465 *	@n: the number of packets in the bundle
2466 *
2467 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2468 */
2469static __inline void
2470deliver_partial_bundle(struct t3cdev *tdev,
2471			struct sge_rspq *q,
2472			struct mbuf *mbufs[], int n)
2473{
2474	if (n) {
2475		q->offload_bundles++;
2476		cxgb_ofld_recv(tdev, mbufs, n);
2477	}
2478}
2479
2480static __inline int
2481rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2482    struct mbuf *m, struct mbuf *rx_gather[],
2483    unsigned int gather_idx)
2484{
2485
2486	rq->offload_pkts++;
2487	m->m_pkthdr.header = mtod(m, void *);
2488	rx_gather[gather_idx++] = m;
2489	if (gather_idx == RX_BUNDLE_SIZE) {
2490		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2491		gather_idx = 0;
2492		rq->offload_bundles++;
2493	}
2494	return (gather_idx);
2495}
2496
2497static void
2498restart_tx(struct sge_qset *qs)
2499{
2500	struct adapter *sc = qs->port->adapter;
2501
2502
2503	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2504	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2505	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2506		qs->txq[TXQ_OFLD].restarts++;
2507		DPRINTF("restarting TXQ_OFLD\n");
2508		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2509	}
2510	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2511	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2512	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2513	    qs->txq[TXQ_CTRL].in_use);
2514
2515	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2516	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2517	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2518		qs->txq[TXQ_CTRL].restarts++;
2519		DPRINTF("restarting TXQ_CTRL\n");
2520		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2521	}
2522}
2523
2524/**
2525 *	t3_sge_alloc_qset - initialize an SGE queue set
2526 *	@sc: the controller softc
2527 *	@id: the queue set id
2528 *	@nports: how many Ethernet ports will be using this queue set
2529 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2530 *	@p: configuration parameters for this queue set
2531 *	@ntxq: number of Tx queues for the queue set
2532 *	@pi: port info for queue set
2533 *
2534 *	Allocate resources and initialize an SGE queue set.  A queue set
2535 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2536 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2537 *	queue, offload queue, and control queue.
2538 */
2539int
2540t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2541		  const struct qset_params *p, int ntxq, struct port_info *pi)
2542{
2543	struct sge_qset *q = &sc->sge.qs[id];
2544	int i, ret = 0;
2545
2546	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2547	q->port = pi;
2548
2549	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2550
2551		if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2552			    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2553			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2554			goto err;
2555		}
2556		if ((q->txq[i].txq_ifq =
2557			malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT|M_ZERO))
2558		    == NULL) {
2559			device_printf(sc->dev, "failed to allocate ifq\n");
2560			goto err;
2561		}
2562		ifq_init(q->txq[i].txq_ifq, pi->ifp);
2563		callout_init(&q->txq[i].txq_timer, 1);
2564		callout_init(&q->txq[i].txq_watchdog, 1);
2565		q->txq[i].txq_timer.c_cpu = id % mp_ncpus;
2566		q->txq[i].txq_watchdog.c_cpu = id % mp_ncpus;
2567	}
2568	init_qset_cntxt(q, id);
2569	q->idx = id;
2570	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2571		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2572		    &q->fl[0].desc, &q->fl[0].sdesc,
2573		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2574		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2575		printf("error %d from alloc ring fl0\n", ret);
2576		goto err;
2577	}
2578
2579	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2580		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2581		    &q->fl[1].desc, &q->fl[1].sdesc,
2582		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2583		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2584		printf("error %d from alloc ring fl1\n", ret);
2585		goto err;
2586	}
2587
2588	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2589		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2590		    &q->rspq.desc_tag, &q->rspq.desc_map,
2591		    NULL, NULL)) != 0) {
2592		printf("error %d from alloc ring rspq\n", ret);
2593		goto err;
2594	}
2595
2596	for (i = 0; i < ntxq; ++i) {
2597		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2598
2599		if ((ret = alloc_ring(sc, p->txq_size[i],
2600			    sizeof(struct tx_desc), sz,
2601			    &q->txq[i].phys_addr, &q->txq[i].desc,
2602			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2603			    &q->txq[i].desc_map,
2604			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2605			printf("error %d from alloc ring tx %i\n", ret, i);
2606			goto err;
2607		}
2608		mbufq_init(&q->txq[i].sendq);
2609		q->txq[i].gen = 1;
2610		q->txq[i].size = p->txq_size[i];
2611	}
2612
2613	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2614	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2615	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2616	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2617
2618	q->fl[0].gen = q->fl[1].gen = 1;
2619	q->fl[0].size = p->fl_size;
2620	q->fl[1].size = p->jumbo_size;
2621
2622	q->rspq.gen = 1;
2623	q->rspq.cidx = 0;
2624	q->rspq.size = p->rspq_size;
2625
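	/*
	 * Suspend the Ethernet Tx queue once the number of free descriptors
	 * falls below the worst-case requirement of one maximally-fragmented
	 * packet per port.
	 */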
2626	q->txq[TXQ_ETH].stop_thres = nports *
2627	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2628
2629	q->fl[0].buf_size = MCLBYTES;
2630	q->fl[0].zone = zone_pack;
2631	q->fl[0].type = EXT_PACKET;
2632#if __FreeBSD_version > 800000
2633	if (cxgb_use_16k_clusters) {
2634		q->fl[1].buf_size = MJUM16BYTES;
2635		q->fl[1].zone = zone_jumbo16;
2636		q->fl[1].type = EXT_JUMBO16;
2637	} else {
2638		q->fl[1].buf_size = MJUM9BYTES;
2639		q->fl[1].zone = zone_jumbo9;
2640		q->fl[1].type = EXT_JUMBO9;
2641	}
2642#else
2643	q->fl[1].buf_size = MJUMPAGESIZE;
2644	q->fl[1].zone = zone_jumbop;
2645	q->fl[1].type = EXT_JUMBOP;
2646#endif
2647
2648	/* Allocate and setup the lro_ctrl structure */
2649	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2650	ret = tcp_lro_init(&q->lro.ctrl);
2651	if (ret) {
2652		printf("error %d from tcp_lro_init\n", ret);
2653		goto err;
2654	}
2655	q->lro.ctrl.ifp = pi->ifp;
2656
2657	mtx_lock_spin(&sc->sge.reg_lock);
2658	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2659				   q->rspq.phys_addr, q->rspq.size,
2660				   q->fl[0].buf_size, 1, 0);
2661	if (ret) {
2662		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2663		goto err_unlock;
2664	}
2665
2666	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2667		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2668					  q->fl[i].phys_addr, q->fl[i].size,
2669					  q->fl[i].buf_size, p->cong_thres, 1,
2670					  0);
2671		if (ret) {
2672			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2673			goto err_unlock;
2674		}
2675	}
2676
2677	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2678				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2679				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2680				 1, 0);
2681	if (ret) {
2682		printf("error %d from t3_sge_init_ecntxt\n", ret);
2683		goto err_unlock;
2684	}
2685
2686	if (ntxq > 1) {
2687		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2688					 USE_GTS, SGE_CNTXT_OFLD, id,
2689					 q->txq[TXQ_OFLD].phys_addr,
2690					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2691		if (ret) {
2692			printf("error %d from t3_sge_init_ecntxt\n", ret);
2693			goto err_unlock;
2694		}
2695	}
2696
2697	if (ntxq > 2) {
2698		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2699					 SGE_CNTXT_CTRL, id,
2700					 q->txq[TXQ_CTRL].phys_addr,
2701					 q->txq[TXQ_CTRL].size,
2702					 q->txq[TXQ_CTRL].token, 1, 0);
2703		if (ret) {
2704			printf("error %d from t3_sge_init_ecntxt\n", ret);
2705			goto err_unlock;
2706		}
2707	}
2708
2709	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2710	    device_get_unit(sc->dev), irq_vec_idx);
2711	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2712
2713	mtx_unlock_spin(&sc->sge.reg_lock);
2714	t3_update_qset_coalesce(q, p);
2715	q->port = pi;
2716
2717	refill_fl(sc, &q->fl[0], q->fl[0].size);
2718	refill_fl(sc, &q->fl[1], q->fl[1].size);
2719	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2720
2721	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2722		     V_NEWTIMER(q->rspq.holdoff_tmr));
2723
2724	return (0);
2725
2726err_unlock:
2727	mtx_unlock_spin(&sc->sge.reg_lock);
2728err:
2729	TXQ_LOCK(q);
2730	t3_free_qset(sc, q);
2731
2732	return (ret);
2733}
2734
2735/*
2736 * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2737 * ethernet data.  Hardware assistance with various checksums and any vlan tag
2738 * will also be taken into account here.
2739 */
2740void
2741t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2742{
2743	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2744	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2745	struct ifnet *ifp = pi->ifp;
2746
2747	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2748
2749	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2750	    cpl->csum_valid && cpl->csum == 0xffff) {
2751		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2752		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2753		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2754		m->m_pkthdr.csum_data = 0xffff;
2755	}
2756
2757	if (cpl->vlan_valid) {
2758		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2759		m->m_flags |= M_VLANTAG;
2760	}
2761
2762	m->m_pkthdr.rcvif = ifp;
2763	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2764	/*
2765	 * adjust after conversion to mbuf chain
2766	 */
2767	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2768	m->m_len -= (sizeof(*cpl) + ethpad);
2769	m->m_data += (sizeof(*cpl) + ethpad);
2770}
2771
2772/**
2773 *	get_packet - return the next ingress packet buffer from a free list
2774 *	@adap: the adapter that received the packet
2775 *	@drop_thres: # of remaining buffers before we start dropping packets
2776 *	@qs: the qset that the SGE free list holding the packet belongs to
2777 *	@mh: the mbuf header; contains pointers to the head and tail of the mbuf chain
2778 *	@r: the response descriptor
2779 *
2780 *	Get the next packet from a free list and complete setup of the
2781 *	mbuf.  If the packet is small we make a copy and recycle the
2782 *	original buffer, otherwise we use the original buffer itself.  If a
2783 *	positive drop threshold is supplied packets are dropped and their
2784 *	buffers recycled if (a) the number of remaining buffers is under the
2785 *	threshold and the packet is too big to copy, or (b) the packet should
2786 *	be copied but there is no memory for the copy.
2787 */
2788static int
2789get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2790    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2791{
2792
2793	unsigned int len_cq =  ntohl(r->len_cq);
2794	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2795	int mask, cidx = fl->cidx;
2796	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2797	uint32_t len = G_RSPD_LEN(len_cq);
2798	uint32_t flags = M_EXT;
2799	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2800	caddr_t cl;
2801	struct mbuf *m;
2802	int ret = 0;
2803
2804	mask = fl->size - 1;
2805	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2806	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2807	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2808	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2809
2810	fl->credits--;
2811	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2812
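	/*
	 * Copy small single-buffer packets into a fresh mbuf so the
	 * original cluster can be recycled back onto the free list.
	 */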
2813	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2814	    sopeop == RSPQ_SOP_EOP) {
2815		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2816			goto skip_recycle;
2817		cl = mtod(m, void *);
2818		memcpy(cl, sd->rxsd_cl, len);
2819		recycle_rx_buf(adap, fl, fl->cidx);
2820		m->m_pkthdr.len = m->m_len = len;
2821		m->m_flags = 0;
2822		mh->mh_head = mh->mh_tail = m;
2823		ret = 1;
2824		goto done;
2825	} else {
2826	skip_recycle:
2827		bus_dmamap_unload(fl->entry_tag, sd->map);
2828		cl = sd->rxsd_cl;
2829		m = sd->m;
2830
2831		if ((sopeop == RSPQ_SOP_EOP) ||
2832		    (sopeop == RSPQ_SOP))
2833			flags |= M_PKTHDR;
2834		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2835		if (fl->zone == zone_pack) {
2836			/*
2837			 * restore clobbered data pointer
2838			 */
2839			m->m_data = m->m_ext.ext_buf;
2840		} else {
2841			m_cljset(m, cl, fl->type);
2842		}
2843		m->m_len = len;
2844	}
2845	switch(sopeop) {
2846	case RSPQ_SOP_EOP:
2847		ret = 1;
2848		/* FALLTHROUGH */
2849	case RSPQ_SOP:
2850		mh->mh_head = mh->mh_tail = m;
2851		m->m_pkthdr.len = len;
2852		break;
2853	case RSPQ_EOP:
2854		ret = 1;
2855		/* FALLTHROUGH */
2856	case RSPQ_NSOP_NEOP:
2857		if (mh->mh_tail == NULL) {
2858			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2859			m_freem(m);
2860			break;
2861		}
2862		mh->mh_tail->m_next = m;
2863		mh->mh_tail = m;
2864		mh->mh_head->m_pkthdr.len += len;
2865		break;
2866	}
2867	if (cxgb_debug)
2868		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2869done:
2870	if (++fl->cidx == fl->size)
2871		fl->cidx = 0;
2872
2873	return (ret);
2874}
2875
2876/**
2877 *	handle_rsp_cntrl_info - handles control information in a response
2878 *	@qs: the queue set corresponding to the response
2879 *	@flags: the response control flags
2880 *
2881 *	Handles the control information of an SGE response, such as GTS
2882 *	indications and completion credits for the queue set's Tx queues.
2883 *	HW coalesces credits; we don't do any extra SW coalescing.
2884 */
2885static __inline void
2886handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2887{
2888	unsigned int credits;
2889
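	/*
	 * TXQ0 completion credits belong to the Ethernet queue, TXQ1 to
	 * the offload queue, and TXQ2 to the control queue.
	 */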
2890#if USE_GTS
2891	if (flags & F_RSPD_TXQ0_GTS)
2892		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2893#endif
2894	credits = G_RSPD_TXQ0_CR(flags);
2895	if (credits)
2896		qs->txq[TXQ_ETH].processed += credits;
2897
2898	credits = G_RSPD_TXQ2_CR(flags);
2899	if (credits)
2900		qs->txq[TXQ_CTRL].processed += credits;
2901
2902# if USE_GTS
2903	if (flags & F_RSPD_TXQ1_GTS)
2904		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2905# endif
2906	credits = G_RSPD_TXQ1_CR(flags);
2907	if (credits)
2908		qs->txq[TXQ_OFLD].processed += credits;
2909
2910}
2911
2912static void
2913check_ring_db(adapter_t *adap, struct sge_qset *qs,
2914    unsigned int sleeping)
2915{
2916	;
2917}
2918
2919/**
2920 *	process_responses - process responses from an SGE response queue
2921 *	@adap: the adapter
2922 *	@qs: the queue set to which the response queue belongs
2923 *	@budget: how many responses can be processed in this round
2924 *
2925 *	Process responses from an SGE response queue up to the supplied budget.
2926 *	Responses include received packets as well as credits and other events
2927 *	for the queues that belong to the response queue's queue set.
2928 *	A negative budget is effectively unlimited.
2929 *
2930 *	Additionally choose the interrupt holdoff time for the next interrupt
2931 *	on this queue.  If the system is under memory shortage use a fairly
2932 *	long delay to help recovery.
2933 */
2934static int
2935process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2936{
2937	struct sge_rspq *rspq = &qs->rspq;
2938	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2939	int budget_left = budget;
2940	unsigned int sleeping = 0;
2941	int lro_enabled = qs->lro.enabled;
2942	int skip_lro;
2943	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2944	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2945	int ngathered = 0;
2946#ifdef DEBUG
2947	static int last_holdoff = 0;
2948	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2949		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2950		last_holdoff = rspq->holdoff_tmr;
2951	}
2952#endif
2953	rspq->next_holdoff = rspq->holdoff_tmr;
2954
2955	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2956		int eth, eop = 0, ethpad = 0;
2957		uint32_t flags = ntohl(r->flags);
2958		uint32_t rss_csum = *(const uint32_t *)r;
2959		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2960
2961		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2962
2963		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2964			struct mbuf *m;
2965
2966			if (cxgb_debug)
2967				printf("async notification\n");
2968
2969			if (rspq->rspq_mh.mh_head == NULL) {
2970				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2971				m = rspq->rspq_mh.mh_head;
2972			} else {
2973				m = m_gethdr(M_DONTWAIT, MT_DATA);
2974			}
2975			if (m == NULL)
2976				goto no_mem;
2977
2978			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2979			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2980			*mtod(m, char *) = CPL_ASYNC_NOTIF;
2981			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
2982			eop = 1;
2983			rspq->async_notif++;
2984			goto skip;
2985		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2986			struct mbuf *m = NULL;
2987
2988			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
2989			    r->rss_hdr.opcode, rspq->cidx);
2990			if (rspq->rspq_mh.mh_head == NULL)
2991				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2992			else
2993				m = m_gethdr(M_DONTWAIT, MT_DATA);
2994
2995			if (rspq->rspq_mh.mh_head == NULL &&  m == NULL) {
2996		no_mem:
2997				rspq->next_holdoff = NOMEM_INTR_DELAY;
2998				budget_left--;
2999				break;
3000			}
3001			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
3002			eop = 1;
3003			rspq->imm_data++;
3004		} else if (r->len_cq) {
3005			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3006
3007			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
3008			if (eop) {
3009				rspq->rspq_mh.mh_head->m_flags |= M_FLOWID;
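				/*
				 * Tag the completed packet with the RSS hash
				 * so the stack can use it as a flow id.
				 */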
3010				rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash;
3011			}
3012
3013			ethpad = 2;
3014		} else {
3015			rspq->pure_rsps++;
3016		}
3017	skip:
3018		if (flags & RSPD_CTRL_MASK) {
3019			sleeping |= flags & RSPD_GTS_MASK;
3020			handle_rsp_cntrl_info(qs, flags);
3021		}
3022
3023		r++;
3024		if (__predict_false(++rspq->cidx == rspq->size)) {
3025			rspq->cidx = 0;
3026			rspq->gen ^= 1;
3027			r = rspq->desc;
3028		}
3029
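		/*
		 * Return response queue credits to the hardware in batches
		 * of a quarter of the ring.
		 */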
3030		if (++rspq->credits >= (rspq->size / 4)) {
3031			refill_rspq(adap, rspq, rspq->credits);
3032			rspq->credits = 0;
3033		}
3034		if (!eth && eop) {
3035			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
3036			/*
3037			 * XXX size mismatch
3038			 */
3039			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
3040
3041
3042			ngathered = rx_offload(&adap->tdev, rspq,
3043			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
3044			rspq->rspq_mh.mh_head = NULL;
3045			DPRINTF("received offload packet\n");
3046
3047		} else if (eth && eop) {
3048			struct mbuf *m = rspq->rspq_mh.mh_head;
3049
3050			t3_rx_eth(adap, rspq, m, ethpad);
3051
3052			/*
3053			 * The T304 sends incoming packets on any qset.  If LRO
3054			 * is also enabled, we could end up sending the packet up
3055			 * lro_ctrl->ifp's input.  That is incorrect.
3056			 *
3057			 * The mbuf's rcvif was derived from the cpl header and
3058			 * is accurate.  Skip LRO and just use that.
3059			 */
3060			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3061
3062			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro &&
3063			    (tcp_lro_rx(lro_ctrl, m, 0) == 0)) {
3064				/* successfully queued for LRO */
3065			} else {
3066				/*
3067				 * LRO not enabled, packet unsuitable for LRO,
3068				 * or unable to queue.  Pass it up right now in
3069				 * either case.
3070				 */
3071				struct ifnet *ifp = m->m_pkthdr.rcvif;
3072				(*ifp->if_input)(ifp, m);
3073			}
3074			rspq->rspq_mh.mh_head = NULL;
3075
3076		}
3077		__refill_fl_lt(adap, &qs->fl[0], 32);
3078		__refill_fl_lt(adap, &qs->fl[1], 32);
3079		--budget_left;
3080	}
3081
3082	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3083
3084	/* Flush LRO */
3085	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3086		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3087		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3088		tcp_lro_flush(lro_ctrl, queued);
3089	}
3090
3091	if (sleeping)
3092		check_ring_db(adap, qs, sleeping);
3093
3094	mb();  /* commit Tx queue processed updates */
3095	if (__predict_false(qs->txq_stopped > 1))
3096		restart_tx(qs);
3097
3098	__refill_fl_lt(adap, &qs->fl[0], 512);
3099	__refill_fl_lt(adap, &qs->fl[1], 512);
3100	budget -= budget_left;
3101	return (budget);
3102}
3103
3104/*
3105 * A helper function that processes responses and issues GTS.
3106 */
3107static __inline int
3108process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3109{
3110	int work;
3111	static int last_holdoff = 0;
3112
3113	work = process_responses(adap, rspq_to_qset(rq), -1);
3114
3115	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3116		printf("next_holdoff=%d\n", rq->next_holdoff);
3117		last_holdoff = rq->next_holdoff;
3118	}
3119	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3120	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3121
3122	return (work);
3123}
3124
3125
3126/*
3127 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3128 * Handles data events from SGE response queues as well as error and other
3129 * async events as they all use the same interrupt pin.  We use one SGE
3130 * response queue per port in this mode and protect all response queues with
3131 * queue 0's lock.
3132 */
3133void
3134t3b_intr(void *data)
3135{
3136	uint32_t i, map;
3137	adapter_t *adap = data;
3138	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3139
3140	t3_write_reg(adap, A_PL_CLI, 0);
3141	map = t3_read_reg(adap, A_SG_DATA_INTR);
3142
3143	if (!map)
3144		return;
3145
3146	if (__predict_false(map & F_ERRINTR))
3147		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3148
3149	mtx_lock(&q0->lock);
3150	for_each_port(adap, i)
3151	    if (map & (1 << i))
3152			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3153	mtx_unlock(&q0->lock);
3154}
3155
3156/*
3157 * The MSI interrupt handler.  This needs to handle data events from SGE
3158 * response queues as well as error and other async events as they all use
3159 * the same MSI vector.  We use one SGE response queue per port in this mode
3160 * and protect all response queues with queue 0's lock.
3161 */
3162void
3163t3_intr_msi(void *data)
3164{
3165	adapter_t *adap = data;
3166	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3167	int i, new_packets = 0;
3168
3169	mtx_lock(&q0->lock);
3170
3171	for_each_port(adap, i)
3172	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3173		    new_packets = 1;
3174	mtx_unlock(&q0->lock);
3175	if (new_packets == 0)
3176		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3177}
3178
3179void
3180t3_intr_msix(void *data)
3181{
3182	struct sge_qset *qs = data;
3183	adapter_t *adap = qs->port->adapter;
3184	struct sge_rspq *rspq = &qs->rspq;
3185
3186	if (process_responses_gts(adap, rspq) == 0)
3187		rspq->unhandled_irqs++;
3188}
3189
3190#define QDUMP_SBUF_SIZE		(32 * 400)
3191static int
3192t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3193{
3194	struct sge_rspq *rspq;
3195	struct sge_qset *qs;
3196	int i, err, dump_end, idx;
3197	static int multiplier = 1;
3198	struct sbuf *sb;
3199	struct rsp_desc *rspd;
3200	uint32_t data[4];
3201
3202	rspq = arg1;
3203	qs = rspq_to_qset(rspq);
3204	if (rspq->rspq_dump_count == 0)
3205		return (0);
3206	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3207		log(LOG_WARNING,
3208		    "dump count is too large %d\n", rspq->rspq_dump_count);
3209		rspq->rspq_dump_count = 0;
3210		return (EINVAL);
3211	}
3212	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3213		log(LOG_WARNING,
3214		    "dump start of %d is greater than queue size\n",
3215		    rspq->rspq_dump_start);
3216		rspq->rspq_dump_start = 0;
3217		return (EINVAL);
3218	}
3219	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3220	if (err)
3221		return (err);
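	/*
	 * Format the dump into a fixed-size sbuf; if it overflows, grow
	 * the buffer and retry.
	 */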
3222retry_sbufops:
3223	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3224
3225	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3226	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3227	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3228	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3229	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3230
3231	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3232	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3233
3234	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3235	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3236		idx = i & (RSPQ_Q_SIZE-1);
3237
3238		rspd = &rspq->desc[idx];
3239		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3240		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3241		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3242		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3243		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3244		    be32toh(rspd->len_cq), rspd->intr_gen);
3245	}
3246	if (sbuf_overflowed(sb)) {
3247		sbuf_delete(sb);
3248		multiplier++;
3249		goto retry_sbufops;
3250	}
3251	sbuf_finish(sb);
3252	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3253	sbuf_delete(sb);
3254	return (err);
3255}
3256
3257static int
3258t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3259{
3260	struct sge_txq *txq;
3261	struct sge_qset *qs;
3262	int i, j, err, dump_end;
3263	static int multiplier = 1;
3264	struct sbuf *sb;
3265	struct tx_desc *txd;
3266	uint32_t *WR, wr_hi, wr_lo, gen;
3267	uint32_t data[4];
3268
3269	txq = arg1;
3270	qs = txq_to_qset(txq, TXQ_ETH);
3271	if (txq->txq_dump_count == 0) {
3272		return (0);
3273	}
3274	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3275		log(LOG_WARNING,
3276		    "dump count is too large %d\n", txq->txq_dump_count);
3277		txq->txq_dump_count = 1;
3278		return (EINVAL);
3279	}
3280	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3281		log(LOG_WARNING,
3282		    "dump start of %d is greater than queue size\n",
3283		    txq->txq_dump_start);
3284		txq->txq_dump_start = 0;
3285		return (EINVAL);
3286	}
3287	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3288	if (err)
3289		return (err);
3290
3291
3292retry_sbufops:
3293	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3294
3295	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3296	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3297	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3298	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3299	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3300	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3301	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3302	    txq->txq_dump_start,
3303	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3304
3305	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3306	for (i = txq->txq_dump_start; i < dump_end; i++) {
3307		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3308		WR = (uint32_t *)txd->flit;
3309		wr_hi = ntohl(WR[0]);
3310		wr_lo = ntohl(WR[1]);
3311		gen = G_WR_GEN(wr_lo);
3312
3313		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3314		    wr_hi, wr_lo, gen);
3315		for (j = 2; j < 30; j += 4)
3316			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3317			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3318
3319	}
3320	if (sbuf_overflowed(sb)) {
3321		sbuf_delete(sb);
3322		multiplier++;
3323		goto retry_sbufops;
3324	}
3325	sbuf_finish(sb);
3326	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3327	sbuf_delete(sb);
3328	return (err);
3329}
3330
3331static int
3332t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3333{
3334	struct sge_txq *txq;
3335	struct sge_qset *qs;
3336	int i, j, err, dump_end;
3337	static int multiplier = 1;
3338	struct sbuf *sb;
3339	struct tx_desc *txd;
3340	uint32_t *WR, wr_hi, wr_lo, gen;
3341
3342	txq = arg1;
3343	qs = txq_to_qset(txq, TXQ_CTRL);
3344	if (txq->txq_dump_count == 0) {
3345		return (0);
3346	}
3347	if (txq->txq_dump_count > 256) {
3348		log(LOG_WARNING,
3349		    "dump count is too large %d\n", txq->txq_dump_count);
3350		txq->txq_dump_count = 1;
3351		return (EINVAL);
3352	}
3353	if (txq->txq_dump_start > 255) {
3354		log(LOG_WARNING,
3355		    "dump start of %d is greater than queue size\n",
3356		    txq->txq_dump_start);
3357		txq->txq_dump_start = 0;
3358		return (EINVAL);
3359	}
3360
3361retry_sbufops:
3362	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3363	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3364	    txq->txq_dump_start,
3365	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3366
3367	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3368	for (i = txq->txq_dump_start; i < dump_end; i++) {
3369		txd = &txq->desc[i & (255)];
3370		WR = (uint32_t *)txd->flit;
3371		wr_hi = ntohl(WR[0]);
3372		wr_lo = ntohl(WR[1]);
3373		gen = G_WR_GEN(wr_lo);
3374
3375		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3376		    wr_hi, wr_lo, gen);
3377		for (j = 2; j < 30; j += 4)
3378			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3379			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3380
3381	}
3382	if (sbuf_overflowed(sb)) {
3383		sbuf_delete(sb);
3384		multiplier++;
3385		goto retry_sbufops;
3386	}
3387	sbuf_finish(sb);
3388	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3389	sbuf_delete(sb);
3390	return (err);
3391}
3392
3393static int
3394t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3395{
3396	adapter_t *sc = arg1;
3397	struct qset_params *qsp = &sc->params.sge.qset[0];
3398	int coalesce_usecs;
3399	struct sge_qset *qs;
3400	int i, j, err, nqsets = 0;
3401	struct mtx *lock;
3402
3403	if ((sc->flags & FULL_INIT_DONE) == 0)
3404		return (ENXIO);
3405
3406	coalesce_usecs = qsp->coalesce_usecs;
3407	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3408
3409	if (err != 0) {
3410		return (err);
3411	}
3412	if (coalesce_usecs == qsp->coalesce_usecs)
3413		return (0);
3414
3415	for (i = 0; i < sc->params.nports; i++)
3416		for (j = 0; j < sc->port[i].nqsets; j++)
3417			nqsets++;
3418
3419	coalesce_usecs = max(1, coalesce_usecs);
3420
3421	for (i = 0; i < nqsets; i++) {
3422		qs = &sc->sge.qs[i];
3423		qsp = &sc->params.sge.qset[i];
3424		qsp->coalesce_usecs = coalesce_usecs;
3425
3426		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3427			    &sc->sge.qs[0].rspq.lock;
3428
3429		mtx_lock(lock);
3430		t3_update_qset_coalesce(qs, qsp);
3431		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3432		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3433		mtx_unlock(lock);
3434	}
3435
3436	return (0);
3437}
3438
3439
3440void
3441t3_add_attach_sysctls(adapter_t *sc)
3442{
3443	struct sysctl_ctx_list *ctx;
3444	struct sysctl_oid_list *children;
3445
3446	ctx = device_get_sysctl_ctx(sc->dev);
3447	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3448
3449	/* random information */
3450	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3451	    "firmware_version",
3452	    CTLFLAG_RD, &sc->fw_version,
3453	    0, "firmware version");
3454	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3455	    "hw_revision",
3456	    CTLFLAG_RD, &sc->params.rev,
3457	    0, "chip model");
3458	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3459	    "port_types",
3460	    CTLFLAG_RD, &sc->port_types,
3461	    0, "type of ports");
3462	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3463	    "enable_debug",
3464	    CTLFLAG_RW, &cxgb_debug,
3465	    0, "enable verbose debugging output");
3466	SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3467	    CTLFLAG_RD, &sc->tunq_coalesce,
3468	    "#tunneled packets freed");
3469	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3470	    "txq_overrun",
3471	    CTLFLAG_RD, &txq_fills,
3472	    0, "#times txq overrun");
3473}
3474
3475
3476static const char *rspq_name = "rspq";
3477static const char *txq_names[] =
3478{
3479	"txq_eth",
3480	"txq_ofld",
3481	"txq_ctrl"
3482};
3483
3484static int
3485sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3486{
3487	struct port_info *p = arg1;
3488	uint64_t *parg;
3489
3490	if (!p)
3491		return (EINVAL);
3492
3493	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3494	PORT_LOCK(p);
3495	t3_mac_update_stats(&p->mac);
3496	PORT_UNLOCK(p);
3497
3498	return (sysctl_handle_quad(oidp, parg, 0, req));
3499}
3500
3501void
3502t3_add_configured_sysctls(adapter_t *sc)
3503{
3504	struct sysctl_ctx_list *ctx;
3505	struct sysctl_oid_list *children;
3506	int i, j;
3507
3508	ctx = device_get_sysctl_ctx(sc->dev);
3509	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3510
3511	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3512	    "intr_coal",
3513	    CTLTYPE_INT|CTLFLAG_RW, sc,
3514	    0, t3_set_coalesce_usecs,
3515	    "I", "interrupt coalescing timer (us)");
3516
3517	for (i = 0; i < sc->params.nports; i++) {
3518		struct port_info *pi = &sc->port[i];
3519		struct sysctl_oid *poid;
3520		struct sysctl_oid_list *poidlist;
3521		struct mac_stats *mstats = &pi->mac.stats;
3522
3523		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3524		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3525		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3526		poidlist = SYSCTL_CHILDREN(poid);
3527		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3528		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3529		    0, "#queue sets");
3530
3531		for (j = 0; j < pi->nqsets; j++) {
3532			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3533			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3534					  *ctrlqpoid, *lropoid;
3535			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3536					       *txqpoidlist, *ctrlqpoidlist,
3537					       *lropoidlist;
3538			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3539
3540			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3541
3542			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3543			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3544			qspoidlist = SYSCTL_CHILDREN(qspoid);
3545
3546			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3547					CTLFLAG_RD, &qs->fl[0].empty, 0,
3548					"freelist #0 empty");
3549			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3550					CTLFLAG_RD, &qs->fl[1].empty, 0,
3551					"freelist #1 empty");
3552
3553			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3554			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3555			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3556
3557			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3558			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3559			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3560
3561			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3562			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3563			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3564
3565			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3566			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3567			lropoidlist = SYSCTL_CHILDREN(lropoid);
3568
3569			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3570			    CTLFLAG_RD, &qs->rspq.size,
3571			    0, "#entries in response queue");
3572			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3573			    CTLFLAG_RD, &qs->rspq.cidx,
3574			    0, "consumer index");
3575			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3576			    CTLFLAG_RD, &qs->rspq.credits,
3577			    0, "#credits");
3578			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3579			    CTLFLAG_RD, &qs->rspq.phys_addr,
3580	    "physical address of the queue");
3581			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3582			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3583			    0, "start rspq dump entry");
3584			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3585			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3586			    0, "#rspq entries to dump");
3587			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3588			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3589			    0, t3_dump_rspq, "A", "dump of the response queue");
3590
3591
3592			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3593			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3594			    0, "#tunneled packets dropped");
3595			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3596			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3597			    0, "#tunneled packets waiting to be sent");
3598#if 0
3599			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3600			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3601			    0, "#tunneled packets queue producer index");
3602			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3603			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3604			    0, "#tunneled packets queue consumer index");
3605#endif
3606			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3607			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3608			    0, "#tunneled packets processed by the card");
3609			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3610			    CTLFLAG_RD, &txq->cleaned,
3611			    0, "#tunneled packets cleaned");
3612			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3613			    CTLFLAG_RD, &txq->in_use,
3614			    0, "#tunneled packet slots in use");
3615			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3616			    CTLFLAG_RD, &txq->txq_frees,
3617			    "#tunneled packets freed");
3618			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3619			    CTLFLAG_RD, &txq->txq_skipped,
3620			    0, "#tunneled packet descriptors skipped");
3621			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3622			    CTLFLAG_RD, &txq->txq_coalesced,
3623			    "#tunneled packets coalesced");
3624			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3625			    CTLFLAG_RD, &txq->txq_enqueued,
3626			    0, "#tunneled packets enqueued to hardware");
3627			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3628			    CTLFLAG_RD, &qs->txq_stopped,
3629			    0, "tx queues stopped");
3630			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3631			    CTLFLAG_RD, &txq->phys_addr,
3632	    "physical address of the queue");
3633			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3634			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3635			    0, "txq generation");
3636			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3637			    CTLFLAG_RD, &txq->cidx,
3638			    0, "hardware queue cidx");
3639			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3640			    CTLFLAG_RD, &txq->pidx,
3641			    0, "hardware queue pidx");
3642			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3643			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3644			    0, "txq start idx for dump");
3645			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3646			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3647			    0, "txq #entries to dump");
3648			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3649			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3650			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3651
3652			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3653			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3654			    0, "ctrlq start idx for dump");
3655			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3656			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3657			    0, "ctrl #entries to dump");
3658			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3659			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3660			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");
3661
3662			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3663			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, "#LRO packets queued");
3664			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3665			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, "#LRO packets flushed");
3666			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3667			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, "#LRO packets with bad checksum");
3668			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3669			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, "#LRO entries");
3670		}
3671
3672		/* Now add a node for mac stats. */
3673		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3674		    CTLFLAG_RD, NULL, "MAC statistics");
3675		poidlist = SYSCTL_CHILDREN(poid);
3676
3677		/*
3678		 * We (ab)use the length argument (arg2) to pass on the offset
3679		 * of the data that we are interested in.  This is only required
3680		 * for the quad counters that are updated from the hardware (we
3681		 * make sure that we return the latest value).
3682		 * sysctl_handle_macstat first updates *all* the counters from
3683		 * the hardware, and then returns the latest value of the
3684		 * requested counter.  Best would be to update only the
3685		 * requested counter from hardware, but t3_mac_update_stats()
3686		 * hides all the register details and we don't want to dive into
3687		 * all that here.
3688		 */
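		/*
		 * For illustration, a minimal sketch of such an offset-based
		 * handler is shown below (compiled out).  It is not the
		 * driver's actual sysctl_handle_macstat; the p->mac.stats
		 * layout and the absence of locking are assumptions made to
		 * keep the example short.
		 */
#if 0
static int
example_handle_macstat(SYSCTL_HANDLER_ARGS)
{
	struct port_info *p = arg1;	/* arg1: the port (see macro below) */
	uint64_t *counter;

	if (p == NULL)
		return (EINVAL);

	/* Refresh every MAC counter from hardware... */
	t3_mac_update_stats(&p->mac);

	/* ...then return the one whose byte offset was passed in arg2. */
	counter = (uint64_t *)((char *)&p->mac.stats + arg2);
	return (sysctl_handle_quad(oidp, counter, 0, req));
}
#endif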
3689#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3690    (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3691    sysctl_handle_macstat, "QU", 0)
3692		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3693		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3694		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3695		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3696		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3697		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3698		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3699		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3700		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3701		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3702		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3703		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3704		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3705		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3706		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3707		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3708		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3709		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3710		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3711		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3712		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3713		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3714		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3715		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3716		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3717		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3718		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3719		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3720		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3721		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3722		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3723		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3724		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3725		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3726		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3727		CXGB_SYSCTL_ADD_QUAD(rx_short);
3728		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3729		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3730		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3731		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3732		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3733		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3734		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3735		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3736		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3737		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3738#undef CXGB_SYSCTL_ADD_QUAD
3739
3740#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3741    CTLFLAG_RD, &mstats->a, 0)
3742		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3743		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3744		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3745		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3746		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3747		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3748		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3749		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3750		CXGB_SYSCTL_ADD_ULONG(num_resets);
3751		CXGB_SYSCTL_ADD_ULONG(link_faults);
3752#undef CXGB_SYSCTL_ADD_ULONG
3753	}
3754}
3755
3756/**
3757 *	t3_get_desc - dump an SGE descriptor for debugging purposes
3758 *	@qs: the queue set
3759 *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
3760 *	@idx: the descriptor index in the queue
3761 *	@data: where to dump the descriptor contents
3762 *
3763 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3764 *	size of the descriptor.
3765 */
3766int
3767t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3768		unsigned char *data)
3769{
3770	if (qnum >= 6)
3771		return (EINVAL);
3772
3773	if (qnum < 3) {
3774		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3775			return (EINVAL);
3776		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3777		return (sizeof(struct tx_desc));
3778	}
3779
3780	if (qnum == 3) {
3781		if (!qs->rspq.desc || idx >= qs->rspq.size)
3782			return (EINVAL);
3783		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3784		return (sizeof(struct rsp_desc));
3785	}
3786
3787	qnum -= 4;
3788	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3789		return (EINVAL);
3790	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3791	return (sizeof(struct rx_desc));
3792}
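/*
 * Illustrative (compiled-out) use of t3_get_desc(): copy one descriptor of
 * the first Tx queue into a local buffer and log its size.  Queue numbers
 * follow the mapping documented above: 0..2 select the Tx queues, 3 the
 * response queue, and 4..5 the free lists.  The example_* name and the use
 * of log(9) here are illustrative choices, not part of the driver.
 */
#if 0
static void
example_dump_one_desc(const struct sge_qset *qs, unsigned int idx)
{
	unsigned char buf[sizeof(struct tx_desc)];	/* large enough for qnum 0 */
	int len;

	len = t3_get_desc(qs, 0, idx, buf);
	if (len == EINVAL) {
		/* Bad queue number, missing ring, or out-of-range index. */
		log(LOG_DEBUG, "descriptor %u unavailable\n", idx);
		return;
	}
	log(LOG_DEBUG, "descriptor %u copied, %d bytes\n", idx, len);
}
#endif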
3793