cxgb_sge.c revision 217321
1/**************************************************************************
2
3Copyright (c) 2007-2009, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/cxgb_sge.c 217321 2011-01-12 19:53:44Z mdf $");
32
33#include "opt_inet.h"
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/module.h>
39#include <sys/bus.h>
40#include <sys/conf.h>
41#include <machine/bus.h>
42#include <machine/resource.h>
43#include <sys/bus_dma.h>
44#include <sys/rman.h>
45#include <sys/queue.h>
46#include <sys/sysctl.h>
47#include <sys/taskqueue.h>
48
49#include <sys/proc.h>
50#include <sys/sbuf.h>
51#include <sys/sched.h>
52#include <sys/smp.h>
53#include <sys/systm.h>
54#include <sys/syslog.h>
55#include <sys/socket.h>
56
57#include <net/bpf.h>
58#include <net/ethernet.h>
59#include <net/if.h>
60#include <net/if_vlan_var.h>
61
62#include <netinet/in_systm.h>
63#include <netinet/in.h>
64#include <netinet/ip.h>
65#include <netinet/tcp.h>
66
67#include <dev/pci/pcireg.h>
68#include <dev/pci/pcivar.h>
69
70#include <vm/vm.h>
71#include <vm/pmap.h>
72
73#include <cxgb_include.h>
74#include <sys/mvec.h>
75
76int	txq_fills = 0;
77int	multiq_tx_enable = 1;
78
79extern struct sysctl_oid_list sysctl__hw_cxgb_children;
80int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
81TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
82SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
83    "size of per-queue mbuf ring");
84
85static int cxgb_tx_coalesce_force = 0;
86TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
87SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
88    &cxgb_tx_coalesce_force, 0,
89    "coalesce small packets into a single work request regardless of ring state");
90
91#define	COALESCE_START_DEFAULT		TX_ETH_Q_SIZE>>1
92#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
93#define	COALESCE_STOP_DEFAULT		TX_ETH_Q_SIZE>>2
94#define	COALESCE_STOP_MIN		TX_ETH_Q_SIZE>>5
95#define	TX_RECLAIM_DEFAULT		TX_ETH_Q_SIZE>>5
96#define	TX_RECLAIM_MAX			TX_ETH_Q_SIZE>>2
97#define	TX_RECLAIM_MIN			TX_ETH_Q_SIZE>>6
98
99
100static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
101TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
102    &cxgb_tx_coalesce_enable_start);
103SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
104    &cxgb_tx_coalesce_enable_start, 0,
105    "coalesce enable threshold");
106static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
107TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
108SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
109    &cxgb_tx_coalesce_enable_stop, 0,
110    "coalesce disable threshold");
111static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
112TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
113SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
114    &cxgb_tx_reclaim_threshold, 0,
115    "tx cleaning minimum threshold");
116
117/*
118 * XXX don't re-enable this until TOE stops assuming
119 * we have an m_ext
120 */
121static int recycle_enable = 0;
122
123extern int cxgb_use_16k_clusters;
124extern int nmbjumbop;
125extern int nmbjumbo9;
126extern int nmbjumbo16;
127
128#define USE_GTS 0
129
130#define SGE_RX_SM_BUF_SIZE	1536
131#define SGE_RX_DROP_THRES	16
132#define SGE_RX_COPY_THRES	128
133
134/*
135 * Period of the Tx buffer reclaim timer.  This timer does not need to run
136 * frequently as Tx buffers are usually reclaimed by new Tx packets.
137 */
138#define TX_RECLAIM_PERIOD       (hz >> 1)
139
140/*
141 * Values for sge_txq.flags
142 */
143enum {
144	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
145	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
146};
147
148struct tx_desc {
149	uint64_t	flit[TX_DESC_FLITS];
150} __packed;
151
152struct rx_desc {
153	uint32_t	addr_lo;
154	uint32_t	len_gen;
155	uint32_t	gen2;
156	uint32_t	addr_hi;
157} __packed;
158
159struct rsp_desc {               /* response queue descriptor */
160	struct rss_header	rss_hdr;
161	uint32_t		flags;
162	uint32_t		len_cq;
163	uint8_t			imm_data[47];
164	uint8_t			intr_gen;
165} __packed;
166
167#define RX_SW_DESC_MAP_CREATED	(1 << 0)
168#define TX_SW_DESC_MAP_CREATED	(1 << 1)
169#define RX_SW_DESC_INUSE        (1 << 3)
170#define TX_SW_DESC_MAPPED       (1 << 4)
171
172#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
173#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
174#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
175#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
176
177struct tx_sw_desc {                /* SW state per Tx descriptor */
178	struct mbuf	*m;
179	bus_dmamap_t	map;
180	int		flags;
181};
182
183struct rx_sw_desc {                /* SW state per Rx descriptor */
184	caddr_t		rxsd_cl;
185	struct mbuf	*m;
186	bus_dmamap_t	map;
187	int		flags;
188};
189
190struct txq_state {
191	unsigned int	compl;
192	unsigned int	gen;
193	unsigned int	pidx;
194};
195
196struct refill_fl_cb_arg {
197	int               error;
198	bus_dma_segment_t seg;
199	int               nseg;
200};
201
202
203/*
204 * Maps a number of flits to the number of Tx descriptors that can hold them.
205 * The formula is
206 *
207 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
208 *
209 * HW allows up to 4 descriptors to be combined into a WR.
210 */
211static uint8_t flit_desc_map[] = {
212	0,
213#if SGE_NUM_GENBITS == 1
214	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
216	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
217	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
218#elif SGE_NUM_GENBITS == 2
219	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
220	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
222	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
223#else
224# error "SGE_NUM_GENBITS must be 1 or 2"
225#endif
226};
227
228#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
229#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
230#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
231#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
232#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
233#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
234	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
235#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
237	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
238#define	TXQ_RING_DEQUEUE(qs) \
239	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
240
241int cxgb_debug = 0;
242
243static void sge_timer_cb(void *arg);
244static void sge_timer_reclaim(void *arg, int ncount);
245static void sge_txq_reclaim_handler(void *arg, int ncount);
246static void cxgb_start_locked(struct sge_qset *qs);
247
248/*
249 * XXX need to cope with bursty scheduling by looking at a wider
250 * window than we are now for determining the need for coalescing
251 *
252 */
253static __inline uint64_t
254check_pkt_coalesce(struct sge_qset *qs)
255{
256        struct adapter *sc;
257        struct sge_txq *txq;
258	uint8_t *fill;
259
260	if (__predict_false(cxgb_tx_coalesce_force))
261		return (1);
262	txq = &qs->txq[TXQ_ETH];
263        sc = qs->port->adapter;
264	fill = &sc->tunq_fill[qs->idx];
265
266	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
267		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
268	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
269		cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN;
270	/*
271	 * if the hardware transmit queue is more than 1/8 full
272	 * we mark it as coalescing - we drop back from coalescing
273	 * when we go below 1/32 full and there are no packets enqueued,
274	 * this provides us with some degree of hysteresis
275	 */
276        if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
277	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
278                *fill = 0;
279        else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
280                *fill = 1;
281
282	return (sc->tunq_coalesce);
283}
284
285#ifdef __LP64__
286static void
287set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
288{
289	uint64_t wr_hilo;
290#if _BYTE_ORDER == _LITTLE_ENDIAN
291	wr_hilo = wr_hi;
292	wr_hilo |= (((uint64_t)wr_lo)<<32);
293#else
294	wr_hilo = wr_lo;
295	wr_hilo |= (((uint64_t)wr_hi)<<32);
296#endif
297	wrp->wrh_hilo = wr_hilo;
298}
299#else
300static void
301set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
302{
303
304	wrp->wrh_hi = wr_hi;
305	wmb();
306	wrp->wrh_lo = wr_lo;
307}
308#endif
309
310struct coalesce_info {
311	int count;
312	int nbytes;
313};
314
315static int
316coalesce_check(struct mbuf *m, void *arg)
317{
318	struct coalesce_info *ci = arg;
319	int *count = &ci->count;
320	int *nbytes = &ci->nbytes;
321
322	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
323		(*count < 7) && (m->m_next == NULL))) {
324		*count += 1;
325		*nbytes += m->m_len;
326		return (1);
327	}
328	return (0);
329}
330
331static struct mbuf *
332cxgb_dequeue(struct sge_qset *qs)
333{
334	struct mbuf *m, *m_head, *m_tail;
335	struct coalesce_info ci;
336
337
338	if (check_pkt_coalesce(qs) == 0)
339		return TXQ_RING_DEQUEUE(qs);
340
341	m_head = m_tail = NULL;
342	ci.count = ci.nbytes = 0;
343	do {
344		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
345		if (m_head == NULL) {
346			m_tail = m_head = m;
347		} else if (m != NULL) {
348			m_tail->m_nextpkt = m;
349			m_tail = m;
350		}
351	} while (m != NULL);
352	if (ci.count > 7)
353		panic("trying to coalesce %d packets in to one WR", ci.count);
354	return (m_head);
355}
356
357/**
358 *	reclaim_completed_tx - reclaims completed Tx descriptors
359 *	@adapter: the adapter
360 *	@q: the Tx queue to reclaim completed descriptors from
361 *
362 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
363 *	and frees the associated buffers if possible.  Called with the Tx
364 *	queue's lock held.
365 */
366static __inline int
367reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
368{
369	struct sge_txq *q = &qs->txq[queue];
370	int reclaim = desc_reclaimable(q);
371
372	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
373	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
374		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
375
376	if (reclaim < reclaim_min)
377		return (0);
378
379	mtx_assert(&qs->lock, MA_OWNED);
380	if (reclaim > 0) {
381		t3_free_tx_desc(qs, reclaim, queue);
382		q->cleaned += reclaim;
383		q->in_use -= reclaim;
384	}
385	if (isset(&qs->txq_stopped, TXQ_ETH))
386                clrbit(&qs->txq_stopped, TXQ_ETH);
387
388	return (reclaim);
389}
390
391/**
392 *	should_restart_tx - are there enough resources to restart a Tx queue?
393 *	@q: the Tx queue
394 *
395 *	Checks if there are enough descriptors to restart a suspended Tx queue.
396 */
397static __inline int
398should_restart_tx(const struct sge_txq *q)
399{
400	unsigned int r = q->processed - q->cleaned;
401
402	return q->in_use - r < (q->size >> 1);
403}
404
405/**
406 *	t3_sge_init - initialize SGE
407 *	@adap: the adapter
408 *	@p: the SGE parameters
409 *
410 *	Performs SGE initialization needed every time after a chip reset.
411 *	We do not initialize any of the queue sets here, instead the driver
412 *	top-level must request those individually.  We also do not enable DMA
413 *	here, that should be done after the queues have been set up.
414 */
415void
416t3_sge_init(adapter_t *adap, struct sge_params *p)
417{
418	u_int ctrl, ups;
419
420	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
421
422	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
423	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
424	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
425	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
426#if SGE_NUM_GENBITS == 1
427	ctrl |= F_EGRGENCTRL;
428#endif
429	if (adap->params.rev > 0) {
430		if (!(adap->flags & (USING_MSIX | USING_MSI)))
431			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
432	}
433	t3_write_reg(adap, A_SG_CONTROL, ctrl);
434	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
435		     V_LORCQDRBTHRSH(512));
436	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
437	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
438		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
439	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
440		     adap->params.rev < T3_REV_C ? 1000 : 500);
441	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
442	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
443	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
444	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
445	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
446}
447
448
449/**
450 *	sgl_len - calculates the size of an SGL of the given capacity
451 *	@n: the number of SGL entries
452 *
453 *	Calculates the number of flits needed for a scatter/gather list that
454 *	can hold the given number of entries.
455 */
456static __inline unsigned int
457sgl_len(unsigned int n)
458{
459	return ((3 * n) / 2 + (n & 1));
460}
461
462/**
463 *	get_imm_packet - return the next ingress packet buffer from a response
464 *	@resp: the response descriptor containing the packet data
465 *
466 *	Return a packet containing the immediate data of the given response.
467 */
468static int
469get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
470{
471
472	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
473	m->m_ext.ext_buf = NULL;
474	m->m_ext.ext_type = 0;
475	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
476	return (0);
477}
478
479static __inline u_int
480flits_to_desc(u_int n)
481{
482	return (flit_desc_map[n]);
483}
484
485#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
486		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
487		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
488		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
489		    F_HIRCQPARITYERROR)
490#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
491#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
492		      F_RSPQDISABLED)
493
494/**
495 *	t3_sge_err_intr_handler - SGE async event interrupt handler
496 *	@adapter: the adapter
497 *
498 *	Interrupt handler for SGE asynchronous (non-data) events.
499 */
500void
501t3_sge_err_intr_handler(adapter_t *adapter)
502{
503	unsigned int v, status;
504
505	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
506	if (status & SGE_PARERR)
507		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
508			 status & SGE_PARERR);
509	if (status & SGE_FRAMINGERR)
510		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
511			 status & SGE_FRAMINGERR);
512	if (status & F_RSPQCREDITOVERFOW)
513		CH_ALERT(adapter, "SGE response queue credit overflow\n");
514
515	if (status & F_RSPQDISABLED) {
516		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
517
518		CH_ALERT(adapter,
519			 "packet delivered to disabled response queue (0x%x)\n",
520			 (v >> S_RSPQ0DISABLED) & 0xff);
521	}
522
523	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
524	if (status & SGE_FATALERR)
525		t3_fatal_err(adapter);
526}
527
528void
529t3_sge_prep(adapter_t *adap, struct sge_params *p)
530{
531	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
532
533	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
534	nqsets *= adap->params.nports;
535
536	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
537
538	while (!powerof2(fl_q_size))
539		fl_q_size--;
540
541	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
542	    is_offload(adap);
543
544#if __FreeBSD_version >= 700111
545	if (use_16k) {
546		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
547		jumbo_buf_size = MJUM16BYTES;
548	} else {
549		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
550		jumbo_buf_size = MJUM9BYTES;
551	}
552#else
553	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
554	jumbo_buf_size = MJUMPAGESIZE;
555#endif
556	while (!powerof2(jumbo_q_size))
557		jumbo_q_size--;
558
559	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
560		device_printf(adap->dev,
561		    "Insufficient clusters and/or jumbo buffers.\n");
562
563	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
564
565	for (i = 0; i < SGE_QSETS; ++i) {
566		struct qset_params *q = p->qset + i;
567
568		if (adap->params.nports > 2) {
569			q->coalesce_usecs = 50;
570		} else {
571#ifdef INVARIANTS
572			q->coalesce_usecs = 10;
573#else
574			q->coalesce_usecs = 5;
575#endif
576		}
577		q->polling = 0;
578		q->rspq_size = RSPQ_Q_SIZE;
579		q->fl_size = fl_q_size;
580		q->jumbo_size = jumbo_q_size;
581		q->jumbo_buf_size = jumbo_buf_size;
582		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
583		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
584		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
585		q->cong_thres = 0;
586	}
587}
588
589int
590t3_sge_alloc(adapter_t *sc)
591{
592
593	/* The parent tag. */
594	if (bus_dma_tag_create( NULL,			/* parent */
595				1, 0,			/* algnmnt, boundary */
596				BUS_SPACE_MAXADDR,	/* lowaddr */
597				BUS_SPACE_MAXADDR,	/* highaddr */
598				NULL, NULL,		/* filter, filterarg */
599				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
600				BUS_SPACE_UNRESTRICTED, /* nsegments */
601				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
602				0,			/* flags */
603				NULL, NULL,		/* lock, lockarg */
604				&sc->parent_dmat)) {
605		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
606		return (ENOMEM);
607	}
608
609	/*
610	 * DMA tag for normal sized RX frames
611	 */
612	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
613		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
614		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
615		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
616		return (ENOMEM);
617	}
618
619	/*
620	 * DMA tag for jumbo sized RX frames.
621	 */
622	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
623		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
624		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
625		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
626		return (ENOMEM);
627	}
628
629	/*
630	 * DMA tag for TX frames.
631	 */
632	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
633		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
634		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
635		NULL, NULL, &sc->tx_dmat)) {
636		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
637		return (ENOMEM);
638	}
639
640	return (0);
641}
642
643int
644t3_sge_free(struct adapter * sc)
645{
646
647	if (sc->tx_dmat != NULL)
648		bus_dma_tag_destroy(sc->tx_dmat);
649
650	if (sc->rx_jumbo_dmat != NULL)
651		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
652
653	if (sc->rx_dmat != NULL)
654		bus_dma_tag_destroy(sc->rx_dmat);
655
656	if (sc->parent_dmat != NULL)
657		bus_dma_tag_destroy(sc->parent_dmat);
658
659	return (0);
660}
661
662void
663t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
664{
665
666	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
667	qs->rspq.polling = 0 /* p->polling */;
668}
669
670#if !defined(__i386__) && !defined(__amd64__)
671static void
672refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
673{
674	struct refill_fl_cb_arg *cb_arg = arg;
675
676	cb_arg->error = error;
677	cb_arg->seg = segs[0];
678	cb_arg->nseg = nseg;
679
680}
681#endif
682/**
683 *	refill_fl - refill an SGE free-buffer list
684 *	@sc: the controller softc
685 *	@q: the free-list to refill
686 *	@n: the number of new buffers to allocate
687 *
688 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
689 *	The caller must assure that @n does not exceed the queue's capacity.
690 */
691static void
692refill_fl(adapter_t *sc, struct sge_fl *q, int n)
693{
694	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
695	struct rx_desc *d = &q->desc[q->pidx];
696	struct refill_fl_cb_arg cb_arg;
697	struct mbuf *m;
698	caddr_t cl;
699	int err;
700
701	cb_arg.error = 0;
702	while (n--) {
703		/*
704		 * We only allocate a cluster, mbuf allocation happens after rx
705		 */
706		if (q->zone == zone_pack) {
707			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
708				break;
709			cl = m->m_ext.ext_buf;
710		} else {
711			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
712				break;
713			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
714				uma_zfree(q->zone, cl);
715				break;
716			}
717		}
718		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
719			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
720				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
721				uma_zfree(q->zone, cl);
722				goto done;
723			}
724			sd->flags |= RX_SW_DESC_MAP_CREATED;
725		}
726#if !defined(__i386__) && !defined(__amd64__)
727		err = bus_dmamap_load(q->entry_tag, sd->map,
728		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
729
730		if (err != 0 || cb_arg.error) {
731			if (q->zone == zone_pack)
732				uma_zfree(q->zone, cl);
733			m_free(m);
734			goto done;
735		}
736#else
737		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
738#endif
739		sd->flags |= RX_SW_DESC_INUSE;
740		sd->rxsd_cl = cl;
741		sd->m = m;
742		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
743		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
744		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
745		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
746
747		d++;
748		sd++;
749
750		if (++q->pidx == q->size) {
751			q->pidx = 0;
752			q->gen ^= 1;
753			sd = q->sdesc;
754			d = q->desc;
755		}
756		q->credits++;
757		q->db_pending++;
758	}
759
760done:
761	if (q->db_pending >= 32) {
762		q->db_pending = 0;
763		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
764	}
765}
766
767
768/**
769 *	free_rx_bufs - free the Rx buffers on an SGE free list
770 *	@sc: the controle softc
771 *	@q: the SGE free list to clean up
772 *
773 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
774 *	this queue should be stopped before calling this function.
775 */
776static void
777free_rx_bufs(adapter_t *sc, struct sge_fl *q)
778{
779	u_int cidx = q->cidx;
780
781	while (q->credits--) {
782		struct rx_sw_desc *d = &q->sdesc[cidx];
783
784		if (d->flags & RX_SW_DESC_INUSE) {
785			bus_dmamap_unload(q->entry_tag, d->map);
786			bus_dmamap_destroy(q->entry_tag, d->map);
787			if (q->zone == zone_pack) {
788				m_init(d->m, zone_pack, MCLBYTES,
789				    M_NOWAIT, MT_DATA, M_EXT);
790				uma_zfree(zone_pack, d->m);
791			} else {
792				m_init(d->m, zone_mbuf, MLEN,
793				    M_NOWAIT, MT_DATA, 0);
794				uma_zfree(zone_mbuf, d->m);
795				uma_zfree(q->zone, d->rxsd_cl);
796			}
797		}
798
799		d->rxsd_cl = NULL;
800		d->m = NULL;
801		if (++cidx == q->size)
802			cidx = 0;
803	}
804}
805
806static __inline void
807__refill_fl(adapter_t *adap, struct sge_fl *fl)
808{
809	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
810}
811
812static __inline void
813__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
814{
815	uint32_t reclaimable = fl->size - fl->credits;
816
817	if (reclaimable > 0)
818		refill_fl(adap, fl, min(max, reclaimable));
819}
820
821/**
822 *	recycle_rx_buf - recycle a receive buffer
823 *	@adapter: the adapter
824 *	@q: the SGE free list
825 *	@idx: index of buffer to recycle
826 *
827 *	Recycles the specified buffer on the given free list by adding it at
828 *	the next available slot on the list.
829 */
830static void
831recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
832{
833	struct rx_desc *from = &q->desc[idx];
834	struct rx_desc *to   = &q->desc[q->pidx];
835
836	q->sdesc[q->pidx] = q->sdesc[idx];
837	to->addr_lo = from->addr_lo;        // already big endian
838	to->addr_hi = from->addr_hi;        // likewise
839	wmb();	/* necessary ? */
840	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
841	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
842	q->credits++;
843
844	if (++q->pidx == q->size) {
845		q->pidx = 0;
846		q->gen ^= 1;
847	}
848	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
849}
850
851static void
852alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
853{
854	uint32_t *addr;
855
856	addr = arg;
857	*addr = segs[0].ds_addr;
858}
859
860static int
861alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
862    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
863    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
864{
865	size_t len = nelem * elem_size;
866	void *s = NULL;
867	void *p = NULL;
868	int err;
869
870	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
871				      BUS_SPACE_MAXADDR_32BIT,
872				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
873				      len, 0, NULL, NULL, tag)) != 0) {
874		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
875		return (ENOMEM);
876	}
877
878	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
879				    map)) != 0) {
880		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
881		return (ENOMEM);
882	}
883
884	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
885	bzero(p, len);
886	*(void **)desc = p;
887
888	if (sw_size) {
889		len = nelem * sw_size;
890		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
891		*(void **)sdesc = s;
892	}
893	if (parent_entry_tag == NULL)
894		return (0);
895
896	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
897				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
898		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
899				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
900		                      NULL, NULL, entry_tag)) != 0) {
901		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
902		return (ENOMEM);
903	}
904	return (0);
905}
906
907static void
908sge_slow_intr_handler(void *arg, int ncount)
909{
910	adapter_t *sc = arg;
911
912	t3_slow_intr_handler(sc);
913	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
914	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
915}
916
917/**
918 *	sge_timer_cb - perform periodic maintenance of an SGE qset
919 *	@data: the SGE queue set to maintain
920 *
921 *	Runs periodically from a timer to perform maintenance of an SGE queue
922 *	set.  It performs two tasks:
923 *
924 *	a) Cleans up any completed Tx descriptors that may still be pending.
925 *	Normal descriptor cleanup happens when new packets are added to a Tx
926 *	queue so this timer is relatively infrequent and does any cleanup only
927 *	if the Tx queue has not seen any new packets in a while.  We make a
928 *	best effort attempt to reclaim descriptors, in that we don't wait
929 *	around if we cannot get a queue's lock (which most likely is because
930 *	someone else is queueing new packets and so will also handle the clean
931 *	up).  Since control queues use immediate data exclusively we don't
932 *	bother cleaning them up here.
933 *
934 *	b) Replenishes Rx queues that have run out due to memory shortage.
935 *	Normally new Rx buffers are added when existing ones are consumed but
936 *	when out of memory a queue can become empty.  We try to add only a few
937 *	buffers here, the queue will be replenished fully as these new buffers
938 *	are used up if memory shortage has subsided.
939 *
940 *	c) Return coalesced response queue credits in case a response queue is
941 *	starved.
942 *
943 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
944 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
945 */
946static void
947sge_timer_cb(void *arg)
948{
949	adapter_t *sc = arg;
950	if ((sc->flags & USING_MSIX) == 0) {
951
952		struct port_info *pi;
953		struct sge_qset *qs;
954		struct sge_txq  *txq;
955		int i, j;
956		int reclaim_ofl, refill_rx;
957
958		if (sc->open_device_map == 0)
959			return;
960
961		for (i = 0; i < sc->params.nports; i++) {
962			pi = &sc->port[i];
963			for (j = 0; j < pi->nqsets; j++) {
964				qs = &sc->sge.qs[pi->first_qset + j];
965				txq = &qs->txq[0];
966				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
967				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
968				    (qs->fl[1].credits < qs->fl[1].size));
969				if (reclaim_ofl || refill_rx) {
970					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
971					break;
972				}
973			}
974		}
975	}
976
977	if (sc->params.nports > 2) {
978		int i;
979
980		for_each_port(sc, i) {
981			struct port_info *pi = &sc->port[i];
982
983			t3_write_reg(sc, A_SG_KDOORBELL,
984				     F_SELEGRCNTX |
985				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
986		}
987	}
988	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
989	    sc->open_device_map != 0)
990		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
991}
992
993/*
994 * This is meant to be a catch-all function to keep sge state private
995 * to sge.c
996 *
997 */
998int
999t3_sge_init_adapter(adapter_t *sc)
1000{
1001	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
1002	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1003	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1004	return (0);
1005}
1006
1007int
1008t3_sge_reset_adapter(adapter_t *sc)
1009{
1010	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1011	return (0);
1012}
1013
1014int
1015t3_sge_init_port(struct port_info *pi)
1016{
1017	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1018	return (0);
1019}
1020
1021/**
1022 *	refill_rspq - replenish an SGE response queue
1023 *	@adapter: the adapter
1024 *	@q: the response queue to replenish
1025 *	@credits: how many new responses to make available
1026 *
1027 *	Replenishes a response queue by making the supplied number of responses
1028 *	available to HW.
1029 */
1030static __inline void
1031refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1032{
1033
1034	/* mbufs are allocated on demand when a rspq entry is processed. */
1035	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1036		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1037}
1038
1039static void
1040sge_txq_reclaim_handler(void *arg, int ncount)
1041{
1042	struct sge_qset *qs = arg;
1043	int i;
1044
1045	for (i = 0; i < 3; i++)
1046		reclaim_completed_tx(qs, 16, i);
1047}
1048
1049static void
1050sge_timer_reclaim(void *arg, int ncount)
1051{
1052	struct port_info *pi = arg;
1053	int i, nqsets = pi->nqsets;
1054	adapter_t *sc = pi->adapter;
1055	struct sge_qset *qs;
1056	struct mtx *lock;
1057
1058	KASSERT((sc->flags & USING_MSIX) == 0,
1059	    ("can't call timer reclaim for msi-x"));
1060
1061	for (i = 0; i < nqsets; i++) {
1062		qs = &sc->sge.qs[pi->first_qset + i];
1063
1064		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1065		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1066			    &sc->sge.qs[0].rspq.lock;
1067
1068		if (mtx_trylock(lock)) {
1069			/* XXX currently assume that we are *NOT* polling */
1070			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1071
1072			if (qs->fl[0].credits < qs->fl[0].size - 16)
1073				__refill_fl(sc, &qs->fl[0]);
1074			if (qs->fl[1].credits < qs->fl[1].size - 16)
1075				__refill_fl(sc, &qs->fl[1]);
1076
1077			if (status & (1 << qs->rspq.cntxt_id)) {
1078				if (qs->rspq.credits) {
1079					refill_rspq(sc, &qs->rspq, 1);
1080					qs->rspq.credits--;
1081					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1082					    1 << qs->rspq.cntxt_id);
1083				}
1084			}
1085			mtx_unlock(lock);
1086		}
1087	}
1088}
1089
1090/**
1091 *	init_qset_cntxt - initialize an SGE queue set context info
1092 *	@qs: the queue set
1093 *	@id: the queue set id
1094 *
1095 *	Initializes the TIDs and context ids for the queues of a queue set.
1096 */
1097static void
1098init_qset_cntxt(struct sge_qset *qs, u_int id)
1099{
1100
1101	qs->rspq.cntxt_id = id;
1102	qs->fl[0].cntxt_id = 2 * id;
1103	qs->fl[1].cntxt_id = 2 * id + 1;
1104	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1105	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1106	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1107	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1108	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1109
1110	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1111	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1112	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1113}
1114
1115
1116static void
1117txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1118{
1119	txq->in_use += ndesc;
1120	/*
1121	 * XXX we don't handle stopping of queue
1122	 * presumably start handles this when we bump against the end
1123	 */
1124	txqs->gen = txq->gen;
1125	txq->unacked += ndesc;
1126	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1127	txq->unacked &= 31;
1128	txqs->pidx = txq->pidx;
1129	txq->pidx += ndesc;
1130#ifdef INVARIANTS
1131	if (((txqs->pidx > txq->cidx) &&
1132		(txq->pidx < txqs->pidx) &&
1133		(txq->pidx >= txq->cidx)) ||
1134	    ((txqs->pidx < txq->cidx) &&
1135		(txq->pidx >= txq-> cidx)) ||
1136	    ((txqs->pidx < txq->cidx) &&
1137		(txq->cidx < txqs->pidx)))
1138		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1139		    txqs->pidx, txq->pidx, txq->cidx);
1140#endif
1141	if (txq->pidx >= txq->size) {
1142		txq->pidx -= txq->size;
1143		txq->gen ^= 1;
1144	}
1145
1146}
1147
1148/**
1149 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1150 *	@m: the packet mbufs
1151 *      @nsegs: the number of segments
1152 *
1153 * 	Returns the number of Tx descriptors needed for the given Ethernet
1154 * 	packet.  Ethernet packets require addition of WR and CPL headers.
1155 */
1156static __inline unsigned int
1157calc_tx_descs(const struct mbuf *m, int nsegs)
1158{
1159	unsigned int flits;
1160
1161	if (m->m_pkthdr.len <= PIO_LEN)
1162		return 1;
1163
1164	flits = sgl_len(nsegs) + 2;
1165	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1166		flits++;
1167
1168	return flits_to_desc(flits);
1169}
1170
1171static unsigned int
1172busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1173    struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1174{
1175	struct mbuf *m0;
1176	int err, pktlen, pass = 0;
1177	bus_dma_tag_t tag = txq->entry_tag;
1178
1179retry:
1180	err = 0;
1181	m0 = *m;
1182	pktlen = m0->m_pkthdr.len;
1183#if defined(__i386__) || defined(__amd64__)
1184	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1185		goto done;
1186	} else
1187#endif
1188		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1189
1190	if (err == 0) {
1191		goto done;
1192	}
1193	if (err == EFBIG && pass == 0) {
1194		pass = 1;
1195		/* Too many segments, try to defrag */
1196		m0 = m_defrag(m0, M_DONTWAIT);
1197		if (m0 == NULL) {
1198			m_freem(*m);
1199			*m = NULL;
1200			return (ENOBUFS);
1201		}
1202		*m = m0;
1203		goto retry;
1204	} else if (err == ENOMEM) {
1205		return (err);
1206	} if (err) {
1207		if (cxgb_debug)
1208			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1209		m_freem(m0);
1210		*m = NULL;
1211		return (err);
1212	}
1213done:
1214#if !defined(__i386__) && !defined(__amd64__)
1215	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1216#endif
1217	txsd->flags |= TX_SW_DESC_MAPPED;
1218
1219	return (0);
1220}
1221
1222/**
1223 *	make_sgl - populate a scatter/gather list for a packet
1224 *	@sgp: the SGL to populate
1225 *	@segs: the packet dma segments
1226 *	@nsegs: the number of segments
1227 *
1228 *	Generates a scatter/gather list for the buffers that make up a packet
1229 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1230 *	appropriately.
1231 */
1232static __inline void
1233make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1234{
1235	int i, idx;
1236
1237	for (idx = 0, i = 0; i < nsegs; i++) {
1238		/*
1239		 * firmware doesn't like empty segments
1240		 */
1241		if (segs[i].ds_len == 0)
1242			continue;
1243		if (i && idx == 0)
1244			++sgp;
1245
1246		sgp->len[idx] = htobe32(segs[i].ds_len);
1247		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1248		idx ^= 1;
1249	}
1250
1251	if (idx) {
1252		sgp->len[idx] = 0;
1253		sgp->addr[idx] = 0;
1254	}
1255}
1256
1257/**
1258 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1259 *	@adap: the adapter
1260 *	@q: the Tx queue
1261 *
1262 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1263 *	where the HW is going to sleep just after we checked, however,
1264 *	then the interrupt handler will detect the outstanding TX packet
1265 *	and ring the doorbell for us.
1266 *
1267 *	When GTS is disabled we unconditionally ring the doorbell.
1268 */
1269static __inline void
1270check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1271{
1272#if USE_GTS
1273	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1274	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1275		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1276#ifdef T3_TRACE
1277		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1278			  q->cntxt_id);
1279#endif
1280		t3_write_reg(adap, A_SG_KDOORBELL,
1281			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1282	}
1283#else
1284	if (mustring || ++q->db_pending >= 32) {
1285		wmb();            /* write descriptors before telling HW */
1286		t3_write_reg(adap, A_SG_KDOORBELL,
1287		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1288		q->db_pending = 0;
1289	}
1290#endif
1291}
1292
1293static __inline void
1294wr_gen2(struct tx_desc *d, unsigned int gen)
1295{
1296#if SGE_NUM_GENBITS == 2
1297	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1298#endif
1299}
1300
1301/**
1302 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1303 *	@ndesc: number of Tx descriptors spanned by the SGL
1304 *	@txd: first Tx descriptor to be written
1305 *	@txqs: txq state (generation and producer index)
1306 *	@txq: the SGE Tx queue
1307 *	@sgl: the SGL
1308 *	@flits: number of flits to the start of the SGL in the first descriptor
1309 *	@sgl_flits: the SGL size in flits
1310 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1311 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1312 *
1313 *	Write a work request header and an associated SGL.  If the SGL is
1314 *	small enough to fit into one Tx descriptor it has already been written
1315 *	and we just need to write the WR header.  Otherwise we distribute the
1316 *	SGL across the number of descriptors it spans.
1317 */
1318static void
1319write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1320    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1321    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1322{
1323
1324	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1325	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1326
1327	if (__predict_true(ndesc == 1)) {
1328		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1329			V_WR_SGLSFLT(flits)) | wr_hi,
1330		    htonl(V_WR_LEN(flits + sgl_flits) |
1331			V_WR_GEN(txqs->gen)) | wr_lo);
1332		/* XXX gen? */
1333		wr_gen2(txd, txqs->gen);
1334
1335	} else {
1336		unsigned int ogen = txqs->gen;
1337		const uint64_t *fp = (const uint64_t *)sgl;
1338		struct work_request_hdr *wp = wrp;
1339
1340		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1341		    V_WR_SGLSFLT(flits)) | wr_hi;
1342
1343		while (sgl_flits) {
1344			unsigned int avail = WR_FLITS - flits;
1345
1346			if (avail > sgl_flits)
1347				avail = sgl_flits;
1348			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1349			sgl_flits -= avail;
1350			ndesc--;
1351			if (!sgl_flits)
1352				break;
1353
1354			fp += avail;
1355			txd++;
1356			txsd++;
1357			if (++txqs->pidx == txq->size) {
1358				txqs->pidx = 0;
1359				txqs->gen ^= 1;
1360				txd = txq->desc;
1361				txsd = txq->sdesc;
1362			}
1363
1364			/*
1365			 * when the head of the mbuf chain
1366			 * is freed all clusters will be freed
1367			 * with it
1368			 */
1369			wrp = (struct work_request_hdr *)txd;
1370			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1371			    V_WR_SGLSFLT(1)) | wr_hi;
1372			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1373				    sgl_flits + 1)) |
1374			    V_WR_GEN(txqs->gen)) | wr_lo;
1375			wr_gen2(txd, txqs->gen);
1376			flits = 1;
1377		}
1378		wrp->wrh_hi |= htonl(F_WR_EOP);
1379		wmb();
1380		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1381		wr_gen2((struct tx_desc *)wp, ogen);
1382	}
1383}
1384
1385/* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1386#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1387
1388#define GET_VTAG(cntrl, m) \
1389do { \
1390	if ((m)->m_flags & M_VLANTAG)					            \
1391		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1392} while (0)
1393
1394static int
1395t3_encap(struct sge_qset *qs, struct mbuf **m)
1396{
1397	adapter_t *sc;
1398	struct mbuf *m0;
1399	struct sge_txq *txq;
1400	struct txq_state txqs;
1401	struct port_info *pi;
1402	unsigned int ndesc, flits, cntrl, mlen;
1403	int err, nsegs, tso_info = 0;
1404
1405	struct work_request_hdr *wrp;
1406	struct tx_sw_desc *txsd;
1407	struct sg_ent *sgp, *sgl;
1408	uint32_t wr_hi, wr_lo, sgl_flits;
1409	bus_dma_segment_t segs[TX_MAX_SEGS];
1410
1411	struct tx_desc *txd;
1412
1413	pi = qs->port;
1414	sc = pi->adapter;
1415	txq = &qs->txq[TXQ_ETH];
1416	txd = &txq->desc[txq->pidx];
1417	txsd = &txq->sdesc[txq->pidx];
1418	sgl = txq->txq_sgl;
1419
1420	prefetch(txd);
1421	m0 = *m;
1422
1423	mtx_assert(&qs->lock, MA_OWNED);
1424	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1425	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1426
1427	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1428	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1429		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1430
1431	if (m0->m_nextpkt != NULL) {
1432		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1433		ndesc = 1;
1434		mlen = 0;
1435	} else {
1436		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1437		    &m0, segs, &nsegs))) {
1438			if (cxgb_debug)
1439				printf("failed ... err=%d\n", err);
1440			return (err);
1441		}
1442		mlen = m0->m_pkthdr.len;
1443		ndesc = calc_tx_descs(m0, nsegs);
1444	}
1445	txq_prod(txq, ndesc, &txqs);
1446
1447	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1448	txsd->m = m0;
1449
1450	if (m0->m_nextpkt != NULL) {
1451		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1452		int i, fidx;
1453
1454		if (nsegs > 7)
1455			panic("trying to coalesce %d packets in to one WR", nsegs);
1456		txq->txq_coalesced += nsegs;
1457		wrp = (struct work_request_hdr *)txd;
1458		flits = nsegs*2 + 1;
1459
1460		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1461			struct cpl_tx_pkt_batch_entry *cbe;
1462			uint64_t flit;
1463			uint32_t *hflit = (uint32_t *)&flit;
1464			int cflags = m0->m_pkthdr.csum_flags;
1465
1466			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1467			GET_VTAG(cntrl, m0);
1468			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1469			if (__predict_false(!(cflags & CSUM_IP)))
1470				cntrl |= F_TXPKT_IPCSUM_DIS;
1471			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1472				cntrl |= F_TXPKT_L4CSUM_DIS;
1473
1474			hflit[0] = htonl(cntrl);
1475			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1476			flit |= htobe64(1 << 24);
1477			cbe = &cpl_batch->pkt_entry[i];
1478			cbe->cntrl = hflit[0];
1479			cbe->len = hflit[1];
1480			cbe->addr = htobe64(segs[i].ds_addr);
1481		}
1482
1483		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1484		    V_WR_SGLSFLT(flits)) |
1485		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1486		wr_lo = htonl(V_WR_LEN(flits) |
1487		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1488		set_wr_hdr(wrp, wr_hi, wr_lo);
1489		wmb();
1490		ETHER_BPF_MTAP(pi->ifp, m0);
1491		wr_gen2(txd, txqs.gen);
1492		check_ring_tx_db(sc, txq, 0);
1493		return (0);
1494	} else if (tso_info) {
1495		int eth_type;
1496		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1497		struct ether_header *eh;
1498		struct ip *ip;
1499		struct tcphdr *tcp;
1500
1501		txd->flit[2] = 0;
1502		GET_VTAG(cntrl, m0);
1503		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1504		hdr->cntrl = htonl(cntrl);
1505		hdr->len = htonl(mlen | 0x80000000);
1506
1507		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1508			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1509			    m0, mlen, m0->m_pkthdr.tso_segsz,
1510			    m0->m_pkthdr.csum_flags, m0->m_flags);
1511			panic("tx tso packet too small");
1512		}
1513
1514		/* Make sure that ether, ip, tcp headers are all in m0 */
1515		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1516			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1517			if (__predict_false(m0 == NULL)) {
1518				/* XXX panic probably an overreaction */
1519				panic("couldn't fit header into mbuf");
1520			}
1521		}
1522
1523		eh = mtod(m0, struct ether_header *);
1524		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
1525			eth_type = CPL_ETH_II_VLAN;
1526			ip = (struct ip *)((struct ether_vlan_header *)eh + 1);
1527		} else {
1528			eth_type = CPL_ETH_II;
1529			ip = (struct ip *)(eh + 1);
1530		}
1531		tcp = (struct tcphdr *)(ip + 1);
1532
1533		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1534			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1535			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1536		hdr->lso_info = htonl(tso_info);
1537
1538		if (__predict_false(mlen <= PIO_LEN)) {
1539			/*
1540			 * pkt not undersized but fits in PIO_LEN
1541			 * Indicates a TSO bug at the higher levels.
1542			 */
1543			txsd->m = NULL;
1544			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1545			flits = (mlen + 7) / 8 + 3;
1546			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1547					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1548					  F_WR_SOP | F_WR_EOP | txqs.compl);
1549			wr_lo = htonl(V_WR_LEN(flits) |
1550			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1551			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1552			wmb();
1553			ETHER_BPF_MTAP(pi->ifp, m0);
1554			wr_gen2(txd, txqs.gen);
1555			check_ring_tx_db(sc, txq, 0);
1556			m_freem(m0);
1557			return (0);
1558		}
1559		flits = 3;
1560	} else {
1561		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1562
1563		GET_VTAG(cntrl, m0);
1564		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1565		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1566			cntrl |= F_TXPKT_IPCSUM_DIS;
1567		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1568			cntrl |= F_TXPKT_L4CSUM_DIS;
1569		cpl->cntrl = htonl(cntrl);
1570		cpl->len = htonl(mlen | 0x80000000);
1571
1572		if (mlen <= PIO_LEN) {
1573			txsd->m = NULL;
1574			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1575			flits = (mlen + 7) / 8 + 2;
1576
1577			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1578			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1579					  F_WR_SOP | F_WR_EOP | txqs.compl);
1580			wr_lo = htonl(V_WR_LEN(flits) |
1581			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1582			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1583			wmb();
1584			ETHER_BPF_MTAP(pi->ifp, m0);
1585			wr_gen2(txd, txqs.gen);
1586			check_ring_tx_db(sc, txq, 0);
1587			m_freem(m0);
1588			return (0);
1589		}
1590		flits = 2;
1591	}
1592	wrp = (struct work_request_hdr *)txd;
1593	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1594	make_sgl(sgp, segs, nsegs);
1595
1596	sgl_flits = sgl_len(nsegs);
1597
1598	ETHER_BPF_MTAP(pi->ifp, m0);
1599
1600	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1601	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1602	wr_lo = htonl(V_WR_TID(txq->token));
1603	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1604	    sgl_flits, wr_hi, wr_lo);
1605	check_ring_tx_db(sc, txq, 0);
1606
1607	return (0);
1608}
1609
1610void
1611cxgb_tx_watchdog(void *arg)
1612{
1613	struct sge_qset *qs = arg;
1614	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1615
1616        if (qs->coalescing != 0 &&
1617	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1618	    TXQ_RING_EMPTY(qs))
1619                qs->coalescing = 0;
1620        else if (qs->coalescing == 0 &&
1621	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1622                qs->coalescing = 1;
1623	if (TXQ_TRYLOCK(qs)) {
1624		qs->qs_flags |= QS_FLUSHING;
1625		cxgb_start_locked(qs);
1626		qs->qs_flags &= ~QS_FLUSHING;
1627		TXQ_UNLOCK(qs);
1628	}
1629	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1630		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1631		    qs, txq->txq_watchdog.c_cpu);
1632}
1633
1634static void
1635cxgb_tx_timeout(void *arg)
1636{
1637	struct sge_qset *qs = arg;
1638	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1639
1640	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1641                qs->coalescing = 1;
1642	if (TXQ_TRYLOCK(qs)) {
1643		qs->qs_flags |= QS_TIMEOUT;
1644		cxgb_start_locked(qs);
1645		qs->qs_flags &= ~QS_TIMEOUT;
1646		TXQ_UNLOCK(qs);
1647	}
1648}
1649
1650static void
1651cxgb_start_locked(struct sge_qset *qs)
1652{
1653	struct mbuf *m_head = NULL;
1654	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1655	struct port_info *pi = qs->port;
1656	struct ifnet *ifp = pi->ifp;
1657
1658	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1659		reclaim_completed_tx(qs, 0, TXQ_ETH);
1660
1661	if (!pi->link_config.link_ok) {
1662		TXQ_RING_FLUSH(qs);
1663		return;
1664	}
1665	TXQ_LOCK_ASSERT(qs);
1666	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1667	    pi->link_config.link_ok) {
1668		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1669
1670		if (txq->size - txq->in_use <= TX_MAX_DESC)
1671			break;
1672
1673		if ((m_head = cxgb_dequeue(qs)) == NULL)
1674			break;
1675		/*
1676		 *  Encapsulation can modify our pointer, and or make it
1677		 *  NULL on failure.  In that event, we can't requeue.
1678		 */
1679		if (t3_encap(qs, &m_head) || m_head == NULL)
1680			break;
1681
1682		m_head = NULL;
1683	}
1684
1685	if (txq->db_pending)
1686		check_ring_tx_db(pi->adapter, txq, 1);
1687
1688	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1689	    pi->link_config.link_ok)
1690		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1691		    qs, txq->txq_timer.c_cpu);
1692	if (m_head != NULL)
1693		m_freem(m_head);
1694}
1695
1696static int
1697cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1698{
1699	struct port_info *pi = qs->port;
1700	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1701	struct buf_ring *br = txq->txq_mr;
1702	int error, avail;
1703
1704	avail = txq->size - txq->in_use;
1705	TXQ_LOCK_ASSERT(qs);
1706
1707	/*
1708	 * We can only do a direct transmit if the following are true:
1709	 * - we aren't coalescing (ring < 3/4 full)
1710	 * - the link is up -- checked in caller
1711	 * - there are no packets enqueued already
1712	 * - there is space in hardware transmit queue
1713	 */
1714	if (check_pkt_coalesce(qs) == 0 &&
1715	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1716		if (t3_encap(qs, &m)) {
1717			if (m != NULL &&
1718			    (error = drbr_enqueue(ifp, br, m)) != 0)
1719				return (error);
1720		} else {
1721			if (txq->db_pending)
1722				check_ring_tx_db(pi->adapter, txq, 1);
1723
1724			/*
1725			 * We've bypassed the buf ring so we need to update
1726			 * the stats directly
1727			 */
1728			txq->txq_direct_packets++;
1729			txq->txq_direct_bytes += m->m_pkthdr.len;
1730		}
1731	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1732		return (error);
1733
1734	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1735	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1736	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1737		cxgb_start_locked(qs);
1738	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1739		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1740		    qs, txq->txq_timer.c_cpu);
1741	return (0);
1742}
1743
1744int
1745cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1746{
1747	struct sge_qset *qs;
1748	struct port_info *pi = ifp->if_softc;
1749	int error, qidx = pi->first_qset;
1750
1751	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1752	    ||(!pi->link_config.link_ok)) {
1753		m_freem(m);
1754		return (0);
1755	}
1756
1757	if (m->m_flags & M_FLOWID)
1758		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1759
1760	qs = &pi->adapter->sge.qs[qidx];
1761
1762	if (TXQ_TRYLOCK(qs)) {
1763		/* XXX running */
1764		error = cxgb_transmit_locked(ifp, qs, m);
1765		TXQ_UNLOCK(qs);
1766	} else
1767		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1768	return (error);
1769}
1770void
1771cxgb_start(struct ifnet *ifp)
1772{
1773	struct port_info *pi = ifp->if_softc;
1774	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1775
1776	if (!pi->link_config.link_ok)
1777		return;
1778
1779	TXQ_LOCK(qs);
1780	cxgb_start_locked(qs);
1781	TXQ_UNLOCK(qs);
1782}
1783
1784void
1785cxgb_qflush(struct ifnet *ifp)
1786{
1787	/*
1788	 * flush any enqueued mbufs in the buf_rings
1789	 * and in the transmit queues
1790	 * no-op for now
1791	 */
1792	return;
1793}
1794
1795/**
1796 *	write_imm - write a packet into a Tx descriptor as immediate data
1797 *	@d: the Tx descriptor to write
1798 *	@m: the packet
1799 *	@len: the length of packet data to write as immediate data
1800 *	@gen: the generation bit value to write
1801 *
1802 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1803 *	contains a work request at its beginning.  We must write the packet
1804 *	carefully so the SGE doesn't read accidentally before it's written in
1805 *	its entirety.
1806 */
1807static __inline void
1808write_imm(struct tx_desc *d, struct mbuf *m,
1809	  unsigned int len, unsigned int gen)
1810{
1811	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1812	struct work_request_hdr *to = (struct work_request_hdr *)d;
1813	uint32_t wr_hi, wr_lo;
1814
1815	if (len > WR_LEN)
1816		panic("len too big %d\n", len);
1817	if (len < sizeof(*from))
1818		panic("len too small %d", len);
1819
1820	memcpy(&to[1], &from[1], len - sizeof(*from));
1821	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1822					V_WR_BCNTLFLT(len & 7));
1823	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1824					V_WR_LEN((len + 7) / 8));
1825	set_wr_hdr(to, wr_hi, wr_lo);
1826	wmb();
1827	wr_gen2(d, gen);
1828
1829	/*
1830	 * This check is a hack we should really fix the logic so
1831	 * that this can't happen
1832	 */
1833	if (m->m_type != MT_DONTFREE)
1834		m_freem(m);
1835
1836}
1837
1838/**
1839 *	check_desc_avail - check descriptor availability on a send queue
1840 *	@adap: the adapter
1841 *	@q: the TX queue
1842 *	@m: the packet needing the descriptors
1843 *	@ndesc: the number of Tx descriptors needed
1844 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1845 *
1846 *	Checks if the requested number of Tx descriptors is available on an
1847 *	SGE send queue.  If the queue is already suspended or not enough
1848 *	descriptors are available the packet is queued for later transmission.
1849 *	Must be called with the Tx queue locked.
1850 *
1851 *	Returns 0 if enough descriptors are available, 1 if there aren't
1852 *	enough descriptors and the packet has been queued, and 2 if the caller
1853 *	needs to retry because there weren't enough descriptors at the
1854 *	beginning of the call but some freed up in the mean time.
1855 */
1856static __inline int
1857check_desc_avail(adapter_t *adap, struct sge_txq *q,
1858		 struct mbuf *m, unsigned int ndesc,
1859		 unsigned int qid)
1860{
1861	/*
1862	 * XXX We currently only use this for checking the control queue
1863	 * the control queue is only used for binding qsets which happens
1864	 * at init time so we are guaranteed enough descriptors
1865	 */
1866	if (__predict_false(!mbufq_empty(&q->sendq))) {
1867addq_exit:	mbufq_tail(&q->sendq, m);
1868		return 1;
1869	}
1870	if (__predict_false(q->size - q->in_use < ndesc)) {
1871
1872		struct sge_qset *qs = txq_to_qset(q, qid);
1873
1874		setbit(&qs->txq_stopped, qid);
1875		if (should_restart_tx(q) &&
1876		    test_and_clear_bit(qid, &qs->txq_stopped))
1877			return 2;
1878
1879		q->stops++;
1880		goto addq_exit;
1881	}
1882	return 0;
1883}
1884
1885
1886/**
1887 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1888 *	@q: the SGE control Tx queue
1889 *
1890 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1891 *	that send only immediate data (presently just the control queues) and
1892 *	thus do not have any mbufs
1893 */
1894static __inline void
1895reclaim_completed_tx_imm(struct sge_txq *q)
1896{
1897	unsigned int reclaim = q->processed - q->cleaned;
1898
1899	q->in_use -= reclaim;
1900	q->cleaned += reclaim;
1901}
1902
1903static __inline int
1904immediate(const struct mbuf *m)
1905{
1906	return m->m_len <= WR_LEN  && m->m_pkthdr.len <= WR_LEN ;
1907}
1908
1909/**
1910 *	ctrl_xmit - send a packet through an SGE control Tx queue
1911 *	@adap: the adapter
1912 *	@q: the control queue
1913 *	@m: the packet
1914 *
1915 *	Send a packet through an SGE control Tx queue.  Packets sent through
1916 *	a control queue must fit entirely as immediate data in a single Tx
1917 *	descriptor and have no page fragments.
1918 */
1919static int
1920ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1921{
1922	int ret;
1923	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1924	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1925
1926	if (__predict_false(!immediate(m))) {
1927		m_freem(m);
1928		return 0;
1929	}
1930
1931	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1932	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1933
1934	TXQ_LOCK(qs);
1935again:	reclaim_completed_tx_imm(q);
1936
1937	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1938	if (__predict_false(ret)) {
1939		if (ret == 1) {
1940			TXQ_UNLOCK(qs);
1941			return (ENOSPC);
1942		}
1943		goto again;
1944	}
1945	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1946
1947	q->in_use++;
1948	if (++q->pidx >= q->size) {
1949		q->pidx = 0;
1950		q->gen ^= 1;
1951	}
1952	TXQ_UNLOCK(qs);
1953	wmb();
1954	t3_write_reg(adap, A_SG_KDOORBELL,
1955		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1956	return (0);
1957}
1958
1959
1960/**
1961 *	restart_ctrlq - restart a suspended control queue
1962 *	@qs: the queue set cotaining the control queue
1963 *
1964 *	Resumes transmission on a suspended Tx control queue.
1965 */
1966static void
1967restart_ctrlq(void *data, int npending)
1968{
1969	struct mbuf *m;
1970	struct sge_qset *qs = (struct sge_qset *)data;
1971	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1972	adapter_t *adap = qs->port->adapter;
1973
1974	TXQ_LOCK(qs);
1975again:	reclaim_completed_tx_imm(q);
1976
1977	while (q->in_use < q->size &&
1978	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1979
1980		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1981
1982		if (++q->pidx >= q->size) {
1983			q->pidx = 0;
1984			q->gen ^= 1;
1985		}
1986		q->in_use++;
1987	}
1988	if (!mbufq_empty(&q->sendq)) {
1989		setbit(&qs->txq_stopped, TXQ_CTRL);
1990
1991		if (should_restart_tx(q) &&
1992		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1993			goto again;
1994		q->stops++;
1995	}
1996	TXQ_UNLOCK(qs);
1997	t3_write_reg(adap, A_SG_KDOORBELL,
1998		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1999}
2000
2001
2002/*
2003 * Send a management message through control queue 0
2004 */
2005int
2006t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2007{
2008	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2009}
2010
2011/**
2012 *	free_qset - free the resources of an SGE queue set
2013 *	@sc: the controller owning the queue set
2014 *	@q: the queue set
2015 *
2016 *	Release the HW and SW resources associated with an SGE queue set, such
2017 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2018 *	queue set must be quiesced prior to calling this.
2019 */
2020static void
2021t3_free_qset(adapter_t *sc, struct sge_qset *q)
2022{
2023	int i;
2024
2025	reclaim_completed_tx(q, 0, TXQ_ETH);
2026	if (q->txq[TXQ_ETH].txq_mr != NULL)
2027		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
2028	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2029		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2030		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2031	}
2032
2033	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2034		if (q->fl[i].desc) {
2035			mtx_lock_spin(&sc->sge.reg_lock);
2036			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2037			mtx_unlock_spin(&sc->sge.reg_lock);
2038			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2039			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2040					q->fl[i].desc_map);
2041			bus_dma_tag_destroy(q->fl[i].desc_tag);
2042			bus_dma_tag_destroy(q->fl[i].entry_tag);
2043		}
2044		if (q->fl[i].sdesc) {
2045			free_rx_bufs(sc, &q->fl[i]);
2046			free(q->fl[i].sdesc, M_DEVBUF);
2047		}
2048	}
2049
2050	mtx_unlock(&q->lock);
2051	MTX_DESTROY(&q->lock);
2052	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2053		if (q->txq[i].desc) {
2054			mtx_lock_spin(&sc->sge.reg_lock);
2055			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2056			mtx_unlock_spin(&sc->sge.reg_lock);
2057			bus_dmamap_unload(q->txq[i].desc_tag,
2058					q->txq[i].desc_map);
2059			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2060					q->txq[i].desc_map);
2061			bus_dma_tag_destroy(q->txq[i].desc_tag);
2062			bus_dma_tag_destroy(q->txq[i].entry_tag);
2063		}
2064		if (q->txq[i].sdesc) {
2065			free(q->txq[i].sdesc, M_DEVBUF);
2066		}
2067	}
2068
2069	if (q->rspq.desc) {
2070		mtx_lock_spin(&sc->sge.reg_lock);
2071		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2072		mtx_unlock_spin(&sc->sge.reg_lock);
2073
2074		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2075		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2076			        q->rspq.desc_map);
2077		bus_dma_tag_destroy(q->rspq.desc_tag);
2078		MTX_DESTROY(&q->rspq.lock);
2079	}
2080
2081#ifdef INET
2082	tcp_lro_free(&q->lro.ctrl);
2083#endif
2084
2085	bzero(q, sizeof(*q));
2086}
2087
2088/**
2089 *	t3_free_sge_resources - free SGE resources
2090 *	@sc: the adapter softc
2091 *
2092 *	Frees resources used by the SGE queue sets.
2093 */
2094void
2095t3_free_sge_resources(adapter_t *sc)
2096{
2097	int i, nqsets;
2098
2099	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2100		nqsets += sc->port[i].nqsets;
2101
2102	for (i = 0; i < nqsets; ++i) {
2103		TXQ_LOCK(&sc->sge.qs[i]);
2104		t3_free_qset(sc, &sc->sge.qs[i]);
2105	}
2106
2107}
2108
2109/**
2110 *	t3_sge_start - enable SGE
2111 *	@sc: the controller softc
2112 *
2113 *	Enables the SGE for DMAs.  This is the last step in starting packet
2114 *	transfers.
2115 */
2116void
2117t3_sge_start(adapter_t *sc)
2118{
2119	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2120}
2121
2122/**
2123 *	t3_sge_stop - disable SGE operation
2124 *	@sc: the adapter
2125 *
2126 *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
2127 *	from error interrupts) or from normal process context.  In the latter
2128 *	case it also disables any pending queue restart tasklets.  Note that
2129 *	if it is called in interrupt context it cannot disable the restart
2130 *	tasklets as it cannot wait, however the tasklets will have no effect
2131 *	since the doorbells are disabled and the driver will call this again
2132 *	later from process context, at which time the tasklets will be stopped
2133 *	if they are still running.
2134 */
2135void
2136t3_sge_stop(adapter_t *sc)
2137{
2138	int i, nqsets;
2139
2140	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2141
2142	if (sc->tq == NULL)
2143		return;
2144
2145	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2146		nqsets += sc->port[i].nqsets;
2147#ifdef notyet
2148	/*
2149	 *
2150	 * XXX
2151	 */
2152	for (i = 0; i < nqsets; ++i) {
2153		struct sge_qset *qs = &sc->sge.qs[i];
2154
2155		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2156		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2157	}
2158#endif
2159}
2160
2161/**
2162 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2163 *	@adapter: the adapter
2164 *	@q: the Tx queue to reclaim descriptors from
2165 *	@reclaimable: the number of descriptors to reclaim
2166 *      @m_vec_size: maximum number of buffers to reclaim
2167 *      @desc_reclaimed: returns the number of descriptors reclaimed
2168 *
2169 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2170 *	Tx buffers.  Called with the Tx queue lock held.
2171 *
2172 *      Returns number of buffers of reclaimed
2173 */
2174void
2175t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2176{
2177	struct tx_sw_desc *txsd;
2178	unsigned int cidx, mask;
2179	struct sge_txq *q = &qs->txq[queue];
2180
2181#ifdef T3_TRACE
2182	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2183		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2184#endif
2185	cidx = q->cidx;
2186	mask = q->size - 1;
2187	txsd = &q->sdesc[cidx];
2188
2189	mtx_assert(&qs->lock, MA_OWNED);
2190	while (reclaimable--) {
2191		prefetch(q->sdesc[(cidx + 1) & mask].m);
2192		prefetch(q->sdesc[(cidx + 2) & mask].m);
2193
2194		if (txsd->m != NULL) {
2195			if (txsd->flags & TX_SW_DESC_MAPPED) {
2196				bus_dmamap_unload(q->entry_tag, txsd->map);
2197				txsd->flags &= ~TX_SW_DESC_MAPPED;
2198			}
2199			m_freem_list(txsd->m);
2200			txsd->m = NULL;
2201		} else
2202			q->txq_skipped++;
2203
2204		++txsd;
2205		if (++cidx == q->size) {
2206			cidx = 0;
2207			txsd = q->sdesc;
2208		}
2209	}
2210	q->cidx = cidx;
2211
2212}
2213
2214/**
2215 *	is_new_response - check if a response is newly written
2216 *	@r: the response descriptor
2217 *	@q: the response queue
2218 *
2219 *	Returns true if a response descriptor contains a yet unprocessed
2220 *	response.
2221 */
2222static __inline int
2223is_new_response(const struct rsp_desc *r,
2224    const struct sge_rspq *q)
2225{
2226	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2227}
2228
2229#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2230#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2231			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2232			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2233			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2234
2235/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2236#define NOMEM_INTR_DELAY 2500
2237
2238/**
2239 *	write_ofld_wr - write an offload work request
2240 *	@adap: the adapter
2241 *	@m: the packet to send
2242 *	@q: the Tx queue
2243 *	@pidx: index of the first Tx descriptor to write
2244 *	@gen: the generation value to use
2245 *	@ndesc: number of descriptors the packet will occupy
2246 *
2247 *	Write an offload work request to send the supplied packet.  The packet
2248 *	data already carry the work request with most fields populated.
2249 */
2250static void
2251write_ofld_wr(adapter_t *adap, struct mbuf *m,
2252    struct sge_txq *q, unsigned int pidx,
2253    unsigned int gen, unsigned int ndesc,
2254    bus_dma_segment_t *segs, unsigned int nsegs)
2255{
2256	unsigned int sgl_flits, flits;
2257	struct work_request_hdr *from;
2258	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2259	struct tx_desc *d = &q->desc[pidx];
2260	struct txq_state txqs;
2261
2262	if (immediate(m) && nsegs == 0) {
2263		write_imm(d, m, m->m_len, gen);
2264		return;
2265	}
2266
2267	/* Only TX_DATA builds SGLs */
2268	from = mtod(m, struct work_request_hdr *);
2269	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2270
2271	flits = m->m_len / 8;
2272	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2273
2274	make_sgl(sgp, segs, nsegs);
2275	sgl_flits = sgl_len(nsegs);
2276
2277	txqs.gen = gen;
2278	txqs.pidx = pidx;
2279	txqs.compl = 0;
2280
2281	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2282	    from->wrh_hi, from->wrh_lo);
2283}
2284
2285/**
2286 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2287 *	@m: the packet
2288 *
2289 * 	Returns the number of Tx descriptors needed for the given offload
2290 * 	packet.  These packets are already fully constructed.
2291 */
2292static __inline unsigned int
2293calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2294{
2295	unsigned int flits, cnt = 0;
2296	int ndescs;
2297
2298	if (m->m_len <= WR_LEN && nsegs == 0)
2299		return (1);                 /* packet fits as immediate data */
2300
2301	/*
2302	 * This needs to be re-visited for TOE
2303	 */
2304
2305	cnt = nsegs;
2306
2307	/* headers */
2308	flits = m->m_len / 8;
2309
2310	ndescs = flits_to_desc(flits + sgl_len(cnt));
2311
2312	return (ndescs);
2313}
2314
2315/**
2316 *	ofld_xmit - send a packet through an offload queue
2317 *	@adap: the adapter
2318 *	@q: the Tx offload queue
2319 *	@m: the packet
2320 *
2321 *	Send an offload packet through an SGE offload queue.
2322 */
2323static int
2324ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2325{
2326	int ret, nsegs;
2327	unsigned int ndesc;
2328	unsigned int pidx, gen;
2329	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2330	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2331	struct tx_sw_desc *stx;
2332
2333	nsegs = m_get_sgllen(m);
2334	vsegs = m_get_sgl(m);
2335	ndesc = calc_tx_descs_ofld(m, nsegs);
2336	busdma_map_sgl(vsegs, segs, nsegs);
2337
2338	stx = &q->sdesc[q->pidx];
2339
2340	TXQ_LOCK(qs);
2341again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2342	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2343	if (__predict_false(ret)) {
2344		if (ret == 1) {
2345			printf("no ofld desc avail\n");
2346
2347			m_set_priority(m, ndesc);     /* save for restart */
2348			TXQ_UNLOCK(qs);
2349			return (EINTR);
2350		}
2351		goto again;
2352	}
2353
2354	gen = q->gen;
2355	q->in_use += ndesc;
2356	pidx = q->pidx;
2357	q->pidx += ndesc;
2358	if (q->pidx >= q->size) {
2359		q->pidx -= q->size;
2360		q->gen ^= 1;
2361	}
2362#ifdef T3_TRACE
2363	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2364		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2365		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2366		  skb_shinfo(skb)->nr_frags);
2367#endif
2368	TXQ_UNLOCK(qs);
2369
2370	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2371	check_ring_tx_db(adap, q, 1);
2372	return (0);
2373}
2374
2375/**
2376 *	restart_offloadq - restart a suspended offload queue
2377 *	@qs: the queue set cotaining the offload queue
2378 *
2379 *	Resumes transmission on a suspended Tx offload queue.
2380 */
2381static void
2382restart_offloadq(void *data, int npending)
2383{
2384	struct mbuf *m;
2385	struct sge_qset *qs = data;
2386	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2387	adapter_t *adap = qs->port->adapter;
2388	bus_dma_segment_t segs[TX_MAX_SEGS];
2389	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2390	int nsegs, cleaned;
2391
2392	TXQ_LOCK(qs);
2393again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2394
2395	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2396		unsigned int gen, pidx;
2397		unsigned int ndesc = m_get_priority(m);
2398
2399		if (__predict_false(q->size - q->in_use < ndesc)) {
2400			setbit(&qs->txq_stopped, TXQ_OFLD);
2401			if (should_restart_tx(q) &&
2402			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2403				goto again;
2404			q->stops++;
2405			break;
2406		}
2407
2408		gen = q->gen;
2409		q->in_use += ndesc;
2410		pidx = q->pidx;
2411		q->pidx += ndesc;
2412		if (q->pidx >= q->size) {
2413			q->pidx -= q->size;
2414			q->gen ^= 1;
2415		}
2416
2417		(void)mbufq_dequeue(&q->sendq);
2418		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2419		TXQ_UNLOCK(qs);
2420		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2421		TXQ_LOCK(qs);
2422	}
2423#if USE_GTS
2424	set_bit(TXQ_RUNNING, &q->flags);
2425	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2426#endif
2427	TXQ_UNLOCK(qs);
2428	wmb();
2429	t3_write_reg(adap, A_SG_KDOORBELL,
2430		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2431}
2432
2433/**
2434 *	queue_set - return the queue set a packet should use
2435 *	@m: the packet
2436 *
2437 *	Maps a packet to the SGE queue set it should use.  The desired queue
2438 *	set is carried in bits 1-3 in the packet's priority.
2439 */
2440static __inline int
2441queue_set(const struct mbuf *m)
2442{
2443	return m_get_priority(m) >> 1;
2444}
2445
2446/**
2447 *	is_ctrl_pkt - return whether an offload packet is a control packet
2448 *	@m: the packet
2449 *
2450 *	Determines whether an offload packet should use an OFLD or a CTRL
2451 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2452 */
2453static __inline int
2454is_ctrl_pkt(const struct mbuf *m)
2455{
2456	return m_get_priority(m) & 1;
2457}
2458
2459/**
2460 *	t3_offload_tx - send an offload packet
2461 *	@tdev: the offload device to send to
2462 *	@m: the packet
2463 *
2464 *	Sends an offload packet.  We use the packet priority to select the
2465 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2466 *	should be sent as regular or control, bits 1-3 select the queue set.
2467 */
2468int
2469t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2470{
2471	adapter_t *adap = tdev2adap(tdev);
2472	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2473
2474	if (__predict_false(is_ctrl_pkt(m)))
2475		return ctrl_xmit(adap, qs, m);
2476
2477	return ofld_xmit(adap, qs, m);
2478}
2479
2480/**
2481 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2482 *	@tdev: the offload device that will be receiving the packets
2483 *	@q: the SGE response queue that assembled the bundle
2484 *	@m: the partial bundle
2485 *	@n: the number of packets in the bundle
2486 *
2487 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2488 */
2489static __inline void
2490deliver_partial_bundle(struct t3cdev *tdev,
2491			struct sge_rspq *q,
2492			struct mbuf *mbufs[], int n)
2493{
2494	if (n) {
2495		q->offload_bundles++;
2496		cxgb_ofld_recv(tdev, mbufs, n);
2497	}
2498}
2499
2500static __inline int
2501rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2502    struct mbuf *m, struct mbuf *rx_gather[],
2503    unsigned int gather_idx)
2504{
2505
2506	rq->offload_pkts++;
2507	m->m_pkthdr.header = mtod(m, void *);
2508	rx_gather[gather_idx++] = m;
2509	if (gather_idx == RX_BUNDLE_SIZE) {
2510		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2511		gather_idx = 0;
2512		rq->offload_bundles++;
2513	}
2514	return (gather_idx);
2515}
2516
2517static void
2518restart_tx(struct sge_qset *qs)
2519{
2520	struct adapter *sc = qs->port->adapter;
2521
2522
2523	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2524	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2525	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2526		qs->txq[TXQ_OFLD].restarts++;
2527		DPRINTF("restarting TXQ_OFLD\n");
2528		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2529	}
2530	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2531	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2532	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2533	    qs->txq[TXQ_CTRL].in_use);
2534
2535	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2536	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2537	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2538		qs->txq[TXQ_CTRL].restarts++;
2539		DPRINTF("restarting TXQ_CTRL\n");
2540		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2541	}
2542}
2543
2544/**
2545 *	t3_sge_alloc_qset - initialize an SGE queue set
2546 *	@sc: the controller softc
2547 *	@id: the queue set id
2548 *	@nports: how many Ethernet ports will be using this queue set
2549 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2550 *	@p: configuration parameters for this queue set
2551 *	@ntxq: number of Tx queues for the queue set
2552 *	@pi: port info for queue set
2553 *
2554 *	Allocate resources and initialize an SGE queue set.  A queue set
2555 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2556 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2557 *	queue, offload queue, and control queue.
2558 */
2559int
2560t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2561		  const struct qset_params *p, int ntxq, struct port_info *pi)
2562{
2563	struct sge_qset *q = &sc->sge.qs[id];
2564	int i, ret = 0;
2565
2566	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2567	q->port = pi;
2568
2569	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2570	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2571		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2572		goto err;
2573	}
2574	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2575	    M_NOWAIT | M_ZERO)) == NULL) {
2576		device_printf(sc->dev, "failed to allocate ifq\n");
2577		goto err;
2578	}
2579	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2580	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2581	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2582	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2583	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2584
2585	init_qset_cntxt(q, id);
2586	q->idx = id;
2587	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2588		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2589		    &q->fl[0].desc, &q->fl[0].sdesc,
2590		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2591		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2592		printf("error %d from alloc ring fl0\n", ret);
2593		goto err;
2594	}
2595
2596	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2597		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2598		    &q->fl[1].desc, &q->fl[1].sdesc,
2599		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2600		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2601		printf("error %d from alloc ring fl1\n", ret);
2602		goto err;
2603	}
2604
2605	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2606		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2607		    &q->rspq.desc_tag, &q->rspq.desc_map,
2608		    NULL, NULL)) != 0) {
2609		printf("error %d from alloc ring rspq\n", ret);
2610		goto err;
2611	}
2612
2613	for (i = 0; i < ntxq; ++i) {
2614		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2615
2616		if ((ret = alloc_ring(sc, p->txq_size[i],
2617			    sizeof(struct tx_desc), sz,
2618			    &q->txq[i].phys_addr, &q->txq[i].desc,
2619			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2620			    &q->txq[i].desc_map,
2621			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2622			printf("error %d from alloc ring tx %i\n", ret, i);
2623			goto err;
2624		}
2625		mbufq_init(&q->txq[i].sendq);
2626		q->txq[i].gen = 1;
2627		q->txq[i].size = p->txq_size[i];
2628	}
2629
2630	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2631	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2632	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2633	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2634
2635	q->fl[0].gen = q->fl[1].gen = 1;
2636	q->fl[0].size = p->fl_size;
2637	q->fl[1].size = p->jumbo_size;
2638
2639	q->rspq.gen = 1;
2640	q->rspq.cidx = 0;
2641	q->rspq.size = p->rspq_size;
2642
2643	q->txq[TXQ_ETH].stop_thres = nports *
2644	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2645
2646	q->fl[0].buf_size = MCLBYTES;
2647	q->fl[0].zone = zone_pack;
2648	q->fl[0].type = EXT_PACKET;
2649
2650	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2651		q->fl[1].zone = zone_jumbo16;
2652		q->fl[1].type = EXT_JUMBO16;
2653	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2654		q->fl[1].zone = zone_jumbo9;
2655		q->fl[1].type = EXT_JUMBO9;
2656	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2657		q->fl[1].zone = zone_jumbop;
2658		q->fl[1].type = EXT_JUMBOP;
2659	} else {
2660		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2661		ret = EDOOFUS;
2662		goto err;
2663	}
2664	q->fl[1].buf_size = p->jumbo_buf_size;
2665
2666	/* Allocate and setup the lro_ctrl structure */
2667	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2668#ifdef INET
2669	ret = tcp_lro_init(&q->lro.ctrl);
2670	if (ret) {
2671		printf("error %d from tcp_lro_init\n", ret);
2672		goto err;
2673	}
2674#endif
2675	q->lro.ctrl.ifp = pi->ifp;
2676
2677	mtx_lock_spin(&sc->sge.reg_lock);
2678	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2679				   q->rspq.phys_addr, q->rspq.size,
2680				   q->fl[0].buf_size, 1, 0);
2681	if (ret) {
2682		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2683		goto err_unlock;
2684	}
2685
2686	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2687		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2688					  q->fl[i].phys_addr, q->fl[i].size,
2689					  q->fl[i].buf_size, p->cong_thres, 1,
2690					  0);
2691		if (ret) {
2692			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2693			goto err_unlock;
2694		}
2695	}
2696
2697	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2698				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2699				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2700				 1, 0);
2701	if (ret) {
2702		printf("error %d from t3_sge_init_ecntxt\n", ret);
2703		goto err_unlock;
2704	}
2705
2706	if (ntxq > 1) {
2707		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2708					 USE_GTS, SGE_CNTXT_OFLD, id,
2709					 q->txq[TXQ_OFLD].phys_addr,
2710					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2711		if (ret) {
2712			printf("error %d from t3_sge_init_ecntxt\n", ret);
2713			goto err_unlock;
2714		}
2715	}
2716
2717	if (ntxq > 2) {
2718		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2719					 SGE_CNTXT_CTRL, id,
2720					 q->txq[TXQ_CTRL].phys_addr,
2721					 q->txq[TXQ_CTRL].size,
2722					 q->txq[TXQ_CTRL].token, 1, 0);
2723		if (ret) {
2724			printf("error %d from t3_sge_init_ecntxt\n", ret);
2725			goto err_unlock;
2726		}
2727	}
2728
2729	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2730	    device_get_unit(sc->dev), irq_vec_idx);
2731	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2732
2733	mtx_unlock_spin(&sc->sge.reg_lock);
2734	t3_update_qset_coalesce(q, p);
2735	q->port = pi;
2736
2737	refill_fl(sc, &q->fl[0], q->fl[0].size);
2738	refill_fl(sc, &q->fl[1], q->fl[1].size);
2739	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2740
2741	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2742		     V_NEWTIMER(q->rspq.holdoff_tmr));
2743
2744	return (0);
2745
2746err_unlock:
2747	mtx_unlock_spin(&sc->sge.reg_lock);
2748err:
2749	TXQ_LOCK(q);
2750	t3_free_qset(sc, q);
2751
2752	return (ret);
2753}
2754
2755/*
2756 * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2757 * ethernet data.  Hardware assistance with various checksums and any vlan tag
2758 * will also be taken into account here.
2759 */
2760void
2761t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2762{
2763	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2764	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2765	struct ifnet *ifp = pi->ifp;
2766
2767	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2768
2769	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2770	    cpl->csum_valid && cpl->csum == 0xffff) {
2771		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2772		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2773		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2774		m->m_pkthdr.csum_data = 0xffff;
2775	}
2776
2777	if (cpl->vlan_valid) {
2778		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2779		m->m_flags |= M_VLANTAG;
2780	}
2781
2782	m->m_pkthdr.rcvif = ifp;
2783	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2784	/*
2785	 * adjust after conversion to mbuf chain
2786	 */
2787	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2788	m->m_len -= (sizeof(*cpl) + ethpad);
2789	m->m_data += (sizeof(*cpl) + ethpad);
2790}
2791
2792/**
2793 *	get_packet - return the next ingress packet buffer from a free list
2794 *	@adap: the adapter that received the packet
2795 *	@drop_thres: # of remaining buffers before we start dropping packets
2796 *	@qs: the qset that the SGE free list holding the packet belongs to
2797 *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2798 *      @r: response descriptor
2799 *
2800 *	Get the next packet from a free list and complete setup of the
2801 *	sk_buff.  If the packet is small we make a copy and recycle the
2802 *	original buffer, otherwise we use the original buffer itself.  If a
2803 *	positive drop threshold is supplied packets are dropped and their
2804 *	buffers recycled if (a) the number of remaining buffers is under the
2805 *	threshold and the packet is too big to copy, or (b) the packet should
2806 *	be copied but there is no memory for the copy.
2807 */
2808static int
2809get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2810    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2811{
2812
2813	unsigned int len_cq =  ntohl(r->len_cq);
2814	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2815	int mask, cidx = fl->cidx;
2816	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2817	uint32_t len = G_RSPD_LEN(len_cq);
2818	uint32_t flags = M_EXT;
2819	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2820	caddr_t cl;
2821	struct mbuf *m;
2822	int ret = 0;
2823
2824	mask = fl->size - 1;
2825	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2826	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2827	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2828	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2829
2830	fl->credits--;
2831	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2832
2833	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2834	    sopeop == RSPQ_SOP_EOP) {
2835		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2836			goto skip_recycle;
2837		cl = mtod(m, void *);
2838		memcpy(cl, sd->rxsd_cl, len);
2839		recycle_rx_buf(adap, fl, fl->cidx);
2840		m->m_pkthdr.len = m->m_len = len;
2841		m->m_flags = 0;
2842		mh->mh_head = mh->mh_tail = m;
2843		ret = 1;
2844		goto done;
2845	} else {
2846	skip_recycle:
2847		bus_dmamap_unload(fl->entry_tag, sd->map);
2848		cl = sd->rxsd_cl;
2849		m = sd->m;
2850
2851		if ((sopeop == RSPQ_SOP_EOP) ||
2852		    (sopeop == RSPQ_SOP))
2853			flags |= M_PKTHDR;
2854		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2855		if (fl->zone == zone_pack) {
2856			/*
2857			 * restore clobbered data pointer
2858			 */
2859			m->m_data = m->m_ext.ext_buf;
2860		} else {
2861			m_cljset(m, cl, fl->type);
2862		}
2863		m->m_len = len;
2864	}
2865	switch(sopeop) {
2866	case RSPQ_SOP_EOP:
2867		ret = 1;
2868		/* FALLTHROUGH */
2869	case RSPQ_SOP:
2870		mh->mh_head = mh->mh_tail = m;
2871		m->m_pkthdr.len = len;
2872		break;
2873	case RSPQ_EOP:
2874		ret = 1;
2875		/* FALLTHROUGH */
2876	case RSPQ_NSOP_NEOP:
2877		if (mh->mh_tail == NULL) {
2878			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2879			m_freem(m);
2880			break;
2881		}
2882		mh->mh_tail->m_next = m;
2883		mh->mh_tail = m;
2884		mh->mh_head->m_pkthdr.len += len;
2885		break;
2886	}
2887	if (cxgb_debug)
2888		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2889done:
2890	if (++fl->cidx == fl->size)
2891		fl->cidx = 0;
2892
2893	return (ret);
2894}
2895
2896/**
2897 *	handle_rsp_cntrl_info - handles control information in a response
2898 *	@qs: the queue set corresponding to the response
2899 *	@flags: the response control flags
2900 *
2901 *	Handles the control information of an SGE response, such as GTS
2902 *	indications and completion credits for the queue set's Tx queues.
2903 *	HW coalesces credits, we don't do any extra SW coalescing.
2904 */
2905static __inline void
2906handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2907{
2908	unsigned int credits;
2909
2910#if USE_GTS
2911	if (flags & F_RSPD_TXQ0_GTS)
2912		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2913#endif
2914	credits = G_RSPD_TXQ0_CR(flags);
2915	if (credits)
2916		qs->txq[TXQ_ETH].processed += credits;
2917
2918	credits = G_RSPD_TXQ2_CR(flags);
2919	if (credits)
2920		qs->txq[TXQ_CTRL].processed += credits;
2921
2922# if USE_GTS
2923	if (flags & F_RSPD_TXQ1_GTS)
2924		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2925# endif
2926	credits = G_RSPD_TXQ1_CR(flags);
2927	if (credits)
2928		qs->txq[TXQ_OFLD].processed += credits;
2929
2930}
2931
2932static void
2933check_ring_db(adapter_t *adap, struct sge_qset *qs,
2934    unsigned int sleeping)
2935{
2936	;
2937}
2938
2939/**
2940 *	process_responses - process responses from an SGE response queue
2941 *	@adap: the adapter
2942 *	@qs: the queue set to which the response queue belongs
2943 *	@budget: how many responses can be processed in this round
2944 *
2945 *	Process responses from an SGE response queue up to the supplied budget.
2946 *	Responses include received packets as well as credits and other events
2947 *	for the queues that belong to the response queue's queue set.
2948 *	A negative budget is effectively unlimited.
2949 *
2950 *	Additionally choose the interrupt holdoff time for the next interrupt
2951 *	on this queue.  If the system is under memory shortage use a fairly
2952 *	long delay to help recovery.
2953 */
2954static int
2955process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2956{
2957	struct sge_rspq *rspq = &qs->rspq;
2958	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2959	int budget_left = budget;
2960	unsigned int sleeping = 0;
2961	int lro_enabled = qs->lro.enabled;
2962	int skip_lro;
2963	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2964	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2965	int ngathered = 0;
2966	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2967#ifdef DEBUG
2968	static int last_holdoff = 0;
2969	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2970		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2971		last_holdoff = rspq->holdoff_tmr;
2972	}
2973#endif
2974	rspq->next_holdoff = rspq->holdoff_tmr;
2975
2976	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2977		int eth, eop = 0, ethpad = 0;
2978		uint32_t flags = ntohl(r->flags);
2979		uint32_t rss_csum = *(const uint32_t *)r;
2980		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2981
2982		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2983
2984		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2985			struct mbuf *m;
2986
2987			if (cxgb_debug)
2988				printf("async notification\n");
2989
2990			if (mh->mh_head == NULL) {
2991				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2992				m = mh->mh_head;
2993			} else {
2994				m = m_gethdr(M_DONTWAIT, MT_DATA);
2995			}
2996			if (m == NULL)
2997				goto no_mem;
2998
2999                        memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3000			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3001                        *mtod(m, char *) = CPL_ASYNC_NOTIF;
3002			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3003			eop = 1;
3004                        rspq->async_notif++;
3005			goto skip;
3006		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3007			struct mbuf *m = NULL;
3008
3009			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3010			    r->rss_hdr.opcode, rspq->cidx);
3011			if (mh->mh_head == NULL)
3012				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3013                        else
3014				m = m_gethdr(M_DONTWAIT, MT_DATA);
3015
3016			if (mh->mh_head == NULL &&  m == NULL) {
3017		no_mem:
3018				rspq->next_holdoff = NOMEM_INTR_DELAY;
3019				budget_left--;
3020				break;
3021			}
3022			get_imm_packet(adap, r, mh->mh_head);
3023			eop = 1;
3024			rspq->imm_data++;
3025		} else if (r->len_cq) {
3026			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3027
3028			eop = get_packet(adap, drop_thresh, qs, mh, r);
3029			if (eop) {
3030				if (r->rss_hdr.hash_type && !adap->timestamp)
3031					mh->mh_head->m_flags |= M_FLOWID;
3032				mh->mh_head->m_pkthdr.flowid = rss_hash;
3033			}
3034
3035			ethpad = 2;
3036		} else {
3037			rspq->pure_rsps++;
3038		}
3039	skip:
3040		if (flags & RSPD_CTRL_MASK) {
3041			sleeping |= flags & RSPD_GTS_MASK;
3042			handle_rsp_cntrl_info(qs, flags);
3043		}
3044
3045		r++;
3046		if (__predict_false(++rspq->cidx == rspq->size)) {
3047			rspq->cidx = 0;
3048			rspq->gen ^= 1;
3049			r = rspq->desc;
3050		}
3051
3052		if (++rspq->credits >= 64) {
3053			refill_rspq(adap, rspq, rspq->credits);
3054			rspq->credits = 0;
3055		}
3056		if (!eth && eop) {
3057			mh->mh_head->m_pkthdr.csum_data = rss_csum;
3058			/*
3059			 * XXX size mismatch
3060			 */
3061			m_set_priority(mh->mh_head, rss_hash);
3062
3063
3064			ngathered = rx_offload(&adap->tdev, rspq,
3065			    mh->mh_head, offload_mbufs, ngathered);
3066			mh->mh_head = NULL;
3067			DPRINTF("received offload packet\n");
3068
3069		} else if (eth && eop) {
3070			struct mbuf *m = mh->mh_head;
3071
3072			t3_rx_eth(adap, rspq, m, ethpad);
3073
3074			/*
3075			 * The T304 sends incoming packets on any qset.  If LRO
3076			 * is also enabled, we could end up sending packet up
3077			 * lro_ctrl->ifp's input.  That is incorrect.
3078			 *
3079			 * The mbuf's rcvif was derived from the cpl header and
3080			 * is accurate.  Skip LRO and just use that.
3081			 */
3082			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3083
3084			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
3085#ifdef INET
3086			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
3087#endif
3088			    ) {
3089				/* successfully queue'd for LRO */
3090			} else {
3091				/*
3092				 * LRO not enabled, packet unsuitable for LRO,
3093				 * or unable to queue.  Pass it up right now in
3094				 * either case.
3095				 */
3096				struct ifnet *ifp = m->m_pkthdr.rcvif;
3097				(*ifp->if_input)(ifp, m);
3098			}
3099			mh->mh_head = NULL;
3100
3101		}
3102		__refill_fl_lt(adap, &qs->fl[0], 32);
3103		__refill_fl_lt(adap, &qs->fl[1], 32);
3104		--budget_left;
3105	}
3106
3107	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3108
3109#ifdef INET
3110	/* Flush LRO */
3111	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3112		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3113		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3114		tcp_lro_flush(lro_ctrl, queued);
3115	}
3116#endif
3117
3118	if (sleeping)
3119		check_ring_db(adap, qs, sleeping);
3120
3121	mb();  /* commit Tx queue processed updates */
3122	if (__predict_false(qs->txq_stopped > 1))
3123		restart_tx(qs);
3124
3125	__refill_fl_lt(adap, &qs->fl[0], 512);
3126	__refill_fl_lt(adap, &qs->fl[1], 512);
3127	budget -= budget_left;
3128	return (budget);
3129}
3130
3131/*
3132 * A helper function that processes responses and issues GTS.
3133 */
3134static __inline int
3135process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3136{
3137	int work;
3138	static int last_holdoff = 0;
3139
3140	work = process_responses(adap, rspq_to_qset(rq), -1);
3141
3142	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3143		printf("next_holdoff=%d\n", rq->next_holdoff);
3144		last_holdoff = rq->next_holdoff;
3145	}
3146	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3147	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3148
3149	return (work);
3150}
3151
3152
3153/*
3154 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3155 * Handles data events from SGE response queues as well as error and other
3156 * async events as they all use the same interrupt pin.  We use one SGE
3157 * response queue per port in this mode and protect all response queues with
3158 * queue 0's lock.
3159 */
3160void
3161t3b_intr(void *data)
3162{
3163	uint32_t i, map;
3164	adapter_t *adap = data;
3165	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3166
3167	t3_write_reg(adap, A_PL_CLI, 0);
3168	map = t3_read_reg(adap, A_SG_DATA_INTR);
3169
3170	if (!map)
3171		return;
3172
3173	if (__predict_false(map & F_ERRINTR)) {
3174		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3175		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3176		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3177	}
3178
3179	mtx_lock(&q0->lock);
3180	for_each_port(adap, i)
3181	    if (map & (1 << i))
3182			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3183	mtx_unlock(&q0->lock);
3184}
3185
3186/*
3187 * The MSI interrupt handler.  This needs to handle data events from SGE
3188 * response queues as well as error and other async events as they all use
3189 * the same MSI vector.  We use one SGE response queue per port in this mode
3190 * and protect all response queues with queue 0's lock.
3191 */
3192void
3193t3_intr_msi(void *data)
3194{
3195	adapter_t *adap = data;
3196	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3197	int i, new_packets = 0;
3198
3199	mtx_lock(&q0->lock);
3200
3201	for_each_port(adap, i)
3202	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3203		    new_packets = 1;
3204	mtx_unlock(&q0->lock);
3205	if (new_packets == 0) {
3206		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3207		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3208		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3209	}
3210}
3211
3212void
3213t3_intr_msix(void *data)
3214{
3215	struct sge_qset *qs = data;
3216	adapter_t *adap = qs->port->adapter;
3217	struct sge_rspq *rspq = &qs->rspq;
3218
3219	if (process_responses_gts(adap, rspq) == 0)
3220		rspq->unhandled_irqs++;
3221}
3222
3223#define QDUMP_SBUF_SIZE		32 * 400
3224static int
3225t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3226{
3227	struct sge_rspq *rspq;
3228	struct sge_qset *qs;
3229	int i, err, dump_end, idx;
3230	struct sbuf *sb;
3231	struct rsp_desc *rspd;
3232	uint32_t data[4];
3233
3234	rspq = arg1;
3235	qs = rspq_to_qset(rspq);
3236	if (rspq->rspq_dump_count == 0)
3237		return (0);
3238	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3239		log(LOG_WARNING,
3240		    "dump count is too large %d\n", rspq->rspq_dump_count);
3241		rspq->rspq_dump_count = 0;
3242		return (EINVAL);
3243	}
3244	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3245		log(LOG_WARNING,
3246		    "dump start of %d is greater than queue size\n",
3247		    rspq->rspq_dump_start);
3248		rspq->rspq_dump_start = 0;
3249		return (EINVAL);
3250	}
3251	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3252	if (err)
3253		return (err);
3254
3255	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3256
3257	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3258	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3259	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3260	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3261	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3262
3263	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3264	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3265
3266	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3267	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3268		idx = i & (RSPQ_Q_SIZE-1);
3269
3270		rspd = &rspq->desc[idx];
3271		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3272		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3273		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3274		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3275		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3276		    be32toh(rspd->len_cq), rspd->intr_gen);
3277	}
3278
3279	err = sbuf_finish(sb);
3280	/* Output a trailing NUL. */
3281	if (err == 0)
3282		err = SYSCTL_OUT(req, "", 1);
3283	sbuf_delete(sb);
3284	return (err);
3285}
3286
3287static int
3288t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3289{
3290	struct sge_txq *txq;
3291	struct sge_qset *qs;
3292	int i, j, err, dump_end;
3293	struct sbuf *sb;
3294	struct tx_desc *txd;
3295	uint32_t *WR, wr_hi, wr_lo, gen;
3296	uint32_t data[4];
3297
3298	txq = arg1;
3299	qs = txq_to_qset(txq, TXQ_ETH);
3300	if (txq->txq_dump_count == 0) {
3301		return (0);
3302	}
3303	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3304		log(LOG_WARNING,
3305		    "dump count is too large %d\n", txq->txq_dump_count);
3306		txq->txq_dump_count = 1;
3307		return (EINVAL);
3308	}
3309	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3310		log(LOG_WARNING,
3311		    "dump start of %d is greater than queue size\n",
3312		    txq->txq_dump_start);
3313		txq->txq_dump_start = 0;
3314		return (EINVAL);
3315	}
3316	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3317	if (err)
3318		return (err);
3319
3320	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3321
3322	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3323	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3324	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3325	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
3326	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3327	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3328	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3329	    txq->txq_dump_start,
3330	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3331
3332	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3333	for (i = txq->txq_dump_start; i < dump_end; i++) {
3334		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3335		WR = (uint32_t *)txd->flit;
3336		wr_hi = ntohl(WR[0]);
3337		wr_lo = ntohl(WR[1]);
3338		gen = G_WR_GEN(wr_lo);
3339
3340		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3341		    wr_hi, wr_lo, gen);
3342		for (j = 2; j < 30; j += 4)
3343			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3344			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3345
3346	}
3347	err = sbuf_finish(sb);
3348	/* Output a trailing NUL. */
3349	if (err == 0)
3350		err = SYSCTL_OUT(req, "", 1);
3351	sbuf_delete(sb);
3352	return (err);
3353}
3354
3355static int
3356t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3357{
3358	struct sge_txq *txq;
3359	struct sge_qset *qs;
3360	int i, j, err, dump_end;
3361	struct sbuf *sb;
3362	struct tx_desc *txd;
3363	uint32_t *WR, wr_hi, wr_lo, gen;
3364
3365	txq = arg1;
3366	qs = txq_to_qset(txq, TXQ_CTRL);
3367	if (txq->txq_dump_count == 0) {
3368		return (0);
3369	}
3370	if (txq->txq_dump_count > 256) {
3371		log(LOG_WARNING,
3372		    "dump count is too large %d\n", txq->txq_dump_count);
3373		txq->txq_dump_count = 1;
3374		return (EINVAL);
3375	}
3376	if (txq->txq_dump_start > 255) {
3377		log(LOG_WARNING,
3378		    "dump start of %d is greater than queue size\n",
3379		    txq->txq_dump_start);
3380		txq->txq_dump_start = 0;
3381		return (EINVAL);
3382	}
3383
3384	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3385	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3386	    txq->txq_dump_start,
3387	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3388
3389	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3390	for (i = txq->txq_dump_start; i < dump_end; i++) {
3391		txd = &txq->desc[i & (255)];
3392		WR = (uint32_t *)txd->flit;
3393		wr_hi = ntohl(WR[0]);
3394		wr_lo = ntohl(WR[1]);
3395		gen = G_WR_GEN(wr_lo);
3396
3397		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3398		    wr_hi, wr_lo, gen);
3399		for (j = 2; j < 30; j += 4)
3400			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3401			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3402
3403	}
3404	err = sbuf_finish(sb);
3405	/* Output a trailing NUL. */
3406	if (err == 0)
3407		err = SYSCTL_OUT(req, "", 1);
3408	sbuf_delete(sb);
3409	return (err);
3410}
3411
3412static int
3413t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3414{
3415	adapter_t *sc = arg1;
3416	struct qset_params *qsp = &sc->params.sge.qset[0];
3417	int coalesce_usecs;
3418	struct sge_qset *qs;
3419	int i, j, err, nqsets = 0;
3420	struct mtx *lock;
3421
3422	if ((sc->flags & FULL_INIT_DONE) == 0)
3423		return (ENXIO);
3424
3425	coalesce_usecs = qsp->coalesce_usecs;
3426        err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3427
3428	if (err != 0) {
3429		return (err);
3430	}
3431	if (coalesce_usecs == qsp->coalesce_usecs)
3432		return (0);
3433
3434	for (i = 0; i < sc->params.nports; i++)
3435		for (j = 0; j < sc->port[i].nqsets; j++)
3436			nqsets++;
3437
3438	coalesce_usecs = max(1, coalesce_usecs);
3439
3440	for (i = 0; i < nqsets; i++) {
3441		qs = &sc->sge.qs[i];
3442		qsp = &sc->params.sge.qset[i];
3443		qsp->coalesce_usecs = coalesce_usecs;
3444
3445		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3446			    &sc->sge.qs[0].rspq.lock;
3447
3448		mtx_lock(lock);
3449		t3_update_qset_coalesce(qs, qsp);
3450		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3451		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3452		mtx_unlock(lock);
3453	}
3454
3455	return (0);
3456}
3457
3458static int
3459t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3460{
3461	adapter_t *sc = arg1;
3462	int rc, timestamp;
3463
3464	if ((sc->flags & FULL_INIT_DONE) == 0)
3465		return (ENXIO);
3466
3467	timestamp = sc->timestamp;
3468	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3469
3470	if (rc != 0)
3471		return (rc);
3472
3473	if (timestamp != sc->timestamp) {
3474		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3475		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3476		sc->timestamp = timestamp;
3477	}
3478
3479	return (0);
3480}
3481
3482void
3483t3_add_attach_sysctls(adapter_t *sc)
3484{
3485	struct sysctl_ctx_list *ctx;
3486	struct sysctl_oid_list *children;
3487
3488	ctx = device_get_sysctl_ctx(sc->dev);
3489	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3490
3491	/* random information */
3492	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3493	    "firmware_version",
3494	    CTLFLAG_RD, &sc->fw_version,
3495	    0, "firmware version");
3496	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3497	    "hw_revision",
3498	    CTLFLAG_RD, &sc->params.rev,
3499	    0, "chip model");
3500	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3501	    "port_types",
3502	    CTLFLAG_RD, &sc->port_types,
3503	    0, "type of ports");
3504	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3505	    "enable_debug",
3506	    CTLFLAG_RW, &cxgb_debug,
3507	    0, "enable verbose debugging output");
3508	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3509	    CTLFLAG_RD, &sc->tunq_coalesce,
3510	    "#tunneled packets freed");
3511	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3512	    "txq_overrun",
3513	    CTLFLAG_RD, &txq_fills,
3514	    0, "#times txq overrun");
3515	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3516	    "core_clock",
3517	    CTLFLAG_RD, &sc->params.vpd.cclk,
3518	    0, "core clock frequency (in KHz)");
3519}
3520
3521
3522static const char *rspq_name = "rspq";
3523static const char *txq_names[] =
3524{
3525	"txq_eth",
3526	"txq_ofld",
3527	"txq_ctrl"
3528};
3529
3530static int
3531sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3532{
3533	struct port_info *p = arg1;
3534	uint64_t *parg;
3535
3536	if (!p)
3537		return (EINVAL);
3538
3539	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3540	PORT_LOCK(p);
3541	t3_mac_update_stats(&p->mac);
3542	PORT_UNLOCK(p);
3543
3544	return (sysctl_handle_quad(oidp, parg, 0, req));
3545}
3546
3547void
3548t3_add_configured_sysctls(adapter_t *sc)
3549{
3550	struct sysctl_ctx_list *ctx;
3551	struct sysctl_oid_list *children;
3552	int i, j;
3553
3554	ctx = device_get_sysctl_ctx(sc->dev);
3555	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3556
3557	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3558	    "intr_coal",
3559	    CTLTYPE_INT|CTLFLAG_RW, sc,
3560	    0, t3_set_coalesce_usecs,
3561	    "I", "interrupt coalescing timer (us)");
3562
3563	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3564	    "pkt_timestamp",
3565	    CTLTYPE_INT | CTLFLAG_RW, sc,
3566	    0, t3_pkt_timestamp,
3567	    "I", "provide packet timestamp instead of connection hash");
3568
3569	for (i = 0; i < sc->params.nports; i++) {
3570		struct port_info *pi = &sc->port[i];
3571		struct sysctl_oid *poid;
3572		struct sysctl_oid_list *poidlist;
3573		struct mac_stats *mstats = &pi->mac.stats;
3574
3575		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3576		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3577		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3578		poidlist = SYSCTL_CHILDREN(poid);
3579		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3580		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3581		    0, "#queue sets");
3582
3583		for (j = 0; j < pi->nqsets; j++) {
3584			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3585			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3586					  *ctrlqpoid, *lropoid;
3587			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3588					       *txqpoidlist, *ctrlqpoidlist,
3589					       *lropoidlist;
3590			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3591
3592			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3593
3594			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3595			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3596			qspoidlist = SYSCTL_CHILDREN(qspoid);
3597
3598			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3599					CTLFLAG_RD, &qs->fl[0].empty, 0,
3600					"freelist #0 empty");
3601			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3602					CTLFLAG_RD, &qs->fl[1].empty, 0,
3603					"freelist #1 empty");
3604
3605			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3606			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3607			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3608
3609			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3610			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3611			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3612
3613			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3614			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3615			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3616
3617			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3618			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3619			lropoidlist = SYSCTL_CHILDREN(lropoid);
3620
3621			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3622			    CTLFLAG_RD, &qs->rspq.size,
3623			    0, "#entries in response queue");
3624			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3625			    CTLFLAG_RD, &qs->rspq.cidx,
3626			    0, "consumer index");
3627			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3628			    CTLFLAG_RD, &qs->rspq.credits,
3629			    0, "#credits");
3630			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3631			    CTLFLAG_RD, &qs->rspq.starved,
3632			    0, "#times starved");
3633			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3634			    CTLFLAG_RD, &qs->rspq.phys_addr,
3635			    "physical_address_of the queue");
3636			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3637			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3638			    0, "start rspq dump entry");
3639			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3640			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3641			    0, "#rspq entries to dump");
3642			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3643			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3644			    0, t3_dump_rspq, "A", "dump of the response queue");
3645
3646			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3647			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3648			    "#tunneled packets dropped");
3649			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3650			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3651			    0, "#tunneled packets waiting to be sent");
3652#if 0
3653			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3654			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3655			    0, "#tunneled packets queue producer index");
3656			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3657			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3658			    0, "#tunneled packets queue consumer index");
3659#endif
3660			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3661			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3662			    0, "#tunneled packets processed by the card");
3663			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3664			    CTLFLAG_RD, &txq->cleaned,
3665			    0, "#tunneled packets cleaned");
3666			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3667			    CTLFLAG_RD, &txq->in_use,
3668			    0, "#tunneled packet slots in use");
3669			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3670			    CTLFLAG_RD, &txq->txq_frees,
3671			    "#tunneled packets freed");
3672			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3673			    CTLFLAG_RD, &txq->txq_skipped,
3674			    0, "#tunneled packet descriptors skipped");
3675			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3676			    CTLFLAG_RD, &txq->txq_coalesced,
3677			    "#tunneled packets coalesced");
3678			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3679			    CTLFLAG_RD, &txq->txq_enqueued,
3680			    0, "#tunneled packets enqueued to hardware");
3681			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3682			    CTLFLAG_RD, &qs->txq_stopped,
3683			    0, "tx queues stopped");
3684			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3685			    CTLFLAG_RD, &txq->phys_addr,
3686			    "physical_address_of the queue");
3687			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3688			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3689			    0, "txq generation");
3690			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3691			    CTLFLAG_RD, &txq->cidx,
3692			    0, "hardware queue cidx");
3693			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3694			    CTLFLAG_RD, &txq->pidx,
3695			    0, "hardware queue pidx");
3696			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3697			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3698			    0, "txq start idx for dump");
3699			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3700			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3701			    0, "txq #entries to dump");
3702			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3703			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3704			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3705
3706			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3707			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3708			    0, "ctrlq start idx for dump");
3709			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3710			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3711			    0, "ctrl #entries to dump");
3712			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3713			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3714			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3715
3716			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3717			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3718			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3719			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3720			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3721			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3722			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3723			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3724		}
3725
3726		/* Now add a node for mac stats. */
3727		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3728		    CTLFLAG_RD, NULL, "MAC statistics");
3729		poidlist = SYSCTL_CHILDREN(poid);
3730
3731		/*
3732		 * We (ab)use the length argument (arg2) to pass on the offset
3733		 * of the data that we are interested in.  This is only required
3734		 * for the quad counters that are updated from the hardware (we
3735		 * make sure that we return the latest value).
3736		 * sysctl_handle_macstat first updates *all* the counters from
3737		 * the hardware, and then returns the latest value of the
3738		 * requested counter.  Best would be to update only the
3739		 * requested counter from hardware, but t3_mac_update_stats()
3740		 * hides all the register details and we don't want to dive into
3741		 * all that here.
3742		 */
3743#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3744    (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3745    sysctl_handle_macstat, "QU", 0)
3746		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3747		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3748		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3749		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3750		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3751		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3752		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3753		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3754		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3755		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3756		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3757		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3758		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3759		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3760		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3761		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3762		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3763		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3764		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3765		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3766		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3767		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3768		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3769		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3770		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3771		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3772		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3773		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3774		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3775		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3776		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3777		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3778		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3779		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3780		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3781		CXGB_SYSCTL_ADD_QUAD(rx_short);
3782		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3783		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3784		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3785		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3786		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3787		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3788		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3789		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3790		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3791		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3792#undef CXGB_SYSCTL_ADD_QUAD
3793
3794#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3795    CTLFLAG_RD, &mstats->a, 0)
3796		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3797		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3798		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3799		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3800		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3801		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3802		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3803		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3804		CXGB_SYSCTL_ADD_ULONG(num_resets);
3805		CXGB_SYSCTL_ADD_ULONG(link_faults);
3806#undef CXGB_SYSCTL_ADD_ULONG
3807	}
3808}
3809
3810/**
3811 *	t3_get_desc - dump an SGE descriptor for debugging purposes
3812 *	@qs: the queue set
3813 *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3814 *	@idx: the descriptor index in the queue
3815 *	@data: where to dump the descriptor contents
3816 *
3817 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3818 *	size of the descriptor.
3819 */
3820int
3821t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3822		unsigned char *data)
3823{
3824	if (qnum >= 6)
3825		return (EINVAL);
3826
3827	if (qnum < 3) {
3828		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3829			return -EINVAL;
3830		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3831		return sizeof(struct tx_desc);
3832	}
3833
3834	if (qnum == 3) {
3835		if (!qs->rspq.desc || idx >= qs->rspq.size)
3836			return (EINVAL);
3837		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3838		return sizeof(struct rsp_desc);
3839	}
3840
3841	qnum -= 4;
3842	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3843		return (EINVAL);
3844	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3845	return sizeof(struct rx_desc);
3846}
3847