1/**************************************************************************
2
3Copyright (c) 2007-2009, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD$");
32
33#include "opt_inet6.h"
34#include "opt_inet.h"
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/module.h>
40#include <sys/bus.h>
41#include <sys/conf.h>
42#include <machine/bus.h>
43#include <machine/resource.h>
44#include <sys/bus_dma.h>
45#include <sys/rman.h>
46#include <sys/queue.h>
47#include <sys/sysctl.h>
48#include <sys/taskqueue.h>
49
50#include <sys/proc.h>
51#include <sys/sbuf.h>
52#include <sys/sched.h>
53#include <sys/smp.h>
55#include <sys/syslog.h>
56#include <sys/socket.h>
57#include <sys/sglist.h>
58
59#include <net/bpf.h>
60#include <net/ethernet.h>
61#include <net/if.h>
62#include <net/if_vlan_var.h>
63
64#include <netinet/in_systm.h>
65#include <netinet/in.h>
66#include <netinet/ip.h>
67#include <netinet/ip6.h>
68#include <netinet/tcp.h>
69
70#include <dev/pci/pcireg.h>
71#include <dev/pci/pcivar.h>
72
73#include <vm/vm.h>
74#include <vm/pmap.h>
75
76#include <cxgb_include.h>
77#include <sys/mvec.h>
78
79int	txq_fills = 0;
80int	multiq_tx_enable = 1;
81
82#ifdef TCP_OFFLOAD
83CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
84#endif
85
86extern struct sysctl_oid_list sysctl__hw_cxgb_children;
87int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
88TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
89SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
90    "size of per-queue mbuf ring");
91
92static int cxgb_tx_coalesce_force = 0;
93TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
94SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
95    &cxgb_tx_coalesce_force, 0,
96    "coalesce small packets into a single work request regardless of ring state");
97
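/*
 * Coalescing and reclaim thresholds, expressed as fractions of the Tx ring
 * size: by default coalescing starts once the ring is half full and stops
 * once it drains back to a quarter, and Tx descriptors are reclaimed once
 * at least 1/32 of the ring is reclaimable.
 */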
#define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
#define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
#define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
#define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
#define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
#define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
105
106
107static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
108TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
109    &cxgb_tx_coalesce_enable_start);
110SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
111    &cxgb_tx_coalesce_enable_start, 0,
112    "coalesce enable threshold");
113static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
114TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
115SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
116    &cxgb_tx_coalesce_enable_stop, 0,
117    "coalesce disable threshold");
118static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
119TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
120SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
121    &cxgb_tx_reclaim_threshold, 0,
122    "tx cleaning minimum threshold");
123
124/*
125 * XXX don't re-enable this until TOE stops assuming
126 * we have an m_ext
127 */
128static int recycle_enable = 0;
129
130extern int cxgb_use_16k_clusters;
131extern int nmbjumbop;
132extern int nmbjumbo9;
133extern int nmbjumbo16;
134
135#define USE_GTS 0
136
137#define SGE_RX_SM_BUF_SIZE	1536
138#define SGE_RX_DROP_THRES	16
139#define SGE_RX_COPY_THRES	128
140
141/*
142 * Period of the Tx buffer reclaim timer.  This timer does not need to run
143 * frequently as Tx buffers are usually reclaimed by new Tx packets.
144 */
145#define TX_RECLAIM_PERIOD       (hz >> 1)
146
147/*
148 * Values for sge_txq.flags
149 */
150enum {
151	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
152	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
153};
154
155struct tx_desc {
156	uint64_t	flit[TX_DESC_FLITS];
157} __packed;
158
159struct rx_desc {
160	uint32_t	addr_lo;
161	uint32_t	len_gen;
162	uint32_t	gen2;
163	uint32_t	addr_hi;
164} __packed;
165
166struct rsp_desc {               /* response queue descriptor */
167	struct rss_header	rss_hdr;
168	uint32_t		flags;
169	uint32_t		len_cq;
170	uint8_t			imm_data[47];
171	uint8_t			intr_gen;
172} __packed;
173
174#define RX_SW_DESC_MAP_CREATED	(1 << 0)
175#define TX_SW_DESC_MAP_CREATED	(1 << 1)
176#define RX_SW_DESC_INUSE        (1 << 3)
177#define TX_SW_DESC_MAPPED       (1 << 4)
178
179#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
180#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
181#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
182#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
183
184struct tx_sw_desc {                /* SW state per Tx descriptor */
185	struct mbuf	*m;
186	bus_dmamap_t	map;
187	int		flags;
188};
189
190struct rx_sw_desc {                /* SW state per Rx descriptor */
191	caddr_t		rxsd_cl;
192	struct mbuf	*m;
193	bus_dmamap_t	map;
194	int		flags;
195};
196
197struct txq_state {
198	unsigned int	compl;
199	unsigned int	gen;
200	unsigned int	pidx;
201};
202
203struct refill_fl_cb_arg {
204	int               error;
205	bus_dma_segment_t seg;
206	int               nseg;
207};
208
209
210/*
211 * Maps a number of flits to the number of Tx descriptors that can hold them.
212 * The formula is
213 *
214 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
215 *
216 * HW allows up to 4 descriptors to be combined into a WR.
217 */
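/*
 * For example, with SGE_NUM_GENBITS == 2 (WR_FLITS presumably 15), a
 * 16-flit request needs 1 + (16 - 2) / 14 = 2 descriptors, which matches
 * the table below.
 */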
218static uint8_t flit_desc_map[] = {
219	0,
220#if SGE_NUM_GENBITS == 1
221	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
225#elif SGE_NUM_GENBITS == 2
226	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
227	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
228	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
229	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
230#else
231# error "SGE_NUM_GENBITS must be 1 or 2"
232#endif
233};
234
235#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
236#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
237#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
238#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
239#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
240#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
241	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
243#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
244	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
245#define	TXQ_RING_DEQUEUE(qs) \
246	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
247
248int cxgb_debug = 0;
249
250static void sge_timer_cb(void *arg);
251static void sge_timer_reclaim(void *arg, int ncount);
252static void sge_txq_reclaim_handler(void *arg, int ncount);
253static void cxgb_start_locked(struct sge_qset *qs);
254
/*
 * XXX need to cope with bursty scheduling by looking at a wider
 * window than we do now when determining the need for coalescing.
 */
260static __inline uint64_t
261check_pkt_coalesce(struct sge_qset *qs)
262{
	struct adapter *sc;
	struct sge_txq *txq;
	uint8_t *fill;

	if (__predict_false(cxgb_tx_coalesce_force))
		return (1);
	txq = &qs->txq[TXQ_ETH];
	sc = qs->port->adapter;
	fill = &sc->tunq_fill[qs->idx];
272
	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
	/*
	 * Once the hardware transmit queue fills past the enable-start
	 * threshold we mark the queue set as coalescing.  We drop back out
	 * of coalescing only when the queue drains below the enable-stop
	 * threshold and no packets are enqueued, which gives us some degree
	 * of hysteresis.
	 */
	if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
		*fill = 0;
	else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
		*fill = 1;
288
289	return (sc->tunq_coalesce);
290}
291
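/*
 * The SGE must never observe a work request header with only one half
 * updated.  On 64-bit platforms both 32-bit halves can be written with a
 * single store; on 32-bit platforms the high word is written first and a
 * write barrier keeps it ordered ahead of the low word, which carries the
 * generation bit.
 */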
292#ifdef __LP64__
293static void
294set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
295{
296	uint64_t wr_hilo;
297#if _BYTE_ORDER == _LITTLE_ENDIAN
298	wr_hilo = wr_hi;
299	wr_hilo |= (((uint64_t)wr_lo)<<32);
300#else
301	wr_hilo = wr_lo;
302	wr_hilo |= (((uint64_t)wr_hi)<<32);
303#endif
304	wrp->wrh_hilo = wr_hilo;
305}
306#else
307static void
308set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
309{
310
311	wrp->wrh_hi = wr_hi;
312	wmb();
313	wrp->wrh_lo = wr_lo;
314}
315#endif
316
317struct coalesce_info {
318	int count;
319	int nbytes;
320};
321
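/*
 * Tx coalescing batches small packets into a single work request.  The
 * limits below mirror the batch WR format: at most 7 single-mbuf packets
 * (one cpl_tx_pkt_batch entry each) totalling no more than 10500 bytes;
 * the byte limit is presumably tied to the per-WR payload budget.
 */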
322static int
323coalesce_check(struct mbuf *m, void *arg)
324{
325	struct coalesce_info *ci = arg;
326	int *count = &ci->count;
327	int *nbytes = &ci->nbytes;
328
329	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
330		(*count < 7) && (m->m_next == NULL))) {
331		*count += 1;
332		*nbytes += m->m_len;
333		return (1);
334	}
335	return (0);
336}
337
338static struct mbuf *
339cxgb_dequeue(struct sge_qset *qs)
340{
341	struct mbuf *m, *m_head, *m_tail;
342	struct coalesce_info ci;
343
344
345	if (check_pkt_coalesce(qs) == 0)
346		return TXQ_RING_DEQUEUE(qs);
347
348	m_head = m_tail = NULL;
349	ci.count = ci.nbytes = 0;
350	do {
351		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
352		if (m_head == NULL) {
353			m_tail = m_head = m;
354		} else if (m != NULL) {
355			m_tail->m_nextpkt = m;
356			m_tail = m;
357		}
358	} while (m != NULL);
359	if (ci.count > 7)
		panic("trying to coalesce %d packets into one WR", ci.count);
361	return (m_head);
362}
363
/**
 *	reclaim_completed_tx - reclaims completed Tx descriptors
 *	@qs: the queue set owning the Tx queue
 *	@reclaim_min: don't bother reclaiming fewer than this many descriptors
 *	@queue: the Tx queue (TXQ_*) to reclaim completed descriptors from
 *
 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
 *	and frees the associated buffers if possible.  Called with the Tx
 *	queue's lock held.
 */
373static __inline int
374reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
375{
376	struct sge_txq *q = &qs->txq[queue];
377	int reclaim = desc_reclaimable(q);
378
379	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
380	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
381		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
382
383	if (reclaim < reclaim_min)
384		return (0);
385
386	mtx_assert(&qs->lock, MA_OWNED);
387	if (reclaim > 0) {
388		t3_free_tx_desc(qs, reclaim, queue);
389		q->cleaned += reclaim;
390		q->in_use -= reclaim;
391	}
392	if (isset(&qs->txq_stopped, TXQ_ETH))
		clrbit(&qs->txq_stopped, TXQ_ETH);
394
395	return (reclaim);
396}
397
398/**
399 *	should_restart_tx - are there enough resources to restart a Tx queue?
400 *	@q: the Tx queue
401 *
402 *	Checks if there are enough descriptors to restart a suspended Tx queue.
403 */
404static __inline int
405should_restart_tx(const struct sge_txq *q)
406{
407	unsigned int r = q->processed - q->cleaned;
408
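	/*
	 * r counts descriptors the SGE has completed but that we have not
	 * reclaimed yet; restart only once the effective backlog drops
	 * below half of the ring.
	 */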
409	return q->in_use - r < (q->size >> 1);
410}
411
412/**
413 *	t3_sge_init - initialize SGE
414 *	@adap: the adapter
415 *	@p: the SGE parameters
416 *
417 *	Performs SGE initialization needed every time after a chip reset.
 *	We do not initialize any of the queue sets here; instead, the driver
 *	top-level must request those individually.  We also do not enable DMA
 *	here; that should be done after the queues have been set up.
421 */
422void
423t3_sge_init(adapter_t *adap, struct sge_params *p)
424{
425	u_int ctrl, ups;
426
427	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
428
429	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
430	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
431	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
432	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
433#if SGE_NUM_GENBITS == 1
434	ctrl |= F_EGRGENCTRL;
435#endif
436	if (adap->params.rev > 0) {
437		if (!(adap->flags & (USING_MSIX | USING_MSI)))
438			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
439	}
440	t3_write_reg(adap, A_SG_CONTROL, ctrl);
441	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
442		     V_LORCQDRBTHRSH(512));
443	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
444	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
445		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
446	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
447		     adap->params.rev < T3_REV_C ? 1000 : 500);
448	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
449	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
450	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
451	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
452	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
453}
454
455
456/**
457 *	sgl_len - calculates the size of an SGL of the given capacity
458 *	@n: the number of SGL entries
459 *
460 *	Calculates the number of flits needed for a scatter/gather list that
461 *	can hold the given number of entries.
462 */
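/*
 * Each struct sg_ent packs two address/length pairs into 3 flits (two
 * 8-byte addresses plus two 4-byte lengths), so a full pair of entries
 * costs 3 flits and a trailing odd entry costs 2.
 */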
463static __inline unsigned int
464sgl_len(unsigned int n)
465{
466	return ((3 * n) / 2 + (n & 1));
467}
468
/**
 *	get_imm_packet - return the next ingress packet buffer from a response
 *	@sc: the adapter softc
 *	@resp: the response descriptor containing the packet data
 *	@m: the mbuf that receives the immediate data
 *
 *	Copies the immediate data of the given response into the supplied mbuf.
 */
475static int
476get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
477{
478
479	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
480		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
481		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
482	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
483		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
484		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
485	} else
486		m->m_len = IMMED_PKT_SIZE;
487	m->m_ext.ext_buf = NULL;
488	m->m_ext.ext_type = 0;
489	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
490	return (0);
491}
492
493static __inline u_int
494flits_to_desc(u_int n)
495{
496	return (flit_desc_map[n]);
497}
498
499#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
500		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
501		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
502		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
503		    F_HIRCQPARITYERROR)
504#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
505#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
506		      F_RSPQDISABLED)
507
508/**
509 *	t3_sge_err_intr_handler - SGE async event interrupt handler
510 *	@adapter: the adapter
511 *
512 *	Interrupt handler for SGE asynchronous (non-data) events.
513 */
514void
515t3_sge_err_intr_handler(adapter_t *adapter)
516{
517	unsigned int v, status;
518
519	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
520	if (status & SGE_PARERR)
521		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
522			 status & SGE_PARERR);
523	if (status & SGE_FRAMINGERR)
524		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
525			 status & SGE_FRAMINGERR);
526	if (status & F_RSPQCREDITOVERFOW)
527		CH_ALERT(adapter, "SGE response queue credit overflow\n");
528
529	if (status & F_RSPQDISABLED) {
530		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
531
532		CH_ALERT(adapter,
533			 "packet delivered to disabled response queue (0x%x)\n",
534			 (v >> S_RSPQ0DISABLED) & 0xff);
535	}
536
537	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
538	if (status & SGE_FATALERR)
539		t3_fatal_err(adapter);
540}
541
542void
543t3_sge_prep(adapter_t *adap, struct sge_params *p)
544{
545	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
546
547	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
548	nqsets *= adap->params.nports;
549
550	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
551
552	while (!powerof2(fl_q_size))
553		fl_q_size--;
554
555	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
556	    is_offload(adap);
557
558#if __FreeBSD_version >= 700111
559	if (use_16k) {
560		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
561		jumbo_buf_size = MJUM16BYTES;
562	} else {
563		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
564		jumbo_buf_size = MJUM9BYTES;
565	}
566#else
567	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
568	jumbo_buf_size = MJUMPAGESIZE;
569#endif
570	while (!powerof2(jumbo_q_size))
571		jumbo_q_size--;
572
573	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
574		device_printf(adap->dev,
575		    "Insufficient clusters and/or jumbo buffers.\n");
576
577	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
578
579	for (i = 0; i < SGE_QSETS; ++i) {
580		struct qset_params *q = p->qset + i;
581
582		if (adap->params.nports > 2) {
583			q->coalesce_usecs = 50;
584		} else {
585#ifdef INVARIANTS
586			q->coalesce_usecs = 10;
587#else
588			q->coalesce_usecs = 5;
589#endif
590		}
591		q->polling = 0;
592		q->rspq_size = RSPQ_Q_SIZE;
593		q->fl_size = fl_q_size;
594		q->jumbo_size = jumbo_q_size;
595		q->jumbo_buf_size = jumbo_buf_size;
596		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
597		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
598		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
599		q->cong_thres = 0;
600	}
601}
602
603int
604t3_sge_alloc(adapter_t *sc)
605{
606
607	/* The parent tag. */
608	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
609				1, 0,			/* algnmnt, boundary */
610				BUS_SPACE_MAXADDR,	/* lowaddr */
611				BUS_SPACE_MAXADDR,	/* highaddr */
612				NULL, NULL,		/* filter, filterarg */
613				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
614				BUS_SPACE_UNRESTRICTED, /* nsegments */
615				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
616				0,			/* flags */
617				NULL, NULL,		/* lock, lockarg */
618				&sc->parent_dmat)) {
619		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
620		return (ENOMEM);
621	}
622
623	/*
624	 * DMA tag for normal sized RX frames
625	 */
626	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
627		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
628		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
629		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
630		return (ENOMEM);
631	}
632
633	/*
634	 * DMA tag for jumbo sized RX frames.
635	 */
636	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
637		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
638		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
639		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
640		return (ENOMEM);
641	}
642
643	/*
644	 * DMA tag for TX frames.
645	 */
646	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
647		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
648		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
649		NULL, NULL, &sc->tx_dmat)) {
650		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
651		return (ENOMEM);
652	}
653
654	return (0);
655}
656
657int
658t3_sge_free(struct adapter * sc)
659{
660
661	if (sc->tx_dmat != NULL)
662		bus_dma_tag_destroy(sc->tx_dmat);
663
664	if (sc->rx_jumbo_dmat != NULL)
665		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
666
667	if (sc->rx_dmat != NULL)
668		bus_dma_tag_destroy(sc->rx_dmat);
669
670	if (sc->parent_dmat != NULL)
671		bus_dma_tag_destroy(sc->parent_dmat);
672
673	return (0);
674}
675
676void
677t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
678{
679
680	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
681	qs->rspq.polling = 0 /* p->polling */;
682}
683
684#if !defined(__i386__) && !defined(__amd64__)
685static void
686refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
687{
688	struct refill_fl_cb_arg *cb_arg = arg;
689
690	cb_arg->error = error;
691	cb_arg->seg = segs[0];
692	cb_arg->nseg = nseg;
693
694}
695#endif
696/**
697 *	refill_fl - refill an SGE free-buffer list
698 *	@sc: the controller softc
699 *	@q: the free-list to refill
700 *	@n: the number of new buffers to allocate
701 *
702 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
 *	The caller must ensure that @n does not exceed the queue's capacity.
704 */
705static void
706refill_fl(adapter_t *sc, struct sge_fl *q, int n)
707{
708	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
709	struct rx_desc *d = &q->desc[q->pidx];
710	struct refill_fl_cb_arg cb_arg;
711	struct mbuf *m;
712	caddr_t cl;
713	int err;
714
715	cb_arg.error = 0;
716	while (n--) {
		/*
		 * We allocate an uninitialized mbuf + cluster; the mbuf is
		 * initialized after RX.
		 */
721		if (q->zone == zone_pack) {
722			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
723				break;
724			cl = m->m_ext.ext_buf;
725		} else {
726			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
727				break;
728			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
729				uma_zfree(q->zone, cl);
730				break;
731			}
732		}
733		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
734			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
735				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
736				uma_zfree(q->zone, cl);
737				goto done;
738			}
739			sd->flags |= RX_SW_DESC_MAP_CREATED;
740		}
741#if !defined(__i386__) && !defined(__amd64__)
742		err = bus_dmamap_load(q->entry_tag, sd->map,
743		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
744
745		if (err != 0 || cb_arg.error) {
746			if (q->zone == zone_pack)
747				uma_zfree(q->zone, cl);
748			m_free(m);
749			goto done;
750		}
751#else
752		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
753#endif
754		sd->flags |= RX_SW_DESC_INUSE;
755		sd->rxsd_cl = cl;
756		sd->m = m;
757		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
758		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
759		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
760		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
761
762		d++;
763		sd++;
764
765		if (++q->pidx == q->size) {
766			q->pidx = 0;
767			q->gen ^= 1;
768			sd = q->sdesc;
769			d = q->desc;
770		}
771		q->credits++;
772		q->db_pending++;
773	}
774
775done:
776	if (q->db_pending >= 32) {
777		q->db_pending = 0;
778		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
779	}
780}
781
782
783/**
784 *	free_rx_bufs - free the Rx buffers on an SGE free list
 *	@sc: the controller softc
786 *	@q: the SGE free list to clean up
787 *
788 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
789 *	this queue should be stopped before calling this function.
790 */
791static void
792free_rx_bufs(adapter_t *sc, struct sge_fl *q)
793{
794	u_int cidx = q->cidx;
795
796	while (q->credits--) {
797		struct rx_sw_desc *d = &q->sdesc[cidx];
798
799		if (d->flags & RX_SW_DESC_INUSE) {
800			bus_dmamap_unload(q->entry_tag, d->map);
801			bus_dmamap_destroy(q->entry_tag, d->map);
802			if (q->zone == zone_pack) {
803				m_init(d->m, zone_pack, MCLBYTES,
804				    M_NOWAIT, MT_DATA, M_EXT);
805				uma_zfree(zone_pack, d->m);
806			} else {
807				m_init(d->m, zone_mbuf, MLEN,
808				    M_NOWAIT, MT_DATA, 0);
809				uma_zfree(zone_mbuf, d->m);
810				uma_zfree(q->zone, d->rxsd_cl);
811			}
812		}
813
814		d->rxsd_cl = NULL;
815		d->m = NULL;
816		if (++cidx == q->size)
817			cidx = 0;
818	}
819}
820
821static __inline void
822__refill_fl(adapter_t *adap, struct sge_fl *fl)
823{
824	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
825}
826
827static __inline void
828__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
829{
830	uint32_t reclaimable = fl->size - fl->credits;
831
832	if (reclaimable > 0)
833		refill_fl(adap, fl, min(max, reclaimable));
834}
835
836/**
837 *	recycle_rx_buf - recycle a receive buffer
838 *	@adapter: the adapter
839 *	@q: the SGE free list
840 *	@idx: index of buffer to recycle
841 *
842 *	Recycles the specified buffer on the given free list by adding it at
843 *	the next available slot on the list.
844 */
845static void
846recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
847{
848	struct rx_desc *from = &q->desc[idx];
849	struct rx_desc *to   = &q->desc[q->pidx];
850
851	q->sdesc[q->pidx] = q->sdesc[idx];
852	to->addr_lo = from->addr_lo;        // already big endian
853	to->addr_hi = from->addr_hi;        // likewise
854	wmb();	/* necessary ? */
855	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
856	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
857	q->credits++;
858
859	if (++q->pidx == q->size) {
860		q->pidx = 0;
861		q->gen ^= 1;
862	}
863	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
864}
865
866static void
867alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
868{
869	uint32_t *addr;
870
871	addr = arg;
872	*addr = segs[0].ds_addr;
873}
874
875static int
876alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
877    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
878    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
879{
880	size_t len = nelem * elem_size;
881	void *s = NULL;
882	void *p = NULL;
883	int err;
884
885	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
886				      BUS_SPACE_MAXADDR_32BIT,
887				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
888				      len, 0, NULL, NULL, tag)) != 0) {
889		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
890		return (ENOMEM);
891	}
892
893	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
894				    map)) != 0) {
895		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
896		return (ENOMEM);
897	}
898
899	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
900	bzero(p, len);
901	*(void **)desc = p;
902
903	if (sw_size) {
904		len = nelem * sw_size;
905		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
906		*(void **)sdesc = s;
907	}
908	if (parent_entry_tag == NULL)
909		return (0);
910
911	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
912				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
913		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
914				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
915		                      NULL, NULL, entry_tag)) != 0) {
916		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
917		return (ENOMEM);
918	}
919	return (0);
920}
921
922static void
923sge_slow_intr_handler(void *arg, int ncount)
924{
925	adapter_t *sc = arg;
926
927	t3_slow_intr_handler(sc);
928	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
929	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
930}
931
932/**
 *	sge_timer_cb - perform periodic maintenance of the adapter's SGE queues
 *	@arg: the adapter
 *
 *	Runs periodically from a timer to perform maintenance of the adapter's
 *	SGE queue sets.  It performs the following tasks:
938 *
939 *	a) Cleans up any completed Tx descriptors that may still be pending.
940 *	Normal descriptor cleanup happens when new packets are added to a Tx
941 *	queue so this timer is relatively infrequent and does any cleanup only
942 *	if the Tx queue has not seen any new packets in a while.  We make a
943 *	best effort attempt to reclaim descriptors, in that we don't wait
944 *	around if we cannot get a queue's lock (which most likely is because
945 *	someone else is queueing new packets and so will also handle the clean
946 *	up).  Since control queues use immediate data exclusively we don't
947 *	bother cleaning them up here.
948 *
949 *	b) Replenishes Rx queues that have run out due to memory shortage.
950 *	Normally new Rx buffers are added when existing ones are consumed but
951 *	when out of memory a queue can become empty.  We try to add only a few
952 *	buffers here, the queue will be replenished fully as these new buffers
953 *	are used up if memory shortage has subsided.
954 *
955 *	c) Return coalesced response queue credits in case a response queue is
956 *	starved.
957 *
958 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
959 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
960 */
961static void
962sge_timer_cb(void *arg)
963{
964	adapter_t *sc = arg;
965	if ((sc->flags & USING_MSIX) == 0) {
966
967		struct port_info *pi;
968		struct sge_qset *qs;
969		struct sge_txq  *txq;
970		int i, j;
971		int reclaim_ofl, refill_rx;
972
973		if (sc->open_device_map == 0)
974			return;
975
976		for (i = 0; i < sc->params.nports; i++) {
977			pi = &sc->port[i];
978			for (j = 0; j < pi->nqsets; j++) {
979				qs = &sc->sge.qs[pi->first_qset + j];
980				txq = &qs->txq[0];
981				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
982				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
983				    (qs->fl[1].credits < qs->fl[1].size));
984				if (reclaim_ofl || refill_rx) {
985					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
986					break;
987				}
988			}
989		}
990	}
991
992	if (sc->params.nports > 2) {
993		int i;
994
995		for_each_port(sc, i) {
996			struct port_info *pi = &sc->port[i];
997
998			t3_write_reg(sc, A_SG_KDOORBELL,
999				     F_SELEGRCNTX |
1000				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
1001		}
1002	}
1003	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
1004	    sc->open_device_map != 0)
1005		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1006}
1007
1008/*
1009 * This is meant to be a catch-all function to keep sge state private
1010 * to sge.c
1011 *
1012 */
1013int
1014t3_sge_init_adapter(adapter_t *sc)
1015{
1016	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
1017	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1018	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1019	return (0);
1020}
1021
1022int
1023t3_sge_reset_adapter(adapter_t *sc)
1024{
1025	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1026	return (0);
1027}
1028
1029int
1030t3_sge_init_port(struct port_info *pi)
1031{
1032	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1033	return (0);
1034}
1035
1036/**
1037 *	refill_rspq - replenish an SGE response queue
1038 *	@adapter: the adapter
1039 *	@q: the response queue to replenish
1040 *	@credits: how many new responses to make available
1041 *
1042 *	Replenishes a response queue by making the supplied number of responses
1043 *	available to HW.
1044 */
1045static __inline void
1046refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1047{
1048
1049	/* mbufs are allocated on demand when a rspq entry is processed. */
1050	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1051		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1052}
1053
1054static void
1055sge_txq_reclaim_handler(void *arg, int ncount)
1056{
1057	struct sge_qset *qs = arg;
1058	int i;
1059
1060	for (i = 0; i < 3; i++)
1061		reclaim_completed_tx(qs, 16, i);
1062}
1063
1064static void
1065sge_timer_reclaim(void *arg, int ncount)
1066{
1067	struct port_info *pi = arg;
1068	int i, nqsets = pi->nqsets;
1069	adapter_t *sc = pi->adapter;
1070	struct sge_qset *qs;
1071	struct mtx *lock;
1072
1073	KASSERT((sc->flags & USING_MSIX) == 0,
1074	    ("can't call timer reclaim for msi-x"));
1075
1076	for (i = 0; i < nqsets; i++) {
1077		qs = &sc->sge.qs[pi->first_qset + i];
1078
1079		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1080		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1081			    &sc->sge.qs[0].rspq.lock;
1082
1083		if (mtx_trylock(lock)) {
1084			/* XXX currently assume that we are *NOT* polling */
1085			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1086
1087			if (qs->fl[0].credits < qs->fl[0].size - 16)
1088				__refill_fl(sc, &qs->fl[0]);
1089			if (qs->fl[1].credits < qs->fl[1].size - 16)
1090				__refill_fl(sc, &qs->fl[1]);
1091
1092			if (status & (1 << qs->rspq.cntxt_id)) {
1093				if (qs->rspq.credits) {
1094					refill_rspq(sc, &qs->rspq, 1);
1095					qs->rspq.credits--;
1096					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1097					    1 << qs->rspq.cntxt_id);
1098				}
1099			}
1100			mtx_unlock(lock);
1101		}
1102	}
1103}
1104
1105/**
1106 *	init_qset_cntxt - initialize an SGE queue set context info
1107 *	@qs: the queue set
1108 *	@id: the queue set id
1109 *
1110 *	Initializes the TIDs and context ids for the queues of a queue set.
1111 */
1112static void
1113init_qset_cntxt(struct sge_qset *qs, u_int id)
1114{
1115
1116	qs->rspq.cntxt_id = id;
1117	qs->fl[0].cntxt_id = 2 * id;
1118	qs->fl[1].cntxt_id = 2 * id + 1;
1119	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1120	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1121	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1122	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1123	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1124
1125	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1126	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1127	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1128}
1129
1130
1131static void
1132txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1133{
1134	txq->in_use += ndesc;
	/*
	 * XXX we don't handle stopping of the queue here; presumably the
	 * start path handles this when we bump against the end.
	 */
1139	txqs->gen = txq->gen;
1140	txq->unacked += ndesc;
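	/*
	 * Request a completion from the SGE roughly once every 32 descriptors
	 * (bit 5 of the running unacked count becomes the WR_COMPL flag) so
	 * that Tx credits are returned periodically.
	 */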
1141	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1142	txq->unacked &= 31;
1143	txqs->pidx = txq->pidx;
1144	txq->pidx += ndesc;
1145#ifdef INVARIANTS
1146	if (((txqs->pidx > txq->cidx) &&
1147		(txq->pidx < txqs->pidx) &&
1148		(txq->pidx >= txq->cidx)) ||
1149	    ((txqs->pidx < txq->cidx) &&
1150		(txq->pidx >= txq-> cidx)) ||
1151	    ((txqs->pidx < txq->cidx) &&
1152		(txq->cidx < txqs->pidx)))
1153		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1154		    txqs->pidx, txq->pidx, txq->cidx);
1155#endif
1156	if (txq->pidx >= txq->size) {
1157		txq->pidx -= txq->size;
1158		txq->gen ^= 1;
1159	}
1160
1161}
1162
1163/**
1164 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1165 *	@m: the packet mbufs
1166 *      @nsegs: the number of segments
1167 *
1168 * 	Returns the number of Tx descriptors needed for the given Ethernet
 * 	packet.  Ethernet packets require the addition of WR and CPL headers.
1170 */
1171static __inline unsigned int
1172calc_tx_descs(const struct mbuf *m, int nsegs)
1173{
1174	unsigned int flits;
1175
1176	if (m->m_pkthdr.len <= PIO_LEN)
1177		return 1;
1178
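	/*
	 * The 2 extra flits cover the WR header and the CPL_TX_PKT header;
	 * a TSO packet needs one more flit for the LSO portion of the CPL.
	 */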
1179	flits = sgl_len(nsegs) + 2;
1180	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1181		flits++;
1182
1183	return flits_to_desc(flits);
1184}
1185
1186/**
1187 *	make_sgl - populate a scatter/gather list for a packet
1188 *	@sgp: the SGL to populate
1189 *	@segs: the packet dma segments
1190 *	@nsegs: the number of segments
1191 *
 *	Generates a scatter/gather list for the buffers that make up a packet.
 *	The list is written in place; the caller must size the SGL
 *	appropriately (see sgl_len()) before calling.
1195 */
1196static __inline void
1197make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1198{
1199	int i, idx;
1200
1201	for (idx = 0, i = 0; i < nsegs; i++) {
1202		/*
1203		 * firmware doesn't like empty segments
1204		 */
1205		if (segs[i].ds_len == 0)
1206			continue;
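		/*
		 * Two address/length pairs are packed into each sg_ent;
		 * advance to the next entry after every second segment.
		 */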
1207		if (i && idx == 0)
1208			++sgp;
1209
1210		sgp->len[idx] = htobe32(segs[i].ds_len);
1211		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1212		idx ^= 1;
1213	}
1214
1215	if (idx) {
1216		sgp->len[idx] = 0;
1217		sgp->addr[idx] = 0;
1218	}
1219}
1220
1221/**
1222 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1223 *	@adap: the adapter
1224 *	@q: the Tx queue
1225 *
 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
 *	where the HW may go to sleep just after we check; in that case the
 *	interrupt handler will detect the outstanding TX packet and ring the
 *	doorbell for us.
1230 *
1231 *	When GTS is disabled we unconditionally ring the doorbell.
1232 */
1233static __inline void
1234check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1235{
1236#if USE_GTS
1237	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1238	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1239		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1240#ifdef T3_TRACE
1241		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1242			  q->cntxt_id);
1243#endif
1244		t3_write_reg(adap, A_SG_KDOORBELL,
1245			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1246	}
1247#else
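	/*
	 * Doorbell writes are batched: ring only when the caller insists or
	 * once 32 descriptors are pending, to limit MMIO traffic.
	 */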
1248	if (mustring || ++q->db_pending >= 32) {
1249		wmb();            /* write descriptors before telling HW */
1250		t3_write_reg(adap, A_SG_KDOORBELL,
1251		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1252		q->db_pending = 0;
1253	}
1254#endif
1255}
1256
1257static __inline void
1258wr_gen2(struct tx_desc *d, unsigned int gen)
1259{
1260#if SGE_NUM_GENBITS == 2
1261	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1262#endif
1263}
1264
1265/**
1266 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1267 *	@ndesc: number of Tx descriptors spanned by the SGL
1268 *	@txd: first Tx descriptor to be written
1269 *	@txqs: txq state (generation and producer index)
1270 *	@txq: the SGE Tx queue
1271 *	@sgl: the SGL
1272 *	@flits: number of flits to the start of the SGL in the first descriptor
1273 *	@sgl_flits: the SGL size in flits
1274 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1275 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1276 *
1277 *	Write a work request header and an associated SGL.  If the SGL is
1278 *	small enough to fit into one Tx descriptor it has already been written
1279 *	and we just need to write the WR header.  Otherwise we distribute the
1280 *	SGL across the number of descriptors it spans.
1281 */
1282static void
1283write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1284    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1285    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1286{
1287
1288	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1289	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1290
1291	if (__predict_true(ndesc == 1)) {
1292		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1293		    V_WR_SGLSFLT(flits)) | wr_hi,
1294		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
1295		    wr_lo);
1296
1297		wr_gen2(txd, txqs->gen);
1298
1299	} else {
1300		unsigned int ogen = txqs->gen;
1301		const uint64_t *fp = (const uint64_t *)sgl;
1302		struct work_request_hdr *wp = wrp;
1303
1304		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1305		    V_WR_SGLSFLT(flits)) | wr_hi;
1306
1307		while (sgl_flits) {
1308			unsigned int avail = WR_FLITS - flits;
1309
1310			if (avail > sgl_flits)
1311				avail = sgl_flits;
1312			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1313			sgl_flits -= avail;
1314			ndesc--;
1315			if (!sgl_flits)
1316				break;
1317
1318			fp += avail;
1319			txd++;
1320			txsd++;
1321			if (++txqs->pidx == txq->size) {
1322				txqs->pidx = 0;
1323				txqs->gen ^= 1;
1324				txd = txq->desc;
1325				txsd = txq->sdesc;
1326			}
1327
1328			/*
1329			 * when the head of the mbuf chain
1330			 * is freed all clusters will be freed
1331			 * with it
1332			 */
1333			wrp = (struct work_request_hdr *)txd;
1334			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1335			    V_WR_SGLSFLT(1)) | wr_hi;
1336			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1337				    sgl_flits + 1)) |
1338			    V_WR_GEN(txqs->gen)) | wr_lo;
1339			wr_gen2(txd, txqs->gen);
1340			flits = 1;
1341		}
1342		wrp->wrh_hi |= htonl(F_WR_EOP);
1343		wmb();
1344		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1345		wr_gen2((struct tx_desc *)wp, ogen);
1346	}
1347}
1348
1349/* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1350#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1351
1352#define GET_VTAG(cntrl, m) \
1353do { \
1354	if ((m)->m_flags & M_VLANTAG)					            \
1355		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1356} while (0)
1357
1358static int
1359t3_encap(struct sge_qset *qs, struct mbuf **m)
1360{
1361	adapter_t *sc;
1362	struct mbuf *m0;
1363	struct sge_txq *txq;
1364	struct txq_state txqs;
1365	struct port_info *pi;
1366	unsigned int ndesc, flits, cntrl, mlen;
1367	int err, nsegs, tso_info = 0;
1368
1369	struct work_request_hdr *wrp;
1370	struct tx_sw_desc *txsd;
1371	struct sg_ent *sgp, *sgl;
1372	uint32_t wr_hi, wr_lo, sgl_flits;
1373	bus_dma_segment_t segs[TX_MAX_SEGS];
1374
1375	struct tx_desc *txd;
1376
1377	pi = qs->port;
1378	sc = pi->adapter;
1379	txq = &qs->txq[TXQ_ETH];
1380	txd = &txq->desc[txq->pidx];
1381	txsd = &txq->sdesc[txq->pidx];
1382	sgl = txq->txq_sgl;
1383
1384	prefetch(txd);
1385	m0 = *m;
1386
1387	mtx_assert(&qs->lock, MA_OWNED);
1388	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1389	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1390
1391	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1392	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1393		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1394
1395	if (m0->m_nextpkt != NULL) {
1396		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1397		ndesc = 1;
1398		mlen = 0;
1399	} else {
1400		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1401		    &m0, segs, &nsegs))) {
1402			if (cxgb_debug)
1403				printf("failed ... err=%d\n", err);
1404			return (err);
1405		}
1406		mlen = m0->m_pkthdr.len;
1407		ndesc = calc_tx_descs(m0, nsegs);
1408	}
1409	txq_prod(txq, ndesc, &txqs);
1410
1411	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1412	txsd->m = m0;
1413
1414	if (m0->m_nextpkt != NULL) {
1415		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1416		int i, fidx;
1417
1418		if (nsegs > 7)
			panic("trying to coalesce %d packets into one WR", nsegs);
1420		txq->txq_coalesced += nsegs;
1421		wrp = (struct work_request_hdr *)txd;
1422		flits = nsegs*2 + 1;
1423
1424		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1425			struct cpl_tx_pkt_batch_entry *cbe;
1426			uint64_t flit;
1427			uint32_t *hflit = (uint32_t *)&flit;
1428			int cflags = m0->m_pkthdr.csum_flags;
1429
1430			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1431			GET_VTAG(cntrl, m0);
1432			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1433			if (__predict_false(!(cflags & CSUM_IP)))
1434				cntrl |= F_TXPKT_IPCSUM_DIS;
1435			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
1436			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1437				cntrl |= F_TXPKT_L4CSUM_DIS;
1438
1439			hflit[0] = htonl(cntrl);
1440			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1441			flit |= htobe64(1 << 24);
1442			cbe = &cpl_batch->pkt_entry[i];
1443			cbe->cntrl = hflit[0];
1444			cbe->len = hflit[1];
1445			cbe->addr = htobe64(segs[i].ds_addr);
1446		}
1447
1448		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1449		    V_WR_SGLSFLT(flits)) |
1450		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1451		wr_lo = htonl(V_WR_LEN(flits) |
1452		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1453		set_wr_hdr(wrp, wr_hi, wr_lo);
1454		wmb();
1455		ETHER_BPF_MTAP(pi->ifp, m0);
1456		wr_gen2(txd, txqs.gen);
1457		check_ring_tx_db(sc, txq, 0);
1458		return (0);
1459	} else if (tso_info) {
1460		uint16_t eth_type;
1461		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1462		struct ether_header *eh;
1463		void *l3hdr;
1464		struct tcphdr *tcp;
1465
1466		txd->flit[2] = 0;
1467		GET_VTAG(cntrl, m0);
1468		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1469		hdr->cntrl = htonl(cntrl);
1470		hdr->len = htonl(mlen | 0x80000000);
1471
1472		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1473			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1474			    m0, mlen, m0->m_pkthdr.tso_segsz,
1475			    m0->m_pkthdr.csum_flags, m0->m_flags);
1476			panic("tx tso packet too small");
1477		}
1478
1479		/* Make sure that ether, ip, tcp headers are all in m0 */
1480		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1481			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1482			if (__predict_false(m0 == NULL)) {
1483				/* XXX panic probably an overreaction */
1484				panic("couldn't fit header into mbuf");
1485			}
1486		}
1487
1488		eh = mtod(m0, struct ether_header *);
1489		eth_type = eh->ether_type;
1490		if (eth_type == htons(ETHERTYPE_VLAN)) {
1491			struct ether_vlan_header *evh = (void *)eh;
1492
1493			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1494			l3hdr = evh + 1;
1495			eth_type = evh->evl_proto;
1496		} else {
1497			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1498			l3hdr = eh + 1;
1499		}
1500
1501		if (eth_type == htons(ETHERTYPE_IP)) {
1502			struct ip *ip = l3hdr;
1503
1504			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1505			tcp = (struct tcphdr *)(ip + 1);
1506		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1507			struct ip6_hdr *ip6 = l3hdr;
1508
1509			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1510			    ("%s: CSUM_TSO with ip6_nxt %d",
1511			    __func__, ip6->ip6_nxt));
1512
1513			tso_info |= F_LSO_IPV6;
1514			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1515			tcp = (struct tcphdr *)(ip6 + 1);
1516		} else
1517			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1518
1519		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1520		hdr->lso_info = htonl(tso_info);
1521
1522		if (__predict_false(mlen <= PIO_LEN)) {
			/*
			 * The packet is not undersized but still fits in
			 * PIO_LEN; this indicates a TSO bug at a higher layer.
			 */
1527			txsd->m = NULL;
1528			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1529			flits = (mlen + 7) / 8 + 3;
1530			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1531					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1532					  F_WR_SOP | F_WR_EOP | txqs.compl);
1533			wr_lo = htonl(V_WR_LEN(flits) |
1534			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1535			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1536			wmb();
1537			ETHER_BPF_MTAP(pi->ifp, m0);
1538			wr_gen2(txd, txqs.gen);
1539			check_ring_tx_db(sc, txq, 0);
1540			m_freem(m0);
1541			return (0);
1542		}
1543		flits = 3;
1544	} else {
1545		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1546
1547		GET_VTAG(cntrl, m0);
1548		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1549		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1550			cntrl |= F_TXPKT_IPCSUM_DIS;
1551		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
1552		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1553			cntrl |= F_TXPKT_L4CSUM_DIS;
1554		cpl->cntrl = htonl(cntrl);
1555		cpl->len = htonl(mlen | 0x80000000);
1556
1557		if (mlen <= PIO_LEN) {
1558			txsd->m = NULL;
1559			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1560			flits = (mlen + 7) / 8 + 2;
1561
1562			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1563			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1564					  F_WR_SOP | F_WR_EOP | txqs.compl);
1565			wr_lo = htonl(V_WR_LEN(flits) |
1566			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1567			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1568			wmb();
1569			ETHER_BPF_MTAP(pi->ifp, m0);
1570			wr_gen2(txd, txqs.gen);
1571			check_ring_tx_db(sc, txq, 0);
1572			m_freem(m0);
1573			return (0);
1574		}
1575		flits = 2;
1576	}
1577	wrp = (struct work_request_hdr *)txd;
1578	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1579	make_sgl(sgp, segs, nsegs);
1580
1581	sgl_flits = sgl_len(nsegs);
1582
1583	ETHER_BPF_MTAP(pi->ifp, m0);
1584
1585	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1586	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1587	wr_lo = htonl(V_WR_TID(txq->token));
1588	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1589	    sgl_flits, wr_hi, wr_lo);
1590	check_ring_tx_db(sc, txq, 0);
1591
1592	return (0);
1593}
1594
1595void
1596cxgb_tx_watchdog(void *arg)
1597{
1598	struct sge_qset *qs = arg;
1599	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1600
	if (qs->coalescing != 0 &&
	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
	    TXQ_RING_EMPTY(qs))
		qs->coalescing = 0;
	else if (qs->coalescing == 0 &&
	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
		qs->coalescing = 1;
1608	if (TXQ_TRYLOCK(qs)) {
1609		qs->qs_flags |= QS_FLUSHING;
1610		cxgb_start_locked(qs);
1611		qs->qs_flags &= ~QS_FLUSHING;
1612		TXQ_UNLOCK(qs);
1613	}
1614	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1615		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1616		    qs, txq->txq_watchdog.c_cpu);
1617}
1618
1619static void
1620cxgb_tx_timeout(void *arg)
1621{
1622	struct sge_qset *qs = arg;
1623	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1624
1625	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
		qs->coalescing = 1;
1627	if (TXQ_TRYLOCK(qs)) {
1628		qs->qs_flags |= QS_TIMEOUT;
1629		cxgb_start_locked(qs);
1630		qs->qs_flags &= ~QS_TIMEOUT;
1631		TXQ_UNLOCK(qs);
1632	}
1633}
1634
1635static void
1636cxgb_start_locked(struct sge_qset *qs)
1637{
1638	struct mbuf *m_head = NULL;
1639	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1640	struct port_info *pi = qs->port;
1641	struct ifnet *ifp = pi->ifp;
1642
1643	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1644		reclaim_completed_tx(qs, 0, TXQ_ETH);
1645
1646	if (!pi->link_config.link_ok) {
1647		TXQ_RING_FLUSH(qs);
1648		return;
1649	}
1650	TXQ_LOCK_ASSERT(qs);
1651	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1652	    pi->link_config.link_ok) {
1653		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1654
1655		if (txq->size - txq->in_use <= TX_MAX_DESC)
1656			break;
1657
1658		if ((m_head = cxgb_dequeue(qs)) == NULL)
1659			break;
		/*
		 * Encapsulation can modify our pointer, and/or make it
		 * NULL on failure.  In that event, we can't requeue.
		 */
1664		if (t3_encap(qs, &m_head) || m_head == NULL)
1665			break;
1666
1667		m_head = NULL;
1668	}
1669
1670	if (txq->db_pending)
1671		check_ring_tx_db(pi->adapter, txq, 1);
1672
1673	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1674	    pi->link_config.link_ok)
1675		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1676		    qs, txq->txq_timer.c_cpu);
1677	if (m_head != NULL)
1678		m_freem(m_head);
1679}
1680
1681static int
1682cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1683{
1684	struct port_info *pi = qs->port;
1685	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1686	struct buf_ring *br = txq->txq_mr;
1687	int error, avail;
1688
1689	avail = txq->size - txq->in_use;
1690	TXQ_LOCK_ASSERT(qs);
1691
1692	/*
1693	 * We can only do a direct transmit if the following are true:
1694	 * - we aren't coalescing (ring < 3/4 full)
1695	 * - the link is up -- checked in caller
1696	 * - there are no packets enqueued already
1697	 * - there is space in hardware transmit queue
1698	 */
1699	if (check_pkt_coalesce(qs) == 0 &&
1700	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1701		if (t3_encap(qs, &m)) {
1702			if (m != NULL &&
1703			    (error = drbr_enqueue(ifp, br, m)) != 0)
1704				return (error);
1705		} else {
1706			if (txq->db_pending)
1707				check_ring_tx_db(pi->adapter, txq, 1);
1708
1709			/*
1710			 * We've bypassed the buf ring so we need to update
1711			 * the stats directly
1712			 */
1713			txq->txq_direct_packets++;
1714			txq->txq_direct_bytes += m->m_pkthdr.len;
1715		}
1716	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1717		return (error);
1718
1719	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1720	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1721	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1722		cxgb_start_locked(qs);
1723	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1724		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1725		    qs, txq->txq_timer.c_cpu);
1726	return (0);
1727}
1728
1729int
1730cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1731{
1732	struct sge_qset *qs;
1733	struct port_info *pi = ifp->if_softc;
1734	int error, qidx = pi->first_qset;
1735
1736	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1737	    ||(!pi->link_config.link_ok)) {
1738		m_freem(m);
1739		return (0);
1740	}
1741
1742	if (m->m_flags & M_FLOWID)
1743		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1744
1745	qs = &pi->adapter->sge.qs[qidx];
1746
1747	if (TXQ_TRYLOCK(qs)) {
1748		/* XXX running */
1749		error = cxgb_transmit_locked(ifp, qs, m);
1750		TXQ_UNLOCK(qs);
1751	} else
1752		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1753	return (error);
1754}
1755
1756void
1757cxgb_qflush(struct ifnet *ifp)
1758{
	/*
	 * Flushing any mbufs enqueued in the buf_rings and in the
	 * transmit queues is a no-op for now.
	 */
1764	return;
1765}
1766
1767/**
1768 *	write_imm - write a packet into a Tx descriptor as immediate data
1769 *	@d: the Tx descriptor to write
 *	@src: the packet data, beginning with a work request
1771 *	@len: the length of packet data to write as immediate data
1772 *	@gen: the generation bit value to write
1773 *
1774 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1775 *	contains a work request at its beginning.  We must write the packet
1776 *	carefully so the SGE doesn't read accidentally before it's written in
1777 *	its entirety.
1778 */
1779static __inline void
1780write_imm(struct tx_desc *d, caddr_t src,
1781	  unsigned int len, unsigned int gen)
1782{
1783	struct work_request_hdr *from = (struct work_request_hdr *)src;
1784	struct work_request_hdr *to = (struct work_request_hdr *)d;
1785	uint32_t wr_hi, wr_lo;
1786
1787	KASSERT(len <= WR_LEN && len >= sizeof(*from),
1788	    ("%s: invalid len %d", __func__, len));
1789
1790	memcpy(&to[1], &from[1], len - sizeof(*from));
1791	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1792	    V_WR_BCNTLFLT(len & 7));
1793	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
1794	set_wr_hdr(to, wr_hi, wr_lo);
1795	wmb();
1796	wr_gen2(d, gen);
1797}
1798
1799/**
1800 *	check_desc_avail - check descriptor availability on a send queue
1801 *	@adap: the adapter
1802 *	@q: the TX queue
1803 *	@m: the packet needing the descriptors
1804 *	@ndesc: the number of Tx descriptors needed
1805 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1806 *
1807 *	Checks if the requested number of Tx descriptors is available on an
1808 *	SGE send queue.  If the queue is already suspended or not enough
1809 *	descriptors are available the packet is queued for later transmission.
1810 *	Must be called with the Tx queue locked.
1811 *
1812 *	Returns 0 if enough descriptors are available, 1 if there aren't
1813 *	enough descriptors and the packet has been queued, and 2 if the caller
1814 *	needs to retry because there weren't enough descriptors at the
1815 *	beginning of the call but some freed up in the mean time.
1816 */
1817static __inline int
1818check_desc_avail(adapter_t *adap, struct sge_txq *q,
1819		 struct mbuf *m, unsigned int ndesc,
1820		 unsigned int qid)
1821{
	/*
	 * XXX We currently only use this for checking the control queue.
	 * The control queue is only used for binding qsets, which happens
	 * at init time, so we are guaranteed enough descriptors.
	 */
1827	if (__predict_false(!mbufq_empty(&q->sendq))) {
1828addq_exit:	mbufq_tail(&q->sendq, m);
1829		return 1;
1830	}
1831	if (__predict_false(q->size - q->in_use < ndesc)) {
1832
1833		struct sge_qset *qs = txq_to_qset(q, qid);
1834
1835		setbit(&qs->txq_stopped, qid);
1836		if (should_restart_tx(q) &&
1837		    test_and_clear_bit(qid, &qs->txq_stopped))
1838			return 2;
1839
1840		q->stops++;
1841		goto addq_exit;
1842	}
1843	return 0;
1844}
1845
1846
1847/**
1848 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1849 *	@q: the SGE control Tx queue
1850 *
1851 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1852 *	that send only immediate data (presently just the control queues) and
1853 *	thus do not have any mbufs
1854 */
1855static __inline void
1856reclaim_completed_tx_imm(struct sge_txq *q)
1857{
1858	unsigned int reclaim = q->processed - q->cleaned;
1859
1860	q->in_use -= reclaim;
1861	q->cleaned += reclaim;
1862}
1863
1864/**
1865 *	ctrl_xmit - send a packet through an SGE control Tx queue
1866 *	@adap: the adapter
1867 *	@q: the control queue
1868 *	@m: the packet
1869 *
1870 *	Send a packet through an SGE control Tx queue.  Packets sent through
1871 *	a control queue must fit entirely as immediate data in a single Tx
1872 *	descriptor and have no page fragments.
1873 */
1874static int
1875ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1876{
1877	int ret;
1878	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1879	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1880
1881	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1882
1883	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1884	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1885
1886	TXQ_LOCK(qs);
1887again:	reclaim_completed_tx_imm(q);
1888
1889	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1890	if (__predict_false(ret)) {
1891		if (ret == 1) {
1892			TXQ_UNLOCK(qs);
1893			return (ENOSPC);
1894		}
1895		goto again;
1896	}
1897	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1898
1899	q->in_use++;
1900	if (++q->pidx >= q->size) {
1901		q->pidx = 0;
1902		q->gen ^= 1;
1903	}
1904	TXQ_UNLOCK(qs);
1905	wmb();
1906	t3_write_reg(adap, A_SG_KDOORBELL,
1907	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1908
1909	m_free(m);
1910	return (0);
1911}
1912
1913
1914/**
1915 *	restart_ctrlq - restart a suspended control queue
 *	@qs: the queue set containing the control queue
1917 *
1918 *	Resumes transmission on a suspended Tx control queue.
1919 */
1920static void
1921restart_ctrlq(void *data, int npending)
1922{
1923	struct mbuf *m;
1924	struct sge_qset *qs = (struct sge_qset *)data;
1925	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1926	adapter_t *adap = qs->port->adapter;
1927
1928	TXQ_LOCK(qs);
1929again:	reclaim_completed_tx_imm(q);
1930
1931	while (q->in_use < q->size &&
1932	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1933
1934		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1935		m_free(m);
1936
1937		if (++q->pidx >= q->size) {
1938			q->pidx = 0;
1939			q->gen ^= 1;
1940		}
1941		q->in_use++;
1942	}
1943	if (!mbufq_empty(&q->sendq)) {
1944		setbit(&qs->txq_stopped, TXQ_CTRL);
1945
1946		if (should_restart_tx(q) &&
1947		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1948			goto again;
1949		q->stops++;
1950	}
1951	TXQ_UNLOCK(qs);
1952	t3_write_reg(adap, A_SG_KDOORBELL,
1953		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1954}
1955
1956
1957/*
1958 * Send a management message through control queue 0
1959 */
1960int
1961t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1962{
1963	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1964}
1965
1966/**
1967 *	free_qset - free the resources of an SGE queue set
1968 *	@sc: the controller owning the queue set
1969 *	@q: the queue set
1970 *
1971 *	Release the HW and SW resources associated with an SGE queue set, such
1972 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1973 *	queue set must be quiesced prior to calling this.
1974 */
1975static void
1976t3_free_qset(adapter_t *sc, struct sge_qset *q)
1977{
1978	int i;
1979
1980	reclaim_completed_tx(q, 0, TXQ_ETH);
1981	if (q->txq[TXQ_ETH].txq_mr != NULL)
1982		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
1983	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
1984		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
1985		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
1986	}
1987
1988	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1989		if (q->fl[i].desc) {
1990			mtx_lock_spin(&sc->sge.reg_lock);
1991			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1992			mtx_unlock_spin(&sc->sge.reg_lock);
1993			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1994			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1995					q->fl[i].desc_map);
1996			bus_dma_tag_destroy(q->fl[i].desc_tag);
1997			bus_dma_tag_destroy(q->fl[i].entry_tag);
1998		}
1999		if (q->fl[i].sdesc) {
2000			free_rx_bufs(sc, &q->fl[i]);
2001			free(q->fl[i].sdesc, M_DEVBUF);
2002		}
2003	}
2004
2005	mtx_unlock(&q->lock);
2006	MTX_DESTROY(&q->lock);
2007	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2008		if (q->txq[i].desc) {
2009			mtx_lock_spin(&sc->sge.reg_lock);
2010			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2011			mtx_unlock_spin(&sc->sge.reg_lock);
2012			bus_dmamap_unload(q->txq[i].desc_tag,
2013					q->txq[i].desc_map);
2014			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2015					q->txq[i].desc_map);
2016			bus_dma_tag_destroy(q->txq[i].desc_tag);
2017			bus_dma_tag_destroy(q->txq[i].entry_tag);
2018		}
2019		if (q->txq[i].sdesc) {
2020			free(q->txq[i].sdesc, M_DEVBUF);
2021		}
2022	}
2023
2024	if (q->rspq.desc) {
2025		mtx_lock_spin(&sc->sge.reg_lock);
2026		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2027		mtx_unlock_spin(&sc->sge.reg_lock);
2028
2029		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2030		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2031			        q->rspq.desc_map);
2032		bus_dma_tag_destroy(q->rspq.desc_tag);
2033		MTX_DESTROY(&q->rspq.lock);
2034	}
2035
2036#if defined(INET6) || defined(INET)
2037	tcp_lro_free(&q->lro.ctrl);
2038#endif
2039
2040	bzero(q, sizeof(*q));
2041}
2042
2043/**
2044 *	t3_free_sge_resources - free SGE resources
2045 *	@sc: the adapter softc
2046 *
2047 *	Frees resources used by the SGE queue sets.
2048 */
2049void
2050t3_free_sge_resources(adapter_t *sc, int nqsets)
2051{
2052	int i;
2053
2054	for (i = 0; i < nqsets; ++i) {
2055		TXQ_LOCK(&sc->sge.qs[i]);
2056		t3_free_qset(sc, &sc->sge.qs[i]);
2057	}
2058}
2059
2060/**
2061 *	t3_sge_start - enable SGE
2062 *	@sc: the controller softc
2063 *
2064 *	Enables the SGE for DMAs.  This is the last step in starting packet
2065 *	transfers.
2066 */
2067void
2068t3_sge_start(adapter_t *sc)
2069{
2070	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2071}
2072
2073/**
2074 *	t3_sge_stop - disable SGE operation
2075 *	@sc: the adapter
2076 *
 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
 *	from error interrupts) or from normal process context.  In the latter
 *	case it also disables any pending queue restart tasks.  Note that
 *	if it is called in interrupt context it cannot disable the restart
 *	tasks as it cannot wait; however, the tasks will have no effect
 *	since the doorbells are disabled and the driver will call this again
 *	later from process context, at which time the tasks will be stopped
 *	if they are still running.
2085 */
2086void
2087t3_sge_stop(adapter_t *sc)
2088{
2089	int i, nqsets;
2090
2091	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2092
2093	if (sc->tq == NULL)
2094		return;
2095
2096	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2097		nqsets += sc->port[i].nqsets;
2098#ifdef notyet
2099	/*
2100	 *
2101	 * XXX
2102	 */
2103	for (i = 0; i < nqsets; ++i) {
2104		struct sge_qset *qs = &sc->sge.qs[i];
2105
2106		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2107		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2108	}
2109#endif
2110}
2111
2112/**
 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
 *	@qs: the queue set that owns the Tx queue
 *	@reclaimable: the number of descriptors to reclaim
 *	@queue: the index of the Tx queue within the queue set (TXQ_ETH, etc.)
 *
 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
 *	Tx buffers.  Called with the Tx queue lock held.
2124 */
2125void
2126t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2127{
2128	struct tx_sw_desc *txsd;
2129	unsigned int cidx, mask;
2130	struct sge_txq *q = &qs->txq[queue];
2131
2132#ifdef T3_TRACE
2133	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2134		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2135#endif
2136	cidx = q->cidx;
2137	mask = q->size - 1;
2138	txsd = &q->sdesc[cidx];
2139
2140	mtx_assert(&qs->lock, MA_OWNED);
2141	while (reclaimable--) {
2142		prefetch(q->sdesc[(cidx + 1) & mask].m);
2143		prefetch(q->sdesc[(cidx + 2) & mask].m);
2144
2145		if (txsd->m != NULL) {
2146			if (txsd->flags & TX_SW_DESC_MAPPED) {
2147				bus_dmamap_unload(q->entry_tag, txsd->map);
2148				txsd->flags &= ~TX_SW_DESC_MAPPED;
2149			}
2150			m_freem_list(txsd->m);
2151			txsd->m = NULL;
2152		} else
2153			q->txq_skipped++;
2154
2155		++txsd;
2156		if (++cidx == q->size) {
2157			cidx = 0;
2158			txsd = q->sdesc;
2159		}
2160	}
2161	q->cidx = cidx;
2162
2163}
2164
2165/**
2166 *	is_new_response - check if a response is newly written
2167 *	@r: the response descriptor
2168 *	@q: the response queue
2169 *
2170 *	Returns true if a response descriptor contains a yet unprocessed
2171 *	response.
2172 */
2173static __inline int
2174is_new_response(const struct rsp_desc *r,
2175    const struct sge_rspq *q)
2176{
2177	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2178}
2179
2180#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2181#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2182			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2183			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2184			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2185
2186/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2187#define NOMEM_INTR_DELAY 2500
2188
2189#ifdef TCP_OFFLOAD
2190/**
2191 *	write_ofld_wr - write an offload work request
2192 *	@adap: the adapter
2193 *	@m: the packet to send
2194 *	@q: the Tx queue
2195 *	@pidx: index of the first Tx descriptor to write
2196 *	@gen: the generation value to use
2197 *	@ndesc: number of descriptors the packet will occupy
2198 *
2199 *	Write an offload work request to send the supplied packet.  The packet
2200 *	data already carry the work request with most fields populated.
2201 */
2202static void
2203write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2204    unsigned int pidx, unsigned int gen, unsigned int ndesc)
2205{
2206	unsigned int sgl_flits, flits;
2207	int i, idx, nsegs, wrlen;
2208	struct work_request_hdr *from;
2209	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2210	struct tx_desc *d = &q->desc[pidx];
2211	struct txq_state txqs;
2212	struct sglist_seg *segs;
2213	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2214	struct sglist *sgl;
2215
2216	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2217	wrlen = m->m_len - sizeof(*oh);
2218
2219	if (!(oh->flags & F_HDR_SGL)) {
2220		write_imm(d, (caddr_t)from, wrlen, gen);
2221
2222		/*
		 * An mbuf with "real" immediate tx data will be enqueue_wr'd
		 * by t3_push_frames and freed in wr_ack.  Others, like those
		 * sent down by close_conn, t3_send_reset, etc., should be
		 * freed here.
2226		 */
2227		if (!(oh->flags & F_HDR_DF))
2228			m_free(m);
2229		return;
2230	}
2231
2232	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2233
2234	sgl = oh->sgl;
2235	flits = wrlen / 8;
2236	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2237
2238	nsegs = sgl->sg_nseg;
2239	segs = sgl->sg_segs;
2240	for (idx = 0, i = 0; i < nsegs; i++) {
2241		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2242		if (i && idx == 0)
2243			++sgp;
2244		sgp->len[idx] = htobe32(segs[i].ss_len);
2245		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2246		idx ^= 1;
2247	}
2248	if (idx) {
2249		sgp->len[idx] = 0;
2250		sgp->addr[idx] = 0;
2251	}
2252
2253	sgl_flits = sgl_len(nsegs);
2254	txqs.gen = gen;
2255	txqs.pidx = pidx;
2256	txqs.compl = 0;
2257
2258	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2259	    from->wrh_hi, from->wrh_lo);
2260}
2261
2262/**
2263 *	ofld_xmit - send a packet through an offload queue
2264 *	@adap: the adapter
2265 *	@q: the Tx offload queue
2266 *	@m: the packet
2267 *
2268 *	Send an offload packet through an SGE offload queue.
2269 */
2270static int
2271ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2272{
2273	int ret;
2274	unsigned int ndesc;
2275	unsigned int pidx, gen;
2276	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2277	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2278
2279	ndesc = G_HDR_NDESC(oh->flags);
2280
2281	TXQ_LOCK(qs);
2282again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2283	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2284	if (__predict_false(ret)) {
2285		if (ret == 1) {
2286			TXQ_UNLOCK(qs);
2287			return (EINTR);
2288		}
2289		goto again;
2290	}
2291
2292	gen = q->gen;
2293	q->in_use += ndesc;
2294	pidx = q->pidx;
2295	q->pidx += ndesc;
2296	if (q->pidx >= q->size) {
2297		q->pidx -= q->size;
2298		q->gen ^= 1;
2299	}
2300
2301	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2302	check_ring_tx_db(adap, q, 1);
2303	TXQ_UNLOCK(qs);
2304
2305	return (0);
2306}
2307
2308/**
2309 *	restart_offloadq - restart a suspended offload queue
 *	@qs: the queue set containing the offload queue
2311 *
2312 *	Resumes transmission on a suspended Tx offload queue.
2313 */
2314static void
2315restart_offloadq(void *data, int npending)
2316{
2317	struct mbuf *m;
2318	struct sge_qset *qs = data;
2319	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2320	adapter_t *adap = qs->port->adapter;
2321	int cleaned;
2322
2323	TXQ_LOCK(qs);
2324again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2325
2326	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2327		unsigned int gen, pidx;
2328		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2329		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2330
2331		if (__predict_false(q->size - q->in_use < ndesc)) {
2332			setbit(&qs->txq_stopped, TXQ_OFLD);
2333			if (should_restart_tx(q) &&
2334			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2335				goto again;
2336			q->stops++;
2337			break;
2338		}
2339
2340		gen = q->gen;
2341		q->in_use += ndesc;
2342		pidx = q->pidx;
2343		q->pidx += ndesc;
2344		if (q->pidx >= q->size) {
2345			q->pidx -= q->size;
2346			q->gen ^= 1;
2347		}
2348
2349		(void)mbufq_dequeue(&q->sendq);
2350		TXQ_UNLOCK(qs);
2351		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2352		TXQ_LOCK(qs);
2353	}
2354#if USE_GTS
2355	set_bit(TXQ_RUNNING, &q->flags);
2356	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2357#endif
2358	TXQ_UNLOCK(qs);
2359	wmb();
2360	t3_write_reg(adap, A_SG_KDOORBELL,
2361		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2362}
2363
2364/**
2365 *	t3_offload_tx - send an offload packet
 *	@sc: the adapter
 *	@m: the packet
 *
 *	Sends an offload packet.  The ofld_hdr at the front of the mbuf
 *	selects the destination: packets flagged F_HDR_CTRL go through the
 *	control queue, all others through the offload queue of the queue set
 *	named in the header.
2371 */
2372int
2373t3_offload_tx(struct adapter *sc, struct mbuf *m)
2374{
2375	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2376	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2377
2378	if (oh->flags & F_HDR_CTRL) {
2379		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2380		return (ctrl_xmit(sc, qs, m));
2381	} else
2382		return (ofld_xmit(sc, qs, m));
2383}
2384#endif
2385
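/**
 *	restart_tx - check whether suspended Tx queues can be restarted
 *	@qs: the queue set to check
 *
 *	Schedules the resume tasks for the offload and control Tx queues of
 *	the queue set if they are stopped and enough descriptors have since
 *	been freed for them to make progress.
 */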
2386static void
2387restart_tx(struct sge_qset *qs)
2388{
2389	struct adapter *sc = qs->port->adapter;
2390
2391	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2392	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2393	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2394		qs->txq[TXQ_OFLD].restarts++;
2395		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2396	}
2397
2398	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2399	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2400	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2401		qs->txq[TXQ_CTRL].restarts++;
2402		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2403	}
2404}
2405
2406/**
2407 *	t3_sge_alloc_qset - initialize an SGE queue set
2408 *	@sc: the controller softc
2409 *	@id: the queue set id
2410 *	@nports: how many Ethernet ports will be using this queue set
2411 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2412 *	@p: configuration parameters for this queue set
2413 *	@ntxq: number of Tx queues for the queue set
2414 *	@pi: port info for queue set
2415 *
2416 *	Allocate resources and initialize an SGE queue set.  A queue set
2417 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2418 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2419 *	queue, offload queue, and control queue.
2420 */
2421int
2422t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2423		  const struct qset_params *p, int ntxq, struct port_info *pi)
2424{
2425	struct sge_qset *q = &sc->sge.qs[id];
2426	int i, ret = 0;
2427
2428	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2429	q->port = pi;
2430	q->adap = sc;
2431
2432	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2433	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2434		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2435		goto err;
2436	}
2437	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2438	    M_NOWAIT | M_ZERO)) == NULL) {
2439		device_printf(sc->dev, "failed to allocate ifq\n");
2440		goto err;
2441	}
2442	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2443	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2444	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2445	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2446	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2447
2448	init_qset_cntxt(q, id);
2449	q->idx = id;
2450	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2451		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2452		    &q->fl[0].desc, &q->fl[0].sdesc,
2453		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2454		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2455		printf("error %d from alloc ring fl0\n", ret);
2456		goto err;
2457	}
2458
2459	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2460		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2461		    &q->fl[1].desc, &q->fl[1].sdesc,
2462		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2463		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2464		printf("error %d from alloc ring fl1\n", ret);
2465		goto err;
2466	}
2467
2468	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2469		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2470		    &q->rspq.desc_tag, &q->rspq.desc_map,
2471		    NULL, NULL)) != 0) {
2472		printf("error %d from alloc ring rspq\n", ret);
2473		goto err;
2474	}
2475
2476	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2477	    device_get_unit(sc->dev), irq_vec_idx);
2478	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2479
2480	for (i = 0; i < ntxq; ++i) {
2481		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2482
2483		if ((ret = alloc_ring(sc, p->txq_size[i],
2484			    sizeof(struct tx_desc), sz,
2485			    &q->txq[i].phys_addr, &q->txq[i].desc,
2486			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2487			    &q->txq[i].desc_map,
2488			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2489			printf("error %d from alloc ring tx %i\n", ret, i);
2490			goto err;
2491		}
2492		mbufq_init(&q->txq[i].sendq);
2493		q->txq[i].gen = 1;
2494		q->txq[i].size = p->txq_size[i];
2495	}
2496
2497#ifdef TCP_OFFLOAD
2498	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2499#endif
2500	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2501	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2502	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2503
2504	q->fl[0].gen = q->fl[1].gen = 1;
2505	q->fl[0].size = p->fl_size;
2506	q->fl[1].size = p->jumbo_size;
2507
2508	q->rspq.gen = 1;
2509	q->rspq.cidx = 0;
2510	q->rspq.size = p->rspq_size;
2511
2512	q->txq[TXQ_ETH].stop_thres = nports *
2513	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2514
2515	q->fl[0].buf_size = MCLBYTES;
2516	q->fl[0].zone = zone_pack;
2517	q->fl[0].type = EXT_PACKET;
2518
2519	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2520		q->fl[1].zone = zone_jumbo16;
2521		q->fl[1].type = EXT_JUMBO16;
2522	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2523		q->fl[1].zone = zone_jumbo9;
2524		q->fl[1].type = EXT_JUMBO9;
2525	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2526		q->fl[1].zone = zone_jumbop;
2527		q->fl[1].type = EXT_JUMBOP;
2528	} else {
2529		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2530		ret = EDOOFUS;
2531		goto err;
2532	}
2533	q->fl[1].buf_size = p->jumbo_buf_size;
2534
2535	/* Allocate and setup the lro_ctrl structure */
2536	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2537#if defined(INET6) || defined(INET)
2538	ret = tcp_lro_init(&q->lro.ctrl);
2539	if (ret) {
2540		printf("error %d from tcp_lro_init\n", ret);
2541		goto err;
2542	}
2543#endif
2544	q->lro.ctrl.ifp = pi->ifp;
2545
2546	mtx_lock_spin(&sc->sge.reg_lock);
2547	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2548				   q->rspq.phys_addr, q->rspq.size,
2549				   q->fl[0].buf_size, 1, 0);
2550	if (ret) {
2551		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2552		goto err_unlock;
2553	}
2554
2555	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2556		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2557					  q->fl[i].phys_addr, q->fl[i].size,
2558					  q->fl[i].buf_size, p->cong_thres, 1,
2559					  0);
2560		if (ret) {
2561			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2562			goto err_unlock;
2563		}
2564	}
2565
2566	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2567				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2568				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2569				 1, 0);
2570	if (ret) {
2571		printf("error %d from t3_sge_init_ecntxt\n", ret);
2572		goto err_unlock;
2573	}
2574
2575	if (ntxq > 1) {
2576		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2577					 USE_GTS, SGE_CNTXT_OFLD, id,
2578					 q->txq[TXQ_OFLD].phys_addr,
2579					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2580		if (ret) {
2581			printf("error %d from t3_sge_init_ecntxt\n", ret);
2582			goto err_unlock;
2583		}
2584	}
2585
2586	if (ntxq > 2) {
2587		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2588					 SGE_CNTXT_CTRL, id,
2589					 q->txq[TXQ_CTRL].phys_addr,
2590					 q->txq[TXQ_CTRL].size,
2591					 q->txq[TXQ_CTRL].token, 1, 0);
2592		if (ret) {
2593			printf("error %d from t3_sge_init_ecntxt\n", ret);
2594			goto err_unlock;
2595		}
2596	}
2597
2598	mtx_unlock_spin(&sc->sge.reg_lock);
2599	t3_update_qset_coalesce(q, p);
2600
2601	refill_fl(sc, &q->fl[0], q->fl[0].size);
2602	refill_fl(sc, &q->fl[1], q->fl[1].size);
2603	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2604
2605	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2606		     V_NEWTIMER(q->rspq.holdoff_tmr));
2607
2608	return (0);
2609
2610err_unlock:
2611	mtx_unlock_spin(&sc->sge.reg_lock);
2612err:
2613	TXQ_LOCK(q);
2614	t3_free_qset(sc, q);
2615
2616	return (ret);
2617}
2618
2619/*
2620 * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2621 * ethernet data.  Hardware assistance with various checksums and any vlan tag
2622 * will also be taken into account here.
2623 */
2624void
2625t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2626{
2627	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2628	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2629	struct ifnet *ifp = pi->ifp;
2630
2631	if (cpl->vlan_valid) {
2632		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2633		m->m_flags |= M_VLANTAG;
2634	}
2635
2636	m->m_pkthdr.rcvif = ifp;
2637	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2638	/*
2639	 * adjust after conversion to mbuf chain
2640	 */
2641	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2642	m->m_len -= (sizeof(*cpl) + ethpad);
2643	m->m_data += (sizeof(*cpl) + ethpad);
2644
2645	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2646		struct ether_header *eh = mtod(m, void *);
2647		uint16_t eh_type;
2648
2649		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2650			struct ether_vlan_header *evh = mtod(m, void *);
2651
2652			eh_type = evh->evl_proto;
2653		} else
2654			eh_type = eh->ether_type;
2655
2656		if (ifp->if_capenable & IFCAP_RXCSUM &&
2657		    eh_type == htons(ETHERTYPE_IP)) {
2658			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2659			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2660			m->m_pkthdr.csum_data = 0xffff;
2661		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2662		    eh_type == htons(ETHERTYPE_IPV6)) {
2663			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2664			    CSUM_PSEUDO_HDR);
2665			m->m_pkthdr.csum_data = 0xffff;
2666		}
2667	}
2668}
2669
2670/**
2671 *	get_packet - return the next ingress packet buffer from a free list
2672 *	@adap: the adapter that received the packet
2673 *	@drop_thres: # of remaining buffers before we start dropping packets
2674 *	@qs: the qset that the SGE free list holding the packet belongs to
 *	@mh: the mbuf header, containing pointers to the head and tail of the
 *	     mbuf chain
 *	@r: the response descriptor
 *
 *	Get the next packet from a free list and complete setup of the
 *	mbuf.  If the packet is small we make a copy and recycle the
2680 *	original buffer, otherwise we use the original buffer itself.  If a
2681 *	positive drop threshold is supplied packets are dropped and their
2682 *	buffers recycled if (a) the number of remaining buffers is under the
2683 *	threshold and the packet is too big to copy, or (b) the packet should
2684 *	be copied but there is no memory for the copy.
2685 */
2686static int
2687get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2688    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2689{
2690
2691	unsigned int len_cq =  ntohl(r->len_cq);
2692	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2693	int mask, cidx = fl->cidx;
2694	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2695	uint32_t len = G_RSPD_LEN(len_cq);
2696	uint32_t flags = M_EXT;
2697	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2698	caddr_t cl;
2699	struct mbuf *m;
2700	int ret = 0;
2701
2702	mask = fl->size - 1;
2703	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2704	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2705	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2706	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2707
2708	fl->credits--;
2709	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2710
2711	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2712	    sopeop == RSPQ_SOP_EOP) {
2713		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2714			goto skip_recycle;
2715		cl = mtod(m, void *);
2716		memcpy(cl, sd->rxsd_cl, len);
2717		recycle_rx_buf(adap, fl, fl->cidx);
2718		m->m_pkthdr.len = m->m_len = len;
2719		m->m_flags = 0;
2720		mh->mh_head = mh->mh_tail = m;
2721		ret = 1;
2722		goto done;
2723	} else {
2724	skip_recycle:
2725		bus_dmamap_unload(fl->entry_tag, sd->map);
2726		cl = sd->rxsd_cl;
2727		m = sd->m;
2728
2729		if ((sopeop == RSPQ_SOP_EOP) ||
2730		    (sopeop == RSPQ_SOP))
2731			flags |= M_PKTHDR;
2732		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2733		if (fl->zone == zone_pack) {
2734			/*
2735			 * restore clobbered data pointer
2736			 */
2737			m->m_data = m->m_ext.ext_buf;
2738		} else {
2739			m_cljset(m, cl, fl->type);
2740		}
2741		m->m_len = len;
2742	}
2743	switch(sopeop) {
2744	case RSPQ_SOP_EOP:
2745		ret = 1;
2746		/* FALLTHROUGH */
2747	case RSPQ_SOP:
2748		mh->mh_head = mh->mh_tail = m;
2749		m->m_pkthdr.len = len;
2750		break;
2751	case RSPQ_EOP:
2752		ret = 1;
2753		/* FALLTHROUGH */
2754	case RSPQ_NSOP_NEOP:
2755		if (mh->mh_tail == NULL) {
2756			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2757			m_freem(m);
2758			break;
2759		}
2760		mh->mh_tail->m_next = m;
2761		mh->mh_tail = m;
2762		mh->mh_head->m_pkthdr.len += len;
2763		break;
2764	}
2765	if (cxgb_debug)
2766		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2767done:
2768	if (++fl->cidx == fl->size)
2769		fl->cidx = 0;
2770
2771	return (ret);
2772}
2773
2774/**
2775 *	handle_rsp_cntrl_info - handles control information in a response
2776 *	@qs: the queue set corresponding to the response
2777 *	@flags: the response control flags
2778 *
2779 *	Handles the control information of an SGE response, such as GTS
2780 *	indications and completion credits for the queue set's Tx queues.
2781 *	HW coalesces credits, we don't do any extra SW coalescing.
2782 */
2783static __inline void
2784handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2785{
2786	unsigned int credits;
2787
2788#if USE_GTS
2789	if (flags & F_RSPD_TXQ0_GTS)
2790		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2791#endif
2792	credits = G_RSPD_TXQ0_CR(flags);
2793	if (credits)
2794		qs->txq[TXQ_ETH].processed += credits;
2795
2796	credits = G_RSPD_TXQ2_CR(flags);
2797	if (credits)
2798		qs->txq[TXQ_CTRL].processed += credits;
2799
2800# if USE_GTS
2801	if (flags & F_RSPD_TXQ1_GTS)
2802		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2803# endif
2804	credits = G_RSPD_TXQ1_CR(flags);
2805	if (credits)
2806		qs->txq[TXQ_OFLD].processed += credits;
2807
2808}
2809
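/*
 * React to GTS indications in the response flags.  Currently a no-op.
 */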
2810static void
2811check_ring_db(adapter_t *adap, struct sge_qset *qs,
2812    unsigned int sleeping)
2813{
2814	;
2815}
2816
2817/**
2818 *	process_responses - process responses from an SGE response queue
2819 *	@adap: the adapter
2820 *	@qs: the queue set to which the response queue belongs
2821 *	@budget: how many responses can be processed in this round
2822 *
2823 *	Process responses from an SGE response queue up to the supplied budget.
2824 *	Responses include received packets as well as credits and other events
2825 *	for the queues that belong to the response queue's queue set.
2826 *	A negative budget is effectively unlimited.
2827 *
2828 *	Additionally choose the interrupt holdoff time for the next interrupt
2829 *	on this queue.  If the system is under memory shortage use a fairly
2830 *	long delay to help recovery.
2831 */
2832static int
2833process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2834{
2835	struct sge_rspq *rspq = &qs->rspq;
2836	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2837	int budget_left = budget;
2838	unsigned int sleeping = 0;
2839#if defined(INET6) || defined(INET)
2840	int lro_enabled = qs->lro.enabled;
2841	int skip_lro;
2842	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2843#endif
2844	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2845#ifdef DEBUG
2846	static int last_holdoff = 0;
2847	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2848		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2849		last_holdoff = rspq->holdoff_tmr;
2850	}
2851#endif
2852	rspq->next_holdoff = rspq->holdoff_tmr;
2853
2854	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2855		int eth, eop = 0, ethpad = 0;
2856		uint32_t flags = ntohl(r->flags);
2857		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2858		uint8_t opcode = r->rss_hdr.opcode;
2859
2860		eth = (opcode == CPL_RX_PKT);
2861
2862		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2863			struct mbuf *m;
2864
2865			if (cxgb_debug)
2866				printf("async notification\n");
2867
2868			if (mh->mh_head == NULL) {
2869				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2870				m = mh->mh_head;
2871			} else {
2872				m = m_gethdr(M_NOWAIT, MT_DATA);
2873			}
2874			if (m == NULL)
2875				goto no_mem;
2876
			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
			*mtod(m, char *) = CPL_ASYNC_NOTIF;
			opcode = CPL_ASYNC_NOTIF;
			eop = 1;
			rspq->async_notif++;
2883			goto skip;
2884		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2885			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2886
2887			if (m == NULL) {
2888		no_mem:
2889				rspq->next_holdoff = NOMEM_INTR_DELAY;
2890				budget_left--;
2891				break;
2892			}
2893			if (mh->mh_head == NULL)
2894				mh->mh_head = m;
			else
2896				mh->mh_tail->m_next = m;
2897			mh->mh_tail = m;
2898
2899			get_imm_packet(adap, r, m);
2900			mh->mh_head->m_pkthdr.len += m->m_len;
2901			eop = 1;
2902			rspq->imm_data++;
2903		} else if (r->len_cq) {
2904			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2905
2906			eop = get_packet(adap, drop_thresh, qs, mh, r);
2907			if (eop) {
2908				if (r->rss_hdr.hash_type && !adap->timestamp)
2909					mh->mh_head->m_flags |= M_FLOWID;
2910				mh->mh_head->m_pkthdr.flowid = rss_hash;
2911			}
2912
2913			ethpad = 2;
2914		} else {
2915			rspq->pure_rsps++;
2916		}
2917	skip:
2918		if (flags & RSPD_CTRL_MASK) {
2919			sleeping |= flags & RSPD_GTS_MASK;
2920			handle_rsp_cntrl_info(qs, flags);
2921		}
2922
2923		if (!eth && eop) {
2924			rspq->offload_pkts++;
2925#ifdef TCP_OFFLOAD
2926			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2927#else
2928			m_freem(mh->mh_head);
2929#endif
2930			mh->mh_head = NULL;
2931		} else if (eth && eop) {
2932			struct mbuf *m = mh->mh_head;
2933
2934			t3_rx_eth(adap, m, ethpad);
2935
2936			/*
2937			 * The T304 sends incoming packets on any qset.  If LRO
			 * is also enabled, we could end up sending the packet
			 * up lro_ctrl->ifp's input.  That is incorrect.
2940			 *
2941			 * The mbuf's rcvif was derived from the cpl header and
2942			 * is accurate.  Skip LRO and just use that.
2943			 */
2944#if defined(INET6) || defined(INET)
2945			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2946
2947			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2948			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2949			    ) {
				/* successfully queued for LRO */
2951			} else
2952#endif
2953			{
2954				/*
2955				 * LRO not enabled, packet unsuitable for LRO,
2956				 * or unable to queue.  Pass it up right now in
2957				 * either case.
2958				 */
2959				struct ifnet *ifp = m->m_pkthdr.rcvif;
2960				(*ifp->if_input)(ifp, m);
2961			}
2962			mh->mh_head = NULL;
2963
2964		}
2965
2966		r++;
2967		if (__predict_false(++rspq->cidx == rspq->size)) {
2968			rspq->cidx = 0;
2969			rspq->gen ^= 1;
2970			r = rspq->desc;
2971		}
2972
2973		if (++rspq->credits >= 64) {
2974			refill_rspq(adap, rspq, rspq->credits);
2975			rspq->credits = 0;
2976		}
2977		__refill_fl_lt(adap, &qs->fl[0], 32);
2978		__refill_fl_lt(adap, &qs->fl[1], 32);
2979		--budget_left;
2980	}
2981
2982#if defined(INET6) || defined(INET)
2983	/* Flush LRO */
2984	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
2985		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
2986		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
2987		tcp_lro_flush(lro_ctrl, queued);
2988	}
2989#endif
2990
2991	if (sleeping)
2992		check_ring_db(adap, qs, sleeping);
2993
2994	mb();  /* commit Tx queue processed updates */
2995	if (__predict_false(qs->txq_stopped > 1))
2996		restart_tx(qs);
2997
2998	__refill_fl_lt(adap, &qs->fl[0], 512);
2999	__refill_fl_lt(adap, &qs->fl[1], 512);
3000	budget -= budget_left;
3001	return (budget);
3002}
3003
3004/*
3005 * A helper function that processes responses and issues GTS.
3006 */
3007static __inline int
3008process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3009{
3010	int work;
3011	static int last_holdoff = 0;
3012
3013	work = process_responses(adap, rspq_to_qset(rq), -1);
3014
3015	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3016		printf("next_holdoff=%d\n", rq->next_holdoff);
3017		last_holdoff = rq->next_holdoff;
3018	}
3019	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3020	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3021
3022	return (work);
3023}
3024
3025
3026/*
3027 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3028 * Handles data events from SGE response queues as well as error and other
3029 * async events as they all use the same interrupt pin.  We use one SGE
3030 * response queue per port in this mode and protect all response queues with
3031 * queue 0's lock.
3032 */
3033void
3034t3b_intr(void *data)
3035{
3036	uint32_t i, map;
3037	adapter_t *adap = data;
3038	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3039
3040	t3_write_reg(adap, A_PL_CLI, 0);
3041	map = t3_read_reg(adap, A_SG_DATA_INTR);
3042
3043	if (!map)
3044		return;
3045
3046	if (__predict_false(map & F_ERRINTR)) {
3047		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3048		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3049		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3050	}
3051
3052	mtx_lock(&q0->lock);
3053	for_each_port(adap, i)
3054	    if (map & (1 << i))
3055			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3056	mtx_unlock(&q0->lock);
3057}
3058
3059/*
3060 * The MSI interrupt handler.  This needs to handle data events from SGE
3061 * response queues as well as error and other async events as they all use
3062 * the same MSI vector.  We use one SGE response queue per port in this mode
3063 * and protect all response queues with queue 0's lock.
3064 */
3065void
3066t3_intr_msi(void *data)
3067{
3068	adapter_t *adap = data;
3069	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3070	int i, new_packets = 0;
3071
3072	mtx_lock(&q0->lock);
3073
3074	for_each_port(adap, i)
3075	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3076		    new_packets = 1;
3077	mtx_unlock(&q0->lock);
3078	if (new_packets == 0) {
3079		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3080		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3081		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3082	}
3083}
3084
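/*
 * The MSI-X interrupt handler.  Every queue set has its own vector, so only
 * the response queue of the queue set that raised the interrupt needs to be
 * serviced.
 */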
3085void
3086t3_intr_msix(void *data)
3087{
3088	struct sge_qset *qs = data;
3089	adapter_t *adap = qs->port->adapter;
3090	struct sge_rspq *rspq = &qs->rspq;
3091
3092	if (process_responses_gts(adap, rspq) == 0)
3093		rspq->unhandled_irqs++;
3094}
3095
#define QDUMP_SBUF_SIZE		(32 * 400)
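
/*
 * Sysctl handler that dumps the response queue's hardware context and the
 * range of response descriptors selected by the dump_start and dump_count
 * sysctls.
 */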
3097static int
3098t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3099{
3100	struct sge_rspq *rspq;
3101	struct sge_qset *qs;
3102	int i, err, dump_end, idx;
3103	struct sbuf *sb;
3104	struct rsp_desc *rspd;
3105	uint32_t data[4];
3106
3107	rspq = arg1;
3108	qs = rspq_to_qset(rspq);
3109	if (rspq->rspq_dump_count == 0)
3110		return (0);
3111	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3112		log(LOG_WARNING,
3113		    "dump count is too large %d\n", rspq->rspq_dump_count);
3114		rspq->rspq_dump_count = 0;
3115		return (EINVAL);
3116	}
3117	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3118		log(LOG_WARNING,
3119		    "dump start of %d is greater than queue size\n",
3120		    rspq->rspq_dump_start);
3121		rspq->rspq_dump_start = 0;
3122		return (EINVAL);
3123	}
3124	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3125	if (err)
3126		return (err);
3127	err = sysctl_wire_old_buffer(req, 0);
3128	if (err)
3129		return (err);
3130	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3131
3132	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3133	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3134	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3135	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3136	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3137
3138	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3139	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3140
3141	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3142	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3143		idx = i & (RSPQ_Q_SIZE-1);
3144
3145		rspd = &rspq->desc[idx];
3146		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3147		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3148		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3149		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3150		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3151		    be32toh(rspd->len_cq), rspd->intr_gen);
3152	}
3153
3154	err = sbuf_finish(sb);
3155	/* Output a trailing NUL. */
3156	if (err == 0)
3157		err = SYSCTL_OUT(req, "", 1);
3158	sbuf_delete(sb);
3159	return (err);
3160}
3161
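/*
 * Sysctl handler that dumps the Ethernet Tx queue's hardware context and the
 * range of Tx descriptors selected by the dump_start and dump_count sysctls.
 */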
3162static int
3163t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3164{
3165	struct sge_txq *txq;
3166	struct sge_qset *qs;
3167	int i, j, err, dump_end;
3168	struct sbuf *sb;
3169	struct tx_desc *txd;
3170	uint32_t *WR, wr_hi, wr_lo, gen;
3171	uint32_t data[4];
3172
3173	txq = arg1;
3174	qs = txq_to_qset(txq, TXQ_ETH);
3175	if (txq->txq_dump_count == 0) {
3176		return (0);
3177	}
3178	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3179		log(LOG_WARNING,
3180		    "dump count is too large %d\n", txq->txq_dump_count);
3181		txq->txq_dump_count = 1;
3182		return (EINVAL);
3183	}
3184	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3185		log(LOG_WARNING,
3186		    "dump start of %d is greater than queue size\n",
3187		    txq->txq_dump_start);
3188		txq->txq_dump_start = 0;
3189		return (EINVAL);
3190	}
3191	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3192	if (err)
3193		return (err);
3194	err = sysctl_wire_old_buffer(req, 0);
3195	if (err)
3196		return (err);
3197	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3198
3199	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3200	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3201	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3203	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3204	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3205	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3206	    txq->txq_dump_start,
3207	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3208
3209	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3210	for (i = txq->txq_dump_start; i < dump_end; i++) {
3211		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3212		WR = (uint32_t *)txd->flit;
3213		wr_hi = ntohl(WR[0]);
3214		wr_lo = ntohl(WR[1]);
3215		gen = G_WR_GEN(wr_lo);
3216
3217		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3218		    wr_hi, wr_lo, gen);
3219		for (j = 2; j < 30; j += 4)
3220			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3221			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3222
3223	}
3224	err = sbuf_finish(sb);
3225	/* Output a trailing NUL. */
3226	if (err == 0)
3227		err = SYSCTL_OUT(req, "", 1);
3228	sbuf_delete(sb);
3229	return (err);
3230}
3231
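/*
 * Sysctl handler that dumps the range of control Tx queue descriptors
 * selected by the dump_start and dump_count sysctls.
 */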
3232static int
3233t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3234{
3235	struct sge_txq *txq;
3236	struct sge_qset *qs;
3237	int i, j, err, dump_end;
3238	struct sbuf *sb;
3239	struct tx_desc *txd;
3240	uint32_t *WR, wr_hi, wr_lo, gen;
3241
3242	txq = arg1;
3243	qs = txq_to_qset(txq, TXQ_CTRL);
3244	if (txq->txq_dump_count == 0) {
3245		return (0);
3246	}
3247	if (txq->txq_dump_count > 256) {
3248		log(LOG_WARNING,
3249		    "dump count is too large %d\n", txq->txq_dump_count);
3250		txq->txq_dump_count = 1;
3251		return (EINVAL);
3252	}
3253	if (txq->txq_dump_start > 255) {
3254		log(LOG_WARNING,
3255		    "dump start of %d is greater than queue size\n",
3256		    txq->txq_dump_start);
3257		txq->txq_dump_start = 0;
3258		return (EINVAL);
3259	}
3260
3261	err = sysctl_wire_old_buffer(req, 0);
3262	if (err != 0)
3263		return (err);
3264	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3265	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3266	    txq->txq_dump_start,
3267	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3268
3269	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3270	for (i = txq->txq_dump_start; i < dump_end; i++) {
3271		txd = &txq->desc[i & (255)];
3272		WR = (uint32_t *)txd->flit;
3273		wr_hi = ntohl(WR[0]);
3274		wr_lo = ntohl(WR[1]);
3275		gen = G_WR_GEN(wr_lo);
3276
3277		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3278		    wr_hi, wr_lo, gen);
3279		for (j = 2; j < 30; j += 4)
3280			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3281			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3282
3283	}
3284	err = sbuf_finish(sb);
3285	/* Output a trailing NUL. */
3286	if (err == 0)
3287		err = SYSCTL_OUT(req, "", 1);
3288	sbuf_delete(sb);
3289	return (err);
3290}
3291
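/*
 * Sysctl handler for the interrupt coalescing timer.  Applies the new
 * holdoff value to every queue set and reprograms A_SG_GTS for each
 * response queue.
 */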
3292static int
3293t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3294{
3295	adapter_t *sc = arg1;
3296	struct qset_params *qsp = &sc->params.sge.qset[0];
3297	int coalesce_usecs;
3298	struct sge_qset *qs;
3299	int i, j, err, nqsets = 0;
3300	struct mtx *lock;
3301
3302	if ((sc->flags & FULL_INIT_DONE) == 0)
3303		return (ENXIO);
3304
3305	coalesce_usecs = qsp->coalesce_usecs;
	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3307
3308	if (err != 0) {
3309		return (err);
3310	}
3311	if (coalesce_usecs == qsp->coalesce_usecs)
3312		return (0);
3313
3314	for (i = 0; i < sc->params.nports; i++)
3315		for (j = 0; j < sc->port[i].nqsets; j++)
3316			nqsets++;
3317
3318	coalesce_usecs = max(1, coalesce_usecs);
3319
3320	for (i = 0; i < nqsets; i++) {
3321		qs = &sc->sge.qs[i];
3322		qsp = &sc->params.sge.qset[i];
3323		qsp->coalesce_usecs = coalesce_usecs;
3324
3325		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3326			    &sc->sge.qs[0].rspq.lock;
3327
3328		mtx_lock(lock);
3329		t3_update_qset_coalesce(qs, qsp);
3330		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3331		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3332		mtx_unlock(lock);
3333	}
3334
3335	return (0);
3336}
3337
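/*
 * Sysctl handler that toggles F_ENABLERXPKTTMSTPRSS, which makes the
 * hardware report a packet timestamp in place of the RSS hash on received
 * packets.
 */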
3338static int
3339t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3340{
3341	adapter_t *sc = arg1;
3342	int rc, timestamp;
3343
3344	if ((sc->flags & FULL_INIT_DONE) == 0)
3345		return (ENXIO);
3346
3347	timestamp = sc->timestamp;
3348	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3349
3350	if (rc != 0)
3351		return (rc);
3352
3353	if (timestamp != sc->timestamp) {
3354		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3355		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3356		sc->timestamp = timestamp;
3357	}
3358
3359	return (0);
3360}
3361
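/*
 * Register the adapter-wide sysctls that are available as soon as the device
 * attaches: firmware version, hardware revision, port types, debug knobs,
 * and a few global counters.
 */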
3362void
3363t3_add_attach_sysctls(adapter_t *sc)
3364{
3365	struct sysctl_ctx_list *ctx;
3366	struct sysctl_oid_list *children;
3367
3368	ctx = device_get_sysctl_ctx(sc->dev);
3369	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3370
3371	/* random information */
3372	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3373	    "firmware_version",
3374	    CTLFLAG_RD, &sc->fw_version,
3375	    0, "firmware version");
3376	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3377	    "hw_revision",
3378	    CTLFLAG_RD, &sc->params.rev,
3379	    0, "chip model");
3380	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3381	    "port_types",
3382	    CTLFLAG_RD, &sc->port_types,
3383	    0, "type of ports");
3384	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3385	    "enable_debug",
3386	    CTLFLAG_RW, &cxgb_debug,
3387	    0, "enable verbose debugging output");
3388	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3389	    CTLFLAG_RD, &sc->tunq_coalesce,
3390	    "#tunneled packets freed");
3391	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3392	    "txq_overrun",
3393	    CTLFLAG_RD, &txq_fills,
3394	    0, "#times txq overrun");
3395	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3396	    "core_clock",
3397	    CTLFLAG_RD, &sc->params.vpd.cclk,
	    0, "core clock frequency (in kHz)");
3399}
3400
3401
3402static const char *rspq_name = "rspq";
3403static const char *txq_names[] =
3404{
3405	"txq_eth",
3406	"txq_ofld",
3407	"txq_ctrl"
3408};
3409
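/*
 * Sysctl handler for the per-port MAC statistics.  arg2 carries the offset
 * of the requested counter within struct mac_stats; all counters are
 * refreshed from the hardware before the requested value is returned.
 */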
3410static int
3411sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3412{
3413	struct port_info *p = arg1;
3414	uint64_t *parg;
3415
3416	if (!p)
3417		return (EINVAL);
3418
3419	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3420	PORT_LOCK(p);
3421	t3_mac_update_stats(&p->mac);
3422	PORT_UNLOCK(p);
3423
3424	return (sysctl_handle_64(oidp, parg, 0, req));
3425}
3426
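/*
 * Register the sysctls that depend on the adapter's final configuration:
 * per-port nodes, per-queue-set statistics and queue dumps, and the MAC
 * statistics counters.
 */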
3427void
3428t3_add_configured_sysctls(adapter_t *sc)
3429{
3430	struct sysctl_ctx_list *ctx;
3431	struct sysctl_oid_list *children;
3432	int i, j;
3433
3434	ctx = device_get_sysctl_ctx(sc->dev);
3435	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3436
3437	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3438	    "intr_coal",
3439	    CTLTYPE_INT|CTLFLAG_RW, sc,
3440	    0, t3_set_coalesce_usecs,
3441	    "I", "interrupt coalescing timer (us)");
3442
3443	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3444	    "pkt_timestamp",
3445	    CTLTYPE_INT | CTLFLAG_RW, sc,
3446	    0, t3_pkt_timestamp,
3447	    "I", "provide packet timestamp instead of connection hash");
3448
3449	for (i = 0; i < sc->params.nports; i++) {
3450		struct port_info *pi = &sc->port[i];
3451		struct sysctl_oid *poid;
3452		struct sysctl_oid_list *poidlist;
3453		struct mac_stats *mstats = &pi->mac.stats;
3454
3455		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3456		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3457		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3458		poidlist = SYSCTL_CHILDREN(poid);
3459		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3460		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3461		    0, "#queue sets");
3462
3463		for (j = 0; j < pi->nqsets; j++) {
3464			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3465			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3466					  *ctrlqpoid, *lropoid;
3467			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3468					       *txqpoidlist, *ctrlqpoidlist,
3469					       *lropoidlist;
3470			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3471
3472			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3473
3474			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3475			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3476			qspoidlist = SYSCTL_CHILDREN(qspoid);
3477
3478			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3479					CTLFLAG_RD, &qs->fl[0].empty, 0,
3480					"freelist #0 empty");
3481			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3482					CTLFLAG_RD, &qs->fl[1].empty, 0,
3483					"freelist #1 empty");
3484
3485			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3486			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3487			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3488
3489			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3490			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3491			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3492
3493			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3494			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3495			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3496
3497			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3498			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3499			lropoidlist = SYSCTL_CHILDREN(lropoid);
3500
3501			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3502			    CTLFLAG_RD, &qs->rspq.size,
3503			    0, "#entries in response queue");
3504			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3505			    CTLFLAG_RD, &qs->rspq.cidx,
3506			    0, "consumer index");
3507			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3508			    CTLFLAG_RD, &qs->rspq.credits,
3509			    0, "#credits");
3510			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3511			    CTLFLAG_RD, &qs->rspq.starved,
3512			    0, "#times starved");
3513			SYSCTL_ADD_ULONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3514			    CTLFLAG_RD, &qs->rspq.phys_addr,
			    "physical address of the queue");
3516			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3517			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3518			    0, "start rspq dump entry");
3519			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3520			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3521			    0, "#rspq entries to dump");
3522			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3523			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3524			    0, t3_dump_rspq, "A", "dump of the response queue");
3525
3526			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3527			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3528			    "#tunneled packets dropped");
3529			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3530			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3531			    0, "#tunneled packets waiting to be sent");
3532#if 0
3533			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3534			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3535			    0, "#tunneled packets queue producer index");
3536			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3537			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3538			    0, "#tunneled packets queue consumer index");
3539#endif
3540			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3541			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3542			    0, "#tunneled packets processed by the card");
3543			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3544			    CTLFLAG_RD, &txq->cleaned,
3545			    0, "#tunneled packets cleaned");
3546			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3547			    CTLFLAG_RD, &txq->in_use,
3548			    0, "#tunneled packet slots in use");
3549			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3550			    CTLFLAG_RD, &txq->txq_frees,
3551			    "#tunneled packets freed");
3552			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3553			    CTLFLAG_RD, &txq->txq_skipped,
3554			    0, "#tunneled packet descriptors skipped");
3555			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3556			    CTLFLAG_RD, &txq->txq_coalesced,
3557			    "#tunneled packets coalesced");
3558			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3559			    CTLFLAG_RD, &txq->txq_enqueued,
3560			    0, "#tunneled packets enqueued to hardware");
3561			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3562			    CTLFLAG_RD, &qs->txq_stopped,
3563			    0, "tx queues stopped");
3564			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3565			    CTLFLAG_RD, &txq->phys_addr,
			    "physical address of the queue");
3567			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3568			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3569			    0, "txq generation");
3570			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3571			    CTLFLAG_RD, &txq->cidx,
3572			    0, "hardware queue cidx");
3573			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3574			    CTLFLAG_RD, &txq->pidx,
3575			    0, "hardware queue pidx");
3576			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3577			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3578			    0, "txq start idx for dump");
3579			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3580			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3581			    0, "txq #entries to dump");
3582			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3583			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3584			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3585
3586			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3587			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3588			    0, "ctrlq start idx for dump");
3589			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3590			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3591			    0, "ctrl #entries to dump");
3592			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3593			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3594			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3595
3596			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3597			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3598			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3599			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3600			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3601			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3602			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3603			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3604		}
3605
3606		/* Now add a node for mac stats. */
3607		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3608		    CTLFLAG_RD, NULL, "MAC statistics");
3609		poidlist = SYSCTL_CHILDREN(poid);
3610
		/*
		 * We (ab)use the length argument (arg2) to pass the offset
		 * of the data we are interested in.  This is only required
		 * for the quad counters that are updated from the hardware,
		 * so that we always return the latest value.
		 * sysctl_handle_macstat first updates *all* the counters from
		 * the hardware and then returns the latest value of the
		 * requested counter (an illustrative sketch of such a handler
		 * follows this function).  Ideally we would update only the
		 * requested counter, but t3_mac_update_stats() hides the
		 * register details and we don't want to dive into all that
		 * here.
		 */
#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
    (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
    sysctl_handle_macstat, "QU", 0)
		CXGB_SYSCTL_ADD_QUAD(tx_octets);
		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
		CXGB_SYSCTL_ADD_QUAD(tx_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_pause);
		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
		CXGB_SYSCTL_ADD_QUAD(rx_octets);
		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
		CXGB_SYSCTL_ADD_QUAD(rx_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_pause);
		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_runt);
		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
		CXGB_SYSCTL_ADD_QUAD(rx_short);
		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
#undef CXGB_SYSCTL_ADD_QUAD

#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
    CTLFLAG_RD, &mstats->a, 0)
		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
		CXGB_SYSCTL_ADD_ULONG(num_toggled);
		CXGB_SYSCTL_ADD_ULONG(num_resets);
		CXGB_SYSCTL_ADD_ULONG(link_faults);
#undef CXGB_SYSCTL_ADD_ULONG
	}
}

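/*
 * Illustrative sketch only -- not the driver's sysctl_handle_macstat, which
 * is installed by the CXGB_SYSCTL_ADD_QUAD macro above.  It shows the flow
 * that the block comment above describes: refresh every MAC counter from the
 * hardware, then return the 64-bit value found at the byte offset smuggled in
 * through arg2.  The helper name is hypothetical and the body is an
 * assumption, so it is kept under #if 0.
 */
#if 0
static int
example_handle_macstat(SYSCTL_HANDLER_ARGS)
{
	struct port_info *p = arg1;
	uint64_t val;

	/* Refresh *all* counters; t3_mac_update_stats() reads the whole block. */
	PORT_LOCK(p);
	t3_mac_update_stats(&p->mac);
	PORT_UNLOCK(p);

	/* arg2 carries offsetof(struct mac_stats, <counter>). */
	val = *(uint64_t *)((char *)&p->mac.stats + arg2);

	return (sysctl_handle_64(oidp, &val, 0, req));
}
#endif
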
/**
 *	t3_get_desc - dump an SGE descriptor for debugging purposes
 *	@qs: the queue set
 *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
 *	@idx: the descriptor index in the queue
 *	@data: where to dump the descriptor contents
 *
 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
 *	size of the descriptor, or -EINVAL if the queue or index is invalid.
 */
int
t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
		unsigned char *data)
{
	if (qnum >= 6)
		return (-EINVAL);

	if (qnum < 3) {
		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
			return (-EINVAL);
		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
		return (sizeof(struct tx_desc));
	}

	if (qnum == 3) {
		if (!qs->rspq.desc || idx >= qs->rspq.size)
			return (-EINVAL);
		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
		return (sizeof(struct rsp_desc));
	}

	qnum -= 4;
	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
		return (-EINVAL);
	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
	return (sizeof(struct rx_desc));
}

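/*
 * Illustrative sketch only: one way a debugging caller could use the queue
 * numbering documented above to copy out a response-queue descriptor
 * (qnum 3).  example_dump_rspq_desc is a hypothetical helper, not part of
 * the driver, so it is kept under #if 0.
 */
#if 0
static void
example_dump_rspq_desc(const struct sge_qset *qs, unsigned int idx)
{
	unsigned char buf[sizeof(struct rsp_desc)];
	int len;

	/* 0..2 select the Tx queues, 3 the response queue, 4..5 the free lists. */
	len = t3_get_desc(qs, 3, idx, buf);
	if (len == (int)sizeof(struct rsp_desc))
		log(LOG_DEBUG, "rspq[%u]: %d-byte descriptor, first byte 0x%02x\n",
		    idx, len, buf[0]);
}
#endif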