1/**************************************************************************
2SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3
4Copyright (c) 2007-2009, Chelsio Inc.
5All rights reserved.
6
7Redistribution and use in source and binary forms, with or without
8modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice,
11    this list of conditions and the following disclaimer.
12
13 2. Neither the name of the Chelsio Corporation nor the names of its
14    contributors may be used to endorse or promote products derived from
15    this software without specific prior written permission.
16
17THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27POSSIBILITY OF SUCH DAMAGE.
28
29***************************************************************************/
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include "opt_inet6.h"
35#include "opt_inet.h"
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/module.h>
41#include <sys/bus.h>
42#include <sys/conf.h>
43#include <machine/bus.h>
44#include <machine/resource.h>
45#include <sys/rman.h>
46#include <sys/queue.h>
47#include <sys/sysctl.h>
48#include <sys/taskqueue.h>
49
50#include <sys/proc.h>
51#include <sys/sbuf.h>
52#include <sys/sched.h>
53#include <sys/smp.h>
54#include <sys/systm.h>
55#include <sys/syslog.h>
56#include <sys/socket.h>
57#include <sys/sglist.h>
58
59#include <net/if.h>
60#include <net/if_var.h>
61#include <net/bpf.h>
62#include <net/ethernet.h>
63#include <net/if_vlan_var.h>
64
65#include <netinet/in_systm.h>
66#include <netinet/in.h>
67#include <netinet/ip.h>
68#include <netinet/ip6.h>
69#include <netinet/tcp.h>
70
71#include <dev/pci/pcireg.h>
72#include <dev/pci/pcivar.h>
73
74#include <vm/vm.h>
75#include <vm/pmap.h>
76
77#include <cxgb_include.h>
78#include <sys/mvec.h>
79
80int	txq_fills = 0;
81int	multiq_tx_enable = 1;
82
83#ifdef TCP_OFFLOAD
84CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
85#endif
86
87extern struct sysctl_oid_list sysctl__hw_cxgb_children;
88int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
89SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
90    "size of per-queue mbuf ring");
91
92static int cxgb_tx_coalesce_force = 0;
93SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN,
94    &cxgb_tx_coalesce_force, 0,
95    "coalesce small packets into a single work request regardless of ring state");
96
#define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
#define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
#define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
#define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
#define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
#define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
104
105
106static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
107SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN,
108    &cxgb_tx_coalesce_enable_start, 0,
109    "coalesce enable threshold");
110static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
111SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN,
112    &cxgb_tx_coalesce_enable_stop, 0,
113    "coalesce disable threshold");
114static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
115SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN,
116    &cxgb_tx_reclaim_threshold, 0,
117    "tx cleaning minimum threshold");
118
119/*
120 * XXX don't re-enable this until TOE stops assuming
121 * we have an m_ext
122 */
123static int recycle_enable = 0;
124
125extern int cxgb_use_16k_clusters;
126extern int nmbjumbop;
127extern int nmbjumbo9;
128extern int nmbjumbo16;
129
130#define USE_GTS 0
131
132#define SGE_RX_SM_BUF_SIZE	1536
133#define SGE_RX_DROP_THRES	16
134#define SGE_RX_COPY_THRES	128
135
136/*
137 * Period of the Tx buffer reclaim timer.  This timer does not need to run
138 * frequently as Tx buffers are usually reclaimed by new Tx packets.
139 */
140#define TX_RECLAIM_PERIOD       (hz >> 1)
141
142/*
143 * Values for sge_txq.flags
144 */
145enum {
146	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
147	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
148};
149
150struct tx_desc {
151	uint64_t	flit[TX_DESC_FLITS];
152} __packed;
153
154struct rx_desc {
155	uint32_t	addr_lo;
156	uint32_t	len_gen;
157	uint32_t	gen2;
158	uint32_t	addr_hi;
159} __packed;
160
161struct rsp_desc {               /* response queue descriptor */
162	struct rss_header	rss_hdr;
163	uint32_t		flags;
164	uint32_t		len_cq;
165	uint8_t			imm_data[47];
166	uint8_t			intr_gen;
167} __packed;
168
169#define RX_SW_DESC_MAP_CREATED	(1 << 0)
170#define TX_SW_DESC_MAP_CREATED	(1 << 1)
171#define RX_SW_DESC_INUSE        (1 << 3)
172#define TX_SW_DESC_MAPPED       (1 << 4)
173
174#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
175#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
176#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
177#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
178
179struct tx_sw_desc {                /* SW state per Tx descriptor */
180	struct mbuf	*m;
181	bus_dmamap_t	map;
182	int		flags;
183};
184
185struct rx_sw_desc {                /* SW state per Rx descriptor */
186	caddr_t		rxsd_cl;
187	struct mbuf	*m;
188	bus_dmamap_t	map;
189	int		flags;
190};
191
192struct txq_state {
193	unsigned int	compl;
194	unsigned int	gen;
195	unsigned int	pidx;
196};
197
198struct refill_fl_cb_arg {
199	int               error;
200	bus_dma_segment_t seg;
201	int               nseg;
202};
203
204
205/*
206 * Maps a number of flits to the number of Tx descriptors that can hold them.
207 * The formula is
208 *
209 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
210 *
211 * HW allows up to 4 descriptors to be combined into a WR.
212 */
213static uint8_t flit_desc_map[] = {
214	0,
215#if SGE_NUM_GENBITS == 1
216	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
218	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
219	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
220#elif SGE_NUM_GENBITS == 2
221	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
225#else
226# error "SGE_NUM_GENBITS must be 1 or 2"
227#endif
228};
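
/*
 * For illustration: assuming SGE_NUM_GENBITS == 2, a descriptor holds
 * WR_FLITS == 15 usable flits, so a 20-flit work request maps to
 *
 *	desc = 1 + (20 - 2) / (15 - 1) = 2,
 *
 * which matches flit_desc_map[20] == 2 in the table above.
 */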
229
230#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
231#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
232#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
233#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
234#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
235#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
236	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
237#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
238#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
239	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
240#define	TXQ_RING_DEQUEUE(qs) \
241	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242
243int cxgb_debug = 0;
244
245static void sge_timer_cb(void *arg);
246static void sge_timer_reclaim(void *arg, int ncount);
247static void sge_txq_reclaim_handler(void *arg, int ncount);
248static void cxgb_start_locked(struct sge_qset *qs);
249
250/*
 * XXX need to cope with bursty scheduling by looking at a wider
 * window than we do now when determining the need for coalescing
253 *
254 */
255static __inline uint64_t
256check_pkt_coalesce(struct sge_qset *qs)
257{
	struct adapter *sc;
	struct sge_txq *txq;
	uint8_t *fill;

	if (__predict_false(cxgb_tx_coalesce_force))
		return (1);
	txq = &qs->txq[TXQ_ETH];
	sc = qs->port->adapter;
	fill = &sc->tunq_fill[qs->idx];
267
268	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
269		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
	/*
	 * Once the hardware transmit queue fills past
	 * cxgb_tx_coalesce_enable_start descriptors we mark it as
	 * coalescing.  We drop the mark only when usage falls back below
	 * cxgb_tx_coalesce_enable_stop, the mbuf ring is empty, and the
	 * queue set is not already coalescing; this provides some degree
	 * of hysteresis.
	 */
	if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
		*fill = 0;
	else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
		*fill = 1;
283
284	return (sc->tunq_coalesce);
285}
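
/*
 * For illustration: with the default thresholds (enable_start at half of
 * TX_ETH_Q_SIZE, enable_stop at a quarter), tunq_fill is set once about
 * half of the hardware Tx descriptors are in use and is cleared again only
 * after usage drops below a quarter with an empty mbuf ring and no
 * coalescing in progress, so the decision does not flap on small load
 * changes.
 */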
286
287#ifdef __LP64__
288static void
289set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
290{
291	uint64_t wr_hilo;
292#if _BYTE_ORDER == _LITTLE_ENDIAN
293	wr_hilo = wr_hi;
294	wr_hilo |= (((uint64_t)wr_lo)<<32);
295#else
296	wr_hilo = wr_lo;
297	wr_hilo |= (((uint64_t)wr_hi)<<32);
298#endif
299	wrp->wrh_hilo = wr_hilo;
300}
301#else
302static void
303set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
304{
305
306	wrp->wrh_hi = wr_hi;
307	wmb();
308	wrp->wrh_lo = wr_lo;
309}
310#endif
311
312struct coalesce_info {
313	int count;
314	int nbytes;
315	int noncoal;
316};
317
318static int
319coalesce_check(struct mbuf *m, void *arg)
320{
321	struct coalesce_info *ci = arg;
322
323	if ((m->m_next != NULL) ||
324	    ((mtod(m, vm_offset_t) & PAGE_MASK) + m->m_len > PAGE_SIZE))
325		ci->noncoal = 1;
326
327	if ((ci->count == 0) || (ci->noncoal == 0 && (ci->count < 7) &&
328	    (ci->nbytes + m->m_len <= 10500))) {
329		ci->count++;
330		ci->nbytes += m->m_len;
331		return (1);
332	}
333	return (0);
334}
335
336static struct mbuf *
337cxgb_dequeue(struct sge_qset *qs)
338{
339	struct mbuf *m, *m_head, *m_tail;
340	struct coalesce_info ci;
341
342
343	if (check_pkt_coalesce(qs) == 0)
344		return TXQ_RING_DEQUEUE(qs);
345
346	m_head = m_tail = NULL;
347	ci.count = ci.nbytes = ci.noncoal = 0;
348	do {
349		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
350		if (m_head == NULL) {
351			m_tail = m_head = m;
352		} else if (m != NULL) {
353			m_tail->m_nextpkt = m;
354			m_tail = m;
355		}
356	} while (m != NULL);
357	if (ci.count > 7)
358		panic("trying to coalesce %d packets in to one WR", ci.count);
359	return (m_head);
360}
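
/*
 * For illustration: when coalescing is active, cxgb_dequeue() above may
 * pull up to 7 small packets (at most 10500 bytes total, each a single
 * mbuf whose data does not cross a page boundary) off the buf_ring and
 * chain them through m_nextpkt; t3_encap() then emits the whole chain as
 * one batched work request.
 */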
361
362/**
363 *	reclaim_completed_tx - reclaims completed Tx descriptors
 *	@qs: the queue set owning the Tx queue
 *	@reclaim_min: do nothing unless at least this many descriptors can be reclaimed
 *	@queue: the index of the Tx queue within the set (TXQ_ETH, TXQ_OFLD, or TXQ_CTRL)
366 *
367 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
368 *	and frees the associated buffers if possible.  Called with the Tx
369 *	queue's lock held.
370 */
371static __inline int
372reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
373{
374	struct sge_txq *q = &qs->txq[queue];
375	int reclaim = desc_reclaimable(q);
376
377	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
378	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
379		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
380
381	if (reclaim < reclaim_min)
382		return (0);
383
384	mtx_assert(&qs->lock, MA_OWNED);
385	if (reclaim > 0) {
386		t3_free_tx_desc(qs, reclaim, queue);
387		q->cleaned += reclaim;
388		q->in_use -= reclaim;
389	}
390	if (isset(&qs->txq_stopped, TXQ_ETH))
		clrbit(&qs->txq_stopped, TXQ_ETH);
392
393	return (reclaim);
394}
395
396#ifdef NETDUMP
397int
398cxgb_netdump_poll_tx(struct sge_qset *qs)
399{
400
401	return (reclaim_completed_tx(qs, TX_RECLAIM_MAX, TXQ_ETH));
402}
403#endif
404
405/**
406 *	should_restart_tx - are there enough resources to restart a Tx queue?
407 *	@q: the Tx queue
408 *
409 *	Checks if there are enough descriptors to restart a suspended Tx queue.
410 */
411static __inline int
412should_restart_tx(const struct sge_txq *q)
413{
414	unsigned int r = q->processed - q->cleaned;
415
	return (q->in_use - r < (q->size >> 1));
417}
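
/*
 * For example, with a 1024-entry Tx queue a suspended queue would be
 * restarted once fewer than 512 descriptors remain genuinely outstanding
 * (in use and not yet reclaimable).
 */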
418
419/**
420 *	t3_sge_init - initialize SGE
421 *	@adap: the adapter
422 *	@p: the SGE parameters
423 *
424 *	Performs SGE initialization needed every time after a chip reset.
425 *	We do not initialize any of the queue sets here, instead the driver
426 *	top-level must request those individually.  We also do not enable DMA
427 *	here, that should be done after the queues have been set up.
428 */
429void
430t3_sge_init(adapter_t *adap, struct sge_params *p)
431{
432	u_int ctrl, ups;
433
434	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
435
436	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
437	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
438	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
439	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
440#if SGE_NUM_GENBITS == 1
441	ctrl |= F_EGRGENCTRL;
442#endif
443	if (adap->params.rev > 0) {
444		if (!(adap->flags & (USING_MSIX | USING_MSI)))
445			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
446	}
447	t3_write_reg(adap, A_SG_CONTROL, ctrl);
448	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
449		     V_LORCQDRBTHRSH(512));
450	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
451	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
452		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
453	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
454		     adap->params.rev < T3_REV_C ? 1000 : 500);
455	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
456	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
457	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
458	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
459	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
460}
461
462
463/**
464 *	sgl_len - calculates the size of an SGL of the given capacity
465 *	@n: the number of SGL entries
466 *
467 *	Calculates the number of flits needed for a scatter/gather list that
468 *	can hold the given number of entries.
469 */
470static __inline unsigned int
471sgl_len(unsigned int n)
472{
473	return ((3 * n) / 2 + (n & 1));
474}
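
/*
 * Each sg_ent carries two 64-bit addresses and two 32-bit lengths, i.e.
 * three flits per pair of entries, so for example sgl_len(1) == 2,
 * sgl_len(2) == 3, sgl_len(3) == 5, and sgl_len(4) == 6.
 */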
475
476/**
477 *	get_imm_packet - return the next ingress packet buffer from a response
 *	@sc: the adapter softc
 *	@resp: the response descriptor containing the packet data
 *	@m: the mbuf to copy the immediate data into
479 *
480 *	Return a packet containing the immediate data of the given response.
481 */
482static int
483get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
484{
485
486	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
487		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
488		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
489	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
490		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
491		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
492	} else
493		m->m_len = IMMED_PKT_SIZE;
494	m->m_ext.ext_buf = NULL;
495	m->m_ext.ext_type = 0;
496	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
497	return (0);
498}
499
500static __inline u_int
501flits_to_desc(u_int n)
502{
503	return (flit_desc_map[n]);
504}
505
506#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
507		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
508		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
509		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
510		    F_HIRCQPARITYERROR)
511#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
512#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
513		      F_RSPQDISABLED)
514
515/**
516 *	t3_sge_err_intr_handler - SGE async event interrupt handler
517 *	@adapter: the adapter
518 *
519 *	Interrupt handler for SGE asynchronous (non-data) events.
520 */
521void
522t3_sge_err_intr_handler(adapter_t *adapter)
523{
524	unsigned int v, status;
525
526	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
527	if (status & SGE_PARERR)
528		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
529			 status & SGE_PARERR);
530	if (status & SGE_FRAMINGERR)
531		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
532			 status & SGE_FRAMINGERR);
533	if (status & F_RSPQCREDITOVERFOW)
534		CH_ALERT(adapter, "SGE response queue credit overflow\n");
535
536	if (status & F_RSPQDISABLED) {
537		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
538
539		CH_ALERT(adapter,
540			 "packet delivered to disabled response queue (0x%x)\n",
541			 (v >> S_RSPQ0DISABLED) & 0xff);
542	}
543
544	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
545	if (status & SGE_FATALERR)
546		t3_fatal_err(adapter);
547}
548
549void
550t3_sge_prep(adapter_t *adap, struct sge_params *p)
551{
552	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
553
554	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
555	nqsets *= adap->params.nports;
556
557	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
558
559	while (!powerof2(fl_q_size))
560		fl_q_size--;
561
562	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
563	    is_offload(adap);
564
565#if __FreeBSD_version >= 700111
566	if (use_16k) {
567		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
568		jumbo_buf_size = MJUM16BYTES;
569	} else {
570		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
571		jumbo_buf_size = MJUM9BYTES;
572	}
573#else
574	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
575	jumbo_buf_size = MJUMPAGESIZE;
576#endif
577	while (!powerof2(jumbo_q_size))
578		jumbo_q_size--;
579
580	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
581		device_printf(adap->dev,
582		    "Insufficient clusters and/or jumbo buffers.\n");
583
584	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
585
586	for (i = 0; i < SGE_QSETS; ++i) {
587		struct qset_params *q = p->qset + i;
588
589		if (adap->params.nports > 2) {
590			q->coalesce_usecs = 50;
591		} else {
592#ifdef INVARIANTS
593			q->coalesce_usecs = 10;
594#else
595			q->coalesce_usecs = 5;
596#endif
597		}
598		q->polling = 0;
599		q->rspq_size = RSPQ_Q_SIZE;
600		q->fl_size = fl_q_size;
601		q->jumbo_size = jumbo_q_size;
602		q->jumbo_buf_size = jumbo_buf_size;
603		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
604		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
605		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
606		q->cong_thres = 0;
607	}
608}
609
610int
611t3_sge_alloc(adapter_t *sc)
612{
613
614	/* The parent tag. */
615	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
616				1, 0,			/* algnmnt, boundary */
617				BUS_SPACE_MAXADDR,	/* lowaddr */
618				BUS_SPACE_MAXADDR,	/* highaddr */
619				NULL, NULL,		/* filter, filterarg */
620				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
621				BUS_SPACE_UNRESTRICTED, /* nsegments */
622				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
623				0,			/* flags */
624				NULL, NULL,		/* lock, lockarg */
625				&sc->parent_dmat)) {
626		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
627		return (ENOMEM);
628	}
629
630	/*
631	 * DMA tag for normal sized RX frames
632	 */
633	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
634		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
635		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
636		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
637		return (ENOMEM);
638	}
639
640	/*
641	 * DMA tag for jumbo sized RX frames.
642	 */
643	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
644		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
645		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
646		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
647		return (ENOMEM);
648	}
649
650	/*
651	 * DMA tag for TX frames.
652	 */
653	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
654		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
655		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
656		NULL, NULL, &sc->tx_dmat)) {
657		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
658		return (ENOMEM);
659	}
660
661	return (0);
662}
663
664int
665t3_sge_free(struct adapter * sc)
666{
667
668	if (sc->tx_dmat != NULL)
669		bus_dma_tag_destroy(sc->tx_dmat);
670
671	if (sc->rx_jumbo_dmat != NULL)
672		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
673
674	if (sc->rx_dmat != NULL)
675		bus_dma_tag_destroy(sc->rx_dmat);
676
677	if (sc->parent_dmat != NULL)
678		bus_dma_tag_destroy(sc->parent_dmat);
679
680	return (0);
681}
682
683void
684t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
685{
686
687	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
688	qs->rspq.polling = 0 /* p->polling */;
689}
690
691#if !defined(__i386__) && !defined(__amd64__)
692static void
693refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
694{
695	struct refill_fl_cb_arg *cb_arg = arg;
696
697	cb_arg->error = error;
698	cb_arg->seg = segs[0];
699	cb_arg->nseg = nseg;
700
701}
702#endif
703/**
704 *	refill_fl - refill an SGE free-buffer list
705 *	@sc: the controller softc
706 *	@q: the free-list to refill
707 *	@n: the number of new buffers to allocate
708 *
709 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
710 *	The caller must assure that @n does not exceed the queue's capacity.
711 */
712static void
713refill_fl(adapter_t *sc, struct sge_fl *q, int n)
714{
715	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
716	struct rx_desc *d = &q->desc[q->pidx];
717	struct refill_fl_cb_arg cb_arg;
718	struct mbuf *m;
719	caddr_t cl;
720	int err;
721
722	cb_arg.error = 0;
723	while (n--) {
724		/*
725		 * We allocate an uninitialized mbuf + cluster, mbuf is
726		 * initialized after rx.
727		 */
728		if (q->zone == zone_pack) {
729			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
730				break;
731			cl = m->m_ext.ext_buf;
732		} else {
733			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
734				break;
735			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
736				uma_zfree(q->zone, cl);
737				break;
738			}
739		}
740		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
741			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
742				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
743				uma_zfree(q->zone, cl);
744				goto done;
745			}
746			sd->flags |= RX_SW_DESC_MAP_CREATED;
747		}
748#if !defined(__i386__) && !defined(__amd64__)
749		err = bus_dmamap_load(q->entry_tag, sd->map,
750		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
751
752		if (err != 0 || cb_arg.error) {
753			if (q->zone != zone_pack)
754				uma_zfree(q->zone, cl);
755			m_free(m);
756			goto done;
757		}
758#else
759		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
760#endif
761		sd->flags |= RX_SW_DESC_INUSE;
762		sd->rxsd_cl = cl;
763		sd->m = m;
764		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
765		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
766		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
767		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
768
769		d++;
770		sd++;
771
772		if (++q->pidx == q->size) {
773			q->pidx = 0;
774			q->gen ^= 1;
775			sd = q->sdesc;
776			d = q->desc;
777		}
778		q->credits++;
779		q->db_pending++;
780	}
781
782done:
783	if (q->db_pending >= 32) {
784		q->db_pending = 0;
785		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
786	}
787}
788
789
790/**
791 *	free_rx_bufs - free the Rx buffers on an SGE free list
 *	@sc: the controller softc
793 *	@q: the SGE free list to clean up
794 *
795 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
796 *	this queue should be stopped before calling this function.
797 */
798static void
799free_rx_bufs(adapter_t *sc, struct sge_fl *q)
800{
801	u_int cidx = q->cidx;
802
803	while (q->credits--) {
804		struct rx_sw_desc *d = &q->sdesc[cidx];
805
806		if (d->flags & RX_SW_DESC_INUSE) {
807			bus_dmamap_unload(q->entry_tag, d->map);
808			bus_dmamap_destroy(q->entry_tag, d->map);
809			if (q->zone == zone_pack) {
810				m_init(d->m, M_NOWAIT, MT_DATA, M_EXT);
811				uma_zfree(zone_pack, d->m);
812			} else {
813				m_init(d->m, M_NOWAIT, MT_DATA, 0);
814				uma_zfree(zone_mbuf, d->m);
815				uma_zfree(q->zone, d->rxsd_cl);
816			}
817		}
818
819		d->rxsd_cl = NULL;
820		d->m = NULL;
821		if (++cidx == q->size)
822			cidx = 0;
823	}
824}
825
826static __inline void
827__refill_fl(adapter_t *adap, struct sge_fl *fl)
828{
829	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
830}
831
832static __inline void
833__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
834{
835	uint32_t reclaimable = fl->size - fl->credits;
836
837	if (reclaimable > 0)
838		refill_fl(adap, fl, min(max, reclaimable));
839}
840
841/**
842 *	recycle_rx_buf - recycle a receive buffer
843 *	@adapter: the adapter
844 *	@q: the SGE free list
845 *	@idx: index of buffer to recycle
846 *
847 *	Recycles the specified buffer on the given free list by adding it at
848 *	the next available slot on the list.
849 */
850static void
851recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
852{
853	struct rx_desc *from = &q->desc[idx];
854	struct rx_desc *to   = &q->desc[q->pidx];
855
856	q->sdesc[q->pidx] = q->sdesc[idx];
	to->addr_lo = from->addr_lo;	/* already big endian */
	to->addr_hi = from->addr_hi;	/* likewise */
859	wmb();	/* necessary ? */
860	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
861	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
862	q->credits++;
863
864	if (++q->pidx == q->size) {
865		q->pidx = 0;
866		q->gen ^= 1;
867	}
868	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
869}
870
871static void
872alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
873{
874	uint32_t *addr;
875
876	addr = arg;
877	*addr = segs[0].ds_addr;
878}
879
880static int
881alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
882    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
883    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
884{
885	size_t len = nelem * elem_size;
886	void *s = NULL;
887	void *p = NULL;
888	int err;
889
890	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
891				      BUS_SPACE_MAXADDR_32BIT,
892				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
893				      len, 0, NULL, NULL, tag)) != 0) {
894		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
895		return (ENOMEM);
896	}
897
898	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
899				    map)) != 0) {
900		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
901		return (ENOMEM);
902	}
903
904	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
905	bzero(p, len);
906	*(void **)desc = p;
907
908	if (sw_size) {
909		len = nelem * sw_size;
910		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
911		*(void **)sdesc = s;
912	}
913	if (parent_entry_tag == NULL)
914		return (0);
915
916	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
917				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
918		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
919				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
920		                      NULL, NULL, entry_tag)) != 0) {
921		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
922		return (ENOMEM);
923	}
924	return (0);
925}
926
927static void
928sge_slow_intr_handler(void *arg, int ncount)
929{
930	adapter_t *sc = arg;
931
932	t3_slow_intr_handler(sc);
933	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
934	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
935}
936
937/**
938 *	sge_timer_cb - perform periodic maintenance of an SGE qset
939 *	@data: the SGE queue set to maintain
940 *
941 *	Runs periodically from a timer to perform maintenance of an SGE queue
 *	set.  It performs the following tasks:
943 *
944 *	a) Cleans up any completed Tx descriptors that may still be pending.
945 *	Normal descriptor cleanup happens when new packets are added to a Tx
946 *	queue so this timer is relatively infrequent and does any cleanup only
947 *	if the Tx queue has not seen any new packets in a while.  We make a
948 *	best effort attempt to reclaim descriptors, in that we don't wait
949 *	around if we cannot get a queue's lock (which most likely is because
950 *	someone else is queueing new packets and so will also handle the clean
951 *	up).  Since control queues use immediate data exclusively we don't
952 *	bother cleaning them up here.
953 *
954 *	b) Replenishes Rx queues that have run out due to memory shortage.
955 *	Normally new Rx buffers are added when existing ones are consumed but
956 *	when out of memory a queue can become empty.  We try to add only a few
957 *	buffers here, the queue will be replenished fully as these new buffers
958 *	are used up if memory shortage has subsided.
959 *
960 *	c) Return coalesced response queue credits in case a response queue is
961 *	starved.
962 *
963 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
964 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
965 */
966static void
967sge_timer_cb(void *arg)
968{
969	adapter_t *sc = arg;
970	if ((sc->flags & USING_MSIX) == 0) {
971
972		struct port_info *pi;
973		struct sge_qset *qs;
974		struct sge_txq  *txq;
975		int i, j;
976		int reclaim_ofl, refill_rx;
977
978		if (sc->open_device_map == 0)
979			return;
980
981		for (i = 0; i < sc->params.nports; i++) {
982			pi = &sc->port[i];
983			for (j = 0; j < pi->nqsets; j++) {
984				qs = &sc->sge.qs[pi->first_qset + j];
985				txq = &qs->txq[0];
986				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
987				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
988				    (qs->fl[1].credits < qs->fl[1].size));
989				if (reclaim_ofl || refill_rx) {
990					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
991					break;
992				}
993			}
994		}
995	}
996
997	if (sc->params.nports > 2) {
998		int i;
999
1000		for_each_port(sc, i) {
1001			struct port_info *pi = &sc->port[i];
1002
1003			t3_write_reg(sc, A_SG_KDOORBELL,
1004				     F_SELEGRCNTX |
1005				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
1006		}
1007	}
1008	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
1009	    sc->open_device_map != 0)
1010		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1011}
1012
1013/*
1014 * This is meant to be a catch-all function to keep sge state private
1015 * to sge.c
1016 *
1017 */
1018int
1019t3_sge_init_adapter(adapter_t *sc)
1020{
1021	callout_init(&sc->sge_timer_ch, 1);
1022	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1023	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1024	return (0);
1025}
1026
1027int
1028t3_sge_reset_adapter(adapter_t *sc)
1029{
1030	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1031	return (0);
1032}
1033
1034int
1035t3_sge_init_port(struct port_info *pi)
1036{
1037	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1038	return (0);
1039}
1040
1041/**
1042 *	refill_rspq - replenish an SGE response queue
1043 *	@adapter: the adapter
1044 *	@q: the response queue to replenish
1045 *	@credits: how many new responses to make available
1046 *
1047 *	Replenishes a response queue by making the supplied number of responses
1048 *	available to HW.
1049 */
1050static __inline void
1051refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1052{
1053
1054	/* mbufs are allocated on demand when a rspq entry is processed. */
1055	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1056		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1057}
1058
1059static void
1060sge_txq_reclaim_handler(void *arg, int ncount)
1061{
1062	struct sge_qset *qs = arg;
1063	int i;
1064
1065	for (i = 0; i < 3; i++)
1066		reclaim_completed_tx(qs, 16, i);
1067}
1068
1069static void
1070sge_timer_reclaim(void *arg, int ncount)
1071{
1072	struct port_info *pi = arg;
1073	int i, nqsets = pi->nqsets;
1074	adapter_t *sc = pi->adapter;
1075	struct sge_qset *qs;
1076	struct mtx *lock;
1077
1078	KASSERT((sc->flags & USING_MSIX) == 0,
1079	    ("can't call timer reclaim for msi-x"));
1080
1081	for (i = 0; i < nqsets; i++) {
1082		qs = &sc->sge.qs[pi->first_qset + i];
1083
1084		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1085		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1086			    &sc->sge.qs[0].rspq.lock;
1087
1088		if (mtx_trylock(lock)) {
1089			/* XXX currently assume that we are *NOT* polling */
1090			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1091
1092			if (qs->fl[0].credits < qs->fl[0].size - 16)
1093				__refill_fl(sc, &qs->fl[0]);
1094			if (qs->fl[1].credits < qs->fl[1].size - 16)
1095				__refill_fl(sc, &qs->fl[1]);
1096
1097			if (status & (1 << qs->rspq.cntxt_id)) {
1098				if (qs->rspq.credits) {
1099					refill_rspq(sc, &qs->rspq, 1);
1100					qs->rspq.credits--;
1101					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1102					    1 << qs->rspq.cntxt_id);
1103				}
1104			}
1105			mtx_unlock(lock);
1106		}
1107	}
1108}
1109
1110/**
1111 *	init_qset_cntxt - initialize an SGE queue set context info
1112 *	@qs: the queue set
1113 *	@id: the queue set id
1114 *
1115 *	Initializes the TIDs and context ids for the queues of a queue set.
1116 */
1117static void
1118init_qset_cntxt(struct sge_qset *qs, u_int id)
1119{
1120
1121	qs->rspq.cntxt_id = id;
1122	qs->fl[0].cntxt_id = 2 * id;
1123	qs->fl[1].cntxt_id = 2 * id + 1;
1124	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1125	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1126	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1127	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1128	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1129
1130	/* XXX: a sane limit is needed instead of INT_MAX */
1131	mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX);
1132	mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX);
1133	mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX);
1134}
1135
1136
1137static void
1138txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1139{
1140	txq->in_use += ndesc;
1141	/*
	 * XXX we don't handle stopping of the queue; presumably start
	 * handles this when we bump against the end
1144	 */
1145	txqs->gen = txq->gen;
1146	txq->unacked += ndesc;
1147	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1148	txq->unacked &= 31;
1149	txqs->pidx = txq->pidx;
1150	txq->pidx += ndesc;
1151#ifdef INVARIANTS
1152	if (((txqs->pidx > txq->cidx) &&
1153		(txq->pidx < txqs->pidx) &&
1154		(txq->pidx >= txq->cidx)) ||
1155	    ((txqs->pidx < txq->cidx) &&
1156		(txq->pidx >= txq-> cidx)) ||
1157	    ((txqs->pidx < txq->cidx) &&
1158		(txq->cidx < txqs->pidx)))
1159		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1160		    txqs->pidx, txq->pidx, txq->cidx);
1161#endif
1162	if (txq->pidx >= txq->size) {
1163		txq->pidx -= txq->size;
1164		txq->gen ^= 1;
1165	}
1166
1167}
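
/*
 * For illustration: txq->unacked counts descriptors issued since the last
 * completion request; the (unacked & 32) test above requests a completion
 * (WR_COMPL) roughly once every 32 descriptors, keeping the reclaim path
 * fed without asking the hardware to report every work request.
 */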
1168
1169/**
1170 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1171 *	@m: the packet mbufs
1172 *      @nsegs: the number of segments
1173 *
1174 * 	Returns the number of Tx descriptors needed for the given Ethernet
1175 * 	packet.  Ethernet packets require addition of WR and CPL headers.
1176 */
1177static __inline unsigned int
1178calc_tx_descs(const struct mbuf *m, int nsegs)
1179{
1180	unsigned int flits;
1181
1182	if (m->m_pkthdr.len <= PIO_LEN)
1183		return 1;
1184
1185	flits = sgl_len(nsegs) + 2;
1186	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1187		flits++;
1188
1189	return flits_to_desc(flits);
1190}
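
/*
 * For example, a packet larger than PIO_LEN that maps to 10 DMA segments
 * needs sgl_len(10) + 2 == 17 flits (18 with TSO), which flits_to_desc()
 * turns into 2 Tx descriptors.
 */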
1191
1192/**
1193 *	make_sgl - populate a scatter/gather list for a packet
1194 *	@sgp: the SGL to populate
1195 *	@segs: the packet dma segments
1196 *	@nsegs: the number of segments
1197 *
 *	Generates a scatter/gather list for the buffers that make up a packet.
 *	The caller must have sized the SGL appropriately; its length in 8-byte
 *	flits is given by sgl_len(@nsegs).
1201 */
1202static __inline void
1203make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1204{
1205	int i, idx;
1206
1207	for (idx = 0, i = 0; i < nsegs; i++) {
1208		/*
1209		 * firmware doesn't like empty segments
1210		 */
1211		if (segs[i].ds_len == 0)
1212			continue;
1213		if (i && idx == 0)
1214			++sgp;
1215
1216		sgp->len[idx] = htobe32(segs[i].ds_len);
1217		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1218		idx ^= 1;
1219	}
1220
1221	if (idx) {
1222		sgp->len[idx] = 0;
1223		sgp->addr[idx] = 0;
1224	}
1225}
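
/*
 * For illustration: segments are packed two per sg_ent and zero-length
 * segments are skipped, so three non-empty segments occupy two sg_ents,
 * with the unused half of the last entry cleared to terminate the list.
 */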
1226
/**
 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
 *	@adap: the adapter
 *	@q: the Tx queue
 *	@mustring: ring the doorbell even if only a few descriptors are pending
 *
 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
 *	where the HW may go to sleep just after we check; in that case the
 *	interrupt handler will detect the outstanding Tx packet and ring the
 *	doorbell for us.
 *
 *	When GTS is disabled we ring the doorbell once 32 descriptors are
 *	pending, or immediately when @mustring is set.
 */
1239static __inline void
1240check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1241{
1242#if USE_GTS
1243	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1244	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1245		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1246#ifdef T3_TRACE
1247		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1248			  q->cntxt_id);
1249#endif
1250		t3_write_reg(adap, A_SG_KDOORBELL,
1251			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1252	}
1253#else
1254	if (mustring || ++q->db_pending >= 32) {
1255		wmb();            /* write descriptors before telling HW */
1256		t3_write_reg(adap, A_SG_KDOORBELL,
1257		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1258		q->db_pending = 0;
1259	}
1260#endif
1261}
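
/*
 * For illustration: with GTS compiled out (USE_GTS == 0), t3_encap() calls
 * this with mustring == 0, so the doorbell is written only once 32
 * descriptors have accumulated; the start/transmit paths later force a
 * write with mustring != 0 when they finish with db_pending descriptors
 * still unannounced to the hardware.
 */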
1262
1263static __inline void
1264wr_gen2(struct tx_desc *d, unsigned int gen)
1265{
1266#if SGE_NUM_GENBITS == 2
1267	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1268#endif
1269}
1270
1271/**
1272 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1273 *	@ndesc: number of Tx descriptors spanned by the SGL
1274 *	@txd: first Tx descriptor to be written
1275 *	@txqs: txq state (generation and producer index)
1276 *	@txq: the SGE Tx queue
1277 *	@sgl: the SGL
1278 *	@flits: number of flits to the start of the SGL in the first descriptor
1279 *	@sgl_flits: the SGL size in flits
1280 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1281 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1282 *
1283 *	Write a work request header and an associated SGL.  If the SGL is
1284 *	small enough to fit into one Tx descriptor it has already been written
1285 *	and we just need to write the WR header.  Otherwise we distribute the
1286 *	SGL across the number of descriptors it spans.
1287 */
1288static void
1289write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1290    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1291    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1292{
1293
1294	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1295	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1296
1297	if (__predict_true(ndesc == 1)) {
1298		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1299		    V_WR_SGLSFLT(flits)) | wr_hi,
1300		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
1301		    wr_lo);
1302
1303		wr_gen2(txd, txqs->gen);
1304
1305	} else {
1306		unsigned int ogen = txqs->gen;
1307		const uint64_t *fp = (const uint64_t *)sgl;
1308		struct work_request_hdr *wp = wrp;
1309
1310		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1311		    V_WR_SGLSFLT(flits)) | wr_hi;
1312
1313		while (sgl_flits) {
1314			unsigned int avail = WR_FLITS - flits;
1315
1316			if (avail > sgl_flits)
1317				avail = sgl_flits;
1318			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1319			sgl_flits -= avail;
1320			ndesc--;
1321			if (!sgl_flits)
1322				break;
1323
1324			fp += avail;
1325			txd++;
1326			txsd++;
1327			if (++txqs->pidx == txq->size) {
1328				txqs->pidx = 0;
1329				txqs->gen ^= 1;
1330				txd = txq->desc;
1331				txsd = txq->sdesc;
1332			}
1333
1334			/*
1335			 * when the head of the mbuf chain
1336			 * is freed all clusters will be freed
1337			 * with it
1338			 */
1339			wrp = (struct work_request_hdr *)txd;
1340			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1341			    V_WR_SGLSFLT(1)) | wr_hi;
1342			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1343				    sgl_flits + 1)) |
1344			    V_WR_GEN(txqs->gen)) | wr_lo;
1345			wr_gen2(txd, txqs->gen);
1346			flits = 1;
1347		}
1348		wrp->wrh_hi |= htonl(F_WR_EOP);
1349		wmb();
1350		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1351		wr_gen2((struct tx_desc *)wp, ogen);
1352	}
1353}
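
/*
 * For illustration (assuming WR_FLITS == 15): a request with flits == 3
 * and sgl_flits == 24 spans two descriptors; the first takes 12 SGL flits
 * after its 3-flit header and the second takes the remaining 12 after a
 * 1-flit continuation header, with the original generation bit written
 * last so the hardware never sees a partially written request.
 */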
1354
1355/* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1356#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1357
1358#define GET_VTAG(cntrl, m) \
1359do { \
1360	if ((m)->m_flags & M_VLANTAG)					            \
1361		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1362} while (0)
1363
1364static int
1365t3_encap(struct sge_qset *qs, struct mbuf **m)
1366{
1367	adapter_t *sc;
1368	struct mbuf *m0;
1369	struct sge_txq *txq;
1370	struct txq_state txqs;
1371	struct port_info *pi;
1372	unsigned int ndesc, flits, cntrl, mlen;
1373	int err, nsegs, tso_info = 0;
1374
1375	struct work_request_hdr *wrp;
1376	struct tx_sw_desc *txsd;
1377	struct sg_ent *sgp, *sgl;
1378	uint32_t wr_hi, wr_lo, sgl_flits;
1379	bus_dma_segment_t segs[TX_MAX_SEGS];
1380
1381	struct tx_desc *txd;
1382
1383	pi = qs->port;
1384	sc = pi->adapter;
1385	txq = &qs->txq[TXQ_ETH];
1386	txd = &txq->desc[txq->pidx];
1387	txsd = &txq->sdesc[txq->pidx];
1388	sgl = txq->txq_sgl;
1389
1390	prefetch(txd);
1391	m0 = *m;
1392
1393	mtx_assert(&qs->lock, MA_OWNED);
1394	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1395	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1396
1397	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1398	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1399		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1400
1401	if (m0->m_nextpkt != NULL) {
1402		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1403		ndesc = 1;
1404		mlen = 0;
1405	} else {
1406		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1407		    &m0, segs, &nsegs))) {
1408			if (cxgb_debug)
1409				printf("failed ... err=%d\n", err);
1410			return (err);
1411		}
1412		mlen = m0->m_pkthdr.len;
1413		ndesc = calc_tx_descs(m0, nsegs);
1414	}
1415	txq_prod(txq, ndesc, &txqs);
1416
1417	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1418	txsd->m = m0;
1419
1420	if (m0->m_nextpkt != NULL) {
1421		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1422		int i, fidx;
1423
1424		if (nsegs > 7)
1425			panic("trying to coalesce %d packets in to one WR", nsegs);
1426		txq->txq_coalesced += nsegs;
1427		wrp = (struct work_request_hdr *)txd;
1428		flits = nsegs*2 + 1;
1429
1430		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1431			struct cpl_tx_pkt_batch_entry *cbe;
1432			uint64_t flit;
1433			uint32_t *hflit = (uint32_t *)&flit;
1434			int cflags = m0->m_pkthdr.csum_flags;
1435
1436			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1437			GET_VTAG(cntrl, m0);
1438			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1439			if (__predict_false(!(cflags & CSUM_IP)))
1440				cntrl |= F_TXPKT_IPCSUM_DIS;
1441			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
1442			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1443				cntrl |= F_TXPKT_L4CSUM_DIS;
1444
1445			hflit[0] = htonl(cntrl);
1446			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1447			flit |= htobe64(1 << 24);
1448			cbe = &cpl_batch->pkt_entry[i];
1449			cbe->cntrl = hflit[0];
1450			cbe->len = hflit[1];
1451			cbe->addr = htobe64(segs[i].ds_addr);
1452		}
1453
1454		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1455		    V_WR_SGLSFLT(flits)) |
1456		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1457		wr_lo = htonl(V_WR_LEN(flits) |
1458		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1459		set_wr_hdr(wrp, wr_hi, wr_lo);
1460		wmb();
1461		ETHER_BPF_MTAP(pi->ifp, m0);
1462		wr_gen2(txd, txqs.gen);
1463		check_ring_tx_db(sc, txq, 0);
1464		return (0);
1465	} else if (tso_info) {
1466		uint16_t eth_type;
1467		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1468		struct ether_header *eh;
1469		void *l3hdr;
1470		struct tcphdr *tcp;
1471
1472		txd->flit[2] = 0;
1473		GET_VTAG(cntrl, m0);
1474		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1475		hdr->cntrl = htonl(cntrl);
1476		hdr->len = htonl(mlen | 0x80000000);
1477
1478		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1479			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x",
1480			    m0, mlen, m0->m_pkthdr.tso_segsz,
1481			    (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags);
1482			panic("tx tso packet too small");
1483		}
1484
1485		/* Make sure that ether, ip, tcp headers are all in m0 */
1486		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1487			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1488			if (__predict_false(m0 == NULL)) {
1489				/* XXX panic probably an overreaction */
1490				panic("couldn't fit header into mbuf");
1491			}
1492		}
1493
1494		eh = mtod(m0, struct ether_header *);
1495		eth_type = eh->ether_type;
1496		if (eth_type == htons(ETHERTYPE_VLAN)) {
1497			struct ether_vlan_header *evh = (void *)eh;
1498
1499			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1500			l3hdr = evh + 1;
1501			eth_type = evh->evl_proto;
1502		} else {
1503			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1504			l3hdr = eh + 1;
1505		}
1506
1507		if (eth_type == htons(ETHERTYPE_IP)) {
1508			struct ip *ip = l3hdr;
1509
1510			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1511			tcp = (struct tcphdr *)(ip + 1);
1512		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1513			struct ip6_hdr *ip6 = l3hdr;
1514
1515			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1516			    ("%s: CSUM_TSO with ip6_nxt %d",
1517			    __func__, ip6->ip6_nxt));
1518
1519			tso_info |= F_LSO_IPV6;
1520			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1521			tcp = (struct tcphdr *)(ip6 + 1);
1522		} else
1523			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1524
1525		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1526		hdr->lso_info = htonl(tso_info);
1527
1528		if (__predict_false(mlen <= PIO_LEN)) {
1529			/*
		 * The packet is not undersized, yet it still fits in PIO_LEN;
		 * this indicates a TSO bug at a higher layer.
1532			 */
1533			txsd->m = NULL;
1534			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1535			flits = (mlen + 7) / 8 + 3;
1536			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1537					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1538					  F_WR_SOP | F_WR_EOP | txqs.compl);
1539			wr_lo = htonl(V_WR_LEN(flits) |
1540			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1541			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1542			wmb();
1543			ETHER_BPF_MTAP(pi->ifp, m0);
1544			wr_gen2(txd, txqs.gen);
1545			check_ring_tx_db(sc, txq, 0);
1546			m_freem(m0);
1547			return (0);
1548		}
1549		flits = 3;
1550	} else {
1551		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1552
1553		GET_VTAG(cntrl, m0);
1554		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1555		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1556			cntrl |= F_TXPKT_IPCSUM_DIS;
1557		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
1558		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1559			cntrl |= F_TXPKT_L4CSUM_DIS;
1560		cpl->cntrl = htonl(cntrl);
1561		cpl->len = htonl(mlen | 0x80000000);
1562
1563		if (mlen <= PIO_LEN) {
1564			txsd->m = NULL;
1565			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1566			flits = (mlen + 7) / 8 + 2;
1567
1568			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1569			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1570					  F_WR_SOP | F_WR_EOP | txqs.compl);
1571			wr_lo = htonl(V_WR_LEN(flits) |
1572			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1573			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1574			wmb();
1575			ETHER_BPF_MTAP(pi->ifp, m0);
1576			wr_gen2(txd, txqs.gen);
1577			check_ring_tx_db(sc, txq, 0);
1578			m_freem(m0);
1579			return (0);
1580		}
1581		flits = 2;
1582	}
1583	wrp = (struct work_request_hdr *)txd;
1584	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1585	make_sgl(sgp, segs, nsegs);
1586
1587	sgl_flits = sgl_len(nsegs);
1588
1589	ETHER_BPF_MTAP(pi->ifp, m0);
1590
1591	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1592	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1593	wr_lo = htonl(V_WR_TID(txq->token));
1594	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1595	    sgl_flits, wr_hi, wr_lo);
1596	check_ring_tx_db(sc, txq, 0);
1597
1598	return (0);
1599}
1600
1601#ifdef NETDUMP
1602int
1603cxgb_netdump_encap(struct sge_qset *qs, struct mbuf **m)
1604{
1605	int error;
1606
1607	error = t3_encap(qs, m);
1608	if (error == 0)
1609		check_ring_tx_db(qs->port->adapter, &qs->txq[TXQ_ETH], 1);
1610	else if (*m != NULL) {
1611		m_freem(*m);
1612		*m = NULL;
1613	}
1614	return (error);
1615}
1616#endif
1617
1618void
1619cxgb_tx_watchdog(void *arg)
1620{
1621	struct sge_qset *qs = arg;
1622	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1623
	if (qs->coalescing != 0 &&
	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
	    TXQ_RING_EMPTY(qs))
		qs->coalescing = 0;
	else if (qs->coalescing == 0 &&
	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
		qs->coalescing = 1;
1631	if (TXQ_TRYLOCK(qs)) {
1632		qs->qs_flags |= QS_FLUSHING;
1633		cxgb_start_locked(qs);
1634		qs->qs_flags &= ~QS_FLUSHING;
1635		TXQ_UNLOCK(qs);
1636	}
1637	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1638		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1639		    qs, txq->txq_watchdog.c_cpu);
1640}
1641
1642static void
1643cxgb_tx_timeout(void *arg)
1644{
1645	struct sge_qset *qs = arg;
1646	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1647
1648	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
		qs->coalescing = 1;
1650	if (TXQ_TRYLOCK(qs)) {
1651		qs->qs_flags |= QS_TIMEOUT;
1652		cxgb_start_locked(qs);
1653		qs->qs_flags &= ~QS_TIMEOUT;
1654		TXQ_UNLOCK(qs);
1655	}
1656}
1657
1658static void
1659cxgb_start_locked(struct sge_qset *qs)
1660{
1661	struct mbuf *m_head = NULL;
1662	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1663	struct port_info *pi = qs->port;
1664	struct ifnet *ifp = pi->ifp;
1665
1666	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1667		reclaim_completed_tx(qs, 0, TXQ_ETH);
1668
1669	if (!pi->link_config.link_ok) {
1670		TXQ_RING_FLUSH(qs);
1671		return;
1672	}
1673	TXQ_LOCK_ASSERT(qs);
1674	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1675	    pi->link_config.link_ok) {
1676		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1677
1678		if (txq->size - txq->in_use <= TX_MAX_DESC)
1679			break;
1680
1681		if ((m_head = cxgb_dequeue(qs)) == NULL)
1682			break;
1683		/*
		 *  Encapsulation can modify our pointer, and/or make it
1685		 *  NULL on failure.  In that event, we can't requeue.
1686		 */
1687		if (t3_encap(qs, &m_head) || m_head == NULL)
1688			break;
1689
1690		m_head = NULL;
1691	}
1692
1693	if (txq->db_pending)
1694		check_ring_tx_db(pi->adapter, txq, 1);
1695
1696	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1697	    pi->link_config.link_ok)
1698		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1699		    qs, txq->txq_timer.c_cpu);
1700	if (m_head != NULL)
1701		m_freem(m_head);
1702}
1703
1704static int
1705cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1706{
1707	struct port_info *pi = qs->port;
1708	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1709	struct buf_ring *br = txq->txq_mr;
1710	int error, avail;
1711
1712	avail = txq->size - txq->in_use;
1713	TXQ_LOCK_ASSERT(qs);
1714
1715	/*
1716	 * We can only do a direct transmit if the following are true:
1717	 * - we aren't coalescing (ring < 3/4 full)
1718	 * - the link is up -- checked in caller
1719	 * - there are no packets enqueued already
 *	 - there is space in the hardware transmit queue
1721	 */
1722	if (check_pkt_coalesce(qs) == 0 &&
1723	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1724		if (t3_encap(qs, &m)) {
1725			if (m != NULL &&
1726			    (error = drbr_enqueue(ifp, br, m)) != 0)
1727				return (error);
1728		} else {
1729			if (txq->db_pending)
1730				check_ring_tx_db(pi->adapter, txq, 1);
1731
1732			/*
1733			 * We've bypassed the buf ring so we need to update
1734			 * the stats directly
1735			 */
1736			txq->txq_direct_packets++;
1737			txq->txq_direct_bytes += m->m_pkthdr.len;
1738		}
1739	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1740		return (error);
1741
1742	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1743	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1744	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1745		cxgb_start_locked(qs);
1746	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1747		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1748		    qs, txq->txq_timer.c_cpu);
1749	return (0);
1750}
1751
1752int
1753cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1754{
1755	struct sge_qset *qs;
1756	struct port_info *pi = ifp->if_softc;
1757	int error, qidx = pi->first_qset;
1758
	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    !pi->link_config.link_ok) {
1761		m_freem(m);
1762		return (0);
1763	}
1764
1765	/* check if flowid is set */
1766	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1767		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1768
1769	qs = &pi->adapter->sge.qs[qidx];
1770
1771	if (TXQ_TRYLOCK(qs)) {
1772		/* XXX running */
1773		error = cxgb_transmit_locked(ifp, qs, m);
1774		TXQ_UNLOCK(qs);
1775	} else
1776		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1777	return (error);
1778}
1779
1780void
1781cxgb_qflush(struct ifnet *ifp)
1782{
1783	/*
1784	 * flush any enqueued mbufs in the buf_rings
1785	 * and in the transmit queues
1786	 * no-op for now
1787	 */
1788	return;
1789}
1790
1791/**
1792 *	write_imm - write a packet into a Tx descriptor as immediate data
1793 *	@d: the Tx descriptor to write
 *	@src: the packet data, beginning with a work request header
1795 *	@len: the length of packet data to write as immediate data
1796 *	@gen: the generation bit value to write
1797 *
1798 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1799 *	contains a work request at its beginning.  We must write the packet
1800 *	carefully so the SGE doesn't read accidentally before it's written in
1801 *	its entirety.
1802 */
1803static __inline void
1804write_imm(struct tx_desc *d, caddr_t src,
1805	  unsigned int len, unsigned int gen)
1806{
1807	struct work_request_hdr *from = (struct work_request_hdr *)src;
1808	struct work_request_hdr *to = (struct work_request_hdr *)d;
1809	uint32_t wr_hi, wr_lo;
1810
1811	KASSERT(len <= WR_LEN && len >= sizeof(*from),
1812	    ("%s: invalid len %d", __func__, len));
1813
1814	memcpy(&to[1], &from[1], len - sizeof(*from));
1815	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1816	    V_WR_BCNTLFLT(len & 7));
1817	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
1818	set_wr_hdr(to, wr_hi, wr_lo);
1819	wmb();
1820	wr_gen2(d, gen);
1821}
1822
1823/**
1824 *	check_desc_avail - check descriptor availability on a send queue
1825 *	@adap: the adapter
1826 *	@q: the TX queue
1827 *	@m: the packet needing the descriptors
1828 *	@ndesc: the number of Tx descriptors needed
1829 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1830 *
1831 *	Checks if the requested number of Tx descriptors is available on an
1832 *	SGE send queue.  If the queue is already suspended or not enough
1833 *	descriptors are available the packet is queued for later transmission.
1834 *	Must be called with the Tx queue locked.
1835 *
1836 *	Returns 0 if enough descriptors are available, 1 if there aren't
1837 *	enough descriptors and the packet has been queued, and 2 if the caller
1838 *	needs to retry because there weren't enough descriptors at the
 *	beginning of the call but some freed up in the meantime.
1840 */
1841static __inline int
1842check_desc_avail(adapter_t *adap, struct sge_txq *q,
1843		 struct mbuf *m, unsigned int ndesc,
1844		 unsigned int qid)
1845{
1846	/*
1847	 * XXX We currently only use this for checking the control queue.
1848	 * The control queue is only used for binding qsets, which happens
1849	 * at init time, so we are guaranteed enough descriptors.
1850	 */
1851	if (__predict_false(mbufq_len(&q->sendq))) {
1852addq_exit:	(void)mbufq_enqueue(&q->sendq, m);
1853		return 1;
1854	}
1855	if (__predict_false(q->size - q->in_use < ndesc)) {
1856
1857		struct sge_qset *qs = txq_to_qset(q, qid);
1858
1859		setbit(&qs->txq_stopped, qid);
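		/*
		 * Recheck after marking the queue stopped: completions may
		 * have freed descriptors in the meantime.  If so, clear the
		 * stopped bit and ask the caller to retry.
		 */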
1860		if (should_restart_tx(q) &&
1861		    test_and_clear_bit(qid, &qs->txq_stopped))
1862			return 2;
1863
1864		q->stops++;
1865		goto addq_exit;
1866	}
1867	return 0;
1868}
1869
1870
1871/**
1872 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1873 *	@q: the SGE control Tx queue
1874 *
1875 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1876 *	that send only immediate data (presently just the control queues) and
1877 *	thus do not have any mbufs
1878 */
1879static __inline void
1880reclaim_completed_tx_imm(struct sge_txq *q)
1881{
1882	unsigned int reclaim = q->processed - q->cleaned;
1883
1884	q->in_use -= reclaim;
1885	q->cleaned += reclaim;
1886}
1887
1888/**
1889 *	ctrl_xmit - send a packet through an SGE control Tx queue
1890 *	@adap: the adapter
1891 *	@q: the control queue
1892 *	@m: the packet
1893 *
1894 *	Send a packet through an SGE control Tx queue.  Packets sent through
1895 *	a control queue must fit entirely as immediate data in a single Tx
1896 *	descriptor and have no page fragments.
1897 */
1898static int
1899ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1900{
1901	int ret;
1902	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1903	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1904
1905	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1906
1907	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1908	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1909
1910	TXQ_LOCK(qs);
1911again:	reclaim_completed_tx_imm(q);
1912
1913	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1914	if (__predict_false(ret)) {
1915		if (ret == 1) {
1916			TXQ_UNLOCK(qs);
1917			return (ENOSPC);
1918		}
1919		goto again;
1920	}
1921	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1922
1923	q->in_use++;
1924	if (++q->pidx >= q->size) {
1925		q->pidx = 0;
1926		q->gen ^= 1;
1927	}
1928	TXQ_UNLOCK(qs);
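	/* Ring the egress doorbell so the SGE fetches the new descriptor. */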
1929	wmb();
1930	t3_write_reg(adap, A_SG_KDOORBELL,
1931	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1932
1933	m_free(m);
1934	return (0);
1935}
1936
1937
1938/**
1939 *	restart_ctrlq - restart a suspended control queue
1940 *	@qs: the queue set containing the control queue
1941 *
1942 *	Resumes transmission on a suspended Tx control queue.
1943 */
1944static void
1945restart_ctrlq(void *data, int npending)
1946{
1947	struct mbuf *m;
1948	struct sge_qset *qs = (struct sge_qset *)data;
1949	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1950	adapter_t *adap = qs->port->adapter;
1951
1952	TXQ_LOCK(qs);
1953again:	reclaim_completed_tx_imm(q);
1954
1955	while (q->in_use < q->size &&
1956	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1957
1958		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1959		m_free(m);
1960
1961		if (++q->pidx >= q->size) {
1962			q->pidx = 0;
1963			q->gen ^= 1;
1964		}
1965		q->in_use++;
1966	}
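	/*
	 * If packets remain on the sendq, mark the queue stopped; the
	 * completion path will reschedule this task once descriptors free up.
	 */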
1967	if (mbufq_len(&q->sendq)) {
1968		setbit(&qs->txq_stopped, TXQ_CTRL);
1969
1970		if (should_restart_tx(q) &&
1971		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1972			goto again;
1973		q->stops++;
1974	}
1975	TXQ_UNLOCK(qs);
1976	t3_write_reg(adap, A_SG_KDOORBELL,
1977		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1978}
1979
1980
1981/*
1982 * Send a management message through control queue 0
1983 */
1984int
1985t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1986{
1987	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1988}
1989
1990/**
1991 *	free_qset - free the resources of an SGE queue set
1992 *	@sc: the controller owning the queue set
1993 *	@q: the queue set
1994 *
1995 *	Release the HW and SW resources associated with an SGE queue set, such
1996 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1997 *	queue set must be quiesced prior to calling this.
1998 */
1999static void
2000t3_free_qset(adapter_t *sc, struct sge_qset *q)
2001{
2002	int i;
2003
2004	reclaim_completed_tx(q, 0, TXQ_ETH);
2005	if (q->txq[TXQ_ETH].txq_mr != NULL)
2006		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
2007	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2008		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2009		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2010	}
2011
2012	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2013		if (q->fl[i].desc) {
2014			mtx_lock_spin(&sc->sge.reg_lock);
2015			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2016			mtx_unlock_spin(&sc->sge.reg_lock);
2017			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2018			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2019					q->fl[i].desc_map);
2020			bus_dma_tag_destroy(q->fl[i].desc_tag);
2021			bus_dma_tag_destroy(q->fl[i].entry_tag);
2022		}
2023		if (q->fl[i].sdesc) {
2024			free_rx_bufs(sc, &q->fl[i]);
2025			free(q->fl[i].sdesc, M_DEVBUF);
2026		}
2027	}
2028
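	/* The qset lock is held on entry; release it before destroying it. */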
2029	mtx_unlock(&q->lock);
2030	MTX_DESTROY(&q->lock);
2031	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2032		if (q->txq[i].desc) {
2033			mtx_lock_spin(&sc->sge.reg_lock);
2034			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2035			mtx_unlock_spin(&sc->sge.reg_lock);
2036			bus_dmamap_unload(q->txq[i].desc_tag,
2037					q->txq[i].desc_map);
2038			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2039					q->txq[i].desc_map);
2040			bus_dma_tag_destroy(q->txq[i].desc_tag);
2041			bus_dma_tag_destroy(q->txq[i].entry_tag);
2042		}
2043		if (q->txq[i].sdesc) {
2044			free(q->txq[i].sdesc, M_DEVBUF);
2045		}
2046	}
2047
2048	if (q->rspq.desc) {
2049		mtx_lock_spin(&sc->sge.reg_lock);
2050		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2051		mtx_unlock_spin(&sc->sge.reg_lock);
2052
2053		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2054		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2055			        q->rspq.desc_map);
2056		bus_dma_tag_destroy(q->rspq.desc_tag);
2057		MTX_DESTROY(&q->rspq.lock);
2058	}
2059
2060#if defined(INET6) || defined(INET)
2061	tcp_lro_free(&q->lro.ctrl);
2062#endif
2063
2064	bzero(q, sizeof(*q));
2065}
2066
2067/**
2068 *	t3_free_sge_resources - free SGE resources
2069 *	@sc: the adapter softc
2070 *
2071 *	Frees resources used by the SGE queue sets.
2072 */
2073void
2074t3_free_sge_resources(adapter_t *sc, int nqsets)
2075{
2076	int i;
2077
2078	for (i = 0; i < nqsets; ++i) {
2079		TXQ_LOCK(&sc->sge.qs[i]);
2080		t3_free_qset(sc, &sc->sge.qs[i]);
2081	}
2082}
2083
2084/**
2085 *	t3_sge_start - enable SGE
2086 *	@sc: the controller softc
2087 *
2088 *	Enables the SGE for DMAs.  This is the last step in starting packet
2089 *	transfers.
2090 */
2091void
2092t3_sge_start(adapter_t *sc)
2093{
2094	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2095}
2096
2097/**
2098 *	t3_sge_stop - disable SGE operation
2099 *	@sc: the adapter
2100 *
2101 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2102 *	from error interrupts) or from normal process context.  In the latter
2103 *	case it also disables any pending queue restart tasks.  Note that
2104 *	if it is called in interrupt context it cannot disable the restart
2105 *	tasks as it cannot sleep; however, the tasks will have no effect
2106 *	since the doorbells are disabled and the driver will call this again
2107 *	later from process context, at which time the tasks will be stopped
2108 *	if they are still running.
2109 */
2110void
2111t3_sge_stop(adapter_t *sc)
2112{
2113	int i, nqsets;
2114
2115	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2116
2117	if (sc->tq == NULL)
2118		return;
2119
2120	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2121		nqsets += sc->port[i].nqsets;
2122#ifdef notyet
2123	/*
2124	 *
2125	 * XXX
2126	 */
2127	for (i = 0; i < nqsets; ++i) {
2128		struct sge_qset *qs = &sc->sge.qs[i];
2129
2130		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2131		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2132	}
2133#endif
2134}
2135
2136/**
2137 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2138 *	@qs: the queue set that owns the Tx queue
2139 *	@reclaimable: the number of descriptors to reclaim
2140 *	@queue: the Tx queue index within the queue set (e.g. TXQ_ETH)
2141 *
2142 *	Reclaims Tx descriptors from an SGE Tx queue and frees the
2143 *	associated Tx buffers.
2144 *
2145 *	Called with the Tx queue lock held.
2148 */
2149void
2150t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2151{
2152	struct tx_sw_desc *txsd;
2153	unsigned int cidx, mask;
2154	struct sge_txq *q = &qs->txq[queue];
2155
2156#ifdef T3_TRACE
2157	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2158		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, q->cidx);
2159#endif
2160	cidx = q->cidx;
2161	mask = q->size - 1;
2162	txsd = &q->sdesc[cidx];
2163
2164	mtx_assert(&qs->lock, MA_OWNED);
2165	while (reclaimable--) {
2166		prefetch(q->sdesc[(cidx + 1) & mask].m);
2167		prefetch(q->sdesc[(cidx + 2) & mask].m);
2168
2169		if (txsd->m != NULL) {
2170			if (txsd->flags & TX_SW_DESC_MAPPED) {
2171				bus_dmamap_unload(q->entry_tag, txsd->map);
2172				txsd->flags &= ~TX_SW_DESC_MAPPED;
2173			}
2174			m_freem_list(txsd->m);
2175			txsd->m = NULL;
2176		} else
2177			q->txq_skipped++;
2178
2179		++txsd;
2180		if (++cidx == q->size) {
2181			cidx = 0;
2182			txsd = q->sdesc;
2183		}
2184	}
2185	q->cidx = cidx;
2186
2187}
2188
2189/**
2190 *	is_new_response - check if a response is newly written
2191 *	@r: the response descriptor
2192 *	@q: the response queue
2193 *
2194 *	Returns true if a response descriptor contains a yet unprocessed
2195 *	response.
2196 */
2197static __inline int
2198is_new_response(const struct rsp_desc *r,
2199    const struct sge_rspq *q)
2200{
2201	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2202}
2203
2204#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2205#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2206			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2207			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2208			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2209
2210/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2211#define NOMEM_INTR_DELAY 2500
2212
2213#ifdef TCP_OFFLOAD
2214/**
2215 *	write_ofld_wr - write an offload work request
2216 *	@adap: the adapter
2217 *	@m: the packet to send
2218 *	@q: the Tx queue
2219 *	@pidx: index of the first Tx descriptor to write
2220 *	@gen: the generation value to use
2221 *	@ndesc: number of descriptors the packet will occupy
2222 *
2223 *	Write an offload work request to send the supplied packet.  The packet
2224 *	data already carries the work request with most fields populated.
2225 */
2226static void
2227write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2228    unsigned int pidx, unsigned int gen, unsigned int ndesc)
2229{
2230	unsigned int sgl_flits, flits;
2231	int i, idx, nsegs, wrlen;
2232	struct work_request_hdr *from;
2233	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2234	struct tx_desc *d = &q->desc[pidx];
2235	struct txq_state txqs;
2236	struct sglist_seg *segs;
2237	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2238	struct sglist *sgl;
2239
2240	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2241	wrlen = m->m_len - sizeof(*oh);
2242
2243	if (!(oh->flags & F_HDR_SGL)) {
2244		write_imm(d, (caddr_t)from, wrlen, gen);
2245
2246		/*
2247		 * mbuf with "real" immediate tx data will be enqueue_wr'd by
2248		 * t3_push_frames and freed in wr_ack.  Others, like those sent
2249		 * down by close_conn, t3_send_reset, etc. should be freed here.
2250		 */
2251		if (!(oh->flags & F_HDR_DF))
2252			m_free(m);
2253		return;
2254	}
2255
2256	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2257
2258	sgl = oh->sgl;
2259	flits = wrlen / 8;
2260	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2261
2262	nsegs = sgl->sg_nseg;
2263	segs = sgl->sg_segs;
2264	for (idx = 0, i = 0; i < nsegs; i++) {
2265		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2266		if (i && idx == 0)
2267			++sgp;
2268		sgp->len[idx] = htobe32(segs[i].ss_len);
2269		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2270		idx ^= 1;
2271	}
2272	if (idx) {
2273		sgp->len[idx] = 0;
2274		sgp->addr[idx] = 0;
2275	}
2276
2277	sgl_flits = sgl_len(nsegs);
2278	txqs.gen = gen;
2279	txqs.pidx = pidx;
2280	txqs.compl = 0;
2281
2282	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2283	    from->wrh_hi, from->wrh_lo);
2284}
2285
2286/**
2287 *	ofld_xmit - send a packet through an offload queue
2288 *	@adap: the adapter
2289 *	@q: the Tx offload queue
2290 *	@m: the packet
2291 *
2292 *	Send an offload packet through an SGE offload queue.
2293 */
2294static int
2295ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2296{
2297	int ret;
2298	unsigned int ndesc;
2299	unsigned int pidx, gen;
2300	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2301	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2302
2303	ndesc = G_HDR_NDESC(oh->flags);
2304
2305	TXQ_LOCK(qs);
2306again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2307	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2308	if (__predict_false(ret)) {
2309		if (ret == 1) {
2310			TXQ_UNLOCK(qs);
2311			return (EINTR);
2312		}
2313		goto again;
2314	}
2315
2316	gen = q->gen;
2317	q->in_use += ndesc;
2318	pidx = q->pidx;
2319	q->pidx += ndesc;
2320	if (q->pidx >= q->size) {
2321		q->pidx -= q->size;
2322		q->gen ^= 1;
2323	}
2324
2325	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2326	check_ring_tx_db(adap, q, 1);
2327	TXQ_UNLOCK(qs);
2328
2329	return (0);
2330}
2331
2332/**
2333 *	restart_offloadq - restart a suspended offload queue
2334 *	@qs: the queue set containing the offload queue
2335 *
2336 *	Resumes transmission on a suspended Tx offload queue.
2337 */
2338static void
2339restart_offloadq(void *data, int npending)
2340{
2341	struct mbuf *m;
2342	struct sge_qset *qs = data;
2343	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2344	adapter_t *adap = qs->port->adapter;
2345	int cleaned;
2346
2347	TXQ_LOCK(qs);
2348again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2349
2350	while ((m = mbufq_first(&q->sendq)) != NULL) {
2351		unsigned int gen, pidx;
2352		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2353		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2354
2355		if (__predict_false(q->size - q->in_use < ndesc)) {
2356			setbit(&qs->txq_stopped, TXQ_OFLD);
2357			if (should_restart_tx(q) &&
2358			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2359				goto again;
2360			q->stops++;
2361			break;
2362		}
2363
2364		gen = q->gen;
2365		q->in_use += ndesc;
2366		pidx = q->pidx;
2367		q->pidx += ndesc;
2368		if (q->pidx >= q->size) {
2369			q->pidx -= q->size;
2370			q->gen ^= 1;
2371		}
2372
2373		(void)mbufq_dequeue(&q->sendq);
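		/*
		 * The descriptors are already reserved; drop the queue lock
		 * while the work request is written out.
		 */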
2374		TXQ_UNLOCK(qs);
2375		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2376		TXQ_LOCK(qs);
2377	}
2378#if USE_GTS
2379	set_bit(TXQ_RUNNING, &q->flags);
2380	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2381#endif
2382	TXQ_UNLOCK(qs);
2383	wmb();
2384	t3_write_reg(adap, A_SG_KDOORBELL,
2385		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2386}
2387
2388/**
2389 *	t3_offload_tx - send an offload packet
2390 *	@m: the packet
2391 *
2392 *	Sends an offload packet.  The ofld_hdr at the front of the mbuf
2393 *	selects the target queue set (G_HDR_QSET) and indicates whether the
2394 *	packet goes to the control queue or the offload queue (F_HDR_CTRL).
2395 */
2396int
2397t3_offload_tx(struct adapter *sc, struct mbuf *m)
2398{
2399	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2400	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2401
2402	if (oh->flags & F_HDR_CTRL) {
2403		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2404		return (ctrl_xmit(sc, qs, m));
2405	} else
2406		return (ofld_xmit(sc, qs, m));
2407}
2408#endif
2409
2410static void
2411restart_tx(struct sge_qset *qs)
2412{
2413	struct adapter *sc = qs->port->adapter;
2414
2415	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2416	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2417	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2418		qs->txq[TXQ_OFLD].restarts++;
2419		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2420	}
2421
2422	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2423	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2424	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2425		qs->txq[TXQ_CTRL].restarts++;
2426		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2427	}
2428}
2429
2430/**
2431 *	t3_sge_alloc_qset - initialize an SGE queue set
2432 *	@sc: the controller softc
2433 *	@id: the queue set id
2434 *	@nports: how many Ethernet ports will be using this queue set
2435 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2436 *	@p: configuration parameters for this queue set
2437 *	@ntxq: number of Tx queues for the queue set
2438 *	@pi: port info for queue set
2439 *
2440 *	Allocate resources and initialize an SGE queue set.  A queue set
2441 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2442 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2443 *	queue, offload queue, and control queue.
2444 */
2445int
2446t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2447		  const struct qset_params *p, int ntxq, struct port_info *pi)
2448{
2449	struct sge_qset *q = &sc->sge.qs[id];
2450	int i, ret = 0;
2451
2452	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2453	q->port = pi;
2454	q->adap = sc;
2455
2456	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2457	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2458		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2459		goto err;
2460	}
2461	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2462	    M_NOWAIT | M_ZERO)) == NULL) {
2463		device_printf(sc->dev, "failed to allocate ifq\n");
2464		goto err;
2465	}
2466	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2467	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2468	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2469	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2470	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2471
2472	init_qset_cntxt(q, id);
2473	q->idx = id;
2474	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2475		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2476		    &q->fl[0].desc, &q->fl[0].sdesc,
2477		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2478		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2479		printf("error %d from alloc ring fl0\n", ret);
2480		goto err;
2481	}
2482
2483	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2484		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2485		    &q->fl[1].desc, &q->fl[1].sdesc,
2486		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2487		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2488		printf("error %d from alloc ring fl1\n", ret);
2489		goto err;
2490	}
2491
2492	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2493		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2494		    &q->rspq.desc_tag, &q->rspq.desc_map,
2495		    NULL, NULL)) != 0) {
2496		printf("error %d from alloc ring rspq\n", ret);
2497		goto err;
2498	}
2499
2500	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2501	    device_get_unit(sc->dev), irq_vec_idx);
2502	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2503
2504	for (i = 0; i < ntxq; ++i) {
2505		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2506
2507		if ((ret = alloc_ring(sc, p->txq_size[i],
2508			    sizeof(struct tx_desc), sz,
2509			    &q->txq[i].phys_addr, &q->txq[i].desc,
2510			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2511			    &q->txq[i].desc_map,
2512			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2513			printf("error %d from alloc ring tx %i\n", ret, i);
2514			goto err;
2515		}
2516		mbufq_init(&q->txq[i].sendq, INT_MAX);
2517		q->txq[i].gen = 1;
2518		q->txq[i].size = p->txq_size[i];
2519	}
2520
2521#ifdef TCP_OFFLOAD
2522	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2523#endif
2524	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2525	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2526	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2527
2528	q->fl[0].gen = q->fl[1].gen = 1;
2529	q->fl[0].size = p->fl_size;
2530	q->fl[1].size = p->jumbo_size;
2531
2532	q->rspq.gen = 1;
2533	q->rspq.cidx = 0;
2534	q->rspq.size = p->rspq_size;
2535
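	/*
	 * Stop the Ethernet Tx queue while it still has room for one
	 * maximum-sized work request (full SGL plus headers) per port.
	 */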
2536	q->txq[TXQ_ETH].stop_thres = nports *
2537	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2538
2539	q->fl[0].buf_size = MCLBYTES;
2540	q->fl[0].zone = zone_pack;
2541	q->fl[0].type = EXT_PACKET;
2542
2543	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2544		q->fl[1].zone = zone_jumbo16;
2545		q->fl[1].type = EXT_JUMBO16;
2546	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2547		q->fl[1].zone = zone_jumbo9;
2548		q->fl[1].type = EXT_JUMBO9;
2549	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2550		q->fl[1].zone = zone_jumbop;
2551		q->fl[1].type = EXT_JUMBOP;
2552	} else {
2553		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2554		ret = EDOOFUS;
2555		goto err;
2556	}
2557	q->fl[1].buf_size = p->jumbo_buf_size;
2558
2559	/* Allocate and setup the lro_ctrl structure */
2560	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2561#if defined(INET6) || defined(INET)
2562	ret = tcp_lro_init(&q->lro.ctrl);
2563	if (ret) {
2564		printf("error %d from tcp_lro_init\n", ret);
2565		goto err;
2566	}
2567#endif
2568	q->lro.ctrl.ifp = pi->ifp;
2569
2570	mtx_lock_spin(&sc->sge.reg_lock);
2571	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2572				   q->rspq.phys_addr, q->rspq.size,
2573				   q->fl[0].buf_size, 1, 0);
2574	if (ret) {
2575		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2576		goto err_unlock;
2577	}
2578
2579	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2580		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2581					  q->fl[i].phys_addr, q->fl[i].size,
2582					  q->fl[i].buf_size, p->cong_thres, 1,
2583					  0);
2584		if (ret) {
2585			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2586			goto err_unlock;
2587		}
2588	}
2589
2590	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2591				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2592				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2593				 1, 0);
2594	if (ret) {
2595		printf("error %d from t3_sge_init_ecntxt\n", ret);
2596		goto err_unlock;
2597	}
2598
2599	if (ntxq > 1) {
2600		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2601					 USE_GTS, SGE_CNTXT_OFLD, id,
2602					 q->txq[TXQ_OFLD].phys_addr,
2603					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2604		if (ret) {
2605			printf("error %d from t3_sge_init_ecntxt\n", ret);
2606			goto err_unlock;
2607		}
2608	}
2609
2610	if (ntxq > 2) {
2611		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2612					 SGE_CNTXT_CTRL, id,
2613					 q->txq[TXQ_CTRL].phys_addr,
2614					 q->txq[TXQ_CTRL].size,
2615					 q->txq[TXQ_CTRL].token, 1, 0);
2616		if (ret) {
2617			printf("error %d from t3_sge_init_ecntxt\n", ret);
2618			goto err_unlock;
2619		}
2620	}
2621
2622	mtx_unlock_spin(&sc->sge.reg_lock);
2623	t3_update_qset_coalesce(q, p);
2624
2625	refill_fl(sc, &q->fl[0], q->fl[0].size);
2626	refill_fl(sc, &q->fl[1], q->fl[1].size);
2627	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2628
2629	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2630		     V_NEWTIMER(q->rspq.holdoff_tmr));
2631
2632	return (0);
2633
2634err_unlock:
2635	mtx_unlock_spin(&sc->sge.reg_lock);
2636err:
2637	TXQ_LOCK(q);
2638	t3_free_qset(sc, q);
2639
2640	return (ret);
2641}
2642
2643/*
2644 * Remove the CPL_RX_PKT header from the mbuf and reduce it to a regular mbuf
2645 * with Ethernet data.  Hardware checksum offload results and any VLAN tag are
2646 * also recorded here.
2647 */
2648void
2649t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2650{
2651	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2652	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2653	struct ifnet *ifp = pi->ifp;
2654
2655	if (cpl->vlan_valid) {
2656		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2657		m->m_flags |= M_VLANTAG;
2658	}
2659
2660	m->m_pkthdr.rcvif = ifp;
2661	/*
2662	 * adjust after conversion to mbuf chain
2663	 */
2664	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2665	m->m_len -= (sizeof(*cpl) + ethpad);
2666	m->m_data += (sizeof(*cpl) + ethpad);
2667
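	/*
	 * csum_valid with a csum of 0xffff means the hardware verified the
	 * TCP/UDP checksum, so tell the stack not to recompute it.
	 */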
2668	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2669		struct ether_header *eh = mtod(m, void *);
2670		uint16_t eh_type;
2671
2672		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2673			struct ether_vlan_header *evh = mtod(m, void *);
2674
2675			eh_type = evh->evl_proto;
2676		} else
2677			eh_type = eh->ether_type;
2678
2679		if (ifp->if_capenable & IFCAP_RXCSUM &&
2680		    eh_type == htons(ETHERTYPE_IP)) {
2681			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2682			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2683			m->m_pkthdr.csum_data = 0xffff;
2684		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2685		    eh_type == htons(ETHERTYPE_IPV6)) {
2686			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2687			    CSUM_PSEUDO_HDR);
2688			m->m_pkthdr.csum_data = 0xffff;
2689		}
2690	}
2691}
2692
2693/**
2694 *	get_packet - return the next ingress packet buffer from a free list
2695 *	@adap: the adapter that received the packet
2696 *	@drop_thres: # of remaining buffers before we start dropping packets
2697 *	@qs: the qset that the SGE free list holding the packet belongs to
2698 *	@mh: the mbuf header, containing pointers to the head and tail of the mbuf chain
2699 *	@r: the response descriptor
2700 *
2701 *	Get the next packet from a free list and complete setup of the
2702 *	mbuf.  If the packet is small we make a copy and recycle the
2703 *	original buffer, otherwise we use the original buffer itself.  If a
2704 *	positive drop threshold is supplied packets are dropped and their
2705 *	buffers recycled if (a) the number of remaining buffers is under the
2706 *	threshold and the packet is too big to copy, or (b) the packet should
2707 *	be copied but there is no memory for the copy.
2708 */
2709static int
2710get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2711    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2712{
2713
2714	unsigned int len_cq =  ntohl(r->len_cq);
2715	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2716	int mask, cidx = fl->cidx;
2717	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2718	uint32_t len = G_RSPD_LEN(len_cq);
2719	uint32_t flags = M_EXT;
2720	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2721	caddr_t cl;
2722	struct mbuf *m;
2723	int ret = 0;
2724
2725	mask = fl->size - 1;
2726	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2727	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2728	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2729	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2730
2731	fl->credits--;
2732	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2733
2734	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2735	    sopeop == RSPQ_SOP_EOP) {
2736		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2737			goto skip_recycle;
2738		cl = mtod(m, void *);
2739		memcpy(cl, sd->rxsd_cl, len);
2740		recycle_rx_buf(adap, fl, fl->cidx);
2741		m->m_pkthdr.len = m->m_len = len;
2742		m->m_flags = 0;
2743		mh->mh_head = mh->mh_tail = m;
2744		ret = 1;
2745		goto done;
2746	} else {
2747	skip_recycle:
2748		bus_dmamap_unload(fl->entry_tag, sd->map);
2749		cl = sd->rxsd_cl;
2750		m = sd->m;
2751
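		/* Only the leading buffer of a packet needs a pkthdr. */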
2752		if ((sopeop == RSPQ_SOP_EOP) ||
2753		    (sopeop == RSPQ_SOP))
2754			flags |= M_PKTHDR;
2755		m_init(m, M_NOWAIT, MT_DATA, flags);
2756		if (fl->zone == zone_pack) {
2757			/*
2758			 * restore clobbered data pointer
2759			 */
2760			m->m_data = m->m_ext.ext_buf;
2761		} else {
2762			m_cljset(m, cl, fl->type);
2763		}
2764		m->m_len = len;
2765	}
2766	switch(sopeop) {
2767	case RSPQ_SOP_EOP:
2768		ret = 1;
2769		/* FALLTHROUGH */
2770	case RSPQ_SOP:
2771		mh->mh_head = mh->mh_tail = m;
2772		m->m_pkthdr.len = len;
2773		break;
2774	case RSPQ_EOP:
2775		ret = 1;
2776		/* FALLTHROUGH */
2777	case RSPQ_NSOP_NEOP:
2778		if (mh->mh_tail == NULL) {
2779			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2780			m_freem(m);
2781			m = NULL;
2782			break;
2783		}
2784		mh->mh_tail->m_next = m;
2785		mh->mh_tail = m;
2786		mh->mh_head->m_pkthdr.len += len;
2787		break;
2788	}
2789	if (cxgb_debug && m != NULL)
2790		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2791done:
2792	if (++fl->cidx == fl->size)
2793		fl->cidx = 0;
2794
2795	return (ret);
2796}
2797
2798/**
2799 *	handle_rsp_cntrl_info - handles control information in a response
2800 *	@qs: the queue set corresponding to the response
2801 *	@flags: the response control flags
2802 *
2803 *	Handles the control information of an SGE response, such as GTS
2804 *	indications and completion credits for the queue set's Tx queues.
2805 *	The HW coalesces credits; we don't do any extra SW coalescing.
2806 */
2807static __inline void
2808handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2809{
2810	unsigned int credits;
2811
2812#if USE_GTS
2813	if (flags & F_RSPD_TXQ0_GTS)
2814		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2815#endif
2816	credits = G_RSPD_TXQ0_CR(flags);
2817	if (credits)
2818		qs->txq[TXQ_ETH].processed += credits;
2819
2820	credits = G_RSPD_TXQ2_CR(flags);
2821	if (credits)
2822		qs->txq[TXQ_CTRL].processed += credits;
2823
2824# if USE_GTS
2825	if (flags & F_RSPD_TXQ1_GTS)
2826		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2827# endif
2828	credits = G_RSPD_TXQ1_CR(flags);
2829	if (credits)
2830		qs->txq[TXQ_OFLD].processed += credits;
2831
2832}
2833
2834static void
2835check_ring_db(adapter_t *adap, struct sge_qset *qs,
2836    unsigned int sleeping)
2837{
2838	;
2839}
2840
2841/**
2842 *	process_responses - process responses from an SGE response queue
2843 *	@adap: the adapter
2844 *	@qs: the queue set to which the response queue belongs
2845 *	@budget: how many responses can be processed in this round
2846 *
2847 *	Process responses from an SGE response queue up to the supplied budget.
2848 *	Responses include received packets as well as credits and other events
2849 *	for the queues that belong to the response queue's queue set.
2850 *	A negative budget is effectively unlimited.
2851 *
2852 *	Additionally choose the interrupt holdoff time for the next interrupt
2853 *	on this queue.  If the system is short of memory, use a fairly
2854 *	long delay to help recovery.
2855 */
2856static int
2857process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2858{
2859	struct sge_rspq *rspq = &qs->rspq;
2860	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2861	int budget_left = budget;
2862	unsigned int sleeping = 0;
2863#if defined(INET6) || defined(INET)
2864	int lro_enabled = qs->lro.enabled;
2865	int skip_lro;
2866	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2867#endif
2868	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2869#ifdef DEBUG
2870	static int last_holdoff = 0;
2871	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2872		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2873		last_holdoff = rspq->holdoff_tmr;
2874	}
2875#endif
2876	rspq->next_holdoff = rspq->holdoff_tmr;
2877
2878	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2879		int eth, eop = 0, ethpad = 0;
2880		uint32_t flags = ntohl(r->flags);
2881		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2882		uint8_t opcode = r->rss_hdr.opcode;
2883
2884		eth = (opcode == CPL_RX_PKT);
2885
2886		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2887			struct mbuf *m;
2888
2889			if (cxgb_debug)
2890				printf("async notification\n");
2891
2892			if (mh->mh_head == NULL) {
2893				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2894				m = mh->mh_head;
2895			} else {
2896				m = m_gethdr(M_NOWAIT, MT_DATA);
2897			}
2898			if (m == NULL)
2899				goto no_mem;
2900
2901			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2902			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2903			*mtod(m, uint8_t *) = CPL_ASYNC_NOTIF;
2904			opcode = CPL_ASYNC_NOTIF;
2905			eop = 1;
2906			rspq->async_notif++;
2907			goto skip;
2908		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2909			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2910
2911			if (m == NULL) {
2912		no_mem:
2913				rspq->next_holdoff = NOMEM_INTR_DELAY;
2914				budget_left--;
2915				break;
2916			}
2917			if (mh->mh_head == NULL)
2918				mh->mh_head = m;
2919			else
2920				mh->mh_tail->m_next = m;
2921			mh->mh_tail = m;
2922
2923			get_imm_packet(adap, r, m);
2924			mh->mh_head->m_pkthdr.len += m->m_len;
2925			eop = 1;
2926			rspq->imm_data++;
2927		} else if (r->len_cq) {
2928			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2929
2930			eop = get_packet(adap, drop_thresh, qs, mh, r);
2931			if (eop) {
2932				if (r->rss_hdr.hash_type && !adap->timestamp) {
2933					M_HASHTYPE_SET(mh->mh_head,
2934					    M_HASHTYPE_OPAQUE_HASH);
2935					mh->mh_head->m_pkthdr.flowid = rss_hash;
2936				}
2937			}
2938
2939			ethpad = 2;
2940		} else {
2941			rspq->pure_rsps++;
2942		}
2943	skip:
2944		if (flags & RSPD_CTRL_MASK) {
2945			sleeping |= flags & RSPD_GTS_MASK;
2946			handle_rsp_cntrl_info(qs, flags);
2947		}
2948
2949		if (!eth && eop) {
2950			rspq->offload_pkts++;
2951#ifdef TCP_OFFLOAD
2952			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2953#else
2954			m_freem(mh->mh_head);
2955#endif
2956			mh->mh_head = NULL;
2957		} else if (eth && eop) {
2958			struct mbuf *m = mh->mh_head;
2959
2960			t3_rx_eth(adap, m, ethpad);
2961
2962			/*
2963			 * The T304 sends incoming packets on any qset.  If LRO
2964			 * is also enabled, we could end up sending the packet up
2965			 * lro_ctrl->ifp's input.  That is incorrect.
2966			 *
2967			 * The mbuf's rcvif was derived from the cpl header and
2968			 * is accurate.  Skip LRO and just use that.
2969			 */
2970#if defined(INET6) || defined(INET)
2971			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2972
2973			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2974			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2975			    ) {
2976				/* successfully queued for LRO */
2977			} else
2978#endif
2979			{
2980				/*
2981				 * LRO not enabled, packet unsuitable for LRO,
2982				 * or unable to queue.  Pass it up right now in
2983				 * either case.
2984				 */
2985				struct ifnet *ifp = m->m_pkthdr.rcvif;
2986				(*ifp->if_input)(ifp, m);
2987			}
2988			mh->mh_head = NULL;
2989
2990		}
2991
2992		r++;
2993		if (__predict_false(++rspq->cidx == rspq->size)) {
2994			rspq->cidx = 0;
2995			rspq->gen ^= 1;
2996			r = rspq->desc;
2997		}
2998
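		/* Return response-queue credits to the SGE in batches of 64. */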
2999		if (++rspq->credits >= 64) {
3000			refill_rspq(adap, rspq, rspq->credits);
3001			rspq->credits = 0;
3002		}
3003		__refill_fl_lt(adap, &qs->fl[0], 32);
3004		__refill_fl_lt(adap, &qs->fl[1], 32);
3005		--budget_left;
3006	}
3007
3008#if defined(INET6) || defined(INET)
3009	/* Flush LRO */
3010	tcp_lro_flush_all(lro_ctrl);
3011#endif
3012
3013	if (sleeping)
3014		check_ring_db(adap, qs, sleeping);
3015
3016	mb();  /* commit Tx queue processed updates */
3017	if (__predict_false(qs->txq_stopped > 1))
3018		restart_tx(qs);
3019
3020	__refill_fl_lt(adap, &qs->fl[0], 512);
3021	__refill_fl_lt(adap, &qs->fl[1], 512);
3022	budget -= budget_left;
3023	return (budget);
3024}
3025
3026/*
3027 * A helper function that processes responses and issues GTS.
3028 */
3029static __inline int
3030process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3031{
3032	int work;
3033	static int last_holdoff = 0;
3034
3035	work = process_responses(adap, rspq_to_qset(rq), -1);
3036
3037	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3038		printf("next_holdoff=%d\n", rq->next_holdoff);
3039		last_holdoff = rq->next_holdoff;
3040	}
3041	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3042	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3043
3044	return (work);
3045}
3046
3047#ifdef NETDUMP
3048int
3049cxgb_netdump_poll_rx(adapter_t *adap, struct sge_qset *qs)
3050{
3051
3052	return (process_responses_gts(adap, &qs->rspq));
3053}
3054#endif
3055
3056/*
3057 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3058 * Handles data events from SGE response queues as well as error and other
3059 * async events as they all use the same interrupt pin.  We use one SGE
3060 * response queue per port in this mode and protect all response queues with
3061 * queue 0's lock.
3062 */
3063void
3064t3b_intr(void *data)
3065{
3066	uint32_t i, map;
3067	adapter_t *adap = data;
3068	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3069
3070	t3_write_reg(adap, A_PL_CLI, 0);
3071	map = t3_read_reg(adap, A_SG_DATA_INTR);
3072
3073	if (!map)
3074		return;
3075
3076	if (__predict_false(map & F_ERRINTR)) {
3077		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3078		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3079		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3080	}
3081
3082	mtx_lock(&q0->lock);
3083	for_each_port(adap, i)
3084	    if (map & (1 << i))
3085			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3086	mtx_unlock(&q0->lock);
3087}
3088
3089/*
3090 * The MSI interrupt handler.  This needs to handle data events from SGE
3091 * response queues as well as error and other async events as they all use
3092 * the same MSI vector.  We use one SGE response queue per port in this mode
3093 * and protect all response queues with queue 0's lock.
3094 */
3095void
3096t3_intr_msi(void *data)
3097{
3098	adapter_t *adap = data;
3099	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3100	int i, new_packets = 0;
3101
3102	mtx_lock(&q0->lock);
3103
3104	for_each_port(adap, i)
3105	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3106		    new_packets = 1;
3107	mtx_unlock(&q0->lock);
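	/*
	 * No response-queue work: the interrupt was likely for an error or
	 * other slow-path event, so mask it and defer to the slow task.
	 */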
3108	if (new_packets == 0) {
3109		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3110		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3111		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3112	}
3113}
3114
3115void
3116t3_intr_msix(void *data)
3117{
3118	struct sge_qset *qs = data;
3119	adapter_t *adap = qs->port->adapter;
3120	struct sge_rspq *rspq = &qs->rspq;
3121
3122	if (process_responses_gts(adap, rspq) == 0)
3123		rspq->unhandled_irqs++;
3124}
3125
3126#define QDUMP_SBUF_SIZE		(32 * 400)
3127static int
3128t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3129{
3130	struct sge_rspq *rspq;
3131	struct sge_qset *qs;
3132	int i, err, dump_end, idx;
3133	struct sbuf *sb;
3134	struct rsp_desc *rspd;
3135	uint32_t data[4];
3136
3137	rspq = arg1;
3138	qs = rspq_to_qset(rspq);
3139	if (rspq->rspq_dump_count == 0)
3140		return (0);
3141	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3142		log(LOG_WARNING,
3143		    "dump count is too large %d\n", rspq->rspq_dump_count);
3144		rspq->rspq_dump_count = 0;
3145		return (EINVAL);
3146	}
3147	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3148		log(LOG_WARNING,
3149		    "dump start of %d is greater than queue size\n",
3150		    rspq->rspq_dump_start);
3151		rspq->rspq_dump_start = 0;
3152		return (EINVAL);
3153	}
3154	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3155	if (err)
3156		return (err);
3157	err = sysctl_wire_old_buffer(req, 0);
3158	if (err)
3159		return (err);
3160	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3161
3162	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3163	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3164	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3165	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3166	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3167
3168	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3169	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3170
3171	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3172	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3173		idx = i & (RSPQ_Q_SIZE-1);
3174
3175		rspd = &rspq->desc[idx];
3176		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3177		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3178		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3179		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3180		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3181		    be32toh(rspd->len_cq), rspd->intr_gen);
3182	}
3183
3184	err = sbuf_finish(sb);
3185	sbuf_delete(sb);
3186	return (err);
3187}
3188
3189static int
3190t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3191{
3192	struct sge_txq *txq;
3193	struct sge_qset *qs;
3194	int i, j, err, dump_end;
3195	struct sbuf *sb;
3196	struct tx_desc *txd;
3197	uint32_t *WR, wr_hi, wr_lo, gen;
3198	uint32_t data[4];
3199
3200	txq = arg1;
3201	qs = txq_to_qset(txq, TXQ_ETH);
3202	if (txq->txq_dump_count == 0) {
3203		return (0);
3204	}
3205	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3206		log(LOG_WARNING,
3207		    "dump count is too large %d\n", txq->txq_dump_count);
3208		txq->txq_dump_count = 1;
3209		return (EINVAL);
3210	}
3211	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3212		log(LOG_WARNING,
3213		    "dump start of %d is greater than queue size\n",
3214		    txq->txq_dump_start);
3215		txq->txq_dump_start = 0;
3216		return (EINVAL);
3217	}
3218	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3219	if (err)
3220		return (err);
3221	err = sysctl_wire_old_buffer(req, 0);
3222	if (err)
3223		return (err);
3224	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3225
3226	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3227	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3228	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3229	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3230	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3231	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3232	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3233	    txq->txq_dump_start,
3234	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3235
3236	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3237	for (i = txq->txq_dump_start; i < dump_end; i++) {
3238		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3239		WR = (uint32_t *)txd->flit;
3240		wr_hi = ntohl(WR[0]);
3241		wr_lo = ntohl(WR[1]);
3242		gen = G_WR_GEN(wr_lo);
3243
3244		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3245		    wr_hi, wr_lo, gen);
3246		for (j = 2; j < 30; j += 4)
3247			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3248			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3249
3250	}
3251	err = sbuf_finish(sb);
3252	sbuf_delete(sb);
3253	return (err);
3254}
3255
3256static int
3257t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3258{
3259	struct sge_txq *txq;
3260	struct sge_qset *qs;
3261	int i, j, err, dump_end;
3262	struct sbuf *sb;
3263	struct tx_desc *txd;
3264	uint32_t *WR, wr_hi, wr_lo, gen;
3265
3266	txq = arg1;
3267	qs = txq_to_qset(txq, TXQ_CTRL);
3268	if (txq->txq_dump_count == 0) {
3269		return (0);
3270	}
3271	if (txq->txq_dump_count > 256) {
3272		log(LOG_WARNING,
3273		    "dump count is too large %d\n", txq->txq_dump_count);
3274		txq->txq_dump_count = 1;
3275		return (EINVAL);
3276	}
3277	if (txq->txq_dump_start > 255) {
3278		log(LOG_WARNING,
3279		    "dump start of %d is greater than queue size\n",
3280		    txq->txq_dump_start);
3281		txq->txq_dump_start = 0;
3282		return (EINVAL);
3283	}
3284
3285	err = sysctl_wire_old_buffer(req, 0);
3286	if (err != 0)
3287		return (err);
3288	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3289	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3290	    txq->txq_dump_start,
3291	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3292
3293	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3294	for (i = txq->txq_dump_start; i < dump_end; i++) {
3295		txd = &txq->desc[i & (255)];
3296		WR = (uint32_t *)txd->flit;
3297		wr_hi = ntohl(WR[0]);
3298		wr_lo = ntohl(WR[1]);
3299		gen = G_WR_GEN(wr_lo);
3300
3301		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3302		    wr_hi, wr_lo, gen);
3303		for (j = 2; j < 30; j += 4)
3304			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3305			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3306
3307	}
3308	err = sbuf_finish(sb);
3309	sbuf_delete(sb);
3310	return (err);
3311}
3312
3313static int
3314t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3315{
3316	adapter_t *sc = arg1;
3317	struct qset_params *qsp = &sc->params.sge.qset[0];
3318	int coalesce_usecs;
3319	struct sge_qset *qs;
3320	int i, j, err, nqsets = 0;
3321	struct mtx *lock;
3322
3323	if ((sc->flags & FULL_INIT_DONE) == 0)
3324		return (ENXIO);
3325
3326	coalesce_usecs = qsp->coalesce_usecs;
3327	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3328
3329	if (err != 0) {
3330		return (err);
3331	}
3332	if (coalesce_usecs == qsp->coalesce_usecs)
3333		return (0);
3334
3335	for (i = 0; i < sc->params.nports; i++)
3336		for (j = 0; j < sc->port[i].nqsets; j++)
3337			nqsets++;
3338
3339	coalesce_usecs = max(1, coalesce_usecs);
3340
3341	for (i = 0; i < nqsets; i++) {
3342		qs = &sc->sge.qs[i];
3343		qsp = &sc->params.sge.qset[i];
3344		qsp->coalesce_usecs = coalesce_usecs;
3345
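		/*
		 * With MSI-X each response queue has its own lock; in INTx or
		 * MSI mode all response queues share queue 0's lock.
		 */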
3346		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3347			    &sc->sge.qs[0].rspq.lock;
3348
3349		mtx_lock(lock);
3350		t3_update_qset_coalesce(qs, qsp);
3351		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3352		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3353		mtx_unlock(lock);
3354	}
3355
3356	return (0);
3357}
3358
3359static int
3360t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3361{
3362	adapter_t *sc = arg1;
3363	int rc, timestamp;
3364
3365	if ((sc->flags & FULL_INIT_DONE) == 0)
3366		return (ENXIO);
3367
3368	timestamp = sc->timestamp;
3369	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3370
3371	if (rc != 0)
3372		return (rc);
3373
3374	if (timestamp != sc->timestamp) {
3375		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3376		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3377		sc->timestamp = timestamp;
3378	}
3379
3380	return (0);
3381}
3382
3383void
3384t3_add_attach_sysctls(adapter_t *sc)
3385{
3386	struct sysctl_ctx_list *ctx;
3387	struct sysctl_oid_list *children;
3388
3389	ctx = device_get_sysctl_ctx(sc->dev);
3390	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3391
3392	/* random information */
3393	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3394	    "firmware_version",
3395	    CTLFLAG_RD, sc->fw_version,
3396	    0, "firmware version");
3397	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3398	    "hw_revision",
3399	    CTLFLAG_RD, &sc->params.rev,
3400	    0, "chip model");
3401	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3402	    "port_types",
3403	    CTLFLAG_RD, sc->port_types,
3404	    0, "type of ports");
3405	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3406	    "enable_debug",
3407	    CTLFLAG_RW, &cxgb_debug,
3408	    0, "enable verbose debugging output");
3409	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3410	    CTLFLAG_RD, &sc->tunq_coalesce,
3411	    "#tunneled packets freed");
3412	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3413	    "txq_overrun",
3414	    CTLFLAG_RD, &txq_fills,
3415	    0, "#times txq overrun");
3416	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3417	    "core_clock",
3418	    CTLFLAG_RD, &sc->params.vpd.cclk,
3419	    0, "core clock frequency (in KHz)");
3420}
3421
3422
3423static const char *rspq_name = "rspq";
3424static const char *txq_names[] =
3425{
3426	"txq_eth",
3427	"txq_ofld",
3428	"txq_ctrl"
3429};
3430
3431static int
3432sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3433{
3434	struct port_info *p = arg1;
3435	uint64_t *parg;
3436
3437	if (!p)
3438		return (EINVAL);
3439
3440	cxgb_refresh_stats(p);
3441	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3442
3443	return (sysctl_handle_64(oidp, parg, 0, req));
3444}
3445
3446void
3447t3_add_configured_sysctls(adapter_t *sc)
3448{
3449	struct sysctl_ctx_list *ctx;
3450	struct sysctl_oid_list *children;
3451	int i, j;
3452
3453	ctx = device_get_sysctl_ctx(sc->dev);
3454	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3455
3456	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3457	    "intr_coal",
3458	    CTLTYPE_INT|CTLFLAG_RW, sc,
3459	    0, t3_set_coalesce_usecs,
3460	    "I", "interrupt coalescing timer (us)");
3461
3462	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3463	    "pkt_timestamp",
3464	    CTLTYPE_INT | CTLFLAG_RW, sc,
3465	    0, t3_pkt_timestamp,
3466	    "I", "provide packet timestamp instead of connection hash");
3467
3468	for (i = 0; i < sc->params.nports; i++) {
3469		struct port_info *pi = &sc->port[i];
3470		struct sysctl_oid *poid;
3471		struct sysctl_oid_list *poidlist;
3472		struct mac_stats *mstats = &pi->mac.stats;
3473
3474		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3475		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3476		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3477		poidlist = SYSCTL_CHILDREN(poid);
3478		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3479		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3480		    0, "#queue sets");
3481
3482		for (j = 0; j < pi->nqsets; j++) {
3483			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3484			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3485					  *ctrlqpoid, *lropoid;
3486			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3487					       *txqpoidlist, *ctrlqpoidlist,
3488					       *lropoidlist;
3489			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3490
3491			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3492
3493			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3494			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3495			qspoidlist = SYSCTL_CHILDREN(qspoid);
3496
3497			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3498					CTLFLAG_RD, &qs->fl[0].empty, 0,
3499					"freelist #0 empty");
3500			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3501					CTLFLAG_RD, &qs->fl[1].empty, 0,
3502					"freelist #1 empty");
3503
3504			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3505			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3506			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3507
3508			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3509			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3510			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3511
3512			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3513			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3514			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3515
3516			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3517			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3518			lropoidlist = SYSCTL_CHILDREN(lropoid);
3519
3520			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3521			    CTLFLAG_RD, &qs->rspq.size,
3522			    0, "#entries in response queue");
3523			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3524			    CTLFLAG_RD, &qs->rspq.cidx,
3525			    0, "consumer index");
3526			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3527			    CTLFLAG_RD, &qs->rspq.credits,
3528			    0, "#credits");
3529			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3530			    CTLFLAG_RD, &qs->rspq.starved,
3531			    0, "#times starved");
3532			SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3533			    CTLFLAG_RD, &qs->rspq.phys_addr,
3534	    "physical address of the queue");
3535			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3536			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3537			    0, "start rspq dump entry");
3538			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3539			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3540			    0, "#rspq entries to dump");
3541			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3542			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3543			    0, t3_dump_rspq, "A", "dump of the response queue");
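			/*
			 * Example use from userland (sysctl node prefix is an
			 * assumption; the tree hangs off the controller
			 * device, e.g. dev.cxgbc.0.port0.qs0.rspq):
			 *   sysctl <prefix>.dump_start=0 <prefix>.dump_count=64
			 *   sysctl <prefix>.qdump
			 */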
3544
3545			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3546			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3547			    "#tunneled packets dropped");
3548			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3549			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len,
3550			    0, "#tunneled packets waiting to be sent");
3551#if 0
3552			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3553			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3554			    0, "#tunneled packets queue producer index");
3555			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3556			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3557			    0, "#tunneled packets queue consumer index");
3558#endif
3559			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3560			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3561			    0, "#tunneled packets processed by the card");
3562			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3563			    CTLFLAG_RD, &txq->cleaned,
3564			    0, "#tunneled packets cleaned");
3565			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3566			    CTLFLAG_RD, &txq->in_use,
3567			    0, "#tunneled packet slots in use");
3568			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees",
3569			    CTLFLAG_RD, &txq->txq_frees,
3570			    "#tunneled packets freed");
3571			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3572			    CTLFLAG_RD, &txq->txq_skipped,
3573			    0, "#tunneled packet descriptors skipped");
3574			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3575			    CTLFLAG_RD, &txq->txq_coalesced,
3576			    "#tunneled packets coalesced");
3577			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3578			    CTLFLAG_RD, &txq->txq_enqueued,
3579			    0, "#tunneled packets enqueued to hardware");
3580			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3581			    CTLFLAG_RD, &qs->txq_stopped,
3582			    0, "tx queues stopped");
3583			SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3584			    CTLFLAG_RD, &txq->phys_addr,
3585	    "physical address of the queue");
3586			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3587			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3588			    0, "txq generation");
3589			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3590			    CTLFLAG_RD, &txq->cidx,
3591			    0, "hardware queue cidx");
3592			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3593			    CTLFLAG_RD, &txq->pidx,
3594			    0, "hardware queue pidx");
3595			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3596			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3597			    0, "txq start idx for dump");
3598			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3599			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3600			    0, "txq #entries to dump");
3601			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3602			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3603			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3604
3605			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3606			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3607			    0, "ctrlq start idx for dump");
3608			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3609			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3610			    0, "ctrl #entries to dump");
3611			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3612			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3613			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3614
3615			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued",
3616			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3617			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3618			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3619			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3620			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3621			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3622			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
		}

		/* Now add a node for mac stats. */
		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
		    CTLFLAG_RD, NULL, "MAC statistics");
		poidlist = SYSCTL_CHILDREN(poid);

		/*
		 * We (ab)use the length argument (arg2) to pass on the offset
		 * of the data that we are interested in.  This is only required
		 * for the quad counters that are updated from the hardware (we
		 * make sure that we return the latest value).
		 * sysctl_handle_macstat first updates *all* the counters from
		 * the hardware, and then returns the latest value of the
		 * requested counter.  It would be better to update only the
		 * requested counter from the hardware, but t3_mac_update_stats()
		 * hides all the register details and we don't want to dive into
		 * all that here.  A disabled, illustrative sketch of such a
		 * handler is included after this function.
		 */
#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
    (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
    sysctl_handle_macstat, "QU", 0)
		CXGB_SYSCTL_ADD_QUAD(tx_octets);
		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
		CXGB_SYSCTL_ADD_QUAD(tx_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
		CXGB_SYSCTL_ADD_QUAD(tx_pause);
		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
		CXGB_SYSCTL_ADD_QUAD(rx_octets);
		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
		CXGB_SYSCTL_ADD_QUAD(rx_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
		CXGB_SYSCTL_ADD_QUAD(rx_pause);
		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_runt);
		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
		CXGB_SYSCTL_ADD_QUAD(rx_short);
		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
#undef CXGB_SYSCTL_ADD_QUAD

#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
    CTLFLAG_RD, &mstats->a, 0)
		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
		CXGB_SYSCTL_ADD_ULONG(num_toggled);
		CXGB_SYSCTL_ADD_ULONG(num_resets);
		CXGB_SYSCTL_ADD_ULONG(link_faults);
#undef CXGB_SYSCTL_ADD_ULONG
	}
}
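
/*
 * Illustrative sketch only (kept disabled, not part of the driver build):
 * the general shape of a sysctl handler that treats arg2 as a byte offset
 * into struct mac_stats, as described in the comment above the
 * CXGB_SYSCTL_ADD_QUAD macro.  The name example_macstat_handler and the
 * PORT_LOCK()/PORT_UNLOCK() usage around the refresh are assumptions made
 * for illustration; the driver's actual handler is sysctl_handle_macstat().
 */
#if 0
static int
example_macstat_handler(SYSCTL_HANDLER_ARGS)
{
	struct port_info *p = arg1;
	uint64_t *stat;

	/* Refresh every MAC counter from the hardware. */
	PORT_LOCK(p);
	t3_mac_update_stats(&p->mac);
	PORT_UNLOCK(p);

	/* arg2 carries the offset of the requested counter. */
	stat = (uint64_t *)((uintptr_t)&p->mac.stats + arg2);

	return (sysctl_handle_64(oidp, stat, 0, req));
}
#endif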

/**
 *	t3_get_desc - dump an SGE descriptor for debugging purposes
 *	@qs: the queue set
 *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
 *	@idx: the descriptor index in the queue
 *	@data: where to dump the descriptor contents
 *
 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
 *	size of the descriptor, or -EINVAL if the queue or index is invalid.
 */
int
t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
		unsigned char *data)
{
	if (qnum >= 6)
		return (-EINVAL);

	if (qnum < 3) {
		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
			return (-EINVAL);
		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
		return (sizeof(struct tx_desc));
	}

	if (qnum == 3) {
		if (!qs->rspq.desc || idx >= qs->rspq.size)
			return (-EINVAL);
		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
		return (sizeof(struct rsp_desc));
	}

	qnum -= 4;
	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
		return (-EINVAL);
	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
	return (sizeof(struct rx_desc));
}

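
/*
 * Hypothetical usage sketch (kept disabled, not part of the driver): how a
 * debugging path might pull a single Ethernet Tx descriptor out of a queue
 * set with t3_get_desc().  The helper name, the buffer size, and the log
 * format are assumptions made for illustration only.
 */
#if 0
static void
example_dump_eth_desc(const struct sge_qset *qs, unsigned int idx)
{
	unsigned char buf[256];		/* larger than any SGE descriptor */
	int len;

	/* qnum 0..2 selects a Tx queue; TXQ_ETH is the Ethernet Tx queue. */
	len = t3_get_desc(qs, TXQ_ETH, idx, buf);
	if (len < 0) {
		log(LOG_DEBUG, "invalid TXQ_ETH descriptor index %u\n", idx);
		return;
	}
	log(LOG_DEBUG, "TXQ_ETH descriptor %u is %d bytes\n", idx, len);
}
#endif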