1/**************************************************************************
2
3Copyright (c) 2007, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29#define DEBUG_BUFRING
30
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/dev/cxgb/cxgb_sge.c 175504 2008-01-19 22:47:43Z kmacy $");
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/module.h>
39#include <sys/bus.h>
40#include <sys/conf.h>
41#include <machine/bus.h>
42#include <machine/resource.h>
43#include <sys/bus_dma.h>
44#include <sys/rman.h>
45#include <sys/queue.h>
46#include <sys/sysctl.h>
47#include <sys/taskqueue.h>
48
49#include <sys/proc.h>
50#include <sys/sbuf.h>
51#include <sys/sched.h>
52#include <sys/smp.h>
53#include <sys/systm.h>
54#include <sys/syslog.h>
55
56#include <netinet/in_systm.h>
57#include <netinet/in.h>
58#include <netinet/ip.h>
59#include <netinet/tcp.h>
60
61#include <dev/pci/pcireg.h>
62#include <dev/pci/pcivar.h>
63
64#include <vm/vm.h>
65#include <vm/pmap.h>
66
67#ifdef CONFIG_DEFINED
68#include <cxgb_include.h>
69#include <sys/mvec.h>
70#else
71#include <dev/cxgb/cxgb_include.h>
72#include <dev/cxgb/sys/mvec.h>
73#endif
74
75int      txq_fills = 0;
76static int recycle_enable = 1;
77extern int cxgb_txq_buf_ring_size;
78int cxgb_cached_allocations;
79int cxgb_cached;
80int cxgb_ext_freed;
81extern int cxgb_use_16k_clusters;
82extern int cxgb_pcpu_cache_enable;
83
84
85#define USE_GTS 0
86
87#define SGE_RX_SM_BUF_SIZE	1536
88#define SGE_RX_DROP_THRES	16
89#define SGE_RX_COPY_THRES	128
90
91/*
92 * Period of the Tx buffer reclaim timer.  This timer does not need to run
93 * frequently as Tx buffers are usually reclaimed by new Tx packets.
94 */
95#define TX_RECLAIM_PERIOD       (hz >> 1)
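/* With hz timer ticks per second, (hz >> 1) is roughly half a second. */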
96
97/*
98 * Values for sge_txq.flags
99 */
100enum {
101	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
102	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
103};
104
105struct tx_desc {
106	uint64_t	flit[TX_DESC_FLITS];
107} __packed;
108
109struct rx_desc {
110	uint32_t	addr_lo;
111	uint32_t	len_gen;
112	uint32_t	gen2;
113	uint32_t	addr_hi;
} __packed;
115
116struct rsp_desc {               /* response queue descriptor */
117	struct rss_header	rss_hdr;
118	uint32_t		flags;
119	uint32_t		len_cq;
120	uint8_t			imm_data[47];
121	uint8_t			intr_gen;
122} __packed;
123
124#define RX_SW_DESC_MAP_CREATED	(1 << 0)
125#define TX_SW_DESC_MAP_CREATED	(1 << 1)
126#define RX_SW_DESC_INUSE        (1 << 3)
127#define TX_SW_DESC_MAPPED       (1 << 4)
128
129#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
130#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
131#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
132#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
133
134struct tx_sw_desc {                /* SW state per Tx descriptor */
135	struct mbuf_iovec mi;
136	bus_dmamap_t	map;
137	int		flags;
138};
139
140struct rx_sw_desc {                /* SW state per Rx descriptor */
141	caddr_t	         rxsd_cl;
142	caddr_t	         data;
143	bus_dmamap_t	  map;
144	int		  flags;
145};
146
147struct txq_state {
148	unsigned int compl;
149	unsigned int gen;
150	unsigned int pidx;
151};
152
153struct refill_fl_cb_arg {
154	int               error;
155	bus_dma_segment_t seg;
156	int               nseg;
157};
158
159/*
160 * Maps a number of flits to the number of Tx descriptors that can hold them.
161 * The formula is
162 *
163 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
164 *
165 * HW allows up to 4 descriptors to be combined into a WR.
166 */
167static uint8_t flit_desc_map[] = {
168	0,
169#if SGE_NUM_GENBITS == 1
170	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
171	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
172	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
173	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
174#elif SGE_NUM_GENBITS == 2
175	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
176	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
177	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
178	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
179#else
180# error "SGE_NUM_GENBITS must be 1 or 2"
181#endif
182};
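/*
 * The tables above are consistent with WR_FLITS being 16 (one generation
 * bit) or 15 (two generation bits): e.g. for SGE_NUM_GENBITS == 2,
 * 16 flits -> 1 + (16 - 2) / (15 - 1) = 2 descriptors, matching entry 16.
 */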
183
184
185static int lro_default = 0;
186int cxgb_debug = 0;
187
188static void sge_timer_cb(void *arg);
189static void sge_timer_reclaim(void *arg, int ncount);
190static void sge_txq_reclaim_handler(void *arg, int ncount);
191
192/**
 *	reclaim_completed_tx_ - reclaims completed Tx descriptors
 *	@q: the Tx queue to reclaim completed descriptors from
 *	@reclaim_min: do nothing unless at least this many descriptors can be reclaimed
196 *
197 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
198 *	and frees the associated buffers if possible.  Called with the Tx
199 *	queue's lock held.
200 */
201static __inline int
202reclaim_completed_tx_(struct sge_txq *q, int reclaim_min)
203{
204	int reclaim = desc_reclaimable(q);
205
206	if (reclaim < reclaim_min)
207		return (0);
208
209	mtx_assert(&q->lock, MA_OWNED);
210	if (reclaim > 0) {
211		t3_free_tx_desc(q, reclaim);
212		q->cleaned += reclaim;
213		q->in_use -= reclaim;
214	}
215	return (reclaim);
216}
217
218/**
219 *	should_restart_tx - are there enough resources to restart a Tx queue?
220 *	@q: the Tx queue
221 *
222 *	Checks if there are enough descriptors to restart a suspended Tx queue.
223 */
224static __inline int
225should_restart_tx(const struct sge_txq *q)
226{
227	unsigned int r = q->processed - q->cleaned;
228
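	/*
	 * Restart only if, once the already-completed descriptors are
	 * reclaimed, less than half of the ring would still be in use.
	 */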
229	return q->in_use - r < (q->size >> 1);
230}
231
232/**
233 *	t3_sge_init - initialize SGE
234 *	@adap: the adapter
235 *	@p: the SGE parameters
236 *
237 *	Performs SGE initialization needed every time after a chip reset.
238 *	We do not initialize any of the queue sets here, instead the driver
239 *	top-level must request those individually.  We also do not enable DMA
240 *	here, that should be done after the queues have been set up.
241 */
242void
243t3_sge_init(adapter_t *adap, struct sge_params *p)
244{
245	u_int ctrl, ups;
246
247	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
248
249	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
250	       F_CQCRDTCTRL |
251	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
252	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
253#if SGE_NUM_GENBITS == 1
254	ctrl |= F_EGRGENCTRL;
255#endif
256	if (adap->params.rev > 0) {
257		if (!(adap->flags & (USING_MSIX | USING_MSI)))
258			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
259		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
260	}
261	t3_write_reg(adap, A_SG_CONTROL, ctrl);
262	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
263		     V_LORCQDRBTHRSH(512));
264	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
265	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
266		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
267	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
268	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
269	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
270	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
271	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
272	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
273}
274
275
276/**
277 *	sgl_len - calculates the size of an SGL of the given capacity
278 *	@n: the number of SGL entries
279 *
280 *	Calculates the number of flits needed for a scatter/gather list that
281 *	can hold the given number of entries.
282 */
283static __inline unsigned int
284sgl_len(unsigned int n)
285{
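	/*
	 * Each struct sg_ent packs two entries (a 32-bit length and a 64-bit
	 * address each) into three 8-byte flits; an odd trailing entry still
	 * occupies a whole flit, hence the (n & 1) round-up.
	 */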
286	return ((3 * n) / 2 + (n & 1));
287}
288
289/**
290 *	get_imm_packet - return the next ingress packet buffer from a response
291 *	@resp: the response descriptor containing the packet data
292 *
293 *	Return a packet containing the immediate data of the given response.
294 */
295#ifdef DISABLE_MBUF_IOVEC
296static __inline int
297get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
298{
299	struct mbuf *m = mh->m_head;
300
301	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
	m->m_pkthdr.len = m->m_len = IMMED_PKT_SIZE;
303	return (0);
304}
305
306#else
307static int
308get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
309{
310
311	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
312	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
313	return (0);
314
315}
316#endif
317
318static __inline u_int
319flits_to_desc(u_int n)
320{
321	return (flit_desc_map[n]);
322}
323
324void
325t3_sge_err_intr_handler(adapter_t *adapter)
326{
327	unsigned int v, status;
328
329
330	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
331
332	if (status & F_RSPQCREDITOVERFOW)
333		CH_ALERT(adapter, "SGE response queue credit overflow\n");
334
335	if (status & F_RSPQDISABLED) {
336		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
337
338		CH_ALERT(adapter,
339			 "packet delivered to disabled response queue (0x%x)\n",
340			 (v >> S_RSPQ0DISABLED) & 0xff);
341	}
342
343	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
344	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
345		t3_fatal_err(adapter);
346}
347
348void
349t3_sge_prep(adapter_t *adap, struct sge_params *p)
350{
351	int i;
352
353	/* XXX Does ETHER_ALIGN need to be accounted for here? */
354	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
355
356	for (i = 0; i < SGE_QSETS; ++i) {
357		struct qset_params *q = p->qset + i;
358
359		q->polling = adap->params.rev > 0;
360
361		if (adap->params.nports > 2) {
362			q->coalesce_nsecs = 50000;
363		} else {
364#ifdef INVARIANTS
365			q->coalesce_nsecs = 10000;
366#else
367			q->coalesce_nsecs = 5000;
368#endif
369		}
370		q->rspq_size = RSPQ_Q_SIZE;
371		q->fl_size = FL_Q_SIZE;
372		q->jumbo_size = JUMBO_Q_SIZE;
373		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
374		q->txq_size[TXQ_OFLD] = 1024;
375		q->txq_size[TXQ_CTRL] = 256;
376		q->cong_thres = 0;
377	}
378}
379
380int
381t3_sge_alloc(adapter_t *sc)
382{
383
384	/* The parent tag. */
385	if (bus_dma_tag_create( NULL,			/* parent */
386				1, 0,			/* algnmnt, boundary */
387				BUS_SPACE_MAXADDR,	/* lowaddr */
388				BUS_SPACE_MAXADDR,	/* highaddr */
389				NULL, NULL,		/* filter, filterarg */
390				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
391				BUS_SPACE_UNRESTRICTED, /* nsegments */
392				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
393				0,			/* flags */
394				NULL, NULL,		/* lock, lockarg */
395				&sc->parent_dmat)) {
396		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
397		return (ENOMEM);
398	}
399
400	/*
401	 * DMA tag for normal sized RX frames
402	 */
403	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
404		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
405		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
406		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
407		return (ENOMEM);
408	}
409
410	/*
411	 * DMA tag for jumbo sized RX frames.
412	 */
413	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
414		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
415		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
416		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
417		return (ENOMEM);
418	}
419
420	/*
421	 * DMA tag for TX frames.
422	 */
423	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
424		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
425		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
426		NULL, NULL, &sc->tx_dmat)) {
427		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
428		return (ENOMEM);
429	}
430
431	return (0);
432}
433
434int
435t3_sge_free(struct adapter * sc)
436{
437
438	if (sc->tx_dmat != NULL)
439		bus_dma_tag_destroy(sc->tx_dmat);
440
441	if (sc->rx_jumbo_dmat != NULL)
442		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
443
444	if (sc->rx_dmat != NULL)
445		bus_dma_tag_destroy(sc->rx_dmat);
446
447	if (sc->parent_dmat != NULL)
448		bus_dma_tag_destroy(sc->parent_dmat);
449
450	return (0);
451}
452
453void
454t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
455{
456
457	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
458	qs->rspq.polling = 0 /* p->polling */;
459}
460
461#if !defined(__i386__) && !defined(__amd64__)
462static void
463refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
464{
465	struct refill_fl_cb_arg *cb_arg = arg;
466
467	cb_arg->error = error;
468	cb_arg->seg = segs[0];
469	cb_arg->nseg = nseg;
470
471}
472#endif
473/**
474 *	refill_fl - refill an SGE free-buffer list
475 *	@sc: the controller softc
476 *	@q: the free-list to refill
477 *	@n: the number of new buffers to allocate
478 *
479 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
 *	The caller must ensure that @n does not exceed the queue's capacity.
481 */
482static void
483refill_fl(adapter_t *sc, struct sge_fl *q, int n)
484{
485	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
486	struct rx_desc *d = &q->desc[q->pidx];
487	struct refill_fl_cb_arg cb_arg;
488	caddr_t cl;
489	int err;
490	int header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
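	/*
	 * The start of each cluster is reserved for an mbuf header plus a
	 * trailing uint32_t (presumably a reference count) so that an mbuf
	 * can be constructed in place when the buffer is handed up after RX;
	 * only the region starting at cl + header_size is mapped for DMA.
	 */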
491
492	cb_arg.error = 0;
493	while (n--) {
494		/*
495		 * We only allocate a cluster, mbuf allocation happens after rx
496		 */
497		if ((cl = cxgb_cache_get(q->zone)) == NULL) {
498			log(LOG_WARNING, "Failed to allocate cluster\n");
499			goto done;
500		}
501
502		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
503			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
504				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
505				uma_zfree(q->zone, cl);
506				goto done;
507			}
508			sd->flags |= RX_SW_DESC_MAP_CREATED;
509		}
510#if !defined(__i386__) && !defined(__amd64__)
511		err = bus_dmamap_load(q->entry_tag, sd->map,
512		    cl + header_size, q->buf_size,
513		    refill_fl_cb, &cb_arg, 0);
514
515		if (err != 0 || cb_arg.error) {
516			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
517			/*
518			 * XXX free cluster
519			 */
520			return;
521		}
522#else
523		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)(cl + header_size));
524#endif
525		sd->flags |= RX_SW_DESC_INUSE;
526		sd->rxsd_cl = cl;
527		sd->data = cl + header_size;
528		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
529		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
530		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
531		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
532
533		d++;
534		sd++;
535
536		if (++q->pidx == q->size) {
537			q->pidx = 0;
538			q->gen ^= 1;
539			sd = q->sdesc;
540			d = q->desc;
541		}
542		q->credits++;
543	}
544
545done:
546	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
547}
548
549
550/**
551 *	free_rx_bufs - free the Rx buffers on an SGE free list
 *	@sc: the controller softc
553 *	@q: the SGE free list to clean up
554 *
555 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
556 *	this queue should be stopped before calling this function.
557 */
558static void
559free_rx_bufs(adapter_t *sc, struct sge_fl *q)
560{
561	u_int cidx = q->cidx;
562
563	while (q->credits--) {
564		struct rx_sw_desc *d = &q->sdesc[cidx];
565
566		if (d->flags & RX_SW_DESC_INUSE) {
567			bus_dmamap_unload(q->entry_tag, d->map);
568			bus_dmamap_destroy(q->entry_tag, d->map);
569			uma_zfree(q->zone, d->rxsd_cl);
570		}
571		d->rxsd_cl = NULL;
572		if (++cidx == q->size)
573			cidx = 0;
574	}
575}
576
577static __inline void
578__refill_fl(adapter_t *adap, struct sge_fl *fl)
579{
580	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
581}
582
583static __inline void
584__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
585{
586	if ((fl->size - fl->credits) < max)
587		refill_fl(adap, fl, min(max, fl->size - fl->credits));
588}
589
590void
591refill_fl_service(adapter_t *adap, struct sge_fl *fl)
592{
593	__refill_fl_lt(adap, fl, 512);
594}
595
596/**
597 *	recycle_rx_buf - recycle a receive buffer
598 *	@adapter: the adapter
599 *	@q: the SGE free list
600 *	@idx: index of buffer to recycle
601 *
602 *	Recycles the specified buffer on the given free list by adding it at
603 *	the next available slot on the list.
604 */
605static void
606recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
607{
608	struct rx_desc *from = &q->desc[idx];
609	struct rx_desc *to   = &q->desc[q->pidx];
610
611	q->sdesc[q->pidx] = q->sdesc[idx];
612	to->addr_lo = from->addr_lo;        // already big endian
613	to->addr_hi = from->addr_hi;        // likewise
614	wmb();
615	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
616	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
617	q->credits++;
618
619	if (++q->pidx == q->size) {
620		q->pidx = 0;
621		q->gen ^= 1;
622	}
623	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
624}
625
626static void
627alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
628{
629	uint32_t *addr;
630
631	addr = arg;
632	*addr = segs[0].ds_addr;
633}
634
635static int
636alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
637    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
638    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
639{
640	size_t len = nelem * elem_size;
641	void *s = NULL;
642	void *p = NULL;
643	int err;
644
645	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
646				      BUS_SPACE_MAXADDR_32BIT,
647				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
648				      len, 0, NULL, NULL, tag)) != 0) {
649		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
650		return (ENOMEM);
651	}
652
653	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
654				    map)) != 0) {
655		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
656		return (ENOMEM);
657	}
658
659	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
660	bzero(p, len);
661	*(void **)desc = p;
662
663	if (sw_size) {
664		len = nelem * sw_size;
665		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
666		*(void **)sdesc = s;
667	}
668	if (parent_entry_tag == NULL)
669		return (0);
670
671	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
672				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
673		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
674				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
675		                      NULL, NULL, entry_tag)) != 0) {
676		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
677		return (ENOMEM);
678	}
679	return (0);
680}
681
682static void
683sge_slow_intr_handler(void *arg, int ncount)
684{
685	adapter_t *sc = arg;
686
687	t3_slow_intr_handler(sc);
688}
689
690/**
691 *	sge_timer_cb - perform periodic maintenance of an SGE qset
 *	@arg: the adapter
693 *
694 *	Runs periodically from a timer to perform maintenance of an SGE queue
 *	set.  It performs the following tasks:
696 *
697 *	a) Cleans up any completed Tx descriptors that may still be pending.
698 *	Normal descriptor cleanup happens when new packets are added to a Tx
699 *	queue so this timer is relatively infrequent and does any cleanup only
700 *	if the Tx queue has not seen any new packets in a while.  We make a
701 *	best effort attempt to reclaim descriptors, in that we don't wait
702 *	around if we cannot get a queue's lock (which most likely is because
703 *	someone else is queueing new packets and so will also handle the clean
704 *	up).  Since control queues use immediate data exclusively we don't
705 *	bother cleaning them up here.
706 *
707 *	b) Replenishes Rx queues that have run out due to memory shortage.
708 *	Normally new Rx buffers are added when existing ones are consumed but
709 *	when out of memory a queue can become empty.  We try to add only a few
710 *	buffers here, the queue will be replenished fully as these new buffers
711 *	are used up if memory shortage has subsided.
712 *
713 *	c) Return coalesced response queue credits in case a response queue is
714 *	starved.
715 *
716 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
717 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
718 */
719static void
720sge_timer_cb(void *arg)
721{
722	adapter_t *sc = arg;
723#ifndef IFNET_MULTIQUEUE
724	struct port_info *pi;
725	struct sge_qset *qs;
726	struct sge_txq  *txq;
727	int i, j;
728	int reclaim_ofl, refill_rx;
729
730	for (i = 0; i < sc->params.nports; i++)
731		for (j = 0; j < sc->port[i].nqsets; j++) {
732			qs = &sc->sge.qs[i + j];
733			txq = &qs->txq[0];
734			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
735			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
736			    (qs->fl[1].credits < qs->fl[1].size));
737			if (reclaim_ofl || refill_rx) {
738				pi = &sc->port[i];
739				taskqueue_enqueue(pi->tq, &pi->timer_reclaim_task);
740				break;
741			}
742		}
743#endif
744	if (sc->params.nports > 2) {
745		int i;
746
747		for_each_port(sc, i) {
748			struct port_info *pi = &sc->port[i];
749
750			t3_write_reg(sc, A_SG_KDOORBELL,
751				     F_SELEGRCNTX |
752				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
753		}
754	}
755	if (sc->open_device_map != 0)
756		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
757}
758
759/*
760 * This is meant to be a catch-all function to keep sge state private
761 * to sge.c
762 *
763 */
764int
765t3_sge_init_adapter(adapter_t *sc)
766{
767	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
768	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
769	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
770	mi_init();
771	cxgb_cache_init();
772	return (0);
773}
774
775int
776t3_sge_reset_adapter(adapter_t *sc)
777{
778	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
779	return (0);
780}
781
782int
783t3_sge_init_port(struct port_info *pi)
784{
785	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
786	return (0);
787}
788
789void
790t3_sge_deinit_sw(adapter_t *sc)
791{
792	int i;
793
794	callout_drain(&sc->sge_timer_ch);
795	if (sc->tq)
796		taskqueue_drain(sc->tq, &sc->slow_intr_task);
797	for (i = 0; i < sc->params.nports; i++)
798		if (sc->port[i].tq != NULL)
799			taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
800
801	mi_deinit();
802}
803
804/**
805 *	refill_rspq - replenish an SGE response queue
806 *	@adapter: the adapter
807 *	@q: the response queue to replenish
808 *	@credits: how many new responses to make available
809 *
810 *	Replenishes a response queue by making the supplied number of responses
811 *	available to HW.
812 */
813static __inline void
814refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
815{
816
817	/* mbufs are allocated on demand when a rspq entry is processed. */
818	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
819		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
820}
821
822static __inline void
823sge_txq_reclaim_(struct sge_txq *txq, int force)
824{
825
826	if (desc_reclaimable(txq) < 16)
827		return;
828	if (mtx_trylock(&txq->lock) == 0)
829		return;
830	reclaim_completed_tx_(txq, 16);
831	mtx_unlock(&txq->lock);
832
833}
834
835static void
836sge_txq_reclaim_handler(void *arg, int ncount)
837{
838	struct sge_txq *q = arg;
839
840	sge_txq_reclaim_(q, TRUE);
841}
842
843
844
845static void
846sge_timer_reclaim(void *arg, int ncount)
847{
848	struct port_info *pi = arg;
849	int i, nqsets = pi->nqsets;
850	adapter_t *sc = pi->adapter;
851	struct sge_qset *qs;
852	struct sge_txq *txq;
853	struct mtx *lock;
854
855#ifdef IFNET_MULTIQUEUE
856	panic("%s should not be called with multiqueue support\n", __FUNCTION__);
857#endif
858	for (i = 0; i < nqsets; i++) {
859		qs = &sc->sge.qs[i];
860
861		txq = &qs->txq[TXQ_OFLD];
862		sge_txq_reclaim_(txq, FALSE);
863
864		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
865			    &sc->sge.qs[0].rspq.lock;
866
867		if (mtx_trylock(lock)) {
868			/* XXX currently assume that we are *NOT* polling */
869			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
870
871			if (qs->fl[0].credits < qs->fl[0].size - 16)
872				__refill_fl(sc, &qs->fl[0]);
873			if (qs->fl[1].credits < qs->fl[1].size - 16)
874				__refill_fl(sc, &qs->fl[1]);
875
876			if (status & (1 << qs->rspq.cntxt_id)) {
877				if (qs->rspq.credits) {
878					refill_rspq(sc, &qs->rspq, 1);
879					qs->rspq.credits--;
880					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
881					    1 << qs->rspq.cntxt_id);
882				}
883			}
884			mtx_unlock(lock);
885		}
886	}
887}
888
889/**
890 *	init_qset_cntxt - initialize an SGE queue set context info
891 *	@qs: the queue set
892 *	@id: the queue set id
893 *
894 *	Initializes the TIDs and context ids for the queues of a queue set.
895 */
896static void
897init_qset_cntxt(struct sge_qset *qs, u_int id)
898{
899
900	qs->rspq.cntxt_id = id;
901	qs->fl[0].cntxt_id = 2 * id;
902	qs->fl[1].cntxt_id = 2 * id + 1;
903	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
904	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
905	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
906	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
907	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
908
909	mbufq_init(&qs->txq[TXQ_ETH].sendq);
910	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
911	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
912}
913
914
915static void
916txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
917{
918	txq->in_use += ndesc;
919	/*
920	 * XXX we don't handle stopping of queue
921	 * presumably start handles this when we bump against the end
922	 */
923	txqs->gen = txq->gen;
924	txq->unacked += ndesc;
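	/*
	 * Request a completion from the SGE roughly once every 8 descriptors:
	 * bit 3 of the running unacked count is shifted into the WR_COMPL
	 * position and the count is then kept modulo 8.
	 */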
925	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
926	txq->unacked &= 7;
927	txqs->pidx = txq->pidx;
928	txq->pidx += ndesc;
929#ifdef INVARIANTS
930	if (((txqs->pidx > txq->cidx) &&
931		(txq->pidx < txqs->pidx) &&
932		(txq->pidx >= txq->cidx)) ||
933	    ((txqs->pidx < txq->cidx) &&
934		(txq->pidx >= txq-> cidx)) ||
935	    ((txqs->pidx < txq->cidx) &&
936		(txq->cidx < txqs->pidx)))
937		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
938		    txqs->pidx, txq->pidx, txq->cidx);
939#endif
940	if (txq->pidx >= txq->size) {
941		txq->pidx -= txq->size;
942		txq->gen ^= 1;
943	}
944
945}
946
947/**
948 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
949 *	@m: the packet mbufs
950 *      @nsegs: the number of segments
951 *
952 * 	Returns the number of Tx descriptors needed for the given Ethernet
953 * 	packet.  Ethernet packets require addition of WR and CPL headers.
954 */
955static __inline unsigned int
956calc_tx_descs(const struct mbuf *m, int nsegs)
957{
958	unsigned int flits;
959
960	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
961		return 1;
962
963	flits = sgl_len(nsegs) + 2;
964#ifdef TSO_SUPPORTED
965	if (m->m_pkthdr.csum_flags & CSUM_TSO)
966		flits++;
967#endif
968	return flits_to_desc(flits);
969}
970
971static unsigned int
972busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
973    struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
974{
975	struct mbuf *m0;
976	int err, pktlen, pass = 0;
977
978retry:
979	err = 0;
980	m0 = *m;
981	pktlen = m0->m_pkthdr.len;
982#if defined(__i386__) || defined(__amd64__)
983	if (busdma_map_sg_collapse(m, segs, nsegs) == 0) {
984		goto done;
985	} else
986#endif
987		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, txsd->map, m0, segs, nsegs, 0);
988
989	if (err == 0) {
990		goto done;
991	}
992	if (err == EFBIG && pass == 0) {
993		pass = 1;
994		/* Too many segments, try to defrag */
995		m0 = m_defrag(m0, M_DONTWAIT);
996		if (m0 == NULL) {
997			m_freem(*m);
998			*m = NULL;
999			return (ENOBUFS);
1000		}
1001		*m = m0;
1002		goto retry;
1003	} else if (err == ENOMEM) {
1004		return (err);
	} else if (err) {
1006		if (cxgb_debug)
1007			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1008		m_freem(m0);
1009		*m = NULL;
1010		return (err);
1011	}
1012done:
1013#if !defined(__i386__) && !defined(__amd64__)
1014	bus_dmamap_sync(txq->entry_tag, txsd->map, BUS_DMASYNC_PREWRITE);
1015#endif
1016	txsd->flags |= TX_SW_DESC_MAPPED;
1017
1018	return (0);
1019}
1020
1021/**
1022 *	make_sgl - populate a scatter/gather list for a packet
1023 *	@sgp: the SGL to populate
1024 *	@segs: the packet dma segments
1025 *	@nsegs: the number of segments
1026 *
1027 *	Generates a scatter/gather list for the buffers that make up a packet
1028 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1029 *	appropriately.
1030 */
1031static __inline void
1032make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1033{
1034	int i, idx;
1035
1036	for (idx = 0, i = 0; i < nsegs; i++) {
1037		/*
1038		 * firmware doesn't like empty segments
1039		 */
1040		if (segs[i].ds_len == 0)
1041			continue;
1042		if (i && idx == 0)
1043			++sgp;
1044
1045		sgp->len[idx] = htobe32(segs[i].ds_len);
1046		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1047		idx ^= 1;
1048	}
1049
1050	if (idx) {
1051		sgp->len[idx] = 0;
1052		sgp->addr[idx] = 0;
1053	}
1054}
1055
1056/**
1057 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1058 *	@adap: the adapter
1059 *	@q: the Tx queue
1060 *
 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
 *	where the HW may go to sleep just after we check; in that case the
 *	interrupt handler will detect the outstanding TX packet and ring the
 *	doorbell for us.
1065 *
1066 *	When GTS is disabled we unconditionally ring the doorbell.
1067 */
1068static __inline void
1069check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1070{
1071#if USE_GTS
1072	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1073	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1074		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1075#ifdef T3_TRACE
1076		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1077			  q->cntxt_id);
1078#endif
1079		t3_write_reg(adap, A_SG_KDOORBELL,
1080			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1081	}
1082#else
1083	wmb();            /* write descriptors before telling HW */
1084	t3_write_reg(adap, A_SG_KDOORBELL,
1085		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1086#endif
1087}
1088
1089static __inline void
1090wr_gen2(struct tx_desc *d, unsigned int gen)
1091{
1092#if SGE_NUM_GENBITS == 2
1093	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1094#endif
1095}
1096
1097/**
1098 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1099 *	@ndesc: number of Tx descriptors spanned by the SGL
1100 *	@txd: first Tx descriptor to be written
1101 *	@txqs: txq state (generation and producer index)
1102 *	@txq: the SGE Tx queue
1103 *	@sgl: the SGL
1104 *	@flits: number of flits to the start of the SGL in the first descriptor
1105 *	@sgl_flits: the SGL size in flits
1106 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1107 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1108 *
1109 *	Write a work request header and an associated SGL.  If the SGL is
1110 *	small enough to fit into one Tx descriptor it has already been written
1111 *	and we just need to write the WR header.  Otherwise we distribute the
1112 *	SGL across the number of descriptors it spans.
1113 */
1114static void
1115write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1116    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1117    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1118{
1119
1120	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1121	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1122
1123	if (__predict_true(ndesc == 1)) {
1124		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1125		    V_WR_SGLSFLT(flits)) | wr_hi;
1126		wmb();
1127		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1128		    V_WR_GEN(txqs->gen)) | wr_lo;
1129		/* XXX gen? */
1130		wr_gen2(txd, txqs->gen);
1131
1132	} else {
1133		unsigned int ogen = txqs->gen;
1134		const uint64_t *fp = (const uint64_t *)sgl;
1135		struct work_request_hdr *wp = wrp;
1136
1137		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1138		    V_WR_SGLSFLT(flits)) | wr_hi;
1139
1140		while (sgl_flits) {
1141			unsigned int avail = WR_FLITS - flits;
1142
1143			if (avail > sgl_flits)
1144				avail = sgl_flits;
1145			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1146			sgl_flits -= avail;
1147			ndesc--;
1148			if (!sgl_flits)
1149				break;
1150
1151			fp += avail;
1152			txd++;
1153			txsd++;
1154			if (++txqs->pidx == txq->size) {
1155				txqs->pidx = 0;
1156				txqs->gen ^= 1;
1157				txd = txq->desc;
1158				txsd = txq->sdesc;
1159			}
1160
1161			/*
1162			 * when the head of the mbuf chain
1163			 * is freed all clusters will be freed
1164			 * with it
1165			 */
			KASSERT(txsd->mi.mi_base == NULL, ("overwriting valid entry mi_base==%p", txsd->mi.mi_base));
1167			wrp = (struct work_request_hdr *)txd;
1168			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1169			    V_WR_SGLSFLT(1)) | wr_hi;
1170			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1171				    sgl_flits + 1)) |
1172			    V_WR_GEN(txqs->gen)) | wr_lo;
1173			wr_gen2(txd, txqs->gen);
1174			flits = 1;
1175		}
1176		wrp->wr_hi |= htonl(F_WR_EOP);
1177		wmb();
1178		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1179		wr_gen2((struct tx_desc *)wp, ogen);
1180	}
1181}
1182
1183/* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1184#define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
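/* i.e. 14 + 4 + 20 + 20 == 58 bytes: Ethernet + VLAN tag + minimal IPv4 + TCP */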
1185
1186#ifdef VLAN_SUPPORTED
1187#define GET_VTAG(cntrl, m) \
1188do { \
1189	if ((m)->m_flags & M_VLANTAG)					            \
1190		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1191} while (0)
1192
1193#define GET_VTAG_MI(cntrl, mi) \
1194do { \
1195	if ((mi)->mi_flags & M_VLANTAG)					\
1196		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((mi)->mi_ether_vtag); \
1197} while (0)
1198#else
1199#define GET_VTAG(cntrl, m)
1200#define GET_VTAG_MI(cntrl, m)
1201#endif
1202
1203int
1204t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
1205{
1206	adapter_t *sc;
1207	struct mbuf *m0;
1208	struct sge_txq *txq;
1209	struct txq_state txqs;
1210	struct port_info *pi;
1211	unsigned int ndesc, flits, cntrl, mlen;
1212	int err, nsegs, tso_info = 0;
1213
1214	struct work_request_hdr *wrp;
1215	struct tx_sw_desc *txsd;
1216	struct sg_ent *sgp, *sgl;
1217	uint32_t wr_hi, wr_lo, sgl_flits;
1218	bus_dma_segment_t segs[TX_MAX_SEGS];
1219
1220	struct tx_desc *txd;
1221	struct mbuf_vec *mv;
1222	struct mbuf_iovec *mi;
1223
1224	DPRINTF("t3_encap cpu=%d ", curcpu);
1225	KASSERT(qs->idx == 0, ("invalid qs %d", qs->idx));
1226
1227	mi = NULL;
1228	pi = qs->port;
1229	sc = pi->adapter;
1230	txq = &qs->txq[TXQ_ETH];
1231	txd = &txq->desc[txq->pidx];
1232	txsd = &txq->sdesc[txq->pidx];
1233	sgl = txq->txq_sgl;
1234	m0 = *m;
1235
1236	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1237	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1238	if (cxgb_debug)
1239		printf("mi_base=%p cidx=%d pidx=%d\n\n", txsd->mi.mi_base, txq->cidx, txq->pidx);
1240
1241	mtx_assert(&txq->lock, MA_OWNED);
1242	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1243/*
1244 * XXX need to add VLAN support for 6.x
1245 */
1246#ifdef VLAN_SUPPORTED
1247	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1248		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1249#endif
	KASSERT(txsd->mi.mi_base == NULL, ("overwriting valid entry mi_base==%p",
1251		txsd->mi.mi_base));
1252	if (count > 1) {
		panic("count > 1 not supported in CVS\n");
1254		if ((err = busdma_map_sg_vec(m, &m0, segs, count)))
1255			return (err);
1256		nsegs = count;
1257	} else if ((err = busdma_map_sg_collapse(&m0, segs, &nsegs))) {
1258		if (cxgb_debug)
1259			printf("failed ... err=%d\n", err);
1260		return (err);
1261	}
1262	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count));
1263
1264	if (!(m0->m_pkthdr.len <= PIO_LEN)) {
1265		mi_collapse_mbuf(&txsd->mi, m0);
1266		mi = &txsd->mi;
1267	}
1268	if (count > 1) {
1269		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1270		int i, fidx;
1271		struct mbuf_iovec *batchmi;
1272
1273		mv = mtomv(m0);
1274		batchmi = mv->mv_vec;
1275
1276		wrp = (struct work_request_hdr *)txd;
1277
1278		flits = count*2 + 1;
1279		txq_prod(txq, 1, &txqs);
1280
1281		for (fidx = 1, i = 0; i < count; i++, batchmi++, fidx += 2) {
1282			struct cpl_tx_pkt_batch_entry *cbe = &cpl_batch->pkt_entry[i];
1283
1284			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1285			GET_VTAG_MI(cntrl, batchmi);
1286			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1287			cbe->cntrl = htonl(cntrl);
1288			cbe->len = htonl(batchmi->mi_len | 0x80000000);
1289			cbe->addr = htobe64(segs[i].ds_addr);
1290			txd->flit[fidx] |= htobe64(1 << 24);
1291		}
1292
1293		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1294		    V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1295		wmb();
1296		wrp->wr_lo = htonl(V_WR_LEN(flits) |
1297		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1298		/* XXX gen? */
1299		wr_gen2(txd, txqs.gen);
1300		check_ring_tx_db(sc, txq);
1301
1302		return (0);
1303	} else if (tso_info) {
1304		int undersized, eth_type;
1305		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1306		struct ip *ip;
1307		struct tcphdr *tcp;
1308		char *pkthdr, tmp[TCPPKTHDRSIZE];
1309		struct mbuf_vec *mv;
1310		struct mbuf_iovec *tmpmi;
1311
1312		mv = mtomv(m0);
1313		tmpmi = mv->mv_vec;
1314
1315		txd->flit[2] = 0;
1316		GET_VTAG_MI(cntrl, mi);
1317		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1318		hdr->cntrl = htonl(cntrl);
1319		mlen = m0->m_pkthdr.len;
1320		hdr->len = htonl(mlen | 0x80000000);
1321
1322		DPRINTF("tso buf len=%d\n", mlen);
1323		undersized = (((tmpmi->mi_len < TCPPKTHDRSIZE) &&
1324			(m0->m_flags & M_VLANTAG)) ||
1325		    (tmpmi->mi_len < TCPPKTHDRSIZE - ETHER_VLAN_ENCAP_LEN));
1326		if (__predict_false(undersized)) {
1327			pkthdr = tmp;
1328			dump_mi(mi);
1329			panic("discontig packet - fixxorz");
1330		} else
1331			pkthdr = m0->m_data;
1332
1333		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1334			eth_type = CPL_ETH_II_VLAN;
1335			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1336			    ETHER_VLAN_ENCAP_LEN);
1337		} else {
1338			eth_type = CPL_ETH_II;
1339			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1340		}
1341		tcp = (struct tcphdr *)((uint8_t *)ip +
1342		    sizeof(*ip));
1343
1344		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1345			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1346			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1347		hdr->lso_info = htonl(tso_info);
1348		flits = 3;
1349	} else {
1350		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1351
1352		GET_VTAG(cntrl, m0);
1353		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1354		cpl->cntrl = htonl(cntrl);
1355		mlen = m0->m_pkthdr.len;
1356		cpl->len = htonl(mlen | 0x80000000);
1357
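		/*
		 * Frames that fit in PIO_LEN bytes are copied whole into the
		 * descriptor as immediate data and the mbuf is freed right
		 * away; larger frames fall through to the SGL path below.
		 */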
1358		if (mlen <= PIO_LEN) {
1359			txq_prod(txq, 1, &txqs);
1360			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1361			m_freem(m0);
1362			m0 = NULL;
1363			flits = (mlen + 7) / 8 + 2;
1364			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1365					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1366					  F_WR_SOP | F_WR_EOP | txqs.compl);
1367			wmb();
1368			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1369			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1370
1371			wr_gen2(txd, txqs.gen);
1372			check_ring_tx_db(sc, txq);
1373			DPRINTF("pio buf\n");
1374			return (0);
1375		}
1376		DPRINTF("regular buf\n");
1377		flits = 2;
1378	}
1379	wrp = (struct work_request_hdr *)txd;
1380
1381#ifdef	nomore
1382	/*
1383	 * XXX need to move into one of the helper routines above
1384	 *
1385	 */
1386	if ((err = busdma_map_mbufs(m, txq, txsd, segs, &nsegs)) != 0)
1387		return (err);
1388	m0 = *m;
1389#endif
1390	ndesc = calc_tx_descs(m0, nsegs);
1391
1392	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1393	make_sgl(sgp, segs, nsegs);
1394
1395	sgl_flits = sgl_len(nsegs);
1396
1397	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1398	txq_prod(txq, ndesc, &txqs);
1399	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1400	wr_lo = htonl(V_WR_TID(txq->token));
1401	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1402	check_ring_tx_db(pi->adapter, txq);
1403
1404	if ((m0->m_type == MT_DATA) &&
1405	    ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT) &&
1406	    (m0->m_ext.ext_type != EXT_PACKET)) {
1407		m0->m_flags &= ~M_EXT ;
1408		cxgb_mbufs_outstanding--;
1409		m_free(m0);
1410	}
1411
1412	return (0);
1413}
1414
1415
1416/**
1417 *	write_imm - write a packet into a Tx descriptor as immediate data
1418 *	@d: the Tx descriptor to write
1419 *	@m: the packet
1420 *	@len: the length of packet data to write as immediate data
1421 *	@gen: the generation bit value to write
1422 *
1423 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1424 *	contains a work request at its beginning.  We must write the packet
1425 *	carefully so the SGE doesn't read accidentally before it's written in
1426 *	its entirety.
1427 */
1428static __inline void
1429write_imm(struct tx_desc *d, struct mbuf *m,
1430	  unsigned int len, unsigned int gen)
1431{
1432	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1433	struct work_request_hdr *to = (struct work_request_hdr *)d;
1434
1435	if (len > WR_LEN)
1436		panic("len too big %d\n", len);
1437	if (len < sizeof(*from))
1438		panic("len too small %d", len);
1439
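	/*
	 * Copy the body first; wr_lo, which carries the generation bit, is
	 * written only after the write barrier below so the SGE never sees
	 * a partially written WR.
	 */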
1440	memcpy(&to[1], &from[1], len - sizeof(*from));
1441	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1442					V_WR_BCNTLFLT(len & 7));
1443	wmb();
1444	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1445					V_WR_LEN((len + 7) / 8));
1446	wr_gen2(d, gen);
1447
1448	/*
	 * This check is a hack; we should really fix the logic so
	 * that this can't happen.
1451	 */
1452	if (m->m_type != MT_DONTFREE)
1453		m_freem(m);
1454
1455}
1456
1457/**
1458 *	check_desc_avail - check descriptor availability on a send queue
1459 *	@adap: the adapter
1460 *	@q: the TX queue
1461 *	@m: the packet needing the descriptors
1462 *	@ndesc: the number of Tx descriptors needed
1463 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1464 *
1465 *	Checks if the requested number of Tx descriptors is available on an
1466 *	SGE send queue.  If the queue is already suspended or not enough
1467 *	descriptors are available the packet is queued for later transmission.
1468 *	Must be called with the Tx queue locked.
1469 *
1470 *	Returns 0 if enough descriptors are available, 1 if there aren't
1471 *	enough descriptors and the packet has been queued, and 2 if the caller
1472 *	needs to retry because there weren't enough descriptors at the
1473 *	beginning of the call but some freed up in the mean time.
1474 */
1475static __inline int
1476check_desc_avail(adapter_t *adap, struct sge_txq *q,
1477		 struct mbuf *m, unsigned int ndesc,
1478		 unsigned int qid)
1479{
1480	/*
1481	 * XXX We currently only use this for checking the control queue
1482	 * the control queue is only used for binding qsets which happens
1483	 * at init time so we are guaranteed enough descriptors
1484	 */
1485	if (__predict_false(!mbufq_empty(&q->sendq))) {
1486addq_exit:	mbufq_tail(&q->sendq, m);
1487		return 1;
1488	}
1489	if (__predict_false(q->size - q->in_use < ndesc)) {
1490
1491		struct sge_qset *qs = txq_to_qset(q, qid);
1492
1493		printf("stopping q\n");
1494
1495		setbit(&qs->txq_stopped, qid);
1496		smp_mb();
1497
1498		if (should_restart_tx(q) &&
1499		    test_and_clear_bit(qid, &qs->txq_stopped))
1500			return 2;
1501
1502		q->stops++;
1503		goto addq_exit;
1504	}
1505	return 0;
1506}
1507
1508
1509/**
1510 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1511 *	@q: the SGE control Tx queue
1512 *
1513 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1514 *	that send only immediate data (presently just the control queues) and
 *	thus do not have any mbufs to free.
1516 */
1517static __inline void
1518reclaim_completed_tx_imm(struct sge_txq *q)
1519{
1520	unsigned int reclaim = q->processed - q->cleaned;
1521
1522	mtx_assert(&q->lock, MA_OWNED);
1523
1524	q->in_use -= reclaim;
1525	q->cleaned += reclaim;
1526}
1527
1528static __inline int
1529immediate(const struct mbuf *m)
1530{
	return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN;
1532}
1533
1534/**
1535 *	ctrl_xmit - send a packet through an SGE control Tx queue
1536 *	@adap: the adapter
1537 *	@q: the control queue
1538 *	@m: the packet
1539 *
1540 *	Send a packet through an SGE control Tx queue.  Packets sent through
1541 *	a control queue must fit entirely as immediate data in a single Tx
1542 *	descriptor and have no page fragments.
1543 */
1544static int
1545ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1546{
1547	int ret;
1548	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1549
1550	if (__predict_false(!immediate(m))) {
1551		m_freem(m);
1552		return 0;
1553	}
1554
1555	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1556	wrp->wr_lo = htonl(V_WR_TID(q->token));
1557
1558	mtx_lock(&q->lock);
1559again:	reclaim_completed_tx_imm(q);
1560
1561	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1562	if (__predict_false(ret)) {
1563		if (ret == 1) {
1564			mtx_unlock(&q->lock);
1565			log(LOG_ERR, "no desc available\n");
1566
1567			return (ENOSPC);
1568		}
1569		goto again;
1570	}
1571	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1572
1573	q->in_use++;
1574	if (++q->pidx >= q->size) {
1575		q->pidx = 0;
1576		q->gen ^= 1;
1577	}
1578	mtx_unlock(&q->lock);
1579	wmb();
1580	t3_write_reg(adap, A_SG_KDOORBELL,
1581		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1582	return (0);
1583}
1584
1585
1586/**
1587 *	restart_ctrlq - restart a suspended control queue
 *	@qs: the queue set containing the control queue
1589 *
1590 *	Resumes transmission on a suspended Tx control queue.
1591 */
1592static void
1593restart_ctrlq(void *data, int npending)
1594{
1595	struct mbuf *m;
1596	struct sge_qset *qs = (struct sge_qset *)data;
1597	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1598	adapter_t *adap = qs->port->adapter;
1599
1600	log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
1601
1602	mtx_lock(&q->lock);
1603again:	reclaim_completed_tx_imm(q);
1604
1605	while (q->in_use < q->size &&
1606	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1607
1608		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1609
1610		if (++q->pidx >= q->size) {
1611			q->pidx = 0;
1612			q->gen ^= 1;
1613		}
1614		q->in_use++;
1615	}
1616	if (!mbufq_empty(&q->sendq)) {
1617		setbit(&qs->txq_stopped, TXQ_CTRL);
1618		smp_mb();
1619
1620		if (should_restart_tx(q) &&
1621		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1622			goto again;
1623		q->stops++;
1624	}
1625	mtx_unlock(&q->lock);
1626	t3_write_reg(adap, A_SG_KDOORBELL,
1627		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1628}
1629
1630
1631/*
1632 * Send a management message through control queue 0
1633 */
1634int
1635t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1636{
1637	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1638}
1639
1640
1641/**
1642 *	free_qset - free the resources of an SGE queue set
1643 *	@sc: the controller owning the queue set
1644 *	@q: the queue set
1645 *
1646 *	Release the HW and SW resources associated with an SGE queue set, such
1647 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1648 *	queue set must be quiesced prior to calling this.
1649 */
1650void
1651t3_free_qset(adapter_t *sc, struct sge_qset *q)
1652{
1653	int i;
1654
1655	t3_free_tx_desc_all(&q->txq[TXQ_ETH]);
1656
1657	for (i = 0; i < SGE_TXQ_PER_SET; i++)
1658		if (q->txq[i].txq_mr.br_ring != NULL) {
1659			free(q->txq[i].txq_mr.br_ring, M_DEVBUF);
1660			mtx_destroy(&q->txq[i].txq_mr.br_lock);
1661		}
1662	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1663		if (q->fl[i].desc) {
1664			mtx_lock(&sc->sge.reg_lock);
1665			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1666			mtx_unlock(&sc->sge.reg_lock);
1667			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1668			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1669					q->fl[i].desc_map);
1670			bus_dma_tag_destroy(q->fl[i].desc_tag);
1671			bus_dma_tag_destroy(q->fl[i].entry_tag);
1672		}
1673		if (q->fl[i].sdesc) {
1674			free_rx_bufs(sc, &q->fl[i]);
1675			free(q->fl[i].sdesc, M_DEVBUF);
1676		}
1677	}
1678
1679	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
1680		if (q->txq[i].desc) {
1681			mtx_lock(&sc->sge.reg_lock);
1682			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1683			mtx_unlock(&sc->sge.reg_lock);
1684			bus_dmamap_unload(q->txq[i].desc_tag,
1685					q->txq[i].desc_map);
1686			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1687					q->txq[i].desc_map);
1688			bus_dma_tag_destroy(q->txq[i].desc_tag);
1689			bus_dma_tag_destroy(q->txq[i].entry_tag);
1690			MTX_DESTROY(&q->txq[i].lock);
1691		}
1692		if (q->txq[i].sdesc) {
1693			free(q->txq[i].sdesc, M_DEVBUF);
1694		}
1695	}
1696
1697	if (q->rspq.desc) {
1698		mtx_lock(&sc->sge.reg_lock);
1699		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1700		mtx_unlock(&sc->sge.reg_lock);
1701
1702		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1703		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1704			        q->rspq.desc_map);
1705		bus_dma_tag_destroy(q->rspq.desc_tag);
1706		MTX_DESTROY(&q->rspq.lock);
1707	}
1708
1709	bzero(q, sizeof(*q));
1710}
1711
1712/**
1713 *	t3_free_sge_resources - free SGE resources
1714 *	@sc: the adapter softc
1715 *
1716 *	Frees resources used by the SGE queue sets.
1717 */
1718void
1719t3_free_sge_resources(adapter_t *sc)
1720{
1721	int i, nqsets;
1722
1723#ifdef IFNET_MULTIQUEUE
1724	panic("%s should not be called when IFNET_MULTIQUEUE is defined", __FUNCTION__);
1725#endif
1726	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1727		nqsets += sc->port[i].nqsets;
1728
1729	for (i = 0; i < nqsets; ++i)
1730		t3_free_qset(sc, &sc->sge.qs[i]);
1731}
1732
1733/**
1734 *	t3_sge_start - enable SGE
1735 *	@sc: the controller softc
1736 *
1737 *	Enables the SGE for DMAs.  This is the last step in starting packet
1738 *	transfers.
1739 */
1740void
1741t3_sge_start(adapter_t *sc)
1742{
1743	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1744}
1745
1746/**
1747 *	t3_sge_stop - disable SGE operation
1748 *	@sc: the adapter
1749 *
 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
1751 *	from error interrupts) or from normal process context.  In the latter
1752 *	case it also disables any pending queue restart tasklets.  Note that
1753 *	if it is called in interrupt context it cannot disable the restart
1754 *	tasklets as it cannot wait, however the tasklets will have no effect
1755 *	since the doorbells are disabled and the driver will call this again
1756 *	later from process context, at which time the tasklets will be stopped
1757 *	if they are still running.
1758 */
1759void
1760t3_sge_stop(adapter_t *sc)
1761{
1762	int i, nqsets;
1763
1764	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1765
1766	if (sc->tq == NULL)
1767		return;
1768
1769	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1770		nqsets += sc->port[i].nqsets;
1771#ifdef notyet
1772	/*
1773	 *
1774	 * XXX
1775	 */
1776	for (i = 0; i < nqsets; ++i) {
1777		struct sge_qset *qs = &sc->sge.qs[i];
1778
1779		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
1780		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
1781	}
1782#endif
1783}
1784
1785/**
1786 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
 *	@q: the Tx queue to reclaim descriptors from
 *	@reclaimable: the number of descriptors to reclaim
 *
 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
 *	Tx buffers.  Called with the Tx queue lock held.
1797 */
1798void
1799t3_free_tx_desc(struct sge_txq *q, int reclaimable)
1800{
1801	struct tx_sw_desc *txsd;
1802	unsigned int cidx;
1803
1804#ifdef T3_TRACE
1805	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1806		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
1807#endif
1808	cidx = q->cidx;
1809	txsd = &q->sdesc[cidx];
1810	DPRINTF("reclaiming %d WR\n", reclaimable);
1811	mtx_assert(&q->lock, MA_OWNED);
1812	while (reclaimable--) {
1813		DPRINTF("cidx=%d d=%p\n", cidx, txsd);
1814		if (txsd->mi.mi_base != NULL) {
1815			if (txsd->flags & TX_SW_DESC_MAPPED) {
1816				bus_dmamap_unload(q->entry_tag, txsd->map);
1817				txsd->flags &= ~TX_SW_DESC_MAPPED;
1818			}
1819			m_freem_iovec(&txsd->mi);
1820			buf_ring_scan(&q->txq_mr, txsd->mi.mi_base, __FILE__, __LINE__);
1821			txsd->mi.mi_base = NULL;
1822
1823#if defined(DIAGNOSTIC) && 0
1824			if (m_get_priority(txsd->m[0]) != cidx)
1825				printf("pri=%d cidx=%d\n",
1826				    (int)m_get_priority(txsd->m[0]), cidx);
1827#endif
1828
1829		} else
1830			q->txq_skipped++;
1831
1832		++txsd;
1833		if (++cidx == q->size) {
1834			cidx = 0;
1835			txsd = q->sdesc;
1836		}
1837	}
1838	q->cidx = cidx;
1839
1840}
1841
1842void
1843t3_free_tx_desc_all(struct sge_txq *q)
1844{
1845	int i;
1846	struct tx_sw_desc *txsd;
1847
1848	for (i = 0; i < q->size; i++) {
1849		txsd = &q->sdesc[i];
1850		if (txsd->mi.mi_base != NULL) {
1851			if (txsd->flags & TX_SW_DESC_MAPPED) {
1852				bus_dmamap_unload(q->entry_tag, txsd->map);
1853				txsd->flags &= ~TX_SW_DESC_MAPPED;
1854			}
1855			m_freem_iovec(&txsd->mi);
1856			bzero(&txsd->mi, sizeof(txsd->mi));
1857		}
1858	}
1859}
1860
1861/**
1862 *	is_new_response - check if a response is newly written
1863 *	@r: the response descriptor
1864 *	@q: the response queue
1865 *
1866 *	Returns true if a response descriptor contains a yet unprocessed
1867 *	response.
1868 */
1869static __inline int
1870is_new_response(const struct rsp_desc *r,
1871    const struct sge_rspq *q)
1872{
1873	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1874}
1875
1876#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1877#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1878			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1879			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1880			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1881
1882/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1883#define NOMEM_INTR_DELAY 2500
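/* i.e. 2500 * 0.1us == 250us */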
1884
1885/**
1886 *	write_ofld_wr - write an offload work request
1887 *	@adap: the adapter
1888 *	@m: the packet to send
1889 *	@q: the Tx queue
1890 *	@pidx: index of the first Tx descriptor to write
1891 *	@gen: the generation value to use
1892 *	@ndesc: number of descriptors the packet will occupy
1893 *
1894 *	Write an offload work request to send the supplied packet.  The packet
1895 *	data already carry the work request with most fields populated.
1896 */
1897static void
1898write_ofld_wr(adapter_t *adap, struct mbuf *m,
1899    struct sge_txq *q, unsigned int pidx,
1900    unsigned int gen, unsigned int ndesc,
1901    bus_dma_segment_t *segs, unsigned int nsegs)
1902{
1903	unsigned int sgl_flits, flits;
1904	struct work_request_hdr *from;
1905	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1906	struct tx_desc *d = &q->desc[pidx];
1907	struct txq_state txqs;
1908
1909	if (immediate(m) && segs == NULL) {
1910		write_imm(d, m, m->m_len, gen);
1911		return;
1912	}
1913
1914	/* Only TX_DATA builds SGLs */
1915	from = mtod(m, struct work_request_hdr *);
1916	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
1917
1918	flits = m->m_len / 8;
1919	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1920
1921	make_sgl(sgp, segs, nsegs);
1922	sgl_flits = sgl_len(nsegs);
1923
1924	txqs.gen = gen;
1925	txqs.pidx = pidx;
1926	txqs.compl = 0;
1927
1928	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1929	    from->wr_hi, from->wr_lo);
1930}
1931
1932/**
1933 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1934 *	@m: the packet
1935 *
1936 * 	Returns the number of Tx descriptors needed for the given offload
1937 * 	packet.  These packets are already fully constructed.
1938 */
1939static __inline unsigned int
1940calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
1941{
1942	unsigned int flits, cnt = 0;
1943
1944
1945	if (m->m_len <= WR_LEN)
1946		return 1;                 /* packet fits as immediate data */
1947
1948	if (m->m_flags & M_IOVEC)
1949		cnt = mtomv(m)->mv_count;
1950
1951	/* headers */
1952	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
1953
1954	return flits_to_desc(flits + sgl_len(cnt));
1955}
1956
1957/**
1958 *	ofld_xmit - send a packet through an offload queue
1959 *	@adap: the adapter
1960 *	@q: the Tx offload queue
1961 *	@m: the packet
1962 *
1963 *	Send an offload packet through an SGE offload queue.
1964 */
1965static int
1966ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1967{
1968	int ret, nsegs;
1969	unsigned int ndesc;
1970	unsigned int pidx, gen;
1971	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
1972	struct tx_sw_desc *stx;
1973
1974	nsegs = m_get_sgllen(m);
1975	vsegs = m_get_sgl(m);
1976	ndesc = calc_tx_descs_ofld(m, nsegs);
1977	busdma_map_sgl(vsegs, segs, nsegs);
1978
1979	stx = &q->sdesc[q->pidx];
1980	KASSERT(stx->mi.mi_base == NULL, ("mi_base set"));
1981
1982	mtx_lock(&q->lock);
1983again:	reclaim_completed_tx_(q, 16);
1984	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
1985	if (__predict_false(ret)) {
1986		if (ret == 1) {
1987			printf("no ofld desc avail\n");
1988
1989			m_set_priority(m, ndesc);     /* save for restart */
1990			mtx_unlock(&q->lock);
1991			return (EINTR);
1992		}
1993		goto again;
1994	}
1995
1996	gen = q->gen;
1997	q->in_use += ndesc;
1998	pidx = q->pidx;
1999	q->pidx += ndesc;
2000	if (q->pidx >= q->size) {
2001		q->pidx -= q->size;
2002		q->gen ^= 1;
2003	}
2004#ifdef T3_TRACE
2005	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2006		  "ofld_xmit: ndesc %u, pidx %u, len %u, first_len %u, nsegs %u",
2007		  ndesc, pidx, m->m_pkthdr.len, m->m_len,
2008		  nsegs);
2009#endif
2010	mtx_unlock(&q->lock);
2011
2012	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2013	check_ring_tx_db(adap, q);
2014
2015	return (0);
2016}
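
/*
 * Suspension protocol (sketch, based on the restart path below): when
 * check_desc_avail() reports the queue full (ret == 1) the mbuf is presumed
 * to be left on q->sendq with its priority field holding ndesc, and EINTR
 * tells the caller the packet was deferred rather than written.
 * restart_offloadq() later drains that backlog roughly as:
 *
 *	while ((m = mbufq_peek(&q->sendq)) != NULL) {
 *		ndesc = m_get_priority(m);
 *		if (q->size - q->in_use < ndesc)
 *			break;
 *		(void)mbufq_dequeue(&q->sendq);
 *		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
 *	}
 */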
2017
2018/**
2019 *	restart_offloadq - restart a suspended offload queue
2020 *	@qs: the queue set containing the offload queue
2021 *
2022 *	Resumes transmission on a suspended Tx offload queue.
2023 */
2024static void
2025restart_offloadq(void *data, int npending)
2026{
2027	struct mbuf *m;
2028	struct sge_qset *qs = data;
2029	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2030	adapter_t *adap = qs->port->adapter;
2031	bus_dma_segment_t segs[TX_MAX_SEGS];
2032	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2033	int nsegs, cleaned;
2034
2035	mtx_lock(&q->lock);
2036again:	cleaned = reclaim_completed_tx_(q, 16);
2037
2038	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2039		unsigned int gen, pidx;
2040		unsigned int ndesc = m_get_priority(m);
2041
2042		if (__predict_false(q->size - q->in_use < ndesc)) {
2043			setbit(&qs->txq_stopped, TXQ_OFLD);
2044			smp_mb();
2045
2046			if (should_restart_tx(q) &&
2047			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2048				goto again;
2049			q->stops++;
2050			break;
2051		}
2052
2053		gen = q->gen;
2054		q->in_use += ndesc;
2055		pidx = q->pidx;
2056		q->pidx += ndesc;
2057		if (q->pidx >= q->size) {
2058			q->pidx -= q->size;
2059			q->gen ^= 1;
2060		}
2061
2062		(void)mbufq_dequeue(&q->sendq);
2063		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2064		mtx_unlock(&q->lock);
2065		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2066		mtx_lock(&q->lock);
2067	}
2068	mtx_unlock(&q->lock);
2069
2070#if USE_GTS
2071	set_bit(TXQ_RUNNING, &q->flags);
2072	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2073#endif
2074	t3_write_reg(adap, A_SG_KDOORBELL,
2075		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2076}
2077
2078/**
2079 *	queue_set - return the queue set a packet should use
2080 *	@m: the packet
2081 *
2082 *	Maps a packet to the SGE queue set it should use.  The desired queue
2083 *	set is carried in bits 1-3 in the packet's priority.
2084 */
2085static __inline int
2086queue_set(const struct mbuf *m)
2087{
2088	return m_get_priority(m) >> 1;
2089}
2090
2091/**
2092 *	is_ctrl_pkt - return whether an offload packet is a control packet
2093 *	@m: the packet
2094 *
2095 *	Determines whether an offload packet should use an OFLD or a CTRL
2096 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2097 */
2098static __inline int
2099is_ctrl_pkt(const struct mbuf *m)
2100{
2101	return m_get_priority(m) & 1;
2102}
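
/*
 * Example of the priority encoding consumed by queue_set() and is_ctrl_pkt()
 * (hypothetical sender, illustration only): an offload client targeting
 * queue set 2 would mark a control-queue packet as
 *
 *	m_set_priority(m, (2 << 1) | 1);
 *
 * and a regular offload packet for the same queue set as
 *
 *	m_set_priority(m, 2 << 1);
 */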
2103
2104/**
2105 *	t3_offload_tx - send an offload packet
2106 *	@tdev: the offload device to send to
2107 *	@m: the packet
2108 *
2109 *	Sends an offload packet.  We use the packet priority to select the
2110 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2111 *	should be sent as regular or control, bits 1-3 select the queue set.
2112 */
2113int
2114t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2115{
2116	adapter_t *adap = tdev2adap(tdev);
2117	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2118
2119	if (__predict_false(is_ctrl_pkt(m)))
2120		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
2121
2122	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
2123}
2124
2125/**
2126 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2127 *	@tdev: the offload device that will be receiving the packets
2128 *	@q: the SGE response queue that assembled the bundle
2129 *	@m: the partial bundle
2130 *	@n: the number of packets in the bundle
2131 *
2132 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2133 */
2134static __inline void
2135deliver_partial_bundle(struct t3cdev *tdev,
2136			struct sge_rspq *q,
2137			struct mbuf *mbufs[], int n)
2138{
2139	if (n) {
2140		q->offload_bundles++;
2141		cxgb_ofld_recv(tdev, mbufs, n);
2142	}
2143}
2144
2145static __inline int
2146rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2147    struct mbuf *m, struct mbuf *rx_gather[],
2148    unsigned int gather_idx)
2149{
2150
2151	rq->offload_pkts++;
2152	m->m_pkthdr.header = mtod(m, void *);
2153	rx_gather[gather_idx++] = m;
2154	if (gather_idx == RX_BUNDLE_SIZE) {
2155		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2156		gather_idx = 0;
2157		rq->offload_bundles++;
2158	}
2159	return (gather_idx);
2160}
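
/*
 * Bundling flow (for reference): process_responses() below feeds each
 * offload packet through rx_offload(), which flushes a full bundle of
 * RX_BUNDLE_SIZE mbufs to the offload device, and any remainder is pushed
 * out at the end of the loop:
 *
 *	ngathered = rx_offload(&adap->tdev, rspq, m, offload_mbufs, ngathered);
 *	...
 *	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
 */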
2161
2162static void
2163restart_tx(struct sge_qset *qs)
2164{
2165	struct adapter *sc = qs->port->adapter;
2166
2167
2168	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2169	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2170	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2171		qs->txq[TXQ_OFLD].restarts++;
2172		DPRINTF("restarting TXQ_OFLD\n");
2173		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2174	}
2175	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2176	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2177	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2178	    qs->txq[TXQ_CTRL].in_use);
2179
2180	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2181	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2182	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2183		qs->txq[TXQ_CTRL].restarts++;
2184		DPRINTF("restarting TXQ_CTRL\n");
2185		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2186	}
2187}
2188
2189/**
2190 *	t3_sge_alloc_qset - initialize an SGE queue set
2191 *	@sc: the controller softc
2192 *	@id: the queue set id
2193 *	@nports: how many Ethernet ports will be using this queue set
2194 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2195 *	@p: configuration parameters for this queue set
2196 *	@ntxq: number of Tx queues for the queue set
2197 *	@pi: port info for queue set
2198 *
2199 *	Allocate resources and initialize an SGE queue set.  A queue set
2200 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2201 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2202 *	queue, offload queue, and control queue.
2203 */
2204int
2205t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2206		  const struct qset_params *p, int ntxq, struct port_info *pi)
2207{
2208	struct sge_qset *q = &sc->sge.qs[id];
2209	int i, header_size, ret = 0;
2210
2211	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2212		if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *),
2213			    M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
2214			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2215			goto err;
2216		}
2217		q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0;
2218		q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size;
2219		mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF);
2220	}
2221
2222	init_qset_cntxt(q, id);
2223	q->idx = id;
2224
2225	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2226		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2227		    &q->fl[0].desc, &q->fl[0].sdesc,
2228		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2229		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2230		printf("error %d from alloc ring fl0\n", ret);
2231		goto err;
2232	}
2233
2234	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2235		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2236		    &q->fl[1].desc, &q->fl[1].sdesc,
2237		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2238		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2239		printf("error %d from alloc ring fl1\n", ret);
2240		goto err;
2241	}
2242
2243	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2244		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2245		    &q->rspq.desc_tag, &q->rspq.desc_map,
2246		    NULL, NULL)) != 0) {
2247		printf("error %d from alloc ring rspq\n", ret);
2248		goto err;
2249	}
2250
2251	for (i = 0; i < ntxq; ++i) {
2252		/*
2253		 * The control queue always uses immediate data so does not
2254		 * need to keep track of any mbufs.
2255		 * XXX Placeholder for future TOE support.
2256		 */
2257		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2258
2259		if ((ret = alloc_ring(sc, p->txq_size[i],
2260			    sizeof(struct tx_desc), sz,
2261			    &q->txq[i].phys_addr, &q->txq[i].desc,
2262			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2263			    &q->txq[i].desc_map,
2264			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2265			printf("error %d from alloc ring tx %i\n", ret, i);
2266			goto err;
2267		}
2268		mbufq_init(&q->txq[i].sendq);
2269		q->txq[i].gen = 1;
2270		q->txq[i].size = p->txq_size[i];
2271		snprintf(q->txq[i].lockbuf, TXQ_NAME_LEN, "t3 txq lock %d:%d:%d",
2272		    device_get_unit(sc->dev), irq_vec_idx, i);
2273		MTX_INIT(&q->txq[i].lock, q->txq[i].lockbuf, NULL, MTX_DEF);
2274	}
2275
2276	q->txq[TXQ_ETH].port = pi;
2277
2278	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2279	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2280	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_ETH]);
2281	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_OFLD]);
2282
2283	q->fl[0].gen = q->fl[1].gen = 1;
2284	q->fl[0].size = p->fl_size;
2285	q->fl[1].size = p->jumbo_size;
2286
2287	q->rspq.gen = 1;
2288	q->rspq.cidx = 0;
2289	q->rspq.size = p->rspq_size;
2290
2291
2292	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
2293	q->txq[TXQ_ETH].stop_thres = nports *
2294	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2295
2296	q->fl[0].buf_size = (MCLBYTES - header_size);
2297	q->fl[0].zone = zone_clust;
2298	q->fl[0].type = EXT_CLUSTER;
2299#if __FreeBSD_version > 800000
2300	if (cxgb_use_16k_clusters) {
2301		q->fl[1].buf_size = MJUM16BYTES - header_size;
2302		q->fl[1].zone = zone_jumbo16;
2303		q->fl[1].type = EXT_JUMBO16;
2304	} else {
2305		q->fl[1].buf_size = MJUM9BYTES - header_size;
2306		q->fl[1].zone = zone_jumbo9;
2307		q->fl[1].type = EXT_JUMBO9;
2308	}
2309#else
2310	q->fl[1].buf_size = MJUMPAGESIZE - header_size;
2311	q->fl[1].zone = zone_jumbop;
2312	q->fl[1].type = EXT_JUMBOP;
2313#endif
2314	q->lro.enabled = lro_default;
2315
2316	mtx_lock(&sc->sge.reg_lock);
2317	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2318				   q->rspq.phys_addr, q->rspq.size,
2319				   q->fl[0].buf_size, 1, 0);
2320	if (ret) {
2321		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2322		goto err_unlock;
2323	}
2324
2325	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2326		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2327					  q->fl[i].phys_addr, q->fl[i].size,
2328					  q->fl[i].buf_size, p->cong_thres, 1,
2329					  0);
2330		if (ret) {
2331			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2332			goto err_unlock;
2333		}
2334	}
2335
2336	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2337				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2338				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2339				 1, 0);
2340	if (ret) {
2341		printf("error %d from t3_sge_init_ecntxt\n", ret);
2342		goto err_unlock;
2343	}
2344
2345	if (ntxq > 1) {
2346		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2347					 USE_GTS, SGE_CNTXT_OFLD, id,
2348					 q->txq[TXQ_OFLD].phys_addr,
2349					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2350		if (ret) {
2351			printf("error %d from t3_sge_init_ecntxt\n", ret);
2352			goto err_unlock;
2353		}
2354	}
2355
2356	if (ntxq > 2) {
2357		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2358					 SGE_CNTXT_CTRL, id,
2359					 q->txq[TXQ_CTRL].phys_addr,
2360					 q->txq[TXQ_CTRL].size,
2361					 q->txq[TXQ_CTRL].token, 1, 0);
2362		if (ret) {
2363			printf("error %d from t3_sge_init_ecntxt\n", ret);
2364			goto err_unlock;
2365		}
2366	}
2367
2368	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2369	    device_get_unit(sc->dev), irq_vec_idx);
2370	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2371
2372	mtx_unlock(&sc->sge.reg_lock);
2373	t3_update_qset_coalesce(q, p);
2374	q->port = pi;
2375
2376	refill_fl(sc, &q->fl[0], q->fl[0].size);
2377	refill_fl(sc, &q->fl[1], q->fl[1].size);
2378	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2379
2380	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2381		     V_NEWTIMER(q->rspq.holdoff_tmr));
2382
2383	return (0);
2384
2385err_unlock:
2386	mtx_unlock(&sc->sge.reg_lock);
2387err:
2388	t3_free_qset(sc, q);
2389
2390	return (ret);
2391}
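
/*
 * Illustrative caller (hypothetical, for orientation only): a queue set is
 * typically allocated per configured port queue during interface bring-up,
 * along the lines of:
 *
 *	for (i = 0; i < pi->nqsets; i++) {
 *		err = t3_sge_alloc_qset(sc, pi->first_qset + i,
 *		    sc->params.nports, irq_vec_idx,
 *		    &sc->params.sge.qset[pi->first_qset + i], ntxq, pi);
 *		if (err)
 *			return (err);
 *	}
 */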
2392
2393void
2394t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2395{
2396	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2397	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2398	struct ifnet *ifp = pi->ifp;
2399
2400	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2401
2402	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2403	    cpl->csum_valid && cpl->csum == 0xffff) {
2404		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2405		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2407		m->m_pkthdr.csum_data = 0xffff;
2408	}
2409	/*
2410	 * XXX need to add VLAN support for 6.x
2411	 */
2412#ifdef VLAN_SUPPORTED
2413	if (__predict_false(cpl->vlan_valid)) {
2414		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2415		m->m_flags |= M_VLANTAG;
2416	}
2417#endif
2418
2419	m->m_pkthdr.rcvif = ifp;
2420	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2421#ifndef DISABLE_MBUF_IOVEC
2422	m_explode(m);
2423#endif
2424	/*
2425	 * adjust after conversion to mbuf chain
2426	 */
2427	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2428	m->m_len -= (sizeof(*cpl) + ethpad);
2429	m->m_data += (sizeof(*cpl) + ethpad);
2430
2431	(*ifp->if_input)(ifp, m);
2432}
2433
2434static void
2435ext_free_handler(void *cl, void *arg)
2436{
2437	uintptr_t type = (uintptr_t)arg;
2438	uma_zone_t zone;
2439	struct mbuf *m;
2440
2441	m = cl;
2442	zone = m_getzonefromtype(type);
2443	m->m_ext.ext_type = (int)type;
2444	cxgb_ext_freed++;
2445	cxgb_cache_put(zone, cl);
2446}
2447
2448static void
2449init_cluster_mbuf(caddr_t cl, int flags, int type, uma_zone_t zone)
2450{
2451	struct mbuf *m;
2452	int header_size;
2453
2454	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) +
2455	    sizeof(struct m_ext_) + sizeof(uint32_t);
2456
2457	bzero(cl, header_size);
2458	m = (struct mbuf *)cl;
2459
2460	SLIST_INIT(&m->m_pkthdr.tags);
2461	m->m_type = MT_DATA;
2462	m->m_flags = flags | M_NOFREE | M_EXT;
2463	m->m_data = cl + header_size;
2464	m->m_ext.ext_buf = cl;
2465	m->m_ext.ref_cnt = (uint32_t *)(cl + header_size - sizeof(uint32_t));
2466	m->m_ext.ext_size = m_getsizefromtype(type);
2467	m->m_ext.ext_free = ext_free_handler;
2468	m->m_ext.ext_args = (void *)(uintptr_t)type;
2469	m->m_ext.ext_type = EXT_EXTREF;
2470	*(m->m_ext.ref_cnt) = 1;
2471	DPRINTF("data=%p ref_cnt=%p\n", m->m_data, m->m_ext.ref_cnt);
2472}
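
/*
 * Resulting cluster layout (derived from the code above): header_size bytes
 * at the front of the cluster are carved up for the embedded mbuf and its
 * external-storage bookkeeping, and the packet data follows:
 *
 *	cl ------> +-----------------------+
 *	           | struct m_hdr          |
 *	           | struct pkthdr         |
 *	           | struct m_ext_         |
 *	           | uint32_t ref_cnt      |
 *	m_data --> | packet data ...       |
 *	           +-----------------------+
 */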
2473
2474
2475/**
2476 *	get_packet - return the next ingress packet buffer from a free list
2477 *	@adap: the adapter that received the packet
2478 *	@drop_thres: # of remaining buffers before we start dropping packets
2479 *	@qs: the qset that the SGE free list holding the packet belongs to
2480 *      @mh: the mbuf header, containing pointers to the head and tail of the mbuf chain
2481 *      @r: response descriptor
2482 *
2483 *	Get the next packet from a free list and complete setup of the
2484 *	sk_buff.  If the packet is small we make a copy and recycle the
2485 *	mbuf.  If the packet is small we make a copy and recycle the
2486 *	positive drop threshold is supplied packets are dropped and their
2487 *	buffers recycled if (a) the number of remaining buffers is under the
2488 *	threshold and the packet is too big to copy, or (b) the packet should
2489 *	be copied but there is no memory for the copy.
2490 */
2491#ifdef DISABLE_MBUF_IOVEC
2492
2493static int
2494get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2495    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2496{
2497
2498	unsigned int len_cq =  ntohl(r->len_cq);
2499	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2500	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2501	uint32_t len = G_RSPD_LEN(len_cq);
2502	uint32_t flags = ntohl(r->flags);
2503	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2504	struct mbuf *m;
2505	struct mbuf *m, *m0;
2506	void *cl;
2507
2508	prefetch(sd->rxsd_cl);
2509
2510	fl->credits--;
2511	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2512
2513	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2514		if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2515			goto skip_recycle;
2516		cl = mtod(m0, void *);
2517		memcpy(cl, sd->data, len);
2518		recycle_rx_buf(adap, fl, fl->cidx);
2519		m = m0;
2520	} else {
2521	skip_recycle:
2522	skip_recycle: ;
2523		int flags = 0;
2524		cl = sd->rxsd_cl;
2525		m = m0 = (struct mbuf *)cl;
2526
2527		m0->m_len = len;
2528		if ((sopeop == RSPQ_SOP_EOP) ||
2529		    (sopeop == RSPQ_SOP))
2530			flags = M_PKTHDR;
2531		init_cluster_mbuf(cl, flags, fl->type, fl->zone);
2532	}
2533
2534	switch(sopeop) {
2535	case RSPQ_SOP_EOP:
2536		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2537		mh->mh_head = mh->mh_tail = m;
2538		m->m_pkthdr.len = len;
2539		ret = 1;
2540		break;
2541	case RSPQ_NSOP_NEOP:
2542		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2543		if (mh->mh_tail == NULL) {
2544			printf("discarding intermediate descriptor entry\n");
2545			m_freem(m);
2546			break;
2547		}
2548		mh->mh_tail->m_next = m;
2549		mh->mh_tail = m;
2550		mh->mh_head->m_pkthdr.len += len;
2551		ret = 0;
2552		break;
2553	case RSPQ_SOP:
2554		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2555		m->m_pkthdr.len = len;
2556		mh->mh_head = mh->mh_tail = m;
2557		ret = 0;
2558		break;
2559	case RSPQ_EOP:
2560		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2561		mh->mh_head->m_pkthdr.len += len;
2562		mh->mh_tail->m_next = m;
2563		mh->mh_tail = m;
2564		ret = 1;
2565		break;
2566	}
2567	if (++fl->cidx == fl->size)
2568		fl->cidx = 0;
2569
2570	return (ret);
2571}
2572
2573#else
2574
2575static int
2576get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2577    struct mbuf **m, struct rsp_desc *r)
2578{
2579
2580	unsigned int len_cq =  ntohl(r->len_cq);
2581	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2582	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2583	uint32_t len = G_RSPD_LEN(len_cq);
2584	uint32_t flags = ntohl(r->flags);
2585	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2586	void *cl;
2587	int ret = 0;
2588	struct mbuf *m0;
2589#if 0
2590	if ((sd + 1 )->rxsd_cl)
2591		prefetch((sd + 1)->rxsd_cl);
2592	if ((sd + 2)->rxsd_cl)
2593		prefetch((sd + 2)->rxsd_cl);
2594#endif
2595	DPRINTF("rx cpu=%d\n", curcpu);
2596	fl->credits--;
2597	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2598
2599	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2600		if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2601			goto skip_recycle;
2602		cl = mtod(m0, void *);
2603		memcpy(cl, sd->data, len);
2604		recycle_rx_buf(adap, fl, fl->cidx);
2605		*m = m0;
2606	} else {
2607	skip_recycle:
2608		bus_dmamap_unload(fl->entry_tag, sd->map);
2609		cl = sd->rxsd_cl;
2610		*m = m0 = (struct mbuf *)cl;
2611	}
2612
2613	switch(sopeop) {
2614	case RSPQ_SOP_EOP:
2615		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2616		if (cl == sd->rxsd_cl)
2617			init_cluster_mbuf(cl, M_PKTHDR, fl->type, fl->zone);
2618		m0->m_len = m0->m_pkthdr.len = len;
2619		ret = 1;
2620		goto done;
2621		break;
2622	case RSPQ_NSOP_NEOP:
2623		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2624		panic("chaining unsupported");
2625		ret = 0;
2626		break;
2627	case RSPQ_SOP:
2628		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2629		panic("chaining unsupported");
2630		m_iovinit(m0);
2631		ret = 0;
2632		break;
2633	case RSPQ_EOP:
2634		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2635		panic("chaining unsupported");
2636		ret = 1;
2637		break;
2638	}
2639	panic("append not supported");
2640#if 0
2641	m_iovappend(m0, cl, fl->buf_size, len, sizeof(uint32_t), sd->rxsd_ref);
2642#endif
2643done:
2644	if (++fl->cidx == fl->size)
2645		fl->cidx = 0;
2646
2647	return (ret);
2648}
2649#endif
2650/**
2651 *	handle_rsp_cntrl_info - handles control information in a response
2652 *	@qs: the queue set corresponding to the response
2653 *	@flags: the response control flags
2654 *
2655 *	Handles the control information of an SGE response, such as GTS
2656 *	indications and completion credits for the queue set's Tx queues.
2657 *	HW coalesces credits, we don't do any extra SW coalescing.
2658 */
2659static __inline void
2660handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2661{
2662	unsigned int credits;
2663
2664#if USE_GTS
2665	if (flags & F_RSPD_TXQ0_GTS)
2666		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2667#endif
2668	credits = G_RSPD_TXQ0_CR(flags);
2669	if (credits)
2670		qs->txq[TXQ_ETH].processed += credits;
2671
2672	credits = G_RSPD_TXQ2_CR(flags);
2673	if (credits)
2674		qs->txq[TXQ_CTRL].processed += credits;
2675
2676# if USE_GTS
2677	if (flags & F_RSPD_TXQ1_GTS)
2678		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2679# endif
2680	credits = G_RSPD_TXQ1_CR(flags);
2681	if (credits)
2682		qs->txq[TXQ_OFLD].processed += credits;
2683
2684}
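
/*
 * For reference: process_responses() only calls this when the response
 * carries completion credits or GTS bits, e.g.:
 *
 *	if (flags & RSPD_CTRL_MASK) {
 *		sleeping |= flags & RSPD_GTS_MASK;
 *		handle_rsp_cntrl_info(qs, flags);
 *	}
 *
 * The per-queue 'processed' counts updated here are what the Tx reclaim
 * paths are expected to consume (compared against 'cleaned') to decide how
 * many descriptors to free.
 */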
2685
2686static void
2687check_ring_db(adapter_t *adap, struct sge_qset *qs,
2688    unsigned int sleeping)
2689{
2690	;
2691}
2692
2693/**
2694 *	process_responses - process responses from an SGE response queue
2695 *	@adap: the adapter
2696 *	@qs: the queue set to which the response queue belongs
2697 *	@budget: how many responses can be processed in this round
2698 *
2699 *	Process responses from an SGE response queue up to the supplied budget.
2700 *	Responses include received packets as well as credits and other events
2701 *	for the queues that belong to the response queue's queue set.
2702 *	A negative budget is effectively unlimited.
2703 *
2704 *	Additionally choose the interrupt holdoff time for the next interrupt
2705 *	on this queue.  If the system is under memory shortage use a fairly
2706 *	long delay to help recovery.
2707 */
2708int
2709process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2710{
2711	struct sge_rspq *rspq = &qs->rspq;
2712	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2713	int budget_left = budget;
2714	unsigned int sleeping = 0;
2715	int lro = qs->lro.enabled;
2716	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2717	int ngathered = 0;
2718#ifdef DEBUG
2719	static int last_holdoff = 0;
2720	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2721		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2722		last_holdoff = rspq->holdoff_tmr;
2723	}
2724#endif
2725	rspq->next_holdoff = rspq->holdoff_tmr;
2726
2727	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2728		int eth, eop = 0, ethpad = 0;
2729		uint32_t flags = ntohl(r->flags);
2730		uint32_t rss_csum = *(const uint32_t *)r;
2731		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2732
2733		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2734
2735		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2736			/* XXX */
2737			printf("async notification\n");
2738
2739		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2740			struct mbuf *m = NULL;
2741
2742#ifdef DISABLE_MBUF_IOVEC
2743			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
2744			    r->rss_hdr.opcode, rspq->cidx);
2745
2746			m = rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2747			if (m == NULL) {
2748				rspq->next_holdoff = NOMEM_INTR_DELAY;
2749				budget_left--;
2750				break;
2751			}
2752
2753			get_imm_packet(adap, r, &rspq->rspq_mh);
2754			eop = 1;
2755#else
2756			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
2757			    r->rss_hdr.opcode, rspq->cidx);
2758			if (rspq->rspq_mbuf == NULL)
2759				rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
2760			else
2761				m = m_gethdr(M_DONTWAIT, MT_DATA);
2762
2763			/*
2764			 * XXX revisit me
2765			 */
2766			if (rspq->rspq_mbuf == NULL &&  m == NULL) {
2767				rspq->next_holdoff = NOMEM_INTR_DELAY;
2768				budget_left--;
2769				break;
2770			}
2771			get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags);
2772
2773			eop = 1;
2774			rspq->imm_data++;
2775#endif
2776		} else if (r->len_cq) {
2777			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2778
2779#ifdef DISABLE_MBUF_IOVEC
2780			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
2781#else
2782			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mbuf, r);
2783#ifdef IFNET_MULTIQUEUE
2784			rspq->rspq_mbuf->m_pkthdr.rss_hash = rss_hash;
2785#endif
2786#endif
2787			ethpad = 2;
2788		} else {
2789			DPRINTF("pure response\n");
2790			rspq->pure_rsps++;
2791		}
2792
2793		if (flags & RSPD_CTRL_MASK) {
2794			sleeping |= flags & RSPD_GTS_MASK;
2795			handle_rsp_cntrl_info(qs, flags);
2796		}
2797
2798		r++;
2799		if (__predict_false(++rspq->cidx == rspq->size)) {
2800			rspq->cidx = 0;
2801			rspq->gen ^= 1;
2802			r = rspq->desc;
2803		}
2804		prefetch(r);
2805		if (++rspq->credits >= (rspq->size / 4)) {
2806			refill_rspq(adap, rspq, rspq->credits);
2807			rspq->credits = 0;
2808		}
2809		DPRINTF("eth=%d eop=%d flags=0x%x\n", eth, eop, flags);
2810
2811		if (!eth && eop) {
2812			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
2813			/*
2814			 * XXX size mismatch
2815			 */
2816			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
2817
2818			ngathered = rx_offload(&adap->tdev, rspq,
2819			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
2820			rspq->rspq_mh.mh_head = NULL;
2821			DPRINTF("received offload packet\n");
2822
2823		} else if (eth && eop) {
2824			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
2825			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
2826
2827			t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
2828			    rss_hash, rss_csum, lro);
2829			DPRINTF("received tunnel packet\n");
2830				rspq->rspq_mh.mh_head = NULL;
2831			rspq->rspq_mh.mh_head = NULL;
2832		}
2833		__refill_fl_lt(adap, &qs->fl[0], 32);
2834		__refill_fl_lt(adap, &qs->fl[1], 32);
2835		--budget_left;
2836	}
2837
2838	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2839	t3_lro_flush(adap, qs, &qs->lro);
2840
2841	if (sleeping)
2842		check_ring_db(adap, qs, sleeping);
2843
2844	smp_mb();  /* commit Tx queue processed updates */
2845	if (__predict_false(qs->txq_stopped > 1)) {
2846		printf("restarting tx on %p\n", qs);
2847
2848		restart_tx(qs);
2849	}
2850
2851	__refill_fl_lt(adap, &qs->fl[0], 512);
2852	__refill_fl_lt(adap, &qs->fl[1], 512);
2853	budget -= budget_left;
2854	return (budget);
2855}
2856
2857/*
2858 * A helper function that processes responses and issues GTS.
2859 */
2860static __inline int
2861process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2862{
2863	int work;
2864	static int last_holdoff = 0;
2865
2866	work = process_responses(adap, rspq_to_qset(rq), -1);
2867
2868	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2869		printf("next_holdoff=%d\n", rq->next_holdoff);
2870		last_holdoff = rq->next_holdoff;
2871	}
2872	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2873	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2874
2875	return (work);
2876}
2877
2878
2879/*
2880 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2881 * Handles data events from SGE response queues as well as error and other
2882 * async events as they all use the same interrupt pin.  We use one SGE
2883 * response queue per port in this mode and protect all response queues with
2884 * queue 0's lock.
2885 */
2886void
2887t3b_intr(void *data)
2888{
2889	uint32_t i, map;
2890	adapter_t *adap = data;
2891	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2892
2893	t3_write_reg(adap, A_PL_CLI, 0);
2894	map = t3_read_reg(adap, A_SG_DATA_INTR);
2895
2896	if (!map)
2897		return;
2898
2899	if (__predict_false(map & F_ERRINTR))
2900		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2901
2902	mtx_lock(&q0->lock);
2903	for_each_port(adap, i)
2904	    if (map & (1 << i))
2905			process_responses_gts(adap, &adap->sge.qs[i].rspq);
2906	mtx_unlock(&q0->lock);
2907}
2908
2909/*
2910 * The MSI interrupt handler.  This needs to handle data events from SGE
2911 * response queues as well as error and other async events as they all use
2912 * the same MSI vector.  We use one SGE response queue per port in this mode
2913 * and protect all response queues with queue 0's lock.
2914 */
2915void
2916t3_intr_msi(void *data)
2917{
2918	adapter_t *adap = data;
2919	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2920	int i, new_packets = 0;
2921
2922	mtx_lock(&q0->lock);
2923
2924	for_each_port(adap, i)
2925	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
2926		    new_packets = 1;
2927	mtx_unlock(&q0->lock);
2928	if (new_packets == 0)
2929		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2930}
2931
2932void
2933t3_intr_msix(void *data)
2934{
2935	struct sge_qset *qs = data;
2936	adapter_t *adap = qs->port->adapter;
2937	struct sge_rspq *rspq = &qs->rspq;
2938#ifndef IFNET_MULTIQUEUE
2939	mtx_lock(&rspq->lock);
2940#else
2941	if (mtx_trylock(&rspq->lock))
2942#endif
2943	{
2944
2945		if (process_responses_gts(adap, rspq) == 0)
2946			rspq->unhandled_irqs++;
2947		mtx_unlock(&rspq->lock);
2948	}
2949}
2950
2951#define QDUMP_SBUF_SIZE		(32 * 400)
2952static int
2953t3_dump_rspq(SYSCTL_HANDLER_ARGS)
2954{
2955	struct sge_rspq *rspq;
2956	struct sge_qset *qs;
2957	int i, err, dump_end, idx;
2958	static int multiplier = 1;
2959	struct sbuf *sb;
2960	struct rsp_desc *rspd;
2961	uint32_t data[4];
2962
2963	rspq = arg1;
2964	qs = rspq_to_qset(rspq);
2965	if (rspq->rspq_dump_count == 0)
2966		return (0);
2967	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
2968		log(LOG_WARNING,
2969		    "dump count is too large %d\n", rspq->rspq_dump_count);
2970		rspq->rspq_dump_count = 0;
2971		return (EINVAL);
2972	}
2973	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
2974		log(LOG_WARNING,
2975		    "dump start of %d is greater than queue size\n",
2976		    rspq->rspq_dump_start);
2977		rspq->rspq_dump_start = 0;
2978		return (EINVAL);
2979	}
2980	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
2981	if (err)
2982		return (err);
2983retry_sbufops:
2984	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
2985
2986	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
2987	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
2988	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
2989	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
2990	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
2991
2992	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
2993	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
2994
2995	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
2996	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
2997		idx = i & (RSPQ_Q_SIZE-1);
2998
2999		rspd = &rspq->desc[idx];
3000		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3001		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3002		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3003		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3004		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3005		    be32toh(rspd->len_cq), rspd->intr_gen);
3006	}
3007	if (sbuf_overflowed(sb)) {
3008		sbuf_delete(sb);
3009		multiplier++;
3010		goto retry_sbufops;
3011	}
3012	sbuf_finish(sb);
3013	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3014	sbuf_delete(sb);
3015	return (err);
3016}
3017
3018
3019/*
3020 * broken by recent mbuf changes
3021 */
3022static int
3023t3_dump_txq(SYSCTL_HANDLER_ARGS)
3024{
3025	struct sge_txq *txq;
3026	struct sge_qset *qs;
3027	int i, j, err, dump_end;
3028	static int multiplier = 1;
3029	struct sbuf *sb;
3030	struct tx_desc *txd;
3031	uint32_t *WR, wr_hi, wr_lo, gen;
3032	uint32_t data[4];
3033
3034	txq = arg1;
3035	qs = txq_to_qset(txq, TXQ_ETH);
3036	if (txq->txq_dump_count == 0) {
3037		return (0);
3038	}
3039	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3040		log(LOG_WARNING,
3041		    "dump count is too large %d\n", txq->txq_dump_count);
3042		txq->txq_dump_count = 1;
3043		return (EINVAL);
3044	}
3045	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3046		log(LOG_WARNING,
3047		    "dump start of %d is greater than queue size\n",
3048		    txq->txq_dump_start);
3049		txq->txq_dump_start = 0;
3050		return (EINVAL);
3051	}
3052	err = t3_sge_read_ecntxt(qs->port->adapter, txq->cntxt_id, data);
3053	if (err)
3054		return (err);
3055
3056
3057retry_sbufops:
3058	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3059
3060	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3061	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3062	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3063	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3064	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3065	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3066	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3067	    txq->txq_dump_start,
3068	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3069
3070	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3071	for (i = txq->txq_dump_start; i < dump_end; i++) {
3072		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3073		WR = (uint32_t *)txd->flit;
3074		wr_hi = ntohl(WR[0]);
3075		wr_lo = ntohl(WR[1]);
3076		gen = G_WR_GEN(wr_lo);
3077
3078		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3079		    wr_hi, wr_lo, gen);
3080		for (j = 2; j < 30; j += 4)
3081			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3082			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3083
3084	}
3085	if (sbuf_overflowed(sb)) {
3086		sbuf_delete(sb);
3087		multiplier++;
3088		goto retry_sbufops;
3089	}
3090	sbuf_finish(sb);
3091	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3092	sbuf_delete(sb);
3093	return (err);
3094}
3095
3096
3097static int
3098t3_lro_enable(SYSCTL_HANDLER_ARGS)
3099{
3100	adapter_t *sc;
3101	int i, j, enabled, err, nqsets = 0;
3102
3103#ifndef LRO_WORKING
3104	return (0);
3105#endif
3106	sc = arg1;
3107	enabled = sc->sge.qs[0].lro.enabled;
3108	err = sysctl_handle_int(oidp, &enabled, arg2, req);
3109
3110	if (err != 0)
3111		return (err);
3112	if (enabled == sc->sge.qs[0].lro.enabled)
3113		return (0);
3114
3115	for (i = 0; i < sc->params.nports; i++)
3116		for (j = 0; j < sc->port[i].nqsets; j++)
3117			nqsets++;
3118
3119	for (i = 0; i < nqsets; i++)
3120		sc->sge.qs[i].lro.enabled = enabled;
3121
3122	return (0);
3123}
3124
3125static int
3126t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
3127{
3128	adapter_t *sc = arg1;
3129	struct qset_params *qsp = &sc->params.sge.qset[0];
3130	int coalesce_nsecs;
3131	struct sge_qset *qs;
3132	int i, j, err, nqsets = 0;
3133	struct mtx *lock;
3134
3135	if ((sc->flags & FULL_INIT_DONE) == 0)
3136		return (ENXIO);
3137
3138	coalesce_nsecs = qsp->coalesce_nsecs;
3139	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
3140
3141	if (err != 0) {
3142		return (err);
3143	}
3144	if (coalesce_nsecs == qsp->coalesce_nsecs)
3145		return (0);
3146
3147	for (i = 0; i < sc->params.nports; i++)
3148		for (j = 0; j < sc->port[i].nqsets; j++)
3149			nqsets++;
3150
3151	coalesce_nsecs = max(100, coalesce_nsecs);
3152
3153	for (i = 0; i < nqsets; i++) {
3154		qs = &sc->sge.qs[i];
3155		qsp = &sc->params.sge.qset[i];
3156		qsp->coalesce_nsecs = coalesce_nsecs;
3157
3158		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3159			    &sc->sge.qs[0].rspq.lock;
3160
3161		mtx_lock(lock);
3162		t3_update_qset_coalesce(qs, qsp);
3163		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3164		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3165		mtx_unlock(lock);
3166	}
3167
3168	return (0);
3169}
3170
3171
3172void
3173t3_add_attach_sysctls(adapter_t *sc)
3174{
3175	struct sysctl_ctx_list *ctx;
3176	struct sysctl_oid_list *children;
3177
3178	ctx = device_get_sysctl_ctx(sc->dev);
3179	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3180
3181	/* random information */
3182	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3183	    "firmware_version",
3184	    CTLFLAG_RD, &sc->fw_version,
3185	    0, "firmware version");
3186
3187	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3188	    "enable_lro",
3189	    CTLTYPE_INT|CTLFLAG_RW, sc,
3190	    0, t3_lro_enable,
3191	    "I", "enable large receive offload");
3192
3193	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3194	    "enable_debug",
3195	    CTLFLAG_RW, &cxgb_debug,
3196	    0, "enable verbose debugging output");
3197	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tunq_coalesce",
3198	    CTLFLAG_RD, &sc->tunq_coalesce,
3199	    "#tunneled packets freed");
3200	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3201	    "txq_overrun",
3202	    CTLFLAG_RD, &txq_fills,
3203	    0, "#times txq overrun");
3204	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3205	    "pcpu_cache_enable",
3206	    CTLFLAG_RW, &cxgb_pcpu_cache_enable,
3207	    0, "#enable driver local pcpu caches");
3208	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3209	    "cache_alloc",
3210	    CTLFLAG_RD, &cxgb_cached_allocations,
3211	    0, "#times a cluster was allocated from cache");
3212	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3213	    "cached",
3214	    CTLFLAG_RD, &cxgb_cached,
3215	    0, "#times a cluster was cached");
3216	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3217	    "ext_freed",
3218	    CTLFLAG_RD, &cxgb_ext_freed,
3219	    0, "#times a cluster was freed through ext_free");
3220	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3221	    "mbufs_outstanding",
3222	    CTLFLAG_RD, &cxgb_mbufs_outstanding,
3223	    0, "#mbufs in flight in the driver");
3224	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3225	    "pack_outstanding",
3226	    CTLFLAG_RD, &cxgb_pack_outstanding,
3227	    0, "#packets in flight in the driver");
3228}
3229
3230
3231static const char *rspq_name = "rspq";
3232static const char *txq_names[] =
3233{
3234	"txq_eth",
3235	"txq_ofld",
3236	"txq_ctrl"
3237};
3238
3239void
3240t3_add_configured_sysctls(adapter_t *sc)
3241{
3242	struct sysctl_ctx_list *ctx;
3243	struct sysctl_oid_list *children;
3244	int i, j;
3245
3246	ctx = device_get_sysctl_ctx(sc->dev);
3247	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3248
3249	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3250	    "intr_coal",
3251	    CTLTYPE_INT|CTLFLAG_RW, sc,
3252	    0, t3_set_coalesce_nsecs,
3253	    "I", "interrupt coalescing timer (ns)");
3254
3255	for (i = 0; i < sc->params.nports; i++) {
3256		struct port_info *pi = &sc->port[i];
3257		struct sysctl_oid *poid;
3258		struct sysctl_oid_list *poidlist;
3259
3260		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3261		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3262		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3263		poidlist = SYSCTL_CHILDREN(poid);
3264		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3265		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3266		    0, "#queue sets");
3267
3268		for (j = 0; j < pi->nqsets; j++) {
3269			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3270			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid;
3271			struct sysctl_oid_list *qspoidlist, *rspqpoidlist, *txqpoidlist;
3272			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3273
3274			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3275
3276			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3277			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3278			qspoidlist = SYSCTL_CHILDREN(qspoid);
3279
3280			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3281			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3282			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3283
3284			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3285			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3286			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3287
3288
3289
3290			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3291			    CTLFLAG_RD, &qs->rspq.size,
3292			    0, "#entries in response queue");
3293			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3294			    CTLFLAG_RD, &qs->rspq.cidx,
3295			    0, "consumer index");
3296			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3297			    CTLFLAG_RD, &qs->rspq.credits,
3298			    0, "#credits");
3299			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3300			    CTLFLAG_RD, &qs->rspq.phys_addr,
3301	    "physical address of the queue");
3302			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3303			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3304			    0, "start rspq dump entry");
3305			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3306			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3307			    0, "#rspq entries to dump");
3308			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3309			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3310			    0, t3_dump_rspq, "A", "dump of the response queue");
3311
3312
3313
3314			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3315			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3316			    0, "#tunneled packets dropped");
3317			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3318			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3319			    0, "#tunneled packets waiting to be sent");
3320			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3321			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3322			    0, "#tunneled packets queue producer index");
3323			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3324			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3325			    0, "#tunneled packets queue consumer index");
3326			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3327			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3328			    0, "#tunneled packets processed by the card");
3329			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3330			    CTLFLAG_RD, &txq->cleaned,
3331			    0, "#tunneled packets cleaned");
3332			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3333			    CTLFLAG_RD, &txq->in_use,
3334			    0, "#tunneled packet slots in use");
3335			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3336			    CTLFLAG_RD, &txq->txq_frees,
3337			    "#tunneled packets freed");
3338			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3339			    CTLFLAG_RD, &txq->txq_skipped,
3340			    0, "#tunneled packet descriptors skipped");
3341			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "coalesced",
3342			    CTLFLAG_RD, &txq->txq_coalesced,
3343			    0, "#tunneled packets coalesced");
3344			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3345			    CTLFLAG_RD, &txq->txq_enqueued,
3346			    0, "#tunneled packets enqueued to hardware");
3347			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3348			    CTLFLAG_RD, &qs->txq_stopped,
3349			    0, "tx queues stopped");
3350			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3351			    CTLFLAG_RD, &txq->phys_addr,
3352	    "physical address of the queue");
3353			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3354			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3355			    0, "txq generation");
3356			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3357			    CTLFLAG_RD, &txq->cidx,
3358			    0, "hardware queue cidx");
3359			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3360			    CTLFLAG_RD, &txq->pidx,
3361			    0, "hardware queue pidx");
3362			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3363			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3364			    0, "txq start idx for dump");
3365			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3366			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3367			    0, "txq #entries to dump");
3368			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3369			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3370			    0, t3_dump_txq, "A", "dump of the transmit queue");
3371		}
3372	}
3373}
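
/*
 * Usage note (illustrative; the exact sysctl prefix depends on how the
 * controller device attaches, e.g. a first adapter might appear as
 * dev.cxgbc.0): the per-queue dump handlers above are driven from userland
 * by setting dump_count/dump_start and then reading qdump, roughly:
 *
 *	sysctl dev.cxgbc.0.port0.qs0.rspq.dump_count=32
 *	sysctl dev.cxgbc.0.port0.qs0.rspq.qdump
 *	sysctl dev.cxgbc.0.port0.qs0.txq_eth.dump_count=32
 *	sysctl dev.cxgbc.0.port0.qs0.txq_eth.qdump
 */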
3374
3375/**
3376 *	t3_get_desc - dump an SGE descriptor for debugging purposes
3377 *	@qs: the queue set
3378 *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3379 *	@idx: the descriptor index in the queue
3380 *	@data: where to dump the descriptor contents
3381 *
3382 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3383 *	size of the descriptor.
3384 */
3385int
3386t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3387		unsigned char *data)
3388{
3389	if (qnum >= 6)
3390		return (EINVAL);
3391
3392	if (qnum < 3) {
3393		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3394			return (EINVAL);
3395		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3396		return sizeof(struct tx_desc);
3397	}
3398
3399	if (qnum == 3) {
3400		if (!qs->rspq.desc || idx >= qs->rspq.size)
3401			return (EINVAL);
3402		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3403		return sizeof(struct rsp_desc);
3404	}
3405
3406	qnum -= 4;
3407	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3408		return (EINVAL);
3409	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3410	return sizeof(struct rx_desc);
3411}
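
/*
 * Example caller (hypothetical, e.g. an ioctl-based debug hook): copy one
 * Ethernet Tx descriptor out of queue set 0 into a caller-supplied buffer.
 *
 *	unsigned char buf[sizeof(struct tx_desc)];
 *	int len;
 *
 *	len = t3_get_desc(&sc->sge.qs[0], 0, idx, buf);
 *	if (len != sizeof(struct tx_desc))
 *		return (EINVAL);
 */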
3412