cxgb_sge.c revision 175369
1/**************************************************************************
2
3Copyright (c) 2007, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29#define DEBUG_BUFRING
30
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/dev/cxgb/cxgb_sge.c 175369 2008-01-15 22:01:26Z jhb $");
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/module.h>
39#include <sys/bus.h>
40#include <sys/conf.h>
41#include <machine/bus.h>
42#include <machine/resource.h>
43#include <sys/bus_dma.h>
44#include <sys/rman.h>
45#include <sys/queue.h>
46#include <sys/sysctl.h>
47#include <sys/taskqueue.h>
48
49#include <sys/proc.h>
50#include <sys/sbuf.h>
51#include <sys/sched.h>
52#include <sys/smp.h>
53#include <sys/systm.h>
54#include <sys/syslog.h>
55
56#include <netinet/in_systm.h>
57#include <netinet/in.h>
58#include <netinet/ip.h>
59#include <netinet/tcp.h>
60
61#include <dev/pci/pcireg.h>
62#include <dev/pci/pcivar.h>
63
64#include <vm/vm.h>
65#include <vm/pmap.h>
66
67#ifdef CONFIG_DEFINED
68#include <cxgb_include.h>
69#include <sys/mvec.h>
70#else
71#include <dev/cxgb/cxgb_include.h>
72#include <dev/cxgb/sys/mvec.h>
73#endif
74
75int      txq_fills = 0;
76static int bogus_imm = 0;
77static int recycle_enable = 0;
78extern int cxgb_txq_buf_ring_size;
79int cxgb_cached_allocations;
80int cxgb_cached;
81int cxgb_ext_freed;
82extern int cxgb_use_16k_clusters;
83
84
85#define USE_GTS 0
86
87#define SGE_RX_SM_BUF_SIZE	1536
88#define SGE_RX_DROP_THRES	16
89#define SGE_RX_COPY_THRES	128
90
91/*
92 * Period of the Tx buffer reclaim timer.  This timer does not need to run
93 * frequently as Tx buffers are usually reclaimed by new Tx packets.
94 */
95#define TX_RECLAIM_PERIOD       (hz >> 1)
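/*
 * For example, with the common hz value of 1000 this works out to 500
 * ticks, i.e. the reclaim timer fires roughly every half second.
 */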
96
97
98/*
99 * Values for sge_txq.flags
100 */
101enum {
102	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
103	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
104};
105
106struct tx_desc {
107	uint64_t	flit[TX_DESC_FLITS];
108} __packed;
109
110struct rx_desc {
111	uint32_t	addr_lo;
112	uint32_t	len_gen;
113	uint32_t	gen2;
114	uint32_t	addr_hi;
115} __packed;
116
117struct rsp_desc {               /* response queue descriptor */
118	struct rss_header	rss_hdr;
119	uint32_t		flags;
120	uint32_t		len_cq;
121	uint8_t			imm_data[47];
122	uint8_t			intr_gen;
123} __packed;
124
125#define RX_SW_DESC_MAP_CREATED	(1 << 0)
126#define TX_SW_DESC_MAP_CREATED	(1 << 1)
127#define RX_SW_DESC_INUSE        (1 << 3)
128#define TX_SW_DESC_MAPPED       (1 << 4)
129
130#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
131#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
132#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
133#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
134
135struct tx_sw_desc {                /* SW state per Tx descriptor */
136	struct mbuf_iovec mi;
137	bus_dmamap_t	map;
138	int		flags;
139};
140
141struct rx_sw_desc {                /* SW state per Rx descriptor */
142	caddr_t	         rxsd_cl;
143	caddr_t	         data;
144	bus_dmamap_t	  map;
145	int		  flags;
146};
147
148struct txq_state {
149	unsigned int compl;
150	unsigned int gen;
151	unsigned int pidx;
152};
153
154struct refill_fl_cb_arg {
155	int               error;
156	bus_dma_segment_t seg;
157	int               nseg;
158};
159
160/*
161 * Maps a number of flits to the number of Tx descriptors that can hold them.
162 * The formula is
163 *
164 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
165 *
166 * HW allows up to 4 descriptors to be combined into a WR.
167 */
168static uint8_t flit_desc_map[] = {
169	0,
170#if SGE_NUM_GENBITS == 1
171	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
172	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
173	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
174	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
175#elif SGE_NUM_GENBITS == 2
176	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
177	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
178	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
179	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
180#else
181# error "SGE_NUM_GENBITS must be 1 or 2"
182#endif
183};
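/*
 * Worked example (assuming WR_FLITS is 16 with 1 generation bit and 15
 * with 2 generation bits): with SGE_NUM_GENBITS == 2 a WR of 16 flits
 * needs 1 + (16 - 2) / (15 - 1) == 2 descriptors, which matches
 * flit_desc_map[16] above, while anything up to 15 flits fits in one.
 */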
184
185
186static int lro_default = 0;
187int cxgb_debug = 0;
188
189static void sge_timer_cb(void *arg);
190static void sge_timer_reclaim(void *arg, int ncount);
191static void sge_txq_reclaim_handler(void *arg, int ncount);
192
193/**
194 *	reclaim_completed_tx_ - reclaims completed Tx descriptors
195 *	@q: the Tx queue to reclaim completed descriptors from
196 *	@reclaim_min: do not reclaim unless at least this many descriptors are pending
197 *
198 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
199 *	and frees the associated buffers if possible.  Called with the Tx
200 *	queue's lock held.
201 */
202static __inline int
203reclaim_completed_tx_(struct sge_txq *q, int reclaim_min)
204{
205	int reclaim = desc_reclaimable(q);
206
207	if (reclaim < reclaim_min)
208		return (0);
209
210	mtx_assert(&q->lock, MA_OWNED);
211	if (reclaim > 0) {
212		t3_free_tx_desc(q, reclaim);
213		q->cleaned += reclaim;
214		q->in_use -= reclaim;
215	}
216	return (reclaim);
217}
218
219/**
220 *	should_restart_tx - are there enough resources to restart a Tx queue?
221 *	@q: the Tx queue
222 *
223 *	Checks if there are enough descriptors to restart a suspended Tx queue.
224 */
225static __inline int
226should_restart_tx(const struct sge_txq *q)
227{
228	unsigned int r = q->processed - q->cleaned;
229
230	return q->in_use - r < (q->size >> 1);
231}
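/*
 * That is, descriptors the SGE has processed but that we have not yet
 * reclaimed (q->processed - q->cleaned) are counted as free, and the
 * queue is considered restartable once less than half of it remains
 * genuinely in use.
 */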
232
233/**
234 *	t3_sge_init - initialize SGE
235 *	@adap: the adapter
236 *	@p: the SGE parameters
237 *
238 *	Performs SGE initialization needed every time after a chip reset.
239 *	We do not initialize any of the queue sets here, instead the driver
240 *	top-level must request those individually.  We also do not enable DMA
241 *	here, that should be done after the queues have been set up.
242 */
243void
244t3_sge_init(adapter_t *adap, struct sge_params *p)
245{
246	u_int ctrl, ups;
247
248	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
249
250	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
251	       F_CQCRDTCTRL |
252	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
253	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
254#if SGE_NUM_GENBITS == 1
255	ctrl |= F_EGRGENCTRL;
256#endif
257	if (adap->params.rev > 0) {
258		if (!(adap->flags & (USING_MSIX | USING_MSI)))
259			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
260		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
261	}
262	t3_write_reg(adap, A_SG_CONTROL, ctrl);
263	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
264		     V_LORCQDRBTHRSH(512));
265	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
266	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
267		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
268	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
269	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
270	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
271	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
272	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
273	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
274}
275
276
277/**
278 *	sgl_len - calculates the size of an SGL of the given capacity
279 *	@n: the number of SGL entries
280 *
281 *	Calculates the number of flits needed for a scatter/gather list that
282 *	can hold the given number of entries.
283 */
284static __inline unsigned int
285sgl_len(unsigned int n)
286{
287	return ((3 * n) / 2 + (n & 1));
288}
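/*
 * For example: each SGL entry is an 8-byte address plus a 4-byte length,
 * and entries are evidently packed in pairs of three flits, so
 * sgl_len(3) == 5 and sgl_len(4) == 6.
 */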
289
290/**
291 *	get_imm_packet - return the next ingress packet buffer from a response
292 *	@resp: the response descriptor containing the packet data
293 *
294 *	Return a packet containing the immediate data of the given response.
295 */
296#ifdef DISABLE_MBUF_IOVEC
297static __inline int
298get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
299{
300	struct mbuf *m = mh->m_head;
301
302	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
303	m->m_pkthdr.len = m->m_len = IMMED_PKT_SIZE;
304	return (0);
305}
306
307#else
308static int
309get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
310{
311
312	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
313	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
314	return (0);
315
316}
317#endif
318
319static __inline u_int
320flits_to_desc(u_int n)
321{
322	return (flit_desc_map[n]);
323}
324
325void
326t3_sge_err_intr_handler(adapter_t *adapter)
327{
328	unsigned int v, status;
329
330
331	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
332
333	if (status & F_RSPQCREDITOVERFOW)
334		CH_ALERT(adapter, "SGE response queue credit overflow\n");
335
336	if (status & F_RSPQDISABLED) {
337		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
338
339		CH_ALERT(adapter,
340			 "packet delivered to disabled response queue (0x%x)\n",
341			 (v >> S_RSPQ0DISABLED) & 0xff);
342	}
343
344	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
345	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
346		t3_fatal_err(adapter);
347}
348
349void
350t3_sge_prep(adapter_t *adap, struct sge_params *p)
351{
352	int i;
353
354	/* XXX Does ETHER_ALIGN need to be accounted for here? */
355	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
356
357	for (i = 0; i < SGE_QSETS; ++i) {
358		struct qset_params *q = p->qset + i;
359
360		q->polling = adap->params.rev > 0;
361
362		if (adap->params.nports > 2) {
363			q->coalesce_nsecs = 50000;
364		} else {
365#ifdef INVARIANTS
366			q->coalesce_nsecs = 10000;
367#else
368			q->coalesce_nsecs = 5000;
369#endif
370		}
371		q->rspq_size = RSPQ_Q_SIZE;
372		q->fl_size = FL_Q_SIZE;
373		q->jumbo_size = JUMBO_Q_SIZE;
374		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
375		q->txq_size[TXQ_OFLD] = 1024;
376		q->txq_size[TXQ_CTRL] = 256;
377		q->cong_thres = 0;
378	}
379}
380
381int
382t3_sge_alloc(adapter_t *sc)
383{
384
385	/* The parent tag. */
386	if (bus_dma_tag_create( NULL,			/* parent */
387				1, 0,			/* algnmnt, boundary */
388				BUS_SPACE_MAXADDR,	/* lowaddr */
389				BUS_SPACE_MAXADDR,	/* highaddr */
390				NULL, NULL,		/* filter, filterarg */
391				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
392				BUS_SPACE_UNRESTRICTED, /* nsegments */
393				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
394				0,			/* flags */
395				NULL, NULL,		/* lock, lockarg */
396				&sc->parent_dmat)) {
397		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
398		return (ENOMEM);
399	}
400
401	/*
402	 * DMA tag for normal sized RX frames
403	 */
404	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
405		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
406		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
407		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
408		return (ENOMEM);
409	}
410
411	/*
412	 * DMA tag for jumbo sized RX frames.
413	 */
414	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
415		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
416		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
417		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
418		return (ENOMEM);
419	}
420
421	/*
422	 * DMA tag for TX frames.
423	 */
424	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
425		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
426		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
427		NULL, NULL, &sc->tx_dmat)) {
428		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
429		return (ENOMEM);
430	}
431
432	return (0);
433}
434
435int
436t3_sge_free(struct adapter * sc)
437{
438
439	if (sc->tx_dmat != NULL)
440		bus_dma_tag_destroy(sc->tx_dmat);
441
442	if (sc->rx_jumbo_dmat != NULL)
443		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
444
445	if (sc->rx_dmat != NULL)
446		bus_dma_tag_destroy(sc->rx_dmat);
447
448	if (sc->parent_dmat != NULL)
449		bus_dma_tag_destroy(sc->parent_dmat);
450
451	return (0);
452}
453
454void
455t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
456{
457
458	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
459	qs->rspq.polling = 0 /* p->polling */;
460}
461
462#if !defined(__i386__) && !defined(__amd64__)
463static void
464refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
465{
466	struct refill_fl_cb_arg *cb_arg = arg;
467
468	cb_arg->error = error;
469	cb_arg->seg = segs[0];
470	cb_arg->nseg = nseg;
471
472}
473#endif
474/**
475 *	refill_fl - refill an SGE free-buffer list
476 *	@sc: the controller softc
477 *	@q: the free-list to refill
478 *	@n: the number of new buffers to allocate
479 *
480 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
481 *	The caller must assure that @n does not exceed the queue's capacity.
482 */
483static void
484refill_fl(adapter_t *sc, struct sge_fl *q, int n)
485{
486	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
487	struct rx_desc *d = &q->desc[q->pidx];
488	struct refill_fl_cb_arg cb_arg;
489	caddr_t cl;
490	int err;
491	int header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
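	/*
	 * The first header_size bytes of each cluster are presumably kept
	 * back so that an mbuf header can later be built inside the cluster
	 * itself (see the note below: mbuf allocation happens only after
	 * receive); only the region at cl + header_size is DMA-loaded and
	 * handed to the hardware.
	 */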
492
493	cb_arg.error = 0;
494	while (n--) {
495		/*
496		 * We only allocate a cluster, mbuf allocation happens after rx
497		 */
498		if ((cl = cxgb_cache_get(q->zone)) == NULL) {
499			log(LOG_WARNING, "Failed to allocate cluster\n");
500			goto done;
501		}
502
503		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
504			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
505				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
506				uma_zfree(q->zone, cl);
507				goto done;
508			}
509			sd->flags |= RX_SW_DESC_MAP_CREATED;
510		}
511#if !defined(__i386__) && !defined(__amd64__)
512		err = bus_dmamap_load(q->entry_tag, sd->map,
513		    cl + header_size, q->buf_size,
514		    refill_fl_cb, &cb_arg, 0);
515
516		if (err != 0 || cb_arg.error) {
517			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
518			/*
519			 * XXX free cluster
520			 */
521			return;
522		}
523#else
524		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)(cl + header_size));
525#endif
526		sd->flags |= RX_SW_DESC_INUSE;
527		sd->rxsd_cl = cl;
528		sd->data = cl + header_size;
529		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
530		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
531		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
532		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
533
534		d++;
535		sd++;
536
537		if (++q->pidx == q->size) {
538			q->pidx = 0;
539			q->gen ^= 1;
540			sd = q->sdesc;
541			d = q->desc;
542		}
543		q->credits++;
544	}
545
546done:
547	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
548}
549
550
551/**
552 *	free_rx_bufs - free the Rx buffers on an SGE free list
553 *	@sc: the controller softc
554 *	@q: the SGE free list to clean up
555 *
556 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
557 *	this queue should be stopped before calling this function.
558 */
559static void
560free_rx_bufs(adapter_t *sc, struct sge_fl *q)
561{
562	u_int cidx = q->cidx;
563
564	while (q->credits--) {
565		struct rx_sw_desc *d = &q->sdesc[cidx];
566
567		if (d->flags & RX_SW_DESC_INUSE) {
568			bus_dmamap_unload(q->entry_tag, d->map);
569			bus_dmamap_destroy(q->entry_tag, d->map);
570			uma_zfree(q->zone, d->rxsd_cl);
571		}
572		d->rxsd_cl = NULL;
573		if (++cidx == q->size)
574			cidx = 0;
575	}
576}
577
578static __inline void
579__refill_fl(adapter_t *adap, struct sge_fl *fl)
580{
581	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
582}
583
584static __inline void
585__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
586{
587	if ((fl->size - fl->credits) < max)
588		refill_fl(adap, fl, min(max, fl->size - fl->credits));
589}
590
591void
592refill_fl_service(adapter_t *adap, struct sge_fl *fl)
593{
594	__refill_fl_lt(adap, fl, 512);
595}
596
597/**
598 *	recycle_rx_buf - recycle a receive buffer
599 *	@adapter: the adapter
600 *	@q: the SGE free list
601 *	@idx: index of buffer to recycle
602 *
603 *	Recycles the specified buffer on the given free list by adding it at
604 *	the next available slot on the list.
605 */
606static void
607recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
608{
609	struct rx_desc *from = &q->desc[idx];
610	struct rx_desc *to   = &q->desc[q->pidx];
611
612	q->sdesc[q->pidx] = q->sdesc[idx];
613	to->addr_lo = from->addr_lo;        // already big endian
614	to->addr_hi = from->addr_hi;        // likewise
615	wmb();
616	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
617	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
618	q->credits++;
619
620	if (++q->pidx == q->size) {
621		q->pidx = 0;
622		q->gen ^= 1;
623	}
624	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
625}
626
627static void
628alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
629{
630	uint32_t *addr;
631
632	addr = arg;
633	*addr = segs[0].ds_addr;
634}
635
636static int
637alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
638    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
639    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
640{
641	size_t len = nelem * elem_size;
642	void *s = NULL;
643	void *p = NULL;
644	int err;
645
646	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
647				      BUS_SPACE_MAXADDR_32BIT,
648				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
649				      len, 0, NULL, NULL, tag)) != 0) {
650		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
651		return (ENOMEM);
652	}
653
654	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
655				    map)) != 0) {
656		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
657		return (ENOMEM);
658	}
659
660	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
661	bzero(p, len);
662	*(void **)desc = p;
663
664	if (sw_size) {
665		len = nelem * sw_size;
666		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
667		*(void **)sdesc = s;
668	}
669	if (parent_entry_tag == NULL)
670		return (0);
671
672	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
673				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
674		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
675				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
676		                      NULL, NULL, entry_tag)) != 0) {
677		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
678		return (ENOMEM);
679	}
680	return (0);
681}
682
683static void
684sge_slow_intr_handler(void *arg, int ncount)
685{
686	adapter_t *sc = arg;
687
688	t3_slow_intr_handler(sc);
689}
690
691/**
692 *	sge_timer_cb - perform periodic maintenance of an SGE qset
693 *	@data: the SGE queue set to maintain
694 *
695 *	Runs periodically from a timer to perform maintenance of an SGE queue
696 *	set.  It performs two tasks:
697 *
698 *	a) Cleans up any completed Tx descriptors that may still be pending.
699 *	Normal descriptor cleanup happens when new packets are added to a Tx
700 *	queue so this timer is relatively infrequent and does any cleanup only
701 *	if the Tx queue has not seen any new packets in a while.  We make a
702 *	best effort attempt to reclaim descriptors, in that we don't wait
703 *	around if we cannot get a queue's lock (which most likely is because
704 *	someone else is queueing new packets and so will also handle the clean
705 *	up).  Since control queues use immediate data exclusively we don't
706 *	bother cleaning them up here.
707 *
708 *	b) Replenishes Rx queues that have run out due to memory shortage.
709 *	Normally new Rx buffers are added when existing ones are consumed but
710 *	when out of memory a queue can become empty.  We try to add only a few
711 *	buffers here, the queue will be replenished fully as these new buffers
712 *	are used up if memory shortage has subsided.
713 *
714 *	c) Returns coalesced response queue credits in case a response queue is
715 *	starved.
716 *
717 *	d) Rings doorbells for T304 tunnel queues since we have seen doorbell
718 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
719 */
720static void
721sge_timer_cb(void *arg)
722{
723	adapter_t *sc = arg;
724#ifndef IFNET_MULTIQUEUE
725	struct port_info *pi;
726	struct sge_qset *qs;
727	struct sge_txq  *txq;
728	int i, j;
729	int reclaim_ofl, refill_rx;
730
731	for (i = 0; i < sc->params.nports; i++)
732		for (j = 0; j < sc->port[i].nqsets; j++) {
733			qs = &sc->sge.qs[i + j];
734			txq = &qs->txq[0];
735			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
736			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
737			    (qs->fl[1].credits < qs->fl[1].size));
738			if (reclaim_ofl || refill_rx) {
739				pi = &sc->port[i];
740				taskqueue_enqueue(pi->tq, &pi->timer_reclaim_task);
741				break;
742			}
743		}
744#endif
745	if (sc->params.nports > 2) {
746		int i;
747
748		for_each_port(sc, i) {
749			struct port_info *pi = &sc->port[i];
750
751			t3_write_reg(sc, A_SG_KDOORBELL,
752				     F_SELEGRCNTX |
753				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
754		}
755	}
756	if (sc->open_device_map != 0)
757		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
758}
759
760/*
761 * This is meant to be a catch-all function to keep sge state private
762 * to sge.c
763 *
764 */
765int
766t3_sge_init_adapter(adapter_t *sc)
767{
768	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
769	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
770	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
771	mi_init();
772	cxgb_cache_init();
773	return (0);
774}
775
776int
777t3_sge_reset_adapter(adapter_t *sc)
778{
779	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
780	return (0);
781}
782
783int
784t3_sge_init_port(struct port_info *pi)
785{
786	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
787	return (0);
788}
789
790void
791t3_sge_deinit_sw(adapter_t *sc)
792{
793	int i;
794
795	callout_drain(&sc->sge_timer_ch);
796	if (sc->tq)
797		taskqueue_drain(sc->tq, &sc->slow_intr_task);
798	for (i = 0; i < sc->params.nports; i++)
799		if (sc->port[i].tq != NULL)
800			taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
801
802	mi_deinit();
803}
804
805/**
806 *	refill_rspq - replenish an SGE response queue
807 *	@adapter: the adapter
808 *	@q: the response queue to replenish
809 *	@credits: how many new responses to make available
810 *
811 *	Replenishes a response queue by making the supplied number of responses
812 *	available to HW.
813 */
814static __inline void
815refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
816{
817
818	/* mbufs are allocated on demand when a rspq entry is processed. */
819	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
820		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
821}
822
823static __inline void
824sge_txq_reclaim_(struct sge_txq *txq, int force)
825{
826
827	if (desc_reclaimable(txq) < 16)
828		return;
829	if (mtx_trylock(&txq->lock) == 0)
830		return;
831	reclaim_completed_tx_(txq, 16);
832	mtx_unlock(&txq->lock);
833
834}
835
836static void
837sge_txq_reclaim_handler(void *arg, int ncount)
838{
839	struct sge_txq *q = arg;
840
841	sge_txq_reclaim_(q, TRUE);
842}
843
844
845
846static void
847sge_timer_reclaim(void *arg, int ncount)
848{
849	struct port_info *pi = arg;
850	int i, nqsets = pi->nqsets;
851	adapter_t *sc = pi->adapter;
852	struct sge_qset *qs;
853	struct sge_txq *txq;
854	struct mtx *lock;
855
856#ifdef IFNET_MULTIQUEUE
857	panic("%s should not be called with multiqueue support\n", __FUNCTION__);
858#endif
859	for (i = 0; i < nqsets; i++) {
860		qs = &sc->sge.qs[i];
861
862		txq = &qs->txq[TXQ_OFLD];
863		sge_txq_reclaim_(txq, FALSE);
864
865		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
866			    &sc->sge.qs[0].rspq.lock;
867
868		if (mtx_trylock(lock)) {
869			/* XXX currently assume that we are *NOT* polling */
870			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
871
872			if (qs->fl[0].credits < qs->fl[0].size - 16)
873				__refill_fl(sc, &qs->fl[0]);
874			if (qs->fl[1].credits < qs->fl[1].size - 16)
875				__refill_fl(sc, &qs->fl[1]);
876
877			if (status & (1 << qs->rspq.cntxt_id)) {
878				if (qs->rspq.credits) {
879					refill_rspq(sc, &qs->rspq, 1);
880					qs->rspq.credits--;
881					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
882					    1 << qs->rspq.cntxt_id);
883				}
884			}
885			mtx_unlock(lock);
886		}
887	}
888}
889
890/**
891 *	init_qset_cntxt - initialize an SGE queue set context info
892 *	@qs: the queue set
893 *	@id: the queue set id
894 *
895 *	Initializes the TIDs and context ids for the queues of a queue set.
896 */
897static void
898init_qset_cntxt(struct sge_qset *qs, u_int id)
899{
900
901	qs->rspq.cntxt_id = id;
902	qs->fl[0].cntxt_id = 2 * id;
903	qs->fl[1].cntxt_id = 2 * id + 1;
904	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
905	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
906	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
907	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
908	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
909
910	mbufq_init(&qs->txq[TXQ_ETH].sendq);
911	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
912	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
913}
914
915
916static void
917txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
918{
919	txq->in_use += ndesc;
920	/*
921	 * XXX we don't handle stopping of queue
922	 * presumably start handles this when we bump against the end
923	 */
924	txqs->gen = txq->gen;
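	/*
	 * Ask the SGE for a WR completion roughly once every eight
	 * descriptors: unacked is kept in the range 0-7, and whenever an
	 * addition carries into bit 3 that bit is shifted up into the
	 * F_WR_COMPL position of the WR header.
	 */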
925	txq->unacked += ndesc;
926	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
927	txq->unacked &= 7;
928	txqs->pidx = txq->pidx;
929	txq->pidx += ndesc;
930#ifdef INVARIANTS
931	if (((txqs->pidx > txq->cidx) &&
932		(txq->pidx < txqs->pidx) &&
933		(txq->pidx >= txq->cidx)) ||
934	    ((txqs->pidx < txq->cidx) &&
935		(txq->pidx >= txq->cidx)) ||
936	    ((txqs->pidx < txq->cidx) &&
937		(txq->cidx < txqs->pidx)))
938		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
939		    txqs->pidx, txq->pidx, txq->cidx);
940#endif
941	if (txq->pidx >= txq->size) {
942		txq->pidx -= txq->size;
943		txq->gen ^= 1;
944	}
945
946}
947
948/**
949 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
950 *	@m: the packet mbufs
951 *      @nsegs: the number of segments
952 *
953 * 	Returns the number of Tx descriptors needed for the given Ethernet
954 * 	packet.  Ethernet packets require addition of WR and CPL headers.
955 */
956static __inline unsigned int
957calc_tx_descs(const struct mbuf *m, int nsegs)
958{
959	unsigned int flits;
960
961	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
962		return 1;
963
964	flits = sgl_len(nsegs) + 2;
965#ifdef TSO_SUPPORTED
966	if (m->m_pkthdr.csum_flags & CSUM_TSO)
967		flits++;
968#endif
969	return flits_to_desc(flits);
970}
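/*
 * Example: a packet that does not fit as immediate data and maps to four
 * DMA segments needs sgl_len(4) + 2 == 8 flits (two flits for the WR and
 * CPL_TX_PKT headers plus the SGL), which flit_desc_map translates to a
 * single descriptor.
 */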
971
972static unsigned int
973busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
974    struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
975{
976	struct mbuf *m0;
977	int err, pktlen, pass = 0;
978
979retry:
980	err = 0;
981	m0 = *m;
982	pktlen = m0->m_pkthdr.len;
983#if defined(__i386__) || defined(__amd64__)
984	if (busdma_map_sg_collapse(m, segs, nsegs) == 0) {
985		goto done;
986	} else
987#endif
988		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, txsd->map, m0, segs, nsegs, 0);
989
990	if (err == 0) {
991		goto done;
992	}
993	if (err == EFBIG && pass == 0) {
994		pass = 1;
995		/* Too many segments, try to defrag */
996		m0 = m_defrag(m0, M_DONTWAIT);
997		if (m0 == NULL) {
998			m_freem(*m);
999			*m = NULL;
1000			return (ENOBUFS);
1001		}
1002		*m = m0;
1003		goto retry;
1004	} else if (err == ENOMEM) {
1005		return (err);
1006	} else if (err) {
1007		if (cxgb_debug)
1008			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1009		m_freem(m0);
1010		*m = NULL;
1011		return (err);
1012	}
1013done:
1014#if !defined(__i386__) && !defined(__amd64__)
1015	bus_dmamap_sync(txq->entry_tag, txsd->map, BUS_DMASYNC_PREWRITE);
1016#endif
1017	txsd->flags |= TX_SW_DESC_MAPPED;
1018
1019	return (0);
1020}
1021
1022/**
1023 *	make_sgl - populate a scatter/gather list for a packet
1024 *	@sgp: the SGL to populate
1025 *	@segs: the packet dma segments
1026 *	@nsegs: the number of segments
1027 *
1028 *	Generates a scatter/gather list for the buffers that make up a packet
1029 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1030 *	appropriately.
1031 */
1032static __inline void
1033make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1034{
1035	int i, idx;
1036
1037	for (idx = 0, i = 0; i < nsegs; i++) {
1038		/*
1039		 * firmware doesn't like empty segments
1040		 */
1041		if (segs[i].ds_len == 0)
1042			continue;
1043		if (i && idx == 0)
1044			++sgp;
1045
1046		sgp->len[idx] = htobe32(segs[i].ds_len);
1047		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1048		idx ^= 1;
1049	}
1050
1051	if (idx) {
1052		sgp->len[idx] = 0;
1053		sgp->addr[idx] = 0;
1054	}
1055}
1056
1057/**
1058 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1059 *	@adap: the adapter
1060 *	@q: the Tx queue
1061 *
1062 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1063 *	where the HW may go to sleep just after we checked; in that case the
1064 *	interrupt handler will detect the outstanding TX packet and ring the
1065 *	doorbell for us.
1066 *
1067 *	When GTS is disabled we unconditionally ring the doorbell.
1068 */
1069static __inline void
1070check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1071{
1072#if USE_GTS
1073	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1074	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1075		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1076#ifdef T3_TRACE
1077		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1078			  q->cntxt_id);
1079#endif
1080		t3_write_reg(adap, A_SG_KDOORBELL,
1081			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1082	}
1083#else
1084	wmb();            /* write descriptors before telling HW */
1085	t3_write_reg(adap, A_SG_KDOORBELL,
1086		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1087#endif
1088}
1089
1090static __inline void
1091wr_gen2(struct tx_desc *d, unsigned int gen)
1092{
1093#if SGE_NUM_GENBITS == 2
1094	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1095#endif
1096}
1097
1098/**
1099 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1100 *	@ndesc: number of Tx descriptors spanned by the SGL
1101 *	@txd: first Tx descriptor to be written
1102 *	@txqs: txq state (generation and producer index)
1103 *	@txq: the SGE Tx queue
1104 *	@sgl: the SGL
1105 *	@flits: number of flits to the start of the SGL in the first descriptor
1106 *	@sgl_flits: the SGL size in flits
1107 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1108 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1109 *
1110 *	Write a work request header and an associated SGL.  If the SGL is
1111 *	small enough to fit into one Tx descriptor it has already been written
1112 *	and we just need to write the WR header.  Otherwise we distribute the
1113 *	SGL across the number of descriptors it spans.
1114 */
1115static void
1116write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1117    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1118    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1119{
1120
1121	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1122	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1123
1124	if (__predict_true(ndesc == 1)) {
1125		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1126		    V_WR_SGLSFLT(flits)) | wr_hi;
1127		wmb();
1128		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1129		    V_WR_GEN(txqs->gen)) | wr_lo;
1130		/* XXX gen? */
1131		wr_gen2(txd, txqs->gen);
1132
1133	} else {
1134		unsigned int ogen = txqs->gen;
1135		const uint64_t *fp = (const uint64_t *)sgl;
1136		struct work_request_hdr *wp = wrp;
1137
1138		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1139		    V_WR_SGLSFLT(flits)) | wr_hi;
1140
1141		while (sgl_flits) {
1142			unsigned int avail = WR_FLITS - flits;
1143
1144			if (avail > sgl_flits)
1145				avail = sgl_flits;
1146			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1147			sgl_flits -= avail;
1148			ndesc--;
1149			if (!sgl_flits)
1150				break;
1151
1152			fp += avail;
1153			txd++;
1154			txsd++;
1155			if (++txqs->pidx == txq->size) {
1156				txqs->pidx = 0;
1157				txqs->gen ^= 1;
1158				txd = txq->desc;
1159				txsd = txq->sdesc;
1160			}
1161
1162			/*
1163			 * when the head of the mbuf chain
1164			 * is freed all clusters will be freed
1165			 * with it
1166			 */
1167			KASSERT(txsd->mi.mi_base == NULL, ("overwriting valid entry mi_base==%p", txsd->mi.mi_base));
1168			wrp = (struct work_request_hdr *)txd;
1169			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1170			    V_WR_SGLSFLT(1)) | wr_hi;
1171			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1172				    sgl_flits + 1)) |
1173			    V_WR_GEN(txqs->gen)) | wr_lo;
1174			wr_gen2(txd, txqs->gen);
1175			flits = 1;
1176		}
1177		wrp->wr_hi |= htonl(F_WR_EOP);
1178		wmb();
1179		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1180		wr_gen2((struct tx_desc *)wp, ogen);
1181	}
1182}
1183
1184/* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1185#define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
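/* i.e. 14 + 4 + 20 + 20 == 58 bytes: VLAN-tagged Ethernet + IPv4 + TCP, no options */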
1186
1187#ifdef VLAN_SUPPORTED
1188#define GET_VTAG(cntrl, m) \
1189do { \
1190	if ((m)->m_flags & M_VLANTAG)					            \
1191		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1192} while (0)
1193
1194#define GET_VTAG_MI(cntrl, mi) \
1195do { \
1196	if ((mi)->mi_flags & M_VLANTAG)					\
1197		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((mi)->mi_ether_vtag); \
1198} while (0)
1199#else
1200#define GET_VTAG(cntrl, m)
1201#define GET_VTAG_MI(cntrl, m)
1202#endif
1203
1204int
1205t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
1206{
1207	adapter_t *sc;
1208	struct mbuf *m0;
1209	struct sge_txq *txq;
1210	struct txq_state txqs;
1211	struct port_info *pi;
1212	unsigned int ndesc, flits, cntrl, mlen;
1213	int err, nsegs, tso_info = 0;
1214
1215	struct work_request_hdr *wrp;
1216	struct tx_sw_desc *txsd;
1217	struct sg_ent *sgp, *sgl;
1218	uint32_t wr_hi, wr_lo, sgl_flits;
1219	bus_dma_segment_t segs[TX_MAX_SEGS];
1220
1221	struct tx_desc *txd;
1222	struct mbuf_vec *mv;
1223	struct mbuf_iovec *mi;
1224
1225	DPRINTF("t3_encap cpu=%d ", curcpu);
1226	KASSERT(qs->idx == 0, ("invalid qs %d", qs->idx));
1227
1228	mi = NULL;
1229	pi = qs->port;
1230	sc = pi->adapter;
1231	txq = &qs->txq[TXQ_ETH];
1232	txd = &txq->desc[txq->pidx];
1233	txsd = &txq->sdesc[txq->pidx];
1234	sgl = txq->txq_sgl;
1235	m0 = *m;
1236
1237	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1238	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1239	if (cxgb_debug)
1240		printf("mi_base=%p cidx=%d pidx=%d\n\n", txsd->mi.mi_base, txq->cidx, txq->pidx);
1241
1242	mtx_assert(&txq->lock, MA_OWNED);
1243	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1244/*
1245 * XXX need to add VLAN support for 6.x
1246 */
1247#ifdef VLAN_SUPPORTED
1248	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1249		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1250#endif
1251	KASSERT(txsd->mi.mi_base == NULL, ("overwriting valid entry mi_base==%p",
1252		txsd->mi.mi_base));
1253	if (cxgb_debug)
1254		printf("uipc_mvec PIO_LEN=%zd\n", PIO_LEN);
1255
1256	if (count > 1) {
1257		panic("count > 1 not supported in CVS\n");
1258		if ((err = busdma_map_sg_vec(m, &m0, segs, count)))
1259			return (err);
1260		nsegs = count;
1261	} else if ((err = busdma_map_sg_collapse(&m0, segs, &nsegs))) {
1262		if (cxgb_debug)
1263			printf("failed ... err=%d\n", err);
1264		return (err);
1265	}
1266	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count));
1267
1268	if (!(m0->m_pkthdr.len <= PIO_LEN)) {
1269		mi_collapse_mbuf(&txsd->mi, m0);
1270		mi = &txsd->mi;
1271	}
1272	if (count > 1) {
1273		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1274		int i, fidx;
1275		struct mbuf_iovec *batchmi;
1276
1277		mv = mtomv(m0);
1278		batchmi = mv->mv_vec;
1279
1280		wrp = (struct work_request_hdr *)txd;
1281
1282		flits = count*2 + 1;
1283		txq_prod(txq, 1, &txqs);
1284
1285		for (fidx = 1, i = 0; i < count; i++, batchmi++, fidx += 2) {
1286			struct cpl_tx_pkt_batch_entry *cbe = &cpl_batch->pkt_entry[i];
1287
1288			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1289			GET_VTAG_MI(cntrl, batchmi);
1290			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1291			cbe->cntrl = htonl(cntrl);
1292			cbe->len = htonl(batchmi->mi_len | 0x80000000);
1293			cbe->addr = htobe64(segs[i].ds_addr);
1294			txd->flit[fidx] |= htobe64(1 << 24);
1295		}
1296
1297		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1298		    V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1299		wmb();
1300		wrp->wr_lo = htonl(V_WR_LEN(flits) |
1301		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1302		/* XXX gen? */
1303		wr_gen2(txd, txqs.gen);
1304		check_ring_tx_db(sc, txq);
1305
1306		return (0);
1307	} else if (tso_info) {
1308		int undersized, eth_type;
1309		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1310		struct ip *ip;
1311		struct tcphdr *tcp;
1312		char *pkthdr, tmp[TCPPKTHDRSIZE];
1313		struct mbuf_vec *mv;
1314		struct mbuf_iovec *tmpmi;
1315
1316		mv = mtomv(m0);
1317		tmpmi = mv->mv_vec;
1318
1319		txd->flit[2] = 0;
1320		GET_VTAG_MI(cntrl, mi);
1321		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1322		hdr->cntrl = htonl(cntrl);
1323		mlen = m0->m_pkthdr.len;
1324		hdr->len = htonl(mlen | 0x80000000);
1325
1326		DPRINTF("tso buf len=%d\n", mlen);
1327		undersized = (((tmpmi->mi_len < TCPPKTHDRSIZE) &&
1328			(m0->m_flags & M_VLANTAG)) ||
1329		    (tmpmi->mi_len < TCPPKTHDRSIZE - ETHER_VLAN_ENCAP_LEN));
1330		if (__predict_false(undersized)) {
1331			pkthdr = tmp;
1332			dump_mi(mi);
1333			panic("discontig packet - fixxorz");
1334		} else
1335			pkthdr = m0->m_data;
1336
1337		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1338			eth_type = CPL_ETH_II_VLAN;
1339			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1340			    ETHER_VLAN_ENCAP_LEN);
1341		} else {
1342			eth_type = CPL_ETH_II;
1343			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1344		}
1345		tcp = (struct tcphdr *)((uint8_t *)ip +
1346		    sizeof(*ip));
1347
1348		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1349			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1350			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1351		hdr->lso_info = htonl(tso_info);
1352		flits = 3;
1353	} else {
1354		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1355
1356		GET_VTAG(cntrl, m0);
1357		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1358		cpl->cntrl = htonl(cntrl);
1359		mlen = m0->m_pkthdr.len;
1360		cpl->len = htonl(mlen | 0x80000000);
1361
1362		if (mlen <= PIO_LEN) {
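		/*
		 * Packets of up to PIO_LEN bytes are copied straight into
		 * the descriptor (programmed I/O), so no SGL or DMA mapping
		 * is needed and the mbuf can be freed immediately.
		 */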
1363			txq_prod(txq, 1, &txqs);
1364			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1365			m_freem(m0);
1366			m0 = NULL;
1367			flits = (mlen + 7) / 8 + 2;
1368			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1369					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1370					  F_WR_SOP | F_WR_EOP | txqs.compl);
1371			wmb();
1372			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1373			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1374
1375			wr_gen2(txd, txqs.gen);
1376			check_ring_tx_db(sc, txq);
1377			DPRINTF("pio buf\n");
1378			return (0);
1379		}
1380		DPRINTF("regular buf\n");
1381		flits = 2;
1382	}
1383	wrp = (struct work_request_hdr *)txd;
1384
1385#ifdef	nomore
1386	/*
1387	 * XXX need to move into one of the helper routines above
1388	 *
1389	 */
1390	if ((err = busdma_map_mbufs(m, txq, txsd, segs, &nsegs)) != 0)
1391		return (err);
1392	m0 = *m;
1393#endif
1394	ndesc = calc_tx_descs(m0, nsegs);
1395
1396	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1397	make_sgl(sgp, segs, nsegs);
1398
1399	sgl_flits = sgl_len(nsegs);
1400
1401	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1402	txq_prod(txq, ndesc, &txqs);
1403	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1404	wr_lo = htonl(V_WR_TID(txq->token));
1405	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1406	check_ring_tx_db(pi->adapter, txq);
1407
1408	if ((m0->m_type == MT_DATA) &&
1409	    ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT) &&
1410	    (m0->m_ext.ext_type != EXT_PACKET)) {
1411		m0->m_flags &= ~M_EXT;
1412		mbufs_outstanding--;
1413		m_free(m0);
1414	}
1415
1416	return (0);
1417}
1418
1419
1420/**
1421 *	write_imm - write a packet into a Tx descriptor as immediate data
1422 *	@d: the Tx descriptor to write
1423 *	@m: the packet
1424 *	@len: the length of packet data to write as immediate data
1425 *	@gen: the generation bit value to write
1426 *
1427 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1428 *	contains a work request at its beginning.  We must write the packet
1429 *	carefully so the SGE doesn't read accidentally before it's written in
1430 *	its entirety.
1431 */
1432static __inline void
1433write_imm(struct tx_desc *d, struct mbuf *m,
1434	  unsigned int len, unsigned int gen)
1435{
1436	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1437	struct work_request_hdr *to = (struct work_request_hdr *)d;
1438
1439	if (len > WR_LEN)
1440		panic("len too big %d\n", len);
1441	if (len < sizeof(*from))
1442		panic("len too small %d", len);
1443
1444	memcpy(&to[1], &from[1], len - sizeof(*from));
1445	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1446					V_WR_BCNTLFLT(len & 7));
1447	wmb();
1448	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1449					V_WR_LEN((len + 7) / 8));
1450	wr_gen2(d, gen);
1451
1452	/*
1453	 * This check is a hack; we should really fix the logic so
1454	 * that this can't happen.
1455	 */
1456	if (m->m_type != MT_DONTFREE)
1457		m_freem(m);
1458
1459}
1460
1461/**
1462 *	check_desc_avail - check descriptor availability on a send queue
1463 *	@adap: the adapter
1464 *	@q: the TX queue
1465 *	@m: the packet needing the descriptors
1466 *	@ndesc: the number of Tx descriptors needed
1467 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1468 *
1469 *	Checks if the requested number of Tx descriptors is available on an
1470 *	SGE send queue.  If the queue is already suspended or not enough
1471 *	descriptors are available the packet is queued for later transmission.
1472 *	Must be called with the Tx queue locked.
1473 *
1474 *	Returns 0 if enough descriptors are available, 1 if there aren't
1475 *	enough descriptors and the packet has been queued, and 2 if the caller
1476 *	needs to retry because there weren't enough descriptors at the
1477 *	beginning of the call but some freed up in the mean time.
1478 */
1479static __inline int
1480check_desc_avail(adapter_t *adap, struct sge_txq *q,
1481		 struct mbuf *m, unsigned int ndesc,
1482		 unsigned int qid)
1483{
1484	/*
1485	 * XXX We currently only use this for checking the control queue
1486	 * the control queue is only used for binding qsets which happens
1487	 * at init time so we are guaranteed enough descriptors
1488	 */
1489	if (__predict_false(!mbufq_empty(&q->sendq))) {
1490addq_exit:	mbufq_tail(&q->sendq, m);
1491		return 1;
1492	}
1493	if (__predict_false(q->size - q->in_use < ndesc)) {
1494
1495		struct sge_qset *qs = txq_to_qset(q, qid);
1496
1497		printf("stopping q\n");
1498
1499		setbit(&qs->txq_stopped, qid);
1500		smp_mb();
1501
1502		if (should_restart_tx(q) &&
1503		    test_and_clear_bit(qid, &qs->txq_stopped))
1504			return 2;
1505
1506		q->stops++;
1507		goto addq_exit;
1508	}
1509	return 0;
1510}
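/*
 * Typical usage (see ctrl_xmit() and ofld_xmit() below): on a return of
 * 2 the caller reclaims completed descriptors and retries, on 1 it backs
 * off because the packet is now queued on q->sendq, and on 0 it goes on
 * to write the descriptors.
 */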
1511
1512
1513/**
1514 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1515 *	@q: the SGE control Tx queue
1516 *
1517 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1518 *	that send only immediate data (presently just the control queues) and
1519 *	thus do not have any mbufs.
1520 */
1521static __inline void
1522reclaim_completed_tx_imm(struct sge_txq *q)
1523{
1524	unsigned int reclaim = q->processed - q->cleaned;
1525
1526	mtx_assert(&q->lock, MA_OWNED);
1527
1528	q->in_use -= reclaim;
1529	q->cleaned += reclaim;
1530}
1531
1532static __inline int
1533immediate(const struct mbuf *m)
1534{
1535	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1536}
1537
1538/**
1539 *	ctrl_xmit - send a packet through an SGE control Tx queue
1540 *	@adap: the adapter
1541 *	@q: the control queue
1542 *	@m: the packet
1543 *
1544 *	Send a packet through an SGE control Tx queue.  Packets sent through
1545 *	a control queue must fit entirely as immediate data in a single Tx
1546 *	descriptor and have no page fragments.
1547 */
1548static int
1549ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1550{
1551	int ret;
1552	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1553
1554	if (__predict_false(!immediate(m))) {
1555		m_freem(m);
1556		return 0;
1557	}
1558
1559	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1560	wrp->wr_lo = htonl(V_WR_TID(q->token));
1561
1562	mtx_lock(&q->lock);
1563again:	reclaim_completed_tx_imm(q);
1564
1565	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1566	if (__predict_false(ret)) {
1567		if (ret == 1) {
1568			mtx_unlock(&q->lock);
1569			log(LOG_ERR, "no desc available\n");
1570
1571			return (ENOSPC);
1572		}
1573		goto again;
1574	}
1575	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1576
1577	q->in_use++;
1578	if (++q->pidx >= q->size) {
1579		q->pidx = 0;
1580		q->gen ^= 1;
1581	}
1582	mtx_unlock(&q->lock);
1583	wmb();
1584	t3_write_reg(adap, A_SG_KDOORBELL,
1585		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1586	return (0);
1587}
1588
1589
1590/**
1591 *	restart_ctrlq - restart a suspended control queue
1592 *	@qs: the queue set containing the control queue
1593 *
1594 *	Resumes transmission on a suspended Tx control queue.
1595 */
1596static void
1597restart_ctrlq(void *data, int npending)
1598{
1599	struct mbuf *m;
1600	struct sge_qset *qs = (struct sge_qset *)data;
1601	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1602	adapter_t *adap = qs->port->adapter;
1603
1604	log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
1605
1606	mtx_lock(&q->lock);
1607again:	reclaim_completed_tx_imm(q);
1608
1609	while (q->in_use < q->size &&
1610	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1611
1612		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1613
1614		if (++q->pidx >= q->size) {
1615			q->pidx = 0;
1616			q->gen ^= 1;
1617		}
1618		q->in_use++;
1619	}
1620	if (!mbufq_empty(&q->sendq)) {
1621		setbit(&qs->txq_stopped, TXQ_CTRL);
1622		smp_mb();
1623
1624		if (should_restart_tx(q) &&
1625		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1626			goto again;
1627		q->stops++;
1628	}
1629	mtx_unlock(&q->lock);
1630	t3_write_reg(adap, A_SG_KDOORBELL,
1631		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1632}
1633
1634
1635/*
1636 * Send a management message through control queue 0
1637 */
1638int
1639t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1640{
1641	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1642}
1643
1644
1645/**
1646 *	free_qset - free the resources of an SGE queue set
1647 *	@sc: the controller owning the queue set
1648 *	@q: the queue set
1649 *
1650 *	Release the HW and SW resources associated with an SGE queue set, such
1651 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1652 *	queue set must be quiesced prior to calling this.
1653 */
1654void
1655t3_free_qset(adapter_t *sc, struct sge_qset *q)
1656{
1657	int i;
1658
1659	t3_free_tx_desc_all(&q->txq[TXQ_ETH]);
1660
1661	for (i = 0; i < SGE_TXQ_PER_SET; i++)
1662		if (q->txq[i].txq_mr.br_ring != NULL) {
1663			free(q->txq[i].txq_mr.br_ring, M_DEVBUF);
1664			mtx_destroy(&q->txq[i].txq_mr.br_lock);
1665		}
1666	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1667		if (q->fl[i].desc) {
1668			mtx_lock(&sc->sge.reg_lock);
1669			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1670			mtx_unlock(&sc->sge.reg_lock);
1671			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1672			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1673					q->fl[i].desc_map);
1674			bus_dma_tag_destroy(q->fl[i].desc_tag);
1675			bus_dma_tag_destroy(q->fl[i].entry_tag);
1676		}
1677		if (q->fl[i].sdesc) {
1678			free_rx_bufs(sc, &q->fl[i]);
1679			free(q->fl[i].sdesc, M_DEVBUF);
1680		}
1681	}
1682
1683	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
1684		if (q->txq[i].desc) {
1685			mtx_lock(&sc->sge.reg_lock);
1686			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1687			mtx_unlock(&sc->sge.reg_lock);
1688			bus_dmamap_unload(q->txq[i].desc_tag,
1689					q->txq[i].desc_map);
1690			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1691					q->txq[i].desc_map);
1692			bus_dma_tag_destroy(q->txq[i].desc_tag);
1693			bus_dma_tag_destroy(q->txq[i].entry_tag);
1694			MTX_DESTROY(&q->txq[i].lock);
1695		}
1696		if (q->txq[i].sdesc) {
1697			free(q->txq[i].sdesc, M_DEVBUF);
1698		}
1699	}
1700
1701	if (q->rspq.desc) {
1702		mtx_lock(&sc->sge.reg_lock);
1703		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1704		mtx_unlock(&sc->sge.reg_lock);
1705
1706		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1707		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1708			        q->rspq.desc_map);
1709		bus_dma_tag_destroy(q->rspq.desc_tag);
1710		MTX_DESTROY(&q->rspq.lock);
1711	}
1712
1713	bzero(q, sizeof(*q));
1714}
1715
1716/**
1717 *	t3_free_sge_resources - free SGE resources
1718 *	@sc: the adapter softc
1719 *
1720 *	Frees resources used by the SGE queue sets.
1721 */
1722void
1723t3_free_sge_resources(adapter_t *sc)
1724{
1725	int i, nqsets;
1726
1727#ifdef IFNET_MULTIQUEUE
1728	panic("%s should not be called when IFNET_MULTIQUEUE is defined", __FUNCTION__);
1729#endif
1730	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1731		nqsets += sc->port[i].nqsets;
1732
1733	for (i = 0; i < nqsets; ++i)
1734		t3_free_qset(sc, &sc->sge.qs[i]);
1735}
1736
1737/**
1738 *	t3_sge_start - enable SGE
1739 *	@sc: the controller softc
1740 *
1741 *	Enables the SGE for DMAs.  This is the last step in starting packet
1742 *	transfers.
1743 */
1744void
1745t3_sge_start(adapter_t *sc)
1746{
1747	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1748}
1749
1750/**
1751 *	t3_sge_stop - disable SGE operation
1752 *	@sc: the adapter
1753 *
1754 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
1755 *	from error interrupts) or from normal process context.  In the latter
1756 *	case it also disables any pending queue restart tasklets.  Note that
1757 *	if it is called in interrupt context it cannot disable the restart
1758 *	tasklets as it cannot wait, however the tasklets will have no effect
1759 *	since the doorbells are disabled and the driver will call this again
1760 *	later from process context, at which time the tasklets will be stopped
1761 *	if they are still running.
1762 */
1763void
1764t3_sge_stop(adapter_t *sc)
1765{
1766	int i, nqsets;
1767
1768	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1769
1770	if (sc->tq == NULL)
1771		return;
1772
1773	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1774		nqsets += sc->port[i].nqsets;
1775#ifdef notyet
1776	/*
1777	 *
1778	 * XXX
1779	 */
1780	for (i = 0; i < nqsets; ++i) {
1781		struct sge_qset *qs = &sc->sge.qs[i];
1782
1783		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
1784		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
1785	}
1786#endif
1787}
1788
1789/**
1790 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
1791 *	@q: the Tx queue to reclaim descriptors from
1792 *	@reclaimable: the number of descriptors to reclaim
1796 *
1797 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
1798 *	Tx buffers.  Called with the Tx queue lock held.
1801 */
1802void
1803t3_free_tx_desc(struct sge_txq *q, int reclaimable)
1804{
1805	struct tx_sw_desc *txsd;
1806	unsigned int cidx;
1807
1808#ifdef T3_TRACE
1809	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1810		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
1811#endif
1812	cidx = q->cidx;
1813	txsd = &q->sdesc[cidx];
1814	DPRINTF("reclaiming %d WR\n", reclaimable);
1815	mtx_assert(&q->lock, MA_OWNED);
1816	while (reclaimable--) {
1817		DPRINTF("cidx=%d d=%p\n", cidx, txsd);
1818		if (txsd->mi.mi_base != NULL) {
1819			if (txsd->flags & TX_SW_DESC_MAPPED) {
1820				bus_dmamap_unload(q->entry_tag, txsd->map);
1821				txsd->flags &= ~TX_SW_DESC_MAPPED;
1822			}
1823			m_freem_iovec(&txsd->mi);
1824			buf_ring_scan(&q->txq_mr, txsd->mi.mi_base, __FILE__, __LINE__);
1825			txsd->mi.mi_base = NULL;
1826
1827#if defined(DIAGNOSTIC) && 0
1828			if (m_get_priority(txsd->m[0]) != cidx)
1829				printf("pri=%d cidx=%d\n",
1830				    (int)m_get_priority(txsd->m[0]), cidx);
1831#endif
1832
1833		} else
1834			q->txq_skipped++;
1835
1836		++txsd;
1837		if (++cidx == q->size) {
1838			cidx = 0;
1839			txsd = q->sdesc;
1840		}
1841	}
1842	q->cidx = cidx;
1843
1844}
1845
1846void
1847t3_free_tx_desc_all(struct sge_txq *q)
1848{
1849	int i;
1850	struct tx_sw_desc *txsd;
1851
1852	for (i = 0; i < q->size; i++) {
1853		txsd = &q->sdesc[i];
1854		if (txsd->mi.mi_base != NULL) {
1855			if (txsd->flags & TX_SW_DESC_MAPPED) {
1856				bus_dmamap_unload(q->entry_tag, txsd->map);
1857				txsd->flags &= ~TX_SW_DESC_MAPPED;
1858			}
1859			m_freem_iovec(&txsd->mi);
1860			bzero(&txsd->mi, sizeof(txsd->mi));
1861		}
1862	}
1863}
1864
1865/**
1866 *	is_new_response - check if a response is newly written
1867 *	@r: the response descriptor
1868 *	@q: the response queue
1869 *
1870 *	Returns true if a response descriptor contains a yet unprocessed
1871 *	response.
1872 */
1873static __inline int
1874is_new_response(const struct rsp_desc *r,
1875    const struct sge_rspq *q)
1876{
1877	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1878}
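/*
 * The hardware flips the generation bit it writes each time the response
 * queue wraps, so a descriptor whose F_RSPD_GEN2 bit matches q->gen has
 * been written since we last consumed this slot.
 */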
1879
1880#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1881#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1882			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1883			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1884			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1885
1886/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1887#define NOMEM_INTR_DELAY 2500
1888
1889/**
1890 *	write_ofld_wr - write an offload work request
1891 *	@adap: the adapter
1892 *	@m: the packet to send
1893 *	@q: the Tx queue
1894 *	@pidx: index of the first Tx descriptor to write
1895 *	@gen: the generation value to use
1896 *	@ndesc: number of descriptors the packet will occupy
1897 *
1898 *	Write an offload work request to send the supplied packet.  The packet
1899 *	data already carry the work request with most fields populated.
1900 */
1901static void
1902write_ofld_wr(adapter_t *adap, struct mbuf *m,
1903    struct sge_txq *q, unsigned int pidx,
1904    unsigned int gen, unsigned int ndesc,
1905    bus_dma_segment_t *segs, unsigned int nsegs)
1906{
1907	unsigned int sgl_flits, flits;
1908	struct work_request_hdr *from;
1909	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1910	struct tx_desc *d = &q->desc[pidx];
1911	struct txq_state txqs;
1912
1913	if (immediate(m) && segs == NULL) {
1914		write_imm(d, m, m->m_len, gen);
1915		return;
1916	}
1917
1918	/* Only TX_DATA builds SGLs */
1919	from = mtod(m, struct work_request_hdr *);
1920	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
1921
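	/*
	 * m->m_len covers the work request header flits copied above.  If the
	 * request fits in a single descriptor the SGL is built in place right
	 * after those flits; otherwise it is staged in the local sgl[] and
	 * spread across descriptors by write_wr_hdr_sgl().
	 */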
1922	flits = m->m_len / 8;
1923	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1924
1925	make_sgl(sgp, segs, nsegs);
1926	sgl_flits = sgl_len(nsegs);
1927
1928	txqs.gen = gen;
1929	txqs.pidx = pidx;
1930	txqs.compl = 0;
1931
1932	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1933	    from->wr_hi, from->wr_lo);
1934}
1935
1936/**
1937 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1938 *	@m: the packet
1939 *
1940 * 	Returns the number of Tx descriptors needed for the given offload
1941 * 	packet.  These packets are already fully constructed.
1942 */
1943static __inline unsigned int
1944calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
1945{
1946	unsigned int flits, cnt = 0;
1947
1948
1949	if (m->m_len <= WR_LEN)
1950		return 1;                 /* packet fits as immediate data */
1951
1952	if (m->m_flags & M_IOVEC)
1953		cnt = mtomv(m)->mv_count;
1954
1955	/* headers */
1956	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
1957
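	/*
	 * Descriptor count = header flits plus the flits consumed by an SGL
	 * with 'cnt' entries, rounded up to whole descriptors by
	 * flits_to_desc().
	 */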
1958	return flits_to_desc(flits + sgl_len(cnt));
1959}
1960
1961/**
1962 *	ofld_xmit - send a packet through an offload queue
1963 *	@adap: the adapter
1964 *	@q: the Tx offload queue
1965 *	@m: the packet
1966 *
1967 *	Send an offload packet through an SGE offload queue.
1968 */
1969static int
1970ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1971{
1972	int ret, nsegs;
1973	unsigned int ndesc;
1974	unsigned int pidx, gen;
1975	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
1976	struct tx_sw_desc *stx;
1977
1978	nsegs = m_get_sgllen(m);
1979	vsegs = m_get_sgl(m);
1980	ndesc = calc_tx_descs_ofld(m, nsegs);
1981	busdma_map_sgl(vsegs, segs, nsegs);
1982
1983	stx = &q->sdesc[q->pidx];
1984	KASSERT(stx->mi.mi_base == NULL, ("mi_base set"));
1985
1986	mtx_lock(&q->lock);
1987again:	reclaim_completed_tx_(q, 16);
1988	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
1989	if (__predict_false(ret)) {
1990		if (ret == 1) {
1991			printf("no ofld desc avail\n");
1992
1993			m_set_priority(m, ndesc);     /* save for restart */
1994			mtx_unlock(&q->lock);
1995			return (EINTR);
1996		}
1997		goto again;
1998	}
1999
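	/*
	 * Claim ndesc descriptors while the lock is held; when the producer
	 * index wraps past the end of the ring the generation bit is flipped
	 * so the hardware can distinguish new descriptors from stale ones.
	 */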
2000	gen = q->gen;
2001	q->in_use += ndesc;
2002	pidx = q->pidx;
2003	q->pidx += ndesc;
2004	if (q->pidx >= q->size) {
2005		q->pidx -= q->size;
2006		q->gen ^= 1;
2007	}
2008#ifdef T3_TRACE
2009	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2010		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2011		  ndesc, pidx, m->m_pkthdr.len, m->m_len,
2012		  nsegs);
2013#endif
2014	mtx_unlock(&q->lock);
2015
2016	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2017	check_ring_tx_db(adap, q);
2018
2019	return (0);
2020}
2021
2022/**
2023 *	restart_offloadq - restart a suspended offload queue
2024 *	@data: the queue set containing the offload queue
2025 *
2026 *	Resumes transmission on a suspended Tx offload queue.
2027 */
2028static void
2029restart_offloadq(void *data, int npending)
2030{
2031	struct mbuf *m;
2032	struct sge_qset *qs = data;
2033	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2034	adapter_t *adap = qs->port->adapter;
2035	bus_dma_segment_t segs[TX_MAX_SEGS];
2036	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2037	int nsegs, cleaned;
2038
2039	mtx_lock(&q->lock);
2040again:	cleaned = reclaim_completed_tx_(q, 16);
2041
2042	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2043		unsigned int gen, pidx;
2044		unsigned int ndesc = m_get_priority(m);
2045
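		/*
		 * Not enough room: mark the queue stopped, then re-check for
		 * space after the barrier so a completion that freed
		 * descriptors in the meantime is not missed.
		 */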
2046		if (__predict_false(q->size - q->in_use < ndesc)) {
2047			setbit(&qs->txq_stopped, TXQ_OFLD);
2048			smp_mb();
2049
2050			if (should_restart_tx(q) &&
2051			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2052				goto again;
2053			q->stops++;
2054			break;
2055		}
2056
2057		gen = q->gen;
2058		q->in_use += ndesc;
2059		pidx = q->pidx;
2060		q->pidx += ndesc;
2061		if (q->pidx >= q->size) {
2062			q->pidx -= q->size;
2063			q->gen ^= 1;
2064		}
2065
2066		(void)mbufq_dequeue(&q->sendq);
2067		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2068		mtx_unlock(&q->lock);
2069		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2070		mtx_lock(&q->lock);
2071	}
2072	mtx_unlock(&q->lock);
2073
2074#if USE_GTS
2075	set_bit(TXQ_RUNNING, &q->flags);
2076	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2077#endif
2078	t3_write_reg(adap, A_SG_KDOORBELL,
2079		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2080}
2081
2082/**
2083 *	queue_set - return the queue set a packet should use
2084 *	@m: the packet
2085 *
2086 *	Maps a packet to the SGE queue set it should use.  The desired queue
2087 *	set is carried in bits 1-3 in the packet's priority.
2088 */
2089static __inline int
2090queue_set(const struct mbuf *m)
2091{
2092	return m_get_priority(m) >> 1;
2093}
2094
2095/**
2096 *	is_ctrl_pkt - return whether an offload packet is a control packet
2097 *	@m: the packet
2098 *
2099 *	Determines whether an offload packet should use an OFLD or a CTRL
2100 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2101 */
2102static __inline int
2103is_ctrl_pkt(const struct mbuf *m)
2104{
2105	return m_get_priority(m) & 1;
2106}
2107
2108/**
2109 *	t3_offload_tx - send an offload packet
2110 *	@tdev: the offload device to send to
2111 *	@m: the packet
2112 *
2113 *	Sends an offload packet.  We use the packet priority to select the
2114 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2115 *	should be sent as regular or control, bits 1-3 select the queue set.
2116 */
2117int
2118t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2119{
2120	adapter_t *adap = tdev2adap(tdev);
2121	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2122
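	/*
	 * Example: a priority value of 5 (binary 101) selects queue set 2
	 * (bits 1-3) and, because bit 0 is set, that set's control queue.
	 */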
2123	if (__predict_false(is_ctrl_pkt(m)))
2124		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
2125
2126	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
2127}
2128
2129/**
2130 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2131 *	@tdev: the offload device that will be receiving the packets
2132 *	@q: the SGE response queue that assembled the bundle
2133 *	@m: the partial bundle
2134 *	@n: the number of packets in the bundle
2135 *
2136 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2137 */
2138static __inline void
2139deliver_partial_bundle(struct t3cdev *tdev,
2140			struct sge_rspq *q,
2141			struct mbuf *mbufs[], int n)
2142{
2143	if (n) {
2144		q->offload_bundles++;
2145		cxgb_ofld_recv(tdev, mbufs, n);
2146	}
2147}
2148
2149static __inline int
2150rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2151    struct mbuf *m, struct mbuf *rx_gather[],
2152    unsigned int gather_idx)
2153{
2154
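	/*
	 * Batch offload packets into rx_gather[] and hand them to the TOE
	 * layer RX_BUNDLE_SIZE at a time; any partial batch is flushed later
	 * by deliver_partial_bundle().
	 */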
2155	rq->offload_pkts++;
2156	m->m_pkthdr.header = mtod(m, void *);
2157	rx_gather[gather_idx++] = m;
2158	if (gather_idx == RX_BUNDLE_SIZE) {
2159		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2160		gather_idx = 0;
2161		rq->offload_bundles++;
2162	}
2163	return (gather_idx);
2164}
2165
2166static void
2167restart_tx(struct sge_qset *qs)
2168{
2169	struct adapter *sc = qs->port->adapter;
2170
2171
2172	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2173	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2174	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2175		qs->txq[TXQ_OFLD].restarts++;
2176		DPRINTF("restarting TXQ_OFLD\n");
2177		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2178	}
2179	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2180	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2181	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2182	    qs->txq[TXQ_CTRL].in_use);
2183
2184	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2185	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2186	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2187		qs->txq[TXQ_CTRL].restarts++;
2188		DPRINTF("restarting TXQ_CTRL\n");
2189		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2190	}
2191}
2192
2193/**
2194 *	t3_sge_alloc_qset - initialize an SGE queue set
2195 *	@sc: the controller softc
2196 *	@id: the queue set id
2197 *	@nports: how many Ethernet ports will be using this queue set
2198 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2199 *	@p: configuration parameters for this queue set
2200 *	@ntxq: number of Tx queues for the queue set
2201 *	@pi: port info for queue set
2202 *
2203 *	Allocate resources and initialize an SGE queue set.  A queue set
2204 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2205 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2206 *	queue, offload queue, and control queue.
2207 */
2208int
2209t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2210		  const struct qset_params *p, int ntxq, struct port_info *pi)
2211{
2212	struct sge_qset *q = &sc->sge.qs[id];
2213	int i, header_size, ret = 0;
2214
2215	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2216		if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *),
2217			    M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
2218			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2219			goto err;
2220		}
2221		q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0;
2222		q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size;
2223		mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF);
2224	}
2225
2226	init_qset_cntxt(q, id);
2227	q->idx = id;
2228
2229	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2230		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2231		    &q->fl[0].desc, &q->fl[0].sdesc,
2232		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2233		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2234		printf("error %d from alloc ring fl0\n", ret);
2235		goto err;
2236	}
2237
2238	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2239		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2240		    &q->fl[1].desc, &q->fl[1].sdesc,
2241		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2242		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2243		printf("error %d from alloc ring fl1\n", ret);
2244		goto err;
2245	}
2246
2247	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2248		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2249		    &q->rspq.desc_tag, &q->rspq.desc_map,
2250		    NULL, NULL)) != 0) {
2251		printf("error %d from alloc ring rspq\n", ret);
2252		goto err;
2253	}
2254
2255	for (i = 0; i < ntxq; ++i) {
2256		/*
2257		 * The control queue always uses immediate data so does not
2258		 * need to keep track of any mbufs.
2259		 * XXX Placeholder for future TOE support.
2260		 */
2261		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2262
2263		if ((ret = alloc_ring(sc, p->txq_size[i],
2264			    sizeof(struct tx_desc), sz,
2265			    &q->txq[i].phys_addr, &q->txq[i].desc,
2266			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2267			    &q->txq[i].desc_map,
2268			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2269			printf("error %d from alloc ring tx %i\n", ret, i);
2270			goto err;
2271		}
2272		mbufq_init(&q->txq[i].sendq);
2273		q->txq[i].gen = 1;
2274		q->txq[i].size = p->txq_size[i];
2275		snprintf(q->txq[i].lockbuf, TXQ_NAME_LEN, "t3 txq lock %d:%d:%d",
2276		    device_get_unit(sc->dev), irq_vec_idx, i);
2277		MTX_INIT(&q->txq[i].lock, q->txq[i].lockbuf, NULL, MTX_DEF);
2278	}
2279
2280	q->txq[TXQ_ETH].port = pi;
2281
2282	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2283	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2284	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_ETH]);
2285	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_OFLD]);
2286
2287	q->fl[0].gen = q->fl[1].gen = 1;
2288	q->fl[0].size = p->fl_size;
2289	q->fl[1].size = p->jumbo_size;
2290
2291	q->rspq.gen = 1;
2292	q->rspq.cidx = 0;
2293	q->rspq.size = p->rspq_size;
2294
2295
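	/*
	 * Each receive cluster doubles as its own mbuf: an mbuf header,
	 * packet header, external-storage descriptor and reference count are
	 * carved out of the front of the buffer (see init_cluster_mbuf()),
	 * so the usable free-list buffer size is the cluster size minus
	 * header_size.
	 */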
2296	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
2297	q->txq[TXQ_ETH].stop_thres = nports *
2298	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2299
2300	q->fl[0].buf_size = (MCLBYTES - header_size);
2301	q->fl[0].zone = zone_clust;
2302	q->fl[0].type = EXT_CLUSTER;
2303#if __FreeBSD_version > 800000
2304	if (cxgb_use_16k_clusters) {
2305		q->fl[1].buf_size = MJUM16BYTES - header_size;
2306		q->fl[1].zone = zone_jumbo16;
2307		q->fl[1].type = EXT_JUMBO16;
2308	} else {
2309		q->fl[1].buf_size = MJUM9BYTES - header_size;
2310		q->fl[1].zone = zone_jumbo9;
2311		q->fl[1].type = EXT_JUMBO9;
2312	}
2313#else
2314	q->fl[1].buf_size = MJUMPAGESIZE - header_size;
2315	q->fl[1].zone = zone_jumbop;
2316	q->fl[1].type = EXT_JUMBOP;
2317#endif
2318	q->lro.enabled = lro_default;
2319
2320	mtx_lock(&sc->sge.reg_lock);
2321	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2322				   q->rspq.phys_addr, q->rspq.size,
2323				   q->fl[0].buf_size, 1, 0);
2324	if (ret) {
2325		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2326		goto err_unlock;
2327	}
2328
2329	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2330		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2331					  q->fl[i].phys_addr, q->fl[i].size,
2332					  q->fl[i].buf_size, p->cong_thres, 1,
2333					  0);
2334		if (ret) {
2335			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2336			goto err_unlock;
2337		}
2338	}
2339
2340	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2341				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2342				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2343				 1, 0);
2344	if (ret) {
2345		printf("error %d from t3_sge_init_ecntxt\n", ret);
2346		goto err_unlock;
2347	}
2348
2349	if (ntxq > 1) {
2350		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2351					 USE_GTS, SGE_CNTXT_OFLD, id,
2352					 q->txq[TXQ_OFLD].phys_addr,
2353					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2354		if (ret) {
2355			printf("error %d from t3_sge_init_ecntxt\n", ret);
2356			goto err_unlock;
2357		}
2358	}
2359
2360	if (ntxq > 2) {
2361		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2362					 SGE_CNTXT_CTRL, id,
2363					 q->txq[TXQ_CTRL].phys_addr,
2364					 q->txq[TXQ_CTRL].size,
2365					 q->txq[TXQ_CTRL].token, 1, 0);
2366		if (ret) {
2367			printf("error %d from t3_sge_init_ecntxt\n", ret);
2368			goto err_unlock;
2369		}
2370	}
2371
2372	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2373	    device_get_unit(sc->dev), irq_vec_idx);
2374	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2375
2376	mtx_unlock(&sc->sge.reg_lock);
2377	t3_update_qset_coalesce(q, p);
2378	q->port = pi;
2379
2380	refill_fl(sc, &q->fl[0], q->fl[0].size);
2381	refill_fl(sc, &q->fl[1], q->fl[1].size);
2382	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2383
2384	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2385		     V_NEWTIMER(q->rspq.holdoff_tmr));
2386
2387	return (0);
2388
2389err_unlock:
2390	mtx_unlock(&sc->sge.reg_lock);
2391err:
2392	t3_free_qset(sc, q);
2393
2394	return (ret);
2395}
2396
2397void
2398t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2399{
2400	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2401	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2402	struct ifnet *ifp = pi->ifp;
2403
2404	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2405
2406	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2407	    cpl->csum_valid && cpl->csum == 0xffff) {
2408		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2409		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID |
2410		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2411		m->m_pkthdr.csum_data = 0xffff;
2412	}
2413	/*
2414	 * XXX need to add VLAN support for 6.x
2415	 */
2416#ifdef VLAN_SUPPORTED
2417	if (__predict_false(cpl->vlan_valid)) {
2418		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2419		m->m_flags |= M_VLANTAG;
2420	}
2421#endif
2422
2423	m->m_pkthdr.rcvif = ifp;
2424	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2425#ifndef DISABLE_MBUF_IOVEC
2426	m_explode(m);
2427#endif
2428	/*
2429	 * adjust after conversion to mbuf chain
2430	 */
2431	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2432	m->m_len -= (sizeof(*cpl) + ethpad);
2433	m->m_data += (sizeof(*cpl) + ethpad);
2434
2435	(*ifp->if_input)(ifp, m);
2436}
2437
2438static void
2439ext_free_handler(void *cl, void * arg)
2440{
2441	uintptr_t type = (uintptr_t)arg;
2442	uma_zone_t zone;
2443	struct mbuf *m;
2444
2445	m = cl;
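	/*
	 * The cluster embeds its own mbuf header (see init_cluster_mbuf()),
	 * so restore the real external type and return the whole cluster to
	 * its zone cache.
	 */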
2446	zone = m_getzonefromtype(type);
2447	m->m_ext.ext_type = (int)type;
2448	cxgb_ext_freed++;
2449	cxgb_cache_put(zone, cl);
2450}
2451
2452static void
2453init_cluster_mbuf(caddr_t cl, int flags, int type, uma_zone_t zone)
2454{
2455	struct mbuf *m;
2456	int header_size;
2457
2458	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) +
2459	    sizeof(struct m_ext_) + sizeof(uint32_t);
2460
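	/*
	 * Lay the mbuf header, packet header, external-storage descriptor
	 * and reference count out at the front of the cluster; the payload
	 * starts immediately after them at cl + header_size.
	 */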
2461	bzero(cl, header_size);
2462	m = (struct mbuf *)cl;
2463
2464	SLIST_INIT(&m->m_pkthdr.tags);
2465	m->m_type = MT_DATA;
2466	m->m_flags = flags | M_NOFREE | M_EXT;
2467	m->m_data = cl + header_size;
2468	m->m_ext.ext_buf = cl;
2469	m->m_ext.ref_cnt = (uint32_t *)(cl + header_size - sizeof(uint32_t));
2470	m->m_ext.ext_size = m_getsizefromtype(type);
2471	m->m_ext.ext_free = ext_free_handler;
2472	m->m_ext.ext_args = (void *)(uintptr_t)type;
2473	m->m_ext.ext_type = EXT_EXTREF;
2474	*(m->m_ext.ref_cnt) = 1;
2475	DPRINTF("data=%p ref_cnt=%p\n", m->m_data, m->m_ext.ref_cnt);
2476}
2477
2478
2479/**
2480 *	get_packet - return the next ingress packet buffer from a free list
2481 *	@adap: the adapter that received the packet
2482 *	@drop_thres: # of remaining buffers before we start dropping packets
2483 *	@qs: the qset that the SGE free list holding the packet belongs to
2484 *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2485 *      @r: response descriptor
2486 *
2487 *	Get the next packet from a free list and complete setup of the
2488 *	sk_buff.  If the packet is small we make a copy and recycle the
2489 *	original buffer, otherwise we use the original buffer itself.  If a
2490 *	positive drop threshold is supplied packets are dropped and their
2491 *	buffers recycled if (a) the number of remaining buffers is under the
2492 *	threshold and the packet is too big to copy, or (b) the packet should
2493 *	be copied but there is no memory for the copy.
2494 */
2495#ifdef DISABLE_MBUF_IOVEC
2496
2497static int
2498get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2499    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2500{
2501
2502	unsigned int len_cq =  ntohl(r->len_cq);
2503	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2504	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2505	uint32_t len = G_RSPD_LEN(len_cq);
2506	uint32_t flags = ntohl(r->flags);
2507	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2508	struct mbuf *m, *m0;
2509	void *cl;
2510	int ret = 0;
2511
2512	prefetch(sd->rxsd_cl);
2513
2514	fl->credits--;
2515	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2516
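	/*
	 * Small single-buffer packets are copied into a fresh mbuf so the
	 * DMA-mapped cluster can be recycled straight back onto the free
	 * list; larger packets hand the cluster itself up the stack.
	 */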
2517	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2518		if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2519			goto skip_recycle;
2520		cl = mtod(m0, void *);
2521		memcpy(cl, sd->data, len);
2522		recycle_rx_buf(adap, fl, fl->cidx);
2523		m = m0;
2524	} else {
2525	skip_recycle:
2526		int flags = 0;
2527		bus_dmamap_unload(fl->entry_tag, sd->map);
2528		cl = sd->rxsd_cl;
2529		m = m0 = (struct mbuf *)cl;
2530
2531		m0->m_len = len;
2532		if ((sopeop == RSPQ_SOP_EOP) ||
2533		    (sopeop == RSPQ_SOP))
2534			flags = M_PKTHDR;
2535		init_cluster_mbuf(cl, flags, fl->type, fl->zone);
2536	}
2537
2538	switch(sopeop) {
2539	case RSPQ_SOP_EOP:
2540		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2541		mh->mh_head = mh->mh_tail = m;
2542		m->m_pkthdr.len = len;
2543		ret = 1;
2544		break;
2545	case RSPQ_NSOP_NEOP:
2546		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2547		if (mh->mh_tail == NULL) {
2548			printf("discarding intermediate descriptor entry\n");
2549			m_freem(m);
2550			break;
2551		}
2552		mh->mh_tail->m_next = m;
2553		mh->mh_tail = m;
2554		mh->mh_head->m_pkthdr.len += len;
2555		ret = 0;
2556		break;
2557	case RSPQ_SOP:
2558		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2559		m->m_pkthdr.len = len;
2560		mh->mh_head = mh->mh_tail = m;
2561		ret = 0;
2562		break;
2563	case RSPQ_EOP:
2564		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2565		mh->mh_head->m_pkthdr.len += len;
2566		mh->mh_tail->m_next = m;
2567		mh->mh_tail = m;
2568		ret = 1;
2569		break;
2570	}
2571	if (++fl->cidx == fl->size)
2572		fl->cidx = 0;
2573
2574	return (ret);
2575}
2576
2577#else
2578
2579static int
2580get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2581    struct mbuf **m, struct rsp_desc *r)
2582{
2583
2584	unsigned int len_cq =  ntohl(r->len_cq);
2585	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2586	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2587	uint32_t len = G_RSPD_LEN(len_cq);
2588	uint32_t flags = ntohl(r->flags);
2589	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2590	void *cl;
2591	int ret = 0;
2592	struct mbuf *m0;
2593#if 0
2594	if ((sd + 1 )->rxsd_cl)
2595		prefetch((sd + 1)->rxsd_cl);
2596	if ((sd + 2)->rxsd_cl)
2597		prefetch((sd + 2)->rxsd_cl);
2598#endif
2599	DPRINTF("rx cpu=%d\n", curcpu);
2600	fl->credits--;
2601	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2602
2603	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2604		if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2605			goto skip_recycle;
2606		cl = mtod(m0, void *);
2607		memcpy(cl, sd->data, len);
2608		recycle_rx_buf(adap, fl, fl->cidx);
2609		*m = m0;
2610	} else {
2611	skip_recycle:
2612		bus_dmamap_unload(fl->entry_tag, sd->map);
2613		cl = sd->rxsd_cl;
2614		*m = m0 = (struct mbuf *)cl;
2615	}
2616
2617	switch(sopeop) {
2618	case RSPQ_SOP_EOP:
2619		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2620		if (cl == sd->rxsd_cl)
2621			init_cluster_mbuf(cl, M_PKTHDR, fl->type, fl->zone);
2622		m0->m_len = m0->m_pkthdr.len = len;
2623		ret = 1;
2624		goto done;
2625		break;
2626	case RSPQ_NSOP_NEOP:
2627		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2628		panic("chaining unsupported");
2629		ret = 0;
2630		break;
2631	case RSPQ_SOP:
2632		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2633		panic("chaining unsupported");
2634		m_iovinit(m0);
2635		ret = 0;
2636		break;
2637	case RSPQ_EOP:
2638		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2639		panic("chaining unsupported");
2640		ret = 1;
2641		break;
2642	}
2643	panic("append not supported");
2644#if 0
2645	m_iovappend(m0, cl, fl->buf_size, len, sizeof(uint32_t), sd->rxsd_ref);
2646#endif
2647done:
2648	if (++fl->cidx == fl->size)
2649		fl->cidx = 0;
2650
2651	return (ret);
2652}
2653#endif
2654/**
2655 *	handle_rsp_cntrl_info - handles control information in a response
2656 *	@qs: the queue set corresponding to the response
2657 *	@flags: the response control flags
2658 *
2659 *	Handles the control information of an SGE response, such as GTS
2660 *	indications and completion credits for the queue set's Tx queues.
2661 *	HW coalesces credits, we don't do any extra SW coalescing.
2662 */
2663static __inline void
2664handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2665{
2666	unsigned int credits;
2667
2668#if USE_GTS
2669	if (flags & F_RSPD_TXQ0_GTS)
2670		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2671#endif
2672	credits = G_RSPD_TXQ0_CR(flags);
2673	if (credits)
2674		qs->txq[TXQ_ETH].processed += credits;
2675
2676	credits = G_RSPD_TXQ2_CR(flags);
2677	if (credits)
2678		qs->txq[TXQ_CTRL].processed += credits;
2679
2680# if USE_GTS
2681	if (flags & F_RSPD_TXQ1_GTS)
2682		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2683# endif
2684	credits = G_RSPD_TXQ1_CR(flags);
2685	if (credits)
2686		qs->txq[TXQ_OFLD].processed += credits;
2687
2688}
2689
2690static void
2691check_ring_db(adapter_t *adap, struct sge_qset *qs,
2692    unsigned int sleeping)
2693{
2694	;
2695}
2696
2697/**
2698 *	process_responses - process responses from an SGE response queue
2699 *	@adap: the adapter
2700 *	@qs: the queue set to which the response queue belongs
2701 *	@budget: how many responses can be processed in this round
2702 *
2703 *	Process responses from an SGE response queue up to the supplied budget.
2704 *	Responses include received packets as well as credits and other events
2705 *	for the queues that belong to the response queue's queue set.
2706 *	A negative budget is effectively unlimited.
2707 *
2708 *	Additionally choose the interrupt holdoff time for the next interrupt
2709 *	on this queue.  If the system is under memory shortage use a fairly
2710 *	long delay to help recovery.
2711 */
2712int
2713process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2714{
2715	struct sge_rspq *rspq = &qs->rspq;
2716	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2717	int budget_left = budget;
2718	unsigned int sleeping = 0;
2719	int lro = qs->lro.enabled;
2720	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2721	int ngathered = 0;
2722#ifdef DEBUG
2723	static int last_holdoff = 0;
2724	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2725		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2726		last_holdoff = rspq->holdoff_tmr;
2727	}
2728#endif
2729	rspq->next_holdoff = rspq->holdoff_tmr;
2730
2731	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2732		int eth, eop = 0, ethpad = 0;
2733		uint32_t flags = ntohl(r->flags);
2734		uint32_t rss_csum = *(const uint32_t *)r;
2735		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2736
2737		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2738
2739		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2740			/* XXX */
2741			printf("async notification\n");
2742
2743		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2744			struct mbuf *m = NULL;
2745
2746#ifdef DISABLE_MBUF_IOVEC
2747			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
2748			    r->rss_hdr.opcode, rspq->cidx);
2749
2750			m = rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2751			if (m == NULL) {
2752				rspq->next_holdoff = NOMEM_INTR_DELAY;
2753				budget_left--;
2754				break;
2755			}
2756
2757			get_imm_packet(adap, r, &rspq->rspq_mh);
2758			eop = 1;
2759#else
2760			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
2761			    r->rss_hdr.opcode, rspq->cidx);
2762			if (rspq->rspq_mbuf == NULL)
2763				rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
2764			else
2765				m = m_gethdr(M_DONTWAIT, MT_DATA);
2766
2767			/*
2768			 * XXX revisit me
2769			 */
2770			if (rspq->rspq_mbuf == NULL &&  m == NULL) {
2771				rspq->next_holdoff = NOMEM_INTR_DELAY;
2772				budget_left--;
2773				break;
2774			}
2775			get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags);
2776
2777			eop = 1;
2778			rspq->imm_data++;
2779#endif
2780		} else if (r->len_cq) {
2781			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2782
2783#ifdef DISABLE_MBUF_IOVEC
2784			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
2785#else
2786			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mbuf, r);
2787#ifdef IFNET_MULTIQUEUE
2788			rspq->rspq_mbuf->m_pkthdr.rss_hash = rss_hash;
2789#endif
2790#endif
2791			ethpad = 2;
2792		} else {
2793			DPRINTF("pure response\n");
2794			rspq->pure_rsps++;
2795		}
2796
2797		if (flags & RSPD_CTRL_MASK) {
2798			sleeping |= flags & RSPD_GTS_MASK;
2799			handle_rsp_cntrl_info(qs, flags);
2800		}
2801
2802		r++;
2803		if (__predict_false(++rspq->cidx == rspq->size)) {
2804			rspq->cidx = 0;
2805			rspq->gen ^= 1;
2806			r = rspq->desc;
2807		}
2808		prefetch(r);
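		/*
		 * Return freed response-queue entries to the hardware in
		 * quarter-ring batches instead of one at a time.
		 */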
2809		if (++rspq->credits >= (rspq->size / 4)) {
2810			refill_rspq(adap, rspq, rspq->credits);
2811			rspq->credits = 0;
2812		}
2813		DPRINTF("eth=%d eop=%d flags=0x%x\n", eth, eop, flags);
2814
2815		if (!eth && eop) {
2816			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
2817			/*
2818			 * XXX size mismatch
2819			 */
2820			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
2821
2822			ngathered = rx_offload(&adap->tdev, rspq,
2823			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
2824			rspq->rspq_mh.mh_head = NULL;
2825			DPRINTF("received offload packet\n");
2826
2827		} else if (eth && eop) {
2828			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
2829			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
2830
2831			t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
2832			    rss_hash, rss_csum, lro);
2833			DPRINTF("received tunnel packet\n");
2834			rspq->rspq_mh.mh_head = NULL;
2835
2836		}
2837		__refill_fl_lt(adap, &qs->fl[0], 32);
2838		__refill_fl_lt(adap, &qs->fl[1], 32);
2839		--budget_left;
2840	}
2841
2842	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2843	t3_lro_flush(adap, qs, &qs->lro);
2844
2845	if (sleeping)
2846		check_ring_db(adap, qs, sleeping);
2847
2848	smp_mb();  /* commit Tx queue processed updates */
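	/*
	 * restart_tx() below only resumes the offload and control queues, so
	 * this check ignores the case where only the Ethernet queue's stop
	 * bit (bit 0) is set, hence the '> 1' test on the stop bitmask.
	 */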
2849	if (__predict_false(qs->txq_stopped > 1)) {
2850		printf("restarting tx on %p\n", qs);
2851
2852		restart_tx(qs);
2853	}
2854
2855	__refill_fl_lt(adap, &qs->fl[0], 512);
2856	__refill_fl_lt(adap, &qs->fl[1], 512);
2857	budget -= budget_left;
2858	return (budget);
2859}
2860
2861/*
2862 * A helper function that processes responses and issues GTS.
2863 */
2864static __inline int
2865process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2866{
2867	int work;
2868	static int last_holdoff = 0;
2869
2870	work = process_responses(adap, rspq_to_qset(rq), -1);
2871
2872	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2873		printf("next_holdoff=%d\n", rq->next_holdoff);
2874		last_holdoff = rq->next_holdoff;
2875	}
2876	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2877	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2878
2879	return (work);
2880}
2881
2882
2883/*
2884 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2885 * Handles data events from SGE response queues as well as error and other
2886 * async events as they all use the same interrupt pin.  We use one SGE
2887 * response queue per port in this mode and protect all response queues with
2888 * queue 0's lock.
2889 */
2890void
2891t3b_intr(void *data)
2892{
2893	uint32_t i, map;
2894	adapter_t *adap = data;
2895	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2896
2897	t3_write_reg(adap, A_PL_CLI, 0);
2898	map = t3_read_reg(adap, A_SG_DATA_INTR);
2899
2900	if (!map)
2901		return;
2902
2903	if (__predict_false(map & F_ERRINTR))
2904		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2905
2906	mtx_lock(&q0->lock);
2907	for_each_port(adap, i)
2908	    if (map & (1 << i))
2909			process_responses_gts(adap, &adap->sge.qs[i].rspq);
2910	mtx_unlock(&q0->lock);
2911}
2912
2913/*
2914 * The MSI interrupt handler.  This needs to handle data events from SGE
2915 * response queues as well as error and other async events as they all use
2916 * the same MSI vector.  We use one SGE response queue per port in this mode
2917 * and protect all response queues with queue 0's lock.
2918 */
2919void
2920t3_intr_msi(void *data)
2921{
2922	adapter_t *adap = data;
2923	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2924	int i, new_packets = 0;
2925
2926	mtx_lock(&q0->lock);
2927
2928	for_each_port(adap, i)
2929	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
2930		    new_packets = 1;
2931	mtx_unlock(&q0->lock);
2932	if (new_packets == 0)
2933		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2934}
2935
2936void
2937t3_intr_msix(void *data)
2938{
2939	struct sge_qset *qs = data;
2940	adapter_t *adap = qs->port->adapter;
2941	struct sge_rspq *rspq = &qs->rspq;
2942#ifndef IFNET_MULTIQUEUE
2943	mtx_lock(&rspq->lock);
2944#else
2945	if (mtx_trylock(&rspq->lock))
2946#endif
2947	{
2948
2949		if (process_responses_gts(adap, rspq) == 0)
2950			rspq->unhandled_irqs++;
2951		mtx_unlock(&rspq->lock);
2952	}
2953}
2954
2955#define QDUMP_SBUF_SIZE		(32 * 400)
2956static int
2957t3_dump_rspq(SYSCTL_HANDLER_ARGS)
2958{
2959	struct sge_rspq *rspq;
2960	struct sge_qset *qs;
2961	int i, err, dump_end, idx;
2962	static int multiplier = 1;
2963	struct sbuf *sb;
2964	struct rsp_desc *rspd;
2965	uint32_t data[4];
2966
2967	rspq = arg1;
2968	qs = rspq_to_qset(rspq);
2969	if (rspq->rspq_dump_count == 0)
2970		return (0);
2971	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
2972		log(LOG_WARNING,
2973		    "dump count is too large %d\n", rspq->rspq_dump_count);
2974		rspq->rspq_dump_count = 0;
2975		return (EINVAL);
2976	}
2977	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
2978		log(LOG_WARNING,
2979		    "dump start of %d is greater than queue size\n",
2980		    rspq->rspq_dump_start);
2981		rspq->rspq_dump_start = 0;
2982		return (EINVAL);
2983	}
2984	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
2985	if (err)
2986		return (err);
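	/*
	 * The sbuf is fixed-length; if the dump overflows it, discard the
	 * buffer and retry with a larger size multiplier.
	 */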
2987retry_sbufops:
2988	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
2989
2990	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
2991	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
2992	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
2993	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
2994	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
2995
2996	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
2997	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
2998
2999	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3000	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3001		idx = i & (RSPQ_Q_SIZE-1);
3002
3003		rspd = &rspq->desc[idx];
3004		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3005		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3006		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3007		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3008		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3009		    be32toh(rspd->len_cq), rspd->intr_gen);
3010	}
3011	if (sbuf_overflowed(sb)) {
3012		sbuf_delete(sb);
3013		multiplier++;
3014		goto retry_sbufops;
3015	}
3016	sbuf_finish(sb);
3017	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3018	sbuf_delete(sb);
3019	return (err);
3020}
3021
3022
3023/*
3024 * broken by recent mbuf changes
3025 */
3026static int
3027t3_dump_txq(SYSCTL_HANDLER_ARGS)
3028{
3029	struct sge_txq *txq;
3030	struct sge_qset *qs;
3031	int i, j, err, dump_end;
3032	static int multiplier = 1;
3033	struct sbuf *sb;
3034	struct tx_desc *txd;
3035	uint32_t *WR, wr_hi, wr_lo, gen;
3036	uint32_t data[4];
3037
3038	txq = arg1;
3039	qs = txq_to_qset(txq, TXQ_ETH);
3040	if (txq->txq_dump_count == 0) {
3041		return (0);
3042	}
3043	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3044		log(LOG_WARNING,
3045		    "dump count is too large %d\n", txq->txq_dump_count);
3046		txq->txq_dump_count = 1;
3047		return (EINVAL);
3048	}
3049	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3050		log(LOG_WARNING,
3051		    "dump start of %d is greater than queue size\n",
3052		    txq->txq_dump_start);
3053		txq->txq_dump_start = 0;
3054		return (EINVAL);
3055	}
3056	err = t3_sge_read_ecntxt(qs->port->adapter, txq->cntxt_id, data);
3057	if (err)
3058		return (err);
3059
3060
3061retry_sbufops:
3062	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3063
3064	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3065	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3066	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3067	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3068	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3069	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3070	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3071	    txq->txq_dump_start,
3072	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3073
3074	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3075	for (i = txq->txq_dump_start; i < dump_end; i++) {
3076		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3077		WR = (uint32_t *)txd->flit;
3078		wr_hi = ntohl(WR[0]);
3079		wr_lo = ntohl(WR[1]);
3080		gen = G_WR_GEN(wr_lo);
3081
3082		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3083		    wr_hi, wr_lo, gen);
3084		for (j = 2; j < 30; j += 4)
3085			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3086			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3087
3088	}
3089	if (sbuf_overflowed(sb)) {
3090		sbuf_delete(sb);
3091		multiplier++;
3092		goto retry_sbufops;
3093	}
3094	sbuf_finish(sb);
3095	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3096	sbuf_delete(sb);
3097	return (err);
3098}
3099
3100
3101static int
3102t3_lro_enable(SYSCTL_HANDLER_ARGS)
3103{
3104	adapter_t *sc;
3105	int i, j, enabled, err, nqsets = 0;
3106
3107#ifndef LRO_WORKING
3108	return (0);
3109#endif
3110	sc = arg1;
3111	enabled = sc->sge.qs[0].lro.enabled;
3112	err = sysctl_handle_int(oidp, &enabled, arg2, req);
3113
3114	if (err != 0)
3115		return (err);
3116	if (enabled == sc->sge.qs[0].lro.enabled)
3117		return (0);
3118
3119	for (i = 0; i < sc->params.nports; i++)
3120		for (j = 0; j < sc->port[i].nqsets; j++)
3121			nqsets++;
3122
3123	for (i = 0; i < nqsets; i++)
3124		sc->sge.qs[i].lro.enabled = enabled;
3125
3126	return (0);
3127}
3128
3129static int
3130t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
3131{
3132	adapter_t *sc = arg1;
3133	struct qset_params *qsp = &sc->params.sge.qset[0];
3134	int coalesce_nsecs;
3135	struct sge_qset *qs;
3136	int i, j, err, nqsets = 0;
3137	struct mtx *lock;
3138
3139	if ((sc->flags & FULL_INIT_DONE) == 0)
3140		return (ENXIO);
3141
3142	coalesce_nsecs = qsp->coalesce_nsecs;
3143	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
3144
3145	if (err != 0) {
3146		return (err);
3147	}
3148	if (coalesce_nsecs == qsp->coalesce_nsecs)
3149		return (0);
3150
3151	for (i = 0; i < sc->params.nports; i++)
3152		for (j = 0; j < sc->port[i].nqsets; j++)
3153			nqsets++;
3154
3155	coalesce_nsecs = max(100, coalesce_nsecs);
3156
3157	for (i = 0; i < nqsets; i++) {
3158		qs = &sc->sge.qs[i];
3159		qsp = &sc->params.sge.qset[i];
3160		qsp->coalesce_nsecs = coalesce_nsecs;
3161
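		/*
		 * With MSI-X each response queue has its own lock; otherwise
		 * every queue is protected by queue 0's lock, matching the
		 * interrupt handlers.
		 */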
3162		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3163			    &sc->sge.qs[0].rspq.lock;
3164
3165		mtx_lock(lock);
3166		t3_update_qset_coalesce(qs, qsp);
3167		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3168		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3169		mtx_unlock(lock);
3170	}
3171
3172	return (0);
3173}
3174
3175
3176void
3177t3_add_attach_sysctls(adapter_t *sc)
3178{
3179	struct sysctl_ctx_list *ctx;
3180	struct sysctl_oid_list *children;
3181
3182	ctx = device_get_sysctl_ctx(sc->dev);
3183	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3184
3185	/* random information */
3186	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3187	    "firmware_version",
3188	    CTLFLAG_RD, &sc->fw_version,
3189	    0, "firmware version");
3190
3191	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3192	    "enable_lro",
3193	    CTLTYPE_INT|CTLFLAG_RW, sc,
3194	    0, t3_lro_enable,
3195	    "I", "enable large receive offload");
3196
3197	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3198	    "enable_debug",
3199	    CTLFLAG_RW, &cxgb_debug,
3200	    0, "enable verbose debugging output");
3201	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tunq_coalesce",
3202	    CTLFLAG_RD, &sc->tunq_coalesce,
3203	    "#tunneled packets freed");
3204	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3205	    "txq_overrun",
3206	    CTLFLAG_RD, &txq_fills,
3207	    0, "#times txq overrun");
3208	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3209	    "bogus_imm",
3210	    CTLFLAG_RD, &bogus_imm,
3211	    0, "#times a bogus immediate response was seen");
3212	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3213	    "cache_alloc",
3214	    CTLFLAG_RD, &cxgb_cached_allocations,
3215	    0, "#times a cluster was allocated from cache");
3216	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3217	    "cached",
3218	    CTLFLAG_RD, &cxgb_cached,
3219	    0, "#times a cluster was cached");
3220	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3221	    "ext_freed",
3222	    CTLFLAG_RD, &cxgb_ext_freed,
3223	    0, "#times a cluster was freed through ext_free");
3224	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3225	    "mbufs_outstanding",
3226	    CTLFLAG_RD, &mbufs_outstanding,
3227	    0, "#mbufs in flight in the driver");
3228}
3229
3230
3231static const char *rspq_name = "rspq";
3232static const char *txq_names[] =
3233{
3234	"txq_eth",
3235	"txq_ofld",
3236	"txq_ctrl"
3237};
3238
3239void
3240t3_add_configured_sysctls(adapter_t *sc)
3241{
3242	struct sysctl_ctx_list *ctx;
3243	struct sysctl_oid_list *children;
3244	int i, j;
3245
3246	ctx = device_get_sysctl_ctx(sc->dev);
3247	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3248
3249	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3250	    "intr_coal",
3251	    CTLTYPE_INT|CTLFLAG_RW, sc,
3252	    0, t3_set_coalesce_nsecs,
3253	    "I", "interrupt coalescing timer (ns)");
3254
3255	for (i = 0; i < sc->params.nports; i++) {
3256		struct port_info *pi = &sc->port[i];
3257		struct sysctl_oid *poid;
3258		struct sysctl_oid_list *poidlist;
3259
3260		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3261		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3262		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3263		poidlist = SYSCTL_CHILDREN(poid);
3264		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3265		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3266		    0, "#queue sets");
3267
3268		for (j = 0; j < pi->nqsets; j++) {
3269			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3270			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid;
3271			struct sysctl_oid_list *qspoidlist, *rspqpoidlist, *txqpoidlist;
3272			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3273
3274			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3275
3276			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3277			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3278			qspoidlist = SYSCTL_CHILDREN(qspoid);
3279
3280			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3281			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3282			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3283
3284			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3285			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3286			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3287
3288
3289
3290			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3291			    CTLFLAG_RD, &qs->rspq.size,
3292			    0, "#entries in response queue");
3293			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3294			    CTLFLAG_RD, &qs->rspq.cidx,
3295			    0, "consumer index");
3296			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3297			    CTLFLAG_RD, &qs->rspq.credits,
3298			    0, "#credits");
3299			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3300			    CTLFLAG_RD, &qs->rspq.phys_addr,
3301	    "physical address of the queue");
3302			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3303			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3304			    0, "start rspq dump entry");
3305			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3306			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3307			    0, "#rspq entries to dump");
3308			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3309			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3310			    0, t3_dump_rspq, "A", "dump of the response queue");
3311
3312
3313
3314			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3315			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3316			    0, "#tunneled packets dropped");
3317			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3318			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3319			    0, "#tunneled packets waiting to be sent");
3320			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3321			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3322			    0, "#tunneled packets queue producer index");
3323			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3324			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3325			    0, "#tunneled packets queue consumer index");
3326			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3327			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3328			    0, "#tunneled packets processed by the card");
3329			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3330			    CTLFLAG_RD, &txq->cleaned,
3331			    0, "#tunneled packets cleaned");
3332			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3333			    CTLFLAG_RD, &txq->in_use,
3334			    0, "#tunneled packet slots in use");
3335			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3336			    CTLFLAG_RD, &txq->txq_frees,
3337			    "#tunneled packets freed");
3338			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3339			    CTLFLAG_RD, &txq->txq_skipped,
3340			    0, "#tunneled packet descriptors skipped");
3341			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "coalesced",
3342			    CTLFLAG_RD, &txq->txq_coalesced,
3343			    0, "#tunneled packets coalesced");
3344			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3345			    CTLFLAG_RD, &txq->txq_enqueued,
3346			    0, "#tunneled packets enqueued to hardware");
3347			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3348			    CTLFLAG_RD, &qs->txq_stopped,
3349			    0, "tx queues stopped");
3350			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3351			    CTLFLAG_RD, &txq->phys_addr,
3352	    "physical address of the queue");
3353			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3354			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3355			    0, "txq generation");
3356			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3357			    CTLFLAG_RD, &txq->cidx,
3358			    0, "hardware queue cidx");
3359			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3360			    CTLFLAG_RD, &txq->pidx,
3361			    0, "hardware queue pidx");
3362			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3363			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3364			    0, "txq start idx for dump");
3365			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3366			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3367			    0, "txq #entries to dump");
3368			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3369			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3370			    0, t3_dump_txq, "A", "dump of the transmit queue");
3371		}
3372	}
3373}
3374
3375/**
3376 *	t3_get_desc - dump an SGE descriptor for debugging purposes
3377 *	@qs: the queue set
3378 *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3379 *	@idx: the descriptor index in the queue
3380 *	@data: where to dump the descriptor contents
3381 *
3382 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3383 *	size of the descriptor.
3384 */
3385int
3386t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3387		unsigned char *data)
3388{
3389	if (qnum >= 6)
3390		return (EINVAL);
3391
3392	if (qnum < 3) {
3393		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3394			return (EINVAL);
3395		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3396		return sizeof(struct tx_desc);
3397	}
3398
3399	if (qnum == 3) {
3400		if (!qs->rspq.desc || idx >= qs->rspq.size)
3401			return (EINVAL);
3402		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3403		return sizeof(struct rsp_desc);
3404	}
3405
3406	qnum -= 4;
3407	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3408		return (EINVAL);
3409	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3410	return sizeof(struct rx_desc);
3411}
3412