1/*
2 * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses.  You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 *     Redistribution and use in source and binary forms, with or
11 *     without modification, are permitted provided that the following
12 *     conditions are met:
13 *
14 *      - Redistributions of source code must retain the above
15 *        copyright notice, this list of conditions and the following
16 *        disclaimer.
17 *
18 *      - Redistributions in binary form must reproduce the above
19 *        copyright notice, this list of conditions and the following
20 *        disclaimer in the documentation and/or other materials
21 *        provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32#include <linux/skbuff.h>
33#include <linux/netdevice.h>
34#include <linux/etherdevice.h>
35#include <linux/if_vlan.h>
36#include <linux/ip.h>
37#include <linux/tcp.h>
38#include <linux/dma-mapping.h>
39#include <linux/slab.h>
40#include <linux/prefetch.h>
41#include <net/arp.h>
42#include "common.h"
43#include "regs.h"
44#include "sge_defs.h"
45#include "t3_cpl.h"
46#include "firmware_exports.h"
47#include "cxgb3_offload.h"
48
49#define USE_GTS 0
50
51#define SGE_RX_SM_BUF_SIZE 1536
52
53#define SGE_RX_COPY_THRES  256
54#define SGE_RX_PULL_LEN    128
55
56#define SGE_PG_RSVD SMP_CACHE_BYTES
57/*
58 * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks.
59 * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
60 * directly.
61 */
62#define FL0_PG_CHUNK_SIZE  2048
63#define FL0_PG_ORDER 0
64#define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER)
65#define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)
66#define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
67#define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)
68
69#define SGE_RX_DROP_THRES 16
70#define RX_RECLAIM_PERIOD (HZ/4)
71
72/*
73 * Max number of Rx buffers we replenish at a time.
74 */
75#define MAX_RX_REFILL 16U
76/*
77 * Period of the Tx buffer reclaim timer.  This timer does not need to run
78 * frequently as Tx buffers are usually reclaimed by new Tx packets.
79 */
80#define TX_RECLAIM_PERIOD (HZ / 4)
81#define TX_RECLAIM_TIMER_CHUNK 64U
82#define TX_RECLAIM_CHUNK 16U
83
84/* WR size in bytes */
85#define WR_LEN (WR_FLITS * 8)
86
87/*
88 * Types of Tx queues in each queue set.  Order here matters, do not change.
89 */
90enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
91
92/* Values for sge_txq.flags */
93enum {
94	TXQ_RUNNING = 1 << 0,	/* fetch engine is running */
95	TXQ_LAST_PKT_DB = 1 << 1,	/* last packet rang the doorbell */
96};
97
98struct tx_desc {
99	__be64 flit[TX_DESC_FLITS];
100};
101
102struct rx_desc {
103	__be32 addr_lo;
104	__be32 len_gen;
105	__be32 gen2;
106	__be32 addr_hi;
107};
108
109struct tx_sw_desc {		/* SW state per Tx descriptor */
110	struct sk_buff *skb;
111	u8 eop;       /* set if last descriptor for packet */
112	u8 addr_idx;  /* buffer index of first SGL entry in descriptor */
113	u8 fragidx;   /* first page fragment associated with descriptor */
114	s8 sflit;     /* start flit of first SGL entry in descriptor */
115};
116
117struct rx_sw_desc {                /* SW state per Rx descriptor */
118	union {
119		struct sk_buff *skb;
120		struct fl_pg_chunk pg_chunk;
121	};
122	DEFINE_DMA_UNMAP_ADDR(dma_addr);
123};
124
125struct rsp_desc {		/* response queue descriptor */
126	struct rss_header rss_hdr;
127	__be32 flags;
128	__be32 len_cq;
129	struct_group(immediate,
130		u8 imm_data[47];
131		u8 intr_gen;
132	);
133};
134
135/*
136 * Holds unmapping information for Tx packets that need deferred unmapping.
137 * This structure lives at skb->head and must be allocated by callers.
138 */
139struct deferred_unmap_info {
140	struct pci_dev *pdev;
141	dma_addr_t addr[MAX_SKB_FRAGS + 1];
142};
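
/*
 * Note (descriptive): write_ofld_wr() below overlays this structure on
 * skb->head via setup_deferred_unmapping() and points skb->destructor at
 * deferred_unmap_destructor(), so the DMA mappings are released only when
 * the skb itself is finally freed.
 */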
143
144/*
145 * Maps a number of flits to the number of Tx descriptors that can hold them.
146 * The formula is
147 *
148 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
149 *
150 * HW allows up to 4 descriptors to be combined into a WR.
151 */
152static u8 flit_desc_map[] = {
153	0,
154#if SGE_NUM_GENBITS == 1
155	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
157	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
158	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
159#elif SGE_NUM_GENBITS == 2
160	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
162	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
163	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
164#else
165# error "SGE_NUM_GENBITS must be 1 or 2"
166#endif
167};
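
/*
 * Worked example (illustrative): the SGE_NUM_GENBITS == 1 table above implies
 * WR_FLITS == 16, so a 17-flit work request needs
 * 1 + (17 - 2) / (16 - 1) = 2 descriptors, matching flit_desc_map[17] == 2.
 */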
168
169static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q)
170{
171	return container_of(q, struct sge_qset, rspq);
172}
173
174static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx)
175{
176	return container_of(q, struct sge_qset, txq[qidx]);
177}
178
179/**
180 *	refill_rspq - replenish an SGE response queue
181 *	@adapter: the adapter
182 *	@q: the response queue to replenish
183 *	@credits: how many new responses to make available
184 *
185 *	Replenishes a response queue by making the supplied number of responses
186 *	available to HW.
187 */
188static inline void refill_rspq(struct adapter *adapter,
189			       const struct sge_rspq *q, unsigned int credits)
190{
191	rmb();
192	t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN,
193		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
194}
195
196/**
197 *	need_skb_unmap - does the platform need unmapping of sk_buffs?
198 *
199 *	Returns true if the platform needs sk_buff unmapping.  The compiler
200 *	optimizes away the unnecessary unmapping code when this returns 0.
201 */
202static inline int need_skb_unmap(void)
203{
204#ifdef CONFIG_NEED_DMA_MAP_STATE
205	return 1;
206#else
207	return 0;
208#endif
209}
210
211/**
212 *	unmap_skb - unmap a packet main body and its page fragments
213 *	@skb: the packet
214 *	@q: the Tx queue containing Tx descriptors for the packet
215 *	@cidx: index of Tx descriptor
216 *	@pdev: the PCI device
217 *
218 *	Unmap the main body of an sk_buff and its page fragments, if any.
219 *	Because of the fairly complicated structure of our SGLs and the desire
220 *	to conserve space for metadata, the information necessary to unmap an
221 *	sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx
222 *	descriptors (the physical addresses of the various data buffers), and
223 *	the SW descriptor state (assorted indices).  The send functions
224 *	initialize the indices for the first packet descriptor so we can unmap
225 *	the buffers held in the first Tx descriptor here, and we have enough
226 *	information at this point to set the state for the next Tx descriptor.
227 *
228 *	Note that it is possible to clean up the first descriptor of a packet
229 *	before the send routines have written the next descriptors, but this
230 *	race does not cause any problem.  We just end up writing the unmapping
231 *	info for the descriptor first.
232 */
233static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q,
234			     unsigned int cidx, struct pci_dev *pdev)
235{
236	const struct sg_ent *sgp;
237	struct tx_sw_desc *d = &q->sdesc[cidx];
238	int nfrags, frag_idx, curflit, j = d->addr_idx;
239
240	sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit];
241	frag_idx = d->fragidx;
242
243	if (frag_idx == 0 && skb_headlen(skb)) {
244		dma_unmap_single(&pdev->dev, be64_to_cpu(sgp->addr[0]),
245				 skb_headlen(skb), DMA_TO_DEVICE);
246		j = 1;
247	}
248
249	curflit = d->sflit + 1 + j;
250	nfrags = skb_shinfo(skb)->nr_frags;
251
252	while (frag_idx < nfrags && curflit < WR_FLITS) {
253		dma_unmap_page(&pdev->dev, be64_to_cpu(sgp->addr[j]),
254			       skb_frag_size(&skb_shinfo(skb)->frags[frag_idx]),
255			       DMA_TO_DEVICE);
256		j ^= 1;
257		if (j == 0) {
258			sgp++;
259			curflit++;
260		}
261		curflit++;
262		frag_idx++;
263	}
264
265	if (frag_idx < nfrags) {   /* SGL continues into next Tx descriptor */
266		d = cidx + 1 == q->size ? q->sdesc : d + 1;
267		d->fragidx = frag_idx;
268		d->addr_idx = j;
269		d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */
270	}
271}
272
273/**
274 *	free_tx_desc - reclaims Tx descriptors and their buffers
275 *	@adapter: the adapter
276 *	@q: the Tx queue to reclaim descriptors from
277 *	@n: the number of descriptors to reclaim
278 *
279 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
280 *	Tx buffers.  Called with the Tx queue lock held.
281 */
282static void free_tx_desc(struct adapter *adapter, struct sge_txq *q,
283			 unsigned int n)
284{
285	struct tx_sw_desc *d;
286	struct pci_dev *pdev = adapter->pdev;
287	unsigned int cidx = q->cidx;
288
289	const int need_unmap = need_skb_unmap() &&
290			       q->cntxt_id >= FW_TUNNEL_SGEEC_START;
291
292	d = &q->sdesc[cidx];
293	while (n--) {
294		if (d->skb) {	/* an SGL is present */
295			if (need_unmap)
296				unmap_skb(d->skb, q, cidx, pdev);
297			if (d->eop) {
298				dev_consume_skb_any(d->skb);
299				d->skb = NULL;
300			}
301		}
302		++d;
303		if (++cidx == q->size) {
304			cidx = 0;
305			d = q->sdesc;
306		}
307	}
308	q->cidx = cidx;
309}
310
311/**
312 *	reclaim_completed_tx - reclaims completed Tx descriptors
313 *	@adapter: the adapter
314 *	@q: the Tx queue to reclaim completed descriptors from
315 *	@chunk: maximum number of descriptors to reclaim
316 *
317 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
318 *	and frees the associated buffers if possible.  Called with the Tx
319 *	queue's lock held.
320 */
321static inline unsigned int reclaim_completed_tx(struct adapter *adapter,
322						struct sge_txq *q,
323						unsigned int chunk)
324{
325	unsigned int reclaim = q->processed - q->cleaned;
326
327	reclaim = min(chunk, reclaim);
328	if (reclaim) {
329		free_tx_desc(adapter, q, reclaim);
330		q->cleaned += reclaim;
331		q->in_use -= reclaim;
332	}
333	return q->processed - q->cleaned;
334}
335
336/**
337 *	should_restart_tx - are there enough resources to restart a Tx queue?
338 *	@q: the Tx queue
339 *
340 *	Checks if there are enough descriptors to restart a suspended Tx queue.
341 */
342static inline int should_restart_tx(const struct sge_txq *q)
343{
344	unsigned int r = q->processed - q->cleaned;
345
346	return q->in_use - r < (q->size >> 1);
347}
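
/*
 * Example (illustrative): for a 512-descriptor queue with 300 descriptors in
 * use and 60 already processed by the SGE but not yet cleaned, 300 - 60 = 240
 * is below the half-size threshold of 256, so the queue may be restarted.
 */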
348
349static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q,
350			  struct rx_sw_desc *d)
351{
352	if (q->use_pages && d->pg_chunk.page) {
353		(*d->pg_chunk.p_cnt)--;
354		if (!*d->pg_chunk.p_cnt)
355			dma_unmap_page(&pdev->dev, d->pg_chunk.mapping,
356				       q->alloc_size, DMA_FROM_DEVICE);
357
358		put_page(d->pg_chunk.page);
359		d->pg_chunk.page = NULL;
360	} else {
361		dma_unmap_single(&pdev->dev, dma_unmap_addr(d, dma_addr),
362				 q->buf_size, DMA_FROM_DEVICE);
363		kfree_skb(d->skb);
364		d->skb = NULL;
365	}
366}
367
368/**
369 *	free_rx_bufs - free the Rx buffers on an SGE free list
370 *	@pdev: the PCI device associated with the adapter
371 *	@q: the SGE free list to clean up
372 *
373 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
374 *	this queue should be stopped before calling this function.
375 */
376static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q)
377{
378	unsigned int cidx = q->cidx;
379
380	while (q->credits--) {
381		struct rx_sw_desc *d = &q->sdesc[cidx];
382
384		clear_rx_desc(pdev, q, d);
385		if (++cidx == q->size)
386			cidx = 0;
387	}
388
389	if (q->pg_chunk.page) {
390		__free_pages(q->pg_chunk.page, q->order);
391		q->pg_chunk.page = NULL;
392	}
393}
394
395/**
396 *	add_one_rx_buf - add a packet buffer to a free-buffer list
397 *	@va:  buffer start VA
398 *	@len: the buffer length
399 *	@d: the HW Rx descriptor to write
400 *	@sd: the SW Rx descriptor to write
401 *	@gen: the generation bit value
402 *	@pdev: the PCI device associated with the adapter
403 *
404 *	Add a buffer of the given length to the supplied HW and SW Rx
405 *	descriptors.
406 */
407static inline int add_one_rx_buf(void *va, unsigned int len,
408				 struct rx_desc *d, struct rx_sw_desc *sd,
409				 unsigned int gen, struct pci_dev *pdev)
410{
411	dma_addr_t mapping;
412
413	mapping = dma_map_single(&pdev->dev, va, len, DMA_FROM_DEVICE);
414	if (unlikely(dma_mapping_error(&pdev->dev, mapping)))
415		return -ENOMEM;
416
417	dma_unmap_addr_set(sd, dma_addr, mapping);
418
419	d->addr_lo = cpu_to_be32(mapping);
420	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
421	dma_wmb();
422	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
423	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
424	return 0;
425}
426
427static inline int add_one_rx_chunk(dma_addr_t mapping, struct rx_desc *d,
428				   unsigned int gen)
429{
430	d->addr_lo = cpu_to_be32(mapping);
431	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
432	dma_wmb();
433	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
434	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
435	return 0;
436}
437
438static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
439			  struct rx_sw_desc *sd, gfp_t gfp,
440			  unsigned int order)
441{
442	if (!q->pg_chunk.page) {
443		dma_addr_t mapping;
444
445		q->pg_chunk.page = alloc_pages(gfp, order);
446		if (unlikely(!q->pg_chunk.page))
447			return -ENOMEM;
448		q->pg_chunk.va = page_address(q->pg_chunk.page);
449		q->pg_chunk.p_cnt = q->pg_chunk.va + (PAGE_SIZE << order) -
450				    SGE_PG_RSVD;
451		q->pg_chunk.offset = 0;
452		mapping = dma_map_page(&adapter->pdev->dev, q->pg_chunk.page,
453				       0, q->alloc_size, DMA_FROM_DEVICE);
454		if (unlikely(dma_mapping_error(&adapter->pdev->dev, mapping))) {
455			__free_pages(q->pg_chunk.page, order);
456			q->pg_chunk.page = NULL;
457			return -EIO;
458		}
459		q->pg_chunk.mapping = mapping;
460	}
461	sd->pg_chunk = q->pg_chunk;
462
463	prefetch(sd->pg_chunk.p_cnt);
464
465	q->pg_chunk.offset += q->buf_size;
466	if (q->pg_chunk.offset == (PAGE_SIZE << order))
467		q->pg_chunk.page = NULL;
468	else {
469		q->pg_chunk.va += q->buf_size;
470		get_page(q->pg_chunk.page);
471	}
472
473	if (sd->pg_chunk.offset == 0)
474		*sd->pg_chunk.p_cnt = 1;
475	else
476		*sd->pg_chunk.p_cnt += 1;
477
478	return 0;
479}
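
/*
 * Note (descriptive): the chunk reference count (p_cnt) lives in the last
 * SGE_PG_RSVD bytes of the page itself, and the page is DMA-unmapped only
 * when the count of outstanding chunks drops to zero, see clear_rx_desc()
 * and get_packet_pg().
 */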
480
481static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
482{
483	if (q->pend_cred >= q->credits / 4) {
484		q->pend_cred = 0;
485		wmb();
486		t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
487	}
488}
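
/*
 * Example (illustrative): with roughly 256 buffers on the free list, the
 * doorbell is rung only once at least 64 new buffers have been added,
 * batching credit returns rather than ringing per buffer.
 */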
489
490/**
491 *	refill_fl - refill an SGE free-buffer list
492 *	@adap: the adapter
493 *	@q: the free-list to refill
494 *	@n: the number of new buffers to allocate
495 *	@gfp: the gfp flags for allocating new buffers
496 *
497 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers,
498 *	allocated with the supplied gfp flags.  The caller must ensure that
499 *	@n does not exceed the queue's capacity.
500 */
501static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
502{
503	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
504	struct rx_desc *d = &q->desc[q->pidx];
505	unsigned int count = 0;
506
507	while (n--) {
508		dma_addr_t mapping;
509		int err;
510
511		if (q->use_pages) {
512			if (unlikely(alloc_pg_chunk(adap, q, sd, gfp,
513						    q->order))) {
514nomem:				q->alloc_failed++;
515				break;
516			}
517			mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset;
518			dma_unmap_addr_set(sd, dma_addr, mapping);
519
520			add_one_rx_chunk(mapping, d, q->gen);
521			dma_sync_single_for_device(&adap->pdev->dev, mapping,
522						   q->buf_size - SGE_PG_RSVD,
523						   DMA_FROM_DEVICE);
524		} else {
525			void *buf_start;
526
527			struct sk_buff *skb = alloc_skb(q->buf_size, gfp);
528			if (!skb)
529				goto nomem;
530
531			sd->skb = skb;
532			buf_start = skb->data;
533			err = add_one_rx_buf(buf_start, q->buf_size, d, sd,
534					     q->gen, adap->pdev);
535			if (unlikely(err)) {
536				clear_rx_desc(adap->pdev, q, sd);
537				break;
538			}
539		}
540
541		d++;
542		sd++;
543		if (++q->pidx == q->size) {
544			q->pidx = 0;
545			q->gen ^= 1;
546			sd = q->sdesc;
547			d = q->desc;
548		}
549		count++;
550	}
551
552	q->credits += count;
553	q->pend_cred += count;
554	ring_fl_db(adap, q);
555
556	return count;
557}
558
559static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
560{
561	refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits),
562		  GFP_ATOMIC | __GFP_COMP);
563}
564
565/**
566 *	recycle_rx_buf - recycle a receive buffer
567 *	@adap: the adapter
568 *	@q: the SGE free list
569 *	@idx: index of buffer to recycle
570 *
571 *	Recycles the specified buffer on the given free list by adding it at
572 *	the next available slot on the list.
573 */
574static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q,
575			   unsigned int idx)
576{
577	struct rx_desc *from = &q->desc[idx];
578	struct rx_desc *to = &q->desc[q->pidx];
579
580	q->sdesc[q->pidx] = q->sdesc[idx];
581	to->addr_lo = from->addr_lo;	/* already big endian */
582	to->addr_hi = from->addr_hi;	/* likewise */
583	dma_wmb();
584	to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen));
585	to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen));
586
587	if (++q->pidx == q->size) {
588		q->pidx = 0;
589		q->gen ^= 1;
590	}
591
592	q->credits++;
593	q->pend_cred++;
594	ring_fl_db(adap, q);
595}
596
597/**
598 *	alloc_ring - allocate resources for an SGE descriptor ring
599 *	@pdev: the PCI device
600 *	@nelem: the number of descriptors
601 *	@elem_size: the size of each descriptor
602 *	@sw_size: the size of the SW state associated with each ring element
603 *	@phys: the physical address of the allocated ring
604 *	@metadata: address of the array holding the SW state for the ring
605 *
606 *	Allocates resources for an SGE descriptor ring, such as Tx queues,
607 *	free buffer lists, or response queues.  Each SGE ring requires
608 *	space for its HW descriptors plus, optionally, space for the SW state
609 *	associated with each HW entry (the metadata).  The function returns
610 *	three values: the virtual address for the HW ring (the return value
611 *	of the function), the physical address of the HW ring, and the address
612 *	of the SW ring.
613 */
614static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
615			size_t sw_size, dma_addr_t * phys, void *metadata)
616{
617	size_t len = nelem * elem_size;
618	void *s = NULL;
619	void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
620
621	if (!p)
622		return NULL;
623	if (sw_size && metadata) {
624		s = kcalloc(nelem, sw_size, GFP_KERNEL);
625
626		if (!s) {
627			dma_free_coherent(&pdev->dev, len, p, *phys);
628			return NULL;
629		}
630		*(void **)metadata = s;
631	}
632	return p;
633}
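
/*
 * Usage sketch (hedged; the actual call sites live in the queue-set setup
 * code outside this excerpt): allocating a free-list ring together with its
 * SW state might look like
 *
 *	fl->desc = alloc_ring(adap->pdev, fl->size, sizeof(struct rx_desc),
 *			      sizeof(struct rx_sw_desc), &fl->phys_addr,
 *			      &fl->sdesc);
 *
 * A caller that needs no per-descriptor SW state can pass sw_size == 0 and a
 * NULL @metadata, in which case only the HW ring is allocated.
 */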
634
635/**
636 *	t3_reset_qset - reset an SGE qset
637 *	@q: the queue set
638 *
639 *	Reset the qset structure.  The NAPI structure is preserved in
640 *	the event of the qset's reincarnation, for example during EEH
641 *	recovery.
642 */
643static void t3_reset_qset(struct sge_qset *q)
644{
645	if (q->adap &&
646	    !(q->adap->flags & NAPI_INIT)) {
647		memset(q, 0, sizeof(*q));
648		return;
649	}
650
651	q->adap = NULL;
652	memset(&q->rspq, 0, sizeof(q->rspq));
653	memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET);
654	memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
655	q->txq_stopped = 0;
656	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
657	q->rx_reclaim_timer.function = NULL;
658	q->nomem = 0;
659	napi_free_frags(&q->napi);
660}
661
662
663/**
664 *	t3_free_qset - free the resources of an SGE queue set
665 *	@adapter: the adapter owning the queue set
666 *	@q: the queue set
667 *
668 *	Release the HW and SW resources associated with an SGE queue set, such
669 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
670 *	queue set must be quiesced prior to calling this.
671 */
672static void t3_free_qset(struct adapter *adapter, struct sge_qset *q)
673{
674	int i;
675	struct pci_dev *pdev = adapter->pdev;
676
677	for (i = 0; i < SGE_RXQ_PER_SET; ++i)
678		if (q->fl[i].desc) {
679			spin_lock_irq(&adapter->sge.reg_lock);
680			t3_sge_disable_fl(adapter, q->fl[i].cntxt_id);
681			spin_unlock_irq(&adapter->sge.reg_lock);
682			free_rx_bufs(pdev, &q->fl[i]);
683			kfree(q->fl[i].sdesc);
684			dma_free_coherent(&pdev->dev,
685					  q->fl[i].size *
686					  sizeof(struct rx_desc), q->fl[i].desc,
687					  q->fl[i].phys_addr);
688		}
689
690	for (i = 0; i < SGE_TXQ_PER_SET; ++i)
691		if (q->txq[i].desc) {
692			spin_lock_irq(&adapter->sge.reg_lock);
693			t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0);
694			spin_unlock_irq(&adapter->sge.reg_lock);
695			if (q->txq[i].sdesc) {
696				free_tx_desc(adapter, &q->txq[i],
697					     q->txq[i].in_use);
698				kfree(q->txq[i].sdesc);
699			}
700			dma_free_coherent(&pdev->dev,
701					  q->txq[i].size *
702					  sizeof(struct tx_desc),
703					  q->txq[i].desc, q->txq[i].phys_addr);
704			__skb_queue_purge(&q->txq[i].sendq);
705		}
706
707	if (q->rspq.desc) {
708		spin_lock_irq(&adapter->sge.reg_lock);
709		t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id);
710		spin_unlock_irq(&adapter->sge.reg_lock);
711		dma_free_coherent(&pdev->dev,
712				  q->rspq.size * sizeof(struct rsp_desc),
713				  q->rspq.desc, q->rspq.phys_addr);
714	}
715
716	t3_reset_qset(q);
717}
718
719/**
720 *	init_qset_cntxt - initialize an SGE queue set context info
721 *	@qs: the queue set
722 *	@id: the queue set id
723 *
724 *	Initializes the TIDs and context ids for the queues of a queue set.
725 */
726static void init_qset_cntxt(struct sge_qset *qs, unsigned int id)
727{
728	qs->rspq.cntxt_id = id;
729	qs->fl[0].cntxt_id = 2 * id;
730	qs->fl[1].cntxt_id = 2 * id + 1;
731	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
732	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
733	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
734	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
735	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
736}
737
738/**
739 *	sgl_len - calculates the size of an SGL of the given capacity
740 *	@n: the number of SGL entries
741 *
742 *	Calculates the number of flits needed for a scatter/gather list that
743 *	can hold the given number of entries.
744 */
745static inline unsigned int sgl_len(unsigned int n)
746{
747	/* alternatively: 3 * (n / 2) + 2 * (n & 1) */
748	return (3 * n) / 2 + (n & 1);
749}
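
/*
 * Example (illustrative): each struct sg_ent holds two 64-bit addresses and
 * two 32-bit lengths, i.e. 3 flits per pair of SGL entries, so a 3-entry SGL
 * needs sgl_len(3) = (3 * 3) / 2 + 1 = 5 flits.
 */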
750
751/**
752 *	flits_to_desc - returns the num of Tx descriptors for the given flits
753 *	@n: the number of flits
754 *
755 *	Calculates the number of Tx descriptors needed for the supplied number
756 *	of flits.
757 */
758static inline unsigned int flits_to_desc(unsigned int n)
759{
760	BUG_ON(n >= ARRAY_SIZE(flit_desc_map));
761	return flit_desc_map[n];
762}
763
764/**
765 *	get_packet - return the next ingress packet buffer from a free list
766 *	@adap: the adapter that received the packet
767 *	@fl: the SGE free list holding the packet
768 *	@len: the packet length including any SGE padding
769 *	@drop_thres: # of remaining buffers before we start dropping packets
770 *
771 *	Get the next packet from a free list and complete setup of the
772 *	sk_buff.  If the packet is small we make a copy and recycle the
773 *	original buffer, otherwise we use the original buffer itself.  If a
774 *	positive drop threshold is supplied packets are dropped and their
775 *	buffers recycled if (a) the number of remaining buffers is under the
776 *	threshold and the packet is too big to copy, or (b) the packet should
777 *	be copied but there is no memory for the copy.
778 */
779static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl,
780				  unsigned int len, unsigned int drop_thres)
781{
782	struct sk_buff *skb = NULL;
783	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
784
785	prefetch(sd->skb->data);
786	fl->credits--;
787
788	if (len <= SGE_RX_COPY_THRES) {
789		skb = alloc_skb(len, GFP_ATOMIC);
790		if (likely(skb != NULL)) {
791			__skb_put(skb, len);
792			dma_sync_single_for_cpu(&adap->pdev->dev,
793						dma_unmap_addr(sd, dma_addr),
794						len, DMA_FROM_DEVICE);
795			memcpy(skb->data, sd->skb->data, len);
796			dma_sync_single_for_device(&adap->pdev->dev,
797						   dma_unmap_addr(sd, dma_addr),
798						   len, DMA_FROM_DEVICE);
799		} else if (!drop_thres)
800			goto use_orig_buf;
801recycle:
802		recycle_rx_buf(adap, fl, fl->cidx);
803		return skb;
804	}
805
806	if (unlikely(fl->credits < drop_thres) &&
807	    refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits - 1),
808		      GFP_ATOMIC | __GFP_COMP) == 0)
809		goto recycle;
810
811use_orig_buf:
812	dma_unmap_single(&adap->pdev->dev, dma_unmap_addr(sd, dma_addr),
813			 fl->buf_size, DMA_FROM_DEVICE);
814	skb = sd->skb;
815	skb_put(skb, len);
816	__refill_fl(adap, fl);
817	return skb;
818}
819
820/**
821 *	get_packet_pg - return the next ingress packet buffer from a free list
822 *	@adap: the adapter that received the packet
823 *	@fl: the SGE free list holding the packet
824 *	@q: the queue
825 *	@len: the packet length including any SGE padding
826 *	@drop_thres: # of remaining buffers before we start dropping packets
827 *
828 *	Get the next packet from a free list populated with page chunks.
829 *	If the packet is small we make a copy and recycle the original buffer,
830 *	otherwise we attach the original buffer as a page fragment to a fresh
831 *	sk_buff.  If a positive drop threshold is supplied packets are dropped
832 *	and their buffers recycled if (a) the number of remaining buffers is
833 *	under the threshold and the packet is too big to copy, or (b) there's
834 *	no system memory.
835 *
836 * 	Note: this function is similar to @get_packet but deals with Rx buffers
837 * 	that are page chunks rather than sk_buffs.
838 */
839static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl,
840				     struct sge_rspq *q, unsigned int len,
841				     unsigned int drop_thres)
842{
843	struct sk_buff *newskb, *skb;
844	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
845
846	dma_addr_t dma_addr = dma_unmap_addr(sd, dma_addr);
847
848	newskb = skb = q->pg_skb;
849	if (!skb && (len <= SGE_RX_COPY_THRES)) {
850		newskb = alloc_skb(len, GFP_ATOMIC);
851		if (likely(newskb != NULL)) {
852			__skb_put(newskb, len);
853			dma_sync_single_for_cpu(&adap->pdev->dev, dma_addr,
854						len, DMA_FROM_DEVICE);
855			memcpy(newskb->data, sd->pg_chunk.va, len);
856			dma_sync_single_for_device(&adap->pdev->dev, dma_addr,
857						   len, DMA_FROM_DEVICE);
858		} else if (!drop_thres)
859			return NULL;
860recycle:
861		fl->credits--;
862		recycle_rx_buf(adap, fl, fl->cidx);
863		q->rx_recycle_buf++;
864		return newskb;
865	}
866
867	if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres)))
868		goto recycle;
869
870	prefetch(sd->pg_chunk.p_cnt);
871
872	if (!skb)
873		newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC);
874
875	if (unlikely(!newskb)) {
876		if (!drop_thres)
877			return NULL;
878		goto recycle;
879	}
880
881	dma_sync_single_for_cpu(&adap->pdev->dev, dma_addr, len,
882				DMA_FROM_DEVICE);
883	(*sd->pg_chunk.p_cnt)--;
884	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
885		dma_unmap_page(&adap->pdev->dev, sd->pg_chunk.mapping,
886			       fl->alloc_size, DMA_FROM_DEVICE);
887	if (!skb) {
888		__skb_put(newskb, SGE_RX_PULL_LEN);
889		memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN);
890		skb_fill_page_desc(newskb, 0, sd->pg_chunk.page,
891				   sd->pg_chunk.offset + SGE_RX_PULL_LEN,
892				   len - SGE_RX_PULL_LEN);
893		newskb->len = len;
894		newskb->data_len = len - SGE_RX_PULL_LEN;
895		newskb->truesize += newskb->data_len;
896	} else {
897		skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags,
898				   sd->pg_chunk.page,
899				   sd->pg_chunk.offset, len);
900		newskb->len += len;
901		newskb->data_len += len;
902		newskb->truesize += len;
903	}
904
905	fl->credits--;
906	/*
907	 * We do not refill FLs here, we let the caller do it to overlap a
908	 * prefetch.
909	 */
910	return newskb;
911}
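
/*
 * Example (illustrative): on the non-copy path with no partially assembled
 * skb pending, a 1500-byte packet has its first SGE_RX_PULL_LEN (128) bytes
 * copied into the new skb's linear area and the remaining 1372 bytes attached
 * as a page fragment, so truesize only grows by the fragment portion.
 */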
912
913/**
914 *	get_imm_packet - return the next ingress packet buffer from a response
915 *	@resp: the response descriptor containing the packet data
916 *
917 *	Return a packet containing the immediate data of the given response.
918 */
919static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp)
920{
921	struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC);
922
923	if (skb) {
924		__skb_put(skb, IMMED_PKT_SIZE);
925		BUILD_BUG_ON(IMMED_PKT_SIZE != sizeof(resp->immediate));
926		skb_copy_to_linear_data(skb, &resp->immediate, IMMED_PKT_SIZE);
927	}
928	return skb;
929}
930
931/**
932 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
933 *	@skb: the packet
934 *
935 * 	Returns the number of Tx descriptors needed for the given Ethernet
936 * 	packet.  Ethernet packets require addition of WR and CPL headers.
937 */
938static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
939{
940	unsigned int flits;
941
942	if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt))
943		return 1;
944
945	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2;
946	if (skb_shinfo(skb)->gso_size)
947		flits++;
948	return flits_to_desc(flits);
949}
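
/*
 * Example (illustrative): a non-TSO packet too long for immediate data, with
 * a linear head and two page fragments, has nr_frags + 1 = 3 SGL entries, so
 * flits = sgl_len(3) + 2 = 7 and it fits in a single Tx descriptor
 * (flit_desc_map[7] == 1 for either generation-bit layout).
 */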
950
951/*	map_skb - map a packet main body and its page fragments
952 *	@pdev: the PCI device
953 *	@skb: the packet
954 *	@addr: placeholder to save the mapped addresses
955 *
956 *	map the main body of an sk_buff and its page fragments, if any.
957 */
958static int map_skb(struct pci_dev *pdev, const struct sk_buff *skb,
959		   dma_addr_t *addr)
960{
961	const skb_frag_t *fp, *end;
962	const struct skb_shared_info *si;
963
964	if (skb_headlen(skb)) {
965		*addr = dma_map_single(&pdev->dev, skb->data,
966				       skb_headlen(skb), DMA_TO_DEVICE);
967		if (dma_mapping_error(&pdev->dev, *addr))
968			goto out_err;
969		addr++;
970	}
971
972	si = skb_shinfo(skb);
973	end = &si->frags[si->nr_frags];
974
975	for (fp = si->frags; fp < end; fp++) {
976		*addr = skb_frag_dma_map(&pdev->dev, fp, 0, skb_frag_size(fp),
977					 DMA_TO_DEVICE);
978		if (dma_mapping_error(&pdev->dev, *addr))
979			goto unwind;
980		addr++;
981	}
982	return 0;
983
984unwind:
985	while (fp-- > si->frags)
986		dma_unmap_page(&pdev->dev, *--addr, skb_frag_size(fp),
987			       DMA_TO_DEVICE);
988
989	dma_unmap_single(&pdev->dev, addr[-1], skb_headlen(skb),
990			 DMA_TO_DEVICE);
991out_err:
992	return -ENOMEM;
993}
994
995/**
996 *	write_sgl - populate a scatter/gather list for a packet
997 *	@skb: the packet
998 *	@sgp: the SGL to populate
999 *	@start: start address of skb main body data to include in the SGL
1000 *	@len: length of skb main body data to include in the SGL
1001 *	@addr: the list of the mapped addresses
1002 *
1003 *	Copies the scatter/gather list for the buffers that make up a packet
1004 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1005 *	appropriately.
1006 */
1007static inline unsigned int write_sgl(const struct sk_buff *skb,
1008				     struct sg_ent *sgp, unsigned char *start,
1009				     unsigned int len, const dma_addr_t *addr)
1010{
1011	unsigned int i, j = 0, k = 0, nfrags;
1012
1013	if (len) {
1014		sgp->len[0] = cpu_to_be32(len);
1015		sgp->addr[j++] = cpu_to_be64(addr[k++]);
1016	}
1017
1018	nfrags = skb_shinfo(skb)->nr_frags;
1019	for (i = 0; i < nfrags; i++) {
1020		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1021
1022		sgp->len[j] = cpu_to_be32(skb_frag_size(frag));
1023		sgp->addr[j] = cpu_to_be64(addr[k++]);
1024		j ^= 1;
1025		if (j == 0)
1026			++sgp;
1027	}
1028	if (j)
1029		sgp->len[j] = 0;
1030	return ((nfrags + (len != 0)) * 3) / 2 + j;
1031}
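
/*
 * Example (illustrative): a packet with a non-empty linear head and two page
 * fragments contributes 3 SGL entries, so the function returns
 * ((2 + 1) * 3) / 2 + 1 = 5 flits, in line with sgl_len(3) above.
 */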
1032
1033/**
1034 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1035 *	@adap: the adapter
1036 *	@q: the Tx queue
1037 *
1038 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1039 *	where the HW may go to sleep just after we check; in that case the
1040 *	interrupt handler will detect the outstanding TX packet and ring the
1041 *	doorbell for us.
1042 *
1043 *	When GTS is disabled we unconditionally ring the doorbell.
1044 */
1045static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q)
1046{
1047#if USE_GTS
1048	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1049	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1050		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1051		t3_write_reg(adap, A_SG_KDOORBELL,
1052			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1053	}
1054#else
1055	wmb();			/* write descriptors before telling HW */
1056	t3_write_reg(adap, A_SG_KDOORBELL,
1057		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1058#endif
1059}
1060
1061static inline void wr_gen2(struct tx_desc *d, unsigned int gen)
1062{
1063#if SGE_NUM_GENBITS == 2
1064	d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen);
1065#endif
1066}
1067
1068/**
1069 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1070 *	@ndesc: number of Tx descriptors spanned by the SGL
1071 *	@skb: the packet corresponding to the WR
1072 *	@d: first Tx descriptor to be written
1073 *	@pidx: index of above descriptors
1074 *	@q: the SGE Tx queue
1075 *	@sgl: the SGL
1076 *	@flits: number of flits to the start of the SGL in the first descriptor
1077 *	@sgl_flits: the SGL size in flits
1078 *	@gen: the Tx descriptor generation
1079 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1080 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1081 *
1082 *	Write a work request header and an associated SGL.  If the SGL is
1083 *	small enough to fit into one Tx descriptor it has already been written
1084 *	and we just need to write the WR header.  Otherwise we distribute the
1085 *	SGL across the number of descriptors it spans.
1086 */
1087static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb,
1088			     struct tx_desc *d, unsigned int pidx,
1089			     const struct sge_txq *q,
1090			     const struct sg_ent *sgl,
1091			     unsigned int flits, unsigned int sgl_flits,
1092			     unsigned int gen, __be32 wr_hi,
1093			     __be32 wr_lo)
1094{
1095	struct work_request_hdr *wrp = (struct work_request_hdr *)d;
1096	struct tx_sw_desc *sd = &q->sdesc[pidx];
1097
1098	sd->skb = skb;
1099	if (need_skb_unmap()) {
1100		sd->fragidx = 0;
1101		sd->addr_idx = 0;
1102		sd->sflit = flits;
1103	}
1104
1105	if (likely(ndesc == 1)) {
1106		sd->eop = 1;
1107		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1108				   V_WR_SGLSFLT(flits)) | wr_hi;
1109		dma_wmb();
1110		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1111				   V_WR_GEN(gen)) | wr_lo;
1112		wr_gen2(d, gen);
1113	} else {
1114		unsigned int ogen = gen;
1115		const u64 *fp = (const u64 *)sgl;
1116		struct work_request_hdr *wp = wrp;
1117
1118		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1119				   V_WR_SGLSFLT(flits)) | wr_hi;
1120
1121		while (sgl_flits) {
1122			unsigned int avail = WR_FLITS - flits;
1123
1124			if (avail > sgl_flits)
1125				avail = sgl_flits;
1126			memcpy(&d->flit[flits], fp, avail * sizeof(*fp));
1127			sgl_flits -= avail;
1128			ndesc--;
1129			if (!sgl_flits)
1130				break;
1131
1132			fp += avail;
1133			d++;
1134			sd->eop = 0;
1135			sd++;
1136			if (++pidx == q->size) {
1137				pidx = 0;
1138				gen ^= 1;
1139				d = q->desc;
1140				sd = q->sdesc;
1141			}
1142
1143			sd->skb = skb;
1144			wrp = (struct work_request_hdr *)d;
1145			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1146					   V_WR_SGLSFLT(1)) | wr_hi;
1147			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1148							sgl_flits + 1)) |
1149					   V_WR_GEN(gen)) | wr_lo;
1150			wr_gen2(d, gen);
1151			flits = 1;
1152		}
1153		sd->eop = 1;
1154		wrp->wr_hi |= htonl(F_WR_EOP);
1155		dma_wmb();
1156		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1157		wr_gen2((struct tx_desc *)wp, ogen);
1158		WARN_ON(ndesc != 0);
1159	}
1160}
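
/*
 * Worked example (illustrative, assuming WR_FLITS == 16 as in the
 * SGE_NUM_GENBITS == 1 layout): for flits == 3 header flits and
 * sgl_flits == 20, the first descriptor takes 16 - 3 = 13 SGL flits and the
 * second takes the remaining 7 after its 1-flit header, so ndesc == 2,
 * which matches flits_to_desc(23).
 */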
1161
1162/**
1163 *	write_tx_pkt_wr - write a TX_PKT work request
1164 *	@adap: the adapter
1165 *	@skb: the packet to send
1166 *	@pi: the egress interface
1167 *	@pidx: index of the first Tx descriptor to write
1168 *	@gen: the generation value to use
1169 *	@q: the Tx queue
1170 *	@ndesc: number of descriptors the packet will occupy
1171 *	@compl: the value of the COMPL bit to use
1172 *	@addr: the array of DMA addresses for the packet body and fragments
1173 *
1174 *	Generate a TX_PKT work request to send the supplied packet.
1175 */
1176static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
1177			    const struct port_info *pi,
1178			    unsigned int pidx, unsigned int gen,
1179			    struct sge_txq *q, unsigned int ndesc,
1180			    unsigned int compl, const dma_addr_t *addr)
1181{
1182	unsigned int flits, sgl_flits, cntrl, tso_info;
1183	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1184	struct tx_desc *d = &q->desc[pidx];
1185	struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d;
1186
1187	cpl->len = htonl(skb->len);
1188	cntrl = V_TXPKT_INTF(pi->port_id);
1189
1190	if (skb_vlan_tag_present(skb))
1191		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(skb_vlan_tag_get(skb));
1192
1193	tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
1194	if (tso_info) {
1195		int eth_type;
1196		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl;
1197
1198		d->flit[2] = 0;
1199		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1200		hdr->cntrl = htonl(cntrl);
1201		eth_type = skb_network_offset(skb) == ETH_HLEN ?
1202		    CPL_ETH_II : CPL_ETH_II_VLAN;
1203		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1204		    V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) |
1205		    V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff);
1206		hdr->lso_info = htonl(tso_info);
1207		flits = 3;
1208	} else {
1209		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1210		cntrl |= F_TXPKT_IPCSUM_DIS;	/* SW calculates IP csum */
1211		cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL);
1212		cpl->cntrl = htonl(cntrl);
1213
1214		if (skb->len <= WR_LEN - sizeof(*cpl)) {
1215			q->sdesc[pidx].skb = NULL;
1216			if (!skb->data_len)
1217				skb_copy_from_linear_data(skb, &d->flit[2],
1218							  skb->len);
1219			else
1220				skb_copy_bits(skb, 0, &d->flit[2], skb->len);
1221
1222			flits = (skb->len + 7) / 8 + 2;
1223			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) |
1224					      V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT)
1225					      | F_WR_SOP | F_WR_EOP | compl);
1226			dma_wmb();
1227			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) |
1228					      V_WR_TID(q->token));
1229			wr_gen2(d, gen);
1230			dev_consume_skb_any(skb);
1231			return;
1232		}
1233
1234		flits = 2;
1235	}
1236
1237	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1238	sgl_flits = write_sgl(skb, sgp, skb->data, skb_headlen(skb), addr);
1239
1240	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
1241			 htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
1242			 htonl(V_WR_TID(q->token)));
1243}
1244
1245static inline void t3_stop_tx_queue(struct netdev_queue *txq,
1246				    struct sge_qset *qs, struct sge_txq *q)
1247{
1248	netif_tx_stop_queue(txq);
1249	set_bit(TXQ_ETH, &qs->txq_stopped);
1250	q->stops++;
1251}
1252
1253/**
1254 *	t3_eth_xmit - add a packet to the Ethernet Tx queue
1255 *	@skb: the packet
1256 *	@dev: the egress net device
1257 *
1258 *	Add a packet to an SGE Tx queue.  Runs with softirqs disabled.
1259 */
1260netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1261{
1262	int qidx;
1263	unsigned int ndesc, pidx, credits, gen, compl;
1264	const struct port_info *pi = netdev_priv(dev);
1265	struct adapter *adap = pi->adapter;
1266	struct netdev_queue *txq;
1267	struct sge_qset *qs;
1268	struct sge_txq *q;
1269	dma_addr_t addr[MAX_SKB_FRAGS + 1];
1270
1271	/*
1272	 * The chip min packet length is 9 octets but play safe and reject
1273	 * anything shorter than an Ethernet header.
1274	 */
1275	if (unlikely(skb->len < ETH_HLEN)) {
1276		dev_kfree_skb_any(skb);
1277		return NETDEV_TX_OK;
1278	}
1279
1280	qidx = skb_get_queue_mapping(skb);
1281	qs = &pi->qs[qidx];
1282	q = &qs->txq[TXQ_ETH];
1283	txq = netdev_get_tx_queue(dev, qidx);
1284
1285	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1286
1287	credits = q->size - q->in_use;
1288	ndesc = calc_tx_descs(skb);
1289
1290	if (unlikely(credits < ndesc)) {
1291		t3_stop_tx_queue(txq, qs, q);
1292		dev_err(&adap->pdev->dev,
1293			"%s: Tx ring %u full while queue awake!\n",
1294			dev->name, q->cntxt_id & 7);
1295		return NETDEV_TX_BUSY;
1296	}
1297
1298	/* If the packet cannot be sent as immediate data, map it for DMA */
1299	if (skb->len > (WR_LEN - sizeof(struct cpl_tx_pkt))) {
1300		if (unlikely(map_skb(adap->pdev, skb, addr) < 0)) {
1301			dev_kfree_skb(skb);
1302			return NETDEV_TX_OK;
1303		}
1304	}
1305
1306	q->in_use += ndesc;
1307	if (unlikely(credits - ndesc < q->stop_thres)) {
1308		t3_stop_tx_queue(txq, qs, q);
1309
1310		if (should_restart_tx(q) &&
1311		    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1312			q->restarts++;
1313			netif_tx_start_queue(txq);
1314		}
1315	}
1316
1317	gen = q->gen;
1318	q->unacked += ndesc;
1319	compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1320	q->unacked &= 7;
1321	pidx = q->pidx;
1322	q->pidx += ndesc;
1323	if (q->pidx >= q->size) {
1324		q->pidx -= q->size;
1325		q->gen ^= 1;
1326	}
1327
1328	/* update port statistics */
1329	if (skb->ip_summed == CHECKSUM_PARTIAL)
1330		qs->port_stats[SGE_PSTAT_TX_CSUM]++;
1331	if (skb_shinfo(skb)->gso_size)
1332		qs->port_stats[SGE_PSTAT_TSO]++;
1333	if (skb_vlan_tag_present(skb))
1334		qs->port_stats[SGE_PSTAT_VLANINS]++;
1335
1336	/*
1337	 * We do not use Tx completion interrupts to free DMAd Tx packets.
1338	 * This is good for performance but means that we rely on new Tx
1339	 * packets arriving to run the destructors of completed packets,
1340	 * which open up space in their sockets' send queues.  Sometimes
1341	 * we do not get such new packets causing Tx to stall.  A single
1342	 * UDP transmitter is a good example of this situation.  We have
1343	 * a clean up timer that periodically reclaims completed packets
1344	 * but it doesn't run often enough (nor do we want it to) to prevent
1345	 * lengthy stalls.  A solution to this problem is to run the
1346	 * destructor early, after the packet is queued but before it's DMAd.
1347	 * A cons is that we lie to socket memory accounting, but the amount
1348	 * of extra memory is reasonable (limited by the number of Tx
1349	 * descriptors), the packets do actually get freed quickly by new
1350	 * packets almost always, and for protocols like TCP that wait for
1351	 * acks to really free up the data the extra memory is even less.
1352	 * On the positive side we run the destructors on the sending CPU
1353	 * rather than on a potentially different completing CPU, usually a
1354	 * good thing.  We also run them without holding our Tx queue lock,
1355	 * unlike what reclaim_completed_tx() would otherwise do.
1356	 *
1357	 * Run the destructor before telling the DMA engine about the packet
1358	 * to make sure it doesn't complete and get freed prematurely.
1359	 */
1360	if (likely(!skb_shared(skb)))
1361		skb_orphan(skb);
1362
1363	write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr);
1364	check_ring_tx_db(adap, q);
1365	return NETDEV_TX_OK;
1366}
1367
1368/**
1369 *	write_imm - write a packet into a Tx descriptor as immediate data
1370 *	@d: the Tx descriptor to write
1371 *	@skb: the packet
1372 *	@len: the length of packet data to write as immediate data
1373 *	@gen: the generation bit value to write
1374 *
1375 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1376 *	contains a work request at its beginning.  We must write the packet
1377 *	carefully so the SGE doesn't read it accidentally before it's written
1378 *	in its entirety.
1379 */
1380static inline void write_imm(struct tx_desc *d, struct sk_buff *skb,
1381			     unsigned int len, unsigned int gen)
1382{
1383	struct work_request_hdr *from = (struct work_request_hdr *)skb->data;
1384	struct work_request_hdr *to = (struct work_request_hdr *)d;
1385
1386	if (likely(!skb->data_len))
1387		memcpy(&to[1], &from[1], len - sizeof(*from));
1388	else
1389		skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from));
1390
1391	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1392					V_WR_BCNTLFLT(len & 7));
1393	dma_wmb();
1394	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1395					V_WR_LEN((len + 7) / 8));
1396	wr_gen2(d, gen);
1397	kfree_skb(skb);
1398}
1399
1400/**
1401 *	check_desc_avail - check descriptor availability on a send queue
1402 *	@adap: the adapter
1403 *	@q: the send queue
1404 *	@skb: the packet needing the descriptors
1405 *	@ndesc: the number of Tx descriptors needed
1406 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1407 *
1408 *	Checks if the requested number of Tx descriptors is available on an
1409 *	SGE send queue.  If the queue is already suspended or not enough
1410 *	descriptors are available the packet is queued for later transmission.
1411 *	Must be called with the Tx queue locked.
1412 *
1413 *	Returns 0 if enough descriptors are available, 1 if there aren't
1414 *	enough descriptors and the packet has been queued, and 2 if the caller
1415 *	needs to retry because there weren't enough descriptors at the
1416 *	beginning of the call but some freed up in the mean time.
1417 */
1418static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q,
1419				   struct sk_buff *skb, unsigned int ndesc,
1420				   unsigned int qid)
1421{
1422	if (unlikely(!skb_queue_empty(&q->sendq))) {
1423	      addq_exit:__skb_queue_tail(&q->sendq, skb);
1424		return 1;
1425	}
1426	if (unlikely(q->size - q->in_use < ndesc)) {
1427		struct sge_qset *qs = txq_to_qset(q, qid);
1428
1429		set_bit(qid, &qs->txq_stopped);
1430		smp_mb__after_atomic();
1431
1432		if (should_restart_tx(q) &&
1433		    test_and_clear_bit(qid, &qs->txq_stopped))
1434			return 2;
1435
1436		q->stops++;
1437		goto addq_exit;
1438	}
1439	return 0;
1440}
1441
1442/**
1443 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1444 *	@q: the SGE control Tx queue
1445 *
1446 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1447 *	that send only immediate data (presently just the control queues) and
1448 *	thus do not have any sk_buffs to release.
1449 */
1450static inline void reclaim_completed_tx_imm(struct sge_txq *q)
1451{
1452	unsigned int reclaim = q->processed - q->cleaned;
1453
1454	q->in_use -= reclaim;
1455	q->cleaned += reclaim;
1456}
1457
1458static inline int immediate(const struct sk_buff *skb)
1459{
1460	return skb->len <= WR_LEN;
1461}
1462
1463/**
1464 *	ctrl_xmit - send a packet through an SGE control Tx queue
1465 *	@adap: the adapter
1466 *	@q: the control queue
1467 *	@skb: the packet
1468 *
1469 *	Send a packet through an SGE control Tx queue.  Packets sent through
1470 *	a control queue must fit entirely as immediate data in a single Tx
1471 *	descriptor and have no page fragments.
1472 */
1473static int ctrl_xmit(struct adapter *adap, struct sge_txq *q,
1474		     struct sk_buff *skb)
1475{
1476	int ret;
1477	struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data;
1478
1479	if (unlikely(!immediate(skb))) {
1480		WARN_ON(1);
1481		dev_kfree_skb(skb);
1482		return NET_XMIT_SUCCESS;
1483	}
1484
1485	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1486	wrp->wr_lo = htonl(V_WR_TID(q->token));
1487
1488	spin_lock(&q->lock);
1489      again:reclaim_completed_tx_imm(q);
1490
1491	ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL);
1492	if (unlikely(ret)) {
1493		if (ret == 1) {
1494			spin_unlock(&q->lock);
1495			return NET_XMIT_CN;
1496		}
1497		goto again;
1498	}
1499
1500	write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1501
1502	q->in_use++;
1503	if (++q->pidx >= q->size) {
1504		q->pidx = 0;
1505		q->gen ^= 1;
1506	}
1507	spin_unlock(&q->lock);
1508	wmb();
1509	t3_write_reg(adap, A_SG_KDOORBELL,
1510		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1511	return NET_XMIT_SUCCESS;
1512}
1513
1514/**
1515 *	restart_ctrlq - restart a suspended control queue
1516 *	@w: pointer to the work associated with this handler
1517 *
1518 *	Resumes transmission on a suspended Tx control queue.
1519 */
1520static void restart_ctrlq(struct work_struct *w)
1521{
1522	struct sk_buff *skb;
1523	struct sge_qset *qs = container_of(w, struct sge_qset,
1524					   txq[TXQ_CTRL].qresume_task);
1525	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1526
1527	spin_lock(&q->lock);
1528      again:reclaim_completed_tx_imm(q);
1529
1530	while (q->in_use < q->size &&
1531	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1532
1533		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1534
1535		if (++q->pidx >= q->size) {
1536			q->pidx = 0;
1537			q->gen ^= 1;
1538		}
1539		q->in_use++;
1540	}
1541
1542	if (!skb_queue_empty(&q->sendq)) {
1543		set_bit(TXQ_CTRL, &qs->txq_stopped);
1544		smp_mb__after_atomic();
1545
1546		if (should_restart_tx(q) &&
1547		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1548			goto again;
1549		q->stops++;
1550	}
1551
1552	spin_unlock(&q->lock);
1553	wmb();
1554	t3_write_reg(qs->adap, A_SG_KDOORBELL,
1555		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1556}
1557
1558/*
1559 * Send a management message through control queue 0
1560 */
1561int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
1562{
1563	int ret;
1564	local_bh_disable();
1565	ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb);
1566	local_bh_enable();
1567
1568	return ret;
1569}
1570
1571/**
1572 *	deferred_unmap_destructor - unmap a packet when it is freed
1573 *	@skb: the packet
1574 *
1575 *	This is the packet destructor used for Tx packets that need to remain
1576 *	mapped until they are freed rather than until their Tx descriptors are
1577 *	freed.
1578 */
1579static void deferred_unmap_destructor(struct sk_buff *skb)
1580{
1581	int i;
1582	const dma_addr_t *p;
1583	const struct skb_shared_info *si;
1584	const struct deferred_unmap_info *dui;
1585
1586	dui = (struct deferred_unmap_info *)skb->head;
1587	p = dui->addr;
1588
1589	if (skb_tail_pointer(skb) - skb_transport_header(skb))
1590		dma_unmap_single(&dui->pdev->dev, *p++,
1591				 skb_tail_pointer(skb) - skb_transport_header(skb),
1592				 DMA_TO_DEVICE);
1593
1594	si = skb_shinfo(skb);
1595	for (i = 0; i < si->nr_frags; i++)
1596		dma_unmap_page(&dui->pdev->dev, *p++,
1597			       skb_frag_size(&si->frags[i]), DMA_TO_DEVICE);
1598}
1599
1600static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
1601				     const struct sg_ent *sgl, int sgl_flits)
1602{
1603	dma_addr_t *p;
1604	struct deferred_unmap_info *dui;
1605
1606	dui = (struct deferred_unmap_info *)skb->head;
1607	dui->pdev = pdev;
1608	for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) {
1609		*p++ = be64_to_cpu(sgl->addr[0]);
1610		*p++ = be64_to_cpu(sgl->addr[1]);
1611	}
1612	if (sgl_flits)
1613		*p = be64_to_cpu(sgl->addr[0]);
1614}
1615
1616/**
1617 *	write_ofld_wr - write an offload work request
1618 *	@adap: the adapter
1619 *	@skb: the packet to send
1620 *	@q: the Tx queue
1621 *	@pidx: index of the first Tx descriptor to write
1622 *	@gen: the generation value to use
1623 *	@ndesc: number of descriptors the packet will occupy
1624 *	@addr: the array of DMA addresses for the packet body and fragments
1625 *
1626 *	Write an offload work request to send the supplied packet.  The packet
1627 *	data already carry the work request with most fields populated.
1628 */
1629static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
1630			  struct sge_txq *q, unsigned int pidx,
1631			  unsigned int gen, unsigned int ndesc,
1632			  const dma_addr_t *addr)
1633{
1634	unsigned int sgl_flits, flits;
1635	struct work_request_hdr *from;
1636	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1637	struct tx_desc *d = &q->desc[pidx];
1638
1639	if (immediate(skb)) {
1640		q->sdesc[pidx].skb = NULL;
1641		write_imm(d, skb, skb->len, gen);
1642		return;
1643	}
1644
1645	/* Only TX_DATA builds SGLs */
1646
1647	from = (struct work_request_hdr *)skb->data;
1648	memcpy(&d->flit[1], &from[1],
1649	       skb_transport_offset(skb) - sizeof(*from));
1650
1651	flits = skb_transport_offset(skb) / 8;
1652	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1653	sgl_flits = write_sgl(skb, sgp, skb_transport_header(skb),
1654			      skb_tail_pointer(skb) - skb_transport_header(skb),
1655			      addr);
1656	if (need_skb_unmap()) {
1657		setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
1658		skb->destructor = deferred_unmap_destructor;
1659	}
1660
1661	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits,
1662			 gen, from->wr_hi, from->wr_lo);
1663}
1664
1665/**
1666 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1667 *	@skb: the packet
1668 *
1669 * 	Returns the number of Tx descriptors needed for the given offload
1670 * 	packet.  These packets are already fully constructed.
1671 */
1672static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb)
1673{
1674	unsigned int flits, cnt;
1675
1676	if (skb->len <= WR_LEN)
1677		return 1;	/* packet fits as immediate data */
1678
1679	flits = skb_transport_offset(skb) / 8;	/* headers */
1680	cnt = skb_shinfo(skb)->nr_frags;
1681	if (skb_tail_pointer(skb) != skb_transport_header(skb))
1682		cnt++;
1683	return flits_to_desc(flits + sgl_len(cnt));
1684}
1685
1686/**
1687 *	ofld_xmit - send a packet through an offload queue
1688 *	@adap: the adapter
1689 *	@q: the Tx offload queue
1690 *	@skb: the packet
1691 *
1692 *	Send an offload packet through an SGE offload queue.
1693 */
1694static int ofld_xmit(struct adapter *adap, struct sge_txq *q,
1695		     struct sk_buff *skb)
1696{
1697	int ret;
1698	unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen;
1699
1700	spin_lock(&q->lock);
1701again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1702
1703	ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD);
1704	if (unlikely(ret)) {
1705		if (ret == 1) {
1706			skb->priority = ndesc;	/* save for restart */
1707			spin_unlock(&q->lock);
1708			return NET_XMIT_CN;
1709		}
1710		goto again;
1711	}
1712
1713	if (!immediate(skb) &&
1714	    map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) {
1715		spin_unlock(&q->lock);
1716		return NET_XMIT_SUCCESS;
1717	}
1718
1719	gen = q->gen;
1720	q->in_use += ndesc;
1721	pidx = q->pidx;
1722	q->pidx += ndesc;
1723	if (q->pidx >= q->size) {
1724		q->pidx -= q->size;
1725		q->gen ^= 1;
1726	}
1727	spin_unlock(&q->lock);
1728
1729	write_ofld_wr(adap, skb, q, pidx, gen, ndesc, (dma_addr_t *)skb->head);
1730	check_ring_tx_db(adap, q);
1731	return NET_XMIT_SUCCESS;
1732}
1733
1734/**
1735 *	restart_offloadq - restart a suspended offload queue
1736 *	@w: pointer to the work associated with this handler
1737 *
1738 *	Resumes transmission on a suspended Tx offload queue.
1739 */
1740static void restart_offloadq(struct work_struct *w)
1741{
1742	struct sk_buff *skb;
1743	struct sge_qset *qs = container_of(w, struct sge_qset,
1744					   txq[TXQ_OFLD].qresume_task);
1745	struct sge_txq *q = &qs->txq[TXQ_OFLD];
1746	const struct port_info *pi = netdev_priv(qs->netdev);
1747	struct adapter *adap = pi->adapter;
1748	unsigned int written = 0;
1749
1750	spin_lock(&q->lock);
1751again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1752
1753	while ((skb = skb_peek(&q->sendq)) != NULL) {
1754		unsigned int gen, pidx;
1755		unsigned int ndesc = skb->priority;
1756
1757		if (unlikely(q->size - q->in_use < ndesc)) {
1758			set_bit(TXQ_OFLD, &qs->txq_stopped);
1759			smp_mb__after_atomic();
1760
1761			if (should_restart_tx(q) &&
1762			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1763				goto again;
1764			q->stops++;
1765			break;
1766		}
1767
1768		if (!immediate(skb) &&
1769		    map_skb(adap->pdev, skb, (dma_addr_t *)skb->head))
1770			break;
1771
1772		gen = q->gen;
1773		q->in_use += ndesc;
1774		pidx = q->pidx;
1775		q->pidx += ndesc;
1776		written += ndesc;
1777		if (q->pidx >= q->size) {
1778			q->pidx -= q->size;
1779			q->gen ^= 1;
1780		}
1781		__skb_unlink(skb, &q->sendq);
1782		spin_unlock(&q->lock);
1783
1784		write_ofld_wr(adap, skb, q, pidx, gen, ndesc,
1785			      (dma_addr_t *)skb->head);
1786		spin_lock(&q->lock);
1787	}
1788	spin_unlock(&q->lock);
1789
1790#if USE_GTS
1791	set_bit(TXQ_RUNNING, &q->flags);
1792	set_bit(TXQ_LAST_PKT_DB, &q->flags);
1793#endif
1794	wmb();
1795	if (likely(written))
1796		t3_write_reg(adap, A_SG_KDOORBELL,
1797			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1798}
1799
1800/**
1801 *	queue_set - return the queue set a packet should use
1802 *	@skb: the packet
1803 *
1804 *	Maps a packet to the SGE queue set it should use.  The desired queue
1805 *	set is carried in bits 1-3 in the packet's priority.
1806 */
1807static inline int queue_set(const struct sk_buff *skb)
1808{
1809	return skb->priority >> 1;
1810}
1811
1812/**
1813 *	is_ctrl_pkt - return whether an offload packet is a control packet
1814 *	@skb: the packet
1815 *
1816 *	Determines whether an offload packet should use an OFLD or a CTRL
1817 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
1818 */
1819static inline int is_ctrl_pkt(const struct sk_buff *skb)
1820{
1821	return skb->priority & 1;
1822}
1823
1824/**
1825 *	t3_offload_tx - send an offload packet
1826 *	@tdev: the offload device to send to
1827 *	@skb: the packet
1828 *
1829 *	Sends an offload packet.  We use the packet priority to select the
1830 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
1831 *	should be sent as regular or control, bits 1-3 select the queue set.
1832 */
1833int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
1834{
1835	struct adapter *adap = tdev2adap(tdev);
1836	struct sge_qset *qs = &adap->sge.qs[queue_set(skb)];
1837
1838	if (unlikely(is_ctrl_pkt(skb)))
1839		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb);
1840
1841	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb);
1842}
1843
1844/**
1845 *	offload_enqueue - add an offload packet to an SGE offload receive queue
1846 *	@q: the SGE response queue
1847 *	@skb: the packet
1848 *
1849 *	Add a new offload packet to an SGE response queue's offload packet
1850 *	queue.  If the packet is the first on the queue it schedules the RX
1851 *	softirq to process the queue.
1852 */
1853static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb)
1854{
1855	int was_empty = skb_queue_empty(&q->rx_queue);
1856
1857	__skb_queue_tail(&q->rx_queue, skb);
1858
1859	if (was_empty) {
1860		struct sge_qset *qs = rspq_to_qset(q);
1861
1862		napi_schedule(&qs->napi);
1863	}
1864}
1865
1866/**
1867 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
1868 *	@tdev: the offload device that will be receiving the packets
1869 *	@q: the SGE response queue that assembled the bundle
1870 *	@skbs: the partial bundle
1871 *	@n: the number of packets in the bundle
1872 *
1873 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
1874 */
1875static inline void deliver_partial_bundle(struct t3cdev *tdev,
1876					  struct sge_rspq *q,
1877					  struct sk_buff *skbs[], int n)
1878{
1879	if (n) {
1880		q->offload_bundles++;
1881		tdev->recv(tdev, skbs, n);
1882	}
1883}
1884
1885/**
1886 *	ofld_poll - NAPI handler for offload packets in interrupt mode
1887 *	@napi: the network device doing the polling
1888 *	@budget: polling budget
1889 *
1890 *	The NAPI handler for offload packets when a response queue is serviced
1891 *	by the hard interrupt handler, i.e., when it's operating in non-polling
1892 *	mode.  Creates small packet batches and sends them through the offload
1893 *	receive handler.  Batches need to be of modest size as we do prefetches
1894 *	on the packets in each.
1895 */
1896static int ofld_poll(struct napi_struct *napi, int budget)
1897{
1898	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
1899	struct sge_rspq *q = &qs->rspq;
1900	struct adapter *adapter = qs->adap;
1901	int work_done = 0;
1902
1903	while (work_done < budget) {
1904		struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE];
1905		struct sk_buff_head queue;
1906		int ngathered;
1907
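		/*
		 * Splice the pending offload packets onto a private list so
		 * they can be bundled and handed to the offload receive
		 * handler without holding the response queue lock.
		 */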
1908		spin_lock_irq(&q->lock);
1909		__skb_queue_head_init(&queue);
1910		skb_queue_splice_init(&q->rx_queue, &queue);
1911		if (skb_queue_empty(&queue)) {
1912			napi_complete_done(napi, work_done);
1913			spin_unlock_irq(&q->lock);
1914			return work_done;
1915		}
1916		spin_unlock_irq(&q->lock);
1917
1918		ngathered = 0;
1919		skb_queue_walk_safe(&queue, skb, tmp) {
1920			if (work_done >= budget)
1921				break;
1922			work_done++;
1923
1924			__skb_unlink(skb, &queue);
1925			prefetch(skb->data);
1926			skbs[ngathered] = skb;
1927			if (++ngathered == RX_BUNDLE_SIZE) {
1928				q->offload_bundles++;
1929				adapter->tdev.recv(&adapter->tdev, skbs,
1930						   ngathered);
1931				ngathered = 0;
1932			}
1933		}
1934		if (!skb_queue_empty(&queue)) {
1935			/* splice remaining packets back onto Rx queue */
1936			spin_lock_irq(&q->lock);
1937			skb_queue_splice(&queue, &q->rx_queue);
1938			spin_unlock_irq(&q->lock);
1939		}
1940		deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
1941	}
1942
1943	return work_done;
1944}
1945
1946/**
1947 *	rx_offload - process a received offload packet
1948 *	@tdev: the offload device receiving the packet
1949 *	@rq: the response queue that received the packet
1950 *	@skb: the packet
1951 *	@rx_gather: a gather list of packets if we are building a bundle
1952 *	@gather_idx: index of the next available slot in the bundle
1953 *
1954 *	Process an ingress offload packet and add it to the offload ingress
1955 *	queue.  Returns the index of the next available slot in the bundle.
1956 */
1957static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1958			     struct sk_buff *skb, struct sk_buff *rx_gather[],
1959			     unsigned int gather_idx)
1960{
1961	skb_reset_mac_header(skb);
1962	skb_reset_network_header(skb);
1963	skb_reset_transport_header(skb);
1964
1965	if (rq->polling) {
1966		rx_gather[gather_idx++] = skb;
1967		if (gather_idx == RX_BUNDLE_SIZE) {
1968			tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE);
1969			gather_idx = 0;
1970			rq->offload_bundles++;
1971		}
1972	} else
1973		offload_enqueue(rq, skb);
1974
1975	return gather_idx;
1976}
1977
1978/**
1979 *	restart_tx - check whether to restart suspended Tx queues
1980 *	@qs: the queue set to resume
1981 *
1982 *	Restarts suspended Tx queues of an SGE queue set if they have enough
1983 *	free resources to resume operation.
1984 */
1985static void restart_tx(struct sge_qset *qs)
1986{
1987	if (test_bit(TXQ_ETH, &qs->txq_stopped) &&
1988	    should_restart_tx(&qs->txq[TXQ_ETH]) &&
1989	    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1990		qs->txq[TXQ_ETH].restarts++;
1991		if (netif_running(qs->netdev))
1992			netif_tx_wake_queue(qs->tx_q);
1993	}
1994
1995	if (test_bit(TXQ_OFLD, &qs->txq_stopped) &&
1996	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
1997	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
1998		qs->txq[TXQ_OFLD].restarts++;
1999
2000		/* The work can be quite lengthy so we use the driver's own workqueue */
2001		queue_work(cxgb3_wq, &qs->txq[TXQ_OFLD].qresume_task);
2002	}
2003	if (test_bit(TXQ_CTRL, &qs->txq_stopped) &&
2004	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2005	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2006		qs->txq[TXQ_CTRL].restarts++;
2007
2008		/* The work can be quite lengthy so we use the driver's own workqueue */
2009		queue_work(cxgb3_wq, &qs->txq[TXQ_CTRL].qresume_task);
2010	}
2011}
2012
2013/**
2014 *	cxgb3_arp_process - process an ARP request probing a private IP address
2015 *	@pi: the port info
2016 *	@skb: the skbuff containing the ARP request
2017 *
2018 *	Check if the ARP request is probing the private IP address
2019 *	dedicated to iSCSI, generate an ARP reply if so.
2020 */
2021static void cxgb3_arp_process(struct port_info *pi, struct sk_buff *skb)
2022{
2023	struct net_device *dev = skb->dev;
2024	struct arphdr *arp;
2025	unsigned char *arp_ptr;
2026	unsigned char *sha;
2027	__be32 sip, tip;
2028
2029	if (!dev)
2030		return;
2031
2032	skb_reset_network_header(skb);
2033	arp = arp_hdr(skb);
2034
2035	if (arp->ar_op != htons(ARPOP_REQUEST))
2036		return;
2037
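	/*
	 * Walk the ARP payload by hand: sender HW address, sender IP,
	 * target HW address (skipped), target IP.
	 */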
2038	arp_ptr = (unsigned char *)(arp + 1);
2039	sha = arp_ptr;
2040	arp_ptr += dev->addr_len;
2041	memcpy(&sip, arp_ptr, sizeof(sip));
2042	arp_ptr += sizeof(sip);
2043	arp_ptr += dev->addr_len;
2044	memcpy(&tip, arp_ptr, sizeof(tip));
2045
2046	if (tip != pi->iscsi_ipv4addr)
2047		return;
2048
2049	arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
2050		 pi->iscsic.mac_addr, sha);
2052}
2053
2054static inline int is_arp(struct sk_buff *skb)
2055{
2056	return skb->protocol == htons(ETH_P_ARP);
2057}
2058
2059static void cxgb3_process_iscsi_prov_pack(struct port_info *pi,
2060					struct sk_buff *skb)
2061{
2062	if (is_arp(skb)) {
2063		cxgb3_arp_process(pi, skb);
2064		return;
2065	}
2066
2067	if (pi->iscsic.recv)
2068		pi->iscsic.recv(pi, skb);
2070}
2071
2072/**
2073 *	rx_eth - process an ingress ethernet packet
2074 *	@adap: the adapter
2075 *	@rq: the response queue that received the packet
2076 *	@skb: the packet
2077 *	@pad: padding
2078 *	@lro: large receive offload
2079 *
2080 *	Process an ingress ethernet packet and deliver it to the stack.
2081 *	The padding is 2 if the packet was delivered in an Rx buffer and 0
2082 *	if it was immediate data in a response.
2083 */
2084static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
2085		   struct sk_buff *skb, int pad, int lro)
2086{
2087	struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad);
2088	struct sge_qset *qs = rspq_to_qset(rq);
2089	struct port_info *pi;
2090
2091	skb_pull(skb, sizeof(*p) + pad);
2092	skb->protocol = eth_type_trans(skb, adap->port[p->iff]);
2093	pi = netdev_priv(skb->dev);
2094	if ((skb->dev->features & NETIF_F_RXCSUM) && p->csum_valid &&
2095	    p->csum == htons(0xffff) && !p->fragment) {
2096		qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2097		skb->ip_summed = CHECKSUM_UNNECESSARY;
2098	} else
2099		skb_checksum_none_assert(skb);
2100	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
2101
2102	if (p->vlan_valid) {
2103		qs->port_stats[SGE_PSTAT_VLANEX]++;
2104		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(p->vlan));
2105	}
2106	if (rq->polling) {
2107		if (lro)
2108			napi_gro_receive(&qs->napi, skb);
2109		else {
2110			if (unlikely(pi->iscsic.flags))
2111				cxgb3_process_iscsi_prov_pack(pi, skb);
2112			netif_receive_skb(skb);
2113		}
2114	} else
2115		netif_rx(skb);
2116}
2117
2118static inline int is_eth_tcp(u32 rss)
2119{
2120	return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE;
2121}
2122
2123/**
2124 *	lro_add_page - add a page chunk to an LRO session
2125 *	@adap: the adapter
2126 *	@qs: the associated queue set
2127 *	@fl: the free list containing the page chunk to add
2128 *	@len: packet length
2129 *	@complete: indicates the last fragment of a frame
2130 *
2131 *	Add a received packet contained in a page chunk to an existing LRO
2132 *	session.
2133 */
2134static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
2135			 struct sge_fl *fl, int len, int complete)
2136{
2137	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2138	struct port_info *pi = netdev_priv(qs->netdev);
2139	struct sk_buff *skb = NULL;
2140	struct cpl_rx_pkt *cpl;
2141	skb_frag_t *rx_frag;
2142	int nr_frags;
2143	int offset = 0;
2144
2145	if (!qs->nomem) {
2146		skb = napi_get_frags(&qs->napi);
2147		qs->nomem = !skb;
2148	}
2149
2150	fl->credits--;
2151
2152	dma_sync_single_for_cpu(&adap->pdev->dev,
2153				dma_unmap_addr(sd, dma_addr),
2154				fl->buf_size - SGE_PG_RSVD, DMA_FROM_DEVICE);
2155
2156	(*sd->pg_chunk.p_cnt)--;
2157	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
2158		dma_unmap_page(&adap->pdev->dev, sd->pg_chunk.mapping,
2159			       fl->alloc_size, DMA_FROM_DEVICE);
2160
2161	if (!skb) {
2162		put_page(sd->pg_chunk.page);
2163		if (complete)
2164			qs->nomem = 0;
2165		return;
2166	}
2167
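	/*
	 * Attach the page chunk as the next fragment of the skb being built
	 * up for GRO.  The first fragment still contains the pad and CPL
	 * header, which the offset below skips over.
	 */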
2168	rx_frag = skb_shinfo(skb)->frags;
2169	nr_frags = skb_shinfo(skb)->nr_frags;
2170
2171	if (!nr_frags) {
2172		offset = 2 + sizeof(struct cpl_rx_pkt);
2173		cpl = qs->lro_va = sd->pg_chunk.va + 2;
2174
2175		if ((qs->netdev->features & NETIF_F_RXCSUM) &&
2176		     cpl->csum_valid && cpl->csum == htons(0xffff)) {
2177			skb->ip_summed = CHECKSUM_UNNECESSARY;
2178			qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2179		} else
2180			skb->ip_summed = CHECKSUM_NONE;
2181	} else
2182		cpl = qs->lro_va;
2183
2184	len -= offset;
2185
2186	rx_frag += nr_frags;
2187	skb_frag_fill_page_desc(rx_frag, sd->pg_chunk.page,
2188				sd->pg_chunk.offset + offset, len);
2189
2190	skb->len += len;
2191	skb->data_len += len;
2192	skb->truesize += len;
2193	skb_shinfo(skb)->nr_frags++;
2194
2195	if (!complete)
2196		return;
2197
2198	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
2199
2200	if (cpl->vlan_valid) {
2201		qs->port_stats[SGE_PSTAT_VLANEX]++;
2202		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan));
2203	}
2204	napi_gro_frags(&qs->napi);
2205}
2206
2207/**
2208 *	handle_rsp_cntrl_info - handles control information in a response
2209 *	@qs: the queue set corresponding to the response
2210 *	@flags: the response control flags
2211 *
2212 *	Handles the control information of an SGE response, such as GTS
2213 *	indications and completion credits for the queue set's Tx queues.
2214 *	The HW coalesces credits; we don't do any extra SW coalescing.
2215 */
2216static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
2217{
2218	unsigned int credits;
2219
2220#if USE_GTS
2221	if (flags & F_RSPD_TXQ0_GTS)
2222		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2223#endif
2224
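	/*
	 * The credit fields report how many Tx descriptors the HW has
	 * retired from each queue since the last update; accumulate them in
	 * .processed for the Tx reclaim paths.
	 */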
2225	credits = G_RSPD_TXQ0_CR(flags);
2226	if (credits)
2227		qs->txq[TXQ_ETH].processed += credits;
2228
2229	credits = G_RSPD_TXQ2_CR(flags);
2230	if (credits)
2231		qs->txq[TXQ_CTRL].processed += credits;
2232
2233# if USE_GTS
2234	if (flags & F_RSPD_TXQ1_GTS)
2235		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2236# endif
2237	credits = G_RSPD_TXQ1_CR(flags);
2238	if (credits)
2239		qs->txq[TXQ_OFLD].processed += credits;
2240}
2241
2242/**
2243 *	check_ring_db - check if we need to ring any doorbells
2244 *	@adap: the adapter
2245 *	@qs: the queue set whose Tx queues are to be examined
2246 *	@sleeping: indicates which Tx queue sent GTS
2247 *
2248 *	Checks if some of a queue set's Tx queues need to ring their doorbells
2249 *	to resume transmission after idling while they still have unprocessed
2250 *	descriptors.
2251 */
2252static void check_ring_db(struct adapter *adap, struct sge_qset *qs,
2253			  unsigned int sleeping)
2254{
2255	if (sleeping & F_RSPD_TXQ0_GTS) {
2256		struct sge_txq *txq = &qs->txq[TXQ_ETH];
2257
2258		if (txq->cleaned + txq->in_use != txq->processed &&
2259		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2260			set_bit(TXQ_RUNNING, &txq->flags);
2261			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2262				     V_EGRCNTX(txq->cntxt_id));
2263		}
2264	}
2265
2266	if (sleeping & F_RSPD_TXQ1_GTS) {
2267		struct sge_txq *txq = &qs->txq[TXQ_OFLD];
2268
2269		if (txq->cleaned + txq->in_use != txq->processed &&
2270		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2271			set_bit(TXQ_RUNNING, &txq->flags);
2272			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2273				     V_EGRCNTX(txq->cntxt_id));
2274		}
2275	}
2276}
2277
2278/**
2279 *	is_new_response - check if a response is newly written
2280 *	@r: the response descriptor
2281 *	@q: the response queue
2282 *
2283 *	Returns true if a response descriptor contains a yet unprocessed
2284 *	response.
2285 */
2286static inline int is_new_response(const struct rsp_desc *r,
2287				  const struct sge_rspq *q)
2288{
2289	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2290}
2291
2292static inline void clear_rspq_bufstate(struct sge_rspq * const q)
2293{
2294	q->pg_skb = NULL;
2295	q->rx_recycle_buf = 0;
2296}
2297
2298#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2299#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2300			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2301			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2302			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2303
2304/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2305#define NOMEM_INTR_DELAY 2500
2306
2307/**
2308 *	process_responses - process responses from an SGE response queue
2309 *	@adap: the adapter
2310 *	@qs: the queue set to which the response queue belongs
2311 *	@budget: how many responses can be processed in this round
2312 *
2313 *	Process responses from an SGE response queue up to the supplied budget.
2314 *	Responses include received packets as well as credits and other events
2315 *	for the queues that belong to the response queue's queue set.
2316 *	A negative budget is effectively unlimited.
2317 *
2318 *	Additionally choose the interrupt holdoff time for the next interrupt
2319 *	on this queue.  If the system is under memory shortage use a fairly
2320 *	long delay to help recovery.
2321 */
2322static int process_responses(struct adapter *adap, struct sge_qset *qs,
2323			     int budget)
2324{
2325	struct sge_rspq *q = &qs->rspq;
2326	struct rsp_desc *r = &q->desc[q->cidx];
2327	int budget_left = budget;
2328	unsigned int sleeping = 0;
2329	struct sk_buff *offload_skbs[RX_BUNDLE_SIZE];
2330	int ngathered = 0;
2331
2332	q->next_holdoff = q->holdoff_tmr;
2333
2334	while (likely(budget_left && is_new_response(r, q))) {
2335		int packet_complete, eth, ethpad = 2;
2336		int lro = !!(qs->netdev->features & NETIF_F_GRO);
2337		struct sk_buff *skb = NULL;
2338		u32 len, flags;
2339		__be32 rss_hi, rss_lo;
2340
2341		dma_rmb();
2342		eth = r->rss_hdr.opcode == CPL_RX_PKT;
2343		rss_hi = *(const __be32 *)r;
2344		rss_lo = r->rss_hdr.rss_hash_val;
2345		flags = ntohl(r->flags);
2346
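		/*
		 * A response carries one of: an async notification, immediate
		 * data embedded in the response itself, or a reference to a
		 * free-list buffer (len_cq != 0).  Anything else is a pure
		 * credit/GTS update.
		 */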
2347		if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) {
2348			skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC);
2349			if (!skb)
2350				goto no_mem;
2351
2352			__skb_put_data(skb, r, AN_PKT_SIZE);
2353			skb->data[0] = CPL_ASYNC_NOTIF;
2354			rss_hi = htonl(CPL_ASYNC_NOTIF << 24);
2355			q->async_notif++;
2356		} else if (flags & F_RSPD_IMM_DATA_VALID) {
2357			skb = get_imm_packet(r);
2358			if (unlikely(!skb)) {
2359no_mem:
2360				q->next_holdoff = NOMEM_INTR_DELAY;
2361				q->nomem++;
2362				/* consume one credit since we tried */
2363				budget_left--;
2364				break;
2365			}
2366			q->imm_data++;
2367			ethpad = 0;
2368		} else if ((len = ntohl(r->len_cq)) != 0) {
2369			struct sge_fl *fl;
2370
2371			lro &= eth && is_eth_tcp(rss_hi);
2372
2373			fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2374			if (fl->use_pages) {
2375				void *addr = fl->sdesc[fl->cidx].pg_chunk.va;
2376
2377				net_prefetch(addr);
2378				__refill_fl(adap, fl);
2379				if (lro > 0) {
2380					lro_add_page(adap, qs, fl,
2381						     G_RSPD_LEN(len),
2382						     flags & F_RSPD_EOP);
2383					goto next_fl;
2384				}
2385
2386				skb = get_packet_pg(adap, fl, q,
2387						    G_RSPD_LEN(len),
2388						    eth ?
2389						    SGE_RX_DROP_THRES : 0);
2390				q->pg_skb = skb;
2391			} else
2392				skb = get_packet(adap, fl, G_RSPD_LEN(len),
2393						 eth ? SGE_RX_DROP_THRES : 0);
2394			if (unlikely(!skb)) {
2395				if (!eth)
2396					goto no_mem;
2397				q->rx_drops++;
2398			} else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT))
2399				__skb_pull(skb, 2);
2400next_fl:
2401			if (++fl->cidx == fl->size)
2402				fl->cidx = 0;
2403		} else
2404			q->pure_rsps++;
2405
2406		if (flags & RSPD_CTRL_MASK) {
2407			sleeping |= flags & RSPD_GTS_MASK;
2408			handle_rsp_cntrl_info(qs, flags);
2409		}
2410
2411		r++;
2412		if (unlikely(++q->cidx == q->size)) {
2413			q->cidx = 0;
2414			q->gen ^= 1;
2415			r = q->desc;
2416		}
2417		prefetch(r);
2418
2419		if (++q->credits >= (q->size / 4)) {
2420			refill_rspq(adap, q, q->credits);
2421			q->credits = 0;
2422		}
2423
2424		packet_complete = flags &
2425				  (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID |
2426				   F_RSPD_ASYNC_NOTIF);
2427
2428		if (skb != NULL && packet_complete) {
2429			if (eth)
2430				rx_eth(adap, q, skb, ethpad, lro);
2431			else {
2432				q->offload_pkts++;
2433				/* Preserve the RSS info in csum & priority */
2434				skb->csum = rss_hi;
2435				skb->priority = rss_lo;
2436				ngathered = rx_offload(&adap->tdev, q, skb,
2437						       offload_skbs,
2438						       ngathered);
2439			}
2440
2441			if (flags & F_RSPD_EOP)
2442				clear_rspq_bufstate(q);
2443		}
2444		--budget_left;
2445	}
2446
2447	deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
2448
2449	if (sleeping)
2450		check_ring_db(adap, qs, sleeping);
2451
2452	smp_mb();		/* commit Tx queue .processed updates */
2453	if (unlikely(qs->txq_stopped != 0))
2454		restart_tx(qs);
2455
2456	budget -= budget_left;
2457	return budget;
2458}
2459
2460static inline int is_pure_response(const struct rsp_desc *r)
2461{
2462	__be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID);
2463
2464	return (n | r->len_cq) == 0;
2465}
2466
2467/**
2468 *	napi_rx_handler - the NAPI handler for Rx processing
2469 *	@napi: the napi instance
2470 *	@budget: how many packets we can process in this round
2471 *
2472 *	Handler for new data events when using NAPI.
2473 */
2474static int napi_rx_handler(struct napi_struct *napi, int budget)
2475{
2476	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
2477	struct adapter *adap = qs->adap;
2478	int work_done = process_responses(adap, qs, budget);
2479
2480	if (likely(work_done < budget)) {
2481		napi_complete_done(napi, work_done);
2482
2483		/*
2484		 * Because we don't atomically flush the following
2485		 * write it is possible that in very rare cases it can
2486		 * reach the device in a way that races with a new
2487		 * response being written plus an error interrupt
2488		 * causing the NAPI interrupt handler below to return
2489		 * unhandled status to the OS.  To protect against
2490		 * this would require flushing the write and doing
2491		 * both the write and the flush with interrupts off.
2492		 * Way too expensive and unjustifiable given the
2493		 * rarity of the race.
2494		 *
2495		 * The race cannot happen at all with MSI-X.
2496		 */
2497		t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2498			     V_NEWTIMER(qs->rspq.next_holdoff) |
2499			     V_NEWINDEX(qs->rspq.cidx));
2500	}
2501	return work_done;
2502}
2503
2504/**
2505 *	process_pure_responses - process pure responses from a response queue
2506 *	@adap: the adapter
2507 *	@qs: the queue set owning the response queue
2508 *	@r: the first pure response to process
2509 *
2510 *	A simpler version of process_responses() that handles only pure (i.e.,
2511 *	non-data-carrying) responses.  Such responses are too light-weight to
2512 *	justify calling a softirq under NAPI, so we handle them specially in
2513 *	the interrupt handler.  The function is called with a pointer to a
2514 *	response, which the caller must ensure is a valid pure response.
2515 *
2516 *	Returns 1 if it encounters a valid data-carrying response, 0 otherwise.
2517 */
2518static int process_pure_responses(struct adapter *adap, struct sge_qset *qs,
2519				  struct rsp_desc *r)
2520{
2521	struct sge_rspq *q = &qs->rspq;
2522	unsigned int sleeping = 0;
2523
2524	do {
2525		u32 flags = ntohl(r->flags);
2526
2527		r++;
2528		if (unlikely(++q->cidx == q->size)) {
2529			q->cidx = 0;
2530			q->gen ^= 1;
2531			r = q->desc;
2532		}
2533		prefetch(r);
2534
2535		if (flags & RSPD_CTRL_MASK) {
2536			sleeping |= flags & RSPD_GTS_MASK;
2537			handle_rsp_cntrl_info(qs, flags);
2538		}
2539
2540		q->pure_rsps++;
2541		if (++q->credits >= (q->size / 4)) {
2542			refill_rspq(adap, q, q->credits);
2543			q->credits = 0;
2544		}
2545		if (!is_new_response(r, q))
2546			break;
2547		dma_rmb();
2548	} while (is_pure_response(r));
2549
2550	if (sleeping)
2551		check_ring_db(adap, qs, sleeping);
2552
2553	smp_mb();		/* commit Tx queue .processed updates */
2554	if (unlikely(qs->txq_stopped != 0))
2555		restart_tx(qs);
2556
2557	return is_new_response(r, q);
2558}
2559
2560/**
2561 *	handle_responses - decide what to do with new responses in NAPI mode
2562 *	@adap: the adapter
2563 *	@q: the response queue
2564 *
2565 *	This is used by the NAPI interrupt handlers to decide what to do with
2566 *	new SGE responses.  If there are no new responses it returns -1.  If
2567 *	there are new responses and they are pure (i.e., non-data carrying)
2568 *	it handles them straight in hard interrupt context as they are very
2569 *	cheap and don't deliver any packets.  Finally, if there are any data
2570 *	signaling responses it schedules the NAPI handler.  Returns 1 if it
2571 *	schedules NAPI, 0 if all new responses were pure.
2572 *
2573 *	The caller must ascertain NAPI is not already running.
2574 */
2575static inline int handle_responses(struct adapter *adap, struct sge_rspq *q)
2576{
2577	struct sge_qset *qs = rspq_to_qset(q);
2578	struct rsp_desc *r = &q->desc[q->cidx];
2579
2580	if (!is_new_response(r, q))
2581		return -1;
2582	dma_rmb();
2583	if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) {
2584		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2585			     V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
2586		return 0;
2587	}
2588	napi_schedule(&qs->napi);
2589	return 1;
2590}
2591
2592/*
2593 * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case
2594 * (i.e., response queue serviced in hard interrupt).
2595 */
2596static irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
2597{
2598	struct sge_qset *qs = cookie;
2599	struct adapter *adap = qs->adap;
2600	struct sge_rspq *q = &qs->rspq;
2601
2602	spin_lock(&q->lock);
2603	if (process_responses(adap, qs, -1) == 0)
2604		q->unhandled_irqs++;
2605	t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2606		     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2607	spin_unlock(&q->lock);
2608	return IRQ_HANDLED;
2609}
2610
2611/*
2612 * The MSI-X interrupt handler for an SGE response queue for the NAPI case
2613 * (i.e., response queue serviced by NAPI polling).
2614 */
2615static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
2616{
2617	struct sge_qset *qs = cookie;
2618	struct sge_rspq *q = &qs->rspq;
2619
2620	spin_lock(&q->lock);
2621
2622	if (handle_responses(qs->adap, q) < 0)
2623		q->unhandled_irqs++;
2624	spin_unlock(&q->lock);
2625	return IRQ_HANDLED;
2626}
2627
2628/*
2629 * The non-NAPI MSI interrupt handler.  This needs to handle data events from
2630 * SGE response queues as well as error and other async events as they all use
2631 * the same MSI vector.  We use one SGE response queue per port in this mode
2632 * and protect all response queues with queue 0's lock.
2633 */
2634static irqreturn_t t3_intr_msi(int irq, void *cookie)
2635{
2636	int new_packets = 0;
2637	struct adapter *adap = cookie;
2638	struct sge_rspq *q = &adap->sge.qs[0].rspq;
2639
2640	spin_lock(&q->lock);
2641
2642	if (process_responses(adap, &adap->sge.qs[0], -1)) {
2643		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2644			     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2645		new_packets = 1;
2646	}
2647
2648	if (adap->params.nports == 2 &&
2649	    process_responses(adap, &adap->sge.qs[1], -1)) {
2650		struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2651
2652		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) |
2653			     V_NEWTIMER(q1->next_holdoff) |
2654			     V_NEWINDEX(q1->cidx));
2655		new_packets = 1;
2656	}
2657
2658	if (!new_packets && t3_slow_intr_handler(adap) == 0)
2659		q->unhandled_irqs++;
2660
2661	spin_unlock(&q->lock);
2662	return IRQ_HANDLED;
2663}
2664
2665static int rspq_check_napi(struct sge_qset *qs)
2666{
2667	struct sge_rspq *q = &qs->rspq;
2668
2669	return is_new_response(&q->desc[q->cidx], q) && napi_schedule(&qs->napi);
2670}
2671
2672/*
2673 * The MSI interrupt handler for the NAPI case (i.e., response queues serviced
2674 * by NAPI polling).  Handles data events from SGE response queues as well as
2675 * error and other async events as they all use the same MSI vector.  We use
2676 * one SGE response queue per port in this mode and protect all response
2677 * queues with queue 0's lock.
2678 */
2679static irqreturn_t t3_intr_msi_napi(int irq, void *cookie)
2680{
2681	int new_packets;
2682	struct adapter *adap = cookie;
2683	struct sge_rspq *q = &adap->sge.qs[0].rspq;
2684
2685	spin_lock(&q->lock);
2686
2687	new_packets = rspq_check_napi(&adap->sge.qs[0]);
2688	if (adap->params.nports == 2)
2689		new_packets += rspq_check_napi(&adap->sge.qs[1]);
2690	if (!new_packets && t3_slow_intr_handler(adap) == 0)
2691		q->unhandled_irqs++;
2692
2693	spin_unlock(&q->lock);
2694	return IRQ_HANDLED;
2695}
2696
2697/*
2698 * A helper function that processes responses and issues GTS.
2699 */
2700static inline int process_responses_gts(struct adapter *adap,
2701					struct sge_rspq *rq)
2702{
2703	int work;
2704
2705	work = process_responses(adap, rspq_to_qset(rq), -1);
2706	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2707		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2708	return work;
2709}
2710
2711/*
2712 * The legacy INTx interrupt handler.  This needs to handle data events from
2713 * SGE response queues as well as error and other async events as they all use
2714 * the same interrupt pin.  We use one SGE response queue per port in this mode
2715 * and protect all response queues with queue 0's lock.
2716 */
2717static irqreturn_t t3_intr(int irq, void *cookie)
2718{
2719	int work_done, w0, w1;
2720	struct adapter *adap = cookie;
2721	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2722	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2723
2724	spin_lock(&q0->lock);
2725
2726	w0 = is_new_response(&q0->desc[q0->cidx], q0);
2727	w1 = adap->params.nports == 2 &&
2728	    is_new_response(&q1->desc[q1->cidx], q1);
2729
2730	if (likely(w0 | w1)) {
2731		t3_write_reg(adap, A_PL_CLI, 0);
2732		t3_read_reg(adap, A_PL_CLI);	/* flush */
2733
2734		if (likely(w0))
2735			process_responses_gts(adap, q0);
2736
2737		if (w1)
2738			process_responses_gts(adap, q1);
2739
2740		work_done = w0 | w1;
2741	} else
2742		work_done = t3_slow_intr_handler(adap);
2743
2744	spin_unlock(&q0->lock);
2745	return IRQ_RETVAL(work_done != 0);
2746}
2747
2748/*
2749 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2750 * Handles data events from SGE response queues as well as error and other
2751 * async events as they all use the same interrupt pin.  We use one SGE
2752 * response queue per port in this mode and protect all response queues with
2753 * queue 0's lock.
2754 */
2755static irqreturn_t t3b_intr(int irq, void *cookie)
2756{
2757	u32 map;
2758	struct adapter *adap = cookie;
2759	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2760
2761	t3_write_reg(adap, A_PL_CLI, 0);
2762	map = t3_read_reg(adap, A_SG_DATA_INTR);
2763
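	/*
	 * A_SG_DATA_INTR reports which response queues have new responses:
	 * bit 0 for queue set 0, bit 1 for queue set 1, and F_ERRINTR for
	 * slow-path/async events.
	 */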
2764	if (unlikely(!map))	/* shared interrupt, most likely */
2765		return IRQ_NONE;
2766
2767	spin_lock(&q0->lock);
2768
2769	if (unlikely(map & F_ERRINTR))
2770		t3_slow_intr_handler(adap);
2771
2772	if (likely(map & 1))
2773		process_responses_gts(adap, q0);
2774
2775	if (map & 2)
2776		process_responses_gts(adap, &adap->sge.qs[1].rspq);
2777
2778	spin_unlock(&q0->lock);
2779	return IRQ_HANDLED;
2780}
2781
2782/*
2783 * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards.
2784 * Handles data events from SGE response queues as well as error and other
2785 * async events as they all use the same interrupt pin.  We use one SGE
2786 * response queue per port in this mode and protect all response queues with
2787 * queue 0's lock.
2788 */
2789static irqreturn_t t3b_intr_napi(int irq, void *cookie)
2790{
2791	u32 map;
2792	struct adapter *adap = cookie;
2793	struct sge_qset *qs0 = &adap->sge.qs[0];
2794	struct sge_rspq *q0 = &qs0->rspq;
2795
2796	t3_write_reg(adap, A_PL_CLI, 0);
2797	map = t3_read_reg(adap, A_SG_DATA_INTR);
2798
2799	if (unlikely(!map))	/* shared interrupt, most likely */
2800		return IRQ_NONE;
2801
2802	spin_lock(&q0->lock);
2803
2804	if (unlikely(map & F_ERRINTR))
2805		t3_slow_intr_handler(adap);
2806
2807	if (likely(map & 1))
2808		napi_schedule(&qs0->napi);
2809
2810	if (map & 2)
2811		napi_schedule(&adap->sge.qs[1].napi);
2812
2813	spin_unlock(&q0->lock);
2814	return IRQ_HANDLED;
2815}
2816
2817/**
2818 *	t3_intr_handler - select the top-level interrupt handler
2819 *	@adap: the adapter
2820 *	@polling: whether using NAPI to service response queues
2821 *
2822 *	Selects the top-level interrupt handler based on the type of interrupts
2823 *	(MSI-X, MSI, or legacy) and whether NAPI will be used to service the
2824 *	response queues.
2825 */
2826irq_handler_t t3_intr_handler(struct adapter *adap, int polling)
2827{
2828	if (adap->flags & USING_MSIX)
2829		return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix;
2830	if (adap->flags & USING_MSI)
2831		return polling ? t3_intr_msi_napi : t3_intr_msi;
2832	if (adap->params.rev > 0)
2833		return polling ? t3b_intr_napi : t3b_intr;
2834	return t3_intr;
2835}
2836
2837#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
2838		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
2839		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
2840		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
2841		    F_HIRCQPARITYERROR)
2842#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
2843#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
2844		      F_RSPQDISABLED)
2845
2846/**
2847 *	t3_sge_err_intr_handler - SGE async event interrupt handler
2848 *	@adapter: the adapter
2849 *
2850 *	Interrupt handler for SGE asynchronous (non-data) events.
2851 */
2852void t3_sge_err_intr_handler(struct adapter *adapter)
2853{
2854	unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE) &
2855				 ~F_FLEMPTY;
2856
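	/*
	 * F_FLEMPTY (a free list ran dry) is masked off above; empty free
	 * lists are replenished by sge_timer_rx() and the normal refill path
	 * rather than being treated as SGE errors here.
	 */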
2857	if (status & SGE_PARERR)
2858		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
2859			 status & SGE_PARERR);
2860	if (status & SGE_FRAMINGERR)
2861		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
2862			 status & SGE_FRAMINGERR);
2863
2864	if (status & F_RSPQCREDITOVERFOW)
2865		CH_ALERT(adapter, "SGE response queue credit overflow\n");
2866
2867	if (status & F_RSPQDISABLED) {
2868		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
2869
2870		CH_ALERT(adapter,
2871			 "packet delivered to disabled response queue "
2872			 "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff);
2873	}
2874
2875	if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
2876		queue_work(cxgb3_wq, &adapter->db_drop_task);
2877
2878	if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL))
2879		queue_work(cxgb3_wq, &adapter->db_full_task);
2880
2881	if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY))
2882		queue_work(cxgb3_wq, &adapter->db_empty_task);
2883
2884	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
2885	if (status &  SGE_FATALERR)
2886		t3_fatal_err(adapter);
2887}
2888
2889/**
2890 *	sge_timer_tx - perform periodic maintenance of an SGE qset
2891 *	@t: a timer list containing the SGE queue set to maintain
2892 *
2893 *	Runs periodically from a timer to perform maintenance of an SGE queue
2894 *	set.  It performs the following task:
2895 *
2896 *	Cleans up any completed Tx descriptors that may still be pending.
2897 *	Normal descriptor cleanup happens when new packets are added to a Tx
2898 *	queue so this timer is relatively infrequent and does any cleanup only
2899 *	if the Tx queue has not seen any new packets in a while.  We make a
2900 *	best-effort attempt to reclaim descriptors, in that we don't wait
2901 *	around if we cannot get a queue's lock (which most likely is because
2902 *	someone else is queueing new packets and so will also handle the
2903 *	cleanup).  Since control queues use immediate data exclusively we
2904 *	don't bother cleaning them up here.
2905 *
2906 */
2907static void sge_timer_tx(struct timer_list *t)
2908{
2909	struct sge_qset *qs = from_timer(qs, t, tx_reclaim_timer);
2910	struct port_info *pi = netdev_priv(qs->netdev);
2911	struct adapter *adap = pi->adapter;
2912	unsigned int tbd[SGE_TXQ_PER_SET] = {0, 0};
2913	unsigned long next_period;
2914
2915	if (__netif_tx_trylock(qs->tx_q)) {
2916		tbd[TXQ_ETH] = reclaim_completed_tx(adap, &qs->txq[TXQ_ETH],
2917						    TX_RECLAIM_TIMER_CHUNK);
2918		__netif_tx_unlock(qs->tx_q);
2919	}
2920
2921	if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) {
2922		tbd[TXQ_OFLD] = reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD],
2923						     TX_RECLAIM_TIMER_CHUNK);
2924		spin_unlock(&qs->txq[TXQ_OFLD].lock);
2925	}
2926
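	/*
	 * Reschedule sooner the more work the timer found: the period is
	 * right-shifted by the number of full reclaim chunks processed.
	 */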
2927	next_period = TX_RECLAIM_PERIOD >>
2928		      (max(tbd[TXQ_ETH], tbd[TXQ_OFLD]) /
2929		       TX_RECLAIM_TIMER_CHUNK);
2930	mod_timer(&qs->tx_reclaim_timer, jiffies + next_period);
2931}
2932
2933/**
2934 *	sge_timer_rx - perform periodic maintenance of an SGE qset
2935 *	@t: the timer list containing the SGE queue set to maintain
2936 *
2937 *	a) Replenishes Rx queues that have run out due to memory shortage.
2938 *	Normally new Rx buffers are added when existing ones are consumed but
2939 *	when out of memory a queue can become empty.  We try to add only a few
2940 *	buffers here, the queue will be replenished fully as these new buffers
2941 *	are used up if memory shortage has subsided.
2942 *
2943 *	b) Return coalesced response queue credits in case a response queue is
2944 *	starved.
2945 *
2946 */
2947static void sge_timer_rx(struct timer_list *t)
2948{
2949	spinlock_t *lock;
2950	struct sge_qset *qs = from_timer(qs, t, rx_reclaim_timer);
2951	struct port_info *pi = netdev_priv(qs->netdev);
2952	struct adapter *adap = pi->adapter;
2953	u32 status;
2954
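	/*
	 * Rev 0 adapters share queue 0's response queue lock across all
	 * queue sets (cf. t3_intr()), so take that lock rather than this
	 * queue set's own.
	 */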
2955	lock = adap->params.rev > 0 ?
2956	       &qs->rspq.lock : &adap->sge.qs[0].rspq.lock;
2957
2958	if (!spin_trylock_irq(lock))
2959		goto out;
2960
2961	if (napi_is_scheduled(&qs->napi))
2962		goto unlock;
2963
2964	if (adap->params.rev < 4) {
2965		status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
2966
2967		if (status & (1 << qs->rspq.cntxt_id)) {
2968			qs->rspq.starved++;
2969			if (qs->rspq.credits) {
2970				qs->rspq.credits--;
2971				refill_rspq(adap, &qs->rspq, 1);
2972				qs->rspq.restarted++;
2973				t3_write_reg(adap, A_SG_RSPQ_FL_STATUS,
2974					     1 << qs->rspq.cntxt_id);
2975			}
2976		}
2977	}
2978
2979	if (qs->fl[0].credits < qs->fl[0].size)
2980		__refill_fl(adap, &qs->fl[0]);
2981	if (qs->fl[1].credits < qs->fl[1].size)
2982		__refill_fl(adap, &qs->fl[1]);
2983
2984unlock:
2985	spin_unlock_irq(lock);
2986out:
2987	mod_timer(&qs->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD);
2988}
2989
2990/**
2991 *	t3_update_qset_coalesce - update coalescing settings for a queue set
2992 *	@qs: the SGE queue set
2993 *	@p: new queue set parameters
2994 *
2995 *	Update the coalescing settings for an SGE queue set.  Nothing is done
2996 *	if the queue set is not initialized yet.
2997 */
2998void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
2999{
3000	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U); /* can't be 0 */
3001	qs->rspq.polling = p->polling;
3002	qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
3003}
3004
3005/**
3006 *	t3_sge_alloc_qset - initialize an SGE queue set
3007 *	@adapter: the adapter
3008 *	@id: the queue set id
3009 *	@nports: how many Ethernet ports will be using this queue set
3010 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
3011 *	@p: configuration parameters for this queue set
3012 *	@ntxq: number of Tx queues for the queue set
3013 *	@dev: net device associated with this queue set
3014 *	@netdevq: net device TX queue associated with this queue set
3015 *
3016 *	Allocate resources and initialize an SGE queue set.  A queue set
3017 *	comprises a response queue, two Rx free-buffer queues, and up to 3
3018 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
3019 *	queue, offload queue, and control queue.
3020 */
3021int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
3022		      int irq_vec_idx, const struct qset_params *p,
3023		      int ntxq, struct net_device *dev,
3024		      struct netdev_queue *netdevq)
3025{
3026	int i, avail, ret = -ENOMEM;
3027	struct sge_qset *q = &adapter->sge.qs[id];
3028
3029	init_qset_cntxt(q, id);
3030	timer_setup(&q->tx_reclaim_timer, sge_timer_tx, 0);
3031	timer_setup(&q->rx_reclaim_timer, sge_timer_rx, 0);
3032
3033	q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size,
3034				   sizeof(struct rx_desc),
3035				   sizeof(struct rx_sw_desc),
3036				   &q->fl[0].phys_addr, &q->fl[0].sdesc);
3037	if (!q->fl[0].desc)
3038		goto err;
3039
3040	q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size,
3041				   sizeof(struct rx_desc),
3042				   sizeof(struct rx_sw_desc),
3043				   &q->fl[1].phys_addr, &q->fl[1].sdesc);
3044	if (!q->fl[1].desc)
3045		goto err;
3046
3047	q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size,
3048				  sizeof(struct rsp_desc), 0,
3049				  &q->rspq.phys_addr, NULL);
3050	if (!q->rspq.desc)
3051		goto err;
3052
3053	for (i = 0; i < ntxq; ++i) {
3054		/*
3055		 * The control queue always uses immediate data so does not
3056		 * need to keep track of any sk_buffs.
3057		 */
3058		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
3059
3060		q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i],
3061					    sizeof(struct tx_desc), sz,
3062					    &q->txq[i].phys_addr,
3063					    &q->txq[i].sdesc);
3064		if (!q->txq[i].desc)
3065			goto err;
3066
3067		q->txq[i].gen = 1;
3068		q->txq[i].size = p->txq_size[i];
3069		spin_lock_init(&q->txq[i].lock);
3070		skb_queue_head_init(&q->txq[i].sendq);
3071	}
3072
3073	INIT_WORK(&q->txq[TXQ_OFLD].qresume_task, restart_offloadq);
3074	INIT_WORK(&q->txq[TXQ_CTRL].qresume_task, restart_ctrlq);
3075
3076	q->fl[0].gen = q->fl[1].gen = 1;
3077	q->fl[0].size = p->fl_size;
3078	q->fl[1].size = p->jumbo_size;
3079
3080	q->rspq.gen = 1;
3081	q->rspq.size = p->rspq_size;
3082	spin_lock_init(&q->rspq.lock);
3083	skb_queue_head_init(&q->rspq.rx_queue);
3084
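	/*
	 * Suspend the Ethernet Tx queue while it cannot hold one maximally
	 * fragmented packet per port: an SGL covering MAX_SKB_FRAGS + 1
	 * buffers plus the work request header flits.
	 */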
3085	q->txq[TXQ_ETH].stop_thres = nports *
3086	    flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3);
3087
3088#if FL0_PG_CHUNK_SIZE > 0
3089	q->fl[0].buf_size = FL0_PG_CHUNK_SIZE;
3090#else
3091	q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data);
3092#endif
3093#if FL1_PG_CHUNK_SIZE > 0
3094	q->fl[1].buf_size = FL1_PG_CHUNK_SIZE;
3095#else
3096	q->fl[1].buf_size = is_offload(adapter) ?
3097		(16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
3098		MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt);
3099#endif
3100
3101	q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0;
3102	q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0;
3103	q->fl[0].order = FL0_PG_ORDER;
3104	q->fl[1].order = FL1_PG_ORDER;
3105	q->fl[0].alloc_size = FL0_PG_ALLOC_SIZE;
3106	q->fl[1].alloc_size = FL1_PG_ALLOC_SIZE;
3107
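	/*
	 * Program the hardware contexts (response queue, free lists and Tx
	 * queues) under the SGE context register lock.
	 */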
3108	spin_lock_irq(&adapter->sge.reg_lock);
3109
3110	/* FL threshold comparison uses < */
3111	ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx,
3112				   q->rspq.phys_addr, q->rspq.size,
3113				   q->fl[0].buf_size - SGE_PG_RSVD, 1, 0);
3114	if (ret)
3115		goto err_unlock;
3116
3117	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
3118		ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0,
3119					  q->fl[i].phys_addr, q->fl[i].size,
3120					  q->fl[i].buf_size - SGE_PG_RSVD,
3121					  p->cong_thres, 1, 0);
3122		if (ret)
3123			goto err_unlock;
3124	}
3125
3126	ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
3127				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
3128				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
3129				 1, 0);
3130	if (ret)
3131		goto err_unlock;
3132
3133	if (ntxq > 1) {
3134		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id,
3135					 USE_GTS, SGE_CNTXT_OFLD, id,
3136					 q->txq[TXQ_OFLD].phys_addr,
3137					 q->txq[TXQ_OFLD].size, 0, 1, 0);
3138		if (ret)
3139			goto err_unlock;
3140	}
3141
3142	if (ntxq > 2) {
3143		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0,
3144					 SGE_CNTXT_CTRL, id,
3145					 q->txq[TXQ_CTRL].phys_addr,
3146					 q->txq[TXQ_CTRL].size,
3147					 q->txq[TXQ_CTRL].token, 1, 0);
3148		if (ret)
3149			goto err_unlock;
3150	}
3151
3152	spin_unlock_irq(&adapter->sge.reg_lock);
3153
3154	q->adap = adapter;
3155	q->netdev = dev;
3156	q->tx_q = netdevq;
3157	t3_update_qset_coalesce(q, p);
3158
3159	avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
3160			  GFP_KERNEL | __GFP_COMP);
3161	if (!avail) {
3162		CH_ALERT(adapter, "free list queue 0 initialization failed\n");
3163		ret = -ENOMEM;
3164		goto err;
3165	}
3166	if (avail < q->fl[0].size)
3167		CH_WARN(adapter, "free list queue 0 enabled with %d credits\n",
3168			avail);
3169
3170	avail = refill_fl(adapter, &q->fl[1], q->fl[1].size,
3171			  GFP_KERNEL | __GFP_COMP);
3172	if (avail < q->fl[1].size)
3173		CH_WARN(adapter, "free list queue 1 enabled with %d credits\n",
3174			avail);
3175	refill_rspq(adapter, &q->rspq, q->rspq.size - 1);
3176
3177	t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
3178		     V_NEWTIMER(q->rspq.holdoff_tmr));
3179
3180	return 0;
3181
3182err_unlock:
3183	spin_unlock_irq(&adapter->sge.reg_lock);
3184err:
3185	t3_free_qset(adapter, q);
3186	return ret;
3187}
3188
3189/**
3190 *	t3_start_sge_timers - start SGE timer callbacks
3191 *	@adap: the adapter
3192 *
3193 *	Starts each SGE queue set's timer callback.
3194 */
3195void t3_start_sge_timers(struct adapter *adap)
3196{
3197	int i;
3198
3199	for (i = 0; i < SGE_QSETS; ++i) {
3200		struct sge_qset *q = &adap->sge.qs[i];
3201
3202		if (q->tx_reclaim_timer.function)
3203			mod_timer(&q->tx_reclaim_timer,
3204				  jiffies + TX_RECLAIM_PERIOD);
3205
3206		if (q->rx_reclaim_timer.function)
3207			mod_timer(&q->rx_reclaim_timer,
3208				  jiffies + RX_RECLAIM_PERIOD);
3209	}
3210}
3211
3212/**
3213 *	t3_stop_sge_timers - stop SGE timer callbacks
3214 *	@adap: the adapter
3215 *
3216 *	Stops each SGE queue set's timer callback.
3217 */
3218void t3_stop_sge_timers(struct adapter *adap)
3219{
3220	int i;
3221
3222	for (i = 0; i < SGE_QSETS; ++i) {
3223		struct sge_qset *q = &adap->sge.qs[i];
3224
3225		if (q->tx_reclaim_timer.function)
3226			del_timer_sync(&q->tx_reclaim_timer);
3227		if (q->rx_reclaim_timer.function)
3228			del_timer_sync(&q->rx_reclaim_timer);
3229	}
3230}
3231
3232/**
3233 *	t3_free_sge_resources - free SGE resources
3234 *	@adap: the adapter
3235 *
3236 *	Frees resources used by the SGE queue sets.
3237 */
3238void t3_free_sge_resources(struct adapter *adap)
3239{
3240	int i;
3241
3242	for (i = 0; i < SGE_QSETS; ++i)
3243		t3_free_qset(adap, &adap->sge.qs[i]);
3244}
3245
3246/**
3247 *	t3_sge_start - enable SGE
3248 *	@adap: the adapter
3249 *
3250 *	Enables the SGE for DMAs.  This is the last step in starting packet
3251 *	transfers.
3252 */
3253void t3_sge_start(struct adapter *adap)
3254{
3255	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
3256}
3257
3258/**
3259 *	t3_sge_stop_dma - Disable SGE DMA engine operation
3260 *	@adap: the adapter
3261 *
3262 *	Can be invoked from interrupt context, e.g. from the error handler.
3263 *
3264 *	Note that this function cannot cancel the queue-restart work items as
3265 *	it cannot wait if called from interrupt context; however, the work
3266 *	items will have no effect since the doorbells are disabled.  The
3267 *	driver will call t3_sge_stop() later from process context, at which
3268 *	point the work items will be cancelled if they are still pending.
3269 */
3270void t3_sge_stop_dma(struct adapter *adap)
3271{
3272	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0);
3273}
3274
3275/**
3276 *	t3_sge_stop - disable SGE operation completely
3277 *	@adap: the adapter
3278 *
3279 *	Called from process context.  Disables the DMA engine and cancels any
3280 *	pending queue-restart work items.
3281 */
3282void t3_sge_stop(struct adapter *adap)
3283{
3284	int i;
3285
3286	t3_sge_stop_dma(adap);
3287
3288	/* workqueues aren't initialized otherwise */
3289	if (!(adap->flags & FULL_INIT_DONE))
3290		return;
3291	for (i = 0; i < SGE_QSETS; ++i) {
3292		struct sge_qset *qs = &adap->sge.qs[i];
3293
3294		cancel_work_sync(&qs->txq[TXQ_OFLD].qresume_task);
3295		cancel_work_sync(&qs->txq[TXQ_CTRL].qresume_task);
3296	}
3297}
3298
3299/**
3300 *	t3_sge_init - initialize SGE
3301 *	@adap: the adapter
3302 *	@p: the SGE parameters
3303 *
3304 *	Performs SGE initialization needed every time after a chip reset.
3305 *	We do not initialize any of the queue sets here; instead the driver
3306 *	top-level must request those individually.  We also do not enable DMA
3307 *	here, that should be done after the queues have been set up.
3308 */
3309void t3_sge_init(struct adapter *adap, struct sge_params *p)
3310{
3311	unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12);
3312
3313	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
3314	    F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
3315	    V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
3316	    V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
3317#if SGE_NUM_GENBITS == 1
3318	ctrl |= F_EGRGENCTRL;
3319#endif
3320	if (adap->params.rev > 0) {
3321		if (!(adap->flags & (USING_MSIX | USING_MSI)))
3322			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
3323	}
3324	t3_write_reg(adap, A_SG_CONTROL, ctrl);
3325	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
3326		     V_LORCQDRBTHRSH(512));
3327	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
3328	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
3329		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
3330	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
3331		     adap->params.rev < T3_REV_C ? 1000 : 500);
3332	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
3333	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
3334	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
3335	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
3336	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
3337}
3338
3339/**
3340 *	t3_sge_prep - one-time SGE initialization
3341 *	@adap: the associated adapter
3342 *	@p: SGE parameters
3343 *
3344 *	Performs one-time initialization of SGE SW state.  Includes determining
3345 *	defaults for the assorted SGE parameters, which admins can change until
3346 *	they are used to initialize the SGE.
3347 */
3348void t3_sge_prep(struct adapter *adap, struct sge_params *p)
3349{
3350	int i;
3351
3352	p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) -
3353	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
3354
3355	for (i = 0; i < SGE_QSETS; ++i) {
3356		struct qset_params *q = p->qset + i;
3357
3358		q->polling = adap->params.rev > 0;
3359		q->coalesce_usecs = 5;
3360		q->rspq_size = 1024;
3361		q->fl_size = 1024;
3362		q->jumbo_size = 512;
3363		q->txq_size[TXQ_ETH] = 1024;
3364		q->txq_size[TXQ_OFLD] = 1024;
3365		q->txq_size[TXQ_CTRL] = 256;
3366		q->cong_thres = 0;
3367	}
3368
3369	spin_lock_init(&adap->sge.reg_lock);
3370}
3371