// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <linux/skbuff.h>
#include <net/xdp_sock_drv.h>

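/* Ring a TX doorbell by writing @val (the ring's producer index) to the
 * queue's doorbell register in BAR2. The doorbell index and value are both
 * big-endian, as required by the device.
 */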
static inline void gve_tx_put_doorbell(struct gve_priv *priv,
				       struct gve_queue_resources *q_resources,
				       u32 val)
{
	iowrite32be(val, &priv->db_bar2[be32_to_cpu(q_resources->db_index)]);
}

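/* Ring the doorbell of the TX queue backing XDP queue @xdp_qid so that any
 * descriptors posted by the XDP path are handed to the NIC.
 */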
void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid)
{
	u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid);
	struct gve_tx_ring *tx = &priv->tx[tx_qid];

	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
}

/* gvnic can only transmit from a Registered Segment.
 * We copy skb payloads into the registered segment before writing Tx
 * descriptors and ringing the Tx doorbell.
 *
 * gve_tx_fifo_* manages the Registered Segment as a FIFO - clients must
 * free allocations in the order they were allocated.
 */

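/* Map the FIFO's queue page list into one contiguous kernel virtual mapping
 * and reset the head and available-byte accounting.
 */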
static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	fifo->base = vmap(fifo->qpl->pages, fifo->qpl->num_entries, VM_MAP,
			  PAGE_KERNEL);
	if (unlikely(!fifo->base)) {
		netif_err(priv, drv, priv->dev, "Failed to vmap fifo, qpl_id = %d\n",
			  fifo->qpl->id);
		return -ENOMEM;
	}

	fifo->size = fifo->qpl->num_entries * PAGE_SIZE;
	atomic_set(&fifo->available, fifo->size);
	fifo->head = 0;
	return 0;
}

static void gve_tx_fifo_release(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	WARN(atomic_read(&fifo->available) != fifo->size,
	     "Releasing non-empty fifo");

	vunmap(fifo->base);
}

static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo,
					  size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_read(&fifo->available) <= bytes) ? false : true;
}

/* gve_tx_alloc_fifo - Allocate fragment(s) from Tx FIFO
 * @fifo: FIFO to allocate from
 * @bytes: Allocation size
 * @iov: Scatter-gather elements to fill with allocation fragment base/len
 *
 * Returns number of valid elements in iov[] or negative on error.
 *
 * Allocations from a given FIFO must be externally synchronized but concurrent
 * allocation and frees are allowed.
 */
static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
			     struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	u32 aligned_head;
	int nfrags = 0;

	if (!bytes)
		return 0;

	/* This check happens before we know how much padding is needed to
	 * align the payload to a cacheline boundary, but that is fine:
	 * the FIFO head always starts aligned and the FIFO's boundaries are
	 * aligned, so if there is space for the data, there is also space
	 * for the padding up to the next alignment.
	 */
	WARN(!gve_tx_fifo_can_alloc(fifo, bytes),
	     "Reached %s when there's not enough space in the fifo", __func__);

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/* If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = L1_CACHE_ALIGN(fifo->head);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_sub(bytes + padding, &fifo->available);
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return nfrags;
}

/* gve_tx_free_fifo - Return space to Tx FIFO
 * @fifo: FIFO to return fragments to
 * @bytes: Bytes to free
 */
static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add(bytes, &fifo->available);
}

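/* Reset the iovec bookkeeping of a completed buffer and return the number of
 * FIFO bytes it had consumed (fragment lengths plus alignment padding).
 */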
static size_t gve_tx_clear_buffer_state(struct gve_tx_buffer_state *info)
{
	size_t space_freed = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
		space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
		info->iov[i].iov_len = 0;
		info->iov[i].iov_padding = 0;
	}
	return space_freed;
}

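/* Reclaim up to @to_do completed XDP TX descriptors: return any xdp_frames,
 * count XSK completions, release FIFO space and update the ring's stats.
 * Returns the number of packets cleaned.
 */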
static int gve_clean_xdp_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			      u32 to_do)
{
	struct gve_tx_buffer_state *info;
	u32 clean_end = tx->done + to_do;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	u32 xsk_complete = 0;
	u32 idx;

	for (; tx->done < clean_end; tx->done++) {
		idx = tx->done & tx->mask;
		info = &tx->info[idx];

		if (unlikely(!info->xdp.size))
			continue;

		bytes += info->xdp.size;
		pkts++;
		xsk_complete += info->xdp.is_xsk;

		info->xdp.size = 0;
		if (info->xdp_frame) {
			xdp_return_frame(info->xdp_frame);
			info->xdp_frame = NULL;
		}
		space_freed += gve_tx_clear_buffer_state(info);
	}

	gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	if (xsk_complete > 0 && tx->xsk_pool)
		xsk_tx_completed(tx->xsk_pool, xsk_complete);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	return pkts;
}

static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake);

void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	if (!gve_tx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);
	netdev_tx_reset_queue(tx->netdev_txq);
	gve_tx_remove_from_block(priv, idx);
}

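/* Free everything gve_tx_alloc_ring_gqi() set up for one ring: the queue
 * resources, the FIFO mapping and its QPL assignment (QPL mode only), the
 * descriptor ring and the buffer-state metadata.
 */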
static void gve_tx_free_ring_gqi(struct gve_priv *priv, struct gve_tx_ring *tx,
				 struct gve_tx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	int idx = tx->q_num;
	size_t bytes;
	u32 slots;

	slots = tx->mask + 1;
	dma_free_coherent(hdev, sizeof(*tx->q_resources),
			  tx->q_resources, tx->q_resources_bus);
	tx->q_resources = NULL;

	if (!tx->raw_addressing) {
		gve_tx_fifo_release(priv, &tx->tx_fifo);
		gve_unassign_qpl(cfg->qpl_cfg, tx->tx_fifo.qpl->id);
		tx->tx_fifo.qpl = NULL;
	}

	bytes = sizeof(*tx->desc) * slots;
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;

	vfree(tx->info);
	tx->info = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

void gve_tx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	gve_tx_add_to_block(priv, idx);

	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

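/* Allocate one GQI TX ring: buffer-state metadata, the DMA descriptor ring,
 * a QPL-backed FIFO when raw addressing is not in use, and the queue
 * resources shared with the device. Unwinds everything on failure.
 */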
static int gve_tx_alloc_ring_gqi(struct gve_priv *priv,
				 struct gve_tx_alloc_rings_cfg *cfg,
				 struct gve_tx_ring *tx,
				 int idx)
{
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;

	/* Make sure everything is zeroed to start */
	memset(tx, 0, sizeof(*tx));
	spin_lock_init(&tx->clean_lock);
	spin_lock_init(&tx->xdp_lock);
	tx->q_num = idx;

	tx->mask = cfg->ring_size - 1;

	/* alloc metadata */
	tx->info = vcalloc(cfg->ring_size, sizeof(*tx->info));
	if (!tx->info)
		return -ENOMEM;

	/* alloc tx queue */
	bytes = sizeof(*tx->desc) * cfg->ring_size;
	tx->desc = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->desc)
		goto abort_with_info;

	tx->raw_addressing = cfg->raw_addressing;
	tx->dev = hdev;
	if (!tx->raw_addressing) {
		tx->tx_fifo.qpl = gve_assign_tx_qpl(cfg, idx);
		if (!tx->tx_fifo.qpl)
			goto abort_with_desc;
		/* map Tx FIFO */
		if (gve_tx_fifo_init(priv, &tx->tx_fifo))
			goto abort_with_qpl;
	}

	tx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*tx->q_resources),
				   &tx->q_resources_bus,
				   GFP_KERNEL);
	if (!tx->q_resources)
		goto abort_with_fifo;

	return 0;

abort_with_fifo:
	if (!tx->raw_addressing)
		gve_tx_fifo_release(priv, &tx->tx_fifo);
abort_with_qpl:
	if (!tx->raw_addressing)
		gve_unassign_qpl(cfg->qpl_cfg, tx->tx_fifo.qpl->id);
abort_with_desc:
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;
abort_with_info:
	vfree(tx->info);
	tx->info = NULL;
	return -ENOMEM;
}

int gve_tx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int err = 0;
	int i, j;

	if (!cfg->raw_addressing && !cfg->qpls) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc QPL ring before allocing QPLs\n");
		return -EINVAL;
	}

	if (cfg->start_idx + cfg->num_rings > cfg->qcfg->max_queues) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc more than the max num of Tx rings\n");
		return -EINVAL;
	}

	if (cfg->start_idx == 0) {
		tx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_tx_ring),
			      GFP_KERNEL);
		if (!tx)
			return -ENOMEM;
	} else if (!tx) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc tx rings from a nonzero start idx without tx array\n");
		return -EINVAL;
	}

	for (i = cfg->start_idx; i < cfg->start_idx + cfg->num_rings; i++) {
		err = gve_tx_alloc_ring_gqi(priv, cfg, &tx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->tx = tx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_tx_free_ring_gqi(priv, &tx[j], cfg);
	if (cfg->start_idx == 0)
		kvfree(tx);
	return err;
}

void gve_tx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int i;

	if (!tx)
		return;

	for (i = cfg->start_idx; i < cfg->start_idx + cfg->num_rings; i++)
		gve_tx_free_ring_gqi(priv, &tx[i], cfg);

	if (cfg->start_idx == 0) {
		kvfree(tx);
		cfg->tx = NULL;
	}
}

/* gve_tx_avail - Calculates the number of slots available in the ring
 * @tx: tx ring to check
 *
 * Returns the number of slots available
 *
 * The capacity of the queue is mask + 1. We don't need to reserve an entry.
 **/
static inline u32 gve_tx_avail(struct gve_tx_ring *tx)
{
	return tx->mask + 1 - (tx->req - tx->done);
}

static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
					      struct sk_buff *skb)
{
	int pad_bytes, align_hdr_pad;
	int bytes;
	int hlen;

	hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) + tcp_hdrlen(skb) :
				 min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo,
						   hlen);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = L1_CACHE_ALIGN(hlen) - hlen;
	bytes = align_hdr_pad + pad_bytes + skb->len;

	return bytes;
}

/* The most descriptors we could need is MAX_SKB_FRAGS + 4 :
 * 1 for each skb frag
 * 1 for the skb linear portion
 * 1 for when tcp hdr needs to be in separate descriptor
 * 1 if the payload wraps to the beginning of the FIFO
 * 1 for metadata descriptor
 */
#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 4)
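/* Unmap a raw-addressing buffer. The head of an skb was mapped with
 * dma_map_single() while frags were mapped as pages, so pick the matching
 * unmap call based on whether this buffer state holds the skb.
 */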
static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
{
	if (info->skb) {
		dma_unmap_single(dev, dma_unmap_addr(info, dma),
				 dma_unmap_len(info, len),
				 DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	} else {
		dma_unmap_page(dev, dma_unmap_addr(info, dma),
			       dma_unmap_len(info, len),
			       DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	}
}

/* Check if sufficient resources (descriptor ring space, FIFO space) are
 * available to transmit the given number of bytes.
 */
static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	bool can_alloc = true;

	if (!tx->raw_addressing)
		can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required);

	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);
}

static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED);

/* Stops the queue if the skb cannot be transmitted. */
static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
			     struct sk_buff *skb)
{
	int bytes_required = 0;
	u32 nic_done;
	u32 to_do;
	int ret;

	if (!tx->raw_addressing)
		bytes_required = gve_skb_fifo_bytes_required(tx, skb);

	if (likely(gve_can_tx(tx, bytes_required)))
		return 0;

	ret = -EBUSY;
	spin_lock(&tx->clean_lock);
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = nic_done - tx->done;

	/* Only try to clean if there is hope for TX */
	if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) {
		if (to_do > 0) {
			to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT);
			gve_clean_tx_done(priv, tx, to_do, false);
		}
		if (likely(gve_can_tx(tx, bytes_required)))
			ret = 0;
	}
	if (ret) {
		/* No space, so stop the queue */
		tx->stop_queue++;
		netif_tx_stop_queue(tx->netdev_txq);
	}
	spin_unlock(&tx->clean_lock);

	return ret;
}

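/* Fill the first (packet) descriptor: checksum/TSO flags, total descriptor
 * count, total packet length, and the length/address of the first segment.
 */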
static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
				 u16 csum_offset, u8 ip_summed, bool is_gso,
				 int l4_hdr_offset, u32 desc_cnt,
				 u16 hlen, u64 addr, u16 pkt_len)
{
	/* l4_hdr_offset and csum_offset are in units of 16-bit words */
	if (is_gso) {
		pkt_desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (likely(ip_summed == CHECKSUM_PARTIAL)) {
		pkt_desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->pkt.type_flags = GVE_TXD_STD;
		pkt_desc->pkt.l4_csum_offset = 0;
		pkt_desc->pkt.l4_hdr_offset = 0;
	}
	pkt_desc->pkt.desc_cnt = desc_cnt;
	pkt_desc->pkt.len = cpu_to_be16(pkt_len);
	pkt_desc->pkt.seg_len = cpu_to_be16(hlen);
	pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
}

static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc,
				 struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt));

	mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT |
				   GVE_MTD_PATH_HASH_L4;
	mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash);
	mtd_desc->mtd.reserved0 = 0;
	mtd_desc->mtd.reserved1 = 0;
}

static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
				 u16 l3_offset, u16 gso_size,
				 bool is_gso_v6, bool is_gso,
				 u16 len, u64 addr)
{
	seg_desc->seg.type_flags = GVE_TXD_SEG;
	if (is_gso) {
		if (is_gso_v6)
			seg_desc->seg.type_flags |= GVE_TXSF_IPV6;
		seg_desc->seg.l3_offset = l3_offset >> 1;
		seg_desc->seg.mss = cpu_to_be16(gso_size);
	}
	seg_desc->seg.seg_len = cpu_to_be16(len);
	seg_desc->seg.seg_addr = cpu_to_be64(addr);
}

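/* Sync every QPL page that a FIFO fragment spans to the device so the NIC
 * sees the bytes that were just copied in.
 */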
static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
				    u64 iov_offset, u64 iov_len)
{
	u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	u64 first_page = iov_offset / PAGE_SIZE;
	u64 page;

	for (page = first_page; page <= last_page; page++)
		dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE);
}

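/* QPL (copy) transmit path: reserve FIFO space for the header and payload,
 * copy the skb into the registered segment, and write the packet, optional
 * metadata, and segment descriptors. Returns the number of descriptors used.
 */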
static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)
{
	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	int payload_iov = 2;
	int copy_offset;
	u32 next_idx;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want the tcp header alone in the first
	 * segment; otherwise we want the minimum required by the gVNIC spec.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
			min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	info->skb = skb;
	/* We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, hlen + pad_bytes,
				       &info->iov[0]);
	WARN(!hdr_nfrags, "hdr_nfrags should never be 0!");
	payload_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, skb->len - hlen,
					   &info->iov[payload_iov]);

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     1 + mtd_desc_nr + payload_nfrags, hlen,
			     info->iov[hdr_nfrags - 1].iov_offset, skb->len);

	skb_copy_bits(skb, 0,
		      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
		      hlen);
	gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
				info->iov[hdr_nfrags - 1].iov_offset,
				info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = hlen;

	if (mtd_desc_nr) {
		next_idx = (tx->req + 1) & tx->mask;
		gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc[next_idx];

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso,
				     info->iov[i].iov_len,
				     info->iov[i].iov_offset);

		skb_copy_bits(skb, copy_offset,
			      tx->tx_fifo.base + info->iov[i].iov_offset,
			      info->iov[i].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
					info->iov[i].iov_offset,
					info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	return 1 + mtd_desc_nr + payload_nfrags;
}

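/* Raw-addressing (zero-copy) transmit path: DMA-map the skb head and frags
 * and describe them directly to the NIC. Returns the number of descriptors
 * used, or 0 if the skb was dropped because a DMA mapping failed.
 */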
static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
				  struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int hlen, num_descriptors, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	u64 addr;
	u32 len;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want only up to the tcp header in the
	 * first segment so it can be replicated efficiently on each segment;
	 * otherwise we want the whole linear portion of the skb (which will
	 * contain the checksum because skb->csum_start and skb->csum_offset
	 * are given relative to skb->head) in the first segment.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb);
	len = skb_headlen(skb);

	info->skb = skb;

	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
	if (unlikely(dma_mapping_error(tx->dev, addr))) {
		tx->dma_mapping_error++;
		goto drop;
	}
	dma_unmap_len_set(info, len, len);
	dma_unmap_addr_set(info, dma, addr);

	num_descriptors = 1 + shinfo->nr_frags;
	if (hlen < len)
		num_descriptors++;
	if (mtd_desc_nr)
		num_descriptors++;

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     num_descriptors, hlen, addr, skb->len);

	if (mtd_desc_nr) {
		idx = (idx + 1) & tx->mask;
		mtd_desc = &tx->desc[idx];
		gve_tx_fill_mtd_desc(mtd_desc, skb);
	}

	if (hlen < len) {
		/* For gso the rest of the linear portion of the skb needs to
		 * be in its own descriptor.
		 */
		len -= hlen;
		addr += hlen;
		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		const skb_frag_t *frag = &shinfo->frags[i];

		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		len = skb_frag_size(frag);
		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr))) {
			tx->dma_mapping_error++;
			goto unmap_drop;
		}
		tx->info[idx].skb = NULL;
		dma_unmap_len_set(&tx->info[idx], len, len);
		dma_unmap_addr_set(&tx->info[idx], dma, addr);

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	return num_descriptors;

unmap_drop:
	i += num_descriptors - shinfo->nr_frags;
	while (i--) {
		/* Skip metadata descriptor, if set */
		if (i == 1 && mtd_desc_nr == 1)
			continue;
		idx--;
		gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
	}
drop:
	tx->dropped_pkt++;
	return 0;
}

netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int nsegs;

	WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues,
	     "skb queue index out of range");
	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */

		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		return NETDEV_TX_BUSY;
	}
	if (tx->raw_addressing)
		nsegs = gve_tx_add_skb_no_copy(priv, tx, skb);
	else
		nsegs = gve_tx_add_skb_copy(priv, tx, skb);

	/* If the packet is getting sent, update the BQL accounting, timestamp
	 * the skb and advance the producer index.
	 */
	if (nsegs) {
		netdev_tx_sent_queue(tx->netdev_txq, skb->len);
		skb_tx_timestamp(skb);
		tx->req += nsegs;
	} else {
		dev_kfree_skb_any(skb);
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	/* Give packets to NIC. Even if this packet failed to send the doorbell
	 * might need to be rung because of xmit_more.
	 */
	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
	return NETDEV_TX_OK;
}

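/* Copy an XDP frame (or XSK payload) into the FIFO and write the packet and
 * segment descriptors for it. If the space left at the tail of the FIFO is
 * too small for a minimal first fragment it is consumed as padding so the
 * frame starts at the head of the FIFO. Returns the number of descriptors
 * written.
 */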
static int gve_tx_fill_xdp(struct gve_priv *priv, struct gve_tx_ring *tx,
			   void *data, int len, void *frame_p, bool is_xsk)
{
	int pad, nfrags, ndescs, iovi, offset;
	struct gve_tx_buffer_state *info;
	u32 reqi = tx->req;

	pad = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, len);
	if (pad >= GVE_GQ_TX_MIN_PKT_DESC_BYTES)
		pad = 0;
	info = &tx->info[reqi & tx->mask];
	info->xdp_frame = frame_p;
	info->xdp.size = len;
	info->xdp.is_xsk = is_xsk;

	nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, pad + len,
				   &info->iov[0]);
	iovi = pad > 0;
	ndescs = nfrags - iovi;
	offset = 0;

	while (iovi < nfrags) {
		if (!offset)
			gve_tx_fill_pkt_desc(&tx->desc[reqi & tx->mask], 0,
					     CHECKSUM_NONE, false, 0, ndescs,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset, len);
		else
			gve_tx_fill_seg_desc(&tx->desc[reqi & tx->mask],
					     0, 0, false, false,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset);

		memcpy(tx->tx_fifo.base + info->iov[iovi].iov_offset,
		       data + offset, info->iov[iovi].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev,
					tx->tx_fifo.qpl->page_buses,
					info->iov[iovi].iov_offset,
					info->iov[iovi].iov_len);
		offset += info->iov[iovi].iov_len;
		iovi++;
		reqi++;
	}

	return ndescs;
}

int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
		 u32 flags)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int i, err = 0, qid;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	qid = gve_xdp_tx_queue_id(priv,
				  smp_processor_id() % priv->num_xdp_queues);

	tx = &priv->tx[qid];

	spin_lock(&tx->xdp_lock);
	for (i = 0; i < n; i++) {
		err = gve_xdp_xmit_one(priv, tx, frames[i]->data,
				       frames[i]->len, frames[i]);
		if (err)
			break;
	}

	if (flags & XDP_XMIT_FLUSH)
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);

	spin_unlock(&tx->xdp_lock);

	u64_stats_update_begin(&tx->statss);
	tx->xdp_xmit += n;
	tx->xdp_xmit_errors += n - i;
	u64_stats_update_end(&tx->statss);

	return i ? i : err;
}

int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx,
		     void *data, int len, void *frame_p)
{
	int nsegs;

	if (!gve_can_tx(tx, len + GVE_GQ_TX_MIN_PKT_DESC_BYTES - 1))
		return -EBUSY;

	nsegs = gve_tx_fill_xdp(priv, tx, data, len, frame_p, false);
	tx->req += nsegs;

	return 0;
}

#define GVE_TX_START_THRESH	4096

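/* Reclaim up to @to_do completed TX descriptors: unmap raw-addressing
 * buffers, free skbs and FIFO space, update stats and BQL, and optionally
 * restart the queue if it was stopped for lack of space. Returns the number
 * of packets cleaned.
 */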
static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake)
{
	struct gve_tx_buffer_state *info;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	struct sk_buff *skb;
	u32 idx;
	int j;

	for (j = 0; j < to_do; j++) {
		idx = tx->done & tx->mask;
		netif_info(priv, tx_done, priv->dev,
			   "[%d] %s: idx=%d (req=%u done=%u)\n",
			   tx->q_num, __func__, idx, tx->req, tx->done);
		info = &tx->info[idx];
		skb = info->skb;

		/* Unmap the buffer */
		if (tx->raw_addressing)
			gve_tx_unmap_buf(tx->dev, info);
		tx->done++;
		/* Mark as free */
		if (skb) {
			info->skb = NULL;
			bytes += skb->len;
			pkts++;
			dev_consume_skb_any(skb);
			if (tx->raw_addressing)
				continue;
			space_freed += gve_tx_clear_buffer_state(info);
		}
	}

	if (!tx->raw_addressing)
		gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	netdev_tx_completed_queue(tx->netdev_txq, pkts, bytes);

	/* start the queue if we've stopped it */
#ifndef CONFIG_BQL
	/* Make sure that the doorbells are synced */
	smp_mb();
#endif
	if (try_to_wake && netif_tx_queue_stopped(tx->netdev_txq) &&
	    likely(gve_can_tx(tx, GVE_TX_START_THRESH))) {
		tx->wake_queue++;
		netif_tx_wake_queue(tx->netdev_txq);
	}

	return pkts;
}

u32 gve_tx_load_event_counter(struct gve_priv *priv,
			      struct gve_tx_ring *tx)
{
	u32 counter_index = be32_to_cpu(tx->q_resources->counter_index);
	__be32 counter = READ_ONCE(priv->counter_array[counter_index]);

	return be32_to_cpu(counter);
}

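/* Transmit up to @budget descriptors from the XSK TX ring bound to this
 * queue, copying each payload into the FIFO, and ring the doorbell once for
 * the whole batch. Returns the number of descriptors sent.
 */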
static int gve_xsk_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
		      int budget)
{
	struct xdp_desc desc;
	int sent = 0, nsegs;
	void *data;

	spin_lock(&tx->xdp_lock);
	while (sent < budget) {
		if (!gve_can_tx(tx, GVE_TX_START_THRESH))
			goto out;

		if (!xsk_tx_peek_desc(tx->xsk_pool, &desc)) {
			tx->xdp_xsk_done = tx->xdp_xsk_wakeup;
			goto out;
		}

		data = xsk_buff_raw_get_data(tx->xsk_pool, desc.addr);
		nsegs = gve_tx_fill_xdp(priv, tx, data, desc.len, NULL, true);
		tx->req += nsegs;
		sent++;
	}
out:
	if (sent > 0) {
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		xsk_tx_release(tx->xsk_pool);
	}
	spin_unlock(&tx->xdp_lock);
	return sent;
}

bool gve_xdp_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	bool repoll;
	u32 to_do;

	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_xdp_done(priv, tx, to_do);
	repoll = nic_done != tx->done;

	if (tx->xsk_pool) {
		int sent = gve_xsk_tx(priv, tx, budget);

		u64_stats_update_begin(&tx->statss);
		tx->xdp_xsk_sent += sent;
		u64_stats_update_end(&tx->statss);
		repoll |= (sent == budget);
		if (xsk_uses_need_wakeup(tx->xsk_pool))
			xsk_set_tx_need_wakeup(tx->xsk_pool);
	}

	/* If we still have work we want to repoll */
	return repoll;
}

bool gve_tx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	u32 to_do;

	/* If budget is 0, do all the work */
	if (budget == 0)
		budget = INT_MAX;

	/* The TX path may also try to clean completed packets in order to
	 * xmit. Use spin_lock() to avoid a cleaning conflict; it yields
	 * better concurrency between xmit and clean than the netif lock.
	 */
	spin_lock(&tx->clean_lock);
	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_tx_done(priv, tx, to_do, true);
	spin_unlock(&tx->clean_lock);
	/* If we still have work we want to repoll */
	return nic_done != tx->done;
}

bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	u32 nic_done = gve_tx_load_event_counter(priv, tx);

	return nic_done != tx->done;
}