/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

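/*
 * The TX FIFO is a bump allocator layered on top of the queue page list
 * (QPL) backing this ring: the QPL pages are exposed through one kernel
 * virtual mapping (qpl->kva), and packet bytes are copied into that region
 * before descriptors referencing FIFO offsets are handed to the device.
 */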
static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}
}

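/*
 * Allocates everything a single TX ring needs: the FIFO, the per-descriptor
 * buffer state array, the buf_ring used by the transmit path, the stat
 * counters, the queue resources region shared with the device, and the
 * descriptor ring itself. On failure the partially built ring is released
 * through gve_tx_free_ring(), which tolerates members that were never
 * allocated.
 */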
static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	com->qpl = &priv->qpls[i];
	if (com->qpl == NULL) {
		device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
		return (ENOMEM);
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d\n", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d\n", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
	    M_GVE, M_WAITOK | M_ZERO);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);

free_rings:
	while (i--)
		gve_tx_free_ring(priv, i);
	free(priv->tx, M_GVE);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_tx_free_ring(priv, i);

	free(priv->tx, M_GVE);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

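/*
 * Each ring gets two taskqueues: a cleanup taskqueue driven from the
 * interrupt filter to process completions, and an xmit taskqueue used when
 * the ring lock is contended or the buf_ring backs up.
 */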
static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

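/*
 * Tells the device about the rings over the admin queue and reads back the
 * per-ring doorbell and event-counter indices it assigned. The indices are
 * stored big-endian by the device, hence the be32toh() conversions; the
 * multiplication by 4 turns a 32-bit register index into a byte offset into
 * the doorbell BAR.
 */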
int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_clear_tx_ring(priv, i);

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

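/*
 * Interrupt filter for a TX ring: it runs in interrupt context, so it only
 * masks the ring's interrupt and defers the actual completion processing to
 * the cleanup taskqueue.
 */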
int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

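/*
 * The device reports TX completions by advancing a big-endian event counter
 * in the counter array; the number of newly completed descriptors is the
 * difference between this value and tx->done.
 */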
static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

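/*
 * Completion processing: walk the descriptors the NIC has reported done,
 * free their mbufs, return their FIFO bytes, re-arm the interrupt, and then
 * re-read the event counter in case more completions raced with the re-arm.
 */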
void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		info->mbuf = NULL;
		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	mb();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}
}

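/*
 * The FIFO region just written spans one or more QPL pages, each with its
 * own DMA map; sync every page touched by [iov_offset, iov_offset + iov_len).
 */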
static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
			uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

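/*
 * The checksum and L4 header offsets in the packet descriptor appear to be
 * expressed in 2-byte units, which is why the byte offsets are shifted right
 * by one before being written.
 */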
static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

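/*
 * tx->req and tx->done are free-running counters; their difference is the
 * number of in-flight descriptors, so the space left in the ring is the ring
 * size (mask + 1) minus that difference.
 */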
static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

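/*
 * Allocates "bytes" from the FIFO, wrapping at the end of the backing region
 * if needed and padding the new head up to a cacheline boundary. Returns the
 * number of fragments used (1, or 2 when the allocation wraps).
 *
 * Illustrative example (hypothetical numbers): with size = 16384,
 * head = 16000 and bytes = 1000, iov[0] covers offsets [16000, 16384) for
 * 384 bytes, iov[1] covers [0, 616), and the new head becomes 616 rounded up
 * to the next cacheline boundary.
 */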
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of FIFO */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

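/*
 * Transmit path: a packet consumes one packet descriptor, an optional
 * metadata descriptor carrying the flow hash, and one segment descriptor per
 * payload FIFO fragment. The first segment (the headers, or at most the
 * 182-byte minimum) rides in the packet descriptor; the remaining payload is
 * described by the segment descriptors.
 */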
/* The only error this returns is ENOBUFS when the TX FIFO is short of space. */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(ntohs(eh->ether_type) != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, carries the
	 * spec-stipulated minimum of 182 bytes (or the whole packet, if it is
	 * shorter than that).
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

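/*
 * Drains the buf_ring while the interface is running; the caller must hold
 * the ring lock. On a FIFO-full ENOBUFS the mbuf is put back and the xmit
 * taskqueue is scheduled to retry.
 */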
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {

		if (__predict_false(gve_xmit(tx, mbuf) != 0)) {
			drbr_putback(ifp, tx->br, mbuf);
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);
		gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

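/*
 * if_transmit entry point: picks a ring by the mbuf's flow ID (or by the
 * current CPU when there is no hash), rejects VLAN-tagged frames, and either
 * transmits inline when the ring is idle and uncontended or defers to the
 * ring's xmit taskqueue.
 */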
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else {
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}