/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2015-2023 Amazon.com, Inc. or its affiliates.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
30#include <sys/cdefs.h>
31#include "opt_rss.h"
32#include "ena.h"
33#include "ena_datapath.h"
34#ifdef DEV_NETMAP
35#include "ena_netmap.h"
36#endif /* DEV_NETMAP */
37#ifdef RSS
38#include <net/rss_config.h>
39#endif /* RSS */
40
41#include <netinet6/ip6_var.h>
42
/*********************************************************************
 *  Static function prototypes
 *********************************************************************/
46
47static int ena_tx_cleanup(struct ena_ring *);
48static int ena_rx_cleanup(struct ena_ring *);
49static inline int ena_get_tx_req_id(struct ena_ring *tx_ring,
50    struct ena_com_io_cq *io_cq, uint16_t *req_id);
51static void ena_rx_hash_mbuf(struct ena_ring *, struct ena_com_rx_ctx *,
52    struct mbuf *);
53static struct mbuf *ena_rx_mbuf(struct ena_ring *, struct ena_com_rx_buf_info *,
54    struct ena_com_rx_ctx *, uint16_t *);
55static inline void ena_rx_checksum(struct ena_ring *, struct ena_com_rx_ctx *,
56    struct mbuf *);
57static void ena_tx_csum(struct ena_com_tx_ctx *, struct mbuf *, bool);
58static int ena_check_and_collapse_mbuf(struct ena_ring *tx_ring,
59    struct mbuf **mbuf);
60static int ena_xmit_mbuf(struct ena_ring *, struct mbuf **);
61static void ena_start_xmit(struct ena_ring *);
62
/*********************************************************************
 *  Global functions
 *********************************************************************/
66
67void
68ena_cleanup(void *arg, int pending)
69{
70	struct ena_que *que = arg;
71	struct ena_adapter *adapter = que->adapter;
72	if_t ifp = adapter->ifp;
73	struct ena_ring *tx_ring;
74	struct ena_ring *rx_ring;
75	struct ena_com_io_cq *io_cq;
76	struct ena_eth_io_intr_reg intr_reg;
77	int qid, ena_qid;
78	int txc, rxc, i;
79
80	if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0))
81		return;
82
83	ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n");
84
85	tx_ring = que->tx_ring;
86	rx_ring = que->rx_ring;
87	qid = que->id;
88	ena_qid = ENA_IO_TXQ_IDX(qid);
89	io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
90
91	atomic_store_8(&tx_ring->first_interrupt, 1);
92	atomic_store_8(&rx_ring->first_interrupt, 1);
93
94	for (i = 0; i < ENA_CLEAN_BUDGET; ++i) {
95		rxc = ena_rx_cleanup(rx_ring);
96		txc = ena_tx_cleanup(tx_ring);
97
98		if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0))
99			return;
100
101		if ((txc != ENA_TX_BUDGET) && (rxc != ENA_RX_BUDGET))
102			break;
103	}
104
105	/* Signal that work is done and unmask interrupt */
106	ena_com_update_intr_reg(&intr_reg, ENA_RX_IRQ_INTERVAL,
107	    ENA_TX_IRQ_INTERVAL, true, false);
108	counter_u64_add(tx_ring->tx_stats.unmask_interrupt_num, 1);
109	ena_com_unmask_intr(io_cq, &intr_reg);
110}
111
112void
113ena_deferred_mq_start(void *arg, int pending)
114{
115	struct ena_ring *tx_ring = (struct ena_ring *)arg;
116	if_t ifp = tx_ring->adapter->ifp;
117
118	while (!drbr_empty(ifp, tx_ring->br) && tx_ring->running &&
119	    (if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) {
120		ENA_RING_MTX_LOCK(tx_ring);
121		ena_start_xmit(tx_ring);
122		ENA_RING_MTX_UNLOCK(tx_ring);
123	}
124}
125
126int
127ena_mq_start(if_t ifp, struct mbuf *m)
128{
129	struct ena_adapter *adapter = if_getsoftc(ifp);
130	struct ena_ring *tx_ring;
131	int ret, is_drbr_empty;
132	uint32_t i;
133#ifdef RSS
134	uint32_t bucket_id;
135#endif
136
137	if (unlikely((if_getdrvflags(adapter->ifp) & IFF_DRV_RUNNING) == 0))
138		return (ENODEV);
139
140	/* Which queue to use */
141	/*
142	 * If everything is setup correctly, it should be the
143	 * same bucket that the current CPU we're on is.
144	 * It should improve performance.
145	 */
146	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
147#ifdef RSS
148		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
149		    &bucket_id) == 0)
150			i = bucket_id % adapter->num_io_queues;
151		else
152#endif
153			i = m->m_pkthdr.flowid % adapter->num_io_queues;
154	} else {
155		i = curcpu % adapter->num_io_queues;
156	}
157	tx_ring = &adapter->tx_ring[i];
158
159	/* Check if drbr is empty before putting packet */
160	is_drbr_empty = drbr_empty(ifp, tx_ring->br);
161	ret = drbr_enqueue(ifp, tx_ring->br, m);
162	if (unlikely(ret != 0)) {
163		taskqueue_enqueue(tx_ring->enqueue_tq, &tx_ring->enqueue_task);
164		return (ret);
165	}
166
167	if (is_drbr_empty && (ENA_RING_MTX_TRYLOCK(tx_ring) != 0)) {
168		ena_start_xmit(tx_ring);
169		ENA_RING_MTX_UNLOCK(tx_ring);
170	} else {
171		taskqueue_enqueue(tx_ring->enqueue_tq, &tx_ring->enqueue_task);
172	}
173
174	return (0);
175}
176
177void
178ena_qflush(if_t ifp)
179{
180	struct ena_adapter *adapter = if_getsoftc(ifp);
181	struct ena_ring *tx_ring = adapter->tx_ring;
182	int i;
183
184	for (i = 0; i < adapter->num_io_queues; ++i, ++tx_ring)
185		if (!drbr_empty(ifp, tx_ring->br)) {
186			ENA_RING_MTX_LOCK(tx_ring);
187			drbr_flush(ifp, tx_ring->br);
188			ENA_RING_MTX_UNLOCK(tx_ring);
189		}
190
191	if_qflush(ifp);
192}
193
/*********************************************************************
 *  Static functions
 *********************************************************************/
197
198static inline int
199ena_get_tx_req_id(struct ena_ring *tx_ring, struct ena_com_io_cq *io_cq,
200    uint16_t *req_id)
201{
202	struct ena_adapter *adapter = tx_ring->adapter;
203	int rc;
204
205	rc = ena_com_tx_comp_req_id_get(io_cq, req_id);
206	if (rc == ENA_COM_TRY_AGAIN)
207		return (EAGAIN);
208
209	if (unlikely(rc != 0)) {
210		ena_log(adapter->pdev, ERR, "Invalid req_id %hu in qid %hu\n",
211		    *req_id, tx_ring->qid);
212		counter_u64_add(tx_ring->tx_stats.bad_req_id, 1);
213		goto err;
214	}
215
216	if (tx_ring->tx_buffer_info[*req_id].mbuf != NULL)
217		return (0);
218
219	ena_log(adapter->pdev, ERR,
220	    "tx_info doesn't have valid mbuf. req_id %hu qid %hu\n",
221	    *req_id, tx_ring->qid);
222err:
223	ena_trigger_reset(adapter, ENA_REGS_RESET_INV_TX_REQ_ID);
224
225	return (EFAULT);
226}
227
228/**
229 * ena_tx_cleanup - clear sent packets and corresponding descriptors
230 * @tx_ring: ring for which we want to clean packets
231 *
232 * Once packets are sent, we ask the device in a loop for no longer used
233 * descriptors. We find the related mbuf chain in a map (index in an array)
234 * and free it, then update ring state.
235 * This is performed in "endless" loop, updating ring pointers every
236 * TX_COMMIT. The first check of free descriptor is performed before the actual
237 * loop, then repeated at the loop end.
238 **/
239static int
240ena_tx_cleanup(struct ena_ring *tx_ring)
241{
242	struct ena_adapter *adapter;
243	struct ena_com_io_cq *io_cq;
244	uint16_t next_to_clean;
245	uint16_t req_id;
246	uint16_t ena_qid;
247	unsigned int total_done = 0;
248	int rc;
249	int commit = ENA_TX_COMMIT;
250	int budget = ENA_TX_BUDGET;
251	int work_done;
252	bool above_thresh;
253
254	adapter = tx_ring->que->adapter;
255	ena_qid = ENA_IO_TXQ_IDX(tx_ring->que->id);
256	io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
257	next_to_clean = tx_ring->next_to_clean;
258
259#ifdef DEV_NETMAP
260	if (netmap_tx_irq(adapter->ifp, tx_ring->qid) != NM_IRQ_PASS)
261		return (0);
262#endif /* DEV_NETMAP */
263
264	do {
265		struct ena_tx_buffer *tx_info;
266		struct mbuf *mbuf;
267
268		rc = ena_get_tx_req_id(tx_ring, io_cq, &req_id);
269		if (unlikely(rc != 0))
270			break;
271
272		tx_info = &tx_ring->tx_buffer_info[req_id];
273
274		mbuf = tx_info->mbuf;
275
276		tx_info->mbuf = NULL;
277		bintime_clear(&tx_info->timestamp);
278
279		bus_dmamap_sync(adapter->tx_buf_tag, tx_info->dmamap,
280		    BUS_DMASYNC_POSTWRITE);
281		bus_dmamap_unload(adapter->tx_buf_tag, tx_info->dmamap);
282
283		ena_log_io(adapter->pdev, DBG, "tx: q %d mbuf %p completed\n",
284		    tx_ring->qid, mbuf);
285
286		m_freem(mbuf);
287
288		total_done += tx_info->tx_descs;
289
290		tx_ring->free_tx_ids[next_to_clean] = req_id;
291		next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean,
292		    tx_ring->ring_size);
293
294		if (unlikely(--commit == 0)) {
295			commit = ENA_TX_COMMIT;
296			/* update ring state every ENA_TX_COMMIT descriptor */
297			tx_ring->next_to_clean = next_to_clean;
298			ena_com_comp_ack(
299			    &adapter->ena_dev->io_sq_queues[ena_qid],
300			    total_done);
301			total_done = 0;
302		}
303	} while (likely(--budget));
304
305	work_done = ENA_TX_BUDGET - budget;
306
307	ena_log_io(adapter->pdev, DBG, "tx: q %d done. total pkts: %d\n",
308	    tx_ring->qid, work_done);
309
310	/* If there is still something to commit update ring state */
311	if (likely(commit != ENA_TX_COMMIT)) {
312		tx_ring->next_to_clean = next_to_clean;
313		ena_com_comp_ack(&adapter->ena_dev->io_sq_queues[ena_qid],
314		    total_done);
315	}
316
317	/*
318	 * Need to make the rings circular update visible to
319	 * ena_xmit_mbuf() before checking for tx_ring->running.
320	 */
321	mb();
322
323	above_thresh = ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq,
324	    ENA_TX_RESUME_THRESH);
325	if (unlikely(!tx_ring->running && above_thresh)) {
326		ENA_RING_MTX_LOCK(tx_ring);
327		above_thresh = ena_com_sq_have_enough_space(
328		    tx_ring->ena_com_io_sq, ENA_TX_RESUME_THRESH);
329		if (!tx_ring->running && above_thresh) {
330			tx_ring->running = true;
331			counter_u64_add(tx_ring->tx_stats.queue_wakeup, 1);
332			taskqueue_enqueue(tx_ring->enqueue_tq,
333			    &tx_ring->enqueue_task);
334		}
335		ENA_RING_MTX_UNLOCK(tx_ring);
336	}
337
338	tx_ring->tx_last_cleanup_ticks = ticks;
339
340	return (work_done);
341}
342
343static void
344ena_rx_hash_mbuf(struct ena_ring *rx_ring, struct ena_com_rx_ctx *ena_rx_ctx,
345    struct mbuf *mbuf)
346{
347	struct ena_adapter *adapter = rx_ring->adapter;
348
349	if (likely(ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
350		mbuf->m_pkthdr.flowid = ena_rx_ctx->hash;
351
352#ifdef RSS
353		/*
354		 * Hardware and software RSS are in agreement only when both are
355		 * configured to Toeplitz algorithm.  This driver configures
356		 * that algorithm only when software RSS is enabled and uses it.
357		 */
358		if (adapter->ena_dev->rss.hash_func != ENA_ADMIN_TOEPLITZ &&
359		    ena_rx_ctx->l3_proto != ENA_ETH_IO_L3_PROTO_UNKNOWN) {
360			M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH);
361			return;
362		}
363#endif
364
365		if (ena_rx_ctx->frag &&
366		    (ena_rx_ctx->l3_proto != ENA_ETH_IO_L3_PROTO_UNKNOWN)) {
367			M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH);
368			return;
369		}
370
371		switch (ena_rx_ctx->l3_proto) {
372		case ENA_ETH_IO_L3_PROTO_IPV4:
373			switch (ena_rx_ctx->l4_proto) {
374			case ENA_ETH_IO_L4_PROTO_TCP:
375				M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4);
376				break;
377			case ENA_ETH_IO_L4_PROTO_UDP:
378				M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4);
379				break;
380			default:
381				M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4);
382			}
383			break;
384		case ENA_ETH_IO_L3_PROTO_IPV6:
385			switch (ena_rx_ctx->l4_proto) {
386			case ENA_ETH_IO_L4_PROTO_TCP:
387				M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6);
388				break;
389			case ENA_ETH_IO_L4_PROTO_UDP:
390				M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6);
391				break;
392			default:
393				M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6);
394			}
395			break;
396		case ENA_ETH_IO_L3_PROTO_UNKNOWN:
397			M_HASHTYPE_SET(mbuf, M_HASHTYPE_NONE);
398			break;
399		default:
400			M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH);
401		}
402	} else {
403		mbuf->m_pkthdr.flowid = rx_ring->qid;
404		M_HASHTYPE_SET(mbuf, M_HASHTYPE_NONE);
405	}
406}
407
408/**
409 * ena_rx_mbuf - assemble mbuf from descriptors
410 * @rx_ring: ring for which we want to clean packets
411 * @ena_bufs: buffer info
412 * @ena_rx_ctx: metadata for this packet(s)
413 * @next_to_clean: ring pointer, will be updated only upon success
414 *
415 **/
416static struct mbuf *
417ena_rx_mbuf(struct ena_ring *rx_ring, struct ena_com_rx_buf_info *ena_bufs,
418    struct ena_com_rx_ctx *ena_rx_ctx, uint16_t *next_to_clean)
419{
420	struct mbuf *mbuf;
421	struct ena_rx_buffer *rx_info;
422	struct ena_adapter *adapter;
423	device_t pdev;
424	unsigned int descs = ena_rx_ctx->descs;
425	uint16_t ntc, len, req_id, buf = 0;
426
427	ntc = *next_to_clean;
428	adapter = rx_ring->adapter;
429	pdev = adapter->pdev;
430
431	len = ena_bufs[buf].len;
432	req_id = ena_bufs[buf].req_id;
433	rx_info = &rx_ring->rx_buffer_info[req_id];
434	if (unlikely(rx_info->mbuf == NULL)) {
435		ena_log(pdev, ERR, "NULL mbuf in rx_info");
436		return (NULL);
437	}
438
439	ena_log_io(pdev, DBG, "rx_info %p, mbuf %p, paddr %jx\n", rx_info,
440	    rx_info->mbuf, (uintmax_t)rx_info->ena_buf.paddr);
441
442	bus_dmamap_sync(adapter->rx_buf_tag, rx_info->map,
443	    BUS_DMASYNC_POSTREAD);
444	mbuf = rx_info->mbuf;
445	mbuf->m_flags |= M_PKTHDR;
446	mbuf->m_pkthdr.len = len;
447	mbuf->m_len = len;
448	/* Only for the first segment the data starts at specific offset */
449	mbuf->m_data = mtodo(mbuf, ena_rx_ctx->pkt_offset);
450	ena_log_io(pdev, DBG, "Mbuf data offset=%u\n", ena_rx_ctx->pkt_offset);
451	mbuf->m_pkthdr.rcvif = rx_ring->que->adapter->ifp;
452
453	/* Fill mbuf with hash key and it's interpretation for optimization */
454	ena_rx_hash_mbuf(rx_ring, ena_rx_ctx, mbuf);
455
456	ena_log_io(pdev, DBG, "rx mbuf 0x%p, flags=0x%x, len: %d\n", mbuf,
457	    mbuf->m_flags, mbuf->m_pkthdr.len);
458
459	/* DMA address is not needed anymore, unmap it */
460	bus_dmamap_unload(rx_ring->adapter->rx_buf_tag, rx_info->map);
461
462	rx_info->mbuf = NULL;
463	rx_ring->free_rx_ids[ntc] = req_id;
464	ntc = ENA_RX_RING_IDX_NEXT(ntc, rx_ring->ring_size);
465
466	/*
467	 * While we have more than 1 descriptors for one rcvd packet, append
468	 * other mbufs to the main one
469	 */
470	while (--descs) {
471		++buf;
472		len = ena_bufs[buf].len;
473		req_id = ena_bufs[buf].req_id;
474		rx_info = &rx_ring->rx_buffer_info[req_id];
475
476		if (unlikely(rx_info->mbuf == NULL)) {
477			ena_log(pdev, ERR, "NULL mbuf in rx_info");
478			/*
479			 * If one of the required mbufs was not allocated yet,
480			 * we can break there.
481			 * All earlier used descriptors will be reallocated
482			 * later and not used mbufs can be reused.
483			 * The next_to_clean pointer will not be updated in case
484			 * of an error, so caller should advance it manually
485			 * in error handling routine to keep it up to date
486			 * with hw ring.
487			 */
488			m_freem(mbuf);
489			return (NULL);
490		}
491
492		bus_dmamap_sync(adapter->rx_buf_tag, rx_info->map,
493		    BUS_DMASYNC_POSTREAD);
494		if (unlikely(m_append(mbuf, len, rx_info->mbuf->m_data) == 0)) {
495			counter_u64_add(rx_ring->rx_stats.mbuf_alloc_fail, 1);
496			ena_log_io(pdev, WARN, "Failed to append Rx mbuf %p\n",
497			    mbuf);
498		}
499
500		ena_log_io(pdev, DBG, "rx mbuf updated. len %d\n",
501		    mbuf->m_pkthdr.len);
502
503		/* Free already appended mbuf, it won't be useful anymore */
504		bus_dmamap_unload(rx_ring->adapter->rx_buf_tag, rx_info->map);
505		m_freem(rx_info->mbuf);
506		rx_info->mbuf = NULL;
507
508		rx_ring->free_rx_ids[ntc] = req_id;
509		ntc = ENA_RX_RING_IDX_NEXT(ntc, rx_ring->ring_size);
510	}
511
512	*next_to_clean = ntc;
513
514	return (mbuf);
515}
516
517/**
518 * ena_rx_checksum - indicate in mbuf if hw indicated a good cksum
519 **/
520static inline void
521ena_rx_checksum(struct ena_ring *rx_ring, struct ena_com_rx_ctx *ena_rx_ctx,
522    struct mbuf *mbuf)
523{
524	device_t pdev = rx_ring->adapter->pdev;
525
526	/* if IP and error */
527	if (unlikely((ena_rx_ctx->l3_proto == ENA_ETH_IO_L3_PROTO_IPV4) &&
528	    ena_rx_ctx->l3_csum_err)) {
529		/* ipv4 checksum error */
530		mbuf->m_pkthdr.csum_flags = 0;
531		counter_u64_add(rx_ring->rx_stats.csum_bad, 1);
532		ena_log_io(pdev, DBG, "RX IPv4 header checksum error\n");
533		return;
534	}
535
536	/* if TCP/UDP */
537	if ((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) ||
538	    (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP)) {
539		if (ena_rx_ctx->l4_csum_err) {
540			/* TCP/UDP checksum error */
541			mbuf->m_pkthdr.csum_flags = 0;
542			counter_u64_add(rx_ring->rx_stats.csum_bad, 1);
543			ena_log_io(pdev, DBG, "RX L4 checksum error\n");
544		} else {
545			mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
546			mbuf->m_pkthdr.csum_flags |= CSUM_IP_VALID;
547			counter_u64_add(rx_ring->rx_stats.csum_good, 1);
548		}
549	}
550}
551
552/**
553 * ena_rx_cleanup - handle rx irq
554 * @arg: ring for which irq is being handled
555 **/
556static int
557ena_rx_cleanup(struct ena_ring *rx_ring)
558{
559	struct ena_adapter *adapter;
560	device_t pdev;
561	struct mbuf *mbuf;
562	struct ena_com_rx_ctx ena_rx_ctx;
563	struct ena_com_io_cq *io_cq;
564	struct ena_com_io_sq *io_sq;
565	enum ena_regs_reset_reason_types reset_reason;
566	if_t ifp;
567	uint16_t ena_qid;
568	uint16_t next_to_clean;
569	uint32_t refill_required;
570	uint32_t refill_threshold;
571	uint32_t do_if_input = 0;
572	unsigned int qid;
573	int rc, i;
574	int budget = ENA_RX_BUDGET;
575#ifdef DEV_NETMAP
576	int done;
577#endif /* DEV_NETMAP */
578
579	adapter = rx_ring->que->adapter;
580	pdev = adapter->pdev;
581	ifp = adapter->ifp;
582	qid = rx_ring->que->id;
583	ena_qid = ENA_IO_RXQ_IDX(qid);
584	io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
585	io_sq = &adapter->ena_dev->io_sq_queues[ena_qid];
586	next_to_clean = rx_ring->next_to_clean;
587
588#ifdef DEV_NETMAP
589	if (netmap_rx_irq(adapter->ifp, rx_ring->qid, &done) != NM_IRQ_PASS)
590		return (0);
591#endif /* DEV_NETMAP */
592
593	ena_log_io(pdev, DBG, "rx: qid %d\n", qid);
594
595	do {
596		ena_rx_ctx.ena_bufs = rx_ring->ena_bufs;
597		ena_rx_ctx.max_bufs = adapter->max_rx_sgl_size;
598		ena_rx_ctx.descs = 0;
599		ena_rx_ctx.pkt_offset = 0;
600
601		bus_dmamap_sync(io_cq->cdesc_addr.mem_handle.tag,
602		    io_cq->cdesc_addr.mem_handle.map, BUS_DMASYNC_POSTREAD);
603		rc = ena_com_rx_pkt(io_cq, io_sq, &ena_rx_ctx);
604		if (unlikely(rc != 0)) {
605			if (rc == ENA_COM_NO_SPACE) {
606				counter_u64_add(rx_ring->rx_stats.bad_desc_num,
607				    1);
608				reset_reason = ENA_REGS_RESET_TOO_MANY_RX_DESCS;
609			} else {
610				counter_u64_add(rx_ring->rx_stats.bad_req_id,
611				    1);
612				reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID;
613			}
614			ena_trigger_reset(adapter, reset_reason);
615			return (0);
616		}
617
618		if (unlikely(ena_rx_ctx.descs == 0))
619			break;
620
621		ena_log_io(pdev, DBG,
622		    "rx: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n",
623		    rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto,
624		    ena_rx_ctx.l4_proto, ena_rx_ctx.hash);
625
626		/* Receive mbuf from the ring */
627		mbuf = ena_rx_mbuf(rx_ring, rx_ring->ena_bufs, &ena_rx_ctx,
628		    &next_to_clean);
629		bus_dmamap_sync(io_cq->cdesc_addr.mem_handle.tag,
630		    io_cq->cdesc_addr.mem_handle.map, BUS_DMASYNC_PREREAD);
631		/* Exit if we failed to retrieve a buffer */
632		if (unlikely(mbuf == NULL)) {
633			for (i = 0; i < ena_rx_ctx.descs; ++i) {
634				rx_ring->free_rx_ids[next_to_clean] =
635				    rx_ring->ena_bufs[i].req_id;
636				next_to_clean = ENA_RX_RING_IDX_NEXT(
637				    next_to_clean, rx_ring->ring_size);
638			}
639			break;
640		}
641
642		if (((if_getcapenable(ifp) & IFCAP_RXCSUM) != 0) ||
643		    ((if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6) != 0)) {
644			ena_rx_checksum(rx_ring, &ena_rx_ctx, mbuf);
645		}
646
647		counter_enter();
648		counter_u64_add_protected(rx_ring->rx_stats.bytes,
649		    mbuf->m_pkthdr.len);
650		counter_u64_add_protected(adapter->hw_stats.rx_bytes,
651		    mbuf->m_pkthdr.len);
652		counter_exit();
653		/*
654		 * LRO is only for IP/TCP packets and TCP checksum of the packet
655		 * should be computed by hardware.
656		 */
657		do_if_input = 1;
658		if (((if_getcapenable(ifp) & IFCAP_LRO) != 0)  &&
659		    ((mbuf->m_pkthdr.csum_flags & CSUM_IP_VALID) != 0) &&
660		    (ena_rx_ctx.l4_proto == ENA_ETH_IO_L4_PROTO_TCP)) {
661			/*
662			 * Send to the stack if:
663			 *  - LRO not enabled, or
664			 *  - no LRO resources, or
665			 *  - lro enqueue fails
666			 */
667			if ((rx_ring->lro.lro_cnt != 0) &&
668			    (tcp_lro_rx(&rx_ring->lro, mbuf, 0) == 0))
669				do_if_input = 0;
670		}
671		if (do_if_input != 0) {
672			ena_log_io(pdev, DBG,
673			    "calling if_input() with mbuf %p\n", mbuf);
674			if_input(ifp, mbuf);
675		}
676
677		counter_enter();
678		counter_u64_add_protected(rx_ring->rx_stats.cnt, 1);
679		counter_u64_add_protected(adapter->hw_stats.rx_packets, 1);
680		counter_exit();
681	} while (--budget);
682
683	rx_ring->next_to_clean = next_to_clean;
684
685	refill_required = ena_com_free_q_entries(io_sq);
686	refill_threshold = min_t(int,
687	    rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER,
688	    ENA_RX_REFILL_THRESH_PACKET);
689
690	if (refill_required > refill_threshold) {
691		ena_refill_rx_bufs(rx_ring, refill_required);
692	}
693
694	tcp_lro_flush_all(&rx_ring->lro);
695
696	return (ENA_RX_BUDGET - budget);
697}
698
699static void
700ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, struct mbuf *mbuf,
701    bool disable_meta_caching)
702{
703	struct ena_com_tx_meta *ena_meta;
704	struct ether_vlan_header *eh;
705	struct mbuf *mbuf_next;
706	u32 mss;
707	bool offload;
708	uint16_t etype;
709	int ehdrlen;
710	struct ip *ip;
711	int ipproto;
712	int iphlen;
713	struct tcphdr *th;
714	int offset;
715
716	offload = false;
717	ena_meta = &ena_tx_ctx->ena_meta;
718	mss = mbuf->m_pkthdr.tso_segsz;
719
720	if (mss != 0)
721		offload = true;
722
723	if ((mbuf->m_pkthdr.csum_flags & CSUM_TSO) != 0)
724		offload = true;
725
726	if ((mbuf->m_pkthdr.csum_flags & CSUM_OFFLOAD) != 0)
727		offload = true;
728
729	if ((mbuf->m_pkthdr.csum_flags & CSUM6_OFFLOAD) != 0)
730		offload = true;
731
732	if (!offload) {
733		if (disable_meta_caching) {
734			memset(ena_meta, 0, sizeof(*ena_meta));
735			ena_tx_ctx->meta_valid = 1;
736		} else {
737			ena_tx_ctx->meta_valid = 0;
738		}
739		return;
740	}
741
742	/* Determine where frame payload starts. */
743	eh = mtod(mbuf, struct ether_vlan_header *);
744	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
745		etype = ntohs(eh->evl_proto);
746		ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
747	} else {
748		etype = ntohs(eh->evl_encap_proto);
749		ehdrlen = ETHER_HDR_LEN;
750	}
751
752	mbuf_next = m_getptr(mbuf, ehdrlen, &offset);
753
754	switch (etype) {
755	case ETHERTYPE_IP:
756		ip = (struct ip *)(mtodo(mbuf_next, offset));
757		iphlen = ip->ip_hl << 2;
758		ipproto = ip->ip_p;
759		ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV4;
760		if ((ip->ip_off & htons(IP_DF)) != 0)
761			ena_tx_ctx->df = 1;
762		break;
763	case ETHERTYPE_IPV6:
764		ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV6;
765		iphlen = ip6_lasthdr(mbuf, ehdrlen, IPPROTO_IPV6, &ipproto);
766		iphlen -= ehdrlen;
767		ena_tx_ctx->df = 1;
768		break;
769	default:
770		iphlen = 0;
771		ipproto = 0;
772		break;
773	}
774
775	mbuf_next = m_getptr(mbuf, iphlen + ehdrlen, &offset);
776	th = (struct tcphdr *)(mtodo(mbuf_next, offset));
777
778	if ((mbuf->m_pkthdr.csum_flags & CSUM_IP) != 0) {
779		ena_tx_ctx->l3_csum_enable = 1;
780	}
781	if ((mbuf->m_pkthdr.csum_flags & CSUM_TSO) != 0) {
782		ena_tx_ctx->tso_enable = 1;
783		ena_meta->l4_hdr_len = (th->th_off);
784	}
785
786	if (ipproto == IPPROTO_TCP) {
787		ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_TCP;
788		if ((mbuf->m_pkthdr.csum_flags &
789		    (CSUM_IP_TCP | CSUM_IP6_TCP)) != 0)
790			ena_tx_ctx->l4_csum_enable = 1;
791		else
792			ena_tx_ctx->l4_csum_enable = 0;
793	} else if (ipproto == IPPROTO_UDP) {
794		ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_UDP;
795		if ((mbuf->m_pkthdr.csum_flags &
796		    (CSUM_IP_UDP | CSUM_IP6_UDP)) != 0)
797			ena_tx_ctx->l4_csum_enable = 1;
798		else
799			ena_tx_ctx->l4_csum_enable = 0;
800	} else {
801		ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_UNKNOWN;
802		ena_tx_ctx->l4_csum_enable = 0;
803	}
804
805	ena_meta->mss = mss;
806	ena_meta->l3_hdr_len = iphlen;
807	ena_meta->l3_hdr_offset = ehdrlen;
808	ena_tx_ctx->meta_valid = 1;
809}
810
811static int
812ena_check_and_collapse_mbuf(struct ena_ring *tx_ring, struct mbuf **mbuf)
813{
814	struct ena_adapter *adapter;
815	struct mbuf *collapsed_mbuf;
816	int num_frags;
817
818	adapter = tx_ring->adapter;
819	num_frags = ena_mbuf_count(*mbuf);
820
821	/* One segment must be reserved for configuration descriptor. */
822	if (num_frags < adapter->max_tx_sgl_size)
823		return (0);
824
825	if ((num_frags == adapter->max_tx_sgl_size) &&
826	    ((*mbuf)->m_pkthdr.len < tx_ring->tx_max_header_size))
827		return (0);
828
829	counter_u64_add(tx_ring->tx_stats.collapse, 1);
830
831	collapsed_mbuf = m_collapse(*mbuf, M_NOWAIT,
832	    adapter->max_tx_sgl_size - 1);
833	if (unlikely(collapsed_mbuf == NULL)) {
834		counter_u64_add(tx_ring->tx_stats.collapse_err, 1);
835		return (ENOMEM);
836	}
837
838	/* If mbuf was collapsed succesfully, original mbuf is released. */
839	*mbuf = collapsed_mbuf;
840
841	return (0);
842}
843
844static int
845ena_tx_map_mbuf(struct ena_ring *tx_ring, struct ena_tx_buffer *tx_info,
846    struct mbuf *mbuf, void **push_hdr, u16 *header_len)
847{
848	struct ena_adapter *adapter = tx_ring->adapter;
849	struct ena_com_buf *ena_buf;
850	bus_dma_segment_t segs[ENA_BUS_DMA_SEGS];
851	size_t iseg = 0;
852	uint32_t mbuf_head_len;
853	uint16_t offset;
854	int rc, nsegs;
855
856	mbuf_head_len = mbuf->m_len;
857	tx_info->mbuf = mbuf;
858	ena_buf = tx_info->bufs;
859
860	/*
861	 * For easier maintaining of the DMA map, map the whole mbuf even if
862	 * the LLQ is used. The descriptors will be filled using the segments.
863	 */
864	rc = bus_dmamap_load_mbuf_sg(adapter->tx_buf_tag,
865	    tx_info->dmamap, mbuf, segs, &nsegs, BUS_DMA_NOWAIT);
866	if (unlikely((rc != 0) || (nsegs == 0))) {
867		ena_log_io(adapter->pdev, WARN,
868		    "dmamap load failed! err: %d nsegs: %d\n", rc, nsegs);
869		goto dma_error;
870	}
871
872	if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) {
873		/*
874		 * When the device is LLQ mode, the driver will copy
875		 * the header into the device memory space.
876		 * the ena_com layer assumes the header is in a linear
877		 * memory space.
878		 * This assumption might be wrong since part of the header
879		 * can be in the fragmented buffers.
880		 * First check if header fits in the mbuf. If not, copy it to
881		 * separate buffer that will be holding linearized data.
882		 */
883		*header_len = min_t(uint32_t, mbuf->m_pkthdr.len,
884		    tx_ring->tx_max_header_size);
885
886		/* If header is in linear space, just point into mbuf's data. */
887		if (likely(*header_len <= mbuf_head_len)) {
888			*push_hdr = mbuf->m_data;
889		/*
890		 * Otherwise, copy whole portion of header from multiple
891		 * mbufs to intermediate buffer.
892		 */
893		} else {
894			m_copydata(mbuf, 0, *header_len,
895			    tx_ring->push_buf_intermediate_buf);
896			*push_hdr = tx_ring->push_buf_intermediate_buf;
897
898			counter_u64_add(tx_ring->tx_stats.llq_buffer_copy, 1);
899		}
900
901		ena_log_io(adapter->pdev, DBG,
902		    "mbuf: %p header_buf->vaddr: %p push_len: %d\n",
903		    mbuf, *push_hdr, *header_len);
904
905		/* If packet is fitted in LLQ header, no need for DMA segments. */
906		if (mbuf->m_pkthdr.len <= tx_ring->tx_max_header_size) {
907			return (0);
908		} else {
909			offset = tx_ring->tx_max_header_size;
910			/*
911			 * As Header part is mapped to LLQ header, we can skip
912			 * it and just map the residuum of the mbuf to DMA
913			 * Segments.
914			 */
915			while (offset > 0) {
916				if (offset >= segs[iseg].ds_len) {
917					offset -= segs[iseg].ds_len;
918				} else {
919					ena_buf->paddr = segs[iseg].ds_addr +
920					    offset;
921					ena_buf->len = segs[iseg].ds_len -
922					    offset;
923					ena_buf++;
924					tx_info->num_of_bufs++;
925					offset = 0;
926				}
927				iseg++;
928			}
929		}
930	} else {
931		*push_hdr = NULL;
932		/*
933		 * header_len is just a hint for the device. Because FreeBSD is
934		 * not giving us information about packet header length and it
935		 * is not guaranteed that all packet headers will be in the 1st
936		 * mbuf, setting header_len to 0 is making the device ignore
937		 * this value and resolve header on it's own.
938		 */
939		*header_len = 0;
940	}
941
942	/* Map rest of the mbuf */
943	while (iseg < nsegs) {
944		ena_buf->paddr = segs[iseg].ds_addr;
945		ena_buf->len = segs[iseg].ds_len;
946		ena_buf++;
947		iseg++;
948		tx_info->num_of_bufs++;
949	}
950
951	return (0);
952
953dma_error:
954	counter_u64_add(tx_ring->tx_stats.dma_mapping_err, 1);
955	tx_info->mbuf = NULL;
956	return (rc);
957}
958
959static int
960ena_xmit_mbuf(struct ena_ring *tx_ring, struct mbuf **mbuf)
961{
962	struct ena_adapter *adapter;
963	device_t pdev;
964	struct ena_tx_buffer *tx_info;
965	struct ena_com_tx_ctx ena_tx_ctx;
966	struct ena_com_dev *ena_dev;
967	struct ena_com_io_sq *io_sq;
968	void *push_hdr;
969	uint16_t next_to_use;
970	uint16_t req_id;
971	uint16_t ena_qid;
972	uint16_t header_len;
973	int rc;
974	int nb_hw_desc;
975
976	ena_qid = ENA_IO_TXQ_IDX(tx_ring->que->id);
977	adapter = tx_ring->que->adapter;
978	pdev = adapter->pdev;
979	ena_dev = adapter->ena_dev;
980	io_sq = &ena_dev->io_sq_queues[ena_qid];
981
982	rc = ena_check_and_collapse_mbuf(tx_ring, mbuf);
983	if (unlikely(rc != 0)) {
984		ena_log_io(pdev, WARN, "Failed to collapse mbuf! err: %d\n",
985		    rc);
986		return (rc);
987	}
988
989	ena_log_io(pdev, DBG, "Tx: %d bytes\n", (*mbuf)->m_pkthdr.len);
990
991	next_to_use = tx_ring->next_to_use;
992	req_id = tx_ring->free_tx_ids[next_to_use];
993	tx_info = &tx_ring->tx_buffer_info[req_id];
994	tx_info->num_of_bufs = 0;
995
996	ENA_WARN(tx_info->mbuf != NULL, adapter->ena_dev,
997	    "mbuf isn't NULL for req_id %d\n", req_id);
998
999	rc = ena_tx_map_mbuf(tx_ring, tx_info, *mbuf, &push_hdr, &header_len);
1000	if (unlikely(rc != 0)) {
1001		ena_log_io(pdev, WARN, "Failed to map TX mbuf\n");
1002		return (rc);
1003	}
1004	memset(&ena_tx_ctx, 0x0, sizeof(struct ena_com_tx_ctx));
1005	ena_tx_ctx.ena_bufs = tx_info->bufs;
1006	ena_tx_ctx.push_header = push_hdr;
1007	ena_tx_ctx.num_bufs = tx_info->num_of_bufs;
1008	ena_tx_ctx.req_id = req_id;
1009	ena_tx_ctx.header_len = header_len;
1010
1011	/* Set flags and meta data */
1012	ena_tx_csum(&ena_tx_ctx, *mbuf, adapter->disable_meta_caching);
1013
1014	if (tx_ring->acum_pkts == ENA_DB_THRESHOLD ||
1015	    ena_com_is_doorbell_needed(tx_ring->ena_com_io_sq, &ena_tx_ctx)) {
1016		ena_log_io(pdev, DBG,
1017		    "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n",
1018		    tx_ring->que->id);
1019		ena_ring_tx_doorbell(tx_ring);
1020	}
1021
1022	/* Prepare the packet's descriptors and send them to device */
1023	rc = ena_com_prepare_tx(io_sq, &ena_tx_ctx, &nb_hw_desc);
1024	if (unlikely(rc != 0)) {
1025		if (likely(rc == ENA_COM_NO_MEM)) {
1026			ena_log_io(pdev, DBG, "tx ring[%d] is out of space\n",
1027			    tx_ring->que->id);
1028		} else {
1029			ena_log(pdev, ERR, "failed to prepare tx bufs\n");
1030			ena_trigger_reset(adapter,
1031			    ENA_REGS_RESET_DRIVER_INVALID_STATE);
1032		}
1033		counter_u64_add(tx_ring->tx_stats.prepare_ctx_err, 1);
1034		goto dma_error;
1035	}
1036
1037	counter_enter();
1038	counter_u64_add_protected(tx_ring->tx_stats.cnt, 1);
1039	counter_u64_add_protected(tx_ring->tx_stats.bytes,
1040	    (*mbuf)->m_pkthdr.len);
1041
1042	counter_u64_add_protected(adapter->hw_stats.tx_packets, 1);
1043	counter_u64_add_protected(adapter->hw_stats.tx_bytes,
1044	    (*mbuf)->m_pkthdr.len);
1045	counter_exit();
1046
1047	tx_info->tx_descs = nb_hw_desc;
1048	getbinuptime(&tx_info->timestamp);
1049	tx_info->print_once = true;
1050
1051	tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use,
1052	    tx_ring->ring_size);
1053
1054	/* stop the queue when no more space available, the packet can have up
1055	 * to sgl_size + 2. one for the meta descriptor and one for header
1056	 * (if the header is larger than tx_max_header_size).
1057	 */
1058	if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq,
1059	    adapter->max_tx_sgl_size + 2))) {
1060		ena_log_io(pdev, DBG, "Stop queue %d\n", tx_ring->que->id);
1061
1062		tx_ring->running = false;
1063		counter_u64_add(tx_ring->tx_stats.queue_stop, 1);
1064
1065		/* There is a rare condition where this function decides to
1066		 * stop the queue but meanwhile tx_cleanup() updates
1067		 * next_to_completion and terminates.
1068		 * The queue will remain stopped forever.
1069		 * To solve this issue this function performs mb(), checks
1070		 * the wakeup condition and wakes up the queue if needed.
1071		 */
1072		mb();
1073
1074		if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq,
1075		    ENA_TX_RESUME_THRESH)) {
1076			tx_ring->running = true;
1077			counter_u64_add(tx_ring->tx_stats.queue_wakeup, 1);
1078		}
1079	}
1080
1081	bus_dmamap_sync(adapter->tx_buf_tag, tx_info->dmamap,
1082	    BUS_DMASYNC_PREWRITE);
1083
1084	return (0);
1085
1086dma_error:
1087	tx_info->mbuf = NULL;
1088	bus_dmamap_unload(adapter->tx_buf_tag, tx_info->dmamap);
1089
1090	return (rc);
1091}
1092
1093static void
1094ena_start_xmit(struct ena_ring *tx_ring)
1095{
1096	struct mbuf *mbuf;
1097	struct ena_adapter *adapter = tx_ring->adapter;
1098	int ret = 0;
1099
1100	ENA_RING_MTX_ASSERT(tx_ring);
1101
1102	if (unlikely((if_getdrvflags(adapter->ifp) & IFF_DRV_RUNNING) == 0))
1103		return;
1104
1105	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_LINK_UP, adapter)))
1106		return;
1107
1108	while ((mbuf = drbr_peek(adapter->ifp, tx_ring->br)) != NULL) {
1109		ena_log_io(adapter->pdev, DBG,
1110		    "\ndequeued mbuf %p with flags %#x and header csum flags %#jx\n",
1111		    mbuf, mbuf->m_flags, (uint64_t)mbuf->m_pkthdr.csum_flags);
1112
1113		if (unlikely(!tx_ring->running)) {
1114			drbr_putback(adapter->ifp, tx_ring->br, mbuf);
1115			break;
1116		}
1117
1118		if (unlikely((ret = ena_xmit_mbuf(tx_ring, &mbuf)) != 0)) {
1119			if (ret == ENA_COM_NO_MEM) {
1120				drbr_putback(adapter->ifp, tx_ring->br, mbuf);
1121			} else if (ret == ENA_COM_NO_SPACE) {
1122				drbr_putback(adapter->ifp, tx_ring->br, mbuf);
1123			} else {
1124				m_freem(mbuf);
1125				drbr_advance(adapter->ifp, tx_ring->br);
1126			}
1127
1128			break;
1129		}
1130
1131		drbr_advance(adapter->ifp, tx_ring->br);
1132
1133		if (unlikely((if_getdrvflags(adapter->ifp) & IFF_DRV_RUNNING) == 0))
1134			return;
1135
1136		tx_ring->acum_pkts++;
1137
1138		BPF_MTAP(adapter->ifp, mbuf);
1139	}
1140
1141	if (likely(tx_ring->acum_pkts != 0)) {
1142		/* Trigger the dma engine */
1143		ena_ring_tx_doorbell(tx_ring);
1144	}
1145
1146	if (unlikely(!tx_ring->running))
1147		taskqueue_enqueue(tx_ring->que->cleanup_tq,
1148		    &tx_ring->que->cleanup_task);
1149}
1150