ixv_netmap.c revision 315333
1/******************************************************************************
2
3  Copyright (c) 2001-2017, Intel Corporation
4  All rights reserved.
5
6  Redistribution and use in source and binary forms, with or without
7  modification, are permitted provided that the following conditions are met:
8
9   1. Redistributions of source code must retain the above copyright notice,
10      this list of conditions and the following disclaimer.
11
12   2. Redistributions in binary form must reproduce the above copyright
13      notice, this list of conditions and the following disclaimer in the
14      documentation and/or other materials provided with the distribution.
15
16   3. Neither the name of the Intel Corporation nor the names of its
17      contributors may be used to endorse or promote products derived from
18      this software without specific prior written permission.
19
20  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  POSSIBILITY OF SUCH DAMAGE.
31
32******************************************************************************/
33/*$FreeBSD: stable/10/sys/dev/ixgbe/ixv_netmap.c 315333 2017-03-15 21:20:17Z erj $*/
34
35/*
36 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 *    notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 *    notice, this list of conditions and the following disclaimer in the
45 *    documentation and/or other materials provided with the distribution.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 */
59
60/*
61 * $FreeBSD: stable/10/sys/dev/ixgbe/ixv_netmap.c 315333 2017-03-15 21:20:17Z erj $
62 *
63 * netmap support for: ixgbe
64 *
65 * This file is meant to be a reference on how to implement
66 * netmap support for a network driver.
67 * This file contains code but only static or inline functions used
68 * by a single driver. To avoid replication of code we just #include
69 * it near the beginning of the standard driver.
70 */
71
72#ifdef DEV_NETMAP
73/*
74 * Some drivers may need the following headers. Others
75 * already include them by default
76
77#include <vm/vm.h>
78#include <vm/pmap.h>
79
80 */
81#include "ixv.h"
82
83/*
84 * device-specific sysctl variables:
85 *
86 * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
87 *	During regular operations the CRC is stripped, but on some
88 *	hardware reception of frames not multiple of 64 is slower,
89 *	so using crcstrip=0 helps in benchmarks.
90 *
91 * ix_rx_miss, ix_rx_miss_bufs:
92 *	count packets that might be missed due to lost interrupts.
93 */
94SYSCTL_DECL(_dev_netmap);
95static int ix_rx_miss, ix_rx_miss_bufs;
96int ix_crcstrip;
97SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip,
98    CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames");
99SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss,
100    CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr");
101SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs,
102    CTLFLAG_RW, &ix_rx_miss_bufs, 0, "potentially missed rx intr bufs");
103
104
105static void
106set_crcstrip(struct ixgbe_hw *hw, int onoff)
107{
108	/* crc stripping is set in two places:
109	 * IXGBE_HLREG0 (modified on init_locked and hw reset)
110	 * IXGBE_RDRXCTL (set by the original driver in
111	 *	ixgbe_setup_hw_rsc() called in init_locked.
112	 *	We disable the setting when netmap is compiled in).
113	 * We update the values here, but also in ixgbe.c because
114	 * init_locked sometimes is called outside our control.
115	 */
116	uint32_t hl, rxc;
117
118	hl = IXGBE_READ_REG(hw, IXGBE_HLREG0);
119	rxc = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
120	if (netmap_verbose)
121		D("%s read  HLREG 0x%x rxc 0x%x",
122			onoff ? "enter" : "exit", hl, rxc);
123	/* hw requirements ... */
124	rxc &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
125	rxc |= IXGBE_RDRXCTL_RSCACKC;
126	if (onoff && !ix_crcstrip) {
127		/* keep the crc. Fast rx */
128		hl &= ~IXGBE_HLREG0_RXCRCSTRP;
129		rxc &= ~IXGBE_RDRXCTL_CRCSTRIP;
130	} else {
131		/* reset default mode */
132		hl |= IXGBE_HLREG0_RXCRCSTRP;
133		rxc |= IXGBE_RDRXCTL_CRCSTRIP;
134	}
135	if (netmap_verbose)
136		D("%s write HLREG 0x%x rxc 0x%x",
137			onoff ? "enter" : "exit", hl, rxc);
138	IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hl);
139	IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc);
140}
141
142
143/*
144 * Register/unregister. We are already under netmap lock.
145 * Only called on the first register or the last unregister.
146 */
147static int
148ixgbe_netmap_reg(struct netmap_adapter *na, int onoff)
149{
150	struct ifnet *ifp = na->ifp;
151	struct adapter *adapter = ifp->if_softc;
152
153	IXGBE_CORE_LOCK(adapter);
154	adapter->stop_locked(adapter);
155
156	set_crcstrip(&adapter->hw, onoff);
157	/* enable or disable flags and callbacks in na and ifp */
158	if (onoff) {
159		nm_set_native_flags(na);
160	} else {
161		nm_clear_native_flags(na);
162	}
163	adapter->init_locked(adapter);	/* also enables intr */
164	set_crcstrip(&adapter->hw, onoff); // XXX why twice ?
165	IXGBE_CORE_UNLOCK(adapter);
166	return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
167}
168
169
170/*
171 * Reconcile kernel and user view of the transmit ring.
172 *
173 * All information is in the kring.
174 * Userspace wants to send packets up to the one before kring->rhead,
175 * kernel knows kring->nr_hwcur is the first unsent packet.
176 *
177 * Here we push packets out (as many as possible), and possibly
178 * reclaim buffers from previously completed transmission.
179 *
180 * The caller (netmap) guarantees that there is only one instance
181 * running at any time. Any interference with other driver
182 * methods should be handled by the individual drivers.
183 */
184static int
185ixgbe_netmap_txsync(struct netmap_kring *kring, int flags)
186{
187	struct netmap_adapter *na = kring->na;
188	struct ifnet *ifp = na->ifp;
189	struct netmap_ring *ring = kring->ring;
190	u_int nm_i;	/* index into the netmap ring */
191	u_int nic_i;	/* index into the NIC ring */
192	u_int n;
193	u_int const lim = kring->nkr_num_slots - 1;
194	u_int const head = kring->rhead;
195	/*
196	 * interrupts on every tx packet are expensive so request
197	 * them every half ring, or where NS_REPORT is set
198	 */
199	u_int report_frequency = kring->nkr_num_slots >> 1;
200
201	/* device-specific */
202	struct adapter *adapter = ifp->if_softc;
203	struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
204	int reclaim_tx;
205
206	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
207			BUS_DMASYNC_POSTREAD);
208
209	/*
210	 * First part: process new packets to send.
211	 * nm_i is the current index in the netmap ring,
212	 * nic_i is the corresponding index in the NIC ring.
213	 * The two numbers differ because upon a *_init() we reset
214	 * the NIC ring but leave the netmap ring unchanged.
215	 * For the transmit ring, we have
216	 *
217	 *		nm_i = kring->nr_hwcur
218	 *		nic_i = IXGBE_TDT (not tracked in the driver)
219	 * and
220	 * 		nm_i == (nic_i + kring->nkr_hwofs) % ring_size
221	 *
222	 * In this driver kring->nkr_hwofs >= 0, but for other
223	 * drivers it might be negative as well.
224	 */
225
226	/*
227	 * If we have packets to send (kring->nr_hwcur != kring->rhead)
228	 * iterate over the netmap ring, fetch length and update
229	 * the corresponding slot in the NIC ring. Some drivers also
230	 * need to update the buffer's physical address in the NIC slot
231	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
232	 *
233	 * The netmap_reload_map() calls is especially expensive,
234	 * even when (as in this case) the tag is 0, so do only
235	 * when the buffer has actually changed.
236	 *
237	 * If possible do not set the report/intr bit on all slots,
238	 * but only a few times per ring or when NS_REPORT is set.
239	 *
240	 * Finally, on 10G and faster drivers, it might be useful
241	 * to prefetch the next slot and txr entry.
242	 */
243
244	nm_i = kring->nr_hwcur;
245	if (nm_i != head) {	/* we have new packets to send */
246		nic_i = netmap_idx_k2n(kring, nm_i);
247
248		__builtin_prefetch(&ring->slot[nm_i]);
249		__builtin_prefetch(&txr->tx_buffers[nic_i]);
250
251		for (n = 0; nm_i != head; n++) {
252			struct netmap_slot *slot = &ring->slot[nm_i];
253			u_int len = slot->len;
254			uint64_t paddr;
255			void *addr = PNMB(na, slot, &paddr);
256
257			/* device-specific */
258			union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i];
259			struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[nic_i];
260			int flags = (slot->flags & NS_REPORT ||
261				nic_i == 0 || nic_i == report_frequency) ?
262				IXGBE_TXD_CMD_RS : 0;
263
264			/* prefetch for next round */
265			__builtin_prefetch(&ring->slot[nm_i + 1]);
266			__builtin_prefetch(&txr->tx_buffers[nic_i + 1]);
267
268			NM_CHECK_ADDR_LEN(na, addr, len);
269
270			if (slot->flags & NS_BUF_CHANGED) {
271				/* buffer has changed, reload map */
272				netmap_reload_map(na, txr->txtag, txbuf->map, addr);
273			}
274			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
275
276			/* Fill the slot in the NIC ring. */
277			/* Use legacy descriptor, they are faster? */
278			curr->read.buffer_addr = htole64(paddr);
279			curr->read.olinfo_status = 0;
280			curr->read.cmd_type_len = htole32(len | flags |
281				IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP);
282
283			/* make sure changes to the buffer are synced */
284			bus_dmamap_sync(txr->txtag, txbuf->map,
285				BUS_DMASYNC_PREWRITE);
286
287			nm_i = nm_next(nm_i, lim);
288			nic_i = nm_next(nic_i, lim);
289		}
290		kring->nr_hwcur = head;
291
292		/* synchronize the NIC ring */
293		bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
294			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
295
296		/* (re)start the tx unit up to slot nic_i (excluded) */
297		IXGBE_WRITE_REG(&adapter->hw, txr->tail, nic_i);
298	}
299
300	/*
301	 * Second part: reclaim buffers for completed transmissions.
302	 * Because this is expensive (we read a NIC register etc.)
303	 * we only do it in specific cases (see below).
304	 */
305	if (flags & NAF_FORCE_RECLAIM) {
306		reclaim_tx = 1; /* forced reclaim */
307	} else if (!nm_kr_txempty(kring)) {
308		reclaim_tx = 0; /* have buffers, no reclaim */
309	} else {
310		/*
311		 * No buffers available. Locate previous slot with
312		 * REPORT_STATUS set.
313		 * If the slot has DD set, we can reclaim space,
314		 * otherwise wait for the next interrupt.
315		 * This enables interrupt moderation on the tx
316		 * side though it might reduce throughput.
317		 */
318		struct ixgbe_legacy_tx_desc *txd =
319		    (struct ixgbe_legacy_tx_desc *)txr->tx_base;
320
321		nic_i = txr->next_to_clean + report_frequency;
322		if (nic_i > lim)
323			nic_i -= lim + 1;
324		// round to the closest with dd set
325		nic_i = (nic_i < kring->nkr_num_slots / 4 ||
326			 nic_i >= kring->nkr_num_slots*3/4) ?
327			0 : report_frequency;
328		reclaim_tx = txd[nic_i].upper.fields.status & IXGBE_TXD_STAT_DD;	// XXX cpu_to_le32 ?
329	}
330	if (reclaim_tx) {
331		/*
332		 * Record completed transmissions.
333		 * We (re)use the driver's txr->next_to_clean to keep
334		 * track of the most recently completed transmission.
335		 *
336		 * The datasheet discourages the use of TDH to find
337		 * out the number of sent packets, but we only set
338		 * REPORT_STATUS in a few slots so TDH is the only
339		 * good way.
340		 */
341		nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(kring->ring_id));
342		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
343			D("TDH wrap %d", nic_i);
344			nic_i -= kring->nkr_num_slots;
345		}
346		if (nic_i != txr->next_to_clean) {
347			/* some tx completed, increment avail */
348			txr->next_to_clean = nic_i;
349			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
350		}
351	}
352
353	return 0;
354}
355
356
357/*
358 * Reconcile kernel and user view of the receive ring.
359 * Same as for the txsync, this routine must be efficient.
 * The caller guarantees a single invocation at a time, but races against
361 * the rest of the driver should be handled here.
362 *
363 * On call, kring->rhead is the first packet that userspace wants
364 * to keep, and kring->rcur is the wakeup point.
365 * The kernel has previously reported packets up to kring->rtail.
366 *
367 * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
368 * of whether or not we received an interrupt.
369 */
370static int
371ixgbe_netmap_rxsync(struct netmap_kring *kring, int flags)
372{
373	struct netmap_adapter *na = kring->na;
374	struct ifnet *ifp = na->ifp;
375	struct netmap_ring *ring = kring->ring;
376	u_int nm_i;	/* index into the netmap ring */
377	u_int nic_i;	/* index into the NIC ring */
378	u_int n;
379	u_int const lim = kring->nkr_num_slots - 1;
380	u_int const head = kring->rhead;
381	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
382
383	/* device-specific */
384	struct adapter *adapter = ifp->if_softc;
385	struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
386
387	if (head > lim)
388		return netmap_ring_reinit(kring);
389
390	/* XXX check sync modes */
391	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
392			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
393
394	/*
395	 * First part: import newly received packets.
396	 *
397	 * nm_i is the index of the next free slot in the netmap ring,
398	 * nic_i is the index of the next received packet in the NIC ring,
399	 * and they may differ in case if_init() has been called while
400	 * in netmap mode. For the receive ring we have
401	 *
402	 *	nic_i = rxr->next_to_check;
403	 *	nm_i = kring->nr_hwtail (previous)
404	 * and
405	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
406	 *
407	 * rxr->next_to_check is set to 0 on a ring reinit
408	 */
409	if (netmap_no_pendintr || force_update) {
410		int crclen = (ix_crcstrip) ? 0 : 4;
411		uint16_t slot_flags = kring->nkr_slot_flags;
412
413		nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail)
414		nm_i = netmap_idx_n2k(kring, nic_i);
415
416		for (n = 0; ; n++) {
417			union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i];
418			uint32_t staterr = le32toh(curr->wb.upper.status_error);
419
420			if ((staterr & IXGBE_RXD_STAT_DD) == 0)
421				break;
422			ring->slot[nm_i].len = le16toh(curr->wb.upper.length) - crclen;
423			ring->slot[nm_i].flags = slot_flags;
424			bus_dmamap_sync(rxr->ptag,
425			    rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD);
426			nm_i = nm_next(nm_i, lim);
427			nic_i = nm_next(nic_i, lim);
428		}
429		if (n) { /* update the state variables */
430			if (netmap_no_pendintr && !force_update) {
431				/* diagnostics */
432				ix_rx_miss ++;
433				ix_rx_miss_bufs += n;
434			}
435			rxr->next_to_check = nic_i;
436			kring->nr_hwtail = nm_i;
437		}
438		kring->nr_kflags &= ~NKR_PENDINTR;
439	}
440
441	/*
442	 * Second part: skip past packets that userspace has released.
443	 * (kring->nr_hwcur to kring->rhead excluded),
444	 * and make the buffers available for reception.
445	 * As usual nm_i is the index in the netmap ring,
446	 * nic_i is the index in the NIC ring, and
447	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
448	 */
449	nm_i = kring->nr_hwcur;
450	if (nm_i != head) {
451		nic_i = netmap_idx_k2n(kring, nm_i);
452		for (n = 0; nm_i != head; n++) {
453			struct netmap_slot *slot = &ring->slot[nm_i];
454			uint64_t paddr;
455			void *addr = PNMB(na, slot, &paddr);
456
457			union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i];
458			struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
459
460			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
461				goto ring_reset;
462
463			if (slot->flags & NS_BUF_CHANGED) {
464				/* buffer has changed, reload map */
465				netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
466				slot->flags &= ~NS_BUF_CHANGED;
467			}
468			curr->wb.upper.status_error = 0;
469			curr->read.pkt_addr = htole64(paddr);
470			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
471			    BUS_DMASYNC_PREREAD);
472			nm_i = nm_next(nm_i, lim);
473			nic_i = nm_next(nic_i, lim);
474		}
475		kring->nr_hwcur = head;
476
477		bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
478		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
479		/*
480		 * IMPORTANT: we must leave one free slot in the ring,
481		 * so move nic_i back by one unit
482		 */
483		nic_i = nm_prev(nic_i, lim);
484		IXGBE_WRITE_REG(&adapter->hw, rxr->tail, nic_i);
485	}
486
487	return 0;
488
489ring_reset:
490	return netmap_ring_reinit(kring);
491}
492
493
494/*
495 * The attach routine, called near the end of ixgbe_attach(),
496 * fills the parameters for netmap_attach() and calls it.
497 * It cannot fail, in the worst case (such as no memory)
498 * netmap mode will be disabled and the driver will only
499 * operate in standard mode.
500 */
501void
502ixgbe_netmap_attach(struct adapter *adapter)
503{
504	struct netmap_adapter na;
505
506	bzero(&na, sizeof(na));
507
508	na.ifp = adapter->ifp;
509	na.na_flags = NAF_BDG_MAYSLEEP;
510	na.num_tx_desc = adapter->num_tx_desc;
511	na.num_rx_desc = adapter->num_rx_desc;
512	na.nm_txsync = ixgbe_netmap_txsync;
513	na.nm_rxsync = ixgbe_netmap_rxsync;
514	na.nm_register = ixgbe_netmap_reg;
515	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
516	netmap_attach(&na);
517}
518
519#endif /* DEV_NETMAP */
520
521/* end of file */
522