/*
 * Copyright (C) 2016-2018 Vincenzo Maffione
 * Copyright (C) 2015 Stefano Garzarella
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * common headers
 */
#if defined(__FreeBSD__)

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>
#include <machine/bus.h>

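/* FreeBSD does not provide usleep_range(): emulate it with pause_sbt(),
 * using only the lower bound (the upper bound _2 is unused). */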
#define usleep_range(_1, _2) \
	pause_sbt("sync-kloop-sleep", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE)

#elif defined(linux)
#include <bsd_glue.h>
#include <linux/file.h>
#include <linux/eventfd.h>
#endif

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <net/netmap_virt.h>
#include <dev/netmap/netmap_mem2.h>

/* Support for eventfd-based notifications. */
#if defined(linux)
#define SYNC_KLOOP_POLL
#endif

/* Write kring pointers (hwcur, hwtail) to the CSB.
 * This routine is coupled with ptnetmap_guest_read_kring_csb(). */
static inline void
sync_kloop_kernel_write(struct nm_csb_ktoa __user *ptr, uint32_t hwcur,
			   uint32_t hwtail)
{
	/* Issue a first store-store barrier to make sure that writes to the
	 * netmap ring are not reordered after the updates of ktoa->hwcur and
	 * ktoa->hwtail. */
	nm_stst_barrier();

	/*
	 * The same scheme used in nm_sync_kloop_appl_write() applies here.
	 * We allow the application to read a value of hwcur more recent than
	 * the value of hwtail, since this still results in a consistent view
	 * of the ring state (hwcur can never wrap around past hwtail, because
	 * hwcur must stay behind head).
	 *
	 * The following memory barrier scheme is used to make this happen:
	 *
	 *          Application            Kernel
	 *
	 *          STORE(hwcur)           LOAD(hwtail)
	 *          wmb() <------------->  rmb()
	 *          STORE(hwtail)          LOAD(hwcur)
	 */
	CSB_WRITE(ptr, hwcur, hwcur);
	nm_stst_barrier();
	CSB_WRITE(ptr, hwtail, hwtail);
}

/* Read kring pointers (head, cur, sync_flags) from the CSB.
 * This routine is coupled with ptnetmap_guest_write_kring_csb(). */
static inline void
sync_kloop_kernel_read(struct nm_csb_atok __user *ptr,
			  struct netmap_ring *shadow_ring,
			  uint32_t num_slots)
{
	/*
	 * The memory barrier below makes sure that the load of cur is not
	 * reordered before the load of head, mirroring the write ordering
	 * used by the application (see the explanation in
	 * sync_kloop_kernel_write()).
	 */
	CSB_READ(ptr, head, shadow_ring->head);
	nm_ldld_barrier();
	CSB_READ(ptr, cur, shadow_ring->cur);
	CSB_READ(ptr, sync_flags, shadow_ring->flags);

	/* Make sure that loads from atok->head and atok->cur are not delayed
	 * after the loads from the netmap ring. */
	nm_ldld_barrier();
}

/* Enable or disable application --> kernel kicks. */
static inline void
csb_ktoa_kick_enable(struct nm_csb_ktoa __user *csb_ktoa, uint32_t val)
{
	CSB_WRITE(csb_ktoa, kern_need_kick, val);
}

#ifdef SYNC_KLOOP_POLL
/* Are application interrupts enabled or disabled? */
static inline uint32_t
csb_atok_intr_enabled(struct nm_csb_atok __user *csb_atok)
{
	uint32_t v;

	CSB_READ(csb_atok, appl_need_kick, v);

	return v;
}
#endif  /* SYNC_KLOOP_POLL */

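/* Dump the state of a kring (head/cur/tail pointers) for debugging purposes. */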
static inline void
sync_kloop_kring_dump(const char *title, const struct netmap_kring *kring)
{
	nm_prinf("%s, kring %s, hwcur %d, rhead %d, "
		"rcur %d, rtail %d, hwtail %d",
		title, kring->name, kring->nr_hwcur, kring->rhead,
		kring->rcur, kring->rtail, kring->nr_hwtail);
}

/* Arguments for netmap_sync_kloop_tx_ring() and
 * netmap_sync_kloop_rx_ring().
 */
struct sync_kloop_ring_args {
	struct netmap_kring *kring;
	struct nm_csb_atok *csb_atok;
	struct nm_csb_ktoa *csb_ktoa;
#ifdef SYNC_KLOOP_POLL
	struct eventfd_ctx *irq_ctx;
#endif /* SYNC_KLOOP_POLL */
	/* Are we busy waiting rather than using a schedule() loop? */
	bool busy_wait;
	/* Are we processing in the context of a VM exit? */
	bool direct;
};

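/* Process a TX ring on behalf of the application: import head/cur from the
 * CSB, run the txsync, export the updated hwcur/hwtail back to the CSB and
 * notify the application if new space became available. */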
static void
netmap_sync_kloop_tx_ring(const struct sync_kloop_ring_args *a)
{
	struct netmap_kring *kring = a->kring;
	struct nm_csb_atok *csb_atok = a->csb_atok;
	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
#ifdef SYNC_KLOOP_POLL
	bool more_txspace = false;
#endif /* SYNC_KLOOP_POLL */
	uint32_t num_slots;
	int batch;

	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
		return;
	}

	num_slots = kring->nkr_num_slots;

	/* Disable application --> kernel notifications. */
	if (!a->direct) {
		csb_ktoa_kick_enable(csb_ktoa, 0);
	}
	/* Copy the application kring pointers from the CSB */
	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);

	for (;;) {
		batch = shadow_ring.head - kring->nr_hwcur;
		if (batch < 0)
			batch += num_slots;

#ifdef PTN_TX_BATCH_LIM
		if (batch > PTN_TX_BATCH_LIM(num_slots)) {
			/* If the application moves ahead too fast, cut the
			 * advancement so that we don't exceed our batch limit. */
			uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots);

			if (head_lim >= num_slots)
				head_lim -= num_slots;
			nm_prdis(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head,
					head_lim);
			shadow_ring.head = head_lim;
			batch = PTN_TX_BATCH_LIM(num_slots);
		}
#endif /* PTN_TX_BATCH_LIM */

		if (nm_kr_txspace(kring) <= (num_slots >> 1)) {
			shadow_ring.flags |= NAF_FORCE_RECLAIM;
		}

		/* Netmap prologue */
		shadow_ring.tail = kring->rtail;
		if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) {
			/* Reinit ring and enable notifications. */
			netmap_ring_reinit(kring);
			if (!a->busy_wait) {
				csb_ktoa_kick_enable(csb_ktoa, 1);
			}
			break;
		}

		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
			sync_kloop_kring_dump("pre txsync", kring);
		}

		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
			if (!a->busy_wait) {
				/* Re-enable notifications. */
				csb_ktoa_kick_enable(csb_ktoa, 1);
			}
			nm_prerr("txsync() failed");
			break;
		}

		/*
		 * Finalize
		 * Copy kernel hwcur and hwtail into the CSB for the application sync(), and
		 * do the nm_sync_finalize.
		 */
		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur,
				kring->nr_hwtail);
		if (kring->rtail != kring->nr_hwtail) {
			/* Some more room available in the parent adapter. */
			kring->rtail = kring->nr_hwtail;
#ifdef SYNC_KLOOP_POLL
			more_txspace = true;
#endif /* SYNC_KLOOP_POLL */
		}

		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
			sync_kloop_kring_dump("post txsync", kring);
		}

		/* Interrupt the application if needed. */
#ifdef SYNC_KLOOP_POLL
		if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
			/* We could disable kernel --> application kicks here,
			 * to avoid spurious interrupts. */
			eventfd_signal(a->irq_ctx, 1);
			more_txspace = false;
		}
#endif /* SYNC_KLOOP_POLL */

		/* Read CSB to see if there is more work to do. */
		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
		if (shadow_ring.head == kring->rhead) {
			if (a->busy_wait) {
				break;
			}
			/*
			 * No more packets to transmit. We enable notifications and
			 * go to sleep, waiting for a kick from the application when
			 * new slots are ready for transmission.
			 */
			/* Re-enable notifications. */
			csb_ktoa_kick_enable(csb_ktoa, 1);
			/* Double check, with store-load memory barrier. */
			nm_stld_barrier();
			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
			if (shadow_ring.head != kring->rhead) {
				/* We won the race: there are more packets to
				 * transmit. Disable notifications and do another cycle. */
				csb_ktoa_kick_enable(csb_ktoa, 0);
				continue;
			}
			break;
		}

		if (nm_kr_txempty(kring)) {
			/* No more available TX slots. We stop and wait for a
			 * notification from the backend (netmap_tx_irq). */
			nm_prdis(1, "TX ring");
			break;
		}
	}

	nm_kr_put(kring);

#ifdef SYNC_KLOOP_POLL
	if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
		eventfd_signal(a->irq_ctx, 1);
	}
#endif /* SYNC_KLOOP_POLL */
}

/* Maximum number of consecutive RX cycles without receiving any packet. */
#define SYNC_LOOP_RX_DRY_CYCLES_MAX	2

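/* Return true if there are no slots available for reception, i.e. the kernel
 * hwtail has caught up with the slot right before the application head. */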
static inline int
sync_kloop_norxslots(struct netmap_kring *kring, uint32_t g_head)
{
	return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head,
				kring->nkr_num_slots - 1));
}

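/* Process an RX ring on behalf of the application: import head/cur from the
 * CSB, run the rxsync, export the updated hwcur/hwtail back to the CSB and
 * notify the application if new packets have been received. */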
static void
netmap_sync_kloop_rx_ring(const struct sync_kloop_ring_args *a)
{
	struct netmap_kring *kring = a->kring;
	struct nm_csb_atok *csb_atok = a->csb_atok;
	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
	int dry_cycles = 0;
#ifdef SYNC_KLOOP_POLL
	bool some_recvd = false;
#endif /* SYNC_KLOOP_POLL */
	uint32_t num_slots;

	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
		return;
	}

	num_slots = kring->nkr_num_slots;

	/* Disable notifications. */
	if (!a->direct) {
		csb_ktoa_kick_enable(csb_ktoa, 0);
	}
	/* Copy the application kring pointers from the CSB */
	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);

	for (;;) {
		uint32_t hwtail;

		/* Netmap prologue */
		shadow_ring.tail = kring->rtail;
		if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) {
			/* Reinit ring and enable notifications. */
			netmap_ring_reinit(kring);
			if (!a->busy_wait) {
				csb_ktoa_kick_enable(csb_ktoa, 1);
			}
			break;
		}

		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
			sync_kloop_kring_dump("pre rxsync", kring);
		}

		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
			if (!a->busy_wait) {
				/* Re-enable notifications. */
				csb_ktoa_kick_enable(csb_ktoa, 1);
			}
			nm_prerr("rxsync() failed");
			break;
		}

		/*
		 * Finalize
		 * Copy kernel hwcur and hwtail into the CSB for the application sync()
		 */
		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, hwtail);
		if (kring->rtail != hwtail) {
			kring->rtail = hwtail;
#ifdef SYNC_KLOOP_POLL
			some_recvd = true;
#endif /* SYNC_KLOOP_POLL */
			dry_cycles = 0;
		} else {
			dry_cycles++;
		}

		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
			sync_kloop_kring_dump("post rxsync", kring);
		}

#ifdef SYNC_KLOOP_POLL
		/* Interrupt the application if needed. */
		if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
			/* We could disable kernel --> application kicks here,
			 * to avoid spurious interrupts. */
			eventfd_signal(a->irq_ctx, 1);
			some_recvd = false;
		}
#endif /* SYNC_KLOOP_POLL */

		/* Read CSB to see if there is more work to do. */
		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
		if (sync_kloop_norxslots(kring, shadow_ring.head)) {
			if (a->busy_wait) {
				break;
			}
			/*
			 * No more slots available for reception. We enable notifications
			 * and go to sleep, waiting for a kick from the application when
			 * new receive slots are available.
			 */
			/* Re-enable notifications. */
			csb_ktoa_kick_enable(csb_ktoa, 1);
			/* Double check, with store-load memory barrier. */
			nm_stld_barrier();
			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
			if (!sync_kloop_norxslots(kring, shadow_ring.head)) {
				/* We won the race: more receive slots are available.
				 * Disable notifications and do another cycle. */
				csb_ktoa_kick_enable(csb_ktoa, 0);
				continue;
			}
			break;
		}

		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
		if (unlikely(hwtail == kring->rhead ||
					dry_cycles >= SYNC_LOOP_RX_DRY_CYCLES_MAX)) {
			/* No more packets to be read from the backend. We stop and
			 * wait for a notification from the backend (netmap_rx_irq). */
			nm_prdis(1, "nr_hwtail: %d rhead: %d dry_cycles: %d",
					hwtail, kring->rhead, dry_cycles);
			break;
		}
	}

	nm_kr_put(kring);

#ifdef SYNC_KLOOP_POLL
	/* Interrupt the application if needed. */
	if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
		eventfd_signal(a->irq_ctx, 1);
	}
#endif /* SYNC_KLOOP_POLL */
}

#ifdef SYNC_KLOOP_POLL
struct sync_kloop_poll_ctx;
struct sync_kloop_poll_entry {
	/* Support for receiving notifications from
	 * a netmap ring or from the application. */
	struct file *filp;
	wait_queue_t wait;
	wait_queue_head_t *wqh;

	/* Support for sending notifications to the application. */
	struct eventfd_ctx *irq_ctx;
	struct file *irq_filp;

	/* Arguments for the ring processing function. Useful
	 * in case of a custom wake-up function. */
	struct sync_kloop_ring_args *args;
	struct sync_kloop_poll_ctx *parent;
};

struct sync_kloop_poll_ctx {
	poll_table wait_table;
	unsigned int next_entry;
	int (*next_wake_fun)(wait_queue_t *, unsigned, int, void *);
	unsigned int num_entries;
	unsigned int num_tx_rings;
	unsigned int num_rings;
	/* The first num_tx_rings entries are for the TX kicks, and
	 * the RX kick entries follow. The last two entries are for
	 * the TX irq and the RX irq. */
	struct sync_kloop_poll_entry entries[0];
};

static void
sync_kloop_poll_table_queue_proc(struct file *file, wait_queue_head_t *wqh,
				poll_table *pt)
{
	struct sync_kloop_poll_ctx *poll_ctx =
		container_of(pt, struct sync_kloop_poll_ctx, wait_table);
	struct sync_kloop_poll_entry *entry = poll_ctx->entries +
						poll_ctx->next_entry;

	BUG_ON(poll_ctx->next_entry >= poll_ctx->num_entries);
	entry->wqh = wqh;
	entry->filp = file;
	/* Use the default wake up function. */
	if (poll_ctx->next_wake_fun == NULL) {
		init_waitqueue_entry(&entry->wait, current);
	} else {
		init_waitqueue_func_entry(&entry->wait,
		    poll_ctx->next_wake_fun);
	}
	add_wait_queue(wqh, &entry->wait);
}

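/* Wake-up callback used in direct TX mode: a kick from the application is
 * served by processing the TX ring right away, in the context of the waker. */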
static int
sync_kloop_tx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
    int wake_flags, void *key)
{
	struct sync_kloop_poll_entry *entry =
	    container_of(wait, struct sync_kloop_poll_entry, wait);

	netmap_sync_kloop_tx_ring(entry->args);

	return 0;
}

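/* Wake-up callback used in direct TX mode for notifications coming from the
 * netmap adapter: forward them to the application by signaling the TX irqfds. */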
static int
sync_kloop_tx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
    int wake_flags, void *key)
{
	struct sync_kloop_poll_entry *entry =
	    container_of(wait, struct sync_kloop_poll_entry, wait);
	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
	int i;

	for (i = 0; i < poll_ctx->num_tx_rings; i++) {
		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;

		if (irq_ctx) {
			eventfd_signal(irq_ctx, 1);
		}
	}

	return 0;
}

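/* Direct RX counterpart of sync_kloop_tx_kick_wake_fun(): process the RX ring
 * in the context of the waker. */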
static int
sync_kloop_rx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
    int wake_flags, void *key)
{
	struct sync_kloop_poll_entry *entry =
	    container_of(wait, struct sync_kloop_poll_entry, wait);

	netmap_sync_kloop_rx_ring(entry->args);

	return 0;
}

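/* Direct RX counterpart of sync_kloop_tx_irq_wake_fun(): forward adapter
 * notifications to the application by signaling the RX irqfds. */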
static int
sync_kloop_rx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
    int wake_flags, void *key)
{
	struct sync_kloop_poll_entry *entry =
	    container_of(wait, struct sync_kloop_poll_entry, wait);
	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
	int i;

	for (i = poll_ctx->num_tx_rings; i < poll_ctx->num_rings; i++) {
		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;

		if (irq_ctx) {
			eventfd_signal(irq_ctx, 1);
		}
	}

	return 0;
}
#endif  /* SYNC_KLOOP_POLL */

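/* Main entry point of the sync kloop: validate the request, set up the
 * per-ring arguments and, if eventfds are provided, the poll context, then
 * run the loop until the NM_SYNC_KLOOP_STOPPING flag is set. */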
int
netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr)
{
	struct nmreq_sync_kloop_start *req =
		(struct nmreq_sync_kloop_start *)(uintptr_t)hdr->nr_body;
	struct nmreq_opt_sync_kloop_eventfds *eventfds_opt = NULL;
#ifdef SYNC_KLOOP_POLL
	struct sync_kloop_poll_ctx *poll_ctx = NULL;
#endif  /* SYNC_KLOOP_POLL */
	int num_rx_rings, num_tx_rings, num_rings;
	struct sync_kloop_ring_args *args = NULL;
	uint32_t sleep_us = req->sleep_us;
	struct nm_csb_atok* csb_atok_base;
	struct nm_csb_ktoa* csb_ktoa_base;
	struct netmap_adapter *na;
	struct nmreq_option *opt;
	bool na_could_sleep = false;
	bool busy_wait = true;
	bool direct_tx = false;
	bool direct_rx = false;
	int err = 0;
	int i;

	if (sleep_us > 1000000) {
		/* We do not accept sleeping for more than a second. */
		return EINVAL;
	}

	if (priv->np_nifp == NULL) {
		return ENXIO;
	}
	mb(); /* make sure following reads are not from cache */

	na = priv->np_na;
	if (!nm_netmap_on(na)) {
		return ENXIO;
	}

	NMG_LOCK();
	/* Make sure the application is working in CSB mode. */
	if (!priv->np_csb_atok_base || !priv->np_csb_ktoa_base) {
		NMG_UNLOCK();
		nm_prerr("sync-kloop on %s requires "
				"NETMAP_REQ_OPT_CSB option", na->name);
		return EINVAL;
	}

	csb_atok_base = priv->np_csb_atok_base;
	csb_ktoa_base = priv->np_csb_ktoa_base;

	/* Make sure that no kloop is currently running. */
	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
		err = EBUSY;
	}
	priv->np_kloop_state |= NM_SYNC_KLOOP_RUNNING;
	NMG_UNLOCK();
	if (err) {
		return err;
	}

	num_rx_rings = priv->np_qlast[NR_RX] - priv->np_qfirst[NR_RX];
	num_tx_rings = priv->np_qlast[NR_TX] - priv->np_qfirst[NR_TX];
	num_rings = num_tx_rings + num_rx_rings;

	args = nm_os_malloc(num_rings * sizeof(args[0]));
	if (!args) {
		err = ENOMEM;
		goto out;
	}

	/* Prepare the arguments for netmap_sync_kloop_tx_ring()
	 * and netmap_sync_kloop_rx_ring(). */
	for (i = 0; i < num_tx_rings; i++) {
		struct sync_kloop_ring_args *a = args + i;

		a->kring = NMR(na, NR_TX)[i + priv->np_qfirst[NR_TX]];
		a->csb_atok = csb_atok_base + i;
		a->csb_ktoa = csb_ktoa_base + i;
		a->busy_wait = busy_wait;
		a->direct = direct_tx;
	}
	for (i = 0; i < num_rx_rings; i++) {
		struct sync_kloop_ring_args *a = args + num_tx_rings + i;

		a->kring = NMR(na, NR_RX)[i + priv->np_qfirst[NR_RX]];
		a->csb_atok = csb_atok_base + num_tx_rings + i;
		a->csb_ktoa = csb_ktoa_base + num_tx_rings + i;
		a->busy_wait = busy_wait;
		a->direct = direct_rx;
	}

	/* Validate notification options. */
	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_MODE);
	if (opt != NULL) {
		struct nmreq_opt_sync_kloop_mode *mode_opt =
		    (struct nmreq_opt_sync_kloop_mode *)opt;

		direct_tx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_TX);
		direct_rx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_RX);
		if (mode_opt->mode & ~(NM_OPT_SYNC_KLOOP_DIRECT_TX |
		    NM_OPT_SYNC_KLOOP_DIRECT_RX)) {
			opt->nro_status = err = EINVAL;
			goto out;
		}
		opt->nro_status = 0;
	}
	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS);
	if (opt != NULL) {
		if (opt->nro_size != sizeof(*eventfds_opt) +
			sizeof(eventfds_opt->eventfds[0]) * num_rings) {
			/* Option size not consistent with the number of
			 * entries. */
			opt->nro_status = err = EINVAL;
			goto out;
		}
#ifdef SYNC_KLOOP_POLL
		eventfds_opt = (struct nmreq_opt_sync_kloop_eventfds *)opt;
		opt->nro_status = 0;

		/* Check if some ioeventfd entry is not defined, and force sleep
		 * synchronization in that case. */
		busy_wait = false;
		for (i = 0; i < num_rings; i++) {
			if (eventfds_opt->eventfds[i].ioeventfd < 0) {
				busy_wait = true;
				break;
			}
		}

		if (busy_wait && (direct_tx || direct_rx)) {
			/* For direct processing we need all the
			 * ioeventfds to be valid. */
			opt->nro_status = err = EINVAL;
			goto out;
		}

		/* We need 2 poll entries for TX and RX notifications coming
		 * from the netmap adapter, plus one entry per ring for the
		 * notifications coming from the application. */
		poll_ctx = nm_os_malloc(sizeof(*poll_ctx) +
				(num_rings + 2) * sizeof(poll_ctx->entries[0]));
		init_poll_funcptr(&poll_ctx->wait_table,
					sync_kloop_poll_table_queue_proc);
		poll_ctx->num_entries = 2 + num_rings;
		poll_ctx->num_tx_rings = num_tx_rings;
		poll_ctx->num_rings = num_rings;
		poll_ctx->next_entry = 0;
		poll_ctx->next_wake_fun = NULL;

		if (direct_tx && (na->na_flags & NAF_BDG_MAYSLEEP)) {
			/* In direct mode, VALE txsync is called from
			 * wake-up context, where it is not possible
			 * to sleep.
			 */
			na->na_flags &= ~NAF_BDG_MAYSLEEP;
			na_could_sleep = true;
		}

		for (i = 0; i < num_rings + 2; i++) {
			poll_ctx->entries[i].args = args + i;
			poll_ctx->entries[i].parent = poll_ctx;
		}

		/* Poll for notifications coming from the applications through
		 * eventfds. */
		for (i = 0; i < num_rings; i++, poll_ctx->next_entry++) {
			struct eventfd_ctx *irq = NULL;
			struct file *filp = NULL;
			unsigned long mask;
			bool tx_ring = (i < num_tx_rings);

			if (eventfds_opt->eventfds[i].irqfd >= 0) {
				filp = eventfd_fget(
				    eventfds_opt->eventfds[i].irqfd);
				if (IS_ERR(filp)) {
					err = PTR_ERR(filp);
					goto out;
				}
				irq = eventfd_ctx_fileget(filp);
				if (IS_ERR(irq)) {
					err = PTR_ERR(irq);
					goto out;
				}
			}
			poll_ctx->entries[i].irq_filp = filp;
			poll_ctx->entries[i].irq_ctx = irq;
			poll_ctx->entries[i].args->busy_wait = busy_wait;
			/* Don't let netmap_sync_kloop_*x_ring() use
			 * IRQs in direct mode. */
			poll_ctx->entries[i].args->irq_ctx =
			    ((tx_ring && direct_tx) ||
			    (!tx_ring && direct_rx)) ? NULL :
			    poll_ctx->entries[i].irq_ctx;
			poll_ctx->entries[i].args->direct =
			    (tx_ring ? direct_tx : direct_rx);

			if (!busy_wait) {
				filp = eventfd_fget(
				    eventfds_opt->eventfds[i].ioeventfd);
				if (IS_ERR(filp)) {
					err = PTR_ERR(filp);
					goto out;
				}
				if (tx_ring && direct_tx) {
					/* Override the wake up function
					 * so that it can directly call
					 * netmap_sync_kloop_tx_ring().
					 */
					poll_ctx->next_wake_fun =
					    sync_kloop_tx_kick_wake_fun;
				} else if (!tx_ring && direct_rx) {
					/* Same for direct RX. */
					poll_ctx->next_wake_fun =
					    sync_kloop_rx_kick_wake_fun;
				} else {
					poll_ctx->next_wake_fun = NULL;
				}
				mask = filp->f_op->poll(filp,
				    &poll_ctx->wait_table);
				if (mask & POLLERR) {
					err = EINVAL;
					goto out;
				}
			}
		}

		/* Poll for notifications coming from the netmap rings bound to
		 * this file descriptor. */
		if (!busy_wait) {
			NMG_LOCK();
			/* In direct mode, override the wake up function so
			 * that it can forward the netmap_tx_irq() to the
			 * guest. */
			poll_ctx->next_wake_fun = direct_tx ?
			    sync_kloop_tx_irq_wake_fun : NULL;
			poll_wait(priv->np_filp, priv->np_si[NR_TX],
			    &poll_ctx->wait_table);
			poll_ctx->next_entry++;

			poll_ctx->next_wake_fun = direct_rx ?
			    sync_kloop_rx_irq_wake_fun : NULL;
			poll_wait(priv->np_filp, priv->np_si[NR_RX],
			    &poll_ctx->wait_table);
			poll_ctx->next_entry++;
			NMG_UNLOCK();
		}
#else   /* SYNC_KLOOP_POLL */
		opt->nro_status = EOPNOTSUPP;
		goto out;
#endif  /* SYNC_KLOOP_POLL */
	}

	nm_prinf("kloop busy_wait %u, direct_tx %u, direct_rx %u, "
	    "na_could_sleep %u", busy_wait, direct_tx, direct_rx,
	    na_could_sleep);

	/* Main loop. */
	for (;;) {
		if (unlikely(NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_STOPPING)) {
			break;
		}

#ifdef SYNC_KLOOP_POLL
		if (!busy_wait) {
			/* It is important to set the task state as
			 * interruptible before processing any TX/RX ring,
			 * so that if a notification on ring Y comes after
			 * we have processed ring Y, but before we call
			 * schedule(), we don't miss it. This works because
			 * the wake up function changes the task state, and
			 * therefore the schedule_timeout() call below will
			 * observe the change.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
		}
#endif  /* SYNC_KLOOP_POLL */

		/* Process all the TX rings bound to this file descriptor. */
		for (i = 0; !direct_tx && i < num_tx_rings; i++) {
			struct sync_kloop_ring_args *a = args + i;
			netmap_sync_kloop_tx_ring(a);
		}

		/* Process all the RX rings bound to this file descriptor. */
		for (i = 0; !direct_rx && i < num_rx_rings; i++) {
			struct sync_kloop_ring_args *a = args + num_tx_rings + i;
			netmap_sync_kloop_rx_ring(a);
		}

		if (busy_wait) {
			/* Default synchronization method: sleep for a while. */
			usleep_range(sleep_us, sleep_us);
		}
#ifdef SYNC_KLOOP_POLL
		else {
			/* Yield to the scheduler waiting for a notification
			 * to come either from netmap or the application. */
			schedule_timeout(msecs_to_jiffies(3000));
		}
#endif /* SYNC_KLOOP_POLL */
	}
out:
#ifdef SYNC_KLOOP_POLL
	if (poll_ctx) {
		/* Stop polling from netmap and the eventfds, and deallocate
		 * the poll context. */
		if (!busy_wait) {
			__set_current_state(TASK_RUNNING);
		}
		for (i = 0; i < poll_ctx->next_entry; i++) {
			struct sync_kloop_poll_entry *entry =
						poll_ctx->entries + i;

			if (entry->wqh)
				remove_wait_queue(entry->wqh, &entry->wait);
			/* Release the file references taken on the eventfds,
			 * but not on the netmap file descriptor, for which
			 * no reference was taken. */
			if (entry->filp && entry->filp != priv->np_filp)
				fput(entry->filp);
			if (entry->irq_ctx)
				eventfd_ctx_put(entry->irq_ctx);
			if (entry->irq_filp)
				fput(entry->irq_filp);
		}
		nm_os_free(poll_ctx);
		poll_ctx = NULL;
	}
#endif /* SYNC_KLOOP_POLL */

	if (args) {
		nm_os_free(args);
		args = NULL;
	}

	/* Reset the kloop state. */
	NMG_LOCK();
	priv->np_kloop_state = 0;
	if (na_could_sleep) {
		na->na_flags |= NAF_BDG_MAYSLEEP;
	}
	NMG_UNLOCK();

	return err;
}

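/* Ask a running sync kloop to stop, wake it up in case it is sleeping, and
 * wait until it has actually terminated. */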
int
netmap_sync_kloop_stop(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na;
	bool running = true;
	int err = 0;

	if (priv->np_nifp == NULL) {
		return ENXIO;
	}
	mb(); /* make sure following reads are not from cache */

	na = priv->np_na;
	if (!nm_netmap_on(na)) {
		return ENXIO;
	}

	/* Set the kloop stopping flag. */
	NMG_LOCK();
	priv->np_kloop_state |= NM_SYNC_KLOOP_STOPPING;
	NMG_UNLOCK();

	/* Send a notification to the kloop, in case it is blocked in
	 * schedule_timeout(). We can use either RX or TX, because the
	 * kloop is waiting on both. */
	nm_os_selwakeup(priv->np_si[NR_RX]);

	/* Wait for the kloop to actually terminate. */
	while (running) {
		usleep_range(1000, 1500);
		NMG_LOCK();
		running = (NM_ACCESS_ONCE(priv->np_kloop_state)
				& NM_SYNC_KLOOP_RUNNING);
		NMG_UNLOCK();
	}

	return err;
}

#ifdef WITH_PTNETMAP
/*
 * Guest ptnetmap txsync()/rxsync() routines, used in ptnet device drivers.
 * These routines are reused across the different operating systems supported
 * by netmap.
 */

/*
 * Reconcile host and guest views of the transmit ring.
 *
 * The guest user wants to transmit packets up to the one before ring->head,
 * and the guest kernel knows that tx_ring->hwcur is the first packet not yet
 * sent by the host kernel.
 *
 * We push out as many packets as possible, and possibly reclaim buffers from
 * previously completed transmissions.
 *
 * Notifications from the host are enabled only if the guest user would
 * block (no space in the ring).
 */
bool
netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
			struct netmap_kring *kring, int flags)
{
	bool notify = false;

	/* Disable notifications */
	atok->appl_need_kick = 0;

	/*
	 * First part: tell the host to process the new packets,
	 * updating the CSB.
	 */
	kring->nr_hwcur = ktoa->hwcur;
	nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);

	/* Ask for a kick from a guest to the host if needed. */
	if (((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) ||
			(flags & NAF_FORCE_RECLAIM)) {
		atok->sync_flags = flags;
		notify = true;
	}

	/*
	 * Second part: reclaim buffers for completed transmissions.
	 */
	if (nm_kr_wouldblock(kring) || (flags & NAF_FORCE_RECLAIM)) {
		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
					&kring->nr_hwcur);
	}

	/*
	 * No more room in the ring for new transmissions. The user thread will
	 * go to sleep and we need to be notified by the host when more free
	 * space is available.
	 */
	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
		/* Re-enable notifications. */
		atok->appl_need_kick = 1;
		/* Double check, with store-load memory barrier. */
		nm_stld_barrier();
		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
					&kring->nr_hwcur);
		/* If there is new free space, disable notifications */
		if (unlikely(!nm_kr_wouldblock(kring))) {
			atok->appl_need_kick = 0;
		}
	}

	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
		kring->name, atok->head, atok->cur, ktoa->hwtail,
		kring->rhead, kring->rcur, kring->nr_hwtail);

	return notify;
}

/*
 * Reconcile host and guest views of the receive ring.
 *
 * Update hwcur/hwtail from the host (reading from the CSB).
 *
 * If the guest user has released buffers up to the one before ring->head, we
 * also give them to the host.
 *
 * Notifications from the host are enabled only if the guest user would
 * block (no more completed slots in the ring).
 */
bool
netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
			struct netmap_kring *kring, int flags)
{
	bool notify = false;

	/* Disable notifications */
	atok->appl_need_kick = 0;

	/*
	 * First part: import newly received packets, by updating the kring
	 * hwtail to the hwtail known from the host (read from the CSB).
	 * This also updates the kring hwcur.
	 */
	nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur);
	kring->nr_kflags &= ~NKR_PENDINTR;

	/*
	 * Second part: tell the host about the slots that the guest user has
	 * released, by updating cur and head in the CSB.
	 */
	if (kring->rhead != kring->nr_hwcur) {
		nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);
	}

	/*
	 * No more completed RX slots. The user thread will go to sleep and
	 * we need to be notified by the host when more RX slots have been
	 * completed.
	 */
	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
		/* Re-enable notifications. */
		atok->appl_need_kick = 1;
		/* Double check, with store-load memory barrier. */
		nm_stld_barrier();
		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
					&kring->nr_hwcur);
		/* If there are new slots, disable notifications. */
		if (!nm_kr_wouldblock(kring)) {
			atok->appl_need_kick = 0;
		}
	}

	/* Ask for a kick from the guest to the host if needed. */
	if ((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) {
		atok->sync_flags = flags;
		notify = true;
	}

	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
		kring->name, atok->head, atok->cur, ktoa->hwtail,
		kring->rhead, kring->rcur, kring->nr_hwtail);

	return notify;
}

/*
 * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor.
 */
int
ptnet_nm_krings_create(struct netmap_adapter *na)
{
	struct netmap_pt_guest_adapter *ptna =
			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
	struct netmap_adapter *na_nm = &ptna->hwup.up;
	struct netmap_adapter *na_dr = &ptna->dr.up;
	int ret;

	if (ptna->backend_users) {
		return 0;
	}

	/* Create krings on the public netmap adapter. */
	ret = netmap_hw_krings_create(na_nm);
	if (ret) {
		return ret;
	}

	/* Copy krings into the netmap adapter private to the driver. */
	na_dr->tx_rings = na_nm->tx_rings;
	na_dr->rx_rings = na_nm->rx_rings;

	return 0;
}

void
ptnet_nm_krings_delete(struct netmap_adapter *na)
{
	struct netmap_pt_guest_adapter *ptna =
			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
	struct netmap_adapter *na_nm = &ptna->hwup.up;
	struct netmap_adapter *na_dr = &ptna->dr.up;

	if (ptna->backend_users) {
		return;
	}

	na_dr->tx_rings = NULL;
	na_dr->rx_rings = NULL;

	netmap_hw_krings_delete(na_nm);
}

void
ptnet_nm_dtor(struct netmap_adapter *na)
{
	struct netmap_pt_guest_adapter *ptna =
			(struct netmap_pt_guest_adapter *)na;

	netmap_mem_put(ptna->dr.up.nm_mem);
	memset(&ptna->dr, 0, sizeof(ptna->dr));
	netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp);
}

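/* Attach routine for ptnet guest adapters: create the pass-through memory
 * allocator, attach the hardware-like adapter and initialize the separate
 * netmap adapter that is private to the ptnet driver. */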
int
netmap_pt_guest_attach(struct netmap_adapter *arg,
		       unsigned int nifp_offset, unsigned int memid)
{
	struct netmap_pt_guest_adapter *ptna;
	if_t ifp = arg ? arg->ifp : NULL;
	int error;

	/* get allocator */
	arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, memid);
	if (arg->nm_mem == NULL)
		return ENOMEM;
	arg->na_flags |= NAF_MEM_OWNER;
	error = netmap_attach_ext(arg, sizeof(struct netmap_pt_guest_adapter), 1);
	if (error)
		return error;

	/* get the netmap_pt_guest_adapter */
	ptna = (struct netmap_pt_guest_adapter *) NA(ifp);

	/* Initialize a separate pass-through netmap adapter that is going to
	 * be used by the ptnet driver only, and so never exposed to netmap
	 * applications. We only need a subset of the available fields. */
	memset(&ptna->dr, 0, sizeof(ptna->dr));
	ptna->dr.up.ifp = ifp;
	ptna->dr.up.nm_mem = netmap_mem_get(ptna->hwup.up.nm_mem);
	ptna->dr.up.nm_config = ptna->hwup.up.nm_config;

	ptna->backend_users = 0;

	return 0;
}

#endif /* WITH_PTNETMAP */
