1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *   1. Redistributions of source code must retain the above copyright
8 *      notice, this list of conditions and the following disclaimer.
9 *   2. Redistributions in binary form must reproduce the above copyright
10 *      notice, this list of conditions and the following disclaimer in the
11 *      documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26
27/*
28 * $FreeBSD$
29 *
30 * This module supports memory mapped access to network devices,
31 * see netmap(4).
32 *
 * The module uses a large memory pool allocated by the kernel
34 * and accessible as mmapped memory by multiple userspace threads/processes.
35 * The memory pool contains packet buffers and "netmap rings",
36 * i.e. user-accessible copies of the interface's queues.
37 *
38 * Access to the network card works like this:
39 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
41 * 2. on each descriptor, the process issues an ioctl() to identify
42 *    the interface that should report events to the file descriptor.
43 * 3. on each descriptor, the process issues an mmap() request to
44 *    map the shared memory region within the process' address space.
45 *    The list of interesting queues is indicated by a location in
46 *    the shared memory region.
47 * 4. using the functions in the netmap(4) userspace API, a process
48 *    can look up the occupation state of a queue, access memory buffers,
49 *    and retrieve received packets or enqueue packets to transmit.
50 * 5. using some ioctl()s the process can synchronize the userspace view
51 *    of the queue with the actual status in the kernel. This includes both
52 *    receiving the notification of new packets, and transmitting new
53 *    packets on the output interface.
54 * 6. select() or poll() can be used to wait for events on individual
55 *    transmit or receive queues (or all queues for a given interface).
56 *
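 * A minimal sketch of steps 1-6 from userspace (for illustration only:
 * error handling is omitted, "em0" is just an example interface, and
 * the macros/fields are those of <net/netmap_user.h>):
 *
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	void *mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);			// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);	// step 4
 *	// ... fill slots between ring->head and ring->tail,
 *	// then advance ring->head and ring->cur ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);					// step 6
 *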
57
58		SYNCHRONIZATION (USER)
59
60The netmap rings and data structures may be shared among multiple
61user threads or even independent processes.
62Any synchronization among those threads/processes is delegated
63to the threads themselves. Only one thread at a time can be in
64a system call on the same netmap ring. The OS does not enforce
65this and only guarantees against system crashes in case of
66invalid usage.
67
68		LOCKING (INTERNAL)
69
70Within the kernel, access to the netmap rings is protected as follows:
71
72- a spinlock on each ring, to handle producer/consumer races on
73  RX rings attached to the host stack (against multiple host
74  threads writing from the host stack to the same ring),
75  and on 'destination' rings attached to a VALE switch
76  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
  protecting multiple active senders for the same destination.
78
79- an atomic variable to guarantee that there is at most one
80  instance of *_*xsync() on the ring at any time.
81  For rings connected to user file
82  descriptors, an atomic_test_and_set() protects this, and the
83  lock on the ring is not actually used.
84  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85  is also used to prevent multiple executions (the driver might indeed
86  already guarantee this).
87  For NIC TX rings connected to a VALE switch, the lock arbitrates
88  access to the queue (both when allocating buffers and when pushing
89  them out).
90
91- *xsync() should be protected against initializations of the card.
92  On FreeBSD most devices have the reset routine protected by
93  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.
95
96  On linux there is an external lock on the tx path, which probably
97  also arbitrates access to the reset routine. XXX to be revised
98
99- a per-interface core_lock protecting access from the host stack
100  while interfaces may be detached from netmap mode.
101  XXX there should be no need for this lock if we detach the interfaces
102  only while they are down.
103
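The "at most one *xsync() instance" rule above is enforced with a small
try-lock pattern; schematically (a sketch using the NM_ATOMIC_* helpers
from netmap_kern.h, not the literal code, which goes through
nm_kr_tryget() and nm_kr_put()):

	if (NM_ATOMIC_TEST_AND_SET(&kring->nr_busy))
		return;		// busy: another *xsync() is running
	... run kring->nm_sync() ...
	NM_ATOMIC_CLEAR(&kring->nr_busy);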
104
105--- VALE SWITCH ---
106
107NMG_LOCK() serializes all modifications to switches and ports.
108A switch cannot be deleted until all ports are gone.
109
110For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring a new port or deleting an existing one, the
112lock is acquired in exclusive mode (after holding NMG_LOCK).
113When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
116Hence it is important that sleepable shared locks are used.
117
118On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
120packets are copied from source to destination, and then
121the lock is acquired again and the receive ring is updated.
122(A similar thing is done on the tx ring for NIC and host stack
123ports attached to the switch)
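
The resulting "lease" pattern is, schematically (a sketch, not the
literal code, which lives in netmap_vale.c):

	lock(port);
	lease = reserve N slots after the current tail;
	unlock(port);
	copy packets into the leased slots;	// may take page faults
	lock(port);
	advance the ring past completed leases and notify receivers;
	unlock(port);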
124
125 */
126
127
128/* --- internals ----
129 *
130 * Roadmap to the code that implements the above.
131 *
132 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
 * >    select()able file descriptors on which events are reported.
134 *
135 *  	Internally, we allocate a netmap_priv_d structure, that will be
136 *  	initialized on ioctl(NIOCREGIF).
137 *
138 *      os-specific:
139 *  	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140 *  		     per-thread.
141 *  	    linux:   linux_netmap_open (netmap_linux.c). The priv is
142 *  		     per-open.
143 *
144 * > 2. on each descriptor, the process issues an ioctl() to identify
145 * >    the interface that should report events to the file descriptor.
146 *
147 * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148 * 	Most important things happen in netmap_get_na() and
149 * 	netmap_do_regif(), called from there. Additional details can be
150 * 	found in the comments above those functions.
151 *
152 * 	In all cases, this action creates/takes-a-reference-to a
153 * 	netmap_*_adapter describing the port, and allocates a netmap_if
154 * 	and all necessary netmap rings, filling them with netmap buffers.
155 *
156 *      In this phase, the sync callbacks for each ring are set (these are used
157 *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
158 *      The adapter creation/initialization code puts them in the
159 * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
160 * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
161 * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
162 * 	actually call netmap_krings_create() to perform this and the other
163 * 	common stuff. netmap_krings_create() also takes care of the host rings,
164 * 	if needed, by setting their sync callbacks appropriately.
165 *
166 * 	Additional actions depend on the kind of netmap_adapter that has been
167 * 	registered:
168 *
169 * 	- netmap_hw_adapter:  	     [netmap.c]
170 * 	     This is a system netdev/ifp with native netmap support.
171 * 	     The ifp is detached from the host stack by redirecting:
172 * 	       - transmissions (from the network stack) to netmap_transmit()
173 * 	       - receive notifications to the nm_notify() callback for
174 * 	         this adapter. The callback is normally netmap_notify(), unless
175 * 	         the ifp is attached to a bridge using bwrap, in which case it
176 * 	         is netmap_bwrap_intr_notify().
177 *
178 * 	- netmap_generic_adapter:      [netmap_generic.c]
179 * 	      A system netdev/ifp without native netmap support.
180 *
181 * 	(the decision about native/non native support is taken in
182 * 	 netmap_get_hw_na(), called by netmap_get_na())
183 *
184 * 	- netmap_vp_adapter 		[netmap_vale.c]
185 * 	      Returned by netmap_get_bdg_na().
186 * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
187 * 	      are created on the fly if they don't already exist, and are
188 * 	      always attached to a bridge.
 * 	      Persistent VALE ports must be created separately, and
 * 	      then attached like normal NICs. The NIOCREGIF we are examining
 * 	      will find them only if they have previously been created and
192 * 	      attached (see VALE_CTL below).
193 *
194 * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
195 * 	      Returned by netmap_get_pipe_na().
196 * 	      Both pipe ends are created, if they didn't already exist.
197 *
198 * 	- netmap_monitor_adapter      [netmap_monitor.c]
199 * 	      Returned by netmap_get_monitor_na().
200 * 	      If successful, the nm_sync callbacks of the monitored adapter
201 * 	      will be intercepted by the returned monitor.
202 *
203 * 	- netmap_bwrap_adapter	      [netmap_vale.c]
204 * 	      Cannot be obtained in this way, see VALE_CTL below
205 *
206 *
207 * 	os-specific:
208 * 	    linux: we first go through linux_netmap_ioctl() to
209 * 	           adapt the FreeBSD interface to the linux one.
210 *
211 *
212 * > 3. on each descriptor, the process issues an mmap() request to
213 * >    map the shared memory region within the process' address space.
214 * >    The list of interesting queues is indicated by a location in
215 * >    the shared memory region.
216 *
217 *      os-specific:
218 *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219 *  	    linux:   linux_netmap_mmap (netmap_linux.c).
220 *
221 * > 4. using the functions in the netmap(4) userspace API, a process
222 * >    can look up the occupation state of a queue, access memory buffers,
223 * >    and retrieve received packets or enqueue packets to transmit.
224 *
225 * 	these actions do not involve the kernel.
226 *
227 * > 5. using some ioctl()s the process can synchronize the userspace view
228 * >    of the queue with the actual status in the kernel. This includes both
229 * >    receiving the notification of new packets, and transmitting new
230 * >    packets on the output interface.
231 *
232 * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233 * 	cases. They invoke the nm_sync callbacks on the netmap_kring
234 * 	structures, as initialized in step 2 and maybe later modified
235 * 	by a monitor. Monitors, however, will always call the original
236 * 	callback before doing anything else.
237 *
238 *
239 * > 6. select() or poll() can be used to wait for events on individual
240 * >    transmit or receive queues (or all queues for a given interface).
241 *
242 * 	Implemented in netmap_poll(). This will call the same nm_sync()
243 * 	callbacks as in step 5 above.
244 *
245 * 	os-specific:
246 * 		linux: we first go through linux_netmap_poll() to adapt
247 * 		       the FreeBSD interface to the linux one.
248 *
249 *
250 *  ----  VALE_CTL -----
251 *
252 *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
253 *  nr_cmd in the nmreq structure. These subcommands are handled by
254 *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255 *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256 *  subcommands, respectively.
257 *
258 *  Any network interface known to the system (including a persistent VALE
259 *  port) can be attached to a VALE switch by issuing the
260 *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261 *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
262 *  attachment of other interfaces, instead, requires the creation of a
263 *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
264 *  netmap mode. This may require the creation of a netmap_generic_adapter if
265 *  we have no native support for the interface, or if generic adapters have
266 *  been forced by sysctl.
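 *
 *  For example (a sketch from userspace; fd is an open descriptor on
 *  /dev/netmap, error handling is omitted, and the switch/port names
 *  are made up):
 *
 *	struct nmreq req;
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	req.nr_cmd = NETMAP_BDG_NEWIF;		// create persistent port "p0"
 *	strlcpy(req.nr_name, "p0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);
 *
 *	req.nr_cmd = NETMAP_BDG_ATTACH;		// attach it to switch "valeA"
 *	strlcpy(req.nr_name, "valeA:p0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);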
267 *
268 *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269 *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270 *  callback.  In the case of the bwrap, the callback creates the
271 *  netmap_bwrap_adapter.  The initialization of the bwrap is then
272 *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273 *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274 *  A generic adapter for the wrapped ifp will be created if needed, when
275 *  netmap_get_bdg_na() calls netmap_get_hw_na().
276 *
277 *
278 *  ---- DATAPATHS -----
279 *
280 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281 *
282 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283 *
284 *    - tx from netmap userspace:
285 *	 concurrently:
286 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287 *                kring->nm_sync() == DEVICE_netmap_txsync()
288 *           2) device interrupt handler
289 *                na->nm_notify()  == netmap_notify()
290 *    - rx from netmap userspace:
291 *       concurrently:
292 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293 *                kring->nm_sync() == DEVICE_netmap_rxsync()
294 *           2) device interrupt handler
295 *                na->nm_notify()  == netmap_notify()
296 *    - tx from host stack
297 *       concurrently:
298 *           1) host stack
299 *                netmap_transmit()
300 *                  na->nm_notify  == netmap_notify()
301 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302 *                kring->nm_sync() == netmap_rxsync_from_host_compat
303 *                  netmap_rxsync_from_host(na, NULL, NULL)
304 *    - tx to host stack
305 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
306 *             kring->nm_sync() == netmap_txsync_to_host_compat
307 *               netmap_txsync_to_host(na)
308 *                 NM_SEND_UP()
309 *                   FreeBSD: na->if_input() == ?? XXX
310 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311 *
312 *
313 *
314 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315 *
316 *
317 *
318 *                           -= VALE PORT =-
319 *
320 *
321 *
322 *                           -= NETMAP PIPE =-
323 *
324 *
325 *
326 *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
327 *
328 *
329 *
330 *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
331 *
332 *
333 *
334 *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
335 *
336 *
337 *
338 *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
339 *
340 *
341 *
342 */
343
344/*
345 * OS-specific code that is used only within this file.
346 * Other OS-specific code that must be accessed by drivers
347 * is present in netmap_kern.h
348 */
349
350#if defined(__FreeBSD__)
351#include <sys/cdefs.h> /* prerequisite */
352#include <sys/types.h>
353#include <sys/errno.h>
354#include <sys/param.h>	/* defines used in kernel.h */
355#include <sys/kernel.h>	/* types used in module initialization */
356#include <sys/conf.h>	/* cdevsw struct, UID, GID */
357#include <sys/filio.h>	/* FIONBIO */
358#include <sys/sockio.h>
359#include <sys/socketvar.h>	/* struct socket */
360#include <sys/malloc.h>
361#include <sys/poll.h>
362#include <sys/rwlock.h>
363#include <sys/socket.h> /* sockaddrs */
364#include <sys/selinfo.h>
365#include <sys/sysctl.h>
366#include <sys/jail.h>
367#include <net/vnet.h>
368#include <net/if.h>
369#include <net/if_var.h>
370#include <net/bpf.h>		/* BIOCIMMEDIATE */
371#include <machine/bus.h>	/* bus_dmamap_* */
372#include <sys/endian.h>
373#include <sys/refcount.h>
374
375
376/* reduce conditional code */
377// linux API, use for the knlist in FreeBSD
378/* use a private mutex for the knlist */
379#define init_waitqueue_head(x) do {			\
380	struct mtx *m = &(x)->m;			\
381	mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);	\
382	knlist_init_mtx(&(x)->si.si_note, m);		\
383    } while (0)
384
385#define OS_selrecord(a, b)	selrecord(a, &((b)->si))
386#define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
387
388#elif defined(linux)
389
390#include "bsd_glue.h"
391
392
393
394#elif defined(__APPLE__)
395
396#warning OSX support is only partial
397#include "osx_glue.h"
398
399#else
400
401#error	Unsupported platform
402
403#endif /* unsupported */
404
405/*
406 * common headers
407 */
408#include <net/netmap.h>
409#include <dev/netmap/netmap_kern.h>
410#include <dev/netmap/netmap_mem2.h>
411
412
413MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
414
415/*
416 * The following variables are used by the drivers and replicate
417 * fields in the global memory pool. They only refer to buffers
418 * used by physical interfaces.
419 */
420u_int netmap_total_buffers;
421u_int netmap_buf_size;
422char *netmap_buffer_base;	/* also address of an invalid buffer */
423
424/* user-controlled variables */
425int netmap_verbose;
426
427static int netmap_no_timestamp; /* don't timestamp on rxsync */
428
429SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
430SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
431    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
432SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
433    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
434int netmap_mitigate = 1;
435SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
436int netmap_no_pendintr = 1;
437SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
438    CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
439int netmap_txsync_retry = 2;
440SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
441    &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
442
443int netmap_adaptive_io = 0;
444SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
445    &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
446
447int netmap_flags = 0;	/* debug flags */
448int netmap_fwd = 0;	/* force transparent mode */
449int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
450
451/*
452 * netmap_admode selects the netmap mode to use.
453 * Invalid values are reset to NETMAP_ADMODE_BEST
454 */
455enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
456	NETMAP_ADMODE_NATIVE,	/* either native or none */
457	NETMAP_ADMODE_GENERIC,	/* force generic */
458	NETMAP_ADMODE_LAST };
459static int netmap_admode = NETMAP_ADMODE_BEST;
460
461int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
462int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
463int netmap_generic_rings = 1;   /* number of queues in generic. */
464
465SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
466SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
467SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
468SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
469SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
470SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
471SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
472
473NMG_LOCK_T	netmap_global_lock;
474
475
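/*
 * Acquire exclusive ("busy") ownership of a kring, sleeping a few ticks
 * at a time until the flag becomes available. The counterpart that
 * releases it is nm_kr_put() (netmap_kern.h).
 */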
476static void
477nm_kr_get(struct netmap_kring *kr)
478{
479	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
480		tsleep(kr, 0, "NM_KR_GET", 4);
481}
482
483
484/*
485 * mark the ring as stopped, and run through the locks
486 * to make sure other users get to see it.
487 */
488static void
489netmap_disable_ring(struct netmap_kring *kr)
490{
491	kr->nkr_stopped = 1;
492	nm_kr_get(kr);
493	mtx_lock(&kr->q_lock);
494	mtx_unlock(&kr->q_lock);
495	nm_kr_put(kr);
496}
497
498/* stop or enable a single tx ring */
499void
500netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
501{
502	if (stopped)
503		netmap_disable_ring(na->tx_rings + ring_id);
504	else
505		na->tx_rings[ring_id].nkr_stopped = 0;
	/* notify that the stopped state has changed. This is currently
	 * only used by bwrap to propagate the state to its own krings.
508	 * (see netmap_bwrap_intr_notify).
509	 */
510	na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
511}
512
513/* stop or enable a single rx ring */
514void
515netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
516{
517	if (stopped)
518		netmap_disable_ring(na->rx_rings + ring_id);
519	else
520		na->rx_rings[ring_id].nkr_stopped = 0;
	/* notify that the stopped state has changed. This is currently
	 * only used by bwrap to propagate the state to its own krings.
523	 * (see netmap_bwrap_intr_notify).
524	 */
525	na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
526}
527
528
529/* stop or enable all the rings of na */
530void
531netmap_set_all_rings(struct netmap_adapter *na, int stopped)
532{
533	int i;
534	u_int ntx, nrx;
535
536	if (!nm_netmap_on(na))
537		return;
538
539	ntx = netmap_real_tx_rings(na);
540	nrx = netmap_real_rx_rings(na);
541
542	for (i = 0; i < ntx; i++) {
543		netmap_set_txring(na, i, stopped);
544	}
545
546	for (i = 0; i < nrx; i++) {
547		netmap_set_rxring(na, i, stopped);
548	}
549}
550
551/*
552 * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
553 * to finish and prevents any new one from starting.  Call this before turning
 * netmap mode off, or before removing the hardware rings (e.g., on module
 * unload).  As a rule of thumb for linux drivers, this should be placed near
556 * each napi_disable().
557 */
558void
559netmap_disable_all_rings(struct ifnet *ifp)
560{
561	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
562}
563
564/*
565 * Convenience function used in drivers.  Re-enables rxsync and txsync on the
 * adapter's rings.  In linux drivers, this should be placed near each
567 * napi_enable().
568 */
569void
570netmap_enable_all_rings(struct ifnet *ifp)
571{
572	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
573}
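
/*
 * Typical driver usage of the two helpers above, e.g. in a reset or
 * reinit path (a sketch; DEVICE_reinit() is a made-up name):
 *
 *	DEVICE_reinit(struct ifnet *ifp)
 *	{
 *		netmap_disable_all_rings(ifp);	// wait for pending *xsync()s
 *		// ... stop, reprogram and restart the hardware rings ...
 *		netmap_enable_all_rings(ifp);
 *	}
 */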
574
575
576/*
577 * generic bound_checking function
578 */
579u_int
580nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
581{
582	u_int oldv = *v;
583	const char *op = NULL;
584
585	if (dflt < lo)
586		dflt = lo;
587	if (dflt > hi)
588		dflt = hi;
589	if (oldv < lo) {
590		*v = dflt;
591		op = "Bump";
592	} else if (oldv > hi) {
593		*v = hi;
594		op = "Clamp";
595	}
596	if (op && msg)
597		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
598	return *v;
599}
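
/*
 * Example use (a sketch; the actual bounds are chosen by each caller):
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic ringsize");
 *
 * resets the value to the default (1024) if it was below 64, clamps it
 * to 16384 if it was above, and logs any adjustment.
 */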
600
601
602/*
603 * packet-dump function, user-supplied or static buffer.
604 * The destination buffer must be at least 30+4*len
605 */
606const char *
607nm_dump_buf(char *p, int len, int lim, char *dst)
608{
609	static char _dst[8192];
610	int i, j, i0;
611	static char hex[] ="0123456789abcdef";
612	char *o;	/* output position */
613
614#define P_HI(x)	hex[((x) & 0xf0)>>4]
615#define P_LO(x)	hex[((x) & 0xf)]
616#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
617	if (!dst)
618		dst = _dst;
619	if (lim <= 0 || lim > len)
620		lim = len;
621	o = dst;
622	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
623	o += strlen(o);
624	/* hexdump routine */
625	for (i = 0; i < lim; ) {
626		sprintf(o, "%5d: ", i);
627		o += strlen(o);
628		memset(o, ' ', 48);
629		i0 = i;
630		for (j=0; j < 16 && i < lim; i++, j++) {
631			o[j*3] = P_HI(p[i]);
632			o[j*3+1] = P_LO(p[i]);
633		}
634		i = i0;
635		for (j=0; j < 16 && i < lim; i++, j++)
636			o[j + 48] = P_C(p[i]);
637		o[j+48] = '\n';
638		o += j+49;
639	}
640	*o = '\0';
641#undef P_HI
642#undef P_LO
643#undef P_C
644	return dst;
645}
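
/*
 * Example use (a sketch), dumping the first 128 bytes of a slot buffer
 * through the static scratch area:
 *
 *	D("%s", nm_dump_buf(NMB(na, slot), slot->len, 128, NULL));
 */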
646
647
648/*
649 * Fetch configuration from the device, to cope with dynamic
650 * reconfigurations after loading the module.
651 */
652/* call with NMG_LOCK held */
653int
654netmap_update_config(struct netmap_adapter *na)
655{
656	u_int txr, txd, rxr, rxd;
657
658	txr = txd = rxr = rxd = 0;
659	if (na->nm_config == NULL ||
660	    na->nm_config(na, &txr, &txd, &rxr, &rxd)) {
661		/* take whatever we had at init time */
662		txr = na->num_tx_rings;
663		txd = na->num_tx_desc;
664		rxr = na->num_rx_rings;
665		rxd = na->num_rx_desc;
666	}
667
668	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
669	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
670		return 0; /* nothing changed */
671	if (netmap_verbose || na->active_fds > 0) {
672		D("stored config %s: txring %d x %d, rxring %d x %d",
673			na->name,
674			na->num_tx_rings, na->num_tx_desc,
675			na->num_rx_rings, na->num_rx_desc);
676		D("new config %s: txring %d x %d, rxring %d x %d",
677			na->name, txr, txd, rxr, rxd);
678	}
679	if (na->active_fds == 0) {
680		D("configuration changed (but fine)");
681		na->num_tx_rings = txr;
682		na->num_tx_desc = txd;
683		na->num_rx_rings = rxr;
684		na->num_rx_desc = rxd;
685		return 0;
686	}
687	D("configuration changed while active, this is bad...");
688	return 1;
689}
690
691/* kring->nm_sync callback for the host tx ring */
692static int
693netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
694{
695	(void)flags; /* unused */
696	netmap_txsync_to_host(kring->na);
697	return 0;
698}
699
700/* kring->nm_sync callback for the host rx ring */
701static int
702netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
703{
704	(void)flags; /* unused */
705	netmap_rxsync_from_host(kring->na, NULL, NULL);
706	return 0;
707}
708
709
710
711/* create the krings array and initialize the fields common to all adapters.
712 * The array layout is this:
713 *
714 *                    +----------+
715 * na->tx_rings ----->|          | \
716 *                    |          |  } na->num_tx_ring
717 *                    |          | /
718 *                    +----------+
719 *                    |          |    host tx kring
720 * na->rx_rings ----> +----------+
721 *                    |          | \
722 *                    |          |  } na->num_rx_rings
723 *                    |          | /
724 *                    +----------+
725 *                    |          |    host rx kring
726 *                    +----------+
727 * na->tailroom ----->|          | \
728 *                    |          |  } tailroom bytes
729 *                    |          | /
730 *                    +----------+
731 *
732 * Note: for compatibility, host krings are created even when not needed.
733 * The tailroom space is currently used by vale ports for allocating leases.
734 */
735/* call with NMG_LOCK held */
736int
737netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
738{
739	u_int i, len, ndesc;
740	struct netmap_kring *kring;
741	u_int ntx, nrx;
742
743	/* account for the (possibly fake) host rings */
744	ntx = na->num_tx_rings + 1;
745	nrx = na->num_rx_rings + 1;
746
747	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
748
749	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
750	if (na->tx_rings == NULL) {
751		D("Cannot allocate krings");
752		return ENOMEM;
753	}
754	na->rx_rings = na->tx_rings + ntx;
755
756	/*
	 * All fields in krings are 0 except the ones initialized below,
	 * but better be explicit on important kring fields.
759	 */
760	ndesc = na->num_tx_desc;
761	for (i = 0; i < ntx; i++) { /* Transmit rings */
762		kring = &na->tx_rings[i];
763		bzero(kring, sizeof(*kring));
764		kring->na = na;
765		kring->ring_id = i;
766		kring->nkr_num_slots = ndesc;
767		if (i < na->num_tx_rings) {
768			kring->nm_sync = na->nm_txsync;
769		} else if (i == na->num_tx_rings) {
770			kring->nm_sync = netmap_txsync_to_host_compat;
771		}
772		/*
773		 * IMPORTANT: Always keep one slot empty.
774		 */
775		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
776		kring->rtail = kring->nr_hwtail = ndesc - 1;
777		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i);
778		ND("ktx %s h %d c %d t %d",
779			kring->name, kring->rhead, kring->rcur, kring->rtail);
780		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
781		init_waitqueue_head(&kring->si);
782	}
783
784	ndesc = na->num_rx_desc;
785	for (i = 0; i < nrx; i++) { /* Receive rings */
786		kring = &na->rx_rings[i];
787		bzero(kring, sizeof(*kring));
788		kring->na = na;
789		kring->ring_id = i;
790		kring->nkr_num_slots = ndesc;
791		if (i < na->num_rx_rings) {
792			kring->nm_sync = na->nm_rxsync;
793		} else if (i == na->num_rx_rings) {
794			kring->nm_sync = netmap_rxsync_from_host_compat;
795		}
796		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
797		kring->rtail = kring->nr_hwtail = 0;
798		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i);
799		ND("krx %s h %d c %d t %d",
800			kring->name, kring->rhead, kring->rcur, kring->rtail);
801		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
802		init_waitqueue_head(&kring->si);
803	}
804	init_waitqueue_head(&na->tx_si);
805	init_waitqueue_head(&na->rx_si);
806
807	na->tailroom = na->rx_rings + nrx;
808
809	return 0;
810}
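
/*
 * A typical nm_krings_create callback just wraps the function above.
 * A hypothetical sketch for an adapter that needs no tailroom:
 *
 *	static int
 *	my_krings_create(struct netmap_adapter *na)
 *	{
 *		return netmap_krings_create(na, 0);
 *	}
 *
 * VALE ports pass a non-zero tailroom instead; the extra space ends up
 * after the krings array and is pointed to by na->tailroom.
 */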
811
812
813#ifdef __FreeBSD__
814static void
815netmap_knlist_destroy(NM_SELINFO_T *si)
816{
817	/* XXX kqueue(9) needed; these will mirror knlist_init. */
818	knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
819	knlist_destroy(&si->si.si_note);
820	/* now we don't need the mutex anymore */
821	mtx_destroy(&si->m);
822}
823#endif /* __FreeBSD__ */
824
825
826/* undo the actions performed by netmap_krings_create */
827/* call with NMG_LOCK held */
828void
829netmap_krings_delete(struct netmap_adapter *na)
830{
831	struct netmap_kring *kring = na->tx_rings;
832
833	/* we rely on the krings layout described above */
834	for ( ; kring != na->tailroom; kring++) {
835		mtx_destroy(&kring->q_lock);
836		netmap_knlist_destroy(&kring->si);
837	}
838	free(na->tx_rings, M_DEVBUF);
839	na->tx_rings = na->rx_rings = na->tailroom = NULL;
840}
841
842
843/*
844 * Destructor for NIC ports. They also have an mbuf queue
845 * on the rings connected to the host so we need to purge
846 * them first.
847 */
848/* call with NMG_LOCK held */
849static void
850netmap_hw_krings_delete(struct netmap_adapter *na)
851{
852	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
853
854	ND("destroy sw mbq with len %d", mbq_len(q));
855	mbq_purge(q);
856	mbq_safe_destroy(q);
857	netmap_krings_delete(na);
858}
859
860
861/* create a new netmap_if for a newly registered fd.
862 * If this is the first registration of the adapter,
863 * also create the netmap rings and their in-kernel view,
864 * the netmap krings.
865 */
866/* call with NMG_LOCK held */
867static struct netmap_if*
868netmap_if_new(struct netmap_adapter *na)
869{
870	struct netmap_if *nifp;
871
872	if (netmap_update_config(na)) {
873		/* configuration mismatch, report and fail */
874		return NULL;
875	}
876
877	if (na->active_fds)	/* already registered */
878		goto final;
879
880	/* create and init the krings arrays.
881	 * Depending on the adapter, this may also create
882	 * the netmap rings themselves
883	 */
884	if (na->nm_krings_create(na))
885		return NULL;
886
887	/* create all missing netmap rings */
888	if (netmap_mem_rings_create(na))
889		goto cleanup;
890
891final:
892
893	/* in all cases, create a new netmap if */
894	nifp = netmap_mem_if_new(na);
895	if (nifp == NULL)
896		goto cleanup;
897
898	return (nifp);
899
900cleanup:
901
902	if (na->active_fds == 0) {
903		netmap_mem_rings_delete(na);
904		na->nm_krings_delete(na);
905	}
906
907	return NULL;
908}
909
910
911/* grab a reference to the memory allocator, if we don't have one already.  The
912 * reference is taken from the netmap_adapter registered with the priv.
913 */
914/* call with NMG_LOCK held */
915static int
916netmap_get_memory_locked(struct netmap_priv_d* p)
917{
918	struct netmap_mem_d *nmd;
919	int error = 0;
920
921	if (p->np_na == NULL) {
922		if (!netmap_mmap_unreg)
923			return ENODEV;
924		/* for compatibility with older versions of the API
925 		 * we use the global allocator when no interface has been
926 		 * registered
927 		 */
928		nmd = &nm_mem;
929	} else {
930		nmd = p->np_na->nm_mem;
931	}
932	if (p->np_mref == NULL) {
933		error = netmap_mem_finalize(nmd, p->np_na);
934		if (!error)
935			p->np_mref = nmd;
936	} else if (p->np_mref != nmd) {
937		/* a virtual port has been registered, but previous
938 		 * syscalls already used the global allocator.
939 		 * We cannot continue
940 		 */
941		error = ENODEV;
942	}
943	return error;
944}
945
946
947/* call with NMG_LOCK *not* held */
948int
949netmap_get_memory(struct netmap_priv_d* p)
950{
951	int error;
952	NMG_LOCK();
953	error = netmap_get_memory_locked(p);
954	NMG_UNLOCK();
955	return error;
956}
957
958
959/* call with NMG_LOCK held */
960static int
961netmap_have_memory_locked(struct netmap_priv_d* p)
962{
963	return p->np_mref != NULL;
964}
965
966
967/* call with NMG_LOCK held */
968static void
969netmap_drop_memory_locked(struct netmap_priv_d* p)
970{
971	if (p->np_mref) {
972		netmap_mem_deref(p->np_mref, p->np_na);
973		p->np_mref = NULL;
974	}
975}
976
977
978/*
979 * Call nm_register(ifp,0) to stop netmap mode on the interface and
980 * revert to normal operation.
981 * The second argument is the nifp to work on. In some cases it is
982 * not attached yet to the netmap_priv_d so we need to pass it as
983 * a separate argument.
984 */
985/* call with NMG_LOCK held */
986static void
987netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
988{
989	struct netmap_adapter *na = priv->np_na;
990
991	NMG_LOCK_ASSERT();
992	na->active_fds--;
993	if (na->active_fds <= 0) {	/* last instance */
994
995		if (netmap_verbose)
996			D("deleting last instance for %s", na->name);
997		/*
998		 * (TO CHECK) This function is only called
999		 * when the last reference to this file descriptor goes
1000		 * away. This means we cannot have any pending poll()
1001		 * or interrupt routine operating on the structure.
1002		 * XXX The file may be closed in a thread while
1003		 * another thread is using it.
1004		 * Linux keeps the file opened until the last reference
1005		 * by any outstanding ioctl/poll or mmap is gone.
1006		 * FreeBSD does not track mmap()s (but we do) and
1007		 * wakes up any sleeping poll(). Need to check what
1008		 * happens if the close() occurs while a concurrent
1009		 * syscall is running.
1010		 */
1011		na->nm_register(na, 0); /* off, clear flags */
1012		/* Wake up any sleeping threads. netmap_poll will
1013		 * then return POLLERR
1014		 * XXX The wake up now must happen during *_down(), when
1015		 * we order all activities to stop. -gl
1016		 */
1017		netmap_knlist_destroy(&na->tx_si);
1018		netmap_knlist_destroy(&na->rx_si);
1019
1020		/* delete rings and buffers */
1021		netmap_mem_rings_delete(na);
1022		na->nm_krings_delete(na);
1023	}
1024	/* delete the nifp */
1025	netmap_mem_if_delete(na, nifp);
1026}
1027
1028/* call with NMG_LOCK held */
1029static __inline int
1030nm_tx_si_user(struct netmap_priv_d *priv)
1031{
1032	return (priv->np_na != NULL &&
1033		(priv->np_txqlast - priv->np_txqfirst > 1));
1034}
1035
1036/* call with NMG_LOCK held */
1037static __inline int
1038nm_rx_si_user(struct netmap_priv_d *priv)
1039{
1040	return (priv->np_na != NULL &&
1041		(priv->np_rxqlast - priv->np_rxqfirst > 1));
1042}
1043
1044
1045/*
1046 * Destructor of the netmap_priv_d, called when the fd has
1047 * no active open() and mmap(). Also called in error paths.
1048 *
1049 * returns 1 if this is the last instance and we can free priv
1050 */
1051/* call with NMG_LOCK held */
1052int
1053netmap_dtor_locked(struct netmap_priv_d *priv)
1054{
1055	struct netmap_adapter *na = priv->np_na;
1056
1057#ifdef __FreeBSD__
1058	/*
1059	 * np_refcount is the number of active mmaps on
1060	 * this file descriptor
1061	 */
1062	if (--priv->np_refcount > 0) {
1063		return 0;
1064	}
1065#endif /* __FreeBSD__ */
1066	if (!na) {
1067	    return 1; //XXX is it correct?
1068	}
1069	netmap_do_unregif(priv, priv->np_nifp);
1070	priv->np_nifp = NULL;
1071	netmap_drop_memory_locked(priv);
1072	if (priv->np_na) {
1073		if (nm_tx_si_user(priv))
1074			na->tx_si_users--;
1075		if (nm_rx_si_user(priv))
1076			na->rx_si_users--;
1077		netmap_adapter_put(na);
1078		priv->np_na = NULL;
1079	}
1080	return 1;
1081}
1082
1083
1084/* call with NMG_LOCK *not* held */
1085void
1086netmap_dtor(void *data)
1087{
1088	struct netmap_priv_d *priv = data;
1089	int last_instance;
1090
1091	NMG_LOCK();
1092	last_instance = netmap_dtor_locked(priv);
1093	NMG_UNLOCK();
1094	if (last_instance) {
1095		bzero(priv, sizeof(*priv));	/* for safety */
1096		free(priv, M_DEVBUF);
1097	}
1098}
1099
1100
1101
1102
1103/*
1104 * Handlers for synchronization of the queues from/to the host.
1105 * Netmap has two operating modes:
1106 * - in the default mode, the rings connected to the host stack are
1107 *   just another ring pair managed by userspace;
1108 * - in transparent mode (XXX to be defined) incoming packets
1109 *   (from the host or the NIC) are marked as NS_FORWARD upon
1110 *   arrival, and the user application has a chance to reset the
1111 *   flag for packets that should be dropped.
1112 *   On the RXSYNC or poll(), packets in RX rings between
 *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1114 *   to the other side.
1115 * The transfer NIC --> host is relatively easy, just encapsulate
1116 * into mbufs and we are done. The host --> NIC side is slightly
1117 * harder because there might not be room in the tx ring so it
1118 * might take a while before releasing the buffer.
1119 */
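
/*
 * For example, an application handling the host RX ring in transparent
 * mode could drop selected packets as sketched below (should_drop() is
 * a made-up predicate; the rest follows the description above):
 *
 *	uint32_t i;
 *
 *	for (i = ring->head; i != ring->tail; i = nm_ring_next(ring, i)) {
 *		struct netmap_slot *slot = &ring->slot[i];
 *
 *		if (should_drop(NETMAP_BUF(ring, slot->buf_idx), slot->len))
 *			slot->flags &= ~NS_FORWARD;	// do not forward
 *	}
 *	ring->head = ring->cur = i;
 *
 * Slots released with NS_FORWARD still set are then moved to the other
 * side on the next rxsync or poll().
 */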
1120
1121
1122/*
1123 * pass a chain of buffers to the host stack as coming from 'dst'
1124 * We do not need to lock because the queue is private.
1125 */
1126static void
1127netmap_send_up(struct ifnet *dst, struct mbq *q)
1128{
1129	struct mbuf *m;
1130
1131	/* send packets up, outside the lock */
1132	while ((m = mbq_dequeue(q)) != NULL) {
1133		if (netmap_verbose & NM_VERB_HOST)
1134			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1135		NM_SEND_UP(dst, m);
1136	}
1137	mbq_destroy(q);
1138}
1139
1140
1141/*
1142 * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1143 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1144 * and pass them up. Drop remaining packets in the unlikely event
1145 * of an mbuf shortage.
1146 */
1147static void
1148netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1149{
1150	u_int const lim = kring->nkr_num_slots - 1;
1151	u_int const head = kring->ring->head;
1152	u_int n;
1153	struct netmap_adapter *na = kring->na;
1154
1155	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1156		struct mbuf *m;
1157		struct netmap_slot *slot = &kring->ring->slot[n];
1158
1159		if ((slot->flags & NS_FORWARD) == 0 && !force)
1160			continue;
1161		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1162			RD(5, "bad pkt at %d len %d", n, slot->len);
1163			continue;
1164		}
1165		slot->flags &= ~NS_FORWARD; // XXX needed ?
1166		/* XXX TODO: adapt to the case of a multisegment packet */
1167		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1168
1169		if (m == NULL)
1170			break;
1171		mbq_enqueue(q, m);
1172	}
1173}
1174
1175
1176/*
1177 * Send to the NIC rings packets marked NS_FORWARD between
1178 * kring->nr_hwcur and kring->rhead
1179 * Called under kring->rx_queue.lock on the sw rx ring,
1180 */
1181static u_int
1182netmap_sw_to_nic(struct netmap_adapter *na)
1183{
1184	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1185	struct netmap_slot *rxslot = kring->ring->slot;
1186	u_int i, rxcur = kring->nr_hwcur;
1187	u_int const head = kring->rhead;
1188	u_int const src_lim = kring->nkr_num_slots - 1;
1189	u_int sent = 0;
1190
1191	/* scan rings to find space, then fill as much as possible */
1192	for (i = 0; i < na->num_tx_rings; i++) {
1193		struct netmap_kring *kdst = &na->tx_rings[i];
1194		struct netmap_ring *rdst = kdst->ring;
1195		u_int const dst_lim = kdst->nkr_num_slots - 1;
1196
1197		/* XXX do we trust ring or kring->rcur,rtail ? */
1198		for (; rxcur != head && !nm_ring_empty(rdst);
1199		     rxcur = nm_next(rxcur, src_lim) ) {
1200			struct netmap_slot *src, *dst, tmp;
1201			u_int dst_cur = rdst->cur;
1202
1203			src = &rxslot[rxcur];
1204			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1205				continue;
1206
1207			sent++;
1208
1209			dst = &rdst->slot[dst_cur];
1210
1211			tmp = *src;
1212
1213			src->buf_idx = dst->buf_idx;
1214			src->flags = NS_BUF_CHANGED;
1215
1216			dst->buf_idx = tmp.buf_idx;
1217			dst->len = tmp.len;
1218			dst->flags = NS_BUF_CHANGED;
1219
1220			rdst->cur = nm_next(dst_cur, dst_lim);
1221		}
1222		/* if (sent) XXX txsync ? */
1223	}
1224	return sent;
1225}
1226
1227
1228/*
1229 * netmap_txsync_to_host() passes packets up. We are called from a
1230 * system call in user process context, and the only contention
1231 * can be among multiple user threads erroneously calling
1232 * this routine concurrently.
1233 */
1234void
1235netmap_txsync_to_host(struct netmap_adapter *na)
1236{
1237	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1238	struct netmap_ring *ring = kring->ring;
1239	u_int const lim = kring->nkr_num_slots - 1;
1240	u_int const head = kring->rhead;
1241	struct mbq q;
1242
1243	/* Take packets from hwcur to head and pass them up.
1244	 * force head = cur since netmap_grab_packets() stops at head
1245	 * In case of no buffers we give up. At the end of the loop,
1246	 * the queue is drained in all cases.
1247	 */
1248	mbq_init(&q);
1249	ring->cur = head;
1250	netmap_grab_packets(kring, &q, 1 /* force */);
1251	ND("have %d pkts in queue", mbq_len(&q));
1252	kring->nr_hwcur = head;
1253	kring->nr_hwtail = head + lim;
1254	if (kring->nr_hwtail > lim)
1255		kring->nr_hwtail -= lim + 1;
1256	nm_txsync_finalize(kring);
1257
1258	netmap_send_up(na->ifp, &q);
1259}
1260
1261
1262/*
1263 * rxsync backend for packets coming from the host stack.
1264 * They have been put in kring->rx_queue by netmap_transmit().
1265 * We protect access to the kring using kring->rx_queue.lock
1266 *
1267 * This routine also does the selrecord if called from the poll handler
1268 * (we know because td != NULL).
1269 *
1270 * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1271 *     as an additional hidden argument.
1272 * returns the number of packets delivered to tx queues in
1273 * transparent mode, or a negative value if error
1274 */
1275int
1276netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1277{
1278	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1279	struct netmap_ring *ring = kring->ring;
1280	u_int nm_i, n;
1281	u_int const lim = kring->nkr_num_slots - 1;
1282	u_int const head = kring->rhead;
1283	int ret = 0;
1284	struct mbq *q = &kring->rx_queue;
1285
1286	(void)pwait;	/* disable unused warnings */
1287	(void)td;
1288
1289	mbq_lock(q);
1290
1291	/* First part: import newly received packets */
1292	n = mbq_len(q);
1293	if (n) { /* grab packets from the queue */
1294		struct mbuf *m;
1295		uint32_t stop_i;
1296
1297		nm_i = kring->nr_hwtail;
1298		stop_i = nm_prev(nm_i, lim);
1299		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1300			int len = MBUF_LEN(m);
1301			struct netmap_slot *slot = &ring->slot[nm_i];
1302
1303			m_copydata(m, 0, len, NMB(na, slot));
1304			ND("nm %d len %d", nm_i, len);
1305			if (netmap_verbose)
1306                                D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1307
1308			slot->len = len;
1309			slot->flags = kring->nkr_slot_flags;
1310			nm_i = nm_next(nm_i, lim);
1311			m_freem(m);
1312		}
1313		kring->nr_hwtail = nm_i;
1314	}
1315
1316	/*
1317	 * Second part: skip past packets that userspace has released.
1318	 */
1319	nm_i = kring->nr_hwcur;
1320	if (nm_i != head) { /* something was released */
1321		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1322			ret = netmap_sw_to_nic(na);
1323		kring->nr_hwcur = head;
1324	}
1325
1326	nm_rxsync_finalize(kring);
1327
1328	/* access copies of cur,tail in the kring */
1329	if (kring->rcur == kring->rtail && td) /* no bufs available */
1330		OS_selrecord(td, &kring->si);
1331
1332	mbq_unlock(q);
1333	return ret;
1334}
1335
1336
1337/* Get a netmap adapter for the port.
1338 *
1339 * If it is possible to satisfy the request, return 0
1340 * with *na containing the netmap adapter found.
1341 * Otherwise return an error code, with *na containing NULL.
1342 *
1343 * When the port is attached to a bridge, we always return
1344 * EBUSY.
1345 * Otherwise, if the port is already bound to a file descriptor,
1346 * then we unconditionally return the existing adapter into *na.
1347 * In all the other cases, we return (into *na) either native,
1348 * generic or NULL, according to the following table:
1349 *
1350 *					native_support
1351 * active_fds   dev.netmap.admode         YES     NO
1352 * -------------------------------------------------------
1353 *    >0              *                 NA(ifp) NA(ifp)
1354 *
1355 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1356 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1357 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1358 *
1359 */
1360
1361int
1362netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1363{
1364	/* generic support */
1365	int i = netmap_admode;	/* Take a snapshot. */
1366	int error = 0;
1367	struct netmap_adapter *prev_na;
1368	struct netmap_generic_adapter *gna;
1369
1370	*na = NULL; /* default */
1371
1372	/* reset in case of invalid value */
1373	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1374		i = netmap_admode = NETMAP_ADMODE_BEST;
1375
1376	if (NETMAP_CAPABLE(ifp)) {
1377		prev_na = NA(ifp);
1378		/* If an adapter already exists, return it if
1379		 * there are active file descriptors or if
1380		 * netmap is not forced to use generic
1381		 * adapters.
1382		 */
1383		if (NETMAP_OWNED_BY_ANY(prev_na)
1384			|| i != NETMAP_ADMODE_GENERIC
1385			|| prev_na->na_flags & NAF_FORCE_NATIVE
1386#ifdef WITH_PIPES
1387			/* ugly, but we cannot allow an adapter switch
1388			 * if some pipe is referring to this one
1389			 */
1390			|| prev_na->na_next_pipe > 0
1391#endif
1392		) {
1393			*na = prev_na;
1394			return 0;
1395		}
1396	}
1397
1398	/* If there isn't native support and netmap is not allowed
1399	 * to use generic adapters, we cannot satisfy the request.
1400	 */
1401	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1402		return EOPNOTSUPP;
1403
1404	/* Otherwise, create a generic adapter and return it,
1405	 * saving the previously used netmap adapter, if any.
1406	 *
1407	 * Note that here 'prev_na', if not NULL, MUST be a
1408	 * native adapter, and CANNOT be a generic one. This is
1409	 * true because generic adapters are created on demand, and
1410	 * destroyed when not used anymore. Therefore, if the adapter
1411	 * currently attached to an interface 'ifp' is generic, it
1412	 * must be that
1413	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1414	 * Consequently, if NA(ifp) is generic, we will enter one of
1415	 * the branches above. This ensures that we never override
1416	 * a generic adapter with another generic adapter.
1417	 */
1418	prev_na = NA(ifp);
1419	error = generic_netmap_attach(ifp);
1420	if (error)
1421		return error;
1422
1423	*na = NA(ifp);
1424	gna = (struct netmap_generic_adapter*)NA(ifp);
1425	gna->prev = prev_na; /* save old na */
1426	if (prev_na != NULL) {
1427		ifunit_ref(ifp->if_xname);
1428		// XXX add a refcount ?
1429		netmap_adapter_get(prev_na);
1430	}
1431	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1432
1433	return 0;
1434}
1435
1436
1437/*
1438 * MUST BE CALLED UNDER NMG_LOCK()
1439 *
1440 * Get a refcounted reference to a netmap adapter attached
1441 * to the interface specified by nmr.
1442 * This is always called in the execution of an ioctl().
1443 *
1444 * Return ENXIO if the interface specified by the request does
1445 * not exist, ENOTSUP if netmap is not supported by the interface,
1446 * EBUSY if the interface is already attached to a bridge,
1447 * EINVAL if parameters are invalid, ENOMEM if needed resources
1448 * could not be allocated.
1449 * If successful, hold a reference to the netmap adapter.
1450 *
1451 * No reference is kept on the real interface, which may then
1452 * disappear at any time.
1453 */
1454int
1455netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1456{
1457	struct ifnet *ifp = NULL;
1458	int error = 0;
1459	struct netmap_adapter *ret = NULL;
1460
1461	*na = NULL;     /* default return value */
1462
1463	NMG_LOCK_ASSERT();
1464
	/* we cascade through all possible types of netmap adapter.
1466	 * All netmap_get_*_na() functions return an error and an na,
1467	 * with the following combinations:
1468	 *
1469	 * error    na
1470	 *   0	   NULL		type doesn't match
1471	 *  !0	   NULL		type matches, but na creation/lookup failed
1472	 *   0	  !NULL		type matches and na created/found
1473	 *  !0    !NULL		impossible
1474	 */
1475
1476	/* try to see if this is a monitor port */
1477	error = netmap_get_monitor_na(nmr, na, create);
1478	if (error || *na != NULL)
1479		return error;
1480
1481	/* try to see if this is a pipe port */
1482	error = netmap_get_pipe_na(nmr, na, create);
1483	if (error || *na != NULL)
1484		return error;
1485
1486	/* try to see if this is a bridge port */
1487	error = netmap_get_bdg_na(nmr, na, create);
1488	if (error)
1489		return error;
1490
1491	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1492		goto pipes;
1493
1494	/*
1495	 * This must be a hardware na, lookup the name in the system.
1496	 * Note that by hardware we actually mean "it shows up in ifconfig".
1497	 * This may still be a tap, a veth/epair, or even a
1498	 * persistent VALE port.
1499	 */
1500	ifp = ifunit_ref(nmr->nr_name);
1501	if (ifp == NULL) {
1502	        return ENXIO;
1503	}
1504
1505	error = netmap_get_hw_na(ifp, &ret);
1506	if (error)
1507		goto out;
1508
1509	*na = ret;
1510	netmap_adapter_get(ret);
1511
1512pipes:
1513	/*
1514	 * If we are opening a pipe whose parent was not in netmap mode,
1515	 * we have to allocate the pipe array now.
1516	 * XXX get rid of this clumsiness (2014-03-15)
1517	 */
1518	error = netmap_pipe_alloc(*na, nmr);
1519
1520out:
1521	if (error && ret != NULL)
1522		netmap_adapter_put(ret);
1523
1524	if (ifp)
1525		if_rele(ifp); /* allow live unloading of drivers modules */
1526
1527	return error;
1528}
1529
1530
1531/*
1532 * validate parameters on entry for *_txsync()
 * Returns ring->head if ok, or something >= kring->nkr_num_slots
1534 * in case of error.
1535 *
1536 * rhead, rcur and rtail=hwtail are stored from previous round.
1537 * hwcur is the next packet to send to the ring.
1538 *
1539 * We want
1540 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1541 *
1542 * hwcur, rhead, rtail and hwtail are reliable
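 *
 * For example, with 512 slots, hwcur = rhead = 100 and rtail = 400 from
 * the previous round, an application that advances head to 150 and cur
 * to 200 passes the checks (100 <= 150 <= 200 <= 400); head = 90 or
 * cur = 450 would instead fail and the caller would reinitialize the
 * ring (netmap_ring_reinit()).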
1543 */
1544u_int
1545nm_txsync_prologue(struct netmap_kring *kring)
1546{
1547	struct netmap_ring *ring = kring->ring;
1548	u_int head = ring->head; /* read only once */
1549	u_int cur = ring->cur; /* read only once */
1550	u_int n = kring->nkr_num_slots;
1551
1552	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1553		kring->name,
1554		kring->nr_hwcur, kring->nr_hwtail,
1555		ring->head, ring->cur, ring->tail);
1556#if 1 /* kernel sanity checks; but we can trust the kring. */
1557	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1558	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1559		goto error;
1560#endif /* kernel sanity checks */
1561	/*
1562	 * user sanity checks. We only use 'cur',
1563	 * A, B, ... are possible positions for cur:
1564	 *
1565	 *  0    A  cur   B  tail  C  n-1
1566	 *  0    D  tail  E  cur   F  n-1
1567	 *
1568	 * B, F, D are valid. A, C, E are wrong
1569	 */
1570	if (kring->rtail >= kring->rhead) {
1571		/* want rhead <= head <= rtail */
1572		if (head < kring->rhead || head > kring->rtail)
1573			goto error;
1574		/* and also head <= cur <= rtail */
1575		if (cur < head || cur > kring->rtail)
1576			goto error;
1577	} else { /* here rtail < rhead */
1578		/* we need head outside rtail .. rhead */
1579		if (head > kring->rtail && head < kring->rhead)
1580			goto error;
1581
1582		/* two cases now: head <= rtail or head >= rhead  */
1583		if (head <= kring->rtail) {
1584			/* want head <= cur <= rtail */
1585			if (cur < head || cur > kring->rtail)
1586				goto error;
1587		} else { /* head >= rhead */
1588			/* cur must be outside rtail..head */
1589			if (cur > kring->rtail && cur < head)
1590				goto error;
1591		}
1592	}
1593	if (ring->tail != kring->rtail) {
1594		RD(5, "tail overwritten was %d need %d",
1595			ring->tail, kring->rtail);
1596		ring->tail = kring->rtail;
1597	}
1598	kring->rhead = head;
1599	kring->rcur = cur;
1600	return head;
1601
1602error:
1603	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1604		kring->name,
1605		kring->nr_hwcur,
1606		kring->rcur, kring->nr_hwtail,
1607		cur, ring->tail);
1608	return n;
1609}
1610
1611
1612/*
1613 * validate parameters on entry for *_rxsync()
1614 * Returns ring->head if ok, kring->nkr_num_slots on error.
1615 *
1616 * For a valid configuration,
1617 * hwcur <= head <= cur <= tail <= hwtail
1618 *
1619 * We only consider head and cur.
1620 * hwcur and hwtail are reliable.
1621 *
1622 */
1623u_int
1624nm_rxsync_prologue(struct netmap_kring *kring)
1625{
1626	struct netmap_ring *ring = kring->ring;
1627	uint32_t const n = kring->nkr_num_slots;
1628	uint32_t head, cur;
1629
1630	ND("%s kc %d kt %d h %d c %d t %d",
1631		kring->name,
1632		kring->nr_hwcur, kring->nr_hwtail,
1633		ring->head, ring->cur, ring->tail);
1634	/*
1635	 * Before storing the new values, we should check they do not
1636	 * move backwards. However:
1637	 * - head is not an issue because the previous value is hwcur;
1638	 * - cur could in principle go back, however it does not matter
1639	 *   because we are processing a brand new rxsync()
1640	 */
1641	cur = kring->rcur = ring->cur;	/* read only once */
1642	head = kring->rhead = ring->head;	/* read only once */
1643#if 1 /* kernel sanity checks */
1644	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1645		goto error;
1646#endif /* kernel sanity checks */
1647	/* user sanity checks */
1648	if (kring->nr_hwtail >= kring->nr_hwcur) {
1649		/* want hwcur <= rhead <= hwtail */
1650		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1651			goto error;
1652		/* and also rhead <= rcur <= hwtail */
1653		if (cur < head || cur > kring->nr_hwtail)
1654			goto error;
1655	} else {
1656		/* we need rhead outside hwtail..hwcur */
1657		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1658			goto error;
1659		/* two cases now: head <= hwtail or head >= hwcur  */
1660		if (head <= kring->nr_hwtail) {
1661			/* want head <= cur <= hwtail */
1662			if (cur < head || cur > kring->nr_hwtail)
1663				goto error;
1664		} else {
1665			/* cur must be outside hwtail..head */
1666			if (cur < head && cur > kring->nr_hwtail)
1667				goto error;
1668		}
1669	}
1670	if (ring->tail != kring->rtail) {
1671		RD(5, "%s tail overwritten was %d need %d",
1672			kring->name,
1673			ring->tail, kring->rtail);
1674		ring->tail = kring->rtail;
1675	}
1676	return head;
1677
1678error:
1679	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1680		kring->nr_hwcur,
1681		kring->rcur, kring->nr_hwtail,
1682		kring->rhead, kring->rcur, ring->tail);
1683	return n;
1684}
1685
1686
1687/*
1688 * Error routine called when txsync/rxsync detects an error.
 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1690 * Return 1 on reinit.
1691 *
1692 * This routine is only called by the upper half of the kernel.
1693 * It only reads hwcur (which is changed only by the upper half, too)
1694 * and hwtail (which may be changed by the lower half, but only on
1695 * a tx ring and only to increase it, so any error will be recovered
1696 * on the next call). For the above, we don't strictly need to call
1697 * it under lock.
1698 */
1699int
1700netmap_ring_reinit(struct netmap_kring *kring)
1701{
1702	struct netmap_ring *ring = kring->ring;
1703	u_int i, lim = kring->nkr_num_slots - 1;
1704	int errors = 0;
1705
1706	// XXX KASSERT nm_kr_tryget
1707	RD(10, "called for %s", kring->name);
1708	// XXX probably wrong to trust userspace
1709	kring->rhead = ring->head;
1710	kring->rcur  = ring->cur;
1711	kring->rtail = ring->tail;
1712
1713	if (ring->cur > lim)
1714		errors++;
1715	if (ring->head > lim)
1716		errors++;
1717	if (ring->tail > lim)
1718		errors++;
1719	for (i = 0; i <= lim; i++) {
1720		u_int idx = ring->slot[i].buf_idx;
1721		u_int len = ring->slot[i].len;
1722		if (idx < 2 || idx >= netmap_total_buffers) {
1723			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1724			ring->slot[i].buf_idx = 0;
1725			ring->slot[i].len = 0;
1726		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1727			ring->slot[i].len = 0;
1728			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1729		}
1730	}
1731	if (errors) {
1732		RD(10, "total %d errors", errors);
1733		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1734			kring->name,
1735			ring->cur, kring->nr_hwcur,
1736			ring->tail, kring->nr_hwtail);
1737		ring->head = kring->rhead = kring->nr_hwcur;
1738		ring->cur  = kring->rcur  = kring->nr_hwcur;
1739		ring->tail = kring->rtail = kring->nr_hwtail;
1740	}
1741	return (errors ? 1 : 0);
1742}
1743
1744/* interpret the ringid and flags fields of an nmreq, by translating them
1745 * into a pair of intervals of ring indices:
1746 *
1747 * [priv->np_txqfirst, priv->np_txqlast) and
1748 * [priv->np_rxqfirst, priv->np_rxqlast)
1749 *
1750 */
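/*
 * Example of the translation (illustrative only, numbers assumed):
 * on an adapter with 4 hw tx/rx ring pairs plus host rings,
 *	NR_REG_ALL_NIC			-> tx [0,4)  rx [0,4)
 *	NR_REG_ONE_NIC, ringid = 2	-> tx [2,3)  rx [2,3)
 *	NR_REG_SW			-> tx [4,5)  rx [4,5)  (host rings only)
 *	NR_REG_NIC_SW			-> tx [0,5)  rx [0,5)  (hw + host rings)
 */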
1751int
1752netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1753{
1754	struct netmap_adapter *na = priv->np_na;
1755	u_int j, i = ringid & NETMAP_RING_MASK;
1756	u_int reg = flags & NR_REG_MASK;
1757
1758	if (reg == NR_REG_DEFAULT) {
1759		/* convert from old ringid to flags */
1760		if (ringid & NETMAP_SW_RING) {
1761			reg = NR_REG_SW;
1762		} else if (ringid & NETMAP_HW_RING) {
1763			reg = NR_REG_ONE_NIC;
1764		} else {
1765			reg = NR_REG_ALL_NIC;
1766		}
1767		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1768	}
1769	switch (reg) {
1770	case NR_REG_ALL_NIC:
1771	case NR_REG_PIPE_MASTER:
1772	case NR_REG_PIPE_SLAVE:
1773		priv->np_txqfirst = 0;
1774		priv->np_txqlast = na->num_tx_rings;
1775		priv->np_rxqfirst = 0;
1776		priv->np_rxqlast = na->num_rx_rings;
1777		ND("%s %d %d", "ALL/PIPE",
1778			priv->np_rxqfirst, priv->np_rxqlast);
1779		break;
1780	case NR_REG_SW:
1781	case NR_REG_NIC_SW:
1782		if (!(na->na_flags & NAF_HOST_RINGS)) {
1783			D("host rings not supported");
1784			return EINVAL;
1785		}
1786		priv->np_txqfirst = (reg == NR_REG_SW ?
1787			na->num_tx_rings : 0);
1788		priv->np_txqlast = na->num_tx_rings + 1;
1789		priv->np_rxqfirst = (reg == NR_REG_SW ?
1790			na->num_rx_rings : 0);
1791		priv->np_rxqlast = na->num_rx_rings + 1;
1792		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1793			priv->np_rxqfirst, priv->np_rxqlast);
1794		break;
1795	case NR_REG_ONE_NIC:
1796		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1797			D("invalid ring id %d", i);
1798			return EINVAL;
1799		}
1800		/* if not enough rings, use the first one */
1801		j = i;
1802		if (j >= na->num_tx_rings)
1803			j = 0;
1804		priv->np_txqfirst = j;
1805		priv->np_txqlast = j + 1;
1806		j = i;
1807		if (j >= na->num_rx_rings)
1808			j = 0;
1809		priv->np_rxqfirst = j;
1810		priv->np_rxqlast = j + 1;
1811		break;
1812	default:
1813		D("invalid regif type %d", reg);
1814		return EINVAL;
1815	}
1816	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1817
1818	if (netmap_verbose) {
1819		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1820			na->name,
1821			priv->np_txqfirst,
1822			priv->np_txqlast,
1823			priv->np_rxqfirst,
1824			priv->np_rxqlast,
1825			i);
1826	}
1827	return 0;
1828}
1829
1830
1831/*
1832 * Set the ring ID. For devices with a single queue, a request
1833 * for all rings is the same as a single ring.
1834 */
1835static int
1836netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1837{
1838	struct netmap_adapter *na = priv->np_na;
1839	int error;
1840
1841	error = netmap_interp_ringid(priv, ringid, flags);
1842	if (error) {
1843		return error;
1844	}
1845
1846	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1847
1848	/* optimization: count the users registered for more than
1849	 * one ring, which are the ones sleeping on the global queue.
1850	 * The default netmap_notify() callback will then
1851	 * avoid signaling the global queue if nobody is using it
1852	 */
1853	if (nm_tx_si_user(priv))
1854		na->tx_si_users++;
1855	if (nm_rx_si_user(priv))
1856		na->rx_si_users++;
1857	return 0;
1858}
1859
1860/*
1861 * possibly move the interface to netmap-mode.
1862 * On success it returns a pointer to the netmap_if, otherwise NULL.
1863 * This must be called with NMG_LOCK held.
1864 *
1865 * The following na callbacks are called in the process:
1866 *
1867 * na->nm_config()			[by netmap_update_config]
1868 * (get current number and size of rings)
1869 *
1870 *  	We have a generic one for linux (netmap_linux_config).
1871 *  	The bwrap has to override this, since it has to forward
1872 *  	the request to the wrapped adapter (netmap_bwrap_config).
1873 *
1874 *    	XXX netmap_if_new calls this again (2014-03-15)
1875 *
1876 * na->nm_krings_create()		[by netmap_if_new]
1877 * (create and init the krings array)
1878 *
1879 * 	One of the following:
1880 *
1881 *	* netmap_hw_krings_create, 			(hw ports)
1882 *		creates the standard layout for the krings
1883 * 		and adds the mbq (used for the host rings).
1884 *
1885 * 	* netmap_vp_krings_create			(VALE ports)
1886 * 		add leases and scratchpads
1887 *
1888 * 	* netmap_pipe_krings_create			(pipes)
1889 * 		create the krings and rings of both ends and
1890 * 		cross-link them
1891 *
1892 *      * netmap_monitor_krings_create 			(monitors)
1893 *      	avoid allocating the mbq
1894 *
1895 *      * netmap_bwrap_krings_create			(bwraps)
1896 *      	create both the bwrap krings array,
1897 *      	the krings array of the wrapped adapter, and
1898 *      	(if needed) the fake array for the host adapter
1899 *
1900 * na->nm_register(, 1)
1901 * (put the adapter in netmap mode)
1902 *
1903 * 	This may be one of the following:
1904 * 	(XXX these should be either all *_register or all *_reg 2014-03-15)
1905 *
1906 * 	* netmap_hw_register				(hw ports)
1907 * 		checks that the ifp is still there, then calls
1908 * 		the hardware specific callback;
1909 *
1910 * 	* netmap_vp_reg					(VALE ports)
1911 *		If the port is connected to a bridge,
1912 *		set the NAF_NETMAP_ON flag under the
1913 *		bridge write lock.
1914 *
1915 *	* netmap_pipe_reg				(pipes)
1916 *		inform the other pipe end that it is no
1917 *		longer responsible for the lifetime of this
1918 *		pipe end
1919 *
1920 *	* netmap_monitor_reg				(monitors)
1921 *		intercept the sync callbacks of the monitored
1922 *		rings
1923 *
1924 *	* netmap_bwrap_register				(bwraps)
1925 *		cross-link the bwrap and hwna rings,
1926 *		forward the request to the hwna, override
1927 *		the hwna notify callback (to get the frames
1928 *		coming from outside go through the bridge).
1929 *
1930 * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
1931 *
1932 */
1933struct netmap_if *
1934netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1935	uint16_t ringid, uint32_t flags, int *err)
1936{
1937	struct netmap_if *nifp = NULL;
1938	int error, need_mem = 0;
1939
1940	NMG_LOCK_ASSERT();
1941	/* ring configuration may have changed, fetch from the card */
1942	netmap_update_config(na);
1943	priv->np_na = na;     /* store the reference */
1944	error = netmap_set_ringid(priv, ringid, flags);
1945	if (error)
1946		goto out;
1947	/* ensure allocators are ready */
1948	need_mem = !netmap_have_memory_locked(priv);
1949	if (need_mem) {
1950		error = netmap_get_memory_locked(priv);
1951		ND("get_memory returned %d", error);
1952		if (error)
1953			goto out;
1954	}
1955	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1956	nifp = netmap_if_new(na);
1957	if (nifp == NULL) { /* allocation failed */
1958		error = ENOMEM;
1959		goto out;
1960	}
1961	na->active_fds++;
1962	if (!nm_netmap_on(na)) {
1963		/* Netmap not active, set the card in netmap mode
1964		 * and make it use the shared buffers.
1965		 */
1966		/* cache the allocator info in the na */
1967		na->na_lut = netmap_mem_get_lut(na->nm_mem);
1968		ND("%p->na_lut == %p", na, na->na_lut);
1969		na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
1970		na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem);
1971		error = na->nm_register(na, 1); /* mode on */
1972		if (error) {
1973			netmap_do_unregif(priv, nifp);
1974			nifp = NULL;
1975		}
1976	}
1977out:
1978	*err = error;
1979	if (error) {
1980		/* we should drop the allocator, but only
1981		 * if we were the ones who grabbed it
1982		 */
1983		if (need_mem)
1984			netmap_drop_memory_locked(priv);
1985		priv->np_na = NULL;
1986	}
1987	if (nifp != NULL) {
1988		/*
1989		 * advertise that the interface is ready by setting np_nifp.
1990		 * The barrier is needed because readers (poll and *SYNC)
1991		 * check for priv->np_nifp != NULL without locking
1992		 */
1993		wmb(); /* make sure previous writes are visible to all CPUs */
1994		priv->np_nifp = nifp;
1995	}
1996	return nifp;
1997}
1998
1999
2000
2001/*
2002 * ioctl(2) support for the "netmap" device.
2003 *
2004 * Following a list of accepted commands:
2005 * - NIOCGINFO
2006 * - SIOCGIFADDR	just for convenience
2007 * - NIOCREGIF
2008 * - NIOCTXSYNC
2009 * - NIOCRXSYNC
2010 *
2011 * Return 0 on success, errno otherwise.
2012 */
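/*
 * Minimal userspace sketch of the sequence served by this handler
 * (illustrative only, error handling omitted; "em0" is just an
 * assumed interface name):
 *
 *	struct nmreq req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_flags = NR_REG_ALL_NIC;
 *	ioctl(fd, NIOCREGIF, &req);	// bind fd to all hw rings of em0
 *	void *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	...
 *	ioctl(fd, NIOCTXSYNC, NULL);	// flush pending transmissions
 */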
2013int
2014netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2015	int fflag, struct thread *td)
2016{
2017	struct netmap_priv_d *priv = NULL;
2018	struct nmreq *nmr = (struct nmreq *) data;
2019	struct netmap_adapter *na = NULL;
2020	int error;
2021	u_int i, qfirst, qlast;
2022	struct netmap_if *nifp;
2023	struct netmap_kring *krings;
2024
2025	(void)dev;	/* UNUSED */
2026	(void)fflag;	/* UNUSED */
2027
2028	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2029		/* truncate name */
2030		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2031		if (nmr->nr_version != NETMAP_API) {
2032			D("API mismatch for %s got %d need %d",
2033				nmr->nr_name,
2034				nmr->nr_version, NETMAP_API);
2035			nmr->nr_version = NETMAP_API;
2036		}
2037		if (nmr->nr_version < NETMAP_MIN_API ||
2038		    nmr->nr_version > NETMAP_MAX_API) {
2039			return EINVAL;
2040		}
2041	}
2042	CURVNET_SET(TD_TO_VNET(td));
2043
2044	error = devfs_get_cdevpriv((void **)&priv);
2045	if (error) {
2046		CURVNET_RESTORE();
2047		/* XXX ENOENT should be impossible, since the priv
2048		 * is now created in the open */
2049		return (error == ENOENT ? ENXIO : error);
2050	}
2051
2052	switch (cmd) {
2053	case NIOCGINFO:		/* return capabilities etc */
2054		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2055			error = netmap_bdg_ctl(nmr, NULL);
2056			break;
2057		}
2058
2059		NMG_LOCK();
2060		do {
2061			/* memsize is always valid */
2062			struct netmap_mem_d *nmd = &nm_mem;
2063			u_int memflags;
2064
2065			if (nmr->nr_name[0] != '\0') {
2066				/* get a refcount */
2067				error = netmap_get_na(nmr, &na, 1 /* create */);
2068				if (error)
2069					break;
2070				nmd = na->nm_mem; /* get memory allocator */
2071			}
2072
2073			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2074				&nmr->nr_arg2);
2075			if (error)
2076				break;
2077			if (na == NULL) /* only memory info */
2078				break;
2079			nmr->nr_offset = 0;
2080			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2081			netmap_update_config(na);
2082			nmr->nr_rx_rings = na->num_rx_rings;
2083			nmr->nr_tx_rings = na->num_tx_rings;
2084			nmr->nr_rx_slots = na->num_rx_desc;
2085			nmr->nr_tx_slots = na->num_tx_desc;
2086			netmap_adapter_put(na);
2087		} while (0);
2088		NMG_UNLOCK();
2089		break;
2090
2091	case NIOCREGIF:
2092		/* possibly attach/detach NIC and VALE switch */
2093		i = nmr->nr_cmd;
2094		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2095				|| i == NETMAP_BDG_VNET_HDR
2096				|| i == NETMAP_BDG_NEWIF
2097				|| i == NETMAP_BDG_DELIF) {
2098			error = netmap_bdg_ctl(nmr, NULL);
2099			break;
2100		} else if (i != 0) {
2101			D("nr_cmd must be 0 not %d", i);
2102			error = EINVAL;
2103			break;
2104		}
2105
2106		/* protect access to priv from concurrent NIOCREGIF */
2107		NMG_LOCK();
2108		do {
2109			u_int memflags;
2110
2111			if (priv->np_na != NULL) {	/* thread already registered */
2112				error = EBUSY;
2113				break;
2114			}
2115			/* find the interface and a reference */
2116			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2117			if (error)
2118				break;
2119			if (NETMAP_OWNED_BY_KERN(na)) {
2120				netmap_adapter_put(na);
2121				error = EBUSY;
2122				break;
2123			}
2124			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
2125			if (!nifp) {    /* reg. failed, release priv and ref */
2126				netmap_adapter_put(na);
2127				priv->np_nifp = NULL;
2128				break;
2129			}
2130			priv->np_td = td; // XXX kqueue, debugging only
2131
2132			/* return the offset of the netmap_if object */
2133			nmr->nr_rx_rings = na->num_rx_rings;
2134			nmr->nr_tx_rings = na->num_tx_rings;
2135			nmr->nr_rx_slots = na->num_rx_desc;
2136			nmr->nr_tx_slots = na->num_tx_desc;
2137			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2138				&nmr->nr_arg2);
2139			if (error) {
2140				netmap_adapter_put(na);
2141				break;
2142			}
2143			if (memflags & NETMAP_MEM_PRIVATE) {
2144				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2145			}
2146			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
2147				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
2148			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
2149				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
2150
2151			if (nmr->nr_arg3) {
2152				D("requested %d extra buffers", nmr->nr_arg3);
2153				nmr->nr_arg3 = netmap_extra_alloc(na,
2154					&nifp->ni_bufs_head, nmr->nr_arg3);
2155				D("got %d extra buffers", nmr->nr_arg3);
2156			}
2157			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2158		} while (0);
2159		NMG_UNLOCK();
2160		break;
2161
2162	case NIOCTXSYNC:
2163	case NIOCRXSYNC:
2164		nifp = priv->np_nifp;
2165
2166		if (nifp == NULL) {
2167			error = ENXIO;
2168			break;
2169		}
2170		mb(); /* make sure following reads are not from cache */
2171
2172		na = priv->np_na;      /* we have a reference */
2173
2174		if (na == NULL) {
2175			D("Internal error: nifp != NULL && na == NULL");
2176			error = ENXIO;
2177			break;
2178		}
2179
2180		if (!nm_netmap_on(na)) {
2181			error = ENXIO;
2182			break;
2183		}
2184
2185		if (cmd == NIOCTXSYNC) {
2186			krings = na->tx_rings;
2187			qfirst = priv->np_txqfirst;
2188			qlast = priv->np_txqlast;
2189		} else {
2190			krings = na->rx_rings;
2191			qfirst = priv->np_rxqfirst;
2192			qlast = priv->np_rxqlast;
2193		}
2194
2195		for (i = qfirst; i < qlast; i++) {
2196			struct netmap_kring *kring = krings + i;
2197			if (nm_kr_tryget(kring)) {
2198				error = EBUSY;
2199				goto out;
2200			}
2201			if (cmd == NIOCTXSYNC) {
2202				if (netmap_verbose & NM_VERB_TXSYNC)
2203					D("pre txsync ring %d cur %d hwcur %d",
2204					    i, kring->ring->cur,
2205					    kring->nr_hwcur);
2206				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2207					netmap_ring_reinit(kring);
2208				} else {
2209					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
2210				}
2211				if (netmap_verbose & NM_VERB_TXSYNC)
2212					D("post txsync ring %d cur %d hwcur %d",
2213					    i, kring->ring->cur,
2214					    kring->nr_hwcur);
2215			} else {
2216				kring->nm_sync(kring, NAF_FORCE_READ);
2217				microtime(&na->rx_rings[i].ring->ts);
2218			}
2219			nm_kr_put(kring);
2220		}
2221
2222		break;
2223
2224	case NIOCCONFIG:
2225		error = netmap_bdg_config(nmr);
2226		break;
2227#ifdef __FreeBSD__
2228	case FIONBIO:
2229	case FIOASYNC:
2230		ND("FIONBIO/FIOASYNC are no-ops");
2231		break;
2232
2233	case BIOCIMMEDIATE:
2234	case BIOCGHDRCMPLT:
2235	case BIOCSHDRCMPLT:
2236	case BIOCSSEESENT:
2237		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2238		break;
2239
2240	default:	/* allow device-specific ioctls */
2241	    {
2242		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2243		if (ifp == NULL) {
2244			error = ENXIO;
2245		} else {
2246			struct socket so;
2247
2248			bzero(&so, sizeof(so));
2249			so.so_vnet = ifp->if_vnet;
2250			// so->so_proto not null.
2251			error = ifioctl(&so, cmd, data, td);
2252			if_rele(ifp);
2253		}
2254		break;
2255	    }
2256
2257#else /* linux */
2258	default:
2259		error = EOPNOTSUPP;
2260#endif /* linux */
2261	}
2262out:
2263
2264	CURVNET_RESTORE();
2265	return (error);
2266}
2267
2268
2269/*
2270 * select(2) and poll(2) handlers for the "netmap" device.
2271 *
2272 * Can be called for one or more queues.
2273 * Return the event mask corresponding to ready events.
2274 * If there are no ready events, do a selrecord on either individual
2275 * selinfo or on the global one.
2276 * Device-dependent parts (locking and sync of tx/rx rings)
2277 * are done through callbacks.
2278 *
2279 * On Linux, 'dev' is really pwait, the poll table, and 'td' is a struct file *.
2280 * The first argument is remapped to pwait because selrecord() uses that name
2281 * as a hidden argument.
2282 */
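/*
 * Typical userspace counterpart of this handler (illustrative sketch,
 * assuming the descriptor has already been bound with NIOCREGIF and
 * the shared memory mmap()ed):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, 1000);
 *		if (pfd.revents & POLLIN) {
 *			// consume slots between ring->head and ring->tail,
 *			// then advance ring->head/ring->cur; the next
 *			// poll() (or NIOCRXSYNC) tells the kernel which
 *			// slots have been released.
 *		}
 *	}
 */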
2283int
2284netmap_poll(struct cdev *dev, int events, struct thread *td)
2285{
2286	struct netmap_priv_d *priv = NULL;
2287	struct netmap_adapter *na;
2288	struct netmap_kring *kring;
2289	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2290	struct mbq q;		/* packets from hw queues to host stack */
2291	void *pwait = dev;	/* linux compatibility */
2292	int is_kevent = 0;
2293
2294	/*
2295	 * In order to avoid nested locks, we need to "double check"
2296	 * txsync and rxsync if we decide to do a selrecord().
2297	 * retry_tx (and retry_rx, later) prevent looping forever.
2298	 */
2299	int retry_tx = 1, retry_rx = 1;
2300
2301	(void)pwait;
2302	mbq_init(&q);
2303
2304	/*
2305	 * XXX kevent has curthread->td_fpop == NULL,
2306	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2307	 * priv as the first argument, which is also useful to avoid
2308	 * the selrecord() calls, which are not necessary in that case.
2309	 */
2310	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2311		is_kevent = 1;
2312		if (netmap_verbose)
2313			D("called from kevent");
2314		priv = (struct netmap_priv_d *)dev;
2315	}
2316	if (priv == NULL)
2317		return POLLERR;
2318
2319	if (priv->np_nifp == NULL) {
2320		D("No if registered");
2321		return POLLERR;
2322	}
2323	rmb(); /* make sure following reads are not from cache */
2324
2325	na = priv->np_na;
2326
2327	if (!nm_netmap_on(na))
2328		return POLLERR;
2329
2330	if (netmap_verbose & 0x8000)
2331		D("device %s events 0x%x", na->name, events);
2332	want_tx = events & (POLLOUT | POLLWRNORM);
2333	want_rx = events & (POLLIN | POLLRDNORM);
2334
2335
2336	/*
2337	 * check_all_{tx|rx} are set if the card has more than one queue AND
2338	 * the file descriptor is bound to all of them. If so, we sleep on
2339	 * the "global" selinfo, otherwise we sleep on individual selinfo
2340	 * (FreeBSD only allows two selinfo's per file descriptor).
2341	 * The interrupt routine in the driver wakes up one or the other
2342	 * (or both) depending on which clients are active.
2343	 *
2344	 * rxsync() is only called if we run out of buffers on a POLLIN.
2345	 * txsync() is called if we run out of buffers on POLLOUT, or
2346	 * there are pending packets to send. The latter can be disabled
2347	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2348	 */
2349	check_all_tx = nm_tx_si_user(priv);
2350	check_all_rx = nm_rx_si_user(priv);
2351
2352	/*
2353	 * We start with a lock free round which is cheap if we have
2354	 * slots available. If this fails, then lock and call the sync
2355	 * routines.
2356	 */
2357	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
2358		kring = &na->rx_rings[i];
2359		/* XXX compare ring->cur and kring->tail */
2360		if (!nm_ring_empty(kring->ring)) {
2361			revents |= want_rx;
2362			want_rx = 0;	/* also breaks the loop */
2363		}
2364	}
2365	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
2366		kring = &na->tx_rings[i];
2367		/* XXX compare ring->cur and kring->tail */
2368		if (!nm_ring_empty(kring->ring)) {
2369			revents |= want_tx;
2370			want_tx = 0;	/* also breaks the loop */
2371		}
2372	}
2373
2374	/*
2375	 * If we want to push packets out (priv->np_txpoll) or
2376	 * want_tx is still set, we must issue txsync calls
2377	 * (on all rings, to avoid that the tx rings stall).
2378	 * XXX should also check cur != hwcur on the tx rings.
2379	 * Fortunately, normal tx mode has np_txpoll set.
2380	 */
2381	if (priv->np_txpoll || want_tx) {
2382		/*
2383		 * The first round checks if anyone is ready, if not
2384		 * do a selrecord and another round to handle races.
2385		 * want_tx goes to 0 if any space is found, and is
2386		 * used to skip rings with no pending transmissions.
2387		 */
2388flush_tx:
2389		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2390			int found = 0;
2391
2392			kring = &na->tx_rings[i];
2393			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2394				continue;
2395			/* only one thread does txsync */
2396			if (nm_kr_tryget(kring)) {
2397				/* either busy or stopped
2398				 * XXX if the ring is stopped, sleeping would
2399				 * be better. In current code, however, we only
2400				 * stop the rings for brief intervals (2014-03-14)
2401				 */
2402				if (netmap_verbose)
2403					RD(2, "%p lost race on txring %d, ok",
2404					    priv, i);
2405				continue;
2406			}
2407			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2408				netmap_ring_reinit(kring);
2409				revents |= POLLERR;
2410			} else {
2411				if (kring->nm_sync(kring, 0))
2412					revents |= POLLERR;
2413			}
2414
2415			/*
2416			 * If we found new slots, notify potential
2417			 * listeners on the same ring.
2418			 * Since we just did a txsync, look at the copies
2419			 * of cur,tail in the kring.
2420			 */
2421			found = kring->rcur != kring->rtail;
2422			nm_kr_put(kring);
2423			if (found) { /* notify other listeners */
2424				revents |= want_tx;
2425				want_tx = 0;
2426				na->nm_notify(na, i, NR_TX, 0);
2427			}
2428		}
2429		if (want_tx && retry_tx && !is_kevent) {
2430			OS_selrecord(td, check_all_tx ?
2431			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2432			retry_tx = 0;
2433			goto flush_tx;
2434		}
2435	}
2436
2437	/*
2438	 * If want_rx is still set scan receive rings.
2439	 * Do it on all rings because otherwise we starve.
2440	 */
2441	if (want_rx) {
2442		int send_down = 0; /* transparent mode */
2443		/* two rounds here for race avoidance */
2444do_retry_rx:
2445		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2446			int found = 0;
2447
2448			kring = &na->rx_rings[i];
2449
2450			if (nm_kr_tryget(kring)) {
2451				if (netmap_verbose)
2452					RD(2, "%p lost race on rxring %d, ok",
2453					    priv, i);
2454				continue;
2455			}
2456
2457			/*
2458			 * transparent mode support: collect packets
2459			 * from the rxring(s).
2460			 * XXX NR_FORWARD should only be read on
2461			 * physical or NIC ports
2462			 */
2463			if (netmap_fwd || (kring->ring->flags & NR_FORWARD)) {
2464				ND(10, "forwarding some buffers up %d to %d",
2465				    kring->nr_hwcur, kring->ring->cur);
2466				netmap_grab_packets(kring, &q, netmap_fwd);
2467			}
2468
2469			if (kring->nm_sync(kring, 0))
2470				revents |= POLLERR;
2471			if (netmap_no_timestamp == 0 ||
2472					kring->ring->flags & NR_TIMESTAMP) {
2473				microtime(&kring->ring->ts);
2474			}
2475			/* after an rxsync we can use kring->rcur, rtail */
2476			found = kring->rcur != kring->rtail;
2477			nm_kr_put(kring);
2478			if (found) {
2479				revents |= want_rx;
2480				retry_rx = 0;
2481				na->nm_notify(na, i, NR_RX, 0);
2482			}
2483		}
2484
2485		/* transparent mode XXX only during first pass ? */
2486		if (na->na_flags & NAF_HOST_RINGS) {
2487			kring = &na->rx_rings[na->num_rx_rings];
2488			if (check_all_rx
2489			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2490				/* XXX fix to use kring fields */
2491				if (nm_ring_empty(kring->ring))
2492					send_down = netmap_rxsync_from_host(na, td, dev);
2493				if (!nm_ring_empty(kring->ring))
2494					revents |= want_rx;
2495			}
2496		}
2497
2498		if (retry_rx && !is_kevent)
2499			OS_selrecord(td, check_all_rx ?
2500			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2501		if (send_down > 0 || retry_rx) {
2502			retry_rx = 0;
2503			if (send_down)
2504				goto flush_tx; /* and retry_rx */
2505			else
2506				goto do_retry_rx;
2507		}
2508	}
2509
2510	/*
2511	 * Transparent mode: marked bufs on rx rings between
2512	 * kring->nr_hwcur and ring->head
2513	 * are passed to the other endpoint.
2514	 *
2515	 * In this mode we also scan the sw rxring, which in
2516	 * turn passes packets up.
2517	 *
2518	 * XXX Transparent mode at the moment requires binding all
2519	 * rings to a single file descriptor.
2520	 */
2521
2522	if (q.head && na->ifp != NULL)
2523		netmap_send_up(na->ifp, &q);
2524
2525	return (revents);
2526}
2527
2528
2529/*-------------------- driver support routines -------------------*/
2530
2531static int netmap_hw_krings_create(struct netmap_adapter *);
2532
2533/* default notify callback */
2534static int
2535netmap_notify(struct netmap_adapter *na, u_int n_ring,
2536	enum txrx tx, int flags)
2537{
2538	struct netmap_kring *kring;
2539
2540	if (tx == NR_TX) {
2541		kring = na->tx_rings + n_ring;
2542		OS_selwakeup(&kring->si, PI_NET);
2543		/* optimization: avoid a wake up on the global
2544		 * queue if nobody has registered for more
2545		 * than one ring
2546		 */
2547		if (na->tx_si_users > 0)
2548			OS_selwakeup(&na->tx_si, PI_NET);
2549	} else {
2550		kring = na->rx_rings + n_ring;
2551		OS_selwakeup(&kring->si, PI_NET);
2552		/* optimization: same as above */
2553		if (na->rx_si_users > 0)
2554			OS_selwakeup(&na->rx_si, PI_NET);
2555	}
2556	return 0;
2557}
2558
2559
2560/* called by all routines that create netmap_adapters.
2561 * Attach na to the ifp (if any) and provide defaults
2562 * for optional callbacks. Defaults assume that we
2563 * are creating a hardware netmap_adapter.
2564 */
2565int
2566netmap_attach_common(struct netmap_adapter *na)
2567{
2568	struct ifnet *ifp = na->ifp;
2569
2570	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2571		D("%s: invalid rings tx %d rx %d",
2572			na->name, na->num_tx_rings, na->num_rx_rings);
2573		return EINVAL;
2574	}
2575	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2576	 * pipes, monitors). For bwrap we actually have a non-null ifp for
2577	 * use by the external modules, but that is set after this
2578	 * function has been called.
2579	 * XXX this is ugly, maybe split this function in two (2014-03-14)
2580	 */
2581	if (ifp != NULL) {
2582		WNA(ifp) = na;
2583
2584	/* the following is only needed for na that use the host port.
2585	 * XXX do we have something similar for linux ?
2586	 */
2587#ifdef __FreeBSD__
2588		na->if_input = ifp->if_input; /* for netmap_send_up */
2589#endif /* __FreeBSD__ */
2590
2591		NETMAP_SET_CAPABLE(ifp);
2592	}
2593	if (na->nm_krings_create == NULL) {
2594		/* we assume that we have been called by a driver,
2595		 * since other port types all provide their own
2596		 * nm_krings_create
2597		 */
2598		na->nm_krings_create = netmap_hw_krings_create;
2599		na->nm_krings_delete = netmap_hw_krings_delete;
2600	}
2601	if (na->nm_notify == NULL)
2602		na->nm_notify = netmap_notify;
2603	na->active_fds = 0;
2604
2605	if (na->nm_mem == NULL)
2606		/* use the global allocator */
2607		na->nm_mem = &nm_mem;
2608	if (na->nm_bdg_attach == NULL)
2609		/* no special nm_bdg_attach callback. On VALE
2610		 * attach, we need to interpose a bwrap
2611		 */
2612		na->nm_bdg_attach = netmap_bwrap_attach;
2613	return 0;
2614}
2615
2616
2617/* standard cleanup, called by all destructors */
2618void
2619netmap_detach_common(struct netmap_adapter *na)
2620{
2621	if (na->ifp != NULL)
2622		WNA(na->ifp) = NULL; /* XXX do we need this? */
2623
2624	if (na->tx_rings) { /* XXX should not happen */
2625		D("freeing leftover tx_rings");
2626		na->nm_krings_delete(na);
2627	}
2628	netmap_pipe_dealloc(na);
2629	if (na->na_flags & NAF_MEM_OWNER)
2630		netmap_mem_private_delete(na->nm_mem);
2631	bzero(na, sizeof(*na));
2632	free(na, M_DEVBUF);
2633}
2634
2635/* Wrapper for the register callback provided by hardware drivers.
2636 * na->ifp == NULL means that the driver module has been
2637 * unloaded, so we cannot call into it.
2638 * Note that module unloading, in our patched linux drivers,
2639 * happens under NMG_LOCK and after having stopped all the
2640 * nic rings (see netmap_detach). This provides sufficient
2641 * protection for the other driver-provided callbacks
2642 * (i.e., nm_config and nm_*xsync), which therefore don't need
2643 * to be wrapped.
2644 */
2645static int
2646netmap_hw_register(struct netmap_adapter *na, int onoff)
2647{
2648	struct netmap_hw_adapter *hwna =
2649		(struct netmap_hw_adapter*)na;
2650
2651	if (na->ifp == NULL)
2652		return onoff ? ENXIO : 0;
2653
2654	return hwna->nm_hw_register(na, onoff);
2655}
2656
2657
2658/*
2659 * Initialize a ``netmap_adapter`` object created by driver on attach.
2660 * We allocate a block of memory with room for a struct netmap_adapter
2661 * plus two sets of N+2 struct netmap_kring (where N is the number
2662 * of hardware rings):
2663 * krings	0..N-1	are for the hardware queues.
2664 * kring	N	is for the host stack queue
2665 * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2666 * Return 0 on success, ENOMEM otherwise.
2667 */
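/*
 * Sketch of how a driver attach routine typically fills the argument
 * (illustrative only; "sc", foo_netmap_txsync() and friends are
 * assumed driver symbols, not part of netmap):
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na);
 */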
2668int
2669netmap_attach(struct netmap_adapter *arg)
2670{
2671	struct netmap_hw_adapter *hwna = NULL;
2672	// XXX when is arg == NULL ?
2673	struct ifnet *ifp = arg ? arg->ifp : NULL;
2674
2675	if (arg == NULL || ifp == NULL)
2676		goto fail;
2677	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2678	if (hwna == NULL)
2679		goto fail;
2680	hwna->up = *arg;
2681	hwna->up.na_flags |= NAF_HOST_RINGS;
2682	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2683	hwna->nm_hw_register = hwna->up.nm_register;
2684	hwna->up.nm_register = netmap_hw_register;
2685	if (netmap_attach_common(&hwna->up)) {
2686		free(hwna, M_DEVBUF);
2687		goto fail;
2688	}
2689	netmap_adapter_get(&hwna->up);
2690
2691#ifdef linux
2692	if (ifp->netdev_ops) {
2693		/* prepare a clone of the netdev ops */
2694#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2695		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2696#else
2697		hwna->nm_ndo = *ifp->netdev_ops;
2698#endif
2699	}
2700	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2701	if (ifp->ethtool_ops) {
2702		hwna->nm_eto = *ifp->ethtool_ops;
2703	}
2704	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2705#ifdef ETHTOOL_SCHANNELS
2706	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2707#endif
2708	if (arg->nm_config == NULL) {
2709		hwna->up.nm_config = netmap_linux_config;
2710	}
2711#endif /* linux */
2712
2713#ifdef __FreeBSD__
2714	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
2715	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2716	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
2717#else
2718	D("success for %s tx %d/%d rx %d/%d queues/slots",
2719		hwna->up.name,
2720		hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2721		hwna->up.num_rx_rings, hwna->up.num_rx_desc
2722		);
2723#endif
2724	return 0;
2725
2726fail:
2727	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2728	if (ifp)
2729		netmap_detach(ifp);
2730	return (hwna ? EINVAL : ENOMEM);
2731}
2732
2733
2734void
2735NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2736{
2737	if (!na) {
2738		return;
2739	}
2740
2741	refcount_acquire(&na->na_refcount);
2742}
2743
2744
2745/* returns 1 iff the netmap_adapter is destroyed */
2746int
2747NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2748{
2749	if (!na)
2750		return 1;
2751
2752	if (!refcount_release(&na->na_refcount))
2753		return 0;
2754
2755	if (na->nm_dtor)
2756		na->nm_dtor(na);
2757
2758	netmap_detach_common(na);
2759
2760	return 1;
2761}
2762
2763/* nm_krings_create callback for all hardware native adapters */
2764int
2765netmap_hw_krings_create(struct netmap_adapter *na)
2766{
2767	int ret = netmap_krings_create(na, 0);
2768	if (ret == 0) {
2769		/* initialize the mbq for the sw rx ring */
2770		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2771		ND("initialized sw rx queue %d", na->num_rx_rings);
2772	}
2773	return ret;
2774}
2775
2776
2777
2778/*
2779 * Called on module unload by the netmap-enabled drivers
2780 */
2781void
2782netmap_detach(struct ifnet *ifp)
2783{
2784	struct netmap_adapter *na = NA(ifp);
2785
2786	if (!na)
2787		return;
2788
2789	NMG_LOCK();
2790	netmap_disable_all_rings(ifp);
2791	if (!netmap_adapter_put(na)) {
2792		/* someone is still using the adapter,
2793		 * tell them that the interface is gone
2794		 */
2795		na->ifp = NULL;
2796		// XXX also clear NAF_NATIVE_ON ?
2797		na->na_flags &= ~NAF_NETMAP_ON;
2798		/* give them a chance to notice */
2799		netmap_enable_all_rings(ifp);
2800	}
2801	NMG_UNLOCK();
2802}
2803
2804
2805/*
2806 * Intercept packets from the network stack and pass them
2807 * to netmap as incoming packets on the 'software' ring.
2808 *
2809 * We only store packets in a bounded mbq and then copy them
2810 * in the relevant rxsync routine.
2811 *
2812 * We rely on the OS to make sure that the ifp and na do not go
2813 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2814 * In nm_register() or whenever there is a reinitialization,
2815 * we make sure to make the mode change visible here.
2816 */
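/*
 * Illustrative sketch (FreeBSD-style, assumed names): when the adapter
 * enters netmap mode the generic code typically redirects the stack
 * output path to this function, e.g.
 *
 *	na->if_transmit = ifp->if_transmit;	// save the original
 *	ifp->if_transmit = netmap_transmit;	// stack output -> host rx ring
 *
 * so every mbuf the stack would have transmitted is queued on the host
 * ("software") rx ring and delivered to the netmap client on the next
 * rxsync on that ring.
 */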
2817int
2818netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2819{
2820	struct netmap_adapter *na = NA(ifp);
2821	struct netmap_kring *kring;
2822	u_int len = MBUF_LEN(m);
2823	u_int error = ENOBUFS;
2824	struct mbq *q;
2825	int space;
2826
2827	// XXX [Linux] we do not need this lock
2828	// if we follow the down/configure/up protocol -gl
2829	// mtx_lock(&na->core_lock);
2830
2831	if (!nm_netmap_on(na)) {
2832		D("%s not in netmap mode anymore", na->name);
2833		error = ENXIO;
2834		goto done;
2835	}
2836
2837	kring = &na->rx_rings[na->num_rx_rings];
2838	q = &kring->rx_queue;
2839
2840	// XXX reconsider long packets if we handle fragments
2841	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2842		D("%s from_host, drop packet size %d > %d", na->name,
2843			len, NETMAP_BUF_SIZE(na));
2844		goto done;
2845	}
2846
2847	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2848	 * and maybe other instances of netmap_transmit (the latter
2849	 * not possible on Linux).
2850	 * Also avoid overflowing the queue.
2851	 */
2852	mbq_lock(q);
2853
2854	space = kring->nr_hwtail - kring->nr_hwcur;
2855	if (space < 0)
2856		space += kring->nkr_num_slots;
2857	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2858		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2859			na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2860			len, m);
2861	} else {
2862		mbq_enqueue(q, m);
2863		ND(10, "%s %d bufs in queue len %d m %p",
2864			na->name, mbq_len(q), len, m);
2865		/* notify outside the lock */
2866		m = NULL;
2867		error = 0;
2868	}
2869	mbq_unlock(q);
2870
2871done:
2872	if (m)
2873		m_freem(m);
2874	/* unconditionally wake up listeners */
2875	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2876	/* this is normally netmap_notify(), but for nics
2877	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2878	 * that possibly forwards the frames through the switch
2879	 */
2880
2881	return (error);
2882}
2883
2884
2885/*
2886 * netmap_reset() is called by the driver routines when reinitializing
2887 * a ring. The driver is in charge of locking to protect the kring.
2888 * If native netmap mode is not set just return NULL.
2889 */
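/*
 * Illustrative driver-side usage (assumed names, error handling
 * omitted): a driver re-initializing its tx ring number "ring_nr"
 * would do something like
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, ring_nr, 0);
 *
 *	if (slot != NULL) {
 *		// netmap mode: program the descriptors with the netmap
 *		// buffers identified by slot[...].buf_idx instead of
 *		// allocating fresh mbufs
 *	}
 */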
2890struct netmap_slot *
2891netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2892	u_int new_cur)
2893{
2894	struct netmap_kring *kring;
2895	int new_hwofs, lim;
2896
2897	if (!nm_native_on(na)) {
2898		ND("interface not in native netmap mode");
2899		return NULL;	/* nothing to reinitialize */
2900	}
2901
2902	/* XXX note- in the new scheme, we are not guaranteed to be
2903	 * under lock (e.g. when called on a device reset).
2904	 * In this case, we should set a flag and do not trust too
2905	 * much the values. In practice: TODO
2906	 * - set a RESET flag somewhere in the kring
2907	 * - do the processing in a conservative way
2908	 * - let the *sync() fixup at the end.
2909	 */
2910	if (tx == NR_TX) {
2911		if (n >= na->num_tx_rings)
2912			return NULL;
2913		kring = na->tx_rings + n;
2914		// XXX check whether we should use hwcur or rcur
2915		new_hwofs = kring->nr_hwcur - new_cur;
2916	} else {
2917		if (n >= na->num_rx_rings)
2918			return NULL;
2919		kring = na->rx_rings + n;
2920		new_hwofs = kring->nr_hwtail - new_cur;
2921	}
2922	lim = kring->nkr_num_slots - 1;
2923	if (new_hwofs > lim)
2924		new_hwofs -= lim + 1;
2925
2926	/* Always set the new offset value and realign the ring. */
2927	if (netmap_verbose)
2928	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2929		na->name,
2930		tx == NR_TX ? "TX" : "RX", n,
2931		kring->nkr_hwofs, new_hwofs,
2932		kring->nr_hwtail,
2933		tx == NR_TX ? lim : kring->nr_hwtail);
2934	kring->nkr_hwofs = new_hwofs;
2935	if (tx == NR_TX) {
2936		kring->nr_hwtail = kring->nr_hwcur + lim;
2937		if (kring->nr_hwtail > lim)
2938			kring->nr_hwtail -= lim + 1;
2939	}
2940
2941#if 0 // def linux
2942	/* XXX check that the mappings are correct */
2943	/* need ring_nr, adapter->pdev, direction */
2944	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2945	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2946		D("error mapping rx netmap buffer %d", i);
2947		// XXX fix error handling
2948	}
2949
2950#endif /* linux */
2951	/*
2952	 * Wakeup on the individual and global selwait
2953	 * We do the wakeup here, but the ring is not yet reconfigured.
2954	 * However, we are under lock so there are no races.
2955	 */
2956	na->nm_notify(na, n, tx, 0);
2957	return kring->ring->slot;
2958}
2959
2960
2961/*
2962 * Dispatch rx/tx interrupts to the netmap rings.
2963 *
2964 * "work_done" is non-null on the RX path, NULL for the TX path.
2965 * We rely on the OS to make sure that there is only one active
2966 * instance per queue, and that there is appropriate locking.
2967 *
2968 * The 'notify' routine depends on what the ring is attached to.
2969 * - for a netmap file descriptor, do a selwakeup on the individual
2970 *   waitqueue, plus one on the global one if needed
2971 *   (see netmap_notify)
2972 * - for a nic connected to a switch, call the proper forwarding routine
2973 *   (see netmap_bwrap_intr_notify)
2974 */
2975void
2976netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2977{
2978	struct netmap_adapter *na = NA(ifp);
2979	struct netmap_kring *kring;
2980
2981	q &= NETMAP_RING_MASK;
2982
2983	if (netmap_verbose) {
2984		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2985	}
2986
2987	if (work_done) { /* RX path */
2988		if (q >= na->num_rx_rings)
2989			return;	// not a physical queue
2990		kring = na->rx_rings + q;
2991		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2992		na->nm_notify(na, q, NR_RX, 0);
2993		*work_done = 1; /* do not fire napi again */
2994	} else { /* TX path */
2995		if (q >= na->num_tx_rings)
2996			return;	// not a physical queue
2997		kring = na->tx_rings + q;
2998		na->nm_notify(na, q, NR_TX, 0);
2999	}
3000}
3001
3002
3003/*
3004 * Default functions to handle rx/tx interrupts from a physical device.
3005 * "work_done" is non-null on the RX path, NULL for the TX path.
3006 *
3007 * If the card is not in netmap mode, simply return 0,
3008 * so that the caller proceeds with regular processing.
3009 * Otherwise call netmap_common_irq() and return 1.
3010 *
3011 * If the card is connected to a netmap file descriptor,
3012 * do a selwakeup on the individual queue, plus one on the global one
3013 * if needed (multiqueue card _and_ there are multiqueue listeners),
3014 * and return 1.
3015 *
3016 * Finally, if called on rx from an interface connected to a switch,
3017 * calls the proper forwarding routine, and return 1.
3018 */
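/*
 * Illustrative driver-side usage (assumed names): at the top of its rx
 * interrupt (or napi) handler a netmap-aware driver does
 *
 *	u_int work_done;
 *
 *	if (netmap_rx_irq(adapter->ifp, ring_nr, &work_done))
 *		return;		// netmap clients have been notified
 *	// ... regular mbuf-based processing otherwise ...
 *
 * and, symmetrically, netmap_tx_irq(ifp, ring_nr) on the tx path.
 */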
3019int
3020netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3021{
3022	struct netmap_adapter *na = NA(ifp);
3023
3024	/*
3025	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3026	 * we still use the regular driver even though the previous
3027	 * check fails. It is unclear whether we should use
3028	 * nm_native_on() here.
3029	 */
3030	if (!nm_netmap_on(na))
3031		return 0;
3032
3033	if (na->na_flags & NAF_SKIP_INTR) {
3034		ND("use regular interrupt");
3035		return 0;
3036	}
3037
3038	netmap_common_irq(ifp, q, work_done);
3039	return 1;
3040}
3041
3042
3043/*
3044 * Module loader and unloader
3045 *
3046 * netmap_init() creates the /dev/netmap device and initializes
3047 * all global variables. Returns 0 on success, errno on failure
3048 * (which in practice should not happen).
3049 *
3050 * netmap_fini() destroys everything.
3051 */
3052
3053static struct cdev *netmap_dev; /* /dev/netmap character device. */
3054extern struct cdevsw netmap_cdevsw;
3055
3056
3057void
3058netmap_fini(void)
3059{
3060	// XXX destroy_bridges() ?
3061	if (netmap_dev)
3062		destroy_dev(netmap_dev);
3063	netmap_mem_fini();
3064	NMG_LOCK_DESTROY();
3065	printf("netmap: unloaded module.\n");
3066}
3067
3068
3069int
3070netmap_init(void)
3071{
3072	int error;
3073
3074	NMG_LOCK_INIT();
3075
3076	error = netmap_mem_init();
3077	if (error != 0)
3078		goto fail;
3079	/*
3080	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3081	 * when the module is compiled in.
3082	 * XXX could use make_dev_credv() to get error number
3083	 */
3084	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3085		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3086			      "netmap");
3087	if (!netmap_dev)
3088		goto fail;
3089
3090	netmap_init_bridges();
3091#ifdef __FreeBSD__
3092	nm_vi_init_index();
3093#endif
3094	printf("netmap: loaded module\n");
3095	return (0);
3096fail:
3097	netmap_fini();
3098	return (EINVAL); /* may be incorrect */
3099}
3100