netmap.c revision 343831
1/*
2 * Copyright (C) 2011-2014 Matteo Landi
3 * Copyright (C) 2011-2016 Luigi Rizzo
4 * Copyright (C) 2011-2016 Giuseppe Lettieri
5 * Copyright (C) 2011-2016 Vincenzo Maffione
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *   1. Redistributions of source code must retain the above copyright
12 *      notice, this list of conditions and the following disclaimer.
13 *   2. Redistributions in binary form must reproduce the above copyright
14 *      notice, this list of conditions and the following disclaimer in the
15 *      documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30
31/*
32 * $FreeBSD: stable/11/sys/dev/netmap/netmap.c 343831 2019-02-06 09:38:44Z vmaffione $
33 *
34 * This module supports memory mapped access to network devices,
35 * see netmap(4).
36 *
37 * The module uses a large, memory pool allocated by the kernel
38 * and accessible as mmapped memory by multiple userspace threads/processes.
39 * The memory pool contains packet buffers and "netmap rings",
40 * i.e. user-accessible copies of the interface's queues.
41 *
42 * Access to the network card works like this (a usage sketch follows the list):
43 * 1. a process/thread issues one or more open() on /dev/netmap, to create
44 *    a select()able file descriptor on which events are reported.
45 * 2. on each descriptor, the process issues an ioctl() to identify
46 *    the interface that should report events to the file descriptor.
47 * 3. on each descriptor, the process issues an mmap() request to
48 *    map the shared memory region within the process' address space.
49 *    The list of interesting queues is indicated by a location in
50 *    the shared memory region.
51 * 4. using the functions in the netmap(4) userspace API, a process
52 *    can look up the occupation state of a queue, access memory buffers,
53 *    and retrieve received packets or enqueue packets to transmit.
54 * 5. using some ioctl()s the process can synchronize the userspace view
55 *    of the queue with the actual status in the kernel. This includes both
56 *    receiving the notification of new packets, and transmitting new
57 *    packets on the output interface.
58 * 6. select() or poll() can be used to wait for events on individual
59 *    transmit or receive queues (or all queues for a given interface).
60 *
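 *
 *    A minimal userspace sketch of the above (illustrative only, not part
 *    of this module; the request fields and helper macros are those
 *    documented in netmap(4) and net/netmap_user.h):
 *
 *	fd = open("/dev/netmap", O_RDWR);
 *	bzero(&nmr, sizeof(nmr));
 *	strlcpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	nmr.nr_version = NETMAP_API;
 *	ioctl(fd, NIOCREGIF, &nmr);			(step 2)
 *	mem = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *		MAP_SHARED, fd, 0);			(step 3)
 *	nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	txring = NETMAP_TXRING(nifp, 0);		(step 4)
 *	... fill slots, advance txring->head and txring->cur ...
 *	ioctl(fd, NIOCTXSYNC, NULL);			(step 5)
 *	poll(&pfd, 1, -1);				(step 6)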
61
62		SYNCHRONIZATION (USER)
63
64The netmap rings and data structures may be shared among multiple
65user threads or even independent processes.
66Any synchronization among those threads/processes is delegated
67to the threads themselves. Only one thread at a time can be in
68a system call on the same netmap ring. The OS does not enforce
69this and only guarantees against system crashes in case of
70invalid usage.
71
72		LOCKING (INTERNAL)
73
74Within the kernel, access to the netmap rings is protected as follows:
75
76- a spinlock on each ring, to handle producer/consumer races on
77  RX rings attached to the host stack (against multiple host
78  threads writing from the host stack to the same ring),
79  and on 'destination' rings attached to a VALE switch
80  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
81  protecting multiple active senders for the same destination
82
83- an atomic variable to guarantee that there is at most one
84  instance of *_*xsync() on the ring at any time.
85  For rings connected to user file
86  descriptors, an atomic_test_and_set() protects this, and the
87  lock on the ring is not actually used.
88  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
89  is also used to prevent multiple executions (the driver might indeed
90  already guarantee this).
91  For NIC TX rings connected to a VALE switch, the lock arbitrates
92  access to the queue (both when allocating buffers and when pushing
93  them out).
94
95- *xsync() should be protected against initializations of the card.
96  On FreeBSD most devices have the reset routine protected by
97  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
98  the RING protection on rx_reset(); this should be added.
99
100  On linux there is an external lock on the tx path, which probably
101  also arbitrates access to the reset routine. XXX to be revised
102
103- a per-interface core_lock protecting access from the host stack
104  while interfaces may be detached from netmap mode.
105  XXX there should be no need for this lock if we detach the interfaces
106  only while they are down.
107
108
109--- VALE SWITCH ---
110
111NMG_LOCK() serializes all modifications to switches and ports.
112A switch cannot be deleted until all ports are gone.
113
114For each switch, an SX lock (RWlock on linux) protects
115deletion of ports. When configuring a new port or deleting an existing one, the
116lock is acquired in exclusive mode (after holding NMG_LOCK).
117When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
118The lock is held throughout the entire forwarding cycle,
119during which the thread may incur a page fault.
120Hence it is important that sleepable shared locks are used.
121
122On the rx ring, the per-port lock is grabbed initially to reserve
123a number of slots in the ring, then the lock is released,
124packets are copied from source to destination, and then
125the lock is acquired again and the receive ring is updated.
126(A similar thing is done on the tx ring for NIC and host stack
127ports attached to the switch)
128
129 */
130
131
132/* --- internals ----
133 *
134 * Roadmap to the code that implements the above.
135 *
136 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
137 * >    a select()able file descriptor on which events are reported.
138 *
139 *  	Internally, we allocate a netmap_priv_d structure, that will be
140 *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
141 *  	structure for each open().
142 *
143 *      os-specific:
144 *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
145 *  	    linux:   see linux_netmap_open() (netmap_linux.c)
146 *
147 * > 2. on each descriptor, the process issues an ioctl() to identify
148 * >    the interface that should report events to the file descriptor.
149 *
150 * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
151 * 	Most important things happen in netmap_get_na() and
152 * 	netmap_do_regif(), called from there. Additional details can be
153 * 	found in the comments above those functions.
154 *
155 * 	In all cases, this action creates/takes-a-reference-to a
156 * 	netmap_*_adapter describing the port, and allocates a netmap_if
157 * 	and all necessary netmap rings, filling them with netmap buffers.
158 *
159 *      In this phase, the sync callbacks for each ring are set (these are used
160 *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
161 *      The adapter creation/initialization code puts them in the
162 * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
163 * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
164 * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
165 * 	actually call netmap_krings_create() to perform this and the other
166 * 	common stuff. netmap_krings_create() also takes care of the host rings,
167 * 	if needed, by setting their sync callbacks appropriately.
168 *
169 * 	Additional actions depend on the kind of netmap_adapter that has been
170 * 	registered:
171 *
172 * 	- netmap_hw_adapter:  	     [netmap.c]
173 * 	     This is a system netdev/ifp with native netmap support.
174 * 	     The ifp is detached from the host stack by redirecting:
175 * 	       - transmissions (from the network stack) to netmap_transmit()
176 * 	       - receive notifications to the nm_notify() callback for
177 * 	         this adapter. The callback is normally netmap_notify(), unless
178 * 	         the ifp is attached to a bridge using bwrap, in which case it
179 * 	         is netmap_bwrap_intr_notify().
180 *
181 * 	- netmap_generic_adapter:      [netmap_generic.c]
182 * 	      A system netdev/ifp without native netmap support.
183 *
184 * 	(the decision about native/non native support is taken in
185 * 	 netmap_get_hw_na(), called by netmap_get_na())
186 *
187 * 	- netmap_vp_adapter 		[netmap_vale.c]
188 * 	      Returned by netmap_get_bdg_na().
189 * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
190 * 	      are created on the fly if they don't already exist, and are
191 * 	      always attached to a bridge.
192 * 	      Persistent VALE ports must be created separately, and
193 * 	      then attached like normal NICs. The NIOCREGIF we are examining
194 * 	      will find them only if they had previously been created and
195 * 	      attached (see VALE_CTL below).
196 *
197 * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
198 * 	      Returned by netmap_get_pipe_na().
199 * 	      Both pipe ends are created, if they don't already exist.
200 *
201 * 	- netmap_monitor_adapter      [netmap_monitor.c]
202 * 	      Returned by netmap_get_monitor_na().
203 * 	      If successful, the nm_sync callbacks of the monitored adapter
204 * 	      will be intercepted by the returned monitor.
205 *
206 * 	- netmap_bwrap_adapter	      [netmap_vale.c]
207 * 	      Cannot be obtained in this way, see VALE_CTL below
208 *
209 *
210 * 	os-specific:
211 * 	    linux: we first go through linux_netmap_ioctl() to
212 * 	           adapt the FreeBSD interface to the linux one.
213 *
214 *
215 * > 3. on each descriptor, the process issues an mmap() request to
216 * >    map the shared memory region within the process' address space.
217 * >    The list of interesting queues is indicated by a location in
218 * >    the shared memory region.
219 *
220 *      os-specific:
221 *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
222 *  	    linux:   linux_netmap_mmap (netmap_linux.c).
223 *
224 * > 4. using the functions in the netmap(4) userspace API, a process
225 * >    can look up the occupation state of a queue, access memory buffers,
226 * >    and retrieve received packets or enqueue packets to transmit.
227 *
228 * 	these actions do not involve the kernel.
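 *
 * 	For instance (an illustrative sketch only; the macros used below
 * 	are those exported by net/netmap_user.h, and pkt_len stands for
 * 	the application's payload length):
 *
 * 		ring = NETMAP_TXRING(nifp, 0);
 * 		while (!nm_ring_empty(ring)) {
 * 			i    = ring->cur;
 * 			slot = &ring->slot[i];
 * 			buf  = NETMAP_BUF(ring, slot->buf_idx);
 * 			... copy at most ring->nr_buf_size bytes into buf ...
 * 			slot->len = pkt_len;
 * 			ring->head = ring->cur = nm_ring_next(ring, i);
 * 		}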
229 *
230 * > 5. using some ioctl()s the process can synchronize the userspace view
231 * >    of the queue with the actual status in the kernel. This includes both
232 * >    receiving the notification of new packets, and transmitting new
233 * >    packets on the output interface.
234 *
235 * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
236 * 	cases. They invoke the nm_sync callbacks on the netmap_kring
237 * 	structures, as initialized in step 2 and maybe later modified
238 * 	by a monitor. Monitors, however, will always call the original
239 * 	callback before doing anything else.
240 *
241 *
242 * > 6. select() or poll() can be used to wait for events on individual
243 * >    transmit or receive queues (or all queues for a given interface).
244 *
245 * 	Implemented in netmap_poll(). This will call the same nm_sync()
246 * 	callbacks as in step 5 above.
247 *
248 * 	os-specific:
249 * 		linux: we first go through linux_netmap_poll() to adapt
250 * 		       the FreeBSD interface to the linux one.
251 *
252 *
253 *  ----  VALE_CTL -----
254 *
255 *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
256 *  nr_cmd in the nmreq structure. These subcommands are handled by
257 *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
258 *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
259 *  subcommands, respectively.
260 *
261 *  Any network interface known to the system (including a persistent VALE
262 *  port) can be attached to a VALE switch by issuing the
263 *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
264 *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
265 *  attachment of other interfaces, instead, requires the creation of a
266 *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
267 *  netmap mode. This may require the creation of a netmap_generic_adapter if
268 *  we have no native support for the interface, or if generic adapters have
269 *  been forced by sysctl.
270 *
271 *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
272 *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
273 *  callback.  In the case of the bwrap, the callback creates the
274 *  netmap_bwrap_adapter.  The initialization of the bwrap is then
275 *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
276 *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
277 *  A generic adapter for the wrapped ifp will be created if needed, when
278 *  netmap_get_bdg_na() calls netmap_get_hw_na().
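 *
 *  As an illustration, using the legacy nmreq ABI (the vale-ctl example
 *  tool in tools/tools/netmap issues equivalent requests; this sketch is
 *  not part of this file):
 *
 *      strlcpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
 *      nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *      ioctl(fd, NIOCREGIF, &nmr);	attaches em0 to the switch vale0
 *
 *  while simply registering a name of the form "vale0:p1" (nr_cmd left to
 *  zero) creates an ephemeral port p1 attached to the same switch.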
279 *
280 *
281 *  ---- DATAPATHS -----
282 *
283 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
284 *
285 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
286 *
287 *    - tx from netmap userspace:
288 *	 concurrently:
289 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
290 *                kring->nm_sync() == DEVICE_netmap_txsync()
291 *           2) device interrupt handler
292 *                na->nm_notify()  == netmap_notify()
293 *    - rx from netmap userspace:
294 *       concurrently:
295 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
296 *                kring->nm_sync() == DEVICE_netmap_rxsync()
297 *           2) device interrupt handler
298 *                na->nm_notify()  == netmap_notify()
299 *    - rx from host stack
300 *       concurrently:
301 *           1) host stack
302 *                netmap_transmit()
303 *                  na->nm_notify  == netmap_notify()
304 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
305 *                kring->nm_sync() == netmap_rxsync_from_host
306 *                  netmap_rxsync_from_host(na, NULL, NULL)
307 *    - tx to host stack
308 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
309 *             kring->nm_sync() == netmap_txsync_to_host
310 *               netmap_txsync_to_host(na)
311 *                 nm_os_send_up()
312 *                   FreeBSD: na->if_input() == ether_input()
313 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
314 *
315 *
316 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
317 *
318 *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
319 *
320 *    - tx from netmap userspace:
321 *       concurrently:
322 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
323 *               kring->nm_sync() == generic_netmap_txsync()
324 *                   nm_os_generic_xmit_frame()
325 *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
326 *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
327 *                               gna->save_start_xmit == orig. dev. start_xmit
328 *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
329 *           2) generic_mbuf_destructor()
330 *                   na->nm_notify() == netmap_notify()
331 *    - rx from netmap userspace:
332 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
333 *               kring->nm_sync() == generic_netmap_rxsync()
334 *                   mbq_safe_dequeue()
335 *           2) device driver
336 *               generic_rx_handler()
337 *                   mbq_safe_enqueue()
338 *                   na->nm_notify() == netmap_notify()
339 *    - rx from host stack
340 *        FreeBSD: same as native
341 *        Linux: same as native except:
342 *           1) host stack
343 *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
344 *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
345 *                       netmap_transmit()
346 *                           na->nm_notify() == netmap_notify()
347 *    - tx to host stack (same as native):
348 *
349 *
350 *                           -= VALE =-
351 *
352 *   INCOMING:
353 *
354 *      - VALE ports:
355 *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
356 *              kring->nm_sync() == netmap_vp_txsync()
357 *
358 *      - system device with native support:
359 *         from cable:
360 *             interrupt
361 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
362 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
363 *                     netmap_vp_txsync()
364 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365 *         from host stack:
366 *             netmap_transmit()
367 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
368 *                     kring->nm_sync() == netmap_rxsync_from_host()
369 *                     netmap_vp_txsync()
370 *
371 *      - system device with generic support:
372 *         from device driver:
373 *            generic_rx_handler()
374 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
375 *                     kring->nm_sync() == generic_netmap_rxsync()
376 *                     netmap_vp_txsync()
377 *                     kring->nm_sync() == generic_netmap_rxsync()
378 *         from host stack:
379 *            netmap_transmit()
380 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
381 *                     kring->nm_sync() == netmap_rxsync_from_host()
382 *                     netmap_vp_txsync()
383 *
384 *   (all cases) --> nm_bdg_flush()
385 *                      dest_na->nm_notify() == (see below)
386 *
387 *   OUTGOING:
388 *
389 *      - VALE ports:
390 *         concurrently:
391 *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
392 *                    kring->nm_sync() == netmap_vp_rxsync()
393 *             2) from nm_bdg_flush()
394 *                    na->nm_notify() == netmap_notify()
395 *
396 *      - system device with native support:
397 *          to cable:
398 *             na->nm_notify() == netmap_bwrap_notify()
399 *                 netmap_vp_rxsync()
400 *                 kring->nm_sync() == DEVICE_netmap_txsync()
401 *                 netmap_vp_rxsync()
402 *          to host stack:
403 *                 netmap_vp_rxsync()
404 *                 kring->nm_sync() == netmap_txsync_to_host
405 *                 netmap_vp_rxsync_locked()
406 *
407 *      - system device with generic adapter:
408 *          to device driver:
409 *             na->nm_notify() == netmap_bwrap_notify()
410 *                 netmap_vp_rxsync()
411 *                 kring->nm_sync() == generic_netmap_txsync()
412 *                 netmap_vp_rxsync()
413 *          to host stack:
414 *                 netmap_vp_rxsync()
415 *                 kring->nm_sync() == netmap_txsync_to_host
416 *                 netmap_vp_rxsync()
417 *
418 */
419
420/*
421 * OS-specific code that is used only within this file.
422 * Other OS-specific code that must be accessed by drivers
423 * is present in netmap_kern.h
424 */
425
426#if defined(__FreeBSD__)
427#include <sys/cdefs.h> /* prerequisite */
428#include <sys/types.h>
429#include <sys/errno.h>
430#include <sys/param.h>	/* defines used in kernel.h */
431#include <sys/kernel.h>	/* types used in module initialization */
432#include <sys/conf.h>	/* cdevsw struct, UID, GID */
433#include <sys/filio.h>	/* FIONBIO */
434#include <sys/sockio.h>
435#include <sys/socketvar.h>	/* struct socket */
436#include <sys/malloc.h>
437#include <sys/poll.h>
438#include <sys/rwlock.h>
439#include <sys/socket.h> /* sockaddrs */
440#include <sys/selinfo.h>
441#include <sys/sysctl.h>
442#include <sys/jail.h>
443#include <net/vnet.h>
444#include <net/if.h>
445#include <net/if_var.h>
446#include <net/bpf.h>		/* BIOCIMMEDIATE */
447#include <machine/bus.h>	/* bus_dmamap_* */
448#include <sys/endian.h>
449#include <sys/refcount.h>
450#include <net/ethernet.h>	/* ETHER_BPF_MTAP */
451
452
453#elif defined(linux)
454
455#include "bsd_glue.h"
456
457#elif defined(__APPLE__)
458
459#warning OSX support is only partial
460#include "osx_glue.h"
461
462#elif defined (_WIN32)
463
464#include "win_glue.h"
465
466#else
467
468#error	Unsupported platform
469
470#endif /* unsupported */
471
472/*
473 * common headers
474 */
475#include <net/netmap.h>
476#include <dev/netmap/netmap_kern.h>
477#include <dev/netmap/netmap_mem2.h>
478
479
480/* user-controlled variables */
481int netmap_verbose;
482#ifdef CONFIG_NETMAP_DEBUG
483int netmap_debug;
484#endif /* CONFIG_NETMAP_DEBUG */
485
486static int netmap_no_timestamp; /* don't timestamp on rxsync */
487int netmap_mitigate = 1;
488int netmap_no_pendintr = 1;
489int netmap_txsync_retry = 2;
490static int netmap_fwd = 0;	/* force transparent forwarding */
491
492/*
493 * netmap_admode selects the netmap mode to use.
494 * Invalid values are reset to NETMAP_ADMODE_BEST
495 */
496enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
497	NETMAP_ADMODE_NATIVE,	/* either native or none */
498	NETMAP_ADMODE_GENERIC,	/* force generic */
499	NETMAP_ADMODE_LAST };
500static int netmap_admode = NETMAP_ADMODE_BEST;
501
502/* netmap_generic_mit controls mitigation of RX notifications for
503 * the generic netmap adapter. The value is a time interval in
504 * nanoseconds. */
505int netmap_generic_mit = 100*1000;
506
507/* We use by default netmap-aware qdiscs with generic netmap adapters,
508 * even if there can be a little performance hit with hardware NICs.
509 * However, using the qdisc is the safer approach, for two reasons:
510 * 1) it prevents non-fifo qdiscs from breaking the TX notification
511 *    scheme, which is based on mbuf destructors when txqdisc is
512 *    not used.
513 * 2) it makes it possible to transmit over software devices that
514 *    change skb->dev, like bridge, veth, ...
515 *
516 * Anyway users looking for the best performance should
517 * use native adapters.
518 */
519#ifdef linux
520int netmap_generic_txqdisc = 1;
521#endif
522
523/* Default number of slots and queues for generic adapters. */
524int netmap_generic_ringsize = 1024;
525int netmap_generic_rings = 1;
526
527/* Non-zero to enable checksum offloading in NIC drivers */
528int netmap_generic_hwcsum = 0;
529
530/* Non-zero if ptnet devices are allowed to use virtio-net headers. */
531int ptnet_vnet_hdr = 1;
532
533/*
534 * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
535 * in some other operating systems
536 */
537SYSBEGIN(main_init);
538
539SYSCTL_DECL(_dev_netmap);
540SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
541SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
542		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
543#ifdef CONFIG_NETMAP_DEBUG
544SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
545		CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
546#endif /* CONFIG_NETMAP_DEBUG */
547SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
548		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
549SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
550		0, "Always look for new received packets.");
551SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate,
552		0, "Interrupt mitigation for netmap TX wakeups");
553SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
554		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
555
556SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
557		"Force NR_FORWARD mode");
558SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
559		"Adapter mode. 0 selects the best option available, "
560		"1 forces native adapter, 2 forces emulated adapter");
561SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
562		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default), "
563		"1 to enable checksum generation by the NIC");
564SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
565		0, "RX notification interval in nanoseconds");
566SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
567		&netmap_generic_ringsize, 0,
568		"Number of per-ring slots for emulated netmap mode");
569SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
570		&netmap_generic_rings, 0,
571		"Number of TX/RX queues for emulated netmap adapters");
572#ifdef linux
573SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
574		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
575#endif
576SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
577		0, "Allow ptnet devices to use virtio-net headers");
578
579SYSEND;
580
581NMG_LOCK_T	netmap_global_lock;
582
583/*
584 * mark the ring as stopped, and run through the locks
585 * to make sure other users get to see it.
586 * stopped must be either NM_KR_STOPPED (for unbounded stop)
587 * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
588 */
589static void
590netmap_disable_ring(struct netmap_kring *kr, int stopped)
591{
592	nm_kr_stop(kr, stopped);
593	// XXX check if nm_kr_stop is sufficient
594	mtx_lock(&kr->q_lock);
595	mtx_unlock(&kr->q_lock);
596	nm_kr_put(kr);
597}
598
599/* stop or enable a single ring */
600void
601netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
602{
603	if (stopped)
604		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
605	else
606		NMR(na, t)[ring_id]->nkr_stopped = 0;
607}
608
609
610/* stop or enable all the rings of na */
611void
612netmap_set_all_rings(struct netmap_adapter *na, int stopped)
613{
614	int i;
615	enum txrx t;
616
617	if (!nm_netmap_on(na))
618		return;
619
620	for_rx_tx(t) {
621		for (i = 0; i < netmap_real_rings(na, t); i++) {
622			netmap_set_ring(na, i, t, stopped);
623		}
624	}
625}
626
627/*
628 * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
629 * to finish and prevents any new one from starting.  Call this before turning
630 * netmap mode off, or before removing the hardware rings (e.g., on module
631 * unload).
632 */
633void
634netmap_disable_all_rings(struct ifnet *ifp)
635{
636	if (NM_NA_VALID(ifp)) {
637		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
638	}
639}
640
641/*
642 * Convenience function used in drivers.  Re-enables rxsync and txsync on the
643 * adapter's rings.  In linux drivers, this should be placed near each
644 * napi_enable().
645 */
646void
647netmap_enable_all_rings(struct ifnet *ifp)
648{
649	if (NM_NA_VALID(ifp)) {
650		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
651	}
652}
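
/*
 * Example (hypothetical driver code, not part of this file): a driver
 * typically brackets a hardware reinitialization path with the two
 * helpers above:
 *
 *	DRV_reinit(struct DRV_adapter *sc)
 *	{
 *		netmap_disable_all_rings(sc->ifp);
 *		... stop the device and reprogram the hardware rings ...
 *		netmap_enable_all_rings(sc->ifp);
 *	}
 */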
653
654void
655netmap_make_zombie(struct ifnet *ifp)
656{
657	if (NM_NA_VALID(ifp)) {
658		struct netmap_adapter *na = NA(ifp);
659		netmap_set_all_rings(na, NM_KR_LOCKED);
660		na->na_flags |= NAF_ZOMBIE;
661		netmap_set_all_rings(na, 0);
662	}
663}
664
665void
666netmap_undo_zombie(struct ifnet *ifp)
667{
668	if (NM_NA_VALID(ifp)) {
669		struct netmap_adapter *na = NA(ifp);
670		if (na->na_flags & NAF_ZOMBIE) {
671			netmap_set_all_rings(na, NM_KR_LOCKED);
672			na->na_flags &= ~NAF_ZOMBIE;
673			netmap_set_all_rings(na, 0);
674		}
675	}
676}
677
678/*
679 * generic bound_checking function
680 */
681u_int
682nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
683{
684	u_int oldv = *v;
685	const char *op = NULL;
686
687	if (dflt < lo)
688		dflt = lo;
689	if (dflt > hi)
690		dflt = hi;
691	if (oldv < lo) {
692		*v = dflt;
693		op = "Bump";
694	} else if (oldv > hi) {
695		*v = hi;
696		op = "Clamp";
697	}
698	if (op && msg)
699		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
700	return *v;
701}
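
/*
 * Example (illustrative): nm_bound_var(&ring_size, 1024, 64, 4096, "ring size")
 * bumps a value below 64 up to the default 1024, clamps a value above 4096
 * down to 4096, and logs the adjustment; in-range values are left untouched.
 */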
702
703
704/*
705 * packet-dump function, user-supplied or static buffer.
706 * The destination buffer must be at least 30+4*len
707 */
708const char *
709nm_dump_buf(char *p, int len, int lim, char *dst)
710{
711	static char _dst[8192];
712	int i, j, i0;
713	static char hex[] ="0123456789abcdef";
714	char *o;	/* output position */
715
716#define P_HI(x)	hex[((x) & 0xf0)>>4]
717#define P_LO(x)	hex[((x) & 0xf)]
718#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
719	if (!dst)
720		dst = _dst;
721	if (lim <= 0 || lim > len)
722		lim = len;
723	o = dst;
724	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
725	o += strlen(o);
726	/* hexdump routine */
727	for (i = 0; i < lim; ) {
728		sprintf(o, "%5d: ", i);
729		o += strlen(o);
730		memset(o, ' ', 48);
731		i0 = i;
732		for (j=0; j < 16 && i < lim; i++, j++) {
733			o[j*3] = P_HI(p[i]);
734			o[j*3+1] = P_LO(p[i]);
735		}
736		i = i0;
737		for (j=0; j < 16 && i < lim; i++, j++)
738			o[j + 48] = P_C(p[i]);
739		o[j+48] = '\n';
740		o += j+49;
741	}
742	*o = '\0';
743#undef P_HI
744#undef P_LO
745#undef P_C
746	return dst;
747}
748
749
750/*
751 * Fetch configuration from the device, to cope with dynamic
752 * reconfigurations after loading the module.
753 */
754/* call with NMG_LOCK held */
755int
756netmap_update_config(struct netmap_adapter *na)
757{
758	struct nm_config_info info;
759
760	bzero(&info, sizeof(info));
761	if (na->nm_config == NULL ||
762	    na->nm_config(na, &info)) {
763		/* take whatever we had at init time */
764		info.num_tx_rings = na->num_tx_rings;
765		info.num_tx_descs = na->num_tx_desc;
766		info.num_rx_rings = na->num_rx_rings;
767		info.num_rx_descs = na->num_rx_desc;
768		info.rx_buf_maxsize = na->rx_buf_maxsize;
769	}
770
771	if (na->num_tx_rings == info.num_tx_rings &&
772	    na->num_tx_desc == info.num_tx_descs &&
773	    na->num_rx_rings == info.num_rx_rings &&
774	    na->num_rx_desc == info.num_rx_descs &&
775	    na->rx_buf_maxsize == info.rx_buf_maxsize)
776		return 0; /* nothing changed */
777	if (na->active_fds == 0) {
778		na->num_tx_rings = info.num_tx_rings;
779		na->num_tx_desc = info.num_tx_descs;
780		na->num_rx_rings = info.num_rx_rings;
781		na->num_rx_desc = info.num_rx_descs;
782		na->rx_buf_maxsize = info.rx_buf_maxsize;
783		if (netmap_verbose)
784			nm_prinf("configuration changed for %s: txring %d x %d, "
785				"rxring %d x %d, rxbufsz %d",
786				na->name, na->num_tx_rings, na->num_tx_desc,
787				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
788		return 0;
789	}
790	nm_prerr("WARNING: configuration changed for %s while active: "
791		"txring %d x %d, rxring %d x %d, rxbufsz %d",
792		na->name, info.num_tx_rings, info.num_tx_descs,
793		info.num_rx_rings, info.num_rx_descs,
794		info.rx_buf_maxsize);
795	return 1;
796}
797
798/* nm_sync callbacks for the host rings */
799static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
800static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
801
802/* create the krings array and initialize the fields common to all adapters.
803 * The array layout is this:
804 *
805 *                    +----------+
806 * na->tx_rings ----->|          | \
807 *                    |          |  } na->num_tx_rings
808 *                    |          | /
809 *                    +----------+
810 *                    |          |    host tx kring
811 * na->rx_rings ----> +----------+
812 *                    |          | \
813 *                    |          |  } na->num_rx_rings
814 *                    |          | /
815 *                    +----------+
816 *                    |          |    host rx kring
817 *                    +----------+
818 * na->tailroom ----->|          | \
819 *                    |          |  } tailroom bytes
820 *                    |          | /
821 *                    +----------+
822 *
823 * Note: for compatibility, host krings are created even when not needed.
824 * The tailroom space is currently used by vale ports for allocating leases.
825 */
826/* call with NMG_LOCK held */
827int
828netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
829{
830	u_int i, len, ndesc;
831	struct netmap_kring *kring;
832	u_int n[NR_TXRX];
833	enum txrx t;
834
835	if (na->tx_rings != NULL) {
836		if (netmap_debug & NM_DEBUG_ON)
837			nm_prerr("warning: krings were already created");
838		return 0;
839	}
840
841	/* account for the (possibly fake) host rings */
842	n[NR_TX] = netmap_all_rings(na, NR_TX);
843	n[NR_RX] = netmap_all_rings(na, NR_RX);
844
845	len = (n[NR_TX] + n[NR_RX]) *
846		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
847		+ tailroom;
848
849	na->tx_rings = nm_os_malloc((size_t)len);
850	if (na->tx_rings == NULL) {
851		nm_prerr("Cannot allocate krings");
852		return ENOMEM;
853	}
854	na->rx_rings = na->tx_rings + n[NR_TX];
855	na->tailroom = na->rx_rings + n[NR_RX];
856
857	/* link the krings in the krings array */
858	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
859	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
860		na->tx_rings[i] = kring;
861		kring++;
862	}
863
864	/*
865	 * All fields in krings are 0 except the ones initialized below,
866	 * but it is better to be explicit on important kring fields.
867	 */
868	for_rx_tx(t) {
869		ndesc = nma_get_ndesc(na, t);
870		for (i = 0; i < n[t]; i++) {
871			kring = NMR(na, t)[i];
872			bzero(kring, sizeof(*kring));
873			kring->na = na;
874			kring->notify_na = na;
875			kring->ring_id = i;
876			kring->tx = t;
877			kring->nkr_num_slots = ndesc;
878			kring->nr_mode = NKR_NETMAP_OFF;
879			kring->nr_pending_mode = NKR_NETMAP_OFF;
880			if (i < nma_get_nrings(na, t)) {
881				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
882			} else {
883				if (!(na->na_flags & NAF_HOST_RINGS))
884					kring->nr_kflags |= NKR_FAKERING;
885				kring->nm_sync = (t == NR_TX ?
886						netmap_txsync_to_host:
887						netmap_rxsync_from_host);
888			}
889			kring->nm_notify = na->nm_notify;
890			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
891			/*
892			 * IMPORTANT: Always keep one slot empty.
893			 */
894			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
895			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
896					nm_txrx2str(t), i);
897			ND("ktx %s h %d c %d t %d",
898				kring->name, kring->rhead, kring->rcur, kring->rtail);
899			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
900			nm_os_selinfo_init(&kring->si);
901		}
902		nm_os_selinfo_init(&na->si[t]);
903	}
904
905
906	return 0;
907}
908
909
910/* undo the actions performed by netmap_krings_create */
911/* call with NMG_LOCK held */
912void
913netmap_krings_delete(struct netmap_adapter *na)
914{
915	struct netmap_kring **kring = na->tx_rings;
916	enum txrx t;
917
918	if (na->tx_rings == NULL) {
919		if (netmap_debug & NM_DEBUG_ON)
920			nm_prerr("warning: krings were already deleted");
921		return;
922	}
923
924	for_rx_tx(t)
925		nm_os_selinfo_uninit(&na->si[t]);
926
927	/* we rely on the krings layout described above */
928	for ( ; kring != na->tailroom; kring++) {
929		mtx_destroy(&(*kring)->q_lock);
930		nm_os_selinfo_uninit(&(*kring)->si);
931	}
932	nm_os_free(na->tx_rings);
933	na->tx_rings = na->rx_rings = na->tailroom = NULL;
934}
935
936
937/*
938 * Destructor for NIC ports. They also have an mbuf queue
939 * on the rings connected to the host so we need to purge
940 * them first.
941 */
942/* call with NMG_LOCK held */
943void
944netmap_hw_krings_delete(struct netmap_adapter *na)
945{
946	u_int lim = netmap_real_rings(na, NR_RX), i;
947
948	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
949		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
950		ND("destroy sw mbq with len %d", mbq_len(q));
951		mbq_purge(q);
952		mbq_safe_fini(q);
953	}
954	netmap_krings_delete(na);
955}
956
957static void
958netmap_mem_drop(struct netmap_adapter *na)
959{
960	int last = netmap_mem_deref(na->nm_mem, na);
961	/* if the native allocator had been overridden on regif,
962	 * restore it now and drop the temporary one
963	 */
964	if (last && na->nm_mem_prev) {
965		netmap_mem_put(na->nm_mem);
966		na->nm_mem = na->nm_mem_prev;
967		na->nm_mem_prev = NULL;
968	}
969}
970
971/*
972 * Undo everything that was done in netmap_do_regif(). In particular,
973 * call nm_register(na, 0) to stop netmap mode on the interface and
974 * revert to normal operation.
975 */
976/* call with NMG_LOCK held */
977static void netmap_unset_ringid(struct netmap_priv_d *);
978static void netmap_krings_put(struct netmap_priv_d *);
979void
980netmap_do_unregif(struct netmap_priv_d *priv)
981{
982	struct netmap_adapter *na = priv->np_na;
983
984	NMG_LOCK_ASSERT();
985	na->active_fds--;
986	/* unset nr_pending_mode and possibly release exclusive mode */
987	netmap_krings_put(priv);
988
989#ifdef	WITH_MONITOR
990	/* XXX check whether we have to do something with monitor
991	 * when rings change nr_mode. */
992	if (na->active_fds <= 0) {
993		/* walk through all the rings and tell any monitor
994		 * that the port is going to exit netmap mode
995		 */
996		netmap_monitor_stop(na);
997	}
998#endif
999
1000	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
1001		na->nm_register(na, 0);
1002	}
1003
1004	/* delete rings and buffers that are no longer needed */
1005	netmap_mem_rings_delete(na);
1006
1007	if (na->active_fds <= 0) {	/* last instance */
1008		/*
1009		 * (TO CHECK) We enter here
1010		 * when the last reference to this file descriptor goes
1011		 * away. This means we cannot have any pending poll()
1012		 * or interrupt routine operating on the structure.
1013		 * XXX The file may be closed in a thread while
1014		 * another thread is using it.
1015		 * Linux keeps the file opened until the last reference
1016		 * by any outstanding ioctl/poll or mmap is gone.
1017		 * FreeBSD does not track mmap()s (but we do) and
1018		 * wakes up any sleeping poll(). Need to check what
1019		 * happens if the close() occurs while a concurrent
1020		 * syscall is running.
1021		 */
1022		if (netmap_debug & NM_DEBUG_ON)
1023			nm_prinf("deleting last instance for %s", na->name);
1024
1025		if (nm_netmap_on(na)) {
1026			nm_prerr("BUG: netmap on while going to delete the krings");
1027		}
1028
1029		na->nm_krings_delete(na);
1030	}
1031
1032	/* possibly decrement counter of tx_si/rx_si users */
1033	netmap_unset_ringid(priv);
1034	/* delete the nifp */
1035	netmap_mem_if_delete(na, priv->np_nifp);
1036	/* drop the allocator */
1037	netmap_mem_drop(na);
1038	/* mark the priv as unregistered */
1039	priv->np_na = NULL;
1040	priv->np_nifp = NULL;
1041}
1042
1043struct netmap_priv_d*
1044netmap_priv_new(void)
1045{
1046	struct netmap_priv_d *priv;
1047
1048	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1049	if (priv == NULL)
1050		return NULL;
1051	priv->np_refs = 1;
1052	nm_os_get_module();
1053	return priv;
1054}
1055
1056/*
1057 * Destructor of the netmap_priv_d, called when the fd is closed.
1058 * Action: undo all the things done by NIOCREGIF.
1059 * On FreeBSD we need to track whether there are active mmap()s,
1060 * and we use np_active_mmaps for that. On linux, the field is always 0.
1061 * The priv itself is freed only when the last reference (np_refs) is gone.
1062 *
1063 */
1064/* call with NMG_LOCK held */
1065void
1066netmap_priv_delete(struct netmap_priv_d *priv)
1067{
1068	struct netmap_adapter *na = priv->np_na;
1069
1070	/* number of active references to this fd */
1071	if (--priv->np_refs > 0) {
1072		return;
1073	}
1074	nm_os_put_module();
1075	if (na) {
1076		netmap_do_unregif(priv);
1077	}
1078	netmap_unget_na(na, priv->np_ifp);
1079	bzero(priv, sizeof(*priv));	/* for safety */
1080	nm_os_free(priv);
1081}
1082
1083
1084/* call with NMG_LOCK *not* held */
1085void
1086netmap_dtor(void *data)
1087{
1088	struct netmap_priv_d *priv = data;
1089
1090	NMG_LOCK();
1091	netmap_priv_delete(priv);
1092	NMG_UNLOCK();
1093}
1094
1095
1096/*
1097 * Handlers for synchronization of the rings from/to the host stack.
1098 * These are associated to a network interface and are just another
1099 * ring pair managed by userspace.
1100 *
1101 * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1102 * flags):
1103 *
1104 * - Before releasing buffers on hw RX rings, the application can mark
1105 *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1106 *   will be forwarded to the host stack, similarly to what happened if
1107 *   the application moved them to the host TX ring.
1108 *
1109 * - Before releasing buffers on the host RX ring, the application can
1110 *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1111 *   they will be forwarded to the hw TX rings, saving the application
1112 *   from doing the same task in user-space.
1113 *
1114 * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1115 * flag, or globally with the netmap_fwd sysctl.
1116 *
1117 * The transfer NIC --> host is relatively easy, just encapsulate
1118 * into mbufs and we are done. The host --> NIC side is slightly
1119 * harder because there might not be room in the tx ring so it
1120 * might take a while before releasing the buffer.
1121 */
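
/*
 * Illustrative userspace fragment (flag names as in net/netmap.h; the
 * application must also set NR_FORWARD in ring->flags, or the
 * dev.netmap.fwd sysctl must be enabled; "wanted" stands for application
 * logic):
 *
 *	slot = &rxring->slot[rxring->cur];
 *	if (!wanted)
 *		slot->flags |= NS_FORWARD;
 *	rxring->head = rxring->cur = nm_ring_next(rxring, rxring->cur);
 *
 * The marked slot is pushed to the host stack during the next rxsync/poll.
 */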
1122
1123
1124/*
1125 * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1126 * We do not need to lock because the queue is private.
1127 * After this call the queue is empty.
1128 */
1129static void
1130netmap_send_up(struct ifnet *dst, struct mbq *q)
1131{
1132	struct mbuf *m;
1133	struct mbuf *head = NULL, *prev = NULL;
1134
1135	/* Send packets up, outside the lock; head/prev machinery
1136	 * is only useful for Windows. */
1137	while ((m = mbq_dequeue(q)) != NULL) {
1138		if (netmap_debug & NM_DEBUG_HOST)
1139			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1140		prev = nm_os_send_up(dst, m, prev);
1141		if (head == NULL)
1142			head = prev;
1143	}
1144	if (head)
1145		nm_os_send_up(dst, NULL, head);
1146	mbq_fini(q);
1147}
1148
1149
1150/*
1151 * Scan the buffers from hwcur to ring->head, and put a copy of those
1152 * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1153 * Drop remaining packets in the unlikely event
1154 * of an mbuf shortage.
1155 */
1156static void
1157netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1158{
1159	u_int const lim = kring->nkr_num_slots - 1;
1160	u_int const head = kring->rhead;
1161	u_int n;
1162	struct netmap_adapter *na = kring->na;
1163
1164	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1165		struct mbuf *m;
1166		struct netmap_slot *slot = &kring->ring->slot[n];
1167
1168		if ((slot->flags & NS_FORWARD) == 0 && !force)
1169			continue;
1170		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1171			RD(5, "bad pkt at %d len %d", n, slot->len);
1172			continue;
1173		}
1174		slot->flags &= ~NS_FORWARD; // XXX needed ?
1175		/* XXX TODO: adapt to the case of a multisegment packet */
1176		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1177
1178		if (m == NULL)
1179			break;
1180		mbq_enqueue(q, m);
1181	}
1182}
1183
1184static inline int
1185_nm_may_forward(struct netmap_kring *kring)
1186{
1187	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1188		 kring->na->na_flags & NAF_HOST_RINGS &&
1189		 kring->tx == NR_RX);
1190}
1191
1192static inline int
1193nm_may_forward_up(struct netmap_kring *kring)
1194{
1195	return	_nm_may_forward(kring) &&
1196		 kring->ring_id != kring->na->num_rx_rings;
1197}
1198
1199static inline int
1200nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1201{
1202	return	_nm_may_forward(kring) &&
1203		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1204		 kring->ring_id == kring->na->num_rx_rings;
1205}
1206
1207/*
1208 * Send to the NIC rings packets marked NS_FORWARD between
1209 * kring->nr_hwcur and kring->rhead.
1210 * Called under kring->rx_queue.lock on the sw rx ring.
1211 *
1212 * It can only be called if the user opened all the TX hw rings,
1213 * see NAF_CAN_FORWARD_DOWN flag.
1214 * We can touch the TX netmap rings (slots, head and cur) since
1215 * we are in poll/ioctl system call context, and the application
1216 * is not supposed to touch the ring (using a different thread)
1217 * during the execution of the system call.
1218 */
1219static u_int
1220netmap_sw_to_nic(struct netmap_adapter *na)
1221{
1222	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1223	struct netmap_slot *rxslot = kring->ring->slot;
1224	u_int i, rxcur = kring->nr_hwcur;
1225	u_int const head = kring->rhead;
1226	u_int const src_lim = kring->nkr_num_slots - 1;
1227	u_int sent = 0;
1228
1229	/* scan rings to find space, then fill as much as possible */
1230	for (i = 0; i < na->num_tx_rings; i++) {
1231		struct netmap_kring *kdst = na->tx_rings[i];
1232		struct netmap_ring *rdst = kdst->ring;
1233		u_int const dst_lim = kdst->nkr_num_slots - 1;
1234
1235		/* XXX do we trust ring or kring->rcur,rtail ? */
1236		for (; rxcur != head && !nm_ring_empty(rdst);
1237		     rxcur = nm_next(rxcur, src_lim) ) {
1238			struct netmap_slot *src, *dst, tmp;
1239			u_int dst_head = rdst->head;
1240
1241			src = &rxslot[rxcur];
1242			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1243				continue;
1244
1245			sent++;
1246
1247			dst = &rdst->slot[dst_head];
1248
1249			tmp = *src;
1250
1251			src->buf_idx = dst->buf_idx;
1252			src->flags = NS_BUF_CHANGED;
1253
1254			dst->buf_idx = tmp.buf_idx;
1255			dst->len = tmp.len;
1256			dst->flags = NS_BUF_CHANGED;
1257
1258			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1259		}
1260		/* if (sent) XXX txsync ? it would be just an optimization */
1261	}
1262	return sent;
1263}
1264
1265
1266/*
1267 * netmap_txsync_to_host() passes packets up. We are called from a
1268 * system call in user process context, and the only contention
1269 * can be among multiple user threads erroneously calling
1270 * this routine concurrently.
1271 */
1272static int
1273netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1274{
1275	struct netmap_adapter *na = kring->na;
1276	u_int const lim = kring->nkr_num_slots - 1;
1277	u_int const head = kring->rhead;
1278	struct mbq q;
1279
1280	/* Take packets from hwcur to head and pass them up.
1281	 * Force hwcur = head since netmap_grab_packets() stops at head
1282	 */
1283	mbq_init(&q);
1284	netmap_grab_packets(kring, &q, 1 /* force */);
1285	ND("have %d pkts in queue", mbq_len(&q));
1286	kring->nr_hwcur = head;
1287	kring->nr_hwtail = head + lim;
1288	if (kring->nr_hwtail > lim)
1289		kring->nr_hwtail -= lim + 1;
1290
1291	netmap_send_up(na->ifp, &q);
1292	return 0;
1293}
1294
1295
1296/*
1297 * rxsync backend for packets coming from the host stack.
1298 * They have been put in kring->rx_queue by netmap_transmit().
1299 * We protect access to the kring using kring->rx_queue.lock
1300 *
1301 * also moves to the nic hw rings any packet the user has marked
1302 * for transparent-mode forwarding, then sets the NR_FORWARD
1303 * flag in the kring to let the caller push them out
1304 */
1305static int
1306netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1307{
1308	struct netmap_adapter *na = kring->na;
1309	struct netmap_ring *ring = kring->ring;
1310	u_int nm_i, n;
1311	u_int const lim = kring->nkr_num_slots - 1;
1312	u_int const head = kring->rhead;
1313	int ret = 0;
1314	struct mbq *q = &kring->rx_queue, fq;
1315
1316	mbq_init(&fq); /* fq holds packets to be freed */
1317
1318	mbq_lock(q);
1319
1320	/* First part: import newly received packets */
1321	n = mbq_len(q);
1322	if (n) { /* grab packets from the queue */
1323		struct mbuf *m;
1324		uint32_t stop_i;
1325
1326		nm_i = kring->nr_hwtail;
1327		stop_i = nm_prev(kring->nr_hwcur, lim);
1328		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1329			int len = MBUF_LEN(m);
1330			struct netmap_slot *slot = &ring->slot[nm_i];
1331
1332			m_copydata(m, 0, len, NMB(na, slot));
1333			ND("nm %d len %d", nm_i, len);
1334			if (netmap_debug & NM_DEBUG_HOST)
1335				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1336
1337			slot->len = len;
1338			slot->flags = 0;
1339			nm_i = nm_next(nm_i, lim);
1340			mbq_enqueue(&fq, m);
1341		}
1342		kring->nr_hwtail = nm_i;
1343	}
1344
1345	/*
1346	 * Second part: skip past packets that userspace has released.
1347	 */
1348	nm_i = kring->nr_hwcur;
1349	if (nm_i != head) { /* something was released */
1350		if (nm_may_forward_down(kring, flags)) {
1351			ret = netmap_sw_to_nic(na);
1352			if (ret > 0) {
1353				kring->nr_kflags |= NR_FORWARD;
1354				ret = 0;
1355			}
1356		}
1357		kring->nr_hwcur = head;
1358	}
1359
1360	mbq_unlock(q);
1361
1362	mbq_purge(&fq);
1363	mbq_fini(&fq);
1364
1365	return ret;
1366}
1367
1368
1369/* Get a netmap adapter for the port.
1370 *
1371 * If it is possible to satisfy the request, return 0
1372 * with *na containing the netmap adapter found.
1373 * Otherwise return an error code, with *na containing NULL.
1374 *
1375 * When the port is attached to a bridge, we always return
1376 * EBUSY.
1377 * Otherwise, if the port is already bound to a file descriptor,
1378 * then we unconditionally return the existing adapter into *na.
1379 * In all the other cases, we return (into *na) either native,
1380 * generic or NULL, according to the following table:
1381 *
1382 *					native_support
1383 * active_fds   dev.netmap.admode         YES     NO
1384 * -------------------------------------------------------
1385 *    >0              *                 NA(ifp) NA(ifp)
1386 *
1387 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1388 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1389 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1390 *
1391 */
1392static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1393int
1394netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1395{
1396	/* generic support */
1397	int i = netmap_admode;	/* Take a snapshot. */
1398	struct netmap_adapter *prev_na;
1399	int error = 0;
1400
1401	*na = NULL; /* default */
1402
1403	/* reset in case of invalid value */
1404	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1405		i = netmap_admode = NETMAP_ADMODE_BEST;
1406
1407	if (NM_NA_VALID(ifp)) {
1408		prev_na = NA(ifp);
1409		/* If an adapter already exists, return it if
1410		 * there are active file descriptors or if
1411		 * netmap is not forced to use generic
1412		 * adapters.
1413		 */
1414		if (NETMAP_OWNED_BY_ANY(prev_na)
1415			|| i != NETMAP_ADMODE_GENERIC
1416			|| prev_na->na_flags & NAF_FORCE_NATIVE
1417#ifdef WITH_PIPES
1418			/* ugly, but we cannot allow an adapter switch
1419			 * if some pipe is referring to this one
1420			 */
1421			|| prev_na->na_next_pipe > 0
1422#endif
1423		) {
1424			*na = prev_na;
1425			goto assign_mem;
1426		}
1427	}
1428
1429	/* If there isn't native support and netmap is not allowed
1430	 * to use generic adapters, we cannot satisfy the request.
1431	 */
1432	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1433		return EOPNOTSUPP;
1434
1435	/* Otherwise, create a generic adapter and return it,
1436	 * saving the previously used netmap adapter, if any.
1437	 *
1438	 * Note that here 'prev_na', if not NULL, MUST be a
1439	 * native adapter, and CANNOT be a generic one. This is
1440	 * true because generic adapters are created on demand, and
1441	 * destroyed when not used anymore. Therefore, if the adapter
1442	 * currently attached to an interface 'ifp' is generic, it
1443	 * must be that
1444	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1445	 * Consequently, if NA(ifp) is generic, we will enter one of
1446	 * the branches above. This ensures that we never override
1447	 * a generic adapter with another generic adapter.
1448	 */
1449	error = generic_netmap_attach(ifp);
1450	if (error)
1451		return error;
1452
1453	*na = NA(ifp);
1454
1455assign_mem:
1456	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1457	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1458		(*na)->nm_mem_prev = (*na)->nm_mem;
1459		(*na)->nm_mem = netmap_mem_get(nmd);
1460	}
1461
1462	return 0;
1463}
1464
1465/*
1466 * MUST BE CALLED UNDER NMG_LOCK()
1467 *
1468 * Get a refcounted reference to a netmap adapter attached
1469 * to the interface specified by req.
1470 * This is always called in the execution of an ioctl().
1471 *
1472 * Return ENXIO if the interface specified by the request does
1473 * not exist, ENOTSUP if netmap is not supported by the interface,
1474 * EBUSY if the interface is already attached to a bridge,
1475 * EINVAL if parameters are invalid, ENOMEM if needed resources
1476 * could not be allocated.
1477 * If successful, hold a reference to the netmap adapter.
1478 *
1479 * If the interface specified by req is a system one, also keep
1480 * a reference to it and return a valid *ifp.
1481 */
1482int
1483netmap_get_na(struct nmreq_header *hdr,
1484	      struct netmap_adapter **na, struct ifnet **ifp,
1485	      struct netmap_mem_d *nmd, int create)
1486{
1487	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1488	int error = 0;
1489	struct netmap_adapter *ret = NULL;
1490	int nmd_ref = 0;
1491
1492	*na = NULL;     /* default return value */
1493	*ifp = NULL;
1494
1495	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1496		return EINVAL;
1497	}
1498
1499	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1500			req->nr_mode == NR_REG_PIPE_SLAVE) {
1501		/* Do not accept deprecated pipe modes. */
1502		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1503		return EINVAL;
1504	}
1505
1506	NMG_LOCK_ASSERT();
1507
1508	/* if the request contains a memid, try to find the
1509	 * corresponding memory region
1510	 */
1511	if (nmd == NULL && req->nr_mem_id) {
1512		nmd = netmap_mem_find(req->nr_mem_id);
1513		if (nmd == NULL)
1514			return EINVAL;
1515		/* keep the reference */
1516		nmd_ref = 1;
1517	}
1518
1519	/* We cascade through all possible types of netmap adapter.
1520	 * All netmap_get_*_na() functions return an error and an na,
1521	 * with the following combinations:
1522	 *
1523	 * error    na
1524	 *   0	   NULL		type doesn't match
1525	 *  !0	   NULL		type matches, but na creation/lookup failed
1526	 *   0	  !NULL		type matches and na created/found
1527	 *  !0    !NULL		impossible
1528	 */
1529	error = netmap_get_null_na(hdr, na, nmd, create);
1530	if (error || *na != NULL)
1531		goto out;
1532
1533	/* try to see if this is a monitor port */
1534	error = netmap_get_monitor_na(hdr, na, nmd, create);
1535	if (error || *na != NULL)
1536		goto out;
1537
1538	/* try to see if this is a pipe port */
1539	error = netmap_get_pipe_na(hdr, na, nmd, create);
1540	if (error || *na != NULL)
1541		goto out;
1542
1543	/* try to see if this is a bridge port */
1544	error = netmap_get_vale_na(hdr, na, nmd, create);
1545	if (error)
1546		goto out;
1547
1548	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1549		goto out;
1550
1551	/*
1552	 * This must be a hardware na; look up the name in the system.
1553	 * Note that by hardware we actually mean "it shows up in ifconfig".
1554	 * This may still be a tap, a veth/epair, or even a
1555	 * persistent VALE port.
1556	 */
1557	*ifp = ifunit_ref(hdr->nr_name);
1558	if (*ifp == NULL) {
1559		error = ENXIO;
1560		goto out;
1561	}
1562
1563	error = netmap_get_hw_na(*ifp, nmd, &ret);
1564	if (error)
1565		goto out;
1566
1567	*na = ret;
1568	netmap_adapter_get(ret);
1569
1570out:
1571	if (error) {
1572		if (ret)
1573			netmap_adapter_put(ret);
1574		if (*ifp) {
1575			if_rele(*ifp);
1576			*ifp = NULL;
1577		}
1578	}
1579	if (nmd_ref)
1580		netmap_mem_put(nmd);
1581
1582	return error;
1583}
1584
1585/* undo netmap_get_na() */
1586void
1587netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1588{
1589	if (ifp)
1590		if_rele(ifp);
1591	if (na)
1592		netmap_adapter_put(na);
1593}
1594
1595
1596#define NM_FAIL_ON(t) do {						\
1597	if (unlikely(t)) {						\
1598		RD(5, "%s: fail '" #t "' "				\
1599			"h %d c %d t %d "				\
1600			"rh %d rc %d rt %d "				\
1601			"hc %d ht %d",					\
1602			kring->name,					\
1603			head, cur, ring->tail,				\
1604			kring->rhead, kring->rcur, kring->rtail,	\
1605			kring->nr_hwcur, kring->nr_hwtail);		\
1606		return kring->nkr_num_slots;				\
1607	}								\
1608} while (0)
1609
1610/*
1611 * validate parameters on entry for *_txsync()
1612 * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1613 * in case of error.
1614 *
1615 * rhead, rcur and rtail=hwtail are stored from previous round.
1616 * hwcur is the next packet to send to the ring.
1617 *
1618 * We want
1619 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1620 *
1621 * hwcur, rhead, rtail and hwtail are reliable
1622 */
1623u_int
1624nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1625{
1626	u_int head = ring->head; /* read only once */
1627	u_int cur = ring->cur; /* read only once */
1628	u_int n = kring->nkr_num_slots;
1629
1630	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1631		kring->name,
1632		kring->nr_hwcur, kring->nr_hwtail,
1633		ring->head, ring->cur, ring->tail);
1634#if 1 /* kernel sanity checks; but we can trust the kring. */
1635	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1636	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1637#endif /* kernel sanity checks */
1638	/*
1639	 * user sanity checks. We only use head,
1640	 * A, B, ... are possible positions for head:
1641	 *
1642	 *  0    A  rhead   B  rtail   C  n-1
1643	 *  0    D  rtail   E  rhead   F  n-1
1644	 *
1645	 * B, F, D are valid. A, C, E are wrong
1646	 */
1647	if (kring->rtail >= kring->rhead) {
1648		/* want rhead <= head <= rtail */
1649		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1650		/* and also head <= cur <= rtail */
1651		NM_FAIL_ON(cur < head || cur > kring->rtail);
1652	} else { /* here rtail < rhead */
1653		/* we need head outside rtail .. rhead */
1654		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1655
1656		/* two cases now: head <= rtail or head >= rhead  */
1657		if (head <= kring->rtail) {
1658			/* want head <= cur <= rtail */
1659			NM_FAIL_ON(cur < head || cur > kring->rtail);
1660		} else { /* head >= rhead */
1661			/* cur must be outside rtail..head */
1662			NM_FAIL_ON(cur > kring->rtail && cur < head);
1663		}
1664	}
1665	if (ring->tail != kring->rtail) {
1666		RD(5, "%s tail overwritten was %d need %d", kring->name,
1667			ring->tail, kring->rtail);
1668		ring->tail = kring->rtail;
1669	}
1670	kring->rhead = head;
1671	kring->rcur = cur;
1672	return head;
1673}
1674
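/*
 * Illustrative sketch (assumptions: "fd" is an open /dev/netmap
 * descriptor already bound to a port, "nifp" is its netmap_if) of the
 * userspace side of the protocol that nm_txsync_prologue() validates.
 * The application only advances ring->head and ring->cur, never
 * ring->tail, so that hwcur <= head <= cur <= tail holds on entry;
 * the final NIOCTXSYNC (or a poll()) is what ends up calling
 * nm_txsync_prologue(). NETMAP_TXRING(), nm_ring_space(),
 * nm_ring_next() and NETMAP_BUF() come from net/netmap_user.h;
 * have_packets() and build_packet() are placeholders for application code.
 *
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);
 *
 *	while (nm_ring_space(txr) > 0 && have_packets()) {
 *		struct netmap_slot *slot = &txr->slot[txr->cur];
 *
 *		slot->len = build_packet(NETMAP_BUF(txr, slot->buf_idx));
 *		txr->head = txr->cur = nm_ring_next(txr, txr->cur);
 *	}
 *	ioctl(fd, NIOCTXSYNC, NULL);
 */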
1675
1676/*
1677 * validate parameters on entry for *_rxsync()
1678 * Returns ring->head if ok, kring->nkr_num_slots on error.
1679 *
1680 * For a valid configuration,
1681 * hwcur <= head <= cur <= tail <= hwtail
1682 *
1683 * We only consider head and cur.
1684 * hwcur and hwtail are reliable.
1685 *
1686 */
1687u_int
1688nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1689{
1690	uint32_t const n = kring->nkr_num_slots;
1691	uint32_t head, cur;
1692
1693	ND(5,"%s kc %d kt %d h %d c %d t %d",
1694		kring->name,
1695		kring->nr_hwcur, kring->nr_hwtail,
1696		ring->head, ring->cur, ring->tail);
1697	/*
1698	 * Before storing the new values, we should check they do not
1699	 * move backwards. However:
1700	 * - head is not an issue because the previous value is hwcur;
1701	 * - cur could in principle go back, however it does not matter
1702	 *   because we are processing a brand new rxsync()
1703	 */
1704	cur = kring->rcur = ring->cur;	/* read only once */
1705	head = kring->rhead = ring->head;	/* read only once */
1706#if 1 /* kernel sanity checks */
1707	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1708#endif /* kernel sanity checks */
1709	/* user sanity checks */
1710	if (kring->nr_hwtail >= kring->nr_hwcur) {
1711		/* want hwcur <= rhead <= hwtail */
1712		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1713		/* and also rhead <= rcur <= hwtail */
1714		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1715	} else {
1716		/* we need rhead outside hwtail..hwcur */
1717		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1718		/* two cases now: head <= hwtail or head >= hwcur  */
1719		if (head <= kring->nr_hwtail) {
1720			/* want head <= cur <= hwtail */
1721			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1722		} else {
1723			/* cur must be outside hwtail..head */
1724			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1725		}
1726	}
1727	if (ring->tail != kring->rtail) {
1728		RD(5, "%s tail overwritten was %d need %d",
1729			kring->name,
1730			ring->tail, kring->rtail);
1731		ring->tail = kring->rtail;
1732	}
1733	return head;
1734}
1735
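/*
 * RX counterpart of the sketch above (illustrative only): the
 * application consumes the slots between head and tail, advances
 * head/cur, and then issues NIOCRXSYNC (or poll()), which invokes
 * nm_rxsync_prologue(). process_packet() is a placeholder.
 *
 *	struct netmap_ring *rxr = NETMAP_RXRING(nifp, 0);
 *
 *	while (!nm_ring_empty(rxr)) {
 *		struct netmap_slot *slot = &rxr->slot[rxr->cur];
 *
 *		process_packet(NETMAP_BUF(rxr, slot->buf_idx), slot->len);
 *		rxr->head = rxr->cur = nm_ring_next(rxr, rxr->cur);
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);
 */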
1736
1737/*
1738 * Error routine called when txsync/rxsync detects an error.
1739 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1740 * Return 1 on reinit.
1741 *
1742 * This routine is only called by the upper half of the kernel.
1743 * It only reads hwcur (which is changed only by the upper half, too)
1744 * and hwtail (which may be changed by the lower half, but only on
1745 * a tx ring and only to increase it, so any error will be recovered
1746 * on the next call). For the above, we don't strictly need to call
1747 * it under lock.
1748 */
1749int
1750netmap_ring_reinit(struct netmap_kring *kring)
1751{
1752	struct netmap_ring *ring = kring->ring;
1753	u_int i, lim = kring->nkr_num_slots - 1;
1754	int errors = 0;
1755
1756	// XXX KASSERT nm_kr_tryget
1757	RD(10, "called for %s", kring->name);
1758	// XXX probably wrong to trust userspace
1759	kring->rhead = ring->head;
1760	kring->rcur  = ring->cur;
1761	kring->rtail = ring->tail;
1762
1763	if (ring->cur > lim)
1764		errors++;
1765	if (ring->head > lim)
1766		errors++;
1767	if (ring->tail > lim)
1768		errors++;
1769	for (i = 0; i <= lim; i++) {
1770		u_int idx = ring->slot[i].buf_idx;
1771		u_int len = ring->slot[i].len;
1772		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1773			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1774			ring->slot[i].buf_idx = 0;
1775			ring->slot[i].len = 0;
1776		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1777			ring->slot[i].len = 0;
1778			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1779		}
1780	}
1781	if (errors) {
1782		RD(10, "total %d errors", errors);
1783		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1784			kring->name,
1785			ring->cur, kring->nr_hwcur,
1786			ring->tail, kring->nr_hwtail);
1787		ring->head = kring->rhead = kring->nr_hwcur;
1788		ring->cur  = kring->rcur  = kring->nr_hwcur;
1789		ring->tail = kring->rtail = kring->nr_hwtail;
1790	}
1791	return (errors ? 1 : 0);
1792}
1793
1794/* interpret the ringid and flags fields of an nmreq, by translating them
1795 * into a pair of intervals of ring indices:
1796 *
1797 * [priv->np_txqfirst, priv->np_txqlast) and
1798 * [priv->np_rxqfirst, priv->np_rxqlast)
1799 *
1800 */
1801int
1802netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1803			uint16_t nr_ringid, uint64_t nr_flags)
1804{
1805	struct netmap_adapter *na = priv->np_na;
1806	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1807	enum txrx t;
1808	u_int j;
1809
1810	for_rx_tx(t) {
1811		if (nr_flags & excluded_direction[t]) {
1812			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1813			continue;
1814		}
1815		switch (nr_mode) {
1816		case NR_REG_ALL_NIC:
1817		case NR_REG_NULL:
1818			priv->np_qfirst[t] = 0;
1819			priv->np_qlast[t] = nma_get_nrings(na, t);
1820			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1821				priv->np_qfirst[t], priv->np_qlast[t]);
1822			break;
1823		case NR_REG_SW:
1824		case NR_REG_NIC_SW:
1825			if (!(na->na_flags & NAF_HOST_RINGS)) {
1826				nm_prerr("host rings not supported");
1827				return EINVAL;
1828			}
1829			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1830				nma_get_nrings(na, t) : 0);
1831			priv->np_qlast[t] = netmap_all_rings(na, t);
1832			ND("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1833				nm_txrx2str(t),
1834				priv->np_qfirst[t], priv->np_qlast[t]);
1835			break;
1836		case NR_REG_ONE_NIC:
1837			if (nr_ringid >= na->num_tx_rings &&
1838					nr_ringid >= na->num_rx_rings) {
1839				nm_prerr("invalid ring id %d", nr_ringid);
1840				return EINVAL;
1841			}
1842			/* if not enough rings, use the first one */
1843			j = nr_ringid;
1844			if (j >= nma_get_nrings(na, t))
1845				j = 0;
1846			priv->np_qfirst[t] = j;
1847			priv->np_qlast[t] = j + 1;
1848			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
1849				priv->np_qfirst[t], priv->np_qlast[t]);
1850			break;
1851		default:
1852			nm_prerr("invalid regif type %d", nr_mode);
1853			return EINVAL;
1854		}
1855	}
1856	priv->np_flags = nr_flags;
1857
1858	/* Allow transparent forwarding mode in the host --> nic
1859	 * direction only if all the TX hw rings have been opened. */
1860	if (priv->np_qfirst[NR_TX] == 0 &&
1861			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1862		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1863	}
1864
1865	if (netmap_verbose) {
1866		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1867			na->name,
1868			priv->np_qfirst[NR_TX],
1869			priv->np_qlast[NR_TX],
1870			priv->np_qfirst[NR_RX],
1871			priv->np_qlast[NR_RX],
1872			nr_ringid);
1873	}
1874	return 0;
1875}
1876
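/*
 * Worked example of the mapping above (assuming an adapter with 4 TX
 * and 4 RX hardware rings plus one host ring per direction):
 *
 *	nr_mode          nr_ringid   tx interval   rx interval
 *	NR_REG_ALL_NIC       -          [0,4)         [0,4)
 *	NR_REG_ONE_NIC       2          [2,3)         [2,3)
 *	NR_REG_SW            -          [4,5)         [4,5)
 *	NR_REG_NIC_SW        -          [0,5)         [0,5)
 */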
1877
1878/*
1879 * Set the ring ID. For devices with a single queue, a request
1880 * for all rings is the same as a single ring.
1881 */
1882static int
1883netmap_set_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1884		uint16_t nr_ringid, uint64_t nr_flags)
1885{
1886	struct netmap_adapter *na = priv->np_na;
1887	int error;
1888	enum txrx t;
1889
1890	error = netmap_interp_ringid(priv, nr_mode, nr_ringid, nr_flags);
1891	if (error) {
1892		return error;
1893	}
1894
1895	priv->np_txpoll = (nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1896
1897	/* optimization: count the users registered for more than
1898	 * one ring, which are the ones sleeping on the global queue.
1899	 * The default netmap_notify() callback will then
1900	 * avoid signaling the global queue if nobody is using it
1901	 */
1902	for_rx_tx(t) {
1903		if (nm_si_user(priv, t))
1904			na->si_users[t]++;
1905	}
1906	return 0;
1907}
1908
1909static void
1910netmap_unset_ringid(struct netmap_priv_d *priv)
1911{
1912	struct netmap_adapter *na = priv->np_na;
1913	enum txrx t;
1914
1915	for_rx_tx(t) {
1916		if (nm_si_user(priv, t))
1917			na->si_users[t]--;
1918		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1919	}
1920	priv->np_flags = 0;
1921	priv->np_txpoll = 0;
1922	priv->np_kloop_state = 0;
1923}
1924
1925
1926/* Set the nr_pending_mode for the requested rings.
1927 * If requested, also try to get exclusive access to the rings, provided
1928 * the rings we want to bind are not exclusively owned by a previous bind.
1929 */
1930static int
1931netmap_krings_get(struct netmap_priv_d *priv)
1932{
1933	struct netmap_adapter *na = priv->np_na;
1934	u_int i;
1935	struct netmap_kring *kring;
1936	int excl = (priv->np_flags & NR_EXCLUSIVE);
1937	enum txrx t;
1938
1939	if (netmap_debug & NM_DEBUG_ON)
1940		nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
1941			na->name,
1942			priv->np_qfirst[NR_TX],
1943			priv->np_qlast[NR_TX],
1944			priv->np_qfirst[NR_RX],
1945			priv->np_qlast[NR_RX]);
1946
1947	/* first round: check that all the requested rings
1948	 * are neither already exclusively owned, nor requested
1949	 * in exclusive mode while they are already in use
1950	 */
1951	for_rx_tx(t) {
1952		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1953			kring = NMR(na, t)[i];
1954			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1955			    (kring->users && excl))
1956			{
1957				ND("ring %s busy", kring->name);
1958				return EBUSY;
1959			}
1960		}
1961	}
1962
1963	/* second round: increment usage count (possibly marking them
1964	 * as exclusive) and set the nr_pending_mode
1965	 */
1966	for_rx_tx(t) {
1967		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1968			kring = NMR(na, t)[i];
1969			kring->users++;
1970			if (excl)
1971				kring->nr_kflags |= NKR_EXCLUSIVE;
1972			kring->nr_pending_mode = NKR_NETMAP_ON;
1973		}
1974	}
1975
1976	return 0;
1977
1978}
1979
1980/* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1981 * if it was requested at regif time, and by unsetting nr_pending_mode
1982 * if we are the last users of the involved rings. */
1983static void
1984netmap_krings_put(struct netmap_priv_d *priv)
1985{
1986	struct netmap_adapter *na = priv->np_na;
1987	u_int i;
1988	struct netmap_kring *kring;
1989	int excl = (priv->np_flags & NR_EXCLUSIVE);
1990	enum txrx t;
1991
1992	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1993			na->name,
1994			priv->np_qfirst[NR_TX],
1995			priv->np_qlast[NR_TX],
1996			priv->np_qfirst[NR_RX],
1997			priv->np_qlast[NR_RX]);
1998
1999	for_rx_tx(t) {
2000		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2001			kring = NMR(na, t)[i];
2002			if (excl)
2003				kring->nr_kflags &= ~NKR_EXCLUSIVE;
2004			kring->users--;
2005			if (kring->users == 0)
2006				kring->nr_pending_mode = NKR_NETMAP_OFF;
2007		}
2008	}
2009}
2010
2011static int
2012nm_priv_rx_enabled(struct netmap_priv_d *priv)
2013{
2014	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2015}
2016
2017/* Validate the CSB entries for both directions (atok and ktoa).
2018 * To be called under NMG_LOCK(). */
2019static int
2020netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2021{
2022	struct nm_csb_atok *csb_atok_base =
2023		(struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2024	struct nm_csb_ktoa *csb_ktoa_base =
2025		(struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2026	enum txrx t;
2027	int num_rings[NR_TXRX], tot_rings;
2028	size_t entry_size[2];
2029	void *csb_start[2];
2030	int i;
2031
2032	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2033		nm_prerr("Cannot update CSB while kloop is running");
2034		return EBUSY;
2035	}
2036
2037	tot_rings = 0;
2038	for_rx_tx(t) {
2039		num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2040		tot_rings += num_rings[t];
2041	}
2042	if (tot_rings <= 0)
2043		return 0;
2044
2045	if (!(priv->np_flags & NR_EXCLUSIVE)) {
2046		nm_prerr("CSB mode requires NR_EXCLUSIVE");
2047		return EINVAL;
2048	}
2049
2050	entry_size[0] = sizeof(*csb_atok_base);
2051	entry_size[1] = sizeof(*csb_ktoa_base);
2052	csb_start[0] = (void *)csb_atok_base;
2053	csb_start[1] = (void *)csb_ktoa_base;
2054
2055	for (i = 0; i < 2; i++) {
2056		/* On Linux we could use access_ok() to simplify
2057		 * the validation. However, the advantage of
2058		 * this approach is that it also works on
2059		 * FreeBSD. */
2060		size_t csb_size = tot_rings * entry_size[i];
2061		void *tmp;
2062		int err;
2063
2064		if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2065			nm_prerr("Unaligned CSB address");
2066			return EINVAL;
2067		}
2068
2069		tmp = nm_os_malloc(csb_size);
2070		if (!tmp)
2071			return ENOMEM;
2072		if (i == 0) {
2073			/* Application --> kernel direction. */
2074			err = copyin(csb_start[i], tmp, csb_size);
2075		} else {
2076			/* Kernel --> application direction. */
2077			memset(tmp, 0, csb_size);
2078			err = copyout(tmp, csb_start[i], csb_size);
2079		}
2080		nm_os_free(tmp);
2081		if (err) {
2082			nm_prerr("Invalid CSB address");
2083			return err;
2084		}
2085	}
2086
2087	priv->np_csb_atok_base = csb_atok_base;
2088	priv->np_csb_ktoa_base = csb_ktoa_base;
2089
2090	/* Initialize the CSB. */
2091	for_rx_tx(t) {
2092		for (i = 0; i < num_rings[t]; i++) {
2093			struct netmap_kring *kring =
2094				NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2095			struct nm_csb_atok *csb_atok = csb_atok_base + i;
2096			struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2097
2098			if (t == NR_RX) {
2099				csb_atok += num_rings[NR_TX];
2100				csb_ktoa += num_rings[NR_TX];
2101			}
2102
2103			CSB_WRITE(csb_atok, head, kring->rhead);
2104			CSB_WRITE(csb_atok, cur, kring->rcur);
2105			CSB_WRITE(csb_atok, appl_need_kick, 1);
2106			CSB_WRITE(csb_atok, sync_flags, 1);
2107			CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2108			CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2109			CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2110
2111			nm_prinf("csb_init for kring %s: head %u, cur %u, "
2112				"hwcur %u, hwtail %u", kring->name,
2113				kring->rhead, kring->rcur, kring->nr_hwcur,
2114				kring->nr_hwtail);
2115		}
2116	}
2117
2118	return 0;
2119}
2120
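/*
 * Illustrative sketch of the userspace side of the checks above: the
 * application allocates one nm_csb_atok and one nm_csb_ktoa entry per
 * bound ring (TX entries first, then RX), aligned to the entry size,
 * and passes the two base pointers through a NETMAP_REQ_OPT_CSB option
 * linked into the nr_options list of the nmreq_header ("hdr") used for
 * the NETMAP_REQ_REGISTER request. Error handling and the rest of the
 * registration are omitted; "nrings" is the total number of bound rings.
 *
 *	struct nm_csb_atok *atok;
 *	struct nm_csb_ktoa *ktoa;
 *	struct nmreq_opt_csb opt;
 *
 *	posix_memalign((void **)&atok, sizeof(*atok), nrings * sizeof(*atok));
 *	posix_memalign((void **)&ktoa, sizeof(*ktoa), nrings * sizeof(*ktoa));
 *	memset(atok, 0, nrings * sizeof(*atok));
 *	memset(ktoa, 0, nrings * sizeof(*ktoa));
 *	memset(&opt, 0, sizeof(opt));
 *	opt.nro_opt.nro_reqtype = NETMAP_REQ_OPT_CSB;
 *	opt.csb_atok = (uintptr_t)atok;
 *	opt.csb_ktoa = (uintptr_t)ktoa;
 *	opt.nro_opt.nro_next = hdr.nr_options;
 *	hdr.nr_options = (uintptr_t)&opt.nro_opt;
 */
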
2121/* Ensure that the netmap adapter can support the given MTU.
2122 * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2123 */
2124int
2125netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
2126	unsigned nbs = NETMAP_BUF_SIZE(na);
2127
2128	if (mtu <= na->rx_buf_maxsize) {
2129		/* The MTU fits a single NIC slot. We only
2130		 * need to check that netmap buffers are
2131		 * large enough to hold an MTU. NS_MOREFRAG
2132		 * cannot be used in this case. */
2133		if (nbs < mtu) {
2134			nm_prerr("error: netmap buf size (%u) "
2135				 "< device MTU (%u)", nbs, mtu);
2136			return EINVAL;
2137		}
2138	} else {
2139		/* More NIC slots may be needed to receive
2140		 * or transmit a single packet. Check that
2141		 * the adapter supports NS_MOREFRAG and that
2142		 * netmap buffers are large enough to hold
2143		 * the maximum per-slot size. */
2144		if (!(na->na_flags & NAF_MOREFRAG)) {
2145			nm_prerr("error: large MTU (%d) needed "
2146				 "but %s does not support "
2147				 "NS_MOREFRAG", mtu,
2148				 na->ifp->if_xname);
2149			return EINVAL;
2150		} else if (nbs < na->rx_buf_maxsize) {
2151			nm_prerr("error: using NS_MOREFRAG on "
2152				 "%s requires netmap buf size "
2153				 ">= %u", na->ifp->if_xname,
2154				 na->rx_buf_maxsize);
2155			return EINVAL;
2156		} else {
2157			nm_prinf("info: netmap application on "
2158				 "%s needs to support "
2159				 "NS_MOREFRAG "
2160				 "(MTU=%u,netmap_buf_size=%u)",
2161				 na->ifp->if_xname, mtu, nbs);
2162		}
2163	}
2164	return 0;
2165}
2166
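/*
 * Worked example of the checks above: with rx_buf_maxsize = 2048 and a
 * 2048-byte netmap buffer, an MTU of 1500 is accepted outright, while
 * an MTU of 9000 is only accepted if the adapter advertises
 * NAF_MOREFRAG, in which case the application must be prepared to
 * reassemble multi-slot (NS_MOREFRAG) frames.
 */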
2167
2168/*
2169 * possibly move the interface to netmap-mode.
2170 * On success it returns a pointer to the netmap_if, otherwise NULL.
2171 * This must be called with NMG_LOCK held.
2172 *
2173 * The following na callbacks are called in the process:
2174 *
2175 * na->nm_config()			[by netmap_update_config]
2176 * (get current number and size of rings)
2177 *
2178 *  	We have a generic one for linux (netmap_linux_config).
2179 *  	The bwrap has to override this, since it has to forward
2180 *  	the request to the wrapped adapter (netmap_bwrap_config).
2181 *
2182 *
2183 * na->nm_krings_create()
2184 * (create and init the krings array)
2185 *
2186 * 	One of the following:
2187 *
2188 *	* netmap_hw_krings_create, 			(hw ports)
2189 *		creates the standard layout for the krings
2190 * 		and adds the mbq (used for the host rings).
2191 *
2192 * 	* netmap_vp_krings_create			(VALE ports)
2193 * 		add leases and scratchpads
2194 *
2195 * 	* netmap_pipe_krings_create			(pipes)
2196 * 		create the krings and rings of both ends and
2197 * 		cross-link them
2198 *
2199 *      * netmap_monitor_krings_create 			(monitors)
2200 *      	avoid allocating the mbq
2201 *
2202 *      * netmap_bwrap_krings_create			(bwraps)
2203 *      	create both the bwrap krings array,
2204 *      	the krings array of the wrapped adapter, and
2205 *      	(if needed) the fake array for the host adapter
2206 *
2207 * na->nm_register(, 1)
2208 * (put the adapter in netmap mode)
2209 *
2210 * 	This may be one of the following:
2211 *
2212 * 	* netmap_hw_reg				        (hw ports)
2213 * 		checks that the ifp is still there, then calls
2214 * 		the hardware specific callback;
2215 *
2216 * 	* netmap_vp_reg					(VALE ports)
2217 *		If the port is connected to a bridge,
2218 *		set the NAF_NETMAP_ON flag under the
2219 *		bridge write lock.
2220 *
2221 *	* netmap_pipe_reg				(pipes)
2222 *		inform the other pipe end that it is no
2223 *		longer responsible for the lifetime of this
2224 *		pipe end
2225 *
2226 *	* netmap_monitor_reg				(monitors)
2227 *		intercept the sync callbacks of the monitored
2228 *		rings
2229 *
2230 *	* netmap_bwrap_reg				(bwraps)
2231 *		cross-link the bwrap and hwna rings,
2232 *		forward the request to the hwna, override
2233 *		the hwna notify callback (to get the frames
2234 *		coming from outside go through the bridge).
2235 *
2236 *
2237 */
2238int
2239netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2240	uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags)
2241{
2242	struct netmap_if *nifp = NULL;
2243	int error;
2244
2245	NMG_LOCK_ASSERT();
2246	priv->np_na = na;     /* store the reference */
2247	error = netmap_mem_finalize(na->nm_mem, na);
2248	if (error)
2249		goto err;
2250
2251	if (na->active_fds == 0) {
2252
2253		/* cache the allocator info in the na */
2254		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2255		if (error)
2256			goto err_drop_mem;
2257		ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2258					    na->na_lut.objsize);
2259
2260		/* ring configuration may have changed, fetch from the card */
2261		netmap_update_config(na);
2262	}
2263
2264	/* compute the range of tx and rx rings to monitor */
2265	error = netmap_set_ringid(priv, nr_mode, nr_ringid, nr_flags);
2266	if (error)
2267		goto err_put_lut;
2268
2269	if (na->active_fds == 0) {
2270		/*
2271		 * If this is the first registration of the adapter,
2272		 * perform sanity checks and create the in-kernel view
2273		 * of the netmap rings (the netmap krings).
2274		 */
2275		if (na->ifp && nm_priv_rx_enabled(priv)) {
2276			/* This netmap adapter is attached to an ifnet. */
2277			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2278
2279			ND("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2280				na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2281
2282			if (na->rx_buf_maxsize == 0) {
2283				nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2284				error = EIO;
2285				goto err_drop_mem;
2286			}
2287
2288			error = netmap_buf_size_validate(na, mtu);
2289			if (error)
2290				goto err_drop_mem;
2291		}
2292
2293		/*
2294		 * Depending on the adapter, this may also create
2295		 * the netmap rings themselves
2296		 */
2297		error = na->nm_krings_create(na);
2298		if (error)
2299			goto err_put_lut;
2300
2301	}
2302
2303	/* now the krings must exist and we can check whether some
2304	 * previous bind has exclusive ownership on them, and set
2305	 * nr_pending_mode
2306	 */
2307	error = netmap_krings_get(priv);
2308	if (error)
2309		goto err_del_krings;
2310
2311	/* create all needed missing netmap rings */
2312	error = netmap_mem_rings_create(na);
2313	if (error)
2314		goto err_rel_excl;
2315
2316	/* in all cases, create a new netmap_if */
2317	nifp = netmap_mem_if_new(na, priv);
2318	if (nifp == NULL) {
2319		error = ENOMEM;
2320		goto err_rel_excl;
2321	}
2322
2323	if (nm_kring_pending(priv)) {
2324		/* Some kring is switching mode, tell the adapter to
2325		 * react to this. */
2326		error = na->nm_register(na, 1);
2327		if (error)
2328			goto err_del_if;
2329	}
2330
2331	/* Commit the reference. */
2332	na->active_fds++;
2333
2334	/*
2335	 * advertise that the interface is ready by setting np_nifp.
2336	 * The barrier is needed because readers (poll, *SYNC and mmap)
2337	 * check for priv->np_nifp != NULL without locking
2338	 */
2339	mb(); /* make sure previous writes are visible to all CPUs */
2340	priv->np_nifp = nifp;
2341
2342	return 0;
2343
2344err_del_if:
2345	netmap_mem_if_delete(na, nifp);
2346err_rel_excl:
2347	netmap_krings_put(priv);
2348	netmap_mem_rings_delete(na);
2349err_del_krings:
2350	if (na->active_fds == 0)
2351		na->nm_krings_delete(na);
2352err_put_lut:
2353	if (na->active_fds == 0)
2354		memset(&na->na_lut, 0, sizeof(na->na_lut));
2355err_drop_mem:
2356	netmap_mem_drop(na);
2357err:
2358	priv->np_na = NULL;
2359	return error;
2360}
2361
2362
2363/*
2364 * update kring and ring at the end of rxsync/txsync.
2365 */
2366static inline void
2367nm_sync_finalize(struct netmap_kring *kring)
2368{
2369	/*
2370	 * Update ring tail to what the kernel knows
2371	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2372	 * if no carrier.
2373	 */
2374	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2375
2376	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2377		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2378		kring->rhead, kring->rcur, kring->rtail);
2379}
2380
2381/* set ring timestamp */
2382static inline void
2383ring_timestamp_set(struct netmap_ring *ring)
2384{
2385	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2386		microtime(&ring->ts);
2387	}
2388}
2389
2390static int nmreq_copyin(struct nmreq_header *, int);
2391static int nmreq_copyout(struct nmreq_header *, int);
2392static int nmreq_checkoptions(struct nmreq_header *);
2393
2394/*
2395 * ioctl(2) support for the "netmap" device.
2396 *
2397 * The following commands are accepted:
2398 * - NIOCCTRL		device control API
2399 * - NIOCTXSYNC		sync TX rings
2400 * - NIOCRXSYNC		sync RX rings
2401 * - SIOCGIFADDR	just for convenience
2402 * - NIOCGINFO		deprecated (legacy API)
2403 * - NIOCREGIF		deprecated (legacy API)
2404 *
2405 * Return 0 on success, errno otherwise.
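 *
 * Illustrative userspace sketch of a NIOCCTRL/NETMAP_REQ_REGISTER
 * request (error handling omitted; "em0" is just an example port name
 * and "fd" comes from open("/dev/netmap", O_RDWR)):
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_register req;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&req, 0, sizeof(req));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_REGISTER;
 *	strlcpy(hdr.nr_name, "em0", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&req;
 *	req.nr_mode = NR_REG_ALL_NIC;
 *	ioctl(fd, NIOCCTRL, &hdr);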
2406 */
2407int
2408netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2409		struct thread *td, int nr_body_is_user)
2410{
2411	struct mbq q;	/* packets from RX hw queues to host stack */
2412	struct netmap_adapter *na = NULL;
2413	struct netmap_mem_d *nmd = NULL;
2414	struct ifnet *ifp = NULL;
2415	int error = 0;
2416	u_int i, qfirst, qlast;
2417	struct netmap_kring **krings;
2418	int sync_flags;
2419	enum txrx t;
2420
2421	switch (cmd) {
2422	case NIOCCTRL: {
2423		struct nmreq_header *hdr = (struct nmreq_header *)data;
2424
2425		if (hdr->nr_version < NETMAP_MIN_API ||
2426		    hdr->nr_version > NETMAP_MAX_API) {
2427			nm_prerr("API mismatch: got %d need %d",
2428				hdr->nr_version, NETMAP_API);
2429			return EINVAL;
2430		}
2431
2432		/* Make a kernel-space copy of the user-space nr_body.
2433		 * For convenience, the nr_body pointer and the pointers
2434		 * in the options list will be replaced with their
2435		 * kernel-space counterparts. The original pointers are
2436		 * saved internally and later restored by nmreq_copyout
2437		 */
2438		error = nmreq_copyin(hdr, nr_body_is_user);
2439		if (error) {
2440			return error;
2441		}
2442
2443		/* Sanitize hdr->nr_name. */
2444		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2445
2446		switch (hdr->nr_reqtype) {
2447		case NETMAP_REQ_REGISTER: {
2448			struct nmreq_register *req =
2449				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2450			struct netmap_if *nifp;
2451
2452			/* Protect access to priv from concurrent requests. */
2453			NMG_LOCK();
2454			do {
2455				struct nmreq_option *opt;
2456				u_int memflags;
2457
2458				if (priv->np_nifp != NULL) {	/* thread already registered */
2459					error = EBUSY;
2460					break;
2461				}
2462
2463#ifdef WITH_EXTMEM
2464				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2465						NETMAP_REQ_OPT_EXTMEM);
2466				if (opt != NULL) {
2467					struct nmreq_opt_extmem *e =
2468						(struct nmreq_opt_extmem *)opt;
2469
2470					error = nmreq_checkduplicate(opt);
2471					if (error) {
2472						opt->nro_status = error;
2473						break;
2474					}
2475					nmd = netmap_mem_ext_create(e->nro_usrptr,
2476							&e->nro_info, &error);
2477					opt->nro_status = error;
2478					if (nmd == NULL)
2479						break;
2480				}
2481#endif /* WITH_EXTMEM */
2482
2483				if (nmd == NULL && req->nr_mem_id) {
2484					/* find the allocator and get a reference */
2485					nmd = netmap_mem_find(req->nr_mem_id);
2486					if (nmd == NULL) {
2487						if (netmap_verbose) {
2488							nm_prerr("%s: failed to find mem_id %u",
2489									hdr->nr_name, req->nr_mem_id);
2490						}
2491						error = EINVAL;
2492						break;
2493					}
2494				}
2495				/* find the interface and get a reference */
2496				error = netmap_get_na(hdr, &na, &ifp, nmd,
2497						      1 /* create */); /* keep reference */
2498				if (error)
2499					break;
2500				if (NETMAP_OWNED_BY_KERN(na)) {
2501					error = EBUSY;
2502					break;
2503				}
2504
2505				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2506					nm_prerr("virt_hdr_len=%d, but application does "
2507						"not accept it", na->virt_hdr_len);
2508					error = EIO;
2509					break;
2510				}
2511
2512				error = netmap_do_regif(priv, na, req->nr_mode,
2513							req->nr_ringid, req->nr_flags);
2514				if (error) {    /* reg. failed, release priv and ref */
2515					break;
2516				}
2517
2518				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2519							NETMAP_REQ_OPT_CSB);
2520				if (opt != NULL) {
2521					struct nmreq_opt_csb *csbo =
2522						(struct nmreq_opt_csb *)opt;
2523					error = nmreq_checkduplicate(opt);
2524					if (!error) {
2525						error = netmap_csb_validate(priv, csbo);
2526					}
2527					opt->nro_status = error;
2528					if (error) {
2529						netmap_do_unregif(priv);
2530						break;
2531					}
2532				}
2533
2534				nifp = priv->np_nifp;
2535
2536				/* return the offset of the netmap_if object */
2537				req->nr_rx_rings = na->num_rx_rings;
2538				req->nr_tx_rings = na->num_tx_rings;
2539				req->nr_rx_slots = na->num_rx_desc;
2540				req->nr_tx_slots = na->num_tx_desc;
2541				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2542					&req->nr_mem_id);
2543				if (error) {
2544					netmap_do_unregif(priv);
2545					break;
2546				}
2547				if (memflags & NETMAP_MEM_PRIVATE) {
2548					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2549				}
2550				for_rx_tx(t) {
2551					priv->np_si[t] = nm_si_user(priv, t) ?
2552						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2553				}
2554
2555				if (req->nr_extra_bufs) {
2556					if (netmap_verbose)
2557						nm_prinf("requested %d extra buffers",
2558							req->nr_extra_bufs);
2559					req->nr_extra_bufs = netmap_extra_alloc(na,
2560						&nifp->ni_bufs_head, req->nr_extra_bufs);
2561					if (netmap_verbose)
2562						nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2563				}
2564				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2565
2566				error = nmreq_checkoptions(hdr);
2567				if (error) {
2568					netmap_do_unregif(priv);
2569					break;
2570				}
2571
2572				/* store ifp reference so that priv destructor may release it */
2573				priv->np_ifp = ifp;
2574			} while (0);
2575			if (error) {
2576				netmap_unget_na(na, ifp);
2577			}
2578			/* release the reference from netmap_mem_find() or
2579			 * netmap_mem_ext_create()
2580			 */
2581			if (nmd)
2582				netmap_mem_put(nmd);
2583			NMG_UNLOCK();
2584			break;
2585		}
2586
2587		case NETMAP_REQ_PORT_INFO_GET: {
2588			struct nmreq_port_info_get *req =
2589				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2590
2591			NMG_LOCK();
2592			do {
2593				u_int memflags;
2594
2595				if (hdr->nr_name[0] != '\0') {
2596					/* Build a nmreq_register out of the nmreq_port_info_get,
2597					 * so that we can call netmap_get_na(). */
2598					struct nmreq_register regreq;
2599					bzero(&regreq, sizeof(regreq));
2600					regreq.nr_mode = NR_REG_ALL_NIC;
2601					regreq.nr_tx_slots = req->nr_tx_slots;
2602					regreq.nr_rx_slots = req->nr_rx_slots;
2603					regreq.nr_tx_rings = req->nr_tx_rings;
2604					regreq.nr_rx_rings = req->nr_rx_rings;
2605					regreq.nr_mem_id = req->nr_mem_id;
2606
2607					/* get a refcount */
2608					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2609					hdr->nr_body = (uintptr_t)&regreq;
2610					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2611					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2612					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2613					if (error) {
2614						na = NULL;
2615						ifp = NULL;
2616						break;
2617					}
2618					nmd = na->nm_mem; /* get memory allocator */
2619				} else {
2620					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2621					if (nmd == NULL) {
2622						if (netmap_verbose)
2623							nm_prerr("%s: failed to find mem_id %u",
2624									hdr->nr_name,
2625									req->nr_mem_id ? req->nr_mem_id : 1);
2626						error = EINVAL;
2627						break;
2628					}
2629				}
2630
2631				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2632					&req->nr_mem_id);
2633				if (error)
2634					break;
2635				if (na == NULL) /* only memory info */
2636					break;
2637				netmap_update_config(na);
2638				req->nr_rx_rings = na->num_rx_rings;
2639				req->nr_tx_rings = na->num_tx_rings;
2640				req->nr_rx_slots = na->num_rx_desc;
2641				req->nr_tx_slots = na->num_tx_desc;
2642			} while (0);
2643			netmap_unget_na(na, ifp);
2644			NMG_UNLOCK();
2645			break;
2646		}
2647#ifdef WITH_VALE
2648		case NETMAP_REQ_VALE_ATTACH: {
2649			error = netmap_vale_attach(hdr, NULL /* userspace request */);
2650			break;
2651		}
2652
2653		case NETMAP_REQ_VALE_DETACH: {
2654			error = netmap_vale_detach(hdr, NULL /* userspace request */);
2655			break;
2656		}
2657
2658		case NETMAP_REQ_VALE_LIST: {
2659			error = netmap_vale_list(hdr);
2660			break;
2661		}
2662
2663		case NETMAP_REQ_PORT_HDR_SET: {
2664			struct nmreq_port_hdr *req =
2665				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2666			/* Build a nmreq_register out of the nmreq_port_hdr,
2667			 * so that we can call netmap_get_vale_na(). */
2668			struct nmreq_register regreq;
2669			bzero(&regreq, sizeof(regreq));
2670			regreq.nr_mode = NR_REG_ALL_NIC;
2671
2672			/* For now we only support virtio-net headers, and only for
2673			 * VALE ports, but this may change in future. Valid lengths
2674			 * for the virtio-net header are 0 (no header), 10 and 12. */
2675			if (req->nr_hdr_len != 0 &&
2676				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2677					req->nr_hdr_len != 12) {
2678				if (netmap_verbose)
2679					nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
2680				error = EINVAL;
2681				break;
2682			}
2683			NMG_LOCK();
2684			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2685			hdr->nr_body = (uintptr_t)&regreq;
2686			error = netmap_get_vale_na(hdr, &na, NULL, 0);
2687			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2688			hdr->nr_body = (uintptr_t)req;
2689			if (na && !error) {
2690				struct netmap_vp_adapter *vpna =
2691					(struct netmap_vp_adapter *)na;
2692				na->virt_hdr_len = req->nr_hdr_len;
2693				if (na->virt_hdr_len) {
2694					vpna->mfs = NETMAP_BUF_SIZE(na);
2695				}
2696				if (netmap_verbose)
2697					nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2698				netmap_adapter_put(na);
2699			} else if (!na) {
2700				error = ENXIO;
2701			}
2702			NMG_UNLOCK();
2703			break;
2704		}
2705
2706		case NETMAP_REQ_PORT_HDR_GET: {
2707			/* Get vnet-header length for this netmap port */
2708			struct nmreq_port_hdr *req =
2709				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2710			/* Build a nmreq_register out of the nmreq_port_hdr,
2711			 * so that we can call netmap_get_na(). */
2712			struct nmreq_register regreq;
2713			struct ifnet *ifp;
2714
2715			bzero(&regreq, sizeof(regreq));
2716			regreq.nr_mode = NR_REG_ALL_NIC;
2717			NMG_LOCK();
2718			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2719			hdr->nr_body = (uintptr_t)&regreq;
2720			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2721			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2722			hdr->nr_body = (uintptr_t)req;
2723			if (na && !error) {
2724				req->nr_hdr_len = na->virt_hdr_len;
2725			}
2726			netmap_unget_na(na, ifp);
2727			NMG_UNLOCK();
2728			break;
2729		}
2730
2731		case NETMAP_REQ_VALE_NEWIF: {
2732			error = nm_vi_create(hdr);
2733			break;
2734		}
2735
2736		case NETMAP_REQ_VALE_DELIF: {
2737			error = nm_vi_destroy(hdr->nr_name);
2738			break;
2739		}
2740
2741		case NETMAP_REQ_VALE_POLLING_ENABLE:
2742		case NETMAP_REQ_VALE_POLLING_DISABLE: {
2743			error = nm_bdg_polling(hdr);
2744			break;
2745		}
2746#endif  /* WITH_VALE */
2747		case NETMAP_REQ_POOLS_INFO_GET: {
2748			/* Get information from the memory allocator used for
2749			 * hdr->nr_name. */
2750			struct nmreq_pools_info *req =
2751				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2752			NMG_LOCK();
2753			do {
2754				/* Build a nmreq_register out of the nmreq_pools_info,
2755				 * so that we can call netmap_get_na(). */
2756				struct nmreq_register regreq;
2757				bzero(&regreq, sizeof(regreq));
2758				regreq.nr_mem_id = req->nr_mem_id;
2759				regreq.nr_mode = NR_REG_ALL_NIC;
2760
2761				hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2762				hdr->nr_body = (uintptr_t)&regreq;
2763				error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2764				hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
2765				hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2766				if (error) {
2767					na = NULL;
2768					ifp = NULL;
2769					break;
2770				}
2771				nmd = na->nm_mem; /* grab the memory allocator */
2772				if (nmd == NULL) {
2773					error = EINVAL;
2774					break;
2775				}
2776
2777				/* Finalize the memory allocator, get the pools
2778				 * information and release the allocator. */
2779				error = netmap_mem_finalize(nmd, na);
2780				if (error) {
2781					break;
2782				}
2783				error = netmap_mem_pools_info_get(req, nmd);
2784				netmap_mem_drop(na);
2785			} while (0);
2786			netmap_unget_na(na, ifp);
2787			NMG_UNLOCK();
2788			break;
2789		}
2790
2791		case NETMAP_REQ_CSB_ENABLE: {
2792			struct nmreq_option *opt;
2793
2794			opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2795						NETMAP_REQ_OPT_CSB);
2796			if (opt == NULL) {
2797				error = EINVAL;
2798			} else {
2799				struct nmreq_opt_csb *csbo =
2800					(struct nmreq_opt_csb *)opt;
2801				error = nmreq_checkduplicate(opt);
2802				if (!error) {
2803					NMG_LOCK();
2804					error = netmap_csb_validate(priv, csbo);
2805					NMG_UNLOCK();
2806				}
2807				opt->nro_status = error;
2808			}
2809			break;
2810		}
2811
2812		case NETMAP_REQ_SYNC_KLOOP_START: {
2813			error = netmap_sync_kloop(priv, hdr);
2814			break;
2815		}
2816
2817		case NETMAP_REQ_SYNC_KLOOP_STOP: {
2818			error = netmap_sync_kloop_stop(priv);
2819			break;
2820		}
2821
2822		default: {
2823			error = EINVAL;
2824			break;
2825		}
2826		}
2827		/* Write back request body to userspace and reset the
2828		 * user-space pointer. */
2829		error = nmreq_copyout(hdr, error);
2830		break;
2831	}
2832
2833	case NIOCTXSYNC:
2834	case NIOCRXSYNC: {
2835		if (unlikely(priv->np_nifp == NULL)) {
2836			error = ENXIO;
2837			break;
2838		}
2839		mb(); /* make sure following reads are not from cache */
2840
2841		if (unlikely(priv->np_csb_atok_base)) {
2842			nm_prerr("Invalid sync in CSB mode");
2843			error = EBUSY;
2844			break;
2845		}
2846
2847		na = priv->np_na;      /* we have a reference */
2848
2849		mbq_init(&q);
2850		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2851		krings = NMR(na, t);
2852		qfirst = priv->np_qfirst[t];
2853		qlast = priv->np_qlast[t];
2854		sync_flags = priv->np_sync_flags;
2855
2856		for (i = qfirst; i < qlast; i++) {
2857			struct netmap_kring *kring = krings[i];
2858			struct netmap_ring *ring = kring->ring;
2859
2860			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2861				error = (error ? EIO : 0);
2862				continue;
2863			}
2864
2865			if (cmd == NIOCTXSYNC) {
2866				if (netmap_debug & NM_DEBUG_TXSYNC)
2867					nm_prinf("pre txsync ring %d cur %d hwcur %d",
2868					    i, ring->cur,
2869					    kring->nr_hwcur);
2870				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2871					netmap_ring_reinit(kring);
2872				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2873					nm_sync_finalize(kring);
2874				}
2875				if (netmap_debug & NM_DEBUG_TXSYNC)
2876					nm_prinf("post txsync ring %d cur %d hwcur %d",
2877					    i, ring->cur,
2878					    kring->nr_hwcur);
2879			} else {
2880				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2881					netmap_ring_reinit(kring);
2882				}
2883				if (nm_may_forward_up(kring)) {
2884					/* transparent forwarding, see netmap_poll() */
2885					netmap_grab_packets(kring, &q, netmap_fwd);
2886				}
2887				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2888					nm_sync_finalize(kring);
2889				}
2890				ring_timestamp_set(ring);
2891			}
2892			nm_kr_put(kring);
2893		}
2894
2895		if (mbq_peek(&q)) {
2896			netmap_send_up(na->ifp, &q);
2897		}
2898
2899		break;
2900	}
2901
2902	default: {
2903		return netmap_ioctl_legacy(priv, cmd, data, td);
2904		break;
2905	}
2906	}
2907
2908	return (error);
2909}
2910
2911size_t
2912nmreq_size_by_type(uint16_t nr_reqtype)
2913{
2914	switch (nr_reqtype) {
2915	case NETMAP_REQ_REGISTER:
2916		return sizeof(struct nmreq_register);
2917	case NETMAP_REQ_PORT_INFO_GET:
2918		return sizeof(struct nmreq_port_info_get);
2919	case NETMAP_REQ_VALE_ATTACH:
2920		return sizeof(struct nmreq_vale_attach);
2921	case NETMAP_REQ_VALE_DETACH:
2922		return sizeof(struct nmreq_vale_detach);
2923	case NETMAP_REQ_VALE_LIST:
2924		return sizeof(struct nmreq_vale_list);
2925	case NETMAP_REQ_PORT_HDR_SET:
2926	case NETMAP_REQ_PORT_HDR_GET:
2927		return sizeof(struct nmreq_port_hdr);
2928	case NETMAP_REQ_VALE_NEWIF:
2929		return sizeof(struct nmreq_vale_newif);
2930	case NETMAP_REQ_VALE_DELIF:
2931	case NETMAP_REQ_SYNC_KLOOP_STOP:
2932	case NETMAP_REQ_CSB_ENABLE:
2933		return 0;
2934	case NETMAP_REQ_VALE_POLLING_ENABLE:
2935	case NETMAP_REQ_VALE_POLLING_DISABLE:
2936		return sizeof(struct nmreq_vale_polling);
2937	case NETMAP_REQ_POOLS_INFO_GET:
2938		return sizeof(struct nmreq_pools_info);
2939	case NETMAP_REQ_SYNC_KLOOP_START:
2940		return sizeof(struct nmreq_sync_kloop_start);
2941	}
2942	return 0;
2943}
2944
2945static size_t
2946nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
2947{
2948	size_t rv = sizeof(struct nmreq_option);
2949#ifdef NETMAP_REQ_OPT_DEBUG
2950	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
2951		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
2952#endif /* NETMAP_REQ_OPT_DEBUG */
2953	switch (nro_reqtype) {
2954#ifdef WITH_EXTMEM
2955	case NETMAP_REQ_OPT_EXTMEM:
2956		rv = sizeof(struct nmreq_opt_extmem);
2957		break;
2958#endif /* WITH_EXTMEM */
2959	case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
2960		if (nro_size >= rv)
2961			rv = nro_size;
2962		break;
2963	case NETMAP_REQ_OPT_CSB:
2964		rv = sizeof(struct nmreq_opt_csb);
2965		break;
2966	}
2967	/* subtract the common header */
2968	return rv - sizeof(struct nmreq_option);
2969}
2970
2971int
2972nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
2973{
2974	size_t rqsz, optsz, bufsz;
2975	int error;
2976	char *ker = NULL, *p;
2977	struct nmreq_option **next, *src;
2978	struct nmreq_option buf;
2979	uint64_t *ptrs;
2980
2981	if (hdr->nr_reserved) {
2982		if (netmap_verbose)
2983			nm_prerr("nr_reserved must be zero");
2984		return EINVAL;
2985	}
2986
2987	if (!nr_body_is_user)
2988		return 0;
2989
2990	hdr->nr_reserved = nr_body_is_user;
2991
2992	/* compute the total size of the buffer */
2993	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
2994	if (rqsz > NETMAP_REQ_MAXSIZE) {
2995		error = EMSGSIZE;
2996		goto out_err;
2997	}
2998	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
2999		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3000		/* Request body expected, but not found; or
3001		 * request body found but unexpected. */
3002		if (netmap_verbose)
3003			nm_prerr("nr_body expected but not found, or vice versa");
3004		error = EINVAL;
3005		goto out_err;
3006	}
3007
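	/*
	 * Layout of the in-kernel copy built below (a sketch): first the
	 * two saved user pointers (nr_body and nr_options), then the
	 * request body, then one group per option consisting of the saved
	 * user nro_next pointer, the option header and its type-specific
	 * body:
	 *
	 *	[body ptr][opts ptr][request body]
	 *	[next ptr][option header][option body] ... (repeated)
	 */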
3008	bufsz = 2 * sizeof(void *) + rqsz;
3009	optsz = 0;
3010	for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
3011	     src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
3012	{
3013		error = copyin(src, &buf, sizeof(*src));
3014		if (error)
3015			goto out_err;
3016		optsz += sizeof(*src);
3017		optsz += nmreq_opt_size_by_type(buf.nro_reqtype, buf.nro_size);
3018		if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
3019			error = EMSGSIZE;
3020			goto out_err;
3021		}
3022		bufsz += optsz + sizeof(void *);
3023	}
3024
3025	ker = nm_os_malloc(bufsz);
3026	if (ker == NULL) {
3027		error = ENOMEM;
3028		goto out_err;
3029	}
3030	p = ker;
3031
3032	/* make a copy of the user pointers */
3033	ptrs = (uint64_t*)p;
3034	*ptrs++ = hdr->nr_body;
3035	*ptrs++ = hdr->nr_options;
3036	p = (char *)ptrs;
3037
3038	/* copy the body */
3039	error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3040	if (error)
3041		goto out_restore;
3042	/* overwrite the user pointer with the in-kernel one */
3043	hdr->nr_body = (uintptr_t)p;
3044	p += rqsz;
3045
3046	/* copy the options */
3047	next = (struct nmreq_option **)&hdr->nr_options;
3048	src = *next;
3049	while (src) {
3050		struct nmreq_option *opt;
3051
3052		/* copy the option header */
3053		ptrs = (uint64_t *)p;
3054		opt = (struct nmreq_option *)(ptrs + 1);
3055		error = copyin(src, opt, sizeof(*src));
3056		if (error)
3057			goto out_restore;
3058		/* make a copy of the user next pointer */
3059		*ptrs = opt->nro_next;
3060		/* overwrite the user pointer with the in-kernel one */
3061		*next = opt;
3062
3063		/* initialize the option as not supported.
3064		 * Recognized options will update this field.
3065		 */
3066		opt->nro_status = EOPNOTSUPP;
3067
3068		p = (char *)(opt + 1);
3069
3070		/* copy the option body */
3071		optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3072						opt->nro_size);
3073		if (optsz) {
3074			/* the option body follows the option header */
3075			error = copyin(src + 1, p, optsz);
3076			if (error)
3077				goto out_restore;
3078			p += optsz;
3079		}
3080
3081		/* move to next option */
3082		next = (struct nmreq_option **)&opt->nro_next;
3083		src = *next;
3084	}
3085	return 0;
3086
3087out_restore:
3088	ptrs = (uint64_t *)ker;
3089	hdr->nr_body = *ptrs++;
3090	hdr->nr_options = *ptrs++;
3091	hdr->nr_reserved = 0;
3092	nm_os_free(ker);
3093out_err:
3094	return error;
3095}
3096
3097static int
3098nmreq_copyout(struct nmreq_header *hdr, int rerror)
3099{
3100	struct nmreq_option *src, *dst;
3101	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3102	uint64_t *ptrs;
3103	size_t bodysz;
3104	int error;
3105
3106	if (!hdr->nr_reserved)
3107		return rerror;
3108
3109	/* restore the user pointers in the header */
3110	ptrs = (uint64_t *)ker - 2;
3111	bufstart = ptrs;
3112	hdr->nr_body = *ptrs++;
3113	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3114	hdr->nr_options = *ptrs;
3115
3116	if (!rerror) {
3117		/* copy the body */
3118		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3119		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3120		if (error) {
3121			rerror = error;
3122			goto out;
3123		}
3124	}
3125
3126	/* copy the options */
3127	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3128	while (src) {
3129		size_t optsz;
3130		uint64_t next;
3131
3132		/* restore the user pointer */
3133		next = src->nro_next;
3134		ptrs = (uint64_t *)src - 1;
3135		src->nro_next = *ptrs;
3136
3137		/* always copy the option header */
3138		error = copyout(src, dst, sizeof(*src));
3139		if (error) {
3140			rerror = error;
3141			goto out;
3142		}
3143
3144		/* copy the option body only if there was no error */
3145		if (!rerror && !src->nro_status) {
3146			optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3147							src->nro_size);
3148			if (optsz) {
3149				error = copyout(src + 1, dst + 1, optsz);
3150				if (error) {
3151					rerror = error;
3152					goto out;
3153				}
3154			}
3155		}
3156		src = (struct nmreq_option *)(uintptr_t)next;
3157		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3158	}
3159
3160
3161out:
3162	hdr->nr_reserved = 0;
3163	nm_os_free(bufstart);
3164	return rerror;
3165}
3166
3167struct nmreq_option *
3168nmreq_findoption(struct nmreq_option *opt, uint16_t reqtype)
3169{
3170	for ( ; opt; opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3171		if (opt->nro_reqtype == reqtype)
3172			return opt;
3173	return NULL;
3174}
3175
3176int
3177nmreq_checkduplicate(struct nmreq_option *opt) {
3178	uint16_t type = opt->nro_reqtype;
3179	int dup = 0;
3180
3181	while ((opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)opt->nro_next,
3182			type))) {
3183		dup++;
3184		opt->nro_status = EINVAL;
3185	}
3186	return (dup ? EINVAL : 0);
3187}
3188
3189static int
3190nmreq_checkoptions(struct nmreq_header *hdr)
3191{
3192	struct nmreq_option *opt;
3193	/* return error if there is still any option
3194	 * marked as not supported
3195	 */
3196
3197	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3198	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3199		if (opt->nro_status == EOPNOTSUPP)
3200			return EOPNOTSUPP;
3201
3202	return 0;
3203}
3204
3205/*
3206 * select(2) and poll(2) handlers for the "netmap" device.
3207 *
3208 * Can be called for one or more queues.
3209 * Return the event mask corresponding to ready events.
3210 * If there are no ready events (and 'sr' is not NULL), do a
3211 * selrecord on either individual selinfo or on the global one.
3212 * Device-dependent parts (locking and sync of tx/rx rings)
3213 * are done through callbacks.
3214 *
3215 * On Linux, the arguments are really pwait, the poll table, and 'td' is a struct file *.
3216 * The first one is remapped to pwait, as selrecord() uses the name as a
3217 * hidden argument.
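 *
 * Illustrative userspace sketch (error handling omitted): once a port
 * has been bound with NETMAP_REQ_REGISTER, the descriptor can simply
 * be polled; POLLIN/POLLOUT readiness is computed by the code below.
 * consume_rx_rings() is a placeholder for the RX ring scan sketched
 * earlier in this file.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);
 *		consume_rx_rings(nifp);
 *	}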
3218 */
3219int
3220netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3221{
3222	struct netmap_adapter *na;
3223	struct netmap_kring *kring;
3224	struct netmap_ring *ring;
3225	u_int i, want[NR_TXRX], revents = 0;
3226	NM_SELINFO_T *si[NR_TXRX];
3227#define want_tx want[NR_TX]
3228#define want_rx want[NR_RX]
3229	struct mbq q;	/* packets from RX hw queues to host stack */
3230
3231	/*
3232	 * In order to avoid nested locks, we need to "double check"
3233	 * txsync and rxsync if we decide to do a selrecord().
3234	 * retry_tx (and retry_rx, later) prevent looping forever.
3235	 */
3236	int retry_tx = 1, retry_rx = 1;
3237
3238	/* Transparent mode: send_down is 1 if we have found some
3239	 * packets to forward (host RX ring --> NIC) during the rx
3240	 * scan and we have not sent them down to the NIC yet.
3241	 * Transparent mode requires binding all rings to a single
3242	 * file descriptor.
3243	 */
3244	int send_down = 0;
3245	int sync_flags = priv->np_sync_flags;
3246
3247	mbq_init(&q);
3248
3249	if (unlikely(priv->np_nifp == NULL)) {
3250		return POLLERR;
3251	}
3252	mb(); /* make sure following reads are not from cache */
3253
3254	na = priv->np_na;
3255
3256	if (unlikely(!nm_netmap_on(na)))
3257		return POLLERR;
3258
3259	if (unlikely(priv->np_csb_atok_base)) {
3260		nm_prerr("Invalid poll in CSB mode");
3261		return POLLERR;
3262	}
3263
3264	if (netmap_debug & NM_DEBUG_ON)
3265		nm_prinf("device %s events 0x%x", na->name, events);
3266	want_tx = events & (POLLOUT | POLLWRNORM);
3267	want_rx = events & (POLLIN | POLLRDNORM);
3268
3269	/*
3270	 * If the card has more than one queue AND the file descriptor is
3271	 * bound to all of them, we sleep on the "global" selinfo, otherwise
3272	 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3273	 * per file descriptor).
3274	 * The interrupt routine in the driver wakes up one or the other
3275	 * (or both) depending on which clients are active.
3276	 *
3277	 * rxsync() is only called if we run out of buffers on a POLLIN.
3278	 * txsync() is called if we run out of buffers on POLLOUT, or
3279	 * there are pending packets to send. The latter can be disabled
3280	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
3281	 */
3282	si[NR_RX] = nm_si_user(priv, NR_RX) ? &na->si[NR_RX] :
3283				&na->rx_rings[priv->np_qfirst[NR_RX]]->si;
3284	si[NR_TX] = nm_si_user(priv, NR_TX) ? &na->si[NR_TX] :
3285				&na->tx_rings[priv->np_qfirst[NR_TX]]->si;
3286
3287#ifdef __FreeBSD__
3288	/*
3289	 * We start with a lock free round which is cheap if we have
3290	 * slots available. If this fails, then lock and call the sync
3291	 * routines. We can't do this on Linux, as the contract says
3292	 * that we must call nm_os_selrecord() unconditionally.
3293	 */
3294	if (want_tx) {
3295		const enum txrx t = NR_TX;
3296		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3297			kring = NMR(na, t)[i];
3298			if (kring->ring->cur != kring->ring->tail) {
3299				/* Some unseen TX space is available, so
3300				 * we don't need to run txsync. */
3301				revents |= want[t];
3302				want[t] = 0;
3303				break;
3304			}
3305		}
3306	}
3307	if (want_rx) {
3308		const enum txrx t = NR_RX;
3309		int rxsync_needed = 0;
3310
3311		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3312			kring = NMR(na, t)[i];
3313			if (kring->ring->cur == kring->ring->tail
3314				|| kring->rhead != kring->ring->head) {
3315				/* There are no unseen packets on this ring,
3316				 * or there are some buffers to be returned
3317				 * to the netmap port. We therefore go ahead
3318				 * and run rxsync. */
3319				rxsync_needed = 1;
3320				break;
3321			}
3322		}
3323		if (!rxsync_needed) {
3324			revents |= want_rx;
3325			want_rx = 0;
3326		}
3327	}
3328#endif
3329
3330#ifdef linux
3331	/* The selrecord must be unconditional on linux. */
3332	nm_os_selrecord(sr, si[NR_RX]);
3333	nm_os_selrecord(sr, si[NR_TX]);
3334#endif /* linux */
3335
3336	/*
3337	 * If we want to push packets out (priv->np_txpoll) or
3338	 * want_tx is still set, we must issue txsync calls
3339	 * (on all rings, to avoid stalling the tx rings).
3340	 * Fortunately, normal tx mode has np_txpoll set.
3341	 */
3342	if (priv->np_txpoll || want_tx) {
3343		/*
3344		 * The first round checks if anyone is ready, if not
3345		 * do a selrecord and another round to handle races.
3346		 * want_tx goes to 0 if any space is found, and is
3347		 * used to skip rings with no pending transmissions.
3348		 */
3349flush_tx:
3350		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3351			int found = 0;
3352
3353			kring = na->tx_rings[i];
3354			ring = kring->ring;
3355
3356			/*
3357			 * Don't try to txsync this TX ring if we already found some
3358			 * space in some of the TX rings (want_tx == 0) and there are no
3359			 * TX slots in this ring that need to be flushed to the NIC
3360			 * (head == hwcur).
3361			 */
3362			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3363				continue;
3364
3365			if (nm_kr_tryget(kring, 1, &revents))
3366				continue;
3367
3368			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3369				netmap_ring_reinit(kring);
3370				revents |= POLLERR;
3371			} else {
3372				if (kring->nm_sync(kring, sync_flags))
3373					revents |= POLLERR;
3374				else
3375					nm_sync_finalize(kring);
3376			}
3377
3378			/*
3379			 * If we found new slots, notify potential
3380			 * listeners on the same ring.
3381			 * Since we just did a txsync, look at the copies
3382			 * of cur,tail in the kring.
3383			 */
3384			found = kring->rcur != kring->rtail;
3385			nm_kr_put(kring);
3386			if (found) { /* notify other listeners */
3387				revents |= want_tx;
3388				want_tx = 0;
3389#ifndef linux
3390				kring->nm_notify(kring, 0);
3391#endif /* linux */
3392			}
3393		}
3394		/* if there were any packets to forward, we must have handled them by now */
3395		send_down = 0;
3396		if (want_tx && retry_tx && sr) {
3397#ifndef linux
3398			nm_os_selrecord(sr, si[NR_TX]);
3399#endif /* !linux */
3400			retry_tx = 0;
3401			goto flush_tx;
3402		}
3403	}
3404
3405	/*
3406	 * If want_rx is still set scan receive rings.
3407	 * Do it on all rings because otherwise we starve.
3408	 */
3409	if (want_rx) {
3410		/* two rounds here for race avoidance */
3411do_retry_rx:
3412		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3413			int found = 0;
3414
3415			kring = na->rx_rings[i];
3416			ring = kring->ring;
3417
3418			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3419				continue;
3420
3421			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3422				netmap_ring_reinit(kring);
3423				revents |= POLLERR;
3424			}
3425			/* now we can use kring->rcur, rtail */
3426
3427			/*
3428			 * transparent mode support: collect packets from
3429			 * hw rxring(s) that have been released by the user
3430			 */
3431			if (nm_may_forward_up(kring)) {
3432				netmap_grab_packets(kring, &q, netmap_fwd);
3433			}
3434
3435			/* Clear the NR_FORWARD flag anyway, it may be set by
3436			 * the nm_sync() below, and only for the host RX ring (see
3437			 * netmap_rxsync_from_host()). */
3438			kring->nr_kflags &= ~NR_FORWARD;
3439			if (kring->nm_sync(kring, sync_flags))
3440				revents |= POLLERR;
3441			else
3442				nm_sync_finalize(kring);
3443			send_down |= (kring->nr_kflags & NR_FORWARD);
3444			ring_timestamp_set(ring);
3445			found = kring->rcur != kring->rtail;
3446			nm_kr_put(kring);
3447			if (found) {
3448				revents |= want_rx;
3449				retry_rx = 0;
3450#ifndef linux
3451				kring->nm_notify(kring, 0);
3452#endif /* linux */
3453			}
3454		}
3455
3456#ifndef linux
3457		if (retry_rx && sr) {
3458			nm_os_selrecord(sr, si[NR_RX]);
3459		}
3460#endif /* !linux */
3461		if (send_down || retry_rx) {
3462			retry_rx = 0;
3463			if (send_down)
3464				goto flush_tx; /* and retry_rx */
3465			else
3466				goto do_retry_rx;
3467		}
3468	}
3469
3470	/*
3471	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3472	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3473	 * to the host stack.
3474	 */
3475
3476	if (mbq_peek(&q)) {
3477		netmap_send_up(na->ifp, &q);
3478	}
3479
3480	return (revents);
3481#undef want_tx
3482#undef want_rx
3483}
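
/*
 * Illustrative sketch: how the poll logic above is typically driven from
 * userspace with the nm_open()/nm_nextpkt() helpers in net/netmap_user.h.
 * The interface name "em0" and the process() call are only placeholders.
 *
 *	#include <poll.h>
 *	#define NETMAP_WITH_LIBS
 *	#include <net/netmap_user.h>
 *
 *	struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
 *	struct pollfd pfd = { .fd = NETMAP_FD(d), .events = POLLIN };
 *
 *	for (;;) {
 *		struct nm_pkthdr h;
 *		const u_char *buf;
 *
 *		poll(&pfd, 1, -1);	// blocks in the poll routine above
 *		while ((buf = nm_nextpkt(d, &h)) != NULL)
 *			process(buf, h.len);
 *	}
 *	nm_close(d);
 */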
3484
3485int
3486nma_intr_enable(struct netmap_adapter *na, int onoff)
3487{
3488	bool changed = false;
3489	enum txrx t;
3490	int i;
3491
3492	for_rx_tx(t) {
3493		for (i = 0; i < nma_get_nrings(na, t); i++) {
3494			struct netmap_kring *kring = NMR(na, t)[i];
3495			int on = !(kring->nr_kflags & NKR_NOINTR);
3496
3497			if (!!onoff != !!on) {
3498				changed = true;
3499			}
3500			if (onoff) {
3501				kring->nr_kflags &= ~NKR_NOINTR;
3502			} else {
3503				kring->nr_kflags |= NKR_NOINTR;
3504			}
3505		}
3506	}
3507
3508	if (!changed) {
3509		return 0; /* nothing to do */
3510	}
3511
3512	if (!na->nm_intr) {
3513		nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3514		  na->name);
3515		return -1;
3516	}
3517
3518	na->nm_intr(na, onoff);
3519
3520	return 0;
3521}
3522
3523
3524/*-------------------- driver support routines -------------------*/
3525
3526/* default notify callback */
3527static int
3528netmap_notify(struct netmap_kring *kring, int flags)
3529{
3530	struct netmap_adapter *na = kring->notify_na;
3531	enum txrx t = kring->tx;
3532
3533	nm_os_selwakeup(&kring->si);
3534	/* optimization: avoid a wake up on the global
3535	 * queue if nobody has registered for more
3536	 * than one ring
3537	 */
3538	if (na->si_users[t] > 0)
3539		nm_os_selwakeup(&na->si[t]);
3540
3541	return NM_IRQ_COMPLETED;
3542}
3543
3544/* called by all routines that create netmap_adapters.
3545 * provide some defaults and get a reference to the
3546 * memory allocator
3547 */
3548int
3549netmap_attach_common(struct netmap_adapter *na)
3550{
3551	if (!na->rx_buf_maxsize) {
3552		/* Set a conservative default (larger is safer). */
3553		na->rx_buf_maxsize = PAGE_SIZE;
3554	}
3555
3556#ifdef __FreeBSD__
3557	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3558		na->if_input = na->ifp->if_input; /* for netmap_send_up */
3559	}
3560	na->pdev = na; /* make sure netmap_mem_map() is called */
3561#endif /* __FreeBSD__ */
3562	if (na->na_flags & NAF_HOST_RINGS) {
3563		if (na->num_host_rx_rings == 0)
3564			na->num_host_rx_rings = 1;
3565		if (na->num_host_tx_rings == 0)
3566			na->num_host_tx_rings = 1;
3567	}
3568	if (na->nm_krings_create == NULL) {
3569		/* we assume that we have been called by a driver,
3570		 * since other port types all provide their own
3571		 * nm_krings_create
3572		 */
3573		na->nm_krings_create = netmap_hw_krings_create;
3574		na->nm_krings_delete = netmap_hw_krings_delete;
3575	}
3576	if (na->nm_notify == NULL)
3577		na->nm_notify = netmap_notify;
3578	na->active_fds = 0;
3579
3580	if (na->nm_mem == NULL) {
3581		/* use the global allocator */
3582		na->nm_mem = netmap_mem_get(&nm_mem);
3583	}
3584#ifdef WITH_VALE
3585	if (na->nm_bdg_attach == NULL)
3586		/* no special nm_bdg_attach callback. On VALE
3587		 * attach, we need to interpose a bwrap
3588		 */
3589		na->nm_bdg_attach = netmap_default_bdg_attach;
3590#endif
3591
3592	return 0;
3593}
3594
3595/* Wrapper for the register callback provided by netmap-enabled
3596 * hardware drivers.
3597 * nm_iszombie(na) means that the driver module has been
3598 * unloaded, so we cannot call into it.
3599 * nm_os_ifnet_lock() must guarantee mutual exclusion with
3600 * module unloading.
3601 */
3602static int
3603netmap_hw_reg(struct netmap_adapter *na, int onoff)
3604{
3605	struct netmap_hw_adapter *hwna =
3606		(struct netmap_hw_adapter*)na;
3607	int error = 0;
3608
3609	nm_os_ifnet_lock();
3610
3611	if (nm_iszombie(na)) {
3612		if (onoff) {
3613			error = ENXIO;
3614		} else if (na != NULL) {
3615			na->na_flags &= ~NAF_NETMAP_ON;
3616		}
3617		goto out;
3618	}
3619
3620	error = hwna->nm_hw_register(na, onoff);
3621
3622out:
3623	nm_os_ifnet_unlock();
3624
3625	return error;
3626}
3627
3628static void
3629netmap_hw_dtor(struct netmap_adapter *na)
3630{
3631	if (na->ifp == NULL)
3632		return;
3633
3634	NM_DETACH_NA(na->ifp);
3635}
3636
3637
3638/*
3639 * Allocate a netmap_adapter object, and initialize it from the
3640 * 'arg' passed by the driver on attach.
3641 * We allocate a block of memory of 'size' bytes, large enough
3642 * for a struct netmap_adapter plus additional space private to
3643 * the caller.
3644 * Return 0 on success, ENOMEM otherwise.
3645 */
3646int
3647netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3648{
3649	struct netmap_hw_adapter *hwna = NULL;
3650	struct ifnet *ifp = NULL;
3651
3652	if (size < sizeof(struct netmap_hw_adapter)) {
3653		if (netmap_debug & NM_DEBUG_ON)
3654			nm_prerr("Invalid netmap adapter size %d", (int)size);
3655		return EINVAL;
3656	}
3657
3658	if (arg == NULL || arg->ifp == NULL) {
3659		if (netmap_debug & NM_DEBUG_ON)
3660			nm_prerr("either arg or arg->ifp is NULL");
3661		return EINVAL;
3662	}
3663
3664	if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
3665		if (netmap_debug & NM_DEBUG_ON)
3666			nm_prerr("%s: invalid rings tx %d rx %d",
3667				arg->name, arg->num_tx_rings, arg->num_rx_rings);
3668		return EINVAL;
3669	}
3670
3671	ifp = arg->ifp;
3672	if (NM_NA_CLASH(ifp)) {
3673		/* If NA(ifp) is not NULL but there is no valid netmap
3674		 * adapter, it means that someone else is using the same
3675		 * pointer (e.g. ax25_ptr on linux). This happens, for
3676		 * instance, when PF_RING is also in use. */
3677		nm_prerr("Error: netmap adapter hook is busy");
3678		return EBUSY;
3679	}
3680
3681	hwna = nm_os_malloc(size);
3682	if (hwna == NULL)
3683		goto fail;
3684	hwna->up = *arg;
3685	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3686	strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3687	if (override_reg) {
3688		hwna->nm_hw_register = hwna->up.nm_register;
3689		hwna->up.nm_register = netmap_hw_reg;
3690	}
3691	if (netmap_attach_common(&hwna->up)) {
3692		nm_os_free(hwna);
3693		goto fail;
3694	}
3695	netmap_adapter_get(&hwna->up);
3696
3697	NM_ATTACH_NA(ifp, &hwna->up);
3698
3699	nm_os_onattach(ifp);
3700
3701	if (arg->nm_dtor == NULL) {
3702		hwna->up.nm_dtor = netmap_hw_dtor;
3703	}
3704
3705	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3706	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3707	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3708	return 0;
3709
3710fail:
3711	nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3712	return (hwna ? EINVAL : ENOMEM);
3713}
3714
3715
3716int
3717netmap_attach(struct netmap_adapter *arg)
3718{
3719	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3720			1 /* override nm_reg */);
3721}
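
/*
 * Illustrative sketch: how a NIC driver is expected to call netmap_attach()
 * at the end of its own attach routine. The "foo" softc, callbacks and
 * queue counts are hypothetical placeholders; the struct netmap_adapter
 * fields and netmap_attach() itself are the real interface.
 *
 *	static void
 *	foo_netmap_attach(struct foo_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.rx_buf_maxsize = 2048;		// or the real rx buffer size
 *		na.nm_register = foo_netmap_reg;	// driver-provided callbacks
 *		na.nm_txsync = foo_netmap_txsync;
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		netmap_attach(&na);
 *	}
 */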
3722
3723
3724void
3725NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3726{
3727	if (!na) {
3728		return;
3729	}
3730
3731	refcount_acquire(&na->na_refcount);
3732}
3733
3734
3735/* returns 1 iff the netmap_adapter is destroyed */
3736int
3737NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3738{
3739	if (!na)
3740		return 1;
3741
3742	if (!refcount_release(&na->na_refcount))
3743		return 0;
3744
3745	if (na->nm_dtor)
3746		na->nm_dtor(na);
3747
3748	if (na->tx_rings) { /* XXX should not happen */
3749		if (netmap_debug & NM_DEBUG_ON)
3750			nm_prerr("freeing leftover tx_rings");
3751		na->nm_krings_delete(na);
3752	}
3753	netmap_pipe_dealloc(na);
3754	if (na->nm_mem)
3755		netmap_mem_put(na->nm_mem);
3756	bzero(na, sizeof(*na));
3757	nm_os_free(na);
3758
3759	return 1;
3760}
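
/*
 * Illustrative sketch: the expected pairing of the two helpers above.
 * Code that stores an adapter pointer takes a reference and releases it
 * when done; the pointer must not be used once put() reports destruction.
 *
 *	netmap_adapter_get(na);
 *	...				// use na
 *	if (netmap_adapter_put(na))
 *		na = NULL;		// last reference dropped, na is gone
 */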
3761
3762/* nm_krings_create callback for all hardware native adapters */
3763int
3764netmap_hw_krings_create(struct netmap_adapter *na)
3765{
3766	int ret = netmap_krings_create(na, 0);
3767	if (ret == 0) {
3768		/* initialize the mbq for the sw rx ring */
3769		u_int lim = netmap_real_rings(na, NR_RX), i;
3770		for (i = na->num_rx_rings; i < lim; i++) {
3771			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3772		}
3773		ND("initialized sw rx queue %d", na->num_rx_rings);
3774	}
3775	return ret;
3776}
3777
3778
3779
3780/*
3781 * Called on module unload by the netmap-enabled drivers
3782 */
3783void
3784netmap_detach(struct ifnet *ifp)
3785{
3786	struct netmap_adapter *na = NA(ifp);
3787
3788	if (!na)
3789		return;
3790
3791	NMG_LOCK();
3792	netmap_set_all_rings(na, NM_KR_LOCKED);
3793	/*
3794	 * If the netmap adapter is not native, somebody has
3795	 * changed it, so we cannot release it here.
3796	 * The NAF_ZOMBIE flag will notify the new owner that
3797	 * the driver is gone.
3798	 */
3799	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3800		na->na_flags |= NAF_ZOMBIE;
3801	}
3802	/* give active users a chance to notice that NAF_ZOMBIE has been
3803	 * turned on, so that they can stop and return an error to userspace.
3804	 * Note that this becomes a NOP if there are no active users and,
3805	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3806	 * NULL.
3807	 */
3808	netmap_enable_all_rings(ifp);
3809	NMG_UNLOCK();
3810}
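
/*
 * Illustrative sketch: a driver that called netmap_attach() in its attach
 * routine calls netmap_detach() from its detach path, before the ifp is
 * freed. "foo_detach" and the softc layout are hypothetical.
 *
 *	static int
 *	foo_detach(device_t dev)
 *	{
 *		struct foo_softc *sc = device_get_softc(dev);
 *
 *		netmap_detach(sc->ifp);
 *		...			// usual driver teardown
 *		return (0);
 *	}
 */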
3811
3812
3813/*
3814 * Intercept packets from the network stack and pass them
3815 * to netmap as incoming packets on the 'software' ring.
3816 *
3817 * We only store packets in a bounded mbq and then copy them
3818 * in the relevant rxsync routine.
3819 *
3820 * We rely on the OS to make sure that the ifp and na do not go
3821 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3822 * In nm_register() or whenever there is a reinitialization,
3823 * we make sure to make the mode change visible here.
3824 */
3825int
3826netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3827{
3828	struct netmap_adapter *na = NA(ifp);
3829	struct netmap_kring *kring, *tx_kring;
3830	u_int len = MBUF_LEN(m);
3831	u_int error = ENOBUFS;
3832	unsigned int txr;
3833	struct mbq *q;
3834	int busy;
3835	u_int i;
3836
3837	i = MBUF_TXQ(m);
3838	if (i >= na->num_host_rx_rings) {
3839		i = i % na->num_host_rx_rings;
3840	}
3841	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3842
3843	// XXX [Linux] we do not need this lock
3844	// if we follow the down/configure/up protocol -gl
3845	// mtx_lock(&na->core_lock);
3846
3847	if (!nm_netmap_on(na)) {
3848		nm_prerr("%s not in netmap mode anymore", na->name);
3849		error = ENXIO;
3850		goto done;
3851	}
3852
3853	txr = MBUF_TXQ(m);
3854	if (txr >= na->num_tx_rings) {
3855		txr %= na->num_tx_rings;
3856	}
3857	tx_kring = NMR(na, NR_TX)[txr];
3858
3859	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3860		return MBUF_TRANSMIT(na, ifp, m);
3861	}
3862
3863	q = &kring->rx_queue;
3864
3865	// XXX reconsider long packets if we handle fragments
3866	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3867		nm_prerr("%s from_host, drop packet size %d > %d", na->name,
3868			len, NETMAP_BUF_SIZE(na));
3869		goto done;
3870	}
3871
3872	if (!netmap_generic_hwcsum) {
3873		if (nm_os_mbuf_has_csum_offld(m)) {
3874			RD(1, "%s drop mbuf that needs checksum offload", na->name);
3875			goto done;
3876		}
3877	}
3878
3879	if (nm_os_mbuf_has_seg_offld(m)) {
3880		RD(1, "%s drop mbuf that needs generic segmentation offload", na->name);
3881		goto done;
3882	}
3883
3884#ifdef __FreeBSD__
3885	ETHER_BPF_MTAP(ifp, m);
3886#endif /* __FreeBSD__ */
3887
3888	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3889	 * and maybe other instances of netmap_transmit (the latter
3890	 * not possible on Linux).
3891	 * We enqueue the mbuf only if we are sure there is going to be
3892	 * enough room in the host RX ring, otherwise we drop it.
3893	 */
3894	mbq_lock(q);
3895
3896	busy = kring->nr_hwtail - kring->nr_hwcur;
3897	if (busy < 0)
3898		busy += kring->nkr_num_slots;
3899	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3900		RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3901			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3902	} else {
3903		mbq_enqueue(q, m);
3904		ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3905		/* notify outside the lock */
3906		m = NULL;
3907		error = 0;
3908	}
3909	mbq_unlock(q);
3910
3911done:
3912	if (m)
3913		m_freem(m);
3914	/* unconditionally wake up listeners */
3915	kring->nm_notify(kring, 0);
3916	/* This is normally netmap_notify(), but for NICs
3917	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3918	 * which possibly forwards the frames through the switch.
3919	 */
3920
3921	return (error);
3922}
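
/*
 * For reference, on FreeBSD the function above is normally installed by the
 * OS glue when netmap mode is switched on (see nm_os_onenter() in
 * netmap_freebsd.c), roughly:
 *
 *	na->if_transmit = ifp->if_transmit;	// save the stack's routine
 *	ifp->if_transmit = netmap_transmit;	// divert host mbufs to netmap
 *
 * and restored by nm_os_onexit() when the last user unregisters.
 */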
3923
3924
3925/*
3926 * netmap_reset() is called by the driver routines when reinitializing
3927 * a ring. The driver is in charge of locking to protect the kring.
3928 * If native netmap mode is not set, just return NULL.
3929 * If native netmap mode is set, we also have to set nr_mode to
3930 * NKR_NETMAP_ON.
3931 */
3932struct netmap_slot *
3933netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3934	u_int new_cur)
3935{
3936	struct netmap_kring *kring;
3937	int new_hwofs, lim;
3938
3939	if (!nm_native_on(na)) {
3940		ND("interface not in native netmap mode");
3941		return NULL;	/* nothing to reinitialize */
3942	}
3943
3944	/* XXX note: in the new scheme, we are not guaranteed to be
3945	 * under lock (e.g. when called on a device reset).
3946	 * In this case, we should set a flag and not trust the
3947	 * values too much. In practice: TODO
3948	 * - set a RESET flag somewhere in the kring
3949	 * - do the processing in a conservative way
3950	 * - let the *sync() fixup at the end.
3951	 */
3952	if (tx == NR_TX) {
3953		if (n >= na->num_tx_rings)
3954			return NULL;
3955
3956		kring = na->tx_rings[n];
3957
3958		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3959			kring->nr_mode = NKR_NETMAP_OFF;
3960			return NULL;
3961		}
3962
3963		// XXX check whether we should use hwcur or rcur
3964		new_hwofs = kring->nr_hwcur - new_cur;
3965	} else {
3966		if (n >= na->num_rx_rings)
3967			return NULL;
3968		kring = na->rx_rings[n];
3969
3970		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3971			kring->nr_mode = NKR_NETMAP_OFF;
3972			return NULL;
3973		}
3974
3975		new_hwofs = kring->nr_hwtail - new_cur;
3976	}
3977	lim = kring->nkr_num_slots - 1;
3978	if (new_hwofs > lim)
3979		new_hwofs -= lim + 1;
3980
3981	/* Always set the new offset value and realign the ring. */
3982	if (netmap_debug & NM_DEBUG_ON)
3983	    nm_prinf("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3984		na->name,
3985		tx == NR_TX ? "TX" : "RX", n,
3986		kring->nkr_hwofs, new_hwofs,
3987		kring->nr_hwtail,
3988		tx == NR_TX ? lim : kring->nr_hwtail);
3989	kring->nkr_hwofs = new_hwofs;
3990	if (tx == NR_TX) {
3991		kring->nr_hwtail = kring->nr_hwcur + lim;
3992		if (kring->nr_hwtail > lim)
3993			kring->nr_hwtail -= lim + 1;
3994	}
3995
3996	/*
3997	 * Wakeup on the individual and global selwait
3998	 * We do the wakeup here, but the ring is not yet reconfigured.
3999	 * However, we are under lock so there are no races.
4000	 */
4001	kring->nr_mode = NKR_NETMAP_ON;
4002	kring->nm_notify(kring, 0);
4003	return kring->ring->slot;
4004}
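
/*
 * Illustrative sketch: how a driver's TX ring initialization uses
 * netmap_reset() to point its hardware descriptors at the netmap buffers.
 * The "sc"/"txr" structures are hypothetical; netmap_reset(),
 * netmap_idx_n2k() and PNMB() are the real helpers.
 *
 *	struct netmap_adapter *na = NA(sc->ifp);
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, txr->me, 0);
 *	u_int i;
 *
 *	for (i = 0; slot != NULL && i < na->num_tx_desc; i++) {
 *		int si = netmap_idx_n2k(na->tx_rings[txr->me], i);
 *		uint64_t paddr;
 *
 *		(void)PNMB(na, slot + si, &paddr);
 *		// program the i-th hardware descriptor to point at paddr
 *	}
 */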
4005
4006
4007/*
4008 * Dispatch rx/tx interrupts to the netmap rings.
4009 *
4010 * "work_done" is non-null on the RX path, NULL for the TX path.
4011 * We rely on the OS to make sure that there is only one active
4012 * instance per queue, and that there is appropriate locking.
4013 *
4014 * The 'notify' routine depends on what the ring is attached to.
4015 * - for a netmap file descriptor, do a selwakeup on the individual
4016 *   waitqueue, plus one on the global one if needed
4017 *   (see netmap_notify)
4018 * - for a nic connected to a switch, call the proper forwarding routine
4019 *   (see netmap_bwrap_intr_notify)
4020 */
4021int
4022netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
4023{
4024	struct netmap_kring *kring;
4025	enum txrx t = (work_done ? NR_RX : NR_TX);
4026
4027	q &= NETMAP_RING_MASK;
4028
4029	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
4030		nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX", q);
4031	}
4032
4033	if (q >= nma_get_nrings(na, t))
4034		return NM_IRQ_PASS; // not a physical queue
4035
4036	kring = NMR(na, t)[q];
4037
4038	if (kring->nr_mode == NKR_NETMAP_OFF) {
4039		return NM_IRQ_PASS;
4040	}
4041
4042	if (t == NR_RX) {
4043		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
4044		*work_done = 1; /* do not fire napi again */
4045	}
4046
4047	return kring->nm_notify(kring, 0);
4048}
4049
4050
4051/*
4052 * Default functions to handle rx/tx interrupts from a physical device.
4053 * "work_done" is non-null on the RX path, NULL for the TX path.
4054 *
4055 * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4056 * so that the caller proceeds with regular processing.
4057 * Otherwise call netmap_common_irq().
4058 *
4059 * If the card is connected to a netmap file descriptor,
4060 * do a selwakeup on the individual queue, plus one on the global one
4061 * if needed (multiqueue card _and_ there are multiqueue listeners),
4062 * and return NR_IRQ_COMPLETED.
4063 *
4064 * Finally, if called on rx from an interface connected to a switch,
4065 * calls the proper forwarding routine.
4066 */
4067int
4068netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
4069{
4070	struct netmap_adapter *na = NA(ifp);
4071
4072	/*
4073	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4074	 * we still use the regular driver even though the previous
4075	 * check fails. It is unclear whether we should use
4076	 * nm_native_on() here.
4077	 */
4078	if (!nm_netmap_on(na))
4079		return NM_IRQ_PASS;
4080
4081	if (na->na_flags & NAF_SKIP_INTR) {
4082		ND("use regular interrupt");
4083		return NM_IRQ_PASS;
4084	}
4085
4086	return netmap_common_irq(na, q, work_done);
4087}
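
/*
 * Illustrative sketch: how a driver's per-queue RX interrupt/cleanup
 * routine defers to netmap. "foo_rxeof", "rxr" and their fields are
 * hypothetical; netmap_rx_irq() and NM_IRQ_PASS are the real interface.
 *
 *	static int
 *	foo_rxeof(struct foo_rx_ring *rxr)
 *	{
 *		u_int work_done = 0;
 *
 *		if (netmap_rx_irq(rxr->ifp, rxr->me, &work_done) != NM_IRQ_PASS)
 *			return (0);	// netmap consumed the interrupt
 *		...			// normal mbuf-based rx processing
 *	}
 */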
4088
4089/* set/clear native flags and if_transmit/netdev_ops */
4090void
4091nm_set_native_flags(struct netmap_adapter *na)
4092{
4093	struct ifnet *ifp = na->ifp;
4094
4095	/* We do the setup for intercepting packets only if we are the
4096	 * first user of this adapter. */
4097	if (na->active_fds > 0) {
4098		return;
4099	}
4100
4101	na->na_flags |= NAF_NETMAP_ON;
4102	nm_os_onenter(ifp);
4103	nm_update_hostrings_mode(na);
4104}
4105
4106void
4107nm_clear_native_flags(struct netmap_adapter *na)
4108{
4109	struct ifnet *ifp = na->ifp;
4110
4111	/* We undo the setup for intercepting packets only if we are the
4112	 * last user of this adapter. */
4113	if (na->active_fds > 0) {
4114		return;
4115	}
4116
4117	nm_update_hostrings_mode(na);
4118	nm_os_onexit(ifp);
4119
4120	na->na_flags &= ~NAF_NETMAP_ON;
4121}
4122
4123/*
4124 * Module loader and unloader
4125 *
4126 * netmap_init() creates the /dev/netmap device and initializes
4127 * all global variables. Returns 0 on success, errno on failure
4128 * (though in practice failure is not expected).
4129 *
4130 * netmap_fini() destroys everything.
4131 */
4132
4133static struct cdev *netmap_dev; /* /dev/netmap character device. */
4134extern struct cdevsw netmap_cdevsw;
4135
4136
4137void
4138netmap_fini(void)
4139{
4140	if (netmap_dev)
4141		destroy_dev(netmap_dev);
4142	/* we assume that there are no netmap users left */
4143	nm_os_ifnet_fini();
4144	netmap_uninit_bridges();
4145	netmap_mem_fini();
4146	NMG_LOCK_DESTROY();
4147	nm_prinf("netmap: unloaded module.");
4148}
4149
4150
4151int
4152netmap_init(void)
4153{
4154	int error;
4155
4156	NMG_LOCK_INIT();
4157
4158	error = netmap_mem_init();
4159	if (error != 0)
4160		goto fail;
4161	/*
4162	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4163	 * when the module is compiled in.
4164	 * XXX could use make_dev_credv() to get error number
4165	 */
4166	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4167		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4168			      "netmap");
4169	if (!netmap_dev)
4170		goto fail;
4171
4172	error = netmap_init_bridges();
4173	if (error)
4174		goto fail;
4175
4176#ifdef __FreeBSD__
4177	nm_os_vi_init_index();
4178#endif
4179
4180	error = nm_os_ifnet_init();
4181	if (error)
4182		goto fail;
4183
4184	nm_prinf("netmap: loaded module");
4185	return (0);
4186fail:
4187	netmap_fini();
4188	return (EINVAL); /* may be incorrect */
4189}
4190