1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2011-2014 Matteo Landi
5 * Copyright (C) 2011-2016 Luigi Rizzo
6 * Copyright (C) 2011-2016 Giuseppe Lettieri
7 * Copyright (C) 2011-2016 Vincenzo Maffione
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *   1. Redistributions of source code must retain the above copyright
14 *      notice, this list of conditions and the following disclaimer.
15 *   2. Redistributions in binary form must reproduce the above copyright
16 *      notice, this list of conditions and the following disclaimer in the
17 *      documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32
33/*
34 *
35 * This module supports memory mapped access to network devices,
36 * see netmap(4).
37 *
38 * The module uses a large, memory pool allocated by the kernel
39 * and accessible as mmapped memory by multiple userspace threads/processes.
40 * The memory pool contains packet buffers and "netmap rings",
41 * i.e. user-accessible copies of the interface's queues.
42 *
43 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
46 * 2. on each descriptor, the process issues an ioctl() to identify
47 *    the interface that should report events to the file descriptor.
48 * 3. on each descriptor, the process issues an mmap() request to
49 *    map the shared memory region within the process' address space.
50 *    The list of interesting queues is indicated by a location in
51 *    the shared memory region.
52 * 4. using the functions in the netmap(4) userspace API, a process
53 *    can look up the occupation state of a queue, access memory buffers,
54 *    and retrieve received packets or enqueue packets to transmit.
55 * 5. using some ioctl()s the process can synchronize the userspace view
56 *    of the queue with the actual status in the kernel. This includes both
57 *    receiving the notification of new packets, and transmitting new
58 *    packets on the output interface.
59 * 6. select() or poll() can be used to wait for events on individual
60 *    transmit or receive queues (or all queues for a given interface).
61 *
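 * As a concrete illustration of steps 1-6 above, here is a minimal
 * userspace sketch based on the legacy nmreq API described in netmap(4)
 * (error handling is omitted and "em0" is just a placeholder interface
 * name; newer applications may prefer the nmreq_header interface):
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <string.h>
 *	#include <net/netmap_user.h>
 *
 *	struct nmreq nmr;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *txring;
 *	struct pollfd pfd;
 *	void *mem;
 *
 *	int fd = open("/dev/netmap", O_RDWR);		// step 1
 *	memset(&nmr, 0, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name) - 1);
 *	ioctl(fd, NIOCREGIF, &nmr);			// step 2
 *	mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	txring = NETMAP_TXRING(nifp, 0);		// step 4: fill slots,
 *							// then advance head/cur
 *	ioctl(fd, NIOCTXSYNC, NULL);			// step 5
 *	pfd.fd = fd;
 *	pfd.events = POLLOUT;
 *	poll(&pfd, 1, -1);				// step 6
 *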
62
63		SYNCHRONIZATION (USER)
64
65The netmap rings and data structures may be shared among multiple
66user threads or even independent processes.
67Any synchronization among those threads/processes is delegated
68to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this; it only guarantees that the system will not crash in case
of invalid usage.
72
73		LOCKING (INTERNAL)
74
75Within the kernel, access to the netmap rings is protected as follows:
76
- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting against multiple active senders for the same destination
83
84- an atomic variable to guarantee that there is at most one
85  instance of *_*xsync() on the ring at any time.
86  For rings connected to user file
87  descriptors, an atomic_test_and_set() protects this, and the
88  lock on the ring is not actually used.
89  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
90  is also used to prevent multiple executions (the driver might indeed
91  already guarantee this).
92  For NIC TX rings connected to a VALE switch, the lock arbitrates
93  access to the queue (both when allocating buffers and when pushing
94  them out).
95
- *xsync() should be protected against initialization of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.
100
101  On linux there is an external lock on the tx path, which probably
102  also arbitrates access to the reset routine. XXX to be revised
103
104- a per-interface core_lock protecting access from the host stack
105  while interfaces may be detached from netmap mode.
106  XXX there should be no need for this lock if we detach the interfaces
107  only while they are down.
108
109
110--- VALE SWITCH ---
111
112NMG_LOCK() serializes all modifications to switches and ports.
113A switch cannot be deleted until all ports are gone.
114
115For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
117lock is acquired in exclusive mode (after holding NMG_LOCK).
118When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
119The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
121Hence it is important that sleepable shared locks are used.
122
123On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
125packets are copied from source to destination, and then
126the lock is acquired again and the receive ring is updated.
127(A similar thing is done on the tx ring for NIC and host stack
128ports attached to the switch)
129
130 */
131
132
133/* --- internals ----
134 *
135 * Roadmap to the code that implements the above.
136 *
 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
 * >    a select()able file descriptor on which events are reported.
139 *
140 *  	Internally, we allocate a netmap_priv_d structure, that will be
141 *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
142 *  	structure for each open().
143 *
144 *      os-specific:
145 *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
146 *  	    linux:   see linux_netmap_open() (netmap_linux.c)
147 *
148 * > 2. on each descriptor, the process issues an ioctl() to identify
149 * >    the interface that should report events to the file descriptor.
150 *
151 * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
152 * 	Most important things happen in netmap_get_na() and
153 * 	netmap_do_regif(), called from there. Additional details can be
154 * 	found in the comments above those functions.
155 *
156 * 	In all cases, this action creates/takes-a-reference-to a
157 * 	netmap_*_adapter describing the port, and allocates a netmap_if
158 * 	and all necessary netmap rings, filling them with netmap buffers.
159 *
160 *      In this phase, the sync callbacks for each ring are set (these are used
161 *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
162 *      The adapter creation/initialization code puts them in the
163 * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
164 * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
165 * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
166 * 	actually call netmap_krings_create() to perform this and the other
167 * 	common stuff. netmap_krings_create() also takes care of the host rings,
168 * 	if needed, by setting their sync callbacks appropriately.
169 *
170 * 	Additional actions depend on the kind of netmap_adapter that has been
171 * 	registered:
172 *
173 * 	- netmap_hw_adapter:  	     [netmap.c]
174 * 	     This is a system netdev/ifp with native netmap support.
175 * 	     The ifp is detached from the host stack by redirecting:
176 * 	       - transmissions (from the network stack) to netmap_transmit()
177 * 	       - receive notifications to the nm_notify() callback for
178 * 	         this adapter. The callback is normally netmap_notify(), unless
179 * 	         the ifp is attached to a bridge using bwrap, in which case it
180 * 	         is netmap_bwrap_intr_notify().
181 *
182 * 	- netmap_generic_adapter:      [netmap_generic.c]
183 * 	      A system netdev/ifp without native netmap support.
184 *
185 * 	(the decision about native/non native support is taken in
186 * 	 netmap_get_hw_na(), called by netmap_get_na())
187 *
188 * 	- netmap_vp_adapter 		[netmap_vale.c]
189 * 	      Returned by netmap_get_bdg_na().
190 * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
191 * 	      are created on the fly if they don't already exist, and are
192 * 	      always attached to a bridge.
 * 	      Persistent VALE ports must be created separately, and
 * 	      then attached like normal NICs. The NIOCREGIF we are examining
195 * 	      will find them only if they had previously been created and
196 * 	      attached (see VALE_CTL below).
197 *
198 * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
199 * 	      Returned by netmap_get_pipe_na().
200 * 	      Both pipe ends are created, if they didn't already exist.
201 *
202 * 	- netmap_monitor_adapter      [netmap_monitor.c]
203 * 	      Returned by netmap_get_monitor_na().
204 * 	      If successful, the nm_sync callbacks of the monitored adapter
205 * 	      will be intercepted by the returned monitor.
206 *
207 * 	- netmap_bwrap_adapter	      [netmap_vale.c]
208 * 	      Cannot be obtained in this way, see VALE_CTL below
209 *
210 *
211 * 	os-specific:
212 * 	    linux: we first go through linux_netmap_ioctl() to
213 * 	           adapt the FreeBSD interface to the linux one.
214 *
215 *
216 * > 3. on each descriptor, the process issues an mmap() request to
217 * >    map the shared memory region within the process' address space.
218 * >    The list of interesting queues is indicated by a location in
219 * >    the shared memory region.
220 *
221 *      os-specific:
222 *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
223 *  	    linux:   linux_netmap_mmap (netmap_linux.c).
224 *
225 * > 4. using the functions in the netmap(4) userspace API, a process
226 * >    can look up the occupation state of a queue, access memory buffers,
227 * >    and retrieve received packets or enqueue packets to transmit.
228 *
229 * 	these actions do not involve the kernel.
230 *
231 * > 5. using some ioctl()s the process can synchronize the userspace view
232 * >    of the queue with the actual status in the kernel. This includes both
233 * >    receiving the notification of new packets, and transmitting new
234 * >    packets on the output interface.
235 *
236 * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
237 * 	cases. They invoke the nm_sync callbacks on the netmap_kring
238 * 	structures, as initialized in step 2 and maybe later modified
239 * 	by a monitor. Monitors, however, will always call the original
240 * 	callback before doing anything else.
241 *
242 *
243 * > 6. select() or poll() can be used to wait for events on individual
244 * >    transmit or receive queues (or all queues for a given interface).
245 *
246 * 	Implemented in netmap_poll(). This will call the same nm_sync()
247 * 	callbacks as in step 5 above.
248 *
249 * 	os-specific:
250 * 		linux: we first go through linux_netmap_poll() to adapt
251 * 		       the FreeBSD interface to the linux one.
252 *
253 *
254 *  ----  VALE_CTL -----
255 *
256 *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
257 *  nr_cmd in the nmreq structure. These subcommands are handled by
258 *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
259 *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
260 *  subcommands, respectively.
261 *
262 *  Any network interface known to the system (including a persistent VALE
263 *  port) can be attached to a VALE switch by issuing the
264 *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
265 *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
266 *  attachment of other interfaces, instead, requires the creation of a
267 *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
268 *  netmap mode. This may require the creation of a netmap_generic_adapter if
269 *  we have no native support for the interface, or if generic adapters have
270 *  been forced by sysctl.
271 *
272 *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
273 *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
274 *  callback.  In the case of the bwrap, the callback creates the
275 *  netmap_bwrap_adapter.  The initialization of the bwrap is then
276 *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
277 *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
278 *  A generic adapter for the wrapped ifp will be created if needed, when
279 *  netmap_get_bdg_na() calls netmap_get_hw_na().
280 *
281 *
282 *  ---- DATAPATHS -----
283 *
284 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
285 *
286 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
287 *
288 *    - tx from netmap userspace:
289 *	 concurrently:
290 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
291 *                kring->nm_sync() == DEVICE_netmap_txsync()
292 *           2) device interrupt handler
293 *                na->nm_notify()  == netmap_notify()
294 *    - rx from netmap userspace:
295 *       concurrently:
296 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
297 *                kring->nm_sync() == DEVICE_netmap_rxsync()
298 *           2) device interrupt handler
299 *                na->nm_notify()  == netmap_notify()
300 *    - rx from host stack
301 *       concurrently:
302 *           1) host stack
303 *                netmap_transmit()
304 *                  na->nm_notify  == netmap_notify()
305 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
306 *                kring->nm_sync() == netmap_rxsync_from_host
307 *                  netmap_rxsync_from_host(na, NULL, NULL)
308 *    - tx to host stack
309 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
310 *             kring->nm_sync() == netmap_txsync_to_host
311 *               netmap_txsync_to_host(na)
312 *                 nm_os_send_up()
313 *                   FreeBSD: na->if_input() == ether_input()
314 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
315 *
316 *
317 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
318 *
319 *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
320 *
321 *    - tx from netmap userspace:
322 *       concurrently:
323 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
324 *               kring->nm_sync() == generic_netmap_txsync()
325 *                   nm_os_generic_xmit_frame()
326 *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
327 *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
328 *                               gna->save_start_xmit == orig. dev. start_xmit
329 *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
330 *           2) generic_mbuf_destructor()
331 *                   na->nm_notify() == netmap_notify()
332 *    - rx from netmap userspace:
333 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
334 *               kring->nm_sync() == generic_netmap_rxsync()
335 *                   mbq_safe_dequeue()
336 *           2) device driver
337 *               generic_rx_handler()
338 *                   mbq_safe_enqueue()
339 *                   na->nm_notify() == netmap_notify()
340 *    - rx from host stack
341 *        FreeBSD: same as native
342 *        Linux: same as native except:
343 *           1) host stack
344 *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
345 *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
346 *                       netmap_transmit()
347 *                           na->nm_notify() == netmap_notify()
348 *    - tx to host stack (same as native):
349 *
350 *
351 *                           -= VALE =-
352 *
353 *   INCOMING:
354 *
355 *      - VALE ports:
356 *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
357 *              kring->nm_sync() == netmap_vp_txsync()
358 *
359 *      - system device with native support:
360 *         from cable:
361 *             interrupt
362 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
363 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
364 *                     netmap_vp_txsync()
365 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
366 *         from host stack:
367 *             netmap_transmit()
368 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
369 *                     kring->nm_sync() == netmap_rxsync_from_host()
370 *                     netmap_vp_txsync()
371 *
372 *      - system device with generic support:
373 *         from device driver:
374 *            generic_rx_handler()
375 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
376 *                     kring->nm_sync() == generic_netmap_rxsync()
377 *                     netmap_vp_txsync()
378 *                     kring->nm_sync() == generic_netmap_rxsync()
379 *         from host stack:
380 *            netmap_transmit()
381 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
382 *                     kring->nm_sync() == netmap_rxsync_from_host()
383 *                     netmap_vp_txsync()
384 *
385 *   (all cases) --> nm_bdg_flush()
386 *                      dest_na->nm_notify() == (see below)
387 *
388 *   OUTGOING:
389 *
390 *      - VALE ports:
391 *         concurrently:
392 *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
393 *                    kring->nm_sync() == netmap_vp_rxsync()
394 *             2) from nm_bdg_flush()
395 *                    na->nm_notify() == netmap_notify()
396 *
397 *      - system device with native support:
398 *          to cable:
399 *             na->nm_notify() == netmap_bwrap_notify()
400 *                 netmap_vp_rxsync()
401 *                 kring->nm_sync() == DEVICE_netmap_txsync()
402 *                 netmap_vp_rxsync()
403 *          to host stack:
404 *                 netmap_vp_rxsync()
405 *                 kring->nm_sync() == netmap_txsync_to_host
406 *                 netmap_vp_rxsync_locked()
407 *
408 *      - system device with generic adapter:
409 *          to device driver:
410 *             na->nm_notify() == netmap_bwrap_notify()
411 *                 netmap_vp_rxsync()
412 *                 kring->nm_sync() == generic_netmap_txsync()
413 *                 netmap_vp_rxsync()
414 *          to host stack:
415 *                 netmap_vp_rxsync()
416 *                 kring->nm_sync() == netmap_txsync_to_host
417 *                 netmap_vp_rxsync()
418 *
419 */
420
421/*
422 * OS-specific code that is used only within this file.
423 * Other OS-specific code that must be accessed by drivers
424 * is present in netmap_kern.h
425 */
426
427#if defined(__FreeBSD__)
428#include <sys/cdefs.h> /* prerequisite */
429#include <sys/types.h>
430#include <sys/errno.h>
431#include <sys/param.h>	/* defines used in kernel.h */
432#include <sys/kernel.h>	/* types used in module initialization */
433#include <sys/conf.h>	/* cdevsw struct, UID, GID */
434#include <sys/filio.h>	/* FIONBIO */
435#include <sys/sockio.h>
436#include <sys/socketvar.h>	/* struct socket */
437#include <sys/malloc.h>
438#include <sys/poll.h>
439#include <sys/proc.h>
440#include <sys/rwlock.h>
441#include <sys/socket.h> /* sockaddrs */
442#include <sys/selinfo.h>
443#include <sys/sysctl.h>
444#include <sys/jail.h>
445#include <sys/epoch.h>
446#include <net/vnet.h>
447#include <net/if.h>
448#include <net/if_var.h>
449#include <net/bpf.h>		/* BIOCIMMEDIATE */
450#include <machine/bus.h>	/* bus_dmamap_* */
451#include <sys/endian.h>
452#include <sys/refcount.h>
453#include <net/ethernet.h>	/* ETHER_BPF_MTAP */
454
455
456#elif defined(linux)
457
458#include "bsd_glue.h"
459
460#elif defined(__APPLE__)
461
462#warning OSX support is only partial
463#include "osx_glue.h"
464
465#elif defined (_WIN32)
466
467#include "win_glue.h"
468
469#else
470
471#error	Unsupported platform
472
473#endif /* unsupported */
474
475/*
476 * common headers
477 */
478#include <net/netmap.h>
479#include <dev/netmap/netmap_kern.h>
480#include <dev/netmap/netmap_mem2.h>
481
482
483/* user-controlled variables */
484int netmap_verbose;
485#ifdef CONFIG_NETMAP_DEBUG
486int netmap_debug;
487#endif /* CONFIG_NETMAP_DEBUG */
488
489static int netmap_no_timestamp; /* don't timestamp on rxsync */
490int netmap_no_pendintr = 1;
491int netmap_txsync_retry = 2;
492static int netmap_fwd = 0;	/* force transparent forwarding */
493
494/*
495 * netmap_admode selects the netmap mode to use.
496 * Invalid values are reset to NETMAP_ADMODE_BEST
497 */
498enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
499	NETMAP_ADMODE_NATIVE,	/* either native or none */
500	NETMAP_ADMODE_GENERIC,	/* force generic */
501	NETMAP_ADMODE_LAST };
502static int netmap_admode = NETMAP_ADMODE_BEST;
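
/*
 * For example (illustrative only; the sysctl itself is declared further
 * below in this file), emulated adapters can be forced on FreeBSD with:
 *
 *	sysctl dev.netmap.admode=2	# NETMAP_ADMODE_GENERIC
 */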
503
504/* netmap_generic_mit controls mitigation of RX notifications for
505 * the generic netmap adapter. The value is a time interval in
506 * nanoseconds. */
507int netmap_generic_mit = 100*1000;
508
/* By default we use netmap-aware qdiscs with generic netmap adapters,
 * even if this can cause a small performance hit with hardware NICs.
 * However, using the qdisc is the safer approach, for two reasons:
 * 1) it prevents non-fifo qdiscs from breaking the TX notification
 *    scheme, which is based on mbuf destructors when txqdisc is
 *    not used;
 * 2) it makes it possible to transmit over software devices that
 *    change skb->dev, like bridge, veth, ...
 *
 * In any case, users looking for the best performance should
 * use native adapters.
520 */
521#ifdef linux
522int netmap_generic_txqdisc = 1;
523#endif
524
525/* Default number of slots and queues for generic adapters. */
526int netmap_generic_ringsize = 1024;
527int netmap_generic_rings = 1;
528
529/* Non-zero to enable checksum offloading in NIC drivers */
530int netmap_generic_hwcsum = 0;
531
532/* Non-zero if ptnet devices are allowed to use virtio-net headers. */
533int ptnet_vnet_hdr = 1;
534
535/*
 * SYSCTL calls are grouped between SYSBEGIN and SYSEND so that they can
 * be emulated on other operating systems.
538 */
539SYSBEGIN(main_init);
540
541SYSCTL_DECL(_dev_netmap);
542SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
543    "Netmap args");
544SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
545		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
546#ifdef CONFIG_NETMAP_DEBUG
547SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
548		CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
549#endif /* CONFIG_NETMAP_DEBUG */
550SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
551		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
552SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
553		0, "Always look for new received packets.");
554SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
555		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
556
557SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
558		"Force NR_FORWARD mode");
559SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
		"Adapter mode. 0 selects the best option available, "
561		"1 forces native adapter, 2 forces emulated adapter");
562SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default), "
564		"1 to enable checksum generation by the NIC");
565SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
566		0, "RX notification interval in nanoseconds");
567SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
568		&netmap_generic_ringsize, 0,
569		"Number of per-ring slots for emulated netmap mode");
570SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
571		&netmap_generic_rings, 0,
572		"Number of TX/RX queues for emulated netmap adapters");
573#ifdef linux
574SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
575		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
576#endif
577SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
578		0, "Allow ptnet devices to use virtio-net headers");
579
580SYSEND;
581
582NMG_LOCK_T	netmap_global_lock;
583
584/*
585 * mark the ring as stopped, and run through the locks
586 * to make sure other users get to see it.
 * stopped must be either NM_KR_STOPPED (for an unbounded stop)
 * or NM_KR_LOCKED (a brief stop for mutual exclusion purposes)
589 */
590static void
591netmap_disable_ring(struct netmap_kring *kr, int stopped)
592{
593	nm_kr_stop(kr, stopped);
594	// XXX check if nm_kr_stop is sufficient
595	mtx_lock(&kr->q_lock);
596	mtx_unlock(&kr->q_lock);
597	nm_kr_put(kr);
598}
599
600/* stop or enable a single ring */
601void
602netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
603{
604	if (stopped)
605		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
606	else
607		NMR(na, t)[ring_id]->nkr_stopped = 0;
608}
609
610
611/* stop or enable all the rings of na */
612void
613netmap_set_all_rings(struct netmap_adapter *na, int stopped)
614{
615	int i;
616	enum txrx t;
617
618	if (!nm_netmap_on(na))
619		return;
620
621	if (netmap_verbose) {
622		nm_prinf("%s: %sable all rings", na->name,
623		    (stopped ? "dis" : "en"));
624	}
625	for_rx_tx(t) {
626		for (i = 0; i < netmap_real_rings(na, t); i++) {
627			netmap_set_ring(na, i, t, stopped);
628		}
629	}
630}
631
632/*
633 * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
634 * to finish and prevents any new one from starting.  Call this before turning
635 * netmap mode off, or before removing the hardware rings (e.g., on module
 * unload).
637 */
638void
639netmap_disable_all_rings(if_t ifp)
640{
641	if (NM_NA_VALID(ifp)) {
642		netmap_set_all_rings(NA(ifp), NM_KR_LOCKED);
643	}
644}
645
646/*
647 * Convenience function used in drivers.  Re-enables rxsync and txsync on the
 * adapter's rings. In linux drivers, this should be placed near each
649 * napi_enable().
650 */
651void
652netmap_enable_all_rings(if_t ifp)
653{
654	if (NM_NA_VALID(ifp)) {
655		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
656	}
657}
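
/*
 * Illustrative driver-side pattern (a sketch, not taken from any specific
 * driver): bracket a hardware ring re-initialization so that no txsync()
 * or rxsync() can run while the rings are being reset:
 *
 *	netmap_disable_all_rings(ifp);
 *	// ... stop the hardware, reallocate/reprogram the rings ...
 *	netmap_enable_all_rings(ifp);
 */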
658
659void
660netmap_make_zombie(if_t ifp)
661{
662	if (NM_NA_VALID(ifp)) {
663		struct netmap_adapter *na = NA(ifp);
664		netmap_set_all_rings(na, NM_KR_LOCKED);
665		na->na_flags |= NAF_ZOMBIE;
666		netmap_set_all_rings(na, 0);
667	}
668}
669
670void
671netmap_undo_zombie(if_t ifp)
672{
673	if (NM_NA_VALID(ifp)) {
674		struct netmap_adapter *na = NA(ifp);
675		if (na->na_flags & NAF_ZOMBIE) {
676			netmap_set_all_rings(na, NM_KR_LOCKED);
677			na->na_flags &= ~NAF_ZOMBIE;
678			netmap_set_all_rings(na, 0);
679		}
680	}
681}
682
683/*
 * generic bound-checking function
685 */
686u_int
687nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
688{
689	u_int oldv = *v;
690	const char *op = NULL;
691
692	if (dflt < lo)
693		dflt = lo;
694	if (dflt > hi)
695		dflt = hi;
696	if (oldv < lo) {
697		*v = dflt;
698		op = "Bump";
699	} else if (oldv > hi) {
700		*v = hi;
701		op = "Clamp";
702	}
703	if (op && msg)
704		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
705	return *v;
706}
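
/*
 * Illustrative use of nm_bound_var() (a sketch; the bounds below are
 * arbitrary example values, not the ones used by the actual callers):
 * clamp a tunable, falling back to the default if the value is too small:
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 65536,
 *	    "netmap_generic_ringsize");
 */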
707
708
709/*
710 * packet-dump function, user-supplied or static buffer.
 * The destination buffer must be at least 30+4*len bytes.
712 */
713const char *
714nm_dump_buf(char *p, int len, int lim, char *dst)
715{
716	static char _dst[8192];
717	int i, j, i0;
718	static char hex[] ="0123456789abcdef";
719	char *o;	/* output position */
720
721#define P_HI(x)	hex[((x) & 0xf0)>>4]
722#define P_LO(x)	hex[((x) & 0xf)]
723#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
724	if (!dst)
725		dst = _dst;
726	if (lim <= 0 || lim > len)
727		lim = len;
728	o = dst;
729	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
730	o += strlen(o);
731	/* hexdump routine */
732	for (i = 0; i < lim; ) {
733		sprintf(o, "%5d: ", i);
734		o += strlen(o);
735		memset(o, ' ', 48);
736		i0 = i;
737		for (j=0; j < 16 && i < lim; i++, j++) {
738			o[j*3] = P_HI(p[i]);
739			o[j*3+1] = P_LO(p[i]);
740		}
741		i = i0;
742		for (j=0; j < 16 && i < lim; i++, j++)
743			o[j + 48] = P_C(p[i]);
744		o[j+48] = '\n';
745		o += j+49;
746	}
747	*o = '\0';
748#undef P_HI
749#undef P_LO
750#undef P_C
751	return dst;
752}
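
/*
 * Example use (see netmap_rxsync_from_host() below): dump at most the
 * first 128 bytes of a netmap buffer into the static scratch buffer:
 *
 *	nm_prinf("%s", nm_dump_buf(NMB(na, slot), slot->len, 128, NULL));
 */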
753
754
755/*
756 * Fetch configuration from the device, to cope with dynamic
757 * reconfigurations after loading the module.
758 */
759/* call with NMG_LOCK held */
760int
761netmap_update_config(struct netmap_adapter *na)
762{
763	struct nm_config_info info;
764
765	if (na->ifp && !nm_is_bwrap(na)) {
766		strlcpy(na->name, if_name(na->ifp), sizeof(na->name));
767	}
768
769	bzero(&info, sizeof(info));
770	if (na->nm_config == NULL ||
771	    na->nm_config(na, &info)) {
772		/* take whatever we had at init time */
773		info.num_tx_rings = na->num_tx_rings;
774		info.num_tx_descs = na->num_tx_desc;
775		info.num_rx_rings = na->num_rx_rings;
776		info.num_rx_descs = na->num_rx_desc;
777		info.rx_buf_maxsize = na->rx_buf_maxsize;
778	}
779
780	if (na->num_tx_rings == info.num_tx_rings &&
781	    na->num_tx_desc == info.num_tx_descs &&
782	    na->num_rx_rings == info.num_rx_rings &&
783	    na->num_rx_desc == info.num_rx_descs &&
784	    na->rx_buf_maxsize == info.rx_buf_maxsize)
785		return 0; /* nothing changed */
786	if (na->active_fds == 0) {
787		na->num_tx_rings = info.num_tx_rings;
788		na->num_tx_desc = info.num_tx_descs;
789		na->num_rx_rings = info.num_rx_rings;
790		na->num_rx_desc = info.num_rx_descs;
791		na->rx_buf_maxsize = info.rx_buf_maxsize;
792		if (netmap_verbose)
793			nm_prinf("configuration changed for %s: txring %d x %d, "
794				"rxring %d x %d, rxbufsz %d",
795				na->name, na->num_tx_rings, na->num_tx_desc,
796				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
797		return 0;
798	}
799	nm_prerr("WARNING: configuration changed for %s while active: "
800		"txring %d x %d, rxring %d x %d, rxbufsz %d",
801		na->name, info.num_tx_rings, info.num_tx_descs,
802		info.num_rx_rings, info.num_rx_descs,
803		info.rx_buf_maxsize);
804	return 1;
805}
806
807/* nm_sync callbacks for the host rings */
808static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
809static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
810
811static int
812netmap_default_bufcfg(struct netmap_kring *kring, uint64_t target)
813{
814	kring->hwbuf_len = target;
815	kring->buf_align = 0; /* no alignment */
816	return 0;
817}
818
819/* create the krings array and initialize the fields common to all adapters.
820 * The array layout is this:
821 *
822 *                    +----------+
823 * na->tx_rings ----->|          | \
 *                    |          |  } na->num_tx_rings
825 *                    |          | /
826 *                    +----------+
827 *                    |          |    host tx kring
828 * na->rx_rings ----> +----------+
829 *                    |          | \
830 *                    |          |  } na->num_rx_rings
831 *                    |          | /
832 *                    +----------+
833 *                    |          |    host rx kring
834 *                    +----------+
835 * na->tailroom ----->|          | \
836 *                    |          |  } tailroom bytes
837 *                    |          | /
838 *                    +----------+
839 *
840 * Note: for compatibility, host krings are created even when not needed.
841 * The tailroom space is currently used by vale ports for allocating leases.
842 */
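/*
 * With this layout, for example, the i-th hardware RX kring is
 * NMR(na, NR_RX)[i] == na->rx_rings[i], while the (first) host RX kring,
 * when present, is na->rx_rings[na->num_rx_rings] (see e.g.
 * netmap_sw_to_nic() below).
 */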
843/* call with NMG_LOCK held */
844int
845netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
846{
847	u_int i, len, ndesc;
848	struct netmap_kring *kring;
849	u_int n[NR_TXRX];
850	enum txrx t;
851	int err = 0;
852
853	if (na->tx_rings != NULL) {
854		if (netmap_debug & NM_DEBUG_ON)
855			nm_prerr("warning: krings were already created");
856		return 0;
857	}
858
859	/* account for the (possibly fake) host rings */
860	n[NR_TX] = netmap_all_rings(na, NR_TX);
861	n[NR_RX] = netmap_all_rings(na, NR_RX);
862
863	len = (n[NR_TX] + n[NR_RX]) *
864		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
865		+ tailroom;
866
867	na->tx_rings = nm_os_malloc((size_t)len);
868	if (na->tx_rings == NULL) {
869		nm_prerr("Cannot allocate krings");
870		return ENOMEM;
871	}
872	na->rx_rings = na->tx_rings + n[NR_TX];
873	na->tailroom = na->rx_rings + n[NR_RX];
874
875	/* link the krings in the krings array */
876	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
877	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
878		na->tx_rings[i] = kring;
879		kring++;
880	}
881
882	/*
	 * All fields in krings are 0 except the ones initialized below,
	 * but it is better to be explicit on important kring fields.
885	 */
886	for_rx_tx(t) {
887		ndesc = nma_get_ndesc(na, t);
888		for (i = 0; i < n[t]; i++) {
889			kring = NMR(na, t)[i];
890			bzero(kring, sizeof(*kring));
891			kring->notify_na = na;
892			kring->ring_id = i;
893			kring->tx = t;
894			kring->nkr_num_slots = ndesc;
895			kring->nr_mode = NKR_NETMAP_OFF;
896			kring->nr_pending_mode = NKR_NETMAP_OFF;
897			if (i < nma_get_nrings(na, t)) {
898				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
899				kring->nm_bufcfg = na->nm_bufcfg;
900				if (kring->nm_bufcfg == NULL)
901					kring->nm_bufcfg = netmap_default_bufcfg;
902			} else {
903				if (!(na->na_flags & NAF_HOST_RINGS))
904					kring->nr_kflags |= NKR_FAKERING;
905				kring->nm_sync = (t == NR_TX ?
906						netmap_txsync_to_host:
907						netmap_rxsync_from_host);
908				kring->nm_bufcfg = netmap_default_bufcfg;
909			}
910			kring->nm_notify = na->nm_notify;
911			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
912			/*
913			 * IMPORTANT: Always keep one slot empty.
914			 */
915			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
916			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
917					nm_txrx2str(t), i);
918			nm_prdis("ktx %s h %d c %d t %d",
919				kring->name, kring->rhead, kring->rcur, kring->rtail);
920			err = nm_os_selinfo_init(&kring->si, kring->name);
921			if (err) {
922				netmap_krings_delete(na);
923				return err;
924			}
925			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
926			kring->na = na;	/* setting this field marks the mutex as initialized */
927		}
928		err = nm_os_selinfo_init(&na->si[t], na->name);
929		if (err) {
930			netmap_krings_delete(na);
931			return err;
932		}
933	}
934
935	return 0;
936}
937
938
939/* undo the actions performed by netmap_krings_create */
940/* call with NMG_LOCK held */
941void
942netmap_krings_delete(struct netmap_adapter *na)
943{
944	struct netmap_kring **kring = na->tx_rings;
945	enum txrx t;
946
947	if (na->tx_rings == NULL) {
948		if (netmap_debug & NM_DEBUG_ON)
949			nm_prerr("warning: krings were already deleted");
950		return;
951	}
952
953	for_rx_tx(t)
954		nm_os_selinfo_uninit(&na->si[t]);
955
956	/* we rely on the krings layout described above */
957	for ( ; kring != na->tailroom; kring++) {
958		if ((*kring)->na != NULL)
959			mtx_destroy(&(*kring)->q_lock);
960		nm_os_selinfo_uninit(&(*kring)->si);
961	}
962	nm_os_free(na->tx_rings);
963	na->tx_rings = na->rx_rings = na->tailroom = NULL;
964}
965
966
967/*
968 * Destructor for NIC ports. They also have an mbuf queue
969 * on the rings connected to the host so we need to purge
970 * them first.
971 */
972/* call with NMG_LOCK held */
973void
974netmap_hw_krings_delete(struct netmap_adapter *na)
975{
976	u_int lim = netmap_real_rings(na, NR_RX), i;
977
978	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
979		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
980		nm_prdis("destroy sw mbq with len %d", mbq_len(q));
981		mbq_purge(q);
982		mbq_safe_fini(q);
983	}
984	netmap_krings_delete(na);
985}
986
987void
988netmap_mem_restore(struct netmap_adapter *na)
989{
990	if (na->nm_mem_prev) {
991		netmap_mem_put(na->nm_mem);
992		na->nm_mem = na->nm_mem_prev;
993		na->nm_mem_prev = NULL;
994	}
995}
996
997static void
998netmap_mem_drop(struct netmap_adapter *na)
999{
1000	netmap_mem_deref(na->nm_mem, na);
1001
1002	if (na->active_fds <= 0) {
1003		/* if the native allocator had been overridden on regif,
1004		 * restore it now and drop the temporary one
1005		 */
1006		netmap_mem_restore(na);
1007	}
1008}
1009
1010static void
1011netmap_update_hostrings_mode(struct netmap_adapter *na)
1012{
1013	enum txrx t;
1014	struct netmap_kring *kring;
1015	int i;
1016
1017	for_rx_tx(t) {
1018		for (i = nma_get_nrings(na, t);
1019		     i < netmap_real_rings(na, t); i++) {
1020			kring = NMR(na, t)[i];
1021			kring->nr_mode = kring->nr_pending_mode;
1022		}
1023	}
1024}
1025
1026/*
1027 * Undo everything that was done in netmap_do_regif(). In particular,
 * call na->nm_register(na, 0) to stop netmap mode on the interface and
1029 * revert to normal operation.
1030 */
1031/* call with NMG_LOCK held */
1032static void netmap_unset_ringid(struct netmap_priv_d *);
1033static void netmap_krings_put(struct netmap_priv_d *);
1034void
1035netmap_do_unregif(struct netmap_priv_d *priv)
1036{
1037	struct netmap_adapter *na = priv->np_na;
1038
1039	NMG_LOCK_ASSERT();
1040	na->active_fds--;
1041	/* unset nr_pending_mode and possibly release exclusive mode */
1042	netmap_krings_put(priv);
1043
1044#ifdef	WITH_MONITOR
1045	/* XXX check whether we have to do something with monitor
1046	 * when rings change nr_mode. */
1047	if (na->active_fds <= 0) {
1048		/* walk through all the rings and tell any monitor
1049		 * that the port is going to exit netmap mode
1050		 */
1051		netmap_monitor_stop(na);
1052	}
1053#endif
1054
1055	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
1056		netmap_set_all_rings(na, NM_KR_LOCKED);
1057		na->nm_register(na, 0);
1058		netmap_set_all_rings(na, 0);
1059	}
1060
1061	/* delete rings and buffers that are no longer needed */
1062	netmap_mem_rings_delete(na);
1063
1064	if (na->active_fds <= 0) {	/* last instance */
1065		/*
1066		 * (TO CHECK) We enter here
1067		 * when the last reference to this file descriptor goes
1068		 * away. This means we cannot have any pending poll()
1069		 * or interrupt routine operating on the structure.
1070		 * XXX The file may be closed in a thread while
1071		 * another thread is using it.
1072		 * Linux keeps the file opened until the last reference
1073		 * by any outstanding ioctl/poll or mmap is gone.
1074		 * FreeBSD does not track mmap()s (but we do) and
1075		 * wakes up any sleeping poll(). Need to check what
1076		 * happens if the close() occurs while a concurrent
1077		 * syscall is running.
1078		 */
1079		if (netmap_debug & NM_DEBUG_ON)
1080			nm_prinf("deleting last instance for %s", na->name);
1081
1082		if (nm_netmap_on(na)) {
1083			nm_prerr("BUG: netmap on while going to delete the krings");
1084		}
1085
1086		na->nm_krings_delete(na);
1087
1088		/* restore the default number of host tx and rx rings */
1089		if (na->na_flags & NAF_HOST_RINGS) {
1090			na->num_host_tx_rings = 1;
1091			na->num_host_rx_rings = 1;
1092		} else {
1093			na->num_host_tx_rings = 0;
1094			na->num_host_rx_rings = 0;
1095		}
1096	}
1097
1098	/* possibly decrement counter of tx_si/rx_si users */
1099	netmap_unset_ringid(priv);
1100	/* delete the nifp */
1101	netmap_mem_if_delete(na, priv->np_nifp);
1102	/* drop the allocator */
1103	netmap_mem_drop(na);
1104	/* mark the priv as unregistered */
1105	priv->np_na = NULL;
1106	priv->np_nifp = NULL;
1107}
1108
1109struct netmap_priv_d*
1110netmap_priv_new(void)
1111{
1112	struct netmap_priv_d *priv;
1113
1114	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1115	if (priv == NULL)
1116		return NULL;
1117	priv->np_refs = 1;
1118	nm_os_get_module();
1119	return priv;
1120}
1121
1122/*
 * Destructor of the netmap_priv_d, called when the fd is closed.
 * Action: undo all the things done by NIOCREGIF.
 * On FreeBSD we need to track whether there are active mmap()s,
 * and we use np_active_mmaps for that. On linux, the field is always 0.
 * The priv itself is freed only when the last reference goes away
 * (see np_refs below).
1129 */
1130/* call with NMG_LOCK held */
1131void
1132netmap_priv_delete(struct netmap_priv_d *priv)
1133{
1134	struct netmap_adapter *na = priv->np_na;
1135
1136	/* number of active references to this fd */
1137	if (--priv->np_refs > 0) {
1138		return;
1139	}
1140	nm_os_put_module();
1141	if (na) {
1142		netmap_do_unregif(priv);
1143	}
1144	netmap_unget_na(na, priv->np_ifp);
1145	bzero(priv, sizeof(*priv));	/* for safety */
1146	nm_os_free(priv);
1147}
1148
1149
1150/* call with NMG_LOCK *not* held */
1151void
1152netmap_dtor(void *data)
1153{
1154	struct netmap_priv_d *priv = data;
1155
1156	NMG_LOCK();
1157	netmap_priv_delete(priv);
1158	NMG_UNLOCK();
1159}
1160
1161
1162/*
1163 * Handlers for synchronization of the rings from/to the host stack.
1164 * These are associated to a network interface and are just another
1165 * ring pair managed by userspace.
1166 *
1167 * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1168 * flags):
1169 *
1170 * - Before releasing buffers on hw RX rings, the application can mark
1171 *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
 *   will be forwarded to the host stack, similarly to what would happen
 *   if the application had moved them to the host TX ring.
1174 *
1175 * - Before releasing buffers on the host RX ring, the application can
1176 *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1177 *   they will be forwarded to the hw TX rings, saving the application
1178 *   from doing the same task in user-space.
1179 *
1180 * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1181 * flag, or globally with the netmap_fwd sysctl.
1182 *
 * The NIC --> host transfer is relatively easy: we just encapsulate
 * the packets into mbufs and we are done. The host --> NIC side is
 * slightly harder because there might not be room in the tx ring, so
 * it might take a while before releasing the buffer.
1187 */
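
/*
 * A minimal userspace sketch of the first case above (fd and nifp as
 * obtained in the usage example at the top of this file; error handling
 * omitted): mark a received slot before releasing it, so that the next
 * rxsync pushes a copy to the host stack:
 *
 *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
 *	uint32_t i = ring->cur;
 *	ring->slot[i].flags |= NS_FORWARD;
 *	ring->head = ring->cur = nm_ring_next(ring, i);
 *	ioctl(fd, NIOCRXSYNC, NULL);
 *
 * The ring must have NR_FORWARD set in ring->flags (or the netmap_fwd
 * sysctl must be enabled) for the flag to take effect.
 */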
1188
1189
1190/*
 * Pass a whole queue of mbufs to the host stack as coming from 'dst'.
1192 * We do not need to lock because the queue is private.
1193 * After this call the queue is empty.
1194 */
1195static void
1196netmap_send_up(if_t dst, struct mbq *q)
1197{
1198	struct mbuf *m;
1199	struct mbuf *head = NULL, *prev = NULL;
1200#ifdef __FreeBSD__
1201	struct epoch_tracker et;
1202
1203	NET_EPOCH_ENTER(et);
1204#endif /* __FreeBSD__ */
1205	/* Send packets up, outside the lock; head/prev machinery
1206	 * is only useful for Windows. */
1207	while ((m = mbq_dequeue(q)) != NULL) {
1208		if (netmap_debug & NM_DEBUG_HOST)
1209			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1210		prev = nm_os_send_up(dst, m, prev);
1211		if (head == NULL)
1212			head = prev;
1213	}
1214	if (head)
1215		nm_os_send_up(dst, NULL, head);
1216#ifdef __FreeBSD__
1217	NET_EPOCH_EXIT(et);
1218#endif /* __FreeBSD__ */
1219	mbq_fini(q);
1220}
1221
1222
1223/*
1224 * Scan the buffers from hwcur to ring->head, and put a copy of those
1225 * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1226 * Drop remaining packets in the unlikely event
1227 * of an mbuf shortage.
1228 */
1229static void
1230netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1231{
1232	u_int const lim = kring->nkr_num_slots - 1;
1233	u_int const head = kring->rhead;
1234	u_int n;
1235	struct netmap_adapter *na = kring->na;
1236
1237	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1238		struct mbuf *m;
1239		struct netmap_slot *slot = &kring->ring->slot[n];
1240
1241		if ((slot->flags & NS_FORWARD) == 0 && !force)
1242			continue;
1243		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1244			nm_prlim(5, "bad pkt at %d len %d", n, slot->len);
1245			continue;
1246		}
1247		slot->flags &= ~NS_FORWARD; // XXX needed ?
1248		/* XXX TODO: adapt to the case of a multisegment packet */
1249		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1250
1251		if (m == NULL)
1252			break;
1253		mbq_enqueue(q, m);
1254	}
1255}
1256
1257static inline int
1258_nm_may_forward(struct netmap_kring *kring)
1259{
1260	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1261		 kring->na->na_flags & NAF_HOST_RINGS &&
1262		 kring->tx == NR_RX);
1263}
1264
1265static inline int
1266nm_may_forward_up(struct netmap_kring *kring)
1267{
1268	return	_nm_may_forward(kring) &&
1269		 kring->ring_id != kring->na->num_rx_rings;
1270}
1271
1272static inline int
1273nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1274{
1275	return	_nm_may_forward(kring) &&
1276		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1277		 kring->ring_id == kring->na->num_rx_rings;
1278}
1279
1280/*
1281 * Send to the NIC rings packets marked NS_FORWARD between
1282 * kring->nr_hwcur and kring->rhead.
1283 * Called under kring->rx_queue.lock on the sw rx ring.
1284 *
1285 * It can only be called if the user opened all the TX hw rings,
1286 * see NAF_CAN_FORWARD_DOWN flag.
1287 * We can touch the TX netmap rings (slots, head and cur) since
1288 * we are in poll/ioctl system call context, and the application
1289 * is not supposed to touch the ring (using a different thread)
1290 * during the execution of the system call.
1291 */
1292static u_int
1293netmap_sw_to_nic(struct netmap_adapter *na)
1294{
1295	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1296	struct netmap_slot *rxslot = kring->ring->slot;
1297	u_int i, rxcur = kring->nr_hwcur;
1298	u_int const head = kring->rhead;
1299	u_int const src_lim = kring->nkr_num_slots - 1;
1300	u_int sent = 0;
1301
1302	/* scan rings to find space, then fill as much as possible */
1303	for (i = 0; i < na->num_tx_rings; i++) {
1304		struct netmap_kring *kdst = na->tx_rings[i];
1305		struct netmap_ring *rdst = kdst->ring;
1306		u_int const dst_lim = kdst->nkr_num_slots - 1;
1307
1308		/* XXX do we trust ring or kring->rcur,rtail ? */
1309		for (; rxcur != head && !nm_ring_empty(rdst);
1310		     rxcur = nm_next(rxcur, src_lim) ) {
1311			struct netmap_slot *src, *dst, tmp;
1312			u_int dst_head = rdst->head;
1313
1314			src = &rxslot[rxcur];
1315			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1316				continue;
1317
1318			sent++;
1319
1320			dst = &rdst->slot[dst_head];
1321
1322			tmp = *src;
1323
1324			src->buf_idx = dst->buf_idx;
1325			src->flags = NS_BUF_CHANGED;
1326
1327			dst->buf_idx = tmp.buf_idx;
1328			dst->len = tmp.len;
1329			dst->flags = NS_BUF_CHANGED;
1330
1331			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1332		}
1333		/* if (sent) XXX txsync ? it would be just an optimization */
1334	}
1335	return sent;
1336}
1337
1338
1339/*
1340 * netmap_txsync_to_host() passes packets up. We are called from a
1341 * system call in user process context, and the only contention
1342 * can be among multiple user threads erroneously calling
1343 * this routine concurrently.
1344 */
1345static int
1346netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1347{
1348	struct netmap_adapter *na = kring->na;
1349	u_int const lim = kring->nkr_num_slots - 1;
1350	u_int const head = kring->rhead;
1351	struct mbq q;
1352
1353	/* Take packets from hwcur to head and pass them up.
1354	 * Force hwcur = head since netmap_grab_packets() stops at head
1355	 */
1356	mbq_init(&q);
1357	netmap_grab_packets(kring, &q, 1 /* force */);
1358	nm_prdis("have %d pkts in queue", mbq_len(&q));
1359	kring->nr_hwcur = head;
1360	kring->nr_hwtail = head + lim;
1361	if (kring->nr_hwtail > lim)
1362		kring->nr_hwtail -= lim + 1;
1363
1364	netmap_send_up(na->ifp, &q);
1365	return 0;
1366}
1367
1368
1369/*
1370 * rxsync backend for packets coming from the host stack.
1371 * They have been put in kring->rx_queue by netmap_transmit().
 * We protect access to the kring using kring->rx_queue.lock.
 *
 * This routine also moves to the NIC hw rings any packet the user has
 * marked for transparent-mode forwarding, then sets the NR_FORWARD
 * flag in the kring to let the caller push them out.
1377 */
1378static int
1379netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1380{
1381	struct netmap_adapter *na = kring->na;
1382	struct netmap_ring *ring = kring->ring;
1383	u_int nm_i, n;
1384	u_int const lim = kring->nkr_num_slots - 1;
1385	u_int const head = kring->rhead;
1386	int ret = 0;
1387	struct mbq *q = &kring->rx_queue, fq;
1388
1389	mbq_init(&fq); /* fq holds packets to be freed */
1390
1391	mbq_lock(q);
1392
1393	/* First part: import newly received packets */
1394	n = mbq_len(q);
1395	if (n) { /* grab packets from the queue */
1396		struct mbuf *m;
1397		uint32_t stop_i;
1398
1399		nm_i = kring->nr_hwtail;
1400		stop_i = nm_prev(kring->nr_hwcur, lim);
1401		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1402			int len = MBUF_LEN(m);
1403			struct netmap_slot *slot = &ring->slot[nm_i];
1404
1405			m_copydata(m, 0, len, NMB(na, slot));
1406			nm_prdis("nm %d len %d", nm_i, len);
1407			if (netmap_debug & NM_DEBUG_HOST)
1408				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1409
1410			slot->len = len;
1411			slot->flags = 0;
1412			nm_i = nm_next(nm_i, lim);
1413			mbq_enqueue(&fq, m);
1414		}
1415		kring->nr_hwtail = nm_i;
1416	}
1417
1418	/*
1419	 * Second part: skip past packets that userspace has released.
1420	 */
1421	nm_i = kring->nr_hwcur;
1422	if (nm_i != head) { /* something was released */
1423		if (nm_may_forward_down(kring, flags)) {
1424			ret = netmap_sw_to_nic(na);
1425			if (ret > 0) {
1426				kring->nr_kflags |= NR_FORWARD;
1427				ret = 0;
1428			}
1429		}
1430		kring->nr_hwcur = head;
1431	}
1432
1433	mbq_unlock(q);
1434
1435	mbq_purge(&fq);
1436	mbq_fini(&fq);
1437
1438	return ret;
1439}
1440
1441
1442/* Get a netmap adapter for the port.
1443 *
1444 * If it is possible to satisfy the request, return 0
1445 * with *na containing the netmap adapter found.
1446 * Otherwise return an error code, with *na containing NULL.
1447 *
1448 * When the port is attached to a bridge, we always return
1449 * EBUSY.
1450 * Otherwise, if the port is already bound to a file descriptor,
1451 * then we unconditionally return the existing adapter into *na.
1452 * In all the other cases, we return (into *na) either native,
1453 * generic or NULL, according to the following table:
1454 *
1455 *					native_support
1456 * active_fds   dev.netmap.admode         YES     NO
1457 * -------------------------------------------------------
1458 *    >0              *                 NA(ifp) NA(ifp)
1459 *
1460 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1461 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1462 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1463 *
1464 */
1465static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1466int
1467netmap_get_hw_na(if_t ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1468{
1469	/* generic support */
1470	int i = netmap_admode;	/* Take a snapshot. */
1471	struct netmap_adapter *prev_na;
1472	int error = 0;
1473
1474	*na = NULL; /* default */
1475
1476	/* reset in case of invalid value */
1477	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1478		i = netmap_admode = NETMAP_ADMODE_BEST;
1479
1480	if (NM_NA_VALID(ifp)) {
1481		prev_na = NA(ifp);
1482		/* If an adapter already exists, return it if
1483		 * there are active file descriptors or if
1484		 * netmap is not forced to use generic
1485		 * adapters.
1486		 */
1487		if (NETMAP_OWNED_BY_ANY(prev_na)
1488			|| i != NETMAP_ADMODE_GENERIC
1489			|| prev_na->na_flags & NAF_FORCE_NATIVE
1490#ifdef WITH_PIPES
1491			/* ugly, but we cannot allow an adapter switch
1492			 * if some pipe is referring to this one
1493			 */
1494			|| prev_na->na_next_pipe > 0
1495#endif
1496		) {
1497			*na = prev_na;
1498			goto assign_mem;
1499		}
1500	}
1501
1502	/* If there isn't native support and netmap is not allowed
1503	 * to use generic adapters, we cannot satisfy the request.
1504	 */
1505	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1506		return EOPNOTSUPP;
1507
1508	/* Otherwise, create a generic adapter and return it,
1509	 * saving the previously used netmap adapter, if any.
1510	 *
1511	 * Note that here 'prev_na', if not NULL, MUST be a
1512	 * native adapter, and CANNOT be a generic one. This is
1513	 * true because generic adapters are created on demand, and
1514	 * destroyed when not used anymore. Therefore, if the adapter
1515	 * currently attached to an interface 'ifp' is generic, it
1516	 * must be that
1517	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1518	 * Consequently, if NA(ifp) is generic, we will enter one of
1519	 * the branches above. This ensures that we never override
1520	 * a generic adapter with another generic adapter.
1521	 */
1522	error = generic_netmap_attach(ifp);
1523	if (error)
1524		return error;
1525
1526	*na = NA(ifp);
1527
1528assign_mem:
1529	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1530	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1531		(*na)->nm_mem_prev = (*na)->nm_mem;
1532		(*na)->nm_mem = netmap_mem_get(nmd);
1533	}
1534
1535	return 0;
1536}
1537
1538/*
1539 * MUST BE CALLED UNDER NMG_LOCK()
1540 *
1541 * Get a refcounted reference to a netmap adapter attached
1542 * to the interface specified by req.
1543 * This is always called in the execution of an ioctl().
1544 *
1545 * Return ENXIO if the interface specified by the request does
1546 * not exist, ENOTSUP if netmap is not supported by the interface,
1547 * EBUSY if the interface is already attached to a bridge,
1548 * EINVAL if parameters are invalid, ENOMEM if needed resources
1549 * could not be allocated.
1550 * If successful, hold a reference to the netmap adapter.
1551 *
1552 * If the interface specified by req is a system one, also keep
1553 * a reference to it and return a valid *ifp.
1554 */
1555int
1556netmap_get_na(struct nmreq_header *hdr,
1557	      struct netmap_adapter **na, if_t *ifp,
1558	      struct netmap_mem_d *nmd, int create)
1559{
1560	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1561	int error = 0;
1562	struct netmap_adapter *ret = NULL;
1563	int nmd_ref = 0;
1564
1565	*na = NULL;     /* default return value */
1566	*ifp = NULL;
1567
1568	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1569		return EINVAL;
1570	}
1571
1572	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1573			req->nr_mode == NR_REG_PIPE_SLAVE) {
1574		/* Do not accept deprecated pipe modes. */
1575		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1576		return EINVAL;
1577	}
1578
1579	NMG_LOCK_ASSERT();
1580
	/* if the request contains a memid, try to find the
1582	 * corresponding memory region
1583	 */
1584	if (nmd == NULL && req->nr_mem_id) {
1585		nmd = netmap_mem_find(req->nr_mem_id);
1586		if (nmd == NULL)
1587			return EINVAL;
		/* keep the reference */
1589		nmd_ref = 1;
1590	}
1591
1592	/* We cascade through all possible types of netmap adapter.
1593	 * All netmap_get_*_na() functions return an error and an na,
1594	 * with the following combinations:
1595	 *
1596	 * error    na
1597	 *   0	   NULL		type doesn't match
1598	 *  !0	   NULL		type matches, but na creation/lookup failed
1599	 *   0	  !NULL		type matches and na created/found
1600	 *  !0    !NULL		impossible
1601	 */
1602	error = netmap_get_null_na(hdr, na, nmd, create);
1603	if (error || *na != NULL)
1604		goto out;
1605
1606	/* try to see if this is a monitor port */
1607	error = netmap_get_monitor_na(hdr, na, nmd, create);
1608	if (error || *na != NULL)
1609		goto out;
1610
1611	/* try to see if this is a pipe port */
1612	error = netmap_get_pipe_na(hdr, na, nmd, create);
1613	if (error || *na != NULL)
1614		goto out;
1615
1616	/* try to see if this is a vale port */
1617	error = netmap_get_vale_na(hdr, na, nmd, create);
1618	if (error)
1619		goto out;
1620
	if (*na != NULL) /* valid match in netmap_get_vale_na() */
1622		goto out;
1623
1624	/*
1625	 * This must be a hardware na, lookup the name in the system.
1626	 * Note that by hardware we actually mean "it shows up in ifconfig".
1627	 * This may still be a tap, a veth/epair, or even a
1628	 * persistent VALE port.
1629	 */
1630	*ifp = ifunit_ref(hdr->nr_name);
1631	if (*ifp == NULL) {
1632		error = ENXIO;
1633		goto out;
1634	}
1635
1636	error = netmap_get_hw_na(*ifp, nmd, &ret);
1637	if (error)
1638		goto out;
1639
1640	*na = ret;
1641	netmap_adapter_get(ret);
1642
1643	/*
1644	 * if the adapter supports the host rings and it is not already open,
1645	 * try to set the number of host rings as requested by the user
1646	 */
1647	if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) {
1648		if (req->nr_host_tx_rings)
1649			(*na)->num_host_tx_rings = req->nr_host_tx_rings;
1650		if (req->nr_host_rx_rings)
1651			(*na)->num_host_rx_rings = req->nr_host_rx_rings;
1652	}
1653	nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings,
1654			(*na)->num_host_rx_rings);
1655
1656out:
1657	if (error) {
1658		if (ret)
1659			netmap_adapter_put(ret);
1660		if (*ifp) {
1661			if_rele(*ifp);
1662			*ifp = NULL;
1663		}
1664	}
1665	if (nmd_ref)
1666		netmap_mem_put(nmd);
1667
1668	return error;
1669}
1670
1671/* undo netmap_get_na() */
1672void
1673netmap_unget_na(struct netmap_adapter *na, if_t ifp)
1674{
1675	if (ifp)
1676		if_rele(ifp);
1677	if (na)
1678		netmap_adapter_put(na);
1679}
1680
1681
1682#define NM_FAIL_ON(t) do {						\
1683	if (unlikely(t)) {						\
1684		nm_prlim(5, "%s: fail '" #t "' "				\
1685			"h %d c %d t %d "				\
1686			"rh %d rc %d rt %d "				\
1687			"hc %d ht %d",					\
1688			kring->name,					\
1689			head, cur, ring->tail,				\
1690			kring->rhead, kring->rcur, kring->rtail,	\
1691			kring->nr_hwcur, kring->nr_hwtail);		\
1692		return kring->nkr_num_slots;				\
1693	}								\
1694} while (0)
1695
1696/*
1697 * validate parameters on entry for *_txsync()
 * Returns ring->head if ok, or something >= kring->nkr_num_slots
1699 * in case of error.
1700 *
1701 * rhead, rcur and rtail=hwtail are stored from previous round.
1702 * hwcur is the next packet to send to the ring.
1703 *
1704 * We want
1705 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1706 *
1707 * hwcur, rhead, rtail and hwtail are reliable
1708 */
1709u_int
1710nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1711{
1712	u_int head = NM_ACCESS_ONCE(ring->head);
1713	u_int cur = NM_ACCESS_ONCE(ring->cur);
1714	u_int n = kring->nkr_num_slots;
1715
1716	nm_prdis(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1717		kring->name,
1718		kring->nr_hwcur, kring->nr_hwtail,
1719		ring->head, ring->cur, ring->tail);
1720#if 1 /* kernel sanity checks; but we can trust the kring. */
1721	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1722	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1723#endif /* kernel sanity checks */
1724	/*
1725	 * user sanity checks. We only use head,
1726	 * A, B, ... are possible positions for head:
1727	 *
1728	 *  0    A  rhead   B  rtail   C  n-1
1729	 *  0    D  rtail   E  rhead   F  n-1
1730	 *
1731	 * B, F, D are valid. A, C, E are wrong
1732	 */
1733	if (kring->rtail >= kring->rhead) {
1734		/* want rhead <= head <= rtail */
1735		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1736		/* and also head <= cur <= rtail */
1737		NM_FAIL_ON(cur < head || cur > kring->rtail);
1738	} else { /* here rtail < rhead */
1739		/* we need head outside rtail .. rhead */
1740		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1741
1742		/* two cases now: head <= rtail or head >= rhead  */
1743		if (head <= kring->rtail) {
1744			/* want head <= cur <= rtail */
1745			NM_FAIL_ON(cur < head || cur > kring->rtail);
1746		} else { /* head >= rhead */
1747			/* cur must be outside rtail..head */
1748			NM_FAIL_ON(cur > kring->rtail && cur < head);
1749		}
1750	}
1751	if (ring->tail != kring->rtail) {
1752		nm_prlim(5, "%s tail overwritten was %d need %d", kring->name,
1753			ring->tail, kring->rtail);
1754		ring->tail = kring->rtail;
1755	}
1756	kring->rhead = head;
1757	kring->rcur = cur;
1758	return head;
1759}
1760
1761
1762/*
1763 * validate parameters on entry for *_rxsync()
1764 * Returns ring->head if ok, kring->nkr_num_slots on error.
1765 *
1766 * For a valid configuration,
1767 * hwcur <= head <= cur <= tail <= hwtail
1768 *
1769 * We only consider head and cur.
1770 * hwcur and hwtail are reliable.
1771 *
1772 */
1773u_int
1774nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1775{
1776	uint32_t const n = kring->nkr_num_slots;
1777	uint32_t head, cur;
1778
1779	nm_prdis(5,"%s kc %d kt %d h %d c %d t %d",
1780		kring->name,
1781		kring->nr_hwcur, kring->nr_hwtail,
1782		ring->head, ring->cur, ring->tail);
1783	/*
1784	 * Before storing the new values, we should check they do not
1785	 * move backwards. However:
1786	 * - head is not an issue because the previous value is hwcur;
1787	 * - cur could in principle go back, however it does not matter
1788	 *   because we are processing a brand new rxsync()
1789	 */
1790	cur = kring->rcur = NM_ACCESS_ONCE(ring->cur);
1791	head = kring->rhead = NM_ACCESS_ONCE(ring->head);
1792#if 1 /* kernel sanity checks */
1793	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1794#endif /* kernel sanity checks */
1795	/* user sanity checks */
1796	if (kring->nr_hwtail >= kring->nr_hwcur) {
1797		/* want hwcur <= rhead <= hwtail */
1798		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1799		/* and also rhead <= rcur <= hwtail */
1800		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1801	} else {
1802		/* we need rhead outside hwtail..hwcur */
1803		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1804		/* two cases now: head <= hwtail or head >= hwcur  */
1805		if (head <= kring->nr_hwtail) {
1806			/* want head <= cur <= hwtail */
1807			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1808		} else {
1809			/* cur must be outside hwtail..head */
1810			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1811		}
1812	}
1813	if (ring->tail != kring->rtail) {
1814		nm_prlim(5, "%s tail overwritten was %d need %d",
1815			kring->name,
1816			ring->tail, kring->rtail);
1817		ring->tail = kring->rtail;
1818	}
1819	return head;
1820}
1821
1822
1823/*
1824 * Error routine called when txsync/rxsync detects an error.
1825 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1826 * Return 1 on reinit.
1827 *
1828 * This routine is only called by the upper half of the kernel.
1829 * It only reads hwcur (which is changed only by the upper half, too)
1830 * and hwtail (which may be changed by the lower half, but only on
1831 * a tx ring and only to increase it, so any error will be recovered
1832 * on the next call). For the above, we don't strictly need to call
1833 * it under lock.
1834 */
1835int
1836netmap_ring_reinit(struct netmap_kring *kring)
1837{
1838	struct netmap_ring *ring = kring->ring;
1839	u_int i, lim = kring->nkr_num_slots - 1;
1840	int errors = 0;
1841
1842	// XXX KASSERT nm_kr_tryget
1843	nm_prlim(10, "called for %s", kring->name);
1844	// XXX probably wrong to trust userspace
1845	kring->rhead = ring->head;
1846	kring->rcur  = ring->cur;
1847	kring->rtail = ring->tail;
1848
1849	if (ring->cur > lim)
1850		errors++;
1851	if (ring->head > lim)
1852		errors++;
1853	if (ring->tail > lim)
1854		errors++;
1855	for (i = 0; i <= lim; i++) {
1856		u_int idx = ring->slot[i].buf_idx;
1857		u_int len = ring->slot[i].len;
1858		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1859			nm_prlim(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1860			ring->slot[i].buf_idx = 0;
1861			ring->slot[i].len = 0;
1862		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1863			ring->slot[i].len = 0;
1864			nm_prlim(5, "bad len at slot %d idx %d len %d", i, idx, len);
1865		}
1866	}
1867	if (errors) {
1868		nm_prlim(10, "total %d errors", errors);
1869		nm_prlim(10, "%s reinit, cur %d -> %d tail %d -> %d",
1870			kring->name,
1871			ring->cur, kring->nr_hwcur,
1872			ring->tail, kring->nr_hwtail);
1873		ring->head = kring->rhead = kring->nr_hwcur;
1874		ring->cur  = kring->rcur  = kring->nr_hwcur;
1875		ring->tail = kring->rtail = kring->nr_hwtail;
1876	}
1877	return (errors ? 1 : 0);
1878}
1879
1880/* interpret the ringid and flags fields of an nmreq, by translating them
1881 * into a pair of intervals of ring indices:
1882 *
1883 * [priv->np_txqfirst, priv->np_txqlast) and
1884 * [priv->np_rxqfirst, priv->np_rxqlast)
1885 *
1886 */
1887int
1888netmap_interp_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
1889{
1890	struct netmap_adapter *na = priv->np_na;
1891	struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
1892	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1893	enum txrx t;
1894	u_int j;
1895	u_int nr_flags = reg->nr_flags, nr_mode = reg->nr_mode,
1896	      nr_ringid = reg->nr_ringid;
1897
1898	for_rx_tx(t) {
1899		if (nr_flags & excluded_direction[t]) {
1900			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1901			continue;
1902		}
1903		switch (nr_mode) {
1904		case NR_REG_ALL_NIC:
1905		case NR_REG_NULL:
1906			priv->np_qfirst[t] = 0;
1907			priv->np_qlast[t] = nma_get_nrings(na, t);
1908			nm_prdis("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1909				priv->np_qfirst[t], priv->np_qlast[t]);
1910			break;
1911		case NR_REG_SW:
1912		case NR_REG_NIC_SW:
1913			if (!(na->na_flags & NAF_HOST_RINGS)) {
1914				nm_prerr("host rings not supported");
1915				return EINVAL;
1916			}
1917			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1918				nma_get_nrings(na, t) : 0);
1919			priv->np_qlast[t] = netmap_all_rings(na, t);
1920			nm_prdis("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1921				nm_txrx2str(t),
1922				priv->np_qfirst[t], priv->np_qlast[t]);
1923			break;
1924		case NR_REG_ONE_NIC:
1925			if (nr_ringid >= na->num_tx_rings &&
1926					nr_ringid >= na->num_rx_rings) {
1927				nm_prerr("invalid ring id %d", nr_ringid);
1928				return EINVAL;
1929			}
1930			/* if not enough rings, use the first one */
1931			j = nr_ringid;
1932			if (j >= nma_get_nrings(na, t))
1933				j = 0;
1934			priv->np_qfirst[t] = j;
1935			priv->np_qlast[t] = j + 1;
1936			nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
1937				priv->np_qfirst[t], priv->np_qlast[t]);
1938			break;
1939		case NR_REG_ONE_SW:
1940			if (!(na->na_flags & NAF_HOST_RINGS)) {
1941				nm_prerr("host rings not supported");
1942				return EINVAL;
1943			}
1944			if (nr_ringid >= na->num_host_tx_rings &&
1945					nr_ringid >= na->num_host_rx_rings) {
1946				nm_prerr("invalid ring id %d", nr_ringid);
1947				return EINVAL;
1948			}
1949			/* if not enough rings, use the first one */
1950			j = nr_ringid;
1951			if (j >= nma_get_host_nrings(na, t))
1952				j = 0;
1953			priv->np_qfirst[t] = nma_get_nrings(na, t) + j;
1954			priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1;
1955			nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t),
1956				priv->np_qfirst[t], priv->np_qlast[t]);
1957			break;
1958		default:
1959			nm_prerr("invalid regif type %d", nr_mode);
1960			return EINVAL;
1961		}
1962	}
1963	priv->np_flags = nr_flags;
1964
1965	/* Allow transparent forwarding mode in the host --> nic
1966	 * direction only if all the TX hw rings have been opened. */
1967	if (priv->np_qfirst[NR_TX] == 0 &&
1968			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1969		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1970	}
1971
1972	if (netmap_verbose) {
1973		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1974			na->name,
1975			priv->np_qfirst[NR_TX],
1976			priv->np_qlast[NR_TX],
1977			priv->np_qfirst[NR_RX],
1978			priv->np_qlast[NR_RX],
1979			nr_ringid);
1980	}
1981	return 0;
1982}
1983
1984
1985/*
1986 * Set the ring ID. For devices with a single queue, a request
1987 * for all rings is the same as a single ring.
1988 */
1989static int
1990netmap_set_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
1991{
1992	struct netmap_adapter *na = priv->np_na;
1993	struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
1994	int error;
1995	enum txrx t;
1996
1997	error = netmap_interp_ringid(priv, hdr);
1998	if (error) {
1999		return error;
2000	}
2001
2002	priv->np_txpoll = (reg->nr_flags & NR_NO_TX_POLL) ? 0 : 1;
2003
2004	/* optimization: count the users registered for more than
2005	 * one ring, which are the ones sleeping on the global queue.
2006	 * The default netmap_notify() callback will then
2007	 * avoid signaling the global queue if nobody is using it
2008	 */
2009	for_rx_tx(t) {
2010		if (nm_si_user(priv, t))
2011			na->si_users[t]++;
2012	}
2013	return 0;
2014}
2015
2016static void
2017netmap_unset_ringid(struct netmap_priv_d *priv)
2018{
2019	struct netmap_adapter *na = priv->np_na;
2020	enum txrx t;
2021
2022	for_rx_tx(t) {
2023		if (nm_si_user(priv, t))
2024			na->si_users[t]--;
2025		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
2026	}
2027	priv->np_flags = 0;
2028	priv->np_txpoll = 0;
2029	priv->np_kloop_state = 0;
2030}
2031
2032#define within_sel(p_, t_, i_)					  	  \
2033	((i_) < (p_)->np_qlast[(t_)])
2034#define nonempty_sel(p_, t_)						  \
2035	(within_sel((p_), (t_), (p_)->np_qfirst[(t_)]))
2036#define foreach_selected_ring(p_, t_, i_, kring_)			  \
2037	for ((t_) = nonempty_sel((p_), NR_RX) ? NR_RX : NR_TX,		  \
2038	     (i_) = (p_)->np_qfirst[(t_)];				  \
	     ((t_) == NR_RX ||						  \
	      ((t_) == NR_TX && within_sel((p_), (t_), (i_)))) &&     	  \
2041	      ((kring_) = NMR((p_)->np_na, (t_))[(i_)]); 		  \
2042	     (i_) = within_sel((p_), (t_), (i_) + 1) ? (i_) + 1 :         \
2043		(++(t_) < NR_TXRX ? (p_)->np_qfirst[(t_)] : (i_)))
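
/*
 * Example use of the iterator above (hypothetical snippet): it visits
 * the rings selected by 'priv', scanning the selected RX rings first
 * and then the selected TX rings, skipping empty intervals.
 *
 *	enum txrx t;
 *	u_int i;
 *	struct netmap_kring *kring;
 *
 *	foreach_selected_ring(priv, t, i, kring) {
 *		// here kring == NMR(priv->np_na, t)[i]
 *	}
 */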
2044
2045
2046/* Set the nr_pending_mode for the requested rings.
2047 * If requested, also try to get exclusive access to the rings, provided
2048 * the rings we want to bind are not exclusively owned by a previous bind.
2049 */
2050static int
2051netmap_krings_get(struct netmap_priv_d *priv)
2052{
2053	struct netmap_adapter *na = priv->np_na;
2054	u_int i;
2055	struct netmap_kring *kring;
2056	int excl = (priv->np_flags & NR_EXCLUSIVE);
2057	enum txrx t;
2058
2059	if (netmap_debug & NM_DEBUG_ON)
2060		nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
2061			na->name,
2062			priv->np_qfirst[NR_TX],
2063			priv->np_qlast[NR_TX],
2064			priv->np_qfirst[NR_RX],
2065			priv->np_qlast[NR_RX]);
2066
	/* first round: check that none of the requested rings
	 * is already exclusively owned, and that we are not
	 * requesting exclusive ownership of rings already in use
	 */
2071	foreach_selected_ring(priv, t, i, kring) {
2072		if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
2073		    (kring->users && excl))
2074		{
2075			nm_prdis("ring %s busy", kring->name);
2076			return EBUSY;
2077		}
2078	}
2079
2080	/* second round: increment usage count (possibly marking them
2081	 * as exclusive) and set the nr_pending_mode
2082	 */
2083	foreach_selected_ring(priv, t, i, kring) {
2084		kring->users++;
2085		if (excl)
2086			kring->nr_kflags |= NKR_EXCLUSIVE;
2087		kring->nr_pending_mode = NKR_NETMAP_ON;
2088	}
2089
2090	return 0;
2091
2092}
2093
/* Undo netmap_krings_get(). This is done by clearing the exclusive mode
 * if it was requested at regif time, and by unsetting nr_pending_mode
 * if we are the last users of the involved rings. */
2097static void
2098netmap_krings_put(struct netmap_priv_d *priv)
2099{
2100	u_int i;
2101	struct netmap_kring *kring;
2102	int excl = (priv->np_flags & NR_EXCLUSIVE);
2103	enum txrx t;
2104
	nm_prdis("%s: releasing tx [%d, %d) rx [%d, %d)",
			priv->np_na->name,
			priv->np_qfirst[NR_TX],
			priv->np_qlast[NR_TX],
			priv->np_qfirst[NR_RX],
			priv->np_qlast[NR_RX]);
2111
2112	foreach_selected_ring(priv, t, i, kring) {
2113		if (excl)
2114			kring->nr_kflags &= ~NKR_EXCLUSIVE;
2115		kring->users--;
2116		if (kring->users == 0)
2117			kring->nr_pending_mode = NKR_NETMAP_OFF;
2118	}
2119}
2120
2121static int
2122nm_priv_rx_enabled(struct netmap_priv_d *priv)
2123{
2124	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2125}
2126
2127/* Validate the CSB entries for both directions (atok and ktoa).
2128 * To be called under NMG_LOCK(). */
2129static int
2130netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2131{
2132	struct nm_csb_atok *csb_atok_base =
2133		(struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2134	struct nm_csb_ktoa *csb_ktoa_base =
2135		(struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2136	enum txrx t;
2137	int num_rings[NR_TXRX], tot_rings;
2138	size_t entry_size[2];
2139	void *csb_start[2];
2140	int i;
2141
2142	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2143		nm_prerr("Cannot update CSB while kloop is running");
2144		return EBUSY;
2145	}
2146
2147	tot_rings = 0;
2148	for_rx_tx(t) {
2149		num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2150		tot_rings += num_rings[t];
2151	}
2152	if (tot_rings <= 0)
2153		return 0;
2154
2155	if (!(priv->np_flags & NR_EXCLUSIVE)) {
2156		nm_prerr("CSB mode requires NR_EXCLUSIVE");
2157		return EINVAL;
2158	}
2159
2160	entry_size[0] = sizeof(*csb_atok_base);
2161	entry_size[1] = sizeof(*csb_ktoa_base);
2162	csb_start[0] = (void *)csb_atok_base;
2163	csb_start[1] = (void *)csb_ktoa_base;
2164
2165	for (i = 0; i < 2; i++) {
2166		/* On Linux we could use access_ok() to simplify
2167		 * the validation. However, the advantage of
2168		 * this approach is that it works also on
2169		 * FreeBSD. */
2170		size_t csb_size = tot_rings * entry_size[i];
2171		void *tmp;
2172		int err;
2173
2174		if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2175			nm_prerr("Unaligned CSB address");
2176			return EINVAL;
2177		}
2178
2179		tmp = nm_os_malloc(csb_size);
2180		if (!tmp)
2181			return ENOMEM;
2182		if (i == 0) {
2183			/* Application --> kernel direction. */
2184			err = copyin(csb_start[i], tmp, csb_size);
2185		} else {
2186			/* Kernel --> application direction. */
2187			memset(tmp, 0, csb_size);
2188			err = copyout(tmp, csb_start[i], csb_size);
2189		}
2190		nm_os_free(tmp);
2191		if (err) {
2192			nm_prerr("Invalid CSB address");
2193			return err;
2194		}
2195	}
2196
2197	priv->np_csb_atok_base = csb_atok_base;
2198	priv->np_csb_ktoa_base = csb_ktoa_base;
2199
2200	/* Initialize the CSB. */
2201	for_rx_tx(t) {
2202		for (i = 0; i < num_rings[t]; i++) {
2203			struct netmap_kring *kring =
2204				NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2205			struct nm_csb_atok *csb_atok = csb_atok_base + i;
2206			struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2207
2208			if (t == NR_RX) {
2209				csb_atok += num_rings[NR_TX];
2210				csb_ktoa += num_rings[NR_TX];
2211			}
2212
2213			CSB_WRITE(csb_atok, head, kring->rhead);
2214			CSB_WRITE(csb_atok, cur, kring->rcur);
2215			CSB_WRITE(csb_atok, appl_need_kick, 1);
2216			CSB_WRITE(csb_atok, sync_flags, 1);
2217			CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2218			CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2219			CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2220
2221			nm_prinf("csb_init for kring %s: head %u, cur %u, "
2222				"hwcur %u, hwtail %u", kring->name,
2223				kring->rhead, kring->rcur, kring->nr_hwcur,
2224				kring->nr_hwtail);
2225		}
2226	}
2227
2228	return 0;
2229}
2230
2231/* Ensure that the netmap adapter can support the given MTU.
2232 * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2233 */
2234int
2235netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
2236	unsigned nbs = NETMAP_BUF_SIZE(na);
2237
2238	if (mtu <= na->rx_buf_maxsize) {
2239		/* The MTU fits a single NIC slot. We only
		 * need to check that netmap buffers are
2241		 * large enough to hold an MTU. NS_MOREFRAG
2242		 * cannot be used in this case. */
2243		if (nbs < mtu) {
2244			nm_prerr("error: netmap buf size (%u) "
2245				 "< device MTU (%u)", nbs, mtu);
2246			return EINVAL;
2247		}
2248	} else {
2249		/* More NIC slots may be needed to receive
2250		 * or transmit a single packet. Check that
2251		 * the adapter supports NS_MOREFRAG and that
2252		 * netmap buffers are large enough to hold
2253		 * the maximum per-slot size. */
2254		if (!(na->na_flags & NAF_MOREFRAG)) {
2255			nm_prerr("error: large MTU (%d) needed "
2256				 "but %s does not support "
2257				 "NS_MOREFRAG", mtu,
2258				 if_name(na->ifp));
2259			return EINVAL;
2260		} else if (nbs < na->rx_buf_maxsize) {
2261			nm_prerr("error: using NS_MOREFRAG on "
2262				 "%s requires netmap buf size "
2263				 ">= %u", if_name(na->ifp),
2264				 na->rx_buf_maxsize);
2265			return EINVAL;
2266		} else {
2267			nm_prinf("info: netmap application on "
2268				 "%s needs to support "
2269				 "NS_MOREFRAG "
2270				 "(MTU=%u,netmap_buf_size=%u)",
2271				 if_name(na->ifp), mtu, nbs);
2272		}
2273	}
2274	return 0;
2275}
2276
2277/* Handle the offset option, if present in the hdr.
2278 * Returns 0 on success, or an error.
2279 */
2280static int
2281netmap_offsets_init(struct netmap_priv_d *priv, struct nmreq_header *hdr)
2282{
2283	struct nmreq_opt_offsets *opt;
2284	struct netmap_adapter *na = priv->np_na;
2285	struct netmap_kring *kring;
2286	uint64_t mask = 0, bits = 0, maxbits = sizeof(uint64_t) * 8,
2287		 max_offset = 0, initial_offset = 0, min_gap = 0;
2288	u_int i;
2289	enum txrx t;
2290	int error = 0;
2291
2292	opt = (struct nmreq_opt_offsets *)
2293		nmreq_getoption(hdr, NETMAP_REQ_OPT_OFFSETS);
2294	if (opt == NULL)
2295		return 0;
2296
2297	if (!(na->na_flags & NAF_OFFSETS)) {
2298		if (netmap_verbose)
2299			nm_prerr("%s does not support offsets",
2300				na->name);
2301		error = EOPNOTSUPP;
2302		goto out;
2303	}
2304
2305	/* check sanity of the opt values */
2306	max_offset = opt->nro_max_offset;
2307	min_gap = opt->nro_min_gap;
2308	initial_offset = opt->nro_initial_offset;
2309	bits = opt->nro_offset_bits;
2310
2311	if (bits > maxbits) {
2312		if (netmap_verbose)
2313			nm_prerr("bits: %llu too large (max %llu)",
2314				(unsigned long long)bits,
2315				(unsigned long long)maxbits);
2316		error = EINVAL;
2317		goto out;
2318	}
2319	/* we take bits == 0 as a request to use the entire field */
2320	if (bits == 0 || bits == maxbits) {
2321		/* shifting a type by sizeof(type) is undefined */
2322		bits = maxbits;
2323		mask = 0xffffffffffffffff;
2324	} else {
2325		mask = (1ULL << bits) - 1;
2326	}
2327	if (max_offset > NETMAP_BUF_SIZE(na)) {
2328		if (netmap_verbose)
2329			nm_prerr("max offset %llu > buf size %u",
2330				(unsigned long long)max_offset, NETMAP_BUF_SIZE(na));
2331		error = EINVAL;
2332		goto out;
2333	}
2334	if ((max_offset & mask) != max_offset) {
2335		if (netmap_verbose)
			nm_prerr("max offset %llu too large for %llu bits",
2337				(unsigned long long)max_offset,
2338				(unsigned long long)bits);
2339		error = EINVAL;
2340		goto out;
2341	}
2342	if (initial_offset > max_offset) {
2343		if (netmap_verbose)
2344			nm_prerr("initial offset %llu > max offset %llu",
2345				(unsigned long long)initial_offset,
2346				(unsigned long long)max_offset);
2347		error = EINVAL;
2348		goto out;
2349	}
2350
2351	/* initialize the kring and ring fields. */
2352	foreach_selected_ring(priv, t, i, kring) {
2353		struct netmap_kring *kring = NMR(na, t)[i];
2354		struct netmap_ring *ring = kring->ring;
2355		u_int j;
2356
		/* if the ring is already in use we check that the
2358		 * new request is compatible with the existing one
2359		 */
2360		if (kring->offset_mask) {
2361			if ((kring->offset_mask & mask) != mask ||
2362			     kring->offset_max < max_offset) {
2363				if (netmap_verbose)
					nm_prinf("%s: cannot increase "
						 "offset mask and/or max "
						 "(current: mask=%llx, max=%llu)",
2367							kring->name,
2368							(unsigned long long)kring->offset_mask,
2369							(unsigned long long)kring->offset_max);
2370				error = EBUSY;
2371				goto out;
2372			}
2373			mask = kring->offset_mask;
2374			max_offset = kring->offset_max;
2375		} else {
2376			kring->offset_mask = mask;
2377			*(uint64_t *)(uintptr_t)&ring->offset_mask = mask;
2378			kring->offset_max = max_offset;
2379			kring->offset_gap = min_gap;
2380		}
2381
2382		/* if there is an initial offset, put it into
2383		 * all the slots
2384		 *
2385		 * Note: we cannot change the offsets if the
2386		 * ring is already in use.
2387		 */
2388		if (!initial_offset || kring->users > 1)
2389			continue;
2390
2391		for (j = 0; j < kring->nkr_num_slots; j++) {
2392			struct netmap_slot *slot = ring->slot + j;
2393
2394			nm_write_offset(kring, slot, initial_offset);
2395		}
2396	}
2397
2398out:
2399	opt->nro_opt.nro_status = error;
2400	if (!error) {
2401		opt->nro_max_offset = max_offset;
2402	}
2403	return error;
2404
2405}
2406
2407
/* set the hardware buffer length in each one of the newly opened rings
 * (hwbuf_len field in the kring struct). The purpose is to select
 * the maximum supported input buffer length that will not cause writes
 * outside of the available space, even when offsets are in use.
2412 */
2413static int
2414netmap_compute_buf_len(struct netmap_priv_d *priv)
2415{
2416	enum txrx t;
2417	u_int i;
2418	struct netmap_kring *kring;
2419	int error = 0;
2420	unsigned mtu = 0;
2421	struct netmap_adapter *na = priv->np_na;
2422	uint64_t target;
2423
2424	foreach_selected_ring(priv, t, i, kring) {
2425		/* rings that are already active have their hwbuf_len
2426		 * already set and we cannot change it.
2427		 */
2428		if (kring->users > 1)
2429			continue;
2430
2431		/* For netmap buffers which are not shared among several ring
2432		 * slots (the normal case), the available space is the buf size
2433		 * minus the max offset declared by the user at open time.  If
2434		 * the user plans to have several slots pointing to different
2435		 * offsets into the same large buffer, she must also declare a
2436		 * "minimum gap" between two such consecutive offsets. In this
2437		 * case the user-declared 'offset_gap' is taken as the
2438		 * available space and offset_max is ignored.
2439		 */
2440
2441		/* start with the normal case (unshared buffers) */
2442		target = NETMAP_BUF_SIZE(kring->na) -
2443			kring->offset_max;
2444		/* if offset_gap is zero, the user does not intend to use
2445		 * shared buffers. In this case the minimum gap between
		 * two consecutive offsets into the same buffer can
2447		 * assumed to be equal to the buffer size. In this way
2448		 * offset_gap always contains the available space ignoring
2449		 * offset_max. This may be used by drivers of NICs that
2450		 * are guaranteed to never write more than MTU bytes, even
2451		 * if the input buffer is larger: if the MTU is less
2452		 * than the target they can set hwbuf_len to offset_gap.
2453		 */
2454		if (!kring->offset_gap)
2455			kring->offset_gap =
2456				NETMAP_BUF_SIZE(kring->na);
2457
2458		if (kring->offset_gap < target)
2459			target = kring->offset_gap;
2460		error = kring->nm_bufcfg(kring, target);
2461		if (error)
2462			goto out;
2463
2464		*(uint64_t *)(uintptr_t)&kring->ring->buf_align = kring->buf_align;
2465
2466		if (mtu && t == NR_RX && kring->hwbuf_len < mtu) {
2467			if (!(na->na_flags & NAF_MOREFRAG)) {
2468				nm_prerr("error: large MTU (%d) needed "
2469					 "but %s does not support "
2470					 "NS_MOREFRAG", mtu,
2471					 na->name);
2472				error = EINVAL;
2473				goto out;
2474			} else {
2475				nm_prinf("info: netmap application on "
2476					 "%s needs to support "
2477					 "NS_MOREFRAG "
2478					 "(MTU=%u,buf_size=%llu)",
2479					 kring->name, mtu,
2480					 (unsigned long long)kring->hwbuf_len);
2481			}
2482		}
2483	}
2484out:
2485	return error;
2486}
2487
2488/*
 * possibly move the interface to netmap mode.
 * On success it returns 0 and makes the new netmap_if available
 * through priv->np_nifp; otherwise it returns an error code.
2491 * This must be called with NMG_LOCK held.
2492 *
2493 * The following na callbacks are called in the process:
2494 *
2495 * na->nm_config()			[by netmap_update_config]
2496 * (get current number and size of rings)
2497 *
 *  	We have a generic one for Linux (netmap_linux_config).
2499 *  	The bwrap has to override this, since it has to forward
2500 *  	the request to the wrapped adapter (netmap_bwrap_config).
2501 *
2502 *
2503 * na->nm_krings_create()
2504 * (create and init the krings array)
2505 *
2506 * 	One of the following:
2507 *
2508 *	* netmap_hw_krings_create, 			(hw ports)
2509 *		creates the standard layout for the krings
2510 * 		and adds the mbq (used for the host rings).
2511 *
2512 * 	* netmap_vp_krings_create			(VALE ports)
2513 * 		add leases and scratchpads
2514 *
2515 * 	* netmap_pipe_krings_create			(pipes)
2516 * 		create the krings and rings of both ends and
2517 * 		cross-link them
2518 *
2519 *      * netmap_monitor_krings_create 			(monitors)
2520 *      	avoid allocating the mbq
2521 *
2522 *      * netmap_bwrap_krings_create			(bwraps)
 *      	create both the bwrap krings array,
2524 *      	the krings array of the wrapped adapter, and
2525 *      	(if needed) the fake array for the host adapter
2526 *
2527 * na->nm_register(, 1)
2528 * (put the adapter in netmap mode)
2529 *
2530 * 	This may be one of the following:
2531 *
2532 * 	* netmap_hw_reg				        (hw ports)
2533 * 		checks that the ifp is still there, then calls
2534 * 		the hardware specific callback;
2535 *
2536 * 	* netmap_vp_reg					(VALE ports)
2537 *		If the port is connected to a bridge,
2538 *		set the NAF_NETMAP_ON flag under the
2539 *		bridge write lock.
2540 *
2541 *	* netmap_pipe_reg				(pipes)
2542 *		inform the other pipe end that it is no
2543 *		longer responsible for the lifetime of this
2544 *		pipe end
2545 *
2546 *	* netmap_monitor_reg				(monitors)
2547 *		intercept the sync callbacks of the monitored
2548 *		rings
2549 *
2550 *	* netmap_bwrap_reg				(bwraps)
2551 *		cross-link the bwrap and hwna rings,
2552 *		forward the request to the hwna, override
 *		the hwna notify callback (so that frames
 *		coming from outside go through the bridge).
2555 *
2556 *
2557 */
2558int
2559netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2560	struct nmreq_header *hdr)
2561{
2562	struct netmap_if *nifp = NULL;
2563	int error;
2564
2565	NMG_LOCK_ASSERT();
2566	priv->np_na = na;     /* store the reference */
2567	error = netmap_mem_finalize(na->nm_mem, na);
2568	if (error)
2569		goto err;
2570
2571	if (na->active_fds == 0) {
2572
2573		/* cache the allocator info in the na */
2574		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2575		if (error)
2576			goto err_drop_mem;
2577		nm_prdis("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2578					    na->na_lut.objsize);
2579
2580		/* ring configuration may have changed, fetch from the card */
2581		netmap_update_config(na);
2582	}
2583
2584	/* compute the range of tx and rx rings to monitor */
2585	error = netmap_set_ringid(priv, hdr);
2586	if (error)
2587		goto err_put_lut;
2588
2589	if (na->active_fds == 0) {
2590		/*
2591		 * If this is the first registration of the adapter,
2592		 * perform sanity checks and create the in-kernel view
2593		 * of the netmap rings (the netmap krings).
2594		 */
2595		if (na->ifp && nm_priv_rx_enabled(priv)) {
2596			/* This netmap adapter is attached to an ifnet. */
2597			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2598
2599			nm_prdis("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2600				na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2601
2602			if (na->rx_buf_maxsize == 0) {
2603				nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2604				error = EIO;
2605				goto err_drop_mem;
2606			}
2607
2608			error = netmap_buf_size_validate(na, mtu);
2609			if (error)
2610				goto err_drop_mem;
2611		}
2612
2613		/*
2614		 * Depending on the adapter, this may also create
2615		 * the netmap rings themselves
2616		 */
2617		error = na->nm_krings_create(na);
2618		if (error)
2619			goto err_put_lut;
2620
2621	}
2622
2623	/* now the krings must exist and we can check whether some
2624	 * previous bind has exclusive ownership on them, and set
2625	 * nr_pending_mode
2626	 */
2627	error = netmap_krings_get(priv);
2628	if (error)
2629		goto err_del_krings;
2630
2631	/* create all needed missing netmap rings */
2632	error = netmap_mem_rings_create(na);
2633	if (error)
2634		goto err_rel_excl;
2635
2636	/* initialize offsets if requested */
2637	error = netmap_offsets_init(priv, hdr);
2638	if (error)
2639		goto err_rel_excl;
2640
2641	/* compute and validate the buf lengths */
2642	error = netmap_compute_buf_len(priv);
2643	if (error)
2644		goto err_rel_excl;
2645
2646	/* in all cases, create a new netmap if */
2647	nifp = netmap_mem_if_new(na, priv);
2648	if (nifp == NULL) {
2649		error = ENOMEM;
2650		goto err_rel_excl;
2651	}
2652
2653	if (nm_kring_pending(priv)) {
2654		/* Some kring is switching mode, tell the adapter to
2655		 * react on this. */
2656		netmap_set_all_rings(na, NM_KR_LOCKED);
2657		error = na->nm_register(na, 1);
2658		netmap_set_all_rings(na, 0);
2659		if (error)
2660			goto err_del_if;
2661	}
2662
2663	/* Commit the reference. */
2664	na->active_fds++;
2665
2666	/*
2667	 * advertise that the interface is ready by setting np_nifp.
2668	 * The barrier is needed because readers (poll, *SYNC and mmap)
2669	 * check for priv->np_nifp != NULL without locking
2670	 */
2671	mb(); /* make sure previous writes are visible to all CPUs */
2672	priv->np_nifp = nifp;
2673
2674	return 0;
2675
2676err_del_if:
2677	netmap_mem_if_delete(na, nifp);
2678err_rel_excl:
2679	netmap_krings_put(priv);
2680	netmap_mem_rings_delete(na);
2681err_del_krings:
2682	if (na->active_fds == 0)
2683		na->nm_krings_delete(na);
2684err_put_lut:
2685	if (na->active_fds == 0)
2686		memset(&na->na_lut, 0, sizeof(na->na_lut));
2687err_drop_mem:
2688	netmap_mem_drop(na);
2689err:
2690	priv->np_na = NULL;
2691	return error;
2692}
2693
2694
2695/*
2696 * update kring and ring at the end of rxsync/txsync.
2697 */
2698static inline void
2699nm_sync_finalize(struct netmap_kring *kring)
2700{
2701	/*
2702	 * Update ring tail to what the kernel knows
2703	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2704	 * if no carrier.
2705	 */
2706	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2707
2708	nm_prdis(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2709		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2710		kring->rhead, kring->rcur, kring->rtail);
2711}
2712
2713/* set ring timestamp */
2714static inline void
2715ring_timestamp_set(struct netmap_ring *ring)
2716{
2717	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2718		microtime(&ring->ts);
2719	}
2720}
2721
2722static int nmreq_copyin(struct nmreq_header *, int);
2723static int nmreq_copyout(struct nmreq_header *, int);
2724static int nmreq_checkoptions(struct nmreq_header *);
2725
2726/*
2727 * ioctl(2) support for the "netmap" device.
2728 *
2729 * Following a list of accepted commands:
2730 * - NIOCCTRL		device control API
2731 * - NIOCTXSYNC		sync TX rings
2732 * - NIOCRXSYNC		sync RX rings
2733 * - SIOCGIFADDR	just for convenience
2734 * - NIOCGINFO		deprecated (legacy API)
2735 * - NIOCREGIF		deprecated (legacy API)
2736 *
2737 * Return 0 on success, errno otherwise.
2738 */
2739int
2740netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2741		struct thread *td, int nr_body_is_user)
2742{
2743	struct mbq q;	/* packets from RX hw queues to host stack */
2744	struct netmap_adapter *na = NULL;
2745	struct netmap_mem_d *nmd = NULL;
2746	if_t ifp = NULL;
2747	int error = 0;
2748	u_int i, qfirst, qlast;
2749	struct netmap_kring **krings;
2750	int sync_flags;
2751	enum txrx t;
2752
2753	switch (cmd) {
2754	case NIOCCTRL: {
2755		struct nmreq_header *hdr = (struct nmreq_header *)data;
2756
2757		if (hdr->nr_version < NETMAP_MIN_API ||
2758		    hdr->nr_version > NETMAP_MAX_API) {
2759			nm_prerr("API mismatch: got %d need %d",
2760				hdr->nr_version, NETMAP_API);
2761			return EINVAL;
2762		}
2763
2764		/* Make a kernel-space copy of the user-space nr_body.
2765		 * For convenience, the nr_body pointer and the pointers
2766		 * in the options list will be replaced with their
2767		 * kernel-space counterparts. The original pointers are
2768		 * saved internally and later restored by nmreq_copyout
2769		 */
2770		error = nmreq_copyin(hdr, nr_body_is_user);
2771		if (error) {
2772			return error;
2773		}
2774
2775		/* Sanitize hdr->nr_name. */
2776		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2777
2778		switch (hdr->nr_reqtype) {
2779		case NETMAP_REQ_REGISTER: {
2780			struct nmreq_register *req =
2781				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2782			struct netmap_if *nifp;
2783
2784			/* Protect access to priv from concurrent requests. */
2785			NMG_LOCK();
2786			do {
2787				struct nmreq_option *opt;
2788				u_int memflags;
2789
2790				if (priv->np_nifp != NULL) {	/* thread already registered */
2791					error = EBUSY;
2792					break;
2793				}
2794
2795#ifdef WITH_EXTMEM
2796				opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_EXTMEM);
2797				if (opt != NULL) {
2798					struct nmreq_opt_extmem *e =
2799						(struct nmreq_opt_extmem *)opt;
2800
2801					nmd = netmap_mem_ext_create(e->nro_usrptr,
2802							&e->nro_info, &error);
2803					opt->nro_status = error;
2804					if (nmd == NULL)
2805						break;
2806				}
2807#endif /* WITH_EXTMEM */
2808
2809				if (nmd == NULL && req->nr_mem_id) {
2810					/* find the allocator and get a reference */
2811					nmd = netmap_mem_find(req->nr_mem_id);
2812					if (nmd == NULL) {
2813						if (netmap_verbose) {
2814							nm_prerr("%s: failed to find mem_id %u",
2815									hdr->nr_name, req->nr_mem_id);
2816						}
2817						error = EINVAL;
2818						break;
2819					}
2820				}
2821				/* find the interface and a reference */
2822				error = netmap_get_na(hdr, &na, &ifp, nmd,
2823						      1 /* create */); /* keep reference */
2824				if (error)
2825					break;
2826				if (NETMAP_OWNED_BY_KERN(na)) {
2827					error = EBUSY;
2828					break;
2829				}
2830
2831				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2832					nm_prerr("virt_hdr_len=%d, but application does "
2833						"not accept it", na->virt_hdr_len);
2834					error = EIO;
2835					break;
2836				}
2837
2838				error = netmap_do_regif(priv, na, hdr);
2839				if (error) {    /* reg. failed, release priv and ref */
2840					break;
2841				}
2842
2843				opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2844				if (opt != NULL) {
2845					struct nmreq_opt_csb *csbo =
2846						(struct nmreq_opt_csb *)opt;
2847					error = netmap_csb_validate(priv, csbo);
2848					opt->nro_status = error;
2849					if (error) {
2850						netmap_do_unregif(priv);
2851						break;
2852					}
2853				}
2854
2855				nifp = priv->np_nifp;
2856
2857				/* return the offset of the netmap_if object */
2858				req->nr_rx_rings = na->num_rx_rings;
2859				req->nr_tx_rings = na->num_tx_rings;
2860				req->nr_rx_slots = na->num_rx_desc;
2861				req->nr_tx_slots = na->num_tx_desc;
2862				req->nr_host_tx_rings = na->num_host_tx_rings;
2863				req->nr_host_rx_rings = na->num_host_rx_rings;
2864				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2865					&req->nr_mem_id);
2866				if (error) {
2867					netmap_do_unregif(priv);
2868					break;
2869				}
2870				if (memflags & NETMAP_MEM_PRIVATE) {
2871					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2872				}
2873				for_rx_tx(t) {
2874					priv->np_si[t] = nm_si_user(priv, t) ?
2875						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2876				}
2877
2878				if (req->nr_extra_bufs) {
2879					if (netmap_verbose)
2880						nm_prinf("requested %d extra buffers",
2881							req->nr_extra_bufs);
2882					req->nr_extra_bufs = netmap_extra_alloc(na,
2883						&nifp->ni_bufs_head, req->nr_extra_bufs);
2884					if (netmap_verbose)
2885						nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2886				} else {
2887					nifp->ni_bufs_head = 0;
2888				}
2889				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2890
2891				error = nmreq_checkoptions(hdr);
2892				if (error) {
2893					netmap_do_unregif(priv);
2894					break;
2895				}
2896
2897				/* store ifp reference so that priv destructor may release it */
2898				priv->np_ifp = ifp;
2899			} while (0);
2900			if (error) {
2901				netmap_unget_na(na, ifp);
2902			}
2903			/* release the reference from netmap_mem_find() or
2904			 * netmap_mem_ext_create()
2905			 */
2906			if (nmd)
2907				netmap_mem_put(nmd);
2908			NMG_UNLOCK();
2909			break;
2910		}
2911
2912		case NETMAP_REQ_PORT_INFO_GET: {
2913			struct nmreq_port_info_get *req =
2914				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2915			int nmd_ref = 0;
2916
2917			NMG_LOCK();
2918			do {
2919				u_int memflags;
2920
2921				if (hdr->nr_name[0] != '\0') {
2922					/* Build a nmreq_register out of the nmreq_port_info_get,
2923					 * so that we can call netmap_get_na(). */
2924					struct nmreq_register regreq;
2925					bzero(&regreq, sizeof(regreq));
2926					regreq.nr_mode = NR_REG_ALL_NIC;
2927					regreq.nr_tx_slots = req->nr_tx_slots;
2928					regreq.nr_rx_slots = req->nr_rx_slots;
2929					regreq.nr_tx_rings = req->nr_tx_rings;
2930					regreq.nr_rx_rings = req->nr_rx_rings;
2931					regreq.nr_host_tx_rings = req->nr_host_tx_rings;
2932					regreq.nr_host_rx_rings = req->nr_host_rx_rings;
2933					regreq.nr_mem_id = req->nr_mem_id;
2934
2935					/* get a refcount */
2936					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2937					hdr->nr_body = (uintptr_t)&regreq;
2938					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2939					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2940					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2941					if (error) {
2942						na = NULL;
2943						ifp = NULL;
2944						break;
2945					}
2946					nmd = na->nm_mem; /* get memory allocator */
2947				} else {
2948					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2949					if (nmd == NULL) {
2950						if (netmap_verbose)
2951							nm_prerr("%s: failed to find mem_id %u",
2952									hdr->nr_name,
2953									req->nr_mem_id ? req->nr_mem_id : 1);
2954						error = EINVAL;
2955						break;
2956					}
2957					nmd_ref = 1;
2958				}
2959
2960				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2961					&req->nr_mem_id);
2962				if (error)
2963					break;
2964				if (na == NULL) /* only memory info */
2965					break;
2966				netmap_update_config(na);
2967				req->nr_rx_rings = na->num_rx_rings;
2968				req->nr_tx_rings = na->num_tx_rings;
2969				req->nr_rx_slots = na->num_rx_desc;
2970				req->nr_tx_slots = na->num_tx_desc;
2971				req->nr_host_tx_rings = na->num_host_tx_rings;
2972				req->nr_host_rx_rings = na->num_host_rx_rings;
2973			} while (0);
2974			netmap_unget_na(na, ifp);
2975			if (nmd_ref)
2976				netmap_mem_put(nmd);
2977			NMG_UNLOCK();
2978			break;
2979		}
2980#ifdef WITH_VALE
2981		case NETMAP_REQ_VALE_ATTACH: {
2982			error = netmap_bdg_attach(hdr, NULL /* userspace request */);
2983			break;
2984		}
2985
2986		case NETMAP_REQ_VALE_DETACH: {
2987			error = netmap_bdg_detach(hdr, NULL /* userspace request */);
2988			break;
2989		}
2990
2991		case NETMAP_REQ_PORT_HDR_SET: {
2992			struct nmreq_port_hdr *req =
2993				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2994			/* Build a nmreq_register out of the nmreq_port_hdr,
			 * so that we can call netmap_get_vale_na(). */
2996			struct nmreq_register regreq;
2997			bzero(&regreq, sizeof(regreq));
2998			regreq.nr_mode = NR_REG_ALL_NIC;
2999
3000			/* For now we only support virtio-net headers, and only for
3001			 * VALE ports, but this may change in future. Valid lengths
3002			 * for the virtio-net header are 0 (no header), 10 and 12. */
3003			if (req->nr_hdr_len != 0 &&
3004				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
3005					req->nr_hdr_len != 12) {
3006				if (netmap_verbose)
3007					nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
3008				error = EINVAL;
3009				break;
3010			}
3011			NMG_LOCK();
3012			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
3013			hdr->nr_body = (uintptr_t)&regreq;
3014			error = netmap_get_vale_na(hdr, &na, NULL, 0);
3015			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
3016			hdr->nr_body = (uintptr_t)req;
3017			if (na && !error) {
3018				struct netmap_vp_adapter *vpna =
3019					(struct netmap_vp_adapter *)na;
3020				na->virt_hdr_len = req->nr_hdr_len;
3021				if (na->virt_hdr_len) {
3022					vpna->mfs = NETMAP_BUF_SIZE(na);
3023				}
3024				if (netmap_verbose)
3025					nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
3026				netmap_adapter_put(na);
3027			} else if (!na) {
3028				error = ENXIO;
3029			}
3030			NMG_UNLOCK();
3031			break;
3032		}
3033
3034		case NETMAP_REQ_PORT_HDR_GET: {
3035			/* Get vnet-header length for this netmap port */
3036			struct nmreq_port_hdr *req =
3037				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
3038			/* Build a nmreq_register out of the nmreq_port_hdr,
			 * so that we can call netmap_get_na(). */
3040			struct nmreq_register regreq;
3041			if_t ifp;
3042
3043			bzero(&regreq, sizeof(regreq));
3044			regreq.nr_mode = NR_REG_ALL_NIC;
3045			NMG_LOCK();
3046			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
3047			hdr->nr_body = (uintptr_t)&regreq;
3048			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
3049			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
3050			hdr->nr_body = (uintptr_t)req;
3051			if (na && !error) {
3052				req->nr_hdr_len = na->virt_hdr_len;
3053			}
3054			netmap_unget_na(na, ifp);
3055			NMG_UNLOCK();
3056			break;
3057		}
3058
3059		case NETMAP_REQ_VALE_LIST: {
3060			error = netmap_vale_list(hdr);
3061			break;
3062		}
3063
3064		case NETMAP_REQ_VALE_NEWIF: {
3065			error = nm_vi_create(hdr);
3066			break;
3067		}
3068
3069		case NETMAP_REQ_VALE_DELIF: {
3070			error = nm_vi_destroy(hdr->nr_name);
3071			break;
3072		}
3073#endif  /* WITH_VALE */
3074
3075		case NETMAP_REQ_VALE_POLLING_ENABLE:
3076		case NETMAP_REQ_VALE_POLLING_DISABLE: {
3077			error = nm_bdg_polling(hdr);
3078			break;
3079		}
3080		case NETMAP_REQ_POOLS_INFO_GET: {
3081			/* Get information from the memory allocator used for
3082			 * hdr->nr_name. */
3083			struct nmreq_pools_info *req =
3084				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
3085			NMG_LOCK();
3086			do {
3087				/* Build a nmreq_register out of the nmreq_pools_info,
3088				 * so that we can call netmap_get_na(). */
3089				struct nmreq_register regreq;
3090				bzero(&regreq, sizeof(regreq));
3091				regreq.nr_mem_id = req->nr_mem_id;
3092				regreq.nr_mode = NR_REG_ALL_NIC;
3093
3094				hdr->nr_reqtype = NETMAP_REQ_REGISTER;
3095				hdr->nr_body = (uintptr_t)&regreq;
3096				error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
3097				hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
3098				hdr->nr_body = (uintptr_t)req; /* reset nr_body */
3099				if (error) {
3100					na = NULL;
3101					ifp = NULL;
3102					break;
3103				}
3104				nmd = na->nm_mem; /* grab the memory allocator */
3105				if (nmd == NULL) {
3106					error = EINVAL;
3107					break;
3108				}
3109
3110				/* Finalize the memory allocator, get the pools
3111				 * information and release the allocator. */
3112				error = netmap_mem_finalize(nmd, na);
3113				if (error) {
3114					break;
3115				}
3116				error = netmap_mem_pools_info_get(req, nmd);
3117				netmap_mem_drop(na);
3118			} while (0);
3119			netmap_unget_na(na, ifp);
3120			NMG_UNLOCK();
3121			break;
3122		}
3123
3124		case NETMAP_REQ_CSB_ENABLE: {
3125			struct nmreq_option *opt;
3126
3127			opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
3128			if (opt == NULL) {
3129				error = EINVAL;
3130			} else {
3131				struct nmreq_opt_csb *csbo =
3132					(struct nmreq_opt_csb *)opt;
3133				NMG_LOCK();
3134				error = netmap_csb_validate(priv, csbo);
3135				NMG_UNLOCK();
3136				opt->nro_status = error;
3137			}
3138			break;
3139		}
3140
3141		case NETMAP_REQ_SYNC_KLOOP_START: {
3142			error = netmap_sync_kloop(priv, hdr);
3143			break;
3144		}
3145
3146		case NETMAP_REQ_SYNC_KLOOP_STOP: {
3147			error = netmap_sync_kloop_stop(priv);
3148			break;
3149		}
3150
3151		default: {
3152			error = EINVAL;
3153			break;
3154		}
3155		}
3156		/* Write back request body to userspace and reset the
3157		 * user-space pointer. */
3158		error = nmreq_copyout(hdr, error);
3159		break;
3160	}
3161
3162	case NIOCTXSYNC:
3163	case NIOCRXSYNC: {
3164		if (unlikely(priv->np_nifp == NULL)) {
3165			error = ENXIO;
3166			break;
3167		}
3168		mb(); /* make sure following reads are not from cache */
3169
3170		if (unlikely(priv->np_csb_atok_base)) {
3171			nm_prerr("Invalid sync in CSB mode");
3172			error = EBUSY;
3173			break;
3174		}
3175
3176		na = priv->np_na;      /* we have a reference */
3177
3178		mbq_init(&q);
3179		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
3180		krings = NMR(na, t);
3181		qfirst = priv->np_qfirst[t];
3182		qlast = priv->np_qlast[t];
3183		sync_flags = priv->np_sync_flags;
3184
3185		for (i = qfirst; i < qlast; i++) {
3186			struct netmap_kring *kring = krings[i];
3187			struct netmap_ring *ring = kring->ring;
3188
3189			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
3190				error = (error ? EIO : 0);
3191				continue;
3192			}
3193
3194			if (cmd == NIOCTXSYNC) {
3195				if (netmap_debug & NM_DEBUG_TXSYNC)
3196					nm_prinf("pre txsync ring %d cur %d hwcur %d",
3197					    i, ring->cur,
3198					    kring->nr_hwcur);
3199				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3200					netmap_ring_reinit(kring);
3201				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
3202					nm_sync_finalize(kring);
3203				}
3204				if (netmap_debug & NM_DEBUG_TXSYNC)
3205					nm_prinf("post txsync ring %d cur %d hwcur %d",
3206					    i, ring->cur,
3207					    kring->nr_hwcur);
3208			} else {
3209				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3210					netmap_ring_reinit(kring);
3211				}
3212				if (nm_may_forward_up(kring)) {
3213					/* transparent forwarding, see netmap_poll() */
3214					netmap_grab_packets(kring, &q, netmap_fwd);
3215				}
3216				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
3217					nm_sync_finalize(kring);
3218				}
3219				ring_timestamp_set(ring);
3220			}
3221			nm_kr_put(kring);
3222		}
3223
3224		if (mbq_peek(&q)) {
3225			netmap_send_up(na->ifp, &q);
3226		}
3227
3228		break;
3229	}
3230
3231	default: {
3232		return netmap_ioctl_legacy(priv, cmd, data, td);
3233		break;
3234	}
3235	}
3236
3237	return (error);
3238}
3239
3240size_t
3241nmreq_size_by_type(uint16_t nr_reqtype)
3242{
3243	switch (nr_reqtype) {
3244	case NETMAP_REQ_REGISTER:
3245		return sizeof(struct nmreq_register);
3246	case NETMAP_REQ_PORT_INFO_GET:
3247		return sizeof(struct nmreq_port_info_get);
3248	case NETMAP_REQ_VALE_ATTACH:
3249		return sizeof(struct nmreq_vale_attach);
3250	case NETMAP_REQ_VALE_DETACH:
3251		return sizeof(struct nmreq_vale_detach);
3252	case NETMAP_REQ_VALE_LIST:
3253		return sizeof(struct nmreq_vale_list);
3254	case NETMAP_REQ_PORT_HDR_SET:
3255	case NETMAP_REQ_PORT_HDR_GET:
3256		return sizeof(struct nmreq_port_hdr);
3257	case NETMAP_REQ_VALE_NEWIF:
3258		return sizeof(struct nmreq_vale_newif);
3259	case NETMAP_REQ_VALE_DELIF:
3260	case NETMAP_REQ_SYNC_KLOOP_STOP:
3261	case NETMAP_REQ_CSB_ENABLE:
3262		return 0;
3263	case NETMAP_REQ_VALE_POLLING_ENABLE:
3264	case NETMAP_REQ_VALE_POLLING_DISABLE:
3265		return sizeof(struct nmreq_vale_polling);
3266	case NETMAP_REQ_POOLS_INFO_GET:
3267		return sizeof(struct nmreq_pools_info);
3268	case NETMAP_REQ_SYNC_KLOOP_START:
3269		return sizeof(struct nmreq_sync_kloop_start);
3270	}
3271	return 0;
3272}
3273
3274static size_t
3275nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
3276{
3277	size_t rv = sizeof(struct nmreq_option);
3278#ifdef NETMAP_REQ_OPT_DEBUG
3279	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
3280		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
3281#endif /* NETMAP_REQ_OPT_DEBUG */
3282	switch (nro_reqtype) {
3283#ifdef WITH_EXTMEM
3284	case NETMAP_REQ_OPT_EXTMEM:
3285		rv = sizeof(struct nmreq_opt_extmem);
3286		break;
3287#endif /* WITH_EXTMEM */
3288	case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
3289		if (nro_size >= rv)
3290			rv = nro_size;
3291		break;
3292	case NETMAP_REQ_OPT_CSB:
3293		rv = sizeof(struct nmreq_opt_csb);
3294		break;
3295	case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
3296		rv = sizeof(struct nmreq_opt_sync_kloop_mode);
3297		break;
3298	case NETMAP_REQ_OPT_OFFSETS:
3299		rv = sizeof(struct nmreq_opt_offsets);
3300		break;
3301	}
3302	/* subtract the common header */
3303	return rv - sizeof(struct nmreq_option);
3304}
3305
3306/*
3307 * nmreq_copyin: create an in-kernel version of the request.
3308 *
3309 * We build the following data structure:
3310 *
3311 * hdr -> +-------+                buf
3312 *        |       |          +---------------+
3313 *        +-------+          |usr body ptr   |
3314 *        |options|-.        +---------------+
3315 *        +-------+ |        |usr options ptr|
3316 *        |body   |--------->+---------------+
3317 *        +-------+ |        |               |
3318 *                  |        |  copy of body |
3319 *                  |        |               |
3320 *                  |        +---------------+
3321 *                  |        |    NULL       |
3322 *                  |        +---------------+
3323 *                  |    .---|               |\
3324 *                  |    |   +---------------+ |
3325 *                  | .------|               | |
3326 *                  | |  |   +---------------+  \ option table
3327 *                  | |  |   |      ...      |  / indexed by option
3328 *                  | |  |   +---------------+ |  type
3329 *                  | |  |   |               | |
3330 *                  | |  |   +---------------+/
3331 *                  | |  |   |usr next ptr 1 |
3332 *                  `-|----->+---------------+
3333 *                    |  |   | copy of opt 1 |
3334 *                    |  |   |               |
3335 *                    |  | .-| nro_next      |
3336 *                    |  | | +---------------+
3337 *                    |  | | |usr next ptr 2 |
3338 *                    |  `-`>+---------------+
3339 *                    |      | copy of opt 2 |
3340 *                    |      |               |
3341 *                    |    .-| nro_next      |
3342 *                    |    | +---------------+
3343 *                    |    | |               |
3344 *                    ~    ~ ~      ...      ~
3345 *                    |    .-|               |
3346 *                    `----->+---------------+
3347 *                         | |usr next ptr n |
3348 *                         `>+---------------+
3349 *                           | copy of opt n |
3350 *                           |               |
3351 *                           | nro_next(NULL)|
3352 *                           +---------------+
3353 *
3354 * The options and body fields of the hdr structure are overwritten
3355 * with in-kernel valid pointers inside the buf. The original user
3356 * pointers are saved in the buf and restored on copyout.
3357 * The list of options is copied and the pointers adjusted. The
 * original pointers are saved right before the option they belong to.
3359 *
3360 * The option table has an entry for every available option.  Entries
3361 * for options that have not been passed contain NULL.
3362 *
3363 */
3364
3365int
3366nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
3367{
3368	size_t rqsz, optsz, bufsz;
3369	int error = 0;
3370	char *ker = NULL, *p;
3371	struct nmreq_option **next, *src, **opt_tab, *opt;
3372	uint64_t *ptrs;
3373
3374	if (hdr->nr_reserved) {
3375		if (netmap_verbose)
3376			nm_prerr("nr_reserved must be zero");
3377		return EINVAL;
3378	}
3379
3380	if (!nr_body_is_user)
3381		return 0;
3382
3383	hdr->nr_reserved = nr_body_is_user;
3384
3385	/* compute the total size of the buffer */
3386	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
3387	if (rqsz > NETMAP_REQ_MAXSIZE) {
3388		error = EMSGSIZE;
3389		goto out_err;
3390	}
3391	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
3392		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3393		/* Request body expected, but not found; or
3394		 * request body found but unexpected. */
3395		if (netmap_verbose)
3396			nm_prerr("nr_body expected but not found, or vice versa");
3397		error = EINVAL;
3398		goto out_err;
3399	}
3400
3401	/*
3402	 * The buffer size must be large enough to store the request body,
3403	 * all the possible options and the additional user pointers
3404	 * (2+NETMAP_REQ_OPT_MAX). Note that the maximum size of body plus
3405	 * options cannot exceed NETMAP_REQ_MAXSIZE.
3406	 */
3407	bufsz = (2 + NETMAP_REQ_OPT_MAX) * sizeof(void *) + NETMAP_REQ_MAXSIZE +
3408		NETMAP_REQ_OPT_MAX * sizeof(opt_tab);
3409
3410	ker = nm_os_malloc(bufsz);
3411	if (ker == NULL) {
3412		error = ENOMEM;
3413		goto out_err;
3414	}
3415	p = ker;	/* write pointer into the buffer */
3416
3417	/* make a copy of the user pointers */
3418	ptrs = (uint64_t*)p;
3419	*ptrs++ = hdr->nr_body;
3420	*ptrs++ = hdr->nr_options;
3421	p = (char *)ptrs;
3422	/* overwrite the user pointer with the in-kernel one */
3423	hdr->nr_body = (uintptr_t)p;
3424	/* prepare the options-list pointers and temporarily terminate
3425	 * the in-kernel list, in case we have to jump to out_restore
3426	 */
3427	next = (struct nmreq_option **)&hdr->nr_options;
3428	src = *next;
3429	hdr->nr_options = 0;
3430
3431	/* copy the body */
3432	error = copyin(*(void **)ker, p, rqsz);
3433	if (error)
3434		goto out_restore;
3435	p += rqsz;
3436	/* start of the options table */
3437	opt_tab = (struct nmreq_option **)p;
3438	p += sizeof(opt_tab) * NETMAP_REQ_OPT_MAX;
3439
3440	/* copy the options */
3441	while (src) {
3442		struct nmreq_option *nsrc;
3443
3444		if (p - ker + sizeof(uint64_t*) + sizeof(*src) > bufsz) {
3445			error = EMSGSIZE;
3446			/* there might be a loop in the list: don't try to
3447			 * copyout the options
3448			 */
3449			hdr->nr_options = 0;
3450			goto out_restore;
3451		}
3452		/* copy the option header */
3453		ptrs = (uint64_t *)p;
3454		opt = (struct nmreq_option *)(ptrs + 1);
3455		error = copyin(src, opt, sizeof(*src));
3456		if (error)
3457			goto out_restore;
3458		rqsz += sizeof(*src);
3459		p = (char *)(opt + 1);
3460
3461		/* make a copy of the user next pointer */
3462		*ptrs = opt->nro_next;
3463		/* append the option to the in-kernel list */
3464		*next = opt;
3465		/* temporarily terminate the in-kernel list, in case we have to
3466		 * jump to out_restore
3467		 */
3468		nsrc = (struct nmreq_option *)opt->nro_next;
3469		opt->nro_next = 0;
3470
3471		opt->nro_status = 0;
3472
3473		/* check for invalid types */
3474		if (opt->nro_reqtype < 1) {
3475			if (netmap_verbose)
3476				nm_prinf("invalid option type: %u", opt->nro_reqtype);
3477			opt->nro_status = EINVAL;
3478			error = EINVAL;
3479			goto out_restore;
3480		}
3481
3482		if (opt->nro_reqtype >= NETMAP_REQ_OPT_MAX) {
3483			/* opt->nro_status will be set to EOPNOTSUPP */
3484			goto next;
3485		}
3486
3487		/* if the type is valid, index the option in the table
3488		 * unless it is a duplicate.
3489		 */
3490		if (opt_tab[opt->nro_reqtype] != NULL) {
3491			if (netmap_verbose)
3492				nm_prinf("duplicate option: %u", opt->nro_reqtype);
3493			opt->nro_status = EINVAL;
3494			opt_tab[opt->nro_reqtype]->nro_status = EINVAL;
3495			error = EINVAL;
3496			goto out_restore;
3497		}
3498		opt_tab[opt->nro_reqtype] = opt;
3499
3500		/* copy the option body */
3501		optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3502						opt->nro_size);
3503		/* check optsz and nro_size to avoid possible integer overflows of rqsz */
3504		if ((optsz > NETMAP_REQ_MAXSIZE) || (opt->nro_size > NETMAP_REQ_MAXSIZE)
3505				|| (rqsz + optsz > NETMAP_REQ_MAXSIZE)
3506				|| (optsz > 0 && rqsz + optsz <= rqsz)) {
3507			error = EMSGSIZE;
3508			goto out_restore;
3509		}
3510		rqsz += optsz;
3511		if (optsz) {
3512			/* the option body follows the option header */
3513			error = copyin(src + 1, p, optsz);
3514			if (error)
3515				goto out_restore;
3516			p += optsz;
3517		}
3518
3519	next:
3520		/* move to next option */
3521		next = (struct nmreq_option **)&opt->nro_next;
3522		src = nsrc;
3523	}
3524
3525	/* initialize all the options as not supported.  Recognized options
3526	 * will have their nro_status field overwritten by their handlers.
3527	 */
3528	for (src = (struct nmreq_option *)hdr->nr_options; src;
3529			src = (struct nmreq_option *)src->nro_next) {
3530		src->nro_status = EOPNOTSUPP;
3531	}
3532	return 0;
3533
3534out_restore:
3535	nmreq_copyout(hdr, error);
3536out_err:
3537	return error;
3538}
3539
3540static int
3541nmreq_copyout(struct nmreq_header *hdr, int rerror)
3542{
3543	struct nmreq_option *src, *dst;
3544	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3545	uint64_t *ptrs;
3546	size_t bodysz;
3547	int error;
3548
3549	if (!hdr->nr_reserved)
3550		return rerror;
3551
3552	/* restore the user pointers in the header */
3553	ptrs = (uint64_t *)ker - 2;
3554	bufstart = ptrs;
3555	hdr->nr_body = *ptrs++;
3556	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3557	hdr->nr_options = *ptrs;
3558
3559	if (!rerror) {
3560		/* copy the body */
3561		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3562		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3563		if (error) {
3564			rerror = error;
3565			goto out;
3566		}
3567	}
3568
3569	/* copy the options */
3570	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3571	while (src) {
3572		size_t optsz;
3573		uint64_t next;
3574
3575		/* restore the user pointer */
3576		next = src->nro_next;
3577		ptrs = (uint64_t *)src - 1;
3578		src->nro_next = *ptrs;
3579
3580		/* always copy the option header */
3581		error = copyout(src, dst, sizeof(*src));
3582		if (error) {
3583			rerror = error;
3584			goto out;
3585		}
3586
3587		/* copy the option body only if there was no error */
3588		if (!rerror && !src->nro_status) {
3589			optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3590							src->nro_size);
3591			if (optsz) {
3592				error = copyout(src + 1, dst + 1, optsz);
3593				if (error) {
3594					rerror = error;
3595					goto out;
3596				}
3597			}
3598		}
3599		src = (struct nmreq_option *)(uintptr_t)next;
3600		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3601	}
3602
3603
3604out:
3605	hdr->nr_reserved = 0;
3606	nm_os_free(bufstart);
3607	return rerror;
3608}
3609
3610struct nmreq_option *
3611nmreq_getoption(struct nmreq_header *hdr, uint16_t reqtype)
3612{
3613	struct nmreq_option **opt_tab;
3614
3615	if (!hdr->nr_options)
3616		return NULL;
3617
3618	opt_tab = (struct nmreq_option **)((uintptr_t)hdr->nr_options) -
3619	    (NETMAP_REQ_OPT_MAX + 1);
3620	return opt_tab[reqtype];
3621}
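
/*
 * Hypothetical use by a request handler (sketch only): look the option up
 * in the table and clear nro_status once it has been handled, so that
 * nmreq_checkoptions() below does not fail the request.  handle_extmem()
 * is a made-up helper.
 *
 *	struct nmreq_option *opt;
 *
 *	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_EXTMEM);
 *	if (opt != NULL) {
 *		error = handle_extmem(priv, opt);
 *		opt->nro_status = error;	<- 0 marks the option as handled
 *		if (error)
 *			return error;
 *	}
 */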
3622
3623static int
3624nmreq_checkoptions(struct nmreq_header *hdr)
3625{
3626	struct nmreq_option *opt;
3627	/* return error if there is still any option
3628	 * marked as not supported
3629	 */
3630
3631	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3632	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3633		if (opt->nro_status == EOPNOTSUPP)
3634			return EOPNOTSUPP;
3635
3636	return 0;
3637}
3638
3639/*
3640 * select(2) and poll(2) handlers for the "netmap" device.
3641 *
3642 * Can be called for one or more queues.
3643 * Return the event mask corresponding to the ready events.
3644 * If there are no ready events (and 'sr' is not NULL), do a
3645 * selrecord on either individual selinfo or on the global one.
3646 * Device-dependent parts (locking and sync of tx/rx rings)
3647 * are done through callbacks.
3648 *
3649 * On Linux, the arguments are really pwait, the poll table, and 'td' is
3650 * a struct file *. The first one is remapped to pwait because selrecord()
3651 * uses that name as a hidden argument.
3652 */
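
/*
 * For reference, a minimal userspace receive loop that exercises this
 * handler (a sketch assuming the net/netmap_user.h helpers; nmfd is an
 * open and registered /dev/netmap descriptor, nifp the mapped netmap_if,
 * and consume() a made-up callback):
 *
 *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
 *	struct pollfd pfd = { .fd = nmfd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, 2500) >= 0) {
 *		while (!nm_ring_empty(ring)) {
 *			struct netmap_slot *slot = &ring->slot[ring->head];
 *
 *			consume(NETMAP_BUF(ring, slot->buf_idx), slot->len);
 *			ring->head = ring->cur = nm_ring_next(ring, ring->head);
 *		}
 *	}
 */
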
3653int
3654netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3655{
3656	struct netmap_adapter *na;
3657	struct netmap_kring *kring;
3658	struct netmap_ring *ring;
3659	u_int i, want[NR_TXRX], revents = 0;
3660	NM_SELINFO_T *si[NR_TXRX];
3661#define want_tx want[NR_TX]
3662#define want_rx want[NR_RX]
3663	struct mbq q;	/* packets from RX hw queues to host stack */
3664
3665	/*
3666	 * In order to avoid nested locks, we need to "double check"
3667	 * txsync and rxsync if we decide to do a selrecord().
3668	 * retry_tx (and retry_rx, later) prevent looping forever.
3669	 */
3670	int retry_tx = 1, retry_rx = 1;
3671
3672	/* Transparent mode: send_down is 1 if we have found some
3673	 * packets to forward (host RX ring --> NIC) during the rx
3674	 * scan and we have not sent them down to the NIC yet.
3675	 * Transparent mode requires binding all rings to a single
3676	 * file descriptor.
3677	 */
3678	int send_down = 0;
3679	int sync_flags = priv->np_sync_flags;
3680
3681	mbq_init(&q);
3682
3683	if (unlikely(priv->np_nifp == NULL)) {
3684		return POLLERR;
3685	}
3686	mb(); /* make sure following reads are not from cache */
3687
3688	na = priv->np_na;
3689
3690	if (unlikely(!nm_netmap_on(na)))
3691		return POLLERR;
3692
3693	if (unlikely(priv->np_csb_atok_base)) {
3694		nm_prerr("Invalid poll in CSB mode");
3695		return POLLERR;
3696	}
3697
3698	if (netmap_debug & NM_DEBUG_ON)
3699		nm_prinf("device %s events 0x%x", na->name, events);
3700	want_tx = events & (POLLOUT | POLLWRNORM);
3701	want_rx = events & (POLLIN | POLLRDNORM);
3702
3703	/*
3704	 * If the card has more than one queue AND the file descriptor is
3705	 * bound to all of them, we sleep on the "global" selinfo, otherwise
3706	 * we sleep on the individual selinfo (FreeBSD only allows two selinfos
3707	 * per file descriptor).
3708	 * The interrupt routine in the driver wakes up one or the other
3709	 * (or both) depending on which clients are active.
3710	 *
3711	 * rxsync() is only called if we run out of buffers on a POLLIN.
3712	 * txsync() is called if we run out of buffers on POLLOUT, or
3713	 * there are pending packets to send. The latter can be disabled
3714	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
3715	 */
3716	si[NR_RX] = priv->np_si[NR_RX];
3717	si[NR_TX] = priv->np_si[NR_TX];
3718
3719#ifdef __FreeBSD__
3720	/*
3721	 * We start with a lock free round which is cheap if we have
3722	 * slots available. If this fails, then lock and call the sync
3723	 * routines. We can't do this on Linux, as the contract says
3724	 * that we must call nm_os_selrecord() unconditionally.
3725	 */
3726	if (want_tx) {
3727		const enum txrx t = NR_TX;
3728		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3729			kring = NMR(na, t)[i];
3730			if (kring->ring->cur != kring->ring->tail) {
3731				/* Some unseen TX space is available, so
3732				 * we don't need to run txsync. */
3733				revents |= want[t];
3734				want[t] = 0;
3735				break;
3736			}
3737		}
3738	}
3739	if (want_rx) {
3740		const enum txrx t = NR_RX;
3741		int rxsync_needed = 0;
3742
3743		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3744			kring = NMR(na, t)[i];
3745			if (kring->ring->cur == kring->ring->tail
3746				|| kring->rhead != kring->ring->head) {
3747				/* There are no unseen packets on this ring,
3748				 * or there are some buffers to be returned
3749				 * to the netmap port. We therefore go ahead
3750				 * and run rxsync. */
3751				rxsync_needed = 1;
3752				break;
3753			}
3754		}
3755		if (!rxsync_needed) {
3756			revents |= want_rx;
3757			want_rx = 0;
3758		}
3759	}
3760#endif
3761
3762#ifdef linux
3763	/* The selrecord must be unconditional on linux. */
3764	nm_os_selrecord(sr, si[NR_RX]);
3765	nm_os_selrecord(sr, si[NR_TX]);
3766#endif /* linux */
3767
3768	/*
3769	 * If we want to push packets out (priv->np_txpoll) or
3770	 * want_tx is still set, we must issue txsync calls
3771	 * (on all rings, to avoid stalling the tx rings).
3772	 * Fortunately, normal tx mode has np_txpoll set.
3773	 */
3774	if (priv->np_txpoll || want_tx) {
3775		/*
3776		 * The first round checks if anyone is ready; if not,
3777		 * do a selrecord and another round to handle races.
3778		 * want_tx goes to 0 if any space is found, and is
3779		 * used to skip rings with no pending transmissions.
3780		 */
3781flush_tx:
3782		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3783			int found = 0;
3784
3785			kring = na->tx_rings[i];
3786			ring = kring->ring;
3787
3788			/*
3789			 * Don't try to txsync this TX ring if we already found some
3790			 * space in some of the TX rings (want_tx == 0) and there are no
3791			 * TX slots in this ring that need to be flushed to the NIC
3792			 * (head == hwcur).
3793			 */
3794			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3795				continue;
3796
3797			if (nm_kr_tryget(kring, 1, &revents))
3798				continue;
3799
3800			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3801				netmap_ring_reinit(kring);
3802				revents |= POLLERR;
3803			} else {
3804				if (kring->nm_sync(kring, sync_flags))
3805					revents |= POLLERR;
3806				else
3807					nm_sync_finalize(kring);
3808			}
3809
3810			/*
3811			 * If we found new slots, notify potential
3812			 * listeners on the same ring.
3813			 * Since we just did a txsync, look at the copies
3814			 * of cur,tail in the kring.
3815			 */
3816			found = kring->rcur != kring->rtail;
3817			nm_kr_put(kring);
3818			if (found) { /* notify other listeners */
3819				revents |= want_tx;
3820				want_tx = 0;
3821#ifndef linux
3822				kring->nm_notify(kring, 0);
3823#endif /* !linux */
3824			}
3825		}
3826		/* if there were any packets to forward, we must have handled them by now */
3827		send_down = 0;
3828		if (want_tx && retry_tx && sr) {
3829#ifndef linux
3830			nm_os_selrecord(sr, si[NR_TX]);
3831#endif /* !linux */
3832			retry_tx = 0;
3833			goto flush_tx;
3834		}
3835	}
3836
3837	/*
3838	 * If want_rx is still set scan receive rings.
3839	 * Do it on all rings because otherwise we starve.
3840	 */
3841	if (want_rx) {
3842		/* two rounds here for race avoidance */
3843do_retry_rx:
3844		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3845			int found = 0;
3846
3847			kring = na->rx_rings[i];
3848			ring = kring->ring;
3849
3850			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3851				continue;
3852
3853			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3854				netmap_ring_reinit(kring);
3855				revents |= POLLERR;
3856			}
3857			/* now we can use kring->rcur, rtail */
3858
3859			/*
3860			 * transparent mode support: collect packets from
3861			 * hw rxring(s) that have been released by the user
3862			 */
3863			if (nm_may_forward_up(kring)) {
3864				netmap_grab_packets(kring, &q, netmap_fwd);
3865			}
3866
3867			/* Clear the NR_FORWARD flag anyway: it may be set by
3868			 * the nm_sync() below, but only for the host RX ring (see
3869			 * netmap_rxsync_from_host()). */
3870			kring->nr_kflags &= ~NR_FORWARD;
3871			if (kring->nm_sync(kring, sync_flags))
3872				revents |= POLLERR;
3873			else
3874				nm_sync_finalize(kring);
3875			send_down |= (kring->nr_kflags & NR_FORWARD);
3876			ring_timestamp_set(ring);
3877			found = kring->rcur != kring->rtail;
3878			nm_kr_put(kring);
3879			if (found) {
3880				revents |= want_rx;
3881				retry_rx = 0;
3882#ifndef linux
3883				kring->nm_notify(kring, 0);
3884#endif /* !linux */
3885			}
3886		}
3887
3888#ifndef linux
3889		if (retry_rx && sr) {
3890			nm_os_selrecord(sr, si[NR_RX]);
3891		}
3892#endif /* !linux */
3893		if (send_down || retry_rx) {
3894			retry_rx = 0;
3895			if (send_down)
3896				goto flush_tx; /* and retry_rx */
3897			else
3898				goto do_retry_rx;
3899		}
3900	}
3901
3902	/*
3903	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3904	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3905	 * to the host stack.
3906	 */
3907
3908	if (mbq_peek(&q)) {
3909		netmap_send_up(na->ifp, &q);
3910	}
3911
3912	return (revents);
3913#undef want_tx
3914#undef want_rx
3915}
3916
3917int
3918nma_intr_enable(struct netmap_adapter *na, int onoff)
3919{
3920	bool changed = false;
3921	enum txrx t;
3922	int i;
3923
3924	for_rx_tx(t) {
3925		for (i = 0; i < nma_get_nrings(na, t); i++) {
3926			struct netmap_kring *kring = NMR(na, t)[i];
3927			int on = !(kring->nr_kflags & NKR_NOINTR);
3928
3929			if (!!onoff != !!on) {
3930				changed = true;
3931			}
3932			if (onoff) {
3933				kring->nr_kflags &= ~NKR_NOINTR;
3934			} else {
3935				kring->nr_kflags |= NKR_NOINTR;
3936			}
3937		}
3938	}
3939
3940	if (!changed) {
3941		return 0; /* nothing to do */
3942	}
3943
3944	if (!na->nm_intr) {
3945		nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3946		  na->name);
3947		return -1;
3948	}
3949
3950	na->nm_intr(na, onoff);
3951
3952	return 0;
3953}
3954
3955
3956/*-------------------- driver support routines -------------------*/
3957
3958/* default notify callback */
3959static int
3960netmap_notify(struct netmap_kring *kring, int flags)
3961{
3962	struct netmap_adapter *na = kring->notify_na;
3963	enum txrx t = kring->tx;
3964
3965	nm_os_selwakeup(&kring->si);
3966	/* optimization: avoid a wake up on the global
3967	 * queue if nobody has registered for more
3968	 * than one ring
3969	 */
3970	if (na->si_users[t] > 0)
3971		nm_os_selwakeup(&na->si[t]);
3972
3973	return NM_IRQ_COMPLETED;
3974}
3975
3976/* called by all routines that create netmap_adapters.
3977 * provide some defaults and get a reference to the
3978 * memory allocator
3979 */
3980int
3981netmap_attach_common(struct netmap_adapter *na)
3982{
3983	if (!na->rx_buf_maxsize) {
3984		/* Set a conservative default (larger is safer). */
3985		na->rx_buf_maxsize = PAGE_SIZE;
3986	}
3987
3988#ifdef __FreeBSD__
3989	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3990		na->if_input = if_getinputfn(na->ifp); /* for netmap_send_up */
3991	}
3992	na->pdev = na; /* make sure netmap_mem_map() is called */
3993#endif /* __FreeBSD__ */
3994	if (na->na_flags & NAF_HOST_RINGS) {
3995		if (na->num_host_rx_rings == 0)
3996			na->num_host_rx_rings = 1;
3997		if (na->num_host_tx_rings == 0)
3998			na->num_host_tx_rings = 1;
3999	}
4000	if (na->nm_krings_create == NULL) {
4001		/* we assume that we have been called by a driver,
4002		 * since other port types all provide their own
4003		 * nm_krings_create
4004		 */
4005		na->nm_krings_create = netmap_hw_krings_create;
4006		na->nm_krings_delete = netmap_hw_krings_delete;
4007	}
4008	if (na->nm_notify == NULL)
4009		na->nm_notify = netmap_notify;
4010	na->active_fds = 0;
4011
4012	if (na->nm_mem == NULL) {
4013		/* use iommu or global allocator */
4014		na->nm_mem = netmap_mem_get_iommu(na);
4015	}
4016	if (na->nm_bdg_attach == NULL)
4017		/* no special nm_bdg_attach callback. On VALE
4018		 * attach, we need to interpose a bwrap
4019		 */
4020		na->nm_bdg_attach = netmap_default_bdg_attach;
4021
4022	return 0;
4023}
4024
4025/* Wrapper for the register callback provided by netmap-enabled
4026 * hardware drivers.
4027 * nm_iszombie(na) means that the driver module has been
4028 * unloaded, so we cannot call into it.
4029 * nm_os_ifnet_lock() must guarantee mutual exclusion with
4030 * module unloading.
4031 */
4032static int
4033netmap_hw_reg(struct netmap_adapter *na, int onoff)
4034{
4035	struct netmap_hw_adapter *hwna =
4036		(struct netmap_hw_adapter*)na;
4037	int error = 0;
4038
4039	nm_os_ifnet_lock();
4040
4041	if (nm_iszombie(na)) {
4042		if (onoff) {
4043			error = ENXIO;
4044		} else if (na != NULL) {
4045			na->na_flags &= ~NAF_NETMAP_ON;
4046		}
4047		goto out;
4048	}
4049
4050	error = hwna->nm_hw_register(na, onoff);
4051
4052out:
4053	nm_os_ifnet_unlock();
4054
4055	return error;
4056}
4057
4058static void
4059netmap_hw_dtor(struct netmap_adapter *na)
4060{
4061	if (na->ifp == NULL)
4062		return;
4063
4064	NM_DETACH_NA(na->ifp);
4065}
4066
4067
4068/*
4069 * Allocate a netmap_adapter object, and initialize it from the
4070 * 'arg' passed by the driver on attach.
4071 * We allocate a block of memory of 'size' bytes, which has room
4072 * for struct netmap_adapter plus additional room private to
4073 * the caller.
4074 * Return 0 on success, ENOMEM otherwise.
4075 */
4076int
4077netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
4078{
4079	struct netmap_hw_adapter *hwna = NULL;
4080	if_t ifp = NULL;
4081
4082	if (size < sizeof(struct netmap_hw_adapter)) {
4083		if (netmap_debug & NM_DEBUG_ON)
4084			nm_prerr("Invalid netmap adapter size %d", (int)size);
4085		return EINVAL;
4086	}
4087
4088	if (arg == NULL || arg->ifp == NULL) {
4089		if (netmap_debug & NM_DEBUG_ON)
4090			nm_prerr("either arg or arg->ifp is NULL");
4091		return EINVAL;
4092	}
4093
4094	if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
4095		if (netmap_debug & NM_DEBUG_ON)
4096			nm_prerr("%s: invalid rings tx %d rx %d",
4097				arg->name, arg->num_tx_rings, arg->num_rx_rings);
4098		return EINVAL;
4099	}
4100
4101	ifp = arg->ifp;
4102	if (NM_NA_CLASH(ifp)) {
4103		/* If NA(ifp) is not null but there is no valid netmap
4104		 * adapter it means that someone else is using the same
4105		 * pointer (e.g. ax25_ptr on linux). This happens for
4106		 * instance when also PF_RING is in use. */
4107		nm_prerr("Error: netmap adapter hook is busy");
4108		return EBUSY;
4109	}
4110
4111	hwna = nm_os_malloc(size);
4112	if (hwna == NULL)
4113		goto fail;
4114	hwna->up = *arg;
4115	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
4116	strlcpy(hwna->up.name, if_name(ifp), sizeof(hwna->up.name));
4117	if (override_reg) {
4118		hwna->nm_hw_register = hwna->up.nm_register;
4119		hwna->up.nm_register = netmap_hw_reg;
4120	}
4121	if (netmap_attach_common(&hwna->up)) {
4122		nm_os_free(hwna);
4123		goto fail;
4124	}
4125	netmap_adapter_get(&hwna->up);
4126
4127	NM_ATTACH_NA(ifp, &hwna->up);
4128
4129	nm_os_onattach(ifp);
4130
4131	if (arg->nm_dtor == NULL) {
4132		hwna->up.nm_dtor = netmap_hw_dtor;
4133	}
4134
4135	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
4136	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
4137	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
4138	return 0;
4139
4140fail:
4141	nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
4142	return (hwna ? EINVAL : ENOMEM);
4143}
4144
4145
4146int
4147netmap_attach(struct netmap_adapter *arg)
4148{
4149	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
4150			1 /* override nm_reg */);
4151}
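
/*
 * For illustration, the typical attach-time call from a hypothetical
 * foo(4) driver (names with the foo_ prefix are placeholders; the real
 * callbacks must implement the driver's txsync/rxsync/register logic):
 *
 *	static void
 *	foo_netmap_attach(struct foo_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.nm_txsync = foo_netmap_txsync;
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		na.nm_register = foo_netmap_reg;
 *		netmap_attach(&na);
 *	}
 */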
4152
4153
4154void
4155NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
4156{
4157	if (!na) {
4158		return;
4159	}
4160
4161	refcount_acquire(&na->na_refcount);
4162}
4163
4164
4165/* returns 1 iff the netmap_adapter is destroyed */
4166int
4167NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
4168{
4169	if (!na)
4170		return 1;
4171
4172	if (!refcount_release(&na->na_refcount))
4173		return 0;
4174
4175	if (na->nm_dtor)
4176		na->nm_dtor(na);
4177
4178	if (na->tx_rings) { /* XXX should not happen */
4179		if (netmap_debug & NM_DEBUG_ON)
4180			nm_prerr("freeing leftover tx_rings");
4181		na->nm_krings_delete(na);
4182	}
4183	netmap_pipe_dealloc(na);
4184	if (na->nm_mem)
4185		netmap_mem_put(na->nm_mem);
4186	bzero(na, sizeof(*na));
4187	nm_os_free(na);
4188
4189	return 1;
4190}
4191
4192/* nm_krings_create callback for all hardware native adapters */
4193int
4194netmap_hw_krings_create(struct netmap_adapter *na)
4195{
4196	int ret = netmap_krings_create(na, 0);
4197	if (ret == 0) {
4198		/* initialize the mbq for the sw rx ring */
4199		u_int lim = netmap_real_rings(na, NR_RX), i;
4200		for (i = na->num_rx_rings; i < lim; i++) {
4201			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
4202		}
4203		nm_prdis("initialized sw rx queue %d", na->num_rx_rings);
4204	}
4205	return ret;
4206}
4207
4208
4209
4210/*
4211 * Called on module unload by the netmap-enabled drivers
4212 */
4213void
4214netmap_detach(if_t ifp)
4215{
4216	struct netmap_adapter *na;
4217
4218	NMG_LOCK();
4219
4220	if (!NM_NA_VALID(ifp)) {
4221		NMG_UNLOCK();
4222		return;
4223	}
4224
4225	na = NA(ifp);
4226	netmap_set_all_rings(na, NM_KR_LOCKED);
4227	/*
4228	 * if the netmap adapter is not native, somebody
4229	 * changed it, so we can not release it here.
4230	 * The NAF_ZOMBIE flag will notify the new owner that
4231	 * the driver is gone.
4232	 */
4233	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
4234		na->na_flags |= NAF_ZOMBIE;
4235	}
4236	/* give active users a chance to notice that NAF_ZOMBIE has been
4237	 * turned on, so that they can stop and return an error to userspace.
4238	 * Note that this becomes a NOP if there are no active users and,
4239	 * therefore, the put() above has deleted the na, since now NA(ifp) is
4240	 * NULL.
4241	 */
4242	netmap_enable_all_rings(ifp);
4243	NMG_UNLOCK();
4244}
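
/*
 * A hypothetical foo(4) driver simply calls this from its detach path,
 * before the ifnet is freed:
 *
 *	netmap_detach(sc->ifp);
 */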
4245
4246
4247/*
4248 * Intercept packets from the network stack and pass them
4249 * to netmap as incoming packets on the 'software' ring.
4250 *
4251 * We only store packets in a bounded mbq and then copy them
4252 * in the relevant rxsync routine.
4253 *
4254 * We rely on the OS to make sure that the ifp and na do not go
4255 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
4256 * In nm_register() or whenever there is a reinitialization,
4257 * we make sure to make the mode change visible here.
4258 */
4259int
4260netmap_transmit(if_t ifp, struct mbuf *m)
4261{
4262	struct netmap_adapter *na = NA(ifp);
4263	struct netmap_kring *kring, *tx_kring;
4264	u_int len = MBUF_LEN(m);
4265	u_int error = ENOBUFS;
4266	unsigned int txr;
4267	struct mbq *q;
4268	int busy;
4269	u_int i;
4270
4271	i = MBUF_TXQ(m);
4272	if (i >= na->num_host_rx_rings) {
4273		i = i % na->num_host_rx_rings;
4274	}
4275	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
4276
4277	// XXX [Linux] we do not need this lock
4278	// if we follow the down/configure/up protocol -gl
4279	// mtx_lock(&na->core_lock);
4280
4281	if (!nm_netmap_on(na)) {
4282		nm_prerr("%s not in netmap mode anymore", na->name);
4283		error = ENXIO;
4284		goto done;
4285	}
4286
4287	txr = MBUF_TXQ(m);
4288	if (txr >= na->num_tx_rings) {
4289		txr %= na->num_tx_rings;
4290	}
4291	tx_kring = NMR(na, NR_TX)[txr];
4292
4293	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
4294		return MBUF_TRANSMIT(na, ifp, m);
4295	}
4296
4297	q = &kring->rx_queue;
4298
4299	// XXX reconsider long packets if we handle fragments
4300	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
4301		nm_prerr("%s from_host, drop packet size %d > %d", na->name,
4302			len, NETMAP_BUF_SIZE(na));
4303		goto done;
4304	}
4305
4306	if (!netmap_generic_hwcsum) {
4307		if (nm_os_mbuf_has_csum_offld(m)) {
4308			nm_prlim(1, "%s drop mbuf that needs checksum offload", na->name);
4309			goto done;
4310		}
4311	}
4312
4313	if (nm_os_mbuf_has_seg_offld(m)) {
4314		nm_prlim(1, "%s drop mbuf that needs generic segmentation offload", na->name);
4315		goto done;
4316	}
4317
4318#ifdef __FreeBSD__
4319	ETHER_BPF_MTAP(ifp, m);
4320#endif /* __FreeBSD__ */
4321
4322	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
4323	 * and maybe other instances of netmap_transmit (the latter
4324	 * not possible on Linux).
4325	 * We enqueue the mbuf only if we are sure there is going to be
4326	 * enough room in the host RX ring, otherwise we drop it.
4327	 */
4328	mbq_lock(q);
4329
4330	busy = kring->nr_hwtail - kring->nr_hwcur;
4331	if (busy < 0)
4332		busy += kring->nkr_num_slots;
4333	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
4334		nm_prlim(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
4335			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
4336	} else {
4337		mbq_enqueue(q, m);
4338		nm_prdis(2, "%s %d bufs in queue", na->name, mbq_len(q));
4339		/* notify outside the lock */
4340		m = NULL;
4341		error = 0;
4342	}
4343	mbq_unlock(q);
4344
4345done:
4346	if (m) {
4347		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4348		m_freem(m);
4349	}
4350	/* unconditionally wake up listeners */
4351	kring->nm_notify(kring, 0);
4352	/* this is normally netmap_notify(), but for NICs
4353	 * connected to a bridge it is netmap_bwrap_intr_notify(),
4354	 * which possibly forwards the frames through the switch
4355	 */
4356
4357	return (error);
4358}
4359
4360
4361/*
4362 * Reset function to be called by the driver routines when reinitializing
4363 * a hardware ring. The driver is in charge of locking to protect the kring
4364 * while this operation is being performed. This is normally achieved by
4365 * calling netmap_disable_all_rings() before triggering a reset.
4366 * If the kring is not in netmap mode, return NULL to inform the caller
4367 * that this is the case.
4368 * If the kring is in netmap mode, set hwofs so that the netmap indices
4369 * seen by userspace (head/cur/tail) do not change, although the internal
4370 * NIC indices have been reset to 0.
4371 * In any case, adjust kring->nr_mode.
4372 */
4373struct netmap_slot *
4374netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
4375	u_int new_cur)
4376{
4377	struct netmap_kring *kring;
4378	u_int new_hwtail, new_hwofs;
4379
4380	if (!nm_native_on(na)) {
4381		nm_prdis("interface not in native netmap mode");
4382		return NULL;	/* nothing to reinitialize */
4383	}
4384
4385	if (tx == NR_TX) {
4386		if (n >= na->num_tx_rings)
4387			return NULL;
4388		kring = na->tx_rings[n];
4389		/*
4390		 * Set hwofs to rhead, so that slots[rhead] is mapped to
4391		 * the NIC internal slot 0, and thus the netmap buffer
4392		 * at rhead is the next to be transmitted. Transmissions
4393		 * that were pending before the reset are considered as
4394		 * sent, so that we can have hwcur = rhead. All the slots
4395		 * are now owned by the user, so we can also reinit hwtail.
4396		 */
4397		new_hwofs = kring->rhead;
4398		new_hwtail = nm_prev(kring->rhead, kring->nkr_num_slots - 1);
4399	} else {
4400		if (n >= na->num_rx_rings)
4401			return NULL;
4402		kring = na->rx_rings[n];
4403		/*
4404		 * Set hwofs to hwtail, so that slots[hwtail] is mapped to
4405		 * the NIC internal slot 0, and thus the netmap buffer
4406		 * at hwtail is the next to be given to the NIC.
4407		 * Unread slots (the ones in [rhead,hwtail[) are owned by
4408		 * the user, and thus the caller cannot give them
4409		 * to the NIC right now.
4410		 */
4411		new_hwofs = kring->nr_hwtail;
4412		new_hwtail = kring->nr_hwtail;
4413	}
4414	if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
4415		kring->nr_mode = NKR_NETMAP_OFF;
4416		return NULL;
4417	}
4418	if (netmap_verbose) {
4419	    nm_prinf("%s, hc %u->%u, ht %u->%u, ho %u->%u", kring->name,
4420	        kring->nr_hwcur, kring->rhead,
4421	        kring->nr_hwtail, new_hwtail,
4422		kring->nkr_hwofs, new_hwofs);
4423	}
4424	kring->nr_hwcur = kring->rhead;
4425	kring->nr_hwtail = new_hwtail;
4426	kring->nkr_hwofs = new_hwofs;
4427
4428	/*
4429	 * Wakeup on the individual and global selwait
4430	 * We do the wakeup here, but the ring is not yet reconfigured.
4431	 * However, we are under lock so there are no races.
4432	 */
4433	kring->nr_mode = NKR_NETMAP_ON;
4434	kring->nm_notify(kring, 0);
4435	return kring->ring->slot;
4436}
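
/*
 * Typical use from a hypothetical foo(4) ring-initialization routine
 * (sketch only; netmap_idx_n2k() and PNMB() come from netmap_kern.h,
 * everything with a foo_/txr prefix is a placeholder):
 *
 *	struct netmap_adapter *na = NA(sc->ifp);
 *	struct netmap_slot *slot;
 *	uint64_t paddr;
 *	int l, si;
 *
 *	slot = netmap_reset(na, NR_TX, txr->me, 0);
 *	if (slot != NULL) {	<- the ring is in netmap mode
 *		for (l = 0; l < na->num_tx_desc; l++) {
 *			si = netmap_idx_n2k(na->tx_rings[txr->me], l);
 *			PNMB(na, slot + si, &paddr);
 *			<- program NIC descriptor 'l' with 'paddr'
 *		}
 *	}
 */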
4437
4438
4439/*
4440 * Dispatch rx/tx interrupts to the netmap rings.
4441 *
4442 * "work_done" is non-null on the RX path, NULL for the TX path.
4443 * We rely on the OS to make sure that there is only one active
4444 * instance per queue, and that there is appropriate locking.
4445 *
4446 * The 'notify' routine depends on what the ring is attached to.
4447 * - for a netmap file descriptor, do a selwakeup on the individual
4448 *   waitqueue, plus one on the global one if needed
4449 *   (see netmap_notify)
4450 * - for a nic connected to a switch, call the proper forwarding routine
4451 *   (see netmap_bwrap_intr_notify)
4452 */
4453int
4454netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
4455{
4456	struct netmap_kring *kring;
4457	enum txrx t = (work_done ? NR_RX : NR_TX);
4458
4459	q &= NETMAP_RING_MASK;
4460
4461	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
4462	        nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
4463	}
4464
4465	if (q >= nma_get_nrings(na, t))
4466		return NM_IRQ_PASS; // not a physical queue
4467
4468	kring = NMR(na, t)[q];
4469
4470	if (kring->nr_mode == NKR_NETMAP_OFF) {
4471		return NM_IRQ_PASS;
4472	}
4473
4474	if (t == NR_RX) {
4475		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
4476		*work_done = 1; /* do not fire napi again */
4477	}
4478
4479	return kring->nm_notify(kring, 0);
4480}
4481
4482
4483/*
4484 * Default functions to handle rx/tx interrupts from a physical device.
4485 * "work_done" is non-null on the RX path, NULL for the TX path.
4486 *
4487 * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4488 * so that the caller proceeds with regular processing.
4489 * Otherwise call netmap_common_irq().
4490 *
4491 * If the card is connected to a netmap file descriptor,
4492 * do a selwakeup on the individual queue, plus one on the global one
4493 * if needed (multiqueue card _and_ there are multiqueue listeners),
4494 * and return NR_IRQ_COMPLETED.
4495 *
4496 * Finally, if called on rx from an interface connected to a switch,
4497 * calls the proper forwarding routine.
4498 */
4499int
4500netmap_rx_irq(if_t ifp, u_int q, u_int *work_done)
4501{
4502	struct netmap_adapter *na = NA(ifp);
4503
4504	/*
4505	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4506	 * we still use the regular driver even though the previous
4507	 * check fails. It is unclear whether we should use
4508	 * nm_native_on() here.
4509	 */
4510	if (!nm_netmap_on(na))
4511		return NM_IRQ_PASS;
4512
4513	if (na->na_flags & NAF_SKIP_INTR) {
4514		nm_prdis("use regular interrupt");
4515		return NM_IRQ_PASS;
4516	}
4517
4518	return netmap_common_irq(na, q, work_done);
4519}
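
/*
 * For illustration, the hook in a hypothetical foo(4) RX interrupt (or
 * taskqueue) handler; netmap_tx_irq() plays the same role on the TX path:
 *
 *	u_int work_done;
 *
 *	if (netmap_rx_irq(sc->ifp, rxr->me, &work_done) != NM_IRQ_PASS)
 *		return;		<- netmap has consumed the interrupt
 *	... regular mbuf-based processing follows ...
 */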
4520
4521/* set/clear native flags and if_transmit/netdev_ops */
4522void
4523nm_set_native_flags(struct netmap_adapter *na)
4524{
4525	if_t ifp = na->ifp;
4526
4527	/* We do the setup for intercepting packets only if we are the
4528	 * first user of this adapter. */
4529	if (na->active_fds > 0) {
4530		return;
4531	}
4532
4533	na->na_flags |= NAF_NETMAP_ON;
4534	nm_os_onenter(ifp);
4535	netmap_update_hostrings_mode(na);
4536}
4537
4538void
4539nm_clear_native_flags(struct netmap_adapter *na)
4540{
4541	if_t ifp = na->ifp;
4542
4543	/* We undo the setup for intercepting packets only if we are the
4544	 * last user of this adapter. */
4545	if (na->active_fds > 0) {
4546		return;
4547	}
4548
4549	netmap_update_hostrings_mode(na);
4550	nm_os_onexit(ifp);
4551
4552	na->na_flags &= ~NAF_NETMAP_ON;
4553}
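
/*
 * These two helpers are meant to be called from a driver's nm_register
 * callback.  A hypothetical foo(4) implementation could look like this
 * (FOO_LOCK/foo_stop/foo_init_locked are placeholders for the driver's
 * own locking and reinitialization code):
 *
 *	static int
 *	foo_netmap_reg(struct netmap_adapter *na, int onoff)
 *	{
 *		struct foo_softc *sc = if_getsoftc(na->ifp);
 *
 *		FOO_LOCK(sc);
 *		foo_stop(sc);
 *		if (onoff)
 *			nm_set_native_flags(na);
 *		else
 *			nm_clear_native_flags(na);
 *		foo_init_locked(sc);	<- restarts the NIC in the new mode
 *		FOO_UNLOCK(sc);
 *		return 0;
 *	}
 */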
4554
4555void
4556netmap_krings_mode_commit(struct netmap_adapter *na, int onoff)
4557{
4558	enum txrx t;
4559
4560	for_rx_tx(t) {
4561		int i;
4562
4563		for (i = 0; i < netmap_real_rings(na, t); i++) {
4564			struct netmap_kring *kring = NMR(na, t)[i];
4565
4566			if (onoff && nm_kring_pending_on(kring))
4567				kring->nr_mode = NKR_NETMAP_ON;
4568			else if (!onoff && nm_kring_pending_off(kring))
4569				kring->nr_mode = NKR_NETMAP_OFF;
4570		}
4571	}
4572}
4573
4574/*
4575 * Module loader and unloader
4576 *
4577 * netmap_init() creates the /dev/netmap device and initializes
4578 * all global variables. Returns 0 on success, errno on failure
4579 * (but there is no chance)
4580 *
4581 * netmap_fini() destroys everything.
4582 */
4583
4584static struct cdev *netmap_dev; /* /dev/netmap character device. */
4585extern struct cdevsw netmap_cdevsw;
4586
4587
4588void
4589netmap_fini(void)
4590{
4591	if (netmap_dev)
4592		destroy_dev(netmap_dev);
4593	/* we assume that there are no netmap users left */
4594	nm_os_ifnet_fini();
4595	netmap_uninit_bridges();
4596	netmap_mem_fini();
4597	NMG_LOCK_DESTROY();
4598	nm_prinf("netmap: unloaded module.");
4599}
4600
4601
4602int
4603netmap_init(void)
4604{
4605	int error;
4606
4607	NMG_LOCK_INIT();
4608
4609	error = netmap_mem_init();
4610	if (error != 0)
4611		goto fail;
4612	/*
4613	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4614	 * when the module is compiled in.
4615	 * XXX could use make_dev_credv() to get error number
4616	 */
4617	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4618		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4619			      "netmap");
4620	if (!netmap_dev)
4621		goto fail;
4622
4623	error = netmap_init_bridges();
4624	if (error)
4625		goto fail;
4626
4627#ifdef __FreeBSD__
4628	nm_os_vi_init_index();
4629#endif
4630
4631	error = nm_os_ifnet_init();
4632	if (error)
4633		goto fail;
4634
4635#if !defined(__FreeBSD__) || defined(KLD_MODULE)
4636	nm_prinf("netmap: loaded module");
4637#endif
4638	return (0);
4639fail:
4640	netmap_fini();
4641	return (EINVAL); /* may be incorrect */
4642}
4643